Skip to content
Snippets Groups Projects
Commit 00f6604e authored by larsm's avatar larsm
Browse files

auf python 3 konvertiert, schönheitsop

parent 9f1b25c5
No related branches found
No related tags found
No related merge requests found
...@@ -10,29 +10,31 @@ from pdfminer.converter import PDFPageAggregator ...@@ -10,29 +10,31 @@ from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal from pdfminer.layout import LTTextBoxHorizontal
def get_text_elements(p_path):
    """Collect the horizontal text boxes of the PDF stored at ``p_path``.

    The boxes of all pages are gathered into one flat list, in the order in
    which the pages and their layout elements are produced; no per-page
    grouping is kept.

    :param p_path: path of the PDF file to read.
    :return: list of ``LTTextBoxHorizontal`` layout elements.
    :raises PDFTextExtractionNotAllowed: if the document reports that it is
        not extractable.
    """
    # Keep all processing inside the ``with`` block: the file has to stay
    # open while the pages are read, and it is closed again afterwards even
    # when an exception is raised.
    with open(p_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams(line_margin=0.1)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        text_boxes = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Only horizontal text boxes are of interest; every other layout
            # element is ignored.
            text_boxes.extend(
                element for element in layout
                if isinstance(element, LTTextBoxHorizontal)
            )
    return text_boxes
def find_relative_position(p_key_box, p_value_box): def find_relative_position(p_key_box, p_value_box):
kx0, ky0, kx1, ky1 = p_key_box kx0, ky0, kx1, ky1 = p_key_box
...@@ -41,9 +43,12 @@ def find_relative_position(p_key_box, p_value_box): ...@@ -41,9 +43,12 @@ def find_relative_position(p_key_box, p_value_box):
center_value_y = (abs(vy0 - vy1) / 2.0) + vy0 center_value_y = (abs(vy0 - vy1) / 2.0) + vy0
return (center_value_x - kx0, center_value_y - ky0) return (center_value_x - kx0, center_value_y - ky0)
def get_word_offset(p_sentence, p_word, p_seperator=" "): def get_word_offset(p_sentence, p_word, p_seperator=" "):
p_sentence = p_sentence.encode() # das wievielte wort eines satzes
p_sentence = str(p_sentence)
p_sentence = p_sentence.replace("\n", " ") p_sentence = p_sentence.replace("\n", " ")
# print('get_word_offset p_sentence(' + str(p_sentence) + '), p_word(' + str(p_word) + ')')
p_sentence = p_sentence.split(p_seperator) p_sentence = p_sentence.split(p_seperator)
for i in range(len(p_sentence)): for i in range(len(p_sentence)):
current_word = p_sentence[i] current_word = p_sentence[i]
...@@ -51,9 +56,10 @@ def get_word_offset(p_sentence, p_word, p_seperator=" "): ...@@ -51,9 +56,10 @@ def get_word_offset(p_sentence, p_word, p_seperator=" "):
return i return i
raise RuntimeError("Sentence does not contains word: %s -> %s" % (p_word, p_sentence)) raise RuntimeError("Sentence does not contains word: %s -> %s" % (p_word, p_sentence))
def generate_template(p_example_dict, p_text_element_list): def generate_template(p_example_dict, p_text_element_list):
template = {} template = {}
for key, value in p_example_dict.iteritems(): for key, value in p_example_dict.items():
key_box = None key_box = None
value_box = None value_box = None
...@@ -78,6 +84,7 @@ def generate_template(p_example_dict, p_text_element_list): ...@@ -78,6 +84,7 @@ def generate_template(p_example_dict, p_text_element_list):
return template return template
def is_point_in_box(p_point, p_box, p_padding=5): def is_point_in_box(p_point, p_box, p_padding=5):
x0, y0, x1, y1 = p_box x0, y0, x1, y1 = p_box
x, y = p_point x, y = p_point
...@@ -87,6 +94,7 @@ def is_point_in_box(p_point, p_box, p_padding=5): ...@@ -87,6 +94,7 @@ def is_point_in_box(p_point, p_box, p_padding=5):
y1 += p_padding y1 += p_padding
return x0 <= x and x <= x1 and y0 <= y and y <= y1 return x0 <= x and x <= x1 and y0 <= y and y <= y1
def read_template(p_template, p_text_element_list): def read_template(p_template, p_text_element_list):
for key, pos in p_template.iteritems(): for key, pos in p_template.iteritems():
key_box = None key_box = None
......
# Example script: builds the "sipgate" template from one sample invoice and
# the example values given below.
# from pdf_parser import get_text_elements, generate_template, read_template
from template_manager import create_template

# Alternative sample invoice, kept for reference:
# path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2017/Eingangsrechnungen/2017-08-12 rabe getraenke 33971.pdf"
# examples = {"Rechnungsbetrag": "183,43", "Datum": "12.8.2017", "RECHNUNGSNr": "33971"}

# Sample invoice, given relative to the current working directory.
path = "pdf/2016-09-01 sipgate EB589544.pdf"
# Example values keyed by the label they belong to (presumably as printed on
# the sample invoice -- verify against generate_template).
examples = {"Rechnung": "EB589544", "Rechnungsdatum": "01.09.2016", "Mandatsreferenz": "8ADB836C18A8491092EA62F7F35F8A28"}
create_template("sipgate", path, examples)

# Manual round trip through pdf_parser, kept for reference:
# e = get_text_elements(path)
# g = generate_template(examples, e[0])
# path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2016/Eingangsrechnungen/2016-07-27 sipgate EB540908.pdf"
# e = get_text_elements(path)
# read_template(g, e[0])
from pdf_parser import get_text_elements, generate_template, read_template from pdf_parser import get_text_elements, generate_template, read_template
import json import json
def create_template(p_name, p_path, p_examples): def create_template(p_name, p_path, p_examples):
elements = get_text_elements(p_path) elements = get_text_elements(p_path)
template = generate_template(p_examples, elements) template = generate_template(p_examples, elements)
...@@ -9,6 +10,7 @@ def create_template(p_name, p_path, p_examples): ...@@ -9,6 +10,7 @@ def create_template(p_name, p_path, p_examples):
with open(p_name + '.template', 'w') as outfile: with open(p_name + '.template', 'w') as outfile:
json.dump(data, outfile) json.dump(data, outfile)
def evaluate_pdf(p_template_name, p_path): def evaluate_pdf(p_template_name, p_path):
with open(p_template_name + '.template') as f: with open(p_template_name + '.template') as f:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment