From 00f6604eb337512dece123f55588bf6edb2819f3 Mon Sep 17 00:00:00 2001 From: larsm <a@a.a> Date: Wed, 27 Jun 2018 20:08:36 +0200 Subject: [PATCH] =?UTF-8?q?auf=20python=203=20konvertiert,=20sch=C3=B6nhei?= =?UTF-8?q?tsop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pdf_parser.py | 56 ++++++++++++++++++++++++++------------------- pdf_parser_test.py | 17 +++++++------- template_manager.py | 2 ++ 3 files changed, 43 insertions(+), 32 deletions(-) diff --git a/pdf_parser.py b/pdf_parser.py index d2af575..d1727ba 100644 --- a/pdf_parser.py +++ b/pdf_parser.py @@ -10,29 +10,31 @@ from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LTTextBoxHorizontal + def get_text_elements(p_path): - fp = open(p_path, 'rb') - parser = PDFParser(fp) - document = PDFDocument(parser) - if not document.is_extractable: - raise PDFTextExtractionNotAllowed - rsrcmgr = PDFResourceManager() - laparams = LAParams(line_margin=0.1) - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - - pages = [] - for page in PDFPage.create_pages(document): - interpreter.process_page(page) - layout = device.get_result() - - elements_on_page = [] - for element in layout: - if type(element) is LTTextBoxHorizontal: - pages.append(element) - #pages.append(elements_on_page) - - return pages + fp = open(p_path, 'rb') + parser = PDFParser(fp) + document = PDFDocument(parser) + if not document.is_extractable: + raise PDFTextExtractionNotAllowed + rsrcmgr = PDFResourceManager() + laparams = LAParams(line_margin=0.1) + device = PDFPageAggregator(rsrcmgr, laparams=laparams) + interpreter = PDFPageInterpreter(rsrcmgr, device) + + pages = [] + for page in PDFPage.create_pages(document): + interpreter.process_page(page) + layout = device.get_result() + + elements_on_page = [] + for element in layout: + if type(element) is LTTextBoxHorizontal: + pages.append(element) + #pages.append(elements_on_page) + + return pages + def find_relative_position(p_key_box, p_value_box): kx0, ky0, kx1, ky1 = p_key_box @@ -41,9 +43,12 @@ def find_relative_position(p_key_box, p_value_box): center_value_y = (abs(vy0 - vy1) / 2.0) + vy0 return (center_value_x - kx0, center_value_y - ky0) + def get_word_offset(p_sentence, p_word, p_seperator=" "): - p_sentence = p_sentence.encode() + # das wievielte wort eines satzes + p_sentence = str(p_sentence) p_sentence = p_sentence.replace("\n", " ") + # print('get_word_offset p_sentence(' + str(p_sentence) + '), p_word(' + str(p_word) + ')') p_sentence = p_sentence.split(p_seperator) for i in range(len(p_sentence)): current_word = p_sentence[i] @@ -51,9 +56,10 @@ def get_word_offset(p_sentence, p_word, p_seperator=" "): return i raise RuntimeError("Sentence does not contains word: %s -> %s" % (p_word, p_sentence)) + def generate_template(p_example_dict, p_text_element_list): template = {} - for key, value in p_example_dict.iteritems(): + for key, value in p_example_dict.items(): key_box = None value_box = None @@ -78,6 +84,7 @@ def generate_template(p_example_dict, p_text_element_list): return template + def is_point_in_box(p_point, p_box, p_padding=5): x0, y0, x1, y1 = p_box x, y = p_point @@ -87,6 +94,7 @@ def is_point_in_box(p_point, p_box, p_padding=5): y1 += p_padding return x0 <= x and x <= x1 and y0 <= y and y <= y1 + def read_template(p_template, p_text_element_list): for key, pos in p_template.iteritems(): key_box = None diff --git a/pdf_parser_test.py b/pdf_parser_test.py index b691bea..d2afc01 100644 --- a/pdf_parser_test.py +++ b/pdf_parser_test.py @@ -1,16 +1,17 @@ -#from pdf_parser import get_text_elements, generate_template, read_template +# from pdf_parser import get_text_elements, generate_template, read_template from template_manager import create_template -#path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2017/Eingangsrechnungen/2017-08-12 rabe getraenke 33971.pdf" +# path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2017/Eingangsrechnungen/2017-08-12 rabe getraenke 33971.pdf" path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2016/Eingangsrechnungen/2016-09-01 sipgate EB589544.pdf" -#e = get_text_elements(path) +path = "pdf/2016-09-01 sipgate EB589544.pdf" +# e = get_text_elements(path) -#examples = {"Rechnungsbetrag": "183,43", "Datum": "12.8.2017", "RECHNUNGSNr": "33971"} +# examples = {"Rechnungsbetrag": "183,43", "Datum": "12.8.2017", "RECHNUNGSNr": "33971"} examples = {"Rechnung": "EB589544", "Rechnungsdatum": "01.09.2016", "Mandatsreferenz": "8ADB836C18A8491092EA62F7F35F8A28"} create_template("sipgate", path, examples) -#g = generate_template(examples, e[0]) +# g = generate_template(examples, e[0]) -#path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2016/Eingangsrechnungen/2016-07-27 sipgate EB540908.pdf" -#e = get_text_elements(path) -#read_template(g, e[0]) +# path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2016/Eingangsrechnungen/2016-07-27 sipgate EB540908.pdf" +# e = get_text_elements(path) +# read_template(g, e[0]) diff --git a/template_manager.py b/template_manager.py index 6cd567a..574ea0d 100644 --- a/template_manager.py +++ b/template_manager.py @@ -1,6 +1,7 @@ from pdf_parser import get_text_elements, generate_template, read_template import json + def create_template(p_name, p_path, p_examples): elements = get_text_elements(p_path) template = generate_template(p_examples, elements) @@ -9,6 +10,7 @@ def create_template(p_name, p_path, p_examples): with open(p_name + '.template', 'w') as outfile: json.dump(data, outfile) + def evaluate_pdf(p_template_name, p_path): with open(p_template_name + '.template') as f: -- GitLab