From a7fc7a956ca0d3c30a15b824bed1ca80022119d4 Mon Sep 17 00:00:00 2001 From: leo <Leo.Strohmidel@gmx.net> Date: Wed, 27 Jun 2018 20:20:56 +0200 Subject: [PATCH] +comments --- pdf_parser.py | 22 ++++++++++++++++++++++ template_manager.py | 7 +++++++ 2 files changed, 29 insertions(+) diff --git a/pdf_parser.py b/pdf_parser.py index d1727ba..0d5dd66 100644 --- a/pdf_parser.py +++ b/pdf_parser.py @@ -12,6 +12,9 @@ from pdfminer.layout import LTTextBoxHorizontal def get_text_elements(p_path): + """ + Gets all text-elements of the given document as a list. + """ fp = open(p_path, 'rb') parser = PDFParser(fp) document = PDFDocument(parser) @@ -37,6 +40,11 @@ def get_text_elements(p_path): def find_relative_position(p_key_box, p_value_box): + """ + Calculates the distance from key_box to value_box in order to provide a + position relative to key_box. Example: + key_box contains "Total" and the value_box "34.42€" + """ kx0, ky0, kx1, ky1 = p_key_box vx0, vy0, vx1, vy1 = p_value_box center_value_x = (abs(vx0 - vx1) / 2.0) + vx0 @@ -45,6 +53,9 @@ def find_relative_position(p_key_box, p_value_box): def get_word_offset(p_sentence, p_word, p_seperator=" "): + """ + Gets the count (index) of a word in a sentence/string + """ # das wievielte wort eines satzes p_sentence = str(p_sentence) p_sentence = p_sentence.replace("\n", " ") @@ -58,6 +69,11 @@ def get_word_offset(p_sentence, p_word, p_seperator=" "): def generate_template(p_example_dict, p_text_element_list): + """ + Generates a template from examples. + The example is a dict containing key/value pairs that are found in the + element list. + """ template = {} for key, value in p_example_dict.items(): key_box = None @@ -86,6 +102,9 @@ def generate_template(p_example_dict, p_text_element_list): def is_point_in_box(p_point, p_box, p_padding=5): + """ + Checks whether a point is in a specific box. The point may be off by up to p_padding. 
+ """ x0, y0, x1, y1 = p_box x, y = p_point x0 -= p_padding @@ -96,6 +115,9 @@ def is_point_in_box(p_point, p_box, p_padding=5): def read_template(p_template, p_text_element_list): + """ + Analyzes all text elements according to the given template. + """ for key, pos in p_template.iteritems(): key_box = None for element in p_text_element_list: diff --git a/template_manager.py b/template_manager.py index 574ea0d..8e18e49 100644 --- a/template_manager.py +++ b/template_manager.py @@ -3,6 +3,10 @@ import json def create_template(p_name, p_path, p_examples): + """ + Generates a template from examples and a PDF file and + stores it as JSON + """ elements = get_text_elements(p_path) template = generate_template(p_examples, elements) data = [p_examples, template] @@ -12,6 +16,9 @@ def create_template(p_name, p_path, p_examples): def evaluate_pdf(p_template_name, p_path): + """ + Analyzes a PDF according to the given template + """ with open(p_template_name + '.template') as f: data = json.load(f) -- GitLab