+comments

a7fc7a95 · leo · 00f6604e · a7fc7a95 · a7fc7a95
Commit a7fc7a95 authored 6 years ago by leo
--- a/pdf_parser.py
+++ b/pdf_parser.py
@@ -12,6 +12,9 @@ from pdfminer.layout import LTTextBoxHorizontal


 def get_text_elements(p_path):
+    """
+    Gets all text elements of the given document as a list.
+    """
    fp = open(p_path, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
@@ -37,6 +40,11 @@ def get_text_elements(p_path):


 def find_relative_position(p_key_box, p_value_box):
+    """
+    Calculates the distance from key_box to value_box in order to provide a
+    position relative to key_box. Example:
+    key_box contains "Total" and the value_box contains "34.42€".
+    """
    kx0, ky0, kx1, ky1 = p_key_box
    vx0, vy0, vx1, vy1 = p_value_box
    center_value_x = (abs(vx0 - vx1) / 2.0) + vx0
@@ -45,6 +53,9 @@ def find_relative_position(p_key_box, p_value_box):


 def get_word_offset(p_sentence, p_word, p_seperator=" "):
+    """
+    Gets the count (index) of a word in a sentence/string
+    """
    # das wievielte wort eines satzes
    p_sentence = str(p_sentence)
    p_sentence = p_sentence.replace("\n", " ")
@@ -58,6 +69,11 @@ def get_word_offset(p_sentence, p_word, p_seperator=" "):


 def generate_template(p_example_dict, p_text_element_list):
+    """
+    Generates a template from examples.
+    The example is a dict containing key/value pairs that are found in the
+    element list.
+    """
    template = {}
    for key, value in p_example_dict.items():
        key_box = None
@@ -86,6 +102,9 @@ def generate_template(p_example_dict, p_text_element_list):


 def is_point_in_box(p_point, p_box, p_padding=5):
+    """
+    Checks whether a point is in a specific box, which is padded by p_padding.
+    """
    x0, y0, x1, y1 = p_box
    x, y = p_point
    x0 -= p_padding
@@ -96,6 +115,9 @@ def is_point_in_box(p_point, p_box, p_padding=5):


 def read_template(p_template, p_text_element_list):
+    """
+    Analyzes all text elements according to the given template.
+    """
    for key, pos in p_template.iteritems():
        key_box = None
        for element in p_text_element_list:

--- a/template_manager.py
+++ b/template_manager.py
@@ -3,6 +3,10 @@ import json


 def create_template(p_name, p_path, p_examples):
+    """
+    Generates a template from examples and a PDF file and
+    stores it as JSON.
+    """
    elements = get_text_elements(p_path)
    template = generate_template(p_examples, elements)
    data = [p_examples, template]
@@ -12,6 +16,9 @@ def create_template(p_name, p_path, p_examples):


 def evaluate_pdf(p_template_name, p_path):
+    """
+    Analyzes a PDF according to the given template.
+    """

    with open(p_template_name + '.template') as f:
        data = json.load(f)