Skip to content
Snippets Groups Projects
Commit a7fc7a95 authored by leo's avatar leo
Browse files

+comments

parent 00f6604e
No related branches found
No related tags found
No related merge requests found
......@@ -12,6 +12,9 @@ from pdfminer.layout import LTTextBoxHorizontal
def get_text_elements(p_path):
"""
Gets all text-elements of the given document as list.
"""
fp = open(p_path, 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser)
......@@ -37,6 +40,11 @@ def get_text_elements(p_path):
def find_relative_position(p_key_box, p_value_box):
"""
Calculates the distance from key_box to value_box in order to provide a
position relative to key_box. Example:
key_box contains "Total" and the value_box contains "34.42€"
"""
kx0, ky0, kx1, ky1 = p_key_box
vx0, vy0, vx1, vy1 = p_value_box
center_value_x = (abs(vx0 - vx1) / 2.0) + vx0
......@@ -45,6 +53,9 @@ def find_relative_position(p_key_box, p_value_box):
def get_word_offset(p_sentence, p_word, p_seperator=" "):
"""
Gets the count (index) of a word in a sentence/string
"""
# the ordinal position of the word within the sentence
p_sentence = str(p_sentence)
p_sentence = p_sentence.replace("\n", " ")
......@@ -58,6 +69,11 @@ def get_word_offset(p_sentence, p_word, p_seperator=" "):
def generate_template(p_example_dict, p_text_element_list):
"""
Generates a template from examples.
The example is a dict containing key/value pairs that are found in the
element list.
"""
template = {}
for key, value in p_example_dict.items():
key_box = None
......@@ -86,6 +102,9 @@ def generate_template(p_example_dict, p_text_element_list):
def is_point_in_box(p_point, p_box, p_padding=5):
"""
Checks whether a point is in a specific box. The box can be padded by p_padding.
"""
x0, y0, x1, y1 = p_box
x, y = p_point
x0 -= p_padding
......@@ -96,6 +115,9 @@ def is_point_in_box(p_point, p_box, p_padding=5):
def read_template(p_template, p_text_element_list):
"""
Analyzes all text elements according to the given template.
"""
for key, pos in p_template.iteritems():
key_box = None
for element in p_text_element_list:
......
......@@ -3,6 +3,10 @@ import json
def create_template(p_name, p_path, p_examples):
"""
Generates a template from examples and a PDF file and
stores it as JSON.
"""
elements = get_text_elements(p_path)
template = generate_template(p_examples, elements)
data = [p_examples, template]
......@@ -12,6 +16,9 @@ def create_template(p_name, p_path, p_examples):
def evaluate_pdf(p_template_name, p_path):
"""
Analyzes a PDF according to the given template.
"""
with open(p_template_name + '.template') as f:
data = json.load(f)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment