Skip to content
Snippets Groups Projects
Commit 00f6604e authored by larsm's avatar larsm
Browse files

auf python 3 konvertiert, schönheitsop

parent 9f1b25c5
No related branches found
No related tags found
No related merge requests found
......@@ -10,29 +10,31 @@ from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal
def get_text_elements(p_path):
    """Collect the horizontal text boxes of a PDF file.

    The document at ``p_path`` is parsed with pdfminer, every page is run
    through a layout aggregator (``LAParams(line_margin=0.1)``), and each
    layout element whose type is exactly ``LTTextBoxHorizontal`` is kept.

    Args:
        p_path: Path of the PDF file to read.

    Returns:
        A single flat list holding the text boxes of all pages in page
        order (elements are not grouped per page).

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is
            not extractable.
    """
    # The file has to stay open while pages are processed, so the whole
    # extraction runs inside the ``with`` block; the handle is closed
    # afterwards even when an exception is raised.
    with open(p_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams(line_margin=0.1)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for element in layout:
                # Exact type match on purpose: other layout element kinds
                # are ignored.
                if type(element) is LTTextBoxHorizontal:
                    pages.append(element)
    return pages
def find_relative_position(p_key_box, p_value_box):
kx0, ky0, kx1, ky1 = p_key_box
......@@ -41,9 +43,12 @@ def find_relative_position(p_key_box, p_value_box):
center_value_y = (abs(vy0 - vy1) / 2.0) + vy0
return (center_value_x - kx0, center_value_y - ky0)
def get_word_offset(p_sentence, p_word, p_seperator=" "):
p_sentence = p_sentence.encode()
# das wievielte wort eines satzes
p_sentence = str(p_sentence)
p_sentence = p_sentence.replace("\n", " ")
# print('get_word_offset p_sentence(' + str(p_sentence) + '), p_word(' + str(p_word) + ')')
p_sentence = p_sentence.split(p_seperator)
for i in range(len(p_sentence)):
current_word = p_sentence[i]
......@@ -51,9 +56,10 @@ def get_word_offset(p_sentence, p_word, p_seperator=" "):
return i
raise RuntimeError("Sentence does not contains word: %s -> %s" % (p_word, p_sentence))
def generate_template(p_example_dict, p_text_element_list):
template = {}
for key, value in p_example_dict.iteritems():
for key, value in p_example_dict.items():
key_box = None
value_box = None
......@@ -78,6 +84,7 @@ def generate_template(p_example_dict, p_text_element_list):
return template
def is_point_in_box(p_point, p_box, p_padding=5):
x0, y0, x1, y1 = p_box
x, y = p_point
......@@ -87,6 +94,7 @@ def is_point_in_box(p_point, p_box, p_padding=5):
y1 += p_padding
return x0 <= x and x <= x1 and y0 <= y and y <= y1
def read_template(p_template, p_text_element_list):
for key, pos in p_template.iteritems():
key_box = None
......
# Driver script: builds a template named "sipgate" from one sample PDF by
# calling create_template() with a set of example key/value pairs.
#from pdf_parser import get_text_elements, generate_template, read_template
# from pdf_parser import get_text_elements, generate_template, read_template
from template_manager import create_template
#path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2017/Eingangsrechnungen/2017-08-12 rabe getraenke 33971.pdf"
# path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2017/Eingangsrechnungen/2017-08-12 rabe getraenke 33971.pdf"
path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2016/Eingangsrechnungen/2016-09-01 sipgate EB589544.pdf"
#e = get_text_elements(path)
# NOTE(review): this assignment overrides the absolute path set above, so the
# relative path is the one passed to create_template().
path = "pdf/2016-09-01 sipgate EB589544.pdf"
# e = get_text_elements(path)
#examples = {"Rechnungsbetrag": "183,43", "Datum": "12.8.2017", "RECHNUNGSNr": "33971"}
# examples = {"Rechnungsbetrag": "183,43", "Datum": "12.8.2017", "RECHNUNGSNr": "33971"}
# Example key -> value pairs; presumably the labels and values as printed in
# the sample PDF -- confirm against the document.
examples = {"Rechnung": "EB589544", "Rechnungsdatum": "01.09.2016", "Mandatsreferenz": "8ADB836C18A8491092EA62F7F35F8A28"}
create_template("sipgate", path, examples)
#g = generate_template(examples, e[0])
# g = generate_template(examples, e[0])
#path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2016/Eingangsrechnungen/2016-07-27 sipgate EB540908.pdf"
#e = get_text_elements(path)
#read_template(g, e[0])
# path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2016/Eingangsrechnungen/2016-07-27 sipgate EB540908.pdf"
# e = get_text_elements(path)
# read_template(g, e[0])
from pdf_parser import get_text_elements, generate_template, read_template
import json
def create_template(p_name, p_path, p_examples):
elements = get_text_elements(p_path)
template = generate_template(p_examples, elements)
......@@ -9,6 +10,7 @@ def create_template(p_name, p_path, p_examples):
with open(p_name + '.template', 'w') as outfile:
json.dump(data, outfile)
def evaluate_pdf(p_template_name, p_path):
with open(p_template_name + '.template') as f:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment