Skip to content
Snippets Groups Projects
Commit 9f1b25c5 authored by leo's avatar leo
Browse files

+init push

parents
No related branches found
No related tags found
No related merge requests found
[{"SUMME": "104,52", "Filiale Pos": "0011361161217017003523", "Datum": "10.01.18"}, {"SUMME": [[153.77157452, -14.178556000000015], 0], "Datum": [[80.8817166, -5.087157999999997], 0]}]
\ No newline at end of file
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal
def get_text_elements(p_path):
    """Collect the horizontal text boxes of every page of a PDF file.

    Args:
        p_path: Path of the PDF file to read.

    Returns:
        A flat list containing the ``LTTextBoxHorizontal`` layout elements
        of all pages, in the order the layout analysis yields them. The
        elements are not grouped per page.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Close the file handle deterministically, even if parsing raises.
    with open(p_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        # NOTE(review): the small line_margin presumably keeps neighbouring
        # lines in separate text boxes -- confirm against the pdfminer docs.
        laparams = LAParams(line_margin=0.1)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        elements = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for element in layout:
                if isinstance(element, LTTextBoxHorizontal):
                    elements.append(element)
    return elements
def find_relative_position(p_key_box, p_value_box):
    """Return the offset from the key box origin to the value box centre.

    Args:
        p_key_box: Four-item box ``(x0, y0, x1, y1)`` of the key element.
        p_value_box: Four-item box ``(x0, y0, x1, y1)`` of the value element.

    Returns:
        A tuple ``(dx, dy)``: the centre of the value box minus the
        ``(x0, y0)`` corner of the key box.
    """
    # Only the first corner of the key box is used; unpacking all four
    # items still enforces the box shape.
    key_x, key_y, _, _ = p_key_box
    left, bottom, right, top = p_value_box

    half_width = abs(left - right) / 2.0
    half_height = abs(bottom - top) / 2.0

    centre_x = half_width + left
    centre_y = half_height + bottom
    return (centre_x - key_x, centre_y - key_y)
def get_word_offset(p_sentence, p_word, p_seperator=" "):
    """Return the index of a word inside a sentence.

    Newlines in the sentence are replaced by a single space before the
    sentence is split on ``p_seperator``.

    Args:
        p_sentence: Text to search in.
        p_word: Word that must match one of the split tokens exactly.
        p_seperator: Token separator, a single space by default.

    Returns:
        The zero-based index of the first token equal to ``p_word``.

    Raises:
        RuntimeError: If no token equals ``p_word``.
    """
    # Work on the text as-is: encoding it first fails on non-ASCII input
    # under Python 2 and produces bytes (which cannot be combined with the
    # str arguments below) under Python 3.
    words = p_sentence.replace("\n", " ").split(p_seperator)
    for index, current_word in enumerate(words):
        if current_word == p_word:
            return index
    raise RuntimeError("Sentence does not contains word: %s -> %s" % (p_word, words))
def generate_template(p_example_dict, p_text_element_list):
    """Build a position template from example key/value pairs.

    For every example the first text element containing the key and the
    first text element containing the value are located. The template
    stores where the value sits relative to the key and which word of the
    value element holds the value.

    Args:
        p_example_dict: Mapping of key text to the example value text that
            appears in the document.
        p_text_element_list: Iterable of text elements offering
            ``get_text()`` and a ``bbox`` attribute.

    Returns:
        A dict mapping each resolved key to ``(relative_position,
        word_offset)``. Examples whose key or value cannot be found are
        reported on stdout and left out.

    Raises:
        RuntimeError: Propagated from ``get_word_offset`` when the value is
            contained in an element but is not one of its words.
    """
    template = {}
    # items() works on both Python 2 and Python 3 dictionaries.
    for key, value in p_example_dict.items():
        key_box = None
        value_box = None
        for element in p_text_element_list:
            text = element.get_text()
            # Keep the FIRST match for each: read_template anchors on the
            # first element containing the key, so the offset has to be
            # measured from that same element.
            if key_box is None and key in text:
                key_box = element
            if value_box is None and value in text:
                value_box = element
            if key_box is not None and value_box is not None:
                break

        if key_box is None:
            print("Key not found: '%s'" % key)
            continue
        if value_box is None:
            print("Value not found: '%s'" % value)
            continue

        rel_pos = find_relative_position(key_box.bbox, value_box.bbox)
        value_offset = get_word_offset(value_box.get_text(), value)
        template[key] = (rel_pos, value_offset)
    return template
def is_point_in_box(p_point, p_box, p_padding=5):
    """Tell whether a point lies inside a box grown by a padding.

    Args:
        p_point: Two-item point ``(x, y)``.
        p_box: Four-item box ``(x0, y0, x1, y1)``.
        p_padding: Amount by which the box is extended on every side
            before testing; borders count as inside.

    Returns:
        True if the point is within the padded box, otherwise False.
    """
    left, bottom, right, top = p_box
    px, py = p_point

    inside_horizontally = (left - p_padding) <= px <= (right + p_padding)
    inside_vertically = (bottom - p_padding) <= py <= (top + p_padding)
    return inside_horizontally and inside_vertically
def _find_element_at(p_point, p_text_element_list, p_max_padding):
    """Return ``(element, padding)`` for the tightest box around a point.

    Paddings ``0 .. p_max_padding - 1`` are tried in increasing order and
    the first element whose padded box contains the point wins. Returns
    ``None`` when no element matches.
    """
    for padding in range(p_max_padding):
        for element in p_text_element_list:
            if is_point_in_box(p_point, element.bbox, padding):
                return element, padding
    return None


def read_template(p_template, p_text_element_list, p_max_padding=15):
    """Extract the values described by a template from text elements.

    For each template entry the first element containing the key is used
    as anchor. The stored relative position is added to the anchor's
    ``(x0, y0)`` corner and the element closest to that point (smallest
    padding needed) is taken as the value element.

    Args:
        p_template: Mapping of key text to ``(relative_position,
            word_offset)`` as produced by ``generate_template`` (tuples or
            the lists obtained after a JSON round trip).
        p_text_element_list: Sequence of text elements offering
            ``get_text()`` and a ``bbox`` attribute. It is iterated more
            than once.
        p_max_padding: Exclusive upper bound of the paddings tried when
            looking for the value element.

    Returns:
        A dict mapping every key whose value was found to the extracted
        word. Keys that cannot be resolved are reported on stdout and
        omitted.
    """
    results = {}
    # items() works on both Python 2 and Python 3 dictionaries.
    for key, pos in p_template.items():
        key_box = None
        for element in p_text_element_list:
            if key in element.get_text():
                key_box = element
                break
        if key_box is None:
            print("Key not found '%s'. Skipping ..." % key)
            continue

        x0, y0, x1, y1 = key_box.bbox
        rx, ry = pos[0]
        offset = pos[1]
        estimated_position = (x0 + rx, y0 + ry)

        match = _find_element_at(estimated_position, p_text_element_list,
                                 p_max_padding)
        if match is None:
            print("Found no value for '%s' with max padding %d" % (key, p_max_padding))
            continue

        value_box, padding = match
        # Same tokenisation as get_word_offset with its default separator.
        words = value_box.get_text().replace("\n", " ").split(" ")
        try:
            value = words[offset]
        except IndexError:
            # The element at the expected position has fewer words than the
            # example document had; skip instead of aborting the whole run.
            print("Found no value for '%s' with max padding %d" % (key, p_max_padding))
            continue

        print("Found value %s: '%s' with offset %d" % (key, value, padding))
        results[key] = value
    return results
#from pdf_parser import get_text_elements, generate_template, read_template
from template_manager import create_template
# Build the "sipgate" template from one example invoice. The work is done
# only when this file is executed as a script, so importing it has no side
# effects.
if __name__ == "__main__":
    # Alternative example document (rabe getraenke):
    #path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2017/Eingangsrechnungen/2017-08-12 rabe getraenke 33971.pdf"
    #examples = {"Rechnungsbetrag": "183,43", "Datum": "12.8.2017", "RECHNUNGSNr": "33971"}
    path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2016/Eingangsrechnungen/2016-09-01 sipgate EB589544.pdf"
    # Key text as printed on the invoice -> value expected for this document.
    examples = {"Rechnung": "EB589544", "Rechnungsdatum": "01.09.2016", "Mandatsreferenz": "8ADB836C18A8491092EA62F7F35F8A28"}
    create_template("sipgate", path, examples)

    # Earlier manual experiment, kept for reference:
    #e = get_text_elements(path)
    #g = generate_template(examples, e[0])
    #path = "/home/reverend/Programmierung/git_projects/warpzone-rechnungen/2016/Eingangsrechnungen/2016-07-27 sipgate EB540908.pdf"
    #e = get_text_elements(path)
    #read_template(g, e[0])
from pdf_parser import get_text_elements, generate_template, read_template
import json
def create_template(p_name, p_path, p_examples):
    """Generate a template from an example PDF and store it as JSON.

    Args:
        p_name: Template name; the file ``<p_name>.template`` is written
            relative to the current working directory.
        p_path: Path of the example PDF document.
        p_examples: Mapping of key text to example value text, passed to
            ``generate_template``.

    The file holds a two-item JSON list: the examples followed by the
    generated template.
    """
    # Generate first so that no file is created if extraction fails.
    generated = generate_template(p_examples, get_text_elements(p_path))
    target = p_name + '.template'
    with open(target, 'w') as handle:
        json.dump([p_examples, generated], handle)
def evaluate_pdf(p_template_name, p_path):
    """Apply a stored template to a PDF and dump the result as JSON.

    Args:
        p_template_name: Template name; ``<p_template_name>.template`` is
            read from the current working directory.
        p_path: Path of the PDF document to evaluate.

    Whatever ``read_template`` returns is serialised to the file
    ``output.shmebulok`` in the current working directory.
    """
    with open(p_template_name + '.template') as template_file:
        # The file holds [examples, template]; only the template is needed.
        stored_template = json.load(template_file)[1]

    extracted = read_template(stored_template, get_text_elements(p_path))

    with open("output.shmebulok", "w") as result_file:
        json.dump(extracted, result_file)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment