# pdf_parser.py 4.64 KiB
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal
def get_text_elements(p_path):
    """
    Gets all horizontal text boxes of the given document as one flat list.

    The boxes of all pages are collected into a single list in page order;
    no per-page grouping is kept.

    :param p_path: path of the PDF file to read.
    :return: list of LTTextBoxHorizontal layout elements.
    :raises PDFTextExtractionNotAllowed: if the document reports that it is
        not extractable.
    """
    # The context manager guarantees the file handle is released even when
    # parsing fails or extraction is not allowed.
    with open(p_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(line_margin=0.1)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Only horizontal text boxes are of interest; every other
            # layout object on the page is skipped.
            pages.extend(
                element for element in layout
                if isinstance(element, LTTextBoxHorizontal)
            )
    return pages
def find_relative_position(p_key_box, p_value_box):
    """
    Returns where the center of the value box lies relative to the key box.

    Both arguments are 4-tuples (x0, y0, x1, y1). The result is the offset
    (dx, dy) from the (x0, y0) corner of the key box to the center point of
    the value box. Example: the key box contains "Total" and the value box
    contains "34.42€".
    """
    key_x, key_y, _, _ = p_key_box
    left, bottom, right, top = p_value_box
    half_width = abs(left - right) / 2.0
    half_height = abs(bottom - top) / 2.0
    # Center of the value box, measured from its (x0, y0) corner.
    center = (half_width + left, half_height + bottom)
    return (center[0] - key_x, center[1] - key_y)
def get_word_offset(p_sentence, p_word, p_seperator=" "):
    """
    Returns the zero-based position of a word within a sentence/string.

    The sentence is converted to a string, newlines are replaced by blanks
    and the result is split at p_seperator. The index of the first piece
    that equals p_word is returned.

    Raises RuntimeError if no piece equals p_word.
    """
    # Which word of the sentence is it (ordinal position)?
    words = str(p_sentence).replace("\n", " ").split(p_seperator)
    for position, candidate in enumerate(words):
        if candidate == p_word:
            return position
    raise RuntimeError("Sentence does not contains word: %s -> %s" % (p_word, words))
def generate_template(p_example_dict, p_text_element_list):
    """
    Generates a template from examples.

    The example is a dict of key/value pairs whose texts are looked up in
    the element list. For every pair where both texts are found, the
    template stores a tuple (relative_position, word_offset):

    * relative_position -- result of find_relative_position() for the
      bounding boxes of the key element and the value element.
    * word_offset -- result of get_word_offset() for the value inside the
      text of the value element.

    Pairs whose key or value text is not found are reported with print()
    and left out of the template.

    :param p_example_dict: dict mapping key text to example value text.
    :param p_text_element_list: elements providing get_text() and bbox.
    :return: dict mapping key text to (relative_position, word_offset).
    :raises RuntimeError: propagated from get_word_offset() when the value
        is contained in the element text but is not a separate word of it.
    """
    template = {}
    for key, value in p_example_dict.items():
        key_box = None
        value_box = None
        for element in p_text_element_list:
            if value_box is not None and key_box is not None:
                break
            text = element.get_text()
            # Keep the FIRST element containing the key. read_template()
            # also anchors on the first match, so the relative position
            # must be measured from that same element.
            if key_box is None and key in text:
                key_box = element
            if value in text:
                value_box = element
        if key_box is None:
            print("Key not found: '%s'" % key)
            continue
        if value_box is None:
            print("Value not found: '%s'" % value)
            continue
        rel_pos = find_relative_position(key_box.bbox, value_box.bbox)
        value_offset = get_word_offset(value_box.get_text(), value)
        template[key] = (rel_pos, value_offset)
    return template
def is_point_in_box(p_point, p_box, p_padding=5):
    """
    Checks whether a point lies inside a box that is grown by a padding.

    p_point is (x, y) and p_box is (x0, y0, x1, y1). The box is enlarged by
    p_padding on every side before the test; points on the border count as
    inside.
    """
    left, bottom, right, top = p_box
    px, py = p_point
    inside_horizontally = left - p_padding <= px <= right + p_padding
    inside_vertically = bottom - p_padding <= py <= top + p_padding
    return inside_horizontally and inside_vertically
def read_template(p_template, p_text_element_list):
    """
    Analyses all text elements according to the given template.

    For every template entry the first element whose text contains the key
    is used as anchor. The stored relative position is added to the
    (x0, y0) corner of the anchor's bounding box to estimate where the
    value lies. Every element whose bounding box contains that estimated
    point, with a padding below max_padding, is reported with print()
    together with the smallest padding that matched.

    :param p_template: dict mapping key text to
        (relative_position, word_offset), as built by generate_template().
    :param p_text_element_list: elements providing get_text() and bbox.
    :return: None; results are only printed.
    """
    max_padding = 15
    # items() works on Python 2 and 3 alike; iteritems() exists only on
    # Python 2 dicts.
    for key, pos in p_template.items():
        key_box = None
        for element in p_text_element_list:
            if key in element.get_text():
                key_box = element
                break
        if key_box is None:
            print("Key not found '%s'. Skipping ..." % key)
            continue
        x0, y0, x1, y1 = key_box.bbox
        rx, ry = pos[0]
        offset = pos[1]
        estimated_position = (x0 + rx, y0 + ry)
        for element in p_text_element_list:
            # Grow the padding step by step so the reported padding is the
            # smallest one at which the point falls into the element.
            for i in range(max_padding):
                if is_point_in_box(estimated_position, element.bbox, i):
                    text = element.get_text().replace("\n", " ").split(" ")
                    print("Found value %s: '%s' with offset %d" % (key, text[offset], i))
                    break