from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal


def get_text_elements(p_path):
    """
    Gets all text-elements of the given document as list.

    Opens the PDF at p_path in binary mode, lays out every page and
    collects each LTTextBoxHorizontal element, in page order, into one
    flat list (elements of all pages end up in the same list).

    Raises PDFTextExtractionNotAllowed if the document reports that it is
    not extractable.
    """
    elements = []
    # Keep the file open until every page has been processed; the handle is
    # closed on exit from the `with` block, also when an exception is raised.
    with open(p_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(line_margin=0.1)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            elements.extend(
                element for element in layout
                if isinstance(element, LTTextBoxHorizontal))
    return elements


def find_relative_position(p_key_box, p_value_box):
    """
    Calculates the distance from key_box to value_box in order to provide
    a position relative to key_box.

    Both boxes are 4-tuples (x0, y0, x1, y1). The result is the tuple
    (dx, dy) leading from the (x0, y0) corner of the key box to the
    center of the value box.

    Example: key_box contains "Total" and the value_box "34.42€"
    """
    kx0, ky0, _kx1, _ky1 = p_key_box
    vx0, vy0, vx1, vy1 = p_value_box
    # The midpoint is the center regardless of the order in which the two
    # corners of the value box are given.
    center_value_x = (vx0 + vx1) / 2.0
    center_value_y = (vy0 + vy1) / 2.0
    return (center_value_x - kx0, center_value_y - ky0)


def get_word_offset(p_sentence, p_word, p_seperator=" "):
    """
    Gets the count (index) of a word in a sentence/string.

    The sentence is converted to a string, newlines are replaced by spaces
    and the result is split at p_seperator. The index of the first piece
    that equals p_word is returned.

    Raises RuntimeError if no piece equals p_word.
    """
    # Which word of the sentence (by position) is p_word?
    words = str(p_sentence).replace("\n", " ").split(p_seperator)
    for index, current_word in enumerate(words):
        if current_word == p_word:
            return index
    raise RuntimeError("Sentence does not contains word: %s -> %s" % (p_word, words))


def generate_template(p_example_dict, p_text_element_list):
    """
    Generates a template from examples.

    The example is a dict containing key/value pairs that are found in the
    element list. For every pair the template stores, under the key, a
    tuple (relative_position, word_offset): the position of the value box
    relative to the key box and the index of the value within the text of
    the value box.

    Pairs whose key or value is not found in any element are reported on
    stdout and left out of the template. A RuntimeError from
    get_word_offset propagates when the value is contained in the text of
    the value box but is not one of its space-separated words.
    """
    template = {}
    for key, value in p_example_dict.items():
        key_box = None
        value_box = None
        for element in p_text_element_list:
            text = element.get_text()
            # Anchor on the first element containing the key: this is the
            # element read_template will locate, so the relative position
            # must be measured from it and not from a later match.
            if key_box is None and key in text:
                key_box = element
            if value in text:
                value_box = element
            if key_box is not None and value_box is not None:
                break
        if key_box is None:
            print("Key not found: '%s'" % key)
            continue
        if value_box is None:
            print("Value not found: '%s'" % value)
            continue
        rel_pos = find_relative_position(key_box.bbox, value_box.bbox)
        value_offset = get_word_offset(value_box.get_text(), value)
        template[key] = (rel_pos, value_offset)
    return template


def is_point_in_box(p_point, p_box, p_padding=5):
    """
    Checks whether a point is in a specific box.

    p_point is (x, y) and p_box is (x0, y0, x1, y1). The box is enlarged by
    p_padding on every side before the check, so points up to p_padding
    outside of the box still count as inside. The borders are inclusive.
    """
    x0, y0, x1, y1 = p_box
    x, y = p_point
    return (x0 - p_padding <= x <= x1 + p_padding
            and y0 - p_padding <= y <= y1 + p_padding)


def read_template(p_template, p_text_element_list):
    """
    Analyses all text elements according to the given template.

    For every key of the template the first element containing the key is
    located and the stored relative position is added to its (x0, y0)
    corner. Every element whose box, enlarged by a padding below
    max_padding, contains that estimated position is reported on stdout
    together with the word at the stored offset and the smallest padding
    that was needed.

    Keys that are not found in any element are reported and skipped.
    Nothing is returned.
    """
    max_padding = 15
    # .items() works on Python 2 and 3; .iteritems() does not exist on 3.
    for key, pos in p_template.items():
        key_box = None
        for element in p_text_element_list:
            if key in element.get_text():
                key_box = element
                break
        if key_box is None:
            print("Key not found '%s'. Skipping ..." % key)
            continue
        x0, y0, _x1, _y1 = key_box.bbox
        rx, ry = pos[0]
        offset = pos[1]
        estimated_position = (x0 + rx, y0 + ry)
        for element in p_text_element_list:
            for padding in range(max_padding):
                if not is_point_in_box(estimated_position, element.bbox, padding):
                    continue
                words = element.get_text().replace("\n", " ").split(" ")
                # A nearby element with fewer words than the stored offset
                # cannot hold the value; skip it instead of failing with an
                # IndexError and losing the remaining keys.
                if offset < len(words):
                    print("Found value %s: '%s' with offset %d" % (key, words[offset], padding))
                break