Skip to content
Snippets Groups Projects
Commit 069fa980 authored by leo's avatar leo
Browse files

changes i dont remember #yolo

parent dfd53745
No related branches found
No related tags found
No related merge requests found
import PyPDF2
import pytesseract
from PIL import Image
from wand.image import Image as wImage
import io
def _detect_image_rotated(p_image):
    """
    Tells whether tesseract's orientation detection reports the image
    as being turned upside down.

    Returns True when the number at the end of the second line of the
    OSD report lies strictly between 160 and 200. Returns False
    otherwise, and also when tesseract raises a TesseractError.
    """
    try:
        report = pytesseract.image_to_osd(p_image)
    except pytesseract.TesseractError:
        # No orientation could be determined; the image is left as it is.
        return False
    # The value is the last space-separated token of the second report line.
    degrees = int(report.split("\n")[1].split(" ")[-1])
    return 160 < degrees < 200
def get_pdf_as_image(p_path_to_pdf):
    """
    Renders every page of a pdf into a PIL image.

    The pdf is opened with wand at a resolution of 300 and converted to
    jpg. Each page is then re-opened as a PIL image, and pages that
    _detect_image_rotated reports as rotated are turned by 180 degrees.

    p_path_to_pdf: path of the pdf file to render
    returns: list of PIL images in the order of the pdf's page sequence
    """
    image_list = []
    # wand images are used as context managers so that they are closed
    # again, even when one of the pages fails to convert.
    with wImage(filename=p_path_to_pdf, resolution=300) as pdf:
        with pdf.convert("jpg") as converted:
            for frame in converted.sequence:
                with wImage(image=frame) as page:
                    # The blob is a plain bytes object, so it stays valid
                    # after the wand page has been closed.
                    blob = page.make_blob("jpg")
                pil_image = Image.open(io.BytesIO(blob))
                if _detect_image_rotated(pil_image):
                    pil_image = pil_image.rotate(180)
                image_list.append(pil_image)
    return image_list
\ No newline at end of file
sharpness.jpg

172 KiB

from pdf_parser import get_textelements_from_pdf, generate_template, get_contents_from_textelements #from pdf_parser import get_textelements_from_pdf, generate_template, get_contents_from_textelements
#import json
from pdf_to_image import get_pdf_as_image
from image_to_text import generate_text_from_image
from text_model import Page
import os
import json import json
class Match():
    """
    Position of a word inside a list of pages, given as page, block,
    line and word number, together with the word that is expected there.
    """

    def __init__(self, p_word="", p_page_number=0, p_block_number=0, p_line_number=0, p_word_number=0):
        self.word = p_word
        self.page_number = p_page_number
        self.block_number = p_block_number
        self.line_number = p_line_number
        self.word_number = p_word_number

    def validate_match(self, p_data):
        """
        Prints the word found at the stored position in p_data next to
        the expected word, separated by "<->".
        """
        found = (
            p_data[self.page_number]
            .get_block_at(self.block_number)
            .get_line_at(self.line_number)
            .get_word_at(self.word_number)
        )
        print("<->".join((str(found), str(self.word))))

    def __str__(self):
        position = (
            self.page_number,
            self.block_number,
            self.line_number,
            self.word_number,
        )
        return self.word + ": " + " > ".join(str(part) for part in position)
def is_template_fitting(p_template, p_page_list):
    """
    Checks whether the hook of a template is present in the given pages.

    The first template entry is the hook, made up of page, block, line
    and word position plus the hook word. The template fits when the
    word found at that position reads exactly like the hook word.
    """
    page_pos, block_pos, line_pos, word_pos, expected = p_template[0]
    found = (
        p_page_list[page_pos]
        .get_block_at(block_pos)
        .get_line_at(line_pos)
        .get_word_at(word_pos)
    )
    return str(found) == expected
def save_page_list_to_file(p_page_list, p_name):
    """
    Stores the json form of every page in the file "<p_name>.ptt".

    p_page_list: iterable of objects offering to_json()
    p_name: path of the target file without the ".ptt" extension
    """
    json_list = [page.to_json() for page in p_page_list]
    # The with-block closes the file even when json.dump raises.
    with open(p_name + ".ptt", "w") as out_file:
        json.dump(json_list, out_file)
def get_pdf_as_text(p_path, p_seperator="/"):
    """
    Returns the pages of a pdf as a list of Page objects.

    The part of p_path after the last p_seperator is the pdf name. When
    a file "<pdf name>.ptt" exists (looked up relative to the current
    working directory), the pages are restored from it. Otherwise the
    pdf is rendered to images, every image is turned into a page, and
    the result is stored as "<pdf name>.ptt".

    p_path: path of the pdf file
    p_seperator: separator used to split the file name off p_path
    returns: list of pages
    """
    pdf_name = p_path.split(p_seperator)[-1]
    cache_name = pdf_name + ".ptt"
    if os.path.isfile(cache_name):
        # The with-block makes sure the cache file gets closed again.
        with open(cache_name) as cache_file:
            json_list = json.load(cache_file)
        return [Page.from_json(json_page) for json_page in json_list]
    page_list = []
    for position, image in enumerate(get_pdf_as_image(p_path)):
        page = generate_text_from_image(image)
        page.number = position
        # Page serialises and restores "index", and the other functions in
        # this file read page.index, so the position has to be stored
        # there as well or it is lost.
        # NOTE(review): assumes generate_text_from_image does not already
        # assign a meaningful index - confirm.
        page.index = position
        page_list.append(page)
    save_page_list_to_file(page_list, pdf_name)
    return page_list
#def get_text_from_pdf(p_path):
# pdf_name = p_path.split("/")[-1:]
# if os.path.isfile(p_path):
def evaluate_pdf(p_path, p_template):
    """
    Prints the values a template finds in a pdf.

    The pdf is read with get_pdf_as_text. When the hook of the template
    does not fit, a message is printed and nothing else happens.
    Otherwise every word equal to one of the template keys is searched
    on all pages. For each hit, the key and the word at the position
    given by the entry of that key are printed.

    p_path: path of the pdf file
    p_template: sequence holding the hook at position 0 and a dict of
                key -> [page, block, line, word] entries at position 1
    """
    page_list = get_pdf_as_text(p_path)
    if not is_template_fitting(p_template, page_list):
        print("Template not suitable for this pdf")
        return
    print("Template fits")
    for key, offset_set in p_template[1].items():
        for page in page_list:
            for block in page.get_block_list():
                for line in block.get_line_list():
                    for word in line.get_word_list():
                        if str(word) != key:
                            continue
                        # The entries are used as offsets relative to the
                        # position of the key; the page entry
                        # (offset_set[0]) is not used, the value is always
                        # looked up on the page of the key.
                        # NOTE(review): create_template appears to store
                        # absolute block/line/word numbers in these
                        # entries - confirm which of the two is intended.
                        wanted_block = block.index - offset_set[1]
                        wanted_line = line.index - offset_set[2]
                        wanted_word = word.index - offset_set[3]
                        print(key)
                        value_block = page.get_block_at(wanted_block)
                        value_line = value_block.get_line_at(wanted_line)
                        value_word = value_line.get_word_at(wanted_word)
                        print(value_word)
def create_template(p_name, p_path, p_examples):
"""
Generates a template from example and pdf file and
stores it as json
"""
elements = get_textelements_from_pdf(p_path)
template = generate_template(p_examples, elements)
data = [p_examples, template]
with open(p_name + '.template', 'w') as outfile: def create_template(p_name, p_path, p_example, p_hook):
json.dump(data, outfile) page_list = get_pdf_as_text(p_path)
matches_dict = {}
for key in p_example:
matches_dict[key] = [None, None]
hook = None
template = {}
for page in page_list:
for block in page.get_block_list():
for line in block.get_line_list():
for word in line.get_word_list():
for example in p_example.items():
key, value = example
if str(word) == key:
key_match = Match(key)
key_match.page_number = page.index
key_match.block_number = block.index
key_match.line_number = line.index
key_match.word_number = word.index
match_pair = matches_dict[key]
match_pair[0] = key_match
matches_dict[key] = match_pair
elif str(word) == value:
value_match = Match(value)
value_match.page_number = page.index
value_match.block_number = block.index
value_match.line_number = line.index
value_match.word_number = word.index
match_pair = matches_dict[key]
match_pair[1] = value_match
matches_dict[key] = match_pair
elif str(word) == p_hook:
hook = [page.index, block.index, line.index, word.index, p_hook]
def evaluate_pdf(p_template_name, p_path): for match_pair in matches_dict.values():
""" key_match, value_match = match_pair
Analysis a pdf according to the given template if key_match is not None:
""" key_match.validate_match(page_list)
if value_match is not None:
value_match.validate_match(page_list)
if key_match is None or value_match is None:
continue
page_offset = key_match.page_number - value_match.page_number
# block_offset = key_match.block_number - value_match.block_number
# line_offset = key_match.line_number - value_match.line_number
# word_offset = key_match.word_number - value_match.word_number
print('\nEvaluating:', p_path) # offset_set = [page_offset, block_offset, line_offset, word_offset]
with open(p_template_name + '.template') as f: key = key_match.word
data = json.load(f) # [{'Rechnung': 'EB589544', 'Rechnungsdatum': '01.09.2016', 'Mandatsreferenz': '8ADB836C18A8491092EA62F7F35F8A28'}, {'Rechnung': [[125.29500000000002, 29.985000000000014], 2], 'Rechnungsdatum': [[125.29500000000002, 29.985000000000014], 0], 'Mandatsreferenz': [[125.90500000000006, 6.095000000000027], 1]}] value_position = [page_offset, value_match.block_number, value_match.line_number, value_match.word_number]
template[key] = value_position
template = data[1] # {'Rechnung': [[125.29500000000002, 29.985000000000014], 2], 'Rechnungsdatum': [[125.29500000000002, 29.985000000000014], 0], 'Mandatsreferenz': [[125.90500000000006, 6.095000000000027], 1]} return (hook, template)
textelements = get_textelements_from_pdf(p_path) # [<LTTextBoxHorizontal(0) 60.000,712.080,231.864,721.376 'sipgate GmbH - Gladbacher Str. 74 - 40219 Düsseldorf\n'>, <LTTextBoxHorizontal(1) 65.000,663.580,163.690,699.200 'Warpzone e.V.\nJan-Marten Brüggemann\nAm Hawerkamp 31\n'>, <LTTextBoxHorizontal(2) 65.000,639.470,129.350,651.660 '48155 Münster\n'>, <LTTextBoxHorizontal(3) 375.000,663.580,451.560,723.200 'Rechnungsdatum\nLeistungsdatum\nRechnungsnummer\nBezahlung per\nKundennummer\n'>, <LTTextBoxHorizontal(4) 465.000,663.470,535.590,723.660 '01.09.2016\n01.09.2016\nEB589544\nSEPA-Lastschrift\n1967957\n'>, <LTTextBoxHorizontal(5) 375.000,617.500,395.280,629.120 'Seite\n'>, <LTTextBoxHorizontal(6) 465.000,617.390,485.230,629.580 '1 / 1\n'>, <LTTextBoxHorizontal(7) 60.000,555.824,200.976,575.328 'Rechnung EB589544\n'>, <LTTextBoxHorizontal(8) 90.000,537.390,184.160,549.580 'Art.-Nr. Bezeichnung\n'>, <LTTextBoxHorizontal(9) 60.000,525.500,78.050,549.580 'Pos.\n \n'>, <LTTextBoxHorizontal(10) 280.000,537.390,307.500,549.580 'Menge\n'>, <LTTextBoxHorizontal(11) 318.000,525.500,364.840,549.580 'Einzelpreis\nnetto\n'>, <LTTextBoxHorizontal(12) 378.000,525.500,424.840,549.580 'Einzelpreis\nbrutto\n'>, <LTTextBoxHorizontal(13) 438.000,525.500,452.930,549.580 'USt\n \n'>, <LTTextBoxHorizontal(14) 468.000,525.500,520.580,549.580 'Gesamtpreis\nnetto\n'>, <LTTextBoxHorizontal(15) 60.000,489.500,67.330,501.120 ' 1\n'>, <LTTextBoxHorizontal(16) 90.000,489.500,95.000,501.120 '1\n'>, <LTTextBoxHorizontal(17) 130.000,489.500,252.050,501.120 'sipgate.de, Telefonieguthaben\n'>, <LTTextBoxHorizontal(18) 302.440,489.500,307.440,501.120 '1\n'>, <LTTextBoxHorizontal(19) 348.360,489.500,365.250,501.120 '8,40\n'>, <LTTextBoxHorizontal(20) 402.800,489.500,424.650,501.120 '10,00\n'>, <LTTextBoxHorizontal(21) 438.000,489.500,455.590,501.120 '19%\n'>, <LTTextBoxHorizontal(22) 498.950,489.500,534.210,501.120 '8,40 EUR\n'>, <LTTextBoxHorizontal(23) 310.000,447.500,408.610,459.120 'Summe 
Positionen netto\n'>, <LTTextBoxHorizontal(24) 310.000,411.500,419.970,423.120 '19% USt. auf EUR 8,40 (DE)\n'>, <LTTextBoxHorizontal(25) 492.740,447.390,531.790,459.580 '8,40 EUR\n'>, <LTTextBoxHorizontal(26) 492.780,411.390,531.830,423.580 '1,60 EUR\n'>, <LTTextBoxHorizontal(27) 310.000,369.390,383.390,381.580 'Rechnungsbetrag\n'>, <LTTextBoxHorizontal(28) 487.780,369.390,532.390,381.580 '10,00 EUR\n'>, <LTTextBoxHorizontal(29) 60.000,307.390,414.900,331.580 'Der Betrag in Höhe von 10,00 EUR wird am 06.09.2016 von Ihrem Konto abgebucht.\nInhaber: WARPZONE E.V., IBAN: DE70XXXXXXXXXXXXXXXXX3738, BIC: WELADED1MST\n'>, <LTTextBoxHorizontal(30) 60.000,283.390,311.810,295.580 'Mandatsreferenz: 8ADB836C18A8491092EA62F7F35F8A28\n'>, <LTTextBoxHorizontal(31) 60.000,223.500,485.630,271.120 'Die Umsatzsteuer wird in Höhe des in Ihrem Land geltenden Umsatzsteuersatzes in Rechnung gestellt. Der\nUmsatzsteuersatz bestimmt sich nach dem Ort der Leistung, d.h. nach dem Sitz Ihres Unternehmens bzw.\nnach Ihrem gewöhnlichen Aufenthaltsort. Sollten Ihre Rechnung (Anschrift bzw. Umsatzsteuersatz) nicht\nkorrekt sein, informieren Sie bitte unsere Kundenbetreuung unter basic@sipgate.de.\n'>, <LTTextBoxHorizontal(32) 50.000,58.000,377.192,77.296 'sipgate GmbH, Gladbacher Str. 74, 40219 Düsseldorf, HRB 39841 Düsseldorf, GF: Tim Mois, Thilo Salmon\nUSt-ID: DE219349391, Finanzamt Düsseldorf, Steuer-Nr.: 106/5724/7147, Support: basic@sipgate.de\n'>, <LTTextBoxHorizontal(33) 50.000,28.000,334.008,47.296 'Bank: Commerzbank Düsseldorf, IBAN: DE10 3004 0000 0181 1488 06, BIC: COBADEFFXXX\nGläubiger-ID: DE73ZZZ00000359204\n'>]
contents = get_contents_from_textelements(template, textelements) # None
with open("output.shmebulok", "w") as out:
json.dump(contents, out)
test.png 0 → 100644
test.png

6.51 KiB

class WordRawData():
    """
    Represents the raw-data generated by (py)tesseract

    One instance holds one data row: ten integer columns, the confidence
    and, when the row has a twelfth column, the cleaned-up text.
    """

    # Names of the first ten columns, in row order.
    _INT_FIELDS = (
        "level",
        "page",
        "block",
        "paragraph",
        "line",
        "word",
        "left",
        "top",
        "width",
        "height",
    )

    # Characters that are replaced by a plain "-" in the text: soft hyphen
    # and the unicode hyphen/dash range.
    # NOTE(review): the replace pattern reads as an empty string in the
    # reviewed source, which would put a "-" between all characters of
    # every word. Presumably an invisible dash-like character got lost -
    # confirm the intended character(s).
    _DASH_LIKE = "\u00ad\u2010\u2011\u2012\u2013\u2014\u2015"

    def __init__(self, p_data):
        """
        p_data: sequence with at least eleven columns, given as strings
                or numbers; the optional twelfth column is the text
        """
        for position, name in enumerate(self._INT_FIELDS):
            setattr(self, name, int(p_data[position]))
        # Going through float accepts fractional confidence values such as
        # "96.06" as well; the fraction is cut off.
        self.confidence = int(float(p_data[10]))
        if len(p_data) >= 12:
            self.text = self._clean_text(p_data[11])
        else:
            self.text = ""

    @classmethod
    def _clean_text(cls, p_text):
        """
        Replaces dash-like characters by "-" and removes all spaces.
        """
        for dash in cls._DASH_LIKE:
            p_text = p_text.replace(dash, "-")
        return p_text.replace(" ", "")

    def to_json(self):
        """
        Returns the columns as list, in the order __init__ expects them.
        """
        columns = [getattr(self, name) for name in self._INT_FIELDS]
        columns.append(self.confidence)
        columns.append(self.text)
        return columns

    @staticmethod
    def from_json(p_json):
        """
        Restores the raw-data from a list created by to_json.
        """
        return WordRawData(p_json)
class Word():
    """
    A single word, built from one raw-data row.

    The literal is the text of the raw-data and the index is its word
    number. Block and line start out as None and are assigned by the
    line the word is inserted into.
    """

    def __init__(self, p_data):
        raw = WordRawData(p_data)
        self.raw_data = raw
        self.literal = raw.text
        self.index = raw.word
        self.line = None
        self.block = None

    def to_json(self):
        """
        Returns the json form of the underlying raw-data.
        """
        return self.raw_data.to_json()

    @staticmethod
    def from_json(p_json):
        """
        Restores a word from a list created by to_json.
        """
        return Word(p_json)

    def __str__(self):
        return self.literal
class Line():
    """
    A line of words, kept in the order in which they were inserted.
    """

    def __init__(self, p_index):
        self.index = p_index
        self.word_list = []
        self.block = None

    def insert_word(self, p_word):
        """
        Appends the word and links it to this line and to its block.
        """
        p_word.line = self
        p_word.block = self.block
        self.word_list.append(p_word)

    def to_json(self):
        """
        Returns [index, list of the json forms of all words].
        """
        return [self.index, [word.to_json() for word in self.word_list]]

    @staticmethod
    def from_json(p_json):
        """
        Restores a line from a list created by to_json.

        The block of the restored line is None, and so is the block of
        its words.
        """
        restored = Line(p_json[0])
        for word_as_json in p_json[1]:
            restored.insert_word(Word.from_json(word_as_json))
        return restored

    def __str__(self):
        # Every word is followed by one space, the last one included.
        return "".join(str(word) + " " for word in self.word_list)

    def get_word_list(self):
        return self.word_list

    def get_word_at(self, p_index):
        """
        Returns the word at list position p_index.

        NOTE(review): this is the position inside word_list, while
        Word.index is taken from the raw-data - confirm that callers
        passing a Word.index here get the word they expect.
        """
        return self.word_list[p_index]

    def __repr__(self):
        return str(self)
class Block():
    """
    A block of lines, stored in a dict that maps line index to line.
    """

    def __init__(self, p_index):
        self.page = None
        self.index = p_index
        self.line_list = {}

    def _add_line(self, p_index):
        """
        Creates an empty line with the given index inside this block.
        """
        line = Line(p_index)
        line.block = self
        self.line_list[p_index] = line
        return line

    def insert_word(self, p_word):
        """
        Inserts the word into the line named by its raw-data, creating
        that line first when the block does not have it yet.
        """
        index = p_word.raw_data.line
        line = self.line_list.get(index)
        if line is None:
            line = self._add_line(index)
        line.insert_word(p_word)

    def get_line_list(self):
        return self.line_list.values()

    def get_line_at(self, p_index):
        """
        Returns the line stored under the index p_index.
        """
        return self.line_list[p_index]

    def to_json(self):
        """
        Returns [index, list of the json forms of all lines].
        """
        return [self.index, [line.to_json() for line in self.line_list.values()]]

    @staticmethod
    def from_json(p_json):
        """
        Restores a block from a list created by to_json.
        """
        block = Block(p_json[0])
        for line_as_json in p_json[1]:
            line = Line.from_json(line_as_json)
            line.block = block
            # Line.from_json cannot know the block yet and leaves the block
            # of its words at None. They are linked here so that restored
            # words look the same as words added through insert_word.
            for word in line.get_word_list():
                word.block = block
            block.line_list[line.index] = line
        return block

    def __str__(self):
        # Every line is followed by a line break, the last one included.
        return "".join(str(line) + "\n" for line in self.line_list.values())
class Page():
    """
    A page of blocks, stored in a dict that maps block index to block.
    """

    def __init__(self, p_index=0):
        self.block_list = {}
        self.index = p_index

    def _add_block(self, p_index):
        """
        Creates an empty block with the given index on this page.
        """
        created = Block(p_index)
        created.page = self
        self.block_list[p_index] = created
        return created

    def insert_word(self, p_word):
        """
        Inserts the word into the block named by its raw-data, creating
        that block first when the page does not have it yet.
        """
        block_index = p_word.raw_data.block
        if block_index in self.block_list:
            target = self.block_list[block_index]
        else:
            target = self._add_block(block_index)
        target.insert_word(p_word)
        self.block_list[block_index] = target

    def get_block_list(self):
        return self.block_list.values()

    def get_block_at(self, p_index):
        """
        Returns the block stored under the index p_index.
        """
        return self.block_list[p_index]

    def to_json(self):
        """
        Returns [index, list of the json forms of all blocks].
        """
        return [self.index, [block.to_json() for block in self.block_list.values()]]

    @staticmethod
    def from_json(p_json):
        """
        Restores a page from a list created by to_json.
        """
        restored = Page(p_json[0])
        for block_as_json in p_json[1]:
            block = Block.from_json(block_as_json)
            block.page = restored
            restored.block_list[block.index] = block
        return restored

    def __str__(self):
        # Every block is followed by three line breaks, the last one included.
        return "".join(str(block) + "\n\n\n" for block in self.block_list.values())
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment