changes i dont remember #yolo

069fa980 · leo · dfd53745 · 069fa980 · 069fa980 · 069fa980
Commit 069fa980 authored 6 years ago by leo
--- a/pdf_to_image.py
+++ b/pdf_to_image.py
+import PyPDF2
+import pytesseract
+from PIL import Image
+from wand.image import Image as wImage
+import io
+def _detect_image_rotated(p_image):
+    """
+    Report whether tesseract's orientation detection sees an upside-down page.
+
+    p_image is passed unchanged to pytesseract.image_to_osd. Returns True
+    when the number at the end of the second OSD output line lies strictly
+    between 160 and 200; returns False otherwise, and also when tesseract
+    raises a TesseractError.
+    """
+    try:
+        osd_report = pytesseract.image_to_osd(p_image)
+    except pytesseract.TesseractError:
+        # A failed detection is treated as "not rotated".
+        return False
+    # NOTE(review): assumes the second line holds the orientation in
+    # degrees - confirm against the installed tesseract version.
+    degrees_field = osd_report.split("\n")[1].split(" ")[-1]
+    return 160 < int(degrees_field) < 200
+def get_pdf_as_image(p_path_to_pdf):
+    """
+    Render every page of a PDF as a PIL image.
+
+    The PDF at p_path_to_pdf is opened with Wand at a resolution of 300,
+    converted to "jpg" and split into its pages. Each page is re-opened
+    with PIL from an in-memory blob; pages that _detect_image_rotated
+    reports as rotated are turned by 180 degrees.
+
+    Returns a list with one PIL image per page, in document order.
+    """
+    image_list = []
+    # Wand images wrap native resources, so every one of them is closed
+    # through a context manager as soon as its blob has been extracted.
+    with wImage(filename=p_path_to_pdf, resolution=300) as document:
+        with document.convert("jpg") as converted:
+            for img in converted.sequence:
+                with wImage(image=img) as page:
+                    blob = page.make_blob("jpg")
+                pil_page = Image.open(io.BytesIO(blob))
+                if _detect_image_rotated(pil_page):
+                    pil_page = pil_page.rotate(180)
+                image_list.append(pil_page)
+    return image_list
\ No newline at end of file
--- a/sharpness.jpg
+++ b/sharpness.jpg
--- a/template_manager.py
+++ b/template_manager.py
-from pdf_parser import get_textelements_from_pdf, generate_template, get_contents_from_textelements
+#from pdf_parser import get_textelements_from_pdf, generate_template, get_contents_from_textelements
+#import json
+from pdf_to_image import get_pdf_as_image
+from image_to_text import generate_text_from_image
+from text_model import Page
+import os
 import json
+class Match():
+    """
+    Location of one word inside a parsed document.
+
+    Holds the word text together with its page, block, line and word
+    numbers.
+    """
+    def __init__(self, p_word="", p_page_number=0, p_block_number=0, p_line_number=0, p_word_number=0):
+        self.word = p_word
+        self.page_number = p_page_number
+        self.block_number = p_block_number
+        self.line_number = p_line_number
+        self.word_number = p_word_number
+    def validate_match(self, p_data):
+        """Print the word found at the stored position next to the expected word."""
+        found = (
+            p_data[self.page_number]
+            .get_block_at(self.block_number)
+            .get_line_at(self.line_number)
+            .get_word_at(self.word_number)
+        )
+        print(str(found) + "<->" + str(self.word))
+    def __str__(self):
+        """Render as "<word>: <page> > <block> > <line> > <word number>"."""
+        position = " > ".join(
+            str(part)
+            for part in (self.page_number, self.block_number, self.line_number, self.word_number)
+        )
+        return self.word + ": " + position
+def is_template_fitting(p_template, p_page_list):
+    """
+    Check a template's hook against a parsed document.
+
+    The first entry of p_template is unpacked as
+    (page, block, line, word, expected text). Returns True when the word
+    stored at that position in p_page_list renders to the expected text.
+    """
+    page_no, block_no, line_no, word_no, expected = p_template[0]
+    found = (
+        p_page_list[page_no]
+        .get_block_at(block_no)
+        .get_line_at(line_no)
+        .get_word_at(word_no)
+    )
+    return str(found) == expected
+def save_page_list_to_file(p_page_list, p_name):
+    """
+    Write parsed pages to "<p_name>.ptt" as JSON.
+
+    Every element of p_page_list contributes the result of its to_json()
+    method; the resulting list is dumped as one JSON document. An existing
+    file of the same name is overwritten.
+    """
+    json_list = [page.to_json() for page in p_page_list]
+    # The context manager closes the file even when json.dump raises.
+    with open(p_name + ".ptt", "w") as file:
+        json.dump(json_list, file)
+def get_pdf_as_text(p_path, p_seperator="/"):
+    """
+    Return the parsed pages of a PDF, reusing a ".ptt" cache when present.
+
+    The last p_seperator-delimited component of p_path names the cache file
+    "<name>.ptt", which is looked up relative to the current working
+    directory. When it exists the pages are rebuilt from it. Otherwise the
+    PDF is rendered to images, every image is turned into a page, and the
+    result is written to the cache.
+
+    Returns a list of page objects in document order.
+    """
+    pdf_name = p_path.split(p_seperator)[-1]
+    cache_name = pdf_name + ".ptt"
+    if os.path.isfile(cache_name):
+        # Close the cache file right after reading instead of leaking it.
+        with open(cache_name) as cache_file:
+            json_list = json.load(cache_file)
+        return [Page.from_json(json_page) for json_page in json_list]
+    page_list = []
+    for i, image in enumerate(get_pdf_as_image(p_path)):
+        page = generate_text_from_image(image)
+        # Page stores and serialises its position as "index", and matches
+        # use it to index into the page list; "number" is kept for any
+        # caller that still reads it.
+        page.index = i
+        page.number = i
+        page_list.append(page)
+    save_page_list_to_file(page_list, pdf_name)
+    return page_list
+#def get_text_from_pdf(p_path):
+#    pdf_name = p_path.split("/")[-1:]
+#    if os.path.isfile(p_path):
+def evaluate_pdf(p_path, p_template):
+    """
+    Print the values a template locates in a PDF.
+
+    p_template is read as [hook, {key: offset_set}]. The hook is checked
+    with is_template_fitting first; when it does not match, a notice is
+    printed and nothing else happens. Otherwise every word equal to a
+    template key is printed, followed by the word found at the key's
+    block/line/word index minus offset_set[1], offset_set[2] and
+    offset_set[3] on the same page.
+
+    Returns None; all results go to standard output.
+    """
+    page_list = get_pdf_as_text(p_path)
+    if not is_template_fitting(p_template, page_list):
+        print("Template not suitable for this pdf")
+        return
+    print("Template fits")
+    for key, offset_set in p_template[1].items():
+        for page in page_list:
+            for block in page.get_block_list():
+                for line in block.get_line_list():
+                    for word in line.get_word_list():
+                        if str(word) != key:
+                            continue
+                        wanted_block = block.index - offset_set[1]
+                        wanted_line = line.index - offset_set[2]
+                        wanted_word = word.index - offset_set[3]
+                        print(key)
+                        try:
+                            value_block = page.get_block_at(wanted_block)
+                            value_line = value_block.get_line_at(wanted_line)
+                            value_word = value_line.get_word_at(wanted_word)
+                        except (KeyError, IndexError):
+                            # A key word can also occur where the offset
+                            # points outside the page; skip that
+                            # occurrence instead of aborting the run.
+                            continue
+                        print(value_word)
-def create_template(p_name, p_path, p_examples):
-    """
-    Generates a template from example and pdf file and
-    stores it as json
-    """
-    elements = get_textelements_from_pdf(p_path)
-    template = generate_template(p_examples, elements)
-    data = [p_examples, template]
-    with open(p_name + '.template', 'w') as outfile:
+def create_template(p_name, p_path, p_example, p_hook):
-        json.dump(data, outfile)
+    page_list = get_pdf_as_text(p_path)
+    matches_dict = {}
+    for key in p_example:
+        matches_dict[key] = [None, None]
+    hook = None
+    template = {}
+    for page in page_list:
+            for block in page.get_block_list():
+                for line in block.get_line_list():
+                    for word in line.get_word_list():
+                        for example in p_example.items():
+                            key, value = example
+                            if str(word) == key:
+                                key_match = Match(key)
+                                key_match.page_number = page.index
+                                key_match.block_number = block.index
+                                key_match.line_number = line.index
+                                key_match.word_number = word.index
+                                match_pair = matches_dict[key]
+                                match_pair[0] = key_match
+                                matches_dict[key] = match_pair
+                            elif str(word) == value:
+                                value_match = Match(value)
+                                value_match.page_number = page.index
+                                value_match.block_number = block.index
+                                value_match.line_number = line.index
+                                value_match.word_number = word.index
+                                match_pair = matches_dict[key]
+                                match_pair[1] = value_match
+                                matches_dict[key] = match_pair
+                            elif str(word) == p_hook:
+                                hook = [page.index, block.index, line.index, word.index, p_hook]
-def evaluate_pdf(p_template_name, p_path):
+    for match_pair in matches_dict.values():
-    """
+        key_match, value_match = match_pair
-    Analysis a pdf according to the given template
+        if key_match is not None:
-    """
+            key_match.validate_match(page_list)
+        if value_match is not None:
+            value_match.validate_match(page_list)
+        if key_match is None or value_match is None:
+            continue
+        page_offset = key_match.page_number - value_match.page_number
+       # block_offset = key_match.block_number - value_match.block_number
+       # line_offset = key_match.line_number - value_match.line_number
+       # word_offset = key_match.word_number - value_match.word_number
-    print('\nEvaluating:', p_path)
+       # offset_set = [page_offset, block_offset, line_offset, word_offset]
-    with open(p_template_name + '.template') as f:
+        key = key_match.word
-        data = json.load(f)  # [{'Rechnung': 'EB589544', 'Rechnungsdatum': '01.09.2016', 'Mandatsreferenz': '8ADB836C18A8491092EA62F7F35F8A28'}, {'Rechnung': [[125.29500000000002, 29.985000000000014], 2], 'Rechnungsdatum': [[125.29500000000002, 29.985000000000014], 0], 'Mandatsreferenz': [[125.90500000000006, 6.095000000000027], 1]}]
+        value_position = [page_offset, value_match.block_number, value_match.line_number, value_match.word_number]
+        template[key] = value_position
-    template = data[1]  #                                                                                                                        {'Rechnung': [[125.29500000000002, 29.985000000000014], 2], 'Rechnungsdatum': [[125.29500000000002, 29.985000000000014], 0], 'Mandatsreferenz': [[125.90500000000006, 6.095000000000027], 1]}
+    return (hook, template)
-    textelements = get_textelements_from_pdf(p_path)  # [<LTTextBoxHorizontal(0) 60.000,712.080,231.864,721.376 'sipgate GmbH - Gladbacher Str. 74 - 40219 Düsseldorf\n'>, <LTTextBoxHorizontal(1) 65.000,663.580,163.690,699.200 'Warpzone e.V.\nJan-Marten Brüggemann\nAm Hawerkamp 31\n'>, <LTTextBoxHorizontal(2) 65.000,639.470,129.350,651.660 '48155 Münster\n'>, <LTTextBoxHorizontal(3) 375.000,663.580,451.560,723.200 'Rechnungsdatum\nLeistungsdatum\nRechnungsnummer\nBezahlung per\nKundennummer\n'>, <LTTextBoxHorizontal(4) 465.000,663.470,535.590,723.660 '01.09.2016\n01.09.2016\nEB589544\nSEPA-Lastschrift\n1967957\n'>, <LTTextBoxHorizontal(5) 375.000,617.500,395.280,629.120 'Seite\n'>, <LTTextBoxHorizontal(6) 465.000,617.390,485.230,629.580 '1 / 1\n'>, <LTTextBoxHorizontal(7) 60.000,555.824,200.976,575.328 'Rechnung EB589544\n'>, <LTTextBoxHorizontal(8) 90.000,537.390,184.160,549.580 'Art.-Nr. Bezeichnung\n'>, <LTTextBoxHorizontal(9) 60.000,525.500,78.050,549.580 'Pos.\n \n'>, <LTTextBoxHorizontal(10) 280.000,537.390,307.500,549.580 'Menge\n'>, <LTTextBoxHorizontal(11) 318.000,525.500,364.840,549.580 'Einzelpreis\nnetto\n'>, <LTTextBoxHorizontal(12) 378.000,525.500,424.840,549.580 'Einzelpreis\nbrutto\n'>, <LTTextBoxHorizontal(13) 438.000,525.500,452.930,549.580 'USt\n \n'>, <LTTextBoxHorizontal(14) 468.000,525.500,520.580,549.580 'Gesamtpreis\nnetto\n'>, <LTTextBoxHorizontal(15) 60.000,489.500,67.330,501.120 ' 1\n'>, <LTTextBoxHorizontal(16) 90.000,489.500,95.000,501.120 '1\n'>, <LTTextBoxHorizontal(17) 130.000,489.500,252.050,501.120 'sipgate.de, Telefonieguthaben\n'>, <LTTextBoxHorizontal(18) 302.440,489.500,307.440,501.120 '1\n'>, <LTTextBoxHorizontal(19) 348.360,489.500,365.250,501.120 '8,40\n'>, <LTTextBoxHorizontal(20) 402.800,489.500,424.650,501.120 '10,00\n'>, <LTTextBoxHorizontal(21) 438.000,489.500,455.590,501.120 '19%\n'>, <LTTextBoxHorizontal(22) 498.950,489.500,534.210,501.120 '8,40 EUR\n'>, <LTTextBoxHorizontal(23) 310.000,447.500,408.610,459.120 'Summe 
Positionen netto\n'>, <LTTextBoxHorizontal(24) 310.000,411.500,419.970,423.120 '19% USt. auf EUR 8,40 (DE)\n'>, <LTTextBoxHorizontal(25) 492.740,447.390,531.790,459.580 '8,40 EUR\n'>, <LTTextBoxHorizontal(26) 492.780,411.390,531.830,423.580 '1,60 EUR\n'>, <LTTextBoxHorizontal(27) 310.000,369.390,383.390,381.580 'Rechnungsbetrag\n'>, <LTTextBoxHorizontal(28) 487.780,369.390,532.390,381.580 '10,00 EUR\n'>, <LTTextBoxHorizontal(29) 60.000,307.390,414.900,331.580 'Der Betrag in Höhe von 10,00 EUR wird am 06.09.2016 von Ihrem Konto abgebucht.\nInhaber: WARPZONE E.V., IBAN: DE70XXXXXXXXXXXXXXXXX3738, BIC: WELADED1MST\n'>, <LTTextBoxHorizontal(30) 60.000,283.390,311.810,295.580 'Mandatsreferenz: 8ADB836C18A8491092EA62F7F35F8A28\n'>, <LTTextBoxHorizontal(31) 60.000,223.500,485.630,271.120 'Die Umsatzsteuer wird in Höhe des in Ihrem Land geltenden Umsatzsteuersatzes in Rechnung gestellt. Der\nUmsatzsteuersatz bestimmt sich nach dem Ort der Leistung, d.h. nach dem Sitz Ihres Unternehmens bzw.\nnach Ihrem gewöhnlichen Aufenthaltsort. Sollten Ihre Rechnung (Anschrift bzw. Umsatzsteuersatz) nicht\nkorrekt sein, informieren Sie bitte unsere Kundenbetreuung unter basic@sipgate.de.\n'>, <LTTextBoxHorizontal(32) 50.000,58.000,377.192,77.296 'sipgate GmbH, Gladbacher Str. 74, 40219 Düsseldorf, HRB 39841 Düsseldorf, GF: Tim Mois, Thilo Salmon\nUSt-ID: DE219349391, Finanzamt Düsseldorf, Steuer-Nr.: 106/5724/7147, Support: basic@sipgate.de\n'>, <LTTextBoxHorizontal(33) 50.000,28.000,334.008,47.296 'Bank: Commerzbank Düsseldorf, IBAN: DE10 3004 0000 0181 1488 06, BIC: COBADEFFXXX\nGläubiger-ID: DE73ZZZ00000359204\n'>]
-    contents = get_contents_from_textelements(template, textelements)  # None
-    with open("output.shmebulok", "w") as out:
-        json.dump(contents, out)
--- a/test.png
+++ b/test.png
--- a/text_model.py
+++ b/text_model.py
+class WordRawData():
+    """
+    Represents the raw-data generated by (py)tesseract
+
+    p_data is a sequence whose first eleven entries are converted to
+    integers, in this order: level, page, block, paragraph, line, word,
+    left, top, width, height, confidence. An optional twelfth entry is the
+    recognised text; it is stored with em dashes replaced by "-" and all
+    spaces removed, and defaults to "" when absent.
+    """
+    def __init__(self, p_data):
+        self.level = int(p_data[0])
+        self.page = int(p_data[1])
+        self.block = int(p_data[2])
+        self.paragraph = int(p_data[3])
+        self.line = int(p_data[4])
+        self.word = int(p_data[5])
+        self.left = int(p_data[6])
+        self.top = int(p_data[7])
+        self.width = int(p_data[8])
+        self.height = int(p_data[9])
+        # Going through float also accepts fractional confidence strings
+        # such as "96.5", which int() alone rejects; the fraction is
+        # dropped. Whole-number inputs yield the same value as before.
+        self.confidence = int(float(p_data[10]))
+        if len(p_data) >= 12:
+            self.text = p_data[11].replace("—", "-").replace(" ", "")
+        else:
+            self.text = ""
+    def to_json(self):
+        """Return the twelve fields as a list, in constructor order."""
+        return [
+            self.level,
+            self.page,
+            self.block,
+            self.paragraph,
+            self.line,
+            self.word,
+            self.left,
+            self.top,
+            self.width,
+            self.height,
+            self.confidence,
+            self.text
+        ]
+    @staticmethod
+    def from_json(p_json):
+        """Rebuild an instance from a list produced by to_json."""
+        return WordRawData(p_json)
+class Word():
+    """
+    A single word backed by its raw record.
+
+    literal is the record's text and index its word number; block and line
+    start as None and are assigned by the containers the word is put into.
+    """
+    def __init__(self, p_data):
+        raw = WordRawData(p_data)
+        self.raw_data = raw
+        self.literal = raw.text
+        self.index = raw.word
+        self.block = None
+        self.line = None
+    def to_json(self):
+        """Serialise as the JSON form of the raw record."""
+        raw = self.raw_data
+        return raw.to_json()
+    @staticmethod
+    def from_json(p_json):
+        """Rebuild a word from the list produced by to_json."""
+        restored = Word(p_json)
+        return restored
+    def __str__(self):
+        return self.literal
+class Line():
+    """
+    An ordered run of words that belongs to a block.
+
+    index is the line's number, block the owning block (None until one is
+    assigned) and word_list the words in insertion order.
+    """
+    def __init__(self, p_index):
+        self.block = None
+        self.index = p_index
+        self.word_list = []
+    def insert_word(self, p_word):
+        """Append p_word and point it back at this line and the line's block."""
+        self.word_list.append(p_word)
+        p_word.line, p_word.block = self, self.block
+    def to_json(self):
+        """Serialise as [index, [word JSON, ...]]."""
+        return [self.index, [entry.to_json() for entry in self.word_list]]
+    @staticmethod
+    def from_json(p_json):
+        """Rebuild a line and its words from the list produced by to_json."""
+        restored = Line(p_json[0])
+        for word_as_json in p_json[1]:
+            entry = Word.from_json(word_as_json)
+            entry.line = restored
+            entry.block = restored.block
+            restored.word_list.append(entry)
+        return restored
+    def __str__(self):
+        """Render every word followed by a single space."""
+        return "".join(str(entry) + " " for entry in self.word_list)
+    def get_word_list(self):
+        return self.word_list
+    def get_word_at(self, p_index):
+        return self.word_list[p_index]
+    def __repr__(self):
+        return str(self)
+class Block():
+    """
+    A group of lines that belongs to a page.
+
+    index is the block's number, page the owning page (None until one is
+    assigned) and line_list a dict that maps line index to line.
+    """
+    def __init__(self, p_index):
+        self.page = None
+        self.index = p_index
+        self.line_list = {}
+    def _add_line(self, p_index):
+        """Create an empty line for p_index, register it and return it."""
+        line = Line(p_index)
+        line.block = self
+        self.line_list[p_index] = line
+        return line
+    def insert_word(self, p_word):
+        """Hand p_word to the line named by its raw data, creating the line when missing."""
+        index = p_word.raw_data.line
+        line = self.line_list.get(index)
+        if line is None:
+            line = self._add_line(index)
+        line.insert_word(p_word)
+    def get_line_list(self):
+        return self.line_list.values()
+    def get_line_at(self, p_index):
+        return self.line_list[p_index]
+    def to_json(self):
+        """Serialise as [index, [line JSON, ...]]."""
+        return [self.index, [line.to_json() for line in self.line_list.values()]]
+    @staticmethod
+    def from_json(p_json):
+        """Rebuild a block, its lines and their words from the list produced by to_json."""
+        block = Block(p_json[0])
+        for line_as_json in p_json[1]:
+            line = Line.from_json(line_as_json)
+            line.block = block
+            # Line.from_json runs before the line has a block, so its words
+            # still carry block=None; link them here so that restored words
+            # match words added through insert_word.
+            for word in line.get_word_list():
+                word.block = block
+            block.line_list[line.index] = line
+        return block
+    def __str__(self):
+        """Render every line followed by a newline."""
+        return "".join(str(line) + "\n" for line in self.line_list.values())
+class Page():
+    """
+    A page made of blocks.
+
+    index is the page's number (0 by default) and block_list a dict that
+    maps block index to block.
+    """
+    def __init__(self, p_index=0):
+        self.index = p_index
+        self.block_list = {}
+    def _add_block(self, p_index):
+        """Create an empty block for p_index, register it and return it."""
+        new_block = Block(p_index)
+        new_block.page = self
+        self.block_list[p_index] = new_block
+        return new_block
+    def insert_word(self, p_word):
+        """Hand p_word to the block named by its raw data, creating the block when missing."""
+        block_no = p_word.raw_data.block
+        try:
+            target = self.block_list[block_no]
+        except KeyError:
+            target = self._add_block(block_no)
+        target.insert_word(p_word)
+    def get_block_list(self):
+        return self.block_list.values()
+    def get_block_at(self, p_index):
+        return self.block_list[p_index]
+    def to_json(self):
+        """Serialise as [index, [block JSON, ...]]."""
+        return [self.index, [member.to_json() for member in self.block_list.values()]]
+    @staticmethod
+    def from_json(p_json):
+        """Rebuild a page and its blocks from the list produced by to_json."""
+        restored = Page(p_json[0])
+        for block_as_json in p_json[1]:
+            member = Block.from_json(block_as_json)
+            member.page = restored
+            restored.block_list[member.index] = member
+        return restored
+    def __str__(self):
+        """Render every block followed by three newlines."""
+        return "".join(str(member) + "\n\n\n" for member in self.block_list.values())
\ No newline at end of file