Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
W
warpDF-analyser
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container Registry
Model registry
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
leo
warpDF-analyser
Commits
069fa980
Commit
069fa980
authored
6 years ago
by
leo
Browse files
Options
Downloads
Patches
Plain Diff
changes i dont remember #yolo
parent
dfd53745
No related branches found
No related tags found
No related merge requests found
Changes
25
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
pdf_to_image.py
+32
-0
32 additions, 0 deletions
pdf_to_image.py
sharpness.jpg
+0
-0
0 additions, 0 deletions
sharpness.jpg
template_manager.py
+143
-23
143 additions, 23 deletions
template_manager.py
test.png
+0
-0
0 additions, 0 deletions
test.png
text_model.py
+211
-0
211 additions, 0 deletions
text_model.py
with
386 additions
and
23 deletions
pdf_to_image.py
0 → 100644
+
32
−
0
View file @
069fa980
import
PyPDF2
import
pytesseract
from
PIL
import
Image
from
wand.image
import
Image
as
wImage
import
io
def _detect_image_rotated(p_image):
    """
    Tells whether Tesseract's orientation detection reports the image as
    being (roughly) upside down.

    p_image is handed unchanged to pytesseract.image_to_osd().

    Returns True when the reported orientation lies strictly between
    160 and 200 degrees, otherwise False. False is also returned when
    Tesseract raises a TesseractError or when no orientation value can be
    parsed from its output.
    """
    try:
        osd_lines = pytesseract.image_to_osd(p_image).split("\n")
    except pytesseract.TesseractError:
        # Detection failed: treat the image as not rotated.
        return False

    # Prefer the labelled line over a fixed position; keep the historic
    # "second line" lookup as a fallback so known-good output still works.
    fallback = osd_lines[1] if len(osd_lines) > 1 else ""
    orientation_line = next(
        (entry for entry in osd_lines
         if entry.startswith("Orientation in degrees")),
        fallback,
    )

    try:
        orientation = int(orientation_line.split(" ")[-1])
    except ValueError:
        # Unparseable output is handled like a failed detection.
        return False

    return 160 < orientation < 200
def get_pdf_as_image(p_path_to_pdf):
    """
    Renders a pdf into a list of PIL images, one per entry of the
    converted document's sequence.

    The file is opened with Wand at a resolution of 300 and converted to
    "jpg". Every page is exported as a jpg blob and re-opened with PIL.
    Pages that _detect_image_rotated() reports as rotated are turned by
    180 degrees before being added to the result.

    Returns the list of PIL images in sequence order.
    """
    image_list = []
    # Wand images are released deterministically by using them as context
    # managers instead of relying on garbage collection.
    with wImage(filename=p_path_to_pdf, resolution=300) as document:
        with document.convert("jpg") as converted:
            for frame in converted.sequence:
                with wImage(image=frame) as page:
                    blob = page.make_blob("jpg")
                # The blob is plain bytes, so the PIL image stays usable
                # after the Wand page has been closed.
                pil_page = Image.open(io.BytesIO(blob))
                if _detect_image_rotated(pil_page):
                    pil_page = pil_page.rotate(180)
                image_list.append(pil_page)
    return image_list
\ No newline at end of file
This diff is collapsed.
Click to expand it.
sharpness.jpg
0 → 100644
+
0
−
0
View file @
069fa980
172 KiB
This diff is collapsed.
Click to expand it.
template_manager.py
+
143
−
23
View file @
069fa980
from
pdf_parser
import
get_textelements_from_pdf
,
generate_template
,
get_contents_from_textelements
#from pdf_parser import get_textelements_from_pdf, generate_template, get_contents_from_textelements
#import json
from
pdf_to_image
import
get_pdf_as_image
from
image_to_text
import
generate_text_from_image
from
text_model
import
Page
import
os
import
json
import
json
class Match():
    """
    A word together with the page, block, line and word numbers that
    describe where it is located.
    """

    def __init__(self, p_word="", p_page_number=0, p_block_number=0,
                 p_line_number=0, p_word_number=0):
        self.word = p_word
        self.page_number = p_page_number
        self.block_number = p_block_number
        self.line_number = p_line_number
        self.word_number = p_word_number

    def validate_match(self, p_data):
        """
        Looks up the word stored at this match's position in p_data and
        prints it next to the expected word. Nothing is returned.
        """
        found = (
            p_data[self.page_number]
            .get_block_at(self.block_number)
            .get_line_at(self.line_number)
            .get_word_at(self.word_number)
        )
        print(str(found) + "<->" + str(self.word))

    def __str__(self):
        # Rendered as word:page>block>line>word
        position = ">".join(
            str(number)
            for number in (
                self.page_number,
                self.block_number,
                self.line_number,
                self.word_number,
            )
        )
        return self.word + ":" + position
def is_template_fitting(p_template, p_page_list):
    """
    Checks whether the hook word of a template is found at its recorded
    position in the given pages.

    p_template[0] is the hook: a sequence of
    (page, block, line, word, hook_word). p_page_list is indexed with the
    page entry; block, line and word are resolved through get_block_at(),
    get_line_at() and get_word_at().

    Returns True when the word at that position, converted with str(),
    equals hook_word. Returns False when it differs, when the template
    has no hook (None), or when the position does not exist in the pages.
    """
    hook = p_template[0]
    if hook is None:
        # A template without a hook cannot be matched against anything.
        return False

    wanted_page, wanted_block, wanted_line, wanted_word, hook_word = hook
    try:
        page = p_page_list[wanted_page]
        block = page.get_block_at(wanted_block)
        line = block.get_line_at(wanted_line)
        word = line.get_word_at(wanted_word)
    except (KeyError, IndexError):
        # The document has no element at the hook position, so the
        # template does not fit; this is not an error condition.
        return False
    return str(word) == hook_word
def save_page_list_to_file(p_page_list, p_name):
    """
    Serialises every page of p_page_list with its to_json() method and
    writes the resulting list as json to the file p_name + ".ptt".

    An existing file of that name is overwritten.
    """
    json_list = [page.to_json() for page in p_page_list]
    # The context manager closes the file even if json.dump() raises.
    with open(p_name + ".ptt", "w") as out_file:
        json.dump(json_list, out_file)
def get_pdf_as_text(p_path, p_seperator="/"):
    """
    Returns the pages of a pdf as a list of Page objects.

    The last component of p_path (split at p_seperator) is used as the
    name of a cache file "<name>.ptt", looked up relative to the current
    working directory. If that file exists, the pages are restored from
    it with Page.from_json(). Otherwise the pdf is rendered with
    get_pdf_as_image(), every image is turned into a page with
    generate_text_from_image(), and the result is stored through
    save_page_list_to_file() for the next call.
    """
    pdf_name = p_path.split(p_seperator)[-1]
    cache_path = pdf_name + ".ptt"

    if os.path.isfile(cache_path):
        # Close the cache file as soon as it has been parsed.
        with open(cache_path) as cache_file:
            json_list = json.load(cache_file)
        return [Page.from_json(json_page) for json_page in json_list]

    page_list = []
    for i, image in enumerate(get_pdf_as_image(p_path)):
        page = generate_text_from_image(image)
        page.number = i
        # Page serialises and exposes its position as "index"; keep it in
        # sync with the list position so it survives the json round trip.
        page.index = i
        page_list.append(page)
    save_page_list_to_file(page_list, pdf_name)
    return page_list
#def get_text_from_pdf(p_path):
# pdf_name = p_path.split("/")[-1:]
# if os.path.isfile(p_path):
def evaluate_pdf(p_path, p_template):
    """
    Prints the values a template extracts from a pdf.

    The pdf is read with get_pdf_as_text(). If is_template_fitting()
    rejects the template, a message is printed and nothing else happens.
    Otherwise, for every (key, offset_set) entry of p_template[1], each
    word whose text equals the key is searched on every page. For each
    hit the key is printed; the value position is computed by subtracting
    offset_set[1], offset_set[2] and offset_set[3] from the hit's block,
    line and word index, and the word found there on the same page is
    printed as well.

    Hits whose computed value position does not exist are skipped after
    the key has been printed. Nothing is returned.
    """
    page_list = get_pdf_as_text(p_path)
    if not is_template_fitting(p_template, page_list):
        print("Template not suitable for this pdf")
        return
    print("Template fits")

    for key, offset_set in p_template[1].items():
        for page in page_list:
            for block in page.get_block_list():
                for line in block.get_line_list():
                    for word in line.get_word_list():
                        if str(word) != key:
                            continue
                        wanted_block = block.index - offset_set[1]
                        wanted_line = line.index - offset_set[2]
                        wanted_word = word.index - offset_set[3]
                        print(key)
                        if wanted_word < 0:
                            # Words are kept in a list; a negative index
                            # would silently wrap around to the line end.
                            continue
                        try:
                            value_block = page.get_block_at(wanted_block)
                            value_line = value_block.get_line_at(wanted_line)
                            value_word = value_line.get_word_at(wanted_word)
                        except (KeyError, IndexError):
                            # No element at the computed position: skip
                            # this hit instead of aborting the whole run.
                            continue
                        print(value_word)
def
create_template
(
p_name
,
p_path
,
p_examples
):
"""
Generates a template from example and pdf file and
stores it as json
"""
elements
=
get_textelements_from_pdf
(
p_path
)
template
=
generate_template
(
p_examples
,
elements
)
data
=
[
p_examples
,
template
]
with
open
(
p_name
+
'
.template
'
,
'
w
'
)
as
outfile
:
def
create_template
(
p_name
,
p_path
,
p_example
,
p_hook
)
:
json
.
dump
(
data
,
outfile
)
page_list
=
get_pdf_as_text
(
p_path
)
matches_dict
=
{}
for
key
in
p_example
:
matches_dict
[
key
]
=
[
None
,
None
]
hook
=
None
template
=
{}
for
page
in
page_list
:
for
block
in
page
.
get_block_list
():
for
line
in
block
.
get_line_list
():
for
word
in
line
.
get_word_list
():
for
example
in
p_example
.
items
():
key
,
value
=
example
if
str
(
word
)
==
key
:
key_match
=
Match
(
key
)
key_match
.
page_number
=
page
.
index
key_match
.
block_number
=
block
.
index
key_match
.
line_number
=
line
.
index
key_match
.
word_number
=
word
.
index
match_pair
=
matches_dict
[
key
]
match_pair
[
0
]
=
key_match
matches_dict
[
key
]
=
match_pair
elif
str
(
word
)
==
value
:
value_match
=
Match
(
value
)
value_match
.
page_number
=
page
.
index
value_match
.
block_number
=
block
.
index
value_match
.
line_number
=
line
.
index
value_match
.
word_number
=
word
.
index
match_pair
=
matches_dict
[
key
]
match_pair
[
1
]
=
value_match
matches_dict
[
key
]
=
match_pair
elif
str
(
word
)
==
p_hook
:
hook
=
[
page
.
index
,
block
.
index
,
line
.
index
,
word
.
index
,
p_hook
]
def
evaluate_pdf
(
p_template_name
,
p_path
):
for
match_pair
in
matches_dict
.
values
():
"""
key_match
,
value_match
=
match_pair
Analysis a pdf according to the given template
if
key_match
is
not
None
:
"""
key_match
.
validate_match
(
page_list
)
if
value_match
is
not
None
:
value_match
.
validate_match
(
page_list
)
if
key_match
is
None
or
value_match
is
None
:
continue
page_offset
=
key_match
.
page_number
-
value_match
.
page_number
# block_offset = key_match.block_number - value_match.block_number
# line_offset = key_match.line_number - value_match.line_number
# word_offset = key_match.word_number - value_match.word_number
print
(
'
\n
Evaluating:
'
,
p_path
)
# offset_set = [page_offset, block_offset, line_offset, word_offset]
with
open
(
p_template_name
+
'
.template
'
)
as
f
:
key
=
key_match
.
word
data
=
json
.
load
(
f
)
# [{'Rechnung': 'EB589544', 'Rechnungsdatum': '01.09.2016', 'Mandatsreferenz': '8ADB836C18A8491092EA62F7F35F8A28'}, {'Rechnung': [[125.29500000000002, 29.985000000000014], 2], 'Rechnungsdatum': [[125.29500000000002, 29.985000000000014], 0], 'Mandatsreferenz': [[125.90500000000006, 6.095000000000027], 1]}]
value_position
=
[
page_offset
,
value_match
.
block_number
,
value_match
.
line_number
,
value_match
.
word_number
]
template
[
key
]
=
value_position
template
=
data
[
1
]
# {'Rechnung': [[125.29500000000002, 29.985000000000014], 2], 'Rechnungsdatum': [[125.29500000000002, 29.985000000000014], 0], 'Mandatsreferenz': [[125.90500000000006, 6.095000000000027], 1]}
return
(
hook
,
template
)
textelements
=
get_textelements_from_pdf
(
p_path
)
# [<LTTextBoxHorizontal(0) 60.000,712.080,231.864,721.376 'sipgate GmbH - Gladbacher Str. 74 - 40219 Düsseldorf\n'>, <LTTextBoxHorizontal(1) 65.000,663.580,163.690,699.200 'Warpzone e.V.\nJan-Marten Brüggemann\nAm Hawerkamp 31\n'>, <LTTextBoxHorizontal(2) 65.000,639.470,129.350,651.660 '48155 Münster\n'>, <LTTextBoxHorizontal(3) 375.000,663.580,451.560,723.200 'Rechnungsdatum\nLeistungsdatum\nRechnungsnummer\nBezahlung per\nKundennummer\n'>, <LTTextBoxHorizontal(4) 465.000,663.470,535.590,723.660 '01.09.2016\n01.09.2016\nEB589544\nSEPA-Lastschrift\n1967957\n'>, <LTTextBoxHorizontal(5) 375.000,617.500,395.280,629.120 'Seite\n'>, <LTTextBoxHorizontal(6) 465.000,617.390,485.230,629.580 '1 / 1\n'>, <LTTextBoxHorizontal(7) 60.000,555.824,200.976,575.328 'Rechnung EB589544\n'>, <LTTextBoxHorizontal(8) 90.000,537.390,184.160,549.580 'Art.-Nr. Bezeichnung\n'>, <LTTextBoxHorizontal(9) 60.000,525.500,78.050,549.580 'Pos.\n \n'>, <LTTextBoxHorizontal(10) 280.000,537.390,307.500,549.580 'Menge\n'>, <LTTextBoxHorizontal(11) 318.000,525.500,364.840,549.580 'Einzelpreis\nnetto\n'>, <LTTextBoxHorizontal(12) 378.000,525.500,424.840,549.580 'Einzelpreis\nbrutto\n'>, <LTTextBoxHorizontal(13) 438.000,525.500,452.930,549.580 'USt\n \n'>, <LTTextBoxHorizontal(14) 468.000,525.500,520.580,549.580 'Gesamtpreis\nnetto\n'>, <LTTextBoxHorizontal(15) 60.000,489.500,67.330,501.120 ' 1\n'>, <LTTextBoxHorizontal(16) 90.000,489.500,95.000,501.120 '1\n'>, <LTTextBoxHorizontal(17) 130.000,489.500,252.050,501.120 'sipgate.de, Telefonieguthaben\n'>, <LTTextBoxHorizontal(18) 302.440,489.500,307.440,501.120 '1\n'>, <LTTextBoxHorizontal(19) 348.360,489.500,365.250,501.120 '8,40\n'>, <LTTextBoxHorizontal(20) 402.800,489.500,424.650,501.120 '10,00\n'>, <LTTextBoxHorizontal(21) 438.000,489.500,455.590,501.120 '19%\n'>, <LTTextBoxHorizontal(22) 498.950,489.500,534.210,501.120 '8,40 EUR\n'>, <LTTextBoxHorizontal(23) 310.000,447.500,408.610,459.120 'Summe Positionen netto\n'>, <LTTextBoxHorizontal(24) 
310.000,411.500,419.970,423.120 '19% USt. auf EUR 8,40 (DE)\n'>, <LTTextBoxHorizontal(25) 492.740,447.390,531.790,459.580 '8,40 EUR\n'>, <LTTextBoxHorizontal(26) 492.780,411.390,531.830,423.580 '1,60 EUR\n'>, <LTTextBoxHorizontal(27) 310.000,369.390,383.390,381.580 'Rechnungsbetrag\n'>, <LTTextBoxHorizontal(28) 487.780,369.390,532.390,381.580 '10,00 EUR\n'>, <LTTextBoxHorizontal(29) 60.000,307.390,414.900,331.580 'Der Betrag in Höhe von 10,00 EUR wird am 06.09.2016 von Ihrem Konto abgebucht.\nInhaber: WARPZONE E.V., IBAN: DE70XXXXXXXXXXXXXXXXX3738, BIC: WELADED1MST\n'>, <LTTextBoxHorizontal(30) 60.000,283.390,311.810,295.580 'Mandatsreferenz: 8ADB836C18A8491092EA62F7F35F8A28\n'>, <LTTextBoxHorizontal(31) 60.000,223.500,485.630,271.120 'Die Umsatzsteuer wird in Höhe des in Ihrem Land geltenden Umsatzsteuersatzes in Rechnung gestellt. Der\nUmsatzsteuersatz bestimmt sich nach dem Ort der Leistung, d.h. nach dem Sitz Ihres Unternehmens bzw.\nnach Ihrem gewöhnlichen Aufenthaltsort. Sollten Ihre Rechnung (Anschrift bzw. Umsatzsteuersatz) nicht\nkorrekt sein, informieren Sie bitte unsere Kundenbetreuung unter basic@sipgate.de.\n'>, <LTTextBoxHorizontal(32) 50.000,58.000,377.192,77.296 'sipgate GmbH, Gladbacher Str. 74, 40219 Düsseldorf, HRB 39841 Düsseldorf, GF: Tim Mois, Thilo Salmon\nUSt-ID: DE219349391, Finanzamt Düsseldorf, Steuer-Nr.: 106/5724/7147, Support: basic@sipgate.de\n'>, <LTTextBoxHorizontal(33) 50.000,28.000,334.008,47.296 'Bank: Commerzbank Düsseldorf, IBAN: DE10 3004 0000 0181 1488 06, BIC: COBADEFFXXX\nGläubiger-ID: DE73ZZZ00000359204\n'>]
contents
=
get_contents_from_textelements
(
template
,
textelements
)
# None
with
open
(
"
output.shmebulok
"
,
"
w
"
)
as
out
:
json
.
dump
(
contents
,
out
)
This diff is collapsed.
Click to expand it.
test.png
0 → 100644
+
0
−
0
View file @
069fa980
6.51 KiB
This diff is collapsed.
Click to expand it.
text_model.py
0 → 100644
+
211
−
0
View file @
069fa980
class WordRawData():
    """
    Represents the raw-data generated by (py)tesseract

    p_data is a sequence whose first eleven entries are, in this order:
    level, page, block, paragraph, line, word, left, top, width, height
    and confidence. An optional twelfth entry holds the text. The same
    layout is produced by to_json() and accepted by from_json().
    """

    def __init__(self, p_data):
        self.level = int(p_data[0])
        self.page = int(p_data[1])
        self.block = int(p_data[2])
        self.paragraph = int(p_data[3])
        self.line = int(p_data[4])
        self.word = int(p_data[5])
        self.left = int(p_data[6])
        self.top = int(p_data[7])
        self.width = int(p_data[8])
        self.height = int(p_data[9])
        # The confidence may arrive as a decimal string such as "96.06"
        # (NOTE(review): seen with newer Tesseract releases - confirm);
        # int() alone rejects that, so go through float() and truncate.
        self.confidence = int(float(p_data[10]))
        if len(p_data) >= 12:
            # Normalise em dashes to hyphens and drop spaces.
            # NOTE(review): the second replace() target is assumed to be
            # a single space - confirm against the repository.
            self.text = p_data[11].replace("—", "-").replace(" ", "")
        else:
            # Rows without a text column get an empty text.
            self.text = ""

    def to_json(self):
        """
        Returns the fields as a list in the order __init__ expects.
        """
        return [
            self.level,
            self.page,
            self.block,
            self.paragraph,
            self.line,
            self.word,
            self.left,
            self.top,
            self.width,
            self.height,
            self.confidence,
            self.text,
        ]

    @staticmethod
    def from_json(p_json):
        """
        Rebuilds a WordRawData from the list produced by to_json().
        """
        return WordRawData(p_json)
class Word():
    """
    Represents a word.

    Wraps a WordRawData built from p_data and exposes its text as
    "literal" and its word number as "index". "block" and "line" start
    out as None and are filled in by the containers holding the word.
    """

    def __init__(self, p_data):
        raw = WordRawData(p_data)
        self.raw_data = raw
        self.literal = raw.text
        self.index = raw.word
        self.block = None
        self.line = None

    def to_json(self):
        """
        Returns the json form of the underlying raw data.
        """
        raw = self.raw_data
        return raw.to_json()

    @staticmethod
    def from_json(p_json):
        """
        Rebuilds a Word from the list produced by to_json().
        """
        restored = Word(p_json)
        return restored

    def __str__(self):
        return self.literal
class Line():
    """
    An indexed line holding its words in insertion order.
    """

    def __init__(self, p_index):
        self.block = None
        self.index = p_index
        self.word_list = []

    def insert_word(self, p_word):
        """
        Appends p_word and points its line and block at this line and
        this line's block.
        """
        self.word_list.append(p_word)
        p_word.line, p_word.block = self, self.block

    def to_json(self):
        """
        Returns [index, [json of every word]].
        """
        return [self.index, [entry.to_json() for entry in self.word_list]]

    @staticmethod
    def from_json(p_json):
        """
        Rebuilds a Line and its words from the list produced by to_json().
        """
        restored = Line(p_json[0])
        for word_as_json in p_json[1]:
            entry = Word.from_json(word_as_json)
            entry.line = restored
            entry.block = restored.block
            restored.word_list.append(entry)
        return restored

    def __str__(self):
        # Every word is followed by one space, including the last one.
        return "".join(str(entry) + " " for entry in self.word_list)

    def get_word_list(self):
        return self.word_list

    def get_word_at(self, p_index):
        return self.word_list[p_index]

    def __repr__(self):
        return str(self)
class Block():
    """
    An indexed block holding its lines in a dict keyed by line index.
    """

    def __init__(self, p_index):
        self.page = None
        self.index = p_index
        self.line_list = {}

    def _add_line(self, p_index):
        """
        Creates an empty line for p_index, registers it and returns it.
        """
        line = Line(p_index)
        line.block = self
        self.line_list[p_index] = line
        return line

    def insert_word(self, p_word):
        """
        Adds p_word to the line named by p_word.raw_data.line, creating
        that line first if this block does not have it yet.
        """
        index = p_word.raw_data.line
        if index in self.line_list:
            line = self.line_list[index]
        else:
            line = self._add_line(index)
        line.insert_word(p_word)

    def get_line_list(self):
        return self.line_list.values()

    def get_line_at(self, p_index):
        return self.line_list[p_index]

    def to_json(self):
        """
        Returns [index, [json of every line]].
        """
        return [self.index, [line.to_json() for line in self.line_list.values()]]

    @staticmethod
    def from_json(p_json):
        """
        Rebuilds a Block and its lines from the list produced by to_json().
        """
        block = Block(p_json[0])
        for line_as_json in p_json[1]:
            line = Line.from_json(line_as_json)
            line.block = block
            # The line had no block yet while its words were restored, so
            # hand the block to the words here; insert_word() sets it too.
            for word in line.get_word_list():
                word.block = block
            block.line_list[line.index] = line
        return block

    def __str__(self):
        # Every line is followed by a newline, including the last one.
        return "".join(str(line) + "\n" for line in self.line_list.values())
class Page():
    """
    An indexed page holding its blocks in a dict keyed by block index.
    """

    def __init__(self, p_index=0):
        self.index = p_index
        self.block_list = {}

    def _add_block(self, p_index):
        """
        Creates an empty block for p_index, registers it and returns it.
        """
        created = Block(p_index)
        created.page = self
        self.block_list[p_index] = created
        return created

    def insert_word(self, p_word):
        """
        Adds p_word to the block named by p_word.raw_data.block, creating
        that block first if this page does not have it yet.
        """
        key = p_word.raw_data.block
        if key not in self.block_list:
            self._add_block(key)
        self.block_list[key].insert_word(p_word)

    def get_block_list(self):
        return self.block_list.values()

    def get_block_at(self, p_index):
        return self.block_list[p_index]

    def to_json(self):
        """
        Returns [index, [json of every block]].
        """
        return [self.index, [entry.to_json() for entry in self.block_list.values()]]

    @staticmethod
    def from_json(p_json):
        """
        Rebuilds a Page and its blocks from the list produced by to_json().
        """
        restored = Page(p_json[0])
        for block_as_json in p_json[1]:
            entry = Block.from_json(block_as_json)
            entry.page = restored
            restored.block_list[entry.index] = entry
        return restored

    def __str__(self):
        # Every block is followed by three newlines, including the last.
        return "".join(str(entry) + "\n\n\n" for entry in self.block_list.values())
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Prev
1
2
Next
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment