diff --git a/evaluation.py b/evaluation.py
index bd4294dfd5508a1a1b9ce1bae876dc051b3d55c1..7a6a77c088ab6fbe7d8bd2fb6e46fb26cb836b86 100644
--- a/evaluation.py
+++ b/evaluation.py
@@ -1,9 +1,11 @@
 from model import *
 from peewee import fn
 import json
+from difflib import SequenceMatcher
+
 
 punctuation = ['!', '?', '.', ',', ';', ':', '-', '_', '"', "'", '…', '—', '#', '@']
-semantic = ['e.v.', 'ev', 'ev.', 'e.v', 'herr', 'hr', 'frau', 'fr', 'ag', 'gmbh', '& co kg']
+semantic = ['e.v.', 'ev.', 'e.v', 'ev', 'herr', 'hr', 'frau', 'fr', 'ag', 'gmbh', '& co kg', 'dr.', 'dr']
 
 def remove_punctuation(text):
     for char in punctuation:
@@ -16,9 +18,10 @@ def remove_semantics(text):
     return text
 
 def replace_umlautes(text):
+    """Replace ö, ä, ü and ß with their ASCII spellings oe, ae, ue and ss."""
-    text = text.replace("oe", "ö")
+    text = text.replace("ö", "oe")
     text = text.replace("ß", "ss")
-    text = text.replace("ae", "ä")
+    text = text.replace("ä", "ae")
-    text = text.replace("ue", "ü")
+    text = text.replace("ü", "ue")
     return text
 
@@ -62,7 +65,37 @@ def update_answer_from_file():
         jzon = json.load(file)
     update_answer_text_batch(jzon)
 
-def generate_answer_classes():
+def generate_answer_classes(treshhold):
+    """Group each question's answers by text similarity and dump them as JSON.
+
+    For every Question, answers are grouped greedily: each answer that is
+    not yet grouped opens a class and absorbs every later answer whose
+    SequenceMatcher ratio against it is greater than ``treshhold``. Each
+    class is written as a sorted list of distinct answer texts to
+    ``q<id>-classes.json`` in the current working directory.
+    """
+    questions = Question.select()
+    for q in questions:
+        answers = Answer.select().where(Answer.question == q)
+        answer_list = list(answers)
+
+        classes = []
+        for i, answer in enumerate(answer_list):
+            # None marks an answer already absorbed into an earlier class.
+            if answer is None:
+                continue
+            current_class = {answer.text}
+            for j in range(i + 1, len(answer_list)):
+                bnswer = answer_list[j]
+                if bnswer is None:
+                    continue
+                r = SequenceMatcher(None, answer.text, bnswer.text).ratio()
+                if r > treshhold:
+                    current_class.add(bnswer.text)
+                    answer_list[j] = None
+            classes.append(sorted(current_class))
+
+        with open("q%d-classes.json" % q.id, "w") as file:
+            json.dump(classes, file, indent=4, sort_keys=True)
 
 
 def aggregate_answer_file():
diff --git a/model/__init__.py b/model/__init__.py
index a5f64e1e8b6bec163a9a5524863e3991a093e5d5..8d40f66d6cb06877dcfc32a1845125a06f615d70 100644
--- a/model/__init__.py
+++ b/model/__init__.py
@@ -3,9 +3,9 @@ from peewee import Model
 from peewee import MySQLDatabase
 import json
 
-mysql_creds = json.load(open("./creds.json"))
+#mysql_creds = json.load(open("./creds.json"))
 
-sqlite = SqliteDatabase("cfd.sqlite", pragmas={'foreign_keys': 1})
+sqlite = SqliteDatabase("model/cfd.sqlite", pragmas={'foreign_keys': 1})
 sqlite.connect()
 #sqlite = MySQLDatabase(mysql_creds["db"], user=mysql_creds["user"], password=mysql_creds['password'],
 #                       host=mysql_creds['host'], port=int(mysql_creds['port']),)