From 78f4515a711c5624d363ab315e06c7f8c5bb3f2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20Sa=CC=88nger?= <mario.saenger@student.hu-berlin.de>
Date: Sat, 5 May 2018 11:37:54 +0200
Subject: [PATCH] Add word dictionary tests

---
 code_mario/clef18_task1_data.py | 43 +++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py
index a900511..0ee0be9 100644
--- a/code_mario/clef18_task1_data.py
+++ b/code_mario/clef18_task1_data.py
@@ -167,24 +167,47 @@ def check_multi_label_distribution(ds_name: str, certificate_df: DataFrame):
     print("\n\n\n")
 
 
+def check_word_dictionary_overlap(cert_df: DataFrame, dict_df: DataFrame, dict_file: str):
+    """Print the share of certificate and dictionary words that occur in `dict_file`.
+
+    The first space-separated token of each line of `dict_file` forms the reference
+    vocabulary. `cert_df["RawText"]` and `dict_df["DiagnosisText"]` are lower-cased and
+    split on single spaces. Two ratios are printed (certificates first); a side
+    without any words prints 0.0 instead of raising ZeroDivisionError.
+    """
+    # The with-block closes the file on exit, so no explicit close() is needed.
+    with open(dict_file, "r", encoding="utf8") as dict_reader:
+        words = {line.strip().split(" ")[0] for line in dict_reader}
+
+    def column_words(df: DataFrame, column: str) -> set:
+        # str() each cell, lower-case it and split on single spaces.
+        return {word for text in df[column] for word in str(text).lower().split(" ")}
+
+    def coverage(vocabulary: set) -> float:
+        # Guard the empty case (e.g. an empty data frame) against division by zero.
+        return len(words & vocabulary) / len(vocabulary) if vocabulary else 0.0
+
+    print(coverage(column_words(cert_df, "RawText")))
+    print(coverage(column_words(dict_df, "DiagnosisText")))
+
+
 if __name__ == "__main__":
     # Just for debugging / development purposes
     AppContext.initialize_by_app_name("Clef18Task1-Data")
 
     clef_task_data = Clef18Task1Data()
 
-    certificates = clef_task_data.read_it_train_certificates()
-    clef_task_data.add_masked_icd10_column(certificates, 4, "RARE-ICD10")
-
-    it_dict = clef_task_data.read_it_dictionary()
-    fr_dict = clef_task_data.read_fr_dictionary()
-    hu_dict = clef_task_data.read_hu_dictionary()
+    it_certificates = clef_task_data.read_it_train_certificates()
+    it_dictionary = clef_task_data.read_it_dictionary()
+    check_word_dictionary_overlap(it_certificates, it_dictionary, "data/dictionary/it-en.txt")
 
-    print("HU: ", len(hu_dict["ICD10"].unique()))
-    print("IT: ", len(it_dict["ICD10"].unique()))
-    print("FR: ", len(fr_dict["ICD10"].unique()))
+    hu_certificates = clef_task_data.read_hu_train_certificates()
+    hu_dictionary = clef_task_data.read_hu_dictionary()
+    check_word_dictionary_overlap(hu_certificates, hu_dictionary, "data/dictionary/hu-en.txt")
 
-    print("HU: ", hu_dict["ICD10"].value_counts())
+    fr_certificates = clef_task_data.read_fr_train_certificates()
+    fr_dictionary = clef_task_data.read_fr_dictionary()
+    check_word_dictionary_overlap(fr_certificates, fr_dictionary, "data/dictionary/fr-en.txt")
 
     # certificates = pdu.extract_icd10_chapter("ICD10", "ICD10_chapter").fit_transform(certificates)
     # certificates = pdu.extract_icd10_subchapter("ICD10", "ICD10_subchapter").fit_transform(certificates)
-- 
GitLab