Skip to content
Snippets Groups Projects
Commit 78f4515a authored by Mario Sänger
Browse files

Add word dictionary tests

parent 0faabe5c
No related merge requests found
......@@ -167,24 +167,47 @@ def check_multi_label_distribution(ds_name: str, certificate_df: DataFrame):
print("\n\n\n")
def _collect_lowercase_words(frame: DataFrame, column: str) -> set:
    """Return the distinct lower-cased tokens found in ``frame[column]``.

    Every cell is converted with ``str`` and split on single spaces, so
    consecutive spaces produce an empty-string token.
    """
    return {
        token
        for text in frame[column]
        for token in str(text).lower().split(" ")
    }


def _overlap_ratio(dictionary_words: set, corpus_words: set) -> float:
    """Return the fraction of ``corpus_words`` also present in ``dictionary_words``.

    An empty corpus yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    if not corpus_words:
        return 0.0
    return len(dictionary_words & corpus_words) / len(corpus_words)


def check_word_dictionary_overlap(cert_df: DataFrame, dict_df: DataFrame, dict_file: str):
    """Print how much of the certificate / dictionary vocabulary a word list covers.

    The first space-separated field of every line in ``dict_file`` is taken as
    a known word. Two ratios are printed, one per line:

    1. the share of distinct ``cert_df["RawText"]`` tokens found in the word list;
    2. the share of distinct ``dict_df["DiagnosisText"]`` tokens found in the word list.

    :param cert_df: DataFrame with a ``RawText`` column.
    :param dict_df: DataFrame with a ``DiagnosisText`` column.
    :param dict_file: path to a UTF-8 text file holding one entry per line.
    """
    # The context manager closes the file; no explicit close() is required.
    # NOTE(review): words read from the file are not lower-cased, whereas the
    # DataFrame tokens are -- confirm the file is already lower-case.
    with open(dict_file, "r", encoding="utf8") as dict_reader:
        words = {line.strip().split(" ")[0] for line in dict_reader}

    cert_words = _collect_lowercase_words(cert_df, "RawText")
    dict_words = _collect_lowercase_words(dict_df, "DiagnosisText")

    print(_overlap_ratio(words, cert_words))
    print(_overlap_ratio(words, dict_words))
if __name__ == "__main__":
# Just for debugging / development purposes
AppContext.initialize_by_app_name("Clef18Task1-Data")
clef_task_data = Clef18Task1Data()
certificates = clef_task_data.read_it_train_certificates()
clef_task_data.add_masked_icd10_column(certificates, 4, "RARE-ICD10")
it_dict = clef_task_data.read_it_dictionary()
fr_dict = clef_task_data.read_fr_dictionary()
hu_dict = clef_task_data.read_hu_dictionary()
it_certificates = clef_task_data.read_it_train_certificates()
it_dictionary = clef_task_data.read_it_dictionary()
check_word_dictionary_overlap(it_certificates, it_dictionary, "data/dictionary/it-en.txt")
print("HU: ", len(hu_dict["ICD10"].unique()))
print("IT: ", len(it_dict["ICD10"].unique()))
print("FR: ", len(fr_dict["ICD10"].unique()))
hu_certificates = clef_task_data.read_hu_train_certificates()
hu_dictionary = clef_task_data.read_hu_dictionary()
check_word_dictionary_overlap(hu_certificates, hu_dictionary, "data/dictionary/hu-en.txt")
print("HU: ", hu_dict["ICD10"].value_counts())
fr_certificates = clef_task_data.read_fr_train_certificates()
fr_dictionary = clef_task_data.read_fr_dictionary()
check_word_dictionary_overlap(fr_certificates, fr_dictionary, "data/dictionary/fr-en.txt")
# certificates = pdu.extract_icd10_chapter("ICD10", "ICD10_chapter").fit_transform(certificates)
# certificates = pdu.extract_icd10_subchapter("ICD10", "ICD10_subchapter").fit_transform(certificates)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment