# NOTE: the statement below is the tail of check_multi_label_distribution(),
# whose definition starts above this excerpt; it is kept here as a comment only.
#     print("\n\n\n")


def _read_dictionary_words(dict_file: str) -> set:
    """Return the first whitespace-separated token of each non-blank line of dict_file."""
    words = set()
    with open(dict_file, "r", encoding="utf8") as dict_reader:
        # Iterate the file lazily; the `with` block closes it, no explicit close() needed.
        for line in dict_reader:
            # split() without arguments handles spaces and tabs alike and never
            # yields empty tokens, so blank lines do not add "" to the vocabulary.
            tokens = line.split()
            if tokens:
                words.add(tokens[0])
    return words


def _collect_column_words(df: DataFrame, column: str) -> set:
    """Return the set of lower-cased whitespace-separated tokens found in df[column]."""
    words = set()
    for text in df[column]:
        # str() mirrors the original conversion of non-string cells before tokenizing.
        words.update(str(text).lower().split())
    return words


def _overlap_ratio(vocabulary: set, observed: set) -> float:
    """Return the fraction of observed words contained in vocabulary (0.0 if none observed)."""
    if not observed:
        # Avoid ZeroDivisionError for empty data sets.
        return 0.0
    return len(vocabulary & observed) / len(observed)


def check_word_dictionary_overlap(cert_df: DataFrame, dict_df: DataFrame, dict_file: str):
    """Print how much of the data set vocabulary is covered by a word dictionary file.

    The first token of every line of ``dict_file`` forms the dictionary vocabulary.
    Two ratios are printed, one per line:

    1. the share of distinct lower-cased words in ``cert_df["RawText"]`` that occur
       in the dictionary vocabulary,
    2. the share of distinct lower-cased words in ``dict_df["DiagnosisText"]`` that
       occur in the dictionary vocabulary.

    A ratio of ``0.0`` is printed when the respective column contains no words.

    :param cert_df: data frame with a ``RawText`` column
    :param dict_df: data frame with a ``DiagnosisText`` column
    :param dict_file: path to a UTF-8 text file with one dictionary entry per line
    """
    words = _read_dictionary_words(dict_file)

    cert_words = _collect_column_words(cert_df, "RawText")
    dict_words = _collect_column_words(dict_df, "DiagnosisText")

    print(_overlap_ratio(words, cert_words))
    print(_overlap_ratio(words, dict_words))


if __name__ == "__main__":
    # Just for debugging / development purposes
    AppContext.initialize_by_app_name("Clef18Task1-Data")

    clef_task_data = Clef18Task1Data()

    it_certificates = clef_task_data.read_it_train_certificates()
    it_dictionary = clef_task_data.read_it_dictionary()
    check_word_dictionary_overlap(it_certificates, it_dictionary, "data/dictionary/it-en.txt")

    hu_certificates = clef_task_data.read_hu_train_certificates()
    hu_dictionary = clef_task_data.read_hu_dictionary()
    check_word_dictionary_overlap(hu_certificates, hu_dictionary, "data/dictionary/hu-en.txt")

    fr_certificates = clef_task_data.read_fr_train_certificates()
    fr_dictionary = clef_task_data.read_fr_dictionary()
    check_word_dictionary_overlap(fr_certificates, fr_dictionary, "data/dictionary/fr-en.txt")

    # certificates = pdu.extract_icd10_chapter("ICD10", "ICD10_chapter").fit_transform(certificates)
    # certificates = pdu.extract_icd10_subchapter("ICD10", "ICD10_subchapter").fit_transform(certificates)