From a8ed4c2f0aa33be2396eada89dad813b5ad106aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20Sa=CC=88nger?= <mario.saenger@student.hu-berlin.de> Date: Thu, 10 May 2018 09:30:11 +0200 Subject: [PATCH] Revise FastTextModel class structures --- code_mario/clef18_task1_base.py | 92 +++++++------- code_mario/clef18_task1_data.py | 207 ++++++++++++++++++++------------ code_mario/clef18_task1_emb1.py | 27 +++-- code_mario/clef18_task1_emb2.py | 6 +- code_mario/ft_embeddings.py | 96 +++++++++++++-- code_mario/preprocessing.py | 22 +++- 6 files changed, 299 insertions(+), 151 deletions(-) diff --git a/code_mario/clef18_task1_base.py b/code_mario/clef18_task1_base.py index 0d4f860..d941363 100644 --- a/code_mario/clef18_task1_base.py +++ b/code_mario/clef18_task1_base.py @@ -72,57 +72,57 @@ class Clef18Task1Base(LoggingMixin): test_sets = [ # ("dict", dict_embeddings, config.dict_df), - ("cert-train", conf.train_data, conf.train_df), + #("cert-train", conf.train_data, conf.train_df), ("cert-val", conf.val_data, conf.val_df), ("cert-test", conf.test_data, conf.test_df) ] named_classifiers = [ - # ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()), - # ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")), - # ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)), - # ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)), - # ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)), - #("LinearSVM", lambda label, input_dim, output_dim, val_data: LinearSVC(max_iter=10000, verbose=1, random_state=42)), - - ("DNN-200-Test", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-test", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], - batch_normalization=False, dropout_rate=0.0, epochs=1, batch_size=16)), - - # ("DNN-200", lambda label, input_dim, output_dim, val_data: - # self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], - # batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), - # - # ("DNN-300", lambda label, input_dim, output_dim, val_data: - # self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], - # batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), - # - # ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data: - # self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], - # batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), - # ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data: - # self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], - # batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), - # - # ("DNN-200-100", lambda label, input_dim, output_dim, val_data: - # self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], - # batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), - # ("DNN-200-200", lambda label, input_dim, output_dim, val_data: - # self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, 
output_dim=output_dim, hidden_layer_sizes=[200, 200], - # batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), - # ("DNN-300-200", lambda label, input_dim, output_dim, val_data: - # self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200], - # batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), - # - # ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data: - # self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], - # batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), - # ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data: - # self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], - # batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), - # ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data: - # self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim,hidden_layer_sizes=[300, 200], - # batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), + ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()), + ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")), + ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)), + ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)), + ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)), + # ("LinearSVM", lambda label, input_dim, output_dim, val_data: LinearSVC(max_iter=10000, verbose=1, random_state=42)), + + # ("DNN-200-Test", lambda label, input_dim, output_dim, val_data: + # self.create_dnn_classifier("dnn-200-test", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], + # batch_normalization=False, dropout_rate=0.0, epochs=1, batch_size=16)), + + ("DNN-200", lambda label, input_dim, output_dim, val_data: + self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], + batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), + + ("DNN-300", lambda label, input_dim, output_dim, val_data: + self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], + batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), + + ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data: + self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], + batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), + ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data: + self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], + batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), + + ("DNN-200-100", lambda label, input_dim, output_dim, val_data: + self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], + batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), + 
("DNN-200-200", lambda label, input_dim, output_dim, val_data: + self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], + batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), + ("DNN-300-200", lambda label, input_dim, output_dim, val_data: + self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200], + batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), + + ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data: + self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], + batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), + ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data: + self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], + batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), + ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data: + self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim,hidden_layer_sizes=[300, 200], + batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), ('DU1', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="stratified")), ('DU2', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="most_frequent")) diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py index 8c36f54..7a149c5 100644 --- a/code_mario/clef18_task1_data.py +++ b/code_mario/clef18_task1_data.py @@ -2,6 +2,7 @@ import os from typing import List import pandas as pd +from keras.preprocessing.text import Tokenizer, text_to_word_sequence from pandas import DataFrame from tqdm import tqdm @@ -60,12 +61,12 @@ class Clef18Task1Data(LoggingMixin): calculees_file = os.path.join(base_folder, "CausesCalculees_IT_1.csv") brutes_file = os.path.join(base_folder, "CausesBrutes_IT_1.csv") - return self._read_certificates([calculees_file], [brutes_file]) + return self._read_certificates([calculees_file], [brutes_file], "it") def read_it_dictionary(self) -> DataFrame: base_folder = "data/train/IT/training/raw/dictionaries" dictionary_file = os.path.join(base_folder, "dictionary_IT.csv") - return self._read_icd10_dictionary(dictionary_file, "iso-8859-1") + return self._read_icd10_dictionary([dictionary_file], "iso-8859-1", "it") def read_it_test_certificates(self) -> DataFrame: brutes_file = "data/test/IT/test/raw/corpus/CausesBrutes_IT_2.csv" @@ -79,12 +80,12 @@ class Clef18Task1Data(LoggingMixin): calculees_file = os.path.join(base_folder, "CausesCalculees_HU_1.csv") brutes_file = os.path.join(base_folder, "CausesBrutes_HU_1.csv") - return self._read_certificates([calculees_file], [brutes_file]) + return self._read_certificates([calculees_file], [brutes_file], "it") def read_hu_dictionary(self) -> DataFrame: base_folder = "data/train/HU/training/raw/dictionaries" dictionary_file = os.path.join(base_folder, "Hungarian_dictionary_UTF8.csv") - return self._read_icd10_dictionary(dictionary_file, "utf-8") + return self._read_icd10_dictionary([dictionary_file], "utf-8", "hu") def read_hu_test_certificates(self) -> DataFrame: brutes_file = "data/test/HU/test/raw/corpus/CausesBrutes_HU_2.csv" @@ -108,14 +109,17 @@ class 
Clef18Task1Data(LoggingMixin): os.path.join(base_folder, "CausesBrutes_FR_2014.csv") ] - return self._read_certificates(calculees_files, brutes_files) + return self._read_certificates(calculees_files, brutes_files, "fr") def read_fr_dictionary(self) -> DataFrame: - # FIXME: Load other training files from 2011-2015! - base_folder = "data/train/FR/training/aligned/dictionaries" - dictionary_file = os.path.join(base_folder, "Dictionnaire2006-2010.csv") - return self._read_icd10_dictionary(dictionary_file, "utf-8") + base_folder = "data/train/FR/training/aligned/dictionaries" + dictionary_files = [ + os.path.join(base_folder, "Dictionnaire2006-2010.csv"), + os.path.join(base_folder, "Dictionnaire2014.csv"), + os.path.join(base_folder, "Dictionnaire2015.csv") + + ] + return self._read_icd10_dictionary(dictionary_files, "utf-8", "fr") def read_fr_test_certificates(self) -> DataFrame: brutes_file = "data/test/FR/test/raw/corpus/CausesBrutes_FR_2.csv" @@ -129,60 +133,13 @@ class Clef18Task1Data(LoggingMixin): self.read_hu_dictionary()]) def read_all_con_certificates(self) -> DataFrame: - return pd.concat([self.read_fr_train_certificates(), - self.read_it_train_certificates(), - self.read_hu_train_certificates()]) - - # -------------------------------------------------------------------------------- - - def filter_single_code_lines(self, certificate_df: DataFrame) -> DataFrame: - multi_code_lines = [key for key, group in groupby(certificate_df.index.values) if len(list(group)) > 1] - self.logger.info("Start filtering %s lines with multiple codes", len(multi_code_lines)) - - original_size = len(certificate_df) - certificate_df = certificate_df.drop(multi_code_lines) - self.logger.info("Filtered %s out of %s entries due to single code constraint", len(certificate_df), original_size) - - return certificate_df - - def add_masked_icd10_column(self, certificate_df: DataFrame, min_support: int, mask_code: str = "RARE-ICD10") -> DataFrame: - code_frequency_distribution = certificate_df["ICD10"].value_counts() - icd_masker = pdu.mask_icd10("ICD10", "ICD10_masked", code_frequency_distribution, min_support, mask_code) - - certificate_df = icd_masker.fit_transform(certificate_df) - - num_infrequent_codes = certificate_df["ICD10_masked"].value_counts()[mask_code] - self.logger.info("Added masked icd10 code column. Found %s codes with support less than %s", num_infrequent_codes, min_support) - - return certificate_df - - def down_sample_by_icd10_frequency(self, certificate_df: DataFrame, max_freq: int): - self.logger.info("Down sampled data set with %s entries", len(certificate_df)) - icd10_codes = certificate_df["ICD10"].unique() - - data_sets = [] - for code in tqdm(icd10_codes,desc="down-sample", total=len(icd10_codes)): - entries_by_code = certificate_df.query("ICD10 == '%s'" % code) - if len(entries_by_code) > max_freq: - unique_texts = entries_by_code["RawText"].unique() - - unique_entries = [] - for text in unique_texts: - unique_entries.append(entries_by_code.query("RawText == \"%s\"" % text)[0:1]) - - unique_entries.append(entries_by_code.sample(max(max_freq-len(unique_texts), 10))) - entries_by_code = pd.concat(unique_entries) - - data_sets.append(entries_by_code) - - sampled_df = pd.concat(data_sets) - sampled_df = sampled_df.sample(frac=1) # Reshuffle!
- self.logger.info("Down sampled data set contains %s entries", len(sampled_df)) - return sampled_df + all_certificates = pd.concat([self.read_fr_train_certificates(), self.read_it_train_certificates(), self.read_hu_train_certificates()]) + self.logger.info("Found %s death certificate lines", len(all_certificates)) + return all_certificates # -------------------------------------------------------------------------------- - def _read_certificates(self, calculees_files: List[str], brutus_files: List[str]) -> DataFrame: + def _read_certificates(self, calculees_files: List[str], brutus_files: List[str], language: str) -> DataFrame: calculees_data = [] for calculees_file in calculees_files: self.logger.info("Reading calculees file from %s", calculees_file) @@ -210,12 +167,21 @@ class Clef18Task1Data(LoggingMixin): self.logger.info("Removed %s lines with ICD10 'nan'", num_unchecked_data - len(joined_data)) joined_data = pdu.clean_text("RawText").fit_transform(joined_data) + joined_data["Lang"] = language + + word_tagger = pdu.tag_words_with_language("RawText", "RawText", "Lang") + joined_data = word_tagger.fit_transform(joined_data) + + return joined_data[["RawText", "ICD10", "Lang"]] - return joined_data[["RawText", "ICD10"]] + def _read_icd10_dictionary(self, dictionary_files: List[str], encoding: str, lang: str) -> DataFrame: + dictionary_data = [] + for dict_file in dictionary_files: + self.logger.info("Reading ICD10 dictionary from %s", dict_file) + dictionary_data.append(pd.read_csv(dict_file, sep=";", encoding=encoding, + skipinitialspace=True, error_bad_lines=False)) + dictionary_data = pd.concat(dictionary_data) - def _read_icd10_dictionary(self, file: str, encoding: str) -> DataFrame: - self.logger.info("Reading ICD-10 dictionary from %s", file) - dictionary_data = pd.read_csv(file, sep=";", encoding=encoding, skipinitialspace=True) num_dictionary_entries = len(dictionary_data) self.logger.info("Found %s dictionary entries", num_dictionary_entries) @@ -228,18 +194,87 @@ class Clef18Task1Data(LoggingMixin): dictionary_data.columns = ["ICD10", "Standardized", "DiagnosisText"] dictionary_data["ICD10"] = dictionary_data["ICD10"].astype(str) - self.logger.info("Removed %s duplicates from dictionary", num_dictionary_entries - len(dictionary_data)) + dictionary_data["Lang"] = lang + + self.logger.info("Tag language %s on all words", lang) + word_tagger = pdu.tag_words_with_language("DiagnosisText", "DiagnosisText", "Lang") + dictionary_data = word_tagger.fit_transform(dictionary_data) + return dictionary_data def _read_test_data(self, file: str) -> DataFrame: self.logger.info("Reading test certificates from %s", file) - test_data = pd.read_csv(file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"], skipinitialspace=True) + test_data = pd.read_csv(file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"], + skipinitialspace=True, error_bad_lines=False) self.logger.info("Found %s test certificate lines.", len(test_data)) return test_data + # -------------------------------------------------------------------------------- + + def filter_single_code_lines(self, certificate_df: DataFrame) -> DataFrame: + multi_code_lines = [key for key, group in groupby(certificate_df.index.values) if len(list(group)) > 1] + self.logger.info("Start filtering %s lines with multiple codes", len(multi_code_lines)) + + original_size = len(certificate_df) + certificate_df = certificate_df.drop(multi_code_lines) + self.logger.info("Filtered %s out of %s entries due to 
single code constraint", len(certificate_df), original_size) + + return certificate_df + + def add_masked_icd10_column(self, certificate_df: DataFrame, min_support: int, mask_code: str = "RARE-ICD10") -> DataFrame: + code_frequency_distribution = certificate_df["ICD10"].value_counts() + icd_masker = pdu.mask_icd10("ICD10", "ICD10_masked", code_frequency_distribution, min_support, mask_code) + + certificate_df = icd_masker.fit_transform(certificate_df) + + num_infrequent_codes = certificate_df["ICD10_masked"].value_counts()[mask_code] + self.logger.info("Added masked icd10 code column. Found %s codes with support less than %s", num_infrequent_codes, min_support) + + return certificate_df + + def down_sample_by_icd10_frequency(self, certificate_df: DataFrame, max_freq: int): + self.logger.info("Down sampled data set with %s entries", len(certificate_df)) + icd10_codes = certificate_df["ICD10"].unique() + + data_sets = [] + for code in tqdm(icd10_codes,desc="down-sample", total=len(icd10_codes)): + entries_by_code = certificate_df.query("ICD10 == '%s'" % code) + if len(entries_by_code) > max_freq: + unique_texts = entries_by_code["RawText"].unique() + + unique_entries = [] + for text in unique_texts: + unique_entries.append(entries_by_code.query("RawText == \"%s\"" % text)[0:1]) + + unique_entries.append(entries_by_code.sample(max(max_freq-len(unique_texts), 10))) + entries_by_code = pd.concat(unique_entries) + + data_sets.append(entries_by_code) + + sampled_df = pd.concat(data_sets) + sampled_df = sampled_df.sample(frac=1) # Reshuffle! + self.logger.info("Down sampled data set contains %s entries", len(sampled_df)) + return sampled_df + + def extend_certificates_by_dictionaries(self, certificate_df: DataFrame, dictionary_df: DataFrame) -> DataFrame: + original_size = len(certificate_df) + + dict_icd10_codes = dictionary_df["ICD10"].unique() + cert_icd10_codes = certificate_df["ICD10"].unique() + + unseen_icd10_codes = [dict_icd10 for dict_icd10 in dict_icd10_codes if dict_icd10 not in cert_icd10_codes] + for unseen_icd10 in tqdm(unseen_icd10_codes, desc="extend-cert", total=len(unseen_icd10_codes)): + entries = dictionary_df.query("ICD10 == '%s'" % unseen_icd10) + + for key, row in entries.iterrows(): + certificate_df = certificate_df.append({"RawText": row["DiagnosisText"], "ICD10": row["ICD10"] }, ignore_index=True) + + extended_size = len(certificate_df) + self.logger.info("Extended cert data set with %s from dictionary (%s in total)" % (extended_size - original_size, extended_size)) + return certificate_df def check_multi_label_distribution(ds_name: str, certificate_df: DataFrame): print("Data set: ", ds_name) @@ -344,30 +379,44 @@ if __name__ == "__main__": AppContext.initialize_by_app_name("Clef18Task1-Data") clef_task_data = Clef18Task1Data() + + #all_cert = clef_task_data.read_all_con_certificates() + #check_label_distribution(all_cert) + #clef_task_data.down_sample_by_icd10_frequency(all_cert, 4000) + it_certificates = clef_task_data.read_it_train_certificates() - it_dictionary = clef_task_data.read_it_dictionary() - check_label_distribution(it_certificates) - #it_certificates = clef_task_data.down_sample_by_icd10_frequency(it_certificates, 800) - check_label_distribution(it_certificates) + word_tagger = pdu.tag_words_with_language("RawText", "RawText", "Lang") + word_tagger.fit_transform(it_certificates) -# check_word_dictionary_overlap(it_certificates, it_dictionary, "data/dictionary/it-en.txt") + it_dictionary = clef_task_data.read_it_dictionary() + 
clef_task_data.extend_certificates_by_dictionaries(it_certificates, it_dictionary) hu_certificates = clef_task_data.read_hu_train_certificates() hu_dictionary = clef_task_data.read_hu_dictionary() - - check_label_distribution(hu_certificates) - #hu_certificates = clef_task_data.down_sample_by_icd10_frequency(hu_certificates, 2750) - check_label_distribution(hu_certificates) - # check_word_dictionary_overlap(hu_certificates, hu_dictionary, "data/dictionary/hu-en.txt") + clef_task_data.extend_certificates_by_dictionaries(hu_certificates, hu_dictionary) fr_certificates = clef_task_data.read_fr_train_certificates() fr_dictionary = clef_task_data.read_fr_dictionary() + clef_task_data.extend_certificates_by_dictionaries(fr_certificates, fr_dictionary) - check_label_distribution(fr_certificates) - fr_certificates = clef_task_data.down_sample_by_icd10_frequency(fr_certificates, 2750) - check_label_distribution(fr_certificates) + #check_label_distribution(it_certificates) + #it_certificates = clef_task_data.down_sample_by_icd10_frequency(it_certificates, 800) + #check_label_distribution(it_certificates) + + #hu_certificates = clef_task_data.read_hu_train_certificates() + #hu_dictionary = clef_task_data.read_hu_dictionary() + #check_label_distribution(hu_certificates) + #hu_certificates = clef_task_data.down_sample_by_icd10_frequency(hu_certificates, 2750) + #check_label_distribution(hu_certificates) + #check_word_dictionary_overlap(hu_certificates, hu_dictionary, "data/dictionary/hu-en.txt") + + # fr_certificates = clef_task_data.read_fr_train_certificates() + # fr_dictionary = clef_task_data.read_fr_dictionary() + # check_label_distribution(fr_certificates) + # fr_certificates = clef_task_data.down_sample_by_icd10_frequency(fr_certificates, 2750) + # check_label_distribution(fr_certificates) # check_word_dictionary_overlap(fr_certificates, fr_dictionary, "data/dictionary/fr-en.txt") diff --git a/code_mario/clef18_task1_emb1.py b/code_mario/clef18_task1_emb1.py index dad8532..fabf803 100644 --- a/code_mario/clef18_task1_emb1.py +++ b/code_mario/clef18_task1_emb1.py @@ -26,7 +26,7 @@ from typing import Tuple, Dict, List, Callable from app_context import AppContext from clef18_task1_data import Clef18Task1Data -from ft_embeddings import FastTextEmbeddings, FastTextModel +from ft_embeddings import FastTextEmbeddings, FastTextModel, SingleLanguageFastTextModel from preprocessing import DataPreparationUtil as pdu from keras_extension import KerasUtil as ku @@ -54,19 +54,19 @@ class Clef18Task1Emb1(Clef18Task1Base): def build_embedding_model(self, word_index: Dict, ft_model: FastTextModel, max_cert_length: int, max_dict_length: int): # TODO: Make hyper-parameter configurable! - # TODO: Think about using CNNs instead of RNNs since we can have multiple ICD-10 per line! - - embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size)) + embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size())) for word, i in word_index.items(): try: - embedding_vector = ft_model.lookup(word) + language = word[0:2] + token = word[2:] + embedding_vector = ft_model.lookup(token, language) if embedding_vector is not None: # words not found in embedding index will be all-zeros. 
embedding_matrix[i] = embedding_vector except KeyError: self.logger.error("Can't create embedding for '%s'", word) - embedding = Embedding(len(word_index)+1, ft_model.vector_size, weights=[embedding_matrix], mask_zero=True) + embedding = Embedding(len(word_index)+1, ft_model.vector_size(), weights=[embedding_matrix], mask_zero=True) # Model 1: Learn a representation of a line originating from a death certificate input_certificate_line = Input((max_cert_length, )) @@ -256,7 +256,7 @@ class Clef18Task1Emb1(Clef18Task1Base): dict_df, max_dict_length = self.prepare_dictionary_df(dict_df, "train", label_encoders, keras_tokenizer) return Emb1Configuration(train_cert_df, val_cert_df, test_cert_df, dict_df, max_cert_length, max_dict_length, - ft_model.vector_size, strat_column, label_encoders, keras_tokenizer) + ft_model.vector_size(), strat_column, label_encoders, keras_tokenizer) def prepare_certificate_df(self, certificate_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders, keras_tokenizer: Tokenizer) -> Tuple[DataFrame, int]: @@ -439,13 +439,14 @@ if __name__ == "__main__": train_emb_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int) train_emb_parser.add_argument("--down_sample", help="Maximal frequency of ICD10 code until start down sampling", default=None, type=int) + train_emb_parser.add_argument("--extend_cert", help="Indicates whether to extend the certificates with dict_entries", default=True, type=bool) train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_masked", type=str) train_emb_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool) train_emb_parser.add_argument("--strat_min_freq", help="Min frequency of an icd10 code to be an own class during stratification", default=8, type=int) train_emb_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append') train_emb_parser.add_argument("--single_only", help="Indicates whether just to use the single code lines from the data", default=False, type=bool) - train_emb_parser.add_argument("--max_pos_samples", help="Maximal number of positive samples to use", default=10, type=int) + train_emb_parser.add_argument("--max_pos_samples", help="Maximal number of positive samples to use", default=12, type=int) train_emb_parser.add_argument("--neg_sampling", help="Negative sampling strategy to use", default="ext1", choices=["def", "ext1"]) train_emb_parser.add_argument("--num_neg_samples", help="Number of negative samples to use (default strategy)", default=75, type=int) train_emb_parser.add_argument("--num_neg_cha", help="Number of negative chapter samples to use (ext1 strategy)", default=10, type=int) @@ -483,18 +484,22 @@ if __name__ == "__main__": dictionary = clef_data.read_dictionary_by_id(args.lang) certificates = clef_data.read_train_certifcates_by_id(args.lang) + # sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] + # ft_model = SingleLanguageFastTextModel("it", "it", FastText(sentences, min_count=1)) + ft_embeddings = FastTextEmbeddings() ft_model = ft_embeddings.load_embeddings_by_id(args.lang) - #sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] - #ft_model = FastTextModel("dummy", [FastText(sentences, min_count=1)]) - if args.down_sample: certificates = clef_data.down_sample_by_icd10_frequency(certificates, args.down_sample) if args.single_only: 
certificates = clef_data.filter_single_code_lines(certificates) + if args.extend_cert: + certificates = clef_data.extend_certificates_by_dictionaries(certificates, dictionary) + #certificates = certificates + if args.strat_splits: certificates = clef_data.add_masked_icd10_column(certificates, args.strat_min_freq) diff --git a/code_mario/clef18_task1_emb2.py b/code_mario/clef18_task1_emb2.py index 6cc3200..c5f79e9 100644 --- a/code_mario/clef18_task1_emb2.py +++ b/code_mario/clef18_task1_emb2.py @@ -155,7 +155,7 @@ class Clef18Task1Emb2(Clef18Task1Base): # TODO: Make hyper-parameter configurable! # TODO: Think about using CNNs instead of RNNs! - embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size)) + embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size())) for word, i in word_index.items(): try: embedding_vector = ft_model.lookup(word) @@ -165,7 +165,7 @@ class Clef18Task1Emb2(Clef18Task1Base): except KeyError: self.logger.error("Can't create embedding for '%s'", word) - input_embedding = Embedding(len(word_index)+1, ft_model.vector_size, weights=[embedding_matrix], mask_zero=True) + input_embedding = Embedding(len(word_index)+1, ft_model.vector_size(), weights=[embedding_matrix], mask_zero=True) # Model 1: Learn a representation of a line originating from a death certificate or the dictionary input_certificate_line = Input((conf.max_length, )) @@ -225,7 +225,7 @@ class Clef18Task1Emb2(Clef18Task1Base): self.logger.info("Start preparation of test cert data (%s instances)", len(test_df)) test_df, _ = self.prepare_cert_dict_df(test_df, "test", label_encoders, keras_tokenizer) - return Configuration(train_df, val_df, test_df, max_length, ft_model.vector_size, + return Configuration(train_df, val_df, test_df, max_length, ft_model.vector_size(), strat_column, label_encoders, keras_tokenizer) def prepare_cert_dict_df(self, cert_dict_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders, diff --git a/code_mario/ft_embeddings.py b/code_mario/ft_embeddings.py index f2b3b7f..47fb9b5 100644 --- a/code_mario/ft_embeddings.py +++ b/code_mario/ft_embeddings.py @@ -1,6 +1,6 @@ import numpy as np -from typing import List +from typing import List, Dict from gensim.models import FastText from app_context import AppContext @@ -9,18 +9,82 @@ from util import LoggingMixin class FastTextModel(LoggingMixin): - def __init__(self, name: str, ft_models: List[FastText]): + def __init__(self, name: str): LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file) self.name = name + + def lookup(self, word: str, language: str): + raise NotImplementedError() + + def vector_size(self): + raise NotImplementedError() + + +class SingleLanguageFastTextModel(FastTextModel): + + def __init__(self, name: str, lang: str, ft_model: FastText): + FastTextModel.__init__(self, name) + self.name = name + self.lang = lang.lower() + self.ft_model = ft_model + + def lookup(self, word: str, lang: str): + if self.lang == lang.lower(): + try: + return self.ft_model[word] + except KeyError as error: + self.logger.warn("Can't create embedding for " + word) + return np.zeros(self.ft_model.vector_size) + else: + self.logger.warn("FastText model doesn't support language %s", lang) + return np.zeros(self.ft_model.vector_size) + + def vector_size(self): + return self.ft_model.vector_size + + +class MultiLanguageFastTextModel(FastTextModel): + + def __init__(self, name: str, ft_models: Dict[str, FastText]): + FastTextModel.__init__(self,
name) + self.name = name self.ft_models = ft_models - self.vector_size = sum([ft_model.vector_size for ft_model in self.ft_models]) + self.emb_size = list(ft_models.values())[0].vector_size - def lookup(self, word: str): + def lookup(self, word: str, lang: str): + lang = lang.lower() + + if lang in self.ft_models: + ft_model = self.ft_models[lang] + + try: + return ft_model[word] + except KeyError as error: + self.logger.warn("Can't create embedding for " + word) + return np.zeros(ft_model.vector_size) + else: + self.logger.warn("FastText model doesn't support language %s", lang) + return np.zeros(self.emb_size) + + def vector_size(self): + return self.emb_size + + +class MultiLanguageConcatenationFastTextModel(FastTextModel): + + def __init__(self, name: str, ft_models: List[FastText]): + FastTextModel.__init__(self, name) + self.name = name + self.ft_models = ft_models + self.emb_size = sum([ft_model.vector_size for ft_model in ft_models]) + + def lookup(self, word: str, lang: str): embeddings = [] for ft_model in self.ft_models: try: embeddings.append(ft_model[word]) - except KeyError as error: + except KeyError: self.logger.warn("Can't create embedding for " + word) embeddings.append(np.zeros(ft_model.vector_size)) @@ -29,6 +93,9 @@ class FastTextModel(LoggingMixin): else: return np.concatenate(embeddings) + def vector_size(self): + return self.emb_size + class FastTextEmbeddings(LoggingMixin): @@ -37,18 +104,25 @@ class FastTextEmbeddings(LoggingMixin): def load_embeddings_by_id(self, id: str) -> FastTextModel: if id == "it": - return FastTextModel("it", [self.load_it_embeddings()]) + return SingleLanguageFastTextModel("it", "it", self.load_it_embeddings()) elif id == "hu": - return FastTextModel("hu", [self.load_hu_embeddings()]) + return SingleLanguageFastTextModel("hu", "hu", self.load_hu_embeddings()) elif id == "fr": - return FastTextModel("fr", [self.load_fr_embeddings()]) + return SingleLanguageFastTextModel("fr", "fr", self.load_fr_embeddings()) + + elif id == "multi": + return MultiLanguageFastTextModel("multi", { + "it" : self.load_it_embeddings(), + "hu" : self.load_hu_embeddings(), + "fr" : self.load_fr_embeddings() + }) elif id == "all-con": - return FastTextModel("all-con", [self.load_fr_embeddings(), - self.load_it_embeddings(), - self.load_hu_embeddings()]) + return MultiLanguageConcatenationFastTextModel("all-con", [self.load_fr_embeddings(), + self.load_it_embeddings(), + self.load_hu_embeddings()]) else: raise AssertionError("Unsupported language: " + id) diff --git a/code_mario/preprocessing.py b/code_mario/preprocessing.py index ac7fad0..d474372 100644 --- a/code_mario/preprocessing.py +++ b/code_mario/preprocessing.py @@ -2,7 +2,7 @@ import re import numpy as np from gensim.models import FastText -from keras.preprocessing.text import Tokenizer +from keras.preprocessing.text import Tokenizer, text_to_word_sequence from pandas import DataFrame from sklearn.base import BaseEstimator, TransformerMixin from sklearn.preprocessing import LabelEncoder @@ -95,6 +95,10 @@ class DataPreparationUtil(object): return MapFunction(column, _clean) + @staticmethod + def tag_words_with_language(text_column: str, target_column: str, lang_column: str): + return LanguageWordTagger(text_column, target_column, lang_column) + class FitMixin(object): @@ -240,3 +244,19 @@ class KerasSequencer(BaseEstimator, TransformerMixin): sequences = self.keras_tokenizer.texts_to_sequences(texts) return PandasUtil.append_column(data, self.target_column, sequences) + + +class LanguageWordTagger(BaseEstimator,
FitMixin, TransformerMixin): + + def __init__(self, text_column: str, target_column: str, lang_column: str): + self.text_column = text_column + self.target_column = target_column + self.lang_column = lang_column + + def transform(self, data: DataFrame, y=None): + def _tag_words(row): + language = row[self.lang_column] + return " ".join("%s%s" % (language, word) for word in text_to_word_sequence(row[self.text_column])) + + data[self.target_column] = data.apply(_tag_words, axis=1) + return data -- GitLab
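Usage sketch (not part of the committed patch): it illustrates what LanguageWordTagger produces for a single certificate line, assuming pandas and Keras' text_to_word_sequence behave as they are used in the patch. The example row and the printed output are illustrative only.

import pandas as pd

from preprocessing import DataPreparationUtil as pdu

# One toy certificate line plus its language id, using the column names from clef18_task1_data.py.
df = pd.DataFrame({"RawText": ["Infarto del miocardio"], "Lang": ["it"]})

# tag_words_with_language returns a LanguageWordTagger; fit_transform rewrites the text column in place.
tagger = pdu.tag_words_with_language("RawText", "RawText", "Lang")
tagged = tagger.fit_transform(df)

# Each token is lower-cased by text_to_word_sequence and prefixed with its two-letter language id.
print(tagged["RawText"].iloc[0])  # expected: "itinfarto itdel itmiocardio"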
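A second sketch, mirroring the commented-out toy example left in clef18_task1_emb1.py: it shows how the revised FastTextModel classes are expected to answer language-aware lookups, assuming the gensim version used by the project (where ft_model[word] is valid, as in the code above). Variable names are illustrative only.

from gensim.models import FastText

from ft_embeddings import MultiLanguageFastTextModel, SingleLanguageFastTextModel

# Toy corpus, as in the commented example in clef18_task1_emb1.py.
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
toy_ft = FastText(sentences, min_count=1)

# A single-language wrapper answers lookups only for its own language id and
# falls back to a zero vector of the wrapped model's dimensionality otherwise.
it_model = SingleLanguageFastTextModel("it", "it", toy_ft)
print(it_model.vector_size())        # dimensionality of the wrapped fastText model
print(it_model.lookup("cat", "it"))  # embedding vector from the wrapped model
print(it_model.lookup("cat", "hu"))  # zero vector, language not supported

# The "multi" variant keeps one fastText model per language id and dispatches on the
# (lower-cased) language argument; this is what word[0:2] in build_embedding_model
# provides once LanguageWordTagger has prefixed every token.
multi_model = MultiLanguageFastTextModel("multi", {"it": toy_ft})
print(multi_model.lookup("cat", "IT"))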