diff --git a/code_jurica/_config.py b/code_jurica/_config.py index d998713e17b9c79c93f4a1cb26fff09cf00647fc..8ea5d0c745372a3c97c92233e2ba4f8339153718 100644 --- a/code_jurica/_config.py +++ b/code_jurica/_config.py @@ -11,23 +11,52 @@ DATA_IT=IT_HOME+'corpus/' TRAINING = { 'FR': { - 'CB': DATA_FR + 'CausesBrutes_FR_2014.csv', - 'CC': DATA_FR + 'CausesCalculees_FR_2014.csv', - 'Ident': DATA_FR + 'Ident_FR_2014.csv' + 'CB': [ + #DATA_FR + 'CausesBrutes_FR_2006-2012.csv', + #DATA_FR + 'CausesBrutes_FR_2013.csv', + DATA_FR + 'CausesBrutes_FR_2014.csv' + ], + 'CC': [ + #DATA_FR + 'CausesCalculees_FR_2006-2012.csv', + #DATA_FR + 'CausesCalculees_FR_2013.csv', + DATA_FR + 'CausesCalculees_FR_2014.csv' + ], + 'Ident': [ + DATA_FR + 'Ident_FR_2014.csv' + ], + "Encoding" : "latin1", + "SplitMultiCode" : False }, 'HU': { - 'CB': DATA_HU + 'CausesBrutes_HU_1.csv', - 'CC': DATA_HU + 'CausesCalculees_HU_1.csv', - 'Ident': DATA_HU + 'Ident_HU_1.csv' + 'CB': [ + DATA_HU + 'CausesBrutes_HU_1.csv' + ], + 'CC': [ + DATA_HU + 'CausesCalculees_HU_1.csv' + ], + 'Ident': [ + DATA_HU + 'Ident_HU_1.csv' + ], + "Encoding" : "latin1", + "SplitMultiCode" : False }, 'IT': { - 'CB': DATA_IT + 'CausesBrutes_IT_1.csv', - 'CC': DATA_IT + 'CausesCalculees_IT_1.csv', - 'Ident': DATA_IT + 'Ident_IT_1.csv' + 'CB': [ + DATA_IT + 'CausesBrutes_IT_1.csv' + ], + 'CC': [ + DATA_IT + 'CausesCalculees_IT_1.csv' + ], + 'Ident': [ + DATA_IT + 'Ident_IT_1.csv' + ], + "Encoding" : "latin1", + "SplitMultiCode" : False } } DICT_FR=FR_HOME+'dictionaries/Dictionnaire2015.csv' DICT_HU=HU_HOME+'dictionaries/Hungarian_dictionary_UTF8.csv' DICT_IT=IT_HOME+'dictionaries/dictionary_IT.csv' -DICT_PREPROCESED=PREPARED_DATA_FOLDER+'dictionaries.p' \ No newline at end of file + +DICT_PREPROCESED=PREPARED_DATA_FOLDER+'dictionaries.p' diff --git a/code_jurica/loader.py b/code_jurica/loader.py index a195673137bf53359dd98efad1de5f7a87eeb1a8..3d06d5572c5d35313e325068a6cf5880a7e3f42e 100644 --- a/code_jurica/loader.py +++ 
b/code_jurica/loader.py @@ -1,17 +1,19 @@ from util import * import numpy as np +from keras.preprocessing.sequence import pad_sequences +from keras.preprocessing.text import Tokenizer +from keras.layers import Embedding import random from sklearn.model_selection import train_test_split import pickle -import os - -from keras.preprocessing.text import Tokenizer -from keras.layers import Embedding #REPRODUCIBLE -os.environ['PYTHONHASHSEED'] = '0' np.random.seed(42) +import random random.seed(12345) +import os +os.environ['PYTHONHASHSEED'] = '0' +#REPRODUCIBLE source_kerasTokenizer = Tokenizer() target_kerasTokenizer = Tokenizer() @@ -22,6 +24,7 @@ SEED = 777 frCorpora, frErrors = prepareData.prepareData('FR') itCorpora, itErrors = prepareData.prepareData('IT') huCorpora, huErrors = prepareData.prepareData('HU') +# print(len(frErrors), len(itErrors), len(huErrors)) try: df = pd.DataFrame(frErrors+itErrors+huErrors, columns=['Dataset','DocID', 'MissingRowID']) @@ -56,6 +59,11 @@ source_kerasTokenizer.word_index=source_vocab with open('models/s2s_source_tokenizer_extended.p', 'wb') as handle: pickle.dump(source_kerasTokenizer, handle) +# source_word_sequence=kerasTokenizer.texts_to_sequences(source_corpus) +# source_max_sequence = max([len(x) for x in source_word_sequence]) +# source_word_sequence = pad_sequences(source_word_sequence, maxlen=source_max_sequence, padding='post') +# print('See source lengths: {} {}'.format(source_max_sequence_tokenizer, source_max_sequence)) + #TARGET TOKENS target_corpus=['sos '+x[1]+' eos' for x in corpora] target_tokens = tokenizer.transform([x for x in target_corpus]) @@ -67,12 +75,6 @@ target_kerasTokenizer.word_index=target_vocab with open('models/s2s_target_tokenizer_extended.p', 'wb') as handle: pickle.dump(target_kerasTokenizer, handle) - -# source_word_sequence=kerasTokenizer.texts_to_sequences(source_corpus) -# source_max_sequence = max([len(x) for x in source_word_sequence]) -# source_word_sequence = 
pad_sequences(source_word_sequence, maxlen=source_max_sequence, padding='post') -# print('See source lengths: {} {}'.format(source_max_sequence_tokenizer, source_max_sequence)) - # target_word_sequence=kerasTokenizer.texts_to_sequences(target_corpus) # target_max_sequence = max([len(x) for x in target_word_sequence]) # target_word_sequence = pad_sequences(target_word_sequence, maxlen=target_max_sequence, padding='post') @@ -103,20 +105,20 @@ target_embedding_layer = Embedding(target_embeddings.shape[0], mask_zero=True) #generate train/test split -source_train, source_val, _, _ = train_test_split(source_corpus, labels, test_size=0.01, random_state=777) -target_train, target_val, labels_train, labels_val = train_test_split(target_corpus, labels, test_size=0.01, random_state=777) - -data_set_train_test = { - 'source_train':source_train, - 'source_val':source_val, - 'target_train':target_train, - 'target_val':target_val, - 'labels_train':labels_train, - 'labels_val':labels_val -} - -with open('models/train_test_split_extended.p', 'wb') as handle: - pickle.dump(data_set_train_test, handle) +source_train, source_val, _, _ = train_test_split(source_corpus, labels, test_size=0.05, random_state=777) +target_train, target_val, labels_train, labels_val = train_test_split(target_corpus, labels, test_size=0.05, random_state=777) + +# data_set_train_test = { +# 'source_train':source_train, +# 'source_val':source_val, +# 'target_train':target_train, +# 'target_val':target_val, +# 'labels_train':labels_train, +# 'labels_val':labels_val +# } +# +# with open('models/train_test_split_extended.p', 'wb') as handle: +# pickle.dump(data_set_train_test, handle) # # target_train_onehot = np.zeros((len(target_train), target_max_sequence, len(target_vocab)+1)) # for seq_id, sequence in enumerate(target_train): diff --git a/code_jurica/util.py b/code_jurica/util.py index f1bd951791f108d7c4fa48e6b0468a41baae075c..64de4cce88cc8023d6eb06ded7e362b02e0f32fc 100644 --- a/code_jurica/util.py +++ 
b/code_jurica/util.py @@ -223,26 +223,51 @@ class prepareData(): errors = pickle.load(open('data/preprocesed/Errors_{}.p'.format(dataset),'rb')) except FileNotFoundError: - cb = pd.read_csv(TRAINING[dataset]['CB'], sep=';', encoding = "latin1") - cc = pd.read_csv(TRAINING[dataset]['CC'], sep=';', encoding = "latin1") + file_encoding = TRAINING[dataset]['Encoding'] + split_multi_code = TRAINING[dataset]['SplitMultiCode'] + + cc_files = TRAINING[dataset]['CC'] + cc_data = [pd.read_csv(cc_file, sep=';', encoding=file_encoding, skipinitialspace=True) for cc_file in cc_files] + cc = pd.concat(cc_data) + + cb_files = TRAINING[dataset]['CB'] + cb_data = [pd.read_csv(cb_file, sep=';', encoding=file_encoding, skipinitialspace=True) for cb_file in cb_files] + cb = pd.concat(cb_data) + data = [] errors= [] for index, row in tqdm(cb.iterrows(), ascii=True, desc='Preparing {}'.format(dataset)): try: - text = cc[(cc.DocID == row.DocID) & (cc.LineID == row.LineID)] + text = cc[(cc.DocID == row.DocID) & (cc.LineID == row.LineID) & (cc.YearCoded == row.YearCoded)] + num_icd10_codes = len(text) # print(text.StandardText.values[0], text.ICD10.values[0]) - data.append([ - row.RawText.lower(), - text.StandardText.values[0].lower(), - text.ICD10.values[0] - ]) + + appended_data = False + if split_multi_code and num_icd10_codes > 1: + parts = row.RawText.lower().split(",") + if len(parts) == num_icd10_codes: + for i in range(num_icd10_codes): + data.append([parts[i], text.StandardText.values[i], text.ICD10.values[i]]) + + appended_data = True + + if not appended_data: + data.append([ + row.RawText.lower(), + text.StandardText.values[0].lower(), + text.ICD10.values[0] + ]) + except Exception as e: # print(e) # print(row.DocID, row.LineID) errors.append([dataset, row.DocID, row.LineID]) + output_folder = "data/preprocesed/" + os.makedirs(output_folder, exist_ok=True) + pickle.dump(data, open('data/preprocesed/Train_{}.p'.format(dataset),'wb')) pickle.dump(errors, 
class ICD10LabelEncoders(object):
    """Bundle of the four label encoders used for ICD10 targets.

    One encoder is kept per granularity: chapter, section, subsection and
    the complete ICD10 code.
    """

    def __init__(self, chapter_encoder: "LabelEncoder", section_encoder: "LabelEncoder",
                 subsection_encoder: "LabelEncoder", code_encoder: "LabelEncoder"):
        # Stored unchanged, ordered from the coarsest to the finest granularity.
        (self.chapter_encoder,
         self.section_encoder,
         self.subsection_encoder,
         self.code_encoder) = (chapter_encoder, section_encoder,
                               subsection_encoder, code_encoder)
class EvaluationResult(object):
    """Accuracy obtained by one classifier on one data set for one target label."""

    def __init__(self, target_label: str, classifier_name: str, data_set_name: str, accuracy: float):
        # What was predicted and by which classifier ...
        self.target_label, self.classifier_name = target_label, classifier_name
        # ... on which data set, and the resulting accuracy score.
        self.data_set_name, self.accuracy = data_set_name, accuracy
batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), + # + # ("DNN-300", lambda label, input_dim, output_dim, val_data: + # self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], + # batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), + # + # ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data: + # self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], + # batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), + # ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data: + # self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], + # batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), + # + # ("DNN-200-100", lambda label, input_dim, output_dim, val_data: + # self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], + # batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), + # ("DNN-200-200", lambda label, input_dim, output_dim, val_data: + # self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], + # batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), + # ("DNN-300-200", lambda label, input_dim, output_dim, val_data: + # self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200], + # batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)), + # + # ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data: + # self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], + # 
batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), + # ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data: + # self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], + # batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), + # ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data: + # self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim,hidden_layer_sizes=[300, 200], + # batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)), + + ('DU1', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="stratified")), + ('DU2', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="most_frequent")) + ] + + num_experiments = len(target_label_configs) * len(test_sets) * len(named_classifiers) + cur_experiment = 1 + + models_dir = os.path.join(AppContext.default().output_dir, "models") + os.makedirs(models_dir, exist_ok=True) + + results = [] + for target_label, target_column, label_encoder in target_label_configs: + self.logger.info("Start evaluation experiments with label %s", target_label) + output_dim = len(label_encoder.classes_) + train_labels = conf.train_df[target_column].values + + #complete_train_data = np.append(dict_embeddings, train_cert_embeddings, axis=0) + #complete_train_labels = np.append(config.dict_df[target_column].values, config.train_cert_df[target_column].values, axis=0) + + self.logger.info("Build complete training samples (data: %s, labels: %s)", conf.train_data.shape, train_labels.shape) + + for cl_name, classifier_factory in named_classifiers: + self.logger.info("Start training of classifier %s", cl_name) + classifier = classifier_factory(target_label, conf.input_dim, output_dim, conf.val_data) + classifier.fit(conf.train_data, train_labels) + + classifier_file_name = 
"cl_{}_{}.model".format(cl_name, target_label).lower() + classifier_file = os.path.join(models_dir, classifier_file_name) + try: + joblib.dump(classifier, classifier_file) + except: + self.logger.error("Error while saving classifier %s to %s", cl_name, classifier_file) + + self.logger.info("Start evaluation of %s", cl_name) + for ts_name, inputs, data_frame in test_sets: + gold_labels = data_frame[target_column].values + + self.logger.info("Evaluate data set %s", ts_name) + prediction = classifier.predict(inputs) + acc_score = accuracy_score(gold_labels, prediction) + + self.logger.info("Evaluation result: label=%s | classifier=%s | data_set=%s | acc_score=%s", + target_label, cl_name, ts_name, acc_score) + results.append(EvaluationResult(target_label, cl_name, ts_name, acc_score)) + + self.logger.info("Finished experiment %s out of %s", cur_experiment, num_experiments) + cur_experiment += 1 + + return results + + def prepare_label_encoders(self, dict_df: DataFrame, cert_df: DataFrame) -> ICD10LabelEncoders: + self.logger.info("Fitting label encoder to ICD10 codes") + icd10_code_encoder = LabelEncoder() + icd10_code_encoder.fit(list([icd10.strip() for icd10 in dict_df["ICD10"].values]) + + list([icd10.strip() for icd10 in cert_df["ICD10"].values])) + self.logger.info("Found %s distinct ICD10 codes within the data set", len(icd10_code_encoder.classes_)) + + self.logger.info("Fitting label encoder to ICD10 chapters") + icd10_chapter_encoder = LabelEncoder() + icd10_chapter_encoder.fit(list([icd10.strip().lower()[0] for icd10 in dict_df["ICD10"].values]) + + list([icd10.strip().lower()[0] for icd10 in cert_df["ICD10"].values])) + self.logger.info("Found %s distinct ICD10 chapters within the data set", len(icd10_chapter_encoder.classes_)) + + self.logger.info("Fitting label encoder to ICD10 section") + icd10_section_encoder = LabelEncoder() + icd10_section_encoder.fit(list([icd10.strip().lower()[0:2] for icd10 in dict_df["ICD10"].values]) + + 
def get_label_configuration(self, target_labels: List[str], icd10_encoders: "ICD10LabelEncoders") -> List:
    """Translate target label names into (label, column, encoder) triples.

    :param target_labels: label names; each granularity has a short and a
        long alias ("chap"/"chapter", "sect"/"section",
        "subs"/"subsection", "code"/"icd10")
    :param icd10_encoders: holder of the per-granularity label encoders
    :return: one ``(label, encoded column name, encoder)`` tuple per
        recognised label, in input order; unknown labels are logged as
        errors and left out
    """
    # alias -> (encoded data frame column, encoder attribute on icd10_encoders)
    known_labels = {
        "chap": ("ICD10_chapter_encoded", "chapter_encoder"),
        "chapter": ("ICD10_chapter_encoded", "chapter_encoder"),
        "sect": ("ICD10_section_encoded", "section_encoder"),
        "section": ("ICD10_section_encoded", "section_encoder"),
        "subs": ("ICD10_subsection_encoded", "subsection_encoder"),
        "subsection": ("ICD10_subsection_encoded", "subsection_encoder"),
        "code": ("ICD10_encoded", "code_encoder"),
        "icd10": ("ICD10_encoded", "code_encoder"),
    }

    configs = []
    for name in target_labels:
        entry = known_labels.get(name)
        if entry is None:
            self.logger.error("Can't create label configuration for label " + name)
            continue

        column, encoder_attr = entry
        # The encoder is looked up lazily, only for labels actually requested.
        configs.append((name, column, getattr(icd10_encoders, encoder_attr)))

    return configs
def save_evaluation_results(self, eval_results: List["EvaluationResult"]):
    """Write the evaluation results to four tab-separated files.

    ``results.csv`` keeps the incoming order; the other three files are
    sorted by classifier name, data set name and target label. Each sort
    starts from the ordering produced for the previous file.

    :param eval_results: results to export, one output line per entry
    """
    ordered = eval_results
    for file_name, sort_key in (
        ("results.csv", None),
        ("results_by_classifier.csv", lambda result: result.classifier_name),
        ("results_by_data_set.csv", lambda result: result.data_set_name),
        ("results_by_label.csv", lambda result: result.target_label),
    ):
        target_path = os.path.join(AppContext.default().output_dir, file_name)
        # The with-statement closes the file; no explicit close() is needed.
        with open(target_path, "w", encoding="utf8") as out:
            if sort_key:
                ordered = sorted(ordered, key=sort_key)

            out.writelines(
                "%s\t%s\t%s\t%s\n" % (r.target_label, r.classifier_name, r.data_set_name, r.accuracy)
                for r in ordered
            )
def get_strategy_by_name(self, name: str, args: Namespace) -> Callable:
    """Return the negative sampling strategy registered under ``name``.

    :param name: "def" for the default strategy, "ext1" for the extended one
    :param args: parsed arguments; "def" reads ``num_neg_samples``, "ext1"
        reads ``num_neg_cha``, ``num_neg_sec``, ``num_neg_sub`` and
        ``num_neg_oth``
    :return: the value produced by the matching strategy factory
    :raises AssertionError: if ``name`` is not a supported strategy
    """
    # FIXME: Make args to dictionary
    if name == "ext1":
        return self.extended1_strategy(args.num_neg_cha, args.num_neg_sec,
                                       args.num_neg_sub, args.num_neg_oth)

    if name == "def":
        return self.default_strategy(args.num_neg_samples)

    raise AssertionError("Unsupported negative sampling strategy: " + name)
def extended1_strategy(self, num_chapter_samples, num_section_samples, num_subsection_samples, num_other_samples):
    """Build a negative sampler that prefers entries from the same ICD10 chapter.

    Only chapter-level sampling is active. The section and subsection
    quotas are still counted in the expected number of "similar" samples,
    so whatever the chapter cannot supply is drawn from other chapters.

    :param num_chapter_samples: max. samples from the same chapter
    :param num_section_samples: quota added to the expected similar samples
    :param num_subsection_samples: quota added to the expected similar samples
    :param num_other_samples: samples drawn from other chapters
    :return: function ``(dictionary_df, icd10_code) -> DataFrame`` returning
        dictionary rows whose ICD10 code differs from ``icd10_code``
    """
    def _sample(dictionary_df: DataFrame, icd10_code: str):
        # Slicing (instead of indexing) keeps an empty code from raising.
        icd10_chapter = icd10_code[:1].lower()

        # Boolean masks instead of string-built queries: codes containing
        # quote characters no longer break the expression.
        is_other_code = dictionary_df["ICD10"] != icd10_code
        in_same_chapter = dictionary_df["ICD10_chapter"] == icd10_chapter

        chapter_samples = dictionary_df[is_other_code & in_same_chapter]
        if len(chapter_samples) > 0:
            chapter_samples = chapter_samples.sample(min(num_chapter_samples, len(chapter_samples)))

        act_sim_samples = len(chapter_samples)
        exp_sim_samples = num_chapter_samples + num_section_samples + num_subsection_samples

        # Fill the gap between expected and actual similar samples with
        # entries from other chapters.
        other_samples = dictionary_df[is_other_code & ~in_same_chapter]
        other_samples = other_samples.sample(
            min(num_other_samples + (exp_sim_samples - act_sim_samples), len(other_samples)))

        return pd.concat([chapter_samples, other_samples])

    return _sample
== "hu": + return self.read_hu_dictionary() + elif id == "fr": + return self.read_fr_dictionary() + elif id == "all-con": + return self.read_all_con_dictionary() + else: + raise AssertionError("Unsupported language: " + id) + # -------------------------------------------------------------------------------- def read_it_train_certificates(self) -> DataFrame: @@ -41,13 +60,17 @@ class Clef18Task1Data(LoggingMixin): calculees_file = os.path.join(base_folder, "CausesCalculees_IT_1.csv") brutes_file = os.path.join(base_folder, "CausesBrutes_IT_1.csv") - return self._read_certificates(calculees_file, brutes_file) + return self._read_certificates([calculees_file], [brutes_file]) def read_it_dictionary(self) -> DataFrame: base_folder = "data/train/IT/training/raw/dictionaries" dictionary_file = os.path.join(base_folder, "dictionary_IT.csv") return self._read_icd10_dictionary(dictionary_file, "iso-8859-1") + def read_it_test_certificates(self) -> DataFrame: + brutes_file = "data/test/IT/test/raw/corpus/CausesBrutes_IT_2.csv" + return self._read_test_data(brutes_file) + # -------------------------------------------------------------------------------- def read_hu_train_certificates(self) -> DataFrame: @@ -56,23 +79,36 @@ class Clef18Task1Data(LoggingMixin): calculees_file = os.path.join(base_folder, "CausesCalculees_HU_1.csv") brutes_file = os.path.join(base_folder, "CausesBrutes_HU_1.csv") - return self._read_certificates(calculees_file, brutes_file) + return self._read_certificates([calculees_file], [brutes_file]) def read_hu_dictionary(self) -> DataFrame: base_folder = "data/train/HU/training/raw/dictionaries" dictionary_file = os.path.join(base_folder, "Hungarian_dictionary_UTF8.csv") return self._read_icd10_dictionary(dictionary_file, "utf-8") + def read_hu_test_certificates(self) -> DataFrame: + brutes_file = "data/test/HU/test/raw/corpus/CausesBrutes_HU_2.csv" + return self._read_test_data(brutes_file) + # 
def read_fr_train_certificates(self) -> DataFrame:
    """Read the French training certificates (2006-2012, 2013 and 2014 files).

    :return: whatever ``_read_certificates`` produces for the three
        calculees files and the three brutes files
    """
    # FIXME: Load other training files from 2011-2015!
    corpus_dir = "data/train/FR/training/raw/corpus/"

    calculees_names = (
        "CausesCalculees_FR_2006-2012.csv",
        "CausesCalculees_FR_2013.csv",
        "CausesCalculees_FR_2014.csv",
    )
    brutes_names = (
        "CausesBrutes_FR_2006-2012.csv",
        "CausesBrutes_FR_2013.csv",
        "CausesBrutes_FR_2014.csv",
    )

    return self._read_certificates(
        [os.path.join(corpus_dir, file_name) for file_name in calculees_names],
        [os.path.join(corpus_dir, file_name) for file_name in brutes_names],
    )
@@ -81,6 +117,22 @@ class Clef18Task1Data(LoggingMixin): dictionary_file = os.path.join(base_folder, "Dictionnaire2006-2010.csv") return self._read_icd10_dictionary(dictionary_file, "utf-8") + def read_fr_test_certificates(self) -> DataFrame: + brutes_file = "data/test/FR/test/raw/corpus/CausesBrutes_FR_2.csv" + return self._read_test_data(brutes_file) + + # -------------------------------------------------------------------------------- + + def read_all_con_dictionary(self) -> DataFrame: + return pd.concat([self.read_fr_dictionary(), + self.read_it_dictionary(), + self.read_hu_dictionary()]) + + def read_all_con_certificates(self) -> DataFrame: + return pd.concat([self.read_fr_train_certificates(), + self.read_it_train_certificates(), + self.read_hu_train_certificates()]) + # -------------------------------------------------------------------------------- def filter_single_code_lines(self, certificate_df: DataFrame) -> DataFrame: @@ -104,19 +156,61 @@ class Clef18Task1Data(LoggingMixin): return certificate_df + def down_sample_by_icd10_frequency(self, certificate_df: DataFrame, max_freq: int): + self.logger.info("Down sampled data set with %s entries", len(certificate_df)) + icd10_codes = certificate_df["ICD10"].unique() + + data_sets = [] + for code in tqdm(icd10_codes,desc="down-sample", total=len(icd10_codes)): + entries_by_code = certificate_df.query("ICD10 == '%s'" % code) + if len(entries_by_code) > max_freq: + unique_texts = entries_by_code["RawText"].unique() + + unique_entries = [] + for text in unique_texts: + unique_entries.append(entries_by_code.query("RawText == \"%s\"" % text)[0:1]) + + unique_entries.append(entries_by_code.sample(max(max_freq-len(unique_texts), 10))) + entries_by_code = pd.concat(unique_entries) + + data_sets.append(entries_by_code) + + sampled_df = pd.concat(data_sets) + sampled_df = sampled_df.sample(frac=1) # Reshuffle! 
+ self.logger.info("Down sampled data set contains %s entries", len(sampled_df)) + return sampled_df + # -------------------------------------------------------------------------------- - def _read_certificates(self, calculees_file: str, brutus_file: str) -> DataFrame: - self.logger.info("Reading calculees file from %s", calculees_file) - calculees_data = pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"], skipinitialspace=True) - self.logger.info("Found %s death certificate lines", len(calculees_data)) + def _read_certificates(self, calculees_files: List[str], brutus_files: List[str]) -> DataFrame: + calculees_data = [] + for calculees_file in calculees_files: + self.logger.info("Reading calculees file from %s", calculees_file) + calculees_data.append(pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"], + skipinitialspace=True)) + self.logger.info("Found %s death certificate entries", len(calculees_data[-1])) + + calculees_data = pd.concat(calculees_data) + self.logger.info("Found %s death certificate lines in total", len(calculees_data)) + + brutus_data = [] + for brutus_file in brutus_files: + self.logger.info("Reading brutus file from %s", brutus_file) + brutus_data.append(pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"], + skipinitialspace=True)) + self.logger.info("Found %s death certificate entries", len(brutus_data[-1])) - self.logger.info("Reading brutus file from %s", brutus_file) - brutus_data = pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"]) + brutus_data = pd.concat(brutus_data) joined_data = brutus_data.join(calculees_data, lsuffix="_b", rsuffix="_c") joined_data["ICD10"] = joined_data["ICD10"].astype(str) + num_unchecked_data = len(joined_data) + joined_data = joined_data.query("ICD10 != 'nan'") + self.logger.info("Removed %s lines with ICD10 'nan'", num_unchecked_data - 
len(joined_data)) + + joined_data = pdu.clean_text("RawText").fit_transform(joined_data) + return joined_data[["RawText", "ICD10"]] def _read_icd10_dictionary(self, file: str, encoding: str) -> DataFrame: @@ -139,6 +233,13 @@ class Clef18Task1Data(LoggingMixin): return dictionary_data + def _read_test_data(self, file: str) -> DataFrame: + self.logger.info("Reading test certificates from %s", file) + test_data = pd.read_csv(file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"], skipinitialspace=True) + self.logger.info("Found %s test certificate lines.", len(test_data)) + + return test_data + def check_multi_label_distribution(ds_name: str, certificate_df: DataFrame): print("Data set: ", ds_name) @@ -167,24 +268,108 @@ def check_multi_label_distribution(ds_name: str, certificate_df: DataFrame): print("\n\n\n") +def check_word_dictionary_overlap(cert_df: DataFrame, dict_df: DataFrame, dict_file: str): + words = set() + with open(dict_file, "r", encoding="utf8") as dict_reader: + for line in dict_reader.readlines(): + words.add(line.strip().split(" ")[0]) + dict_reader.close() + + cert_words = set() + for i, row in cert_df.iterrows(): + for word in str(row["RawText"]).lower().split(" "): + cert_words.add(word) + + dict_words = set() + for i, row in dict_df.iterrows(): + for word in str(row["DiagnosisText"]).lower().split(" "): + dict_words.add(word) + + inter_cert_words = words.intersection(cert_words) + print(len(inter_cert_words) / len(cert_words)) + + inter_dict_words = words.intersection(dict_words) + print(len(inter_dict_words) / len(dict_words)) + +def check_multi_code_lines(cert_df: DataFrame): + duplicate_ids = cert_df.index.duplicated(keep=False) + lines_with_multiple_codes = cert_df[duplicate_ids] + + num_multi_code_lines = len(lines_with_multiple_codes.index.unique()) + + line_occurrences = cert_df.index.value_counts() + + + correct_single = set() + incorrect_single = set() + + correct_multi = set() + incorrect_multi = set() + + 
for i, row in cert_df.iterrows(): + num_commas = str(row["RawText"]).count(",") + num_icd10_codes = line_occurrences[i] + + if num_commas == (num_icd10_codes - 1) and num_icd10_codes == 1: + correct_single.add(i) + + if num_commas != (num_icd10_codes - 1) and num_icd10_codes == 1: + incorrect_single.add(i) + + if num_commas == (num_icd10_codes-1) and num_icd10_codes > 1: + correct_multi.add(i) + + if num_commas != (num_icd10_codes - 1) and num_icd10_codes > 1: + incorrect_multi.add(i) + + + print("Number of multi code lines: ", num_multi_code_lines) + print("-> Equal number of commas and codes: ", len(correct_multi)) + + print("Multi line codes:") + print("\tCorrect: ", len(correct_multi)) + print("\tIncorrect: ", len(incorrect_multi)) + + print("Single line codes:") + print("\tCorrect: ", len(correct_single)) + print("\tIncorrect: ", len(incorrect_single)) + +def check_label_distribution(cert_df: DataFrame): + distribution = cert_df["ICD10"].value_counts() + print(distribution) + + if __name__ == "__main__": # Just for debugging / development purposes AppContext.initialize_by_app_name("Clef18Task1-Data") clef_task_data = Clef18Task1Data() + it_certificates = clef_task_data.read_it_train_certificates() + it_dictionary = clef_task_data.read_it_dictionary() + + check_label_distribution(it_certificates) + #it_certificates = clef_task_data.down_sample_by_icd10_frequency(it_certificates, 800) + check_label_distribution(it_certificates) + +# check_word_dictionary_overlap(it_certificates, it_dictionary, "data/dictionary/it-en.txt") + + hu_certificates = clef_task_data.read_hu_train_certificates() + hu_dictionary = clef_task_data.read_hu_dictionary() + + check_label_distribution(hu_certificates) + #hu_certificates = clef_task_data.down_sample_by_icd10_frequency(hu_certificates, 2750) + check_label_distribution(hu_certificates) + # check_word_dictionary_overlap(hu_certificates, hu_dictionary, "data/dictionary/hu-en.txt") - certificates = 
clef_task_data.read_it_train_certificates() - clef_task_data.add_masked_icd10_column(certificates, 4, "RARE-ICD10") + fr_certificates = clef_task_data.read_fr_train_certificates() + fr_dictionary = clef_task_data.read_fr_dictionary() - it_dict = clef_task_data.read_it_dictionary() - fr_dict = clef_task_data.read_fr_dictionary() - hu_dict = clef_task_data.read_hu_dictionary() + check_label_distribution(fr_certificates) + fr_certificates = clef_task_data.down_sample_by_icd10_frequency(fr_certificates, 2750) + check_label_distribution(fr_certificates) - print("HU: ", len(hu_dict["ICD10"].unique())) - print("IT: ", len(it_dict["ICD10"].unique())) - print("FR: ", len(fr_dict["ICD10"].unique())) - print("HU: ", hu_dict["ICD10"].value_counts()) + # check_word_dictionary_overlap(fr_certificates, fr_dictionary, "data/dictionary/fr-en.txt") # certificates = pdu.extract_icd10_chapter("ICD10", "ICD10_chapter").fit_transform(certificates) # certificates = pdu.extract_icd10_subchapter("ICD10", "ICD10_subchapter").fit_transform(certificates) diff --git a/code_mario/clef18_task1_emb1.py b/code_mario/clef18_task1_emb1.py index cc9529225480f80f4edb5d6539a2eb276fac0aab..dad8532f8cfa39af49a2d87dd059e8ffdba4bb80 100644 --- a/code_mario/clef18_task1_emb1.py +++ b/code_mario/clef18_task1_emb1.py @@ -1,55 +1,37 @@ +import concurrent + +from gensim.models import FastText + from init import * +from concurrent.futures import ThreadPoolExecutor + +from clef18_task1_base import ICD10LabelEncoders, Clef18Task1Base, EvaluationConfiguration, NegativeSampling +from clef18_task1_emb2 import EvaluationResult import argparse import numpy as np import pandas as pd import keras as k -import pickle import os -from argparse import Namespace -from gensim.models import FastText from keras import Input, Model -from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, CSVLogger from keras.layers import Bidirectional, Dense, Dot, LSTM, Embedding from keras.preprocessing.sequence import 
pad_sequences from keras.preprocessing.text import Tokenizer from pandas import DataFrame -from sklearn.dummy import DummyClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import SGDClassifier from sklearn.metrics import f1_score, accuracy_score -from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import Pipeline -from sklearn.preprocessing import LabelEncoder -from sklearn.svm import LinearSVC -from sklearn.tree import DecisionTreeClassifier from tqdm import tqdm from typing import Tuple, Dict, List, Callable -from sklearn.externals import joblib -import ft_embeddings from app_context import AppContext from clef18_task1_data import Clef18Task1Data -from dnn_classifiers import NeuralNetworkClassifiers as nnc -from ft_embeddings import FastTextEmbeddings +from ft_embeddings import FastTextEmbeddings, FastTextModel from preprocessing import DataPreparationUtil as pdu from keras_extension import KerasUtil as ku -from util import LoggingMixin -class ICD10LabelEncoders(object): - - def __init__(self, chapter_encoder: LabelEncoder, section_encoder: LabelEncoder, - subsection_encoder: LabelEncoder, code_encoder: LabelEncoder): - self.chapter_encoder = chapter_encoder - self.section_encoder = section_encoder - self.subsection_encoder = subsection_encoder - self.code_encoder = code_encoder - - -class Configuration(object): +class Emb1Configuration(object): def __init__(self, train_cert_df: DataFrame, val_cert_df: DataFrame, test_cert_df: DataFrame, dict_df: DataFrame, max_cert_length: int, max_dict_length: int, ft_embedding_size: int, label_column: str, @@ -65,31 +47,58 @@ class Configuration(object): self.label_encoders = label_encoders self.keras_tokenizer = keras_tokenizer +class Clef18Task1Emb1(Clef18Task1Base): + + def __init__(self): + Clef18Task1Base.__init__(self) -class EvaluationResult(object): + def build_embedding_model(self, word_index: Dict, 
ft_model: FastTextModel, max_cert_length: int, max_dict_length: int): + # TODO: Make hyper-parameter configurable! + # TODO: Think about using CNNs instead of RNNs since we can have multiple ICD-10 per line! + + embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size)) + for word, i in word_index.items(): + try: + embedding_vector = ft_model.lookup(word) + if embedding_vector is not None: + # words not found in embedding index will be all-zeros. + embedding_matrix[i] = embedding_vector + except KeyError: + self.logger.error("Can't create embedding for '%s'", word) - def __init__(self, target_label: str, classifier_name: str, data_set_name: str, accuracy: float): - self.target_label = target_label - self.classifier_name = classifier_name - self.data_set_name = data_set_name - self.accuracy = accuracy + embedding = Embedding(len(word_index)+1, ft_model.vector_size, weights=[embedding_matrix], mask_zero=True) + # Model 1: Learn a representation of a line originating from a death certificate + input_certificate_line = Input((max_cert_length, )) + cert_embeddings = embedding(input_certificate_line) + certificate_rnn = Bidirectional(LSTM(200), name="cert_rnn")(cert_embeddings) -class Clef18Task1Emb1(LoggingMixin): + # Model 2: Learn a representation of a line in the ICD-10 dictionary (~ DiagnosisText) + input_dictionary_line = Input((max_dict_length, )) + dictionary_embeddings = embedding(input_dictionary_line) + dictionary_rnn = Bidirectional(LSTM(200), name="dict_rnn")(dictionary_embeddings) - def __init__(self): - LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file) + # Calculate similarity between both representations + dot_product = Dot(axes=1, normalize=True)([certificate_rnn, dictionary_rnn]) - def train_embedding_model(self, config: Configuration, ft_model: FastText, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model: - self.logger.info("Start building training pairs") - train_pair_data = 
self.build_pairs(config.train_cert_df, config.dict_df, neg_sampling_strategy) - self.logger.info("Label distribution:\n%s", train_pair_data["Label"].value_counts()) + output = Dense(1, activation='sigmoid')(dot_product) + # Create the primary training model + model = Model(inputs=[input_certificate_line, input_dictionary_line], outputs=output, name="ICD10-Embedding-Model1") + model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"]) + + return model + + def train_embedding_model(self, config: Emb1Configuration, ft_model: FastTextModel, max_pos_samples: int, neg_sampling_strategy: Callable, + epochs: int, batch_size: int, workers: int, chunk_size: int) -> Model: self.logger.info("Start building embedding model") - model = self.build_embedding_model(config.keras_tokenizer.word_index, ft_model, - config.max_cert_length, config.max_dict_length) + model = self.build_embedding_model(config.keras_tokenizer.word_index, ft_model, config.max_cert_length, config.max_dict_length) model.summary(print_fn=self.logger.info) + self.logger.info("Start building training pairs") + train_pair_data = self.build_pairs(config.train_cert_df, config.dict_df, max_pos_samples, neg_sampling_strategy) + self.logger.info("Label distribution:\n%s", train_pair_data["Label"].value_counts()) + cert_inputs = pad_sequences(train_pair_data["Cert_input"].values, maxlen=config.max_cert_length, padding="post") dict_inputs = pad_sequences(train_pair_data["Dict_input"].values, maxlen=config.max_dict_length, padding="post") labels = train_pair_data["Label"].values @@ -99,18 +108,18 @@ class Clef18Task1Emb1(LoggingMixin): if config.val_cert_df is not None and len(config.test_cert_df) > 0: self.logger.info("Start creation of validation pairs") - val_pair_data = self.build_pairs(config.val_cert_df, config.dict_df, neg_sampling_strategy) + val_pair_data = self.build_pairs(config.val_cert_df, config.dict_df, max_pos_samples, neg_sampling_strategy) val_cert_inputs = 
pad_sequences(val_pair_data["Cert_input"].values, maxlen=config.max_cert_length, padding="post") val_dict_inputs = pad_sequences(val_pair_data["Dict_input"].values, maxlen=config.max_dict_length, padding="post") val_gold_labels = val_pair_data["Label"].values - model.fit([cert_inputs, dict_inputs], labels, epochs=epochs, batch_size=batch_size, - validation_data=([val_cert_inputs, val_dict_inputs], val_gold_labels), + val_data = ([val_cert_inputs, val_dict_inputs], val_gold_labels) + model.fit([cert_inputs, dict_inputs], labels, epochs=epochs, batch_size=batch_size, validation_data=val_data, callbacks=[ku.best_model_checkpointing_by_file_path(best_model_file, "val_loss")]) else: model.fit([cert_inputs, dict_inputs], labels, epochs=epochs, batch_size=batch_size, - callbacks=[ku.best_model_checkpointing_by_file_path(best_model_file)]) + callbacks=[ku.best_model_checkpointing_by_file_path(best_model_file)]) model_file = os.path.join(AppContext.default().output_dir, "embedding_model_last.h5") self.logger.info("Saving last model to %s", model_file) @@ -125,7 +134,7 @@ class Clef18Task1Emb1(LoggingMixin): self.logger.info("Start evaluation of embedding model!") self.logger.info("Start creation of test pairs") - test_pair_data = self.build_pairs(config.test_cert_df, config.dict_df, neg_sampling_strategy) + test_pair_data = self.build_pairs(config.test_cert_df, config.dict_df, max_pos_samples, neg_sampling_strategy) test_cert_inputs = pad_sequences(test_pair_data["Cert_input"].values, maxlen=config.max_cert_length, padding="post") test_dict_inputs = pad_sequences(test_pair_data["Dict_input"].values, maxlen=config.max_dict_length, padding="post") @@ -142,199 +151,83 @@ class Clef18Task1Emb1(LoggingMixin): return model - def train_and_evaluate_classifiers(self, emb_model: Model, config: Configuration, target_labels: List) -> List[EvaluationResult]: - self.logger.info("Start training and evaluation of classifier models") + # 
--------------------------------------------------------------------------------------------------------------------------------------- - self.logger.info("Building dictionary embeddings") - dict_input = emb_model.inputs[1] - dict_rnn = Model(inputs=dict_input, outputs=emb_model.get_layer("dict_rnn").output, name="Dict-RNN-Model") - dict_inputs = pad_sequences(config.dict_df["Token_ids"].values, maxlen=config.max_dict_length) - dict_embeddings = dict_rnn.predict(dict_inputs, verbose=1, batch_size=1) + def train_and_evaluate_classifiers(self, emb_model: Model, config: Emb1Configuration, target_labels: List) -> List[EvaluationResult]: + self.logger.info("Start training and evaluation of classifier models") - self.logger.info("Building certificate embeddings") cert_input = emb_model.inputs[0] cert_rnn = Model(inputs=cert_input, outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model") - train_cert_inputs = pad_sequences(config.train_cert_df["Token_ids"].values, maxlen=config.max_cert_length) + self.logger.info("Building dictionary embeddings") + dict_input = emb_model.inputs[1] + #dict_rnn = Model(inputs=dict_input, outputs=emb_model.get_layer("dict_rnn").output, name="Dict-RNN-Model") + dict_inputs = pad_sequences(config.dict_df["Token_ids"].values, maxlen=config.max_cert_length, padding="post") + dict_embeddings = cert_rnn.predict(dict_inputs, verbose=1, batch_size=1) + + self.logger.info("Building train certificate embeddings") + train_cert_inputs = pad_sequences(config.train_cert_df["Token_ids"].values, maxlen=config.max_cert_length, padding="post") train_cert_embeddings = cert_rnn.predict(train_cert_inputs, verbose=1) self.logger.info("cert train input shape: %s", train_cert_embeddings.shape) - val_cert_inputs = pad_sequences(config.val_cert_df["Token_ids"].values, maxlen=config.max_cert_length) - val_cert_embeddings = cert_rnn.predict(val_cert_inputs, verbose=1) - self.logger.info("cert val input shape: %s", val_cert_embeddings.shape) + 
self.logger.info("Building val certificate embeddings") + val_inputs = pad_sequences(config.val_cert_df["Token_ids"].values, maxlen=config.max_cert_length, padding="post") + val_embeddings = cert_rnn.predict(val_inputs, verbose=1) + self.logger.info("cert val input shape: %s", val_embeddings.shape) - test_cert_inputs = pad_sequences(config.test_cert_df["Token_ids"].values, maxlen=config.max_cert_length) - test_cert_embeddings = cert_rnn.predict(test_cert_inputs, verbose=1) - self.logger.info("cert test input shape: %s", test_cert_embeddings.shape) + self.logger.info("Building test certificate embeddings") + test_inputs = pad_sequences(config.test_cert_df["Token_ids"].values, maxlen=config.max_cert_length, padding="post") + test_embeddings = cert_rnn.predict(test_inputs, verbose=1) + self.logger.info("cert test input shape: %s", test_embeddings.shape) target_label_configs = self.get_label_configuration(target_labels, config.label_encoders) + target_label_columns = [label_column for _, label_column, _ in target_label_configs] - test_sets = [ - #("dict", dict_embeddings, config.dict_df), - ("cert-train", train_cert_embeddings, config.train_cert_df), - ("cert-val", val_cert_embeddings, config.val_cert_df), - ("cert-test", test_cert_embeddings, config.test_cert_df) - ] - - named_classifiers = [ - ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()), - ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")), - ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)), - ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)), - ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)), - ("LinearSVM", lambda label, input_dim, output_dim, val_data: LinearSVC(max_iter=10000, verbose=1, random_state=42)), - - ("DNN-200", lambda label, input_dim, output_dim, val_data: - 
self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], - batch_normalization=False, dropout_rate=0.0, epochs=10, batch_size=2)), - ("DNN-300", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - - ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - - ("DNN-200-100", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - ("DNN-200-200", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - ("DNN-300-200", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - - ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, 
output_dim=output_dim, hidden_layer_sizes=[200, 100], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - - ('DU1', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="stratified")), - ('DU2', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="most_frequent")) - ] - - num_experiments = len(target_label_configs) * len(test_sets) * len(named_classifiers) - cur_experiment = 1 + train_labels = pd.concat([config.dict_df[target_label_columns], config.train_cert_df[target_label_columns]]) + train_data = np.append(dict_embeddings, train_cert_embeddings, axis=0) input_dim = cert_rnn.output.shape[1].value - models_dir = os.path.join(AppContext.default().output_dir, "models") - os.makedirs(models_dir, exist_ok=True) - - results = [] - for target_label, target_column, label_encoder in target_label_configs: - self.logger.info("Start evaluation experiments with label %s", target_label) - output_dim = len(label_encoder.classes_) - - complete_train_data = np.append(dict_embeddings, train_cert_embeddings, axis=0) - complete_train_labels = np.append(config.dict_df[target_column].values, config.train_cert_df[target_column].values, axis=0) - self.logger.info("Build complete training samples (data: %s, labels: %s)", complete_train_data.shape, complete_train_labels.shape) - - for cl_name, classifier_factory in named_classifiers: - self.logger.info("Start 
training of classifier %s", cl_name) - classifier = classifier_factory(target_label, input_dim, output_dim, val_cert_embeddings) - classifier.fit(complete_train_data, complete_train_labels) - - classifier_file_name = "cl_{}_{}.model".format(cl_name, target_label).lower() - classifier_file = os.path.join(models_dir, classifier_file_name) - try: - joblib.dump(classifier, classifier_file) - except: - self.logger.error("Error while saving classifier %s to %s", cl_name, classifier_file) - - self.logger.info("Start evaluation of %s", cl_name) - for ts_name, inputs, data_frame in test_sets: - gold_labels = data_frame[target_column].values - - self.logger.info("Evaluate data set %s", ts_name) - prediction = classifier.predict(inputs) - acc_score = accuracy_score(gold_labels, prediction) - - self.logger.info("Evaluation result: label=%s | classifier=%s | data_set=%s | acc_score=%s", - target_label, cl_name, ts_name, acc_score) - results.append(EvaluationResult(target_label, cl_name, ts_name, acc_score)) - - self.logger.info("Finished experiment %s out of %s", cur_experiment, num_experiments) - cur_experiment += 1 - - return results - - def get_label_configuration(self, target_labels: List[str], icd10_encoders: ICD10LabelEncoders) -> List: - label_configs = [] - - for target_label in target_labels: - if target_label == "chap" or target_label == "chapter": - label_configs.append((target_label, "ICD10_chapter_encoded", icd10_encoders.chapter_encoder)) - elif target_label == "sect" or target_label == "section": - label_configs.append((target_label, "ICD10_section_encoded", icd10_encoders.section_encoder)) - elif target_label == "subs" or target_label == "subsection": - label_configs.append((target_label, "ICD10_subsection_encoded", icd10_encoders.subsection_encoder)) - elif target_label == "code" or target_label == "icd10": - label_configs.append((target_label, "ICD10_encoded", icd10_encoders.code_encoder)) - else: - self.logger.error("Can't create label configuration for label 
" + target_label) + eval_configuration = EvaluationConfiguration(target_labels, config.label_encoders, input_dim, train_data, train_labels, + val_embeddings, config.val_cert_df, test_embeddings, config.test_cert_df) + return self.run_evaluation(eval_configuration) - return label_configs + # --------------------------------------------------------------------------------------------------------------------------------------- - def split_train_test(self, certificate_df: DataFrame, train_size: float, - stratified_splits: bool, label_column: str) -> Tuple[DataFrame, DataFrame]: - if stratified_splits: - self.logger.info("Creating stratified splits for column %s", label_column) - training_data, test_data = train_test_split(certificate_df, train_size=train_size, stratify=certificate_df[label_column]) - else: - self.logger.info("Creating non-stratified splits") - training_data, test_data = train_test_split(certificate_df, train_size=train_size) + def predict(self, emb_model: Model, classifier, conf: Emb1Configuration, test_df: DataFrame) -> DataFrame: + self.logger.info("Start preprocessing test data") + preprocessing_pipeline = Pipeline([ + ("LowercaseText", pdu.to_lowercase("RawText")), + ("TokenizeText", pdu.keras_sequencing("RawText", "Word_ids", conf.keras_tokenizer, False)) + ]) - return training_data, test_data + #test_df = Clef18Task1Data().read_it_train_certificates() - def build_embedding_model(self, word_index: Dict, ft_model: FastText, max_cert_length: int, max_dict_length: int): - # TODO: Make hyper-parameter configurable! - # TODO: Think about using CNNs instead of RNNs since we can have multiple ICD-10 per line! 
+ test_df = preprocessing_pipeline.fit_transform(test_df) + test_emb_inputs = pad_sequences(test_df["Word_ids"].values, maxlen=conf.max_cert_length, padding="post") + self.logger.info("Finished preprocessing of test data (shape: %s)", test_emb_inputs.shape) - embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size)) - for word, i in word_index.items(): - try: - embedding_vector = ft_model[word] - if embedding_vector is not None: - # words not found in embedding index will be all-zeros. - embedding_matrix[i] = embedding_vector - except KeyError: - self.logger.error("Can't create embedding for '%s'", word) - - embedding = Embedding(len(word_index)+1, ft_model.vector_size, weights=[embedding_matrix], mask_zero=True) - - # Model 1: Learn a representation of a line originating from a death certificate - input_certificate_line = Input((max_cert_length, )) - cert_embeddings = embedding(input_certificate_line) - certificate_rnn = Bidirectional(LSTM(200), name="cert_rnn")(cert_embeddings) - - # Model 2: Learn a representation of a line in the ICD-10 dictionary (~ DiagnosisText) - input_dictionary_line = Input((max_dict_length, )) - dictionary_embeddings = embedding(input_dictionary_line) - dictionary_rnn = Bidirectional(LSTM(200), name="dict_rnn")(dictionary_embeddings) + self.logger.info("Start generation of embeddings") + cert_input = emb_model.inputs[0] + cert_rnn = Model(inputs=cert_input, outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model") + test_cl_inputs = cert_rnn.predict(test_emb_inputs, 16, verbose=1) - # Calculate similarity between both representations - dot_product = Dot(axes=1, normalize=True)([certificate_rnn, dictionary_rnn]) + predictions = classifier.predict(test_cl_inputs) + predicted_labels = conf.label_encoders.code_encoder.inverse_transform(predictions) - output = Dense(1, activation='sigmoid')(dot_product) + result = DataFrame(columns=["DocID", "YearCoded", "LineID", "Rank", "StandardText", "ICD10", "IntervalText"]) + 
for i, (id, row) in tqdm(enumerate(test_df.iterrows()), desc="build-result", total=len(test_df)): + result = result.append({"DocID": id[1], "YearCoded" : id[0], "LineID" : id[2], "Rank": "", + "StandardText": "", "ICD10" : predicted_labels[i], "IntervalText": ""}, ignore_index=True) - # Create the primary training model - model = Model(inputs=[input_certificate_line, input_dictionary_line], outputs=output, name="ICD10-Embedding-Model") - model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"]) + output_file = os.path.join(AppContext.default().output_dir, "prediction_train.csv") + result.to_csv(output_file, sep=";", index=False) + return result - return model + # --------------------------------------------------------------------------------------------------------------------------------------- - def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastText, train_ratio: float, val_ratio: float, - strat_column: str, samples: int=None, stratified_splits: bool=False) -> Configuration: + def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastTextModel, train_ratio: float, val_ratio: float, + strat_column: str, samples: int=None, stratified_splits: bool=False) -> Emb1Configuration: if samples: self.logger.info("Sampling %s instances", samples) @@ -362,35 +255,8 @@ class Clef18Task1Emb1(LoggingMixin): self.logger.info("Start preparation of dictionary data (%s instances)", len(dict_df)) dict_df, max_dict_length = self.prepare_dictionary_df(dict_df, "train", label_encoders, keras_tokenizer) - return Configuration(train_cert_df, val_cert_df, test_cert_df, dict_df, max_cert_length, max_dict_length, - ft_model.vector_size, strat_column, label_encoders, keras_tokenizer) - - def prepare_label_encoders(self, dict_df: DataFrame, cert_df: DataFrame) -> ICD10LabelEncoders: - self.logger.info("Fitting label encoder to ICD10 codes") - icd10_code_encoder = LabelEncoder() - 
icd10_code_encoder.fit(list([icd10.strip() for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip() for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 codes within the data set", len(icd10_code_encoder.classes_)) - - self.logger.info("Fitting label encoder to ICD10 chapters") - icd10_chapter_encoder = LabelEncoder() - icd10_chapter_encoder.fit(list([icd10.strip().lower()[0] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0] for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 chapters within the data set", len(icd10_chapter_encoder.classes_)) - - self.logger.info("Fitting label encoder to ICD10 section") - icd10_section_encoder = LabelEncoder() - icd10_section_encoder.fit(list([icd10.strip().lower()[0:2] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0:2] for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 sections within the data set", len(icd10_section_encoder.classes_)) - - self.logger.info("Fitting label encoder to ICD10 subsection") - icd10_subsection_encoder = LabelEncoder() - icd10_subsection_encoder.fit(list([icd10.strip().lower()[0:3] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0:3] for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 subsections within the data set", len(icd10_subsection_encoder.classes_)) - - return ICD10LabelEncoders(icd10_chapter_encoder, icd10_section_encoder, icd10_subsection_encoder, icd10_code_encoder) + return Emb1Configuration(train_cert_df, val_cert_df, test_cert_df, dict_df, max_cert_length, max_dict_length, + ft_model.vector_size, strat_column, label_encoders, keras_tokenizer) def prepare_certificate_df(self, certificate_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders, keras_tokenizer: Tokenizer) -> Tuple[DataFrame, int]: @@ -454,9 +320,7 @@ class Clef18Task1Emb1(LoggingMixin): return dict_data_prepared, 
max_length - def build_pairs(self, certificate_data: DataFrame, dictionary_data: DataFrame, neg_sampling_strategy: Callable): - # FIXME: This can be implemented more efficiently! - # FIXME: Improve sampling of negative instances (especially if code is I-XXX sample other codes of the same class (e.g. I-YYY) + def build_pairs(self, certificate_data: DataFrame, dictionary_data: DataFrame, max_pos_samples: int, neg_sampling_strategy: Callable): # FIXME: Use negative sample ratio (depending on true dictionary entries), e.g. 0.5 or 1.2 certificate_vectors = [] @@ -468,6 +332,14 @@ class Clef18Task1Emb1(LoggingMixin): # Build positive examples (based on training data) dictionary_entries = dictionary_data.query("ICD10 == '%s'" % line_icd10_code) + if len(dictionary_entries) > 0: + dictionary_entries = dictionary_entries.sample(min(max_pos_samples, len(dictionary_entries))) + else: + # Add at least one example + certificate_vectors.append(cert_row["Token_ids"]) + dictionary_vectors.append(cert_row["Token_ids"]) + labels.append(1.0) + #self.logger.info("Found %s entries for ICD-10 code %s", len(dictionary_entries), line_icd10_code) for i, dict_row in dictionary_entries.iterrows(): certificate_vectors.append(cert_row["Token_ids"]) @@ -488,146 +360,68 @@ class Clef18Task1Emb1(LoggingMixin): data = {"Cert_input": certificate_vectors, "Dict_input": dictionary_vectors, "Label": labels} return pd.DataFrame(data) - def build_rnn_input(self, data: DataFrame, column: str, max_length: int, vector_size: int) -> np.ndarray: - data_matrix = np.zeros((len(data), max_length, vector_size)) - - for i, (_, row) in tqdm(enumerate(data.iterrows()), desc="build-matrices", total=len(data)): - data_matrix[i] = row[column] - - return data_matrix - - def save_evaluation_results(self, eval_results: List[EvaluationResult]): - result_configurations = [ - ("results.csv", None), - ("results_by_classifier.csv", lambda result: result.classifier_name), - ("results_by_data_set.csv", lambda result: 
result.data_set_name), - ("results_by_label.csv", lambda result: result.target_label) - ] - - for file_name, sort_key in result_configurations: - results_file = os.path.join(AppContext.default().output_dir, file_name) - with open(results_file, "w", encoding="utf8") as result_writer: - if sort_key: - eval_results = sorted(eval_results, key=sort_key) - - for r in eval_results: - result_writer.write("%s\t%s\t%s\t%s\n" % (r.target_label, r.classifier_name, r.data_set_name, r.accuracy)) - result_writer.close() - - def save_arguments(self, arguments: Namespace): - arguments_file = os.path.join(AppContext.default().log_dir, "arguments.txt") - self.logger.info("Saving arguments to " + arguments_file) - - with open(arguments_file, 'w', encoding="utf8") as writer: - for key, value in arguments.__dict__.items(): - writer.write("%s=%s\n" % (str(key), str(value))) - writer.close() - - def save_configuration(self, configuration: Configuration): - label_encoder_file = os.path.join(AppContext.default().output_dir, "label_encoder.pk") - self.logger.info("Saving label encoder to " + label_encoder_file) - with open(label_encoder_file, 'wb') as encoder_writer: - pickle.dump(configuration.label_encoders, encoder_writer) - encoder_writer.close() - - keras_tokenizer_file = os.path.join(AppContext.default().output_dir, "keras_tokenizer.pk") - self.logger.info("Saving keras sequencer to " + keras_tokenizer_file) - with open(keras_tokenizer_file, 'wb') as keras_sequencer_writer: - pickle.dump(configuration.keras_tokenizer, keras_sequencer_writer) - keras_sequencer_writer.close() - - configuration_file = os.path.join(AppContext.default().output_dir, "configuration.pk") - self.logger.info("Saving configuration to " + configuration_file) - with open(configuration_file, 'wb') as train_conf_writer: - pickle.dump(configuration, train_conf_writer) - train_conf_writer.close() - - def reload_configuration(self, file_path: str): - self.logger.info("Reloading configuration from " + file_path) - with 
open(args.train_conf, 'rb') as train_conf_reader: - configuration = pickle.load(train_conf_reader) - train_conf_reader.close() - - return configuration - - def reload_embedding_model(self, emb_model_file: str): - self.logger.info("Reloading embedding model from " + emb_model_file) - return k.models.load_model(args.emb_model) - - def create_dnn_classifier(self, model_name, label: str, val_data: Tuple, **kwargs): - if val_data is not None: - monitor_loss = "val_loss" - else: - monitor_loss = "loss" - - callbacks = [ - ku.best_model_checkpointing_by_model_name(model_name), - ku.csv_logging_callback(model_name, label), - ku.early_stopping(monitor_loss, 5) - ] + def build_pairs_para(self, certificate_data: DataFrame, dictionary_data: DataFrame, max_pos_samples: int, + neg_sampling_strategy: Callable, workers: int, chunk_size: int): + # FIXME: This can be implemented more efficiently! + # FIXME: Use negative sample ratio (depending on true dictionary entries), e.g. 0.5 or 1.2 - kwargs["callbacks"] = callbacks - return nnc.dense_network(**kwargs) + def _run_build_pairs(df_slice: DataFrame): + loc_certificate_vectors = [] + loc_dictionary_vectors = [] + loc_labels = [] + for i, cert_row in df_slice.iterrows(): + line_icd10_code = cert_row["ICD10"] -class NegativeSampling(LoggingMixin): + # Build positive examples (based on training data) + dictionary_entries = dictionary_data.query("ICD10 == '%s'" % line_icd10_code) + dictionary_entries = dictionary_entries.sample(min(max_pos_samples, len(dictionary_entries))) - def __init__(self): - LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file) + #self.logger.info("Found %s entries for ICD-10 code %s", len(dictionary_entries), line_icd10_code) + for i, dict_row in dictionary_entries.iterrows(): + loc_certificate_vectors.append(cert_row["Token_ids"]) + loc_dictionary_vectors.append(dict_row["Token_ids"]) - def get_strategy_by_name(self, name: str, args: Namespace) -> Callable: - #FIXME: Make 
args to dictionary - - if name == "def": - return self.default_strategy(args.num_neg_samples) - elif name == "ext1": - return self.extended1_strategy(args.num_neg_cha, args.num_neg_sec, args.num_neg_sub, args.num_neg_oth) - else: - raise AssertionError("Unsupported negative sampling strategy: " + name) + loc_labels.append(1.0) - def default_strategy(self, num_negative_samples: int) -> Callable: - def _sample(dictionary_df: DataFrame, line_icd10_code: str): - negative_samples = dictionary_df.query("ICD10 != '%s'" % line_icd10_code) + # Build negative samples + # Find illegal ICD-10 for this line + negative_samples = neg_sampling_strategy(certificate_data, line_icd10_code) - # Only necessary during development and tests with only very few examples - if len(negative_samples) > 0: - negative_samples = negative_samples.sample(min(num_negative_samples, len(negative_samples))) + for i, neg_row in negative_samples.iterrows(): + loc_certificate_vectors.append(cert_row["Token_ids"]) + loc_dictionary_vectors.append(neg_row["Token_ids"]) - return negative_samples + loc_labels.append(0.0) - return _sample + return loc_certificate_vectors, loc_dictionary_vectors, loc_labels - def extended1_strategy(self, num_chapter_samples, num_section_samples, num_subsection_samples, num_other_samples): - def _sample(dictionary_df: DataFrame, icd10_code: str): - icd10_chapter = icd10_code[0].lower() - icd10_section = icd10_code[0:2].lower() - icd10_subsection = icd10_code[0:3].lower() - - chapter_samples = dictionary_df.query("ICD10 != '%s' & ICD10_chapter == '%s'" % (icd10_code, icd10_chapter)) - if len(chapter_samples) > 0: - chapter_samples = chapter_samples.sample(min(num_chapter_samples, len(chapter_samples))) + certificate_vectors = [] + dictionary_vectors = [] + labels = [] - section_samples = dictionary_df.query("ICD10 != '%s' & ICD10_section == '%s'" % (icd10_code, icd10_section)) - if len(section_samples) > 0: - section_samples = section_samples.sample(min(num_section_samples, 
len(section_samples))) + with ThreadPoolExecutor(max_workers=workers) as executor: + futures = [] + for i in range(0, len(certificate_data), chunk_size): + futures.append(executor.submit(_run_build_pairs, certificate_data[i:i + chunk_size])) - subsection_samples = dictionary_df.query("ICD10 != '%s' & ICD10_subsection == '%s'" % (icd10_code, icd10_subsection)) - if len(subsection_samples) > 0: - subsection_samples = subsection_samples.sample(min(num_subsection_samples, len(subsection_samples))) + compl_futures = concurrent.futures.as_completed(futures) + for future in tqdm(compl_futures, desc="build-pairs", total=len(futures)): + slice_result = future.result() + certificate_vectors = certificate_vectors + slice_result[0] + dictionary_vectors = dictionary_vectors + slice_result[1] + labels = labels + slice_result[2] - exp_sim_samples = num_chapter_samples + num_section_samples + num_subsection_samples - act_sim_samples = len(chapter_samples) + len(section_samples) + len(subsection_samples) + data = {"Cert_input": certificate_vectors, "Dict_input": dictionary_vectors, "Label": labels} + return pd.DataFrame(data) - other_samples = dictionary_df.query("ICD10 != '%s' & ICD10_chapter != '%s'" % (icd10_code, icd10_chapter)) - other_samples = other_samples.sample(min(num_other_samples + (exp_sim_samples - act_sim_samples), len(other_samples))) + def build_rnn_input(self, data: DataFrame, column: str, max_length: int, vector_size: int) -> np.ndarray: + data_matrix = np.zeros((len(data), max_length, vector_size)) - # print("#Chapter samples: ", len(chapter_samples)) - # print("#Section samples: ", len(section_samples)) - # print("#Subsection samples: ", len(subsection_samples)) - # print("#Other samples: ", len(other_samples)) + for i, (_, row) in tqdm(enumerate(data.iterrows()), desc="build-matrices", total=len(data)): + data_matrix[i] = row[column] - return pd.concat([chapter_samples, section_samples, subsection_samples, other_samples]) - return _sample + return data_matrix 
if __name__ == "__main__": @@ -635,23 +429,29 @@ if __name__ == "__main__": subparsers = parser.add_subparsers(dest="mode") train_emb_parser = subparsers.add_parser("train-emb") - train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"]) + train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu", "all-con"]) train_emb_parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int) train_emb_parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int) + train_emb_parser.add_argument("--workers", help="Number of threads during pair building", default=4, type=int) + train_emb_parser.add_argument("--slice_size", help="Number of cert entries to be handled by one thread during pair building", default=1000, type=int) train_emb_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float) train_emb_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.3, type=float) train_emb_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int) + train_emb_parser.add_argument("--down_sample", help="Maximal frequency of ICD10 code until start down sampling", default=None, type=int) train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_masked", type=str) train_emb_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool) + train_emb_parser.add_argument("--strat_min_freq", help="Min frequency of an icd10 code to be an own class during stratification", default=8, type=int) train_emb_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append') + 
train_emb_parser.add_argument("--single_only", help="Indicates whether just to use the single code lines from the data", default=False, type=bool) + train_emb_parser.add_argument("--max_pos_samples", help="Maximal number of positive samples to use", default=10, type=int) train_emb_parser.add_argument("--neg_sampling", help="Negative sampling strategy to use", default="ext1", choices=["def", "ext1"]) train_emb_parser.add_argument("--num_neg_samples", help="Number of negative samples to use (default strategy)", default=75, type=int) train_emb_parser.add_argument("--num_neg_cha", help="Number of negative chapter samples to use (ext1 strategy)", default=10, type=int) - train_emb_parser.add_argument("--num_neg_sec", help="Number of negative section samples to use (ext1 strategy)", default=10, type=int) - train_emb_parser.add_argument("--num_neg_sub", help="Number of negative subsection samples to use (ext1 strategy)", default=10, type=int) - train_emb_parser.add_argument("--num_neg_oth", help="Number of negative other samples to use (ext1 strategy)", default=40, type=int) + train_emb_parser.add_argument("--num_neg_sec", help="Number of negative section samples to use (ext1 strategy)", default=20, type=int) + train_emb_parser.add_argument("--num_neg_sub", help="Number of negative subsection samples to use (ext1 strategy)", default=20, type=int) + train_emb_parser.add_argument("--num_neg_oth", help="Number of negative other samples to use (ext1 strategy)", default=10, type=int) eval_classifier_parser = subparsers.add_parser("eval-cl") eval_classifier_parser.add_argument("emb_model", help="Path to the embedding model to use") @@ -664,43 +464,75 @@ if __name__ == "__main__": eval_classifier_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool) eval_classifier_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append') + predict_parser = 
subparsers.add_parser("pred") + predict_parser.add_argument("emb_model", help="Path to the learned embedding model to use") + predict_parser.add_argument("cl_model", help="Path to the learned classifier model to use") + predict_parser.add_argument("train_conf", help="Path to the training configuration dump") + predict_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"]) + args = parser.parse_args() AppContext.initialize_by_app_name(Clef18Task1Emb1.__name__ + "-" + args.mode) + clef18_task1 = Clef18Task1Emb1() + clef18_task1.save_arguments(args) + clef_data = Clef18Task1Data() - dictionary = clef_data.read_dictionary_by_language(args.lang) - #dictionary = dictionary.sample(1200) - certificates = clef_data.read_train_certifcates_by_language(args.lang) - certificates = clef_data.filter_single_code_lines(certificates) - certificates = clef_data.add_masked_icd10_column(certificates, 10) + if args.mode == "train-emb": + dictionary = clef_data.read_dictionary_by_id(args.lang) + certificates = clef_data.read_train_certifcates_by_id(args.lang) - sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] - #ft_model = FastText(sentences, min_count=1) + ft_embeddings = FastTextEmbeddings() + ft_model = ft_embeddings.load_embeddings_by_id(args.lang) - ft_embeddings = FastTextEmbeddings() - ft_model = ft_embeddings.load_embeddings_by_language(args.lang) + #sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] + #ft_model = FastTextModel("dummy", [FastText(sentences, min_count=1)]) - clef18_task1 = Clef18Task1Emb1() - clef18_task1.save_arguments(args) + if args.down_sample: + certificates = clef_data.down_sample_by_icd10_frequency(certificates, args.down_sample) - if args.mode == "train-emb": - configuration = clef18_task1.prepare_data_set(certificates, dictionary, ft_model, args.train_ratio, args.val_ratio,args.strat_column, + if args.single_only: + certificates = clef_data.filter_single_code_lines(certificates) + + if args.strat_splits: 
+ certificates = clef_data.add_masked_icd10_column(certificates, args.strat_min_freq) + + configuration = clef18_task1.prepare_data_set(certificates, dictionary, ft_model, args.train_ratio, args.val_ratio, args.strat_column, args.samples, args.strat_splits) clef18_task1.save_configuration(configuration) neg_sampling = NegativeSampling() neg_sampling_strategy = neg_sampling.get_strategy_by_name(args.neg_sampling, args) - embedding_model = clef18_task1.train_embedding_model(configuration, ft_model, neg_sampling_strategy, args.epochs, args.batch_size) + embedding_model = clef18_task1.train_embedding_model(configuration, ft_model, args.max_pos_samples, neg_sampling_strategy, + args.epochs, args.batch_size, args.workers, args.slice_size) + + clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels) elif args.mode == "eval-cl": configuration = clef18_task1.reload_configuration(args.train_conf) embedding_model = clef18_task1.reload_embedding_model(args.emb_model) - eval_result = clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels) - clef18_task1.save_evaluation_results(eval_result) + clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels) + + elif args.mode == "pred": + embedding_model = clef18_task1.reload_embedding_model(args.emb_model) + classifer_model = clef18_task1.reload_classifier(args.cl_model) + configuration = clef18_task1.reload_configuration(args.train_conf) + + test_certificates = clef_data.read_test_certifcates_by_lang(args.lang) + + ft_embeddings = FastTextEmbeddings() + ft_model = ft_embeddings.load_embeddings_by_id(args.lang) + + #sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] + #ft_model = FastTextModel("dummy", [FastText(sentences, min_count=1)]) + + clef18_task1.predict(embedding_model, classifer_model, configuration, test_certificates) + + #eval_result = clef18_task1.train_and_evaluate_classifiers(embedding_model, 
configuration, args.target_labels) + #clef18_task1.save_evaluation_results(eval_result) diff --git a/code_mario/clef18_task1_emb2.py b/code_mario/clef18_task1_emb2.py index 7cc15c73ed539bce8ba721f98847c6028efc9991..6cc3200f8b371960f4e240d624cfeb07d7f6c02c 100644 --- a/code_mario/clef18_task1_emb2.py +++ b/code_mario/clef18_task1_emb2.py @@ -1,42 +1,28 @@ from init import * +from clef18_task1_base import Clef18Task1Base, EvaluationConfiguration, NegativeSampling import argparse import numpy as np import pandas as pd import keras as k -import pickle import os -from argparse import Namespace -from gensim.models import FastText from keras import Input, Model -from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, CSVLogger from keras.layers import Bidirectional, Dense, Dot, LSTM, Embedding, GlobalMaxPool1D from keras.preprocessing.sequence import pad_sequences from keras.preprocessing.text import Tokenizer from pandas import DataFrame -from sklearn.dummy import DummyClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import SGDClassifier from sklearn.metrics import f1_score, accuracy_score -from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import Pipeline from sklearn.preprocessing import LabelEncoder -from sklearn.svm import LinearSVC -from sklearn.tree import DecisionTreeClassifier from tqdm import tqdm from typing import Tuple, Dict, List, Callable -from sklearn.externals import joblib -import ft_embeddings from app_context import AppContext from clef18_task1_data import Clef18Task1Data -from dnn_classifiers import NeuralNetworkClassifiers as nnc -from ft_embeddings import FastTextEmbeddings +from ft_embeddings import FastTextEmbeddings, FastTextModel from preprocessing import DataPreparationUtil as pdu from keras_extension import KerasUtil as ku -from util import LoggingMixin class ICD10LabelEncoders(object): @@ -72,12 +58,12 @@ class 
EvaluationResult(object): self.accuracy = accuracy -class Clef18Task1Emb2(LoggingMixin): +class Clef18Task1Emb2(Clef18Task1Base): def __init__(self): - LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file) + Clef18Task1Base.__init__(self) - def train_embedding_model(self, config: Configuration, ft_model: FastText, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model: + def train_embedding_model(self, config: Configuration, ft_model: FastTextModel, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model: self.logger.info("Start building training pairs") train_pair_data = self.build_pairs(config.train_df, neg_sampling_strategy) self.logger.info("Label distribution:\n%s", train_pair_data["Label"].value_counts()) @@ -159,137 +145,20 @@ class Clef18Task1Emb2(LoggingMixin): target_label_configs = self.get_label_configuration(target_labels, config.label_encoders) - test_sets = [ - #("dict", dict_embeddings, config.dict_df), - ("cert-train", train_embeddings, config.train_df), - ("cert-val", val_embeddings, config.val_df), - ("cert-test", test_embeddings, config.test_df) - ] - - named_classifiers = [ - ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()), - ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")), - ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)), - ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)), - ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)), - ("LinearSVM", lambda label, input_dim, output_dim, val_data: LinearSVC(max_iter=10000, verbose=1, random_state=42)), - - ("DNN-200", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], - 
batch_normalization=False, dropout_rate=0.0, epochs=10, batch_size=2)), - ("DNN-300", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - - ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - - ("DNN-200-100", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - ("DNN-200-200", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - ("DNN-300-200", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - - ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - 
("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - - ('DU1', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="stratified")), - ('DU2', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="most_frequent")) - ] - - num_experiments = len(target_label_configs) * len(test_sets) * len(named_classifiers) - cur_experiment = 1 - input_dim = text_rnn.output.shape[1].value - models_dir = os.path.join(AppContext.default().output_dir, "models") - os.makedirs(models_dir, exist_ok=True) - - results = [] - for target_label, target_column, label_encoder in target_label_configs: - self.logger.info("Start evaluation experiments with label %s", target_label) - output_dim = len(label_encoder.classes_) - - for cl_name, classifier_factory in named_classifiers: - self.logger.info("Start training of classifier %s", cl_name) - classifier = classifier_factory(target_label, input_dim, output_dim, val_embeddings) - classifier.fit(train_embeddings, config.train_df[target_column].values) - - classifier_file_name = "cl_{}_{}.model".format(cl_name, target_label).lower() - classifier_file = os.path.join(models_dir, classifier_file_name) - try: - joblib.dump(classifier, classifier_file) - except: - self.logger.error("Error while saving classifier %s to %s", cl_name, classifier_file) - - self.logger.info("Start evaluation of %s", cl_name) - for ts_name, inputs, data_frame in test_sets: - gold_labels = data_frame[target_column].values - - 
self.logger.info("Evaluate data set %s", ts_name) - prediction = classifier.predict(inputs) - acc_score = accuracy_score(gold_labels, prediction) - - self.logger.info("Evaluation result: label=%s | classifier=%s | data_set=%s | acc_score=%s", - target_label, cl_name, ts_name, acc_score) - results.append(EvaluationResult(target_label, cl_name, ts_name, acc_score)) - - self.logger.info("Finished experiment %s out of %s", cur_experiment, num_experiments) - cur_experiment += 1 - - return results - - def get_label_configuration(self, target_labels: List[str], icd10_encoders: ICD10LabelEncoders) -> List: - label_configs = [] - - for target_label in target_labels: - if target_label == "chap" or target_label == "chapter": - label_configs.append((target_label, "ICD10_chapter_encoded", icd10_encoders.chapter_encoder)) - elif target_label == "sect" or target_label == "section": - label_configs.append((target_label, "ICD10_section_encoded", icd10_encoders.section_encoder)) - elif target_label == "subs" or target_label == "subsection": - label_configs.append((target_label, "ICD10_subsection_encoded", icd10_encoders.subsection_encoder)) - elif target_label == "code" or target_label == "icd10": - label_configs.append((target_label, "ICD10_encoded", icd10_encoders.code_encoder)) - else: - self.logger.error("Can't create label configuration for label " + target_label) - - return label_configs - - def split_train_test(self, certificate_df: DataFrame, train_size: float, - stratified_splits: bool, label_column: str) -> Tuple[DataFrame, DataFrame]: - if stratified_splits: - self.logger.info("Creating stratified splits for column %s", label_column) - training_data, test_data = train_test_split(certificate_df, train_size=train_size, stratify=certificate_df[label_column]) - else: - self.logger.info("Creating non-stratified splits") - training_data, test_data = train_test_split(certificate_df, train_size=train_size) - - return training_data, test_data + eval_config = 
EvaluationConfiguration(target_labels, config.label_encoders, input_dim, train_embeddings, config.train_df, + val_embeddings, config.val_df, test_embeddings, config.test_df) + return self.run_evaluation(eval_config) - def build_embedding_model(self, word_index: Dict, ft_model: FastText, conf: Configuration): + def build_embedding_model(self, word_index: Dict, ft_model: FastTextModel, conf: Configuration): # TODO: Make hyper-parameter configurable! - # TODO: Think about using CNNs instead of RNNs since we can have multiple ICD-10 per line! + # TODO: Think about using CNNs instead of RNNs! embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size)) for word, i in word_index.items(): try: - embedding_vector = ft_model[word] + embedding_vector = ft_model.lookup(word) if embedding_vector is not None: # words not found in embedding index will be all-zeros. embedding_matrix[i] = embedding_vector @@ -321,7 +190,7 @@ class Clef18Task1Emb2(LoggingMixin): return model - def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastText, train_ratio: float, val_ratio: float, + def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastTextModel, train_ratio: float, val_ratio: float, strat_column: str, samples: int=None, stratified_splits: bool=False) -> Configuration: cert_df = cert_df[["RawText", "ICD10"]] @@ -359,33 +228,6 @@ class Clef18Task1Emb2(LoggingMixin): return Configuration(train_df, val_df, test_df, max_length, ft_model.vector_size, strat_column, label_encoders, keras_tokenizer) - def prepare_label_encoders(self, dict_df: DataFrame, cert_df: DataFrame) -> ICD10LabelEncoders: - self.logger.info("Fitting label encoder to ICD10 codes") - icd10_code_encoder = LabelEncoder() - icd10_code_encoder.fit(list([icd10.strip() for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip() for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 codes within the data set", 
len(icd10_code_encoder.classes_)) - - self.logger.info("Fitting label encoder to ICD10 chapters") - icd10_chapter_encoder = LabelEncoder() - icd10_chapter_encoder.fit(list([icd10.strip().lower()[0] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0] for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 chapters within the data set", len(icd10_chapter_encoder.classes_)) - - self.logger.info("Fitting label encoder to ICD10 section") - icd10_section_encoder = LabelEncoder() - icd10_section_encoder.fit(list([icd10.strip().lower()[0:2] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0:2] for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 sections within the data set", len(icd10_section_encoder.classes_)) - - self.logger.info("Fitting label encoder to ICD10 subsection") - icd10_subsection_encoder = LabelEncoder() - icd10_subsection_encoder.fit(list([icd10.strip().lower()[0:3] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0:3] for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 subsections within the data set", len(icd10_subsection_encoder.classes_)) - - return ICD10LabelEncoders(icd10_chapter_encoder, icd10_section_encoder, icd10_subsection_encoder, icd10_code_encoder) - def prepare_cert_dict_df(self, cert_dict_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders, keras_tokenizer: Tokenizer) -> Tuple[DataFrame, int]: pipeline = Pipeline([ @@ -419,9 +261,7 @@ class Clef18Task1Emb2(LoggingMixin): return data_prepared, max_length def build_pairs(self, data_df: DataFrame, neg_sampling_strategy: Callable): - # FIXME: This can be implemented more efficiently! - # FIXME: Improve sampling of negative instances (especially if code is I-XXX sample other codes of the same class (e.g. I-YYY) - # FIXME: Use negative sample ratio (depending on true dictionary entries), e.g. 
0.5 or 1.2 + # TODO: Think about to use a negative sample ratio (depending on true dictionary entries), e.g. 0.5 or 1.2 text_vectors = [] icd10_codes = [] @@ -445,146 +285,13 @@ class Clef18Task1Emb2(LoggingMixin): data = {"Cert_input": text_vectors, "ICD10_input": icd10_codes, "Label": labels} return pd.DataFrame(data) - def save_evaluation_results(self, eval_results: List[EvaluationResult]): - result_configurations = [ - ("results.csv", None), - ("results_by_classifier.csv", lambda result: result.classifier_name), - ("results_by_data_set.csv", lambda result: result.data_set_name), - ("results_by_label.csv", lambda result: result.target_label) - ] - - for file_name, sort_key in result_configurations: - results_file = os.path.join(AppContext.default().output_dir, file_name) - with open(results_file, "w", encoding="utf8") as result_writer: - if sort_key: - eval_results = sorted(eval_results, key=sort_key) - - for r in eval_results: - result_writer.write("%s\t%s\t%s\t%s\n" % (r.target_label, r.classifier_name, r.data_set_name, r.accuracy)) - result_writer.close() - - def save_arguments(self, arguments: Namespace): - arguments_file = os.path.join(AppContext.default().log_dir, "arguments.txt") - self.logger.info("Saving arguments to " + arguments_file) - - with open(arguments_file, 'w', encoding="utf8") as writer: - for key, value in arguments.__dict__.items(): - writer.write("%s=%s\n" % (str(key), str(value))) - writer.close() - - def save_configuration(self, configuration: Configuration): - label_encoder_file = os.path.join(AppContext.default().output_dir, "label_encoder.pk") - self.logger.info("Saving label encoder to " + label_encoder_file) - with open(label_encoder_file, 'wb') as encoder_writer: - pickle.dump(configuration.label_encoders, encoder_writer) - encoder_writer.close() - - keras_tokenizer_file = os.path.join(AppContext.default().output_dir, "keras_tokenizer.pk") - self.logger.info("Saving keras sequencer to " + keras_tokenizer_file) - with 
open(keras_tokenizer_file, 'wb') as keras_sequencer_writer: - pickle.dump(configuration.keras_tokenizer, keras_sequencer_writer) - keras_sequencer_writer.close() - - configuration_file = os.path.join(AppContext.default().output_dir, "configuration.pk") - self.logger.info("Saving configuration to " + configuration_file) - with open(configuration_file, 'wb') as train_conf_writer: - pickle.dump(configuration, train_conf_writer) - train_conf_writer.close() - - def reload_configuration(self, file_path: str): - self.logger.info("Reloading configuration from " + file_path) - with open(args.train_conf, 'rb') as train_conf_reader: - configuration = pickle.load(train_conf_reader) - train_conf_reader.close() - - return configuration - - def reload_embedding_model(self, emb_model_file: str): - self.logger.info("Reloading embedding model from " + emb_model_file) - return k.models.load_model(args.emb_model) - - def create_dnn_classifier(self, model_name, label: str, val_data: Tuple, **kwargs): - if val_data is not None: - monitor_loss = "val_loss" - else: - monitor_loss = "loss" - - callbacks = [ - ku.best_model_checkpointing_by_model_name(model_name), - ku.csv_logging_callback(model_name, label), - ku.early_stopping(monitor_loss, 5) - ] - - kwargs["callbacks"] = callbacks - return nnc.dense_network(**kwargs) - - -class NegativeSampling(LoggingMixin): - - def __init__(self): - LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file) - - def get_strategy_by_name(self, name: str, args: Namespace) -> Callable: - #FIXME: Make args to dictionary - - if name == "def": - return self.default_strategy(args.num_neg_samples) - elif name == "ext1": - return self.extended1_strategy(args.num_neg_cha, args.num_neg_sec, args.num_neg_sub, args.num_neg_oth) - else: - raise AssertionError("Unsupported negative sampling strategy: " + name) - - def default_strategy(self, num_negative_samples: int) -> Callable: - def _sample(dictionary_df: DataFrame, 
line_icd10_code: str): - negative_samples = dictionary_df.query("ICD10 != '%s'" % line_icd10_code) - - # Only necessary during development and tests with only very few examples - if len(negative_samples) > 0: - negative_samples = negative_samples.sample(min(num_negative_samples, len(negative_samples))) - - return negative_samples - - return _sample - - def extended1_strategy(self, num_chapter_samples, num_section_samples, num_subsection_samples, num_other_samples): - def _sample(dictionary_df: DataFrame, icd10_code: str): - icd10_chapter = icd10_code[0].lower() - icd10_section = icd10_code[0:2].lower() - icd10_subsection = icd10_code[0:3].lower() - - chapter_samples = dictionary_df.query("ICD10 != '%s' & ICD10_chapter == '%s'" % (icd10_code, icd10_chapter)) - if len(chapter_samples) > 0: - chapter_samples = chapter_samples.sample(min(num_chapter_samples, len(chapter_samples))) - - section_samples = dictionary_df.query("ICD10 != '%s' & ICD10_section == '%s'" % (icd10_code, icd10_section)) - if len(section_samples) > 0: - section_samples = section_samples.sample(min(num_section_samples, len(section_samples))) - - subsection_samples = dictionary_df.query("ICD10 != '%s' & ICD10_subsection == '%s'" % (icd10_code, icd10_subsection)) - if len(subsection_samples) > 0: - subsection_samples = subsection_samples.sample(min(num_subsection_samples, len(subsection_samples))) - - exp_sim_samples = num_chapter_samples + num_section_samples + num_subsection_samples - act_sim_samples = len(chapter_samples) + len(section_samples) + len(subsection_samples) - - other_samples = dictionary_df.query("ICD10 != '%s' & ICD10_chapter != '%s'" % (icd10_code, icd10_chapter)) - other_samples = other_samples.sample(min(num_other_samples + (exp_sim_samples - act_sim_samples), len(other_samples))) - - # print("#Chapter samples: ", len(chapter_samples)) - # print("#Section samples: ", len(section_samples)) - # print("#Subsection samples: ", len(subsection_samples)) - # print("#Other samples: ", 
len(other_samples)) - - return pd.concat([chapter_samples, section_samples, subsection_samples, other_samples]) - return _sample - if __name__ == "__main__": parser = argparse.ArgumentParser(prog="CLEF2018") subparsers = parser.add_subparsers(dest="mode") - train_emb_parser = subparsers.add_parser("train-emb2") - train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"]) + train_emb_parser = subparsers.add_parser("train-emb") + train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu", "all-con"]) train_emb_parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int) train_emb_parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int) train_emb_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float) @@ -597,10 +304,10 @@ if __name__ == "__main__": train_emb_parser.add_argument("--neg_sampling", help="Negative sampling strategy to use", default="ext1", choices=["def", "ext1"]) train_emb_parser.add_argument("--num_neg_samples", help="Number of negative samples to use (default strategy)", default=75, type=int) - train_emb_parser.add_argument("--num_neg_cha", help="Number of negative chapter samples to use (ext1 strategy)", default=10, type=int) - train_emb_parser.add_argument("--num_neg_sec", help="Number of negative section samples to use (ext1 strategy)", default=10, type=int) - train_emb_parser.add_argument("--num_neg_sub", help="Number of negative subsection samples to use (ext1 strategy)", default=10, type=int) - train_emb_parser.add_argument("--num_neg_oth", help="Number of negative other samples to use (ext1 strategy)", default=40, type=int) + train_emb_parser.add_argument("--num_neg_cha", help="Number of negative chapter samples to use (ext1 strategy)", default=20, type=int) + train_emb_parser.add_argument("--num_neg_sec", help="Number of negative 
section samples to use (ext1 strategy)", default=20, type=int) + train_emb_parser.add_argument("--num_neg_sub", help="Number of negative subsection samples to use (ext1 strategy)", default=20, type=int) + train_emb_parser.add_argument("--num_neg_oth", help="Number of negative other samples to use (ext1 strategy)", default=45, type=int) eval_classifier_parser = subparsers.add_parser("eval-cl") eval_classifier_parser.add_argument("emb_model", help="Path to the embedding model to use") @@ -618,10 +325,10 @@ if __name__ == "__main__": AppContext.initialize_by_app_name(Clef18Task1Emb2.__name__ + "-" + args.mode) clef_data = Clef18Task1Data() - dictionary = clef_data.read_dictionary_by_language(args.lang) + dictionary = clef_data.read_dictionary_by_id(args.lang) #dictionary = dictionary.sample(1200) - certificates = clef_data.read_train_certifcates_by_language(args.lang) + certificates = clef_data.read_train_certifcates_by_id(args.lang) #certificates = clef_data.filter_single_code_lines(certificates) #certificates = clef_data.add_masked_icd10_column(certificates, 10) @@ -629,12 +336,12 @@ if __name__ == "__main__": #ft_model = FastText(sentences, min_count=1) ft_embeddings = FastTextEmbeddings() - ft_model = ft_embeddings.load_embeddings_by_language(args.lang) + ft_model = ft_embeddings.load_embeddings_by_id(args.lang) clef18_task1 = Clef18Task1Emb2() clef18_task1.save_arguments(args) - if args.mode == "train-emb2": + if args.mode == "train-emb": configuration = clef18_task1.prepare_data_set(certificates, dictionary, ft_model, args.train_ratio, args.val_ratio,args.strat_column, args.samples, args.strat_splits) clef18_task1.save_configuration(configuration) @@ -651,5 +358,3 @@ if __name__ == "__main__": eval_result = clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels) clef18_task1.save_evaluation_results(eval_result) - - diff --git a/code_mario/ft_embeddings.py b/code_mario/ft_embeddings.py index 
644b5b51bd426ca3f839bff705bcd9eadf2963e9..f2b3b7f32f2c2927cde63c370335295c22b31816 100644 --- a/code_mario/ft_embeddings.py +++ b/code_mario/ft_embeddings.py @@ -1,22 +1,57 @@ +import numpy as np + +from typing import List from gensim.models import FastText +from app_context import AppContext from util import LoggingMixin +class FastTextModel(LoggingMixin): + + def __init__(self, name: str, ft_models: List[FastText]): + LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file) + self.name = name + self.ft_models = ft_models + self.vector_size = sum([ft_model.vector_size for ft_model in self.ft_models]) + + def lookup(self, word: str): + embeddings = [] + for ft_model in self.ft_models: + try: + embeddings.append(ft_model[word]) + except KeyError as error: + self.logger.warn("Can't create embedding for " + word) + embeddings.append(np.zeros(ft_model.vector_size)) + + if len(embeddings) == 1: + return embeddings[0] + else: + return np.concatenate(embeddings) + + class FastTextEmbeddings(LoggingMixin): def __init__(self): - LoggingMixin.__init__(self, __class__.__name__) - - def load_embeddings_by_language(self, lang: str) -> FastText: - if lang == "it": - return self.load_it_embeddings() - elif lang == "hu": - return self.load_hu_embeddings() - elif lang == "fr": - return self.load_fr_embeddings() + LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file) + + def load_embeddings_by_id(self, id: str) -> FastTextModel: + if id == "it": + return FastTextModel("it", [self.load_it_embeddings()]) + + elif id == "hu": + return FastTextModel("hu", [self.load_hu_embeddings()]) + + elif id == "fr": + return FastTextModel("fr", [self.load_fr_embeddings()]) + + elif id == "all-con": + return FastTextModel("all-con", [self.load_fr_embeddings(), + self.load_it_embeddings(), + self.load_hu_embeddings()]) + else: - raise AssertionError("Unsupported language: " + lang) + raise AssertionError("Unsupported language: " 
+ id) # ------------------------------------------------------------------------------------ diff --git a/code_mario/init.py b/code_mario/init.py index 71c6dc4e493672d8f0997d3dafa210fc810ccc77..04658c6c596ae5687dbe632415b50bc8d158cac6 100644 --- a/code_mario/init.py +++ b/code_mario/init.py @@ -26,7 +26,8 @@ rn.seed(12345) # non-reproducible results. # For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res -session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) +#session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) +session_conf = tf.ConfigProto(intra_op_parallelism_threads=0, inter_op_parallelism_threads=0) from keras import backend as K diff --git a/code_mario/preprocessing.py b/code_mario/preprocessing.py index 47a2a6bb397910687cd39f80e44269cb5135bee4..ac7fad0b7829b001fa62edc66803d433d97c4b4d 100644 --- a/code_mario/preprocessing.py +++ b/code_mario/preprocessing.py @@ -88,6 +88,14 @@ class DataPreparationUtil(object): return MapFunction(icd10_column, _mask_icd10, target_column) + @staticmethod + def clean_text(column:str): + def _clean(value): + return str(value).replace("\"", " ") + + return MapFunction(column, _clean) + + class FitMixin(object): def fit(self, data, y=None): @@ -228,7 +236,7 @@ class KerasSequencer(BaseEstimator, TransformerMixin): return self def transform(self, data: DataFrame, y=None): - texts = data[self.text_column].values + texts = data[self.text_column].astype(str).values sequences = self.keras_tokenizer.texts_to_sequences(texts) return PandasUtil.append_column(data, self.target_column, sequences)