# NOTE: this module was recovered from a git patch whose line breaks had been
# lost. The header of the patch that created the file is kept verbatim here:
# diff --git a/code_mario/clef18_task1_base.py b/code_mario/clef18_task1_base.py new file mode 100644 index 0000000000000000000000000000000000000000..a16c72a8a2d06d77bd86629b2e86cbeca4f7e82b --- /dev/null +++ b/code_mario/clef18_task1_base.py @@ -0,0 +1,358 @@
import os
import pandas as pd
import keras as k
import pickle

from numpy.core.records import ndarray

from pandas import DataFrame
from argparse import Namespace
from typing import Callable, List, Tuple

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

from app_context import AppContext
from util import LoggingMixin
from dnn_classifiers import NeuralNetworkClassifiers as nnc
from keras_extension import KerasUtil as ku


class ICD10LabelEncoders(object):
    """Bundle of the four label encoders fitted by ``prepare_label_encoders``.

    One encoder per ICD10 granularity: chapter (first character of the code),
    section (first two characters), subsection (first three characters) and
    the full code.
    """

    def __init__(self, chapter_encoder: LabelEncoder, section_encoder: LabelEncoder,
                 subsection_encoder: LabelEncoder, code_encoder: LabelEncoder):
        self.chapter_encoder = chapter_encoder
        self.section_encoder = section_encoder
        self.subsection_encoder = subsection_encoder
        self.code_encoder = code_encoder


class EvaluationConfiguration(object):
    """Everything ``Clef18Task1Base.run_evaluation`` needs for one evaluation run.

    The ``*_data`` arguments are the feature matrices handed to ``fit`` /
    ``predict``. The ``*_df`` arguments are indexed by the encoded label column
    name (e.g. ``"ICD10_encoded"``) and read via ``.values``, so they must be
    data frames whose rows line up with the corresponding ``*_data`` matrix.
    """

    def __init__(self, target_labels: List[str], label_encoders: ICD10LabelEncoders, input_dim: int,
                 train_data: ndarray, train_df: DataFrame, val_data: ndarray, val_df: DataFrame,
                 test_data: ndarray, test_df: DataFrame):
        self.target_labels = target_labels
        self.label_encoders = label_encoders
        self.input_dim = input_dim
        self.train_data = train_data
        self.train_df = train_df
        self.val_data = val_data
        self.val_df = val_df
        self.test_data = test_data
        self.test_df = test_df


class EvaluationResult(object):
    """Accuracy of one classifier on one data set for one target label."""

    def __init__(self, target_label: str, classifier_name: str, data_set_name: str, accuracy: float):
        self.target_label = target_label
        self.classifier_name = classifier_name
        self.data_set_name = data_set_name
        self.accuracy = accuracy


class Clef18Task1Base(LoggingMixin):
    """Shared training / evaluation / persistence helpers for the CLEF18 task 1 models."""

    def __init__(self):
        LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)

    def run_evaluation(self, conf: EvaluationConfiguration) -> List[EvaluationResult]:
        """Train every classifier for every target label and score it on all data sets.

        For each label configuration derived from ``conf.target_labels`` each
        classifier is fitted on ``conf.train_data``, dumped (best effort) to
        ``<output_dir>/models`` and then evaluated with ``accuracy_score`` on the
        train, validation and test sets.

        :param conf: data, labels and encoders of the evaluation run
        :return: one ``EvaluationResult`` per (label, classifier, data set)
        """
        target_label_configs = self.get_label_configuration(conf.target_labels, conf.label_encoders)

        test_sets = [
            ("cert-train", conf.train_data, conf.train_df),
            ("cert-val", conf.val_data, conf.val_df),
            ("cert-test", conf.test_data, conf.test_df),
        ]
        named_classifiers = self._named_classifiers()

        num_experiments = len(target_label_configs) * len(test_sets) * len(named_classifiers)
        cur_experiment = 1

        models_dir = os.path.join(AppContext.default().output_dir, "models")
        os.makedirs(models_dir, exist_ok=True)

        results = []
        for target_label, target_column, label_encoder in target_label_configs:
            self.logger.info("Start evaluation experiments with label %s", target_label)
            output_dim = len(label_encoder.classes_)
            train_labels = conf.train_df[target_column].values

            self.logger.info("Build complete training samples (data: %s, labels: %s)",
                             conf.train_data.shape, train_labels.shape)

            for cl_name, classifier_factory in named_classifiers:
                self.logger.info("Start training of classifier %s", cl_name)
                classifier = classifier_factory(target_label, conf.input_dim, output_dim, conf.val_data)
                classifier.fit(conf.train_data, train_labels)

                classifier_file_name = "cl_{}_{}.model".format(cl_name, target_label).lower()
                classifier_file = os.path.join(models_dir, classifier_file_name)
                try:
                    joblib.dump(classifier, classifier_file)
                except Exception:
                    # Persisting the model is best effort: evaluation continues, but the
                    # traceback is logged and Ctrl-C / SystemExit are no longer swallowed.
                    self.logger.error("Error while saving classifier %s to %s", cl_name, classifier_file,
                                      exc_info=True)

                self.logger.info("Start evaluation of %s", cl_name)
                for ts_name, inputs, data_frame in test_sets:
                    gold_labels = data_frame[target_column].values

                    self.logger.info("Evaluate data set %s", ts_name)
                    prediction = classifier.predict(inputs)
                    acc_score = accuracy_score(gold_labels, prediction)

                    self.logger.info("Evaluation result: label=%s | classifier=%s | data_set=%s | acc_score=%s",
                                     target_label, cl_name, ts_name, acc_score)
                    results.append(EvaluationResult(target_label, cl_name, ts_name, acc_score))

                    # num_experiments counts one experiment per (label, classifier, data set).
                    self.logger.info("Finished experiment %s out of %s", cur_experiment, num_experiments)
                    cur_experiment += 1

        return results

    def _named_classifiers(self) -> List[Tuple[str, Callable]]:
        """Return the ``(name, factory)`` pairs evaluated by ``run_evaluation``.

        Every factory takes ``(label, input_dim, output_dim, val_data)`` and
        returns an unfitted classifier.
        """
        classifiers = [
            ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()),
            ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")),
            ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)),
            ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)),
            ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)),
            ("LinearSVM", lambda label, input_dim, output_dim, val_data:
                LinearSVC(max_iter=10000, verbose=1, random_state=42)),
        ]

        # (display name, hidden layer sizes, batch-norm + dropout 0.5?, epochs)
        # NOTE(review): DNN-200 is the only network trained for 10 epochs instead
        # of 50 -- kept as found, confirm whether this is intentional.
        dnn_specs = [
            ("DNN-200", [200], False, 10),
            ("DNN-300", [300], False, 50),
            ("DNN-200-BN-DO", [200], True, 50),
            ("DNN-300-BN-DO", [300], True, 50),
            ("DNN-200-100", [200, 100], False, 50),
            ("DNN-200-200", [200, 200], False, 50),
            ("DNN-300-200", [300, 200], False, 50),
            ("DNN-200-100-BN-DO", [200, 100], True, 50),
            ("DNN-200-200-BN-DO", [200, 200], True, 50),
            ("DNN-300-200-BN-DO", [300, 200], True, 50),
        ]
        for name, hidden_layer_sizes, regularized, epochs in dnn_specs:
            classifiers.append((name, self._dnn_factory(name.lower(), hidden_layer_sizes, regularized, epochs)))

        classifiers.append(('DU1', lambda label, input_dim, output_dim, val_data:
                            DummyClassifier(strategy="stratified")))
        classifiers.append(('DU2', lambda label, input_dim, output_dim, val_data:
                            DummyClassifier(strategy="most_frequent")))
        return classifiers

    def _dnn_factory(self, model_name: str, hidden_layer_sizes: List[int], regularized: bool, epochs: int) -> Callable:
        """Build a classifier factory for one dense network configuration."""
        dropout_rate = 0.5 if regularized else 0.0

        def _factory(label, input_dim, output_dim, val_data):
            # A fresh list per call, exactly like the literal in the former lambdas.
            return self.create_dnn_classifier(model_name, label, val_data, input_dim=input_dim, output_dim=output_dim,
                                              hidden_layer_sizes=list(hidden_layer_sizes),
                                              batch_normalization=regularized, dropout_rate=dropout_rate,
                                              epochs=epochs, batch_size=2)

        return _factory

    def prepare_label_encoders(self, dict_df: DataFrame, cert_df: DataFrame) -> ICD10LabelEncoders:
        """Fit label encoders on the ``ICD10`` column of both data frames.

        The full-code encoder sees the stripped code as is; chapter, section and
        subsection encoders see the lower-cased first one, two and three
        characters respectively.
        """
        code_encoder = self._fit_icd10_encoder(dict_df, cert_df, "codes", "codes",
                                               lambda icd10: icd10.strip())
        chapter_encoder = self._fit_icd10_encoder(dict_df, cert_df, "chapters", "chapters",
                                                  lambda icd10: icd10.strip().lower()[0])
        section_encoder = self._fit_icd10_encoder(dict_df, cert_df, "section", "sections",
                                                  lambda icd10: icd10.strip().lower()[0:2])
        subsection_encoder = self._fit_icd10_encoder(dict_df, cert_df, "subsection", "subsections",
                                                     lambda icd10: icd10.strip().lower()[0:3])

        return ICD10LabelEncoders(chapter_encoder, section_encoder, subsection_encoder, code_encoder)

    def _fit_icd10_encoder(self, dict_df: DataFrame, cert_df: DataFrame, fit_name: str, found_name: str,
                           to_label: Callable) -> LabelEncoder:
        """Fit one ``LabelEncoder`` on ``to_label`` applied to all ICD10 codes of both frames."""
        self.logger.info("Fitting label encoder to ICD10 %s", fit_name)
        labels = [to_label(icd10) for icd10 in dict_df["ICD10"].values]
        labels += [to_label(icd10) for icd10 in cert_df["ICD10"].values]

        encoder = LabelEncoder()
        encoder.fit(labels)
        self.logger.info("Found %s distinct ICD10 %s within the data set", len(encoder.classes_), found_name)
        return encoder

    def get_label_configuration(self, target_labels: List[str], icd10_encoders: ICD10LabelEncoders) -> List:
        """Map target label names to ``(label, encoded column name, encoder)`` triples.

        Unknown label names are logged as an error and skipped.
        """
        chapter = ("ICD10_chapter_encoded", icd10_encoders.chapter_encoder)
        section = ("ICD10_section_encoded", icd10_encoders.section_encoder)
        subsection = ("ICD10_subsection_encoded", icd10_encoders.subsection_encoder)
        code = ("ICD10_encoded", icd10_encoders.code_encoder)
        known_labels = {
            "chap": chapter, "chapter": chapter,
            "sect": section, "section": section,
            "subs": subsection, "subsection": subsection,
            "code": code, "icd10": code,
        }

        label_configs = []
        for target_label in target_labels:
            if target_label in known_labels:
                column, encoder = known_labels[target_label]
                label_configs.append((target_label, column, encoder))
            else:
                self.logger.error("Can't create label configuration for label %s", target_label)

        return label_configs

    def split_train_test(self, certificate_df: DataFrame, train_size: float,
                         stratified_splits: bool, label_column: str) -> Tuple[DataFrame, DataFrame]:
        """Split ``certificate_df`` into a training and a test frame.

        :param train_size: forwarded to ``train_test_split``
        :param stratified_splits: stratify on ``label_column`` when true
        """
        if stratified_splits:
            self.logger.info("Creating stratified splits for column %s", label_column)
            training_data, test_data = train_test_split(certificate_df, train_size=train_size,
                                                        stratify=certificate_df[label_column])
        else:
            self.logger.info("Creating non-stratified splits")
            training_data, test_data = train_test_split(certificate_df, train_size=train_size)

        return training_data, test_data

    def save_evaluation_results(self, eval_results: List[EvaluationResult]):
        """Write the results as tab-separated files into the output directory.

        Four files are written: unsorted, by classifier, by data set and by label.
        Each sort is applied on top of the previous ordering (``sorted`` is
        stable), so later files keep the earlier keys as tie breakers.
        """
        result_configurations = [
            ("results.csv", None),
            ("results_by_classifier.csv", lambda result: result.classifier_name),
            ("results_by_data_set.csv", lambda result: result.data_set_name),
            ("results_by_label.csv", lambda result: result.target_label),
        ]

        ordered_results = eval_results
        for file_name, sort_key in result_configurations:
            results_file = os.path.join(AppContext.default().output_dir, file_name)
            with open(results_file, "w", encoding="utf8") as result_writer:
                if sort_key:
                    ordered_results = sorted(ordered_results, key=sort_key)

                for r in ordered_results:
                    result_writer.write("%s\t%s\t%s\t%s\n" % (r.target_label, r.classifier_name,
                                                              r.data_set_name, r.accuracy))

    def save_arguments(self, arguments: Namespace):
        """Write all parsed arguments as ``key=value`` lines to ``<log_dir>/arguments.txt``."""
        arguments_file = os.path.join(AppContext.default().log_dir, "arguments.txt")
        self.logger.info("Saving arguments to %s", arguments_file)

        with open(arguments_file, 'w', encoding="utf8") as writer:
            for key, value in vars(arguments).items():
                writer.write("%s=%s\n" % (str(key), str(value)))

    def reload_embedding_model(self, emb_model_file: str):
        """Load and return the Keras model stored at ``emb_model_file``."""
        self.logger.info("Reloading embedding model from %s", emb_model_file)
        return k.models.load_model(emb_model_file)

    def create_dnn_classifier(self, model_name, label: str, val_data: Tuple, **kwargs):
        """Create a dense network classifier with checkpointing, CSV logging and early stopping.

        ``val_data`` only selects the monitored quantity (``"val_loss"`` when it
        is given, ``"loss"`` otherwise); all remaining keyword arguments are
        forwarded to ``NeuralNetworkClassifiers.dense_network``.
        """
        monitor_loss = "val_loss" if val_data is not None else "loss"

        kwargs["callbacks"] = [
            ku.best_model_checkpointing_by_model_name(model_name),
            ku.csv_logging_callback(model_name, label),
            ku.early_stopping(monitor_loss, 5),
        ]
        return nnc.dense_network(**kwargs)

    def save_configuration(self, configuration):
        """Pickle the label encoders, the Keras tokenizer and the configuration itself.

        ``configuration`` must expose ``label_encoders`` and ``keras_tokenizer``.
        """
        self._pickle_to_output_dir("label_encoder.pk", "label encoder", configuration.label_encoders)
        self._pickle_to_output_dir("keras_tokenizer.pk", "keras sequencer", configuration.keras_tokenizer)
        self._pickle_to_output_dir("configuration.pk", "configuration", configuration)

    def _pickle_to_output_dir(self, file_name: str, description: str, obj):
        """Pickle ``obj`` to ``<output_dir>/<file_name>`` and log the target path."""
        target_file = os.path.join(AppContext.default().output_dir, file_name)
        self.logger.info("Saving %s to %s", description, target_file)
        with open(target_file, 'wb') as writer:
            pickle.dump(obj, writer)

    def reload_configuration(self, file_path: str):
        """Unpickle and return a configuration written by ``save_configuration``.

        SECURITY: ``pickle.load`` executes arbitrary code embedded in the file;
        only pass paths to files this application produced itself.
        """
        self.logger.info("Reloading configuration from %s", file_path)
        with open(file_path, 'rb') as train_conf_reader:
            configuration = pickle.load(train_conf_reader)

        return configuration


class NegativeSampling(LoggingMixin):
    """Factories for functions that draw negative dictionary entries for an ICD10 code."""

    def __init__(self):
        LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)

    def get_strategy_by_name(self, name: str, args: Namespace) -> Callable:
        """Return the sampling function for ``name`` (``"def"`` or ``"ext1"``).

        :raises AssertionError: if ``name`` is not a supported strategy
        """
        # FIXME: Make args to dictionary

        if name == "def":
            return self.default_strategy(args.num_neg_samples)
        elif name == "ext1":
            return self.extended1_strategy(args.num_neg_cha, args.num_neg_sec, args.num_neg_sub, args.num_neg_oth)
        else:
            raise AssertionError("Unsupported negative sampling strategy: " + name)

    def default_strategy(self, num_negative_samples: int) -> Callable:
        """Sample up to ``num_negative_samples`` dictionary rows with a different ICD10 code."""
        def _sample(dictionary_df: DataFrame, line_icd10_code: str):
            negative_samples = dictionary_df.query("ICD10 != '%s'" % line_icd10_code)

            # Only necessary during development and tests with only very few examples
            if len(negative_samples) > 0:
                negative_samples = negative_samples.sample(min(num_negative_samples, len(negative_samples)))

            return negative_samples

        return _sample

    def extended1_strategy(self, num_chapter_samples: int, num_section_samples: int,
                           num_subsection_samples: int, num_other_samples: int) -> Callable:
        """Sample negatives from the same chapter, section and subsection plus other chapters.

        When the three "similar" groups yield fewer rows than requested, the
        shortfall is added to the number of rows drawn from other chapters.
        """
        def _draw(candidates: DataFrame, limit: int) -> DataFrame:
            # Empty frames are returned untouched; sampling is skipped for them
            # (this guard now also covers the "other chapters" group).
            if len(candidates) > 0:
                return candidates.sample(min(limit, len(candidates)))
            return candidates

        def _sample(dictionary_df: DataFrame, icd10_code: str):
            icd10_chapter = icd10_code[0].lower()
            icd10_section = icd10_code[0:2].lower()
            icd10_subsection = icd10_code[0:3].lower()

            chapter_samples = _draw(
                dictionary_df.query("ICD10 != '%s' & ICD10_chapter == '%s'" % (icd10_code, icd10_chapter)),
                num_chapter_samples)
            section_samples = _draw(
                dictionary_df.query("ICD10 != '%s' & ICD10_section == '%s'" % (icd10_code, icd10_section)),
                num_section_samples)
            subsection_samples = _draw(
                dictionary_df.query("ICD10 != '%s' & ICD10_subsection == '%s'" % (icd10_code, icd10_subsection)),
                num_subsection_samples)

            exp_sim_samples = num_chapter_samples + num_section_samples + num_subsection_samples
            act_sim_samples = len(chapter_samples) + len(section_samples) + len(subsection_samples)

            # Top up with rows from other chapters to compensate for missing similar rows.
            other_samples = _draw(
                dictionary_df.query("ICD10 != '%s' & ICD10_chapter != '%s'" % (icd10_code, icd10_chapter)),
                num_other_samples + (exp_sim_samples - act_sim_samples))

            return pd.concat([chapter_samples, section_samples, subsection_samples, other_samples])

        return _sample


# The original line also carried the opening of the next file's patch
# (clef18_task1_emb1.py); it is kept verbatim and continues on the next line:
# diff --git a/code_mario/clef18_task1_emb1.py b/code_mario/clef18_task1_emb1.py index c5b380efebe6722e93c030a15ae323759e2d794e..9f933e7a31d33d76bb642d383b02c854fa7549bd 100644 --- a/code_mario/clef18_task1_emb1.py +++ b/code_mario/clef18_task1_emb1.py @@ -1,55 +1,32 @@ from init import * +from clef18_task1_base import ICD10LabelEncoders, Clef18Task1Base, EvaluationConfiguration, NegativeSampling +from clef18_task1_emb2 import EvaluationResult + import argparse import numpy as np import pandas as pd import keras as k -import
pickle import os -from argparse import Namespace -from gensim.models import FastText from keras import Input, Model -from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, CSVLogger from keras.layers import Bidirectional, Dense, Dot, LSTM, Embedding from keras.preprocessing.sequence import pad_sequences from keras.preprocessing.text import Tokenizer from pandas import DataFrame -from sklearn.dummy import DummyClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import SGDClassifier from sklearn.metrics import f1_score, accuracy_score -from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import Pipeline -from sklearn.preprocessing import LabelEncoder -from sklearn.svm import LinearSVC -from sklearn.tree import DecisionTreeClassifier from tqdm import tqdm from typing import Tuple, Dict, List, Callable -from sklearn.externals import joblib -import ft_embeddings from app_context import AppContext from clef18_task1_data import Clef18Task1Data -from dnn_classifiers import NeuralNetworkClassifiers as nnc from ft_embeddings import FastTextEmbeddings, FastTextModel from preprocessing import DataPreparationUtil as pdu from keras_extension import KerasUtil as ku -from util import LoggingMixin - - -class ICD10LabelEncoders(object): - def __init__(self, chapter_encoder: LabelEncoder, section_encoder: LabelEncoder, - subsection_encoder: LabelEncoder, code_encoder: LabelEncoder): - self.chapter_encoder = chapter_encoder - self.section_encoder = section_encoder - self.subsection_encoder = subsection_encoder - self.code_encoder = code_encoder - -class Configuration(object): +class Emb1Configuration(object): def __init__(self, train_cert_df: DataFrame, val_cert_df: DataFrame, test_cert_df: DataFrame, dict_df: DataFrame, max_cert_length: int, max_dict_length: int, ft_embedding_size: int, label_column: str, @@ -66,21 +43,12 @@ class Configuration(object): 
self.keras_tokenizer = keras_tokenizer -class EvaluationResult(object): - - def __init__(self, target_label: str, classifier_name: str, data_set_name: str, accuracy: float): - self.target_label = target_label - self.classifier_name = classifier_name - self.data_set_name = data_set_name - self.accuracy = accuracy - - -class Clef18Task1Emb1(LoggingMixin): +class Clef18Task1Emb1(Clef18Task1Base): def __init__(self): - LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file) + Clef18Task1Base.__init__(self) - def train_embedding_model(self, config: Configuration, ft_model: FastTextModel, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model: + def train_embedding_model(self, config: Emb1Configuration, ft_model: FastTextModel, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model: self.logger.info("Start building training pairs") train_pair_data = self.build_pairs(config.train_cert_df, config.dict_df, neg_sampling_strategy) self.logger.info("Label distribution:\n%s", train_pair_data["Label"].value_counts()) @@ -142,7 +110,7 @@ class Clef18Task1Emb1(LoggingMixin): return model - def train_and_evaluate_classifiers(self, emb_model: Model, config: Configuration, target_labels: List) -> List[EvaluationResult]: + def train_and_evaluate_classifiers(self, emb_model: Model, config: Emb1Configuration, target_labels: List) -> List[EvaluationResult]: self.logger.info("Start training and evaluation of classifier models") self.logger.info("Building dictionary embeddings") @@ -151,7 +119,7 @@ class Clef18Task1Emb1(LoggingMixin): dict_inputs = pad_sequences(config.dict_df["Token_ids"].values, maxlen=config.max_dict_length) dict_embeddings = dict_rnn.predict(dict_inputs, verbose=1, batch_size=1) - self.logger.info("Building certificate embeddings") + self.logger.info("Building train certificate embeddings") cert_input = emb_model.inputs[0] cert_rnn = Model(inputs=cert_input, 
outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model") @@ -159,142 +127,29 @@ class Clef18Task1Emb1(LoggingMixin): train_cert_embeddings = cert_rnn.predict(train_cert_inputs, verbose=1) self.logger.info("cert train input shape: %s", train_cert_embeddings.shape) - val_cert_inputs = pad_sequences(config.val_cert_df["Token_ids"].values, maxlen=config.max_cert_length) - val_cert_embeddings = cert_rnn.predict(val_cert_inputs, verbose=1) - self.logger.info("cert val input shape: %s", val_cert_embeddings.shape) + self.logger.info("Building val certificate embeddings") + val_inputs = pad_sequences(config.val_cert_df["Token_ids"].values, maxlen=config.max_cert_length) + val_embeddings = cert_rnn.predict(val_inputs, verbose=1) + self.logger.info("cert val input shape: %s", val_embeddings.shape) - test_cert_inputs = pad_sequences(config.test_cert_df["Token_ids"].values, maxlen=config.max_cert_length) - test_cert_embeddings = cert_rnn.predict(test_cert_inputs, verbose=1) - self.logger.info("cert test input shape: %s", test_cert_embeddings.shape) + self.logger.info("Building test certificate embeddings") + test_inputs = pad_sequences(config.test_cert_df["Token_ids"].values, maxlen=config.max_cert_length) + test_embeddings = cert_rnn.predict(test_inputs, verbose=1) + self.logger.info("cert test input shape: %s", test_embeddings.shape) target_label_configs = self.get_label_configuration(target_labels, config.label_encoders) + target_label_columns = [label_column for _, label_column, _ in target_label_configs] - test_sets = [ - #("dict", dict_embeddings, config.dict_df), - ("cert-train", train_cert_embeddings, config.train_cert_df), - ("cert-val", val_cert_embeddings, config.val_cert_df), - ("cert-test", test_cert_embeddings, config.test_cert_df) - ] - - named_classifiers = [ - ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()), - ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")), - ("SGD", 
lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)), - ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)), - ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)), - ("LinearSVM", lambda label, input_dim, output_dim, val_data: LinearSVC(max_iter=10000, verbose=1, random_state=42)), - - ("DNN-200", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], - batch_normalization=False, dropout_rate=0.0, epochs=10, batch_size=2)), - ("DNN-300", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - - ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - - ("DNN-200-100", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - ("DNN-200-200", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], - batch_normalization=False, 
dropout_rate=0.0, epochs=50, batch_size=2)), - ("DNN-300-200", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - - ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - - ('DU1', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="stratified")), - ('DU2', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="most_frequent")) - ] - - num_experiments = len(target_label_configs) * len(test_sets) * len(named_classifiers) - cur_experiment = 1 + train_labels = pd.concat([config.dict_df[target_label_columns], config.train_cert_df[target_label_columns]]) + train_data = np.append(dict_embeddings, train_cert_embeddings, axis=0) input_dim = cert_rnn.output.shape[1].value - models_dir = os.path.join(AppContext.default().output_dir, "models") - os.makedirs(models_dir, exist_ok=True) - - results = [] - for target_label, target_column, label_encoder in target_label_configs: - self.logger.info("Start evaluation experiments with label %s", 
target_label) - output_dim = len(label_encoder.classes_) - - complete_train_data = np.append(dict_embeddings, train_cert_embeddings, axis=0) - complete_train_labels = np.append(config.dict_df[target_column].values, config.train_cert_df[target_column].values, axis=0) - self.logger.info("Build complete training samples (data: %s, labels: %s)", complete_train_data.shape, complete_train_labels.shape) - - for cl_name, classifier_factory in named_classifiers: - self.logger.info("Start training of classifier %s", cl_name) - classifier = classifier_factory(target_label, input_dim, output_dim, val_cert_embeddings) - classifier.fit(complete_train_data, complete_train_labels) - - classifier_file_name = "cl_{}_{}.model".format(cl_name, target_label).lower() - classifier_file = os.path.join(models_dir, classifier_file_name) - try: - joblib.dump(classifier, classifier_file) - except: - self.logger.error("Error while saving classifier %s to %s", cl_name, classifier_file) - - self.logger.info("Start evaluation of %s", cl_name) - for ts_name, inputs, data_frame in test_sets: - gold_labels = data_frame[target_column].values - - self.logger.info("Evaluate data set %s", ts_name) - prediction = classifier.predict(inputs) - acc_score = accuracy_score(gold_labels, prediction) - - self.logger.info("Evaluation result: label=%s | classifier=%s | data_set=%s | acc_score=%s", - target_label, cl_name, ts_name, acc_score) - results.append(EvaluationResult(target_label, cl_name, ts_name, acc_score)) - - self.logger.info("Finished experiment %s out of %s", cur_experiment, num_experiments) - cur_experiment += 1 - - return results - - def get_label_configuration(self, target_labels: List[str], icd10_encoders: ICD10LabelEncoders) -> List: - label_configs = [] - - for target_label in target_labels: - if target_label == "chap" or target_label == "chapter": - label_configs.append((target_label, "ICD10_chapter_encoded", icd10_encoders.chapter_encoder)) - elif target_label == "sect" or target_label == 
"section": - label_configs.append((target_label, "ICD10_section_encoded", icd10_encoders.section_encoder)) - elif target_label == "subs" or target_label == "subsection": - label_configs.append((target_label, "ICD10_subsection_encoded", icd10_encoders.subsection_encoder)) - elif target_label == "code" or target_label == "icd10": - label_configs.append((target_label, "ICD10_encoded", icd10_encoders.code_encoder)) - else: - self.logger.error("Can't create label configuration for label " + target_label) - - return label_configs - - def split_train_test(self, certificate_df: DataFrame, train_size: float, - stratified_splits: bool, label_column: str) -> Tuple[DataFrame, DataFrame]: - if stratified_splits: - self.logger.info("Creating stratified splits for column %s", label_column) - training_data, test_data = train_test_split(certificate_df, train_size=train_size, stratify=certificate_df[label_column]) - else: - self.logger.info("Creating non-stratified splits") - training_data, test_data = train_test_split(certificate_df, train_size=train_size) + eval_configuration = EvaluationConfiguration(target_labels, config.label_encoders, input_dim, train_data, train_labels, + val_embeddings, config.val_cert_df, test_embeddings, config.test_cert_df) + return self.run_evaluation(eval_configuration) - return training_data, test_data + # --------------------------------------------------------------------------------------------------------------------------------------- def build_embedding_model(self, word_index: Dict, ft_model: FastTextModel, max_cert_length: int, max_dict_length: int): # TODO: Make hyper-parameter configurable! 
@@ -334,7 +189,7 @@ class Clef18Task1Emb1(LoggingMixin): return model def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastTextModel, train_ratio: float, val_ratio: float, - strat_column: str, samples: int=None, stratified_splits: bool=False) -> Configuration: + strat_column: str, samples: int=None, stratified_splits: bool=False) -> Emb1Configuration: if samples: self.logger.info("Sampling %s instances", samples) @@ -362,35 +217,8 @@ class Clef18Task1Emb1(LoggingMixin): self.logger.info("Start preparation of dictionary data (%s instances)", len(dict_df)) dict_df, max_dict_length = self.prepare_dictionary_df(dict_df, "train", label_encoders, keras_tokenizer) - return Configuration(train_cert_df, val_cert_df, test_cert_df, dict_df, max_cert_length, max_dict_length, - ft_model.vector_size, strat_column, label_encoders, keras_tokenizer) - - def prepare_label_encoders(self, dict_df: DataFrame, cert_df: DataFrame) -> ICD10LabelEncoders: - self.logger.info("Fitting label encoder to ICD10 codes") - icd10_code_encoder = LabelEncoder() - icd10_code_encoder.fit(list([icd10.strip() for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip() for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 codes within the data set", len(icd10_code_encoder.classes_)) - - self.logger.info("Fitting label encoder to ICD10 chapters") - icd10_chapter_encoder = LabelEncoder() - icd10_chapter_encoder.fit(list([icd10.strip().lower()[0] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0] for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 chapters within the data set", len(icd10_chapter_encoder.classes_)) - - self.logger.info("Fitting label encoder to ICD10 section") - icd10_section_encoder = LabelEncoder() - icd10_section_encoder.fit(list([icd10.strip().lower()[0:2] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0:2] for icd10 in cert_df["ICD10"].values])) - 
self.logger.info("Found %s distinct ICD10 sections within the data set", len(icd10_section_encoder.classes_)) - - self.logger.info("Fitting label encoder to ICD10 subsection") - icd10_subsection_encoder = LabelEncoder() - icd10_subsection_encoder.fit(list([icd10.strip().lower()[0:3] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0:3] for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 subsections within the data set", len(icd10_subsection_encoder.classes_)) - - return ICD10LabelEncoders(icd10_chapter_encoder, icd10_section_encoder, icd10_subsection_encoder, icd10_code_encoder) + return Emb1Configuration(train_cert_df, val_cert_df, test_cert_df, dict_df, max_cert_length, max_dict_length, + ft_model.vector_size, strat_column, label_encoders, keras_tokenizer) def prepare_certificate_df(self, certificate_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders, keras_tokenizer: Tokenizer) -> Tuple[DataFrame, int]: @@ -496,139 +324,6 @@ class Clef18Task1Emb1(LoggingMixin): return data_matrix - def save_evaluation_results(self, eval_results: List[EvaluationResult]): - result_configurations = [ - ("results.csv", None), - ("results_by_classifier.csv", lambda result: result.classifier_name), - ("results_by_data_set.csv", lambda result: result.data_set_name), - ("results_by_label.csv", lambda result: result.target_label) - ] - - for file_name, sort_key in result_configurations: - results_file = os.path.join(AppContext.default().output_dir, file_name) - with open(results_file, "w", encoding="utf8") as result_writer: - if sort_key: - eval_results = sorted(eval_results, key=sort_key) - - for r in eval_results: - result_writer.write("%s\t%s\t%s\t%s\n" % (r.target_label, r.classifier_name, r.data_set_name, r.accuracy)) - result_writer.close() - - def save_arguments(self, arguments: Namespace): - arguments_file = os.path.join(AppContext.default().log_dir, "arguments.txt") - self.logger.info("Saving arguments to " + 
arguments_file) - - with open(arguments_file, 'w', encoding="utf8") as writer: - for key, value in arguments.__dict__.items(): - writer.write("%s=%s\n" % (str(key), str(value))) - writer.close() - - def save_configuration(self, configuration: Configuration): - label_encoder_file = os.path.join(AppContext.default().output_dir, "label_encoder.pk") - self.logger.info("Saving label encoder to " + label_encoder_file) - with open(label_encoder_file, 'wb') as encoder_writer: - pickle.dump(configuration.label_encoders, encoder_writer) - encoder_writer.close() - - keras_tokenizer_file = os.path.join(AppContext.default().output_dir, "keras_tokenizer.pk") - self.logger.info("Saving keras sequencer to " + keras_tokenizer_file) - with open(keras_tokenizer_file, 'wb') as keras_sequencer_writer: - pickle.dump(configuration.keras_tokenizer, keras_sequencer_writer) - keras_sequencer_writer.close() - - configuration_file = os.path.join(AppContext.default().output_dir, "configuration.pk") - self.logger.info("Saving configuration to " + configuration_file) - with open(configuration_file, 'wb') as train_conf_writer: - pickle.dump(configuration, train_conf_writer) - train_conf_writer.close() - - def reload_configuration(self, file_path: str): - self.logger.info("Reloading configuration from " + file_path) - with open(args.train_conf, 'rb') as train_conf_reader: - configuration = pickle.load(train_conf_reader) - train_conf_reader.close() - - return configuration - - def reload_embedding_model(self, emb_model_file: str): - self.logger.info("Reloading embedding model from " + emb_model_file) - return k.models.load_model(args.emb_model) - - def create_dnn_classifier(self, model_name, label: str, val_data: Tuple, **kwargs): - if val_data is not None: - monitor_loss = "val_loss" - else: - monitor_loss = "loss" - - callbacks = [ - ku.best_model_checkpointing_by_model_name(model_name), - ku.csv_logging_callback(model_name, label), - ku.early_stopping(monitor_loss, 5) - ] - - kwargs["callbacks"] 
= callbacks - return nnc.dense_network(**kwargs) - - -class NegativeSampling(LoggingMixin): - - def __init__(self): - LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file) - - def get_strategy_by_name(self, name: str, args: Namespace) -> Callable: - #FIXME: Make args to dictionary - - if name == "def": - return self.default_strategy(args.num_neg_samples) - elif name == "ext1": - return self.extended1_strategy(args.num_neg_cha, args.num_neg_sec, args.num_neg_sub, args.num_neg_oth) - else: - raise AssertionError("Unsupported negative sampling strategy: " + name) - - def default_strategy(self, num_negative_samples: int) -> Callable: - def _sample(dictionary_df: DataFrame, line_icd10_code: str): - negative_samples = dictionary_df.query("ICD10 != '%s'" % line_icd10_code) - - # Only necessary during development and tests with only very few examples - if len(negative_samples) > 0: - negative_samples = negative_samples.sample(min(num_negative_samples, len(negative_samples))) - - return negative_samples - - return _sample - - def extended1_strategy(self, num_chapter_samples, num_section_samples, num_subsection_samples, num_other_samples): - def _sample(dictionary_df: DataFrame, icd10_code: str): - icd10_chapter = icd10_code[0].lower() - icd10_section = icd10_code[0:2].lower() - icd10_subsection = icd10_code[0:3].lower() - - chapter_samples = dictionary_df.query("ICD10 != '%s' & ICD10_chapter == '%s'" % (icd10_code, icd10_chapter)) - if len(chapter_samples) > 0: - chapter_samples = chapter_samples.sample(min(num_chapter_samples, len(chapter_samples))) - - section_samples = dictionary_df.query("ICD10 != '%s' & ICD10_section == '%s'" % (icd10_code, icd10_section)) - if len(section_samples) > 0: - section_samples = section_samples.sample(min(num_section_samples, len(section_samples))) - - subsection_samples = dictionary_df.query("ICD10 != '%s' & ICD10_subsection == '%s'" % (icd10_code, icd10_subsection)) - if len(subsection_samples) > 0: - 
subsection_samples = subsection_samples.sample(min(num_subsection_samples, len(subsection_samples))) - - exp_sim_samples = num_chapter_samples + num_section_samples + num_subsection_samples - act_sim_samples = len(chapter_samples) + len(section_samples) + len(subsection_samples) - - other_samples = dictionary_df.query("ICD10 != '%s' & ICD10_chapter != '%s'" % (icd10_code, icd10_chapter)) - other_samples = other_samples.sample(min(num_other_samples + (exp_sim_samples - act_sim_samples), len(other_samples))) - - # print("#Chapter samples: ", len(chapter_samples)) - # print("#Section samples: ", len(section_samples)) - # print("#Subsection samples: ", len(subsection_samples)) - # print("#Other samples: ", len(other_samples)) - - return pd.concat([chapter_samples, section_samples, subsection_samples, other_samples]) - return _sample - if __name__ == "__main__": parser = argparse.ArgumentParser(prog="CLEF2018") @@ -648,10 +343,10 @@ if __name__ == "__main__": train_emb_parser.add_argument("--neg_sampling", help="Negative sampling strategy to use", default="ext1", choices=["def", "ext1"]) train_emb_parser.add_argument("--num_neg_samples", help="Number of negative samples to use (default strategy)", default=75, type=int) - train_emb_parser.add_argument("--num_neg_cha", help="Number of negative chapter samples to use (ext1 strategy)", default=10, type=int) - train_emb_parser.add_argument("--num_neg_sec", help="Number of negative section samples to use (ext1 strategy)", default=10, type=int) - train_emb_parser.add_argument("--num_neg_sub", help="Number of negative subsection samples to use (ext1 strategy)", default=10, type=int) - train_emb_parser.add_argument("--num_neg_oth", help="Number of negative other samples to use (ext1 strategy)", default=40, type=int) + train_emb_parser.add_argument("--num_neg_cha", help="Number of negative chapter samples to use (ext1 strategy)", default=20, type=int) + train_emb_parser.add_argument("--num_neg_sec", help="Number of negative 
section samples to use (ext1 strategy)", default=20, type=int) + train_emb_parser.add_argument("--num_neg_sub", help="Number of negative subsection samples to use (ext1 strategy)", default=20, type=int) + train_emb_parser.add_argument("--num_neg_oth", help="Number of negative other samples to use (ext1 strategy)", default=45, type=int) eval_classifier_parser = subparsers.add_parser("eval-cl") eval_classifier_parser.add_argument("emb_model", help="Path to the embedding model to use") diff --git a/code_mario/clef18_task1_emb2.py b/code_mario/clef18_task1_emb2.py index 1eeb7d8e46ece374856cefebda986fa6987eaf0f..f4ba77fd8a9d9862123b920288188f5ae194fd90 100644 --- a/code_mario/clef18_task1_emb2.py +++ b/code_mario/clef18_task1_emb2.py @@ -1,42 +1,28 @@ from init import * +from clef18_task1_base import Clef18Task1Base, EvaluationConfiguration, NegativeSampling import argparse import numpy as np import pandas as pd import keras as k -import pickle import os -from argparse import Namespace -from gensim.models import FastText from keras import Input, Model -from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, CSVLogger from keras.layers import Bidirectional, Dense, Dot, LSTM, Embedding, GlobalMaxPool1D from keras.preprocessing.sequence import pad_sequences from keras.preprocessing.text import Tokenizer from pandas import DataFrame -from sklearn.dummy import DummyClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import SGDClassifier from sklearn.metrics import f1_score, accuracy_score -from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import Pipeline from sklearn.preprocessing import LabelEncoder -from sklearn.svm import LinearSVC -from sklearn.tree import DecisionTreeClassifier from tqdm import tqdm from typing import Tuple, Dict, List, Callable -from sklearn.externals import joblib -import ft_embeddings from app_context import AppContext 
from clef18_task1_data import Clef18Task1Data -from dnn_classifiers import NeuralNetworkClassifiers as nnc from ft_embeddings import FastTextEmbeddings, FastTextModel from preprocessing import DataPreparationUtil as pdu from keras_extension import KerasUtil as ku -from util import LoggingMixin class ICD10LabelEncoders(object): @@ -72,10 +58,10 @@ class EvaluationResult(object): self.accuracy = accuracy -class Clef18Task1Emb2(LoggingMixin): +class Clef18Task1Emb2(Clef18Task1Base): def __init__(self): - LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file) + Clef18Task1Base.__init__(self) def train_embedding_model(self, config: Configuration, ft_model: FastTextModel, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model: self.logger.info("Start building training pairs") @@ -159,132 +145,15 @@ class Clef18Task1Emb2(LoggingMixin): target_label_configs = self.get_label_configuration(target_labels, config.label_encoders) - test_sets = [ - #("dict", dict_embeddings, config.dict_df), - ("cert-train", train_embeddings, config.train_df), - ("cert-val", val_embeddings, config.val_df), - ("cert-test", test_embeddings, config.test_df) - ] - - named_classifiers = [ - ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()), - ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")), - ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)), - ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)), - ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)), - ("LinearSVM", lambda label, input_dim, output_dim, val_data: LinearSVC(max_iter=10000, verbose=1, random_state=42)), - - ("DNN-200", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, 
hidden_layer_sizes=[200], - batch_normalization=False, dropout_rate=0.0, epochs=10, batch_size=2)), - ("DNN-300", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - - ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - - ("DNN-200-100", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - ("DNN-200-200", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - ("DNN-300-200", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200], - batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)), - - ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100], - batch_normalization=True, dropout_rate=0.5, 
epochs=50, batch_size=2)), - ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data: - self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200], - batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)), - - ('DU1', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="stratified")), - ('DU2', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="most_frequent")) - ] - - num_experiments = len(target_label_configs) * len(test_sets) * len(named_classifiers) - cur_experiment = 1 - input_dim = text_rnn.output.shape[1].value - models_dir = os.path.join(AppContext.default().output_dir, "models") - os.makedirs(models_dir, exist_ok=True) - - results = [] - for target_label, target_column, label_encoder in target_label_configs: - self.logger.info("Start evaluation experiments with label %s", target_label) - output_dim = len(label_encoder.classes_) - - for cl_name, classifier_factory in named_classifiers: - self.logger.info("Start training of classifier %s", cl_name) - classifier = classifier_factory(target_label, input_dim, output_dim, val_embeddings) - classifier.fit(train_embeddings, config.train_df[target_column].values) - - classifier_file_name = "cl_{}_{}.model".format(cl_name, target_label).lower() - classifier_file = os.path.join(models_dir, classifier_file_name) - try: - joblib.dump(classifier, classifier_file) - except: - self.logger.error("Error while saving classifier %s to %s", cl_name, classifier_file) - - self.logger.info("Start evaluation of %s", cl_name) - for ts_name, inputs, data_frame in test_sets: - gold_labels = 
data_frame[target_column].values - - self.logger.info("Evaluate data set %s", ts_name) - prediction = classifier.predict(inputs) - acc_score = accuracy_score(gold_labels, prediction) - - self.logger.info("Evaluation result: label=%s | classifier=%s | data_set=%s | acc_score=%s", - target_label, cl_name, ts_name, acc_score) - results.append(EvaluationResult(target_label, cl_name, ts_name, acc_score)) - - self.logger.info("Finished experiment %s out of %s", cur_experiment, num_experiments) - cur_experiment += 1 - - return results - - def get_label_configuration(self, target_labels: List[str], icd10_encoders: ICD10LabelEncoders) -> List: - label_configs = [] - - for target_label in target_labels: - if target_label == "chap" or target_label == "chapter": - label_configs.append((target_label, "ICD10_chapter_encoded", icd10_encoders.chapter_encoder)) - elif target_label == "sect" or target_label == "section": - label_configs.append((target_label, "ICD10_section_encoded", icd10_encoders.section_encoder)) - elif target_label == "subs" or target_label == "subsection": - label_configs.append((target_label, "ICD10_subsection_encoded", icd10_encoders.subsection_encoder)) - elif target_label == "code" or target_label == "icd10": - label_configs.append((target_label, "ICD10_encoded", icd10_encoders.code_encoder)) - else: - self.logger.error("Can't create label configuration for label " + target_label) - - return label_configs - - def split_train_test(self, certificate_df: DataFrame, train_size: float, - stratified_splits: bool, label_column: str) -> Tuple[DataFrame, DataFrame]: - if stratified_splits: - self.logger.info("Creating stratified splits for column %s", label_column) - training_data, test_data = train_test_split(certificate_df, train_size=train_size, stratify=certificate_df[label_column]) - else: - self.logger.info("Creating non-stratified splits") - training_data, test_data = train_test_split(certificate_df, train_size=train_size) - - return training_data, test_data + 
eval_config = EvaluationConfiguration(target_labels, config.label_encoders, input_dim, train_embeddings, config.train_df, + val_embeddings, config.val_df, test_embeddings, config.test_df) + return self.run_evaluation(eval_config) def build_embedding_model(self, word_index: Dict, ft_model: FastTextModel, conf: Configuration): # TODO: Make hyper-parameter configurable! - # TODO: Think about using CNNs instead of RNNs since we can have multiple ICD-10 per line! + # TODO: Think about using CNNs instead of RNNs! embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size)) for word, i in word_index.items(): @@ -359,33 +228,6 @@ class Clef18Task1Emb2(LoggingMixin): return Configuration(train_df, val_df, test_df, max_length, ft_model.vector_size, strat_column, label_encoders, keras_tokenizer) - def prepare_label_encoders(self, dict_df: DataFrame, cert_df: DataFrame) -> ICD10LabelEncoders: - self.logger.info("Fitting label encoder to ICD10 codes") - icd10_code_encoder = LabelEncoder() - icd10_code_encoder.fit(list([icd10.strip() for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip() for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 codes within the data set", len(icd10_code_encoder.classes_)) - - self.logger.info("Fitting label encoder to ICD10 chapters") - icd10_chapter_encoder = LabelEncoder() - icd10_chapter_encoder.fit(list([icd10.strip().lower()[0] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0] for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 chapters within the data set", len(icd10_chapter_encoder.classes_)) - - self.logger.info("Fitting label encoder to ICD10 section") - icd10_section_encoder = LabelEncoder() - icd10_section_encoder.fit(list([icd10.strip().lower()[0:2] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0:2] for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 sections within the data 
set", len(icd10_section_encoder.classes_)) - - self.logger.info("Fitting label encoder to ICD10 subsection") - icd10_subsection_encoder = LabelEncoder() - icd10_subsection_encoder.fit(list([icd10.strip().lower()[0:3] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0:3] for icd10 in cert_df["ICD10"].values])) - self.logger.info("Found %s distinct ICD10 subsections within the data set", len(icd10_subsection_encoder.classes_)) - - return ICD10LabelEncoders(icd10_chapter_encoder, icd10_section_encoder, icd10_subsection_encoder, icd10_code_encoder) - def prepare_cert_dict_df(self, cert_dict_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders, keras_tokenizer: Tokenizer) -> Tuple[DataFrame, int]: pipeline = Pipeline([ @@ -419,9 +261,7 @@ class Clef18Task1Emb2(LoggingMixin): return data_prepared, max_length def build_pairs(self, data_df: DataFrame, neg_sampling_strategy: Callable): - # FIXME: This can be implemented more efficiently! - # FIXME: Improve sampling of negative instances (especially if code is I-XXX sample other codes of the same class (e.g. I-YYY) - # FIXME: Use negative sample ratio (depending on true dictionary entries), e.g. 0.5 or 1.2 + # TODO: Think about to use a negative sample ratio (depending on true dictionary entries), e.g. 
0.5 or 1.2 text_vectors = [] icd10_codes = [] @@ -445,139 +285,6 @@ class Clef18Task1Emb2(LoggingMixin): data = {"Cert_input": text_vectors, "ICD10_input": icd10_codes, "Label": labels} return pd.DataFrame(data) - def save_evaluation_results(self, eval_results: List[EvaluationResult]): - result_configurations = [ - ("results.csv", None), - ("results_by_classifier.csv", lambda result: result.classifier_name), - ("results_by_data_set.csv", lambda result: result.data_set_name), - ("results_by_label.csv", lambda result: result.target_label) - ] - - for file_name, sort_key in result_configurations: - results_file = os.path.join(AppContext.default().output_dir, file_name) - with open(results_file, "w", encoding="utf8") as result_writer: - if sort_key: - eval_results = sorted(eval_results, key=sort_key) - - for r in eval_results: - result_writer.write("%s\t%s\t%s\t%s\n" % (r.target_label, r.classifier_name, r.data_set_name, r.accuracy)) - result_writer.close() - - def save_arguments(self, arguments: Namespace): - arguments_file = os.path.join(AppContext.default().log_dir, "arguments.txt") - self.logger.info("Saving arguments to " + arguments_file) - - with open(arguments_file, 'w', encoding="utf8") as writer: - for key, value in arguments.__dict__.items(): - writer.write("%s=%s\n" % (str(key), str(value))) - writer.close() - - def save_configuration(self, configuration: Configuration): - label_encoder_file = os.path.join(AppContext.default().output_dir, "label_encoder.pk") - self.logger.info("Saving label encoder to " + label_encoder_file) - with open(label_encoder_file, 'wb') as encoder_writer: - pickle.dump(configuration.label_encoders, encoder_writer) - encoder_writer.close() - - keras_tokenizer_file = os.path.join(AppContext.default().output_dir, "keras_tokenizer.pk") - self.logger.info("Saving keras sequencer to " + keras_tokenizer_file) - with open(keras_tokenizer_file, 'wb') as keras_sequencer_writer: - pickle.dump(configuration.keras_tokenizer, 
keras_sequencer_writer) - keras_sequencer_writer.close() - - configuration_file = os.path.join(AppContext.default().output_dir, "configuration.pk") - self.logger.info("Saving configuration to " + configuration_file) - with open(configuration_file, 'wb') as train_conf_writer: - pickle.dump(configuration, train_conf_writer) - train_conf_writer.close() - - def reload_configuration(self, file_path: str): - self.logger.info("Reloading configuration from " + file_path) - with open(args.train_conf, 'rb') as train_conf_reader: - configuration = pickle.load(train_conf_reader) - train_conf_reader.close() - - return configuration - - def reload_embedding_model(self, emb_model_file: str): - self.logger.info("Reloading embedding model from " + emb_model_file) - return k.models.load_model(args.emb_model) - - def create_dnn_classifier(self, model_name, label: str, val_data: Tuple, **kwargs): - if val_data is not None: - monitor_loss = "val_loss" - else: - monitor_loss = "loss" - - callbacks = [ - ku.best_model_checkpointing_by_model_name(model_name), - ku.csv_logging_callback(model_name, label), - ku.early_stopping(monitor_loss, 5) - ] - - kwargs["callbacks"] = callbacks - return nnc.dense_network(**kwargs) - - -class NegativeSampling(LoggingMixin): - - def __init__(self): - LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file) - - def get_strategy_by_name(self, name: str, args: Namespace) -> Callable: - #FIXME: Make args to dictionary - - if name == "def": - return self.default_strategy(args.num_neg_samples) - elif name == "ext1": - return self.extended1_strategy(args.num_neg_cha, args.num_neg_sec, args.num_neg_sub, args.num_neg_oth) - else: - raise AssertionError("Unsupported negative sampling strategy: " + name) - - def default_strategy(self, num_negative_samples: int) -> Callable: - def _sample(dictionary_df: DataFrame, line_icd10_code: str): - negative_samples = dictionary_df.query("ICD10 != '%s'" % line_icd10_code) - - # Only necessary 
during development and tests with only very few examples - if len(negative_samples) > 0: - negative_samples = negative_samples.sample(min(num_negative_samples, len(negative_samples))) - - return negative_samples - - return _sample - - def extended1_strategy(self, num_chapter_samples, num_section_samples, num_subsection_samples, num_other_samples): - def _sample(dictionary_df: DataFrame, icd10_code: str): - icd10_chapter = icd10_code[0].lower() - icd10_section = icd10_code[0:2].lower() - icd10_subsection = icd10_code[0:3].lower() - - chapter_samples = dictionary_df.query("ICD10 != '%s' & ICD10_chapter == '%s'" % (icd10_code, icd10_chapter)) - if len(chapter_samples) > 0: - chapter_samples = chapter_samples.sample(min(num_chapter_samples, len(chapter_samples))) - - section_samples = dictionary_df.query("ICD10 != '%s' & ICD10_section == '%s'" % (icd10_code, icd10_section)) - if len(section_samples) > 0: - section_samples = section_samples.sample(min(num_section_samples, len(section_samples))) - - subsection_samples = dictionary_df.query("ICD10 != '%s' & ICD10_subsection == '%s'" % (icd10_code, icd10_subsection)) - if len(subsection_samples) > 0: - subsection_samples = subsection_samples.sample(min(num_subsection_samples, len(subsection_samples))) - - exp_sim_samples = num_chapter_samples + num_section_samples + num_subsection_samples - act_sim_samples = len(chapter_samples) + len(section_samples) + len(subsection_samples) - - other_samples = dictionary_df.query("ICD10 != '%s' & ICD10_chapter != '%s'" % (icd10_code, icd10_chapter)) - other_samples = other_samples.sample(min(num_other_samples + (exp_sim_samples - act_sim_samples), len(other_samples))) - - # print("#Chapter samples: ", len(chapter_samples)) - # print("#Section samples: ", len(section_samples)) - # print("#Subsection samples: ", len(subsection_samples)) - # print("#Other samples: ", len(other_samples)) - - return pd.concat([chapter_samples, section_samples, subsection_samples, other_samples]) - return 
_sample - if __name__ == "__main__": parser = argparse.ArgumentParser(prog="CLEF2018") @@ -651,5 +358,3 @@ if __name__ == "__main__": eval_result = clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels) clef18_task1.save_evaluation_results(eval_result) - -