diff --git a/.gitignore b/.gitignore
index 36f0c569634108fe7569127b27ab301f60810a40..be1159776fc2acb9e33eeb00652aa2dd2d344db8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
-.idea/
+*.idea/
 **/_env
+**/_logs
 **/embeddings
 *.pyc
diff --git a/code_mario/clef18_task1_v2.py b/code_mario/clef18_task1_v2.py
index 253d2fe3db2c254ac5db47d6becee1c4e84f4f00..5ddb39766226d44bf1b6b8b433720c7f77717083 100644
--- a/code_mario/clef18_task1_v2.py
+++ b/code_mario/clef18_task1_v2.py
@@ -1,6 +1,8 @@
 import argparse
 import numpy as np
 import pandas as pd
+import keras as k
+import pickle
 import os
 
 from gensim.models import FastText
@@ -22,7 +24,9 @@ from sklearn.svm import LinearSVC
 from sklearn.tree import DecisionTreeClassifier
 from tqdm import tqdm
 from typing import Tuple, Dict, List
+from sklearn.externals import joblib
 
+from ft_embeddings import FastTextEmbeddings
 from app_context import AppContext
 from clef18_task1_data import Clef18Task1Data
 from dnn_classifiers import NeuralNetworkClassifiers as nnc
@@ -32,25 +36,37 @@ from keras_extension import KerasUtil as ku
 from util import LoggingMixin
 
 
-class TrainingConfiguration(object):
+class ICD10LabelEncoders(object):
 
-    def __init__(self, train_cert_df: DataFrame, val_cert_df: DataFrame, dict_df: DataFrame,
+    def __init__(self, chapter_encoder: LabelEncoder, section_encoder: LabelEncoder,
+                 subsection_encoder: LabelEncoder, code_encoder: LabelEncoder):
+        self.chapter_encoder = chapter_encoder
+        self.section_encoder = section_encoder
+        self.subsection_encoder = subsection_encoder
+        self.code_encoder = code_encoder
+
+
+class Configuration(object):
+
+    def __init__(self, train_cert_df: DataFrame, val_cert_df: DataFrame, test_cert_df: DataFrame, dict_df: DataFrame,
                  max_cert_length: int, max_dict_length: int, ft_embedding_size: int, label_column: str,
-                 label_encoder: LabelEncoder, keras_tokenizer: Tokenizer):
+                 label_encoders: ICD10LabelEncoders, keras_tokenizer: Tokenizer):
         self.train_cert_df = train_cert_df
         self.val_cert_df = val_cert_df
+        self.test_cert_df = test_cert_df
         self.dict_df = dict_df
         self.max_cert_length = max_cert_length
         self.max_dict_length = max_dict_length
         self.ft_embedding_size = ft_embedding_size
         self.label_column = label_column
-        self.label_encoder = label_encoder
+        self.label_encoders = label_encoders
         self.keras_tokenizer = keras_tokenizer
 
 
 class EvaluationResult(object):
 
-    def __init__(self, classifier_name: str, data_set_name: str, accuracy: float):
+    def __init__(self, target_label: str, classifier_name: str, data_set_name: str, accuracy: float):
+        self.target_label = target_label
         self.classifier_name = classifier_name
         self.data_set_name = data_set_name
         self.accuracy = accuracy
@@ -61,138 +77,201 @@ class Clef18Task1V2(LoggingMixin):
 
     def __init__(self):
         LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)
 
-    def train_embedding_model(self, train_conf: TrainingConfiguration, ft_model: FastText,
+    def train_embedding_model(self, config: Configuration, ft_model: FastText,
                               neg_samples: int, epochs: int, batch_size: int) -> Model:
         self.logger.info("Start building training pairs")
-        train_pair_data = self.build_pairs(train_conf.train_cert_df, train_conf.dict_df, neg_samples)
+        train_pair_data = self.build_pairs(config.train_cert_df, config.dict_df, neg_samples)
         self.logger.info("Label distribution:\n%s", train_pair_data["Label"].value_counts())
 
         self.logger.info("Start building embedding model")
-        model = self.build_embedding_model(train_conf.keras_tokenizer.word_index, ft_model,
-                                           train_conf.max_cert_length, train_conf.max_dict_length)
+        model = self.build_embedding_model(config.keras_tokenizer.word_index, ft_model,
+                                           config.max_cert_length, config.max_dict_length)
+        model.summary(print_fn=self.logger.info)
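build_embedding_model itself is outside the hunks of this diff. For orientation, here is a minimal sketch of the siamese matching model that the surrounding code implies: two padded token-id inputs, a fastText-initialised embedding, recurrent encoders named "cert_rnn" and "dict_rnn", and a sigmoid match probability. The GRU type, the layer width of 200 and the compile settings are assumptions, not the repository's actual implementation.

import numpy as np
from gensim.models import FastText
from keras.layers import Input, Embedding, GRU, Dense, concatenate
from keras.models import Model

def build_embedding_model_sketch(word_index, ft_model: FastText,
                                 max_cert_length: int, max_dict_length: int) -> Model:
    # Embedding matrix initialised from the fastText vectors; row 0 is the padding index
    embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size))
    for word, index in word_index.items():
        embedding_matrix[index] = ft_model.wv[word]  # fastText covers OOV words via n-grams

    cert_input = Input(shape=(max_cert_length,), name="cert_input")
    dict_input = Input(shape=(max_dict_length,), name="dict_input")

    embedding = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                          weights=[embedding_matrix], trainable=False)

    # The layer names matter: train_and_evaluate_classifiers retrieves them via get_layer()
    cert_rnn = GRU(200, name="cert_rnn")(embedding(cert_input))  # 200 units is an assumption
    dict_rnn = GRU(200, name="dict_rnn")(embedding(dict_input))

    # Binary decision: does this certificate line match this dictionary entry?
    output = Dense(1, activation="sigmoid")(concatenate([cert_rnn, dict_rnn]))

    model = Model(inputs=[cert_input, dict_input], outputs=output)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

Freezing the embedding keeps the fastText vectors intact, so only the two encoders learn how to compose them into line-level representations.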
-        cert_inputs = pad_sequences(train_pair_data["Cert_input"].values, maxlen=train_conf.max_cert_length)
-        dict_inputs = pad_sequences(train_pair_data["Dict_input"].values, maxlen=train_conf.max_dict_length)
+        cert_inputs = pad_sequences(train_pair_data["Cert_input"].values, maxlen=config.max_cert_length)
+        dict_inputs = pad_sequences(train_pair_data["Dict_input"].values, maxlen=config.max_dict_length)
         labels = train_pair_data["Label"].values
 
         self.logger.info("Start training of embedding model")
-        model.fit([cert_inputs, dict_inputs], labels, epochs=epochs, batch_size=batch_size)
+        best_model_file = os.path.join(AppContext.default().output_dir, "embedding_model_best.h5")
 
-        model_file = os.path.join(AppContext.default().output_dir, "model.h5")
+        if config.val_cert_df is not None and len(config.val_cert_df) > 0:
+            self.logger.info("Start creation of validation pairs")
+            val_pair_data = self.build_pairs(config.val_cert_df, config.dict_df, neg_samples)
+
+            val_cert_inputs = pad_sequences(val_pair_data["Cert_input"].values, maxlen=config.max_cert_length)
+            val_dict_inputs = pad_sequences(val_pair_data["Dict_input"].values, maxlen=config.max_dict_length)
+            val_gold_labels = val_pair_data["Label"].values
+
+            model.fit([cert_inputs, dict_inputs], labels, epochs=epochs, batch_size=batch_size,
+                      validation_data=([val_cert_inputs, val_dict_inputs], val_gold_labels),
+                      callbacks=[ku.best_model_checkpointing_by_file_path(best_model_file, "val_loss")])
+        else:
+            model.fit([cert_inputs, dict_inputs], labels, epochs=epochs, batch_size=batch_size,
+                      callbacks=[ku.best_model_checkpointing_by_file_path(best_model_file)])
 
-        self.logger.info("Saving model to %s", model_file)
+        model_file = os.path.join(AppContext.default().output_dir, "embedding_model_last.h5")
+        self.logger.info("Saving last model to %s", model_file)
         model.save(model_file)
 
+        self.logger.info("Reloading best embedding model from %s", best_model_file)
+        model = k.models.load_model(best_model_file)
+
         ## ----------------------------------------------------------------------------------------------------------
 
-        if train_conf.val_cert_df is not None and len(train_conf.val_cert_df) > 0:
+        if config.test_cert_df is not None and len(config.test_cert_df) > 0:
             self.logger.info("Start evaluation of embedding model!")
 
             self.logger.info("Start creation of test pairs")
-            test_pair_data = self.build_pairs(train_conf.val_cert_df, train_conf.dict_df, neg_samples)
+            test_pair_data = self.build_pairs(config.test_cert_df, config.dict_df, neg_samples)
 
-            test_cert_inputs = pad_sequences(test_pair_data["Cert_input"].values, maxlen=train_conf.max_cert_length)
-            test_dict_inputs = pad_sequences(test_pair_data["Dict_input"].values, maxlen=train_conf.max_dict_length)
-            gold_labels = test_pair_data["Label"].values
+            test_cert_inputs = pad_sequences(test_pair_data["Cert_input"].values, maxlen=config.max_cert_length)
+            test_dict_inputs = pad_sequences(test_pair_data["Dict_input"].values, maxlen=config.max_dict_length)
+            test_gold_labels = test_pair_data["Label"].values
 
             self.logger.info("Start prediction of test labels")
             pred_labels = model.predict([test_cert_inputs, test_dict_inputs], verbose=1)
             pred_labels = (pred_labels > 0.5).astype(float)
 
-            f1_value = f1_score(gold_labels, pred_labels)
-            acc_value = accuracy_score(gold_labels, pred_labels)
+            f1_value = 
f1_score(test_gold_labels, pred_labels) + acc_value = accuracy_score(test_gold_labels, pred_labels) self.logger.info("Result: f1_score= %s | acc_score= %s", f1_value, acc_value) return model - def train_and_evaluate_classifiers(self, emb_model: Model, data_set: TrainingConfiguration) -> List[EvaluationResult]: + def train_and_evaluate_classifiers(self, emb_model: Model, config: Configuration, target_labels: List) -> List[EvaluationResult]: self.logger.info("Start training and evaluation of classifier models") - label_column = "ICD10_chapter_encoded" self.logger.info("Building dictionary embeddings") dict_input = emb_model.inputs[1] dict_rnn = Model(inputs=dict_input, outputs=emb_model.get_layer("dict_rnn").output, name="Dict-RNN-Model") - dict_inputs = pad_sequences(data_set.dict_df["Token_ids"].values, maxlen=data_set.max_dict_length) - dict_embeddings = dict_rnn.predict(dict_inputs, verbose=1) + dict_inputs = pad_sequences(config.dict_df["Token_ids"].values, maxlen=config.max_dict_length) + dict_embeddings = dict_rnn.predict(dict_inputs, verbose=1, batch_size=1) self.logger.info("Building certificate embeddings") cert_input = emb_model.inputs[0] cert_rnn = Model(inputs=cert_input, outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model") - train_cert_inputs = pad_sequences(data_set.train_cert_df["Token_ids"].values, maxlen=data_set.max_cert_length) + train_cert_inputs = pad_sequences(config.train_cert_df["Token_ids"].values, maxlen=config.max_cert_length) train_cert_embeddings = cert_rnn.predict(train_cert_inputs, verbose=1) - self.logger.info("Certificate train input shape: %s", train_cert_embeddings.shape) + self.logger.info("cert train input shape: %s", train_cert_embeddings.shape) - val_cert_inputs = pad_sequences(data_set.val_cert_df["Token_ids"].values, maxlen=data_set.max_cert_length) + val_cert_inputs = pad_sequences(config.val_cert_df["Token_ids"].values, maxlen=config.max_cert_length) val_cert_embeddings = cert_rnn.predict(val_cert_inputs, verbose=1) - self.logger.info("Certificate val input shape: %s", val_cert_embeddings.shape) + self.logger.info("cert val input shape: %s", val_cert_embeddings.shape) - num_classes = len(data_set.label_encoder.classes_) + test_cert_inputs = pad_sequences(config.test_cert_df["Token_ids"].values, maxlen=config.max_cert_length) + test_cert_embeddings = cert_rnn.predict(test_cert_inputs, verbose=1) + self.logger.info("cert test input shape: %s", test_cert_embeddings.shape) - named_classifiers = [ - ("KNN", KNeighborsClassifier(metric="cosine", n_jobs=6)), - ("SGD", SGDClassifier(verbose=1, random_state=42)), - ("DT", DecisionTreeClassifier(random_state=42)), - ("RF", RandomForestClassifier(verbose=1, random_state=42)), - ("LinearSVM", LinearSVC(max_iter=5000, verbose=1, random_state=42)), - - ("DNN-1-200", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], False, 0.0, 50, 2, - callbacks=[ku.best_model_checkpointing("dnn-1-200")])), - ("DNN-1-300", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [300], False, 0.0, 50, 2, - callbacks=[ku.best_model_checkpointing("dnn-1-300")])), - ("DNN-200-BN-DO", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], True, 0.5, 50, 2, - callbacks=[ku.best_model_checkpointing("dnn-1-200-bn-do50")])), - ("DNN-300-BN-DO", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [300], True, 0.5, 50, 2, - callbacks=[ku.best_model_checkpointing("dnn-1-300-bn-do50")])), - - ("DNN-200-100", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, 
[200, 100], False, 0.0, 1, 2, - callbacks=[ku.best_model_checkpointing("dnn-200-100")])), - ("DNN-200-200", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], False, 0.0, 50, 2, - callbacks=[ku.best_model_checkpointing("dnn-200-200")])), - ("DNN-200-100-BN-DO", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 100], True, 0.5, 50, 2, - callbacks=[ku.best_model_checkpointing("dnn-200-100-bn-do50")])), - ("DNN-200-200-BN-DO", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], True, 0.5, 50, 2, - callbacks=[ku.best_model_checkpointing("dnn-200-200-bn-do50")])), - - # ("Test-DNN-200-BN-DO", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], True, 0.5, 1, 2, - # callbacks=[ku.best_model_checkpointing("dnn-300-bn-do")])), - - ('DU1', DummyClassifier(strategy="stratified")), - ('DU2', DummyClassifier(strategy="most_frequent")) - ] + target_label_configs = self.get_label_configuration(target_labels, config.label_encoders) test_sets = [ - #("dict", dict_embeddings, data_set.dict_df[label_column].values), - ("cert-train", train_cert_embeddings, data_set.train_cert_df[label_column].values), - ("cert-val", val_cert_embeddings, data_set.val_cert_df[label_column].values) + #("dict", dict_embeddings, config.dict_df), + ("cert-train", train_cert_embeddings, config.train_cert_df), + ("cert-val", val_cert_embeddings, config.val_cert_df), + ("cert-test", test_cert_embeddings, config.test_cert_df) + ] + + named_classifiers = [ + ("KNN", lambda num_classes: KNeighborsClassifier()), + ("KNN-Cos", lambda num_classes: KNeighborsClassifier(metric="cosine")), + ("SGD", lambda num_classes: SGDClassifier(verbose=1, random_state=42)), + ("DT", lambda num_classes: DecisionTreeClassifier(random_state=42)), + ("RF", lambda num_classes: RandomForestClassifier(verbose=1, random_state=42)), + ("LinearSVM", lambda num_classes: LinearSVC(max_iter=5000, verbose=1, random_state=42)), + + ("DNN-1-200", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], False, 0.0, 50, 2, + callbacks=[ku.best_model_checkpointing_by_model_name("dnn-1-200")])), + ("DNN-1-300", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [300], False, 0.0, 50, 2, + callbacks=[ku.best_model_checkpointing_by_model_name("dnn-1-300")])), + ("DNN-200-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], True, 0.5, 50, 2, + callbacks=[ku.best_model_checkpointing_by_model_name("dnn-1-200-bn-do50")])), + ("DNN-300-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [300], True, 0.5, 50, 2, + callbacks=[ku.best_model_checkpointing_by_model_name("dnn-1-300-bn-do50")])), + + ("DNN-200-100", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 100], False, 0.0, 1, 2, + callbacks=[ku.best_model_checkpointing_by_model_name("dnn-200-100")])), + ("DNN-200-200", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], False, 0.0, 50, 2, + callbacks=[ku.best_model_checkpointing_by_model_name("dnn-200-200")])), + ("DNN-200-100-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 100], True, 0.5, 50, 2, + callbacks=[ku.best_model_checkpointing_by_model_name("dnn-200-100-bn-do50")])), + ("DNN-200-200-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], True, 0.5, 50, 2, + 
callbacks=[ku.best_model_checkpointing_by_model_name("dnn-200-200-bn-do50")])),
+
+            # ("Test-DNN-200-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], True, 0.5, 1, 4,
+            #                             callbacks=[ku.best_model_checkpointing_by_model_name("test-dnn-200-bn-do")])),
+
+            ('DU1', lambda num_classes: DummyClassifier(strategy="stratified")),
+            ('DU2', lambda num_classes: DummyClassifier(strategy="most_frequent"))
         ]
 
+        num_experiments = len(target_label_configs) * len(named_classifiers)
+        cur_experiment = 1
+
         results = []
-        for name, classifier in named_classifiers:
-            self.logger.info("Start training of classifier %s", name)
-            classifier.fit(dict_embeddings, data_set.dict_df[label_column].values)
-            classifier.fit(train_cert_embeddings, data_set.train_cert_df[label_column].values)
+        for target_label, target_column, label_encoder in target_label_configs:
+            self.logger.info("Start evaluation experiments with label %s", target_label)
+            num_classes = len(label_encoder.classes_)
+
+            complete_train_data = np.append(dict_embeddings, train_cert_embeddings, axis=0)
+            complete_train_labels = np.append(config.dict_df[target_column].values, config.train_cert_df[target_column].values, axis=0)
+            self.logger.info("Build complete training samples (data: %s, labels: %s)", complete_train_data.shape, complete_train_labels.shape)
+
+            for cl_name, classifier_factory in named_classifiers:
+                self.logger.info("Start training of classifier %s", cl_name)
+                classifier = classifier_factory(num_classes)
+                classifier.fit(complete_train_data, complete_train_labels)
 
-            self.logger.info("Start evaluation of %s", name)
+                classifier_file_name = "cl_{}_{}.model".format(cl_name, target_label).lower()
+                classifier_file = os.path.join(AppContext.default().output_dir, classifier_file_name)
+                try:
+                    joblib.dump(classifier, classifier_file)
+                except Exception:
+                    self.logger.error("Error while saving classifier %s to %s", cl_name, classifier_file)
 
-            for ts_name, inputs, gold_labels in test_sets:
-                self.logger.info("Evaluate data set %s", ts_name)
-                prediction = classifier.predict(inputs)
-                acc_score = accuracy_score(gold_labels, prediction)
+                self.logger.info("Start evaluation of %s", cl_name)
+                for ts_name, inputs, data_frame in test_sets:
+                    gold_labels = data_frame[target_column].values
 
-                self.logger.info("Evaluation result: classifier=%s | data_set=%s | acc_score=%s", name, ts_name, acc_score)
-                results.append(EvaluationResult(name, ts_name, acc_score))
+                    self.logger.info("Evaluate data set %s", ts_name)
+                    prediction = classifier.predict(inputs)
+                    acc_score = accuracy_score(gold_labels, prediction)
+
+                    self.logger.info("Evaluation result: label=%s | classifier=%s | data_set=%s | acc_score=%s",
+                                     target_label, cl_name, ts_name, acc_score)
+                    results.append(EvaluationResult(target_label, cl_name, ts_name, acc_score))
+
+                self.logger.info("Finished experiment %s out of %s", cur_experiment, num_experiments)
+                cur_experiment += 1
 
         return results
 
-    def split_train_test(self, certificate_df: DataFrame, test_ratio: float,
+    def get_label_configuration(self, target_labels: List[str], icd10_encoders: ICD10LabelEncoders) -> List:
+        label_configs = []
+
+        for target_label in target_labels:
+            if target_label == "chap" or target_label == "chapter":
+                label_configs.append((target_label, "ICD10_chapter_encoded", icd10_encoders.chapter_encoder))
+            elif target_label == "sect" or target_label == "section":
+                label_configs.append((target_label, "ICD10_section_encoded", icd10_encoders.section_encoder))
+            elif 
target_label == "subs" or target_label == "subsection": + label_configs.append((target_label, "ICD10_subsection_encoded", icd10_encoders.subsection_encoder)) + elif target_label == "code" or target_label == "icd10": + label_configs.append((target_label, "ICD10_encoded", icd10_encoders.code_encoder)) + else: + self.logger.error("Can't create label configuration for label " + target_label) + + return label_configs + + def split_train_test(self, certificate_df: DataFrame, train_size: float, stratified_splits: bool, label_column: str) -> Tuple[DataFrame, DataFrame]: if stratified_splits: - splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=42) + splitter = StratifiedShuffleSplit(n_splits=1, train_size=train_size, random_state=42) split = splitter.split(certificate_df, certificate_df[label_column]) else: - splitter = ShuffleSplit(n_splits=1, test_size=test_ratio, random_state=42) + splitter = ShuffleSplit(n_splits=1, train_size=train_size, random_state=42) split = splitter.split(certificate_df) for train_indices, test_indices in split: @@ -238,54 +317,83 @@ class Clef18Task1V2(LoggingMixin): return model - def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastText, val_ratio: float, - label_column: str, samples: int=None, stratified_splits: bool=False) -> TrainingConfiguration: + def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastText, train_ratio: float, val_ratio: float, + label_column: str, samples: int=None, stratified_splits: bool=False) -> Configuration: + if samples: self.logger.info("Sampling %s instances", samples) cert_df = cert_df.sample(samples, random_state=42) - self.logger.info("Splitting certificate lines into train and validation") - train_cert_df, val_cert_df = self.split_train_test(cert_df, val_ratio, stratified_splits, label_column) - self.logger.info("Finished splitting: train=%s instances, test=%s instances", len(train_cert_df), len(val_cert_df)) - - label_encoder = LabelEncoder() - - if label_column == "ICD10_encoded": - self.logger.info("Fitting label encoder to ICD10 codes") - label_encoder.fit(list([icd10.strip().lower() for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower() for icd10 in cert_df["ICD10"].values])) - - elif label_column == "ICD10_chapter_encoded": - self.logger.info("Fitting label encoder to ICD10 chapters") - label_encoder.fit(list([icd10.strip().lower()[0] for icd10 in dict_df["ICD10"].values]) + - list([icd10.strip().lower()[0] for icd10 in cert_df["ICD10"].values])) + self.logger.info("Splitting certificate lines into train and evaluation data set") + train_cert_df, evaluation_cert_df = self.split_train_test(cert_df, train_ratio, stratified_splits, label_column) + self.logger.info("Finished splitting: train=%s instances, evaluation=%s instances", len(train_cert_df), len(evaluation_cert_df)) - else: - raise AssertionError("Can't encode label column '%s'" % label_column) - - self.logger.info("Found %s distinct labels in training data", len(label_encoder.classes_)) + self.logger.info("Splitting evaluation data set into validation and test set") + val_cert_df, test_cert_df = self.split_train_test(evaluation_cert_df, val_ratio, stratified_splits, label_column) + label_encoders = self.prepare_label_encoders(dict_df, cert_df) keras_tokenizer = Tokenizer(oov_token="<UNK>") self.logger.info("Start preparation of training cert data (%s instances)", len(train_cert_df)) - train_cert_df, max_cert_length = self.prepare_certificate_df(train_cert_df, "train", 
label_encoder, keras_tokenizer) + train_cert_df, max_cert_length = self.prepare_certificate_df(train_cert_df, "train", label_encoders, keras_tokenizer) self.logger.info("Start preparation of validation cert data (%s instances)", len(val_cert_df)) - val_cert_df, _ = self.prepare_certificate_df(val_cert_df, "validation", label_encoder, keras_tokenizer) - - self.logger.info("Start preparation of dictionary data (%s instances)", len(dict_df)) - dict_df, max_dict_length = self.prepare_dictionary_df(dict_df, "train", label_encoder, keras_tokenizer) + val_cert_df, _ = self.prepare_certificate_df(val_cert_df, "validation", label_encoders, keras_tokenizer) - return TrainingConfiguration(train_cert_df, val_cert_df, dict_df, max_cert_length, max_dict_length, - ft_model.vector_size, label_column, label_encoder, keras_tokenizer) + self.logger.info("Start preparation of test cert data (%s instances)", len(test_cert_df)) + test_cert_df, _ = self.prepare_certificate_df(test_cert_df, "test", label_encoders, keras_tokenizer) - def prepare_certificate_df(self, certificate_df: DataFrame, mode: str, label_encoder: LabelEncoder, + self.logger.info("Start preparation of dictionary data (%s instances)", len(dict_df)) + dict_df, max_dict_length = self.prepare_dictionary_df(dict_df, "train", label_encoders, keras_tokenizer) + + return Configuration(train_cert_df, val_cert_df, test_cert_df, dict_df, max_cert_length, max_dict_length, + ft_model.vector_size, label_column, label_encoders, keras_tokenizer) + + def prepare_label_encoders(self, dict_df: DataFrame, cert_df: DataFrame) -> ICD10LabelEncoders: + self.logger.info("Fitting label encoder to ICD10 codes") + icd10_code_encoder = LabelEncoder() + icd10_code_encoder.fit(list([icd10.strip() for icd10 in dict_df["ICD10"].values]) + + list([icd10.strip() for icd10 in cert_df["ICD10"].values])) + self.logger.info("Found %s distinct ICD10 codes within the data set", len(icd10_code_encoder.classes_)) + + self.logger.info("Fitting label encoder to ICD10 chapters") + icd10_chapter_encoder = LabelEncoder() + icd10_chapter_encoder.fit(list([icd10.strip().lower()[0] for icd10 in dict_df["ICD10"].values]) + + list([icd10.strip().lower()[0] for icd10 in cert_df["ICD10"].values])) + self.logger.info("Found %s distinct ICD10 chapters within the data set", len(icd10_chapter_encoder.classes_)) + + self.logger.info("Fitting label encoder to ICD10 section") + icd10_section_encoder = LabelEncoder() + icd10_section_encoder.fit(list([icd10.strip().lower()[0:2] for icd10 in dict_df["ICD10"].values]) + + list([icd10.strip().lower()[0:2] for icd10 in cert_df["ICD10"].values])) + self.logger.info("Found %s distinct ICD10 sections within the data set", len(icd10_section_encoder.classes_)) + + self.logger.info("Fitting label encoder to ICD10 subsection") + icd10_subsection_encoder = LabelEncoder() + icd10_subsection_encoder.fit(list([icd10.strip().lower()[0:3] for icd10 in dict_df["ICD10"].values]) + + list([icd10.strip().lower()[0:3] for icd10 in cert_df["ICD10"].values])) + self.logger.info("Found %s distinct ICD10 subsections within the data set", len(icd10_subsection_encoder.classes_)) + + return ICD10LabelEncoders(icd10_chapter_encoder, icd10_section_encoder, icd10_subsection_encoder, icd10_code_encoder) + + def prepare_certificate_df(self, certificate_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders, keras_tokenizer: Tokenizer) -> Tuple[DataFrame, int]: certificate_pipeline = Pipeline([ - ("Encode-ICD10-codes", pdu.encode_labels("ICD10", "ICD10_encoded")), - 
("Extract-ICD10-chapter", pdu.extract_icd10_chapter("ICD10", "ICD10_chapter")), - ("Encode-ICD10-chapter", pdu.encode_labels("ICD10_chapter", "ICD10_chapter_encoded", label_encoder, False)), + ("Encode-ICD10-chapter", pdu.encode_labels("ICD10_chapter", "ICD10_chapter_encoded", + icd10_encoders.chapter_encoder, False)), + + ("Extract-ICD10-section", pdu.extract_icd10_section("ICD10", "ICD10_section")), + ("Encode-ICD10-section", pdu.encode_labels("ICD10_section", "ICD10_section_encoded", + icd10_encoders.section_encoder, False)), + + ("Extract-ICD10-subsection", pdu.extract_icd10_subsection("ICD10", "ICD10_subsection")), + ("Encode-ICD10-subsection", pdu.encode_labels("ICD10_subsection", "ICD10_subsection_encoded", + icd10_encoders.subsection_encoder, False)), + + ("Clean-ICD10-code", pdu.strip("ICD10")), + ("Encode-ICD10-code", pdu.encode_labels("ICD10", "ICD10_encoded", + icd10_encoders.code_encoder, False)), ("LowercaseText", pdu.to_lowercase("RawText")), ("TokenizeText", pdu.keras_sequencing("RawText", "Token_ids", keras_tokenizer, (mode == "train"))) @@ -300,13 +408,24 @@ class Clef18Task1V2(LoggingMixin): return cert_data_prepared, max_length - def prepare_dictionary_df(self, dictionary_df: DataFrame, mode: str, label_encoder: LabelEncoder, + def prepare_dictionary_df(self, dictionary_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders, keras_tokenizer: Tokenizer) -> Tuple[DataFrame, int]: dictionary_pipeline = Pipeline([ - ("Encode-ICD10-codes", pdu.encode_labels("ICD10", "ICD10_encoded")), - ("Extract-ICD10-chapter", pdu.extract_icd10_chapter("ICD10", "ICD10_chapter")), - ("Encode-ICD10-chapter", pdu.encode_labels("ICD10_chapter", "ICD10_chapter_encoded", label_encoder, False)), + ("Encode-ICD10-chapter", pdu.encode_labels("ICD10_chapter", "ICD10_chapter_encoded", + icd10_encoders.chapter_encoder, False)), + + ("Extract-ICD10-section", pdu.extract_icd10_section("ICD10", "ICD10_section")), + ("Encode-ICD10-section", pdu.encode_labels("ICD10_section", "ICD10_section_encoded", + icd10_encoders.section_encoder, False)), + + ("Extract-ICD10-subsection", pdu.extract_icd10_subsection("ICD10", "ICD10_subsection")), + ("Encode-ICD10-subsection", pdu.encode_labels("ICD10_subsection", "ICD10_subsection_encoded", + icd10_encoders.subsection_encoder, False)), + + ("Clean-ICD10-code", pdu.strip("ICD10")), + ("Encode-ICD10-code", pdu.encode_labels("ICD10", "ICD10_encoded", + icd10_encoders.code_encoder, False)), ("CombineTexts", pdu.combine_texts(["DiagnosisText", "Standardized"], "DictText")), ("LowercaseText", pdu.to_lowercase("DictText")), @@ -366,7 +485,8 @@ class Clef18Task1V2(LoggingMixin): result_configurations = [ ("results.csv", None), ("results_by_classifier.csv", lambda r: r.classifier_name), - ("results_by_data_set.csv", lambda r: r.data_set_name) + ("results_by_data_set.csv", lambda r: r.data_set_name), + ("results_by_label.csv", lambda r: r.target_label) ] for file_name, sort_key in result_configurations: @@ -376,38 +496,97 @@ class Clef18Task1V2(LoggingMixin): eval_results = sorted(eval_results, key=sort_key) for r in eval_results: - result_writer.write("%s\t%s\t%s\n" % (r.classifier_name, r.data_set_name, r.accuracy)) + result_writer.write("%s\t%s\t%s\t%s\n" % (r.target_label, r.classifier_name, r.data_set_name, r.accuracy)) result_writer.close() + def save_configuration(self, configuration: Configuration): + label_encoder_file = os.path.join(AppContext.default().output_dir, "label_encoder.pk") + self.logger.info("Saving label encoder to " + label_encoder_file) + with 
open(label_encoder_file, 'wb') as encoder_writer:
+            pickle.dump(configuration.label_encoders, encoder_writer)
+
+        keras_tokenizer_file = os.path.join(AppContext.default().output_dir, "keras_tokenizer.pk")
+        self.logger.info("Saving keras sequencer to " + keras_tokenizer_file)
+        with open(keras_tokenizer_file, 'wb') as keras_sequencer_writer:
+            pickle.dump(configuration.keras_tokenizer, keras_sequencer_writer)
+
+        configuration_file = os.path.join(AppContext.default().output_dir, "configuration.pk")
+        self.logger.info("Saving configuration to " + configuration_file)
+        with open(configuration_file, 'wb') as train_conf_writer:
+            pickle.dump(configuration, train_conf_writer)
+
+    def reload_configuration(self, file_path: str):
+        self.logger.info("Reloading configuration from " + file_path)
+        with open(file_path, 'rb') as train_conf_reader:
+            configuration = pickle.load(train_conf_reader)
+
+        return configuration
+
+    def reload_embedding_model(self, emb_model_file: str):
+        self.logger.info("Reloading embedding model from " + emb_model_file)
+        return k.models.load_model(emb_model_file)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(prog="CLEF2018")
-    parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
-    parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
-    parser.add_argument("--val_ratio", help="Ratio of validation samples to use", default=0.2, type=float)
-    parser.add_argument("--neg_samples", help="Number of negative samples for each pair to use", default=75, type=int)
-    parser.add_argument("--train_samples", help="Number of instances to sample from the training data", default=None, type=int)
-    parser.add_argument("--label_column", help="Column used to train the models", default="ICD10_chapter_encoded", type=str)
-    parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool)
+    subparsers = parser.add_subparsers(dest="mode")
+    subparsers.required = True
+
+    train_emb_parser = subparsers.add_parser("train-emb")
+    train_emb_parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
+    train_emb_parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
+    train_emb_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
+    train_emb_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.4, type=float)
+    train_emb_parser.add_argument("--neg_samples", help="Number of negative samples for each pair to use", default=75, type=int)
+    train_emb_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
+    train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_encoded", type=str)
+    train_emb_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", action="store_true")
+    train_emb_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], nargs="+")
+
+    eval_classifier_parser = subparsers.add_parser("eval-cl")
+    eval_classifier_parser.add_argument("emb_model", help="Path to the embedding model to use")
+    eval_classifier_parser.add_argument("train_conf", help="Path to the training configuration dump")
+    eval_classifier_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
+    eval_classifier_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.4, type=float)
+    eval_classifier_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
+    eval_classifier_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_encoded", type=str)
+    eval_classifier_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", action="store_true")
+    eval_classifier_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], nargs="+")
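The two sub-commands split the former single pipeline into separate training and evaluation runs. Hypothetical invocations (the positional paths point at files a previous train-emb run writes to the output directory):

# python clef18_task1_v2.py train-emb --epochs 20 --batch_size 32 --neg_samples 75
# python clef18_task1_v2.py eval-cl <output_dir>/embedding_model_best.h5 <output_dir>/configuration.pk --target_labels chapter code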
eval_classifier_parser.add_argument("train_conf", help="Path to the training configuration dump") + eval_classifier_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float) + eval_classifier_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.4, type=float) + eval_classifier_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int) + eval_classifier_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_encoded", type=str) + eval_classifier_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool) + eval_classifier_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append') args = parser.parse_args() - AppContext.initialize_by_app_name("eCLEF2018-Task1") + AppContext.initialize_by_app_name(args.mode) clef_data = Clef18Task1Data() it_dictionary = clef_data.read_it_dictionary() it_certificates = clef_data.read_it_train_certificates() it_certificates = clef_data.filter_single_code_lines(it_certificates) + #sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] + #ft_it_model = FastText(sentences, min_count=1) + ft_embeddings = FastTextEmbeddings() ft_it_model = ft_embeddings.load_it_embeddings() clef18_task1 = Clef18Task1V2() - data_set = clef18_task1.prepare_data_set(it_certificates, it_dictionary, ft_it_model, args.val_ratio, - args.label_column, args.train_samples, args.strat_splits) - embedding_model = clef18_task1.train_embedding_model(data_set, ft_it_model, args.neg_samples, args.epochs, args.batch_size) + if args.mode == "train-emb": + configuration = clef18_task1.prepare_data_set( + it_certificates, it_dictionary, ft_it_model, args.train_ratio, + args.val_ratio,args.strat_column, args.samples, args.strat_splits) + clef18_task1.save_configuration(configuration) + + embedding_model = clef18_task1.train_embedding_model(configuration, ft_it_model, args.neg_samples, args.epochs, args.batch_size) - eval_result = clef18_task1.train_and_evaluate_classifiers(embedding_model, data_set) + elif args.mode == "eval-cl": + configuration = clef18_task1.reload_configuration(args.train_conf) + embedding_model = clef18_task1.reload_embedding_model(args.emb_model) + + eval_result = clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels) clef18_task1.save_evaluation_results(eval_result) + + diff --git a/code_mario/keras_extension.py b/code_mario/keras_extension.py index 59a3900c95f91b307167c51eab4cc8ac9c101cd4..33d4116fc34883bb017293d5d0fe997901f23594 100644 --- a/code_mario/keras_extension.py +++ b/code_mario/keras_extension.py @@ -12,8 +12,12 @@ from util import LoggingMixin class KerasUtil(object): @staticmethod - def best_model_checkpointing(model_name: str, monitor_loss: str = "loss"): - best_model_file = os.path.join(AppContext.default().output_dir, "optimal_%s.h5" % model_name) + def best_model_checkpointing_by_model_name(model_name: str, monitor_loss: str = "loss"): + best_model_file = os.path.join(AppContext.default().output_dir, "%s_best.h5" % model_name) + return ModelCheckpoint(filepath=best_model_file, monitor=monitor_loss, save_best_only=True, verbose=1) + + @staticmethod + def best_model_checkpointing_by_file_path(best_model_file: str, monitor_loss: str = 
"loss"): return ModelCheckpoint(filepath=best_model_file, monitor=monitor_loss, save_best_only=True, verbose=1) @@ -59,7 +63,7 @@ class ExtendedKerasClassifier(KerasClassifier, LoggingMixin): else: self.logger.debug("Model wasn't re-fitted -> re-using existing model") pass - + self.logger.info("Classifer has %s classes", len(self.classes_)) return super(ExtendedKerasClassifier, self).predict(x, **kwargs) def __getstate__(self): diff --git a/code_mario/preprocessing.py b/code_mario/preprocessing.py index 45fd4e01c2f3ec4b1fcaa94dd7044e42121dd770..7f1f14c36c511430fb19d4ef23e4f4f047a14829 100644 --- a/code_mario/preprocessing.py +++ b/code_mario/preprocessing.py @@ -20,6 +20,13 @@ class DataPreparationUtil(object): return MapFunction(column, _lower) + @staticmethod + def strip(column: str): + def _strip(text): + return str(text).strip() + + return MapFunction(column, _strip) + @staticmethod def tokenize(text_column: str, token_column: str = "tokens"): return SimpleTokenizer(text_column, token_column) @@ -57,6 +64,20 @@ class DataPreparationUtil(object): return MapFunction(icd10_column, _extract, target_column) + @staticmethod + def extract_icd10_section(icd10_column: str, target_column: str): + def _extract(value): + return value.strip()[0:2].lower() + + return MapFunction(icd10_column, _extract, target_column) + + @staticmethod + def extract_icd10_subsection(icd10_column: str, target_column: str): + def _extract(value): + return value.strip()[0:3].lower() + + return MapFunction(icd10_column, _extract, target_column) + @staticmethod def extract_icd10_subchapter(icd10_column: str, target_column: str): def _extract(value): diff --git a/requirements.txt b/requirements.txt index f8f9105a42859062355469b8ae9a329179f9b796..1f98a40279acebe576e2230ae1ed2e6d79b82d9e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ sklearn tensorflow tqdm h5py +cython