From 54b111abfef40c1879b6b619012d2eb8460a8114 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20Sa=CC=88nger?= <mario.saenger@student.hu-berlin.de>
Date: Tue, 8 May 2018 15:12:46 +0200
Subject: [PATCH] Add prediction implementation

---
 code_mario/clef18_task1_base.py | 87 ++++++++++++++++++---------------
 code_mario/clef18_task1_data.py | 33 ++++++++++++-
 code_mario/clef18_task1_emb1.py | 80 ++++++++++++++++++++++++------
 3 files changed, 145 insertions(+), 55 deletions(-)

diff --git a/code_mario/clef18_task1_base.py b/code_mario/clef18_task1_base.py
index cf1c35e..0d4f860 100644
--- a/code_mario/clef18_task1_base.py
+++ b/code_mario/clef18_task1_base.py
@@ -78,47 +78,51 @@ class Clef18Task1Base(LoggingMixin):
         ]
 
         named_classifiers = [
-            ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()),
-            ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")),
-            ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)),
-            ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)),
-            ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)),
+            # ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()),
+            # ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")),
+            # ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)),
+            # ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)),
+            # ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)),
             #("LinearSVM", lambda label, input_dim, output_dim, val_data: LinearSVC(max_iter=10000, verbose=1, random_state=42)),
 
-            ("DNN-200", lambda label, input_dim, output_dim, val_data:
-                 self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
-                                            batch_normalization=False, dropout_rate=0.0, epochs=10, batch_size=16)),
-
-            ("DNN-300", lambda label, input_dim, output_dim, val_data:
-                 self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
-                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-
-            ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data:
-                 self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
-                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
-            ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data:
-                 self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
-                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
-
-            ("DNN-200-100", lambda label, input_dim, output_dim, val_data:
-                 self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
-                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-            ("DNN-200-200", lambda label, input_dim, output_dim, val_data:
-                 self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
-                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-            ("DNN-300-200", lambda label, input_dim, output_dim, val_data:
-                 self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
-                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-
-            ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data:
-                 self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
-                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
-            ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data:
-                 self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
-                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
-            ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data:
-                 self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
-                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            ("DNN-200-Test", lambda label, input_dim, output_dim, val_data:
+                 self.create_dnn_classifier("dnn-200-test", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=1, batch_size=16)),
+
+            # ("DNN-200", lambda label, input_dim, output_dim, val_data:
+            #      self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+            #
+            # ("DNN-300", lambda label, input_dim, output_dim, val_data:
+            #      self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
+            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+            #
+            # ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+            #      self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            # ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data:
+            #      self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
+            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            #
+            # ("DNN-200-100", lambda label, input_dim, output_dim, val_data:
+            #      self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
+            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+            # ("DNN-200-200", lambda label, input_dim, output_dim, val_data:
+            #      self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
+            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+            # ("DNN-300-200", lambda label, input_dim, output_dim, val_data:
+            #      self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
+            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+            #
+            # ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data:
+            #      self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
+            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            # ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+            #      self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
+            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            # ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+            #      self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
+            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
 
             ('DU1', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="stratified")),
             ('DU2', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="most_frequent"))
@@ -256,6 +260,11 @@ class Clef18Task1Base(LoggingMixin):
         self.logger.info("Reloading embedding model from " + emb_model_file)
         return k.models.load_model(emb_model_file)
 
+    def reload_classifier(self, classifier_file: str):
+        self.logger.info("Loading classifier from %s", classifier_file)
+        classifier = joblib.load(classifier_file)
+        return classifier
+
     def create_dnn_classifier(self, model_name, label: str, val_data: Tuple, **kwargs):
         if val_data is not None:
             monitor_loss = "val_loss"
diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py
index 8fd724f..b550774 100644
--- a/code_mario/clef18_task1_data.py
+++ b/code_mario/clef18_task1_data.py
@@ -27,6 +27,17 @@ class Clef18Task1Data(LoggingMixin):
         else:
             raise AssertionError("Unsupported language: " + id)
 
+    def read_test_certificates_by_lang(self, lang: str) -> DataFrame:
+        if lang == "it":
+            return self.read_it_test_certificates()
+        elif lang == "hu":
+            return self.read_hu_test_certificates()
+        elif lang == "fr":
+            return self.read_fr_test_certificates()
+
+        else:
+            raise AssertionError("Unsupported language: " + lang)
+
     def read_dictionary_by_id(self, id: str) -> DataFrame:
         if id == "it":
             return self.read_it_dictionary()
@@ -54,6 +65,10 @@ class Clef18Task1Data(LoggingMixin):
         dictionary_file = os.path.join(base_folder, "dictionary_IT.csv")
         return self._read_icd10_dictionary(dictionary_file, "iso-8859-1")
 
+    def read_it_test_certificates(self) -> DataFrame:
+        brutes_file = "data/test/IT/test/raw/corpus/CausesBrutes_IT_2.csv"
+        return self._read_test_data(brutes_file)
+
     # --------------------------------------------------------------------------------
 
     def read_hu_train_certificates(self) -> DataFrame:
@@ -69,6 +84,10 @@ class Clef18Task1Data(LoggingMixin):
         dictionary_file = os.path.join(base_folder, "Hungarian_dictionary_UTF8.csv")
         return self._read_icd10_dictionary(dictionary_file, "utf-8")
 
+    def read_hu_test_certificates(self) -> DataFrame:
+        brutes_file = "data/test/HU/test/raw/corpus/CausesBrutes_HU_2.csv"
+        return self._read_test_data(brutes_file)
+
     # --------------------------------------------------------------------------------
 
     def read_fr_train_certificates(self) -> DataFrame:
@@ -96,6 +115,10 @@ class Clef18Task1Data(LoggingMixin):
         dictionary_file = os.path.join(base_folder, "Dictionnaire2006-2010.csv")
         return self._read_icd10_dictionary(dictionary_file, "utf-8")
 
+    def read_fr_test_certificates(self) -> DataFrame:
+        brutes_file = "data/test/FR/test/raw/corpus/CausesBrutes_FR_2.csv"
+        return self._read_test_data(brutes_file)
+
     # --------------------------------------------------------------------------------
 
     def read_all_con_dictionary(self) -> DataFrame:
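All three readers delegate to _read_test_data (added further below), which mirrors the training-corpus loader. A quick standalone check of one file, assuming the CausesBrutes CSVs follow the semicolon-separated, Latin-1 layout used throughout this module:

    import pandas as pd

    # Same parsing as _read_test_data: indexed by (YearCoded, DocID, LineID).
    test_data = pd.read_csv("data/test/IT/test/raw/corpus/CausesBrutes_IT_2.csv",
                            sep=";", encoding="iso-8859-1",
                            index_col=["YearCoded", "DocID", "LineID"],
                            skipinitialspace=True)
    print(test_data.shape)
    print(test_data.head())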
@@ -146,7 +169,8 @@ class Clef18Task1Data(LoggingMixin):
         brutus_data = []
         for brutus_file in brutus_files:
             self.logger.info("Reading brutus file from %s", brutus_file)
-            brutus_data.append(pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"]))
+            brutus_data.append(pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"],
+                                           skipinitialspace=True))
 
             self.logger.info("Found %s death certificate entries", len(brutus_data[-1]))
 
@@ -177,6 +201,13 @@ class Clef18Task1Data(LoggingMixin):
 
         return dictionary_data
 
+    def _read_test_data(self, file: str) -> DataFrame:
+        self.logger.info("Reading test certificates from %s", file)
+        test_data = pd.read_csv(file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"], skipinitialspace=True)
+        self.logger.info("Found %s test certificate lines.", len(test_data))
+
+        return test_data
+
 
 def check_multi_label_distribution(ds_name: str, certificate_df: DataFrame):
     print("Data set: ", ds_name)
diff --git a/code_mario/clef18_task1_emb1.py b/code_mario/clef18_task1_emb1.py
index 8d00d18..71be48f 100644
--- a/code_mario/clef18_task1_emb1.py
+++ b/code_mario/clef18_task1_emb1.py
@@ -1,4 +1,6 @@
 from gensim.models import FastText
+from sklearn.base import BaseEstimator
+from sklearn.svm import SVC
 
 from init import *
 
@@ -148,19 +150,21 @@ class Clef18Task1Emb1(Clef18Task1Base):
 
         return model
 
+    # ---------------------------------------------------------------------------------------------------------------------------------------
+
     def train_and_evaluate_classifiers(self, emb_model: Model, config: Emb1Configuration, target_labels: List) -> List[EvaluationResult]:
         self.logger.info("Start training and evaluation of classifier models")
 
+        cert_input = emb_model.inputs[0]
+        cert_rnn = Model(inputs=cert_input, outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model")
+
         self.logger.info("Building dictionary embeddings")
         dict_input = emb_model.inputs[1]
-        dict_rnn = Model(inputs=dict_input, outputs=emb_model.get_layer("dict_rnn").output, name="Dict-RNN-Model")
-
-        dict_inputs = pad_sequences(config.dict_df["Token_ids"].values, maxlen=config.max_dict_length, padding="post")
-        dict_embeddings = dict_rnn.predict(dict_inputs, verbose=1, batch_size=1)
+        #dict_rnn = Model(inputs=dict_input, outputs=emb_model.get_layer("dict_rnn").output, name="Dict-RNN-Model")
+        dict_inputs = pad_sequences(config.dict_df["Token_ids"].values, maxlen=config.max_cert_length, padding="post")
+        dict_embeddings = cert_rnn.predict(dict_inputs, verbose=1, batch_size=1)
 
         self.logger.info("Building train certificate embeddings")
-        cert_input = emb_model.inputs[0]
-        cert_rnn = Model(inputs=cert_input, outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model")
-
         train_cert_inputs = pad_sequences(config.train_cert_df["Token_ids"].values, maxlen=config.max_cert_length, padding="post")
         train_cert_embeddings = cert_rnn.predict(train_cert_inputs, verbose=1)
         self.logger.info("cert train input shape: %s", train_cert_embeddings.shape)
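The cert_rnn construction above is the standard Keras idiom for reusing a trained sub-network: build a second Model that shares the original input but stops at a named intermediate layer. A self-contained toy version of the same technique (layer names and sizes are arbitrary here):

    import numpy as np
    from keras.models import Model
    from keras.layers import Input, Embedding, GRU, Dense

    # Toy network standing in for the trained embedding model.
    inp = Input(shape=(10,), name="cert_input")
    hidden = GRU(16, name="cert_rnn")(Embedding(input_dim=100, output_dim=8)(inp))
    full_model = Model(inputs=inp, outputs=Dense(1, activation="sigmoid")(hidden))

    # Sub-model mapping token-id sequences to the RNN state -- no retraining needed.
    encoder = Model(inputs=full_model.inputs[0], outputs=full_model.get_layer("cert_rnn").output)
    print(encoder.predict(np.random.randint(0, 100, size=(4, 10))).shape)  # (4, 16)

Note that the dictionary entries are now padded to max_cert_length and encoded with cert_rnn as well, so dictionary and certificate embeddings live in the same vector space.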
@@ -189,6 +193,38 @@ class Clef18Task1Emb1(Clef18Task1Base):
 
     # ---------------------------------------------------------------------------------------------------------------------------------------
 
+    def predict(self, emb_model: Model, classifier, conf: Emb1Configuration, test_df: DataFrame) -> DataFrame:
+        self.logger.info("Start preprocessing test data")
+        preprocessing_pipeline = Pipeline([
+            ("LowercaseText", pdu.to_lowercase("RawText")),
+            ("TokenizeText", pdu.keras_sequencing("RawText", "Word_ids", conf.keras_tokenizer, False))
+        ])
+
+        #test_df = Clef18Task1Data().read_it_train_certificates()
+
+        test_df = preprocessing_pipeline.fit_transform(test_df)
+        test_emb_inputs = pad_sequences(test_df["Word_ids"].values, maxlen=conf.max_cert_length, padding="post")
+        self.logger.info("Finished preprocessing of test data (shape: %s)", test_emb_inputs.shape)
+
+        self.logger.info("Start generation of embeddings")
+        cert_input = emb_model.inputs[0]
+        cert_rnn = Model(inputs=cert_input, outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model")
+        test_cl_inputs = cert_rnn.predict(test_emb_inputs, 16, verbose=1)
+
+        predictions = classifier.predict(test_cl_inputs)
+        predicted_labels = conf.label_encoders.code_encoder.inverse_transform(predictions)
+
+        result = DataFrame(columns=["DocID", "YearCoded", "LineID", "Rank", "StandardText", "ICD10", "IntervalText"])
+        for i, (id, row) in tqdm(enumerate(test_df.iterrows()), desc="build-result", total=len(test_df)):
+            result = result.append({"DocID": id[1], "YearCoded": id[0], "LineID": id[2], "Rank": "",
+                                    "StandardText": "", "ICD10": predicted_labels[i], "IntervalText": ""}, ignore_index=True)
+
+        output_file = os.path.join(AppContext.default().output_dir, "prediction_train.csv")
+        result.to_csv(output_file, sep=";", index=False)
+        return result
+
+    # ---------------------------------------------------------------------------------------------------------------------------------------
+
     def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastTextModel, train_ratio: float, val_ratio: float,
                          strat_column: str, samples: int=None, stratified_splits: bool=False) -> Emb1Configuration:
 
@@ -362,10 +398,11 @@ if __name__ == "__main__":
     eval_classifier_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool)
     eval_classifier_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
 
-    eval_emb_classifer_parser = subparsers.add_parser("eval-emb-cl")
-    eval_emb_classifer_parser.add_argument("emb_model", help="Path to the embedding model to use")
-    eval_emb_classifer_parser.add_argument("train_conf", help="Path to the training configuration dump")
-    eval_emb_classifer_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"])
+    predict_parser = subparsers.add_parser("pred")
+    predict_parser.add_argument("emb_model", help="Path to the learned embedding model to use")
+    predict_parser.add_argument("cl_model", help="Path to the learned classifier model to use")
+    predict_parser.add_argument("train_conf", help="Path to the training configuration dump")
+    predict_parser.add_argument("lang", help="Language to predict on", choices=["it", "fr", "hu"])
 
     args = parser.parse_args()
 
@@ -375,10 +412,11 @@ if __name__ == "__main__":
     clef18_task1.save_arguments(args)
 
     clef_data = Clef18Task1Data()
-    dictionary = clef_data.read_dictionary_by_id(args.lang)
-    certificates = clef_data.read_train_certifcates_by_id(args.lang)
 
     if args.mode == "train-emb":
+        dictionary = clef_data.read_dictionary_by_id(args.lang)
+        certificates = clef_data.read_train_certifcates_by_id(args.lang)
+
         ft_embeddings = FastTextEmbeddings()
         ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
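One practical caveat in predict above: DataFrame.append copies the whole frame on every iteration, so the result loop is quadratic in the number of test lines. Collecting plain dicts and constructing the frame once is typically much faster; a sketch with dummy stand-ins for test_df and predicted_labels:

    import pandas as pd

    # Dummy stand-ins for the values predict() works with.
    index = pd.MultiIndex.from_tuples([(2014, "doc-1", 1), (2014, "doc-1", 2)],
                                      names=["YearCoded", "DocID", "LineID"])
    test_df = pd.DataFrame({"RawText": ["infarto", "polmonite"]}, index=index)
    predicted_labels = ["I21", "J18"]

    # Build all rows first, then construct the DataFrame in one go.
    rows = [{"DocID": idx[1], "YearCoded": idx[0], "LineID": idx[2], "Rank": "",
             "StandardText": "", "ICD10": predicted_labels[i], "IntervalText": ""}
            for i, (idx, row) in enumerate(test_df.iterrows())]
    result = pd.DataFrame(rows, columns=["DocID", "YearCoded", "LineID", "Rank",
                                         "StandardText", "ICD10", "IntervalText"])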
@@ -400,15 +438,27 @@ if __name__ == "__main__":
 
         embedding_model = clef18_task1.train_embedding_model(configuration, ft_model, neg_sampling_strategy, args.epochs, args.batch_size)
 
+        clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels)
+
     elif args.mode == "eval-cl":
         configuration = clef18_task1.reload_configuration(args.train_conf)
         embedding_model = clef18_task1.reload_embedding_model(args.emb_model)
 
-    elif args.mode == "eval-emb-cl":
-        configuration = clef18_task1.reload_configuration(args.train_conf)
+        clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels)
+
+    elif args.mode == "pred":
         embedding_model = clef18_task1.reload_embedding_model(args.emb_model)
+        classifier_model = clef18_task1.reload_classifier(args.cl_model)
+        configuration = clef18_task1.reload_configuration(args.train_conf)
+
+        test_certificates = clef_data.read_test_certificates_by_lang(args.lang)
+
+        ft_embeddings = FastTextEmbeddings()
+        #ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
+        sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+        ft_model = FastTextModel("dummy", [FastText(sentences, min_count=1)])
 
-        clef18_task1.evaluate_embedding_classifier(embedding_model, configuration, certificates)
+        clef18_task1.predict(embedding_model, classifier_model, configuration, test_certificates)
 
         #eval_result = clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels)
         #clef18_task1.save_evaluation_results(eval_result)
-- 
GitLab
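With the new subcommand in place, a prediction run chains the three saved artifacts with a language code. Assuming the artifacts were written by earlier train-emb and eval-cl runs (the file names below are placeholders):

    python code_mario/clef18_task1_emb1.py pred \
        output/embedding_model.h5 \
        output/dnn-200-test_classifier.joblib \
        output/train_conf.dump \
        it

Note that the pred branch currently builds a toy FastText model from two hard-coded sentences instead of loading the real embeddings; since ft_model is never passed to predict, this only keeps start-up fast while testing the pipeline.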