From 54b111abfef40c1879b6b619012d2eb8460a8114 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20Sa=CC=88nger?= <mario.saenger@student.hu-berlin.de>
Date: Tue, 8 May 2018 15:12:46 +0200
Subject: [PATCH] Add prediction implementation

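Add a "pred" CLI mode that reloads a trained embedding model, classifier
and training configuration, reads the raw IT/HU/FR test certificates,
embeds them with the certificate RNN encoder and writes the predicted
ICD-10 codes to a semicolon-separated result file.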
---
 code_mario/clef18_task1_base.py | 89 ++++++++++++++++++---------
 code_mario/clef18_task1_data.py | 34 ++++++++++++-
 code_mario/clef18_task1_emb1.py | 84 ++++++++++++++++++++++++------
 3 files changed, 152 insertions(+), 55 deletions(-)

diff --git a/code_mario/clef18_task1_base.py b/code_mario/clef18_task1_base.py
index cf1c35e..0d4f860 100644
--- a/code_mario/clef18_task1_base.py
+++ b/code_mario/clef18_task1_base.py
@@ -78,47 +78,52 @@ class Clef18Task1Base(LoggingMixin):
         ]
 
         named_classifiers = [
-            ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()),
-            ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")),
-            ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)),
-            ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)),
-            ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)),
+            # ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()),
+            # ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")),
+            # ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)),
+            # ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)),
+            # ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)),
             #("LinearSVM", lambda label, input_dim, output_dim, val_data: LinearSVC(max_iter=10000, verbose=1, random_state=42)),
 
-            ("DNN-200", lambda label, input_dim, output_dim, val_data:
-                self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
-                                            batch_normalization=False, dropout_rate=0.0, epochs=10, batch_size=16)),
-
-            ("DNN-300", lambda label, input_dim, output_dim, val_data:
-                self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
-                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-
-            ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data:
-                self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
-                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
-            ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data:
-                self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
-                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
-
-            ("DNN-200-100", lambda label, input_dim, output_dim, val_data:
-                self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
-                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-            ("DNN-200-200", lambda label, input_dim, output_dim, val_data:
-                self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
-                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-            ("DNN-300-200", lambda label, input_dim, output_dim, val_data:
-                self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
-                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-
-            ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data:
-                self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
-                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
-            ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data:
-                self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
-                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
-            ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data:
-                self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim,hidden_layer_sizes=[300, 200],
-                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
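+            # NOTE: single-epoch smoke-test configuration; the full classifier grid is commented out for now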
+            ("DNN-200-Test", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-test", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=1, batch_size=16)),
+
+            # ("DNN-200", lambda label, input_dim, output_dim, val_data:
+            #     self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+            #
+            # ("DNN-300", lambda label, input_dim, output_dim, val_data:
+            #     self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
+            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+            #
+            # ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+            #     self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            # ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data:
+            #     self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
+            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            #
+            # ("DNN-200-100", lambda label, input_dim, output_dim, val_data:
+            #     self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
+            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+            # ("DNN-200-200", lambda label, input_dim, output_dim, val_data:
+            #     self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
+            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+            # ("DNN-300-200", lambda label, input_dim, output_dim, val_data:
+            #     self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
+            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+            #
+            # ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data:
+            #     self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
+            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            # ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+            #     self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
+            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            # ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+            #     self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
+            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
 
             ('DU1', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="stratified")),
             ('DU2', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="most_frequent"))
@@ -256,6 +261,12 @@ class Clef18Task1Base(LoggingMixin):
         self.logger.info("Reloading embedding model from " + emb_model_file)
         return k.models.load_model(emb_model_file)
 
+    def reload_classifier(self, classifier_file: str):
+        self.logger.info("Loading classifier from %s", classifier_file)
+        classifier = joblib.load(classifier_file)
+        return classifier
+
     def create_dnn_classifier(self, model_name, label: str, val_data: Tuple, **kwargs):
         if val_data is not None:
             monitor_loss = "val_loss"
diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py
index 8fd724f..b550774 100644
--- a/code_mario/clef18_task1_data.py
+++ b/code_mario/clef18_task1_data.py
@@ -27,6 +27,17 @@ class Clef18Task1Data(LoggingMixin):
         else:
             raise AssertionError("Unsupported language: " + id)
 
+    def read_test_certificates_by_lang(self, lang: str) -> DataFrame:
+        if lang == "it":
+            return self.read_it_test_certificates()
+        elif lang == "hu":
+            return self.read_hu_test_certificates()
+        elif lang == "fr":
+            return self.read_fr_test_certificates()
+
+        else:
+            raise AssertionError("Unsupported language: " + lang)
+
     def read_dictionary_by_id(self, id: str) -> DataFrame:
         if id == "it":
             return self.read_it_dictionary()
@@ -54,6 +65,10 @@ class Clef18Task1Data(LoggingMixin):
         dictionary_file = os.path.join(base_folder, "dictionary_IT.csv")
         return self._read_icd10_dictionary(dictionary_file, "iso-8859-1")
 
+    def read_it_test_certificates(self) -> DataFrame:
+        brutes_file = "data/test/IT/test/raw/corpus/CausesBrutes_IT_2.csv"
+        return self._read_test_data(brutes_file)
+
     # --------------------------------------------------------------------------------
 
     def read_hu_train_certificates(self) -> DataFrame:
@@ -69,6 +84,10 @@ class Clef18Task1Data(LoggingMixin):
         dictionary_file = os.path.join(base_folder, "Hungarian_dictionary_UTF8.csv")
         return self._read_icd10_dictionary(dictionary_file, "utf-8")
 
+    def read_hu_test_certificates(self) -> DataFrame:
+        brutes_file = "data/test/HU/test/raw/corpus/CausesBrutes_HU_2.csv"
+        return self._read_test_data(brutes_file)
+
     # --------------------------------------------------------------------------------
 
     def read_fr_train_certificates(self) -> DataFrame:
@@ -96,6 +115,10 @@ class Clef18Task1Data(LoggingMixin):
         dictionary_file = os.path.join(base_folder, "Dictionnaire2006-2010.csv")
         return self._read_icd10_dictionary(dictionary_file, "utf-8")
 
+    def read_fr_test_certificates(self) -> DataFrame:
+        brutes_file = "data/test/FR/test/raw/corpus/CausesBrutes_FR_2.csv"
+        return self._read_test_data(brutes_file)
+
     # --------------------------------------------------------------------------------
 
     def read_all_con_dictionary(self) -> DataFrame:
@@ -146,7 +169,8 @@ class Clef18Task1Data(LoggingMixin):
         brutus_data = []
         for brutus_file in brutus_files:
             self.logger.info("Reading brutus file from %s", brutus_file)
-            brutus_data.append(pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"]))
+            brutus_data.append(pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"],
+                                           skipinitialspace=True))
             self.logger.info("Found %s death certificate entries", len(brutus_data[-1]))
 
 
@@ -177,6 +201,14 @@
 
         return dictionary_data
 
+    def _read_test_data(self, file: str) -> DataFrame:
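+        # Test corpora share the semicolon-separated, iso-8859-1 encoded layout of the training "CausesBrutes" files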
+        self.logger.info("Reading test certificates from %s", file)
+        test_data = pd.read_csv(file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"], skipinitialspace=True)
+        self.logger.info("Found %s test certificate lines.", len(test_data))
+
+        return test_data
+
 
 def check_multi_label_distribution(ds_name: str, certificate_df: DataFrame):
     print("Data set: ", ds_name)
diff --git a/code_mario/clef18_task1_emb1.py b/code_mario/clef18_task1_emb1.py
index 8d00d18..71be48f 100644
--- a/code_mario/clef18_task1_emb1.py
+++ b/code_mario/clef18_task1_emb1.py
@@ -1,4 +1,6 @@
 from gensim.models import FastText
+from sklearn.base import BaseEstimator
+from sklearn.svm import SVC
 
 from init import *
 
@@ -148,19 +150,21 @@ class Clef18Task1Emb1(Clef18Task1Base):
 
         return model
 
+    # ---------------------------------------------------------------------------------------------------------------------------------------
+
     def train_and_evaluate_classifiers(self, emb_model: Model, config: Emb1Configuration, target_labels: List) -> List[EvaluationResult]:
         self.logger.info("Start training and evaluation of classifier models")
 
+        cert_input = emb_model.inputs[0]
+        cert_rnn = Model(inputs=cert_input, outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model")
+
         self.logger.info("Building dictionary embeddings")
         dict_input = emb_model.inputs[1]
-        dict_rnn = Model(inputs=dict_input, outputs=emb_model.get_layer("dict_rnn").output, name="Dict-RNN-Model")
-        dict_inputs = pad_sequences(config.dict_df["Token_ids"].values, maxlen=config.max_dict_length, padding="post")
-        dict_embeddings = dict_rnn.predict(dict_inputs, verbose=1, batch_size=1)
+        #dict_rnn = Model(inputs=dict_input, outputs=emb_model.get_layer("dict_rnn").output, name="Dict-RNN-Model")
+        dict_inputs = pad_sequences(config.dict_df["Token_ids"].values, maxlen=config.max_cert_length, padding="post")
+        dict_embeddings = cert_rnn.predict(dict_inputs, verbose=1, batch_size=1)
 
         self.logger.info("Building train certificate embeddings")
-        cert_input = emb_model.inputs[0]
-        cert_rnn = Model(inputs=cert_input, outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model")
-
         train_cert_inputs = pad_sequences(config.train_cert_df["Token_ids"].values, maxlen=config.max_cert_length, padding="post")
         train_cert_embeddings = cert_rnn.predict(train_cert_inputs, verbose=1)
         self.logger.info("cert train input shape: %s", train_cert_embeddings.shape)
@@ -189,6 +193,41 @@
 
     # ---------------------------------------------------------------------------------------------------------------------------------------
 
+    def predict(self, emb_model: Model, classifier, conf: Emb1Configuration, test_df: DataFrame) -> DataFrame:
+        self.logger.info("Start preprocessing test data")
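+        # Reuse the tokenizer fitted at training time so the test word ids match the embedding vocabulary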
+        preprocessing_pipeline = Pipeline([
+            ("LowercaseText", pdu.to_lowercase("RawText")),
+            ("TokenizeText", pdu.keras_sequencing("RawText", "Word_ids", conf.keras_tokenizer, False))
+        ])
+
+        #test_df = Clef18Task1Data().read_it_train_certificates()
+
+        test_df = preprocessing_pipeline.fit_transform(test_df)
+        test_emb_inputs = pad_sequences(test_df["Word_ids"].values, maxlen=conf.max_cert_length, padding="post")
+        self.logger.info("Finished preprocessing of test data (shape: %s)", test_emb_inputs.shape)
+
+        self.logger.info("Start generation of embeddings")
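+        # Slice the certificate RNN encoder out of the embedding model; its output is the classifier input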
+        cert_input = emb_model.inputs[0]
+        cert_rnn = Model(inputs=cert_input, outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model")
+        test_cl_inputs = cert_rnn.predict(test_emb_inputs, 16, verbose=1)
+
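+        # Predict class indices, then map them back to ICD-10 codes via the training label encoder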
+        predictions = classifier.predict(test_cl_inputs)
+        predicted_labels = conf.label_encoders.code_encoder.inverse_transform(predictions)
+
+        result = DataFrame([{"DocID": id[1], "YearCoded": id[0], "LineID": id[2], "Rank": "",
+                             "StandardText": "", "ICD10": predicted_labels[i], "IntervalText": ""}
+                            for i, (id, _) in tqdm(enumerate(test_df.iterrows()), desc="build-result", total=len(test_df))],
+                           columns=["DocID", "YearCoded", "LineID", "Rank", "StandardText", "ICD10", "IntervalText"])
+
+        output_file = os.path.join(AppContext.default().output_dir, "predictions.csv")
+        result.to_csv(output_file, sep=";", index=False)
+        return result
+
+    # ---------------------------------------------------------------------------------------------------------------------------------------
+
     def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastTextModel, train_ratio: float, val_ratio: float,
                          strat_column: str, samples: int=None, stratified_splits: bool=False) -> Emb1Configuration:
 
@@ -362,10 +401,11 @@
     eval_classifier_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool)
     eval_classifier_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
 
-    eval_emb_classifer_parser = subparsers.add_parser("eval-emb-cl")
-    eval_emb_classifer_parser.add_argument("emb_model", help="Path to the embedding model to use")
-    eval_emb_classifer_parser.add_argument("train_conf", help="Path to the training configuration dump")
-    eval_emb_classifer_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"])
+    predict_parser = subparsers.add_parser("pred")
+    predict_parser.add_argument("emb_model", help="Path to the learned embedding model to use")
+    predict_parser.add_argument("cl_model", help="Path to the learned classifier model to use")
+    predict_parser.add_argument("train_conf", help="Path to the training configuration dump")
+    predict_parser.add_argument("lang", help="Language to predict on", choices=["it", "fr", "hu"])
 
     args = parser.parse_args()
 
@@ -375,10 +415,11 @@
     clef18_task1.save_arguments(args)
 
     clef_data = Clef18Task1Data()
-    dictionary = clef_data.read_dictionary_by_id(args.lang)
-    certificates = clef_data.read_train_certifcates_by_id(args.lang)
 
     if args.mode == "train-emb":
+        dictionary = clef_data.read_dictionary_by_id(args.lang)
+        certificates = clef_data.read_train_certifcates_by_id(args.lang)
+
         ft_embeddings = FastTextEmbeddings()
         ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
 
@@ -400,15 +441,28 @@
 
         embedding_model = clef18_task1.train_embedding_model(configuration, ft_model, neg_sampling_strategy, args.epochs, args.batch_size)
 
+        clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels)
+
     elif args.mode == "eval-cl":
         configuration = clef18_task1.reload_configuration(args.train_conf)
         embedding_model = clef18_task1.reload_embedding_model(args.emb_model)
 
-    elif args.mode == "eval-emb-cl":
-        configuration = clef18_task1.reload_configuration(args.train_conf)
+        clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels)
+
+    elif args.mode == "pred":
         embedding_model = clef18_task1.reload_embedding_model(args.emb_model)
+        classifier_model = clef18_task1.reload_classifier(args.cl_model)
+        configuration = clef18_task1.reload_configuration(args.train_conf)
+
+        test_certificates = clef_data.read_test_certificates_by_lang(args.lang)
+
+        ft_embeddings = FastTextEmbeddings()
+        #ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
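+        # NOTE: toy fastText model as a quick stand-in; re-enable the line above to load the real embeddings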
+        sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+        ft_model = FastTextModel("dummy", [FastText(sentences, min_count=1)])
 
-        clef18_task1.evaluate_embedding_classifier(embedding_model, configuration, certificates)
+        clef18_task1.predict(embedding_model, classifier_model, configuration, test_certificates)
 
     #eval_result = clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels)
     #clef18_task1.save_evaluation_results(eval_result)
-- 
GitLab