From a8ed4c2f0aa33be2396eada89dad813b5ad106aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20Sa=CC=88nger?= <mario.saenger@student.hu-berlin.de>
Date: Thu, 10 May 2018 09:30:11 +0200
Subject: [PATCH] Refactor FastTextModel into language-aware model classes

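Split FastTextModel into SingleLanguageFastTextModel,
MultiLanguageFastTextModel and MultiLanguageConcatenationFastTextModel
with a language-aware lookup() and vector_size() interface. Certificate
and dictionary texts are now tagged with their language code, the
additional French dictionary files (2014, 2015) are loaded, and the
certificate data can optionally be extended with dictionary entries for
unseen ICD10 codes (--extend_cert). The evaluation in
clef18_task1_base.py re-enables the full classifier line-up and skips
the cert-train evaluation set.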
---
 code_mario/clef18_task1_base.py |  92 +++++++-------
 code_mario/clef18_task1_data.py | 207 ++++++++++++++++++++------------
 code_mario/clef18_task1_emb1.py |  27 +++--
 code_mario/clef18_task1_emb2.py |   6 +-
 code_mario/ft_embeddings.py     |  96 +++++++++++++--
 code_mario/preprocessing.py     |  22 +++-
 6 files changed, 299 insertions(+), 151 deletions(-)

diff --git a/code_mario/clef18_task1_base.py b/code_mario/clef18_task1_base.py
index 0d4f860..d941363 100644
--- a/code_mario/clef18_task1_base.py
+++ b/code_mario/clef18_task1_base.py
@@ -72,57 +72,57 @@ class Clef18Task1Base(LoggingMixin):
 
         test_sets = [
             # ("dict", dict_embeddings, config.dict_df),
-            ("cert-train", conf.train_data, conf.train_df),
+            #("cert-train", conf.train_data, conf.train_df),
             ("cert-val", conf.val_data, conf.val_df),
             ("cert-test", conf.test_data, conf.test_df)
         ]
 
         named_classifiers = [
-            # ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()),
-            # ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")),
-            # ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)),
-            # ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)),
-            # ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)),
-            #("LinearSVM", lambda label, input_dim, output_dim, val_data: LinearSVC(max_iter=10000, verbose=1, random_state=42)),
-
-            ("DNN-200-Test", lambda label, input_dim, output_dim, val_data:
-                self.create_dnn_classifier("dnn-200-test", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
-                                            batch_normalization=False, dropout_rate=0.0, epochs=1, batch_size=16)),
-
-            # ("DNN-200", lambda label, input_dim, output_dim, val_data:
-            #     self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
-            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-            #
-            # ("DNN-300", lambda label, input_dim, output_dim, val_data:
-            #     self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
-            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-            #
-            # ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data:
-            #     self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
-            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
-            # ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data:
-            #     self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
-            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
-            #
-            # ("DNN-200-100", lambda label, input_dim, output_dim, val_data:
-            #     self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
-            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-            # ("DNN-200-200", lambda label, input_dim, output_dim, val_data:
-            #     self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
-            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-            # ("DNN-300-200", lambda label, input_dim, output_dim, val_data:
-            #     self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
-            #                                 batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
-            #
-            # ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data:
-            #     self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
-            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
-            # ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data:
-            #     self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
-            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
-            # ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data:
-            #     self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim,hidden_layer_sizes=[300, 200],
-            #                                 batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()),
+            ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")),
+            ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)),
+            ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)),
+            ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)),
+            # ("LinearSVM", lambda label, input_dim, output_dim, val_data: LinearSVC(max_iter=10000, verbose=1, random_state=42)),
+
+            # ("DNN-200-Test", lambda label, input_dim, output_dim, val_data:
+            #     self.create_dnn_classifier("dnn-200-test", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+            #                                 batch_normalization=False, dropout_rate=0.0, epochs=1, batch_size=16)),
+
+            ("DNN-200", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+
+            ("DNN-300", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+
+            ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+
+            ("DNN-200-100", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+            ("DNN-200-200", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+            ("DNN-300-200", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=16)),
+
+            ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
+            ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim,hidden_layer_sizes=[300, 200],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=16)),
 
             ('DU1', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="stratified")),
             ('DU2', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="most_frequent"))
diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py
index 8c36f54..7a149c5 100644
--- a/code_mario/clef18_task1_data.py
+++ b/code_mario/clef18_task1_data.py
@@ -2,6 +2,7 @@ import os
 from typing import List
 
 import pandas as pd
+from keras.preprocessing.text import Tokenizer, text_to_word_sequence
 
 from pandas import DataFrame
 from tqdm import tqdm
@@ -60,12 +61,12 @@ class Clef18Task1Data(LoggingMixin):
         calculees_file = os.path.join(base_folder, "CausesCalculees_IT_1.csv")
         brutes_file = os.path.join(base_folder, "CausesBrutes_IT_1.csv")
 
-        return self._read_certificates([calculees_file], [brutes_file])
+        return self._read_certificates([calculees_file], [brutes_file], "it")
 
     def read_it_dictionary(self) -> DataFrame:
         base_folder = "data/train/IT/training/raw/dictionaries"
         dictionary_file = os.path.join(base_folder, "dictionary_IT.csv")
-        return self._read_icd10_dictionary(dictionary_file, "iso-8859-1")
+        return self._read_icd10_dictionary([dictionary_file], "iso-8859-1", "it")
 
     def read_it_test_certificates(self) -> DataFrame:
         brutes_file = "data/test/IT/test/raw/corpus/CausesBrutes_IT_2.csv"
@@ -79,12 +80,12 @@ class Clef18Task1Data(LoggingMixin):
         calculees_file = os.path.join(base_folder, "CausesCalculees_HU_1.csv")
         brutes_file = os.path.join(base_folder, "CausesBrutes_HU_1.csv")
 
-        return self._read_certificates([calculees_file], [brutes_file])
+        return self._read_certificates([calculees_file], [brutes_file], "it")
 
     def read_hu_dictionary(self) -> DataFrame:
         base_folder = "data/train/HU/training/raw/dictionaries"
         dictionary_file = os.path.join(base_folder, "Hungarian_dictionary_UTF8.csv")
-        return self._read_icd10_dictionary(dictionary_file, "utf-8")
+        return self._read_icd10_dictionary([dictionary_file], "utf-8", "hu")
 
     def read_hu_test_certificates(self) -> DataFrame:
         brutes_file = "data/test/HU/test/raw/corpus/CausesBrutes_HU_2.csv"
@@ -108,14 +109,17 @@ class Clef18Task1Data(LoggingMixin):
             os.path.join(base_folder, "CausesBrutes_FR_2014.csv")
         ]
 
-        return self._read_certificates(calculees_files, brutes_files)
+        return self._read_certificates(calculees_files, brutes_files, "fr")
 
     def read_fr_dictionary(self) -> DataFrame:
-        # FIXME: Load other training files from 2011-2015!
-
         base_folder = "data/train/FR/training/aligned/dictionaries"
-        dictionary_file = os.path.join(base_folder, "Dictionnaire2006-2010.csv")
-        return self._read_icd10_dictionary(dictionary_file, "utf-8")
+        dictionary_files = [
+            os.path.join(base_folder, "Dictionnaire2006-2010.csv"),
+            os.path.join(base_folder, "Dictionnaire2014.csv"),
+            os.path.join(base_folder, "Dictionnaire2015.csv")
+        ]
+        return self._read_icd10_dictionary(dictionary_files, "utf-8", "fr")
 
     def read_fr_test_certificates(self) -> DataFrame:
         brutes_file = "data/test/FR/test/raw/corpus/CausesBrutes_FR_2.csv"
@@ -129,60 +133,13 @@ class Clef18Task1Data(LoggingMixin):
                           self.read_hu_dictionary()])
 
     def read_all_con_certificates(self) -> DataFrame:
-        return pd.concat([self.read_fr_train_certificates(),
-                          self.read_it_train_certificates(),
-                          self.read_hu_train_certificates()])
-
-    # --------------------------------------------------------------------------------
-
-    def filter_single_code_lines(self, certificate_df: DataFrame) -> DataFrame:
-        multi_code_lines = [key for key, group in groupby(certificate_df.index.values) if len(list(group)) > 1]
-        self.logger.info("Start filtering %s lines with multiple codes", len(multi_code_lines))
-
-        original_size = len(certificate_df)
-        certificate_df = certificate_df.drop(multi_code_lines)
-        self.logger.info("Filtered %s out of %s entries due to single code constraint", len(certificate_df), original_size)
-
-        return certificate_df
-
-    def add_masked_icd10_column(self, certificate_df: DataFrame, min_support: int, mask_code: str = "RARE-ICD10") -> DataFrame:
-        code_frequency_distribution = certificate_df["ICD10"].value_counts()
-        icd_masker = pdu.mask_icd10("ICD10", "ICD10_masked", code_frequency_distribution, min_support, mask_code)
-
-        certificate_df = icd_masker.fit_transform(certificate_df)
-
-        num_infrequent_codes = certificate_df["ICD10_masked"].value_counts()[mask_code]
-        self.logger.info("Added masked icd10 code column. Found %s codes with support less than %s", num_infrequent_codes, min_support)
-
-        return certificate_df
-
-    def down_sample_by_icd10_frequency(self, certificate_df: DataFrame, max_freq: int):
-        self.logger.info("Down sampled data set with %s entries", len(certificate_df))
-        icd10_codes = certificate_df["ICD10"].unique()
-
-        data_sets = []
-        for code in tqdm(icd10_codes,desc="down-sample", total=len(icd10_codes)):
-            entries_by_code = certificate_df.query("ICD10 == '%s'" % code)
-            if len(entries_by_code) > max_freq:
-                unique_texts = entries_by_code["RawText"].unique()
-
-                unique_entries = []
-                for text in unique_texts:
-                    unique_entries.append(entries_by_code.query("RawText == \"%s\"" % text)[0:1])
-
-                unique_entries.append(entries_by_code.sample(max(max_freq-len(unique_texts), 10)))
-                entries_by_code = pd.concat(unique_entries)
-
-            data_sets.append(entries_by_code)
-
-        sampled_df = pd.concat(data_sets)
-        sampled_df = sampled_df.sample(frac=1) # Reshuffle!
-        self.logger.info("Down sampled data set contains %s entries", len(sampled_df))
-        return sampled_df
+        all_certificates = pd.concat([self.read_fr_train_certificates(), self.read_it_train_certificates(), self.read_hu_train_certificates()])
+        self.logger.info("Found %s death certificate lines", len(all_certificates))
+        return all_certificates
 
     # --------------------------------------------------------------------------------
 
-    def _read_certificates(self, calculees_files: List[str], brutus_files: List[str]) -> DataFrame:
+    def _read_certificates(self, calculees_files: List[str], brutus_files: List[str], language: str) -> DataFrame:
         calculees_data = []
         for calculees_file in calculees_files:
             self.logger.info("Reading calculees file from %s", calculees_file)
@@ -210,12 +167,21 @@ class Clef18Task1Data(LoggingMixin):
         self.logger.info("Removed %s lines with ICD10 'nan'", num_unchecked_data - len(joined_data))
 
         joined_data = pdu.clean_text("RawText").fit_transform(joined_data)
+        joined_data["Lang"] = language
+
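+        # Prefix each token with the language code so that language-aware FastText models can resolve the correct embeddings later on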
+        word_tagger = pdu.tag_words_with_language("RawText", "RawText", "Lang")
+        joined_data = word_tagger.fit_transform(joined_data)
+
+        return joined_data[["RawText", "ICD10", "Lang"]]
 
-        return joined_data[["RawText", "ICD10"]]
+    def _read_icd10_dictionary(self, dictionary_files: List[str], encoding: str, lang: str) -> DataFrame:
+        dictionary_data = []
+        for dict_file in dictionary_files:
+            self.logger.info("Reading ICD10 dictionary from %s", dict_file)
+            dictionary_data.append(pd.read_csv(dict_file, sep=";", encoding=encoding,
+                                               skipinitialspace=True, error_bad_lines=False))
+        dictionary_data = pd.concat(dictionary_data)
 
-    def _read_icd10_dictionary(self, file: str, encoding: str) -> DataFrame:
-        self.logger.info("Reading ICD-10 dictionary from %s", file)
-        dictionary_data = pd.read_csv(file, sep=";", encoding=encoding, skipinitialspace=True)
         num_dictionary_entries = len(dictionary_data)
         self.logger.info("Found %s dictionary entries", num_dictionary_entries)
 
@@ -228,18 +194,87 @@ class Clef18Task1Data(LoggingMixin):
         dictionary_data.columns = ["ICD10", "Standardized", "DiagnosisText"]
 
         dictionary_data["ICD10"] = dictionary_data["ICD10"].astype(str)
-
         self.logger.info("Removed %s duplicates from dictionary", num_dictionary_entries - len(dictionary_data))
 
+        dictionary_data["Lang"] = lang
+
+        self.logger.info("Tag language %s on all words", lang)
+        word_tagger = pdu.tag_words_with_language("DiagnosisText", "DiagnosisText", "Lang")
+        dictionary_data = word_tagger.fit_transform(dictionary_data)
+
         return dictionary_data
 
     def _read_test_data(self, file: str) -> DataFrame:
         self.logger.info("Reading test certificates from %s", file)
-        test_data = pd.read_csv(file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"], skipinitialspace=True)
+        test_data = pd.read_csv(file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"],
+                                skipinitialspace=True, error_bad_lines=False)
         self.logger.info("Found %s test certificate lines.", len(test_data))
 
         return test_data
 
+    # --------------------------------------------------------------------------------
+
+    def filter_single_code_lines(self, certificate_df: DataFrame) -> DataFrame:
+        multi_code_lines = [key for key, group in groupby(certificate_df.index.values) if len(list(group)) > 1]
+        self.logger.info("Start filtering %s lines with multiple codes", len(multi_code_lines))
+
+        original_size = len(certificate_df)
+        certificate_df = certificate_df.drop(multi_code_lines)
+        self.logger.info("Filtered %s out of %s entries due to single code constraint", len(certificate_df), original_size)
+
+        return certificate_df
+
+    def add_masked_icd10_column(self, certificate_df: DataFrame, min_support: int, mask_code: str = "RARE-ICD10") -> DataFrame:
+        code_frequency_distribution = certificate_df["ICD10"].value_counts()
+        icd_masker = pdu.mask_icd10("ICD10", "ICD10_masked", code_frequency_distribution, min_support, mask_code)
+
+        certificate_df = icd_masker.fit_transform(certificate_df)
+
+        num_infrequent_codes = certificate_df["ICD10_masked"].value_counts()[mask_code]
+        self.logger.info("Added masked icd10 code column. Found %s codes with support less than %s", num_infrequent_codes, min_support)
+
+        return certificate_df
+
+    def down_sample_by_icd10_frequency(self, certificate_df: DataFrame, max_freq: int):
+        self.logger.info("Down sampled data set with %s entries", len(certificate_df))
+        icd10_codes = certificate_df["ICD10"].unique()
+
+        data_sets = []
+        for code in tqdm(icd10_codes, desc="down-sample", total=len(icd10_codes)):
+            entries_by_code = certificate_df.query("ICD10 == '%s'" % code)
+            if len(entries_by_code) > max_freq:
+                unique_texts = entries_by_code["RawText"].unique()
+
+                unique_entries = []
+                for text in unique_texts:
+                    unique_entries.append(entries_by_code.query("RawText == \"%s\"" % text)[0:1])
+
+                unique_entries.append(entries_by_code.sample(max(max_freq-len(unique_texts), 10)))
+                entries_by_code = pd.concat(unique_entries)
+
+            data_sets.append(entries_by_code)
+
+        sampled_df = pd.concat(data_sets)
+        sampled_df = sampled_df.sample(frac=1) # Reshuffle!
+        self.logger.info("Down sampled data set contains %s entries", len(sampled_df))
+        return sampled_df
+
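+    # Adds dictionary entries for ICD10 codes that never occur in the certificate data as additional training lines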
+    def extend_certificates_by_dictionaries(self, certificate_df: DataFrame, dictionary_df: DataFrame) -> DataFrame:
+        original_size = len(certificate_df)
+
+        dict_icd10_codes = dictionary_df["ICD10"].unique()
+        cert_icd10_codes = certificate_df["ICD10"].unique()
+
+        unseen_icd10_codes = [dict_icd10 for dict_icd10 in dict_icd10_codes if dict_icd10 not in cert_icd10_codes]
+        for unseen_icd10 in tqdm(unseen_icd10_codes, desc="extend-cert", total=len(unseen_icd10_codes)):
+            entries = dictionary_df.query("ICD10 == '%s'" % unseen_icd10)
+
+            for key, row in entries.iterrows():
+                certificate_df = certificate_df.append({"RawText": row["DiagnosisText"], "ICD10": row["ICD10"] }, ignore_index=True)
+
+        extended_size = len(certificate_df)
+        self.logger.info("Extended cert data set with %s from dictionary (%s in total)" % (extended_size - original_size, extended_size))
+        return certificate_df
 
 def check_multi_label_distribution(ds_name: str, certificate_df: DataFrame):
     print("Data set: ", ds_name)
@@ -344,30 +379,44 @@ if __name__ == "__main__":
     AppContext.initialize_by_app_name("Clef18Task1-Data")
 
     clef_task_data = Clef18Task1Data()
+
+    #all_cert = clef_task_data.read_all_con_certificates()
+    #check_label_distribution(all_cert)
+    #clef_task_data.down_sample_by_icd10_frequency(all_cert, 4000)
+
     it_certificates = clef_task_data.read_it_train_certificates()
-    it_dictionary = clef_task_data.read_it_dictionary()
 
-    check_label_distribution(it_certificates)
-    #it_certificates = clef_task_data.down_sample_by_icd10_frequency(it_certificates, 800)
-    check_label_distribution(it_certificates)
+    word_tagger = pdu.tag_words_with_language("RawText", "RawText", "Lang")
+    word_tagger.fit_transform(it_certificates)
 
-#    check_word_dictionary_overlap(it_certificates, it_dictionary, "data/dictionary/it-en.txt")
+    it_dictionary = clef_task_data.read_it_dictionary()
+    clef_task_data.extend_certificates_by_dictionaries(it_certificates, it_dictionary)
 
     hu_certificates = clef_task_data.read_hu_train_certificates()
     hu_dictionary = clef_task_data.read_hu_dictionary()
-
-    check_label_distribution(hu_certificates)
-    #hu_certificates = clef_task_data.down_sample_by_icd10_frequency(hu_certificates, 2750)
-    check_label_distribution(hu_certificates)
-    # check_word_dictionary_overlap(hu_certificates, hu_dictionary, "data/dictionary/hu-en.txt")
+    clef_task_data.extend_certificates_by_dictionaries(hu_certificates, hu_dictionary)
 
     fr_certificates = clef_task_data.read_fr_train_certificates()
     fr_dictionary = clef_task_data.read_fr_dictionary()
+    clef_task_data.extend_certificates_by_dictionaries(fr_certificates, fr_dictionary)
 
-    check_label_distribution(fr_certificates)
-    fr_certificates = clef_task_data.down_sample_by_icd10_frequency(fr_certificates, 2750)
-    check_label_distribution(fr_certificates)
+    #check_label_distribution(it_certificates)
+    #it_certificates = clef_task_data.down_sample_by_icd10_frequency(it_certificates, 800)
+    #check_label_distribution(it_certificates)
+
+    #hu_certificates = clef_task_data.read_hu_train_certificates()
+    #hu_dictionary = clef_task_data.read_hu_dictionary()
 
+    #check_label_distribution(hu_certificates)
+    #hu_certificates = clef_task_data.down_sample_by_icd10_frequency(hu_certificates, 2750)
+    #check_label_distribution(hu_certificates)
+    #check_word_dictionary_overlap(hu_certificates, hu_dictionary, "data/dictionary/hu-en.txt")
+
+    # fr_certificates = clef_task_data.read_fr_train_certificates()
+    # fr_dictionary = clef_task_data.read_fr_dictionary()
+    # check_label_distribution(fr_certificates)
+    # fr_certificates = clef_task_data.down_sample_by_icd10_frequency(fr_certificates, 2750)
+    # check_label_distribution(fr_certificates)
 
     # check_word_dictionary_overlap(fr_certificates, fr_dictionary, "data/dictionary/fr-en.txt")
 
diff --git a/code_mario/clef18_task1_emb1.py b/code_mario/clef18_task1_emb1.py
index dad8532..fabf803 100644
--- a/code_mario/clef18_task1_emb1.py
+++ b/code_mario/clef18_task1_emb1.py
@@ -26,7 +26,7 @@ from typing import Tuple, Dict, List, Callable
 
 from app_context import AppContext
 from clef18_task1_data import Clef18Task1Data
-from ft_embeddings import FastTextEmbeddings, FastTextModel
+from ft_embeddings import FastTextEmbeddings, FastTextModel, SingleLanguageFastTextModel
 from preprocessing import DataPreparationUtil as pdu
 from keras_extension import KerasUtil as ku
 
@@ -54,19 +54,19 @@ class Clef18Task1Emb1(Clef18Task1Base):
 
     def build_embedding_model(self, word_index: Dict, ft_model: FastTextModel, max_cert_length: int, max_dict_length: int):
         # TODO: Make hyper-parameter configurable!
-        # TODO: Think about using CNNs instead of RNNs since we can have multiple ICD-10 per line!
-
-        embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size))
+        embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size()))
         for word, i in word_index.items():
             try:
-                embedding_vector = ft_model.lookup(word)
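+                # Tokens were prefixed with their two-letter language code by LanguageWordTagger; split prefix and word before lookup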
+                language = word[0:2]
+                token = word[2:]
+                embedding_vector = ft_model.lookup(token, language)
                 if embedding_vector is not None:
                     # words not found in embedding index will be all-zeros.
                     embedding_matrix[i] = embedding_vector
             except KeyError:
                 self.logger.error("Can't create embedding for '%s'", word)
 
-        embedding = Embedding(len(word_index)+1, ft_model.vector_size, weights=[embedding_matrix], mask_zero=True)
+        embedding = Embedding(len(word_index)+1, ft_model.vector_size(), weights=[embedding_matrix], mask_zero=True)
 
         # Model 1: Learn a representation of a line originating from a death certificate
         input_certificate_line = Input((max_cert_length, ))
@@ -256,7 +256,7 @@ class Clef18Task1Emb1(Clef18Task1Base):
         dict_df, max_dict_length = self.prepare_dictionary_df(dict_df, "train", label_encoders, keras_tokenizer)
 
         return Emb1Configuration(train_cert_df, val_cert_df, test_cert_df, dict_df, max_cert_length, max_dict_length,
-                                 ft_model.vector_size, strat_column, label_encoders, keras_tokenizer)
+                                 ft_model.vector_size(), strat_column, label_encoders, keras_tokenizer)
 
     def prepare_certificate_df(self, certificate_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders,
                                keras_tokenizer: Tokenizer) -> Tuple[DataFrame, int]:
@@ -439,13 +439,14 @@ if __name__ == "__main__":
 
     train_emb_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
     train_emb_parser.add_argument("--down_sample", help="Maximal frequency of ICD10 code until start down sampling", default=None, type=int)
+    train_emb_parser.add_argument("--extend_cert", help="Indicates whether to extend the certificates with dict_entries", default=True, type=bool)
     train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_masked", type=str)
     train_emb_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool)
     train_emb_parser.add_argument("--strat_min_freq", help="Min frequency of an icd10 code to be an own class during stratification", default=8, type=int)
     train_emb_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
     train_emb_parser.add_argument("--single_only", help="Indicates whether just to use the single code lines from the data", default=False, type=bool)
 
-    train_emb_parser.add_argument("--max_pos_samples", help="Maximal number of positive samples to use", default=10, type=int)
+    train_emb_parser.add_argument("--max_pos_samples", help="Maximal number of positive samples to use", default=12, type=int)
     train_emb_parser.add_argument("--neg_sampling", help="Negative sampling strategy to use", default="ext1", choices=["def", "ext1"])
     train_emb_parser.add_argument("--num_neg_samples", help="Number of negative samples to use (default strategy)", default=75, type=int)
     train_emb_parser.add_argument("--num_neg_cha", help="Number of negative chapter samples to use (ext1 strategy)", default=10, type=int)
@@ -483,18 +484,22 @@ if __name__ == "__main__":
         dictionary = clef_data.read_dictionary_by_id(args.lang)
         certificates = clef_data.read_train_certifcates_by_id(args.lang)
 
+        # sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+        # ft_model = SingleLanguageFastTextModel("it", "it", FastText(sentences, min_count=1))
+
         ft_embeddings = FastTextEmbeddings()
         ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
 
-        #sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
-        #ft_model = FastTextModel("dummy", [FastText(sentences, min_count=1)])
-
         if args.down_sample:
             certificates = clef_data.down_sample_by_icd10_frequency(certificates, args.down_sample)
 
         if args.single_only:
             certificates = clef_data.filter_single_code_lines(certificates)
 
+        if args.extend_cert:
+            certificates = clef_data.extend_certificates_by_dictionaries(certificates, dictionary)
+
         if args.strat_splits:
             certificates = clef_data.add_masked_icd10_column(certificates, args.strat_min_freq)
 
diff --git a/code_mario/clef18_task1_emb2.py b/code_mario/clef18_task1_emb2.py
index 6cc3200..c5f79e9 100644
--- a/code_mario/clef18_task1_emb2.py
+++ b/code_mario/clef18_task1_emb2.py
@@ -155,7 +155,7 @@ class Clef18Task1Emb2(Clef18Task1Base):
         # TODO: Make hyper-parameter configurable!
         # TODO: Think about using CNNs instead of RNNs!
 
-        embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size))
+        embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size()))
         for word, i in word_index.items():
             try:
                 embedding_vector = ft_model.lookup(word)
@@ -165,7 +165,7 @@ class Clef18Task1Emb2(Clef18Task1Base):
             except KeyError:
                 self.logger.error("Can't create embedding for '%s'", word)
 
-        input_embedding = Embedding(len(word_index)+1, ft_model.vector_size, weights=[embedding_matrix], mask_zero=True)
+        input_embedding = Embedding(len(word_index)+1, ft_model.vector_size(), weights=[embedding_matrix], mask_zero=True)
 
         # Model 1: Learn a representation of a line originating from a death certificate or the dictionary
         input_certificate_line = Input((conf.max_length, ))
@@ -225,7 +225,7 @@ class Clef18Task1Emb2(Clef18Task1Base):
         self.logger.info("Start preparation of test cert data (%s instances)", len(test_df))
         test_df, _ = self.prepare_cert_dict_df(test_df, "test", label_encoders, keras_tokenizer)
 
-        return Configuration(train_df, val_df, test_df, max_length, ft_model.vector_size,
+        return Configuration(train_df, val_df, test_df, max_length, ft_model.vector_size(),
                              strat_column, label_encoders, keras_tokenizer)
 
     def prepare_cert_dict_df(self, cert_dict_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders,
diff --git a/code_mario/ft_embeddings.py b/code_mario/ft_embeddings.py
index f2b3b7f..47fb9b5 100644
--- a/code_mario/ft_embeddings.py
+++ b/code_mario/ft_embeddings.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from typing import List
+from typing import List, Dict
 from gensim.models import FastText
 
 from app_context import AppContext
@@ -9,18 +9,82 @@ from util import LoggingMixin
 
 class FastTextModel(LoggingMixin):
 
-    def __init__(self, name: str, ft_models: List[FastText]):
+    def __init__(self, name: str):
         LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)
         self.name = name
+
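+    # Base interface: subclasses implement the language-aware lookup() and report their embedding dimensionality via vector_size()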
+    def lookup(self, word: str, language: str):
+        raise NotImplementedError()
+
+    def vector_size(self):
+        raise NotImplementedError()
+
+
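+# FastText model for a single language; lookups for any other language log a warning and return a zero vector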
+class SingleLanguageFastTextModel(FastTextModel):
+
+    def __init__(self, name: str, lang: str, ft_model: FastText):
+        FastTextModel.__init__(self, name)
+        self.name = name
+        self.lang = lang.lower()
+        self.ft_model = ft_model
+
+    def lookup(self, word: str, lang: str):
+        if self.lang == lang.lower():
+            try:
+                # Return the embedding vector directly (list.append would return None here)
+                return self.ft_model[word]
+            except KeyError:
+                self.logger.warn("Can't create embedding for " + word)
+                return np.zeros(self.ft_model.vector_size)
+        else:
+            self.logger.warn("FastText model doesn't support language %s", lang)
+            return np.zeros(self.ft_model.vector_size)
+
+    def vector_size(self):
+        return self.ft_model.vector_size
+
+
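+# Holds one FastText model per language and dispatches lookup() by language code; all models are assumed to share the same vector size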
+class MultiLanguageFastTextModel(FastTextModel):
+
+    def __init__(self, name: str, ft_models: Dict[str, FastText]):
+        FastTextModel.__init__(self, name)
+        self.name = name
         self.ft_models = ft_models
-        self.vector_size = sum([ft_model.vector_size for ft_model in self.ft_models])
+        self.emb_size = next(iter(ft_models.values())).vector_size
 
-    def lookup(self, word: str):
+    def lookup(self, word: str, lang: str):
+        lang = lang.lower()
+
+        if lang in self.ft_models:
+            ft_model = self.ft_models[lang]
+
+            try:
+                return ft_model[word]
+            except KeyError:
+                self.logger.warn("Can't create embedding for " + word)
+                return np.zeros(ft_model.vector_size)
+        else:
+            self.logger.warn("FastText model doesn't support language %s", lang)
+            return np.zeros(self.emb_size)
+
+    def vector_size(self):
+        return self.emb_size
+
+
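+# Concatenates the embeddings of all wrapped FastText models into a single vector; the language argument is ignored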
+class MultiLanguageConcatenationFastTextModel(FastTextModel):
+
+    def __init__(self, name: str, ft_models: List[FastText]):
+        FastTextModel.__init__(self, name)
+        self.name = name
+        self.ft_models = ft_models
+        self.emb_size = sum([ft_model.vector_size for ft_model in ft_models])
+
+    def lookup(self, word: str, lang: str):
         embeddings = []
         for ft_model in self.ft_models:
             try:
                 embeddings.append(ft_model[word])
-            except KeyError as error:
+            except KeyError:
                 self.logger.warn("Can't create embedding for " + word)
                 embeddings.append(np.zeros(ft_model.vector_size))
 
@@ -29,6 +93,9 @@ class FastTextModel(LoggingMixin):
         else:
             return np.concatenate(embeddings)
 
+    def vector_size(self):
+        return self.emb_size
+
 
 class FastTextEmbeddings(LoggingMixin):
 
@@ -37,18 +104,25 @@ class FastTextEmbeddings(LoggingMixin):
 
     def load_embeddings_by_id(self, id: str) -> FastTextModel:
         if id == "it":
-            return FastTextModel("it", [self.load_it_embeddings()])
+            return SingleLanguageFastTextModel("it", "it", self.load_it_embeddings())
 
         elif id == "hu":
-            return FastTextModel("hu", [self.load_hu_embeddings()])
+            return SingleLanguageFastTextModel("hu", "hu", self.load_hu_embeddings())
 
         elif id == "fr":
-            return FastTextModel("fr", [self.load_fr_embeddings()])
+            return SingleLanguageFastTextModel("fr", "fr", self.load_fr_embeddings())
+
+        elif id == "multi":
+            return MultiLanguageFastTextModel("multi", {
+                "it" : self.load_it_embeddings(),
+                "hu" : self.load_hu_embeddings(),
+                "fr" : self.load_fr_embeddings()
+            })
 
         elif id == "all-con":
-            return FastTextModel("all-con", [self.load_fr_embeddings(),
-                                             self.load_it_embeddings(),
-                                             self.load_hu_embeddings()])
+            return MultiLanguageConcatenationFastTextModel("all-con", [self.load_fr_embeddings(),
+                                                                       self.load_it_embeddings(),
+                                                                       self.load_hu_embeddings()])
 
         else:
             raise AssertionError("Unsupported language: " + id)
diff --git a/code_mario/preprocessing.py b/code_mario/preprocessing.py
index ac7fad0..d474372 100644
--- a/code_mario/preprocessing.py
+++ b/code_mario/preprocessing.py
@@ -2,7 +2,7 @@ import re
 import numpy as np
 
 from gensim.models import FastText
-from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.text import Tokenizer, text_to_word_sequence
 from pandas import DataFrame
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.preprocessing import LabelEncoder
@@ -95,6 +95,10 @@ class DataPreparationUtil(object):
 
         return MapFunction(column, _clean)
 
+    @staticmethod
+    def tag_words_with_language(text_column: str, target_column: str, lang_column: str):
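+        # Builds a LanguageWordTagger that rewrites text_column into target_column with language-prefixed tokens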
+        return LanguageWordTagger(text_column, target_column, lang_column)
+
 
 class FitMixin(object):
 
@@ -240,3 +244,19 @@ class KerasSequencer(BaseEstimator, TransformerMixin):
         sequences = self.keras_tokenizer.texts_to_sequences(texts)
 
         return PandasUtil.append_column(data, self.target_column, sequences)
+
+
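+# Transformer that prefixes every token in text_column with the value of lang_column (e.g. "infarto" with language "it" becomes "itinfarto") and writes the result to target_column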
+class LanguageWordTagger(BaseEstimator, FitMixin, TransformerMixin):
+
+    def __init__(self, text_column: str, target_column: str, lang_column: str):
+        self.text_column = text_column
+        self.target_column = target_column
+        self.lang_column = lang_column
+
+    def transform(self, data: DataFrame, y=None):
+        def _tag_words(row):
+            language = row[self.lang_column]
+            return " ".join("%s%s" % (language, word) for word in text_to_word_sequence(row[self.text_column]))
+
+        data[self.target_column] = data.apply(_tag_words, axis=1)
+        return data
-- 
GitLab