diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py
index 7a149c50a4975b15e86087fcb081d8285cfd4999..d24c43a05c101cf8d35a11651d5726dfd9b8ad49 100644
--- a/code_mario/clef18_task1_data.py
+++ b/code_mario/clef18_task1_data.py
@@ -2,7 +2,6 @@ import os
 from typing import List
 
 import pandas as pd
-from keras.preprocessing.text import Tokenizer, text_to_word_sequence
 
 from pandas import DataFrame
 from tqdm import tqdm
@@ -80,7 +79,7 @@ class Clef18Task1Data(LoggingMixin):
         calculees_file = os.path.join(base_folder, "CausesCalculees_HU_1.csv")
         brutes_file = os.path.join(base_folder, "CausesBrutes_HU_1.csv")
 
-        return self._read_certificates([calculees_file], [brutes_file], "it")
+        return self._read_certificates([calculees_file], [brutes_file], "hu")
 
     def read_hu_dictionary(self) -> DataFrame:
         base_folder = "data/train/HU/training/raw/dictionaries"
@@ -169,9 +168,6 @@ class Clef18Task1Data(LoggingMixin):
         joined_data = pdu.clean_text("RawText").fit_transform(joined_data)
         joined_data["Lang"] = language
 
-        word_tagger = pdu.tag_words_with_language("RawText", "RawText", "Lang")
-        joined_data = word_tagger.fit_transform(joined_data)
-
         return joined_data[["RawText", "ICD10", "Lang"]]
 
     def _read_icd10_dictionary(self, dictionary_files: List[str], encoding: str, lang: str) -> DataFrame:
@@ -198,10 +194,6 @@ class Clef18Task1Data(LoggingMixin):
 
         dictionary_data["Lang"] = lang
 
-        self.logger.info("Tag language %s on all words", lang)
-        word_tagger = pdu.tag_words_with_language("DiagnosisText", "DiagnosisText", "Lang")
-        dictionary_data = word_tagger.fit_transform(dictionary_data)
-
         return dictionary_data
 
     def _read_test_data(self, file: str) -> DataFrame:
@@ -260,22 +252,108 @@
         return sampled_df
 
     def extend_certificates_by_dictionaries(self, certificate_df: DataFrame, dictionary_df: DataFrame) -> DataFrame:
+        self.logger.info("Start extending certificate data set with dictionary entries (original size: %s)", len(certificate_df))
         original_size = len(certificate_df)
 
         dict_icd10_codes = dictionary_df["ICD10"].unique()
         cert_icd10_codes = certificate_df["ICD10"].unique()
 
         unseen_icd10_codes = [dict_icd10 for dict_icd10 in dict_icd10_codes if dict_icd10 not in cert_icd10_codes]
-        for unseen_icd10 in tqdm(unseen_icd10_codes, desc="extend-cert", total=len(unseen_icd10_codes)):
-            entries = dictionary_df.query("ICD10 == '%s'" % unseen_icd10)
 
-            for key, row in entries.iterrows():
-                certificate_df = certificate_df.append({"RawText": row["DiagnosisText"], "ICD10": row["ICD10"] }, ignore_index=True)
+        unseen_mask = dictionary_df["ICD10"].isin(unseen_icd10_codes)
+        lines_with_unseen_codes = dictionary_df.loc[unseen_mask]
+
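+        # Collect the entries with unseen codes in an empty frame that mirrors the certificate columns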
+        new_rows = certificate_df.copy().iloc[0:0]
+        for i, row in tqdm(lines_with_unseen_codes.iterrows(), desc="extend-cert", total=len(lines_with_unseen_codes)):
+            new_rows = new_rows.append({"RawText": row["DiagnosisText"], "ICD10": row["ICD10"], "Lang": row["Lang"] }, ignore_index=True)
+
+        certificate_df = pd.concat([certificate_df, new_rows])
+        extended_size = len(certificate_df)
 
-        extended_size = len(certificate_df)
         self.logger.info("Extended cert data set with %s from dictionary (%s in total)" % (extended_size - original_size, extended_size))
         return certificate_df
 
+    def remove_duplicates_from_certificates(self, certificate_df: DataFrame) -> DataFrame:
+        self.logger.info("Start removing duplicates from certificate data set (size: %s)", len(certificate_df))
+
+        cleaned_cert_df = certificate_df.drop_duplicates(subset=["RawText", "ICD10"])
+        self.logger.info("Removed %s duplicates from certificate data set (new size: %s)",
+                         len(certificate_df) - len(cleaned_cert_df), len(cleaned_cert_df))
+
+        return cleaned_cert_df
+
+    def split_multi_code_lines(self, certificate_df: DataFrame) -> DataFrame:
+        self.logger.info("Start splitting multi code lines in certificate data set (size: %s)", len(certificate_df))
+        duplicate_ids = certificate_df.index.duplicated(keep=False)
+        lines_with_multiple_codes = certificate_df[duplicate_ids].index.values
+
+        mask_multicode = certificate_df.index.isin(lines_with_multiple_codes)
+        singlecode_rows = certificate_df[~mask_multicode]
+        multicode_rows = certificate_df[mask_multicode]
+
+        last_index = -1
+        text_pos = 0
+        new_rows = singlecode_rows.copy().iloc[0:0]
+        for index, row in tqdm(multicode_rows.iterrows(), desc="split-multi", total=len(multicode_rows)):
+            split = str(row["RawText"]).split(",")
+
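+            # The same index occurs once per assigned ICD10 code; text_pos counts how often the
+            # index has been seen so far and selects the matching comma-separated text segment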
+            if last_index != index:
+                text_pos = 0
+            else:
+                text_pos = text_pos + 1
+
+            if text_pos < len(split):
+                row["RawText"] = split[text_pos]
+                new_rows = new_rows.append(row)
+            else:
+                new_rows = new_rows.append(row)
+            last_index = index
+
+        result = pd.concat([singlecode_rows, new_rows])
+        self.logger.info("Finished multi code line splitting. Adding %s new rows (new size: %s)", len(new_rows), len(result))
+        return result
+
+    def duplicate_less_frequent(self, certificate_df: DataFrame, min_freq: int) -> DataFrame:
+        self.logger.info("Start duplicating less frequent ICD10 code entries (size: %s)", len(certificate_df))
+
+        code_counts = certificate_df["ICD10"].value_counts()
+        less_frequent_codes = set([code for code, freq in code_counts.iteritems() if freq < min_freq])
+
+        less_frequent_mask = certificate_df["ICD10"].isin(less_frequent_codes)
+        less_frequent_rows = certificate_df[less_frequent_mask]
+
+        new_rows = certificate_df.copy().iloc[0:0]
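+        # Oversample each rare code (drawing with replacement) until it reaches min_freq occurrences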
+        for code in tqdm(less_frequent_codes, desc="build-dup", total=len(less_frequent_codes)):
+            num_duplicates = min_freq - code_counts[code]
+            duplicates = less_frequent_rows.query("ICD10 == '%s'" % code).sample(num_duplicates, replace=True)
+            new_rows = pd.concat([new_rows, duplicates])
+
+        result = pd.concat([certificate_df, new_rows])
+        self.logger.info("Added %s duplicates to data set to gurantee min frequency %s (new size: %s)",
+                         len(new_rows), min_freq, len(result))
+
+        return result
+
+    def language_tag_data(self, data_set: DataFrame, text_column: str, lang_column: str) -> DataFrame:
+        self.logger.info("Start word tagging column %s with language from %s (size: %s)", text_column, lang_column, len(data_set))
+
+        word_tagger = pdu.tag_words_with_language(text_column, text_column, lang_column)
+        data_set = word_tagger.fit_transform(data_set)
+
+        self.logger.info("Finished word tagging data")
+        return data_set
+
+    def filter_nan_texts(self, certificate_df: DataFrame) -> DataFrame:
+        self.logger.info("Start filtering nan texts (size: %s)", len(certificate_df))
+        nan_mask = certificate_df["RawText"].isin(["nan"])
+        certificate_df = certificate_df[~nan_mask]
+        self.logger.info("Finished filtering nan texts (size: %s)", len(certificate_df))
+        return certificate_df
+
 def check_multi_label_distribution(ds_name: str, certificate_df: DataFrame):
     print("Data set: ", ds_name)
 
@@ -385,20 +459,29 @@ if __name__ == "__main__":
     #clef_task_data.down_sample_by_icd10_frequency(all_cert, 4000)
 
     it_certificates = clef_task_data.read_it_train_certificates()
-
-    word_tagger = pdu.tag_words_with_language("RawText", "RawText", "Lang")
-    word_tagger.fit_transform(it_certificates)
-
     it_dictionary = clef_task_data.read_it_dictionary()
-    clef_task_data.extend_certificates_by_dictionaries(it_certificates, it_dictionary)
+
+    it_certificates = clef_task_data.extend_certificates_by_dictionaries(it_certificates, it_dictionary)
+    it_certificates = clef_task_data.remove_duplicates_from_certificates(it_certificates)
+    it_certificates = clef_task_data.split_multi_code_lines(it_certificates)
+    it_certificates = clef_task_data.duplicate_less_frequent(it_certificates, 4)
+    print("IT: ", len(it_certificates))
 
     hu_certificates = clef_task_data.read_hu_train_certificates()
     hu_dictionary = clef_task_data.read_hu_dictionary()
-    clef_task_data.extend_certificates_by_dictionaries(hu_certificates, hu_dictionary)
+    hu_certificates = clef_task_data.extend_certificates_by_dictionaries(hu_certificates, hu_dictionary)
+    hu_certificates = clef_task_data.remove_duplicates_from_certificates(hu_certificates)
+    hu_certificates = clef_task_data.split_multi_code_lines(hu_certificates)
+    hu_certificates = clef_task_data.duplicate_less_frequent(hu_certificates, 4)
+    print("HU: ", len(hu_certificates))
 
     fr_certificates = clef_task_data.read_fr_train_certificates()
     fr_dictionary = clef_task_data.read_fr_dictionary()
-    clef_task_data.extend_certificates_by_dictionaries(fr_certificates, fr_dictionary)
+    fr_certificates = clef_task_data.extend_certificates_by_dictionaries(fr_certificates, fr_dictionary)
+    fr_certificates = clef_task_data.remove_duplicates_from_certificates(fr_certificates)
+    fr_certificates = clef_task_data.split_multi_code_lines(fr_certificates)
+    fr_certificates = clef_task_data.duplicate_less_frequent(fr_certificates, 4)
+    print("FR: ", len(fr_certificates))
 
     #check_label_distribution(it_certificates)
     #it_certificates = clef_task_data.down_sample_by_icd10_frequency(it_certificates, 800)
diff --git a/code_mario/clef18_task1_emb1.py b/code_mario/clef18_task1_emb1.py
index fabf803ccdb9b6dfe68f4b976741c7fd01b85334..b3e58c388b9ce9e7dd6fef510d55c76088bb1a73 100644
--- a/code_mario/clef18_task1_emb1.py
+++ b/code_mario/clef18_task1_emb1.py
@@ -89,8 +89,8 @@ class Clef18Task1Emb1(Clef18Task1Base):
 
         return model
 
-    def train_embedding_model(self, config: Emb1Configuration, ft_model: FastTextModel, max_pos_samples: int, neg_sampling_strategy: Callable,
-                              epochs: int, batch_size: int, workers: int, chunk_size: int) -> Model:
+    def train_embedding_model(self, config: Emb1Configuration, ft_model: FastTextModel, max_pos_samples: int,
+                              neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model:
         self.logger.info("Start building embedding model")
         model = self.build_embedding_model(config.keras_tokenizer.word_index, ft_model, config.max_cert_length, config.max_dict_length)
         model.summary(print_fn=self.logger.info)
@@ -227,11 +227,7 @@ class Clef18Task1Emb1(Clef18Task1Base):
     # ---------------------------------------------------------------------------------------------------------------------------------------
 
     def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastTextModel, train_ratio: float, val_ratio: float,
-                         strat_column: str, samples: int=None, stratified_splits: bool=False) -> Emb1Configuration:
-
-        if samples:
-            self.logger.info("Sampling %s instances", samples)
-            cert_df = cert_df.sample(samples, random_state=42)
+                         strat_column: str, stratified_splits: bool=False) -> Emb1Configuration:
 
         self.logger.info("Splitting certificate lines into train and evaluation data set")
         train_cert_df, evaluation_cert_df = self.split_train_test(cert_df, train_ratio, stratified_splits, strat_column)
@@ -429,19 +425,17 @@ if __name__ == "__main__":
     subparsers = parser.add_subparsers(dest="mode")
 
     train_emb_parser = subparsers.add_parser("train-emb")
-    train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu", "all-con"])
+    train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu", "ac", "am"])
     train_emb_parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
     train_emb_parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
-    train_emb_parser.add_argument("--workers", help="Number of threads during pair building", default=4, type=int)
-    train_emb_parser.add_argument("--slice_size", help="Number of cert entries to be handled by one thread during pair building", default=1000, type=int)
     train_emb_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
-    train_emb_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.3, type=float)
+    train_emb_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.5, type=float)
 
     train_emb_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
+    train_emb_parser.add_argument("--min_freq", help="Minimal number of instances per ICD10 code", default=10, type=int)
     train_emb_parser.add_argument("--down_sample", help="Maximal frequency of ICD10 code until start down sampling", default=None, type=int)
-    train_emb_parser.add_argument("--extend_cert", help="Indicates whether to extend the certificates with dict_entries", default=True, type=bool)
-    train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_masked", type=str)
-    train_emb_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool)
+    train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10", type=str)
+    train_emb_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=True, type=bool)
     train_emb_parser.add_argument("--strat_min_freq", help="Min frequency of an icd10 code to be an own class during stratification", default=8, type=int)
     train_emb_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
     train_emb_parser.add_argument("--single_only", help="Indicates whether just to use the single code lines from the data", default=False, type=bool)
@@ -457,12 +451,7 @@ if __name__ == "__main__":
     eval_classifier_parser = subparsers.add_parser("eval-cl")
     eval_classifier_parser.add_argument("emb_model", help="Path to the embedding model to use")
     eval_classifier_parser.add_argument("train_conf", help="Path to the training configuration dump")
-    eval_classifier_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"])
-    eval_classifier_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
-    eval_classifier_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.4, type=float)
-    eval_classifier_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
-    eval_classifier_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_masked", type=str)
-    eval_classifier_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool)
+    eval_classifier_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu", "ac", "am"])
     eval_classifier_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
 
     predict_parser = subparsers.add_parser("pred")
@@ -484,34 +473,41 @@
         dictionary = clef_data.read_dictionary_by_id(args.lang)
         certificates = clef_data.read_train_certifcates_by_id(args.lang)
 
-        # sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
-        # ft_model = SingleLanguageFastTextModel("it", "it", FastText(sentences, min_count=1))
+        sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+        ft_model = SingleLanguageFastTextModel("hu", "hu", FastText(sentences, min_count=1))
 
         ft_embeddings = FastTextEmbeddings()
-        ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
+        ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
+
+        certificates = clef_data.extend_certificates_by_dictionaries(certificates, dictionary)
+        certificates = clef_data.remove_duplicates_from_certificates(certificates)
+        certificates = clef_data.filter_nan_texts(certificates)
+
+        if args.lang == "it" or args.lang == "hu":
+            certificates = clef_data.split_multi_code_lines(certificates)
 
-        if args.down_sample:
-            certificates = clef_data.down_sample_by_icd10_frequency(certificates, args.down_sample)
+        certificates = clef_data.language_tag_data(certificates, "RawText", "Lang")
 
-        if args.single_only:
-            certificates = clef_data.filter_single_code_lines(certificates)
+        certificates = clef_data.remove_duplicates_from_certificates(certificates)
+        certificates = clef_data.duplicate_less_frequent(certificates, args.min_freq)
 
-        if args.extend_cert:
-            certificates = clef_data.extend_certificates_by_dictionaries(certificates, dictionary)
-            #certificates = certificates
+        if args.samples:
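+            # Sample ICD10 values and keep every certificate line whose code is among them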
+            icd10_sample = certificates["ICD10"].sample(args.samples, random_state=42).values
+            sample_mask = certificates["ICD10"].isin(icd10_sample)
+            certificates = certificates[sample_mask]
 
-        if args.strat_splits:
-            certificates = clef_data.add_masked_icd10_column(certificates, args.strat_min_freq)
+        dictionary = clef_data.language_tag_data(dictionary, "DiagnosisText", "Lang")
 
-        configuration = clef18_task1.prepare_data_set(certificates, dictionary, ft_model, args.train_ratio, args.val_ratio, args.strat_column,
-                                                      args.samples, args.strat_splits)
+        configuration = clef18_task1.prepare_data_set(certificates, dictionary, ft_model, args.train_ratio,
+                                                      args.val_ratio, args.strat_column, args.strat_splits)
         clef18_task1.save_configuration(configuration)
 
         neg_sampling = NegativeSampling()
         neg_sampling_strategy = neg_sampling.get_strategy_by_name(args.neg_sampling, args)
 
         embedding_model = clef18_task1.train_embedding_model(configuration, ft_model, args.max_pos_samples, neg_sampling_strategy,
-                                                             args.epochs, args.batch_size, args.workers, args.slice_size)
+                                                             args.epochs, args.batch_size)
 
         clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels)
 
diff --git a/code_mario/preprocessing.py b/code_mario/preprocessing.py
index d474372548f0c08eccb008c5058e1a464c80bf69..45b540db2fbe180913d1eefaea1dadbcb2290951 100644
--- a/code_mario/preprocessing.py
+++ b/code_mario/preprocessing.py
@@ -256,7 +256,9 @@
     def transform(self, data: DataFrame, y=None):
         def _tag_words(row):
             language = row[self.lang_column]
-            return " ".join("%s%s" % (language, word) for word in text_to_word_sequence(row[self.text_column]))
+            result = " ".join("%s%s" % (language, word) for word in text_to_word_sequence(str(row[self.text_column])))
+            return result
 
         data[self.target_column] = data.apply(_tag_words, axis=1)
         return data