diff --git a/code_mario/clef18_task1.py b/code_mario/clef18_task1.py
deleted file mode 100644
index 812579cfe7746b72e5e71e18fd549e2aa7f6da7c..0000000000000000000000000000000000000000
--- a/code_mario/clef18_task1.py
+++ /dev/null
@@ -1,357 +0,0 @@
-import argparse
-import numpy as np
-import pandas as pd
-import os
-
-from gensim.models import FastText
-from keras import Input, Model
-from keras.layers import Bidirectional, Dense, Dot, LSTM
-from pandas import DataFrame
-from sklearn.dummy import DummyClassifier
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.linear_model import SGDClassifier
-from sklearn.metrics import f1_score, accuracy_score
-from sklearn.model_selection import ShuffleSplit
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.pipeline import Pipeline
-from sklearn.svm import LinearSVC
-from sklearn.tree import DecisionTreeClassifier
-from tqdm import tqdm
-from typing import Tuple
-
-from app_context import AppContext
-from clef18_task1_data import Clef18Task1Data
-from dnn_classifiers import NeuralNetworkClassifiers
-from ft_embeddings import FastTextEmbeddings
-from preprocessing import DataPreparationUtil as pdu
-from util import LoggingMixin
-
-
-class TrainingConfiguration(object):
-
-    def __init__(self, train_cert_df: DataFrame, val_cert_df: DataFrame, dict_df: DataFrame,
-                 max_cert_length: int, max_dict_length: int, ft_embedding_size: int):
-        self.train_cert_df = train_cert_df
-        self.val_cert_df = val_cert_df
-        self.dict_df = dict_df
-        self.max_cert_length = max_cert_length
-        self.max_dict_length = max_dict_length
-        self.ft_embedding_size = ft_embedding_size
-
-
-class Clef18Task1(LoggingMixin):
-
-    def __init__(self):
-        LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)
-
-    def train_embedding_model(self, train_conf: TrainingConfiguration, neg_samples: int, epochs: int, batch_size: int) -> Model:
-        self.logger.info("Start model training procedure")
-
-        self.logger.info("Start building training pairs")
-        train_pair_data = self.build_pairs(train_conf.train_cert_df, train_conf.dict_df, neg_samples)
-        print(train_pair_data["Label"].value_counts())
-
-        self.logger.info("Start building training matrices")
-        cert_matrix, dict_matrix, labels = self.build_matrices(train_pair_data, train_conf.max_cert_length,
-                                                               train_conf.max_dict_length, train_conf.ft_embedding_size)
-
-        self.logger.info("Start building model")
-        model = self.build_embedding_model(train_conf.ft_embedding_size, train_conf.max_cert_length, train_conf.max_dict_length)
-
-        self.logger.info("Start training of models")
-        model.fit([cert_matrix, dict_matrix], labels, epochs=epochs, batch_size=batch_size)
-
-        model_file = os.path.join(AppContext.default().output_dir, "model.h5")
-
-        self.logger.info("Saving model to %s", model_file)
-        model.save(model_file)
-
-        ## ----------------------------------------------------------------------------------------------------------
-
-        if train_conf.val_cert_df is not None and len(train_conf.val_cert_df) > 0:
-            self.logger.info("Start evaluation of model!")
-
-            self.logger.info("Start creation of test pairs")
-            test_pair_data = self.build_pairs(train_conf.val_cert_df, train_conf.dict_df, neg_samples)
-
-            self.logger.info("Start building test matrices")
-            test_cert_matrix, test_dict_matrix, gold_labels = self.build_matrices(
-                  test_pair_data, train_conf.max_cert_length, train_conf.max_dict_length, train_conf.ft_embedding_size)
-
-            self.logger.info("Start prediction of test labels")
-            pred_labels = model.predict([test_cert_matrix, test_dict_matrix], verbose=1)
-            pred_labels = (pred_labels > 0.5).astype(float)
-
-            f1_value = f1_score(gold_labels, pred_labels)
-            acc_value = accuracy_score(gold_labels, pred_labels)
-
-            self.logger.info("Result: f1_score= %s | acc_score= %s", f1_value, acc_value)
-
-        return model
-
-    def train_knn_classifier(self, emb_model: Model, data_set: TrainingConfiguration):
-        self.logger.info("Start classifier training")
-        label_column = "ICD10_chapter_encoded"
-
-        self.logger.info("Building dictionary embeddings")
-        dict_input = emb_model.inputs[1]
-        dict_rnn = Model(inputs=dict_input, outputs=emb_model.get_layer("dict_rnn").output, name="Dict-RNN-Model")
-        dict_matrix = self.build_rnn_input(data_set.dict_df, "FtMatrix", dict_input.shape[1].value, dict_input.shape[2].value)
-        dict_embeddings = dict_rnn.predict(dict_matrix, verbose=1)
-
-        self.logger.info("Building certificate embeddings")
-        cert_input = emb_model.inputs[0]
-        cert_rnn = Model(inputs=cert_input, outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model")
-        cert_matrix = self.build_rnn_input(data_set.train_cert_df, "FtMatrix", cert_input.shape[1].value, cert_input.shape[2].value)
-        cert_embeddings = cert_rnn.predict(cert_matrix, verbose=1)
-
-        self.logger.info("Learning K-nearest-neighbor classifier")
-        classifier = KNeighborsClassifier(metric='cosine')
-        classifier.fit(dict_embeddings, data_set.dict_df[label_column].values)
-
-        self.logger.info("Start evaluation")
-        cert_pred = classifier.predict(cert_embeddings)
-
-        acc_score = accuracy_score(data_set.train_cert_df[label_column].values, cert_pred)
-        self.logger.info("KNN Accuracy: %s", acc_score)
-
-    def train_classifiers(self, emb_model: Model, data_set: TrainingConfiguration):
-        self.logger.info("Start classifier evaluation")
-        label_column = "ICD10_chapter_encoded"
-
-        cert_input = emb_model.inputs[0]
-        cert_rnn = Model(inputs=cert_input, outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model")
-
-        self.logger.info("Building training certificate embeddings")
-        train_cert_matrix = self.build_rnn_input(data_set.train_cert_df, "FtMatrix", cert_input.shape[1].value, cert_input.shape[2].value)
-        train_cert_embeddings = cert_rnn.predict(train_cert_matrix, verbose=1)
-
-        self.logger.info("Building validation certificate embeddings")
-        val_cert_matrix = self.build_rnn_input(data_set.val_cert_df, "FtMatrix", cert_input.shape[1].value, cert_input.shape[2].value)
-        val_cert_embeddings = cert_rnn.predict(val_cert_matrix, verbose=1)
-
-        num_classes = data_set.train_cert_df[label_column].unique().size
-
-        named_classifiers = [
-            ("SGD", SGDClassifier(verbose=1, random_state=42)),
-            ("DT", DecisionTreeClassifier(random_state=42)),
-            ("RF", RandomForestClassifier(verbose=1, random_state=42)),
-            ("LinearSVM", LinearSVC(max_iter=250000, verbose=True, random_state=42)),
-            ("DNN-1-<300>", NeuralNetworkClassifiers.dense_network(cert_rnn.output.shape[1].value, num_classes, [300], False, 0.0, 50, 2)),
-            ("DNN-1-<200>", NeuralNetworkClassifiers.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], False, 0.0, 50, 2)),
-            ("DNN-1-<300>-BN-DO", NeuralNetworkClassifiers.dense_network(cert_rnn.output.shape[1].value, num_classes, [300], True, 0.5, 50, 2)),
-            ("DNN-1-<200>-BN-DO", NeuralNetworkClassifiers.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], True, 0.5, 50, 2)),
-            ("DNN-2-<200, 100>", NeuralNetworkClassifiers.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 100], False, 0.0, 30, 2)),
-            ("DNN-2-<200, 200>", NeuralNetworkClassifiers.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], False, 0.0, 30, 2)),
-            ("DNN-2-<200, 100>-BN-DO", NeuralNetworkClassifiers.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 100], True, 0.5, 30, 2)),
-            ("DNN-2-<200, 200>-BN-DO", NeuralNetworkClassifiers.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], True, 0.5, 30, 2)),
-            ('DU1', DummyClassifier(strategy="stratified")),
-            ('DU2', DummyClassifier(strategy="most_frequent"))
-        ]
-
-        for name, classifier in named_classifiers:
-            self.logger.info("Start training of classifier %s", name)
-            classifier.fit(train_cert_embeddings, data_set.train_cert_df[label_column].values)
-
-            train_prediction = classifier.predict(train_cert_embeddings)
-            train_acc_score = accuracy_score(data_set.train_cert_df[label_column].values, train_prediction)
-
-            val_prediction = classifier.predict(val_cert_embeddings)
-            val_acc_score = accuracy_score(data_set.val_cert_df[label_column].values, val_prediction)
-
-            self.logger.info("Evaluation result %s: train_acc=%s | val_acc=%s", name, train_acc_score, val_acc_score)
-
-    def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastText, val_ratio: float, samples: int=None) -> TrainingConfiguration:
-        if samples:
-            print("Sampling %s instances" % samples)
-            cert_df = cert_df.sample(samples, random_state=42)
-
-        self.logger.info("Splitting certificate lines into train and validation")
-        train_cert_df, val_cert_df = self.split_train_test(cert_df, val_ratio)
-        self.logger.info("Finished splitting: train=%s instances, test=%s instances", len(train_cert_df), len(val_cert_df))
-
-        self.logger.info("Start preparation of training cert data (%s instances)", len(train_cert_df))
-        train_cert_df, max_cert_length = self.prepare_certificate_df(train_cert_df, ft_model)
-
-        self.logger.info("Start preparation of validation cert data (%s instances)", len(val_cert_df))
-        val_cert_df, _ = self.prepare_certificate_df(val_cert_df, ft_model, max_cert_length)
-
-        self.logger.info("Start preparation of dictionary data (%s instances)", len(dict_df))
-        dict_df, max_dict_length = self.prepare_dictionary_df(dict_df, ft_model)
-
-        return TrainingConfiguration(train_cert_df, val_cert_df, dict_df, max_cert_length, max_dict_length, ft_model.vector_size)
-
-    def split_train_test(self, certificate_df: DataFrame, test_ratio: float) -> Tuple[DataFrame, DataFrame]:
-        # FIXME: Use stratified shuffling!
-        #splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=42)
-        #split = splitter.split(certificate_df, certificate_df["ICD10"])
-
-        splitter = ShuffleSplit(n_splits=1, test_size=test_ratio, random_state=42)
-        split = splitter.split(certificate_df)
-
-        for train_indices, test_indices in split:
-            training_data = certificate_df.iloc[train_indices]
-            test_data = certificate_df.iloc[test_indices]
-
-        return training_data, test_data
-
-    def build_embedding_model(self, ft_embedding_size: int, max_cert_length: int, max_dict_length: int):
-        # TODO: Think about building a embedding layer
-        # TODO: Make hyper-parameter configurable!
-        # TODO: Think about using CNNs instead of RNNs since we can have multiple ICD-10 per line!
-        # TODO: Think about
-
-        # Model 1: Learn a representation of a line originating from a death certificate
-        input_certificate_line = Input((max_cert_length, ft_embedding_size))
-        certificate_rnn = Bidirectional(LSTM(200), name="cert_rnn")(input_certificate_line)
-
-        # Model 2: Learn a representation of a line in the ICD-10 dictionary (~ DiagnosisText)
-        input_dictionary_line = Input((max_dict_length, ft_embedding_size))
-        dictionary_rnn = Bidirectional(LSTM(200), name="dict_rnn")(input_dictionary_line)
-
-        # Calculate similarity between both representations
-        dot_product = Dot(axes=1, normalize=True)([certificate_rnn, dictionary_rnn])
-
-        output = Dense(1, activation='sigmoid')(dot_product)
-
-        # Create the primary training model
-        model = Model(inputs=[input_certificate_line, input_dictionary_line], outputs=output, name="Cert/Dict-Embedding-Model")
-        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])
-
-        return model
-
-    def prepare_certificate_df(self, certificate_df: DataFrame, ft_model: FastText, max_length: int=None) -> Tuple[DataFrame, int]:
-        certificate_pipeline = Pipeline([
-            ("Encode-ICD10-codes", pdu.encode_labels("ICD10", "ICD10_encoded")),
-
-            ("Extract-ICD10-chapter", pdu.extract_icd10_chapter("ICD10", "ICD10_chapter")),
-            ("Encode-ICD10-chapter", pdu.encode_labels("ICD10_chapter", "ICD10_chapter_encoded")),
-
-            ("LowercaseText", pdu.to_lowercase("RawText")),
-            ("TokenizeDiagnosis", pdu.tokenize("RawText", "Tokens")),
-            ("CountTokens", pdu.count_values("Tokens", "NumTokens")),
-
-            ("LookupFastTextVectors", pdu.lookup_fast_text_vectors("Tokens", "FtVectors", ft_model))
-        ])
-
-        cert_data_prepared = certificate_pipeline.fit_transform(certificate_df)
-
-        if not max_length:
-            max_length = cert_data_prepared["NumTokens"].max()
-
-        cert_data_prepared = pdu.vectors_to_matrix("FtVectors", "FtMatrix", ft_model.vector_size, max_length).fit_transform(cert_data_prepared)
-
-        return cert_data_prepared, max_length
-
-    def prepare_dictionary_df(self, dictionary_df: DataFrame, ft_model: FastText) -> Tuple[DataFrame, int]:
-        dictionary_pipeline = Pipeline([
-            ("Encode-ICD10-codes", pdu.encode_labels("ICD10", "ICD10_encoded")),
-
-            ("Extract-ICD10-chapter", pdu.extract_icd10_chapter("ICD10", "ICD10_chapter")),
-            ("Encode-ICD10-chapter", pdu.encode_labels("ICD10_chapter", "ICD10_chapter_encoded")),
-
-            ("CombineTexts", pdu.combine_texts(["DiagnosisText", "Standardized"], "DictText")),
-            ("LowercaseText", pdu.to_lowercase("DictText")),
-            ("TokenizeDiagnosis", pdu.tokenize("DictText", "Tokens")),
-            ("CountTokens", pdu.count_values("Tokens", "NumTokens")),
-
-            ("LookupTokenIds", pdu.lookup_fast_text_vectors("Tokens", "FtVectors", ft_model)),
-        ])
-
-        dict_data_prepared = dictionary_pipeline.fit_transform(dictionary_df)
-        max_length = dict_data_prepared["NumTokens"].max()
-
-        dict_data_prepared = pdu.vectors_to_matrix("FtVectors", "FtMatrix", ft_model.vector_size, max_length).fit_transform(dict_data_prepared)
-
-        return dict_data_prepared, max_length
-
-    def build_pairs(self, certificate_data: DataFrame, dictionary_data: DataFrame, num_neg_samples: int):
-        # FIXME: This can be implemented more efficiently!
-        # FIXME: Improve sampling of negative instances (especially if code is I-XXX sample other codes of the same class (e.g. I-YYY)
-        # FIXME: Use negative sample ratio (depending on true dictionary entries), e.g. 0.5 or 1.2
-
-        certificate_vectors = []
-        dictionary_vectors = []
-        labels = []
-
-        for i, cert_row in tqdm(certificate_data.iterrows(), desc="build-pairs", total=len(certificate_data)):
-            line_icd10_code = cert_row["ICD10"]
-
-            # Build positive examples (based on training data)
-            dictionary_entries = dictionary_data.query("ICD10 == '%s'" % line_icd10_code)
-            #self.logger.info("Found %s entries for ICD-10 code %s", len(dictionary_entries), line_icd10_code)
-            for i, dict_row in dictionary_entries.iterrows():
-                certificate_vectors.append(cert_row["FtMatrix"])
-                dictionary_vectors.append(dict_row["FtMatrix"])
-
-                labels.append(1.0)
-
-            # Find illegal ICD-10 for this line
-            negative_samples = dictionary_data.query("ICD10 != '%s'" % line_icd10_code)
-            negative_samples = negative_samples.sample(num_neg_samples)
-
-            # Build negative samples
-            for i, dict_row in negative_samples.iterrows():
-                certificate_vectors.append(cert_row["FtMatrix"])
-                dictionary_vectors.append(dict_row["FtMatrix"])
-
-                labels.append(0.0)
-
-        data = {"CertFtMatrix": certificate_vectors, "DictFtMatrix": dictionary_vectors, "Label": labels}
-        return pd.DataFrame(data)
-
-    def build_matrices(self, pair_data: DataFrame, max_cert_length: int, max_dict_length: int, vector_size: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        num_pairs = len(pair_data)
-
-        certificate_matrix = np.zeros((num_pairs, max_cert_length, vector_size))
-        dictionary_matrix = np.zeros((num_pairs, max_dict_length, vector_size))
-        label_matrix = np.zeros((num_pairs))
-
-        for i, (_, row) in tqdm(enumerate(pair_data.iterrows()), desc="build-matrices", total=num_pairs):
-            certificate_matrix[i] = row["CertFtMatrix"]
-            dictionary_matrix[i] = row["DictFtMatrix"]
-            label_matrix[i] = row["Label"]
-
-        return certificate_matrix, dictionary_matrix, label_matrix
-
-    def build_rnn_input(self, data: DataFrame, column: str, max_length: int, vector_size: int) -> np.ndarray:
-        data_matrix = np.zeros((len(data), max_length, vector_size))
-
-        for i, (_, row) in tqdm(enumerate(data.iterrows()), desc="build-matrices", total=len(data)):
-            data_matrix[i] = row[column]
-
-        return data_matrix
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(prog="CLEF2018")
-    parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
-    parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
-    parser.add_argument("--val_ratio", help="Ratio of validation samples to use", default=0.2, type=float)
-    parser.add_argument("--neg_samples", help="Number of negative samples for each pair to use", default=75, type=int)
-    parser.add_argument("--train_samples", help="Number of instances to sample from the training data", default=None, type=int)
-
-    args = parser.parse_args()
-
-    AppContext.initialize_by_app_name("eCLEF2018-Task1")
-
-    clef_data = Clef18Task1Data()
-    it_dictionary = clef_data.read_it_dictionary()
-    it_certificates = clef_data.read_it_train_certificates()
-    it_certificates = clef_data.filter_single_code_lines(it_certificates)
-
-    ft_embeddings = FastTextEmbeddings()
-    ft_it_model = ft_embeddings.load_it_embeddings()
-
-    clef18_task1 = Clef18Task1()
-
-    data_set = clef18_task1.prepare_data_set(it_certificates, it_dictionary, ft_it_model, args.val_ratio, args.train_samples)
-
-    embedding_model = clef18_task1.train_embedding_model(data_set, args.neg_samples, args.epochs, args.batch_size)
-
-    clef18_task1.train_knn_classifier(embedding_model, data_set)
-    clef18_task1.train_classifiers(embedding_model, data_set)
-
-
-
-
diff --git a/code_mario/clef18_task1_v2.py b/code_mario/clef18_task1_emb1.py
similarity index 82%
rename from code_mario/clef18_task1_v2.py
rename to code_mario/clef18_task1_emb1.py
index 357137e2aade2722e9716841fe1a5caed4f8dc28..cc9529225480f80f4edb5d6539a2eb276fac0aab 100644
--- a/code_mario/clef18_task1_v2.py
+++ b/code_mario/clef18_task1_emb1.py
@@ -1,15 +1,16 @@
-import argparse
-from argparse import Namespace
+from init import *
 
+import argparse
 import numpy as np
 import pandas as pd
 import keras as k
 import pickle
 import os
 
+from argparse import Namespace
 from gensim.models import FastText
 from keras import Input, Model
-from keras.callbacks import ModelCheckpoint
+from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, CSVLogger
 from keras.layers import Bidirectional, Dense, Dot, LSTM, Embedding
 from keras.preprocessing.sequence import pad_sequences
 from keras.preprocessing.text import Tokenizer
@@ -74,7 +75,7 @@ class EvaluationResult(object):
         self.accuracy = accuracy
 
 
-class Clef18Task1V2(LoggingMixin):
+class Clef18Task1Emb1(LoggingMixin):
 
     def __init__(self):
         LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)
@@ -89,8 +90,8 @@ class Clef18Task1V2(LoggingMixin):
                                            config.max_cert_length, config.max_dict_length)
         model.summary(print_fn=self.logger.info)
 
-        cert_inputs = pad_sequences(train_pair_data["Cert_input"].values, maxlen=config.max_cert_length)
-        dict_inputs = pad_sequences(train_pair_data["Dict_input"].values, maxlen=config.max_dict_length)
+        cert_inputs = pad_sequences(train_pair_data["Cert_input"].values, maxlen=config.max_cert_length, padding="post")
+        dict_inputs = pad_sequences(train_pair_data["Dict_input"].values, maxlen=config.max_dict_length, padding="post")
         labels = train_pair_data["Label"].values
 
         self.logger.info("Start training of embedding model")
@@ -100,8 +101,8 @@ class Clef18Task1V2(LoggingMixin):
             self.logger.info("Start creation of validation pairs")
             val_pair_data = self.build_pairs(config.val_cert_df, config.dict_df, neg_sampling_strategy)
 
-            val_cert_inputs = pad_sequences(val_pair_data["Cert_input"].values, maxlen=config.max_cert_length)
-            val_dict_inputs = pad_sequences(val_pair_data["Dict_input"].values, maxlen=config.max_dict_length)
+            val_cert_inputs = pad_sequences(val_pair_data["Cert_input"].values, maxlen=config.max_cert_length, padding="post")
+            val_dict_inputs = pad_sequences(val_pair_data["Dict_input"].values, maxlen=config.max_dict_length, padding="post")
             val_gold_labels = val_pair_data["Label"].values
 
             model.fit([cert_inputs, dict_inputs], labels, epochs=epochs, batch_size=batch_size,
@@ -124,11 +125,11 @@ class Clef18Task1V2(LoggingMixin):
             self.logger.info("Start evaluation of embedding model!")
 
             self.logger.info("Start creation of test pairs")
-            val_pair_data = self.build_pairs(config.test_cert_df, config.dict_df, neg_sampling_strategy)
+            test_pair_data = self.build_pairs(config.test_cert_df, config.dict_df, neg_sampling_strategy)
 
-            test_cert_inputs = pad_sequences(val_pair_data["Cert_input"].values, maxlen=config.max_cert_length)
-            test_dict_inputs = pad_sequences(val_pair_data["Dict_input"].values, maxlen=config.max_dict_length)
-            test_gold_labels = val_pair_data["Label"].values
+            test_cert_inputs = pad_sequences(test_pair_data["Cert_input"].values, maxlen=config.max_cert_length, padding="post")
+            test_dict_inputs = pad_sequences(test_pair_data["Dict_input"].values, maxlen=config.max_dict_length, padding="post")
+            test_gold_labels = test_pair_data["Label"].values
 
             self.logger.info("Start prediction of test labels")
             pred_labels = model.predict([test_cert_inputs, test_dict_inputs], verbose=1)
@@ -176,45 +177,63 @@ class Clef18Task1V2(LoggingMixin):
         ]
 
         named_classifiers = [
-            ("KNN", lambda num_classes: KNeighborsClassifier()),
-            ("KNN-Cos", lambda num_classes: KNeighborsClassifier(metric="cosine")),
-            ("SGD", lambda num_classes: SGDClassifier(verbose=1, random_state=42)),
-            ("DT", lambda num_classes: DecisionTreeClassifier(random_state=42)),
-            ("RF", lambda num_classes: RandomForestClassifier(verbose=1, random_state=42)),
-            ("LinearSVM", lambda num_classes: LinearSVC(max_iter=5000, verbose=1, random_state=42)),
-
-            ("DNN-1-200", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], False, 0.0, 50, 2,
-                                                                callbacks=[ku.best_model_checkpointing_by_model_name("dnn-1-200")])),
-            ("DNN-1-300", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [300], False, 0.0, 50, 2,
-                                                                callbacks=[ku.best_model_checkpointing_by_model_name("dnn-1-300")])),
-            ("DNN-200-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], True, 0.5, 50, 2,
-                                                                  callbacks=[ku.best_model_checkpointing_by_model_name("dnn-1-200-bn-do50")])),
-            ("DNN-300-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [300], True, 0.5, 50, 2,
-                                                                  callbacks=[ku.best_model_checkpointing_by_model_name("dnn-1-300-bn-do50")])),
-
-            ("DNN-200-100", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 100], False, 0.0, 1, 2,
-                                                                  callbacks=[ku.best_model_checkpointing_by_model_name("dnn-200-100")])),
-            ("DNN-200-200", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], False, 0.0, 50, 2,
-                                                                  callbacks=[ku.best_model_checkpointing_by_model_name("dnn-200-200")])),
-            ("DNN-200-100-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 100], True, 0.5, 50, 2,
-                                                                        callbacks=[ku.best_model_checkpointing_by_model_name("dnn-200-100-bn-do50")])),
-            ("DNN-200-200-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], True, 0.5, 50, 2,
-                                                                        callbacks=[ku.best_model_checkpointing_by_model_name("dnn-200-200-bn-do50")])),
-
-            # ("Test-DNN-200-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], True, 0.5, 1, 4,
-            #                                                               callbacks=[ku.best_model_checkpointing_by_model_name("test-dnn-200-bn-do")])),
-
-            ('DU1', lambda num_classes: DummyClassifier(strategy="stratified")),
-            ('DU2', lambda num_classes: DummyClassifier(strategy="most_frequent"))
+            ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()),
+            ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")),
+            ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)),
+            ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)),
+            ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)),
+            ("LinearSVM", lambda label, input_dim, output_dim, val_data: LinearSVC(max_iter=10000, verbose=1, random_state=42)),
+
+            ("DNN-200", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=10, batch_size=2)),
+            ("DNN-300", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)),
+
+            ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)),
+            ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)),
+
+            ("DNN-200-100", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)),
+            ("DNN-200-200", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)),
+            ("DNN-300-200", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)),
+
+            ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)),
+            ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)),
+            ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)),
+
+            ('DU1', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="stratified")),
+            ('DU2', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="most_frequent"))
         ]
 
         num_experiments = len(target_label_configs) * len(test_sets) * len(named_classifiers)
         cur_experiment = 1
 
+        input_dim = cert_rnn.output.shape[1].value
+
+        models_dir = os.path.join(AppContext.default().output_dir, "models")
+        os.makedirs(models_dir, exist_ok=True)
+
         results = []
         for target_label, target_column, label_encoder in target_label_configs:
             self.logger.info("Start evaluation experiments with label %s", target_label)
-            num_classes = len(label_encoder.classes_)
+            output_dim = len(label_encoder.classes_)
 
             complete_train_data = np.append(dict_embeddings, train_cert_embeddings, axis=0)
             complete_train_labels = np.append(config.dict_df[target_column].values, config.train_cert_df[target_column].values, axis=0)
@@ -222,11 +241,11 @@ class Clef18Task1V2(LoggingMixin):
 
             for cl_name, classifier_factory in named_classifiers:
                 self.logger.info("Start training of classifier %s", cl_name)
-                classifier = classifier_factory(num_classes)
+                classifier = classifier_factory(target_label, input_dim, output_dim, val_cert_embeddings)
                 classifier.fit(complete_train_data, complete_train_labels)
 
                 classifier_file_name = "cl_{}_{}.model".format(cl_name, target_label).lower()
-                classifier_file = os.path.join(AppContext.default().output_dir, classifier_file_name)
+                classifier_file = os.path.join(models_dir, classifier_file_name)
                 try:
                     joblib.dump(classifier, classifier_file)
                 except:
@@ -291,7 +310,7 @@ class Clef18Task1V2(LoggingMixin):
             except KeyError:
                 self.logger.error("Can't create embedding for '%s'", word)
 
-        embedding = Embedding(len(word_index)+1, ft_model.vector_size, weights=[embedding_matrix])
+        embedding = Embedding(len(word_index)+1, ft_model.vector_size, weights=[embedding_matrix], mask_zero=True)
 
         # Model 1: Learn a representation of a line originating from a death certificate
         input_certificate_line = Input((max_cert_length, ))
@@ -456,15 +475,13 @@ class Clef18Task1V2(LoggingMixin):
 
                 labels.append(1.0)
 
+            # Build negative samples
             # Find illegal ICD-10 for this line
-            #negative_samples = dictionary_data.query("ICD10 != '%s'" % line_icd10_code)
-            #negative_samples = negative_samples.sample(num_neg_samples)
             negative_samples = neg_sampling_strategy(certificate_data, line_icd10_code)
 
-            # Build negative samples
-            for i, dict_row in negative_samples.iterrows():
+            for i, neg_row in negative_samples.iterrows():
                 certificate_vectors.append(cert_row["Token_ids"])
-                dictionary_vectors.append(dict_row["Token_ids"])
+                dictionary_vectors.append(neg_row["Token_ids"])
 
                 labels.append(0.0)
 
@@ -497,6 +514,15 @@ class Clef18Task1V2(LoggingMixin):
                     result_writer.write("%s\t%s\t%s\t%s\n" % (r.target_label, r.classifier_name, r.data_set_name, r.accuracy))
                 result_writer.close()
 
+    def save_arguments(self, arguments: Namespace):
+        """Write each attribute of `arguments` as a `key=value` line to `arguments.txt` in the application's log directory."""
+        arguments_file = os.path.join(AppContext.default().log_dir, "arguments.txt")
+        self.logger.info("Saving arguments to " + arguments_file)
+
+        # The context manager closes the file on exit, so no explicit close() call is needed
+        with open(arguments_file, 'w', encoding="utf8") as writer:
+            writer.writelines("%s=%s\n" % (str(key), str(value)) for key, value in vars(arguments).items())
+
     def save_configuration(self, configuration: Configuration):
         label_encoder_file = os.path.join(AppContext.default().output_dir, "label_encoder.pk")
         self.logger.info("Saving label encoder to " + label_encoder_file)
@@ -528,6 +554,21 @@ class Clef18Task1V2(LoggingMixin):
         self.logger.info("Reloading embedding model from " + emb_model_file)
         return k.models.load_model(args.emb_model)
 
+    def create_dnn_classifier(self, model_name, label: str, val_data: Tuple, **kwargs):
+        """Create a dense network through `nnc.dense_network` with a fixed list of training callbacks.
+
+        The callbacks come from `ku.best_model_checkpointing_by_model_name`, `ku.csv_logging_callback` and
+        `ku.early_stopping(<monitor>, 5)`; the monitor is "val_loss" if `val_data` is given, otherwise "loss".
+        All keyword arguments are forwarded to `nnc.dense_network`, with `callbacks` set to that list.
+        """
+        monitor_loss = "loss" if val_data is None else "val_loss"
+        kwargs["callbacks"] = [
+            ku.best_model_checkpointing_by_model_name(model_name),
+            ku.csv_logging_callback(model_name, label),
+            ku.early_stopping(monitor_loss, 5)
+        ]
+        return nnc.dense_network(**kwargs)
+
 
 class NegativeSampling(LoggingMixin):
 
@@ -547,7 +588,10 @@ class NegativeSampling(LoggingMixin):
     def default_strategy(self, num_negative_samples: int) -> Callable:
         def _sample(dictionary_df: DataFrame, line_icd10_code: str):
             negative_samples = dictionary_df.query("ICD10 != '%s'" % line_icd10_code)
-            negative_samples = negative_samples.sample(num_negative_samples)
+
+            # Only necessary during development and tests with only very few examples
+            if len(negative_samples) > 0:
+                negative_samples = negative_samples.sample(min(num_negative_samples, len(negative_samples)))
 
             return negative_samples
 
@@ -622,29 +666,33 @@ if __name__ == "__main__":
 
     args = parser.parse_args()
 
-    AppContext.initialize_by_app_name(args.mode)
+    AppContext.initialize_by_app_name(Clef18Task1Emb1.__name__ + "-" + args.mode)
 
     clef_data = Clef18Task1Data()
     dictionary = clef_data.read_dictionary_by_language(args.lang)
+    #dictionary = dictionary.sample(1200)
+
     certificates = clef_data.read_train_certifcates_by_language(args.lang)
     certificates = clef_data.filter_single_code_lines(certificates)
     certificates = clef_data.add_masked_icd10_column(certificates, 10)
 
     sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
-    ft_model = FastText(sentences, min_count=1)
+    #ft_model = FastText(sentences, min_count=1)
 
     ft_embeddings = FastTextEmbeddings()
-    #ft_model = ft_embeddings.load_embeddings_by_language(args.lang)
+    ft_model = ft_embeddings.load_embeddings_by_language(args.lang)
 
-    clef18_task1 = Clef18Task1V2()
-    neg_sampling = NegativeSampling()
+    clef18_task1 = Clef18Task1Emb1()
+    clef18_task1.save_arguments(args)
 
     if args.mode == "train-emb":
         configuration = clef18_task1.prepare_data_set(certificates, dictionary, ft_model, args.train_ratio, args.val_ratio,args.strat_column,
                                                       args.samples, args.strat_splits)
-        neg_sampling_strategy = neg_sampling.get_strategy_by_name(args.neg_sampling, args)
         clef18_task1.save_configuration(configuration)
 
+        neg_sampling = NegativeSampling()
+        neg_sampling_strategy = neg_sampling.get_strategy_by_name(args.neg_sampling, args)
+
         embedding_model = clef18_task1.train_embedding_model(configuration, ft_model, neg_sampling_strategy, args.epochs, args.batch_size)
 
     elif args.mode == "eval-cl":
diff --git a/code_mario/clef18_task1_emb2.py b/code_mario/clef18_task1_emb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cc15c73ed539bce8ba721f98847c6028efc9991
--- /dev/null
+++ b/code_mario/clef18_task1_emb2.py
@@ -0,0 +1,655 @@
+from init import *
+
+import argparse
+import numpy as np
+import pandas as pd
+import keras as k
+import pickle
+import os
+
+from argparse import Namespace
+from gensim.models import FastText
+from keras import Input, Model
+from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, CSVLogger
+from keras.layers import Bidirectional, Dense, Dot, LSTM, Embedding, GlobalMaxPool1D
+from keras.preprocessing.sequence import pad_sequences
+from keras.preprocessing.text import Tokenizer
+from pandas import DataFrame
+from sklearn.dummy import DummyClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import SGDClassifier
+from sklearn.metrics import f1_score, accuracy_score
+from sklearn.model_selection import train_test_split
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import LabelEncoder
+from sklearn.svm import LinearSVC
+from sklearn.tree import DecisionTreeClassifier
+from tqdm import tqdm
+from typing import Tuple, Dict, List, Callable
+from sklearn.externals import joblib
+
+import ft_embeddings
+from app_context import AppContext
+from clef18_task1_data import Clef18Task1Data
+from dnn_classifiers import NeuralNetworkClassifiers as nnc
+from ft_embeddings import FastTextEmbeddings
+from preprocessing import DataPreparationUtil as pdu
+from keras_extension import KerasUtil as ku
+from util import LoggingMixin
+
+
+class ICD10LabelEncoders(object):
+    """Bundle of four label encoders, one per ICD-10 granularity (chapter, section, subsection, code)."""
+
+    def __init__(self, chapter_encoder: LabelEncoder, section_encoder: LabelEncoder,
+                 subsection_encoder: LabelEncoder, code_encoder: LabelEncoder):
+        # Every encoder is stored under the attribute named like its constructor argument
+        self.chapter_encoder, self.section_encoder = chapter_encoder, section_encoder
+        self.subsection_encoder, self.code_encoder = subsection_encoder, code_encoder
+
+
+class Configuration(object):
+    """Container for the three prepared data frames plus the settings and fitted helpers that belong to them."""
+
+    def __init__(self, train_df: DataFrame, val_df: DataFrame, test_df: DataFrame, max_length: int, ft_embedding_size: int,
+                 label_column: str, label_encoders: ICD10LabelEncoders, keras_tokenizer: Tokenizer):
+        # Train / validation / test data frames
+        self.train_df, self.val_df, self.test_df = train_df, val_df, test_df
+        # Sequence length (used as `maxlen` when padding) and size of the fastText vectors
+        self.max_length, self.ft_embedding_size = max_length, ft_embedding_size
+        # Column name handed in as `label_column`, the ICD-10 label encoders and the Keras tokenizer
+        self.label_column = label_column
+        self.label_encoders, self.keras_tokenizer = label_encoders, keras_tokenizer
+
+
+class EvaluationResult(object):
+    """Accuracy of one classifier on one data set for one target label."""
+
+    def __init__(self, target_label: str, classifier_name: str, data_set_name: str, accuracy: float):
+        # Identification of the experiment: target label, classifier and evaluated data set
+        self.target_label, self.classifier_name, self.data_set_name = target_label, classifier_name, data_set_name
+        self.accuracy = accuracy
+
+
+class Clef18Task1Emb2(LoggingMixin):
+
+    def __init__(self):
+        LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)  # hands the concrete class name and the default log file to the logging mixin
+
+    def train_embedding_model(self, config: Configuration, ft_model: FastText, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model:
+        """Train the text / ICD-10 pair model and return the best checkpoint reloaded from disk.
+
+        Training pairs come from `build_pairs`. Validation pairs are built and passed to `fit` only if
+        `config.val_df` is non-empty, and the reloaded model is scored (F1, accuracy) on test pairs only if
+        `config.test_df` is non-empty. The model state after fitting is also saved as `embedding_model_last.h5`.
+        """
+        self.logger.info("Start building training pairs")
+        train_pair_data = self.build_pairs(config.train_df, neg_sampling_strategy)
+        self.logger.info("Label distribution:\n%s", train_pair_data["Label"].value_counts())
+
+        self.logger.info("Start building embedding model")
+        model = self.build_embedding_model(config.keras_tokenizer.word_index, ft_model, config)
+        model.summary(print_fn=self.logger.info)
+
+        cert_inputs = pad_sequences(train_pair_data["Cert_input"].values, maxlen=config.max_length, padding="post")
+        icd10_inputs = train_pair_data["ICD10_input"].values
+        labels = train_pair_data["Label"].values
+
+        self.logger.info("Start training of embedding model")
+        best_model_file = os.path.join(AppContext.default().output_dir, "embedding_model_best.h5")
+
+        # This branch reads val_df, so val_df (not test_df) is the split that has to be non-empty
+        if config.val_df is not None and len(config.val_df) > 0:
+            self.logger.info("Start creation of validation pairs")
+            val_pair_data = self.build_pairs(config.val_df, neg_sampling_strategy)
+            val_cert_inputs = pad_sequences(val_pair_data["Cert_input"].values, maxlen=config.max_length, padding="post")
+            val_icd10_inputs = val_pair_data["ICD10_input"].values
+            val_gold_labels = val_pair_data["Label"].values
+            model.fit([cert_inputs, icd10_inputs], labels, epochs=epochs, batch_size=batch_size,
+                      validation_data=([val_cert_inputs, val_icd10_inputs], val_gold_labels),
+                      callbacks=[ku.best_model_checkpointing_by_file_path(best_model_file, "val_loss")])
+        else:
+            model.fit([cert_inputs, icd10_inputs], labels, epochs=epochs, batch_size=batch_size,
+                    callbacks=[ku.best_model_checkpointing_by_file_path(best_model_file)])
+
+        model_file = os.path.join(AppContext.default().output_dir, "embedding_model_last.h5")
+        self.logger.info("Saving last model to %s", model_file)
+        model.save(model_file)
+
+        self.logger.info("Reloading best embedding model from %s", best_model_file)
+        model = k.models.load_model(best_model_file)
+
+        # This branch reads test_df, so test_df is the split that has to be present and non-empty
+        if config.test_df is not None and len(config.test_df) > 0:
+            self.logger.info("Start evaluation of embedding model!")
+            self.logger.info("Start creation of test pairs")
+            test_pair_data = self.build_pairs(config.test_df, neg_sampling_strategy)
+            test_cert_inputs = pad_sequences(test_pair_data["Cert_input"].values, maxlen=config.max_length, padding="post")
+            test_icd10_inputs = test_pair_data["ICD10_input"].values
+            test_gold_labels = test_pair_data["Label"].values
+
+            self.logger.info("Start prediction of test labels")
+            pred_labels = model.predict([test_cert_inputs, test_icd10_inputs], verbose=1)
+            pred_labels = (pred_labels > 0.5).astype(float)
+            f1_value = f1_score(test_gold_labels, pred_labels)
+            acc_value = accuracy_score(test_gold_labels, pred_labels)
+            self.logger.info("Result: f1_score= %s | acc_score= %s", f1_value, acc_value)
+
+        return model
+
+    def train_and_evaluate_classifiers(self, emb_model: Model, config: Configuration, target_labels: List) -> List[EvaluationResult]:
+        """Train every configured classifier on the text embeddings and score it on all three data splits.
+
+        The output of the `text_rnn` layer of `emb_model` serves as feature vector for the padded token ids of
+        the train, validation and test frames. Per target label and classifier, the fitted model is dumped
+        into `<output_dir>/models` (best effort) and one accuracy result per split is added to the returned list.
+        """
+        self.logger.info("Start training and evaluation of classifier models")
+        self.logger.info("Building embeddings for training data")
+        text_input = emb_model.inputs[0]
+        text_rnn = Model(inputs=text_input, outputs=emb_model.get_layer("text_rnn").output, name="Cert-RNN-Model")
+
+        train_inputs = pad_sequences(config.train_df["Token_ids"].values, maxlen=config.max_length)
+        train_embeddings = text_rnn.predict(train_inputs, verbose=1)
+        self.logger.info("cert train input shape: %s", train_embeddings.shape)
+
+        val_inputs = pad_sequences(config.val_df["Token_ids"].values, maxlen=config.max_length)
+        val_embeddings = text_rnn.predict(val_inputs, verbose=1)
+        self.logger.info("cert val input shape: %s", val_embeddings.shape)
+
+        test_inputs = pad_sequences(config.test_df["Token_ids"].values, maxlen=config.max_length)
+        test_embeddings = text_rnn.predict(test_inputs, verbose=1)
+        self.logger.info("cert test input shape: %s", test_embeddings.shape)
+
+        target_label_configs = self.get_label_configuration(target_labels, config.label_encoders)
+
+        test_sets = [
+            ("cert-train", train_embeddings, config.train_df),
+            ("cert-val", val_embeddings, config.val_df),
+            ("cert-test", test_embeddings, config.test_df)
+        ]
+
+        named_classifiers = [
+            ("KNN", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier()),
+            ("KNN-Cos", lambda label, input_dim, output_dim, val_data: KNeighborsClassifier(metric="cosine")),
+            ("SGD", lambda label, input_dim, output_dim, val_data: SGDClassifier(verbose=1, random_state=42)),
+            ("DT", lambda label, input_dim, output_dim, val_data: DecisionTreeClassifier(random_state=42)),
+            ("RF", lambda label, input_dim, output_dim, val_data: RandomForestClassifier(verbose=1, random_state=42)),
+            ("LinearSVM", lambda label, input_dim, output_dim, val_data: LinearSVC(max_iter=10000, verbose=1, random_state=42)),
+
+            ("DNN-200", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=10, batch_size=2)),
+            ("DNN-300", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-300", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)),
+
+            ("DNN-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)),
+            ("DNN-300-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-300-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)),
+
+            ("DNN-200-100", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-100", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)),
+            ("DNN-200-200", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)),
+            ("DNN-300-200", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-300-200", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
+                                            batch_normalization=False, dropout_rate=0.0, epochs=50, batch_size=2)),
+
+            ("DNN-200-100-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-100-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 100],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)),
+            ("DNN-200-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-200-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[200, 200],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)),
+            ("DNN-300-200-BN-DO", lambda label, input_dim, output_dim, val_data:
+                self.create_dnn_classifier("dnn-300-200-bn-do", label, val_data, input_dim=input_dim, output_dim=output_dim, hidden_layer_sizes=[300, 200],
+                                            batch_normalization=True, dropout_rate=0.5, epochs=50, batch_size=2)),
+
+            ('DU1', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="stratified")),
+            ('DU2', lambda label, input_dim, output_dim, val_data: DummyClassifier(strategy="most_frequent"))
+        ]
+
+        num_experiments = len(target_label_configs) * len(test_sets) * len(named_classifiers)
+        cur_experiment = 1
+        input_dim = text_rnn.output.shape[1].value
+        models_dir = os.path.join(AppContext.default().output_dir, "models")
+        os.makedirs(models_dir, exist_ok=True)
+
+        results = []
+        for target_label, target_column, label_encoder in target_label_configs:
+            self.logger.info("Start evaluation experiments with label %s", target_label)
+            output_dim = len(label_encoder.classes_)
+
+            for cl_name, classifier_factory in named_classifiers:
+                self.logger.info("Start training of classifier %s", cl_name)
+                classifier = classifier_factory(target_label, input_dim, output_dim, val_embeddings)
+                classifier.fit(train_embeddings, config.train_df[target_column].values)
+
+                classifier_file_name = "cl_{}_{}.model".format(cl_name, target_label).lower()
+                classifier_file = os.path.join(models_dir, classifier_file_name)
+                try:
+                    joblib.dump(classifier, classifier_file)
+                except Exception:  # best effort: a failed dump must not abort the run, but Ctrl-C / SystemExit still propagate
+                    self.logger.exception("Error while saving classifier %s to %s", cl_name, classifier_file)
+
+                self.logger.info("Start evaluation of %s", cl_name)
+                for ts_name, inputs, data_frame in test_sets:
+                    gold_labels = data_frame[target_column].values
+                    self.logger.info("Evaluate data set %s", ts_name)
+                    prediction = classifier.predict(inputs)
+                    acc_score = accuracy_score(gold_labels, prediction)
+                    self.logger.info("Evaluation result: label=%s | classifier=%s | data_set=%s | acc_score=%s",
+                                     target_label, cl_name, ts_name, acc_score)
+                    results.append(EvaluationResult(target_label, cl_name, ts_name, acc_score))
+
+                    self.logger.info("Finished experiment %s out of %s", cur_experiment, num_experiments)
+                    cur_experiment += 1
+
+        return results
+
+    def get_label_configuration(self, target_labels: List[str], icd10_encoders: ICD10LabelEncoders) -> List:
+        """Map label names to (label, encoded column, label encoder) triples; unknown names are logged and skipped."""
+        # Short and long spelling of an ICD-10 level share one (encoded column, encoder attribute) entry
+        chapter, section = ("ICD10_chapter_encoded", "chapter_encoder"), ("ICD10_section_encoded", "section_encoder")
+        subsection, code = ("ICD10_subsection_encoded", "subsection_encoder"), ("ICD10_encoded", "code_encoder")
+        lookup = {"chap": chapter, "chapter": chapter, "sect": section, "section": section,
+                  "subs": subsection, "subsection": subsection, "code": code, "icd10": code}
+        label_configs = []
+        for target_label in target_labels:
+            if target_label in lookup:
+                column, encoder_attr = lookup[target_label]
+                label_configs.append((target_label, column, getattr(icd10_encoders, encoder_attr)))
+            else:
+                self.logger.error("Can't create label configuration for label " + target_label)
+
+        return label_configs
+
+    def split_train_test(self, certificate_df: DataFrame, train_size: float,
+                         stratified_splits: bool, label_column: str) -> Tuple[DataFrame, DataFrame]:
+        """Split the frame in two with `train_test_split`; `label_column` is used for stratification if requested."""
+        if not stratified_splits:
+            self.logger.info("Creating non-stratified splits")
+            first_part, second_part = train_test_split(certificate_df, train_size=train_size)
+            return first_part, second_part
+        self.logger.info("Creating stratified splits for column %s", label_column)
+        first_part, second_part = train_test_split(certificate_df, train_size=train_size, stratify=certificate_df[label_column])
+        return first_part, second_part
+
+    def build_embedding_model(self, word_index: Dict, ft_model: FastText, conf: Configuration):
+        """Build and compile the model that scores a (text line, ICD-10 code) pair via a sigmoid over a normalized dot product."""
+        # TODO: Make hyper-parameter configurable!
+        # TODO: Think about using CNNs instead of RNNs since we can have multiple ICD-10 per line!
+        embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size))
+        for word, i in word_index.items():
+            try:
+                embedding_vector = ft_model[word]
+                if embedding_vector is not None:
+                    # words not found in embedding index will be all-zeros.
+                    embedding_matrix[i] = embedding_vector
+            except KeyError:
+                self.logger.error("Can't create embedding for '%s'", word)
+        # Embedding layer initialised from the matrix above; mask_zero=True treats index 0 as padding
+        input_embedding = Embedding(len(word_index)+1, ft_model.vector_size, weights=[embedding_matrix], mask_zero=True)
+
+        # Model 1: Learn a representation of a line originating from a death certificate or the dictionary
+        input_certificate_line = Input((conf.max_length, ))
+        cert_embeddings = input_embedding(input_certificate_line)
+        certificate_rnn = Bidirectional(LSTM(200), name="text_rnn")(cert_embeddings)
+
+        # Model 2: Learn a representation of a ICD-10 code
+        num_icd10_codes = len(conf.label_encoders.code_encoder.classes_)
+        # One code id per sample is embedded into 400 dimensions (presumably to match the 2 x 200 units of the text encoder)
+        icd10_input = Input((1, ))
+        icd10_embedding = Embedding(num_icd10_codes, 400, mask_zero=False)(icd10_input)
+        icd10_embedding = GlobalMaxPool1D()(icd10_embedding)
+
+        # Calculate similarity between both representations
+        dot_product = Dot(axes=1, normalize=True)([certificate_rnn, icd10_embedding])
+
+        output = Dense(1, activation='sigmoid')(dot_product)
+
+        # Create the primary training model
+        model = Model(inputs=[input_certificate_line, icd10_input], outputs=output, name="ICD10-Embedding-Model2")
+        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])
+
+        return model
+
+    def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastText, train_ratio: float, val_ratio: float,
+                         strat_column: str, samples: int=None, stratified_splits: bool=False) -> Configuration:
+        """Merge certificate and dictionary entries, split them into train / validation / test and prepare each split.
+
+        Both inputs are reduced to the columns `Text` and `ICD10` and concatenated; the merged frame is optionally
+        down-sampled to `samples` rows, split by `train_ratio` into train and evaluation data, and the evaluation
+        data is split by `val_ratio` into validation and test data. Label encoders see all codes of both inputs.
+        """
+        cert_df = cert_df[["RawText", "ICD10"]].rename(columns={"RawText": "Text"})
+        dict_df = dict_df[["DiagnosisText", "ICD10"]].rename(columns={"DiagnosisText": "Text"})
+        merged_df = pd.concat([cert_df, dict_df])
+        self.logger.info("Concatenated certificate and dictionary entries. Found %s in total.", len(merged_df))
+
+        if samples:
+            self.logger.info("Sampling %s instances", samples)
+            merged_df = merged_df.sample(samples, random_state=42)
+
+        self.logger.info("Splitting certificate lines into train and evaluation data set")
+        train_df, holdout_df = self.split_train_test(merged_df, train_ratio, stratified_splits, strat_column)
+        self.logger.info("Finished splitting: train=%s instances, evaluation=%s instances", len(train_df), len(holdout_df))
+
+        self.logger.info("Splitting evaluation data set into validation and test set")
+        val_df, test_df = self.split_train_test(holdout_df, val_ratio, stratified_splits, strat_column)
+
+        label_encoders = self.prepare_label_encoders(dict_df, cert_df)
+        keras_tokenizer = Tokenizer(oov_token="<UNK>")
+
+        # Only the training split determines the maximum sequence length; the other two report None for it
+        self.logger.info("Start preparation of training data (%s instances)", len(train_df))
+        train_df, max_length = self.prepare_cert_dict_df(train_df, "train", label_encoders, keras_tokenizer)
+        self.logger.info("Start preparation of validation data (%s instances)", len(val_df))
+        val_df, _ = self.prepare_cert_dict_df(val_df, "validation", label_encoders, keras_tokenizer)
+        self.logger.info("Start preparation of test cert data (%s instances)", len(test_df))
+        test_df, _ = self.prepare_cert_dict_df(test_df, "test", label_encoders, keras_tokenizer)
+
+        return Configuration(train_df, val_df, test_df, max_length, ft_model.vector_size,
+                             strat_column, label_encoders, keras_tokenizer)
+
+    def prepare_label_encoders(self, dict_df: DataFrame, cert_df: DataFrame) -> ICD10LabelEncoders:
+        """Fit one label encoder per ICD-10 granularity on the codes of the dictionary followed by the certificates.
+
+        Codes are stripped of surrounding whitespace; chapter, section and subsection labels are the first one,
+        two and three characters of the lower-cased code, whereas full codes keep their original case.
+        """
+        raw_codes = list(dict_df["ICD10"].values) + list(cert_df["ICD10"].values)
+
+        def fit_encoder(start_message, found_message, to_label):
+            self.logger.info(start_message)
+            encoder = LabelEncoder()
+            encoder.fit([to_label(icd10) for icd10 in raw_codes])
+            self.logger.info(found_message, len(encoder.classes_))
+            return encoder
+
+        # Fitting order: codes, chapters, sections, subsections
+        by_code = fit_encoder("Fitting label encoder to ICD10 codes", "Found %s distinct ICD10 codes within the data set",
+                              lambda icd10: icd10.strip())
+        by_chapter = fit_encoder("Fitting label encoder to ICD10 chapters", "Found %s distinct ICD10 chapters within the data set",
+                                 lambda icd10: icd10.strip().lower()[0])
+        by_section = fit_encoder("Fitting label encoder to ICD10 section", "Found %s distinct ICD10 sections within the data set",
+                                 lambda icd10: icd10.strip().lower()[0:2])
+        by_subsection = fit_encoder("Fitting label encoder to ICD10 subsection", "Found %s distinct ICD10 subsections within the data set",
+                                    lambda icd10: icd10.strip().lower()[0:3])
+
+        return ICD10LabelEncoders(by_chapter, by_section, by_subsection, by_code)
+
+    def prepare_cert_dict_df(self, cert_dict_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders,
+                             keras_tokenizer: Tokenizer) -> Tuple[DataFrame, int]:
+        """Run the pdu pipeline (ICD-10 level extraction and encoding, lower-casing, Keras sequencing) on one split.
+
+        Returns the transformed frame and, for mode "train" only, the length of the longest `Token_ids` entry
+        (None for every other mode).
+        """
+        is_train = mode == "train"
+
+        # Step order is kept exactly as listed: label columns first, text columns last
+        steps = [
+            ("Extract-ICD10-chapter", pdu.extract_icd10_chapter("ICD10", "ICD10_chapter")),
+            ("Encode-ICD10-chapter", pdu.encode_labels("ICD10_chapter", "ICD10_chapter_encoded", icd10_encoders.chapter_encoder, False)),
+
+            ("Extract-ICD10-section", pdu.extract_icd10_section("ICD10", "ICD10_section")),
+            ("Encode-ICD10-section", pdu.encode_labels("ICD10_section", "ICD10_section_encoded", icd10_encoders.section_encoder, False)),
+
+            ("Extract-ICD10-subsection", pdu.extract_icd10_subsection("ICD10", "ICD10_subsection")),
+            ("Encode-ICD10-subsection", pdu.encode_labels("ICD10_subsection", "ICD10_subsection_encoded", icd10_encoders.subsection_encoder, False)),
+
+            ("Clean-ICD10-code", pdu.strip("ICD10")),
+            ("Encode-ICD10-code", pdu.encode_labels("ICD10", "ICD10_encoded", icd10_encoders.code_encoder, False)),
+
+            ("LowercaseText", pdu.to_lowercase("Text")),
+            ("TokenizeText", pdu.keras_sequencing("Text", "Token_ids", keras_tokenizer, is_train))
+        ]
+        data_prepared = Pipeline(steps).fit_transform(cert_dict_df)
+
+        # The sequence length is only measured on the training split
+        longest = max([len(token_ids) for token_ids in data_prepared["Token_ids"].values]) if is_train else None
+        return data_prepared, longest
+
+    def build_pairs(self, data_df: DataFrame, neg_sampling_strategy: Callable):
+        """Create one positive and several negative (text, ICD-10 code) pairs per row of `data_df`.
+
+        Each row's token ids are paired with its own encoded code (label 1.0) and with the encoded code of every
+        row returned by `neg_sampling_strategy(data_df, <ICD10 of the row>)` (label 0.0).
+        """
+        # FIXME: This can be implemented more efficiently!
+        # FIXME: Improve sampling of negative instances (especially if code is I-XXX sample other codes of the same class, e.g. I-YYY)
+        # FIXME: Use negative sample ratio (depending on true dictionary entries), e.g. 0.5 or 1.2
+
+        text_vectors, icd10_codes, labels = [], [], []
+
+        for _, data_row in tqdm(data_df.iterrows(), desc="build-pairs", total=len(data_df)):
+            # Build negative samples
+            negative_samples = neg_sampling_strategy(data_df, data_row["ICD10"])
+            negative_codes = [neg_row["ICD10_encoded"] for _, neg_row in negative_samples.iterrows()]
+
+            # The positive pair comes first, followed by one pair per negative sample
+            pair_codes = [data_row["ICD10_encoded"]] + negative_codes
+            text_vectors.extend([data_row["Token_ids"]] * len(pair_codes))
+            icd10_codes.extend(pair_codes)
+            labels.extend([1.0] + [0.0] * len(negative_codes))
+
+        # Column names are read back by train_embedding_model
+        data = {"Cert_input": text_vectors, "ICD10_input": icd10_codes, "Label": labels}
+        return pd.DataFrame(data)
+
+    def save_evaluation_results(self, eval_results: List[EvaluationResult]):
+        """Write the results as tab-separated files: once in given order, then sorted by classifier, data set and label."""
+        result_configurations = [
+            ("results.csv", None),
+            ("results_by_classifier.csv", lambda result: result.classifier_name),
+            ("results_by_data_set.csv", lambda result: result.data_set_name),
+            ("results_by_label.csv", lambda result: result.target_label)
+        ]
+
+        ordered_results = list(eval_results)  # local copy: the stable sorts accumulate from file to file
+        for file_name, sort_key in result_configurations:
+            if sort_key:
+                ordered_results.sort(key=sort_key)
+            results_file = os.path.join(AppContext.default().output_dir, file_name)
+            with open(results_file, "w", encoding="utf8") as result_writer:  # closed by the context manager
+                for r in ordered_results:
+                    result_writer.write("%s\t%s\t%s\t%s\n" % (r.target_label, r.classifier_name, r.data_set_name, r.accuracy))
+
+    def save_arguments(self, arguments: Namespace):
+        """Write every parsed argument as a `key=value` line to `arguments.txt` in the log directory."""
+        arguments_file = os.path.join(AppContext.default().log_dir, "arguments.txt")
+        self.logger.info("Saving arguments to " + arguments_file)
+
+        with open(arguments_file, 'w', encoding="utf8") as writer:  # closed by the context manager
+            for key, value in vars(arguments).items():
+                writer.write("%s=%s\n" % (str(key), str(value)))
+
+    def save_configuration(self, configuration: Configuration):
+        """Pickle the label encoders, the keras tokenizer and the complete configuration.
+
+        Each object is written to its own `.pk` file inside `AppContext.default().output_dir`."""
+        label_encoder_file = os.path.join(AppContext.default().output_dir, "label_encoder.pk")
+        self.logger.info("Saving label encoder to " + label_encoder_file)
+        with open(label_encoder_file, 'wb') as encoder_writer:
+            pickle.dump(configuration.label_encoders, encoder_writer)
+
+        keras_tokenizer_file = os.path.join(AppContext.default().output_dir, "keras_tokenizer.pk")
+        self.logger.info("Saving keras sequencer to " + keras_tokenizer_file)
+        with open(keras_tokenizer_file, 'wb') as keras_sequencer_writer:
+            pickle.dump(configuration.keras_tokenizer, keras_sequencer_writer)
+
+        configuration_file = os.path.join(AppContext.default().output_dir, "configuration.pk")
+        self.logger.info("Saving configuration to " + configuration_file)
+        with open(configuration_file, 'wb') as train_conf_writer:
+            pickle.dump(configuration, train_conf_writer)
+
+    def reload_configuration(self, file_path: str):
+        """Unpickle the configuration stored at `file_path` (pickle can execute code - only load trusted dumps)."""
+        self.logger.info("Reloading configuration from " + file_path)
+        with open(file_path, 'rb') as train_conf_reader:  # the parameter, not the script-level `args.train_conf`
+            configuration = pickle.load(train_conf_reader)
+
+        return configuration
+
+    def reload_embedding_model(self, emb_model_file: str):  # returns the keras model stored in `emb_model_file`
+        self.logger.info("Reloading embedding model from " + emb_model_file)
+        return k.models.load_model(emb_model_file)  # the parameter, not the script-level `args.emb_model`
+
+    def create_dnn_classifier(self, model_name, label: str, val_data: Tuple, **kwargs):
+        """Build a dense network classifier with checkpointing, CSV logging and early stopping callbacks.
+
+        Early stopping watches "val_loss" if `val_data` is given and "loss" otherwise; `kwargs` are forwarded
+        to `nnc.dense_network` after their "callbacks" entry has been set to these three callbacks."""
+        # NOTE(review): the checkpoint callback is created with its default monitor - confirm this is intended
+        stop_monitor = "loss" if val_data is None else "val_loss"
+
+        kwargs["callbacks"] = [
+            ku.best_model_checkpointing_by_model_name(model_name),
+            ku.csv_logging_callback(model_name, label),
+            ku.early_stopping(stop_monitor, 5)
+        ]
+        return nnc.dense_network(**kwargs)
+
+
+class NegativeSampling(LoggingMixin):
+    """Factory for callables which draw negative ICD10 examples from a dictionary data frame."""
+
+    def __init__(self):
+        LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)
+
+    def get_strategy_by_name(self, name: str, args: Namespace) -> Callable:
+        """Return the sampling callable for `name` ("def" or "ext1"), configured from `args`."""
+        #FIXME: Make args to dictionary
+        if name == "def":
+            return self.default_strategy(args.num_neg_samples)
+
+        if name == "ext1":
+            return self.extended1_strategy(args.num_neg_cha, args.num_neg_sec, args.num_neg_sub, args.num_neg_oth)
+
+        raise AssertionError("Unsupported negative sampling strategy: " + name)
+
+    def default_strategy(self, num_negative_samples: int) -> Callable:
+        """Create a sampler drawing up to `num_negative_samples` rows whose ICD10 code differs from the given one."""
+        def _sample(dictionary_df: DataFrame, line_icd10_code: str):
+            candidates = dictionary_df.query("ICD10 != '%s'" % line_icd10_code)
+            if len(candidates) == 0:
+                # Only happens during development and tests with only very few examples
+                return candidates
+
+            return candidates.sample(min(num_negative_samples, len(candidates)))
+
+        return _sample
+
+    def extended1_strategy(self, num_chapter_samples, num_section_samples, num_subsection_samples, num_other_samples):
+        """Create a sampler mixing related negatives (same chapter, section, subsection) with unrelated ones.
+
+        Related samples which can't be drawn are compensated by additional samples from other chapters."""
+        def _draw(candidates: DataFrame, limit: int):
+            # Empty selections are passed through untouched, all others are cut down to at most `limit` rows
+            return candidates.sample(min(limit, len(candidates))) if len(candidates) > 0 else candidates
+
+        def _sample(dictionary_df: DataFrame, icd10_code: str):
+            chapter = icd10_code[0].lower()
+            section = icd10_code[0:2].lower()
+            subsection = icd10_code[0:3].lower()
+
+            # Other codes whose chapter / section / subsection column equals the respective prefix of the code
+            same_chapter = _draw(dictionary_df.query("ICD10 != '%s' & ICD10_chapter == '%s'" % (icd10_code, chapter)),
+                                 num_chapter_samples)
+            same_section = _draw(dictionary_df.query("ICD10 != '%s' & ICD10_section == '%s'" % (icd10_code, section)),
+                                 num_section_samples)
+            same_subsection = _draw(dictionary_df.query("ICD10 != '%s' & ICD10_subsection == '%s'" % (icd10_code, subsection)),
+                                    num_subsection_samples)
+
+            # Fill up with codes of other chapters; their number grows by every related sample that is missing
+            similar = [same_chapter, same_section, same_subsection]
+            missing = (num_chapter_samples + num_section_samples + num_subsection_samples) - sum(len(s) for s in similar)
+            others = dictionary_df.query("ICD10 != '%s' & ICD10_chapter != '%s'" % (icd10_code, chapter))
+            others = others.sample(min(num_other_samples + missing, len(others)))
+
+            return pd.concat(similar + [others])
+        return _sample
+
+
+if __name__ == "__main__":
+    def parse_bool(value: str) -> bool:
+        """Parse a flag value; argparse's `type=bool` would turn every non-empty string (even "False") into True."""
+        return value.strip().lower() in ("1", "true", "t", "yes", "y")
+
+    # Command line interface: "train-emb2" trains a new embedding model, "eval-cl" reloads a stored one
+    parser = argparse.ArgumentParser(prog="CLEF2018")
+    subparsers = parser.add_subparsers(dest="mode")
+    subparsers.required = True  # a missing mode would otherwise only fail later on (`args.mode` is None)
+
+    train_emb_parser = subparsers.add_parser("train-emb2")
+    train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"])
+    train_emb_parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
+    train_emb_parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
+    train_emb_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
+    train_emb_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.3, type=float)
+
+    train_emb_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
+    train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_masked", type=str)
+    train_emb_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=parse_bool)
+    train_emb_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
+
+    train_emb_parser.add_argument("--neg_sampling", help="Negative sampling strategy to use", default="ext1", choices=["def", "ext1"])
+    train_emb_parser.add_argument("--num_neg_samples", help="Number of negative samples to use (default strategy)", default=75, type=int)
+    train_emb_parser.add_argument("--num_neg_cha", help="Number of negative chapter samples to use (ext1 strategy)", default=10, type=int)
+    train_emb_parser.add_argument("--num_neg_sec", help="Number of negative section samples to use (ext1 strategy)", default=10, type=int)
+    train_emb_parser.add_argument("--num_neg_sub", help="Number of negative subsection samples to use (ext1 strategy)", default=10, type=int)
+    train_emb_parser.add_argument("--num_neg_oth", help="Number of negative other samples to use (ext1 strategy)", default=40, type=int)
+
+    eval_classifier_parser = subparsers.add_parser("eval-cl")
+    eval_classifier_parser.add_argument("emb_model", help="Path to the embedding model to use")
+    eval_classifier_parser.add_argument("train_conf", help="Path to the training configuration dump")
+    eval_classifier_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"])
+    eval_classifier_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
+    eval_classifier_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.4, type=float)
+    eval_classifier_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
+    eval_classifier_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_masked", type=str)
+    eval_classifier_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=parse_bool)
+    eval_classifier_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
+
+    args = parser.parse_args()
+
+    AppContext.initialize_by_app_name(Clef18Task1Emb2.__name__ + "-" + args.mode)
+
+    clef_data = Clef18Task1Data()
+    dictionary = clef_data.read_dictionary_by_language(args.lang)
+
+    certificates = clef_data.read_train_certifcates_by_language(args.lang)
+
+    ft_embeddings = FastTextEmbeddings()
+    ft_model = ft_embeddings.load_embeddings_by_language(args.lang)
+
+    clef18_task1 = Clef18Task1Emb2()
+    clef18_task1.save_arguments(args)
+
+    if args.mode == "train-emb2":
+        configuration = clef18_task1.prepare_data_set(certificates, dictionary, ft_model, args.train_ratio, args.val_ratio, args.strat_column,
+                                                      args.samples, args.strat_splits)
+        clef18_task1.save_configuration(configuration)
+
+        neg_sampling = NegativeSampling()
+        neg_sampling_strategy = neg_sampling.get_strategy_by_name(args.neg_sampling, args)
+
+        embedding_model = clef18_task1.train_embedding_model(configuration, ft_model, neg_sampling_strategy, args.epochs, args.batch_size)
+
+    elif args.mode == "eval-cl":
+        configuration = clef18_task1.reload_configuration(args.train_conf)
+        embedding_model = clef18_task1.reload_embedding_model(args.emb_model)
+
+    eval_result = clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels)
+    clef18_task1.save_evaluation_results(eval_result)
+
+
+
diff --git a/code_mario/dnn_classifiers.py b/code_mario/dnn_classifiers.py
index e4a4b25ac5802af1042aabc58318478634ef0d74..b19efb6fdcba20bcf503dd337367db12223a3db6 100644
--- a/code_mario/dnn_classifiers.py
+++ b/code_mario/dnn_classifiers.py
@@ -12,14 +12,14 @@ from keras_extension import ExtendedKerasClassifier
 class NeuralNetworkClassifiers(object):
 
     @staticmethod
-    def dense_network(input_size: int, target_classes: int, hidden_layer_sizes: List[int], batch_normalization: bool,
+    def dense_network(input_dim: int, output_dim: int, hidden_layer_sizes: List[int], batch_normalization: bool,
                       dropout_rate: float, epochs: int, batch_size: int, callbacks: List = None):
         def _build_model():
             model = Sequential()
 
             for i, layer_size in enumerate(hidden_layer_sizes):
                 if i == 0:
-                    model.add(Dense(layer_size, input_shape=(input_size,), kernel_initializer=VarianceScaling(), activation="selu"))
+                    model.add(Dense(layer_size, input_shape=(input_dim,), kernel_initializer=VarianceScaling(), activation="selu"))
                 else:
                     model.add(Dense(layer_size, kernel_initializer=VarianceScaling(), activation="selu"))
 
@@ -29,7 +29,7 @@ class NeuralNetworkClassifiers(object):
                 if dropout_rate and dropout_rate > 0.0:
                     model.add(Dropout(dropout_rate))
 
-            model.add(Dense(target_classes, activation="softmax"))
+            model.add(Dense(output_dim, activation="softmax"))
             model.compile(optimizer=Adam(), loss="sparse_categorical_crossentropy", metrics=['accuracy'])
 
             return model
diff --git a/code_mario/init.py b/code_mario/init.py
new file mode 100644
index 0000000000000000000000000000000000000000..71c6dc4e493672d8f0997d3dafa210fc810ccc77
--- /dev/null
+++ b/code_mario/init.py
@@ -0,0 +1,42 @@
+# Seeds the random number generators of Python, NumPy and TensorFlow and installs a single-threaded Keras session.
+import numpy as np
+import tensorflow as tf
+import random as rn
+
+# Fix the seed of Python's hash randomization (relevant from Python 3.2.3 onwards)
+# to get reproducible behavior for certain hash-based operations.
+# NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so setting it here presumably
+# only reaches child processes - confirm and export it before launching Python if needed. See:
+# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED
+# https://github.com/keras-team/keras/issues/2280#issuecomment-306959926
+import os
+os.environ['PYTHONHASHSEED'] = '0'
+
+# The below is necessary for starting NumPy generated random numbers
+# in a well-defined initial state.
+np.random.seed(42)
+
+# The below is necessary for starting core Python generated random numbers
+# in a well-defined state.
+
+rn.seed(12345)
+
+# Force TensorFlow to use a single thread.
+# Multiple threads are a potential source of
+# non-reproducible results.
+# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res
+
+session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+
+from keras import backend as K
+
+# The below tf.set_random_seed() will make random number generation
+# in the TensorFlow backend have a well-defined initial state.
+# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed
+
+tf.set_random_seed(1234)
+
+sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
+K.set_session(sess)
+
+print("Initialized all random seeds!")
diff --git a/code_mario/keras_extension.py b/code_mario/keras_extension.py
index 33d4116fc34883bb017293d5d0fe997901f23594..81cbb644158f1b441074a9e920d34873b2b2b58b 100644
--- a/code_mario/keras_extension.py
+++ b/code_mario/keras_extension.py
@@ -2,7 +2,7 @@ import os
 import keras as k
 
 from logging import Logger
-from keras.callbacks import Callback, ModelCheckpoint
+from keras.callbacks import Callback, ModelCheckpoint, CSVLogger, EarlyStopping
 from keras.wrappers.scikit_learn import KerasClassifier
 
 from app_context import AppContext
@@ -13,13 +13,33 @@ class KerasUtil(object):
 
     @staticmethod
     def best_model_checkpointing_by_model_name(model_name: str, monitor_loss: str = "loss"):
-        best_model_file = os.path.join(AppContext.default().output_dir, "%s_best.h5" % model_name)
+        models_dir = os.path.join(AppContext.default().output_dir, "models")
+        os.makedirs(models_dir, exist_ok=True)
+
+        best_model_file = os.path.join(models_dir, "%s_best.h5" % model_name)
         return ModelCheckpoint(filepath=best_model_file, monitor=monitor_loss, save_best_only=True, verbose=1)
 
     @staticmethod
     def best_model_checkpointing_by_file_path(best_model_file: str, monitor_loss: str = "loss"):
         return ModelCheckpoint(filepath=best_model_file, monitor=monitor_loss, save_best_only=True, verbose=1)
 
+    @staticmethod
+    def early_stopping(monitor_loss: str, patience: int):  # verbose EarlyStopping callback on `monitor_loss`
+        return EarlyStopping(monitor=monitor_loss, patience=patience, verbose=1)
+
+    @staticmethod
+    def csv_logging_callback(model_name: str, label: str):
+        """Create a CSVLogger appending to `<log_dir>/train_logs/<model_name>_<label>.log` (";" separated)."""
+        train_log_dir = os.path.join(AppContext.default().log_dir, "train_logs")
+        try:
+            os.makedirs(train_log_dir, exist_ok=True)
+        except OSError:
+            # Best effort: the callback is returned nevertheless, but unrelated errors are no longer swallowed
+            print("Can't create train log directory: " + train_log_dir)
+
+        training_log_file = os.path.join(train_log_dir, "%s_%s.log" % (model_name, label))
+        return CSVLogger(training_log_file, separator=";", append=True)
+
 
 class LoggerCallback(Callback):
 
@@ -53,7 +73,7 @@ class ExtendedKerasClassifier(KerasClassifier, LoggingMixin):
                 checkpoint_callbacks = [callback for callback in self.sk_params["callbacks"]
                                         if isinstance(callback, ModelCheckpoint) and callback.save_best_only]
                 if checkpoint_callbacks:
-                    self.logger.info("Reloading model from %s", checkpoint_callbacks[0].filepath)
+                    #self.logger.debug("Reloading model from %s", checkpoint_callbacks[0].filepath)
                     self.model = k.models.load_model(checkpoint_callbacks[0].filepath)
                     self.re_fitted = False
                 else:
@@ -61,9 +81,10 @@ class ExtendedKerasClassifier(KerasClassifier, LoggingMixin):
             else:
                 self.logger.debug("Can't find callbacks parameter. No callbacks configured?")
         else:
-            self.logger.debug("Model wasn't re-fitted -> re-using existing model")
+            #self.logger.debug("Model wasn't re-fitted -> re-using existing model")
             pass
-        self.logger.info("Classifer has %s classes", len(self.classes_))
+
+        #self.logger.debug("Classifer has %s classes", len(self.classes_))
         return super(ExtendedKerasClassifier, self).predict(x, **kwargs)
 
     def __getstate__(self):