diff --git a/.gitignore b/.gitignore
index 36f0c569634108fe7569127b27ab301f60810a40..be1159776fc2acb9e33eeb00652aa2dd2d344db8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
-.idea/
+*.idea/
 
 **/_env
+**/_logs
 **/embeddings
 
 *.pyc
diff --git a/code_mario/clef18_task1_v2.py b/code_mario/clef18_task1_v2.py
index 253d2fe3db2c254ac5db47d6becee1c4e84f4f00..5ddb39766226d44bf1b6b8b433720c7f77717083 100644
--- a/code_mario/clef18_task1_v2.py
+++ b/code_mario/clef18_task1_v2.py
@@ -1,6 +1,8 @@
 import argparse
 import numpy as np
 import pandas as pd
+import keras as k
+import pickle
 import os
 
 from gensim.models import FastText
@@ -22,7 +24,9 @@ from sklearn.svm import LinearSVC
 from sklearn.tree import DecisionTreeClassifier
 from tqdm import tqdm
 from typing import Tuple, Dict, List
+from sklearn.externals import joblib
 
+import ft_embeddings
 from app_context import AppContext
 from clef18_task1_data import Clef18Task1Data
 from dnn_classifiers import NeuralNetworkClassifiers as nnc
@@ -32,25 +36,37 @@ from keras_extension import KerasUtil as ku
 from util import LoggingMixin
 
 
-class TrainingConfiguration(object):
+class ICD10LabelEncoders(object):
 
-    def __init__(self, train_cert_df: DataFrame, val_cert_df: DataFrame, dict_df: DataFrame,
+    def __init__(self, chapter_encoder: LabelEncoder, section_encoder: LabelEncoder,
+                 subsection_encoder: LabelEncoder, code_encoder: LabelEncoder):
+        self.chapter_encoder = chapter_encoder
+        self.section_encoder = section_encoder
+        self.subsection_encoder = subsection_encoder
+        self.code_encoder = code_encoder
+
+
+class Configuration(object):
+
+    def __init__(self, train_cert_df: DataFrame, val_cert_df: DataFrame, test_cert_df: DataFrame, dict_df: DataFrame,
                  max_cert_length: int, max_dict_length: int, ft_embedding_size: int, label_column: str,
-                 label_encoder: LabelEncoder, keras_tokenizer: Tokenizer):
+                 label_encoders: ICD10LabelEncoders, keras_tokenizer: Tokenizer):
         self.train_cert_df = train_cert_df
         self.val_cert_df = val_cert_df
+        self.test_cert_df = test_cert_df
         self.dict_df = dict_df
         self.max_cert_length = max_cert_length
         self.max_dict_length = max_dict_length
         self.ft_embedding_size = ft_embedding_size
         self.label_column = label_column
-        self.label_encoder = label_encoder
+        self.label_encoders = label_encoders
         self.keras_tokenizer = keras_tokenizer
 
 
 class EvaluationResult(object):
 
-    def __init__(self, classifier_name: str, data_set_name: str, accuracy: float):
+    def __init__(self, target_label: str, classifier_name: str, data_set_name: str, accuracy: float):
+        self.target_label = target_label
         self.classifier_name = classifier_name
         self.data_set_name = data_set_name
         self.accuracy = accuracy
@@ -61,138 +77,201 @@ class Clef18Task1V2(LoggingMixin):
     def __init__(self):
         LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)
 
-    def train_embedding_model(self, train_conf: TrainingConfiguration, ft_model: FastText,
+    def train_embedding_model(self, config: Configuration, ft_model: FastText,
                               neg_samples: int, epochs: int, batch_size: int) -> Model:
         self.logger.info("Start building training pairs")
-        train_pair_data = self.build_pairs(train_conf.train_cert_df, train_conf.dict_df, neg_samples)
+        train_pair_data = self.build_pairs(config.train_cert_df, config.dict_df, neg_samples)
         self.logger.info("Label distribution:\n%s", train_pair_data["Label"].value_counts())
 
         self.logger.info("Start building embedding model")
-        model = self.build_embedding_model(train_conf.keras_tokenizer.word_index, ft_model,
-                                           train_conf.max_cert_length, train_conf.max_dict_length)
+        model = self.build_embedding_model(config.keras_tokenizer.word_index, ft_model,
+                                           config.max_cert_length, config.max_dict_length)
+        model.summary(print_fn=self.logger.info)
 
-        cert_inputs = pad_sequences(train_pair_data["Cert_input"].values, maxlen=train_conf.max_cert_length)
-        dict_inputs = pad_sequences(train_pair_data["Dict_input"].values, maxlen=train_conf.max_dict_length)
+        cert_inputs = pad_sequences(train_pair_data["Cert_input"].values, maxlen=config.max_cert_length)
+        dict_inputs = pad_sequences(train_pair_data["Dict_input"].values, maxlen=config.max_dict_length)
         labels = train_pair_data["Label"].values
 
         self.logger.info("Start training of embedding model")
-        model.fit([cert_inputs, dict_inputs], labels, epochs=epochs, batch_size=batch_size)
+        best_model_file = os.path.join(AppContext.default().output_dir, "embedding_model_best.h5")
 
-        model_file = os.path.join(AppContext.default().output_dir, "model.h5")
+        if config.val_cert_df is not None and len(config.val_cert_df) > 0:
+            self.logger.info("Start creation of validation pairs")
+            val_pair_data = self.build_pairs(config.val_cert_df, config.dict_df, neg_samples)
+
+            val_cert_inputs = pad_sequences(val_pair_data["Cert_input"].values, maxlen=config.max_cert_length)
+            val_dict_inputs = pad_sequences(val_pair_data["Dict_input"].values, maxlen=config.max_dict_length)
+            val_gold_labels = val_pair_data["Label"].values
+
+            model.fit([cert_inputs, dict_inputs], labels, epochs=epochs, batch_size=batch_size,
+                      validation_data=([val_cert_inputs, val_dict_inputs], val_gold_labels),
+                      callbacks=[ku.best_model_checkpointing_by_file_path(best_model_file, "val_loss")])
+        else:
+            model.fit([cert_inputs, dict_inputs], labels, epochs=epochs, batch_size=batch_size,
+                    callbacks=[ku.best_model_checkpointing_by_file_path(best_model_file)])
 
-        self.logger.info("Saving model to %s", model_file)
+        model_file = os.path.join(AppContext.default().output_dir, "embedding_model_last.h5")
+        self.logger.info("Saving last model to %s", model_file)
         model.save(model_file)
 
+        self.logger.info("Reloading best embedding model from %s", best_model_file)
+        model = k.models.load_model(best_model_file)
+
         ## ----------------------------------------------------------------------------------------------------------
 
-        if train_conf.val_cert_df is not None and len(train_conf.val_cert_df) > 0:
+        if config.val_cert_df is not None and len(config.val_cert_df) > 0:
             self.logger.info("Start evaluation of embedding model!")
 
             self.logger.info("Start creation of test pairs")
-            test_pair_data = self.build_pairs(train_conf.val_cert_df, train_conf.dict_df, neg_samples)
+            val_pair_data = self.build_pairs(config.test_cert_df, config.dict_df, neg_samples)
 
-            test_cert_inputs = pad_sequences(test_pair_data["Cert_input"].values, maxlen=train_conf.max_cert_length)
-            test_dict_inputs = pad_sequences(test_pair_data["Dict_input"].values, maxlen=train_conf.max_dict_length)
-            gold_labels = test_pair_data["Label"].values
+            test_cert_inputs = pad_sequences(val_pair_data["Cert_input"].values, maxlen=config.max_cert_length)
+            test_dict_inputs = pad_sequences(val_pair_data["Dict_input"].values, maxlen=config.max_dict_length)
+            test_gold_labels = val_pair_data["Label"].values
 
             self.logger.info("Start prediction of test labels")
             pred_labels = model.predict([test_cert_inputs, test_dict_inputs], verbose=1)
             pred_labels = (pred_labels > 0.5).astype(float)
 
-            f1_value = f1_score(gold_labels, pred_labels)
-            acc_value = accuracy_score(gold_labels, pred_labels)
+            f1_value = f1_score(test_gold_labels, pred_labels)
+            acc_value = accuracy_score(test_gold_labels, pred_labels)
 
             self.logger.info("Result: f1_score= %s | acc_score= %s", f1_value, acc_value)
 
         return model
 
-    def train_and_evaluate_classifiers(self, emb_model: Model, data_set: TrainingConfiguration) -> List[EvaluationResult]:
+    def train_and_evaluate_classifiers(self, emb_model: Model, config: Configuration, target_labels: List) -> List[EvaluationResult]:
         self.logger.info("Start training and evaluation of classifier models")
-        label_column = "ICD10_chapter_encoded"
 
         self.logger.info("Building dictionary embeddings")
         dict_input = emb_model.inputs[1]
         dict_rnn = Model(inputs=dict_input, outputs=emb_model.get_layer("dict_rnn").output, name="Dict-RNN-Model")
-        dict_inputs = pad_sequences(data_set.dict_df["Token_ids"].values, maxlen=data_set.max_dict_length)
-        dict_embeddings = dict_rnn.predict(dict_inputs, verbose=1)
+        dict_inputs = pad_sequences(config.dict_df["Token_ids"].values, maxlen=config.max_dict_length)
+        dict_embeddings = dict_rnn.predict(dict_inputs, verbose=1, batch_size=1)
 
         self.logger.info("Building certificate embeddings")
         cert_input = emb_model.inputs[0]
         cert_rnn = Model(inputs=cert_input, outputs=emb_model.get_layer("cert_rnn").output, name="Cert-RNN-Model")
 
-        train_cert_inputs = pad_sequences(data_set.train_cert_df["Token_ids"].values, maxlen=data_set.max_cert_length)
+        train_cert_inputs = pad_sequences(config.train_cert_df["Token_ids"].values, maxlen=config.max_cert_length)
         train_cert_embeddings = cert_rnn.predict(train_cert_inputs, verbose=1)
-        self.logger.info("Certificate train input shape: %s", train_cert_embeddings.shape)
+        self.logger.info("cert train input shape: %s", train_cert_embeddings.shape)
 
-        val_cert_inputs = pad_sequences(data_set.val_cert_df["Token_ids"].values, maxlen=data_set.max_cert_length)
+        val_cert_inputs = pad_sequences(config.val_cert_df["Token_ids"].values, maxlen=config.max_cert_length)
         val_cert_embeddings = cert_rnn.predict(val_cert_inputs, verbose=1)
-        self.logger.info("Certificate val input shape: %s", val_cert_embeddings.shape)
+        self.logger.info("cert val input shape: %s", val_cert_embeddings.shape)
 
-        num_classes = len(data_set.label_encoder.classes_)
+        test_cert_inputs = pad_sequences(config.test_cert_df["Token_ids"].values, maxlen=config.max_cert_length)
+        test_cert_embeddings = cert_rnn.predict(test_cert_inputs, verbose=1)
+        self.logger.info("cert test input shape: %s", test_cert_embeddings.shape)
 
-        named_classifiers = [
-            ("KNN", KNeighborsClassifier(metric="cosine", n_jobs=6)),
-            ("SGD", SGDClassifier(verbose=1, random_state=42)),
-            ("DT", DecisionTreeClassifier(random_state=42)),
-            ("RF", RandomForestClassifier(verbose=1, random_state=42)),
-            ("LinearSVM", LinearSVC(max_iter=5000, verbose=1, random_state=42)),
-
-            ("DNN-1-200", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], False, 0.0, 50, 2,
-                                              callbacks=[ku.best_model_checkpointing("dnn-1-200")])),
-            ("DNN-1-300", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [300], False, 0.0, 50, 2,
-                                              callbacks=[ku.best_model_checkpointing("dnn-1-300")])),
-            ("DNN-200-BN-DO", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], True, 0.5, 50, 2,
-                                                callbacks=[ku.best_model_checkpointing("dnn-1-200-bn-do50")])),
-            ("DNN-300-BN-DO", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [300], True, 0.5, 50, 2,
-                                                callbacks=[ku.best_model_checkpointing("dnn-1-300-bn-do50")])),
-
-            ("DNN-200-100", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 100], False, 0.0, 1, 2,
-                                              callbacks=[ku.best_model_checkpointing("dnn-200-100")])),
-            ("DNN-200-200", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], False, 0.0, 50, 2,
-                                              callbacks=[ku.best_model_checkpointing("dnn-200-200")])),
-            ("DNN-200-100-BN-DO", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 100], True, 0.5, 50, 2,
-                                                    callbacks=[ku.best_model_checkpointing("dnn-200-100-bn-do50")])),
-            ("DNN-200-200-BN-DO", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], True, 0.5, 50, 2,
-                                                    callbacks=[ku.best_model_checkpointing("dnn-200-200-bn-do50")])),
-
-            # ("Test-DNN-200-BN-DO", nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], True, 0.5, 1, 2,
-            #                                         callbacks=[ku.best_model_checkpointing("dnn-300-bn-do")])),
-
-            ('DU1', DummyClassifier(strategy="stratified")),
-            ('DU2', DummyClassifier(strategy="most_frequent"))
-        ]
+        target_label_configs = self.get_label_configuration(target_labels, config.label_encoders)
 
         test_sets = [
-            #("dict", dict_embeddings, data_set.dict_df[label_column].values),
-            ("cert-train", train_cert_embeddings, data_set.train_cert_df[label_column].values),
-            ("cert-val", val_cert_embeddings, data_set.val_cert_df[label_column].values)
+            #("dict", dict_embeddings, config.dict_df),
+            ("cert-train", train_cert_embeddings, config.train_cert_df),
+            ("cert-val", val_cert_embeddings, config.val_cert_df),
+            ("cert-test", test_cert_embeddings, config.test_cert_df)
+        ]
+
+        named_classifiers = [
+            ("KNN", lambda num_classes: KNeighborsClassifier()),
+            ("KNN-Cos", lambda num_classes: KNeighborsClassifier(metric="cosine")),
+            ("SGD", lambda num_classes: SGDClassifier(verbose=1, random_state=42)),
+            ("DT", lambda num_classes: DecisionTreeClassifier(random_state=42)),
+            ("RF", lambda num_classes: RandomForestClassifier(verbose=1, random_state=42)),
+            ("LinearSVM", lambda num_classes: LinearSVC(max_iter=5000, verbose=1, random_state=42)),
+
+            ("DNN-1-200", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], False, 0.0, 50, 2,
+                                                                callbacks=[ku.best_model_checkpointing_by_model_name("dnn-1-200")])),
+            ("DNN-1-300", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [300], False, 0.0, 50, 2,
+                                                                callbacks=[ku.best_model_checkpointing_by_model_name("dnn-1-300")])),
+            ("DNN-200-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200], True, 0.5, 50, 2,
+                                                                  callbacks=[ku.best_model_checkpointing_by_model_name("dnn-1-200-bn-do50")])),
+            ("DNN-300-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [300], True, 0.5, 50, 2,
+                                                                  callbacks=[ku.best_model_checkpointing_by_model_name("dnn-1-300-bn-do50")])),
+
+            ("DNN-200-100", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 100], False, 0.0, 1, 2,
+                                                                  callbacks=[ku.best_model_checkpointing_by_model_name("dnn-200-100")])),
+            ("DNN-200-200", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], False, 0.0, 50, 2,
+                                                                  callbacks=[ku.best_model_checkpointing_by_model_name("dnn-200-200")])),
+            ("DNN-200-100-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 100], True, 0.5, 50, 2,
+                                                                        callbacks=[ku.best_model_checkpointing_by_model_name("dnn-200-100-bn-do50")])),
+            ("DNN-200-200-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], True, 0.5, 50, 2,
+                                                                        callbacks=[ku.best_model_checkpointing_by_model_name("dnn-200-200-bn-do50")])),
+
+            # ("Test-DNN-200-BN-DO", lambda num_classes: nnc.dense_network(cert_rnn.output.shape[1].value, num_classes, [200, 200], True, 0.5, 1, 4,
+            #                                                               callbacks=[ku.best_model_checkpointing_by_model_name("test-dnn-200-bn-do")])),
+
+            ('DU1', lambda num_classes: DummyClassifier(strategy="stratified")),
+            ('DU2', lambda num_classes: DummyClassifier(strategy="most_frequent"))
         ]
 
+        num_experiments = len(target_label_configs) * len(test_sets) * len(named_classifiers)
+        cur_experiment = 1
+
         results = []
-        for name, classifier in named_classifiers:
-            self.logger.info("Start training of classifier %s", name)
-            classifier.fit(dict_embeddings, data_set.dict_df[label_column].values)
-            classifier.fit(train_cert_embeddings, data_set.train_cert_df[label_column].values)
+        for target_label, target_column, label_encoder in target_label_configs:
+            self.logger.info("Start evaluation experiments with label %s", target_label)
+            num_classes = len(label_encoder.classes_)
+
+            complete_train_data = np.append(dict_embeddings, train_cert_embeddings, axis=0)
+            complete_train_labels = np.append(config.dict_df[target_column].values, config.train_cert_df[target_column].values, axis=0)
+            self.logger.info("Build complete training samples (data: %s, labels: %s)", complete_train_data.shape, complete_train_labels.shape)
+
+            for cl_name, classifier_factory in named_classifiers:
+                self.logger.info("Start training of classifier %s", cl_name)
+                classifier = classifier_factory(num_classes)
+                classifier.fit(complete_train_data, complete_train_labels)
 
-            self.logger.info("Start evaluation of %s", name)
+                classifier_file_name = "cl_{}_{}.model".format(cl_name, target_label).lower()
+                classifier_file = os.path.join(AppContext.default().output_dir, classifier_file_name)
+                try:
+                    joblib.dump(classifier, classifier_file)
+                except Exception:
+                    self.logger.error("Error while saving classifier %s to %s", cl_name, classifier_file)
 
-            for ts_name, inputs, gold_labels in test_sets:
-                self.logger.info("Evaluate data set %s", ts_name)
-                prediction = classifier.predict(inputs)
-                acc_score = accuracy_score(gold_labels, prediction)
+                self.logger.info("Start evaluation of %s", cl_name)
+                for ts_name, inputs, data_frame in test_sets:
+                    gold_labels = data_frame[target_column].values
 
-                self.logger.info("Evaluation result: classifier=%s | data_set=%s | acc_score=%s", name, ts_name, acc_score)
-                results.append(EvaluationResult(name, ts_name, acc_score))
+                    self.logger.info("Evaluate data set %s", ts_name)
+                    prediction = classifier.predict(inputs)
+                    acc_score = accuracy_score(gold_labels, prediction)
+
+                    self.logger.info("Evaluation result: label=%s | classifier=%s | data_set=%s | acc_score=%s",
+                                     target_label, cl_name, ts_name, acc_score)
+                    results.append(EvaluationResult(target_label, cl_name, ts_name, acc_score))
+
+                    self.logger.info("Finished experiment %s out of %s", cur_experiment, num_experiments)
+                    cur_experiment += 1
 
         return results
 
-    def split_train_test(self, certificate_df: DataFrame, test_ratio: float,
+    def get_label_configuration(self, target_labels: List[str], icd10_encoders: ICD10LabelEncoders) -> List:
+        label_configs = []
+
+        for target_label in target_labels:
+            if target_label == "chap" or target_label == "chapter":
+                label_configs.append((target_label, "ICD10_chapter_encoded", icd10_encoders.chapter_encoder))
+            elif target_label == "sect" or target_label == "section":
+                label_configs.append((target_label, "ICD10_section_encoded", icd10_encoders.section_encoder))
+            elif target_label == "subs" or target_label == "subsection":
+                label_configs.append((target_label, "ICD10_subsection_encoded", icd10_encoders.subsection_encoder))
+            elif target_label == "code" or target_label == "icd10":
+                label_configs.append((target_label, "ICD10_encoded", icd10_encoders.code_encoder))
+            else:
+                self.logger.error("Can't create label configuration for label " + target_label)
+
+        return label_configs
+
+    def split_train_test(self, certificate_df: DataFrame, train_size: float,
                          stratified_splits: bool, label_column: str) -> Tuple[DataFrame, DataFrame]:
         if stratified_splits:
-            splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=42)
+            splitter = StratifiedShuffleSplit(n_splits=1, train_size=train_size, random_state=42)
             split = splitter.split(certificate_df, certificate_df[label_column])
         else:
-            splitter = ShuffleSplit(n_splits=1, test_size=test_ratio, random_state=42)
+            splitter = ShuffleSplit(n_splits=1, train_size=train_size, random_state=42)
             split = splitter.split(certificate_df)
 
         for train_indices, test_indices in split:
@@ -238,54 +317,83 @@ class Clef18Task1V2(LoggingMixin):
 
         return model
 
-    def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastText, val_ratio: float,
-                         label_column: str, samples: int=None, stratified_splits: bool=False) -> TrainingConfiguration:
+    def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastText, train_ratio: float, val_ratio: float,
+                         label_column: str, samples: int=None, stratified_splits: bool=False) -> Configuration:
+
         if samples:
             self.logger.info("Sampling %s instances", samples)
             cert_df = cert_df.sample(samples, random_state=42)
 
-        self.logger.info("Splitting certificate lines into train and validation")
-        train_cert_df, val_cert_df = self.split_train_test(cert_df, val_ratio, stratified_splits, label_column)
-        self.logger.info("Finished splitting: train=%s instances, test=%s instances", len(train_cert_df), len(val_cert_df))
-
-        label_encoder = LabelEncoder()
-
-        if label_column == "ICD10_encoded":
-            self.logger.info("Fitting label encoder to ICD10 codes")
-            label_encoder.fit(list([icd10.strip().lower() for icd10 in dict_df["ICD10"].values]) +
-                              list([icd10.strip().lower() for icd10 in cert_df["ICD10"].values]))
-
-        elif label_column == "ICD10_chapter_encoded":
-            self.logger.info("Fitting label encoder to ICD10 chapters")
-            label_encoder.fit(list([icd10.strip().lower()[0] for icd10 in dict_df["ICD10"].values]) +
-                              list([icd10.strip().lower()[0] for icd10 in cert_df["ICD10"].values]))
+        self.logger.info("Splitting certificate lines into train and evaluation data set")
+        train_cert_df, evaluation_cert_df = self.split_train_test(cert_df, train_ratio, stratified_splits, label_column)
+        self.logger.info("Finished splitting: train=%s instances, evaluation=%s instances", len(train_cert_df), len(evaluation_cert_df))
 
-        else:
-            raise AssertionError("Can't encode label column '%s'" % label_column)
-
-        self.logger.info("Found %s distinct labels in training data", len(label_encoder.classes_))
+        self.logger.info("Splitting evaluation data set into validation and test set")
+        val_cert_df, test_cert_df = self.split_train_test(evaluation_cert_df, val_ratio, stratified_splits, label_column)
 
+        label_encoders = self.prepare_label_encoders(dict_df, cert_df)
         keras_tokenizer = Tokenizer(oov_token="<UNK>")
 
         self.logger.info("Start preparation of training cert data (%s instances)", len(train_cert_df))
-        train_cert_df, max_cert_length = self.prepare_certificate_df(train_cert_df, "train", label_encoder, keras_tokenizer)
+        train_cert_df, max_cert_length = self.prepare_certificate_df(train_cert_df, "train", label_encoders, keras_tokenizer)
 
         self.logger.info("Start preparation of validation cert data (%s instances)", len(val_cert_df))
-        val_cert_df, _ = self.prepare_certificate_df(val_cert_df, "validation", label_encoder, keras_tokenizer)
-
-        self.logger.info("Start preparation of dictionary data (%s instances)", len(dict_df))
-        dict_df, max_dict_length = self.prepare_dictionary_df(dict_df, "train", label_encoder, keras_tokenizer)
+        val_cert_df, _ = self.prepare_certificate_df(val_cert_df, "validation", label_encoders, keras_tokenizer)
 
-        return TrainingConfiguration(train_cert_df, val_cert_df, dict_df, max_cert_length, max_dict_length,
-                                     ft_model.vector_size, label_column, label_encoder, keras_tokenizer)
+        self.logger.info("Start preparation of test cert data (%s instances)", len(test_cert_df))
+        test_cert_df, _ = self.prepare_certificate_df(test_cert_df, "test", label_encoders, keras_tokenizer)
 
-    def prepare_certificate_df(self, certificate_df: DataFrame, mode: str, label_encoder: LabelEncoder,
+        self.logger.info("Start preparation of dictionary data (%s instances)", len(dict_df))
+        dict_df, max_dict_length = self.prepare_dictionary_df(dict_df, "train", label_encoders, keras_tokenizer)
+
+        return Configuration(train_cert_df, val_cert_df, test_cert_df, dict_df, max_cert_length, max_dict_length,
+                             ft_model.vector_size, label_column, label_encoders, keras_tokenizer)
+
+    def prepare_label_encoders(self, dict_df: DataFrame, cert_df: DataFrame) -> ICD10LabelEncoders:
+        self.logger.info("Fitting label encoder to ICD10 codes")
+        icd10_code_encoder = LabelEncoder()
+        icd10_code_encoder.fit(list([icd10.strip() for icd10 in dict_df["ICD10"].values]) +
+                          list([icd10.strip() for icd10 in cert_df["ICD10"].values]))
+        self.logger.info("Found %s distinct ICD10 codes within the data set", len(icd10_code_encoder.classes_))
+
+        self.logger.info("Fitting label encoder to ICD10 chapters")
+        icd10_chapter_encoder = LabelEncoder()
+        icd10_chapter_encoder.fit(list([icd10.strip().lower()[0] for icd10 in dict_df["ICD10"].values]) +
+                                  list([icd10.strip().lower()[0] for icd10 in cert_df["ICD10"].values]))
+        self.logger.info("Found %s distinct ICD10 chapters within the data set", len(icd10_chapter_encoder.classes_))
+
+        self.logger.info("Fitting label encoder to ICD10 section")
+        icd10_section_encoder = LabelEncoder()
+        icd10_section_encoder.fit(list([icd10.strip().lower()[0:2] for icd10 in dict_df["ICD10"].values]) +
+                                  list([icd10.strip().lower()[0:2] for icd10 in cert_df["ICD10"].values]))
+        self.logger.info("Found %s distinct ICD10 sections within the data set", len(icd10_section_encoder.classes_))
+
+        self.logger.info("Fitting label encoder to ICD10 subsection")
+        icd10_subsection_encoder = LabelEncoder()
+        icd10_subsection_encoder.fit(list([icd10.strip().lower()[0:3] for icd10 in dict_df["ICD10"].values]) +
+                                     list([icd10.strip().lower()[0:3] for icd10 in cert_df["ICD10"].values]))
+        self.logger.info("Found %s distinct ICD10 subsections within the data set", len(icd10_subsection_encoder.classes_))
+
+        return ICD10LabelEncoders(icd10_chapter_encoder, icd10_section_encoder, icd10_subsection_encoder, icd10_code_encoder)
+
+    def prepare_certificate_df(self, certificate_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders,
                                keras_tokenizer: Tokenizer) -> Tuple[DataFrame, int]:
         certificate_pipeline = Pipeline([
-            ("Encode-ICD10-codes", pdu.encode_labels("ICD10", "ICD10_encoded")),
-
             ("Extract-ICD10-chapter", pdu.extract_icd10_chapter("ICD10", "ICD10_chapter")),
-            ("Encode-ICD10-chapter", pdu.encode_labels("ICD10_chapter", "ICD10_chapter_encoded", label_encoder, False)),
+            ("Encode-ICD10-chapter", pdu.encode_labels("ICD10_chapter", "ICD10_chapter_encoded",
+                                                       icd10_encoders.chapter_encoder, False)),
+
+            ("Extract-ICD10-section", pdu.extract_icd10_section("ICD10", "ICD10_section")),
+            ("Encode-ICD10-section", pdu.encode_labels("ICD10_section", "ICD10_section_encoded",
+                                                       icd10_encoders.section_encoder, False)),
+
+            ("Extract-ICD10-subsection", pdu.extract_icd10_subsection("ICD10", "ICD10_subsection")),
+            ("Encode-ICD10-subsection", pdu.encode_labels("ICD10_subsection", "ICD10_subsection_encoded",
+                                                       icd10_encoders.subsection_encoder, False)),
+
+            ("Clean-ICD10-code", pdu.strip("ICD10")),
+            ("Encode-ICD10-code", pdu.encode_labels("ICD10", "ICD10_encoded",
+                                                     icd10_encoders.code_encoder, False)),
 
             ("LowercaseText", pdu.to_lowercase("RawText")),
             ("TokenizeText", pdu.keras_sequencing("RawText", "Token_ids", keras_tokenizer, (mode == "train")))
@@ -300,13 +408,24 @@ class Clef18Task1V2(LoggingMixin):
 
         return cert_data_prepared, max_length
 
-    def prepare_dictionary_df(self, dictionary_df: DataFrame, mode: str, label_encoder: LabelEncoder,
+    def prepare_dictionary_df(self, dictionary_df: DataFrame, mode: str, icd10_encoders: ICD10LabelEncoders,
                               keras_tokenizer: Tokenizer) -> Tuple[DataFrame, int]:
         dictionary_pipeline = Pipeline([
-            ("Encode-ICD10-codes", pdu.encode_labels("ICD10", "ICD10_encoded")),
-
             ("Extract-ICD10-chapter", pdu.extract_icd10_chapter("ICD10", "ICD10_chapter")),
-            ("Encode-ICD10-chapter", pdu.encode_labels("ICD10_chapter", "ICD10_chapter_encoded", label_encoder, False)),
+            ("Encode-ICD10-chapter", pdu.encode_labels("ICD10_chapter", "ICD10_chapter_encoded",
+                                                       icd10_encoders.chapter_encoder, False)),
+
+            ("Extract-ICD10-section", pdu.extract_icd10_section("ICD10", "ICD10_section")),
+            ("Encode-ICD10-section", pdu.encode_labels("ICD10_section", "ICD10_section_encoded",
+                                                       icd10_encoders.section_encoder, False)),
+
+            ("Extract-ICD10-subsection", pdu.extract_icd10_subsection("ICD10", "ICD10_subsection")),
+            ("Encode-ICD10-subsection", pdu.encode_labels("ICD10_subsection", "ICD10_subsection_encoded",
+                                                       icd10_encoders.subsection_encoder, False)),
+
+            ("Clean-ICD10-code", pdu.strip("ICD10")),
+            ("Encode-ICD10-code", pdu.encode_labels("ICD10", "ICD10_encoded",
+                                                     icd10_encoders.code_encoder, False)),
 
             ("CombineTexts", pdu.combine_texts(["DiagnosisText", "Standardized"], "DictText")),
             ("LowercaseText", pdu.to_lowercase("DictText")),
@@ -366,7 +485,8 @@ class Clef18Task1V2(LoggingMixin):
         result_configurations = [
             ("results.csv", None),
             ("results_by_classifier.csv", lambda r: r.classifier_name),
-            ("results_by_data_set.csv", lambda r: r.data_set_name)
+            ("results_by_data_set.csv", lambda r: r.data_set_name),
+            ("results_by_label.csv", lambda r: r.target_label)
         ]
 
         for file_name, sort_key in result_configurations:
@@ -376,38 +496,97 @@ class Clef18Task1V2(LoggingMixin):
                     eval_results = sorted(eval_results, key=sort_key)
 
                 for r in eval_results:
-                    result_writer.write("%s\t%s\t%s\n" % (r.classifier_name, r.data_set_name, r.accuracy))
+                    result_writer.write("%s\t%s\t%s\t%s\n" % (r.target_label, r.classifier_name, r.data_set_name, r.accuracy))
                 result_writer.close()
 
+    def save_configuration(self, configuration: Configuration):
+        label_encoder_file = os.path.join(AppContext.default().output_dir, "label_encoder.pk")
+        self.logger.info("Saving label encoder to " + label_encoder_file)
+        with open(label_encoder_file, 'wb') as encoder_writer:
+            pickle.dump(configuration.label_encoders, encoder_writer)
+            encoder_writer.close()
+
+        keras_tokenizer_file = os.path.join(AppContext.default().output_dir, "keras_tokenizer.pk")
+        self.logger.info("Saving keras sequencer to " + keras_tokenizer_file)
+        with open(keras_tokenizer_file, 'wb') as keras_sequencer_writer:
+            pickle.dump(configuration.keras_tokenizer, keras_sequencer_writer)
+            keras_sequencer_writer.close()
+
+        configuration_file = os.path.join(AppContext.default().output_dir, "configuration.pk")
+        self.logger.info("Saving configuration to " + configuration_file)
+        with open(configuration_file, 'wb') as train_conf_writer:
+            pickle.dump(configuration, train_conf_writer)
+            train_conf_writer.close()
+
+    def reload_configuration(self, file_path: str):
+        self.logger.info("Reloading configuration from " + file_path)
+        with open(file_path, 'rb') as train_conf_reader:
+            configuration = pickle.load(train_conf_reader)
+            train_conf_reader.close()
+
+        return configuration
+
+    def reload_embedding_model(self, emb_model_file: str):
+        self.logger.info("Reloading embedding model from " + emb_model_file)
+        return k.models.load_model(emb_model_file)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(prog="CLEF2018")
-    parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
-    parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
-    parser.add_argument("--val_ratio", help="Ratio of validation samples to use", default=0.2, type=float)
-    parser.add_argument("--neg_samples", help="Number of negative samples for each pair to use", default=75, type=int)
-    parser.add_argument("--train_samples", help="Number of instances to sample from the training data", default=None, type=int)
-    parser.add_argument("--label_column", help="Column used to train the models", default="ICD10_chapter_encoded", type=str)
-    parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool)
+    subparsers = parser.add_subparsers(dest="mode")
+
+    train_emb_parser = subparsers.add_parser("train-emb")
+    train_emb_parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
+    train_emb_parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
+    train_emb_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
+    train_emb_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.4, type=float)
+    train_emb_parser.add_argument("--neg_samples", help="Number of negative samples for each pair to use", default=75, type=int)
+    train_emb_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
+    train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_encoded", type=str)
+    train_emb_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, action='store_true')
+    train_emb_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
+
+    eval_classifier_parser = subparsers.add_parser("eval-cl")
+    eval_classifier_parser.add_argument("emb_model", help="Path to the embedding model to use")
+    eval_classifier_parser.add_argument("train_conf", help="Path to the training configuration dump")
+    eval_classifier_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
+    eval_classifier_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.4, type=float)
+    eval_classifier_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
+    eval_classifier_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_encoded", type=str)
+    eval_classifier_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, action='store_true')
+    eval_classifier_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
 
     args = parser.parse_args()
 
-    AppContext.initialize_by_app_name("eCLEF2018-Task1")
+    AppContext.initialize_by_app_name(args.mode)
 
     clef_data = Clef18Task1Data()
     it_dictionary = clef_data.read_it_dictionary()
     it_certificates = clef_data.read_it_train_certificates()
     it_certificates = clef_data.filter_single_code_lines(it_certificates)
 
+    #sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+    #ft_it_model = FastText(sentences, min_count=1)
+
     ft_embeddings = FastTextEmbeddings()
     ft_it_model = ft_embeddings.load_it_embeddings()
 
     clef18_task1 = Clef18Task1V2()
 
-    data_set = clef18_task1.prepare_data_set(it_certificates, it_dictionary, ft_it_model, args.val_ratio,
-                                             args.label_column, args.train_samples, args.strat_splits)
-    embedding_model = clef18_task1.train_embedding_model(data_set, ft_it_model, args.neg_samples, args.epochs, args.batch_size)
+    if args.mode == "train-emb":
+        configuration = clef18_task1.prepare_data_set(
+              it_certificates, it_dictionary, ft_it_model, args.train_ratio,
+              args.val_ratio, args.strat_column, args.samples, args.strat_splits)
+        clef18_task1.save_configuration(configuration)
+
+        embedding_model = clef18_task1.train_embedding_model(configuration, ft_it_model, args.neg_samples, args.epochs, args.batch_size)
 
-    eval_result = clef18_task1.train_and_evaluate_classifiers(embedding_model, data_set)
+    elif args.mode == "eval-cl":
+        configuration = clef18_task1.reload_configuration(args.train_conf)
+        embedding_model = clef18_task1.reload_embedding_model(args.emb_model)
+
+    eval_result = clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels)
     clef18_task1.save_evaluation_results(eval_result)
 
+
+
diff --git a/code_mario/keras_extension.py b/code_mario/keras_extension.py
index 59a3900c95f91b307167c51eab4cc8ac9c101cd4..33d4116fc34883bb017293d5d0fe997901f23594 100644
--- a/code_mario/keras_extension.py
+++ b/code_mario/keras_extension.py
@@ -12,8 +12,12 @@ from util import LoggingMixin
 class KerasUtil(object):
 
     @staticmethod
-    def best_model_checkpointing(model_name: str, monitor_loss: str = "loss"):
-        best_model_file = os.path.join(AppContext.default().output_dir, "optimal_%s.h5" % model_name)
+    def best_model_checkpointing_by_model_name(model_name: str, monitor_loss: str = "loss"):
+        best_model_file = os.path.join(AppContext.default().output_dir, "%s_best.h5" % model_name)
+        return ModelCheckpoint(filepath=best_model_file, monitor=monitor_loss, save_best_only=True, verbose=1)
+
+    @staticmethod
+    def best_model_checkpointing_by_file_path(best_model_file: str, monitor_loss: str = "loss"):
         return ModelCheckpoint(filepath=best_model_file, monitor=monitor_loss, save_best_only=True, verbose=1)
 
 
@@ -59,7 +63,7 @@ class ExtendedKerasClassifier(KerasClassifier, LoggingMixin):
         else:
             self.logger.debug("Model wasn't re-fitted -> re-using existing model")
             pass
-
+        self.logger.info("Classifier has %s classes", len(self.classes_))
         return super(ExtendedKerasClassifier, self).predict(x, **kwargs)
 
     def __getstate__(self):
diff --git a/code_mario/preprocessing.py b/code_mario/preprocessing.py
index 45fd4e01c2f3ec4b1fcaa94dd7044e42121dd770..7f1f14c36c511430fb19d4ef23e4f4f047a14829 100644
--- a/code_mario/preprocessing.py
+++ b/code_mario/preprocessing.py
@@ -20,6 +20,13 @@ class DataPreparationUtil(object):
 
         return MapFunction(column, _lower)
 
+    @staticmethod
+    def strip(column: str):
+        def _strip(text):
+            return str(text).strip()
+
+        return MapFunction(column, _strip)
+
     @staticmethod
     def tokenize(text_column: str, token_column: str = "tokens"):
         return SimpleTokenizer(text_column, token_column)
@@ -57,6 +64,20 @@ class DataPreparationUtil(object):
 
         return MapFunction(icd10_column, _extract, target_column)
 
+    @staticmethod
+    def extract_icd10_section(icd10_column: str, target_column: str):
+        def _extract(value):
+            return value.strip()[0:2].lower()
+
+        return MapFunction(icd10_column, _extract, target_column)
+
+    @staticmethod
+    def extract_icd10_subsection(icd10_column: str, target_column: str):
+        def _extract(value):
+            return value.strip()[0:3].lower()
+
+        return MapFunction(icd10_column, _extract, target_column)
+
     @staticmethod
     def extract_icd10_subchapter(icd10_column: str, target_column: str):
         def _extract(value):
diff --git a/requirements.txt b/requirements.txt
index f8f9105a42859062355469b8ae9a329179f9b796..1f98a40279acebe576e2230ae1ed2e6d79b82d9e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,4 @@ sklearn
 tensorflow
 tqdm
 h5py
+cython