diff --git a/code_jurica/_layers.py b/code_jurica/_layers.py index 0f1896c4f77334274b6b173578dbc77f80461834..e5befe19d3255609af50068743d1305de1e78c5a 100644 --- a/code_jurica/_layers.py +++ b/code_jurica/_layers.py @@ -41,7 +41,6 @@ class SwishBeta(Layer): base_config = super(SwishBeta, self).get_config() return dict(list(base_config.items()) + list(config.items())) - class Length(layers.Layer): """ Compute the length of vectors. This is used to compute a Tensor that has the same shape with y_true in margin_loss. @@ -56,7 +55,6 @@ class Length(layers.Layer): def compute_output_shape(self, input_shape): return input_shape[:-1] - class Mask(layers.Layer): """ Mask a Tensor with shape=[None, num_capsule, dim_vector] either by the capsule with max length or by an additional @@ -95,7 +93,6 @@ class Mask(layers.Layer): else: # no true label provided return tuple([None, input_shape[1] * input_shape[2]]) - def squash(vectors, axis=-1): """ The non-linear activation used in Capsule. It drives the length of a large vector to near 1 and small vector to 0 @@ -107,7 +104,6 @@ def squash(vectors, axis=-1): scale = s_squared_norm / (1 + s_squared_norm) / K.sqrt(s_squared_norm + K.epsilon()) return scale * vectors - class CapsuleLayer(layers.Layer): """ The capsule layer. It is similar to Dense layer. Dense layer has `in_num` inputs, each is a scalar, the output of the @@ -190,7 +186,6 @@ class CapsuleLayer(layers.Layer): def compute_output_shape(self, input_shape): return tuple([None, self.num_capsule, self.dim_capsule]) - def PrimaryCap(inputs, dim_capsule, n_channels, kernel_size, strides, padding): """ Apply Conv2D `n_channels` times and concatenate all capsules @@ -204,18 +199,6 @@ def PrimaryCap(inputs, dim_capsule, n_channels, kernel_size, strides, padding): outputs = layers.Reshape(target_shape=[-1, dim_capsule], name='')(output) return layers.Lambda(squash, name='')(outputs) - -""" -# The following is another way to implement primary capsule layer. This is much slower. -# Apply Conv2D `n_channels` times and concatenate all capsules -def PrimaryCap(inputs, dim_capsule, n_channels, kernel_size, strides, padding): - outputs = [] - for _ in range(n_channels): - output = layers.Conv2D(filters=dim_capsule, kernel_size=kernel_size, strides=strides, padding=padding)(inputs) - outputs.append(layers.Reshape([output.get_shape().as_list()[1] ** 2, dim_capsule])(output)) - outputs = layers.Concatenate(axis=1)(outputs) - return layers.Lambda(squash)(outputs) -""" def dot_product(x, kernel): """ Wrapper for dot product operation, in order to be compatible with both @@ -317,96 +300,82 @@ class Attention(Layer): class AttentionWithContext(Layer): """ - Attention operation, with a context/query vector, for temporal data. - Supports Masking. - Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf] - "Hierarchical Attention Networks for Document Classification" - by using a context vector to assist the attention - # Input shape - 3D tensor with shape: `(samples, steps, features)`. - # Output shape - 2D tensor with shape: `(samples, features)`. - - How to use: - Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True. - The dimensions are inferred based on the output shape of the RNN. - - Note: The layer has been tested with Keras 2.0.6 - - Example: - model.add(LSTM(64, return_sequences=True)) - model.add(AttentionWithContext()) - # next add a Dense layer (for classification/regression) or whatever... 
- """ - - def __init__(self, - W_regularizer=None, u_regularizer=None, b_regularizer=None, - W_constraint=None, u_constraint=None, b_constraint=None, - bias=True, **kwargs): + Attention operation, with a context/query vector, for temporal data. + Supports Masking. + Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf] + "Hierarchical Attention Networks for Document Classification" + by using a context vector to assist the attention + # Input shape + 3D tensor with shape: `(samples, steps, features)`. + # Output shape + 2D tensor with shape: `(samples, features)`. + :param kwargs: + Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True. + The dimensions are inferred based on the output shape of the RNN. + Example: + model.add(LSTM(64, return_sequences=True)) + model.add(AttentionWithContext()) + """ + def __init__(self, init='glorot_uniform', kernel_regularizer=None, bias_regularizer=None, kernel_constraint=None, bias_constraint=None, **kwargs): self.supports_masking = True - self.init = initializers.get('glorot_uniform') + self.init = initializers.get(init) + self.kernel_initializer = initializers.get('glorot_uniform') - self.W_regularizer = regularizers.get(W_regularizer) - self.u_regularizer = regularizers.get(u_regularizer) - self.b_regularizer = regularizers.get(b_regularizer) + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) - self.W_constraint = constraints.get(W_constraint) - self.u_constraint = constraints.get(u_constraint) - self.b_constraint = constraints.get(b_constraint) + self.kernel_constraint = constraints.get(kernel_constraint) + self.bias_constraint = constraints.get(bias_constraint) - self.bias = bias super(AttentionWithContext, self).__init__(**kwargs) def build(self, input_shape): - assert len(input_shape) == 3 - - self.W = self.add_weight((input_shape[-1], input_shape[-1],), - initializer=self.init, + self.kernel = self.add_weight((input_shape[-1], 1), + initializer=self.kernel_initializer, name='{}_W'.format(self.name), - regularizer=self.W_regularizer, - constraint=self.W_constraint) - if self.bias: - self.b = self.add_weight((input_shape[-1],), - initializer='zero', - name='{}_b'.format(self.name), - regularizer=self.b_regularizer, - constraint=self.b_constraint) - - self.u = self.add_weight((input_shape[-1],), - initializer=self.init, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint) + self.b = self.add_weight((input_shape[1],), + initializer='zero', + name='{}_b'.format(self.name), + regularizer=self.bias_regularizer, + constraint=self.bias_constraint) + + self.u = self.add_weight((input_shape[1],), + initializer=self.kernel_initializer, name='{}_u'.format(self.name), - regularizer=self.u_regularizer, - constraint=self.u_constraint) - - super(AttentionWithContext, self).build(input_shape) + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint) + self.built = True - def compute_mask(self, input, input_mask=None): - # do not pass the mask to the next layers + def compute_mask(self, input, mask): return None def call(self, x, mask=None): - uit = dot_product(x, self.W) + # (x, 40, 300) x (300, 1) + multData = K.dot(x, self.kernel) # (x, 40, 1) + multData = K.squeeze(multData, -1) # (x, 40) + multData = multData + self.b # (x, 40) + (40,) - if self.bias: - uit += self.b - - uit = K.tanh(uit) - # ait = K.dot(uit, self.u) - ait = dot_product(uit, self.u) + multData = K.tanh(multData) # (x, 40) - 
a = K.exp(ait) + multData = multData * self.u # (x, 40) * (40, 1) => (x, 1) + multData = K.exp(multData) # (X, 1) # apply mask after the exp. will be re-normalized next if mask is not None: - # Cast the mask to floatX to avoid float64 upcasting in theano - a *= K.cast(mask, K.floatx()) - - a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) - - a = K.expand_dims(a) - weighted_input = x * a + mask = K.cast(mask, K.floatx()) #(x, 40) + multData = mask*multData #(x, 40) * (x, 40, ) + + # in some cases especially in the early stages of training the sum may be almost zero + # and this results in NaN's. A workaround is to add a very small positive number ε to the sum. + # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx()) + multData /= K.cast(K.sum(multData, axis=1, keepdims=True) + K.epsilon(), K.floatx()) + multData = K.expand_dims(multData) + weighted_input = x * multData return K.sum(weighted_input, axis=1) + def compute_output_shape(self, input_shape): - return input_shape[0], input_shape[-1] \ No newline at end of file + return (input_shape[0], input_shape[-1],) \ No newline at end of file diff --git a/code_jurica/classificationICD10_attention.py b/code_jurica/classificationICD10_attention.py index 6ee8659ca5be54efe58d271e8cd65d54af81356f..ffaf572167c49c503a49f8d4534c7d20a20270c7 100644 --- a/code_jurica/classificationICD10_attention.py +++ b/code_jurica/classificationICD10_attention.py @@ -104,7 +104,8 @@ encoded_Y = encoder.transform(labels) # convert integers to dummy variables (i.e. one hot encoded) labels_one_hot = np_utils.to_categorical(encoded_Y) -X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.02, random_state=777, stratify=labels) +X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, + test_size=0.02, random_state=777, stratify=labels) print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test)) try: diff --git a/code_jurica/loader.py b/code_jurica/loader.py index 99b7e970ddfd7b781492f89b78921a9b18f3ac9e..d8f80768412d757041d03e321a3938713be79d67 100644 --- a/code_jurica/loader.py +++ b/code_jurica/loader.py @@ -39,6 +39,7 @@ except Exception as e: # corpora=FR_sample+HU_sample+IT_sample corpora=frCorpora+itCorpora+huCorpora +corpora=corpora[:10000] # print(len(corpora)) # corpora=corpora[:int(len(corpora)*0.5)] # print(len(corpora)) diff --git a/code_jurica/multiTaskEmbeddings.py b/code_jurica/multiTaskEmbeddings.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code_jurica/seq2seq_attention.py b/code_jurica/seq2seq_attention.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d3688edb53cd64e07a81abec249b01713620d834 100644 --- a/code_jurica/seq2seq_attention.py +++ b/code_jurica/seq2seq_attention.py @@ -0,0 +1,239 @@ +# from comet_ml import Experiment +# experiment = Experiment(api_key="hSd9vTj0EfMu72569YnVEvtvj") + +from loader import * +from _layers import AttentionWithContext, Attention +from keras.models import Model, load_model as keras_load_model +from keras.layers import Input, LSTM, Dense, Embedding, GRU, Activation, dot, concatenate, Bidirectional, TimeDistributed +from keras.utils import multi_gpu_model +from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger +import tensorflow as tf +import tqdm + +import pickle +from sklearn.metrics import classification_report + +#REPRODUCIBLE +np.random.seed(42) +import random +random.seed(12345) +import os 
+os.environ['PYTHONHASHSEED'] = '0' + +import tensorflow as tf +# config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) +config = tf.ConfigProto() +from keras import backend as K +tf.set_random_seed(1234) +#REPRODUCIBLE + + +################################### +# TensorFlow wizardry +# Don't pre-allocate memory; allocate as-needed +config.gpu_options.allow_growth = True +config.gpu_options.allocator_type = 'BFC' +sess = tf.Session(graph=tf.get_default_graph(), config=config) +K.set_session(sess) + +# LOAD ICD 10 CLASSIFICATION MODEL +try: + icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5', + custom_objects={'Attention':Attention}) +except OSError: + from classificationICD10 import * + icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5') + +with open('models/icd10_tokenizer_extended.p', 'rb') as handle: + icd10Tokenizer = pickle.load(handle) + +with open('models/icd10_mappings_extended.p', 'rb') as handle: + encoded_Y = pickle.load(handle) +# LOAD ICD 10 CLASSIFICATION MODEL + +callbacks_list = [ + EarlyStopping( + monitor='val_loss', + patience=2, + # min_delta=0.001 + ), + ModelCheckpoint( + filepath='models/s2s_att_extended.h5', + monitor='val_loss', + save_best_only=True, + ), + CSVLogger( + append=False, + filename='logs/s2s_att_extended_{}.csv'.format(date_label) + ) +] + +latent_dim = 256 +batch_size = 400 +epochs = 1 + +train_data_generator = KerasBatchGenerator(batch_size, + source_train, + source_max_sequence_tokenizer, + source_kerasTokenizer, + target_train, + target_max_sequence_tokenizer, + target_kerasTokenizer + ) + +validation_data_generator = KerasBatchGenerator(batch_size, + source_val, + source_max_sequence_tokenizer, + source_kerasTokenizer, + target_val, + target_max_sequence_tokenizer, + target_kerasTokenizer + ) + +print("Lets train some stuff!") +# Define an input sequence and process it. +encoder_input = Input(shape=(source_max_sequence_tokenizer, )) +x = source_embedding_layer(encoder_input) +encoder_out, state_h, state_c = LSTM(latent_dim, return_sequences=True, unroll=True, return_state=True)(x) +encoder_states = [state_h, state_c] + +# Set up the decoder, using `encoder_states` as initial state. +decoder_input = Input(shape=(target_max_sequence_tokenizer, )) +x_decode = target_embedding_layer(decoder_input) +decoder_LSTM = LSTM(latent_dim, return_sequences=True, return_state = True, unroll=True) +decoder, state_h_decode , state_c_decode = decoder_LSTM(x_decode, initial_state=encoder_states) + +# Equation (7) with 'dot' score from Section 3.1 in the paper. 
+# Note that we reuse Softmax-activation layer instead of writing tensor calculation +attention = dot([encoder_out, decoder], axes=[2, 2]) +attention = Activation('softmax')(attention) +context = dot([attention, encoder_out], axes=[1,1]) +decoder_combined_context = concatenate([context, decoder]) +print(decoder_combined_context) + +decoder_dense = Dense(len(target_vocab)+1, activation='softmax') +decoder_out = decoder_dense(decoder_combined_context) # equation (6) of the paper + +# MODEL +model = Model([encoder_input, decoder_input], decoder_out) +model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy']) +model.summary() +# model.fit([source_train, target_train], +# target_train_onehot, +# batch_size=batch_size, +# callbacks=callbacks_list, +# epochs=epochs, +# validation_split=0.1 +# # validation_data=([source_val, target_val], target_val_onehot) +# ) + +model.fit_generator( + generator=train_data_generator.generate_data(), + steps_per_epoch=int(len(source_train)/batch_size)+1, + epochs=epochs, + callbacks=callbacks_list, + validation_data=validation_data_generator.generate_data(), + validation_steps=int(len(source_val)/batch_size)+1, + # use_multiprocessing=True, + # workers=10 +) + +# INFERENCE MODELS +# Encoder inference model +encoder_model_inf = Model(encoder_input, encoder_states) + +# Decoder inference model +decoder_state_input_h = Input(shape=(256,)) +decoder_state_input_c = Input(shape=(256,)) +decoder_input_states = [decoder_state_input_h, decoder_state_input_c] + +decoder, decoder_h, decoder_c = decoder_LSTM(x_decode, initial_state=decoder_input_states) +decoder_states = [decoder_h , decoder_c] + +attention = dot([encoder_out, decoder], axes=[2, 2]) +attention = Activation('softmax')(attention) +context = dot([attention, encoder_out], axes=[1,1]) + +print(context, decoder) +decoder_combined_context = concatenate([context, decoder]) +print('decoder_combined_context\t', decoder_combined_context) + +decoder_out = decoder_dense(decoder_combined_context) +decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states, + outputs=[decoder_out] + decoder_states ) + +def decode_seq(inp_seq): + + states_val = encoder_model_inf.predict(inp_seq) + print('states_val\t', states_val) + input('inference encoder prediction\t') + + target_seq = np.zeros((1, target_max_sequence_tokenizer)) + target_seq[0, 0] = target_vocab['sos'] + + translated_sent = [] + translated_index = [] + stop_condition = False + + while not stop_condition: + + decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val) + max_val_index = np.argmax(decoder_out[0, -1, :]) + + try: + sampled_fra_char = target_index_to_word_dict[max_val_index] + except KeyError: + # stop_condition = True + sampled_fra_char = 'eos' + + translated_sent.append(sampled_fra_char) + translated_index.append(max_val_index) + + if ((sampled_fra_char == 'eos') or (len(translated_sent) > target_max_sequence_tokenizer)): + stop_condition = True + + target_seq = np.zeros((1, target_max_sequence_tokenizer)) + target_seq[0, 0] = max_val_index + states_val = [decoder_h, decoder_c] + + return translated_sent[:-1], translated_index[:-1] + +y_true = [] +y_pred = [] + + +source_val = source_kerasTokenizer.texts_to_sequences(source_val) +source_val = pad_sequences(source_val, maxlen=source_max_sequence_tokenizer, padding='post') + +for seq_index in tqdm.tqdm(range(len(source_val))): +# for seq_index in range(10): + inp_seq = source_val[seq_index:seq_index+1] + translated_sent, translated_index= 
decode_seq(inp_seq) + + # PREDICT ICD10 + source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)]) + word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post') + icd10_code_index = icd10_model.predict(word_sequence) + # print(icd10_code_index, type(icd10_code_index)) + max_val_index = np.argmax(icd10_code_index, axis=1)[0] + # print(max_val_index) + icd10_label = encoded_Y.inverse_transform(max_val_index) + + # print('-') + # target_index = np.trim_zeros(target_val[seq_index], 'b')[1:-1] + # print('Target indexes:', target_index) + # print('Decoded indexes:', translated_index) + # + # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index])) + # print('Decoded sentence:', " ".join(translated_sent)) + # + # print('Target ICD-10:', labels_val[seq_index]) + # print('Predict ICD-10:', icd10_label) + + y_true.append(labels_val[seq_index]) + y_pred.append(icd10_label) + +report = classification_report(y_true, y_pred) +report_df = report_to_df(report) +report_df.to_csv('logs/classification_report_extended.csv') +print(report_df) \ No newline at end of file
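
For reference, the AttentionWithContext docstring in _layers.py points to Yang et al.'s context-vector attention: each timestep of the RNN output is projected, scored against a learned context vector, the scores are softmaxed over time, and the features are summed with those weights. Below is a minimal NumPy sketch of that forward pass; it keeps the original (features x features) weight shape rather than the reshaped weights introduced in this diff, and the toy sizes are illustrative assumptions, not values from this repository.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

steps, features = 40, 300                       # toy sizes, not repository values
x = np.random.randn(steps, features)            # RNN output with return_sequences=True
W = np.random.randn(features, features)         # projection weights
b = np.zeros(features)                          # projection bias
u = np.random.randn(features)                   # learned context vector

uit = np.tanh(x @ W + b)                        # hidden representation per timestep
ait = uit @ u                                   # one attention logit per timestep
a = softmax(ait)                                # attention weights over timesteps
sentence_vector = (x * a[:, None]).sum(axis=0)  # (features,) weighted sum of the input
print(sentence_vector.shape)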
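
The epsilon added in the re-normalization inside AttentionWithContext.call() matters when the mask removes most timesteps early in training: the masked sum of exponentials can be nearly zero, and dividing by it produces NaNs. A small standalone illustration of that masked re-normalization, with toy numbers and eps standing in for K.epsilon():

import numpy as np

eps = 1e-7                                # stands in for K.epsilon()
scores = np.array([1.2, 0.3, -0.5, 0.0])  # toy attention logits, one per timestep
mask = np.array([1.0, 1.0, 0.0, 0.0])     # 1 = real token, 0 = padding

a = np.exp(scores) * mask                 # apply the mask after the exp
a = a / (a.sum() + eps)                   # re-normalize; eps avoids 0/0 -> NaN
print(a)                                  # padded timesteps keep weight exactly 0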
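
The attention wiring in seq2seq_attention.py (dot, softmax, dot, concatenate) scores decoder states against encoder states with a plain dot product. The NumPy sketch below shows that general mechanism on a single unbatched example, with the softmax taken over source positions and toy dimensions chosen only for illustration; the Keras script builds these quantities with layers.dot and Activation('softmax') on batched 3D tensors, so its axis ordering differs from this sketch.

import numpy as np

src_steps, tgt_steps, latent_dim = 5, 4, 256           # toy sizes, not repository values
encoder_out = np.random.randn(src_steps, latent_dim)   # encoder states, return_sequences=True
decoder_out = np.random.randn(tgt_steps, latent_dim)   # decoder states

scores = decoder_out @ encoder_out.T                   # dot score per (target, source) pair
weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
weights /= weights.sum(axis=-1, keepdims=True)         # softmax over source timesteps
context = weights @ encoder_out                        # (tgt_steps, latent_dim) context vectors

# each decoder step is concatenated with its context before the Dense softmax
# over the target vocabulary, as in the model definition in the patch
combined = np.concatenate([context, decoder_out], axis=-1)
print(combined.shape)                                  # (tgt_steps, 2 * latent_dim)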