diff --git a/code_jurica/_layers.py b/code_jurica/_layers.py index 0f1896c4f77334274b6b173578dbc77f80461834..e5befe19d3255609af50068743d1305de1e78c5a 100644 --- a/code_jurica/_layers.py +++ b/code_jurica/_layers.py @@ -41,7 +41,6 @@ class SwishBeta(Layer): base_config = super(SwishBeta, self).get_config() return dict(list(base_config.items()) + list(config.items())) - class Length(layers.Layer): """ Compute the length of vectors. This is used to compute a Tensor that has the same shape with y_true in margin_loss. @@ -56,7 +55,6 @@ class Length(layers.Layer): def compute_output_shape(self, input_shape): return input_shape[:-1] - class Mask(layers.Layer): """ Mask a Tensor with shape=[None, num_capsule, dim_vector] either by the capsule with max length or by an additional @@ -95,7 +93,6 @@ class Mask(layers.Layer): else: # no true label provided return tuple([None, input_shape[1] * input_shape[2]]) - def squash(vectors, axis=-1): """ The non-linear activation used in Capsule. It drives the length of a large vector to near 1 and small vector to 0 @@ -107,7 +104,6 @@ def squash(vectors, axis=-1): scale = s_squared_norm / (1 + s_squared_norm) / K.sqrt(s_squared_norm + K.epsilon()) return scale * vectors - class CapsuleLayer(layers.Layer): """ The capsule layer. It is similar to Dense layer. Dense layer has `in_num` inputs, each is a scalar, the output of the @@ -190,7 +186,6 @@ class CapsuleLayer(layers.Layer): def compute_output_shape(self, input_shape): return tuple([None, self.num_capsule, self.dim_capsule]) - def PrimaryCap(inputs, dim_capsule, n_channels, kernel_size, strides, padding): """ Apply Conv2D `n_channels` times and concatenate all capsules @@ -204,18 +199,6 @@ def PrimaryCap(inputs, dim_capsule, n_channels, kernel_size, strides, padding): outputs = layers.Reshape(target_shape=[-1, dim_capsule], name='')(output) return layers.Lambda(squash, name='')(outputs) - -""" -# The following is another way to implement primary capsule layer. This is much slower. -# Apply Conv2D `n_channels` times and concatenate all capsules -def PrimaryCap(inputs, dim_capsule, n_channels, kernel_size, strides, padding): - outputs = [] - for _ in range(n_channels): - output = layers.Conv2D(filters=dim_capsule, kernel_size=kernel_size, strides=strides, padding=padding)(inputs) - outputs.append(layers.Reshape([output.get_shape().as_list()[1] ** 2, dim_capsule])(output)) - outputs = layers.Concatenate(axis=1)(outputs) - return layers.Lambda(squash)(outputs) -""" def dot_product(x, kernel): """ Wrapper for dot product operation, in order to be compatible with both @@ -317,96 +300,82 @@ class Attention(Layer): class AttentionWithContext(Layer): """ - Attention operation, with a context/query vector, for temporal data. - Supports Masking. - Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf] - "Hierarchical Attention Networks for Document Classification" - by using a context vector to assist the attention - # Input shape - 3D tensor with shape: `(samples, steps, features)`. - # Output shape - 2D tensor with shape: `(samples, features)`. - - How to use: - Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True. - The dimensions are inferred based on the output shape of the RNN. - - Note: The layer has been tested with Keras 2.0.6 - - Example: - model.add(LSTM(64, return_sequences=True)) - model.add(AttentionWithContext()) - # next add a Dense layer (for classification/regression) or whatever... 
- """ - - def __init__(self, - W_regularizer=None, u_regularizer=None, b_regularizer=None, - W_constraint=None, u_constraint=None, b_constraint=None, - bias=True, **kwargs): + Attention operation, with a context/query vector, for temporal data. + Supports Masking. + Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf] + "Hierarchical Attention Networks for Document Classification" + by using a context vector to assist the attention + # Input shape + 3D tensor with shape: `(samples, steps, features)`. + # Output shape + 2D tensor with shape: `(samples, features)`. + :param kwargs: + Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True. + The dimensions are inferred based on the output shape of the RNN. + Example: + model.add(LSTM(64, return_sequences=True)) + model.add(AttentionWithContext()) + """ + def __init__(self, init='glorot_uniform', kernel_regularizer=None, bias_regularizer=None, kernel_constraint=None, bias_constraint=None, **kwargs): self.supports_masking = True - self.init = initializers.get('glorot_uniform') + self.init = initializers.get(init) + self.kernel_initializer = initializers.get('glorot_uniform') - self.W_regularizer = regularizers.get(W_regularizer) - self.u_regularizer = regularizers.get(u_regularizer) - self.b_regularizer = regularizers.get(b_regularizer) + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) - self.W_constraint = constraints.get(W_constraint) - self.u_constraint = constraints.get(u_constraint) - self.b_constraint = constraints.get(b_constraint) + self.kernel_constraint = constraints.get(kernel_constraint) + self.bias_constraint = constraints.get(bias_constraint) - self.bias = bias super(AttentionWithContext, self).__init__(**kwargs) def build(self, input_shape): - assert len(input_shape) == 3 - - self.W = self.add_weight((input_shape[-1], input_shape[-1],), - initializer=self.init, + self.kernel = self.add_weight((input_shape[-1], 1), + initializer=self.kernel_initializer, name='{}_W'.format(self.name), - regularizer=self.W_regularizer, - constraint=self.W_constraint) - if self.bias: - self.b = self.add_weight((input_shape[-1],), - initializer='zero', - name='{}_b'.format(self.name), - regularizer=self.b_regularizer, - constraint=self.b_constraint) - - self.u = self.add_weight((input_shape[-1],), - initializer=self.init, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint) + self.b = self.add_weight((input_shape[1],), + initializer='zero', + name='{}_b'.format(self.name), + regularizer=self.bias_regularizer, + constraint=self.bias_constraint) + + self.u = self.add_weight((input_shape[1],), + initializer=self.kernel_initializer, name='{}_u'.format(self.name), - regularizer=self.u_regularizer, - constraint=self.u_constraint) - - super(AttentionWithContext, self).build(input_shape) + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint) + self.built = True - def compute_mask(self, input, input_mask=None): - # do not pass the mask to the next layers + def compute_mask(self, input, mask): return None def call(self, x, mask=None): - uit = dot_product(x, self.W) + # (x, 40, 300) x (300, 1) + multData = K.dot(x, self.kernel) # (x, 40, 1) + multData = K.squeeze(multData, -1) # (x, 40) + multData = multData + self.b # (x, 40) + (40,) - if self.bias: - uit += self.b - - uit = K.tanh(uit) - # ait = K.dot(uit, self.u) - ait = dot_product(uit, self.u) + multData = K.tanh(multData) # (x, 40) - 
a = K.exp(ait) + multData = multData * self.u # (x, 40) * (40, 1) => (x, 1) + multData = K.exp(multData) # (X, 1) # apply mask after the exp. will be re-normalized next if mask is not None: - # Cast the mask to floatX to avoid float64 upcasting in theano - a *= K.cast(mask, K.floatx()) - - a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) - - a = K.expand_dims(a) - weighted_input = x * a + mask = K.cast(mask, K.floatx()) #(x, 40) + multData = mask*multData #(x, 40) * (x, 40, ) + + # in some cases especially in the early stages of training the sum may be almost zero + # and this results in NaN's. A workaround is to add a very small positive number ε to the sum. + # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx()) + multData /= K.cast(K.sum(multData, axis=1, keepdims=True) + K.epsilon(), K.floatx()) + multData = K.expand_dims(multData) + weighted_input = x * multData return K.sum(weighted_input, axis=1) + def compute_output_shape(self, input_shape): - return input_shape[0], input_shape[-1] \ No newline at end of file + return (input_shape[0], input_shape[-1],) \ No newline at end of file diff --git a/code_jurica/classificationICD10_attention.py b/code_jurica/classificationICD10_attention.py index 6ee8659ca5be54efe58d271e8cd65d54af81356f..ffaf572167c49c503a49f8d4534c7d20a20270c7 100644 --- a/code_jurica/classificationICD10_attention.py +++ b/code_jurica/classificationICD10_attention.py @@ -104,7 +104,8 @@ encoded_Y = encoder.transform(labels) # convert integers to dummy variables (i.e. one hot encoded) labels_one_hot = np_utils.to_categorical(encoded_Y) -X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.02, random_state=777, stratify=labels) +X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, + test_size=0.02, random_state=777, stratify=labels) print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test)) try: diff --git a/code_jurica/loader.py b/code_jurica/loader.py index 99b7e970ddfd7b781492f89b78921a9b18f3ac9e..d8f80768412d757041d03e321a3938713be79d67 100644 --- a/code_jurica/loader.py +++ b/code_jurica/loader.py @@ -39,6 +39,7 @@ except Exception as e: # corpora=FR_sample+HU_sample+IT_sample corpora=frCorpora+itCorpora+huCorpora +corpora=corpora[:10000] # print(len(corpora)) # corpora=corpora[:int(len(corpora)*0.5)] # print(len(corpora)) diff --git a/code_jurica/multiTaskEmbeddings.py b/code_jurica/multiTaskEmbeddings.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code_jurica/seq2seq_attention.py b/code_jurica/seq2seq_attention.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d3688edb53cd64e07a81abec249b01713620d834 100644 --- a/code_jurica/seq2seq_attention.py +++ b/code_jurica/seq2seq_attention.py @@ -0,0 +1,239 @@ +# from comet_ml import Experiment +# experiment = Experiment(api_key="hSd9vTj0EfMu72569YnVEvtvj") + +from loader import * +from _layers import AttentionWithContext, Attention +from keras.models import Model, load_model as keras_load_model +from keras.layers import Input, LSTM, Dense, Embedding, GRU, Activation, dot, concatenate, Bidirectional, TimeDistributed +from keras.utils import multi_gpu_model +from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger +import tensorflow as tf +import tqdm + +import pickle +from sklearn.metrics import classification_report + +#REPRODUCIBLE +np.random.seed(42) +import random +random.seed(12345) +import os 
+os.environ['PYTHONHASHSEED'] = '0' + +import tensorflow as tf +# config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) +config = tf.ConfigProto() +from keras import backend as K +tf.set_random_seed(1234) +#REPRODUCIBLE + + +################################### +# TensorFlow wizardry +# Don't pre-allocate memory; allocate as-needed +config.gpu_options.allow_growth = True +config.gpu_options.allocator_type = 'BFC' +sess = tf.Session(graph=tf.get_default_graph(), config=config) +K.set_session(sess) + +# LOAD ICD 10 CLASSIFICATION MODEL +try: + icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5', + custom_objects={'Attention':Attention}) +except OSError: + from classificationICD10 import * + icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5') + +with open('models/icd10_tokenizer_extended.p', 'rb') as handle: + icd10Tokenizer = pickle.load(handle) + +with open('models/icd10_mappings_extended.p', 'rb') as handle: + encoded_Y = pickle.load(handle) +# LOAD ICD 10 CLASSIFICATION MODEL + +callbacks_list = [ + EarlyStopping( + monitor='val_loss', + patience=2, + # min_delta=0.001 + ), + ModelCheckpoint( + filepath='models/s2s_att_extended.h5', + monitor='val_loss', + save_best_only=True, + ), + CSVLogger( + append=False, + filename='logs/s2s_att_extended_{}.csv'.format(date_label) + ) +] + +latent_dim = 256 +batch_size = 400 +epochs = 1 + +train_data_generator = KerasBatchGenerator(batch_size, + source_train, + source_max_sequence_tokenizer, + source_kerasTokenizer, + target_train, + target_max_sequence_tokenizer, + target_kerasTokenizer + ) + +validation_data_generator = KerasBatchGenerator(batch_size, + source_val, + source_max_sequence_tokenizer, + source_kerasTokenizer, + target_val, + target_max_sequence_tokenizer, + target_kerasTokenizer + ) + +print("Lets train some stuff!") +# Define an input sequence and process it. +encoder_input = Input(shape=(source_max_sequence_tokenizer, )) +x = source_embedding_layer(encoder_input) +encoder_out, state_h, state_c = LSTM(latent_dim, return_sequences=True, unroll=True, return_state=True)(x) +encoder_states = [state_h, state_c] + +# Set up the decoder, using `encoder_states` as initial state. +decoder_input = Input(shape=(target_max_sequence_tokenizer, )) +x_decode = target_embedding_layer(decoder_input) +decoder_LSTM = LSTM(latent_dim, return_sequences=True, return_state = True, unroll=True) +decoder, state_h_decode , state_c_decode = decoder_LSTM(x_decode, initial_state=encoder_states) + +# Equation (7) with 'dot' score from Section 3.1 in the paper. 
+# Note that we reuse Softmax-activation layer instead of writing tensor calculation +attention = dot([encoder_out, decoder], axes=[2, 2]) +attention = Activation('softmax')(attention) +context = dot([attention, encoder_out], axes=[1,1]) +decoder_combined_context = concatenate([context, decoder]) +print(decoder_combined_context) + +decoder_dense = Dense(len(target_vocab)+1, activation='softmax') +decoder_out = decoder_dense(decoder_combined_context) # equation (6) of the paper + +# MODEL +model = Model([encoder_input, decoder_input], decoder_out) +model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy']) +model.summary() +# model.fit([source_train, target_train], +# target_train_onehot, +# batch_size=batch_size, +# callbacks=callbacks_list, +# epochs=epochs, +# validation_split=0.1 +# # validation_data=([source_val, target_val], target_val_onehot) +# ) + +model.fit_generator( + generator=train_data_generator.generate_data(), + steps_per_epoch=int(len(source_train)/batch_size)+1, + epochs=epochs, + callbacks=callbacks_list, + validation_data=validation_data_generator.generate_data(), + validation_steps=int(len(source_val)/batch_size)+1, + # use_multiprocessing=True, + # workers=10 +) + +# INFERENCE MODELS +# Encoder inference model +encoder_model_inf = Model(encoder_input, encoder_states) + +# Decoder inference model +decoder_state_input_h = Input(shape=(256,)) +decoder_state_input_c = Input(shape=(256,)) +decoder_input_states = [decoder_state_input_h, decoder_state_input_c] + +decoder, decoder_h, decoder_c = decoder_LSTM(x_decode, initial_state=decoder_input_states) +decoder_states = [decoder_h , decoder_c] + +attention = dot([encoder_out, decoder], axes=[2, 2]) +attention = Activation('softmax')(attention) +context = dot([attention, encoder_out], axes=[1,1]) + +print(context, decoder) +decoder_combined_context = concatenate([context, decoder]) +print('decoder_combined_context\t', decoder_combined_context) + +decoder_out = decoder_dense(decoder_combined_context) +decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states, + outputs=[decoder_out] + decoder_states ) + +def decode_seq(inp_seq): + + states_val = encoder_model_inf.predict(inp_seq) + print('states_val\t', states_val) + input('inference encoder prediction\t') + + target_seq = np.zeros((1, target_max_sequence_tokenizer)) + target_seq[0, 0] = target_vocab['sos'] + + translated_sent = [] + translated_index = [] + stop_condition = False + + while not stop_condition: + + decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val) + max_val_index = np.argmax(decoder_out[0, -1, :]) + + try: + sampled_fra_char = target_index_to_word_dict[max_val_index] + except KeyError: + # stop_condition = True + sampled_fra_char = 'eos' + + translated_sent.append(sampled_fra_char) + translated_index.append(max_val_index) + + if ((sampled_fra_char == 'eos') or (len(translated_sent) > target_max_sequence_tokenizer)): + stop_condition = True + + target_seq = np.zeros((1, target_max_sequence_tokenizer)) + target_seq[0, 0] = max_val_index + states_val = [decoder_h, decoder_c] + + return translated_sent[:-1], translated_index[:-1] + +y_true = [] +y_pred = [] + + +source_val = source_kerasTokenizer.texts_to_sequences(source_val) +source_val = pad_sequences(source_val, maxlen=source_max_sequence_tokenizer, padding='post') + +for seq_index in tqdm.tqdm(range(len(source_val))): +# for seq_index in range(10): + inp_seq = source_val[seq_index:seq_index+1] + translated_sent, translated_index= 
decode_seq(inp_seq) + + # PREDICT ICD10 + source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)]) + word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post') + icd10_code_index = icd10_model.predict(word_sequence) + # print(icd10_code_index, type(icd10_code_index)) + max_val_index = np.argmax(icd10_code_index, axis=1)[0] + # print(max_val_index) + icd10_label = encoded_Y.inverse_transform(max_val_index) + + # print('-') + # target_index = np.trim_zeros(target_val[seq_index], 'b')[1:-1] + # print('Target indexes:', target_index) + # print('Decoded indexes:', translated_index) + # + # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index])) + # print('Decoded sentence:', " ".join(translated_sent)) + # + # print('Target ICD-10:', labels_val[seq_index]) + # print('Predict ICD-10:', icd10_label) + + y_true.append(labels_val[seq_index]) + y_pred.append(icd10_label) + +report = classification_report(y_true, y_pred) +report_df = report_to_df(report) +report_df.to_csv('logs/classification_report_extended.csv') +print(report_df) \ No newline at end of file
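
For reference, the AttentionWithContext docstring in _layers.py points to Yang et al.'s context-vector attention: each timestep of the RNN output is projected, scored against a learned context vector, the scores are softmaxed over time, and the features are summed with those weights. Below is a minimal NumPy sketch of that forward pass; it keeps the original (features x features) weight shape rather than the reshaped weights introduced in this diff, and the toy sizes are illustrative assumptions, not values from this repository.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

steps, features = 40, 300                       # toy sizes, not repository values
x = np.random.randn(steps, features)            # RNN output with return_sequences=True
W = np.random.randn(features, features)         # projection weights
b = np.zeros(features)                          # projection bias
u = np.random.randn(features)                   # learned context vector

uit = np.tanh(x @ W + b)                        # hidden representation per timestep
ait = uit @ u                                   # one attention logit per timestep
a = softmax(ait)                                # attention weights over timesteps
sentence_vector = (x * a[:, None]).sum(axis=0)  # (features,) weighted sum of the input
print(sentence_vector.shape)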
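
The epsilon added in the re-normalization inside AttentionWithContext.call() matters when the mask removes most timesteps early in training: the masked sum of exponentials can be nearly zero, and dividing by it produces NaNs. A small standalone illustration of that masked re-normalization, with toy numbers and eps standing in for K.epsilon():

import numpy as np

eps = 1e-7                                # stands in for K.epsilon()
scores = np.array([1.2, 0.3, -0.5, 0.0])  # toy attention logits, one per timestep
mask = np.array([1.0, 1.0, 0.0, 0.0])     # 1 = real token, 0 = padding

a = np.exp(scores) * mask                 # apply the mask after the exp
a = a / (a.sum() + eps)                   # re-normalize; eps avoids 0/0 -> NaN
print(a)                                  # padded timesteps keep weight exactly 0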
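
The attention wiring in seq2seq_attention.py (dot, softmax, dot, concatenate) scores decoder states against encoder states with a plain dot product. The NumPy sketch below shows that general mechanism on a single unbatched example, with the softmax taken over source positions and toy dimensions chosen only for illustration; the Keras script builds these quantities with layers.dot and Activation('softmax') on batched 3D tensors, so its axis ordering differs from this sketch.

import numpy as np

src_steps, tgt_steps, latent_dim = 5, 4, 256           # toy sizes, not repository values
encoder_out = np.random.randn(src_steps, latent_dim)   # encoder states, return_sequences=True
decoder_out = np.random.randn(tgt_steps, latent_dim)   # decoder states

scores = decoder_out @ encoder_out.T                   # dot score per (target, source) pair
weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
weights /= weights.sum(axis=-1, keepdims=True)         # softmax over source timesteps
context = weights @ encoder_out                        # (tgt_steps, latent_dim) context vectors

# each decoder step is concatenated with its context before the Dense softmax
# over the target vocabulary, as in the model definition in the patch
combined = np.concatenate([context, decoder_out], axis=-1)
print(combined.shape)                                  # (tgt_steps, 2 * latent_dim)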