Commit 48634b43 authored by Jurica Seva

Generating results.

parent 8d1e8f63
@@ -9,6 +9,27 @@ DATA_FR=FR_HOME+'corpus/'
DATA_HU=HU_HOME+'corpus/'
DATA_IT=IT_HOME+'corpus/'
TEST_FR = 'data/test/FR/raw/corpus/'
TEST_IT = 'data/test/IT/raw/corpus/'
TEST_HU = 'data/test/HU/raw/corpus/'
RESULTS_DIR = 'data/results/WBI/'
RESULTS = {
    'FR': {
        'in': TEST_FR + 'CausesBrutes_FR_2015F_1.csv',
        'out': RESULTS_DIR + 'FR/raw/'
    },
    'HU': {
        'in': TEST_HU + 'CausesBrutes_HU_2.csv',
        'out': RESULTS_DIR + 'HU/raw/'
    },
    'IT': {
        'in': TEST_IT + 'CausesBrutes_IT_2.csv',
        'out': RESULTS_DIR + 'IT/raw/'
    }
}
TRAINING = {
    'FR': {
        'CB': [
......
The task was solved using two Deep Learning models: 1) a sequence-to-sequence model with attention and 2) a separate classification model (a sketch of the two-stage pipeline follows below). The models were trained using only the data provided by the organizers, mostly because we were unable to obtain ICD-10 dictionaries for the selected languages.
\ No newline at end of file
One postdoc and one PhD student collaborated to solve this problem with a language-independent approach.
\ No newline at end of file
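A minimal sketch of the two-stage pipeline, with hypothetical callables s2s_translate and icd10_classify standing in for the two trained models; the actual loading and decoding logic is in the inference script below:

def predict_icd10_code(raw_line, s2s_translate, icd10_classify):
    # Stage 1: the attention seq2seq model rewrites the raw certificate
    # line into standardized text.
    standard_text = s2s_translate(raw_line.lower())
    # Stage 2: the classification model maps the standardized text to an
    # ICD-10 code.
    return icd10_classify(standard_text)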
from _config import *
import pickle
import tqdm
from _layers import Attention
from keras.models import Model, load_model as keras_load_model
from keras.layers import Input
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from util import report_to_df
import numpy as np
import pandas as pd
#REPRODUCIBLE
np.random.seed(42)
import random
random.seed(12345)
import os
os.environ['PYTHONHASHSEED'] = '0'
import tensorflow as tf
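# Single-threaded execution avoids nondeterminism from parallel op scheduling.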
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
from keras import backend as K
tf.set_random_seed(1234)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)
#REPRODUCIBLE
langs = ['IT','HU','FR']
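# Each run selects trained model files by filename suffix: run1 pairs the
# '_duplicated' ICD-10 classifier with the base s2s model, run2 pairs it
# with the '_extended' s2s model.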
runs = {
    'run1': {
        'icd10': '_duplicated',
        's2s': ''
    },
    'run2': {
        'icd10': '_duplicated',
        's2s': '_extended'
    }
}
for k, v in runs.items():
    # ICD 10 STUFF: load the classification model, its tokenizer and label encoder.
    icd10_model = keras_load_model('models/icd10Classification_attention{}.h5'.format(v['icd10']), custom_objects={'Attention': Attention})
    with open('models/icd10_tokenizer{}.p'.format(v['icd10']), 'rb') as handle:
        icd10Tokenizer = pickle.load(handle)
    with open('models/icd10_mappings{}.p'.format(v['icd10']), 'rb') as handle:
        icd10Encoder = pickle.load(handle)
    # S2S STUFF: load the sequence-to-sequence model and its source/target tokenizers.
    S2S_model = keras_load_model('models/s2s{}.h5'.format(v['s2s']), custom_objects={'Attention': Attention})
    with open('models/s2s_source_tokenizer{}.p'.format(v['s2s']), 'rb') as handle:
        s2s_source_tokenizer = pickle.load(handle)
    source_vocab = s2s_source_tokenizer.word_index
    source_index_to_word_dict = {v: k.strip() for k, v in s2s_source_tokenizer.word_index.items()}
    with open('models/s2s_target_tokenizer{}.p'.format(v['s2s']), 'rb') as handle:
        s2s_target_tokenizer = pickle.load(handle)
    target_vocab = s2s_target_tokenizer.word_index
    target_index_to_word_dict = {v: k.strip() for k, v in s2s_target_tokenizer.word_index.items()}
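    # INFERENCE MODELS: split the trained end-to-end graph into an encoder
    # model (source sequence -> final LSTM states) and a decoder model that
    # advances one step at a time from explicitly supplied states.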
    encoder_input = S2S_model.get_layer('input_1').output
    decoder_input = S2S_model.get_layer('input_2').output
    x, state_h, state_c = S2S_model.get_layer('lstm_1').output
    encoder_states = [state_h, state_c]
    embed_2 = S2S_model.get_layer('embedding_2').output
    decoder_LSTM = S2S_model.get_layer('lstm_2')
    decoder_dense = S2S_model.get_layer('dense_1')
    # Encoder inference model
    encoder_model_inf = Model(encoder_input, encoder_states)
    # Decoder inference model (hidden/cell state size is 256 in the trained model)
    decoder_state_input_h = Input(shape=(256,), name='inf_input1')
    decoder_state_input_c = Input(shape=(256,), name='inf_input2')
    decoder_input_states = [decoder_state_input_h, decoder_state_input_c]
    decoder_out, decoder_h, decoder_c = decoder_LSTM(embed_2, initial_state=decoder_input_states)
    decoder_states = [decoder_h, decoder_c]
    decoder_out = decoder_dense(decoder_out)
    decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                              outputs=[decoder_out] + decoder_states)
    # encoder_model_inf.summary()
    # decoder_model_inf.summary()
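    # Greedy decoding: start from the 'sos' token, repeatedly emit the most
    # probable next word and feed it back, until 'eos' or the maximum target
    # length is reached.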
    def decode_seq(inp_seq):
        states_val = encoder_model_inf.predict(inp_seq)
        max_target_len = S2S_model.get_layer('input_2').output_shape[1]
        target_seq = np.zeros((1, max_target_len))
        target_seq[0, 0] = target_vocab['sos']
        translated_sent = []
        translated_index = []
        stop_condition = False
        while not stop_condition:
            decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
            # Greedily pick the most probable next token.
            max_val_index = np.argmax(decoder_out[0, -1, :])
            try:
                sampled_word = target_index_to_word_dict[max_val_index]
            except KeyError:
                # Index 0 (padding) has no vocabulary entry; treat it as 'eos'.
                sampled_word = 'eos'
            translated_sent.append(sampled_word)
            translated_index.append(max_val_index)
            if (sampled_word == 'eos') or (len(translated_sent) > max_target_len):
                stop_condition = True
            # Feed the sampled token back as the next decoder input.
            target_seq = np.zeros((1, max_target_len))
            target_seq[0, 0] = max_val_index
            states_val = [decoder_h, decoder_c]
        # Drop the trailing 'eos' before returning.
        return translated_sent[:-1], translated_index[:-1]
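    # For each language, translate every raw certificate line into standardized
    # text with the s2s model, then classify it into an ICD-10 code and write
    # one result file per run.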
    for lang in langs:
        df = pd.read_csv(RESULTS[lang]['in'], sep=';')
        source_val = [x.lower() for x in df.RawText.values.tolist()]
        source_val = s2s_source_tokenizer.texts_to_sequences(source_val)
        source_val = pad_sequences(source_val, maxlen=S2S_model.get_layer('input_1').input_shape[1], padding='post')
        results = []
        for seq_index in tqdm.tqdm(range(len(source_val))):
            inp_seq = source_val[seq_index:seq_index + 1]
            translated_sent, translated_index = decode_seq(inp_seq)
            # PREDICT ICD10: classify the generated standardized text.
            source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
            word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
            icd10_code_index = icd10_model.predict(word_sequence)
            max_val_index = np.argmax(icd10_code_index, axis=1)[0]
            # inverse_transform expects an array-like; unwrap the single label.
            icd10_label = icd10Encoder.inverse_transform([max_val_index])[0]
            results.append([" ".join(translated_sent), " ".join(translated_sent), icd10_label])
        tmp_df = pd.DataFrame(results)
        tmp_df.columns = ['Text1', 'Text2', 'ICD10']
        # .ix is deprecated; use .iloc to keep the first three metadata columns.
        result_df = pd.concat([df.iloc[:, :3], tmp_df], axis=1)
        result_df.to_csv(RESULTS[lang]['out'] + k + '.csv', sep=';')
@@ -114,8 +114,6 @@ def decode_seq(inp_seq):
y_true = []
y_pred = []
# for seq_index in range(len(source_corpus)):
source_val = s2s_source_tokenizer.texts_to_sequences(source_val)
source_val = pad_sequences(source_val, maxlen=S2S_model.get_layer('input_1').input_shape[1], padding='post')
......