From 48634b43f13a15890bd61ddf9a98f55cb26585cf Mon Sep 17 00:00:00 2001
From: Jurica Seva <seva@informatik.hu-berlin.de>
Date: Fri, 11 May 2018 12:45:44 +0200
Subject: [PATCH] Generating results.

---
 code_jurica/_config.py               |  21 ++++
 code_jurica/data/results/methods.txt |   1 +
 code_jurica/data/results/team.txt    |   1 +
 code_jurica/generate_runs.py         | 150 +++++++++++++++++++++++++++
 code_jurica/test.py                  |   2 -
 5 files changed, 173 insertions(+), 2 deletions(-)
 create mode 100644 code_jurica/data/results/methods.txt
 create mode 100644 code_jurica/data/results/team.txt
 create mode 100644 code_jurica/generate_runs.py

diff --git a/code_jurica/_config.py b/code_jurica/_config.py
index 8ea5d0c..fac8221 100644
--- a/code_jurica/_config.py
+++ b/code_jurica/_config.py
@@ -9,6 +9,27 @@ DATA_FR=FR_HOME+'corpus/'
 DATA_HU=HU_HOME+'corpus/'
 DATA_IT=IT_HOME+'corpus/'
 
+TEST_FR = 'data/test/FR/raw/corpus/'
+TEST_IT = 'data/test/IT/raw/corpus/'
+TEST_HU = 'data/test/HU/raw/corpus/'
+
+RESULTS_DIR = 'data/results/WBI/'
+
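+# Per-language test input (raw CausesBrutes CSV) and output directory for the generated run files.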
+RESULTS = {
+    'FR': {
+        'in':TEST_FR + 'CausesBrutes_FR_2015F_1.csv',
+        'out':RESULTS_DIR + 'FR/raw/'
+        },
+    'HU':{
+        'in':TEST_HU + 'CausesBrutes_HU_2.csv',
+        'out':RESULTS_DIR + 'HU/raw/'
+    },
+    'IT': {
+        'in':TEST_IT + 'CausesBrutes_IT_2.csv',
+        'out':RESULTS_DIR + 'IT/raw/'
+        }
+    }
+
 TRAINING = {
     'FR': {
         'CB': [
diff --git a/code_jurica/data/results/methods.txt b/code_jurica/data/results/methods.txt
new file mode 100644
index 0000000..6dd08fe
--- /dev/null
+++ b/code_jurica/data/results/methods.txt
@@ -0,0 +1 @@
+The task was solved using two Deep Learning models: 1) a sequence-to-sequence model with attention and 2) a separate classification model. The models were trained using only the data provided by the organizers, mostly because we were unable to obtain ICD-10 dictionaries for the selected languages.
\ No newline at end of file
diff --git a/code_jurica/data/results/team.txt b/code_jurica/data/results/team.txt
new file mode 100644
index 0000000..7dcd995
--- /dev/null
+++ b/code_jurica/data/results/team.txt
@@ -0,0 +1 @@
+One postdoc and one PhD student collaborated to solve this problem with a language-independent approach.
\ No newline at end of file
diff --git a/code_jurica/generate_runs.py b/code_jurica/generate_runs.py
new file mode 100644
index 0000000..08d898c
--- /dev/null
+++ b/code_jurica/generate_runs.py
@@ -0,0 +1,150 @@
+from _config import *
+import pickle
+import tqdm
+from _layers import Attention
+from keras.models import Model, load_model as keras_load_model
+from keras.layers import Input
+from keras.preprocessing.sequence import pad_sequences
+from sklearn.metrics import classification_report
+from util import report_to_df
+import numpy as np
+import pandas as pd
+
+#REPRODUCIBLE
+np.random.seed(42)
+import random
+random.seed(12345)
+import os
+os.environ['PYTHONHASHSEED'] = '0'
+
+import tensorflow as tf
+session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+from keras import backend as K
+tf.set_random_seed(1234)
+sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
+K.set_session(sess)
+#REPRODUCIBLE
+
+langs = ['IT','HU','FR']
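+# Each run pairs a trained ICD-10 classifier with a seq2seq model; the values are file-name
+# suffixes used below to load the matching .h5 models and pickled tokenizers.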
+runs = {
+        'run1':{
+            'icd10':'_duplicated',
+            's2s':''
+        },
+        'run2':{
+            'icd10': '_duplicated',
+            's2s': '_extended'
+        }
+    }
+
+for k,v in runs.items():
+
+    # ICD 10 STUFF
+    icd10_model = keras_load_model('models/icd10Classification_attention{}.h5'.format(v['icd10']), custom_objects={'Attention':Attention})
+    with open('models/icd10_tokenizer{}.p'.format(v['icd10']), 'rb') as handle:
+        icd10Tokenizer = pickle.load(handle)
+
+    with open('models/icd10_mappings{}.p'.format(v['icd10']), 'rb') as handle:
+        icd10Encoder = pickle.load(handle)
+    # ICD 10 STUFF
+
+    # S2S STUFF
+    S2S_model = keras_load_model('models/s2s{}.h5'.format(v['s2s']), custom_objects={'Attention':Attention})
+    with open('models/s2s_source_tokenizer{}.p'.format(v['s2s']), 'rb') as handle:
+        s2s_source_tokenizer = pickle.load(handle)
+    source_vocab = s2s_source_tokenizer.word_index
+    source_index_to_word_dict = {v:k.strip() for k,v in s2s_source_tokenizer.word_index.items()}
+
+    with open('models/s2s_target_tokenizer{}.p'.format(v['s2s']), 'rb') as handle:
+        s2s_target_tokenizer = pickle.load(handle)
+
+    target_vocab = s2s_target_tokenizer.word_index
+    target_index_to_word_dict = {v:k.strip() for k,v in s2s_target_tokenizer.word_index.items()}
+    # S2S STUFF
+
+    # INFERENCE MODELS
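+    # Split the trained seq2seq model into separate encoder and decoder models for
+    # step-by-step decoding: the encoder emits its final LSTM states, which seed the
+    # decoder together with the previously generated token (state size 256 must match
+    # the units of the trained lstm_1/lstm_2 layers).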
+    encoder_input = S2S_model.get_layer('input_1').output
+    decoder_input = S2S_model.get_layer('input_2').output
+    x, state_h, state_c = S2S_model.get_layer('lstm_1').output
+    encoder_states = [state_h, state_c]
+
+    embed_2 = S2S_model.get_layer('embedding_2').output
+    decoder_LSTM = S2S_model.get_layer('lstm_2')
+    decoder_dense = S2S_model.get_layer('dense_1')
+
+    # Encoder inference model
+    encoder_model_inf = Model(encoder_input, encoder_states)
+
+    # Decoder inference model
+    decoder_state_input_h = Input(shape=(256,), name='inf_input1')
+    decoder_state_input_c = Input(shape=(256,), name='inf_input2')
+    decoder_input_states = [decoder_state_input_h, decoder_state_input_c]
+
+    decoder_out, decoder_h, decoder_c = decoder_LSTM(embed_2, initial_state=decoder_input_states)
+    decoder_states = [decoder_h , decoder_c]
+    decoder_out = decoder_dense(decoder_out)
+
+    decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
+                              outputs=[decoder_out] + decoder_states )
+
+    # encoder_model_inf.summary()
+    # decoder_model_inf.summary()
+
+    def decode_seq(inp_seq):
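+        # Greedy decoding: start from the 'sos' token, repeatedly pick the most probable
+        # next token and feed it back with the updated LSTM states, stopping at 'eos' or
+        # once the maximum target length is exceeded; the final token is dropped from the
+        # returned sequence.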
+        states_val = encoder_model_inf.predict(inp_seq)
+        target_seq = np.zeros((1, S2S_model.get_layer('input_2').output_shape[1]))
+        target_seq[0, 0] = target_vocab['sos']
+
+        translated_sent = []
+        translated_index = []
+        stop_condition = False
+
+        while not stop_condition:
+            decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
+            max_val_index = np.argmax(decoder_out[0, -1, :])
+            try:
+                sampled_fra_char = target_index_to_word_dict[max_val_index]
+            except KeyError:
+                sampled_fra_char = 'eos'
+
+            translated_sent.append(sampled_fra_char)
+            translated_index.append(max_val_index)
+
+            if ((sampled_fra_char == 'eos') or (len(translated_sent) > S2S_model.get_layer('input_2').output_shape[1])):
+                stop_condition = True
+
+            target_seq = np.zeros((1, S2S_model.get_layer('input_2').output_shape[1]))
+            target_seq[0, 0] = max_val_index
+            states_val = [decoder_h, decoder_c]
+
+        return translated_sent[:-1], translated_index[:-1]
+
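+    # For every language: read the raw test CSV, lowercase the RawText column, tokenize and
+    # pad it for the seq2seq encoder, decode each line to a normalized text, classify that
+    # text with the ICD-10 model, and write one result file per run and language.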
+    for lang in langs:
+        df = pd.read_csv(RESULTS[lang]['in'], sep=';')
+        source_val = [x.lower() for x in df.RawText.values.tolist()]
+        source_val = s2s_source_tokenizer.texts_to_sequences(source_val)
+        source_val = pad_sequences(source_val, maxlen=S2S_model.get_layer('input_1').input_shape[1], padding='post')
+
+        results = []
+        for seq_index in tqdm.tqdm(range(len(source_val))):
+
+            inp_seq = source_val[seq_index:seq_index+1]
+            translated_sent, translated_index= decode_seq(inp_seq)
+            # print(translated_sent)
+            # input('ts')
+
+            # PREDICT ICD10
+            source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
+            word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
+            icd10_code_index = icd10_model.predict(word_sequence)
+            # print(icd10_code_index, type(icd10_code_index))
+            max_val_index = np.argmax(icd10_code_index, axis=1)[0]
+            # print(max_val_index)
+            icd10_label = icd10Encoder.inverse_transform([max_val_index])[0]
+
+            results.append([" ".join(translated_sent)," ".join(translated_sent),icd10_label])
+
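+        # Assemble the submission frame: keep the first three columns of the input file and
+        # append the generated text (used for both Text1 and Text2) and the predicted ICD-10 code.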
+        tmp_df = pd.DataFrame(results)
+        tmp_df.columns = ['Text1', 'Text2','ICD10']
+        result_df = pd.concat([df.iloc[:, :3], tmp_df], axis=1)
+        result_df.to_csv(RESULTS[lang]['out']+k+'.csv', sep=';')
diff --git a/code_jurica/test.py b/code_jurica/test.py
index 5f7db76..7fac572 100644
--- a/code_jurica/test.py
+++ b/code_jurica/test.py
@@ -114,8 +114,6 @@ def decode_seq(inp_seq):
 
 y_true = []
 y_pred = []
-# for seq_index in range(len(source_corpus)):
-
 
 source_val = s2s_source_tokenizer.texts_to_sequences(source_val)
 source_val = pad_sequences(source_val, maxlen=S2S_model.get_layer('input_1').input_shape[1], padding='post')
-- 
GitLab