From 87a185ed8453e24b6ba162a18e3bf32523ca9b84 Mon Sep 17 00:00:00 2001
From: Jurica Seva <seva@informatik.hu-berlin.de>
Date: Fri, 25 May 2018 15:56:25 +0200
Subject: [PATCH] Updated with results on the train/val dataset.

---
 code_jurica/test.py      | 274 ++++++++++++++++++++-------------------
 paper/40_experiments.tex |  12 +-
 2 files changed, 146 insertions(+), 140 deletions(-)

diff --git a/code_jurica/test.py b/code_jurica/test.py
index 7fac572..954c8fd 100644
--- a/code_jurica/test.py
+++ b/code_jurica/test.py
@@ -23,138 +23,144 @@ sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
 K.set_session(sess)
 #REPRODUCIBLE
 
-with open('models/train_test_split_extended.p', 'rb') as handle:
-    data_set = pickle.load(handle)
-
-source_val = data_set['source_val']
-target_val =data_set['target_val']
-labels_val = data_set['labels_val']
-# print(source_val)
-# input('source val')
-
-# ICD 10 STUFF
-icd10_model = keras_load_model('models/icd10Classification_attention_duplicated.h5', custom_objects={'Attention':Attention})
-with open('models/icd10_tokenizer_duplicated.p', 'rb') as handle:
-    icd10Tokenizer = pickle.load(handle)
-
-with open('models/icd10_mappings_duplicated.p', 'rb') as handle:
-    icd10Encoder = pickle.load(handle)
-# ICD 10 STUFF
-
-# S2S STUFF
-S2S_model = keras_load_model('models/s2s.h5', custom_objects={'Attention':Attention})
-with open('models/s2s_source_tokenizer.p', 'rb') as handle:
-    s2s_source_tokenizer = pickle.load(handle)
-source_vocab = s2s_source_tokenizer.word_index
-source_index_to_word_dict = {v:k.strip() for k,v in s2s_source_tokenizer.word_index.items()}
-
-with open('models/s2s_target_tokenizer.p', 'rb') as handle:
-    s2s_target_tokenizer = pickle.load(handle)
-
-target_vocab =s2s_target_tokenizer.word_index
-target_index_to_word_dict = {v:k.strip() for k,v in s2s_target_tokenizer.word_index.items()}
-# S2S STUFF
-
-# INFERENCE MODELS
-encoder_input = S2S_model.get_layer('input_1').output
-decoder_input = S2S_model.get_layer('input_2').output
-x, state_h, state_c = S2S_model.get_layer('lstm_1').output
-encoder_states = [state_h, state_c]
-
-embed_2 = S2S_model.get_layer('embedding_2').output
-decoder_LSTM = S2S_model.get_layer('lstm_2')
-decoder_dense = S2S_model.get_layer('dense_1')
-
-# Encoder inference model
-encoder_model_inf = Model(encoder_input, encoder_states)
-
-# Decoder inference model
-decoder_state_input_h = Input(shape=(256,), name='inf_input1')
-decoder_state_input_c = Input(shape=(256,), name='inf_input2')
-decoder_input_states = [decoder_state_input_h, decoder_state_input_c]
-
-decoder_out, decoder_h, decoder_c = decoder_LSTM(embed_2, initial_state=decoder_input_states)
-decoder_states = [decoder_h , decoder_c]
-decoder_out = decoder_dense(decoder_out)
-
-decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
-                          outputs=[decoder_out] + decoder_states )
-
-# encoder_model_inf.summary()
-# decoder_model_inf.summary()
-
-def decode_seq(inp_seq):
-    states_val = encoder_model_inf.predict(inp_seq)
-    target_seq = np.zeros((1, S2S_model.get_layer('input_2').output_shape[1]))
-    target_seq[0, 0] = target_vocab['sos']
-
-    translated_sent = []
-    translated_index = []
-    stop_condition = False
-
-    while not stop_condition:
-        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
-        max_val_index = np.argmax(decoder_out[0, -1, :])
-        try:
-            sampled_fra_char = target_index_to_word_dict[max_val_index]
-        except KeyError:
-            sampled_fra_char = 'eos'
-
-        translated_sent.append(sampled_fra_char)
-        translated_index.append(max_val_index)
-
-        if ((sampled_fra_char == 'eos') or (len(translated_sent) > S2S_model.get_layer('input_2').output_shape[1])):
-            stop_condition = True
-
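+# Each run pairs an ICD-10 classifier variant with an S2S model variant;
+# the file-name suffixes select the matching model and tokenizer pickles.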
+runs = {
+        'run1':{
+            'icd10':'_duplicated',
+            's2s':''
+        },
+        'run2':{
+            'icd10': '_duplicated',
+            's2s': '_extended'
+        }
+    }
+
+for k, v in runs.items():
+
+    with open('models/train_test_split_extended.p', 'rb') as handle:
+        data_set = pickle.load(handle)
+
+    source_val = data_set['source_val']
+    target_val = data_set['target_val']
+    labels_val = data_set['labels_val']
+    # print(source_val)
+    # input('source val')
+
+    # ICD 10 STUFF
+    icd10_model = keras_load_model('models/icd10Classification_attention{}.h5'.format(v['icd10']), custom_objects={'Attention':Attention})
+    with open('models/icd10_tokenizer{}.p'.format(v['icd10']), 'rb') as handle:
+        icd10Tokenizer = pickle.load(handle)
+
+    with open('models/icd10_mappings{}.p'.format(v['icd10']), 'rb') as handle:
+        icd10Encoder = pickle.load(handle)
+    # ICD 10 STUFF
+
+    # S2S STUFF
+    S2S_model = keras_load_model('models/s2s{}.h5'.format(v['s2s']), custom_objects={'Attention':Attention})
+    with open('models/s2s_source_tokenizer{}.p'.format(v['s2s']), 'rb') as handle:
+        s2s_source_tokenizer = pickle.load(handle)
+    source_vocab = s2s_source_tokenizer.word_index
+    source_index_to_word_dict = {idx: word.strip() for word, idx in s2s_source_tokenizer.word_index.items()}
+
+    with open('models/s2s_target_tokenizer{}.p'.format(v['s2s']), 'rb') as handle:
+        s2s_target_tokenizer = pickle.load(handle)
+
+    target_vocab = s2s_target_tokenizer.word_index
+    target_index_to_word_dict = {idx: word.strip() for word, idx in s2s_target_tokenizer.word_index.items()}
+    # S2S STUFF
+
+    # INFERENCE MODELS
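+    # Split the trained S2S network into standalone encoder and decoder
+    # models so that decoding can run one timestep at a time.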
+    encoder_input = S2S_model.get_layer('input_1').output
+    decoder_input = S2S_model.get_layer('input_2').output
+    x, state_h, state_c = S2S_model.get_layer('lstm_1').output
+    encoder_states = [state_h, state_c]
+
+    embed_2 = S2S_model.get_layer('embedding_2').output
+    decoder_LSTM = S2S_model.get_layer('lstm_2')
+    decoder_dense = S2S_model.get_layer('dense_1')
+
+    # Encoder inference model
+    encoder_model_inf = Model(encoder_input, encoder_states)
+
+    # Decoder inference model
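+    # Fresh state inputs (256 = LSTM hidden size) allow feeding back the
+    # states returned by the previous decoding step.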
+    decoder_state_input_h = Input(shape=(256,), name='inf_input1')
+    decoder_state_input_c = Input(shape=(256,), name='inf_input2')
+    decoder_input_states = [decoder_state_input_h, decoder_state_input_c]
+
+    decoder_out, decoder_h, decoder_c = decoder_LSTM(embed_2, initial_state=decoder_input_states)
+    decoder_states = [decoder_h, decoder_c]
+    decoder_out = decoder_dense(decoder_out)
+
+    decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
+                              outputs=[decoder_out] + decoder_states)
+
+    # encoder_model_inf.summary()
+    # decoder_model_inf.summary()
+
+    def decode_seq(inp_seq):
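+        # Greedy decoding: encode the input once, then repeatedly feed the
+        # argmax token back into the decoder until 'eos' or max length.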
+        states_val = encoder_model_inf.predict(inp_seq)
         target_seq = np.zeros((1, S2S_model.get_layer('input_2').output_shape[1]))
-        target_seq[0, 0] = max_val_index
-        states_val = [decoder_h, decoder_c]
-
-    return translated_sent[:-1], translated_index[:-1]
-
-y_true = []
-y_pred = []
-
-source_val = s2s_source_tokenizer.texts_to_sequences(source_val)
-source_val = pad_sequences(source_val, maxlen=S2S_model.get_layer('input_1').input_shape[1], padding='post')
-
-for seq_index in tqdm.tqdm(range(len(source_val))):
-
-    # inp_seq = source_val[seq_index:seq_index + 1]
-    # inp_seq = s2s_source_tokenizer.texts_to_sequences(inp_seq)
-    # inp_seq = pad_sequences(inp_seq, maxlen=S2S_model.get_layer('input_1').output_shape[1], padding='post')
-    # translated_sent, translated_index= decode_seq(inp_seq)
-    #
-    # target_seq = target_corpus[seq_index:seq_index + 1]
-    # target_seq = s2s_target_tokenizer.texts_to_sequences(target_seq)
-
-    inp_seq = source_val[seq_index:seq_index+1]
-    translated_sent, translated_index= decode_seq(inp_seq)
-
-    # PREDICT ICD10
-    source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
-    word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
-    icd10_code_index = icd10_model.predict(word_sequence)
-    # print(icd10_code_index, type(icd10_code_index))
-    max_val_index = np.argmax(icd10_code_index, axis=1)[0]
-    # print(max_val_index)
-    icd10_label = icd10Encoder.inverse_transform(max_val_index)
-
-    # print('-')
-    # target_index = target_seq[0]
-    # print('Target indexes:', target_index)
-    # print('Decoded indexes:', translated_index)
-    #
-    # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
-    # print('Decoded sentence:', " ".join(translated_sent))
-    #
-    # print('Target ICD-10:', labels[seq_index])
-    # print('Predict ICD-10:', icd10_label)
-
-    y_true.append(labels_val[seq_index])
-    y_pred.append(icd10_label)
-
-report = classification_report(y_true, y_pred)
-report_df = report_to_df(report)
-report_df.to_csv('logs/classification_report_test_combined.csv')
-print(report_df)
\ No newline at end of file
+        target_seq[0, 0] = target_vocab['sos']
+
+        translated_sent = []
+        translated_index = []
+        stop_condition = False
+
+        while not stop_condition:
+            decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
+            max_val_index = np.argmax(decoder_out[0, -1, :])
+            try:
+                sampled_fra_char = target_index_to_word_dict[max_val_index]
+            except KeyError:
+                sampled_fra_char = 'eos'
+
+            translated_sent.append(sampled_fra_char)
+            translated_index.append(max_val_index)
+
+            if ((sampled_fra_char == 'eos') or (len(translated_sent) > S2S_model.get_layer('input_2').output_shape[1])):
+                stop_condition = True
+
+            target_seq = np.zeros((1, S2S_model.get_layer('input_2').output_shape[1]))
+            target_seq[0, 0] = max_val_index
+            states_val = [decoder_h, decoder_c]
+
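+        # Drop the final token (normally 'eos') from both outputs.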
+        return translated_sent[:-1], translated_index[:-1]
+
+    y_true = []
+    y_pred = []
+
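+    # Tokenize and pad the validation texts with this run's source tokenizer.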
+    source_val = s2s_source_tokenizer.texts_to_sequences(source_val)
+    source_val = pad_sequences(source_val, maxlen=S2S_model.get_layer('input_1').input_shape[1], padding='post')
+
+    for seq_index in tqdm.tqdm(range(len(source_val))):
+
+        inp_seq = source_val[seq_index:seq_index+1]
+        translated_sent, translated_index = decode_seq(inp_seq)
+
+        # PREDICT ICD10
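+        # Re-tokenize the decoded text for the ICD-10 classifier and map the
+        # argmax class index back to its ICD-10 code.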
+        source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
+        word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
+        icd10_code_index = icd10_model.predict(word_sequence)
+        # print(icd10_code_index, type(icd10_code_index))
+        max_val_index = np.argmax(icd10_code_index, axis=1)[0]
+        # print(max_val_index)
+        icd10_label = icd10Encoder.inverse_transform(max_val_index)
+
+        y_true.append(labels_val[seq_index])
+        y_pred.append(icd10_label)
+
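+    # Write one classification report per run, keyed by the S2S suffix.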
+    report = classification_report(y_true, y_pred)
+    report_df = report_to_df(report)
+    report_df.to_csv('logs/classification_report_test{}.csv'.format(v['s2s']))
\ No newline at end of file
diff --git a/paper/40_experiments.tex b/paper/40_experiments.tex
index 31ea74c..18483e7 100644
--- a/paper/40_experiments.tex
+++ b/paper/40_experiments.tex
@@ -61,10 +61,10 @@ The results obtained from the two approaches are shown in Table \ref{tab:icd10Cl
 \begin{table}[]
 \centering
 \begin{tabular}{l|l|l|l|l|l|l}
-Mode & Model & Trained for epochs & Train Accuracy & Train Loss & Validation Accuracy & Validation Loss \\
+Tokenization & Model & Trained for epochs & Train Accuracy & Train Loss & Validation Accuracy & Validation Loss \\
 Word & Minimal &  69 & 0.925 & 0.190 & 0.937 & 0.169 \\
 Word & Extended &  41 & 0.950 & 0.156 & 0.954 & 0.141 \\
-Character & Minimal &   &  &  &  &  \\
+Character & Minimal &  91 & 0.732 & 1.186 & 0.516 & 2.505 \\
 \end{tabular}
 \caption{Named Entity Normalization: ICD-10 Classification }
 \label{tab:icd10Classification}
@@ -79,9 +79,9 @@ The results obtained during training are presented in Table \ref{tab:final_train
 \begin{table}[]
 \centering
-\begin{tabular}{l|l|l|l|l|l}
+\begin{tabular}{l|l|l|l}
-Model & Trained for epochs & Train Accuracy & Train Loss & Validation Accuracy & Validation Loss \\
-S2S balanced + ICD-10 extended &  & & & & \\
-S2S extended + ICD-10 extended &  & & & & \\
+Model & Precision & Recall & F1 \\
+S2S balanced + ICD-10 extended & 0.73 & 0.61 & 0.61 \\
+S2S extended + ICD-10 extended & 0.74 & 0.62 & 0.63 \\
 \end{tabular}
 \caption{Final Pipeline Evaluation}
 \label{tab:final_train}
-- 
GitLab