Skip to content
Snippets Groups Projects
Commit 87a185ed authored by Jurica Seva's avatar Jurica Seva
Browse files

Updated with results on the train/val dataset.

parent 8e93eea7
No related merge requests found
......@@ -23,138 +23,150 @@ sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)
#REPRODUCIBLE
# Validation split: source texts, target texts and the reference labels that
# the predictions are later compared against.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('models/train_test_split_extended.p', 'rb') as handle:
    data_set = pickle.load(handle)
source_val = data_set['source_val']
target_val = data_set['target_val']
labels_val = data_set['labels_val']

# --- ICD-10 classifier: model, tokenizer and label encoder ---
icd10_model = keras_load_model(
    'models/icd10Classification_attention_duplicated.h5',
    custom_objects={'Attention': Attention}
)
with open('models/icd10_tokenizer_duplicated.p', 'rb') as handle:
    icd10Tokenizer = pickle.load(handle)
with open('models/icd10_mappings_duplicated.p', 'rb') as handle:
    icd10Encoder = pickle.load(handle)

# --- Sequence-to-sequence model and its source/target tokenizers ---
S2S_model = keras_load_model(
    'models/s2s.h5',
    custom_objects={'Attention': Attention}
)
with open('models/s2s_source_tokenizer.p', 'rb') as handle:
    s2s_source_tokenizer = pickle.load(handle)
source_vocab = s2s_source_tokenizer.word_index
# Reverse lookup: token index -> (stripped) word.
source_index_to_word_dict = {
    index: word.strip()
    for word, index in s2s_source_tokenizer.word_index.items()
}
with open('models/s2s_target_tokenizer.p', 'rb') as handle:
    s2s_target_tokenizer = pickle.load(handle)
target_vocab = s2s_target_tokenizer.word_index
target_index_to_word_dict = {
    index: word.strip()
    for word, index in s2s_target_tokenizer.word_index.items()
}

# --- Inference models, rebuilt from the layers of the trained S2S model ---
encoder_input = S2S_model.get_layer('input_1').output
decoder_input = S2S_model.get_layer('input_2').output
x, state_h, state_c = S2S_model.get_layer('lstm_1').output
encoder_states = [state_h, state_c]
embed_2 = S2S_model.get_layer('embedding_2').output
decoder_LSTM = S2S_model.get_layer('lstm_2')
decoder_dense = S2S_model.get_layer('dense_1')

# Encoder inference model: encoder input -> the two states taken from lstm_1.
encoder_model_inf = Model(encoder_input, encoder_states)

# Decoder inference model: decoder tokens plus two externally supplied states
# -> dense output plus the two updated states.
# NOTE(review): 256 is presumably the unit count of lstm_2 - confirm.
decoder_state_input_h = Input(shape=(256,), name='inf_input1')
decoder_state_input_c = Input(shape=(256,), name='inf_input2')
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm_out, decoder_h, decoder_c = decoder_LSTM(
    embed_2, initial_state=decoder_input_states
)
decoder_states = [decoder_h, decoder_c]
decoder_out = decoder_dense(decoder_lstm_out)
decoder_model_inf = Model(
    inputs=[decoder_input] + decoder_input_states,
    outputs=[decoder_out] + decoder_states
)
def decode_seq(inp_seq):
states_val = encoder_model_inf.predict(inp_seq)
target_seq = np.zeros((1, S2S_model.get_layer('input_2').output_shape[1]))
target_seq[0, 0] = target_vocab['sos']
translated_sent = []
translated_index = []
stop_condition = False
while not stop_condition:
decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
max_val_index = np.argmax(decoder_out[0, -1, :])
try:
sampled_fra_char = target_index_to_word_dict[max_val_index]
except KeyError:
sampled_fra_char = 'eos'
translated_sent.append(sampled_fra_char)
translated_index.append(max_val_index)
if ((sampled_fra_char == 'eos') or (len(translated_sent) > S2S_model.get_layer('input_2').output_shape[1])):
stop_condition = True
# Evaluation runs. Each entry holds the filename suffixes that select which
# saved ICD-10 classifier ('icd10') and which S2S model ('s2s') get loaded.
runs = {
    'run1': {'icd10': '_duplicated', 's2s': ''},
    'run2': {'icd10': '_duplicated', 's2s': '_extended'},
}
for k,v in runs.items():
# Validation split: source texts, target texts and the reference labels that
# the predictions are later compared against. Reloaded for every run.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('models/train_test_split_extended.p', 'rb') as handle:
    data_set = pickle.load(handle)
source_val = data_set['source_val']
target_val = data_set['target_val']
labels_val = data_set['labels_val']
print(source_val)

# --- ICD-10 classifier: model, tokenizer and label encoder for this run ---
icd10_suffix = v['icd10']
icd10_model = keras_load_model(
    'models/icd10Classification_attention{}.h5'.format(icd10_suffix),
    custom_objects={'Attention': Attention}
)
with open('models/icd10_tokenizer{}.p'.format(icd10_suffix), 'rb') as handle:
    icd10Tokenizer = pickle.load(handle)
with open('models/icd10_mappings{}.p'.format(icd10_suffix), 'rb') as handle:
    icd10Encoder = pickle.load(handle)

# --- Sequence-to-sequence model and its tokenizers for this run ---
s2s_suffix = v['s2s']
S2S_model = keras_load_model(
    'models/s2s{}.h5'.format(s2s_suffix),
    custom_objects={'Attention': Attention}
)
with open('models/s2s_source_tokenizer{}.p'.format(s2s_suffix), 'rb') as handle:
    s2s_source_tokenizer = pickle.load(handle)
source_vocab = s2s_source_tokenizer.word_index
# Reverse lookup: token index -> (stripped) word.
source_index_to_word_dict = {
    index: word.strip()
    for word, index in s2s_source_tokenizer.word_index.items()
}
with open('models/s2s_target_tokenizer{}.p'.format(s2s_suffix), 'rb') as handle:
    s2s_target_tokenizer = pickle.load(handle)
target_vocab = s2s_target_tokenizer.word_index
target_index_to_word_dict = {
    index: word.strip()
    for word, index in s2s_target_tokenizer.word_index.items()
}

# --- Inference models, rebuilt from the layers of the trained S2S model ---
encoder_input = S2S_model.get_layer('input_1').output
decoder_input = S2S_model.get_layer('input_2').output
x, state_h, state_c = S2S_model.get_layer('lstm_1').output
encoder_states = [state_h, state_c]
embed_2 = S2S_model.get_layer('embedding_2').output
decoder_LSTM = S2S_model.get_layer('lstm_2')
decoder_dense = S2S_model.get_layer('dense_1')

# Encoder inference model: encoder input -> the two states taken from lstm_1.
encoder_model_inf = Model(encoder_input, encoder_states)

# Decoder inference model: decoder tokens plus two externally supplied states
# -> dense output plus the two updated states.
# NOTE(review): 256 is presumably the unit count of lstm_2 - confirm.
decoder_state_input_h = Input(shape=(256,), name='inf_input1')
decoder_state_input_c = Input(shape=(256,), name='inf_input2')
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm_out, decoder_h, decoder_c = decoder_LSTM(
    embed_2, initial_state=decoder_input_states
)
decoder_states = [decoder_h, decoder_c]
decoder_out = decoder_dense(decoder_lstm_out)
decoder_model_inf = Model(
    inputs=[decoder_input] + decoder_input_states,
    outputs=[decoder_out] + decoder_states
)
def decode_seq(inp_seq):
states_val = encoder_model_inf.predict(inp_seq)
target_seq = np.zeros((1, S2S_model.get_layer('input_2').output_shape[1]))
target_seq[0, 0] = max_val_index
states_val = [decoder_h, decoder_c]
return translated_sent[:-1], translated_index[:-1]
# Evaluate the combined pipeline on the validation split: decode every source
# sequence with the S2S model, classify the decoded text with the ICD-10
# model, and compare the predicted label with the reference label.
y_true = []
y_pred = []

# Tokenize and pad all source texts once, to the S2S encoder's input length.
source_val = s2s_source_tokenizer.texts_to_sequences(source_val)
source_val = pad_sequences(
    source_val,
    maxlen=S2S_model.get_layer('input_1').input_shape[1],
    padding='post'
)

# Loop-invariant: the padded length fed to the ICD-10 classifier.
icd10_input_len = icd10_model.layers[0].input_shape[1]

for seq_index in tqdm.tqdm(range(len(source_val))):
    # One-row slice, so decode_seq receives a single-sample batch.
    inp_seq = source_val[seq_index:seq_index + 1]
    translated_sent, translated_index = decode_seq(inp_seq)

    # PREDICT ICD10 from the decoded words joined back into one string.
    source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
    word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_input_len, padding='post')
    icd10_code_index = icd10_model.predict(word_sequence)
    max_val_index = np.argmax(icd10_code_index, axis=1)[0]

    # inverse_transform is documented to take a 1-D array-like; recent
    # scikit-learn releases reject a bare scalar, so wrap the index in a list
    # and unwrap the single decoded label.
    # NOTE(review): assumes icd10Encoder is a scikit-learn LabelEncoder - confirm.
    icd10_label = icd10Encoder.inverse_transform([max_val_index])[0]

    y_true.append(labels_val[seq_index])
    y_pred.append(icd10_label)

# Per-class precision/recall/F1, converted to a DataFrame and saved as CSV.
report = classification_report(y_true, y_pred)
report_df = report_to_df(report)
report_df.to_csv('logs/classification_report_test_combined.csv')
print(report_df)
\ No newline at end of file
target_seq[0, 0] = target_vocab['sos']
translated_sent = []
translated_index = []
stop_condition = False
while not stop_condition:
decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
max_val_index = np.argmax(decoder_out[0, -1, :])
try:
sampled_fra_char = target_index_to_word_dict[max_val_index]
except KeyError:
sampled_fra_char = 'eos'
translated_sent.append(sampled_fra_char)
translated_index.append(max_val_index)
if ((sampled_fra_char == 'eos') or (len(translated_sent) > S2S_model.get_layer('input_2').output_shape[1])):
stop_condition = True
target_seq = np.zeros((1, S2S_model.get_layer('input_2').output_shape[1]))
target_seq[0, 0] = max_val_index
states_val = [decoder_h, decoder_c]
return translated_sent[:-1], translated_index[:-1]
# Evaluate the current run's pipeline on the validation split: decode every
# source sequence with the S2S model, classify the decoded text with the
# ICD-10 model, and compare the predicted label with the reference label.
y_true = []
y_pred = []

# Tokenize and pad all source texts once, to the S2S encoder's input length.
source_val = s2s_source_tokenizer.texts_to_sequences(source_val)
source_val = pad_sequences(
    source_val,
    maxlen=S2S_model.get_layer('input_1').input_shape[1],
    padding='post'
)

# Loop-invariant: the padded length fed to the ICD-10 classifier.
icd10_input_len = icd10_model.layers[0].input_shape[1]

for seq_index in tqdm.tqdm(range(len(source_val))):
    # One-row slice, so decode_seq receives a single-sample batch.
    inp_seq = source_val[seq_index:seq_index + 1]
    translated_sent, translated_index = decode_seq(inp_seq)

    # PREDICT ICD10 from the decoded words joined back into one string.
    source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
    word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_input_len, padding='post')
    icd10_code_index = icd10_model.predict(word_sequence)
    max_val_index = np.argmax(icd10_code_index, axis=1)[0]

    # inverse_transform is documented to take a 1-D array-like; recent
    # scikit-learn releases reject a bare scalar, so wrap the index in a list
    # and unwrap the single decoded label.
    # NOTE(review): assumes icd10Encoder is a scikit-learn LabelEncoder - confirm.
    icd10_label = icd10Encoder.inverse_transform([max_val_index])[0]

    y_true.append(labels_val[seq_index])
    y_pred.append(icd10_label)

# Per-class precision/recall/F1, saved to a CSV named after the run's S2S suffix.
report = classification_report(y_true, y_pred)
report_df = report_to_df(report)
report_df.to_csv('logs/classification_report_test{}.csv'.format(v['s2s']))
\ No newline at end of file
......@@ -61,10 +61,10 @@ The results obtained from the two approaches are shown in Table \ref{tab:icd10Cl
\begin{table}[]
\centering
\begin{tabular}{l|l|l|l|l|l|l}
Mode & Model & Trained for epochs & Train Accuracy & Train Loss & Validation Accuracy & Validation Loss \\
Tokenization & Model & Trained for epochs & Train Accuracy & Train Loss & Validation Accuracy & Validation Loss \\
Word & Minimal & 69 & 0.925 & 0.190 & 0.937 & 0.169 \\
Word & Extended & 41 & 0.950 & 0.156 & 0.954 & 0.141 \\
Character & Minimal & & & & & \\
Character & Minimal & 91 & 0.732 & 1.186 & 0.516 & 2.505 \\
\end{tabular}
\caption{Named Entity Normalization: ICD-10 Classification }
\label{tab:icd10Classification}
......@@ -79,9 +79,9 @@ The results obtained during training are presented in Table \ref{tab:final_train
\begin{table}[]
\centering
\begin{tabular}{l|l|l|l|l|l}
Model & Trained for epochs & Train Accuracy & Train Loss & Validation Accuracy & Validation Loss \\
S2S balanced + ICD-10 extended & & & & & \\
S2S extended + ICD-10 extended & & & & & \\
Model & Precision & Recall & F-1\\
S2S balanced + ICD-10 extended & 0.73 & 0.61 & 0.61 \\
S2S extended + ICD-10 extended & 0.74 & 0.62 & 0.63 \\
\end{tabular}
\caption{Final Pipeline Evaluation}
\label{tab:final_train}
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment