Commit 48634b43 authored by Jurica Seva

Generating results.

parent 8d1e8f63
@@ -9,6 +9,27 @@ DATA_FR=FR_HOME+'corpus/'
DATA_HU=HU_HOME+'corpus/'
DATA_IT=IT_HOME+'corpus/'
TEST_FR = 'data/test/FR/raw/corpus/'
TEST_IT = 'data/test/IT/raw/corpus/'
TEST_HU = 'data/test/HU/raw/corpus/'
RESULTS_DIR = 'data/results/WBI/'
RESULTS = {
    'FR': {
        'in': TEST_FR + 'CausesBrutes_FR_2015F_1.csv',
        'out': RESULTS_DIR + 'FR/raw/'
    },
    'HU': {
        'in': TEST_HU + 'CausesBrutes_HU_2.csv',
        'out': RESULTS_DIR + 'HU/raw/'
    },
    'IT': {
        'in': TEST_IT + 'CausesBrutes_IT_2.csv',
        'out': RESULTS_DIR + 'IT/raw/'
    }
}
TRAINING = {
    'FR': {
        'CB': [
......
The task was solved using two Deep Learning models: 1) a sequence-to-sequence model with attention and 2) a separate classification model (a sketch of the two-stage pipeline follows below). The models were trained using only the data provided by the organizers, mostly because we were unable to obtain ICD-10 dictionaries for the selected languages.
\ No newline at end of file
One postdoc and one PhD student collaborated to solve this problem with a language-independent approach.
\ No newline at end of file
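A minimal sketch of the two-stage pipeline, with hypothetical callables s2s_translate and icd10_classify standing in for the two trained models; the actual loading and decoding logic is in the inference script below:

def predict_icd10_code(raw_line, s2s_translate, icd10_classify):
    # Stage 1: the attention seq2seq model rewrites the raw certificate
    # line into standardized text.
    standard_text = s2s_translate(raw_line.lower())
    # Stage 2: the classification model maps the standardized text to an
    # ICD-10 code.
    return icd10_classify(standard_text)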
from _config import *
import pickle
import tqdm
from _layers import Attention
from keras.models import Model, load_model as keras_load_model
from keras.layers import Input
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from util import report_to_df
import numpy as np
import pandas as pd
#REPRODUCIBLE
np.random.seed(42)
import random
random.seed(12345)
import os
os.environ['PYTHONHASHSEED'] = '0'
import tensorflow as tf
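# Single-threaded execution avoids nondeterminism from parallel op scheduling.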
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
from keras import backend as K
tf.set_random_seed(1234)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)
#REPRODUCIBLE
langs = ['IT','HU','FR']
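# Each run selects trained model files by filename suffix: run1 pairs the
# '_duplicated' ICD-10 classifier with the base s2s model, run2 pairs it
# with the '_extended' s2s model.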
runs = {
    'run1': {
        'icd10': '_duplicated',
        's2s': ''
    },
    'run2': {
        'icd10': '_duplicated',
        's2s': '_extended'
    }
}
for k, v in runs.items():
    # ICD 10 STUFF: load the classification model, its tokenizer and label encoder.
    icd10_model = keras_load_model('models/icd10Classification_attention{}.h5'.format(v['icd10']), custom_objects={'Attention': Attention})
    with open('models/icd10_tokenizer{}.p'.format(v['icd10']), 'rb') as handle:
        icd10Tokenizer = pickle.load(handle)
    with open('models/icd10_mappings{}.p'.format(v['icd10']), 'rb') as handle:
        icd10Encoder = pickle.load(handle)
    # S2S STUFF: load the sequence-to-sequence model and its source/target tokenizers.
    S2S_model = keras_load_model('models/s2s{}.h5'.format(v['s2s']), custom_objects={'Attention': Attention})
    with open('models/s2s_source_tokenizer{}.p'.format(v['s2s']), 'rb') as handle:
        s2s_source_tokenizer = pickle.load(handle)
    source_vocab = s2s_source_tokenizer.word_index
    source_index_to_word_dict = {v: k.strip() for k, v in s2s_source_tokenizer.word_index.items()}
    with open('models/s2s_target_tokenizer{}.p'.format(v['s2s']), 'rb') as handle:
        s2s_target_tokenizer = pickle.load(handle)
    target_vocab = s2s_target_tokenizer.word_index
    target_index_to_word_dict = {v: k.strip() for k, v in s2s_target_tokenizer.word_index.items()}
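    # INFERENCE MODELS: split the trained end-to-end graph into an encoder
    # model (source sequence -> final LSTM states) and a decoder model that
    # advances one step at a time from explicitly supplied states.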
    encoder_input = S2S_model.get_layer('input_1').output
    decoder_input = S2S_model.get_layer('input_2').output
    x, state_h, state_c = S2S_model.get_layer('lstm_1').output
    encoder_states = [state_h, state_c]
    embed_2 = S2S_model.get_layer('embedding_2').output
    decoder_LSTM = S2S_model.get_layer('lstm_2')
    decoder_dense = S2S_model.get_layer('dense_1')
    # Encoder inference model
    encoder_model_inf = Model(encoder_input, encoder_states)
    # Decoder inference model (hidden/cell state size is 256 in the trained model)
    decoder_state_input_h = Input(shape=(256,), name='inf_input1')
    decoder_state_input_c = Input(shape=(256,), name='inf_input2')
    decoder_input_states = [decoder_state_input_h, decoder_state_input_c]
    decoder_out, decoder_h, decoder_c = decoder_LSTM(embed_2, initial_state=decoder_input_states)
    decoder_states = [decoder_h, decoder_c]
    decoder_out = decoder_dense(decoder_out)
    decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                              outputs=[decoder_out] + decoder_states)
    # encoder_model_inf.summary()
    # decoder_model_inf.summary()
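    # Greedy decoding: start from the 'sos' token, repeatedly emit the most
    # probable next word and feed it back, until 'eos' or the maximum target
    # length is reached.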
    def decode_seq(inp_seq):
        states_val = encoder_model_inf.predict(inp_seq)
        max_target_len = S2S_model.get_layer('input_2').output_shape[1]
        target_seq = np.zeros((1, max_target_len))
        target_seq[0, 0] = target_vocab['sos']
        translated_sent = []
        translated_index = []
        stop_condition = False
        while not stop_condition:
            decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
            # Greedily pick the most probable next token.
            max_val_index = np.argmax(decoder_out[0, -1, :])
            try:
                sampled_word = target_index_to_word_dict[max_val_index]
            except KeyError:
                # Index 0 (padding) has no vocabulary entry; treat it as 'eos'.
                sampled_word = 'eos'
            translated_sent.append(sampled_word)
            translated_index.append(max_val_index)
            if (sampled_word == 'eos') or (len(translated_sent) > max_target_len):
                stop_condition = True
            # Feed the sampled token back as the next decoder input.
            target_seq = np.zeros((1, max_target_len))
            target_seq[0, 0] = max_val_index
            states_val = [decoder_h, decoder_c]
        # Drop the trailing 'eos' before returning.
        return translated_sent[:-1], translated_index[:-1]
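    # For each language, translate every raw certificate line into standardized
    # text with the s2s model, then classify it into an ICD-10 code and write
    # one result file per run.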
    for lang in langs:
        df = pd.read_csv(RESULTS[lang]['in'], sep=';')
        source_val = [x.lower() for x in df.RawText.values.tolist()]
        source_val = s2s_source_tokenizer.texts_to_sequences(source_val)
        source_val = pad_sequences(source_val, maxlen=S2S_model.get_layer('input_1').input_shape[1], padding='post')
        results = []
        for seq_index in tqdm.tqdm(range(len(source_val))):
            inp_seq = source_val[seq_index:seq_index + 1]
            translated_sent, translated_index = decode_seq(inp_seq)
            # PREDICT ICD10: classify the generated standardized text.
            source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
            word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
            icd10_code_index = icd10_model.predict(word_sequence)
            max_val_index = np.argmax(icd10_code_index, axis=1)[0]
            # inverse_transform expects an array-like; unwrap the single label.
            icd10_label = icd10Encoder.inverse_transform([max_val_index])[0]
            results.append([" ".join(translated_sent), " ".join(translated_sent), icd10_label])
        tmp_df = pd.DataFrame(results)
        tmp_df.columns = ['Text1', 'Text2', 'ICD10']
        # .ix is deprecated; use .iloc to keep the first three metadata columns.
        result_df = pd.concat([df.iloc[:, :3], tmp_df], axis=1)
        result_df.to_csv(RESULTS[lang]['out'] + k + '.csv', sep=';')
@@ -114,8 +114,6 @@ def decode_seq(inp_seq):
y_true = []
y_pred = []
# for seq_index in range(len(source_corpus)):
source_val = s2s_source_tokenizer.texts_to_sequences(source_val)
source_val = pad_sequences(source_val, maxlen=S2S_model.get_layer('input_1').input_shape[1], padding='post')
......