diff --git a/code_jurica/_config.py b/code_jurica/_config.py
index 8ea5d0c745372a3c97c92233e2ba4f8339153718..fac822140a4f281e3f4752565874893077a61a91 100644
--- a/code_jurica/_config.py
+++ b/code_jurica/_config.py
@@ -9,6 +9,27 @@
 DATA_FR=FR_HOME+'corpus/'
 DATA_HU=HU_HOME+'corpus/'
 DATA_IT=IT_HOME+'corpus/'
+TEST_FR = 'data/test/FR/raw/corpus/'
+TEST_IT = 'data/test/IT/raw/corpus/'
+TEST_HU = 'data/test/HU/raw/corpus/'
+
+RESULTS_DIR = 'data/results/WBI/'
+
+RESULTS = {
+    'FR': {
+        'in': TEST_FR + 'CausesBrutes_FR_2015F_1.csv',
+        'out': RESULTS_DIR + 'FR/raw/'
+    },
+    'HU': {
+        'in': TEST_HU + 'CausesBrutes_HU_2.csv',
+        'out': RESULTS_DIR + 'HU/raw/'
+    },
+    'IT': {
+        'in': TEST_IT + 'CausesBrutes_IT_2.csv',
+        'out': RESULTS_DIR + 'IT/raw/'
+    }
+}
+
 TRAINING = {
     'FR': {
         'CB': [
diff --git a/code_jurica/data/results/methods.txt b/code_jurica/data/results/methods.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6dd08fec085c69354c0f8700d0382e700aa5ce1f
--- /dev/null
+++ b/code_jurica/data/results/methods.txt
@@ -0,0 +1 @@
+The task was solved using two Deep Learning models: 1) a sequence-to-sequence model with attention and 2) a separate classification model. The models were trained using only the data provided by the organizers, mostly due to our inability to obtain ICD10 dictionaries for the selected languages.
\ No newline at end of file
diff --git a/code_jurica/data/results/team.txt b/code_jurica/data/results/team.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7dcd995dd15cd717c756e6f866ca3bb94c0ab56d
--- /dev/null
+++ b/code_jurica/data/results/team.txt
@@ -0,0 +1 @@
+1 postdoc and 1 PhD student collaborated to solve this problem with a language-independent approach.
\ No newline at end of file
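Note: the sketch below condenses the two-stage pipeline that methods.txt describes and that generate_runs.py (next file in this patch) implements: the sequence-to-sequence model first normalises a raw certificate line, then the attention-based classifier maps the normalised text to an ICD10 code. The function name predict_icd10 and its arguments are illustrative stand-ins for the models, tokenizers and label encoder loaded in generate_runs.py; they are not identifiers from this repository.

import numpy as np
from keras.preprocessing.sequence import pad_sequences

def predict_icd10(raw_text, decode_seq, s2s_source_tokenizer, s2s_maxlen,
                  icd10_model, icd10_tokenizer, icd10_maxlen, icd10_encoder):
    # Stage 1: raw certificate text -> normalised token sequence (seq2seq with attention).
    seq = s2s_source_tokenizer.texts_to_sequences([raw_text.lower()])
    seq = pad_sequences(seq, maxlen=s2s_maxlen, padding='post')
    translated_sent, _ = decode_seq(seq)
    normalised = " ".join(translated_sent)

    # Stage 2: normalised text -> ICD10 code (separate classification model).
    words = icd10_tokenizer.texts_to_sequences([normalised])
    words = pad_sequences(words, maxlen=icd10_maxlen, padding='post')
    probs = icd10_model.predict(words)
    code = icd10_encoder.inverse_transform(np.argmax(probs, axis=1))[0]
    return normalised, code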
diff --git a/code_jurica/generate_runs.py b/code_jurica/generate_runs.py
new file mode 100644
index 0000000000000000000000000000000000000000..08d898c8e19dbea08e453143fc0289420746b13e
--- /dev/null
+++ b/code_jurica/generate_runs.py
@@ -0,0 +1,150 @@
+from _config import *
+import pickle
+import tqdm
+from _layers import Attention
+from keras.models import Model, load_model as keras_load_model
+from keras.layers import Input
+from keras.preprocessing.sequence import pad_sequences
+from sklearn.metrics import classification_report
+from util import report_to_df
+import numpy as np
+import pandas as pd
+
+#REPRODUCIBLE
+np.random.seed(42)
+import random
+random.seed(12345)
+import os
+os.environ['PYTHONHASHSEED'] = '0'
+
+import tensorflow as tf
+session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+from keras import backend as K
+tf.set_random_seed(1234)
+sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
+K.set_session(sess)
+#REPRODUCIBLE
+
+langs = ['IT', 'HU', 'FR']
+runs = {
+    'run1': {
+        'icd10': '_duplicated',
+        's2s': ''
+    },
+    'run2': {
+        'icd10': '_duplicated',
+        's2s': '_extended'
+    }
+}
+
+for k, v in runs.items():
+
+    # ICD 10 STUFF
+    icd10_model = keras_load_model('models/icd10Classification_attention{}.h5'.format(v['icd10']), custom_objects={'Attention': Attention})
+    with open('models/icd10_tokenizer{}.p'.format(v['icd10']), 'rb') as handle:
+        icd10Tokenizer = pickle.load(handle)
+
+    with open('models/icd10_mappings{}.p'.format(v['icd10']), 'rb') as handle:
+        icd10Encoder = pickle.load(handle)
+    # ICD 10 STUFF
+
+    # S2S STUFF
+    S2S_model = keras_load_model('models/s2s{}.h5'.format(v['s2s']), custom_objects={'Attention': Attention})
+    with open('models/s2s_source_tokenizer{}.p'.format(v['s2s']), 'rb') as handle:
+        s2s_source_tokenizer = pickle.load(handle)
+    source_vocab = s2s_source_tokenizer.word_index
+    source_index_to_word_dict = {v: k.strip() for k, v in s2s_source_tokenizer.word_index.items()}
+
+    with open('models/s2s_target_tokenizer{}.p'.format(v['s2s']), 'rb') as handle:
+        s2s_target_tokenizer = pickle.load(handle)
+
+    target_vocab = s2s_target_tokenizer.word_index
+    target_index_to_word_dict = {v: k.strip() for k, v in s2s_target_tokenizer.word_index.items()}
+    # S2S STUFF
+
+    # INFERENCE MODELS
+    encoder_input = S2S_model.get_layer('input_1').output
+    decoder_input = S2S_model.get_layer('input_2').output
+    x, state_h, state_c = S2S_model.get_layer('lstm_1').output
+    encoder_states = [state_h, state_c]
+
+    embed_2 = S2S_model.get_layer('embedding_2').output
+    decoder_LSTM = S2S_model.get_layer('lstm_2')
+    decoder_dense = S2S_model.get_layer('dense_1')
+
+    # Encoder inference model
+    encoder_model_inf = Model(encoder_input, encoder_states)
+
+    # Decoder inference model
+    decoder_state_input_h = Input(shape=(256,), name='inf_input1')
+    decoder_state_input_c = Input(shape=(256,), name='inf_input2')
+    decoder_input_states = [decoder_state_input_h, decoder_state_input_c]
+
+    decoder_out, decoder_h, decoder_c = decoder_LSTM(embed_2, initial_state=decoder_input_states)
+    decoder_states = [decoder_h, decoder_c]
+    decoder_out = decoder_dense(decoder_out)
+
+    decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
+                              outputs=[decoder_out] + decoder_states)
+
+    # encoder_model_inf.summary()
+    # decoder_model_inf.summary()
+
+    def decode_seq(inp_seq):
+        states_val = encoder_model_inf.predict(inp_seq)
+        target_seq = np.zeros((1, S2S_model.get_layer('input_2').output_shape[1]))
+        target_seq[0, 0] = target_vocab['sos']
+
+        translated_sent = []
+        translated_index = []
+        stop_condition = False
+
+        while not stop_condition:
+            decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
+            max_val_index = np.argmax(decoder_out[0, -1, :])
+            try:
+                sampled_fra_char = target_index_to_word_dict[max_val_index]
+            except KeyError:
+                sampled_fra_char = 'eos'
+
+            translated_sent.append(sampled_fra_char)
+            translated_index.append(max_val_index)
+
+            if (sampled_fra_char == 'eos') or (len(translated_sent) > S2S_model.get_layer('input_2').output_shape[1]):
+                stop_condition = True
+
+            target_seq = np.zeros((1, S2S_model.get_layer('input_2').output_shape[1]))
+            target_seq[0, 0] = max_val_index
+            states_val = [decoder_h, decoder_c]
+
+        return translated_sent[:-1], translated_index[:-1]
+
+    for lang in langs:
+        df = pd.read_csv(RESULTS[lang]['in'], sep=';')
+        source_val = [x.lower() for x in df.RawText.values.tolist()]
+        source_val = s2s_source_tokenizer.texts_to_sequences(source_val)
+        source_val = pad_sequences(source_val, maxlen=S2S_model.get_layer('input_1').input_shape[1], padding='post')
+
+        results = []
+        for seq_index in tqdm.tqdm(range(len(source_val))):
+
+            inp_seq = source_val[seq_index:seq_index + 1]
+            translated_sent, translated_index = decode_seq(inp_seq)
+            # print(translated_sent)
+            # input('ts')
+
+            # PREDICT ICD10
+            source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
+            word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
+            icd10_code_index = icd10_model.predict(word_sequence)
+            # print(icd10_code_index, type(icd10_code_index))
+            max_val_index = np.argmax(icd10_code_index, axis=1)[0]
+            # print(max_val_index)
+            icd10_label = icd10Encoder.inverse_transform(max_val_index)
+
+            results.append([" ".join(translated_sent), " ".join(translated_sent), icd10_label])
+
+        tmp_df = pd.DataFrame(results)
+        tmp_df.columns = ['Text1', 'Text2', 'ICD10']
+        result_df = pd.concat([df.ix[:, :3], tmp_df], axis=1)
+        result_df.to_csv(RESULTS[lang]['out'] + k + '.csv', sep=';')
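For orientation, each run writes one CSV per language to RESULTS[lang]['out']: the first three columns of the input CausesBrutes file are kept and the predicted Text1, Text2 and ICD10 columns are appended. A minimal sanity check of such a file could look like the sketch below; the concrete path is only an example derived from the RESULTS configuration above ('data/results/WBI/FR/raw/' plus the run name), not a file shipped with this patch.

import pandas as pd

# Load one generated run file; generate_runs.py writes it with a ';'
# separator and the default pandas index in the first column.
run_df = pd.read_csv('data/results/WBI/FR/raw/run1.csv', sep=';', index_col=0)

# Expect the first three columns of the CausesBrutes input plus the
# appended Text1, Text2 and ICD10 predictions.
print(run_df.columns.tolist())
print(run_df[['Text1', 'ICD10']].head())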
diff --git a/code_jurica/test.py b/code_jurica/test.py
index 5f7db76a71240f9cd929eb05c6bd8eb7abed6ff1..7fac572fcb945a7f75fee9d7e91f9671c79aaf30 100644
--- a/code_jurica/test.py
+++ b/code_jurica/test.py
@@ -114,8 +114,6 @@ def decode_seq(inp_seq):
 y_true = []
 y_pred = []
 
-# for seq_index in range(len(source_corpus)):
-
 source_val = s2s_source_tokenizer.texts_to_sequences(source_val)
 source_val = pad_sequences(source_val, maxlen=S2S_model.get_layer('input_1').input_shape[1], padding='post')
 