From 39db3e78964b9d2596db93fc1b4456e5fb4592e4 Mon Sep 17 00:00:00 2001
From: Jurica Seva <seva@informatik.hu-berlin.de>
Date: Thu, 3 May 2018 09:37:54 +0200
Subject: [PATCH] Added ICD 10 classification with attention (best results),
 attention with context (worst results) and char level classification
 (running atm).

---
 code_jurica/classificationICD10_attention.py  |  29 +++--
 .../classificationICD10_attention_char.py     |  21 ++--
 code_jurica/loader.py                         |   7 +-
 code_jurica/seq2seq.py                        |  66 ++++++----
 code_jurica/test.py                           | 115 ++++++++++++++++++
 code_jurica/train.sh                          |   3 +
 code_jurica/util.py                           |  16 ++-
 7 files changed, 212 insertions(+), 45 deletions(-)
 create mode 100644 code_jurica/test.py
 create mode 100644 code_jurica/train.sh

diff --git a/code_jurica/classificationICD10_attention.py b/code_jurica/classificationICD10_attention.py
index c8d794f..14cf975 100644
--- a/code_jurica/classificationICD10_attention.py
+++ b/code_jurica/classificationICD10_attention.py
@@ -8,6 +8,7 @@ import random
 import traceback
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import train_test_split
+from collections import Counter
 
 from keras.preprocessing.sequence import pad_sequences
 from keras.preprocessing.text import Tokenizer
@@ -53,11 +54,25 @@ tokenizer=TokenizePreprocessor()
 kerasTokenizer = Tokenizer()
 dataLoader=prepareData()
 corpora=dataLoader.prepareDictionaries()
-print("Extracted {} data points".format(len(corpora)))
+
+tmp =[x[1] for x in corpora]
+labels_c = Counter(tmp)
+labels_tmp = {k:v for k,v in labels_c.items() if v > 1}
+
+corpus = []
+labels = []
+
+for item in corpora:
+
+    if item[1] in labels_tmp:
+        labels.append(item[1])
+        corpus.append(item[0])
+
+num_labels=len(list(set(labels)))
+print("Extracted {} data points with {} unique labels".format(len(corpus), num_labels))
 
 # input('size of corpora')
 #prepareing the texts for input in RNN
-corpus=[x[0] for x in corpora]
 tokens=tokenizer.transform([x for x in corpus])
 tmp=[item for item in list(set(flatten(tokens))) if item.strip()]
 vocabulary={item.strip():i+1 for i,item in enumerate(tmp)}
@@ -81,9 +96,6 @@ embedding_layer = Embedding(
     mask_zero=True)
 
 #preparing the labels as one hot encoding vector
-labels=[x[1] for x in corpora]
-num_labels=len(list(set(labels)))
-print("Labels: {}\tUnique labels:{}".format(len(labels), num_labels))
 encoder = LabelEncoder()
 encoder.fit(labels)
 with open('models/icd10_mappings.p', 'wb') as handle:
@@ -94,7 +106,7 @@ encoded_Y = encoder.transform(labels)
 # convert integers to dummy variables (i.e. one hot encoded)
 labels_one_hot = np_utils.to_categorical(encoded_Y)
 
-X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.05, random_state=777)
+X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.15, random_state=777, stratify=labels)
 print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test))
 
 try:
@@ -114,11 +126,12 @@ try:
     model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
     model.summary()
     print("Traning Model...")
-    model.fit(X_train, Y_train,
+    model.fit(word_sequence, labels_one_hot,
               batch_size=batch_size,
               epochs=epochs,
               callbacks=callbacks_list,
-              validation_split=0.25
+              validation_data=[X_test, Y_test]
+              # validation_split=0.25
               )
 
 except Exception as e:
diff --git a/code_jurica/classificationICD10_attention_char.py b/code_jurica/classificationICD10_attention_char.py
index 9bf696a..8d9d2f2 100644
--- a/code_jurica/classificationICD10_attention_char.py
+++ b/code_jurica/classificationICD10_attention_char.py
@@ -6,6 +6,7 @@ from util import *
 import numpy as np
 import random
 import traceback
+from collections import Counter
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import train_test_split
 
@@ -57,17 +58,21 @@ chars = set()
 kerasTokenizer = Tokenizer(char_level=True, filters=None)
 dataLoader=prepareData()
 corpora=dataLoader.prepareDictionaries()
-print("Extracted {} data points".format(len(corpora)))
+tmp =[x[1] for x in corpora]
+labels_c = Counter(tmp)
+labels_tmp = {k:v for k,v in labels_c.items() if v > 1}
+
+
 
 # input('size of corpora')
 for line in corpora:
+    if line[1] in labels_tmp:
+        labels.append(line[1])
+        sentences.append(line[0])
 
-    labels.append(line[1])
-    sentences.append(line[0])
-
-    for ch in line[0]:
-        if (ch not in chars):
-            chars.add(ch)
+        for ch in line[0]:
+            if (ch not in chars):
+                chars.add(ch)
 
 
 chars = sorted(list(chars))
@@ -107,7 +112,7 @@ encoded_Y = encoder.transform(labels)
 # convert integers to dummy variables (i.e. one hot encoded)
 labels_one_hot = np_utils.to_categorical(encoded_Y)
 
-X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.05, random_state=777)
+X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.05, random_state=777, stratify=True)
 print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test))
 
 try:
diff --git a/code_jurica/loader.py b/code_jurica/loader.py
index e4b9c05..72e7b7e 100644
--- a/code_jurica/loader.py
+++ b/code_jurica/loader.py
@@ -40,6 +40,9 @@ tmp =[item for item in list(set(flatten(source_tokens))) if item.strip()]
 source_vocab = {item.strip():i+1 for i,item in enumerate(tmp)}
 source_index_to_word_dict = {i+1:item.strip() for i,item in enumerate(tmp)}
 kerasTokenizer.word_index=source_vocab
+with open('models/s2s_source_tokenizer.p', 'wb') as handle:
+    pickle.dump(kerasTokenizer, handle)
+
 source_word_sequence=kerasTokenizer.texts_to_sequences(source_corpus)
 source_max_sequence = max([len(x) for x in source_word_sequence])
 source_word_sequence = pad_sequences(source_word_sequence, maxlen=source_max_sequence, padding='post')
@@ -51,6 +54,9 @@ tmp=[item for item in list(set(flatten(target_tokens))) if item.strip()]
 target_vocab = {item.strip():i+1 for i,item in enumerate(tmp)}
 target_index_to_word_dict = {i+1:item.strip() for i,item in enumerate(tmp)}
 kerasTokenizer.word_index=target_vocab
+with open('models/s2s_target_tokenizer.p', 'wb') as handle:
+    pickle.dump(kerasTokenizer, handle)
+
 target_word_sequence=kerasTokenizer.texts_to_sequences(target_corpus)
 target_max_sequence = max([len(x) for x in target_word_sequence])
 target_word_sequence = pad_sequences(target_word_sequence, maxlen=target_max_sequence, padding='post')
@@ -82,7 +88,6 @@ target_embedding_layer = Embedding(target_embeddings.shape[0],
 
 source_train, source_val, _, _ = train_test_split(source_word_sequence, labels, test_size=0.05, random_state=777)
 target_train, target_val, labels_train, labels_val = train_test_split(target_word_sequence, labels, test_size=0.05, random_state=777)
-
 #target_val_onehot = np.zeros((len(target_val), target_max_sequence, len(target_vocab)))
 #target_train_onehot = np.zeros((len(target_train), target_max_sequence, len(target_vocab)))
 
diff --git a/code_jurica/seq2seq.py b/code_jurica/seq2seq.py
index 1b4087c..51564ce 100644
--- a/code_jurica/seq2seq.py
+++ b/code_jurica/seq2seq.py
@@ -2,14 +2,16 @@
 # experiment = Experiment(api_key="hSd9vTj0EfMu72569YnVEvtvj")
 
 from loader import *
 
-
+from _layers import AttentionWithContext, Attention
 from keras.models import Model, load_model as keras_load_model
 from keras.layers import Input, LSTM, Dense, Embedding, GRU
 from keras.utils import multi_gpu_model
 from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
 import tensorflow as tf
+import tqdm
 import pickle
+from sklearn.metrics import classification_report
 
 ###################################
 # TensorFlow wizardry
@@ -21,10 +23,11 @@ config.gpu_options.allocator_type = 'BFC'
 
 # LOAD ICD 10 CLASSIFICATION MODEL
 try:
-    icd10_model = keras_load_model('models/icd10Classification.h5')
+    icd10_model = keras_load_model('models/icd10Classification_attention.h5',
+                                   custom_objects={'Attention':Attention})
 except OSError:
     from classificationICD10 import *
-    icd10_model = keras_load_model('models/icd10Classification.h5')
+    icd10_model = keras_load_model('models/icd10Classification_attention.h5')
 
 with open('models/icd10_tokenizer.p', 'rb') as handle:
     kerasTokenizer = pickle.load(handle)
@@ -45,7 +48,7 @@ callbacks_list = [
         save_best_only=True,
     ),
     CSVLogger(
-        append=True,
+        append=False,
        filename='logs/s2s_{}.csv'.format(date_label)
     )
 ]
@@ -81,7 +84,7 @@ model.fit([source_train, target_train],
           batch_size=batch_size,
           callbacks=callbacks_list,
           epochs=epochs,
-          validation_split=0.25
+          validation_split=0.2
           # validation_data=([source_val, target_val], target_val_onehot)
           )
 
@@ -115,7 +118,12 @@ def decode_seq(inp_seq):
     while not stop_condition:
         decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
         max_val_index = np.argmax(decoder_out[0, -1, :])
-        sampled_fra_char = target_index_to_word_dict[max_val_index]
+        try:
+            sampled_fra_char = target_index_to_word_dict[max_val_index]
+        except KeyError:
+            # stop_condition = True
+            sampled_fra_char = 'eos'
+
         translated_sent.append(sampled_fra_char)
         translated_index.append(max_val_index)
 
@@ -126,9 +134,13 @@ def decode_seq(inp_seq):
         target_seq = np.zeros((1, target_max_sequence))
         target_seq[0, 0] = max_val_index
         states_val = [decoder_h, decoder_c]
-    return translated_sent, translated_index
+    return translated_sent[:-1], translated_index[:-1]
+
+y_true = []
+y_pred = []
 
-for seq_index in range(10):
+for seq_index in tqdm.tqdm(range(len(source_val))):
+# for seq_index in range(10):
     inp_seq = source_val[seq_index:seq_index+1]
     translated_sent, translated_index= decode_seq(inp_seq)
 
@@ -136,18 +148,26 @@ for seq_index in range(10):
     source_word_sequence = kerasTokenizer.texts_to_sequences([" ".join(translated_sent)])
     word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
     icd10_code_index = icd10_model.predict(word_sequence)
-    print(icd10_code_index, type(icd10_code_index))
-    max_val_index = np.argmax(icd10_code_index, axis=0)
-    print(max_val_index)
-    ice10_label = encoded_Y.inverse_transform(max_val_index)
-
-    print('-')
-    target_index = np.trim_zeros(target_val[seq_index], 'b')
-    print('Target indexes:', target_index)
-    print('Decoded indexes:', translated_index)
-
-    print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
-    print('Decoded sentence:', " ".join(translated_sent))
-
-    print('Target ICD-10:', labels_val[seq_index])
-    print('Predict ICD-10:', ice10_label)
\ No newline at end of file
+    # print(icd10_code_index, type(icd10_code_index))
+    max_val_index = np.argmax(icd10_code_index, axis=1)[0]
+    # print(max_val_index)
+    icd10_label = encoded_Y.inverse_transform(max_val_index)
+
+    # print('-')
+    # target_index = np.trim_zeros(target_val[seq_index], 'b')[1:-1]
+    # print('Target indexes:', target_index)
+    # print('Decoded indexes:', translated_index)
+    #
+    # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
+    # print('Decoded sentence:', " ".join(translated_sent))
+    #
+    # print('Target ICD-10:', labels_val[seq_index])
+    # print('Predict ICD-10:', icd10_label)
+
+    y_true.append(labels_val[seq_index])
+    y_pred.append(icd10_label)
+
+report = classification_report(y_true, y_pred)
+report_df = report_to_df(report)
+report_df.to_csv('logs/classification_report.csv')
+print(report_df)
\ No newline at end of file
diff --git a/code_jurica/test.py b/code_jurica/test.py
new file mode 100644
index 0000000..c3dd296
--- /dev/null
+++ b/code_jurica/test.py
@@ -0,0 +1,115 @@
+import pickle
+import tqdm
+from _layers import Attention
+from keras.models import Model, load_model as keras_load_model
+from keras.layers import Input
+
+# ICD 10 STUFF
+icd10_model = keras_load_model('models/icd10Classification_attention.h5', custom_objects={'Attention':Attention})
+with open('models/icd10_tokenizer.p', 'rb') as handle:
+    icd10Tokenizer = pickle.load(handle)
+
+with open('models/icd10_mappings.p', 'rb') as handle:
+    icd10Encoder = pickle.load(handle)
+# ICD 10 STUFF
+
+# S2S STUFF
+S2S_model = keras_load_model('models/s2s.h5', custom_objects={'Attention':Attention})
+with open('models/s2s_source_tokenizer.p', 'rb') as handle:
+    s2s_source_tokenizer = pickle.load(handle)
+source_index_to_word_dict = {v:k.strip() for k,v in s2s_source_tokenizer.word_index.items()}
+
+with open('models/s2s_target_tokenizer.p', 'rb') as handle:
+    s2s_target_tokenizer = pickle.load(handle)
+target_index_to_word_dict = {v:k.strip() for k,v in s2s_target_tokenizer.word_index.items()}
+# S2S STUFF
+
+# INFERENCE MODELS
+encoder_input = S2S_model.get_layer('input_1').output
+decoder_input = S2S_model.get_layer('input_2').output
+x, state_h, state_c = S2S_model.get_layer('lstm_1').output
+encoder_states = [state_h, state_c]
+
+embed_2 = S2S_model.get_layer('embedding_2').output
+decoder_LSTM = S2S_model.get_layer('lstm_2').output
+decoder_dense = S2S_model.get_layer('dense_1').output
+
+# Encoder inference model
+encoder_model_inf = Model(encoder_input, encoder_states)
+
+# Decoder inference model
+decoder_state_input_h = Input(shape=(256,))
+decoder_state_input_c = Input(shape=(256,))
+decoder_input_states = [decoder_state_input_h, decoder_state_input_c]
+
+decoder_out, decoder_h, decoder_c = decoder_LSTM(embed_2, initial_state=decoder_input_states)
+decoder_states = [decoder_h , decoder_c]
+decoder_out = decoder_dense(decoder_out)
+
+decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
+                          outputs=[decoder_out] + decoder_states )
+
+def decode_seq(inp_seq):
+    states_val = encoder_model_inf.predict(inp_seq)
+
+    target_seq = np.zeros((1, target_max_sequence))
+    target_seq[0, 0] = target_vocab['sos']
+
+    translated_sent = []
+    translated_index = []
+    stop_condition = False
+
+    while not stop_condition:
+        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
+        max_val_index = np.argmax(decoder_out[0, -1, :])
+        try:
+            sampled_fra_char = target_index_to_word_dict[max_val_index]
+        except KeyError:
+            # stop_condition = True
+            sampled_fra_char = 'eos'
+
+        translated_sent.append(sampled_fra_char)
+        translated_index.append(max_val_index)
+
+        if ((sampled_fra_char == 'eos') or (len(translated_sent) > target_max_sequence)):
+            stop_condition = True
+
+        target_seq = np.zeros((1, target_max_sequence))
+        target_seq[0, 0] = max_val_index
+        states_val = [decoder_h, decoder_c]
+
+    return translated_sent[:-1], translated_index[:-1]
+
+
+for seq_index in tqdm.tqdm(range(len(source_val))):
+# for seq_index in range(10):
+    inp_seq = source_val[seq_index:seq_index+1]
+    translated_sent, translated_index= decode_seq(inp_seq)
+
+    # PREDICT ICD10
+    source_word_sequence = kerasTokenizer.texts_to_sequences([" ".join(translated_sent)])
+    word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
+    icd10_code_index = icd10_model.predict(word_sequence)
+    # print(icd10_code_index, type(icd10_code_index))
+    max_val_index = np.argmax(icd10_code_index, axis=1)[0]
+    # print(max_val_index)
+    icd10_label = encoded_Y.inverse_transform(max_val_index)
+
+    # print('-')
+    # target_index = np.trim_zeros(target_val[seq_index], 'b')[1:-1]
+    # print('Target indexes:', target_index)
+    # print('Decoded indexes:', translated_index)
+    #
+    # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
+    # print('Decoded sentence:', " ".join(translated_sent))
+    #
+    # print('Target ICD-10:', labels_val[seq_index])
+    # print('Predict ICD-10:', icd10_label)
+
+    y_true.append(labels_val[seq_index])
+    y_pred.append(icd10_label)
+
+report = classification_report(y_true, y_pred)
+report_df = report_to_df(report)
+report_df.to_csv('logs/classification_report.csv')
+print(report_df)
\ No newline at end of file
diff --git a/code_jurica/train.sh b/code_jurica/train.sh
new file mode 100644
index 0000000..ae226cf
--- /dev/null
+++ b/code_jurica/train.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+CUDA_VISIBLE_DEVICES=3 /home/sevajuri/anaconda3/bin/python3 /home/sevajuri/projects/clef18/code_jurica/classificationICD10_attention.py
+CUDA_VISIBLE_DEVICES=3 /home/sevajuri/anaconda3/bin/python3 /home/sevajuri/projects/clef18/code_jurica/seq2seq.py
diff --git a/code_jurica/util.py b/code_jurica/util.py
index 4d93a77..54255bb 100644
--- a/code_jurica/util.py
+++ b/code_jurica/util.py
@@ -11,6 +11,7 @@ import pickle
 from fastText import load_model
 import math
 import datetime
+from io import StringIO
 
 now = datetime.datetime.now()
 date_label=now.strftime("%Y_%m_%d")
@@ -104,6 +105,11 @@ prenormalize_dict = [
     (re.compile("\s"), " "),
 ]
 
+def report_to_df(report):
+    report = re.sub(r" +", " ", report).replace("avg / total", "avg/total").replace("\n ", "\n")
+    report_df = pd.read_csv(StringIO("Classes" + report), sep=' ', index_col=0)
+    return(report_df)
+
 def map_regex_concepts(token):
     """replaces abbreviations matching simple REs, e.g. for numbers, percentages, gene names, by class tokens"""
     for regex, repl in regex_concept_dict:
@@ -252,17 +258,17 @@ class prepareData():
 
         for index, row in df.iterrows():
             try:
                 text = ' '.join([row['DiagnosisText'], row['Standardized']])
-                label = row['Icd1']
             except (KeyError, TypeError):
                 text = row['DiagnosisText']
-                label = row['Icd1']
+
+            label = str(row['Icd1']).strip().upper()[:4]
 
             if not isinstance(text, float):
-                preparedDictionary.append([text, label])
+                preparedDictionary.append([text.lower(), label])
             else:
                 if not math.isnan(text):
-                    preparedDictionary.append([ text, label ])
+                    preparedDictionary.append([ text.lower(), label ])
 
         pickle.dump(preparedDictionary, open(DICT_PREPROCESED, 'wb'))
-        return preparedDictionary
+        return preparedDictionary
\ No newline at end of file
--
GitLab