diff --git a/.idea/deployment.xml b/.idea/deployment.xml
index a0101359350c91a85fc7a058e2211bbc993ccb65..242dd10ff439bfb0dc8cfd560ff76be8c1b1def9 100644
--- a/.idea/deployment.xml
+++ b/.idea/deployment.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="PublishConfigData" autoUpload="Always" serverName="guppi" createEmptyFolders="true" autoUploadExternalChanges="true">
+  <component name="PublishConfigData" autoUpload="Always" serverName="guppi" createEmptyFolders="true" persistUploadOnCheckin="false" autoUploadExternalChanges="true">
     <serverData>
       <paths name="guppi">
         <serverdata>
diff --git a/code_jurica/classificationICD10_attention.py b/code_jurica/classificationICD10_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8d794fe0e31333ce9bd7bdc6ee34979b45efe01
--- /dev/null
+++ b/code_jurica/classificationICD10_attention.py
@@ -0,0 +1,127 @@
+# from comet_ml import Experiment
+# experiment=Experiment(api_key="hSd9vTj0EfMu72569YnVEvtvj")
+
+# from loader import *
+from util import *
+import numpy as np
+import random
+import traceback
+from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import train_test_split
+
+from keras.preprocessing.sequence import pad_sequences
+from keras.preprocessing.text import Tokenizer
+from keras.optimizers import Adam
+from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
+from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional
+from keras.models import Model
+from keras.utils import multi_gpu_model, np_utils
+
+import tensorflow as tf
+
+from _layers import AttentionWithContext, Attention
+
+###################################
+# TensorFlow wizardry
+config=tf.ConfigProto()
+
+# Don't pre-allocate memory; allocate as-needed
+config.gpu_options.allow_growth=True
+config.gpu_options.allocator_type='BFC'
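+
+# Sketch, assuming Keras 2.x on the TensorFlow 1.x backend: the options above only
+# take effect once the config is attached to the session Keras actually uses.
+from keras.backend.tensorflow_backend import set_session
+set_session(tf.Session(config=config))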
labels:{}".format(len(labels), num_labels)) +encoder = LabelEncoder() +encoder.fit(labels) +with open('models/icd10_mappings.p', 'wb') as handle: + pickle.dump(encoder, handle) + +encoded_Y = encoder.transform(labels) + +# convert integers to dummy variables (i.e. one hot encoded) +labels_one_hot = np_utils.to_categorical(encoded_Y) + +X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.05, random_state=777) +print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test)) + +try: + # LAYERS + print("Creating Model...") + inputs = Input(shape=(max_sequence,)) + embedding = embedding_layer(inputs) + decoder_LSTM = Bidirectional(LSTM(latent_dim, return_sequences=True)) + decoder_out = decoder_LSTM(embedding) #, initial_state=encoder_states) + attention = Attention()(decoder_out) + decoder_dense = Dense(num_labels, activation='softmax') + decoder_out = decoder_dense(attention) + + #MODEL + model = Model(inputs=inputs, outputs=decoder_out) + adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0) + model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy']) + model.summary() + print("Traning Model...") + model.fit(X_train, Y_train, + batch_size=batch_size, + epochs=epochs, + callbacks=callbacks_list, + validation_split=0.25 + ) + +except Exception as e: + print(e) + traceback.print_exc() + diff --git a/code_jurica/classificationICD10_attention_char.py b/code_jurica/classificationICD10_attention_char.py new file mode 100644 index 0000000000000000000000000000000000000000..9bf696abfc64b0ef4b6a095aee7a608fc3783d73 --- /dev/null +++ b/code_jurica/classificationICD10_attention_char.py @@ -0,0 +1,141 @@ +# from comet_ml import Experiment +# experiment=Experiment(api_key="hSd9vTj0EfMu72569YnVEvtvj") + +# from loader import * +from util import * +import numpy as np +import random +import traceback +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import train_test_split + +from keras.preprocessing.sequence import pad_sequences +from keras.preprocessing.text import Tokenizer +from keras.optimizers import Adam +from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger +from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional +from keras.models import Model +from keras.utils import multi_gpu_model, np_utils + +import tensorflow as tf + +from _layers import AttentionWithContext, Attention + +################################### +# TensorFlow wizardry +config=tf.ConfigProto() + +# Don't pre-allocate memory; allocate as-needed +config.gpu_options.allow_growth=True +config.gpu_options.allocator_type='BFC' + +callbacks_list=[ + EarlyStopping( + monitor='val_loss', + patience=2, + ), + ModelCheckpoint( + filepath='models/icd10Classification_attention_char.h5', + monitor='val_loss', + save_best_only=True, + ), + CSVLogger( + append=True, + filename='logs/icd10Classification_attention_char_{}.csv'.format(date_label), + ) +] + +latent_dim = 256 +epochs = 100 +batch_size = 1000 + +sentences = [] +labels=[] +chars = set() + +# tokenizer=TokenizePreprocessor() +kerasTokenizer = Tokenizer(char_level=True, filters=None) +dataLoader=prepareData() +corpora=dataLoader.prepareDictionaries() +print("Extracted {} data points".format(len(corpora))) +# input('size of corpora') + +for line in corpora: + + labels.append(line[1]) + sentences.append(line[0]) + + for ch in line[0]: + if (ch not in chars): + chars.add(ch) + +chars = sorted(list(chars)) + 
+
+except Exception as e:
+    print(e)
+    traceback.print_exc()
+
diff --git a/code_jurica/classificationICD10_attention_char.py b/code_jurica/classificationICD10_attention_char.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bf696abfc64b0ef4b6a095aee7a608fc3783d73
--- /dev/null
+++ b/code_jurica/classificationICD10_attention_char.py
@@ -0,0 +1,141 @@
+# from comet_ml import Experiment
+# experiment=Experiment(api_key="hSd9vTj0EfMu72569YnVEvtvj")
+
+# from loader import *
+from util import *
+import numpy as np
+import random
+import traceback
+from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import train_test_split
+
+from keras.preprocessing.sequence import pad_sequences
+from keras.preprocessing.text import Tokenizer
+from keras.optimizers import Adam
+from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
+from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional
+from keras.models import Model
+from keras.utils import multi_gpu_model, np_utils
+
+import tensorflow as tf
+
+from _layers import AttentionWithContext, Attention
+
+###################################
+# TensorFlow wizardry
+config=tf.ConfigProto()
+
+# Don't pre-allocate memory; allocate as-needed
+config.gpu_options.allow_growth=True
+config.gpu_options.allocator_type='BFC'
+
+callbacks_list=[
+    EarlyStopping(
+        monitor='val_loss',
+        patience=2,
+    ),
+    ModelCheckpoint(
+        filepath='models/icd10Classification_attention_char.h5',
+        monitor='val_loss',
+        save_best_only=True,
+    ),
+    CSVLogger(
+        append=True,
+        filename='logs/icd10Classification_attention_char_{}.csv'.format(date_label),
+    )
+]
+
+latent_dim = 256
+epochs = 100
+batch_size = 1000
+
+sentences = []
+labels=[]
+chars = set()
+
+# tokenizer=TokenizePreprocessor()
+kerasTokenizer = Tokenizer(char_level=True, filters=None)
+dataLoader=prepareData()
+corpora=dataLoader.prepareDictionaries()
+print("Extracted {} data points".format(len(corpora)))
+# input('size of corpora')
+
+for line in corpora:
+
+    labels.append(line[1])
+    sentences.append(line[0])
+
+    for ch in line[0]:
+        if (ch not in chars):
+            chars.add(ch)
+
+chars = sorted(list(chars))
+
+char_to_index_dict={item.strip():i+1 for i,item in enumerate(chars)}
+index_to_char_dict={i+1:item.strip() for i,item in enumerate(chars)}
+
+kerasTokenizer.word_index=char_to_index_dict
+# saving
+with open('models/icd10_char_tokenizer.p', 'wb') as handle:
+    pickle.dump(kerasTokenizer, handle)
+
+char_sequence=kerasTokenizer.texts_to_sequences(sentences)
+# print(char_sequence)
+max_sequence = max([len(x) for x in char_sequence])
+word_sequence = pad_sequences(char_sequence, maxlen=max_sequence, padding='post')
+
+embedding_matrix=embedding_matrix(char_to_index_dict)
+embedding_layer = Embedding(
+    embedding_matrix.shape[0],
+    embedding_matrix.shape[1],
+    weights=[embedding_matrix],
+    input_length=max_sequence,
+    trainable=True,
+    mask_zero=True)
+
+#preparing the labels as one hot encoding vector
+
+num_labels=len(list(set(labels)))
+print("Labels: {}\tUnique labels:{}".format(len(labels), num_labels))
+encoder = LabelEncoder()
+encoder.fit(labels)
+with open('models/icd10_char_mappings.p', 'wb') as handle:
+    pickle.dump(encoder, handle)
+
+encoded_Y = encoder.transform(labels)
+
+# convert integers to dummy variables (i.e. one hot encoded)
+labels_one_hot = np_utils.to_categorical(encoded_Y)
+
+X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.05, random_state=777)
+print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test))
+
+try:
+    # LAYERS
+    print("Creating Model...")
+    inputs = Input(shape=(max_sequence,))
+    embedding = embedding_layer(inputs)
+    decoder_LSTM = Bidirectional(LSTM(latent_dim, return_sequences=True))
+    decoder_out = decoder_LSTM(embedding) #, initial_state=encoder_states)
+    attention = Attention()(decoder_out)
+    decoder_dense = Dense(num_labels, activation='softmax')
+    decoder_out = decoder_dense(attention)
+
+    #MODEL
+    model = Model(inputs=inputs, outputs=decoder_out)
+    # model = multi_gpu_model(tmp_model, gpus=2)
+    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
+    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
+    model.summary()
+    print("Training Model...")
+    model.fit(X_train, Y_train,
+        batch_size=batch_size,
+        epochs=epochs,
+        callbacks=callbacks_list,
+        validation_split=0.25
+    )
+
+except Exception as e:
+    print(e)
+    traceback.print_exc()
+
diff --git a/code_jurica/seq2seq.py b/code_jurica/seq2seq.py
index 41f589bba289c58c20afdb3b8c6e4de7239b4b51..1b4087c37a5be3c91e0214c19bb8f7148af5c62e 100644
--- a/code_jurica/seq2seq.py
+++ b/code_jurica/seq2seq.py
@@ -19,14 +19,12 @@
 config = tf.ConfigProto()
 config.gpu_options.allow_growth = True
 config.gpu_options.allocator_type = 'BFC'
-
 # LOAD ICD 10 CLASSIFICATION MODEL
-
 try:
-    icd10_model = keras_load_model('models/classificationICD10.h5')
+    icd10_model = keras_load_model('models/icd10Classification.h5')
 except OSError:
     from classificationICD10 import *
-    icd10_model = keras_load_model('models/classificationICD10.h5')
+    icd10_model = keras_load_model('models/icd10Classification.h5')
 
 with open('models/icd10_tokenizer.p', 'rb') as handle:
     kerasTokenizer = pickle.load(handle)