From 535a7a63b846fee6fffeb492fc2fb26a4eccb299 Mon Sep 17 00:00:00 2001 From: Jurica Seva <seva@informatik.hu-berlin.de> Date: Sat, 5 May 2018 12:21:03 +0200 Subject: [PATCH] Started working on a pipeline using all data. Issues with memory (RAM) while generating one-hot label encodings. Solutions: use data generators to train via fit_generator. Examples: https://github.com/keras-team/keras/issues/1627 https://www.kaggle.com/ezietsman/simple-keras-model-with-data-generator https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly.html --- code_jurica/classificationICD10_attention.py | 17 ++++++++-------- code_jurica/loader.py | 12 ++++++++--- code_jurica/seq2seq.py | 21 ++++++++++---------- code_jurica/test.py | 16 +++++++-------- code_jurica/util.py | 16 ++++++++++++++- 5 files changed, 52 insertions(+), 30 deletions(-) diff --git a/code_jurica/classificationICD10_attention.py b/code_jurica/classificationICD10_attention.py index a82fac8..a99624f 100644 --- a/code_jurica/classificationICD10_attention.py +++ b/code_jurica/classificationICD10_attention.py @@ -31,7 +31,8 @@ import os os.environ['PYTHONHASHSEED'] = '0' import tensorflow as tf -config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) +# config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) +config = tf.ConfigProto() from keras import backend as K tf.set_random_seed(1234) #REPRODUCIBLE @@ -54,24 +55,24 @@ callbacks_list=[ min_delta=0.005 ), ModelCheckpoint( - filepath='models/icd10Classification_attention.h5', + filepath='models/icd10Classification_attention_extended.h5', monitor='val_loss', save_best_only=True, ), CSVLogger( append=True, - filename='logs/icd10Classification_attention_{}.csv'.format(date_label), + filename='logs/icd10Classification_attention_extended_{}.csv'.format(date_label), ) ] latent_dim = 512 -epochs = 100 +epochs = 2 batch_size = 1000 tokenizer=TokenizePreprocessor() kerasTokenizer = Tokenizer() dataLoader=prepareData() -corpora=dataLoader.prepareDictionaries() +corpora=dataLoader.prepareDictionaries(unbalanced=True) tmp =[x[1] for x in corpora] labels_c = Counter(tmp) @@ -97,7 +98,7 @@ vocabulary={item.strip():i+1 for i,item in enumerate(tmp)} index_to_word_dict={i+1:item.strip() for i,item in enumerate(tmp)} kerasTokenizer.word_index=vocabulary # saving -with open('models/icd10_tokenizer.p', 'wb') as handle: +with open('models/icd10_tokenizer_extended.p', 'wb') as handle: pickle.dump(kerasTokenizer, handle) source_word_sequence=kerasTokenizer.texts_to_sequences(corpus) @@ -116,7 +117,7 @@ embedding_layer = Embedding( #preparing the labels as one hot encoding vector encoder = LabelEncoder() encoder.fit(labels) -with open('models/icd10_mappings.p', 'wb') as handle: +with open('models/icd10_mappings_extended.p', 'wb') as handle: pickle.dump(encoder, handle) encoded_Y = encoder.transform(labels) @@ -124,7 +125,7 @@ encoded_Y = encoder.transform(labels) # convert integers to dummy variables (i.e. 
one hot encoded) labels_one_hot = np_utils.to_categorical(encoded_Y) -X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.15, random_state=777, stratify=labels) +X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.1, random_state=777, stratify=labels) print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test)) try: diff --git a/code_jurica/loader.py b/code_jurica/loader.py index eea1eaa..85db1d5 100644 --- a/code_jurica/loader.py +++ b/code_jurica/loader.py @@ -37,6 +37,12 @@ IT_sample=random.sample(itCorpora, min_elements) HU_sample=random.sample(huCorpora, min_elements) corpora=FR_sample+HU_sample+IT_sample +# corpora=frCorpora+itCorpora+huCorpora +# print(len(corpora)) +# corpora=corpora[:int(len(corpora)*0.5)] +# print(len(corpora)) +# input('bla') + #labels - icd10 codes labels=[str(x[2]).strip() for x in corpora] @@ -48,7 +54,7 @@ tmp =[item for item in list(set(flatten(source_tokens))) if item.strip()] source_vocab = {item.strip():i+1 for i,item in enumerate(tmp)} source_index_to_word_dict = {i+1:item.strip() for i,item in enumerate(tmp)} kerasTokenizer.word_index=source_vocab -with open('models/s2s_source_tokenizer.p', 'wb') as handle: +with open('models/s2s_source_tokenizer_extended.p', 'wb') as handle: pickle.dump(kerasTokenizer, handle) source_word_sequence=kerasTokenizer.texts_to_sequences(source_corpus) @@ -62,7 +68,7 @@ tmp=[item for item in list(set(flatten(target_tokens))) if item.strip()] target_vocab = {item.strip():i+1 for i,item in enumerate(tmp)} target_index_to_word_dict = {i+1:item.strip() for i,item in enumerate(tmp)} kerasTokenizer.word_index=target_vocab -with open('models/s2s_target_tokenizer.p', 'wb') as handle: +with open('models/s2s_target_tokenizer_extended.p', 'wb') as handle: pickle.dump(kerasTokenizer, handle) target_word_sequence=kerasTokenizer.texts_to_sequences(target_corpus) @@ -105,7 +111,7 @@ data_set_train_test = { 'labels_val':labels_val } -with open('models/train_test_split.p', 'wb') as handle: +with open('models/train_test_split_extended.p', 'wb') as handle: pickle.dump(data_set_train_test, handle) target_train_onehot = np.zeros((len(target_train), target_max_sequence, len(target_vocab)+1)) diff --git a/code_jurica/seq2seq.py b/code_jurica/seq2seq.py index fd22ce9..5ff7310 100644 --- a/code_jurica/seq2seq.py +++ b/code_jurica/seq2seq.py @@ -21,7 +21,8 @@ import os os.environ['PYTHONHASHSEED'] = '0' import tensorflow as tf -config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) +# config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) +config = tf.ConfigProto() from keras import backend as K tf.set_random_seed(1234) #REPRODUCIBLE @@ -40,16 +41,16 @@ K.set_session(sess) # LOAD ICD 10 CLASSIFICATION MODEL try: - icd10_model = keras_load_model('models/icd10Classification_attention.h5', + icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5', custom_objects={'Attention':Attention}) except OSError: from classificationICD10 import * - icd10_model = keras_load_model('models/icd10Classification_attention.h5') + icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5') -with open('models/icd10_tokenizer.p', 'rb') as handle: +with open('models/icd10_tokenizer_extended.p', 'rb') as handle: icd10Tokenizer = pickle.load(handle) -with open('models/icd10_mappings.p', 'rb') as handle: +with open('models/icd10_mappings_extended.p', 'rb') as 
handle: encoded_Y = pickle.load(handle) # LOAD ICD 10 CLASSIFICATION MODEL @@ -60,19 +61,19 @@ callbacks_list = [ min_delta=0.005 ), ModelCheckpoint( - filepath='models/s2s.h5', + filepath='models/s2s_extended.h5', monitor='val_loss', save_best_only=True, ), CSVLogger( append=False, - filename='logs/s2s_{}.csv'.format(date_label) + filename='logs/s2s_extended_{}.csv'.format(date_label) ) ] latent_dim=256 batch_size=1000 -epochs=100 +epochs=2 print("Lets train some stuff!") # Define an input sequence and process it. @@ -101,7 +102,7 @@ model.fit([source_train, target_train], batch_size=batch_size, callbacks=callbacks_list, epochs=epochs, - validation_split=0.2 + validation_split=0.15 # validation_data=([source_val, target_val], target_val_onehot) ) @@ -185,5 +186,5 @@ for seq_index in tqdm.tqdm(range(len(source_val))): report = classification_report(y_true, y_pred) report_df = report_to_df(report) -report_df.to_csv('logs/classification_report.csv') +report_df.to_csv('logs/classification_report_extended.csv') print(report_df) \ No newline at end of file diff --git a/code_jurica/test.py b/code_jurica/test.py index e105d34..f6918e7 100644 --- a/code_jurica/test.py +++ b/code_jurica/test.py @@ -23,7 +23,7 @@ sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) K.set_session(sess) #REPRODUCIBLE -with open('models/train_test_split.p', 'rb') as handle: +with open('models/train_test_split_extended.p', 'rb') as handle: data_set = pickle.load(handle) source_val = data_set['source_val'] @@ -31,22 +31,22 @@ target_val =data_set['target_val'] labels_val = data_set['labels_val'] # ICD 10 STUFF -icd10_model = keras_load_model('models/icd10Classification_attention.h5', custom_objects={'Attention':Attention}) -with open('models/icd10_tokenizer.p', 'rb') as handle: +icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5', custom_objects={'Attention':Attention}) +with open('models/icd10_tokenizer_extended.p', 'rb') as handle: icd10Tokenizer = pickle.load(handle) -with open('models/icd10_mappings.p', 'rb') as handle: +with open('models/icd10_mappings_extended.p', 'rb') as handle: icd10Encoder = pickle.load(handle) # ICD 10 STUFF # S2S STUFF -S2S_model = keras_load_model('models/s2s.h5', custom_objects={'Attention':Attention}) -with open('models/s2s_source_tokenizer.p', 'rb') as handle: +S2S_model = keras_load_model('models/s2s_extended.h5', custom_objects={'Attention':Attention}) +with open('models/s2s_source_tokenizer_extended.p', 'rb') as handle: s2s_source_tokenizer = pickle.load(handle) source_vocab = s2s_source_tokenizer.word_index source_index_to_word_dict = {v:k.strip() for k,v in s2s_source_tokenizer.word_index.items()} -with open('models/s2s_target_tokenizer.p', 'rb') as handle: +with open('models/s2s_target_tokenizer_extended.p', 'rb') as handle: s2s_target_tokenizer = pickle.load(handle) target_vocab =s2s_target_tokenizer.word_index @@ -151,5 +151,5 @@ for seq_index in range(len(source_val)): report = classification_report(y_true, y_pred) report_df = report_to_df(report) -report_df.to_csv('logs/classification_report_test.csv') +report_df.to_csv('logs/classification_report_test_extended.csv') print(report_df) \ No newline at end of file diff --git a/code_jurica/util.py b/code_jurica/util.py index b96b5d4..509f13d 100644 --- a/code_jurica/util.py +++ b/code_jurica/util.py @@ -250,7 +250,7 @@ class prepareData(): return data, errors - def prepareDictionaries(self): + def prepareDictionaries(self, unbalanced=False): preparedDictionary = [] @@ -280,6 +280,20 @@ 
class prepareData(): if not math.isnan(text): preparedDictionary.append([ text.lower(), label ]) + if unbalanced: + for k, v in TRAINING.items(): + df = pd.read_csv(v['CC'], sep=';', dtype=str, encoding="utf8") + for index, row in df.iterrows(): + + label = str(row['ICD10']).strip().upper()[:4] + text = row['StandardText'] + + if not isinstance(text, float): + preparedDictionary.append([text.lower().strip(), label]) + else: + if not math.isnan(text): + preparedDictionary.append([text.lower().strip(), label]) + pickle.dump(preparedDictionary, open(DICT_PREPROCESED, 'wb')) return preparedDictionary \ No newline at end of file -- GitLab
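
Note (sketch, not part of the diff above): the commit message points to data generators and fit_generator as the way around the RAM blow-up caused by one-hot encoding every label up front. Below is a minimal sketch of that approach, assuming Keras 2.x as used elsewhere in this repo. The class name OneHotBatchGenerator is made up for the sketch; in the usage lines, X_train, encoder, model, and epochs come from classificationICD10_attention.py, while Y_train_encoded / Y_test_encoded stand for the integer-encoded labels (encoded_Y) after the train/test split, i.e. before any to_categorical call.

import numpy as np
from keras.utils import Sequence, np_utils

class OneHotBatchGenerator(Sequence):
    """Yields (padded sequences, one-hot labels) batch by batch."""

    def __init__(self, sequences, encoded_labels, num_classes, batch_size=1000):
        self.sequences = sequences            # padded word sequences (2D array)
        self.encoded_labels = encoded_labels  # integer-encoded ICD-10 labels
        self.num_classes = num_classes
        self.batch_size = batch_size

    def __len__(self):
        # number of batches per epoch
        return int(np.ceil(len(self.sequences) / float(self.batch_size)))

    def __getitem__(self, idx):
        start = idx * self.batch_size
        end = start + self.batch_size
        batch_x = np.asarray(self.sequences[start:end])
        # one-hot encode only this batch, so peak memory scales with
        # batch_size instead of with the whole corpus
        batch_y = np_utils.to_categorical(self.encoded_labels[start:end],
                                          num_classes=self.num_classes)
        return batch_x, batch_y

Usage sketch, replacing the in-memory labels_one_hot matrix and the plain model.fit call:

train_gen = OneHotBatchGenerator(X_train, Y_train_encoded,
                                 num_classes=len(encoder.classes_))
val_gen = OneHotBatchGenerator(X_test, Y_test_encoded,
                               num_classes=len(encoder.classes_))
model.fit_generator(train_gen, epochs=epochs, validation_data=val_gen)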