From 535a7a63b846fee6fffeb492fc2fb26a4eccb299 Mon Sep 17 00:00:00 2001
From: Jurica Seva <seva@informatik.hu-berlin.de>
Date: Sat, 5 May 2018 12:21:03 +0200
Subject: [PATCH] Started working on a pipeline using all data. Ran into
 memory (RAM) issues while generating the one-hot label encodings. Solution:
 use data generators and train via fit_generator. Examples:
 https://github.com/keras-team/keras/issues/1627
 https://www.kaggle.com/ezietsman/simple-keras-model-with-data-generator
 https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly.html

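A minimal sketch (not part of this patch) of the generator approach referenced
above, assuming padded word-index sequences and integer-encoded labels as
produced in classificationICD10_attention.py; the class name and variables are
illustrative only:

    import numpy as np
    from keras.utils import Sequence, to_categorical

    class OneHotBatchGenerator(Sequence):
        """Yields (batch_x, batch_y) pairs, one-hot encoding the labels per
        batch so the full labels_one_hot matrix never has to fit in RAM."""

        def __init__(self, sequences, int_labels, num_classes, batch_size=1000):
            self.sequences = sequences      # padded word-index sequences
            self.int_labels = int_labels    # integer-encoded ICD-10 labels
            self.num_classes = num_classes
            self.batch_size = batch_size

        def __len__(self):
            # number of batches per epoch
            return int(np.ceil(len(self.sequences) / float(self.batch_size)))

        def __getitem__(self, idx):
            start = idx * self.batch_size
            end = start + self.batch_size
            batch_x = np.asarray(self.sequences[start:end])
            # one-hot encode only this batch's labels
            batch_y = to_categorical(self.int_labels[start:end],
                                     num_classes=self.num_classes)
            return batch_x, batch_y

    # usage sketch (hypothetical variable names):
    # train_gen = OneHotBatchGenerator(X_train, y_train_int, num_classes, batch_size)
    # model.fit_generator(train_gen, epochs=epochs, callbacks=callbacks_list)
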
---
 code_jurica/classificationICD10_attention.py | 17 ++++++++--------
 code_jurica/loader.py                        | 12 ++++++++---
 code_jurica/seq2seq.py                       | 21 ++++++++++----------
 code_jurica/test.py                          | 16 +++++++--------
 code_jurica/util.py                          | 16 ++++++++++++++-
 5 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/code_jurica/classificationICD10_attention.py b/code_jurica/classificationICD10_attention.py
index a82fac8..a99624f 100644
--- a/code_jurica/classificationICD10_attention.py
+++ b/code_jurica/classificationICD10_attention.py
@@ -31,7 +31,8 @@ import os
 os.environ['PYTHONHASHSEED'] = '0'
 
 import tensorflow as tf
-config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+# config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+config = tf.ConfigProto()
 from keras import backend as K
 tf.set_random_seed(1234)
 #REPRODUCIBLE
@@ -54,24 +55,24 @@ callbacks_list=[
         min_delta=0.005
     ),
     ModelCheckpoint(
-        filepath='models/icd10Classification_attention.h5',
+        filepath='models/icd10Classification_attention_extended.h5',
         monitor='val_loss',
         save_best_only=True,
     ),
     CSVLogger(
         append=True,
-        filename='logs/icd10Classification_attention_{}.csv'.format(date_label),
+        filename='logs/icd10Classification_attention_extended_{}.csv'.format(date_label),
     )
 ]
 
 latent_dim = 512
-epochs = 100
+epochs = 2
 batch_size = 1000
 
 tokenizer=TokenizePreprocessor()
 kerasTokenizer = Tokenizer()
 dataLoader=prepareData()
-corpora=dataLoader.prepareDictionaries()
+corpora=dataLoader.prepareDictionaries(unbalanced=True)
 
 tmp =[x[1] for x in corpora]
 labels_c = Counter(tmp)
@@ -97,7 +98,7 @@ vocabulary={item.strip():i+1 for i,item in enumerate(tmp)}
 index_to_word_dict={i+1:item.strip() for i,item in enumerate(tmp)}
 kerasTokenizer.word_index=vocabulary
 # saving
-with open('models/icd10_tokenizer.p', 'wb') as handle:
+with open('models/icd10_tokenizer_extended.p', 'wb') as handle:
     pickle.dump(kerasTokenizer, handle)
 
 source_word_sequence=kerasTokenizer.texts_to_sequences(corpus)
@@ -116,7 +117,7 @@ embedding_layer = Embedding(
 #preparing the labels as one hot encoding vector
 encoder = LabelEncoder()
 encoder.fit(labels)
-with open('models/icd10_mappings.p', 'wb') as handle:
+with open('models/icd10_mappings_extended.p', 'wb') as handle:
     pickle.dump(encoder, handle)
 
 encoded_Y = encoder.transform(labels)
@@ -124,7 +125,7 @@ encoded_Y = encoder.transform(labels)
 # convert integers to dummy variables (i.e. one hot encoded)
 labels_one_hot = np_utils.to_categorical(encoded_Y)
 
-X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.15, random_state=777, stratify=labels)
+X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.1, random_state=777, stratify=labels)
 print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test))
 
 try:
diff --git a/code_jurica/loader.py b/code_jurica/loader.py
index eea1eaa..85db1d5 100644
--- a/code_jurica/loader.py
+++ b/code_jurica/loader.py
@@ -37,6 +37,12 @@ IT_sample=random.sample(itCorpora, min_elements)
 HU_sample=random.sample(huCorpora, min_elements)
 corpora=FR_sample+HU_sample+IT_sample
 
+# corpora=frCorpora+itCorpora+huCorpora
+# print(len(corpora))
+# corpora=corpora[:int(len(corpora)*0.5)]
+# print(len(corpora))
+# input('bla')
+
 #labels - icd10 codes
 labels=[str(x[2]).strip() for x in corpora]
 
@@ -48,7 +54,7 @@ tmp =[item for item in list(set(flatten(source_tokens))) if item.strip()]
 source_vocab = {item.strip():i+1 for i,item in enumerate(tmp)}
 source_index_to_word_dict = {i+1:item.strip() for i,item in enumerate(tmp)}
 kerasTokenizer.word_index=source_vocab
-with open('models/s2s_source_tokenizer.p', 'wb') as handle:
+with open('models/s2s_source_tokenizer_extended.p', 'wb') as handle:
     pickle.dump(kerasTokenizer, handle)
 
 source_word_sequence=kerasTokenizer.texts_to_sequences(source_corpus)
@@ -62,7 +68,7 @@ tmp=[item for item in list(set(flatten(target_tokens))) if item.strip()]
 target_vocab = {item.strip():i+1 for i,item in enumerate(tmp)}
 target_index_to_word_dict = {i+1:item.strip() for i,item in enumerate(tmp)}
 kerasTokenizer.word_index=target_vocab
-with open('models/s2s_target_tokenizer.p', 'wb') as handle:
+with open('models/s2s_target_tokenizer_extended.p', 'wb') as handle:
     pickle.dump(kerasTokenizer, handle)
 
 target_word_sequence=kerasTokenizer.texts_to_sequences(target_corpus)
@@ -105,7 +111,7 @@ data_set_train_test = {
     'labels_val':labels_val
 }
 
-with open('models/train_test_split.p', 'wb') as handle:
+with open('models/train_test_split_extended.p', 'wb') as handle:
     pickle.dump(data_set_train_test, handle)
 
 target_train_onehot = np.zeros((len(target_train), target_max_sequence, len(target_vocab)+1))
diff --git a/code_jurica/seq2seq.py b/code_jurica/seq2seq.py
index fd22ce9..5ff7310 100644
--- a/code_jurica/seq2seq.py
+++ b/code_jurica/seq2seq.py
@@ -21,7 +21,8 @@ import os
 os.environ['PYTHONHASHSEED'] = '0'
 
 import tensorflow as tf
-config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+# config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+config = tf.ConfigProto()
 from keras import backend as K
 tf.set_random_seed(1234)
 #REPRODUCIBLE
@@ -40,16 +41,16 @@ K.set_session(sess)
 
 # LOAD ICD 10 CLASSIFICATION MODEL
 try:
-    icd10_model = keras_load_model('models/icd10Classification_attention.h5',
+    icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5',
               custom_objects={'Attention':Attention})
 except OSError:
     from classificationICD10 import *
-    icd10_model = keras_load_model('models/icd10Classification_attention.h5')
+    icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5')
 
-with open('models/icd10_tokenizer.p', 'rb') as handle:
+with open('models/icd10_tokenizer_extended.p', 'rb') as handle:
     icd10Tokenizer = pickle.load(handle)
 
-with open('models/icd10_mappings.p', 'rb') as handle:
+with open('models/icd10_mappings_extended.p', 'rb') as handle:
     encoded_Y = pickle.load(handle)
 # LOAD ICD 10 CLASSIFICATION MODEL
 
@@ -60,19 +61,19 @@ callbacks_list = [
         min_delta=0.005
     ),
     ModelCheckpoint(
-        filepath='models/s2s.h5',
+        filepath='models/s2s_extended.h5',
         monitor='val_loss',
         save_best_only=True,
     ),
     CSVLogger(
         append=False,
-        filename='logs/s2s_{}.csv'.format(date_label)
+        filename='logs/s2s_extended_{}.csv'.format(date_label)
     )
 ]
 
 latent_dim=256
 batch_size=1000
-epochs=100
+epochs=2
 
 print("Lets train some stuff!")
 # Define an input sequence and process it.
@@ -101,7 +102,7 @@ model.fit([source_train, target_train],
             batch_size=batch_size,
             callbacks=callbacks_list,
             epochs=epochs,
-            validation_split=0.2
+            validation_split=0.15
             # validation_data=([source_val, target_val], target_val_onehot)
       )
 
@@ -185,5 +186,5 @@ for seq_index in tqdm.tqdm(range(len(source_val))):
 
 report = classification_report(y_true, y_pred)
 report_df = report_to_df(report)
-report_df.to_csv('logs/classification_report.csv')
+report_df.to_csv('logs/classification_report_extended.csv')
 print(report_df)
\ No newline at end of file
diff --git a/code_jurica/test.py b/code_jurica/test.py
index e105d34..f6918e7 100644
--- a/code_jurica/test.py
+++ b/code_jurica/test.py
@@ -23,7 +23,7 @@ sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
 K.set_session(sess)
 #REPRODUCIBLE
 
-with open('models/train_test_split.p', 'rb') as handle:
+with open('models/train_test_split_extended.p', 'rb') as handle:
     data_set = pickle.load(handle)
 
 source_val = data_set['source_val']
@@ -31,22 +31,22 @@ target_val =data_set['target_val']
 labels_val = data_set['labels_val']
 
 # ICD 10 STUFF
-icd10_model = keras_load_model('models/icd10Classification_attention.h5', custom_objects={'Attention':Attention})
-with open('models/icd10_tokenizer.p', 'rb') as handle:
+icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5', custom_objects={'Attention':Attention})
+with open('models/icd10_tokenizer_extended.p', 'rb') as handle:
     icd10Tokenizer = pickle.load(handle)
 
-with open('models/icd10_mappings.p', 'rb') as handle:
+with open('models/icd10_mappings_extended.p', 'rb') as handle:
     icd10Encoder = pickle.load(handle)
 # ICD 10 STUFF
 
 # S2S STUFF
-S2S_model = keras_load_model('models/s2s.h5', custom_objects={'Attention':Attention})
-with open('models/s2s_source_tokenizer.p', 'rb') as handle:
+S2S_model = keras_load_model('models/s2s_extended.h5', custom_objects={'Attention':Attention})
+with open('models/s2s_source_tokenizer_extended.p', 'rb') as handle:
     s2s_source_tokenizer = pickle.load(handle)
 source_vocab = s2s_source_tokenizer.word_index
 source_index_to_word_dict = {v:k.strip() for k,v in s2s_source_tokenizer.word_index.items()}
 
-with open('models/s2s_target_tokenizer.p', 'rb') as handle:
+with open('models/s2s_target_tokenizer_extended.p', 'rb') as handle:
     s2s_target_tokenizer = pickle.load(handle)
 
 target_vocab =s2s_target_tokenizer.word_index
@@ -151,5 +151,5 @@ for seq_index in range(len(source_val)):
 
 report = classification_report(y_true, y_pred)
 report_df = report_to_df(report)
-report_df.to_csv('logs/classification_report_test.csv')
+report_df.to_csv('logs/classification_report_test_extended.csv')
 print(report_df)
\ No newline at end of file
diff --git a/code_jurica/util.py b/code_jurica/util.py
index b96b5d4..509f13d 100644
--- a/code_jurica/util.py
+++ b/code_jurica/util.py
@@ -250,7 +250,7 @@ class prepareData():
         return data, errors
 
 
-    def prepareDictionaries(self):
+    def prepareDictionaries(self, unbalanced=False):
 
         preparedDictionary = []
 
@@ -280,6 +280,20 @@ class prepareData():
                         if not math.isnan(text):
                             preparedDictionary.append([ text.lower(), label ])
 
+            if unbalanced:
+                for k, v in TRAINING.items():
+                    df = pd.read_csv(v['CC'], sep=';', dtype=str, encoding="utf8")
+                    for index, row in df.iterrows():
+
+                        label = str(row['ICD10']).strip().upper()[:4]
+                        text = row['StandardText']
+
+                        if not isinstance(text, float):
+                            preparedDictionary.append([text.lower().strip(), label])
+                        else:
+                            if not math.isnan(text):
+                                preparedDictionary.append([text.lower().strip(), label])
+
             pickle.dump(preparedDictionary, open(DICT_PREPROCESED, 'wb'))
 
         return preparedDictionary
\ No newline at end of file
-- 
GitLab