Commit 535a7a63 authored by Jurica Seva

Started working on a pipeline using all data. Ran into memory (RAM) issues while generating the one-hot label encodings. Solution: use a data generator and train via fit_generator, so labels are one-hot encoded per batch instead of all at once; a minimal sketch follows the links below. Examples:
https://github.com/keras-team/keras/issues/1627
https://www.kaggle.com/ezietsman/simple-keras-model-with-data-generator
https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly.html
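For reference, a minimal sketch of the fix described above (assuming Keras 2.x, as used elsewhere in this commit): a keras.utils.Sequence subclass that one-hot encodes labels one batch at a time, so the full label matrix never has to be held in RAM. Class and variable names are illustrative, not from this repository.

    import numpy as np
    from keras.utils import Sequence, to_categorical

    class OneHotBatchGenerator(Sequence):
        """Yields (padded_sequences, one_hot_labels) batch by batch,
        so the full one-hot label matrix is never materialized in RAM."""

        def __init__(self, sequences, encoded_labels, num_classes, batch_size=1000):
            self.sequences = sequences        # already padded integer sequences
            self.labels = encoded_labels      # integer labels (LabelEncoder output)
            self.num_classes = num_classes
            self.batch_size = batch_size

        def __len__(self):
            # number of batches per epoch
            return int(np.ceil(len(self.sequences) / float(self.batch_size)))

        def __getitem__(self, idx):
            lo = idx * self.batch_size
            hi = lo + self.batch_size
            x = np.asarray(self.sequences[lo:hi])
            # one-hot encode only this batch's labels
            y = to_categorical(self.labels[lo:hi], num_classes=self.num_classes)
            return x, y

    # hypothetical usage with the classifier in this commit:
    # train_gen = OneHotBatchGenerator(X_train, encoded_Y_train, len(encoder.classes_))
    # model.fit_generator(train_gen, epochs=epochs, callbacks=callbacks_list)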
parent 86e59d9c
@@ -31,7 +31,8 @@ import os
 os.environ['PYTHONHASHSEED'] = '0'
 import tensorflow as tf
-config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+# config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+config = tf.ConfigProto()
 from keras import backend as K
 tf.set_random_seed(1234)
 #REPRODUCIBLE
@@ -54,24 +55,24 @@ callbacks_list=[
         min_delta=0.005
     ),
     ModelCheckpoint(
-        filepath='models/icd10Classification_attention.h5',
+        filepath='models/icd10Classification_attention_extended.h5',
         monitor='val_loss',
         save_best_only=True,
     ),
     CSVLogger(
         append=True,
-        filename='logs/icd10Classification_attention_{}.csv'.format(date_label),
+        filename='logs/icd10Classification_attention_extended_{}.csv'.format(date_label),
     )
 ]
 latent_dim = 512
-epochs = 100
+epochs = 2
 batch_size = 1000
 tokenizer=TokenizePreprocessor()
 kerasTokenizer = Tokenizer()
 dataLoader=prepareData()
-corpora=dataLoader.prepareDictionaries()
+corpora=dataLoader.prepareDictionaries(unbalanced=True)
 tmp =[x[1] for x in corpora]
 labels_c = Counter(tmp)
@@ -97,7 +98,7 @@ vocabulary={item.strip():i+1 for i,item in enumerate(tmp)}
 index_to_word_dict={i+1:item.strip() for i,item in enumerate(tmp)}
 kerasTokenizer.word_index=vocabulary
 # saving
-with open('models/icd10_tokenizer.p', 'wb') as handle:
+with open('models/icd10_tokenizer_extended.p', 'wb') as handle:
     pickle.dump(kerasTokenizer, handle)
 source_word_sequence=kerasTokenizer.texts_to_sequences(corpus)
@@ -116,7 +117,7 @@ embedding_layer = Embedding(
 #preparing the labels as one hot encoding vector
 encoder = LabelEncoder()
 encoder.fit(labels)
-with open('models/icd10_mappings.p', 'wb') as handle:
+with open('models/icd10_mappings_extended.p', 'wb') as handle:
     pickle.dump(encoder, handle)
 encoded_Y = encoder.transform(labels)
@@ -124,7 +125,7 @@ encoded_Y = encoder.transform(labels)
 # convert integers to dummy variables (i.e. one hot encoded)
 labels_one_hot = np_utils.to_categorical(encoded_Y)
-X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.15, random_state=777, stratify=labels)
+X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.1, random_state=777, stratify=labels)
 print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test))
 try:
@@ -37,6 +37,12 @@ IT_sample=random.sample(itCorpora, min_elements)
 HU_sample=random.sample(huCorpora, min_elements)
 corpora=FR_sample+HU_sample+IT_sample
+# corpora=frCorpora+itCorpora+huCorpora
+# print(len(corpora))
+# corpora=corpora[:int(len(corpora)*0.5)]
+# print(len(corpora))
+# input('bla')
 #labels - icd10 codes
 labels=[str(x[2]).strip() for x in corpora]
@@ -48,7 +54,7 @@ tmp =[item for item in list(set(flatten(source_tokens))) if item.strip()]
 source_vocab = {item.strip():i+1 for i,item in enumerate(tmp)}
 source_index_to_word_dict = {i+1:item.strip() for i,item in enumerate(tmp)}
 kerasTokenizer.word_index=source_vocab
-with open('models/s2s_source_tokenizer.p', 'wb') as handle:
+with open('models/s2s_source_tokenizer_extended.p', 'wb') as handle:
     pickle.dump(kerasTokenizer, handle)
 source_word_sequence=kerasTokenizer.texts_to_sequences(source_corpus)
@@ -62,7 +68,7 @@ tmp=[item for item in list(set(flatten(target_tokens))) if item.strip()]
 target_vocab = {item.strip():i+1 for i,item in enumerate(tmp)}
 target_index_to_word_dict = {i+1:item.strip() for i,item in enumerate(tmp)}
 kerasTokenizer.word_index=target_vocab
-with open('models/s2s_target_tokenizer.p', 'wb') as handle:
+with open('models/s2s_target_tokenizer_extended.p', 'wb') as handle:
     pickle.dump(kerasTokenizer, handle)
 target_word_sequence=kerasTokenizer.texts_to_sequences(target_corpus)
@@ -105,7 +111,7 @@ data_set_train_test = {
     'labels_val':labels_val
 }
-with open('models/train_test_split.p', 'wb') as handle:
+with open('models/train_test_split_extended.p', 'wb') as handle:
     pickle.dump(data_set_train_test, handle)
 target_train_onehot = np.zeros((len(target_train), target_max_sequence, len(target_vocab)+1))
@@ -21,7 +21,8 @@ import os
 os.environ['PYTHONHASHSEED'] = '0'
 import tensorflow as tf
-config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+# config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+config = tf.ConfigProto()
 from keras import backend as K
 tf.set_random_seed(1234)
 #REPRODUCIBLE
@@ -40,16 +41,16 @@ K.set_session(sess)
 # LOAD ICD 10 CLASSIFICATION MODEL
 try:
-    icd10_model = keras_load_model('models/icd10Classification_attention.h5',
+    icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5',
                                    custom_objects={'Attention':Attention})
 except OSError:
     from classificationICD10 import *
-    icd10_model = keras_load_model('models/icd10Classification_attention.h5')
+    icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5')
-with open('models/icd10_tokenizer.p', 'rb') as handle:
+with open('models/icd10_tokenizer_extended.p', 'rb') as handle:
     icd10Tokenizer = pickle.load(handle)
-with open('models/icd10_mappings.p', 'rb') as handle:
+with open('models/icd10_mappings_extended.p', 'rb') as handle:
     encoded_Y = pickle.load(handle)
 # LOAD ICD 10 CLASSIFICATION MODEL
@@ -60,19 +61,19 @@ callbacks_list = [
         min_delta=0.005
     ),
     ModelCheckpoint(
-        filepath='models/s2s.h5',
+        filepath='models/s2s_extended.h5',
         monitor='val_loss',
         save_best_only=True,
     ),
     CSVLogger(
         append=False,
-        filename='logs/s2s_{}.csv'.format(date_label)
+        filename='logs/s2s_extended_{}.csv'.format(date_label)
     )
 ]
 latent_dim=256
 batch_size=1000
-epochs=100
+epochs=2
 print("Lets train some stuff!")
 # Define an input sequence and process it.
@@ -101,7 +102,7 @@ model.fit([source_train, target_train],
           batch_size=batch_size,
          callbacks=callbacks_list,
           epochs=epochs,
-          validation_split=0.2
+          validation_split=0.15
          # validation_data=([source_val, target_val], target_val_onehot)
          )
@@ -185,5 +186,5 @@ for seq_index in tqdm.tqdm(range(len(source_val))):
 report = classification_report(y_true, y_pred)
 report_df = report_to_df(report)
-report_df.to_csv('logs/classification_report.csv')
+report_df.to_csv('logs/classification_report_extended.csv')
 print(report_df)
\ No newline at end of file
@@ -23,7 +23,7 @@ sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
 K.set_session(sess)
 #REPRODUCIBLE
-with open('models/train_test_split.p', 'rb') as handle:
+with open('models/train_test_split_extended.p', 'rb') as handle:
     data_set = pickle.load(handle)
 source_val = data_set['source_val']
@@ -31,22 +31,22 @@ target_val =data_set['target_val']
 labels_val = data_set['labels_val']
 # ICD 10 STUFF
-icd10_model = keras_load_model('models/icd10Classification_attention.h5', custom_objects={'Attention':Attention})
-with open('models/icd10_tokenizer.p', 'rb') as handle:
+icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5', custom_objects={'Attention':Attention})
+with open('models/icd10_tokenizer_extended.p', 'rb') as handle:
     icd10Tokenizer = pickle.load(handle)
-with open('models/icd10_mappings.p', 'rb') as handle:
+with open('models/icd10_mappings_extended.p', 'rb') as handle:
     icd10Encoder = pickle.load(handle)
 # ICD 10 STUFF
 # S2S STUFF
-S2S_model = keras_load_model('models/s2s.h5', custom_objects={'Attention':Attention})
-with open('models/s2s_source_tokenizer.p', 'rb') as handle:
+S2S_model = keras_load_model('models/s2s_extended.h5', custom_objects={'Attention':Attention})
+with open('models/s2s_source_tokenizer_extended.p', 'rb') as handle:
     s2s_source_tokenizer = pickle.load(handle)
 source_vocab = s2s_source_tokenizer.word_index
 source_index_to_word_dict = {v:k.strip() for k,v in s2s_source_tokenizer.word_index.items()}
-with open('models/s2s_target_tokenizer.p', 'rb') as handle:
+with open('models/s2s_target_tokenizer_extended.p', 'rb') as handle:
     s2s_target_tokenizer = pickle.load(handle)
 target_vocab =s2s_target_tokenizer.word_index
@@ -151,5 +151,5 @@ for seq_index in range(len(source_val)):
 report = classification_report(y_true, y_pred)
 report_df = report_to_df(report)
-report_df.to_csv('logs/classification_report_test.csv')
+report_df.to_csv('logs/classification_report_test_extended.csv')
 print(report_df)
\ No newline at end of file
@@ -250,7 +250,7 @@ class prepareData():
         return data, errors
-    def prepareDictionaries(self):
+    def prepareDictionaries(self, unbalanced=False):
         preparedDictionary = []
@@ -280,6 +280,20 @@ class prepareData():
             if not math.isnan(text):
                 preparedDictionary.append([ text.lower(), label ])
+        if unbalanced:
+            for k, v in TRAINING.items():
+                df = pd.read_csv(v['CC'], sep=';', dtype=str, encoding="utf8")
+                for index, row in df.iterrows():
+                    label = str(row['ICD10']).strip().upper()[:4]
+                    text = row['StandardText']
+                    if not isinstance(text, float):
+                        preparedDictionary.append([text.lower().strip(), label])
+                    else:
+                        if not math.isnan(text):
+                            preparedDictionary.append([text.lower().strip(), label])
         pickle.dump(preparedDictionary, open(DICT_PREPROCESED, 'wb'))
         return preparedDictionary
\ No newline at end of file