Commit 535a7a63 authored by Jurica Seva

Started working on a pipeline using all data. Ran into memory (RAM) issues while generating the one-hot label encodings. Solution: use a data generator and train via fit_generator, so labels are one-hot encoded per batch instead of all at once; a minimal sketch follows the links below. Examples:
https://github.com/keras-team/keras/issues/1627
https://www.kaggle.com/ezietsman/simple-keras-model-with-data-generator
https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly.html
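For reference, a minimal sketch of the fix described above (assuming Keras 2.x, as used elsewhere in this commit): a keras.utils.Sequence subclass that one-hot encodes labels one batch at a time, so the full label matrix never has to be held in RAM. Class and variable names are illustrative, not from this repository.

    import numpy as np
    from keras.utils import Sequence, to_categorical

    class OneHotBatchGenerator(Sequence):
        """Yields (padded_sequences, one_hot_labels) batch by batch,
        so the full one-hot label matrix is never materialized in RAM."""

        def __init__(self, sequences, encoded_labels, num_classes, batch_size=1000):
            self.sequences = sequences        # already padded integer sequences
            self.labels = encoded_labels      # integer labels (LabelEncoder output)
            self.num_classes = num_classes
            self.batch_size = batch_size

        def __len__(self):
            # number of batches per epoch
            return int(np.ceil(len(self.sequences) / float(self.batch_size)))

        def __getitem__(self, idx):
            lo = idx * self.batch_size
            hi = lo + self.batch_size
            x = np.asarray(self.sequences[lo:hi])
            # one-hot encode only this batch's labels
            y = to_categorical(self.labels[lo:hi], num_classes=self.num_classes)
            return x, y

    # hypothetical usage with the classifier in this commit:
    # train_gen = OneHotBatchGenerator(X_train, encoded_Y_train, len(encoder.classes_))
    # model.fit_generator(train_gen, epochs=epochs, callbacks=callbacks_list)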
parent 86e59d9c
@@ -31,7 +31,8 @@ import os
 os.environ['PYTHONHASHSEED'] = '0'
 import tensorflow as tf
-config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+# config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+config = tf.ConfigProto()
 from keras import backend as K
 tf.set_random_seed(1234)
 #REPRODUCIBLE
@@ -54,24 +55,24 @@ callbacks_list=[
         min_delta=0.005
     ),
     ModelCheckpoint(
-        filepath='models/icd10Classification_attention.h5',
+        filepath='models/icd10Classification_attention_extended.h5',
         monitor='val_loss',
         save_best_only=True,
     ),
     CSVLogger(
         append=True,
-        filename='logs/icd10Classification_attention_{}.csv'.format(date_label),
+        filename='logs/icd10Classification_attention_extended_{}.csv'.format(date_label),
     )
 ]
 latent_dim = 512
-epochs = 100
+epochs = 2
 batch_size = 1000
 tokenizer=TokenizePreprocessor()
 kerasTokenizer = Tokenizer()
 dataLoader=prepareData()
-corpora=dataLoader.prepareDictionaries()
+corpora=dataLoader.prepareDictionaries(unbalanced=True)
 tmp =[x[1] for x in corpora]
 labels_c = Counter(tmp)
@@ -97,7 +98,7 @@ vocabulary={item.strip():i+1 for i,item in enumerate(tmp)}
 index_to_word_dict={i+1:item.strip() for i,item in enumerate(tmp)}
 kerasTokenizer.word_index=vocabulary
 # saving
-with open('models/icd10_tokenizer.p', 'wb') as handle:
+with open('models/icd10_tokenizer_extended.p', 'wb') as handle:
     pickle.dump(kerasTokenizer, handle)
 source_word_sequence=kerasTokenizer.texts_to_sequences(corpus)
@@ -116,7 +117,7 @@ embedding_layer = Embedding(
 #preparing the labels as one hot encoding vector
 encoder = LabelEncoder()
 encoder.fit(labels)
-with open('models/icd10_mappings.p', 'wb') as handle:
+with open('models/icd10_mappings_extended.p', 'wb') as handle:
     pickle.dump(encoder, handle)
 encoded_Y = encoder.transform(labels)
@@ -124,7 +125,7 @@ encoded_Y = encoder.transform(labels)
 # convert integers to dummy variables (i.e. one hot encoded)
 labels_one_hot = np_utils.to_categorical(encoded_Y)
-X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.15, random_state=777, stratify=labels)
+X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.1, random_state=777, stratify=labels)
 print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test))
 try:
@@ -37,6 +37,12 @@ IT_sample=random.sample(itCorpora, min_elements)
 HU_sample=random.sample(huCorpora, min_elements)
 corpora=FR_sample+HU_sample+IT_sample
+# corpora=frCorpora+itCorpora+huCorpora
+# print(len(corpora))
+# corpora=corpora[:int(len(corpora)*0.5)]
+# print(len(corpora))
+# input('bla')
 #labels - icd10 codes
 labels=[str(x[2]).strip() for x in corpora]
@@ -48,7 +54,7 @@ tmp =[item for item in list(set(flatten(source_tokens))) if item.strip()]
 source_vocab = {item.strip():i+1 for i,item in enumerate(tmp)}
 source_index_to_word_dict = {i+1:item.strip() for i,item in enumerate(tmp)}
 kerasTokenizer.word_index=source_vocab
-with open('models/s2s_source_tokenizer.p', 'wb') as handle:
+with open('models/s2s_source_tokenizer_extended.p', 'wb') as handle:
     pickle.dump(kerasTokenizer, handle)
 source_word_sequence=kerasTokenizer.texts_to_sequences(source_corpus)
@@ -62,7 +68,7 @@ tmp=[item for item in list(set(flatten(target_tokens))) if item.strip()]
 target_vocab = {item.strip():i+1 for i,item in enumerate(tmp)}
 target_index_to_word_dict = {i+1:item.strip() for i,item in enumerate(tmp)}
 kerasTokenizer.word_index=target_vocab
-with open('models/s2s_target_tokenizer.p', 'wb') as handle:
+with open('models/s2s_target_tokenizer_extended.p', 'wb') as handle:
     pickle.dump(kerasTokenizer, handle)
 target_word_sequence=kerasTokenizer.texts_to_sequences(target_corpus)
@@ -105,7 +111,7 @@ data_set_train_test = {
     'labels_val':labels_val
 }
-with open('models/train_test_split.p', 'wb') as handle:
+with open('models/train_test_split_extended.p', 'wb') as handle:
     pickle.dump(data_set_train_test, handle)
 target_train_onehot = np.zeros((len(target_train), target_max_sequence, len(target_vocab)+1))
@@ -21,7 +21,8 @@ import os
 os.environ['PYTHONHASHSEED'] = '0'
 import tensorflow as tf
-config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+# config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+config = tf.ConfigProto()
 from keras import backend as K
 tf.set_random_seed(1234)
 #REPRODUCIBLE
@@ -40,16 +41,16 @@ K.set_session(sess)
 # LOAD ICD 10 CLASSIFICATION MODEL
 try:
-    icd10_model = keras_load_model('models/icd10Classification_attention.h5',
+    icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5',
                                    custom_objects={'Attention':Attention})
 except OSError:
     from classificationICD10 import *
-    icd10_model = keras_load_model('models/icd10Classification_attention.h5')
+    icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5')
-with open('models/icd10_tokenizer.p', 'rb') as handle:
+with open('models/icd10_tokenizer_extended.p', 'rb') as handle:
     icd10Tokenizer = pickle.load(handle)
-with open('models/icd10_mappings.p', 'rb') as handle:
+with open('models/icd10_mappings_extended.p', 'rb') as handle:
     encoded_Y = pickle.load(handle)
 # LOAD ICD 10 CLASSIFICATION MODEL
@@ -60,19 +61,19 @@ callbacks_list = [
         min_delta=0.005
     ),
     ModelCheckpoint(
-        filepath='models/s2s.h5',
+        filepath='models/s2s_extended.h5',
         monitor='val_loss',
         save_best_only=True,
     ),
     CSVLogger(
         append=False,
-        filename='logs/s2s_{}.csv'.format(date_label)
+        filename='logs/s2s_extended_{}.csv'.format(date_label)
     )
 ]
 latent_dim=256
 batch_size=1000
-epochs=100
+epochs=2
 print("Lets train some stuff!")
 # Define an input sequence and process it.
@@ -101,7 +102,7 @@ model.fit([source_train, target_train],
           batch_size=batch_size,
          callbacks=callbacks_list,
           epochs=epochs,
-          validation_split=0.2
+          validation_split=0.15
          # validation_data=([source_val, target_val], target_val_onehot)
          )
@@ -185,5 +186,5 @@ for seq_index in tqdm.tqdm(range(len(source_val))):
 report = classification_report(y_true, y_pred)
 report_df = report_to_df(report)
-report_df.to_csv('logs/classification_report.csv')
+report_df.to_csv('logs/classification_report_extended.csv')
 print(report_df)
\ No newline at end of file
@@ -23,7 +23,7 @@ sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
 K.set_session(sess)
 #REPRODUCIBLE
-with open('models/train_test_split.p', 'rb') as handle:
+with open('models/train_test_split_extended.p', 'rb') as handle:
     data_set = pickle.load(handle)
 source_val = data_set['source_val']
@@ -31,22 +31,22 @@ target_val =data_set['target_val']
 labels_val = data_set['labels_val']
 # ICD 10 STUFF
-icd10_model = keras_load_model('models/icd10Classification_attention.h5', custom_objects={'Attention':Attention})
-with open('models/icd10_tokenizer.p', 'rb') as handle:
+icd10_model = keras_load_model('models/icd10Classification_attention_extended.h5', custom_objects={'Attention':Attention})
+with open('models/icd10_tokenizer_extended.p', 'rb') as handle:
     icd10Tokenizer = pickle.load(handle)
-with open('models/icd10_mappings.p', 'rb') as handle:
+with open('models/icd10_mappings_extended.p', 'rb') as handle:
     icd10Encoder = pickle.load(handle)
 # ICD 10 STUFF
 # S2S STUFF
-S2S_model = keras_load_model('models/s2s.h5', custom_objects={'Attention':Attention})
-with open('models/s2s_source_tokenizer.p', 'rb') as handle:
+S2S_model = keras_load_model('models/s2s_extended.h5', custom_objects={'Attention':Attention})
+with open('models/s2s_source_tokenizer_extended.p', 'rb') as handle:
     s2s_source_tokenizer = pickle.load(handle)
 source_vocab = s2s_source_tokenizer.word_index
 source_index_to_word_dict = {v:k.strip() for k,v in s2s_source_tokenizer.word_index.items()}
-with open('models/s2s_target_tokenizer.p', 'rb') as handle:
+with open('models/s2s_target_tokenizer_extended.p', 'rb') as handle:
     s2s_target_tokenizer = pickle.load(handle)
 target_vocab =s2s_target_tokenizer.word_index
@@ -151,5 +151,5 @@ for seq_index in range(len(source_val)):
 report = classification_report(y_true, y_pred)
 report_df = report_to_df(report)
-report_df.to_csv('logs/classification_report_test.csv')
+report_df.to_csv('logs/classification_report_test_extended.csv')
 print(report_df)
\ No newline at end of file
@@ -250,7 +250,7 @@ class prepareData():
         return data, errors
-    def prepareDictionaries(self):
+    def prepareDictionaries(self, unbalanced=False):
         preparedDictionary = []
@@ -280,6 +280,20 @@ class prepareData():
             if not math.isnan(text):
                 preparedDictionary.append([ text.lower(), label ])
+        if unbalanced:
+            for k, v in TRAINING.items():
+                df = pd.read_csv(v['CC'], sep=';', dtype=str, encoding="utf8")
+                for index, row in df.iterrows():
+                    label = str(row['ICD10']).strip().upper()[:4]
+                    text = row['StandardText']
+                    if not isinstance(text, float):
+                        preparedDictionary.append([text.lower().strip(), label])
+                    else:
+                        if not math.isnan(text):
+                            preparedDictionary.append([text.lower().strip(), label])
         pickle.dump(preparedDictionary, open(DICT_PREPROCESED, 'wb'))
         return preparedDictionary
\ No newline at end of file