From 46df106c8e609f54aa28a91f2ad768471da42398 Mon Sep 17 00:00:00 2001 From: Jurica Seva <seva@informatik.hu-berlin.de> Date: Mon, 7 May 2018 17:16:18 +0200 Subject: [PATCH] fit_generator implemented. Slow as hell. --- code_jurica/classificationICD10_attention.py | 2 +- code_jurica/loader.py | 110 +++++++++++-------- code_jurica/seq2seq.py | 58 +++++++--- code_jurica/test.py | 2 +- code_jurica/util.py | 64 ++++++++++- 5 files changed, 170 insertions(+), 66 deletions(-) diff --git a/code_jurica/classificationICD10_attention.py b/code_jurica/classificationICD10_attention.py index a99624f..9593598 100644 --- a/code_jurica/classificationICD10_attention.py +++ b/code_jurica/classificationICD10_attention.py @@ -66,7 +66,7 @@ callbacks_list=[ ] latent_dim = 512 -epochs = 2 +epochs = 100 batch_size = 1000 tokenizer=TokenizePreprocessor() diff --git a/code_jurica/loader.py b/code_jurica/loader.py index 85db1d5..3d06d55 100644 --- a/code_jurica/loader.py +++ b/code_jurica/loader.py @@ -15,7 +15,8 @@ import os os.environ['PYTHONHASHSEED'] = '0' #REPRODUCIBLE -kerasTokenizer = Tokenizer() +source_kerasTokenizer = Tokenizer() +target_kerasTokenizer = Tokenizer() tokenizer = TokenizePreprocessor() prepareData = prepareData() SEED = 777 @@ -31,13 +32,13 @@ try: except Exception as e: print(e) -min_elements=min(len(frCorpora),len(itCorpora),len(huCorpora)) -FR_sample=random.sample(frCorpora, min_elements) -IT_sample=random.sample(itCorpora, min_elements) -HU_sample=random.sample(huCorpora, min_elements) -corpora=FR_sample+HU_sample+IT_sample +# min_elements=min(len(frCorpora),len(itCorpora),len(huCorpora)) +# FR_sample=random.sample(frCorpora, min_elements) +# IT_sample=random.sample(itCorpora, min_elements) +# HU_sample=random.sample(huCorpora, min_elements) +# corpora=FR_sample+HU_sample+IT_sample -# corpora=frCorpora+itCorpora+huCorpora +corpora=frCorpora+itCorpora+huCorpora # print(len(corpora)) # corpora=corpora[:int(len(corpora)*0.5)] # print(len(corpora)) @@ -50,30 +51,35 @@ labels=[str(x[2]).strip() for x in corpora] # source_corpus=['sos '+x[0]+' eos' for x in corpora] source_corpus=[x[0] for x in corpora] source_tokens = tokenizer.transform([x for x in source_corpus]) +source_max_sequence_tokenizer = max([len(x) for x in source_tokens]) tmp =[item for item in list(set(flatten(source_tokens))) if item.strip()] source_vocab = {item.strip():i+1 for i,item in enumerate(tmp)} source_index_to_word_dict = {i+1:item.strip() for i,item in enumerate(tmp)} -kerasTokenizer.word_index=source_vocab +source_kerasTokenizer.word_index=source_vocab with open('models/s2s_source_tokenizer_extended.p', 'wb') as handle: - pickle.dump(kerasTokenizer, handle) + pickle.dump(source_kerasTokenizer, handle) -source_word_sequence=kerasTokenizer.texts_to_sequences(source_corpus) -source_max_sequence = max([len(x) for x in source_word_sequence]) -source_word_sequence = pad_sequences(source_word_sequence, maxlen=source_max_sequence, padding='post') +# source_word_sequence=kerasTokenizer.texts_to_sequences(source_corpus) +# source_max_sequence = max([len(x) for x in source_word_sequence]) +# source_word_sequence = pad_sequences(source_word_sequence, maxlen=source_max_sequence, padding='post') +# print('See source lengths: {} {}'.format(source_max_sequence_tokenizer, source_max_sequence)) #TARGET TOKENS target_corpus=['sos '+x[1]+' eos' for x in corpora] target_tokens = tokenizer.transform([x for x in target_corpus]) +target_max_sequence_tokenizer = max([len(x) for x in target_tokens]) tmp=[item for item in 
list(set(flatten(target_tokens))) if item.strip()] target_vocab = {item.strip():i+1 for i,item in enumerate(tmp)} target_index_to_word_dict = {i+1:item.strip() for i,item in enumerate(tmp)} -kerasTokenizer.word_index=target_vocab +target_kerasTokenizer.word_index=target_vocab with open('models/s2s_target_tokenizer_extended.p', 'wb') as handle: - pickle.dump(kerasTokenizer, handle) + pickle.dump(target_kerasTokenizer, handle) -target_word_sequence=kerasTokenizer.texts_to_sequences(target_corpus) -target_max_sequence = max([len(x) for x in target_word_sequence]) -target_word_sequence = pad_sequences(target_word_sequence, maxlen=target_max_sequence, padding='post') +# target_word_sequence=kerasTokenizer.texts_to_sequences(target_corpus) +# target_max_sequence = max([len(x) for x in target_word_sequence]) +# target_word_sequence = pad_sequences(target_word_sequence, maxlen=target_max_sequence, padding='post') +# print('See source lengths: {} {}'.format(target_max_sequence_tokenizer, target_max_sequence)) +# input("rawwwww") # print("Source vocabulary: {}".format(len(source_vocab))) # print(source_word_sequence.shape) @@ -86,7 +92,7 @@ source_embeddings=embedding_matrix(source_vocab) source_embedding_layer = Embedding(source_embeddings.shape[0], source_embeddings.shape[1], weights=[source_embeddings], - input_length=source_max_sequence, + input_length=source_max_sequence_tokenizer, trainable=True, mask_zero=True) @@ -94,37 +100,45 @@ target_embeddings=embedding_matrix(target_vocab) target_embedding_layer = Embedding(target_embeddings.shape[0], target_embeddings.shape[1], weights=[target_embeddings], - input_length=target_max_sequence, + input_length=target_max_sequence_tokenizer, trainable=True, mask_zero=True) #generate train/test split -source_train, source_val, _, _ = train_test_split(source_word_sequence, labels, test_size=0.05, random_state=777) -target_train, target_val, labels_train, labels_val = train_test_split(target_word_sequence, labels, test_size=0.05, random_state=777) - -data_set_train_test = { - 'source_train':source_train, - 'source_val':source_val, - 'target_train':target_train, - 'target_val':target_val, - 'labels_train':labels_train, - 'labels_val':labels_val -} - -with open('models/train_test_split_extended.p', 'wb') as handle: - pickle.dump(data_set_train_test, handle) - -target_train_onehot = np.zeros((len(target_train), target_max_sequence, len(target_vocab)+1)) -for seq_id, sequence in enumerate(target_train): - for item_id, item in enumerate(sequence): - if item_id > 0: - target_train_onehot[seq_id][item_id-1][int(item)]=1 - -target_val_onehot = np.zeros((len(target_val), target_max_sequence, len(target_vocab)+1)) -for seq_id, sequence in enumerate(target_val): - for item_id, item in enumerate(sequence): - if item_id > 0: - target_val_onehot[seq_id][item_id-1][int(item)]=1 - -print(target_train_onehot.shape, target_val_onehot.shape) -print("Prepared lables test and validation data set") \ No newline at end of file +source_train, source_val, _, _ = train_test_split(source_corpus, labels, test_size=0.05, random_state=777) +target_train, target_val, labels_train, labels_val = train_test_split(target_corpus, labels, test_size=0.05, random_state=777) + +# data_set_train_test = { +# 'source_train':source_train, +# 'source_val':source_val, +# 'target_train':target_train, +# 'target_val':target_val, +# 'labels_train':labels_train, +# 'labels_val':labels_val +# } +# +# with open('models/train_test_split_extended.p', 'wb') as handle: +# pickle.dump(data_set_train_test, handle) 
+# +# target_train_onehot = np.zeros((len(target_train), target_max_sequence, len(target_vocab)+1)) +# for seq_id, sequence in enumerate(target_train): +# for item_id, item in enumerate(sequence): +# if item_id > 0: +# target_train_onehot[seq_id][item_id-1][int(item)]=1 +# +# target_val_onehot = np.zeros((len(target_val), target_max_sequence, len(target_vocab)+1)) +# for seq_id, sequence in enumerate(target_val): +# for item_id, item in enumerate(sequence): +# if item_id > 0: +# target_val_onehot[seq_id][item_id-1][int(item)]=1 +# +# print(target_train_onehot.shape, target_val_onehot.shape) +# print("Prepared test and validation data set and labels: {}, {}, {}, {}, {}, {}".format( +# len(source_train), +# len(target_train), +# len(source_val), +# len(target_val), +# len(labels_train), +# len(labels_val) +# ) +# ) \ No newline at end of file diff --git a/code_jurica/seq2seq.py b/code_jurica/seq2seq.py index 5ff7310..78c68a0 100644 --- a/code_jurica/seq2seq.py +++ b/code_jurica/seq2seq.py @@ -72,24 +72,43 @@ callbacks_list = [ ] latent_dim=256 -batch_size=1000 -epochs=2 +batch_size=500 +epochs=100 + +train_data_generator = KerasBatchGenerator(batch_size, + source_train, + source_max_sequence_tokenizer, + source_kerasTokenizer, + target_train, + target_max_sequence_tokenizer, + target_kerasTokenizer + ) + +validation_data_generator = KerasBatchGenerator(batch_size, + source_val, + source_max_sequence_tokenizer, + source_kerasTokenizer, + target_val, + target_max_sequence_tokenizer, + target_kerasTokenizer + ) print("Lets train some stuff!") # Define an input sequence and process it. -encoder_input = Input(shape=(source_max_sequence, )) +encoder_input = Input(shape=(source_max_sequence_tokenizer, )) x = source_embedding_layer(encoder_input) x, state_h, state_c = LSTM(latent_dim, return_state=True)(x) encoder_states = [state_h, state_c] # Set up the decoder, using `encoder_states` as initial state. 
-decoder_input = Input(shape=(target_max_sequence, )) +decoder_input = Input(shape=(target_max_sequence_tokenizer, )) x = target_embedding_layer(decoder_input) decoder_LSTM = LSTM(latent_dim, return_sequences=True, return_state = True) decoder_out, _ , _ = decoder_LSTM(x, initial_state=encoder_states) decoder_dense = Dense(len(target_vocab)+1, activation='softmax') decoder_out = decoder_dense(decoder_out) + # Define the model that will turn # `encoder_input_data` & `decoder_input_data` into `decoder_target_data` model = Model([encoder_input, decoder_input], decoder_out) @@ -97,14 +116,23 @@ model = Model([encoder_input, decoder_input], decoder_out) # Compile & run training model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy']) model.summary() -model.fit([source_train, target_train], - target_train_onehot, - batch_size=batch_size, - callbacks=callbacks_list, - epochs=epochs, - validation_split=0.15 - # validation_data=([source_val, target_val], target_val_onehot) - ) +# model.fit([source_train, target_train], +# target_train_onehot, +# batch_size=batch_size, +# callbacks=callbacks_list, +# epochs=epochs, +# validation_split=0.1 +# # validation_data=([source_val, target_val], target_val_onehot) +# ) + +model.fit_generator( + generator=train_data_generator.generate_data(), + steps_per_epoch=int(len(source_train)/batch_size)+1, + epochs=epochs, + callbacks=callbacks_list, + validation_data=validation_data_generator.generate_data(), + validation_steps=int(len(source_val)/batch_size)+1 +) # INFERENCE MODELS # Encoder inference model @@ -125,7 +153,7 @@ decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states, def decode_seq(inp_seq): states_val = encoder_model_inf.predict(inp_seq) - target_seq = np.zeros((1, target_max_sequence)) + target_seq = np.zeros((1, target_max_sequence_tokenizer)) target_seq[0, 0] = target_vocab['sos'] translated_sent = [] @@ -144,10 +172,10 @@ def decode_seq(inp_seq): translated_sent.append(sampled_fra_char) translated_index.append(max_val_index) - if ((sampled_fra_char == 'eos') or (len(translated_sent) > target_max_sequence)): + if ((sampled_fra_char == 'eos') or (len(translated_sent) > target_max_sequence_tokenizer)): stop_condition = True - target_seq = np.zeros((1, target_max_sequence)) + target_seq = np.zeros((1, target_max_sequence_tokenizer)) target_seq[0, 0] = max_val_index states_val = [decoder_h, decoder_c] diff --git a/code_jurica/test.py b/code_jurica/test.py index f6918e7..4cc8ef3 100644 --- a/code_jurica/test.py +++ b/code_jurica/test.py @@ -113,7 +113,7 @@ def decode_seq(inp_seq): y_true = [] y_pred = [] # for seq_index in range(len(source_corpus)): -for seq_index in range(len(source_val)): +for seq_index in tqdm.tqdm(range(len(source_val))): # inp_seq = source_val[seq_index:seq_index + 1] # inp_seq = s2s_source_tokenizer.texts_to_sequences(inp_seq) diff --git a/code_jurica/util.py b/code_jurica/util.py index 509f13d..31fc37b 100644 --- a/code_jurica/util.py +++ b/code_jurica/util.py @@ -15,6 +15,8 @@ from fastText import load_model import math import datetime from io import StringIO +import keras +from keras.preprocessing.sequence import pad_sequences #REPRODUCIBLE np.random.seed(42) @@ -296,4 +298,64 @@ class prepareData(): pickle.dump(preparedDictionary, open(DICT_PREPROCESED, 'wb')) - return preparedDictionary \ No newline at end of file + return preparedDictionary + + def generator(self, data, labels, batch_size): + # Create empty arrays to contain batch of data and labels# + batch_data = 
np.zeros((batch_size, 64, 64, 3)) + batch_labels = np.zeros((batch_size, 1)) + while True: + for i in range(batch_size): + # choose random index in data + index = random.choice(len(data), 1) + batch_data[i] = some_processing(data[index]) + batch_labels[i] = labels[index] + yield batch_data, batch_labels + +class KerasBatchGenerator(keras.utils.Sequence): + + def __init__(self, batch_size, source_corpus, source_maxlen, source_tokenizer, target_corpus, target_maxlen, target_tokenizer): + + self.batch_size = batch_size + + self.source_corpus = source_corpus + self.target_corpus = target_corpus + + self.source_maxlen = source_maxlen + self.target_maxlen = target_maxlen + + self.source_tokenizer = source_tokenizer + self.target_tokenizer = target_tokenizer + + # this will track the progress of the batches sequentially through the + # data set - once the data reaches the end of the data set it will reset + # back to zero + self.current_idx = 0 + + + def generate_data(self): + + while True: + + if self.current_idx * self.batch_size >= len(self.source_corpus): + # reset the index back to the start of the data set + self.current_idx = 0 + + batch_source = self.source_corpus[self.current_idx * self.batch_size:(self.current_idx + 1) * self.batch_size] + batch_source = self.source_tokenizer.texts_to_sequences(batch_source) + batch_source = pad_sequences(batch_source, maxlen=self.source_maxlen, padding='post') + + batch_target = self.target_corpus[self.current_idx * self.batch_size:(self.current_idx + 1) * self.batch_size] + batch_target = self.target_tokenizer.texts_to_sequences(batch_target) + + target_train_onehot = np.zeros((self.batch_size, self.target_maxlen, len(self.target_tokenizer.word_index) + 1)) + for seq_id, sequence in enumerate(batch_target): + for item_id, item in enumerate(sequence): + if item_id > 0: + target_train_onehot[seq_id][item_id - 1][int(item)] = 1 + + batch_target = pad_sequences(batch_target, maxlen=self.target_maxlen, padding='post') + + self.current_idx += 1 + + yield [batch_source, batch_target], target_train_onehot -- GitLab
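
Notes on the patch (not part of the commit):

KerasBatchGenerator subclasses keras.utils.Sequence but never implements __len__/__getitem__; seq2seq.py consumes it through generate_data(), i.e. as a plain Python generator, so Keras cannot safely parallelise loading with multiple workers, which is the usual remedy for the slowness mentioned in the subject line. The one-hot target array is also always self.batch_size rows long, so the trailing short batch implied by steps_per_epoch=int(len(source_train)/batch_size)+1 yields inputs and targets with different sample counts. A minimal index-based Sequence along the lines below would address both; it reuses the names from loader.py (source_kerasTokenizer, target_kerasTokenizer, source_max_sequence_tokenizer, target_max_sequence_tokenizer) but is an untested sketch, not the author's implementation.

    import numpy as np
    import keras
    from keras.preprocessing.sequence import pad_sequences

    class Seq2SeqSequence(keras.utils.Sequence):
        # Hedged sketch: index-based batches sized to the actual slice, so the
        # last (shorter) batch keeps encoder/decoder inputs and targets aligned.
        def __init__(self, batch_size, source_texts, source_maxlen, source_tokenizer,
                     target_texts, target_maxlen, target_tokenizer):
            self.batch_size = batch_size
            self.source_texts = source_texts
            self.target_texts = target_texts
            self.source_maxlen = source_maxlen
            self.target_maxlen = target_maxlen
            self.source_tokenizer = source_tokenizer
            self.target_tokenizer = target_tokenizer

        def __len__(self):
            # batches per epoch, including the trailing partial batch
            return int(np.ceil(len(self.source_texts) / float(self.batch_size)))

        def __getitem__(self, idx):
            lo = idx * self.batch_size
            hi = min(lo + self.batch_size, len(self.source_texts))

            batch_source = self.source_tokenizer.texts_to_sequences(self.source_texts[lo:hi])
            batch_source = pad_sequences(batch_source, maxlen=self.source_maxlen, padding='post')

            batch_target = self.target_tokenizer.texts_to_sequences(self.target_texts[lo:hi])

            # decoder targets: one-hot of the decoder input shifted left by one
            # step (teacher forcing), sized to the real batch, not self.batch_size
            vocab_size = len(self.target_tokenizer.word_index) + 1
            target_onehot = np.zeros((hi - lo, self.target_maxlen, vocab_size))
            for seq_id, sequence in enumerate(batch_target):
                for item_id, item in enumerate(sequence):
                    if item_id > 0:
                        target_onehot[seq_id][item_id - 1][int(item)] = 1

            batch_target = pad_sequences(batch_target, maxlen=self.target_maxlen, padding='post')
            return [batch_source, batch_target], target_onehot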
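
If that Sequence were used, the training call in seq2seq.py could pass the objects directly: with a keras.utils.Sequence, fit_generator infers the step counts from len() and can use several workers. workers and use_multiprocessing are standard Keras 2 fit_generator arguments; the values below are only illustrative.

    train_seq = Seq2SeqSequence(batch_size, source_train, source_max_sequence_tokenizer,
                                source_kerasTokenizer, target_train,
                                target_max_sequence_tokenizer, target_kerasTokenizer)
    val_seq = Seq2SeqSequence(batch_size, source_val, source_max_sequence_tokenizer,
                              source_kerasTokenizer, target_val,
                              target_max_sequence_tokenizer, target_kerasTokenizer)

    model.fit_generator(
        train_seq,                      # steps_per_epoch defaults to len(train_seq)
        epochs=epochs,
        callbacks=callbacks_list,
        validation_data=val_seq,        # validation_steps defaults to len(val_seq)
        workers=4,                      # assumption: tune to the machine
        use_multiprocessing=True,
    )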
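
Separately, the generator method added to prepareData looks like an image-batching template that nothing in the patch calls: if random is the standard-library module, random.choice(len(data), 1) raises a TypeError, some_processing is undefined, and the (64, 64, 3) sample shape does not match this text corpus. If it is meant to stay as a generic helper, a runnable shape of it might look like the following, where sample_shape and the identity transform are placeholders rather than anything taken from the repository.

    import random
    import numpy as np

    def random_batch_generator(data, labels, batch_size,
                               sample_shape=(64, 64, 3), transform=lambda x: x):
        # Endlessly yield (batch_data, batch_labels) built from random samples.
        batch_data = np.zeros((batch_size,) + sample_shape)
        batch_labels = np.zeros((batch_size, 1))
        while True:
            for i in range(batch_size):
                index = random.randrange(len(data))   # one random index into data
                batch_data[i] = transform(data[index])
                batch_labels[i] = labels[index]
            yield batch_data, batch_labels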