From 2db3b1e3d565910104d16ea1435a5e23a88d7958 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20Sa=CC=88nger?= <mario.saenger@student.hu-berlin.de>
Date: Fri, 22 Jun 2018 14:21:15 +0200
Subject: [PATCH] Add logging of seq2seq prediction

---
 code_jurica/loader.py                  |   2 +-
 code_jurica/loader_tmp.py              |  10 +--
 code_jurica/seq2seq.py                 |  91 ++++++++++++----
 code_jurica/seq2seq_attention.py       |  87 +++++++++++----
 code_jurica/seq2seq_base.py            | 100 +++++++++++++++++++++++++
 code_jurica/test.py                    |   2 +-
 code_mario/app_context.py              |   2 +-
 code_mario/clef18_task1_base.py        |   2 +-
 code_mario/clef18_task1_data.py        |   2 +-
 code_mario/{util.py => common_util.py} |   0
 code_mario/ft_embeddings.py            |   2 +-
 code_mario/keras_extension.py          |   2 +-
 code_mario/preprocessing.py            |   2 +-
 13 files changed, 209 insertions(+), 95 deletions(-)
 create mode 100644 code_jurica/seq2seq_base.py
 rename code_mario/{util.py => common_util.py} (100%)

diff --git a/code_jurica/loader.py b/code_jurica/loader.py
index 406144f..ad3009c 100644
--- a/code_jurica/loader.py
+++ b/code_jurica/loader.py
@@ -144,4 +144,4 @@ with open('models/train_test_split_extended.p', 'wb') as handle:
 #             len(labels_train),
 #             len(labels_val)
 #         )
-#     )
\ No newline at end of file
+#     )
diff --git a/code_jurica/loader_tmp.py b/code_jurica/loader_tmp.py
index 1e0190b..84ad201 100644
--- a/code_jurica/loader_tmp.py
+++ b/code_jurica/loader_tmp.py
@@ -9,13 +9,7 @@ from keras.preprocessing.text import Tokenizer
 from keras.layers import Embedding
 
 #REPRODUCIBLE
-<<<<<<< HEAD
 os.environ['PYTHONHASHSEED'] = '0'
-=======
-from util import TokenizePreprocessor
-
-
->>>>>>> origin/master
 np.random.seed(42)
 random.seed(12345)
 
@@ -27,11 +21,9 @@ SEED = 777
 itCorpora, itErrors = prepareData.prepareData('IT')
 huCorpora, huErrors = prepareData.prepareData('HU')
-<<<<<<< HEAD
-=======
 frCorpora, frErrors = prepareData.prepareData('FR')
 # print(len(frErrors), len(itErrors), len(huErrors))
->>>>>>> origin/master
+
 
 try:
     df = pd.DataFrame(frErrors+itErrors+huErrors, columns=['Dataset','DocID', 'MissingRowID'])
diff --git a/code_jurica/seq2seq.py b/code_jurica/seq2seq.py
index 4fd324a..9aef90c 100644
--- a/code_jurica/seq2seq.py
+++ b/code_jurica/seq2seq.py
@@ -12,6 +12,7 @@ import tqdm
 import pickle
 from sklearn.metrics import classification_report
+from seq2seq_base import run_pipeline_prediction
 
 #REPRODUCIBLE
 np.random.seed(42)
@@ -183,42 +184,54 @@ def decode_seq(inp_seq):
     return translated_sent[:-1], translated_index[:-1]
 
-y_true = []
-y_pred = []
-
-
-source_val = source_kerasTokenizer.texts_to_sequences(source_val)
-source_val = pad_sequences(source_val, maxlen=source_max_sequence_tokenizer, padding='post')
-
-for seq_index in tqdm.tqdm(range(len(source_val))):
-# for seq_index in range(10):
-    inp_seq = source_val[seq_index:seq_index+1]
-    translated_sent, translated_index= decode_seq(inp_seq)
-
-    # PREDICT ICD10
-    source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
-    word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
-    icd10_code_index = icd10_model.predict(word_sequence)
-    # print(icd10_code_index, type(icd10_code_index))
-    max_val_index = np.argmax(icd10_code_index, axis=1)[0]
-    # print(max_val_index)
-    icd10_label = encoded_Y.inverse_transform(max_val_index)
-
-    # print('-')
-    # target_index = np.trim_zeros(target_val[seq_index], 'b')[1:-1]
-    # print('Target indexes:', target_index)
-    # print('Decoded indexes:', translated_index)
-    #
-    # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
".join([target_index_to_word_dict[x] for x in target_index])) - # print('Decoded sentence:', " ".join(translated_sent)) - # - # print('Target ICD-10:', labels_val[seq_index]) - # print('Predict ICD-10:', icd10_label) - - y_true.append(labels_val[seq_index]) - y_pred.append(icd10_label) - -report = classification_report(y_true, y_pred) -report_df = report_to_df(report) -report_df.to_csv('logs/classification_report_extended.csv') -print(report_df) \ No newline at end of file + +max_icd10_length = icd10_model.layers[0].input_shape[1] + +run_pipeline_prediction(source_val, decode_seq, icd10_model, encoded_Y, labels_val, + source_kerasTokenizer, source_max_sequence_tokenizer, + icd10Tokenizer, max_icd10_length, + target_index_to_word_dict, target_val, + 'logs/seq2seq') + + + + +# y_true = [] +# y_pred = [] +# +# +# source_val = source_kerasTokenizer.texts_to_sequences(source_val) +# source_val = pad_sequences(source_val, maxlen=source_max_sequence_tokenizer, padding='post') +# +# for seq_index in tqdm.tqdm(range(len(source_val))): +# # for seq_index in range(10): +# inp_seq = source_val[seq_index:seq_index+1] +# translated_sent, translated_index= decode_seq(inp_seq) +# +# # PREDICT ICD10 +# source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)]) +# word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post') +# icd10_code_index = icd10_model.predict(word_sequence) +# # print(icd10_code_index, type(icd10_code_index)) +# max_val_index = np.argmax(icd10_code_index, axis=1)[0] +# # print(max_val_index) +# icd10_label = encoded_Y.inverse_transform(max_val_index) +# +# # print('-') +# # target_index = np.trim_zeros(target_val[seq_index], 'b')[1:-1] +# # print('Target indexes:', target_index) +# # print('Decoded indexes:', translated_index) +# # +# # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index])) +# # print('Decoded sentence:', " ".join(translated_sent)) +# # +# # print('Target ICD-10:', labels_val[seq_index]) +# # print('Predict ICD-10:', icd10_label) +# +# y_true.append(labels_val[seq_index]) +# y_pred.append(icd10_label) +# +# report = classification_report(y_true, y_pred) +# report_df = report_to_df(report) +# report_df.to_csv('logs/classification_report_extended.csv') +# print(report_df) diff --git a/code_jurica/seq2seq_attention.py b/code_jurica/seq2seq_attention.py index 6074098..23bfd07 100644 --- a/code_jurica/seq2seq_attention.py +++ b/code_jurica/seq2seq_attention.py @@ -12,6 +12,7 @@ import tqdm import pickle from sklearn.metrics import classification_report +from seq2seq_base import run_pipeline_prediction #REPRODUCIBLE np.random.seed(42) @@ -230,42 +231,50 @@ def decode_seq(inp_seq): return translated_sent[:-1], translated_index[:-1] -y_true = [] -y_pred = [] - - -source_val = source_kerasTokenizer.texts_to_sequences(source_val) -source_val = pad_sequences(source_val, maxlen=source_max_sequence_tokenizer, padding='post') - -for seq_index in tqdm.tqdm(range(len(source_val))): -# for seq_index in range(10): - inp_seq = source_val[seq_index:seq_index+1] - translated_sent, translated_index= decode_seq(inp_seq) - - # PREDICT ICD10 - source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)]) - word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post') - icd10_code_index = icd10_model.predict(word_sequence) - # print(icd10_code_index, type(icd10_code_index)) - max_val_index = 
-    # print(max_val_index)
-    icd10_label = encoded_Y.inverse_transform(max_val_index)
-
-    # print('-')
-    # target_index = np.trim_zeros(target_val[seq_index], 'b')[1:-1]
-    # print('Target indexes:', target_index)
-    # print('Decoded indexes:', translated_index)
-    #
-    # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
-    # print('Decoded sentence:', " ".join(translated_sent))
-    #
-    # print('Target ICD-10:', labels_val[seq_index])
-    # print('Predict ICD-10:', icd10_label)
-
-    y_true.append(labels_val[seq_index])
-    y_pred.append(icd10_label)
-
-report = classification_report(y_true, y_pred)
-report_df = report_to_df(report)
-report_df.to_csv('logs/classification_report_extended.csv')
-print(report_df)
+max_icd10_length = icd10_model.layers[0].input_shape[1]
+
+run_pipeline_prediction(source_val, decode_seq, icd10_model, encoded_Y, labels_val,
+                        source_kerasTokenizer, source_max_sequence_tokenizer,
+                        icd10Tokenizer, max_icd10_length,
+                        target_index_to_word_dict, target_val,
+                        'logs/seq2seq')
+
+# y_true = []
+# y_pred = []
+#
+#
+# source_val = source_kerasTokenizer.texts_to_sequences(source_val)
+# source_val = pad_sequences(source_val, maxlen=source_max_sequence_tokenizer, padding='post')
+#
+# for seq_index in tqdm.tqdm(range(len(source_val))):
+# # for seq_index in range(10):
+#     inp_seq = source_val[seq_index:seq_index+1]
+#     translated_sent, translated_index= decode_seq(inp_seq)
+#
+#     # PREDICT ICD10
+#     source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
+#     word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
+#     icd10_code_index = icd10_model.predict(word_sequence)
+#     # print(icd10_code_index, type(icd10_code_index))
+#     max_val_index = np.argmax(icd10_code_index, axis=1)[0]
+#     # print(max_val_index)
+#     icd10_label = encoded_Y.inverse_transform(max_val_index)
+#
+#     # print('-')
+#     # target_index = np.trim_zeros(target_val[seq_index], 'b')[1:-1]
+#     # print('Target indexes:', target_index)
+#     # print('Decoded indexes:', translated_index)
+#     #
+#     # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
+#     # print('Decoded sentence:', " ".join(translated_sent))
+#     #
+#     # print('Target ICD-10:', labels_val[seq_index])
+#     # print('Predict ICD-10:', icd10_label)
+#
+#     y_true.append(labels_val[seq_index])
+#     y_pred.append(icd10_label)
+#
+# report = classification_report(y_true, y_pred)
+# report_df = report_to_df(report)
+# report_df.to_csv('logs/classification_report_extended.csv')
+# print(report_df)
diff --git a/code_jurica/seq2seq_base.py b/code_jurica/seq2seq_base.py
new file mode 100644
index 0000000..ca5ff4b
--- /dev/null
+++ b/code_jurica/seq2seq_base.py
@@ -0,0 +1,100 @@
+import tqdm
+import numpy as np
+import os
+from typing import List, Callable, Dict, Iterable
+
+from keras import Model
+from keras.preprocessing.sequence import pad_sequences
+from keras.preprocessing.text import Tokenizer
+from sklearn.metrics import classification_report
+from sklearn.preprocessing import LabelEncoder
+
+from util import report_to_df
+
+
+def run_pipeline_prediction(sentences: List[str], decode_seq_fnc: Callable, icd10_model: Model,
+                            icd10_encoder: LabelEncoder, gold_labels: List,
+                            source_tokenizer: Tokenizer, max_source_length: int,
+                            icd10_tokenizer: Tokenizer, max_icd10_length: int,
+                            target_index_to_word_dict: Dict, gold_target_indexes,
+                            log_dir: str):
+
+    os.makedirs(log_dir, exist_ok=True)
+
+    y_true = []
+    y_pred = []
+
+    source_val = source_tokenizer.texts_to_sequences(sentences)
+    source_val = pad_sequences(source_val, maxlen=max_source_length, padding='post')
+
+    pred_texts = []
+    pred_ids = []
+
+    gold_texts = []
+    gold_ids = []
+
+    for seq_index in tqdm.tqdm(range(len(source_val)), total=len(source_val)):
+        # for seq_index in range(10):
+        inp_seq = source_val[seq_index:seq_index + 1]
+        translated_sent, translated_index = decode_seq_fnc(inp_seq)
+
+        # PREDICT ICD10
+        source_word_sequence = icd10_tokenizer.texts_to_sequences([" ".join(translated_sent)])
+        word_sequence = pad_sequences(source_word_sequence, maxlen=max_icd10_length, padding='post')
+        icd10_code_index = icd10_model.predict(word_sequence)
+        # print(icd10_code_index, type(icd10_code_index))
+        max_val_index = np.argmax(icd10_code_index, axis=1)[0]
+
+        # print(max_val_index)
+        icd10_label = icd10_encoder.inverse_transform(max_val_index)
+
+        y_true.append(gold_labels[seq_index])
+        y_pred.append(icd10_label)
+
+        # Debugging prediction
+        pred_text = " ".join(translated_sent)
+        pred_texts.append(pred_text)
+
+        # translated_index holds integer token ids, so convert before joining
+        pred_indexes = " ".join(map(str, translated_index))
+        pred_ids.append(pred_indexes)
+
+        gold_indexes = np.trim_zeros(gold_target_indexes[seq_index], 'b')[1:-1]
+        gold_ids.append(" ".join(map(str, gold_indexes)))
+
+        gold_text = " ".join([target_index_to_word_dict[x] for x in gold_indexes])
+        gold_texts.append(gold_text)
+
+        print('Target indexes:', gold_indexes)
+        print('Decoded indexes:', pred_indexes)
+
+        print('Target text:', gold_text)
+        print('Decoded text:', pred_text)
+
+        print('Target ICD-10:', gold_labels[seq_index])
+        print('Predict ICD-10:', icd10_label)
+
+        print('----------------------------------------------')
+
+
+    # Write classification report
+    report = classification_report(y_true, y_pred)
+    report_df = report_to_df(report)
+    report_file = os.path.join(log_dir, 'classification_report.csv')
+    report_df.to_csv(report_file)
+    print(report_df)
+
+    # Save prediction debugging information
+    save_list_to_file(pred_texts, os.path.join(log_dir, 'pred_texts.txt'))
+    save_list_to_file(pred_ids, os.path.join(log_dir, 'pred_ids.txt'))
+
+    save_list_to_file(gold_texts, os.path.join(log_dir, 'gold_texts.txt'))
+    save_list_to_file(gold_ids, os.path.join(log_dir, 'gold_ids.txt'))
+
+
+def save_list_to_file(values: Iterable, output_file: str):
+    # The with-statement closes the file automatically on exit
+    with open(output_file, "w", encoding="utf-8") as output_writer:
+        output_content = ["%s\n" % value for value in values]
+        output_writer.writelines(output_content)
+
+
diff --git a/code_jurica/test.py b/code_jurica/test.py
index 954c8fd..11808bc 100644
--- a/code_jurica/test.py
+++ b/code_jurica/test.py
@@ -169,4 +169,4 @@ for k,v in runs.items():
 
     report = classification_report(y_true, y_pred)
     report_df = report_to_df(report)
-    report_df.to_csv('logs/classification_report_test{}.csv'.format(v['s2s']))
\ No newline at end of file
+    report_df.to_csv('logs/classification_report_test{}.csv'.format(v['s2s']))
diff --git a/code_mario/app_context.py b/code_mario/app_context.py
index 7723412..a32ca22 100644
--- a/code_mario/app_context.py
+++ b/code_mario/app_context.py
@@ -1,5 +1,5 @@
 import os
-from util import LogUtil
+from common_util import LogUtil
 
 
 class AppContext(object):
diff --git a/code_mario/clef18_task1_base.py b/code_mario/clef18_task1_base.py
index d941363..8303bdd 100644
--- a/code_mario/clef18_task1_base.py
+++ b/code_mario/clef18_task1_base.py
@@ -21,7 +21,7 @@ from sklearn.externals import joblib
 from sklearn.model_selection import train_test_split
 
 from app_context import AppContext
-from util import LoggingMixin
+from common_util import LoggingMixin
 from dnn_classifiers import NeuralNetworkClassifiers as nnc
 from keras_extension import KerasUtil as ku
diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py
index d24c43a..8bacd54 100644
--- a/code_mario/clef18_task1_data.py
+++ b/code_mario/clef18_task1_data.py
@@ -7,7 +7,7 @@ from pandas import DataFrame
 from tqdm import tqdm
 
 from app_context import AppContext
-from util import LoggingMixin
+from common_util import LoggingMixin
 from itertools import groupby
 from preprocessing import DataPreparationUtil as pdu
diff --git a/code_mario/util.py b/code_mario/common_util.py
similarity index 100%
rename from code_mario/util.py
rename to code_mario/common_util.py
diff --git a/code_mario/ft_embeddings.py b/code_mario/ft_embeddings.py
index 47fb9b5..57c18c0 100644
--- a/code_mario/ft_embeddings.py
+++ b/code_mario/ft_embeddings.py
@@ -4,7 +4,7 @@ from typing import List, Dict
 from gensim.models import FastText
 
 from app_context import AppContext
-from util import LoggingMixin
+from common_util import LoggingMixin
 
 
 class FastTextModel(LoggingMixin):
diff --git a/code_mario/keras_extension.py b/code_mario/keras_extension.py
index 81cbb64..0ee4c9e 100644
--- a/code_mario/keras_extension.py
+++ b/code_mario/keras_extension.py
@@ -6,7 +6,7 @@ from keras.callbacks import Callback, ModelCheckpoint, CSVLogger, EarlyStopping
 from keras.wrappers.scikit_learn import KerasClassifier
 
 from app_context import AppContext
-from util import LoggingMixin
+from common_util import LoggingMixin
 
 
 class KerasUtil(object):
diff --git a/code_mario/preprocessing.py b/code_mario/preprocessing.py
index 45b540d..963aed4 100644
--- a/code_mario/preprocessing.py
+++ b/code_mario/preprocessing.py
@@ -8,7 +8,7 @@ from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.preprocessing import LabelEncoder
 from typing import Callable, List
 
-from util import PandasUtil
+from common_util import PandasUtil
 
 
 class DataPreparationUtil(object):
--
GitLab
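
A note on the helper used by the new seq2seq_base.py: it imports report_to_df from code_jurica's util module, which this patch does not touch. Judging from its call sites, it parses the plain-text output of sklearn's classification_report into a DataFrame that can be written to CSV. The sketch below is a hypothetical stand-in written for illustration only; the column names and parsing details are assumptions, and the actual util.report_to_df may differ.

    import pandas as pd

    def report_to_df(report: str) -> pd.DataFrame:
        # Hypothetical stand-in for util.report_to_df: parses the fixed-width
        # text emitted by sklearn.metrics.classification_report into a frame
        # with one row per class plus the trailing 'avg / total' row.
        rows = []
        for line in report.strip().split('\n'):
            parts = line.strip().split()
            if len(parts) < 5:
                continue  # skips the header row and blank lines
            # Everything before the last four columns is the (possibly
            # multi-word) class label.
            label = ' '.join(parts[:-4])
            precision, recall, f1_score, support = parts[-4:]
            rows.append([label, float(precision), float(recall),
                         float(f1_score), int(support)])
        return pd.DataFrame(rows, columns=['class', 'precision', 'recall',
                                           'f1_score', 'support'])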