diff --git a/code_jurica/loader.py b/code_jurica/loader.py
index 406144fa90e74f9eb3b1f09c550ef8f2ae027daa..ad3009c17d55127258a66e50c663026978c5fb44 100644
--- a/code_jurica/loader.py
+++ b/code_jurica/loader.py
@@ -144,4 +144,4 @@ with open('models/train_test_split_extended.p', 'wb') as handle:
 #     len(labels_train),
 #     len(labels_val)
 # )
-# )
\ No newline at end of file
+# )
diff --git a/code_jurica/loader_tmp.py b/code_jurica/loader_tmp.py
index 1e0190b285831b3b6c4c299943009456640e9945..84ad20142858fdc6deb4da7ccd998cd8f7122951 100644
--- a/code_jurica/loader_tmp.py
+++ b/code_jurica/loader_tmp.py
@@ -9,13 +9,7 @@ from keras.preprocessing.text import Tokenizer
 from keras.layers import Embedding
 
 #REPRODUCIBLE
-<<<<<<< HEAD
 os.environ['PYTHONHASHSEED'] = '0'
-=======
-from util import TokenizePreprocessor
-
-
->>>>>>> origin/master
 np.random.seed(42)
 random.seed(12345)
 
@@ -27,11 +21,9 @@ SEED = 777
 
 itCorpora, itErrors = prepareData.prepareData('IT')
 huCorpora, huErrors = prepareData.prepareData('HU')
-<<<<<<< HEAD
-=======
 frCorpora, frErrors = prepareData.prepareData('FR')
 # print(len(frErrors), len(itErrors), len(huErrors))
->>>>>>> origin/master
+
 
 try:
     df = pd.DataFrame(frErrors+itErrors+huErrors, columns=['Dataset','DocID', 'MissingRowID'])
diff --git a/code_jurica/seq2seq.py b/code_jurica/seq2seq.py
index 4fd324a4280f14e3e50aeff1bbf248864a385489..9aef90ccd207ad30917cacb548d6ef165207b014 100644
--- a/code_jurica/seq2seq.py
+++ b/code_jurica/seq2seq.py
@@ -12,6 +12,7 @@ import tqdm
 import pickle
 from sklearn.metrics import classification_report
 
+from seq2seq_base import run_pipeline_prediction
 
 #REPRODUCIBLE
 np.random.seed(42)
@@ -183,42 +184,54 @@ def decode_seq(inp_seq):
 
     return translated_sent[:-1], translated_index[:-1]
 
-y_true = []
-y_pred = []
-
-
-source_val = source_kerasTokenizer.texts_to_sequences(source_val)
-source_val = pad_sequences(source_val, maxlen=source_max_sequence_tokenizer, padding='post')
-
-for seq_index in tqdm.tqdm(range(len(source_val))):
-# for seq_index in range(10):
-    inp_seq = source_val[seq_index:seq_index+1]
-    translated_sent, translated_index= decode_seq(inp_seq)
-
-    # PREDICT ICD10
-    source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
-    word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
-    icd10_code_index = icd10_model.predict(word_sequence)
-    # print(icd10_code_index, type(icd10_code_index))
-    max_val_index = np.argmax(icd10_code_index, axis=1)[0]
-    # print(max_val_index)
-    icd10_label = encoded_Y.inverse_transform(max_val_index)
-
-    # print('-')
-    # target_index = np.trim_zeros(target_val[seq_index], 'b')[1:-1]
-    # print('Target indexes:', target_index)
-    # print('Decoded indexes:', translated_index)
-    #
-    # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
-    # print('Decoded sentence:', " ".join(translated_sent))
-    #
-    # print('Target ICD-10:', labels_val[seq_index])
-    # print('Predict ICD-10:', icd10_label)
-
-    y_true.append(labels_val[seq_index])
-    y_pred.append(icd10_label)
-
-report = classification_report(y_true, y_pred)
-report_df = report_to_df(report)
-report_df.to_csv('logs/classification_report_extended.csv')
-print(report_df)
\ No newline at end of file
+
+max_icd10_length = icd10_model.layers[0].input_shape[1]
+
+run_pipeline_prediction(source_val, decode_seq, icd10_model, encoded_Y, labels_val,
+                        source_kerasTokenizer, source_max_sequence_tokenizer,
+                        icd10Tokenizer, max_icd10_length,
+                        target_index_to_word_dict, target_val,
+                        'logs/seq2seq')
+
+
+
+
+# y_true = []
+# y_pred = []
+#
+#
+# source_val = source_kerasTokenizer.texts_to_sequences(source_val)
+# source_val = pad_sequences(source_val, maxlen=source_max_sequence_tokenizer, padding='post')
+#
+# for seq_index in tqdm.tqdm(range(len(source_val))):
+# # for seq_index in range(10):
+#     inp_seq = source_val[seq_index:seq_index+1]
+#     translated_sent, translated_index= decode_seq(inp_seq)
+#
+#     # PREDICT ICD10
+#     source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
+#     word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
+#     icd10_code_index = icd10_model.predict(word_sequence)
+#     # print(icd10_code_index, type(icd10_code_index))
+#     max_val_index = np.argmax(icd10_code_index, axis=1)[0]
+#     # print(max_val_index)
+#     icd10_label = encoded_Y.inverse_transform(max_val_index)
+#
+#     # print('-')
+#     # target_index = np.trim_zeros(target_val[seq_index], 'b')[1:-1]
+#     # print('Target indexes:', target_index)
+#     # print('Decoded indexes:', translated_index)
+#     #
+#     # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
+#     # print('Decoded sentence:', " ".join(translated_sent))
+#     #
+#     # print('Target ICD-10:', labels_val[seq_index])
+#     # print('Predict ICD-10:', icd10_label)
+#
+#     y_true.append(labels_val[seq_index])
+#     y_pred.append(icd10_label)
+#
+# report = classification_report(y_true, y_pred)
+# report_df = report_to_df(report)
+# report_df.to_csv('logs/classification_report_extended.csv')
+# print(report_df)
diff --git a/code_jurica/seq2seq_attention.py b/code_jurica/seq2seq_attention.py
index 607409859290739da8db3fc589352382c9b472a6..23bfd073f7967d2b1ad3cda90dcf0c7f5fcaa75b 100644
--- a/code_jurica/seq2seq_attention.py
+++ b/code_jurica/seq2seq_attention.py
@@ -12,6 +12,7 @@ import tqdm
 import pickle
 from sklearn.metrics import classification_report
 
+from seq2seq_base import run_pipeline_prediction
 
 #REPRODUCIBLE
 np.random.seed(42)
@@ -230,42 +231,50 @@ def decode_seq(inp_seq):
 
     return translated_sent[:-1], translated_index[:-1]
 
-y_true = []
-y_pred = []
-
-
-source_val = source_kerasTokenizer.texts_to_sequences(source_val)
-source_val = pad_sequences(source_val, maxlen=source_max_sequence_tokenizer, padding='post')
-
-for seq_index in tqdm.tqdm(range(len(source_val))):
-# for seq_index in range(10):
-    inp_seq = source_val[seq_index:seq_index+1]
-    translated_sent, translated_index= decode_seq(inp_seq)
-
-    # PREDICT ICD10
-    source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
-    word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
-    icd10_code_index = icd10_model.predict(word_sequence)
-    # print(icd10_code_index, type(icd10_code_index))
-    max_val_index = np.argmax(icd10_code_index, axis=1)[0]
-    # print(max_val_index)
-    icd10_label = encoded_Y.inverse_transform(max_val_index)
-
-    # print('-')
-    # target_index = np.trim_zeros(target_val[seq_index], 'b')[1:-1]
-    # print('Target indexes:', target_index)
-    # print('Decoded indexes:', translated_index)
-    #
-    # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
-    # print('Decoded sentence:', " ".join(translated_sent))
-    #
-    # print('Target ICD-10:', labels_val[seq_index])
-    # print('Predict ICD-10:', icd10_label)
-
-    y_true.append(labels_val[seq_index])
-    y_pred.append(icd10_label)
-
-report = classification_report(y_true, y_pred)
-report_df = report_to_df(report)
-report_df.to_csv('logs/classification_report_extended.csv')
-print(report_df)
+max_icd10_length = icd10_model.layers[0].input_shape[1]
+
+run_pipeline_prediction(source_val, decode_seq, icd10_model, encoded_Y, labels_val,
+                        source_kerasTokenizer, source_max_sequence_tokenizer,
+                        icd10Tokenizer, max_icd10_length,
+                        target_index_to_word_dict, target_val,
+                        'logs/seq2seq')
+
+# y_true = []
+# y_pred = []
+#
+#
+# source_val = source_kerasTokenizer.texts_to_sequences(source_val)
+# source_val = pad_sequences(source_val, maxlen=source_max_sequence_tokenizer, padding='post')
+#
+# for seq_index in tqdm.tqdm(range(len(source_val))):
+# # for seq_index in range(10):
+#     inp_seq = source_val[seq_index:seq_index+1]
+#     translated_sent, translated_index= decode_seq(inp_seq)
+#
+#     # PREDICT ICD10
+#     source_word_sequence = icd10Tokenizer.texts_to_sequences([" ".join(translated_sent)])
+#     word_sequence = pad_sequences(source_word_sequence, maxlen=icd10_model.layers[0].input_shape[1], padding='post')
+#     icd10_code_index = icd10_model.predict(word_sequence)
+#     # print(icd10_code_index, type(icd10_code_index))
+#     max_val_index = np.argmax(icd10_code_index, axis=1)[0]
+#     # print(max_val_index)
+#     icd10_label = encoded_Y.inverse_transform(max_val_index)
+#
+#     # print('-')
+#     # target_index = np.trim_zeros(target_val[seq_index], 'b')[1:-1]
+#     # print('Target indexes:', target_index)
+#     # print('Decoded indexes:', translated_index)
+#     #
+#     # print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
+#     # print('Decoded sentence:', " ".join(translated_sent))
+#     #
+#     # print('Target ICD-10:', labels_val[seq_index])
+#     # print('Predict ICD-10:', icd10_label)
+#
+#     y_true.append(labels_val[seq_index])
+#     y_pred.append(icd10_label)
+#
+# report = classification_report(y_true, y_pred)
+# report_df = report_to_df(report)
+# report_df.to_csv('logs/classification_report_extended.csv')
+# print(report_df)
diff --git a/code_jurica/seq2seq_base.py b/code_jurica/seq2seq_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca5ff4b9404659b3604f731a01951f9d36d14c0f
--- /dev/null
+++ b/code_jurica/seq2seq_base.py
@@ -0,0 +1,100 @@
+import tqdm
+import numpy as np
+import os
+from typing import List, Callable, Dict, Iterable
+
+from keras import Model
+from keras.preprocessing.sequence import pad_sequences
+from keras.preprocessing.text import Tokenizer
+from sklearn.metrics import classification_report
+from sklearn.preprocessing import LabelEncoder
+
+from util import report_to_df
+
+
+def run_pipeline_prediction(sentences: List[str], decode_seq_fnc: Callable, icd10_model: Model,
+                            icd10_encoder: LabelEncoder, gold_labels: List,
+                            source_tokenizer: Tokenizer, max_source_length: int,
+                            icd10_tokenizer: Tokenizer, max_icd10_length: int,
+                            target_index_to_word_dict: Dict, gold_target_indexes,
+                            log_dir: str):
+
+    os.makedirs(log_dir, exist_ok=True)
+
+    y_true = []
+    y_pred = []
+
+    source_val = source_tokenizer.texts_to_sequences(sentences)
+    source_val = pad_sequences(source_val, maxlen=max_source_length, padding='post')
+
+    pred_texts = []
+    pred_ids = []
+
+    gold_texts = []
+    gold_ids = []
+
+    for seq_index in tqdm.tqdm(range(len(source_val)), total=len(source_val)):
+        # for seq_index in range(10):
+        inp_seq = source_val[seq_index:seq_index + 1]
+        translated_sent, translated_index = decode_seq_fnc(inp_seq)
+
+        # PREDICT ICD10
+        source_word_sequence = icd10_tokenizer.texts_to_sequences([" ".join(translated_sent)])
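+        # Pad the re-tokenized decoder output to the ICD-10 classifier's fixed input length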
+        word_sequence = pad_sequences(source_word_sequence, maxlen=max_icd10_length, padding='post')
+        icd10_code_index = icd10_model.predict(word_sequence)
+        # print(icd10_code_index, type(icd10_code_index))
+        max_val_index = np.argmax(icd10_code_index, axis=1)[0]
+
+        # print(max_val_index)
+        icd10_label = icd10_encoder.inverse_transform(max_val_index)
+
+        y_true.append(gold_labels[seq_index])
+        y_pred.append(icd10_label)
+
+        # Debugging prediction
+        pred_text = " ".join(translated_sent)
+        pred_texts.append(pred_text)
+
+        pred_indexes = " ".join(map(str, translated_index))
+        pred_ids.append(pred_indexes)
+
+        gold_indexes = np.trim_zeros(gold_target_indexes[seq_index], 'b')[1:-1]
+        gold_ids.append(" ".join(map(str, gold_indexes)))
+
+        gold_text = " ".join([target_index_to_word_dict[x] for x in gold_indexes])
+        gold_texts.append(gold_text)
+
+        print('Target indexes:', gold_indexes)
+        print('Decoded indexes:', pred_indexes)
+
+        print('Target text:', gold_text)
+        print('Decoded text:', pred_text)
+
+        print('Target ICD-10:', gold_labels[seq_index])
+        print('Predict ICD-10:', icd10_label)
+
+        print('----------------------------------------------')
+
+
+    # Write classification report
+    report = classification_report(y_true, y_pred)
+    report_df = report_to_df(report)
+    report_file = os.path.join(log_dir, 'classification_report.csv')
+    report_df.to_csv(report_file)
+    print(report_df)
+
+    # Save prediction debugging information
+    save_list_to_file(pred_texts, os.path.join(log_dir, 'pred_texts.txt'))
+    save_list_to_file(pred_ids, os.path.join(log_dir, 'pred_ids.txt'))
+
+    save_list_to_file(gold_texts, os.path.join(log_dir, 'gold_texts.txt'))
+    save_list_to_file(gold_ids, os.path.join(log_dir, 'gold_ids.txt'))
+
+
+def save_list_to_file(values: Iterable, output_file: str):
+    with open(output_file, "w", encoding="utf-8") as output_writer:
+        output_content = ["%s\n" % value for value in values]
+        output_writer.writelines(output_content)
+
+
diff --git a/code_jurica/test.py b/code_jurica/test.py
index 954c8fdcfa2245af9877de41bba4beba0db0aae2..11808bc4c8d811e73a32bab492621e9d1cfb3046 100644
--- a/code_jurica/test.py
+++ b/code_jurica/test.py
@@ -169,4 +169,4 @@ for k,v in runs.items():
 
     report = classification_report(y_true, y_pred)
     report_df = report_to_df(report)
-    report_df.to_csv('logs/classification_report_test{}.csv'.format(v['s2s']))
\ No newline at end of file
+    report_df.to_csv('logs/classification_report_test{}.csv'.format(v['s2s']))
diff --git a/code_mario/app_context.py b/code_mario/app_context.py
index 772341228320c615fba32b2bc1c9d654ffd5df29..a32ca22ca7d8c742ee35d3befa7e46382564f836 100644
--- a/code_mario/app_context.py
+++ b/code_mario/app_context.py
@@ -1,5 +1,5 @@
 import os
-from util import LogUtil
+from common_util import LogUtil
 
 
 class AppContext(object):
diff --git a/code_mario/clef18_task1_base.py b/code_mario/clef18_task1_base.py
index d94136309f92acc71118e34a11e117b7b9fa8d19..8303bdd6a07368015ecfb0517bd71b97ab116301 100644
--- a/code_mario/clef18_task1_base.py
+++ b/code_mario/clef18_task1_base.py
@@ -21,7 +21,7 @@ from sklearn.externals import joblib
 from sklearn.model_selection import train_test_split
 
 from app_context import AppContext
-from util import LoggingMixin
+from common_util import LoggingMixin
 from dnn_classifiers import NeuralNetworkClassifiers as nnc
 from keras_extension import KerasUtil as ku
 
diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py
index d24c43a05c101cf8d35a11651d5726dfd9b8ad49..8bacd54ff036147f05eb31ea283a96280c688a11 100644
--- a/code_mario/clef18_task1_data.py
+++ b/code_mario/clef18_task1_data.py
@@ -7,7 +7,7 @@ from pandas import DataFrame
 from tqdm import tqdm
 
 from app_context import AppContext
-from util import LoggingMixin
+from common_util import LoggingMixin
 from itertools import groupby
 from preprocessing import DataPreparationUtil as pdu
 
diff --git a/code_mario/util.py b/code_mario/common_util.py
similarity index 100%
rename from code_mario/util.py
rename to code_mario/common_util.py
diff --git a/code_mario/ft_embeddings.py b/code_mario/ft_embeddings.py
index 47fb9b5696038298f7cab8331d1172e8eb034264..57c18c0390bb7401064e1a664c3334e5104d6def 100644
--- a/code_mario/ft_embeddings.py
+++ b/code_mario/ft_embeddings.py
@@ -4,7 +4,7 @@ from typing import List, Dict
 from gensim.models import FastText
 
 from app_context import AppContext
-from util import LoggingMixin
+from common_util import LoggingMixin
 
 
 class FastTextModel(LoggingMixin):
diff --git a/code_mario/keras_extension.py b/code_mario/keras_extension.py
index 81cbb644158f1b441074a9e920d34873b2b2b58b..0ee4c9e3c18518b2f358c32f91a22f58f9359630 100644
--- a/code_mario/keras_extension.py
+++ b/code_mario/keras_extension.py
@@ -6,7 +6,7 @@ from keras.callbacks import Callback, ModelCheckpoint, CSVLogger, EarlyStopping
 from keras.wrappers.scikit_learn import KerasClassifier
 
 from app_context import AppContext
-from util import LoggingMixin
+from common_util import LoggingMixin
 
 
 class KerasUtil(object):
diff --git a/code_mario/preprocessing.py b/code_mario/preprocessing.py
index 45b540db2fbe180913d1eefaea1dadbcb2290951..963aed4395b1079e2343f050631e895a15099d15 100644
--- a/code_mario/preprocessing.py
+++ b/code_mario/preprocessing.py
@@ -8,7 +8,7 @@ from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.preprocessing import LabelEncoder
 from typing import Callable, List
 
-from util import PandasUtil
+from common_util import PandasUtil
 
 
 class DataPreparationUtil(object):
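
Usage note: after this patch, seq2seq.py and seq2seq_attention.py both delegate evaluation to seq2seq_base.run_pipeline_prediction. A minimal sketch of how a further decoder variant would reuse it, assuming the caller builds the same artifacts as seq2seq.py (tokenizers, encoded_Y label encoder, validation split, trained icd10_model, and a decode_seq function); the log directory name here is hypothetical:

    from seq2seq_base import run_pipeline_prediction

    # Input width of the ICD-10 classifier, read off its first layer as in the scripts above
    max_icd10_length = icd10_model.layers[0].input_shape[1]

    run_pipeline_prediction(source_val, decode_seq, icd10_model, encoded_Y, labels_val,
                            source_kerasTokenizer, source_max_sequence_tokenizer,
                            icd10Tokenizer, max_icd10_length,
                            target_index_to_word_dict, target_val,
                            'logs/my_variant')  # hypothetical output directory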