From 935f6903873495355175131b143cc0b189848719 Mon Sep 17 00:00:00 2001 From: Jurica Seva <seva@informatik.hu-berlin.de> Date: Thu, 24 May 2018 14:43:24 +0200 Subject: [PATCH] Started writeup. --- code_jurica/classificationICD10_attention.py | 7 +++-- .../classificationICD10_attention_char.py | 30 ++++++++++--------- paper/40_experiments.tex | 16 +++++++++- paper/50_conclusion.tex | 24 ++++++++++++++- paper/wbi-eclef18.tex | 18 +++++------ 5 files changed, 66 insertions(+), 29 deletions(-) diff --git a/code_jurica/classificationICD10_attention.py b/code_jurica/classificationICD10_attention.py index fa74fd4..6ee8659 100644 --- a/code_jurica/classificationICD10_attention.py +++ b/code_jurica/classificationICD10_attention.py @@ -16,7 +16,7 @@ from keras import backend as K from keras.preprocessing.sequence import pad_sequences from keras.preprocessing.text import Tokenizer from keras.optimizers import Adam -from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger +from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger, TensorBoard from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional from keras.models import Model from keras.utils import multi_gpu_model, np_utils @@ -38,6 +38,8 @@ config.gpu_options.allocator_type='BFC' sess = tf.Session(graph=tf.get_default_graph(), config=config) K.set_session(sess) +tbCallBack = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True) + callbacks_list=[ EarlyStopping( monitor='val_loss', @@ -52,7 +54,8 @@ callbacks_list=[ CSVLogger( append=True, filename='logs/icd10Classification_attention_duplicated_{}.csv'.format(date_label), - ) + ), + tbCallBack ] latent_dim = 512 diff --git a/code_jurica/classificationICD10_attention_char.py b/code_jurica/classificationICD10_attention_char.py index 8d9d2f2..238e3b5 100644 --- a/code_jurica/classificationICD10_attention_char.py +++ b/code_jurica/classificationICD10_attention_char.py @@ -9,6 +9,7 @@ import traceback from collections import Counter from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split +from itertools import chain from keras.preprocessing.sequence import pad_sequences from keras.preprocessing.text import Tokenizer @@ -22,6 +23,7 @@ import tensorflow as tf from _layers import AttentionWithContext, Attention + ################################### # TensorFlow wizardry config=tf.ConfigProto() @@ -57,25 +59,21 @@ chars = set() # tokenizer=TokenizePreprocessor() kerasTokenizer = Tokenizer(char_level=True, filters=None) dataLoader=prepareData() -corpora=dataLoader.prepareDictionaries() -tmp =[x[1] for x in corpora] -labels_c = Counter(tmp) +corpora, labels_all =dataLoader.prepareDictionaries() +labels_c = Counter(labels_all) labels_tmp = {k:v for k,v in labels_c.items() if v > 1} - # input('size of corpora') -for line in corpora: - if line[1] in labels_tmp: - labels.append(line[1]) - sentences.append(line[0]) - - for ch in line[0]: - if (ch not in chars): - chars.add(ch) +for line, label in zip(corpora, labels_all): + if label in labels_tmp: + sentences.append(line) + labels.append(label) + tmp = set(line) + chars = set(chain(chars,tmp)) +# print(chars, len(chars)) chars = sorted(list(chars)) - char_to_index_dict={item.strip():i+1 for i,item in enumerate(chars)} index_to_char_dict={i+1:item.strip() for i,item in enumerate(chars)} @@ -111,8 +109,12 @@ encoded_Y = encoder.transform(labels) # convert integers to dummy variables (i.e. 
one hot encoded)
 labels_one_hot = np_utils.to_categorical(encoded_Y)
+print(word_sequence.shape, labels_one_hot.shape)
+print(type(word_sequence), type(labels_one_hot))
+
+# input('bla')
 
-X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.05, random_state=777, stratify=True)
+X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.05, random_state=777, stratify=labels)
 print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test))
 
 try:
diff --git a/paper/40_experiments.tex b/paper/40_experiments.tex
index cfe2e53..9c3016a 100644
--- a/paper/40_experiments.tex
+++ b/paper/40_experiments.tex
@@ -1 +1,15 @@
-\nj{TODO: Insert text!}
\ No newline at end of file
+In this section we present the experiments and the results obtained for the two developed models, both individually and combined in the proposed pipeline.
+As mentioned in Section \ref{sec:methods}, the proposed pipeline combines two NN models.
+
+\subsection{Available datasets}
+The CLEF e-Health 2018 Task 1 participants were provided with annotated death certificates for the three target languages (French, Italian and Hungarian).
+
+\subsection{Named Entity Recognition}
+To identify possible tokens as candidates for the death cause, we focused on the use of a sequence-to-sequence model.
+The
+
+\subsection{Named Entity Normalization}
+
+\subsection{Final Pipeline}
+
+
diff --git a/paper/50_conclusion.tex b/paper/50_conclusion.tex
index cfe2e53..c20c770 100644
--- a/paper/50_conclusion.tex
+++ b/paper/50_conclusion.tex
@@ -1 +1,23 @@
-\nj{TODO: Insert text!}
\ No newline at end of file
+\nj{TODO: Insert text!}
+In this paper we tackled the problem of extracting causes of death in a multilingual environment.
+The proposed solution focuses on language-independent models and relies on word embeddings for each of the languages.
+The proposed pipeline is divided into two steps: (1) first, possible tokens describing the death cause are generated using a sequence-to-sequence model with an attention mechanism; then, (2) the generated token sequence is normalized to a possible ICD-10 code.
+
+We detected several issues with the proposed pipeline. These issues also serve as prospective future work.
+The word embeddings we used are not optimized for the problem domain but are trained on general text.
+The shared embedding space is currently defined as the concatenation of the word embedding models for the individual tokens.
+In this respect, we identified several possible improvements of the proposed pipeline.
+First, the use of in-domain target-language embeddings as initial token embeddings.
+Although this was our initial approach, finding adequate in-domain corpora for the selected languages proved too hard to tackle.
+Our current embedding space is merely a concatenation of the three target-language embeddings.
+Creating a unified embedding space would yield a truly language-independent token representation.
+Additionally, it has been shown that in-domain embeddings improve the quality of the achieved results. This will be the main focus of our future work.
+The normalization step also suffered from a lack of adequate training data.
+Unfortunately, we were unable to obtain ICD-10 dictionaries for all languages and can therefore not guarantee the completeness of the ICD-10 label space.
+A final downside of the proposed pipeline is the lack of support for multi-label classification.
+
+
+
+
+
+
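For reference, the character-level data preparation and the stratified split that classificationICD10_attention_char.py switches to in this patch (labels returned separately, rare labels filtered out, one-hot encoding, and the raw label list passed to stratify) can be sketched roughly as follows. The toy corpus and labels are made-up placeholders standing in for the data returned by the project's prepareData loader, and the test_size is enlarged only so the tiny toy set can be split:

# Minimal, self-contained sketch; hypothetical toy data instead of prepareData().
from collections import Counter
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

toy_corpus = ["insuffisance cardiaque", "arret cardiaque", "pneumonie",
              "pneumonie aigue", "arret cardiorespiratoire"]
toy_labels = ["I50", "I46", "J18", "J18", "I46"]

# keep only labels that occur more than once so the split can be stratified
label_counts = Counter(toy_labels)
kept = [(text, label) for text, label in zip(toy_corpus, toy_labels)
        if label_counts[label] > 1]
sentences, labels = map(list, zip(*kept))

# character-level tokenization and padding, as in the modified script
tokenizer = Tokenizer(char_level=True, filters=None)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
word_sequence = pad_sequences(sequences, maxlen=max(len(s) for s in sequences))

# one-hot encode the ICD-10 labels
encoder = LabelEncoder()
labels_one_hot = np_utils.to_categorical(encoder.fit_transform(labels))

# stratify on the raw labels; test_size is 0.5 only for this toy set (the script uses 0.05)
X_train, X_test, Y_train, Y_test = train_test_split(
    word_sequence, labels_one_hot, test_size=0.5, random_state=777, stratify=labels)
print("Prepared data:", len(X_train), len(Y_train), len(X_test), len(Y_test))

The key change the patch makes here is passing the label list to stratify rather than True, which scikit-learn would reject.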
diff --git a/paper/wbi-eclef18.tex b/paper/wbi-eclef18.tex
index 7b6b66c..0a94565 100644
--- a/paper/wbi-eclef18.tex
+++ b/paper/wbi-eclef18.tex
@@ -42,18 +42,13 @@ Bioinformatics, \\
 Berlin, Germany\\
 \maketitle % typeset the header of the contribution
 %
 \begin{abstract}
-This paper describes the participation of the WBI team in the CLEF eHealth 2018
-shared task 1 (``Multilingual Information Extraction - ICD-10 coding''). Our
-approach builds on two recurrent neural networks models to extract and classify
-causes of death from French, Italian and Hungarian death certificates. First, we
-employ a LSTM-based sequence-to-sequence model to obtain a symptom name from each
-death certificate line. We then utilize a bidirectional LSTM model with
-attention mechanism to assign the respective ICD-10 codes to the received
-symptom names. Our model achieves \ldots
-
+This paper describes the participation of the WBI team in the CLEF eHealth 2018 shared task 1 (``Multilingual Information Extraction - ICD-10 coding'').
+Our approach builds on two recurrent neural network models to extract and classify causes of death from French, Italian and Hungarian death certificates.
+First, we employ an LSTM-based sequence-to-sequence model to obtain a symptom name from each death certificate line.
+We then utilize a bidirectional LSTM model with an attention mechanism to assign the respective ICD-10 codes to the received symptom names.
+Our model achieves \ldots
 
-\keywords{ICD-10 coding \and Biomedical information extraction \and Multi-lingual sequence-to-sequence
-model \and Represention learning \and Attention mechanism}
+\keywords{ICD-10 coding \and Biomedical information extraction \and Multi-lingual sequence-to-sequence model \and Representation learning \and Attention mechanism}
 
 \end{abstract}
 
@@ -64,6 +59,7 @@ model \and Represention learning \and Attention mechanism}
 \input{20_related_work}
 
 \section{Methods}
+\label{sec:methods}
 \input{30_methods_intro}
 \input{31_methods_seq2seq}
 \input{32_methods_icd10}
--
GitLab
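The abstract above describes the second stage as a bidirectional LSTM with an attention mechanism that assigns ICD-10 codes to the generated symptom names. A rough, stand-alone sketch of such a classifier is given below. It uses only stock Keras layers, with a simple learned attention standing in for the project's custom AttentionWithContext layer from _layers; all sizes are placeholder values except latent_dim = 512, which appears in the patched script:

# Hedged sketch: embedding -> BiLSTM -> simple attention -> softmax over ICD-10 codes.
# Not the repository's exact model; layer sizes other than latent_dim are invented.
from keras import backend as K
from keras.layers import (Input, Embedding, Bidirectional, LSTM, Dense, Flatten,
                          Activation, RepeatVector, Permute, Multiply, Lambda)
from keras.models import Model

max_len, vocab_size, embedding_dim = 50, 10000, 300   # placeholder values
latent_dim, num_codes = 512, 1000                     # latent_dim as in the patch

tokens = Input(shape=(max_len,), dtype='int32')
embedded = Embedding(vocab_size, embedding_dim)(tokens)
hidden = Bidirectional(LSTM(latent_dim, return_sequences=True))(embedded)  # (batch, max_len, 2*latent_dim)

# simple learned attention: score each time step, normalize, take the weighted sum
scores = Dense(1, activation='tanh')(hidden)            # (batch, max_len, 1)
scores = Flatten()(scores)                              # (batch, max_len)
weights = Activation('softmax')(scores)                 # attention weights over time steps
weights = RepeatVector(2 * latent_dim)(weights)         # (batch, 2*latent_dim, max_len)
weights = Permute((2, 1))(weights)                      # (batch, max_len, 2*latent_dim)
context = Multiply()([hidden, weights])
context = Lambda(lambda x: K.sum(x, axis=1))(context)   # (batch, 2*latent_dim)

predictions = Dense(num_codes, activation='softmax')(context)
model = Model(inputs=tokens, outputs=predictions)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Training such a model would use the one-hot ICD-10 labels from the data-preparation step together with the callbacks configured in the patch (early stopping, checkpointing, CSV logging and the newly added TensorBoard callback).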