Commit 935f6903 authored by Jurica Seva

Started writeup.

parent 6daea87a
@@ -16,7 +16,7 @@ from keras import backend as K
 from keras.preprocessing.sequence import pad_sequences
 from keras.preprocessing.text import Tokenizer
 from keras.optimizers import Adam
-from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
+from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger, TensorBoard
 from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional
 from keras.models import Model
 from keras.utils import multi_gpu_model, np_utils
@@ -38,6 +38,8 @@ config.gpu_options.allocator_type='BFC'
 sess = tf.Session(graph=tf.get_default_graph(), config=config)
 K.set_session(sess)
+tbCallBack = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)
 callbacks_list=[
     EarlyStopping(
         monitor='val_loss',
@@ -52,7 +54,8 @@ callbacks_list=[
     CSVLogger(
         append=True,
         filename='logs/icd10Classification_attention_duplicated_{}.csv'.format(date_label),
-    )
+    ),
+    tbCallBack
 ]
 latent_dim = 512
...
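The hunks above only add the TensorBoard callback; the model definition and the fit call lie outside the excerpt. The following is a minimal, self-contained sketch (not the repository's actual code) of how such a callback list is typically passed to Keras training. The layer sizes, the toy data and the plain BiLSTM classifier are illustrative assumptions, and the custom attention layer imported from _layers is omitted.

# Minimal sketch: a character-level BiLSTM classifier trained with an
# EarlyStopping + TensorBoard callback list like the one set up above.
# Vocabulary size, sequence length, class count and the random toy data
# are illustrative assumptions; the AttentionWithContext layer is left out.
import numpy as np
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional
from keras.models import Model
from keras.optimizers import Adam
from keras.utils import np_utils

max_len, vocab_size, num_classes, latent_dim = 256, 100, 50, 512

inputs = Input(shape=(max_len,))
x = Embedding(vocab_size, 128, input_length=max_len)(inputs)
x = Bidirectional(LSTM(latent_dim))(x)            # the actual script adds attention on top
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

callbacks_list = [
    EarlyStopping(monitor='val_loss', patience=2),
    TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True),
]

# random toy data, only so that the sketch runs end to end
X = np.random.randint(1, vocab_size, size=(64, max_len))
Y = np_utils.to_categorical(np.random.randint(0, num_classes, size=64), num_classes)

model.fit(X, Y, validation_split=0.2, epochs=2, batch_size=16, callbacks=callbacks_list)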
@@ -9,6 +9,7 @@ import traceback
 from collections import Counter
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import train_test_split
+from itertools import chain
 from keras.preprocessing.sequence import pad_sequences
 from keras.preprocessing.text import Tokenizer
@@ -22,6 +23,7 @@ import tensorflow as tf
 from _layers import AttentionWithContext, Attention
 ###################################
 # TensorFlow wizardry
 config=tf.ConfigProto()
@@ -57,25 +59,21 @@ chars = set()
 # tokenizer=TokenizePreprocessor()
 kerasTokenizer = Tokenizer(char_level=True, filters=None)
 dataLoader=prepareData()
-corpora=dataLoader.prepareDictionaries()
-tmp =[x[1] for x in corpora]
-labels_c = Counter(tmp)
+corpora, labels_all = dataLoader.prepareDictionaries()
+labels_c = Counter(labels_all)
 labels_tmp = {k:v for k,v in labels_c.items() if v > 1}
 # input('size of corpora')
-for line in corpora:
-    if line[1] in labels_tmp:
-        labels.append(line[1])
-        sentences.append(line[0])
-        for ch in line[0]:
-            if (ch not in chars):
-                chars.add(ch)
-# print(chars, len(chars))
+for line, label in zip(corpora, labels_all):
+    if label in labels_tmp:
+        sentences.append(line)
+        labels.append(label)
+        tmp = set(line)
+        chars = set(chain(chars, tmp))
 chars = sorted(list(chars))
 char_to_index_dict={item.strip():i+1 for i,item in enumerate(chars)}
 index_to_char_dict={i+1:item.strip() for i,item in enumerate(chars)}
@@ -111,8 +109,12 @@ encoded_Y = encoder.transform(labels)
 # convert integers to dummy variables (i.e. one hot encoded)
 labels_one_hot = np_utils.to_categorical(encoded_Y)
+print(word_sequence.shape, labels_one_hot.shape)
+print(type(word_sequence), type(labels_one_hot))
+input('bla')
-X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.05, random_state=777, stratify=True)
+X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.05, random_state=777, stratify=labels)
 print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test))
 try:
...
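For readability, here is a small self-contained sketch of the refactored preparation logic above, with invented toy certificate lines and ICD-10 codes standing in for the output of dataLoader.prepareDictionaries():

# Sketch of the refactored logic: filter out labels that occur only once,
# build the character vocabulary, and split with stratification on the labels.
from collections import Counter
from itertools import chain
from sklearn.model_selection import train_test_split

# toy stand-ins for the (certificate line, ICD-10 code) pairs from prepareDictionaries()
corpora = ["abces du poumon", "abces pulmonaire", "infarctus du myocarde",
           "infarctus aigu du myocarde", "arret cardiaque", "insuffisance cardiaque",
           "choc septique"]
labels_all = ["J85", "J85", "I21", "I21", "I46", "I46", "A41"]

labels_c = Counter(labels_all)
labels_tmp = {k: v for k, v in labels_c.items() if v > 1}      # drops the singleton class "A41"

sentences, labels, chars = [], [], set()
for line, label in zip(corpora, labels_all):
    if label in labels_tmp:
        sentences.append(line)
        labels.append(label)
        chars = set(chain(chars, set(line)))                   # accumulate the character vocabulary

chars = sorted(chars)
char_to_index_dict = {c: i + 1 for i, c in enumerate(chars)}   # index 0 reserved for padding

# stratify expects the label array itself, not a boolean flag; the real script
# uses test_size=0.05 on the full corpus, 0.5 here only to fit the toy data
X_train, X_test, Y_train, Y_test = train_test_split(
    sentences, labels, test_size=0.5, random_state=777, stratify=labels)
print("Prepared data:", len(X_train), len(Y_train), len(X_test), len(Y_test))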
-\nj{TODO: Insert text!}
+In this section we present the experiments and the results obtained with the two developed models, both individually and combined in the proposed pipeline.
+As mentioned in Section \ref{sec:methods}, the proposed pipeline combines two NN models.
+
+\subsection{Available datasets}
+The CLEF eHealth 2018 Task 1 participants were provided with annotated death certificates for the three languages: French, Italian and Hungarian.
+
+\subsection{Named Entity Recognition}
+To identify tokens that are candidates for a death cause, we focused on the use of a sequence-to-sequence model.
+The
+
+\subsection{Named Entity Normalization}
+
+\subsection{Final Pipeline}
...
 \nj{TODO: Insert text!}
+In this paper we tackled the problem of extracting causes of death in a multilingual environment.
+The proposed solution focuses on language-independent models and relies on word embeddings for each of the languages.
+The proposed pipeline is divided into two steps: (1) first, tokens describing a possible death cause are generated using a sequence-to-sequence model with an attention mechanism; (2) the generated token sequence is then normalized to a possible ICD-10 code.
+We identified several issues with the proposed pipeline; these also point to prospective future work.
+The word embeddings we used are not optimized for the problem domain but are trained on general text.
+The shared embedding space is currently defined as a concatenation of the per-language word embedding models for individual tokens.
+In this respect, we see several possible improvements to the proposed pipeline.
+First, the use of in-domain target-language embeddings as initial token embeddings: although this was our initial approach, finding adequate in-domain corpora for the selected languages proved too hard to tackle.
+Second, our current embedding space is merely a concatenation of the three target-language embeddings.
+Creating a unified embedding space would yield a truly language-independent token representation.
+Additionally, it has been shown that in-domain embeddings improve the quality of the achieved results.
+This will be the main focus of our future work.
+The normalization step also suffered from a lack of adequate training data.
+Unfortunately, we were unable to obtain ICD-10 dictionaries for all languages and therefore cannot guarantee the completeness of the ICD-10 label space.
+A final downside of the proposed pipeline is the lack of support for multi-label classification.
...
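As a purely illustrative reading of the concatenation-based embedding space mentioned above (the embedding models and their dimensionality are not part of this commit), a shared token representation could be assembled roughly as follows; the lookup tables fr_emb, it_emb, hu_emb and the dimension of 100 are invented for the sketch:

# Hypothetical sketch: build a "shared" vector for a token by concatenating its
# per-language embeddings, using zero vectors where the token is out of vocabulary.
# The dictionaries fr_emb / it_emb / hu_emb and the dimension are invented here.
import numpy as np

dim = 100
fr_emb = {"pneumonie": np.random.rand(dim)}        # stand-in for the French model
it_emb = {"polmonite": np.random.rand(dim)}        # stand-in for the Italian model
hu_emb = {"tudogyulladas": np.random.rand(dim)}    # stand-in for the Hungarian model

def shared_vector(token):
    parts = [emb.get(token, np.zeros(dim)) for emb in (fr_emb, it_emb, hu_emb)]
    return np.concatenate(parts)                   # 3 * dim language-independent representation

print(shared_vector("pneumonie").shape)            # (300,)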
@@ -42,18 +42,13 @@ Bioinformatics, \\ Berlin, Germany\\
 \maketitle % typeset the header of the contribution
 %
 \begin{abstract}
 This paper describes the participation of the WBI team in the CLEF eHealth 2018 shared task 1 (``Multilingual Information Extraction - ICD-10 coding'').
 Our approach builds on two recurrent neural network models to extract and classify causes of death from French, Italian and Hungarian death certificates.
 First, we employ an LSTM-based sequence-to-sequence model to obtain a symptom name from each death certificate line.
 We then utilize a bidirectional LSTM model with an attention mechanism to assign the respective ICD-10 codes to the received symptom names.
 Our model achieves \ldots

 \keywords{ICD-10 coding \and Biomedical information extraction \and Multi-lingual sequence-to-sequence model \and Representation learning \and Attention mechanism}
 \end{abstract}
@@ -64,6 +59,7 @@
 \input{20_related_work}
 \section{Methods}
+\label{sec:methods}
 \input{30_methods_intro}
 \input{31_methods_seq2seq}
 \input{32_methods_icd10}
...