From 935f6903873495355175131b143cc0b189848719 Mon Sep 17 00:00:00 2001
From: Jurica Seva <seva@informatik.hu-berlin.de>
Date: Thu, 24 May 2018 14:43:24 +0200
Subject: [PATCH] Started writeup.

---
 code_jurica/classificationICD10_attention.py  |  7 +++--
 .../classificationICD10_attention_char.py     | 30 ++++++++++---------
 paper/40_experiments.tex                      | 16 +++++++++-
 paper/50_conclusion.tex                       | 24 ++++++++++++++-
 paper/wbi-eclef18.tex                         | 18 +++++------
 5 files changed, 66 insertions(+), 29 deletions(-)

diff --git a/code_jurica/classificationICD10_attention.py b/code_jurica/classificationICD10_attention.py
index fa74fd4..6ee8659 100644
--- a/code_jurica/classificationICD10_attention.py
+++ b/code_jurica/classificationICD10_attention.py
@@ -16,7 +16,7 @@ from keras import backend as K
 from keras.preprocessing.sequence import pad_sequences
 from keras.preprocessing.text import Tokenizer
 from keras.optimizers import Adam
-from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
+from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger, TensorBoard
 from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional
 from keras.models import Model
 from keras.utils import multi_gpu_model, np_utils
@@ -38,6 +38,8 @@ config.gpu_options.allocator_type='BFC'
 sess = tf.Session(graph=tf.get_default_graph(), config=config)
 K.set_session(sess)
 
+tbCallBack = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)
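+# Logs per-epoch training metrics and the model graph to ./Graph; inspect with `tensorboard --logdir ./Graph`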
+
 callbacks_list=[
     EarlyStopping(
         monitor='val_loss',
@@ -52,7 +54,8 @@ callbacks_list=[
     CSVLogger(
         append=True,
         filename='logs/icd10Classification_attention_duplicated_{}.csv'.format(date_label),
-    )
+    ),
+    tbCallBack
 ]
 
 latent_dim = 512
diff --git a/code_jurica/classificationICD10_attention_char.py b/code_jurica/classificationICD10_attention_char.py
index 8d9d2f2..238e3b5 100644
--- a/code_jurica/classificationICD10_attention_char.py
+++ b/code_jurica/classificationICD10_attention_char.py
@@ -9,6 +9,7 @@ import traceback
 from collections import Counter
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import train_test_split
+from itertools import chain
 
 from keras.preprocessing.sequence import pad_sequences
 from keras.preprocessing.text import Tokenizer
@@ -22,6 +23,7 @@ import tensorflow as tf
 
 from _layers import AttentionWithContext, Attention
 
+
 ###################################
 # TensorFlow wizardry
 config=tf.ConfigProto()
@@ -57,25 +59,21 @@ chars = set()
 # tokenizer=TokenizePreprocessor()
 kerasTokenizer = Tokenizer(char_level=True, filters=None)
 dataLoader=prepareData()
-corpora=dataLoader.prepareDictionaries()
-tmp =[x[1] for x in corpora]
-labels_c = Counter(tmp)
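+# prepareDictionaries() now returns the text lines and their labels as two parallel sequences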
+corpora, labels_all =dataLoader.prepareDictionaries()
+labels_c = Counter(labels_all)
 labels_tmp = {k:v for k,v in labels_c.items() if v > 1}
 
-
 # input('size of corpora')
 
-for line in corpora:
-    if line[1] in labels_tmp:
-        labels.append(line[1])
-        sentences.append(line[0])
-
-        for ch in line[0]:
-            if (ch not in chars):
-                chars.add(ch)
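+# Keep only samples whose label occurs more than once and collect the set of characters seen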
+for line, label in zip(corpora, labels_all):
+    if label in labels_tmp:
+        sentences.append(line)
+        labels.append(label)
+        tmp =  set(line)
+        chars  = set(chain(chars,tmp))
 
+# print(chars, len(chars))
 chars = sorted(list(chars))
-
 char_to_index_dict={item.strip():i+1 for i,item in enumerate(chars)}
 index_to_char_dict={i+1:item.strip() for i,item in enumerate(chars)}
 
@@ -111,8 +109,12 @@ encoded_Y = encoder.transform(labels)
 
 # convert integers to dummy variables (i.e. one hot encoded)
 labels_one_hot = np_utils.to_categorical(encoded_Y)
+# Debug output: shapes and types of the prepared input and label matrices
+print(word_sequence.shape, labels_one_hot.shape)
+print(type(word_sequence), type(labels_one_hot))
+
 
-X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.05, random_state=777, stratify=True)
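+# stratify expects the class labels themselves (not a boolean); splitting on `labels` keeps class proportions in both sets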
+X_train, X_test, Y_train, Y_test = train_test_split(word_sequence, labels_one_hot, test_size=0.05, random_state=777, stratify=labels)
 print("Prepared data: ", len(X_train), len(Y_train), len(X_test), len(Y_test))
 
 try:
diff --git a/paper/40_experiments.tex b/paper/40_experiments.tex
index cfe2e53..9c3016a 100644
--- a/paper/40_experiments.tex
+++ b/paper/40_experiments.tex
@@ -1 +1,15 @@
-\nj{TODO: Insert text!}
\ No newline at end of file
+In this section we present the experiments and the obtained results for the two developed models, both individually and combined in the proposed pipeline.
+As mentioned in Section \ref{sec:methods}, the proposed pipeline combines two neural network models.
+
+\subsection{Available datasets}
+The CLEF eHealth 2018 Task 1 participants were provided with annotated death certificates for the three supported languages: French, Italian and Hungarian.
+
+\subsection{Named Entity Recognition}
+To identify possible tokens as candidates for the death cause, we focus on the use of a sequence-to-sequence model.
+The
+
+\subsection{Named Entity Normalization}
+
+\subsection{Final Pipeline}
+
+
diff --git a/paper/50_conclusion.tex b/paper/50_conclusion.tex
index cfe2e53..c20c770 100644
--- a/paper/50_conclusion.tex
+++ b/paper/50_conclusion.tex
@@ -1 +1,23 @@
-\nj{TODO: Insert text!}
\ No newline at end of file
+\nj{TODO: Insert text!}
+In this paper we tackled the problem of extracting causes of death in a multilingual environment.
+The proposed solution focuses on language-independent models and relies on word embeddings for each of the languages.
+The proposed pipeline is divided into two steps: (1) first, tokens describing the cause of death are generated using a sequence-to-sequence model with an attention mechanism; (2) the generated token sequence is then normalized to a possible ICD-10 code.
+
+We detected several issues with the proposed pipeline; these also point to prospective future work.
+The word embeddings we used are not optimized for the problem domain but are trained on general text.
+The mutual embedding space is currently defined as a concatenation of the word embedding models for the individual tokens.
+In this respect, we identified several possible improvements to the proposed pipeline.
+First, in-domain target-language embeddings could be used as the initial token embeddings.
+Although this was our initial approach, finding adequate in-domain corpora for the selected languages proved too hard to tackle.
+Our current embedding space is merely a concatenation of the three target-language embeddings.
+Second, creating a unified embedding space would yield a truly language-independent token representation.
+Additionally, it has been shown that in-domain embeddings improve the quality of the achieved results; this will be the main focus of our future work.
+The normalization step also suffered from a lack of adequate training data.
+Unfortunately, we were unable to obtain ICD-10 dictionaries for all languages and can therefore not guarantee the completeness of the ICD-10 label space.
+A final downside of the proposed pipeline is the lack of support for multi-label classification.
+
+
+
+
+
+
diff --git a/paper/wbi-eclef18.tex b/paper/wbi-eclef18.tex
index 7b6b66c..0a94565 100644
--- a/paper/wbi-eclef18.tex
+++ b/paper/wbi-eclef18.tex
@@ -42,18 +42,13 @@ Bioinformatics, \\ Berlin, Germany\\
 \maketitle              % typeset the header of the contribution
 % 
 \begin{abstract}
-This paper describes the participation of the WBI team in the CLEF eHealth 2018
-shared task 1 (``Multilingual Information Extraction - ICD-10 coding''). Our
-approach builds on two recurrent neural networks models to extract and classify
-causes of death from French, Italian and Hungarian death certificates. First, we
-employ a LSTM-based sequence-to-sequence model to obtain a symptom name from each
-death certificate line. We then utilize a bidirectional LSTM model with
-attention mechanism to assign the respective ICD-10 codes to the received
-symptom names. Our model achieves \ldots
- 
+This paper describes the participation of the WBI team in the CLEF eHealth 2018 shared task 1 (``Multilingual Information Extraction - ICD-10 coding'').
+Our approach builds on two recurrent neural network models to extract and classify causes of death from French, Italian and Hungarian death certificates.
+First, we employ an LSTM-based sequence-to-sequence model to obtain a symptom name from each death certificate line.
+We then utilize a bidirectional LSTM model with an attention mechanism to assign the respective ICD-10 codes to the extracted symptom names.
+Our model achieves \ldots
 
-\keywords{ICD-10 coding \and Biomedical information extraction \and Multi-lingual sequence-to-sequence 
-model \and Represention learning \and Attention mechanism}
+\keywords{ICD-10 coding \and Biomedical information extraction \and Multi-lingual sequence-to-sequence model \and Representation learning \and Attention mechanism}
   
 \end{abstract}  
 
@@ -64,6 +59,7 @@ model \and Represention learning \and Attention mechanism}
 \input{20_related_work} 
  
 \section{Methods}
+\label{sec:methods}
 \input{30_methods_intro}
 \input{31_methods_seq2seq}
 \input{32_methods_icd10} 
-- 
GitLab