From 77b166c89475f86da218bd80c276db215ab36df3 Mon Sep 17 00:00:00 2001
From: Jurica Seva <seva@informatik.hu-berlin.de>
Date: Thu, 31 May 2018 12:26:28 +0200
Subject: [PATCH] Final version. All info included in the writing. Fixed
 typos. Added 4.4. Formatted tables et al.

---
 code_jurica/util.py      |  9 ++++--
 paper/40_experiments.tex | 66 ++++++++++++++++++++++++++++++----------
 2 files changed, 57 insertions(+), 18 deletions(-)

diff --git a/code_jurica/util.py b/code_jurica/util.py
index ecca41d..4eafea5 100644
--- a/code_jurica/util.py
+++ b/code_jurica/util.py
@@ -18,7 +18,7 @@ import keras
 from keras.preprocessing.sequence import pad_sequences
 from collections import Counter
 import random
 import os
 
 #REPRODUCIBLE
 np.random.seed(42)
@@ -330,6 +330,7 @@ class prepareData():
         random.shuffle(preparedDictionary)
         corpus = [item[0] for item in preparedDictionary]
         labels = [item[1] for item in preparedDictionary]
+        print(Counter(labels))  # label distribution, for a quick sanity check
         return corpus, labels
 
 class KerasBatchGenerator(keras.utils.Sequence):
@@ -407,4 +408,8 @@ class KerasBatchGeneratorClassification(keras.utils.Sequence):
             batch_labels = self.labeler.transform(batch_labels)
 
             self.current_idx += 1
-            yield batch_source, batch_labels
\ No newline at end of file
+            yield batch_source, batch_labels
+            
+# Example usage, kept commented out so importing this module has no side effects:
+# dataLoader = prepareData()
+# corpus, labels = dataLoader.prepareDictionaries(unbalanced=False, oversampled=False)  # balanced setting
+# corpus, labels = dataLoader.prepareDictionaries(unbalanced=True, oversampled=False)   # full setting
diff --git a/paper/40_experiments.tex b/paper/40_experiments.tex
index 2f96df9..35a5dd1 100644
--- a/paper/40_experiments.tex
+++ b/paper/40_experiments.tex
@@ -12,12 +12,13 @@ Due to time constraints during development no cross-validation to optimize the (
 We either keep the default values of the hyper-parameters or set them to reasonable values according to existing work. 
 During model training we shuffle the training instances and use varying validation instances to validate each epoch.
 
-As representation for the input tokens of the model we use pre-trained fastText word embeddings. % \cite{bojanowski_enriching\_2016}. The embeddings were trained on Common Crawl and Wikipedia articles. 
-Embeddings' were trained using the following parameter settings: CBOW with position-weights, embedding dimension size 300, with character n-grams of length 5, a window of size 5 and 10 negatives. 
+%As representation for the input tokens of the model we use 
+The pre-trained fastText WEs % \cite{bojanowski_enriching_2016}; the embeddings were trained on Common Crawl and Wikipedia articles.
+were trained using the following parameter settings: CBOW with position-weights, an embedding dimension of 300, character n-grams of length 5, a window size of 5 and 10 negative samples. 
 Unfortunately, they are trained on corpora unrelated to the biomedical domain and therefore do not represent the best possible textual basis for an embedding space for biomedical information extraction. 
 The final embedding space used by our models is created by concatenating the individual embedding vectors for all three languages. 
 Thus, the input to our model is an embedding vector of size 900. 
-All models were implemented with the Keras library \footnote{\url{https://keras.io/}}.% in Version X.X.
+All models were implemented with the Keras\footnote{\url{https://keras.io/}} library.% in Version X.X.
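+%A minimal sketch (not the authors' actual code) of how the 900-dimensional input
+%vector could be assembled, assuming the fasttext Python package and three
+%pre-trained per-language models with placeholder paths:
+%  import fasttext
+%  import numpy as np
+%  models = [fasttext.load_model(p) for p in ('fr.bin', 'hu.bin', 'it.bin')]
+%  def embed(token):
+%      # one 300-dim vector per language, concatenated to a 900-dim input
+%      return np.concatenate([m.get_word_vector(token) for m in models])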
 
 \subsection{Death cause extraction model} 
 To identify possible candidates for a death cause description, we focus on the use of an encoder-decoder model. 
@@ -29,7 +30,7 @@ This decoding process continues until a special end token is generated.
 The entire model is optimized using the Adam optimization algorithm \cite{kingma_adam:_2014} and a batch size of 700. 
 Model training was performed for at most 100 epochs or until an early stopping criterion was met (no change in validation loss for two epochs).
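+%A hedged Keras sketch of this training setup; model, x_train etc. are
+%assumptions, only the optimizer, batch size and patience come from the text:
+%  from keras.callbacks import EarlyStopping
+%  model.compile(optimizer='adam', loss='categorical_crossentropy')
+%  model.fit(x_train, y_train, batch_size=700, epochs=100,
+%            validation_data=(x_val, y_val),
+%            callbacks=[EarlyStopping(monitor='val_loss', patience=2)])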
 
-As the available dataset are imbalanced concerning the different languages, we devised two different evaluation settings: (1) DCEM-Balanced, where each language was supported by 49.823 randomly drawn instances (size of the smallest corpus) and (2) DCEM-Full, where all available data is used. 
+As the provided datasets are imbalanced regarding the task's languages, we devised two different evaluation settings: (1) DCEM-Balanced, where each language is supported by 49.823 randomly drawn instances (the size of the smallest corpus), and (2) DCEM-Full, where all available data is used. 
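+%The balanced setting can be sketched as follows (corpora is a hypothetical
+%mapping from language to its training instances):
+%  import random
+%  n = min(len(c) for c in corpora.values())  # 49.823, the smallest corpus
+%  balanced = {lang: random.sample(c, n) for lang, c in corpora.items()}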
 The results obtained on the training and validation set are shown in Table \ref{tab:s2s}. 
 The figures reveal that the distribution of training instances per language has a strong influence on the performance of the model. 
 The model trained on the full training data achieves an accuracy of 0.678 on the validation set. 
@@ -61,13 +62,16 @@ Thereafter an attention layer builds an adaptive weighted average over all LSTM
 The respective ICD-10 code is then determined by a dense layer with a softmax activation function. 
 We use the Adam optimizer to perform model training. 
 The model was validated on 25\% of the data. 
-As for the extraction model, no cross-validation or hyper-parameter optimization was performed due to time constraints during development.
+As for the extraction model, no cross-validation or hyper-parameter optimization was performed.% due to time constraints during development.
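+%One way to realize the described attention pooling in Keras (a sketch; the
+%layer size, max_len and n_codes are assumptions, not reported values):
+%  from keras.models import Model
+%  from keras.layers import Input, LSTM, Dense, Softmax, Lambda
+%  from keras import backend as K
+%  inp = Input(shape=(max_len, 900))                  # concatenated WEs
+%  h = LSTM(256, return_sequences=True)(inp)          # all LSTM states
+%  a = Softmax(axis=1)(Dense(1)(h))                   # attention weights over time
+%  ctx = Lambda(lambda t: K.sum(t[0] * t[1], axis=1))([h, a])  # weighted average
+%  out = Dense(n_codes, activation='softmax')(ctx)    # ICD-10 distribution
+%  model = Model(inp, out)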
 
 Once again, we devised two approaches. This was mainly caused by the lack of adequate training data in terms of coverage for individual ICD-10 codes.
 Therefore, we defined two training data settings: (1) minimal (ICD-10\_Minimal), where only ICD-10 codes with two or more supporting training instances are used. 
+This leaves us with 6.857 unique ICD-10 codes and discards 2.238 codes that have only one supporting instance. 
 This, of course, minimizes the number of ICD-10 codes in the label space. 
 Therefore, (2) an extended (ICD-10\_Extended) dataset was defined. Here, the original ICD-10 code mappings found in the supplied dictionaries are extended with the training instances from the individual certificate data of the three languages. 
+This generates 9.591 unique ICD-10 codes. 
 Finally, for the remaining ICD-10 codes that have only one supporting diagnosis text or death cause description, we duplicate those data points. 
+
 The goal of this approach is to extend our possible label space to all available ICD-10 codes. 
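+%The two label-space settings can be sketched via support counts (pairs is a
+%hypothetical list of (text, code) instances; the extended setting additionally
+%merges in the certificate data before duplication):
+%  from collections import Counter
+%  support = Counter(code for _, code in pairs)
+%  minimal = [p for p in pairs if support[p[1]] >= 2]           # 6.857 codes
+%  extended = pairs + [p for p in pairs if support[p[1]] == 1]  # duplicate singletons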
 The results obtained from the two approaches on the validation set are shown in Table \ref{tab:icd10Classification}. 
 Using the \textit{minimal} data set the model achieves an accuracy of 0.937. 
@@ -86,25 +90,21 @@ In contrast, using the extended data set the model reaches an accuracy of 0.954
 \hline
 ICD-10\_Minimal &  69 & 0.925 & 0.190 & 0.937 & 0.169 \\
 \hline
-ICD-10\_Extended &  41 & 0.950 & 0.156 & 0.954 & 0.141 \\
+ICD-10\_Extended\textbf{*} &  41 & 0.950 & 0.156 & 0.954 & 0.141 \\
 %Character & Minimal &   91 & 0.732 & 1.186 & 0.516 & 2.505 \\
 \bottomrule
 \end{tabularx}
 \caption{Experiment results for our ICD-10 classification model regarding different data settings. 
 The \textit{Minimal} setting uses only ICD-10 codes with two or more training instances in the supplied dictionary. 
-In contrast, \textit{Extended} additionally takes the diagnosis texts from the certificate data and duplicates ICD-10 training instances with only one diagnosis text in the dictionary and certificate lines.}
+In contrast, \textit{Extended} additionally takes the diagnosis texts from the certificate data into account and duplicates training instances of ICD-10 codes with only one diagnosis text across dictionary and certificate lines. \textbf{*} Used in the final pipeline.}
 \end{table}
 
 \subsection{Complete Pipeline}
-The two models where combined to create the final pipeline. 
-We tested both neural models in the final pipeline, as their performance differs greatly. 
-As both ICD-10 classification models perform similarly, we used the word and extended ICD-10 classification model in the final pipeline. 
-The results obtained during training are presented in Table \ref{tab:final_train}. 
-Results obtained on the evaluation dataset are shown in Table \ref{tab:final_test}.
 
-\begin{table}[]
+\begin{table}[t!]
 \centering
-\begin{tabular}{|l|c|c|c|}
+\begin{tabular}{l|c|c|c}
 \toprule
 \textbf{Model} &  \textbf{Precision} & \textbf{Recall} & \textbf{F-score} \\
 \hline
@@ -114,8 +114,17 @@ Final-Full & 0.74 & 0.62 & 0.63 \\
 \bottomrule
 \end{tabular}
 \caption{Final pipeline performance on the training data. Final-Balanced = DCEM-Balanced + ICD-10\_Extended; Final-Full = DCEM-Full + ICD-10\_Extended.}
 \label{tab:final_train}
 \end{table}
+The two models were combined to create the final pipeline. 
+We tested both death cause extraction models (DCEM-Balanced and DCEM-Full) in the final pipeline, as their performance differs greatly. 
+As both ICD-10 classification models perform similarly, we used the extended ICD-10 classification model, with word-level tokens\footnote{Although models supporting character-level tokens were developed and evaluated, their performance fared poorly compared to the word-level tokens.}, in the final pipeline. 
+The results obtained during training are presented in Table \ref{tab:final_train}. 
+
+Although the results of the individual models, as shown in Tables \ref{tab:s2s} and \ref{tab:icd10Classification}, are promising, the final pipeline decreases their performance on the hold-out dataset created during training. %, by roughly a third. 
+This can be attributed to several factors, with the very imbalanced distribution of supporting ICD-10 codes in the data provided by the organizers being the most influential one. 
+This severely impacts the encoder-decoder architecture used here, as the token generation is biased towards the available data points. 
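+%A schematic of the final pipeline (method names are hypothetical; the two
+%stages correspond to the models described above):
+%  def predict_icd10(certificate_line):
+%      death_cause = extraction_model.generate(certificate_line)  # S2S decoding
+%      return icd10_model.predict(death_cause)                    # ICD-10 classification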
 
 \begin{table}[]
 \centering
@@ -154,4 +163,29 @@ Final-Full & 0.74 & 0.62 & 0.63 \\
 \label{tab:final_test}
 \end{table}
 
-
+Results obtained on the evaluation dataset with the two officially submitted runs are shown in Table \ref{tab:final_test}. 
+All of our approaches perform below the mean and median of all participants. 
+Surprisingly, there is a substantial difference between the results obtained for the individual languages. 
+%This hints towards the unsuitability of out-of-domain WEs. 
+This confirms our assumptions about the (un)suitability of the proposed multi-lingual embedding space for this task. 
+The results also indicate that the size of the training corpora does not determine the final results. 
+The best results were obtained on the Italian dataset, which has the smallest training corpus. 
+The worst results were obtained on the mid-sized French corpus, while the largest, Hungarian, corpus ranks second. 
+
+We identified several possible reasons for the obtained results. 
+These also represent (possible) points for future work. 
+We identify the quality of the used WEs, as well as the properties of the proposed language-independent embedding space, as the main disadvantages of our approach. 
+The use of out-of-domain WEs, as expected, proved to be a suboptimal solution to this problem. 
+Although we tried to alleviate this by finding suitable external corpora to train domain-dependent WEs for each of the supported languages, we were unable to find any significant amount of in-domain documents (e.g. a PubMed search for abstracts in French, Hungarian or Italian found 7.843, 786 and 1.659 articles, respectively). 
+Combined with the simple concatenation of the three WE representations of individual tokens into a language-independent embedding space, we see work on language-independent WEs as the main focus of future work. 
+%This point will be the main focus of future work on this problem. 
+Besides the issues with the used WEs, the inability to obtain full ICD-10 dictionaries for the selected languages has also negatively influenced the results. 
+As a final limitation of our approach, the lack of multi-label classification support has also been identified (i.e. our pipeline cannot recognize more than one death cause in a single input text). 
+
+%Our problems:
+%1 out-of-domain WEs
+%2 language-independent embedding space is very elementary; this will be the emphasis of our future work
+%3 no additional training data (either for WEs or as data points) found and incorporated
+%%4 no attention in the s2s model; we could still try CapsNet
+%4 somewhat limited set of supported ICD-10 codes (e.g. we did not have the full dictionaries)
+%5 no multi-label classification supported
\ No newline at end of file
-- 
GitLab