Add initial version of related work + minor adaptions

b7cc2cd2 · Mario Sänger · 6daea87a · b7cc2cd2 · b7cc2cd2 · b7cc2cd2
Commit b7cc2cd2 authored 6 years ago by Mario Sänger
--- a/paper/10_introduction.tex
+++ b/paper/10_introduction.tex
@@ -4,7 +4,11 @@ health documents, is a highly important task to support many applications in
 research, daily clinical routine and policy-making. Computer-aided approaches
 can improve decision making and support clinical processes, for example, by
 giving a more sophisticated overview about a research area, providing detailed
-information about the aetiopathology of a patient or disease patterns.  
+information about the aetiopathology of a patient or disease patterns. In the
+past years major advances have been made in the area of natural language
+processing. However, improvements in the field of biomedical text mining lag
+behind other domains mainly due to privacy issues and concerns regarding the
+processed data (e.g. electronic health records).
 The CLEF eHealth lab attends to this circumstance through organization of
 various shared tasks which aid and support the development of approaches to

--- a/paper/20_related_work.tex
+++ b/paper/20_related_work.tex
-\nm{TODO: Insert text!}
+The ICD-10 coding task has already been carried out in the 2016
\ No newline at end of file
+\cite{neveol_clinical_2016} and 2017 \cite{neveol_clef_2017} edition of the
+eHealth lab. Participating teams used a plethora of different approaches to
+tackle the classification problem. The methods can essentially be divided into
+two categories: knowledge-based
+\cite{cabot_sibm_2016,jonnagaddala_automatic_2017,van_mulligen_erasmus_2016} and
+machine learning approaches. The former relies on lexical sources, medical
+terminologies and other ontologies to match (parts of) the certificate text with
+entries from the knowledge-bases according to a rule framework.  For example, Di
+Nunzio et al. \cite{di_nunzio_lexicon_2017} calculate a score for each ICD-10
+dictionary entry by summing the binary or tf-idf weights of each term of a
+certificate line segment and assign the ICD-10 code with the highest score. In
+contrast, Ho-Dac et al. \cite{ho-dac_litl_2017} treat the problem as an information
+retrieval task and utilize the SOLR search engine.
+The machine learning based approaches employ a variety of techniques, e.g.
+Conditional Random Fields (CRFs) \cite{ho-dac_litl_2016}, Labeled Latent
+Dirichlet Allocation (LDA) \cite{dermouche_ecstra-inserm_2016} and Support Vector Machines
+(SVMs) \cite{ebersbach_fusion_2017} with diverse hand-crafted features. Most
+similar to our approach is the work from Miftakhutdinov and Tutubalina \cite{miftakhutdinov_kfu_2017},
+which achieved the best results for English certificates in last year's
+competition. They use a neural LSTM-based encoder-decoder model that processes the raw
+certificate text as input and encodes it into a vector representation.
+Furthermore, a vector which captures the textual similarity between the
+certificate line and the symptoms or diagnoses of the individual ICD-10 codes
+is used to integrate prior knowledge into the model. The concatenation of both
+vector representations is then used to output the characters and numbers of the
+ICD-10 code in the decoding step. In contrast to their work, our approach 
+introduces a model for multi-language ICD-10 classification. We utilize two
+separate recurrent neural networks, one sequence to sequence model for symptom
+extraction and one for classification, to predict the ICD-10 codes for a
+certificate text independent of the language in which it is written.
--- a/paper/references.bib
+++ b/paper/references.bib
+@inproceedings{neveol_clef_2017,
+	title = {{CLEF} {eHealth} 2017 {Multilingual} {Information} {Extraction} task overview: {ICD}10 coding of death certificates in {English} and {French}},
+	shorttitle = {{CLEF} {eHealth} 2017 {Multilingual} {Information} {Extraction} task overview},
+	booktitle = {{CLEF} 2017 {Evaluation} {Labs} and {Workshop}: {Online} {Working} {Notes}, {CEUR}-{WS}},
+	author = {Névéol, Aurélie and Anderson, Robert N. and Cohen, K. Bretonnel and Grouin, Cyril and Lavergne, Thomas and Rey, Grégoire and Robert, Aude and Rondet, Claire and Zweigenbaum, Pierre},
+	year = {2017},
+	keywords = {Read},
+	pages = {17},
+	file = {Fulltext:/Users/mario/Zotero/storage/8QATUX6Q/Névéol et al. - 2017 - CLEF eHealth 2017 Multilingual Information Extract.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/EV2SLCV8/Névéol et al. - 2017 - CLEF eHealth 2017 Multilingual Information Extract.pdf:application/pdf}
+}
+@inproceedings{miftakhutdinov_kfu_2017,
+	title = {{KFU} at {CLEF} {eHealth} 2017 {Task} 1: {ICD}-10 {Coding} of {English} {Death} {Certificates} with {Recurrent} {Neural} {Networks}},
+	booktitle = {{CLEF} 2017 {Online} {Working} {Notes}},
+	publisher = {CEUR-WS},
+	author = {Miftakhutdinov, Zulfat and Tutubalina, Elena},
+	year = {2017},
+	keywords = {CLEF, ICD-10-Classification, Read},
+	file = {Fulltext:/Users/mario/Zotero/storage/HRZ6Q8Q6/Miftakhutdinov und Tutubalina - 2017 - Kfu at clef ehealth 2017 task 1 Icd-10 coding of .pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/J8TXTUNT/Miftakhutdinov und Tutubalina - 2017 - Kfu at clef ehealth 2017 task 1 Icd-10 coding of .pdf:application/pdf}
+}
+@inproceedings{goeuriot_clef_2017,
+	title = {{CLEF} 2017 {eHealth} {Evaluation} {Lab} {Overview}},
+	booktitle = {International {Conference} of the {Cross}-{Language} {Evaluation} {Forum} for {European} {Languages}},
+	publisher = {Springer},
+	author = {Goeuriot, Lorraine and Kelly, Liadh and Suominen, Hanna and Névéol, Aurélie and Robert, Aude and Kanoulas, Evangelos and Spijker, Rene and Palotti, Joao and Zuccon, Guido},
+	year = {2017},
+	keywords = {Read},
+	pages = {291--303},
+	file = {Fulltext:/Users/mario/Zotero/storage/EEAVXG89/Goeuriot et al. - 2017 - Clef 2017 ehealth evaluation lab overview.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/TMNBJ6YC/978-3-319-65813-1_26.html:text/html}
+}
+@article{butt_classification_2013,
+	title = {Classification of cancer-related death certificates using machine learning},
+	volume = {6},
+	issn = {1836-1935},
+	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3674421/},
+	doi = {10.4066/AMJ.2013.1654},
+	abstract = {Background
+Cancer monitoring and prevention relies on the critical aspect of timely notification of cancer cases. However, the abstraction and classification of cancer from the free-text of pathology reports and other relevant documents, such as death certificates, exist as complex and time-consuming activities.
+Aims
+In this paper, approaches for the automatic detection of notifiable cancer cases as the cause of death from free-text death certificates supplied to Cancer Registries are investigated.
+Method
+A number of machine learning classifiers were studied. Features were extracted using natural language techniques and the Medtex toolkit. The numerous features encompassed stemmed words, bi-grams, and concepts from the SNOMED CT medical terminology. The baseline consisted of a keyword spotter using keywords extracted from the long description of ICD-10 cancer related codes.
+Results
+Death certificates with notifiable cancer listed as the cause of death can be effectively identified with the methods studied in this paper. A Support Vector Machine (SVM) classifier achieved best performance with an overall Fmeasure of 0.9866 when evaluated on a set of 5,000 freetext death certificates using the token stem feature set. The SNOMED CT concept plus token stem feature set reached the lowest variance (0.0032) and false negative rate (0.0297) while achieving an F-measure of 0.9864. The SVM classifier accounts for the first 18 of the top 40 evaluated runs, and entails the most robust classifier with a variance of 0.001141, half the variance of the other classifiers.
+Conclusion
+The selection of features significantly produced the most influences on the performance of the classifiers, although the type of classifier employed also affects performance. In contrast, the feature weighting schema created a negligible effect on performance. Specifically, it is found that stemmed tokens with or without SNOMED CT concepts create the most effective feature when combined with an SVM classifier.},
+	number = {5},
+	urldate = {2018-03-16},
+	journal = {The Australasian Medical Journal},
+	author = {Butt, Luke and Zuccon, Guido and Nguyen, Anthony and Bergheim, Anton and Grayson, Narelle},
+	month = may,
+	year = {2013},
+	pmid = {23745151},
+	pmcid = {PMC3674421},
+	pages = {292--299},
+	file = {PubMed Central Full Text PDF:/Users/mario/Zotero/storage/ZCUHSCHR/Butt et al. - 2013 - Classification of cancer-related death certificate.pdf:application/pdf}
+}
+@article{koopman_automatic_2015,
+	title = {Automatic {ICD}-10 classification of cancers from free-text death certificates},
+	volume = {84},
+	issn = {1386-5056},
+	url = {http://www.sciencedirect.com/science/article/pii/S1386505615300289},
+	doi = {10.1016/j.ijmedinf.2015.08.004},
+	abstract = {Objective
+Death certificates provide an invaluable source for cancer mortality statistics; however, this value can only be realised if accurate, quantitative data can be extracted from certificates – an aim hampered by both the volume and variable nature of certificates written in natural language. This paper proposes an automatic classification system for identifying cancer related causes of death from death certificates.
+Methods
+Detailed features, including terms, n-grams and SNOMED CT concepts were extracted from a collection of 447,336 death certificates. These features were used to train Support Vector Machine classifiers (one classifier for each cancer type). The classifiers were deployed in a cascaded architecture: the first level identified the presence of cancer (i.e., binary cancer/nocancer) and the second level identified the type of cancer (according to the ICD-10 classification system). A held-out test set was used to evaluate the effectiveness of the classifiers according to precision, recall and F-measure. In addition, detailed feature analysis was performed to reveal the characteristics of a successful cancer classification model.
+Results
+The system was highly effective at identifying cancer as the underlying cause of death (F-measure 0.94). The system was also effective at determining the type of cancer for common cancers (F-measure 0.7). Rare cancers, for which there was little training data, were difficult to classify accurately (F-measure 0.12). Factors influencing performance were the amount of training data and certain ambiguous cancers (e.g., those in the stomach region). The feature analysis revealed a combination of features were important for cancer type classification, with SNOMED CT concept and oncology specific morphology features proving the most valuable.
+Conclusion
+The system proposed in this study provides automatic identification and characterisation of cancers from large collections of free-text death certificates. This allows organisations such as Cancer Registries to monitor and report on cancer mortality in a timely and accurate manner. In addition, the methods and findings are generally applicable beyond cancer classification and to other sources of medical text besides death certificates.},
+	number = {11},
+	urldate = {2018-03-16},
+	journal = {International Journal of Medical Informatics},
+	author = {Koopman, Bevan and Zuccon, Guido and Nguyen, Anthony and Bergheim, Anton and Grayson, Narelle},
+	month = nov,
+	year = {2015},
+	pages = {956--965},
+	file = {ScienceDirect Full Text PDF:/Users/mario/Zotero/storage/P8HLCZWK/Koopman et al. - 2015 - Automatic ICD-10 classification of cancers from fr.pdf:application/pdf;ScienceDirect Snapshot:/Users/mario/Zotero/storage/X3AKYDDI/S1386505615300289.html:text/html}
+}
 @inproceedings{sutskever_sequence_2014,
 	title = {Sequence to sequence learning with neural networks},
 	booktitle = {Advances in neural information processing systems},
@@ -132,4 +220,102 @@
 	year = {2014},
 	pages = {1724--1734},
 	file = {Full Text PDF:/Users/mario/Zotero/storage/4NE9THT8/Cho et al. - 2014 - Learning Phrase Representations using RNN Encoder–.pdf:application/pdf}
+}
+@article{neveol_clinical_2016,
+	title = {Clinical {Information} {Extraction} at the {CLEF} {eHealth} {Evaluation} lab 2016},
+	volume = {1609},
+	issn = {1613-0073},
+	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5756095/},
+	abstract = {This paper reports on Task 2 of the 2016 CLEF eHealth evaluation lab which extended the previous information extraction tasks of ShARe/CLEF eHealth evaluation labs. The task continued with named entity recognition and normalization in French narratives, as offered in CLEF eHealth 2015. Named entity recognition involved ten types of entities including disorders that were defined according to Semantic Groups in the Unified Medical Language System® (UMLS®), which was also used for normalizing the entities. In addition, we introduced a large-scale classification task in French death certificates, which consisted of extracting causes of death as coded in the International Classification of Diseases, tenth revision (ICD10). Participant systems were evaluated against a blind reference standard of 832 titles of scientific articles indexed in MEDLINE, 4 drug monographs published by the European Medicines Agency (EMEA) and 27,850 death certificates using Precision, Recall and F-measure. In total, seven teams participated, including five in the entity recognition and normalization task, and five in the death certificate coding task. Three teams submitted their systems to our newly offered reproducibility track. For entity recognition, the highest performance was achieved on the EMEA corpus, with an overall F-measure of 0.702 for plain entities recognition and 0.529 for normalized entity recognition. For entity normalization, the highest performance was achieved on the MEDLINE corpus, with an overall F-measure of 0.552. For death certificate coding, the highest performance was 0.848 F-measure.},
+	urldate = {2018-05-23},
+	journal = {CEUR workshop proceedings},
+	author = {Névéol, Aurélie and Cohen, K. Bretonnel and Grouin, Cyril and Hamon, Thierry and Lavergne, Thomas and Kelly, Liadh and Goeuriot, Lorraine and Rey, Grégoire and Robert, Aude and Tannier, Xavier and Zweigenbaum, Pierre},
+	month = sep,
+	year = {2016},
+	pmid = {29308065},
+	pmcid = {PMC5756095},
+	pages = {28--42},
+	file = {PubMed Central Full Text PDF:/Users/mario/Zotero/storage/ZWWRZSZK/Névéol et al. - 2016 - Clinical Information Extraction at the CLEF eHealt.pdf:application/pdf}
+}
+@inproceedings{di_nunzio_lexicon_2017,
+	title = {A {Lexicon} {Based} {Approach} to {Classification} of {ICD}10 {Codes}. {IMS} {Unipd} at {CLEF} {eHealth} {Task}},
+	booktitle = {{CLEF} 2017 {Online} {Working} {Notes}},
+	publisher = {CEUR-WS},
+	author = {Di Nunzio, Giorgio Maria and Beghini, Federica and Vezzani, Federica and Henrot, Genevieve},
+	year = {2017},
+	file = {Fulltext:/Users/mario/Zotero/storage/HGHINDH3/Di Nunzio et al. - A Lexicon Based Approach to Classification of ICD1.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/LWSDB84Q/Di Nunzio et al. - A Lexicon Based Approach to Classification of ICD1.pdf:application/pdf}
+}
+@inproceedings{cabot_sibm_2016,
+	title = {{SIBM} at {CLEF} {eHealth} {Evaluation} {Lab} 2016: {Extracting} {Concepts} in {French} {Medical} {Texts} with {ECMT} and {CIMIND}},
+	booktitle = {{CLEF} 2016 {Online} {Working} {Notes}},
+	publisher = {CEUR-WS},
+	author = {Cabot, Chloé and Soualmia, Lina F. and Dahamna, Badisse and Darmoni, Stéfan J.},
+	year = {2016},
+	file = {Fulltext:/Users/mario/Zotero/storage/E4ZADEMU/Cabot et al. - SIBM at CLEF eHealth Evaluation Lab 2016 Extracti.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/C9ADYETR/Cabot et al. - SIBM at CLEF eHealth Evaluation Lab 2016 Extracti.pdf:application/pdf}
+}
+@inproceedings{van_mulligen_erasmus_2016,
+	title = {Erasmus {MC} at {CLEF} {eHealth} 2016: {Concept} {Recognition} and {Coding} in {French} {Texts}},
+	shorttitle = {Erasmus {MC} at {CLEF} {eHealth} 2016},
+	booktitle = {{CLEF} 2016 {Online} {Working} {Notes}},
+	publisher = {CEUR-WS},
+	author = {van Mulligen, Erik M. and Afzal, Zubair and Akhondi, Saber A. and Vo, Dang and Kors, Jan A.},
+	year = {2016},
+	file = {Fulltext:/Users/mario/Zotero/storage/AT3LSRP4/van Mulligen et al. - Erasmus MC at CLEF eHealth 2016 Concept Recogniti.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/5NNCIN2V/van Mulligen et al. - Erasmus MC at CLEF eHealth 2016 Concept Recogniti.pdf:application/pdf}
+}
+@inproceedings{mottin_bitem_2016,
+	title = {{BiTeM} at {CLEF} {eHealth} {Evaluation} {Lab} 2016 {Task} 2: {Multilingual} {Information} {Extraction}.},
+	booktitle = {{CLEF} 2016 {Online} {Working} {Notes}},
+	publisher = {CEUR-WS},
+	author = {Mottin, Luc and Gobeill, Julien and Mottaz, Anaïs and Pasche, Emilie and Gaudinat, Arnaud and Ruch, Patrick},
+	year = {2016},
+	file = {Fulltext:/Users/mario/Zotero/storage/LF9UGCQZ/Mottin et al. - 2016 - BiTeM at CLEF eHealth Evaluation Lab 2016 Task 2 .pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/BRRDY3TV/Mottin et al. - 2016 - BiTeM at CLEF eHealth Evaluation Lab 2016 Task 2 .pdf:application/pdf}
+}
+@inproceedings{jonnagaddala_automatic_2017,
+	title = {Automatic coding of death certificates to {ICD}-10 terminology},
+	booktitle = {{CLEF} 2017 {Online} {Working} {Notes}},
+	publisher = {CEUR-WS},
+	author = {Jonnagaddala, Jitendra and Hu, Feiyan},
+	year = {2017},
+	file = {Fulltext:/Users/mario/Zotero/storage/AW2YGWHC/Jonnagaddala und Hu - Automatic coding of death certificates to ICD-10 t.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/VHWNWWPC/Jonnagaddala und Hu - Automatic coding of death certificates to ICD-10 t.pdf:application/pdf}
+}
+@inproceedings{ho-dac_litl_2017,
+	title = {{LITL} at {CLEF} {eHealth}2017: automatic classification of death reports},
+	booktitle = {{CLEF} 2017 {Online} {Working} {Notes}},
+	publisher = {CEUR-WS},
+	author = {Ho-Dac, Lydia-Mai and Fabre, Cécile and Birski, Anouk and Boudraa, Imane and Bourriot, Aline and Cassier, Manon and Delvenne, Léa and Garcia-Gonzalez, Charline and Kang, Eun-Bee and Piccinini, Elisa},
+	year = {2017},
+	file = {Fulltext:/Users/mario/Zotero/storage/N2Q47RVL/Ho-Dac et al. - LITL at CLEF eHealth2017 automatic classification.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/D5T3NUAR/Ho-Dac et al. - LITL at CLEF eHealth2017 automatic classification.pdf:application/pdf}
+}
+@inproceedings{ho-dac_litl_2016,
+	title = {{LITL} at {CLEF} {eHealth}2016: recognizing entities in {French} biomedical documents},
+	booktitle = {{CLEF} 2016 {Online} {Working} {Notes}},
+	publisher = {CEUR-WS},
+	author = {Ho-Dac, Lydia-Mai and Tanguy, Ludovic and Grauby, Céline and Mby, Aurore Heu and Malosse, Justine and Rivière, Laura and Veltz-Mauclair, Amélie},
+	year = {2016},
+	file = {Fulltext:/Users/mario/Zotero/storage/9YCE3EVM/Ho-Dac et al. - LITL at CLEF eHealth2016 recognizing entities in .pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/I6YEA4ZT/Ho-Dac et al. - LITL at CLEF eHealth2016 recognizing entities in .pdf:application/pdf}
+}
+@inproceedings{dermouche_ecstra-inserm_2016,
+	title = {{ECSTRA}-{INSERM}@ {CLEF} {eHealth}2016-task 2: {ICD}10 {Code} {Extraction} from {Death} {Certificates}},
+	booktitle = {{CLEF} 2016 {Online} {Working} {Notes}},
+	author = {Dermouche, Mohamed and Looten, Vincent and Flicoteaux, Rémi and Chevret, Sylvie and Velcin, Julien and Taright, Namik},
+	year = {2016},
+	file = {Fulltext:/Users/mario/Zotero/storage/WVDFQFEK/Dermouche et al. - ECSTRA-INSERM@ CLEF eHealth2016-task 2 ICD10 Code.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/RIMPEW8L/Dermouche et al. - ECSTRA-INSERM@ CLEF eHealth2016-task 2 ICD10 Code.pdf:application/pdf}
+}
+@inproceedings{ebersbach_fusion_2017,
+	title = {Fusion {Methods} for {ICD}10 {Code} {Classification} of {Death} {Certificates} in {Multilingual} {Corpora}},
+	booktitle = {{CLEF} 2017 {Online} {Working} {Notes}},
+	publisher = {CEUR-WS},
+	author = {Ebersbach, Mike and Herms, Robert and Eibl, Maximilian},
+	year = {2017},
+	file = {Fulltext:/Users/mario/Zotero/storage/LKIZA2P4/Ebersbach et al. - 2017 - Fusion Methods for ICD10 Code Classification of De.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/CIX48RIC/Ebersbach et al. - 2017 - Fusion Methods for ICD10 Code Classification of De.pdf:application/pdf}
 }
\ No newline at end of file
--- a/paper/wbi-eclef18.tex
+++ b/paper/wbi-eclef18.tex
@@ -46,10 +46,10 @@ This paper describes the participation of the WBI team in the CLEF eHealth 2018
 shared task 1 (``Multilingual Information Extraction - ICD-10 coding''). Our
 approach builds on two recurrent neural network models to extract and classify
 causes of death from French, Italian and Hungarian death certificates. First, we
-employ a LSTM-based sequence-to-sequence model to obtain a symptom name from each
+employ a sequence-to-sequence model based on a Long Short-Term Memory (LSTM) to
-death certificate line. We then utilize a bidirectional LSTM model with
+obtain symptoms from each death certificate line. We then utilize a
-attention mechanism to assign the respective ICD-10 codes to the received
+bidirectional LSTM model with attention mechanism to assign the respective
-symptom names. Our model achieves \ldots
+ICD-10 codes given the found symptoms. Our model achieves \ldots
 \keywords{ICD-10 coding \and Biomedical information extraction \and Multi-lingual sequence-to-sequence