diff --git a/paper/10_introduction.tex b/paper/10_introduction.tex index 9a00d1649a9e43bc6bda2fd146581fdbec7ef522..e65ceac5bcadfe22c92275c866129bb72ee74373 100644 --- a/paper/10_introduction.tex +++ b/paper/10_introduction.tex @@ -4,7 +4,11 @@ health documents, is a highly important task to support many applications in research, daily clinical routine and policy-making. Computer-aided approaches can improve decision making and support clinical processes, for example, by giving a more sophisticated overview about a research area, providing detailed -information about the aetiopathology of a patient or disease patterns. +information about the aetiopathology of a patient or disease patterns. In the +past years, major advances have been made in the area of natural language +processing. However, improvements in the field of biomedical text mining lag +behind other domains mainly due to privacy issues and concerns regarding the +processed data (e.g. electronic health records). The CLEF eHealth lab attends to this circumstance through organization of various shared tasks which aid and support the development of approaches to diff --git a/paper/20_related_work.tex b/paper/20_related_work.tex index c12a892243cdf13bef8920cba83daac36f01ce24..690a80fa2d5579c51318d6acfc4339811f4d6649 100644 --- a/paper/20_related_work.tex +++ b/paper/20_related_work.tex @@ -1 +1,42 @@ -\nm{TODO: Insert text!} \ No newline at end of file +The ICD-10 coding task has already been carried out in the 2016 +\cite{neveol_clinical_2016} and 2017 \cite{neveol_clef_2017} editions of the +eHealth lab. Participating teams used a plethora of different approaches to +tackle the classification problem. The methods can essentially be divided into +two categories: knowledge-based +\cite{cabot_sibm_2016,jonnagaddala_automatic_2017,van_mulligen_erasmus_2016} and +machine learning approaches +\cite{dermouche_ecstra-inserm_2016,ebersbach_fusion_2017,ho-dac_litl_2016,miftakhutdinov_kfu_2017}. 
+The former relies on lexical sources, medical terminologies and other ontologies +to match (parts of) the certificate text with entries from the knowledge-bases +according to a rule framework. For example, Di Nunzio et al. +\cite{di_nunzio_lexicon_2017} calculate a score for each ICD-10 dictionary entry +by summing the binary or tf-idf weights of each term of a certificate line +segment and assign the ICD-10 code with the highest score. In contrast, Ho-Dac +et al. \cite{ho-dac_litl_2017} treat the problem as an information retrieval task +and utilize the SOLR search engine. + +The machine learning based approaches employ a variety of techniques, e.g. +Conditional Random Fields (CRFs) \cite{ho-dac_litl_2016}, Labeled Latent +Dirichlet Allocation (LDA) \cite{dermouche_ecstra-inserm_2016} and Support Vector Machines +(SVMs) \cite{ebersbach_fusion_2017} with diverse hand-crafted features. Most +similar to our approach is the work from Miftakhutdinov and Tutubalina \cite{miftakhutdinov_kfu_2017}, +which achieved the best results for English certificates in last year's +competition. They use a neural LSTM-based encoder-decoder model that processes the raw +certificate text as input and encodes it into a vector representation. +Furthermore, a vector which captures the textual similarity between the +certificate line and the symptoms resp. diagnosis of the individual ICD-10 codes +is used to integrate prior knowledge into the model. The concatenation of both +vector representations is then used to output the characters and numbers of the +ICD-10 code in the decoding step. In contrast to their work, our approach +introduces a model for multi-language ICD-10 classification. We utilize two +separate recurrent neural networks, one sequence to sequence model for symptom +extraction and one for classification, to predict the ICD-10 codes for a +certificate text independent of the language it originates from. 
+ + + + + + + + diff --git a/paper/references.bib b/paper/references.bib index 00fb534aac09560bf582d54361365dcf49424082..1e803740ea1a1da1ad308c351b5968d56235ae65 100644 --- a/paper/references.bib +++ b/paper/references.bib @@ -1,4 +1,92 @@ +@inproceedings{neveol_clef_2017, + title = {{CLEF} {eHealth} 2017 {Multilingual} {Information} {Extraction} task overview: {ICD}10 coding of death certificates in {English} and {French}}, + shorttitle = {{CLEF} {eHealth} 2017 {Multilingual} {Information} {Extraction} task overview}, + booktitle = {{CLEF} 2017 {Evaluation} {Labs} and {Workshop}: {Online} {Working} {Notes}, {CEUR}-{WS}}, + author = {Névéol, Aurélie and Anderson, Robert N. and Cohen, K. Bretonnel and Grouin, Cyril and Lavergne, Thomas and Rey, Grégoire and Robert, Aude and Rondet, Claire and Zweigenbaum, Pierre}, + year = {2017}, + keywords = {Read}, + pages = {17}, + file = {Fulltext:/Users/mario/Zotero/storage/8QATUX6Q/Névéol et al. - 2017 - CLEF eHealth 2017 Multilingual Information Extract.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/EV2SLCV8/Névéol et al. 
- 2017 - CLEF eHealth 2017 Multilingual Information Extract.pdf:application/pdf} +} + +@inproceedings{miftakhutdinov_kfu_2017, + title = {{KFU} at {CLEF} {eHealth} 2017 task 1: {ICD}-10 coding of {English} death certificates with recurrent neural networks}, + booktitle = {{CLEF} 2017 {Online} {Working} {Notes}}, + publisher = {CEUR-WS}, + author = {Miftakhutdinov, Zulfat and Tutubalina, Elena}, + year = {2017}, + keywords = {CLEF, ICD-10-Classification, Read}, + file = {Fulltext:/Users/mario/Zotero/storage/HRZ6Q8Q6/Miftakhutdinov und Tutubalina - 2017 - Kfu at clef ehealth 2017 task 1 Icd-10 coding of .pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/J8TXTUNT/Miftakhutdinov und Tutubalina - 2017 - Kfu at clef ehealth 2017 task 1 Icd-10 coding of .pdf:application/pdf} +} + +@inproceedings{goeuriot_clef_2017, + title = {{CLEF} 2017 {eHealth} evaluation lab overview}, + booktitle = {International {Conference} of the {Cross}-{Language} {Evaluation} {Forum} for {European} {Languages}}, + publisher = {Springer}, + author = {Goeuriot, Lorraine and Kelly, Liadh and Suominen, Hanna and Névéol, Aurélie and Robert, Aude and Kanoulas, Evangelos and Spijker, Rene and Palotti, Joao and Zuccon, Guido}, + year = {2017}, + keywords = {Read}, + pages = {291--303}, + file = {Fulltext:/Users/mario/Zotero/storage/EEAVXG89/Goeuriot et al. - 2017 - Clef 2017 ehealth evaluation lab overview.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/TMNBJ6YC/978-3-319-65813-1_26.html:text/html} +} + +@article{butt_classification_2013, + title = {Classification of cancer-related death certificates using machine learning}, + volume = {6}, + issn = {1836-1935}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3674421/}, + doi = {10.4066/AMJ.2013.1654}, + abstract = {Background +Cancer monitoring and prevention relies on the critical aspect of timely notification of cancer cases. 
However, the abstraction and classification of cancer from the free-text of pathology reports and other relevant documents, such as death certificates, exist as complex and time-consuming activities. + +Aims +In this paper, approaches for the automatic detection of notifiable cancer cases as the cause of death from free-text death certificates supplied to Cancer Registries are investigated. + +Method +A number of machine learning classifiers were studied. Features were extracted using natural language techniques and the Medtex toolkit. The numerous features encompassed stemmed words, bi-grams, and concepts from the SNOMED CT medical terminology. The baseline consisted of a keyword spotter using keywords extracted from the long description of ICD-10 cancer related codes. + +Results +Death certificates with notifiable cancer listed as the cause of death can be effectively identified with the methods studied in this paper. A Support Vector Machine (SVM) classifier achieved best performance with an overall Fmeasure of 0.9866 when evaluated on a set of 5,000 freetext death certificates using the token stem feature set. The SNOMED CT concept plus token stem feature set reached the lowest variance (0.0032) and false negative rate (0.0297) while achieving an F-measure of 0.9864. The SVM classifier accounts for the first 18 of the top 40 evaluated runs, and entails the most robust classifier with a variance of 0.001141, half the variance of the other classifiers. + +Conclusion +The selection of features significantly produced the most influences on the performance of the classifiers, although the type of classifier employed also affects performance. In contrast, the feature weighting schema created a negligible effect on performance. 
Specifically, it is found that stemmed tokens with or without SNOMED CT concepts create the most effective feature when combined with an SVM classifier.}, + number = {5}, + urldate = {2018-03-16}, + journal = {The Australasian Medical Journal}, + author = {Butt, Luke and Zuccon, Guido and Nguyen, Anthony and Bergheim, Anton and Grayson, Narelle}, + month = may, + year = {2013}, + pmid = {23745151}, + pmcid = {PMC3674421}, + pages = {292--299}, + file = {PubMed Central Full Text PDF:/Users/mario/Zotero/storage/ZCUHSCHR/Butt et al. - 2013 - Classification of cancer-related death certificate.pdf:application/pdf} +} + +@article{koopman_automatic_2015, + title = {Automatic {ICD}-10 classification of cancers from free-text death certificates}, + volume = {84}, + issn = {1386-5056}, + url = {http://www.sciencedirect.com/science/article/pii/S1386505615300289}, + doi = {10.1016/j.ijmedinf.2015.08.004}, + abstract = {Objective +Death certificates provide an invaluable source for cancer mortality statistics; however, this value can only be realised if accurate, quantitative data can be extracted from certificates – an aim hampered by both the volume and variable nature of certificates written in natural language. This paper proposes an automatic classification system for identifying cancer related causes of death from death certificates. +Methods +Detailed features, including terms, n-grams and SNOMED CT concepts were extracted from a collection of 447,336 death certificates. These features were used to train Support Vector Machine classifiers (one classifier for each cancer type). The classifiers were deployed in a cascaded architecture: the first level identified the presence of cancer (i.e., binary cancer/nocancer) and the second level identified the type of cancer (according to the ICD-10 classification system). A held-out test set was used to evaluate the effectiveness of the classifiers according to precision, recall and F-measure. 
In addition, detailed feature analysis was performed to reveal the characteristics of a successful cancer classification model. +Results +The system was highly effective at identifying cancer as the underlying cause of death (F-measure 0.94). The system was also effective at determining the type of cancer for common cancers (F-measure 0.7). Rare cancers, for which there was little training data, were difficult to classify accurately (F-measure 0.12). Factors influencing performance were the amount of training data and certain ambiguous cancers (e.g., those in the stomach region). The feature analysis revealed a combination of features were important for cancer type classification, with SNOMED CT concept and oncology specific morphology features proving the most valuable. +Conclusion +The system proposed in this study provides automatic identification and characterisation of cancers from large collections of free-text death certificates. This allows organisations such as Cancer Registries to monitor and report on cancer mortality in a timely and accurate manner. In addition, the methods and findings are generally applicable beyond cancer classification and to other sources of medical text besides death certificates.}, + number = {11}, + urldate = {2018-03-16}, + journal = {International Journal of Medical Informatics}, + author = {Koopman, Bevan and Zuccon, Guido and Nguyen, Anthony and Bergheim, Anton and Grayson, Narelle}, + month = nov, + year = {2015}, + pages = {956--965}, + file = {ScienceDirect Full Text PDF:/Users/mario/Zotero/storage/P8HLCZWK/Koopman et al. 
- 2015 - Automatic ICD-10 classification of cancers from fr.pdf:application/pdf;ScienceDirect Snapshot:/Users/mario/Zotero/storage/X3AKYDDI/S1386505615300289.html:text/html} +} + @inproceedings{sutskever_sequence_2014, title = {Sequence to sequence learning with neural networks}, booktitle = {Advances in neural information processing systems}, @@ -132,4 +220,102 @@ year = {2014}, pages = {1724--1734}, file = {Full Text PDF:/Users/mario/Zotero/storage/4NE9THT8/Cho et al. - 2014 - Learning Phrase Representations using RNN Encoder–.pdf:application/pdf} +} + +@article{neveol_clinical_2016, + title = {Clinical {Information} {Extraction} at the {CLEF} {eHealth} {Evaluation} lab 2016}, + volume = {1609}, + issn = {1613-0073}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5756095/}, + abstract = {This paper reports on Task 2 of the 2016 CLEF eHealth evaluation lab which extended the previous information extraction tasks of ShARe/CLEF eHealth evaluation labs. The task continued with named entity recognition and normalization in French narratives, as offered in CLEF eHealth 2015. Named entity recognition involved ten types of entities including disorders that were defined according to Semantic Groups in the Unified Medical Language System® (UMLS®), which was also used for normalizing the entities. In addition, we introduced a large-scale classification task in French death certificates, which consisted of extracting causes of death as coded in the International Classification of Diseases, tenth revision (ICD10). Participant systems were evaluated against a blind reference standard of 832 titles of scientific articles indexed in MEDLINE, 4 drug monographs published by the European Medicines Agency (EMEA) and 27,850 death certificates using Precision, Recall and F-measure. In total, seven teams participated, including five in the entity recognition and normalization task, and five in the death certificate coding task. 
Three teams submitted their systems to our newly offered reproducibility track. For entity recognition, the highest performance was achieved on the EMEA corpus, with an overall F-measure of 0.702 for plain entities recognition and 0.529 for normalized entity recognition. For entity normalization, the highest performance was achieved on the MEDLINE corpus, with an overall F-measure of 0.552. For death certificate coding, the highest performance was 0.848 F-measure.}, + urldate = {2018-05-23}, + journal = {CEUR workshop proceedings}, + author = {Névéol, Aurélie and Cohen, K. Bretonnel and Grouin, Cyril and Hamon, Thierry and Lavergne, Thomas and Kelly, Liadh and Goeuriot, Lorraine and Rey, Grégoire and Robert, Aude and Tannier, Xavier and Zweigenbaum, Pierre}, + month = sep, + year = {2016}, + pmid = {29308065}, + pmcid = {PMC5756095}, + pages = {28--42}, + file = {PubMed Central Full Text PDF:/Users/mario/Zotero/storage/ZWWRZSZK/Névéol et al. - 2016 - Clinical Information Extraction at the CLEF eHealt.pdf:application/pdf} +} + +@inproceedings{di_nunzio_lexicon_2017, + title = {A {Lexicon} {Based} {Approach} to {Classification} of {ICD}10 {Codes}. {IMS} {Unipd} at {CLEF} {eHealth} {Task}}, + booktitle = {{CLEF} 2017 {Online} {Working} {Notes}}, + publisher = {CEUR-WS}, + author = {Di Nunzio, Giorgio Maria and Beghini, Federica and Vezzani, Federica and Henrot, Genevieve}, + year = {2017}, + file = {Fulltext:/Users/mario/Zotero/storage/HGHINDH3/Di Nunzio et al. - A Lexicon Based Approach to Classification of ICD1.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/LWSDB84Q/Di Nunzio et al. 
- A Lexicon Based Approach to Classification of ICD1.pdf:application/pdf} +} + +@inproceedings{cabot_sibm_2016, + title = {{SIBM} at {CLEF} {eHealth} {Evaluation} {Lab} 2016: {Extracting} {Concepts} in {French} {Medical} {Texts} with {ECMT} and {CIMIND}}, + booktitle = {{CLEF} 2016 {Online} {Working} {Notes}}, + publisher = {CEUR-WS}, + author = {Cabot, Chloé and Soualmia, Lina F. and Dahamna, Badisse and Darmoni, Stéfan J.}, + year = {2016}, + file = {Fulltext:/Users/mario/Zotero/storage/E4ZADEMU/Cabot et al. - SIBM at CLEF eHealth Evaluation Lab 2016 Extracti.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/C9ADYETR/Cabot et al. - SIBM at CLEF eHealth Evaluation Lab 2016 Extracti.pdf:application/pdf} +} + +@inproceedings{van_mulligen_erasmus_2016, + title = {Erasmus {MC} at {CLEF} {eHealth} 2016: {Concept} {Recognition} and {Coding} in {French} {Texts}}, + shorttitle = {Erasmus {MC} at {CLEF} {eHealth} 2016}, + booktitle = {{CLEF} 2016 {Online} {Working} {Notes}}, + publisher = {CEUR-WS}, + author = {van Mulligen, Erik M. and Afzal, Zubair and Akhondi, Saber A. and Vo, Dang and Kors, Jan A.}, + year = {2016}, + file = {Fulltext:/Users/mario/Zotero/storage/AT3LSRP4/van Mulligen et al. - Erasmus MC at CLEF eHealth 2016 Concept Recogniti.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/5NNCIN2V/van Mulligen et al. - Erasmus MC at CLEF eHealth 2016 Concept Recogniti.pdf:application/pdf} +} + +@inproceedings{mottin_bitem_2016, + title = {{BiTeM} at {CLEF} {eHealth} {Evaluation} {Lab} 2016 {Task} 2: {Multilingual} {Information} {Extraction}}, + booktitle = {{CLEF} 2016 {Online} {Working} {Notes}}, + publisher = {CEUR-WS}, + author = {Mottin, Luc and Gobeill, Julien and Mottaz, Anaïs and Pasche, Emilie and Gaudinat, Arnaud and Ruch, Patrick}, + year = {2016}, + file = {Fulltext:/Users/mario/Zotero/storage/LF9UGCQZ/Mottin et al. 
- 2016 - BiTeM at CLEF eHealth Evaluation Lab 2016 Task 2 .pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/BRRDY3TV/Mottin et al. - 2016 - BiTeM at CLEF eHealth Evaluation Lab 2016 Task 2 .pdf:application/pdf} +} + +@inproceedings{jonnagaddala_automatic_2017, + title = {Automatic coding of death certificates to {ICD}-10 terminology}, + booktitle = {{CLEF} 2017 {Online} {Working} {Notes}}, + publisher = {CEUR-WS}, + author = {Jonnagaddala, Jitendra and Hu, Feiyan}, + year = {2017}, + file = {Fulltext:/Users/mario/Zotero/storage/AW2YGWHC/Jonnagaddala und Hu - Automatic coding of death certificates to ICD-10 t.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/VHWNWWPC/Jonnagaddala und Hu - Automatic coding of death certificates to ICD-10 t.pdf:application/pdf} +} + +@inproceedings{ho-dac_litl_2017, + title = {{LITL} at {CLEF} {eHealth}2017: automatic classification of death reports}, + booktitle = {{CLEF} 2017 {Online} {Working} {Notes}}, + publisher = {CEUR-WS}, + author = {Ho-Dac, Lydia-Mai and Fabre, Cécile and Birski, Anouk and Boudraa, Imane and Bourriot, Aline and Cassier, Manon and Delvenne, Léa and Garcia-Gonzalez, Charline and Kang, Eun-Bee and Piccinini, Elisa}, + year = {2017}, + file = {Fulltext:/Users/mario/Zotero/storage/N2Q47RVL/Ho-Dac et al. - LITL at CLEF eHealth2017 automatic classification.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/D5T3NUAR/Ho-Dac et al. - LITL at CLEF eHealth2017 automatic classification.pdf:application/pdf} +} + +@inproceedings{ho-dac_litl_2016, + title = {{LITL} at {CLEF} {eHealth}2016: recognizing entities in {French} biomedical documents}, + booktitle = {{CLEF} 2016 {Online} {Working} {Notes}}, + publisher = {CEUR-WS}, + author = {Ho-Dac, Lydia-Mai and Tanguy, Ludovic and Grauby, Céline and Mby, Aurore Heu and Malosse, Justine and Rivière, Laura and Veltz-Mauclair, Amélie}, + year = {2016}, + file = {Fulltext:/Users/mario/Zotero/storage/9YCE3EVM/Ho-Dac et al. 
- LITL at CLEF eHealth2016 recognizing entities in .pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/I6YEA4ZT/Ho-Dac et al. - LITL at CLEF eHealth2016 recognizing entities in .pdf:application/pdf} +} + +@inproceedings{dermouche_ecstra-inserm_2016, + title = {{ECSTRA}-{INSERM}@ {CLEF} {eHealth}2016-task 2: {ICD}10 {Code} {Extraction} from {Death} {Certificates}}, + booktitle = {{CLEF} 2016 {Online} {Working} {Notes}}, + author = {Dermouche, Mohamed and Looten, Vincent and Flicoteaux, Rémi and Chevret, Sylvie and Velcin, Julien and Taright, Namik}, + year = {2016}, + file = {Fulltext:/Users/mario/Zotero/storage/WVDFQFEK/Dermouche et al. - ECSTRA-INSERM@ CLEF eHealth2016-task 2 ICD10 Code.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/RIMPEW8L/Dermouche et al. - ECSTRA-INSERM@ CLEF eHealth2016-task 2 ICD10 Code.pdf:application/pdf} +} + +@inproceedings{ebersbach_fusion_2017, + title = {Fusion {Methods} for {ICD}10 {Code} {Classification} of {Death} {Certificates} in {Multilingual} {Corpora}}, + booktitle = {{CLEF} 2017 {Online} {Working} {Notes}}, + publisher = {CEUR-WS}, + author = {Ebersbach, Mike and Herms, Robert and Eibl, Maximilian}, + year = {2017}, + file = {Fulltext:/Users/mario/Zotero/storage/LKIZA2P4/Ebersbach et al. - 2017 - Fusion Methods for ICD10 Code Classification of De.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/CIX48RIC/Ebersbach et al. - 2017 - Fusion Methods for ICD10 Code Classification of De.pdf:application/pdf} } \ No newline at end of file