From afcb8d68b3ec079fa690e49ba5a181546f5275ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20Sa=CC=88nger?= <mario.saenger@student.hu-berlin.de>
Date: Thu, 31 May 2018 10:28:55 +0200
Subject: [PATCH] Finished word embedding explanations

---
 paper/20_related_work.tex  |  27 +++---
 paper/30_methods_intro.tex |   2 +-
 paper/references.bib       | 170 +++++++++++++++++++++++++++++++++++++
 3 files changed, 186 insertions(+), 13 deletions(-)

diff --git a/paper/20_related_work.tex b/paper/20_related_work.tex
index 3c2aa48..c743cbb 100644
--- a/paper/20_related_work.tex
+++ b/paper/20_related_work.tex
@@ -32,22 +32,25 @@ data from left to right, and and backward chain, consuming the data in the
 opposite direction. The final representation is typically the concatenation or a
 linear combination of both states. 
 
-\subsection{Word Embeddings}
-Distributional semantic models have been researched for decades in the area of natural language processing (NLP) \cite{}.
-The investigated models aim to represent words using a real-valued vector (also called embedding) based on a huge amount of unlabeled texts which captures syntactic and semantic similarities between words.
-Starting with the publication of the work from Collobert et al. \cite{} in 2008, word embeddings are one of the hot topics in NLP and a plethora of appraoches have been proposed \cite{}.
-
-The majority of todays embedding models are based on deep learning models trained to perform some kind of language modeling task \cite{}. 
-The most popular embedding model is the Word2Vec model introduced by Mikolov et. al \cite{}. 
+\subsection{Word Embeddings} 
+Distributional semantic models (DSMs) have been researched for decades in the area of natural language processing (NLP) \cite{turney_frequency_2010}.
+Based on large amounts of unlabeled text, DSMs aim to represent words as real-valued vectors (also called embeddings) which capture syntactic and semantic similarities between them.
+Starting with the work of Collobert et al. \cite{collobert_natural_2011} in 2011, learning embeddings for linguistic units, such as words, sentences or paragraphs, has become one of the most active research topics in NLP, and a plethora of approaches have been proposed \cite{bojanowski_enriching_2016,mikolov_distributed_2013,peters_deep_2018,pennington_glove:_2014}.
+ 
+The majority of today's embedding models are based on neural networks trained to perform some kind of language modeling task \cite{peters_semi-supervised_2017,peters_deep_2018,pinter_mimicking_2017}. 
+The most popular embedding model is Word2Vec, introduced by Mikolov et al. \cite{mikolov_distributed_2013,mikolov_efficient_2013}. 
 They propose two shallow neural network models, continuous bag-of-words (CBOW) and SkipGram, that are trained to reconstruct the context given a center word and vice versa.
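+As an illustrative sketch, the SkipGram variant maximizes the average log-probability of the context words $w_{t+j}$ within a window of size $c$ around each center word $w_t$ of a corpus with $T$ tokens:
+\[
+\frac{1}{T} \sum_{t=1}^{T} \sum_{-c \leq j \leq c, j \neq 0} \log p(w_{t+j} \mid w_t),
+\]
+where $p(w_{t+j} \mid w_t)$ is parameterized by the learned input and output embeddings and, in practice, approximated with hierarchical softmax or negative sampling.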
-In contrast, Pennington et al. \cite{} use the ratio between co-occurrence probabilities of two words with another one to learn a vector representation.
+In contrast, Pennington et al. \cite{pennington_glove:_2014} learn vector representations from the ratios of the co-occurrence probabilities of two words with a third probe word.
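+Concretely, their GloVe model fits word vectors $w_i$, context vectors $\tilde{w}_j$ and bias terms to the logarithm of the co-occurrence counts $X_{ij}$ via a weighted least-squares objective, sketched here with $f$ denoting a weighting function that down-weights rare co-occurrences and caps the influence of very frequent ones:
+\[
+J = \sum_{i,j=1}^{V} f(X_{ij}) \left( w_i^{\top} \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij} \right)^{2}.
+\]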
+In \cite{peters_deep_2018}, deep bi-directional LSTM models are utilized to learn contextualized word embeddings, i.e. representations that also capture the different contexts a word appears in. 
 
-The most recent models focus on the integration of subword and morphological information to provide suitable representations even for unseen, out-of-vocabulary words. 
-For example, Pinter et al. \cite{} try to reconstruct a pre-trained word embedding by learning a bi-directional LSTM model on character level. 
+Several recent models focus on the integration of subword and morphological information to provide suitable representations even for unseen, out-of-vocabulary words. 
+For example, Pinter et al. \cite{pinter_mimicking_2017} reconstruct pre-trained word embeddings by training a bi-directional LSTM model on the character level. 
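+A minimal sketch of this idea: a character-level model $f$, here a bi-directional LSTM, is trained to mimic the pre-trained embedding $e_w$ of each in-vocabulary word $w$ given its character sequence $c_1, \dots, c_n$,
+\[
+\min_{f} \sum_{w \in \mathcal{V}} \left\| f(c_1, \dots, c_n) - e_w \right\|_{2}^{2},
+\]
+so that embeddings for out-of-vocabulary words can be obtained at test time by applying $f$ to their spelling.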
 Similarily, Bojanowski et al. \cite{bojanowski_enriching_2016} adapt the SkipGram by taking character n-grams into account. 
-They assign a vector representation to each character n-gram and represent words by summing over all of these representations of a word.
-   
+Within their so-called fastText model, they assign a vector representation to each character n-gram and represent a word as the sum of the representations of its n-grams.
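+As a brief sketch, let $\mathcal{G}_w$ denote the set of character n-grams of a word $w$ (including the word itself) and $z_g$ the vector of n-gram $g$; the word is then represented as
+\[
+u_w = \sum_{g \in \mathcal{G}_w} z_g,
+\]
+which allows unseen words to be embedded through their known character n-grams.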
 
+In addition to embeddings that capture word similarities within one language, multi- and cross-lingual approaches have also been investigated.
+Proposed methods either learn a linear mapping between monolingual representations \cite{faruqui_improving_2014,xing_normalized_2015} or utilize word- \cite{guo_cross-lingual_2015,vyas_sparse_2016}, sentence- \cite{pham_learning_2015} or document-aligned \cite{sogaard_inverted_2015} corpora to build a shared embedding space.
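+To sketch the mapping-based variant: given a seed dictionary of translation pairs $(x_i, z_i)$ of source- and target-language word embeddings, a matrix $W$ with $W x_i \approx z_i$ is learned, e.g. by minimizing
+\[
+\sum_{i} \left\| W x_i - z_i \right\|^{2},
+\]
+optionally under an orthogonality constraint on $W$ as in \cite{xing_normalized_2015}, whereas \cite{faruqui_improving_2014} project both monolingual spaces into a shared space using canonical correlation analysis.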
+   
 \subsection{ICD-10 Classification}
 The ICD-10 coding task has already been carried out in the 2016 \cite{neveol_clinical_2016} and 2017 \cite{neveol_clef_2017} edition of the eHealth lab. 
 Participating teams used a plethora of different approaches to tackle the classification problem. 
diff --git a/paper/30_methods_intro.tex b/paper/30_methods_intro.tex
index 44b90df..1ebc32c 100644
--- a/paper/30_methods_intro.tex
+++ b/paper/30_methods_intro.tex
@@ -3,4 +3,4 @@ two-step process. First, we employ a neural, multi-language sequence-to-sequence
 model to receive a death cause description for a given death certificate line.
 We will then use a second classification model to assign the respective ICD-10
 codes to the obtained death cause. The remainder of this section detailed
-explanation of our two models.
\ No newline at end of file
+explanation of our two models. 
\ No newline at end of file
diff --git a/paper/references.bib b/paper/references.bib
index 7244f07..3b1cfed 100644
--- a/paper/references.bib
+++ b/paper/references.bib
@@ -1,4 +1,31 @@
 
+@article{peters_deep_2018,
+	title = {Deep contextualized word representations},
+	url = {https://openreview.net/forum?id=SJTCsqMUf},
+	journal = {arXiv:1802.05365},
+	abstract = {We introduce a new type of deep contextualized word representation that models both (1) complex characteristics of word use (e.g., syntax and semantics), and (2) how these uses vary across...},
+	urldate = {2018-02-16},
+	author = {Peters, Matthew E. and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
+	month = feb,
+	year = {2018},
+	keywords = {Context Embeedings, Document Classification, Embeddings, Read},
+	file = {arXiv\:1802.05365 PDF:/Users/mario/Zotero/storage/89C2DP8R/Peters et al. - 2018 - Deep contextualized word representations.pdf:application/pdf;arXiv.org Snapshot:/Users/mario/Zotero/storage/YF7GZNUI/1802.html:text/html;Full Text PDF:/Users/mario/Zotero/storage/2SWMPWEA/Peters et al. - 2018 - Deep contextualized word representations.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/9X2UN33P/forum.html:text/html}
+}
+
+@article{peters_semi-supervised_2017,
+	title = {Semi-supervised sequence tagging with bidirectional language models},
+	url = {http://arxiv.org/abs/1705.00108},
+	abstract = {Pre-trained word embeddings learned from unlabeled text have become a standard component of neural network architectures for NLP tasks. However, in most cases, the recurrent network that operates on word-level representations to produce context sensitive representations is trained on relatively little labeled data. In this paper, we demonstrate a general semi-supervised approach for adding pre- trained context embeddings from bidirectional language models to NLP systems and apply it to sequence labeling tasks. We evaluate our model on two standard datasets for named entity recognition (NER) and chunking, and in both cases achieve state of the art results, surpassing previous systems that use other forms of transfer or joint learning with additional labeled data and task specific gazetteers.},
+	urldate = {2018-02-16},
+	journal = {arXiv:1705.00108},
+	author = {Peters, Matthew E. and Ammar, Waleed and Bhagavatula, Chandra and Power, Russell},
+	month = apr,
+	year = {2017},
+	note = {arXiv: 1705.00108},
+	keywords = {Read, Embeddings, Word Embeddings, Language Models},
+	file = {arXiv\:1705.00108 PDF:/Users/mario/Zotero/storage/DW4C3I9R/Peters et al. - 2017 - Semi-supervised sequence tagging with bidirectiona.pdf:application/pdf;arXiv.org Snapshot:/Users/mario/Zotero/storage/SQ4CHQJL/1705.html:text/html}
+}
+
 @article{bojanowski_enriching_2016,
 	title = {Enriching {Word} {Vectors} with {Subword} {Information}},
 	url = {http://arxiv.org/abs/1607.04606},
@@ -13,6 +40,20 @@
 	file = {arXiv\:1607.04606 PDF:/Users/mario/Zotero/storage/9WC5C7M6/Bojanowski et al. - 2016 - Enriching Word Vectors with Subword Information.pdf:application/pdf;arXiv.org Snapshot:/Users/mario/Zotero/storage/YPS6YZHR/1607.html:text/html}
 }
 
+@article{pinter_mimicking_2017,
+	title = {Mimicking {Word} {Embeddings} using {Subword} {RNNs}},
+	url = {http://arxiv.org/abs/1707.06961},
+	abstract = {Word embeddings improve generalization over lexical features by placing each word in a lower-dimensional space, using distributional information obtained from unlabeled data. However, the effectiveness of word embeddings for downstream NLP tasks is limited by out-of-vocabulary (OOV) words, for which embeddings do not exist. In this paper, we present MIMICK, an approach to generating OOV word embeddings compositionally, by learning a function from spellings to distributional embeddings. Unlike prior work, MIMICK does not require re-training on the original word embedding corpus; instead, learning is performed at the type level. Intrinsic and extrinsic evaluations demonstrate the power of this simple approach. On 23 languages, MIMICK improves performance over a word-based baseline for tagging part-of-speech and morphosyntactic attributes. It is competitive with (and complementary to) a supervised character-based model in low-resource settings.},
+	urldate = {2018-03-12},
+	journal = {arXiv:1707.06961 [cs]},
+	author = {Pinter, Yuval and Guthrie, Robert and Eisenstein, Jacob},
+	month = jul,
+	year = {2017},
+	note = {arXiv: 1707.06961},
+	keywords = {Embeddings, Read, Word Embeddings},
+	file = {arXiv\:1707.06961 PDF:/Users/mario/Zotero/storage/33XVJS9Z/Pinter et al. - 2017 - Mimicking Word Embeddings using Subword RNNs.pdf:application/pdf;arXiv.org Snapshot:/Users/mario/Zotero/storage/5U39STXC/1707.html:text/html}
+}
+
 @inproceedings{neveol_clef_2017,
 	title = {{CLEF} {eHealth} 2017 {Multilingual} {Information} {Extraction} task overview: {ICD}10 coding of death certificates in {English} and {French}},
 	shorttitle = {{CLEF} {eHealth} 2017 {Multilingual} {Information} {Extraction} task overview},
@@ -45,6 +86,36 @@
 	file = {Fulltext:/Users/mario/Zotero/storage/EEAVXG89/Goeuriot et al. - 2017 - Clef 2017 ehealth evaluation lab overview.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/TMNBJ6YC/978-3-319-65813-1_26.html:text/html}
 }
 
+@inproceedings{mikolov_distributed_2013,
+	title = {Distributed representations of words and phrases and their compositionality},
+	booktitle = {Advances in neural information processing systems},
+	author = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S. and Dean, Jeff},
+	year = {2013},
+	keywords = {Embeddings, Read, Word Embeddings},
+	pages = {3111--3119},
+	file = {Fulltext:/Users/mario/Zotero/storage/Y7PKTLQX/Mikolov et al. - 2013 - Distributed representations of words and phrases a.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/8B9JIEQG/5021-distributed-representations-of-words-andphrases.html:text/html}
+}
+
+@article{mikolov_efficient_2013,
+	title = {Efficient estimation of word representations in vector space},
+	journal = {arXiv preprint arXiv:1301.3781},
+	author = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
+	year = {2013},
+	keywords = {Embeddings, Read, Word Embeddings},
+	file = {Fulltext:/Users/mario/Zotero/storage/494A5KSG/Mikolov et al. - 2013 - Efficient estimation of word representations in ve.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/84YYF44Z/1301.html:text/html}
+}
+
+@inproceedings{pennington_glove:_2014,
+	title = {Glove: {Global} vectors for word representation},
+	shorttitle = {Glove},
+	booktitle = {Proceedings of the 2014 conference on empirical methods in natural language processing ({EMNLP})},
+	author = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher},
+	year = {2014},
+	keywords = {Read, Word Embeddings},
+	pages = {1532--1543},
+	file = {Fulltext:/Users/mario/Zotero/storage/24PDQ7AG/Pennington et al. - 2014 - Glove Global vectors for word representation.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/IX997EFR/Pennington et al. - 2014 - Glove Global vectors for word representation.pdf:application/pdf}
+}
+
 @article{butt_classification_2013,
 	title = {Classification of cancer-related death certificates using machine learning},
 	volume = {6},
@@ -101,6 +172,30 @@ The system proposed in this study provides automatic identification and characte
 	file = {ScienceDirect Full Text PDF:/Users/mario/Zotero/storage/P8HLCZWK/Koopman et al. - 2015 - Automatic ICD-10 classification of cancers from fr.pdf:application/pdf;ScienceDirect Snapshot:/Users/mario/Zotero/storage/X3AKYDDI/S1386505615300289.html:text/html}
 }
 
+@article{turney_frequency_2010,
+	title = {From frequency to meaning: {Vector} space models of semantics},
+	volume = {37},
+	shorttitle = {From frequency to meaning},
+	journal = {Journal of artificial intelligence research},
+	author = {Turney, Peter D. and Pantel, Patrick},
+	year = {2010},
+	keywords = {Unread, Word Embeddings},
+	pages = {141--188},
+	file = {Snapshot:/Users/mario/Zotero/storage/9H8ZCIME/jair.html:text/html;Turney und Pantel - 2010 - From frequency to meaning Vector space models of .pdf:/Users/mario/Zotero/storage/8SPBC8M2/Turney und Pantel - 2010 - From frequency to meaning Vector space models of .pdf:application/pdf}
+}
+
+@article{collobert_natural_2011,
+	title = {Natural language processing (almost) from scratch},
+	volume = {12},
+	number = {Aug},
+	journal = {Journal of Machine Learning Research},
+	author = {Collobert, Ronan and Weston, Jason and Bottou, Léon and Karlen, Michael and Kavukcuoglu, Koray and Kuksa, Pavel},
+	year = {2011},
+	keywords = {Unread, Word Embeddings},
+	pages = {2493--2537},
+	file = {Fulltext:/Users/mario/Zotero/storage/H9VZDLXY/Collobert et al. - 2011 - Natural language processing (almost) from scratch.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/NATRPDG4/collobert11a.html:text/html}
+}
+
 @inproceedings{sutskever_sequence_2014,
 	title = {Sequence to sequence learning with neural networks},
 	booktitle = {Advances in neural information processing systems},
@@ -362,4 +457,79 @@ The system proposed in this study provides automatic identification and characte
 	author = {Kingma, Diederik P. and Ba, Jimmy},
 	year = {2014},
 	file = {Snapshot:/Users/mario/Zotero/storage/YSR9BL4W/1412.html:text/html}
+}
+
+@inproceedings{vinyals_show_2015,
+	title = {Show and tell: {A} neural image caption generator},
+	shorttitle = {Show and tell},
+	booktitle = {Computer {Vision} and {Pattern} {Recognition} ({CVPR}), 2015 {IEEE} {Conference} on},
+	publisher = {IEEE},
+	author = {Vinyals, Oriol and Toshev, Alexander and Bengio, Samy and Erhan, Dumitru},
+	year = {2015},
+	pages = {3156--3164},
+	file = {Fulltext:/Users/mario/Zotero/storage/YYYDMHJD/Vinyals et al. - 2015 - Show and tell A neural image caption generator.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/XQBCTX6S/Vinyals et al. - 2015 - Show and tell A neural image caption generator.pdf:application/pdf}
+}
+
+@inproceedings{faruqui_improving_2014,
+	title = {Improving vector space word representations using multilingual correlation},
+	booktitle = {Proceedings of the 14th {Conference} of the {European} {Chapter} of the {Association} for {Computational} {Linguistics}},
+	author = {Faruqui, Manaal and Dyer, Chris},
+	year = {2014},
+	pages = {462--471},
+	file = {Fulltext:/Users/mario/Zotero/storage/8X3ZWRRV/Faruqui und Dyer - 2014 - Improving vector space word representations using .pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/9TNA35WS/Faruqui und Dyer - 2014 - Improving vector space word representations using .pdf:application/pdf}
+}
+
+@inproceedings{vyas_sparse_2016,
+	title = {Sparse bilingual word representations for cross-lingual lexical entailment},
+	booktitle = {Proceedings of the 2016 {Conference} of the {North} {American} {Chapter} of the {Association} for {Computational} {Linguistics}: {Human} {Language} {Technologies}},
+	author = {Vyas, Yogarshi and Carpuat, Marine},
+	year = {2016},
+	pages = {1187--1197},
+	file = {Fulltext:/Users/mario/Zotero/storage/WK4I4EGG/Vyas und Carpuat - 2016 - Sparse bilingual word representations for cross-li.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/A9SJS6LH/Vyas und Carpuat - 2016 - Sparse bilingual word representations for cross-li.pdf:application/pdf}
+}
+
+@inproceedings{pham_learning_2015,
+	title = {Learning distributed representations for multilingual text sequences},
+	booktitle = {Proceedings of the 1st {Workshop} on {Vector} {Space} {Modeling} for {Natural} {Language} {Processing}},
+	author = {Pham, Hieu and Luong, Thang and Manning, Christopher},
+	year = {2015},
+	pages = {88--94},
+	file = {Fulltext:/Users/mario/Zotero/storage/CF3JNTCK/Pham et al. - 2015 - Learning distributed representations for multiling.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/8STRAYTS/Pham et al. - 2015 - Learning distributed representations for multiling.pdf:application/pdf}
+}
+
+@article{vulic_bilingual_2016,
+	title = {Bilingual distributed word representations from document-aligned comparable data},
+	volume = {55},
+	journal = {Journal of Artificial Intelligence Research},
+	author = {Vulić, Ivan and Moens, Marie-Francine},
+	year = {2016},
+	pages = {953--994},
+	file = {Fulltext:/Users/mario/Zotero/storage/DF26LCTD/Vulić und Moens - 2016 - Bilingual distributed word representations from do.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/UN3RUE5Y/10997.html:text/html}
+}
+
+@inproceedings{xing_normalized_2015,
+	title = {Normalized word embedding and orthogonal transform for bilingual word translation},
+	booktitle = {Proceedings of the 2015 {Conference} of the {North} {American} {Chapter} of the {Association} for {Computational} {Linguistics}: {Human} {Language} {Technologies}},
+	author = {Xing, Chao and Wang, Dong and Liu, Chao and Lin, Yiye},
+	year = {2015},
+	pages = {1006--1011},
+	file = {Fulltext:/Users/mario/Zotero/storage/W6XGG8LG/Xing et al. - 2015 - Normalized word embedding and orthogonal transform.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/TKYHHWZZ/Xing et al. - 2015 - Normalized word embedding and orthogonal transform.pdf:application/pdf}
+}
+
+@inproceedings{guo_cross-lingual_2015,
+	title = {Cross-lingual dependency parsing based on distributed representations},
+	volume = {1},
+	booktitle = {Proceedings of the 53rd {Annual} {Meeting} of the {Association} for {Computational} {Linguistics} and the 7th {International} {Joint} {Conference} on {Natural} {Language} {Processing} ({Volume} 1: {Long} {Papers})},
+	author = {Guo, Jiang and Che, Wanxiang and Yarowsky, David and Wang, Haifeng and Liu, Ting},
+	year = {2015},
+	pages = {1234--1244},
+	file = {Fulltext:/Users/mario/Zotero/storage/D6Q9WBXY/Guo et al. - 2015 - Cross-lingual dependency parsing based on distribu.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/K8NNEQ6H/Guo et al. - 2015 - Cross-lingual dependency parsing based on distribu.pdf:application/pdf}
+}
+
+@inproceedings{sogaard_inverted_2015,
+	title = {Inverted indexing for cross-lingual {NLP}},
+	booktitle = {The 53rd {Annual} {Meeting} of the {Association} for {Computational} {Linguistics} and the 7th {International} {Joint} {Conference} of the {Asian} {Federation} of {Natural} {Language} {Processing} ({ACL}-{IJCNLP} 2015)},
+	author = {Søgaard, Anders and Agić, Željko and Alonso, Héctor Martínez and Plank, Barbara and Bohnet, Bernd and Johannsen, Anders},
+	year = {2015},
+	file = {Fulltext:/Users/mario/Zotero/storage/UZN66Q7M/Søgaard et al. - 2015 - Inverted indexing for cross-lingual NLP.pdf:application/pdf;Snapshot:/Users/mario/Zotero/storage/26MECM8N/Søgaard et al. - 2015 - Inverted indexing for cross-lingual NLP.pdf:application/pdf}
 }
\ No newline at end of file
-- 
GitLab