From d1d6ae61ddb5b23cefce6a94f54c44541ba370c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20Sa=CC=88nger?= <mario.saenger@student.hu-berlin.de> Date: Fri, 20 Jul 2018 11:45:35 +0200 Subject: [PATCH] Revise dictionary loading --- code_jurica/seq2seq_ft_multi.py | 172 ++++++++++++++++++++++++-------- code_jurica/util.py | 37 ++++--- 2 files changed, 156 insertions(+), 53 deletions(-) diff --git a/code_jurica/seq2seq_ft_multi.py b/code_jurica/seq2seq_ft_multi.py index 2fc31d9..407fa67 100644 --- a/code_jurica/seq2seq_ft_multi.py +++ b/code_jurica/seq2seq_ft_multi.py @@ -2,6 +2,7 @@ import argparse import os import random + import numpy as np import pandas as pd import pickle @@ -12,13 +13,16 @@ import tensorflow as tf from gensim.models import FastText, KeyedVectors #from fastText import FastText -from keras import Model, Input +from keras import Model, Input, regularizers from keras.callbacks import CSVLogger, ModelCheckpoint, TensorBoard from keras.layers import Bidirectional, LSTM, Dense, Embedding, Dropout from keras.optimizers import Adam, SGD from keras.preprocessing.sequence import pad_sequences from keras.preprocessing.text import Tokenizer -from typing import Dict, Callable, List, Tuple +from keras.regularizers import l2, l1 +from typing import Dict, Callable, List, Tuple, Counter + +from pandas import DataFrame from tqdm import tqdm from keras.utils import np_utils @@ -114,17 +118,29 @@ class FTMultilingualEmbeddings(LoggingMixin): return source_corpus, target_corpus - def prepare_dict_data(self): + def prepare_dict_data(self, oversampling: int = None, undersampling: int = None): self.logger.info('Start loading dictionary data') data_preparator = DataPreparator() - all_dicts = { + dict_per_lang = { 'FR' : data_preparator.loadDictionary('FR', DICT_FR), 'IT' : data_preparator.loadDictionary('IT', DICT_IT), 'HU' : data_preparator.loadDictionary('HU', DICT_HU) } - corpus = self.build_corpus(all_dicts, lambda line: line[0], lambda line: line[1]) + full_data_set = pd.concat([dict for _, dict in dict_per_lang.items()]) + + if undersampling is not None and oversampling is not None: + raise Exception("Either set under- or oversampling, but not both!") + + if undersampling: + dict_per_lang = self.undersample(dict_per_lang, undersampling) + + if oversampling: + dict_per_lang = self.oversample(dict_per_lang, oversampling) + + self.logger.info("Start preparing dictionary data") + corpus = self.build_corpus(full_data_set, lambda line: line[0], lambda line: line[1]) self.logger.info('Finished loading of dictionary data:') self.logger.info(" Vocabulary size: %s", len(corpus.vocabulary)) @@ -136,52 +152,112 @@ class FTMultilingualEmbeddings(LoggingMixin): return corpus - def build_corpus(self, corpora: Dict, text_extractor: Callable, label_extractor: Callable) -> Corpus: - tokenizer = TokenizePreprocessor() - max_length = 0 + def oversample(self, dictionaries: Dict[str, List], min_frequency: int) -> Dict[str, List]: + self.logger.info("Start oversampling with minimal frequency %s", min_frequency) + doc_counts = [] + + for lang, documents in dictionaries.items(): + samples = [] + label_counter = Counter([doc[1] for doc in documents]) + for label, frequency in label_counter.items(): + items_to_sample = min_frequency - frequency + if items_to_sample > 0: + sampling_base = [doc for doc in documents if doc[1] == label] + samples = samples + [sampling_base[i] for i in random.sample(range(len(sampling_base)), items_to_sample)] + + dictionaries[lang] = documents + samples + + doc_counts.append((lang, 
len(dictionaries[lang]), len(documents))) + + return dictionaries - corpus_texts = [] - corpus_labels = [] - vocabulary = dict() - rev_vocabulary = dict() + def undersample(self, dictionaries: Dict[str, List], min_frequency: int) -> Dict[str, List]: + self.logger.info("Start undersampling instances with minimal frequency %s", min_frequency) + doc_counts = [] - for lang, corpus in corpora.items(): - texts = [text_extractor(line) for line in corpus] - labels = [label_extractor(line) for line in corpus] + for lang, documents in dictionaries.items(): + prev_doc_count = len(documents) - tokenized_texts = tokenizer.transform(texts) - tokenized_texts = [[ lang + '#' + token.strip() for token in text] for text in tokenized_texts] - max_length = max(max_length, max(len(line) for line in tokenized_texts)) + label_counter = Counter([doc[1] for doc in documents]) + documents = [doc for doc in documents if label_counter[doc[1]] >= min_frequency] + dictionaries[lang] = documents - distinct_tokens = [token for token in list(set(flatten(tokenized_texts))) if token.strip()] + doc_counts.append((lang, prev_doc_count, len(documents))) - vocab_size = len(vocabulary) - lang_vocabulary = {token : vocab_size + i + 1 for i, token in enumerate(distinct_tokens)} - lang_rev_vocabulary = {vocab_size + i + 1: token for i, token in enumerate(distinct_tokens)} + doc_count_str = " | ".join(["{}: {} -> {}".format(lang, prev_count, cur_count) for lang, prev_count, cur_count in doc_counts]) + self.logger.info("Finished undersampling: %s", doc_count_str) + return dictionaries - texts = tokenized_texts + def build_corpus(self, data: DataFrame, text_extractor: Callable, label_extractor: Callable) -> Corpus: + tokenizer = TokenizePreprocessor() + max_length = 0 + + texts = [text_extractor(row) for _, row in data.iterrows()] + labels = [label_extractor(row) for _, row in data.iterrows()] + langs = [row["lang"] for _, row in data.iterrows()] - corpus_texts = corpus_texts + texts - corpus_labels = corpus_labels + labels + tokenized_texts = tokenizer.transform(texts) + tokenized_texts = zip(langs, tokenized_texts) + tokenized_texts = [[ lang + '#' + token.strip() for token in text] for lang, text in tokenized_texts] - vocabulary.update(lang_vocabulary) - rev_vocabulary.update(lang_rev_vocabulary) + max_length = max(max_length, max(len(text) for text in tokenized_texts)) + + distinct_tokens = [token for token in list(set(flatten([text for text in tokenized_texts]))) if token.strip()] + + vocabulary = {token : i + 1 for i, token in enumerate(distinct_tokens)} + rev_vocabulary = {i + 1: token for i, token in enumerate(distinct_tokens)} keras_tokenizer = Tokenizer() keras_tokenizer.word_index = vocabulary encoder = LabelEncoder() - encoder.fit(corpus_labels) - - return Corpus(corpus_texts, corpus_labels, vocabulary, rev_vocabulary, max_length, keras_tokenizer, encoder) + encoder.fit(labels) + + return Corpus(texts, labels, vocabulary, rev_vocabulary, max_length, keras_tokenizer, encoder) + + # def build_corpus(self, corpora: Dict, text_extractor: Callable, label_extractor: Callable) -> Corpus: + # tokenizer = TokenizePreprocessor() + # max_length = 0 + # + # corpus_texts = [] + # corpus_labels = [] + # vocabulary = dict() + # rev_vocabulary = dict() + # + # for lang, corpus in corpora.items(): + # texts = [text_extractor(line) for line in corpus] + # labels = [label_extractor(line) for line in corpus] + # + # tokenized_texts = tokenizer.transform(texts) + # tokenized_texts = [[ lang + '#' + token.strip() for token in text] for text in 
tokenized_texts] + # max_length = max(max_length, max(len(line) for line in tokenized_texts)) + # + # distinct_tokens = [token for token in list(set(flatten(tokenized_texts))) if token.strip()] + # + # vocab_size = len(vocabulary) + # lang_vocabulary = {token : vocab_size + i + 1 for i, token in enumerate(distinct_tokens)} + # lang_rev_vocabulary = {vocab_size + i + 1: token for i, token in enumerate(distinct_tokens)} + # + # texts = tokenized_texts + # + # corpus_texts = corpus_texts + texts + # corpus_labels = corpus_labels + labels + # + # vocabulary.update(lang_vocabulary) + # rev_vocabulary.update(lang_rev_vocabulary) + # + # keras_tokenizer = Tokenizer() + # keras_tokenizer.word_index = vocabulary + # + # encoder = LabelEncoder() + # encoder.fit(corpus_labels) + # + # return Corpus(corpus_texts, corpus_labels, vocabulary, rev_vocabulary, max_length, keras_tokenizer, encoder) def train_icd10_model(self, icd10_model: Model, dict_corpus: Corpus, batch_size: int, epochs: int): # Make sure the embeddings are loaded self.initialize_ft_models() - labels_encoded = dict_corpus.label_encoder.transform(dict_corpus.labels) - labels_one_hot = np_utils.to_categorical(labels_encoded) - source_word_sequence = dict_corpus.tokenizer.texts_to_sequences(dict_corpus.texts) #FIXME: Different to previous sequencing? @@ -192,10 +268,9 @@ class FTMultilingualEmbeddings(LoggingMixin): self.logger.info("Splitting data set into train and validation") data_train, data_val, labels_train, labels_val = \ - train_test_split(token_sequences, labels_one_hot, - test_size=0.2, random_state=777, stratify=dict_corpus.labels) + train_test_split(token_sequences, dict_corpus.labels, test_size=0.2, random_state=777, stratify=dict_corpus.labels) - self.logger.info("Training instances: %s, Validation instances: %s", len(data_train), len(data_val)) + self.logger.info("Training instances: %s, validation instances: %s", len(data_train), len(data_val)) self.logger.info("Persisting train and test splits") IOUtil.save_object(data_train, self.output_folder, "data_train.obj") @@ -203,9 +278,28 @@ class FTMultilingualEmbeddings(LoggingMixin): IOUtil.save_nparray(labels_train, self.output_folder, "labels_train.npy") IOUtil.save_nparray(labels_val, self.output_folder, "labels_val.npy") + target_classes = len(dictionary_data.label_encoder.classes_) + + labels_train = dict_corpus.label_encoder.transform(labels_train) + labels_train = np_utils.to_categorical(labels_train, num_classes=target_classes) + + labels_val = dict_corpus.label_encoder.transform(labels_val) + labels_val = np_utils.to_categorical(labels_val, num_classes=target_classes) + tensorboard_folder = os.path.join(self.output_folder, "tensorboard-log") os.makedirs(tensorboard_folder, exist_ok=True) + embedding_meta_file = os.path.join(tensorboard_folder, "metadata.tsv") + with open(embedding_meta_file, "w", encoding="utf-8") as output_writer: + sorted_items = sorted(dict_corpus.vocabulary.items(), key=lambda e: e[1]) + output_writer.write("Word\n") + output_writer.writelines(["%s\n" % item[0] for item in sorted_items]) + output_writer.close() + + # emb_data = np.zeros((len(dict_corpus.vocabulary), 26)) + # for i in range(len(dict_corpus.vocabulary)): + # emb_data[i][0] = i+1 + callbacks = [ CSVLogger(os.path.join(self.output_folder, "training.log")), ModelCheckpoint("model-{epoch:02d}.hdf5", save_best_only=True), @@ -227,19 +321,19 @@ class FTMultilingualEmbeddings(LoggingMixin): input_length=dict_corpus.max_length, trainable=True, mask_zero=True, name="emb_layer") embeddings = 
embedding_layer(inputs) - bi_lstm_layer = Bidirectional(LSTM(lstm_size, dropout=0.75, return_sequences=True, unroll=True)) + bi_lstm_layer = Bidirectional(LSTM(lstm_size, return_sequences=True, unroll=True)) bi_lstm_layer_out = bi_lstm_layer(embeddings) attention = Attention()(bi_lstm_layer_out) - dense_out = Dense(400)(attention) + dense_out = Dense(300)(attention) dense_out_dropout = Dropout(rate=0.5)(dense_out) softmax_layer = Dense(output_size, activation='softmax') softmax_out = softmax_layer(dense_out_dropout) model = Model(inputs=inputs, outputs=softmax_out) - optimizer = Adam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0) + optimizer = Adam(lr=1e-3) #optimizer = SGD() model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy']) diff --git a/code_jurica/util.py b/code_jurica/util.py index 031227b..70ebc5a 100644 --- a/code_jurica/util.py +++ b/code_jurica/util.py @@ -14,6 +14,7 @@ import pickle from collections import Counter from nltk import TreebankWordTokenizer +from pandas import DataFrame from sklearn.base import BaseEstimator, TransformerMixin from unidecode import unidecode from tqdm import tqdm @@ -37,15 +38,15 @@ nonword_behind = "(?<=\W)" # model_HU = load_model('data/embeddings/cc.hu.300.bin') # model_IT = load_model('data/embeddings/cc.it.300.bin') -dummy_model = load_model("data/embeddings/wiki.pfl.bin") -global model_FR -model_FR= dummy_model - -global model_HU -model_HU = dummy_model - -global model_IT -model_IT = dummy_model +# dummy_model = load_model("data/embeddings/wiki.pfl.bin") +# global model_FR +# model_FR= dummy_model +# +# global model_HU +# model_HU = dummy_model +# +# global model_IT +# model_IT = dummy_model regex_concept_dict = [ # biomedical @@ -297,8 +298,12 @@ class DataPreparator(object): return data, errors - def loadDictionary(self, lang: str, dict_file: str): - dictionary_entries = [] + def loadDictionary(self, lang: str, dict_file: str) -> DataFrame: + data = { + "text": [], + "label" : [], + "lang" : [] + } df = pd.read_csv(dict_file, sep=';', dtype=str, encoding="utf8") for index, row in df.iterrows(): @@ -310,12 +315,16 @@ class DataPreparator(object): label = str(row['Icd1']).strip().upper()[:4] if not isinstance(text, float): - dictionary_entries.append([text.lower(), label, lang]) + data["text"].append(text.lower()) + data["label"].append(label) + data["lang"].append(lang) else: if not math.isnan(text): - dictionary_entries.append([text.lower(), label, lang]) + data["text"].append(text.lower()) + data["label"].append(label) + data["lang"].append(lang) - return dictionary_entries + return pd.DataFrame(data) def loadDictionaries(self, unbalanced=False, oversampled=False, shuffle: bool = False): -- GitLab
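
A minimal, self-contained sketch of the data flow this patch introduces: DataPreparator.loadDictionary() now returns one pandas DataFrame per language with "text", "label" and "lang" columns, the per-language frames are optionally under- or oversampled by ICD-10 label frequency, and everything is concatenated with pd.concat before corpus building. The helper names (make_language_frame), the toy rows, the min_frequency value and the use of DataFrame.sample with replacement are illustrative assumptions, not code from the repository; the patch itself samples existing rows via random.sample inside the oversample()/undersample() methods.

    from collections import Counter

    import pandas as pd


    def make_language_frame(lang, rows):
        # Stand-in for DataPreparator.loadDictionary(): one frame per language with
        # the same three columns the patch builds ("text", "label", "lang").
        return pd.DataFrame({
            "text": [text.lower() for text, _ in rows],
            "label": [label for _, label in rows],
            "lang": [lang] * len(rows),
        })


    def undersample(frames, min_frequency):
        # Drop rows whose label occurs fewer than min_frequency times within a language.
        result = {}
        for lang, frame in frames.items():
            counts = Counter(frame["label"])
            keep = frame["label"].map(lambda label: counts[label] >= min_frequency)
            result[lang] = frame[keep]
        return result


    def oversample(frames, min_frequency):
        # Duplicate rows of rare labels until each label reaches min_frequency per
        # language. The patch samples from the existing rows without replacement;
        # this sketch samples with replacement so it also covers labels that have
        # fewer rows than the gap to fill.
        result = {}
        for lang, frame in frames.items():
            counts = Counter(frame["label"])
            extra = []
            for label, frequency in counts.items():
                missing = min_frequency - frequency
                if missing > 0:
                    pool = frame[frame["label"] == label]
                    extra.append(pool.sample(n=missing, replace=True, random_state=777))
            result[lang] = pd.concat([frame] + extra, ignore_index=True)
        return result


    if __name__ == "__main__":
        frames = {
            "FR": make_language_frame("FR", [("Arrêt cardiaque", "I469"), ("Pneumonie", "J189")]),
            "IT": make_language_frame("IT", [("Arresto cardiaco", "I469")]),
            "HU": make_language_frame("HU", [("Tüdőgyulladás", "J189")]),
        }
        frames = oversample(frames, min_frequency=2)
        full_data_set = pd.concat(frames.values(), ignore_index=True)
        print(full_data_set[["lang", "label", "text"]])

With min_frequency=2 every label in the toy frames gains one duplicate per language before concatenation, which mirrors the intent of the patch's oversample() step: pad rare ICD-10 codes per language so the later stratified train/validation split and the softmax classifier see each class more than once.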