diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py
index 0ee0be9ccaafaf891dca6fd03a22bdb9ba551f2c..7ea5a8cc18f473cad1e42559661ce5fe79f51533 100644
--- a/code_mario/clef18_task1_data.py
+++ b/code_mario/clef18_task1_data.py
@@ -13,25 +13,29 @@ class Clef18Task1Data(LoggingMixin):
     def __init__(self):
         LoggingMixin.__init__(self, self.__class__.__name__)
 
-    def read_train_certifcates_by_language(self, lang: str) -> DataFrame:
-        if lang == "it":
+    def read_train_certifcates_by_id(self, id: str) -> DataFrame:
+        if id == "it":
             return self.read_it_train_certificates()
-        elif lang == "hu":
+        elif id == "hu":
             return self.read_hu_train_certificates()
-        elif lang == "fr":
+        elif id == "fr":
             return self.read_fr_train_certificates()
+        elif id == "all-con":
+            return self.read_all_con_certificates()
         else:
-            raise AssertionError("Unsupported language: " + lang)
+            raise AssertionError("Unsupported language: " + id)
 
-    def read_dictionary_by_language(self, lang: str) -> DataFrame:
-        if lang == "it":
+    def read_dictionary_by_id(self, id: str) -> DataFrame:
+        if id == "it":
             return self.read_it_dictionary()
-        elif lang == "hu":
+        elif id == "hu":
             return self.read_hu_dictionary()
-        elif lang == "fr":
+        elif id == "fr":
             return self.read_fr_dictionary()
+        elif id == "all-con":
+            return self.read_all_con_dictionary()
         else:
-            raise AssertionError("Unsupported language: " + lang)
+            raise AssertionError("Unsupported language: " + id)
 
 
     # --------------------------------------------------------------------------------
@@ -83,6 +87,18 @@ class Clef18Task1Data(LoggingMixin):
 
     # --------------------------------------------------------------------------------
 
+    def read_all_con_dictionary(self) -> DataFrame:
+        return pd.concat([self.read_fr_dictionary(),
+                          self.read_it_dictionary(),
+                          self.read_hu_dictionary()])
+
+    def read_all_con_certificates(self) -> DataFrame:
+        return pd.concat([self.read_fr_train_certificates(),
+                          self.read_it_train_certificates(),
+                          self.read_hu_train_certificates()])
+
+    # --------------------------------------------------------------------------------
+
     def filter_single_code_lines(self, certificate_df: DataFrame) -> DataFrame:
         multi_code_lines = [key for key, group in groupby(certificate_df.index.values) if len(list(group)) > 1]
         self.logger.info("Start filtering %s lines with multiple codes", len(multi_code_lines))
diff --git a/code_mario/clef18_task1_emb1.py b/code_mario/clef18_task1_emb1.py
index cc9529225480f80f4edb5d6539a2eb276fac0aab..c5b380efebe6722e93c030a15ae323759e2d794e 100644
--- a/code_mario/clef18_task1_emb1.py
+++ b/code_mario/clef18_task1_emb1.py
@@ -33,7 +33,7 @@ import ft_embeddings
 from app_context import AppContext
 from clef18_task1_data import Clef18Task1Data
 from dnn_classifiers import NeuralNetworkClassifiers as nnc
-from ft_embeddings import FastTextEmbeddings
+from ft_embeddings import FastTextEmbeddings, FastTextModel
 from preprocessing import DataPreparationUtil as pdu
 from keras_extension import KerasUtil as ku
 from util import LoggingMixin
@@ -80,7 +80,7 @@ class Clef18Task1Emb1(LoggingMixin):
     def __init__(self):
         LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)
 
-    def train_embedding_model(self, config: Configuration, ft_model: FastText, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model:
+    def train_embedding_model(self, config: Configuration, ft_model: FastTextModel, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model:
         self.logger.info("Start building training pairs")
         train_pair_data = self.build_pairs(config.train_cert_df, config.dict_df, neg_sampling_strategy)
         self.logger.info("Label distribution:\n%s", train_pair_data["Label"].value_counts())
@@ -296,14 +296,14 @@ class Clef18Task1Emb1(LoggingMixin):
 
         return training_data, test_data
 
-    def build_embedding_model(self, word_index: Dict, ft_model: FastText, max_cert_length: int, max_dict_length: int):
+    def build_embedding_model(self, word_index: Dict, ft_model: FastTextModel, max_cert_length: int, max_dict_length: int):
         # TODO: Make hyper-parameter configurable!
         # TODO: Think about using CNNs instead of RNNs since we can have multiple ICD-10 per line!
 
         embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size))
         for word, i in word_index.items():
             try:
-                embedding_vector = ft_model[word]
+                embedding_vector = ft_model.lookup(word)
                 if embedding_vector is not None:
                     # words not found in embedding index will be all-zeros.
                     embedding_matrix[i] = embedding_vector
@@ -333,7 +333,7 @@ class Clef18Task1Emb1(LoggingMixin):
 
         return model
 
-    def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastText, train_ratio: float, val_ratio: float,
+    def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastTextModel, train_ratio: float, val_ratio: float,
                          strat_column: str, samples: int=None, stratified_splits: bool=False) -> Configuration:
 
         if samples:
@@ -635,7 +635,7 @@ if __name__ == "__main__":
     subparsers = parser.add_subparsers(dest="mode")
 
     train_emb_parser = subparsers.add_parser("train-emb")
-    train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"])
+    train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu", "all-con"])
     train_emb_parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
     train_emb_parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
     train_emb_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
@@ -669,10 +669,10 @@ if __name__ == "__main__":
     AppContext.initialize_by_app_name(Clef18Task1Emb1.__name__ + "-" + args.mode)
 
     clef_data = Clef18Task1Data()
-    dictionary = clef_data.read_dictionary_by_language(args.lang)
+    dictionary = clef_data.read_dictionary_by_id(args.lang)
     #dictionary = dictionary.sample(1200)
 
-    certificates = clef_data.read_train_certifcates_by_language(args.lang)
+    certificates = clef_data.read_train_certifcates_by_id(args.lang)
     certificates = clef_data.filter_single_code_lines(certificates)
     certificates = clef_data.add_masked_icd10_column(certificates, 10)
 
@@ -680,7 +680,7 @@ if __name__ == "__main__":
     #ft_model = FastText(sentences, min_count=1)
 
     ft_embeddings = FastTextEmbeddings()
-    ft_model = ft_embeddings.load_embeddings_by_language(args.lang)
+    ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
 
     clef18_task1 = Clef18Task1Emb1()
     clef18_task1.save_arguments(args)
diff --git a/code_mario/clef18_task1_emb2.py b/code_mario/clef18_task1_emb2.py
index 1d6ddb1135593ff67190eb6fc23aa1f1d9282890..1eeb7d8e46ece374856cefebda986fa6987eaf0f 100644
--- a/code_mario/clef18_task1_emb2.py
+++ b/code_mario/clef18_task1_emb2.py
@@ -33,7 +33,7 @@ import ft_embeddings
 from app_context import AppContext
 from clef18_task1_data import Clef18Task1Data
 from dnn_classifiers import NeuralNetworkClassifiers as nnc
-from ft_embeddings import FastTextEmbeddings
+from ft_embeddings import FastTextEmbeddings, FastTextModel
 from preprocessing import DataPreparationUtil as pdu
 from keras_extension import KerasUtil as ku
 from util import LoggingMixin
@@ -77,7 +77,7 @@ class Clef18Task1Emb2(LoggingMixin):
     def __init__(self):
         LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)
 
-    def train_embedding_model(self, config: Configuration, ft_model: FastText, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model:
+    def train_embedding_model(self, config: Configuration, ft_model: FastTextModel, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model:
         self.logger.info("Start building training pairs")
         train_pair_data = self.build_pairs(config.train_df, neg_sampling_strategy)
         self.logger.info("Label distribution:\n%s", train_pair_data["Label"].value_counts())
@@ -282,14 +282,14 @@ class Clef18Task1Emb2(LoggingMixin):
 
         return training_data, test_data
 
-    def build_embedding_model(self, word_index: Dict, ft_model: FastText, conf: Configuration):
+    def build_embedding_model(self, word_index: Dict, ft_model: FastTextModel, conf: Configuration):
         # TODO: Make hyper-parameter configurable!
         # TODO: Think about using CNNs instead of RNNs since we can have multiple ICD-10 per line!
 
         embedding_matrix = np.zeros((len(word_index) + 1, ft_model.vector_size))
         for word, i in word_index.items():
             try:
-                embedding_vector = ft_model[word]
+                embedding_vector = ft_model.lookup(word)
                 if embedding_vector is not None:
                     # words not found in embedding index will be all-zeros.
                     embedding_matrix[i] = embedding_vector
@@ -321,7 +321,7 @@ class Clef18Task1Emb2(LoggingMixin):
 
         return model
 
-    def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastText, train_ratio: float, val_ratio: float,
+    def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastTextModel, train_ratio: float, val_ratio: float,
                          strat_column: str, samples: int=None, stratified_splits: bool=False) -> Configuration:
 
         cert_df = cert_df[["RawText", "ICD10"]]
@@ -584,7 +584,7 @@ if __name__ == "__main__":
     subparsers = parser.add_subparsers(dest="mode")
 
     train_emb_parser = subparsers.add_parser("train-emb")
-    train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"])
+    train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu", "all-con"])
     train_emb_parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
     train_emb_parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
     train_emb_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
@@ -618,10 +618,10 @@ if __name__ == "__main__":
     AppContext.initialize_by_app_name(Clef18Task1Emb2.__name__ + "-" + args.mode)
 
     clef_data = Clef18Task1Data()
-    dictionary = clef_data.read_dictionary_by_language(args.lang)
+    dictionary = clef_data.read_dictionary_by_id(args.lang)
     #dictionary = dictionary.sample(1200)
 
-    certificates = clef_data.read_train_certifcates_by_language(args.lang)
+    certificates = clef_data.read_train_certifcates_by_id(args.lang)
     #certificates = clef_data.filter_single_code_lines(certificates)
     #certificates = clef_data.add_masked_icd10_column(certificates, 10)
 
@@ -629,7 +629,7 @@ if __name__ == "__main__":
     #ft_model = FastText(sentences, min_count=1)
 
     ft_embeddings = FastTextEmbeddings()
-    ft_model = ft_embeddings.load_embeddings_by_language(args.lang)
+    ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
 
     clef18_task1 = Clef18Task1Emb2()
     clef18_task1.save_arguments(args)
diff --git a/code_mario/ft_embeddings.py b/code_mario/ft_embeddings.py
index 644b5b51bd426ca3f839bff705bcd9eadf2963e9..59bc279584ac83ab4e2255261143b6d26c114760 100644
--- a/code_mario/ft_embeddings.py
+++ b/code_mario/ft_embeddings.py
@@ -1,22 +1,54 @@
+import numpy as np
+
+from typing import List
 from gensim.models import FastText
 
+from app_context import AppContext
 from util import LoggingMixin
 
 
+class FastTextModel(LoggingMixin):
+
+    def __init__(self, name: str, ft_models: List[FastText]):
+        LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)
+        self.name = name
+        self.ft_models = ft_models
+        self.vector_size = sum([ft_model.vector_size for ft_model in self.ft_models])
+
+    def lookup(self, word: str):
+        embeddings = []
+        for ft_model in self.ft_models:
+            try:
+                embeddings.append(ft_model[word])
+            except KeyError:
+                self.logger.warn("Can't create embedding for " + word)
+                embeddings.append(np.zeros(ft_model.vector_size))
+
+        if len(embeddings) == 1:
+            return embeddings[0]
+        else:
+            return np.concatenate(embeddings)
+
+
 class FastTextEmbeddings(LoggingMixin):
 
     def __init__(self):
-        LoggingMixin.__init__(self, __class__.__name__)
-
-    def load_embeddings_by_language(self, lang: str) -> FastText:
-        if lang == "it":
-            return self.load_it_embeddings()
-        elif lang == "hu":
-            return self.load_hu_embeddings()
-        elif lang == "fr":
-            return self.load_fr_embeddings()
+        LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)
+
+    def load_embeddings_by_id(self, id: str) -> FastTextModel:
+        if id == "it":
+            return FastTextModel("it", [self.load_it_embeddings()])
+        elif id == "hu":
+            return FastTextModel("hu", [self.load_hu_embeddings()])
+        elif id == "fr":
+            return FastTextModel("fr", [self.load_fr_embeddings()])
+        elif id == "all-con":
+            return FastTextModel("all-con", [self.load_fr_embeddings(),
+                                             self.load_it_embeddings(),
+                                             self.load_hu_embeddings()])
+
         else:
-            raise AssertionError("Unsupported language: " + lang)
+            raise AssertionError("Unsupported language: " + id)
 
 
     # ------------------------------------------------------------------------------------
diff --git a/code_mario/preprocessing.py b/code_mario/preprocessing.py
index 47a2a6bb397910687cd39f80e44269cb5135bee4..9cba3822b3cf044a8db9bdc91a19fcbce2d2bff2 100644
--- a/code_mario/preprocessing.py
+++ b/code_mario/preprocessing.py
@@ -88,6 +88,7 @@ class DataPreparationUtil(object):
         return MapFunction(icd10_column, _mask_icd10, target_column)
 
 
+
 class FitMixin(object):
 
     def fit(self, data, y=None):
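
For context, a minimal usage sketch of the `FastTextModel` wrapper and the new `all-con` id introduced above. This is not part of the change itself; it assumes `AppContext` has been initialized and that the per-language fastText binaries resolved by `load_fr_embeddings`/`load_it_embeddings`/`load_hu_embeddings` are available on disk. The app name and lookup word are purely illustrative:

```python
from app_context import AppContext
from ft_embeddings import FastTextEmbeddings

# Hypothetical app name, only needed so the default log file exists.
AppContext.initialize_by_app_name("ft-embeddings-demo")

# "all-con" wraps the French, Italian and Hungarian fastText models in one
# FastTextModel; its vector_size is the sum of the component model sizes.
ft_model = FastTextEmbeddings().load_embeddings_by_id("all-con")

# lookup() returns one vector per component model, concatenated, and falls
# back to an all-zero vector for any model that cannot embed the word.
vector = ft_model.lookup("douleur")
assert vector.shape == (ft_model.vector_size,)
```

Concatenation keeps each language's embedding subspace intact, at the cost of roughly tripling the embedding dimension for the shared `all-con` model; the `embedding_matrix` built in `build_embedding_model` grows accordingly.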