diff --git a/.gitignore b/.gitignore
index 8592a5c504e4fae882bc8952e619e24295cefcb6..2f35c766276ca111e240edd1bf0798c597f4972d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,6 @@
 *.pyc
 
+code_jurica/models/
+
 code_mario/data
diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py
index 62ef9201c8f16234f1ecac3bfd6a809b0d578f6d..a900511f67ff8f22e30d61f2a9debbd98ac24ecc 100644
--- a/code_mario/clef18_task1_data.py
+++ b/code_mario/clef18_task1_data.py
@@ -13,6 +13,28 @@ class Clef18Task1Data(LoggingMixin):
 
     def __init__(self):
         LoggingMixin.__init__(self, self.__class__.__name__)
 
+    def read_train_certificates_by_language(self, lang: str) -> DataFrame:
+        if lang == "it":
+            return self.read_it_train_certificates()
+        elif lang == "hu":
+            return self.read_hu_train_certificates()
+        elif lang == "fr":
+            return self.read_fr_train_certificates()
+        else:
+            raise AssertionError("Unsupported language: " + lang)
+
+    def read_dictionary_by_language(self, lang: str) -> DataFrame:
+        if lang == "it":
+            return self.read_it_dictionary()
+        elif lang == "hu":
+            return self.read_hu_dictionary()
+        elif lang == "fr":
+            return self.read_fr_dictionary()
+        else:
+            raise AssertionError("Unsupported language: " + lang)
+
+    # --------------------------------------------------------------------------------
+
     def read_it_train_certificates(self) -> DataFrame:
         base_folder = "data/train/IT/training/raw/corpus/"
 
@@ -71,23 +93,35 @@ class Clef18Task1Data(LoggingMixin):
 
         return certificate_df
 
+    def add_masked_icd10_column(self, certificate_df: DataFrame, min_support: int, mask_code: str = "RARE-ICD10") -> DataFrame:
+        code_frequency_distribution = certificate_df["ICD10"].value_counts()
+        icd_masker = pdu.mask_icd10("ICD10", "ICD10_masked", code_frequency_distribution, min_support, mask_code)
+
+        certificate_df = icd_masker.fit_transform(certificate_df)
+
+        num_infrequent_codes = certificate_df["ICD10_masked"].value_counts()[mask_code]
+        self.logger.info("Added masked ICD10 code column. Found %s lines whose code has support below %s", num_infrequent_codes, min_support)
+
+        return certificate_df
+
     # --------------------------------------------------------------------------------
 
     def _read_certificates(self, calculees_file: str, brutus_file: str) -> DataFrame:
         self.logger.info("Reading calculees file from %s", calculees_file)
-        calculees_data = pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"])
+        calculees_data = pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"], skipinitialspace=True)
         self.logger.info("Found %s death certificate lines", len(calculees_data))
 
         self.logger.info("Reading brutus file from %s", brutus_file)
         brutus_data = pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"])
 
         joined_data = brutus_data.join(calculees_data, lsuffix="_b", rsuffix="_c")
+        joined_data["ICD10"] = joined_data["ICD10"].astype(str)
 
         return joined_data[["RawText", "ICD10"]]
 
     def _read_icd10_dictionary(self, file: str, encoding: str) -> DataFrame:
         self.logger.info("Reading ICD-10 dictionary from %s", file)
-        dictionary_data = pd.read_csv(file, sep=";", encoding=encoding)
+        dictionary_data = pd.read_csv(file, sep=";", encoding=encoding, skipinitialspace=True)
 
         num_dictionary_entries = len(dictionary_data)
         self.logger.info("Found %s dictionary entries", num_dictionary_entries)
@@ -99,6 +133,8 @@ class Clef18Task1Data(LoggingMixin):
 
         dictionary_data.columns = ["ICD10", "Standardized", "DiagnosisText"]
 
+        dictionary_data["ICD10"] = dictionary_data["ICD10"].astype(str)
+
         self.logger.info("Removed %s duplicates from dictionary", num_dictionary_entries - len(dictionary_data))
 
         return dictionary_data
@@ -138,12 +174,22 @@ if __name__ == "__main__":
     clef_task_data = Clef18Task1Data()
 
     certificates = clef_task_data.read_it_train_certificates()
-    certificates = pdu.extract_icd10_chapter("ICD10", "ICD10_chapter").fit_transform(certificates)
-    certificates = pdu.extract_icd10_subchapter("ICD10", "ICD10_subchapter").fit_transform(certificates)
+    clef_task_data.add_masked_icd10_column(certificates, 4, "RARE-ICD10")
+
+    it_dict = clef_task_data.read_it_dictionary()
+    fr_dict = clef_task_data.read_fr_dictionary()
+    hu_dict = clef_task_data.read_hu_dictionary()
+
+    print("HU: ", len(hu_dict["ICD10"].unique()))
+    print("IT: ", len(it_dict["ICD10"].unique()))
+    print("FR: ", len(fr_dict["ICD10"].unique()))
+    print("HU: ", hu_dict["ICD10"].value_counts())
 
-    print(certificates["ICD10_chapter"].value_counts())
-    print(certificates["ICD10_subchapter"].value_counts())
+    # certificates = pdu.extract_icd10_chapter("ICD10", "ICD10_chapter").fit_transform(certificates)
+    # certificates = pdu.extract_icd10_subchapter("ICD10", "ICD10_subchapter").fit_transform(certificates)
+    # print(certificates["ICD10_chapter"].value_counts())
+    # print(certificates["ICD10_subchapter"].value_counts())
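Note: a minimal sketch (not part of the patch) of the masking rule that the new `add_masked_icd10_column` / `pdu.mask_icd10` pair implements — codes whose corpus frequency is below `min_support` collapse into a single artificial `RARE-ICD10` class. The toy data is hypothetical:

```python
import pandas as pd

# Toy certificate lines: C509 occurs three times, X599 only once.
certificate_df = pd.DataFrame({"ICD10": ["C509", "C509", "C509", "X599"]})

value_counts = certificate_df["ICD10"].value_counts()
min_support, mask_code = 2, "RARE-ICD10"

# Same decision rule as mask_icd10: keep a code only if it has enough support.
certificate_df["ICD10_masked"] = certificate_df["ICD10"].map(
    lambda icd10: icd10 if value_counts.get(str(icd10), 0) >= min_support else mask_code)

print(certificate_df["ICD10_masked"].tolist())  # ['C509', 'C509', 'C509', 'RARE-ICD10']
```

This is also what makes `--strat_column ICD10_masked` viable further down: stratified splitting needs every class to occur more than once, which singleton ICD-10 codes would violate.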
diff --git a/code_mario/clef18_task1_v2.py b/code_mario/clef18_task1_v2.py
index 5ddb39766226d44bf1b6b8b433720c7f77717083..357137e2aade2722e9716841fe1a5caed4f8dc28 100644
--- a/code_mario/clef18_task1_v2.py
+++ b/code_mario/clef18_task1_v2.py
@@ -1,4 +1,6 @@
 import argparse
+from argparse import Namespace
+
 import numpy as np
 import pandas as pd
 import keras as k
@@ -16,14 +18,14 @@ from sklearn.dummy import DummyClassifier
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import SGDClassifier
 from sklearn.metrics import f1_score, accuracy_score
-from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
+from sklearn.model_selection import train_test_split
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import LabelEncoder
 from sklearn.svm import LinearSVC
 from sklearn.tree import DecisionTreeClassifier
 from tqdm import tqdm
-from typing import Tuple, Dict, List
+from typing import Tuple, Dict, List, Callable
 
 from sklearn.externals import joblib
 
 import ft_embeddings
@@ -77,10 +79,9 @@ class Clef18Task1V2(LoggingMixin):
 
     def __init__(self):
         LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)
 
-    def train_embedding_model(self, config: Configuration, ft_model: FastText,
-                              neg_samples: int, epochs: int, batch_size: int) -> Model:
+    def train_embedding_model(self, config: Configuration, ft_model: FastText, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model:
         self.logger.info("Start building training pairs")
-        train_pair_data = self.build_pairs(config.train_cert_df, config.dict_df, neg_samples)
+        train_pair_data = self.build_pairs(config.train_cert_df, config.dict_df, neg_sampling_strategy)
         self.logger.info("Label distribution:\n%s", train_pair_data["Label"].value_counts())
 
         self.logger.info("Start building embedding model")
@@ -97,7 +98,7 @@ class Clef18Task1V2(LoggingMixin):
 
         if config.val_cert_df is not None and len(config.test_cert_df) > 0:
             self.logger.info("Start creation of validation pairs")
-            val_pair_data = self.build_pairs(config.val_cert_df, config.dict_df, neg_samples)
+            val_pair_data = self.build_pairs(config.val_cert_df, config.dict_df, neg_sampling_strategy)
 
             val_cert_inputs = pad_sequences(val_pair_data["Cert_input"].values, maxlen=config.max_cert_length)
             val_dict_inputs = pad_sequences(val_pair_data["Dict_input"].values, maxlen=config.max_dict_length)
@@ -123,7 +124,7 @@ class Clef18Task1V2(LoggingMixin):
         self.logger.info("Start evaluation of embedding model!")
 
         self.logger.info("Start creation of test pairs")
-        val_pair_data = self.build_pairs(config.test_cert_df, config.dict_df, neg_samples)
+        val_pair_data = self.build_pairs(config.test_cert_df, config.dict_df, neg_sampling_strategy)
 
         test_cert_inputs = pad_sequences(val_pair_data["Cert_input"].values, maxlen=config.max_cert_length)
         test_dict_inputs = pad_sequences(val_pair_data["Dict_input"].values, maxlen=config.max_dict_length)
@@ -268,15 +269,11 @@ class Clef18Task1V2(LoggingMixin):
 
     def split_train_test(self, certificate_df: DataFrame, train_size: float, stratified_splits: bool, label_column: str) -> Tuple[DataFrame, DataFrame]:
         if stratified_splits:
-            splitter = StratifiedShuffleSplit(n_splits=1, train_size=train_size, random_state=42)
-            split = splitter.split(certificate_df, certificate_df[label_column])
+            self.logger.info("Creating stratified splits for column %s", label_column)
+            training_data, test_data = train_test_split(certificate_df, train_size=train_size, stratify=certificate_df[label_column])
         else:
-            splitter = ShuffleSplit(n_splits=1, train_size=train_size, random_state=42)
-            split = splitter.split(certificate_df)
-
-        for train_indices, test_indices in split:
-            training_data = certificate_df.iloc[train_indices]
-            test_data = certificate_df.iloc[test_indices]
+            self.logger.info("Creating non-stratified splits")
+            training_data, test_data = train_test_split(certificate_df, train_size=train_size)
 
         return training_data, test_data
@@ -318,18 +315,18 @@ class Clef18Task1V2(LoggingMixin):
 
         return model
 
     def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastText, train_ratio: float, val_ratio: float,
-                         label_column: str, samples: int=None, stratified_splits: bool=False) -> Configuration:
+                         strat_column: str, samples: int=None, stratified_splits: bool=False) -> Configuration:
         if samples:
             self.logger.info("Sampling %s instances", samples)
             cert_df = cert_df.sample(samples, random_state=42)
 
         self.logger.info("Splitting certificate lines into train and evaluation data set")
-        train_cert_df, evaluation_cert_df = self.split_train_test(cert_df, train_ratio, stratified_splits, label_column)
+        train_cert_df, evaluation_cert_df = self.split_train_test(cert_df, train_ratio, stratified_splits, strat_column)
         self.logger.info("Finished splitting: train=%s instances, evaluation=%s instances", len(train_cert_df), len(evaluation_cert_df))
 
         self.logger.info("Splitting evaluation data set into validation and test set")
-        val_cert_df, test_cert_df = self.split_train_test(evaluation_cert_df, val_ratio, stratified_splits, label_column)
+        val_cert_df, test_cert_df = self.split_train_test(evaluation_cert_df, val_ratio, stratified_splits, strat_column)
 
         label_encoders = self.prepare_label_encoders(dict_df, cert_df)
         keras_tokenizer = Tokenizer(oov_token="<UNK>")
@@ -347,13 +344,13 @@ class Clef18Task1V2(LoggingMixin):
         dict_df, max_dict_length = self.prepare_dictionary_df(dict_df, "train", label_encoders, keras_tokenizer)
 
         return Configuration(train_cert_df, val_cert_df, test_cert_df, dict_df, max_cert_length, max_dict_length,
-                             ft_model.vector_size, label_column, label_encoders, keras_tokenizer)
+                             ft_model.vector_size, strat_column, label_encoders, keras_tokenizer)
 
     def prepare_label_encoders(self, dict_df: DataFrame, cert_df: DataFrame) -> ICD10LabelEncoders:
         self.logger.info("Fitting label encoder to ICD10 codes")
         icd10_code_encoder = LabelEncoder()
         icd10_code_encoder.fit(list([icd10.strip() for icd10 in dict_df["ICD10"].values]) +
-                                list([icd10.strip() for icd10 in cert_df["ICD10"].values]))
+                               list([icd10.strip() for icd10 in cert_df["ICD10"].values]))
         self.logger.info("Found %s distinct ICD10 codes within the data set", len(icd10_code_encoder.classes_))
 
         self.logger.info("Fitting label encoder to ICD10 chapters")
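Note: the `split_train_test` rewrite above swaps the two `ShuffleSplit` classes for the simpler `train_test_split` helper. A self-contained sketch of the stratified path, with toy data and hypothetical column values:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

certificate_df = pd.DataFrame({"RawText": ["line %d" % i for i in range(10)],
                               "ICD10_masked": ["C509"] * 5 + ["I219"] * 5})

# One stratified split: both parts keep the 50/50 class balance of the column.
train_df, test_df = train_test_split(certificate_df, train_size=0.8,
                                     stratify=certificate_df["ICD10_masked"])

print(train_df["ICD10_masked"].value_counts())  # 4 x C509, 4 x I219
```

One behavioural difference worth noting: the removed splitter code pinned `random_state=42`, while the new `train_test_split` calls leave it unset, so splits are no longer reproducible across runs.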
@@ -438,7 +435,7 @@ class Clef18Task1V2(LoggingMixin):
 
         return dict_data_prepared, max_length
 
-    def build_pairs(self, certificate_data: DataFrame, dictionary_data: DataFrame, num_neg_samples: int):
+    def build_pairs(self, certificate_data: DataFrame, dictionary_data: DataFrame, neg_sampling_strategy: Callable):
         # FIXME: This can be implemented more efficiently!
         # FIXME: Improve sampling of negative instances (especially if code is I-XXX sample other codes of the same class (e.g. I-YYY)
         # FIXME: Use negative sample ratio (depending on true dictionary entries), e.g. 0.5 or 1.2
@@ -460,8 +457,9 @@ class Clef18Task1V2(LoggingMixin):
                 labels.append(1.0)
 
             # Find illegal ICD-10 for this line
-            negative_samples = dictionary_data.query("ICD10 != '%s'" % line_icd10_code)
-            negative_samples = negative_samples.sample(num_neg_samples)
+            #negative_samples = dictionary_data.query("ICD10 != '%s'" % line_icd10_code)
+            #negative_samples = negative_samples.sample(num_neg_samples)
+            negative_samples = neg_sampling_strategy(dictionary_data, line_icd10_code)
 
             # Build negative samples
             for i, dict_row in negative_samples.iterrows():
@@ -484,9 +482,9 @@ class Clef18Task1V2(LoggingMixin):
     def save_evaluation_results(self, eval_results: List[EvaluationResult]):
         result_configurations = [
             ("results.csv", None),
-            ("results_by_classifier.csv", lambda r: r.classifier_name),
-            ("results_by_data_set.csv", lambda r: r.data_set_name),
-            ("results_by_label.csv", lambda r: r.target_label)
+            ("results_by_classifier.csv", lambda result: result.classifier_name),
+            ("results_by_data_set.csv", lambda result: result.data_set_name),
+            ("results_by_label.csv", lambda result: result.target_label)
         ]
 
         for file_name, sort_key in result_configurations:
@@ -531,28 +529,94 @@ class Clef18Task1V2(LoggingMixin):
 
         return k.models.load_model(args.emb_model)
 
+class NegativeSampling(LoggingMixin):
+
+    def __init__(self):
+        LoggingMixin.__init__(self, self.__class__.__name__, AppContext.default().default_log_file)
+
+    def get_strategy_by_name(self, name: str, args: Namespace) -> Callable:
+        # FIXME: Pass args as a dictionary
+
+        if name == "def":
+            return self.default_strategy(args.num_neg_samples)
+        elif name == "ext1":
+            return self.extended1_strategy(args.num_neg_cha, args.num_neg_sec, args.num_neg_sub, args.num_neg_oth)
+        else:
+            raise AssertionError("Unsupported negative sampling strategy: " + name)
+
+    def default_strategy(self, num_negative_samples: int) -> Callable:
+        def _sample(dictionary_df: DataFrame, line_icd10_code: str):
+            negative_samples = dictionary_df.query("ICD10 != '%s'" % line_icd10_code)
+            negative_samples = negative_samples.sample(num_negative_samples)
+
+            return negative_samples
+
+        return _sample
+
+    def extended1_strategy(self, num_chapter_samples, num_section_samples, num_subsection_samples, num_other_samples):
+        def _sample(dictionary_df: DataFrame, icd10_code: str):
+            icd10_chapter = icd10_code[0].lower()
+            icd10_section = icd10_code[0:2].lower()
+            icd10_subsection = icd10_code[0:3].lower()
+
+            chapter_samples = dictionary_df.query("ICD10 != '%s' & ICD10_chapter == '%s'" % (icd10_code, icd10_chapter))
+            if len(chapter_samples) > 0:
+                chapter_samples = chapter_samples.sample(min(num_chapter_samples, len(chapter_samples)))
+
+            section_samples = dictionary_df.query("ICD10 != '%s' & ICD10_section == '%s'" % (icd10_code, icd10_section))
+            if len(section_samples) > 0:
+                section_samples = section_samples.sample(min(num_section_samples, len(section_samples)))
+
+            subsection_samples = dictionary_df.query("ICD10 != '%s' & ICD10_subsection == '%s'" % (icd10_code, icd10_subsection))
+            if len(subsection_samples) > 0:
+                subsection_samples = subsection_samples.sample(min(num_subsection_samples, len(subsection_samples)))
+
+            exp_sim_samples = num_chapter_samples + num_section_samples + num_subsection_samples
+            act_sim_samples = len(chapter_samples) + len(section_samples) + len(subsection_samples)
+
+            other_samples = dictionary_df.query("ICD10 != '%s' & ICD10_chapter != '%s'" % (icd10_code, icd10_chapter))
+            other_samples = other_samples.sample(min(num_other_samples + (exp_sim_samples - act_sim_samples), len(other_samples)))
+
+            # print("#Chapter samples: ", len(chapter_samples))
+            # print("#Section samples: ", len(section_samples))
+            # print("#Subsection samples: ", len(subsection_samples))
+            # print("#Other samples: ", len(other_samples))
+
+            return pd.concat([chapter_samples, section_samples, subsection_samples, other_samples])
+
+        return _sample
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(prog="CLEF2018")
 
     subparsers = parser.add_subparsers(dest="mode")
 
     train_emb_parser = subparsers.add_parser("train-emb")
+    train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"])
     train_emb_parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
     train_emb_parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
     train_emb_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
-    train_emb_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.4, type=float)
-    train_emb_parser.add_argument("--neg_samples", help="Number of negative samples for each pair to use", default=75, type=int)
+    train_emb_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.3, type=float)
+
     train_emb_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
-    train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_encoded", type=str)
+    train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_masked", type=str)
     train_emb_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool)
     train_emb_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
+    train_emb_parser.add_argument("--neg_sampling", help="Negative sampling strategy to use", default="ext1", choices=["def", "ext1"])
+    train_emb_parser.add_argument("--num_neg_samples", help="Number of negative samples to use (default strategy)", default=75, type=int)
+    train_emb_parser.add_argument("--num_neg_cha", help="Number of negative chapter samples to use (ext1 strategy)", default=10, type=int)
+    train_emb_parser.add_argument("--num_neg_sec", help="Number of negative section samples to use (ext1 strategy)", default=10, type=int)
+    train_emb_parser.add_argument("--num_neg_sub", help="Number of negative subsection samples to use (ext1 strategy)", default=10, type=int)
+    train_emb_parser.add_argument("--num_neg_oth", help="Number of negative other samples to use (ext1 strategy)", default=40, type=int)
+
     eval_classifier_parser = subparsers.add_parser("eval-cl")
     eval_classifier_parser.add_argument("emb_model", help="Path to the embedding model to use")
     eval_classifier_parser.add_argument("train_conf", help="Path to the training configuration dump")
+    eval_classifier_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"])
     eval_classifier_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
     eval_classifier_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.4, type=float)
     eval_classifier_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
-    eval_classifier_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_encoded", type=str)
+    eval_classifier_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_masked", type=str)
     eval_classifier_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool)
     eval_classifier_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
@@ -561,25 +625,27 @@ if __name__ == "__main__":
 
     AppContext.initialize_by_app_name(args.mode)
 
     clef_data = Clef18Task1Data()
-    it_dictionary = clef_data.read_it_dictionary()
-    it_certificates = clef_data.read_it_train_certificates()
-    it_certificates = clef_data.filter_single_code_lines(it_certificates)
+    dictionary = clef_data.read_dictionary_by_language(args.lang)
+    certificates = clef_data.read_train_certificates_by_language(args.lang)
+    certificates = clef_data.filter_single_code_lines(certificates)
+    certificates = clef_data.add_masked_icd10_column(certificates, 10)
 
-    #sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
-    #ft_it_model = FastText(sentences, min_count=1)
+    sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+    ft_model = FastText(sentences, min_count=1)
 
     ft_embeddings = FastTextEmbeddings()
-    ft_it_model = ft_embeddings.load_it_embeddings()
+    #ft_model = ft_embeddings.load_embeddings_by_language(args.lang)
 
     clef18_task1 = Clef18Task1V2()
+    neg_sampling = NegativeSampling()
 
     if args.mode == "train-emb":
-        configuration = clef18_task1.prepare_data_set(
-            it_certificates, it_dictionary, ft_it_model, args.train_ratio,
-            args.val_ratio,args.strat_column, args.samples, args.strat_splits)
+        configuration = clef18_task1.prepare_data_set(certificates, dictionary, ft_model, args.train_ratio, args.val_ratio, args.strat_column,
+                                                      args.samples, args.strat_splits)
+        neg_sampling_strategy = neg_sampling.get_strategy_by_name(args.neg_sampling, args)
 
         clef18_task1.save_configuration(configuration)
 
-        embedding_model = clef18_task1.train_embedding_model(configuration, ft_it_model, args.neg_samples, args.epochs, args.batch_size)
+        embedding_model = clef18_task1.train_embedding_model(configuration, ft_model, neg_sampling_strategy, args.epochs, args.batch_size)
 
     elif args.mode == "eval-cl":
         configuration = clef18_task1.reload_configuration(args.train_conf)
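Note: the top-up arithmetic in `extended1_strategy` keeps the number of negatives per certificate line roughly constant even when a rare code has few same-chapter/section/subsection neighbours in the dictionary; any shortfall is shifted into the "other" budget. A plain-Python sketch with hypothetical counts:

```python
# Per-line budgets matching the ext1 defaults above.
num_neg_cha, num_neg_sec, num_neg_sub, num_neg_oth = 10, 10, 10, 40

# Suppose the dictionary queries only find this many similar codes:
found_cha, found_sec, found_sub = 10, 4, 0

exp_sim_samples = num_neg_cha + num_neg_sec + num_neg_sub   # 30 expected
act_sim_samples = found_cha + found_sec + found_sub         # 14 actually found

# The shortfall of 16 moves to the "other" bucket: 40 + 16 = 56 negatives,
# so every positive pair still gets about 70 negatives in total.
other_budget = num_neg_oth + (exp_sim_samples - act_sim_samples)
print(other_budget)  # 56
```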
diff --git a/code_mario/ft_embeddings.py b/code_mario/ft_embeddings.py
index 34cc482cd72255f13c8697b995c26bc9256a386c..644b5b51bd426ca3f839bff705bcd9eadf2963e9 100644
--- a/code_mario/ft_embeddings.py
+++ b/code_mario/ft_embeddings.py
@@ -8,8 +8,28 @@ class FastTextEmbeddings(LoggingMixin):
 
     def __init__(self):
         LoggingMixin.__init__(self, __class__.__name__)
 
+    def load_embeddings_by_language(self, lang: str) -> FastText:
+        if lang == "it":
+            return self.load_it_embeddings()
+        elif lang == "hu":
+            return self.load_hu_embeddings()
+        elif lang == "fr":
+            return self.load_fr_embeddings()
+        else:
+            raise AssertionError("Unsupported language: " + lang)
+
+    # ------------------------------------------------------------------------------------
+
+    def load_fr_embeddings(self) -> FastText:
+        french_ft_file = "../code_jurica/data/embeddings/cc.fr.300.bin"
+        return self._load_ft_model(french_ft_file)
+
+    def load_hu_embeddings(self) -> FastText:
+        hungarian_ft_file = "../code_jurica/data/embeddings/cc.hu.300.bin"
+        return self._load_ft_model(hungarian_ft_file)
+
     def load_it_embeddings(self) -> FastText:
-        italian_ft_file = "embeddings/wiki.it"
+        italian_ft_file = "../code_jurica/data/embeddings/cc.it.300.bin"
         return self._load_ft_model(italian_ft_file)
 
     def _load_ft_model(self, ft_file: str) -> FastText:
diff --git a/code_mario/preprocessing.py b/code_mario/preprocessing.py
index 7f1f14c36c511430fb19d4ef23e4f4f047a14829..47a2a6bb397910687cd39f80e44269cb5135bee4 100644
--- a/code_mario/preprocessing.py
+++ b/code_mario/preprocessing.py
@@ -79,12 +79,14 @@ class DataPreparationUtil(object):
         return MapFunction(icd10_column, _extract, target_column)
 
     @staticmethod
-    def extract_icd10_subchapter(icd10_column: str, target_column: str):
-        def _extract(value):
-            return value[0:2].lower()
+    def mask_icd10(icd10_column: str, target_column: str, value_counts, min_support: int, icd10_mask: str):
+        def _mask_icd10(icd10):
+            if str(icd10) in value_counts and value_counts[str(icd10)] >= min_support:
+                return icd10
 
-        return MapFunction(icd10_column, _extract, target_column)
+            return icd10_mask
 
+        return MapFunction(icd10_column, _mask_icd10, target_column)
 
 class FitMixin(object):
@@ -230,5 +232,3 @@ class KerasSequencer(BaseEstimator, TransformerMixin):
 
         sequences = self.keras_tokenizer.texts_to_sequences(texts)
         return PandasUtil.append_column(data, self.target_column, sequences)
-
-
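Note: the new embedding paths point at pre-trained fastText binaries (`cc.*.300.bin`). A sketch of how such a file is loaded with the gensim 3.x API, which matches the `FastText` type annotations in ft_embeddings.py; whether `_load_ft_model` does exactly this is an assumption:

```python
from gensim.models import FastText

# gensim 3.x loader for Facebook's pre-trained .bin files; newer gensim
# releases expose gensim.models.fasttext.load_facebook_model instead.
ft_model = FastText.load_fasttext_format("../code_jurica/data/embeddings/cc.it.300.bin")

print(ft_model.vector_size)          # 300 for the cc.*.300 binaries
print(ft_model.wv["cancro"].shape)   # subword model also covers OOV tokens
```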