diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py
index 7a149c50a4975b15e86087fcb081d8285cfd4999..d24c43a05c101cf8d35a11651d5726dfd9b8ad49 100644
--- a/code_mario/clef18_task1_data.py
+++ b/code_mario/clef18_task1_data.py
@@ -2,7 +2,6 @@
 import os
 from typing import List
 
 import pandas as pd
-from keras.preprocessing.text import Tokenizer, text_to_word_sequence
 from pandas import DataFrame
 from tqdm import tqdm
@@ -80,7 +79,7 @@ class Clef18Task1Data(LoggingMixin):
         calculees_file = os.path.join(base_folder, "CausesCalculees_HU_1.csv")
         brutes_file = os.path.join(base_folder, "CausesBrutes_HU_1.csv")
 
-        return self._read_certificates([calculees_file], [brutes_file], "it")
+        return self._read_certificates([calculees_file], [brutes_file], "hu")
 
     def read_hu_dictionary(self) -> DataFrame:
         base_folder = "data/train/HU/training/raw/dictionaries"
@@ -169,9 +168,6 @@ class Clef18Task1Data(LoggingMixin):
         joined_data = pdu.clean_text("RawText").fit_transform(joined_data)
         joined_data["Lang"] = language
 
-        word_tagger = pdu.tag_words_with_language("RawText", "RawText", "Lang")
-        joined_data = word_tagger.fit_transform(joined_data)
-
         return joined_data[["RawText", "ICD10", "Lang"]]
 
     def _read_icd10_dictionary(self, dictionary_files: List[str], encoding: str, lang: str) -> DataFrame:
@@ -198,10 +194,6 @@ class Clef18Task1Data(LoggingMixin):
 
         dictionary_data["Lang"] = lang
 
-        self.logger.info("Tag language %s on all words", lang)
-        word_tagger = pdu.tag_words_with_language("DiagnosisText", "DiagnosisText", "Lang")
-        dictionary_data = word_tagger.fit_transform(dictionary_data)
-
         return dictionary_data
 
     def _read_test_data(self, file: str) -> DataFrame:
@@ -260,22 +252,104 @@ class Clef18Task1Data(LoggingMixin):
         return sampled_df
 
     def extend_certificates_by_dictionaries(self, certificate_df: DataFrame, dictionary_df: DataFrame) -> DataFrame:
+        self.logger.info("Start extending certificate data set with dictionary entries (original size: %s)", len(certificate_df))
         original_size = len(certificate_df)
 
         dict_icd10_codes = dictionary_df["ICD10"].unique()
         cert_icd10_codes = certificate_df["ICD10"].unique()
 
         unseen_icd10_codes = [dict_icd10 for dict_icd10 in dict_icd10_codes if dict_icd10 not in cert_icd10_codes]
 
-        for unseen_icd10 in tqdm(unseen_icd10_codes, desc="extend-cert", total=len(unseen_icd10_codes)):
-            entries = dictionary_df.query("ICD10 == '%s'" % unseen_icd10)
-            for key, row in entries.iterrows():
-                certificate_df = certificate_df.append({"RawText": row["DiagnosisText"], "ICD10": row["ICD10"] }, ignore_index=True)
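+        # Select all dictionary rows whose ICD10 code does not yet occur in the certificate data.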
+        unseen_mask = dictionary_df["ICD10"].isin(unseen_icd10_codes)
+        lines_with_unseen_codes = dictionary_df.loc[unseen_mask]
+
+        new_rows = certificate_df.copy().iloc[0:0]
+        for i, row in tqdm(lines_with_unseen_codes.iterrows(), desc="extend-cert", total=len(lines_with_unseen_codes)):
+            new_rows = new_rows.append({"RawText": row["DiagnosisText"], "ICD10": row["ICD10"], "Lang": row["Lang"] }, ignore_index=True)
+
+        extended_size = len(new_rows)
+        certificate_df = pd.concat([certificate_df, new_rows])
 
-        extended_size = len(certificate_df)
         self.logger.info("Extended cert data set with %s from dictionary (%s in total)" % (extended_size - original_size, extended_size))
 
         return certificate_df
 
+    def remove_duplicates_from_certificates(self, certificate_df: DataFrame):
+        self.logger.info("Start removing duplicates from certificate data set (size: %s)", len(certificate_df))
+
+        cleaned_cert_df = certificate_df.drop_duplicates(subset=["RawText", "ICD10"])
+        self.logger.info("Removed %s duplicates from certificate data set (new size: %s)",
+                         len(certificate_df) - len(cleaned_cert_df), len(cleaned_cert_df))
+
+        return cleaned_cert_df
+
+    def split_multi_code_lines(self, certificate_df: DataFrame) -> DataFrame:
+        self.logger.info("Start splitting multi code lines in certificate data set (size: %s)", len(certificate_df))
+        duplicate_ids = certificate_df.index.duplicated(keep=False)
+        lines_with_multiple_codes = certificate_df[duplicate_ids].index.values
+
+        mask_multicode = certificate_df.index.isin(lines_with_multiple_codes)
+        singlecode_rows = certificate_df[~mask_multicode]
+        multicode_rows = certificate_df[mask_multicode]
+
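+        # Rows sharing an index belong to the same certificate line; hand the i-th
+        # comma-separated text fragment to the i-th code of that line.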
+        last_index = -1
+        text_pos = 0
+        new_rows = singlecode_rows.copy().iloc[0:0]
+        for index, row in tqdm(multicode_rows.iterrows(), desc="split-multi", total=len(multicode_rows)):
+            split = str(row["RawText"]).split(",")
+
+            if last_index != index:
+                text_pos = 0
+            else:
+                text_pos = text_pos + 1
+
+            if text_pos < len(split):
+                row["RawText"] = split[text_pos]
+                new_rows = new_rows.append(row)
+            else:
+                new_rows = new_rows.append(row)
+
+            last_index = index
+
+        result = pd.concat([singlecode_rows, new_rows])
+        self.logger.info("Finished multi code line splitting. Adding %s new rows (new size: %s)", len(new_rows), len(result))
+        return result
+
+    def duplicate_less_frequent(self, certificate_df: DataFrame, min_freq: int):
+        self.logger.info("Start duplicating less frequent ICD10 code entries (size: %s)", len(certificate_df))
+
+        code_counts = certificate_df["ICD10"].value_counts()
+        less_frequent_codes = set([code for code, freq in code_counts.iteritems() if freq < min_freq])
+
+        less_frequent_mask = certificate_df["ICD10"].isin(less_frequent_codes)
+        less_frequent_rows = certificate_df[less_frequent_mask]
+
+        # Oversample each rare code (with replacement) until it occurs min_freq times.
+        new_rows = certificate_df.copy().iloc[0:0]
+        for code in tqdm(less_frequent_codes, desc="build-dup", total=len(less_frequent_codes)):
+            num_dumplicates = min_freq - code_counts[code]
+            duplicates = less_frequent_rows.query("ICD10 == '%s'" % code).sample(num_dumplicates, replace=True)
+            new_rows = pd.concat([new_rows, duplicates])
+
+        result = pd.concat([certificate_df, new_rows])
+        self.logger.info("Added %s duplicates to data set to gurantee min frequency %s (new size: %s)",
+                         len(new_rows), min_freq, len(result))
+
+        return result
+
+    def language_tag_data(self, data_set: DataFrame, text_column: str, lang_column: str) -> DataFrame:
+        self.logger.info("Start word tagging column %s with language from %s (size: %s)", text_column, lang_column, len(data_set))
+
+        word_tagger = pdu.tag_words_with_language(text_column, text_column, lang_column)
+        data_set = word_tagger.fit_transform(data_set)
+
+        self.logger.info("Finished word tagging data")
+        return data_set
+
+    def filter_nan_texts(self, certifcate_df: DataFrame) -> DataFrame:
+        self.logger.info("Start filtering nan texts (size: %s)", len(certifcate_df))
+        nan_mask = certifcate_df["RawText"].isin(["nan"])
+        certifcate_df = certifcate_df[~nan_mask]
+        self.logger.info("Finished filtering nan texts (size: %s)", len(certifcate_df))
+        return certifcate_df
+
 
 def check_multi_label_distribution(ds_name: str, certificate_df: DataFrame):
     print("Data set: ", ds_name)
@@ -385,20 +459,29 @@ if __name__ == "__main__":
     #clef_task_data.down_sample_by_icd10_frequency(all_cert, 4000)
 
     it_certificates = clef_task_data.read_it_train_certificates()
-
-    word_tagger = pdu.tag_words_with_language("RawText", "RawText", "Lang")
-    word_tagger.fit_transform(it_certificates)
-
     it_dictionary = clef_task_data.read_it_dictionary()
-    clef_task_data.extend_certificates_by_dictionaries(it_certificates, it_dictionary)
+
+    it_certificates = clef_task_data.extend_certificates_by_dictionaries(it_certificates, it_dictionary)
+    it_certificates = clef_task_data.remove_duplicates_from_certificates(it_certificates)
+    it_certificates = clef_task_data.split_multi_code_lines(it_certificates)
+    it_certificates = clef_task_data.duplicate_less_frequent(it_certificates, 4)
+    print("IT: ", len(it_certificates))
 
     hu_certificates = clef_task_data.read_hu_train_certificates()
     hu_dictionary = clef_task_data.read_hu_dictionary()
-    clef_task_data.extend_certificates_by_dictionaries(hu_certificates, hu_dictionary)
+    hu_certificates = clef_task_data.extend_certificates_by_dictionaries(hu_certificates, hu_dictionary)
+    hu_certificates = clef_task_data.remove_duplicates_from_certificates(hu_certificates)
+    hu_certificates = clef_task_data.split_multi_code_lines(hu_certificates)
+    hu_certificates = clef_task_data.duplicate_less_frequent(hu_certificates, 4)
+    print("HU: ", len(hu_certificates))
 
     fr_certificates = clef_task_data.read_fr_train_certificates()
     fr_dictionary = clef_task_data.read_fr_dictionary()
-    clef_task_data.extend_certificates_by_dictionaries(fr_certificates, fr_dictionary)
+    fr_certificates = clef_task_data.extend_certificates_by_dictionaries(fr_certificates, fr_dictionary)
+    fr_certificates = clef_task_data.remove_duplicates_from_certificates(fr_certificates)
+    fr_certificates = clef_task_data.split_multi_code_lines(fr_certificates)
+    fr_certificates = clef_task_data.duplicate_less_frequent(fr_certificates, 4)
+    print("FR: ", len(fr_certificates))
 
     #check_label_distribution(it_certificates)
     #it_certificates = clef_task_data.down_sample_by_icd10_frequency(it_certificates, 800)
diff --git a/code_mario/clef18_task1_emb1.py b/code_mario/clef18_task1_emb1.py
index fabf803ccdb9b6dfe68f4b976741c7fd01b85334..b3e58c388b9ce9e7dd6fef510d55c76088bb1a73 100644
--- a/code_mario/clef18_task1_emb1.py
+++ b/code_mario/clef18_task1_emb1.py
@@ -89,8 +89,8 @@ class Clef18Task1Emb1(Clef18Task1Base):
 
         return model
 
-    def train_embedding_model(self, config: Emb1Configuration, ft_model: FastTextModel, max_pos_samples: int, neg_sampling_strategy: Callable,
-                              epochs: int, batch_size: int, workers: int, chunk_size: int) -> Model:
+    def train_embedding_model(self, config: Emb1Configuration, ft_model: FastTextModel, max_pos_samples: int,
+                              neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model:
         self.logger.info("Start building embedding model")
         model = self.build_embedding_model(config.keras_tokenizer.word_index, ft_model, config.max_cert_length, config.max_dict_length)
         model.summary(print_fn=self.logger.info)
@@ -227,11 +227,7 @@ class Clef18Task1Emb1(Clef18Task1Base):
     # ---------------------------------------------------------------------------------------------------------------------------------------
 
     def prepare_data_set(self, cert_df: DataFrame, dict_df: DataFrame, ft_model: FastTextModel, train_ratio: float, val_ratio: float,
-                         strat_column: str, samples: int=None, stratified_splits: bool=False) -> Emb1Configuration:
-
-        if samples:
-            self.logger.info("Sampling %s instances", samples)
-            cert_df = cert_df.sample(samples, random_state=42)
+                         strat_column: str, stratified_splits: bool=False) -> Emb1Configuration:
 
         self.logger.info("Splitting certificate lines into train and evaluation data set")
         train_cert_df, evaluation_cert_df = self.split_train_test(cert_df, train_ratio, stratified_splits, strat_column)
@@ -429,19 +425,17 @@ if __name__ == "__main__":
     subparsers = parser.add_subparsers(dest="mode")
 
     train_emb_parser = subparsers.add_parser("train-emb")
-    train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu", "all-con"])
+    train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu", "ac", "am"])
     train_emb_parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
     train_emb_parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
-    train_emb_parser.add_argument("--workers", help="Number of threads during pair building", default=4, type=int)
-    train_emb_parser.add_argument("--slice_size", help="Number of cert entries to be handled by one thread during pair building", default=1000, type=int)
     train_emb_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
-    train_emb_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.3, type=float)
+    train_emb_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.5, type=float)
     train_emb_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
+    train_emb_parser.add_argument("--min_freq", help="Minimal number of instances per ICD10 code", default=10, type=int)
     train_emb_parser.add_argument("--down_sample", help="Maximal frequency of ICD10 code until start down sampling", default=None, type=int)
-    train_emb_parser.add_argument("--extend_cert", help="Indicates whether to extend the certificates with dict_entries", default=True, type=bool)
-    train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_masked", type=str)
-    train_emb_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool)
+    train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10", type=str)
+    train_emb_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=True, type=bool)
     train_emb_parser.add_argument("--strat_min_freq", help="Min frequency of an icd10 code to be an own class during stratification", default=8, type=int)
     train_emb_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
     train_emb_parser.add_argument("--single_only", help="Indicates whether just to use the single code lines from the data", default=False, type=bool)
@@ -457,12 +451,7 @@ if __name__ == "__main__":
     eval_classifier_parser = subparsers.add_parser("eval-cl")
     eval_classifier_parser.add_argument("emb_model", help="Path to the embedding model to use")
     eval_classifier_parser.add_argument("train_conf", help="Path to the training configuration dump")
-    eval_classifier_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu"])
-    eval_classifier_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float)
-    eval_classifier_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.4, type=float)
-    eval_classifier_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int)
-    eval_classifier_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_masked", type=str)
-    eval_classifier_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool)
+    eval_classifier_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu", "ac", "am"])
     eval_classifier_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append')
 
     predict_parser = subparsers.add_parser("pred")
@@ -484,34 +473,40 @@ if __name__ == "__main__":
     dictionary = clef_data.read_dictionary_by_id(args.lang)
     certificates = clef_data.read_train_certifcates_by_id(args.lang)
 
-    # sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
-    # ft_model = SingleLanguageFastTextModel("it", "it", FastText(sentences, min_count=1))
+    sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+    ft_model = SingleLanguageFastTextModel("hu", "hu", FastText(sentences, min_count=1))
 
     ft_embeddings = FastTextEmbeddings()
-    ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
+    #ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
+
+    # Preprocessing pipeline: add unseen dictionary entries, remove duplicates and
+    # "nan" texts, and split multi-code lines for the Italian and Hungarian data.
+    certificates = clef_data.extend_certificates_by_dictionaries(certificates, dictionary)
+    certificates = clef_data.remove_duplicates_from_certificates(certificates)
+    certificates = clef_data.filter_nan_texts(certificates)
+
+    if args.lang == "it" or args.lang == "hu":
+        certificates = clef_data.split_multi_code_lines(certificates)
 
-    if args.down_sample:
-        certificates = clef_data.down_sample_by_icd10_frequency(certificates, args.down_sample)
+    certificates = clef_data.language_tag_data(certificates, "RawText", "Lang")
 
-    if args.single_only:
-        certificates = clef_data.filter_single_code_lines(certificates)
+    certificates = clef_data.remove_duplicates_from_certificates(certificates)
+    certificates = clef_data.duplicate_less_frequent(certificates, args.min_freq)
 
-    if args.extend_cert:
-        certificates = clef_data.extend_certificates_by_dictionaries(certificates, dictionary)
-        #certificates = certificates
+    if args.samples:
+        icd10_sample = certificates["ICD10"].sample(args.samples, random_state=42).values
+        sample_mask = certificates["ICD10"].isin(icd10_sample)
+        certificates = certificates[sample_mask]
 
-    if args.strat_splits:
-        certificates = clef_data.add_masked_icd10_column(certificates, args.strat_min_freq)
+    dictionary = clef_data.language_tag_data(dictionary, "DiagnosisText", "Lang")
 
-    configuration = clef18_task1.prepare_data_set(certificates, dictionary, ft_model, args.train_ratio, args.val_ratio, args.strat_column,
-                                                  args.samples, args.strat_splits)
+    configuration = clef18_task1.prepare_data_set(certificates, dictionary, ft_model, args.train_ratio,
+                                                  args.val_ratio, args.strat_column, args.strat_splits)
 
     clef18_task1.save_configuration(configuration)
 
     neg_sampling = NegativeSampling()
     neg_sampling_strategy = neg_sampling.get_strategy_by_name(args.neg_sampling, args)
 
     embedding_model = clef18_task1.train_embedding_model(configuration, ft_model, args.max_pos_samples, neg_sampling_strategy,
-                                                         args.epochs, args.batch_size, args.workers, args.slice_size)
+                                                         args.epochs, args.batch_size)
 
     clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels)
diff --git a/code_mario/preprocessing.py b/code_mario/preprocessing.py
index d474372548f0c08eccb008c5058e1a464c80bf69..45b540db2fbe180913d1eefaea1dadbcb2290951 100644
--- a/code_mario/preprocessing.py
+++ b/code_mario/preprocessing.py
@@ -256,7 +256,8 @@ class LanguageWordTagger(BaseEstimator, FitMixin, TransformerMixin):
     def transform(self, data: DataFrame, y=None):
         def _tag_words(row):
             language = row[self.lang_column]
-            return " ".join("%s%s" % (language, word) for word in text_to_word_sequence(row[self.text_column]))
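+            # Cast the cell to str before tokenizing so non-string values (e.g. NaN) do not break text_to_word_sequence.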
+            result = " ".join("%s%s" % (language, word) for word in text_to_word_sequence(str(row[self.text_column])))
+            return result
 
         data[self.target_column] = data.apply(_tag_words, axis=1)
         return data