From 6701c992222a1915269142d93a1558b1dbe03cc6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20Sa=CC=88nger?= <mario.saenger@student.hu-berlin.de>
Date: Tue, 8 May 2018 17:21:18 +0200
Subject: [PATCH] Add support for down-sampling of training instances by ICD10
 frequency

---
 code_mario/clef18_task1_data.py |  58 ++++++++++++++++--
 code_mario/clef18_task1_emb1.py | 101 +++++++++++++++++++++++++++-----
 code_mario/preprocessing.py     |   7 +++
 3 files changed, 145 insertions(+), 21 deletions(-)

diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py
index b550774..8c36f54 100644
--- a/code_mario/clef18_task1_data.py
+++ b/code_mario/clef18_task1_data.py
@@ -4,6 +4,8 @@ from typing import List
 import pandas as pd
 from pandas import DataFrame
 
+from tqdm import tqdm
+
 from app_context import AppContext
 from util import LoggingMixin
 from itertools import groupby
@@ -154,13 +156,38 @@ class Clef18Task1Data(LoggingMixin):
 
         return certificate_df
 
+    def down_sample_by_icd10_frequency(self, certificate_df: DataFrame, max_freq: int):
+        self.logger.info("Down-sampling data set with %s entries", len(certificate_df))
+        icd10_codes = certificate_df["ICD10"].unique()
+
+        data_sets = []
+        for code in tqdm(icd10_codes, desc="down-sample", total=len(icd10_codes)):
+            entries_by_code = certificate_df.query("ICD10 == '%s'" % code)
+            if len(entries_by_code) > max_freq:
+                unique_texts = entries_by_code["RawText"].unique()
+
+                unique_entries = []
+                for text in unique_texts:
+                    unique_entries.append(entries_by_code.query("RawText == \"%s\"" % text)[0:1])
+
+                unique_entries.append(entries_by_code.sample(max(max_freq - len(unique_texts), 10)))
+                entries_by_code = pd.concat(unique_entries)
+
+            data_sets.append(entries_by_code)
+
+        sampled_df = pd.concat(data_sets)
+        sampled_df = sampled_df.sample(frac=1)  # Reshuffle!
+
+        self.logger.info("Down-sampled data set contains %s entries", len(sampled_df))
+        return sampled_df
+
     # --------------------------------------------------------------------------------
 
     def _read_certificates(self, calculees_files: List[str], brutus_files: List[str]) -> DataFrame:
         calculees_data = []
         for calculees_file in calculees_files:
             self.logger.info("Reading calculees file from %s", calculees_file)
-            calculees_data.append(pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"], skipinitialspace=True))
+            calculees_data.append(pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"],
+                                              skipinitialspace=True))
             self.logger.info("Found %s death certificate entries", len(calculees_data[-1]))
 
         calculees_data = pd.concat(calculees_data)
@@ -173,12 +200,17 @@ class Clef18Task1Data(LoggingMixin):
                                             skipinitialspace=True))
             self.logger.info("Found %s death certificate entries", len(brutus_data[-1]))
-
         brutus_data = pd.concat(brutus_data)
 
         joined_data = brutus_data.join(calculees_data, lsuffix="_b", rsuffix="_c")
         joined_data["ICD10"] = joined_data["ICD10"].astype(str)
 
+        num_unchecked_data = len(joined_data)
+        joined_data = joined_data.query("ICD10 != 'nan'")
+        self.logger.info("Removed %s lines with ICD10 'nan'", num_unchecked_data - len(joined_data))
+
+        joined_data = pdu.clean_text("RawText").fit_transform(joined_data)
+
         return joined_data[["RawText", "ICD10"]]
 
     def _read_icd10_dictionary(self, file: str, encoding: str) -> DataFrame:
@@ -302,27 +334,41 @@ def check_multi_code_lines(cert_df: DataFrame):
     print("\tCorrect: ", len(correct_single))
     print("\tIncorrect: ", len(incorrect_single))
 
+
+def check_label_distribution(cert_df: DataFrame):
+    distribution = cert_df["ICD10"].value_counts()
+    print(distribution)
+
 
 if __name__ == "__main__":
     # Just for debugging / development purposes
     AppContext.initialize_by_app_name("Clef18Task1-Data")
 
     clef_task_data = Clef18Task1Data()
-
     it_certificates = clef_task_data.read_it_train_certificates()
     it_dictionary = clef_task_data.read_it_dictionary()
-    check_multi_code_lines(it_certificates)
+
+    check_label_distribution(it_certificates)
+    #it_certificates = clef_task_data.down_sample_by_icd10_frequency(it_certificates, 800)
+    check_label_distribution(it_certificates)
 
     # check_word_dictionary_overlap(it_certificates, it_dictionary, "data/dictionary/it-en.txt")
 
     hu_certificates = clef_task_data.read_hu_train_certificates()
     hu_dictionary = clef_task_data.read_hu_dictionary()
-    check_multi_code_lines(hu_certificates)
+
+    check_label_distribution(hu_certificates)
+    #hu_certificates = clef_task_data.down_sample_by_icd10_frequency(hu_certificates, 2750)
+    check_label_distribution(hu_certificates)
 
     # check_word_dictionary_overlap(hu_certificates, hu_dictionary, "data/dictionary/hu-en.txt")
 
     fr_certificates = clef_task_data.read_fr_train_certificates()
     fr_dictionary = clef_task_data.read_fr_dictionary()
-    check_multi_code_lines(fr_certificates)
+
+    check_label_distribution(fr_certificates)
+    fr_certificates = clef_task_data.down_sample_by_icd10_frequency(fr_certificates, 2750)
+    check_label_distribution(fr_certificates)
+
+    # check_word_dictionary_overlap(fr_certificates, fr_dictionary, "data/dictionary/fr-en.txt")
 
     # certificates = pdu.extract_icd10_chapter("ICD10", "ICD10_chapter").fit_transform(certificates)
 
diff --git a/code_mario/clef18_task1_emb1.py b/code_mario/clef18_task1_emb1.py
index 71be48f..dad8532 100644
--- a/code_mario/clef18_task1_emb1.py
+++ b/code_mario/clef18_task1_emb1.py
@@ -1,8 +1,9 @@
+import concurrent
+
 from gensim.models import FastText
-from sklearn.base import BaseEstimator
-from sklearn.svm import SVC
 
 from init import *
+from concurrent.futures import ThreadPoolExecutor
 from clef18_task1_base import ICD10LabelEncoders, Clef18Task1Base, EvaluationConfiguration, NegativeSampling
 from clef18_task1_emb2 import EvaluationResult
@@ -46,7 +47,6 @@ class Emb1Configuration(object):
         self.label_encoders = label_encoders
         self.keras_tokenizer = keras_tokenizer
 
-
 class Clef18Task1Emb1(Clef18Task1Base):
 
     def __init__(self):
@@ -89,13 +89,14 @@ class Clef18Task1Emb1(Clef18Task1Base):
 
         return model
 
-    def train_embedding_model(self, config: Emb1Configuration, ft_model: FastTextModel, neg_sampling_strategy: Callable, epochs: int, batch_size: int) -> Model:
+    def train_embedding_model(self, config: Emb1Configuration, ft_model: FastTextModel, max_pos_samples: int, neg_sampling_strategy: Callable,
+                              epochs: int, batch_size: int, workers: int, chunk_size: int) -> Model:
         self.logger.info("Start building embedding model")
         model = self.build_embedding_model(config.keras_tokenizer.word_index, ft_model, config.max_cert_length, config.max_dict_length)
         model.summary(print_fn=self.logger.info)
 
         self.logger.info("Start building training pairs")
-        train_pair_data = self.build_pairs(config.train_cert_df, config.dict_df, neg_sampling_strategy)
+        train_pair_data = self.build_pairs(config.train_cert_df, config.dict_df, max_pos_samples, neg_sampling_strategy)
         self.logger.info("Label distribution:\n%s", train_pair_data["Label"].value_counts())
 
         cert_inputs = pad_sequences(train_pair_data["Cert_input"].values, maxlen=config.max_cert_length, padding="post")
@@ -107,7 +108,7 @@ class Clef18Task1Emb1(Clef18Task1Base):
 
         if config.val_cert_df is not None and len(config.test_cert_df) > 0:
             self.logger.info("Start creation of validation pairs")
-            val_pair_data = self.build_pairs(config.val_cert_df, config.dict_df, neg_sampling_strategy)
+            val_pair_data = self.build_pairs(config.val_cert_df, config.dict_df, max_pos_samples, neg_sampling_strategy)
 
             val_cert_inputs = pad_sequences(val_pair_data["Cert_input"].values, maxlen=config.max_cert_length, padding="post")
             val_dict_inputs = pad_sequences(val_pair_data["Dict_input"].values, maxlen=config.max_dict_length, padding="post")
@@ -133,7 +134,7 @@ class Clef18Task1Emb1(Clef18Task1Base):
         self.logger.info("Start evaluation of embedding model!")
 
         self.logger.info("Start creation of test pairs")
-        test_pair_data = self.build_pairs(config.test_cert_df, config.dict_df, neg_sampling_strategy)
+        test_pair_data = self.build_pairs(config.test_cert_df, config.dict_df, max_pos_samples, neg_sampling_strategy)
 
         test_cert_inputs = pad_sequences(test_pair_data["Cert_input"].values, maxlen=config.max_cert_length, padding="post")
         test_dict_inputs = pad_sequences(test_pair_data["Dict_input"].values, maxlen=config.max_dict_length, padding="post")
@@ -319,9 +320,7 @@ class Clef18Task1Emb1(Clef18Task1Base):
 
         return dict_data_prepared, max_length
 
-    def build_pairs(self, certificate_data: DataFrame, dictionary_data: DataFrame, neg_sampling_strategy: Callable):
-        # FIXME: This can be implemented more efficiently!
-        # FIXME: Improve sampling of negative instances (especially if code is I-XXX sample other codes of the same class (e.g. I-YYY)
+    def build_pairs(self, certificate_data: DataFrame, dictionary_data: DataFrame, max_pos_samples: int, neg_sampling_strategy: Callable):
         # FIXME: Use negative sample ratio (depending on true dictionary entries), e.g. 0.5 or 1.2
 
         certificate_vectors = []
@@ -333,6 +332,14 @@ class Clef18Task1Emb1(Clef18Task1Base):
 
             # Build positive examples (based on training data)
             dictionary_entries = dictionary_data.query("ICD10 == '%s'" % line_icd10_code)
+            if len(dictionary_entries) > 0:
+                dictionary_entries = dictionary_entries.sample(min(max_pos_samples, len(dictionary_entries)))
+            else:
+                # Add at least one example
+                certificate_vectors.append(cert_row["Token_ids"])
+                dictionary_vectors.append(cert_row["Token_ids"])
+                labels.append(1.0)
+
             #self.logger.info("Found %s entries for ICD-10 code %s", len(dictionary_entries), line_icd10_code)
             for i, dict_row in dictionary_entries.iterrows():
                 certificate_vectors.append(cert_row["Token_ids"])
@@ -353,6 +360,61 @@ class Clef18Task1Emb1(Clef18Task1Base):
         data = {"Cert_input": certificate_vectors, "Dict_input": dictionary_vectors, "Label": labels}
         return pd.DataFrame(data)
 
+    def build_pairs_para(self, certificate_data: DataFrame, dictionary_data: DataFrame, max_pos_samples: int,
+                         neg_sampling_strategy: Callable, workers: int, chunk_size: int):
+        # FIXME: This can be implemented more efficiently!
+        # FIXME: Use negative sample ratio (depending on true dictionary entries), e.g. 0.5 or 1.2
+
+        def _run_build_pairs(df_slice: DataFrame):
+            loc_certificate_vectors = []
+            loc_dictionary_vectors = []
+            loc_labels = []
+
+            for i, cert_row in df_slice.iterrows():
+                line_icd10_code = cert_row["ICD10"]
+
+                # Build positive examples (based on training data)
+                dictionary_entries = dictionary_data.query("ICD10 == '%s'" % line_icd10_code)
+                dictionary_entries = dictionary_entries.sample(min(max_pos_samples, len(dictionary_entries)))
+
+                #self.logger.info("Found %s entries for ICD-10 code %s", len(dictionary_entries), line_icd10_code)
+                for i, dict_row in dictionary_entries.iterrows():
+                    loc_certificate_vectors.append(cert_row["Token_ids"])
+                    loc_dictionary_vectors.append(dict_row["Token_ids"])
+
+                    loc_labels.append(1.0)
+
+                # Build negative samples
+                # Find illegal ICD-10 for this line
+                negative_samples = neg_sampling_strategy(certificate_data, line_icd10_code)
+
+                for i, neg_row in negative_samples.iterrows():
+                    loc_certificate_vectors.append(cert_row["Token_ids"])
+                    loc_dictionary_vectors.append(neg_row["Token_ids"])
+
+                    loc_labels.append(0.0)
+
+            return loc_certificate_vectors, loc_dictionary_vectors, loc_labels
+
+        certificate_vectors = []
+        dictionary_vectors = []
+        labels = []
+
+        with ThreadPoolExecutor(max_workers=workers) as executor:
+            futures = []
+            for i in range(0, len(certificate_data), chunk_size):
+                futures.append(executor.submit(_run_build_pairs, certificate_data[i:i + chunk_size]))
+
+            compl_futures = concurrent.futures.as_completed(futures)
+            for future in tqdm(compl_futures, desc="build-pairs", total=len(futures)):
+                slice_result = future.result()
+                certificate_vectors = certificate_vectors + slice_result[0]
+                dictionary_vectors = dictionary_vectors + slice_result[1]
+                labels = labels + slice_result[2]
+
+        data = {"Cert_input": certificate_vectors, "Dict_input": dictionary_vectors, "Label": labels}
+        return pd.DataFrame(data)
+
     def build_rnn_input(self, data: DataFrame, column: str, max_length: int, vector_size: int) -> np.ndarray:
         data_matrix = np.zeros((len(data), max_length, vector_size))
 
@@ -370,16 +432,20 @@ if __name__ == "__main__":
train_emb_parser.add_argument("lang", help="Language to train on", choices=["it", "fr", "hu", "all-con"]) train_emb_parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int) train_emb_parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int) + train_emb_parser.add_argument("--workers", help="Number of threads during pair building", default=4, type=int) + train_emb_parser.add_argument("--slice_size", help="Number of cert entries to be handled by one thread during pair building", default=1000, type=int) train_emb_parser.add_argument("--train_ratio", help="Ratio of samples (from the complete data set) to use for training", default=0.8, type=float) train_emb_parser.add_argument("--val_ratio", help="Ratio of samples (from the evaluation data set) to use for validation", default=0.3, type=float) train_emb_parser.add_argument("--samples", help="Number of instances to sample from the (original) training data", default=None, type=int) + train_emb_parser.add_argument("--down_sample", help="Maximal frequency of ICD10 code until start down sampling", default=None, type=int) train_emb_parser.add_argument("--strat_column", help="Column used to stratify the data sets", default="ICD10_masked", type=str) train_emb_parser.add_argument("--strat_splits", help="Indicates whether to use stratified sampling", default=False, type=bool) train_emb_parser.add_argument("--strat_min_freq", help="Min frequency of an icd10 code to be an own class during stratification", default=8, type=int) train_emb_parser.add_argument("--target_labels", help="Target columns for the classification models", default=["icd10"], action='append') train_emb_parser.add_argument("--single_only", help="Indicates whether just to use the single code lines from the data", default=False, type=bool) + train_emb_parser.add_argument("--max_pos_samples", help="Maximal number of positive samples to use", default=10, type=int) train_emb_parser.add_argument("--neg_sampling", help="Negative sampling strategy to use", default="ext1", choices=["def", "ext1"]) train_emb_parser.add_argument("--num_neg_samples", help="Number of negative samples to use (default strategy)", default=75, type=int) train_emb_parser.add_argument("--num_neg_cha", help="Number of negative chapter samples to use (ext1 strategy)", default=10, type=int) @@ -423,20 +489,24 @@ if __name__ == "__main__": #sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] #ft_model = FastTextModel("dummy", [FastText(sentences, min_count=1)]) + if args.down_sample: + certificates = clef_data.down_sample_by_icd10_frequency(certificates, args.down_sample) + if args.single_only: certificates = clef_data.filter_single_code_lines(certificates) if args.strat_splits: certificates = clef_data.add_masked_icd10_column(certificates, args.strat_min_freq) - configuration = clef18_task1.prepare_data_set(certificates, dictionary, ft_model, args.train_ratio, args.val_ratio,args.strat_column, + configuration = clef18_task1.prepare_data_set(certificates, dictionary, ft_model, args.train_ratio, args.val_ratio, args.strat_column, args.samples, args.strat_splits) clef18_task1.save_configuration(configuration) neg_sampling = NegativeSampling() neg_sampling_strategy = neg_sampling.get_strategy_by_name(args.neg_sampling, args) - embedding_model = clef18_task1.train_embedding_model(configuration, ft_model, neg_sampling_strategy, args.epochs, args.batch_size) + embedding_model = clef18_task1.train_embedding_model(configuration, ft_model, args.max_pos_samples, 
+                                                             args.epochs, args.batch_size, args.workers, args.slice_size)
 
         clef18_task1.train_and_evaluate_classifiers(embedding_model, configuration, args.target_labels)
 
@@ -454,9 +524,10 @@ if __name__ == "__main__":
         test_certificates = clef_data.read_test_certifcates_by_lang(args.lang)
 
         ft_embeddings = FastTextEmbeddings()
-        #ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
-        sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
-        ft_model = FastTextModel("dummy", [FastText(sentences, min_count=1)])
+        ft_model = ft_embeddings.load_embeddings_by_id(args.lang)
+
+        #sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+        #ft_model = FastTextModel("dummy", [FastText(sentences, min_count=1)])
 
         clef18_task1.predict(embedding_model, classifer_model, configuration, test_certificates)
 
diff --git a/code_mario/preprocessing.py b/code_mario/preprocessing.py
index 4bfb28f..ac7fad0 100644
--- a/code_mario/preprocessing.py
+++ b/code_mario/preprocessing.py
@@ -88,6 +88,13 @@ class DataPreparationUtil(object):
 
         return MapFunction(icd10_column, _mask_icd10, target_column)
 
+    @staticmethod
+    def clean_text(column: str):
+        def _clean(value):
+            return str(value).replace("\"", " ")
+
+        return MapFunction(column, _clean)
+
 
 class FitMixin(object):
-- 
GitLab
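
Note on the core of the change: down_sample_by_icd10_frequency caps every ICD10 code at roughly max_freq training lines while always keeping one entry per distinct RawText, so no surface form disappears from the training data. This is also why the new clean_text helper strips double quotes from RawText: the method embeds the text verbatim in a pandas query string, which a stray quote would break. A minimal standalone sketch of the sampling idea, for reading alongside the diff (the DataFrame df, the down_sample name and the drop_duplicates shortcut are illustrative assumptions, not the patched API):

    import pandas as pd

    def down_sample(df: pd.DataFrame, max_freq: int) -> pd.DataFrame:
        parts = []
        for _, group in df.groupby("ICD10"):
            if len(group) > max_freq:
                # Keep one row per distinct text so every surface form survives ...
                uniques = group.drop_duplicates(subset="RawText")
                # ... then top up with a random sample; the patch uses
                # max(max_freq - len(unique_texts), 10) at this point.
                fill = max(max_freq - len(uniques), 0)
                group = pd.concat([uniques, group.sample(fill)])
            parts.append(group)
        return pd.concat(parts).sample(frac=1)  # reshuffle the rows

With the patch applied, the behaviour is switched on from the training script via the new --down_sample flag, e.g. --down_sample 2750, the threshold used for the French certificates in the debugging block of clef18_task1_data.py.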