From 232704f62199d5e1bba9ed403724cdc82ce171b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20Sa=CC=88nger?= <mario.saenger@student.hu-berlin.de> Date: Fri, 23 Mar 2018 17:51:54 +0100 Subject: [PATCH] Initial version of representation learning model for eCLEF18-Task1. The model is able to distinguish between correct line-ICD10-code pairs (originating from the training data) and randomly sampled corrupted pairs. --- .gitignore | 6 + code/app_context.py | 29 ++++ code/clef18_task1.py | 223 +++++++++++++++++++++++++ code/clef18_task1_data.py | 134 +++++++++++++++ code/dummy.txt | 0 code/embeddings/download_embeddings.sh | 15 ++ code/ft_embeddings.py | 20 +++ code/preprocessing.py | 150 +++++++++++++++++ code/util.py | 55 ++++++ requirements.txt | 8 + 10 files changed, 640 insertions(+) create mode 100644 .gitignore create mode 100644 code/app_context.py create mode 100644 code/clef18_task1.py create mode 100644 code/clef18_task1_data.py delete mode 100644 code/dummy.txt create mode 100755 code/embeddings/download_embeddings.sh create mode 100644 code/ft_embeddings.py create mode 100644 code/preprocessing.py create mode 100644 code/util.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fcfca8e --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.idea/ + +code/_env +code/embeddings + +*.pyc diff --git a/code/app_context.py b/code/app_context.py new file mode 100644 index 0000000..a7fcd47 --- /dev/null +++ b/code/app_context.py @@ -0,0 +1,29 @@ +import os +from util import LogUtil + + +class AppContext(object): + + default_context = None + + def __init__(self, default_log_file: str, log_dir: str, output_dir: str): + self.default_log_file = default_log_file + self.log_dir = log_dir + self.output_dir = output_dir + + @staticmethod + def get_default(): + return AppContext.default_context + + @staticmethod + def initialize_by_app_name(app_name: str): + log_dir = LogUtil.create_timestamped_log_dir(app_name) + log_file = os.path.join(log_dir, "{}-application.log".format(app_name)) + + AppContext.default_context = AppContext(log_file, log_dir, log_dir) + return AppContext.default_context + + @staticmethod + def initialize(default_log_file: str, log_dir: str, output_dir: str): + AppContext.default_context = AppContext(default_log_file, log_dir, output_dir) + return AppContext.default_context diff --git a/code/clef18_task1.py b/code/clef18_task1.py new file mode 100644 index 0000000..f92fd1f --- /dev/null +++ b/code/clef18_task1.py @@ -0,0 +1,223 @@ +import argparse + +import numpy as np +import pandas as pd +from keras.losses import binary_crossentropy + +from keras.utils import np_utils +from gensim.models import FastText +from keras import Input, Model +from keras.layers import Bidirectional, Dense, Dot, LSTM +from pandas import DataFrame +from sklearn.metrics import f1_score, accuracy_score +from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit +from sklearn.pipeline import Pipeline +from tqdm import tqdm +from typing import Tuple + +from app_context import AppContext +from clef18_task1_data import Clef18Task1Data +from ft_embeddings import FastTextEmbeddings +from preprocessing import DataPreparationUtil as pdu +from util import LoggingMixin + + +class Clef18Task1(LoggingMixin): + + def __init__(self): + LoggingMixin.__init__(self, self.__class__.__name__) + + def train_and_evaluate(self, certificate_df: DataFrame, dictionary_df: DataFrame, ft_model: FastText, + test_ratio: float, neg_samples: int, epochs: int, batch_size: 
int):
+        self.logger.info("Start evaluation")
+
+        self.logger.info("Splitting certificate lines into train and test")
+        train_cert_df, test_cert_df = self.split_train_test(certificate_df, test_ratio)
+        self.logger.info("Finished splitting: train=%s instances, test=%s instances", len(train_cert_df), len(test_cert_df))
+
+        self.logger.info("Start preparation of training cert data (%s instances)", len(train_cert_df))
+        train_cert_df, max_cert_length = self.prepare_certificate_df(train_cert_df, ft_model)
+
+        self.logger.info("Start preparation of dictionary data (%s instances)", len(dictionary_df))
+        dictionary_df, max_dict_length = self.prepare_dictionary_df(dictionary_df, ft_model)
+
+        self.logger.info("Start building training pairs")
+        train_pair_data = self.build_pairs(train_cert_df, dictionary_df, neg_samples)
+        print(train_pair_data["Label"].value_counts())
+
+        self.logger.info("Start building training matrices")
+        cert_matrix, dict_matrix, labels = self.build_matrices(train_pair_data, max_cert_length, max_dict_length, ft_model.vector_size)
+
+        self.logger.info("Start building model")
+        model = self.build_model(ft_model, max_cert_length, max_dict_length)
+
+        self.logger.info("Start model training")
+        model.fit([cert_matrix, dict_matrix], labels, epochs=epochs, batch_size=batch_size)
+
+        ## ----------------------------------------------------------------------------------------------------------
+
+        self.logger.info("Start preparation of test cert data (%s instances)", len(test_cert_df))
+        test_cert_df, _ = self.prepare_certificate_df(test_cert_df, ft_model, max_cert_length)
+
+        self.logger.info("Start creation of test pairs")
+        test_pair_data = self.build_pairs(test_cert_df, dictionary_df, neg_samples)
+
+        self.logger.info("Start building test matrices")
+        test_cert_matrix, test_dict_matrix, gold_labels = self.build_matrices(test_pair_data, max_cert_length, max_dict_length, ft_model.vector_size)
+
+        self.logger.info("Start prediction of test labels")
+        pred_labels = model.predict([test_cert_matrix, test_dict_matrix], verbose=1)
+        pred_labels = (pred_labels > 0.5).astype(float)
+
+        f1_value = f1_score(gold_labels, pred_labels)
+        acc_value = accuracy_score(gold_labels, pred_labels)
+
+        self.logger.info("Result: f1_score= %s | acc_score= %s", f1_value, acc_value)
+
+    def split_train_test(self, certificate_df: DataFrame, test_ratio: float) -> Tuple[DataFrame, DataFrame]:
+        #splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=42)
+        #split = splitter.split(certificate_df, certificate_df["ICD10"])
+
+        splitter = ShuffleSplit(n_splits=1, test_size=test_ratio, random_state=42)
+        split = splitter.split(certificate_df)
+
+        for train_indices, test_indices in split:
+            training_data = certificate_df.iloc[train_indices]
+            test_data = certificate_df.iloc[test_indices]
+
+        return training_data, test_data
+
+    def build_model(self, embedding_model: FastText, max_cert_length: int, max_dict_length: int):
+        # TODO: Think about building an embedding layer
+        # TODO: Make hyper-parameters configurable!
+        # TODO: Think about using CNNs instead of RNNs since we can have multiple ICD-10 codes per line!
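+        # Architecture overview: a twin-encoder ("Siamese"-style) matching model.
+        # Each input is a sequence of FastText word vectors; a separate
+        # bidirectional LSTM encodes the certificate line and the dictionary
+        # entry into 400-dimensional representations (2 x 200 units). A
+        # normalized dot product (cosine similarity) compares the two encodings,
+        # and a sigmoid unit maps the score to the probability that the pair
+        # belongs together, trained with binary cross-entropy on the
+        # positive/negative pairs produced by build_pairs.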
+ + # Model 1: Learn a representation of a line originating from a death certificate + input_certificate_line = Input((max_cert_length, embedding_model.vector_size)) + certificate_rnn = Bidirectional(LSTM(200))(input_certificate_line) + + # Model 2: Learn a representation of a line in the ICD-10 dictionary (~ DiagnosisText) + input_dictionary_line = Input((max_dict_length, embedding_model.vector_size)) + dictionary_rnn = Bidirectional(LSTM(200))(input_dictionary_line) + + # Calculate similarity between both representations + dot_product = Dot(axes=1, normalize=True)([certificate_rnn, dictionary_rnn]) + + output = Dense(1, activation='sigmoid')(dot_product) + + # Create the primary training model + model = Model(inputs=[input_certificate_line, input_dictionary_line], outputs=output) + model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"]) + + return model + + def prepare_certificate_df(self, certificate_df: DataFrame, ft_model: FastText, max_length: int=None) -> Tuple[DataFrame, int]: + certificate_pipeline = Pipeline([ + ("LowercaseText", pdu.to_lowercase("RawText")), + ("TokenizeDiagnosis", pdu.tokenize("RawText", "Tokens")), + ("CountTokens", pdu.count_values("Tokens", "NumTokens")), + ("LookupFastTextVectors", pdu.lookup_fast_text_vectors("Tokens", "FtVectors", ft_model)) + ]) + + cert_data_prepared = certificate_pipeline.fit_transform(certificate_df) + + if not max_length: + max_length = cert_data_prepared["NumTokens"].max() + + cert_data_prepared = pdu.vectors_to_matrix("FtVectors", "FtMatrix", ft_model.vector_size, max_length).fit_transform(cert_data_prepared) + + return cert_data_prepared, max_length + + def prepare_dictionary_df(self, dictionary_df: DataFrame, ft_model: FastText) -> Tuple[DataFrame, int]: + dictionary_pipeline = Pipeline([ + ("CombineTexts", pdu.combine_texts(["DiagnosisText", "Standardized"], "DictText")), + ("LowercaseText", pdu.to_lowercase("DictText")), + ("TokenizeDiagnosis", pdu.tokenize("DictText", "Tokens")), + ("CountTokens", pdu.count_values("Tokens", "NumTokens")), + ("LookupTokenIds", pdu.lookup_fast_text_vectors("Tokens", "FtVectors", ft_model)), + ]) + + dict_data_prepared = dictionary_pipeline.fit_transform(dictionary_df) + max_length = dict_data_prepared["NumTokens"].max() + + dict_data_prepared = pdu.vectors_to_matrix("FtVectors", "FtMatrix", ft_model.vector_size, max_length).fit_transform(dict_data_prepared) + + return dict_data_prepared, max_length + + def build_pairs(self, certificate_data: DataFrame, dictionary_data: DataFrame, num_neg_samples: int): + # FIXME: This can be implemented more efficiently! + # FIXME: Improve sampling of negative instances (especially if code is I-XXX sample other codes of the same class (e.g. I-YYY) + # FIXME: Use negative sample ratio (depending on true dictionary entries), e.g. 
0.5 or 1.2
+
+        certificate_vectors = []
+        dictionary_vectors = []
+        labels = []
+
+        for i, cert_row in tqdm(certificate_data.iterrows(), desc="build-pairs", total=len(certificate_data)):
+            line_icd10_code = cert_row["ICD10"]
+
+            # Build positive examples (based on training data)
+            dictionary_entries = dictionary_data.query("Icd1 == '%s'" % line_icd10_code)
+            #self.logger.info("Found %s entries for ICD-10 code %s", len(dictionary_entries), line_icd10_code)
+            for _, dict_row in dictionary_entries.iterrows():
+                certificate_vectors.append(cert_row["FtMatrix"])
+                dictionary_vectors.append(dict_row["FtMatrix"])
+
+                labels.append(1.0)
+
+            # Find ICD-10 codes that do not belong to this line
+            negative_samples = dictionary_data.query("Icd1 != '%s'" % line_icd10_code)
+            negative_samples = negative_samples.sample(num_neg_samples)
+
+            # Build negative samples
+            for _, dict_row in negative_samples.iterrows():
+                certificate_vectors.append(cert_row["FtMatrix"])
+                dictionary_vectors.append(dict_row["FtMatrix"])
+
+                labels.append(0.0)
+
+        data = {"CertFtMatrix": certificate_vectors, "DictFtMatrix": dictionary_vectors, "Label": labels}
+        return pd.DataFrame(data)
+
+    def build_matrices(self, pair_data: DataFrame, max_cert_length: int, max_dict_length: int, vector_size: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        num_pairs = len(pair_data)
+
+        certificate_matrix = np.zeros((num_pairs, max_cert_length, vector_size))
+        dictionary_matrix = np.zeros((num_pairs, max_dict_length, vector_size))
+        label_matrix = np.zeros((num_pairs))
+
+        for i, (_, row) in tqdm(enumerate(pair_data.iterrows()), desc="build-matrices", total=num_pairs):
+            certificate_matrix[i] = row["CertFtMatrix"]
+            dictionary_matrix[i] = row["DictFtMatrix"]
+            label_matrix[i] = row["Label"]
+
+        return certificate_matrix, dictionary_matrix, label_matrix
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(prog="CLEF2018")
+    parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
+    parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
+    parser.add_argument("--test_ratio", help="Ratio of test samples to use", default=0.2, type=float)
+    parser.add_argument("--neg_samples", help="Number of negative samples for each pair to use", default=75, type=int)
+    parser.add_argument("--sample", help="Number of instances to sample from the training data", default=None, type=int)
+
+    args = parser.parse_args()
+
+    AppContext.initialize_by_app_name("eCLEF2018-Task1")
+
+    clef_data = Clef18Task1Data()
+    it_dictionary = clef_data.read_it_dictionary()
+    it_certificates = clef_data.read_it_train_certificates()
+
+    if args.sample:
+        print("Sampling %s instances" % args.sample)
+        it_certificates = it_certificates.sample(args.sample, random_state=42)
+
+    ft_embeddings = FastTextEmbeddings()
+    ft_it_model = ft_embeddings.load_it_embeddings()
+
+    clef18_task1 = Clef18Task1()
+    clef18_task1.train_and_evaluate(it_certificates, it_dictionary, ft_it_model, args.test_ratio, args.neg_samples, args.epochs, args.batch_size)
+
+
diff --git a/code/clef18_task1_data.py b/code/clef18_task1_data.py
new file mode 100644
index 0000000..2f514fa
--- /dev/null
+++ b/code/clef18_task1_data.py
@@ -0,0 +1,134 @@
+import os
+import pandas as pd
+
+from pandas import DataFrame
+
+from app_context import AppContext
+from preprocessing import CombineTexts
+from util import LoggingMixin
+
+
+class Clef18Task1Data(LoggingMixin):
+
+    def __init__(self):
+        LoggingMixin.__init__(self, self.__class__.__name__)
+
+    def read_it_train_certificates(self) -> DataFrame:
+        base_folder = "data/train/IT/training/raw/corpus/"
+
+        calculees_file = os.path.join(base_folder, "CausesCalculees_IT_1.csv")
+        brutes_file = os.path.join(base_folder, "CausesBrutes_IT_1.csv")
+
+        return self._read_certificates(calculees_file, brutes_file)
+
+    def read_it_dictionary(self) -> DataFrame:
+        base_folder = "data/train/IT/training/raw/dictionaries"
+        dictionary_file = os.path.join(base_folder, "dictionary_IT.csv")
+        return self._read_icd10_dictionary(dictionary_file, "iso-8859-1")
+
+    # --------------------------------------------------------------------------------
+
+    def read_hu_train_certificates(self) -> DataFrame:
+        base_folder = "data/train/HU/training/raw/corpus/"
+
+        calculees_file = os.path.join(base_folder, "CausesCalculees_HU_1.csv")
+        brutes_file = os.path.join(base_folder, "CausesBrutes_HU_1.csv")
+
+        return self._read_certificates(calculees_file, brutes_file)
+
+    def read_hu_dictionary(self) -> DataFrame:
+        base_folder = "data/train/HU/training/raw/dictionaries"
+        dictionary_file = os.path.join(base_folder, "Hungarian_dictionary_UTF8.csv")
+        return self._read_icd10_dictionary(dictionary_file, "utf-8")
+
+    # --------------------------------------------------------------------------------
+
+    def read_fr_train_certificates(self) -> DataFrame:
+        # FIXME: Load other training files from 2011-2015!
+        base_folder = "data/train/FR/training/raw/corpus/"
+
+        calculees_file = os.path.join(base_folder, "CausesCalculees_FR_2006-2012.csv")
+        brutes_file = os.path.join(base_folder, "CausesBrutes_FR_2006-2012.csv")
+
+        return self._read_certificates(calculees_file, brutes_file)
+
+    def read_fr_dictionary(self) -> DataFrame:
+        # FIXME: Load other training files from 2011-2015!
+
+        base_folder = "data/train/FR/training/aligned/dictionaries"
+        dictionary_file = os.path.join(base_folder, "Dictionnaire2006-2010.csv")
+        return self._read_icd10_dictionary(dictionary_file, "utf-8")
+
+    # --------------------------------------------------------------------------------
+
+    def _read_certificates(self, calculees_file: str, brutes_file: str) -> DataFrame:
+        self.logger.info("Reading calculees file from %s", calculees_file)
+        calculees_data = pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"])
+        self.logger.info("Found %s death certificate lines", len(calculees_data))
+
+        self.logger.info("Reading brutes file from %s", brutes_file)
+        brutes_data = pd.read_csv(brutes_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"])
+
+        joined_data = brutes_data.join(calculees_data, lsuffix="_b", rsuffix="_c")
+
+        return joined_data[["RawText", "ICD10"]]
+
+    def _read_icd10_dictionary(self, file: str, encoding: str) -> DataFrame:
+        self.logger.info("Reading ICD-10 dictionary from %s", file)
+        dictionary_data = pd.read_csv(file, sep=";", encoding=encoding)
+        num_dictionary_entries = len(dictionary_data)
+        self.logger.info("Found %s dictionary entries", num_dictionary_entries)
+
+        if "Standardized" not in dictionary_data.columns:
+            dictionary_data["Standardized"] = None
+
+        dictionary_data = dictionary_data[["Icd1", "Standardized", "DiagnosisText"]]
+        dictionary_data = dictionary_data.drop_duplicates()
+
+        self.logger.info("Removed %s duplicates from dictionary", num_dictionary_entries - len(dictionary_data))
+
+        return dictionary_data
+
+
+if __name__ == "__main__":
+    # Just for debugging / development purposes
+    AppContext.initialize_by_app_name("Clef18Task1-Data")
+
+    clef_task_data = Clef18Task1Data()
+
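+    # The loop below gathers simple corpus statistics per language: it builds a
+    # "DocID##LineID" identifier for every certificate row, counts how often each
+    # identifier occurs (a line apparently appears multiple times when it carries
+    # several gold ICD-10 codes after the CausesBrutes/CausesCalculees join), and
+    # prints the fraction of rows belonging to lines with more than one and more
+    # than two codes, followed by the overall ICD-10 label distribution.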
+ all_data_sets = { + "IT": (clef_task_data.read_it_train_certificates(), clef_task_data.read_it_dictionary()), + "HU": (clef_task_data.read_hu_train_certificates(), clef_task_data.read_hu_dictionary()), + "FR": (clef_task_data.read_fr_train_certificates(), clef_task_data.read_fr_dictionary()) + } + + for name, (certs, dict) in all_data_sets.items(): + print("Data set: ", name) + ids = [] + for i, row in certs.iterrows(): + ids.append("{}##{}".format(i[0], i[1])) + + certs["ID"] = ids + value_counts = certs["ID"].value_counts() + print(value_counts) + + total_sum = value_counts.sum() + sum_more_than_one = 0 + sum_more_than_two = 0 + for i, row in value_counts.iteritems(): + if row > 1: + sum_more_than_one = sum_more_than_one + row + if row > 2: + sum_more_than_two = sum_more_than_two + row + + + print(sum_more_than_one / total_sum) + print(sum_more_than_two / total_sum) + + print(certs["ICD10"].value_counts()) + print("\n\n\n") + + + + + diff --git a/code/dummy.txt b/code/dummy.txt deleted file mode 100644 index e69de29..0000000 diff --git a/code/embeddings/download_embeddings.sh b/code/embeddings/download_embeddings.sh new file mode 100755 index 0000000..0bb2427 --- /dev/null +++ b/code/embeddings/download_embeddings.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +function log_and_run { + echo $1 + eval "$1" +} + +echo "Downloading italian embeddings" +log_and_run "wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.it.zip" + +echo "Unzipping download vectors" +log_and_run "unzip wiki.it.zip" + +echo "Clean directory" +log_and_run "rm -f wiki.it.zip" diff --git a/code/ft_embeddings.py b/code/ft_embeddings.py new file mode 100644 index 0000000..34cc482 --- /dev/null +++ b/code/ft_embeddings.py @@ -0,0 +1,20 @@ +from gensim.models import FastText + +from util import LoggingMixin + + +class FastTextEmbeddings(LoggingMixin): + + def __init__(self): + LoggingMixin.__init__(self, __class__.__name__) + + def load_it_embeddings(self) -> FastText: + italian_ft_file = "embeddings/wiki.it" + return self._load_ft_model(italian_ft_file) + + def _load_ft_model(self, ft_file: str) -> FastText: + self.logger.info("Loading fast text embeddings from %s", ft_file) + ft_model = FastText.load_fasttext_format(ft_file) + self.logger.info("Finished loading of fast text model") + + return ft_model diff --git a/code/preprocessing.py b/code/preprocessing.py new file mode 100644 index 0000000..781e81e --- /dev/null +++ b/code/preprocessing.py @@ -0,0 +1,150 @@ +import re +import numpy as np +import pandas as pd + +from typing import Callable, List + +from gensim.models import FastText +from gensim.models.keyedvectors import Word2VecKeyedVectors +from pandas import DataFrame +from sklearn.base import BaseEstimator, TransformerMixin + +from util import PandasUtil + + +class DataPreparationUtil(object): + + @staticmethod + def to_lowercase(column: str): + def _lower(text): + return str(text).lower() + + return MapFunction(column, _lower) + + @staticmethod + def tokenize(text_column: str, token_column: str = "tokens"): + return SimpleTokenizer(text_column, token_column) + + @staticmethod + def count_values(column: str, target_column: str): + def _count(value): + return len(value) + return MapFunction(column, _count, target_column) + + @staticmethod + def lookup_fast_text_vectors(token_column: str, target_column: str, ft_model: FastText): + return FastTextVectorsLookup(token_column, target_column, ft_model) + + @staticmethod + def vectors_to_matrix(vectors_column: str, matrix_column: str, vector_size: int, max_length: int 
= None): + return VectorListToMatrix(vectors_column, matrix_column, vector_size, max_length) + + @staticmethod + def combine_texts(columns: List[str], target_column: str): + return CombineTexts(columns, target_column) + + +class FitMixin(object): + + def fit(self, data, y=None): + return self + + +class MapFunction(BaseEstimator, TransformerMixin, FitMixin): + + def __init__(self, column: str, map_function: Callable, target_column: str = None): + self.column = column + self.map_function = map_function + + if target_column: + self.target_column = target_column + else: + self.target_column = column + + def transform(self, data: DataFrame, y=None): + values = data[self.column].apply(self.map_function) + return PandasUtil.append_column(data, self.target_column, values) + + +class FastTextVectorsLookup(BaseEstimator, TransformerMixin, FitMixin): + + def __init__(self, token_column: str, vector_column: str, ft_model: FastText): + self.token_column = token_column + self.vector_column = vector_column + self.ft_model = ft_model + + def transform(self, data: DataFrame, y= None) -> DataFrame: + def _lookup_vectors(tokens): + vectors = [] + for token in tokens: + try: + vectors.append(self.ft_model[token]) + except KeyError: + print("Can't create embedding for "+token) + return vectors + + vectors = data[self.token_column].apply(_lookup_vectors) + return PandasUtil.append_column(data, self.vector_column, vectors) + + +class SimpleTokenizer(BaseEstimator, TransformerMixin, FitMixin): + + def __init__(self, text_column: str, token_column: str): + self.text_column = text_column + self.token_column = token_column + + def transform(self, data: DataFrame, y= None) -> DataFrame: + def _tokenize(value): + return [value.strip() for value in re.split("[ ,.=\t\!]", str(value)) if value.strip()] + + tokens = data[self.text_column].apply(_tokenize) + return PandasUtil.append_column(data, self.token_column, tokens) + + +class CombineTexts(BaseEstimator, TransformerMixin, FitMixin): + + def __init__(self, text_columns: List[str], target_column: str, joiner: str = " "): + self.text_columns = text_columns + self.target_column = target_column + self.joiner = joiner + + def transform(self, data: DataFrame, y=None): + def _combine(row): + return self.joiner.join([str(row[column]).strip() for column in self.text_columns if str(row[column]).strip()]) + + combined_texts = data.apply(_combine, axis=1) + return PandasUtil.append_column(data, self.target_column, combined_texts) + + +class VectorListToMatrix(BaseEstimator, TransformerMixin, FitMixin): + + def __init__(self, vector_column: str, matrix_column: str, vector_size: int, max_length: int = None): + self.vector_column = vector_column + self.matrix_column = matrix_column + self.vector_size = vector_size + self.max_length = max_length + + def transform(self, data: DataFrame, y=None): + if not self.max_length: + self.max_length = max([len(value) for value in data[self.vector_column].values]) + + matrices = data[self.vector_column].apply(self._build_matrix()) + return PandasUtil.append_column(data, self.matrix_column, matrices) + + def _build_matrix(self): + def _build(vectors): + matrix = np.zeros((self.max_length, self.vector_size)) + for i in range(self.max_length): + if i == len(vectors): + break + + matrix[i] = vectors[i] + + return matrix + + return _build + + + + + diff --git a/code/util.py b/code/util.py new file mode 100644 index 0000000..d4b63ef --- /dev/null +++ b/code/util.py @@ -0,0 +1,55 @@ +import os +import logging + +from datetime import datetime +from 
logging import Logger + +from pandas import DataFrame, Series + + +class LogUtil(object): + + LOG_ROOT_DIR = "_logs" + + @staticmethod + def create_logger(name: str, std_out: bool = True, file_path: str = None, level=logging.DEBUG) -> Logger: + log_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + + logger = logging.getLogger(name) + logger.setLevel(level) + + if std_out: + std_handler = logging.StreamHandler() + std_handler.setLevel(level) + std_handler.setFormatter(log_format) + logger.addHandler(std_handler) + + if file_path is not None: + file_handler = logging.FileHandler(file_path, encoding="utf-8") + file_handler.setLevel(level) + file_handler.setFormatter(log_format) + logger.addHandler(file_handler) + + return logger + + @staticmethod + def create_timestamped_log_dir(sub_folder: str) -> str: + now_timestamp = datetime.utcnow().strftime("%Y%m%d%H%M%S") + dir_path = "{}/{}/{}/".format(LogUtil.LOG_ROOT_DIR, sub_folder, now_timestamp) + os.makedirs(dir_path) + return dir_path + + +class LoggingMixin(object): + + def __init__(self, logger_name: str): + self.logger = LogUtil.create_logger(logger_name) + + +class PandasUtil(object): + + @staticmethod + def append_column(data: DataFrame, column_name: str, values: Series): + column = {column_name: values} + return data.assign(**column) + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f609de0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +pandas +numpy +nltk +gensim +keras +sklearn +tensorflow +tqdm -- GitLab
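A possible way to run the new training script (not spelled out in the patch itself): the readers in clef18_task1_data.py resolve their paths relative to the working directory (data/train/IT/...), and ft_embeddings.py expects the Italian fastText model under embeddings/wiki.it, so the commands below assume they are executed from the repository root and that the CLEF eHealth training data has already been unpacked under code/data/train/. All flag values are simply the argparse defaults from clef18_task1.py, except --sample, which is an arbitrary cap used here to keep the run short.

    pip install -r requirements.txt
    (cd code/embeddings && ./download_embeddings.sh)
    cd code
    python clef18_task1.py --epochs 10 --batch_size 10 --test_ratio 0.2 --neg_samples 75 --sample 5000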