From 232704f62199d5e1bba9ed403724cdc82ce171b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20Sa=CC=88nger?= <mario.saenger@student.hu-berlin.de>
Date: Fri, 23 Mar 2018 17:51:54 +0100
Subject: [PATCH] Initial version of a representation learning model for
 eCLEF18-Task1. The model is able to distinguish between correct
 line-ICD10-code pairs (originating from the training data) and randomly
 sampled corrupted pairs.

---
 .gitignore                             |   6 +
 code/app_context.py                    |  29 ++++
 code/clef18_task1.py                   | 223 +++++++++++++++++++++++++
 code/clef18_task1_data.py              | 134 +++++++++++++++
 code/dummy.txt                         |   0
 code/embeddings/download_embeddings.sh |  15 ++
 code/ft_embeddings.py                  |  20 +++
 code/preprocessing.py                  | 150 +++++++++++++++++
 code/util.py                           |  55 ++++++
 requirements.txt                       |   8 +
 10 files changed, 640 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 code/app_context.py
 create mode 100644 code/clef18_task1.py
 create mode 100644 code/clef18_task1_data.py
 delete mode 100644 code/dummy.txt
 create mode 100755 code/embeddings/download_embeddings.sh
 create mode 100644 code/ft_embeddings.py
 create mode 100644 code/preprocessing.py
 create mode 100644 code/util.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fcfca8e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+.idea/
+
+code/_env
+code/embeddings
+
+*.pyc
diff --git a/code/app_context.py b/code/app_context.py
new file mode 100644
index 0000000..a7fcd47
--- /dev/null
+++ b/code/app_context.py
@@ -0,0 +1,29 @@
+import os
+from util import LogUtil
+
+
+class AppContext(object):
+
+    default_context = None
+
+    def __init__(self, default_log_file: str, log_dir: str, output_dir: str):
+        self.default_log_file = default_log_file
+        self.log_dir = log_dir
+        self.output_dir = output_dir
+
+    @staticmethod
+    def get_default():
+        return AppContext.default_context
+
+    @staticmethod
+    def initialize_by_app_name(app_name: str):
+        log_dir = LogUtil.create_timestamped_log_dir(app_name)
+        log_file = os.path.join(log_dir, "{}-application.log".format(app_name))
+
+        AppContext.default_context = AppContext(log_file, log_dir, log_dir)
+        return AppContext.default_context
+
+    @staticmethod
+    def initialize(default_log_file: str, log_dir: str, output_dir: str):
+        AppContext.default_context = AppContext(default_log_file, log_dir, output_dir)
+        return AppContext.default_context
diff --git a/code/clef18_task1.py b/code/clef18_task1.py
new file mode 100644
index 0000000..f92fd1f
--- /dev/null
+++ b/code/clef18_task1.py
@@ -0,0 +1,223 @@
+import argparse
+
+import numpy as np
+import pandas as pd
+from keras.losses import binary_crossentropy
+
+from keras.utils import np_utils
+from gensim.models import FastText
+from keras import Input, Model
+from keras.layers import Bidirectional, Dense, Dot, LSTM
+from pandas import DataFrame
+from sklearn.metrics import f1_score, accuracy_score
+from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
+from sklearn.pipeline import Pipeline
+from tqdm import tqdm
+from typing import Tuple
+
+from app_context import AppContext
+from clef18_task1_data import Clef18Task1Data
+from ft_embeddings import FastTextEmbeddings
+from preprocessing import DataPreparationUtil as pdu
+from util import LoggingMixin
+
+
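+# Pair scoring model: a certificate line and an ICD-10 dictionary entry are each
+# encoded by a bidirectional LSTM; the cosine similarity of the two representations
+# is fed through a sigmoid to predict whether the pair is a true match.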
+class Clef18Task1(LoggingMixin):
+
+    def __init__(self):
+        LoggingMixin.__init__(self, self.__class__.__name__)
+
+    def train_and_evaluate(self, certificate_df: DataFrame, dictionary_df: DataFrame, ft_model: FastText,
+                           test_ratio: float, neg_samples: int, epochs: int, batch_size: int):
+        self.logger.info("Start evaluation")
+
+        self.logger.info("Splitting certificate lines into train and test")
+        train_cert_df, test_cert_df = self.split_train_test(certificate_df, test_ratio)
+        self.logger.info("Finished splitting: train=%s instances, test=%s instances", len(train_cert_df), len(test_cert_df))
+
+        self.logger.info("Start preparation of training cert data (%s instances)", len(train_cert_df))
+        train_cert_df, max_cert_length = self.prepare_certificate_df(train_cert_df, ft_model)
+
+        self.logger.info("Start preparation of dictionary data (%s instances)", len(dictionary_df))
+        dictionary_df, max_dict_length = self.prepare_dictionary_df(dictionary_df, ft_model)
+
+        self.logger.info("Start building training pairs")
+        train_pair_data = self.build_pairs(train_cert_df, dictionary_df, neg_samples)
+        print(train_pair_data["Label"].value_counts())
+
+        self.logger.info("Start building training matrices")
+        cert_matrix, dict_matrix, labels = self.build_matrices(train_pair_data, max_cert_length, max_dict_length, ft_model.vector_size)
+
+        self.logger.info("Start building model")
+        model = self.build_model(ft_model, max_cert_length, max_dict_length)
+
+        self.logger.info("Start model training")
+        model.fit([cert_matrix, dict_matrix], labels, epochs=epochs, batch_size=batch_size)
+
+        ## ----------------------------------------------------------------------------------------------------------
+
+        self.logger.info("Start preparation of test cert data (%s instances)", len(test_cert_df))
+        test_cert_df, _ = self.prepare_certificate_df(test_cert_df, ft_model, max_cert_length)
+
+        self.logger.info("Start creation of test pairs")
+        test_pair_data = self.build_pairs(test_cert_df, dictionary_df, neg_samples)
+
+        self.logger.info("Start building test matrices")
+        test_cert_matrix, test_dict_matrix, gold_labels = self.build_matrices(test_pair_data, max_cert_length, max_dict_length, ft_model.vector_size)
+
+        self.logger.info("Start prediction of test labels")
+        pred_labels = model.predict([test_cert_matrix, test_dict_matrix], verbose=1)
+        pred_labels = (pred_labels > 0.5).astype(float)
+
+        f1_value = f1_score(gold_labels, pred_labels)
+        acc_value = accuracy_score(gold_labels, pred_labels)
+
+        self.logger.info("Result: f1_score= %s | acc_score= %s", f1_value, acc_value)
+
+    def split_train_test(self, certificate_df: DataFrame, test_ratio: float) -> Tuple[DataFrame, DataFrame]:
+        #splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=42)
+        #split = splitter.split(certificate_df, certificate_df["ICD10"])
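+        # Note: the stratified split above is disabled, presumably because ICD-10 codes
+        # that occur only once cannot be split into both train and test.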
+
+        splitter = ShuffleSplit(n_splits=1, test_size=test_ratio, random_state=42)
+        split = splitter.split(certificate_df)
+
+        for train_indices, test_indices in split:
+            training_data = certificate_df.iloc[train_indices]
+            test_data = certificate_df.iloc[test_indices]
+
+        return training_data, test_data
+
+    def build_model(self, embedding_model: FastText, max_cert_length: int, max_dict_length: int):
+        # TODO: Think about building an embedding layer
+        # TODO: Make hyper-parameters configurable!
+        # TODO: Think about using CNNs instead of RNNs since there can be multiple ICD-10 codes per line!
+
+        # Model 1: Learn a representation of a line originating from a death certificate
+        input_certificate_line = Input((max_cert_length, embedding_model.vector_size))
+        certificate_rnn = Bidirectional(LSTM(200))(input_certificate_line)
+
+        # Model 2: Learn a representation of a line in the ICD-10 dictionary (~ DiagnosisText)
+        input_dictionary_line = Input((max_dict_length, embedding_model.vector_size))
+        dictionary_rnn = Bidirectional(LSTM(200))(input_dictionary_line)
+
+        # Calculate the cosine similarity (Dot with normalize=True) between both representations
+        dot_product = Dot(axes=1, normalize=True)([certificate_rnn, dictionary_rnn])
+
+        output = Dense(1, activation='sigmoid')(dot_product)
+
+        # Create the primary training model
+        model = Model(inputs=[input_certificate_line, input_dictionary_line], outputs=output)
+        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])
+
+        return model
+
+    def prepare_certificate_df(self, certificate_df: DataFrame, ft_model: FastText, max_length: int=None) -> Tuple[DataFrame, int]:
+        certificate_pipeline = Pipeline([
+            ("LowercaseText", pdu.to_lowercase("RawText")),
+            ("TokenizeDiagnosis", pdu.tokenize("RawText", "Tokens")),
+            ("CountTokens", pdu.count_values("Tokens", "NumTokens")),
+            ("LookupFastTextVectors", pdu.lookup_fast_text_vectors("Tokens", "FtVectors", ft_model))
+        ])
+
+        cert_data_prepared = certificate_pipeline.fit_transform(certificate_df)
+
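+        # At training time (max_length=None) lines are padded to the longest line in the
+        # data; at test time the max_length determined on the training data is reused so
+        # that train and test matrices share the same shape.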
+        if not max_length:
+            max_length = cert_data_prepared["NumTokens"].max()
+
+        cert_data_prepared = pdu.vectors_to_matrix("FtVectors", "FtMatrix", ft_model.vector_size, max_length).fit_transform(cert_data_prepared)
+
+        return cert_data_prepared, max_length
+
+    def prepare_dictionary_df(self, dictionary_df: DataFrame, ft_model: FastText) -> Tuple[DataFrame, int]:
+        dictionary_pipeline = Pipeline([
+            ("CombineTexts", pdu.combine_texts(["DiagnosisText", "Standardized"], "DictText")),
+            ("LowercaseText", pdu.to_lowercase("DictText")),
+            ("TokenizeDiagnosis", pdu.tokenize("DictText", "Tokens")),
+            ("CountTokens", pdu.count_values("Tokens", "NumTokens")),
+            ("LookupTokenIds", pdu.lookup_fast_text_vectors("Tokens", "FtVectors", ft_model)),
+        ])
+
+        dict_data_prepared = dictionary_pipeline.fit_transform(dictionary_df)
+        max_length = dict_data_prepared["NumTokens"].max()
+
+        dict_data_prepared = pdu.vectors_to_matrix("FtVectors", "FtMatrix", ft_model.vector_size, max_length).fit_transform(dict_data_prepared)
+
+        return dict_data_prepared, max_length
+
+    def build_pairs(self, certificate_data: DataFrame, dictionary_data: DataFrame, num_neg_samples: int):
+        # FIXME: This can be implemented more efficiently!
+        # FIXME: Improve sampling of negative instances (e.g. if the correct code is I-XXX, prefer other codes from the same chapter, such as I-YYY)
+        # FIXME: Use a negative sampling ratio relative to the number of true dictionary entries, e.g. 0.5 or 1.2
+
+        certificate_vectors = []
+        dictionary_vectors = []
+        labels = []
+
+        for i, cert_row in tqdm(certificate_data.iterrows(), desc="build-pairs", total=len(certificate_data)):
+            line_icd10_code = cert_row["ICD10"]
+
+            # Build positive examples (based on training data)
+            dictionary_entries = dictionary_data.query("Icd1 == '%s'" % line_icd10_code)
+            #self.logger.info("Found %s entries for ICD-10 code %s", len(dictionary_entries), line_icd10_code)
+            for _, dict_row in dictionary_entries.iterrows():
+                certificate_vectors.append(cert_row["FtMatrix"])
+                dictionary_vectors.append(dict_row["FtMatrix"])
+
+                labels.append(1.0)
+
+            # Find illegal ICD-10 for this line
+            negative_samples = dictionary_data.query("Icd1 != '%s'" % line_icd10_code)
+            negative_samples = negative_samples.sample(num_neg_samples)
+
+            # Build negative samples
+            for _, dict_row in negative_samples.iterrows():
+                certificate_vectors.append(cert_row["FtMatrix"])
+                dictionary_vectors.append(dict_row["FtMatrix"])
+
+                labels.append(0.0)
+
+        data = {"CertFtMatrix": certificate_vectors, "DictFtMatrix": dictionary_vectors, "Label": labels}
+        return pd.DataFrame(data)
+
+    def build_matrices(self, pair_data: DataFrame, max_cert_length: int, max_dict_length: int, vector_size: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        num_pairs = len(pair_data)
+
+        certificate_matrix = np.zeros((num_pairs, max_cert_length, vector_size))
+        dictionary_matrix = np.zeros((num_pairs, max_dict_length, vector_size))
+        label_matrix = np.zeros((num_pairs))
+
+        for i, (_, row) in tqdm(enumerate(pair_data.iterrows()), desc="build-matrices", total=num_pairs):
+            certificate_matrix[i] = row["CertFtMatrix"]
+            dictionary_matrix[i] = row["DictFtMatrix"]
+            label_matrix[i] = row["Label"]
+
+        return certificate_matrix, dictionary_matrix, label_matrix
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(prog="CLEF2018")
+    parser.add_argument("--epochs", help="Number of epochs to train", default=10, type=int)
+    parser.add_argument("--batch_size", help="Batch size during training", default=10, type=int)
+    parser.add_argument("--test_ratio", help="Ratio of test samples to use", default=0.2, type=float)
+    parser.add_argument("--neg_samples", help="Number of negative samples to draw for each certificate line", default=75, type=int)
+    parser.add_argument("--sample", help="Number of instances to sample from the training data", default=None, type=int)
+
+    args = parser.parse_args()
+
+    AppContext.initialize_by_app_name("eCLEF2018-Task1")
+
+    clef_data = Clef18Task1Data()
+    it_dictionary = clef_data.read_it_dictionary()
+    it_certificates = clef_data.read_it_train_certificates()
+
+    if args.sample:
+        print("Sampling %s instances" % args.sample)
+        it_certificates = it_certificates.sample(args.sample, random_state=42)
+
+    ft_embeddings = FastTextEmbeddings()
+    ft_it_model = ft_embeddings.load_it_embeddings()
+
+    clef18_task1 = Clef18Task1()
+    clef18_task1.train_and_evaluate(it_certificates, it_dictionary, ft_it_model, args.test_ratio, args.neg_samples, args.epochs, args.batch_size)
+
+
diff --git a/code/clef18_task1_data.py b/code/clef18_task1_data.py
new file mode 100644
index 0000000..2f514fa
--- /dev/null
+++ b/code/clef18_task1_data.py
@@ -0,0 +1,134 @@
+import os
+import pandas as pd
+
+from pandas import DataFrame
+
+from app_context import AppContext
+from preprocessing import CombineTexts
+from util import LoggingMixin
+
+
+class Clef18Task1Data(LoggingMixin):
+
+    def __init__(self):
+        LoggingMixin.__init__(self, self.__class__.__name__)
+
+    def read_it_train_certificates(self) -> DataFrame:
+        base_folder = "data/train/IT/training/raw/corpus/"
+
+        calculees_file = os.path.join(base_folder, "CausesCalculees_IT_1.csv")
+        brutes_file = os.path.join(base_folder, "CausesBrutes_IT_1.csv")
+
+        return self._read_certificates(calculees_file, brutes_file)
+
+    def read_it_dictionary(self) -> DataFrame:
+        base_folder = "data/train/IT/training/raw/dictionaries"
+        dictionary_file = os.path.join(base_folder, "dictionary_IT.csv")
+        return self._read_icd10_dictionary(dictionary_file, "iso-8859-1")
+
+    # --------------------------------------------------------------------------------
+
+    def read_hu_train_certificates(self) -> DataFrame:
+        base_folder = "data/train/HU/training/raw/corpus/"
+
+        calculees_file = os.path.join(base_folder, "CausesCalculees_HU_1.csv")
+        brutes_file = os.path.join(base_folder, "CausesBrutes_HU_1.csv")
+
+        return self._read_certificates(calculees_file, brutes_file)
+
+    def read_hu_dictionary(self) -> DataFrame:
+        base_folder = "data/train/HU/training/raw/dictionaries"
+        dictionary_file = os.path.join(base_folder, "Hungarian_dictionary_UTF8.csv")
+        return self._read_icd10_dictionary(dictionary_file, "utf-8")
+
+    # --------------------------------------------------------------------------------
+
+    def read_fr_train_certificates(self) -> DataFrame:
+        # FIXME: Load other training files from 2011-2015!
+        base_folder = "data/train/FR/training/raw/corpus/"
+
+        calculees_file = os.path.join(base_folder, "CausesCalculees_FR_2006-2012.csv")
+        brutes_file = os.path.join(base_folder, "CausesBrutes_FR_2006-2012.csv")
+
+        return self._read_certificates(calculees_file, brutes_file)
+
+    def read_fr_dictionary(self) -> DataFrame:
+        # FIXME: Load other training files from 2011-2015!
+
+        base_folder = "data/train/FR/training/aligned/dictionaries"
+        dictionary_file = os.path.join(base_folder, "Dictionnaire2006-2010.csv")
+        return self._read_icd10_dictionary(dictionary_file, "utf-8")
+
+    # --------------------------------------------------------------------------------
+
+    def _read_certificates(self, calculees_file: str, brutes_file: str) -> DataFrame:
+        self.logger.info("Reading calculees file from %s", calculees_file)
+        calculees_data = pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"])
+        self.logger.info("Found %s death certificate lines", len(calculees_data))
+
+        self.logger.info("Reading brutes file from %s", brutes_file)
+        brutes_data = pd.read_csv(brutes_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"])
+
+        joined_data = brutes_data.join(calculees_data, lsuffix="_b", rsuffix="_c")
+
+        return joined_data[["RawText", "ICD10"]]
+
+    def _read_icd10_dictionary(self, file: str, encoding: str) -> DataFrame:
+        self.logger.info("Reading ICD-10 dictionary from %s", file)
+        dictionary_data = pd.read_csv(file, sep=";", encoding=encoding)
+        num_dictionary_entries = len(dictionary_data)
+        self.logger.info("Found %s dictionary entries", num_dictionary_entries)
+
+        if "Standardized" not in dictionary_data.columns:
+            dictionary_data["Standardized"] = None
+
+        dictionary_data = dictionary_data[["Icd1", "Standardized", "DiagnosisText"]]
+        dictionary_data = dictionary_data.drop_duplicates()
+
+        self.logger.info("Removed %s duplicates from dictionary", num_dictionary_entries - len(dictionary_data))
+
+        return dictionary_data
+
+
+if __name__ == "__main__":
+    # Just for debugging / development purposes
+    AppContext.initialize_by_app_name("Clef18Task1-Data")
+
+    clef_task_data = Clef18Task1Data()
+
+    all_data_sets = {
+        "IT": (clef_task_data.read_it_train_certificates(), clef_task_data.read_it_dictionary()),
+        "HU": (clef_task_data.read_hu_train_certificates(), clef_task_data.read_hu_dictionary()),
+        "FR": (clef_task_data.read_fr_train_certificates(), clef_task_data.read_fr_dictionary())
+    }
+
+    for name, (certs, dictionary_df) in all_data_sets.items():
+        print("Data set: ", name)
+        ids = []
+        for i, row in certs.iterrows():
+            ids.append("{}##{}".format(i[0], i[1]))
+
+        certs["ID"] = ids
+        value_counts = certs["ID"].value_counts()
+        print(value_counts)
+
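+        # Rough estimate of how many certificate lines carry more than one / more than two
+        # ICD-10 codes (a line occurring k times in the joined data presumably has k codes)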
+        total_sum = value_counts.sum()
+        sum_more_than_one = 0
+        sum_more_than_two = 0
+        for _, count in value_counts.iteritems():
+            if count > 1:
+                sum_more_than_one = sum_more_than_one + count
+            if count > 2:
+                sum_more_than_two = sum_more_than_two + count
+
+
+        print(sum_more_than_one / total_sum)
+        print(sum_more_than_two / total_sum)
+
+        print(certs["ICD10"].value_counts())
+        print("\n\n\n")
+
+
+
+
+
diff --git a/code/dummy.txt b/code/dummy.txt
deleted file mode 100644
index e69de29..0000000
diff --git a/code/embeddings/download_embeddings.sh b/code/embeddings/download_embeddings.sh
new file mode 100755
index 0000000..0bb2427
--- /dev/null
+++ b/code/embeddings/download_embeddings.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+function log_and_run {
+	echo "$1"
+	eval "$1"
+}
+
+echo "Downloading Italian embeddings"
+log_and_run "wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.it.zip"
+
+echo "Unzipping downloaded vectors"
+log_and_run "unzip wiki.it.zip"
+
+echo "Clean directory"
+log_and_run "rm -f wiki.it.zip"
diff --git a/code/ft_embeddings.py b/code/ft_embeddings.py
new file mode 100644
index 0000000..34cc482
--- /dev/null
+++ b/code/ft_embeddings.py
@@ -0,0 +1,20 @@
+from gensim.models import FastText
+
+from util import LoggingMixin
+
+
+class FastTextEmbeddings(LoggingMixin):
+
+    def __init__(self):
+        LoggingMixin.__init__(self, self.__class__.__name__)
+
+    def load_it_embeddings(self) -> FastText:
+        italian_ft_file = "embeddings/wiki.it"
+        return self._load_ft_model(italian_ft_file)
+
+    def _load_ft_model(self, ft_file: str) -> FastText:
+        self.logger.info("Loading fast text embeddings from %s", ft_file)
+        ft_model = FastText.load_fasttext_format(ft_file)
+        self.logger.info("Finished loading of fast text model")
+
+        return ft_model
diff --git a/code/preprocessing.py b/code/preprocessing.py
new file mode 100644
index 0000000..781e81e
--- /dev/null
+++ b/code/preprocessing.py
@@ -0,0 +1,150 @@
+import re
+import numpy as np
+import pandas as pd
+
+from typing import Callable, List
+
+from gensim.models import FastText
+from gensim.models.keyedvectors import Word2VecKeyedVectors
+from pandas import DataFrame
+from sklearn.base import BaseEstimator, TransformerMixin
+
+from util import PandasUtil
+
+
+class DataPreparationUtil(object):
+
+    @staticmethod
+    def to_lowercase(column: str):
+        def _lower(text):
+            return str(text).lower()
+
+        return MapFunction(column, _lower)
+
+    @staticmethod
+    def tokenize(text_column: str, token_column: str = "tokens"):
+        return SimpleTokenizer(text_column, token_column)
+
+    @staticmethod
+    def count_values(column: str, target_column: str):
+        def _count(value):
+            return len(value)
+        return MapFunction(column, _count, target_column)
+
+    @staticmethod
+    def lookup_fast_text_vectors(token_column: str, target_column: str, ft_model: FastText):
+        return FastTextVectorsLookup(token_column, target_column, ft_model)
+
+    @staticmethod
+    def vectors_to_matrix(vectors_column: str, matrix_column: str, vector_size: int, max_length: int = None):
+        return VectorListToMatrix(vectors_column, matrix_column, vector_size, max_length)
+
+    @staticmethod
+    def combine_texts(columns: List[str], target_column: str):
+        return CombineTexts(columns, target_column)
+
+
+class FitMixin(object):
+
+    def fit(self, data, y=None):
+        return self
+
+
+class MapFunction(BaseEstimator, TransformerMixin, FitMixin):
+
+    def __init__(self, column: str, map_function: Callable, target_column: str = None):
+        self.column = column
+        self.map_function = map_function
+
+        if target_column:
+            self.target_column = target_column
+        else:
+            self.target_column = column
+
+    def transform(self, data: DataFrame, y=None):
+        values = data[self.column].apply(self.map_function)
+        return PandasUtil.append_column(data, self.target_column, values)
+
+
+class FastTextVectorsLookup(BaseEstimator, TransformerMixin, FitMixin):
+
+    def __init__(self, token_column: str, vector_column: str, ft_model: FastText):
+        self.token_column = token_column
+        self.vector_column = vector_column
+        self.ft_model = ft_model
+
+    def transform(self, data: DataFrame, y=None) -> DataFrame:
+        def _lookup_vectors(tokens):
+            vectors = []
+            for token in tokens:
+                try:
+                    vectors.append(self.ft_model[token])
+                except KeyError:
+                    print("Can't create embedding for "+token)
+            return vectors
+
+        vectors = data[self.token_column].apply(_lookup_vectors)
+        return PandasUtil.append_column(data, self.vector_column, vectors)
+
+
+class SimpleTokenizer(BaseEstimator, TransformerMixin, FitMixin):
+
+    def __init__(self, text_column: str, token_column: str):
+        self.text_column = text_column
+        self.token_column = token_column
+
+    def transform(self, data: DataFrame, y=None) -> DataFrame:
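+        # Split on whitespace and simple punctuation characters and drop empty tokens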
+        def _tokenize(value):
+            return [token.strip() for token in re.split(r"[ ,.=\t!]", str(value)) if token.strip()]
+
+        tokens = data[self.text_column].apply(_tokenize)
+        return PandasUtil.append_column(data, self.token_column, tokens)
+
+
+class CombineTexts(BaseEstimator, TransformerMixin, FitMixin):
+
+    def __init__(self, text_columns: List[str], target_column: str, joiner: str = " "):
+        self.text_columns = text_columns
+        self.target_column = target_column
+        self.joiner = joiner
+
+    def transform(self, data: DataFrame, y=None):
+        def _combine(row):
+            return self.joiner.join([str(row[column]).strip() for column in self.text_columns if str(row[column]).strip()])
+
+        combined_texts = data.apply(_combine, axis=1)
+        return PandasUtil.append_column(data, self.target_column, combined_texts)
+
+
+class VectorListToMatrix(BaseEstimator, TransformerMixin, FitMixin):
+
+    def __init__(self, vector_column: str, matrix_column: str, vector_size: int, max_length: int = None):
+        self.vector_column = vector_column
+        self.matrix_column = matrix_column
+        self.vector_size = vector_size
+        self.max_length = max_length
+
+    def transform(self, data: DataFrame, y=None):
+        if not self.max_length:
+            self.max_length = max([len(value) for value in data[self.vector_column].values])
+
+        matrices = data[self.vector_column].apply(self._build_matrix())
+        return PandasUtil.append_column(data, self.matrix_column, matrices)
+
+    def _build_matrix(self):
+        def _build(vectors):
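+            # Copy the token vectors into a fixed-size matrix, zero-padding shorter
+            # sequences and truncating sequences longer than max_length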
+            matrix = np.zeros((self.max_length, self.vector_size))
+            for i in range(self.max_length):
+                if i == len(vectors):
+                    break
+
+                matrix[i] = vectors[i]
+
+            return matrix
+
+        return _build
+
+
+
+
+
diff --git a/code/util.py b/code/util.py
new file mode 100644
index 0000000..d4b63ef
--- /dev/null
+++ b/code/util.py
@@ -0,0 +1,55 @@
+import os
+import logging
+
+from datetime import datetime
+from logging import Logger
+
+from pandas import DataFrame, Series
+
+
+class LogUtil(object):
+
+    LOG_ROOT_DIR = "_logs"
+
+    @staticmethod
+    def create_logger(name: str, std_out: bool = True, file_path: str = None, level=logging.DEBUG) -> Logger:
+        log_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+        logger = logging.getLogger(name)
+        logger.setLevel(level)
+
+        if std_out:
+            std_handler = logging.StreamHandler()
+            std_handler.setLevel(level)
+            std_handler.setFormatter(log_format)
+            logger.addHandler(std_handler)
+
+        if file_path is not None:
+            file_handler = logging.FileHandler(file_path, encoding="utf-8")
+            file_handler.setLevel(level)
+            file_handler.setFormatter(log_format)
+            logger.addHandler(file_handler)
+
+        return logger
+
+    @staticmethod
+    def create_timestamped_log_dir(sub_folder: str) -> str:
+        now_timestamp = datetime.utcnow().strftime("%Y%m%d%H%M%S")
+        dir_path = "{}/{}/{}/".format(LogUtil.LOG_ROOT_DIR, sub_folder, now_timestamp)
+        os.makedirs(dir_path)
+        return dir_path
+
+
+class LoggingMixin(object):
+
+    def __init__(self, logger_name: str):
+        self.logger = LogUtil.create_logger(logger_name)
+
+
+class PandasUtil(object):
+
+    @staticmethod
+    def append_column(data: DataFrame, column_name: str, values: Series):
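+        # DataFrame.assign returns a copy with the appended column; the input frame is not modified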
+        column = {column_name: values}
+        return data.assign(**column)
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f609de0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+pandas
+numpy
+nltk
+gensim
+keras
+scikit-learn
+tensorflow
+tqdm
-- 
GitLab