From 758bf2203dee2fc3ad9983b77fc1dae66f6f3574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20Sa=CC=88nger?= <mario.saenger@student.hu-berlin.de> Date: Mon, 7 May 2018 18:03:11 +0200 Subject: [PATCH] Add reading of all french training files --- code_mario/clef18_task1_data.py | 44 ++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/code_mario/clef18_task1_data.py b/code_mario/clef18_task1_data.py index 5f85ef0..8fd724f 100644 --- a/code_mario/clef18_task1_data.py +++ b/code_mario/clef18_task1_data.py @@ -1,4 +1,6 @@ import os +from typing import List + import pandas as pd from pandas import DataFrame @@ -45,7 +47,7 @@ class Clef18Task1Data(LoggingMixin): calculees_file = os.path.join(base_folder, "CausesCalculees_IT_1.csv") brutes_file = os.path.join(base_folder, "CausesBrutes_IT_1.csv") - return self._read_certificates(calculees_file, brutes_file) + return self._read_certificates([calculees_file], [brutes_file]) def read_it_dictionary(self) -> DataFrame: base_folder = "data/train/IT/training/raw/dictionaries" @@ -60,7 +62,7 @@ class Clef18Task1Data(LoggingMixin): calculees_file = os.path.join(base_folder, "CausesCalculees_HU_1.csv") brutes_file = os.path.join(base_folder, "CausesBrutes_HU_1.csv") - return self._read_certificates(calculees_file, brutes_file) + return self._read_certificates([calculees_file], [brutes_file]) def read_hu_dictionary(self) -> DataFrame: base_folder = "data/train/HU/training/raw/dictionaries" @@ -73,10 +75,19 @@ class Clef18Task1Data(LoggingMixin): # FIXME: Load other training files from 2011-2015! base_folder = "data/train/FR/training/raw/corpus/" - calculees_file = os.path.join(base_folder, "CausesCalculees_FR_2006-2012.csv") - brutes_file = os.path.join(base_folder, "CausesBrutes_FR_2006-2012.csv") + calculees_files = [ + os.path.join(base_folder, "CausesCalculees_FR_2006-2012.csv"), + os.path.join(base_folder, "CausesCalculees_FR_2013.csv"), + os.path.join(base_folder, "CausesCalculees_FR_2014.csv") + ] + + brutes_files = [ + os.path.join(base_folder, "CausesBrutes_FR_2006-2012.csv"), + os.path.join(base_folder, "CausesBrutes_FR_2013.csv"), + os.path.join(base_folder, "CausesBrutes_FR_2014.csv") + ] - return self._read_certificates(calculees_file, brutes_file) + return self._read_certificates(calculees_files, brutes_files) def read_fr_dictionary(self) -> DataFrame: # FIXME: Load other training files from 2011-2015! @@ -122,13 +133,24 @@ class Clef18Task1Data(LoggingMixin): # -------------------------------------------------------------------------------- - def _read_certificates(self, calculees_file: str, brutus_file: str) -> DataFrame: - self.logger.info("Reading calculees file from %s", calculees_file) - calculees_data = pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"], skipinitialspace=True) - self.logger.info("Found %s death certificate lines", len(calculees_data)) + def _read_certificates(self, calculees_files: List[str], brutus_files: List[str]) -> DataFrame: + calculees_data = [] + for calculees_file in calculees_files: + self.logger.info("Reading calculees file from %s", calculees_file) + calculees_data.append(pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"], skipinitialspace=True)) + self.logger.info("Found %s death certificate entries", len(calculees_data[-1])) + + calculees_data = pd.concat(calculees_data) + self.logger.info("Found %s death certificate lines in total", len(calculees_data)) + + brutus_data = [] + for brutus_file in brutus_files: + self.logger.info("Reading brutus file from %s", brutus_file) + brutus_data.append(pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"])) + self.logger.info("Found %s death certificate entries", len(brutus_data[-1])) + - self.logger.info("Reading brutus file from %s", brutus_file) - brutus_data = pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"]) + brutus_data = pd.concat(brutus_data) joined_data = brutus_data.join(calculees_data, lsuffix="_b", rsuffix="_c") joined_data["ICD10"] = joined_data["ICD10"].astype(str) -- GitLab