Skip to content
Snippets Groups Projects
Commit 758bf220 authored by Mario Sänger
Browse files

Add reading of all French training files

parent 3ae09c9b
No related merge requests found
import os import os
from typing import List
import pandas as pd import pandas as pd
from pandas import DataFrame from pandas import DataFrame
...@@ -45,7 +47,7 @@ class Clef18Task1Data(LoggingMixin): ...@@ -45,7 +47,7 @@ class Clef18Task1Data(LoggingMixin):
calculees_file = os.path.join(base_folder, "CausesCalculees_IT_1.csv") calculees_file = os.path.join(base_folder, "CausesCalculees_IT_1.csv")
brutes_file = os.path.join(base_folder, "CausesBrutes_IT_1.csv") brutes_file = os.path.join(base_folder, "CausesBrutes_IT_1.csv")
return self._read_certificates(calculees_file, brutes_file) return self._read_certificates([calculees_file], [brutes_file])
def read_it_dictionary(self) -> DataFrame: def read_it_dictionary(self) -> DataFrame:
base_folder = "data/train/IT/training/raw/dictionaries" base_folder = "data/train/IT/training/raw/dictionaries"
...@@ -60,7 +62,7 @@ class Clef18Task1Data(LoggingMixin): ...@@ -60,7 +62,7 @@ class Clef18Task1Data(LoggingMixin):
calculees_file = os.path.join(base_folder, "CausesCalculees_HU_1.csv") calculees_file = os.path.join(base_folder, "CausesCalculees_HU_1.csv")
brutes_file = os.path.join(base_folder, "CausesBrutes_HU_1.csv") brutes_file = os.path.join(base_folder, "CausesBrutes_HU_1.csv")
return self._read_certificates(calculees_file, brutes_file) return self._read_certificates([calculees_file], [brutes_file])
def read_hu_dictionary(self) -> DataFrame: def read_hu_dictionary(self) -> DataFrame:
base_folder = "data/train/HU/training/raw/dictionaries" base_folder = "data/train/HU/training/raw/dictionaries"
...@@ -73,10 +75,19 @@ class Clef18Task1Data(LoggingMixin): ...@@ -73,10 +75,19 @@ class Clef18Task1Data(LoggingMixin):
# FIXME: Load other training files from 2011-2015! # FIXME: Load other training files from 2011-2015!
base_folder = "data/train/FR/training/raw/corpus/" base_folder = "data/train/FR/training/raw/corpus/"
calculees_file = os.path.join(base_folder, "CausesCalculees_FR_2006-2012.csv") calculees_files = [
brutes_file = os.path.join(base_folder, "CausesBrutes_FR_2006-2012.csv") os.path.join(base_folder, "CausesCalculees_FR_2006-2012.csv"),
os.path.join(base_folder, "CausesCalculees_FR_2013.csv"),
os.path.join(base_folder, "CausesCalculees_FR_2014.csv")
]
brutes_files = [
os.path.join(base_folder, "CausesBrutes_FR_2006-2012.csv"),
os.path.join(base_folder, "CausesBrutes_FR_2013.csv"),
os.path.join(base_folder, "CausesBrutes_FR_2014.csv")
]
return self._read_certificates(calculees_file, brutes_file) return self._read_certificates(calculees_files, brutes_files)
def read_fr_dictionary(self) -> DataFrame: def read_fr_dictionary(self) -> DataFrame:
# FIXME: Load other training files from 2011-2015! # FIXME: Load other training files from 2011-2015!
...@@ -122,13 +133,24 @@ class Clef18Task1Data(LoggingMixin): ...@@ -122,13 +133,24 @@ class Clef18Task1Data(LoggingMixin):
# -------------------------------------------------------------------------------- # --------------------------------------------------------------------------------
def _read_certificates(self, calculees_file: str, brutus_file: str) -> DataFrame: def _read_certificates(self, calculees_files: List[str], brutus_files: List[str]) -> DataFrame:
self.logger.info("Reading calculees file from %s", calculees_file) calculees_data = []
calculees_data = pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"], skipinitialspace=True) for calculees_file in calculees_files:
self.logger.info("Found %s death certificate lines", len(calculees_data)) self.logger.info("Reading calculees file from %s", calculees_file)
calculees_data.append(pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"], skipinitialspace=True))
self.logger.info("Found %s death certificate entries", len(calculees_data[-1]))
calculees_data = pd.concat(calculees_data)
self.logger.info("Found %s death certificate lines in total", len(calculees_data))
brutus_data = []
for brutus_file in brutus_files:
self.logger.info("Reading brutus file from %s", brutus_file)
brutus_data.append(pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"]))
self.logger.info("Found %s death certificate entries", len(brutus_data[-1]))
self.logger.info("Reading brutus file from %s", brutus_file) brutus_data = pd.concat(brutus_data)
brutus_data = pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"])
joined_data = brutus_data.join(calculees_data, lsuffix="_b", rsuffix="_c") joined_data = brutus_data.join(calculees_data, lsuffix="_b", rsuffix="_c")
joined_data["ICD10"] = joined_data["ICD10"].astype(str) joined_data["ICD10"] = joined_data["ICD10"].astype(str)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment