Skip to content
Snippets Groups Projects
Commit 758bf220 authored by Mario Sänger
Browse files

Add reading of all french training files

parent 3ae09c9b
No related merge requests found
import os
from typing import List
import pandas as pd
from pandas import DataFrame
......@@ -45,7 +47,7 @@ class Clef18Task1Data(LoggingMixin):
calculees_file = os.path.join(base_folder, "CausesCalculees_IT_1.csv")
brutes_file = os.path.join(base_folder, "CausesBrutes_IT_1.csv")
return self._read_certificates(calculees_file, brutes_file)
return self._read_certificates([calculees_file], [brutes_file])
def read_it_dictionary(self) -> DataFrame:
base_folder = "data/train/IT/training/raw/dictionaries"
......@@ -60,7 +62,7 @@ class Clef18Task1Data(LoggingMixin):
calculees_file = os.path.join(base_folder, "CausesCalculees_HU_1.csv")
brutes_file = os.path.join(base_folder, "CausesBrutes_HU_1.csv")
return self._read_certificates(calculees_file, brutes_file)
return self._read_certificates([calculees_file], [brutes_file])
def read_hu_dictionary(self) -> DataFrame:
base_folder = "data/train/HU/training/raw/dictionaries"
......@@ -73,10 +75,19 @@ class Clef18Task1Data(LoggingMixin):
# FIXME: Load other training files from 2011-2015!
base_folder = "data/train/FR/training/raw/corpus/"
calculees_file = os.path.join(base_folder, "CausesCalculees_FR_2006-2012.csv")
brutes_file = os.path.join(base_folder, "CausesBrutes_FR_2006-2012.csv")
calculees_files = [
os.path.join(base_folder, "CausesCalculees_FR_2006-2012.csv"),
os.path.join(base_folder, "CausesCalculees_FR_2013.csv"),
os.path.join(base_folder, "CausesCalculees_FR_2014.csv")
]
brutes_files = [
os.path.join(base_folder, "CausesBrutes_FR_2006-2012.csv"),
os.path.join(base_folder, "CausesBrutes_FR_2013.csv"),
os.path.join(base_folder, "CausesBrutes_FR_2014.csv")
]
return self._read_certificates(calculees_file, brutes_file)
return self._read_certificates(calculees_files, brutes_files)
def read_fr_dictionary(self) -> DataFrame:
# FIXME: Load other training files from 2011-2015!
......@@ -122,13 +133,24 @@ class Clef18Task1Data(LoggingMixin):
# --------------------------------------------------------------------------------
def _read_certificates(self, calculees_file: str, brutus_file: str) -> DataFrame:
self.logger.info("Reading calculees file from %s", calculees_file)
calculees_data = pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"], skipinitialspace=True)
self.logger.info("Found %s death certificate lines", len(calculees_data))
def _read_certificates(self, calculees_files: List[str], brutus_files: List[str]) -> DataFrame:
calculees_data = []
for calculees_file in calculees_files:
self.logger.info("Reading calculees file from %s", calculees_file)
calculees_data.append(pd.read_csv(calculees_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"], skipinitialspace=True))
self.logger.info("Found %s death certificate entries", len(calculees_data[-1]))
calculees_data = pd.concat(calculees_data)
self.logger.info("Found %s death certificate lines in total", len(calculees_data))
brutus_data = []
for brutus_file in brutus_files:
self.logger.info("Reading brutus file from %s", brutus_file)
brutus_data.append(pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["YearCoded", "DocID", "LineID"]))
self.logger.info("Found %s death certificate entries", len(brutus_data[-1]))
self.logger.info("Reading brutus file from %s", brutus_file)
brutus_data = pd.read_csv(brutus_file, sep=";", encoding="iso-8859-1", index_col=["DocID", "LineID"])
brutus_data = pd.concat(brutus_data)
joined_data = brutus_data.join(calculees_data, lsuffix="_b", rsuffix="_c")
joined_data["ICD10"] = joined_data["ICD10"].astype(str)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment