diff --git a/code_jurica/_config.py b/code_jurica/_config.py
index d998713e17b9c79c93f4a1cb26fff09cf00647fc..8ea5d0c745372a3c97c92233e2ba4f8339153718 100644
--- a/code_jurica/_config.py
+++ b/code_jurica/_config.py
@@ -11,23 +11,52 @@ DATA_IT=IT_HOME+'corpus/'
 
 TRAINING = {
     'FR': {
-        'CB': DATA_FR + 'CausesBrutes_FR_2014.csv',
-        'CC': DATA_FR + 'CausesCalculees_FR_2014.csv',
-        'Ident': DATA_FR + 'Ident_FR_2014.csv'
+        'CB': [
+            #DATA_FR + 'CausesBrutes_FR_2006-2012.csv',
+            #DATA_FR + 'CausesBrutes_FR_2013.csv',
+            DATA_FR + 'CausesBrutes_FR_2014.csv'
+        ],
+        'CC': [
+            #DATA_FR + 'CausesCalculees_FR_2006-2012.csv',
+            #DATA_FR + 'CausesCalculees_FR_2013.csv',
+            DATA_FR + 'CausesCalculees_FR_2014.csv'
+        ],
+        'Ident': [
+            DATA_FR + 'Ident_FR_2014.csv'
+        ],
+        "Encoding" : "latin1",
+        "SplitMultiCode" : False
     },
     'HU': {
-        'CB': DATA_HU + 'CausesBrutes_HU_1.csv',
-        'CC': DATA_HU + 'CausesCalculees_HU_1.csv',
-        'Ident': DATA_HU + 'Ident_HU_1.csv'
+        'CB': [
+            DATA_HU + 'CausesBrutes_HU_1.csv'
+        ],
+        'CC': [
+            DATA_HU + 'CausesCalculees_HU_1.csv'
+        ],
+        'Ident': [
+            DATA_HU + 'Ident_HU_1.csv'
+        ],
+        "Encoding" : "latin1",
+        "SplitMultiCode" : False
     },
     'IT': {
-        'CB': DATA_IT + 'CausesBrutes_IT_1.csv',
-        'CC': DATA_IT + 'CausesCalculees_IT_1.csv',
-        'Ident': DATA_IT + 'Ident_IT_1.csv'
+        'CB': [
+            DATA_IT + 'CausesBrutes_IT_1.csv'
+        ],
+        'CC': [
+            DATA_IT + 'CausesCalculees_IT_1.csv'
+        ],
+        'Ident': [
+            DATA_IT + 'Ident_IT_1.csv'
+        ],
+        "Encoding" : "latin1",
+        "SplitMultiCode" : False
     }
 }
 
 DICT_FR=FR_HOME+'dictionaries/Dictionnaire2015.csv'
 DICT_HU=HU_HOME+'dictionaries/Hungarian_dictionary_UTF8.csv'
 DICT_IT=IT_HOME+'dictionaries/dictionary_IT.csv'
-DICT_PREPROCESED=PREPARED_DATA_FOLDER+'dictionaries.p'
\ No newline at end of file
+
+DICT_PREPROCESED=PREPARED_DATA_FOLDER+'dictionaries.p'
diff --git a/code_jurica/loader.py b/code_jurica/loader.py
index 3d06d5572c5d35313e325068a6cf5880a7e3f42e..363d0677b299d65846df6f4303fbc95eda8f12f9 100644
--- a/code_jurica/loader.py
+++ b/code_jurica/loader.py
@@ -8,6 +8,9 @@ from sklearn.model_selection import train_test_split
 import pickle
 
 #REPRODUCIBLE
+from util import TokenizePreprocessor
+
+
 np.random.seed(42)
 import random
 random.seed(12345)
@@ -21,9 +24,9 @@ tokenizer = TokenizePreprocessor()
 prepareData = prepareData()
 SEED = 777
 
-frCorpora, frErrors = prepareData.prepareData('FR')
 itCorpora, itErrors = prepareData.prepareData('IT')
 huCorpora, huErrors = prepareData.prepareData('HU')
+frCorpora, frErrors = prepareData.prepareData('FR')
 
 # print(len(frErrors), len(itErrors), len(huErrors))
 try:
@@ -141,4 +144,4 @@ target_train, target_val, labels_train, labels_val = train_test_split(target_cor
 #         len(labels_train),
 #         len(labels_val)
 #     )
-# )
\ No newline at end of file
+# )
diff --git a/code_jurica/util.py b/code_jurica/util.py
index 31fc37bf17e362840ba991c609a164812f4b3f1f..6b9004bb5e3beda24d1e8e99062d4478e8d82ec2 100644
--- a/code_jurica/util.py
+++ b/code_jurica/util.py
@@ -226,26 +226,51 @@ class prepareData():
             errors = pickle.load(open('data/preprocesed/Errors_{}.p'.format(dataset),'rb'))
 
         except FileNotFoundError:
-            cb = pd.read_csv(TRAINING[dataset]['CB'], sep=';', encoding = "latin1")
-            cc = pd.read_csv(TRAINING[dataset]['CC'], sep=';', encoding = "latin1")
+            file_encoding = TRAINING[dataset]['Encoding']
+            split_multi_code = TRAINING[dataset]['SplitMultiCode']
+
+            cc_files = TRAINING[dataset]['CC']
+            cc_data = [pd.read_csv(cc_file, sep=';', encoding=file_encoding, skipinitialspace=True) for cc_file in cc_files]
+            cc = pd.concat(cc_data)
+
+            cb_files = TRAINING[dataset]['CB']
+            cb_data = [pd.read_csv(cb_file, sep=';', encoding=file_encoding, skipinitialspace=True) for cb_file in cb_files]
+            cb = pd.concat(cb_data)
+
 
             data = []
             errors= []
 
             for index, row in tqdm(cb.iterrows(), ascii=True, desc='Preparing {}'.format(dataset)):
                 try:
-                    text = cc[(cc.DocID == row.DocID) & (cc.LineID == row.LineID)]
+                    text = cc[(cc.DocID == row.DocID) & (cc.LineID == row.LineID) & (cc.YearCoded == row.YearCoded)]
+                    num_icd10_codes = len(text)
                     # print(text.StandardText.values[0], text.ICD10.values[0])
-                    data.append([
-                        row.RawText.lower(),
-                        text.StandardText.values[0].lower(),
-                        text.ICD10.values[0]
-                    ])
+
+                    appended_data = False
+                    if split_multi_code and num_icd10_codes > 1:
+                        parts = row.RawText.lower().split(",")
+                        if len(parts) == num_icd10_codes:
+                            for i in range(num_icd10_codes):
+                                data.append([parts[i].strip(), text.StandardText.values[i].lower(), text.ICD10.values[i]])
+
+                            appended_data = True
+
+                    if not appended_data:
+                        data.append([
+                            row.RawText.lower(),
+                            text.StandardText.values[0].lower(),
+                            text.ICD10.values[0]
+                        ])
+
                 except Exception as e:
                     # print(e)
                     # print(row.DocID, row.LineID)
                     errors.append([dataset, row.DocID, row.LineID])
 
 
+            output_folder = "data/preprocesed/"
+            os.makedirs(output_folder, exist_ok=True)
+
             pickle.dump(data, open('data/preprocesed/Train_{}.p'.format(dataset),'wb'))
             pickle.dump(errors, open('data/preprocesed/Errors_{}.p'.format(dataset),'wb'))
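
For illustration (not part of the patch): a minimal, self-contained sketch of the SplitMultiCode heuristic that the util.py hunk introduces. When CausesCalculees holds several ICD-10 rows for one DocID/LineID/YearCoded key, the raw text is split on commas and each fragment is paired with its own code; if the fragment count does not match the code count, the whole line falls back to the first code, as before. The DataFrame and all values below are invented toy data.

import pandas as pd

# Toy stand-in for the concatenated CausesCalculees frame; real rows come
# from the CSV files listed in TRAINING[dataset]['CC'].
cc = pd.DataFrame({
    'DocID':        [7, 7],
    'LineID':       [1, 1],
    'YearCoded':    [2014, 2014],
    'StandardText': ['cardiac arrest', 'renal failure'],
    'ICD10':        ['I469', 'N19'],
})
raw_text = 'Cardiac arrest, renal failure'   # one CausesBrutes line, two codes

# all calculated-cause rows matching this raw line
text = cc[(cc.DocID == 7) & (cc.LineID == 1) & (cc.YearCoded == 2014)]
num_icd10_codes = len(text)

data = []
parts = raw_text.lower().split(',')
if num_icd10_codes > 1 and len(parts) == num_icd10_codes:
    # fragment count matches the code count: one training triple per code
    for i in range(num_icd10_codes):
        data.append([parts[i].strip(), text.StandardText.values[i], text.ICD10.values[i]])
else:
    # counts disagree: keep the whole line, paired with the first code only
    data.append([raw_text.lower(), text.StandardText.values[0], text.ICD10.values[0]])

print(data)
# [['cardiac arrest', 'cardiac arrest', 'I469'],
#  ['renal failure', 'renal failure', 'N19']]

Lines whose comma structure does not line up with the number of codes keep the old single-code behaviour, so with SplitMultiCode enabled the heuristic can only add training pairs; it never drops a line.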