From b536c5a8337e78c13dc0a251e01bafe29e8dca0d Mon Sep 17 00:00:00 2001
From: Jurica Seva <seva@informatik.hu-berlin.de>
Date: Sat, 5 May 2018 10:22:58 +0200
Subject: [PATCH] Final model with balanced s2s training data and ICD10 without
 additional data points from CC/CB combination.

---
 code_jurica/classificationICD10_attention.py | 24 +++++++++++++++++---
 code_jurica/loader.py                        |  9 +++++++-
 code_jurica/seq2seq.py                       | 19 +++++++++++++++-
 code_jurica/test.py                          | 15 ++++++++++++
 code_jurica/train.sh                         |  2 +-
 code_jurica/util.py                          |  8 +++++++
 6 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/code_jurica/classificationICD10_attention.py b/code_jurica/classificationICD10_attention.py
index 14cf975..a82fac8 100644
--- a/code_jurica/classificationICD10_attention.py
+++ b/code_jurica/classificationICD10_attention.py
@@ -5,11 +5,13 @@
 from util import *
 import numpy as np
 import random
+import tensorflow as tf
 import traceback
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import train_test_split
 from collections import Counter
 
+from keras import backend as K
 from keras.preprocessing.sequence import pad_sequences
 from keras.preprocessing.text import Tokenizer
 from keras.optimizers import Adam
@@ -18,22 +20,38 @@ from keras.layers import Embedding, Input, LSTM, Dense, Bidirectional
 from keras.models import Model
 from keras.utils import multi_gpu_model, np_utils
 
-import tensorflow as tf
-
 from _layers import AttentionWithContext, Attention
 
+
+# REPRODUCIBLE: fix the NumPy and stdlib RNG seeds, then build a single-threaded TF config and set the TF graph-level seed.
+np.random.seed(42)
+import random
+random.seed(12345)
+import os
+os.environ['PYTHONHASHSEED'] = '0'  # NOTE(review): Python reads PYTHONHASHSEED at interpreter startup, so this likely only reaches child processes - confirm the launcher exports it too
+
+import tensorflow as tf
+config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)  # limit both TF thread pools to one thread
+from keras import backend as K
+tf.set_random_seed(1234)  # graph-level seed for TensorFlow ops
+# END REPRODUCIBLE
+
 ###################################
 # TensorFlow wizardry
-config=tf.ConfigProto()
+# (was: config=tf.ConfigProto()) -- `config` is now the single-threaded ConfigProto built in the REPRODUCIBLE section above
 
 # Don't pre-allocate memory; allocate as-needed
 config.gpu_options.allow_growth=True
 config.gpu_options.allocator_type='BFC'
 
+sess = tf.Session(graph=tf.get_default_graph(), config=config)  # session on the default graph using the config above
+K.set_session(sess)  # hand this session to the Keras backend
+
 callbacks_list=[
     EarlyStopping(
         monitor='val_loss',
         patience=2,
+        min_delta=0.005  # smaller val_loss changes are not counted as an improvement
     ),
     ModelCheckpoint(
         filepath='models/icd10Classification_attention.h5',
diff --git a/code_jurica/loader.py b/code_jurica/loader.py
index a1d2bf0..eea1eaa 100644
--- a/code_jurica/loader.py
+++ b/code_jurica/loader.py
@@ -1,5 +1,4 @@
 from util import *
-import random
 import numpy as np
 from keras.preprocessing.sequence import pad_sequences
 from keras.preprocessing.text import Tokenizer
@@ -8,6 +7,14 @@ import random
 from sklearn.model_selection import train_test_split
 import pickle
 
+# REPRODUCIBLE: fix the NumPy and stdlib RNG seeds when this module is imported.
+np.random.seed(42)
+import random
+random.seed(12345)
+import os
+os.environ['PYTHONHASHSEED'] = '0'  # NOTE(review): Python reads PYTHONHASHSEED at interpreter startup, so this likely only reaches child processes - confirm the launcher exports it too
+# END REPRODUCIBLE
+
 kerasTokenizer = Tokenizer()
 tokenizer = TokenizePreprocessor()
 prepareData = prepareData()
diff --git a/code_jurica/seq2seq.py b/code_jurica/seq2seq.py
index 4f0e2cc..fd22ce9 100644
--- a/code_jurica/seq2seq.py
+++ b/code_jurica/seq2seq.py
@@ -13,14 +13,31 @@ import tqdm
 import pickle
 from sklearn.metrics import classification_report
 
+# REPRODUCIBLE: fix the NumPy and stdlib RNG seeds, then build a single-threaded TF config and set the TF graph-level seed.
+np.random.seed(42)
+import random
+random.seed(12345)
+import os
+os.environ['PYTHONHASHSEED'] = '0'  # NOTE(review): Python reads PYTHONHASHSEED at interpreter startup, so this likely only reaches child processes - confirm the launcher exports it too
+
+import tensorflow as tf
+config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)  # limit both TF thread pools to one thread
+from keras import backend as K
+tf.set_random_seed(1234)  # graph-level seed for TensorFlow ops
+# END REPRODUCIBLE
+
+
 ###################################
 # TensorFlow wizardry
-config = tf.ConfigProto()
+# (was: config = tf.ConfigProto()) -- `config` is now the single-threaded ConfigProto built in the REPRODUCIBLE section above
 
 # Don't pre-allocate memory; allocate as-needed
 config.gpu_options.allow_growth = True
 config.gpu_options.allocator_type = 'BFC'
 
+sess = tf.Session(graph=tf.get_default_graph(), config=config)  # session on the default graph using the config above
+K.set_session(sess)  # hand this session to the Keras backend
+
 # LOAD ICD 10 CLASSIFICATION MODEL
 try:
     icd10_model = keras_load_model('models/icd10Classification_attention.h5',
diff --git a/code_jurica/test.py b/code_jurica/test.py
index 4b2b253..e105d34 100644
--- a/code_jurica/test.py
+++ b/code_jurica/test.py
@@ -8,6 +8,21 @@ from sklearn.metrics import classification_report
 from util import report_to_df
 import numpy as np
 
+# REPRODUCIBLE: fix the NumPy and stdlib RNG seeds, then run Keras on a single-threaded TF session with a fixed graph-level seed.
+np.random.seed(42)
+import random
+random.seed(12345)
+import os
+os.environ['PYTHONHASHSEED'] = '0'  # NOTE(review): Python reads PYTHONHASHSEED at interpreter startup, so this likely only reaches child processes - confirm the launcher exports it too
+
+import tensorflow as tf
+session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)  # limit both TF thread pools to one thread
+from keras import backend as K
+tf.set_random_seed(1234)  # graph-level seed for TensorFlow ops
+sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)  # NOTE(review): unlike the two training scripts, gpu_options.allow_growth is not set on this config
+K.set_session(sess)  # hand this session to the Keras backend
+# END REPRODUCIBLE
+
 with open('models/train_test_split.p', 'rb') as handle:
     data_set = pickle.load(handle)
 
diff --git a/code_jurica/train.sh b/code_jurica/train.sh
index 0165144..cae9235 100644
--- a/code_jurica/train.sh
+++ b/code_jurica/train.sh
@@ -1,4 +1,8 @@
 #!/bin/bash
-#CUDA_VISIBLE_DEVICES=3 /home/sevajuri/anaconda3/bin/python3 /home/sevajuri/projects/clef18/code_jurica/classificationICD10_attention.py
+# Python reads PYTHONHASHSEED once, at interpreter startup; assigning it via
+# os.environ inside the scripts is too late for the process that does so.
+# Export it here so every stage below starts with hash randomisation disabled.
+export PYTHONHASHSEED=0
+CUDA_VISIBLE_DEVICES=3 /home/sevajuri/anaconda3/bin/python3 /home/sevajuri/projects/clef18/code_jurica/classificationICD10_attention.py
 CUDA_VISIBLE_DEVICES=3 /home/sevajuri/anaconda3/bin/python3 /home/sevajuri/projects/clef18/code_jurica/seq2seq.py
 CUDA_VISIBLE_DEVICES=3 /home/sevajuri/anaconda3/bin/python3 /home/sevajuri/projects/clef18/code_jurica/test.py
diff --git a/code_jurica/util.py b/code_jurica/util.py
index f418a80..b96b5d4 100644
--- a/code_jurica/util.py
+++ b/code_jurica/util.py
@@ -16,6 +16,14 @@ import math
 import datetime
 from io import StringIO
 
+# REPRODUCIBLE: fix the NumPy and stdlib RNG seeds when this module is imported.
+np.random.seed(42)
+import random
+random.seed(12345)
+import os
+os.environ['PYTHONHASHSEED'] = '0'  # NOTE(review): Python reads PYTHONHASHSEED at interpreter startup, so this likely only reaches child processes - confirm the launcher exports it too
+# END REPRODUCIBLE
+
 now = datetime.datetime.now()
 date_label=now.strftime("%Y_%m_%d")
 
-- 
GitLab