Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
clef18
Manage
Activity
Members
Labels
Plan
Issues
10
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Jurica Seva
clef18
Commits
87a185ed
Commit
87a185ed
authored
6 years ago
by
Jurica Seva
Browse files
Options
Downloads
Patches
Plain Diff
Updated with results on the train/val dataset.
parent
8e93eea7
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
code_jurica/test.py
+146
-134
146 additions, 134 deletions
code_jurica/test.py
paper/40_experiments.tex
+5
-5
5 additions, 5 deletions
paper/40_experiments.tex
with
151 additions
and
139 deletions
code_jurica/test.py
+
146
−
134
View file @
87a185ed
...
...
@@ -23,138 +23,150 @@ sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K
.
set_session
(
sess
)
#REPRODUCIBLE
with
open
(
'
models/train_test_split_extended.p
'
,
'
rb
'
)
as
handle
:
data_set
=
pickle
.
load
(
handle
)
source_val
=
data_set
[
'
source_val
'
]
target_val
=
data_set
[
'
target_val
'
]
labels_val
=
data_set
[
'
labels_val
'
]
# print(source_val)
# input('source val')
# ICD 10 STUFF
icd10_model
=
keras_load_model
(
'
models/icd10Classification_attention_duplicated.h5
'
,
custom_objects
=
{
'
Attention
'
:
Attention
})
with
open
(
'
models/icd10_tokenizer_duplicated.p
'
,
'
rb
'
)
as
handle
:
icd10Tokenizer
=
pickle
.
load
(
handle
)
with
open
(
'
models/icd10_mappings_duplicated.p
'
,
'
rb
'
)
as
handle
:
icd10Encoder
=
pickle
.
load
(
handle
)
# ICD 10 STUFF
# S2S STUFF
S2S_model
=
keras_load_model
(
'
models/s2s.h5
'
,
custom_objects
=
{
'
Attention
'
:
Attention
})
with
open
(
'
models/s2s_source_tokenizer.p
'
,
'
rb
'
)
as
handle
:
s2s_source_tokenizer
=
pickle
.
load
(
handle
)
source_vocab
=
s2s_source_tokenizer
.
word_index
source_index_to_word_dict
=
{
v
:
k
.
strip
()
for
k
,
v
in
s2s_source_tokenizer
.
word_index
.
items
()}
with
open
(
'
models/s2s_target_tokenizer.p
'
,
'
rb
'
)
as
handle
:
s2s_target_tokenizer
=
pickle
.
load
(
handle
)
target_vocab
=
s2s_target_tokenizer
.
word_index
target_index_to_word_dict
=
{
v
:
k
.
strip
()
for
k
,
v
in
s2s_target_tokenizer
.
word_index
.
items
()}
# S2S STUFF
# INFERENCE MODELS
encoder_input
=
S2S_model
.
get_layer
(
'
input_1
'
).
output
decoder_input
=
S2S_model
.
get_layer
(
'
input_2
'
).
output
x
,
state_h
,
state_c
=
S2S_model
.
get_layer
(
'
lstm_1
'
).
output
encoder_states
=
[
state_h
,
state_c
]
embed_2
=
S2S_model
.
get_layer
(
'
embedding_2
'
).
output
decoder_LSTM
=
S2S_model
.
get_layer
(
'
lstm_2
'
)
decoder_dense
=
S2S_model
.
get_layer
(
'
dense_1
'
)
# Encoder inference model
encoder_model_inf
=
Model
(
encoder_input
,
encoder_states
)
# Decoder inference model
decoder_state_input_h
=
Input
(
shape
=
(
256
,),
name
=
'
inf_input1
'
)
decoder_state_input_c
=
Input
(
shape
=
(
256
,),
name
=
'
inf_input2
'
)
decoder_input_states
=
[
decoder_state_input_h
,
decoder_state_input_c
]
decoder_out
,
decoder_h
,
decoder_c
=
decoder_LSTM
(
embed_2
,
initial_state
=
decoder_input_states
)
decoder_states
=
[
decoder_h
,
decoder_c
]
decoder_out
=
decoder_dense
(
decoder_out
)
decoder_model_inf
=
Model
(
inputs
=
[
decoder_input
]
+
decoder_input_states
,
outputs
=
[
decoder_out
]
+
decoder_states
)
# encoder_model_inf.summary()
# decoder_model_inf.summary()
def
decode_seq
(
inp_seq
):
states_val
=
encoder_model_inf
.
predict
(
inp_seq
)
target_seq
=
np
.
zeros
((
1
,
S2S_model
.
get_layer
(
'
input_2
'
).
output_shape
[
1
]))
target_seq
[
0
,
0
]
=
target_vocab
[
'
sos
'
]
translated_sent
=
[]
translated_index
=
[]
stop_condition
=
False
while
not
stop_condition
:
decoder_out
,
decoder_h
,
decoder_c
=
decoder_model_inf
.
predict
(
x
=
[
target_seq
]
+
states_val
)
max_val_index
=
np
.
argmax
(
decoder_out
[
0
,
-
1
,
:])
try
:
sampled_fra_char
=
target_index_to_word_dict
[
max_val_index
]
except
KeyError
:
sampled_fra_char
=
'
eos
'
translated_sent
.
append
(
sampled_fra_char
)
translated_index
.
append
(
max_val_index
)
if
((
sampled_fra_char
==
'
eos
'
)
or
(
len
(
translated_sent
)
>
S2S_model
.
get_layer
(
'
input_2
'
).
output_shape
[
1
])):
stop_condition
=
True
runs
=
{
'
run1
'
:{
'
icd10
'
:
'
_duplicated
'
,
'
s2s
'
:
''
},
'
run2
'
:{
'
icd10
'
:
'
_duplicated
'
,
'
s2s
'
:
'
_extended
'
}
}
for
k
,
v
in
runs
.
items
():
with
open
(
'
models/train_test_split_extended.p
'
,
'
rb
'
)
as
handle
:
data_set
=
pickle
.
load
(
handle
)
source_val
=
data_set
[
'
source_val
'
]
target_val
=
data_set
[
'
target_val
'
]
labels_val
=
data_set
[
'
labels_val
'
]
print
(
source_val
)
# input('source val')
# ICD 10 STUFF
icd10_model
=
keras_load_model
(
'
models/icd10Classification_attention{}.h5
'
.
format
(
v
[
'
icd10
'
]),
custom_objects
=
{
'
Attention
'
:
Attention
})
with
open
(
'
models/icd10_tokenizer{}.p
'
.
format
(
v
[
'
icd10
'
]),
'
rb
'
)
as
handle
:
icd10Tokenizer
=
pickle
.
load
(
handle
)
with
open
(
'
models/icd10_mappings{}.p
'
.
format
(
v
[
'
icd10
'
]),
'
rb
'
)
as
handle
:
icd10Encoder
=
pickle
.
load
(
handle
)
# ICD 10 STUFF
# S2S STUFF
S2S_model
=
keras_load_model
(
'
models/s2s{}.h5
'
.
format
(
v
[
'
s2s
'
]),
custom_objects
=
{
'
Attention
'
:
Attention
})
with
open
(
'
models/s2s_source_tokenizer{}.p
'
.
format
(
v
[
'
s2s
'
]),
'
rb
'
)
as
handle
:
s2s_source_tokenizer
=
pickle
.
load
(
handle
)
source_vocab
=
s2s_source_tokenizer
.
word_index
source_index_to_word_dict
=
{
v
:
k
.
strip
()
for
k
,
v
in
s2s_source_tokenizer
.
word_index
.
items
()}
with
open
(
'
models/s2s_target_tokenizer{}.p
'
.
format
(
v
[
'
s2s
'
]),
'
rb
'
)
as
handle
:
s2s_target_tokenizer
=
pickle
.
load
(
handle
)
target_vocab
=
s2s_target_tokenizer
.
word_index
target_index_to_word_dict
=
{
v
:
k
.
strip
()
for
k
,
v
in
s2s_target_tokenizer
.
word_index
.
items
()}
# S2S STUFF
# INFERENCE MODELS
encoder_input
=
S2S_model
.
get_layer
(
'
input_1
'
).
output
decoder_input
=
S2S_model
.
get_layer
(
'
input_2
'
).
output
x
,
state_h
,
state_c
=
S2S_model
.
get_layer
(
'
lstm_1
'
).
output
encoder_states
=
[
state_h
,
state_c
]
embed_2
=
S2S_model
.
get_layer
(
'
embedding_2
'
).
output
decoder_LSTM
=
S2S_model
.
get_layer
(
'
lstm_2
'
)
decoder_dense
=
S2S_model
.
get_layer
(
'
dense_1
'
)
# Encoder inference model
encoder_model_inf
=
Model
(
encoder_input
,
encoder_states
)
# Decoder inference model
decoder_state_input_h
=
Input
(
shape
=
(
256
,),
name
=
'
inf_input1
'
)
decoder_state_input_c
=
Input
(
shape
=
(
256
,),
name
=
'
inf_input2
'
)
decoder_input_states
=
[
decoder_state_input_h
,
decoder_state_input_c
]
decoder_out
,
decoder_h
,
decoder_c
=
decoder_LSTM
(
embed_2
,
initial_state
=
decoder_input_states
)
decoder_states
=
[
decoder_h
,
decoder_c
]
decoder_out
=
decoder_dense
(
decoder_out
)
decoder_model_inf
=
Model
(
inputs
=
[
decoder_input
]
+
decoder_input_states
,
outputs
=
[
decoder_out
]
+
decoder_states
)
# encoder_model_inf.summary()
# decoder_model_inf.summary()
def
decode_seq
(
inp_seq
):
states_val
=
encoder_model_inf
.
predict
(
inp_seq
)
target_seq
=
np
.
zeros
((
1
,
S2S_model
.
get_layer
(
'
input_2
'
).
output_shape
[
1
]))
target_seq
[
0
,
0
]
=
max_val_index
states_val
=
[
decoder_h
,
decoder_c
]
return
translated_sent
[:
-
1
],
translated_index
[:
-
1
]
y_true
=
[]
y_pred
=
[]
source_val
=
s2s_source_tokenizer
.
texts_to_sequences
(
source_val
)
source_val
=
pad_sequences
(
source_val
,
maxlen
=
S2S_model
.
get_layer
(
'
input_1
'
).
input_shape
[
1
],
padding
=
'
post
'
)
for
seq_index
in
tqdm
.
tqdm
(
range
(
len
(
source_val
))):
# inp_seq = source_val[seq_index:seq_index + 1]
# inp_seq = s2s_source_tokenizer.texts_to_sequences(inp_seq)
# inp_seq = pad_sequences(inp_seq, maxlen=S2S_model.get_layer('input_1').output_shape[1], padding='post')
# translated_sent, translated_index= decode_seq(inp_seq)
#
# target_seq = target_corpus[seq_index:seq_index + 1]
# target_seq = s2s_target_tokenizer.texts_to_sequences(target_seq)
inp_seq
=
source_val
[
seq_index
:
seq_index
+
1
]
translated_sent
,
translated_index
=
decode_seq
(
inp_seq
)
# PREDICT ICD10
source_word_sequence
=
icd10Tokenizer
.
texts_to_sequences
([
"
"
.
join
(
translated_sent
)])
word_sequence
=
pad_sequences
(
source_word_sequence
,
maxlen
=
icd10_model
.
layers
[
0
].
input_shape
[
1
],
padding
=
'
post
'
)
icd10_code_index
=
icd10_model
.
predict
(
word_sequence
)
# print(icd10_code_index, type(icd10_code_index))
max_val_index
=
np
.
argmax
(
icd10_code_index
,
axis
=
1
)[
0
]
# print(max_val_index)
icd10_label
=
icd10Encoder
.
inverse_transform
(
max_val_index
)
# print('-')
# target_index = target_seq[0]
# print('Target indexes:', target_index)
# print('Decoded indexes:', translated_index)
#
# print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
# print('Decoded sentence:', " ".join(translated_sent))
#
# print('Target ICD-10:', labels[seq_index])
# print('Predict ICD-10:', icd10_label)
y_true
.
append
(
labels_val
[
seq_index
])
y_pred
.
append
(
icd10_label
)
report
=
classification_report
(
y_true
,
y_pred
)
report_df
=
report_to_df
(
report
)
report_df
.
to_csv
(
'
logs/classification_report_test_combined.csv
'
)
print
(
report_df
)
\ No newline at end of file
target_seq
[
0
,
0
]
=
target_vocab
[
'
sos
'
]
translated_sent
=
[]
translated_index
=
[]
stop_condition
=
False
while
not
stop_condition
:
decoder_out
,
decoder_h
,
decoder_c
=
decoder_model_inf
.
predict
(
x
=
[
target_seq
]
+
states_val
)
max_val_index
=
np
.
argmax
(
decoder_out
[
0
,
-
1
,
:])
try
:
sampled_fra_char
=
target_index_to_word_dict
[
max_val_index
]
except
KeyError
:
sampled_fra_char
=
'
eos
'
translated_sent
.
append
(
sampled_fra_char
)
translated_index
.
append
(
max_val_index
)
if
((
sampled_fra_char
==
'
eos
'
)
or
(
len
(
translated_sent
)
>
S2S_model
.
get_layer
(
'
input_2
'
).
output_shape
[
1
])):
stop_condition
=
True
target_seq
=
np
.
zeros
((
1
,
S2S_model
.
get_layer
(
'
input_2
'
).
output_shape
[
1
]))
target_seq
[
0
,
0
]
=
max_val_index
states_val
=
[
decoder_h
,
decoder_c
]
return
translated_sent
[:
-
1
],
translated_index
[:
-
1
]
y_true
=
[]
y_pred
=
[]
source_val
=
s2s_source_tokenizer
.
texts_to_sequences
(
source_val
)
source_val
=
pad_sequences
(
source_val
,
maxlen
=
S2S_model
.
get_layer
(
'
input_1
'
).
input_shape
[
1
],
padding
=
'
post
'
)
for
seq_index
in
tqdm
.
tqdm
(
range
(
len
(
source_val
))):
# inp_seq = source_val[seq_index:seq_index + 1]
# inp_seq = s2s_source_tokenizer.texts_to_sequences(inp_seq)
# inp_seq = pad_sequences(inp_seq, maxlen=S2S_model.get_layer('input_1').output_shape[1], padding='post')
# translated_sent, translated_index= decode_seq(inp_seq)
#
# target_seq = target_corpus[seq_index:seq_index + 1]
# target_seq = s2s_target_tokenizer.texts_to_sequences(target_seq)
inp_seq
=
source_val
[
seq_index
:
seq_index
+
1
]
translated_sent
,
translated_index
=
decode_seq
(
inp_seq
)
# PREDICT ICD10
source_word_sequence
=
icd10Tokenizer
.
texts_to_sequences
([
"
"
.
join
(
translated_sent
)])
word_sequence
=
pad_sequences
(
source_word_sequence
,
maxlen
=
icd10_model
.
layers
[
0
].
input_shape
[
1
],
padding
=
'
post
'
)
icd10_code_index
=
icd10_model
.
predict
(
word_sequence
)
# print(icd10_code_index, type(icd10_code_index))
max_val_index
=
np
.
argmax
(
icd10_code_index
,
axis
=
1
)[
0
]
# print(max_val_index)
icd10_label
=
icd10Encoder
.
inverse_transform
(
max_val_index
)
# print('-')
# target_index = target_seq[0]
# print('Target indexes:', target_index)
# print('Decoded indexes:', translated_index)
#
# print('Target sentence:', " ".join([target_index_to_word_dict[x] for x in target_index]))
# print('Decoded sentence:', " ".join(translated_sent))
#
# print('Target ICD-10:', labels[seq_index])
# print('Predict ICD-10:', icd10_label)
y_true
.
append
(
labels_val
[
seq_index
])
y_pred
.
append
(
icd10_label
)
report
=
classification_report
(
y_true
,
y_pred
)
report_df
=
report_to_df
(
report
)
report_df
.
to_csv
(
'
logs/classification_report_test{}.csv
'
.
format
(
v
[
'
s2s
'
]))
\ No newline at end of file
This diff is collapsed.
Click to expand it.
paper/40_experiments.tex
+
5
−
5
View file @
87a185ed
...
...
@@ -61,10 +61,10 @@ The results obtained from the two approaches are shown in Table \ref{tab:icd10Cl
\begin{table}
[]
\centering
\begin{tabular}
{
l|l|l|l|l|l|l
}
Mode
&
Model
&
Trained for epochs
&
Train Accuracy
&
Train Loss
&
Validation Accuracy
&
Validation Loss
\\
Tokenization
&
Model
&
Trained for epochs
&
Train Accuracy
&
Train Loss
&
Validation Accuracy
&
Validation Loss
\\
Word
&
Minimal
&
69
&
0.925
&
0.190
&
0.937
&
0.169
\\
Word
&
Extended
&
41
&
0.950
&
0.156
&
0.954
&
0.141
\\
Character
&
Minimal
&
&
&
&
&
\\
Character
&
Minimal
&
91
&
0.732
&
1.186
&
0.516
&
2.505
\\
\end{tabular}
\caption
{
Named Entity Normalization: ICD-10 Classification
}
\label
{
tab:icd10Classification
}
...
...
@@ -79,9 +79,9 @@ The results obtained during training are presented in Table \ref{tab:final_train
\begin{table}
[]
\centering
\begin{tabular}
{
l|l|l|l|l|l
}
Model
&
Trained for epochs
&
Train Accuracy
&
Train Loss
&
Validation Accuracy
&
Validation Loss
\\
S2S balanced + ICD-10 extended
&
&
&
&
&
\\
S2S extended + ICD-10 extended
&
&
&
&
&
\\
Model
&
Precision
&
Recall
&
F-1
\\
S2S balanced + ICD-10 extended
&
0.73
&
0.61
&
0.61
\\
S2S extended + ICD-10 extended
&
0.74
&
0.62
&
0.63
\\
\end{tabular}
\caption
{
Final Pipeline Evaluation
}
\label
{
tab:final
_
train
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment