Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Beatrice Tan
CNAprioritization
Commits
6869823f
Commit
6869823f
authored
Feb 07, 2018
by
Beatrice Tan
Browse files
Added script to plot precision and recall with different sample sizes.
parent
d320be6a
Changes
1
Hide whitespace changes
Inline
Side-by-side
scripts/AUC.py
0 → 100644
View file @
6869823f
import
matplotlib.pyplot
as
plt
import
seaborn
as
sns
# Select seaborn's 'whitegrid' style once, when this module is imported.
sns.set_style('whitegrid')
def ROC_curve(gistic_subset, gistic_truth, rubic_subset, rubic_truth,
              plot_file, list_sizes):
    """Collect recall/precision for both tools and draw two scatter plots.

    For each (subset files, truth files) pair -- the gistic pair first, the
    rubic pair second -- the per-size recall and precision lists are gathered
    with ``get_list_rates`` and reduced with ``calculate_averages``.

    Two figures are then produced through ``plot_ROC``:

    * the individual values, saved to ``plot_file``;
    * the per-size averages, saved to ``plot_file + "_avg.png"``.

    Args:
        gistic_subset: subset result files for the first tool.
        gistic_truth: truth files paired by position with ``gistic_subset``.
        rubic_subset: subset result files for the second tool.
        rubic_truth: truth files paired by position with ``rubic_subset``.
        plot_file: output path of the first figure and prefix of the second.
        list_sizes: sample sizes used to group the files.
    """
    per_tool_recall = []
    per_tool_precision = []
    per_tool_avg_recall = []
    per_tool_avg_precision = []

    tool_inputs = (
        (gistic_subset, gistic_truth),
        (rubic_subset, rubic_truth),
    )
    for subset_files, truth_files in tool_inputs:
        recall_by_size, precision_by_size = get_list_rates(
            subset_files, truth_files, list_sizes)
        per_tool_recall.append(recall_by_size)
        per_tool_precision.append(precision_by_size)

        mean_recall, mean_precision = calculate_averages(
            recall_by_size, precision_by_size)
        per_tool_avg_recall.append(mean_recall)
        per_tool_avg_precision.append(mean_precision)

    # Both figures need the data of every tool, so plotting happens only
    # after the loop has finished.
    plot_ROC(per_tool_recall, per_tool_precision, plot_file, list_sizes,
             False)
    plot_ROC(per_tool_avg_recall, per_tool_avg_precision,
             plot_file + "_avg.png", list_sizes, True)
def get_list_rates(subset_files, truth_files, list_sizes):
    """Group the recall and precision of every file pair by sample size.

    The size of a file is the text found between the first ``"Size"`` and
    the following ``".Rep"`` in its name; it is compared, as a string, with
    each entry of ``list_sizes``.

    Args:
        subset_files: paths of the subset result files.
        truth_files: paths of the truth files, paired by position with
            ``subset_files``.
        list_sizes: the sample sizes to group by.

    Returns:
        ``(list_recall, list_precision)``: two lists with one inner list per
        entry of ``list_sizes``, holding the values of the files whose size
        matches that entry. A size without matching files keeps an empty
        inner list.
    """
    bucket_count = len(list_sizes)
    list_recall = [[] for _ in range(bucket_count)]
    list_precision = [[] for _ in range(bucket_count)]

    for file_index, subset_path in enumerate(subset_files):
        after_size = subset_path.split("Size")[1]
        size_label = after_size.split(".Rep")[0]
        recall, precision = calculate_precision_recall(
            subset_path, truth_files[file_index])

        for bucket, wanted_size in enumerate(list_sizes):
            if size_label != str(wanted_size):
                continue
            list_recall[bucket].append(recall)
            list_precision[bucket].append(precision)

    return list_recall, list_precision
def _count_bed_rows(bed_path):
    """Count the data rows of a tab-separated file by their fifth column.

    Lines starting with ``"track"`` (headers) and blank lines are skipped.

    Returns:
        ``(matched, unmatched)``: the number of rows whose fifth column is
        anything other than ``'.'``, and the number whose fifth column is
        exactly ``'.'``.

    Raises:
        IndexError: if a data row has fewer than five columns.
    """
    matched = 0
    unmatched = 0
    with open(bed_path, 'r') as handle:
        for region in handle:
            if region.startswith("track") or not region.strip():
                continue
            # Drop the line ending first, otherwise a '.' in the last column
            # would be read as ".\n" and counted as a match.
            fields = region.rstrip("\r\n").split("\t")
            if fields[4] == '.':
                unmatched += 1
            else:
                matched += 1
    return matched, unmatched


def calculate_precision_recall(bed_subset, bed_truth):
    """Compute recall and precision from a subset file and a truth file.

    Rows are classified by their fifth tab-separated column (presumably
    ``'.'`` marks a region without a counterpart in the other set -- confirm
    against the step that produces these files):

    * subset rows with ``'.'`` are false positives, all other subset rows
      are true positives;
    * truth rows with ``'.'`` are false negatives (other truth rows are not
      counted).

    Args:
        bed_subset: path of the subset file.
        bed_truth: path of the truth file.

    Returns:
        ``(recall, precision)`` as floats, where recall is TP / (TP + FN)
        and precision is TP / (TP + FP). A rate whose denominator is zero
        (no countable rows) is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    TP, FP = _count_bed_rows(bed_subset)
    _, FN = _count_bed_rows(bed_truth)

    recall = TP / float(TP + FN) if TP + FN else 0.0
    precision = TP / float(TP + FP) if TP + FP else 0.0
    return recall, precision
def _mean_or_nan(values):
    """Return the arithmetic mean of ``values``, or NaN when it is empty."""
    count = len(values)
    if count == 0:
        return float("nan")
    return sum(values) / float(count)


def calculate_averages(list_recall, list_precision):
    """Average the recall and precision values of every sample size.

    Args:
        list_recall: one list of recall values per sample size.
        list_precision: one list of precision values per sample size.

    Returns:
        ``(avg_recall, avg_precision)``: two lists holding one mean per
        inner list, in the same order as the input. An empty inner list
        yields ``float("nan")`` rather than raising ``ZeroDivisionError``,
        so the result keeps one entry per size and positions still line up
        with the sizes.
    """
    avg_recall = [_mean_or_nan(values) for values in list_recall]
    avg_precision = [_mean_or_nan(values) for values in list_precision]
    return avg_recall, avg_precision
def plot_ROC(list_recall, list_precision, plot_file, list_sizes, avg):
    """Scatter precision against recall for every tool and save the figure.

    Despite the name, the x axis is recall and the y axis is precision.

    Args:
        list_recall: one entry per tool; each entry is indexed by sample
            size and holds the recall value(s) for that size.
        list_precision: same layout as ``list_recall``, for precision.
        plot_file: path handed to ``plt.savefig``.
        list_sizes: the sample sizes; its length sets the number of colours
            in each palette.
        avg: when true, the y axis spans 0.7-1 and points are drawn with
            size 50; otherwise it spans 0.5-1.02 with size 20.
    """
    plt.figure()
    plt.xlim([0, 1])
    if avg:
        plt.ylim([0.7, 1])
        point_size = 50
    else:
        plt.ylim([0.5, 1.02])
        point_size = 20

    size_count = len(list_sizes)
    for tool_index, recall in enumerate(list_recall):
        precision = list_precision[tool_index]
        # The palette depends only on the tool, so it is built once per tool
        # instead of once per plotted point. The first tool gets its own
        # colour ramp, every later tool the default one.
        if tool_index == 0:
            palette = sns.cubehelix_palette(
                size_count, start=.5, rot=-.75, dark=.2)
        else:
            palette = sns.cubehelix_palette(size_count)

        # One colour per sample size: position j in the data uses colour j.
        for j in range(len(recall)):
            plt.scatter(recall[j], precision[j], color=palette[j],
                        s=point_size)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.savefig(plot_file)
    plt.close()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment