Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Beatrice Tan
CNAprioritization
Commits
71025db0
Commit
71025db0
authored
Feb 04, 2018
by
Beatrice Tan
Browse files
Updated plots for samples sizes to plot both tools.
parent
25d1700c
Changes
4
Hide whitespace changes
Inline
Side-by-side
config.yaml
View file @
71025db0
#Directories to be specified
workdir
:
/home/bftan/CNA_results
#directory to write output
gisticdir
:
/home/bftan/Tools/GISTIC2
#directory to install GISTIC2
#
workdir: /home/bftan/CNA_results #directory to write output
#
gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
#workdir: /home/bftan/CNA_results #directory to write output
#gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
#
workdir: /home/beatrice/CNA_analysis
#
gisticdir: /home/beatrice/CNA_analysis/run_gistic2
workdir
:
/home/beatrice/CNA_analysis
gisticdir
:
/home/beatrice/CNA_analysis/run_gistic2
#Input details to download from firehose
cancer_type
:
SKCM
...
...
@@ -34,6 +34,6 @@ settings_gistic: "-ta 0.1
-conf
0.99"
#Settings for sample size differences
sizes
:
[
20
,
30
,
40
,
50
,
60
,
70
,
80
,
90
]
sizes
:
[
20
,
30
,
40
,
60
,
70
,
80
,
90
]
#sizes: [30, 40]
repeats
:
[
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
]
rules/SampleSizes.smk
View file @
71025db0
...
...
@@ -51,7 +51,7 @@ rule run_rubic_subsets:
rule report_sizes:
"""Report the difference when using different sample sizes."""
input:
#
gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
output:
...
...
@@ -61,6 +61,7 @@ rule report_sizes:
census=os.path.join(workflow.basedir, config["census_genes"]),
known=os.path.join(workflow.basedir, config["prev_found_genes"]),
reps=config["repeats"],
sizes=config["sizes"],
ref=config["reference"]
run:
ReportSizes.make_report(input.rubic_gains, input.rubic_losses, params.census, params.known, params.reps, params.ref, output.report, output.plots)
ReportSizes.make_report(
input.gistic,
input.rubic_gains, input.rubic_losses, params.census, params.known, params.reps,
params.sizes,
params.ref, output.report, output.plots)
scripts/ReportSizes.py
View file @
71025db0
...
...
@@ -10,88 +10,80 @@ import os.path
from
ParseResults
import
parse_regions
,
get_stats
from
collections
import
OrderedDict
def
make_report
(
size_rubic_gains
,
size_rubic_losses
,
census_genes
,
known_genes
,
reps
,
ref_genome
,
report_file
,
plot_dir
):
def
make_report
(
size_gistic
,
size_rubic_gains
,
size_rubic_losses
,
census_genes
,
known_genes
,
reps
,
sizes
,
ref_genome
,
report_file
,
plot_dir
):
"""Make a report of the results produced using input files with different sample sizes."""
list_stats
=
[]
with
open
(
report_file
,
'w'
)
as
out
:
dict_stats
=
{}
#row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
# "Nr. genes", "Nr. regions with known genes", "Nr. regions with census genes"])
tool
=
'RUBIC'
for
i
in
range
(
len
(
size_rubic_gains
)):
size_file
=
size_rubic_gains
[
i
],
size_rubic_losses
[
i
]
parsed_results
=
parse_regions
(
size_file
,
known_genes
,
census_genes
,
tool
)
if
tool
==
'RUBIC'
:
size_rep
=
size_file
[
0
].
split
(
"Size"
)[
1
].
split
(
"/gains"
)[
0
]
size
,
repetition
=
size_rep
.
split
(
".Rep"
)
else
:
size_rep
=
size_file
.
split
(
"Size"
)[
1
].
split
(
"/all_lesions"
)[
0
]
size
,
repetition
=
size_rep
.
split
(
".Rep"
)
stats_results
=
get_stats
(
parsed_results
,
size
)
if
size
not
in
dict_stats
.
keys
():
dict_stats
[
size
]
=
[
stats_results
]
else
:
dict_stats
[
size
]
=
dict_stats
[
size
]
+
[
stats_results
]
out
.
write
(
"
\t
"
.
join
(
stats_results
[
0
])
+
"
\n
"
+
"
\t
"
.
join
(
stats_results
[
1
]))
extract_results_sizes
(
dict_stats
,
reps
,
plot_dir
)
for
i
in
range
(
len
(
size_rubic_gains
)):
#loop through sizes
size_rep
=
size_rubic_gains
[
i
].
split
(
"Size"
)[
1
].
split
(
"/gains"
)[
0
]
size
,
repetition
=
size_rep
.
split
(
".Rep"
)
for
tool
in
'GISTIC'
,
'RUBIC'
:
size_file
=
(
size_rubic_gains
[
i
],
size_rubic_losses
[
i
])
if
tool
==
'RUBIC'
else
size_gistic
[
i
]
parsed_results
=
parse_regions
(
size_file
,
known_genes
,
census_genes
,
tool
)
stats_results
=
get_stats
(
parsed_results
,
size
)
for
stat_list
in
stats_results
[
0
],
stats_results
[
1
]:
converted_stats
=
[
tool
]
+
stat_list
[
0
:
2
]
for
stat
in
stat_list
[
2
:
5
]:
converted_stats
.
append
(
float
(
stat
))
for
stat
in
stat_list
[
5
:]:
converted_stats
.
append
(
float
(
stat
.
split
(
" ("
)[
0
]))
list_stats
.
append
(
converted_stats
)
make_plots
(
list_stats
,
reps
,
sizes
,
plot_dir
)
def
extract_results_sizes
(
dict_stats
,
reps
,
plot_dir
):
"""Prepare data from a dict with all results to plot the differences between analyses with different sample sizes."""
sizes
=
sorted
(
dict_stats
.
keys
())
nr_regions
,
avg_size
,
total_size
,
nr_genes
,
nr_census
,
nr_known
=
[],
[],
[],
[],
[],
[]
for
size
in
sizes
:
for
rep
in
range
(
len
(
reps
)):
for
cnv_type
in
0
,
1
:
stats
=
dict_stats
[
size
][
rep
][
cnv_type
]
nr_regions
.
append
(
float
(
stats
[
2
]))
reg_size
=
stats
[
3
]
if
reg_size
!=
'N/A'
:
avg_size
.
append
(
float
(
reg_size
))
total_size
.
append
(
float
(
stats
[
4
]))
nr_genes
.
append
(
float
(
stats
[
5
]))
nr_census
.
append
(
float
(
stats
[
7
].
split
(
" ("
)[
0
]))
nr_known
.
append
(
float
(
stats
[
6
].
split
(
" ("
)[
0
]))
plot_data
=
[
nr_regions
,
avg_size
,
total_size
,
nr_genes
,
nr_census
,
nr_known
]
def
make_plots
(
list_stats
,
reps
,
sizes
,
plot_dir
):
plot_y_axis
=
([
'Number of recurrent regions'
,
'Average size of regions (Kb)'
,
'Total size (Mb)'
,
'Number of genes'
,
'Nr. regions with census genes'
,
'Nr. regions with known genes'
])
for
plot_nr
in
range
(
len
(
plot_data
)):
plot_size_differences
(
plot_data
[
plot_nr
],
sizes
,
len
(
reps
),
plot_y_axis
[
plot_nr
],
plot_dir
)
'Number of genes'
,
'Nr. regions with known genes'
,
'Nr. regions with census genes'
])
df_stats
=
pd
.
DataFrame
(
list_stats
,
columns
=
[
'Tool'
,
'Sample size'
,
'Type'
]
+
plot_y_axis
)
for
plot_name
in
plot_y_axis
:
plot_size_differences
(
df_stats
,
plot_name
,
sizes
,
len
(
reps
),
plot_dir
)
for
tool
in
'GISTIC'
,
'RUBIC'
:
plot_size_differences_per_tool
(
df_stats
,
plot_name
,
sizes
,
len
(
reps
),
plot_dir
,
tool
)
def
plot_size_differences
(
list_size_result
s
,
list_sizes
,
nr_reps
,
y_axis
,
plot_dir
):
def
plot_size_differences
(
df_stats
,
value_y_axi
s
,
list_sizes
,
nr_reps
,
plot_dir
):
"""Plot the differences between analyses using different sample sizes."""
sns
.
set_style
(
"whitegrid"
)
df
=
pd
.
DataFrame
(
list_size_results
,
columns
=
[
'value_y_axis'
])
df
[
'type'
]
=
pd
.
Series
([
'Amplifications'
,
'Deletions'
]
*
len
(
list_sizes
)
*
nr_reps
)
sample_label
=
[]
for
size
in
list_sizes
:
sample_label
=
sample_label
+
[
size
]
*
nr_reps
*
2
df
[
'sample_size'
]
=
pd
.
Series
(
sample_label
)
g
=
sns
.
factorplot
(
x
=
"sample_size"
,
y
=
"value_y_axis"
,
col
=
"type"
,
data
=
df
,
kind
=
"box"
,
size
=
5
,
#hue="tool" for adding RUBIC in plots: colors, significance,
aspect
=
1
,
palette
=
sns
.
cubehelix_palette
(
8
,
start
=
.
5
,
rot
=-
.
75
,
dark
=
.
2
))
g
.
set_axis_labels
(
"Sample size"
,
y_axis
).
set_titles
(
"{col_name}"
).
despine
(
bottom
=
True
)
add_significance
(
g
,
df
,
list_sizes
)
png_file
=
os
.
path
.
join
(
plot_dir
,
y_axis
.
replace
(
" "
,
"_"
)
+
".png"
)
g
=
sns
.
factorplot
(
x
=
"Sample size"
,
y
=
value_y_axis
,
col
=
"Type"
,
hue
=
"Tool"
,
data
=
df_stats
,
kind
=
"box"
,
size
=
5
,
aspect
=
1
,
palette
=
sns
.
cubehelix_palette
(
8
,
start
=
.
5
,
rot
=-
.
75
,
dark
=
.
2
))
g
.
set_axis_labels
(
"Sample size"
,
value_y_axis
).
set_titles
(
"{col_name}"
).
despine
(
bottom
=
True
)
#add_significance(g, df_stats, list_sizes, value_y_axis)
png_file
=
os
.
path
.
join
(
plot_dir
,
value_y_axis
.
replace
(
" "
,
"_"
)
+
".png"
)
plt
.
savefig
(
png_file
,
dpi
=
300
)
plt
.
close
()
def
add_significance
(
g
,
df
,
list_sizes
):
def
plot_size_differences_per_tool
(
df_stats
,
value_y_axis
,
list_sizes
,
nr_reps
,
plot_dir
,
tool
):
"""Plot the differences between analyses using different sample sizes."""
sns
.
set_style
(
"whitegrid"
)
df_tool
=
df_stats
[
df_stats
[
'Tool'
]
==
tool
]
g
=
sns
.
factorplot
(
x
=
"Sample size"
,
y
=
value_y_axis
,
col
=
"Type"
,
data
=
df_tool
,
kind
=
"box"
,
size
=
5
,
aspect
=
1
,
palette
=
sns
.
cubehelix_palette
(
8
,
start
=
.
5
,
rot
=-
.
75
,
dark
=
.
2
))
g
.
set_axis_labels
(
"Sample size"
,
value_y_axis
).
set_titles
(
"{col_name}"
).
despine
(
bottom
=
True
)
#add_significance(g, df_stats, list_sizes, value_y_axis)
png_file
=
os
.
path
.
join
(
plot_dir
,
value_y_axis
.
replace
(
" "
,
"_"
)
+
"_"
+
tool
+
".png"
)
plt
.
savefig
(
png_file
,
dpi
=
300
)
plt
.
close
()
def
add_significance
(
plot
,
df
,
list_sizes
,
value_y_axis
):
"""Add significance to plot with differences between sample sizes."""
cnv_types
=
[
"Amplifications"
,
"Deletions"
]
cnv_type
=
"Amplifications"
for
ax
in
g
.
axes
.
flat
:
for
ax
in
plot
.
axes
.
flat
:
for
i
in
range
(
1
,
len
(
list_sizes
)):
x1
,
x2
=
i
-
1
,
i
pval
=
test_significance
(
x1
,
x2
,
df
,
cnv_type
,
list_sizes
)
pval
=
test_significance
(
x1
,
x2
,
df
,
cnv_type
,
list_sizes
,
value_y_axis
)
if
pval
<
.
05
:
y
,
h
,
col
=
df
[
'value_y_axis'
].
max
()
+
df
[
'value_y_axis'
].
max
()
*
0.035
,
df
[
'value_y_axis'
].
max
()
*
0.035
,
'k'
ax
.
plot
([
x1
,
x1
,
x2
,
x2
],
[
y
,
y
+
h
,
y
+
h
,
y
],
lw
=
1.5
,
c
=
col
)
ax
.
text
((
x1
+
x2
)
*
.
5
,
y
+
h
,
"*"
,
ha
=
'center'
,
va
=
'bottom'
,
color
=
col
)
y
=
df
[
value_y_axis
].
max
()
+
df
[
value_y_axis
].
max
()
*
0.035
h
=
df
[
value_y_axis
].
max
()
*
0.035
ax
.
plot
([
x1
,
x1
,
x2
,
x2
],
[
y
,
y
+
h
,
y
+
h
,
y
],
lw
=
1.5
,
c
=
'k'
)
ax
.
text
((
x1
+
x2
)
*
.
5
,
y
+
h
,
"*"
,
ha
=
'center'
,
va
=
'bottom'
,
color
=
'k'
)
cnv_type
=
"Deletions"
def
test_significance
(
x1
,
x2
,
df
,
cnv_type
,
list_sizes
):
def
test_significance
(
x1
,
x2
,
df
,
cnv_type
,
list_sizes
,
value_y_axis
):
"""Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
Returns P-value."""
only_type
=
df
[
df
[
'type'
]
==
cnv_type
]
values_x1
=
only_type
[
only_type
[
'sample_size'
]
==
list_sizes
[
x1
]]
values_x2
=
only_type
[
only_type
[
'sample_size'
]
==
list_sizes
[
x2
]]
only_tool
=
df
[
df
[
'Type'
]
==
cnv_type
]
#only_tool = only_type[only_type['Tool']=='GISTIC']
values_x1
=
only_tool
[
only_tool
[
'Sample size'
]
==
list_sizes
[
x1
]]
values_x2
=
only_tool
[
only_tool
[
'Sample size'
]
==
list_sizes
[
x2
]]
#signficance = ttest_ind(values_x1['value_y_axis'], values_x2['value_y_axis'])
significance
=
mannwhitneyu
(
values_x1
[
'
value_y_axis
'
],
values_x2
[
'
value_y_axis
'
])
significance
=
mannwhitneyu
(
values_x1
[
value_y_axis
],
values_x2
[
value_y_axis
])
return
significance
[
1
]
scripts/ReportTools.py
View file @
71025db0
...
...
@@ -31,7 +31,7 @@ def make_report(gistic_results, rubic_gain_results, rubic_loss_results,
def
make_tool_report
(
file_tools
,
stats_tools
):
"""Make a report of the results from GISTIC and RUBIC."""
row_names
=
[
"Tool"
,
"Type"
,
"Nr. regions"
,
"Avg. size (Kb)"
,
"Total size (Mb)"
,
"Nr. genes"
,
"
R
egions with known genes"
,
"
R
egions with census genes"
]
row_names
=
[
"Tool"
,
"Type"
,
"Nr. regions"
,
"Avg. size (Kb)"
,
"Total size (Mb)"
,
"Nr. genes"
,
"
Nr. r
egions with known genes"
,
"
Nr. r
egions with census genes"
]
with
open
(
file_tools
,
'w'
)
as
out
:
for
i
in
range
(
len
(
row_names
)):
list_out
=
[
row_names
[
i
]]
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment