Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Beatrice Tan
CNAprioritization
Commits
815b2823
Commit
815b2823
authored
Feb 03, 2018
by
Tan
Browse files
Updated sample size report for RUBIC files.
parent
2d44810b
Changes
2
Hide whitespace changes
Inline
Side-by-side
rules/SampleSizes.smk
View file @
815b2823
...
...
@@ -51,21 +51,16 @@ rule run_rubic_subsets:
rule report_sizes:
"""Report the difference when using different sample sizes."""
input:
gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
<<<<<<< HEAD
#gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
=======
#rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
#rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
>>>>>>> 9067043362f4034a0e46f4579c366d1228927193
output:
report="Samplesize/Report.txt",
plots="Samplesize/Plots/"
params:
census=config["census_genes"],
known=config["prev_found_genes"],
census=
os.path.join(workflow.basedir,
config["census_genes"]
)
,
known=
os.path.join(workflow.basedir,
config["prev_found_genes"]
)
,
reps=config["repeats"],
ref=config["reference"]
run:
ReportSizes.make_report(input
[0]
, params.census, params.known, params.reps, params.ref, output.report, output.plots)
ReportSizes.make_report(input
.rubic_gains, input.rubic_losses
, params.census, params.known, params.reps, params.ref, output.report, output.plots)
scripts/ReportSizes.py
View file @
815b2823
import
numpy
as
np
import
matplotlib
import
matplotlib.pyplot
as
plt
plt
.
switch_backend
(
'agg'
)
from
matplotlib_venn
import
venn2
import
pandas
as
pd
import
seaborn
as
sns
...
...
@@ -9,25 +10,31 @@ import os.path
from
ParseResults
import
parse_regions
,
get_stats
from
collections
import
OrderedDict
def
make_report
(
size_r
esult
s
,
census_genes
,
known_genes
,
reps
,
ref_genome
,
report_file
,
plot_dir
):
def
make_report
(
size_r
ubic_gains
,
size_rubic_losse
s
,
census_genes
,
known_genes
,
reps
,
ref_genome
,
report_file
,
plot_dir
):
"""Make a report of the results produced using input files with different sample sizes."""
with
open
(
report_file
,
'w'
)
as
out
:
dict_stats
=
OrderedDict
()
row_names
=
([
"Size"
,
"Type"
,
"Nr. regions"
,
"Avg. size (Kb)"
,
"Total size (Mb)"
,
"Nr. regions with census genes"
,
"Nr. regions with known genes"
,
"Nr. genes"
])
tool
=
GISTIC2
for
size_file
in
size_results
:
dict_stats
=
{}
#row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
# "Nr. genes", "Nr. regions with known genes", "Nr. regions with census genes"])
tool
=
'RUBIC'
for
i
in
range
(
len
(
size_rubic_gains
)):
size_file
=
size_rubic_gains
[
i
],
size_rubic_losses
[
i
]
parsed_results
=
parse_regions
(
size_file
,
known_genes
,
census_genes
,
tool
)
size
,
repetition
=
size_file
.
split
(
"/"
)[
-
1
].
split
(
"x"
)
if
tool
==
'RUBIC'
:
size_rep
=
size_file
[
0
].
split
(
"Size"
)[
1
].
split
(
"/gains"
)[
0
]
size
,
repetition
=
size_rep
.
split
(
".Rep"
)
else
:
size_rep
=
size_file
.
split
(
"Size"
)[
1
].
split
(
"/all_lesions"
)[
0
]
size
,
repetition
=
size_rep
.
split
(
".Rep"
)
stats_results
=
get_stats
(
parsed_results
,
size
)
if
size
not
in
dict_stats
.
keys
():
dict_stats
[
size
]
=
[
stats_results
]
else
:
dict_stats
[
size
]
=
dict_stats
[
size
]
+
[
stats_results
]
out
.
write
(
dict_stats
)
extract_
sta
ts_sizes
(
dict_stats
,
reps
,
plot_dir
)
out
.
write
(
"
\t
"
.
join
(
stats_results
[
0
])
+
"
\n
"
+
"
\t
"
.
join
(
stats_results
[
1
])
)
extract_
resul
ts_sizes
(
dict_stats
,
reps
,
plot_dir
)
def
extract_results_sizes
(
dict_stats
,
reps
):
def
extract_results_sizes
(
dict_stats
,
reps
,
plot_dir
):
"""Prepare data from a dict with all results to plot the differences between analyses with different sample sizes."""
sizes
=
sorted
(
dict_stats
.
keys
())
nr_regions
,
avg_size
,
total_size
,
nr_genes
,
nr_census
,
nr_known
=
[],
[],
[],
[],
[],
[]
...
...
@@ -40,9 +47,9 @@ def extract_results_sizes(dict_stats, reps):
if
reg_size
!=
'N/A'
:
avg_size
.
append
(
float
(
reg_size
))
total_size
.
append
(
float
(
stats
[
4
]))
nr_genes
.
append
(
float
(
stats
[
7
]))
nr_census
.
append
(
float
(
stats
[
6
].
split
(
" ("
)[
0
]))
nr_known
.
append
(
float
(
stats
[
5
].
split
(
" ("
)[
0
]))
nr_genes
.
append
(
float
(
stats
[
5
]))
nr_census
.
append
(
float
(
stats
[
7
].
split
(
" ("
)[
0
]))
nr_known
.
append
(
float
(
stats
[
6
].
split
(
" ("
)[
0
]))
plot_data
=
[
nr_regions
,
avg_size
,
total_size
,
nr_genes
,
nr_census
,
nr_known
]
plot_y_axis
=
([
'Number of recurrent regions'
,
'Average size of regions (Kb)'
,
'Total size (Mb)'
,
'Number of genes'
,
'Nr. regions with census genes'
,
'Nr. regions with known genes'
])
...
...
@@ -72,14 +79,14 @@ def add_significance(g, df, list_sizes):
for
ax
in
g
.
axes
.
flat
:
for
i
in
range
(
1
,
len
(
list_sizes
)):
x1
,
x2
=
i
-
1
,
i
pval
=
test_significance
(
x1
,
x2
,
df
,
cnv_type
)
pval
=
test_significance
(
x1
,
x2
,
df
,
cnv_type
,
list_sizes
)
if
pval
<
.
05
:
y
,
h
,
col
=
df
[
'value_y_axis'
].
max
()
+
df
[
'value_y_axis'
].
max
()
*
0.035
,
df
[
'value_y_axis'
].
max
()
*
0.035
,
'k'
ax
.
plot
([
x1
,
x1
,
x2
,
x2
],
[
y
,
y
+
h
,
y
+
h
,
y
],
lw
=
1.5
,
c
=
col
)
ax
.
text
((
x1
+
x2
)
*
.
5
,
y
+
h
,
"*"
,
ha
=
'center'
,
va
=
'bottom'
,
color
=
col
)
cnv_type
=
"Deletions"
def
test_significance
(
x1
,
x2
,
df
,
cnv_type
):
def
test_significance
(
x1
,
x2
,
df
,
cnv_type
,
list_sizes
):
"""Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
Returns P-value."""
only_type
=
df
[
df
[
'type'
]
==
cnv_type
]
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment