Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Beatrice Tan
CNAprioritization
Commits
25d1700c
Commit
25d1700c
authored
Feb 03, 2018
by
Beatrice Tan
Browse files
Merge branch 'master' of
https://git.lumc.nl/bftan/CNAprioritization
parents
cc4e5d19
815b2823
Changes
6
Hide whitespace changes
Inline
Side-by-side
Snakefile
View file @
25d1700c
...
...
@@ -43,9 +43,6 @@ rule all:
"""Define desired output from pipeline."""
input:
"Samplesize/Report.txt"
#"Reports/Overlap_plots/12.png"
#expand("Reports/Overlap_plots/{region}.png", region=range(22))
rule help:
"""Print list of all targets with help."""
...
...
config.yaml
View file @
25d1700c
#Directories to be specified
workdir
:
/home/bftan/CNA_results
#directory to write output
gisticdir
:
/home/bftan/Tools/GISTIC2
_test
#directory to install GISTIC2
gisticdir
:
/home/bftan/Tools/GISTIC2
#directory to install GISTIC2
#workdir: /home/bftan/CNA_results #directory to write output
#gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
...
...
@@ -34,6 +34,6 @@ settings_gistic: "-ta 0.1
-conf
0.99"
#Settings for sample size differences
#
sizes: [30, 40, 50, 60, 70, 80, 90]
sizes
:
[
30
,
40
]
sizes
:
[
20
,
30
,
40
,
50
,
60
,
70
,
80
,
90
]
#
sizes: [30, 40]
repeats
:
[
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
]
rules/SampleSizes.smk
View file @
25d1700c
import ReportSizes
from SampleSizes import SegFile
from Rubic import MarkerFile
import os.path
import datetime
rule seg_subsets:
"""Create segmentation files with different numbers of samples (randomly chosen) for a number of times."""
...
...
@@ -17,10 +18,10 @@ rule run_gistic_subsets:
gistic_directory=os.path.join(config["gisticdir"], "gistic2"),
seg="Samplesize/Input/Size{rand_nr}.Rep{rep_nr}.txt"
output:
"Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt"
"Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt",
"Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/regions_track.conf_" + config["gistic_precision"] + ".bed"
params:
cnv="",
gistic_directory=config["gisticdir"],
ref=config["reference"],
ref_file="",
extra=config["settings_gistic"]
...
...
@@ -50,16 +51,16 @@ rule run_rubic_subsets:
rule report_sizes:
"""Report the difference when using different sample sizes."""
input:
gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
#
rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
#
rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
#
gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
output:
report="Samplesize/Report.txt",
plots="Samplesize/Plots/"
params:
census=config["census_genes"],
known=config["prev_found_genes"],
census=
os.path.join(workflow.basedir,
config["census_genes"]
)
,
known=
os.path.join(workflow.basedir,
config["prev_found_genes"]
)
,
reps=config["repeats"],
ref=config["reference"]
run:
ReportSizes.make_report(input
[0]
, params.census, params.known, params.reps, params.ref, output.report, output.plots)
ReportSizes.make_report(input
.rubic_gains, input.rubic_losses
, params.census, params.known, params.reps, params.ref, output.report, output.plots)
run_pipeline_cluster.sh
View file @
25d1700c
...
...
@@ -20,6 +20,6 @@ snakemake -p \
--latency-wait
90
\
--jobs
1
\
--max-jobs-per-second
10
\
--restart-times
3
\
--use-conda
\
--conda-prefix
~/.conda
--restart-times
3
#
--use-conda \
#
--conda-prefix ~/.conda
scripts/ReportSizes.py
View file @
25d1700c
import
numpy
as
np
import
matplotlib
import
matplotlib.pyplot
as
plt
plt
.
switch_backend
(
'agg'
)
from
matplotlib_venn
import
venn2
import
pandas
as
pd
import
seaborn
as
sns
...
...
@@ -9,25 +10,31 @@ import os.path
from
ParseResults
import
parse_regions
,
get_stats
from
collections
import
OrderedDict
def
make_report
(
size_r
esult
s
,
census_genes
,
known_genes
,
reps
,
ref_genome
,
report_file
,
plot_dir
):
def
make_report
(
size_r
ubic_gains
,
size_rubic_losse
s
,
census_genes
,
known_genes
,
reps
,
ref_genome
,
report_file
,
plot_dir
):
"""Make a report of the results produced using input files with different sample sizes."""
with
open
(
report_file
,
'w'
)
as
out
:
dict_stats
=
OrderedDict
()
row_names
=
([
"Size"
,
"Type"
,
"Nr. regions"
,
"Avg. size (Kb)"
,
"Total size (Mb)"
,
"Nr. regions with census genes"
,
"Nr. regions with known genes"
,
"Nr. genes"
])
tool
=
GISTIC2
for
size_file
in
size_results
:
dict_stats
=
{}
#row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
# "Nr. genes", "Nr. regions with known genes", "Nr. regions with census genes"])
tool
=
'RUBIC'
for
i
in
range
(
len
(
size_rubic_gains
)):
size_file
=
size_rubic_gains
[
i
],
size_rubic_losses
[
i
]
parsed_results
=
parse_regions
(
size_file
,
known_genes
,
census_genes
,
tool
)
size
,
repetition
=
size_file
.
split
(
"/"
)[
-
1
].
split
(
"x"
)
if
tool
==
'RUBIC'
:
size_rep
=
size_file
[
0
].
split
(
"Size"
)[
1
].
split
(
"/gains"
)[
0
]
size
,
repetition
=
size_rep
.
split
(
".Rep"
)
else
:
size_rep
=
size_file
.
split
(
"Size"
)[
1
].
split
(
"/all_lesions"
)[
0
]
size
,
repetition
=
size_rep
.
split
(
".Rep"
)
stats_results
=
get_stats
(
parsed_results
,
size
)
if
size
not
in
dict_stats
.
keys
():
dict_stats
[
size
]
=
[
stats_results
]
else
:
dict_stats
[
size
]
=
dict_stats
[
size
]
+
[
stats_results
]
out
.
write
(
dict_stats
)
extract_
sta
ts_sizes
(
dict_stats
,
reps
,
plot_dir
)
out
.
write
(
"
\t
"
.
join
(
stats_results
[
0
])
+
"
\n
"
+
"
\t
"
.
join
(
stats_results
[
1
])
)
extract_
resul
ts_sizes
(
dict_stats
,
reps
,
plot_dir
)
def
extract_results_sizes
(
dict_stats
,
reps
):
def
extract_results_sizes
(
dict_stats
,
reps
,
plot_dir
):
"""Prepare data from a dict with all results to plot the differences between analyses with different sample sizes."""
sizes
=
sorted
(
dict_stats
.
keys
())
nr_regions
,
avg_size
,
total_size
,
nr_genes
,
nr_census
,
nr_known
=
[],
[],
[],
[],
[],
[]
...
...
@@ -40,9 +47,9 @@ def extract_results_sizes(dict_stats, reps):
if
reg_size
!=
'N/A'
:
avg_size
.
append
(
float
(
reg_size
))
total_size
.
append
(
float
(
stats
[
4
]))
nr_genes
.
append
(
float
(
stats
[
7
]))
nr_census
.
append
(
float
(
stats
[
6
].
split
(
" ("
)[
0
]))
nr_known
.
append
(
float
(
stats
[
5
].
split
(
" ("
)[
0
]))
nr_genes
.
append
(
float
(
stats
[
5
]))
nr_census
.
append
(
float
(
stats
[
7
].
split
(
" ("
)[
0
]))
nr_known
.
append
(
float
(
stats
[
6
].
split
(
" ("
)[
0
]))
plot_data
=
[
nr_regions
,
avg_size
,
total_size
,
nr_genes
,
nr_census
,
nr_known
]
plot_y_axis
=
([
'Number of recurrent regions'
,
'Average size of regions (Kb)'
,
'Total size (Mb)'
,
'Number of genes'
,
'Nr. regions with census genes'
,
'Nr. regions with known genes'
])
...
...
@@ -72,14 +79,14 @@ def add_significance(g, df, list_sizes):
for
ax
in
g
.
axes
.
flat
:
for
i
in
range
(
1
,
len
(
list_sizes
)):
x1
,
x2
=
i
-
1
,
i
pval
=
test_significance
(
x1
,
x2
,
df
,
cnv_type
)
pval
=
test_significance
(
x1
,
x2
,
df
,
cnv_type
,
list_sizes
)
if
pval
<
.
05
:
y
,
h
,
col
=
df
[
'value_y_axis'
].
max
()
+
df
[
'value_y_axis'
].
max
()
*
0.035
,
df
[
'value_y_axis'
].
max
()
*
0.035
,
'k'
ax
.
plot
([
x1
,
x1
,
x2
,
x2
],
[
y
,
y
+
h
,
y
+
h
,
y
],
lw
=
1.5
,
c
=
col
)
ax
.
text
((
x1
+
x2
)
*
.
5
,
y
+
h
,
"*"
,
ha
=
'center'
,
va
=
'bottom'
,
color
=
col
)
cnv_type
=
"Deletions"
def
test_significance
(
x1
,
x2
,
df
,
cnv_type
):
def
test_significance
(
x1
,
x2
,
df
,
cnv_type
,
list_sizes
):
"""Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
Returns P-value."""
only_type
=
df
[
df
[
'type'
]
==
cnv_type
]
...
...
wrappers/GISTIC2/environment.yaml
View file @
25d1700c
channels
:
-
bioconda
-
conda-forge
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment