Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
C
CNAprioritization
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Beatrice Tan
CNAprioritization
Commits
2d44810b
Commit
2d44810b
authored
Feb 03, 2018
by
Tan
Browse files
Options
Browse Files
Download
Plain Diff
Merged.
parents
0c65624f
90670433
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
303 additions
and
251 deletions
+303
-251
Snakefile
Snakefile
+3
-4
rules/Circos.smk
rules/Circos.smk
+29
-1
rules/ComparisonRegions.smk
rules/ComparisonRegions.smk
+16
-23
rules/SampleSizes.smk
rules/SampleSizes.smk
+5
-0
scripts/Circos.py
scripts/Circos.py
+16
-0
scripts/ParseResults.py
scripts/ParseResults.py
+78
-63
scripts/ReportControl.py
scripts/ReportControl.py
+2
-4
scripts/ReportSegments.py
scripts/ReportSegments.py
+1
-1
scripts/ReportSizes.py
scripts/ReportSizes.py
+2
-4
scripts/ReportTools.py
scripts/ReportTools.py
+73
-149
scripts/circos/circos_zoom.conf
scripts/circos/circos_zoom.conf
+41
-0
scripts/ensemblQueries.py
scripts/ensemblQueries.py
+35
-0
wrappers/GISTIC2/wrapper.py
wrappers/GISTIC2/wrapper.py
+2
-2
No files found.
Snakefile
View file @
2d44810b
...
...
@@ -14,7 +14,7 @@ include: "rules/GenePrioritization.smk"
#Rules to compare different inputs.
include: "rules/ComparisonRegions.smk"
include: "rules/Circos.
yaml
"
include: "rules/Circos.
smk
"
include: "rules/SampleSizes.smk"
include: "rules/UseControl.smk"
...
...
@@ -43,8 +43,6 @@ rule all:
"""Define desired output from pipeline."""
input:
"Samplesize/Report.txt"
#"Reports/Results.html"
rule help:
"""Print list of all targets with help."""
...
...
@@ -65,7 +63,8 @@ rule report:
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png",
circos="Reports/Circos/RecurrentRegions.png",
circos_legend="Reports/Circos/RecurrentRegions_legend.png"
circos_legend="Reports/Circos/RecurrentRegions_legend.png",
known_genes="Reports/Overlap_known_genes.bed"
output:
html="Reports/Results.html"
run:
...
...
rules/Circos.
yaml
→
rules/Circos.
smk
View file @
2d44810b
from Circos import InputCircos
from Circos import InputCircos
, bed_to_circos
rule circos_input:
"""Make input files for making a circos diagram."""
...
...
@@ -60,3 +60,31 @@ rule add_legend_circos:
offset = ((c_w - l_w), 0)
circos.paste(legend, offset)
circos.save(output[0])
rule circos_input_zoom:
"""Make input files for making a circos diagram."""
input:
bed="Reports/Overlap_known_genes.bed"
output:
gistic="Reports/Circos/Zoom/GISTIC.txt",
rubic="Reports/Circos/Zoom/RUBIC.txt",
genes="Reports/Circos/Zoom/Genes.txt",
run:
bed_to_circos(input.bed, output.rubic, output.gistic, output.genes)
rule make_circos_zoom:
"""Compare locations of known genes, recurrent regions from RUBIC and recurrent regions from GISTIC2."""
input:
gistic="Reports/Circos/Zoom/GISTIC.txt",
rubic="Reports/Circos/Zoom/RUBIC.txt",
genes="Reports/Circos/Zoom/Genes.txt",
output:
plots="Reports/Overlap_plots/12.png"
params:
workflow.basedir + "/scripts/circos/circos_zoom.conf",
chrom='hs12'
conda:
workflow.basedir + "/envs/circos.yaml"
shell:
"circos -conf {params[0]} -outputfile {output[0]} -param gistic_file={input.gistic} -param rubic_file={input.rubic} \
-param gene_file={input.genes} -param chrom={params.chrom}"
rules/ComparisonRegions.smk
View file @
2d44810b
...
...
@@ -9,7 +9,7 @@ rule report_tools:
gistic="GISTIC/all_lesions.conf_" + config["gistic_precision"] + ".txt",
rubic_gain="RUBIC/gains.txt",
rubic_loss="RUBIC/losses.txt",
overlap="Reports/
Overlap_regions
.bed"
overlap="Reports/
Regions_overlapping_other_tool
.bed"
output:
tools="Reports/Tools.txt",
table_regions="Reports/Recurrent_regions.txt",
...
...
@@ -18,7 +18,7 @@ rule report_tools:
genes_rubic="Reports/Genes_RUBIC.txt",
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png",
pvals="Reports/Overlap_pvalues.png
"
bed_known="Reports/Locations_known_genes.bed
"
params: #select input files from repository or own input files
census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"],
known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"],
...
...
@@ -29,7 +29,7 @@ rule report_tools:
params.census, params.known, params.ref, params.biomart_info,
output.tools, output.table_regions, output.venn, output.swarmplot,
output.genes_both, output.genes_gistic, output.genes_rubic,
input.overlap, output.
pvals
)
input.overlap, output.
bed_known
)
rule bed_intersect:
"""Intersect the recurrent regions detected by RUBIC and GISTIC2.0."""
...
...
@@ -37,28 +37,21 @@ rule bed_intersect:
gistic="GISTIC/regions_track.conf_" + config["gistic_precision"] + ".bed",
rubic="RUBIC/regions_track.bed"
output:
"Reports/
Overlap_regions
.bed"
"Reports/
Regions_overlapping_other_tool
.bed"
conda:
workflow.basedir + "/envs/bedtools.yaml"
shell:
"bedtools intersect -a {input.gistic} -b {input.rubic} -wo > {output}"
def get_regions(bed_file):
plot_names = []
with open(bed_file, 'r') as bed:
bed.readline()
for line in bed:
chrom, start = line.split("\t")[0:2]
plot_names.append(chrom + "." + start)
return plot_names
#rule compare_regions:
# """Compare locations of known genes, recurrent regions from RUBIC and recurrent regions from GISTIC2."""
# input:
# overlap="Reports/Overlap_regions.bed"
# params:
# known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"]
# output:
# plots=expand("Reports/Overlap_plots/{region}.png", region=get_regions(input.overlap))
# shell:
# "R {workflow.basedir}/scripts/plot_regions.R"
rule bed_known_genes:
"""Intersect known genes and recurrent regions detected by RUBIC and GISTIC2.0."""
input:
known="Reports/Locations_known_genes.bed",
gistic="GISTIC/regions_track.conf_" + config["gistic_precision"] + ".bed",
rubic="RUBIC/regions_track.bed"
output:
"Reports/Overlap_known_genes.bed"
conda:
workflow.basedir + "/envs/bedtools.yaml"
shell:
"bedtools intersect -a {input.known} -b {input.gistic} {input.rubic} -names GISTIC RUBIC -wo > {output}"
rules/SampleSizes.smk
View file @
2d44810b
...
...
@@ -52,8 +52,13 @@ rule report_sizes:
"""Report the difference when using different sample sizes."""
input:
gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
<<<<<<< HEAD
rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
=======
#rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
#rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
>>>>>>> 9067043362f4034a0e46f4579c366d1228927193
output:
report="Samplesize/Report.txt",
plots="Samplesize/Plots/"
...
...
scripts/Circos.py
View file @
2d44810b
...
...
@@ -45,3 +45,19 @@ class InputCircos:
qval
=
str
(
max
([
float
(
left_q
),
float
(
right_q
)]))
out_line
=
[
"hs"
+
chrom
,
start
,
end
,
qval
]
out
.
write
(
" "
.
join
(
out_line
)
+
"
\n
"
)
def
bed_to_circos
(
bed_file
,
rubic_file
,
gistic_file
,
gene_file
):
rubic
=
open
(
rubic_file
,
'w'
)
gistic
=
open
(
gistic_file
,
'w'
)
genes
=
open
(
gene_file
,
'w'
)
with
open
(
bed_file
,
'r'
)
as
bed
:
for
line
in
bed
:
gene_chrom
,
gene_start
,
gene_end
,
gene_name
,
tool_name
,
tool_chrom
,
tool_start
,
tool_end
,
\
amp_name
,
overlap_bp
=
line
.
split
(
"
\t
"
)
chrom
=
'hs'
+
gene_chrom
.
strip
(
"chr"
)
genes
.
write
(
" "
.
join
([
chrom
,
gene_start
,
gene_end
])
+
"
\n
"
)
if
tool_name
==
'GISTIC'
:
gistic
.
write
(
" "
.
join
([
chrom
,
tool_start
,
tool_end
])
+
"
\n
"
)
else
:
rubic
.
write
(
" "
.
join
([
chrom
,
tool_start
,
tool_end
])
+
"
\n
"
)
rubic
.
close
(),
gistic
.
close
(),
genes
.
close
()
scripts/ParseResults.py
View file @
2d44810b
import
numpy
as
np
import
os.path
import
pyensembl
ensembl
=
pyensembl
.
EnsemblRelease
(
75
)
from
math
import
log10
from
collections
import
OrderedDict
from
ensemblQueries
import
genes_at_locus
def
install_ensembl
(
reference_genome
):
"""Import ensembl if needed and load correct release based on reference genome."""
if
"hg19"
in
reference_genome
:
ensembl
=
pyensembl
.
EnsemblRelease
(
75
)
elif
"38"
in
reference_genome
:
ensembl
=
pyensembl
.
EnsemblRelease
(
87
)
else
:
raise
ValueError
(
"Unknown reference genome used."
)
try
:
#only done first time to install ensembl version.
ensembl
.
download
()
ensembl
.
index
()
except
:
pass
def
parse
(
results_file
,
known_file
,
census_file
,
used_tool
):
def
parse_regions
(
results_file
,
known_file
,
census_file
,
used_tool
):
"""Make list with the parsed results from GISTIC or RUBIC.
Format: [chrom, start, end, cnv_type, qval, [known_gene_list], [census_gene_list], [gene_list]]"""
known_genes
=
gene_file_IDs
(
known_file
,
np
.
inf
)
census_genes
=
gene_file_IDs
(
census_file
,
np
.
inf
)
known_genes
=
parse_gene_file
(
known_file
,
np
.
inf
)
census_genes
=
parse_gene_file
(
census_file
,
np
.
inf
)
if
used_tool
==
'GISTIC'
:
recurrent_regions
=
parse_gistic_results
(
results_file
,
known_genes
,
census_genes
)
else
:
recurrent_regions
=
parse_rubic_results
(
results_file
[
0
],
results_file
[
1
],
known_genes
,
census_genes
)
return
recurrent_regions
def
parse_gene_file
(
gene_file
,
top_nr
):
"""Parse a file with a list of genes (one gene per line or first column).
Top_nr: the number of genes to extract (requires ranked gene list file), use np.inf to extract all genes."""
list_genes
=
[]
num_read
=
0
with
open
(
gene_file
,
'r'
)
as
genes
:
for
line
in
genes
:
if
line
.
startswith
(
"Supplementary"
)
or
line
.
startswith
(
"
\t
"
)
or
line
.
startswith
(
"Gene"
):
#deal with supplement files with one or more header lines
continue
else
:
gene
=
line
.
split
(
"
\t
"
)[
0
]
list_genes
.
append
(
gene
)
if
top_nr
!=
np
.
inf
:
list_genes
=
list_genes
[
0
:
top_nr
]
return
list_genes
def
parse_gistic_results
(
results_file
,
known_genes
,
census_genes
):
"""Create a list of lists with the recurrent regions called by GISTIC2."""
regions
=
[]
...
...
@@ -46,7 +47,6 @@ def parse_gistic_results(results_file, known_genes, census_genes):
start
,
end
=
bp
.
split
(
"-"
)
cnv_type
=
'amp'
if
stats
[
0
].
startswith
(
"Amplification"
)
else
'del'
qval
=
log10
(
float
(
stats
[
5
]))
*
-
1
#Convert q-value to log10(q-value)
#list_genes = genes_locus(chrom, int(start), int(end))
result_dir
,
conf
=
results_file
.
split
(
"all_lesions"
)
gene_file
=
os
.
path
.
join
(
result_dir
,
cnv_type
+
"_genes"
+
conf
)
list_genes
=
gistic_genes
(
gene_file
,
CNV_id
)
...
...
@@ -71,7 +71,7 @@ def gistic_genes(gene_file, CNV_id):
return
out_genes
def
parse_rubic_results
(
results_gains
,
results_losses
,
known_genes
,
census_genes
):
"""Create a list of lists with the recurrent regions called by
GISTIC2
."""
"""Create a list of lists with the recurrent regions called by
RUBIC
."""
regions
=
[]
for
result_file
in
results_gains
,
results_losses
:
with
open
(
result_file
,
'r'
)
as
lesions
:
...
...
@@ -80,69 +80,84 @@ def parse_rubic_results(results_gains, results_losses, known_genes, census_genes
chrom
,
start
,
end
,
qval_left
,
qval_right
,
genes
=
lesion
.
split
(
"
\t
"
)
CNV_id
=
"chr"
+
chrom
+
":"
+
start
+
"-"
+
end
cnv_type
=
'amp'
if
result_file
.
endswith
(
'gains.txt'
)
else
'del'
#list_genes = genes_locus(chrom, int(start), int(end))
list_genes
=
genes
.
strip
().
split
(
","
)
print
(
list_genes
)
known_overlap
=
set
(
list_genes
).
intersection
(
set
(
known_genes
))
census_overlap
=
set
(
list_genes
).
intersection
(
set
(
census_genes
))
cnv_data
=
[
chrom
,
start
,
end
,
cnv_type
,
(
qval_left
,
qval_right
),
list
(
known_overlap
),
list
(
census_overlap
),
list_genes
]
regions
.
append
(
cnv_data
)
return
regions
#def genes_locus(chrom, start, end):
# """Get list of gene IDs at certain location."""
# IDs = []
# gene_info = ensembl.genes_at_locus(chrom, start, end)
# for gene in gene_info:
# ID = gene.gene_id
# IDs.append(ID)
# return IDs
def
gene_file_IDs
(
gene_file
,
top_nr
):
"""Parse a file with a list of genes (one gene per line or first column).
Top_nr: the number of genes to extract (requires ranked gene list file), use np.inf to extract all genes."""
list_genes
=
[]
num_read
=
0
with
open
(
gene_file
,
'r'
)
as
genes
:
for
line
in
genes
:
if
line
.
startswith
(
"Supplementary"
)
or
line
.
startswith
(
"
\t
"
)
or
line
.
startswith
(
"Gene"
):
#deal with supplement files with one or more header lines
continue
def
parse_bed_overlap
(
bed_file
,
known_file
,
census_file
):
"""Parse the bed file with overlapping regions.
Returns for amplifications and deletions seperately: ['Overlap', type, number of regions,
average size of regions, total size of regions, number of genes detected]"""
known_genes
=
parse_gene_file
(
known_file
,
np
.
inf
)
census_genes
=
parse_gene_file
(
census_file
,
np
.
inf
)
return_stats
=
[]
for
cnv_type
in
'amp'
,
'del'
:
cnv_info
=
[
0
,
0
,
0
,
0
,
0
]
with
open
(
bed_file
,
'r'
)
as
lesions
:
for
line
in
lesions
:
first_chr
,
first_start
,
first_end
,
first_nr
,
sec_chr
,
sec_start
,
sec_end
,
sec_nr
,
bp_overlap
=
line
.
strip
().
split
(
"
\t
"
)
begin_overlap
=
max
(
int
(
first_start
),
int
(
sec_start
))
end_overlap
=
min
(
int
(
first_end
),
int
(
sec_end
))
if
cnv_type
==
'amp'
:
if
(
first_nr
.
startswith
(
"Any-AP"
)
and
sec_nr
.
startswith
(
"amp"
))
or
(
sec_nr
.
startswith
(
"Any-AP"
)
and
first_nr
.
starswith
(
"amp"
)):
cnv_info
=
add_numbers_bed_overlap
(
cnv_info
,
bp_overlap
,
first_chr
,
begin_overlap
,
end_overlap
,
known_genes
,
census_genes
)
else
:
gene
=
line
.
split
(
"
\t
"
)[
0
]
list_genes
.
append
(
gene
)
#try:
# gene_ID = ensembl.gene_ids_of_gene_name(gene)
# list_genes = list_genes + gene_ID
#except:
# pass
if
top_nr
!=
np
.
inf
:
list_genes
=
list_genes
[
0
:
top_nr
]
return
list_genes
if
(
first_nr
.
startswith
(
"Any-DP"
)
and
sec_nr
.
startswith
(
"del"
))
or
(
sec_nr
.
startswith
(
"Any-DP"
)
and
first_nr
.
starswith
(
"del"
)):
cnv_info
=
add_numbers_bed_overlap
(
cnv_info
,
bp_overlap
,
first_chr
,
begin_overlap
,
end_overlap
,
known_genes
,
census_genes
)
stats
=
calculate_stats
(
cnv_info
[
0
],
cnv_info
[
1
],
cnv_info
[
2
],
cnv_info
[
3
],
cnv_info
[
4
],
cnv_type
,
'Overlap'
)
return_stats
.
append
(
stats
)
return
return_stats
def
add_numbers_bed_overlap
(
cnv_info
,
bp_overlap
,
first_chr
,
begin_overlap
,
end_overlap
,
known_genes
,
census_genes
):
"""Add counts to list with stats.
cnv_info = [total size, number of genes, number of regions, number of regions with known genes,
number of regions with census genes]"""
list_genes
=
genes_at_locus
(
first_chr
,
begin_overlap
,
end_overlap
)
unique_genes
=
list
(
set
(
list_genes
))
cnv_info
[
0
]
+=
int
(
bp_overlap
)
#total size
cnv_info
[
1
]
+=
len
(
unique_genes
)
#number genes
cnv_info
[
2
]
+=
1
#number regions
known_overlap
=
set
(
list_genes
).
intersection
(
set
(
known_genes
))
if
list
(
known_overlap
)
!=
[]:
cnv_info
[
3
]
+=
1
census_overlap
=
set
(
list_genes
).
intersection
(
set
(
census_genes
))
if
list
(
census_overlap
)
!=
[]:
cnv_info
[
4
]
+=
1
return
cnv_info
def
get_stats
(
list_regions
,
used_tool
):
"""Calculate stats on the results from GISTIC2 and RUBIC."""
return_info
=
[]
for
cnv_type
in
'amp'
,
'del'
:
tot_size
,
nr_genes
,
nr_reg
_nogenes
,
nr_reg
,
known_count
,
census_count
=
0
,
0
,
0
,
0
,
0
,
0
tot_size
,
nr_genes
,
nr_reg
,
known_count
,
census_count
=
0
,
0
,
0
,
0
,
0
all_genes
=
[]
for
alt
in
list_regions
:
if
alt
[
3
]
==
cnv_type
:
#get amp stats
nr_reg
+=
1
tot_size
+=
int
(
alt
[
2
])
-
int
(
alt
[
1
])
nr_genes
+=
len
(
alt
[
7
])
#long list genes > unique genes
all_genes
+=
[
alt
[
7
]]
all_genes
=
all_genes
+
alt
[
7
]
if
alt
[
5
]
!=
[]:
known_count
+=
1
if
alt
[
6
]
!=
[]:
census_count
+=
1
type_name
=
'Amplifications'
if
cnv_type
==
'amp'
else
'Deletions'
avg_size
=
int
(
tot_size
/
nr_reg
/
1000
)
if
nr_reg
!=
0
else
'N/A'
#average size in kb
tot_size
=
tot_size
/
1000000
#total size in mb
stats
=
[
used_tool
,
type_name
,
nr_reg
,
avg_size
,
tot_size
,
nr_genes
]
stats
=
list
(
map
(
str
,
stats
))
for
count
in
known_count
,
census_count
:
percent
=
(
count
/
nr_reg
*
100
)
if
count
!=
0
else
0
stat_str
=
str
(
count
)
+
" ("
+
str
(
round
(
percent
,
2
))
+
"%)"
stats
.
append
(
stat_str
)
nr_genes
=
len
(
list
(
set
(
all_genes
)))
#unique genes only
stats
=
calculate_stats
(
tot_size
,
nr_genes
,
nr_reg
,
known_count
,
census_count
,
cnv_type
,
used_tool
)
return_info
.
append
(
stats
)
return
return_info
def
calculate_stats
(
tot_size
,
nr_genes
,
nr_reg
,
known_count
,
census_count
,
cnv_type
,
used_tool
):
"""Calculate average & total numbers for report on recurrent regions."""
type_name
=
'Amplifications'
if
cnv_type
==
'amp'
else
'Deletions'
avg_size
=
int
(
tot_size
/
nr_reg
/
1000
)
if
nr_reg
!=
0
else
'N/A'
#average size in kb
round_tot_size
=
round
((
tot_size
/
1000000
),
2
)
#total size in mb
stats
=
[
used_tool
,
type_name
,
nr_reg
,
avg_size
,
round_tot_size
,
nr_genes
]
stats
=
list
(
map
(
str
,
stats
))
for
count
in
known_count
,
census_count
:
percent
=
(
count
/
nr_reg
*
100
)
if
count
!=
0
else
0
stat_str
=
str
(
count
)
+
" ("
+
str
(
round
(
percent
,
2
))
+
"%)"
stats
.
append
(
stat_str
)
return
stats
scripts/ReportControl.py
View file @
2d44810b
...
...
@@ -6,19 +6,17 @@ import pandas as pd
import
seaborn
as
sns
from
scipy.stats
import
ttest_ind
,
mannwhitneyu
import
os.path
import
pyensembl
from
ParseResults
import
parse
,
get_stats
,
install_ensembl
from
ParseResults
import
parse_regions
,
get_stats
from
collections
import
OrderedDict
def
make_report
(
control_results
,
nocontrol_results
,
report_file
,
census_genes
,
known_genes
,
ref_genome
):
"""Make a report on analyses using control samples or without using them."""
install_ensembl
(
ref_genome
)
parsed_tools
,
stats_tools
=
[],
[]
with
open
(
report_file
,
'w'
)
as
out
:
row_names
=
([
"Control used?"
,
"Type"
,
"Nr. regions"
,
"Avg. size (Kb)"
,
"Total size (Mb)"
,
"Nr. genes"
,
"Nr. regions with census genes"
,
"Nr. regions with known genes"
])
for
result
in
control_results
,
nocontrol_results
:
parsed_results
=
parse
()
.
gistic_results
(
result
)
parsed_results
=
parse
_regions
.
gistic_results
(
result
)
parsed_tools
.
append
(
parsed_results
)
label
=
"Control"
if
result
==
control_results
else
"No control"
stats_results
=
stats
().
calculate_stats
(
parsed_results
,
census_genes
,
known_genes
,
label
)
...
...
scripts/ReportSegments.py
View file @
2d44810b
...
...
@@ -7,7 +7,7 @@ import seaborn as sns
from
scipy.stats
import
ttest_ind
,
mannwhitneyu
import
os.path
import
pyensembl
from
ParseResults
import
parse
,
get_stats
from
ParseResults
import
get_stats
from
collections
import
OrderedDict
def
make_report
(
segmentation_file
,
report_file
):
...
...
scripts/ReportSizes.py
View file @
2d44810b
...
...
@@ -6,20 +6,18 @@ import pandas as pd
import
seaborn
as
sns
from
scipy.stats
import
ttest_ind
,
mannwhitneyu
import
os.path
import
pyensembl
from
ParseResults
import
parse
,
get_stats
,
install_ensembl
from
ParseResults
import
parse_regions
,
get_stats
from
collections
import
OrderedDict
def
make_report
(
size_results
,
census_genes
,
known_genes
,
reps
,
ref_genome
,
report_file
,
plot_dir
):
"""Make a report of the results produced using input files with different sample sizes."""
install_ensembl
(
ref_genome
)
with
open
(
report_file
,
'w'
)
as
out
:
dict_stats
=
OrderedDict
()
row_names
=
([
"Size"
,
"Type"
,
"Nr. regions"
,
"Avg. size (Kb)"
,
"Total size (Mb)"
,
"Nr. regions with census genes"
,
"Nr. regions with known genes"
,
"Nr. genes"
])
tool
=
GISTIC2
for
size_file
in
size_results
:
parsed_results
=
parse
(
size_file
,
known_genes
,
census_genes
,
tool
)
parsed_results
=
parse
_regions
(
size_file
,
known_genes
,
census_genes
,
tool
)
size
,
repetition
=
size_file
.
split
(
"/"
)[
-
1
].
split
(
"x"
)
stats_results
=
get_stats
(
parsed_results
,
size
)
if
size
not
in
dict_stats
.
keys
():
...
...
scripts/ReportTools.py
View file @
2d44810b
import
numpy
as
np
import
matplotlib
import
matplotlib.pyplot
as
plt
from
matplotlib_venn
import
venn2
from
matplotlib_venn
import
venn2
,
venn3
import
pandas
as
pd
import
seaborn
as
sns
from
scipy.stats
import
ttest_ind
,
mannwhitneyu
,
pearsonr
import
os.path
import
pyensembl
from
ParseResults
import
parse
,
get_stats
,
install_ensembl
from
ParseResults
import
parse_regions
,
get_stats
,
parse_bed_overlap
,
parse_gene_file
from
collections
import
OrderedDict
import
math
...
...
@@ -15,43 +14,33 @@ def make_report(gistic_results, rubic_gain_results, rubic_loss_results,
census_genes
,
known_genes
,
ref_genome
,
biomart_file
,
file_tools
,
file_regions
,
file_venn
,
file_swarmplot
,
file_genes_both
,
file_genes_GISTIC
,
file_genes_RUBIC
,
bed_overlap
,
file_pvalu
es
):
bed_overlap
,
bed_known_gen
es
):
"""Make a report and gene file from the analyses with GISTIC2 and RUBIC"""
install_ensembl
(
ref_genome
)
parsed
,
stats
=
[],
[]
for
tool
in
'GISTIC'
,
'RUBIC'
:
results
=
gistic_results
if
tool
==
'GISTIC'
else
[
rubic_gain_results
,
rubic_loss_results
]
parsed_results
=
parse
(
results
,
known_genes
,
census_genes
,
tool
)
stats_results
=
get_stats
(
parsed_results
,
tool
)
parsed_results
=
parse_regions
(
results
,
known_genes
,
census_genes
,
tool
)
parsed
.
append
(
parsed_results
)
stats
.
append
(
stats_results
)
make_files
(
parsed
,
stats
,
file_tools
,
file_genes_GISTIC
,
file_genes_RUBIC
,
file_genes_both
,
file_venn
,
file_swarmplot
,
file_regions
,
biomart_file
,
bed_overlap
,
file_pvalues
)
def
make_files
(
parsed
,
stats
,
file_tools
,
file_genes_GISTIC
,
file_genes_RUBIC
,
file_genes_both
,
file_venn
,
file_swarmplot
,
file_regions
,
biomart_file
,
bed_overlap
,
file_pvalues
):
#overlap_most_sign(parsed)
table_regions
(
parsed
,
file_regions
)
make_tool_report
(
file_tools
,
stats
)
all_genes
=
make_genefiles
(
parsed
,
file_genes_GISTIC
,
file_genes_RUBIC
)
make_genefile_overlap
(
all_genes
,
file_genes_both
)
venn_overlap
(
all_genes
,
file_venn
)
#compare_known_regions(known_genes, file_overlap, biomart_file,
# gistic_results, rubic_gain_results, rubic_loss_results)
plot_sizes
(
parsed
,
file_swarmplot
)
#compare_overlapping_regions(parsed, biomart_file, file_overlap)
overlapping_pvalues
(
bed_overlap
,
parsed
,
file_pvalues
)
stats
.
append
(
get_stats
(
parsed_results
,
tool
))
stats
.
append
(
parse_bed_overlap
(
bed_overlap
,
known_genes
,
census_genes
))
table_all_regions
(
parsed
,
file_regions
)
#make table with all regions
make_tool_report
(
file_tools
,
stats
)
#make tool report
all_genes
=
make_gene_files
(
parsed
,
file_genes_GISTIC
,
file_genes_RUBIC
,
file_genes_both
,
known_genes
,
file_venn
)
#make gene files and venn overlap plot
plot_histogram_sizes
(
parsed
,
file_swarmplot
)
#plot histogram with the sizes of the regions
make_bed_known_regions
(
known_genes
,
biomart_file
,
bed_known_genes
)
#Make bed file with the locations of known genes.
def
make_tool_report
(
file_tools
,
stats_tools
):
"""Make a report of the results from GISTIC and RUBIC."""
row_names
=
[
"Tool"
,
"Type"
,
"Nr. regions"
,
"Avg. size (Kb)"
,
"Total size (Mb)"
,
"Nr. genes"
,
"Regions with known genes"
,
"Regions with census genes"
]
with
open
(
file_tools
,
'w'
)
as
out
:
for
i
in
range
(
len
(
row_names
)):
list_out
=
[
row_names
[
i
],
stats_tools
[
0
][
0
][
i
],
stats_tools
[
0
][
1
][
i
],
stats_tools
[
1
][
0
][
i
],
stats_tools
[
1
][
1
][
i
]]
list_out
=
[
row_names
[
i
]]
for
j
in
range
(
len
(
stats_tools
)):
list_out
+=
[
stats_tools
[
j
][
0
][
i
],
stats_tools
[
j
][
1
][
i
]]
list_out
=
list
(
map
(
str
,
list_out
))
out
.
write
(
"
\t
"
.
join
(
list_out
)
+
"
\n
"
)
def
table_regions
(
parsed_regions
,
file_regions
):
def
table_
all_
regions
(
parsed_regions
,
file_regions
):
"""Make a table with all recurent regions."""
with
open
(
file_regions
,
'w'
)
as
out
:
header
=
[
"Method"
,
"Chr"
,
"Start"
,
"End"
,
"Type"
,
"Negative log10 q-value"
,
"Known genes"
,
"Census genes"
,
"All genes"
]
...
...
@@ -69,7 +58,7 @@ def table_regions(parsed_regions, file_regions):
out
.
write
(
"
\n
"
)
tool_name
=
"RUBIC"
def
make_gene
files
(
parsed_results
,
out_GISTIC
,
out_RUBIC
):
def
make_gene
_files
(
parsed_results
,
out_GISTIC
,
out_RUBIC
,
out_both
,
known_genes
,
out_venn
):
"""Make files with all genes reported by GISTIC2 and RUBIC."""
genes_all
=
[]
for
tool_results
in
parsed_results
:
...
...
@@ -82,38 +71,39 @@ def make_genefiles(parsed_results, out_GISTIC, out_RUBIC):
out_file
=
out_GISTIC
if
tool_results
==
parsed_results
[
0
]
else
out_RUBIC
with
open
(
out_file
,
'w'
)
as
out
:
out
.
write
(
"
\n
"
.
join
(
genes_tool
))
return
genes_all
make_gene_file_overlap
(
genes_all
,
out_both
)
venn3_overlap
(
genes_all
,
known_genes
,
out_venn
)
def
make_genefile_overlap
(
gene_lists
,
out_both
):
def
make_gene_file_overlap
(
gene_lists
,
out_both
):
"""Make file with all genes reported by both tools."""
overlap
=
set
(
gene_lists
[
0
]).
intersection
(
set
(
gene_lists
[
1
]))
with
open
(
out_both
,
'w'
)
as
overlap_file
:
overlap_file
.
write
(
"
\n
"
.
join
(
list
(
overlap
)))
def
parse_sizes
(
parsed_results
):
"""Calculate the sizes of regions to use as input swarmplot of sizes of recurrent regions."""
sizes
,
sizes_zoomed
=
[],
[]
tool
=
'GISTIC2.0'
for
tool_results
in
parsed_results
:
amp_sizes
,
del_sizes
=
[],
[]
for
cnv
in
tool_results
:
size
=
int
(
cnv
[
2
])
-
int
(
cnv
[
1
])
cnv_label
=
'Amplification'
if
cnv
[
3
]
==
'amp'
else
'Deletion'
sizes
+=
[(
size
,
tool
,
cnv_label
)]
if
size
<
900000
:
sizes_zoomed
+=
[(
size
,
tool
,
cnv_label
)]
tool
=
'RUBIC'
labels
=
[
'Size'
,
'Tool'
,
'CNV type'
]
df
=
pd
.
DataFrame
(
sizes
,
columns
=
labels
)
return
df
def
venn3_overlap
(
gene_lists
,
known_genes
,
out_venn
):
known
=
parse_gene_file
(
known_genes
,
np
.
inf
)
plt
.
subplots
(
figsize
=
(
7
,
7
))
venn_sets
=
[
set
(
gene_lists
[
0
]),
set
(
gene_lists
[
1
]),
set
(
known
)]
c
=
venn3
(
venn_sets
,
(
'GISTIC2'
,
'RUBIC'
,
' Known genes'
))
c
.
get_patch_by_id
(
'100'
).
set_color
(
'#5975A4'
),
c
.
get_patch_by_id
(
'010'
).
set_color
(
'#5F9E6E'
)
c
.
get_patch_by_id
(
'001'
).
set_color
(
'#857AAA'
),
c
.
get_patch_by_id
(
'011'
).
set_color
(
'#748A8F'
)
c
.
get_patch_by_id
(
'101'
).
set_color
(
'#6D77A7'
),
c
.
get_patch_by_id
(
'110'
).
set_color
(
'#5C888B'
)
c
.
get_patch_by_id
(
'111'
).
set_color
(
'#B55D60'
)
c
.
get_patch_by_id
(
'100'
).
set_alpha
(
1
),
c
.
get_patch_by_id
(
'010'
).
set_alpha
(
1
)
c
.
get_patch_by_id
(
'001'
).
set_alpha
(
1
),
c
.
get_patch_by_id
(
'011'
).
set_alpha
(
1
)
c
.
get_patch_by_id
(
'101'
).
set_alpha
(
1
),
c
.
get_patch_by_id
(
'110'
).
set_alpha
(
1
)
c
.
get_patch_by_id
(
'111'
).
set_alpha
(
1
)
plt
.
savefig
(
out_venn
,
dpi
=
300
)
def
plot_sizes
(
parsed_results
,
plot_file
):
def
plot_
histogram_
sizes
(
parsed_results
,
plot_file
):
"""Make boxplot with swarmplot on top showing sizes of recurrent regions detected by GISTIC2.0 and RUBIC."""
size_list
=
parse_sizes
(
parsed_results
)
size_list
,
size_list_known
=
parse_sizes_for_plot
(
parsed_results
)
sns
.
set_style
(
"white"
)
f
,
ax
=
plt
.
subplots
(
figsize
=
(
7
,
7
))
ax
.
set
(
yscale
=
"log"
)
g
=
sns
.
boxplot
(
x
=
"Tool"
,
y
=
"Size"
,
hue
=
"CNV type"
,
data
=
size_list
,
whis
=
np
.
inf
,
palette
=
"deep"
)
g
=
sns
.
swarmplot
(
x
=
"Tool"
,
y
=
"Size"
,
dodge
=
True
,
hue
=
"CNV type"
,
data
=
size_list
,
size
=
5
,
palette
=
{
"Amplification"
:
"#3f3f3f"
,
"Deletion"
:
"#3f3f3f"
})
g
=
sns
.
boxplot
(
x
=
"Tool"
,
y
=
"Size"
,
hue
=
"CNV type"
,
data
=
size_list
,
whis
=
np
.
inf
,
palette
=
{
"Amplification"
:
"#71AEC0"
,
"Deletion"
:
"#B55D60"
})
g
=
sns
.
swarmplot
(
x
=
"Tool"
,
y
=
"Size"
,
dodge
=
True
,
hue
=
"CNV type"
,
data
=
size_list
,
size
=
5
,
palette
=
{
"Amplification"
:
"#527F8C"
,
"Deletion"
:
"#844446"
})
g
=
sns
.
swarmplot
(
x
=
"Tool"
,
y
=
"Size"
,
dodge
=
True
,
hue
=
"CNV type"
,
data
=
size_list_known
,
size
=
5
,
palette
=
{
"Amplification"
:
"black"
,
"Deletion"
:
"black"
})
sns
.
despine
()
g
.
set_xlabel
(
"Tool"
,
fontsize
=
16
)
g
.
set_ylabel
(
"Log size of recurrent regions"
,
fontsize
=
16
)
...
...
@@ -121,108 +111,42 @@ def plot_sizes(parsed_results, plot_file):
handles
,
labels
=
g
.
get_legend_handles_labels
()
plt
.
legend
(
handles
[
0
:
2
],
labels
[:
2
],
fontsize
=
12
)
plt
.
savefig
(
plot_file
,
dpi
=
300
)
#plt.close()
def
venn_overlap
(
gene_lists
,
out_venn
):
#maybe add known genes?
"""Compare the genes detected by GISTIC and those by RUBIC."""
plt
.
subplots
(
figsize
=
(
7
,
7
))
c
=
venn2
([
set
(
gene_lists
[
0
]),
set
(
gene_lists
[
1
])],
set_labels
=
(
'GISTIC2'
,
'RUBIC'
))
c
.
get_patch_by_id
(
'10'
).
set_color
(
'#5975A4'
)
c
.
get_patch_by_id
(
'01'
).
set_color
(
'#5F9E6E'
)
c
.
get_patch_by_id
(
'10'
).
set_edgecolor
(
'#4c4c4c'
)
c
.
get_patch_by_id
(
'01'
).
set_edgecolor
(
'#4c4c4c'
)
c
.
get_patch_by_id
(
'10'
).
set_alpha
(
1
)
c
.
get_patch_by_id
(
'01'
).
set_alpha
(
1
)
c
.
get_patch_by_id
(
'11'
).
set_color
(
'#857AAA'
)
c
.
get_patch_by_id
(
'11'
).
set_edgecolor
(
'none'
)
c
.
get_patch_by_id
(
'11'
).
set_alpha
(
1
)
plt
.
savefig
(
out_venn
,
dpi
=
300
)
#plt.close()
##Wrong
def
compare_overlapping_regions
(
parsed_results
,
biomart_file
,
overlap_file
):
"""Compare the regions containing known driver genes"""
location_dict
=
{}
tool
=
"GISTIC"
def
parse_sizes_for_plot
(
parsed_results
):
"""Calculate the sizes of regions to use as input swarmplot of sizes of recurrent regions."""
sizes
,
sizes_known
=
[],
[]
tool
=
'GISTIC2.0'
for
tool_results
in
parsed_results
:
amp_sizes
,
del_sizes
=
[],
[]
for
cnv
in
tool_results
: