...
 
Commits (2)
from snakemake.utils import report
#Configure input and settings
configfile: "config.yaml"
......@@ -9,15 +11,15 @@ import os.path
#Rules to run pipeline for prioritization of regions and genes.
include: "rules/PreprocessInput.smk"
include: "rules/GISTIC2.smk"
include: "rules/Rubic.smk"
include: "rules/RUBIC.smk"
include: "rules/GenePrioritization.smk"
#Rules to compare different inputs.
include: "rules/ComparisonRegions.smk"
include: "rules/Circos.smk"
#include: "rules/SampleSizes.smk"
include: "rules/SampleSizes.smk"
#include: "rules/UseControl.smk"
#include: "rules/ComparisonSettings.smk"
include: "rules/ComparisonSettings.smk"
#Directory to save all files.
workdir: config["workdir"]
......@@ -39,11 +41,12 @@ onsuccess:
onerror:
print("\n\nPipeline failed. Possible reasons:\n- Wrong input files\n- Missing arguments in config file\n- Error in conda environment\n\n")
rule all:
"""Define desired output from pipeline."""
input:
"Reports/Results.html"
#"Settings/Report.txt",
"PipelineResults.html",
"ComparisonResults.html"
rule help:
"""Print list of all targets with help."""
......@@ -51,9 +54,8 @@ rule help:
for rule in workflow.rules:
print('- ' + rule.name + "\t" + rule.docstring)
rule report:
"""Write html report on segmentation file."""
rule report_pipeline:
"""Write HTML report on output from pipeline."""
input:
seg="Reports/Segments.txt",
tools="Reports/Tools.txt",
......@@ -62,23 +64,40 @@ rule report:
genes_gistic="Reports/Genes_GISTIC2.txt",
genes_rubic="Reports/Genes_RUBIC.txt",
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png",
circos="Reports/Circos/RecurrentRegions.png",
circos_legend="Reports/Circos/RecurrentRegions_legend.png",
known_genes="Reports/Overlap_known_genes.bed"
swarmplot="Reports/Comparison_sizes.png",
circos="Circos/RecurrentRegions_legend.png",
output:
html="PipelineResults.html"
run:
from snakemake.utils import report
report("""
====================================================
Report on the results of the prioritization pipeline
====================================================
- Report on segmentation file: seg_
- Report on comparison between tools and overlapping regions: tools_
- Table with all recurrent regions and overlapping genes: table_regions_
- Circos plot showing the raw segmentation file and recurrent regions detected by both tools: circos_
- Venn diagram showing the overlap between gene lists from both tools: venn_
- Swarmplot showing the differences in sizes between both tools: swarmplot_
""", output.html, metadata="Beatrice F. Tan (beatrice.ftan@gmail.com)", **input)
rule report_comparisons:
"""Write HTML report on comparisons between sample sizes, settings and using a control."""
input:
#size="Samplesizes/Report.txt"
circos_genes=get_list_genes_circos
output:
html="Reports/Results.html"
html="ComparisonResults.html"
run:
from snakemake.utils import report
with open(input.seg, 'r') as seg:
nr_samples = seg.readline().split("\t")[1].strip()
report("""
====================================================
Report on the results of the prioritization pipeline
====================================================
In total, {nr_samples} samples were present in the raw segmentation file.
See: Table T1_
- Report on sample size comparison:
""", output.html, metadata="Beatrice F. Tan (beatrice.ftan@gmail.com)", T1=input[0])
#**input)
""", output.html, metadata="Beatrice F. Tan (beatrice.ftan@gmail.com)", **input)
#Directories to be specified
workdir: /home/bftan/CNA_results #directory to write output
gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
#workdir: /home/bftan/CNA_results #directory to write output
#gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
#workdir: /home/beatrice/CNA_analysis
#gisticdir: /home/beatrice/CNA_analysis/run_gistic2
workdir: /home/beatrice/CNA_99_genegistic
gisticdir: /home/beatrice/CNA_analysis/run_gistic2
#Input details to download from firehose
cancer_type: SKCM
......@@ -14,22 +14,23 @@ inputfile: "" #tumor segmentation data
normal: "" #normal segmentation data
#Data for running and benchmarking tools.
reference: hg19
reference: hg19 #hg38.UCSC.add_miR.160920.refgene
prev_found_genes: input_files/intogen-CM-drivers-data.tsv
census_genes: input_files/Census_genes.txt
biomart_genes: input_files/biomart_human_genes.tsv
biomart_genes: input_files/biomart_human_genes_hg19.tsv #wrong genome build
ID_to_GO: input_files/ID_to_GO.txt
#Settings GISTIC2.0
gistic_precision: "99"
settings_gistic: ""
settings_gistic: "-brlen 0.98 -genegistic 1"
comparison_settings: ["-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 1 -conf 0.99",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 1 -conf 0.75",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.98 -cap 1.5 -rx 1 -genegistic 1 -conf 0.75",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.98 -cap 1.5 -rx 1 -genegistic 0 -conf 0.75",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 0 -conf 0.75",]
#GISTIC2.0 settings to compare
comparison_precision: ["99", "90", "75"]
comparison_settings: ["-brlen 0.7 -genegistic 1",
"-brlen 0.98 -genegistic 1",
"-brlen 0.98 -genegistic 0",
"-brlen 0.7 -genegistic 0"]
#Settings for sample size differences
sizes: [20, 30, 40, 50, 60, 70, 80, 90]
sizes: [20, 30, 40, 50, 60, 70, 80, 90, 100, 110]
repeats: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
KIT
PDGFRA
KDR
CDK4
CCND1
MDM2
TERT
BRAF
MITF
PD-L1
NRAS
CD274
This diff is collapsed.
from Circos import InputCircos, bed_to_circos, make_CIRCOS_legend
from Circos import InputCircos, bed_to_circos, make_CIRCOS_legend, get_plot_region
from ReportTools import make_bed_genes
from PIL import Image
import os.path
rule make_CIRCOS_input:
"""Make input files for making a CIRCOS plot."""
......@@ -9,20 +11,20 @@ rule make_CIRCOS_input:
rubic_gains="RUBIC/gains.txt",
rubic_losses="RUBIC/losses.txt"
output:
seg="Reports/Circos/Segments.txt",
gistic="Reports/Circos/GISTIC_results.txt",
rubic="Reports/Circos/RUBIC_results.txt",
seg="Circos/Segments.txt",
gistic="Circos/GISTIC_results.txt",
rubic="Circos/RUBIC_results.txt",
run:
InputCircos(input.seg, input.gistic, input.rubic_gains, input.rubic_losses, output.seg, output.gistic, output.rubic)
rule make_CIRCOS_plot:
rule plot_CIRCOS:
"""Make CIRCOS plot of recurrent regions in RUBIC and GISTIC2.0"""
input:
seg="Reports/Circos/Segments.txt",
gistic="Reports/Circos/GISTIC_results.txt",
rubic="Reports/Circos/RUBIC_results.txt",
seg="Circos/Segments.txt",
gistic="Circos/GISTIC_results.txt",
rubic="Circos/RUBIC_results.txt",
output:
"Reports/Circos/RecurrentRegions.png"
"Circos/RecurrentRegions.png"
params:
conf=workflow.basedir + "/scripts/circos/circos.conf"
conda:
......@@ -33,72 +35,72 @@ rule make_CIRCOS_plot:
rule add_legend_CIRCOS:
"""Add a custom legend to the CIRCOS plot."""
input:
circos="Reports/Circos/RecurrentRegions.png",
circos="Circos/RecurrentRegions.png",
output:
legend="Reports/Circos/legend.png",
circos="Reports/Circos/RecurrentRegions_legend.png"
legend="Circos/legend.png",
circos="Circos/RecurrentRegions_legend.png"
run:
make_CIRCOS_legend(input.circos, output.legend, output.circos)
rule make_CIRCOS_zoom_input: #necessary?
"""Make input files for making a circos diagram."""
rule make_bed_genes_census:
input:
bed="Reports/Overlap_known_genes.bed"
gene_file=os.path.join(workflow.basedir, config["census_genes"])
output:
gistic="Reports/Circos/Zoom/GISTIC.txt",
rubic="Reports/Circos/Zoom/RUBIC.txt",
genes="Reports/Circos/Zoom/Genes.txt",
bed="Reports/Locations_census_genes.bed"
params:
ref=os.path.join(workflow.basedir, config["reference"]),
biomart=os.path.join(workflow.basedir, config["biomart_genes"])
run:
bed_to_circos(input.bed, output.rubic, output.gistic, output.genes)
make_bed_genes(input.gene_file, params.biomart, output.bed, params.ref)
def get_list_genes(overlapping_genes, locations_known_genes):
plot_list = []
list_overlapping_genes = []
with open(overlapping_genes, 'r') as plot_genes:
for line in plot_genes:
chrom, start, end = line.strip().split("\t")
chrom = "chr" + chrom.strip("hs")
list_overlapping_genes.append([chrom, start, end])
with open(locations_known_genes, 'r') as known_genes:
known_genes.readline()
for line in known_genes:
chrom, start, end, gene_name = line.strip().split("\t")
if [chrom, start, end] in list_overlapping_genes:
plot_list.append(gene_name)
return(plot_list)
rule make_bed_genes_known:
input:
gene_file=os.path.join(workflow.basedir, config["prev_found_genes"])
output:
bed="Reports/Locations_known_genes.bed"
params:
ref=os.path.join(workflow.basedir, config["reference"]),
biomart=os.path.join(workflow.basedir, config["biomart_genes"])
run:
make_bed_genes(input.gene_file, params.biomart, output.bed, params.ref)
def get_plot_region(gene_name, locations_known_genes):
with open(locations_known_genes, 'r') as known_genes:
known_genes.readline()
for line in known_genes:
if gene_name == line.strip().split("\t")[3]:
chrom, start, end, name = line.strip().split("\t")
chrom_region = "hs" + chrom.strip("chr")
if int(start) < 1000000:
start_region = 0
else:
start_region = int(start) - 1000000
end_region = int(end) + 1000000
plot_region = chrom_region + ":" + str(start_region) + "-" + str(end_region)
return plot_region
rule make_CIRCOS_input_genes: #toevoegen aan make_CIRCOS_input
input:
bed="Reports/Locations_{type}_genes.bed"
output:
circos="Circos/{type}_genes.txt"
run:
bed_to_circos(input.bed, output.circos)
def get_list_genes_circos(wildcards):
"""Extract list of known genes to produce list of file names to save CIRCOS plot for each gene"""
list_genes = []
with open(os.path.join(workflow.basedir, config["prev_found_genes"])) as known:
for line in known:
gene = line.split("\t")[2]
gene_file = "Circos/KnownGenes/" + gene + ".png"
list_genes.append(gene_file)
return list_genes
#rule make_CIRCOS_zoom_plots:
# """Compare locations of known genes, recurrent regions from RUBIC and recurrent regions from GISTIC2."""
# input:
# gistic="Reports/Circos/Zoom/GISTIC.txt", #all gistic regions
# rubic="Reports/Circos/Zoom/RUBIC.txt", #all rubic regions
# genes="Reports/Circos/Zoom/Genes.txt",
# known="Reports/Locations_known_genes.bed", #use locations known genes to extract list of known genes and run rule for each gene maybe also not overlapping, but region around gene
# list_genes = lambda
# params:
# known="Reports/Locations_known_genes.bed",
# genes="Reports/Circos/Zoom/Genes.txt",
# conf=workflow.basedir + "/scripts/circos/circos_zoom.conf",
# chrom=get_plot_region(wildcards.gene, input.known)
# output:
# plot=expand("Reports/Overlap_plots/{gene}.png", gene=get_list_genes(params.genes, params.known))
rule plot_CIRCOS_per_gene: #Two genes won't plot because region is outside chromosome?
"""Compare locations of known genes, recurrent regions from RUBIC and recurrent regions from GISTIC2.
Use 'get_list_genes_circos' as input for a rule (e.g. rule all) to produce a plot for each known gene.'"""
input:
gistic="Circos/GISTIC_results.txt",
rubic="Circos/RUBIC_results.txt",
genes="Circos/known_genes.txt",
census="Circos/census_genes.txt",
known_genes="Reports/Locations_known_genes.bed",
overlap="Reports/Overlap_known_genes.bed"
output:
plot="Circos/KnownGenes/{gene}.png"
params:
conf=workflow.basedir + "/scripts/circos/circos_zoom.conf",
#chrom=lambda wildcards, input: get_plot_region(wildcards.gene, input.known_genes, input.overlap),
#units=lambda wildcards, input: get_plot_units(wildcards.gene, input.known_genes, input.overlap)
# conda:
# workflow.basedir + "/envs/circos.yaml"
# shell:
# "circos -conf {params.conf} -outputfile {output.plot} -param gistic_file={input.gistic} -param rubic_file={input.rubic} \
# -param gene_file={input.genes} -param chrom={params.chrom}"
run:
chrom, units = get_plot_region(wildcards.gene, input.known_genes, input.overlap)
shell("circos -conf {params.conf} -outputfile {output.plot} -param gistic_file={input.gistic} -param rubic_file={input.rubic} \
-param gene_file={input.genes} -param census_file={input.census} -param chrom=" + chrom + " -param units=" + str(units))
......@@ -17,8 +17,7 @@ rule report_tools:
genes_gistic="Reports/Genes_GISTIC2.txt",
genes_rubic="Reports/Genes_RUBIC.txt",
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png",
bed_known="Reports/Locations_known_genes.bed"
size_plot="Reports/Comparison_sizes.png"
params: #select input files from repository or own input files
census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"],
known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"],
......@@ -27,9 +26,9 @@ rule report_tools:
run:
ReportTools.make_report(input.gistic, input.rubic_gain, input.rubic_loss,
params.census, params.known, params.ref, params.biomart_info,
output.tools, output.table_regions, output.venn, output.swarmplot,
output.tools, output.table_regions, output.venn, output.size_plot,
output.genes_both, output.genes_gistic, output.genes_rubic,
input.overlap, output.bed_known)
input.overlap)
rule get_overlap_GISTIC_RUBIC:
"""Intersect the recurrent regions detected by RUBIC and GISTIC2.0."""
......
def get_settings(nr_settings, all_settings):
print(nr_settings)
print(all_settings)
print(list(range(len(config["comparison_settings"]))))
from ParseResults import parse_regions, get_stats
from ReportTools import make_tool_report
rule gistic_settings:
"""Run GISTIC2 based on different settings."""
"""Run GISTIC2 using different settings."""
input:
gistic_directory=os.path.join(config["gisticdir"], "gistic2"),
seg="Input/Segments_tumor.txt",
lambda wildcards: config["comparison_settings"][wildcards.setting]
output:
expand("Settings/GISTIC_{setting_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", setting_nr=range(len(config["comparison_settings"]))),
"Settings/GISTIC_{setting_nr}/regions_track.conf_" + config["gistic_precision"] + ".bed"
"Settings/GISTIC.{setting}_{precision}/all_lesions.conf_{precision}.txt",
"Settings/GISTIC.{setting}_{precision}/regions_track.conf_{precision}.bed"
params:
cnv="",
ref=config["reference"],
ref_file="",
extra="wildcards.setting",
confidence=config["gistic_precision"]
extra=lambda wildcards: config["comparison_settings"][int(wildcards.setting)], #get_settings(wildcards.setting),
confidence=lambda wildcards: wildcards.precision
wrapper:
"file:" + workflow.basedir + "/wrappers/GISTIC2"
def get_list_settings(wildcards):
"""Extract list of known genes to produce list of file names to save CIRCOS plot for each gene"""
file_names = []
for i in range(len(config["comparison_settings"])):
for precision in config["comparison_precision"]:
file_name = "Settings/GISTIC." + str(i) + "_" + precision + "/all_lesions.conf_" + precision + ".txt"
file_names.append(file_name)
return file_names
def get_settings(setting):
"""Extract setting to use based on wildcard.setting, which is a number."""
list_settings = config["comparison_settings"]
return list_settings[int(setting)]
rule compare_settings:
input:
"Settings/GISTIC_{setting_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt"
get_list_settings
output:
"Settings/Report.txt"
report="Settings/Report.txt"
params:
settings=config["comparison_settings"],
census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"],
known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"],
ref=config["reference"]
run:
with open(output[0], 'w') as out:
out.write(input[0])
SettingReport(input, output.report, params.settings, params.known, params.census, params.ref)
def SettingReport(setting_results, report_file, settings, known_genes, census_genes, ref_genome):
stats = []
legend = []
print(setting_results)
for result in setting_results:
file_ID = result.split("GISTIC.")[1].split("/all_lesions")[0]
setting_nr, precision = file_ID.split("_")
used_setting = settings[int(setting_nr)]
parsed_results = parse_regions(result, known_genes, census_genes, 'GISTIC', ref_genome)
stats_results = get_stats(parsed_results, file_ID)
stats.append(stats_results)
legend.append([setting_nr, used_setting])
make_tool_report(report_file, stats)
with open(report_file, 'a') as out:
out.write("\n\nLegend:\n")
for nr in legend:
out.write("\t".join(nr) + "\n")
#& bed file with all regions
......@@ -4,7 +4,7 @@ rule do_GO_analysis:
gene_list="Reports/Genes_{tool}.txt"
output:
table="GO/Enriched_GOs_{tool}.txt", #Add gene names to table
plot="GO/Enriched_GOs_{tool}.jpg"
plot="GO/Enriched_GOs_{tool}.png"
params:
organism="hsapiens", #default is human
ontology="BP", #MF, BP, CC or all
......@@ -17,7 +17,7 @@ rule do_GO_analysis:
rule compare_enriched_GOs:
"""Compare the top 50 GO terms detected by RUBIC and GISTIC2.0"""
input:
go=expand("GO/Enriched_GOs_{tool}.txt", tool=["GISTIC2", "RUBIC"])
go=expand("GO/Enriched_GOs_{tool}.txt", tool=["GISTIC2", "RUBIC", "both"])
output:
"GO/comparison.txt"
run:
......
......@@ -37,15 +37,16 @@ rule define_input_pipeline:
inputfile=config["inputfile"],
normalfile=config["normal"]
run:
if input[0] == params.inputfile: #use provided input file
if input[0] == params.inputfile: #use provided input file
shell("cp {params.inputfile} {output.tumor}")
if config["normal"] != "":
shell("cp {params.normalfile} {output.normal}")
else:
shell("touch {output.normal}")
else: #split firehose data in tumor and normal files.
else: #split firehose data in tumor and normal files.
split_normal_tumor(input[0], output.tumor, output.normal)
rule report_segmentation_file:
"""Report information on the input segmentation file."""
input:
......@@ -67,13 +68,16 @@ def split_normal_tumor(all_samples, out_tumor, out_normal):
normal.write(header)
for line in old:
sample = line.split("\t")[0]
type_ID = sample.split("-")[3] #.strip("A")
type_ID = sample.split("-")[3]
if type_ID == "06A": #TM: Metastatic
tumor.write(line)
#tumor.write(line)
normal.write(line)
elif type_ID == "01A": #TP: Primary Solid Tumor
tumor.write(line)
pass
#tumor.write(line)
elif type_ID == "10A" or type_ID == "11A": #NB: Blood Derived Normal or NT: Solid Tissue Normal
normal.write(line)
#normal.write(line)
tumor.write(line)
else:
raise ValueError("Unkonwn sample type: " + type_ID + \
"\nPlease check samples report: http://gdac.broadinstitute.org/runs/stddata__latest/samples_report/")
......@@ -20,7 +20,7 @@ rule run_RUBIC:
out_plots="RUBIC/plots"
params:
fdr="0.25",
genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["bimart_genes"]
genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["biomart_genes"]
benchmark:
"Benchmarks/RUBIC." + str(datetime.datetime.now()).replace(" ", "_") + ".txt"
wrapper:
......@@ -32,5 +32,7 @@ rule make_bed_file_RUBIC:
losses="RUBIC/losses.txt"
output:
bed="RUBIC/regions_track.bed"
params:
ref=config["reference"]
run:
BedFile(input.gains, input.losses, output.bed)
BedFile(input.gains, input.losses, output.bed, params.ref)
......@@ -2,7 +2,7 @@ import ReportSizes
from SampleSizes import SegFile
import os.path
import datetime
from AUC import ROC_curve
from PrecisionRecall import plot_PrecisionRecall
rule get_segmentation_files_subsets:
"""Create segmentation files with different numbers of samples (randomly chosen) for a number of times."""
......@@ -41,7 +41,7 @@ rule run_RUBIC_subsets:
out_plots="Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/plots"
params:
fdr="0.25",
genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["bimart_genes"]
genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["biomart_genes"]
benchmark:
"Benchmarks/RUBIC." + str(datetime.datetime.now()).replace(" ", "_") + ".txt"
conda:
......@@ -106,7 +106,7 @@ rule compare_subset_truth_RUBIC:
"bedtools intersect -a {input.bed_subsets} -b {input.bed_truth} -wao > {output.subset} && \
bedtools intersect -a {input.bed_truth} -b {input.bed_subsets} -wao > {output.truth}"
rule make_ROC_plot:
rule make_Precision_Recall_plot:
"""Make ROC plot on the precision and recall from the subsets using GISTIC and RUBIC."""
input:
gistic_subset=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/Overlap_subset_truth.bed", rand_nr=config["sizes"], rep_nr=config["repeats"]),
......@@ -114,8 +114,9 @@ rule make_ROC_plot:
rubic_subset=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/Overlap_subset_truth.bed", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_truth=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/Overlap_truth_subset.bed", rand_nr=config["sizes"], rep_nr=config["repeats"]),
output:
AUC="Samplesize/Precision_recall.png"
plot="Samplesize/Precision_recall.png",
plot_avg="Samplesize/Precision_recall_avg.png"
params:
sizes=config["sizes"]
run:
ROC_curve(input.gistic_subset, input.gistic_truth, input.rubic_subset, input.rubic_truth, output.AUC, params.sizes)
plot_PrecisionRecall(input.gistic_subset, input.gistic_truth, input.rubic_subset, input.rubic_truth, output.plot, output.plot_avg, params.sizes)
import ReportControl
rule run_GISTIC_control:
"""Run GISTIC2 for the tumor segmentation data with data from control samples included."""
input:
gistic_directory=os.path.join(config["gisticdir"], "gistic2"),
seg="Input/Segments_tumor.txt"
output:
"Control/all_lesions.conf_" + config["gistic_precision"] + ".txt",
"Control/regions_track.conf_" + config["gistic_precision"] + ".bed"
params:
cnv="Input/Segments_normal.txt",
ref=config["reference"],
ref_file="",
extra="",
confidence=config["gistic_precision"]
wrapper:
"file:" + workflow.basedir + "/wrappers/GISTIC2"
rule report_control:
"""Report the differences between using a control and without using a control."""
input:
control="Control/",
nocontrol="GISTIC2/"
output:
"Reports/Control.txt"
params:
census=config["census_genes"],
known=config["prev_found_genes"],
ref=config["reference"]
run:
ReportControl.make_report(input.control, input.nocontrol, output[0], params.census, params.known, params.ref)
......@@ -28,7 +28,7 @@ class InputCircos:
recurrent.readline()
for line in recurrent:
lineparts = line.split("\t")
loc = lineparts[3].split("(probes")[0]
loc = lineparts[2].split("(probes")[0]
chrom, bp = loc.split(":")
start, end = bp.split("-")
chrom = chrom.strip("chr")
......@@ -57,10 +57,11 @@ def make_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png):
gistic = mpatches.Patch(color='#5975A4', label='GISTIC2.0 regions')
rubic = mpatches.Patch(color='#5F9E6E', label='RUBIC regions')
legend = plt.legend(handles=[gains, losses, gistic, rubic], loc=3, framealpha=1, frameon=False)
plt.axis('off')
fig = legend.figure
fig.canvas.draw()
bbox = legend.get_window_extent().transformed(fig.dpi_scale_trans.inverted())
fig.savefig(legend_png, dpi=400, bbox_inches=bbox)
fig.savefig(legend_png, dpi=300, bbox_inches=bbox)
concat_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png)
def concat_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png):
......@@ -73,20 +74,72 @@ def concat_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png):
circos.paste(legend, offset)
circos.save(concatted_png)
def bed_to_circos(bed_file, circos_file):
"""Convert bed file to CIRCOS input file."""
with open(circos_file, 'w') as out:
with open(bed_file, 'r') as bed:
bed.readline()
for line in bed:
chrom, start, end, gene_name = line.strip().split("\t")
chrom = 'hs' + chrom.strip("chr")
out.write(" ".join([chrom, start, end, gene_name]) + "\n")
def bed_to_circos(bed_file, rubic_file, gistic_file, gene_file):
"""Convert bed file to CIRCOS input files."""
rubic = open(rubic_file, 'w')
gistic = open(gistic_file, 'w')
genes = open(gene_file, 'w')
with open(bed_file, 'r') as bed:
for line in bed:
gene_chrom, gene_start, gene_end, gene_name, tool_name, tool_chrom, tool_start, tool_end, \
amp_name, overlap_bp = line.split("\t")
chrom = 'hs' + gene_chrom.strip("chr")
genes.write(" ".join([chrom, gene_start, gene_end]) + "\n")
if tool_name == 'GISTIC':
gistic.write(" ".join([chrom, tool_start, tool_end]) + "\n")
else:
rubic.write(" ".join([chrom, tool_start, tool_end]) + "\n")
rubic.close(), gistic.close(), genes.close()
def get_plot_region(gene_name, locations_known_genes, overlap_tools):
"""Define which region to plot around the known gene, based on size of gene and overlapping regions."""
plot_region, units = calculate_region_units(gene_name, locations_known_genes, overlap_tools)
return plot_region, units
def calculate_region_units(gene_name, locations_known_genes, overlap_tools):
plot_region = "hs1:0-5000000" #unknown gene in input file
units = 1000000
with open(locations_known_genes, 'r') as known_genes:
known_genes.readline()
plot_region = ''
for line in known_genes:
if gene_name == line.strip().split("\t")[3]:
chrom, start, end, name = line.strip().split("\t")
chrom_region = "hs" + chrom.strip("chr")
start_region, end_region = define_region(overlap_tools, chrom, start, end)
size_region = end_region - start_region
units = round(size_region / 5, -2) #choose units based on size difference of 5
plot_end = round(end_region / units, 1)
plot_start = round(start_region / units, 1)
plot_region = chrom_region + ":" + str(plot_start) + "-" + str(plot_end)
if plot_region == '' or "_" in plot_region:
plot_region = "hs1:0-5000000"
return plot_region, int(units)
def define_region(overlap_tools, chrom, start, end):
"""Make plot region wider when overlapping recurrent region is larger to examine how precise region was called.
Positions should be in million basepairs (change chromosome_units in circos_zoom.conf to alter this)."""
gistic_start, gistic_end = check_overlap(overlap_tools, 'GISTIC', chrom, start, end)
rubic_start, rubic_end = check_overlap(overlap_tools, 'RUBIC', chrom, start, end)
if gistic_start == 'N/A' and gistic_end == 'N/A' and rubic_start == 'N/A' and rubic_end == 'N/A':
start_region = int(start) - 10000000
end_region = int(end) + 10000000
else:
start_region = int(start)
end_region = int(end)
if gistic_start != 'N/A' and int(gistic_start) < int(start):
start_region = int(gistic_start)
if rubic_start != 'N/A' and int(rubic_start) < int(start_region):
start_region = int(rubic_start)
if gistic_end != 'N/A' and int(gistic_end) > int(end_region):
end_region = int(gistic_end)
if rubic_end != 'N/A' and int(rubic_end) > int(end_region):
end_region = int(rubic_end)
size_region = end_region - start_region
start_region = start_region - int(0.2 * size_region)
end_region = end_region + int(0.2 * size_region)
return start_region, end_region
def check_overlap(overlap_file, tool_name, chrom, start, end):
out_start, out_end = 'N/A', 'N/A'
with open(overlap_file, 'r') as overlap:
for line in overlap:
gene_chrom, gene_start, gene_end, gene_name, tool, tool_chrom, tool_start, tool_end, tool_ID, tool_bp = line.strip().split("\t")
if tool == tool_name:
if gene_chrom == chrom and gene_start == start and gene_end == end:
out_start, out_end = tool_start, tool_end
return out_start, out_end
This diff is collapsed.
......@@ -2,23 +2,28 @@ import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
def ROC_curve(gistic_subset, gistic_truth, rubic_subset, rubic_truth, plot_file, list_sizes):
def plot_PrecisionRecall(gistic_subset, gistic_truth, rubic_subset, rubic_truth, plot_file, plot_avg_file, list_sizes):
plot_recall, plot_precision, plot_avg_recall, plot_avg_precision = [], [], [], []
for tool in (gistic_subset, gistic_truth), (rubic_subset, rubic_truth):
list_recall, list_precision = get_list_rates(tool[0], tool[1], list_sizes)
plot_recall.append(list_recall) #avg_recall)
plot_precision.append(list_precision) #.append(#avg_precision)
print(list_recall)
print(list_precision)
plot_recall.append(list_recall)
plot_precision.append(list_precision)
avg_recall, avg_precision = calculate_averages(list_recall, list_precision)
plot_avg_recall.append(avg_recall) #avg_recall)
print(avg_recall)
print(avg_precision)
plot_avg_recall.append(avg_recall)
plot_avg_precision.append(avg_precision)
plot_ROC(plot_recall, plot_precision, plot_file, list_sizes, False)
plot_ROC(plot_avg_recall, plot_avg_precision, plot_file + "_avg.png", list_sizes, True)
make_plot(plot_recall, plot_precision, plot_file, list_sizes, False)
make_plot(plot_avg_recall, plot_avg_precision, plot_avg_file, list_sizes, True)
def get_list_rates(subset_files, truth_files, list_sizes):
list_recall = [[] for i in range(len(list_sizes))]
list_precision = [[] for i in range(len(list_sizes))]
for i in range(len(subset_files)):
size_file = subset_files[i].split("Size")[1].split(".Rep")[0]
print(size_file)
recall, precision = calculate_precision_recall(subset_files[i], truth_files[i])
for size in range(len(list_sizes)):
if str(list_sizes[size]) == size_file:
......@@ -36,8 +41,8 @@ def calculate_precision_recall(bed_subset, bed_truth):
line = region.split("\t")
if line[4] == '.':
FP += 1
else:
TP += 1
#else:
# TP += 1
with open(bed_truth, 'r') as truth: #recall
for region in truth:
if region.startswith("track"):
......@@ -46,8 +51,8 @@ def calculate_precision_recall(bed_subset, bed_truth):
line = region.split("\t")
if line[4] == '.':
FN += 1
#else:
# TP += 1
else:
TP += 1
recall = TP / float(TP + FN)
precision = TP / float(TP + FP)
return recall, precision
......@@ -62,7 +67,7 @@ def calculate_averages(list_recall, list_precision):
avg_precision.append(avg)
return avg_recall, avg_precision
def plot_ROC(list_recall, list_precision, plot_file, list_sizes, avg):
def make_plot(list_recall, list_precision, plot_file, list_sizes, avg):
plt.figure()
if avg:
plt.xlim([0, 1])
......
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
from ParseResults import parse_regions, get_stats
from collections import OrderedDict
def make_report(control_results, nocontrol_results, report_file, census_genes, known_genes, ref_genome):
"""Make a report on analyses using control samples or without using them."""
parsed_tools, stats_tools = [], []
with open(report_file, 'w') as out:
row_names = (["Control used?", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"])
for result in control_results, nocontrol_results:
parsed_results = parse_regions.gistic_results(result)
parsed_tools.append(parsed_results)
label = "Control" if result == control_results else "No control"
stats_results = stats().calculate_stats(parsed_results, census_genes, known_genes, label)
stats_tools.append(stats_results)
#make_gene_files(parsed_tools[0], parsed_tools[1], file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_venn)
for i in range(len(row_names)):
list_out = [row_names[i], stats_tools[0][0][i], stats_tools[0][1][i], stats_tools[1][0][i], stats_tools[1][1][i]]
list_out = list(map(str, list_out))
out.write("\t".join(list_out) + "\n")
#compare_known_regions(known_genes, file_overlap, gene_file, gistic_results, rubic_results)
......@@ -19,7 +19,7 @@ def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes,
size, repetition = size_rep.split(".Rep")
for tool in 'GISTIC', 'RUBIC':
size_file = (size_rubic_gains[i], size_rubic_losses[i]) if tool == 'RUBIC' else size_gistic[i]
parsed_results = parse_regions(size_file, known_genes, census_genes, tool)
parsed_results = parse_regions(size_file, known_genes, census_genes, tool, ref_genome)
#if tool not in all_results.keys():
# all_results[tool] = {}
# all_results[tool][size] = [parsed_results]
......@@ -33,23 +33,25 @@ def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes,
for stat in stat_list[5:]:
converted_stats.append(float(stat.split(" (")[0]))
list_stats.append(converted_stats)
#overlap_genes(all_results, report_file)
overlap_genes(all_results, report_file)
make_plots(list_stats, reps, sizes, plot_dir)
def overlap_genes(all_results, report_file):
known_genes = {}
with open(report_file, 'w') as out:
for tool in all_results.keys():
out.write(tool + '\n')
for size in all_results[tool].keys():
out.write(size + '\n')
for cnv_type in size:
out.write(cnv_type[5])
out.write("\n".join(list(all_results.keys())))
# known_genes = {}
# with open(report_file, 'w') as out:
# for tool in all_results.keys():
# out.write(tool + '\n')
# for size in all_results[tool].keys():
# out.write(size + '\n')
# for cnv_type in size:
# out.write(cnv_type[5])
def make_plots(list_stats, reps, sizes, plot_dir):
plot_y_axis = (['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
'Number of genes', 'Nr. regions with known genes', 'Nr. regions with census genes'])
'Number of genes', 'Number of known regions', 'Number of census regions'])
df_stats = pd.DataFrame(list_stats, columns=['Tool', 'Sample size', 'Type'] + plot_y_axis)
for plot_name in plot_y_axis:
plot_size_differences(df_stats, plot_name, sizes, len(reps), plot_dir)
......@@ -60,10 +62,15 @@ def plot_size_differences(df_stats, value_y_axis, list_sizes, nr_reps, plot_dir)
"""Plot the differences between analyses using different sample sizes."""
sns.set_style("whitegrid")
g = sns.factorplot(x="Sample size", y=value_y_axis, col="Type", hue="Tool", data=df_stats, kind="box",
size=5, aspect=1, palette=["#5975A4","#5F9E6E"])
g.set_axis_labels("Sample size", value_y_axis).set_titles("{col_name}").despine(bottom=True)
size=5, aspect=1, palette=["#5975A4","#5F9E6E"], order=list(map(str, list_sizes)))
if value_y_axis in ['Average size of regions (Kb)', 'Total size (Mb)', 'Number of genes']:
label_y_axis = 'Log ' + value_y_axis[0].lower() + value_y_axis[1:]
g.fig.get_axes()[0].set_yscale('log')
else:
label_y_axis = value_y_axis
g.set_axis_labels("Sample size", label_y_axis).set_titles("{col_name}").despine(bottom=True)
#add_significance(g, df_stats, list_sizes, value_y_axis)
png_file = os.path.join(plot_dir, value_y_axis.replace(" ", "_") + ".png")
png_file = os.path.join(plot_dir, value_y_axis.split(" (")[0].replace(" ", "_") + ".png")
plt.savefig(png_file, dpi=300)
plt.close()
......@@ -71,8 +78,8 @@ def plot_size_differences_per_tool(df_stats, value_y_axis, list_sizes, nr_reps,
"""Plot the differences between analyses using different sample sizes."""
sns.set_style("whitegrid")
df_tool = df_stats[df_stats['Tool']==tool]
g = sns.factorplot(x="Sample size", y=value_y_axis, col="Type", data=df_tool, kind="box",
size=5, aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g = sns.factorplot(x="Sample size", y=value_y_axis, col="Type", data=df_tool, kind="box", order=list(map(str, list_sizes)),
size=5, aspect=1, palette=sns.cubehelix_palette(10, start=.5, rot=-.75, dark=.2))
g.set_axis_labels("Sample size", value_y_axis).set_titles("{col_name}").despine(bottom=True)
#add_significance(g, df_stats, list_sizes, value_y_axis)
png_file = os.path.join(plot_dir, value_y_axis.replace(" ", "_") + "_" + tool + ".png")
......
This diff is collapsed.
This diff is collapsed.
......@@ -13,7 +13,8 @@ class MarkerFile:
segments.readline()
for line in segments:
sample, chrom, start, end, num_probes, segment_mean = line.strip().split("\t")
pos = int(start) + int(int(end) - int(start) / 2)
seg_size = int(end) - int(start)
pos = int(start) + int(seg_size / 2)
if chrom not in positions:
positions[chrom] = [pos, int(start), int(end)]
else:
......@@ -32,9 +33,9 @@ class MarkerFile:
markers.write("\t".join(marker_line) + "\n")
marker_nr += 1
def BedFile(gains_file, losses_file, bed_file):
def BedFile(gains_file, losses_file, bed_file, ref_genome):
"""Create bed file from results."""
regions = parse_rubic_results(gains_file, losses_file, [], [])
regions = parse_rubic_results(gains_file, losses_file, [], [], ref_genome)
with open(bed_file, 'w') as out:
out.write("track name=RUBIC color=0,0,255\n")
cnv_count = 1
......
<<include etc/colors_fonts_patterns.conf>>
<<include ideogram.conf>>
<<include ticks.conf>>
<<include ticks_zoom.conf>>
<image>
<<include etc/image.conf>>
</image>
karyotype = data/karyotype/karyotype.human.txt
chromosomes_units = 1000000
karyotype = data/karyotype/karyotype.human.txt #cytoBandIdeo.txt
chromosomes_units = conf(units)
chromosomes = conf(chrom)
chromosomes_display_default = no
<highlights>
<plots>
<plot>
type = text
color = black
file = conf(gene_file)
r0 = 0.8r
r1 = 1.0r
label_size = 40p
label_font = condensed
label_center = yes
padding = 0.05r
rpadding = 40p
</plot>
</plots>
<highlights>
z = 5
<highlight>
file = conf(census_file)
r0 = 1.0r
r1 = 0.8r
fill_color = 109,119,167
z = 10
</highlight>
<highlight>
file = conf(gene_file)
r0 = 1.0r
r1 = 0.8r
fill_color = 169,104,106
z = 5
</highlight>
<highlight>
file = conf(gistic_file)
r0 = 0.9r
r1 = 0.7r
r0 = 0.8r
r1 = 0.6r
fill_color = 89,117,164
</highlight>
<highlight>
file = conf(rubic_file)
r0 = 0.7r
r1 = 0.5r
r0 = 0.6r
r1 = 0.4r
fill_color = 95,158,110
</highlight>
<highlight>
file = conf(gene_file)
r0 = 1.0r
r1 = 0.9r
r1 = 0.8r
fill_color = 109,119,167
</highlight>
......
This diff is collapsed.
......@@ -5,5 +5,5 @@ label_font = default
label_radius = dims(image,radius) - 50p
label_size = 36
label_parallel = yes
label_case = lower
label_format = eval(sprintf("Chr%s",var(label)))
label_case = upper
label_format = eval(sprintf("chr%s",var(label)))
show_ticks = yes
show_tick_labels = yes
<ticks>
tick_separation = 3p
label_separation = 5p
radius = dims(ideogram,radius_outer)
multiplier = 1 / conf(units)
color = black
size = 35p
thickness = 4p
label_offset = 5p
format = %d
<tick>
spacing = 1u
show_label = yes
label_size = 24p
</tick>
<tick>
spacing = 5u
show_label = yes
label_size = 24p
</tick>
<tick>
spacing = 10u
show_label = yes
label_size = 24p
</tick>
<tick>
spacing = 20u
show_label = yes
label_size = 24p
</tick>
</ticks>
import pyensembl
ensembl = pyensembl.EnsemblRelease(75)
def install_ensembl(reference_genome):
"""Import ensembl if needed and load correct release based on reference genome."""
if "hg19" in reference_genome:
if "19" in reference_genome or "37" in reference_genome:
ensembl = pyensembl.EnsemblRelease(75)
elif "38" in reference_genome:
ensembl = pyensembl.EnsemblRelease(87)
else:
raise ValueError("Unknown reference genome used.")
try: #only done first time to install ensembl version.
try:
ensembl.gene_names_at_locus(1,1,10000)
except: #only done first time to install ensembl version.
ensembl.download()
ensembl.index()
except:
pass
return ensembl
def genes_at_locus(chrom, start, end):
"""Get list of gene IDs at certain location."""
IDs = []
def genes_at_locus(chrom, start, end, ref_genome):
"""Get list of gene names at a genomic location."""
ensembl = install_ensembl(ref_genome)
overlapping_names = []
gene_info = ensembl.gene_names_at_locus(chrom, start, end)
for gene in gene_info:
IDs.append(gene)
return IDs
overlapping_names.append(gene)
return overlapping_names
def gene_name_to_ID(gene_names):
list_genes = []
def gene_IDs_at_locus(chrom, start, end, ref_genome):
ensembl = install_ensembl(ref_genome)
overlapping_IDs = []
gene_info = ensembl.genes_at_locus(chrom, start, end)
for gene in gene_info:
overlapping_IDs.append(gene.gene_id)
return overlapping_IDs
def gene_name_to_ID(gene_names, ref_genome):
"""Convert a list of gene names to a list of gene IDs"""
ensembl = install_ensembl(ref_genome)
gene_IDs = []
for gene in gene_names:
try:
gene_ID = ensembl.gene_ids_of_gene_name(gene)
list_genes.append(gene_ID[0])
gene_IDs.append(gene_ID[0])
except:
#print(gene)
pass
return list_genes
return gene_IDs
def gene_ID_to_name(gene_IDs, ref_genome):
"""Convert a list of gene names to a list of gene IDs"""
ensembl = install_ensembl(ref_genome)
gene_names = []
for gene in gene_IDs:
try:
gene_name = ensembl.gene_name_of_gene_id(gene)
gene_names.append(gene_name)
except:
gene_names.append(gene)
return gene_names
channels:
- bioconda
dependencies:
- python
......@@ -31,9 +31,9 @@ else:
#Additional arguments
extra = snakemake.params.get("extra", "")
if snakemake.params.confidence != "":
extra += "-conf 0." + snakemake.params.confidence
extra += " -conf 0." + snakemake.params.confidence
#Make output directory, go to gistic2 directory and run gistic2
command = "./gistic2 -b " + outfolder + " -seg " + segments + " -refgene " + reference + cnv_arg + extra
print(command)
shell("mkdir -p {outfolder} && cd {gistic_dir} && ./gistic2 -b {outfolder} -seg {segments} -refgene {reference} {cnv_arg} {extra} && cd -")
shell("mkdir -p {outfolder} && cd {gistic_dir} && ./gistic2 -b {outfolder} -seg {segments} -refgene {reference} {cnv_arg} {extra} -v 0 -smalldisk 1 && cd -")
......@@ -6,12 +6,10 @@
library(RUBIC)
all_genes <- if (snakemake@params[["genefile"]] == "") system.file("extdata", "genes.tsv", package="RUBIC") else snakemake@params[["genefile"]]
print(all_genes)
fdr <- if (snakemake@params[["fdr"]] == "") 0.25 else snakemake@params[["fdr"]]
rbc <- rubic(fdr, snakemake@input[["seg"]], snakemake@input[["markers"]], genes=all_genes)
print("rbc ready")
rbc$save.focal.gains(snakemake@output[["out_gains"]])
print("gains ready")
rbc$save.focal.losses(snakemake@output[["out_losses"]])
print("losses ready")
rbc$save.plots(snakemake@output[["out_plots"]])
......@@ -5,7 +5,7 @@
library(topGO)
#Download table with geen IDs and GO terms from biomart or provide own file
#Download table with geen IDs and GO terms from biomart or provide own file (the connection to biomart fails sometimes)
if (snakemake@params[["gene_ID_to_GO"]] == "") {
library(biomaRt)
ensembl <- useMart("ensembl")
......@@ -30,7 +30,6 @@ ontologies <- if (snakemake@params[["ontology"]] == "all") c("BP", "CC", "MF") e
#Combine data
GOdata <- new("topGOdata", ontology=ontologies, allGenes = geneList, annot = annFUN.gene2GO, gene2GO = geneID2GO)
pint(GOdata)
#Perform enrichment tests
test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test")
......@@ -45,11 +44,17 @@ allRes <- GenTable(GOdata, classic = resultFisher,
#KS = resultKS,
weight = resultWeight,
orderBy = "weight", ranksOf = "classic") #, topNodes = 50)
print(allRes)
write.table(allRes, file=snakemake@output[[1]], quote=FALSE, sep="\t", eol = "\n")
#Visualize GO term relationships
jpeg(snakemake@output[[1]])
showSigOfNodes(GOdata, score(resultFisher), firstSigNodes = 5, useInfo ='all')
printGraph(GOdata, resultFisher, firstSigNodes = 5, fn.prefix = "tGO", useInfo = "all", pdfSW = TRUE)
png(snakemake@output[[1]])
#showSigOfNodes(GOdata, score(resultFisher), firstSigNodes = 5, useInfo ='all')
printGraph(GOdata, resultFisher, firstSigNodes = 10, fn.prefix = "tGO", useInfo = "all", pdfSW = TRUE)
dev.off()
#Print most significant genes
ann.genes <- genesInTerm(GOdata)
str(ann.genes)
fisher.go <- names(score(resultFisher))[1:5]
fisher.ann.genes <- genesInTerm(sampleGOdata, whichGO=fisher.go)
print(fisher.ann.genes)