Commit 1a6403cd authored by Beatrice Tan

Improved structure of rules and functions.

parent db6d0312
from snakemake.utils import report
#Configure input and settings #Configure input and settings
configfile: "config.yaml" configfile: "config.yaml"
...@@ -9,15 +11,15 @@ import os.path ...@@ -9,15 +11,15 @@ import os.path
#Rules to run pipeline for prioritization of regions and genes. #Rules to run pipeline for prioritization of regions and genes.
include: "rules/PreprocessInput.smk" include: "rules/PreprocessInput.smk"
include: "rules/GISTIC2.smk" include: "rules/GISTIC2.smk"
include: "rules/Rubic.smk" include: "rules/RUBIC.smk"
include: "rules/GenePrioritization.smk" include: "rules/GenePrioritization.smk"
#Rules to compare different inputs. #Rules to compare different inputs.
include: "rules/ComparisonRegions.smk" include: "rules/ComparisonRegions.smk"
include: "rules/Circos.smk" include: "rules/Circos.smk"
#include: "rules/SampleSizes.smk" include: "rules/SampleSizes.smk"
#include: "rules/UseControl.smk" #include: "rules/UseControl.smk"
#include: "rules/ComparisonSettings.smk" include: "rules/ComparisonSettings.smk"
#Directory to save all files. #Directory to save all files.
workdir: config["workdir"] workdir: config["workdir"]
...@@ -39,11 +41,12 @@ onsuccess: ...@@ -39,11 +41,12 @@ onsuccess:
onerror: onerror:
print("\n\nPipeline failed. Possible reasons:\n- Wrong input files\n- Missing arguments in config file\n- Error in conda environment\n\n") print("\n\nPipeline failed. Possible reasons:\n- Wrong input files\n- Missing arguments in config file\n- Error in conda environment\n\n")
rule all: rule all:
"""Define desired output from pipeline.""" """Define desired output from pipeline."""
input: input:
"Reports/Results.html" #"Settings/Report.txt",
"PipelineResults.html",
"ComparisonResults.html"
rule help: rule help:
"""Print list of all targets with help.""" """Print list of all targets with help."""
...@@ -51,9 +54,8 @@ rule help: ...@@ -51,9 +54,8 @@ rule help:
for rule in workflow.rules: for rule in workflow.rules:
print('- ' + rule.name + "\t" + rule.docstring) print('- ' + rule.name + "\t" + rule.docstring)
rule report_pipeline:
rule report: """Write HTML report on output from pipeline."""
"""Write html report on segmentation file."""
input: input:
seg="Reports/Segments.txt", seg="Reports/Segments.txt",
tools="Reports/Tools.txt", tools="Reports/Tools.txt",
...@@ -62,23 +64,40 @@ rule report: ...@@ -62,23 +64,40 @@ rule report:
genes_gistic="Reports/Genes_GISTIC2.txt", genes_gistic="Reports/Genes_GISTIC2.txt",
genes_rubic="Reports/Genes_RUBIC.txt", genes_rubic="Reports/Genes_RUBIC.txt",
venn="Reports/Venn_overlap_genes.png", venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png", swarmplot="Reports/Comparison_sizes.png",
circos="Reports/Circos/RecurrentRegions.png", circos="Circos/RecurrentRegions_legend.png",
circos_legend="Reports/Circos/RecurrentRegions_legend.png", output:
known_genes="Reports/Overlap_known_genes.bed" html="PipelineResults.html"
run:
from snakemake.utils import report
report("""
====================================================
Report on the results of the prioritization pipeline
====================================================
- Report on segmentation file: seg_
- Report on comparison between tools and overlapping regions: tools_
- Table with all recurrent regions and overlapping genes: table_regions_
- Circos plot showing the raw segmentation file and recurrent regions detected by both tools: circos_
- Venn diagram showing the overlap between gene lists from both tools: venn_
- Swarmplot showing the differences in sizes between both tools: swarmplot_
""", output.html, metadata="Beatrice F. Tan (beatrice.ftan@gmail.com)", **input)
rule report_comparisons:
"""Write HTML report on comparisons between sample sizes, settings and using a control."""
input:
#size="Samplesizes/Report.txt"
circos_genes=get_list_genes_circos
output: output:
html="Reports/Results.html" html="ComparisonResults.html"
run: run:
from snakemake.utils import report from snakemake.utils import report
with open(input.seg, 'r') as seg:
nr_samples = seg.readline().split("\t")[1].strip()
report(""" report("""
==================================================== ====================================================
Report on the results of the prioritization pipeline Report on the results of the prioritization pipeline
==================================================== ====================================================
In total, {nr_samples} samples were present in the raw segmentation file. - Report on sample size comparison:
See: Table T1_
""", output.html, metadata="Beatrice F. Tan (beatrice.ftan@gmail.com)", T1=input[0]) """, output.html, metadata="Beatrice F. Tan (beatrice.ftan@gmail.com)", **input)
#**input)
#Directories to be specified #Directories to be specified
workdir: /home/bftan/CNA_results #directory to write output #workdir: /home/bftan/CNA_results #directory to write output
gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2 #gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
#workdir: /home/beatrice/CNA_analysis workdir: /home/beatrice/CNA_99_genegistic
#gisticdir: /home/beatrice/CNA_analysis/run_gistic2 gisticdir: /home/beatrice/CNA_analysis/run_gistic2
#Input details to download from firehose #Input details to download from firehose
cancer_type: SKCM cancer_type: SKCM
...@@ -14,22 +14,23 @@ inputfile: "" #tumor segmentation data ...@@ -14,22 +14,23 @@ inputfile: "" #tumor segmentation data
normal: "" #normal segmentation data normal: "" #normal segmentation data
#Data for running and benchmarking tools. #Data for running and benchmarking tools.
reference: hg19 reference: hg19 #hg38.UCSC.add_miR.160920.refgene
prev_found_genes: input_files/intogen-CM-drivers-data.tsv prev_found_genes: input_files/intogen-CM-drivers-data.tsv
census_genes: input_files/Census_genes.txt census_genes: input_files/Census_genes.txt
biomart_genes: input_files/biomart_human_genes.tsv biomart_genes: input_files/biomart_human_genes_hg19.tsv #wrong genome build
ID_to_GO: input_files/ID_to_GO.txt ID_to_GO: input_files/ID_to_GO.txt
#Settings GISTIC2.0 #Settings GISTIC2.0
gistic_precision: "99" gistic_precision: "99"
settings_gistic: "" settings_gistic: "-brlen 0.98 -genegistic 1"
comparison_settings: ["-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 1 -conf 0.99", #GISTIC2.0 settings to compare
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 1 -conf 0.75", comparison_precision: ["99", "90", "75"]
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.98 -cap 1.5 -rx 1 -genegistic 1 -conf 0.75", comparison_settings: ["-brlen 0.7 -genegistic 1",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.98 -cap 1.5 -rx 1 -genegistic 0 -conf 0.75", "-brlen 0.98 -genegistic 1",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 0 -conf 0.75",] "-brlen 0.98 -genegistic 0",
"-brlen 0.7 -genegistic 0"]
#Settings for sample size differences #Settings for sample size differences
sizes: [20, 30, 40, 50, 60, 70, 80, 90] sizes: [20, 30, 40, 50, 60, 70, 80, 90, 100, 110]
repeats: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] repeats: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
KIT
PDGFRA
KDR
CDK4
CCND1
MDM2
TERT
BRAF
MITF
PD-L1
NRAS
CD274
This diff is collapsed.
from Circos import InputCircos, bed_to_circos, make_CIRCOS_legend from Circos import InputCircos, bed_to_circos, make_CIRCOS_legend, get_plot_region
from ReportTools import make_bed_genes
from PIL import Image from PIL import Image
import os.path
rule make_CIRCOS_input: rule make_CIRCOS_input:
"""Make input files for making a CIRCOS plot.""" """Make input files for making a CIRCOS plot."""
...@@ -9,20 +11,20 @@ rule make_CIRCOS_input: ...@@ -9,20 +11,20 @@ rule make_CIRCOS_input:
rubic_gains="RUBIC/gains.txt", rubic_gains="RUBIC/gains.txt",
rubic_losses="RUBIC/losses.txt" rubic_losses="RUBIC/losses.txt"
output: output:
seg="Reports/Circos/Segments.txt", seg="Circos/Segments.txt",
gistic="Reports/Circos/GISTIC_results.txt", gistic="Circos/GISTIC_results.txt",
rubic="Reports/Circos/RUBIC_results.txt", rubic="Circos/RUBIC_results.txt",
run: run:
InputCircos(input.seg, input.gistic, input.rubic_gains, input.rubic_losses, output.seg, output.gistic, output.rubic) InputCircos(input.seg, input.gistic, input.rubic_gains, input.rubic_losses, output.seg, output.gistic, output.rubic)
rule make_CIRCOS_plot: rule plot_CIRCOS:
"""Make CIRCOS plot of recurrent regions in RUBIC and GISTIC2.0""" """Make CIRCOS plot of recurrent regions in RUBIC and GISTIC2.0"""
input: input:
seg="Reports/Circos/Segments.txt", seg="Circos/Segments.txt",
gistic="Reports/Circos/GISTIC_results.txt", gistic="Circos/GISTIC_results.txt",
rubic="Reports/Circos/RUBIC_results.txt", rubic="Circos/RUBIC_results.txt",
output: output:
"Reports/Circos/RecurrentRegions.png" "Circos/RecurrentRegions.png"
params: params:
conf=workflow.basedir + "/scripts/circos/circos.conf" conf=workflow.basedir + "/scripts/circos/circos.conf"
conda: conda:
...@@ -33,72 +35,72 @@ rule make_CIRCOS_plot: ...@@ -33,72 +35,72 @@ rule make_CIRCOS_plot:
rule add_legend_CIRCOS: rule add_legend_CIRCOS:
"""Add a custom legend to the CIRCOS plot.""" """Add a custom legend to the CIRCOS plot."""
input: input:
circos="Reports/Circos/RecurrentRegions.png", circos="Circos/RecurrentRegions.png",
output: output:
legend="Reports/Circos/legend.png", legend="Circos/legend.png",
circos="Reports/Circos/RecurrentRegions_legend.png" circos="Circos/RecurrentRegions_legend.png"
run: run:
make_CIRCOS_legend(input.circos, output.legend, output.circos) make_CIRCOS_legend(input.circos, output.legend, output.circos)
rule make_CIRCOS_zoom_input: #necessary? rule make_bed_genes_census:
"""Make input files for making a circos diagram."""
input: input:
bed="Reports/Overlap_known_genes.bed" gene_file=os.path.join(workflow.basedir, config["census_genes"])
output: output:
gistic="Reports/Circos/Zoom/GISTIC.txt", bed="Reports/Locations_census_genes.bed"
rubic="Reports/Circos/Zoom/RUBIC.txt", params:
genes="Reports/Circos/Zoom/Genes.txt", ref=os.path.join(workflow.basedir, config["reference"]),
biomart=os.path.join(workflow.basedir, config["biomart_genes"])
run: run:
bed_to_circos(input.bed, output.rubic, output.gistic, output.genes) make_bed_genes(input.gene_file, params.biomart, output.bed, params.ref)
def get_list_genes(overlapping_genes, locations_known_genes): rule make_bed_genes_known:
plot_list = [] input:
list_overlapping_genes = [] gene_file=os.path.join(workflow.basedir, config["prev_found_genes"])
with open(overlapping_genes, 'r') as plot_genes: output:
for line in plot_genes: bed="Reports/Locations_known_genes.bed"
chrom, start, end = line.strip().split("\t") params:
chrom = "chr" + chrom.strip("hs") ref=os.path.join(workflow.basedir, config["reference"]),
list_overlapping_genes.append([chrom, start, end]) biomart=os.path.join(workflow.basedir, config["biomart_genes"])
with open(locations_known_genes, 'r') as known_genes: run:
known_genes.readline() make_bed_genes(input.gene_file, params.biomart, output.bed, params.ref)
for line in known_genes:
chrom, start, end, gene_name = line.strip().split("\t")
if [chrom, start, end] in list_overlapping_genes:
plot_list.append(gene_name)
return(plot_list)
def get_plot_region(gene_name, locations_known_genes): rule make_CIRCOS_input_genes: #add to make_CIRCOS_input
with open(locations_known_genes, 'r') as known_genes: input:
known_genes.readline() bed="Reports/Locations_{type}_genes.bed"
for line in known_genes: output:
if gene_name == line.strip().split("\t")[3]: circos="Circos/{type}_genes.txt"
chrom, start, end, name = line.strip().split("\t") run:
chrom_region = "hs" + chrom.strip("chr") bed_to_circos(input.bed, output.circos)
if int(start) < 1000000:
start_region = 0
else:
start_region = int(start) - 1000000
end_region = int(end) + 1000000
plot_region = chrom_region + ":" + str(start_region) + "-" + str(end_region)
return plot_region
rule make_CIRCOS_zoom_plots: def get_list_genes_circos(wildcards):
"""Compare locations of known genes, recurrent regions from RUBIC and recurrent regions from GISTIC2.""" """Extract list of known genes to produce list of file names to save CIRCOS plot for each gene"""
list_genes = []
with open(os.path.join(workflow.basedir, config["prev_found_genes"])) as known:
for line in known:
gene = line.split("\t")[2]
gene_file = "Circos/KnownGenes/" + gene + ".png"
list_genes.append(gene_file)
return list_genes
rule plot_CIRCOS_per_gene: #Two genes won't plot because region is outside chromosome?
"""Compare locations of known genes, recurrent regions from RUBIC and recurrent regions from GISTIC2.
Use 'get_list_genes_circos' as input for a rule (e.g. rule all) to produce a plot for each known gene.'"""
input: input:
gistic="Reports/Circos/Zoom/GISTIC.txt", #all gistic regions gistic="Circos/GISTIC_results.txt",
rubic="Reports/Circos/Zoom/RUBIC.txt", #all rubic regions rubic="Circos/RUBIC_results.txt",
genes="Reports/Circos/Zoom/Genes.txt", genes="Circos/known_genes.txt",
known="Reports/Locations_known_genes.bed", #use locations known genes to extract list of known genes and run rule for each gene maybe also not overlapping, but region around gene census="Circos/census_genes.txt",
list_genes = lambda known_genes="Reports/Locations_known_genes.bed",
overlap="Reports/Overlap_known_genes.bed"
output:
plot="Circos/KnownGenes/{gene}.png"
params: params:
known="Reports/Locations_known_genes.bed",
genes="Reports/Circos/Zoom/Genes.txt",
conf=workflow.basedir + "/scripts/circos/circos_zoom.conf", conf=workflow.basedir + "/scripts/circos/circos_zoom.conf",
chrom=get_plot_region(wildcards.gene, input.known) #chrom=lambda wildcards, input: get_plot_region(wildcards.gene, input.known_genes, input.overlap),
output: #units=lambda wildcards, input: get_plot_units(wildcards.gene, input.known_genes, input.overlap)
plot=expand("Reports/Overlap_plots/{gene}.png", gene=get_list_genes(params.genes, params.known)) # conda:
conda: # workflow.basedir + "/envs/circos.yaml"
workflow.basedir + "/envs/circos.yaml" run:
shell: chrom, units = get_plot_region(wildcards.gene, input.known_genes, input.overlap)
"circos -conf {params.conf} -outputfile {output.plot} -param gistic_file={input.gistic} -param rubic_file={input.rubic} \ shell("circos -conf {params.conf} -outputfile {output.plot} -param gistic_file={input.gistic} -param rubic_file={input.rubic} \
-param gene_file={input.genes} -param chrom={params.chrom}" -param gene_file={input.genes} -param census_file={input.census} -param chrom=" + chrom + " -param units=" + str(units))
...@@ -17,8 +17,7 @@ rule report_tools: ...@@ -17,8 +17,7 @@ rule report_tools:
genes_gistic="Reports/Genes_GISTIC2.txt", genes_gistic="Reports/Genes_GISTIC2.txt",
genes_rubic="Reports/Genes_RUBIC.txt", genes_rubic="Reports/Genes_RUBIC.txt",
venn="Reports/Venn_overlap_genes.png", venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png", size_plot="Reports/Comparison_sizes.png"
bed_known="Reports/Locations_known_genes.bed"
params: #select input files from repository or own input files params: #select input files from repository or own input files
census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"], census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"],
known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"], known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"],
...@@ -27,9 +26,9 @@ rule report_tools: ...@@ -27,9 +26,9 @@ rule report_tools:
run: run:
ReportTools.make_report(input.gistic, input.rubic_gain, input.rubic_loss, ReportTools.make_report(input.gistic, input.rubic_gain, input.rubic_loss,
params.census, params.known, params.ref, params.biomart_info, params.census, params.known, params.ref, params.biomart_info,
output.tools, output.table_regions, output.venn, output.swarmplot, output.tools, output.table_regions, output.venn, output.size_plot,
output.genes_both, output.genes_gistic, output.genes_rubic, output.genes_both, output.genes_gistic, output.genes_rubic,
input.overlap, output.bed_known) input.overlap)
rule get_overlap_GISTIC_RUBIC: rule get_overlap_GISTIC_RUBIC:
"""Intersect the recurrent regions detected by RUBIC and GISTIC2.0.""" """Intersect the recurrent regions detected by RUBIC and GISTIC2.0."""
......
def get_settings(nr_settings, all_settings): from ParseResults import parse_regions, get_stats
print(nr_settings) from ReportTools import make_tool_report
print(all_settings)
print(list(range(len(config["comparison_settings"]))))
rule gistic_settings: rule gistic_settings:
"""Run GISTIC2 based on different settings.""" """Run GISTIC2 using different settings."""
input: input:
gistic_directory=os.path.join(config["gisticdir"], "gistic2"), gistic_directory=os.path.join(config["gisticdir"], "gistic2"),
seg="Input/Segments_tumor.txt", seg="Input/Segments_tumor.txt",
lambda wildcards: config["comparison_settings"][wildcards.setting]
output: output:
expand("Settings/GISTIC_{setting_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", setting_nr=range(len(config["comparison_settings"]))), "Settings/GISTIC.{setting}_{precision}/all_lesions.conf_{precision}.txt",
"Settings/GISTIC_{setting_nr}/regions_track.conf_" + config["gistic_precision"] + ".bed" "Settings/GISTIC.{setting}_{precision}/regions_track.conf_{precision}.bed"
params: params:
cnv="", cnv="",
ref=config["reference"], ref=config["reference"],
ref_file="", ref_file="",
extra="wildcards.setting", extra=lambda wildcards: config["comparison_settings"][int(wildcards.setting)], #get_settings(wildcards.setting),
confidence=config["gistic_precision"] confidence=lambda wildcards: wildcards.precision
wrapper: wrapper:
"file:" + workflow.basedir + "/wrappers/GISTIC2" "file:" + workflow.basedir + "/wrappers/GISTIC2"
def get_list_settings(wildcards):
    """Return the GISTIC2 result files for every setting/precision combination.

    Intended as a Snakemake input function: `wildcards` is accepted because
    Snakemake passes it, but it is not read.  One ``all_lesions`` path is
    built for each index into ``config["comparison_settings"]`` combined with
    each entry of ``config["comparison_precision"]`` (settings vary slowest).
    """
    # str.format is used instead of "+" so a precision written as a YAML
    # integer (99 rather than "99") still yields a valid path.
    template = "Settings/GISTIC.{nr}_{precision}/all_lesions.conf_{precision}.txt"
    return [
        template.format(nr=setting_nr, precision=precision)
        for setting_nr, _ in enumerate(config["comparison_settings"])
        for precision in config["comparison_precision"]
    ]
def get_settings(setting):
    """Look up the GISTIC2 settings selected by a numeric wildcard value.

    `setting` is converted with ``int`` and used as an index into
    ``config["comparison_settings"]``.
    """
    return config["comparison_settings"][int(setting)]
rule compare_settings: rule compare_settings:
input: input:
"Settings/GISTIC_{setting_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt" get_list_settings
output: output:
"Settings/Report.txt" report="Settings/Report.txt"
params:
settings=config["comparison_settings"],
census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"],
known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"],
ref=config["reference"]
run: run:
with open(output[0], 'w') as out: SettingReport(input, output.report, params.settings, params.known, params.census, params.ref)
out.write(input[0])
def SettingReport(setting_results, report_file, settings, known_genes, census_genes, ref_genome):
    """Write a report comparing GISTIC2 results obtained with different settings.

    Parameters:
        setting_results: iterable of result paths containing
            "GISTIC.<setting_nr>_<precision>/all_lesions".
        report_file: path passed to ``make_tool_report``; a legend is
            appended to it afterwards.
        settings: sequence of settings strings, indexed by <setting_nr>.
        known_genes, census_genes, ref_genome: forwarded unchanged to
            ``parse_regions``.

    Raises:
        IndexError / ValueError: when a path does not follow the expected
            naming scheme or <setting_nr> is not a valid index into `settings`.
    """
    stats = []
    legend = []
    listed_settings = set()
    for result in setting_results:
        # "<setting_nr>_<precision>" identifies the run in the statistics.
        file_ID = result.split("GISTIC.")[1].split("/all_lesions")[0]
        setting_nr, _precision = file_ID.split("_")
        parsed_results = parse_regions(result, known_genes, census_genes, 'GISTIC', ref_genome)
        stats.append(get_stats(parsed_results, file_ID))
        # Each setting is run once per precision; list it in the legend only once.
        if setting_nr not in listed_settings:
            listed_settings.add(setting_nr)
            legend.append([setting_nr, settings[int(setting_nr)]])
    make_tool_report(report_file, stats)
    # Append mode: the legend follows whatever make_tool_report was asked to write.
    with open(report_file, 'a') as out:
        out.write("\n\nLegend:\n")
        for entry in legend:
            out.write("\t".join(entry) + "\n")
#& bed file with all regions
...@@ -4,7 +4,7 @@ rule do_GO_analysis: ...@@ -4,7 +4,7 @@ rule do_GO_analysis:
gene_list="Reports/Genes_{tool}.txt" gene_list="Reports/Genes_{tool}.txt"
output: output:
table="GO/Enriched_GOs_{tool}.txt", #Add gene names to table table="GO/Enriched_GOs_{tool}.txt", #Add gene names to table
plot="GO/Enriched_GOs_{tool}.jpg" plot="GO/Enriched_GOs_{tool}.png"
params: params:
organism="hsapiens", #default is human organism="hsapiens", #default is human
ontology="BP", #MF, BP, CC or all ontology="BP", #MF, BP, CC or all
...@@ -17,7 +17,7 @@ rule do_GO_analysis: ...@@ -17,7 +17,7 @@ rule do_GO_analysis:
rule compare_enriched_GOs: rule compare_enriched_GOs:
"""Compare the top 50 GO terms detected by RUBIC and GISTIC2.0""" """Compare the top 50 GO terms detected by RUBIC and GISTIC2.0"""
input: input:
go=expand("GO/Enriched_GOs_{tool}.txt", tool=["GISTIC2", "RUBIC"]) go=expand("GO/Enriched_GOs_{tool}.txt", tool=["GISTIC2", "RUBIC", "both"])
output: output:
"GO/comparison.txt" "GO/comparison.txt"
run: run:
......
...@@ -37,15 +37,16 @@ rule define_input_pipeline: ...@@ -37,15 +37,16 @@ rule define_input_pipeline:
inputfile=config["inputfile"], inputfile=config["inputfile"],
normalfile=config["normal"] normalfile=config["normal"]
run: run:
if input[0] == params.inputfile: #use provided input file if input[0] == params.inputfile: #use provided input file
shell("cp {params.inputfile} {output.tumor}") shell("cp {params.inputfile} {output.tumor}")
if config["normal"] != "": if config["normal"] != "":
shell("cp {params.normalfile} {output.normal}") shell("cp {params.normalfile} {output.normal}")
else: else:
shell("touch {output.normal}") shell("touch {output.normal}")
else: #split firehose data in tumor and normal files. else: #split firehose data in tumor and normal files.
split_normal_tumor(input[0], output.tumor, output.normal) split_normal_tumor(input[0], output.tumor, output.normal)
rule report_segmentation_file: rule report_segmentation_file:
"""Report information on the input segmentation file.""" """Report information on the input segmentation file."""
input: input:
...@@ -67,13 +68,16 @@ def split_normal_tumor(all_samples, out_tumor, out_normal): ...@@ -67,13 +68,16 @@ def split_normal_tumor(all_samples, out_tumor, out_normal):
normal.write(header) normal.write(header)
for line in old: for line in old:
sample = line.split("\t")[0] sample = line.split("\t")[0]
type_ID = sample.split("-")[3] #.strip("A") type_ID = sample.split("-")[3]
if type_ID == "06A": #TM: Metastatic if type_ID == "06A": #TM: Metastatic
tumor.write(line) #tumor.write(line)
normal.write(line)
elif type_ID == "01A": #TP: Primary Solid Tumor elif type_ID == "01A": #TP: Primary Solid Tumor
tumor.write(line) pass
#tumor.write(line)
elif type_ID == "10A" or type_ID == "11A": #NB: Blood Derived Normal or NT: Solid Tissue Normal elif type_ID == "10A" or type_ID == "11A": #NB: Blood Derived Normal or NT: Solid Tissue Normal
normal.write(line) #normal.write(line)
tumor.write(line)
else: else:
raise ValueError("Unkonwn sample type: " + type_ID + \ raise ValueError("Unkonwn sample type: " + type_ID + \
"\nPlease check samples report: http://gdac.broadinstitute.org/runs/stddata__latest/samples_report/") "\nPlease check samples report: http://gdac.broadinstitute.org/runs/stddata__latest/samples_report/")
...@@ -20,7 +20,7 @@ rule run_RUBIC: ...@@ -20,7 +20,7 @@ rule run_RUBIC:
out_plots="RUBIC/plots" out_plots="RUBIC/plots"
params: params:
fdr="0.25", fdr="0.25",
genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["bimart_genes"] genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["biomart_genes"]
benchmark: benchmark:
"Benchmarks/RUBIC." + str(datetime.datetime.now()).replace(" ", "_") + ".txt" "Benchmarks/RUBIC." + str(datetime.datetime.now()).replace(" ", "_") + ".txt"
wrapper: wrapper:
...@@ -32,5 +32,7 @@ rule make_bed_file_RUBIC: ...@@ -32,5 +32,7 @@ rule make_bed_file_RUBIC:
losses="RUBIC/losses.txt" losses="RUBIC/losses.txt"
output: output:
bed="RUBIC/regions_track.bed" bed="RUBIC/regions_track.bed"
params:
ref=config["reference"]
run: run:
BedFile(input.gains, input.losses, output.bed) BedFile(input.gains, input.losses, output.bed, params.ref)
...@@ -2,7 +2,7 @@ import ReportSizes ...@@ -2,7 +2,7 @@ import ReportSizes
from SampleSizes import SegFile from SampleSizes import SegFile
import os.path import os.path
import datetime import datetime
from AUC import ROC_curve from PrecisionRecall import plot_PrecisionRecall
rule get_segmentation_files_subsets: rule get_segmentation_files_subsets:
"""Create segmentation files with different numbers of samples (randomly chosen) for a number of times.""" """Create segmentation files with different numbers of samples (randomly chosen) for a number of times."""
...@@ -41,7 +41,7 @@ rule run_RUBIC_subsets: ...@@ -41,7 +41,7 @@ rule run_RUBIC_subsets:
out_plots="Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/plots" out_plots="Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/plots"
params: params:
fdr="0.25", fdr="0.25",
genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["bimart_genes"] genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["biomart_genes"]
benchmark: benchmark:
"Benchmarks/RUBIC." + str(datetime.datetime.now()).replace(" ", "_") + ".txt" "Benchmarks/RUBIC." + str(datetime.datetime.now()).replace(" ", "_") + ".txt"
conda: conda:
...@@ -106,7 +106,7 @@ rule compare_subset_truth_RUBIC: ...@@ -106,7 +106,7 @@ rule compare_subset_truth_RUBIC:
"bedtools intersect -a {input.bed_subsets} -b {input.bed_truth} -wao > {output.subset} && \ "bedtools intersect -a {input.bed_subsets} -b {input.bed_truth} -wao > {output.subset} && \
bedtools intersect -a {input.bed_truth} -b {input.bed_subsets} -wao > {output.truth}" bedtools intersect -a {input.bed_truth} -b {input.bed_subsets} -wao > {output.truth}"
rule make_ROC_plot: rule make_Precision_Recall_plot:
"""Make ROC plot on the precision and recall from the subsets using GISTIC and RUBIC.""" """Make ROC plot on the precision and recall from the subsets using GISTIC and RUBIC."""
input: input:
gistic_subset=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/Overlap_subset_truth.bed", rand_nr=config["sizes"], rep_nr=config["repeats"]), gistic_subset=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/Overlap_subset_truth.bed", rand_nr=config["sizes"], rep_nr=config["repeats"]),
...@@ -114,8 +114,9 @@ rule make_ROC_plot: ...@@ -114,8 +114,9 @@ rule make_ROC_plot:
rubic_subset=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/Overlap_subset_truth.bed", rand_nr=config["sizes"], rep_nr=config["repeats"]), rubic_subset=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/Overlap_subset_truth.bed", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_truth=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/Overlap_truth_subset.bed", rand_nr=config["sizes"], rep_nr=config["repeats"]), rubic_truth=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/Overlap_truth_subset.bed", rand_nr=config["sizes"], rep_nr=config["repeats"]),
output: output:
AUC="Samplesize/Precision_recall.png" plot="Samplesize/Precision_recall.png",
plot_avg="Samplesize/Precision_recall_avg.png"
params: params:
sizes=config["sizes"] sizes=config["sizes"]
run: run:
ROC_curve(input.gistic_subset, input.gistic_truth, input.rubic_subset, input.rubic_truth, output.AUC, params.sizes) plot_PrecisionRecall(input.gistic_subset, input.gistic_truth, input.rubic_subset, input.rubic_truth, output.plot, output.plot_avg, params.sizes)
import ReportControl
rule run_GISTIC_control:
"""Run GISTIC2 for the tumor segmentation data with data from control samples included."""
input:
gistic_directory=os.path.join(config["gisticdir"], "gistic2"),
seg="Input/Segments_tumor.txt"
output:
"Control/all_lesions.conf_" + config["gistic_precision"] + ".txt",
"Control/regions_track.conf_" + config["gistic_precision"] + ".bed"
params:
cnv="Input/Segments_normal.txt",
ref=config["reference"],
ref_file="",
extra="",
confidence=config["gistic_precision"]
wrapper:
"file:" + workflow.basedir + "/wrappers/GISTIC2"
rule report_control:
"""Report the differences between using a control and without using a control."""
input:
control="Control/",
nocontrol="GISTIC2/"
output:
"Reports/Control.txt"
params:
census=config["census_genes"],
known=config["prev_found_genes"],
ref=config["reference"]
run:
ReportControl.make_report(input.control, input.nocontrol, output[0], params.census, params.known, params.ref)
...@@ -28,7 +28,7 @@ class InputCircos: ...@@ -28,7 +28,7 @@ class InputCircos:
recurrent.readline() recurrent.readline()
for line in recurrent: for line in recurrent:
lineparts = line.split("\t") lineparts = line.split("\t")
loc = lineparts[3].split("(probes")[0] loc = lineparts[2].split("(probes")[0]
chrom, bp = loc.split(":") chrom, bp = loc.split(":")
start, end = bp.split("-") start, end = bp.split("-")
chrom = chrom.strip("chr") chrom = chrom.strip("chr")
...@@ -57,10 +57,11 @@ def make_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png): ...@@ -57,10 +57,11 @@ def make_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png):
gistic = mpatches.Patch(color='#5975A4', label='GISTIC2.0 regions') gistic = mpatches.Patch(color='#5975A4', label='GISTIC2.0 regions')
rubic = mpatches.Patch(color='#5F9E6E', label='RUBIC regions') rubic = mpatches.Patch(color='#5F9E6E', label='RUBIC regions')
legend = plt.legend(handles=[gains, losses, gistic, rubic], loc=3, framealpha=1, frameon=False) legend = plt.legend(handles=[gains, losses, gistic, rubic], loc=3, framealpha=1, frameon=False)
plt.axis('off')
fig = legend.figure fig = legend.figure
fig.canvas.draw() fig.canvas.draw()
bbox = legend.get_window_extent().transformed(fig.dpi_scale_trans.inverted()) bbox = legend.get_window_extent().transformed(fig.dpi_scale_trans.inverted())
fig.savefig(legend_png, dpi=400, bbox_inches=bbox) fig.savefig(legend_png, dpi=300, bbox_inches=bbox)
concat_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png) concat_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png)
def concat_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png): def concat_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png):
...@@ -73,20 +74,72 @@ def concat_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png): ...@@ -73,20 +74,72 @@ def concat_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png):
circos.paste(legend, offset) circos.paste(legend, offset)
circos.save(concatted_png) circos.save(concatted_png)
def bed_to_circos(bed_file, circos_file):
    """Convert bed file to CIRCOS input file.

    The first line of `bed_file` is treated as a header and skipped.  Every
    other non-blank line must hold four tab-separated fields (chromosome,
    start, end, gene name); it is written to `circos_file` space-separated,
    with a leading "chr" on the chromosome replaced by the "hs" prefix.

    Raises:
        ValueError: if a non-blank data line does not have exactly four fields.
    """
    with open(bed_file, 'r') as bed, open(circos_file, 'w') as out:
        bed.readline()  # skip the header line
        for line in bed:
            record = line.strip()
            if not record:
                continue  # tolerate blank/trailing empty lines
            chrom, start, end, gene_name = record.split("\t")
            # Drop only a leading "chr": str.strip("chr") removes any of the
            # characters c/h/r from BOTH ends and can mangle contig names.
            if chrom.startswith("chr"):
                chrom = chrom[3:]
            out.write(" ".join(["hs" + chrom, start, end, gene_name]) + "\n")
def bed_to_circos(bed_file, rubic_file, gistic_file, gene_file):
"""Convert bed file to CIRCOS input files.""" def get_plot_region(