Commit 1a6403cd authored by Beatrice Tan's avatar Beatrice Tan

Improved structure of rules and functions.

parent db6d0312
from snakemake.utils import report
#Configure input and settings
configfile: "config.yaml"
......@@ -9,15 +11,15 @@ import os.path
#Rules to run pipeline for prioritization of regions and genes.
include: "rules/PreprocessInput.smk"
include: "rules/GISTIC2.smk"
include: "rules/Rubic.smk"
include: "rules/RUBIC.smk"
include: "rules/GenePrioritization.smk"
#Rules to compare different inputs.
include: "rules/ComparisonRegions.smk"
include: "rules/Circos.smk"
#include: "rules/SampleSizes.smk"
include: "rules/SampleSizes.smk"
#include: "rules/UseControl.smk"
#include: "rules/ComparisonSettings.smk"
include: "rules/ComparisonSettings.smk"
#Directory to save all files.
workdir: config["workdir"]
......@@ -39,11 +41,12 @@ onsuccess:
onerror:
print("\n\nPipeline failed. Possible reasons:\n- Wrong input files\n- Missing arguments in config file\n- Error in conda environment\n\n")
rule all:
"""Define desired output from pipeline."""
input:
"Reports/Results.html"
#"Settings/Report.txt",
"PipelineResults.html",
"ComparisonResults.html"
rule help:
"""Print list of all targets with help."""
......@@ -51,9 +54,8 @@ rule help:
for rule in workflow.rules:
print('- ' + rule.name + "\t" + rule.docstring)
rule report:
"""Write html report on segmentation file."""
rule report_pipeline:
"""Write HTML report on output from pipeline."""
input:
seg="Reports/Segments.txt",
tools="Reports/Tools.txt",
......@@ -62,23 +64,40 @@ rule report:
genes_gistic="Reports/Genes_GISTIC2.txt",
genes_rubic="Reports/Genes_RUBIC.txt",
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png",
circos="Reports/Circos/RecurrentRegions.png",
circos_legend="Reports/Circos/RecurrentRegions_legend.png",
known_genes="Reports/Overlap_known_genes.bed"
swarmplot="Reports/Comparison_sizes.png",
circos="Circos/RecurrentRegions_legend.png",
output:
html="PipelineResults.html"
run:
from snakemake.utils import report
report("""
====================================================
Report on the results of the prioritization pipeline
====================================================
- Report on segmentation file: seg_
- Report on comparison between tools and overlapping regions: tools_
- Table with all recurrent regions and overlapping genes: table_regions_
- Circos plot showing the raw segmentation file and recurrent regions detected by both tools: circos_
- Venn diagram showing the overlap between gene lists from both tools: venn_
- Swarmplot showing the differences in sizes between both tools: swarmplot_
""", output.html, metadata="Beatrice F. Tan (beatrice.ftan@gmail.com)", **input)
rule report_comparisons:
"""Write HTML report on comparisons between sample sizes, settings and using a control."""
input:
#size="Samplesizes/Report.txt"
circos_genes=get_list_genes_circos
output:
html="Reports/Results.html"
html="ComparisonResults.html"
run:
from snakemake.utils import report
with open(input.seg, 'r') as seg:
nr_samples = seg.readline().split("\t")[1].strip()
report("""
====================================================
Report on the results of the prioritization pipeline
====================================================
In total, {nr_samples} samples were present in the raw segmentation file.
See: Table T1_
- Report on sample size comparison:
""", output.html, metadata="Beatrice F. Tan (beatrice.ftan@gmail.com)", T1=input[0])
#**input)
""", output.html, metadata="Beatrice F. Tan (beatrice.ftan@gmail.com)", **input)
#Directories to be specified
workdir: /home/bftan/CNA_results #directory to write output
gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
#workdir: /home/bftan/CNA_results #directory to write output
#gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
#workdir: /home/beatrice/CNA_analysis
#gisticdir: /home/beatrice/CNA_analysis/run_gistic2
workdir: /home/beatrice/CNA_99_genegistic
gisticdir: /home/beatrice/CNA_analysis/run_gistic2
#Input details to download from firehose
cancer_type: SKCM
......@@ -14,22 +14,23 @@ inputfile: "" #tumor segmentation data
normal: "" #normal segmentation data
#Data for running and benchmarking tools.
reference: hg19
reference: hg19 #hg38.UCSC.add_miR.160920.refgene
prev_found_genes: input_files/intogen-CM-drivers-data.tsv
census_genes: input_files/Census_genes.txt
biomart_genes: input_files/biomart_human_genes.tsv
biomart_genes: input_files/biomart_human_genes_hg19.tsv #wrong genome build
ID_to_GO: input_files/ID_to_GO.txt
#Settings GISTIC2.0
gistic_precision: "99"
settings_gistic: ""
settings_gistic: "-brlen 0.98 -genegistic 1"
comparison_settings: ["-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 1 -conf 0.99",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 1 -conf 0.75",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.98 -cap 1.5 -rx 1 -genegistic 1 -conf 0.75",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.98 -cap 1.5 -rx 1 -genegistic 0 -conf 0.75",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 0 -conf 0.75",]
#GISTIC2.0 settings to compare
comparison_precision: ["99", "90", "75"]
comparison_settings: ["-brlen 0.7 -genegistic 1",
"-brlen 0.98 -genegistic 1",
"-brlen 0.98 -genegistic 0",
"-brlen 0.7 -genegistic 0"]
#Settings for sample size differences
sizes: [20, 30, 40, 50, 60, 70, 80, 90]
sizes: [20, 30, 40, 50, 60, 70, 80, 90, 100, 110]
repeats: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
KIT
PDGFRA
KDR
CDK4
CCND1
MDM2
TERT
BRAF
MITF
PD-L1
NRAS
CD274
This diff is collapsed.
from Circos import InputCircos, bed_to_circos, make_CIRCOS_legend
from Circos import InputCircos, bed_to_circos, make_CIRCOS_legend, get_plot_region
from ReportTools import make_bed_genes
from PIL import Image
import os.path
rule make_CIRCOS_input:
"""Make input files for making a CIRCOS plot."""
......@@ -9,20 +11,20 @@ rule make_CIRCOS_input:
rubic_gains="RUBIC/gains.txt",
rubic_losses="RUBIC/losses.txt"
output:
seg="Reports/Circos/Segments.txt",
gistic="Reports/Circos/GISTIC_results.txt",
rubic="Reports/Circos/RUBIC_results.txt",
seg="Circos/Segments.txt",
gistic="Circos/GISTIC_results.txt",
rubic="Circos/RUBIC_results.txt",
run:
InputCircos(input.seg, input.gistic, input.rubic_gains, input.rubic_losses, output.seg, output.gistic, output.rubic)
rule make_CIRCOS_plot:
rule plot_CIRCOS:
"""Make CIRCOS plot of recurrent regions in RUBIC and GISTIC2.0"""
input:
seg="Reports/Circos/Segments.txt",
gistic="Reports/Circos/GISTIC_results.txt",
rubic="Reports/Circos/RUBIC_results.txt",
seg="Circos/Segments.txt",
gistic="Circos/GISTIC_results.txt",
rubic="Circos/RUBIC_results.txt",
output:
"Reports/Circos/RecurrentRegions.png"
"Circos/RecurrentRegions.png"
params:
conf=workflow.basedir + "/scripts/circos/circos.conf"
conda:
......@@ -33,72 +35,72 @@ rule make_CIRCOS_plot:
rule add_legend_CIRCOS:
"""Add a custom legend to the CIRCOS plot."""
input:
circos="Reports/Circos/RecurrentRegions.png",
circos="Circos/RecurrentRegions.png",
output:
legend="Reports/Circos/legend.png",
circos="Reports/Circos/RecurrentRegions_legend.png"
legend="Circos/legend.png",
circos="Circos/RecurrentRegions_legend.png"
run:
make_CIRCOS_legend(input.circos, output.legend, output.circos)
rule make_CIRCOS_zoom_input: #necessary?
"""Make input files for making a circos diagram."""
rule make_bed_genes_census:
input:
bed="Reports/Overlap_known_genes.bed"
gene_file=os.path.join(workflow.basedir, config["census_genes"])
output:
gistic="Reports/Circos/Zoom/GISTIC.txt",
rubic="Reports/Circos/Zoom/RUBIC.txt",
genes="Reports/Circos/Zoom/Genes.txt",
bed="Reports/Locations_census_genes.bed"
params:
ref=os.path.join(workflow.basedir, config["reference"]),
biomart=os.path.join(workflow.basedir, config["biomart_genes"])
run:
bed_to_circos(input.bed, output.rubic, output.gistic, output.genes)
make_bed_genes(input.gene_file, params.biomart, output.bed, params.ref)
def get_list_genes(overlapping_genes, locations_known_genes):
plot_list = []
list_overlapping_genes = []
with open(overlapping_genes, 'r') as plot_genes:
for line in plot_genes:
chrom, start, end = line.strip().split("\t")
chrom = "chr" + chrom.strip("hs")
list_overlapping_genes.append([chrom, start, end])
with open(locations_known_genes, 'r') as known_genes:
known_genes.readline()
for line in known_genes:
chrom, start, end, gene_name = line.strip().split("\t")
if [chrom, start, end] in list_overlapping_genes:
plot_list.append(gene_name)
return(plot_list)
rule make_bed_genes_known:
input:
gene_file=os.path.join(workflow.basedir, config["prev_found_genes"])
output:
bed="Reports/Locations_known_genes.bed"
params:
ref=os.path.join(workflow.basedir, config["reference"]),
biomart=os.path.join(workflow.basedir, config["biomart_genes"])
run:
make_bed_genes(input.gene_file, params.biomart, output.bed, params.ref)
def get_plot_region(gene_name, locations_known_genes):
with open(locations_known_genes, 'r') as known_genes:
known_genes.readline()
for line in known_genes:
if gene_name == line.strip().split("\t")[3]:
chrom, start, end, name = line.strip().split("\t")
chrom_region = "hs" + chrom.strip("chr")
if int(start) < 1000000:
start_region = 0
else:
start_region = int(start) - 1000000
end_region = int(end) + 1000000
plot_region = chrom_region + ":" + str(start_region) + "-" + str(end_region)
return plot_region
rule make_CIRCOS_input_genes: #toevoegen aan make_CIRCOS_input
input:
bed="Reports/Locations_{type}_genes.bed"
output:
circos="Circos/{type}_genes.txt"
run:
bed_to_circos(input.bed, output.circos)
rule make_CIRCOS_zoom_plots:
"""Compare locations of known genes, recurrent regions from RUBIC and recurrent regions from GISTIC2."""
def get_list_genes_circos(wildcards):
"""Extract list of known genes to produce list of file names to save CIRCOS plot for each gene"""
list_genes = []
with open(os.path.join(workflow.basedir, config["prev_found_genes"])) as known:
for line in known:
gene = line.split("\t")[2]
gene_file = "Circos/KnownGenes/" + gene + ".png"
list_genes.append(gene_file)
return list_genes
rule plot_CIRCOS_per_gene: #Two genes won't plot because region is outside chromosome?
"""Compare locations of known genes, recurrent regions from RUBIC and recurrent regions from GISTIC2.
Use 'get_list_genes_circos' as input for a rule (e.g. rule all) to produce a plot for each known gene.'"""
input:
gistic="Reports/Circos/Zoom/GISTIC.txt", #all gistic regions
rubic="Reports/Circos/Zoom/RUBIC.txt", #all rubic regions
genes="Reports/Circos/Zoom/Genes.txt",
known="Reports/Locations_known_genes.bed", #use locations known genes to extract list of known genes and run rule for each gene maybe also not overlapping, but region around gene
list_genes = lambda
gistic="Circos/GISTIC_results.txt",
rubic="Circos/RUBIC_results.txt",
genes="Circos/known_genes.txt",
census="Circos/census_genes.txt",
known_genes="Reports/Locations_known_genes.bed",
overlap="Reports/Overlap_known_genes.bed"
output:
plot="Circos/KnownGenes/{gene}.png"
params:
known="Reports/Locations_known_genes.bed",
genes="Reports/Circos/Zoom/Genes.txt",
conf=workflow.basedir + "/scripts/circos/circos_zoom.conf",
chrom=get_plot_region(wildcards.gene, input.known)
output:
plot=expand("Reports/Overlap_plots/{gene}.png", gene=get_list_genes(params.genes, params.known))
conda:
workflow.basedir + "/envs/circos.yaml"
shell:
"circos -conf {params.conf} -outputfile {output.plot} -param gistic_file={input.gistic} -param rubic_file={input.rubic} \
-param gene_file={input.genes} -param chrom={params.chrom}"
#chrom=lambda wildcards, input: get_plot_region(wildcards.gene, input.known_genes, input.overlap),
#units=lambda wildcards, input: get_plot_units(wildcards.gene, input.known_genes, input.overlap)
# conda:
# workflow.basedir + "/envs/circos.yaml"
run:
chrom, units = get_plot_region(wildcards.gene, input.known_genes, input.overlap)
shell("circos -conf {params.conf} -outputfile {output.plot} -param gistic_file={input.gistic} -param rubic_file={input.rubic} \
-param gene_file={input.genes} -param census_file={input.census} -param chrom=" + chrom + " -param units=" + str(units))
......@@ -17,8 +17,7 @@ rule report_tools:
genes_gistic="Reports/Genes_GISTIC2.txt",
genes_rubic="Reports/Genes_RUBIC.txt",
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png",
bed_known="Reports/Locations_known_genes.bed"
size_plot="Reports/Comparison_sizes.png"
params: #select input files from repository or own input files
census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"],
known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"],
......@@ -27,9 +26,9 @@ rule report_tools:
run:
ReportTools.make_report(input.gistic, input.rubic_gain, input.rubic_loss,
params.census, params.known, params.ref, params.biomart_info,
output.tools, output.table_regions, output.venn, output.swarmplot,
output.tools, output.table_regions, output.venn, output.size_plot,
output.genes_both, output.genes_gistic, output.genes_rubic,
input.overlap, output.bed_known)
input.overlap)
rule get_overlap_GISTIC_RUBIC:
"""Intersect the recurrent regions detected by RUBIC and GISTIC2.0."""
......
def get_settings(nr_settings, all_settings):
print(nr_settings)
print(all_settings)
print(list(range(len(config["comparison_settings"]))))
from ParseResults import parse_regions, get_stats
from ReportTools import make_tool_report
rule gistic_settings:
"""Run GISTIC2 based on different settings."""
"""Run GISTIC2 using different settings."""
input:
gistic_directory=os.path.join(config["gisticdir"], "gistic2"),
seg="Input/Segments_tumor.txt",
lambda wildcards: config["comparison_settings"][wildcards.setting]
output:
expand("Settings/GISTIC_{setting_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", setting_nr=range(len(config["comparison_settings"]))),
"Settings/GISTIC_{setting_nr}/regions_track.conf_" + config["gistic_precision"] + ".bed"
"Settings/GISTIC.{setting}_{precision}/all_lesions.conf_{precision}.txt",
"Settings/GISTIC.{setting}_{precision}/regions_track.conf_{precision}.bed"
params:
cnv="",
ref=config["reference"],
ref_file="",
extra="wildcards.setting",
confidence=config["gistic_precision"]
extra=lambda wildcards: config["comparison_settings"][int(wildcards.setting)], #get_settings(wildcards.setting),
confidence=lambda wildcards: wildcards.precision
wrapper:
"file:" + workflow.basedir + "/wrappers/GISTIC2"
def get_list_settings(wildcards):
"""Extract list of known genes to produce list of file names to save CIRCOS plot for each gene"""
file_names = []
for i in range(len(config["comparison_settings"])):
for precision in config["comparison_precision"]:
file_name = "Settings/GISTIC." + str(i) + "_" + precision + "/all_lesions.conf_" + precision + ".txt"
file_names.append(file_name)
return file_names
def get_settings(setting):
"""Extract setting to use based on wildcard.setting, which is a number."""
list_settings = config["comparison_settings"]
return list_settings[int(setting)]
rule compare_settings:
input:
"Settings/GISTIC_{setting_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt"
get_list_settings
output:
"Settings/Report.txt"
report="Settings/Report.txt"
params:
settings=config["comparison_settings"],
census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"],
known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"],
ref=config["reference"]
run:
with open(output[0], 'w') as out:
out.write(input[0])
SettingReport(input, output.report, params.settings, params.known, params.census, params.ref)
def SettingReport(setting_results, report_file, settings, known_genes, census_genes, ref_genome):
stats = []
legend = []
print(setting_results)
for result in setting_results:
file_ID = result.split("GISTIC.")[1].split("/all_lesions")[0]
setting_nr, precision = file_ID.split("_")
used_setting = settings[int(setting_nr)]
parsed_results = parse_regions(result, known_genes, census_genes, 'GISTIC', ref_genome)
stats_results = get_stats(parsed_results, file_ID)
stats.append(stats_results)
legend.append([setting_nr, used_setting])
make_tool_report(report_file, stats)
with open(report_file, 'a') as out:
out.write("\n\nLegend:\n")
for nr in legend:
out.write("\t".join(nr) + "\n")
#& bed file with all regions
......@@ -4,7 +4,7 @@ rule do_GO_analysis:
gene_list="Reports/Genes_{tool}.txt"
output:
table="GO/Enriched_GOs_{tool}.txt", #Add gene names to table
plot="GO/Enriched_GOs_{tool}.jpg"
plot="GO/Enriched_GOs_{tool}.png"
params:
organism="hsapiens", #default is human
ontology="BP", #MF, BP, CC or all
......@@ -17,7 +17,7 @@ rule do_GO_analysis:
rule compare_enriched_GOs:
"""Compare the top 50 GO terms detected by RUBIC and GISTIC2.0"""
input:
go=expand("GO/Enriched_GOs_{tool}.txt", tool=["GISTIC2", "RUBIC"])
go=expand("GO/Enriched_GOs_{tool}.txt", tool=["GISTIC2", "RUBIC", "both"])
output:
"GO/comparison.txt"
run:
......
......@@ -37,15 +37,16 @@ rule define_input_pipeline:
inputfile=config["inputfile"],
normalfile=config["normal"]
run:
if input[0] == params.inputfile: #use provided input file
if input[0] == params.inputfile: #use provided input file
shell("cp {params.inputfile} {output.tumor}")
if config["normal"] != "":
shell("cp {params.normalfile} {output.normal}")
else:
shell("touch {output.normal}")
else: #split firehose data in tumor and normal files.
else: #split firehose data in tumor and normal files.
split_normal_tumor(input[0], output.tumor, output.normal)
rule report_segmentation_file:
"""Report information on the input segmentation file."""
input:
......@@ -67,13 +68,16 @@ def split_normal_tumor(all_samples, out_tumor, out_normal):
normal.write(header)
for line in old:
sample = line.split("\t")[0]
type_ID = sample.split("-")[3] #.strip("A")
type_ID = sample.split("-")[3]
if type_ID == "06A": #TM: Metastatic
tumor.write(line)
#tumor.write(line)
normal.write(line)
elif type_ID == "01A": #TP: Primary Solid Tumor
tumor.write(line)
pass
#tumor.write(line)
elif type_ID == "10A" or type_ID == "11A": #NB: Blood Derived Normal or NT: Solid Tissue Normal
normal.write(line)
#normal.write(line)
tumor.write(line)
else:
raise ValueError("Unkonwn sample type: " + type_ID + \
"\nPlease check samples report: http://gdac.broadinstitute.org/runs/stddata__latest/samples_report/")
......@@ -20,7 +20,7 @@ rule run_RUBIC:
out_plots="RUBIC/plots"
params:
fdr="0.25",
genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["bimart_genes"]
genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["biomart_genes"]
benchmark:
"Benchmarks/RUBIC." + str(datetime.datetime.now()).replace(" ", "_") + ".txt"
wrapper:
......@@ -32,5 +32,7 @@ rule make_bed_file_RUBIC:
losses="RUBIC/losses.txt"
output:
bed="RUBIC/regions_track.bed"
params:
ref=config["reference"]
run:
BedFile(input.gains, input.losses, output.bed)
BedFile(input.gains, input.losses, output.bed, params.ref)
......@@ -2,7 +2,7 @@ import ReportSizes
from SampleSizes import SegFile
import os.path
import datetime
from AUC import ROC_curve
from PrecisionRecall import plot_PrecisionRecall
rule get_segmentation_files_subsets:
"""Create segmentation files with different numbers of samples (randomly chosen) for a number of times."""
......@@ -41,7 +41,7 @@ rule run_RUBIC_subsets:
out_plots="Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/plots"
params:
fdr="0.25",
genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["bimart_genes"]
genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["biomart_genes"]
benchmark:
"Benchmarks/RUBIC." + str(datetime.datetime.now()).replace(" ", "_") + ".txt"
conda:
......@@ -106,7 +106,7 @@ rule compare_subset_truth_RUBIC:
"bedtools intersect -a {input.bed_subsets} -b {input.bed_truth} -wao > {output.subset} && \
bedtools intersect -a {input.bed_truth} -b {input.bed_subsets} -wao > {output.truth}"
rule make_ROC_plot:
rule make_Precision_Recall_plot:
"""Make ROC plot on the precision and recall from the subsets using GISTIC and RUBIC."""
input:
gistic_subset=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/Overlap_subset_truth.bed", rand_nr=config["sizes"], rep_nr=config["repeats"]),
......@@ -114,8 +114,9 @@ rule make_ROC_plot:
rubic_subset=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/Overlap_subset_truth.bed", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_truth=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/Overlap_truth_subset.bed", rand_nr=config["sizes"], rep_nr=config["repeats"]),
output:
AUC="Samplesize/Precision_recall.png"
plot="Samplesize/Precision_recall.png",
plot_avg="Samplesize/Precision_recall_avg.png"
params:
sizes=config["sizes"]
run:
ROC_curve(input.gistic_subset, input.gistic_truth, input.rubic_subset, input.rubic_truth, output.AUC, params.sizes)
plot_PrecisionRecall(input.gistic_subset, input.gistic_truth, input.rubic_subset, input.rubic_truth, output.plot, output.plot_avg, params.sizes)
import ReportControl
rule run_GISTIC_control:
"""Run GISTIC2 for the tumor segmentation data with data from control samples included."""
input:
gistic_directory=os.path.join(config["gisticdir"], "gistic2"),
seg="Input/Segments_tumor.txt"
output:
"Control/all_lesions.conf_" + config["gistic_precision"] + ".txt",
"Control/regions_track.conf_" + config["gistic_precision"] + ".bed"
params:
cnv="Input/Segments_normal.txt",
ref=config["reference"],
ref_file="",
extra="",
confidence=config["gistic_precision"]
wrapper:
"file:" + workflow.basedir + "/wrappers/GISTIC2"
rule report_control:
"""Report the differences between using a control and without using a control."""
input:
control="Control/",
nocontrol="GISTIC2/"
output:
"Reports/Control.txt"
params:
census=config["census_genes"],
known=config["prev_found_genes"],
ref=config["reference"]
run:
ReportControl.make_report(input.control, input.nocontrol, output[0], params.census, params.known, params.ref)
......@@ -28,7 +28,7 @@ class InputCircos:
recurrent.readline()
for line in recurrent:
lineparts = line.split("\t")
loc = lineparts[3].split("(probes")[0]
loc = lineparts[2].split("(probes")[0]
chrom, bp = loc.split(":")
start, end = bp.split("-")
chrom = chrom.strip("chr")
......@@ -57,10 +57,11 @@ def make_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png):
gistic = mpatches.Patch(color='#5975A4', label='GISTIC2.0 regions')
rubic = mpatches.Patch(color='#5F9E6E', label='RUBIC regions')
legend = plt.legend(handles=[gains, losses, gistic, rubic], loc=3, framealpha=1, frameon=False)
plt.axis('off')
fig = legend.figure
fig.canvas.draw()
bbox = legend.get_window_extent().transformed(fig.dpi_scale_trans.inverted())
fig.savefig(legend_png, dpi=400, bbox_inches=bbox)
fig.savefig(legend_png, dpi=300, bbox_inches=bbox)
concat_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png)
def concat_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png):
......@@ -73,20 +74,72 @@ def concat_CIRCOS_legend(CIRCOS_png, legend_png, concatted_png):
circos.paste(legend, offset)
circos.save(concatted_png)
def bed_to_circos(bed_file, circos_file):
"""Convert bed file to CIRCOS input file."""
with open(circos_file, 'w') as out:
with open(bed_file, 'r') as bed:
bed.readline()
for line in bed:
chrom, start, end, gene_name = line.strip().split("\t")
chrom = 'hs' + chrom.strip("chr")
out.write(" ".join([chrom, start, end, gene_name]) + "\n")
def bed_to_circos(bed_file, rubic_file, gistic_file, gene_file):
"""Convert bed file to CIRCOS input files."""
rubic = open(rubic_file, 'w')
gistic = open(gistic_file, 'w')
genes = open(gene_file, 'w')
with open(bed_file, 'r') as bed:
for line in bed:
gene_chrom, gene_start, gene_end, gene_name, tool_name, tool_chrom, tool_start, tool_end, \
amp_name, overlap_bp = line.split("\t")
chrom = 'hs' + gene_chrom.strip("chr")
genes.write(" ".join([chrom, gene_start, gene_end]) + "\n")
if tool_name == 'GISTIC':
gistic.write(" ".join([chrom, tool_start, tool_end]) + "\n")
else:
rubic.write(" ".join([chrom, tool_start, tool_end]) + "\n")
rubic.close(), gistic.close(), genes.close()
def get_plot_region(gene_name, locations_known_genes, overlap_tools):
"""Define which region to plot around the known gene, based on size of gene and overlapping regions."""
plot_region, units = calculate_region_units(gene_name, locations_known_genes, overlap_tools)
return plot_region, units
def calculate_region_units(gene_name, locations_known_genes, overlap_tools):
plot_region = "hs1:0-5000000" #unknown gene in input file
units = 1000000
with open(locations_known_genes, 'r') as known_genes:
known_genes.readline()
plot_region = ''
for line in known_genes:
if gene_name == line.strip().split("\t")[3]:
chrom, start, end, name = line.strip().split("\t")
chrom_region = "hs" + chrom.strip("chr")
start_region, end_region = define_region(overlap_tools, chrom, start, end)
size_region = end_region - start_region
units = round(size_region / 5, -2) #choose units based on size difference of 5
plot_end = round(end_region / units, 1)
plot_start = round(start_region / units, 1)
plot_region = chrom_region + ":" + str(plot_start) + "-" + str(plot_end)
if plot_region == '' or "_" in plot_region:
plot_region = "hs1:0-5000000"
return plot_region, int(units)
def define_region(overlap_tools, chrom, start, end):
"""Make plot region wider when overlapping recurrent region is larger to examine how precise region was called.
Positions should be in million basepairs (change chromosome_units in circos_zoom.conf to alter this)."""
gistic_start, gistic_end = check_overlap(overlap_tools, 'GISTIC', chrom, start, end)
rubic_start, rubic_end = check_overlap(overlap_tools, 'RUBIC', chrom, start, end)
if gistic_start == 'N/A' and gistic_end == 'N/A' and rubic_start == 'N/A' and rubic_end == 'N/A':
start_region = int(start) - 10000000
end_region = int(end) + 10000000
else:
start_region = int(start)
end_region = int(end)
if gistic_start != 'N/A' and int(gistic_start) < int(start):
start_region = int(gistic_start)
if rubic_start != 'N/A' and int(rubic_start) < int(start_region):
start_region = int(rubic_start)
if gistic_end != 'N/A' and int(gistic_end) > int(end_region):
end_region = int(gistic_end)
if rubic_end != 'N/A' and int(rubic_end) > int(end_region):
end_region = int(rubic_end)
size_region = end_region - start_region
start_region = start_region - int(0.2 * size_region)
end_region = end_region + int(0.2 * size_region)
return start_region, end_region
def check_overlap(overlap_file, tool_name, chrom, start, end):
out_start, out_end = 'N/A', 'N/A'
with open(overlap_file, 'r') as overlap:
for line in overlap:
gene_chrom, gene_start, gene_end, gene_name, tool, tool_chrom, tool_start, tool_end, tool_ID, tool_bp = line.strip().split("\t")
if tool == tool_name:
if gene_chrom == chrom and gene_start == start and gene_end == end:
out_start, out_end = tool_start, tool_end
return out_start, out_end
This diff is collapsed.
......@@ -2,23 +2,28 @@ import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
def ROC_curve(gistic_subset, gistic_truth, rubic_subset, rubic_truth, plot_file, list_sizes):
def plot_PrecisionRecall(gistic_subset, gistic_truth, rubic_subset, rubic_truth, plot_file, plot_avg_file, list_sizes):
plot_recall, plot_precision, plot_avg_recall, plot_avg_precision = [], [], [], []
for tool in (gistic_subset, gistic_truth), (rubic_subset, rubic_truth):
list_recall, list_precision = get_list_rates(tool[0], tool[1], list_sizes)
plot_recall.append(list_recall) #avg_recall)
plot_precision.append(list_precision) #.append(#avg_precision)
print(list_recall)
print(list_precision)
plot_recall.append(list_recall)
plot_precision.append(list_precision)
avg_recall, avg_precision = calculate_averages(list_recall, list_precision)
plot_avg_recall.append(avg_recall) #avg_recall)
print(avg_recall)
print(avg_precision)
plot_avg_recall.append(avg_recall)
plot_avg_precision.append(avg_precision)
plot_ROC(plot_recall, plot_precision, plot_file, list_sizes, False)
plot_ROC(plot_avg_recall, plot_avg_precision, plot_file + "_avg.png", list_sizes, True)
make_plot(plot_recall, plot_precision, plot_file, list_sizes, False)
make_plot(plot_avg_recall, plot_avg_precision, plot_avg_file, list_sizes, True)
def get_list_rates(subset_files, truth_files, list_sizes):
list_recall = [[] for i in range(len(list_sizes))]
list_precision = [[] for i in range(len(list_sizes))]
for i in range(len(subset_files)):
size_file = subset_files[i].split("Size")[1].split(".Rep")[0]
print(size_file)
recall, precision = calculate_precision_recall(subset_files[i], truth_files[i])
for size in range(len(list_sizes)):
if str(list_sizes[size]) == size_file:
......@@ -36,8 +41,8 @@ def calculate_precision_recall(bed_subset, bed_truth):
line = region.split("\t")
if line[4] == '.':
FP += 1
else:
TP += 1
#else:
# TP += 1
with open(bed_truth, 'r') as truth: #recall
for region in truth:
if region.startswith("track"):
......@@ -46,8 +51,8 @@ def calculate_precision_recall(bed_subset, bed_truth):
line = region.split("\t")
if line[4] == '.':
FN += 1