Commit 2d44810b authored by Tan's avatar Tan

Merged.

parents 0c65624f 90670433
......@@ -14,7 +14,7 @@ include: "rules/GenePrioritization.smk"
#Rules to compare different inputs.
include: "rules/ComparisonRegions.smk"
include: "rules/Circos.yaml"
include: "rules/Circos.smk"
include: "rules/SampleSizes.smk"
include: "rules/UseControl.smk"
......@@ -43,8 +43,6 @@ rule all:
"""Define desired output from pipeline."""
input:
"Samplesize/Report.txt"
#"Reports/Results.html"
rule help:
"""Print list of all targets with help."""
......@@ -65,7 +63,8 @@ rule report:
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png",
circos="Reports/Circos/RecurrentRegions.png",
circos_legend="Reports/Circos/RecurrentRegions_legend.png"
circos_legend="Reports/Circos/RecurrentRegions_legend.png",
known_genes="Reports/Overlap_known_genes.bed"
output:
html="Reports/Results.html"
run:
......
from Circos import InputCircos
from Circos import InputCircos, bed_to_circos
rule circos_input:
"""Make input files for making a circos diagram."""
......@@ -60,3 +60,31 @@ rule add_legend_circos:
offset = ((c_w - l_w), 0)
circos.paste(legend, offset)
circos.save(output[0])
rule circos_input_zoom:
    """Make input files for making a circos diagram."""
    # Splits the known-gene overlap table into one Circos highlight file per
    # track (GISTIC regions, RUBIC regions, gene locations).
    input:
        # Same path as the output of rule bed_known_genes.
        bed="Reports/Overlap_known_genes.bed"
    output:
        gistic="Reports/Circos/Zoom/GISTIC.txt",
        rubic="Reports/Circos/Zoom/RUBIC.txt",
        genes="Reports/Circos/Zoom/Genes.txt",
    run:
        # Argument order of bed_to_circos is (bed, rubic, gistic, genes), so
        # rubic is passed before gistic although it is declared after it above.
        bed_to_circos(input.bed, output.rubic, output.gistic, output.genes)
rule make_circos_zoom:
    """Compare locations of known genes, recurrent regions from RUBIC and recurrent regions from GISTIC2."""
    # Draws one Circos image from the three highlight files written by
    # rule circos_input_zoom.
    input:
        gistic="Reports/Circos/Zoom/GISTIC.txt",
        rubic="Reports/Circos/Zoom/RUBIC.txt",
        genes="Reports/Circos/Zoom/Genes.txt",
    output:
        # NOTE(review): the file name is fixed to chromosome 12, matching the
        # hard-coded `chrom` parameter below.
        plots="Reports/Overlap_plots/12.png"
    params:
        # Positional entry, referenced as {params[0]} in the shell command.
        workflow.basedir + "/scripts/circos/circos_zoom.conf",
        chrom='hs12'
    conda:
        workflow.basedir + "/envs/circos.yaml"
    shell:
        # Each `-param name=value` is handed to circos on the command line;
        # presumably circos_zoom.conf reads them back as conf(name) - verify.
        # The second line continues the same string literal, so it is kept
        # unindented to leave the command text unchanged.
        "circos -conf {params[0]} -outputfile {output[0]} -param gistic_file={input.gistic} -param rubic_file={input.rubic} \
-param gene_file={input.genes} -param chrom={params.chrom}"
......@@ -9,7 +9,7 @@ rule report_tools:
gistic="GISTIC/all_lesions.conf_" + config["gistic_precision"] + ".txt",
rubic_gain="RUBIC/gains.txt",
rubic_loss="RUBIC/losses.txt",
overlap="Reports/Overlap_regions.bed"
overlap="Reports/Regions_overlapping_other_tool.bed"
output:
tools="Reports/Tools.txt",
table_regions="Reports/Recurrent_regions.txt",
......@@ -18,7 +18,7 @@ rule report_tools:
genes_rubic="Reports/Genes_RUBIC.txt",
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png",
pvals="Reports/Overlap_pvalues.png"
bed_known="Reports/Locations_known_genes.bed"
params: #select input files from repository or own input files
census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"],
known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"],
......@@ -29,7 +29,7 @@ rule report_tools:
params.census, params.known, params.ref, params.biomart_info,
output.tools, output.table_regions, output.venn, output.swarmplot,
output.genes_both, output.genes_gistic, output.genes_rubic,
input.overlap, output.pvals)
input.overlap, output.bed_known)
rule bed_intersect:
"""Intersect the recurrent regions detected by RUBIC and GISTIC2.0."""
......@@ -37,28 +37,21 @@ rule bed_intersect:
gistic="GISTIC/regions_track.conf_" + config["gistic_precision"] + ".bed",
rubic="RUBIC/regions_track.bed"
output:
"Reports/Overlap_regions.bed"
"Reports/Regions_overlapping_other_tool.bed"
conda:
workflow.basedir + "/envs/bedtools.yaml"
shell:
"bedtools intersect -a {input.gistic} -b {input.rubic} -wo > {output}"
def get_regions(bed_file):
    """Return a "<chrom>.<start>" plot name for every region in a BED file.

    The first line of the file is treated as a header and skipped. Blank
    lines are ignored, and the line terminator is removed before splitting
    so that a two-column line does not leave a newline inside the name.

    Args:
        bed_file: path to a tab-separated file whose first two columns are
            chromosome and start position.

    Returns:
        List of strings, one per data line, in file order.
    """
    plot_names = []
    with open(bed_file, 'r') as bed:
        next(bed, None)  # skip the header line; tolerate an empty file
        for line in bed:
            if not line.strip():
                continue
            chrom, start = line.rstrip("\r\n").split("\t")[0:2]
            plot_names.append(chrom + "." + start)
    return plot_names
#rule compare_regions:
# """Compare locations of known genes, recurrent regions from RUBIC and recurrent regions from GISTIC2."""
# input:
# overlap="Reports/Overlap_regions.bed"
# params:
# known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"]
# output:
# plots=expand("Reports/Overlap_plots/{region}.png", region=get_regions(input.overlap))
# shell:
# "R {workflow.basedir}/scripts/plot_regions.R"
rule bed_known_genes:
    """Intersect known genes and recurrent regions detected by RUBIC and GISTIC2.0."""
    input:
        # Same path as the `bed_known` output of rule report_tools.
        known="Reports/Locations_known_genes.bed",
        gistic="GISTIC/regions_track.conf_" + config["gistic_precision"] + ".bed",
        rubic="RUBIC/regions_track.bed"
    output:
        # Read by rule circos_input_zoom (and listed as input of rule report).
        "Reports/Overlap_known_genes.bed"
    conda:
        workflow.basedir + "/envs/bedtools.yaml"
    shell:
        # The -names labels follow the order of the -b files (GISTIC first,
        # RUBIC second); bed_to_circos branches on the 'GISTIC' label.
        # -wo also reports the overlap size, the last column that
        # bed_to_circos unpacks as overlap_bp.
        "bedtools intersect -a {input.known} -b {input.gistic} {input.rubic} -names GISTIC RUBIC -wo > {output}"
......@@ -52,8 +52,13 @@ rule report_sizes:
"""Report the difference when using different sample sizes."""
input:
gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
output:
report="Samplesize/Report.txt",
plots="Samplesize/Plots/"
......
......@@ -45,3 +45,19 @@ class InputCircos:
qval = str(max([float(left_q), float(right_q)]))
out_line = ["hs" + chrom, start, end, qval]
out.write(" ".join(out_line) + "\n")
def bed_to_circos(bed_file, rubic_file, gistic_file, gene_file):
    """Split a gene/region overlap table into three Circos highlight files.

    Every non-blank line of ``bed_file`` must hold exactly ten tab-separated
    columns: gene chrom, gene start, gene end, gene name, tool name,
    tool chrom, tool start, tool end, region name and overlap size.
    For each line the gene location is written to ``gene_file`` and the
    region location to ``gistic_file`` when the tool name is ``'GISTIC'``,
    otherwise to ``rubic_file``. Output lines are space separated and use
    the gene's chromosome with a leading ``chr`` replaced by ``hs``.

    Args:
        bed_file: path of the overlap table to read.
        rubic_file: path of the RUBIC highlight file to write.
        gistic_file: path of the GISTIC highlight file to write.
        gene_file: path of the gene highlight file to write.

    Raises:
        ValueError: if a non-blank line does not have exactly ten columns.
    """
    # Context managers close all four handles even when a line fails to parse.
    with open(bed_file, 'r') as bed, \
            open(rubic_file, 'w') as rubic, \
            open(gistic_file, 'w') as gistic, \
            open(gene_file, 'w') as genes:
        for line in bed:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            gene_chrom, gene_start, gene_end, gene_name, tool_name, tool_chrom, tool_start, tool_end, \
                amp_name, overlap_bp = line.rstrip("\r\n").split("\t")
            # Remove only a leading "chr"; str.strip("chr") would strip any of
            # the characters c/h/r from both ends of the name.
            if gene_chrom.startswith("chr"):
                gene_chrom = gene_chrom[len("chr"):]
            chrom = 'hs' + gene_chrom
            genes.write(" ".join([chrom, gene_start, gene_end]) + "\n")
            if tool_name == 'GISTIC':
                gistic.write(" ".join([chrom, tool_start, tool_end]) + "\n")
            else:
                rubic.write(" ".join([chrom, tool_start, tool_end]) + "\n")
This diff is collapsed.
......@@ -6,19 +6,17 @@ import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats, install_ensembl
from ParseResults import parse_regions, get_stats
from collections import OrderedDict
def make_report(control_results, nocontrol_results, report_file, census_genes, known_genes, ref_genome):
"""Make a report on analyses using control samples or without using them."""
install_ensembl(ref_genome)
parsed_tools, stats_tools = [], []
with open(report_file, 'w') as out:
row_names = (["Control used?", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"])
for result in control_results, nocontrol_results:
parsed_results = parse().gistic_results(result)
parsed_results = parse_regions.gistic_results(result)
parsed_tools.append(parsed_results)
label = "Control" if result == control_results else "No control"
stats_results = stats().calculate_stats(parsed_results, census_genes, known_genes, label)
......
......@@ -7,7 +7,7 @@ import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats
from ParseResults import get_stats
from collections import OrderedDict
def make_report(segmentation_file, report_file):
......
......@@ -6,20 +6,18 @@ import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats, install_ensembl
from ParseResults import parse_regions, get_stats
from collections import OrderedDict
def make_report(size_results, census_genes, known_genes, reps, ref_genome, report_file, plot_dir):
"""Make a report of the results produced using input files with different sample sizes."""
install_ensembl(ref_genome)
with open(report_file, 'w') as out:
dict_stats = OrderedDict()
row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. regions with census genes", "Nr. regions with known genes", "Nr. genes"])
tool = GISTIC2
for size_file in size_results:
parsed_results = parse(size_file, known_genes, census_genes, tool)
parsed_results = parse_regions(size_file, known_genes, census_genes, tool)
size, repetition = size_file.split("/")[-1].split("x")
stats_results = get_stats(parsed_results, size)
if size not in dict_stats.keys():
......
This diff is collapsed.
# Circos configuration drawing three highlight tracks on the chromosome(s)
# named by the `chrom` parameter.
# conf(chrom), conf(gistic_file), conf(rubic_file) and conf(gene_file) are not
# defined in this file; they have to be supplied by the caller
# (e.g. `circos -param name=value`).
<<include etc/colors_fonts_patterns.conf>>
<<include ideogram.conf>>
<<include ticks.conf>>
<image>
<<include etc/image.conf>>
</image>
karyotype = data/karyotype/karyotype.human.txt
chromosomes_units = 1000000
# Only the chromosome(s) listed in `chromosomes` are drawn, because the
# default display is switched off on the next line.
chromosomes = conf(chrom)
chromosomes_display_default = no
<highlights>
z = 5
# GISTIC regions: ring between 0.7r and 0.9r, light grey.
<highlight>
file = conf(gistic_file)
r0 = 0.9r
r1 = 0.7r
fill_color = lgrey
</highlight>
# RUBIC regions: ring between 0.5r and 0.7r, light yellow.
<highlight>
file = conf(rubic_file)
r0 = 0.7r
r1 = 0.5r
fill_color = lyellow
</highlight>
# Gene locations: outermost ring between 0.9r and 1.0r, black.
<highlight>
file = conf(gene_file)
r0 = 1.0r
r1 = 0.9r
fill_color = black
</highlight>
</highlights>
<<include etc/housekeeping.conf>>
import pyensembl
ensembl = pyensembl.EnsemblRelease(75)
def install_ensembl(reference_genome):
    """Select the Ensembl release matching the reference genome and install it.

    Rebinds the module-level ``ensembl`` object, which the other helpers in
    this module query, to release 75 when ``"hg19"`` occurs in
    ``reference_genome`` and to release 87 when ``"38"`` occurs in it.
    Download and indexing of the release data are attempted on a
    best-effort basis: a failure is reported on stderr but not raised.

    Args:
        reference_genome: string naming the reference genome build.

    Raises:
        ValueError: if the string contains neither ``"hg19"`` nor ``"38"``.
    """
    import sys

    # Without this declaration the assignments below only create a local
    # variable and the module-level release never changes.
    global ensembl
    if "hg19" in reference_genome:
        ensembl = pyensembl.EnsemblRelease(75)
    elif "38" in reference_genome:
        ensembl = pyensembl.EnsemblRelease(87)
    else:
        raise ValueError("Unknown reference genome used.")
    try:  # only needed the first time, to install the ensembl version.
        ensembl.download()
        ensembl.index()
    except Exception as err:
        # Stay best-effort (e.g. no network but data already cached), but do
        # not swallow KeyboardInterrupt/SystemExit and do leave a trace.
        print("Warning: could not download/index Ensembl data: %s" % err,
              file=sys.stderr)
def genes_at_locus(chrom, start, end):
    """Return, as a list, what the Ensembl release reports for a locus.

    The values come from ``ensembl.gene_names_at_locus(chrom, start, end)``
    and are collected in the order that call yields them.
    NOTE(review): the previous docstring spoke of gene IDs, but the call used
    is the gene *name* lookup - confirm which one callers expect.
    """
    return list(ensembl.gene_names_at_locus(chrom, start, end))
def gene_name_to_ID(gene_names):
    """Translate gene names into gene IDs using the module-level ``ensembl``.

    For each name the first ID returned by
    ``ensembl.gene_ids_of_gene_name`` is kept. A name whose lookup fails is
    printed and left out, so the result can be shorter than the input.

    Args:
        gene_names: iterable of gene name strings.

    Returns:
        List of gene IDs for the names that could be resolved, in input order.
    """
    list_genes = []
    for gene in gene_names:
        try:
            gene_ID = ensembl.gene_ids_of_gene_name(gene)
            list_genes.append(gene_ID[0])
        except Exception:
            # Lookup failed or returned nothing: report the name and go on.
            # `Exception` rather than a bare except, so Ctrl-C still aborts.
            print(gene)
    return list_genes
......@@ -8,9 +8,9 @@ from snakemake.shell import shell
#Convert file locations to absolute paths
segments = os.path.abspath(snakemake.input.seg)
gistic_dir = os.path.abspath(snakemake.input.gistic_directory).split("gistic2")[0]
gistic_dir = os.path.abspath(snakemake.input.gistic_directory).split("/gistic2")[0]
outfolder = os.path.abspath(snakemake.output[0]).split("all_lesions")[0]
print(outfolder)
#Select reference file
ref = snakemake.params.get("ref", "")
ref_file = snakemake.params.get("ref_file", "")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment