Commit 7c36cd45 authored by Beatrice Tan's avatar Beatrice Tan

Added settings and table with all recurrent regions.

parent 331a4061
......@@ -37,11 +37,11 @@ onerror:
rule all:
"""Define desired output from pipeline."""
input:
"Reports/Circos/RecurrentRegions.png"
#"Reports/Circos/RecurrentRegions.png"
#"GO/comparison.txt",
#"Input/Segments_tumor.txt",
#"Reports/Control.txt",
#"Reports/Tools.txt",
"Reports/Tools.txt",
#"Reports/Results.html",
#"Reports/Control.txt",
......
......@@ -16,6 +16,8 @@ prev_found_genes: input_files/intogen-CM-drivers-data.tsv
census_genes: input_files/Census_genes.txt
biomart_genes: input_files/biomart_human_genes.tsv
precision: '99'
#Settings for sample size differences
sizes: [20, 30, 40, 50, 60, 70, 80, 90]
repeats: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
......@@ -17,7 +17,7 @@ rule run_gistic:
gistic_directory=config["gisticdir"],
seg="Input/Segments_tumor.txt"
output:
"GISTIC/"
"GISTIC/all_lesions.conf_" + config["precision"] + ".txt"
params:
cnv="",
ref=config["reference"],
......@@ -85,24 +85,29 @@ rule make_circos:
rule report_tools:
"""Report the differences in calls between GISTIC2 and RUBIC."""
input:
gistic="GISTIC/",
rubic="RUBIC/gains.txt"
gistic="GISTIC/all_lesions.conf_" + config["precision"] + ".txt",
rubic_gain="RUBIC/gains.txt",
rubic_loss="RUBIC/losses.txt"
output:
tools="Reports/Tools.txt",
table_regions="Reports/Recurrent_regions.txt",
genes_both="Reports/Genes_both.txt",
genes_gistic="Reports/Genes_GISTIC2.txt",
genes_rubic="Reports/Genes_RUBIC.txt",
overlap="Reports/Overlap_genes.txt",
#overlap="Reports/Overlap_genes.txt",
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png"
params: #select input files from repository or own input files
census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"],
known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"],
gene_info=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["biomart_genes"],
biomart_info=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["biomart_genes"],
ref=config["reference"]
run:
ReportTools(input.gistic, input.rubic, params.census, params.known, params.gene_info, params.ref,
output.tools, output.venn, output.genes_both, output.genes_gistic, output.genes_rubic, output.overlap, output.swarmplot)
ReportTools(input.gistic, input.rubic_gain, input.rubic_loss,
params.census, params.known, params.ref, params.biomart_info,
output.tools, output.table_regions, output.venn, 'overlap', #output.overlap,
output.swarmplot,
output.genes_both, output.genes_gistic, output.genes_rubic)
rule report:
......
This diff is collapsed.
......@@ -7,7 +7,7 @@ import seaborn as sns
from scipy.stats import ttest_ind
import os.path
import pyensembl
from ParseResults import parse, stats
from ParseResults import parse, get_stats
from collections import OrderedDict
def install_ensembl(reference_genome):
......@@ -88,37 +88,33 @@ class ReportSegmentation:
class ReportTools:
"""Make a report and gene file from the analyses with GISTIC2 and RUBIC"""
def __init__(self, gistic_dir, rubic_gain_file, census_genes, known_genes, gene_file, ref_genome, file_tools,
file_venn, file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_overlap, file_swarmplot): #double brackets? ((...))
rubic_dir = rubic_gain_file.split("/gains.txt")[0] #get diretory with rubic results from gain file path.
def __init__(self, gistic_results, rubic_gain_results, rubic_loss_results,
census_genes, known_genes, ref_genome, biomart_file,
file_tools, file_regions, file_venn, file_overlap, file_swarmplot,
file_genes_both, file_genes_GISTIC, file_genes_RUBIC):
install_ensembl(ref_genome)
parsed, stats = self.calculate_stats(gistic_dir, rubic_dir, file_tools, census_genes, known_genes)
self.overlap_most_sign(parsed)
self.make_tool_report(file_tools, stats)
parsed, stats = [], []
for tool in 'GISTIC', 'RUBIC':
results = gistic_results if tool == 'GISTIC' else [rubic_gain_results, rubic_loss_results]
parsed_results = parse(results, known_genes, census_genes, tool)
stats_results = get_stats(parsed_results, tool)
parsed.append(parsed_results)
stats.append(stats_results)
self.make_files(parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
file_venn, file_swarmplot, file_regions)
def make_files(self, parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
file_venn, file_swarmplot, file_regions):
#self.overlap_most_sign(parsed)
self.table_regions(parsed, file_regions)
self.make_tool_report(file_tools, stats)
all_genes = self.make_genefiles(parsed, file_genes_GISTIC, file_genes_RUBIC)
self.make_genefile_overlap(all_genes, file_genes_both)
self.venn_overlap(all_genes, file_venn)
self.compare_known_regions(known_genes, file_overlap, gene_file, gistic_dir, rubic_dir)
#self.compare_known_regions(known_genes, file_overlap, biomart_file,
# gistic_results, rubic_gain_results, rubic_loss_results)
self.make_swarmplot_sizes(parsed, file_swarmplot)
def calculate_stats(self, gistic_dir, rubic_dir, file_tools, census_genes, known_genes):
"""Calculate the stats to write to the tool report."""
parsed_tools, stats_tools = [], []
for tool in "GISTIC2", "RUBIC":
if tool == "GISTIC2":
parsed_results = parse().gistic_results(gistic_dir)
else:
parsed_results = parse().rubic_results(rubic_dir + '/gains.txt', rubic_dir + '/losses.txt')
parsed_tools.append(parsed_results)
stats_results = stats().calculate_stats(parsed_results, census_genes, known_genes, tool)
stats_tools.append(stats_results)
#print(parsed_tools)
#print(stats_tools)
return parsed_tools, stats_tools
def make_tool_report(self, file_tools, stats_tools):
"""Make a report of the results from GISTIC and RUBIC."""
row_names = ["Tool", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Regions with census genes", "Regions with known genes"]
......@@ -128,16 +124,33 @@ class ReportTools:
list_out = list(map(str, list_out))
out.write("\t".join(list_out) + "\n")
def table_regions(self, parsed_regions, file_regions):
"""Make a table with all recurent regions."""
with open(file_regions, 'w') as out:
header = ["Method", "Chr", "Start", "End", "Type", "Negative log10 q-value", "Known genes", "Census genes", "All genes"]
out.write("\t".join(header) + "\n")
tool_name = "GISTIC2"
for tool in parsed_regions:
for cnv in tool:
out.write(tool_name + "\t")
cnv_type = "Amplification" if cnv[3] == "amp" else "Deletion"
begin_row = cnv[0:3] + [cnv_type, str(cnv[4])]
out.write("\t".join(begin_row))
for list_genes in range(5,8):
out.write("\t" + ", ".join(cnv[list_genes]))
out.write("\n")
tool_name = "RUBIC"
def overlap_most_sign(self, parsed_results):
"""Examine whether most significant regions (highest log10(qvalue)) are also found by other tool."""
for tool in parsed_results:
tool_qvals = {}
for cnv in tool.keys():
qval = tool[cnv][4]
for cnv in tool:
qval = cnv[4]
if type(qval) == tuple: #rubic dict
qval = qval[0]
qval = float(qval)
tool_qvals[qval] = tool[cnv]
tool_qvals[qval] = cnv
sorted_qvals = sorted(tool_qvals, reverse=True)
print(sorted_qvals)
for top in range(0,10):
......@@ -148,14 +161,14 @@ class ReportTools:
def make_genefiles(self, parsed_results, out_GISTIC, out_RUBIC):
"""Make files with all genes reported by GISTIC2 and RUBIC."""
genes_all = []
for dict_tool in parsed_results:
for tool_results in parsed_results:
genes_tool = []
for cnv in dict_tool.keys():
genes = dict_tool[cnv][5]
for cnv in tool_results:
genes = cnv[7]
genes_tool += genes
genes_tool = list(set(genes_tool)) #get unique genes
genes_all.append(genes_tool)
out_file = out_GISTIC if dict_tool == parsed_results[0] else out_RUBIC
out_file = out_GISTIC if tool_results == parsed_results[0] else out_RUBIC
with open(out_file, 'w') as out:
out.write("\n".join(genes_tool))
return genes_all
......@@ -165,16 +178,15 @@ class ReportTools:
with open(out_both, 'w') as overlap_file:
overlap_file.write("\n".join(list(overlap)))
def make_swarmplot_sizes(self, list_dicts, out_plot):
def make_swarmplot_sizes(self, parsed_results, out_plot):
"""Make swarmplot of the sizes of recurrent regions detected by either GISTIC2 or RUBIC."""
sizes, sizes_zoomed = [], []
tool = 'GISTIC2'
for dict_tool in list_dicts:
for tool_results in parsed_results:
amp_sizes, del_sizes = [], []
for cnv in dict_tool.keys():
size = int(dict_tool[cnv][2]) - int(dict_tool[cnv][1])
cnv_type = dict_tool[cnv][3]
cnv_label = 'Amplification' if cnv_type == 'amp' else 'Deletion'
for cnv in tool_results:
size = int(cnv[2]) - int(cnv[1])
cnv_label = 'Amplification' if cnv[3] == 'amp' else 'Deletion'
sizes += [(size, tool, cnv_label)]
if size < 900000:
sizes_zoomed += [(size, tool, cnv_label)]
......@@ -200,7 +212,7 @@ class ReportTools:
plt.savefig(out_venn)
plt.close()
def compare_known_regions(self, known_genes, overlap_file, gene_file, gistic_results, rubic_results):
def compare_known_regions(self, known_genes, overlap_file, gene_file, gistic_results, rubic_gain_results, rubic_loss_results):
"""Compare the regions containing known driver genes"""
top_genes = []
top_genes = parse().gene_file(known_genes, 10)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment