Commit 7a860865 authored by Beatrice Tan's avatar Beatrice Tan
Browse files

Updated overlap function and added submit script.

parent 4592528e
......@@ -37,11 +37,11 @@ onerror:
rule all:
"""Define desired output from pipeline."""
input:
"Samplesize/Report.txt",
#"Samplesize/Report.txt",
#"Reports/Circos/RecurrentRegions.png"
#"GO/comparison.txt",
#"Reports/Control.txt",
#"Reports/Tools.txt",
"Reports/Tools.txt",
#"Reports/Results.html",
#"Reports/Control.txt",
......
......@@ -94,7 +94,7 @@ rule report_tools:
genes_both="Reports/Genes_both.txt",
genes_gistic="Reports/Genes_GISTIC2.txt",
genes_rubic="Reports/Genes_RUBIC.txt",
#overlap="Reports/Overlap_genes.txt",
overlap="Reports/Overlap_genes.txt",
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png"
params: #select input files from repository or own input files
......@@ -105,7 +105,7 @@ rule report_tools:
run:
ReportTools(input.gistic, input.rubic_gain, input.rubic_loss,
params.census, params.known, params.ref, params.biomart_info,
output.tools, output.table_regions, output.venn, 'overlap', #output.overlap,
output.tools, output.table_regions, output.venn, output.overlap,
output.swarmplot,
output.genes_both, output.genes_gistic, output.genes_rubic)
......
#! /bin/bash
#$ -S /bin/bash
#$ -q all.q
#$ -N prioritization
#$ -l h_vmem=1G
#$ -cwd
#$ -j Y
#$ -V
#$ -m be
#$ -M b.f.tan@lumc.nl
snakemake -p \
--configfile $@ \
--snakefile #/exports/sasc/dcats/snakemake/RNAseq_multi_method/Snakefile \
--latency-wait 90 \
# --drmaa " -N preprocessor -pe BWA {threads} -l h_vmem={resources.mem}G -q all.q -cwd -V" \
# --drmaa-log-dir $(pwd)/.cluster_logs \
--jobs 4 \
--max-jobs-per-second 10 \
--restart-times 3 \
--use-conda \
--conda-prefix #/exports/sasc/dcats/snakemake/.conda
......@@ -101,10 +101,10 @@ class ReportTools:
parsed.append(parsed_results)
stats.append(stats_results)
self.make_files(parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
file_venn, file_swarmplot, file_regions)
file_venn, file_swarmplot, file_regions, biomart_file, file_overlap)
def make_files(self, parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
file_venn, file_swarmplot, file_regions):
file_venn, file_swarmplot, file_regions, biomart_file, file_overlap):
#self.overlap_most_sign(parsed)
self.table_regions(parsed, file_regions)
self.make_tool_report(file_tools, stats)
......@@ -114,6 +114,7 @@ class ReportTools:
#self.compare_known_regions(known_genes, file_overlap, biomart_file,
# gistic_results, rubic_gain_results, rubic_loss_results)
self.make_swarmplot_sizes(parsed, file_swarmplot)
self.compare_overlapping_regions(parsed, biomart_file, file_overlap)
def make_tool_report(self, file_tools, stats_tools):
"""Make a report of the results from GISTIC and RUBIC."""
......@@ -141,21 +142,21 @@ class ReportTools:
out.write("\n")
tool_name = "RUBIC"
def overlap_most_sign(self, parsed_results):
"""Examine whether most significant regions (highest log10(qvalue)) are also found by other tool."""
for tool in parsed_results:
tool_qvals = {}
for cnv in tool:
qval = cnv[4]
if type(qval) == tuple: #rubic dict
qval = qval[0]
qval = float(qval)
tool_qvals[qval] = cnv
sorted_qvals = sorted(tool_qvals, reverse=True)
print(sorted_qvals)
for top in range(0,10):
top_cnv = tool_qvals[sorted_qvals[top]]
print(top_cnv)
# def overlap_most_sign(self, parsed_results):
# """Examine whether most significant regions (highest log10(qvalue)) are also found by other tool."""
# for tool in parsed_results:
# tool_qvals = {}
# for cnv in tool:
# qval = cnv[4]
# if type(qval) == tuple: #rubic dict
# qval = qval[0]
# qval = float(qval)
# tool_qvals[qval] = cnv
# sorted_qvals = sorted(tool_qvals, reverse=True)
# print(sorted_qvals)
# for top in range(0,10):
# top_cnv = tool_qvals[sorted_qvals[top]]
# print(top_cnv)
def make_genefiles(self, parsed_results, out_GISTIC, out_RUBIC):
......@@ -212,40 +213,76 @@ class ReportTools:
plt.savefig(out_venn)
plt.close()
def compare_known_regions(self, known_genes, overlap_file, gene_file, gistic_results, rubic_gain_results, rubic_loss_results):
def compare_overlapping_regions(self, parsed_results, biomart_file, overlap_file):
"""Compare the regions containing known driver genes"""
top_genes = []
top_genes = parse().gene_file(known_genes, 10)
#print(top_genes)
location_dict = {}
tool = "GISTIC"
for tool_results in parsed_results:
for cnv in tool_results:
known_genes = cnv[5]
if len(known_genes) != 0:
for gene in known_genes:
location_gene, gene_name = self.get_location_gene(biomart_file, gene)
if location_gene != ['N/A', 'N/A', 'N/A']:
if tool == "GISTIC": #first loop
location_gistic = cnv[0:3]
location_rubic = ['N/A', 'N/A', 'N/A']
for rubic_cnv in parsed_results[1]:
if int(rubic_cnv[0]) == int(location_gene[0]) and int(rubic_cnv[1]) <= int(location_gene[2]) and int(rubic_cnv[2]) >= int(location_gene[1]): #overlap cnv and gene
location_rubic = rubic_cnv[0:3]
else: #second loop
location_rubic = cnv[0:3]
location_gistic = ['N/A', 'N/A', 'N/A']
for gistic_cnv in parsed_results[0]:
if int(gistic_cnv[0]) == int(location_gene[0]) and int(gistic_cnv[1]) <= int(location_gene[2]) and int(gistic_cnv[2]) >= int(location_gene[1]): #overlap cnv and gene
location_gistic = gistic_cnv[0:3]
if gene not in location_dict:
location_dict[gene] = gene_name, location_gene, location_gistic, location_rubic
else:
print(location_dict[gene])
print(gene_name, location_gene, location_gistic, location_rubic)
tool = "RUBIC"
with open(overlap_file, 'w') as out:
for gene in top_genes:
out.write(gene + "\n")
for gene in location_dict.keys():
gene_name, location_gene, location_gistic, location_rubic = location_dict[gene]
out_line = [gene_name, location_gene[0] + ":" + "-".join(location_gene[1:3]), location_gistic[0] + ":" + "-".join(location_gistic[1:3]), location_rubic[0] + ":" + "-".join(location_rubic[1:3])]
out.write("\t".join(out_line) + "\n")
# top_genes = []
# top_genes = parse().gene_file(known_genes, 10)
# #print(top_genes)
# with open(overlap_file, 'w') as out:
# for gene in top_genes:
# out.write(gene + "\n")
#print(gene)
location_gene = self.get_location_gene(gene_file, gene)
location_gene = list(map(str, location_gene))
# location_gene = self.get_location_gene(gene_file, gene)
# location_gene = list(map(str, location_gene))
#print(location_gene)
out.write("Location of gene:\t" + "\t".join(location_gene) + "\n")
if location_gene != ('N/A', 'N/A', 'N/A'):
location_gistic = self.get_location_gistic(gistic_results, location_gene)
out.write("Location CNV GISTIC:\t" + "\t".join(location_gistic) + "\n")
location_rubic = self.get_location_rubic(rubic_results, gene)
out.write("Location CNV RUBIC:\t" + "\t".join(location_rubic) + "\n\n")
# out.write("Location of gene:\t" + "\t".join(location_gene) + "\n")
# if location_gene != ('N/A', 'N/A', 'N/A'):
# location_gistic = self.get_location_gistic(gistic_results, location_gene)
# out.write("Location CNV GISTIC:\t" + "\t".join(location_gistic) + "\n")
# location_rubic = self.get_location_rubic(rubic_results, gene)
# out.write("Location CNV RUBIC:\t" + "\t".join(location_rubic) + "\n\n")
def get_location_gene(self, gene_file, gene_of_interest):
def get_location_gene(self, biomart_file, gene_of_interest):
"""Get location of a gene from a biomart gene file"""
location = ['N/A', 'N/A', 'N/A']
with open(gene_file, 'r') as genes:
gene_name = 'N/A'
with open(biomart_file, 'r') as genes:
genes.readline()
for line in genes:
info = line.strip().split("\t")
if info[1] == gene_of_interest:
if info[0] == gene_of_interest:
location = [info[2], info[3], info[4]]
try:
location = [int(i) for i in location]
break
except:
continue
return location
gene_name = info[1]
#try:
# location = [int(i) for i in location]
# break
#except:
# continue
return location, gene_name
def get_location_gistic(self, gistic_results, location_gene):
"""Get location of the CNV detected by GISTIC2 harboring the gene of interest"""
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment