Commit 75205c4f authored by Beatrice Tan's avatar Beatrice Tan

Made function names more explanatory and code better readable.

parent 2d3389cc
......@@ -64,7 +64,8 @@ rule report:
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png",
circos="Reports/Circos/RecurrentRegions.png",
circos_legend="Reports/Circos/RecurrentRegions_legend.png"
circos_legend="Reports/Circos/RecurrentRegions_legend.png",
known_genes="Reports/Overlap_known_genes.bed"
output:
html="Reports/Results.html"
run:
......
......@@ -9,7 +9,7 @@ rule report_tools:
gistic="GISTIC/all_lesions.conf_" + config["gistic_precision"] + ".txt",
rubic_gain="RUBIC/gains.txt",
rubic_loss="RUBIC/losses.txt",
overlap="Reports/Overlap_regions.bed"
overlap="Reports/Regions_overlapping_other_tool.bed"
output:
tools="Reports/Tools.txt",
table_regions="Reports/Recurrent_regions.txt",
......@@ -18,7 +18,7 @@ rule report_tools:
genes_rubic="Reports/Genes_RUBIC.txt",
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Swarmplot_sizes.png",
pvals="Reports/Overlap_pvalues.png"
bed_known="Reports/Locations_known_genes.bed"
params: #select input files from repository or own input files
census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"],
known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"],
......@@ -29,7 +29,7 @@ rule report_tools:
params.census, params.known, params.ref, params.biomart_info,
output.tools, output.table_regions, output.venn, output.swarmplot,
output.genes_both, output.genes_gistic, output.genes_rubic,
input.overlap, output.pvals)
input.overlap, output.bed_known)
rule bed_intersect:
"""Intersect the recurrent regions detected by RUBIC and GISTIC2.0."""
......@@ -37,12 +37,26 @@ rule bed_intersect:
gistic="GISTIC/regions_track.conf_" + config["gistic_precision"] + ".bed",
rubic="RUBIC/regions_track.bed"
output:
"Reports/Overlap_regions.bed"
"Reports/Regions_overlapping_other_tool.bed"
conda:
workflow.basedir + "/envs/bedtools.yaml"
shell:
"bedtools intersect -a {input.gistic} -b {input.rubic} -wo > {output}"
rule bed_known_genes:
"""Intersect known genes and recurrent regions detected by RUBIC and GISTIC2.0."""
input:
known="Reports/Locations_known_genes.bed",
gistic="GISTIC/regions_track.conf_" + config["gistic_precision"] + ".bed",
rubic="RUBIC/regions_track.bed"
output:
"Reports/Overlap_known_genes.bed"
conda:
workflow.basedir + "/envs/bedtools.yaml"
shell:
"bedtools intersect -a {input.known} -b {input.gistic} {input.rubic} -names GISTIC RUBIC -wo > {output}"
def get_regions(bed_file):
plot_names = []
with open(bed_file, 'r') as bed:
......
import numpy as np
import os.path
import pyensembl
ensembl = pyensembl.EnsemblRelease(75)
from math import log10
from collections import OrderedDict
from ensemblQueries import genes_at_locus
def install_ensembl(reference_genome):
"""Import ensembl if needed and load correct release based on reference genome."""
if "hg19" in reference_genome:
ensembl = pyensembl.EnsemblRelease(75)
elif "38" in reference_genome:
ensembl = pyensembl.EnsemblRelease(87)
else:
raise ValueError("Unknown reference genome used.")
try: #only done first time to install ensembl version.
ensembl.download()
ensembl.index()
except:
pass
def parse(results_file, known_file, census_file, used_tool):
def parse_regions(results_file, known_file, census_file, used_tool):
"""Make list with the parsed results from GISTIC or RUBIC.
Format: [chrom, start, end, cnv_type, qval, [known_gene_list], [census_gene_list], [gene_list]]"""
known_genes = gene_file_IDs(known_file, np.inf)
census_genes = gene_file_IDs(census_file, np.inf)
known_genes = parse_gene_file(known_file, np.inf)
census_genes = parse_gene_file(census_file, np.inf)
if used_tool == 'GISTIC':
recurrent_regions = parse_gistic_results(results_file, known_genes, census_genes)
else:
recurrent_regions = parse_rubic_results(results_file[0], results_file[1], known_genes, census_genes)
return recurrent_regions
def parse_gene_file(gene_file, top_nr):
"""Parse a file with a list of genes (one gene per line or first column).
Top_nr: the number of genes to extract (requires ranked gene list file), use np.inf to extract all genes."""
list_genes = []
num_read = 0
with open(gene_file, 'r') as genes:
for line in genes:
if line.startswith("Supplementary") or line.startswith("\t") or line.startswith("Gene"): #deal with supplement files with one or more header lines
continue
else:
gene = line.split("\t")[0]
list_genes.append(gene)
if top_nr != np.inf:
list_genes = list_genes[0:top_nr]
return list_genes
def parse_gistic_results(results_file, known_genes, census_genes):
"""Create a list of lists with the recurrent regions called by GISTIC2."""
regions = []
......@@ -46,7 +47,6 @@ def parse_gistic_results(results_file, known_genes, census_genes):
start, end = bp.split("-")
cnv_type = 'amp' if stats[0].startswith("Amplification") else 'del'
qval = log10(float(stats[5])) * -1 #Convert q-value to log10(q-value)
#list_genes = genes_locus(chrom, int(start), int(end))
result_dir, conf = results_file.split("all_lesions")
gene_file = os.path.join(result_dir, cnv_type + "_genes" + conf)
list_genes = gistic_genes(gene_file, CNV_id)
......@@ -71,7 +71,7 @@ def gistic_genes(gene_file, CNV_id):
return out_genes
def parse_rubic_results(results_gains, results_losses, known_genes, census_genes):
"""Create a list of lists with the recurrent regions called by GISTIC2."""
"""Create a list of lists with the recurrent regions called by RUBIC."""
regions = []
for result_file in results_gains, results_losses:
with open(result_file, 'r') as lesions:
......@@ -80,69 +80,84 @@ def parse_rubic_results(results_gains, results_losses, known_genes, census_genes
chrom, start, end, qval_left, qval_right, genes = lesion.split("\t")
CNV_id = "chr" + chrom + ":" + start + "-" + end
cnv_type = 'amp' if result_file.endswith('gains.txt') else 'del'
#list_genes = genes_locus(chrom, int(start), int(end))
list_genes = genes.strip().split(",")
print(list_genes)
known_overlap = set(list_genes).intersection(set(known_genes))
census_overlap = set(list_genes).intersection(set(census_genes))
cnv_data = [chrom, start, end, cnv_type, (qval_left, qval_right), list(known_overlap), list(census_overlap), list_genes]
regions.append(cnv_data)
return regions
#def genes_locus(chrom, start, end):
# """Get list of gene IDs at certain location."""
# IDs = []
# gene_info = ensembl.genes_at_locus(chrom, start, end)
# for gene in gene_info:
# ID = gene.gene_id
# IDs.append(ID)
# return IDs
def gene_file_IDs(gene_file, top_nr):
"""Parse a file with a list of genes (one gene per line or first column).
Top_nr: the number of genes to extract (requires ranked gene list file), use np.inf to extract all genes."""
list_genes = []
num_read = 0
with open(gene_file, 'r') as genes:
for line in genes:
if line.startswith("Supplementary") or line.startswith("\t") or line.startswith("Gene"): #deal with supplement files with one or more header lines
continue
def parse_bed_overlap(bed_file, known_file, census_file):
"""Parse the bed file with overlapping regions.
Returns for amplifications and deletions seperately: ['Overlap', type, number of regions,
average size of regions, total size of regions, number of genes detected]"""
known_genes = parse_gene_file(known_file, np.inf)
census_genes = parse_gene_file(census_file, np.inf)
return_stats = []
for cnv_type in 'amp', 'del':
cnv_info = [0, 0, 0, 0, 0]
with open(bed_file, 'r') as lesions:
for line in lesions:
first_chr, first_start, first_end, first_nr, sec_chr, sec_start, sec_end, sec_nr, bp_overlap = line.strip().split("\t")
begin_overlap = max(int(first_start), int(sec_start))
end_overlap = min(int(first_end), int(sec_end))
if cnv_type == 'amp':
if (first_nr.startswith("Any-AP") and sec_nr.startswith("amp")) or (sec_nr.startswith("Any-AP") and first_nr.starswith("amp")):
cnv_info = add_numbers_bed_overlap(cnv_info, bp_overlap, first_chr, begin_overlap, end_overlap, known_genes, census_genes)
else:
gene = line.split("\t")[0]
list_genes.append(gene)
#try:
# gene_ID = ensembl.gene_ids_of_gene_name(gene)
# list_genes = list_genes + gene_ID
#except:
# pass
if top_nr != np.inf:
list_genes = list_genes[0:top_nr]
return list_genes
if (first_nr.startswith("Any-DP") and sec_nr.startswith("del")) or (sec_nr.startswith("Any-DP") and first_nr.starswith("del")):
cnv_info = add_numbers_bed_overlap(cnv_info, bp_overlap, first_chr, begin_overlap, end_overlap, known_genes, census_genes)
stats = calculate_stats(cnv_info[0], cnv_info[1], cnv_info[2], cnv_info[3], cnv_info[4], cnv_type, 'Overlap')
return_stats.append(stats)
return return_stats
def add_numbers_bed_overlap(cnv_info, bp_overlap, first_chr, begin_overlap, end_overlap, known_genes, census_genes):
"""Add counts to list with stats.
cnv_info = [total size, number of genes, number of regions, number of regions with known genes,
number of regions with census genes]"""
list_genes = genes_at_locus(first_chr, begin_overlap, end_overlap)
unique_genes = list(set(list_genes))
cnv_info[0] += int(bp_overlap) #total size
cnv_info[1] += len(unique_genes) #number genes
cnv_info[2] += 1 #number regions
known_overlap = set(list_genes).intersection(set(known_genes))
if list(known_overlap) != []:
cnv_info[3] += 1
census_overlap = set(list_genes).intersection(set(census_genes))
if list(census_overlap) != []:
cnv_info[4] += 1
return cnv_info
def get_stats(list_regions, used_tool):
"""Calculate stats on the results from GISTIC2 and RUBIC."""
return_info = []
for cnv_type in 'amp', 'del':
tot_size, nr_genes, nr_reg_nogenes, nr_reg, known_count, census_count = 0, 0, 0, 0, 0, 0
tot_size, nr_genes, nr_reg, known_count, census_count = 0, 0, 0, 0, 0
all_genes = []
for alt in list_regions:
if alt[3] == cnv_type: #get amp stats
nr_reg += 1
tot_size += int(alt[2]) - int(alt[1])
nr_genes += len(alt[7]) #long list genes > unique genes
all_genes += [alt[7]]
all_genes = all_genes + alt[7]
if alt[5] != []:
known_count += 1
if alt[6] != []:
census_count += 1
type_name = 'Amplifications' if cnv_type == 'amp' else 'Deletions'
avg_size = int(tot_size / nr_reg / 1000) if nr_reg != 0 else 'N/A' #average size in kb
tot_size = tot_size / 1000000 #total size in mb
stats = [used_tool, type_name, nr_reg, avg_size, tot_size, nr_genes]
stats = list(map(str, stats))
for count in known_count, census_count:
percent = (count / nr_reg * 100) if count != 0 else 0
stat_str = str(count) + " (" + str(round(percent, 2)) + "%)"
stats.append(stat_str)
nr_genes = len(list(set(all_genes))) #unique genes only
stats = calculate_stats(tot_size, nr_genes, nr_reg, known_count, census_count, cnv_type, used_tool)
return_info.append(stats)
return return_info
def calculate_stats(tot_size, nr_genes, nr_reg, known_count, census_count, cnv_type, used_tool):
"""Calculate average & total numbers for report on recurrent regions."""
type_name = 'Amplifications' if cnv_type == 'amp' else 'Deletions'
avg_size = int(tot_size / nr_reg / 1000) if nr_reg != 0 else 'N/A' #average size in kb
round_tot_size = round((tot_size / 1000000), 2) #total size in mb
stats = [used_tool, type_name, nr_reg, avg_size, round_tot_size, nr_genes]
stats = list(map(str, stats))
for count in known_count, census_count:
percent = (count / nr_reg * 100) if count != 0 else 0
stat_str = str(count) + " (" + str(round(percent, 2)) + "%)"
stats.append(stat_str)
return stats
......@@ -6,19 +6,17 @@ import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats, install_ensembl
from ParseResults import parse_regions, get_stats
from collections import OrderedDict
def make_report(control_results, nocontrol_results, report_file, census_genes, known_genes, ref_genome):
"""Make a report on analyses using control samples or without using them."""
install_ensembl(ref_genome)
parsed_tools, stats_tools = [], []
with open(report_file, 'w') as out:
row_names = (["Control used?", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"])
for result in control_results, nocontrol_results:
parsed_results = parse().gistic_results(result)
parsed_results = parse_regions.gistic_results(result)
parsed_tools.append(parsed_results)
label = "Control" if result == control_results else "No control"
stats_results = stats().calculate_stats(parsed_results, census_genes, known_genes, label)
......
......@@ -7,7 +7,7 @@ import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats
from ParseResults import get_stats
from collections import OrderedDict
def make_report(segmentation_file, report_file):
......
......@@ -6,20 +6,18 @@ import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats, install_ensembl
from ParseResults import parse_regions, get_stats
from collections import OrderedDict
def make_report(size_results, census_genes, known_genes, reps, ref_genome, report_file, plot_dir):
"""Make a report of the results produced using input files with different sample sizes."""
install_ensembl(ref_genome)
with open(report_file, 'w') as out:
dict_stats = OrderedDict()
row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. regions with census genes", "Nr. regions with known genes", "Nr. genes"])
tool = GISTIC2
for size_file in size_results:
parsed_results = parse(size_file, known_genes, census_genes, tool)
parsed_results = parse_regions(size_file, known_genes, census_genes, tool)
size, repetition = size_file.split("/")[-1].split("x")
stats_results = get_stats(parsed_results, size)
if size not in dict_stats.keys():
......
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
from matplotlib_venn import venn2, venn3
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu, pearsonr
import os.path
import pyensembl
from ParseResults import parse, get_stats, install_ensembl
from ParseResults import parse_regions, get_stats, parse_bed_overlap, parse_gene_file
from collections import OrderedDict
import math
......@@ -15,43 +14,33 @@ def make_report(gistic_results, rubic_gain_results, rubic_loss_results,
census_genes, known_genes, ref_genome, biomart_file,
file_tools, file_regions, file_venn, file_swarmplot,
file_genes_both, file_genes_GISTIC, file_genes_RUBIC,
bed_overlap, file_pvalues):
bed_overlap, bed_known_genes):
"""Make a report and gene file from the analyses with GISTIC2 and RUBIC"""
install_ensembl(ref_genome)
parsed, stats = [], []
for tool in 'GISTIC', 'RUBIC':
results = gistic_results if tool == 'GISTIC' else [rubic_gain_results, rubic_loss_results]
parsed_results = parse(results, known_genes, census_genes, tool)
stats_results = get_stats(parsed_results, tool)
parsed_results = parse_regions(results, known_genes, census_genes, tool)
parsed.append(parsed_results)
stats.append(stats_results)
make_files(parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
file_venn, file_swarmplot, file_regions, biomart_file, bed_overlap, file_pvalues)
def make_files(parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
file_venn, file_swarmplot, file_regions, biomart_file, bed_overlap, file_pvalues):
#overlap_most_sign(parsed)
table_regions(parsed, file_regions)
make_tool_report(file_tools, stats)
all_genes = make_genefiles(parsed, file_genes_GISTIC, file_genes_RUBIC)
make_genefile_overlap(all_genes, file_genes_both)
venn_overlap(all_genes, file_venn)
#compare_known_regions(known_genes, file_overlap, biomart_file,
# gistic_results, rubic_gain_results, rubic_loss_results)
plot_sizes(parsed, file_swarmplot)
#compare_overlapping_regions(parsed, biomart_file, file_overlap)
overlapping_pvalues(bed_overlap, parsed, file_pvalues)
stats.append(get_stats(parsed_results, tool))
stats.append(parse_bed_overlap(bed_overlap, known_genes, census_genes))
table_all_regions(parsed, file_regions) #make table with all regions
make_tool_report(file_tools, stats) #make tool report
all_genes = make_gene_files(parsed, file_genes_GISTIC, file_genes_RUBIC, file_genes_both, known_genes, file_venn) #make gene files and venn overlap plot
plot_histogram_sizes(parsed, file_swarmplot) #plot histogram with the sizes of the regions
make_bed_known_regions(known_genes, biomart_file, bed_known_genes) #Make bed file with the locations of known genes.
def make_tool_report(file_tools, stats_tools):
"""Make a report of the results from GISTIC and RUBIC."""
row_names = ["Tool", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Regions with known genes", "Regions with census genes"]
with open(file_tools, 'w') as out:
for i in range(len(row_names)):
list_out = [row_names[i], stats_tools[0][0][i], stats_tools[0][1][i], stats_tools[1][0][i], stats_tools[1][1][i]]
list_out = [row_names[i]]
for j in range(len(stats_tools)):
list_out += [stats_tools[j][0][i], stats_tools[j][1][i]]
list_out = list(map(str, list_out))
out.write("\t".join(list_out) + "\n")
def table_regions(parsed_regions, file_regions):
def table_all_regions(parsed_regions, file_regions):
"""Make a table with all recurent regions."""
with open(file_regions, 'w') as out:
header = ["Method", "Chr", "Start", "End", "Type", "Negative log10 q-value", "Known genes", "Census genes", "All genes"]
......@@ -69,7 +58,7 @@ def table_regions(parsed_regions, file_regions):
out.write("\n")
tool_name = "RUBIC"
def make_genefiles(parsed_results, out_GISTIC, out_RUBIC):
def make_gene_files(parsed_results, out_GISTIC, out_RUBIC, out_both, known_genes, out_venn):
"""Make files with all genes reported by GISTIC2 and RUBIC."""
genes_all = []
for tool_results in parsed_results:
......@@ -82,38 +71,39 @@ def make_genefiles(parsed_results, out_GISTIC, out_RUBIC):
out_file = out_GISTIC if tool_results == parsed_results[0] else out_RUBIC
with open(out_file, 'w') as out:
out.write("\n".join(genes_tool))
return genes_all
make_gene_file_overlap(genes_all, out_both)
venn3_overlap(genes_all, known_genes, out_venn)
def make_genefile_overlap(gene_lists, out_both):
def make_gene_file_overlap(gene_lists, out_both):
"""Make file with all genes reported by both tools."""
overlap = set(gene_lists[0]).intersection(set(gene_lists[1]))
with open(out_both, 'w') as overlap_file:
overlap_file.write("\n".join(list(overlap)))
def parse_sizes(parsed_results):
"""Calculate the sizes of regions to use as input swarmplot of sizes of recurrent regions."""
sizes, sizes_zoomed = [], []
tool = 'GISTIC2.0'
for tool_results in parsed_results:
amp_sizes, del_sizes = [], []
for cnv in tool_results:
size = int(cnv[2]) - int(cnv[1])
cnv_label = 'Amplification' if cnv[3] == 'amp' else 'Deletion'
sizes += [(size, tool, cnv_label)]
if size < 900000:
sizes_zoomed += [(size, tool, cnv_label)]
tool = 'RUBIC'
labels = ['Size', 'Tool', 'CNV type']
df = pd.DataFrame(sizes, columns=labels)
return df
def venn3_overlap(gene_lists, known_genes, out_venn):
known = parse_gene_file(known_genes, np.inf)
plt.subplots(figsize=(7, 7))
venn_sets = [set(gene_lists[0]), set(gene_lists[1]), set(known)]
c = venn3(venn_sets, ('GISTIC2', 'RUBIC', ' Known genes'))
c.get_patch_by_id('100').set_color('#5975A4'), c.get_patch_by_id('010').set_color('#5F9E6E')
c.get_patch_by_id('001').set_color('#857AAA'), c.get_patch_by_id('011').set_color('#748A8F')
c.get_patch_by_id('101').set_color('#6D77A7'), c.get_patch_by_id('110').set_color('#5C888B')
c.get_patch_by_id('111').set_color('#B55D60')
c.get_patch_by_id('100').set_alpha(1), c.get_patch_by_id('010').set_alpha(1)
c.get_patch_by_id('001').set_alpha(1), c.get_patch_by_id('011').set_alpha(1)
c.get_patch_by_id('101').set_alpha(1), c.get_patch_by_id('110').set_alpha(1)
c.get_patch_by_id('111').set_alpha(1)
plt.savefig(out_venn, dpi=300)
def plot_sizes(parsed_results, plot_file):
def plot_histogram_sizes(parsed_results, plot_file):
"""Make boxplot with swarmplot on top showing sizes of recurrent regions detected by GISTIC2.0 and RUBIC."""
size_list = parse_sizes(parsed_results)
size_list, size_list_known = parse_sizes_for_plot(parsed_results)
sns.set_style("white")
f, ax = plt.subplots(figsize=(7, 7))
ax.set(yscale="log")
g = sns.boxplot(x="Tool", y="Size", hue="CNV type", data=size_list, whis=np.inf, palette="deep")
g = sns.swarmplot(x="Tool", y="Size", dodge=True, hue="CNV type", data=size_list, size=5, palette={"Amplification": "#3f3f3f", "Deletion": "#3f3f3f"})
g = sns.boxplot(x="Tool", y="Size", hue="CNV type", data=size_list, whis=np.inf, palette={"Amplification": "#71AEC0", "Deletion": "#B55D60"})
g = sns.swarmplot(x="Tool", y="Size", dodge=True, hue="CNV type", data=size_list, size=5, palette={"Amplification": "#527F8C", "Deletion": "#844446"})
g = sns.swarmplot(x="Tool", y="Size", dodge=True, hue="CNV type", data=size_list_known, size=5, palette={"Amplification": "black", "Deletion": "black"})
sns.despine()
g.set_xlabel("Tool",fontsize=16)
g.set_ylabel("Log size of recurrent regions",fontsize=16)
......@@ -121,108 +111,42 @@ def plot_sizes(parsed_results, plot_file):
handles, labels = g.get_legend_handles_labels()
plt.legend(handles[0:2], labels[:2], fontsize=12)
plt.savefig(plot_file, dpi=300)
#plt.close()
def venn_overlap(gene_lists, out_venn): #maybe add known genes?
"""Compare the genes detected by GISTIC and those by RUBIC."""
plt.subplots(figsize=(7, 7))
c = venn2([set(gene_lists[0]), set(gene_lists[1])], set_labels = ('GISTIC2', 'RUBIC'))
c.get_patch_by_id('10').set_color('#5975A4')
c.get_patch_by_id('01').set_color('#5F9E6E')
c.get_patch_by_id('10').set_edgecolor('#4c4c4c')
c.get_patch_by_id('01').set_edgecolor('#4c4c4c')
c.get_patch_by_id('10').set_alpha(1)
c.get_patch_by_id('01').set_alpha(1)
c.get_patch_by_id('11').set_color('#857AAA')
c.get_patch_by_id('11').set_edgecolor('none')
c.get_patch_by_id('11').set_alpha(1)
plt.savefig(out_venn, dpi=300)
#plt.close()
##Wrong
def compare_overlapping_regions(parsed_results, biomart_file, overlap_file):
"""Compare the regions containing known driver genes"""
location_dict = {}
tool = "GISTIC"
def parse_sizes_for_plot(parsed_results):
"""Calculate the sizes of regions to use as input swarmplot of sizes of recurrent regions."""
sizes, sizes_known = [], []
tool = 'GISTIC2.0'
for tool_results in parsed_results:
amp_sizes, del_sizes = [], []
for cnv in tool_results:
known_genes = cnv[5]
if len(known_genes) != 0:
for gene in known_genes:
location_gene, gene_name = get_location_gene(biomart_file, gene)
if location_gene != ['N/A', 'N/A', 'N/A']:
if tool == "GISTIC": #first loop
location_gistic = cnv[0:3]
location_rubic = ['N/A', 'N/A', 'N/A']
for rubic_cnv in parsed_results[1]:
if int(rubic_cnv[0]) == int(location_gene[0]) and int(rubic_cnv[1]) <= int(location_gene[2]) and int(rubic_cnv[2]) >= int(location_gene[1]): #overlap cnv and gene
location_rubic = rubic_cnv[0:3]
else: #second loop
location_rubic = cnv[0:3]
location_gistic = ['N/A', 'N/A', 'N/A']
for gistic_cnv in parsed_results[0]:
if int(gistic_cnv[0]) == int(location_gene[0]) and int(gistic_cnv[1]) <= int(location_gene[2]) and int(gistic_cnv[2]) >= int(location_gene[1]): #overlap cnv and gene
location_gistic = gistic_cnv[0:3]
if gene not in location_dict:
location_dict[gene] = gene_name, location_gene, location_gistic, location_rubic
#else:
#print(location_dict[gene])
#print(gene_name, location_gene, location_gistic, location_rubic)
tool = "RUBIC"
with open(overlap_file, 'w') as out:
for gene in location_dict.keys():
gene_name, location_gene, location_gistic, location_rubic = location_dict[gene]
out_line = [gene_name, location_gene[0] + ":" + "-".join(location_gene[1:3]), location_gistic[0] + ":" + "-".join(location_gistic[1:3]), location_rubic[0] + ":" + "-".join(location_rubic[1:3])]
out.write("\t".join(out_line) + "\n")
def get_location_gene(biomart_file, gene_of_interest):
"""Get location of a gene from a biomart gene file"""
location = ['N/A', 'N/A', 'N/A']
gene_name = 'N/A'
with open(biomart_file, 'r') as genes:
genes.readline()
for line in genes:
info = line.strip().split("\t")
if info[0] == gene_of_interest:
location = [info[2], info[3], info[4]]
gene_name = info[1]
#try:
# location = [int(i) for i in location]
# break
#except:
# continue
return location, gene_name
size = int(cnv[2]) - int(cnv[1])
cnv_label = 'Amplification' if cnv[3] == 'amp' else 'Deletion'
sizes += [(size, tool, cnv_label)]
if cnv[5] != []:
sizes_known += [(size, tool, cnv_label)]
tool = 'RUBIC'
labels = ['Size', 'Tool', 'CNV type']
df = pd.DataFrame(sizes, columns=labels)
df_known = pd.DataFrame(sizes_known, columns=labels)
return df, df_known
def overlapping_pvalues(bed_file, parsed_results, plot_file):
"""Parse the P-values of overlapping regions between GISTIC2.0 and RUBIC."""
pvals = []
with open(bed_file, 'r') as overlap:
for line in overlap:
g_chr, g_start, g_end, g_nr, r_chr, r_start, r_end, r_nr, bp_overlap = line.strip().split("\t")
for region in parsed_results[0]:
if region[0:3] == [g_chr.strip("chr"), g_start, g_end]:
g_pval = region[4]
for region in parsed_results[1]:
if region[0:3] == [r_chr.strip("chr"), r_start, r_end]:
r_left_pv, r_right_pv = region[4]
r_pval = min(float(r_left_pv), float(r_right_pv))
if float(r_pval) > 0:
gpv = math.pow(10, float(g_pval)*-1)
rpv = math.pow(10, float(r_pval)*-1)
#pvals.append((float(g_pval), float(r_left_pv)))
pvals.append((gpv, rpv))
labels = ["P-value GISTIC2.0", "P-value RUBIC"]
pval_df = pd.DataFrame(pvals, columns=labels)
print(pval_df)
plot_correlation(pval_df, plot_file)
def make_bed_known_regions(known_file, biomart_file, out_bed):
"""Make bed file with location of known genes."""
known_genes = parse_gene_file(known_file, np.inf)
lines = location_genes(known_genes, biomart_file)
with open(out_bed, 'w') as out:
out.write("track name=Known genes color=0,0,255\n")
for line in lines:
out.write("\t".join(line) + "\n")
def plot_correlation(df_pvals, plot_file):
"""Plot correlation between P-values."""
g = sns.jointplot("P-value GISTIC2.0", "P-value RUBIC", data=df_pvals, kind="reg", color="r", size=7, xlim=(0,0.25), ylim=(0,0.25))
pr = pearsonr(df_pvals["P-value GISTIC2.0"], df_pvals["P-value RUBIC"])
print(pr)