Commit f1bab812 authored by BeatriceTan's avatar BeatriceTan

Removed small errors.

parent acc6235f
*/.snakemake
.snakemake
rules/__pycache__/
*__pycache__*
rules/rubic_markers.smk
channels:
- conda-forge
- edurand
dependencies:
- matplotlib-venn =0.11.5
- pyensembl =1.1.0
from Rubic import get_seg_rubic
from Reports_new import ReportTools, ReportSegmentation
from Rubic import marker_file
from Reports import ReportTools, ReportSegmentation
import os.path
rule install_gistic:
......@@ -26,23 +26,34 @@ rule run_gistic:
wrapper:
"file:" + workflow.basedir + "/wrappers/GISTIC2"
rule seg_rubic:
"""Change segmentation file to use as input for RUBIC."""
#rule seg_rubic:
# """Change segmentation file to use as input for RUBIC."""
# input:
# "Input/Segments_tumor.txt"
# output:
# seg="RUBIC/Segmentation_file.txt",
# marker="RUBIC/Marker_file.txt"
# run:
# get_seg_rubic(input[0], output.seg, output.marker)
rule markers_rubic:
"""Make marker file to use as input for RUBIC."""
input:
"Input/Segments_tumor.txt"
output:
seg="RUBIC/Segmentation_file.txt",
marker="RUBIC/Marker_file.txt"
"Input/Markers.txt"
run:
get_seg_rubic(input[0], output.seg, output.marker)
marker_file(input[0], output[0])
rule run_rubic:
"""Run RUBIC for the tumor segmentation data."""
input:
seg="RUBIC/Segmentation_file.txt",
markers="RUBIC/Marker_file.txt"
seg="Input/Segments_tumor.txt",
markers="Input/Markers.txt"
output:
"RUBIC/"
out_gains="RUBIC/gains.txt",
out_losses="RUBIC/losses.txt",
out_plots="RUBIC/plots"
params:
fdr="0.25",
genefile="" #config["biomart_genes"]
......@@ -55,7 +66,7 @@ rule report_tools:
"""Report the differences in calls between GISTIC2 and RUBIC."""
input:
gistic="GISTIC/",
rubic="RUBIC/"
rubic="RUBIC/gains.txt"
output:
tools="Reports/Tools.txt",
genes_both="Reports/Genes_both.txt",
......
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind
import os.path
import pyensembl
ensembl = pyensembl.EnsemblRelease(87)
from math import log10
class parse:
"""Functions to parse the outputs from tools"""
def gistic_results(self, result_dir):
"""Create a dictionary with the recurrent regions called by GISTIC2."""
"""Create a dictionary with the recurrent regions called by GISTIC2.
Format: {"chr#:#-#" : (chrom, start, end, cnv_type, qval, [gene, gene])}"""
cnv_dict = {}
with open(os.path.join(result_dir, "all_lesions.conf_75.txt"), 'r') as lesions:
lesions.readline()
......@@ -23,13 +26,13 @@ class parse:
CNV_id = stats[2].split("(")[0]
chrom, bp = CNV_id.split(":")
start, end = bp.split("-")
size = int(end) - int(start)
cnv_type = 'amp' if stats[0].startswith("Amplification") else 'del'
qval = log10(float(stats[5])) #Convert q-value to log10(q-value)
gene_file = os.path.join(result_dir, cnv_type + "_genes.conf_75.txt")
list_genes = self.gistic_genes(gene_file, CNV_id)
cnv_data = [chrom, start, end, size, len(list_genes), cnv_type]
cnv_data = [chrom, start, end, cnv_type, qval, list_genes]
cnv_data = [str(i) for i in cnv_data]
cnv_dict[CNV_id] = cnv_data + [list_genes]
cnv_dict[CNV_id] = tuple(cnv_data)
return cnv_dict
def gistic_genes(self, gene_file, CNV_id):
......@@ -46,22 +49,21 @@ class parse:
out_genes.append(gene)
return out_genes
def rubic_results(self, result_dir):
"""Create a dictionary with the recurrent regions called by RUBIC."""
def rubic_results(self, results_gains, results_losses):
"""Create a dictionary with the recurrent regions called by RUBIC.
Format: {"chr#:#-#" : (chrom, start, end, cnv_type, (qval_left, qval_right), [gene, gene])}"""
cnv_dict = {}
for cnv in 'Focal_gains.tsv', 'Focal_losses.tsv':
with open(os.path.join(result_dir, cnv), 'r') as lesions:
for result_file in results_gains, results_losses:
with open(result_file, 'r') as lesions:
lesions.readline()
for lesion in lesions:
chrom, start, end, left, right, genes = lesion.split("\t")
list_genes = genes.replace("[", "").replace("]", "").strip().split(",")
chrom, start, end, qval_left, qval_right, genes = lesion.split("\t")
CNV_id = "chr" + chrom + ":" + start + "-" + end
nr_genes = 0 if genes.strip() == '' else len(list_genes)
cnv_type = 'amp' if cnv == 'Focal_gains.tsv' else 'del'
size = int(end) - int(start)
cnv_data = [chrom, start, end, size, nr_genes, cnv_type]
cnv_type = 'amp' if result_file.endswith('Focal_gains.tsv') else 'del'
list_genes = genes.replace("[", "").replace("]", "").strip().split(",")
cnv_data = [chrom, start, end, cnv_type, (qval_left, qval_right), list_genes]
cnv_data = [str(i) for i in cnv_data]
cnv_dict[CNV_id] = cnv_data + [list_genes]
cnv_dict[CNV_id] = tuple(cnv_data)
return cnv_dict
def gene_file(self, gene_file):
......@@ -85,15 +87,15 @@ class stats:
tot_size, nr_genes, nr_reg_nogenes, nr_reg = 0, 0, 0, 0
all_genes = []
for alt in alt_dict.keys():
if alt_dict[alt][5] == cnv_type: #get amp stats
if alt_dict[alt][3] == cnv_type: #get amp stats
nr_reg += 1
tot_size += int(alt_dict[alt][3])
nr_genes += int(alt_dict[alt][4])
all_genes += [alt_dict[alt][6]]
tot_size += int(alt_dict[alt][2]) - int(alt_dict[alt][1])
nr_genes += len(alt_dict[alt][5])
all_genes += [alt_dict[alt][5]]
type_name = 'Amplifications' if cnv_type == 'amp' else 'Deletions'
avg_size = int(tot_size / nr_reg / 1000) if nr_reg != 0 else 'N/A' #avg size in kb
stats = [used_tool, type_name, nr_reg, avg_size, tot_size / 1000000, nr_genes] #tot size in mb
stats = [str(i) for i in stats]
stats = list(map(str, stats))
for ref_genes in census_file, known_genes_file:
ref_stat = self.overlap_stats(all_genes, ref_genes)
stats.append(ref_stat)
......@@ -148,8 +150,9 @@ class ReportSegmentation():
def make_seg_report(self, segmentation_file, report_file):
"""Write report."""
with open(report_file, 'w') as out:
seg_rows = ["Number of samples", "Smallest number of CNVs in a sample", "Largest number of CNVs in a sample", \
"Average number of CNVs per sample", "Average number of focal CNVs per sample", "Average number of deletions per sample", "Average length of CNVs (Mb)"]
seg_rows = (["Number of samples", "Smallest number of CNVs in a sample", "Largest number of CNVs in a sample",
"Average number of CNVs per sample", "Average number of focal CNVs per sample",
"Average number of deletions per sample", "Average length of CNVs (Mb)"])
seg_stats = self.get_segment_stats(segmentation_file)
seg_stats_str = [str(i) for i in seg_stats]
for i in range(len(seg_rows)):
......@@ -200,44 +203,55 @@ class ReportSegmentation():
avg_nr_del = int(tot_nr_del / nr_samples)
return [avg_nr_per_sample, avg_nr_focal, avg_nr_del, avg_size]
#Format: {"chr#:#-#" : (chrom, start, end, cnv_type, qval, [gene, gene])}
class ReportTools:
"""Make a report and gene file from the analyses with GISTIC2 and RUBIC"""
def __init__(self, gistic_results, rubic_results, census_genes, known_genes, gene_file, file_tools, \
file_venn, file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_overlap, file_histogram):
self.make_tool_report(gistic_results, rubic_results, census_genes, known_genes, gene_file, file_tools, \
file_venn, file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_overlap, file_histogram)
def make_tool_report(self, gistic_results, rubic_results, census_genes, known_genes, gene_file, file_tools, \
file_venn, file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_overlap, file_histogram):
"""Make a report of the results produced by GISTIC and RUBIC."""
def __init__(self, gistic_dir, rubic_gain_file, census_genes, known_genes, gene_file, file_tools,
file_venn, file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_overlap, file_histogram, file_swarmplot): #double brackets? ((...))
rubic_dir = rubic_gain_file.split("gains.txt")[0] #get diretory with rubic results from gain file.
parsed, stats = self.calculate_stats(gistic_dir, rubic_dir, file_tools, census_genes, known_genes)
self.make_tool_report(file_tools, stats)
all_genes = self.make_genefiles(parsed, file_genes_both, file_genes_GISTIC, file_genes_RUBIC)
self.make_genefile_overlap(all_genes, file_genes_both)
self.venn_overlap(all_genes, file_venn)
#self.make_file_overlap(all_genes, file_swarmplot)
self.compare_known_regions(known_genes, file_overlap, gene_file, gistic_dir, rubic_dir)
self.make_histogram_sizes(parsed, file_histogram)
self.make_swarmplot_sizes(parsed, file_swarmplot)
def calculate_stats(self, gistic_dir, rubic_dir, file_tools, census_genes, known_genes):
"""Calculate the stats to write to the tool report."""
parsed_tools, stats_tools = [], []
row_names = ["Tool", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Regions with census genes", "Regions with known genes"]
for tool in "GISTIC2", "RUBIC":
if tool == "GISTIC2":
parsed_results = parse().gistic_results(gistic_results)
parsed_results = parse().gistic_results(gistic_dir)
else:
parsed_results = parse().rubic_results(rubic_results)
parsed_results = parse().rubic_results(rubic_dir + '/Focal_gains.tsv', rubic_dir + '/Focal_losses.tsv')
parsed_tools.append(parsed_results)
stats_results = stats().calculate_stats(parsed_results, census_genes, known_genes, tool)
stats_tools.append(stats_results)
return parsed_tools, stats_tools
def make_tool_report(self, file_tools, stats_tools):
"""Make a report of the results from GISTIC and RUBIC."""
row_names = ["Tool", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Regions with census genes", "Regions with known genes"]
with open(file_tools, 'w') as out:
for i in range(len(row_names)):
list_out = [row_names[i], stats_tools[0][0][i], stats_tools[0][1][i], stats_tools[1][0][i], stats_tools[1][1][i]]
list_out = list(map(str, list_out))
out.write("\t".join(list_out) + "\n")
self.make_gene_files(parsed_tools[0], parsed_tools[1], file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_venn)
self.compare_known_regions(known_genes, file_overlap, gene_file, gistic_results, rubic_results)
self.make_histogram_sizes(parsed_tools, file_histogram)
self.make_swarmplot_sizes(parsed_tools, file_histogram)
def make_gene_files(self, dict_gistic, dict_rubic, out_both, out_GISTIC, out_RUBIC, out_venn):
def make_genefiles(self, parsed_results, out_both, out_GISTIC, out_RUBIC):
"""Make files with all genes reported by GISTIC2 and RUBIC."""
genes_all = []
for dict_tool in dict_gistic, dict_rubic:
for dict_tool in parsed_results:
genes_tool = []
for cnv in dict_tool.keys():
genes = dict_tool[cnv][6]
genes = dict_tool[cnv][5]
for gene_name in genes:
try:
gene_ID = ensembl.gene_ids_of_gene_name(gene_name)
......@@ -246,65 +260,68 @@ class ReportTools:
except: #gene name not found
pass
genes_all.append(genes_tool)
out_file = out_GISTIC if dict_tool == dict_gistic else out_RUBIC
with open(out_file, 'w') as out_tool:
out_tool.write("\n".join(genes_tool))
self.venn_overlap(genes_all, out_venn)
self.file_overlap(genes_all, out_both)
out_file = out_GISTIC if dict_tool == parsed_results[0] else out_RUBIC
with open(out_file, 'w') as out:
out.write("\n".join(genes_tool))
return genes_all
def make_histogram_sizes(self, list_dicts, out_histogram):
def make_genefile_overlap(self, gene_lists, out_both):
overlap = set(gene_lists[0]).intersection(set(gene_lists[1]))
with open(out_both, 'w') as overlap_file:
overlap_file.write("\n".join(list(overlap)))
#remove > not very informative
def make_histogram_sizes(self, parsed_results, out_histogram):
"""Make histograms of the sizes of recurrent regions detected by either GISTIC2 or RUBIC."""
sns.set_style("whitegrid")
for dict_tool in list_dicts:
for dict_tool in parsed_results:
amp_sizes, del_sizes = [], []
for cnv in dict_tool.keys():
size = dict_tool[cnv][3]
cnv_type = dict_tool[cnv][5]
size = int(dict_tool[cnv][2]) - int(dict_tool[cnv][1])
cnv_type = dict_tool[cnv][3]
if cnv_type == 'amp':
amp_sizes += size
amp_sizes += [size]
else:
del_sizes += size
del_sizes += [size]
for cnv_sizes in amp_sizes, del_sizes:
sns.distplot(cnv_sizes, bins=20)
plt.savefig(out_histogram, dpi=300)
if len(cnv_sizes) > 10:
sns.distplot(cnv_sizes, bins=20)
plt.savefig(out_histogram, dpi=300)
plt.close()
def make_swarmplot_sizes(self, list_dicts, out_plot):
sizes = []
tool = GISTIC2
tool = 'GISTIC2'
for dict_tool in list_dicts:
amp_sizes, del_sizes = [], []
for cnv in dict_tool.keys():
size = dict_tool[cnv][3]
cnv_type = dict_tool[cnv][5]
size = int(dict_tool[cnv][2]) - int(dict_tool[cnv][1])
cnv_type = dict_tool[cnv][3]
cnv_label = 'Amplification' if cnv_type == 'amp' else 'Deletion'
entry = (size, tool, cnv_label)
sizes += entry
tool = RUBIC
#if size < 1000000:
sizes += [(size, tool, cnv_label)]
tool = 'RUBIC'
labels = ['Size', 'Tool', 'Type']
df = pd.DataFrame(sizes, columns=labels)
print(df)
sns.swarmplot(x="Size", y="Tool", hue="Type", data=df);
plt.savefig(out_plot + ".swarm", dpi=300)
plt.savefig(out_plot, dpi=300)
plt.close()
def venn_overlap(self, gene_lists, out_venn):
"""Compare the genes detected by GISTIC and those by RUBIC."""
from matplotlib_venn import venn2
venn2([set(gene_lists[0]), set(gene_lists[1])], set_labels = ('GISTIC2', 'RUBIC'))
plt.savefig(out_venn)
def file_overlap(self, gene_lists, out_both):
overlap = set(gene_lists[0]).intersection(set(gene_lists[1]))
with open(out_both, 'w') as overlap_file:
overlap_file.write("\n".join(list(overlap)))
plt.close()
def compare_known_regions(self, known_genes, overlap_file, gene_file, gistic_results, rubic_results):
"""Compare the regions containing known driver genes"""
top_genes = []
with open(known_genes, 'r') as known:
with open(known_genes, 'r') as known: #use parse! > make it possible to define number to parse
for line in range(0,100):
top_gene = known.readline().split("\t")[0]
top_genes.append(top_gene)
#print(top_genes)
#print(top_genes) #2 functions
with open(overlap_file, 'w') as out:
for gene in top_genes:
out.write(gene + "\n")
......@@ -379,7 +396,8 @@ class ReportSizes:
"""Make a report of the results produced using input files with different sample sizes."""
with open(report_file, 'w') as out:
dict_stats = {}
row_names = ["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"]
row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"])
for size_file in size_results:
parsed_gistic = parse().gistic_results(size_file)
size, repetition = size_file.split("/")[-1].split("x")
......@@ -408,7 +426,8 @@ class ReportSizes:
nr_census.append(float(stats[7].split(" (")[0]))
nr_known.append(float(stats[8].split(" (")[0]))
plot_data = [nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known]
plot_y_axis = ['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)', 'Number of genes', 'Nr. regions with census genes', 'Nr. regions with known genes']
plot_y_axis = (['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
'Number of genes', 'Nr. regions with census genes', 'Nr. regions with known genes'])
for plot_nr in range(len(plot_data)):
self.plot_size_differences(plot_data[plot_nr], sizes, len(reps), plot_y_axis[plot_nr], plot_dir)
......@@ -421,7 +440,8 @@ class ReportSizes:
for size in list_sizes:
sample_label = sample_label + [size] * nr_reps * 2
df['sample_size'] = pd.Series(sample_label)
g = sns.factorplot(x="sample_size", y="value_y_axis", col="type", data=df, kind="box", size=5, aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g = sns.factorplot(x="sample_size", y="value_y_axis", col="type", data=df, kind="box", size=5,
aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g.set_axis_labels("Sample size", y_axis).set_titles("{col_name}").despine(bottom=True)
cnv_types = ["Amplifications", "Deletions"]
cnv_type = "Amplifications"
......@@ -449,7 +469,8 @@ class ReportControl:
"""Write report."""
parsed_tools, stats_tools = [], []
with open(report_file, 'w') as out:
row_names = ["Control used?", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"]
row_names = (["Control used?", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"])
for result in control_results, nocontrol_results:
parsed_results = parse().gistic_results(result)
parsed_tools.append(parsed_results)
......
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind
import os.path
import pyensembl
ensembl = pyensembl.EnsemblRelease(87)
from math import log10
class parse:
"""Functions to parse the outputs from tools"""
def gistic_results(self, result_dir):
"""Create a dictionary with the recurrent regions called by GISTIC2.
Format: {"chr#:#-#" : (chrom, start, end, cnv_type, qval, [gene, gene])}"""
"""Create a dictionary with the recurrent regions called by GISTIC2."""
cnv_dict = {}
with open(os.path.join(result_dir, "all_lesions.conf_75.txt"), 'r') as lesions:
lesions.readline()
......@@ -26,13 +23,13 @@ class parse:
CNV_id = stats[2].split("(")[0]
chrom, bp = CNV_id.split(":")
start, end = bp.split("-")
size = int(end) - int(start)
cnv_type = 'amp' if stats[0].startswith("Amplification") else 'del'
qval = log10(float(stats[5])) #Convert q-value to log10(q-value)
gene_file = os.path.join(result_dir, cnv_type + "_genes.conf_75.txt")
list_genes = self.gistic_genes(gene_file, CNV_id)
cnv_data = [chrom, start, end, cnv_type, qval, list_genes]
cnv_data = [chrom, start, end, size, len(list_genes), cnv_type]
cnv_data = [str(i) for i in cnv_data]
cnv_dict[CNV_id] = tuple(cnv_data)
cnv_dict[CNV_id] = cnv_data + [list_genes]
return cnv_dict
def gistic_genes(self, gene_file, CNV_id):
......@@ -49,21 +46,22 @@ class parse:
out_genes.append(gene)
return out_genes
def rubic_results(self, results_gains, results_losses):
"""Create a dictionary with the recurrent regions called by RUBIC.
Format: {"chr#:#-#" : (chrom, start, end, cnv_type, (qval_left, qval_right), [gene, gene])}"""
def rubic_results(self, result_dir):
"""Create a dictionary with the recurrent regions called by RUBIC."""
cnv_dict = {}
for result_file in results_gains, results_losses:
with open(result_file, 'r') as lesions:
for cnv in 'Focal_gains.tsv', 'Focal_losses.tsv':
with open(os.path.join(result_dir, cnv), 'r') as lesions:
lesions.readline()
for lesion in lesions:
chrom, start, end, qval_left, qval_right, genes = lesion.split("\t")
CNV_id = "chr" + chrom + ":" + start + "-" + end
cnv_type = 'amp' if result_file.endswith('Focal_gains.tsv') else 'del'
chrom, start, end, left, right, genes = lesion.split("\t")
list_genes = genes.replace("[", "").replace("]", "").strip().split(",")
cnv_data = [chrom, start, end, cnv_type, (qval_left, qval_right), list_genes]
CNV_id = "chr" + chrom + ":" + start + "-" + end
nr_genes = 0 if genes.strip() == '' else len(list_genes)
cnv_type = 'amp' if cnv == 'Focal_gains.tsv' else 'del'
size = int(end) - int(start)
cnv_data = [chrom, start, end, size, nr_genes, cnv_type]
cnv_data = [str(i) for i in cnv_data]
cnv_dict[CNV_id] = tuple(cnv_data)
cnv_dict[CNV_id] = cnv_data + [list_genes]
return cnv_dict
def gene_file(self, gene_file):
......@@ -87,15 +85,15 @@ class stats:
tot_size, nr_genes, nr_reg_nogenes, nr_reg = 0, 0, 0, 0
all_genes = []
for alt in alt_dict.keys():
if alt_dict[alt][3] == cnv_type: #get amp stats
if alt_dict[alt][5] == cnv_type: #get amp stats
nr_reg += 1
tot_size += int(alt_dict[alt][2]) - int(alt_dict[alt][1])
nr_genes += len(alt_dict[alt][5])
all_genes += [alt_dict[alt][5]]
tot_size += int(alt_dict[alt][3])
nr_genes += int(alt_dict[alt][4])
all_genes += [alt_dict[alt][6]]
type_name = 'Amplifications' if cnv_type == 'amp' else 'Deletions'
avg_size = int(tot_size / nr_reg / 1000) if nr_reg != 0 else 'N/A' #avg size in kb
stats = [used_tool, type_name, nr_reg, avg_size, tot_size / 1000000, nr_genes] #tot size in mb
stats = list(map(str, stats))
stats = [str(i) for i in stats]
for ref_genes in census_file, known_genes_file:
ref_stat = self.overlap_stats(all_genes, ref_genes)
stats.append(ref_stat)
......@@ -150,9 +148,8 @@ class ReportSegmentation():
def make_seg_report(self, segmentation_file, report_file):
"""Write report."""
with open(report_file, 'w') as out:
seg_rows = (["Number of samples", "Smallest number of CNVs in a sample", "Largest number of CNVs in a sample",
"Average number of CNVs per sample", "Average number of focal CNVs per sample",
"Average number of deletions per sample", "Average length of CNVs (Mb)"])
seg_rows = ["Number of samples", "Smallest number of CNVs in a sample", "Largest number of CNVs in a sample", \
"Average number of CNVs per sample", "Average number of focal CNVs per sample", "Average number of deletions per sample", "Average length of CNVs (Mb)"]
seg_stats = self.get_segment_stats(segmentation_file)
seg_stats_str = [str(i) for i in seg_stats]
for i in range(len(seg_rows)):
......@@ -203,54 +200,44 @@ class ReportSegmentation():
avg_nr_del = int(tot_nr_del / nr_samples)
return [avg_nr_per_sample, avg_nr_focal, avg_nr_del, avg_size]
#Format: {"chr#:#-#" : (chrom, start, end, cnv_type, qval, [gene, gene])}
class ReportTools:
"""Make a report and gene file from the analyses with GISTIC2 and RUBIC"""
def __init__(self, gistic_results, rubic_results, census_genes, known_genes, gene_file, file_tools,
file_venn, file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_overlap, file_histogram, file_swarmplot): #double brackets? ((...))
parsed, stats = self.calculate_stats(gistic_results, rubic_results, file_tools, census_genes, known_genes)
self.make_tool_report(file_tools, stats)
all_genes = self.make_genefiles(parsed, file_genes_both, file_genes_GISTIC, file_genes_RUBIC)
self.make_genefile_overlap(all_genes, file_genes_both)
self.venn_overlap(all_genes, file_venn)
#self.make_file_overlap(all_genes, file_swarmplot)
self.compare_known_regions(known_genes, file_overlap, gene_file, gistic_results, rubic_results)
self.make_histogram_sizes(parsed, file_histogram)
self.make_swarmplot_sizes(parsed, file_swarmplot)
def calculate_stats(self, gistic_dir, rubic_dir, file_tools, census_genes, known_genes):
"""Calculate the stats to write to the tool report."""
def __init__(self, gistic_results, rubic_results, census_genes, known_genes, gene_file, file_tools, \
file_venn, file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_overlap, file_histogram):
self.make_tool_report(gistic_results, rubic_results, census_genes, known_genes, gene_file, file_tools, \
file_venn, file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_overlap, file_histogram)
def make_tool_report(self, gistic_results, rubic_results, census_genes, known_genes, gene_file, file_tools, \
file_venn, file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_overlap, file_histogram):
"""Make a report of the results produced by GISTIC and RUBIC."""
parsed_tools, stats_tools = [], []
row_names = ["Tool", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Regions with census genes", "Regions with known genes"]
for tool in "GISTIC2", "RUBIC":
if tool == "GISTIC2":
parsed_results = parse().gistic_results(gistic_dir)
parsed_results = parse().gistic_results(gistic_results)
else:
parsed_results = parse().rubic_results(rubic_dir + '/Focal_gains.tsv', rubic_dir + '/Focal_losses.tsv')
parsed_results = parse().rubic_results(rubic_results)
parsed_tools.append(parsed_results)
stats_results = stats().calculate_stats(parsed_results, census_genes, known_genes, tool)
stats_tools.append(stats_results)
return parsed_tools, stats_tools
def make_tool_report(self, file_tools, stats_tools):
"""Make a report of the results from GISTIC and RUBIC."""
row_names = ["Tool", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Regions with census genes", "Regions with known genes"]
with open(file_tools, 'w') as out:
for i in range(len(row_names)):
list_out = [row_names[i], stats_tools[0][0][i], stats_tools[0][1][i], stats_tools[1][0][i], stats_tools[1][1][i]]
list_out = list(map(str, list_out))
out.write("\t".join(list_out) + "\n")
self.make_gene_files(parsed_tools[0], parsed_tools[1], file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_venn)
self.compare_known_regions(known_genes, file_overlap, gene_file, gistic_results, rubic_results)
self.make_histogram_sizes(parsed_tools, file_histogram)
self.make_swarmplot_sizes(parsed_tools, file_histogram)
def make_genefiles(self, parsed_results, out_both, out_GISTIC, out_RUBIC):
def make_gene_files(self, dict_gistic, dict_rubic, out_both, out_GISTIC, out_RUBIC, out_venn):
"""Make files with all genes reported by GISTIC2 and RUBIC."""
genes_all = []
for dict_tool in parsed_results:
for dict_tool in dict_gistic, dict_rubic:
genes_tool = []
for cnv in dict_tool.keys():
genes = dict_tool[cnv][5]
genes = dict_tool[cnv][6]
for gene_name in genes:
try:
gene_ID = ensembl.gene_ids_of_gene_name(gene_name)
......@@ -259,68 +246,65 @@ class ReportTools:
except: #gene name not found
pass
genes_all.append(genes_tool)
out_file = out_GISTIC if dict_tool == parsed_results[0] else out_RUBIC
with open(out_file, 'w') as out:
out.write("\n".join(genes_tool))
return genes_all
def make_genefile_overlap(self, gene_lists, out_both):
overlap = set(gene_lists[0]).intersection(set(gene_lists[1]))
with open(out_both, 'w') as overlap_file:
overlap_file.write("\n".join(list(overlap)))
out_file = out_GISTIC if dict_tool == dict_gistic else out_RUBIC
with open(out_file, 'w') as out_tool:
out_tool.write("\n".join(genes_tool))
self.venn_overlap(genes_all, out_venn)
self.file_overlap(genes_all, out_both)
#remove > not very informative
def make_histogram_sizes(self, parsed_results, out_histogram):
def make_histogram_sizes(self, list_dicts, out_histogram):
"""Make histograms of the sizes of recurrent regions detected by either GISTIC2 or RUBIC."""
sns.set_style("whitegrid")
for dict_tool in parsed_results:
for dict_tool in list_dicts:
amp_sizes, del_sizes = [], []
for cnv in dict_tool.keys():
size = int(dict_tool[cnv][2]) - int(dict_tool[cnv][1])
cnv_type = dict_tool[cnv][3]
size = dict_tool[cnv][3]
cnv_type = dict_tool[cnv][5]
if cnv_type == 'amp':
amp_sizes += [size]
amp_sizes += size
else:
del_sizes += [size]
del_sizes += size
for cnv_sizes in amp_sizes, del_sizes:
if len(cnv_sizes) > 10:
sns.distplot(cnv_sizes, bins=20)
plt.savefig(out_histogram, dpi=300)
plt.close()
sns.distplot(cnv_sizes, bins=20)
plt.savefig(out_histogram, dpi=300)
def make_swarmplot_sizes(self, list_dicts, out_plot):
sizes = []
tool = 'GISTIC2'
tool = GISTIC2
for dict_tool in list_dicts:
amp_sizes, del_sizes = [], []
for cnv in dict_tool.keys():
size = int(dict_tool[cnv][2]) - int(dict_tool[cnv][1])
cnv_type = dict_tool[cnv][3]
size = dict_tool[cnv][3]
cnv_type = dict_tool[cnv][5]
cnv_label = 'Amplification' if cnv_type == 'amp' else 'Deletion'
#if size < 1000000:
sizes += [(size, tool, cnv_label)]
tool = 'RUBIC'
entry = (size, tool, cnv_label)
sizes += entry
tool = RUBIC
labels = ['Size', 'Tool', 'Type']
df = pd.DataFrame(sizes, columns=labels)
print(df)
sns.swarmplot(x="Size", y="Tool", hue="Type", data=df);
plt.savefig(out_plot, dpi=300)
plt.close()
plt.savefig(out_plot + ".swarm", dpi=300)
def venn_overlap(self, gene_lists, out_venn):
"""Compare the genes detected by GISTIC and those by RUBIC."""
from matplotlib_venn import venn2
venn2([set(gene_lists[0]), set(gene_lists[1])], set_labels = ('GISTIC2', 'RUBIC'))
plt.savefig(out_venn)
plt.close()
def file_overlap(self, gene_lists, out_both):
overlap = set(gene_lists[0]).intersection(set(gene_lists[1]))
with open(out_both, 'w') as overlap_file:
overlap_file.write("\n".join(list(overlap)))
def compare_known_regions(self, known_genes, overlap_file, gene_file, gistic_results, rubic_results):
"""Compare the regions containing known driver genes"""
top_genes = []
with open(known_genes, 'r') as known: #use parse! > make it possible to define number to parse
with open(known_genes, 'r') as known:
for line in range(0,100):
top_gene = known.readline().split("\t")[0]
top_genes.append(top_gene)
#print(top_genes) #2 functions
#print(top_genes)
with open(overlap_file, 'w') as out:
for gene in top_genes:
out.write(gene + "\n")
......@@ -395,8 +379,7 @@ class ReportSizes:
"""Make a report of the results produced using input files with different sample sizes."""
with open(report_file, 'w') as out:
dict_stats = {}
row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"])
row_names = ["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"]
for size_file in size_results:
parsed_gistic = parse().gistic_results(size_file)
size, repetition = size_file.split("/")[-1].split("x")
......@@ -425,8 +408,7 @@ class ReportSizes:
nr_census.append(float(stats[7].split(" (")[0]))
nr_known.append(float(stats[8].split(" (")[0]))
plot_data = [nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known]
plot_y_axis = (['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
'Number of genes', 'Nr. regions with census genes', 'Nr. regions with known genes'])
plot_y_axis = ['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)', 'Number of genes', 'Nr. regions with census genes', 'Nr. regions with known genes']
for plot_nr in range(len(plot_data)):
self.plot_size_differences(plot_data[plot_nr], sizes, len(reps), plot_y_axis[plot_nr], plot_dir)
......@@ -439,8 +421,7 @@ class ReportSizes:
for size in list_sizes:
sample_label = sample_label + [size] * nr_reps * 2
df['sample_size'] = pd.Series(sample_label)
g = sns.factorplot(x="sample_size", y="value_y_axis", col="type", data=df, kind="box", size=5,
aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g = sns.factorplot(x="sample_size", y="value_y_axis", col="type", data=df, kind="box", size=5, aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g.set_axis_labels("Sample size", y_axis).set_titles("{col_name}").despine(bottom=True)
cnv_types = ["Amplifications", "Deletions"]
cnv_type = "Amplifications"
......@@ -468,8 +449,7 @@ class ReportControl:
"""Write report."""