Commit 44467197 authored by BeatriceTan's avatar BeatriceTan
Browse files

Split report functions to seperate files.

parent 6401d987
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats
from collections import OrderedDict
class ReportControl:
"""Make a report on analyses using control samples or without using them."""
def __init__(self, control_results, nocontrol_results, report_file, census_genes, known_genes, ref_genome):
install_ensembl(ref_genome)
self.make_control_report(control_results, nocontrol_results, report_file, census_genes, known_genes)
def make_control_report(self, control_results, nocontrol_results, report_file, census_genes, known_genes):
"""Write report."""
parsed_tools, stats_tools = [], []
with open(report_file, 'w') as out:
row_names = (["Control used?", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"])
for result in control_results, nocontrol_results:
parsed_results = parse().gistic_results(result)
parsed_tools.append(parsed_results)
label = "Control" if result == control_results else "No control"
stats_results = stats().calculate_stats(parsed_results, census_genes, known_genes, label)
stats_tools.append(stats_results)
#self.make_gene_files(parsed_tools[0], parsed_tools[1], file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_venn)
for i in range(len(row_names)):
list_out = [row_names[i], stats_tools[0][0][i], stats_tools[0][1][i], stats_tools[1][0][i], stats_tools[1][1][i]]
list_out = list(map(str, list_out))
out.write("\t".join(list_out) + "\n")
#self.compare_known_regions(known_genes, file_overlap, gene_file, gistic_results, rubic_results)
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats
from collections import OrderedDict
class ReportSegmentation:
"""Make a report of the segmentation file used as input."""
def __init__(self, segmentation_file, report_file):
self.make_seg_report(segmentation_file, report_file)
def make_seg_report(self, segmentation_file, report_file):
"""Write report."""
with open(report_file, 'w') as out:
seg_rows = (["Number of samples", "Smallest number of CNVs in a sample", "Largest number of CNVs in a sample",
"Average number of CNVs per sample", "Average number of focal CNVs per sample",
"Average number of deletions per sample", "Average length of CNVs (Mb)"])
seg_stats = self.get_segment_stats(segmentation_file)
seg_stats_str = [str(i) for i in seg_stats]
for i in range(len(seg_rows)):
out.write("\t".join([seg_rows[i], seg_stats_str[i]]) + "\n")
def get_segment_stats(self, segmentation_file):
"""Calculate several statistics from the segmentation file."""
stat_dict = self.parse_segments(segmentation_file)
tot_size, tot_nr_cnvs, tot_nr_focal, tot_nr_del = 0, 0, 0, 0
samples = list(stat_dict.keys())
smallest_nr, largest_nr = stat_dict[samples[0]][0], stat_dict[samples[0]][0]
for sample in range(len(samples)):
nr_cnvs, cnv_length, focal_count, del_count = stat_dict[samples[sample]]
largest_nr = nr_cnvs if nr_cnvs >= largest_nr else largest_nr
smallest_nr = nr_cnvs if nr_cnvs <= smallest_nr else smallest_nr
tot_nr_focal += focal_count
tot_nr_del += del_count
tot_nr_cnvs += nr_cnvs
tot_size += cnv_length
avg_stats = self.calculate_stats(tot_size, tot_nr_focal, tot_nr_del, tot_nr_cnvs, len(samples))
return [len(samples), smallest_nr, largest_nr] + avg_stats
def parse_segments(self, segmentation_file):
"""Parse the input segmentation file and return a dictionary with number of CNVs and total CNV length per sample."""
stat_dict = OrderedDict()
minimal_seg = 0
maximal_seg = 0
with open(segmentation_file, 'r') as lines:
lines.readline()
for line in lines:
sample, chrom, start, end, num_prob, seg_CN = line.strip().split("\t")
if float(seg_CN) > 0:
maximal_seg = float(seg_CN) if float(seg_CN) > maximal_seg else maximal_seg
if float(seg_CN) < 0:
minimal_seg = float(seg_CN) if float(seg_CN) < minimal_seg else minimal_seg
if sample not in stat_dict:
stat_dict[sample] = [0, 0, 0, 0]
cnv_length = int(end) - int(start)
focal = True if cnv_length <= 3000000 else False
deletion = True if float(seg_CN) < 0.0 else False
stat_dict[sample][0] += 1
stat_dict[sample][1] += cnv_length
if focal:
stat_dict[sample][2] += 1
if deletion:
stat_dict[sample][3] += 1
print(minimal_seg)
print(maximal_seg)
return stat_dict
def calculate_stats(self, tot_size, tot_nr_focal, tot_nr_del, tot_nr_cnvs, nr_samples):
"""Calculate average number CNVs, number of focal CNVs and the average size of the CNVs."""
avg_size = round(((tot_size / tot_nr_cnvs) / 1000000), 2)
avg_nr_per_sample = int(tot_nr_cnvs / nr_samples)
avg_nr_focal = int(tot_nr_focal / nr_samples)
avg_nr_del = int(tot_nr_del / nr_samples)
return [avg_nr_per_sample, avg_nr_focal, avg_nr_del, avg_size]
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats
from collections import OrderedDict
class ReportSizes:
"""Make a report and plots on analyses with different sample sizes"""
def __init__(self, size_results, census_genes, known_genes, reps, ref_genome, report_file, plot_dir):
install_ensembl(ref_genome)
self.make_sizes_report(size_results, census_genes, known_genes, reps, report_file, plot_dir)
def make_sizes_report(self, size_results, census_genes, known_genes, reps, report_file, plot_dir):
"""Make a report of the results produced using input files with different sample sizes."""
with open(report_file, 'w') as out:
dict_stats = OrderedDict()
row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. regions with census genes", "Nr. regions with known genes", "Nr. genes"])
tool = GISTIC2
for size_file in size_results:
parsed_gistic = parsed_results = parse(size_file, known_genes, census_genes, tool)
size, repetition = size_file.split("/")[-1].split("x")
stats_results = get_stats(parsed_results, size)
if size not in dict_stats.keys():
dict_stats[size] = [stats_results]
else:
dict_stats[size] = dict_stats[size] + [stats_results]
out.write(dict_stats)
self.extract_stats_sizes(dict_stats, reps, plot_dir)
def extract_results_sizes(self, dict_stats, reps):
"""Prepare data from a dict with all results to plot the differences between analyses with different sample sizes."""
sizes = sorted(dict_stats.keys())
nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known = [], [], [], [], [], []
for size in sizes:
for rep in range(len(reps)):
for cnv_type in 0, 1:
stats = dict_stats[size][rep][cnv_type]
nr_regions.append(float(stats[2]))
reg_size = stats[3]
if reg_size != 'N/A':
avg_size.append(float(reg_size))
total_size.append(float(stats[4]))
nr_genes.append(float(stats[7]))
nr_census.append(float(stats[6].split(" (")[0]))
nr_known.append(float(stats[5].split(" (")[0]))
plot_data = [nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known]
plot_y_axis = (['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
'Number of genes', 'Nr. regions with census genes', 'Nr. regions with known genes'])
for plot_nr in range(len(plot_data)):
self.plot_size_differences(plot_data[plot_nr], sizes, len(reps), plot_y_axis[plot_nr], plot_dir)
def plot_size_differences(self, list_size_results, list_sizes, nr_reps, y_axis, plot_dir):
"""Plot the differences between analyses using different sample sizes."""
sns.set_style("whitegrid")
df = pd.DataFrame(list_size_results, columns=['value_y_axis'])
df['type'] = pd.Series(['Amplifications', 'Deletions'] * len(list_sizes) * nr_reps)
sample_label = []
for size in list_sizes:
sample_label = sample_label + [size] * nr_reps * 2
df['sample_size'] = pd.Series(sample_label)
g = sns.factorplot(x="sample_size", y="value_y_axis", col="type", data=df, kind="box", size=5, #hue="tool" for adding RUBIC in plots: colors, significance,
aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g.set_axis_labels("Sample size", y_axis).set_titles("{col_name}").despine(bottom=True)
self.add_significance(g, df, list_sizes)
png_file = os.path.join(plot_dir, y_axis.replace(" ", "_") + ".png")
plt.savefig(png_file, dpi=300)
def add_significance(self, g, df, list_sizes):
"""Add significance to plot with differences between sample sizes."""
cnv_types = ["Amplifications", "Deletions"]
cnv_type = "Amplifications"
for ax in g.axes.flat:
for i in range(1,len(list_sizes)):
x1, x2 = i-1, i
pval = self.test_significance(x1, x2, df, cnv_type)
if pval < .05:
y, h, col = df['value_y_axis'].max() + df['value_y_axis'].max() * 0.035, df['value_y_axis'].max() * 0.035, 'k'
ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
ax.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color=col)
cnv_type = "Deletions"
def test_significance(self, x1, x2, df, cnv_type):
"""Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
Returns P-value."""
only_type = df[df['type']==cnv_type]
values_x1 = only_type[only_type['sample_size']==list_sizes[x1]]
values_x2 = only_type[only_type['sample_size']==list_sizes[x2]]
#signficance = ttest_ind(values_x1['value_y_axis'], values_x2['value_y_axis'])
significance = mannwhitneyu(values_x1['value_y_axis'], values_x2['value_y_axis'])
return significance[1]
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats
from collections import OrderedDict
class ReportTools:
"""Make a report and gene file from the analyses with GISTIC2 and RUBIC"""
def __init__(self, gistic_results, rubic_gain_results, rubic_loss_results,
census_genes, known_genes, ref_genome, biomart_file,
file_tools, file_regions, file_venn, file_overlap, file_swarmplot,
file_genes_both, file_genes_GISTIC, file_genes_RUBIC):
install_ensembl(ref_genome)
parsed, stats = [], []
for tool in 'GISTIC', 'RUBIC':
results = gistic_results if tool == 'GISTIC' else [rubic_gain_results, rubic_loss_results]
parsed_results = parse(results, known_genes, census_genes, tool)
stats_results = get_stats(parsed_results, tool)
parsed.append(parsed_results)
stats.append(stats_results)
self.make_files(parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
file_venn, file_swarmplot, file_regions, biomart_file, file_overlap)
def make_files(self, parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
file_venn, file_swarmplot, file_regions, biomart_file, file_overlap):
#self.overlap_most_sign(parsed)
self.table_regions(parsed, file_regions)
self.make_tool_report(file_tools, stats)
all_genes = self.make_genefiles(parsed, file_genes_GISTIC, file_genes_RUBIC)
self.make_genefile_overlap(all_genes, file_genes_both)
self.venn_overlap(all_genes, file_venn)
#self.compare_known_regions(known_genes, file_overlap, biomart_file,
# gistic_results, rubic_gain_results, rubic_loss_results)
self.make_swarmplot_sizes(parsed, file_swarmplot)
self.compare_overlapping_regions(parsed, biomart_file, file_overlap)
def make_tool_report(self, file_tools, stats_tools):
"""Make a report of the results from GISTIC and RUBIC."""
row_names = ["Tool", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Regions with known genes", "Regions with census genes"]
with open(file_tools, 'w') as out:
for i in range(len(row_names)):
list_out = [row_names[i], stats_tools[0][0][i], stats_tools[0][1][i], stats_tools[1][0][i], stats_tools[1][1][i]]
list_out = list(map(str, list_out))
out.write("\t".join(list_out) + "\n")
def table_regions(self, parsed_regions, file_regions):
"""Make a table with all recurent regions."""
with open(file_regions, 'w') as out:
header = ["Method", "Chr", "Start", "End", "Type", "Negative log10 q-value", "Known genes", "Census genes", "All genes"]
out.write("\t".join(header) + "\n")
tool_name = "GISTIC2"
for tool in parsed_regions:
for cnv in tool:
out.write(tool_name + "\t")
cnv_type = "Amplification" if cnv[3] == "amp" else "Deletion"
q_vals = ", ".join(cnv[4]) if type(cnv[4]) == tuple else cnv[4]
begin_row = cnv[0:3] + [cnv_type, str(q_vals)]
out.write("\t".join(begin_row))
for list_genes in range(5,8):
out.write("\t" + ", ".join(cnv[list_genes]))
out.write("\n")
tool_name = "RUBIC"
def make_genefiles(self, parsed_results, out_GISTIC, out_RUBIC):
"""Make files with all genes reported by GISTIC2 and RUBIC."""
genes_all = []
for tool_results in parsed_results:
genes_tool = []
for cnv in tool_results:
genes = cnv[7]
genes_tool += genes
genes_tool = list(set(genes_tool)) #get unique genes
genes_all.append(genes_tool)
out_file = out_GISTIC if tool_results == parsed_results[0] else out_RUBIC
with open(out_file, 'w') as out:
out.write("\n".join(genes_tool))
return genes_all
def make_genefile_overlap(self, gene_lists, out_both):
overlap = set(gene_lists[0]).intersection(set(gene_lists[1]))
with open(out_both, 'w') as overlap_file:
overlap_file.write("\n".join(list(overlap)))
def make_swarmplot_sizes(self, parsed_results, out_plot):
"""Make swarmplot of the sizes of recurrent regions detected by either GISTIC2 or RUBIC."""
sizes, sizes_zoomed = [], []
tool = 'GISTIC2.0'
for tool_results in parsed_results:
amp_sizes, del_sizes = [], []
for cnv in tool_results:
size = int(cnv[2]) - int(cnv[1])
cnv_label = 'Amplification' if cnv[3] == 'amp' else 'Deletion'
sizes += [(size, tool, cnv_label)]
if size < 900000:
sizes_zoomed += [(size, tool, cnv_label)]
tool = 'RUBIC'
labels = ['Size', 'Tool', 'CNV type']
df = pd.DataFrame(sizes, columns=labels)
self.make_swarmplot_log_sizes(df, out_plot)
def make_swarmplot_log_sizes(self, size_list, plot_file):
sns.set_style("white")
f, ax = plt.subplots(figsize=(7, 7))
ax.set(yscale="log")
g = sns.boxplot(x="Tool", y="Size", hue="CNV type", data=size_list, whis=np.inf, palette="deep")
g = sns.swarmplot(x="Tool", y="Size", dodge=True, hue="CNV type", data=size_list, size=5, palette={"Amplification": "#3f3f3f", "Deletion": "#3f3f3f"})
sns.despine()
g.set_xlabel("Tool",fontsize=16)
g.set_ylabel("Log size of recurrent regions",fontsize=16)
g.tick_params(labelsize=12)
handles, labels = g.get_legend_handles_labels()
plt.legend(handles[0:2], labels[:2], fontsize=12)
plt.savefig(plot_file, dpi=300)
plt.close()
def venn_overlap(self, gene_lists, out_venn): #maybe add known genes?
"""Compare the genes detected by GISTIC and those by RUBIC."""
plt.subplots(figsize=(7, 7))
c = venn2([set(gene_lists[0]), set(gene_lists[1])], set_labels = ('GISTIC2', 'RUBIC'))
c.get_patch_by_id('10').set_color('#5975A4')
c.get_patch_by_id('01').set_color('#5F9E6E')
c.get_patch_by_id('10').set_edgecolor('#4c4c4c')
c.get_patch_by_id('01').set_edgecolor('#4c4c4c')
c.get_patch_by_id('10').set_alpha(1)
c.get_patch_by_id('01').set_alpha(1)
c.get_patch_by_id('11').set_color('#857AAA')
c.get_patch_by_id('11').set_edgecolor('none')
c.get_patch_by_id('11').set_alpha(1)
plt.savefig(out_venn, dpi=300)
plt.close()
def compare_overlapping_regions(self, parsed_results, biomart_file, overlap_file):
"""Compare the regions containing known driver genes"""
location_dict = {}
tool = "GISTIC"
for tool_results in parsed_results:
for cnv in tool_results:
known_genes = cnv[5]
if len(known_genes) != 0:
for gene in known_genes:
location_gene, gene_name = self.get_location_gene(biomart_file, gene)
if location_gene != ['N/A', 'N/A', 'N/A']:
if tool == "GISTIC": #first loop
location_gistic = cnv[0:3]
location_rubic = ['N/A', 'N/A', 'N/A']
for rubic_cnv in parsed_results[1]:
if int(rubic_cnv[0]) == int(location_gene[0]) and int(rubic_cnv[1]) <= int(location_gene[2]) and int(rubic_cnv[2]) >= int(location_gene[1]): #overlap cnv and gene
location_rubic = rubic_cnv[0:3]
else: #second loop
location_rubic = cnv[0:3]
location_gistic = ['N/A', 'N/A', 'N/A']
for gistic_cnv in parsed_results[0]:
if int(gistic_cnv[0]) == int(location_gene[0]) and int(gistic_cnv[1]) <= int(location_gene[2]) and int(gistic_cnv[2]) >= int(location_gene[1]): #overlap cnv and gene
location_gistic = gistic_cnv[0:3]
if gene not in location_dict:
location_dict[gene] = gene_name, location_gene, location_gistic, location_rubic
#else:
#print(location_dict[gene])
#print(gene_name, location_gene, location_gistic, location_rubic)
tool = "RUBIC"
with open(overlap_file, 'w') as out:
for gene in location_dict.keys():
gene_name, location_gene, location_gistic, location_rubic = location_dict[gene]
out_line = [gene_name, location_gene[0] + ":" + "-".join(location_gene[1:3]), location_gistic[0] + ":" + "-".join(location_gistic[1:3]), location_rubic[0] + ":" + "-".join(location_rubic[1:3])]
out.write("\t".join(out_line) + "\n")
def get_location_gene(self, biomart_file, gene_of_interest):
"""Get location of a gene from a biomart gene file"""
location = ['N/A', 'N/A', 'N/A']
gene_name = 'N/A'
with open(biomart_file, 'r') as genes:
genes.readline()
for line in genes:
info = line.strip().split("\t")
if info[0] == gene_of_interest:
location = [info[2], info[3], info[4]]
gene_name = info[1]
#try:
# location = [int(i) for i in location]
# break
#except:
# continue
return location, gene_name
def get_location_gistic(self, gistic_results, location_gene): #more regions overlap: extract one with longest overlap
"""Get location of the CNV detected by GISTIC2 harboring the gene of interest"""
loc = ['N/A', 'N/A', 'N/A']
chrom_gene, start_gene, end_gene = location_gene
with open(os.path.join(gistic_results, "all_lesions.conf_75.txt"), 'r') as cnvs:
cnvs.readline()
for cnv in cnvs:
cnv_loc = cnv.split("\t")[3]
cnv_loc_stripped = cnv_loc.split("(probes")[0]
chrom, bp = cnv_loc_stripped.split(":")
chrom = chrom.strip("chr")
if chrom == chrom_gene:
start, end = bp.split("-")
if int(start) <= int(end_gene) and int(end) >= int(start_gene):
if loc == ['N/A', 'N/A', 'N/A']:
loc = [chrom, start, end]
#else:
#loc = loc, chrom, start, end
return loc
def get_location_rubic(self, rubic_results, gene):
loc = ['N/A', 'N/A', 'N/A']
for file_type in "gains.txt", "losses.txt":
with open(os.path.join(rubic_results, file_type), 'r') as cnvs:
cnvs.readline()
for cnv in cnvs:
if gene in cnv.strip().split("\t")[5]:
chrom, start, end, pleft, pright, genes = cnv.strip().split("\t")
if loc == ['N/A', 'N/A', 'N/A']:
loc = [chrom, start, end]
#else:
#loc = loc, chrom, start, end
return loc
......@@ -4,7 +4,7 @@ import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats
......@@ -362,27 +362,36 @@ class ReportSizes:
for size in list_sizes:
sample_label = sample_label + [size] * nr_reps * 2
df['sample_size'] = pd.Series(sample_label)
g = sns.factorplot(x="sample_size", y="value_y_axis", col="type", data=df, kind="box", size=5,
g = sns.factorplot(x="sample_size", y="value_y_axis", col="type", data=df, kind="box", size=5, #hue="tool" for adding RUBIC in plots: colors, significance,
aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g.set_axis_labels("Sample size", y_axis).set_titles("{col_name}").despine(bottom=True)
self.add_significance(g, df, list_sizes)
png_file = os.path.join(plot_dir, y_axis.replace(" ", "_") + ".png")
plt.savefig(png_file, dpi=300)
def add_significance(self, g, df, list_sizes):
"""Add significance to plot with differences between sample sizes."""
cnv_types = ["Amplifications", "Deletions"]
cnv_type = "Amplifications"
for ax in g.axes.flat:
for i in range(1,len(list_sizes)):
x1, x2 = i-1, i
only_type = df[df['type']==cnv_type]
values_x1 = only_type[only_type['sample_size']==list_sizes[x1]]
values_x2 = only_type[only_type['sample_size']==list_sizes[x2]]
signficance = ttest_ind(values_x1['value_y_axis'], values_x2['value_y_axis']) #NON-PARAMETRIC TEST?
if signficance[1] < .05:
pval = self.test_significance(x1, x2, df, cnv_type)
if pval < .05:
y, h, col = df['value_y_axis'].max() + df['value_y_axis'].max() * 0.035, df['value_y_axis'].max() * 0.035, 'k'
ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
ax.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color=col)
cnv_type = "Deletions"
png_file = os.path.join(plot_dir, y_axis.replace(" ", "_") + ".png")
plt.savefig(png_file, dpi=300)
# def test_signficance(list_one, list_two):
def test_significance(self, x1, x2, df, cnv_type):
"""Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
Returns P-value."""
only_type = df[df['type']==cnv_type]
values_x1 = only_type[only_type['sample_size']==list_sizes[x1]]
values_x2 = only_type[only_type['sample_size']==list_sizes[x2]]
#signficance = ttest_ind(values_x1['value_y_axis'], values_x2['value_y_axis'])
significance = mannwhitneyu(values_x1['value_y_axis'], values_x2['value_y_axis'])
return significance[1]
class ReportControl:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment