Commit a61e8e72 authored by BeatriceTan's avatar BeatriceTan
Browse files

Updated report scripts based on new structure.

parent 44467197
......@@ -5,6 +5,20 @@ ensembl = pyensembl.EnsemblRelease(75)
from math import log10
from collections import OrderedDict
def install_ensembl(reference_genome):
    """Select and install the pyensembl release matching *reference_genome*.

    hg19 maps to Ensembl release 75; any GRCh38-style name (contains "38")
    maps to release 87.

    Returns the configured EnsemblRelease object (the original discarded it).
    Raises ValueError for an unrecognised reference genome.
    """
    if "hg19" in reference_genome:
        ensembl = pyensembl.EnsemblRelease(75)
    elif "38" in reference_genome:
        ensembl = pyensembl.EnsemblRelease(87)
    else:
        raise ValueError("Unknown reference genome used.")
    try:
        # Only needed the first time: afterwards the release data is cached
        # locally and these calls are best-effort no-ops / may fail harmlessly.
        ensembl.download()
        ensembl.index()
    except Exception:  # narrowed from bare except: don't mask SystemExit/KeyboardInterrupt
        pass
    return ensembl
def parse(results_file, known_file, census_file, used_tool):
"""Make list with the parsed results from GISTIC or RUBIC.
Format: [chrom, start, end, cnv_type, qval, [known_gene_list], [census_gene_list], [gene_list]]"""
......
......@@ -7,30 +7,25 @@ import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats
from ParseResults import parse, get_stats, install_ensembl
from collections import OrderedDict
# NOTE(review): this span is a captured GitLab diff — the removed (old) and
# added (new) lines of ReportControl are interleaved below and all indentation
# was lost. The old entry point was a plain make_report(); the new code moves
# the same work into __init__ + make_control_report.
class ReportControl:
# Old (removed) entry point:
def make_report(control_results, nocontrol_results, report_file, census_genes, known_genes, ref_genome):
"""Make a report on analyses using control samples or without using them."""
# New (added) constructor: install the matching ensembl release, then write the report.
def __init__(self, control_results, nocontrol_results, report_file, census_genes, known_genes, ref_genome):
install_ensembl(ref_genome)
self.make_control_report(control_results, nocontrol_results, report_file, census_genes, known_genes)
def make_control_report(self, control_results, nocontrol_results, report_file, census_genes, known_genes):
"""Write report."""
parsed_tools, stats_tools = [], []
with open(report_file, 'w') as out:
row_names = (["Control used?", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"])
for result in control_results, nocontrol_results:
# NOTE(review): ParseResults.parse is imported as a function taking
# (results_file, known_file, census_file, used_tool); calling parse() with no
# arguments and then .gistic_results(...) looks like the pre-refactor API — confirm.
parsed_results = parse().gistic_results(result)
parsed_tools.append(parsed_results)
label = "Control" if result == control_results else "No control"
# NOTE(review): `stats` is neither imported nor defined in this chunk —
# presumably get_stats was meant; as written this raises NameError.
stats_results = stats().calculate_stats(parsed_results, census_genes, known_genes, label)
stats_tools.append(stats_results)
#self.make_gene_files(parsed_tools[0], parsed_tools[1], file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_venn)
for i in range(len(row_names)):
# One tab-separated output row: row label + amp/del stats for both analyses.
list_out = [row_names[i], stats_tools[0][0][i], stats_tools[0][1][i], stats_tools[1][0][i], stats_tools[1][1][i]]
list_out = list(map(str, list_out))
out.write("\t".join(list_out) + "\n")
#self.compare_known_regions(known_genes, file_overlap, gene_file, gistic_results, rubic_results)
# --- Old (removed) body of make_report below; duplicates the method above ---
install_ensembl(ref_genome)
parsed_tools, stats_tools = [], []
with open(report_file, 'w') as out:
row_names = (["Control used?", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"])
for result in control_results, nocontrol_results:
parsed_results = parse().gistic_results(result)
parsed_tools.append(parsed_results)
label = "Control" if result == control_results else "No control"
stats_results = stats().calculate_stats(parsed_results, census_genes, known_genes, label)
stats_tools.append(stats_results)
#make_gene_files(parsed_tools[0], parsed_tools[1], file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_venn)
for i in range(len(row_names)):
list_out = [row_names[i], stats_tools[0][0][i], stats_tools[0][1][i], stats_tools[1][0][i], stats_tools[1][1][i]]
list_out = list(map(str, list_out))
out.write("\t".join(list_out) + "\n")
#compare_known_regions(known_genes, file_overlap, gene_file, gistic_results, rubic_results)
......@@ -10,71 +10,66 @@ import pyensembl
from ParseResults import parse, get_stats
from collections import OrderedDict
# NOTE(review): captured diff — the removed (old) and added (new) versions of
# ReportSegmentation are interleaved below and all indentation was lost.
class ReportSegmentation:
# Old (removed) entry point:
def make_report(segmentation_file, report_file):
"""Make a report of the segmentation file used as input."""
# New (added) constructor simply delegates to make_seg_report.
def __init__(self, segmentation_file, report_file):
self.make_seg_report(segmentation_file, report_file)
# --- Old (removed) body of make_report ---
with open(report_file, 'w') as out:
seg_rows = (["Number of samples", "Smallest number of CNVs in a sample", "Largest number of CNVs in a sample",
"Average number of CNVs per sample", "Average number of focal CNVs per sample",
"Average number of deletions per sample", "Average length of CNVs (Mb)"])
seg_stats = get_segment_stats(segmentation_file)
seg_stats_str = [str(i) for i in seg_stats]
for i in range(len(seg_rows)):
out.write("\t".join([seg_rows[i], seg_stats_str[i]]) + "\n")
# New (added) method: writes one "row name<TAB>value" line per statistic.
def make_seg_report(self, segmentation_file, report_file):
"""Write report."""
with open(report_file, 'w') as out:
seg_rows = (["Number of samples", "Smallest number of CNVs in a sample", "Largest number of CNVs in a sample",
"Average number of CNVs per sample", "Average number of focal CNVs per sample",
"Average number of deletions per sample", "Average length of CNVs (Mb)"])
seg_stats = self.get_segment_stats(segmentation_file)
seg_stats_str = [str(i) for i in seg_stats]
for i in range(len(seg_rows)):
out.write("\t".join([seg_rows[i], seg_stats_str[i]]) + "\n")
def get_segment_stats(segmentation_file):
    """Calculate several statistics from the segmentation file.

    Returns [nr_samples, smallest_nr_cnvs, largest_nr_cnvs] followed by the
    per-sample averages produced by calculate_stats.
    """
    # Per-sample entries: [nr_cnvs, total_cnv_length, focal_count, del_count]
    per_sample = list(parse_segments(segmentation_file).values())
    cnv_counts = [entry[0] for entry in per_sample]
    total_length = sum(entry[1] for entry in per_sample)
    total_focal = sum(entry[2] for entry in per_sample)
    total_del = sum(entry[3] for entry in per_sample)
    averages = calculate_stats(total_length, total_focal, total_del, sum(cnv_counts), len(per_sample))
    return [len(per_sample), min(cnv_counts), max(cnv_counts)] + averages
def get_segment_stats(self, segmentation_file):
    """Calculate several statistics from the segmentation file.

    Returns [nr_samples, smallest_nr_cnvs, largest_nr_cnvs] followed by the
    per-sample averages produced by self.calculate_stats.
    """
    # Per-sample entries: [nr_cnvs, total_cnv_length, focal_count, del_count]
    per_sample = list(self.parse_segments(segmentation_file).values())
    cnv_counts = [entry[0] for entry in per_sample]
    total_length = sum(entry[1] for entry in per_sample)
    total_focal = sum(entry[2] for entry in per_sample)
    total_del = sum(entry[3] for entry in per_sample)
    averages = self.calculate_stats(total_length, total_focal, total_del, sum(cnv_counts), len(per_sample))
    return [len(per_sample), min(cnv_counts), max(cnv_counts)] + averages
def parse_segments(segmentation_file):
    """Parse the input segmentation file and return a dictionary with number of CNVs and total CNV length per sample.

    Each value is [nr_cnvs, total_cnv_length, focal_count, del_count].
    Expects a tab-separated file with one header line and columns:
    sample, chrom, start, end, num_probes, seg_CN.
    """
    stat_dict = OrderedDict()
    with open(segmentation_file, 'r') as lines:
        lines.readline()  # skip the header line
        for line in lines:
            sample, chrom, start, end, num_prob, seg_CN = line.strip().split("\t")
            seg_cn = float(seg_CN)  # convert once instead of per comparison
            if sample not in stat_dict:
                stat_dict[sample] = [0, 0, 0, 0]
            cnv_length = int(end) - int(start)
            stat_dict[sample][0] += 1
            stat_dict[sample][1] += cnv_length
            if cnv_length <= 3000000:  # 3 Mb threshold for a "focal" CNV
                stat_dict[sample][2] += 1
            if seg_cn < 0.0:  # negative segment value -> deletion (assumes log-ratio input — confirm)
                stat_dict[sample][3] += 1
    # Removed: min/max seg_CN tracking that existed only to feed leftover
    # debug print() calls; it was dead code otherwise.
    return stat_dict
def parse_segments(self, segmentation_file):
    """Parse the input segmentation file and return a dictionary with number of CNVs and total CNV length per sample.

    Each value is [nr_cnvs, total_cnv_length, focal_count, del_count].
    Expects a tab-separated file with one header line and columns:
    sample, chrom, start, end, num_probes, seg_CN.
    """
    stat_dict = OrderedDict()
    with open(segmentation_file, 'r') as lines:
        lines.readline()  # skip the header line
        for line in lines:
            sample, chrom, start, end, num_prob, seg_CN = line.strip().split("\t")
            seg_cn = float(seg_CN)  # convert once instead of per comparison
            if sample not in stat_dict:
                stat_dict[sample] = [0, 0, 0, 0]
            cnv_length = int(end) - int(start)
            stat_dict[sample][0] += 1
            stat_dict[sample][1] += cnv_length
            if cnv_length <= 3000000:  # 3 Mb threshold for a "focal" CNV
                stat_dict[sample][2] += 1
            if seg_cn < 0.0:  # negative segment value -> deletion (assumes log-ratio input — confirm)
                stat_dict[sample][3] += 1
    # Removed: min/max seg_CN tracking that existed only to feed leftover
    # debug print() calls; it was dead code otherwise.
    return stat_dict
def calculate_stats(self, tot_size, tot_nr_focal, tot_nr_del, tot_nr_cnvs, nr_samples):
    """Calculate average number CNVs, number of focal CNVs and the average size of the CNVs.

    Returns [avg_nr_per_sample, avg_nr_focal, avg_nr_del, avg_size_mb].
    """
    # Mean CNV size in Mb, rounded to two decimals.
    mean_size_mb = round(tot_size / tot_nr_cnvs / 1000000, 2)
    # Integer-truncated per-sample averages for counts.
    per_sample = [int(total / nr_samples) for total in (tot_nr_cnvs, tot_nr_focal, tot_nr_del)]
    return per_sample + [mean_size_mb]
def calculate_stats(tot_size, tot_nr_focal, tot_nr_del, tot_nr_cnvs, nr_samples):
    """Calculate average number CNVs, number of focal CNVs and the average size of the CNVs.

    Returns [avg_nr_per_sample, avg_nr_focal, avg_nr_del, avg_size_mb].
    Guards against empty input (no CNVs / no samples) instead of raising
    ZeroDivisionError; nonzero inputs behave exactly as before.
    """
    if tot_nr_cnvs == 0 or nr_samples == 0:
        return [0, 0, 0, 0.0]
    avg_size = round(((tot_size / tot_nr_cnvs) / 1000000), 2)  # mean CNV size in Mb
    avg_nr_per_sample = int(tot_nr_cnvs / nr_samples)
    avg_nr_focal = int(tot_nr_focal / nr_samples)
    avg_nr_del = int(tot_nr_del / nr_samples)
    return [avg_nr_per_sample, avg_nr_focal, avg_nr_del, avg_size]
......@@ -7,91 +7,86 @@ import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats
from ParseResults import parse, get_stats, install_ensembl
from collections import OrderedDict
# NOTE(review): captured diff — the removed (old) make_report and the added
# (new) __init__/make_sizes_report are interleaved below; indentation was lost.
class ReportSizes:
"""Make a report and plots on analyses with different sample sizes"""
def __init__(self, size_results, census_genes, known_genes, reps, ref_genome, report_file, plot_dir):
install_ensembl(ref_genome)
self.make_sizes_report(size_results, census_genes, known_genes, reps, report_file, plot_dir)
# Old (removed) entry point:
def make_report(size_results, census_genes, known_genes, reps, ref_genome, report_file, plot_dir):
"""Make a report of the results produced using input files with different sample sizes."""
install_ensembl(ref_genome)
with open(report_file, 'w') as out:
dict_stats = OrderedDict()
# NOTE(review): row_names is built but never used in either version.
row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. regions with census genes", "Nr. regions with known genes", "Nr. genes"])
# NOTE(review): GISTIC2 is an undefined name (NameError); presumably the
# string "GISTIC2" was intended — confirm against ParseResults.parse.
tool = GISTIC2
for size_file in size_results:
parsed_results = parse(size_file, known_genes, census_genes, tool)
# File names are expected to look like "<size>x<repetition>" — confirm.
size, repetition = size_file.split("/")[-1].split("x")
stats_results = get_stats(parsed_results, size)
if size not in dict_stats.keys():
dict_stats[size] = [stats_results]
else:
dict_stats[size] = dict_stats[size] + [stats_results]
# NOTE(review): file.write() requires a string; passing the dict raises
# TypeError — probably str(dict_stats) or structured rows were intended.
out.write(dict_stats)
# NOTE(review): the defined helper is named extract_results_sizes, not
# extract_stats_sizes — this call raises NameError as written.
extract_stats_sizes(dict_stats, reps, plot_dir)
# New (added) method — same body as the old make_report minus install_ensembl:
def make_sizes_report(self, size_results, census_genes, known_genes, reps, report_file, plot_dir):
"""Make a report of the results produced using input files with different sample sizes."""
with open(report_file, 'w') as out:
dict_stats = OrderedDict()
row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. regions with census genes", "Nr. regions with known genes", "Nr. genes"])
tool = GISTIC2
for size_file in size_results:
# NOTE(review): parsed_gistic is assigned but never used afterwards.
parsed_gistic = parsed_results = parse(size_file, known_genes, census_genes, tool)
size, repetition = size_file.split("/")[-1].split("x")
stats_results = get_stats(parsed_results, size)
if size not in dict_stats.keys():
dict_stats[size] = [stats_results]
else:
dict_stats[size] = dict_stats[size] + [stats_results]
out.write(dict_stats)
# NOTE(review): the method defined below is extract_results_sizes —
# self.extract_stats_sizes raises AttributeError as written.
self.extract_stats_sizes(dict_stats, reps, plot_dir)
def extract_results_sizes(dict_stats, reps, plot_dir=None):
    """Prepare data from a dict with all results to plot the differences between analyses with different sample sizes.

    dict_stats maps sample size -> list (one entry per repetition) of
    [amplification_stats, deletion_stats] rows.
    plot_dir: output directory for the plots. Added as a backward-compatible
    parameter — the original body referenced an undefined free variable
    `plot_dir`, which raised NameError at runtime (the caller already passes
    it as a third argument).
    """
    sizes = sorted(dict_stats.keys())
    nr_regions, avg_size, total_size = [], [], []
    nr_genes, nr_census, nr_known = [], [], []
    for size in sizes:
        for rep in range(len(reps)):
            for cnv_type in (0, 1):  # 0 = amplifications, 1 = deletions
                row = dict_stats[size][rep][cnv_type]
                nr_regions.append(float(row[2]))
                if row[3] != 'N/A':  # average region size is 'N/A' when no regions were found
                    avg_size.append(float(row[3]))
                total_size.append(float(row[4]))
                nr_genes.append(float(row[7]))
                # Census/known columns look like "<count> (<pct>)" — keep the count part.
                nr_census.append(float(row[6].split(" (")[0]))
                nr_known.append(float(row[5].split(" (")[0]))
    plot_data = [nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known]
    plot_y_axis = (['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
                    'Number of genes', 'Nr. regions with census genes', 'Nr. regions with known genes'])
    for plot_nr in range(len(plot_data)):
        plot_size_differences(plot_data[plot_nr], sizes, len(reps), plot_y_axis[plot_nr], plot_dir)
def extract_results_sizes(self, dict_stats, reps, plot_dir=None):
    """Prepare data from a dict with all results to plot the differences between analyses with different sample sizes.

    dict_stats maps sample size -> list (one entry per repetition) of
    [amplification_stats, deletion_stats] rows.
    plot_dir: output directory for the plots. Added as a backward-compatible
    parameter — the original body referenced an undefined free variable
    `plot_dir`, which raised NameError at runtime (the caller already passes
    it as a third argument).
    """
    sizes = sorted(dict_stats.keys())
    nr_regions, avg_size, total_size = [], [], []
    nr_genes, nr_census, nr_known = [], [], []
    for size in sizes:
        for rep in range(len(reps)):
            for cnv_type in (0, 1):  # 0 = amplifications, 1 = deletions
                row = dict_stats[size][rep][cnv_type]
                nr_regions.append(float(row[2]))
                if row[3] != 'N/A':  # average region size is 'N/A' when no regions were found
                    avg_size.append(float(row[3]))
                total_size.append(float(row[4]))
                nr_genes.append(float(row[7]))
                # Census/known columns look like "<count> (<pct>)" — keep the count part.
                nr_census.append(float(row[6].split(" (")[0]))
                nr_known.append(float(row[5].split(" (")[0]))
    plot_data = [nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known]
    plot_y_axis = (['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
                    'Number of genes', 'Nr. regions with census genes', 'Nr. regions with known genes'])
    for plot_nr in range(len(plot_data)):
        self.plot_size_differences(plot_data[plot_nr], sizes, len(reps), plot_y_axis[plot_nr], plot_dir)
# Old (removed) module-level version of plot_size_differences.
def plot_size_differences(list_size_results, list_sizes, nr_reps, y_axis, plot_dir):
"""Plot the differences between analyses using different sample sizes."""
sns.set_style("whitegrid")
df = pd.DataFrame(list_size_results, columns=['value_y_axis'])
# One Amp/Del label pair per repetition per size.
# NOTE(review): this assumes len(list_size_results) == 2 * len(list_sizes) * nr_reps;
# the avg_size series filters out 'N/A' entries and can be shorter — confirm.
df['type'] = pd.Series(['Amplifications', 'Deletions'] * len(list_sizes) * nr_reps)
sample_label = []
for size in list_sizes:
sample_label = sample_label + [size] * nr_reps * 2
df['sample_size'] = pd.Series(sample_label)
# NOTE(review): sns.factorplot and its size= keyword are deprecated in
# seaborn >= 0.9 (renamed catplot / height=) — confirm installed version.
g = sns.factorplot(x="sample_size", y="value_y_axis", col="type", data=df, kind="box", size=5, #hue="tool" for adding RUBIC in plots: colors, significance,
aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g.set_axis_labels("Sample size", y_axis).set_titles("{col_name}").despine(bottom=True)
add_significance(g, df, list_sizes)
# PNG name is the y-axis label with spaces replaced by underscores.
png_file = os.path.join(plot_dir, y_axis.replace(" ", "_") + ".png")
plt.savefig(png_file, dpi=300)
# New (added) method version of plot_size_differences.
def plot_size_differences(self, list_size_results, list_sizes, nr_reps, y_axis, plot_dir):
"""Plot the differences between analyses using different sample sizes."""
sns.set_style("whitegrid")
df = pd.DataFrame(list_size_results, columns=['value_y_axis'])
# One Amp/Del label pair per repetition per size.
# NOTE(review): this assumes len(list_size_results) == 2 * len(list_sizes) * nr_reps;
# the avg_size series filters out 'N/A' entries and can be shorter — confirm.
df['type'] = pd.Series(['Amplifications', 'Deletions'] * len(list_sizes) * nr_reps)
sample_label = []
for size in list_sizes:
sample_label = sample_label + [size] * nr_reps * 2
df['sample_size'] = pd.Series(sample_label)
# NOTE(review): sns.factorplot and its size= keyword are deprecated in
# seaborn >= 0.9 (renamed catplot / height=) — confirm installed version.
g = sns.factorplot(x="sample_size", y="value_y_axis", col="type", data=df, kind="box", size=5, #hue="tool" for adding RUBIC in plots: colors, significance,
aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g.set_axis_labels("Sample size", y_axis).set_titles("{col_name}").despine(bottom=True)
self.add_significance(g, df, list_sizes)
# PNG name is the y-axis label with spaces replaced by underscores.
png_file = os.path.join(plot_dir, y_axis.replace(" ", "_") + ".png")
plt.savefig(png_file, dpi=300)
def add_significance(g, df, list_sizes):
    """Add significance to plot with differences between sample sizes."""
    # First facet holds amplifications, second deletions (facet order set by the caller's plot).
    cnv_type = "Amplifications"
    for ax in g.axes.flat:
        for right in range(1, len(list_sizes)):
            left = right - 1
            # Draw a bracket with a star over each significant adjacent pair.
            if test_significance(left, right, df, cnv_type) < .05:
                peak = df['value_y_axis'].max()
                bar_y = peak + peak * 0.035
                bar_h = peak * 0.035
                ax.plot([left, left, right, right],
                        [bar_y, bar_y + bar_h, bar_y + bar_h, bar_y], lw=1.5, c='k')
                ax.text((left + right) * .5, bar_y + bar_h, "*",
                        ha='center', va='bottom', color='k')
        cnv_type = "Deletions"
def add_significance(self, g, df, list_sizes):
    """Add significance to plot with differences between sample sizes."""
    # First facet holds amplifications, second deletions (facet order set by the caller's plot).
    cnv_type = "Amplifications"
    for ax in g.axes.flat:
        for right in range(1, len(list_sizes)):
            left = right - 1
            # Draw a bracket with a star over each significant adjacent pair.
            if self.test_significance(left, right, df, cnv_type) < .05:
                peak = df['value_y_axis'].max()
                bar_y = peak + peak * 0.035
                bar_h = peak * 0.035
                ax.plot([left, left, right, right],
                        [bar_y, bar_y + bar_h, bar_y + bar_h, bar_y], lw=1.5, c='k')
                ax.text((left + right) * .5, bar_y + bar_h, "*",
                        ha='center', va='bottom', color='k')
        cnv_type = "Deletions"
# New (added) method version.
def test_significance(self, x1, x2, df, cnv_type):
"""Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
Returns P-value."""
only_type = df[df['type']==cnv_type]
# NOTE(review): list_sizes is not defined in this scope (not a parameter,
# not a global in this chunk) — these lines raise NameError as written;
# it should probably be passed in by add_significance.
values_x1 = only_type[only_type['sample_size']==list_sizes[x1]]
values_x2 = only_type[only_type['sample_size']==list_sizes[x2]]
#signficance = ttest_ind(values_x1['value_y_axis'], values_x2['value_y_axis'])
significance = mannwhitneyu(values_x1['value_y_axis'], values_x2['value_y_axis'])
# NOTE(review): no return statement here — the old version ended with
# `return significance[1]`; the return may have been lost in the diff
# capture, otherwise this implicitly returns None despite the docstring.
# Old (removed) module-level version.
def test_significance(x1, x2, df, cnv_type):
"""Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
Returns P-value."""
only_type = df[df['type']==cnv_type]
# NOTE(review): list_sizes is a free variable here (not a parameter, not a
# global in this chunk) — these lines raise NameError as written.
values_x1 = only_type[only_type['sample_size']==list_sizes[x1]]
values_x2 = only_type[only_type['sample_size']==list_sizes[x2]]
#signficance = ttest_ind(values_x1['value_y_axis'], values_x2['value_y_axis'])
significance = mannwhitneyu(values_x1['value_y_axis'], values_x2['value_y_axis'])
# mannwhitneyu returns (statistic, pvalue); index 1 is the p-value.
return significance[1]
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment