Commit 815b2823 authored by Tan's avatar Tan
Browse files

Updated sample size report for RUBIC files.

parent 2d44810b
......@@ -51,21 +51,16 @@ rule run_rubic_subsets:
rule report_sizes:
"""Report the difference when using different sample sizes."""
gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
<<<<<<< HEAD
#gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
#rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
#rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
>>>>>>> 9067043362f4034a0e46f4579c366d1228927193
census=os.path.join(workflow.basedir, config["census_genes"]),
known=os.path.join(workflow.basedir, config["prev_found_genes"]),
ReportSizes.make_report(input[0], params.census, params.known, params.reps, params.ref,, output.plots)
ReportSizes.make_report(input.rubic_gains, input.rubic_losses, params.census, params.known, params.reps, params.ref,, output.plots)
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
......@@ -9,25 +10,31 @@ import os.path
from ParseResults import parse_regions, get_stats
from collections import OrderedDict
def make_report(size_results, census_genes, known_genes, reps, ref_genome, report_file, plot_dir):
def make_report(size_rubic_gains, size_rubic_losses, census_genes, known_genes, reps, ref_genome, report_file, plot_dir):
"""Make a report of the results produced using input files with different sample sizes."""
with open(report_file, 'w') as out:
dict_stats = OrderedDict()
row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. regions with census genes", "Nr. regions with known genes", "Nr. genes"])
tool = GISTIC2
for size_file in size_results:
dict_stats = {}
#row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
# "Nr. genes", "Nr. regions with known genes", "Nr. regions with census genes"])
tool = 'RUBIC'
for i in range(len(size_rubic_gains)):
size_file = size_rubic_gains[i], size_rubic_losses[i]
parsed_results = parse_regions(size_file, known_genes, census_genes, tool)
size, repetition = size_file.split("/")[-1].split("x")
if tool == 'RUBIC':
size_rep = size_file[0].split("Size")[1].split("/gains")[0]
size, repetition = size_rep.split(".Rep")
size_rep = size_file.split("Size")[1].split("/all_lesions")[0]
size, repetition = size_rep.split(".Rep")
stats_results = get_stats(parsed_results, size)
if size not in dict_stats.keys():
dict_stats[size] = [stats_results]
dict_stats[size] = dict_stats[size] + [stats_results]
extract_stats_sizes(dict_stats, reps, plot_dir)
out.write("\t".join(stats_results[0]) + "\n" + "\t".join(stats_results[1]))
extract_results_sizes(dict_stats, reps, plot_dir)
def extract_results_sizes(dict_stats, reps):
def extract_results_sizes(dict_stats, reps, plot_dir):
"""Prepare data from a dict with all results to plot the differences between analyses with different sample sizes."""
sizes = sorted(dict_stats.keys())
nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known = [], [], [], [], [], []
......@@ -40,9 +47,9 @@ def extract_results_sizes(dict_stats, reps):
if reg_size != 'N/A':
nr_census.append(float(stats[6].split(" (")[0]))
nr_known.append(float(stats[5].split(" (")[0]))
nr_census.append(float(stats[7].split(" (")[0]))
nr_known.append(float(stats[6].split(" (")[0]))
plot_data = [nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known]
plot_y_axis = (['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
'Number of genes', 'Nr. regions with census genes', 'Nr. regions with known genes'])
......@@ -72,14 +79,14 @@ def add_significance(g, df, list_sizes):
for ax in g.axes.flat:
for i in range(1,len(list_sizes)):
x1, x2 = i-1, i
pval = test_significance(x1, x2, df, cnv_type)
pval = test_significance(x1, x2, df, cnv_type, list_sizes)
if pval < .05:
y, h, col = df['value_y_axis'].max() + df['value_y_axis'].max() * 0.035, df['value_y_axis'].max() * 0.035, 'k'
ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
ax.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color=col)
cnv_type = "Deletions"
def test_significance(x1, x2, df, cnv_type):
def test_significance(x1, x2, df, cnv_type, list_sizes):
"""Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
Returns P-value."""
only_type = df[df['type']==cnv_type]
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment