...
 
Commits (3)
#Directories to be specified
workdir: /home/bftan/CNA_results #directory to write output
gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
#workdir: /home/bftan/CNA_results #directory to write output
#gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
#workdir: /home/bftan/CNA_results #directory to write output
#gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
#workdir: /home/beatrice/CNA_analysis
#gisticdir: /home/beatrice/CNA_analysis/run_gistic2
workdir: /home/beatrice/CNA_analysis
gisticdir: /home/beatrice/CNA_analysis/run_gistic2
#Input details to download from firehose
cancer_type: SKCM
......@@ -34,6 +34,6 @@ settings_gistic: "-ta 0.1
-conf 0.99"
#Settings for sample size differences
sizes: [20, 30, 40, 50, 60, 70, 80, 90]
sizes: [20, 30, 40, 60, 70, 80, 90]
#sizes: [30, 40]
repeats: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
#Directories to be specified
#workdir: /home/bftan/CNA_results #directory to write output
#gisticdir: /home/bftan/Tools/GISTIC2_test #directory to install GISTIC2
#workdir: /home/bftan/CNA_results #directory to write output
#gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
workdir: /home/beatrice/CNA_analysis
gisticdir: /home/beatrice/CNA_analysis/run_gistic2
#Input details to download from firehose
cancer_type: SKCM
date_data: "2016_07_15"
#Or provide input file
inputfile: "" #tumor segmentation data
normal: "" #normal segmentation data
#Data for running and benchmarking tools.
reference: hg19
prev_found_genes: input_files/intogen-CM-drivers-data.tsv
census_genes: input_files/Census_genes.txt
biomart_genes: input_files/biomart_human_genes.tsv
#Settings GISTIC2.0
gistic_precision: "99"
settings_gistic: "-ta 0.1
-td 0.1
-qvt 0.25
-brlen 0.7
-cap 1.5
-rx 1
-genegistic 1
-conf 0.99"
#Settings for sample size differences
sizes: [20, 30, 40, 50, 60, 70, 80, 90]
#sizes: [30, 40]
repeats: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
......@@ -51,7 +51,7 @@ rule run_rubic_subsets:
rule report_sizes:
"""Report the difference when using different sample sizes."""
input:
#gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
output:
......@@ -61,6 +61,7 @@ rule report_sizes:
census=os.path.join(workflow.basedir, config["census_genes"]),
known=os.path.join(workflow.basedir, config["prev_found_genes"]),
reps=config["repeats"],
sizes=config["sizes"],
ref=config["reference"]
run:
ReportSizes.make_report(input.rubic_gains, input.rubic_losses, params.census, params.known, params.reps, params.ref, output.report, output.plots)
ReportSizes.make_report(input.gistic, input.rubic_gains, input.rubic_losses, params.census, params.known, params.reps, params.sizes, params.ref, output.report, output.plots)
......@@ -10,88 +10,80 @@ import os.path
from ParseResults import parse_regions, get_stats
from collections import OrderedDict
def make_report(size_rubic_gains, size_rubic_losses, census_genes, known_genes, reps, ref_genome, report_file, plot_dir):
def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes, known_genes, reps, sizes, ref_genome, report_file, plot_dir):
"""Make a report of the results produced using input files with different sample sizes."""
list_stats = []
with open(report_file, 'w') as out:
dict_stats = {}
#row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
# "Nr. genes", "Nr. regions with known genes", "Nr. regions with census genes"])
tool = 'RUBIC'
for i in range(len(size_rubic_gains)):
size_file = size_rubic_gains[i], size_rubic_losses[i]
parsed_results = parse_regions(size_file, known_genes, census_genes, tool)
if tool == 'RUBIC':
size_rep = size_file[0].split("Size")[1].split("/gains")[0]
size, repetition = size_rep.split(".Rep")
else:
size_rep = size_file.split("Size")[1].split("/all_lesions")[0]
size, repetition = size_rep.split(".Rep")
stats_results = get_stats(parsed_results, size)
if size not in dict_stats.keys():
dict_stats[size] = [stats_results]
else:
dict_stats[size] = dict_stats[size] + [stats_results]
out.write("\t".join(stats_results[0]) + "\n" + "\t".join(stats_results[1]))
extract_results_sizes(dict_stats, reps, plot_dir)
for i in range(len(size_rubic_gains)): #loop through sizes
size_rep = size_rubic_gains[i].split("Size")[1].split("/gains")[0]
size, repetition = size_rep.split(".Rep")
for tool in 'GISTIC', 'RUBIC':
size_file = (size_rubic_gains[i], size_rubic_losses[i]) if tool == 'RUBIC' else size_gistic[i]
parsed_results = parse_regions(size_file, known_genes, census_genes, tool)
stats_results = get_stats(parsed_results, size)
for stat_list in stats_results[0], stats_results[1]:
converted_stats = [tool] + stat_list[0:2]
for stat in stat_list[2:5]:
converted_stats.append(float(stat))
for stat in stat_list[5:]:
converted_stats.append(float(stat.split(" (")[0]))
list_stats.append(converted_stats)
make_plots(list_stats, reps, sizes, plot_dir)
def extract_results_sizes(dict_stats, reps, plot_dir):
"""Prepare data from a dict with all results to plot the differences between analyses with different sample sizes."""
sizes = sorted(dict_stats.keys())
nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known = [], [], [], [], [], []
for size in sizes:
for rep in range(len(reps)):
for cnv_type in 0, 1:
stats = dict_stats[size][rep][cnv_type]
nr_regions.append(float(stats[2]))
reg_size = stats[3]
if reg_size != 'N/A':
avg_size.append(float(reg_size))
total_size.append(float(stats[4]))
nr_genes.append(float(stats[5]))
nr_census.append(float(stats[7].split(" (")[0]))
nr_known.append(float(stats[6].split(" (")[0]))
plot_data = [nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known]
def make_plots(list_stats, reps, sizes, plot_dir):
plot_y_axis = (['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
'Number of genes', 'Nr. regions with census genes', 'Nr. regions with known genes'])
for plot_nr in range(len(plot_data)):
plot_size_differences(plot_data[plot_nr], sizes, len(reps), plot_y_axis[plot_nr], plot_dir)
'Number of genes', 'Nr. regions with known genes', 'Nr. regions with census genes'])
df_stats = pd.DataFrame(list_stats, columns=['Tool', 'Sample size', 'Type'] + plot_y_axis)
for plot_name in plot_y_axis:
plot_size_differences(df_stats, plot_name, sizes, len(reps), plot_dir)
for tool in 'GISTIC', 'RUBIC':
plot_size_differences_per_tool(df_stats, plot_name, sizes, len(reps), plot_dir, tool)
def plot_size_differences(list_size_results, list_sizes, nr_reps, y_axis, plot_dir):
def plot_size_differences(df_stats, value_y_axis, list_sizes, nr_reps, plot_dir):
"""Plot the differences between analyses using different sample sizes."""
sns.set_style("whitegrid")
df = pd.DataFrame(list_size_results, columns=['value_y_axis'])
df['type'] = pd.Series(['Amplifications', 'Deletions'] * len(list_sizes) * nr_reps)
sample_label = []
for size in list_sizes:
sample_label = sample_label + [size] * nr_reps * 2
df['sample_size'] = pd.Series(sample_label)
g = sns.factorplot(x="sample_size", y="value_y_axis", col="type", data=df, kind="box", size=5, #hue="tool" for adding RUBIC in plots: colors, significance,
aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g.set_axis_labels("Sample size", y_axis).set_titles("{col_name}").despine(bottom=True)
add_significance(g, df, list_sizes)
png_file = os.path.join(plot_dir, y_axis.replace(" ", "_") + ".png")
g = sns.factorplot(x="Sample size", y=value_y_axis, col="Type", hue="Tool", data=df_stats, kind="box",
size=5, aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g.set_axis_labels("Sample size", value_y_axis).set_titles("{col_name}").despine(bottom=True)
#add_significance(g, df_stats, list_sizes, value_y_axis)
png_file = os.path.join(plot_dir, value_y_axis.replace(" ", "_") + ".png")
plt.savefig(png_file, dpi=300)
plt.close()
def add_significance(g, df, list_sizes):
def plot_size_differences_per_tool(df_stats, value_y_axis, list_sizes, nr_reps, plot_dir, tool):
"""Plot the differences between analyses using different sample sizes."""
sns.set_style("whitegrid")
df_tool = df_stats[df_stats['Tool']==tool]
g = sns.factorplot(x="Sample size", y=value_y_axis, col="Type", data=df_tool, kind="box",
size=5, aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g.set_axis_labels("Sample size", value_y_axis).set_titles("{col_name}").despine(bottom=True)
#add_significance(g, df_stats, list_sizes, value_y_axis)
png_file = os.path.join(plot_dir, value_y_axis.replace(" ", "_") + "_" + tool + ".png")
plt.savefig(png_file, dpi=300)
plt.close()
def add_significance(plot, df, list_sizes, value_y_axis):
"""Add significance to plot with differences between sample sizes."""
cnv_types = ["Amplifications", "Deletions"]
cnv_type = "Amplifications"
for ax in g.axes.flat:
for ax in plot.axes.flat:
for i in range(1,len(list_sizes)):
x1, x2 = i-1, i
pval = test_significance(x1, x2, df, cnv_type, list_sizes)
pval = test_significance(x1, x2, df, cnv_type, list_sizes, value_y_axis)
if pval < .05:
y, h, col = df['value_y_axis'].max() + df['value_y_axis'].max() * 0.035, df['value_y_axis'].max() * 0.035, 'k'
ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
ax.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color=col)
y = df[value_y_axis].max() + df[value_y_axis].max() * 0.035
h = df[value_y_axis].max() * 0.035
ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c='k')
ax.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color='k')
cnv_type = "Deletions"
def test_significance(x1, x2, df, cnv_type, list_sizes):
def test_significance(x1, x2, df, cnv_type, list_sizes, value_y_axis):
"""Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
Returns P-value."""
only_type = df[df['type']==cnv_type]
values_x1 = only_type[only_type['sample_size']==list_sizes[x1]]
values_x2 = only_type[only_type['sample_size']==list_sizes[x2]]
only_tool = df[df['Type']==cnv_type]
#only_tool = only_type[only_type['Tool']=='GISTIC']
values_x1 = only_tool[only_tool['Sample size']==list_sizes[x1]]
values_x2 = only_tool[only_tool['Sample size']==list_sizes[x2]]
#signficance = ttest_ind(values_x1['value_y_axis'], values_x2['value_y_axis'])
significance = mannwhitneyu(values_x1['value_y_axis'], values_x2['value_y_axis'])
significance = mannwhitneyu(values_x1[value_y_axis], values_x2[value_y_axis])
return significance[1]
......@@ -31,7 +31,7 @@ def make_report(gistic_results, rubic_gain_results, rubic_loss_results,
def make_tool_report(file_tools, stats_tools):
"""Make a report of the results from GISTIC and RUBIC."""
row_names = ["Tool", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Regions with known genes", "Regions with census genes"]
row_names = ["Tool", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Nr. regions with known genes", "Nr. regions with census genes"]
with open(file_tools, 'w') as out:
for i in range(len(row_names)):
list_out = [row_names[i]]
......