Commit 25d1700c authored by Beatrice Tan's avatar Beatrice Tan
parents cc4e5d19 815b2823
......@@ -43,9 +43,6 @@ rule all:
"""Define desired output from pipeline."""
input:
"Samplesize/Report.txt"
#"Reports/Overlap_plots/12.png"
#expand("Reports/Overlap_plots/{region}.png", region=range(22))
rule help:
"""Print list of all targets with help."""
......
#Directories to be specified
workdir: /home/bftan/CNA_results #directory to write output
gisticdir: /home/bftan/Tools/GISTIC2_test #directory to install GISTIC2
gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
#workdir: /home/bftan/CNA_results #directory to write output
#gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
......@@ -34,6 +34,6 @@ settings_gistic: "-ta 0.1
-conf 0.99"
#Settings for sample size differences
#sizes: [30, 40, 50, 60, 70, 80, 90]
sizes: [30, 40]
sizes: [20, 30, 40, 50, 60, 70, 80, 90]
#sizes: [30, 40]
repeats: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
import ReportSizes
from SampleSizes import SegFile
from Rubic import MarkerFile
import os.path
import datetime
rule seg_subsets:
"""Create segmentation files with different numbers of samples (randomly chosen) for a number of times."""
......@@ -17,10 +18,10 @@ rule run_gistic_subsets:
gistic_directory=os.path.join(config["gisticdir"], "gistic2"),
seg="Samplesize/Input/Size{rand_nr}.Rep{rep_nr}.txt"
output:
"Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt"
"Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt",
"Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/regions_track.conf_" + config["gistic_precision"] + ".bed"
params:
cnv="",
gistic_directory=config["gisticdir"],
ref=config["reference"],
ref_file="",
extra=config["settings_gistic"]
......@@ -50,16 +51,16 @@ rule run_rubic_subsets:
rule report_sizes:
"""Report the difference when using different sample sizes."""
input:
gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
#rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
#rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
#gistic=expand("Samplesize/GISTIC/Size{rand_nr}.Rep{rep_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_gains=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/gains.txt", rand_nr=config["sizes"], rep_nr=config["repeats"]),
rubic_losses=expand("Samplesize/RUBIC/Size{rand_nr}.Rep{rep_nr}/losses.txt", rand_nr=config["sizes"], rep_nr=config["repeats"])
output:
report="Samplesize/Report.txt",
plots="Samplesize/Plots/"
params:
census=config["census_genes"],
known=config["prev_found_genes"],
census=os.path.join(workflow.basedir, config["census_genes"]),
known=os.path.join(workflow.basedir, config["prev_found_genes"]),
reps=config["repeats"],
ref=config["reference"]
run:
ReportSizes.make_report(input[0], params.census, params.known, params.reps, params.ref, output.report, output.plots)
ReportSizes.make_report(input.rubic_gains, input.rubic_losses, params.census, params.known, params.reps, params.ref, output.report, output.plots)
......@@ -20,6 +20,6 @@ snakemake -p \
--latency-wait 90 \
--jobs 1 \
--max-jobs-per-second 10 \
--restart-times 3 \
--use-conda \
--conda-prefix ~/.conda
--restart-times 3
# --use-conda \
# --conda-prefix ~/.conda
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
plt.switch_backend('agg')
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
......@@ -9,25 +10,31 @@ import os.path
from ParseResults import parse_regions, get_stats
from collections import OrderedDict
def make_report(size_results, census_genes, known_genes, reps, ref_genome, report_file, plot_dir):
def make_report(size_rubic_gains, size_rubic_losses, census_genes, known_genes, reps, ref_genome, report_file, plot_dir):
"""Make a report of the results produced using input files with different sample sizes."""
with open(report_file, 'w') as out:
dict_stats = OrderedDict()
row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. regions with census genes", "Nr. regions with known genes", "Nr. genes"])
tool = GISTIC2
for size_file in size_results:
dict_stats = {}
#row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
# "Nr. genes", "Nr. regions with known genes", "Nr. regions with census genes"])
tool = 'RUBIC'
for i in range(len(size_rubic_gains)):
size_file = size_rubic_gains[i], size_rubic_losses[i]
parsed_results = parse_regions(size_file, known_genes, census_genes, tool)
size, repetition = size_file.split("/")[-1].split("x")
if tool == 'RUBIC':
size_rep = size_file[0].split("Size")[1].split("/gains")[0]
size, repetition = size_rep.split(".Rep")
else:
size_rep = size_file.split("Size")[1].split("/all_lesions")[0]
size, repetition = size_rep.split(".Rep")
stats_results = get_stats(parsed_results, size)
if size not in dict_stats.keys():
dict_stats[size] = [stats_results]
else:
dict_stats[size] = dict_stats[size] + [stats_results]
out.write(dict_stats)
extract_stats_sizes(dict_stats, reps, plot_dir)
out.write("\t".join(stats_results[0]) + "\n" + "\t".join(stats_results[1]))
extract_results_sizes(dict_stats, reps, plot_dir)
def extract_results_sizes(dict_stats, reps):
def extract_results_sizes(dict_stats, reps, plot_dir):
"""Prepare data from a dict with all results to plot the differences between analyses with different sample sizes."""
sizes = sorted(dict_stats.keys())
nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known = [], [], [], [], [], []
......@@ -40,9 +47,9 @@ def extract_results_sizes(dict_stats, reps):
if reg_size != 'N/A':
avg_size.append(float(reg_size))
total_size.append(float(stats[4]))
nr_genes.append(float(stats[7]))
nr_census.append(float(stats[6].split(" (")[0]))
nr_known.append(float(stats[5].split(" (")[0]))
nr_genes.append(float(stats[5]))
nr_census.append(float(stats[7].split(" (")[0]))
nr_known.append(float(stats[6].split(" (")[0]))
plot_data = [nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known]
plot_y_axis = (['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
'Number of genes', 'Nr. regions with census genes', 'Nr. regions with known genes'])
......@@ -72,14 +79,14 @@ def add_significance(g, df, list_sizes):
for ax in g.axes.flat:
for i in range(1,len(list_sizes)):
x1, x2 = i-1, i
pval = test_significance(x1, x2, df, cnv_type)
pval = test_significance(x1, x2, df, cnv_type, list_sizes)
if pval < .05:
y, h, col = df['value_y_axis'].max() + df['value_y_axis'].max() * 0.035, df['value_y_axis'].max() * 0.035, 'k'
ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
ax.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color=col)
cnv_type = "Deletions"
def test_significance(x1, x2, df, cnv_type):
def test_significance(x1, x2, df, cnv_type, list_sizes):
"""Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
Returns P-value."""
only_type = df[df['type']==cnv_type]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment