ReportSizes.py 5.4 KB
Newer Older
1
2
3
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
4
plt.switch_backend('agg')
5
6
7
8
9
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
10
from ParseResults import parse_regions, get_stats
11
12
from collections import OrderedDict

13
def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes, known_genes, reps, sizes, ref_genome, report_file, plot_dir):
14
    """Make a report of the results produced using input files with different sample sizes."""
15
    list_stats = []
Beatrice Tan's avatar
Beatrice Tan committed
16
17
18
19
20
21
22
    all_results = {}
    for i in range(len(size_rubic_gains)):      #loop through sizes
        size_rep = size_rubic_gains[i].split("Size")[1].split("/gains")[0]
        size, repetition = size_rep.split(".Rep")
        for tool in 'GISTIC', 'RUBIC':
            size_file = (size_rubic_gains[i], size_rubic_losses[i]) if tool == 'RUBIC' else size_gistic[i]
            parsed_results = parse_regions(size_file, known_genes, census_genes, tool)
Beatrice Tan's avatar
Beatrice Tan committed
23
24
25
26
27
            #if tool not in all_results.keys():
            #    all_results[tool] = {}
            #    all_results[tool][size] = [parsed_results]
            #else:
            #    all_results[tool][size] = all_results[tool][size] + [parsed_results]
Beatrice Tan's avatar
Beatrice Tan committed
28
29
30
31
32
33
34
35
            stats_results = get_stats(parsed_results, size)
            for stat_list in stats_results[0], stats_results[1]:
                converted_stats = [tool] + stat_list[0:2]
                for stat in stat_list[2:5]:
                    converted_stats.append(float(stat))
                for stat in stat_list[5:]:
                    converted_stats.append(float(stat.split(" (")[0]))
                list_stats.append(converted_stats)
Beatrice Tan's avatar
Beatrice Tan committed
36
    #overlap_genes(all_results, report_file)
37
    make_plots(list_stats, reps, sizes, plot_dir)
38

Beatrice Tan's avatar
Beatrice Tan committed
39
def overlap_genes(all_results, report_file):
40
41
42
43
44
45
46
47
    known_genes = {}
    with open(report_file, 'w') as out:
        for tool in all_results.keys():
            out.write(tool + '\n')
            for size in all_results[tool].keys():
                out.write(size + '\n')
                for cnv_type in size:
                    out.write(cnv_type[5])
Beatrice Tan's avatar
Beatrice Tan committed
48
49


50
def make_plots(list_stats, reps, sizes, plot_dir):
51
    plot_y_axis = (['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
52
53
54
55
56
57
                   'Number of genes', 'Nr. regions with known genes', 'Nr. regions with census genes'])
    df_stats = pd.DataFrame(list_stats, columns=['Tool', 'Sample size', 'Type'] + plot_y_axis)
    for plot_name in plot_y_axis:
        plot_size_differences(df_stats, plot_name, sizes, len(reps), plot_dir)
        for tool in 'GISTIC', 'RUBIC':
            plot_size_differences_per_tool(df_stats, plot_name, sizes, len(reps), plot_dir, tool)
58

59
def plot_size_differences(df_stats, value_y_axis, list_sizes, nr_reps, plot_dir):
60
61
    """Plot the differences between analyses using different sample sizes."""
    sns.set_style("whitegrid")
62
    g = sns.factorplot(x="Sample size", y=value_y_axis, col="Type", hue="Tool", data=df_stats, kind="box",
Beatrice Tan's avatar
Beatrice Tan committed
63
                       size=5, aspect=1, palette=["#5975A4","#5F9E6E"])
64
65
66
    g.set_axis_labels("Sample size", value_y_axis).set_titles("{col_name}").despine(bottom=True)
    #add_significance(g, df_stats, list_sizes, value_y_axis)
    png_file = os.path.join(plot_dir, value_y_axis.replace(" ", "_") + ".png")
67
    plt.savefig(png_file, dpi=300)
68
    plt.close()
69

70
71
72
73
74
75
76
77
78
79
80
81
82
def plot_size_differences_per_tool(df_stats, value_y_axis, list_sizes, nr_reps, plot_dir, tool):
    """Plot the differences between analyses using different sample sizes."""
    sns.set_style("whitegrid")
    df_tool = df_stats[df_stats['Tool']==tool]
    g = sns.factorplot(x="Sample size", y=value_y_axis, col="Type", data=df_tool, kind="box",
                       size=5, aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
    g.set_axis_labels("Sample size", value_y_axis).set_titles("{col_name}").despine(bottom=True)
    #add_significance(g, df_stats, list_sizes, value_y_axis)
    png_file = os.path.join(plot_dir, value_y_axis.replace(" ", "_") + "_" + tool + ".png")
    plt.savefig(png_file, dpi=300)
    plt.close()

def add_significance(plot, df, list_sizes, value_y_axis):
83
84
    """Add significance to plot with differences between sample sizes."""
    cnv_type = "Amplifications"
85
    for ax in plot.axes.flat:
86
87
        for i in range(1,len(list_sizes)):
            x1, x2 = i-1, i
88
            pval = test_significance(x1, x2, df, cnv_type, list_sizes, value_y_axis)
89
            if pval < .05:
90
91
92
93
                y = df[value_y_axis].max() + df[value_y_axis].max() * 0.035
                h = df[value_y_axis].max() * 0.035
                ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c='k')
                ax.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color='k')
94
            cnv_type = "Deletions"
95

96
def test_significance(x1, x2, df, cnv_type, list_sizes, value_y_axis):
97
98
    """Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
    Returns P-value."""
99
100
101
102
    only_tool = df[df['Type']==cnv_type]
    #only_tool = only_type[only_type['Tool']=='GISTIC']
    values_x1 = only_tool[only_tool['Sample size']==list_sizes[x1]]
    values_x2 = only_tool[only_tool['Sample size']==list_sizes[x2]]
103
    #signficance = ttest_ind(values_x1['value_y_axis'], values_x2['value_y_axis'])
104
    significance = mannwhitneyu(values_x1[value_y_axis], values_x2[value_y_axis])
105
    return significance[1]