ReportSizes.py 5.79 KB
Newer Older
1
2
3
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
4
plt.switch_backend('agg')
5
6
7
8
9
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
10
from ParseResults import parse_regions, get_stats
11
12
from collections import OrderedDict

13
def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes, known_genes, reps, sizes, ref_genome, report_file, plot_dir):
14
    """Make a report of the results produced using input files with different sample sizes."""
15
    list_stats = []
Beatrice Tan's avatar
Beatrice Tan committed
16
17
18
19
20
21
    all_results = {}
    for i in range(len(size_rubic_gains)):      #loop through sizes
        size_rep = size_rubic_gains[i].split("Size")[1].split("/gains")[0]
        size, repetition = size_rep.split(".Rep")
        for tool in 'GISTIC', 'RUBIC':
            size_file = (size_rubic_gains[i], size_rubic_losses[i]) if tool == 'RUBIC' else size_gistic[i]
22
            parsed_results = parse_regions(size_file, known_genes, census_genes, tool, ref_genome)
Beatrice Tan's avatar
Beatrice Tan committed
23
24
25
26
27
28
29
30
            stats_results = get_stats(parsed_results, size)
            for stat_list in stats_results[0], stats_results[1]:
                converted_stats = [tool] + stat_list[0:2]
                for stat in stat_list[2:5]:
                    converted_stats.append(float(stat))
                for stat in stat_list[5:]:
                    converted_stats.append(float(stat.split(" (")[0]))
                list_stats.append(converted_stats)
31
    overlap_genes(all_results, report_file)
32
    make_plots(list_stats, reps, sizes, plot_dir)
33

Beatrice Tan's avatar
Beatrice Tan committed
34
def overlap_genes(all_results, report_file):
35
    with open(report_file, 'w') as out:
36
37
38
39
40
41
42
43
44
        out.write("\n".join(list(all_results.keys())))
#    known_genes = {}
#    with open(report_file, 'w') as out:
#        for tool in all_results.keys():
#            out.write(tool + '\n')
#            for size in all_results[tool].keys():
#                out.write(size + '\n')
#                for cnv_type in size:
#                    out.write(cnv_type[5])
Beatrice Tan's avatar
Beatrice Tan committed
45
46


47
def make_plots(list_stats, reps, sizes, plot_dir):
Beatrice Tan's avatar
Beatrice Tan committed
48
49
50
51
    plot_y_axis = (["No. regions", "Avg. size (Kb)", "Total size (Mb)", "No. genes", "Average driver density", "No. focal regions", "No. known regions",
                    "No. census regions", "No. known focal regions"])
                # 'Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
                #   'Number of genes', 'Number of known regions', 'Number of census regions'])
52
53
54
55
56
    df_stats = pd.DataFrame(list_stats, columns=['Tool', 'Sample size', 'Type'] + plot_y_axis)
    for plot_name in plot_y_axis:
        plot_size_differences(df_stats, plot_name, sizes, len(reps), plot_dir)
        for tool in 'GISTIC', 'RUBIC':
            plot_size_differences_per_tool(df_stats, plot_name, sizes, len(reps), plot_dir, tool)
57

58
def plot_size_differences(df_stats, value_y_axis, list_sizes, nr_reps, plot_dir):
59
60
    """Plot the differences between analyses using different sample sizes."""
    sns.set_style("whitegrid")
61
    g = sns.factorplot(x="Sample size", y=value_y_axis, col="Type", hue="Tool", data=df_stats, kind="box",
62
                       size=5, aspect=1, palette=["#5975A4","#5F9E6E"], order=list(map(str, list_sizes)))
Beatrice Tan's avatar
Beatrice Tan committed
63
    if value_y_axis in ["Avg. size (Kb)", "Total size (Mb)", "No. genes"]:
64
65
66
67
68
        label_y_axis = 'Log ' + value_y_axis[0].lower() + value_y_axis[1:]
        g.fig.get_axes()[0].set_yscale('log')
    else:
        label_y_axis = value_y_axis
    g.set_axis_labels("Sample size", label_y_axis).set_titles("{col_name}").despine(bottom=True)
69
    #add_significance(g, df_stats, list_sizes, value_y_axis)
70
    png_file = os.path.join(plot_dir, value_y_axis.split(" (")[0].replace(" ", "_") + ".png")
71
    plt.savefig(png_file, dpi=300)
72
    plt.close()
73

74
75
76
77
def plot_size_differences_per_tool(df_stats, value_y_axis, list_sizes, nr_reps, plot_dir, tool):
    """Plot the differences between analyses using different sample sizes."""
    sns.set_style("whitegrid")
    df_tool = df_stats[df_stats['Tool']==tool]
78
79
    g = sns.factorplot(x="Sample size", y=value_y_axis, col="Type", data=df_tool, kind="box", order=list(map(str, list_sizes)),
                       size=5, aspect=1, palette=sns.cubehelix_palette(10, start=.5, rot=-.75, dark=.2))
80
81
82
83
84
85
86
    g.set_axis_labels("Sample size", value_y_axis).set_titles("{col_name}").despine(bottom=True)
    #add_significance(g, df_stats, list_sizes, value_y_axis)
    png_file = os.path.join(plot_dir, value_y_axis.replace(" ", "_") + "_" + tool + ".png")
    plt.savefig(png_file, dpi=300)
    plt.close()

def add_significance(plot, df, list_sizes, value_y_axis):
87
88
    """Add significance to plot with differences between sample sizes."""
    cnv_type = "Amplifications"
89
    for ax in plot.axes.flat:
90
91
        for i in range(1,len(list_sizes)):
            x1, x2 = i-1, i
92
            pval = test_significance(x1, x2, df, cnv_type, list_sizes, value_y_axis)
93
            if pval < .05:
94
95
96
97
                y = df[value_y_axis].max() + df[value_y_axis].max() * 0.035
                h = df[value_y_axis].max() * 0.035
                ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c='k')
                ax.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color='k')
98
            cnv_type = "Deletions"
99

100
def test_significance(x1, x2, df, cnv_type, list_sizes, value_y_axis):
101
102
    """Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
    Returns P-value."""
103
104
105
106
    only_tool = df[df['Type']==cnv_type]
    #only_tool = only_type[only_type['Tool']=='GISTIC']
    values_x1 = only_tool[only_tool['Sample size']==list_sizes[x1]]
    values_x2 = only_tool[only_tool['Sample size']==list_sizes[x2]]
107
    #signficance = ttest_ind(values_x1['value_y_axis'], values_x2['value_y_axis'])
108
    significance = mannwhitneyu(values_x1[value_y_axis], values_x2[value_y_axis])
109
    return significance[1]