ReportTools.py 8.08 KB
Newer Older
1
2
3
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
4
from matplotlib_venn import venn2, venn3
5
6
import pandas as pd
import seaborn as sns
BeatriceTan's avatar
BeatriceTan committed
7
from scipy.stats import ttest_ind, mannwhitneyu, pearsonr
8
import os.path
9
from ParseResults import parse_regions, get_stats, parse_bed_overlap, parse_gene_file
10
from ensemblQueries import gene_name_to_ID
11
from collections import OrderedDict
BeatriceTan's avatar
BeatriceTan committed
12
import math
13

BeatriceTan's avatar
BeatriceTan committed
14
def make_report(gistic_results, rubic_gain_results, rubic_loss_results,
15
                  census_genes, known_genes, ref_genome, biomart_file,
BeatriceTan's avatar
BeatriceTan committed
16
                  file_tools, file_regions, file_venn, file_swarmplot,
BeatriceTan's avatar
BeatriceTan committed
17
                  file_genes_both, file_genes_GISTIC, file_genes_RUBIC,
18
                  bed_overlap, bed_known_genes):
19
    """Make a report and gene file from the analyses with GISTIC2 and RUBIC"""
20
21
22
    parsed, stats = [], []
    for tool in 'GISTIC', 'RUBIC':
        results = gistic_results if tool == 'GISTIC' else [rubic_gain_results, rubic_loss_results]
23
        parsed_results = parse_regions(results, known_genes, census_genes, tool)
24
        parsed.append(parsed_results)
25
26
        stats.append(get_stats(parsed_results, tool))
    stats.append(parse_bed_overlap(bed_overlap, known_genes, census_genes))
Beatrice Tan's avatar
Beatrice Tan committed
27
28
    table_all_regions(parsed, file_regions)                                 #make table with all regions
    make_tool_report(file_tools, stats)                                     #make tool report
29
    all_genes = make_gene_files(parsed, file_genes_GISTIC, file_genes_RUBIC, file_genes_both, known_genes, file_venn)    #make gene files and venn overlap plot
Beatrice Tan's avatar
Beatrice Tan committed
30
31
    plot_histogram_sizes(parsed, file_swarmplot)                            #plot histogram with the sizes of the regions
    make_bed_known_regions(known_genes, biomart_file, bed_known_genes)      #Make bed file with the locations of known genes.
32

33
34
def make_tool_report(file_tools, stats_tools):
    """Make a report of the results from GISTIC and RUBIC."""
35
    row_names = ["Tool", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Nr. regions with known genes", "Nr. regions with census genes"]
36
37
    with open(file_tools, 'w') as out:
        for i in range(len(row_names)):
38
39
40
            list_out = [row_names[i]]
            for j in range(len(stats_tools)):
                list_out += [stats_tools[j][0][i], stats_tools[j][1][i]]
41
42
            list_out = list(map(str, list_out))
            out.write("\t".join(list_out) + "\n")
43

44
def table_all_regions(parsed_regions, file_regions):
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
    """Make a table with all recurent regions."""
    with open(file_regions, 'w') as out:
        header = ["Method", "Chr", "Start", "End", "Type", "Negative log10 q-value", "Known genes", "Census genes", "All genes"]
        out.write("\t".join(header) + "\n")
        tool_name = "GISTIC2"
        for tool in parsed_regions:
            for cnv in tool:
                out.write(tool_name + "\t")
                cnv_type = "Amplification" if cnv[3] == "amp" else "Deletion"
                q_vals = ", ".join(cnv[4]) if type(cnv[4]) == tuple else cnv[4]
                begin_row = cnv[0:3] + [cnv_type, str(q_vals)]
                out.write("\t".join(begin_row))
                for list_genes in range(5,8):
                    out.write("\t" + ", ".join(cnv[list_genes]))
                out.write("\n")
            tool_name = "RUBIC"
61

62
def make_gene_files(parsed_results, out_GISTIC, out_RUBIC, out_both, known_genes, out_venn):
63
64
65
66
67
68
69
70
71
72
73
    """Make files with all genes reported by GISTIC2 and RUBIC."""
    genes_all = []
    for tool_results in parsed_results:
        genes_tool = []
        for cnv in tool_results:
            genes = cnv[7]
            genes_tool += genes
        genes_tool = list(set(genes_tool))  #get unique genes
        genes_all.append(genes_tool)
        out_file = out_GISTIC if tool_results == parsed_results[0] else out_RUBIC
        with open(out_file, 'w') as out:
74
75
            all_IDs = gene_name_to_ID(genes_tool)
            out.write("\n".join(all_IDs))
76
77
    make_gene_file_overlap(genes_all, out_both)
    venn3_overlap(genes_all, known_genes, out_venn)
78

79
80
def make_gene_file_overlap(gene_lists, out_both):
    """Make file with all genes reported by both tools."""
81
    overlap = set(gene_lists[0]).intersection(set(gene_lists[1]))
82
    all_IDs = gene_name_to_ID(list(overlap))
83
    with open(out_both, 'w') as overlap_file:
84
        overlap_file.write("\n".join(all_IDs))
85

86
87
88
def venn3_overlap(gene_lists, known_genes, out_venn):
    known = parse_gene_file(known_genes, np.inf)
    plt.subplots(figsize=(7, 7))
Beatrice Tan's avatar
Beatrice Tan committed
89
90
    overlap_tools = set(gene_lists[0]).intersection(set(gene_lists[1]))
    print(overlap_tools.intersection(set(known)))
91
92
93
94
95
    venn_sets = [set(gene_lists[0]), set(gene_lists[1]), set(known)]
    c = venn3(venn_sets, ('GISTIC2', 'RUBIC', '                 Known genes'))
    c.get_patch_by_id('100').set_color('#5975A4'), c.get_patch_by_id('010').set_color('#5F9E6E')
    c.get_patch_by_id('001').set_color('#857AAA'), c.get_patch_by_id('011').set_color('#748A8F')
    c.get_patch_by_id('101').set_color('#6D77A7'), c.get_patch_by_id('110').set_color('#5C888B')
96
97
98
99
100
    try:
        c.get_patch_by_id('111').set_color('#B55D60')
        c.get_patch_by_id('111').set_alpha(1)
    except:
        pass #too little overlap from three groups to show
101
102
103
104
    c.get_patch_by_id('100').set_alpha(1), c.get_patch_by_id('010').set_alpha(1)
    c.get_patch_by_id('001').set_alpha(1), c.get_patch_by_id('011').set_alpha(1)
    c.get_patch_by_id('101').set_alpha(1), c.get_patch_by_id('110').set_alpha(1)
    plt.savefig(out_venn, dpi=300)
105

106
def plot_histogram_sizes(parsed_results, plot_file):
107
    """Make boxplot with swarmplot on top showing sizes of recurrent regions detected by GISTIC2.0 and RUBIC."""
108
    size_list, size_list_known = parse_sizes_for_plot(parsed_results)
109
110
111
    sns.set_style("white")
    f, ax = plt.subplots(figsize=(7, 7))
    ax.set(yscale="log")
112
113
114
    g = sns.boxplot(x="Tool", y="Size", hue="CNV type", data=size_list, whis=np.inf, palette={"Amplification": "#71AEC0", "Deletion": "#B55D60"})
    g = sns.swarmplot(x="Tool", y="Size", dodge=True, hue="CNV type", data=size_list, size=5, palette={"Amplification": "#527F8C", "Deletion": "#844446"})
    g = sns.swarmplot(x="Tool", y="Size", dodge=True, hue="CNV type", data=size_list_known, size=5, palette={"Amplification": "black", "Deletion": "black"})
115
116
117
118
119
120
121
    sns.despine()
    g.set_xlabel("Tool",fontsize=16)
    g.set_ylabel("Log size of recurrent regions",fontsize=16)
    g.tick_params(labelsize=12)
    handles, labels = g.get_legend_handles_labels()
    plt.legend(handles[0:2], labels[:2], fontsize=12)
    plt.savefig(plot_file, dpi=300)
122

123
124
125
126
def parse_sizes_for_plot(parsed_results):
    """Calculate the sizes of regions to use as input swarmplot of sizes of recurrent regions."""
    sizes, sizes_known = [], []
    tool = 'GISTIC2.0'
127
    for tool_results in parsed_results:
128
        amp_sizes, del_sizes = [], []
129
        for cnv in tool_results:
130
131
132
133
134
135
136
137
138
139
            size = int(cnv[2]) - int(cnv[1])
            cnv_label = 'Amplification' if cnv[3] == 'amp' else 'Deletion'
            sizes += [(size, tool, cnv_label)]
            if cnv[5] != []:
                sizes_known += [(size, tool, cnv_label)]
        tool = 'RUBIC'
    labels = ['Size', 'Tool', 'CNV type']
    df = pd.DataFrame(sizes, columns=labels)
    df_known = pd.DataFrame(sizes_known, columns=labels)
    return df, df_known
140

141
142
143
144
145
146
147
148
def make_bed_known_regions(known_file, biomart_file, out_bed):
    """Make bed file with location of known genes."""
    known_genes = parse_gene_file(known_file, np.inf)
    lines = location_genes(known_genes, biomart_file)
    with open(out_bed, 'w') as out:
        out.write("track name=Known genes color=0,0,255\n")
        for line in lines:
            out.write("\t".join(line) + "\n")
BeatriceTan's avatar
BeatriceTan committed
149

150
151
152
153
154
155
156
157
158
159
160
def location_genes(known_genes, biomart_file):
    outlines = []
    with open(biomart_file, 'r') as biomart:
        biomart.readline()
        for line in biomart:
            gene = line.split("\t")[1]
            if gene in known_genes:
                #if list(set(gene).intersection(set(known_genes))) != []:
                gene_ID, gene_name, chrom, start, end = line.strip().split("\t")
                outlines.append(["chr" + chrom, start, end, gene_name])
    return outlines