ReportTools.py 11.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
from ParseResults import parse, get_stats
from collections import OrderedDict

class ReportTools:
    """Make a report and gene file from the analyses with GISTIC2 and RUBIC"""
    def __init__(self, gistic_results, rubic_gain_results, rubic_loss_results,
                 census_genes, known_genes, ref_genome, biomart_file,
                 file_tools, file_regions, file_venn, file_overlap, file_swarmplot,
                 file_genes_both, file_genes_GISTIC, file_genes_RUBIC):
        install_ensembl(ref_genome)
        parsed, stats = [], []
        for tool in 'GISTIC', 'RUBIC':
            results = gistic_results if tool == 'GISTIC' else [rubic_gain_results, rubic_loss_results]
            parsed_results = parse(results, known_genes, census_genes, tool)
            stats_results = get_stats(parsed_results, tool)
            parsed.append(parsed_results)
            stats.append(stats_results)
        self.make_files(parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
                        file_venn, file_swarmplot, file_regions, biomart_file, file_overlap)

    def make_files(self, parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
                   file_venn, file_swarmplot, file_regions, biomart_file, file_overlap):
        #self.overlap_most_sign(parsed)
        self.table_regions(parsed, file_regions)
        self.make_tool_report(file_tools, stats)
        all_genes = self.make_genefiles(parsed, file_genes_GISTIC, file_genes_RUBIC)
        self.make_genefile_overlap(all_genes, file_genes_both)
        self.venn_overlap(all_genes, file_venn)
        #self.compare_known_regions(known_genes, file_overlap, biomart_file,
        #                           gistic_results, rubic_gain_results, rubic_loss_results)
        self.make_swarmplot_sizes(parsed, file_swarmplot)
        self.compare_overlapping_regions(parsed, biomart_file, file_overlap)

    def make_tool_report(self, file_tools, stats_tools):
        """Make a report of the results from GISTIC and RUBIC."""
        row_names = ["Tool", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Regions with known genes", "Regions with census genes"]
        with open(file_tools, 'w') as out:
            for i in range(len(row_names)):
                list_out = [row_names[i], stats_tools[0][0][i], stats_tools[0][1][i], stats_tools[1][0][i], stats_tools[1][1][i]]
                list_out = list(map(str, list_out))
                out.write("\t".join(list_out) + "\n")

    def table_regions(self, parsed_regions, file_regions):
        """Make a table with all recurent regions."""
        with open(file_regions, 'w') as out:
            header = ["Method", "Chr", "Start", "End", "Type", "Negative log10 q-value", "Known genes", "Census genes", "All genes"]
            out.write("\t".join(header) + "\n")
            tool_name = "GISTIC2"
            for tool in parsed_regions:
                for cnv in tool:
                    out.write(tool_name + "\t")
                    cnv_type = "Amplification" if cnv[3] == "amp" else "Deletion"
                    q_vals = ", ".join(cnv[4]) if type(cnv[4]) == tuple else cnv[4]
                    begin_row = cnv[0:3] + [cnv_type, str(q_vals)]
                    out.write("\t".join(begin_row))
                    for list_genes in range(5,8):
                        out.write("\t" + ", ".join(cnv[list_genes]))
                    out.write("\n")
                tool_name = "RUBIC"

    def make_genefiles(self, parsed_results, out_GISTIC, out_RUBIC):
        """Make files with all genes reported by GISTIC2 and RUBIC."""
        genes_all = []
        for tool_results in parsed_results:
            genes_tool = []
            for cnv in tool_results:
                genes = cnv[7]
                genes_tool += genes
            genes_tool = list(set(genes_tool))  #get unique genes
            genes_all.append(genes_tool)
            out_file = out_GISTIC if tool_results == parsed_results[0] else out_RUBIC
            with open(out_file, 'w') as out:
                out.write("\n".join(genes_tool))
        return genes_all

    def make_genefile_overlap(self, gene_lists, out_both):
        overlap = set(gene_lists[0]).intersection(set(gene_lists[1]))
        with open(out_both, 'w') as overlap_file:
            overlap_file.write("\n".join(list(overlap)))

    def make_swarmplot_sizes(self, parsed_results, out_plot):
        """Make swarmplot of the sizes of recurrent regions detected by either GISTIC2 or RUBIC."""
        sizes, sizes_zoomed = [], []
        tool = 'GISTIC2.0'
        for tool_results in parsed_results:
            amp_sizes, del_sizes = [], []
            for cnv in tool_results:
                size = int(cnv[2]) - int(cnv[1])
                cnv_label = 'Amplification' if cnv[3] == 'amp' else 'Deletion'
                sizes += [(size, tool, cnv_label)]
                if size < 900000:
                    sizes_zoomed += [(size, tool, cnv_label)]
            tool = 'RUBIC'
        labels = ['Size', 'Tool', 'CNV type']
        df = pd.DataFrame(sizes, columns=labels)
        self.make_swarmplot_log_sizes(df, out_plot)

    def make_swarmplot_log_sizes(self, size_list, plot_file):
        sns.set_style("white")
        f, ax = plt.subplots(figsize=(7, 7))
        ax.set(yscale="log")
        g = sns.boxplot(x="Tool", y="Size", hue="CNV type", data=size_list, whis=np.inf, palette="deep")
        g = sns.swarmplot(x="Tool", y="Size", dodge=True, hue="CNV type", data=size_list, size=5, palette={"Amplification": "#3f3f3f", "Deletion": "#3f3f3f"})
        sns.despine()
        g.set_xlabel("Tool",fontsize=16)
        g.set_ylabel("Log size of recurrent regions",fontsize=16)
        g.tick_params(labelsize=12)
        handles, labels = g.get_legend_handles_labels()
        plt.legend(handles[0:2], labels[:2], fontsize=12)
        plt.savefig(plot_file, dpi=300)
        plt.close()

    def venn_overlap(self, gene_lists, out_venn):   #maybe add known genes?
        """Compare the genes detected by GISTIC and those by RUBIC."""
        plt.subplots(figsize=(7, 7))
        c = venn2([set(gene_lists[0]), set(gene_lists[1])], set_labels = ('GISTIC2', 'RUBIC'))
        c.get_patch_by_id('10').set_color('#5975A4')
        c.get_patch_by_id('01').set_color('#5F9E6E')
        c.get_patch_by_id('10').set_edgecolor('#4c4c4c')
        c.get_patch_by_id('01').set_edgecolor('#4c4c4c')
        c.get_patch_by_id('10').set_alpha(1)
        c.get_patch_by_id('01').set_alpha(1)
        c.get_patch_by_id('11').set_color('#857AAA')
        c.get_patch_by_id('11').set_edgecolor('none')
        c.get_patch_by_id('11').set_alpha(1)
        plt.savefig(out_venn, dpi=300)
        plt.close()

    def compare_overlapping_regions(self, parsed_results, biomart_file, overlap_file):
        """Compare the regions containing known driver genes"""
        location_dict = {}
        tool = "GISTIC"
        for tool_results in parsed_results:
            for cnv in tool_results:
                known_genes = cnv[5]
                if len(known_genes) != 0:
                    for gene in known_genes:
                        location_gene, gene_name = self.get_location_gene(biomart_file, gene)
                        if location_gene != ['N/A', 'N/A', 'N/A']:
                            if tool == "GISTIC":    #first loop
                                location_gistic = cnv[0:3]
                                location_rubic = ['N/A', 'N/A', 'N/A']
                                for rubic_cnv in parsed_results[1]:
                                    if int(rubic_cnv[0]) == int(location_gene[0]) and int(rubic_cnv[1]) <= int(location_gene[2]) and int(rubic_cnv[2]) >= int(location_gene[1]):   #overlap cnv and gene
                                        location_rubic = rubic_cnv[0:3]
                            else:                   #second loop
                                location_rubic = cnv[0:3]
                                location_gistic = ['N/A', 'N/A', 'N/A']
                                for gistic_cnv in parsed_results[0]:
                                    if int(gistic_cnv[0]) == int(location_gene[0]) and int(gistic_cnv[1]) <= int(location_gene[2]) and int(gistic_cnv[2]) >= int(location_gene[1]):   #overlap cnv and gene
                                        location_gistic = gistic_cnv[0:3]
                            if gene not in location_dict:
                                location_dict[gene] = gene_name, location_gene, location_gistic, location_rubic
                            #else:
                                #print(location_dict[gene])
                                #print(gene_name, location_gene, location_gistic, location_rubic)
            tool = "RUBIC"
        with open(overlap_file, 'w') as out:
            for gene in location_dict.keys():
                gene_name, location_gene, location_gistic, location_rubic = location_dict[gene]
                out_line = [gene_name, location_gene[0] + ":" + "-".join(location_gene[1:3]), location_gistic[0] + ":" + "-".join(location_gistic[1:3]), location_rubic[0] + ":" + "-".join(location_rubic[1:3])]
                out.write("\t".join(out_line) + "\n")

    def get_location_gene(self, biomart_file, gene_of_interest):
        """Get location of a gene from a biomart gene file"""
        location = ['N/A', 'N/A', 'N/A']
        gene_name = 'N/A'
        with open(biomart_file, 'r') as genes:
            genes.readline()
            for line in genes:
                info = line.strip().split("\t")
                if info[0] == gene_of_interest:
                    location = [info[2], info[3], info[4]]
                    gene_name = info[1]
                    #try:
                    #    location = [int(i) for i in location]
                    #    break
                    #except:
                    #    continue
        return location, gene_name

    def get_location_gistic(self, gistic_results, location_gene):       #more regions overlap: extract one with longest overlap
        """Get location of the CNV detected by GISTIC2 harboring the gene of interest"""
        loc = ['N/A', 'N/A', 'N/A']
        chrom_gene, start_gene, end_gene = location_gene
        with open(os.path.join(gistic_results, "all_lesions.conf_75.txt"), 'r') as cnvs:
            cnvs.readline()
            for cnv in cnvs:
                cnv_loc = cnv.split("\t")[3]
                cnv_loc_stripped = cnv_loc.split("(probes")[0]
                chrom, bp = cnv_loc_stripped.split(":")
                chrom = chrom.strip("chr")
                if chrom == chrom_gene:
                    start, end = bp.split("-")
                    if int(start) <= int(end_gene) and int(end) >= int(start_gene):
                        if loc == ['N/A', 'N/A', 'N/A']:
                            loc = [chrom, start, end]
                        #else:
                            #loc = loc, chrom, start, end
        return loc

    def get_location_rubic(self, rubic_results, gene):
        loc = ['N/A', 'N/A', 'N/A']
        for file_type in "gains.txt", "losses.txt":
            with open(os.path.join(rubic_results, file_type), 'r') as cnvs:
                cnvs.readline()
                for cnv in cnvs:
                    if gene in cnv.strip().split("\t")[5]:
                        chrom, start, end, pleft, pright, genes = cnv.strip().split("\t")
                        if loc == ['N/A', 'N/A', 'N/A']:
                            loc = [chrom, start, end]
                        #else:
                            #loc = loc, chrom, start, end
        return loc