ReportTools.py 10.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
import os.path
import pyensembl
10
from ParseResults import parse, get_stats, install_ensembl
11
12
from collections import OrderedDict

13
14
15
16
class make_report(gistic_results, rubic_gain_results, rubic_loss_results,
                  census_genes, known_genes, ref_genome, biomart_file,
                  file_tools, file_regions, file_venn, file_overlap, file_swarmplot,
                  file_genes_both, file_genes_GISTIC, file_genes_RUBIC):
17
    """Make a report and gene file from the analyses with GISTIC2 and RUBIC"""
18
19
20
21
22
23
24
25
26
27
    install_ensembl(ref_genome)
    parsed, stats = [], []
    for tool in 'GISTIC', 'RUBIC':
        results = gistic_results if tool == 'GISTIC' else [rubic_gain_results, rubic_loss_results]
        parsed_results = parse(results, known_genes, census_genes, tool)
        stats_results = get_stats(parsed_results, tool)
        parsed.append(parsed_results)
        stats.append(stats_results)
    make_files(parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
               file_venn, file_swarmplot, file_regions, biomart_file, file_overlap)
28

29
30
31
32
33
34
35
36
37
38
39
40
def make_files(parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
               file_venn, file_swarmplot, file_regions, biomart_file, file_overlap):
    #overlap_most_sign(parsed)
    table_regions(parsed, file_regions)
    make_tool_report(file_tools, stats)
    all_genes = make_genefiles(parsed, file_genes_GISTIC, file_genes_RUBIC)
    make_genefile_overlap(all_genes, file_genes_both)
    venn_overlap(all_genes, file_venn)
    #compare_known_regions(known_genes, file_overlap, biomart_file,
    #                      gistic_results, rubic_gain_results, rubic_loss_results)
    plot_sizes(parsed, file_swarmplot)
    compare_overlapping_regions(parsed, biomart_file, file_overlap)
41

42
43
44
45
46
47
48
49
def make_tool_report(file_tools, stats_tools):
    """Make a report of the results from GISTIC and RUBIC."""
    row_names = ["Tool", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Regions with known genes", "Regions with census genes"]
    with open(file_tools, 'w') as out:
        for i in range(len(row_names)):
            list_out = [row_names[i], stats_tools[0][0][i], stats_tools[0][1][i], stats_tools[1][0][i], stats_tools[1][1][i]]
            list_out = list(map(str, list_out))
            out.write("\t".join(list_out) + "\n")
50

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def table_regions(parsed_regions, file_regions):
    """Make a table with all recurent regions."""
    with open(file_regions, 'w') as out:
        header = ["Method", "Chr", "Start", "End", "Type", "Negative log10 q-value", "Known genes", "Census genes", "All genes"]
        out.write("\t".join(header) + "\n")
        tool_name = "GISTIC2"
        for tool in parsed_regions:
            for cnv in tool:
                out.write(tool_name + "\t")
                cnv_type = "Amplification" if cnv[3] == "amp" else "Deletion"
                q_vals = ", ".join(cnv[4]) if type(cnv[4]) == tuple else cnv[4]
                begin_row = cnv[0:3] + [cnv_type, str(q_vals)]
                out.write("\t".join(begin_row))
                for list_genes in range(5,8):
                    out.write("\t" + ", ".join(cnv[list_genes]))
                out.write("\n")
            tool_name = "RUBIC"
68

69
70
71
72
73
74
75
76
77
78
79
80
81
82
def make_genefiles(parsed_results, out_GISTIC, out_RUBIC):
    """Make files with all genes reported by GISTIC2 and RUBIC."""
    genes_all = []
    for tool_results in parsed_results:
        genes_tool = []
        for cnv in tool_results:
            genes = cnv[7]
            genes_tool += genes
        genes_tool = list(set(genes_tool))  #get unique genes
        genes_all.append(genes_tool)
        out_file = out_GISTIC if tool_results == parsed_results[0] else out_RUBIC
        with open(out_file, 'w') as out:
            out.write("\n".join(genes_tool))
    return genes_all
83

84
85
86
87
def make_genefile_overlap(gene_lists, out_both):
    overlap = set(gene_lists[0]).intersection(set(gene_lists[1]))
    with open(out_both, 'w') as overlap_file:
        overlap_file.write("\n".join(list(overlap)))
88

89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
def parse_sizes(parsed_results):
    """Calculate the sizes of regions to use as input swarmplot of sizes of recurrent regions."""
    sizes, sizes_zoomed = [], []
    tool = 'GISTIC2.0'
    for tool_results in parsed_results:
        amp_sizes, del_sizes = [], []
        for cnv in tool_results:
            size = int(cnv[2]) - int(cnv[1])
            cnv_label = 'Amplification' if cnv[3] == 'amp' else 'Deletion'
            sizes += [(size, tool, cnv_label)]
            if size < 900000:
                sizes_zoomed += [(size, tool, cnv_label)]
        tool = 'RUBIC'
    labels = ['Size', 'Tool', 'CNV type']
    df = pd.DataFrame(sizes, columns=labels)
    return df
105

106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def plot_sizes(parsed_results, plot_file):
    """Make boxplot with swarmplot on top showing sizes of recurrent regions detected by GISTIC2.0 and RUBIC."""
    size_list = parse_sizes(parsed_results)
    sns.set_style("white")
    f, ax = plt.subplots(figsize=(7, 7))
    ax.set(yscale="log")
    g = sns.boxplot(x="Tool", y="Size", hue="CNV type", data=size_list, whis=np.inf, palette="deep")
    g = sns.swarmplot(x="Tool", y="Size", dodge=True, hue="CNV type", data=size_list, size=5, palette={"Amplification": "#3f3f3f", "Deletion": "#3f3f3f"})
    sns.despine()
    g.set_xlabel("Tool",fontsize=16)
    g.set_ylabel("Log size of recurrent regions",fontsize=16)
    g.tick_params(labelsize=12)
    handles, labels = g.get_legend_handles_labels()
    plt.legend(handles[0:2], labels[:2], fontsize=12)
    plt.savefig(plot_file, dpi=300)
    plt.close()
122

123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def venn_overlap(gene_lists, out_venn):   #maybe add known genes?
    """Compare the genes detected by GISTIC and those by RUBIC."""
    plt.subplots(figsize=(7, 7))
    c = venn2([set(gene_lists[0]), set(gene_lists[1])], set_labels = ('GISTIC2', 'RUBIC'))
    c.get_patch_by_id('10').set_color('#5975A4')
    c.get_patch_by_id('01').set_color('#5F9E6E')
    c.get_patch_by_id('10').set_edgecolor('#4c4c4c')
    c.get_patch_by_id('01').set_edgecolor('#4c4c4c')
    c.get_patch_by_id('10').set_alpha(1)
    c.get_patch_by_id('01').set_alpha(1)
    c.get_patch_by_id('11').set_color('#857AAA')
    c.get_patch_by_id('11').set_edgecolor('none')
    c.get_patch_by_id('11').set_alpha(1)
    plt.savefig(out_venn, dpi=300)
    plt.close()
138

139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def compare_overlapping_regions(parsed_results, biomart_file, overlap_file):
    """Compare the regions containing known driver genes"""
    location_dict = {}
    tool = "GISTIC"
    for tool_results in parsed_results:
        for cnv in tool_results:
            known_genes = cnv[5]
            if len(known_genes) != 0:
                for gene in known_genes:
                    location_gene, gene_name = get_location_gene(biomart_file, gene)
                    if location_gene != ['N/A', 'N/A', 'N/A']:
                        if tool == "GISTIC":    #first loop
                            location_gistic = cnv[0:3]
                            location_rubic = ['N/A', 'N/A', 'N/A']
                            for rubic_cnv in parsed_results[1]:
                                if int(rubic_cnv[0]) == int(location_gene[0]) and int(rubic_cnv[1]) <= int(location_gene[2]) and int(rubic_cnv[2]) >= int(location_gene[1]):   #overlap cnv and gene
                                    location_rubic = rubic_cnv[0:3]
                        else:                   #second loop
                            location_rubic = cnv[0:3]
                            location_gistic = ['N/A', 'N/A', 'N/A']
                            for gistic_cnv in parsed_results[0]:
                                if int(gistic_cnv[0]) == int(location_gene[0]) and int(gistic_cnv[1]) <= int(location_gene[2]) and int(gistic_cnv[2]) >= int(location_gene[1]):   #overlap cnv and gene
                                    location_gistic = gistic_cnv[0:3]
                        if gene not in location_dict:
                            location_dict[gene] = gene_name, location_gene, location_gistic, location_rubic
                        #else:
                            #print(location_dict[gene])
                            #print(gene_name, location_gene, location_gistic, location_rubic)
        tool = "RUBIC"
    with open(overlap_file, 'w') as out:
        for gene in location_dict.keys():
            gene_name, location_gene, location_gistic, location_rubic = location_dict[gene]
            out_line = [gene_name, location_gene[0] + ":" + "-".join(location_gene[1:3]), location_gistic[0] + ":" + "-".join(location_gistic[1:3]), location_rubic[0] + ":" + "-".join(location_rubic[1:3])]
            out.write("\t".join(out_line) + "\n")

def get_location_gene(biomart_file, gene_of_interest):
    """Get location of a gene from a biomart gene file"""
    location = ['N/A', 'N/A', 'N/A']
    gene_name = 'N/A'
    with open(biomart_file, 'r') as genes:
        genes.readline()
        for line in genes:
            info = line.strip().split("\t")
            if info[0] == gene_of_interest:
                location = [info[2], info[3], info[4]]
                gene_name = info[1]
                #try:
                #    location = [int(i) for i in location]
                #    break
                #except:
                #    continue
    return location, gene_name
191

192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def get_location_gistic(gistic_results, location_gene):       #more regions overlap: extract one with longest overlap
    """Get location of the CNV detected by GISTIC2 harboring the gene of interest"""
    loc = ['N/A', 'N/A', 'N/A']
    chrom_gene, start_gene, end_gene = location_gene
    with open(os.path.join(gistic_results, "all_lesions.conf_75.txt"), 'r') as cnvs:
        cnvs.readline()
        for cnv in cnvs:
            cnv_loc = cnv.split("\t")[3]
            cnv_loc_stripped = cnv_loc.split("(probes")[0]
            chrom, bp = cnv_loc_stripped.split(":")
            chrom = chrom.strip("chr")
            if chrom == chrom_gene:
                start, end = bp.split("-")
                if int(start) <= int(end_gene) and int(end) >= int(start_gene):
                    if loc == ['N/A', 'N/A', 'N/A']:
                        loc = [chrom, start, end]
                    #else:
                        #loc = loc, chrom, start, end
    return loc
211

212
213
214
215
def get_location_rubic(rubic_results, gene):
    loc = ['N/A', 'N/A', 'N/A']
    for file_type in "gains.txt", "losses.txt":
        with open(os.path.join(rubic_results, file_type), 'r') as cnvs:
216
217
            cnvs.readline()
            for cnv in cnvs:
218
219
220
221
222
223
224
                if gene in cnv.strip().split("\t")[5]:
                    chrom, start, end, pleft, pright, genes = cnv.strip().split("\t")
                    if loc == ['N/A', 'N/A', 'N/A']:
                        loc = [chrom, start, end]
                    #else:
                        #loc = loc, chrom, start, end
    return loc