ReportTools.py 9.87 KB
Newer Older
1
2
3
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
4
from matplotlib_venn import venn3
5
6
import pandas as pd
import seaborn as sns
BeatriceTan's avatar
BeatriceTan committed
7
from scipy.stats import ttest_ind, mannwhitneyu, pearsonr
8
import os.path
BeatriceTan's avatar
BeatriceTan committed
9
import math
10

11
12
13
from ParseResults import parse_regions, get_stats, parse_gene_file
from ensemblQueries import gene_name_to_ID, gene_ID_to_name

BeatriceTan's avatar
BeatriceTan committed
14
def make_report(gistic_results, rubic_gain_results, rubic_loss_results,
15
                  census_genes, known_genes, ref_genome, biomart_file,
BeatriceTan's avatar
BeatriceTan committed
16
                  file_tools, file_regions, file_venn, file_swarmplot,
BeatriceTan's avatar
BeatriceTan committed
17
                  file_genes_both, file_genes_GISTIC, file_genes_RUBIC,
18
19
                  bed_overlap):
    """Make a report, tables, figures and gene files from the analyses with GISTIC2 and RUBIC."""
20
    parsed, stats = [], []
21
22
23
24
25
26
27
28
    for tool in 'GISTIC', 'RUBIC', 'Overlap':
        if tool == 'GISTIC':
            results = gistic_results
        elif tool == 'RUBIC':
            results = [rubic_gain_results, rubic_loss_results]
        else:
            results = bed_overlap
        parsed_results = parse_regions(results, known_genes, census_genes, tool, ref_genome)
29
        parsed.append(parsed_results)
30
31
        stats_results = get_stats(parsed_results, tool)
        stats.append(stats_results)
Beatrice Tan's avatar
Beatrice Tan committed
32
    make_table_regions(parsed[0:2], file_regions, ref_genome)                                                                    #make table with all recurrent regions
33
34
35
    make_tool_report(file_tools, stats)                                                                         #make tool report
    extract_gene_lists(parsed, file_genes_GISTIC, file_genes_RUBIC, file_genes_both, known_genes, file_venn, ref_genome)    #make gene files and venn overlap plot
    plot_histogram_sizes(parsed, file_swarmplot)                                                                #plot histogram with the sizes of the regions
36

37
38
def make_tool_report(file_tools, stats_tools):
    """Make a report of the results from GISTIC and RUBIC."""
Beatrice Tan's avatar
Beatrice Tan committed
39
40
    row_names = ["Tool", "Type", "No. regions", "Avg. size (Kb)", "Total size (Mb)", "No. genes", "Average driver density", "No. focal regions", "No. known regions",
                 "No. census regions", "No. known focal regions"]
41
42
    with open(file_tools, 'w') as out:
        for i in range(len(row_names)):
43
44
45
            list_out = [row_names[i]]
            for j in range(len(stats_tools)):
                list_out += [stats_tools[j][0][i], stats_tools[j][1][i]]
46
47
            list_out = list(map(str, list_out))
            out.write("\t".join(list_out) + "\n")
48

Beatrice Tan's avatar
Beatrice Tan committed
49
def make_table_regions(parsed_tools, file_regions, ref_genome):
50
    """Make a table with all recurent regions detected by both tools."""
51
    with open(file_regions, 'w') as out:
Beatrice Tan's avatar
Beatrice Tan committed
52
        header = ["Method", "Chr", "Start", "End", "Type", "Negative log10 q-value", "Known genes", "Census genes", "Total number of genes"]
53
        out.write("\t".join(header) + "\n")
54
55
        for tool in parsed_tools:
            tool_name = "GISTIC" if tool == parsed_tools[0] else "RUBIC"
56
57
58
59
60
61
            for cnv in tool:
                out.write(tool_name + "\t")
                cnv_type = "Amplification" if cnv[3] == "amp" else "Deletion"
                q_vals = ", ".join(cnv[4]) if type(cnv[4]) == tuple else cnv[4]
                begin_row = cnv[0:3] + [cnv_type, str(q_vals)]
                out.write("\t".join(begin_row))
Beatrice Tan's avatar
Beatrice Tan committed
62
63
64
65
66
67
                for list_genes in range(5,7):
                    out_genes = gene_ID_to_name(cnv[list_genes], ref_genome)
                    out.write("\t" + ", ".join(out_genes))
                out.write("\t" + str(len(cnv[7])))
                #for list_genes in range(5,8):
                    #out.write("\t" + ", ".join(cnv[list_genes]))
68
                out.write("\n")
69

70
71
def extract_gene_lists(parsed_results, out_GISTIC, out_RUBIC, out_both, known_genes, out_venn, ref_genome):
    """Extract lists of genes from the parsed results and a list of known genes to make files with gene lists and a venn diagram on the overlap."""
72
    genes_all = []
73
74
    genes_venn = []
    for tool_results in parsed_results:     #tool
75
        genes_tool = []
76
77
78
79
80
81
        for cnv_type in tool_results:       #type CNV
            for cnv in tool_results:
                genes = cnv[7]
                genes_tool += genes
        unique_genes = list(set(genes_tool))
        genes_all.append(unique_genes)
82
        out_file = out_GISTIC if tool_results == parsed_results[0] else out_RUBIC
83
        make_gene_file_tool(genes_tool, out_file)
84
    make_gene_file_overlap(genes_all, out_both)
85
86
87
88
89
90
91
92
    venn3_overlap(genes_all, known_genes, out_venn, ref_genome)

def make_gene_file_tool(gene_names, out_file):
    """Make file with list of genes reported by tool (one gene per line)."""
    with open(out_file, 'w') as out:
        #gene_IDs = gene_name_to_ID(gene_names)       #Convert gene names to IDs in order to do GO enrichment analysis
        #out.write("\n".join(gene_IDs))
        out.write("\n".join(gene_names))
93

94
def make_gene_file_overlap(gene_lists, out_both):
95
96
97
    """Make file with list of all genes reported by both tools (one gene per line)."""
    overlapping_genes = set(gene_lists[0]).intersection(set(gene_lists[1]))
    #overlapping_IDs = gene_name_to_ID(list(overlapping_genes))
98
    with open(out_both, 'w') as overlap_file:
99
100
        #overlap_file.write("\n".join(overlapping_IDs))
        overlap_file.write("\n".join(overlapping_genes))
101

102
103
104
def venn3_overlap(gene_lists, known_genes_file, out_venn, ref_genome):
    """Plot the overlap between the gene lists per tool and the list of known genes."""
    known_genes = parse_gene_file(known_genes_file, np.inf, ref_genome)
Beatrice Tan's avatar
Beatrice Tan committed
105
    overlap_tools = set(gene_lists[0]).intersection(set(gene_lists[1]))
106
107
108
109
110
111
112
113
114
    overlap_all = list(overlap_tools.intersection(set(known_genes)))
    overlap_GISTIC_known = list(set(gene_lists[0]).intersection(set(known_genes)))
    overlap_RUBIC_known = list(set(gene_lists[1]).intersection(set(known_genes)))
    print('Overlap RUBIC, GISTIC & known:\t' + ", ".join(gene_ID_to_name(overlap_all, ref_genome)))
    print('Overlap GISTIC & known:\t' + ", ".join(gene_ID_to_name(overlap_GISTIC_known, ref_genome)))
    print('Overlap RUBIC & known:\t' + ", ".join(gene_ID_to_name(overlap_RUBIC_known, ref_genome)))
    plt.subplots(figsize=(7, 7))
    venn_sets = [set(gene_lists[0]), set(gene_lists[1]), set(known_genes)]
    c = venn3(venn_sets, ('GISTIC', 'RUBIC', '                 Known genes'))
Beatrice Tan's avatar
Beatrice Tan committed
115
116
117
118
119
120
121
122
123
124
    try:
        c.get_patch_by_id('100').set_color('#5975A4'), c.get_patch_by_id('100').set_alpha(1)
        c.get_patch_by_id('010').set_color('#5F9E6E'), c.get_patch_by_id('010').set_alpha(1)
        c.get_patch_by_id('110').set_color('#5C888B'), c.get_patch_by_id('110').set_alpha(1)
        c.get_patch_by_id('001').set_color('#857AAA'), c.get_patch_by_id('001').set_alpha(1)
        c.get_patch_by_id('101').set_color('#6D77A7'), c.get_patch_by_id('101').set_alpha(1)
        c.get_patch_by_id('011').set_color('#748A8F'), c.get_patch_by_id('011').set_alpha(1)
        c.get_patch_by_id('111').set_color('#B55D60'), c.get_patch_by_id('111').set_alpha(1)
    except:
        pass    #too small overlap from three groups to show the overlapping region
125
    plt.savefig(out_venn, dpi=300)
126

127
def plot_histogram_sizes(parsed_results, plot_file):
128
    """Make boxplot with swarmplot on top showing sizes of recurrent regions detected by GISTIC2.0 and RUBIC."""
129
    size_list, size_list_known = parse_sizes_for_plot(parsed_results)
130
131
132
    sns.set_style("white")
    f, ax = plt.subplots(figsize=(7, 7))
    ax.set(yscale="log")
133
134
    g = sns.boxplot(x="Tool", y="Size", hue="CNV type", data=size_list, whis=np.inf, palette={"Amplification": "#71AEC0", "Deletion": "#B55D60"})
    g = sns.swarmplot(x="Tool", y="Size", dodge=True, hue="CNV type", data=size_list, size=5, palette={"Amplification": "#527F8C", "Deletion": "#844446"})
135
    g = sns.swarmplot(x="Tool", y="Size", dodge=True, hue="CNV type", data=size_list_known, size=5, palette={"Amplification": "black", "Deletion": "black"})
136
137
138
139
    sns.despine()
    g.set_xlabel("Tool",fontsize=16)
    g.set_ylabel("Log size of recurrent regions",fontsize=16)
    g.tick_params(labelsize=12)
140
    handles, labels = g.get_legend_handles_labels()         #Fix legend to prevent duplicates
141
142
    plt.legend(handles[0:2], labels[:2], fontsize=12)
    plt.savefig(plot_file, dpi=300)
143

144
def parse_sizes_for_plot(parsed_results):
145
    """Extract the sizes of regions to use as input for plot of sizes of recurrent regions."""
146
    sizes, sizes_known = [], []
147
    for tool_results in parsed_results:
148
        tool = 'GISTIC' if tool_results == parsed_results[0] else 'RUBIC'
149
        amp_sizes, del_sizes = [], []
150
        for cnv in tool_results:
151
152
153
154
155
156
            size = int(cnv[2]) - int(cnv[1])
            cnv_label = 'Amplification' if cnv[3] == 'amp' else 'Deletion'
            sizes += [(size, tool, cnv_label)]
            if cnv[5] != []:
                sizes_known += [(size, tool, cnv_label)]
    labels = ['Size', 'Tool', 'CNV type']
157
158
159
    df_sizes = pd.DataFrame(sizes, columns=labels)
    df_sizes_known = pd.DataFrame(sizes_known, columns=labels)
    return df_sizes, df_sizes_known
160

161
def make_bed_genes(known_file, biomart_file, out_bed, ref_genome):
162
    """Make bed file with location of known genes."""
163
164
    known_genes = parse_gene_file(known_file, np.inf, ref_genome)
    bed_lines = get_location_genes(known_genes, biomart_file)
165
166
    with open(out_bed, 'w') as out:
        out.write("track name=Known genes color=0,0,255\n")
167
        for line in bed_lines:
168
            out.write("\t".join(line) + "\n")
BeatriceTan's avatar
BeatriceTan committed
169

170
171
172
def get_location_genes(known_genes, biomart_file):
    """Extract locations of a list of genes from a biomart file with all genes to use for a bed file."""
    bed_lines = []
173
174
175
    with open(biomart_file, 'r') as biomart:
        biomart.readline()
        for line in biomart:
176
            gene = line.split("\t")[0]
177
178
            if gene in known_genes:
                gene_ID, gene_name, chrom, start, end = line.strip().split("\t")
179
180
                bed_lines.append(["chr" + chrom, start, end, gene_name])
    return bed_lines