ParseResults.py 9.22 KB
Newer Older
1
2
3
import numpy as np
import os.path
from math import log10
Beatrice Tan's avatar
Beatrice Tan committed
4
from collections import OrderedDict
5
from ensemblQueries import genes_at_locus, gene_IDs_at_locus, gene_name_to_ID
6

7
def parse_regions(results_file, known_file, census_file, used_tool, ref_genome):
8
    """Make list with the parsed results from GISTIC or RUBIC.
Beatrice Tan's avatar
Beatrice Tan committed
9
    Format: [chrom, start, end, cnv_type, qval, [known_gene_list], [census_gene_list], [gene_list]]"""
10
11
    known_genes = parse_gene_file(known_file, np.inf, ref_genome)
    census_genes = parse_gene_file(census_file, np.inf, ref_genome)
12
    if used_tool == 'GISTIC':
13
14
15
        recurrent_regions = parse_gistic_results(results_file, known_genes, census_genes, ref_genome)
    elif used_tool == 'RUBIC':
        recurrent_regions = parse_rubic_results(results_file[0], results_file[1], known_genes, census_genes, ref_genome)
16
    else:
17
        recurrent_regions = parse_overlapping_regions(results_file, known_genes, census_genes, ref_genome)
18
    return recurrent_regions
19

20
def parse_gene_file(gene_file, top_nr, ref_genome):
21
22
23
24
25
26
    """Parse a file with a list of genes (one gene per line or first column).
    Top_nr: the number of genes to extract (requires ranked gene list file), use np.inf to extract all genes."""
    list_genes = []
    num_read = 0
    with open(gene_file, 'r') as genes:
        for line in genes:
27
28
29
30
31
            if line.startswith("Supplementary") or line.startswith("\t") or line.startswith("Gene") or line.startswith("DRIVER_LABEL"):    #deal with supplement files with one or more header lines
                continue
            else:
                if line.startswith("perProject") or line.startswith("Pooled_driver"):
                    gene = line.split("\t")[1]  #gene ID
32
                    list_genes.append(gene)
33
34
35
36
                else:
                    gene_name = line.split("\t")[0]
                    gene = gene_name_to_ID([gene_name], ref_genome)
                    list_genes += gene
37
38
39
40
    if top_nr != np.inf:
        list_genes = list_genes[0:top_nr]
    return list_genes

41
def parse_gistic_results(results_file, known_genes, census_genes, ref_genome):
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
    """Create a list of lists with the recurrent regions called by GISTIC2."""
    regions = []
    with open(results_file, 'r') as lesions:
        lesions.readline()
        for lesion in lesions:
            stats = lesion.strip().split("\t")
            if stats[0].endswith("CN values"):
                continue
            else:
                CNV_id = stats[2].split("(")[0]
                chrom, bp = CNV_id.split(":")
                chrom = chrom.strip("chr")
                start, end = bp.split("-")
                cnv_type = 'amp' if stats[0].startswith("Amplification") else 'del'
                qval = log10(float(stats[5])) * -1   #Convert q-value to log10(q-value)
Beatrice Tan's avatar
Beatrice Tan committed
57
58
59
                result_dir, conf = results_file.split("all_lesions")
                gene_file = os.path.join(result_dir, cnv_type + "_genes" + conf)
                list_genes = gistic_genes(gene_file, CNV_id)
60
61
62
63
64
65
                #known_overlap = set(list_genes).intersection(set(known_genes))
                #census_overlap = set(list_genes).intersection(set(census_genes))
                overlapping_genes = gene_IDs_at_locus(int(chrom), int(start), int(end), ref_genome)     #get list of genes at genomic region
                known_overlap = set(overlapping_genes).intersection(set(known_genes))       #extract known genes captured by analysis
                census_overlap = set(overlapping_genes).intersection(set(census_genes))
                cnv_data = [chrom, start, end, cnv_type, qval, list(known_overlap), list(census_overlap), overlapping_genes]
66
67
                regions.append(cnv_data)
    return regions
68

Beatrice Tan's avatar
Beatrice Tan committed
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def gistic_genes(gene_file, CNV_id):
        """Extract list of genes for a given recurrent CNV from GISTIC2 output."""
        out_genes = []
        with open(gene_file, 'r') as genes:
            for i in range(3):
                genes.readline()
            column_CNV = genes.readline().split("\t").index(CNV_id)
            for line in genes:
                if len(line.split("\t")) >= column_CNV:
                    gene = line.split("\t")[column_CNV]
                    if gene != '':
                        out_genes.append(gene)
        return out_genes

83
def parse_rubic_results(results_gains, results_losses, known_genes, census_genes, ref_genome):
84
    """Create a list of lists with the recurrent regions called by RUBIC."""
85
86
87
88
89
90
91
    regions = []
    for result_file in results_gains, results_losses:
        with open(result_file, 'r') as lesions:
            lesions.readline()
            for lesion in lesions:
                chrom, start, end, qval_left, qval_right, genes = lesion.split("\t")
                cnv_type = 'amp' if result_file.endswith('gains.txt') else 'del'
92
93
94
95
96
97
98
99
100
                #list_genes = genes.strip().split(",")
                qval_left = float(qval_left)
                qval_right = float(qval_right)
                #known_overlap = set(list_genes).intersection(set(known_genes))
                #census_overlap = set(list_genes).intersection(set(census_genes))
                overlapping_genes = gene_IDs_at_locus(chrom, int(start), int(end), ref_genome)
                known_overlap = set(overlapping_genes).intersection(set(known_genes))
                census_overlap = set(overlapping_genes).intersection(set(census_genes))
                cnv_data = [chrom, start, end, cnv_type, (str(qval_left), str(qval_right)),  list(known_overlap), list(census_overlap), overlapping_genes]
101
102
                regions.append(cnv_data)
    return regions
103

104
105
106
def parse_overlapping_regions(bed_file, known_genes, census_genes, ref_genome):
    """Create a list of lists with overlapping regions from bed file."""
    regions = []
107
108
109
110
111
112
    for cnv_type in 'amp', 'del':
        with open(bed_file, 'r') as lesions:
            for line in lesions:
                first_chr, first_start, first_end, first_nr, sec_chr, sec_start, sec_end, sec_nr, bp_overlap = line.strip().split("\t")
                begin_overlap = max(int(first_start), int(sec_start))
                end_overlap = min(int(first_end), int(sec_end))
113
114
115
                overlapping_genes = gene_IDs_at_locus(first_chr, begin_overlap, end_overlap, ref_genome)
                known_overlap = set(overlapping_genes).intersection(set(known_genes))
                census_overlap = set(overlapping_genes).intersection(set(census_genes))
116
                if cnv_type == 'amp':
117
118
119
                    if (first_nr.startswith("Any-AP") and sec_nr.startswith("amp")) or (sec_nr.startswith("Any-AP") and first_nr.starswith("amp")): #both tools report amplification
                        cnv_data = [first_chr, begin_overlap, end_overlap, cnv_type, 'N/A', list(known_overlap), list(census_overlap), overlapping_genes]
                        regions.append(cnv_data)
120
                else:
121
                    if (first_nr.startswith("Any-DP") and sec_nr.startswith("del")) or (sec_nr.startswith("Any-DP") and first_nr.starswith("del")):
122
123
124
                        cnv_data = [first_chr, begin_overlap, end_overlap, cnv_type, 'N/A', list(known_overlap), list(census_overlap), overlapping_genes]
                        regions.append(cnv_data)
    return regions
125

126
127
128
129
def get_stats(list_regions, used_tool):
    """Calculate stats on the results from GISTIC2 and RUBIC."""
    return_info = []
    for cnv_type in 'amp', 'del':
130
        tot_size, nr_genes, nr_reg, known_count, census_count, focal_count = 0, 0, 0, 0, 0, 0
Beatrice Tan's avatar
Beatrice Tan committed
131
        known_census_count, known_focal = 0,0
132
        all_genes = []
133
        driver_density = []
134
        for alt in list_regions:
135
            if alt[3] == cnv_type:
136
                nr_reg += 1
137
138
                reg_size = int(alt[2]) - int(alt[1])
                tot_size += reg_size
Beatrice Tan's avatar
Beatrice Tan committed
139
                if reg_size < 10000000:
140
                    focal_count += 1
Beatrice Tan's avatar
Beatrice Tan committed
141
142
                    if alt[5] != []:
                        known_focal += 1
143
                all_genes = all_genes + alt[7]
144
145
146
147
                if alt[5] != []:
                    known_count += 1
                if alt[6] != []:
                    census_count += 1
148
149
                drivers = 1 / len(alt[7]) if alt[7] != [] else 0
                driver_density.append(drivers)
150
        nr_genes = len(list(set(all_genes)))    #unique genes only
Beatrice Tan's avatar
Beatrice Tan committed
151
        stats = calculate_stats(tot_size, nr_genes, nr_reg, known_count, census_count, focal_count, cnv_type, used_tool, driver_density, known_focal)
152
153
        return_info.append(stats)
    return return_info
154

Beatrice Tan's avatar
Beatrice Tan committed
155
def calculate_stats(tot_size, nr_genes, nr_reg, known_count, census_count, focal_count, cnv_type, used_tool, driver_density, known_focal):
156
157
158
159
    """Calculate average & total numbers for report on recurrent regions."""
    type_name = 'Amplifications' if cnv_type == 'amp' else 'Deletions'
    avg_size = int(tot_size / nr_reg / 1000) if nr_reg != 0 else 'N/A'          #average size in kb
    round_tot_size = round((tot_size / 1000000), 2)                             #total size in mb
160
161
    avg_driver_density = round(sum(driver_density) / nr_reg, 2)
    stats = [used_tool, type_name, nr_reg, avg_size, round_tot_size, nr_genes, avg_driver_density]
162
    stats = list(map(str, stats))
Beatrice Tan's avatar
Beatrice Tan committed
163
    for count in focal_count, known_count, census_count, known_focal:
164
        percent = (count / nr_reg * 100) if count != 0 else 0
Beatrice Tan's avatar
Beatrice Tan committed
165
        stat_str = str(count) + " (" + str(int(percent)) + "%)"
166
167
        stats.append(stat_str)
    return stats