create_summary.py 9.47 KB
Newer Older
1
2
3
#!/usr/bin/env python

import json
bow's avatar
bow committed
4
from pathlib import Path
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133

import click
from crimson import picard, vep


def process_aln_stats(path):
    pd = picard.parse(path)
    raw = next(x for x in pd["metrics"]["contents"]
               if x["CATEGORY"] == "PAIR")

    aln_proper_pairs = (raw["READS_ALIGNED_IN_PAIRS"] -
                        raw["PF_READS_IMPROPER_PAIRS"])

    return {
        "num_aligned_bases": raw["PF_ALIGNED_BASES"],
        "num_aligned_reads": raw["PF_READS_ALIGNED"],
        "num_aligned_reads_proper_pairs": aln_proper_pairs,
        "num_total_reads": raw["TOTAL_READS"],
        "pct_adapter": raw["PCT_ADAPTER"],
        "pct_aligned_reads_from_total": (raw["PF_READS_ALIGNED"] * 100. /
                                         raw["TOTAL_READS"]),
        "pct_aligned_reads_proper_pairs": (aln_proper_pairs * 100. /
                                           raw["PF_READS_ALIGNED"]),
        "pct_chimeras": raw["PCT_CHIMERAS"],
        "rate_indel": raw["PF_INDEL_RATE"],
        "rate_mismatch": raw["PF_MISMATCH_RATE"],
        "strand_balance": raw["STRAND_BALANCE"],
    }


def process_rna_stats(path):
    pd = picard.parse(path)
    raw = pd["metrics"]["contents"]
    cov_sort_key = lambda x: x["normalized_position"]
    return {
        "median_3prime_bias": raw["MEDIAN_3PRIME_BIAS"],
        "median_5prime_bias": raw["MEDIAN_5PRIME_BIAS"],
        "median_5prime_to_3prime_bias": raw["MEDIAN_5PRIME_TO_3PRIME_BIAS"],
        "median_cv_coverage": raw["MEDIAN_CV_COVERAGE"],
        "num_coding_bases": raw["CODING_BASES"],
        "num_intergenic_bases": raw["INTERGENIC_BASES"],
        "num_intronic_bases": raw["INTRONIC_BASES"],
        "num_mrna_bases": raw["CODING_BASES"] + raw["UTR_BASES"],
        "num_ribosomal_bases": (raw["RIBOSOMAL_BASES"]
                                if raw["RIBOSOMAL_BASES"] != ""
                                else None),
        "num_total_bases": raw["PF_BASES"],
        "num_utr_bases": raw["UTR_BASES"],
        "pct_coding_bases": raw["PCT_CODING_BASES"],
        "pct_intergenic_bases": raw["PCT_INTERGENIC_BASES"],
        "pct_intronic_bases": raw["PCT_INTRONIC_BASES"],
        "pct_mrna_bases": raw["PCT_MRNA_BASES"],
        "pct_ribosomal_bases": (raw["RIBOSOMAL_BASES"] * 100. / raw["PF_ALIGNED_BASES"]
                                if raw["RIBOSOMAL_BASES"] != ""
                                else None),
        "pct_utr_bases": raw["PCT_UTR_BASES"],
        "normalized_cov": [item["All_Reads.normalized_coverage"]
                           for item in sorted(pd["histogram"]["contents"],
                                              key=cov_sort_key, reverse=False)]
    }


def process_seq_stats(path):
    with open(path) as src:
        raw = json.load(src)

    def f(rgi):
        return {
            "raw": {
                "num_reads_r1": rgi["raw"]["R1"]["num_seq"],
                "num_reads_r2": rgi["raw"]["R2"]["num_seq"],
                "pct_gc_r1": rgi["raw"]["R1"]["pct_gc"],
                "pct_gc_r2": rgi["raw"]["R2"]["pct_gc"],
            },
            "proc": {
                "num_reads_r1": rgi["proc"]["R1"]["num_seq"],
                "num_reads_r2": rgi["proc"]["R2"]["num_seq"],
                "pct_gc_r1": rgi["proc"]["R1"]["pct_gc"],
                "pct_gc_r2": rgi["proc"]["R2"]["pct_gc"],
            },
            "name": rgi["name"],
        }

    rv = {
        "all_read_groups": {
            "raw": {
                "num_reads_r1": raw["raw"]["R1"]["num_seq"],
                "num_reads_r2": raw["raw"]["R2"]["num_seq"],
            },
            "proc": {
                "num_reads_r1": raw["proc"]["R1"]["num_seq"],
                "num_reads_r2": raw["proc"]["R2"]["num_seq"],
            },
        },
        "per_read_group": [f(item) for item in raw["read_groups"]],

    }
    return rv


def process_insert_stats(path):
    pd = picard.parse(path)
    raw = pd["metrics"]["contents"]
    # Raw can be a list if there are more than one PAIR_ORIENTATION.
    if isinstance(raw, list):
        raw = next(r for r in raw if r["PAIR_ORIENTATION"] == "FR")
    return {
        "median_absolute_deviation": raw["MEDIAN_ABSOLUTE_DEVIATION"],
        "median_insert_size": raw["MEDIAN_INSERT_SIZE"],
        "min_insert_size": raw["MIN_INSERT_SIZE"],
        "max_insert_size": raw["MAX_INSERT_SIZE"],
    }


def process_var_stats(path):
    pd = vep.parse(path)
    return {
        "coding_consequences":
        {k: v for k, v in pd["Coding consequences"].items()},
        "num_deletions": pd["Variant classes"].get("deletion", 0),
        "num_insertions": pd["Variant classes"].get("insertion", 0),
        "num_snvs": pd["Variant classes"].get("SNV", 0),
        "per_chromosome":
        {k: v
         for k, v in pd["Variants by chromosome"].items()
         if k in {str(i) for i in range(1, 23)}.union({"X", "Y", "MT"})},
        "polyphen": {
            "num_benign_variants": pd["PolyPhen summary"].get("benign", 0),
            "num_possibly_damaging_variants":
bow's avatar
bow committed
134
            pd["PolyPhen summary"].get("possibly damaging", 0),
135
            "num_probably_damaging_variants":
bow's avatar
bow committed
136
            pd["PolyPhen summary"].get("probably damaging", 0),
137
138
139
140
141
142
143
144
145
146
147
            "num_unknown_variants": pd["PolyPhen summary"].get("unknown", 0),
        },
        "sift": {
            "num_deleterious_variants":
            pd["SIFT summary"].get("deleterious", 0),
            "num_tolerated_variants":
            pd["SIFT summary"].get("tolerated", 0),
        },
    }


148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def process_exon_cov_stats(path):
    with open(path, "r") as src:
        raw = json.load(src)

    tempd = {}
    for val in raw.values():
        gene = val.pop("gx")
        if gene not in tempd:
            tempd[gene] = {}
        trx = val.pop("trx")
        if trx not in tempd[gene]:
            tempd[gene][trx] = {}
        exon = int(val["exon_num"])
        assert exon not in tempd[gene][trx], f"{gene}:{trx}:{exon}"
        tempd[gene][trx][exon] = val

    return {gid: {tid: sorted([exn for exn in tv.values()],
                               key=lambda x: int(x["exon_num"]))
                  for tid, tv in gv.items()}
            for gid, gv in tempd.items()}


170
171
172
173
174
175
176
177
178
def post_process(cs):
    cs["aln"]["num_total_bases"] = (cs["rna"]
                                          .pop("num_total_bases"))
    pct_bases_aln = (cs["aln"]["num_aligned_bases"] * 100. /
                     cs["aln"]["num_total_bases"])
    cs["aln"]["pct_aligned_bases_from_total"] = pct_bases_aln
    return cs


bow's avatar
bow committed
179
180
181
def parse_idm(fh):
    idms = []
    for lineno, line in enumerate(fh):
bow's avatar
bow committed
182
183
        if lineno == 0:
            continue
bow's avatar
bow committed
184
185
186
187
188
189
190
191
192
193
194
195
        gid, gsym, raw_tids = line.strip().split("\t")
        tids = raw_tids.split(",")
        idms.append({"gene_id": gid, "gene_symbol": gsym,
                     "transcript_ids": tids})
    return idms


def add_variant_plots(idm, var_plot_dir):
    idms = set([])
    for item in idm:
        gsym = item["gene_symbol"]
        assert gsym not in idms, item
bow's avatar
bow committed
196
197
198
199
200
201
202
203
204
205
206
207
208
        idms.add(gsym)

    plots = []
    vpd = Path(var_plot_dir)
    for png in vpd.glob("*.png"):
        stem = png.stem
        _, gene = png.stem.rsplit("_gene_", 1)
        if gene in idms:
            plots.append({"path": str(png), "gene": gene})

    return plots


209
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
bow's avatar
bow committed
210
211
212
@click.argument("id_mappings_path", type=click.File("r"))
@click.argument("var_plot_dir",
                type=click.Path(exists=True, file_okay=False))
213
214
215
216
217
218
219
220
@click.argument("seq_stats_path",
                type=click.Path(exists=True, dir_okay=False))
@click.argument("aln_stats_path",
                type=click.Path(exists=True, dir_okay=False))
@click.argument("rna_stats_path",
                type=click.Path(exists=True, dir_okay=False))
@click.argument("insert_stats_path",
                type=click.Path(exists=True, dir_okay=False))
221
222
@click.argument("exon_cov_stats_path",
                type=click.Path(exists=True, dir_okay=False))
223
224
@click.argument("vep_stats_path",
                type=click.Path(exists=True, dir_okay=False))
bow's avatar
bow committed
225
226
@click.option("-r", "--run-name", type=str,
              help="Name of the run in which the stats were generated.")
227
228
@click.option("-n", "--sample-name", type=str,
              help="Name of the sample from which the stats were generated.")
229
230
@click.option("--pipeline-version", type=str,
              help="Version string of the pipeline.")
bow's avatar
bow committed
231
232
233
def main(id_mappings_path, var_plot_dir,
         seq_stats_path, aln_stats_path, rna_stats_path,
         insert_stats_path, exon_cov_stats_path, vep_stats_path,
bow's avatar
bow committed
234
         run_name, sample_name, pipeline_version):
235
    """Helper script for combining multiple stats files into one JSON."""
bow's avatar
bow committed
236
    idm = parse_idm(id_mappings_path)
237
    combined = {
bow's avatar
bow committed
238
239
240
241
        "metadata": {
            "pipeline_version": pipeline_version,
            "run_name": run_name,
            "sample_name": sample_name,
bow's avatar
bow committed
242
            "genes_of_interest": idm,
bow's avatar
bow committed
243
        },
244
245
246
247
        "stats": {
            "seq": process_seq_stats(seq_stats_path),
            "aln": process_aln_stats(aln_stats_path),
            "rna": process_rna_stats(rna_stats_path),
248
            "cov": process_exon_cov_stats(exon_cov_stats_path),
249
            "ins": process_insert_stats(insert_stats_path),
250
251
            "var": process_var_stats(vep_stats_path),
        },
bow's avatar
bow committed
252
253
254
        "results": {
            "var": {"plots": []},
        },
255
    }
bow's avatar
bow committed
256
257
    combined["results"]["var"]["plots"].extend(
        add_variant_plots(idm, var_plot_dir))
258
259
260
261
262
263
264
    combined["stats"] = post_process(combined["stats"])
    print(json.dumps(combined, separators=(",", ":"), sort_keys=True))


if __name__ == "__main__":
    main()