pdf_report.py 18.5 KB
Newer Older
bow's avatar
bow committed
1
#!/usr/bin/env python
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
bow's avatar
bow committed
17
# pdf_report.py
18
#
bow's avatar
bow committed
19
20
# TeX template generation script for Gentrap pipeline.

21
from __future__ import print_function
bow's avatar
bow committed
22
23
24
25
26
27

import argparse
import json
import locale
import os
import re
28
29
import sys
from os import path
bow's avatar
bow committed
30
31
32

from jinja2 import Environment, FileSystemLoader

bow's avatar
bow committed
33

bow's avatar
bow committed
34
# set locale for digit grouping
35
locale.setlocale(locale.LC_ALL, "")
bow's avatar
bow committed
36

37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288

class FastQCModule(object):

    """Class representing a FastQC analysis module."""

    def __init__(self, raw_lines, end_mark='>>END_MODULE'):
        """

        :param raw_lines: list of lines in the module
        :type raw_lines: list of str
        :param end_mark: mark of the end of the module
        :type end_mark: str

        """
        self.raw_lines = raw_lines
        self.end_mark = end_mark
        self._status = None
        self._name = None
        self._data = self._parse()

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                '[%r, ...]' % self.raw_lines[0])

    def __str__(self):
        return ''.join(self.raw_lines)

    @property
    def name(self):
        """Name of the module."""
        return self._name

    @property
    def columns(self):
        """Columns in the module."""
        return self._columns

    @property
    def data(self):
        """FastQC data."""
        return self._data

    @property
    def status(self):
        """FastQC run status."""
        return self._status

    def _parse(self):
        """Common parser for a FastQC module."""
        # check that the last line is a proper end mark
        assert self.raw_lines[-1].startswith(self.end_mark)
        # parse name and status from first line
        tokens = self.raw_lines[0].strip().split('\t')
        name = tokens[0][2:]
        self._name = name
        status = tokens[-1]
        assert status in ('pass', 'fail', 'warn'), "Unknown module status: %r" \
            % status
        self._status = status
        # and column names from second line
        columns = self.raw_lines[1][1:].strip().split('\t')
        self._columns = columns
        # the rest of the lines except the last one
        data = []
        for line in self.raw_lines[2:-1]:
            cols = line.strip().split('\t')
            data.append(cols)

        # optional processing for different modules
        if self.name == 'Basic Statistics':
            data = {k: v for k, v in data}

        return data


class FastQC(object):

    """Class representing results from a FastQC run."""

    # module name -- attribute name mapping
    _mod_map = {
        '>>Basic Statistics': 'basic_statistics',
        '>>Per base sequence quality': 'per_base_sequence_quality',
        '>>Per sequence quality scores': 'per_sequence_quality_scores',
        '>>Per base sequence content': 'per_base_sequence_content',
        '>>Per base GC content': 'per_base_gc_content',
        '>>Per sequence GC content': 'per_sequence_gc_content',
        '>>Per base N content': 'per_base_n_content',
        '>>Sequence Length Distribution': 'sequence_length_distribution',
        '>>Sequence Duplication Levels': 'sequence_duplication_levels',
        '>>Overrepresented sequences': 'overrepresented_sequences',
        '>>Kmer content': 'kmer_content',
    }

    def __init__(self, fname):
        """

        :param fp: open file handle pointing to the FastQC data file
        :type fp: file handle

        """
        # get file name
        self.fname = fname
        self._modules = {}

        with open(fname, "r") as fp:
            line = fp.readline()
            while True:

                tokens = line.strip().split('\t')
                # break on EOF
                if not line:
                    break
                # parse version
                elif line.startswith('##FastQC'):
                    self.version = line.strip().split()[1]
                # parse individual modules
                elif tokens[0] in self._mod_map:
                    attr = self._mod_map[tokens[0]]
                    raw_lines = self._read_module(fp, line, tokens[0])
                    self._modules[attr] = FastQCModule(raw_lines)

                line = fp.readline()

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.fname)

    def _filter_by_status(self, status):
        """Filter out modules whose status is different from the given status.

        :param status: module status
        :type status: str
        :returns: a list of FastQC module names with the given status
        :rtype: list of str

        """
        return [x.name for x in self._modules.values() if x.status == status]

    def _read_module(self, fp, line, start_mark):
        """Returns a list of lines in a module.

        :param fp: open file handle pointing to the FastQC data file
        :type fp: file handle
        :param line: first line in the module
        :type line: str
        :param start_mark: string denoting start of the module
        :type start_mark: str
        :returns: a list of lines in the module
        :rtype: list of str

        """
        raw = [line]
        while not line.startswith('>>END_MODULE'):
            line = fp.readline()
            raw.append(line)

            if not line:
                raise ValueError("Unexpected end of file in module %r" % line)

        return raw

    @property
    def modules(self):
        """All modules in the FastQC results."""
        return self._modules

    @property
    def passes(self):
        """All module names that pass QC."""
        return self._filter_by_status('pass')

    @property
    def passes_num(self):
        """How many modules have pass status."""
        return len(self.passes)

    @property
    def warns(self):
        """All module names with warning status."""
        return self._filter_by_status('warn')

    @property
    def warns_num(self):
        """How many modules have warn status."""
        return len(self.warns)

    @property
    def fails(self):
        """All names of failed modules."""
        return self._filter_by_status('fail')

    @property
    def fails_num(self):
        """How many modules failed."""
        return len(self.fails)

    @property
    def basic_statistics(self):
        """Basic statistics module results."""
        return self._modules['basic_statistics']

    @property
    def per_base_sequence_quality(self):
        """Per base sequence quality module results."""
        return self._modules['per_base_sequence_quality']

    @property
    def per_sequence_quality_scores(self):
        """Per sequence quality scores module results."""
        return self._modules['per_sequence_quality_scores']

    @property
    def per_base_sequence_content(self):
        """Per base sequence content module results."""
        return self._modules['per_base_sequence_content']

    @property
    def per_base_gc_content(self):
        """Per base GC content module results."""
        return self._modules['per_base_gc_content']

    @property
    def per_sequence_gc_content(self):
        """Per sequence GC content module results."""
        return self._modules['per_sequence_gc_content']

    @property
    def per_base_n_content(self):
        """Per base N content module results."""
        return self._modules['per_base_n_content']

    @property
    def sequence_length_distribution(self):
        """Per sequence length distribution module results."""
        return self._modules['sequence_length_distribution']

    @property
    def sequence_duplication_levels(self):
        """Sequence duplication module results."""
        return self._modules['sequence_duplication_levels']

    @property
    def overrepresented_sequences(self):
        """Overrepresented sequences module results."""
        return self._modules['overrepresented_sequences']

    @property
    def kmer_content(self):
        """Kmer content module results."""
        return self._modules['kmer_content']


bow's avatar
bow committed
289
290
291
292
293
294
295
# HACK: remove this and use jinja2 only for templating
class LongTable(object):

    """Class representing a longtable in LaTeX."""

    def __init__(self, caption, label, header, aln, colnum):
        self.lines = [
296
297
298
299
300
301
302
303
304
            "\\begin{center}",
            "\\captionof{table}{%s}" % caption,
            "\\label{%s}" % label,
            "\\begin{longtable}{%s}" % aln,
            "\\hline",
            "%s" % header,
            "\\hline \\hline",
            "\\endhead",
            "\\hline \\multicolumn{%i}{c}{\\textit{Continued on next page}}\\\\" % \
bow's avatar
bow committed
305
                    colnum,
306
307
308
309
            "\\hline",
            "\\endfoot",
            "\\hline",
            "\\endlastfoot",
bow's avatar
bow committed
310
311
312
        ]

    def __str__(self):
313
        return "\n".join(self.lines)
bow's avatar
bow committed
314
315
316
317
318

    def add_row(self, row):
        self.lines.append(row)

    def end(self):
319
320
321
        self.lines.extend(["\\end{longtable}", "\\end{center}",
            "\\addtocounter{table}{-1}"])

bow's avatar
bow committed
322
323

# filter functions for the jinja environment
bow's avatar
bow committed
324
325
326
def nice_int(num, default="None"):
    if num is None:
        return default
bow's avatar
bow committed
327
328
329
330
    try:
        return locale.format("%i", int(num), grouping=True)
    except:
        return default
bow's avatar
bow committed
331
332


bow's avatar
bow committed
333
334
335
def nice_flt(num, default="None"):
    if num is None:
        return default
bow's avatar
bow committed
336
337
338
339
340
341
342
343
344
345
346
347
348
    try:
        return locale.format("%.2f", float(num), grouping=True)
    except:
        return default


def float2nice_pct(num, default="None"):
    if num is None:
        return default
    try:
        return locale.format("%.2f", float(num) * 100.0, grouping=True)
    except:
        return default
bow's avatar
bow committed
349
350
351
352
353


# and some handy functions
def natural_sort(inlist):
    key = lambda x: [int(a) if a.isdigit() else a.lower() for a in
354
            re.split("([0-9]+)", x)]
bow's avatar
bow committed
355
356
357
358
    inlist.sort(key=key)
    return inlist


bow's avatar
bow committed
359
def write_template(run, template_file, logo_file):
bow's avatar
bow committed
360

361
362
    template_file = path.abspath(path.realpath(template_file))
    template_dir = path.dirname(template_file)
bow's avatar
bow committed
363
364
365
    # spawn environment and create output directory
    env = Environment(loader=FileSystemLoader(template_dir))

366
367
368
369
370
371
372
    # change delimiters since LaTeX may use "{{", "{%", or "{#"
    env.block_start_string = "((*"
    env.block_end_string = "*))"
    env.variable_start_string = "((("
    env.variable_end_string = ")))"
    env.comment_start_string = "((="
    env.comment_end_string = "=))"
bow's avatar
bow committed
373
374
375
376
377
378

    # trim all block-related whitespaces
    env.trim_blocks = True
    env.lstrip_blocks = True

    # put in out filter functions
379
380
    env.filters["nice_int"] = nice_int
    env.filters["nice_flt"] = nice_flt
bow's avatar
bow committed
381
    env.filters["float2nice_pct"] = float2nice_pct
bow's avatar
bow committed
382
383

    # write tex template for pdflatex
384
    jinja_template = env.get_template(path.basename(template_file))
bow's avatar
bow committed
385
    run.logo = logo_file
bow's avatar
bow committed
386
    render_vars = {
bow's avatar
bow committed
387
        "run": run,
bow's avatar
bow committed
388
389
390
    }
    rendered = jinja_template.render(**render_vars)

391
    print(rendered, file=sys.stdout)
bow's avatar
bow committed
392
393


bow's avatar
bow committed
394
395
396
397
398
399
400
401
402
class GentrapLib(object):

    def __init__(self, run, sample, name, summary):
        assert isinstance(run, GentrapRun)
        assert isinstance(sample, GentrapSample)
        self.run = run
        self.sample = sample
        self.name = name
        self._raw = summary
bow's avatar
bow committed
403
        # flexiprep settings
bow's avatar
bow committed
404
405
406
407
        self.flexiprep = summary.get("flexiprep", {})
        self.clipping = not self.flexiprep["settings"]["skip_clip"]
        self.trimming = not self.flexiprep["settings"]["skip_trim"]
        self.is_paired_end = self.flexiprep["settings"]["paired"]
408
        if "fastqc_R1" in self.flexiprep["files"]:
bow's avatar
bow committed
409
410
            self.fastqc_r1_files = self.flexiprep["files"]["fastqc_R1"]
            self.fastqc_r1 = FastQC(self.fastqc_r1_files["fastqc_data"]["path"])
411
        if "fastqc_R2" in self.flexiprep["files"]:
bow's avatar
bow committed
412
413
            self.fastqc_r2_files = self.flexiprep["files"]["fastqc_R2"]
            self.fastqc_r2 = FastQC(self.fastqc_r2_files["fastqc_data"]["path"])
414
        if "fastqc_R1_qc" in self.flexiprep["files"]:
bow's avatar
bow committed
415
416
            self.fastqc_r1_qc_files = self.flexiprep["files"]["fastqc_R1_qc"]
            self.fastqc_r1_qc = FastQC(self.fastqc_r1_qc_files["fastqc_data"]["path"])
417
        if "fastqc_R2_qc" in self.flexiprep["files"]:
bow's avatar
bow committed
418
419
            self.fastqc_r2_qc_files = self.flexiprep["files"]["fastqc_R2_qc"]
            self.fastqc_r2_qc = FastQC(self.fastqc_r2_qc_files["fastqc_data"]["path"])
bow's avatar
bow committed
420
421
        # mapping metrics settings
        self.aln_metrics = summary.get("bammetrics", {}).get("stats", {}).get("alignment_metrics", {})
422
423
424
        # insert size metrics files
        self.inserts_metrics_files = summary.get("bammetrics", {}).get("files", {}).get("insert_size_metrics", {})
        # rna metrics files and stats
bow's avatar
bow committed
425
        self.rna_metrics_files = summary.get("gentrap", {}).get("files", {}).get("rna_metrics", {})
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
        _rmetrics = summary.get("gentrap", {}).get("stats", {}).get("rna_metrics", {})
        if _rmetrics:
            self.rna_metrics = {k: v for k, v in _rmetrics.items() }
            pf_bases = float(_rmetrics["pf_bases"])
            exonic_bases = int(_rmetrics.get("coding_bases", 0)) + int(_rmetrics.get("utr_bases", 0))
            # picard uses pct_ but it's actually ratio ~ we follow their convention
            pct_exonic_bases_all = exonic_bases / float(_rmetrics["pf_bases"])
            pct_exonic_bases = exonic_bases / float(_rmetrics.get("pf_aligned_bases", 0))
            self.rna_metrics.update({
                    "exonic_bases": exonic_bases,
                    "pct_exonic_bases_all": pct_exonic_bases_all,
                    "pct_exonic_bases": pct_exonic_bases,
                    "pct_aligned_bases": 1.0,
                    "pct_aligned_bases_all": float(_rmetrics.get("pf_aligned_bases", 0.0)) / pf_bases,
                    "pct_coding_bases_all": float(_rmetrics.get("coding_bases", 0.0)) / pf_bases,
                    "pct_utr_bases_all": float(_rmetrics.get("utr_bases", 0.0)) / pf_bases,
                    "pct_intronic_bases_all": float(_rmetrics.get("intronic_bases", 0.0)) / pf_bases,
                    "pct_intergenic_bases_all": float(_rmetrics.get("intergenic_bases", 0.0)) / pf_bases,
            })
            if _rmetrics.get("ribosomal_bases", "") != "":
                self.rna_metrics["pct_ribosomal_bases_all"] = float(_rmetrics.get("pf_ribosomal_bases", 0.0)) / pf_bases
bow's avatar
bow committed
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463

    def __repr__(self):
        return "{0}(sample=\"{1}\", lib=\"{2}\")".format(
                self.__class__.__name__, self.sample.name, self.name)


class GentrapSample(object):

    def __init__(self, run, name, summary):
        assert isinstance(run, GentrapRun)
        self.run = run
        self.name = name
        self._raw = summary
        self.lib_names = sorted(summary["libraries"].keys())
        self.libs = \
            {l: GentrapLib(self.run, self, l, summary["libraries"][l]) \
                for l in self.lib_names}
bow's avatar
bow committed
464
465
466
467
468
        self.is_paired_end = summary.get("gentrap", {}).get("stats", {}).get("pipeline", {})["all_paired"]
        # mapping metrics settings
        self.aln_metrics = summary.get("bammetrics", {}).get("stats", {}).get("alignment_metrics", {})
        # insert size metrics files
        self.inserts_metrics_files = summary.get("bammetrics", {}).get("files", {}).get("insert_size_metrics", {})
469
        # rna metrics files and stats
bow's avatar
bow committed
470
        self.rna_metrics_files = summary.get("gentrap", {}).get("files", {}).get("rna_metrics", {})
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
        _rmetrics = summary.get("gentrap", {}).get("stats", {}).get("rna_metrics", {})
        if _rmetrics:
            self.rna_metrics = {k: v for k, v in _rmetrics.items() }
            pf_bases = float(_rmetrics["pf_bases"])
            exonic_bases = int(_rmetrics.get("coding_bases", 0)) + int(_rmetrics.get("utr_bases", 0))
            # picard uses pct_ but it's actually ratio ~ we follow their convention
            pct_exonic_bases_all = exonic_bases / float(_rmetrics["pf_bases"])
            pct_exonic_bases = exonic_bases / float(_rmetrics.get("pf_aligned_bases", 0))
            self.rna_metrics.update({
                    "exonic_bases": exonic_bases,
                    "pct_exonic_bases_all": pct_exonic_bases_all,
                    "pct_exonic_bases": pct_exonic_bases,
                    "pct_aligned_bases": 1.0,
                    "pct_aligned_bases_all": float(_rmetrics.get("pf_aligned_bases", 0.0)) / pf_bases,
                    "pct_coding_bases_all": float(_rmetrics.get("coding_bases", 0.0)) / pf_bases,
                    "pct_utr_bases_all": float(_rmetrics.get("utr_bases", 0.0)) / pf_bases,
                    "pct_intronic_bases_all": float(_rmetrics.get("intronic_bases", 0.0)) / pf_bases,
                    "pct_intergenic_bases_all": float(_rmetrics.get("intergenic_bases", 0.0)) / pf_bases,
            })
            if _rmetrics.get("ribosomal_bases", "") != "":
                self.rna_metrics["pct_ribosomal_bases_all"] = float(_rmetrics.get("pf_ribosomal_bases", 0.0)) / pf_bases

bow's avatar
bow committed
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521

    def __repr__(self):
        return "{0}(\"{1}\")".format(self.__class__.__name__, self.name)


class GentrapRun(object):

    def __init__(self, summary_file):

        with open(summary_file, "r") as src:
            summary = json.load(src)

        self._raw = summary
        self.summary_file = summary_file
        self.sample_names = sorted(summary["samples"].keys())
        self.samples = \
            {s: GentrapSample(self, s, summary["samples"][s]) \
                for s in self.sample_names}

        self.files = summary["gentrap"]["files"]
        self.executables = summary["gentrap"]["executables"]
        self.settings = summary["gentrap"]["settings"]
        self.version = self.settings["version"]

    def __repr__(self):
        return "{0}(\"{1}\")".format(self.__class__.__name__,
                                        self.summary_file)


522
if __name__ == "__main__":
bow's avatar
bow committed
523
524

    parser = argparse.ArgumentParser()
525
526
527
528
    parser.add_argument("summary_file", type=str,
            help="Path to Gentrap summary file")
    parser.add_argument("template_file", type=str,
            help="Path to main template file")
bow's avatar
bow committed
529
530
    parser.add_argument("logo_file", type=str,
            help="Path to main logo file")
bow's avatar
bow committed
531
532
    args = parser.parse_args()

bow's avatar
bow committed
533
534
    run = GentrapRun(args.summary_file)
    write_template(run, args.template_file, args.logo_file)