#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support the LUMC SHARK cluster, which runs
# SGE, but other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that is
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; for commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#

from __future__ import print_function

import argparse
import json
import locale
import os
import re
import sys
from os import path

from jinja2 import Environment, FileSystemLoader


# set locale for digit grouping
locale.setlocale(locale.LC_ALL, "")


class FastQCModule(object):

    """Class representing a FastQC analysis module."""

    def __init__(self, raw_lines, end_mark='>>END_MODULE'):
        """

        :param raw_lines: list of lines in the module
        :type raw_lines: list of str
        :param end_mark: mark of the end of the module
        :type end_mark: str

        """
        self.raw_lines = raw_lines
        self.end_mark = end_mark
        self._status = None
        self._name = None
        self._data = self._parse()

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                '[%r, ...]' % self.raw_lines[0])

    def __str__(self):
        return ''.join(self.raw_lines)

    @property
    def name(self):
        """Name of the module."""
        return self._name

    @property
    def columns(self):
        """Columns in the module."""
        return self._columns

    @property
    def data(self):
        """FastQC data."""
        return self._data

    @property
    def status(self):
        """FastQC run status."""
        return self._status

    def _parse(self):
        """Common parser for a FastQC module."""
        # check that the last line is a proper end mark
        assert self.raw_lines[-1].startswith(self.end_mark)
        # parse name and status from first line
        tokens = self.raw_lines[0].strip().split('\t')
        name = tokens[0][2:]
        self._name = name
        status = tokens[-1]
        assert status in ('pass', 'fail', 'warn'), "Unknown module status: %r" \
            % status
        self._status = status
        # and column names from second line
        columns = self.raw_lines[1][1:].strip().split('\t')
        self._columns = columns
        # the rest of the lines except the last one
        data = []
        for line in self.raw_lines[2:-1]:
            cols = line.strip().split('\t')
            data.append(cols)

        # optional processing for different modules
        if self.name == 'Basic Statistics':
            data = {k: v for k, v in data}

        return data


class FastQC(object):

    """Class representing results from a FastQC run."""

    # module name -- attribute name mapping
    _mod_map = {
        '>>Basic Statistics': 'basic_statistics',
        '>>Per base sequence quality': 'per_base_sequence_quality',
        '>>Per sequence quality scores': 'per_sequence_quality_scores',
        '>>Per base sequence content': 'per_base_sequence_content',
        '>>Per base GC content': 'per_base_gc_content',
        '>>Per sequence GC content': 'per_sequence_gc_content',
        '>>Per base N content': 'per_base_n_content',
        '>>Sequence Length Distribution': 'sequence_length_distribution',
        '>>Sequence Duplication Levels': 'sequence_duplication_levels',
        '>>Overrepresented sequences': 'overrepresented_sequences',
        '>>Kmer content': 'kmer_content',
    }

    def __init__(self, fname):
        """

        :param fname: path to the FastQC data file (fastqc_data.txt)
        :type fname: str

        """
        # get file name
        self.fname = fname
        self._modules = {}

        with open(fname, "r") as fp:
            line = fp.readline()
            while True:
                # break on EOF
                if not line:
                    break
                tokens = line.strip().split('\t')
                # parse version
                if line.startswith('##FastQC'):
                    self.version = line.strip().split()[1]
                # parse individual modules
                elif tokens[0] in self._mod_map:
                    attr = self._mod_map[tokens[0]]
                    raw_lines = self._read_module(fp, line, tokens[0])
                    self._modules[attr] = FastQCModule(raw_lines)

                line = fp.readline()

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.fname)

    def _filter_by_status(self, status):
        """Return the names of modules whose status matches the given status.

        :param status: module status
        :type status: str
        :returns: a list of FastQC module names with the given status
        :rtype: list of str

        """
        return [x.name for x in self._modules.values() if x.status == status]

    def _read_module(self, fp, line, start_mark):
        """Returns a list of lines in a module.

        :param fp: open file handle pointing to the FastQC data file
        :type fp: file handle
        :param line: first line in the module
        :type line: str
        :param start_mark: string denoting start of the module
        :type start_mark: str
        :returns: a list of lines in the module
        :rtype: list of str

        """
        raw = [line]
        while not line.startswith('>>END_MODULE'):
            line = fp.readline()
            raw.append(line)

            if not line:
                raise ValueError("Unexpected end of file in module %r" % start_mark)

        return raw

    @property
    def modules(self):
        """All modules in the FastQC results."""
        return self._modules

    @property
    def passes(self):
        """All module names that pass QC."""
        return self._filter_by_status('pass')

    @property
    def passes_num(self):
        """How many modules have pass status."""
        return len(self.passes)

    @property
    def warns(self):
        """All module names with warning status."""
        return self._filter_by_status('warn')

    @property
    def warns_num(self):
        """How many modules have warn status."""
        return len(self.warns)

    @property
    def fails(self):
        """All names of failed modules."""
        return self._filter_by_status('fail')

    @property
    def fails_num(self):
        """How many modules failed."""
        return len(self.fails)

    @property
    def basic_statistics(self):
        """Basic statistics module results."""
        return self._modules['basic_statistics']

    @property
    def per_base_sequence_quality(self):
        """Per base sequence quality module results."""
        return self._modules['per_base_sequence_quality']

    @property
    def per_sequence_quality_scores(self):
        """Per sequence quality scores module results."""
        return self._modules['per_sequence_quality_scores']

    @property
    def per_base_sequence_content(self):
        """Per base sequence content module results."""
        return self._modules['per_base_sequence_content']

    @property
    def per_base_gc_content(self):
        """Per base GC content module results."""
        return self._modules['per_base_gc_content']

    @property
    def per_sequence_gc_content(self):
        """Per sequence GC content module results."""
        return self._modules['per_sequence_gc_content']

    @property
    def per_base_n_content(self):
        """Per base N content module results."""
        return self._modules['per_base_n_content']

    @property
    def sequence_length_distribution(self):
        """Sequence length distribution module results."""
        return self._modules['sequence_length_distribution']

    @property
    def sequence_duplication_levels(self):
        """Sequence duplication module results."""
        return self._modules['sequence_duplication_levels']

    @property
    def overrepresented_sequences(self):
        """Overrepresented sequences module results."""
        return self._modules['overrepresented_sequences']

    @property
    def kmer_content(self):
        """Kmer content module results."""
        return self._modules['kmer_content']

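# A minimal usage sketch for the FastQC parsers above (the input path is
# hypothetical; FastQC itself names this file "fastqc_data.txt"):
#
#   fqc = FastQC("fastqc_data.txt")
#   print(fqc.version, fqc.passes_num, fqc.fails)
#   per_base_quality = fqc.per_base_sequence_quality.data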

# HACK: remove this and use jinja2 only for templating
class LongTable(object):

    """Class representing a longtable in LaTeX."""

    def __init__(self, caption, label, header, aln, colnum):
        self.lines = [
            "\\begin{center}",
            "\\captionof{table}{%s}" % caption,
            "\\label{%s}" % label,
            "\\begin{longtable}{%s}" % aln,
            "\\hline",
            "%s" % header,
            "\\hline \\hline",
            "\\endhead",
            "\\hline \\multicolumn{%i}{c}{\\textit{Continued on next page}}\\\\" % \
                    colnum,
            "\\hline",
            "\\endfoot",
            "\\hline",
            "\\endlastfoot",
        ]

    def __str__(self):
        return "\n".join(self.lines)

    def add_row(self, row):
        self.lines.append(row)

    def end(self):
        self.lines.extend(["\\end{longtable}", "\\end{center}",
            "\\addtocounter{table}{-1}"])

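# A minimal LongTable usage sketch (the caption, label, header, and row values
# below are hypothetical):
#
#   tbl = LongTable("Mapping statistics", "tab:aln", "Sample & Reads \\\\", "lr", 2)
#   tbl.add_row("sample_A & 1,000 \\\\")
#   tbl.end()
#   latex_fragment = str(tbl)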

# filter functions for the jinja environment
def nice_int(num, default="None"):
    """Format a number as an integer with locale-aware digit grouping."""
    if num is None:
        return default
    try:
        return locale.format("%i", int(num), grouping=True)
    except (TypeError, ValueError):
        return default


def nice_flt(num, default="None"):
    """Format a number as a float with two decimals and locale-aware grouping."""
    if num is None:
        return default
    try:
        return locale.format("%.2f", float(num), grouping=True)
    except (TypeError, ValueError):
        return default


def float2nice_pct(num, default="None"):
    """Format a fraction as a percentage with two decimals (e.g. 0.25 -> 25.00)."""
    if num is None:
        return default
    try:
        return locale.format("%.2f", float(num) * 100.0, grouping=True)
    except (TypeError, ValueError):
        return default

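# For example, with an English-style locale the filters above render:
#   nice_int(1234567)      -> "1,234,567"
#   nice_flt("3.14159")    -> "3.14"
#   float2nice_pct(0.4567) -> "45.67"
# and unparseable input falls back to the given default ("None").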

# and some handy functions
def natural_sort(inlist):
    """Sort a list of strings in natural (human) order, in place, and return it."""
    key = lambda x: [int(a) if a.isdigit() else a.lower() for a in
            re.split("([0-9]+)", x)]
    inlist.sort(key=key)
    return inlist


def write_template(run, template_file, logo_file):
    """Render the LaTeX report template for the given run and print it to stdout."""

    template_file = path.abspath(path.realpath(template_file))
    template_dir = path.dirname(template_file)
    # spawn the jinja2 environment
    env = Environment(loader=FileSystemLoader(template_dir))

    # change delimiters since LaTeX may use "{{", "{%", or "{#"
    env.block_start_string = "((*"
    env.block_end_string = "*))"
    env.variable_start_string = "((("
    env.variable_end_string = ")))"
    env.comment_start_string = "((="
    env.comment_end_string = "=))"
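    # with these delimiters a template writes, e.g., "((( run.version )))" for a
    # variable and "((* for lib in run.libs *)) ... ((* endfor *))" for a block,
    # instead of jinja2's default "{{ ... }}" and "{% ... %}" forms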

    # trim all block-related whitespace
    env.trim_blocks = True
    env.lstrip_blocks = True

    # register our filter functions
    env.filters["nice_int"] = nice_int
    env.filters["nice_flt"] = nice_flt
    env.filters["float2nice_pct"] = float2nice_pct
    env.filters["basename"] = path.basename

    # render the tex template for pdflatex
    jinja_template = env.get_template(path.basename(template_file))
    run.logo = logo_file
    render_vars = {
        "run": run,
    }
    rendered = jinja_template.render(**render_vars)

    print(rendered, file=sys.stdout)


class GentrapLib(object):

    """Class representing a single sequencing library within a Gentrap sample."""

    def __init__(self, run, sample, name, summary):
        assert isinstance(run, GentrapRun)
        assert isinstance(sample, GentrapSample)
        self.run = run
        self.sample = sample
        self.name = name
        self._raw = summary
        # flexiprep settings
        self.flexiprep = summary.get("flexiprep", {})
        self.flexiprep_files = summary.get("flexiprep", {}).get("files", {}).get("pipeline", {})
        self.clipping = not self.flexiprep["settings"]["skip_clip"]
        self.trimming = not self.flexiprep["settings"]["skip_trim"]
        self.is_paired_end = self.flexiprep["settings"]["paired"]
        if "fastqc_R1" in self.flexiprep["files"]:
            self.fastqc_r1_files = self.flexiprep["files"]["fastqc_R1"]
            self.fastqc_r1 = FastQC(self.fastqc_r1_files["fastqc_data"]["path"])
        if "fastqc_R2" in self.flexiprep["files"]:
            self.fastqc_r2_files = self.flexiprep["files"]["fastqc_R2"]
            self.fastqc_r2 = FastQC(self.fastqc_r2_files["fastqc_data"]["path"])
        if "fastqc_R1_qc" in self.flexiprep["files"]:
            self.fastqc_r1_qc_files = self.flexiprep["files"]["fastqc_R1_qc"]
            self.fastqc_r1_qc = FastQC(self.fastqc_r1_qc_files["fastqc_data"]["path"])
        if "fastqc_R2_qc" in self.flexiprep["files"]:
            self.fastqc_r2_qc_files = self.flexiprep["files"]["fastqc_R2_qc"]
            self.fastqc_r2_qc = FastQC(self.fastqc_r2_qc_files["fastqc_data"]["path"])
        # mapping metrics settings
        self.aln_metrics = summary.get("bammetrics", {}).get("stats", {}).get("alignment_metrics", {})
        # insert size metrics files
        self.inserts_metrics_files = summary.get("bammetrics", {}).get("files", {}).get("insert_size_metrics", {})
        # rna metrics files and stats
        self.rna_metrics_files = summary.get("gentrap", {}).get("files", {}).get("rna_metrics", {})
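        # derived RNA metrics: the "pct_*_all" values are fractions of all PF
        # bases, while pct_exonic_bases is a fraction of PF aligned bases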
        _rmetrics = summary.get("gentrap", {}).get("stats", {}).get("rna_metrics", {})
        if _rmetrics:
            self.rna_metrics = {k: v for k, v in _rmetrics.items() }
            pf_bases = float(_rmetrics["pf_bases"])
            exonic_bases = int(_rmetrics.get("coding_bases", 0)) + int(_rmetrics.get("utr_bases", 0))
            # picard uses pct_ but it's actually ratio ~ we follow their convention
            pct_exonic_bases_all = exonic_bases / float(_rmetrics["pf_bases"])
            pct_exonic_bases = exonic_bases / float(_rmetrics.get("pf_aligned_bases", 0))
            self.rna_metrics.update({
                    "exonic_bases": exonic_bases,
                    "pct_exonic_bases_all": pct_exonic_bases_all,
                    "pct_exonic_bases": pct_exonic_bases,
                    "pct_aligned_bases": 1.0,
                    "pct_aligned_bases_all": float(_rmetrics.get("pf_aligned_bases", 0.0)) / pf_bases,
                    "pct_coding_bases_all": float(_rmetrics.get("coding_bases", 0.0)) / pf_bases,
                    "pct_utr_bases_all": float(_rmetrics.get("utr_bases", 0.0)) / pf_bases,
                    "pct_intronic_bases_all": float(_rmetrics.get("intronic_bases", 0.0)) / pf_bases,
                    "pct_intergenic_bases_all": float(_rmetrics.get("intergenic_bases", 0.0)) / pf_bases,
            })
            if _rmetrics.get("ribosomal_bases", "") != "":
                self.rna_metrics["pct_ribosomal_bases_all"] = float(_rmetrics.get("pf_ribosomal_bases", 0.0)) / pf_bases

    def __repr__(self):
        return "{0}(sample=\"{1}\", lib=\"{2}\")".format(
                self.__class__.__name__, self.sample.name, self.name)


class GentrapSample(object):

    """Class representing a single sample in a Gentrap run."""

    def __init__(self, run, name, summary):
        assert isinstance(run, GentrapRun)
        self.run = run
        self.name = name
        self._raw = summary
        self.is_paired_end = summary.get("gentrap", {}).get("stats", {}).get("pipeline", {})["all_paired"]
        # mapping metrics settings
        self.aln_metrics = summary.get("bammetrics", {}).get("stats", {}).get("alignment_metrics", {})
        # insert size metrics files
        self.inserts_metrics_files = summary.get("bammetrics", {}).get("files", {}).get("insert_size_metrics", {})
        # rna metrics files and stats
        self.rna_metrics_files = summary.get("gentrap", {}).get("files", {}).get("rna_metrics", {})
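        # derived RNA metrics: the "pct_*_all" values are fractions of all PF
        # bases, while pct_exonic_bases is a fraction of PF aligned bases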
        _rmetrics = summary.get("gentrap", {}).get("stats", {}).get("rna_metrics", {})
        if _rmetrics:
            self.rna_metrics = {k: v for k, v in _rmetrics.items() }
            pf_bases = float(_rmetrics["pf_bases"])
            exonic_bases = int(_rmetrics.get("coding_bases", 0)) + int(_rmetrics.get("utr_bases", 0))
            # picard uses pct_ but it's actually ratio ~ we follow their convention
            pct_exonic_bases_all = exonic_bases / float(_rmetrics["pf_bases"])
            pct_exonic_bases = exonic_bases / float(_rmetrics.get("pf_aligned_bases", 0))
            self.rna_metrics.update({
                    "exonic_bases": exonic_bases,
                    "pct_exonic_bases_all": pct_exonic_bases_all,
                    "pct_exonic_bases": pct_exonic_bases,
                    "pct_aligned_bases": 1.0,
                    "pct_aligned_bases_all": float(_rmetrics.get("pf_aligned_bases", 0.0)) / pf_bases,
                    "pct_coding_bases_all": float(_rmetrics.get("coding_bases", 0.0)) / pf_bases,
                    "pct_utr_bases_all": float(_rmetrics.get("utr_bases", 0.0)) / pf_bases,
                    "pct_intronic_bases_all": float(_rmetrics.get("intronic_bases", 0.0)) / pf_bases,
                    "pct_intergenic_bases_all": float(_rmetrics.get("intergenic_bases", 0.0)) / pf_bases,
            })
            # placeholder: no extra metrics are derived for strand-specific protocols yet
            if self.run.settings["strand_protocol"] != "non_specific":
                self.rna_metrics.update({
                })
            if _rmetrics.get("ribosomal_bases", "") != "":
                self.rna_metrics["pct_ribosomal_bases_all"] = float(_rmetrics.get("pf_ribosomal_bases", 0.0)) / pf_bases

        self.lib_names = sorted(summary["libraries"].keys())
        self.libs = \
            {l: GentrapLib(self.run, self, l, summary["libraries"][l]) \
                for l in self.lib_names}

    def __repr__(self):
        return "{0}(\"{1}\")".format(self.__class__.__name__, self.name)


class GentrapRun(object):

    """Class representing a Gentrap run, parsed from its summary JSON file."""

    def __init__(self, summary_file):

        with open(summary_file, "r") as src:
            summary = json.load(src)

        self._raw = summary
        self.summary_file = summary_file

        self.files = summary["gentrap"].get("files", {}).get("pipeline", {})
        self.settings = summary["gentrap"]["settings"]
        self.version = self.settings.get("version", "unknown")
        # list containing all exes
        self.all_executables = summary["gentrap"]["executables"]
        # list containing exes we want to display
        executables = [
            ("cutadapt", "adapter clipping"),
            ("sickle", "base quality trimming"),
            ("fastqc", "sequence metrics collection"),
            ("gsnap", "alignment"),
            ("tophat", "alignment"),
            ("star", "alignment"),
            ("htseqcount", "fragment counting"),
        ]
        self.executables = {}
        for k, desc in executables:
            in_summary = self.all_executables.get(k)
            if in_summary is not None:
                self.executables[k] = in_summary
                self.executables[k]["desc"] = desc
        # picard's version is taken from one of its tools that we invoke
        if self.all_executables.get("collectalignmentsummarymetrics") is not None:
            self.executables["picard"] = self.all_executables["collectalignmentsummarymetrics"]
            self.executables["picard"]["desc"] = "alignment metrics collection"
            # None means we are using the Picard built into GATK Queue
            if self.executables["picard"].get("version") is None:
                self.executables["picard"]["version"] = "built-in"
        # samtools' version is taken from the subcommand we invoke
        if self.all_executables.get("samtoolsview") is not None:
            self.executables["samtools"] = self.all_executables["samtoolsview"]
            self.executables["samtools"]["desc"] = "various post-alignment processing"

        self.sample_names = sorted(summary["samples"].keys())
        self.samples = \
            {s: GentrapSample(self, s, summary["samples"][s]) \
                for s in self.sample_names}
        self.libs = []
        for sample in self.samples.values():
            self.libs.extend(sample.libs.values())
        if all([s.is_paired_end for s in self.samples.values()]):
            self.lib_type = "all paired end"
        elif all([not s.is_paired_end for s in self.samples.values()]):
            self.lib_type = "all single end"
        else:
            self.lib_type = "mixed (single end and paired end)"

    def __repr__(self):
        return "{0}(\"{1}\")".format(self.__class__.__name__,
                                        self.summary_file)


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("summary_file", type=str,
            help="Path to Gentrap summary file")
    parser.add_argument("template_file", type=str,
            help="Path to main template file")
    parser.add_argument("logo_file", type=str,
            help="Path to main logo file")
    args = parser.parse_args()

    run = GentrapRun(args.summary_file)
    write_template(run, args.template_file, args.logo_file)
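
# A typical invocation (the file names below are hypothetical) renders the LaTeX
# source to stdout, which can then be redirected to a file and compiled, e.g.:
#
#   python pdf_report.py gentrap_summary.json report_main.tex logo.pdf > report.tex
#   pdflatex report.tex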