Commit 3c2f3b45 authored by bow
Browse files

Add FastQC parser to PDF report template generator

parent 06e9a56a
......@@ -34,6 +34,258 @@ from jinja2 import Environment, FileSystemLoader
# Set the locale for all categories from the user's environment (the empty
# string selects the user's default setting); needed for digit grouping.
locale.setlocale(locale.LC_ALL, "")
class FastQCModule(object):

    """Class representing a FastQC analysis module.

    A module is a run of lines from a FastQC data file: a header line
    (``>>Name<TAB>status``), an optional column line (``#col<TAB>col...``),
    zero or more tab-separated data lines and a closing end mark.
    """

    def __init__(self, raw_lines, end_mark='>>END_MODULE'):
        """Parse the given module lines.

        :param raw_lines: list of lines in the module
        :type raw_lines: list of str
        :param end_mark: mark of the end of the module
        :type end_mark: str

        """
        self.raw_lines = raw_lines
        self.end_mark = end_mark
        self._status = None
        self._name = None
        # defined up front so the ``columns`` property never hits a missing
        # attribute; ``_parse`` fills it in
        self._columns = None
        self._data = self._parse()

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                           '[%r, ...]' % self.raw_lines[0])

    def __str__(self):
        return ''.join(self.raw_lines)

    @property
    def name(self):
        """Name of the module."""
        return self._name

    @property
    def columns(self):
        """Columns in the module."""
        return self._columns

    @property
    def data(self):
        """FastQC data."""
        return self._data

    @property
    def status(self):
        """FastQC run status."""
        return self._status

    def _parse(self):
        """Common parser for a FastQC module.

        Sets the module name, status and column names as a side effect.

        :returns: the data rows as a list of token lists, or a dict mapping
                  first column to second column for 'Basic Statistics'
        :rtype: list of list of str, or dict
        :raises AssertionError: if the last line is not the end mark or the
                                status is not one of pass / fail / warn

        """
        # check that the last line is a proper end mark
        assert self.raw_lines[-1].startswith(self.end_mark)

        # parse name and status from first line
        tokens = self.raw_lines[0].strip().split('\t')
        # drop the two leading '>>' characters of the header token
        self._name = tokens[0][2:]
        status = tokens[-1]
        assert status in ('pass', 'fail', 'warn'), \
            "Unknown module status: %r" % status
        self._status = status

        # and column names from second line; a module without any data has
        # only the header and the end mark, so there is no column line and
        # the end mark must not be mistaken for one
        if len(self.raw_lines) > 2:
            # drop the leading '#' of the column line
            self._columns = self.raw_lines[1][1:].strip().split('\t')
        else:
            self._columns = []

        # the rest of the lines except the last one
        data = [line.strip().split('\t') for line in self.raw_lines[2:-1]]

        # optional processing for different modules
        if self._name == 'Basic Statistics':
            # two-column rows become a measure -> value mapping
            data = dict(data)

        return data
class FastQC(object):

    """Class representing results from a FastQC run."""

    # module name -- attribute name mapping
    _mod_map = {
        '>>Basic Statistics': 'basic_statistics',
        '>>Per base sequence quality': 'per_base_sequence_quality',
        '>>Per sequence quality scores': 'per_sequence_quality_scores',
        '>>Per base sequence content': 'per_base_sequence_content',
        '>>Per base GC content': 'per_base_gc_content',
        '>>Per sequence GC content': 'per_sequence_gc_content',
        '>>Per base N content': 'per_base_n_content',
        '>>Sequence Length Distribution': 'sequence_length_distribution',
        '>>Sequence Duplication Levels': 'sequence_duplication_levels',
        '>>Overrepresented sequences': 'overrepresented_sequences',
        '>>Kmer content': 'kmer_content',
    }

    def __init__(self, fname):
        """Read and parse a FastQC data file.

        :param fname: path to the FastQC data file
        :type fname: str

        """
        # get file name
        self.fname = fname
        self._modules = {}
        # stays None when the file has no '##FastQC' line
        self.version = None

        with open(fname, "r") as fp:
            # readline() is used throughout (instead of iterating over the
            # handle) because _read_module consumes lines from the same handle
            line = fp.readline()
            # an empty string from readline() means EOF
            while line:
                tokens = line.strip().split('\t')
                # parse version
                if line.startswith('##FastQC'):
                    self.version = line.strip().split()[1]
                # parse individual modules
                elif tokens[0] in self._mod_map:
                    attr = self._mod_map[tokens[0]]
                    raw_lines = self._read_module(fp, line, tokens[0])
                    self._modules[attr] = FastQCModule(raw_lines)

                line = fp.readline()

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.fname)

    def _filter_by_status(self, status):
        """Filter out modules whose status is different from the given status.

        :param status: module status
        :type status: str
        :returns: a list of FastQC module names with the given status
        :rtype: list of str

        """
        return [x.name for x in self._modules.values() if x.status == status]

    def _read_module(self, fp, line, start_mark):
        """Returns a list of lines in a module.

        :param fp: open file handle pointing to the FastQC data file
        :type fp: file handle
        :param line: first line in the module
        :type line: str
        :param start_mark: string denoting start of the module
        :type start_mark: str
        :returns: a list of lines in the module
        :rtype: list of str
        :raises ValueError: if the file ends before the module end mark

        """
        raw = [line]
        while not line.startswith('>>END_MODULE'):
            line = fp.readline()
            if not line:
                # report the module being read; ``line`` is empty here
                raise ValueError("Unexpected end of file in module %r"
                                 % start_mark)
            raw.append(line)

        return raw

    @property
    def modules(self):
        """All modules in the FastQC results."""
        return self._modules

    @property
    def passes(self):
        """All module names that pass QC."""
        return self._filter_by_status('pass')

    @property
    def passes_num(self):
        """How many modules have pass status."""
        return len(self.passes)

    @property
    def warns(self):
        """All module names with warning status."""
        return self._filter_by_status('warn')

    @property
    def warns_num(self):
        """How many modules have warn status."""
        return len(self.warns)

    @property
    def fails(self):
        """All names of failed modules."""
        return self._filter_by_status('fail')

    @property
    def fails_num(self):
        """How many modules failed."""
        return len(self.fails)

    # The accessors below raise KeyError when the module was not present in
    # the parsed file.

    @property
    def basic_statistics(self):
        """Basic statistics module results."""
        return self._modules['basic_statistics']

    @property
    def per_base_sequence_quality(self):
        """Per base sequence quality module results."""
        return self._modules['per_base_sequence_quality']

    @property
    def per_sequence_quality_scores(self):
        """Per sequence quality scores module results."""
        return self._modules['per_sequence_quality_scores']

    @property
    def per_base_sequence_content(self):
        """Per base sequence content module results."""
        return self._modules['per_base_sequence_content']

    @property
    def per_base_gc_content(self):
        """Per base GC content module results."""
        return self._modules['per_base_gc_content']

    @property
    def per_sequence_gc_content(self):
        """Per sequence GC content module results."""
        return self._modules['per_sequence_gc_content']

    @property
    def per_base_n_content(self):
        """Per base N content module results."""
        return self._modules['per_base_n_content']

    @property
    def sequence_length_distribution(self):
        """Per sequence length distribution module results."""
        return self._modules['sequence_length_distribution']

    @property
    def sequence_duplication_levels(self):
        """Sequence duplication module results."""
        return self._modules['sequence_duplication_levels']

    @property
    def overrepresented_sequences(self):
        """Overrepresented sequences module results."""
        return self._modules['overrepresented_sequences']

    @property
    def kmer_content(self):
        """Kmer content module results."""
        return self._modules['kmer_content']
# HACK: remove this and use jinja2 only for templating
class LongTable(object):
......@@ -136,6 +388,22 @@ class GentrapLib(object):
self.clipping = not self.flexiprep["settings"]["skip_clip"]
self.trimming = not self.flexiprep["settings"]["skip_trim"]
self.is_paired_end = self.flexiprep["settings"]["paired"]
if "fastqc_R1" in self.flexiprep["files"]:
self.fastqc_r1 = FastQC(self.flexiprep["files"]["fastqc_R1"]["fastqc_data"]["path"])
self.fastqc_r1 = None
if "fastqc_R2" in self.flexiprep["files"]:
self.fastqc_r2 = FastQC(self.flexiprep["files"]["fastqc_R2"]["fastqc_data"]["path"])
self.fastqc_r2 = None
if "fastqc_R1_qc" in self.flexiprep["files"]:
self.fastqc_r1_qc = FastQC(self.flexiprep["files"]["fastqc_R1_qc"]["fastqc_data"]["path"])
self.fastqc_r1_qc = None
if "fastqc_R2_qc" in self.flexiprep["files"]:
self.fastqc_r2_qc = FastQC(self.flexiprep["files"]["fastqc_R2_qc"]["fastqc_data"]["path"])
self.fastqc_r2_qc = None
def __repr__(self):
return "{0}(sample=\"{1}\", lib=\"{2}\")".format(
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment