From 9519ee0433c1d5674053e922117058f7ee17de93 Mon Sep 17 00:00:00 2001 From: Peter van 't Hof <p.j.van_t_hof@lumc.nl> Date: Mon, 26 May 2014 17:20:23 +0200 Subject: [PATCH] Added python scripts to jar file --- .../flexiprep/scripts/fastqc_contam.py | 81 ++++ .../flexiprep/scripts/pyfastqc/__init__.py | 232 ++++++++++ .../flexiprep/scripts/qual_type_sickle.py | 37 ++ .../pipelines/flexiprep/scripts/seq_stat.py | 135 ++++++ .../flexiprep/scripts/summarize_flexiprep.py | 428 ++++++++++++++++++ .../scripts/sync_paired_end_reads.py | 110 +++++ 6 files changed, 1023 insertions(+) create mode 100755 flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/fastqc_contam.py create mode 100644 flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/pyfastqc/__init__.py create mode 100755 flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/qual_type_sickle.py create mode 100755 flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/seq_stat.py create mode 100755 flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/summarize_flexiprep.py create mode 100644 flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/sync_paired_end_reads.py diff --git a/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/fastqc_contam.py b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/fastqc_contam.py new file mode 100755 index 000000000..58d89f03b --- /dev/null +++ b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/fastqc_contam.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python + +import argparse +import os + +from pyfastqc import load_from_dir + + +def parse_contam_file(contam_file, delimiter='\t'): + """Given a contaminant file, return a dictionary of contaminant sequences + names and their sequences. + + Args: + contam_file -- path to contaminants file + delimiter -- ID, sequence delimiter in the contaminants file + """ + assert os.path.exists(contam_file), "Contaminant file %r does not exist" % \ + contam_file + with open(contam_file, 'r') as source: + # read only lines not beginning with '#' and discard empty lines + lines = filter(None, (line.strip() for line in source if not + line.startswith('#'))) + # parse contam seq lines into lists of [id, sequence] + parse = lambda line: filter(None, line.split(delimiter)) + parsed = (parse(line) for line in lines) + # and create a dictionary, key=sequence id and value=sequence + contam_ref = {name: seq for name, seq in parsed} + + return contam_ref + + +def get_contams_present(results_dir, contam_ref): + """Given a path to a FastQC HTML results file, return the <div> tag of the + overrepresented sequences list. + + Args: + results_dir -- Path to FastQC results directory. 
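+        contam_ref -- dictionary of contaminant names and their sequences, as returned by parse_contam_file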
+ """ + assert os.path.exists(results_dir), "Directory {0} not " \ + "found.".format(results_dir) + + fastqc = load_from_dir(results_dir) + contam_names = set([x[3] for x in fastqc.overrepresented_sequences.data]) + in_sample = lambda rid: any([cid.startswith(rid) for cid in contam_names]) + contams_present = {x: y for x, y in contam_ref.items() if in_sample(x)} + + return contams_present + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('results_dir', type=str, + help='Path to FastQC result directory file') + parser.add_argument('-c', '--contam_file', type=str, + dest='contam_file', + help='Path to contaminant file') + parser.add_argument('-o', '--output', type=str, + dest='output', + help='Path to output file') + parser.add_argument('--seq-only',dest='seq_only', + action='store_true', + help='Whether to output contaminant sequences only or not') + + args = parser.parse_args() + + contam_ref = parse_contam_file(args.contam_file) + contam_ids = get_contams_present(args.results_dir, contam_ref) + + if args.seq_only: + fmt_out = lambda cid, seq: seq + else: + fmt_out = lambda cid, seq: "{0}\t{1}".format(cid, seq) + + if args.output is None: + for cid, seq in contam_ids.items(): + print fmt_out(cid, seq) + else: + with open(args.output, 'w') as target: + for cid, seq in contam_ids.items(): + target.write(fmt_out(cid, seq) + '\n') diff --git a/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/pyfastqc/__init__.py b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/pyfastqc/__init__.py new file mode 100644 index 000000000..3802b5af3 --- /dev/null +++ b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/pyfastqc/__init__.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python + +""" +-------- +pyfastqc +-------- + +Parser for FastQC results. + +Provides a method and classes for parsing a FastQC run results. + +Tested with FastQC 0.9.5 output. + +""" + +import os + + +def load_file(fname, mode='r', **kwargs): + """Given a path to a FastQC data file or an open file object pointing to it, + return a `FastQC` object. 
+ """ + if isinstance(fname, basestring): + with open(fname, mode, **kwargs) as fp: + return FastQC(fp) + else: + return FastQC(fname) + + +def load_from_dir(dirname, data_fname='fastqc_data.txt', mode='r', **kwargs): + """Given a path to a FastQC results directory, return a `FastQC` object.""" + assert os.path.exists(dirname), "Directory %r does not exist" % dirname + fqc_path = os.path.join(dirname, os.walk(dirname).next()[1][0], data_fname) + return load_file(fqc_path, mode, **kwargs) + + +class FastQCModule(object): + + """Class representing a FastQC analysis module.""" + + def __init__(self, raw_lines, end_mark='>>END_MODULE'): + self.raw_lines = raw_lines + self.end_mark = end_mark + self._status = None + self._name = None + self._data = self.parse() + + def __repr__(self): + return '%s(%s)' % (self.__class__.__name__, + '[%r, ...]' % self.raw_lines[0]) + + def __str__(self): + return ''.join(self.raw_lines) + + @property + def name(self): + """Name of the module.""" + return self._name + + @property + def columns(self): + """Columns in the module.""" + return self._columns + + @property + def data(self): + """FastQC data.""" + return self._data + + @property + def status(self): + """FastQC run status.""" + return self._status + + def parse(self): + """Common parser for a FastQC module.""" + # check that the last line is a proper end mark + assert self.raw_lines[-1].startswith(self.end_mark) + # parse name and status from first line + tokens = self.raw_lines[0].strip().split('\t') + name = tokens[0][2:] + self._name = name + status = tokens[-1] + assert status in ('pass', 'fail', 'warn'), "Unknown module status: %r" \ + % status + self._status = status + # and column names from second line + columns = self.raw_lines[1][1:].strip().split('\t') + self._columns = columns + # the rest of the lines except the last one + data = [] + for line in self.raw_lines[2:-1]: + cols = line.strip().split('\t') + data.append(cols) + + # optional processing for different modules + if self.name == 'Basic Statistics': + data = {k: v for k, v in data} + + return data + + +class FastQC(object): + + """Class representing results from a FastQC run.""" + + # module name -- attribute name mapping + _mod_map = { + '>>Basic Statistics': 'basic_statistics', + '>>Per base sequence quality': 'per_base_sequence_quality', + '>>Per sequence quality scores': 'per_sequence_quality_scores', + '>>Per base sequence content': 'per_base_sequence_content', + '>>Per base GC content': 'per_base_gc_content', + '>>Per sequence GC content': 'per_sequence_gc_content', + '>>Per base N content': 'per_base_n_content', + '>>Sequence Length Distribution': 'sequence_length_distribution', + '>>Sequence Duplication Levels': 'sequence_duplication_levels', + '>>Overrepresented sequences': 'overrepresented_sequences', + '>>Kmer content': 'kmer_content', + } + + def __init__(self, fp): + # get file name + self.fname = fp.name + self._modules = {} + + line = fp.readline() + while True: + + tokens = line.strip().split('\t') + # break on EOF + if not line: + break + # parse version + elif line.startswith('##FastQC'): + self.version = line.strip().split()[1] + # parse individual modules + elif tokens[0] in self._mod_map: + attr = self._mod_map[tokens[0]] + raw_lines = self.read_module(fp, line, tokens[0]) + self._modules[attr] = FastQCModule(raw_lines) + + line = fp.readline() + + def __repr__(self): + return '%s(%r)' % (self.__class__.__name__, self.fname) + + def _filter_by_status(self, status): + return [x.name for x in self._modules.values() 
if x.status == status] + + def read_module(self, fp, line, start_mark): + raw = [line] + while not line.startswith('>>END_MODULE'): + line = fp.readline() + raw.append(line) + + if not line: + raise ValueError("Unexpected end of file in module %r" % line) + + return raw + + @property + def modules(self): + return self._modules + + @property + def passes(self): + return self._filter_by_status('pass') + + @property + def passes_num(self): + return len(self.passes) + + @property + def warns(self): + return self._filter_by_status('warn') + + @property + def warns_num(self): + return len(self.warns) + + @property + def fails(self): + return self._filter_by_status('fail') + + @property + def fails_num(self): + return len(self.fails) + + @property + def basic_statistics(self): + return self._modules['basic_statistics'] + + @property + def per_base_sequence_quality(self): + return self._modules['per_base_sequence_quality'] + + @property + def per_sequence_quality_scores(self): + return self._modules['per_sequence_quality_scores'] + + @property + def per_base_sequence_content(self): + return self._modules['per_base_sequence_content'] + + @property + def per_base_gc_content(self): + return self._modules['per_base_gc_content'] + + @property + def per_sequence_gc_content(self): + return self._modules['per_sequence_gc_content'] + + @property + def per_base_n_content(self): + return self._modules['per_base_n_content'] + + @property + def sequence_length_distribution(self): + return self._modules['sequence_length_distribution'] + + @property + def sequence_duplication_levels(self): + return self._modules['sequence_duplication_levels'] + + @property + def overrepresented_sequences(self): + return self._modules['overrepresented_sequences'] + + @property + def kmer_content(self): + return self._modules['kmer_content'] diff --git a/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/qual_type_sickle.py b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/qual_type_sickle.py new file mode 100755 index 000000000..7bbd6ab54 --- /dev/null +++ b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/qual_type_sickle.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +import argparse + +from pyfastqc import load_from_dir + +## CONSTANTS ## +# fastqc-sickle encoding name map +# from FastQC source +# (uk.ac.babraham.FastQC.Sequence.QualityEncoding.PhredEncoding.java) +# and sickle source (src/sickle.h) +FQ_SICKLE_ENC_MAP = { + 'Sanger / Illumina 1.9': ('sanger', 33), + 'Illumina <1.3': ('solexa', 59), + 'Illumina 1.3': ('illumina', 64), + 'Illumina 1.5': ('illumina', 64), +} + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('results_dir', type=str, + help='Path to FastQC result directory file') + parser.add_argument('--force', type=str, + help='Force quality to use the given encoding') + + args = parser.parse_args() + + fastqc = load_from_dir(args.results_dir) + if args.force is None: + enc, offset = FQ_SICKLE_ENC_MAP.get(fastqc.basic_statistics.data['Encoding']) + else: + enc = args.force + offset = [x[1] for x in FQ_SICKLE_ENC_MAP.values() if x[0] == enc][0] + + print "{0}\t{1}".format(enc, offset) diff --git a/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/seq_stat.py b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/seq_stat.py new file mode 100755 index 000000000..2fc810bf4 --- /dev/null +++ b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/seq_stat.py @@ -0,0 +1,135 @@ 
+#!/usr/bin/env python2 + +""" +Simple sequencing file statistics. + + +Gathers various statistics of a FASTQ file. + +Requirements: + * Python == 2.7.x + * Biopython >= 1.60 + +Copyright (c) 2013 Wibowo Arindrarto <w.arindrarto@lumc.nl> +Copyright (c) 2013 LUMC Sequencing Analysis Support Core <sasc@lumc.nl> +MIT License <http://opensource.org/licenses/MIT> +""" + +RELEASE = False +__version_info__ = ('0', '2', ) +__version__ = '.'.join(__version_info__) +__version__ += '-dev' if not RELEASE else '' + + +import argparse +import json +import os +import sys + +from Bio import SeqIO + + +# quality points we want to measure +QVALS = [1] + range(10, 70, 10) + + +def dict2json(d_input, f_out): + """Dump the given dictionary as a JSON file.""" + if isinstance(f_out, basestring): + target = open(f_out, 'w') + else: + target = f_out + + json.dump(d_input, target, sort_keys=True, indent=4, + separators=(',', ': ')) + + target.close() + + +def gather_stat(in_fastq, out_json, fmt): + + total_bases, total_reads = 0, 0 + + bcntd = dict.fromkeys(QVALS, 0) + rcntd = dict.fromkeys(QVALS, 0) + + len_max, len_min = 0, None + n_bases, reads_with_n = 0, 0 + # adjust format name to Biopython-compatible name + for rec in SeqIO.parse(in_fastq, 'fastq-' + fmt): + read_quals = rec.letter_annotations['phred_quality'] + read_len = len(read_quals) + + # compute quality metrics + avg_qual = sum(read_quals) / len(read_quals) + for qval in QVALS: + bcntd[qval] += len([q for q in read_quals if q >= qval]) + if avg_qual >= qval: + rcntd[qval] += 1 + + # compute length metrics + if read_len > len_max: + len_max = read_len + if len_min is None: + len_min = read_len + elif read_len < len_min: + len_min = read_len + + # compute n metrics + n_count = rec.seq.lower().count('n') + n_bases += n_count + if n_count > 0: + reads_with_n += 1 + + total_bases += read_len + total_reads += 1 + + pctd = { + 'files': { + 'fastq': { + 'path': os.path.abspath(in_fastq), + 'checksum_sha1': None, + }, + }, + 'stats': { + 'qual_encoding': fmt, + 'bases': { + 'num_qual_gte': {}, + 'num_n': n_bases, + 'num_total': total_bases, + }, + 'reads': { + 'num_mean_qual_gte': {}, + 'len_max': len_max, + 'len_min': len_min, + 'num_with_n': reads_with_n, + 'num_total': total_reads, + }, + }, + } + + for qval in QVALS: + pctd['stats']['bases']['num_qual_gte'][qval] = bcntd[qval] + pctd['stats']['reads']['num_mean_qual_gte'][qval] = rcntd[qval] + + dict2json(pctd, out_json) + + +if __name__ == '__main__': + + usage = __doc__.split('\n\n\n') + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=usage[0], epilog=usage[1]) + + parser.add_argument('input', type=str, help='Path to input FASTQ file') + parser.add_argument('-o', '--output', type=str, default=sys.stdout, + help='Path to output JSON file') + parser.add_argument('--fmt', type=str, choices=['sanger', 'illumina', + 'solexa'], default='sanger', help='FASTQ quality encoding') + parser.add_argument('--version', action='version', version='%(prog)s ' + + __version__) + + args = parser.parse_args() + + gather_stat(args.input, args.output, args.fmt) diff --git a/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/summarize_flexiprep.py b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/summarize_flexiprep.py new file mode 100755 index 000000000..ce0723be8 --- /dev/null +++ b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/summarize_flexiprep.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python + +""" +Summarize results 
from a Flexiprep run into a gzipped JSON file. + + +Requirements: + * Python == 2.7.x + +Copyright (c) 2013 Wibowo Arindrarto <w.arindrarto@lumc.nl> +MIT License <http://opensource.org/licenses/MIT> +""" + +RELEASE = False +__version_info__ = ('0', '2', ) +__version__ = '.'.join(__version_info__) +__version__ += '-dev' if not RELEASE else '' + + +import argparse +import contextlib +import json +import os +import re +from datetime import datetime + + +def fastqc2dict(run_dir, sample, name, data_fname='fastqc_data.txt', + image_dname='Images'): + """Summarize FastQC results into a single dictionary object.""" + dirname = os.path.join(run_dir, sample) + fqc_core_dir = os.path.join(dirname, os.walk(dirname).next()[1][0]) + fqc_dataf, fqc_imaged = None, None + for p1, p2, p3 in os.walk(fqc_core_dir): + if image_dname in p2: + fqc_imaged = os.path.join(p1, image_dname) + if data_fname in p3: + fqc_dataf = os.path.join(p1, data_fname) + if fqc_dataf is not None and fqc_imaged is not None: + break + if fqc_imaged is None or fqc_dataf is None: + raise ValueError("Could not find data directory " + "and/or image directory in FastQC result.") + + d = { + 'files': { + 'txt_fastqc_' + name: { + 'checksum_sha1': None, + 'path': fqc_dataf + }, + }, + 'stats': {}, + } + + for img in [os.path.join(fqc_imaged, f) for f in os.listdir(fqc_imaged)]: + key = os.path.splitext(os.path.basename(img))[0] + d['files']['plot_' + key + '_' + name] = { + 'path': os.path.abspath(img), + 'checksum_sha1': None, + } + + return d + + +def clip2dict(sample, samplea, sampleb, lib_type, run_dir): + + re_aff = re.compile(r'\s*Trimmed reads:\s*(\d+)') + re_short = re.compile(r'\s*Too short reads:\s*(\d+)') + re_long = re.compile(r'\s*Too long reads:\s*(\d+)') + re_disc_1 = re.compile(r'Filtered (\d+) .+ first') + re_disc_2 = re.compile(r'Filtered (\d+) reads from second') + re_kept = re.compile(r'Synced read .+ (\d+) reads.') + + @contextlib.contextmanager + def open_result(*toks): + with open(os.path.join(run_dir, *toks)) as src: + yield src + + def get_re_adp(seq): + return re.compile("Adapter.+'{0}'.+trimmed (\d+) times.".format(seq)) + + def get_adapters(read_pair): + adapters = {} + with open_result(read_pair + '.contams.txt') as src: + for line in src: + name, seq = line.strip().split('\t') + # None is placeholder for trimming count + adapters[name] = [seq, None] + + return adapters + + def get_cutadapt_stats(read_pair, adapter_dict): + stats = {} + stat_file = read_pair + '.clipstat' + + try: + with open_result(stat_file) as src: + cutadapt_txt = src.read() + except IOError: + # file does not exist, no adapter contamination found + stats['num_reads_discarded'] = 0 + stats['num_reads_affected'] = 0 + return stats + + short_count = int(re.search(re_short, cutadapt_txt).group(1)) + long_count = int(re.search(re_long, cutadapt_txt).group(1)) + stats['num_reads_discarded'] = short_count + long_count + stats['num_reads_affected'] = int(re.search(re_aff, + cutadapt_txt).group(1)) + for key in adapter_dict: + re_adp = get_re_adp(adapter_dict[key][0]) + adapter_dict[key][1] = int(re.search(re_adp, + cutadapt_txt).group(1)) + + return stats + + def get_sync_stats(sample): + stats = {} + with open_result(sample + '.clipsync.stats') as src: + sync_txt = src.read() + + stats['num_reads_discarded_1'] = int(re.search(re_disc_1, sync_txt).group(1)) + stats['num_reads_discarded_2'] = int(re.search(re_disc_2, sync_txt).group(1)) + stats['num_reads_kept'] = int(re.search(re_kept, sync_txt).group(1)) + stats['num_reads_discarded_total'] = 
stats['num_reads_discarded_1'] + \ + stats['num_reads_discarded_2'] + + return stats + + clip_stats = {'fastq_1': {}} + clip_stats['fastq_1']['adapters'] = get_adapters(samplea) + clip_stats['fastq_1'].update( + get_cutadapt_stats(samplea, + clip_stats['fastq_1']['adapters'])) + + if lib_type == 'paired': + clip_stats['fastq_2'] = {} + clip_stats['fastq_2']['adapters'] = get_adapters(sampleb) + clip_stats['fastq_2'].update( + get_cutadapt_stats(sampleb, + clip_stats['fastq_2']['adapters'])) + clip_stats['sync'] = get_sync_stats(sample) + else: + clip_stats['sync'] = {} + + return clip_stats + + +def sickle2dict(run_name, lib_type, run_dir): + + trim_stats = {} + + if lib_type == 'paired': + re_paired_kept = re.compile(r'paired records kept: \d+ \((\d+) pairs\)') + re_disc = re.compile(r'single records discarded: \d+ \(from PE1: (\d+), from PE2: (\d+)\)') + re_disc_paired = re.compile(r'paired records discarded: \d+ \((\d+) pairs\)') + with open(os.path.join(run_dir, run_name + '.filtersync.stats')) as src: + sickle_txt = src.read() + + discarda = int(re.search(re_disc, sickle_txt).group(1)) + discardb = int(re.search(re_disc, sickle_txt).group(2)) + discard_both = int(re.search(re_disc_paired, sickle_txt).group(1)) + + trim_stats['num_reads_kept'] = int(re.search(re_paired_kept, sickle_txt).group(1)) + trim_stats['num_reads_discarded_1'] = discarda + trim_stats['num_reads_discarded_2'] = discardb + trim_stats['num_reads_discarded_both'] = discard_both + trim_stats['num_reads_discarded_total'] = discarda + discardb + discard_both + + else: + re_kept = re.compile(r'records kept: (\d+)') + re_disc = re.compile(r'records discarded: (\d+)') + with open(os.path.join(run_dir, run_name + '.filtersync.stats')) as src: + sickle_txt = src.read() + + trim_stats['num_reads_kept'] = int(re.search(re_kept, sickle_txt).group(1)) + trim_stats['num_reads_discarded_total'] = int(re.search(re_disc, sickle_txt).group(1)) + + return trim_stats + + +def dict2json(d_input, f_out): + """Dump the given dictionary as a JSON file.""" + with open(f_out, 'w') as target: + json.dump(d_input, target, sort_keys=True, indent=4, + separators=(',', ': ')) + + +def summarize_flexiprep(run_name, qc_mode, samplea, sampleb, outf, run_dir): + + def load_json(fname): + with open(os.path.join(run_dir, fname), 'r') as target: + return json.load(target) + + def load_chksum(fname): + with open(os.path.join(run_dir, fname), 'r') as src: + return src.readline().strip().split() + + def gc_from_fastqc(fname): + # other quick statistics + with open(fname, 'r') as src: + return int([x for x in src if + x.startswith('%GC')].pop().split('\t')[1]) + + + sumd = { + '_meta': { + 'module': 'flexiprep', + 'run_name': run_name, + 'run_time': datetime.utcnow().isoformat(), + }, + 'stats': { + 'qc_mode': qc_mode, + }, + 'dirs': { + 'run': run_dir, + }, + 'files': { + }, + } + + if sampleb is None: + lib_type = 'single' + else: + lib_type = 'paired' + sumd['stats']['lib_type'] = lib_type + + # gather checksum files + chksums = [c for c in os.listdir(run_dir) if c.endswith('.sha1')] + # gather fastqc directories + fastqcs = [d for d in os.listdir(run_dir) if d.endswith('.fastqc')] + # gather seqstat files + sstats = [s for s in os.listdir(run_dir) if s.endswith('.seqstats.json')] + + if lib_type == 'single': + if qc_mode == 'none': + assert len(chksums) == len(fastqcs) == len(sstats) == 1 + chksum_r1 = load_chksum(chksums[0]) + sumd['files']['fastq_raw_1'] = { + 'checksum_sha1': chksum_r1[0], + 'path': os.path.join(args.run_dir, + 
os.path.basename(chksum_r1[1]))} + fqd_r1 = fastqc2dict(args.run_dir, fastqcs[0], 'raw_1') + seqstat_r1 = load_json(sstats[0])['stats'] + seqstat_r1['mean_gc'] = gc_from_fastqc( + fqd_r1['files']['txt_fastqc_raw_1']['path']) + sumd['stats']['fastq_raw_1'] = seqstat_r1 + + for k in ('files', 'stats'): + sumd[k].update(fqd_r1[k]) + else: + assert len(chksums) == len(fastqcs) == len(sstats) == 2 + fqp1_name = [x for x in fastqcs if x.endswith('.qc.fastqc')][0] + + chksum_p1 = load_chksum([x for x in chksums if + x.endswith('.qc.sha1')][0]) + sumd['files']['fastq_proc_1'] = { + 'checksum_sha1': chksum_p1[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_p1[1]))} + fqd_p1 = fastqc2dict(args.run_dir, + [x for x in fastqcs if x.endswith('.qc.fastqc')][0], + 'proc_1') + seqstat_p1 = load_json([x for x in sstats if + x.endswith('.qc.seqstats.json')][0])['stats'] + seqstat_p1['mean_gc'] = gc_from_fastqc( + fqd_p1['files']['txt_fastqc_proc_1']['path']) + sumd['stats']['fastq_proc_1'] = seqstat_p1 + + chksum_r1 = load_chksum([x for x in chksums if not + x.endswith('.qc.sha1')][0]) + sumd['files']['fastq_raw_1'] = { + 'checksum_sha1': chksum_r1[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_r1[1]))} + fqd_r1 = fastqc2dict(args.run_dir, + [x for x in fastqcs if x != fqp1_name][0], + 'raw_1') + seqstat_r1 = load_json([x for x in sstats if not + x.endswith('.qc.seqstats.json')][0])['stats'] + seqstat_r1['mean_gc'] = gc_from_fastqc( + fqd_r1['files']['txt_fastqc_raw_1']['path']) + sumd['stats']['fastq_raw_1'] = seqstat_r1 + + for k in ('files', 'stats'): + sumd[k].update(fqd_p1[k]) + sumd[k].update(fqd_r1[k]) + else: + if qc_mode == 'none': + assert len(chksums) == len(fastqcs) == len(sstats) == 2 + + chksum_r1 = load_chksum([x for x in chksums if + x.startswith(samplea)][0]) + sumd['files']['fastq_raw_1'] = { + 'checksum_sha1': chksum_r1[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_r1[1]))} + fqd_r1 = fastqc2dict(args.run_dir, + [x for x in fastqcs if x.startswith(samplea)][0], + 'raw_1') + seqstat_r1 = load_json([x for x in sstats if + x.startswith(samplea)][0])['stats'] + seqstat_r1['mean_gc'] = gc_from_fastqc( + fqd_r1['files']['txt_fastqc_raw_1']['path']) + sumd['stats']['fastq_raw_1'] = seqstat_r1 + + chksum_r2 = load_chksum([x for x in chksums if + not x.startswith(samplea)][0]) + sumd['files']['fastq_raw_2'] = { + 'checksum_sha1': chksum_r2[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_r2[1]))} + fqd_r2 = fastqc2dict(args.run_dir, + [x for x in fastqcs if not x.startswith(samplea)][0], + 'raw_2') + seqstat_r2 = load_json([x for x in sstats if + not x.startswith(samplea)][0])['stats'] + seqstat_r2['mean_gc'] = gc_from_fastqc( + fqd_r2['files']['txt_fastqc_raw_2']['path']) + sumd['stats']['fastq_raw_2'] = seqstat_r2 + + for k in ('files', 'stats'): + sumd[k].update(fqd_r1[k]) + sumd[k].update(fqd_r2[k]) + else: + assert len(fastqcs) == len(sstats) == 4 + proc_chksums = [x for x in chksums if x.endswith('.qc.sha1')] + proc_fastqcs = [x for x in fastqcs if x.endswith('.qc.fastqc')] + proc_sstats = [x for x in sstats if x.endswith('.qc.seqstats.json')] + raw_chksums = [x for x in chksums if x not in proc_chksums] + raw_fastqcs = [x for x in fastqcs if x not in proc_fastqcs] + raw_sstats = [x for x in sstats if x not in proc_sstats] + + chksum_r1 = load_chksum([x for x in raw_chksums if + x.startswith(samplea)][0]) + sumd['files']['fastq_raw_1'] = { + 'checksum_sha1': chksum_r1[0], + 'path': os.path.join(args.run_dir, + 
os.path.basename(chksum_r1[1]))} + fqd_r1 = fastqc2dict(args.run_dir, + [x for x in raw_fastqcs if x.startswith(samplea)][0], + 'raw_1') + seqstat_r1 = load_json([x for x in raw_sstats if + x.startswith(samplea)][0])['stats'] + seqstat_r1['mean_gc'] = gc_from_fastqc( + fqd_r1['files']['txt_fastqc_raw_1']['path']) + sumd['stats']['fastq_raw_1'] = seqstat_r1 + + chksum_r2 = load_chksum([x for x in raw_chksums if + not x.startswith(samplea)][0]) + sumd['files']['fastq_raw_2'] = { + 'checksum_sha1': chksum_r2[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_r2[1]))} + fqd_r2 = fastqc2dict(args.run_dir, + [x for x in raw_fastqcs if not x.startswith(samplea)][0], + 'raw_2') + seqstat_r2 = load_json([x for x in raw_sstats if + not x.startswith(samplea)][0])['stats'] + seqstat_r2['mean_gc'] = gc_from_fastqc( + fqd_r2['files']['txt_fastqc_raw_2']['path']) + sumd['stats']['fastq_raw_2'] = seqstat_r2 + + chksum_p1 = load_chksum([x for x in proc_chksums if + x.startswith(samplea)][0]) + sumd['files']['fastq_proc_1'] = { + 'checksum_sha1': chksum_p1[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_p1[1]))} + fqd_p1 = fastqc2dict(args.run_dir, + [x for x in proc_fastqcs if x.startswith(samplea)][0], + 'proc_1') + seqstat_p1 = load_json([x for x in proc_sstats if + x.startswith(samplea)][0])['stats'] + seqstat_p1['mean_gc'] = gc_from_fastqc( + fqd_p1['files']['txt_fastqc_proc_1']['path']) + sumd['stats']['fastq_proc_1'] = seqstat_p1 + + chksum_p2 = load_chksum([x for x in proc_chksums if + not x.startswith(samplea)][0]) + sumd['files']['fastq_proc_2'] = { + 'checksum_sha1': chksum_p2[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_p2[1]))} + fqd_p2 = fastqc2dict(args.run_dir, + [x for x in proc_fastqcs if not x.startswith(samplea)][0], + 'proc_2') + seqstat_p2 = load_json([x for x in proc_sstats if + not x.startswith(samplea)][0])['stats'] + seqstat_p2['mean_gc'] = gc_from_fastqc( + fqd_p2['files']['txt_fastqc_proc_2']['path']) + sumd['stats']['fastq_proc_2'] = seqstat_p2 + + for k in ('files', 'stats'): + for fqd in (fqd_r1, fqd_r2, fqd_p1, fqd_p2): + sumd[k].update(fqd[k]) + + if 'clip' in qc_mode: + sumd['stats']['clip'] = clip2dict(run_name, samplea, sampleb, lib_type, run_dir) + + if 'trim' in qc_mode: + sumd['stats']['trim'] = sickle2dict(run_name, lib_type, run_dir) + + dict2json(sumd, outf) + + +if __name__ == '__main__': + + usage = __doc__.split('\n\n\n') + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=usage[0], epilog=usage[1]) + + parser.add_argument('run_name', type=str, help='FastQC run name') + parser.add_argument('qc_mode', type=str, choices=['clip', 'cliptrim', + 'trim', 'none'], help='Flexiprep QC mode') + parser.add_argument('samplea', type=str, help='Sample A name.') + parser.add_argument('--sampleb', type=str, help='Sample B name.') + parser.add_argument('out_file', type=str, help='Path to output file') + parser.add_argument('--run-dir', type=str, default=os.getcwd(), + help='Path to Flexiprep output directory.') + parser.add_argument('--version', action='version', version='%(prog)s ' + + __version__) + + args = parser.parse_args() + + summarize_flexiprep(args.run_name, args.qc_mode, + args.samplea, args.sampleb, args.out_file, args.run_dir) diff --git a/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/sync_paired_end_reads.py b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/sync_paired_end_reads.py new file mode 100644 index 
000000000..7d4ed6ad0 --- /dev/null +++ b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/sync_paired_end_reads.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python +""" +(Re-)sync two filtered paired end FASTQ files. + +Given two filtered paired end read files and one of the original read files, +re-sync the filtered reads by filtering out anything that is only present in +one of the two files. + +Usage: + {command} <orig.fq> <reads_1.fq> <reads_2.fq> \\ + <reads_1.synced.fq> <reads_2.synced.fq> + +The synced reads are written to disk as <reads_1.synced.fq> and +<reads_2.synced.fq>. Afterwards some counts are printed. + + +Both Illumina old-style and new-style paired-end header lines are supported. + +The original read file is used to speed up processing: it contains all +possible reads from both edited reads (in all files in the same order) so it +can process all files line by line, not having to read a single file in +memory. Some ideas were taken from [1]. + +[1] https://gist.github.com/588841/ + +2011-11-03, Martijn Vermaat <m.vermaat.hg@lumc.nl> +""" + + +import sys +import re + + +def sync_paired_end_reads(original, reads_a, reads_b, synced_a, synced_b): + """ + Filter out reads from two paired end read files that are not present in + both of them. Do this in a reasonable amount of time by using a file + containing all of the reads for one of the paired ends. + + All arguments are open file handles. + + @arg original: File containing all original reads for one of the paired + ends. + @arg reads_a: First from paired end read files. + @arg reads_b: Second from paired end read files. + @arg synced_a: Filtered reads from first paired end read file. + @arg synced_b: Filtered reads from second paired end read file. + + @return: Triple (filtered_a, filtered_b, kept) containing counts + of the number of reads filtered from both input files and + the total number of reads kept in the synced results. + + @todo: Print warnings if obvious things are not right (a or b still has + lines after original is processed). + """ + # This matches 1, 2, or 3 preceded by / _ or whitespace. Its rightmost + # match in a header line is used to identify the read pair. + sep = re.compile('[\s_/][123]') + + def next_record(fh): + return [fh.readline().strip() for i in range(4)] + + def head(record): + return sep.split(record[0])[:-1] + + headers = (sep.split(x.strip())[:-1] for i, x in enumerate(original) + if not (i % 4)) + + filtered_a = filtered_b = kept = 0 + + a, b = next_record(reads_a), next_record(reads_b) + + for header in headers: + if header == head(a) and head(b) != header: + a = next_record(reads_a) + filtered_a += 1 + + if header == head(b) and head(a) != header: + b = next_record(reads_b) + filtered_b += 1 + + if header == head(a) == head(b): + print >>synced_a, '\n'.join(a) + print >>synced_b, '\n'.join(b) + a, b = next_record(reads_a), next_record(reads_b) + kept += 1 + + return filtered_a, filtered_b, kept + + +if __name__ == '__main__': + if len(sys.argv) < 6: + sys.stderr.write(__doc__.split('\n\n\n')[0].strip().format( + command=sys.argv[0]) + '\n') + sys.exit(1) + try: + original = open(sys.argv[1], 'r') + reads_a = open(sys.argv[2], 'r') + reads_b = open(sys.argv[3], 'r') + synced_a = open(sys.argv[4], 'w') + synced_b = open(sys.argv[5], 'w') + filtered_a, filtered_b, kept = \ + sync_paired_end_reads(original, reads_a, reads_b, + synced_a, synced_b) + print 'Filtered %i reads from first read file.' % filtered_a + print 'Filtered %i reads from second read file.' 
% filtered_b + print 'Synced read files contain %i reads.' % kept + except IOError as (_, message): + sys.stderr.write('Error: %s\n' % message) + sys.exit(1) -- GitLab