From 9519ee0433c1d5674053e922117058f7ee17de93 Mon Sep 17 00:00:00 2001 From: Peter van 't Hof <p.j.van_t_hof@lumc.nl> Date: Mon, 26 May 2014 17:20:23 +0200 Subject: [PATCH] Added python scripts to jar file --- .../flexiprep/scripts/fastqc_contam.py | 81 ++++ .../flexiprep/scripts/pyfastqc/__init__.py | 232 ++++++++++ .../flexiprep/scripts/qual_type_sickle.py | 37 ++ .../pipelines/flexiprep/scripts/seq_stat.py | 135 ++++++ .../flexiprep/scripts/summarize_flexiprep.py | 428 ++++++++++++++++++ .../scripts/sync_paired_end_reads.py | 110 +++++ 6 files changed, 1023 insertions(+) create mode 100755 flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/fastqc_contam.py create mode 100644 flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/pyfastqc/__init__.py create mode 100755 flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/qual_type_sickle.py create mode 100755 flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/seq_stat.py create mode 100755 flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/summarize_flexiprep.py create mode 100644 flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/sync_paired_end_reads.py diff --git a/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/fastqc_contam.py b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/fastqc_contam.py new file mode 100755 index 000000000..58d89f03b --- /dev/null +++ b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/fastqc_contam.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python + +import argparse +import os + +from pyfastqc import load_from_dir + + +def parse_contam_file(contam_file, delimiter='\t'): + """Given a contaminant file, return a dictionary of contaminant sequences + names and their sequences. + + Args: + contam_file -- path to contaminants file + delimiter -- ID, sequence delimiter in the contaminants file + """ + assert os.path.exists(contam_file), "Contaminant file %r does not exist" % \ + contam_file + with open(contam_file, 'r') as source: + # read only lines not beginning with '#' and discard empty lines + lines = filter(None, (line.strip() for line in source if not + line.startswith('#'))) + # parse contam seq lines into lists of [id, sequence] + parse = lambda line: filter(None, line.split(delimiter)) + parsed = (parse(line) for line in lines) + # and create a dictionary, key=sequence id and value=sequence + contam_ref = {name: seq for name, seq in parsed} + + return contam_ref + + +def get_contams_present(results_dir, contam_ref): + """Given a path to a FastQC HTML results file, return the <div> tag of the + overrepresented sequences list. + + Args: + results_dir -- Path to FastQC results directory. 
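+        contam_ref -- dictionary of contaminant names and their sequences, as returned by parse_contam_file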
+ """ + assert os.path.exists(results_dir), "Directory {0} not " \ + "found.".format(results_dir) + + fastqc = load_from_dir(results_dir) + contam_names = set([x[3] for x in fastqc.overrepresented_sequences.data]) + in_sample = lambda rid: any([cid.startswith(rid) for cid in contam_names]) + contams_present = {x: y for x, y in contam_ref.items() if in_sample(x)} + + return contams_present + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('results_dir', type=str, + help='Path to FastQC result directory file') + parser.add_argument('-c', '--contam_file', type=str, + dest='contam_file', + help='Path to contaminant file') + parser.add_argument('-o', '--output', type=str, + dest='output', + help='Path to output file') + parser.add_argument('--seq-only',dest='seq_only', + action='store_true', + help='Whether to output contaminant sequences only or not') + + args = parser.parse_args() + + contam_ref = parse_contam_file(args.contam_file) + contam_ids = get_contams_present(args.results_dir, contam_ref) + + if args.seq_only: + fmt_out = lambda cid, seq: seq + else: + fmt_out = lambda cid, seq: "{0}\t{1}".format(cid, seq) + + if args.output is None: + for cid, seq in contam_ids.items(): + print fmt_out(cid, seq) + else: + with open(args.output, 'w') as target: + for cid, seq in contam_ids.items(): + target.write(fmt_out(cid, seq) + '\n') diff --git a/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/pyfastqc/__init__.py b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/pyfastqc/__init__.py new file mode 100644 index 000000000..3802b5af3 --- /dev/null +++ b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/pyfastqc/__init__.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python + +""" +-------- +pyfastqc +-------- + +Parser for FastQC results. + +Provides a method and classes for parsing a FastQC run results. + +Tested with FastQC 0.9.5 output. + +""" + +import os + + +def load_file(fname, mode='r', **kwargs): + """Given a path to a FastQC data file or an open file object pointing to it, + return a `FastQC` object. 
+ """ + if isinstance(fname, basestring): + with open(fname, mode, **kwargs) as fp: + return FastQC(fp) + else: + return FastQC(fname) + + +def load_from_dir(dirname, data_fname='fastqc_data.txt', mode='r', **kwargs): + """Given a path to a FastQC results directory, return a `FastQC` object.""" + assert os.path.exists(dirname), "Directory %r does not exist" % dirname + fqc_path = os.path.join(dirname, os.walk(dirname).next()[1][0], data_fname) + return load_file(fqc_path, mode, **kwargs) + + +class FastQCModule(object): + + """Class representing a FastQC analysis module.""" + + def __init__(self, raw_lines, end_mark='>>END_MODULE'): + self.raw_lines = raw_lines + self.end_mark = end_mark + self._status = None + self._name = None + self._data = self.parse() + + def __repr__(self): + return '%s(%s)' % (self.__class__.__name__, + '[%r, ...]' % self.raw_lines[0]) + + def __str__(self): + return ''.join(self.raw_lines) + + @property + def name(self): + """Name of the module.""" + return self._name + + @property + def columns(self): + """Columns in the module.""" + return self._columns + + @property + def data(self): + """FastQC data.""" + return self._data + + @property + def status(self): + """FastQC run status.""" + return self._status + + def parse(self): + """Common parser for a FastQC module.""" + # check that the last line is a proper end mark + assert self.raw_lines[-1].startswith(self.end_mark) + # parse name and status from first line + tokens = self.raw_lines[0].strip().split('\t') + name = tokens[0][2:] + self._name = name + status = tokens[-1] + assert status in ('pass', 'fail', 'warn'), "Unknown module status: %r" \ + % status + self._status = status + # and column names from second line + columns = self.raw_lines[1][1:].strip().split('\t') + self._columns = columns + # the rest of the lines except the last one + data = [] + for line in self.raw_lines[2:-1]: + cols = line.strip().split('\t') + data.append(cols) + + # optional processing for different modules + if self.name == 'Basic Statistics': + data = {k: v for k, v in data} + + return data + + +class FastQC(object): + + """Class representing results from a FastQC run.""" + + # module name -- attribute name mapping + _mod_map = { + '>>Basic Statistics': 'basic_statistics', + '>>Per base sequence quality': 'per_base_sequence_quality', + '>>Per sequence quality scores': 'per_sequence_quality_scores', + '>>Per base sequence content': 'per_base_sequence_content', + '>>Per base GC content': 'per_base_gc_content', + '>>Per sequence GC content': 'per_sequence_gc_content', + '>>Per base N content': 'per_base_n_content', + '>>Sequence Length Distribution': 'sequence_length_distribution', + '>>Sequence Duplication Levels': 'sequence_duplication_levels', + '>>Overrepresented sequences': 'overrepresented_sequences', + '>>Kmer content': 'kmer_content', + } + + def __init__(self, fp): + # get file name + self.fname = fp.name + self._modules = {} + + line = fp.readline() + while True: + + tokens = line.strip().split('\t') + # break on EOF + if not line: + break + # parse version + elif line.startswith('##FastQC'): + self.version = line.strip().split()[1] + # parse individual modules + elif tokens[0] in self._mod_map: + attr = self._mod_map[tokens[0]] + raw_lines = self.read_module(fp, line, tokens[0]) + self._modules[attr] = FastQCModule(raw_lines) + + line = fp.readline() + + def __repr__(self): + return '%s(%r)' % (self.__class__.__name__, self.fname) + + def _filter_by_status(self, status): + return [x.name for x in self._modules.values() 
if x.status == status] + + def read_module(self, fp, line, start_mark): + raw = [line] + while not line.startswith('>>END_MODULE'): + line = fp.readline() + raw.append(line) + + if not line: + raise ValueError("Unexpected end of file in module %r" % line) + + return raw + + @property + def modules(self): + return self._modules + + @property + def passes(self): + return self._filter_by_status('pass') + + @property + def passes_num(self): + return len(self.passes) + + @property + def warns(self): + return self._filter_by_status('warn') + + @property + def warns_num(self): + return len(self.warns) + + @property + def fails(self): + return self._filter_by_status('fail') + + @property + def fails_num(self): + return len(self.fails) + + @property + def basic_statistics(self): + return self._modules['basic_statistics'] + + @property + def per_base_sequence_quality(self): + return self._modules['per_base_sequence_quality'] + + @property + def per_sequence_quality_scores(self): + return self._modules['per_sequence_quality_scores'] + + @property + def per_base_sequence_content(self): + return self._modules['per_base_sequence_content'] + + @property + def per_base_gc_content(self): + return self._modules['per_base_gc_content'] + + @property + def per_sequence_gc_content(self): + return self._modules['per_sequence_gc_content'] + + @property + def per_base_n_content(self): + return self._modules['per_base_n_content'] + + @property + def sequence_length_distribution(self): + return self._modules['sequence_length_distribution'] + + @property + def sequence_duplication_levels(self): + return self._modules['sequence_duplication_levels'] + + @property + def overrepresented_sequences(self): + return self._modules['overrepresented_sequences'] + + @property + def kmer_content(self): + return self._modules['kmer_content'] diff --git a/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/qual_type_sickle.py b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/qual_type_sickle.py new file mode 100755 index 000000000..7bbd6ab54 --- /dev/null +++ b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/qual_type_sickle.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +import argparse + +from pyfastqc import load_from_dir + +## CONSTANTS ## +# fastqc-sickle encoding name map +# from FastQC source +# (uk.ac.babraham.FastQC.Sequence.QualityEncoding.PhredEncoding.java) +# and sickle source (src/sickle.h) +FQ_SICKLE_ENC_MAP = { + 'Sanger / Illumina 1.9': ('sanger', 33), + 'Illumina <1.3': ('solexa', 59), + 'Illumina 1.3': ('illumina', 64), + 'Illumina 1.5': ('illumina', 64), +} + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('results_dir', type=str, + help='Path to FastQC result directory file') + parser.add_argument('--force', type=str, + help='Force quality to use the given encoding') + + args = parser.parse_args() + + fastqc = load_from_dir(args.results_dir) + if args.force is None: + enc, offset = FQ_SICKLE_ENC_MAP.get(fastqc.basic_statistics.data['Encoding']) + else: + enc = args.force + offset = [x[1] for x in FQ_SICKLE_ENC_MAP.values() if x[0] == enc][0] + + print "{0}\t{1}".format(enc, offset) diff --git a/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/seq_stat.py b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/seq_stat.py new file mode 100755 index 000000000..2fc810bf4 --- /dev/null +++ b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/seq_stat.py @@ -0,0 +1,135 @@ 
+#!/usr/bin/env python2 + +""" +Simple sequencing file statistics. + + +Gathers various statistics of a FASTQ file. + +Requirements: + * Python == 2.7.x + * Biopython >= 1.60 + +Copyright (c) 2013 Wibowo Arindrarto <w.arindrarto@lumc.nl> +Copyright (c) 2013 LUMC Sequencing Analysis Support Core <sasc@lumc.nl> +MIT License <http://opensource.org/licenses/MIT> +""" + +RELEASE = False +__version_info__ = ('0', '2', ) +__version__ = '.'.join(__version_info__) +__version__ += '-dev' if not RELEASE else '' + + +import argparse +import json +import os +import sys + +from Bio import SeqIO + + +# quality points we want to measure +QVALS = [1] + range(10, 70, 10) + + +def dict2json(d_input, f_out): + """Dump the given dictionary as a JSON file.""" + if isinstance(f_out, basestring): + target = open(f_out, 'w') + else: + target = f_out + + json.dump(d_input, target, sort_keys=True, indent=4, + separators=(',', ': ')) + + target.close() + + +def gather_stat(in_fastq, out_json, fmt): + + total_bases, total_reads = 0, 0 + + bcntd = dict.fromkeys(QVALS, 0) + rcntd = dict.fromkeys(QVALS, 0) + + len_max, len_min = 0, None + n_bases, reads_with_n = 0, 0 + # adjust format name to Biopython-compatible name + for rec in SeqIO.parse(in_fastq, 'fastq-' + fmt): + read_quals = rec.letter_annotations['phred_quality'] + read_len = len(read_quals) + + # compute quality metrics + avg_qual = sum(read_quals) / len(read_quals) + for qval in QVALS: + bcntd[qval] += len([q for q in read_quals if q >= qval]) + if avg_qual >= qval: + rcntd[qval] += 1 + + # compute length metrics + if read_len > len_max: + len_max = read_len + if len_min is None: + len_min = read_len + elif read_len < len_min: + len_min = read_len + + # compute n metrics + n_count = rec.seq.lower().count('n') + n_bases += n_count + if n_count > 0: + reads_with_n += 1 + + total_bases += read_len + total_reads += 1 + + pctd = { + 'files': { + 'fastq': { + 'path': os.path.abspath(in_fastq), + 'checksum_sha1': None, + }, + }, + 'stats': { + 'qual_encoding': fmt, + 'bases': { + 'num_qual_gte': {}, + 'num_n': n_bases, + 'num_total': total_bases, + }, + 'reads': { + 'num_mean_qual_gte': {}, + 'len_max': len_max, + 'len_min': len_min, + 'num_with_n': reads_with_n, + 'num_total': total_reads, + }, + }, + } + + for qval in QVALS: + pctd['stats']['bases']['num_qual_gte'][qval] = bcntd[qval] + pctd['stats']['reads']['num_mean_qual_gte'][qval] = rcntd[qval] + + dict2json(pctd, out_json) + + +if __name__ == '__main__': + + usage = __doc__.split('\n\n\n') + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=usage[0], epilog=usage[1]) + + parser.add_argument('input', type=str, help='Path to input FASTQ file') + parser.add_argument('-o', '--output', type=str, default=sys.stdout, + help='Path to output JSON file') + parser.add_argument('--fmt', type=str, choices=['sanger', 'illumina', + 'solexa'], default='sanger', help='FASTQ quality encoding') + parser.add_argument('--version', action='version', version='%(prog)s ' + + __version__) + + args = parser.parse_args() + + gather_stat(args.input, args.output, args.fmt) diff --git a/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/summarize_flexiprep.py b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/summarize_flexiprep.py new file mode 100755 index 000000000..ce0723be8 --- /dev/null +++ b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/summarize_flexiprep.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python + +""" +Summarize results 
from a Flexiprep run into a gzipped JSON file. + + +Requirements: + * Python == 2.7.x + +Copyright (c) 2013 Wibowo Arindrarto <w.arindrarto@lumc.nl> +MIT License <http://opensource.org/licenses/MIT> +""" + +RELEASE = False +__version_info__ = ('0', '2', ) +__version__ = '.'.join(__version_info__) +__version__ += '-dev' if not RELEASE else '' + + +import argparse +import contextlib +import json +import os +import re +from datetime import datetime + + +def fastqc2dict(run_dir, sample, name, data_fname='fastqc_data.txt', + image_dname='Images'): + """Summarize FastQC results into a single dictionary object.""" + dirname = os.path.join(run_dir, sample) + fqc_core_dir = os.path.join(dirname, os.walk(dirname).next()[1][0]) + fqc_dataf, fqc_imaged = None, None + for p1, p2, p3 in os.walk(fqc_core_dir): + if image_dname in p2: + fqc_imaged = os.path.join(p1, image_dname) + if data_fname in p3: + fqc_dataf = os.path.join(p1, data_fname) + if fqc_dataf is not None and fqc_imaged is not None: + break + if fqc_imaged is None or fqc_dataf is None: + raise ValueError("Could not find data directory " + "and/or image directory in FastQC result.") + + d = { + 'files': { + 'txt_fastqc_' + name: { + 'checksum_sha1': None, + 'path': fqc_dataf + }, + }, + 'stats': {}, + } + + for img in [os.path.join(fqc_imaged, f) for f in os.listdir(fqc_imaged)]: + key = os.path.splitext(os.path.basename(img))[0] + d['files']['plot_' + key + '_' + name] = { + 'path': os.path.abspath(img), + 'checksum_sha1': None, + } + + return d + + +def clip2dict(sample, samplea, sampleb, lib_type, run_dir): + + re_aff = re.compile(r'\s*Trimmed reads:\s*(\d+)') + re_short = re.compile(r'\s*Too short reads:\s*(\d+)') + re_long = re.compile(r'\s*Too long reads:\s*(\d+)') + re_disc_1 = re.compile(r'Filtered (\d+) .+ first') + re_disc_2 = re.compile(r'Filtered (\d+) reads from second') + re_kept = re.compile(r'Synced read .+ (\d+) reads.') + + @contextlib.contextmanager + def open_result(*toks): + with open(os.path.join(run_dir, *toks)) as src: + yield src + + def get_re_adp(seq): + return re.compile("Adapter.+'{0}'.+trimmed (\d+) times.".format(seq)) + + def get_adapters(read_pair): + adapters = {} + with open_result(read_pair + '.contams.txt') as src: + for line in src: + name, seq = line.strip().split('\t') + # None is placeholder for trimming count + adapters[name] = [seq, None] + + return adapters + + def get_cutadapt_stats(read_pair, adapter_dict): + stats = {} + stat_file = read_pair + '.clipstat' + + try: + with open_result(stat_file) as src: + cutadapt_txt = src.read() + except IOError: + # file does not exist, no adapter contamination found + stats['num_reads_discarded'] = 0 + stats['num_reads_affected'] = 0 + return stats + + short_count = int(re.search(re_short, cutadapt_txt).group(1)) + long_count = int(re.search(re_long, cutadapt_txt).group(1)) + stats['num_reads_discarded'] = short_count + long_count + stats['num_reads_affected'] = int(re.search(re_aff, + cutadapt_txt).group(1)) + for key in adapter_dict: + re_adp = get_re_adp(adapter_dict[key][0]) + adapter_dict[key][1] = int(re.search(re_adp, + cutadapt_txt).group(1)) + + return stats + + def get_sync_stats(sample): + stats = {} + with open_result(sample + '.clipsync.stats') as src: + sync_txt = src.read() + + stats['num_reads_discarded_1'] = int(re.search(re_disc_1, sync_txt).group(1)) + stats['num_reads_discarded_2'] = int(re.search(re_disc_2, sync_txt).group(1)) + stats['num_reads_kept'] = int(re.search(re_kept, sync_txt).group(1)) + stats['num_reads_discarded_total'] = 
stats['num_reads_discarded_1'] + \ + stats['num_reads_discarded_2'] + + return stats + + clip_stats = {'fastq_1': {}} + clip_stats['fastq_1']['adapters'] = get_adapters(samplea) + clip_stats['fastq_1'].update( + get_cutadapt_stats(samplea, + clip_stats['fastq_1']['adapters'])) + + if lib_type == 'paired': + clip_stats['fastq_2'] = {} + clip_stats['fastq_2']['adapters'] = get_adapters(sampleb) + clip_stats['fastq_2'].update( + get_cutadapt_stats(sampleb, + clip_stats['fastq_2']['adapters'])) + clip_stats['sync'] = get_sync_stats(sample) + else: + clip_stats['sync'] = {} + + return clip_stats + + +def sickle2dict(run_name, lib_type, run_dir): + + trim_stats = {} + + if lib_type == 'paired': + re_paired_kept = re.compile(r'paired records kept: \d+ \((\d+) pairs\)') + re_disc = re.compile(r'single records discarded: \d+ \(from PE1: (\d+), from PE2: (\d+)\)') + re_disc_paired = re.compile(r'paired records discarded: \d+ \((\d+) pairs\)') + with open(os.path.join(run_dir, run_name + '.filtersync.stats')) as src: + sickle_txt = src.read() + + discarda = int(re.search(re_disc, sickle_txt).group(1)) + discardb = int(re.search(re_disc, sickle_txt).group(2)) + discard_both = int(re.search(re_disc_paired, sickle_txt).group(1)) + + trim_stats['num_reads_kept'] = int(re.search(re_paired_kept, sickle_txt).group(1)) + trim_stats['num_reads_discarded_1'] = discarda + trim_stats['num_reads_discarded_2'] = discardb + trim_stats['num_reads_discarded_both'] = discard_both + trim_stats['num_reads_discarded_total'] = discarda + discardb + discard_both + + else: + re_kept = re.compile(r'records kept: (\d+)') + re_disc = re.compile(r'records discarded: (\d+)') + with open(os.path.join(run_dir, run_name + '.filtersync.stats')) as src: + sickle_txt = src.read() + + trim_stats['num_reads_kept'] = int(re.search(re_kept, sickle_txt).group(1)) + trim_stats['num_reads_discarded_total'] = int(re.search(re_disc, sickle_txt).group(1)) + + return trim_stats + + +def dict2json(d_input, f_out): + """Dump the given dictionary as a JSON file.""" + with open(f_out, 'w') as target: + json.dump(d_input, target, sort_keys=True, indent=4, + separators=(',', ': ')) + + +def summarize_flexiprep(run_name, qc_mode, samplea, sampleb, outf, run_dir): + + def load_json(fname): + with open(os.path.join(run_dir, fname), 'r') as target: + return json.load(target) + + def load_chksum(fname): + with open(os.path.join(run_dir, fname), 'r') as src: + return src.readline().strip().split() + + def gc_from_fastqc(fname): + # other quick statistics + with open(fname, 'r') as src: + return int([x for x in src if + x.startswith('%GC')].pop().split('\t')[1]) + + + sumd = { + '_meta': { + 'module': 'flexiprep', + 'run_name': run_name, + 'run_time': datetime.utcnow().isoformat(), + }, + 'stats': { + 'qc_mode': qc_mode, + }, + 'dirs': { + 'run': run_dir, + }, + 'files': { + }, + } + + if sampleb is None: + lib_type = 'single' + else: + lib_type = 'paired' + sumd['stats']['lib_type'] = lib_type + + # gather checksum files + chksums = [c for c in os.listdir(run_dir) if c.endswith('.sha1')] + # gather fastqc directories + fastqcs = [d for d in os.listdir(run_dir) if d.endswith('.fastqc')] + # gather seqstat files + sstats = [s for s in os.listdir(run_dir) if s.endswith('.seqstats.json')] + + if lib_type == 'single': + if qc_mode == 'none': + assert len(chksums) == len(fastqcs) == len(sstats) == 1 + chksum_r1 = load_chksum(chksums[0]) + sumd['files']['fastq_raw_1'] = { + 'checksum_sha1': chksum_r1[0], + 'path': os.path.join(args.run_dir, + 
os.path.basename(chksum_r1[1]))} + fqd_r1 = fastqc2dict(args.run_dir, fastqcs[0], 'raw_1') + seqstat_r1 = load_json(sstats[0])['stats'] + seqstat_r1['mean_gc'] = gc_from_fastqc( + fqd_r1['files']['txt_fastqc_raw_1']['path']) + sumd['stats']['fastq_raw_1'] = seqstat_r1 + + for k in ('files', 'stats'): + sumd[k].update(fqd_r1[k]) + else: + assert len(chksums) == len(fastqcs) == len(sstats) == 2 + fqp1_name = [x for x in fastqcs if x.endswith('.qc.fastqc')][0] + + chksum_p1 = load_chksum([x for x in chksums if + x.endswith('.qc.sha1')][0]) + sumd['files']['fastq_proc_1'] = { + 'checksum_sha1': chksum_p1[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_p1[1]))} + fqd_p1 = fastqc2dict(args.run_dir, + [x for x in fastqcs if x.endswith('.qc.fastqc')][0], + 'proc_1') + seqstat_p1 = load_json([x for x in sstats if + x.endswith('.qc.seqstats.json')][0])['stats'] + seqstat_p1['mean_gc'] = gc_from_fastqc( + fqd_p1['files']['txt_fastqc_proc_1']['path']) + sumd['stats']['fastq_proc_1'] = seqstat_p1 + + chksum_r1 = load_chksum([x for x in chksums if not + x.endswith('.qc.sha1')][0]) + sumd['files']['fastq_raw_1'] = { + 'checksum_sha1': chksum_r1[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_r1[1]))} + fqd_r1 = fastqc2dict(args.run_dir, + [x for x in fastqcs if x != fqp1_name][0], + 'raw_1') + seqstat_r1 = load_json([x for x in sstats if not + x.endswith('.qc.seqstats.json')][0])['stats'] + seqstat_r1['mean_gc'] = gc_from_fastqc( + fqd_r1['files']['txt_fastqc_raw_1']['path']) + sumd['stats']['fastq_raw_1'] = seqstat_r1 + + for k in ('files', 'stats'): + sumd[k].update(fqd_p1[k]) + sumd[k].update(fqd_r1[k]) + else: + if qc_mode == 'none': + assert len(chksums) == len(fastqcs) == len(sstats) == 2 + + chksum_r1 = load_chksum([x for x in chksums if + x.startswith(samplea)][0]) + sumd['files']['fastq_raw_1'] = { + 'checksum_sha1': chksum_r1[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_r1[1]))} + fqd_r1 = fastqc2dict(args.run_dir, + [x for x in fastqcs if x.startswith(samplea)][0], + 'raw_1') + seqstat_r1 = load_json([x for x in sstats if + x.startswith(samplea)][0])['stats'] + seqstat_r1['mean_gc'] = gc_from_fastqc( + fqd_r1['files']['txt_fastqc_raw_1']['path']) + sumd['stats']['fastq_raw_1'] = seqstat_r1 + + chksum_r2 = load_chksum([x for x in chksums if + not x.startswith(samplea)][0]) + sumd['files']['fastq_raw_2'] = { + 'checksum_sha1': chksum_r2[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_r2[1]))} + fqd_r2 = fastqc2dict(args.run_dir, + [x for x in fastqcs if not x.startswith(samplea)][0], + 'raw_2') + seqstat_r2 = load_json([x for x in sstats if + not x.startswith(samplea)][0])['stats'] + seqstat_r2['mean_gc'] = gc_from_fastqc( + fqd_r2['files']['txt_fastqc_raw_2']['path']) + sumd['stats']['fastq_raw_2'] = seqstat_r2 + + for k in ('files', 'stats'): + sumd[k].update(fqd_r1[k]) + sumd[k].update(fqd_r2[k]) + else: + assert len(fastqcs) == len(sstats) == 4 + proc_chksums = [x for x in chksums if x.endswith('.qc.sha1')] + proc_fastqcs = [x for x in fastqcs if x.endswith('.qc.fastqc')] + proc_sstats = [x for x in sstats if x.endswith('.qc.seqstats.json')] + raw_chksums = [x for x in chksums if x not in proc_chksums] + raw_fastqcs = [x for x in fastqcs if x not in proc_fastqcs] + raw_sstats = [x for x in sstats if x not in proc_sstats] + + chksum_r1 = load_chksum([x for x in raw_chksums if + x.startswith(samplea)][0]) + sumd['files']['fastq_raw_1'] = { + 'checksum_sha1': chksum_r1[0], + 'path': os.path.join(args.run_dir, + 
os.path.basename(chksum_r1[1]))} + fqd_r1 = fastqc2dict(args.run_dir, + [x for x in raw_fastqcs if x.startswith(samplea)][0], + 'raw_1') + seqstat_r1 = load_json([x for x in raw_sstats if + x.startswith(samplea)][0])['stats'] + seqstat_r1['mean_gc'] = gc_from_fastqc( + fqd_r1['files']['txt_fastqc_raw_1']['path']) + sumd['stats']['fastq_raw_1'] = seqstat_r1 + + chksum_r2 = load_chksum([x for x in raw_chksums if + not x.startswith(samplea)][0]) + sumd['files']['fastq_raw_2'] = { + 'checksum_sha1': chksum_r2[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_r2[1]))} + fqd_r2 = fastqc2dict(args.run_dir, + [x for x in raw_fastqcs if not x.startswith(samplea)][0], + 'raw_2') + seqstat_r2 = load_json([x for x in raw_sstats if + not x.startswith(samplea)][0])['stats'] + seqstat_r2['mean_gc'] = gc_from_fastqc( + fqd_r2['files']['txt_fastqc_raw_2']['path']) + sumd['stats']['fastq_raw_2'] = seqstat_r2 + + chksum_p1 = load_chksum([x for x in proc_chksums if + x.startswith(samplea)][0]) + sumd['files']['fastq_proc_1'] = { + 'checksum_sha1': chksum_p1[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_p1[1]))} + fqd_p1 = fastqc2dict(args.run_dir, + [x for x in proc_fastqcs if x.startswith(samplea)][0], + 'proc_1') + seqstat_p1 = load_json([x for x in proc_sstats if + x.startswith(samplea)][0])['stats'] + seqstat_p1['mean_gc'] = gc_from_fastqc( + fqd_p1['files']['txt_fastqc_proc_1']['path']) + sumd['stats']['fastq_proc_1'] = seqstat_p1 + + chksum_p2 = load_chksum([x for x in proc_chksums if + not x.startswith(samplea)][0]) + sumd['files']['fastq_proc_2'] = { + 'checksum_sha1': chksum_p2[0], + 'path': os.path.join(args.run_dir, + os.path.basename(chksum_p2[1]))} + fqd_p2 = fastqc2dict(args.run_dir, + [x for x in proc_fastqcs if not x.startswith(samplea)][0], + 'proc_2') + seqstat_p2 = load_json([x for x in proc_sstats if + not x.startswith(samplea)][0])['stats'] + seqstat_p2['mean_gc'] = gc_from_fastqc( + fqd_p2['files']['txt_fastqc_proc_2']['path']) + sumd['stats']['fastq_proc_2'] = seqstat_p2 + + for k in ('files', 'stats'): + for fqd in (fqd_r1, fqd_r2, fqd_p1, fqd_p2): + sumd[k].update(fqd[k]) + + if 'clip' in qc_mode: + sumd['stats']['clip'] = clip2dict(run_name, samplea, sampleb, lib_type, run_dir) + + if 'trim' in qc_mode: + sumd['stats']['trim'] = sickle2dict(run_name, lib_type, run_dir) + + dict2json(sumd, outf) + + +if __name__ == '__main__': + + usage = __doc__.split('\n\n\n') + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=usage[0], epilog=usage[1]) + + parser.add_argument('run_name', type=str, help='FastQC run name') + parser.add_argument('qc_mode', type=str, choices=['clip', 'cliptrim', + 'trim', 'none'], help='Flexiprep QC mode') + parser.add_argument('samplea', type=str, help='Sample A name.') + parser.add_argument('--sampleb', type=str, help='Sample B name.') + parser.add_argument('out_file', type=str, help='Path to output file') + parser.add_argument('--run-dir', type=str, default=os.getcwd(), + help='Path to Flexiprep output directory.') + parser.add_argument('--version', action='version', version='%(prog)s ' + + __version__) + + args = parser.parse_args() + + summarize_flexiprep(args.run_name, args.qc_mode, + args.samplea, args.sampleb, args.out_file, args.run_dir) diff --git a/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/sync_paired_end_reads.py b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/sync_paired_end_reads.py new file mode 100644 index 
000000000..7d4ed6ad0 --- /dev/null +++ b/flexiprep/scripts/nl/lumc/sasc/biopet/pipelines/flexiprep/scripts/sync_paired_end_reads.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python +""" +(Re-)sync two filtered paired end FASTQ files. + +Given two filtered paired end read files and one of the original read files, +re-sync the filtered reads by filtering out anything that is only present in +one of the two files. + +Usage: + {command} <orig.fq> <reads_1.fq> <reads_2.fq> \\ + <reads_1.synced.fq> <reads_2.synced.fq> + +The synced reads are written to disk as <reads_1.synced.fq> and +<reads_2.synced.fq>. Afterwards some counts are printed. + + +Both Illumina old-style and new-style paired-end header lines are supported. + +The original read file is used to speed up processing: it contains all +possible reads from both edited reads (in all files in the same order) so it +can process all files line by line, not having to read a single file in +memory. Some ideas were taken from [1]. + +[1] https://gist.github.com/588841/ + +2011-11-03, Martijn Vermaat <m.vermaat.hg@lumc.nl> +""" + + +import sys +import re + + +def sync_paired_end_reads(original, reads_a, reads_b, synced_a, synced_b): + """ + Filter out reads from two paired end read files that are not present in + both of them. Do this in a reasonable amount of time by using a file + containing all of the reads for one of the paired ends. + + All arguments are open file handles. + + @arg original: File containing all original reads for one of the paired + ends. + @arg reads_a: First from paired end read files. + @arg reads_b: Second from paired end read files. + @arg synced_a: Filtered reads from first paired end read file. + @arg synced_b: Filtered reads from second paired end read file. + + @return: Triple (filtered_a, filtered_b, kept) containing counts + of the number of reads filtered from both input files and + the total number of reads kept in the synced results. + + @todo: Print warnings if obvious things are not right (a or b still has + lines after original is processed). + """ + # This matches 1, 2, or 3 preceded by / _ or whitespace. Its rightmost + # match in a header line is used to identify the read pair. + sep = re.compile('[\s_/][123]') + + def next_record(fh): + return [fh.readline().strip() for i in range(4)] + + def head(record): + return sep.split(record[0])[:-1] + + headers = (sep.split(x.strip())[:-1] for i, x in enumerate(original) + if not (i % 4)) + + filtered_a = filtered_b = kept = 0 + + a, b = next_record(reads_a), next_record(reads_b) + + for header in headers: + if header == head(a) and head(b) != header: + a = next_record(reads_a) + filtered_a += 1 + + if header == head(b) and head(a) != header: + b = next_record(reads_b) + filtered_b += 1 + + if header == head(a) == head(b): + print >>synced_a, '\n'.join(a) + print >>synced_b, '\n'.join(b) + a, b = next_record(reads_a), next_record(reads_b) + kept += 1 + + return filtered_a, filtered_b, kept + + +if __name__ == '__main__': + if len(sys.argv) < 6: + sys.stderr.write(__doc__.split('\n\n\n')[0].strip().format( + command=sys.argv[0]) + '\n') + sys.exit(1) + try: + original = open(sys.argv[1], 'r') + reads_a = open(sys.argv[2], 'r') + reads_b = open(sys.argv[3], 'r') + synced_a = open(sys.argv[4], 'w') + synced_b = open(sys.argv[5], 'w') + filtered_a, filtered_b, kept = \ + sync_paired_end_reads(original, reads_a, reads_b, + synced_a, synced_b) + print 'Filtered %i reads from first read file.' % filtered_a + print 'Filtered %i reads from second read file.' 
% filtered_b + print 'Synced read files contain %i reads.' % kept + except IOError as (_, message): + sys.stderr.write('Error: %s\n' % message) + sys.exit(1) -- GitLab