Commit 9519ee04 authored by Peter van 't Hof

Added python scripts to jar file

parent 2fb73c96
#!/usr/bin/env python
import argparse
import os
from pyfastqc import load_from_dir
def parse_contam_file(contam_file, delimiter='\t'):
"""Given a contaminant file, return a dictionary of contaminant sequences
names and their sequences.
Args:
contam_file -- path to contaminants file
delimiter -- ID, sequence delimiter in the contaminants file
"""
assert os.path.exists(contam_file), "Contaminant file %r does not exist" % \
contam_file
with open(contam_file, 'r') as source:
# read only lines not beginning with '#' and discard empty lines
lines = filter(None, (line.strip() for line in source if not
line.startswith('#')))
# parse contam seq lines into lists of [id, sequence]
parse = lambda line: filter(None, line.split(delimiter))
parsed = (parse(line) for line in lines)
# and create a dictionary, key=sequence id and value=sequence
contam_ref = {name: seq for name, seq in parsed}
return contam_ref
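# Illustrative sketch (not part of the original script): the contaminant file
# is expected to contain one "name<TAB>sequence" pair per line, with optional
# '#' comment lines, e.g. (hypothetical entries):
#
#   # contaminant list
#   adapter_1<TAB>ACGTACGTACGTACGT
#   adapter_2<TAB>TTGGCCAATTGGCCAA
#
# parse_contam_file('contaminants.txt') would then return
# {'adapter_1': 'ACGTACGTACGTACGT', 'adapter_2': 'TTGGCCAATTGGCCAA'}.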
def get_contams_present(results_dir, contam_ref):
"""Given a path to a FastQC HTML results file, return the <div> tag of the
overrepresented sequences list.
Args:
results_dir -- Path to FastQC results directory.
"""
assert os.path.exists(results_dir), "Directory {0} not " \
"found.".format(results_dir)
fastqc = load_from_dir(results_dir)
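# the fourth column of FastQC's 'Overrepresented sequences' module is the
# 'Possible Source' field, which names the matching contaminant (if any)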
contam_names = set([x[3] for x in fastqc.overrepresented_sequences.data])
in_sample = lambda rid: any([cid.startswith(rid) for cid in contam_names])
contams_present = {x: y for x, y in contam_ref.items() if in_sample(x)}
return contams_present
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('results_dir', type=str,
help='Path to FastQC results directory')
parser.add_argument('-c', '--contam_file', type=str,
dest='contam_file',
help='Path to contaminant file')
parser.add_argument('-o', '--output', type=str,
dest='output',
help='Path to output file')
parser.add_argument('--seq-only', dest='seq_only',
action='store_true',
help='Output only the contaminant sequences')
args = parser.parse_args()
contam_ref = parse_contam_file(args.contam_file)
contam_ids = get_contams_present(args.results_dir, contam_ref)
if args.seq_only:
fmt_out = lambda cid, seq: seq
else:
fmt_out = lambda cid, seq: "{0}\t{1}".format(cid, seq)
if args.output is None:
for cid, seq in contam_ids.items():
print fmt_out(cid, seq)
else:
with open(args.output, 'w') as target:
for cid, seq in contam_ids.items():
target.write(fmt_out(cid, seq) + '\n')
#!/usr/bin/env python
"""
--------
pyfastqc
--------
Parser for FastQC results.
Provides functions and classes for parsing FastQC run results.
Tested with FastQC 0.9.5 output.
"""
import os
def load_file(fname, mode='r', **kwargs):
"""Given a path to a FastQC data file or an open file object pointing to it,
return a `FastQC` object.
"""
if isinstance(fname, basestring):
with open(fname, mode, **kwargs) as fp:
return FastQC(fp)
else:
return FastQC(fname)
def load_from_dir(dirname, data_fname='fastqc_data.txt', mode='r', **kwargs):
"""Given a path to a FastQC results directory, return a `FastQC` object."""
assert os.path.exists(dirname), "Directory %r does not exist" % dirname
fqc_path = os.path.join(dirname, os.walk(dirname).next()[1][0], data_fname)
return load_file(fqc_path, mode, **kwargs)
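# Illustrative usage sketch (assuming a FastQC output directory named
# 'sample_1.fq.fastqc' whose extracted subdirectory contains the usual
# fastqc_data.txt; the names are hypothetical):
#
#   from pyfastqc import load_from_dir
#   fastqc = load_from_dir('sample_1.fq.fastqc')
#   print fastqc.version
#   print fastqc.basic_statistics.data['Encoding']
#   print fastqc.passes_num, fastqc.warns_num, fastqc.fails_num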
class FastQCModule(object):
"""Class representing a FastQC analysis module."""
def __init__(self, raw_lines, end_mark='>>END_MODULE'):
self.raw_lines = raw_lines
self.end_mark = end_mark
self._status = None
self._name = None
self._data = self.parse()
def __repr__(self):
return '%s(%s)' % (self.__class__.__name__,
'[%r, ...]' % self.raw_lines[0])
def __str__(self):
return ''.join(self.raw_lines)
@property
def name(self):
"""Name of the module."""
return self._name
@property
def columns(self):
"""Columns in the module."""
return self._columns
@property
def data(self):
"""FastQC data."""
return self._data
@property
def status(self):
"""FastQC run status."""
return self._status
def parse(self):
"""Common parser for a FastQC module."""
# check that the last line is a proper end mark
assert self.raw_lines[-1].startswith(self.end_mark)
# parse name and status from first line
tokens = self.raw_lines[0].strip().split('\t')
name = tokens[0][2:]
self._name = name
status = tokens[-1]
assert status in ('pass', 'fail', 'warn'), "Unknown module status: %r" \
% status
self._status = status
# and column names from second line
columns = self.raw_lines[1][1:].strip().split('\t')
self._columns = columns
# the rest of the lines except the last one
data = []
for line in self.raw_lines[2:-1]:
cols = line.strip().split('\t')
data.append(cols)
# optional processing for different modules
if self.name == 'Basic Statistics':
data = {k: v for k, v in data}
return data
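# Illustrative note: for most modules `data` ends up as a list of rows (each
# row a list of tab-separated column values) with `columns` holding the header
# names; for the 'Basic Statistics' module the rows are collapsed into a dict,
# e.g. {'Encoding': 'Sanger / Illumina 1.9', 'Total Sequences': '1000', ...}
# (values shown here are hypothetical).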
class FastQC(object):
"""Class representing results from a FastQC run."""
# module name -- attribute name mapping
_mod_map = {
'>>Basic Statistics': 'basic_statistics',
'>>Per base sequence quality': 'per_base_sequence_quality',
'>>Per sequence quality scores': 'per_sequence_quality_scores',
'>>Per base sequence content': 'per_base_sequence_content',
'>>Per base GC content': 'per_base_gc_content',
'>>Per sequence GC content': 'per_sequence_gc_content',
'>>Per base N content': 'per_base_n_content',
'>>Sequence Length Distribution': 'sequence_length_distribution',
'>>Sequence Duplication Levels': 'sequence_duplication_levels',
'>>Overrepresented sequences': 'overrepresented_sequences',
'>>Kmer content': 'kmer_content',
}
def __init__(self, fp):
# get file name
self.fname = fp.name
self._modules = {}
line = fp.readline()
while True:
tokens = line.strip().split('\t')
# break on EOF
if not line:
break
# parse version
elif line.startswith('##FastQC'):
self.version = line.strip().split()[1]
# parse individual modules
elif tokens[0] in self._mod_map:
attr = self._mod_map[tokens[0]]
raw_lines = self.read_module(fp, line, tokens[0])
self._modules[attr] = FastQCModule(raw_lines)
line = fp.readline()
def __repr__(self):
return '%s(%r)' % (self.__class__.__name__, self.fname)
def _filter_by_status(self, status):
return [x.name for x in self._modules.values() if x.status == status]
def read_module(self, fp, line, start_mark):
raw = [line]
while not line.startswith('>>END_MODULE'):
line = fp.readline()
raw.append(line)
if not line:
raise ValueError("Unexpected end of file in module %r" % line)
return raw
@property
def modules(self):
return self._modules
@property
def passes(self):
return self._filter_by_status('pass')
@property
def passes_num(self):
return len(self.passes)
@property
def warns(self):
return self._filter_by_status('warn')
@property
def warns_num(self):
return len(self.warns)
@property
def fails(self):
return self._filter_by_status('fail')
@property
def fails_num(self):
return len(self.fails)
@property
def basic_statistics(self):
return self._modules['basic_statistics']
@property
def per_base_sequence_quality(self):
return self._modules['per_base_sequence_quality']
@property
def per_sequence_quality_scores(self):
return self._modules['per_sequence_quality_scores']
@property
def per_base_sequence_content(self):
return self._modules['per_base_sequence_content']
@property
def per_base_gc_content(self):
return self._modules['per_base_gc_content']
@property
def per_sequence_gc_content(self):
return self._modules['per_sequence_gc_content']
@property
def per_base_n_content(self):
return self._modules['per_base_n_content']
@property
def sequence_length_distribution(self):
return self._modules['sequence_length_distribution']
@property
def sequence_duplication_levels(self):
return self._modules['sequence_duplication_levels']
@property
def overrepresented_sequences(self):
return self._modules['overrepresented_sequences']
@property
def kmer_content(self):
return self._modules['kmer_content']
#!/usr/bin/env python
import argparse
from pyfastqc import load_from_dir
## CONSTANTS ##
# fastqc-sickle encoding name map
# from FastQC source
# (uk.ac.babraham.FastQC.Sequence.QualityEncoding.PhredEncoding.java)
# and sickle source (src/sickle.h)
FQ_SICKLE_ENC_MAP = {
'Sanger / Illumina 1.9': ('sanger', 33),
'Illumina <1.3': ('solexa', 59),
'Illumina 1.3': ('illumina', 64),
'Illumina 1.5': ('illumina', 64),
}
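# Minimal sketch of the lookup performed below (values for illustration only):
#
#   enc_name = 'Sanger / Illumina 1.9'          # 'Encoding' from fastqc_data.txt
#   enc, offset = FQ_SICKLE_ENC_MAP[enc_name]   # -> ('sanger', 33)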
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('results_dir', type=str,
help='Path to FastQC results directory')
parser.add_argument('--force', type=str,
help='Force quality to use the given encoding')
args = parser.parse_args()
fastqc = load_from_dir(args.results_dir)
if args.force is None:
enc, offset = FQ_SICKLE_ENC_MAP.get(fastqc.basic_statistics.data['Encoding'])
else:
enc = args.force
offset = [x[1] for x in FQ_SICKLE_ENC_MAP.values() if x[0] == enc][0]
print "{0}\t{1}".format(enc, offset)
#!/usr/bin/env python2
"""
Simple sequencing file statistics.
Gathers various statistics of a FASTQ file.
Requirements:
* Python == 2.7.x
* Biopython >= 1.60
Copyright (c) 2013 Wibowo Arindrarto <w.arindrarto@lumc.nl>
Copyright (c) 2013 LUMC Sequencing Analysis Support Core <sasc@lumc.nl>
MIT License <http://opensource.org/licenses/MIT>
"""
RELEASE = False
__version_info__ = ('0', '2', )
__version__ = '.'.join(__version_info__)
__version__ += '-dev' if not RELEASE else ''
import argparse
import json
import os
import sys
from Bio import SeqIO
# quality points we want to measure
QVALS = [1] + range(10, 70, 10)
def dict2json(d_input, f_out):
"""Dump the given dictionary as a JSON file."""
if isinstance(f_out, basestring):
target = open(f_out, 'w')
else:
target = f_out
json.dump(d_input, target, sort_keys=True, indent=4,
separators=(',', ': '))
# only close handles opened here; leave e.g. sys.stdout open
if target is not f_out:
target.close()
def gather_stat(in_fastq, out_json, fmt):
total_bases, total_reads = 0, 0
bcntd = dict.fromkeys(QVALS, 0)
rcntd = dict.fromkeys(QVALS, 0)
len_max, len_min = 0, None
n_bases, reads_with_n = 0, 0
# adjust format name to Biopython-compatible name
for rec in SeqIO.parse(in_fastq, 'fastq-' + fmt):
read_quals = rec.letter_annotations['phred_quality']
read_len = len(read_quals)
# compute quality metrics
# use float division so the mean quality is not floored
avg_qual = sum(read_quals) / float(len(read_quals))
for qval in QVALS:
bcntd[qval] += len([q for q in read_quals if q >= qval])
if avg_qual >= qval:
rcntd[qval] += 1
# compute length metrics
if read_len > len_max:
len_max = read_len
if len_min is None:
len_min = read_len
elif read_len < len_min:
len_min = read_len
# compute n metrics
n_count = rec.seq.lower().count('n')
n_bases += n_count
if n_count > 0:
reads_with_n += 1
total_bases += read_len
total_reads += 1
pctd = {
'files': {
'fastq': {
'path': os.path.abspath(in_fastq),
'checksum_sha1': None,
},
},
'stats': {
'qual_encoding': fmt,
'bases': {
'num_qual_gte': {},
'num_n': n_bases,
'num_total': total_bases,
},
'reads': {
'num_mean_qual_gte': {},
'len_max': len_max,
'len_min': len_min,
'num_with_n': reads_with_n,
'num_total': total_reads,
},
},
}
for qval in QVALS:
pctd['stats']['bases']['num_qual_gte'][qval] = bcntd[qval]
pctd['stats']['reads']['num_mean_qual_gte'][qval] = rcntd[qval]
dict2json(pctd, out_json)
if __name__ == '__main__':
usage = __doc__.split('\n\n\n')
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description=usage[0], epilog=usage[1])
parser.add_argument('input', type=str, help='Path to input FASTQ file')
parser.add_argument('-o', '--output', type=str, default=sys.stdout,
help='Path to output JSON file')
parser.add_argument('--fmt', type=str, choices=['sanger', 'illumina',
'solexa'], default='sanger', help='FASTQ quality encoding')
parser.add_argument('--version', action='version', version='%(prog)s ' +
__version__)
args = parser.parse_args()
gather_stat(args.input, args.output, args.fmt)
#!/usr/bin/env python
"""
Summarize results from a Flexiprep run into a gzipped JSON file.
Requirements:
* Python == 2.7.x
Copyright (c) 2013 Wibowo Arindrarto <w.arindrarto@lumc.nl>
MIT License <http://opensource.org/licenses/MIT>
"""
RELEASE = False
__version_info__ = ('0', '2', )
__version__ = '.'.join(__version_info__)
__version__ += '-dev' if not RELEASE else ''
import argparse
import contextlib
import json
import os
import re
from datetime import datetime
def fastqc2dict(run_dir, sample, name, data_fname='fastqc_data.txt',
image_dname='Images'):
"""Summarize FastQC results into a single dictionary object."""
dirname = os.path.join(run_dir, sample)
fqc_core_dir = os.path.join(dirname, os.walk(dirname).next()[1][0])
fqc_dataf, fqc_imaged = None, None
for p1, p2, p3 in os.walk(fqc_core_dir):
if image_dname in p2:
fqc_imaged = os.path.join(p1, image_dname)
if data_fname in p3:
fqc_dataf = os.path.join(p1, data_fname)
if fqc_dataf is not None and fqc_imaged is not None:
break
if fqc_imaged is None or fqc_dataf is None:
raise ValueError("Could not find data directory "
"and/or image directory in FastQC result.")
d = {
'files': {
'txt_fastqc_' + name: {
'checksum_sha1': None,
'path': fqc_dataf
},
},
'stats': {},
}
for img in [os.path.join(fqc_imaged, f) for f in os.listdir(fqc_imaged)]:
key = os.path.splitext(os.path.basename(img))[0]
d['files']['plot_' + key + '_' + name] = {
'path': os.path.abspath(img),
'checksum_sha1': None,
}
return d
def clip2dict(sample, samplea, sampleb, lib_type, run_dir):
re_aff = re.compile(r'\s*Trimmed reads:\s*(\d+)')
re_short = re.compile(r'\s*Too short reads:\s*(\d+)')
re_long = re.compile(r'\s*Too long reads:\s*(\d+)')
re_disc_1 = re.compile(r'Filtered (\d+) .+ first')
re_disc_2 = re.compile(r'Filtered (\d+) reads from second')
re_kept = re.compile(r'Synced read .+ (\d+) reads.')
@contextlib.contextmanager
def open_result(*toks):
with open(os.path.join(run_dir, *toks)) as src:
yield src
def get_re_adp(seq):
return re.compile("Adapter.+'{0}'.+trimmed (\d+) times.".format(seq))
def get_adapters(read_pair):
adapters = {}
with open_result(read_pair + '.contams.txt') as src:
for line in src:
name, seq = line.strip().split('\t')
# None is placeholder for trimming count
adapters[name] = [seq, None]
return adapters
def get_cutadapt_stats(read_pair, adapter_dict):
stats = {}
stat_file = read_pair + '.clipstat'
try:
with open_result(stat_file) as src:
cutadapt_txt = src.read()
except IOError:
# file does not exist, no adapter contamination found
stats['num_reads_discarded'] = 0
stats['num_reads_affected'] = 0
return stats
short_count = int(re.search(re_short, cutadapt_txt).group(1))
long_count = int(re.search(re_long, cutadapt_txt).group(1))
stats['num_reads_discarded'] = short_count + long_count
stats['num_reads_affected'] = int(re.search(re_aff,
cutadapt_txt).group(1))
for key in adapter_dict:
re_adp = get_re_adp(adapter_dict[key][0])
adapter_dict[key][1] = int(re.search(re_adp,
cutadapt_txt).group(1))
return stats
def get_sync_stats(sample):
stats = {}
with open_result(sample + '.clipsync.stats') as src:
sync_txt = src.read()
stats['num_reads_discarded_1'] = int(re.search(re_disc_1, sync_txt).group(1))
stats['num_reads_discarded_2'] = int(re.search(re_disc_2, sync_txt).group(1))
stats['num_reads_kept'] = int(re.search(re_kept, sync_txt).group(1))
stats['num_reads_discarded_total'] = stats['num_reads_discarded_1'] + \
stats['num_reads_discarded_2']
return stats
clip_stats = {'fastq_1': {}}
clip_stats['fastq_1']['adapters'] = get_adapters(samplea)
clip_stats['fastq_1'].update(
get_cutadapt_stats(samplea,
clip_stats['fastq_1']['adapters']))
if lib_type == 'paired':
clip_stats['fastq_2'] = {}
clip_stats['fastq_2']['adapters'] = get_adapters(sampleb)
clip_stats['fastq_2'].update(
get_cutadapt_stats(sampleb,
clip_stats['fastq_2']['adapters']))
clip_stats['sync'] = get_sync_stats(sample)
else:
clip_stats['sync'] = {}
return clip_stats
def sickle2dict(run_name, lib_type, run_dir):
trim_stats = {}
if lib_type == 'paired':
re_paired_kept = re.compile(r'paired records kept: \d+ \((\d+) pairs\)')
re_disc = re.compile(r'single records discarded: \d+ \(from PE1: (\d+), from PE2: (\d+)\)')
re_disc_paired = re.compile(r'paired records discarded: \d+ \((\d+) pairs\)')
with open(os.path.join(run_dir, run_name + '.filtersync.stats')) as src:
sickle_txt = src.read()
discarda = int(re.search(re_disc, sickle_txt).group(1))
discardb = int(re.search(re_disc, sickle_txt).group(2))
discard_both = int(re.search(re_disc_paired, sickle_txt).group(1))
trim_stats['num_reads_kept'] = int(re.search(re_paired_kept, sickle_txt).group(1))
trim_stats['num_reads_discarded_1'] = discarda
trim_stats['num_reads_discarded_2'] = discardb
trim_stats['num_reads_discarded_both'] = discard_both
trim_stats['num_reads_discarded_total'] = discarda + discardb + discard_both
else:
re_kept = re.compile(r'records kept: (\d+)')
re_disc = re.compile(r'records discarded: (\d+)')
with open(os.path.join(run_dir, run_name + '.filtersync.stats')) as src:
sickle_txt = src.read()
trim_stats['num_reads_kept'] = int(re.search(re_kept, sickle_txt).group(1))
trim_stats['num_reads_discarded_total'] = int(re.search(re_disc, sickle_txt).group(1))
return trim_stats
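# The regexes above target sickle's summary output; an approximate,
# illustrative paired-end '.filtersync.stats' file looks like:
#
#   FastQ paired records kept: 1848 (924 pairs)
#   FastQ single records kept: 30 (from PE1: 18, from PE2: 12)
#   FastQ paired records discarded: 100 (50 pairs)
#   FastQ single records discarded: 52 (from PE1: 30, from PE2: 22)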
def dict2json(d_input, f_out):
"""Dump the given dictionary as a JSON file."""
with open(f_out, 'w') as target:
json.dump(d_input, target, sort_keys=True, indent=4,
separators=(',', ': '))
def summarize_flexiprep(run_name, qc_mode, samplea, sampleb, outf, run_dir):
def load_json(fname):
with open(os.path.join(run_dir, fname), 'r') as target:
return json.load(target)
def load_chksum(fname):
with open(os.path.join(run_dir, fname), 'r') as src:
return src.readline().strip().split()
def gc_from_fastqc(fname):
# parse the %GC value from a fastqc_data.txt file
with open(fname, 'r') as src:
return int([x for x in src if
x.startswith('%GC')].pop().split('\t')[1])
sumd = {
'_meta': {
'module': 'flexiprep',
'run_name': run_name,
'run_time': datetime.utcnow().isoformat(),
},
'stats': {
'qc_mode': qc_mode,
},
'dirs': {
'run': run_dir,
},
'files': {
},
}
if sampleb is None:
lib_type = 'single'
else:
lib_type = 'paired'
sumd['stats']['lib_type'] = lib_type
# gather checksum files
chksums = [c for c in os.listdir(run_dir) if c.endswith('.sha1')]
# gather fastqc directories
fastqcs = [d for d in os.listdir(run_dir) if d.endswith('.fastqc')]
# gather seqstat files
sstats = [s for s in os.listdir(run_dir) if s.endswith('.seqstats.json')]
if lib_type == 'single':
if qc_mode == 'none':
assert len(chksums) == len(fastqcs) == len(sstats) == 1
chksum_r1 = load_chksum(chksums[0])
sumd['files']['fastq_raw_1'] = {
'checksum_sha1': chksum_r1[0],
'path': os.path.join(run_dir,
os.path.basename(chksum_r1[1]))}
fqd_r1 = fastqc2dict(run_dir, fastqcs[0], 'raw_1')
seqstat_r1 = load_json(sstats[0])['stats']
seqstat_r1['mean_gc'] = gc_from_fastqc(
fqd_r1['files']['txt_fastqc_raw_1']['path'])
sumd['stats']['fastq_raw_1'] = seqstat_r1
for k in ('files', 'stats'):
sumd[k].update(fqd_r1[k])
else:
assert len(chksums) == len(fastqcs) == len(sstats) == 2
fqp1_name = [x for x in fastqcs if x.endswith('.qc.fastqc')][0]
chksum_p1 = load_chksum([x for x in chksums if
x.endswith('.qc.sha1')][0])
sumd['files']['fastq_proc_1'] = {
'checksum_sha1': chksum_p1[0],
'path': os.path.join(run_dir,
os.path.basename(chksum_p1[1]))}
fqd_p1 = fastqc2dict(run_dir,
[x for x in fastqcs if x.endswith('.qc.fastqc')][0],
'proc_1')
seqstat_p1 = load_json([x for x in sstats if
x.endswith('.qc.seqstats.json')][0])['stats']
seqstat_p1['mean_gc'] = gc_from_fastqc(
fqd_p1['files']['txt_fastqc_proc_1']['path'])
sumd['stats']['fastq_proc_1'] = seqstat_p1
chksum_r1 = load_chksum([x for x in chksums if not
x.endswith('.qc.sha1')][0])
sumd['files']['fastq_raw_1'] = {
'checksum_sha1': chksum_r1[0],
'path': os.path.join(run_dir,
os.path.basename(chksum_r1[1]))}
fqd_r1 = fastqc2dict(run_dir,
[x for x in fastqcs if x != fqp1_name][0],
'raw_1')
seqstat_r1 = load_json([x for x in sstats if not
x.endswith('.qc.seqstats.json')][0])['stats']
seqstat_r1['mean_gc'] = gc_from_fastqc(
fqd_r1['files']['txt_fastqc_raw_1']['path'])
sumd['stats']['fastq_raw_1'] = seqstat_r1
for k in ('files', 'stats'):
sumd[k].update(fqd_p1[k])
sumd[k].update(fqd_r1[k])
else:
if qc_mode == 'none':
assert len(chksums) == len(fastqcs) == len(sstats) == 2
chksum_r1 = load_chksum([x for x in chksums if
x.startswith(samplea)][0])
sumd['files']['fastq_raw_1'] = {
'checksum_sha1': chksum_r1[0],
'path': os.path.join(run_dir,
os.path.basename(chksum_r1[1]))}
fqd_r1 = fastqc2dict(run_dir,
[x for x in fastqcs if x.startswith(samplea)][0],
'raw_1')
seqstat_r1 = load_json([x for x in sstats if
x.startswith(samplea)][0])['stats']
seqstat_r1['mean_gc'] = gc_from_fastqc(
fqd_r1['files']['txt_fastqc_raw_1']['path'])
sumd['stats']['fastq_raw_1'] = seqstat_r1
chksum_r2 = load_chksum([x for x in chksums if
not x.startswith(samplea)][0])
sumd['files']['fastq_raw_2'] = {
'checksum_sha1': chksum_r2[0],
'path': os.path.join(run_dir,
os.path.basename(chksum_r2[1]))}
fqd_r2 = fastqc2dict(run_dir,
[x for x in fastqcs if not x.startswith(samplea)][0],
'raw_2')
seqstat_r2 = load_json([x for x in sstats if
not x.startswith(samplea)][0])['stats']
seqstat_r2['mean_gc'] = gc_from_fastqc(
fqd_r2['files']['txt_fastqc_raw_2']['path'])
sumd['stats']['fastq_raw_2'] = seqstat_r2
for k in ('files', 'stats'):
sumd[k].update(fqd_r1[k])
sumd[k].update(fqd_r2[k])
else:
assert len(fastqcs) == len(sstats) == 4
proc_chksums = [x for x in chksums if x.endswith('.qc.sha1')]
proc_fastqcs = [x for x in fastqcs if x.endswith('.qc.fastqc')]
proc_sstats = [x for x in sstats if x.endswith('.qc.seqstats.json')]
raw_chksums = [x for x in chksums if x not in proc_chksums]
raw_fastqcs = [x for x in fastqcs if x not in proc_fastqcs]
raw_sstats = [x for x in sstats if x not in proc_sstats]
chksum_r1 = load_chksum([x for x in raw_chksums if
x.startswith(samplea)][0])
sumd['files']['fastq_raw_1'] = {
'checksum_sha1': chksum_r1[0],
'path': os.path.join(run_dir,
os.path.basename(chksum_r1[1]))}
fqd_r1 = fastqc2dict(run_dir,
[x for x in raw_fastqcs if x.startswith(samplea)][0],
'raw_1')
seqstat_r1 = load_json([x for x in raw_sstats if
x.startswith(samplea)][0])['stats']
seqstat_r1['mean_gc'] = gc_from_fastqc(
fqd_r1['files']['txt_fastqc_raw_1']['path'])
sumd['stats']['fastq_raw_1'] = seqstat_r1
chksum_r2 = load_chksum([x for x in raw_chksums if
not x.startswith(samplea)][0])
sumd['files']['fastq_raw_2'] = {
'checksum_sha1': chksum_r2[0],
'path': os.path.join(run_dir,
os.path.basename(chksum_r2[1]))}
fqd_r2 = fastqc2dict(run_dir,
[x for x in raw_fastqcs if not x.startswith(samplea)][0],
'raw_2')
seqstat_r2 = load_json([x for x in raw_sstats if
not x.startswith(samplea)][0])['stats']
seqstat_r2['mean_gc'] = gc_from_fastqc(
fqd_r2['files']['txt_fastqc_raw_2']['path'])
sumd['stats']['fastq_raw_2'] = seqstat_r2
chksum_p1 = load_chksum([x for x in proc_chksums if
x.startswith(samplea)][0])
sumd['files']['fastq_proc_1'] = {
'checksum_sha1': chksum_p1[0],
'path': os.path.join(run_dir,
os.path.basename(chksum_p1[1]))}
fqd_p1 = fastqc2dict(run_dir,
[x for x in proc_fastqcs if x.startswith(samplea)][0],
'proc_1')
seqstat_p1 = load_json([x for x in proc_sstats if
x.startswith(samplea)][0])['stats']
seqstat_p1['mean_gc'] = gc_from_fastqc(
fqd_p1['files']['txt_fastqc_proc_1']['path'])
sumd['stats']['fastq_proc_1'] = seqstat_p1
chksum_p2 = load_chksum([x for x in proc_chksums if
not x.startswith(samplea)][0])
sumd['files']['fastq_proc_2'] = {
'checksum_sha1': chksum_p2[0],
'path': os.path.join(run_dir,
os.path.basename(chksum_p2[1]))}
fqd_p2 = fastqc2dict(run_dir,
[x for x in proc_fastqcs if not x.startswith(samplea)][0],
'proc_2')
seqstat_p2 = load_json([x for x in proc_sstats if
not x.startswith(samplea)][0])['stats']
seqstat_p2['mean_gc'] = gc_from_fastqc(
fqd_p2['files']['txt_fastqc_proc_2']['path'])
sumd['stats']['fastq_proc_2'] = seqstat_p2
for k in ('files', 'stats'):
for fqd in (fqd_r1, fqd_r2, fqd_p1, fqd_p2):
sumd[k].update(fqd[k])
if 'clip' in qc_mode:
sumd['stats']['clip'] = clip2dict(run_name, samplea, sampleb, lib_type, run_dir)
if 'trim' in qc_mode:
sumd['stats']['trim'] = sickle2dict(run_name, lib_type, run_dir)
dict2json(sumd, outf)
if __name__ == '__main__':
usage = __doc__.split('\n\n\n')
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description=usage[0], epilog=usage[1])
parser.add_argument('run_name', type=str, help='Flexiprep run name')
parser.add_argument('qc_mode', type=str, choices=['clip', 'cliptrim',
'trim', 'none'], help='Flexiprep QC mode')
parser.add_argument('samplea', type=str, help='Sample A name.')
parser.add_argument('--sampleb', type=str, help='Sample B name.')
parser.add_argument('out_file', type=str, help='Path to output file')
parser.add_argument('--run-dir', type=str, default=os.getcwd(),
help='Path to Flexiprep output directory.')
parser.add_argument('--version', action='version', version='%(prog)s ' +
__version__)
args = parser.parse_args()
summarize_flexiprep(args.run_name, args.qc_mode,
args.samplea, args.sampleb, args.out_file, args.run_dir)
#!/usr/bin/env python
"""
(Re-)sync two filtered paired end FASTQ files.
Given two filtered paired end read files and one of the original read files,
re-sync the filtered reads by filtering out anything that is only present in
one of the two files.
Usage:
{command} <orig.fq> <reads_1.fq> <reads_2.fq> \\
<reads_1.synced.fq> <reads_2.synced.fq>
The synced reads are written to disk as <reads_1.synced.fq> and
<reads_2.synced.fq>. Afterwards some counts are printed.
Both Illumina old-style and new-style paired-end header lines are supported.
The original read file is used to speed up processing: it contains a superset
of the reads in both filtered files, in the same order, so all three files can
be processed line by line without reading any of them entirely into memory.
Some ideas were taken from [1].
[1] https://gist.github.com/588841/
2011-11-03, Martijn Vermaat <m.vermaat.hg@lumc.nl>
"""
import sys
import re
def sync_paired_end_reads(original, reads_a, reads_b, synced_a, synced_b):
"""
Filter out reads from two paired end read files that are not present in
both of them. Do this in a reasonable amount of time by using a file
containing all of the reads for one of the paired ends.
All arguments are open file handles.
@arg original: File containing all original reads for one of the paired
ends.
@arg reads_a: First filtered paired-end read file.
@arg reads_b: Second filtered paired-end read file.
@arg synced_a: Filtered reads from first paired end read file.
@arg synced_b: Filtered reads from second paired end read file.
@return: Triple (filtered_a, filtered_b, kept) containing counts
of the number of reads filtered from both input files and
the total number of reads kept in the synced results.
@todo: Print warnings if obvious things are not right (a or b still has
lines after original is processed).
"""
# This matches 1, 2, or 3 preceded by /, _, or whitespace. Everything up to
# its last match in a header line is used to identify the read pair.
sep = re.compile('[\s_/][123]')
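# Illustrative examples with hypothetical read IDs:
#   old-style  '@HWUSI-EAS100R:6:73:941:1973#0/1'
#       -> head == ['@HWUSI-EAS100R:6:73:941:1973#0']
#   new-style  '@EAS139:136:FC706VJ:2:5:1000:12850 1:Y:18:ATCACG'
#       -> head == ['@EAS139:136:FC706VJ:2:5:1000:12850']
# so mates /1 and /2 (or ' 1' and ' 2') of the same fragment compare equal.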
def next_record(fh):
return [fh.readline().strip() for i in range(4)]
def head(record):
return sep.split(record[0])[:-1]
headers = (sep.split(x.strip())[:-1] for i, x in enumerate(original)
if not (i % 4))
filtered_a = filtered_b = kept = 0
a, b = next_record(reads_a), next_record(reads_b)
for header in headers:
if header == head(a) and head(b) != header:
a = next_record(reads_a)
filtered_a += 1
if header == head(b) and head(a) != header:
b = next_record(reads_b)
filtered_b += 1
if header == head(a) == head(b):
print >>synced_a, '\n'.join(a)
print >>synced_b, '\n'.join(b)
a, b = next_record(reads_a), next_record(reads_b)
kept += 1
return filtered_a, filtered_b, kept
if __name__ == '__main__':
if len(sys.argv) < 6:
sys.stderr.write(__doc__.split('\n\n\n')[0].strip().format(
command=sys.argv[0]) + '\n')
sys.exit(1)
try:
original = open(sys.argv[1], 'r')
reads_a = open(sys.argv[2], 'r')
reads_b = open(sys.argv[3], 'r')
synced_a = open(sys.argv[4], 'w')
synced_b = open(sys.argv[5], 'w')
filtered_a, filtered_b, kept = \
sync_paired_end_reads(original, reads_a, reads_b,
synced_a, synced_b)
print 'Filtered %i reads from first read file.' % filtered_a
print 'Filtered %i reads from second read file.' % filtered_b
print 'Synced read files contain %i reads.' % kept
except IOError as (_, message):
sys.stderr.write('Error: %s\n' % message)
sys.exit(1)