Commit 1f864153 authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Remove old script

parent 020ffb1f
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
import argparse
import os
from pyfastqc import load_from_dir
def parse_contam_file(contam_file, delimiter='\t'):
"""Given a contaminant file, return a dictionary of contaminant sequences
names and their sequences.
Args:
contam_file -- path to contaminants file
delimiter -- ID, sequence delimiter in the contaminants file
"""
assert os.path.exists(contam_file), "Contaminant file %r does not exist" % \
contam_file
with open(contam_file, 'r') as source:
# read only lines not beginning with '#' and discard empty lines
lines = filter(None, (line.strip() for line in source if not
line.startswith('#')))
# parse contam seq lines into lists of [id, sequence]
parse = lambda line: filter(None, line.split(delimiter))
parsed = (parse(line) for line in lines)
# and create a dictionary, key=sequence id and value=sequence
contam_ref = {name: seq for name, seq in parsed}
return contam_ref
def get_contams_present(results_dir, contam_ref):
"""Given a path to a FastQC HTML results file, return the <div> tag of the
overrepresented sequences list.
Args:
results_dir -- Path to FastQC results directory.
"""
assert os.path.exists(results_dir), "Directory {0} not " \
"found.".format(results_dir)
fastqc = load_from_dir(results_dir)
contam_names = set([x[3] for x in fastqc.overrepresented_sequences.data])
in_sample = lambda rid: any([cid.startswith(rid) for cid in contam_names])
contams_present = {x: y for x, y in contam_ref.items() if in_sample(x)}
return contams_present
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('results_dir', type=str,
help='Path to FastQC result directory file')
parser.add_argument('-c', '--contam_file', type=str,
dest='contam_file',
help='Path to contaminant file')
parser.add_argument('-o', '--output', type=str,
dest='output',
help='Path to output file')
parser.add_argument('--seq-only',dest='seq_only',
action='store_true',
help='Whether to output contaminant sequences only or not')
args = parser.parse_args()
contam_ref = parse_contam_file(args.contam_file)
contam_ids = get_contams_present(args.results_dir, contam_ref)
if args.seq_only:
fmt_out = lambda cid, seq: seq
else:
fmt_out = lambda cid, seq: "{0}\t{1}".format(cid, seq)
if args.output is None:
for cid, seq in contam_ids.items():
print fmt_out(cid, seq)
else:
with open(args.output, 'w') as target:
for cid, seq in contam_ids.items():
target.write(fmt_out(cid, seq) + '\n')
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
"""
--------
pyfastqc
--------
Parser for FastQC results.
Provides a method and classes for parsing a FastQC run results.
Tested with FastQC 0.9.5 output.
"""
import os
def load_file(fname, mode='r', **kwargs):
"""Given a path to a FastQC data file or an open file object pointing to it,
return a `FastQC` object.
"""
if isinstance(fname, basestring):
with open(fname, mode, **kwargs) as fp:
return FastQC(fp)
else:
return FastQC(fname)
def load_from_dir(dirname, data_fname='fastqc_data.txt', mode='r', **kwargs):
"""Given a path to a FastQC results directory, return a `FastQC` object."""
assert os.path.exists(dirname), "Directory %r does not exist" % dirname
fqc_path = os.path.join(dirname, os.walk(dirname).next()[1][0], data_fname)
return load_file(fqc_path, mode, **kwargs)
class FastQCModule(object):
"""Class representing a FastQC analysis module."""
def __init__(self, raw_lines, end_mark='>>END_MODULE'):
self.raw_lines = raw_lines
self.end_mark = end_mark
self._status = None
self._name = None
self._data = self.parse()
def __repr__(self):
return '%s(%s)' % (self.__class__.__name__,
'[%r, ...]' % self.raw_lines[0])
def __str__(self):
return ''.join(self.raw_lines)
@property
def name(self):
"""Name of the module."""
return self._name
@property
def columns(self):
"""Columns in the module."""
return self._columns
@property
def data(self):
"""FastQC data."""
return self._data
@property
def status(self):
"""FastQC run status."""
return self._status
def parse(self):
"""Common parser for a FastQC module."""
# check that the last line is a proper end mark
assert self.raw_lines[-1].startswith(self.end_mark)
# parse name and status from first line
tokens = self.raw_lines[0].strip().split('\t')
name = tokens[0][2:]
self._name = name
status = tokens[-1]
assert status in ('pass', 'fail', 'warn'), "Unknown module status: %r" \
% status
self._status = status
# and column names from second line
columns = self.raw_lines[1][1:].strip().split('\t')
self._columns = columns
# the rest of the lines except the last one
data = []
for line in self.raw_lines[2:-1]:
cols = line.strip().split('\t')
data.append(cols)
# optional processing for different modules
if self.name == 'Basic Statistics':
data = {k: v for k, v in data}
return data
class FastQC(object):
"""Class representing results from a FastQC run."""
# module name -- attribute name mapping
_mod_map = {
'>>Basic Statistics': 'basic_statistics',
'>>Per base sequence quality': 'per_base_sequence_quality',
'>>Per sequence quality scores': 'per_sequence_quality_scores',
'>>Per base sequence content': 'per_base_sequence_content',
'>>Per base GC content': 'per_base_gc_content',
'>>Per sequence GC content': 'per_sequence_gc_content',
'>>Per base N content': 'per_base_n_content',
'>>Sequence Length Distribution': 'sequence_length_distribution',
'>>Sequence Duplication Levels': 'sequence_duplication_levels',
'>>Overrepresented sequences': 'overrepresented_sequences',
'>>Kmer content': 'kmer_content',
}
def __init__(self, fp):
# get file name
self.fname = fp.name
self._modules = {}
line = fp.readline()
while True:
tokens = line.strip().split('\t')
# break on EOF
if not line:
break
# parse version
elif line.startswith('##FastQC'):
self.version = line.strip().split()[1]
# parse individual modules
elif tokens[0] in self._mod_map:
attr = self._mod_map[tokens[0]]
raw_lines = self.read_module(fp, line, tokens[0])
self._modules[attr] = FastQCModule(raw_lines)
line = fp.readline()
def __repr__(self):
return '%s(%r)' % (self.__class__.__name__, self.fname)
def _filter_by_status(self, status):
return [x.name for x in self._modules.values() if x.status == status]
def read_module(self, fp, line, start_mark):
raw = [line]
while not line.startswith('>>END_MODULE'):
line = fp.readline()
raw.append(line)
if not line:
raise ValueError("Unexpected end of file in module %r" % line)
return raw
@property
def modules(self):
return self._modules
@property
def passes(self):
return self._filter_by_status('pass')
@property
def passes_num(self):
return len(self.passes)
@property
def warns(self):
return self._filter_by_status('warn')
@property
def warns_num(self):
return len(self.warns)
@property
def fails(self):
return self._filter_by_status('fail')
@property
def fails_num(self):
return len(self.fails)
@property
def basic_statistics(self):
return self._modules['basic_statistics']
@property
def per_base_sequence_quality(self):
return self._modules['per_base_sequence_quality']
@property
def per_sequence_quality_scores(self):
return self._modules['per_sequence_quality_scores']
@property
def per_base_sequence_content(self):
return self._modules['per_base_sequence_content']
@property
def per_base_gc_content(self):
return self._modules['per_base_gc_content']
@property
def per_sequence_gc_content(self):
return self._modules['per_sequence_gc_content']
@property
def per_base_n_content(self):
return self._modules['per_base_n_content']
@property
def sequence_length_distribution(self):
return self._modules['sequence_length_distribution']
@property
def sequence_duplication_levels(self):
return self._modules['sequence_duplication_levels']
@property
def overrepresented_sequences(self):
return self._modules['overrepresented_sequences']
@property
def kmer_content(self):
return self._modules['kmer_content']
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.scripts
import java.io.File
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
class FastqcToContams(val root: Configurable) extends PythonCommandLineFunction {
setPythonScript("__init__.py", "pyfastqc/")
setPythonScript("fastqc_contam.py")
@Input(doc = "Fastqc output", shortName = "fastqc", required = true)
var fastqc_output: File = _
@Input(doc = "Contams input", shortName = "fastqc", required = false)
var contams_file: File = _
@Output(doc = "Output file", shortName = "out", required = true)
var out: File = _
def cmdLine = {
getPythonCommand +
required(fastqc_output.getParent()) +
required("-c", contams_file) +
" > " +
required(out)
}
}
......@@ -21,7 +21,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat, Seqstat }
import nl.lumc.sasc.biopet.scripts.{ FastqSync, FastqcToContams }
import nl.lumc.sasc.biopet.scripts.{ FastqSync }
class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment