Skip to content
Snippets Groups Projects
Commit 1f864153 authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Remove old script

parent 020ffb1f
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
import argparse
import os
from pyfastqc import load_from_dir
def parse_contam_file(contam_file, delimiter='\t'):
"""Given a contaminant file, return a dictionary of contaminant sequences
names and their sequences.
Args:
contam_file -- path to contaminants file
delimiter -- ID, sequence delimiter in the contaminants file
"""
assert os.path.exists(contam_file), "Contaminant file %r does not exist" % \
contam_file
with open(contam_file, 'r') as source:
# read only lines not beginning with '#' and discard empty lines
lines = filter(None, (line.strip() for line in source if not
line.startswith('#')))
# parse contam seq lines into lists of [id, sequence]
parse = lambda line: filter(None, line.split(delimiter))
parsed = (parse(line) for line in lines)
# and create a dictionary, key=sequence id and value=sequence
contam_ref = {name: seq for name, seq in parsed}
return contam_ref
def get_contams_present(results_dir, contam_ref):
"""Given a path to a FastQC HTML results file, return the <div> tag of the
overrepresented sequences list.
Args:
results_dir -- Path to FastQC results directory.
"""
assert os.path.exists(results_dir), "Directory {0} not " \
"found.".format(results_dir)
fastqc = load_from_dir(results_dir)
contam_names = set([x[3] for x in fastqc.overrepresented_sequences.data])
in_sample = lambda rid: any([cid.startswith(rid) for cid in contam_names])
contams_present = {x: y for x, y in contam_ref.items() if in_sample(x)}
return contams_present
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('results_dir', type=str,
help='Path to FastQC result directory file')
parser.add_argument('-c', '--contam_file', type=str,
dest='contam_file',
help='Path to contaminant file')
parser.add_argument('-o', '--output', type=str,
dest='output',
help='Path to output file')
parser.add_argument('--seq-only',dest='seq_only',
action='store_true',
help='Whether to output contaminant sequences only or not')
args = parser.parse_args()
contam_ref = parse_contam_file(args.contam_file)
contam_ids = get_contams_present(args.results_dir, contam_ref)
if args.seq_only:
fmt_out = lambda cid, seq: seq
else:
fmt_out = lambda cid, seq: "{0}\t{1}".format(cid, seq)
if args.output is None:
for cid, seq in contam_ids.items():
print fmt_out(cid, seq)
else:
with open(args.output, 'w') as target:
for cid, seq in contam_ids.items():
target.write(fmt_out(cid, seq) + '\n')
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
"""
--------
pyfastqc
--------
Parser for FastQC results.
Provides a method and classes for parsing a FastQC run results.
Tested with FastQC 0.9.5 output.
"""
import os
def load_file(fname, mode='r', **kwargs):
"""Given a path to a FastQC data file or an open file object pointing to it,
return a `FastQC` object.
"""
if isinstance(fname, basestring):
with open(fname, mode, **kwargs) as fp:
return FastQC(fp)
else:
return FastQC(fname)
def load_from_dir(dirname, data_fname='fastqc_data.txt', mode='r', **kwargs):
"""Given a path to a FastQC results directory, return a `FastQC` object."""
assert os.path.exists(dirname), "Directory %r does not exist" % dirname
fqc_path = os.path.join(dirname, os.walk(dirname).next()[1][0], data_fname)
return load_file(fqc_path, mode, **kwargs)
class FastQCModule(object):
"""Class representing a FastQC analysis module."""
def __init__(self, raw_lines, end_mark='>>END_MODULE'):
self.raw_lines = raw_lines
self.end_mark = end_mark
self._status = None
self._name = None
self._data = self.parse()
def __repr__(self):
return '%s(%s)' % (self.__class__.__name__,
'[%r, ...]' % self.raw_lines[0])
def __str__(self):
return ''.join(self.raw_lines)
@property
def name(self):
"""Name of the module."""
return self._name
@property
def columns(self):
"""Columns in the module."""
return self._columns
@property
def data(self):
"""FastQC data."""
return self._data
@property
def status(self):
"""FastQC run status."""
return self._status
def parse(self):
"""Common parser for a FastQC module."""
# check that the last line is a proper end mark
assert self.raw_lines[-1].startswith(self.end_mark)
# parse name and status from first line
tokens = self.raw_lines[0].strip().split('\t')
name = tokens[0][2:]
self._name = name
status = tokens[-1]
assert status in ('pass', 'fail', 'warn'), "Unknown module status: %r" \
% status
self._status = status
# and column names from second line
columns = self.raw_lines[1][1:].strip().split('\t')
self._columns = columns
# the rest of the lines except the last one
data = []
for line in self.raw_lines[2:-1]:
cols = line.strip().split('\t')
data.append(cols)
# optional processing for different modules
if self.name == 'Basic Statistics':
data = {k: v for k, v in data}
return data
class FastQC(object):
"""Class representing results from a FastQC run."""
# module name -- attribute name mapping
_mod_map = {
'>>Basic Statistics': 'basic_statistics',
'>>Per base sequence quality': 'per_base_sequence_quality',
'>>Per sequence quality scores': 'per_sequence_quality_scores',
'>>Per base sequence content': 'per_base_sequence_content',
'>>Per base GC content': 'per_base_gc_content',
'>>Per sequence GC content': 'per_sequence_gc_content',
'>>Per base N content': 'per_base_n_content',
'>>Sequence Length Distribution': 'sequence_length_distribution',
'>>Sequence Duplication Levels': 'sequence_duplication_levels',
'>>Overrepresented sequences': 'overrepresented_sequences',
'>>Kmer content': 'kmer_content',
}
def __init__(self, fp):
# get file name
self.fname = fp.name
self._modules = {}
line = fp.readline()
while True:
tokens = line.strip().split('\t')
# break on EOF
if not line:
break
# parse version
elif line.startswith('##FastQC'):
self.version = line.strip().split()[1]
# parse individual modules
elif tokens[0] in self._mod_map:
attr = self._mod_map[tokens[0]]
raw_lines = self.read_module(fp, line, tokens[0])
self._modules[attr] = FastQCModule(raw_lines)
line = fp.readline()
def __repr__(self):
return '%s(%r)' % (self.__class__.__name__, self.fname)
def _filter_by_status(self, status):
return [x.name for x in self._modules.values() if x.status == status]
def read_module(self, fp, line, start_mark):
raw = [line]
while not line.startswith('>>END_MODULE'):
line = fp.readline()
raw.append(line)
if not line:
raise ValueError("Unexpected end of file in module %r" % line)
return raw
@property
def modules(self):
return self._modules
@property
def passes(self):
return self._filter_by_status('pass')
@property
def passes_num(self):
return len(self.passes)
@property
def warns(self):
return self._filter_by_status('warn')
@property
def warns_num(self):
return len(self.warns)
@property
def fails(self):
return self._filter_by_status('fail')
@property
def fails_num(self):
return len(self.fails)
@property
def basic_statistics(self):
return self._modules['basic_statistics']
@property
def per_base_sequence_quality(self):
return self._modules['per_base_sequence_quality']
@property
def per_sequence_quality_scores(self):
return self._modules['per_sequence_quality_scores']
@property
def per_base_sequence_content(self):
return self._modules['per_base_sequence_content']
@property
def per_base_gc_content(self):
return self._modules['per_base_gc_content']
@property
def per_sequence_gc_content(self):
return self._modules['per_sequence_gc_content']
@property
def per_base_n_content(self):
return self._modules['per_base_n_content']
@property
def sequence_length_distribution(self):
return self._modules['sequence_length_distribution']
@property
def sequence_duplication_levels(self):
return self._modules['sequence_duplication_levels']
@property
def overrepresented_sequences(self):
return self._modules['overrepresented_sequences']
@property
def kmer_content(self):
return self._modules['kmer_content']
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.scripts
import java.io.File
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
class FastqcToContams(val root: Configurable) extends PythonCommandLineFunction {
setPythonScript("__init__.py", "pyfastqc/")
setPythonScript("fastqc_contam.py")
@Input(doc = "Fastqc output", shortName = "fastqc", required = true)
var fastqc_output: File = _
@Input(doc = "Contams input", shortName = "fastqc", required = false)
var contams_file: File = _
@Output(doc = "Output file", shortName = "out", required = true)
var out: File = _
def cmdLine = {
getPythonCommand +
required(fastqc_output.getParent()) +
required("-c", contams_file) +
" > " +
required(out)
}
}
......@@ -21,7 +21,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat, Seqstat }
import nl.lumc.sasc.biopet.scripts.{ FastqSync, FastqcToContams }
import nl.lumc.sasc.biopet.scripts.{ FastqSync }
class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment