Skip to content
Snippets Groups Projects
Commit 86e3b9eb authored by bow's avatar bow
Browse files

Merge branch 'feature-fix_contams' into 'develop'

Feature fix contams

Fix for #11

See merge request !69
parents 259e3372 1f864153
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
import argparse
import os
from pyfastqc import load_from_dir
def parse_contam_file(contam_file, delimiter='\t'):
"""Given a contaminant file, return a dictionary of contaminant sequences
names and their sequences.
Args:
contam_file -- path to contaminants file
delimiter -- ID, sequence delimiter in the contaminants file
"""
assert os.path.exists(contam_file), "Contaminant file %r does not exist" % \
contam_file
with open(contam_file, 'r') as source:
# read only lines not beginning with '#' and discard empty lines
lines = filter(None, (line.strip() for line in source if not
line.startswith('#')))
# parse contam seq lines into lists of [id, sequence]
parse = lambda line: filter(None, line.split(delimiter))
parsed = (parse(line) for line in lines)
# and create a dictionary, key=sequence id and value=sequence
contam_ref = {name: seq for name, seq in parsed}
return contam_ref
def get_contams_present(results_dir, contam_ref):
"""Given a path to a FastQC HTML results file, return the <div> tag of the
overrepresented sequences list.
Args:
results_dir -- Path to FastQC results directory.
"""
assert os.path.exists(results_dir), "Directory {0} not " \
"found.".format(results_dir)
fastqc = load_from_dir(results_dir)
contam_names = set([x[3] for x in fastqc.overrepresented_sequences.data])
in_sample = lambda rid: any([cid.startswith(rid) for cid in contam_names])
contams_present = {x: y for x, y in contam_ref.items() if in_sample(x)}
return contams_present
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('results_dir', type=str,
help='Path to FastQC result directory file')
parser.add_argument('-c', '--contam_file', type=str,
dest='contam_file',
help='Path to contaminant file')
parser.add_argument('-o', '--output', type=str,
dest='output',
help='Path to output file')
parser.add_argument('--seq-only',dest='seq_only',
action='store_true',
help='Whether to output contaminant sequences only or not')
args = parser.parse_args()
contam_ref = parse_contam_file(args.contam_file)
contam_ids = get_contams_present(args.results_dir, contam_ref)
if args.seq_only:
fmt_out = lambda cid, seq: seq
else:
fmt_out = lambda cid, seq: "{0}\t{1}".format(cid, seq)
if args.output is None:
for cid, seq in contam_ids.items():
print fmt_out(cid, seq)
else:
with open(args.output, 'w') as target:
for cid, seq in contam_ids.items():
target.write(fmt_out(cid, seq) + '\n')
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
"""
--------
pyfastqc
--------
Parser for FastQC results.
Provides a method and classes for parsing a FastQC run results.
Tested with FastQC 0.9.5 output.
"""
import os
def load_file(fname, mode='r', **kwargs):
"""Given a path to a FastQC data file or an open file object pointing to it,
return a `FastQC` object.
"""
if isinstance(fname, basestring):
with open(fname, mode, **kwargs) as fp:
return FastQC(fp)
else:
return FastQC(fname)
def load_from_dir(dirname, data_fname='fastqc_data.txt', mode='r', **kwargs):
"""Given a path to a FastQC results directory, return a `FastQC` object."""
assert os.path.exists(dirname), "Directory %r does not exist" % dirname
fqc_path = os.path.join(dirname, os.walk(dirname).next()[1][0], data_fname)
return load_file(fqc_path, mode, **kwargs)
class FastQCModule(object):
"""Class representing a FastQC analysis module."""
def __init__(self, raw_lines, end_mark='>>END_MODULE'):
self.raw_lines = raw_lines
self.end_mark = end_mark
self._status = None
self._name = None
self._data = self.parse()
def __repr__(self):
return '%s(%s)' % (self.__class__.__name__,
'[%r, ...]' % self.raw_lines[0])
def __str__(self):
return ''.join(self.raw_lines)
@property
def name(self):
"""Name of the module."""
return self._name
@property
def columns(self):
"""Columns in the module."""
return self._columns
@property
def data(self):
"""FastQC data."""
return self._data
@property
def status(self):
"""FastQC run status."""
return self._status
def parse(self):
"""Common parser for a FastQC module."""
# check that the last line is a proper end mark
assert self.raw_lines[-1].startswith(self.end_mark)
# parse name and status from first line
tokens = self.raw_lines[0].strip().split('\t')
name = tokens[0][2:]
self._name = name
status = tokens[-1]
assert status in ('pass', 'fail', 'warn'), "Unknown module status: %r" \
% status
self._status = status
# and column names from second line
columns = self.raw_lines[1][1:].strip().split('\t')
self._columns = columns
# the rest of the lines except the last one
data = []
for line in self.raw_lines[2:-1]:
cols = line.strip().split('\t')
data.append(cols)
# optional processing for different modules
if self.name == 'Basic Statistics':
data = {k: v for k, v in data}
return data
class FastQC(object):
"""Class representing results from a FastQC run."""
# module name -- attribute name mapping
_mod_map = {
'>>Basic Statistics': 'basic_statistics',
'>>Per base sequence quality': 'per_base_sequence_quality',
'>>Per sequence quality scores': 'per_sequence_quality_scores',
'>>Per base sequence content': 'per_base_sequence_content',
'>>Per base GC content': 'per_base_gc_content',
'>>Per sequence GC content': 'per_sequence_gc_content',
'>>Per base N content': 'per_base_n_content',
'>>Sequence Length Distribution': 'sequence_length_distribution',
'>>Sequence Duplication Levels': 'sequence_duplication_levels',
'>>Overrepresented sequences': 'overrepresented_sequences',
'>>Kmer content': 'kmer_content',
}
def __init__(self, fp):
# get file name
self.fname = fp.name
self._modules = {}
line = fp.readline()
while True:
tokens = line.strip().split('\t')
# break on EOF
if not line:
break
# parse version
elif line.startswith('##FastQC'):
self.version = line.strip().split()[1]
# parse individual modules
elif tokens[0] in self._mod_map:
attr = self._mod_map[tokens[0]]
raw_lines = self.read_module(fp, line, tokens[0])
self._modules[attr] = FastQCModule(raw_lines)
line = fp.readline()
def __repr__(self):
return '%s(%r)' % (self.__class__.__name__, self.fname)
def _filter_by_status(self, status):
return [x.name for x in self._modules.values() if x.status == status]
def read_module(self, fp, line, start_mark):
raw = [line]
while not line.startswith('>>END_MODULE'):
line = fp.readline()
raw.append(line)
if not line:
raise ValueError("Unexpected end of file in module %r" % line)
return raw
@property
def modules(self):
return self._modules
@property
def passes(self):
return self._filter_by_status('pass')
@property
def passes_num(self):
return len(self.passes)
@property
def warns(self):
return self._filter_by_status('warn')
@property
def warns_num(self):
return len(self.warns)
@property
def fails(self):
return self._filter_by_status('fail')
@property
def fails_num(self):
return len(self.fails)
@property
def basic_statistics(self):
return self._modules['basic_statistics']
@property
def per_base_sequence_quality(self):
return self._modules['per_base_sequence_quality']
@property
def per_sequence_quality_scores(self):
return self._modules['per_sequence_quality_scores']
@property
def per_base_sequence_content(self):
return self._modules['per_base_sequence_content']
@property
def per_base_gc_content(self):
return self._modules['per_base_gc_content']
@property
def per_sequence_gc_content(self):
return self._modules['per_sequence_gc_content']
@property
def per_base_n_content(self):
return self._modules['per_base_n_content']
@property
def sequence_length_distribution(self):
return self._modules['sequence_length_distribution']
@property
def sequence_duplication_levels(self):
return self._modules['sequence_duplication_levels']
@property
def overrepresented_sequences(self):
return self._modules['overrepresented_sequences']
@property
def kmer_content(self):
return self._modules['kmer_content']
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.scripts
import java.io.File
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
class FastqcToContams(val root: Configurable) extends PythonCommandLineFunction {
setPythonScript("__init__.py", "pyfastqc/")
setPythonScript("fastqc_contam.py")
@Input(doc = "Fastqc output", shortName = "fastqc", required = true)
var fastqc_output: File = _
@Input(doc = "Contams input", shortName = "fastqc", required = false)
var contams_file: File = _
@Output(doc = "Output file", shortName = "out", required = true)
var out: File = _
def cmdLine = {
getPythonCommand +
required(fastqc_output.getParent()) +
required("-c", contams_file) +
" > " +
required(out)
}
}
......@@ -28,12 +28,15 @@ import nl.lumc.sasc.biopet.core.config.Configurable
import scala.collection.mutable.Map
class Cutadapt(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Cutadapt(root) {
@Input(doc = "Fastq contams file", required = false)
var contams_file: File = _
var fastqc: Fastqc = _
override def beforeCmd() {
super.beforeCmd
getContamsFromFile
val foundAdapters = fastqc.getFoundAdapters.map(_.seq)
if (default_clip_mode == "3") opt_adapter ++= foundAdapters
else if (default_clip_mode == "5") opt_front ++= foundAdapters
else if (default_clip_mode == "both") opt_anywhere ++= foundAdapters
}
override def cmdLine = {
......@@ -46,24 +49,6 @@ class Cutadapt(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Cutada
}
}
def getContamsFromFile {
if (contams_file != null) {
if (contams_file.exists()) {
for (line <- Source.fromFile(contams_file).getLines) {
var s: String = line.substring(line.lastIndexOf("\t") + 1, line.size)
if (default_clip_mode == "3") opt_adapter += s
else if (default_clip_mode == "5") opt_front += s
else if (default_clip_mode == "both") opt_anywhere += s
else {
opt_adapter += s
logger.warn("Option default_clip_mode should be '3', '5' or 'both', falling back to default: '3'")
}
logger.info("Adapter: " + s + " found in: " + fastq_input)
}
} else logger.warn("File : " + contams_file + " does not exist")
}
}
def getSummary: Json = {
val trimR = """.*Trimmed reads: *(\d*) .*""".r
val tooShortR = """.*Too short reads: *(\d*) .*""".r
......
......@@ -13,11 +13,6 @@
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nl.lumc.sasc.biopet.pipelines.flexiprep
......@@ -52,6 +47,30 @@ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(r
return null // Could be default Sanger with a warning in the log
}
protected case class Sequence(name: String, seq: String)
def getFoundAdapters: List[Sequence] = {
def getSeqs(file: File) = {
if (file != null) {
(for (
line <- Source.fromFile(file).getLines(); if line.startsWith("#");
values = line.split("\t*") if values.size >= 2
) yield Sequence(values(0), values(1))).toList
} else Nil
}
val seqs = getSeqs(adapters) ::: getSeqs(contaminants)
val block = getDataBlock("Overrepresented sequences")
if (block == null) return Nil
val found = for (
line <- block if !line.startsWith("#");
values = line.split("\t") if values.size >= 4
) yield values(3)
seqs.filter(x => found.exists(_.startsWith(x.name)))
}
def getSummary: Json = {
val subfixs = Map("plot_duplication_levels" -> "Images/duplication_levels.png",
"plot_kmer_profiles" -> "Images/kmer_profiles.png",
......
......@@ -21,7 +21,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat, Seqstat }
import nl.lumc.sasc.biopet.scripts.{ FastqSync, FastqcToContams }
import nl.lumc.sasc.biopet.scripts.{ FastqSync }
class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
......@@ -101,7 +101,6 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
add(fastqc_R1)
summary.addFastqc(fastqc_R1)
outputFiles += ("fastqc_R1" -> fastqc_R1.output)
outputFiles += ("contams_R1" -> getContams(fastqc_R1, R1_name))
val md5sum_R1 = Md5sum(this, input_R1, outputDir)
add(md5sum_R1)
......@@ -112,7 +111,6 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
add(fastqc_R2)
summary.addFastqc(fastqc_R2, R2 = true)
outputFiles += ("fastqc_R2" -> fastqc_R2.output)
outputFiles += ("contams_R2" -> getContams(fastqc_R2, R2_name))
val md5sum_R2 = Md5sum(this, input_R2, outputDir)
add(md5sum_R2)
......@@ -120,15 +118,6 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
}
}
def getContams(fastqc: Fastqc, pairname: String): File = {
val fastqcToContams = new FastqcToContams(this)
fastqcToContams.fastqc_output = fastqc.output
fastqcToContams.out = new File(outputDir + pairname + ".contams.txt")
fastqcToContams.contams_file = fastqc.contaminants
add(fastqcToContams)
return fastqcToContams.out
}
def runTrimClip(R1_in: File, outDir: String, chunk: String): (File, File, List[File]) = {
runTrimClip(R1_in, new File(""), outDir, chunk)
}
......@@ -146,13 +135,13 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
var R2: File = new File(R2_in)
var deps: List[File] = if (paired) List(R1, R2) else List(R1)
val seqtkSeq_R1 = SeqtkSeq.apply(this, R1, swapExt(outDir, R1, R1_ext, ".sanger" + R1_ext), fastqc_R1)
val seqtkSeq_R1 = SeqtkSeq(this, R1, swapExt(outDir, R1, R1_ext, ".sanger" + R1_ext), fastqc_R1)
add(seqtkSeq_R1, isIntermediate = true)
R1 = seqtkSeq_R1.output
deps ::= R1
if (paired) {
val seqtkSeq_R2 = SeqtkSeq.apply(this, R2, swapExt(outDir, R2, R2_ext, ".sanger" + R2_ext), fastqc_R2)
val seqtkSeq_R2 = SeqtkSeq(this, R2, swapExt(outDir, R2, R2_ext, ".sanger" + R2_ext), fastqc_R2)
add(seqtkSeq_R2, isIntermediate = true)
R2 = seqtkSeq_R2.output
deps ::= R2
......@@ -171,7 +160,7 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
if (!skipClip) { // Adapter clipping
val cutadapt_R1 = Cutadapt(this, R1, swapExt(outDir, R1, R1_ext, ".clip" + R1_ext))
if (outputFiles.contains("contams_R1")) cutadapt_R1.contams_file = outputFiles("contams_R1")
cutadapt_R1.fastqc = fastqc_R1
add(cutadapt_R1, isIntermediate = true)
summary.addCutadapt(cutadapt_R1, R2 = false, chunk)
R1 = cutadapt_R1.fastq_output
......@@ -181,7 +170,7 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
if (paired) {
val cutadapt_R2 = Cutadapt(this, R2, swapExt(outDir, R2, R2_ext, ".clip" + R2_ext))
outputFiles += ("cutadapt_R2_stats" -> cutadapt_R2.stats_output)
if (outputFiles.contains("contams_R2")) cutadapt_R2.contams_file = outputFiles("contams_R2")
cutadapt_R2.fastqc = fastqc_R2
add(cutadapt_R2, isIntermediate = true)
summary.addCutadapt(cutadapt_R2, R2 = true, chunk)
R2 = cutadapt_R2.fastq_output
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment