Commit e1eb4765 authored by Wai Yi Leung's avatar Wai Yi Leung
Browse files

Merge branch 'develop' of git.lumc.nl:biopet/biopet into feature-yamsvp

parents ab220766 1dc0cde8
......@@ -11,13 +11,15 @@ import org.broadinstitute.gatk.queue.extensions.gatk.CommandLineGATK
trait GatkGeneral extends CommandLineGATK with BiopetJavaCommandLineFunction {
memoryLimit = Option(3)
if (config.contains("gatk_jar")) jarFile = config("gatk_jar")
override def subPath = "gatk" :: super.subPath
jarFile = config("gatk_jar", required = true)
override val defaultVmem = "7G"
if (config.contains("intervals", submodule = "gatk")) intervals = config("intervals", submodule = "gatk").asFileList
if (config.contains("exclude_intervals", submodule = "gatk")) excludeIntervals = config("exclude_intervals", submodule = "gatk").asFileList
reference_sequence = config("reference", submodule = "gatk")
gatk_key = config("gatk_key", submodule = "gatk")
if (config.contains("pedigree", submodule = "gatk")) pedigree = config("pedigree", submodule = "gatk").asFileList
if (config.contains("intervals")) intervals = config("intervals").asFileList
if (config.contains("exclude_intervals")) excludeIntervals = config("exclude_intervals").asFileList
reference_sequence = config("reference")
gatk_key = config("gatk_key")
if (config.contains("pedigree")) pedigree = config("pedigree").asFileList
}
......@@ -9,6 +9,7 @@ import nl.lumc.sasc.biopet.core.MultiSampleQScript
import nl.lumc.sasc.biopet.core.PipelineCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import htsjdk.samtools.SamReaderFactory
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping._
import scala.collection.JavaConversions._
import java.io.File
import nl.lumc.sasc.biopet.extensions.gatk.{ CombineVariants, CombineGVCFs }
......@@ -62,8 +63,8 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
}
val multisampleVariantcalling = new GatkVariantcalling(this) {
override protected lazy val configName = "gatkvariantcalling"
override def configPath: List[String] = "multisample" :: super.configPath
override def configName = "gatkvariantcalling"
override def configPath: List[String] = super.configPath ::: "multisample" :: Nil
}
def biopetScript() {
......@@ -97,8 +98,8 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
val allRawVcfFiles = for ((sampleID, sampleOutput) <- samplesOutput) yield sampleOutput.variantcalling.rawFilterVcfFile
val gatkVariantcalling = new GatkVariantcalling(this) {
override protected lazy val configName = "gatkvariantcalling"
override def configPath: List[String] = "multisample" :: super.configPath
override def configName = "gatkvariantcalling"
override def configPath: List[String] = super.configPath ::: "multisample" :: Nil
}
if (gatkVariantcalling.useMpileup) {
......@@ -132,7 +133,7 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = {
val sampleOutput = new SampleOutput
var libraryBamfiles: List[File] = List()
val sampleID: String = sampleConfig("ID").toString
val sampleID: String = getCurrentSample
sampleOutput.libraries = runLibraryJobs(sampleConfig)
val sampleDir = globalSampleDir + sampleID
for ((libraryID, libraryOutput) <- sampleOutput.libraries) {
......@@ -162,25 +163,43 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
// Called for each run from a sample
def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput = {
val libraryOutput = new LibraryOutput
val runID: String = runConfig("ID").toString
val sampleID: String = sampleConfig("ID").toString
val runID: String = getCurrentLibrary
val sampleID: String = getCurrentSample
val runDir: String = globalSampleDir + sampleID + "/run_" + runID + "/"
var inputType = ""
if (runConfig.contains("inputtype")) inputType = runConfig("inputtype").toString
else inputType = config("inputtype", default = "dna").toString
if (runConfig.contains("R1")) {
val mapping = Mapping.loadFromLibraryConfig(this, runConfig, sampleConfig, runDir)
var inputType: String = config("inputtype", default = "dna")
def loadFromLibraryConfig(startJobs: Boolean = true): Mapping = {
val mapping = new Mapping(this)
mapping.input_R1 = config("R1")
mapping.input_R2 = config("R2")
mapping.RGLB = runID
mapping.RGSM = sampleID
mapping.RGPL = config("PL")
mapping.RGPU = config("PU")
mapping.RGCN = config("CN")
mapping.outputDir = runDir
if (startJobs) {
mapping.init
mapping.biopetScript
}
return mapping
}
if (config.contains("R1")) {
val mapping = loadFromLibraryConfig()
addAll(mapping.functions) // Add functions of mapping to curent function pool
libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile")
} else if (runConfig.contains("bam")) {
var bamFile = new File(runConfig("bam").toString)
} else if (config.contains("bam")) {
var bamFile: File = config("bam")
if (!bamFile.exists) throw new IllegalStateException("Bam in config does not exist, file: " + bamFile)
if (config("bam_to_fastq", default = false).asBoolean) {
val samToFastq = SamToFastq(this, bamFile, runDir + sampleID + "-" + runID + ".R1.fastq",
runDir + sampleID + "-" + runID + ".R2.fastq")
add(samToFastq, isIntermediate = true)
val mapping = Mapping.loadFromLibraryConfig(this, runConfig, sampleConfig, runDir, startJobs = false)
val mapping = loadFromLibraryConfig(startJobs = false)
mapping.input_R1 = samToFastq.fastqR1
mapping.input_R2 = samToFastq.fastqR2
mapping.init
......@@ -205,11 +224,9 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
aorrg.RGID = sampleID + "-" + runID
aorrg.RGLB = runID
aorrg.RGSM = sampleID
if (runConfig.contains("PL")) aorrg.RGPL = runConfig("PL").toString
else aorrg.RGPL = "illumina"
if (runConfig.contains("PU")) aorrg.RGPU = runConfig("PU").toString
else aorrg.RGPU = "na"
if (runConfig.contains("CN")) aorrg.RGCN = runConfig("CN").toString
aorrg.RGPL = config("PL", default = "illumina")
aorrg.RGPU = config("PU", default = "na")
aorrg.RGCN = config("CN")
add(aorrg, isIntermediate = true)
bamFile = aorrg.output
} else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile +
......@@ -219,7 +236,10 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
libraryOutput.mappedBamFile = bamFile
}
} else logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig)
} else {
logger.error("Sample: " + sampleID + ": No R1 found for run: " + runID)
return libraryOutput
}
val gatkVariantcalling = new GatkVariantcalling(this)
gatkVariantcalling.inputBams = List(libraryOutput.mappedBamFile)
......
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
import argparse
import os
from pyfastqc import load_from_dir
def parse_contam_file(contam_file, delimiter='\t'):
    """Read a contaminant reference file into a dictionary.

    Returns a dictionary mapping contaminant sequence names to their
    sequences.

    Args:
        contam_file -- path to contaminants file
        delimiter -- ID, sequence delimiter in the contaminants file
    """
    assert os.path.exists(contam_file), "Contaminant file %r does not exist" % \
        contam_file
    contam_ref = {}
    with open(contam_file, 'r') as source:
        for raw_line in source:
            # comment lines are skipped entirely
            if raw_line.startswith('#'):
                continue
            stripped = raw_line.strip()
            # blank lines are discarded as well
            if not stripped:
                continue
            # split into [id, sequence], dropping any empty fields
            name, seq = [field for field in stripped.split(delimiter) if field]
            contam_ref[name] = seq
    return contam_ref
def get_contams_present(results_dir, contam_ref):
    """Return the contaminants from `contam_ref` that FastQC flagged as
    overrepresented sequences in the given results directory.

    Args:
        results_dir -- Path to FastQC results directory.
        contam_ref -- dict of contaminant name -> sequence (as produced
            by parse_contam_file).
    """
    assert os.path.exists(results_dir), "Directory {0} not " \
        "found.".format(results_dir)
    fastqc = load_from_dir(results_dir)
    # column index 3 of the overrepresented-sequences rows holds the
    # "Possible Source" identifier reported by FastQC
    contam_names = set(row[3] for row in fastqc.overrepresented_sequences.data)
    contams_present = {}
    for name, seq in contam_ref.items():
        # a contaminant is "present" when any reported identifier starts
        # with its reference name
        if any(cid.startswith(name) for cid in contam_names):
            contams_present[name] = seq
    return contams_present
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('results_dir', type=str,
                        help='Path to FastQC result directory file')
    parser.add_argument('-c', '--contam_file', type=str,
                        dest='contam_file',
                        help='Path to contaminant file')
    parser.add_argument('-o', '--output', type=str,
                        dest='output',
                        help='Path to output file')
    parser.add_argument('--seq-only', dest='seq_only',
                        action='store_true',
                        help='Whether to output contaminant sequences only or not')
    args = parser.parse_args()

    contam_ref = parse_contam_file(args.contam_file)
    contam_ids = get_contams_present(args.results_dir, contam_ref)

    # emit either the bare sequence or "<id>\t<sequence>" per contaminant
    if args.seq_only:
        fmt_out = lambda cid, seq: seq
    else:
        fmt_out = lambda cid, seq: "{0}\t{1}".format(cid, seq)

    if args.output is None:
        for cid, seq in contam_ids.items():
            # print() with a single argument works on both Python 2 and 3;
            # the original bare print statement is a SyntaxError on Python 3
            print(fmt_out(cid, seq))
    else:
        with open(args.output, 'w') as target:
            for cid, seq in contam_ids.items():
                target.write(fmt_out(cid, seq) + '\n')
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
# prep_deepsage.py
#
# Script for preprocessing deepSAGE sequencing samples.
#
# Adapted from: http://galaxy.nbic.nl/u/pacthoen/w/deepsagelumc
import argparse
from Bio import SeqIO
PREFIX = 'CATG'
def prep_deepsage(input_fastq, output_fastq, is_gzip, seq):
    """Prepend `seq` to every read in a FASTQ file and write the result.

    Args:
        input_fastq -- path to the input FASTQ file
        output_fastq -- path to the output FASTQ file
        is_gzip -- whether the input file is gzip-compressed
        seq -- sequence string to prepend to each read
    """
    if is_gzip:
        # only needed for compressed input, so import lazily
        import gzip
        opener = gzip.GzipFile
    else:
        opener = open
    with opener(input_fastq, 'r') as in_handle, open(output_fastq, 'w') as \
            out_handle:
        # adapted from fastools.fastools.add
        # not importing since we also need to cut away parts of the sequence
        for record in SeqIO.parse(in_handle, 'fastq'):
            # the quality annotation must be detached before the sequence
            # may be modified, then re-attached afterwards
            orig_qual = record.letter_annotations['phred_quality']
            record.letter_annotations = {}
            record.seq = seq + record.seq
            # the prepended bases are given a fixed phred score of 40
            record.letter_annotations = {'phred_quality':
                                         [40] * len(seq) + orig_qual}
            SeqIO.write(record, out_handle, "fastq")
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input_fastq', type=str,
                        help="Path to input FASTQ file")
    parser.add_argument('-o', '--output', type=str,
                        dest='output_fastq',
                        default="prep_deepsage_output.fq",
                        help="Path to output FASTQ file")
    parser.add_argument('--gzip', dest='gzip',
                        action='store_true',
                        help="Whether input FASTQ file is gzipped or not.")
    # fixed: the original help text was copy-pasted from --gzip
    parser.add_argument('--prefix', dest='prefix', type=str,
                        default=PREFIX,
                        help="Sequence to prepend to each read.")
    args = parser.parse_args()
    prep_deepsage(args.input_fastq, args.output_fastq, args.gzip, args.prefix)
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
"""
--------
pyfastqc
--------
Parser for FastQC results.
Provides a method and classes for parsing a FastQC run results.
Tested with FastQC 0.9.5 output.
"""
import os
def load_file(fname, mode='r', **kwargs):
    """Given a path to a FastQC data file or an open file object pointing to it,
    return a `FastQC` object.
    """
    # `basestring` only exists on Python 2; fall back to `str` so that
    # path arguments are also recognized when running on Python 3
    try:
        path_types = basestring
    except NameError:
        path_types = str
    if isinstance(fname, path_types):
        # we opened the file here, so close it again once parsed
        with open(fname, mode, **kwargs) as fp:
            return FastQC(fp)
    else:
        # caller-supplied file object: parse it but leave closing to the caller
        return FastQC(fname)
def load_from_dir(dirname, data_fname='fastqc_data.txt', mode='r', **kwargs):
    """Given a path to a FastQC results directory, return a `FastQC` object.

    `dirname` is expected to contain one extracted FastQC output
    directory, which in turn holds `data_fname`.
    """
    assert os.path.exists(dirname), "Directory %r does not exist" % dirname
    # next(...) builtin instead of the Python-2-only .next() method, so this
    # works on Python 2.6+ and Python 3; [1][0] selects the first
    # subdirectory of `dirname`
    fqc_path = os.path.join(dirname, next(os.walk(dirname))[1][0], data_fname)
    return load_file(fqc_path, mode, **kwargs)
class FastQCModule(object):
    """Represents one FastQC analysis module section.

    Parses the raw text lines of a single module -- from its '>>' header
    line up to and including the '>>END_MODULE' marker -- into a module
    name, a pass/fail/warn status, column names, and row data.
    """

    def __init__(self, raw_lines, end_mark='>>END_MODULE'):
        self.raw_lines = raw_lines
        self.end_mark = end_mark
        self._status = None
        self._name = None
        self._data = self.parse()

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                           '[%r, ...]' % self.raw_lines[0])

    def __str__(self):
        return ''.join(self.raw_lines)

    @property
    def name(self):
        """Name of this module."""
        return self._name

    @property
    def columns(self):
        """Column names of this module's data table."""
        return self._columns

    @property
    def data(self):
        """Parsed data rows of this module."""
        return self._data

    @property
    def status(self):
        """Status of this module: 'pass', 'fail', or 'warn'."""
        return self._status

    def parse(self):
        """Parse the raw module lines into row data.

        Returns a list of row lists, except for the Basic Statistics
        module, whose key/value rows are returned as a dict.
        """
        # the module must be properly terminated by the end mark
        assert self.raw_lines[-1].startswith(self.end_mark)
        # first line: '>>' + module name, tab-separated, status token last
        header_tokens = self.raw_lines[0].strip().split('\t')
        self._name = header_tokens[0][2:]
        status = header_tokens[-1]
        assert status in ('pass', 'fail', 'warn'), "Unknown module status: %r" \
            % status
        self._status = status
        # second line: '#'-prefixed, tab-separated column names
        self._columns = self.raw_lines[1][1:].strip().split('\t')
        # remaining lines up to (excluding) the end mark are data rows
        data = [row.strip().split('\t') for row in self.raw_lines[2:-1]]
        # optional processing for different modules
        if self.name == 'Basic Statistics':
            data = {key: value for key, value in data}
        return data
class FastQC(object):
    """Class representing results from a FastQC run.

    Parses an open 'fastqc_data.txt' file handle into FastQCModule
    objects, one per recognized module section, exposed both via the
    `modules` dict and via named properties.
    """
    # module name -- attribute name mapping
    _mod_map = {
        '>>Basic Statistics': 'basic_statistics',
        '>>Per base sequence quality': 'per_base_sequence_quality',
        '>>Per sequence quality scores': 'per_sequence_quality_scores',
        '>>Per base sequence content': 'per_base_sequence_content',
        '>>Per base GC content': 'per_base_gc_content',
        '>>Per sequence GC content': 'per_sequence_gc_content',
        '>>Per base N content': 'per_base_n_content',
        '>>Sequence Length Distribution': 'sequence_length_distribution',
        '>>Sequence Duplication Levels': 'sequence_duplication_levels',
        '>>Overrepresented sequences': 'overrepresented_sequences',
        '>>Kmer content': 'kmer_content',
    }

    def __init__(self, fp):
        """Parse a FastQC data file.

        fp -- open file object for the FastQC data file; must expose a
              'name' attribute (a regular file object does).
        """
        # get file name
        self.fname = fp.name
        self._modules = {}
        line = fp.readline()
        # read line by line; module sections are consumed in one gulp by
        # read_module, so this loop only sees headers and filler lines
        while True:
            tokens = line.strip().split('\t')
            # break on EOF
            if not line:
                break
            # parse version
            elif line.startswith('##FastQC'):
                self.version = line.strip().split()[1]
            # parse individual modules
            elif tokens[0] in self._mod_map:
                attr = self._mod_map[tokens[0]]
                raw_lines = self.read_module(fp, line, tokens[0])
                self._modules[attr] = FastQCModule(raw_lines)
            line = fp.readline()

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.fname)

    def _filter_by_status(self, status):
        """Return the names of all modules whose status equals `status`."""
        return [x.name for x in self._modules.values() if x.status == status]

    def read_module(self, fp, line, start_mark):
        """Collect all raw lines of one module, from the header `line`
        through the '>>END_MODULE' marker (inclusive).

        Raises ValueError when EOF is reached before the end marker.
        NOTE(review): `start_mark` is currently unused.
        """
        raw = [line]
        while not line.startswith('>>END_MODULE'):
            line = fp.readline()
            raw.append(line)
            # readline() returns '' at EOF; an unterminated module is an error
            if not line:
                raise ValueError("Unexpected end of file in module %r" % line)
        return raw

    @property
    def modules(self):
        """Dict mapping attribute names to parsed FastQCModule objects."""
        return self._modules

    @property
    def passes(self):
        """Names of all modules with status 'pass'."""
        return self._filter_by_status('pass')

    @property
    def passes_num(self):
        return len(self.passes)

    @property
    def warns(self):
        """Names of all modules with status 'warn'."""
        return self._filter_by_status('warn')

    @property
    def warns_num(self):
        return len(self.warns)

    @property
    def fails(self):
        """Names of all modules with status 'fail'."""
        return self._filter_by_status('fail')

    @property
    def fails_num(self):
        return len(self.fails)

    # one accessor per known FastQC module; these raise KeyError if the
    # module section was absent from the parsed file
    @property
    def basic_statistics(self):
        return self._modules['basic_statistics']

    @property
    def per_base_sequence_quality(self):
        return self._modules['per_base_sequence_quality']

    @property
    def per_sequence_quality_scores(self):
        return self._modules['per_sequence_quality_scores']

    @property
    def per_base_sequence_content(self):
        return self._modules['per_base_sequence_content']

    @property
    def per_base_gc_content(self):
        return self._modules['per_base_gc_content']

    @property
    def per_sequence_gc_content(self):
        return self._modules['per_sequence_gc_content']

    @property
    def per_base_n_content(self):
        return self._modules['per_base_n_content']

    @property
    def sequence_length_distribution(self):
        return self._modules['sequence_length_distribution']

    @property
    def sequence_duplication_levels(self):
        return self._modules['sequence_duplication_levels']

    @property
    def overrepresented_sequences(self):
        return self._modules['overrepresented_sequences']

    @property
    def kmer_content(self):
        return self._modules['kmer_content']