Commit 19d495c5 authored by Peter van 't Hof

Merge branch 'release-0.2.0' into 'master'

Release 0.2.0

See merge request !64
parents 187bae8b 1572cb02
package nl.lumc.sasc.biopet.core
import org.broadinstitute.gatk.queue.util.Logging
trait PipelineCommand extends MainCommand with Logging {
def pipeline = "/" + getClass.getName.stripSuffix("$").replaceAll("\\.", "/") + ".class"
def main(args: Array[String]): Unit = {
var argv: Array[String] = Array()
argv ++= Array("-S", pipeline)
argv ++= args
BiopetQCommandLine.main(argv)
}
}
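// For context, a concrete pipeline opts into this mechanism simply by pairing its QScript with a
// companion object that extends PipelineCommand, as Basty does further down in this merge. A
// minimal sketch (MyPipeline is a hypothetical QScript in this package):
//
//   object MyPipeline extends PipelineCommand
//
// The object's class name is what `pipeline` turns into the "/nl/lumc/sasc/biopet/core/MyPipeline.class"
// resource path handed to BiopetQCommandLine through "-S".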
package nl.lumc.sasc.biopet.core
import java.text.SimpleDateFormat
import java.util.Calendar
import org.apache.log4j.Logger
import org.apache.log4j.WriterAppender
import org.apache.log4j.helpers.DateLayout
import org.apache.log4j.spi.LoggingEvent
import java.io.File
trait ToolCommand extends MainCommand {
abstract class AbstractArgs {
}
abstract class AbstractOptParser extends scopt.OptionParser[Args](commandName) {
opt[Unit]("log_nostderr") foreach { _ =>
logger.removeAppender(stderrAppender) } text("No output to stderr")
opt[File]("log_file") foreach { x =>
logger.addAppender(new WriterAppender(logLayout, new java.io.PrintStream(x))) } text("Log file") valueName("<file>")
opt[String]('l', "log_level") foreach { x =>
x.toLowerCase match {
case "debug" => logger.setLevel(org.apache.log4j.Level.DEBUG)
case "info" => logger.setLevel(org.apache.log4j.Level.INFO)
case "warn" => logger.setLevel(org.apache.log4j.Level.WARN)
case "error" => logger.setLevel(org.apache.log4j.Level.ERROR)
case _ =>
} } text("Log level") validate { x => x match {
case "debug" | "info" | "warn" | "error" => success
case _ => failure("Log level must be <debug/info/warn/error>")
}
}
opt[Unit]('h', "help") foreach { _ =>
System.err.println(this.usage); sys.exit(1)} text("Print usage")
}
type Args <: AbstractArgs
type OptParser <: AbstractOptParser
protected val logger = Logger.getLogger(commandName)
private val logLayout = new DateLayout() {
val ignoresThrowable = false
def format(event: LoggingEvent): String = {
val calendar: Calendar = Calendar.getInstance
calendar.setTimeInMillis(event.getTimeStamp)
val formatter: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val formattedDate: String = formatter.format(calendar.getTime)
var logLevel = event.getLevel.toString
while (logLevel.size < 6) logLevel += " "
logLevel + " [" + formattedDate + "] [" + event.getLoggerName + "] " + event.getMessage + "\n"
}
}
private val stderrAppender = new WriterAppender(logLayout, sys.process.stderr)
logger.setLevel(org.apache.log4j.Level.INFO)
logger.addAppender(stderrAppender)
}
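// Usage sketch (illustrative only, not part of this commit): a command-line tool wires into
// ToolCommand by providing an Args case class and an OptParser, which is exactly the pattern
// VcfFilter follows below. MyTool and its single option are hypothetical, and java.io.File is
// assumed to be imported.
//
//   object MyTool extends ToolCommand {
//     case class Args(inputFile: File = null) extends AbstractArgs
//
//     class OptParser extends AbstractOptParser {
//       opt[File]('i', "input") required () valueName ("<file>") action { (x, c) =>
//         c.copy(inputFile = x) } text ("Input file")
//     }
//
//     def main(args: Array[String]): Unit = {
//       val commandArgs: Args = new OptParser().parse(args, Args()) getOrElse sys.exit(1)
//       logger.info("Running on " + commandArgs.inputFile)
//     }
//   }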
package nl.lumc.sasc.biopet.pipelines.basty
import nl.lumc.sasc.biopet.core.MultiSampleQScript
import nl.lumc.sasc.biopet.core.PipelineCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline
import org.broadinstitute.gatk.queue.QScript
class Basty (val root: Configurable) extends QScript with MultiSampleQScript {
def this() = this(null)
class LibraryOutput extends AbstractLibraryOutput {
}
class SampleOutput extends AbstractSampleOutput {
}
defaults ++= Map("ploidy" -> 1, "use_haplotypecaller" -> false, "use_unifiedgenotyper" -> true)
var gatkPipeline: GatkPipeline = _
def init() {
gatkPipeline = new GatkPipeline(this)
gatkPipeline.outputDir = outputDir
gatkPipeline.init
}
def biopetScript() {
gatkPipeline.biopetScript
addAll(gatkPipeline.functions)
runSamplesJobs()
}
// Called for each sample
def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = {
val sampleOutput = new SampleOutput
val sampleID: String = sampleConfig("ID").toString
val sampleDir = globalSampleDir + sampleID
sampleOutput.libraries = runLibraryJobs(sampleConfig)
return sampleOutput
}
// Called for each run from a sample
def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput = {
val libraryOutput = new LibraryOutput
val runID: String = runConfig("ID").toString
val sampleID: String = sampleConfig("ID").toString
val runDir: String = globalSampleDir + sampleID + "/run_" + runID + "/"
return libraryOutput
}
}
object Basty extends PipelineCommand
package nl.lumc.sasc.biopet.tools
import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder
import htsjdk.variant.vcf.VCFFileReader
import java.io.File
import java.util.ArrayList
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
import scala.collection.JavaConversions._
class VcfFilter(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Input(doc = "Input vcf", shortName = "I", required = true)
var inputVcf: File = _
@Output(doc = "Output vcf", shortName = "o", required = false)
var outputVcf: File = _
var minSampleDepth: Option[Int] = _
var minTotalDepth: Option[Int] = _
var minAlternateDepth: Option[Int] = _
var minSamplesPass: Option[Int] = _
var filterRefCalls: Boolean = _
override val defaultVmem = "8G"
memoryLimit = Option(4.0)
override def afterGraph {
minSampleDepth = config("min_sample_depth")
minTotalDepth = config("min_total_depth")
minAlternateDepth = config("min_alternate_depth")
minSamplesPass = config("min_samples_pass")
filterRefCalls = config("filter_ref_calls")
}
override def commandLine = super.commandLine +
required("-I", inputVcf) +
required("-o", outputVcf) +
optional("--minSampleDepth", minSampleDepth) +
optional("--minTotalDepth", minTotalDepth) +
optional("--minAlternateDepth", minAlternateDepth) +
optional("--minSamplesPass", minSamplesPass) +
conditional(filterRefCalls, "--filterRefCalls")
}
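// Sketch of how a pipeline QScript would typically use the wrapper above (illustrative; the file
// names are placeholders and `this` refers to the enclosing QScript, which is a Configurable):
//
//   val vcfFilter = new VcfFilter(this)
//   vcfFilter.inputVcf = new File(outputDir + "merged.vcf.gz")
//   vcfFilter.outputVcf = new File(outputDir + "filtered.vcf.gz")
//   add(vcfFilter)
//
// The thresholds are not set here directly: afterGraph pulls them from the config (keys
// "min_sample_depth", "min_total_depth", "min_alternate_depth", "min_samples_pass" and
// "filter_ref_calls") and commandLine turns them into the matching tool options.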
object VcfFilter extends ToolCommand {
case class Args (inputVcf:File = null, outputVcf:File = null, minSampleDepth: Int = -1, minTotalDepth: Int = -1,
minAlternateDepth: Int = -1, minSamplesPass: Int = 0, minBamAlternateDepth: Int = 0, filterRefCalls: Boolean = false) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('I', "inputVcf") required() maxOccurs(1) valueName("<file>") action { (x, c) =>
c.copy(inputVcf = x) }
opt[File]('o', "outputVcf") required() maxOccurs(1) valueName("<file>") action { (x, c) =>
c.copy(outputVcf = x) } text("Output VCF file")
opt[Int]("minSampleDepth") unbounded() action { (x, c) =>
c.copy(minSampleDepth = x ) }
opt[Int]("minTotalDepth") unbounded() action { (x, c) =>
c.copy(minTotalDepth = x ) }
opt[Int]("minAlternateDepth") unbounded() action { (x, c) =>
c.copy(minAlternateDepth = x) }
opt[Int]("minSamplesPass") unbounded() action { (x, c) =>
c.copy(minSamplesPass = x) }
opt[Int]("minBamAlternateDepth") unbounded() action { (x, c) =>
c.copy(minBamAlternateDepth = x) }
opt[Unit]("filterRefCalls") unbounded() action { (x, c) =>
c.copy(filterRefCalls = true) }
}
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
val argsParser = new OptParser
val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val reader = new VCFFileReader(commandArgs.inputVcf, false)
val header = reader.getFileHeader
val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().setOutputFile(commandArgs.outputVcf).build)
writer.writeHeader(header)
val bamADFields = (for (line <- header.getInfoHeaderLines if line.getID.startsWith("BAM-AD-")) yield line.getID).toList
val bamDPFields = (for (line <- header.getInfoHeaderLines if line.getID.startsWith("BAM-DP-")) yield line.getID).toList
for (record <- reader) {
val genotypes = for (genotype <- record.getGenotypes) yield {
val DP = if (genotype.hasDP) genotype.getDP else -1
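// getAD returns per-allele depths: index 0 is the reference allele, the remaining entries are the alternate alleles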
val AD = if (genotype.hasAD) List(genotype.getAD:_*) else Nil
DP >= commandArgs.minSampleDepth &&
(if (!AD.isEmpty) AD.tail.count(_ >= commandArgs.minAlternateDepth) > 0 else true) &&
!(commandArgs.filterRefCalls && genotype.isHomRef)
}
val bamADvalues = (for (field <- bamADFields) yield {
record.getAttribute(field, new ArrayList) match {
case t:ArrayList[_] if t.length > 1 => {
for (i <- 1 until t.size) yield {
t(i) match {
case a:Int => a > commandArgs.minBamAlternateDepth
case a:String => a.toInt > commandArgs.minBamAlternateDepth
case _ => false
}
}
}
case _ => List(false)
}
}).flatten
if (record.getAttributeAsInt("DP", -1) >= commandArgs.minTotalDepth &&
genotypes.count(_ == true) >= commandArgs.minSamplesPass &&
(commandArgs.minBamAlternateDepth <= 0 || bamADvalues.count(_ == true) >= commandArgs.minSamplesPass))
writer.add(record)
}
reader.close
writer.close
}
}
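// For reference, the parser above accepts invocations of the following shape (the jar name is an
// assumption; the option names are taken from the OptParser definition):
//
//   java -cp Biopet.jar nl.lumc.sasc.biopet.tools.VcfFilter \
//     -I input.vcf -o filtered.vcf --minSampleDepth 8 --minAlternateDepth 2 --filterRefCalls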
/**
* Copyright (c) 2014 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl>
* @author Wibowo Arindrarto <w.arindrarto@lumc.nl>
*/
package nl.lumc.sasc.biopet.tools
import java.io.{ File, IOException }
import scala.collection.JavaConverters._
import htsjdk.samtools.AlignmentBlock
import htsjdk.samtools.SAMFileReader
import htsjdk.samtools.SAMFileReader.QueryInterval
import htsjdk.samtools.SAMFileWriterFactory
import htsjdk.samtools.SAMRecord
import htsjdk.tribble.AbstractFeatureReader.getFeatureReader
import htsjdk.tribble.Feature
import htsjdk.tribble.BasicFeature
import htsjdk.tribble.bed.BEDCodec
import htsjdk.tribble.index.interval.{ Interval, IntervalTree }
import orestes.bloomfilter.HashProvider.HashMethod
import orestes.bloomfilter.{ BloomFilter, FilterBuilder }
import org.apache.commons.io.FilenameUtils.getExtension
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable
// TODO: finish implementation for usage in pipelines
/**
* WipeReads function class for usage in Biopet pipelines
*
* @param root Configuration object for the pipeline
*/
class WipeReads(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Input(doc = "Input BAM file (must be indexed)", shortName = "I", required = true)
var inputBAM: File = _
@Output(doc = "Output BAM", shortName = "o", required = true)
var outputBAM: File = _
}
object WipeReads extends ToolCommand {
/** Function to check whether one feature contains the other */
private def contains(feat1: Feature, feat2: Feature): Boolean =
if (feat1.getChr != feat2.getChr)
false
else
feat1.getStart <= feat2.getStart && feat1.getEnd >= feat2.getEnd
/** Function to check whether two features overlap each other */
private def overlaps(feat1: Feature, feat2: Feature): Boolean =
if (feat1.getChr != feat2.getChr)
false
else
feat1.getStart <= feat2.getStart && feat1.getEnd >= feat2.getStart
/** Function to merge two overlapping intervals
* NOTE: we assume subtypes of Feature have a constructor with a (chr, start, end) signature */
private def merge[T <: Feature](feat1: T, feat2: T): T = {
if (feat1.getChr != feat2.getChr)
throw new IllegalArgumentException("Can not merge features from different chromosomes")
if (contains(feat1, feat2))
feat1
// FIXME: can we avoid casting here?
else if (overlaps(feat1, feat2))
new BasicFeature(feat1.getChr, feat1.getStart, feat2.getEnd).asInstanceOf[T]
else
throw new IllegalArgumentException("Can not merge non-overlapping RawInterval objects")
}
/** Function to create an iterator yielding non-overlapped Feature objects */
private def mergeOverlappingIntervals[T <: Feature](ri: Iterator[T]): Iterator[T] =
ri.toList
.sortBy(x => (x.getChr, x.getStart, x.getEnd))
.foldLeft(List.empty[T]) {
(acc, i) => acc match {
case head :: tail if overlaps(head, i) => merge(head, i) :: tail
case _ => i :: acc
}}
.toIterator
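// Worked example of the fold above (hypothetical coordinates): given the sorted features
//   (chr1, 100, 200), (chr1, 150, 250), (chr1, 300, 400)
// the first two overlap and collapse into (chr1, 100, 250), while the third does not overlap the
// running head, so the iterator ends up yielding the two features (chr1, 100, 250) and
// (chr1, 300, 400) (prepending means they come out in reverse coordinate order, which is fine
// because downstream consumers re-sort or group them).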
/**
* Function to create iterator over intervals from input interval file
*
* @param inFile input interval file
*/
def makeFeatureFromFile(inFile: File): Iterator[Feature] = {
logger.info("Parsing interval file ...")
/** Function to create iterator from BED file */
def makeFeatureFromBED(inFile: File): Iterator[Feature] =
asScalaIteratorConverter(getFeatureReader(inFile.toPath.toString, new BEDCodec(), false).iterator).asScala
// TODO: implementation
/** Function to create iterator from refFlat file */
def makeFeatureFromRefFlat(inFile: File): Iterator[Feature] = ???
// convert coordinate to 1-based fully closed
// parse chrom, start blocks, end blocks, strands
// TODO: implementation
/** Function to create iterator from GTF file */
def makeFeatureFromGTF(inFile: File): Iterator[Feature] = ???
// convert coordinate to 1-based fully closed
// parse chrom, start blocks, end blocks, strands
// detect interval file format from extension
val iterFunc: (File => Iterator[Feature]) =
if (getExtension(inFile.toString.toLowerCase) == "bed")
makeFeatureFromBED
else
throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath)
mergeOverlappingIntervals(iterFunc(inFile))
}
def bloomParamsOk(bloomSize: Long, bloomFp: Double): Boolean = {
val optimalArraySize = FilterBuilder.optimalM(bloomSize, bloomFp)
// we are currently limited to maximum integer size
// if the optimal array size equals or exceeds it, we assume
// that it's a result of a truncation and return false
optimalArraySize < Int.MaxValue
}
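// Rough illustration (numbers are approximate): for bloomSize = 100 million reads at
// bloomFp = 1e-10 the optimal bit array needs ~4.8e9 bits, which exceeds Int.MaxValue, so the
// check returns false; at bloomFp = 1e-3 it needs ~1.4e9 bits and the parameters are accepted.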
// TODO: set minimum fraction for overlap
/**
* Function to create function to check SAMRecord for exclusion in filtered BAM file.
*
* The returned function evaluates to true for SAMRecords that must be excluded and to false for records that are kept (filtered in).
*
* @param iv iterator yielding Feature objects (target intervals)
* @param inBAM input BAM file
* @param inBAMIndex input BAM file index
* @param filterOutMulti whether to filter out reads with same name outside target region (default: true)
* @param minMapQ minimum MapQ of reads in target region to filter out (default: 0)
* @param readGroupIDs read group IDs of reads in target region to filter out (default: all IDs)
* @param bloomSize expected size of elements to contain in the Bloom filter
* @param bloomFp expected Bloom filter false positive rate
* @return function that checks whether a SAMRecord or String is to be excluded
*/
def makeFilterOutFunction(iv: Iterator[Feature],
inBAM: File, inBAMIndex: File = null,
filterOutMulti: Boolean = true,
minMapQ: Int = 0, readGroupIDs: Set[String] = Set(),
bloomSize: Long, bloomFp: Double): (SAMRecord => Boolean) = {
logger.info("Building set of reads to exclude ...")
// TODO: implement optional index creation
/** Function to check for BAM file index and return a SAMFileReader given a File */
def prepIndexedInputBAM(): SAMFileReader =
if (inBAMIndex != null)
new SAMFileReader(inBAM, inBAMIndex)
else {
val sfr = new SAMFileReader(inBAM)
sfr.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT)
if (!sfr.hasIndex)
throw new IllegalStateException("Input BAM file must be indexed")
else
sfr
}
/**
* Function to query intervals from a BAM file
*
* The function still works when only either one of the interval or BAM
* file contig is prepended with "chr"
*
* @param inBAM BAM file to query as SAMFileReader
* @param feat feature object containing query
* @return QueryInterval wrapped in Option
*/
def monadicMakeQueryInterval(inBAM: SAMFileReader, feat: Feature): Option[QueryInterval] =
if (inBAM.getFileHeader.getSequenceIndex(feat.getChr) > -1)
Some(inBAM.makeQueryInterval(feat.getChr, feat.getStart, feat.getEnd))
else if (feat.getChr.startsWith("chr")
&& inBAM.getFileHeader.getSequenceIndex(feat.getChr.substring(3)) > -1)
Some(inBAM.makeQueryInterval(feat.getChr.substring(3), feat.getStart, feat.getEnd))
else if (!feat.getChr.startsWith("chr")
&& inBAM.getFileHeader.getSequenceIndex("chr" + feat.getChr) > -1)
Some(inBAM.makeQueryInterval("chr" + feat.getChr, feat.getStart, feat.getEnd))
else
None
/** function to make IntervalTree from our RawInterval objects
*
* @param ifeat iterable over feature objects
* @return IntervalTree objects, filled with intervals from RawInterval
*/
def makeIntervalTree[T <: Feature](ifeat: Iterable[T]): IntervalTree = {
val ivt = new IntervalTree
for (iv <- ifeat)
ivt.insert(new Interval(iv.getStart, iv.getEnd))
ivt
}
/**
* Function to ensure that a SAMRecord overlaps our target regions
*
* This is required because htsjdk's queryOverlap method does not take into
* account the SAMRecord splicing structure
*
* @param rec SAMRecord to check
* @param ivtm mutable mapping of a chromosome and its interval tree
* @return
*/
def alignmentBlockOverlaps(rec: SAMRecord, ivtm: Map[String, IntervalTree]): Boolean =
// if SAMRecord is not spliced, assume queryOverlap has done its job
// otherwise check for alignment block overlaps in our interval list
// using raw SAMString to bypass cigar string decoding
if (rec.getSAMString.split("\t")(5).contains("N")) {
for (ab: AlignmentBlock <- rec.getAlignmentBlocks.asScala) {
if (!ivtm(rec.getReferenceName).findOverlapping(
new Interval(ab.getReferenceStart, ab.getReferenceStart + ab.getLength - 1)).isEmpty)
return true
}
false
} else
true
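// Example (hypothetical read): a spliced read starting at position 100 with CIGAR 50M200N50M has
// alignment blocks [100, 149] and [350, 399]; it is only considered to overlap the target if at
// least one of those blocks hits an interval in the tree, even when the raw queryOverlapping
// range spans the whole 100-399 region.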
/** filter function for read IDs */
val rgFilter =
if (readGroupIDs.size == 0)
(r: SAMRecord) => true
else
(r: SAMRecord) => readGroupIDs.contains(r.getReadGroup.getReadGroupId)
/** function to get set element */
val SAMRecordElement =
if (filterOutMulti)
(r: SAMRecord) => r.getReadName
else
(r: SAMRecord) => r.getReadName + "_" + r.getAlignmentStart.toString
val SAMRecordMateElement =
(r: SAMRecord) => r.getReadName + "_" + r.getMateAlignmentStart.toString
val firstBAM = prepIndexedInputBAM()
/* NOTE: the interval vector here should be bypass-able if we can make
the BAM query intervals with Interval objects. This is not possible
at the moment since we cannot retrieve the start and end coordinates
of an Interval, so we resort to our own RawInterval vector
*/
lazy val intervals = iv.toVector
lazy val intervalTreeMap: Map[String, IntervalTree] = intervals
.groupBy(x => x.getChr)
.map({ case (key, value) => (key, makeIntervalTree(value)) })
lazy val queryIntervals = intervals
.flatMap(x => monadicMakeQueryInterval(firstBAM, x))
// makeQueryInterval only accepts a sorted QueryInterval list
.sortBy(x => (x.referenceIndex, x.start, x.end))
.toArray
val filteredRecords: Iterator[SAMRecord] = firstBAM.queryOverlapping(queryIntervals).asScala
// ensure spliced reads have at least one block overlapping target region
.filter(x => alignmentBlockOverlaps(x, intervalTreeMap))
// filter for MAPQ on target region reads
.filter(x => x.getMappingQuality >= minMapQ)
// filter on specific read group IDs
.filter(x => rgFilter(x))
val filteredOutSet: BloomFilter[String] = new FilterBuilder(bloomSize.toInt, bloomFp)
.hashFunction(HashMethod.Murmur3KirschMitzenmacher)
.buildBloomFilter()
for (rec <- filteredRecords) {
logger.debug("Adding read " + rec.getReadName + " to set ...")
if ((!filterOutMulti) && rec.getReadPairedFlag) {
filteredOutSet.add(SAMRecordElement(rec))
filteredOutSet.add(SAMRecordMateElement(rec))
}
else
filteredOutSet.add(SAMRecordElement(rec))
}
if (filterOutMulti)
(rec: SAMRecord) => filteredOutSet.contains(rec.getReadName)
else
(rec: SAMRecord) => {
if (rec.getReadPairedFlag)
filteredOutSet.contains(SAMRecordElement(rec)) &&
filteredOutSet.contains(SAMRecordMateElement(rec))
else
filteredOutSet.contains(SAMRecordElement(rec))
}
}
// TODO: implement stats file output
/**
* Function to filter input BAM and write its output to the filesystem
*
* @param filterFunc filter function that evaluates true for excluded SAMRecord
* @param inBAM input BAM file
* @param outBAM output BAM file
* @param writeIndex whether to write index for output BAM file
* @param async whether to write asynchronously
* @param filteredOutBAM whether to write excluded SAMRecords to their own BAM file
*/
def writeFilteredBAM(filterFunc: (SAMRecord => Boolean), inBAM: File, outBAM: File,
writeIndex: Boolean = true, async: Boolean = true,
filteredOutBAM: File = null) = {