Commit 6a83f9d3 authored by wyleung's avatar wyleung
Browse files

Merge from master

parents 2a22fa63 9cde95b2
*.bam binary
*.bam.bai binary
{
"genotypegvcfs": { "scattercount": 100 },
"variantannotator": { "scattercount": 10 },
"realignertargetcreator": { "scattercount": 30 },
"combinevariants": { "scattercount": 10 },
"printreads_temp": { "scattercount": 30 },
"indelrealigner": { "scattercount": 30 },
"haplotypecaller": { "scattercount": 100 },
"unifiedgenotyper": { "scattercount": 100 },
"baserecalibrator": { "scattercount": 30 },
"basty": {
"haplotypecaller": { "scattercount": 20 },
"unifiedgenotyper": { "scattercount": 1 },
"multisample": { "unifiedgenotyper": { "scattercount": 100 } },
"baserecalibrator": { "scattercount": 1 },
"indelrealigner": { "scattercount": 1 },
"printreads_temp": { "scattercount": 1 },
"realignertargetcreator": { "scattercount": 1 },
"genotypegvcfs": { "scattercount": 1 },
"combinevariants": { "scattercount": 1 }
}
}
......@@ -23,6 +23,10 @@
<name>BioJava repository</name>
<url>http://www.biojava.org/download/maven/</url>
</repository>
<repository>
<id>orestes-bloom-filter</id>
<url>https://raw.githubusercontent.com/Baqend/Orestes-Bloomfilter/master/maven-repo</url>
</repository>
</repositories>
<dependencies>
<dependency>
......@@ -31,6 +35,11 @@
<version>6.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.11</artifactId>
<version>2.2.1</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
......@@ -56,6 +65,16 @@
<artifactId>biojava3-sequencing</artifactId>
<version>3.1.0</version>
</dependency>
<dependency>
<groupId>com.baqend</groupId>
<artifactId>bloom-filter</artifactId>
<version>1.02</version>
</dependency>
<dependency>
<groupId>com.github.scopt</groupId>
<artifactId>scopt_2.10</artifactId>
<version>3.2.0</version>
</dependency>
</dependencies>
<build>
<resources>
......
......@@ -16,40 +16,77 @@ import nl.lumc.sasc.biopet.pipelines.yamsvp.Yamsvp
object BiopetExecutable {
val pipelines: Map[String,PipelineCommand] = Map(
"flexiprep" -> Flexiprep,
"mapping" -> Mapping,
"gentrap" -> Gentrap,
"bam-metrics" -> BamMetrics,
"gatk-benchmark-genotyping" -> GatkBenchmarkGenotyping,
"gatk-genotyping" -> GatkGenotyping,
"gatk-variantcalling" -> GatkVariantcalling,
"gatk-pipeline" -> GatkPipeline,
"gatk-variant-recalibration" -> GatkVariantRecalibration,
"gatk-vcf-sample-compare" -> GatkVcfSampleCompare,
"sage" -> Sage,
"basty" -> Basty,
"yamsvp" -> Yamsvp
)
val modules: Map[String, List[MainCommand]] = Map(
"pipeline" -> List(
nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep,
nl.lumc.sasc.biopet.pipelines.mapping.Mapping,
nl.lumc.sasc.biopet.pipelines.gentrap.Gentrap,
nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics,
nl.lumc.sasc.biopet.pipelines.gatk.GatkBenchmarkGenotyping,
nl.lumc.sasc.biopet.pipelines.gatk.GatkGenotyping,
nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantcalling,
nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline,
nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantRecalibration,
nl.lumc.sasc.biopet.pipelines.gatk.GatkVcfSampleCompare,
nl.lumc.sasc.biopet.pipelines.sage.Sage,
nl.lumc.sasc.biopet.pipelines.basty.Basty),
"tool" -> List(
nl.lumc.sasc.biopet.core.apps.WipeReads,
nl.lumc.sasc.biopet.core.apps.BiopetFlagstat))
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
if (args.isEmpty) {
System.err.println(pipelineList)
System.exit(1)
def toBulletedList(m: List[MainCommand], kind: String = "", bullet: String = "-") =
"Available %ss:\n ".format(kind) + bullet + " " + m.map(x => x.commandName).sorted.mkString("\n " + bullet + " ")
def usage(module: String = null): String = {
if (module != null) checkModule(module)
val usage: String = {
val set = if (module == null) modules.keySet else Set(module)
val u = for (m <- set) yield toBulletedList(modules(m), m)
u.mkString("\n\n")
}
"""
|Usage: java -jar BiopetFramework.jar {%s} <name> [args]
|
|%s
|
|Questions or comments? Email sasc@lumc.nl or check out the project page at https://git.lumc.nl/biopet/biopet.git
""".stripMargin.format(modules.keys.mkString(","), usage)
}
else if (pipelines.contains(args.head)) pipelines(args.head).main(args.tail)
else {
System.err.println("Pipeline '" + args.head + "' does not exist")
System.err.println(pipelineList)
System.exit(1)
def checkModule(module: String) {
if (!modules.contains(module)) {
System.err.println(s"ERROR: module '$module' does not exist\n" + usage())
System.exit(1)
}
}
def pipelineList: String = {
val pipelinesArray = for ((k,v) <- pipelines) yield k
"Available pipelines:" + pipelinesArray.mkString("\n- ", "\n- ", "\n") + "please supply a valid pipeline"
def getCommand(module: String, name: String): MainCommand = {
checkModule(module)
val command = modules(module).find(p => p.commandName.toLowerCase == name.toLowerCase)
if (command == None) {
System.err.println(s"ERROR: command '$name' does not exist in module '$module'\n" + usage(module))
System.exit(1)
}
return command.get
}
args match {
case Array(module, name, passArgs @ _*) => {
getCommand(module, name).main(passArgs.toArray)
}
case Array(module) => {
System.err.println(usage(module))
System.exit(1)
}
case _ => {
System.err.println(usage())
System.exit(1)
}
}
}
}
package nl.lumc.sasc.biopet.core
/** Base trait for anything launchable by name from the command line.
 *
 * `commandName` is derived from the implementing class's runtime simple name;
 * for Scala objects that name carries a compiler-added trailing "$", which
 * splitting on "$" and taking the last fragment removes.
 */
trait MainCommand {

  lazy val commandName = this.getClass.getSimpleName
    .split("\\$").last

  /** Entry point, invoked with the arguments remaining after command dispatch.
   *
   * Declared with an explicit `: Unit` result type; the original used
   * procedure syntax, which is deprecated in later Scala versions.
   */
  def main(args: Array[String]): Unit
}
......@@ -2,14 +2,14 @@ package nl.lumc.sasc.biopet.core
import org.broadinstitute.gatk.queue.util.Logging
trait PipelineCommand extends Logging {
val pipeline = ""
trait PipelineCommand extends MainCommand with Logging {
def pipeline = "/" + getClass.getName.stripSuffix("$").replaceAll("\\.", "/") + ".class"
def main(args: Array[String]): Unit = {
var argv: Array[String] = Array()
//argv ++= Array("-S", tempFile.getAbsolutePath)
argv ++= Array("-S", pipeline)
argv ++= args
return BiopetQCommandLine.main(argv)
BiopetQCommandLine.main(argv)
}
}
\ No newline at end of file
package nl.lumc.sasc.biopet.core
import java.text.SimpleDateFormat
import java.util.Calendar
import org.apache.log4j.Logger
import org.apache.log4j.WriterAppender
import org.apache.log4j.helpers.DateLayout
import org.apache.log4j.spi.LoggingEvent
import java.io.File
/** Base trait for standalone command line tools.
 *
 * Mixing this in provides a log4j logger named after `commandName`, with a
 * stderr appender installed by default, plus an abstract scopt option parser
 * pre-populated with common logging options
 * (--log_nostderr, --log_file, -l/--log_level, -h/--help).
 */
trait ToolCommand extends MainCommand {
// Marker base class for a tool's parsed-argument container.
abstract class AbstractArgs {
}
// Base option parser; concrete tools subclass this and add their own options.
// The logging options below act immediately via `foreach` side effects while
// the command line is being parsed.
abstract class AbstractOptParser extends scopt.OptionParser[Args](commandName) {
// --log_nostderr: detach the default stderr appender
opt[Unit]("log_nostderr") foreach { _ =>
logger.removeAppender(stderrAppender) } text("No output to stderr")
// --log_file <file>: additionally write log output to the given file
opt[File]("log_file") foreach { x =>
logger.addAppender(new WriterAppender(logLayout, new java.io.PrintStream(x))) } text("Log file") valueName("<file>")
// -l/--log_level: set logger verbosity; the silent `case _ =>` default is
// unreachable for values accepted by the validate clause below.
// NOTE(review): `foreach` lowercases the value but `validate` checks it
// as-is, so e.g. "DEBUG" fails validation — confirm whether intended.
opt[String]('l', "log_level") foreach { x =>
x.toLowerCase match {
case "debug" => logger.setLevel(org.apache.log4j.Level.DEBUG)
case "info" => logger.setLevel(org.apache.log4j.Level.INFO)
case "warn" => logger.setLevel(org.apache.log4j.Level.WARN)
case "error" => logger.setLevel(org.apache.log4j.Level.ERROR)
case _ =>
} } text("Log level") validate { x => x match {
case "debug" | "info" | "warn" | "error" => success
case _ => failure("Log level must be <debug/info/warn/error>")
}
}
// -h/--help: print usage to stderr and exit with status 1
opt[Unit]('h', "help") foreach { _ =>
System.err.println(this.usage); sys.exit(1)} text("Print usage")
}
// Concrete tools bind these to their own Args subclass / OptParser subclass.
type Args <: AbstractArgs
type OptParser <: AbstractOptParser
// Logger shared by the tool and its option parser; named after the command.
protected val logger = Logger.getLogger(commandName)
// Layout producing lines like: "INFO   [2014-01-01 12:00:00] [name] message"
private val logLayout = new DateLayout() {
val ignoresThrowable = false
def format(event:org.apache.log4j.spi.LoggingEvent): String = {
val calendar: Calendar = Calendar.getInstance
calendar.setTimeInMillis(event.getTimeStamp)
val formatter: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
val formattedDate: String = formatter.format(calendar.getTime)
// pad the level name to a fixed width of 6 characters for column alignment
var logLevel = event.getLevel.toString
while (logLevel.size < 6) logLevel += " "
logLevel + " [" + formattedDate + "] [" + event.getLoggerName + "] " + event.getMessage + "\n"
}
}
// Default appender: log to stderr at INFO level, installed at mixin time.
private val stderrAppender = new WriterAppender(logLayout, sys.process.stderr)
logger.setLevel(org.apache.log4j.Level.INFO)
logger.addAppender(stderrAppender)
}
\ No newline at end of file
......@@ -4,6 +4,7 @@ import htsjdk.samtools.SAMFileReader
import htsjdk.samtools.SAMRecord
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import org.broadinstitute.gatk.queue.util.Logging
......@@ -22,10 +23,10 @@ class BiopetFlagstat(val root: Configurable) extends BiopetJavaCommandLineFuncti
override val defaultVmem = "8G"
memoryLimit = Option(4.0)
override def commandLine = super.commandLine + required(input) + " > " + required(output)
override def commandLine = super.commandLine + required("I", input) + " > " + required(output)
}
object BiopetFlagstat extends Logging {
object BiopetFlagstat extends ToolCommand {
def apply(root: Configurable, input: File, output: File): BiopetFlagstat = {
val flagstat = new BiopetFlagstat(root)
flagstat.input = input
......@@ -38,12 +39,32 @@ object BiopetFlagstat extends Logging {
flagstat.output = new File(outputDir, input.getName.stripSuffix(".bam") + ".biopetflagstat")
return flagstat
}
case class Args (inputFile:File = null, region:Option[String] = None) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('I', "inputFile") required() valueName("<file>") action { (x, c) =>
c.copy(inputFile = x) } text("out is a required file property")
opt[String]('r', "region") valueName("<chr:start-stop>") action { (x, c) =>
c.copy(region = Some(x)) } text("out is a required file property")
}
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
val inputSam = new SAMFileReader(new File(args(0)))
val argsParser = new OptParser
val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val inputSam = new SAMFileReader(commandArgs.inputFile)
val iterSam = if (commandArgs.region == None) inputSam.iterator else {
val regionRegex = """(.*):(.*)-(.*)""".r
commandArgs.region.get match {
case regionRegex(chr, start, stop) => inputSam.query(chr, start.toInt, stop.toInt, false)
case _ => sys.error("Region wrong format")
}
}
val flagstatCollector = new FlagstatCollector
flagstatCollector.loadDefaultFunctions
val m = 10
......@@ -89,10 +110,11 @@ object BiopetFlagstat extends Logging {
flagstatCollector.addFunction("Mate in same strand", record => record.getReadPairedFlag && record.getReadNegativeStrandFlag && record.getMateNegativeStrandFlag &&
record.getReferenceIndex == record.getMateReferenceIndex)
flagstatCollector.addFunction("Mate on other chr", record => record.getReadPairedFlag && record.getReferenceIndex != record.getMateReferenceIndex)
for (record <- inputSam.iterator) {
logger.info("Start reading file: " + commandArgs.inputFile)
for (record <- iterSam) {
if (flagstatCollector.readsCount % 1e6 == 0 && flagstatCollector.readsCount > 0)
System.err.println("Reads prosessed: " + flagstatCollector.readsCount)
logger.info("Reads prosessed: " + flagstatCollector.readsCount)
flagstatCollector.loadRecord(record)
}
......
/**
* Copyright (c) 2014 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl>
* @author Wibowo Arindrarto <w.arindrarto@lumc.nl>
*/
package nl.lumc.sasc.biopet.core.apps
import java.io.{ File, IOException }
import scala.collection.JavaConverters._
import scala.io.Source
import htsjdk.samtools.AlignmentBlock
import htsjdk.samtools.SAMFileReader
import htsjdk.samtools.SAMFileReader.QueryInterval
import htsjdk.samtools.SAMFileWriterFactory
import htsjdk.samtools.SAMRecord
import htsjdk.tribble.index.interval.{ Interval, IntervalTree }
import orestes.bloomfilter.HashProvider.HashMethod
import orestes.bloomfilter.{ BloomFilter, FilterBuilder }
import org.apache.commons.io.FilenameUtils.getExtension
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.MainCommand
import nl.lumc.sasc.biopet.core.config.Configurable
// TODO: finish implementation for usage in pipelines
/**
* WipeReads function class for usage in Biopet pipelines
*
* @param root Configuration object for the pipeline
*/
class WipeReads(val root: Configurable) extends BiopetJavaCommandLineFunction {
// fully-qualified name of this class, used as the job's java main class
javaMainClass = getClass.getName
// input BAM; must have an accompanying index so regions can be queried
@Input(doc = "Input BAM file (must be indexed)", shortName = "I", required = true)
var inputBAM: File = _
// filtered BAM written by the tool
@Output(doc = "Output BAM", shortName = "o", required = true)
var outputBAM: File = _
}
object WipeReads extends MainCommand {
/** Container type for command line flags */
type OptionMap = Map[String, Any]
/** Container class for a genomic interval (1-based, fully-closed coordinates) */
case class RawInterval(chrom: String, start: Int, end: Int) {

  require(start <= end, s"Start coordinate $start is larger than end coordinate $end")

  /** Checks whether this interval fully contains the other (false across chromosomes) */
  def contains(that: RawInterval): Boolean =
    if (this.chrom != that.chrom)
      false
    else
      this.start <= that.start && this.end >= that.end

  /** Checks whether two intervals overlap (false across chromosomes).
   *
   * Fixed to be symmetric: the previous implementation only detected an
   * overlap when `this` started at or before `that`, so the result depended
   * on argument order. All previously-detected overlaps are still detected.
   */
  def overlaps(that: RawInterval): Boolean =
    if (this.chrom != that.chrom)
      false
    else
      this.start <= that.end && that.start <= this.end

  /** Merges two overlapping intervals into their union.
   *
   * Uses min/max of the endpoints so the result is correct regardless of
   * argument order; for coordinate-sorted inputs (as produced by
   * mergeOverlappingIntervals) this is identical to the old behavior.
   *
   * @throws IllegalArgumentException when chromosomes differ or the
   *         intervals do not overlap
   */
  def merge(that: RawInterval): RawInterval = {
    if (this.chrom != that.chrom)
      throw new IllegalArgumentException("Can not merge RawInterval objects from different chromosomes")
    if (contains(that))
      this
    else if (overlaps(that))
      RawInterval(this.chrom, math.min(this.start, that.start), math.max(this.end, that.end))
    else
      throw new IllegalArgumentException("Can not merge non-overlapping RawInterval objects")
  }
}
/**
 * Creates an iterator yielding non-overlapping RawInterval objects
 *
 * Input intervals are sorted by (chrom, start, end) and adjacent overlapping
 * intervals are merged; the merged intervals are yielded in reverse
 * coordinate order (an artifact of the prepend-based accumulation).
 *
 * @param ri iterator yielding RawInterval objects
 * @return iterator yielding merged RawInterval objects
 */
def mergeOverlappingIntervals(ri: Iterator[RawInterval]): Iterator[RawInterval] = {
  @annotation.tailrec
  def accumulate(remaining: List[RawInterval], merged: List[RawInterval]): List[RawInterval] =
    remaining match {
      case Nil => merged
      case iv :: rest =>
        merged match {
          // newest accumulated interval overlaps the incoming one: extend it
          case last :: older if last.overlaps(iv) => accumulate(rest, last.merge(iv) :: older)
          // otherwise start a new accumulated interval
          case _ => accumulate(rest, iv :: merged)
        }
    }
  accumulate(ri.toList.sortBy(iv => (iv.chrom, iv.start, iv.end)), Nil).toIterator
}
/**
 * Creates an iterator over merged, non-overlapping intervals parsed from an
 * input interval file
 *
 * @param inFile input interval file (format detected from extension; only
 *               BED is currently implemented)
 * @throws IllegalArgumentException for unsupported file types or malformed lines
 */
def makeRawIntervalFromFile(inFile: File): Iterator[RawInterval] = {

  /** Function to create iterator from BED file */
  def makeRawIntervalFromBED(inFile: File): Iterator[RawInterval] =
    // BED file coordinates are 0-based, half open so we need to do some conversion
    Source.fromFile(inFile)
      .getLines()
      .filterNot(_.trim.isEmpty)
      // skip UCSC "track"/"browser" header lines; the previous
      // dropWhile(_.matches("^track | ^browser ")) required a full-string
      // regex match and therefore never skipped them, so any BED file with a
      // header hit a MatchError in the parse below
      .filterNot(line => line.trim.startsWith("track") || line.trim.startsWith("browser"))
      .map(line => line.trim.split("\t") match {
        case Array(chrom, start, end, _*) => new RawInterval(chrom, start.toInt + 1, end.toInt)
        case _ =>
          // explicit failure with context instead of an opaque MatchError
          throw new IllegalArgumentException("Invalid BED line in " + inFile.getPath + ": '" + line + "'")
      })

  // TODO: implementation
  /** Function to create iterator from refFlat file */
  def makeRawIntervalFromRefFlat(inFile: File): Iterator[RawInterval] = ???
  // convert coordinate to 1-based fully closed
  // parse chrom, start blocks, end blocks, strands

  // TODO: implementation
  /** Function to create iterator from GTF file */
  def makeRawIntervalFromGTF(inFile: File): Iterator[RawInterval] = ???
  // convert coordinate to 1-based fully closed
  // parse chrom, start blocks, end blocks, strands

  // detect interval file format from extension
  val iterFunc: (File => Iterator[RawInterval]) =
    if (getExtension(inFile.toString.toLowerCase) == "bed")
      makeRawIntervalFromBED
    else
      throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath)

  mergeOverlappingIntervals(iterFunc(inFile))
}
// TODO: set minimum fraction for overlap
/**
* Function to create function to check SAMRecord for exclusion in filtered BAM file.
*
* The returned function evaluates all filtered-in SAMRecord to false.
*
* @param iv iterator yielding RawInterval objects
* @param inBAM input BAM file
* @param inBAMIndex input BAM file index
* @param filterOutMulti whether to filter out reads with same name outside target region (default: true)
* @param minMapQ minimum MapQ of reads in target region to filter out (default: 0)
* @param readGroupIDs read group IDs of reads in target region to filter out (default: all IDs)
* @param bloomSize expected size of elements to contain in the Bloom filter
* @param bloomFp expected Bloom filter false positive rate
* @return function that checks whether a SAMRecord or String is to be excluded
*/
def makeFilterOutFunction(iv: Iterator[RawInterval],
inBAM: File, inBAMIndex: File = null,
filterOutMulti: Boolean = true,
minMapQ: Int = 0, readGroupIDs: Set[String] = Set(),
bloomSize: Int = 100000000, bloomFp: Double = 1e-10): (SAMRecord => Boolean) = {
// TODO: implement optional index creation
/** Function to check for BAM file index and return a SAMFileReader given a File */
// Opens the input BAM as a SAMFileReader, requiring an index for region queries.
def prepIndexedInputBAM(): SAMFileReader =
if (inBAMIndex != null)
// an explicit index file was supplied — pair it with the BAM directly
new SAMFileReader(inBAM, inBAMIndex)
else {
val sfr = new SAMFileReader(inBAM)
// LENIENT: tolerate minor BAM spec violations instead of aborting on read.
// NOTE(review): stringency is only relaxed in this branch; when an explicit
// index is given above the default stringency applies — confirm intended.
sfr.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT)
if (!sfr.hasIndex)
throw new IllegalStateException("Input BAM file must be indexed")
else
sfr
}
/**
 * Builds a query interval against a BAM file
 *
 * Works even when only one of the interval or the BAM contig name is
 * prepended with "chr": the contig name is tried as given first, then with
 * the "chr" prefix stripped (if present) or added (if absent).
 *
 * @param inBAM BAM file to query as SAMFileReader
 * @param ri raw interval object containing the query
 * @return QueryInterval wrapped in Option; None when no contig name matches
 */
def monadicMakeQueryInterval(inBAM: SAMFileReader, ri: RawInterval): Option[QueryInterval] = {
  val header = inBAM.getFileHeader
  // the alternative spelling of the contig name: toggle the "chr" prefix
  val alternative =
    if (ri.chrom.startsWith("chr")) ri.chrom.substring(3)
    else "chr" + ri.chrom
  List(ri.chrom, alternative)
    .find(name => header.getSequenceIndex(name) > -1)
    .map(name => inBAM.makeQueryInterval(name, ri.start, ri.end))
}
/** Builds an IntervalTree from RawInterval objects
 *
 * @param ri iterable over RawInterval objects
 * @return IntervalTree containing one Interval per input RawInterval
 */
def makeIntervalTree(ri: Iterable[RawInterval]): IntervalTree = {
  val tree = new IntervalTree
  ri.foreach(iv => tree.insert(new Interval(iv.start, iv.end)))
  tree
}
/**
 * Ensures that a SAMRecord overlaps our target regions
 *
 * This is required because htsjdk's queryOverlap method does not take into
 * account the SAMRecord splicing structure
 *
 * @param rec SAMRecord to check
 * @param ivtm mapping of a chromosome name to its interval tree
 * @return true when the record is unspliced, or when any of a spliced
 *         record's alignment blocks overlaps a target interval
 */
def alignmentBlockOverlaps(rec: SAMRecord, ivtm: Map[String, IntervalTree]): Boolean =
  // if SAMRecord is not spliced ("N" absent from CIGAR), assume queryOverlap
  // has done its job; the raw SAMString field 5 is read to bypass cigar decoding
  if (rec.getSAMString.split("\t")(5).contains("N"))
    // spliced: require at least one alignment block to overlap a target
    // interval. `exists` replaces the previous non-local `return` inside a
    // for-comprehension lambda, which relied on exception-based control flow.
    // NOTE(review): ivtm(rec.getReferenceName) throws NoSuchElementException
    // for an unmapped chromosome, as it did before — confirm callers guarantee
    // the key exists.
    rec.getAlignmentBlocks.asScala.exists { ab =>
      !ivtm(rec.getReferenceName).findOverlapping(
        new Interval(ab.getReferenceStart, ab.getReferenceStart + ab.getLength - 1)).isEmpty
    }
  else
    true
/** filter function for read IDs */
val rgFilter =
if (readGroupIDs.size == 0)
(r: SAMRecord) => true
else