Commit 57b85b65 authored by Sander Bollen's avatar Sander Bollen
Browse files

Merge branch 'develop' into tooltests

parents 4faa97b2 9bbaa813
### System Requirements
Biopet is built on top of GATK Queue, which requires `java` to be installed on the analysis machine(s).
For end-users:
* [Java 7 JVM](http://www.oracle.com/technetwork/java/javase/downloads/index.html) or [OpenJDK 7](http://openjdk.java.net/install/)
* [Cran R 2.15.3](http://cran.r-project.org/)
For developers:
* [OpenJDK 7](http://openjdk.java.net/install/)
* Minimum of 4 GB RAM (TODO: provide a more accurate estimate for building)
* Maven 3
* Compiled and installed version 3.4 of [GATK + Queue](https://github.com/broadgsa/gatk-protected/) in your maven repository.
* IntelliJ or Netbeans 8.0 for development
......@@ -3,6 +3,7 @@ pages:
- Home: 'index.md'
- General:
- Config: 'general/config.md'
- Requirements: 'general/requirements.md'
- Pipelines:
- Basty: 'pipelines/basty.md'
- Bam2Wig: 'pipelines/bam2wig.md'
......
......@@ -12,10 +12,12 @@ import nl.lumc.sasc.biopet.core.config.Configurable
class ApplyRecalibration(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.ApplyRecalibration with GatkGeneral {
scatterCount = config("scattercount", default = 0)
override val defaultThreads = 3
override def beforeGraph() {
super.beforeGraph()
nt = Option(getThreads(3))
nt = Option(getThreads)
memoryLimit = Option(nt.getOrElse(1) * 2)
import org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode
......
......@@ -9,6 +9,9 @@ import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType
class HaplotypeCaller(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.HaplotypeCaller with GatkGeneral {
override val defaultThreads = 1
min_mapping_quality_score = config("minMappingQualityScore", default = 20)
scatterCount = config("scattercount", default = 1)
if (config.contains("dbsnp")) this.dbsnp = config("dbsnp")
......@@ -41,7 +44,7 @@ class HaplotypeCaller(val root: Configurable) extends org.broadinstitute.gatk.qu
threads = 1
logger.warn("BamOutput is on, nct/threads is forced to set on 1, this option is only for debug")
}
nct = Some(getThreads(1))
nct = Some(getThreads)
memoryLimit = Option(memoryLimit.getOrElse(2.0) * nct.getOrElse(1))
}
......
......@@ -26,11 +26,13 @@ class UnifiedGenotyper(val root: Configurable) extends org.broadinstitute.gatk.q
}
}
override val defaultThreads = 1
override def beforeGraph() {
super.beforeGraph()
genotype_likelihoods_model = org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.BOTH
nct = Some(getThreads(1))
nct = Some(getThreads)
memoryLimit = Option(nct.getOrElse(1) * memoryLimit.getOrElse(2.0))
}
}
......@@ -11,7 +11,9 @@ import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile
class VariantRecalibrator(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.VariantRecalibrator with GatkGeneral {
nt = Option(getThreads(4))
override val defaultThreads = 4
nt = Option(getThreads)
memoryLimit = Option(nt.getOrElse(1) * 2)
if (config.contains("dbsnp")) resource :+= new TaggedFile(config("dbsnp").asString, "known=true,training=false,truth=false,prior=2.0")
......
# Plot a TSV table as a scatter plot: the first column becomes the x-axis
# ("Rank"), every remaining column becomes one coloured series.
library(reshape2)
library(ggplot2)
library(argparse)

parser <- ArgumentParser(description = 'Plot a TSV table as a scatter plot')
parser$add_argument('--input', dest = 'input', type = 'character', help = 'Input tsv file', required = TRUE)
parser$add_argument('--output', dest = 'output', type = 'character', help = 'Output png file', required = TRUE)
parser$add_argument('--width', dest = 'width', type = 'integer', default = 500)
parser$add_argument('--height', dest = 'height', type = 'integer', default = 500)
parser$add_argument('--xlabel', dest = 'xlabel', type = 'character')
parser$add_argument('--ylabel', dest = 'ylabel', type = 'character', required = TRUE)
parser$add_argument('--llabel', dest = 'llabel', type = 'character')
parser$add_argument('--title', dest = 'title', type = 'character')
parser$add_argument('--removeZero', dest = 'removeZero', type = 'character', default = "false")

arguments <- parser$parse_args()

png(filename = arguments$output, width = arguments$width, height = arguments$height)

DF <- read.table(arguments$input, header = TRUE)

# The x-axis label defaults to the original name of the first column.
if (is.null(arguments$xlabel)) xlab <- colnames(DF)[1] else xlab <- arguments$xlabel

# Melt to long format keyed on the first column.
colnames(DF)[1] <- "Rank"
DF1 <- melt(DF, id.var = "Rank")

# Optionally drop zero values before plotting.
if (arguments$removeZero == "true") {
    DF1 <- DF1[DF1$value > 0, ]
    print("Removed 0 values")
}

# theme_bw() must come BEFORE theme(...): applying a complete theme resets all
# element tweaks made earlier in the chain, so the original order silently
# discarded the rotated x-axis labels.
# The legend is produced by the 'color' aesthetic, so its title must be set via
# guides(color = ...); the original guides(fill = ...) never took effect.
ggplot(DF1, aes(x = Rank, y = value, group = variable, color = variable)) +
    geom_point() +
    xlab(xlab) +
    ylab(arguments$ylabel) +
    guides(color = guide_legend(title = arguments$llabel)) +
    ggtitle(arguments$title) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))
dev.off()
......@@ -199,13 +199,15 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab
BiopetCommandLineFunctionTrait.versionCache.get(versionCommand)
}
def getThreads: Int = getThreads(defaultThreads)
/**
* Get threads from config
* @param default default when not found in config
* @return number of threads
*/
def getThreads(default: Int): Int = {
val maxThreads: Int = config("maxthreads", default = 8)
val maxThreads: Int = config("maxthreads", default = 24)
val threads: Int = config("threads", default = default)
if (maxThreads > threads) threads
else maxThreads
......@@ -218,7 +220,7 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab
* @return number of threads
*/
def getThreads(default: Int, module: String): Int = {
val maxThreads: Int = config("maxthreads", default = 8, submodule = module)
val maxThreads: Int = config("maxthreads", default = 24, submodule = module)
val threads: Int = config("threads", default = default, submodule = module)
if (maxThreads > threads) threads
else maxThreads
......
......@@ -58,34 +58,37 @@ class Bowtie(val root: Configurable) extends BiopetCommandLineFunction with Refe
var strata: Boolean = config("strata", default = false)
var maqerr: Option[Int] = config("maqerr")
var maxins: Option[Int] = config("maxins")
var largeIndex: Boolean = config("large-index", default = false)
override def beforeGraph() {
super.beforeGraph()
if (reference == null) reference = referenceFasta()
val basename = reference.getName.stripSuffix(".fasta").stripSuffix(".fa")
if (reference.getParentFile.list().toList.filter(_.startsWith(basename)).exists(_.endsWith(".ebwtl")))
largeIndex = config("large-index", default = true)
}
/** return commandline to execute */
def cmdLine = {
required(executable) +
optional("--threads", threads) +
conditional(sam, "--sam") +
conditional(best, "--best") +
conditional(strata, "--strata") +
optional("--sam-RG", sam_RG) +
optional("--seedlen", seedlen) +
optional("--seedmms", seedmms) +
optional("-k", k) +
optional("-m", m) +
optional("--maxbts", maxbts) +
optional("--maqerr", maqerr) +
optional("--maxins", maxins) +
required(reference) +
(R2 match {
case Some(r2) =>
required("-1", R1) +
optional("-2", r2)
case _ => required(R1)
}) +
" > " + required(output)
}
def cmdLine = required(executable) +
optional("--threads", threads) +
conditional(sam, "--sam") +
conditional(largeIndex, "--large-index") +
conditional(best, "--best") +
conditional(strata, "--strata") +
optional("--sam-RG", sam_RG) +
optional("--seedlen", seedlen) +
optional("--seedmms", seedmms) +
optional("-k", k) +
optional("-m", m) +
optional("--maxbts", maxbts) +
optional("--maqerr", maqerr) +
optional("--maxins", maxins) +
required(reference.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta")) +
(R2 match {
case Some(r2) =>
required("-1", R1) +
optional("-2", r2)
case _ => required(R1)
}) +
" > " + required(output)
}
\ No newline at end of file
......@@ -74,7 +74,7 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction {
/** Sets correct output files to job */
override def beforeGraph() {
require(w != null)
if (threads == 0) threads = getThreads(defaultThreads)
if (threads == 0) threads = getThreads
executable = if (threads > 1 && executableThreads.isDefined) executableThreads.get else executableNonThreads
super.beforeGraph()
out :::= List(Some(getInfoFile), getBestTreeFile, getBootstrapFile, getBipartitionsFile).flatten
......
......@@ -63,7 +63,7 @@ class Star(val root: Configurable) extends BiopetCommandLineFunction with Refere
var outFileNamePrefix: String = _
var runThreadN: Option[Int] = config("runThreadN")
override def defaultCoreMemory = 4.0
override def defaultCoreMemory = 6.0
override def defaultThreads = 8
/** Sets output files for the graph */
......@@ -72,7 +72,7 @@ class Star(val root: Configurable) extends BiopetCommandLineFunction with Refere
if (reference == null) reference = referenceFasta()
genomeDir = config("genomeDir", new File(reference.getAbsoluteFile.getParent, "star"))
if (outFileNamePrefix != null && !outFileNamePrefix.endsWith(".")) outFileNamePrefix += "."
val prefix = if (outFileNamePrefix != null) outputDir + outFileNamePrefix else outputDir
val prefix = if (outFileNamePrefix != null) outputDir + File.separator + outFileNamePrefix else outputDir + File.separator
if (runmode == null) {
outputSam = new File(prefix + "Aligned.out.sam")
outputTab = new File(prefix + "SJ.out.tab")
......
......@@ -17,14 +17,14 @@ package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.core.{ Reference, BiopetCommandLineFunction }
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output }
/**
* Extension for Tophat
*/
class Tophat(val root: Configurable) extends BiopetCommandLineFunction {
class Tophat(val root: Configurable) extends BiopetCommandLineFunction with Reference {
executable = config("exe", default = "tophat", freeVar = false)
......@@ -264,6 +264,16 @@ class Tophat(val root: Configurable) extends BiopetCommandLineFunction {
var rg_platform: Option[String] = config("rg_platform")
override def beforeGraph: Unit = {
super.beforeGraph
if (bowtie1 && !new File(bowtie_index).getParentFile.list().toList
.filter(_.startsWith(new File(bowtie_index).getName)).exists(_.endsWith(".ebwt")))
throw new IllegalArgumentException("No bowtie1 index found for tophat")
else if (!new File(bowtie_index).getParentFile.list().toList
.filter(_.startsWith(new File(bowtie_index).getName)).exists(_.endsWith(".bt2")))
throw new IllegalArgumentException("No bowtie2 index found for tophat")
}
def cmdLine: String = required(executable) +
optional("-o", output_dir) +
conditional(bowtie1, "--bowtie1") +
......
......@@ -45,6 +45,14 @@ class MergeSamFiles(val root: Configurable) extends Picard {
@Argument(doc = "COMMENT", required = false)
var comment: Option[String] = config("comment")
@Output(doc = "Bam Index", required = true)
private var outputIndex: File = _
override def beforeGraph() {
super.beforeGraph()
if (createIndex) outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai")
}
/** Returns command to execute */
override def commandLine = super.commandLine +
repeat("INPUT=", input, spaceSeparated = false) +
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.rscript
import java.io.File
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.RscriptCommandLineFunction
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
 * Extension for a general scatter plot with R
 *
 * Created by pjvan_thof on 4/29/15.
 */
class ScatterPlot(val root: Configurable) extends RscriptCommandLineFunction {
// R script that performs the plotting; overridable via the "script" config key
protected var script: File = config("script", default = "plotScatter.R")
@Input
var input: File = _ // TSV table; its first column is used as the x-axis
@Output
var output: File = _ // png file written by the R script
// Optional plot dimensions and labels, passed straight through to the script
var width: Option[Int] = config("width")
var height: Option[Int] = config("height")
var xlabel: Option[String] = config("xlabel")
// NOTE(review): ylabel is read from config without a default yet passed as a
// required flag below — verify the config always supplies it
var ylabel: Option[String] = config("ylabel")
var llabel: Option[String] = config("llabel") // legend title
var title: Option[String] = config("title")
// When true the script drops rows whose value is 0 before plotting
var removeZero: Boolean = config("removeZero", default = false)
/** Rscript invocation: script path from super.cmdLine, plot flags appended here. */
override def cmdLine: String = super.cmdLine +
required("--input", input) +
required("--output", output) +
optional("--width", width) +
optional("--height", height) +
optional("--xlabel", xlabel) +
required("--ylabel", ylabel) +
optional("--llabel", llabel) +
optional("--title", title) +
optional("--removeZero", removeZero)
}
......@@ -21,6 +21,7 @@ import htsjdk.variant.variantcontext.VariantContextBuilder
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.vcf.{ VCFFileReader, VCFHeaderLineCount, VCFHeaderLineType, VCFInfoHeaderLine }
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
import scala.collection.JavaConversions._
import scala.collection.mutable
......@@ -83,19 +84,9 @@ object AnnotateVcfWithBed extends ToolCommand {
logger.info("Start")
val argsParser = new OptParser
val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val bedRecords: mutable.Map[String, List[(Int, Int, String)]] = mutable.Map()
// Read bed file
/*
// function bedRecord.getName will not compile, not clear why
for (bedRecord <- asScalaIteratorConverter(getFeatureReader(commandArgs.bedFile.toPath.toString, new BEDCodec(), false).iterator()).asScala) {
logger.debug(bedRecord)
bedRecords(bedRecord.getChr) = (bedRecord.getStart, bedRecord.getEnd, bedRecord.getName) :: bedRecords.getOrElse(bedRecord.getChr, Nil)
}
*/
val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val fieldType = commandArgs.fieldType match {
val fieldType = cmdArgs.fieldType match {
case "Integer" => VCFHeaderLineType.Integer
case "Flag" => VCFHeaderLineType.Flag
case "Character" => VCFHeaderLineType.Character
......@@ -104,48 +95,32 @@ object AnnotateVcfWithBed extends ToolCommand {
}
logger.info("Reading bed file")
for (line <- Source.fromFile(commandArgs.bedFile).getLines()) {
val values = line.split("\t")
if (values.size >= 4)
bedRecords(values(0)) = (values(1).toInt, values(2).toInt, values(3)) :: bedRecords.getOrElse(values(0), Nil)
else values.size >= 3 && fieldType == VCFHeaderLineType.Flag
bedRecords(values(0)) = (values(1).toInt, values(2).toInt, "") :: bedRecords.getOrElse(values(0), Nil)
}
logger.info("Sorting bed records")
// Sort records when needed
for ((chr, record) <- bedRecords) {
bedRecords(chr) = record.sortBy(x => (x._1, x._2))
}
val bedRecords = BedRecordList.fromFile(cmdArgs.bedFile).sorted
logger.info("Starting output file")
val reader = new VCFFileReader(commandArgs.inputFile, false)
val reader = new VCFFileReader(cmdArgs.inputFile, false)
val header = reader.getFileHeader
val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
setOutputFile(commandArgs.outputFile).
setOutputFile(cmdArgs.outputFile).
setReferenceDictionary(header.getSequenceDictionary).
build)
header.addMetaDataLine(new VCFInfoHeaderLine(commandArgs.fieldName,
VCFHeaderLineCount.UNBOUNDED, fieldType, commandArgs.fieldDescription))
header.addMetaDataLine(new VCFInfoHeaderLine(cmdArgs.fieldName,
VCFHeaderLineCount.UNBOUNDED, fieldType, cmdArgs.fieldDescription))
writer.writeHeader(header)
logger.info("Start reading vcf records")
for (record <- reader) {
val overlaps = bedRecords.getOrElse(record.getContig, Nil).filter(x => {
record.getStart <= x._2 && record.getEnd >= x._1
})
val overlaps = bedRecords.overlapWith(BedRecord(record.getContig, record.getStart, record.getEnd))
if (overlaps.isEmpty) {
writer.add(record)
} else {
val builder = new VariantContextBuilder(record)
if (fieldType == VCFHeaderLineType.Flag) builder.attribute(commandArgs.fieldName, true)
else builder.attribute(commandArgs.fieldName, overlaps.map(_._3).mkString(","))
if (fieldType == VCFHeaderLineType.Flag) builder.attribute(cmdArgs.fieldName, true)
else builder.attribute(cmdArgs.fieldName, overlaps.map(_.name).mkString(","))
writer.add(builder.make)
}
}
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.tools
import java.io.{ PrintWriter, InputStream, File }
import java.util
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.extensions.rscript.ScatterPlot
import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
import scala.collection.JavaConversions._
import scala.collection.mutable
object RegionAfCount extends ToolCommand {
// Command line arguments for RegionAfCount.
// NOTE(review): bedFile and outputPrefix default to null, but both are marked
// required() in OptParser, so they are always set after successful parsing.
case class Args(bedFile: File = null,
outputPrefix: String = null,
scatterpPlot: Boolean = false,
vcfFiles: List[File] = Nil) extends AbstractArgs
/**
 * scopt parser for [[Args]]:
 * -b bed file (required, exactly once), -o output prefix (required, exactly once),
 * -s enable the scatter plot, -V vcf file (repeatable, at least once).
 */
class OptParser extends AbstractOptParser {
opt[File]('b', "bedFile") unbounded () required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(bedFile = x)
}
opt[String]('o', "outputPrefix") unbounded () required () maxOccurs 1 valueName "<file prefix>" action { (x, c) =>
c.copy(outputPrefix = x)
}
// Flag option: the Unit value x is ignored, presence alone enables the plot
opt[Unit]('s', "scatterPlot") unbounded () action { (x, c) =>
c.copy(scatterpPlot = true)
}
// Each occurrence appends one vcf file, preserving command-line order
opt[File]('V', "vcfFile") unbounded () minOccurs 1 action { (x, c) =>
c.copy(vcfFiles = c.vcfFiles ::: x :: Nil)
}
}
def main(args: Array[String]): Unit = {
val argsParser = new OptParser
val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
logger.info("Start")
logger.info("Reading bed file")
val bedRecords = BedRecordList.fromFile(cmdArgs.bedFile).sorted
logger.info(s"Combine ${bedRecords.allRecords.size} bed records")
val combinedBedRecords = bedRecords.combineOverlap
logger.info(s"${combinedBedRecords.allRecords.size} left")
logger.info(s"${combinedBedRecords.allRecords.size * cmdArgs.vcfFiles.size} query's to do")
logger.info("Reading vcf files")
case class AfCounts(var names: Double = 0,
var namesExons: Double = 0,
var namesIntrons: Double = 0,
var namesCoding: Double = 0,
var utr: Double = 0,
var utr5: Double = 0,
var utr3: Double = 0)
var c = 0
val afCounts = (for (vcfFile <- cmdArgs.vcfFiles.par) yield vcfFile -> {
val reader = new VCFFileReader(vcfFile, true)
val afCounts: mutable.Map[String, AfCounts] = mutable.Map()
for (region <- combinedBedRecords.allRecords) yield {
val originals = region.originals()
for (variant <- reader.query(region.chr, region.start, region.end)) {
val sum = (variant.getAttribute("AF", 0) match {
case a: util.ArrayList[_] => a.map(_.toString.toDouble).toArray
case s => Array(s.toString.toDouble)
}).sum
val interval = BedRecord(variant.getContig, variant.getStart, variant.getEnd)
originals.foreach { x =>
val name = x.name.getOrElse(s"${x.chr}:${x.start}-${x.end}")
if (!afCounts.contains(name)) afCounts += name -> AfCounts()
afCounts(name).names += sum
val exons = x.exons.getOrElse(Seq()).filter(_.overlapWith(interval))
val introns = x.introns.getOrElse(Seq()).filter(_.overlapWith(interval))
val utr5 = x.utr5.map(_.overlapWith(interval))
val utr3 = x.utr3.map(_.overlapWith(interval))
if (exons.nonEmpty) {
afCounts(name).namesExons += sum
if (!utr5.getOrElse(false) && !utr3.getOrElse(false)) afCounts(name).namesCoding += sum
}
if (introns.nonEmpty) afCounts(name).namesIntrons += sum
if (utr5.getOrElse(false) || utr3.getOrElse(false)) afCounts(name).utr += sum
if (utr5.getOrElse(false)) afCounts(name).utr5 += sum
if (utr3.getOrElse(false)) afCounts(name).utr3 += sum