Commit 9bbaa813 authored by Sander Bollen's avatar Sander Bollen
Browse files

Merge branch 'feature-region_af_count' into 'develop'

Feature region af count

This is a new tool made for SASC project 71. The tool computes sums of AF from a vcf file for a given set of regions. It also generates a scatter plot when multiple vcf files are given; the first vcf file is always the reference set, such as GoNL or 1000 Genomes.
Will become a fix for multiple issues:
- [x] BedFile reader, since the htsjdk bed reader is not fully supported in Scala
 - [x] Extraction of Exons
 - [x] Extraction of Introns
 - [x] Extraction of UTR regions
 - [x] Squish method, required by #49
 - [x] Combine overlapping records
 - [x] Scatter regions with a given approximate bin size 
- [x] #192 
 - [x] Use Biopet interval interface
 - [x] Extract features when given in a bed file
- [x] #148 (need for #171)
 - [x] Use Biopet interval interface
 - [x] Use general scatter method
- [x] #49 
 - [x] Use Biopet interval interface
 - [x] Use general Squish method

See merge request !218
parents 43178ffa 3e58bd92
......@@ -12,10 +12,12 @@ import nl.lumc.sasc.biopet.core.config.Configurable
class ApplyRecalibration(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.ApplyRecalibration with GatkGeneral {
scatterCount = config("scattercount", default = 0)
override val defaultThreads = 3
override def beforeGraph() {
super.beforeGraph()
nt = Option(getThreads(3))
nt = Option(getThreads)
memoryLimit = Option(nt.getOrElse(1) * 2)
import org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode
......
......@@ -9,6 +9,9 @@ import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType
class HaplotypeCaller(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.HaplotypeCaller with GatkGeneral {
override val defaultThreads = 1
min_mapping_quality_score = config("minMappingQualityScore", default = 20)
scatterCount = config("scattercount", default = 1)
if (config.contains("dbsnp")) this.dbsnp = config("dbsnp")
......@@ -41,7 +44,7 @@ class HaplotypeCaller(val root: Configurable) extends org.broadinstitute.gatk.qu
threads = 1
logger.warn("BamOutput is on, nct/threads is forced to set on 1, this option is only for debug")
}
nct = Some(getThreads(1))
nct = Some(getThreads)
memoryLimit = Option(memoryLimit.getOrElse(2.0) * nct.getOrElse(1))
}
......
......@@ -26,11 +26,13 @@ class UnifiedGenotyper(val root: Configurable) extends org.broadinstitute.gatk.q
}
}
override val defaultThreads = 1
override def beforeGraph() {
super.beforeGraph()
genotype_likelihoods_model = org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.BOTH
nct = Some(getThreads(1))
nct = Some(getThreads)
memoryLimit = Option(nct.getOrElse(1) * memoryLimit.getOrElse(2.0))
}
}
......@@ -11,7 +11,9 @@ import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile
class VariantRecalibrator(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.VariantRecalibrator with GatkGeneral {
nt = Option(getThreads(4))
override val defaultThreads = 4
nt = Option(getThreads)
memoryLimit = Option(nt.getOrElse(1) * 2)
if (config.contains("dbsnp")) resource :+= new TaggedFile(config("dbsnp").asString, "known=true,training=false,truth=false,prior=2.0")
......
# Scatter plot of a tsv table: the first column becomes the x-axis ("Rank"),
# every remaining column is plotted as a separate colored series.
library(reshape2)
library(ggplot2)
library(argparse)

# FIX: description was leftover argparse example text ("Process some integers").
parser <- ArgumentParser(description = 'Scatter plot of a tsv table')
parser$add_argument('--input', dest = 'input', type = 'character', help = 'Input tsv file', required = TRUE)
parser$add_argument('--output', dest = 'output', type = 'character', help = 'Output png file', required = TRUE)
parser$add_argument('--width', dest = 'width', type = 'integer', default = 500)
parser$add_argument('--height', dest = 'height', type = 'integer', default = 500)
parser$add_argument('--xlabel', dest = 'xlabel', type = 'character')
parser$add_argument('--ylabel', dest = 'ylabel', type = 'character', required = TRUE)
parser$add_argument('--llabel', dest = 'llabel', type = 'character')
parser$add_argument('--title', dest = 'title', type = 'character')
parser$add_argument('--removeZero', dest = 'removeZero', type = 'character', default = "false")

arguments <- parser$parse_args()

png(filename = arguments$output, width = arguments$width, height = arguments$height)

DF <- read.table(arguments$input, header = TRUE)

# Default x-axis label is the name of the first column of the input table.
if (is.null(arguments$xlabel)) xlab <- colnames(DF)[1] else xlab <- arguments$xlabel

colnames(DF)[1] <- "Rank"
DF1 <- melt(DF, id.var = "Rank")

# Optionally drop zero values before plotting (single combined check).
if (arguments$removeZero == "true") {
    DF1 <- DF1[DF1$value > 0, ]
    print("Removed 0 values")
}

# FIX 1: theme_bw() is a *complete* theme and replaces all prior theme()
#        settings, so it must be applied before theme(axis.text.x = ...);
#        previously the x-axis label rotation was silently reset.
# FIX 2: the legend is driven by the `color` aesthetic, so the legend title
#        must be set via guides(color = ...); guides(fill = ...) had no effect.
ggplot(DF1, aes(x = Rank, y = value, group = variable, color = variable)) +
    xlab(xlab) +
    ylab(arguments$ylabel) +
    ggtitle(arguments$title) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) +
    guides(color = guide_legend(title = arguments$llabel)) +
    geom_point()

dev.off()
......@@ -199,13 +199,15 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab
BiopetCommandLineFunctionTrait.versionCache.get(versionCommand)
}
def getThreads: Int = getThreads(defaultThreads)
/**
* Get threads from config
* @param default default when not found in config
* @return number of threads
*/
def getThreads(default: Int): Int = {
val maxThreads: Int = config("maxthreads", default = 8)
val maxThreads: Int = config("maxthreads", default = 24)
val threads: Int = config("threads", default = default)
if (maxThreads > threads) threads
else maxThreads
......@@ -218,7 +220,7 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab
* @return number of threads
*/
def getThreads(default: Int, module: String): Int = {
val maxThreads: Int = config("maxthreads", default = 8, submodule = module)
val maxThreads: Int = config("maxthreads", default = 24, submodule = module)
val threads: Int = config("threads", default = default, submodule = module)
if (maxThreads > threads) threads
else maxThreads
......
......@@ -74,7 +74,7 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction {
/** Sets correct output files to job */
override def beforeGraph() {
require(w != null)
if (threads == 0) threads = getThreads(defaultThreads)
if (threads == 0) threads = getThreads
executable = if (threads > 1 && executableThreads.isDefined) executableThreads.get else executableNonThreads
super.beforeGraph()
out :::= List(Some(getInfoFile), getBestTreeFile, getBootstrapFile, getBipartitionsFile).flatten
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.rscript
import java.io.File
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.RscriptCommandLineFunction
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
 * Extension for a general scatter plot with R.
 *
 * Wraps the `plotScatter.R` script: every field below is forwarded as a
 * command line option; `input`, `output` and `ylabel` are required, the
 * rest are optional.
 *
 * Created by pjvan_thof on 4/29/15.
 */
class ScatterPlot(val root: Configurable) extends RscriptCommandLineFunction {
// R script to execute; overridable via the "script" config key
protected var script: File = config("script", default = "plotScatter.R")

// Input tsv table (the R script uses the first column as x-axis, the rest as series)
@Input
var input: File = _

// Output png file written by the R script
@Output
var output: File = _

// Plot dimensions in pixels (the R script falls back to 500x500 when absent)
var width: Option[Int] = config("width")
var height: Option[Int] = config("height")
// Axis / legend / title texts; only ylabel is required by the script
var xlabel: Option[String] = config("xlabel")
var ylabel: Option[String] = config("ylabel")
var llabel: Option[String] = config("llabel")
var title: Option[String] = config("title")
// When true the R script drops rows with value 0 before plotting
var removeZero: Boolean = config("removeZero", default = false)

/** Builds the Rscript command line from the fields above. */
override def cmdLine: String = super.cmdLine +
required("--input", input) +
required("--output", output) +
optional("--width", width) +
optional("--height", height) +
optional("--xlabel", xlabel) +
required("--ylabel", ylabel) +
optional("--llabel", llabel) +
optional("--title", title) +
optional("--removeZero", removeZero)
}
......@@ -21,6 +21,7 @@ import htsjdk.variant.variantcontext.VariantContextBuilder
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.vcf.{ VCFFileReader, VCFHeaderLineCount, VCFHeaderLineType, VCFInfoHeaderLine }
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
import scala.collection.JavaConversions._
import scala.collection.mutable
......@@ -83,19 +84,9 @@ object AnnotateVcfWithBed extends ToolCommand {
logger.info("Start")
val argsParser = new OptParser
val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val bedRecords: mutable.Map[String, List[(Int, Int, String)]] = mutable.Map()
// Read bed file
/*
// function bedRecord.getName will not compile, not clear why
for (bedRecord <- asScalaIteratorConverter(getFeatureReader(commandArgs.bedFile.toPath.toString, new BEDCodec(), false).iterator()).asScala) {
logger.debug(bedRecord)
bedRecords(bedRecord.getChr) = (bedRecord.getStart, bedRecord.getEnd, bedRecord.getName) :: bedRecords.getOrElse(bedRecord.getChr, Nil)
}
*/
val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val fieldType = commandArgs.fieldType match {
val fieldType = cmdArgs.fieldType match {
case "Integer" => VCFHeaderLineType.Integer
case "Flag" => VCFHeaderLineType.Flag
case "Character" => VCFHeaderLineType.Character
......@@ -104,48 +95,32 @@ object AnnotateVcfWithBed extends ToolCommand {
}
logger.info("Reading bed file")
for (line <- Source.fromFile(commandArgs.bedFile).getLines()) {
val values = line.split("\t")
if (values.size >= 4)
bedRecords(values(0)) = (values(1).toInt, values(2).toInt, values(3)) :: bedRecords.getOrElse(values(0), Nil)
else values.size >= 3 && fieldType == VCFHeaderLineType.Flag
bedRecords(values(0)) = (values(1).toInt, values(2).toInt, "") :: bedRecords.getOrElse(values(0), Nil)
}
logger.info("Sorting bed records")
// Sort records when needed
for ((chr, record) <- bedRecords) {
bedRecords(chr) = record.sortBy(x => (x._1, x._2))
}
val bedRecords = BedRecordList.fromFile(cmdArgs.bedFile).sorted
logger.info("Starting output file")
val reader = new VCFFileReader(commandArgs.inputFile, false)
val reader = new VCFFileReader(cmdArgs.inputFile, false)
val header = reader.getFileHeader
val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
setOutputFile(commandArgs.outputFile).
setOutputFile(cmdArgs.outputFile).
setReferenceDictionary(header.getSequenceDictionary).
build)
header.addMetaDataLine(new VCFInfoHeaderLine(commandArgs.fieldName,
VCFHeaderLineCount.UNBOUNDED, fieldType, commandArgs.fieldDescription))
header.addMetaDataLine(new VCFInfoHeaderLine(cmdArgs.fieldName,
VCFHeaderLineCount.UNBOUNDED, fieldType, cmdArgs.fieldDescription))
writer.writeHeader(header)
logger.info("Start reading vcf records")
for (record <- reader) {
val overlaps = bedRecords.getOrElse(record.getContig, Nil).filter(x => {
record.getStart <= x._2 && record.getEnd >= x._1
})
val overlaps = bedRecords.overlapWith(BedRecord(record.getContig, record.getStart, record.getEnd))
if (overlaps.isEmpty) {
writer.add(record)
} else {
val builder = new VariantContextBuilder(record)
if (fieldType == VCFHeaderLineType.Flag) builder.attribute(commandArgs.fieldName, true)
else builder.attribute(commandArgs.fieldName, overlaps.map(_._3).mkString(","))
if (fieldType == VCFHeaderLineType.Flag) builder.attribute(cmdArgs.fieldName, true)
else builder.attribute(cmdArgs.fieldName, overlaps.map(_.name).mkString(","))
writer.add(builder.make)
}
}
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.tools
import java.io.{ PrintWriter, InputStream, File }
import java.util
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.extensions.rscript.ScatterPlot
import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
import scala.collection.JavaConversions._
import scala.collection.mutable
/**
 * Sums AF (allele frequency) values from one or more vcf files for a given
 * set of bed regions and writes one tsv per feature category (exons,
 * introns, coding, utr, utr5, utr3). Optionally renders a scatter plot
 * per tsv via the ScatterPlot R wrapper.
 */
object RegionAfCount extends ToolCommand {

  /** Commandline arguments. NOTE(review): `scatterpPlot` contains a typo but is
    * kept as-is since Args is part of this object's public surface. */
  case class Args(bedFile: File = null,
                  outputPrefix: String = null,
                  scatterpPlot: Boolean = false,
                  vcfFiles: List[File] = Nil) extends AbstractArgs

  class OptParser extends AbstractOptParser {
    opt[File]('b', "bedFile") unbounded () required () maxOccurs 1 valueName "<file>" action { (x, c) =>
      c.copy(bedFile = x)
    }
    opt[String]('o', "outputPrefix") unbounded () required () maxOccurs 1 valueName "<file prefix>" action { (x, c) =>
      c.copy(outputPrefix = x)
    }
    opt[Unit]('s', "scatterPlot") unbounded () action { (x, c) =>
      c.copy(scatterpPlot = true)
    }
    opt[File]('V', "vcfFile") unbounded () minOccurs 1 action { (x, c) =>
      c.copy(vcfFiles = c.vcfFiles ::: x :: Nil)
    }
  }

  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    logger.info("Start")
    logger.info("Reading bed file")

    val bedRecords = BedRecordList.fromFile(cmdArgs.bedFile).sorted

    logger.info(s"Combine ${bedRecords.allRecords.size} bed records")
    val combinedBedRecords = bedRecords.combineOverlap
    logger.info(s"${combinedBedRecords.allRecords.size} left")
    logger.info(s"${combinedBedRecords.allRecords.size * cmdArgs.vcfFiles.size} query's to do")
    logger.info("Reading vcf files")

    // Per-region-name accumulators of summed AF values per feature category.
    case class AfCounts(var names: Double = 0,
                        var namesExons: Double = 0,
                        var namesIntrons: Double = 0,
                        var namesCoding: Double = 0,
                        var utr: Double = 0,
                        var utr5: Double = 0,
                        var utr3: Double = 0)

    // FIX: the progress counter is incremented from inside a parallel
    // collection (`cmdArgs.vcfFiles.par`); the original plain `var c` with
    // `c += 1` is a data race (lost updates). AtomicInteger is thread-safe.
    val regionsDone = new util.concurrent.atomic.AtomicInteger(0)

    val afCounts = (for (vcfFile <- cmdArgs.vcfFiles.par) yield vcfFile -> {
      val reader = new VCFFileReader(vcfFile, true)
      // Inner, per-vcf map; intentionally mutable for accumulation only.
      val afCounts: mutable.Map[String, AfCounts] = mutable.Map()
      for (region <- combinedBedRecords.allRecords) {
        val originals = region.originals()
        for (variant <- reader.query(region.chr, region.start, region.end)) {
          // "AF" may be a single value or a list; sum all alt-allele AFs.
          val sum = (variant.getAttribute("AF", 0) match {
            case a: util.ArrayList[_] => a.map(_.toString.toDouble).toArray
            case s                    => Array(s.toString.toDouble)
          }).sum
          val interval = BedRecord(variant.getContig, variant.getStart, variant.getEnd)
          originals.foreach { x =>
            val name = x.name.getOrElse(s"${x.chr}:${x.start}-${x.end}")
            if (!afCounts.contains(name)) afCounts += name -> AfCounts()
            afCounts(name).names += sum
            val exons = x.exons.getOrElse(Seq()).filter(_.overlapWith(interval))
            val introns = x.introns.getOrElse(Seq()).filter(_.overlapWith(interval))
            val utr5 = x.utr5.map(_.overlapWith(interval))
            val utr3 = x.utr3.map(_.overlapWith(interval))
            if (exons.nonEmpty) {
              afCounts(name).namesExons += sum
              // Coding = in an exon but not overlapping either UTR.
              if (!utr5.getOrElse(false) && !utr3.getOrElse(false)) afCounts(name).namesCoding += sum
            }
            if (introns.nonEmpty) afCounts(name).namesIntrons += sum
            if (utr5.getOrElse(false) || utr3.getOrElse(false)) afCounts(name).utr += sum
            if (utr5.getOrElse(false)) afCounts(name).utr5 += sum
            if (utr3.getOrElse(false)) afCounts(name).utr3 += sum
          }
        }
        // One tick per (region, vcf file) pair, as before.
        val done = regionsDone.incrementAndGet()
        if (done % 100 == 0) logger.info(s"$done regions done")
      }
      reader.close() // FIX: reader was never closed (resource leak)
      afCounts.toMap
    }).toMap

    logger.info(s"Done reading, ${regionsDone.get()} regions")
    logger.info("Writing output files")

    /** Writes one tsv: rows = region names, columns = vcf files, cells = selected AF sum. */
    def writeOutput(tsvFile: File, function: AfCounts => Double): Unit = {
      val writer = new PrintWriter(tsvFile)
      writer.println("\t" + cmdArgs.vcfFiles.map(_.getName).mkString("\t"))
      for (r <- cmdArgs.vcfFiles.foldLeft(Set[String]())((a, b) => a ++ afCounts(b).keySet)) {
        writer.print(r + "\t")
        writer.println(cmdArgs.vcfFiles.map(x => function(afCounts(x).getOrElse(r, AfCounts()))).mkString("\t"))
      }
      writer.close()
      if (cmdArgs.scatterpPlot) generatePlot(tsvFile)
    }

    /** Runs the ScatterPlot R wrapper locally on a tsv, writing a png next to it. */
    def generatePlot(tsvFile: File): Unit = {
      logger.info(s"Generate plot for $tsvFile")

      val scatterPlot = new ScatterPlot(null)
      scatterPlot.input = tsvFile
      scatterPlot.output = new File(tsvFile.getAbsolutePath.stripSuffix(".tsv") + ".png")
      scatterPlot.ylabel = Some("Sum of AFs")
      scatterPlot.width = Some(1200)
      scatterPlot.height = Some(1000)
      scatterPlot.runLocal()
    }

    // Each output file writes to a distinct path, so running them in parallel is safe.
    for (
      arg <- List[(File, AfCounts => Double)](
        (new File(cmdArgs.outputPrefix + ".names.tsv"), _.names),
        (new File(cmdArgs.outputPrefix + ".names.exons_only.tsv"), _.namesExons),
        (new File(cmdArgs.outputPrefix + ".names.introns_only.tsv"), _.namesIntrons),
        (new File(cmdArgs.outputPrefix + ".names.coding.tsv"), _.namesCoding),
        (new File(cmdArgs.outputPrefix + ".names.utr.tsv"), _.utr),
        (new File(cmdArgs.outputPrefix + ".names.utr5.tsv"), _.utr5),
        (new File(cmdArgs.outputPrefix + ".names.utr3.tsv"), _.utr3)
      ).par
    ) writeOutput(arg._1, arg._2)
    logger.info("Done")
  }
}
\ No newline at end of file
package nl.lumc.sasc.biopet.tools
import java.io.File
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
/**
* Created by pjvanthof on 22/08/15.
*/
/**
 * Squishes a bed file (removes overlap between records, optionally strand
 * sensitive) and reports base counts before and after.
 *
 * Created by pjvanthof on 22/08/15.
 */
object SquishBed extends ToolCommand {

  /** Commandline arguments. */
  case class Args(input: File = null,
                  output: File = null,
                  strandSensitive: Boolean = false) extends AbstractArgs

  class OptParser extends AbstractOptParser {
    opt[File]('I', "input") required () valueName "<file>" action { (x, c) =>
      c.copy(input = x)
    }
    opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) =>
      c.copy(output = x)
    }
    // FIX: this is a value-less flag; the stray `valueName "<file>"` (copy-pasted
    // from the file options above) was removed — a Unit option takes no value,
    // so the usage text advertised a bogus <file> argument.
    opt[Unit]('s', "strandSensitive") unbounded () action { (x, c) =>
      c.copy(strandSensitive = true)
    }
  }

  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    if (!cmdArgs.input.exists) throw new IllegalStateException("Input file not found, file: " + cmdArgs.input)

    logger.info("Start")

    val records = BedRecordList.fromFile(cmdArgs.input)
    // Report total bases and distinct reference bases before squishing.
    val length = records.length
    val refLength = records.combineOverlap.length
    logger.info(s"Total bases: $length")
    logger.info(s"Total bases on reference: $refLength")
    logger.info("Start squishing")
    val squishBed = records.squishBed(cmdArgs.strandSensitive).sorted
    logger.info("Done squishing")
    // Same stats after squishing, plus how many reference bases were removed.
    val squishLength = squishBed.length
    val squishRefLength = squishBed.combineOverlap.length
    logger.info(s"Total bases left: $squishLength")
    logger.info(s"Total bases left on reference: $squishRefLength")
    logger.info(s"Total bases removed from ref: ${refLength - squishRefLength}")
    squishBed.writeToFile(cmdArgs.output)
    logger.info("Done")
  }
}
......@@ -24,6 +24,7 @@ import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.core.summary.{ Summarizable, SummaryQScript }
import nl.lumc.sasc.biopet.core.{ Reference, ToolCommand, ToolCommandFuntion }
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.collection.JavaConversions._
......@@ -146,12 +147,9 @@ object VcfStats extends ToolCommand {
opt[File]('o', "outputDir") required () unbounded () valueName "<file>" action { (x, c) =>
c.copy(outputDir = x)
}
//TODO: add interval argument
/*
opt[File]('i', "intervals") unbounded () valueName ("<file>") action { (x, c) =>
c.copy(intervals = Some(x))
}
*/
opt[String]("infoTag") unbounded () valueName "<tag>" action { (x, c) =>
c.copy(infoTags = x :: c.infoTags)
}
......@@ -258,7 +256,7 @@ object VcfStats extends ToolCommand {
}
}
protected var commandArgs: Args = _
protected var cmdArgs: Args = _
val defaultGenotypeFields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")
......@@ -273,9 +271,9 @@ object VcfStats extends ToolCommand {
def main(args: Array[String]): Unit = {
logger.info("Started")
val argsParser = new OptParser
commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
cmdArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val reader = new VCFFileReader(commandArgs.inputFile, true)
val reader = new VCFFileReader(cmdArgs.inputFile, true)
val header = reader.getFileHeader
val samples = header.getSampleNamesInOrder.toList
......@@ -283,44 +281,36 @@ object VcfStats extends ToolCommand {
val adInfoTags = {
(for (
infoTag <- commandArgs.infoTags if !defaultInfoFields.contains(infoTag)
infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)
) yield {