Commit ad151df9 authored by akaljuvee's avatar akaljuvee

Parsing VCF files in separate jobs

parent 1740b1b9
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.tools
import java.io.File

import nl.lumc.sasc.biopet.core.ToolCommandFunction
import nl.lumc.sasc.biopet.core.summary.Summarizable
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Configurable

import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output }
/**
 * Queue wrapper around the [[nl.lumc.sasc.biopet.tools.vcfstats.VcfStatsForSv]] tool.
 *
 * Runs the SV-statistics tool on a single VCF file and exposes the resulting
 * YAML stats file as summary statistics for the pipeline summary database.
 */
class VcfStatsForSv(val parent: Configurable) extends ToolCommandFunction with Summarizable {
  def toolObject = nl.lumc.sasc.biopet.tools.vcfstats.VcfStatsForSv

  // This job only produces intermediate statistics, not a pipeline end product.
  mainFunction = false

  @Input(required = true)
  var inputFile: File = _

  // Bin boundaries are plain integers, not files. @Input is reserved for file
  // dependencies that GATK Queue tracks for scheduling, so a non-File field
  // must be exposed as a command-line argument instead.
  @Argument(required = true)
  var histogramBinBoundaries: Array[Int] = _

  @Output(required = true)
  var outputFile: File = _

  /** Command line: one -i/-o pair plus one --histBinBoundaries flag per boundary value. */
  override def cmdLine = super.cmdLine +
    required("-i", inputFile) +
    required("-o", outputFile) +
    repeat("--histBinBoundaries", histogramBinBoundaries)

  /** The tool writes its counts as YAML; load that file back as the summary stats map. */
  def summaryStats: Map[String, Any] = ConfigUtils.fileToConfigMap(outputFile)

  def summaryFiles: Map[String, File] = Map.empty
}
package nl.lumc.sasc.biopet.tools.vcfstats
import java.io.File
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.utils.{ ConfigUtils, ToolCommand }
import scala.collection.JavaConversions._
/**
 * Command-line tool that parses a VCF file containing structural variants and
 * counts records per SV type (DEL/INS/DUP/INV) and size class, plus a single
 * count for translocations (TRA/CTX/ITX). The counts are written as YAML.
 */
object VcfStatsForSv extends ToolCommand {

  /** Commandline arguments */
  case class Args(inputFile: File = null, outputFile: File = null, histBinBoundaries: Array[Int] = Array()) extends AbstractArgs

  /** Parsing commandline arguments */
  class OptParser extends AbstractOptParser {
    opt[File]('i', "inputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
      c.copy(inputFile = x)
    } validate {
      x => if (x.exists) success else failure("Input VCF required")
    } text "Input VCF file (required)"
    opt[File]('o', "outputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
      c.copy(outputFile = x)
    } text "Output file (required)"
    // May occur any number of times; each occurrence appends one boundary value.
    opt[Int]("histBinBoundaries") required () unbounded () action { (x, c) =>
      c.copy(histBinBoundaries = c.histBinBoundaries :+ x)
    } text "When counting the records, sv-s are divided to different size classes, this parameter should give the boundaries between these classes."
  }

  protected var cmdArgs: Args = _

  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    cmdArgs = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)

    logger.info(s"Parsing file: ${cmdArgs.inputFile}")

    val stats: Map[String, Any] = getVariantCounts(cmdArgs.inputFile, cmdArgs.histBinBoundaries)
    ConfigUtils.mapToYamlFile(stats, cmdArgs.outputFile)
  }

  /**
   * Parses a vcf-file and counts sv-s by type and size. Sv-s are divided to different
   * size classes, the parameter histogramBinBoundaries gives the boundaries between these classes.
   *
   * @param vcfFile the VCF file to scan (read without requiring an index)
   * @param histogramBinBoundaries upper boundaries of the size classes; N boundaries yield N+1 bins
   * @return map from SV type to per-bin counts (Array[Int]) and "TRA" to a single Int count
   */
  def getVariantCounts(vcfFile: File, histogramBinBoundaries: Array[Int]): Map[String, Any] = {
    // One counter array per size-classified SV type; last bin catches sizes above all boundaries.
    val delCounts, insCounts, dupCounts, invCounts = Array.fill(histogramBinBoundaries.size + 1) { 0 }
    var traCount = 0

    val reader = new VCFFileReader(vcfFile, false)
    // Close the reader even when a malformed record makes iteration throw,
    // otherwise the underlying file handle leaks.
    try {
      for (record <- reader) {
        record.getAttributeAsString("SVTYPE", "") match {
          // Translocations have no meaningful length, so they get one aggregate count.
          case "TRA" | "CTX" | "ITX" => traCount += 1
          case svType => {
            val size = record.getEnd - record.getStart
            // Find the first bin whose boundary is not exceeded by this SV's size.
            var i = 0
            while (i < histogramBinBoundaries.size && size > histogramBinBoundaries(i)) i += 1
            svType match {
              case "DEL" => delCounts(i) += 1
              case "INS" => insCounts(i) += 1
              case "DUP" => dupCounts(i) += 1
              case "INV" => invCounts(i) += 1
              case _     => logger.warn(s"Vcf file contains a record of unknown type: file-$vcfFile, type-$svType")
            }
          }
        }
      }
    } finally reader.close()

    Map("DEL" -> delCounts, "INS" -> insCounts, "DUP" -> dupCounts, "INV" -> invCounts, "TRA" -> traCount)
  }
}
...@@ -14,18 +14,15 @@ ...@@ -14,18 +14,15 @@
*/ */
package nl.lumc.sasc.biopet.pipelines.shiva package nl.lumc.sasc.biopet.pipelines.shiva
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.core.summary.{ Summarizable, SummaryQScript } import nl.lumc.sasc.biopet.core.summary.{ Summarizable, SummaryQScript }
import nl.lumc.sasc.biopet.core.{ PipelineCommand, Reference } import nl.lumc.sasc.biopet.core.{ PipelineCommand, Reference }
import nl.lumc.sasc.biopet.extensions.Pysvtools import nl.lumc.sasc.biopet.extensions.Pysvtools
import nl.lumc.sasc.biopet.pipelines.shiva.ShivaSvCallingReport.histogramBinBoundaries import nl.lumc.sasc.biopet.extensions.tools.VcfStatsForSv
import nl.lumc.sasc.biopet.pipelines.shiva.svcallers._ import nl.lumc.sasc.biopet.pipelines.shiva.svcallers._
import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.utils.{ BamUtils, Logging } import nl.lumc.sasc.biopet.utils.{ BamUtils, Logging }
import org.broadinstitute.gatk.queue.QScript import org.broadinstitute.gatk.queue.QScript
import scala.collection.JavaConversions._
/** /**
* Common trait for ShivaVariantcalling * Common trait for ShivaVariantcalling
* *
...@@ -101,11 +98,18 @@ class ShivaSvCalling(val parent: Configurable) extends QScript with SummaryQScri ...@@ -101,11 +98,18 @@ class ShivaSvCalling(val parent: Configurable) extends QScript with SummaryQScri
// sample tagging is however not available within this pipeline // sample tagging is however not available within this pipeline
for ((sample, mergedResultFile) <- outputMergedVCFbySample) { for ((sample, mergedResultFile) <- outputMergedVCFbySample) {
lazy val counts = getVariantCounts(mergedResultFile) val vcfStats = new VcfStatsForSv(qscript)
vcfStats.inputFile = mergedResultFile
vcfStats.outputFile = new File(outputDir, s".$sample.merged.stats")
vcfStats.histogramBinBoundaries = ShivaSvCallingReport.histogramBinBoundaries
add(vcfStats)
addSummarizable(vcfStats, "vcfstats-sv", Some(sample))
addSummarizable(new Summarizable { addSummarizable(new Summarizable {
def summaryFiles = Map("output_vcf" -> mergedResultFile) def summaryFiles = Map("output_vcf" -> mergedResultFile)
def summaryStats = counts def summaryStats = Map.empty
}, "parse_sv_vcf", Some(sample)) }, "merge_variants", Some(sample))
} }
addSummaryJobs() addSummaryJobs()
...@@ -115,39 +119,11 @@ class ShivaSvCalling(val parent: Configurable) extends QScript with SummaryQScri ...@@ -115,39 +119,11 @@ class ShivaSvCalling(val parent: Configurable) extends QScript with SummaryQScri
protected def callersList: List[SvCaller] = List(new Breakdancer(this), new Clever(this), new Delly(this), new Pindel(this)) protected def callersList: List[SvCaller] = List(new Breakdancer(this), new Clever(this), new Delly(this), new Pindel(this))
/** Settings for the summary */ /** Settings for the summary */
def summarySettings = Map("sv_callers" -> configCallers.toList, "hist_bin_boundaries" -> histogramBinBoundaries) def summarySettings = Map("sv_callers" -> configCallers.toList, "hist_bin_boundaries" -> ShivaSvCallingReport.histogramBinBoundaries)
/** Files for the summary */ /** Files for the summary */
def summaryFiles: Map[String, File] = if (inputBams.size > 1) Map("final_mergedvcf" -> outputMergedVCF) else Map.empty def summaryFiles: Map[String, File] = if (inputBams.size > 1) Map("final_mergedvcf" -> outputMergedVCF) else Map.empty
/** Parses a vcf-file and counts sv-s by type and size. Sv-s are divided to different size classes, the boundaries between these classes are those given in ShivaSvCallingReport.histogramBinBoundaries. */
def getVariantCounts(vcfFile: File): Map[String, Any] = {
val delCounts, insCounts, dupCounts, invCounts = Array.fill(histogramBinBoundaries.size + 1) { 0 }
var traCount = 0
val reader = new VCFFileReader(vcfFile, false)
for (record <- reader) {
record.getAttributeAsString("SVTYPE", "") match {
case "TRA" | "CTX" | "ITX" => traCount += 1
case svType => {
val size = record.getEnd - record.getStart
var i = 0
while (i < histogramBinBoundaries.size && size > histogramBinBoundaries(i)) i += 1
svType match {
case "DEL" => delCounts(i) += 1
case "INS" => insCounts(i) += 1
case "DUP" => dupCounts(i) += 1
case "INV" => invCounts(i) += 1
case _ => logger.warn(s"Vcf file contains a record of unknown type: file-$vcfFile, type-$svType")
}
}
}
}
reader.close()
Map("DEL" -> delCounts, "INS" -> insCounts, "DUP" -> dupCounts, "INV" -> invCounts, "TRA" -> traCount)
}
} }
object ShivaSvCalling extends PipelineCommand object ShivaSvCalling extends PipelineCommand
\ No newline at end of file
...@@ -22,7 +22,7 @@ trait ShivaSvCallingReportTrait extends Logging { ...@@ -22,7 +22,7 @@ trait ShivaSvCallingReportTrait extends Logging {
var delCounts, insCounts, dupCounts, invCounts: Map[String, Array[Long]] = Map() var delCounts, insCounts, dupCounts, invCounts: Map[String, Array[Long]] = Map()
for (sampleName <- sampleNames) { for (sampleName <- sampleNames) {
val sampleCounts: Map[String, Any] = Await.result(summary.getStat(runId, PipelineName("shivasvcalling"), ModuleName("parse_sv_vcf"), SampleName(sampleName)), Duration.Inf).get val sampleCounts: Map[String, Any] = Await.result(summary.getStat(runId, PipelineName("shivasvcalling"), ModuleName("vcfstats-sv"), SampleName(sampleName)), Duration.Inf).get
for ((svType, counts) <- sampleCounts.collect({ case (k, v: List[_]) => (k, v.toArray[Any]) })) { for ((svType, counts) <- sampleCounts.collect({ case (k, v: List[_]) => (k, v.toArray[Any]) })) {
val elem: Tuple2[String, Array[Long]] = (sampleName, counts.collect({ case x: Long => x })) val elem: Tuple2[String, Array[Long]] = (sampleName, counts.collect({ case x: Long => x }))
svType match { svType match {
...@@ -45,7 +45,7 @@ trait ShivaSvCallingReportTrait extends Logging { ...@@ -45,7 +45,7 @@ trait ShivaSvCallingReportTrait extends Logging {
def parseSummaryForTranslocations(summary: SummaryDb, runId: Int, sampleNames: Seq[String]): Map[String, Long] = { def parseSummaryForTranslocations(summary: SummaryDb, runId: Int, sampleNames: Seq[String]): Map[String, Long] = {
var traCounts: Map[String, Long] = Map() var traCounts: Map[String, Long] = Map()
for (sampleName <- sampleNames) { for (sampleName <- sampleNames) {
val counts: Map[String, Any] = Await.result(summary.getStat(runId, PipelineName("shivasvcalling"), ModuleName("parse_sv_vcf"), SampleName(sampleName)), Duration.Inf).get val counts: Map[String, Any] = Await.result(summary.getStat(runId, PipelineName("shivasvcalling"), ModuleName("vcfstats-sv"), SampleName(sampleName)), Duration.Inf).get
counts.get("TRA") match { counts.get("TRA") match {
case Some(c: Long) => traCounts += (sampleName -> c) case Some(c: Long) => traCounts += (sampleName -> c)
case Some(c) => logger.error(s"Unable to parse translocation counts from summary db for sample $sampleName (type mismatch, type in the db: ${c.getClass})") case Some(c) => logger.error(s"Unable to parse translocation counts from summary db for sample $sampleName (type mismatch, type in the db: ${c.getClass})")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment