Commit 4258ac1b authored by Sander Bollen's avatar Sander Bollen
Browse files

Merge branch 'feature-genotype_concordance' into 'develop'

Added GenotypeConcordance

Fixes #227, #228 and #229 

Adding here GenotypeConcordance of gatk. This can calculate concordance on multisample vcf files.

See merge request !260
parents fe9f50f7 c40dda46
......@@ -5,13 +5,15 @@
*/
package nl.lumc.sasc.biopet.extensions.gatk.broad
import nl.lumc.sasc.biopet.core.{ Version, CommandLineResources, Reference, BiopetJavaCommandLineFunction }
import nl.lumc.sasc.biopet.core._
import org.broadinstitute.gatk.engine.phonehome.GATKRunReport
import org.broadinstitute.gatk.queue.extensions.gatk.CommandLineGATK
trait GatkGeneral extends CommandLineGATK with CommandLineResources with Reference with Version {
memoryLimit = Option(3)
var executable: String = config("java", default = "java", submodule = "java", freeVar = false)
override def subPath = "gatk" :: super.subPath
jarFile = config("gatk_jar")
......@@ -20,6 +22,7 @@ trait GatkGeneral extends CommandLineGATK with CommandLineResources with Referen
override def defaultCoreMemory = 4.0
override def faiRequired = true
override def dictRequired = true
if (config.contains("intervals")) intervals = config("intervals").asFileList
if (config.contains("exclude_intervals")) excludeIntervals = config("exclude_intervals").asFileList
......@@ -39,5 +42,8 @@ trait GatkGeneral extends CommandLineGATK with CommandLineResources with Referen
override def versionExitcode = List(0, 1)
def versionCommand = "java" + " -jar " + jarFile + " -version"
override def getVersion = super.getVersion.collect { case v => "Gatk " + v }
override def getVersion = {
BiopetCommandLineFunction.preProcessExecutable(executable).path.foreach(executable = _)
super.getVersion.collect { case v => "Gatk " + v }
}
}
......@@ -26,7 +26,6 @@ import org.ggf.drmaa.JobTemplate
import scala.collection.mutable
import scala.io.Source
import scala.sys.process.{ Process, ProcessLogger }
import scala.util.matching.Regex
import scala.collection.JavaConversions._
/** Biopet command line trait to auto check executable and cluster values */
......@@ -50,20 +49,18 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
val writer = new PrintWriter(file)
writer.println("set -eubf")
writer.println("set -o pipefail")
lines.foreach(writer.println(_))
lines.foreach(writer.println)
writer.close()
}
// This overrides the default "sh" from queue. For Biopet the default is "bash"
updateJobRun = {
case jt: JobTemplate => {
case jt: JobTemplate =>
changeScript(new File(jt.getArgs.head.toString))
jt.setRemoteCommand(remoteCommand)
}
case ps: ProcessSettings => {
case ps: ProcessSettings =>
changeScript(new File(ps.getCommand.tail.head))
ps.setCommand(Array(remoteCommand) ++ ps.getCommand.tail)
}
}
/**
......@@ -91,50 +88,19 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
}
/** can override this value is executable may not be converted to CanonicalPath */
/**
* Can override this value is executable may not be converted to CanonicalPath
* @deprecated
*/
val executableToCanonicalPath = true
/**
* Checks executable. Follow full CanonicalPath, checks if it is existing and do a md5sum on it to store in job report
*/
protected[core] def preProcessExecutable() {
if (!BiopetCommandLineFunction.executableMd5Cache.contains(executable)) {
if (executable != null) {
if (!BiopetCommandLineFunction.executableCache.contains(executable)) {
try {
val oldExecutable = executable
val buffer = new StringBuffer()
val cmd = Seq("which", executable)
val process = Process(cmd).run(ProcessLogger(buffer.append(_)))
if (process.exitValue == 0) {
executable = buffer.toString
val file = new File(executable)
if (executableToCanonicalPath) executable = file.getCanonicalPath
else executable = file.getAbsolutePath
} else Logging.addError("executable: '" + executable + "' not found, please check config")
BiopetCommandLineFunction.executableCache += oldExecutable -> executable
BiopetCommandLineFunction.executableCache += executable -> executable
} catch {
case ioe: java.io.IOException =>
logger.warn(s"Could not use 'which' on '$executable', check on executable skipped: " + ioe)
}
} else executable = BiopetCommandLineFunction.executableCache(executable)
if (!BiopetCommandLineFunction.executableMd5Cache.contains(executable)) {
if (new File(executable).exists()) {
val is = new FileInputStream(executable)
val cnt = is.available
val bytes = Array.ofDim[Byte](cnt)
is.read(bytes)
is.close()
val temp = MessageDigest.getInstance("MD5").digest(bytes).map("%02X".format(_)).mkString.toLowerCase
BiopetCommandLineFunction.executableMd5Cache += executable -> temp
} else BiopetCommandLineFunction.executableMd5Cache += executable -> "file_does_not_exist"
}
}
}
val md5 = BiopetCommandLineFunction.executableMd5Cache.get(executable)
addJobReportBinding("md5sum_exe", md5.getOrElse("None"))
val exe = BiopetCommandLineFunction.preProcessExecutable(executable)
exe.path.foreach(executable = _)
addJobReportBinding("md5sum_exe", exe.md5.getOrElse("N/A"))
}
/** executes checkExecutable method and fill job report */
......@@ -166,10 +132,9 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
that.beforeGraph()
that.internalBeforeGraph()
this match {
case p: BiopetPipe => {
case p: BiopetPipe =>
p.commands.last._outputAsStdout = true
new BiopetPipe(p.commands ::: that :: Nil)
}
case _ => new BiopetPipe(List(this, that))
}
}
......@@ -230,7 +195,48 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
}
/** stores global caches */
object BiopetCommandLineFunction {
object BiopetCommandLineFunction extends Logging {
private[core] val executableMd5Cache: mutable.Map[String, String] = mutable.Map()
private[core] val executableCache: mutable.Map[String, String] = mutable.Map()
case class Executable(path: Option[String], md5: Option[String])
def preProcessExecutable(executable: String): Executable = {
if (!BiopetCommandLineFunction.executableMd5Cache.contains(executable)) {
if (executable != null) {
if (!BiopetCommandLineFunction.executableCache.contains(executable)) {
try {
val buffer = new StringBuffer()
val cmd = Seq("which", executable)
val process = Process(cmd).run(ProcessLogger(buffer.append(_)))
if (process.exitValue == 0) {
val file = new File(buffer.toString)
BiopetCommandLineFunction.executableCache += executable -> file.getAbsolutePath
} else {
Logging.addError("executable: '" + executable + "' not found, please check config")
BiopetCommandLineFunction.executableCache += executable -> executable
}
} catch {
case ioe: java.io.IOException =>
logger.warn(s"Could not use 'which' on '$executable', check on executable skipped: " + ioe)
BiopetCommandLineFunction.executableCache += executable -> executable
}
}
if (!BiopetCommandLineFunction.executableMd5Cache.contains(executable)) {
val newExe = BiopetCommandLineFunction.executableCache(executable)
if (new File(newExe).exists()) {
val is = new FileInputStream(newExe)
val cnt = is.available
val bytes = Array.ofDim[Byte](cnt)
is.read(bytes)
is.close()
val temp = MessageDigest.getInstance("MD5").digest(bytes).map("%02X".format(_)).mkString.toLowerCase
BiopetCommandLineFunction.executableMd5Cache += newExe -> temp
} else BiopetCommandLineFunction.executableMd5Cache += newExe -> "file_does_not_exist"
}
}
}
Executable(BiopetCommandLineFunction.executableCache.get(executable),
BiopetCommandLineFunction.executableMd5Cache.get(executable))
}
}
......@@ -21,9 +21,9 @@ import org.broadinstitute.gatk.queue.function.JavaCommandLineFunction
trait BiopetJavaCommandLineFunction extends JavaCommandLineFunction with BiopetCommandLineFunction {
executable = config("java", default = "java", submodule = "java", freeVar = false)
javaGCThreads = config("java_gc_threads")
javaGCHeapFreeLimit = config("java_gc_heap_freelimit")
javaGCTimeLimit = config("java_gc_timelimit")
javaGCThreads = config("java_gc_threads", default = 4)
javaGCHeapFreeLimit = config("java_gc_heap_freelimit", default = 10)
javaGCTimeLimit = config("java_gc_timelimit", default = 50)
override def defaultVmemFactor: Double = 2.0
......@@ -38,8 +38,6 @@ trait BiopetJavaCommandLineFunction extends JavaCommandLineFunction with BiopetC
if (javaMainClass != null && javaClasspath.isEmpty)
javaClasspath = JavaCommandLineFunction.currentClasspath
//threads = getThreads(defaultThreads)
}
/** Creates command to execute extension */
......
......@@ -54,8 +54,6 @@ object Version extends Logging {
if (versionCache.contains(versionCommand)) return versionCache.get(versionCommand)
else if (versionCommand == null || versionRegex == null) return None
else {
val exe = new File(versionCommand.trim.split(" ")(0))
if (!exe.exists()) return None
val stdout = new StringBuffer()
val stderr = new StringBuffer()
def outputLog = "Version command: \n" + versionCommand +
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.gatk
import java.io.File
import nl.lumc.sasc.biopet.core.summary.Summarizable
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import org.broadinstitute.gatk.utils.report.{ GATKReportTable, GATKReport }
/**
* Extension for CombineVariants from GATK
*
* Created by pjvan_thof on 2/26/15.
*/
class GenotypeConcordance(val root: Configurable) extends Gatk with Summarizable {
val analysisType = "GenotypeConcordance"
@Input(required = true)
var evalFile: File = null
@Input(required = true)
var compFile: File = null
@Output(required = true)
var outputFile: File = null
var moltenize = true
def summaryFiles = Map("output" -> outputFile)
def summaryStats = {
val report = new GATKReport(outputFile)
val compProportions = report.getTable("GenotypeConcordance_CompProportions")
val counts = report.getTable("GenotypeConcordance_Counts")
val evalProportions = report.getTable("GenotypeConcordance_EvalProportions")
val genotypeSummary = report.getTable("GenotypeConcordance_Summary")
val siteSummary = report.getTable("SiteConcordance_Summary")
val samples = for (i <- 0 until genotypeSummary.getNumRows) yield genotypeSummary.get(i, "Sample").toString
def getMap(table: GATKReportTable, column: String) = samples.distinct.map(sample => sample -> {
(for (i <- 0 until table.getNumRows if table.get(i, "Sample") == sample) yield s"${table.get(i, "Eval_Genotype")}__${table.get(i, "Comp_Genotype")}" -> table.get(i, column)).toMap
}).toMap
Map(
"compProportions" -> getMap(compProportions, "Proportion"),
"counts" -> getMap(counts, "Count"),
"evalProportions" -> getMap(evalProportions, "Proportion"),
"genotypeSummary" -> samples.distinct.map(sample => {
val i = samples.indexOf(sample)
sample -> Map(
"Non-Reference_Discrepancy" -> genotypeSummary.get(i, "Non-Reference_Discrepancy"),
"Non-Reference_Sensitivity" -> genotypeSummary.get(i, "Non-Reference_Sensitivity"),
"Overall_Genotype_Concordance" -> genotypeSummary.get(i, "Overall_Genotype_Concordance")
)
}).toMap,
"siteSummary" -> Map(
"ALLELES_MATCH" -> siteSummary.get(0, "ALLELES_MATCH"),
"EVAL_SUPERSET_TRUTH" -> siteSummary.get(0, "EVAL_SUPERSET_TRUTH"),
"EVAL_SUBSET_TRUTH" -> siteSummary.get(0, "EVAL_SUBSET_TRUTH"),
"ALLELES_DO_NOT_MATCH" -> siteSummary.get(0, "ALLELES_DO_NOT_MATCH"),
"EVAL_ONLY" -> siteSummary.get(0, "EVAL_ONLY"),
"TRUTH_ONLY" -> siteSummary.get(0, "TRUTH_ONLY")
)
)
}
override def beforeGraph(): Unit = {
super.beforeGraph()
deps :::= (evalFile :: compFile :: Nil).filter(_.getName.endsWith("vcf.gz")).map(x => new File(x.getAbsolutePath + ".tbi"))
deps = deps.distinct
}
override def cmdLine = super.cmdLine +
required("--eval", evalFile) +
required("--comp", compFile) +
required("-o", outputFile) +
conditional(moltenize, "--moltenize")
}
......@@ -20,10 +20,10 @@ import java.io.File
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.core.{ Reference, SampleLibraryTag }
import nl.lumc.sasc.biopet.extensions.bcftools.{ BcftoolsCall, BcftoolsMerge }
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
import nl.lumc.sasc.biopet.extensions.gatk.{ GenotypeConcordance, CombineVariants }
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsMpileup
import nl.lumc.sasc.biopet.extensions.tools.{ MpileupToVcf, VcfFilter, VcfStats }
import nl.lumc.sasc.biopet.extensions.{ Bgzip, Tabix }
import nl.lumc.sasc.biopet.extensions.{ Ln, Bgzip, Tabix }
import nl.lumc.sasc.biopet.utils.Logging
import org.broadinstitute.gatk.utils.commandline.Input
......@@ -38,6 +38,10 @@ trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag with
@Input(doc = "Bam files (should be deduped bams)", shortName = "BAM", required = true)
var inputBams: List[File] = Nil
var referenceVcf: Option[File] = config("reference_vcf")
var referenceVcfRegions: Option[File] = config("reference_vcf_regions")
/** Name prefix, can override this methods if neeeded */
def namePrefix: String = {
(sampleId, libId) match {
......@@ -85,6 +89,16 @@ trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag with
vcfStats.setOutputDir(new File(caller.outputDir, "vcfstats"))
add(vcfStats)
addSummarizable(vcfStats, namePrefix + "-vcfstats-" + caller.name)
referenceVcf.foreach(referenceVcfFile => {
val gc = new GenotypeConcordance(this)
gc.evalFile = caller.outputFile
gc.compFile = referenceVcfFile
gc.outputFile = new File(caller.outputDir, s"$namePrefix-genotype_concordance.${caller.name}.txt")
referenceVcfRegions.foreach(gc.intervals ::= _)
add(gc)
addSummarizable(gc, s"$namePrefix-genotype_concordance-${caller.name}")
})
}
add(cv)
......@@ -95,6 +109,16 @@ trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag with
add(vcfStats)
addSummarizable(vcfStats, namePrefix + "-vcfstats-final")
referenceVcf.foreach(referenceVcfFile => {
val gc = new GenotypeConcordance(this)
gc.evalFile = finalFile
gc.compFile = referenceVcfFile
gc.outputFile = new File(outputDir, s"$namePrefix-genotype_concordance.final.txt")
referenceVcfRegions.foreach(gc.intervals ::= _)
add(gc)
addSummarizable(gc, s"$namePrefix-genotype_concordance-final")
})
addSummaryJobs()
}
......@@ -200,11 +224,13 @@ trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag with
bt.output
}
val bcfmerge = new BcftoolsMerge(qscript)
bcfmerge.input = sampleVcfs
bcfmerge.output = outputFile
bcfmerge.O = Some("z")
add(bcfmerge)
if (sampleVcfs.size > 1) {
val bcfmerge = new BcftoolsMerge(qscript)
bcfmerge.input = sampleVcfs
bcfmerge.output = outputFile
bcfmerge.O = Some("z")
add(bcfmerge)
} else add(Ln.apply(qscript, sampleVcfs.head, outputFile))
add(Tabix(qscript, outputFile))
}
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment