Skip to content
Snippets Groups Projects
Commit 4258ac1b authored by Sander Bollen's avatar Sander Bollen
Browse files

Merge branch 'feature-genotype_concordance' into 'develop'

Added GenotypeConcordance

Fixes #227, #228 and #229 

Adding here GenotypeConcordance of gatk. This can calculate concordance on multisample vcf files.

See merge request !260
parents fe9f50f7 c40dda46
No related branches found
No related tags found
No related merge requests found
......@@ -5,13 +5,15 @@
*/
package nl.lumc.sasc.biopet.extensions.gatk.broad
import nl.lumc.sasc.biopet.core.{ Version, CommandLineResources, Reference, BiopetJavaCommandLineFunction }
import nl.lumc.sasc.biopet.core._
import org.broadinstitute.gatk.engine.phonehome.GATKRunReport
import org.broadinstitute.gatk.queue.extensions.gatk.CommandLineGATK
trait GatkGeneral extends CommandLineGATK with CommandLineResources with Reference with Version {
memoryLimit = Option(3)
var executable: String = config("java", default = "java", submodule = "java", freeVar = false)
override def subPath = "gatk" :: super.subPath
jarFile = config("gatk_jar")
......@@ -20,6 +22,7 @@ trait GatkGeneral extends CommandLineGATK with CommandLineResources with Referen
override def defaultCoreMemory = 4.0
override def faiRequired = true
override def dictRequired = true
if (config.contains("intervals")) intervals = config("intervals").asFileList
if (config.contains("exclude_intervals")) excludeIntervals = config("exclude_intervals").asFileList
......@@ -39,5 +42,8 @@ trait GatkGeneral extends CommandLineGATK with CommandLineResources with Referen
override def versionExitcode = List(0, 1)
def versionCommand = "java" + " -jar " + jarFile + " -version"
override def getVersion = super.getVersion.collect { case v => "Gatk " + v }
override def getVersion = {
BiopetCommandLineFunction.preProcessExecutable(executable).path.foreach(executable = _)
super.getVersion.collect { case v => "Gatk " + v }
}
}
......@@ -26,7 +26,6 @@ import org.ggf.drmaa.JobTemplate
import scala.collection.mutable
import scala.io.Source
import scala.sys.process.{ Process, ProcessLogger }
import scala.util.matching.Regex
import scala.collection.JavaConversions._
/** Biopet command line trait to auto check executable and cluster values */
......@@ -50,20 +49,18 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
val writer = new PrintWriter(file)
writer.println("set -eubf")
writer.println("set -o pipefail")
lines.foreach(writer.println(_))
lines.foreach(writer.println)
writer.close()
}
// This overrides the default "sh" from queue. For Biopet the default is "bash"
updateJobRun = {
case jt: JobTemplate => {
case jt: JobTemplate =>
changeScript(new File(jt.getArgs.head.toString))
jt.setRemoteCommand(remoteCommand)
}
case ps: ProcessSettings => {
case ps: ProcessSettings =>
changeScript(new File(ps.getCommand.tail.head))
ps.setCommand(Array(remoteCommand) ++ ps.getCommand.tail)
}
}
/**
......@@ -91,50 +88,19 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
}
/** can override this value is executable may not be converted to CanonicalPath */
/**
* Can override this value is executable may not be converted to CanonicalPath
* @deprecated
*/
val executableToCanonicalPath = true
/**
* Checks executable. Follow full CanonicalPath, checks if it is existing and do a md5sum on it to store in job report
*/
protected[core] def preProcessExecutable() {
if (!BiopetCommandLineFunction.executableMd5Cache.contains(executable)) {
if (executable != null) {
if (!BiopetCommandLineFunction.executableCache.contains(executable)) {
try {
val oldExecutable = executable
val buffer = new StringBuffer()
val cmd = Seq("which", executable)
val process = Process(cmd).run(ProcessLogger(buffer.append(_)))
if (process.exitValue == 0) {
executable = buffer.toString
val file = new File(executable)
if (executableToCanonicalPath) executable = file.getCanonicalPath
else executable = file.getAbsolutePath
} else Logging.addError("executable: '" + executable + "' not found, please check config")
BiopetCommandLineFunction.executableCache += oldExecutable -> executable
BiopetCommandLineFunction.executableCache += executable -> executable
} catch {
case ioe: java.io.IOException =>
logger.warn(s"Could not use 'which' on '$executable', check on executable skipped: " + ioe)
}
} else executable = BiopetCommandLineFunction.executableCache(executable)
if (!BiopetCommandLineFunction.executableMd5Cache.contains(executable)) {
if (new File(executable).exists()) {
val is = new FileInputStream(executable)
val cnt = is.available
val bytes = Array.ofDim[Byte](cnt)
is.read(bytes)
is.close()
val temp = MessageDigest.getInstance("MD5").digest(bytes).map("%02X".format(_)).mkString.toLowerCase
BiopetCommandLineFunction.executableMd5Cache += executable -> temp
} else BiopetCommandLineFunction.executableMd5Cache += executable -> "file_does_not_exist"
}
}
}
val md5 = BiopetCommandLineFunction.executableMd5Cache.get(executable)
addJobReportBinding("md5sum_exe", md5.getOrElse("None"))
val exe = BiopetCommandLineFunction.preProcessExecutable(executable)
exe.path.foreach(executable = _)
addJobReportBinding("md5sum_exe", exe.md5.getOrElse("N/A"))
}
/** executes checkExecutable method and fill job report */
......@@ -166,10 +132,9 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
that.beforeGraph()
that.internalBeforeGraph()
this match {
case p: BiopetPipe => {
case p: BiopetPipe =>
p.commands.last._outputAsStdout = true
new BiopetPipe(p.commands ::: that :: Nil)
}
case _ => new BiopetPipe(List(this, that))
}
}
......@@ -230,7 +195,48 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
}
/** stores global caches */
object BiopetCommandLineFunction {
object BiopetCommandLineFunction extends Logging {
private[core] val executableMd5Cache: mutable.Map[String, String] = mutable.Map()
private[core] val executableCache: mutable.Map[String, String] = mutable.Map()
case class Executable(path: Option[String], md5: Option[String])
def preProcessExecutable(executable: String): Executable = {
if (!BiopetCommandLineFunction.executableMd5Cache.contains(executable)) {
if (executable != null) {
if (!BiopetCommandLineFunction.executableCache.contains(executable)) {
try {
val buffer = new StringBuffer()
val cmd = Seq("which", executable)
val process = Process(cmd).run(ProcessLogger(buffer.append(_)))
if (process.exitValue == 0) {
val file = new File(buffer.toString)
BiopetCommandLineFunction.executableCache += executable -> file.getAbsolutePath
} else {
Logging.addError("executable: '" + executable + "' not found, please check config")
BiopetCommandLineFunction.executableCache += executable -> executable
}
} catch {
case ioe: java.io.IOException =>
logger.warn(s"Could not use 'which' on '$executable', check on executable skipped: " + ioe)
BiopetCommandLineFunction.executableCache += executable -> executable
}
}
if (!BiopetCommandLineFunction.executableMd5Cache.contains(executable)) {
val newExe = BiopetCommandLineFunction.executableCache(executable)
if (new File(newExe).exists()) {
val is = new FileInputStream(newExe)
val cnt = is.available
val bytes = Array.ofDim[Byte](cnt)
is.read(bytes)
is.close()
val temp = MessageDigest.getInstance("MD5").digest(bytes).map("%02X".format(_)).mkString.toLowerCase
BiopetCommandLineFunction.executableMd5Cache += newExe -> temp
} else BiopetCommandLineFunction.executableMd5Cache += newExe -> "file_does_not_exist"
}
}
}
Executable(BiopetCommandLineFunction.executableCache.get(executable),
BiopetCommandLineFunction.executableMd5Cache.get(executable))
}
}
......@@ -21,9 +21,9 @@ import org.broadinstitute.gatk.queue.function.JavaCommandLineFunction
trait BiopetJavaCommandLineFunction extends JavaCommandLineFunction with BiopetCommandLineFunction {
executable = config("java", default = "java", submodule = "java", freeVar = false)
javaGCThreads = config("java_gc_threads")
javaGCHeapFreeLimit = config("java_gc_heap_freelimit")
javaGCTimeLimit = config("java_gc_timelimit")
javaGCThreads = config("java_gc_threads", default = 4)
javaGCHeapFreeLimit = config("java_gc_heap_freelimit", default = 10)
javaGCTimeLimit = config("java_gc_timelimit", default = 50)
override def defaultVmemFactor: Double = 2.0
......@@ -38,8 +38,6 @@ trait BiopetJavaCommandLineFunction extends JavaCommandLineFunction with BiopetC
if (javaMainClass != null && javaClasspath.isEmpty)
javaClasspath = JavaCommandLineFunction.currentClasspath
//threads = getThreads(defaultThreads)
}
/** Creates command to execute extension */
......
......@@ -54,8 +54,6 @@ object Version extends Logging {
if (versionCache.contains(versionCommand)) return versionCache.get(versionCommand)
else if (versionCommand == null || versionRegex == null) return None
else {
val exe = new File(versionCommand.trim.split(" ")(0))
if (!exe.exists()) return None
val stdout = new StringBuffer()
val stderr = new StringBuffer()
def outputLog = "Version command: \n" + versionCommand +
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.gatk
import java.io.File
import nl.lumc.sasc.biopet.core.summary.Summarizable
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import org.broadinstitute.gatk.utils.report.{ GATKReportTable, GATKReport }
/**
* Extension for CombineVariants from GATK
*
* Created by pjvan_thof on 2/26/15.
*/
class GenotypeConcordance(val root: Configurable) extends Gatk with Summarizable {
val analysisType = "GenotypeConcordance"
@Input(required = true)
var evalFile: File = null
@Input(required = true)
var compFile: File = null
@Output(required = true)
var outputFile: File = null
var moltenize = true
def summaryFiles = Map("output" -> outputFile)
def summaryStats = {
val report = new GATKReport(outputFile)
val compProportions = report.getTable("GenotypeConcordance_CompProportions")
val counts = report.getTable("GenotypeConcordance_Counts")
val evalProportions = report.getTable("GenotypeConcordance_EvalProportions")
val genotypeSummary = report.getTable("GenotypeConcordance_Summary")
val siteSummary = report.getTable("SiteConcordance_Summary")
val samples = for (i <- 0 until genotypeSummary.getNumRows) yield genotypeSummary.get(i, "Sample").toString
def getMap(table: GATKReportTable, column: String) = samples.distinct.map(sample => sample -> {
(for (i <- 0 until table.getNumRows if table.get(i, "Sample") == sample) yield s"${table.get(i, "Eval_Genotype")}__${table.get(i, "Comp_Genotype")}" -> table.get(i, column)).toMap
}).toMap
Map(
"compProportions" -> getMap(compProportions, "Proportion"),
"counts" -> getMap(counts, "Count"),
"evalProportions" -> getMap(evalProportions, "Proportion"),
"genotypeSummary" -> samples.distinct.map(sample => {
val i = samples.indexOf(sample)
sample -> Map(
"Non-Reference_Discrepancy" -> genotypeSummary.get(i, "Non-Reference_Discrepancy"),
"Non-Reference_Sensitivity" -> genotypeSummary.get(i, "Non-Reference_Sensitivity"),
"Overall_Genotype_Concordance" -> genotypeSummary.get(i, "Overall_Genotype_Concordance")
)
}).toMap,
"siteSummary" -> Map(
"ALLELES_MATCH" -> siteSummary.get(0, "ALLELES_MATCH"),
"EVAL_SUPERSET_TRUTH" -> siteSummary.get(0, "EVAL_SUPERSET_TRUTH"),
"EVAL_SUBSET_TRUTH" -> siteSummary.get(0, "EVAL_SUBSET_TRUTH"),
"ALLELES_DO_NOT_MATCH" -> siteSummary.get(0, "ALLELES_DO_NOT_MATCH"),
"EVAL_ONLY" -> siteSummary.get(0, "EVAL_ONLY"),
"TRUTH_ONLY" -> siteSummary.get(0, "TRUTH_ONLY")
)
)
}
override def beforeGraph(): Unit = {
super.beforeGraph()
deps :::= (evalFile :: compFile :: Nil).filter(_.getName.endsWith("vcf.gz")).map(x => new File(x.getAbsolutePath + ".tbi"))
deps = deps.distinct
}
override def cmdLine = super.cmdLine +
required("--eval", evalFile) +
required("--comp", compFile) +
required("-o", outputFile) +
conditional(moltenize, "--moltenize")
}
......@@ -20,10 +20,10 @@ import java.io.File
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.core.{ Reference, SampleLibraryTag }
import nl.lumc.sasc.biopet.extensions.bcftools.{ BcftoolsCall, BcftoolsMerge }
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
import nl.lumc.sasc.biopet.extensions.gatk.{ GenotypeConcordance, CombineVariants }
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsMpileup
import nl.lumc.sasc.biopet.extensions.tools.{ MpileupToVcf, VcfFilter, VcfStats }
import nl.lumc.sasc.biopet.extensions.{ Bgzip, Tabix }
import nl.lumc.sasc.biopet.extensions.{ Ln, Bgzip, Tabix }
import nl.lumc.sasc.biopet.utils.Logging
import org.broadinstitute.gatk.utils.commandline.Input
......@@ -38,6 +38,10 @@ trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag with
@Input(doc = "Bam files (should be deduped bams)", shortName = "BAM", required = true)
var inputBams: List[File] = Nil
var referenceVcf: Option[File] = config("reference_vcf")
var referenceVcfRegions: Option[File] = config("reference_vcf_regions")
/** Name prefix, can override this methods if neeeded */
def namePrefix: String = {
(sampleId, libId) match {
......@@ -85,6 +89,16 @@ trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag with
vcfStats.setOutputDir(new File(caller.outputDir, "vcfstats"))
add(vcfStats)
addSummarizable(vcfStats, namePrefix + "-vcfstats-" + caller.name)
referenceVcf.foreach(referenceVcfFile => {
val gc = new GenotypeConcordance(this)
gc.evalFile = caller.outputFile
gc.compFile = referenceVcfFile
gc.outputFile = new File(caller.outputDir, s"$namePrefix-genotype_concordance.${caller.name}.txt")
referenceVcfRegions.foreach(gc.intervals ::= _)
add(gc)
addSummarizable(gc, s"$namePrefix-genotype_concordance-${caller.name}")
})
}
add(cv)
......@@ -95,6 +109,16 @@ trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag with
add(vcfStats)
addSummarizable(vcfStats, namePrefix + "-vcfstats-final")
referenceVcf.foreach(referenceVcfFile => {
val gc = new GenotypeConcordance(this)
gc.evalFile = finalFile
gc.compFile = referenceVcfFile
gc.outputFile = new File(outputDir, s"$namePrefix-genotype_concordance.final.txt")
referenceVcfRegions.foreach(gc.intervals ::= _)
add(gc)
addSummarizable(gc, s"$namePrefix-genotype_concordance-final")
})
addSummaryJobs()
}
......@@ -200,11 +224,13 @@ trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag with
bt.output
}
val bcfmerge = new BcftoolsMerge(qscript)
bcfmerge.input = sampleVcfs
bcfmerge.output = outputFile
bcfmerge.O = Some("z")
add(bcfmerge)
if (sampleVcfs.size > 1) {
val bcfmerge = new BcftoolsMerge(qscript)
bcfmerge.input = sampleVcfs
bcfmerge.output = outputFile
bcfmerge.O = Some("z")
add(bcfmerge)
} else add(Ln.apply(qscript, sampleVcfs.head, outputFile))
add(Tabix(qscript, outputFile))
}
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment