diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkGeneral.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkGeneral.scala index 1c557e5223634fdc14d992313e006bca6b141c90..11b59f2fa52bf56e455475e2e719a746f6ff41ed 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkGeneral.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkGeneral.scala @@ -5,13 +5,15 @@ */ package nl.lumc.sasc.biopet.extensions.gatk.broad -import nl.lumc.sasc.biopet.core.{ Version, CommandLineResources, Reference, BiopetJavaCommandLineFunction } +import nl.lumc.sasc.biopet.core._ import org.broadinstitute.gatk.engine.phonehome.GATKRunReport import org.broadinstitute.gatk.queue.extensions.gatk.CommandLineGATK trait GatkGeneral extends CommandLineGATK with CommandLineResources with Reference with Version { memoryLimit = Option(3) + var executable: String = config("java", default = "java", submodule = "java", freeVar = false) + override def subPath = "gatk" :: super.subPath jarFile = config("gatk_jar") @@ -20,6 +22,7 @@ trait GatkGeneral extends CommandLineGATK with CommandLineResources with Referen override def defaultCoreMemory = 4.0 override def faiRequired = true + override def dictRequired = true if (config.contains("intervals")) intervals = config("intervals").asFileList if (config.contains("exclude_intervals")) excludeIntervals = config("exclude_intervals").asFileList @@ -39,5 +42,8 @@ trait GatkGeneral extends CommandLineGATK with CommandLineResources with Referen override def versionExitcode = List(0, 1) def versionCommand = "java" + " -jar " + jarFile + " -version" - override def getVersion = super.getVersion.collect { case v => "Gatk " + v } + override def getVersion = { + BiopetCommandLineFunction.preProcessExecutable(executable).path.foreach(executable = _) + super.getVersion.collect { case v => "Gatk " + v } + } } diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunction.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunction.scala index 45dbe227c72cd10243b38ea9865ed01f4ad493bc..c0046d0535ccb5c429b2fa8b5707868c74f71f41 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunction.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunction.scala @@ -26,7 +26,6 @@ import org.ggf.drmaa.JobTemplate import scala.collection.mutable import scala.io.Source import scala.sys.process.{ Process, ProcessLogger } -import scala.util.matching.Regex import scala.collection.JavaConversions._ /** Biopet command line trait to auto check executable and cluster values */ @@ -50,20 +49,18 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction => val writer = new PrintWriter(file) writer.println("set -eubf") writer.println("set -o pipefail") - lines.foreach(writer.println(_)) + lines.foreach(writer.println) writer.close() } // This overrides the default "sh" from queue. For Biopet the default is "bash" updateJobRun = { - case jt: JobTemplate => { + case jt: JobTemplate => changeScript(new File(jt.getArgs.head.toString)) jt.setRemoteCommand(remoteCommand) - } - case ps: ProcessSettings => { + case ps: ProcessSettings => changeScript(new File(ps.getCommand.tail.head)) ps.setCommand(Array(remoteCommand) ++ ps.getCommand.tail) - } } /** @@ -91,50 +88,19 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction => } - /** can override this value is executable may not be converted to CanonicalPath */ + /** + * Can override this value is executable may not be converted to CanonicalPath + * @deprecated + */ val executableToCanonicalPath = true /** * Checks executable. Follow full CanonicalPath, checks if it is existing and do a md5sum on it to store in job report */ protected[core] def preProcessExecutable() { - if (!BiopetCommandLineFunction.executableMd5Cache.contains(executable)) { - if (executable != null) { - if (!BiopetCommandLineFunction.executableCache.contains(executable)) { - try { - val oldExecutable = executable - val buffer = new StringBuffer() - val cmd = Seq("which", executable) - val process = Process(cmd).run(ProcessLogger(buffer.append(_))) - if (process.exitValue == 0) { - executable = buffer.toString - val file = new File(executable) - if (executableToCanonicalPath) executable = file.getCanonicalPath - else executable = file.getAbsolutePath - } else Logging.addError("executable: '" + executable + "' not found, please check config") - BiopetCommandLineFunction.executableCache += oldExecutable -> executable - BiopetCommandLineFunction.executableCache += executable -> executable - } catch { - case ioe: java.io.IOException => - logger.warn(s"Could not use 'which' on '$executable', check on executable skipped: " + ioe) - } - } else executable = BiopetCommandLineFunction.executableCache(executable) - - if (!BiopetCommandLineFunction.executableMd5Cache.contains(executable)) { - if (new File(executable).exists()) { - val is = new FileInputStream(executable) - val cnt = is.available - val bytes = Array.ofDim[Byte](cnt) - is.read(bytes) - is.close() - val temp = MessageDigest.getInstance("MD5").digest(bytes).map("%02X".format(_)).mkString.toLowerCase - BiopetCommandLineFunction.executableMd5Cache += executable -> temp - } else BiopetCommandLineFunction.executableMd5Cache += executable -> "file_does_not_exist" - } - } - } - val md5 = BiopetCommandLineFunction.executableMd5Cache.get(executable) - addJobReportBinding("md5sum_exe", md5.getOrElse("None")) + val exe = BiopetCommandLineFunction.preProcessExecutable(executable) + exe.path.foreach(executable = _) + addJobReportBinding("md5sum_exe", exe.md5.getOrElse("N/A")) } /** executes checkExecutable method and fill job report */ @@ -166,10 +132,9 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction => that.beforeGraph() that.internalBeforeGraph() this match { - case p: BiopetPipe => { + case p: BiopetPipe => p.commands.last._outputAsStdout = true new BiopetPipe(p.commands ::: that :: Nil) - } case _ => new BiopetPipe(List(this, that)) } } @@ -230,7 +195,48 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction => } /** stores global caches */ -object BiopetCommandLineFunction { +object BiopetCommandLineFunction extends Logging { private[core] val executableMd5Cache: mutable.Map[String, String] = mutable.Map() private[core] val executableCache: mutable.Map[String, String] = mutable.Map() + + case class Executable(path: Option[String], md5: Option[String]) + def preProcessExecutable(executable: String): Executable = { + if (!BiopetCommandLineFunction.executableMd5Cache.contains(executable)) { + if (executable != null) { + if (!BiopetCommandLineFunction.executableCache.contains(executable)) { + try { + val buffer = new StringBuffer() + val cmd = Seq("which", executable) + val process = Process(cmd).run(ProcessLogger(buffer.append(_))) + if (process.exitValue == 0) { + val file = new File(buffer.toString) + BiopetCommandLineFunction.executableCache += executable -> file.getAbsolutePath + } else { + Logging.addError("executable: '" + executable + "' not found, please check config") + BiopetCommandLineFunction.executableCache += executable -> executable + } + } catch { + case ioe: java.io.IOException => + logger.warn(s"Could not use 'which' on '$executable', check on executable skipped: " + ioe) + BiopetCommandLineFunction.executableCache += executable -> executable + } + } + + if (!BiopetCommandLineFunction.executableMd5Cache.contains(executable)) { + val newExe = BiopetCommandLineFunction.executableCache(executable) + if (new File(newExe).exists()) { + val is = new FileInputStream(newExe) + val cnt = is.available + val bytes = Array.ofDim[Byte](cnt) + is.read(bytes) + is.close() + val temp = MessageDigest.getInstance("MD5").digest(bytes).map("%02X".format(_)).mkString.toLowerCase + BiopetCommandLineFunction.executableMd5Cache += newExe -> temp + } else BiopetCommandLineFunction.executableMd5Cache += newExe -> "file_does_not_exist" + } + } + } + Executable(BiopetCommandLineFunction.executableCache.get(executable), + BiopetCommandLineFunction.executableMd5Cache.get(executable)) + } } diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetJavaCommandLineFunction.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetJavaCommandLineFunction.scala index 86e97f4c7dff893869eb8d8c7eec94c4389b9146..5722871028d92aeddd2723c22971a2e08e51107a 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetJavaCommandLineFunction.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetJavaCommandLineFunction.scala @@ -21,9 +21,9 @@ import org.broadinstitute.gatk.queue.function.JavaCommandLineFunction trait BiopetJavaCommandLineFunction extends JavaCommandLineFunction with BiopetCommandLineFunction { executable = config("java", default = "java", submodule = "java", freeVar = false) - javaGCThreads = config("java_gc_threads") - javaGCHeapFreeLimit = config("java_gc_heap_freelimit") - javaGCTimeLimit = config("java_gc_timelimit") + javaGCThreads = config("java_gc_threads", default = 4) + javaGCHeapFreeLimit = config("java_gc_heap_freelimit", default = 10) + javaGCTimeLimit = config("java_gc_timelimit", default = 50) override def defaultVmemFactor: Double = 2.0 @@ -38,8 +38,6 @@ trait BiopetJavaCommandLineFunction extends JavaCommandLineFunction with BiopetC if (javaMainClass != null && javaClasspath.isEmpty) javaClasspath = JavaCommandLineFunction.currentClasspath - - //threads = getThreads(defaultThreads) } /** Creates command to execute extension */ diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/Version.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/Version.scala index d283729b2c5a853461d7d75a56eb8d5cb8125c0f..91d813acb3a2c633d65a7c6d8f403c05d2ea2332 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/Version.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/Version.scala @@ -54,8 +54,6 @@ object Version extends Logging { if (versionCache.contains(versionCommand)) return versionCache.get(versionCommand) else if (versionCommand == null || versionRegex == null) return None else { - val exe = new File(versionCommand.trim.split(" ")(0)) - if (!exe.exists()) return None val stdout = new StringBuffer() val stderr = new StringBuffer() def outputLog = "Version command: \n" + versionCommand + diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GenotypeConcordance.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GenotypeConcordance.scala new file mode 100644 index 0000000000000000000000000000000000000000..519cbfad6a8db14cf4812fb0005dc7e0e0e6aa65 --- /dev/null +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GenotypeConcordance.scala @@ -0,0 +1,94 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.gatk + +import java.io.File + +import nl.lumc.sasc.biopet.core.summary.Summarizable +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } +import org.broadinstitute.gatk.utils.report.{ GATKReportTable, GATKReport } + +/** + * Extension for CombineVariants from GATK + * + * Created by pjvan_thof on 2/26/15. + */ +class GenotypeConcordance(val root: Configurable) extends Gatk with Summarizable { + val analysisType = "GenotypeConcordance" + + @Input(required = true) + var evalFile: File = null + + @Input(required = true) + var compFile: File = null + + @Output(required = true) + var outputFile: File = null + + var moltenize = true + + def summaryFiles = Map("output" -> outputFile) + + def summaryStats = { + val report = new GATKReport(outputFile) + val compProportions = report.getTable("GenotypeConcordance_CompProportions") + val counts = report.getTable("GenotypeConcordance_Counts") + val evalProportions = report.getTable("GenotypeConcordance_EvalProportions") + val genotypeSummary = report.getTable("GenotypeConcordance_Summary") + val siteSummary = report.getTable("SiteConcordance_Summary") + + val samples = for (i <- 0 until genotypeSummary.getNumRows) yield genotypeSummary.get(i, "Sample").toString + + def getMap(table: GATKReportTable, column: String) = samples.distinct.map(sample => sample -> { + (for (i <- 0 until table.getNumRows if table.get(i, "Sample") == sample) yield s"${table.get(i, "Eval_Genotype")}__${table.get(i, "Comp_Genotype")}" -> table.get(i, column)).toMap + }).toMap + + Map( + "compProportions" -> getMap(compProportions, "Proportion"), + "counts" -> getMap(counts, "Count"), + "evalProportions" -> getMap(evalProportions, "Proportion"), + "genotypeSummary" -> samples.distinct.map(sample => { + val i = samples.indexOf(sample) + sample -> Map( + "Non-Reference_Discrepancy" -> genotypeSummary.get(i, "Non-Reference_Discrepancy"), + "Non-Reference_Sensitivity" -> genotypeSummary.get(i, "Non-Reference_Sensitivity"), + "Overall_Genotype_Concordance" -> genotypeSummary.get(i, "Overall_Genotype_Concordance") + ) + }).toMap, + "siteSummary" -> Map( + "ALLELES_MATCH" -> siteSummary.get(0, "ALLELES_MATCH"), + "EVAL_SUPERSET_TRUTH" -> siteSummary.get(0, "EVAL_SUPERSET_TRUTH"), + "EVAL_SUBSET_TRUTH" -> siteSummary.get(0, "EVAL_SUBSET_TRUTH"), + "ALLELES_DO_NOT_MATCH" -> siteSummary.get(0, "ALLELES_DO_NOT_MATCH"), + "EVAL_ONLY" -> siteSummary.get(0, "EVAL_ONLY"), + "TRUTH_ONLY" -> siteSummary.get(0, "TRUTH_ONLY") + ) + ) + } + + override def beforeGraph(): Unit = { + super.beforeGraph() + deps :::= (evalFile :: compFile :: Nil).filter(_.getName.endsWith("vcf.gz")).map(x => new File(x.getAbsolutePath + ".tbi")) + deps = deps.distinct + } + + override def cmdLine = super.cmdLine + + required("--eval", evalFile) + + required("--comp", compFile) + + required("-o", outputFile) + + conditional(moltenize, "--moltenize") +} diff --git a/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTrait.scala b/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTrait.scala index 428ced6f8e3d0a1640572efb3491d74e2e5c440c..6eb2832c17acea6bcdc0d6766d48d3100dc73472 100644 --- a/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTrait.scala +++ b/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTrait.scala @@ -20,10 +20,10 @@ import java.io.File import nl.lumc.sasc.biopet.core.summary.SummaryQScript import nl.lumc.sasc.biopet.core.{ Reference, SampleLibraryTag } import nl.lumc.sasc.biopet.extensions.bcftools.{ BcftoolsCall, BcftoolsMerge } -import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants +import nl.lumc.sasc.biopet.extensions.gatk.{ GenotypeConcordance, CombineVariants } import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsMpileup import nl.lumc.sasc.biopet.extensions.tools.{ MpileupToVcf, VcfFilter, VcfStats } -import nl.lumc.sasc.biopet.extensions.{ Bgzip, Tabix } +import nl.lumc.sasc.biopet.extensions.{ Ln, Bgzip, Tabix } import nl.lumc.sasc.biopet.utils.Logging import org.broadinstitute.gatk.utils.commandline.Input @@ -38,6 +38,10 @@ trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag with @Input(doc = "Bam files (should be deduped bams)", shortName = "BAM", required = true) var inputBams: List[File] = Nil + var referenceVcf: Option[File] = config("reference_vcf") + + var referenceVcfRegions: Option[File] = config("reference_vcf_regions") + /** Name prefix, can override this methods if neeeded */ def namePrefix: String = { (sampleId, libId) match { @@ -85,6 +89,16 @@ trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag with vcfStats.setOutputDir(new File(caller.outputDir, "vcfstats")) add(vcfStats) addSummarizable(vcfStats, namePrefix + "-vcfstats-" + caller.name) + + referenceVcf.foreach(referenceVcfFile => { + val gc = new GenotypeConcordance(this) + gc.evalFile = caller.outputFile + gc.compFile = referenceVcfFile + gc.outputFile = new File(caller.outputDir, s"$namePrefix-genotype_concordance.${caller.name}.txt") + referenceVcfRegions.foreach(gc.intervals ::= _) + add(gc) + addSummarizable(gc, s"$namePrefix-genotype_concordance-${caller.name}") + }) } add(cv) @@ -95,6 +109,16 @@ trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag with add(vcfStats) addSummarizable(vcfStats, namePrefix + "-vcfstats-final") + referenceVcf.foreach(referenceVcfFile => { + val gc = new GenotypeConcordance(this) + gc.evalFile = finalFile + gc.compFile = referenceVcfFile + gc.outputFile = new File(outputDir, s"$namePrefix-genotype_concordance.final.txt") + referenceVcfRegions.foreach(gc.intervals ::= _) + add(gc) + addSummarizable(gc, s"$namePrefix-genotype_concordance-final") + }) + addSummaryJobs() } @@ -200,11 +224,13 @@ trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag with bt.output } - val bcfmerge = new BcftoolsMerge(qscript) - bcfmerge.input = sampleVcfs - bcfmerge.output = outputFile - bcfmerge.O = Some("z") - add(bcfmerge) + if (sampleVcfs.size > 1) { + val bcfmerge = new BcftoolsMerge(qscript) + bcfmerge.input = sampleVcfs + bcfmerge.output = outputFile + bcfmerge.O = Some("z") + add(bcfmerge) + } else add(Ln.apply(qscript, sampleVcfs.head, outputFile)) add(Tabix(qscript, outputFile)) } }