Commit 88b4b2a4 authored by Peter van 't Hof's avatar Peter van 't Hof

Added protected artifact for gatk pipelines

parent 6441dfd0
/target/
\ No newline at end of file
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetGatkPipelines</artifactId>
<version>0.2.0-DEV</version>
<packaging>jar</packaging>
<inceptionYear>2014</inceptionYear>
<name>BiopetGatkPipelines</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<sting.unpack.phase>prepare-package</sting.unpack.phase>
<sting.shade.phase>package</sting.shade.phase>
<app.main.class>nl.lumc.sasc.biopet.core.BiopetExecutable</app.main.class>
</properties>
<dependencies>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetFramework</artifactId>
<version>0.2.0-DEV</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetGatkExtensions</artifactId>
<version>0.2.0-DEV</version>
</dependency>
</dependencies>
<build>
<resources>
<resource>
<directory>src/main/resources</directory>
<includes>
<include>**/*</include>
</includes>
</resource>
</resources>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<id>scala-compile</id>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
<arg>-deprecation</arg>
<arg>-feature</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>2.5</version>
<configuration>
<archive>
<manifest>
<addDefaultImplementationEntries>true</addDefaultImplementationEntries>
<addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
</manifest>
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<showDeprecation>true</showDeprecation>
</configuration>
</plugin>
</plugins>
</build>
</project>
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
import scala.util.Random
class GatkBenchmarkGenotyping(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "Sample gvcf file")
var sampleGvcf: File = _
@Argument(doc = "SampleName", required = true)
var sampleName: String = _
@Input(doc = "Gvcf files", shortName = "I", required = false)
var gvcfFiles: List[File] = Nil
var reference: File = config("reference")
@Argument(doc = "Dbsnp", shortName = "dbsnp", required = false)
var dbsnp: File = config("dbsnp")
def init() {
if (config.contains("gvcffiles")) for (file <- config("gvcffiles").asList) {
gvcfFiles ::= file.toString
}
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
}
def biopetScript() {
var todoGvcfs = gvcfFiles
var gvcfPool: List[File] = Nil
addGenotypingPipeline(gvcfPool)
while (todoGvcfs.size > 0) {
val index = Random.nextInt(todoGvcfs.size)
gvcfPool ::= todoGvcfs(index)
addGenotypingPipeline(gvcfPool)
todoGvcfs = todoGvcfs.filter(b => b != todoGvcfs(index))
}
}
def addGenotypingPipeline(gvcfPool: List[File]) {
val gatkGenotyping = new GatkGenotyping(this)
gatkGenotyping.inputGvcfs = sampleGvcf :: gvcfPool
gatkGenotyping.samples :+= sampleName
gatkGenotyping.outputDir = outputDir + "samples_" + gvcfPool.size + "/"
gatkGenotyping.init
gatkGenotyping.biopetScript
addAll(gatkGenotyping.functions)
}
}
object GatkBenchmarkGenotyping extends PipelineCommand
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.gatk.{ GenotypeGVCFs, SelectVariants }
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument }
class GatkGenotyping(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "Gvcf files", shortName = "I")
var inputGvcfs: List[File] = Nil
@Argument(doc = "Reference", shortName = "R", required = false)
var reference: File = config("reference")
@Argument(doc = "Dbsnp", shortName = "dbsnp", required = false)
var dbsnp: File = config("dbsnp")
@Argument(doc = "OutputName", required = false)
var outputName: String = "genotype"
@Output(doc = "OutputFile", shortName = "O", required = false)
var outputFile: File = _
@Argument(doc = "Samples", shortName = "sample", required = false)
var samples: List[String] = Nil
def init() {
if (outputFile == null) outputFile = outputDir + outputName + ".vcf.gz"
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
}
def biopetScript() {
addGenotypeGVCFs(inputGvcfs, outputFile)
if (!samples.isEmpty) {
if (samples.size > 1) addSelectVariants(outputFile, samples, outputDir + "samples/", "all")
for (sample <- samples) addSelectVariants(outputFile, List(sample), outputDir + "samples/", sample)
}
}
def addGenotypeGVCFs(gvcfFiles: List[File], outputFile: File): File = {
val genotypeGVCFs = GenotypeGVCFs(this, gvcfFiles, outputFile)
add(genotypeGVCFs)
return genotypeGVCFs.out
}
def addSelectVariants(inputFile: File, samples: List[String], outputDir: String, name: String) {
val selectVariants = SelectVariants(this, inputFile, outputDir + name + ".vcf.gz")
selectVariants.excludeNonVariants = true
for (sample <- samples) selectVariants.sample_name :+= sample
add(selectVariants)
}
}
object GatkGenotyping extends PipelineCommand
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.BiopetQScript
import nl.lumc.sasc.biopet.core.PipelineCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.gatk.ApplyRecalibration
import nl.lumc.sasc.biopet.extensions.gatk.VariantAnnotator
import nl.lumc.sasc.biopet.extensions.gatk.VariantRecalibrator
import org.broadinstitute.gatk.queue.QScript
class GatkVariantRecalibration(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "input vcf file", shortName = "I")
var inputVcf: File = _
@Input(doc = "input vcf file", shortName = "BAM", required = false)
var bamFiles: List[File] = Nil
@Output(doc = "output vcf file", shortName = "out")
var outputVcf: File = _
def init() {
if (inputVcf == null) throw new IllegalStateException("Missing Output directory on gatk module")
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
}
def biopetScript() {
var vcfFile: File = if (!bamFiles.isEmpty) addVariantAnnotator(inputVcf, bamFiles, outputDir) else inputVcf
vcfFile = addSnpVariantRecalibrator(vcfFile, outputDir)
vcfFile = addIndelVariantRecalibrator(vcfFile, outputDir)
}
def addSnpVariantRecalibrator(inputVcf: File, dir: String): File = {
val snpRecal = VariantRecalibrator(this, inputVcf, swapExt(dir, inputVcf, ".vcf", ".indel.recal"),
swapExt(dir, inputVcf, ".vcf", ".indel.tranches"), indel = false)
if (!snpRecal.resource.isEmpty) {
add(snpRecal)
val snpApply = ApplyRecalibration(this, inputVcf, swapExt(dir, inputVcf, ".vcf", ".indel.recal.vcf"),
snpRecal.recal_file, snpRecal.tranches_file, indel = false)
add(snpApply)
return snpApply.out
} else {
logger.warn("Skipped snp Recalibration, resource is missing")
return inputVcf
}
}
def addIndelVariantRecalibrator(inputVcf: File, dir: String): File = {
val indelRecal = VariantRecalibrator(this, inputVcf, swapExt(dir, inputVcf, ".vcf", ".indel.recal"),
swapExt(dir, inputVcf, ".vcf", ".indel.tranches"), indel = true)
if (!indelRecal.resource.isEmpty) {
add(indelRecal)
val indelApply = ApplyRecalibration(this, inputVcf, swapExt(dir, inputVcf, ".vcf", ".indel.recal.vcf"),
indelRecal.recal_file, indelRecal.tranches_file, indel = true)
add(indelApply)
return indelApply.out
} else {
logger.warn("Skipped indel Recalibration, resource is missing")
return inputVcf
}
}
def addVariantAnnotator(inputvcf: File, bamfiles: List[File], dir: String): File = {
val variantAnnotator = VariantAnnotator(this, inputvcf, bamfiles, swapExt(dir, inputvcf, ".vcf", ".anotated.vcf"))
add(variantAnnotator)
return variantAnnotator.out
}
}
object GatkVariantRecalibration extends PipelineCommand
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import java.io.File
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
import nl.lumc.sasc.biopet.extensions.gatk.SelectVariants
import nl.lumc.sasc.biopet.extensions.gatk.VariantEval
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
class GatkVcfSampleCompare(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "Sample vcf file(s)", shortName = "V")
var vcfFiles: List[File] = _
@Argument(doc = "Reference", shortName = "R", required = false)
var reference: File = config("reference")
@Argument(doc = "Target bed", shortName = "targetBed", required = false)
var targetBed: List[File] = Nil
@Argument(doc = "Samples", shortName = "sample", required = false)
var samples: List[String] = Nil
var vcfFile: File = _
var sampleVcfs: Map[String, File] = Map()
def generalSampleDir = outputDir + "samples/"
def init() {
if (config.contains("target_bed"))
for (bed <- config("target_bed").asList)
targetBed :+= bed.toString
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
}
def biopetScript() {
vcfFile = if (vcfFiles.size > 1) {
val combineVariants = CombineVariants(this, vcfFiles, outputDir + "merge.vcf")
add(combineVariants)
combineVariants.out
} else vcfFiles.head
for (sample <- samples) {
sampleVcfs += (sample -> new File(generalSampleDir + sample + File.separator + sample + ".vcf"))
val selectVariants = SelectVariants(this, vcfFile, sampleVcfs(sample))
selectVariants.sample_name = Seq(sample)
selectVariants.excludeNonVariants = true
add(selectVariants)
}
val sampleCompareMetrics = new SampleCompareMetrics(this)
sampleCompareMetrics.samples = samples
sampleCompareMetrics.sampleDir = generalSampleDir
sampleCompareMetrics.snpRelFile = outputDir + "compare.snp.rel.tsv"
sampleCompareMetrics.snpAbsFile = outputDir + "compare.snp.abs.tsv"
sampleCompareMetrics.indelRelFile = outputDir + "compare.indel.rel.tsv"
sampleCompareMetrics.indelAbsFile = outputDir + "compare.indel.abs.tsv"
sampleCompareMetrics.totalFile = outputDir + "total.tsv"
for ((sample, sampleVcf) <- sampleVcfs) {
val sampleDir = generalSampleDir + sample + File.separator
for ((compareSample, compareSampleVcf) <- sampleVcfs) {
val variantEval = VariantEval(this,
sampleVcf,
compareSampleVcf,
new File(sampleDir + sample + "-" + compareSample + ".eval.txt"),
Seq("VariantType", "CompRod"),
Seq("CompOverlap")
)
if (targetBed != null) variantEval.L = targetBed
add(variantEval)
sampleCompareMetrics.deps ::= variantEval.out
}
}
add(sampleCompareMetrics)
}
}
object GatkVcfSampleCompare extends PipelineCommand
package nl.lumc.sasc.biopet.pipelines.gatk
import java.io.File
import java.io.PrintWriter
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.R.RScriptExecutor
import org.broadinstitute.gatk.utils.commandline.{ Output, Argument }
import scala.io.Source
import org.broadinstitute.gatk.utils.R.{ RScriptLibrary, RScriptExecutor }
import org.broadinstitute.gatk.utils.io.Resource
import scala.collection.mutable.Map
import scala.math._
class SampleCompareMetrics(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Argument(doc = "Sample Dir", shortName = "sampleDir", required = true)
var sampleDir: String = _
@Argument(doc = "Samples", shortName = "sample", required = true)
var samples: List[String] = Nil
@Argument(doc = "File sufix", shortName = "sufix", required = false)
var fileSufix: String = _
@Output(doc = "snpRelFile", shortName = "snpRelFile", required = true)
var snpRelFile: File = _
@Output(doc = "snpAbsFile", shortName = "snpAbsFile", required = true)
var snpAbsFile: File = _
@Output(doc = "indelRelFile", shortName = "indelRelFile", required = true)
var indelRelFile: File = _
@Output(doc = "indelAbsFile", shortName = "indelAbsFile", required = true)
var indelAbsFile: File = _
@Output(doc = "totalFile", shortName = "totalFile", required = true)
var totalFile: File = _
override val defaultVmem = "8G"
memoryLimit = Option(4.0)
override def commandLine = super.commandLine +
required("-sampleDir", sampleDir) +
repeat("-sample", samples) +
optional("-fileSufix", fileSufix) +
required("-snpRelFile", snpRelFile) +
required("-snpAbsFile", snpAbsFile) +
required("-indelRelFile", indelRelFile) +
required("-indelAbsFile", indelAbsFile) +
required("-totalFile", totalFile)
}
object SampleCompareMetrics {
var sampleDir: String = _
var samples: List[String] = Nil
var fileSufix: String = ".eval.txt"
var snpRelFile: File = _
var snpAbsFile: File = _
var indelRelFile: File = _
var indelAbsFile: File = _
var totalFile: File = _
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
for (t <- 0 until args.size) {
args(t) match {
case "-sample" => samples +:= args(t + 1)
case "-sampleDir" => sampleDir = args(t + 1)
case "-fileSufix" => fileSufix = args(t + 1)
case "-snpRelFile" => snpRelFile = new File(args(t + 1))
case "-snpAbsFile" => snpAbsFile = new File(args(t + 1))
case "-indelRelFile" => indelRelFile = new File(args(t + 1))
case "-indelAbsFile" => indelAbsFile = new File(args(t + 1))
case "-totalFile" => totalFile = new File(args(t + 1))
case _ =>
}
}
if (sampleDir == null) throw new IllegalStateException("No sampleDir, use -sampleDir")
else if (!sampleDir.endsWith("/")) sampleDir += "/"
val regex = """\W+""".r
val snpsOverlap: Map[(String, String), Int] = Map()
val indelsOverlap: Map[(String, String), Int] = Map()
val snpsTotal: Map[String, Int] = Map()
val indelsTotal: Map[String, Int] = Map()
for (sample1 <- samples; sample2 <- samples) {
val reader = Source.fromFile(new File(sampleDir + sample1 + "/" + sample1 + "-" + sample2 + fileSufix))
for (line <- reader.getLines) {
regex.split(line) match {
case Array(_, _, _, varType, all, novel, overlap, rate, _*) => {
varType match {
case "SNP" => {
snpsOverlap += (sample1, sample2) -> overlap.toInt
snpsTotal += sample1 -> all.toInt
}
case "INDEL" => {
indelsOverlap += (sample1, sample2) -> overlap.toInt
indelsTotal += sample1 -> all.toInt
}
case _ =>
}
}
case _ =>
}
}
reader.close()
}
val snpRelWritter = new PrintWriter(snpRelFile)
val snpAbsWritter = new PrintWriter(snpAbsFile)
val indelRelWritter = new PrintWriter(indelRelFile)
val indelAbsWritter = new PrintWriter(indelAbsFile)
val allWritters = List(snpRelWritter, snpAbsWritter, indelRelWritter, indelAbsWritter)
for (writter <- allWritters) writter.println(samples.mkString("\t", "\t", ""))
for (sample1 <- samples) {
for (writter <- allWritters) writter.print(sample1)
for (sample2 <- samples) {
snpRelWritter.print("\t" + (round((snpsOverlap(sample1, sample2).toDouble / snpsTotal(sample1) * 10000.0)) / 10000.0))
snpAbsWritter.print("\t" + snpsOverlap(sample1, sample2))
indelRelWritter.print("\t" + (round((indelsOverlap(sample1, sample2).toDouble / indelsTotal(sample1) * 10000.0)) / 10000.0))
indelAbsWritter.print("\t" + indelsOverlap(sample1, sample2))
}
for (writter <- allWritters) writter.println()
}
for (writter <- allWritters) writter.close()
val totalWritter = new PrintWriter(totalFile)
totalWritter.println("Sample\tSNPs\tIndels")
for (sample <- samples)
totalWritter.println(sample + "\t" + snpsTotal(sample) + "\t" + indelsTotal(sample))
totalWritter.close()
def plot(file: File) {
val executor = new RScriptExecutor
executor.addScript(new Resource("plotHeatmap.R", getClass))
executor.addArgs(file, file.getAbsolutePath.stripSuffix(".tsv") + ".png", file.getAbsolutePath.stripSuffix(".tsv") + ".clustering.png")
executor.exec()
}
plot(snpRelFile)
plot(indelRelFile)
}
}
\ No newline at end of file
......@@ -10,6 +10,7 @@
<packaging>pom</packaging>
<modules>
<module>biopet-gatk-extensions</module>
<module>biopet-gatk-pipelines</module>
</modules>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment