Commit e11ff161 authored by bow's avatar bow

Merge branch 'feature-index_generation' into 'develop'

Reference index generation



See merge request !396
parents 11c11c41 ee5c73f8
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
<modules> <modules>
<module>../biopet-core</module> <module>../biopet-core</module>
<module>../generate-indexes</module>
<module>../biopet-package</module> <module>../biopet-package</module>
<module>../bammetrics</module> <module>../bammetrics</module>
<module>../flexiprep</module> <module>../flexiprep</module>
......
...@@ -27,7 +27,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output } ...@@ -27,7 +27,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
*/ */
class BiopetPipe(val commands: List[BiopetCommandLineFunction]) extends BiopetCommandLineFunction { class BiopetPipe(val commands: List[BiopetCommandLineFunction]) extends BiopetCommandLineFunction {
@Input @Input(required = false)
lazy val input: List[File] = try { lazy val input: List[File] = try {
commands.flatMap(_.inputs) commands.flatMap(_.inputs)
} catch { } catch {
......
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Version }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.util.matching.Regex
/**
* Created by pjvan_thof on 17-5-16.
*/
class Awk(val root: Configurable) extends BiopetCommandLineFunction with Version {
executable = config("exe", default = "awk", freeVar = false)
def versionCommand: String = executable + " --version"
def versionRegex: Regex = """(GNU Awk \d+\.\d+\.\d+)""".r
@Input(required = false)
var input: File = _
@Output
var output: File = _
var command: String = _
def cmdLine = executable +
required(command) +
(if (inputAsStdin) "" else required(input)) +
(if (outputAsStsout) "" else " > " + required(output))
}
object Awk {
def apply(root: Configurable, command: String): Awk = {
val awk = new Awk(root)
awk.command = command
awk
}
}
\ No newline at end of file
...@@ -34,5 +34,5 @@ class Curl(val root: Configurable) extends BiopetCommandLineFunction with Versio ...@@ -34,5 +34,5 @@ class Curl(val root: Configurable) extends BiopetCommandLineFunction with Versio
def versionCommand = executable + " --version" def versionCommand = executable + " --version"
def versionRegex = """curl (\w+\.\w+\.\w+) .*""".r def versionRegex = """curl (\w+\.\w+\.\w+) .*""".r
def cmdLine: String = required(executable) + required(url) + " > " + required(output) def cmdLine: String = required(executable) + required(url) + (if (outputAsStsout) "" else " > " + required(output))
} }
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
* Created by pjvan_thof on 17-5-16.
*/
class GtfToGenePred(val root: Configurable) extends BiopetCommandLineFunction {
executable = config("exe", default = "gtfToGenePred", freeVar = false)
@Input
var inputGtfs: List[File] = Nil
@Output
var outputGenePred: File = _
@Output
var infoOut: Option[File] = None
var genePredExt: Boolean = config("gene _pred _ext", default = false)
var allErrors: Boolean = config("all_errors", default = false)
var impliedStopAfterCds: Boolean = config("implied_stop_after_cds", default = false)
var simple: Boolean = config("simple", default = false)
var geneNameAsName2: Boolean = config("gene _name_as_name2", default = false)
def cmdLine = executable +
conditional(genePredExt, "-genePredExt") +
conditional(allErrors, "-allErrors") +
optional("-infoOut", infoOut) +
conditional(allErrors, "-allErrors") +
conditional(impliedStopAfterCds, "-impliedStopAfterCds") +
conditional(simple, "-simple") +
conditional(geneNameAsName2, "-geneNameAsName2") +
repeat(inputGtfs) +
(if (outputAsStsout) required("/dev/stdout") else required(outputGenePred))
}
...@@ -75,7 +75,8 @@ class Star(val root: Configurable) extends BiopetCommandLineFunction with Refere ...@@ -75,7 +75,8 @@ class Star(val root: Configurable) extends BiopetCommandLineFunction with Refere
var genomeSAindexNbases: Option[Int] = config("genomesaindexnbases") var genomeSAindexNbases: Option[Int] = config("genomesaindexnbases")
var genomeSAsparseD: Option[Int] = config("genomesasparsed") var genomeSAsparseD: Option[Int] = config("genomesasparsed")
var sjdbGTFfile: Option[String] = config("sjdbgtfile") @Input(required = false)
var sjdbGTFfile: Option[File] = config("sjdbgtfile")
var sjdbGTFchrPrefix: Option[String] = config("sjdbgtfchrprefix") var sjdbGTFchrPrefix: Option[String] = config("sjdbgtfchrprefix")
var sjdbGTFfeatureExon: Option[String] = config("sjdbgtffeatureexon") var sjdbGTFfeatureExon: Option[String] = config("sjdbgtffeatureexon")
var sjdbGTFtagExonParentTranscript: Option[String] = config("sjdbgtftagexonparenttranscript") var sjdbGTFtagExonParentTranscript: Option[String] = config("sjdbgtftagexonparenttranscript")
......
...@@ -24,7 +24,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output } ...@@ -24,7 +24,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/** Extension for zcat */ /** Extension for zcat */
class Zcat(val root: Configurable) extends BiopetCommandLineFunction with Version { class Zcat(val root: Configurable) extends BiopetCommandLineFunction with Version {
@Input(doc = "Zipped file", required = true) @Input(doc = "Zipped file", required = true)
var input: List[File] = _ var input: List[File] = Nil
@Output(doc = "Unzipped file", required = true) @Output(doc = "Unzipped file", required = true)
var output: File = _ var output: File = _
......
...@@ -44,6 +44,11 @@ ...@@ -44,6 +44,11 @@
<artifactId>BiopetCore</artifactId> <artifactId>BiopetCore</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>GenerateIndexes</artifactId>
<version>${project.version}</version>
</dependency>
<dependency> <dependency>
<groupId>nl.lumc.sasc</groupId> <groupId>nl.lumc.sasc</groupId>
<artifactId>Flexiprep</artifactId> <artifactId>Flexiprep</artifactId>
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
*/ */
package nl.lumc.sasc.biopet package nl.lumc.sasc.biopet
import nl.lumc.sasc.biopet.pipelines.generateindexes.GenerateIndexes
import nl.lumc.sasc.biopet.utils.{ BiopetExecutable, MainCommand } import nl.lumc.sasc.biopet.utils.{ BiopetExecutable, MainCommand }
object BiopetExecutableMain extends BiopetExecutable { object BiopetExecutableMain extends BiopetExecutable {
...@@ -36,7 +37,8 @@ object BiopetExecutableMain extends BiopetExecutable { ...@@ -36,7 +37,8 @@ object BiopetExecutableMain extends BiopetExecutable {
nl.lumc.sasc.biopet.pipelines.gwastest.GwasTest, nl.lumc.sasc.biopet.pipelines.gwastest.GwasTest,
nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling, nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling,
nl.lumc.sasc.biopet.pipelines.basty.Basty, nl.lumc.sasc.biopet.pipelines.basty.Basty,
nl.lumc.sasc.biopet.pipelines.shiva.Shiva nl.lumc.sasc.biopet.pipelines.shiva.Shiva,
GenerateIndexes
) )
def tools: List[MainCommand] = BiopetToolsExecutable.tools def tools: List[MainCommand] = BiopetToolsExecutable.tools
......
...@@ -45,6 +45,18 @@ ...@@ -45,6 +45,18 @@
<artifactId>BiopetExtensions</artifactId> <artifactId>BiopetExtensions</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.10</artifactId>
<version>2.2.1</version>
<scope>test</scope>
</dependency>
</dependencies> </dependencies>
</project> </project>
\ No newline at end of file
package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
* Created by pjvan_thof on 13-5-16.
*/
class FastaMerging(val root: Configurable) extends BiopetCommandLineFunction {
@Input
var input: List[File] = Nil
@Output(required = true)
var output: File = _
var cmds: Array[BiopetCommandLineFunction] = Array()
def cmdLine = cmds.map(_.commandLine).mkString(" && ")
}
...@@ -13,14 +13,13 @@ ...@@ -13,14 +13,13 @@
* license; For commercial users or users who do not want to follow the AGPL * license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license. * license, please contact us to obtain a separate license.
*/ */
package nl.lumc.sasc.biopet.pipelines package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.io.PrintWriter import java.io.{ File, PrintWriter }
import java.util import java.util
import nl.lumc.sasc.biopet.core.extensions.Md5sum import nl.lumc.sasc.biopet.core.extensions.Md5sum
import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.extensions._ import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.extensions.bowtie.{ Bowtie2Build, BowtieBuild } import nl.lumc.sasc.biopet.extensions.bowtie.{ Bowtie2Build, BowtieBuild }
import nl.lumc.sasc.biopet.extensions.bwa.BwaIndex import nl.lumc.sasc.biopet.extensions.bwa.BwaIndex
...@@ -29,26 +28,26 @@ import nl.lumc.sasc.biopet.extensions.gmap.GmapBuild ...@@ -29,26 +28,26 @@ import nl.lumc.sasc.biopet.extensions.gmap.GmapBuild
import nl.lumc.sasc.biopet.extensions.picard.CreateSequenceDictionary import nl.lumc.sasc.biopet.extensions.picard.CreateSequenceDictionary
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFaidx import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFaidx
import nl.lumc.sasc.biopet.utils.ConfigUtils import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript import org.broadinstitute.gatk.queue.QScript
import scala.language.reflectiveCalls
import scala.collection.JavaConversions._ import scala.collection.JavaConversions._
import scala.language.reflectiveCalls
class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript { class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null) def this() = this(null)
@Argument @Argument(required = true)
var referenceConfigFile: File = _ var referenceConfigFiles: List[File] = Nil
var referenceConfig: Map[String, Any] = Map()
var configDeps: List[File] = Nil var referenceConfig: Map[String, Any] = null
def outputConfigFile = new File(outputDir, "reference.json") protected var configDeps: List[File] = Nil
/** This is executed before the script starts */ /** This is executed before the script starts */
def init(): Unit = { def init(): Unit = {
referenceConfig = ConfigUtils.fileToConfigMap(referenceConfigFile) if (referenceConfig == null)
referenceConfig = referenceConfigFiles.foldLeft(Map[String, Any]())((a, b) => ConfigUtils.mergeMaps(a, ConfigUtils.fileToConfigMap(b)))
} }
/** Method where jobs must be added */ /** Method where jobs must be added */
...@@ -58,11 +57,13 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript ...@@ -58,11 +57,13 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
val speciesConfig = ConfigUtils.any2map(c) val speciesConfig = ConfigUtils.any2map(c)
val speciesDir = new File(outputDir, speciesName) val speciesDir = new File(outputDir, speciesName)
for ((genomeName, c) <- speciesConfig) yield genomeName -> { for ((genomeName, c) <- speciesConfig) yield genomeName -> {
var configDeps: List[File] = Nil
val genomeConfig = ConfigUtils.any2map(c) val genomeConfig = ConfigUtils.any2map(c)
val fastaUris = genomeConfig.getOrElse("fasta_uri", val fastaUris = genomeConfig.getOrElse("fasta_uri",
throw new IllegalArgumentException(s"No fasta_uri found for $speciesName - $genomeName")) match { throw new IllegalArgumentException(s"No fasta_uri found for $speciesName - $genomeName")) match {
case a: Array[_] => a.map(_.toString) case a: Traversable[_] => a.map(_.toString).toArray
case a => Array(a.toString) case a: util.ArrayList[_] => a.map(_.toString).toArray
case a => Array(a.toString)
} }
val genomeDir = new File(speciesDir, genomeName) val genomeDir = new File(speciesDir, genomeName)
...@@ -83,18 +84,10 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript ...@@ -83,18 +84,10 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
curl.output curl.output
} }
val fastaCat = new CommandLineFunction { val fastaCat = new FastaMerging(this)
var cmds: Array[BiopetCommandLineFunction] = Array() fastaCat.output = fastaFile
@Input if (fastaUris.length > 1 || fastaFiles.exists(_.getName.endsWith(".gz"))) {
var input: List[File] = Nil
@Output
var output = fastaFile
def commandLine = cmds.mkString(" && ")
}
if (fastaUris.length > 1 || fastaFiles.filter(_.getName.endsWith(".gz")).nonEmpty) {
fastaFiles.foreach { file => fastaFiles.foreach { file =>
if (file.getName.endsWith(".gz")) { if (file.getName.endsWith(".gz")) {
val zcat = new Zcat(this) val zcat = new Zcat(this)
...@@ -159,14 +152,13 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript ...@@ -159,14 +152,13 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
val regex = """.*\/(.*)_vep_(\d*)_(.*)\.tar\.gz""".r val regex = """.*\/(.*)_vep_(\d*)_(.*)\.tar\.gz""".r
vepCacheUri.toString match { vepCacheUri.toString match {
case regex(species, version, assembly) if (version.forall(_.isDigit)) => { case regex(species, version, assembly) if version.forall(_.isDigit) =>
outputConfig ++= Map("varianteffectpredictor" -> Map( outputConfig ++= Map("varianteffectpredictor" -> Map(
"species" -> species, "species" -> species,
"assembly" -> assembly, "assembly" -> assembly,
"cache_version" -> version.toInt, "cache_version" -> version.toInt,
"cache" -> vepDir, "cache" -> vepDir,
"fasta" -> createLinks(vepDir))) "fasta" -> createLinks(vepDir)))
}
case _ => throw new IllegalArgumentException("Cache found but no version was found") case _ => throw new IllegalArgumentException("Cache found but no version was found")
} }
} }
...@@ -183,13 +175,14 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript ...@@ -183,13 +175,14 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
add(curl) add(curl)
cv.variant :+= curl.output cv.variant :+= curl.output
val tabix = new Tabix(this) if (curl.output.getName.endsWith(".vcf.gz")) {
tabix.input = curl.output val tabix = new Tabix(this)
tabix.p = Some("vcf") tabix.input = curl.output
tabix.isIntermediate = true tabix.p = Some("vcf")
add(tabix) tabix.isIntermediate = true
configDeps :+= tabix.outputIndex add(tabix)
cv.deps ::= tabix.outputIndex configDeps :+= tabix.outputIndex
}
} }
dbsnpUri match { dbsnpUri match {
...@@ -200,6 +193,28 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript ...@@ -200,6 +193,28 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
cv.out = new File(annotationDir, "dbsnp.vcf.gz") cv.out = new File(annotationDir, "dbsnp.vcf.gz")
add(cv) add(cv)
outputConfig += "dbsnp" -> cv.out
}
val gtfFile: Option[File] = genomeConfig.get("gtf_uri").map { gtfUri =>
val outputFile = new File(annotationDir, new File(gtfUri.toString).getName.stripSuffix(".gz"))
val curl = new Curl(this)
curl.url = gtfUri.toString
if (gtfUri.toString.endsWith(".gz")) add(curl | Zcat(this) > outputFile)
else add(curl > outputFile)
outputConfig += "annotation_gtf" -> outputFile
outputFile
}
val refFlatFile: Option[File] = gtfFile.map { gtf =>
val refFlat = new File(gtf + ".refFlat")
val gtfToGenePred = new GtfToGenePred(this)
gtfToGenePred.inputGtfs :+= gtf
add(gtfToGenePred | Awk(this, """{ print $12"\t"$1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10 }""") > refFlat)
outputConfig += "annotation_refflat" -> refFlat
refFlat
} }
// Bwa index // Bwa index
...@@ -220,11 +235,13 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript ...@@ -220,11 +235,13 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
outputConfig += "gsnap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName) outputConfig += "gsnap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName)
outputConfig += "gmap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName) outputConfig += "gmap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName)
// STAR index
val starDir = new File(genomeDir, "star") val starDir = new File(genomeDir, "star")
val starIndex = new Star(this) val starIndex = new Star(this)
starIndex.outputDir = starDir starIndex.outputDir = starDir
starIndex.reference = createLinks(starDir) starIndex.reference = createLinks(starDir)
starIndex.runmode = "genomeGenerate" starIndex.runmode = "genomeGenerate"
starIndex.sjdbGTFfile = gtfFile
add(starIndex) add(starIndex)
configDeps :+= starIndex.jobOutputFile configDeps :+= starIndex.jobOutputFile
outputConfig += "star" -> Map( outputConfig += "star" -> Map(
...@@ -232,6 +249,7 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript ...@@ -232,6 +249,7 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
"genomeDir" -> starDir.getAbsolutePath "genomeDir" -> starDir.getAbsolutePath
) )
// Bowtie index
val bowtieIndex = new BowtieBuild(this) val bowtieIndex = new BowtieBuild(this)
bowtieIndex.reference = createLinks(new File(genomeDir, "bowtie")) bowtieIndex.reference = createLinks(new File(genomeDir, "bowtie"))
bowtieIndex.baseName = "reference" bowtieIndex.baseName = "reference"
...@@ -239,6 +257,7 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript ...@@ -239,6 +257,7 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
configDeps :+= bowtieIndex.jobOutputFile configDeps :+= bowtieIndex.jobOutputFile
outputConfig += "bowtie" -> Map("reference_fasta" -> bowtieIndex.reference.getAbsolutePath) outputConfig += "bowtie" -> Map("reference_fasta" -> bowtieIndex.reference.getAbsolutePath)
// Bowtie2 index
val bowtie2Index = new Bowtie2Build(this) val bowtie2Index = new Bowtie2Build(this)
bowtie2Index.reference = createLinks(new File(genomeDir, "bowtie2")) bowtie2Index.reference = createLinks(new File(genomeDir, "bowtie2"))
bowtie2Index.baseName = "reference" bowtie2Index.baseName = "reference"
...@@ -249,19 +268,22 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript ...@@ -249,19 +268,22 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
"bowtie_index" -> bowtie2Index.reference.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta") "bowtie_index" -> bowtie2Index.reference.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta")
) )
val writeConfig = new WriteConfig
writeConfig.deps = configDeps
writeConfig.out = new File(genomeDir, s"$speciesName-$genomeName.json")
writeConfig.config = Map("references" -> Map(speciesName -> Map(genomeName -> outputConfig)))
add(writeConfig)
this.configDeps :::= configDeps
outputConfig outputConfig
} }
} }
add(new InProcessFunction { val writeConfig = new WriteConfig
@Input val deps: List[File] = configDeps writeConfig.deps = configDeps
writeConfig.out = new File(outputDir, "references.json")
def run: Unit = { writeConfig.config = Map("references" -> outputConfig)
val writer = new PrintWriter(outputConfigFile) add(writeConfig)
writer.println(ConfigUtils.mapToJson(Map("references" -> outputConfig)).spaces2)
writer.close()
}
})
} }
} }
......
package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.io.{ File, PrintWriter }
import nl.lumc.sasc.biopet.utils.ConfigUtils
import org.broadinstitute.gatk.queue.function.InProcessFunction
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
* Created by pjvanthof on 15/05/16.
*/
class WriteConfig extends InProcessFunction {
@Input
var deps: List[File] = Nil
@Output(required = true)
var out: File = _
var config: Map[String, Any] = _
def run: Unit = {
val writer = new PrintWriter(out)
writer.println(ConfigUtils.mapToJson(config).spaces2)
writer.close()
}
}
\ No newline at end of file
package nl.lumc.sasc.biopet.pipelines.generateindexes
import com.google.common.io.Files
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Config
import org.broadinstitute.gatk.queue.QSettings
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.Test
/**
* Created by pjvan_thof on 13-5-16.
*/
class GenerateIndexesTest extends TestNGSuite with Matchers {
def initPipeline(map: Map[String, Any]): GenerateIndexes = {