Commit e11ff161 authored by bow

Merge branch 'feature-index_generation' into 'develop'

Reference index generation



See merge request !396
parents 11c11c41 ee5c73f8
......@@ -17,6 +17,7 @@
<modules>
<module>../biopet-core</module>
<module>../generate-indexes</module>
<module>../biopet-package</module>
<module>../bammetrics</module>
<module>../flexiprep</module>
......
......@@ -27,7 +27,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
*/
class BiopetPipe(val commands: List[BiopetCommandLineFunction]) extends BiopetCommandLineFunction {
@Input
@Input(required = false)
lazy val input: List[File] = try {
commands.flatMap(_.inputs)
} catch {
......
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Version }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.util.matching.Regex
/**
 * Command line wrapper around the `awk` executable.
 *
 * The awk program text is given in [[command]]. The input file argument is left out when
 * the job reads from stdin, and the `>` redirect is left out when the job writes to stdout.
 *
 * Created by pjvan_thof on 17-5-16.
 */
class Awk(val root: Configurable) extends BiopetCommandLineFunction with Version {
  executable = config("exe", default = "awk", freeVar = false)

  /** Command used to retrieve the version of the executable */
  def versionCommand: String = executable + " --version"

  /** Regex applied to the version output; it only recognises a "GNU Awk x.y.z" banner */
  def versionRegex: Regex = """(GNU Awk \d+\.\d+\.\d+)""".r

  /** File passed to awk as positional argument, unused when the input comes from stdin */
  @Input(required = false)
  var input: File = _

  /** File the awk output is redirected to, unused when the output goes to stdout */
  @Output
  var output: File = _

  /** The awk program text */
  var command: String = _

  /** Builds the command line: executable, program text, optional input file, optional redirect */
  def cmdLine = {
    val programPart = required(command)
    val inputPart = if (inputAsStdin) "" else required(input)
    val redirectPart = if (outputAsStsout) "" else " > " + required(output)
    executable + programPart + inputPart + redirectPart
  }
}
/** Factory for [[Awk]] jobs */
object Awk {
  /**
   * Creates an Awk job with its program text already set.
   *
   * @param root parent configurable of the new job
   * @param command awk program text to run
   * @return the configured job; input and output are left unset
   */
  def apply(root: Configurable, command: String): Awk = {
    val job = new Awk(root)
    job.command = command
    job
  }
}
\ No newline at end of file
......@@ -34,5 +34,5 @@ class Curl(val root: Configurable) extends BiopetCommandLineFunction with Versio
def versionCommand = executable + " --version"
def versionRegex = """curl (\w+\.\w+\.\w+) .*""".r
def cmdLine: String = required(executable) + required(url) + " > " + required(output)
def cmdLine: String = required(executable) + required(url) + (if (outputAsStsout) "" else " > " + required(output))
}
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
 * Command line wrapper around the `gtfToGenePred` executable.
 *
 * Created by pjvan_thof on 17-5-16.
 */
class GtfToGenePred(val root: Configurable) extends BiopetCommandLineFunction {
  executable = config("exe", default = "gtfToGenePred", freeVar = false)

  /** GTF files, passed as positional arguments before the output */
  @Input
  var inputGtfs: List[File] = Nil

  /** Output file, replaced by /dev/stdout on the command line when the job writes to stdout */
  @Output
  var outputGenePred: File = _

  /** Optional file for the `-infoOut` option; not required because it defaults to None */
  @Output(required = false)
  var infoOut: Option[File] = None

  // Config keys follow the snake_case convention of the other options; the previous
  // "gene _pred _ext" and "gene _name_as_name2" keys contained stray spaces.
  var genePredExt: Boolean = config("gene_pred_ext", default = false)

  var allErrors: Boolean = config("all_errors", default = false)

  var impliedStopAfterCds: Boolean = config("implied_stop_after_cds", default = false)

  var simple: Boolean = config("simple", default = false)

  var geneNameAsName2: Boolean = config("gene_name_as_name2", default = false)

  /** Builds the command line; every flag is emitted at most once */
  def cmdLine = executable +
    conditional(genePredExt, "-genePredExt") +
    conditional(allErrors, "-allErrors") +
    optional("-infoOut", infoOut) +
    conditional(impliedStopAfterCds, "-impliedStopAfterCds") +
    conditional(simple, "-simple") +
    conditional(geneNameAsName2, "-geneNameAsName2") +
    repeat(inputGtfs) +
    (if (outputAsStsout) required("/dev/stdout") else required(outputGenePred))
}
......@@ -75,7 +75,8 @@ class Star(val root: Configurable) extends BiopetCommandLineFunction with Refere
var genomeSAindexNbases: Option[Int] = config("genomesaindexnbases")
var genomeSAsparseD: Option[Int] = config("genomesasparsed")
var sjdbGTFfile: Option[String] = config("sjdbgtfile")
@Input(required = false)
var sjdbGTFfile: Option[File] = config("sjdbgtfile")
var sjdbGTFchrPrefix: Option[String] = config("sjdbgtfchrprefix")
var sjdbGTFfeatureExon: Option[String] = config("sjdbgtffeatureexon")
var sjdbGTFtagExonParentTranscript: Option[String] = config("sjdbgtftagexonparenttranscript")
......
......@@ -24,7 +24,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/** Extension for zcat */
class Zcat(val root: Configurable) extends BiopetCommandLineFunction with Version {
@Input(doc = "Zipped file", required = true)
var input: List[File] = _
var input: List[File] = Nil
@Output(doc = "Unzipped file", required = true)
var output: File = _
......
......@@ -44,6 +44,11 @@
<artifactId>BiopetCore</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>GenerateIndexes</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Flexiprep</artifactId>
......
......@@ -15,6 +15,7 @@
*/
package nl.lumc.sasc.biopet
import nl.lumc.sasc.biopet.pipelines.generateindexes.GenerateIndexes
import nl.lumc.sasc.biopet.utils.{ BiopetExecutable, MainCommand }
object BiopetExecutableMain extends BiopetExecutable {
......@@ -36,7 +37,8 @@ object BiopetExecutableMain extends BiopetExecutable {
nl.lumc.sasc.biopet.pipelines.gwastest.GwasTest,
nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling,
nl.lumc.sasc.biopet.pipelines.basty.Basty,
nl.lumc.sasc.biopet.pipelines.shiva.Shiva
nl.lumc.sasc.biopet.pipelines.shiva.Shiva,
GenerateIndexes
)
def tools: List[MainCommand] = BiopetToolsExecutable.tools
......
......@@ -45,6 +45,18 @@
<artifactId>BiopetExtensions</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.10</artifactId>
<version>2.2.1</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
 * Command line job whose command is the command lines of the jobs in [[cmds]],
 * joined with " && ".
 *
 * Created by pjvan_thof on 13-5-16.
 */
class FastaMerging(val root: Configurable) extends BiopetCommandLineFunction {
  /** Files declared as inputs of this job */
  @Input
  var input: List[File] = Nil

  /** File declared as output of this job */
  @Output(required = true)
  var output: File = _

  /** Jobs whose command lines are chained, in array order */
  var cmds: Array[BiopetCommandLineFunction] = Array()

  /** Joins the command line of each job in cmds with " && " */
  def cmdLine = {
    val commandLines = for (cmd <- cmds) yield cmd.commandLine
    commandLines.mkString(" && ")
  }
}
......@@ -13,14 +13,13 @@
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines
package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.io.PrintWriter
import java.io.{ File, PrintWriter }
import java.util
import nl.lumc.sasc.biopet.core.extensions.Md5sum
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.extensions.bowtie.{ Bowtie2Build, BowtieBuild }
import nl.lumc.sasc.biopet.extensions.bwa.BwaIndex
......@@ -29,26 +28,26 @@ import nl.lumc.sasc.biopet.extensions.gmap.GmapBuild
import nl.lumc.sasc.biopet.extensions.picard.CreateSequenceDictionary
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFaidx
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript
import scala.language.reflectiveCalls
import scala.collection.JavaConversions._
import scala.language.reflectiveCalls
class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Argument
var referenceConfigFile: File = _
var referenceConfig: Map[String, Any] = Map()
@Argument(required = true)
var referenceConfigFiles: List[File] = Nil
var configDeps: List[File] = Nil
var referenceConfig: Map[String, Any] = null
def outputConfigFile = new File(outputDir, "reference.json")
protected var configDeps: List[File] = Nil
/** This is executed before the script starts */
def init(): Unit = {
referenceConfig = ConfigUtils.fileToConfigMap(referenceConfigFile)
if (referenceConfig == null)
referenceConfig = referenceConfigFiles.foldLeft(Map[String, Any]())((a, b) => ConfigUtils.mergeMaps(a, ConfigUtils.fileToConfigMap(b)))
}
/** Method where jobs must be added */
......@@ -58,11 +57,13 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
val speciesConfig = ConfigUtils.any2map(c)
val speciesDir = new File(outputDir, speciesName)
for ((genomeName, c) <- speciesConfig) yield genomeName -> {
var configDeps: List[File] = Nil
val genomeConfig = ConfigUtils.any2map(c)
val fastaUris = genomeConfig.getOrElse("fasta_uri",
throw new IllegalArgumentException(s"No fasta_uri found for $speciesName - $genomeName")) match {
case a: Array[_] => a.map(_.toString)
case a => Array(a.toString)
case a: Traversable[_] => a.map(_.toString).toArray
case a: util.ArrayList[_] => a.map(_.toString).toArray
case a => Array(a.toString)
}
val genomeDir = new File(speciesDir, genomeName)
......@@ -83,18 +84,10 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
curl.output
}
val fastaCat = new CommandLineFunction {
var cmds: Array[BiopetCommandLineFunction] = Array()
val fastaCat = new FastaMerging(this)
fastaCat.output = fastaFile
@Input
var input: List[File] = Nil
@Output
var output = fastaFile
def commandLine = cmds.mkString(" && ")
}
if (fastaUris.length > 1 || fastaFiles.filter(_.getName.endsWith(".gz")).nonEmpty) {
if (fastaUris.length > 1 || fastaFiles.exists(_.getName.endsWith(".gz"))) {
fastaFiles.foreach { file =>
if (file.getName.endsWith(".gz")) {
val zcat = new Zcat(this)
......@@ -159,14 +152,13 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
val regex = """.*\/(.*)_vep_(\d*)_(.*)\.tar\.gz""".r
vepCacheUri.toString match {
case regex(species, version, assembly) if (version.forall(_.isDigit)) => {
case regex(species, version, assembly) if version.forall(_.isDigit) =>
outputConfig ++= Map("varianteffectpredictor" -> Map(
"species" -> species,
"assembly" -> assembly,
"cache_version" -> version.toInt,
"cache" -> vepDir,
"fasta" -> createLinks(vepDir)))
}
case _ => throw new IllegalArgumentException("Cache found but no version was found")
}
}
......@@ -183,13 +175,14 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
add(curl)
cv.variant :+= curl.output
val tabix = new Tabix(this)
tabix.input = curl.output
tabix.p = Some("vcf")
tabix.isIntermediate = true
add(tabix)
configDeps :+= tabix.outputIndex
cv.deps ::= tabix.outputIndex
if (curl.output.getName.endsWith(".vcf.gz")) {
val tabix = new Tabix(this)
tabix.input = curl.output
tabix.p = Some("vcf")
tabix.isIntermediate = true
add(tabix)
configDeps :+= tabix.outputIndex
}
}
dbsnpUri match {
......@@ -200,6 +193,28 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
cv.out = new File(annotationDir, "dbsnp.vcf.gz")
add(cv)
outputConfig += "dbsnp" -> cv.out
}
val gtfFile: Option[File] = genomeConfig.get("gtf_uri").map { gtfUri =>
val outputFile = new File(annotationDir, new File(gtfUri.toString).getName.stripSuffix(".gz"))
val curl = new Curl(this)
curl.url = gtfUri.toString
if (gtfUri.toString.endsWith(".gz")) add(curl | Zcat(this) > outputFile)
else add(curl > outputFile)
outputConfig += "annotation_gtf" -> outputFile
outputFile
}
val refFlatFile: Option[File] = gtfFile.map { gtf =>
val refFlat = new File(gtf + ".refFlat")
val gtfToGenePred = new GtfToGenePred(this)
gtfToGenePred.inputGtfs :+= gtf
add(gtfToGenePred | Awk(this, """{ print $12"\t"$1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10 }""") > refFlat)
outputConfig += "annotation_refflat" -> refFlat
refFlat
}
// Bwa index
......@@ -220,11 +235,13 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
outputConfig += "gsnap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName)
outputConfig += "gmap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName)
// STAR index
val starDir = new File(genomeDir, "star")
val starIndex = new Star(this)
starIndex.outputDir = starDir
starIndex.reference = createLinks(starDir)
starIndex.runmode = "genomeGenerate"
starIndex.sjdbGTFfile = gtfFile
add(starIndex)
configDeps :+= starIndex.jobOutputFile
outputConfig += "star" -> Map(
......@@ -232,6 +249,7 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
"genomeDir" -> starDir.getAbsolutePath
)
// Bowtie index
val bowtieIndex = new BowtieBuild(this)
bowtieIndex.reference = createLinks(new File(genomeDir, "bowtie"))
bowtieIndex.baseName = "reference"
......@@ -239,6 +257,7 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
configDeps :+= bowtieIndex.jobOutputFile
outputConfig += "bowtie" -> Map("reference_fasta" -> bowtieIndex.reference.getAbsolutePath)
// Bowtie2 index
val bowtie2Index = new Bowtie2Build(this)
bowtie2Index.reference = createLinks(new File(genomeDir, "bowtie2"))
bowtie2Index.baseName = "reference"
......@@ -249,19 +268,22 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
"bowtie_index" -> bowtie2Index.reference.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta")
)
val writeConfig = new WriteConfig
writeConfig.deps = configDeps
writeConfig.out = new File(genomeDir, s"$speciesName-$genomeName.json")
writeConfig.config = Map("references" -> Map(speciesName -> Map(genomeName -> outputConfig)))
add(writeConfig)
this.configDeps :::= configDeps
outputConfig
}
}
add(new InProcessFunction {
@Input val deps: List[File] = configDeps
def run: Unit = {
val writer = new PrintWriter(outputConfigFile)
writer.println(ConfigUtils.mapToJson(Map("references" -> outputConfig)).spaces2)
writer.close()
}
})
val writeConfig = new WriteConfig
writeConfig.deps = configDeps
writeConfig.out = new File(outputDir, "references.json")
writeConfig.config = Map("references" -> outputConfig)
add(writeConfig)
}
}
......
package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.io.{ File, PrintWriter }
import nl.lumc.sasc.biopet.utils.ConfigUtils
import org.broadinstitute.gatk.queue.function.InProcessFunction
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
 * In-process job that writes a config map as pretty-printed JSON to [[out]].
 *
 * Created by pjvanthof on 15/05/16.
 */
class WriteConfig extends InProcessFunction {
  /** Files declared as inputs of this job; run does not read them */
  @Input
  var deps: List[File] = Nil

  /** File the JSON is written to */
  @Output(required = true)
  var out: File = _

  /** Config map to serialise */
  var config: Map[String, Any] = _

  /**
   * Serialises [[config]] and writes it to [[out]].
   *
   * @throws java.io.IOException when the underlying writer reports a write error
   */
  def run: Unit = {
    // Serialise before opening the file: opening truncates it, so a failing
    // conversion would otherwise leave an empty output file behind.
    val json = ConfigUtils.mapToJson(config).spaces2
    val writer = new PrintWriter(out)
    try {
      writer.println(json)
      // PrintWriter swallows IOExceptions; checkError flushes and reports them
      if (writer.checkError()) throw new java.io.IOException(s"Failed to write config to $out")
    } finally writer.close()
  }
}
\ No newline at end of file
package nl.lumc.sasc.biopet.pipelines.generateindexes
import com.google.common.io.Files
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Config
import org.broadinstitute.gatk.queue.QSettings
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.Test
/**
 * Checks that the GenerateIndexes script can be built for several reference configs.
 *
 * Each test sets a reference config for species "s1" / genome "g1" and calls script().
 *
 * Created by pjvan_thof on 13-5-16.
 */
class GenerateIndexesTest extends TestNGSuite with Matchers {
  /** Creates a pipeline whose global config is the given map merged with the shared test config */
  def initPipeline(map: Map[String, Any]): GenerateIndexes = {
    new GenerateIndexes() {
      override def configNamespace = "generateindexes"
      override def globalConfig = new Config(ConfigUtils.mergeMaps(map, GenerateIndexesTest.config))
      qSettings = new QSettings
      qSettings.runName = "test"
    }
  }

  /** Creates a pipeline whose reference config holds the given genome config under s1 / g1 */
  private def pipelineFor(genomeConfig: Map[String, Any]): GenerateIndexes = {
    val pipeline = initPipeline(Map())
    pipeline.referenceConfig = Map("s1" -> Map("g1" -> genomeConfig))
    pipeline
  }

  @Test
  def testNoFastaUri: Unit = {
    val pipeline = pipelineFor(Map("" -> ""))
    // A genome config without fasta_uri must be rejected
    intercept[IllegalArgumentException] {
      pipeline.script()
    }
  }

  @Test
  def testSingleFasta: Unit =
    pipelineFor(Map("fasta_uri" -> "uri")).script()

  @Test
  def testMultiFasta: Unit =
    pipelineFor(Map("fasta_uri" -> List("uri", "uri2", "uri3.gz"))).script()

  @Test
  def testSingleDbsnp: Unit =
    pipelineFor(Map("fasta_uri" -> "uri", "dbsnp_vcf_uri" -> "uri.vcf.gz")).script()

  @Test
  def testMultiDbsnp: Unit =
    pipelineFor(Map("fasta_uri" -> "uri", "dbsnp_vcf_uri" -> List("uri.vcf.gz", "uri2.vcf.gz"))).script()

  @Test
  def testVep: Unit =
    pipelineFor(Map("fasta_uri" -> "uri", "vep_cache_uri" -> "something/human_vep_80_hg19.tar.gz")).script()

  @Test
  def testGtfZipped: Unit =
    pipelineFor(Map("fasta_uri" -> "uri", "gtf_uri" -> "bla.gf.gz")).script()

  @Test
  def testGtf: Unit =
    pipelineFor(Map("fasta_uri" -> "uri", "gtf_uri" -> "bla.gf")).script()
}
/** Shared fixtures for [[GenerateIndexesTest]] */
object GenerateIndexesTest {
  /** Temporary directory used as the pipeline output directory */
  val outputDir = Files.createTempDir()
  // NOTE(review): File.deleteOnExit only removes a directory that is empty at JVM exit — confirm nothing is left behind
  outputDir.deleteOnExit()

  /**
   * Base config merged into every test pipeline: the output directory plus placeholder
   * "test" values for the tool executables and the GATK jar.
   */
  val config = Map("output_dir" -> outputDir,
    "bwa" -> Map("exe" -> "test"),
    "star" -> Map("exe" -> "test"),
    "bowtiebuild" -> Map("exe" -> "test"),
    "bowtie2build" -> Map("exe" -> "test"),
    "gmapbuild" -> Map("exe" -> "test"),
    "samtools" -> Map("exe" -> "test"),
    "md5sum" -> Map("exe" -> "test"),
    "gatk_jar" -> "test",
    "tabix" -> Map("exe" -> "test")
  )
}
package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.io.File
import nl.lumc.sasc.biopet.utils.ConfigUtils
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.Test
/**
 * Round-trip test for [[WriteConfig]]: a written config must read back unchanged.
 *
 * Created by pjvanthof on 17/05/16.
 */
class WriteConfigTest extends TestNGSuite with Matchers {
  @Test
  def testWriteConfig: Unit = {
    val expected = Map("test" -> "bla")

    val job = new WriteConfig
    job.config = expected
    job.out = File.createTempFile("config.", ".json")
    job.out.deleteOnExit()

    // Run the job in-process and parse the file it produced
    job.run
    ConfigUtils.fileToConfigMap(job.out) shouldBe Map("test" -> "bla")
  }
}
......@@ -171,6 +171,7 @@ class ShivaWithAnnotationTest extends ShivaTestTrait {
object ShivaTest {
val outputDir = Files.createTempDir()
outputDir.deleteOnExit()
new File(outputDir, "input").mkdirs()
def inputTouch(name: String): String = {
val file = new File(outputDir, "input" + File.separator + name)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment