Skip to content
Snippets Groups Projects
Commit e11ff161 authored by bow's avatar bow
Browse files

Merge branch 'feature-index_generation' into 'develop'

Reference index generation



See merge request !396
parents 11c11c41 ee5c73f8
No related branches found
No related tags found
No related merge requests found
Showing
with 345 additions and 48 deletions
......@@ -17,6 +17,7 @@
<modules>
<module>../biopet-core</module>
<module>../generate-indexes</module>
<module>../biopet-package</module>
<module>../bammetrics</module>
<module>../flexiprep</module>
......
......@@ -27,7 +27,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
*/
class BiopetPipe(val commands: List[BiopetCommandLineFunction]) extends BiopetCommandLineFunction {
@Input
@Input(required = false)
lazy val input: List[File] = try {
commands.flatMap(_.inputs)
} catch {
......
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Version }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.util.matching.Regex
/**
* Created by pjvan_thof on 17-5-16.
*/
class Awk(val root: Configurable) extends BiopetCommandLineFunction with Version {
executable = config("exe", default = "awk", freeVar = false)
def versionCommand: String = executable + " --version"
def versionRegex: Regex = """(GNU Awk \d+\.\d+\.\d+)""".r
@Input(required = false)
var input: File = _
@Output
var output: File = _
var command: String = _
def cmdLine = executable +
required(command) +
(if (inputAsStdin) "" else required(input)) +
(if (outputAsStsout) "" else " > " + required(output))
}
object Awk {
def apply(root: Configurable, command: String): Awk = {
val awk = new Awk(root)
awk.command = command
awk
}
}
\ No newline at end of file
......@@ -34,5 +34,5 @@ class Curl(val root: Configurable) extends BiopetCommandLineFunction with Versio
def versionCommand = executable + " --version"
def versionRegex = """curl (\w+\.\w+\.\w+) .*""".r
def cmdLine: String = required(executable) + required(url) + " > " + required(output)
def cmdLine: String = required(executable) + required(url) + (if (outputAsStsout) "" else " > " + required(output))
}
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
* Created by pjvan_thof on 17-5-16.
*/
class GtfToGenePred(val root: Configurable) extends BiopetCommandLineFunction {
executable = config("exe", default = "gtfToGenePred", freeVar = false)
@Input
var inputGtfs: List[File] = Nil
@Output
var outputGenePred: File = _
@Output
var infoOut: Option[File] = None
var genePredExt: Boolean = config("gene _pred _ext", default = false)
var allErrors: Boolean = config("all_errors", default = false)
var impliedStopAfterCds: Boolean = config("implied_stop_after_cds", default = false)
var simple: Boolean = config("simple", default = false)
var geneNameAsName2: Boolean = config("gene _name_as_name2", default = false)
def cmdLine = executable +
conditional(genePredExt, "-genePredExt") +
conditional(allErrors, "-allErrors") +
optional("-infoOut", infoOut) +
conditional(allErrors, "-allErrors") +
conditional(impliedStopAfterCds, "-impliedStopAfterCds") +
conditional(simple, "-simple") +
conditional(geneNameAsName2, "-geneNameAsName2") +
repeat(inputGtfs) +
(if (outputAsStsout) required("/dev/stdout") else required(outputGenePred))
}
......@@ -75,7 +75,8 @@ class Star(val root: Configurable) extends BiopetCommandLineFunction with Refere
var genomeSAindexNbases: Option[Int] = config("genomesaindexnbases")
var genomeSAsparseD: Option[Int] = config("genomesasparsed")
var sjdbGTFfile: Option[String] = config("sjdbgtfile")
@Input(required = false)
var sjdbGTFfile: Option[File] = config("sjdbgtfile")
var sjdbGTFchrPrefix: Option[String] = config("sjdbgtfchrprefix")
var sjdbGTFfeatureExon: Option[String] = config("sjdbgtffeatureexon")
var sjdbGTFtagExonParentTranscript: Option[String] = config("sjdbgtftagexonparenttranscript")
......
......@@ -24,7 +24,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/** Extension for zcat */
class Zcat(val root: Configurable) extends BiopetCommandLineFunction with Version {
@Input(doc = "Zipped file", required = true)
var input: List[File] = _
var input: List[File] = Nil
@Output(doc = "Unzipped file", required = true)
var output: File = _
......
......@@ -44,6 +44,11 @@
<artifactId>BiopetCore</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>GenerateIndexes</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Flexiprep</artifactId>
......
......@@ -15,6 +15,7 @@
*/
package nl.lumc.sasc.biopet
import nl.lumc.sasc.biopet.pipelines.generateindexes.GenerateIndexes
import nl.lumc.sasc.biopet.utils.{ BiopetExecutable, MainCommand }
object BiopetExecutableMain extends BiopetExecutable {
......@@ -36,7 +37,8 @@ object BiopetExecutableMain extends BiopetExecutable {
nl.lumc.sasc.biopet.pipelines.gwastest.GwasTest,
nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling,
nl.lumc.sasc.biopet.pipelines.basty.Basty,
nl.lumc.sasc.biopet.pipelines.shiva.Shiva
nl.lumc.sasc.biopet.pipelines.shiva.Shiva,
GenerateIndexes
)
def tools: List[MainCommand] = BiopetToolsExecutable.tools
......
......@@ -45,6 +45,18 @@
<artifactId>BiopetExtensions</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.10</artifactId>
<version>2.2.1</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
* Created by pjvan_thof on 13-5-16.
*/
class FastaMerging(val root: Configurable) extends BiopetCommandLineFunction {
@Input
var input: List[File] = Nil
@Output(required = true)
var output: File = _
var cmds: Array[BiopetCommandLineFunction] = Array()
def cmdLine = cmds.map(_.commandLine).mkString(" && ")
}
......@@ -13,14 +13,13 @@
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines
package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.io.PrintWriter
import java.io.{ File, PrintWriter }
import java.util
import nl.lumc.sasc.biopet.core.extensions.Md5sum
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.extensions.bowtie.{ Bowtie2Build, BowtieBuild }
import nl.lumc.sasc.biopet.extensions.bwa.BwaIndex
......@@ -29,26 +28,26 @@ import nl.lumc.sasc.biopet.extensions.gmap.GmapBuild
import nl.lumc.sasc.biopet.extensions.picard.CreateSequenceDictionary
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFaidx
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript
import scala.language.reflectiveCalls
import scala.collection.JavaConversions._
import scala.language.reflectiveCalls
class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Argument
var referenceConfigFile: File = _
var referenceConfig: Map[String, Any] = Map()
@Argument(required = true)
var referenceConfigFiles: List[File] = Nil
var configDeps: List[File] = Nil
var referenceConfig: Map[String, Any] = null
def outputConfigFile = new File(outputDir, "reference.json")
protected var configDeps: List[File] = Nil
/** This is executed before the script starts */
def init(): Unit = {
referenceConfig = ConfigUtils.fileToConfigMap(referenceConfigFile)
if (referenceConfig == null)
referenceConfig = referenceConfigFiles.foldLeft(Map[String, Any]())((a, b) => ConfigUtils.mergeMaps(a, ConfigUtils.fileToConfigMap(b)))
}
/** Method where jobs must be added */
......@@ -58,11 +57,13 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
val speciesConfig = ConfigUtils.any2map(c)
val speciesDir = new File(outputDir, speciesName)
for ((genomeName, c) <- speciesConfig) yield genomeName -> {
var configDeps: List[File] = Nil
val genomeConfig = ConfigUtils.any2map(c)
val fastaUris = genomeConfig.getOrElse("fasta_uri",
throw new IllegalArgumentException(s"No fasta_uri found for $speciesName - $genomeName")) match {
case a: Array[_] => a.map(_.toString)
case a => Array(a.toString)
case a: Traversable[_] => a.map(_.toString).toArray
case a: util.ArrayList[_] => a.map(_.toString).toArray
case a => Array(a.toString)
}
val genomeDir = new File(speciesDir, genomeName)
......@@ -83,18 +84,10 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
curl.output
}
val fastaCat = new CommandLineFunction {
var cmds: Array[BiopetCommandLineFunction] = Array()
val fastaCat = new FastaMerging(this)
fastaCat.output = fastaFile
@Input
var input: List[File] = Nil
@Output
var output = fastaFile
def commandLine = cmds.mkString(" && ")
}
if (fastaUris.length > 1 || fastaFiles.filter(_.getName.endsWith(".gz")).nonEmpty) {
if (fastaUris.length > 1 || fastaFiles.exists(_.getName.endsWith(".gz"))) {
fastaFiles.foreach { file =>
if (file.getName.endsWith(".gz")) {
val zcat = new Zcat(this)
......@@ -159,14 +152,13 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
val regex = """.*\/(.*)_vep_(\d*)_(.*)\.tar\.gz""".r
vepCacheUri.toString match {
case regex(species, version, assembly) if (version.forall(_.isDigit)) => {
case regex(species, version, assembly) if version.forall(_.isDigit) =>
outputConfig ++= Map("varianteffectpredictor" -> Map(
"species" -> species,
"assembly" -> assembly,
"cache_version" -> version.toInt,
"cache" -> vepDir,
"fasta" -> createLinks(vepDir)))
}
case _ => throw new IllegalArgumentException("Cache found but no version was found")
}
}
......@@ -183,13 +175,14 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
add(curl)
cv.variant :+= curl.output
val tabix = new Tabix(this)
tabix.input = curl.output
tabix.p = Some("vcf")
tabix.isIntermediate = true
add(tabix)
configDeps :+= tabix.outputIndex
cv.deps ::= tabix.outputIndex
if (curl.output.getName.endsWith(".vcf.gz")) {
val tabix = new Tabix(this)
tabix.input = curl.output
tabix.p = Some("vcf")
tabix.isIntermediate = true
add(tabix)
configDeps :+= tabix.outputIndex
}
}
dbsnpUri match {
......@@ -200,6 +193,28 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
cv.out = new File(annotationDir, "dbsnp.vcf.gz")
add(cv)
outputConfig += "dbsnp" -> cv.out
}
val gtfFile: Option[File] = genomeConfig.get("gtf_uri").map { gtfUri =>
val outputFile = new File(annotationDir, new File(gtfUri.toString).getName.stripSuffix(".gz"))
val curl = new Curl(this)
curl.url = gtfUri.toString
if (gtfUri.toString.endsWith(".gz")) add(curl | Zcat(this) > outputFile)
else add(curl > outputFile)
outputConfig += "annotation_gtf" -> outputFile
outputFile
}
val refFlatFile: Option[File] = gtfFile.map { gtf =>
val refFlat = new File(gtf + ".refFlat")
val gtfToGenePred = new GtfToGenePred(this)
gtfToGenePred.inputGtfs :+= gtf
add(gtfToGenePred | Awk(this, """{ print $12"\t"$1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10 }""") > refFlat)
outputConfig += "annotation_refflat" -> refFlat
refFlat
}
// Bwa index
......@@ -220,11 +235,13 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
outputConfig += "gsnap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName)
outputConfig += "gmap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName)
// STAR index
val starDir = new File(genomeDir, "star")
val starIndex = new Star(this)
starIndex.outputDir = starDir
starIndex.reference = createLinks(starDir)
starIndex.runmode = "genomeGenerate"
starIndex.sjdbGTFfile = gtfFile
add(starIndex)
configDeps :+= starIndex.jobOutputFile
outputConfig += "star" -> Map(
......@@ -232,6 +249,7 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
"genomeDir" -> starDir.getAbsolutePath
)
// Bowtie index
val bowtieIndex = new BowtieBuild(this)
bowtieIndex.reference = createLinks(new File(genomeDir, "bowtie"))
bowtieIndex.baseName = "reference"
......@@ -239,6 +257,7 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
configDeps :+= bowtieIndex.jobOutputFile
outputConfig += "bowtie" -> Map("reference_fasta" -> bowtieIndex.reference.getAbsolutePath)
// Bowtie2 index
val bowtie2Index = new Bowtie2Build(this)
bowtie2Index.reference = createLinks(new File(genomeDir, "bowtie2"))
bowtie2Index.baseName = "reference"
......@@ -249,19 +268,22 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
"bowtie_index" -> bowtie2Index.reference.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta")
)
val writeConfig = new WriteConfig
writeConfig.deps = configDeps
writeConfig.out = new File(genomeDir, s"$speciesName-$genomeName.json")
writeConfig.config = Map("references" -> Map(speciesName -> Map(genomeName -> outputConfig)))
add(writeConfig)
this.configDeps :::= configDeps
outputConfig
}
}
add(new InProcessFunction {
@Input val deps: List[File] = configDeps
def run: Unit = {
val writer = new PrintWriter(outputConfigFile)
writer.println(ConfigUtils.mapToJson(Map("references" -> outputConfig)).spaces2)
writer.close()
}
})
val writeConfig = new WriteConfig
writeConfig.deps = configDeps
writeConfig.out = new File(outputDir, "references.json")
writeConfig.config = Map("references" -> outputConfig)
add(writeConfig)
}
}
......
package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.io.{ File, PrintWriter }
import nl.lumc.sasc.biopet.utils.ConfigUtils
import org.broadinstitute.gatk.queue.function.InProcessFunction
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
* Created by pjvanthof on 15/05/16.
*/
class WriteConfig extends InProcessFunction {
@Input
var deps: List[File] = Nil
@Output(required = true)
var out: File = _
var config: Map[String, Any] = _
def run: Unit = {
val writer = new PrintWriter(out)
writer.println(ConfigUtils.mapToJson(config).spaces2)
writer.close()
}
}
\ No newline at end of file
package nl.lumc.sasc.biopet.pipelines.generateindexes
import com.google.common.io.Files
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Config
import org.broadinstitute.gatk.queue.QSettings
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.Test
/**
* Created by pjvan_thof on 13-5-16.
*/
class GenerateIndexesTest extends TestNGSuite with Matchers {
def initPipeline(map: Map[String, Any]): GenerateIndexes = {
new GenerateIndexes() {
override def configNamespace = "generateindexes"
override def globalConfig = new Config(ConfigUtils.mergeMaps(map, GenerateIndexesTest.config))
qSettings = new QSettings
qSettings.runName = "test"
}
}
@Test
def testNoFastaUri: Unit = {
val pipeline = initPipeline(Map())
pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("" -> "")))
intercept[IllegalArgumentException] {
pipeline.script()
}
}
@Test
def testSingleFasta: Unit = {
val pipeline = initPipeline(Map())
pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri")))
pipeline.script()
}
@Test
def testMultiFasta: Unit = {
val pipeline = initPipeline(Map())
pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> List("uri", "uri2", "uri3.gz"))))
pipeline.script()
}
@Test
def testSingleDbsnp: Unit = {
val pipeline = initPipeline(Map())
pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "dbsnp_vcf_uri" -> "uri.vcf.gz")))
pipeline.script()
}
@Test
def testMultiDbsnp: Unit = {
val pipeline = initPipeline(Map())
pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "dbsnp_vcf_uri" -> List("uri.vcf.gz", "uri2.vcf.gz"))))
pipeline.script()
}
@Test
def testVep: Unit = {
val pipeline = initPipeline(Map())
pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "vep_cache_uri" -> "something/human_vep_80_hg19.tar.gz")))
pipeline.script()
}
@Test
def testGtfZipped: Unit = {
val pipeline = initPipeline(Map())
pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "gtf_uri" -> "bla.gf.gz")))
pipeline.script()
}
@Test
def testGtf: Unit = {
val pipeline = initPipeline(Map())
pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "gtf_uri" -> "bla.gf")))
pipeline.script()
}
}
object GenerateIndexesTest {
val outputDir = Files.createTempDir()
outputDir.deleteOnExit()
val config = Map("output_dir" -> outputDir,
"bwa" -> Map("exe" -> "test"),
"star" -> Map("exe" -> "test"),
"bowtiebuild" -> Map("exe" -> "test"),
"bowtie2build" -> Map("exe" -> "test"),
"gmapbuild" -> Map("exe" -> "test"),
"samtools" -> Map("exe" -> "test"),
"md5sum" -> Map("exe" -> "test"),
"gatk_jar" -> "test",
"tabix" -> Map("exe" -> "test")
)
}
package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.io.File
import nl.lumc.sasc.biopet.utils.ConfigUtils
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.Test
/**
* Created by pjvanthof on 17/05/16.
*/
class WriteConfigTest extends TestNGSuite with Matchers {
@Test
def testWriteConfig: Unit = {
val writeConfig = new WriteConfig
writeConfig.config = Map("test" -> "bla")
writeConfig.out = File.createTempFile("config.", ".json")
writeConfig.out.deleteOnExit()
writeConfig.run
ConfigUtils.fileToConfigMap(writeConfig.out) shouldBe Map("test" -> "bla")
}
}
......@@ -171,6 +171,7 @@ class ShivaWithAnnotationTest extends ShivaTestTrait {
object ShivaTest {
val outputDir = Files.createTempDir()
outputDir.deleteOnExit()
new File(outputDir, "input").mkdirs()
def inputTouch(name: String): String = {
val file = new File(outputDir, "input" + File.separator + name)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment