diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/GffRead.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/GffRead.scala new file mode 100644 index 0000000000000000000000000000000000000000..3213b90026cbfe5289fbbaf0620e8a8a499f054a --- /dev/null +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/GffRead.scala @@ -0,0 +1,28 @@ +package nl.lumc.sasc.biopet.extensions + +import java.io.File + +import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Version } +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +/** + * Created by pjvanthof on 20/06/16. + */ +class GffRead(val root: Configurable) extends BiopetCommandLineFunction { + + executable = config("exe", default = "gffread", freeVar = false) + + @Input + var input: File = _ + + @Output + var output: File = _ + + var T: Boolean = config("T", default = false, freeVar = false) + + def cmdLine = executable + + (if (inputAsStdin) "" else required(input)) + + (if (outputAsStsout) "" else required("-o", output)) + + conditional(T, "-T") +} diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Sed.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Sed.scala new file mode 100644 index 0000000000000000000000000000000000000000..dbc06508a55d3cf2a6cb9a9e1c8dd2286ca9f4f8 --- /dev/null +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Sed.scala @@ -0,0 +1,36 @@ +package nl.lumc.sasc.biopet.extensions + +import java.io.File + +import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Version } +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +import scala.util.matching.Regex + +/** + * Created by pjvanthof on 18/05/16. 
+ */ +class Sed(val root: Configurable) extends BiopetCommandLineFunction with Version { + executable = config("exe", default = "sed", freeVar = false) + + /** Command to get version of executable */ + override def versionCommand: String = executable + " --version" + + /** Regex to get version from version command output; parens/dots escaped so it matches the literal "sed (GNU sed) x.y.z" banner and captures the version number */ + override def versionRegex: Regex = """sed \(GNU sed\) (\d+\.\d+\.\d+)""".r + + @Input(required = false) + var inputFile: File = _ + + @Output + var outputFile: File = _ + + var expressions: List[String] = Nil + + def cmdLine = executable + + repeat("-e", expressions) + + (if (inputAsStdin) "" else required(inputFile)) + + (if (outputAsStsout) "" else " > " + required(outputFile)) + +} diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/hisat/Hisat2Build.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/hisat/Hisat2Build.scala index 3e3c92872ce12c0fc1a16c095cb9d9117fccd869..ed22aa3b8bbbcba890d4b410b30ad278cd65b664 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/hisat/Hisat2Build.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/hisat/Hisat2Build.scala @@ -43,8 +43,7 @@ class Hisat2Build(val root: Configurable) extends BiopetCommandLineFunction with var exon: Option[File] = config("exon") executable = config("exe", default = "hisat2-build", freeVar = false) - def versionRegex = """.*hisat2-align-s version (.*)""".r - override def versionExitcode = List(0, 1) + def versionRegex = """.*hisat.*version (.*)""".r def versionCommand = executable + " --version" var bmax: Option[Int] = config("bmax") diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/NormalizeFasta.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/NormalizeFasta.scala new file mode 100644 index 0000000000000000000000000000000000000000..a8657736f5a30c088213b302d3fe7d2f639fcc70 --- /dev/null +++ 
b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/NormalizeFasta.scala @@ -0,0 +1,46 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.picard + +import java.io.File + +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +/** + * Created by sajvanderzeeuw on 6-10-15. 
+ */ +class NormalizeFasta(val root: Configurable) extends Picard { + + javaMainClass = new picard.reference.NormalizeFasta().getClass.getName + + @Input(doc = "The input fasta file", required = true) + var input: File = _ + + @Output(doc = "The output fasta file", required = true) + var output: File = _ + + val lineLength: Int = config("line_length") + + val truncateSequenceNameAtWhitespace: Boolean = config("truncate_sequence_name_at_whitespace", default = false) + + override def cmdLine = super.cmdLine + + (if (inputAsStdin) required("INPUT=", new File("/dev/stdin"), spaceSeparated = false) + else required("INPUT=", input, spaceSeparated = false)) + + (if (outputAsStsout) required("OUTPUT=", new File("/dev/stdout"), spaceSeparated = false) + else required("OUTPUT=", output, spaceSeparated = false)) + + required("LINE_LENGTH=", lineLength, spaceSeparated = false) + + conditional(truncateSequenceNameAtWhitespace, "TRUNCATE_SEQUENCE_NAMES_AT_WHITESPACE=TRUE") +} diff --git a/biopet-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutableMain.scala b/biopet-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutableMain.scala index 25985115855087bd810116ea99ed1ce96cda02e2..a57e71a0503eecda1cc18c3311bfa3a0bcf786d1 100644 --- a/biopet-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutableMain.scala +++ b/biopet-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutableMain.scala @@ -14,7 +14,6 @@ */ package nl.lumc.sasc.biopet -import nl.lumc.sasc.biopet.pipelines.generateindexes.GenerateIndexes import nl.lumc.sasc.biopet.utils.{ BiopetExecutable, MainCommand } object BiopetExecutableMain extends BiopetExecutable { @@ -38,7 +37,8 @@ object BiopetExecutableMain extends BiopetExecutable { nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling, nl.lumc.sasc.biopet.pipelines.basty.Basty, nl.lumc.sasc.biopet.pipelines.shiva.Shiva, - GenerateIndexes + nl.lumc.sasc.biopet.pipelines.generateindexes.DownloadGenomes, + 
nl.lumc.sasc.biopet.pipelines.generateindexes.GenerateIndexes ) def tools: List[MainCommand] = BiopetToolsExecutable.tools diff --git a/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/DownloadNcbiAssembly.scala b/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/DownloadNcbiAssembly.scala new file mode 100644 index 0000000000000000000000000000000000000000..3e0576309a90a6807d454e7cadb4dce208489243 --- /dev/null +++ b/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/DownloadNcbiAssembly.scala @@ -0,0 +1,51 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. 
+ */ +package nl.lumc.sasc.biopet.extensions.tools + +import java.io.File + +import nl.lumc.sasc.biopet.core.ToolCommandFunction +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +/** + * Queue extension wrapper for the DownloadNcbiAssembly tool: downloads an NCBI genome assembly as a fasta file. + */ +class DownloadNcbiAssembly(val root: Configurable) extends ToolCommandFunction { + def toolObject = nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly + + @Output(doc = "Output fasta file", required = true) + var output: File = _ + + var outputReport: File = _ + + var assemblyId: String = null + + var nameHeader: Option[String] = None + + var mustHaveOne: List[String] = Nil + var mustNotHave: List[String] = Nil + + override def defaultCoreMemory = 4.0 + + override def cmdLine = super.cmdLine + + required("-a", assemblyId) + + required("--report", outputReport) + + required("-o", output) + + optional("--nameHeader", nameHeader) + + repeat("--mustHaveOne", mustHaveOne) + + repeat("--mustNotHave", mustNotHave) +} + diff --git a/biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala b/biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala index 45cb24741233150ad690787903a02f88985684f6..afc8453c4f27b2e97ce28dfa5bb643e133d7b048 100644 --- a/biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala +++ b/biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala @@ -49,5 +49,6 @@ object BiopetToolsExecutable extends BiopetExecutable { nl.lumc.sasc.biopet.tools.VcfToTsv, nl.lumc.sasc.biopet.tools.VcfWithVcf, nl.lumc.sasc.biopet.tools.VepNormalizer, - nl.lumc.sasc.biopet.tools.WipeReads) + nl.lumc.sasc.biopet.tools.WipeReads, + nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly) } diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/DownloadNcbiAssembly.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/DownloadNcbiAssembly.scala new 
file mode 100644 index 0000000000000000000000000000000000000000..7d3f40eb10bb0ddcfd01c0678493dbd5a13cca2f --- /dev/null +++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/DownloadNcbiAssembly.scala @@ -0,0 +1,97 @@ +package nl.lumc.sasc.biopet.tools + +import java.io.{ File, PrintWriter } + +import nl.lumc.sasc.biopet.utils.ToolCommand + +import scala.io.Source + +/** + * Created by pjvan_thof on 21-9-16. + */ +object DownloadNcbiAssembly extends ToolCommand { + + case class Args(assemblyId: String = null, + outputFile: File = null, + reportFile: Option[File] = None, + contigNameHeader: Option[String] = None, + mustHaveOne: List[(String, String)] = List(), + mustNotHave: List[(String, String)] = List()) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[String]('a', "assemblyId") required () unbounded () valueName "<file>" action { (x, c) => + c.copy(assemblyId = x) + } + opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) => + c.copy(outputFile = x) + } + opt[File]("report") unbounded () valueName "<file>" action { (x, c) => + c.copy(reportFile = Some(x)) + } + opt[String]("nameHeader") unbounded () valueName "<string>" action { (x, c) => + c.copy(contigNameHeader = Some(x)) + } + opt[(String, String)]("mustHaveOne") unbounded () valueName "<string>" action { (x, c) => + c.copy(mustHaveOne = (x._1, x._2) :: c.mustHaveOne) + } + opt[(String, String)]("mustNotHave") unbounded () valueName "<string>" action { (x, c) => + c.copy(mustNotHave = (x._1, x._2) :: c.mustNotHave) + } + } + + /** + * @param args the command line arguments + */ + def main(args: Array[String]): Unit = { + val argsParser = new OptParser + val cmdargs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException) + + logger.info(s"Reading ${cmdargs.assemblyId} from NCBI") + val reader = Source.fromURL(s"ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/${cmdargs.assemblyId}.assembly.txt") + val 
assamblyReport = reader.getLines().toList + reader.close() + cmdargs.reportFile.foreach { file => + val writer = new PrintWriter(file) + assamblyReport.foreach(writer.println) + writer.close() + } + + val headers = assamblyReport.filter(_.startsWith("#")).last.stripPrefix("# ").split("\t").zipWithIndex.toMap + val nameId = cmdargs.contigNameHeader.map(x => headers(x)) + val lengthId = headers.get("Sequence-Length") + + val baseUrlEutils = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" + + val fastaWriter = new PrintWriter(cmdargs.outputFile) + + val allContigs = assamblyReport.filter(!_.startsWith("#")) + .map(_.split("\t")) + val totalLength = lengthId.map(id => allContigs.map(_.apply(id).toLong).sum) + + logger.info(s"${allContigs.size} contigs found") + totalLength.foreach(l => logger.info(s"Total length: ${l}")) + + val filterContigs = allContigs + .filter(values => cmdargs.mustNotHave.forall(x => values(headers(x._1)) != x._2)) + .filter(values => cmdargs.mustHaveOne.exists(x => values(headers(x._1)) == x._2) || cmdargs.mustHaveOne.isEmpty) + val filterLength = lengthId.map(id => filterContigs.map(_.apply(id).toLong).sum) + + logger.info(s"${filterContigs.size} contigs left after filtering") + filterLength.foreach(l => logger.info(s"Filtered length: ${l}")) + + filterContigs.foreach { values => + val id = if (values(6) == "na") values(4) else values(6) + logger.info(s"Start download ${id}") + val fastaReader = Source.fromURL(s"${baseUrlEutils}/efetch.fcgi?db=nuccore&id=${id}&retmode=text&rettype=fasta") + fastaReader.getLines() + .map(x => nameId.map(y => x.replace(">", s">${values(y)} ")).getOrElse(x)) + .foreach(fastaWriter.println) + fastaReader.close() + } + + logger.info("Downloading complete") + + fastaWriter.close() + + } +} diff --git a/biopet-tools/src/test/resources/GCF_000844745.1.report b/biopet-tools/src/test/resources/GCF_000844745.1.report new file mode 100644 index 
0000000000000000000000000000000000000000..9f5e951151eab2ad3b68d2cb7508ca571ca35153 --- /dev/null +++ b/biopet-tools/src/test/resources/GCF_000844745.1.report @@ -0,0 +1,23 @@ +# Assembly name: ViralProj14444 +# Organism name: Ageratum yellow vein betasatellite (viruses) +# Taxid: 185750 +# BioProject: PRJNA14444 +# Submitter: NCBI +# Date: 2000-6-14 +# Assembly type: n/a +# Release type: major +# Assembly level: Complete Genome +# Genome representation: full +# RefSeq assembly accession: GCF_000844745.1 +# +## Assembly-Units: +## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name +## GCF_000844735.1 Primary assembly +# +# Ordered by chromosome/plasmid; the chromosomes/plasmids are followed by +# unlocalized scaffolds. +# Unplaced scaffolds are listed at the end. +# RefSeq is equal or derived from GenBank object. +# +# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name +Unknown assembled-molecule Unknown Segment na <> NC_003403.1 Primary assembly 1347 na \ No newline at end of file diff --git a/biopet-tools/src/test/resources/NC_003403.1.fasta b/biopet-tools/src/test/resources/NC_003403.1.fasta new file mode 100644 index 0000000000000000000000000000000000000000..cf297759532bc910fd2df5d9f6e4acf2a5a1bd7e --- /dev/null +++ b/biopet-tools/src/test/resources/NC_003403.1.fasta @@ -0,0 +1,22 @@ +>gi|18702122|ref|NC_003403.1| Ageratum yellow vein virus-associated DNA beta, complete genome +ACCGGTGGCGAGCGGGGAATTTGGGTTCCAGGTGGGACCCACACCTGGAACAGCCTGGAAAGAACGTGAG +TGGGCCAAATAAGTGGGCCGTTGAATTGGGCTTTTTATTCAAAACTATCTCAAATTTACGAAGCCCATTA +TTATTCATTTGAGAAAAATTACATTAAACATAGTCCGTATAAATATACATTTATACGGTTACATTCTTGT +ATACATGGTATTCGTCGTTTACTTGGATATCTATTACTGGAGCTTCCTGTTGCATCATGATATCTATTAT +TTCAACCATGTCTTCTGATTTGAACTCTTCTATTGTTGAATCTCTATACATGATCCTTAGTATGTTCTTG +ATACCTTCTTCTAAAGCATTAAAATTGAATGGTGGTATGATACCGTGATGTCCGTATGGAATGGAGAATT 
+TGTTCTTGACTAGTGCTGGTGACCTTGTTGAAAACACTTCTACGTCAACTCTGAATTTTATTTCTGATCT +GAACTTGACGTTGATGATGAACAATATTCCTTTATCGTTGGTATATGATATAGTCATGATTGTTTGATGG +AAATATATTCATTTGATGTGCATTTATAATTGGATGTGTCTGTTGTATGTGGTTGTGATTAATGATCGTG +TCAAATTTATGTTCTTGTATTAATTAGTGTATGTGTGACTGTATTGATGTTGATGTTTGTGATTATAATT +GAGAAAAGAAGATATTCATTCATTTGAATAAGAAAAGAAAGAAAAAAAAAAGAAAACTATATACATAAAA +AAAACGAATAATAAAAATAACTATCTATTCAGAAAAAATGGGAGCGCAGCGAAAACGAAAACAAAAATCG +ATATCAAAGAAAATCCTGATAAATTAAATAGACAGAATTGAAAAGGAAAAAAGTAAAAAAAGATAAAAGT +TATTACATTAAAAAAACTAAAGAAAAAAAAAATAAAAAAACTAAAAATGATAAATAAAAAAATGACGTCA +TCTGATGTCATCACAGTTAAAAACTATTTACTGCGGTTAAAATTTAATATAATTAACCACTTGGGTAAAT +ATATTGTCTCCAATAGGTAAATTGTCTCCAATATATCGGAGACATATTGGAGACTCTGGAGACACTTATA +TGCTAATTTCCCATTTTACCCTTACTAATGTGACTGTAAGGCGCGTGGGAGTGCTCTGAAAAAGTAGACC +TTCTCTCTCCAAAACTCCCCGGAGAAGACGATACAGGCTGATCCCGGCATCAATTCGCGACACGCGCGGC +GGTGTGTACCCCTGGGAGGGTAGAAACCACTACGCTACGCAGCAGCCTTAGCTACGCCGGAGCTTAGCTC +GCCACCGTAATAATATT + diff --git a/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/DownloadNcbiAssemblyTest.scala b/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/DownloadNcbiAssemblyTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..f495d7f2f4d5bf39857abea8f916953f71a87eaf --- /dev/null +++ b/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/DownloadNcbiAssemblyTest.scala @@ -0,0 +1,33 @@ +package nl.lumc.sasc.biopet.tools + +import java.io.File +import java.nio.file.Paths + +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +import scala.io.Source + +/** + * Created by pjvanthof on 03/10/16. 
+ */ +class DownloadNcbiAssemblyTest extends TestNGSuite with Matchers { + private def resourcePath(p: String): String = { + Paths.get(getClass.getResource(p).toURI).toString + } + + @Test + def testNC_003403_1: Unit = { + val output = File.createTempFile("test.", ".fasta") + val outputReport = File.createTempFile("test.", ".report") + output.deleteOnExit() + outputReport.deleteOnExit() + DownloadNcbiAssembly.main(Array("-a", "GCF_000844745.1", + "-o", output.getAbsolutePath, + "--report", outputReport.getAbsolutePath)) + + Source.fromFile(output).getLines().toList shouldBe Source.fromFile(new File(resourcePath("/NC_003403.1.fasta"))).getLines().toList + Source.fromFile(outputReport).getLines().toList shouldBe Source.fromFile(new File(resourcePath("/GCF_000844745.1.report"))).getLines().toList + } +} diff --git a/generate-indexes/pom.xml b/generate-indexes/pom.xml index 47be110c6f01a977ba080d32ca4e468d11a3065e..6e37240f24301e63a33d46957357397e4bc36ea6 100644 --- a/generate-indexes/pom.xml +++ b/generate-indexes/pom.xml @@ -41,7 +41,7 @@ </dependency> <dependency> <groupId>nl.lumc.sasc</groupId> - <artifactId>BiopetExtensions</artifactId> + <artifactId>BiopetToolsExtensions</artifactId> <version>${project.version}</version> </dependency> <dependency> diff --git a/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/DownloadGenomes.scala b/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/DownloadGenomes.scala new file mode 100644 index 0000000000000000000000000000000000000000..5476cdba89de2b80013852c58173d0d423ea7502 --- /dev/null +++ b/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/DownloadGenomes.scala @@ -0,0 +1,265 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. 
But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.pipelines.generateindexes + +import java.io.File +import java.util + +import nl.lumc.sasc.biopet.core.extensions.Md5sum +import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand } +import nl.lumc.sasc.biopet.extensions._ +import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants +import nl.lumc.sasc.biopet.extensions.picard.NormalizeFasta +import nl.lumc.sasc.biopet.extensions.tools.DownloadNcbiAssembly +import nl.lumc.sasc.biopet.utils.ConfigUtils +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.queue.QScript + +import scala.collection.JavaConversions._ +import scala.language.reflectiveCalls + +class DownloadGenomes(val root: Configurable) extends QScript with BiopetQScript { + def this() = this(null) + + @Argument(required = true) + var referenceConfigFiles: List[File] = Nil + + var referenceConfig: Map[String, Any] = _ + + override def fixedValues = Map("gffread" -> Map("T" -> true)) + + override def defaults = Map("normalizefasta" -> Map("line_length" -> 60)) + + val downloadAnnotations: Boolean = config("download_annotations", default = false) + + /** This is executed before the script starts */ + def init(): Unit = { + if (referenceConfig == null) + referenceConfig = referenceConfigFiles.foldLeft(Map[String, Any]())((a, b) => ConfigUtils.mergeMaps(a, ConfigUtils.fileToConfigMap(b))) + } + + /** Method where jobs must be added */ + def biopetScript(): Unit = 
{ + + for ((speciesName, c) <- referenceConfig) yield speciesName -> { + val speciesConfig = ConfigUtils.any2map(c) + val speciesDir = new File(outputDir, speciesName) + for ((genomeName, c) <- speciesConfig) yield genomeName -> { + var configDeps: List[File] = Nil + val genomeConfig = ConfigUtils.any2map(c) + + val genomeDir = new File(speciesDir, genomeName) + val fastaFile = new File(genomeDir, "reference.fa") + val downloadFastaFile = new File(genomeDir, "download.reference.fa") + + genomeConfig.get("ncbi_assembly_id") match { + case Some(assemblyID: String) => + val downloadAssembly = new DownloadNcbiAssembly(this) + downloadAssembly.assemblyId = assemblyID + downloadAssembly.output = downloadFastaFile + downloadAssembly.outputReport = new File(genomeDir, s"$speciesName-$genomeName.assembly.report") + downloadAssembly.nameHeader = genomeConfig.get("ncbi_assembly_header_name").map(_.toString) + downloadAssembly.mustHaveOne = genomeConfig.get("ncbi_assembly_must_have_one") + .map(_.asInstanceOf[util.ArrayList[util.LinkedHashMap[String, String]]]) + .getOrElse(new util.ArrayList()).flatMap(x => x.map(y => y._1 + "=" + y._2)) + .toList + downloadAssembly.mustNotHave = genomeConfig.get("ncbi_assembly_must_not_have") + .map(_.asInstanceOf[util.ArrayList[util.LinkedHashMap[String, String]]]) + .getOrElse(new util.ArrayList()).flatMap(x => x.map(y => y._1 + "=" + y._2)) + .toList + downloadAssembly.isIntermediate = true + add(downloadAssembly) + case _ => + val fastaUris = genomeConfig.getOrElse("fasta_uri", + throw new IllegalArgumentException(s"No fasta_uri found for $speciesName - $genomeName")) match { + case a: Traversable[_] => a.map(_.toString).toArray + case a: util.ArrayList[_] => a.map(_.toString).toArray + case a => Array(a.toString) + } + + val fastaFiles = for (fastaUri <- fastaUris) yield { + val curl = new Curl(this) + curl.url = fastaUri + curl.output = if (fastaUris.length > 1 || fastaUri.endsWith(".gz")) { + curl.isIntermediate = true + new 
File(genomeDir, new File(fastaUri).getName) + } else fastaFile + + add(curl) + add(Md5sum(this, curl.output, genomeDir)) + curl.output + } + + val fastaCat = new FastaMerging(this) + fastaCat.output = downloadFastaFile + fastaCat.isIntermediate = true + if (fastaUris.length > 1 || fastaFiles.exists(_.getName.endsWith(".gz"))) { + fastaFiles.foreach { file => + if (file.getName.endsWith(".gz")) { + val zcat = new Zcat(this) + zcat.appending = true + zcat.input :+= file + zcat.output = downloadFastaFile + fastaCat.cmds :+= zcat + fastaCat.input :+= file + } else { + val cat = new Cat(this) + cat.appending = true + cat.input :+= file + cat.output = downloadFastaFile + fastaCat.cmds :+= cat + fastaCat.input :+= file + } + } + add(fastaCat) + configDeps :+= fastaCat.output + } + } + + val normalizeFasta = new NormalizeFasta(this) + normalizeFasta.input = downloadFastaFile + normalizeFasta.output = fastaFile + add(normalizeFasta) + + val generateIndexes = new GenerateIndexes(this) + generateIndexes.fastaFile = fastaFile + generateIndexes.speciesName = speciesName + generateIndexes.genomeName = genomeName + generateIndexes.outputDir = genomeDir + //generateIndexes.fastaUris = fastaUris + //TODO: add gtf file + add(generateIndexes) + + if (downloadAnnotations) { + val annotationDir = new File(genomeDir, "annotation") + + def getAnnotation(tag: String): Map[String, Map[String, Any]] = (genomeConfig.get(tag) match { + case Some(s: Map[_, _]) => s.map(x => x._2 match { + case o: Map[_, _] => x._1.toString -> o.map(x => (x._1.toString, x._2)) + case _ => throw new IllegalStateException(s"values in the tag $tag should be json objects") + }) + case None => Map() + case x => throw new IllegalStateException(s"tag $tag should be an object with objects, now $x") + }) + + // Download vep caches + getAnnotation("vep").foreach { + case (version, vep) => + val vepDir = new File(annotationDir, "vep" + File.separator + version) + val curl = new Curl(this) + curl.url = 
vep("cache_uri").toString + curl.output = new File(vepDir, new File(curl.url).getName) + curl.isIntermediate = true + add(curl) + + val tar = new TarExtract(this) + tar.inputTar = curl.output + tar.outputDir = vepDir + add(tar) + } + + getAnnotation("dbsnp").foreach { + case (version, dbsnp) => + val dbpsnpDir = new File(annotationDir, "dbsnp") + val contigMap = dbsnp.get("dbsnp_contig_map").map(_.asInstanceOf[Map[String, Any]]) + val contigSed = contigMap.map { map => + val sed = new Sed(this) + sed.expressions = map.map(x => s"""s/^${x._1}\t/${x._2}\t/""").toList + sed + } + + val cv = new CombineVariants(this) + cv.reference_sequence = fastaFile + def addDownload(uri: String): Unit = { + val isZipped = uri.endsWith(".gz") + val output = new File(dbpsnpDir, version + "." + new File(uri).getName + (if (isZipped) "" else ".gz")) + val curl = new Curl(this) + curl.url = uri + + val downloadCmd = (isZipped, contigSed) match { + case (true, Some(sed)) => curl | Zcat(this) | sed | new Bgzip(this) > output + case (false, Some(sed)) => curl | sed | new Bgzip(this) > output + case (true, None) => curl > output + case (false, None) => curl | new Bgzip(this) > output + } + downloadCmd.isIntermediate = true + add(downloadCmd) + + val tabix = new Tabix(this) + tabix.input = output + tabix.p = Some("vcf") + tabix.isIntermediate = true + add(tabix) + + cv.variant :+= output + } + + dbsnp.get("vcf_uri") match { + case Some(l: Traversable[_]) => l.foreach(x => addDownload(x.toString)) + case Some(l: util.ArrayList[_]) => l.foreach(x => addDownload(x.toString)) + case Some(s) => addDownload(s.toString) + case None => throw new IllegalStateException("Dbsnp should always have a 'vcf_uri' key") + } + + cv.out = new File(dbpsnpDir, s"dbsnp.$version.vcf.gz") + add(cv) + } + + getAnnotation("gene_annotation").foreach { + case (version, geneAnnotation) => + val dir = new File(annotationDir, version) + val gffFile: Option[File] = geneAnnotation.get("gff_uri").map { gtfUri => + val 
outputFile = new File(dir, new File(gtfUri.toString).getName.stripSuffix(".gz")) + val curl = new Curl(this) + curl.url = gtfUri.toString + if (gtfUri.toString.endsWith(".gz")) add(curl | Zcat(this) > outputFile) + else add(curl > outputFile) + outputFile + } + + val gtfFile: Option[File] = geneAnnotation.get("gtf_uri") match { + case Some(gtfUri) => + val outputFile = new File(dir, new File(gtfUri.toString).getName.stripSuffix(".gz")) + val curl = new Curl(this) + curl.url = gtfUri.toString + if (gtfUri.toString.endsWith(".gz")) add(curl | Zcat(this) > outputFile) + else add(curl > outputFile) + Some(outputFile) + case _ => gffFile.map { gff => + val gffRead = new GffRead(this) + gffRead.input = gff + gffRead.output = swapExt(dir, gff, ".gff", ".gtf") + add(gffRead) + gffRead.output + } + } + + val refFlatFile: Option[File] = gtfFile.map { gtf => + val refFlat = new File(gtf + ".refFlat") + val gtfToGenePred = new GtfToGenePred(this) + gtfToGenePred.inputGtfs :+= gtf + + add(gtfToGenePred | Awk(this, """{ print $12"\t"$1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10 }""") > refFlat) + + refFlat + } + } + } + } + } + } +} + +object DownloadGenomes extends PipelineCommand diff --git a/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexes.scala b/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexes.scala index 234c0961729a8ee8884b1113fe05cc7081dc987b..2b25fa7277599f41e38b27273a062df9e3e042df 100644 --- a/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexes.scala +++ b/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexes.scala @@ -1,301 +1,146 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. 
But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. - */ package nl.lumc.sasc.biopet.pipelines.generateindexes -import java.io.{ File, PrintWriter } -import java.util - -import nl.lumc.sasc.biopet.core.extensions.Md5sum import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand } -import nl.lumc.sasc.biopet.extensions._ import nl.lumc.sasc.biopet.extensions.bowtie.{ Bowtie2Build, BowtieBuild } +import nl.lumc.sasc.biopet.extensions.{ Ln, Star } import nl.lumc.sasc.biopet.extensions.bwa.BwaIndex -import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants import nl.lumc.sasc.biopet.extensions.gmap.GmapBuild import nl.lumc.sasc.biopet.extensions.hisat.Hisat2Build import nl.lumc.sasc.biopet.extensions.picard.CreateSequenceDictionary import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFaidx -import nl.lumc.sasc.biopet.utils.ConfigUtils import nl.lumc.sasc.biopet.utils.config.Configurable import org.broadinstitute.gatk.queue.QScript -import scala.collection.JavaConversions._ -import scala.language.reflectiveCalls +import scala.collection.mutable.ListBuffer +/** + * Created by pjvan_thof on 21-9-16. 
+ */ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript { def this() = this(null) + @Input(doc = "Input fasta file", shortName = "R") + var fastaFile: File = _ + + @Argument(required = true) + var speciesName: String = _ + @Argument(required = true) - var referenceConfigFiles: List[File] = Nil + var genomeName: String = _ - var referenceConfig: Map[String, Any] = null + @Input(required = false) + var gtfFile: Option[File] = None - protected var configDeps: List[File] = Nil + var fastaUris: Array[String] = Array() - /** This is executed before the script starts */ + /** Init for pipeline */ def init(): Unit = { - if (referenceConfig == null) - referenceConfig = referenceConfigFiles.foldLeft(Map[String, Any]())((a, b) => ConfigUtils.mergeMaps(a, ConfigUtils.fileToConfigMap(b))) + if (outputDir == null) outputDir = fastaFile.getParentFile } - /** Method where jobs must be added */ + /** Pipeline itself */ def biopetScript(): Unit = { - val outputConfig = for ((speciesName, c) <- referenceConfig) yield speciesName -> { - val speciesConfig = ConfigUtils.any2map(c) - val speciesDir = new File(outputDir, speciesName) - for ((genomeName, c) <- speciesConfig) yield genomeName -> { - var configDeps: List[File] = Nil - val genomeConfig = ConfigUtils.any2map(c) - val fastaUris = genomeConfig.getOrElse("fasta_uri", - throw new IllegalArgumentException(s"No fasta_uri found for $speciesName - $genomeName")) match { - case a: Traversable[_] => a.map(_.toString).toArray - case a: util.ArrayList[_] => a.map(_.toString).toArray - case a => Array(a.toString) - } - - val genomeDir = new File(speciesDir, genomeName) - val fastaFile = new File(genomeDir, "reference.fa") - var outputConfig: Map[String, Any] = Map("reference_fasta" -> fastaFile) - - val fastaFiles = for (fastaUri <- fastaUris) yield { - val curl = new Curl(this) - curl.url = fastaUri - curl.output = if (fastaUris.length > 1 || fastaUri.endsWith(".gz")) { - curl.isIntermediate = true - new 
File(genomeDir, new File(fastaUri).getName) - } else fastaFile - - add(curl) - add(Md5sum(this, curl.output, genomeDir)) - configDeps :+= curl.output - curl.output - } - - val fastaCat = new FastaMerging(this) - fastaCat.output = fastaFile - - if (fastaUris.length > 1 || fastaFiles.exists(_.getName.endsWith(".gz"))) { - fastaFiles.foreach { file => - if (file.getName.endsWith(".gz")) { - val zcat = new Zcat(this) - zcat.appending = true - zcat.input :+= file - zcat.output = fastaFile - fastaCat.cmds :+= zcat - fastaCat.input :+= file - } else { - val cat = new Cat(this) - cat.appending = true - cat.input :+= file - cat.output = fastaFile - fastaCat.cmds :+= cat - fastaCat.input :+= file - } - } - add(fastaCat) - configDeps :+= fastaCat.output - } - - val faidx = SamtoolsFaidx(this, fastaFile) - add(faidx) - configDeps :+= faidx.output - - val createDict = new CreateSequenceDictionary(this) - createDict.reference = fastaFile - createDict.output = new File(genomeDir, fastaFile.getName.stripSuffix(".fa") + ".dict") - createDict.species = Some(speciesName) - createDict.genomeAssembly = Some(genomeName) - createDict.uri = Some(fastaUris.mkString(",")) - add(createDict) - configDeps :+= createDict.output - - def createLinks(dir: File): File = { - val newFastaFile = new File(dir, fastaFile.getName) - val newFai = new File(dir, faidx.output.getName) - val newDict = new File(dir, createDict.output.getName) - - add(Ln(this, faidx.output, newFai)) - add(Ln(this, createDict.output, newDict)) - val lnFasta = Ln(this, fastaFile, newFastaFile) - lnFasta.deps ++= List(newFai, newDict) - add(lnFasta) - newFastaFile - } - - val annotationDir = new File(genomeDir, "annotation") - - genomeConfig.get("vep_cache_uri").foreach { vepCacheUri => - val vepDir = new File(annotationDir, "vep") - val curl = new Curl(this) - curl.url = vepCacheUri.toString - curl.output = new File(vepDir, new File(curl.url).getName) - curl.isIntermediate = true - add(curl) - - val tar = new TarExtract(this) - 
tar.inputTar = curl.output - tar.outputDir = vepDir - add(tar) - - val regex = """.*\/(.*)_vep_(\d*)_(.*)\.tar\.gz""".r - vepCacheUri.toString match { - case regex(species, version, assembly) if version.forall(_.isDigit) => - outputConfig ++= Map("varianteffectpredictor" -> Map( - "species" -> species, - "assembly" -> assembly, - "cache_version" -> version.toInt, - "cache" -> vepDir, - "fasta" -> createLinks(vepDir))) - case _ => throw new IllegalArgumentException("Cache found but no version was found") - } - } - - genomeConfig.get("dbsnp_vcf_uri").foreach { dbsnpUri => - val cv = new CombineVariants(this) - cv.reference_sequence = fastaFile - cv.deps ::= createDict.output - def addDownload(uri: String): Unit = { - val curl = new Curl(this) - curl.url = uri - curl.output = new File(annotationDir, new File(curl.url).getName) - curl.isIntermediate = true - add(curl) - cv.variant :+= curl.output - - if (curl.output.getName.endsWith(".vcf.gz")) { - val tabix = new Tabix(this) - tabix.input = curl.output - tabix.p = Some("vcf") - tabix.isIntermediate = true - add(tabix) - configDeps :+= tabix.outputIndex - } - } - - dbsnpUri match { - case l: Traversable[_] => l.foreach(x => addDownload(x.toString)) - case l: util.ArrayList[_] => l.foreach(x => addDownload(x.toString)) - case _ => addDownload(dbsnpUri.toString) - } - - cv.out = new File(annotationDir, "dbsnp.vcf.gz") - add(cv) - outputConfig += "dbsnp" -> cv.out - } - - val gtfFile: Option[File] = genomeConfig.get("gtf_uri").map { gtfUri => - val outputFile = new File(annotationDir, new File(gtfUri.toString).getName.stripSuffix(".gz")) - val curl = new Curl(this) - curl.url = gtfUri.toString - if (gtfUri.toString.endsWith(".gz")) add(curl | Zcat(this) > outputFile) - else add(curl > outputFile) - outputConfig += "annotation_gtf" -> outputFile - outputFile - } - - val refFlatFile: Option[File] = gtfFile.map { gtf => - val refFlat = new File(gtf + ".refFlat") - val gtfToGenePred = new GtfToGenePred(this) - 
gtfToGenePred.inputGtfs :+= gtf - - add(gtfToGenePred | Awk(this, """{ print $12"\t"$1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10 }""") > refFlat) - - outputConfig += "annotation_refflat" -> refFlat - refFlat - } - - // Bwa index - val bwaIndex = new BwaIndex(this) - bwaIndex.reference = createLinks(new File(genomeDir, "bwa")) - add(bwaIndex) - configDeps :+= bwaIndex.jobOutputFile - outputConfig += "bwa" -> Map("reference_fasta" -> bwaIndex.reference.getAbsolutePath) - - // Gmap index - val gmapDir = new File(genomeDir, "gmap") - val gmapBuild = new GmapBuild(this) - gmapBuild.dir = gmapDir - gmapBuild.db = genomeName - gmapBuild.fastaFiles ::= createLinks(gmapDir) - add(gmapBuild) - configDeps :+= gmapBuild.jobOutputFile - outputConfig += "gsnap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName) - outputConfig += "gmap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName) - - // STAR index - val starDir = new File(genomeDir, "star") - val starIndex = new Star(this) - starIndex.outputDir = starDir - starIndex.reference = createLinks(starDir) - starIndex.runmode = "genomeGenerate" - starIndex.sjdbGTFfile = gtfFile - add(starIndex) - configDeps :+= starIndex.jobOutputFile - outputConfig += "star" -> Map( - "reference_fasta" -> starIndex.reference.getAbsolutePath, - "genomeDir" -> starDir.getAbsolutePath - ) - - // Bowtie index - val bowtieIndex = new BowtieBuild(this) - bowtieIndex.reference = createLinks(new File(genomeDir, "bowtie")) - bowtieIndex.baseName = "reference" - add(bowtieIndex) - configDeps :+= bowtieIndex.jobOutputFile - outputConfig += "bowtie" -> Map("reference_fasta" -> bowtieIndex.reference.getAbsolutePath) - - // Bowtie2 index - val bowtie2Index = new Bowtie2Build(this) - bowtie2Index.reference = createLinks(new File(genomeDir, "bowtie2")) - bowtie2Index.baseName = "reference" - add(bowtie2Index) - configDeps :+= bowtie2Index.jobOutputFile - outputConfig += "bowtie2" -> Map("reference_fasta" -> 
bowtie2Index.reference.getAbsolutePath) - outputConfig += "tophat" -> Map( - "bowtie_index" -> bowtie2Index.reference.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta") - ) - - // Hisat2 index - val hisat2Index = new Hisat2Build(this) - hisat2Index.inputFasta = createLinks(new File(genomeDir, "hisat2")) - hisat2Index.hisat2IndexBase = new File(new File(genomeDir, "hisat2"), "reference").getAbsolutePath - add(hisat2Index) - configDeps :+= hisat2Index.jobOutputFile - outputConfig += "hisat2" -> Map( - "reference_fasta" -> hisat2Index.inputFasta.getAbsolutePath, - "hisat_index" -> hisat2Index.inputFasta.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta") - ) - - val writeConfig = new WriteConfig - writeConfig.deps = configDeps - writeConfig.out = new File(genomeDir, s"$speciesName-$genomeName.json") - writeConfig.config = Map("references" -> Map(speciesName -> Map(genomeName -> outputConfig))) - add(writeConfig) - - this.configDeps :::= configDeps - outputConfig - } + var outputConfig: Map[String, Any] = Map("reference_fasta" -> fastaFile) + val configDeps = new ListBuffer[File]() + + val faidx = SamtoolsFaidx(this, fastaFile) + add(faidx) + configDeps += faidx.output + + val createDict = new CreateSequenceDictionary(this) + createDict.reference = fastaFile + createDict.output = new File(outputDir, fastaFile.getName.stripSuffix(".fa") + ".dict") + createDict.species = Some(speciesName) + createDict.genomeAssembly = Some(genomeName) + if (fastaUris.nonEmpty) createDict.uri = Some(fastaUris.mkString(",")) + add(createDict) + configDeps += createDict.output + + def createLinks(dir: File): File = { + val newFastaFile = new File(dir, fastaFile.getName) + val newFai = new File(dir, faidx.output.getName) + val newDict = new File(dir, createDict.output.getName) + + add(Ln(this, faidx.output, newFai)) + add(Ln(this, createDict.output, newDict)) + val lnFasta = Ln(this, fastaFile, newFastaFile) + lnFasta.deps ++= List(newFai, newDict) + add(lnFasta) + newFastaFile } 
+ // Bwa index + val bwaIndex = new BwaIndex(this) + bwaIndex.reference = createLinks(new File(outputDir, "bwa")) + add(bwaIndex) + configDeps += bwaIndex.jobOutputFile + outputConfig += "bwa" -> Map("reference_fasta" -> bwaIndex.reference.getAbsolutePath) + + // Gmap index + val gmapDir = new File(outputDir, "gmap") + val gmapBuild = new GmapBuild(this) + gmapBuild.dir = gmapDir + gmapBuild.db = genomeName + gmapBuild.fastaFiles ::= createLinks(gmapDir) + add(gmapBuild) + configDeps += gmapBuild.jobOutputFile + outputConfig += "gsnap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName) + outputConfig += "gmap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName) + + // STAR index + val starDir = new File(outputDir, "star") + val starIndex = new Star(this) + starIndex.outputDir = starDir + starIndex.reference = createLinks(starDir) + starIndex.runmode = "genomeGenerate" + starIndex.sjdbGTFfile = gtfFile + add(starIndex) + configDeps += starIndex.jobOutputFile + outputConfig += "star" -> Map( + "reference_fasta" -> starIndex.reference.getAbsolutePath, + "genomeDir" -> starDir.getAbsolutePath + ) + + // Bowtie index + val bowtieIndex = new BowtieBuild(this) + bowtieIndex.reference = createLinks(new File(outputDir, "bowtie")) + bowtieIndex.baseName = "reference" + add(bowtieIndex) + configDeps += bowtieIndex.jobOutputFile + outputConfig += "bowtie" -> Map("reference_fasta" -> bowtieIndex.reference.getAbsolutePath) + + // Bowtie2 index + val bowtie2Index = new Bowtie2Build(this) + bowtie2Index.reference = createLinks(new File(outputDir, "bowtie2")) + bowtie2Index.baseName = "reference" + add(bowtie2Index) + configDeps += bowtie2Index.jobOutputFile + outputConfig += "bowtie2" -> Map("reference_fasta" -> bowtie2Index.reference.getAbsolutePath) + outputConfig += "tophat" -> Map( + "bowtie_index" -> bowtie2Index.reference.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta") + ) + + // Hisat2 index + val hisat2Index = new Hisat2Build(this) + 
hisat2Index.inputFasta = createLinks(new File(outputDir, "hisat2")) + hisat2Index.hisat2IndexBase = new File(new File(outputDir, "hisat2"), "reference").getAbsolutePath + add(hisat2Index) + configDeps += hisat2Index.jobOutputFile + outputConfig += "hisat2" -> Map( + "reference_fasta" -> hisat2Index.inputFasta.getAbsolutePath, + "hisat_index" -> hisat2Index.inputFasta.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta") + ) + val writeConfig = new WriteConfig - writeConfig.deps = configDeps - writeConfig.out = new File(outputDir, "references.json") - writeConfig.config = Map("references" -> outputConfig) + writeConfig.deps = configDeps.toList + writeConfig.out = configFile + writeConfig.config = Map("references" -> Map(speciesName -> Map(genomeName -> outputConfig))) add(writeConfig) + } + + def configFile = new File(outputDir, s"$speciesName-$genomeName.json") } -object GenerateIndexes extends PipelineCommand +object GenerateIndexes extends PipelineCommand \ No newline at end of file diff --git a/generate-indexes/src/test/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexesTest.scala b/generate-indexes/src/test/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/DownloadGenomesTest.scala similarity index 50% rename from generate-indexes/src/test/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexesTest.scala rename to generate-indexes/src/test/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/DownloadGenomesTest.scala index 5931c7e0e827be453141f191d4f05005e27931c5..331798087b5c7bf26f9fc7ec87ed79403f3f272c 100644 --- a/generate-indexes/src/test/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexesTest.scala +++ b/generate-indexes/src/test/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/DownloadGenomesTest.scala @@ -25,18 +25,18 @@ import org.testng.annotations.Test /** * Created by pjvan_thof on 13-5-16. 
*/ -class GenerateIndexesTest extends TestNGSuite with Matchers { - def initPipeline(map: Map[String, Any]): GenerateIndexes = { - new GenerateIndexes() { +class DownloadGenomesTest extends TestNGSuite with Matchers { + def initPipeline(map: Map[String, Any]): DownloadGenomes = { + new DownloadGenomes() { override def configNamespace = "generateindexes" - override def globalConfig = new Config(ConfigUtils.mergeMaps(map, GenerateIndexesTest.config)) + override def globalConfig = new Config(ConfigUtils.mergeMaps(map, DownloadGenomesTest.config)) qSettings = new QSettings qSettings.runName = "test" } } @Test - def testNoFastaUri: Unit = { + def testNoFastaUri(): Unit = { val pipeline = initPipeline(Map()) pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("" -> ""))) intercept[IllegalArgumentException] { @@ -45,57 +45,101 @@ class GenerateIndexesTest extends TestNGSuite with Matchers { } @Test - def testSingleFasta: Unit = { + def testNcbiAssembly(): Unit = { + val pipeline = initPipeline(Map()) + pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("ncbi_assembly_id" -> "id"))) + pipeline.script() + } + + @Test + def testSingleFasta(): Unit = { val pipeline = initPipeline(Map()) pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri"))) pipeline.script() } @Test - def testMultiFasta: Unit = { + def testMultiFasta(): Unit = { val pipeline = initPipeline(Map()) pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> List("uri", "uri2", "uri3.gz")))) pipeline.script() } @Test - def testSingleDbsnp: Unit = { - val pipeline = initPipeline(Map()) - pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "dbsnp_vcf_uri" -> "uri.vcf.gz"))) + def testSingleDbsnp(): Unit = { + val pipeline = initPipeline(Map("download_annotations" -> true)) + pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", + "dbsnp" -> Map("version" -> Map("vcf_uri" -> "uri.vcf.gz"))))) pipeline.script() } @Test - def 
testMultiDbsnp: Unit = { - val pipeline = initPipeline(Map()) - pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "dbsnp_vcf_uri" -> List("uri.vcf.gz", "uri2.vcf.gz")))) + def testContigMapDbsnp(): Unit = { + val pipeline = initPipeline(Map("download_annotations" -> true)) + pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", + "dbsnp" -> Map("version" -> Map("vcf_uri" -> "uri.vcf.gz", "contig_map" -> Map("1" -> "chr1")))))) pipeline.script() } @Test - def testVep: Unit = { - val pipeline = initPipeline(Map()) - pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "vep_cache_uri" -> "something/human_vep_80_hg19.tar.gz"))) + def testUnzippedContigMapDbsnp(): Unit = { + val pipeline = initPipeline(Map("download_annotations" -> true)) + pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", + "dbsnp" -> Map("version" -> Map("vcf_uri" -> "uri.vcf", "contig_map" -> Map("1" -> "chr1")))))) pipeline.script() } @Test - def testGtfZipped: Unit = { - val pipeline = initPipeline(Map()) - pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "gtf_uri" -> "bla.gf.gz"))) + def testSingleUnzippedDbsnp(): Unit = { + val pipeline = initPipeline(Map("download_annotations" -> true)) + pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", + "dbsnp" -> Map("version" -> Map(("vcf_uri" -> "uri.vcf")))))) pipeline.script() } @Test - def testGtf: Unit = { - val pipeline = initPipeline(Map()) - pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "gtf_uri" -> "bla.gf"))) + def testMultiDbsnp(): Unit = { + val pipeline = initPipeline(Map("download_annotations" -> true)) + pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", + "dbsnp" -> Map("version" -> Map("vcf_uri" -> List("uri.vcf.gz", "uri2.vcf.gz")))))) + pipeline.script() + } + + @Test + def testVep(): Unit = { + val pipeline = 
initPipeline(Map("download_annotations" -> true)) + pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", + "vep" -> Map("version" -> Map("cache_uri" -> "something/human_vep_80_hg19.tar.gz"))))) + pipeline.script() + } + + @Test + def testGtfZipped(): Unit = { + val pipeline = initPipeline(Map("download_annotations" -> true)) + pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", + "gene_annotation" -> Map("version" -> Map("gtf_uri" -> "bla.gf.gz"))))) + pipeline.script() + } + + @Test + def testGtf(): Unit = { + val pipeline = initPipeline(Map("download_annotations" -> true)) + pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", + "gene_annotation" -> Map("version" -> Map("gtf_uri" -> "bla.gf"))))) + pipeline.script() + } + + @Test + def testGff(): Unit = { + val pipeline = initPipeline(Map("download_annotations" -> true)) + pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", + "gene_annotation" -> Map("version" -> Map("gff_uri" -> "bla.gf"))))) pipeline.script() } } -object GenerateIndexesTest { +object DownloadGenomesTest { val outputDir = Files.createTempDir() outputDir.deleteOnExit() @@ -109,6 +153,7 @@ object GenerateIndexesTest { "samtools" -> Map("exe" -> "test"), "md5sum" -> Map("exe" -> "test"), "gatk_jar" -> "test", - "tabix" -> Map("exe" -> "test") + "tabix" -> Map("exe" -> "test"), + "gffread" -> Map("exe" -> "test") ) }