Commit 1a4b1b9a authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Adding ncbi downloader to pipeline

parent ecd25b59
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.tools
import java.io.File
import nl.lumc.sasc.biopet.core.ToolCommandFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{Input, Output}
/**
* @deprecated Use picard.util.BedToIntervalList instead
*/
class DownloadNcbiAssembly(val root: Configurable) extends ToolCommandFunction {
def toolObject = nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly
@Output(doc = "Output fasta file", required = true)
var output: File = _
@Output(doc = "Output NCBI report", required = true)
var outputReport: File = _
var assemblyId: String = null
var nameHeader: Option[String] = None
var mustHaveOne: Map[String, String] = Map()
var mustNotHave: Map[String, String] = Map()
override def cmdLine = super.cmdLine +
required("-a", assemblyId) +
required("--report", outputReport) +
required("-o", output) +
optional("--nameHeader", nameHeader) +
repeat("--mustHaveOne", mustHaveOne.map(x => s"${x._1}=${x._2}")) +
repeat("--mustNotHave", mustNotHave.map(x => s"${x._1}=${x._2}"))
}
......@@ -13,6 +13,7 @@ object DownloadNcbiAssembly extends ToolCommand {
case class Args(assemblyId: String = null,
outputFile: File = null,
reportFile: Option[File] = None,
contigNameHeader: Option[String] = None,
mustHaveOne: Map[String, String] = Map(),
mustNotHave: Map[String, String] = Map()) extends AbstractArgs
......@@ -24,6 +25,9 @@ object DownloadNcbiAssembly extends ToolCommand {
opt[File]('o', "output") required () valueName "<file>" action { (x, c) =>
c.copy(outputFile = x)
}
opt[File]("report") required () valueName "<file>" action { (x, c) =>
c.copy(reportFile = Some(x))
}
opt[String]("nameHeader") valueName "<string>" action { (x, c) =>
c.copy(contigNameHeader = Some(x))
}
......@@ -46,6 +50,11 @@ object DownloadNcbiAssembly extends ToolCommand {
val reader = Source.fromURL(s"ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/${cmdargs.assemblyId}.assembly.txt")
val assamblyReport = reader.getLines().toList
reader.close()
cmdargs.reportFile.foreach { file =>
val writer = new PrintWriter(file)
assamblyReport.foreach(writer.println)
writer.close()
}
val headers = assamblyReport.filter(_.startsWith("#")).last.stripPrefix("# ").split("\t").zipWithIndex.toMap
val nameId = cmdargs.contigNameHeader.map(x => headers(x))
......
......@@ -41,7 +41,7 @@
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetExtensions</artifactId>
<artifactId>BiopetToolsExtensions</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
......
......@@ -17,15 +17,9 @@ package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.util
import nl.lumc.sasc.biopet.core.extensions.Md5sum
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.extensions.bowtie.{ Bowtie2Build, BowtieBuild }
import nl.lumc.sasc.biopet.extensions.bwa.BwaIndex
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
import nl.lumc.sasc.biopet.extensions.gmap.GmapBuild
import nl.lumc.sasc.biopet.extensions.hisat.Hisat2Build
import nl.lumc.sasc.biopet.extensions.picard.CreateSequenceDictionary
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFaidx
import nl.lumc.sasc.biopet.core.{BiopetQScript, PipelineCommand}
import nl.lumc.sasc.biopet.extensions.{Cat, Curl, Zcat}
import nl.lumc.sasc.biopet.extensions.tools.DownloadNcbiAssembly
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript
......@@ -58,59 +52,79 @@ class DownloadGenomes(val root: Configurable) extends QScript with BiopetQScript
for ((genomeName, c) <- speciesConfig) yield genomeName -> {
var configDeps: List[File] = Nil
val genomeConfig = ConfigUtils.any2map(c)
val fastaUris = genomeConfig.getOrElse("fasta_uri",
throw new IllegalArgumentException(s"No fasta_uri found for $speciesName - $genomeName")) match {
case a: Traversable[_] => a.map(_.toString).toArray
case a: util.ArrayList[_] => a.map(_.toString).toArray
case a => Array(a.toString)
}
val genomeDir = new File(speciesDir, genomeName)
val fastaFile = new File(genomeDir, "reference.fa")
val fastaFiles = for (fastaUri <- fastaUris) yield {
val curl = new Curl(this)
curl.url = fastaUri
curl.output = if (fastaUris.length > 1 || fastaUri.endsWith(".gz")) {
curl.isIntermediate = true
new File(genomeDir, new File(fastaUri).getName)
} else fastaFile
add(curl)
add(Md5sum(this, curl.output, genomeDir))
curl.output
}
referenceConfig.get("ncbi_assembly_id") match {
case Some(assemblyID: String) => {
val downloadAssembly = new DownloadNcbiAssembly(this)
downloadAssembly.assemblyId = assemblyID
downloadAssembly.output = fastaFile
downloadAssembly
downloadAssembly.nameHeader = referenceConfig.get("ncbi_assembly_header_name").map(_.toString)
downloadAssembly.mustHaveOne = referenceConfig.get("ncbi_assembly_must_have_one")
.map(_.asInstanceOf[Map[String, String]])
.getOrElse(Map())
downloadAssembly.mustNotHave = referenceConfig.get("ncbi_assembly_must_not_have")
.map(_.asInstanceOf[Map[String, String]])
.getOrElse(Map())
add(downloadAssembly)
}
case _ => {
val fastaUris = genomeConfig.getOrElse("fasta_uri",
throw new IllegalArgumentException(s"No fasta_uri found for $speciesName - $genomeName")) match {
case a: Traversable[_] => a.map(_.toString).toArray
case a: util.ArrayList[_] => a.map(_.toString).toArray
case a => Array(a.toString)
}
val fastaCat = new FastaMerging(this)
fastaCat.output = fastaFile
if (fastaUris.length > 1 || fastaFiles.exists(_.getName.endsWith(".gz"))) {
fastaFiles.foreach { file =>
if (file.getName.endsWith(".gz")) {
val zcat = new Zcat(this)
zcat.appending = true
zcat.input :+= file
zcat.output = fastaFile
fastaCat.cmds :+= zcat
fastaCat.input :+= file
} else {
val cat = new Cat(this)
cat.appending = true
cat.input :+= file
cat.output = fastaFile
fastaCat.cmds :+= cat
fastaCat.input :+= file
val fastaFiles = for (fastaUri <- fastaUris) yield {
val curl = new Curl(this)
curl.url = fastaUri
curl.output = if (fastaUris.length > 1 || fastaUri.endsWith(".gz")) {
curl.isIntermediate = true
new File(genomeDir, new File(fastaUri).getName)
} else fastaFile
add(curl)
add(Md5sum(this, curl.output, genomeDir))
curl.output
}
val fastaCat = new FastaMerging(this)
fastaCat.output = fastaFile
if (fastaUris.length > 1 || fastaFiles.exists(_.getName.endsWith(".gz"))) {
fastaFiles.foreach { file =>
if (file.getName.endsWith(".gz")) {
val zcat = new Zcat(this)
zcat.appending = true
zcat.input :+= file
zcat.output = fastaFile
fastaCat.cmds :+= zcat
fastaCat.input :+= file
} else {
val cat = new Cat(this)
cat.appending = true
cat.input :+= file
cat.output = fastaFile
fastaCat.cmds :+= cat
fastaCat.input :+= file
}
}
add(fastaCat)
configDeps :+= fastaCat.output
}
}
add(fastaCat)
configDeps :+= fastaCat.output
}
val generateIndexes = new GenerateIndexes(this)
generateIndexes.fastaFile = fastaFile
generateIndexes.speciesName = speciesName
generateIndexes.genomeName = genomeName
generateIndexes.fastaUris = fastaUris
//generateIndexes.fastaUris = fastaUris
//TODO: add gtf file
add(generateIndexes)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment