Commit d8ffa403 authored by Peter van 't Hof's avatar Peter van 't Hof

Merge remote-tracking branch 'remotes/origin/fix-generate_indexes' into feature_annotation_versions

parents 30bff1a5 243311ec
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Version }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
 * Biopet wrapper around the `gffread` executable for reading/converting GFF/GTF files.
 *
 * Created by pjvanthof on 20/06/16.
 */
class GffRead(val root: Configurable) extends BiopetCommandLineFunction {
// Executable path, configurable via the "exe" key (defaults to "gffread" on PATH)
executable = config("exe", default = "gffread", freeVar = false)

// Input GFF/GTF file; omitted from the command line when reading from stdin
@Input
var input: File = _

// Output file, passed with -o; omitted when writing to stdout
@Output
var output: File = _

// -T flag; presumably switches output to GTF format — TODO confirm against gffread docs
var T: Boolean = config("T", default = false, freeVar = false)

def cmdLine = executable +
(if (inputAsStdin) "" else required(input)) +
// NOTE(review): `outputAsStsout` (sic) is the member name provided by BiopetCommandLineFunction
(if (outputAsStsout) "" else required("-o", output)) +
conditional(T, "-T")
}
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Version }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.util.matching.Regex
/**
 * Biopet wrapper around the GNU `sed` stream editor.
 *
 * Created by pjvanthof on 18/05/16.
 */
class Sed(val root: Configurable) extends BiopetCommandLineFunction with Version {
  // Executable path, configurable via the "exe" key (defaults to "sed" on PATH)
  executable = config("exe", default = "sed", freeVar = false)

  /** Command to get version of executable */
  override def versionCommand: String = executable + " --version"

  /** Regex to get version from version command output.
    *
    * Fixed: GNU sed reports e.g. "sed (GNU sed) 4.2.2", so the parentheses
    * around "GNU sed" are literal text and must be escaped in the regex
    * (the original unescaped form turned them into a capture group and could
    * never match the real output). The dots in the version number are escaped
    * too, and the version itself is now the capture group — consistent with
    * the other extensions in this file (e.g. Hisat2Build captures `(.*)`). */
  override def versionRegex: Regex = """sed \(GNU sed\) (\d+\.\d+\.\d+)""".r

  // Input file; omitted from the command line when reading from stdin
  @Input(required = false)
  var inputFile: File = _

  // Output file, written via shell redirection; omitted when writing to stdout
  @Output
  var outputFile: File = _

  // sed expressions, each passed with its own -e flag
  var expressions: List[String] = Nil

  def cmdLine = executable +
    repeat("-e", expressions) +
    (if (inputAsStdin) "" else required(inputFile)) +
    // NOTE(review): `outputAsStsout` (sic) is the member name provided by BiopetCommandLineFunction
    (if (outputAsStsout) "" else " > " + required(outputFile))
}
......@@ -43,8 +43,7 @@ class Hisat2Build(val root: Configurable) extends BiopetCommandLineFunction with
var exon: Option[File] = config("exon")
executable = config("exe", default = "hisat2-build", freeVar = false)
def versionRegex = """.*hisat2-align-s version (.*)""".r
override def versionExitcode = List(0, 1)
def versionRegex = """.*hisat.*version (.*)""".r
def versionCommand = executable + " --version"
var bmax: Option[Int] = config("bmax")
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.picard
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
 * Wrapper for Picard's NormalizeFasta tool.
 *
 * Created by sajvanderzeeuw on 6-10-15.
 */
class NormalizeFasta(val root: Configurable) extends Picard {
  javaMainClass = new picard.reference.NormalizeFasta().getClass.getName

  @Input(doc = "The input fasta file", required = true)
  var input: File = _

  @Output(doc = "The output fasta file", required = true)
  var output: File = _

  // Required config value: line length for the normalized fasta output
  val lineLength: Int = config("line_length")

  // When true, sequence names are truncated at the first whitespace
  val truncateSequenceNameAtWhitespace: Boolean = config("truncate_sequence_name_at_whitespace", default = false)

  override def cmdLine = {
    val base = super.cmdLine
    // Fall back to /dev/stdin and /dev/stdout when piping instead of using files
    val inputArg =
      if (inputAsStdin) required("INPUT=", new File("/dev/stdin"), spaceSeparated = false)
      else required("INPUT=", input, spaceSeparated = false)
    val outputArg =
      if (outputAsStsout) required("OUTPUT=", new File("/dev/stdout"), spaceSeparated = false)
      else required("OUTPUT=", output, spaceSeparated = false)
    base + inputArg + outputArg +
      required("LINE_LENGTH=", lineLength, spaceSeparated = false) +
      conditional(truncateSequenceNameAtWhitespace, "TRUNCATE_SEQUENCE_NAMES_AT_WHITESPACE=TRUE")
  }
}
......@@ -14,7 +14,6 @@
*/
package nl.lumc.sasc.biopet
import nl.lumc.sasc.biopet.pipelines.generateindexes.GenerateIndexes
import nl.lumc.sasc.biopet.utils.{ BiopetExecutable, MainCommand }
object BiopetExecutableMain extends BiopetExecutable {
......@@ -38,7 +37,8 @@ object BiopetExecutableMain extends BiopetExecutable {
nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling,
nl.lumc.sasc.biopet.pipelines.basty.Basty,
nl.lumc.sasc.biopet.pipelines.shiva.Shiva,
GenerateIndexes
nl.lumc.sasc.biopet.pipelines.generateindexes.DownloadGenomes,
nl.lumc.sasc.biopet.pipelines.generateindexes.GenerateIndexes
)
def tools: List[MainCommand] = BiopetToolsExecutable.tools
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.tools
import java.io.File
import nl.lumc.sasc.biopet.core.ToolCommandFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
 * Queue extension for the `DownloadNcbiAssembly` tool: downloads a genome
 * assembly from NCBI as a fasta file plus the raw assembly report.
 *
 * (The previous scaladoc wrongly carried a copy-pasted deprecation note
 * pointing to picard.util.BedToIntervalList, which is unrelated to this tool.)
 */
class DownloadNcbiAssembly(val root: Configurable) extends ToolCommandFunction {
  def toolObject = nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly

  @Output(doc = "Output fasta file", required = true)
  var output: File = _

  // Declared @Output so Queue tracks the report file as a product of this job;
  // cmdLine always passes it via --report, so it is effectively required.
  @Output(doc = "Output report file", required = true)
  var outputReport: File = _

  // NCBI assembly id to download, e.g. "GCF_000844745.1"
  var assemblyId: String = null

  // Optional report column whose value is prepended to each contig name (--nameHeader)
  var nameHeader: Option[String] = None

  // Contig filters, each entry formatted as "<column>=<value>"
  var mustHaveOne: List[String] = Nil
  var mustNotHave: List[String] = Nil

  override def defaultCoreMemory = 4.0

  override def cmdLine = super.cmdLine +
    required("-a", assemblyId) +
    required("--report", outputReport) +
    required("-o", output) +
    optional("--nameHeader", nameHeader) +
    repeat("--mustHaveOne", mustHaveOne) +
    repeat("--mustNotHave", mustNotHave)
}
......@@ -49,5 +49,6 @@ object BiopetToolsExecutable extends BiopetExecutable {
nl.lumc.sasc.biopet.tools.VcfToTsv,
nl.lumc.sasc.biopet.tools.VcfWithVcf,
nl.lumc.sasc.biopet.tools.VepNormalizer,
nl.lumc.sasc.biopet.tools.WipeReads)
nl.lumc.sasc.biopet.tools.WipeReads,
nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly)
}
package nl.lumc.sasc.biopet.tools
import java.io.{ File, PrintWriter }
import nl.lumc.sasc.biopet.utils.ToolCommand
import scala.io.Source
/**
* Created by pjvan_thof on 21-9-16.
*/
/**
 * Tool to download a genome assembly from NCBI as a single fasta file.
 *
 * Fetches the assembly report for the given assembly id, optionally filters
 * the listed sequences on report columns, then downloads each remaining
 * sequence through the NCBI eutils efetch service and concatenates them
 * into the output fasta file.
 *
 * Created by pjvan_thof on 21-9-16.
 */
object DownloadNcbiAssembly extends ToolCommand {
  case class Args(assemblyId: String = null,
                  outputFile: File = null,
                  reportFile: Option[File] = None,
                  contigNameHeader: Option[String] = None,
                  mustHaveOne: List[(String, String)] = List(),
                  mustNotHave: List[(String, String)] = List()) extends AbstractArgs

  class OptParser extends AbstractOptParser {
    // Fixed: the long option name used to contain a space ("assembly id"),
    // which made the long form impossible to pass on the command line.
    // The short form -a is unchanged.
    opt[String]('a', "assemblyId") required () unbounded () valueName "<id>" action { (x, c) =>
      c.copy(assemblyId = x)
    } text "NCBI assembly id to download, e.g. GCF_000844745.1"
    opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) =>
      c.copy(outputFile = x)
    } text "Output fasta file"
    opt[File]("report") unbounded () valueName "<file>" action { (x, c) =>
      c.copy(reportFile = Some(x))
    } text "Optional file to write the raw NCBI assembly report to"
    opt[String]("nameHeader") unbounded () valueName "<string>" action { (x, c) =>
      c.copy(contigNameHeader = Some(x))
    } text "Report column whose value is prepended to each contig name in the fasta output"
    opt[(String, String)]("mustHaveOne") unbounded () valueName "<string>" action { (x, c) =>
      c.copy(mustHaveOne = (x._1, x._2) :: c.mustHaveOne)
    } text "Keep only sequences where at least one of these <column>=<value> pairs matches"
    opt[(String, String)]("mustNotHave") unbounded () valueName "<string>" action { (x, c) =>
      c.copy(mustNotHave = (x._1, x._2) :: c.mustNotHave)
    } text "Discard sequences where any of these <column>=<value> pairs matches"
  }

  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val cmdargs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)

    logger.info(s"Reading ${cmdargs.assemblyId} from NCBI")
    val reader = Source.fromURL(s"ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/${cmdargs.assemblyId}.assembly.txt")
    val assemblyReport = reader.getLines().toList
    reader.close()

    // Optionally keep a copy of the raw report (closed even if writing fails)
    cmdargs.reportFile.foreach { file =>
      val writer = new PrintWriter(file)
      try assemblyReport.foreach(writer.println)
      finally writer.close()
    }

    // The last '#' line of the report holds the tab-separated column headers
    // (Sequence-Name, Sequence-Role, ..., GenBank-Accn, ..., RefSeq-Accn, ..., Sequence-Length, ...)
    val headers = assemblyReport.filter(_.startsWith("#")).last.stripPrefix("# ").split("\t").zipWithIndex.toMap
    val nameId = cmdargs.contigNameHeader.map(x => headers(x))
    val lengthId = headers.get("Sequence-Length")

    val baseUrlEutils = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

    val fastaWriter = new PrintWriter(cmdargs.outputFile)
    try {
      val allContigs = assemblyReport.filter(!_.startsWith("#"))
        .map(_.split("\t"))
      val totalLength = lengthId.map(id => allContigs.map(_.apply(id).toLong).sum)

      logger.info(s"${allContigs.size} contigs found")
      totalLength.foreach(l => logger.info(s"Total length: ${l}"))

      // mustNotHave: drop a sequence when any pair matches;
      // mustHaveOne: keep only sequences matching at least one pair (no-op when empty)
      val filterContigs = allContigs
        .filter(values => cmdargs.mustNotHave.forall(x => values(headers(x._1)) != x._2))
        .filter(values => cmdargs.mustHaveOne.exists(x => values(headers(x._1)) == x._2) || cmdargs.mustHaveOne.isEmpty)

      val filterLength = lengthId.map(id => filterContigs.map(_.apply(id).toLong).sum)

      logger.info(s"${filterContigs.size} contigs left after filtering")
      filterLength.foreach(l => logger.info(s"Filtered length: ${l}"))

      filterContigs.foreach { values =>
        // Column 6 is RefSeq-Accn, column 4 is GenBank-Accn; fall back to GenBank
        // when no RefSeq accession is available ("na")
        val id = if (values(6) == "na") values(4) else values(6)
        logger.info(s"Start download ${id}")
        val fastaReader = Source.fromURL(s"${baseUrlEutils}/efetch.fcgi?db=nuccore&id=${id}&retmode=text&rettype=fasta")
        try fastaReader.getLines()
          // Optionally prefix each header line with the configured report column value
          .map(x => nameId.map(y => x.replace(">", s">${values(y)} ")).getOrElse(x))
          .foreach(fastaWriter.println)
        finally fastaReader.close()
      }
      logger.info("Downloading complete")
    } finally fastaWriter.close()
  }
}
# Assembly name: ViralProj14444
# Organism name: Ageratum yellow vein betasatellite (viruses)
# Taxid: 185750
# BioProject: PRJNA14444
# Submitter: NCBI
# Date: 2000-6-14
# Assembly type: n/a
# Release type: major
# Assembly level: Complete Genome
# Genome representation: full
# RefSeq assembly accession: GCF_000844745.1
#
## Assembly-Units:
## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name
## GCF_000844735.1 Primary assembly
#
# Ordered by chromosome/plasmid; the chromosomes/plasmids are followed by
# unlocalized scaffolds.
# Unplaced scaffolds are listed at the end.
# RefSeq is equal or derived from GenBank object.
#
# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name
Unknown assembled-molecule Unknown Segment na <> NC_003403.1 Primary assembly 1347 na
\ No newline at end of file
>gi|18702122|ref|NC_003403.1| Ageratum yellow vein virus-associated DNA beta, complete genome
ACCGGTGGCGAGCGGGGAATTTGGGTTCCAGGTGGGACCCACACCTGGAACAGCCTGGAAAGAACGTGAG
TGGGCCAAATAAGTGGGCCGTTGAATTGGGCTTTTTATTCAAAACTATCTCAAATTTACGAAGCCCATTA
TTATTCATTTGAGAAAAATTACATTAAACATAGTCCGTATAAATATACATTTATACGGTTACATTCTTGT
ATACATGGTATTCGTCGTTTACTTGGATATCTATTACTGGAGCTTCCTGTTGCATCATGATATCTATTAT
TTCAACCATGTCTTCTGATTTGAACTCTTCTATTGTTGAATCTCTATACATGATCCTTAGTATGTTCTTG
ATACCTTCTTCTAAAGCATTAAAATTGAATGGTGGTATGATACCGTGATGTCCGTATGGAATGGAGAATT
TGTTCTTGACTAGTGCTGGTGACCTTGTTGAAAACACTTCTACGTCAACTCTGAATTTTATTTCTGATCT
GAACTTGACGTTGATGATGAACAATATTCCTTTATCGTTGGTATATGATATAGTCATGATTGTTTGATGG
AAATATATTCATTTGATGTGCATTTATAATTGGATGTGTCTGTTGTATGTGGTTGTGATTAATGATCGTG
TCAAATTTATGTTCTTGTATTAATTAGTGTATGTGTGACTGTATTGATGTTGATGTTTGTGATTATAATT
GAGAAAAGAAGATATTCATTCATTTGAATAAGAAAAGAAAGAAAAAAAAAAGAAAACTATATACATAAAA
AAAACGAATAATAAAAATAACTATCTATTCAGAAAAAATGGGAGCGCAGCGAAAACGAAAACAAAAATCG
ATATCAAAGAAAATCCTGATAAATTAAATAGACAGAATTGAAAAGGAAAAAAGTAAAAAAAGATAAAAGT
TATTACATTAAAAAAACTAAAGAAAAAAAAAATAAAAAAACTAAAAATGATAAATAAAAAAATGACGTCA
TCTGATGTCATCACAGTTAAAAACTATTTACTGCGGTTAAAATTTAATATAATTAACCACTTGGGTAAAT
ATATTGTCTCCAATAGGTAAATTGTCTCCAATATATCGGAGACATATTGGAGACTCTGGAGACACTTATA
TGCTAATTTCCCATTTTACCCTTACTAATGTGACTGTAAGGCGCGTGGGAGTGCTCTGAAAAAGTAGACC
TTCTCTCTCCAAAACTCCCCGGAGAAGACGATACAGGCTGATCCCGGCATCAATTCGCGACACGCGCGGC
GGTGTGTACCCCTGGGAGGGTAGAAACCACTACGCTACGCAGCAGCCTTAGCTACGCCGGAGCTTAGCTC
GCCACCGTAATAATATT
package nl.lumc.sasc.biopet.tools
import java.io.File
import java.nio.file.Paths
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.Test
import scala.io.Source
/**
 * Integration test for the DownloadNcbiAssembly tool.
 *
 * NOTE(review): this test performs real network downloads from NCBI
 * (assembly GCF_000844745.1) and will fail without internet access.
 *
 * Created by pjvanthof on 03/10/16.
 */
class DownloadNcbiAssemblyTest extends TestNGSuite with Matchers {
// Resolves a classpath resource to an absolute filesystem path
private def resourcePath(p: String): String = {
Paths.get(getClass.getResource(p).toURI).toString
}

@Test
def testNC_003403_1: Unit = {
// Temp files for the downloaded fasta and assembly report
val output = File.createTempFile("test.", ".fasta")
val outputReport = File.createTempFile("test.", ".report")
output.deleteOnExit()
outputReport.deleteOnExit()
DownloadNcbiAssembly.main(Array("-a", "GCF_000844745.1",
"-o", output.getAbsolutePath,
"--report", outputReport.getAbsolutePath))
// Downloaded files must match the checked-in reference resources line for line
Source.fromFile(output).getLines().toList shouldBe Source.fromFile(new File(resourcePath("/NC_003403.1.fasta"))).getLines().toList
Source.fromFile(outputReport).getLines().toList shouldBe Source.fromFile(new File(resourcePath("/GCF_000844745.1.report"))).getLines().toList
}
}
......@@ -41,7 +41,7 @@
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetExtensions</artifactId>
<artifactId>BiopetToolsExtensions</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines.generateindexes
import java.io.File
import java.util
import nl.lumc.sasc.biopet.core.extensions.Md5sum
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
import nl.lumc.sasc.biopet.extensions.picard.NormalizeFasta
import nl.lumc.sasc.biopet.extensions.tools.DownloadNcbiAssembly
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript
import scala.collection.JavaConversions._
import scala.language.reflectiveCalls
class DownloadGenomes(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Argument(required = true)
var referenceConfigFiles: List[File] = Nil
var referenceConfig: Map[String, Any] = _
override def fixedValues = Map("gffread" -> Map("T" -> true))
override def defaults = Map("normalizefasta" -> Map("line_length" -> 60))
val downloadAnnotations: Boolean = config("download_annotations", default = false)
/** This is executed before the script starts: merges all `referenceConfigFiles`
  * into a single `referenceConfig` map via ConfigUtils.mergeMaps, unless the
  * map was already set (it is only computed when still null). */
def init(): Unit = {
if (referenceConfig == null)
referenceConfig = referenceConfigFiles.foldLeft(Map[String, Any]())((a, b) => ConfigUtils.mergeMaps(a, ConfigUtils.fileToConfigMap(b)))
}
/** Method where jobs must be added */
def biopetScript(): Unit = {
for ((speciesName, c) <- referenceConfig) yield speciesName -> {
val speciesConfig = ConfigUtils.any2map(c)
val speciesDir = new File(outputDir, speciesName)
for ((genomeName, c) <- speciesConfig) yield genomeName -> {
var configDeps: List[File] = Nil
val genomeConfig = ConfigUtils.any2map(c)
val genomeDir = new File(speciesDir, genomeName)
val fastaFile = new File(genomeDir, "reference.fa")
val downloadFastaFile = new File(genomeDir, "download.reference.fa")
genomeConfig.get("ncbi_assembly_id") match {
case Some(assemblyID: String) =>
val downloadAssembly = new DownloadNcbiAssembly(this)
downloadAssembly.assemblyId = assemblyID
downloadAssembly.output = downloadFastaFile
downloadAssembly.outputReport = new File(genomeDir, s"$speciesName-$genomeName.assembly.report")
downloadAssembly.nameHeader = genomeConfig.get("ncbi_assembly_header_name").map(_.toString)
downloadAssembly.mustHaveOne = genomeConfig.get("ncbi_assembly_must_have_one")
.map(_.asInstanceOf[util.ArrayList[util.LinkedHashMap[String, String]]])
.getOrElse(new util.ArrayList()).flatMap(x => x.map(y => y._1 + "=" + y._2))
.toList
downloadAssembly.mustNotHave = genomeConfig.get("ncbi_assembly_must_not_have")
.map(_.asInstanceOf[util.ArrayList[util.LinkedHashMap[String, String]]])
.getOrElse(new util.ArrayList()).flatMap(x => x.map(y => y._1 + "=" + y._2))
.toList
downloadAssembly.isIntermediate = true
add(downloadAssembly)
case _ =>
val fastaUris = genomeConfig.getOrElse("fasta_uri",
throw new IllegalArgumentException(s"No fasta_uri found for $speciesName - $genomeName")) match {
case a: Traversable[_] => a.map(_.toString).toArray
case a: util.ArrayList[_] => a.map(_.toString).toArray
case a => Array(a.toString)
}
val fastaFiles = for (fastaUri <- fastaUris) yield {
val curl = new Curl(this)
curl.url = fastaUri
curl.output = if (fastaUris.length > 1 || fastaUri.endsWith(".gz")) {
curl.isIntermediate = true
new File(genomeDir, new File(fastaUri).getName)
} else fastaFile
add(curl)
add(Md5sum(this, curl.output, genomeDir))
curl.output
}
val fastaCat = new FastaMerging(this)
fastaCat.output = downloadFastaFile
fastaCat.isIntermediate = true
if (fastaUris.length > 1 || fastaFiles.exists(_.getName.endsWith(".gz"))) {
fastaFiles.foreach { file =>
if (file.getName.endsWith(".gz")) {
val zcat = new Zcat(this)
zcat.appending = true
zcat.input :+= file
zcat.output = downloadFastaFile
fastaCat.cmds :+= zcat
fastaCat.input :+= file
} else {
val cat = new Cat(this)
cat.appending = true
cat.input :+= file
cat.output = downloadFastaFile
fastaCat.cmds :+= cat
fastaCat.input :+= file
}
}
add(fastaCat)
configDeps :+= fastaCat.output
}
}
val normalizeFasta = new NormalizeFasta(this)
normalizeFasta.input = downloadFastaFile
normalizeFasta.output = fastaFile
add(normalizeFasta)
val generateIndexes = new GenerateIndexes(this)
generateIndexes.fastaFile = fastaFile
generateIndexes.speciesName = speciesName
generateIndexes.genomeName = genomeName
generateIndexes.outputDir = genomeDir
//generateIndexes.fastaUris = fastaUris
//TODO: add gtf file
add(generateIndexes)
if (downloadAnnotations) {
val annotationDir = new File(genomeDir, "annotation")
def getAnnotation(tag: String): Map[String, Map[String, Any]] = (genomeConfig.get(tag) match {
case Some(s: Map[_, _]) => s.map(x => x._2 match {
case o: Map[_, _] => x._1.toString -> o.map(x => (x._1.toString, x._2))
case _ => throw new IllegalStateException(s"values in the tag $tag should be json objects")
})
case None => Map()
case x => throw new IllegalStateException(s"tag $tag should be an object with objects, now $x")
})
// Download vep caches
getAnnotation("vep").foreach {
case (version, vep) =>
val vepDir = new File(annotationDir, "vep" + File.separator + version)
val curl = new Curl(this)
curl.url = vep("cache_uri").toString
curl.output = new File(vepDir, new File(curl.url).getName)
curl.isIntermediate = true
add(curl)
val tar = new TarExtract(this)
tar.inputTar = curl.output
tar.outputDir = vepDir
add(tar)
}
getAnnotation("dbsnp").foreach {
case (version, dbsnp) =>
val dbpsnpDir = new File(annotationDir, "dbsnp")
val contigMap = dbsnp.get("dbsnp_contig_map").map(_.asInstanceOf[Map[String, Any]])
val contigSed = contigMap.map { map =>
val sed = new Sed(this)
sed.expressions = map.map(x => s"""s/^${x._1}\t/${x._2}\t/""").toList
sed
}
val cv = new CombineVariants(this)
cv.reference_sequence = fastaFile
def addDownload(uri: String): Unit = {