Commit d8ffa403 authored by Peter van 't Hof

Merge remote-tracking branch 'remotes/origin/fix-generate_indexes' into feature_annotation_versions

parents 30bff1a5 243311ec
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Version }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
* Created by pjvanthof on 20/06/16.
*/
class GffRead(val root: Configurable) extends BiopetCommandLineFunction {
executable = config("exe", default = "gffread", freeVar = false)
@Input
var input: File = _
@Output
var output: File = _
var T: Boolean = config("T", default = false, freeVar = false)
def cmdLine = executable +
(if (inputAsStdin) "" else required(input)) +
(if (outputAsStsout) "" else required("-o", output)) +
conditional(T, "-T")
}
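A minimal usage sketch for this wrapper; the enclosing QScript, file names and the GTF-conversion flag are illustrative and not part of this commit:

val gffRead = new GffRead(this)              // `this` is the surrounding Configurable/QScript
gffRead.input = new File("annotation.gff")   // hypothetical GFF3 input
gffRead.output = new File("annotation.gtf")  // hypothetical output path
gffRead.T = true                             // gffread -T: write GTF instead of GFF3
add(gffRead)                                 // queue the job from a QScript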
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Version }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.util.matching.Regex
/**
* Created by pjvanthof on 18/05/16.
*/
class Sed(val root: Configurable) extends BiopetCommandLineFunction with Version {
executable = config("exe", default = "sed", freeVar = false)
/** Command to get version of executable */
override def versionCommand: String = executable + " --version"
/** Regex to get version from version command output */
override def versionRegex: Regex = """sed \(GNU sed\) (\d+\.\d+\.\d+)""".r
@Input(required = false)
var inputFile: File = _
@Output
var outputFile: File = _
var expressions: List[String] = Nil
def cmdLine = executable +
repeat("-e", expressions) +
(if (inputAsStdin) "" else required(inputFile)) +
(if (outputAsStsout) "" else " > " + required(outputFile))
}
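A comparable sketch for the Sed wrapper; the expression and file names below are placeholders:

val sed = new Sed(this)
sed.inputFile = new File("contigs.txt")        // hypothetical input file
sed.outputFile = new File("contigs.nochr.txt") // hypothetical output (written via "> file")
sed.expressions = List("s/^chr//")             // each entry is passed as a separate -e argument
add(sed)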
......@@ -43,8 +43,7 @@ class Hisat2Build(val root: Configurable) extends BiopetCommandLineFunction with
var exon: Option[File] = config("exon")
executable = config("exe", default = "hisat2-build", freeVar = false)
def versionRegex = """.*hisat2-align-s version (.*)""".r
override def versionExitcode = List(0, 1)
def versionRegex = """.*hisat.*version (.*)""".r
def versionCommand = executable + " --version"
var bmax: Option[Int] = config("bmax")
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.picard
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
* Created by sajvanderzeeuw on 6-10-15.
*/
class NormalizeFasta(val root: Configurable) extends Picard {
javaMainClass = new picard.reference.NormalizeFasta().getClass.getName
@Input(doc = "The input fasta file", required = true)
var input: File = _
@Output(doc = "The output fasta file", required = true)
var output: File = _
val lineLength: Int = config("line_length")
val truncateSequenceNameAtWhitespace: Boolean = config("truncate_sequence_name_at_whitespace", default = false)
override def cmdLine = super.cmdLine +
(if (inputAsStdin) required("INPUT=", new File("/dev/stdin"), spaceSeparated = false)
else required("INPUT=", input, spaceSeparated = false)) +
(if (outputAsStsout) required("OUTPUT=", new File("/dev/stdout"), spaceSeparated = false)
else required("OUTPUT=", output, spaceSeparated = false)) +
required("LINE_LENGTH=", lineLength, spaceSeparated = false) +
conditional(truncateSequenceNameAtWhitespace, "TRUNCATE_SEQUENCE_NAMES_AT_WHITESPACE=TRUE")
}
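A short, hedged wiring example for NormalizeFasta (paths are placeholders; LINE_LENGTH comes from the line_length config value):

val normalizeFasta = new NormalizeFasta(this)
normalizeFasta.input = new File("reference.fa")             // hypothetical input fasta
normalizeFasta.output = new File("reference.normalized.fa") // hypothetical output fasta
add(normalizeFasta)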
......@@ -14,7 +14,6 @@
*/
package nl.lumc.sasc.biopet
-import nl.lumc.sasc.biopet.pipelines.generateindexes.GenerateIndexes
import nl.lumc.sasc.biopet.utils.{ BiopetExecutable, MainCommand }
object BiopetExecutableMain extends BiopetExecutable {
......@@ -38,7 +37,8 @@ object BiopetExecutableMain extends BiopetExecutable {
nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling,
nl.lumc.sasc.biopet.pipelines.basty.Basty,
nl.lumc.sasc.biopet.pipelines.shiva.Shiva,
-GenerateIndexes
+nl.lumc.sasc.biopet.pipelines.generateindexes.DownloadGenomes,
+nl.lumc.sasc.biopet.pipelines.generateindexes.GenerateIndexes
)
def tools: List[MainCommand] = BiopetToolsExecutable.tools
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.tools
import java.io.File
import nl.lumc.sasc.biopet.core.ToolCommandFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
 * Queue extension for the [[nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly]] tool
 */
class DownloadNcbiAssembly(val root: Configurable) extends ToolCommandFunction {
def toolObject = nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly
@Output(doc = "Output fasta file", required = true)
var output: File = _
var outputReport: File = _
var assemblyId: String = null
var nameHeader: Option[String] = None
var mustHaveOne: List[String] = Nil
var mustNotHave: List[String] = Nil
override def defaultCoreMemory = 4.0
override def cmdLine = super.cmdLine +
required("-a", assemblyId) +
required("--report", outputReport) +
required("-o", output) +
optional("--nameHeader", nameHeader) +
repeat("--mustHaveOne", mustHaveOne) +
repeat("--mustNotHave", mustNotHave)
}
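A sketch of how this extension might be wired into a pipeline; the assembly id matches the test resource added in this merge, the output paths are placeholders:

val download = new DownloadNcbiAssembly(this)
download.assemblyId = "GCF_000844745.1"                     // assembly id from the test resource below
download.output = new File("NC_003403.1.fasta")             // hypothetical output fasta
download.outputReport = new File("GCF_000844745.1.report")  // hypothetical report path
download.nameHeader = Some("Sequence-Name")                 // prefix fasta headers with this report column
add(download)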
......@@ -49,5 +49,6 @@ object BiopetToolsExecutable extends BiopetExecutable {
nl.lumc.sasc.biopet.tools.VcfToTsv,
nl.lumc.sasc.biopet.tools.VcfWithVcf,
nl.lumc.sasc.biopet.tools.VepNormalizer,
-nl.lumc.sasc.biopet.tools.WipeReads)
+nl.lumc.sasc.biopet.tools.WipeReads,
+nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly)
}
package nl.lumc.sasc.biopet.tools
import java.io.{ File, PrintWriter }
import nl.lumc.sasc.biopet.utils.ToolCommand
import scala.io.Source
/**
* Created by pjvan_thof on 21-9-16.
*/
object DownloadNcbiAssembly extends ToolCommand {
case class Args(assemblyId: String = null,
outputFile: File = null,
reportFile: Option[File] = None,
contigNameHeader: Option[String] = None,
mustHaveOne: List[(String, String)] = List(),
mustNotHave: List[(String, String)] = List()) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[String]('a', "assembly id") required () unbounded () valueName "<file>" action { (x, c) =>
c.copy(assemblyId = x)
}
opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) =>
c.copy(outputFile = x)
}
opt[File]("report") unbounded () valueName "<file>" action { (x, c) =>
c.copy(reportFile = Some(x))
}
opt[String]("nameHeader") unbounded () valueName "<string>" action { (x, c) =>
c.copy(contigNameHeader = Some(x))
}
opt[(String, String)]("mustHaveOne") unbounded () valueName "<string>" action { (x, c) =>
c.copy(mustHaveOne = (x._1, x._2) :: c.mustHaveOne)
}
opt[(String, String)]("mustNotHave") unbounded () valueName "<string>" action { (x, c) =>
c.copy(mustNotHave = (x._1, x._2) :: c.mustNotHave)
}
}
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
val argsParser = new OptParser
val cmdargs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
logger.info(s"Reading ${cmdargs.assemblyId} from NCBI")
val reader = Source.fromURL(s"ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/${cmdargs.assemblyId}.assembly.txt")
val assemblyReport = reader.getLines().toList
reader.close()
// Optionally write the raw assembly report next to the fasta output
cmdargs.reportFile.foreach { file =>
val writer = new PrintWriter(file)
assemblyReport.foreach(writer.println)
writer.close()
}
// The last "#" comment line of the report holds the column names of the contig table
val headers = assemblyReport.filter(_.startsWith("#")).last.stripPrefix("# ").split("\t").zipWithIndex.toMap
val nameId = cmdargs.contigNameHeader.map(x => headers(x))
val lengthId = headers.get("Sequence-Length")
val baseUrlEutils = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
val fastaWriter = new PrintWriter(cmdargs.outputFile)
// Data rows of the report: one tab-separated record per contig
val allContigs = assemblyReport.filter(!_.startsWith("#"))
.map(_.split("\t"))
val totalLength = lengthId.map(id => allContigs.map(_.apply(id).toLong).sum)
logger.info(s"${allContigs.size} contigs found")
totalLength.foreach(l => logger.info(s"Total length: ${l}"))
// Keep contigs that match none of the mustNotHave pairs and, if mustHaveOne is given, at least one of those pairs
val filterContigs = allContigs
.filter(values => cmdargs.mustNotHave.forall(x => values(headers(x._1)) != x._2))
.filter(values => cmdargs.mustHaveOne.exists(x => values(headers(x._1)) == x._2) || cmdargs.mustHaveOne.isEmpty)
val filterLength = lengthId.map(id => filterContigs.map(_.apply(id).toLong).sum)
logger.info(s"${filterContigs.size} contigs left after filtering")
filterLength.foreach(l => logger.info(s"Filtered length: ${l}"))
filterContigs.foreach { values =>
// Column 6 is RefSeq-Accn, column 4 is GenBank-Accn; fall back to GenBank when no RefSeq accession exists
val id = if (values(6) == "na") values(4) else values(6)
logger.info(s"Start downloading ${id}")
val fastaReader = Source.fromURL(s"${baseUrlEutils}/efetch.fcgi?db=nuccore&id=${id}&retmode=text&rettype=fasta")
fastaReader.getLines()
// When --nameHeader is given, prefix each fasta header with the value from that report column
.map(x => nameId.map(y => x.replace(">", s">${values(y)} ")).getOrElse(x))
.foreach(fastaWriter.println)
fastaReader.close()
}
}
logger.info("Downloading complete")
fastaWriter.close()
}
}
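For reference, a hedged example of invoking the tool directly; it assumes network access to NCBI, reuses the assembly id from the test resource below, and the column=value form for --mustHaveOne follows the (String, String) option defined above:

object DownloadNcbiAssemblyExample {
  def main(args: Array[String]): Unit = {
    // Download only contigs whose Sequence-Role column equals "assembled-molecule"
    DownloadNcbiAssembly.main(Array(
      "-a", "GCF_000844745.1",
      "-o", "NC_003403.1.fasta",
      "--report", "GCF_000844745.1.report",
      "--mustHaveOne", "Sequence-Role=assembled-molecule"))
  }
}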
# Assembly name: ViralProj14444
# Organism name: Ageratum yellow vein betasatellite (viruses)
# Taxid: 185750
# BioProject: PRJNA14444
# Submitter: NCBI
# Date: 2000-6-14
# Assembly type: n/a
# Release type: major
# Assembly level: Complete Genome
# Genome representation: full
# RefSeq assembly accession: GCF_000844745.1
#
## Assembly-Units:
## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name
## GCF_000844735.1 Primary assembly
#
# Ordered by chromosome/plasmid; the chromosomes/plasmids are followed by
# unlocalized scaffolds.
# Unplaced scaffolds are listed at the end.
# RefSeq is equal or derived from GenBank object.
#
# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name
Unknown assembled-molecule Unknown Segment na <> NC_003403.1 Primary assembly 1347 na
\ No newline at end of file
>gi|18702122|ref|NC_003403.1| Ageratum yellow vein virus-associated DNA beta, complete genome
ACCGGTGGCGAGCGGGGAATTTGGGTTCCAGGTGGGACCCACACCTGGAACAGCCTGGAAAGAACGTGAG
TGGGCCAAATAAGTGGGCCGTTGAATTGGGCTTTTTATTCAAAACTATCTCAAATTTACGAAGCCCATTA
TTATTCATTTGAGAAAAATTACATTAAACATAGTCCGTATAAATATACATTTATACGGTTACATTCTTGT
ATACATGGTATTCGTCGTTTACTTGGATATCTATTACTGGAGCTTCCTGTTGCATCATGATATCTATTAT
TTCAACCATGTCTTCTGATTTGAACTCTTCTATTGTTGAATCTCTATACATGATCCTTAGTATGTTCTTG
ATACCTTCTTCTAAAGCATTAAAATTGAATGGTGGTATGATACCGTGATGTCCGTATGGAATGGAGAATT
TGTTCTTGACTAGTGCTGGTGACCTTGTTGAAAACACTTCTACGTCAACTCTGAATTTTATTTCTGATCT
GAACTTGACGTTGATGATGAACAATATTCCTTTATCGTTGGTATATGATATAGTCATGATTGTTTGATGG
AAATATATTCATTTGATGTGCATTTATAATTGGATGTGTCTGTTGTATGTGGTTGTGATTAATGATCGTG
TCAAATTTATGTTCTTGTATTAATTAGTGTATGTGTGACTGTATTGATGTTGATGTTTGTGATTATAATT
GAGAAAAGAAGATATTCATTCATTTGAATAAGAAAAGAAAGAAAAAAAAAAGAAAACTATATACATAAAA
AAAACGAATAATAAAAATAACTATCTATTCAGAAAAAATGGGAGCGCAGCGAAAACGAAAACAAAAATCG
ATATCAAAGAAAATCCTGATAAATTAAATAGACAGAATTGAAAAGGAAAAAAGTAAAAAAAGATAAAAGT
TATTACATTAAAAAAACTAAAGAAAAAAAAAATAAAAAAACTAAAAATGATAAATAAAAAAATGACGTCA
TCTGATGTCATCACAGTTAAAAACTATTTACTGCGGTTAAAATTTAATATAATTAACCACTTGGGTAAAT
ATATTGTCTCCAATAGGTAAATTGTCTCCAATATATCGGAGACATATTGGAGACTCTGGAGACACTTATA
TGCTAATTTCCCATTTTACCCTTACTAATGTGACTGTAAGGCGCGTGGGAGTGCTCTGAAAAAGTAGACC
TTCTCTCTCCAAAACTCCCCGGAGAAGACGATACAGGCTGATCCCGGCATCAATTCGCGACACGCGCGGC
GGTGTGTACCCCTGGGAGGGTAGAAACCACTACGCTACGCAGCAGCCTTAGCTACGCCGGAGCTTAGCTC
GCCACCGTAATAATATT
package nl.lumc.sasc.biopet.tools
import java.io.File
import java.nio.file.Paths
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.Test
import scala.io.Source
/**
* Created by pjvanthof on 03/10/16.
*/
class DownloadNcbiAssemblyTest extends TestNGSuite with Matchers {
private def resourcePath(p: String): String = {
Paths.get(getClass.getResource(p).toURI).toString
}
@Test
def testNC_003403_1(): Unit = {
val output = File.createTempFile("test.", ".fasta")
val outputReport = File.createTempFile("test.", ".report")
output.deleteOnExit()
outputReport.deleteOnExit()
DownloadNcbiAssembly.main(Array("-a", "GCF_000844745.1",
"-o", output.getAbsolutePath,
"--report", outputReport.getAbsolutePath))
Source.fromFile(output).getLines().toList shouldBe Source.fromFile(new File(resourcePath("/NC_003403.1.fasta"))).getLines().toList
Source.fromFile(outputReport).getLines().toList shouldBe Source.fromFile(new File(resourcePath("/GCF_000844745.1.report"))).getLines().toList
}
}
......@@ -41,7 +41,7 @@
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
-<artifactId>BiopetExtensions</artifactId>
+<artifactId>BiopetToolsExtensions</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
......
......@@ -25,18 +25,18 @@ import org.testng.annotations.Test
/**
* Created by pjvan_thof on 13-5-16.
*/
-class GenerateIndexesTest extends TestNGSuite with Matchers {
-def initPipeline(map: Map[String, Any]): GenerateIndexes = {
-new GenerateIndexes() {
+class DownloadGenomesTest extends TestNGSuite with Matchers {
+def initPipeline(map: Map[String, Any]): DownloadGenomes = {
+new DownloadGenomes() {
override def configNamespace = "generateindexes"
-override def globalConfig = new Config(ConfigUtils.mergeMaps(map, GenerateIndexesTest.config))
+override def globalConfig = new Config(ConfigUtils.mergeMaps(map, DownloadGenomesTest.config))
qSettings = new QSettings
qSettings.runName = "test"
}
}
@Test
-def testNoFastaUri: Unit = {
+def testNoFastaUri(): Unit = {
val pipeline = initPipeline(Map())
pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("" -> "")))
intercept[IllegalArgumentException] {
......@@ -45,57 +45,101 @@ class GenerateIndexesTest extends TestNGSuite with Matchers {
}
@Test
-def testSingleFasta: Unit = {
+def testNcbiAssembly(): Unit = {
+val pipeline = initPipeline(Map())
+pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("ncbi_assembly_id" -> "id")))
+pipeline.script()
+}
+@Test
+def testSingleFasta(): Unit = {
val pipeline = initPipeline(Map())
pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri")))
pipeline.script()
}
@Test
-def testMultiFasta: Unit = {
+def testMultiFasta(): Unit = {
val pipeline = initPipeline(Map())
pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> List("uri", "uri2", "uri3.gz"))))
pipeline.script()
}
@Test
-def testSingleDbsnp: Unit = {
-val pipeline = initPipeline(Map())
-pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "dbsnp_vcf_uri" -> "uri.vcf.gz")))
+def testSingleDbsnp(): Unit = {
+val pipeline = initPipeline(Map("download_annotations" -> true))
+pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri",
+"dbsnp" -> Map("version" -> Map("vcf_uri" -> "uri.vcf.gz")))))
pipeline.script()
}
@Test
-def testMultiDbsnp: Unit = {
-val pipeline = initPipeline(Map())
-pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "dbsnp_vcf_uri" -> List("uri.vcf.gz", "uri2.vcf.gz"))))
+def testContigMapDbsnp(): Unit = {
+val pipeline = initPipeline(Map("download_annotations" -> true))
+pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri",
+"dbsnp" -> Map("version" -> Map("vcf_uri" -> "uri.vcf.gz", "contig_map" -> Map("1" -> "chr1"))))))
pipeline.script()
}
@Test
-def testVep: Unit = {
-val pipeline = initPipeline(Map())
-pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "vep_cache_uri" -> "something/human_vep_80_hg19.tar.gz")))
+def testUnzippedContigMapDbsnp(): Unit = {
+val pipeline = initPipeline(Map("download_annotations" -> true))
+pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri",
+"dbsnp" -> Map("version" -> Map("vcf_uri" -> "uri.vcf", "contig_map" -> Map("1" -> "chr1"))))))
pipeline.script()
}
@Test
-def testGtfZipped: Unit = {
-val pipeline = initPipeline(Map())
-pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "gtf_uri" -> "bla.gf.gz")))
+def testSingleUnzippedDbsnp(): Unit = {
+val pipeline = initPipeline(Map("download_annotations" -> true))
+pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri",
+"dbsnp" -> Map("version" -> Map(("vcf_uri" -> "uri.vcf"))))))
pipeline.script()
}
@Test
-def testGtf: Unit = {
-val pipeline = initPipeline(Map())
-pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri", "gtf_uri" -> "bla.gf")))
+def testMultiDbsnp(): Unit = {
+val pipeline = initPipeline(Map("download_annotations" -> true))
+pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri",
+"dbsnp" -> Map("version" -> Map("vcf_uri" -> List("uri.vcf.gz", "uri2.vcf.gz"))))))
pipeline.script()
}
+@Test
+def testVep(): Unit = {
+val pipeline = initPipeline(Map("download_annotations" -> true))
+pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri",
+"vep" -> Map("version" -> Map("cache_uri" -> "something/human_vep_80_hg19.tar.gz")))))
+pipeline.script()
+}
+@Test
+def testGtfZipped(): Unit = {
+val pipeline = initPipeline(Map("download_annotations" -> true))
+pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri",
+"gene_annotation" -> Map("version" -> Map("gtf_uri" -> "bla.gf.gz")))))
+pipeline.script()
+}
+@Test
+def testGtf(): Unit = {
+val pipeline = initPipeline(Map("download_annotations" -> true))
+pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri",
+"gene_annotation" -> Map("version" -> Map("gtf_uri" -> "bla.gf")))))
+pipeline.script()
+}
+@Test
+def testGff(): Unit = {
+val pipeline = initPipeline(Map("download_annotations" -> true))
+pipeline.referenceConfig = Map("s1" -> Map("g1" -> Map("fasta_uri" -> "uri",
+"gene_annotation" -> Map("version" -> Map("gff_uri" -> "bla.gf")))))
+pipeline.script()
+}
}
-object GenerateIndexesTest {
+object DownloadGenomesTest {
val outputDir = Files.createTempDir()
outputDir.deleteOnExit()
......@@ -109,6 +153,7 @@ object GenerateIndexesTest {
"samtools" -> Map("exe" -> "test"),
"md5sum" -> Map("exe" -> "test"),
"gatk_jar" -> "test",
"tabix" -> Map("exe" -> "test")
"tabix" -> Map("exe" -> "test"),
"gffread" -> Map("exe" -> "test")
)
}
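Taken together, the updated tests describe a nested, versioned annotation layout; a hedged sketch of the reference config a DownloadGenomes run could consume (keys mirror the test maps above, names and URIs are placeholders):

val referenceConfig = Map("my_species" -> Map("my_genome" -> Map(
  "fasta_uri" -> "http://example.org/genome.fa.gz",
  "dbsnp" -> Map("147" -> Map(
    "vcf_uri" -> "http://example.org/dbsnp.vcf.gz",
    "contig_map" -> Map("1" -> "chr1"))),
  "gene_annotation" -> Map("release_85" -> Map("gtf_uri" -> "http://example.org/genes.gtf.gz")),
  "vep" -> Map("85" -> Map("cache_uri" -> "http://example.org/vep_cache.tar.gz")))))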