Commit 488594e1 authored by Wai Yi Leung's avatar Wai Yi Leung
Browse files

Merge branch 'feature-generate_indexes' into 'develop'

Feature generate indexes

see also #187 

See merge request !211
parents 1921f894 244653f5
......@@ -31,12 +31,14 @@ class Cat(val root: Configurable) extends BiopetCommandLineFunction {
@Output(doc = "Unzipped file", required = true)
var output: File = _
var appending = false
executable = config("exe", default = "cat")
/** return commandline to execute */
def cmdLine = required(executable) +
(if (inputAsStdin) "" else repeat(input)) +
(if (outputAsStsout) "" else " > " + required(output))
(if (outputAsStsout) "" else (if (appending) " >> " else " > ") + required(output))
}
/**
......
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.Output
/**
 * Command line wrapper around `curl` that redirects the program's standard
 * output into [[output]].
 *
 * Created by pjvan_thof on 8/11/15.
 */
class Curl(val root: Configurable) extends BiopetCommandLineFunction with Version {

  /** File that the shell redirect (`>`) in [[cmdLine]] writes to. */
  @Output
  var output: File = _

  /** Value handed to curl as its only argument. */
  var url: String = _

  // The executable is read from the "exe" config key and defaults to "curl".
  executable = config("exe", default = "curl")

  /** Command used to query the curl version. */
  def versionCommand = s"$executable --version"

  /** Pattern whose first group captures the dotted version from the version output. */
  def versionRegex = """curl (\w+\.\w+\.\w+) .*""".r

  /** Concatenates the executable, the URL and a ` > ` redirect to [[output]]. */
  def cmdLine: String = {
    val fetch = required(executable) + required(url)
    val destination = required(output)
    fetch + " > " + destination
  }
}
......@@ -62,7 +62,7 @@ class Star(val root: Configurable) extends BiopetCommandLineFunction with Refere
var genomeDir: File = null
var runmode: String = _
var sjdbOverhang: Int = _
var sjdbOverhang: Option[Int] = None
var outFileNamePrefix: String = _
var runThreadN: Option[Int] = config("runThreadN")
......@@ -73,24 +73,24 @@ class Star(val root: Configurable) extends BiopetCommandLineFunction with Refere
override def beforeGraph() {
super.beforeGraph()
if (reference == null) reference = referenceFasta()
genomeDir = config("genomeDir", new File(reference.getAbsoluteFile.getParent, "star"))
if (outFileNamePrefix != null && !outFileNamePrefix.endsWith(".")) outFileNamePrefix += "."
val prefix = if (outFileNamePrefix != null) outputDir + File.separator + outFileNamePrefix else outputDir + File.separator
if (runmode == null) {
outputSam = new File(prefix + "Aligned.out.sam")
outputTab = new File(prefix + "SJ.out.tab")
genomeDir = config("genomeDir", new File(reference.getAbsoluteFile.getParent, "star"))
} else if (runmode == "genomeGenerate") {
genomeDir = outputDir
outputGenome = new File(prefix + "Genome")
outputSA = new File(prefix + "SA")
outputSAindex = new File(prefix + "SAindex")
sjdbOverhang = config("sjdboverhang", 75)
sjdbOverhang = config("sjdboverhang")
}
}
/** Returns command to execute */
def cmdLine = {
var cmd: String = required("cd", outputDir) + "&&" + required(executable)
var cmd: String = required("cd", outputDir) + " && " + required(executable)
if (runmode != null && runmode == "genomeGenerate") { // Create index
cmd += required("--runMode", runmode) +
required("--genomeFastaFiles", reference)
......@@ -100,8 +100,8 @@ class Star(val root: Configurable) extends BiopetCommandLineFunction with Refere
cmd += required("--genomeDir", genomeDir) +
optional("--sjdbFileChrStartEnd", sjdbFileChrStartEnd) +
optional("--runThreadN", threads) +
optional("--outFileNamePrefix", outFileNamePrefix)
if (sjdbOverhang > 0) cmd += optional("--sjdbOverhang", sjdbOverhang)
optional("--outFileNamePrefix", outFileNamePrefix) +
optional("--sjdbOverhang", sjdbOverhang)
cmd
}
......
......@@ -66,6 +66,7 @@ class Tabix(val root: Configurable) extends BiopetCommandLineFunction with Versi
private val validFormats: Set[String] = Set("gff", "bed", "sam", "vcf", "psltbl")
override def beforeGraph(): Unit = {
super.beforeGraph()
p match {
case Some(fmt) =>
require(validFormats.contains(fmt), "-p flag must be one of " + validFormats.mkString(", "))
......
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Argument, Input }
/**
 * Command line wrapper that runs `tar -x -f <inputTar> --directory <outputDir>`.
 *
 * Created by pjvan_thof on 8/11/15.
 */
class TarExtract(val root: Configurable) extends BiopetCommandLineFunction with Version {

  /** Archive handed to tar's `-f` option. */
  @Input(required = true)
  var inputTar: File = _

  /** Directory handed to tar's `--directory` option. */
  @Argument(required = true)
  var outputDir: File = _

  // The executable is read from the "exe" config key and defaults to "tar".
  executable = config("exe", default = "tar", freeVar = false)

  /** Command used to query the tar version. */
  def versionCommand = executable + " --version"

  /** Pattern that captures the version from a `tar (GNU tar) <version>` line. */
  def versionRegex = """tar \(GNU tar\) (.*)""".r

  /**
   * Points the job's local directory and its output log at [[outputDir]].
   *
   * Declared with `()` because it is side-effecting, matching the
   * `beforeGraph()` overrides of the other extensions.
   */
  override def beforeGraph(): Unit = {
    super.beforeGraph()
    jobLocalDir = outputDir
    // Hidden log file named after the archive, stored next to the extracted content.
    jobOutputFile = new File(outputDir, "." + inputTar.getName + ".tar.out")
  }

  /** Returns command to execute */
  def cmdLine: String = required(executable) +
    required("-x") +
    required("-f", inputTar) +
    required("--directory", outputDir)
}
......@@ -29,6 +29,8 @@ class Zcat(val root: Configurable) extends BiopetCommandLineFunction with Versio
@Output(doc = "Unzipped file", required = true)
var output: File = _
var appending = false
executable = config("exe", default = "zcat")
def versionRegex = """zcat \(gzip\) (.*)""".r
......@@ -37,7 +39,7 @@ class Zcat(val root: Configurable) extends BiopetCommandLineFunction with Versio
/** Returns command to execute */
def cmdLine = required(executable) +
(if (inputAsStdin) "" else repeat(input)) +
(if (outputAsStsout) "" else " > " + required(output))
(if (outputAsStsout) "" else (if (appending) " >> " else " > ") + required(output))
}
object Zcat {
......
......@@ -13,7 +13,7 @@
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions
package nl.lumc.sasc.biopet.extensions.bowtie
import java.io.File
......
package nl.lumc.sasc.biopet.extensions
package nl.lumc.sasc.biopet.extensions.bowtie
import java.io.File
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Reference, Version }
import nl.lumc.sasc.biopet.core.{BiopetCommandLineFunction, Reference, Version}
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import org.broadinstitute.gatk.utils.commandline.{Input, Output}
/**
* Extension for bowtie 2
......
package nl.lumc.sasc.biopet.extensions.bowtie
import java.io.File
import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Argument, Input }
/**
 * Command line wrapper for `bowtie2-build`; the command changes into the
 * directory of the reference before running the executable.
 *
 * Created by pjvan_thof on 8/15/15.
 */
class Bowtie2Build(val root: Configurable) extends BiopetCommandLineFunction with Version {

  /** Reference file passed to the executable as first argument. */
  @Input(required = true)
  var reference: File = _

  /** Base name passed as second argument; also the prefix of the declared output files. */
  @Argument(required = true)
  var baseName: String = _

  // The executable is read from the "exe" config key and defaults to "bowtie2-build".
  executable = config("exe", default = "bowtie2-build", freeVar = false)

  /** Pattern that captures a dotted `x.y.z` version. */
  def versionRegex = """.*[Vv]ersion:? (\d*\.\d*\.\d*)""".r

  /** Command used to query the version. */
  def versionCommand = executable + " --version"

  override def defaultCoreMemory = 15.0

  /**
   * Registers `<baseName>.1.bt2` and `<baseName>.2.bt2` next to the reference
   * as output files.
   *
   * Calls `super.beforeGraph()` first, like the other extensions do, so that
   * set-up done by parent classes or mixed-in traits is not skipped.
   */
  override def beforeGraph(): Unit = {
    super.beforeGraph()
    val indexDir = reference.getParentFile
    outputFiles ::= new File(indexDir, baseName + ".1.bt2")
    outputFiles ::= new File(indexDir, baseName + ".2.bt2")
  }

  /** Returns command to execute */
  def cmdLine = required("cd", reference.getParentFile) + " && " +
    required(executable) +
    required(reference) +
    required(baseName)
}
package nl.lumc.sasc.biopet.extensions.bowtie
import java.io.File
import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Argument, Input }
/**
 * Command line wrapper for `bowtie-build`; the command changes into the
 * directory of the reference before running the executable.
 *
 * Created by pjvan_thof on 8/15/15.
 */
class BowtieBuild(val root: Configurable) extends BiopetCommandLineFunction with Version {

  /** Reference file passed to the executable as first argument. */
  @Input(required = true)
  var reference: File = _

  /** Base name passed as second argument; also the prefix of the declared output files. */
  @Argument(required = true)
  var baseName: String = _

  // The executable is read from the "exe" config key and defaults to "bowtie-build".
  executable = config("exe", default = "bowtie-build", freeVar = false)

  /** Pattern that captures a dotted `x.y.z` version. */
  def versionRegex = """.*[Vv]ersion:? (\d*\.\d*\.\d*)""".r

  /** Command used to query the version. */
  def versionCommand = executable + " --version"

  override def defaultCoreMemory = 15.0

  /**
   * Registers `<baseName>.1.ebwt` and `<baseName>.2.ebwt` next to the
   * reference as output files.
   *
   * Calls `super.beforeGraph()` first, like the other extensions do, so that
   * set-up done by parent classes or mixed-in traits is not skipped.
   */
  override def beforeGraph(): Unit = {
    super.beforeGraph()
    val indexDir = reference.getParentFile
    outputFiles ::= new File(indexDir, baseName + ".1.ebwt")
    outputFiles ::= new File(indexDir, baseName + ".2.ebwt")
  }

  /** Returns command to execute */
  def cmdLine = required("cd", reference.getParentFile) + " && " +
    required(executable) +
    required(reference) +
    required(baseName)
}
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.bwa
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
 * Extension for bwa index
 *
 * Based on version 0.7.12-r1039
 *
 * Created by pjvan_thof on 1/16/15.
 */
class BwaIndex(val root: Configurable) extends Bwa {

  /** Reference sequence that gets indexed; passed as the last argument. */
  @Input(doc = "Reference fasta file", required = true)
  var reference: File = _

  /** Index files declared as job output; filled in by [[beforeGraph]]. */
  @Output(doc = "Index files for bwa", required = false)
  private var output: List[File] = Nil

  /** Value for the `-a` option. */
  var a: Option[String] = config("a", freeVar = false)

  /** Value for the `-p` option. */
  var p: Option[String] = config("p", freeVar = false)

  /**
   * Value for the `-b` option.
   *
   * The value used to be read from the config key "e", which does not match
   * the option name. The key "b" now takes precedence; "e" is still honoured
   * so existing configurations keep working.
   */
  var b: Option[Int] = {
    val preferred: Option[Int] = config("b", freeVar = false)
    val legacy: Option[Int] = config("e", freeVar = false)
    preferred.orElse(legacy)
  }

  /** Emits the `-6` flag when true. */
  var _6: Boolean = config("6", default = false, freeVar = false)

  override def defaultCoreMemory = 35.0

  /**
   * Declares `<reference>.sa` and `<reference>.pac` as output files.
   *
   * NOTE(review): the declared paths are always derived from the reference
   * path, also when `-p` or `-6` is set — confirm against the bwa manual
   * whether those options change the names of the generated files.
   */
  override def beforeGraph() {
    super.beforeGraph()
    val indexFiles = List(".sa", ".pac").map(ext => new File(reference.getAbsolutePath + ext))
    // Reversed so the resulting order equals prepending the files one by one.
    output = (indexFiles.reverse ::: output).distinct
  }

  /** Returns command to execute */
  def cmdLine = required(executable) +
    required("index") +
    optional("-a", a) +
    optional("-p", p) +
    optional("-b", b) +
    conditional(_6, "-6") +
    required(reference)
}
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.gmap
import java.io.File
import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction, Reference }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.Input
/**
 * Wrapper for the gmap_build command line tool
 * Written based on gsnap version 2014-05-15
 */
class GmapBuild(val root: Configurable) extends BiopetCommandLineFunction with Reference with Version {

  /** default executable */
  executable = config("exe", default = "gmap_build", freeVar = false)

  /** input file */
  @Input(doc = "Input fasta files", required = true)
  var fastaFiles: List[File] = Nil

  /** genome directory */
  var dir: File = _

  /** genome database */
  var db: String = _

  override def defaultCoreMemory = 25.0

  /** Pattern that captures everything after " version " in the tool's output. */
  def versionRegex = """.* version (.*)""".r

  /** The version is obtained by running the bare executable, without arguments. */
  def versionCommand = executable

  /** Exit codes accepted when running [[versionCommand]]. */
  override def versionExitcode = List(0, 1, 255)

  /**
   * Writes the job log to `.log.out` inside [[dir]].
   *
   * Declared with `()` because it is side-effecting, matching the
   * `beforeGraph()` overrides of the other extensions.
   */
  override def beforeGraph(): Unit = {
    super.beforeGraph()
    jobOutputFile = new File(dir, ".log.out")
  }

  /** Returns command to execute */
  def cmdLine = {
    required(executable) +
      required("--dir", dir) +
      optional("--db", db) +
      repeat(fastaFiles)
  }
}
......@@ -13,7 +13,7 @@
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions
package nl.lumc.sasc.biopet.extensions.gmap
import java.io.File
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.picard
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/** Extension for picard CreateSequenceDictionary */
class CreateSequenceDictionary(val root: Configurable) extends Picard {

  // classOf yields the same class name without constructing a Picard tool instance.
  javaMainClass = classOf[picard.sam.CreateSequenceDictionary].getName

  /** File passed as `REFERENCE=`. */
  @Input(required = true)
  var reference: File = _

  /** File passed as `OUTPUT=`. */
  @Output(required = true)
  var output: File = _

  /** Optional value for `GENOME_ASSEMBLY=`. */
  var genomeAssembly: Option[String] = config("genomeAssembly")

  /** Optional value for `URI=`. */
  var uri: Option[String] = config("uri")

  /** Optional value for `SPECIES=`. */
  var species: Option[String] = config("species")

  /**
   * When true, `TRUNCATE_NAMES_AT_WHITESPACE=true` is added to the command.
   *
   * NOTE(review): a false value only omits the argument, so Picard's own
   * default applies — confirm in the Picard documentation that this is the
   * intended way to switch truncation off.
   */
  var truncateAtWhiteSpace: Boolean = config("truncateAtWhiteSpace", default = false)

  /** Optional value for `NUM_SEQUENCES=`. */
  var numSequences: Option[Int] = config("numSequences")

  /** Returns command to execute */
  override def cmdLine = super.cmdLine +
    required("REFERENCE=", reference, spaceSeparated = false) +
    required("OUTPUT=", output, spaceSeparated = false) +
    optional("GENOME_ASSEMBLY=", genomeAssembly, spaceSeparated = false) +
    optional("URI=", uri, spaceSeparated = false) +
    optional("SPECIES=", species, spaceSeparated = false) +
    conditional(truncateAtWhiteSpace, "TRUNCATE_NAMES_AT_WHITESPACE=true") +
    optional("NUM_SEQUENCES=", numSequences, spaceSeparated = false)
}
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.samtools
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/** Extension for samtools faidx */
class SamtoolsFaidx(val root: Configurable) extends Samtools {

  /** File handed to `samtools faidx`. */
  @Input(doc = "Fasta file")
  var input: File = _

  /** Index file, `<input>.fai` next to the input; set in [[beforeGraph]]. */
  @Output(doc = "output File")
  private var _output: File = _

  /** Read-only view on the index file. */
  def output = _output

  /**
   * Derives the index file from [[input]].
   *
   * Declared with `()` because it is side-effecting, matching the
   * `beforeGraph()` overrides of the other extensions.
   */
  override def beforeGraph(): Unit = {
    super.beforeGraph()
    _output = new File(input.getParentFile, input.getName + ".fai")
  }

  /** Returns command to execute */
  def cmdLine = required(executable) + required("faidx") + required(input)
}
object SamtoolsFaidx {

  /**
   * Creates a faidx job for `input` and presets its index file to
   * `<input>.fai` in the same directory as the input.
   *
   * @param root  configurable passed to the new job
   * @param input file to run faidx on
   * @return the configured job
   */
  def apply(root: Configurable, input: File): SamtoolsFaidx = {
    val job = new SamtoolsFaidx(root)
    job.input = input
    val indexName = input.getName + ".fai"
    job._output = new File(input.getParentFile, indexName)
    job
  }
}
\ No newline at end of file
......@@ -16,6 +16,7 @@
package nl.lumc.sasc.biopet.extensions
import nl.lumc.sasc.biopet.utils.config.Config
import nl.lumc.sasc.biopet.extensions.gmap.Gsnap
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.SkipException
......
<?xml version="1.0" encoding="UTF-8"?>
<!--
Biopet is built on top of GATK Queue for building bioinformatic
pipelines. It is mainly intended to support LUMC SHARK cluster which is running
SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
should also be able to execute Biopet tools and pipelines.
Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
Contact us at: sasc@lumc.nl
A dual licensing mode is applied. The source code within this project that are
not part of GATK Queue is freely available for non-commercial use under an AGPL
license; For commercial users or users who do not want to follow the AGPL
license, please contact us to obtain a separate license.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<artifactId>GenerateIndexes</artifactId>
<packaging>jar</packaging>
<parent>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Biopet</artifactId>
<version>0.6.0-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>
<inceptionYear>2014</inceptionYear>
<name>GenerateIndexes</name>
<dependencies>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetCore</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetExtensions</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines
import java.io.PrintWriter
import java.util
import nl.lumc.sasc.biopet.core.extensions.Md5sum
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.extensions.bowtie.{ Bowtie2Build, BowtieBuild }
import nl.lumc.sasc.biopet.extensions.bwa.BwaIndex
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
import nl.lumc.sasc.biopet.extensions.gmap.GmapBuild
import nl.lumc.sasc.biopet.extensions.picard.CreateSequenceDictionary
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFaidx