Commit 827de641 authored by bow's avatar bow

Merge branch 'feature-basty_generate_fasta' into 'develop'

Feature basty generate fasta

for #31

See merge request !51
parents 109ff2b3 cbcd5dd0
......@@ -37,6 +37,7 @@ object BiopetExecutable extends Logging {
nl.lumc.sasc.biopet.tools.SageCountFastq,
nl.lumc.sasc.biopet.tools.SageCreateLibrary,
nl.lumc.sasc.biopet.tools.SageCreateTagCounts,
nl.lumc.sasc.biopet.tools.BastyGenerateFasta,
nl.lumc.sasc.biopet.tools.MergeAlleles,
nl.lumc.sasc.biopet.tools.SamplesTsvToJson)
)
......
......@@ -9,6 +9,8 @@ trait BiopetJavaCommandLineFunction extends JavaCommandLineFunction with BiopetC
javaGCHeapFreeLimit = config("java_gc_heap_freelimit")
javaGCTimeLimit = config("java_gc_timelimit")
override def javaOpts = super.javaOpts + optional("-Dscala.concurrent.context.numThreads=", threads, spaceSeparated = false, escape = false)
override def afterGraph {
memoryLimit = config("memory_limit")
}
......
......@@ -7,9 +7,9 @@ import java.io.File
class Raxml(val root: Configurable) extends BiopetCommandLineFunction {
override val defaultThreads = 4
override val defaultThreads = 1
override def versionCommand = executable + " -v"
override val versionRegex = """.*version \w* .*""".r
override val versionRegex = """.*version ([\w\.]*) .*""".r
@Input(doc = "Input phy/fasta file", required = true)
var input: File = _
......@@ -32,8 +32,8 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction {
@Argument(doc = "Name of output files", required = true)
var f: String = "d"
@Argument(doc = "Output directory", required = false)
var w: String = jobLocalDir.getAbsolutePath
@Argument(doc = "Output directory", required = true)
var w: String = _
@Input(required = false)
var t: File = _
......@@ -44,18 +44,35 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction {
@Output(doc = "Output files", required = false)
private var out: List[File] = Nil
executable = config("exe", default = "raxmlHPC")
var executableNonThreads: String = config("exe", default = "raxmlHPC")
var executableThreads: String = config("exe_pthreads")
override def afterGraph {
if (threads == 0) threads = getThreads(defaultThreads)
executable = if (threads > 1 && executableThreads != null) executableThreads else executableNonThreads
super.afterGraph
out +:= getInfoFile
f match {
case "d" if b.isEmpty => out +:= getBestTree
case "d" if b.isDefined => out +:= getBootstrap
case "d" if b.isEmpty => {
out +:= getBestTreeFile
for (t <- 0 until N.getOrElse(1)) {
out +:= new File(w + File.separator + "RAxML_log." + n + ".RUN." + t)
out +:= new File(w + File.separator + "RAxML_parsimonyTree." + n + ".RUN." + t)
out +:= new File(w + File.separator + "RAxML_result." + n + ".RUN." + t)
}
}
case "d" if b.isDefined => out +:= getBootstrapFile
case "b" => {
out +:= new File(w + File.separator + "RAxML_bipartitionsBranchLabels." + n)
out +:= new File(w + File.separator + "RAxML_bipartitions." + n)
}
case _ =>
}
}
def getBestTree: File = new File(w + File.separator + "RAxML_bestTree." + n)
def getBootstrap: File = new File(w + File.separator + "RAxML_bootstrap." + n)
def getBestTreeFile: File = new File(w + File.separator + "RAxML_bestTree." + n)
def getBootstrapFile: File = new File(w + File.separator + "RAxML_bootstrap." + n)
def getInfoFile: File = new File(w + File.separator + "RAxML_info." + n)
def cmdLine = required(executable) +
required("-m", m) +
......@@ -63,9 +80,10 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction {
optional("-p", p) +
optional("-b", b) +
optional("-N", N) +
optional("-n", n) +
optional("-w", w) +
optional("-f", f) +
optional("-t", t) +
optional("-z", z) +
(if (threads > 1) required("-T", threads) else "")
}
\ No newline at end of file
required("-T", threads)
}
package nl.lumc.sasc.biopet.pipelines.basty
import java.io.File
import nl.lumc.sasc.biopet.core.MultiSampleQScript
import nl.lumc.sasc.biopet.core.PipelineCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.Cat
import nl.lumc.sasc.biopet.extensions.Raxml
import nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline
import nl.lumc.sasc.biopet.tools.BastyGenerateFasta
import org.broadinstitute.gatk.queue.QScript
class Basty(val root: Configurable) extends QScript with MultiSampleQScript {
......@@ -12,15 +16,18 @@ class Basty(val root: Configurable) extends QScript with MultiSampleQScript {
class LibraryOutput extends AbstractLibraryOutput {
}
case class FastaOutput(variants: File, consensus: File, consensusVariants: File)
class SampleOutput extends AbstractSampleOutput {
var output: FastaOutput = _
var outputSnps: FastaOutput = _
}
defaults ++= Map("ploidy" -> 1, "use_haplotypecaller" -> false, "use_unifiedgenotyper" -> true)
defaults ++= Map("ploidy" -> 1, "use_haplotypecaller" -> false, "use_unifiedgenotyper" -> true, "joint_variantcalling" -> true)
var gatkPipeline: GatkPipeline = _
var gatkPipeline: GatkPipeline = new GatkPipeline(this)
gatkPipeline.jointVariantcalling = true
def init() {
gatkPipeline = new GatkPipeline(this)
gatkPipeline.outputDir = outputDir
gatkPipeline.init
}
......@@ -29,17 +36,82 @@ class Basty(val root: Configurable) extends QScript with MultiSampleQScript {
gatkPipeline.biopetScript
addAll(gatkPipeline.functions)
val refVariants = addGenerateFasta(null, outputDir + "reference/", outputName = "reference")
val refVariantSnps = addGenerateFasta(null, outputDir + "reference/", outputName = "reference", snpsOnly = true)
runSamplesJobs()
val catVariants = Cat(this, refVariants.variants :: samplesOutput.map(_._2.output.variants).toList, outputDir + "fastas/variant.fasta")
add(catVariants)
val catVariantsSnps = Cat(this, refVariantSnps.variants :: samplesOutput.map(_._2.outputSnps.variants).toList, outputDir + "fastas/variant.snps_only.fasta")
add(catVariantsSnps)
val catConsensus = Cat(this, refVariants.consensus :: samplesOutput.map(_._2.output.consensus).toList, outputDir + "fastas/consensus.fasta")
add(catConsensus)
val catConsensusSnps = Cat(this, refVariantSnps.consensus :: samplesOutput.map(_._2.outputSnps.consensus).toList, outputDir + "fastas/consensus.snps_only.fasta")
add(catConsensusSnps)
val catConsensusVariants = Cat(this, refVariants.consensusVariants :: samplesOutput.map(_._2.output.consensusVariants).toList, outputDir + "fastas/consensus.variant.fasta")
add(catConsensusVariants)
val catConsensusVariantsSnps = Cat(this, refVariantSnps.consensusVariants :: samplesOutput.map(_._2.outputSnps.consensusVariants).toList, outputDir + "fastas/consensus.variant.snps_only.fasta")
add(catConsensusVariantsSnps)
val seed: Int = config("seed", default = 12345)
def addRaxml(input: File, outputDir: String, outputName: String) {
val raxmlMl = new Raxml(this)
raxmlMl.input = input
raxmlMl.m = config("raxml_ml_model", default = "GTRGAMMAX")
raxmlMl.p = seed
raxmlMl.n = outputName + "_ml"
raxmlMl.w = outputDir
raxmlMl.N = config("ml_runs", default = 20, submodule = "raxml")
add(raxmlMl)
val r = new scala.util.Random(seed)
val numBoot = config("boot_runs", default = 100, submodule = "raxml").getInt
val bootList = for (t <- 0 until numBoot) yield {
val raxmlBoot = new Raxml(this)
raxmlBoot.threads = 1
raxmlBoot.input = input
raxmlBoot.m = config("raxml_ml_model", default = "GTRGAMMAX")
raxmlBoot.p = seed
raxmlBoot.b = math.abs(r.nextInt)
raxmlBoot.w = outputDir
raxmlBoot.N = 1
raxmlBoot.n = outputName + "_boot_" + t
add(raxmlBoot)
raxmlBoot.getBootstrapFile
}
val cat = Cat(this, bootList.toList, outputDir + "/boot_list")
add(cat)
val raxmlBi = new Raxml(this)
raxmlBi.input = input
raxmlBi.t = raxmlMl.getBestTreeFile
raxmlBi.z = cat.output
raxmlBi.m = config("raxml_ml_model", default = "GTRGAMMAX")
raxmlBi.p = seed
raxmlBi.f = "b"
raxmlBi.n = outputName + "_bi"
raxmlBi.w = outputDir
add(raxmlBi)
}
addRaxml(catVariantsSnps.output, outputDir + "raxml", "snps")
}
// Called for each sample
def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = {
val sampleOutput = new SampleOutput
val sampleID: String = sampleConfig("ID").toString
val sampleDir = globalSampleDir + sampleID
val sampleDir = globalSampleDir + sampleID + "/"
sampleOutput.libraries = runLibraryJobs(sampleConfig)
sampleOutput.output = addGenerateFasta(sampleID, sampleDir)
sampleOutput.outputSnps = addGenerateFasta(sampleID, sampleDir, snpsOnly = true)
return sampleOutput
}
......@@ -53,6 +125,23 @@ class Basty(val root: Configurable) extends QScript with MultiSampleQScript {
return libraryOutput
}
def addGenerateFasta(sampleName: String, outputDir: String, outputName: String = null,
snpsOnly: Boolean = false): FastaOutput = {
val bastyGenerateFasta = new BastyGenerateFasta(this)
bastyGenerateFasta.outputName = if (outputName != null) outputName else sampleName
bastyGenerateFasta.inputVcf = gatkPipeline.multisampleVariantcalling.scriptOutput.finalVcfFile
if (gatkPipeline.samplesOutput.contains(sampleName)) {
bastyGenerateFasta.bamFile = gatkPipeline.samplesOutput(sampleName).variantcalling.bamFiles.head
}
bastyGenerateFasta.outputVariants = outputDir + bastyGenerateFasta.outputName + ".variants" + (if (snpsOnly) ".snps_only" else "") + ".fasta"
bastyGenerateFasta.outputConsensus = outputDir + bastyGenerateFasta.outputName + ".consensus" + (if (snpsOnly) ".snps_only" else "") + ".fasta"
bastyGenerateFasta.outputConsensusVariants = outputDir + bastyGenerateFasta.outputName + ".consensus_variants" + (if (snpsOnly) ".snps_only" else "") + ".fasta"
bastyGenerateFasta.sampleName = sampleName
bastyGenerateFasta.snpsOnly = snpsOnly
add(bastyGenerateFasta)
return FastaOutput(bastyGenerateFasta.outputVariants, bastyGenerateFasta.outputConsensus, bastyGenerateFasta.outputConsensusVariants)
}
}
object Basty extends PipelineCommand
......@@ -59,6 +59,11 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
else if (!outputDir.endsWith("/")) outputDir += "/"
}
val multisampleVariantcalling = new GatkVariantcalling(this) {
override protected lazy val configName = "gatkvariantcalling"
override def configPath: List[String] = "multisample" :: super.configPath
}
def biopetScript() {
if (onlySample.isEmpty) {
runSamplesJobs
......@@ -92,23 +97,19 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
val cvRaw = CombineVariants(this, allRawVcfFiles.toList, outputDir + "variantcalling/multisample.raw.vcf.gz")
add(cvRaw)
val gatkVariantcalling = new GatkVariantcalling(this) {
override protected lazy val configName = "gatkvariantcalling"
override def configPath: List[String] = "multisample" :: super.configPath
}
gatkVariantcalling.preProcesBams = Some(false)
gatkVariantcalling.doublePreProces = Some(false)
gatkVariantcalling.inputBams = allBamfiles.toList
gatkVariantcalling.rawVcfInput = cvRaw.out
gatkVariantcalling.outputDir = outputDir + "variantcalling"
gatkVariantcalling.outputName = "multisample"
gatkVariantcalling.init
gatkVariantcalling.biopetScript
addAll(gatkVariantcalling.functions)
multisampleVariantcalling.preProcesBams = false
multisampleVariantcalling.doublePreProces = false
multisampleVariantcalling.inputBams = allBamfiles.toList
multisampleVariantcalling.rawVcfInput = cvRaw.out
multisampleVariantcalling.outputDir = outputDir + "variantcalling"
multisampleVariantcalling.outputName = "multisample"
multisampleVariantcalling.init
multisampleVariantcalling.biopetScript
addAll(multisampleVariantcalling.functions)
if (config("inputtype", default = "dna").getString != "rna" && config("recalibration", default = false).getBoolean) {
val recalibration = new GatkVariantRecalibration(this)
recalibration.inputVcf = gatkVariantcalling.scriptOutput.finalVcfFile
recalibration.inputVcf = multisampleVariantcalling.scriptOutput.finalVcfFile
recalibration.bamFiles = finalBamFiles
recalibration.outputDir = outputDir + "recalibration/"
recalibration.init
......@@ -134,10 +135,10 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
val gatkVariantcalling = new GatkVariantcalling(this)
gatkVariantcalling.inputBams = libraryBamfiles
gatkVariantcalling.outputDir = sampleDir + "/variantcalling/"
gatkVariantcalling.preProcesBams = Some(false)
gatkVariantcalling.preProcesBams = false
if (!singleSampleCalling) {
gatkVariantcalling.useHaplotypecaller = Some(false)
gatkVariantcalling.useUnifiedGenotyper = Some(false)
gatkVariantcalling.useHaplotypecaller = false
gatkVariantcalling.useUnifiedGenotyper = false
}
gatkVariantcalling.sampleID = sampleID
gatkVariantcalling.init
......
......@@ -8,7 +8,7 @@ import nl.lumc.sasc.biopet.extensions.gatk.{ AnalyzeCovariates, BaseRecalibrator
import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile
import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument }
import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
import scala.collection.SortedMap
import scala.language.reflectiveCalls
......@@ -41,25 +41,32 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr
var useHaplotypecaller: Option[Boolean] = config("use_haplotypecaller", default = true)
var useUnifiedGenotyper: Option[Boolean] = config("use_unifiedgenotyper", default = false)
var useAllelesOption: Option[Boolean] = config("use_alleles_option", default = false)
var useIndelRealigner: Boolean = config("use_indel_realign", default = true)
var useBaseRecalibration: Boolean = config("use_base_recalibration", default = true)
def init() {
if (outputName == null && sampleID != null) outputName = sampleID
else if (outputName == null) outputName = "noname"
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
val baseRecalibrator = new BaseRecalibrator(this)
if (preProcesBams && useBaseRecalibration && baseRecalibrator.knownSites.isEmpty) {
logger.warn("No Known site found, skipping base recalibration")
useBaseRecalibration = false
}
}
private def doublePreProces(files: List[File]): List[File] = {
if (files.size == 1) return files
if (files.isEmpty) throw new IllegalStateException("Files can't be empty")
if (!doublePreProces.get) return files
val markDub = MarkDuplicates(this, files, new File(outputDir + outputName + ".dedup.bam"))
if (dbsnp != null) {
add(markDub, isIntermediate = true)
List(addIndelRealign(markDub.output, outputDir, isIntermediate = false))
val markDup = MarkDuplicates(this, files, new File(outputDir + outputName + ".dedup.bam"))
add(markDup, isIntermediate = useIndelRealigner)
if (useIndelRealigner) {
List(addIndelRealign(markDup.output, outputDir, isIntermediate = false))
} else {
add(markDub, isIntermediate = true)
List(markDub.output)
List(markDup.output)
}
}
......@@ -67,8 +74,14 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr
scriptOutput.bamFiles = if (preProcesBams.get) {
var bamFiles: List[File] = Nil
for (inputBam <- inputBams) {
var bamFile = addIndelRealign(inputBam, outputDir)
bamFiles :+= addBaseRecalibrator(bamFile, outputDir, isIntermediate = bamFiles.size > 1)
var bamFile = inputBam
if (useIndelRealigner) {
bamFile = addIndelRealign(bamFile, outputDir, isIntermediate = useBaseRecalibration)
}
if (useBaseRecalibration) {
bamFile = addBaseRecalibrator(bamFile, outputDir, isIntermediate = bamFiles.size > 1)
}
bamFiles :+= bamFile
}
doublePreProces(bamFiles)
} else if (inputBams.size > 1 && doublePreProces.get) {
......@@ -188,7 +201,7 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr
}
def addBaseRecalibrator(inputBam: File, dir: String, isIntermediate: Boolean = false): File = {
val baseRecalibrator = BaseRecalibrator(this, inputBam, swapExt(dir, inputBam, ".bam", ".baserecal")) //with gatkArguments {
val baseRecalibrator = BaseRecalibrator(this, inputBam, swapExt(dir, inputBam, ".bam", ".baserecal"))
if (baseRecalibrator.knownSites.isEmpty) {
logger.warn("No Known site found, skipping base recalibration, file: " + inputBam)
......@@ -196,7 +209,7 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr
}
add(baseRecalibrator)
val baseRecalibratorAfter = BaseRecalibrator(this, inputBam, swapExt(dir, inputBam, ".bam", ".baserecal.after")) //with gatkArguments {
val baseRecalibratorAfter = BaseRecalibrator(this, inputBam, swapExt(dir, inputBam, ".bam", ".baserecal.after"))
baseRecalibratorAfter.BQSR = baseRecalibrator.o
add(baseRecalibratorAfter)
......
package nl.lumc.sasc.biopet.utils
import htsjdk.variant.variantcontext.VariantContext
import scala.collection.JavaConversions._
/** Utility object for general vcf file/records functions. */
object VcfUtils {
/**
* Return longest allele of VariantContext.
*
* @param vcfRecord record to check
* @return allele with most nucleotides
*/
def getLongestAllele(vcfRecord: VariantContext) = {
val alleles = vcfRecord.getAlleles
val longestAlleleId = alleles.map(_.getBases.length).zipWithIndex.maxBy(_._1)._2
alleles(longestAlleleId)
}
def fillAllele(bases: String, newSize: Int, fillWith: Char = '-'): String = {
bases + (Array.fill[Char](newSize - bases.size)(fillWith)).mkString
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment