diff --git a/protected/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala b/protected/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala index 08fcf3608550fc40a2f2d5a5ec184888428e1e3d..0f2eed5d1b30e96cb36d9cc6f3936e93a87364f2 100644 --- a/protected/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala +++ b/protected/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala @@ -12,24 +12,40 @@ import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.extensions.{ RunGubbins, Cat, Raxml } import nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline import nl.lumc.sasc.biopet.tools.BastyGenerateFasta +import nl.lumc.sasc.biopet.utils.ConfigUtils import org.broadinstitute.gatk.queue.QScript class Basty(val root: Configurable) extends QScript with MultiSampleQScript { + qscript => def this() = this(null) - class LibraryOutput extends AbstractLibraryOutput { - } - case class FastaOutput(variants: File, consensus: File, consensusVariants: File) - class SampleOutput extends AbstractSampleOutput { + + override def defaults = ConfigUtils.mergeMaps(Map( + "ploidy" -> 1, + "use_haplotypecaller" -> false, + "use_unifiedgenotyper" -> true, + "joint_variantcalling" -> true + ), super.defaults) + + var gatkPipeline: GatkPipeline = new GatkPipeline(qscript) + + def makeSample(id: String) = new Sample(id) + class Sample(sampleId: String) extends AbstractSample(sampleId) { + def makeLibrary(id: String) = new Library(id) + class Library(libraryId: String) extends AbstractLibrary(libraryId) { + protected def addJobs(): Unit = {} + } + var output: FastaOutput = _ var outputSnps: FastaOutput = _ - } - defaults ++= Map("ploidy" -> 1, "use_haplotypecaller" -> false, "use_unifiedgenotyper" -> true, "joint_variantcalling" -> true) - - var gatkPipeline: GatkPipeline = new GatkPipeline(this) - gatkPipeline.jointVariantcalling = true + protected def addJobs(): Unit = { + addLibsJobs() + output = 
addGenerateFasta(sampleId, sampleDir) + outputSnps = addGenerateFasta(sampleId, sampleDir, snpsOnly = true) + } + } def init() { gatkPipeline.outputDir = outputDir @@ -43,21 +59,21 @@ class Basty(val root: Configurable) extends QScript with MultiSampleQScript { val refVariants = addGenerateFasta(null, outputDir + "reference/", outputName = "reference") val refVariantSnps = addGenerateFasta(null, outputDir + "reference/", outputName = "reference", snpsOnly = true) - runSamplesJobs() + addSamplesJobs() - val catVariants = Cat(this, refVariants.variants :: samplesOutput.map(_._2.output.variants).toList, outputDir + "fastas/variant.fasta") + val catVariants = Cat(this, refVariants.variants :: samples.map(_._2.output.variants).toList, outputDir + "fastas/variant.fasta") add(catVariants) - val catVariantsSnps = Cat(this, refVariantSnps.variants :: samplesOutput.map(_._2.outputSnps.variants).toList, outputDir + "fastas/variant.snps_only.fasta") + val catVariantsSnps = Cat(this, refVariantSnps.variants :: samples.map(_._2.outputSnps.variants).toList, outputDir + "fastas/variant.snps_only.fasta") add(catVariantsSnps) - val catConsensus = Cat(this, refVariants.consensus :: samplesOutput.map(_._2.output.consensus).toList, outputDir + "fastas/consensus.fasta") + val catConsensus = Cat(this, refVariants.consensus :: samples.map(_._2.output.consensus).toList, outputDir + "fastas/consensus.fasta") add(catConsensus) - val catConsensusSnps = Cat(this, refVariantSnps.consensus :: samplesOutput.map(_._2.outputSnps.consensus).toList, outputDir + "fastas/consensus.snps_only.fasta") + val catConsensusSnps = Cat(this, refVariantSnps.consensus :: samples.map(_._2.outputSnps.consensus).toList, outputDir + "fastas/consensus.snps_only.fasta") add(catConsensusSnps) - val catConsensusVariants = Cat(this, refVariants.consensusVariants :: samplesOutput.map(_._2.output.consensusVariants).toList, outputDir + "fastas/consensus.variant.fasta") + val catConsensusVariants = Cat(this, 
refVariants.consensusVariants :: samples.map(_._2.output.consensusVariants).toList, outputDir + "fastas/consensus.variant.fasta") add(catConsensusVariants) - val catConsensusVariantsSnps = Cat(this, refVariantSnps.consensusVariants :: samplesOutput.map(_._2.outputSnps.consensusVariants).toList, outputDir + "fastas/consensus.variant.snps_only.fasta") + val catConsensusVariantsSnps = Cat(this, refVariantSnps.consensusVariants :: samples.map(_._2.outputSnps.consensusVariants).toList, outputDir + "fastas/consensus.variant.snps_only.fasta") add(catConsensusVariantsSnps) val seed: Int = config("seed", default = 12345) @@ -115,45 +131,20 @@ class Basty(val root: Configurable) extends QScript with MultiSampleQScript { addTreeJobs(catVariants.output, catConsensusVariants.output, outputDir + "trees" + File.separator + "snps_indels", "snps_indels") } - // Called for each sample - def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = { - val sampleOutput = new SampleOutput - val sampleID: String = sampleConfig("ID").toString - val sampleDir = globalSampleDir + sampleID + "/" - - sampleOutput.libraries = runLibraryJobs(sampleConfig) - - sampleOutput.output = addGenerateFasta(sampleID, sampleDir) - sampleOutput.outputSnps = addGenerateFasta(sampleID, sampleDir, snpsOnly = true) - - return sampleOutput - } - - // Called for each run from a sample - def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput = { - val libraryOutput = new LibraryOutput - - val runID: String = runConfig("ID").toString - val sampleID: String = sampleConfig("ID").toString - val runDir: String = globalSampleDir + sampleID + "/run_" + runID + "/" - - return libraryOutput - } - def addGenerateFasta(sampleName: String, outputDir: String, outputName: String = null, snpsOnly: Boolean = false): FastaOutput = { val bastyGenerateFasta = new BastyGenerateFasta(this) bastyGenerateFasta.outputName = if (outputName != null) outputName else sampleName 
bastyGenerateFasta.inputVcf = gatkPipeline.multisampleVariantcalling.scriptOutput.finalVcfFile - if (gatkPipeline.samplesOutput.contains(sampleName)) { - bastyGenerateFasta.bamFile = gatkPipeline.samplesOutput(sampleName).variantcalling.bamFiles.head + if (gatkPipeline.samples.contains(sampleName)) { + bastyGenerateFasta.bamFile = gatkPipeline.samples(sampleName).gatkVariantcalling.scriptOutput.bamFiles.head } bastyGenerateFasta.outputVariants = outputDir + bastyGenerateFasta.outputName + ".variants" + (if (snpsOnly) ".snps_only" else "") + ".fasta" bastyGenerateFasta.outputConsensus = outputDir + bastyGenerateFasta.outputName + ".consensus" + (if (snpsOnly) ".snps_only" else "") + ".fasta" bastyGenerateFasta.outputConsensusVariants = outputDir + bastyGenerateFasta.outputName + ".consensus_variants" + (if (snpsOnly) ".snps_only" else "") + ".fasta" bastyGenerateFasta.sampleName = sampleName bastyGenerateFasta.snpsOnly = snpsOnly - add(bastyGenerateFasta) + qscript.add(bastyGenerateFasta) return FastaOutput(bastyGenerateFasta.outputVariants, bastyGenerateFasta.outputConsensus, bastyGenerateFasta.outputConsensusVariants) } } diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ApplyRecalibration.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ApplyRecalibration.scala index 423d9e7899da25d3e6c77be0b02d0f17f59b61dc..8728b6c651824e340556222cb60ef52d7fd9ab0a 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ApplyRecalibration.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ApplyRecalibration.scala @@ -9,13 +9,17 @@ import java.io.File import nl.lumc.sasc.biopet.core.config.Configurable class ApplyRecalibration(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.ApplyRecalibration with GatkGeneral { + scatterCount = config("scattercount", default = 0) + override def 
afterGraph { super.afterGraph - if (config.contains("scattercount")) scatterCount = config("scattercount") - nt = Option(getThreads(3)) memoryLimit = Option(nt.getOrElse(1) * 2) + + import org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode + if (mode == Mode.INDEL) ts_filter_level = config("ts_filter_level", default = 99.0) + else if (mode == Mode.SNP) ts_filter_level = config("ts_filter_level", default = 99.5) ts_filter_level = config("ts_filter_level") } } @@ -24,11 +28,9 @@ object ApplyRecalibration { def apply(root: Configurable, input: File, output: File, recal_file: File, tranches_file: File, indel: Boolean = false): ApplyRecalibration = { val ar = if (indel) new ApplyRecalibration(root) { mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL - defaults ++= Map("ts_filter_level" -> 99.0) } else new ApplyRecalibration(root) { mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP - defaults ++= Map("ts_filter_level" -> 99.5) } ar.input :+= input ar.recal_file = recal_file diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantRecalibrator.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantRecalibrator.scala index 002e515c997082825310b367d0ccf874f62d8b73..e8866c2201d85b12eaaab1924be9234e2dac9fe1 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantRecalibrator.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantRecalibrator.scala @@ -27,11 +27,9 @@ object VariantRecalibrator { override def configPath: List[String] = (if (indel) "indel" else "snp") :: super.configPath if (indel) { mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL - 
defaults ++= Map("ts_filter_level" -> 99.0) if (config.contains("mills")) resource :+= new TaggedFile(config("mills").asString, "known=false,training=true,truth=true,prior=12.0") } else { mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP - defaults ++= Map("ts_filter_level" -> 99.5) if (config.contains("hapmap")) resource +:= new TaggedFile(config("hapmap").asString, "known=false,training=true,truth=true,prior=15.0") if (config.contains("omni")) resource +:= new TaggedFile(config("omni").asString, "known=false,training=true,truth=true,prior=12.0") if (config.contains("1000G")) resource +:= new TaggedFile(config("1000G").asString, "known=false,training=true,truth=false,prior=10.0") diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala index 24903a0bd200fe9dc2e6a7ac29cf25ab486eae0a..8ea6f9706b0e675b3f4fa1271c72038b1b2c3d6f 100644 --- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala +++ b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala @@ -9,9 +9,7 @@ import nl.lumc.sasc.biopet.core.MultiSampleQScript import nl.lumc.sasc.biopet.core.PipelineCommand import nl.lumc.sasc.biopet.core.config.Configurable import htsjdk.samtools.SamReaderFactory -import nl.lumc.sasc.biopet.pipelines.mapping.Mapping._ import scala.collection.JavaConversions._ -import java.io.File import nl.lumc.sasc.biopet.extensions.gatk.{ CombineVariants, CombineGVCFs } import nl.lumc.sasc.biopet.extensions.picard.AddOrReplaceReadGroups import nl.lumc.sasc.biopet.extensions.picard.SamToFastq @@ -21,238 +19,192 @@ import org.broadinstitute.gatk.queue.QScript import org.broadinstitute.gatk.utils.commandline.{ Argument } class GatkPipeline(val root: Configurable) extends QScript with 
MultiSampleQScript { + qscript => def this() = this(null) - @Argument(doc = "Only Sample", shortName = "sample", required = false) - val onlySample: List[String] = Nil - @Argument(doc = "Skip Genotyping step", shortName = "skipgenotyping", required = false) - var skipGenotyping: Boolean = false + var skipGenotyping: Boolean = config("skip_genotyping", default = false) - @Argument(doc = "Merge gvcfs", shortName = "mergegvcfs", required = false) - var mergeGvcfs: Boolean = false + /** Merge gvcfs */ + var mergeGvcfs: Boolean = config("merge_gvcfs", default = false) - @Argument(doc = "Joint variantcalling", shortName = "jointVariantCalling", required = false) + /** Joint variantcalling */ var jointVariantcalling: Boolean = config("joint_variantcalling", default = false) - @Argument(doc = "Joint genotyping", shortName = "jointGenotyping", required = false) + /** Joint genotyping */ var jointGenotyping: Boolean = config("joint_genotyping", default = false) var singleSampleCalling = config("single_sample_calling", default = true) var reference: File = config("reference", required = true) var dbsnp: File = config("dbsnp") - var gvcfFiles: List[File] = Nil - var finalBamFiles: List[File] = Nil var useAllelesOption: Boolean = config("use_alleles_option", default = false) - - class LibraryOutput extends AbstractLibraryOutput { - var mappedBamFile: File = _ - var variantcalling: GatkVariantcalling.ScriptOutput = _ - } - - class SampleOutput extends AbstractSampleOutput { - var variantcalling: GatkVariantcalling.ScriptOutput = _ - } - - def init() { - if (config.contains("gvcfFiles")) - for (file <- config("gvcfFiles").asList) - gvcfFiles :+= file.toString - if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module") - else if (!outputDir.endsWith("/")) outputDir += "/" - } - - val multisampleVariantcalling = new GatkVariantcalling(this) { - override def configName = "gatkvariantcalling" - override def configPath: List[String] = 
super.configPath ::: "multisample" :: Nil - } - - def biopetScript() { - if (onlySample.isEmpty) { - runSamplesJobs - - //SampleWide jobs - if (mergeGvcfs && gvcfFiles.size > 0) { - val newFile = outputDir + "merged.gvcf.vcf.gz" - add(CombineGVCFs(this, gvcfFiles, newFile)) - gvcfFiles = List(newFile) - } - - if (!skipGenotyping && gvcfFiles.size > 0) { - if (jointGenotyping) { - val gatkGenotyping = new GatkGenotyping(this) - gatkGenotyping.inputGvcfs = gvcfFiles - gatkGenotyping.outputDir = outputDir + "genotyping/" - gatkGenotyping.init - gatkGenotyping.biopetScript - addAll(gatkGenotyping.functions) - var vcfFile = gatkGenotyping.outputFile - } - } else logger.warn("No gVCFs to genotype") - - if (jointVariantcalling) { - val allBamfiles = for ( - (sampleID, sampleOutput) <- samplesOutput; - file <- sampleOutput.variantcalling.bamFiles - ) yield file - val allRawVcfFiles = for ((sampleID, sampleOutput) <- samplesOutput) yield sampleOutput.variantcalling.rawFilterVcfFile - - val gatkVariantcalling = new GatkVariantcalling(this) { - override def configName = "gatkvariantcalling" - override def configPath: List[String] = super.configPath ::: "multisample" :: Nil + val externalGvcfs = config("external_gvcfs_files", default = Nil).asFileList + + def makeSample(id: String) = new Sample(id) + class Sample(sampleId: String) extends AbstractSample(sampleId) { + def makeLibrary(id: String) = new Library(id) + class Library(libraryId: String) extends AbstractLibrary(libraryId) { + val mapping = new Mapping(qscript) + mapping.sampleId = sampleId + mapping.libraryId = libraryId + mapping.outputDir = libDir + "/variantcalling/" + + /** Library variantcalling */ + val gatkVariantcalling = new GatkVariantcalling(qscript) + gatkVariantcalling.sampleID = sampleId + gatkVariantcalling.outputDir = libDir + + protected def addJobs(): Unit = { + val bamFile: Option[File] = if (config.contains("R1")) { + mapping.input_R1 = config("R1") + mapping.input_R2 = config("R2") + mapping.init 
+ mapping.biopetScript + addAll(mapping.functions) // Add functions of mapping to curent function pool + Some(mapping.finalBamFile) + } else if (config.contains("bam")) { + var bamFile: File = config("bam") + if (!bamFile.exists) throw new IllegalStateException("Bam in config does not exist, file: " + bamFile) + + if (config("bam_to_fastq", default = false).asBoolean) { + val samToFastq = SamToFastq(qscript, bamFile, libDir + sampleId + "-" + libraryId + ".R1.fastq", + libDir + sampleId + "-" + libraryId + ".R2.fastq") + samToFastq.isIntermediate = true + qscript.add(samToFastq) + mapping.input_R1 = samToFastq.fastqR1 + mapping.input_R2 = samToFastq.fastqR2 + mapping.init + mapping.biopetScript + addAll(mapping.functions) // Add functions of mapping to curent function pool + Some(mapping.finalBamFile) + } else { + var readGroupOke = true + val inputSam = SamReaderFactory.makeDefault.open(bamFile) + val header = inputSam.getFileHeader.getReadGroups + for (readGroup <- inputSam.getFileHeader.getReadGroups) { + if (readGroup.getSample != sampleId) logger.warn("Sample ID readgroup in bam file is not the same") + if (readGroup.getLibrary != libraryId) logger.warn("Library ID readgroup in bam file is not the same") + if (readGroup.getSample != sampleId || readGroup.getLibrary != libraryId) readGroupOke = false + } + inputSam.close + + if (!readGroupOke) { + if (config("correct_readgroups", default = false)) { + logger.info("Correcting readgroups, file:" + bamFile) + val aorrg = AddOrReplaceReadGroups(qscript, bamFile, new File(libDir + sampleId + "-" + libraryId + ".bam")) + aorrg.RGID = sampleId + "-" + libraryId + aorrg.RGLB = libraryId + aorrg.RGSM = sampleId + aorrg.isIntermediate = true + qscript.add(aorrg) + bamFile = aorrg.output + } else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile + + "\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatic fix this") 
+ } + addAll(BamMetrics(qscript, bamFile, libDir + "metrics/").functions) + + Some(bamFile) + } + } else { + logger.error("Sample: " + sampleId + ": No R1 found for run: " + libraryId) + None } - if (gatkVariantcalling.useMpileup) { - val cvRaw = CombineVariants(this, allRawVcfFiles.toList, outputDir + "variantcalling/multisample.raw.vcf.gz") - add(cvRaw) - gatkVariantcalling.rawVcfInput = cvRaw.out - } - - multisampleVariantcalling.preProcesBams = false - multisampleVariantcalling.doublePreProces = false - multisampleVariantcalling.inputBams = allBamfiles.toList - multisampleVariantcalling.outputDir = outputDir + "variantcalling" - multisampleVariantcalling.outputName = "multisample" - multisampleVariantcalling.init - multisampleVariantcalling.biopetScript - addAll(multisampleVariantcalling.functions) - - if (config("inputtype", default = "dna").asString != "rna" && config("recalibration", default = false).asBoolean) { - val recalibration = new GatkVariantRecalibration(this) - recalibration.inputVcf = multisampleVariantcalling.scriptOutput.finalVcfFile - recalibration.bamFiles = finalBamFiles - recalibration.outputDir = outputDir + "recalibration/" - recalibration.init - recalibration.biopetScript + if (bamFile.isDefined) { + gatkVariantcalling.inputBams = List(bamFile.get) + gatkVariantcalling.variantcalling = config("library_variantcalling", default = false) + gatkVariantcalling.preProcesBams = true + gatkVariantcalling.init + gatkVariantcalling.biopetScript + addAll(gatkVariantcalling.functions) } } - } else for (sample <- onlySample) runSingleSampleJobs(sample) - } - - // Called for each sample - def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = { - val sampleOutput = new SampleOutput - var libraryBamfiles: List[File] = List() - val sampleID: String = getCurrentSample - sampleOutput.libraries = runLibraryJobs(sampleConfig) - val sampleDir = globalSampleDir + sampleID - for ((libraryID, libraryOutput) <- sampleOutput.libraries) { - 
libraryBamfiles ++= libraryOutput.variantcalling.bamFiles } - if (libraryBamfiles.size > 0) { - finalBamFiles ++= libraryBamfiles - val gatkVariantcalling = new GatkVariantcalling(this) - gatkVariantcalling.inputBams = libraryBamfiles - gatkVariantcalling.outputDir = sampleDir + "/variantcalling/" + /** sample variantcalling */ + val gatkVariantcalling = new GatkVariantcalling(qscript) + gatkVariantcalling.sampleID = sampleId + gatkVariantcalling.outputDir = sampleDir + "/variantcalling/" + + protected def addJobs(): Unit = { + addLibsJobs() + gatkVariantcalling.inputBams = libraries.map(_._2.mapping.finalBamFile).toList gatkVariantcalling.preProcesBams = false if (!singleSampleCalling) { gatkVariantcalling.useHaplotypecaller = false gatkVariantcalling.useUnifiedGenotyper = false } - gatkVariantcalling.sampleID = sampleID gatkVariantcalling.init gatkVariantcalling.biopetScript addAll(gatkVariantcalling.functions) - sampleOutput.variantcalling = gatkVariantcalling.scriptOutput - gvcfFiles :+= gatkVariantcalling.scriptOutput.gvcfFile - } else logger.warn("No bamfiles for variant calling for sample: " + sampleID) - return sampleOutput + } } - // Called for each run from a sample - def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput = { - val libraryOutput = new LibraryOutput - val runID: String = getCurrentLibrary - val sampleID: String = getCurrentSample - val runDir: String = globalSampleDir + sampleID + "/run_" + runID + "/" - var inputType: String = config("inputtype", default = "dna") - - def loadFromLibraryConfig(startJobs: Boolean = true): Mapping = { - val mapping = new Mapping(this) + def init() { + if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module") + else if (!outputDir.endsWith("/")) outputDir += "/" + } - mapping.input_R1 = config("R1") - mapping.input_R2 = config("R2") - mapping.RGLB = runID - mapping.RGSM = sampleID - mapping.RGPL = config("PL") - mapping.RGPU 
= config("PU") - mapping.RGCN = config("CN") - mapping.outputDir = runDir + val multisampleVariantcalling = new GatkVariantcalling(this) { + override def configName = "gatkvariantcalling" + override def configPath: List[String] = super.configPath ::: "multisample" :: Nil + } - if (startJobs) { - mapping.init - mapping.biopetScript + def biopetScript() { + addSamplesJobs + + //SampleWide jobs + val gvcfFiles: List[File] = if (mergeGvcfs && externalGvcfs.size + samples.size > 1) { + val newFile = outputDir + "merged.gvcf.vcf.gz" + add(CombineGVCFs(this, externalGvcfs ++ samples.map(_._2.gatkVariantcalling.scriptOutput.gvcfFile), newFile)) + List(newFile) + } else externalGvcfs ++ samples.map(_._2.gatkVariantcalling.scriptOutput.gvcfFile) + + if (!skipGenotyping && gvcfFiles.size > 0) { + if (jointGenotyping) { + val gatkGenotyping = new GatkGenotyping(this) + gatkGenotyping.inputGvcfs = gvcfFiles + gatkGenotyping.outputDir = outputDir + "genotyping/" + gatkGenotyping.init + gatkGenotyping.biopetScript + addAll(gatkGenotyping.functions) + var vcfFile = gatkGenotyping.outputFile } - return mapping - } + } else logger.warn("No gVCFs to genotype") - if (config.contains("R1")) { - val mapping = loadFromLibraryConfig() - addAll(mapping.functions) // Add functions of mapping to curent function pool - libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile") - } else if (config.contains("bam")) { - var bamFile: File = config("bam") - if (!bamFile.exists) throw new IllegalStateException("Bam in config does not exist, file: " + bamFile) + if (jointVariantcalling) { + val allBamfiles = samples.map(_._2.gatkVariantcalling.scriptOutput.bamFiles).toList.fold(Nil)(_ ++ _) + val allRawVcfFiles = samples.map(_._2.gatkVariantcalling.scriptOutput.rawVcfFile).filter(_ != null).toList - if (config("bam_to_fastq", default = false).asBoolean) { - val samToFastq = SamToFastq(this, bamFile, runDir + sampleID + "-" + runID + ".R1.fastq", - runDir + sampleID + "-" + runID + 
".R2.fastq") - add(samToFastq, isIntermediate = true) - val mapping = loadFromLibraryConfig(startJobs = false) - mapping.input_R1 = samToFastq.fastqR1 - mapping.input_R2 = samToFastq.fastqR2 - mapping.init - mapping.biopetScript - addAll(mapping.functions) // Add functions of mapping to curent function pool - libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile") - } else { - var readGroupOke = true - val inputSam = SamReaderFactory.makeDefault.open(bamFile) - val header = inputSam.getFileHeader.getReadGroups - for (readGroup <- inputSam.getFileHeader.getReadGroups) { - if (readGroup.getSample != sampleID) logger.warn("Sample ID readgroup in bam file is not the same") - if (readGroup.getLibrary != runID) logger.warn("Library ID readgroup in bam file is not the same") - if (readGroup.getSample != sampleID || readGroup.getLibrary != runID) readGroupOke = false - } - inputSam.close + val gatkVariantcalling = new GatkVariantcalling(this) { + override def configName = "gatkvariantcalling" + override def configPath: List[String] = super.configPath ::: "multisample" :: Nil + } - if (!readGroupOke) { - if (config("correct_readgroups", default = false)) { - logger.info("Correcting readgroups, file:" + bamFile) - val aorrg = AddOrReplaceReadGroups(this, bamFile, new File(runDir + sampleID + "-" + runID + ".bam")) - aorrg.RGID = sampleID + "-" + runID - aorrg.RGLB = runID - aorrg.RGSM = sampleID - aorrg.RGPL = config("PL", default = "illumina") - aorrg.RGPU = config("PU", default = "na") - aorrg.RGCN = config("CN") - add(aorrg, isIntermediate = true) - bamFile = aorrg.output - } else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile + - "\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatic fix this") - } - addAll(BamMetrics(this, bamFile, runDir + "metrics/").functions) + if (gatkVariantcalling.useMpileup) { + val cvRaw = CombineVariants(this, 
allRawVcfFiles.toList, outputDir + "variantcalling/multisample.raw.vcf.gz") + add(cvRaw) + gatkVariantcalling.rawVcfInput = cvRaw.out + } - libraryOutput.mappedBamFile = bamFile + multisampleVariantcalling.preProcesBams = false + multisampleVariantcalling.doublePreProces = false + multisampleVariantcalling.inputBams = allBamfiles.toList + multisampleVariantcalling.outputDir = outputDir + "variantcalling" + multisampleVariantcalling.outputName = "multisample" + multisampleVariantcalling.init + multisampleVariantcalling.biopetScript + addAll(multisampleVariantcalling.functions) + + if (config("inputtype", default = "dna").asString != "rna" && config("recalibration", default = false).asBoolean) { + val recalibration = new GatkVariantRecalibration(this) + recalibration.inputVcf = multisampleVariantcalling.scriptOutput.finalVcfFile + recalibration.bamFiles = allBamfiles + recalibration.outputDir = outputDir + "recalibration/" + recalibration.init + recalibration.biopetScript } - } else { - logger.error("Sample: " + sampleID + ": No R1 found for run: " + runID) - return libraryOutput } - - val gatkVariantcalling = new GatkVariantcalling(this) - gatkVariantcalling.inputBams = List(libraryOutput.mappedBamFile) - gatkVariantcalling.outputDir = runDir - gatkVariantcalling.variantcalling = config("library_variantcalling", default = false) - gatkVariantcalling.preProcesBams = true - gatkVariantcalling.sampleID = sampleID - gatkVariantcalling.init - gatkVariantcalling.biopetScript - addAll(gatkVariantcalling.functions) - libraryOutput.variantcalling = gatkVariantcalling.scriptOutput - - return libraryOutput } } diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala index c242539f4de05b7636b8b8615ae89297dfcc31cc..759181117116f0df027a2c98243cb5123777fa16 100644 --- 
a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala +++ b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala @@ -11,6 +11,7 @@ import nl.lumc.sasc.biopet.tools.{ MpileupToVcf, VcfFilter, MergeAlleles } import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.extensions.gatk.{ AnalyzeCovariates, BaseRecalibrator, GenotypeGVCFs, HaplotypeCaller, IndelRealigner, PrintReads, RealignerTargetCreator, SelectVariants, CombineVariants, UnifiedGenotyper } import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates +import nl.lumc.sasc.biopet.utils.ConfigUtils import org.broadinstitute.gatk.queue.QScript import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import org.broadinstitute.gatk.utils.commandline.{ Input, Argument } @@ -68,7 +69,8 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr if (files.isEmpty) throw new IllegalStateException("Files can't be empty") if (!doublePreProces.get) return files val markDup = MarkDuplicates(this, files, new File(outputDir + outputName + ".dedup.bam")) - add(markDup, isIntermediate = useIndelRealigner) + markDup.isIntermediate = useIndelRealigner + add(markDup) if (useIndelRealigner) { List(addIndelRealign(markDup.output, outputDir, isIntermediate = false)) } else { @@ -141,11 +143,13 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr add(m2v) scriptOutput.rawVcfFile = m2v.output - val vcfFilter = new VcfFilter(this) - vcfFilter.defaults ++= Map("min_sample_depth" -> 8, - "min_alternate_depth" -> 2, - "min_samples_pass" -> 1, - "filter_ref_calls" -> true) + val vcfFilter = new VcfFilter(this) { + override def defaults = ConfigUtils.mergeMaps(Map("min_sample_depth" -> 8, + "min_alternate_depth" -> 2, + "min_samples_pass" -> 1, + "filter_ref_calls" -> true + ), super.defaults) + } vcfFilter.inputVcf = m2v.output vcfFilter.outputVcf 
= this.swapExt(outputDir, m2v.output, ".vcf", ".filter.vcf.gz") add(vcfFilter) @@ -157,7 +161,8 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr // Allele mode if (useAllelesOption.get) { val mergeAlleles = MergeAlleles(this, mergeList.toList, outputDir + "raw.allele__temp_only.vcf.gz") - add(mergeAlleles, isIntermediate = true) + mergeAlleles.isIntermediate = true + add(mergeAlleles) if (useHaplotypecaller.get) { val hcAlleles = new HaplotypeCaller(this) @@ -187,7 +192,8 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr val sv = SelectVariants(this, input, output) sv.excludeFiltered = true sv.excludeNonVariants = true - add(sv, isIntermediate = true) + sv.isIntermediate = true + add(sv) sv.out } @@ -200,10 +206,12 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr def addIndelRealign(inputBam: File, dir: String, isIntermediate: Boolean = true): File = { val realignerTargetCreator = RealignerTargetCreator(this, inputBam, dir) - add(realignerTargetCreator, isIntermediate = true) + realignerTargetCreator.isIntermediate = true + add(realignerTargetCreator) val indelRealigner = IndelRealigner.apply(this, inputBam, realignerTargetCreator.out, dir) - add(indelRealigner, isIntermediate = isIntermediate) + indelRealigner.isIntermediate = isIntermediate + add(indelRealigner) return indelRealigner.o } @@ -227,7 +235,8 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr val printReads = PrintReads(this, inputBam, swapExt(dir, inputBam, ".bam", ".baserecal.bam")) printReads.BQSR = baseRecalibrator.o - add(printReads, isIntermediate = isIntermediate) + printReads.isIntermediate = isIntermediate + add(printReads) return printReads.o } diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala index 
534844aaec44fa9b7d8e8d9899c260eefe136429..374ba068fb576e26b7f1fa33eaf1896c3ff36f63 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala @@ -29,11 +29,10 @@ trait BiopetQScript extends Configurable with GatkLogging { @Argument(doc = "JSON config file(s)", fullName = "config_file", shortName = "config", required = false) val configfiles: List[File] = Nil - @Argument(doc = "Output directory", fullName = "output_directory", shortName = "outDir", required = true) var outputDir: String = _ @Argument(doc = "Disable all scatters", shortName = "DSC", required = false) - var disableScatterDefault: Boolean = false + var disableScatter: Boolean = false var outputFiles: Map[String, File] = Map() @@ -45,11 +44,12 @@ trait BiopetQScript extends Configurable with GatkLogging { var functions: Seq[QFunction] final def script() { + outputDir = config("output_dir", required = true) if (!outputDir.endsWith("/")) outputDir += "/" init biopetScript - if (disableScatterDefault) for (function <- functions) function match { + if (disableScatter) for (function <- functions) function match { case f: ScatterGatherableFunction => f.scatterCount = 1 case _ => } diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala index 39c60a27b33fbabb8ef5a478affd8bee01051c5c..2579be01565cbf92d08519dee02e8ae568000c9f 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala @@ -15,171 +15,144 @@ */ package nl.lumc.sasc.biopet.core -import nl.lumc.sasc.biopet.core.config.{ ConfigValue, Config, Configurable } +import java.io.File + +import nl.lumc.sasc.biopet.core.config.{ Config } import 
nl.lumc.sasc.biopet.utils.ConfigUtils -import nl.lumc.sasc.biopet.utils.ConfigUtils._ +import org.broadinstitute.gatk.utils.commandline.{ Argument } +/** + * This trait creates a structured way of use multisample pipelines + */ trait MultiSampleQScript extends BiopetQScript { - type LibraryOutput <: AbstractLibraryOutput - type SampleOutput <: AbstractSampleOutput - - abstract class AbstractLibraryOutput - abstract class AbstractSampleOutput { - var libraries: Map[String, LibraryOutput] = Map() - def getAllLibraries = libraries - def getLibrary(key: String) = libraries(key) - } + @Argument(doc = "Only Sample", shortName = "sample", required = false) + val onlySample: List[String] = Nil - if (!Config.global.map.contains("samples")) logger.warn("No Samples found in config") + require(Config.global.map.contains("samples"), "No Samples found in config") /** - * Returns a map with all sample configs + * Sample class with basic functions build in + * @param sampleId */ - val getSamplesConfig: Map[String, Any] = ConfigUtils.any2map(Config.global.map.getOrElse("samples", Map())) + abstract class AbstractSample(val sampleId: String) { + /** Overrules config of qscript with default sample */ + val config = new ConfigFunctions(defaultSample = sampleId) + + /** + * Library class with basic functions build in + * @param libraryId + */ + abstract class AbstractLibrary(val libraryId: String) { + /** Overrules config of qscript with default sample and default library */ + val config = new ConfigFunctions(defaultSample = sampleId, defaultLibrary = libraryId) + + /** Adds the library jobs */ + final def addAndTrackJobs(): Unit = { + currentSample = Some(sampleId) + currentLib = Some(libraryId) + addJobs() + currentLib = None + currentSample = None + } - /** - * Returns a list of all sampleIDs - */ - val getSamples: Set[String] = getSamplesConfig.keySet + /** Creates a library file with given suffix */ + def createFile(suffix: String): File = new File(libDir, sampleId + "-" + 
libraryId + suffix) - /** - * Returns the global sample directory - * @return global sample directory - */ - def globalSampleDir: String = outputDir + "samples/" + /** Returns library directory */ + def libDir = sampleDir + "lib_" + libraryId + File.separator - var samplesOutput: Map[String, SampleOutput] = Map() + /** Function that add library jobs */ + protected def addJobs() + } - /** - * Runs runSingleSampleJobs method for each sample - */ - final def runSamplesJobs() { - for ((key, value) <- getSamplesConfig) { - var sample = any2map(value) - if (!sample.contains("ID")) sample += ("ID" -> key) - if (sample("ID") == key) { - currentSample = key - samplesOutput += key -> runSingleSampleJobs(sample) - currentSample = null - } else logger.warn("Key is not the same as ID on value for sample") + /** Library type, need implementation in pipeline */ + type Library <: AbstractLibrary + + /** Stores all libraries */ + val libraries: Map[String, Library] = libIds.map(id => id -> makeLibrary(id)).toMap + + /** + * Factory method for Library class + * @param id SampleId + * @return Sample class + */ + def makeLibrary(id: String): Library + + /** returns a set with library names */ + protected def libIds: Set[String] = { + ConfigUtils.getMapFromPath(Config.global.map, List("samples", sampleId, "libraries")).getOrElse(Map()).keySet } - } - def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput + /** Adds sample jobs */ + final def addAndTrackJobs(): Unit = { + currentSample = Some(sampleId) + addJobs() + currentSample = None + } - /** - * Run sample with only sampleID - * @param sample sampleID - * @return - */ - def runSingleSampleJobs(sample: String): SampleOutput = { - var map = any2map(getSamplesConfig(sample)) - if (map.contains("ID") && map("ID") != sample) - throw new IllegalStateException("ID in config not the same as the key") - else map += ("ID" -> sample) - return runSingleSampleJobs(map) - } + /** Function to add sample jobs */ + protected def 
addJobs() - /** - * Runs runSingleLibraryJobs method for each library found in sampleConfig - * @param sampleConfig sample config - * @return Map with libraryID -> LibraryOutput object - */ - final def runLibraryJobs(sampleConfig: Map[String, Any]): Map[String, LibraryOutput] = { - var output: Map[String, LibraryOutput] = Map() - val sampleID = sampleConfig("ID").toString - if (sampleConfig.contains("libraries")) { - val runs = any2map(sampleConfig("libraries")) - for ((key, value) <- runs) { - var library = any2map(value) - if (!library.contains("ID")) library += ("ID" -> key) - if (library("ID") == key) { - currentLibrary = key - output += key -> runSingleLibraryJobs(library, sampleConfig) - currentLibrary = null - } else logger.warn("Key is not the same as ID on value for run of sample: " + sampleID) + /** function add all libraries in one call */ + protected final def addLibsJobs(): Unit = { + for ((libraryId, library) <- libraries) { + library.addAndTrackJobs() } - } else logger.warn("No runs found in config for sample: " + sampleID) - return output - } - def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput + } - protected var currentSample: String = null - protected var currentLibrary: String = null + /** + * Creates a sample file with given suffix + * @param suffix + * @return + */ + def createFile(suffix: String) = new File(sampleDir, sampleId + suffix) - /** - * Set current sample manual, only use this when not using runSamplesJobs method - * @param sample - */ - def setCurrentSample(sample: String) { - logger.debug("Manual sample set to: " + sample) - currentSample = sample + /** Returns sample directory */ + def sampleDir = outputDir + "samples" + File.pathSeparator + sampleId + File.pathSeparator } - /** - * Gets current sample - * @return current sample - */ - def getCurrentSample = currentSample + /** Sample type, need implementation in pipeline */ + type Sample <: AbstractSample /** - * Reset current 
sample manual, only use this when not using runSamplesJobs method + * Factory method for Sample class + * @param id SampleId + * @return Sample class */ - def resetCurrentSample() { - logger.debug("Manual sample reset") - currentSample = null + def makeSample(id: String): Sample + + /** Stores all samples */ + val samples: Map[String, Sample] = sampleIds.map(id => id -> makeSample(id)).toMap + + /** Returns a list of all sampleIDs */ + protected def sampleIds: Set[String] = if (onlySample != Nil) onlySample.toSet else { + ConfigUtils.any2map(Config.global.map("samples")).keySet } /** - * Set current library manual, only use this when not using runLibraryJobs method - * @param library - */ - def setCurrentLibrary(library: String) { - logger.debug("Manual library set to: " + library) - currentLibrary = library + * Runs addAndTrackJobs method for each sample */ + final def addSamplesJobs() { + for ((sampleId, sample) <- samples) { + sample.addAndTrackJobs() + } } - /** - * Gets current library - * @return current library - */ - def getCurrentLibrary = currentLibrary + /** Stores sample state */ + private var currentSample: Option[String] = None - /** Reset current library manual, only use this when not using runLibraryJobs method */ - def resetCurrentLibrary() { - logger.debug("Manual library reset") - currentLibrary = null - } + /** Stores library state */ + private var currentLib: Option[String] = None + /** Prefix full path with sample and library for jobs that's are created in current state */ override protected[core] def configFullPath: List[String] = { - (if (currentSample != null) "samples" :: currentSample :: Nil else Nil) ::: - (if (currentLibrary != null) "libraries" :: currentLibrary :: Nil else Nil) ::: - super.configFullPath - } - - override val config = new ConfigFunctionsExt - - protected class ConfigFunctionsExt extends super.ConfigFunctions { - override def apply(key: String, - default: Any = null, - submodule: String = null, - required: Boolean = 
false, - freeVar: Boolean = true, - sample: String = null, - library: String = null): ConfigValue = { - val s = if (sample == null) currentSample else sample - val l = if (library == null) currentLibrary else library - super.apply(key, default, submodule, required, freeVar, s, l) + val s = currentSample match { + case Some(s) => "samples" :: s :: Nil + case _ => Nil } - - override def contains(key: String, - submodule: String = null, - freeVar: Boolean = true, - sample: String = null, - library: String = null) = { - val s = if (sample == null) currentSample else sample - val l = if (library == null) currentLibrary else library - super.contains(key, submodule, freeVar, s, l) + val l = currentLib match { + case Some(l) => "libraries" :: l :: Nil + case _ => Nil } + s ::: l ::: super.configFullPath } } diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/PipelineCommand.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/PipelineCommand.scala index 94d5b4cc5d18759f97641ae01a7cb0dd64078dc6..60f7112525a92597c8e88c480e87c43bb228105a 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/PipelineCommand.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/PipelineCommand.scala @@ -31,6 +31,20 @@ trait PipelineCommand extends MainCommand with GatkLogging { if (t >= argsSize) throw new IllegalStateException("-config needs a value") Config.global.loadConfigFile(new File(args(t + 1))) } + if (args(t) == "--logging_level" || args(t) == "-l") { + args(t + 1).toLowerCase match { + case "debug" => Logging.logger.setLevel(org.apache.log4j.Level.DEBUG) + case "info" => Logging.logger.setLevel(org.apache.log4j.Level.INFO) + case "warn" => Logging.logger.setLevel(org.apache.log4j.Level.WARN) + case "error" => Logging.logger.setLevel(org.apache.log4j.Level.ERROR) + case _ => + } + } + } + for (t <- 0 until argsSize) { + if (args(t) == "--outputDir" || args(t) == "-outDir") { + throw new 
IllegalArgumentException("Commandline argument is deprecated, should use config for this now") + } } var argv: Array[String] = Array() diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Configurable.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Configurable.scala index 98f4ee3a4ba263d3510bac8ae7b70e56e2dbf0ef..54e6dfd170a8cb2e68f5c63dd8ca2675513cee7d 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Configurable.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Configurable.scala @@ -15,7 +15,6 @@ */ package nl.lumc.sasc.biopet.core.config -import java.io.File import nl.lumc.sasc.biopet.core.Logging import nl.lumc.sasc.biopet.utils.ConfigUtils.ImplicitConversions @@ -36,9 +35,9 @@ trait Configurable extends ImplicitConversions { protected[core] def configFullPath: List[String] = configPath ::: configName :: Nil /** Map to store defaults for config */ - var defaults: scala.collection.mutable.Map[String, Any] = { - if (root != null) scala.collection.mutable.Map(root.defaults.toArray: _*) - else scala.collection.mutable.Map() + def defaults: Map[String, Any] = { + if (root != null) root.defaults + else Map() } val config = new ConfigFunctions @@ -60,7 +59,21 @@ trait Configurable extends ImplicitConversions { /** * Class is used for retrieval of config values */ - protected class ConfigFunctions { + protected class ConfigFunctions(val defaultSample: Option[String] = None, val defaultLibrary: Option[String] = None) { + def this(defaultSample: String, defaultLibrary: String) = { + this(defaultSample = Some(defaultSample), defaultLibrary = Some(defaultLibrary)) + } + + def this(defaultSample: String) = { + this(defaultSample = Some(defaultSample), defaultLibrary = None) + } + + (defaultSample, defaultLibrary) match { + case (Some(null), _) => throw new IllegalArgumentException("defaultSample can not be null") + case (_, Some(null)) => throw 
new IllegalArgumentException("defaultLibrary can not be null") + case _ => + } + /** * * @param key Name of value @@ -79,13 +92,15 @@ trait Configurable extends ImplicitConversions { freeVar: Boolean = true, sample: String = null, library: String = null): ConfigValue = { + val s = if (sample != null || defaultSample.isEmpty) sample else defaultSample.get + val l = if (library != null || defaultLibrary.isEmpty) library else defaultLibrary.get val m = if (submodule != null) submodule else configName - val p = path(sample, library, submodule) + val p = path(s, l, submodule) val d = { val value = Config.getValueFromMap(defaults.toMap, ConfigValueIndex(m, p, key, freeVar)) if (value.isDefined) value.get.value else default } - if (!contains(key, submodule, freeVar, sample = sample, library = library) && d == null) { + if (!contains(key, submodule, freeVar, sample = s, library = l) && d == null) { if (required) { Logging.logger.error("Value in config could not be found but it is required, key: " + key + " module: " + m + " path: " + p) throw new IllegalStateException("Value in config could not be found but it is required, key: " + key + " module: " + m + " path: " + p) @@ -109,8 +124,10 @@ trait Configurable extends ImplicitConversions { freeVar: Boolean = true, sample: String = null, library: String = null) = { + val s = if (sample != null || defaultSample.isEmpty) sample else defaultSample.get + val l = if (library != null || defaultLibrary.isEmpty) library else defaultLibrary.get val m = if (submodule != null) submodule else configName - val p = path(sample, library, submodule) + val p = path(s, l, submodule) Config.global.contains(m, p, key, freeVar) || !(Config.getValueFromMap(defaults.toMap, ConfigValueIndex(m, p, key, freeVar)) == None) } diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala index 
205ad0541bde581c68019fa28b8c3c5574d605e9..7f670a44468ee63230f6df282b8027057ae9a08a 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala @@ -48,9 +48,9 @@ class Bowtie(val root: Configurable) extends BiopetCommandLineFunction { var seedmms: Option[Int] = config("seedmms") var k: Option[Int] = config("k") var m: Option[Int] = config("m") - var best: Boolean = config("best") + var best: Boolean = config("best", default = false) var maxbts: Option[Int] = config("maxbts") - var strata: Boolean = config("strata") + var strata: Boolean = config("strata", default = false) var maqerr: Option[Int] = config("maqerr") def cmdLine = { diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/MultisamplePipelineTemplate.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/MultisamplePipelineTemplate.scala new file mode 100644 index 0000000000000000000000000000000000000000..1ccf16f428e6cd2482648f4e9f2cc030696c9799 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/MultisamplePipelineTemplate.scala @@ -0,0 +1,47 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. 
+ */ +package nl.lumc.sasc.biopet.pipelines + +import nl.lumc.sasc.biopet.core.{ PipelineCommand, MultiSampleQScript, BiopetQScript } +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.queue.QScript + +class MultisamplePipelineTemplate(val root: Configurable) extends QScript with MultiSampleQScript { + def this() = this(null) + + def makeSample(id: String) = new Sample(id) + class Sample(sampleId: String) extends AbstractSample(sampleId) { + + def makeLibrary(id: String) = new Library(id) + class Library(libraryId: String) extends AbstractLibrary(libraryId) { + protected def addJobs(): Unit = { + // Library jobs + } + } + + protected def addJobs(): Unit = { + // Sample jobs + } + } + + def init(): Unit = { + } + + def biopetScript() { + } +} + +object MultisamplePipelineTemplate extends PipelineCommand \ No newline at end of file diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala index ac5eaa3c9d3b9648fadb1b90dd173c673bb368f5..ae21c34e4b820ebfb0b33e3b033196e03a58ef6e 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala @@ -21,6 +21,7 @@ import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction import nl.lumc.sasc.biopet.core.ToolCommand import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsMpileup +import nl.lumc.sasc.biopet.utils.ConfigUtils import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import scala.collection.mutable.ArrayBuffer import scala.io.Source @@ -50,7 +51,8 @@ class MpileupToVcf(val root: Configurable) extends BiopetJavaCommandLineFunction override val defaultVmem = "6G" memoryLimit = Option(2.0) - defaults ++= Map("samtoolsmpileup" -> Map("disable_baq" -> true, "min_map_quality" -> 
1)) + override def defaults = ConfigUtils.mergeMaps(Map("samtoolsmpileup" -> Map("disable_baq" -> true, "min_map_quality" -> 1)), + super.defaults) override def afterGraph { super.afterGraph diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala index 8e041e224c78de07c0d670747b50c5363ab7e1e4..16a00b55157918b6ce92a49ff29d1a22a06110d5 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala @@ -340,6 +340,15 @@ object ConfigUtils extends Logging { if (value != null && value.value != null && value.value != None) new File(any2string(value.value)) else null } + /** + * Convert ConfigValue to File + * @param value Input ConfigValue + * @return + */ + implicit def configValue2optionFile(value: ConfigValue): Option[File] = { + if (value != null && value.value != null && value.value != None) Some(new File(any2string(value.value))) else None + } + /** * Convert ConfigValue to String * @param value Input ConfigValue @@ -350,6 +359,15 @@ object ConfigUtils extends Logging { if (value != null && value.value != null && value.value != None) any2string(value.value) else null } + /** + * Convert ConfigValue to String + * @param value Input ConfigValue + * @return + */ + implicit def configValue2optionString(value: ConfigValue): Option[String] = { + if (value != null && value.value != null && value.value != None) Some(any2string(value.value)) else None + } + /** * Convert ConfigValue to Long * @param value Input ConfigValue diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala index b072e7e64f5ecd4aedda103822d8b73210d58046..e44260588ee1037911be41438c38973d3e81fff8 100644 --- 
a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala @@ -32,17 +32,17 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { @Input(doc = "R2 fastq file (gzipped allowed)", shortName = "R2", required = false) var input_R2: File = _ - @Argument(doc = "Skip Trim fastq files", shortName = "skiptrim", required = false) - var skipTrim: Boolean = config("skiptrim", default = false) + /** Skip Trim fastq files */ + var skipTrim: Boolean = config("skip_trim", default = false) - @Argument(doc = "Skip Clip fastq files", shortName = "skipclip", required = false) - var skipClip: Boolean = config("skipclip", default = false) + /** Skip Clip fastq files */ + var skipClip: Boolean = config("skip_clip", default = false) - @Argument(doc = "Sample name", shortName = "sample", required = true) - var sampleName: String = _ + /** Sample name */ + var sampleId: String = _ - @Argument(doc = "Library name", shortName = "library", required = true) - var libraryName: String = _ + /** Library name */ + var libraryId: String = _ var paired: Boolean = (input_R2 != null) var R1_ext: String = _ @@ -60,8 +60,8 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { def init() { if (input_R1 == null) throw new IllegalStateException("Missing R1 on flexiprep module") if (outputDir == null) throw new IllegalStateException("Missing Output directory on flexiprep module") - if (sampleName == null) throw new IllegalStateException("Missing Sample name on flexiprep module") - if (libraryName == null) throw new IllegalStateException("Missing Library name on flexiprep module") + if (sampleId == null) throw new IllegalStateException("Missing Sample name on flexiprep module") + if (libraryId == null) throw new IllegalStateException("Missing Library name on flexiprep module") else if (!outputDir.endsWith("/")) outputDir += "/" paired 
= (input_R2 != null) @@ -79,7 +79,7 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { R2_name = R2_name.substring(0, R2_name.lastIndexOf(R2_ext)) } - summary.out = outputDir + sampleName + "-" + libraryName + ".qc.summary.json" + summary.out = outputDir + sampleId + "-" + libraryId + ".qc.summary.json" } def biopetScript() { @@ -136,24 +136,28 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { var deps: List[File] = if (paired) List(R1, R2) else List(R1) val seqtkSeq_R1 = SeqtkSeq(this, R1, swapExt(outDir, R1, R1_ext, ".sanger" + R1_ext), fastqc_R1) - add(seqtkSeq_R1, isIntermediate = true) + seqtkSeq_R1.isIntermediate = true + add(seqtkSeq_R1) R1 = seqtkSeq_R1.output deps ::= R1 if (paired) { val seqtkSeq_R2 = SeqtkSeq(this, R2, swapExt(outDir, R2, R2_ext, ".sanger" + R2_ext), fastqc_R2) - add(seqtkSeq_R2, isIntermediate = true) + seqtkSeq_R2.isIntermediate = true + add(seqtkSeq_R2) R2 = seqtkSeq_R2.output deps ::= R2 } val seqstat_R1 = Seqstat(this, R1, outDir) - add(seqstat_R1, isIntermediate = true) + seqstat_R1.isIntermediate = true + add(seqstat_R1) summary.addSeqstat(seqstat_R1, R2 = false, after = false, chunk) if (paired) { val seqstat_R2 = Seqstat(this, R2, outDir) - add(seqstat_R2, isIntermediate = true) + seqstat_R2.isIntermediate = true + add(seqstat_R2) summary.addSeqstat(seqstat_R2, R2 = true, after = false, chunk) } @@ -161,7 +165,8 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { val cutadapt_R1 = Cutadapt(this, R1, swapExt(outDir, R1, R1_ext, ".clip" + R1_ext)) cutadapt_R1.fastqc = fastqc_R1 - add(cutadapt_R1, isIntermediate = true) + cutadapt_R1.isIntermediate = true + add(cutadapt_R1) summary.addCutadapt(cutadapt_R1, R2 = false, chunk) R1 = cutadapt_R1.fastq_output deps ::= R1 @@ -171,7 +176,8 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { val cutadapt_R2 = Cutadapt(this, R2, swapExt(outDir, R2, R2_ext, ".clip" + R2_ext)) 
outputFiles += ("cutadapt_R2_stats" -> cutadapt_R2.stats_output) cutadapt_R2.fastqc = fastqc_R2 - add(cutadapt_R2, isIntermediate = true) + cutadapt_R2.isIntermediate = true + add(cutadapt_R2) summary.addCutadapt(cutadapt_R2, R2 = true, chunk) R2 = cutadapt_R2.fastq_output deps ::= R2 @@ -179,7 +185,8 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { val fastqSync = FastqSync(this, cutadapt_R1.fastq_input, cutadapt_R1.fastq_output, cutadapt_R2.fastq_output, swapExt(outDir, R1, R1_ext, ".sync" + R1_ext), swapExt(outDir, R2, R2_ext, ".sync" + R2_ext), swapExt(outDir, R1, R1_ext, ".sync.stats")) fastqSync.deps :::= deps - add(fastqSync, isIntermediate = true) + fastqSync.isIntermediate = true + add(fastqSync) summary.addFastqcSync(fastqSync, chunk) outputFiles += ("syncStats" -> fastqSync.output_stats) R1 = fastqSync.output_R1 diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala index aa5168c1ab5f5a93775e65cdda22bedbc58bf104..855e7288c51d697389adb0e60000e11730f1f4e0 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala @@ -120,8 +120,8 @@ class FlexiprepSummary(val root: Configurable) extends InProcessFunction with Co logger.debug("Start") md5Summary() val summary = - ("samples" := ( flexiprep.sampleName := - ("libraries" := ( flexiprep.libraryName := ( + ("samples" := ( flexiprep.sampleId := + ("libraries" := ( flexiprep.libraryId := ( ("flexiprep" := ( ("clipping" := !flexiprep.skipClip) ->: ("trimming" := !flexiprep.skipTrim) ->: diff --git a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala index 
b1ee9e7a3cc224aea2e9123bc4786ac53ada7c43..87a435d5429efb3ee6dc95e9aeab9c4bb8e7d72c 100644 --- a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala +++ b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala @@ -19,7 +19,7 @@ import nl.lumc.sasc.biopet.core.config.Configurable import java.io.File import java.util.Date import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand } -import nl.lumc.sasc.biopet.extensions.{ Star, Stampy, Bowtie } +import nl.lumc.sasc.biopet.extensions.{ Ln, Star, Stampy, Bowtie } import nl.lumc.sasc.biopet.extensions.bwa.{ BwaSamse, BwaSampe, BwaAln, BwaMem } import nl.lumc.sasc.biopet.tools.FastqSplitter import nl.lumc.sasc.biopet.extensions.picard.{ MarkDuplicates, SortSam, MergeSamFiles, AddOrReplaceReadGroups } @@ -38,61 +38,61 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { @Input(doc = "R2 fastq file", shortName = "R2", required = false) var input_R2: File = _ - @Argument(doc = "Output name", shortName = "outputName", required = false) + /** Output name */ var outputName: String = _ - @Argument(doc = "Skip flexiprep", shortName = "skipflexiprep", required = false) - var skipFlexiprep: Boolean = false + /** Skip flexiprep */ + protected var skipFlexiprep: Boolean = config("skip_flexiprep", default = false) - @Argument(doc = "Skip mark duplicates", shortName = "skipmarkduplicates", required = false) - var skipMarkduplicates: Boolean = false + /** Skip mark duplicates */ + protected var skipMarkduplicates: Boolean = config("skip_markduplicates", default = false) - @Argument(doc = "Skip metrics", shortName = "skipmetrics", required = false) - var skipMetrics: Boolean = false + /** Skip metrics */ + protected var skipMetrics: Boolean = config("skip_metrics", default = false) - @Argument(doc = "Aligner", shortName = "ALN", required = false) - var aligner: String = config("aligner", default = "bwa") + /** Aligner */ + protected var aligner: 
String = config("aligner", default = "bwa") - @Argument(doc = "Reference", shortName = "R", required = false) - var reference: File = config("reference") + /** Reference */ + protected var reference: File = config("reference") - @Argument(doc = "Chunking", shortName = "chunking", required = false) - var chunking: Boolean = config("chunking", false) + /** Number of chunks, when not defined pipeline will automatic calculate number of chunks */ + protected var numberChunks: Option[Int] = config("number_chunks") - @ClassType(classOf[Int]) - @Argument(doc = "Number of chunks, when not defined pipeline will automatic calculate number of chunks", shortName = "numberChunks", required = false) - var numberChunks: Option[Int] = None + /** Enable chunking */ + protected var chunking: Boolean = config("chunking", numberChunks.getOrElse(1) > 1) // Readgroup items - @Argument(doc = "Readgroup ID", shortName = "RGID", required = false) - var RGID: String = config("RGID") + /** Readgroup ID */ + protected var readgroupId: String = _ - @Argument(doc = "Readgroup Library", shortName = "RGLB", required = false) - var RGLB: String = config("RGLB") + /** Readgroup Library */ + var libraryId: String = _ - @Argument(doc = "Readgroup Platform", shortName = "RGPL", required = false) - var RGPL: String = config("RGPL", default = "illumina") + /** Readgroup Platform */ + protected var platform: String = config("platform", default = "illumina") - @Argument(doc = "Readgroup platform unit", shortName = "RGPU", required = false) - var RGPU: String = config("RGPU", default = "na") + /** Readgroup platform unit */ + protected var platformUnit: String = config("platform_unit", default = "na") - @Argument(doc = "Readgroup sample", shortName = "RGSM", required = false) - var RGSM: String = config("RGSM") + /**Readgroup sample */ + var sampleId: String = _ - @Argument(doc = "Readgroup sequencing center", shortName = "RGCN", required = false) - var RGCN: String = config("RGCN") + /** Readgroup 
sequencing center */ + protected var readgroupSequencingCenter: Option[String] = config("readgroup_sequencing_center") - @Argument(doc = "Readgroup description", shortName = "RGDS", required = false) - var RGDS: String = config("RGDS") + /** Readgroup description */ + protected var readgroupDescription: Option[String] = config("readgroup_description") - @Argument(doc = "Readgroup sequencing date", shortName = "RGDT", required = false) - var RGDT: Date = _ + /** Readgroup sequencing date */ + protected var readgroupDate: Date = _ - @Argument(doc = "Readgroup predicted insert size", shortName = "RGPI", required = false) - var RGPI: Option[Int] = config("RGPI") + /** Readgroup predicted insert size */ + protected var predictedInsertsize: Option[Int] = config("predicted_insertsize") - var paired: Boolean = false + protected var paired: Boolean = false val flexiprep = new Flexiprep(this) + def finalBamFile: File = outputDir + outputName + ".final.bam" def init() { if (outputDir == null) throw new IllegalStateException("Missing Output directory on mapping module") @@ -100,12 +100,12 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { if (input_R1 == null) throw new IllegalStateException("Missing FastQ R1 on mapping module") paired = (input_R2 != null) - if (RGLB == null) throw new IllegalStateException("Missing Readgroup library on mapping module") - if (RGLB == null) throw new IllegalStateException("Missing Readgroup sample on mapping module") - if (RGID == null && RGSM != null && RGLB != null) RGID = RGSM + "-" + RGLB - else if (RGID == null) throw new IllegalStateException("Missing Readgroup ID on mapping module") + if (libraryId == null) libraryId = config("library_id") + if (sampleId == null) sampleId = config("sample_id") + if (readgroupId == null && sampleId != null && libraryId != null) readgroupId = sampleId + "-" + libraryId + else if (readgroupId == null) readgroupId = config("readgroup_id") - if (outputName == null) outputName = RGID 
+ if (outputName == null) outputName = readgroupId if (chunking) { if (numberChunks.isEmpty) { @@ -126,8 +126,8 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { flexiprep.outputDir = outputDir + "flexiprep/" flexiprep.input_R1 = input_R1 if (paired) flexiprep.input_R2 = input_R2 - flexiprep.sampleName = this.RGSM - flexiprep.libraryName = this.RGLB + flexiprep.sampleId = this.sampleId + flexiprep.libraryId = this.libraryId flexiprep.init flexiprep.runInitialJobs } @@ -210,6 +210,8 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { if (!skipMetrics) addAll(BamMetrics(this, bamFile, outputDir + "metrics/").functions) + add(Ln(this, swapExt(outputDir, bamFile, ".bam", ".bai"), swapExt(outputDir, finalBamFile, ".bam", ".bai"))) + add(Ln(this, bamFile, finalBamFile)) outputFiles += ("finalBamFile" -> bamFile) } @@ -275,15 +277,15 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { def addStampy(R1: File, R2: File, output: File, deps: List[File]): File = { - var RG: String = "ID:" + RGID + "," - RG += "SM:" + RGSM + "," - RG += "LB:" + RGLB + "," - if (RGDS != null) RG += "DS" + RGDS + "," - RG += "PU:" + RGPU + "," - if (RGPI > 0) RG += "PI:" + RGPI + "," - if (RGCN != null) RG += "CN:" + RGCN + "," - if (RGDT != null) RG += "DT:" + RGDT + "," - RG += "PL:" + RGPL + var RG: String = "ID:" + readgroupId + "," + RG += "SM:" + sampleId + "," + RG += "LB:" + libraryId + "," + if (readgroupDescription != null) RG += "DS" + readgroupDescription + "," + RG += "PU:" + platformUnit + "," + if (predictedInsertsize.getOrElse(0) > 0) RG += "PI:" + predictedInsertsize.get + "," + if (readgroupSequencingCenter.isDefined) RG += "CN:" + readgroupSequencingCenter.get + "," + if (readgroupDate != null) RG += "DT:" + readgroupDate + "," + RG += "PL:" + platform val stampyCmd = new Stampy(this) stampyCmd.R1 = R1 @@ -327,13 +329,13 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { 
val addOrReplaceReadGroups = AddOrReplaceReadGroups(this, input, output) addOrReplaceReadGroups.createIndex = true - addOrReplaceReadGroups.RGID = RGID - addOrReplaceReadGroups.RGLB = RGLB - addOrReplaceReadGroups.RGPL = RGPL - addOrReplaceReadGroups.RGPU = RGPU - addOrReplaceReadGroups.RGSM = RGSM - if (RGCN != null) addOrReplaceReadGroups.RGCN = RGCN - if (RGDS != null) addOrReplaceReadGroups.RGDS = RGDS + addOrReplaceReadGroups.RGID = readgroupId + addOrReplaceReadGroups.RGLB = libraryId + addOrReplaceReadGroups.RGPL = platform + addOrReplaceReadGroups.RGPU = platformUnit + addOrReplaceReadGroups.RGSM = sampleId + if (readgroupSequencingCenter.isDefined) addOrReplaceReadGroups.RGCN = readgroupSequencingCenter.get + if (readgroupDescription.isDefined) addOrReplaceReadGroups.RGDS = readgroupDescription.get if (!skipMarkduplicates) addOrReplaceReadGroups.isIntermediate = true add(addOrReplaceReadGroups) @@ -341,15 +343,15 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { } def getReadGroup(): String = { - var RG: String = "@RG\\t" + "ID:" + RGID + "\\t" - RG += "LB:" + RGLB + "\\t" - RG += "PL:" + RGPL + "\\t" - RG += "PU:" + RGPU + "\\t" - RG += "SM:" + RGSM + "\\t" - if (RGCN != null) RG += "CN:" + RGCN + "\\t" - if (RGDS != null) RG += "DS" + RGDS + "\\t" - if (RGDT != null) RG += "DT" + RGDT + "\\t" - if (RGPI.isDefined) RG += "PI" + RGPI.get + "\\t" + var RG: String = "@RG\\t" + "ID:" + readgroupId + "\\t" + RG += "LB:" + libraryId + "\\t" + RG += "PL:" + platform + "\\t" + RG += "PU:" + platformUnit + "\\t" + RG += "SM:" + sampleId + "\\t" + if (readgroupSequencingCenter.isDefined) RG += "CN:" + readgroupSequencingCenter.get + "\\t" + if (readgroupDescription.isDefined) RG += "DS:" + readgroupDescription.get + "\\t" + if (readgroupDate != null) RG += "DT:" + readgroupDate + "\\t" + if (predictedInsertsize.isDefined) RG += "PI:" + predictedInsertsize.get + "\\t" return RG.substring(0, RG.lastIndexOf("\\t")) return RG.substring(0, RG.lastIndexOf("\\t")) } diff --git 
a/public/sage/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala b/public/sage/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala index c3f88b61f866ea6b62bada4d789b5b43037a8428..851e8481c5da2eff863760e1a9dfcaef180cc369 100644 --- a/public/sage/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala +++ b/public/sage/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala @@ -15,7 +15,7 @@ */ package nl.lumc.sasc.biopet.pipelines.sage -import nl.lumc.sasc.biopet.core.{ MultiSampleQScript, PipelineCommand } +import nl.lumc.sasc.biopet.core.{ BiopetQScript, MultiSampleQScript, PipelineCommand } import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.extensions.Cat import nl.lumc.sasc.biopet.extensions.bedtools.BedtoolsCoverage @@ -28,38 +28,101 @@ import nl.lumc.sasc.biopet.scripts.SquishBed import nl.lumc.sasc.biopet.tools.SageCountFastq import nl.lumc.sasc.biopet.tools.SageCreateLibrary import nl.lumc.sasc.biopet.tools.SageCreateTagCounts +import nl.lumc.sasc.biopet.utils.ConfigUtils import org.broadinstitute.gatk.queue.QScript class Sage(val root: Configurable) extends QScript with MultiSampleQScript { + qscript => def this() = this(null) - @Input(doc = "countBed", required = false) var countBed: File = config("count_bed") - @Input(doc = "squishedCountBed, by suppling this file the auto squish job will be skipped", required = false) var squishedCountBed: File = config("squished_count_bed") - @Input(doc = "Transcriptome, used for generation of tag library", required = false) var transcriptome: File = config("transcriptome") var tagsLibrary: File = config("tags_library") - defaults ++= Map("bowtie" -> Map( + override def defaults = ConfigUtils.mergeMaps(Map("bowtie" -> Map( "m" -> 1, "k" -> 1, "best" -> true, "strata" -> true, "seedmms" -> 1 + ), "mapping" -> Map( + "aligner" -> "bowtie", + "skip_flexiprep" -> true, + "skip_markduplicates" -> true + ), "flexiprep" -> Map( + "skip_clip" -> true, + "skip_trim" -> 
true ) - ) - - class LibraryOutput extends AbstractLibraryOutput { - var mappedBamFile: File = _ - var prefixFastq: File = _ - } - - class SampleOutput extends AbstractSampleOutput { + ), super.defaults) + + def makeSample(id: String) = new Sample(id) + class Sample(sampleId: String) extends AbstractSample(sampleId) { + def makeLibrary(id: String) = new Library(id) + class Library(libraryId: String) extends AbstractLibrary(libraryId) { + val inputFastq: File = config("R1", required = true) + val prefixFastq: File = createFile(".prefix.fastq") + + val flexiprep = new Flexiprep(qscript) + flexiprep.sampleId = sampleId + flexiprep.libraryId = libraryId + + val mapping = new Mapping(qscript) + mapping.libraryId = libraryId + mapping.sampleId = sampleId + + protected def addJobs(): Unit = { + flexiprep.outputDir = libDir + "flexiprep/" + flexiprep.input_R1 = inputFastq + flexiprep.init + flexiprep.biopetScript + qscript.addAll(flexiprep.functions) + + val flexiprepOutput = for ((key, file) <- flexiprep.outputFiles if key.endsWith("output_R1")) yield file + val pf = new PrefixFastq(qscript) + pf.inputFastq = flexiprepOutput.head + pf.outputFastq = prefixFastq + pf.prefixSeq = config("sage_tag", default = "CATG") + pf.deps +:= flexiprep.outputFiles("fastq_input_R1") + qscript.add(pf) + + mapping.input_R1 = pf.outputFastq + mapping.outputDir = libDir + mapping.init + mapping.biopetScript + qscript.addAll(mapping.functions) + + if (config("library_counts", default = false).asBoolean) { + addBedtoolsCounts(mapping.finalBamFile, sampleId + "-" + libraryId, libDir) + addTablibCounts(pf.outputFastq, sampleId + "-" + libraryId, libDir) + } + } + } + protected def addJobs(): Unit = { + addLibsJobs() + val libraryBamfiles = libraries.map(_._2.mapping.finalBamFile).toList + val libraryFastqFiles = libraries.map(_._2.prefixFastq).toList + + val bamFile: File = if (libraryBamfiles.size == 1) libraryBamfiles.head + else if (libraryBamfiles.size > 1) { + val mergeSamFiles = 
MergeSamFiles(qscript, libraryBamfiles, sampleDir) + qscript.add(mergeSamFiles) + mergeSamFiles.output + } else null + val fastqFile: File = if (libraryFastqFiles.size == 1) libraryFastqFiles.head + else if (libraryFastqFiles.size > 1) { + val cat = Cat(qscript, libraryFastqFiles, sampleDir + sampleId + ".fastq") + qscript.add(cat) + cat.output + } else null + + addBedtoolsCounts(bamFile, sampleId, sampleDir) + addTablibCounts(fastqFile, sampleId, sampleDir) + } } def init() { @@ -88,88 +151,7 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript { tagsLibrary = cdl.output } - runSamplesJobs - } - - // Called for each sample - def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = { - val sampleOutput = new SampleOutput - var libraryBamfiles: List[File] = List() - var libraryFastqFiles: List[File] = List() - val sampleID: String = sampleConfig("ID").toString - val sampleDir: String = globalSampleDir + sampleID + "/" - for ((library, libraryFiles) <- runLibraryJobs(sampleConfig)) { - libraryFastqFiles +:= libraryFiles.prefixFastq - libraryBamfiles +:= libraryFiles.mappedBamFile - } - - val bamFile: File = if (libraryBamfiles.size == 1) libraryBamfiles.head - else if (libraryBamfiles.size > 1) { - val mergeSamFiles = MergeSamFiles(this, libraryBamfiles, sampleDir) - add(mergeSamFiles) - mergeSamFiles.output - } else null - val fastqFile: File = if (libraryFastqFiles.size == 1) libraryFastqFiles.head - else if (libraryFastqFiles.size > 1) { - val cat = Cat.apply(this, libraryFastqFiles, sampleDir + sampleID + ".fastq") - add(cat) - cat.output - } else null - - addBedtoolsCounts(bamFile, sampleID, sampleDir) - addTablibCounts(fastqFile, sampleID, sampleDir) - - return sampleOutput - } - - // Called for each run from a sample - def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput = { - val libraryOutput = new LibraryOutput - val runID: String = runConfig("ID").toString - val 
sampleID: String = sampleConfig("ID").toString - val runDir: String = globalSampleDir + sampleID + "/run_" + runID + "/" - if (runConfig.contains("R1")) { - val flexiprep = new Flexiprep(this) - flexiprep.outputDir = runDir + "flexiprep/" - flexiprep.input_R1 = new File(runConfig("R1").toString) - flexiprep.skipClip = true - flexiprep.skipTrim = true - flexiprep.sampleName = sampleID - flexiprep.libraryName = runID - flexiprep.init - flexiprep.biopetScript - addAll(flexiprep.functions) - - val flexiprepOutput = for ((key, file) <- flexiprep.outputFiles if key.endsWith("output_R1")) yield file - val prefixFastq = PrefixFastq(this, flexiprepOutput.head, runDir) - prefixFastq.prefixSeq = config("sage_tag", default = "CATG") - prefixFastq.deps +:= flexiprep.outputFiles("fastq_input_R1") - add(prefixFastq) - libraryOutput.prefixFastq = prefixFastq.outputFastq - - val mapping = new Mapping(this) - mapping.skipFlexiprep = true - mapping.skipMarkduplicates = true - mapping.aligner = config("aligner", default = "bowtie") - mapping.input_R1 = prefixFastq.outputFastq - mapping.RGLB = runConfig("ID").toString - mapping.RGSM = sampleConfig("ID").toString - if (runConfig.contains("PL")) mapping.RGPL = runConfig("PL").toString - if (runConfig.contains("PU")) mapping.RGPU = runConfig("PU").toString - if (runConfig.contains("CN")) mapping.RGCN = runConfig("CN").toString - mapping.outputDir = runDir - mapping.init - mapping.biopetScript - addAll(mapping.functions) - - if (config("library_counts", default = false).asBoolean) { - addBedtoolsCounts(mapping.outputFiles("finalBamFile"), sampleID + "-" + runID, runDir) - addTablibCounts(prefixFastq.outputFastq, sampleID + "-" + runID, runDir) - } - - libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile") - } else this.logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig) - return libraryOutput + addSamplesJobs } def addBedtoolsCounts(bamFile: File, outputPrefix: String, outputDir: String) { @@ -211,4 
+193,4 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript { } } -object Sage extends PipelineCommand +object Sage extends PipelineCommand \ No newline at end of file diff --git a/public/yamsvp/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala b/public/yamsvp/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala index 29b299c30e2071df0a631d3cb0c1ec73149bb043..8e3ef0af511f28b19dda8403258b024ab8936b42 100644 --- a/public/yamsvp/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala +++ b/public/yamsvp/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala @@ -20,8 +20,7 @@ package nl.lumc.sasc.biopet.pipelines.yamsvp import nl.lumc.sasc.biopet.core.config.Configurable -import nl.lumc.sasc.biopet.core.MultiSampleQScript -import nl.lumc.sasc.biopet.core.PipelineCommand +import nl.lumc.sasc.biopet.core.{ BiopetQScript, MultiSampleQScript, PipelineCommand } import nl.lumc.sasc.biopet.extensions.sambamba.{ SambambaIndex, SambambaMerge } import nl.lumc.sasc.biopet.extensions.svcallers.pindel.Pindel @@ -33,12 +32,12 @@ import org.broadinstitute.gatk.queue.QScript import org.broadinstitute.gatk.queue.function._ import org.broadinstitute.gatk.queue.engine.JobRunInfo -class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { +class Yamsvp(val root: Configurable) extends QScript with BiopetQScript { //with MultiSampleQScript { def this() = this(null) var reference: File = config("reference", required = true) var finalBamFiles: List[File] = Nil - + /* class LibraryOutput extends AbstractLibraryOutput { var mappedBamFile: File = _ } @@ -47,7 +46,7 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { var vcf: Map[String, List[File]] = Map() var mappedBamFile: File = _ } - +*/ override def init() { if (outputDir == null) throw new IllegalStateException("Output directory is not specified in the config / argument") @@ -61,7 +60,7 @@ class Yamsvp(val root: 
Configurable) extends QScript with MultiSampleQScript { // read config and set all parameters for the pipeline logger.info("Starting YAM SV Pipeline") - runSamplesJobs + //runSamplesJobs // } @@ -69,19 +68,18 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { override def onExecutionDone(jobs: Map[QFunction, JobRunInfo], success: Boolean) { logger.info("YAM SV Pipeline has run .......................") } - - def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = { + /* + def runSingleSampleJobs(sampleID: String): SampleOutput = { val sampleOutput = new SampleOutput var libraryBamfiles: List[File] = List() var outputFiles: Map[String, List[File]] = Map() var libraryFastqFiles: List[File] = List() - val sampleID: String = sampleConfig("ID").toString val sampleDir: String = outputDir + sampleID + "/" val alignmentDir: String = sampleDir + "alignment/" val svcallingDir: String = sampleDir + "svcalls/" - sampleOutput.libraries = runLibraryJobs(sampleConfig) + sampleOutput.libraries = runLibraryJobs(sampleID) for ((libraryID, libraryOutput) <- sampleOutput.libraries) { // this is extending the libraryBamfiles list like '~=' in D or .append in Python or .push_back in C++ libraryBamfiles ++= List(libraryOutput.mappedBamFile) @@ -126,29 +124,27 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { // Called for each run from a sample - def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput = { + def runSingleLibraryJobs(libraryId: String, sampleID: String): LibraryOutput = { val libraryOutput = new LibraryOutput - val runID: String = runConfig("ID").toString - val sampleID: String = sampleConfig("ID").toString val alignmentDir: String = outputDir + sampleID + "/alignment/" - val runDir: String = alignmentDir + "run_" + runID + "/" + val runDir: String = alignmentDir + "run_" + libraryId + "/" - if (runConfig.contains("R1")) { + if (config.contains("R1")) 
{ val mapping = new Mapping(this) mapping.aligner = config("aligner", default = "stampy") mapping.skipFlexiprep = false mapping.skipMarkduplicates = true // we do the dedup marking using Sambamba - if (runConfig.contains("R1")) mapping.input_R1 = new File(runConfig("R1").toString) - if (runConfig.contains("R2")) mapping.input_R2 = new File(runConfig("R2").toString) + mapping.input_R1 = config("R1") + mapping.input_R2 = config("R2") mapping.paired = (mapping.input_R2 != null) - mapping.RGLB = runConfig("ID").toString - mapping.RGSM = sampleConfig("ID").toString - if (runConfig.contains("PL")) mapping.RGPL = runConfig("PL").toString - if (runConfig.contains("PU")) mapping.RGPU = runConfig("PU").toString - if (runConfig.contains("CN")) mapping.RGCN = runConfig("CN").toString + mapping.RGLB = libraryId + mapping.RGSM = sampleID + mapping.RGPL = config("PL") + mapping.RGPU = config("PU") + mapping.RGCN = config("CN") mapping.outputDir = runDir mapping.init @@ -158,11 +154,12 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { // start sambamba dedup libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile") - } else this.logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig) + } else this.logger.error("Sample: " + sampleID + ": No R1 found for library: " + libraryId) return libraryOutput // logger.debug(outputFiles) // return outputFiles } + */ } object Yamsvp extends PipelineCommand \ No newline at end of file