diff --git a/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala b/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala index 612ddc50c9eb30a809b24fa459c5e31bdccd6d86..7b22399d3c33f1c71883b30c5aaeb7150324b4f5 100644 --- a/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala +++ b/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala @@ -6,7 +6,7 @@ package nl.lumc.sasc.biopet.core object BiopetExecutableProtected extends BiopetExecutable { - def pipelines: List[MainCommand] = BiopetExecutablePublic.protectedPipelines ::: List( + def pipelines: List[MainCommand] = BiopetExecutablePublic.pipelines ::: List( nl.lumc.sasc.biopet.pipelines.gatk.Shiva, nl.lumc.sasc.biopet.pipelines.gatk.ShivaVariantcalling, nl.lumc.sasc.biopet.pipelines.gatk.Basty) diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/kraken/Kraken.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/kraken/Kraken.scala index 89d72249ab3751eef9b40c40e08d3692ce3e1baf..da53b6841c64345420465d60edfd7d2a08fabeee 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/kraken/Kraken.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/kraken/Kraken.scala @@ -28,6 +28,15 @@ class Kraken(val root: Configurable) extends BiopetCommandLineFunction { @Input(doc = "Input: FastQ or FastA") var input: List[File] = _ + @Output(doc = "Unidentified reads", required = false) + var unclassified_out: Option[File] = None + + @Output(doc = "Identified reads", required = false) + var classified_out: Option[File] = None + + @Output(doc = "Output with hits per sequence") + var output: File = _ + var db: File = config("db") var inputFastQ: Boolean = true @@ -36,55 +45,37 @@ class Kraken(val root: Configurable) extends BiopetCommandLineFunction { var compressionBzip: Boolean = false var quick: Boolean = false - var min_hits: Option[Int] = config("min_hits") - - @Output(doc = "Unidentified reads", required = false) - var unclassified_out: Option[File] = None - @Output(doc = "Identified reads", required = false) - var classified_out: Option[File] = None + var minHits: Option[Int] = config("min_hits") - @Output(doc = "Output with hits per sequence") - var output: File = _ - var preload: Boolean = config("preload", default = true) + var preLoad: Boolean = config("preload", default = true) var paired: Boolean = config("paired", default = false) executable = config("exe", default = "kraken") override def versionRegex = """Kraken version ([\d\w\-\.]+)\n.*""".r override def versionExitcode = List(0, 1) + override def versionCommand = executable + " --version" override def defaultCoreMemory = 8.0 override def defaultThreads = 4 - override def versionCommand = executable + " --version" - /** Sets readgroup when not set yet */ override def beforeGraph(): Unit = { super.beforeGraph() + //FIXME: This does not do anything } /** Returns command to execute */ - def cmdLine = { - var cmd: String = required(executable) + - "--db" + required(db) + - optional("--threads", nCoresRequest) + - conditional(inputFastQ, "--fastq-input") + - conditional(!inputFastQ, "--fasta-input") + - conditional(quick, "--quick") - - min_hits match { - case Some(v) => cmd += "--min_hits " + v - case _ => cmd += "" - } - - cmd += optional("--unclassified-out ", unclassified_out.get) + - optional("--classified-out ", classified_out.get) + - "--output" + required(output) + - conditional(preload, "--preload") + - conditional(paired, "--paired") - - // finally the input files (R1 [R2]) - cmd += input.mkString(" ") - - cmd - } + def cmdLine = required(executable) + + "--db" + required(db) + + optional("--threads", nCoresRequest) + + conditional(inputFastQ, "--fastq-input") + + conditional(!inputFastQ, "--fasta-input") + + conditional(quick, "--quick") + + optional("--min_hits", minHits) + + optional("--unclassified-out ", unclassified_out.get) + + optional("--classified-out ", classified_out.get) + + "--output" + required(output) + + conditional(preLoad, "--preload") + + conditional(paired, "--paired") + + repeat(input) } diff --git a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala index 725ab4e98e3fc02e3794f711a6074d0a51c5f789..40012aa2759279307589ef9309474f9841d34921 100644 --- a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala +++ b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala @@ -16,7 +16,7 @@ package nl.lumc.sasc.biopet.core object BiopetExecutablePublic extends BiopetExecutable { - def protectedPipelines: List[MainCommand] = List( + def publicPipelines: List[MainCommand] = List( nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep, nl.lumc.sasc.biopet.pipelines.mapping.Mapping, nl.lumc.sasc.biopet.pipelines.gentrap.Gentrap, @@ -25,15 +25,15 @@ object BiopetExecutablePublic extends BiopetExecutable { nl.lumc.sasc.biopet.pipelines.bamtobigwig.Bam2Wig, nl.lumc.sasc.biopet.pipelines.carp.Carp, nl.lumc.sasc.biopet.pipelines.toucan.Toucan, - nl.lumc.sasc.biopet.pipelines.shiva.ShivaSvCalling + nl.lumc.sasc.biopet.pipelines.shiva.ShivaSvCalling, + nl.lumc.sasc.biopet.pipelines.gears.Gears ) def pipelines: List[MainCommand] = List( nl.lumc.sasc.biopet.pipelines.shiva.Shiva, nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling, - nl.lumc.sasc.biopet.pipelines.basty.Basty, - nl.lumc.sasc.biopet.pipelines.gears.Gears - ) ::: protectedPipelines + nl.lumc.sasc.biopet.pipelines.basty.Basty + ) ::: publicPipelines def tools: List[MainCommand] = List( nl.lumc.sasc.biopet.tools.MergeTables, diff --git a/public/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/Gears.scala b/public/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/Gears.scala index 892082f063ee9220a611d57fbed41529c6c40905..0e14359fd649a6d1d56ec5c3aea23fc53f1cbec7 100644 --- a/public/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/Gears.scala +++ b/public/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/Gears.scala @@ -15,12 +15,310 @@ */ package nl.lumc.sasc.biopet.pipelines.gears -import nl.lumc.sasc.biopet.core.PipelineCommand +import htsjdk.samtools.SamReaderFactory +import nl.lumc.sasc.biopet.FullVersion +import nl.lumc.sasc.biopet.core.{ PipelineCommand, MultiSampleQScript } import nl.lumc.sasc.biopet.core.config.Configurable +import nl.lumc.sasc.biopet.extensions.Ln +import nl.lumc.sasc.biopet.extensions.kraken.{ Kraken, KrakenReport } +import nl.lumc.sasc.biopet.extensions.picard.{ AddOrReplaceReadGroups, MarkDuplicates, MergeSamFiles, SamToFastq } +import nl.lumc.sasc.biopet.extensions.sambamba.SambambaView +import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics +import nl.lumc.sasc.biopet.pipelines.mapping.Mapping +import nl.lumc.sasc.biopet.tools.FastqSync import org.broadinstitute.gatk.queue.QScript -class Gears(val root: Configurable) extends QScript with GearsTrait { +import scala.collection.JavaConversions._ + +/** + * This is a trait for the Gears pipeline + * The ShivaTrait is used as template for this pipeline + */ +class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qscript => def this() = this(null) + + /** Executed before running the script */ + def init(): Unit = { + } + + /** Method to add jobs */ + def biopetScript(): Unit = { + addSamplesJobs() + addSummaryJobs() + } + + /** Multisample meta-genome comparison */ + def addMultiSampleJobs(): Unit = { + // generate report from multiple samples, this is: + // - the TSV + // - the Spearman correlation plot + table + } + + /** Location of summary file */ + def summaryFile = new File(outputDir, "gears.summary.json") + + /** Settings of pipeline for summary */ + def summarySettings = Map() + + /** Files for the summary */ + def summaryFiles = Map() + + /** Method to make a sample */ + def makeSample(id: String) = new Sample(id) + + /** Class that will generate jobs for a sample */ + class Sample(sampleId: String) extends AbstractSample(sampleId) { + /** Sample specific files to add to summary */ + def summaryFiles: Map[String, File] = { + preProcessBam match { + case Some(pb) => Map("bamFile" -> pb) + case _ => Map() + } + } ++ Map("alignment" -> alnFile) + + /** Sample specific stats to add to summary */ + def summaryStats: Map[String, Any] = Map() + + /** Method to make a library */ + def makeLibrary(id: String) = new Library(id) + + /** Class to generate jobs for a library */ + class Library(libId: String) extends AbstractLibrary(libId) { + /** Library specific files to add to the summary */ + def summaryFiles: Map[String, File] = { + (bamFile, preProcessBam) match { + case (Some(b), Some(pb)) => Map("bamFile" -> b, "preProcessBam" -> pb) + case (Some(b), _) => Map("bamFile" -> b) + case _ => Map() + } + } + + /** Alignment results of this library ~ can only be accessed after addJobs is run! */ + def alnFile: File = bamFile match { + case Some(b) => b + case _ => throw new IllegalStateException("The bamfile is not generated yet") + } + + /** Library specific stats to add to summary */ + def summaryStats: Map[String, Any] = Map() + + /** Method to execute library preprocess */ + def preProcess(input: File): Option[File] = None + + /** Method to make the mapping submodule */ + def makeMapping = { + val mapping = new Mapping(qscript) + mapping.sampleId = Some(sampleId) + mapping.libId = Some(libId) + mapping.outputDir = libDir + mapping.outputName = sampleId + "-" + libId + (Some(mapping), Some(mapping.finalBamFile), preProcess(mapping.finalBamFile)) + } + + /** + * Determine where where to start the pipeline in cases where both R1 (fastq) and BAM is specified + */ + lazy val (mapping, bamFile, preProcessBam): (Option[Mapping], Option[File], Option[File]) = + (config.contains("R1"), config.contains("bam")) match { + case (true, _) => makeMapping // Default starting from fastq files + case (false, true) => // Starting from bam file + config("bam_to_fastq", default = false).asBoolean match { + case true => makeMapping // bam file will be converted to fastq + case false => + val file = new File(libDir, sampleId + "-" + libId + ".final.bam") + (None, Some(file), preProcess(file)) + } + case _ => (None, None, None) + } + + /** This will add jobs for this library */ + def addJobs(): Unit = { + (config.contains("R1"), config.contains("bam")) match { + case (true, _) => mapping.foreach(mapping => { + mapping.input_R1 = config("R1") + }) + case (false, true) => config("bam_to_fastq", default = false).asBoolean match { + case true => + val samToFastq = SamToFastq(qscript, config("bam"), + new File(libDir, sampleId + "-" + libId + ".R1.fastq"), + new File(libDir, sampleId + "-" + libId + ".R2.fastq")) + samToFastq.isIntermediate = true + qscript.add(samToFastq) + mapping.foreach(mapping => { + mapping.input_R1 = samToFastq.fastqR1 + mapping.input_R2 = Some(samToFastq.fastqR2) + }) + case false => + val inputSam = SamReaderFactory.makeDefault.open(config("bam")) + val readGroups = inputSam.getFileHeader.getReadGroups + + val readGroupOke = readGroups.forall(readGroup => { + if (readGroup.getSample != sampleId) logger.warn("Sample ID readgroup in bam file is not the same") + if (readGroup.getLibrary != libId) logger.warn("Library ID readgroup in bam file is not the same") + readGroup.getSample == sampleId && readGroup.getLibrary == libId + }) + inputSam.close() + + if (!readGroupOke) { + if (config("correct_readgroups", default = false).asBoolean) { + logger.info("Correcting readgroups, file:" + config("bam")) + val aorrg = AddOrReplaceReadGroups(qscript, config("bam"), bamFile.get) + aorrg.RGID = sampleId + "-" + libId + aorrg.RGLB = libId + aorrg.RGSM = sampleId + aorrg.isIntermediate = true + qscript.add(aorrg) + } else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile + + "\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatic fix this") + } else { + val oldBamFile: File = config("bam") + val oldIndex: File = new File(oldBamFile.getAbsolutePath.stripSuffix(".bam") + ".bai") + val newIndex: File = new File(libDir, oldBamFile.getName.stripSuffix(".bam") + ".bai") + val baiLn = Ln(qscript, oldIndex, newIndex) + add(baiLn) + + val bamLn = Ln(qscript, oldBamFile, bamFile.get) + bamLn.deps :+= baiLn.output + add(bamLn) + } + } + case _ => logger.warn("Sample: " + sampleId + " Library: " + libId + ", no reads found") + } + mapping.foreach(mapping => { + mapping.init() + mapping.biopetScript() + addAll(mapping.functions) // Add functions of mapping to current function pool + addSummaryQScript(mapping) + }) + } + } + + /** This will add jobs for the double preprocessing */ + protected def addDoublePreProcess(input: List[File], isIntermediate: Boolean = false): Option[File] = { + if (input == Nil) None + else if (input.tail == Nil) { + val bamFile = new File(sampleDir, input.head.getName) + val oldIndex: File = new File(input.head.getAbsolutePath.stripSuffix(".bam") + ".bai") + val newIndex: File = new File(sampleDir, input.head.getName.stripSuffix(".bam") + ".bai") + val baiLn = Ln(qscript, oldIndex, newIndex) + add(baiLn) + + val bamLn = Ln(qscript, input.head, bamFile) + bamLn.deps :+= baiLn.output + add(bamLn) + Some(bamFile) + } else { + val md = new MarkDuplicates(qscript) + md.input = input + md.output = new File(sampleDir, sampleId + ".dedup.bam") + md.outputMetrics = new File(sampleDir, sampleId + ".dedup.metrics") + md.isIntermediate = isIntermediate + md.removeDuplicates = true + add(md) + addSummarizable(md, "mark_duplicates") + Some(md.output) + } + } + + lazy val preProcessBam: Option[File] = addDoublePreProcess(libraries.flatMap(lib => { + (lib._2.bamFile, lib._2.preProcessBam) match { + case (_, Some(file)) => Some(file) + case (Some(file), _) => Some(file) + case _ => None + } + }).toList) + def alnFile: File = sampleBamLinkJob.output + + /** Job for combining all library BAMs */ + private def sampleBamLinkJob: Ln = + makeCombineJob(libraries.values.map(_.alnFile).toList, createFile(".bam")) + + /** Ln or MergeSamFile job, depending on how many inputs are supplied */ + private def makeCombineJob(inFiles: List[File], outFile: File, + mergeSortOrder: String = "coordinate"): Ln = { + require(inFiles.nonEmpty, "At least one input files for combine job") + val input: File = { + + if (inFiles.size == 1) inFiles.head + else { + val mergedBam = createFile(".merged.bam") + val mergejob = new MergeSamFiles(qscript) + mergejob.input = inFiles + mergejob.output = mergedBam + mergejob.sortOrder = mergeSortOrder + add(mergejob) + mergejob.output + } + } + + val linkJob = new Ln(qscript) + linkJob.input = input + linkJob.output = outFile + linkJob + + } + + /** This will add sample jobs */ + def addJobs(): Unit = { + addPerLibJobs() + // merge or symlink per-library alignments + add(sampleBamLinkJob) + + if (preProcessBam.isDefined) { + val bamMetrics = new BamMetrics(qscript) + bamMetrics.sampleId = Some(sampleId) + bamMetrics.inputBam = preProcessBam.get + bamMetrics.outputDir = sampleDir + bamMetrics.init() + bamMetrics.biopetScript() + addAll(bamMetrics.functions) + addSummaryQScript(bamMetrics) + } + + // sambamba view -f bam -F "unmapped or mate_is_unmapped" <alnFile> > <extracted.bam> + val samFilterUnmapped = new SambambaView(qscript) + samFilterUnmapped.input = alnFile + samFilterUnmapped.filter = Some("unmapped or mate_is_unmapped") + samFilterUnmapped.output = createFile(".unmapped.bam") + samFilterUnmapped.isIntermediate = true + qscript.add(samFilterUnmapped) + + // start bam to fastq (only on unaligned reads) also extract the matesam + val samToFastq = SamToFastq(qscript, alnFile, + createFile(".unmap.R1.fastq"), + createFile(".unmap.R2.fastq") + ) + samToFastq.isIntermediate = true + qscript.add(samToFastq) + + // sync the fastq records + val fastqSync = new FastqSync(qscript) + fastqSync.refFastq = samToFastq.fastqR1 + fastqSync.inputFastq1 = samToFastq.fastqR1 + fastqSync.inputFastq2 = samToFastq.fastqR2 + fastqSync.outputFastq1 = createFile(".unmapsynced.R1.fastq.gz") + fastqSync.outputFastq2 = createFile(".unmapsynced.R2.fastq.gz") + fastqSync.outputStats = createFile(".syncstats.json") + qscript.add(fastqSync) + + // start kraken + val krakenAnalysis = new Kraken(qscript) + krakenAnalysis.input = List(fastqSync.outputFastq1, fastqSync.outputFastq2) + krakenAnalysis.output = createFile(".krkn.raw") + krakenAnalysis.paired = true + krakenAnalysis.classified_out = Option(createFile(".krkn.classified.fastq")) + krakenAnalysis.unclassified_out = Option(createFile(".krkn.unclassified.fastq")) + qscript.add(krakenAnalysis) + + // create kraken summary file + + val krakenReport = new KrakenReport(qscript) + krakenReport.input = krakenAnalysis.output + krakenReport.show_zeros = true + krakenReport.output = createFile(".krkn.full") + qscript.add(krakenReport) + } + } } /** This object give a default main method to the pipelines */ diff --git a/public/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsTrait.scala b/public/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsTrait.scala deleted file mode 100644 index c0f2cd8cd2bae5e128f19cda457ed62b9aa93283..0000000000000000000000000000000000000000 --- a/public/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsTrait.scala +++ /dev/null @@ -1,328 +0,0 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. - */ -package nl.lumc.sasc.biopet.pipelines.gears - -import java.io.File - -import htsjdk.samtools.SamReaderFactory -import nl.lumc.sasc.biopet.FullVersion -import nl.lumc.sasc.biopet.core.MultiSampleQScript -import nl.lumc.sasc.biopet.core.summary.SummaryQScript -import nl.lumc.sasc.biopet.extensions.Ln -import nl.lumc.sasc.biopet.extensions.kraken.{ Kraken, KrakenReport } -import nl.lumc.sasc.biopet.extensions.picard.{ AddOrReplaceReadGroups, MarkDuplicates, MergeSamFiles, SamToFastq } -import nl.lumc.sasc.biopet.extensions.sambamba.SambambaView -import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics -import nl.lumc.sasc.biopet.pipelines.mapping.Mapping -import nl.lumc.sasc.biopet.tools.FastqSync - -import scala.collection.JavaConversions._ - -/** - * This is a trait for the Gears pipeline - * The ShivaTrait is used as template for this pipeline - */ -trait GearsTrait extends MultiSampleQScript with SummaryQScript { qscript => - - /** Executed before running the script */ - def init(): Unit = { - } - - /** Method to add jobs */ - def biopetScript(): Unit = { - addSamplesJobs() - addSummaryJobs() - } - - /** Multisample meta-genome comparison */ - def addMultiSampleJobs(): Unit = { - // generate report from multiple samples, this is: - // - the TSV - // - the Spearman correlation plot + table - } - - /** Location of summary file */ - def summaryFile = new File(outputDir, "gears.summary.json") - - /** Settings of pipeline for summary */ - def summarySettings = Map( - "version" -> FullVersion - ) - - /** Files for the summary */ - def summaryFiles = Map() - - /** Method to make a sample */ - def makeSample(id: String) = new Sample(id) - - /** Class that will generate jobs for a sample */ - class Sample(sampleId: String) extends AbstractSample(sampleId) { - /** Sample specific files to add to summary */ - def summaryFiles: Map[String, File] = { - preProcessBam match { - case Some(pb) => Map("bamFile" -> pb) - case _ => Map() - } - } ++ Map( - "alignment" -> alnFile - ) - - /** Sample specific stats to add to summary */ - def summaryStats: Map[String, Any] = Map() - - /** Method to make a library */ - def makeLibrary(id: String) = new Library(id) - - /** Class to generate jobs for a library */ - class Library(libId: String) extends AbstractLibrary(libId) { - /** Library specific files to add to the summary */ - def summaryFiles: Map[String, File] = { - (bamFile, preProcessBam) match { - case (Some(b), Some(pb)) => Map("bamFile" -> b, "preProcessBam" -> pb) - case (Some(b), _) => Map("bamFile" -> b) - case _ => Map() - } - } - - /** Alignment results of this library ~ can only be accessed after addJobs is run! */ - def alnFile: File = bamFile match { - case Some(b) => b - case _ => throw new IllegalStateException("The bamfile is not generated yet") - } - - /** Library specific stats to add to summary */ - def summaryStats: Map[String, Any] = Map() - - /** Method to execute library preprocess */ - def preProcess(input: File): Option[File] = None - - /** Method to make the mapping submodule */ - def makeMapping = { - val mapping = new Mapping(qscript) - mapping.sampleId = Some(sampleId) - mapping.libId = Some(libId) - mapping.outputDir = libDir - mapping.outputName = sampleId + "-" + libId - (Some(mapping), Some(mapping.finalBamFile), preProcess(mapping.finalBamFile)) - } - - /** - * Determine where where to start the pipeline in cases where both R1 (fastq) and BAM is specified - */ - lazy val (mapping, bamFile, preProcessBam): (Option[Mapping], Option[File], Option[File]) = - (config.contains("R1"), config.contains("bam")) match { - case (true, _) => makeMapping // Default starting from fastq files - case (false, true) => // Starting from bam file - config("bam_to_fastq", default = false).asBoolean match { - case true => makeMapping // bam file will be converted to fastq - case false => - val file = new File(libDir, sampleId + "-" + libId + ".final.bam") - (None, Some(file), preProcess(file)) - } - case _ => (None, None, None) - } - - /** This will add jobs for this library */ - def addJobs(): Unit = { - (config.contains("R1"), config.contains("bam")) match { - case (true, _) => mapping.foreach(mapping => { - mapping.input_R1 = config("R1") - }) - case (false, true) => config("bam_to_fastq", default = false).asBoolean match { - case true => - val samToFastq = SamToFastq(qscript, config("bam"), - new File(libDir, sampleId + "-" + libId + ".R1.fastq"), - new File(libDir, sampleId + "-" + libId + ".R2.fastq")) - samToFastq.isIntermediate = true - qscript.add(samToFastq) - mapping.foreach(mapping => { - mapping.input_R1 = samToFastq.fastqR1 - mapping.input_R2 = Some(samToFastq.fastqR2) - }) - case false => - val inputSam = SamReaderFactory.makeDefault.open(config("bam")) - val readGroups = inputSam.getFileHeader.getReadGroups - - val readGroupOke = readGroups.forall(readGroup => { - if (readGroup.getSample != sampleId) logger.warn("Sample ID readgroup in bam file is not the same") - if (readGroup.getLibrary != libId) logger.warn("Library ID readgroup in bam file is not the same") - readGroup.getSample == sampleId && readGroup.getLibrary == libId - }) - inputSam.close() - - if (!readGroupOke) { - if (config("correct_readgroups", default = false).asBoolean) { - logger.info("Correcting readgroups, file:" + config("bam")) - val aorrg = AddOrReplaceReadGroups(qscript, config("bam"), bamFile.get) - aorrg.RGID = sampleId + "-" + libId - aorrg.RGLB = libId - aorrg.RGSM = sampleId - aorrg.isIntermediate = true - qscript.add(aorrg) - } else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile + - "\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatic fix this") - } else { - val oldBamFile: File = config("bam") - val oldIndex: File = new File(oldBamFile.getAbsolutePath.stripSuffix(".bam") + ".bai") - val newIndex: File = new File(libDir, oldBamFile.getName.stripSuffix(".bam") + ".bai") - val baiLn = Ln(qscript, oldIndex, newIndex) - add(baiLn) - - val bamLn = Ln(qscript, oldBamFile, bamFile.get) - bamLn.deps :+= baiLn.output - add(bamLn) - } - } - case _ => logger.warn("Sample: " + sampleId + " Library: " + libId + ", no reads found") - } - mapping.foreach(mapping => { - mapping.init() - mapping.biopetScript() - addAll(mapping.functions) // Add functions of mapping to current function pool - addSummaryQScript(mapping) - }) - } - } - - /** This will add jobs for the double preprocessing */ - protected def addDoublePreProcess(input: List[File], isIntermediate: Boolean = false): Option[File] = { - if (input == Nil) None - else if (input.tail == Nil) { - val bamFile = new File(sampleDir, input.head.getName) - val oldIndex: File = new File(input.head.getAbsolutePath.stripSuffix(".bam") + ".bai") - val newIndex: File = new File(sampleDir, input.head.getName.stripSuffix(".bam") + ".bai") - val baiLn = Ln(qscript, oldIndex, newIndex) - add(baiLn) - - val bamLn = Ln(qscript, input.head, bamFile) - bamLn.deps :+= baiLn.output - add(bamLn) - Some(bamFile) - } else { - val md = new MarkDuplicates(qscript) - md.input = input - md.output = new File(sampleDir, sampleId + ".dedup.bam") - md.outputMetrics = new File(sampleDir, sampleId + ".dedup.metrics") - md.isIntermediate = isIntermediate - md.removeDuplicates = true - add(md) - addSummarizable(md, "mark_duplicates") - Some(md.output) - } - } - - lazy val preProcessBam: Option[File] = addDoublePreProcess(libraries.flatMap(lib => { - (lib._2.bamFile, lib._2.preProcessBam) match { - case (_, Some(file)) => Some(file) - case (Some(file), _) => Some(file) - case _ => None - } - }).toList) - def alnFile: File = sampleBamLinkJob.output - - /** Job for combining all library BAMs */ - private def sampleBamLinkJob: Ln = - makeCombineJob(libraries.values.map(_.alnFile).toList, createFile(".bam")) - - /** Ln or MergeSamFile job, depending on how many inputs are supplied */ - private def makeCombineJob(inFiles: List[File], outFile: File, - mergeSortOrder: String = "coordinate"): Ln = { - require(inFiles.nonEmpty, "At least one input files for combine job") - val input: File = { - - if (inFiles.size == 1) inFiles.head - else { - val mergedBam = createFile(".merged.bam") - val mergejob = new MergeSamFiles(qscript) - mergejob.input = inFiles - mergejob.output = mergedBam - mergejob.sortOrder = mergeSortOrder - add(mergejob) - mergejob.output - } - } - - val linkJob = new Ln(qscript) - linkJob.input = input - linkJob.output = outFile - linkJob - - } - - /** This will add sample jobs */ - def addJobs(): Unit = { - addPerLibJobs() - // merge or symlink per-library alignments - add(sampleBamLinkJob) - - if (preProcessBam.isDefined) { - val bamMetrics = new BamMetrics(qscript) - bamMetrics.sampleId = Some(sampleId) - bamMetrics.inputBam = preProcessBam.get - bamMetrics.outputDir = sampleDir - bamMetrics.init() - bamMetrics.biopetScript() - addAll(bamMetrics.functions) - addSummaryQScript(bamMetrics) - } - - // sambamba view -f bam -F "unmapped or mate_is_unmapped" <alnFile> > <extracted.bam> - val samFilterUnmapped = new SambambaView(qscript) - samFilterUnmapped.input = alnFile - samFilterUnmapped.filter = Some("unmapped or mate_is_unmapped") - samFilterUnmapped.output = createFile(".unmapped.bam") - samFilterUnmapped.isIntermediate = true - qscript.add(samFilterUnmapped) - - // start bam to fastq (only on unaligned reads) also extract the matesam - val samToFastq = SamToFastq(qscript, alnFile, - createFile(".unmap.R1.fastq"), - createFile(".unmap.R2.fastq") - ) - samToFastq.isIntermediate = true - qscript.add(samToFastq) - - // sync the fastq records - val fastqsync = new FastqSync(qscript) - fastqsync.refFastq = samToFastq.fastqR1 - fastqsync.inputFastq1 = samToFastq.fastqR1 - fastqsync.inputFastq2 = samToFastq.fastqR2 - fastqsync.outputFastq1 = createFile(".unmapsynced.R1.fastq.gz") - fastqsync.outputFastq2 = createFile(".unmapsynced.R2.fastq.gz") - fastqsync.outputStats = createFile(".syncstats.json") - qscript.add(fastqsync) - - // start kraken - val krakenAnalysis = new Kraken(qscript) - krakenAnalysis.input = List(fastqsync.outputFastq1, fastqsync.outputFastq2) - krakenAnalysis.output = createFile(".krkn.raw") - krakenAnalysis.paired = true - krakenAnalysis.classified_out = Option(createFile(".krkn.classified.fastq")) - krakenAnalysis.unclassified_out = Option(createFile(".krkn.unclassified.fastq")) - qscript.add(krakenAnalysis) - - // create kraken summary file - - val krakenReport = new KrakenReport(qscript) - krakenReport.input = krakenAnalysis.output - krakenReport.show_zeros = true - krakenReport.output = createFile(".krkn.full") - qscript.add(krakenReport) - - } - } - -}