diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala index 60c25a5a69820b72a7e5bbd0f17cc8b5f0dac3fe..fb99be4a4f8716c8f96fb6be12d9caa264b249be 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala @@ -24,6 +24,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import scala.collection.mutable import scala.io.Source +import scala.util.matching.Regex /** * Extension for cutadapt @@ -163,6 +164,51 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su (if (outputAsStsout) "" else required("--output", fastqOutput) + " > " + required(statsOutput)) + def extractClippedAdapters(statsOutput: File): Map[String, Any] = { + val histoCountRow: Regex = """([\d]+)\t([\d]+)\t.*""".r + val adapterR = """Sequence: ([C|T|A|G]+);.*Trimmed: ([\d]+) times\.""".r + + val statsFile = Source.fromFile(statsOutput) + val adapterRawStats: Array[String] = statsFile.mkString + .split("=== Adapter [\\d]+ ===") + .filter(_.contains("Sequence") + ) + statsFile.close() + + adapterRawStats.map(adapter => { + var adapterName = "" + var adapterCount = 0 + // identify the adapter name and count + for (line <- adapter.split("\n")) { + line match { + case adapterR(adapter, count) => { + adapterName = adapter + adapterCount = count.toInt + } + case _ => + } + } + + // parse the block that gives the histogram of clipped bases and from which end + val counts = adapter.split("Overview of removed sequences ") + .filter(x => x.contains("length")) + .map(clipSideRawStats => { + val clipSideLabel = if (clipSideRawStats.contains("5'")) { "5p" } else { "3p" } + + val histogramValues = clipSideRawStats.split("\n").flatMap({ + case histoCountRow(length, count) => Some(length.toInt -> count.toInt) + case _ => None + }) + clipSideLabel -> histogramValues.toMap + }) + + adapterName -> Map( + "count" -> adapterCount, + "histogram" -> counts.toMap + ) + }).toMap // converting the Array[String] containing map-items to Map with 'toMap' + } + /** Output summary stats */ def summaryStats: Map[String, Any] = { /** @@ -177,7 +223,6 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su val tooLongR = """.* that were too long: *([,\d]+) .*""".r val tooManyN = """.* with too many N: *([,\d]+) .*""".r - val adapterR = """Sequence ([C|T|A|G]*);.*Trimmed: ([,\d]+) times.""".r val basePairsProcessed = """Total basepairs processed: *([,\d]+) bp""".r val basePairsWritten = """Total written \(filtered\): *([,\d]+) bp .*""".r @@ -192,24 +237,28 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su "bpoutput" -> 0, "toomanyn" -> 0 ) - val adapterStats: mutable.Map[String, Long] = mutable.Map() + + // extract the adapters with its histogram + val adapterStats = if (statsOutput.exists) { + extractClippedAdapters(statsOutput) + } else Map.empty if (statsOutput.exists) { val statsFile = Source.fromFile(statsOutput) for (line <- statsFile.getLines()) { line match { - case processedReads(m) => stats("processed") = m.replaceAll(",", "").toLong - case withAdapters(m) => stats("withadapters") = m.replaceAll(",", "").toLong - case readsPassingFilters(m) => stats("passingfilters") = m.replaceAll(",", "").toLong - case tooShortR(m) => stats("tooshort") = m.replaceAll(",", "").toLong - case tooLongR(m) => stats("toolong") = m.replaceAll(",", "").toLong - case tooManyN(m) => stats("toomanyn") = m.replaceAll(",", "").toLong - case basePairsProcessed(m) => stats("bpinput") = m.replaceAll(",", "").toLong - case basePairsWritten(m) => stats("bpoutput") = m.replaceAll(",", "").toLong - case adapterR(adapter, count) => adapterStats += (adapter -> count.toLong) - case _ => + case processedReads(m) => stats("processed") = m.replaceAll(",", "").toLong + case withAdapters(m) => stats("withadapters") = m.replaceAll(",", "").toLong + case readsPassingFilters(m) => stats("passingfilters") = m.replaceAll(",", "").toLong + case tooShortR(m) => stats("tooshort") = m.replaceAll(",", "").toLong + case tooLongR(m) => stats("toolong") = m.replaceAll(",", "").toLong + case tooManyN(m) => stats("toomanyn") = m.replaceAll(",", "").toLong + case basePairsProcessed(m) => stats("bpinput") = m.replaceAll(",", "").toLong + case basePairsWritten(m) => stats("bpoutput") = m.replaceAll(",", "").toLong + case _ => } } + statsFile.close() } val cleanReads = stats("processed") - stats("withadapters") @@ -223,8 +272,8 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su "num_reads_discarded_too_long" -> stats("toolong"), "num_reads_discarded_many_n" -> stats("toomanyn"), "num_bases_input" -> stats("bpinput"), - "num_based_output" -> stats("bpoutput"), - adaptersStatsName -> adapterStats.toMap + "num_bases_output" -> stats("bpoutput"), + adaptersStatsName -> adapterStats ) } diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala index 50c15dbabc3b51be862c5a3987704ff10a57106d..20b4ae8422936bfc5a8b3b8310074b01b46445a6 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala @@ -156,9 +156,9 @@ class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFu override def beforeGraph(): Unit = { super.beforeGraph() if (!cache && !database) { - Logging.addError("Must supply either cache or database for VariantEffectPredictor") + Logging.addError("Must either set 'cache' or 'database' to true for VariantEffectPredictor") } else if (cache && dir.isEmpty) { - Logging.addError("Must supply dir to cache for VariantEffectPredictor") + Logging.addError("Must supply 'dir_cache' to cache for VariantEffectPredictor") } if (statsText) _summary = new File(output.getAbsolutePath + "_summary.txt") } diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/AnalyzeCovariates.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/AnalyzeCovariates.scala similarity index 98% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/AnalyzeCovariates.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/AnalyzeCovariates.scala index b501d47de6cb0899d8e02ae8a4372fd50de4f7e1..c0740c64008c0d887a879e445c08e41dc1a96cd6 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/AnalyzeCovariates.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/AnalyzeCovariates.scala @@ -1,9 +1,9 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File -import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.core.ScatterGatherableFunction +import nl.lumc.sasc.biopet.utils.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ } class AnalyzeCovariates(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/ApplyRecalibration.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ApplyRecalibration.scala similarity index 96% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/ApplyRecalibration.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ApplyRecalibration.scala index a84aa4b4b8728a1a5c7bbab442a4d905b626821a..b3be8d8578a2a0563274bfc88fa5cf8eb14df543 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/ApplyRecalibration.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ApplyRecalibration.scala @@ -1,15 +1,12 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File -import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import nl.lumc.sasc.biopet.core.ScatterGatherableFunction import nl.lumc.sasc.biopet.utils.VcfUtils -import org.broadinstitute.gatk.utils.commandline.Argument -import org.broadinstitute.gatk.utils.commandline.Gather -import org.broadinstitute.gatk.utils.commandline.Input -import org.broadinstitute.gatk.utils.commandline.Output +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile +import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Input, Output } class ApplyRecalibration(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { def analysis_type = "ApplyRecalibration" diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/BamGatherFunction.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BamGatherFunction.scala similarity index 95% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/BamGatherFunction.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BamGatherFunction.scala index eb821f0e47ec6c5b3444cb8bed7172ae6dc906eb..c7a55537e2750e1cb316dfe51d63f9874faa6a72 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/BamGatherFunction.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BamGatherFunction.scala @@ -1,9 +1,8 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad - -import org.broadinstitute.gatk.queue.function.scattergather.GatherFunction +package nl.lumc.sasc.biopet.extensions.gatk import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction import nl.lumc.sasc.biopet.extensions.picard.MergeSamFiles +import org.broadinstitute.gatk.queue.function.scattergather.GatherFunction /** * Merges BAM files using htsjdk.samtools.MergeSamFiles. diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/BaseRecalibrator.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala similarity index 98% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/BaseRecalibrator.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala index 828beeb64ea6396437fd195dbdb4718e2417f24a..7e5bbfd3ff4645d11bd2ee173005c713bf0f2458 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/BaseRecalibrator.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala @@ -1,11 +1,10 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File -import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.queue.extensions.gatk.{ TaggedFile } -import nl.lumc.sasc.biopet.core.ScatterGatherableFunction import nl.lumc.sasc.biopet.utils.VcfUtils +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ } //TODO: check gathering diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala index ceceed5f64ba51a75b098e4cd1b18beaa4f1894d..4d712a8407abb8f09b8e6e7fdcceaba8d11d2bb4 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala @@ -1,58 +1,56 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. - */ package nl.lumc.sasc.biopet.extensions.gatk import java.io.File -import nl.lumc.sasc.biopet.core.{ Reference, BiopetJavaCommandLineFunction } +import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.utils.commandline.{ Input, Output } +import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Input, Output } -class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction with Reference { +class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction { + analysisName = "CatVariants" + javaMainClass = "org.broadinstitute.gatk.tools.CatVariants" - javaMainClass = classOf[org.broadinstitute.gatk.tools.CatVariants].getName + /** genome reference file <name>.fasta */ + @Input(fullName = "reference", shortName = "R", doc = "genome reference file <name>.fasta", required = true, exclusiveOf = "", validation = "") + var reference: File = _ - @Input(required = true) - var inputFiles: List[File] = Nil + /** Input VCF file/s */ + @Input(fullName = "variant", shortName = "V", doc = "Input VCF file/s", required = true, exclusiveOf = "", validation = "") + var variant: Seq[File] = Nil - @Output(required = true) - var outputFile: File = null + /** output file */ + @Output(fullName = "outputFile", shortName = "out", doc = "output file", required = true, exclusiveOf = "", validation = "") + @Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction]) + var outputFile: File = _ - @Input - var reference: File = null + /** assumeSorted should be true if the input files are already sorted (based on the position of the variants) */ + @Argument(fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if the input files are already sorted (based on the position of the variants)", required = false, exclusiveOf = "", validation = "") + var assumeSorted: Boolean = _ - var assumeSorted = false + /** which type of IndexCreator to use for VCF/BCF indices */ + @Argument(fullName = "variant_index_type", shortName = "", doc = "which type of IndexCreator to use for VCF/BCF indices", required = false, exclusiveOf = "", validation = "") + var variant_index_type: Option[String] = None - override def beforeGraph(): Unit = { - super.beforeGraph() - if (reference == null) reference = referenceFasta() - } + /** the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator */ + @Argument(fullName = "variant_index_parameter", shortName = "", doc = "the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator", required = false, exclusiveOf = "", validation = "") + var variant_index_parameter: Option[Int] = None + + /** Set the minimum level of logging */ + @Argument(fullName = "logging_level", shortName = "l", doc = "Set the minimum level of logging", required = false, exclusiveOf = "", validation = "") + var logging_level: String = _ + + /** Set the logging location */ + @Output(fullName = "log_to_file", shortName = "log", doc = "Set the logging location", required = false, exclusiveOf = "", validation = "") + @Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction]) + var log_to_file: File = _ override def cmdLine = super.cmdLine + - repeat("-V", inputFiles) + - required("-out", outputFile) + - required("-R", reference) + - conditional(assumeSorted, "--assumeSorted") + required("-R", reference, spaceSeparated = true, escape = true, format = "%s") + + repeat("-V", variant, spaceSeparated = true, escape = true, format = "%s") + + required("-out", outputFile, spaceSeparated = true, escape = true, format = "%s") + + conditional(assumeSorted, "-assumeSorted", escape = true, format = "%s") + + optional("--variant_index_type", variant_index_type, spaceSeparated = true, escape = true, format = "%s") + + optional("--variant_index_parameter", variant_index_parameter, spaceSeparated = true, escape = true, format = "%s") + + optional("-l", logging_level, spaceSeparated = true, escape = true, format = "%s") + + optional("-log", log_to_file, spaceSeparated = true, escape = true, format = "%s") } - -object CatVariants { - def apply(root: Configurable, input: List[File], output: File): CatVariants = { - val cv = new CatVariants(root) - cv.inputFiles = input - cv.outputFile = output - cv - } -} \ No newline at end of file diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CatVariantsGather.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariantsGather.scala similarity index 96% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CatVariantsGather.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariantsGather.scala index e97d6affbf023455122c49b3df59900211df0bea..27c6cb7cadb59997c7dd0c2039e1f53c74f4f8e7 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CatVariantsGather.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariantsGather.scala @@ -1,4 +1,4 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CombineGVCFs.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CombineGVCFs.scala similarity index 99% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CombineGVCFs.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CombineGVCFs.scala index e20331fe1b1c670acff8d1d1ca743585a8a0735f..ed0065e589e17000a4e4fb9742d4bca71f6622a1 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CombineGVCFs.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CombineGVCFs.scala @@ -1,11 +1,11 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File -import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import nl.lumc.sasc.biopet.core.ScatterGatherableFunction import nl.lumc.sasc.biopet.utils.VcfUtils +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import org.broadinstitute.gatk.utils.commandline.{ Gather, Input, Output, _ } class CombineGVCFs(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CombineVariants.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CombineVariants.scala index 343e2d769656dd6800d4cd552f51aa25cec7d28c..ba1740d3518e1ea6dbc17a9c60b0d6aa59abc9df 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CombineVariants.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CombineVariants.scala @@ -1,80 +1,128 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. - */ package nl.lumc.sasc.biopet.extensions.gatk import java.io.File +import nl.lumc.sasc.biopet.core.ScatterGatherableFunction import nl.lumc.sasc.biopet.utils.VcfUtils import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.utils.commandline.{ Gather, Input, Output } - -/** - * Extension for CombineVariants from GATK - * - * Created by pjvan_thof on 2/26/15. - * - * @deprecated - */ -class CombineVariants(val root: Configurable) extends Gatk { - val analysisType = "CombineVariants" - - @Input(doc = "", required = true) - var inputFiles: List[File] = Nil - - @Output(doc = "", required = true) - var outputFile: File = null - - var setKey: String = null - var rodPriorityList: String = null - var minimumN: Int = config("minimumN", default = 1) - var genotypeMergeOptions: Option[String] = config("genotypeMergeOptions") - var excludeNonVariants: Boolean = false - - var inputMap: Map[File, String] = Map() - - def addInput(file: File, name: String): Unit = { - inputFiles :+= file - inputMap += file -> name - } +import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile +import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ } + +class CombineVariants(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { + def analysis_type = "CombineVariants" + scatterClass = classOf[LocusScatterFunction] + setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = false } + + /** VCF files to merge together */ + @Input(fullName = "variant", shortName = "V", doc = "VCF files to merge together", required = true, exclusiveOf = "", validation = "") + var variant: Seq[File] = Nil + + /** File to which variants should be written */ + @Output(fullName = "out", shortName = "o", doc = "File to which variants should be written", required = false, exclusiveOf = "", validation = "") + @Gather(classOf[CatVariantsGatherer]) + var out: File = _ + + /** Determines how we should merge genotype records for samples shared across the ROD files */ + @Argument(fullName = "genotypemergeoption", shortName = "genotypeMergeOptions", doc = "Determines how we should merge genotype records for samples shared across the ROD files", required = false, exclusiveOf = "", validation = "") + var genotypemergeoption: Option[String] = config("genotypemergeoption") + + /** Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields */ + @Argument(fullName = "filteredrecordsmergetype", shortName = "filteredRecordsMergeType", doc = "Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required = false, exclusiveOf = "", validation = "") + var filteredrecordsmergetype: Option[String] = config("filteredrecordsmergetype") + + /** Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel) */ + @Argument(fullName = "multipleallelesmergetype", shortName = "multipleAllelesMergeType", doc = "Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required = false, exclusiveOf = "", validation = "") + var multipleallelesmergetype: Option[String] = config("multipleallelesmergetype") + + /** Ordered list specifying priority for merging */ + @Argument(fullName = "rod_priority_list", shortName = "priority", doc = "Ordered list specifying priority for merging", required = false, exclusiveOf = "", validation = "") + var rod_priority_list: Option[String] = config("rod_priority_list") + + /** Emit interesting sites requiring complex compatibility merging to file */ + @Argument(fullName = "printComplexMerges", shortName = "printComplexMerges", doc = "Emit interesting sites requiring complex compatibility merging to file", required = false, exclusiveOf = "", validation = "") + var printComplexMerges: Boolean = config("printComplexMerges", default = false) + + /** Treat filtered variants as uncalled */ + @Argument(fullName = "filteredAreUncalled", shortName = "filteredAreUncalled", doc = "Treat filtered variants as uncalled", required = false, exclusiveOf = "", validation = "") + var filteredAreUncalled: Boolean = config("filteredAreUncalled", default = false) + + /** Emit a sites-only file */ + @Argument(fullName = "minimalVCF", shortName = "minimalVCF", doc = "Emit a sites-only file", required = false, exclusiveOf = "", validation = "") + var minimalVCF: Boolean = config("minimalVCF", default = false) + + /** Exclude sites where no variation is present after merging */ + @Argument(fullName = "excludeNonVariants", shortName = "env", doc = "Exclude sites where no variation is present after merging", required = false, exclusiveOf = "", validation = "") + var excludeNonVariants: Boolean = config("excludeNonVariants", default = false) + + /** Key name for the set attribute */ + @Argument(fullName = "setKey", shortName = "setKey", doc = "Key name for the set attribute", required = false, exclusiveOf = "", validation = "") + var setKey: Option[String] = config("set_key") + + /** Assume input VCFs have identical sample sets and disjoint calls */ + @Argument(fullName = "assumeIdenticalSamples", shortName = "assumeIdenticalSamples", doc = "Assume input VCFs have identical sample sets and disjoint calls", required = false, exclusiveOf = "", validation = "") + var assumeIdenticalSamples: Boolean = config("assumeIdenticalSamples", default = false) + + /** Minimum number of input files the site must be observed in to be included */ + @Argument(fullName = "minimumN", shortName = "minN", doc = "Minimum number of input files the site must be observed in to be included", required = false, exclusiveOf = "", validation = "") + var minimumN: Option[Int] = config("minimumN") + + /** Do not output the command line to the header */ + @Argument(fullName = "suppressCommandLineHeader", shortName = "suppressCommandLineHeader", doc = "Do not output the command line to the header", required = false, exclusiveOf = "", validation = "") + var suppressCommandLineHeader: Boolean = config("suppressCommandLineHeader", default = false) + + /** Use the INFO content of the record with the highest AC */ + @Argument(fullName = "mergeInfoWithMaxAC", shortName = "mergeInfoWithMaxAC", doc = "Use the INFO content of the record with the highest AC", required = false, exclusiveOf = "", validation = "") + var mergeInfoWithMaxAC: Boolean = config("mergeInfoWithMaxAC", default = false) + + /** Filter out reads with CIGAR containing the N operator, instead of failing with an error */ + @Argument(fullName = "filter_reads_with_N_cigar", shortName = "filterRNC", doc = "Filter out reads with CIGAR containing the N operator, instead of failing with an error", required = false, exclusiveOf = "", validation = "") + var filter_reads_with_N_cigar: Boolean = config("filter_reads_with_N_cigar", default = false) + + /** Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error */ + @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error", required = false, exclusiveOf = "", validation = "") + var filter_mismatching_base_and_quals: Boolean = config("filter_mismatching_base_and_quals", default = false) + + /** Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error */ + @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error", required = false, exclusiveOf = "", validation = "") + var filter_bases_not_stored: Boolean = config("filter_bases_not_stored", default = false) @Output @Gather(enabled = false) private var outputIndex: File = _ - override def beforeGraph(): Unit = { + override def beforeGraph() { super.beforeGraph() - outputIndex = VcfUtils.getVcfIndexFile(outputFile) - genotypeMergeOptions match { - case Some("UNIQUIFY") | Some("PRIORITIZE") | Some("UNSORTED") | Some("REQUIRE_UNIQUE") | None => - case _ => throw new IllegalArgumentException("Wrong option for genotypeMergeOptions") - } - deps :::= inputFiles.filter(_.getName.endsWith("vcf.gz")).map(x => new File(x.getAbsolutePath + ".tbi")) - deps = deps.distinct + deps ++= variant.filter(orig => orig != null && (!orig.getName.endsWith(".list"))).map(orig => VcfUtils.getVcfIndexFile(orig)) + if (out != null && !org.broadinstitute.gatk.utils.io.IOUtils.isSpecialFile(out)) + outputIndex = VcfUtils.getVcfIndexFile(out) } override def cmdLine = super.cmdLine + - (for (file <- inputFiles) yield { - inputMap.get(file) match { - case Some(name) => required("-V:" + name, file) - case _ => required("-V", file) - } - }).mkString + - required("-o", outputFile) + - optional("--setKey", setKey) + - optional("--rod_priority_list", rodPriorityList) + - optional("-genotypeMergeOptions", genotypeMergeOptions) + - conditional(excludeNonVariants, "--excludeNonVariants") + repeat("-V", variant, formatPrefix = TaggedFile.formatCommandLineParameter, spaceSeparated = true, escape = true, format = "%s") + + optional("-o", out, spaceSeparated = true, escape = true, format = "%s") + + optional("-genotypeMergeOptions", genotypemergeoption, spaceSeparated = true, escape = true, format = "%s") + + optional("-filteredRecordsMergeType", filteredrecordsmergetype, spaceSeparated = true, escape = true, format = "%s") + + optional("-multipleAllelesMergeType", multipleallelesmergetype, spaceSeparated = true, escape = true, format = "%s") + + optional("-priority", rod_priority_list, spaceSeparated = true, escape = true, format = "%s") + + conditional(printComplexMerges, "-printComplexMerges", escape = true, format = "%s") + + conditional(filteredAreUncalled, "-filteredAreUncalled", escape = true, format = "%s") + + conditional(minimalVCF, "-minimalVCF", escape = true, format = "%s") + + conditional(excludeNonVariants, "-env", escape = true, format = "%s") + + optional("-setKey", setKey, spaceSeparated = true, escape = true, format = "%s") + + conditional(assumeIdenticalSamples, "-assumeIdenticalSamples", escape = true, format = "%s") + + optional("-minN", minimumN, spaceSeparated = true, escape = true, format = "%s") + + conditional(suppressCommandLineHeader, "-suppressCommandLineHeader", escape = true, format = "%s") + + conditional(mergeInfoWithMaxAC, "-mergeInfoWithMaxAC", escape = true, format = "%s") + + conditional(filter_reads_with_N_cigar, "-filterRNC", escape = true, format = "%s") + + conditional(filter_mismatching_base_and_quals, "-filterMBQ", escape = true, format = "%s") + + conditional(filter_bases_not_stored, "-filterNoBases", escape = true, format = "%s") +} + +object CombineVariants { + def apply(root: Configurable, input: List[File], output: File): CombineVariants = { + val cv = new CombineVariants(root) + cv.variant = input + cv.out = output + cv + } } diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CommandLineGATK.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CommandLineGATK.scala similarity index 99% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CommandLineGATK.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CommandLineGATK.scala index 3e8091437d7bfe377748a827fef43655cf5b65b4..7fbba210ce42d99573c3dfd127a45ab9346f438a 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CommandLineGATK.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CommandLineGATK.scala @@ -1,4 +1,4 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/ContigScatterFunction.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ContigScatterFunction.scala similarity index 88% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/ContigScatterFunction.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ContigScatterFunction.scala index 12350d3ad5d891e410f15dc90662645c3ceb2319..abfc807c26b7a88623b4b3ec649b010cb3d5d526 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/ContigScatterFunction.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ContigScatterFunction.scala @@ -1,8 +1,9 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk -import collection.JavaConversions._ -import org.broadinstitute.gatk.utils.interval.IntervalUtils import org.broadinstitute.gatk.queue.function.InProcessFunction +import org.broadinstitute.gatk.utils.interval.IntervalUtils + +import scala.collection.JavaConversions._ /** * Splits intervals by contig instead of evenly. diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/Gatk.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/Gatk.scala deleted file mode 100644 index 92ca40e02d94e4935f3f1c031cf1371e4b77b8a2..0000000000000000000000000000000000000000 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/Gatk.scala +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. - */ -package nl.lumc.sasc.biopet.extensions.gatk - -import java.io.File - -import nl.lumc.sasc.biopet.core.{ Version, BiopetJavaCommandLineFunction, Reference } -import org.broadinstitute.gatk.utils.commandline.Input - -/** - * General extension for GATK module - * - * Created by pjvan_thof on 2/26/15. - * - * @deprecated - */ -abstract class Gatk extends BiopetJavaCommandLineFunction with Reference with Version { - override def subPath = "gatk" :: super.subPath - - jarFile = config("gatk_jar") - - val analysisType: String - - override def defaultCoreMemory = 3.0 - - @Input(required = true) - var reference: File = null - - @Input(required = false) - var gatkKey: Option[File] = config("gatk_key") - - @Input(required = false) - var intervals: List[File] = config("intervals", default = Nil) - - @Input(required = false) - var excludeIntervals: List[File] = config("exclude_intervals", default = Nil) - - @Input(required = false) - var pedigree: List[File] = config("pedigree", default = Nil) - - var et: Option[String] = config("et") - - def versionRegex = """(.*)""".r - override def versionExitcode = List(0, 1) - def versionCommand = executable + " -jar " + jarFile + " -version" - - override def getVersion = super.getVersion.collect { case version => "Gatk " + version } - override def dictRequired = true - - override def beforeGraph(): Unit = { - super.beforeGraph() - if (reference == null) reference = referenceFasta() - } - - override def cmdLine = super.cmdLine + - required("-T", analysisType) + - required("-R", reference) + - optional("-K", gatkKey) + - optional("-et", et) + - repeat("-L", intervals) + - repeat("-XL", excludeIntervals) + - repeat("-ped", pedigree) -} \ No newline at end of file diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkScatterFunction.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GatkScatterFunction.scala similarity index 97% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkScatterFunction.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GatkScatterFunction.scala index f2399b946ca51488bc102b90c9b96f4368f91147..d49b4d34372dfebdd5eafb6469d643e3067a8d4b 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkScatterFunction.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GatkScatterFunction.scala @@ -1,12 +1,12 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk -import org.broadinstitute.gatk.utils.interval.IntervalUtils import java.io.File import org.broadinstitute.gatk.queue.extensions.gatk.GATKIntervals -import org.broadinstitute.gatk.utils.io.IOUtils import org.broadinstitute.gatk.queue.function.scattergather.{ CloneFunction, ScatterFunction } -import org.broadinstitute.gatk.utils.commandline.{ Output, _ } +import org.broadinstitute.gatk.utils.commandline.Output +import org.broadinstitute.gatk.utils.interval.IntervalUtils +import org.broadinstitute.gatk.utils.io.IOUtils trait GATKScatterFunction extends ScatterFunction { /* The runtime field to set for specifying intervals. */ diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GenotypeConcordance.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GenotypeConcordance.scala index 62d2457de0431b8c28ddb3e2eae58352a4b39b23..aea609fb98b311748e98cd044cf0b454a88bfe94 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GenotypeConcordance.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GenotypeConcordance.scala @@ -1,52 +1,70 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. - */ package nl.lumc.sasc.biopet.extensions.gatk import java.io.File +import nl.lumc.sasc.biopet.core.ScatterGatherableFunction import nl.lumc.sasc.biopet.core.summary.Summarizable +import nl.lumc.sasc.biopet.utils.VcfUtils import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.utils.commandline.{ Input, Output } -import org.broadinstitute.gatk.utils.report.{ GATKReportTable, GATKReport } +import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile +import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ } +import org.broadinstitute.gatk.utils.report.{ GATKReport, GATKReportTable } -/** - * Extension for CombineVariants from GATK - * - * Created by pjvan_thof on 2/26/15. - * - * @deprecated - */ -class GenotypeConcordance(val root: Configurable) extends Gatk with Summarizable { - val analysisType = "GenotypeConcordance" +class GenotypeConcordance(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction with Summarizable { + analysisName = "GenotypeConcordance" + val analysis_type = "GenotypeConcordance" + scatterClass = classOf[LocusScatterFunction] + setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = false } - @Input(required = true) - var evalFile: File = null + /** The variants and genotypes to evaluate */ + @Input(fullName = "eval", shortName = "eval", doc = "The variants and genotypes to evaluate", required = true, exclusiveOf = "", validation = "") + var eval: File = _ - @Input(required = true) - var compFile: File = null + /** The variants and genotypes to compare against */ + @Input(fullName = "comp", shortName = "comp", doc = "The variants and genotypes to compare against", required = true, exclusiveOf = "", validation = "") + var comp: File = _ - @Output(required = true) - var outputFile: File = null + /** Filters will be ignored */ + @Argument(fullName = "ignoreFilters", shortName = "", doc = "Filters will be ignored", required = false, exclusiveOf = "", validation = "") + var ignoreFilters: Boolean = config("ignoreFilters", default = false) - var moltenize = true + /** One or more criteria to use to set EVAL genotypes to no-call. These genotype-level filters are only applied to the EVAL rod. */ + @Argument(fullName = "genotypeFilterExpressionEval", shortName = "gfe", doc = "One or more criteria to use to set EVAL genotypes to no-call. These genotype-level filters are only applied to the EVAL rod.", required = false, exclusiveOf = "", validation = "") + var genotypeFilterExpressionEval: List[String] = config("genotypeFilterExpressionEval", default = Nil) - def summaryFiles = Map("output" -> outputFile) + /** One or more criteria to use to set COMP genotypes to no-call. These genotype-level filters are only applied to the COMP rod. */ + @Argument(fullName = "genotypeFilterExpressionComp", shortName = "gfc", doc = "One or more criteria to use to set COMP genotypes to no-call. These genotype-level filters are only applied to the COMP rod.", required = false, exclusiveOf = "", validation = "") + var genotypeFilterExpressionComp: Seq[String] = config("genotypeFilterExpressionComp", default = Nil) + + /** Molten rather than tabular output */ + @Argument(fullName = "moltenize", shortName = "moltenize", doc = "Molten rather than tabular output", required = false, exclusiveOf = "", validation = "") + var moltenize: Boolean = config("moltenize", default = true) + + /** File to output the discordant sites and genotypes. */ + @Output(fullName = "printInterestingSites", shortName = "sites", doc = "File to output the discordant sites and genotypes.", required = false, exclusiveOf = "", validation = "") + var printInterestingSites: Option[File] = None + + /** An output file created by the walker. Will overwrite contents if file exists */ + @Output(fullName = "out", shortName = "o", doc = "An output file created by the walker. Will overwrite contents if file exists", required = false, exclusiveOf = "", validation = "") + @Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction]) + var out: File = _ + + /** Filter out reads with CIGAR containing the N operator, instead of failing with an error */ + @Argument(fullName = "filter_reads_with_N_cigar", shortName = "filterRNC", doc = "Filter out reads with CIGAR containing the N operator, instead of failing with an error", required = false, exclusiveOf = "", validation = "") + var filter_reads_with_N_cigar: Boolean = config("filter_reads_with_N_cigar", default = false) + + /** Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error */ + @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error", required = false, exclusiveOf = "", validation = "") + var filter_mismatching_base_and_quals: Boolean = config("filter_mismatching_base_and_quals", default = false) + + /** Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error */ + @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error", required = false, exclusiveOf = "", validation = "") + var filter_bases_not_stored: Boolean = config("filter_bases_not_stored", default = false) + + def summaryFiles = Map("output" -> out) def summaryStats = { - val report = new GATKReport(outputFile) + val report = new GATKReport(out) val compProportions = report.getTable("GenotypeConcordance_CompProportions") val counts = report.getTable("GenotypeConcordance_Counts") val evalProportions = report.getTable("GenotypeConcordance_EvalProportions") @@ -82,15 +100,22 @@ class GenotypeConcordance(val root: Configurable) extends Gatk with Summarizable ) } - override def beforeGraph(): Unit = { + override def beforeGraph() { super.beforeGraph() - deps :::= (evalFile :: compFile :: Nil).filter(_.getName.endsWith("vcf.gz")).map(x => new File(x.getAbsolutePath + ".tbi")) - deps = deps.distinct + if (eval != null) deps :+= VcfUtils.getVcfIndexFile(eval) + if (comp != null) deps :+= VcfUtils.getVcfIndexFile(comp) } override def cmdLine = super.cmdLine + - required("--eval", evalFile) + - required("--comp", compFile) + - required("-o", outputFile) + - conditional(moltenize, "--moltenize") + required(TaggedFile.formatCommandLineParameter("-eval", eval), eval, spaceSeparated = true, escape = true, format = "%s") + + required(TaggedFile.formatCommandLineParameter("-comp", comp), comp, spaceSeparated = true, escape = true, format = "%s") + + conditional(ignoreFilters, "--ignoreFilters", escape = true, format = "%s") + + repeat("-gfe", genotypeFilterExpressionEval, spaceSeparated = true, escape = true, format = "%s") + + repeat("-gfc", genotypeFilterExpressionComp, spaceSeparated = true, escape = true, format = "%s") + + conditional(moltenize, "-moltenize", escape = true, format = "%s") + + optional("-sites", printInterestingSites, spaceSeparated = true, escape = true, format = "%s") + + optional("-o", out, spaceSeparated = true, escape = true, format = "%s") + + conditional(filter_reads_with_N_cigar, "-filterRNC", escape = true, format = "%s") + + conditional(filter_mismatching_base_and_quals, "-filterMBQ", escape = true, format = "%s") + + conditional(filter_bases_not_stored, "-filterNoBases", escape = true, format = "%s") } diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GenotypeGVCFs.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GenotypeGVCFs.scala similarity index 99% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GenotypeGVCFs.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GenotypeGVCFs.scala index b1a54e34f4b9a079fef323110c2b61b7d9c5ad25..650340d63c1be0ba9195609a616ddddb4abee8ef 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GenotypeGVCFs.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GenotypeGVCFs.scala @@ -1,11 +1,11 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File -import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import nl.lumc.sasc.biopet.core.ScatterGatherableFunction import nl.lumc.sasc.biopet.utils.VcfUtils +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ } class GenotypeGVCFs(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/HaplotypeCaller.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala similarity index 99% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/HaplotypeCaller.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala index 5ffdcb306b32bcc06152fd3e1a7457715443cee9..9eac2ba9e3d4974e7b479addcb3d0f1dadf5ef56 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/HaplotypeCaller.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala @@ -1,10 +1,10 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File -import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.core.ScatterGatherableFunction import nl.lumc.sasc.biopet.utils.VcfUtils +import nl.lumc.sasc.biopet.utils.config.Configurable import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Input, Output } diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/IndelRealigner.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/IndelRealigner.scala similarity index 99% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/IndelRealigner.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/IndelRealigner.scala index 34b7c58f9af92fdaee68bb710fc800779984e953..7d16d832892a0daaffd999f549f5fd00f327b51c 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/IndelRealigner.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/IndelRealigner.scala @@ -1,11 +1,11 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File -import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import nl.lumc.sasc.biopet.core.ScatterGatherableFunction import nl.lumc.sasc.biopet.utils.VcfUtils +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ } class IndelRealigner(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/LocusScatterFunction.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/LocusScatterFunction.scala similarity index 87% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/LocusScatterFunction.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/LocusScatterFunction.scala index 853d185b37d7cb1838b2c3f2928a91cdb8b82548..6a3b961ecbfb034465909714305e469af63a816a 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/LocusScatterFunction.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/LocusScatterFunction.scala @@ -1,8 +1,9 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk -import collection.JavaConversions._ -import org.broadinstitute.gatk.utils.interval.IntervalUtils import org.broadinstitute.gatk.queue.function.InProcessFunction +import org.broadinstitute.gatk.utils.interval.IntervalUtils + +import scala.collection.JavaConversions._ /** * A scatter function that divides down to the locus level. diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/PrintReads.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/PrintReads.scala similarity index 99% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/PrintReads.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/PrintReads.scala index 9f18533cf8f78dcf9ebc8f3749112db9e6d6bd4f..6eaca11891f0d5a559cd3723f5ee8852800f9260 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/PrintReads.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/PrintReads.scala @@ -1,9 +1,9 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File -import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.core.ScatterGatherableFunction +import nl.lumc.sasc.biopet.utils.config.Configurable import org.broadinstitute.gatk.utils.commandline._ class PrintReads(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/RealignerTargetCreator.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/RealignerTargetCreator.scala similarity index 97% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/RealignerTargetCreator.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/RealignerTargetCreator.scala index 383e74fbf0e4055c2fb5ec296b5a4ccaf72ead39..74ce632bd78dba9de535c3f3b4f82fb481976410 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/RealignerTargetCreator.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/RealignerTargetCreator.scala @@ -1,11 +1,11 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File -import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.queue.extensions.gatk.{ TaggedFile } import nl.lumc.sasc.biopet.core.ScatterGatherableFunction import nl.lumc.sasc.biopet.utils.VcfUtils +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Input, _ } class RealignerTargetCreator(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/SelectVariants.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/SelectVariants.scala index d98abff1485b59cd0424eff47d03b0d1dbfe585d..a1ed7b732f9b72d1660c9ea8c1995e5fc0137a68 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/SelectVariants.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/SelectVariants.scala @@ -1,69 +1,262 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. - */ package nl.lumc.sasc.biopet.extensions.gatk import java.io.File +import nl.lumc.sasc.biopet.core.ScatterGatherableFunction import nl.lumc.sasc.biopet.utils.VcfUtils import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.utils.commandline.{ Gather, Input, Output } +import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile +import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ } -/** - * Extension for CombineVariants from GATK - * - * Created by pjvan_thof on 2/26/15. - * - * @deprecated - */ -class SelectVariants(val root: Configurable) extends Gatk { - val analysisType = "SelectVariants" +class SelectVariants(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { + def analysis_type = "SelectVariants" + scatterClass = classOf[LocusScatterFunction] + setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = false } - @Input(doc = "", required = true) - var inputFiles: List[File] = Nil + /** Input VCF file */ + @Input(fullName = "variant", shortName = "V", doc = "Input VCF file", required = true, exclusiveOf = "", validation = "") + var variant: File = _ - @Output(doc = "", required = true) - var outputFile: File = null + /** Output variants not called in this comparison track */ + @Input(fullName = "discordance", shortName = "disc", doc = "Output variants not called in this comparison track", required = false, exclusiveOf = "", validation = "") + var discordance: Option[File] = None - var excludeNonVariants: Boolean = false + /** Output variants also called in this comparison track */ + @Input(fullName = "concordance", shortName = "conc", doc = "Output variants also called in this comparison track", required = false, exclusiveOf = "", validation = "") + var concordance: Option[File] = None - var inputMap: Map[File, String] = Map() + /** File to which variants should be written */ + @Output(fullName = "out", shortName = "o", doc = "File to which variants should be written", required = false, exclusiveOf = "", validation = "") + @Gather(classOf[CatVariantsGatherer]) + var out: File = _ - def addInput(file: File, name: String): Unit = { - inputFiles :+= file - inputMap += file -> name - } + /** Include genotypes from this sample */ + @Argument(fullName = "sample_name", shortName = "sn", doc = "Include genotypes from this sample", required = false, exclusiveOf = "", validation = "") + var sample_name: List[String] = config("sample_name", default = Nil) + + /** Regular expression to select multiple samples */ + @Argument(fullName = "sample_expressions", shortName = "se", doc = "Regular expression to select multiple samples", required = false, exclusiveOf = "", validation = "") + var sample_expressions: List[String] = config("sample_expressions", default = Nil) + + /** File containing a list of samples to include */ + @Input(fullName = "sample_file", shortName = "sf", doc = "File containing a list of samples to include", required = false, exclusiveOf = "", validation = "") + var sample_file: List[File] = config("sample_file", default = Nil) + + /** Exclude genotypes from this sample */ + @Argument(fullName = "exclude_sample_name", shortName = "xl_sn", doc = "Exclude genotypes from this sample", required = false, exclusiveOf = "", validation = "") + var exclude_sample_name: List[String] = config("exclude_sample_name", default = Nil) + + /** List of samples to exclude */ + @Input(fullName = "exclude_sample_file", shortName = "xl_sf", doc = "List of samples to exclude", required = false, exclusiveOf = "", validation = "") + var exclude_sample_file: List[File] = config("exclude_sample_file", default = Nil) + + /** List of sample expressions to exclude */ + @Input(fullName = "exclude_sample_expressions", shortName = "xl_se", doc = "List of sample expressions to exclude", required = false, exclusiveOf = "", validation = "") + var exclude_sample_expressions: List[File] = config("exclude_sample_expressions", default = Nil) + + /** One or more criteria to use when selecting the data */ + @Argument(fullName = "selectexpressions", shortName = "select", doc = "One or more criteria to use when selecting the data", required = false, exclusiveOf = "", validation = "") + var selectexpressions: List[String] = config("selectexpressions", default = Nil) + + /** Invert the selection criteria for -select */ + @Argument(fullName = "invertselect", shortName = "invertSelect", doc = "Invert the selection criteria for -select", required = false, exclusiveOf = "", validation = "") + var invertselect: Boolean = config("invertselect", default = false) + + /** Don't include non-variant sites */ + @Argument(fullName = "excludeNonVariants", shortName = "env", doc = "Don't include non-variant sites", required = false, exclusiveOf = "", validation = "") + var excludeNonVariants: Boolean = config("excludeNonVariants", default = false) + + /** Don't include filtered sites */ + @Argument(fullName = "excludeFiltered", shortName = "ef", doc = "Don't include filtered sites", required = false, exclusiveOf = "", validation = "") + var excludeFiltered: Boolean = config("excludeFiltered", default = false) + + /** Preserve original alleles, do not trim */ + @Argument(fullName = "preserveAlleles", shortName = "noTrim", doc = "Preserve original alleles, do not trim", required = false, exclusiveOf = "", validation = "") + var preserveAlleles: Boolean = config("preserveAlleles", default = false) + + /** Remove alternate alleles not present in any genotypes */ + @Argument(fullName = "removeUnusedAlternates", shortName = "trimAlternates", doc = "Remove alternate alleles not present in any genotypes", required = false, exclusiveOf = "", validation = "") + var removeUnusedAlternates: Boolean = config("removeUnusedAlternates", default = false) + + /** Select only variants of a particular allelicity */ + @Argument(fullName = "restrictAllelesTo", shortName = "restrictAllelesTo", doc = "Select only variants of a particular allelicity", required = false, exclusiveOf = "", validation = "") + var restrictAllelesTo: Option[String] = config("restrictAllelesTo") + + /** Store the original AC, AF, and AN values after subsetting */ + @Argument(fullName = "keepOriginalAC", shortName = "keepOriginalAC", doc = "Store the original AC, AF, and AN values after subsetting", required = false, exclusiveOf = "", validation = "") + var keepOriginalAC: Boolean = config("keepOriginalAC", default = false) + + /** Store the original DP value after subsetting */ + @Argument(fullName = "keepOriginalDP", shortName = "keepOriginalDP", doc = "Store the original DP value after subsetting", required = false, exclusiveOf = "", validation = "") + var keepOriginalDP: Boolean = config("keepOriginalDP", default = false) + + /** Output mendelian violation sites only */ + @Argument(fullName = "mendelianViolation", shortName = "mv", doc = "Output mendelian violation sites only", required = false, exclusiveOf = "", validation = "") + var mendelianViolation: Boolean = config("mendelianViolation", default = false) + + /** Output non-mendelian violation sites only */ + @Argument(fullName = "invertMendelianViolation", shortName = "invMv", doc = "Output non-mendelian violation sites only", required = false, exclusiveOf = "", validation = "") + var invertMendelianViolation: Boolean = config("invertMendelianViolation", default = false) + + /** Minimum GQ score for each trio member to accept a site as a violation */ + @Argument(fullName = "mendelianViolationQualThreshold", shortName = "mvq", doc = "Minimum GQ score for each trio member to accept a site as a violation", required = false, exclusiveOf = "", validation = "") + var mendelianViolationQualThreshold: Option[Double] = config("mendelianViolationQualThreshold") + + /** Format string for mendelianViolationQualThreshold */ + @Argument(fullName = "mendelianViolationQualThresholdFormat", shortName = "", doc = "Format string for mendelianViolationQualThreshold", required = false, exclusiveOf = "", validation = "") + var mendelianViolationQualThresholdFormat: String = "%s" + + /** Select a fraction of variants at random from the input */ + @Argument(fullName = "select_random_fraction", shortName = "fraction", doc = "Select a fraction of variants at random from the input", required = false, exclusiveOf = "", validation = "") + var select_random_fraction: Option[Double] = config("select_random_fraction") + + /** Format string for select_random_fraction */ + @Argument(fullName = "select_random_fractionFormat", shortName = "", doc = "Format string for select_random_fraction", required = false, exclusiveOf = "", validation = "") + var select_random_fractionFormat: String = "%s" + + /** Select a fraction of genotypes at random from the input and sets them to no-call */ + @Argument(fullName = "remove_fraction_genotypes", shortName = "fractionGenotypes", doc = "Select a fraction of genotypes at random from the input and sets them to no-call", required = false, exclusiveOf = "", validation = "") + var remove_fraction_genotypes: Option[Double] = config("remove_fraction_genotypes") + + /** Format string for remove_fraction_genotypes */ + @Argument(fullName = "remove_fraction_genotypesFormat", shortName = "", doc = "Format string for remove_fraction_genotypes", required = false, exclusiveOf = "", validation = "") + var remove_fraction_genotypesFormat: String = "%s" + + /** Select only a certain type of variants from the input file */ + @Argument(fullName = "selectTypeToInclude", shortName = "selectType", doc = "Select only a certain type of variants from the input file", required = false, exclusiveOf = "", validation = "") + var selectTypeToInclude: List[String] = config("selectTypeToInclude", default = Nil) + + /** Do not select certain type of variants from the input file */ + @Argument(fullName = "selectTypeToExclude", shortName = "xlSelectType", doc = "Do not select certain type of variants from the input file", required = false, exclusiveOf = "", validation = "") + var selectTypeToExclude: Seq[String] = config("selectTypeToExclude", default = Nil) + + /** List of variant IDs to select */ + @Input(fullName = "keepIDs", shortName = "IDs", doc = "List of variant IDs to select", required = false, exclusiveOf = "", validation = "") + var keepIDs: Option[File] = config("keepIDs") + + /** List of variant IDs to select */ + @Argument(fullName = "excludeIDs", shortName = "xlIDs", doc = "List of variant IDs to select", required = false, exclusiveOf = "", validation = "") + var excludeIDs: Option[File] = config("excludeIDs") + + /** If true, the incoming VariantContext will be fully decoded */ + @Argument(fullName = "fullyDecode", shortName = "", doc = "If true, the incoming VariantContext will be fully decoded", required = false, exclusiveOf = "", validation = "") + var fullyDecode: Boolean = config("fullyDecode", default = false) + + /** If true, we won't actually write the output file. For efficiency testing only */ + @Argument(fullName = "justRead", shortName = "", doc = "If true, we won't actually write the output file. For efficiency testing only", required = false, exclusiveOf = "", validation = "") + var justRead: Boolean = config("justRead", default = false) + + /** Maximum size of indels to include */ + @Argument(fullName = "maxIndelSize", shortName = "", doc = "Maximum size of indels to include", required = false, exclusiveOf = "", validation = "") + var maxIndelSize: Option[Int] = config("maxIndelSize") + + /** Minimum size of indels to include */ + @Argument(fullName = "minIndelSize", shortName = "", doc = "Minimum size of indels to include", required = false, exclusiveOf = "", validation = "") + var minIndelSize: Option[Int] = config("minIndelSize") + + /** Maximum number of samples filtered at the genotype level */ + @Argument(fullName = "maxFilteredGenotypes", shortName = "", doc = "Maximum number of samples filtered at the genotype level", required = false, exclusiveOf = "", validation = "") + var maxFilteredGenotypes: Option[Int] = config("maxFilteredGenotypes") + + /** Minimum number of samples filtered at the genotype level */ + @Argument(fullName = "minFilteredGenotypes", shortName = "", doc = "Minimum number of samples filtered at the genotype level", required = false, exclusiveOf = "", validation = "") + var minFilteredGenotypes: Option[Int] = config("minFilteredGenotypes") + + /** Maximum fraction of samples filtered at the genotype level */ + @Argument(fullName = "maxFractionFilteredGenotypes", shortName = "", doc = "Maximum fraction of samples filtered at the genotype level", required = false, exclusiveOf = "", validation = "") + var maxFractionFilteredGenotypes: Option[Double] = config("maxFractionFilteredGenotypes") + + /** Format string for maxFractionFilteredGenotypes */ + @Argument(fullName = "maxFractionFilteredGenotypesFormat", shortName = "", doc = "Format string for maxFractionFilteredGenotypes", required = false, exclusiveOf = "", validation = "") + var maxFractionFilteredGenotypesFormat: String = "%s" + + /** Maximum fraction of samples filtered at the genotype level */ + @Argument(fullName = "minFractionFilteredGenotypes", shortName = "", doc = "Maximum fraction of samples filtered at the genotype level", required = false, exclusiveOf = "", validation = "") + var minFractionFilteredGenotypes: Option[Double] = config("minFractionFilteredGenotypes") + + /** Format string for minFractionFilteredGenotypes */ + @Argument(fullName = "minFractionFilteredGenotypesFormat", shortName = "", doc = "Format string for minFractionFilteredGenotypes", required = false, exclusiveOf = "", validation = "") + var minFractionFilteredGenotypesFormat: String = "%s" + + /** Set filtered genotypes to no-call */ + @Argument(fullName = "setFilteredGtToNocall", shortName = "", doc = "Set filtered genotypes to no-call", required = false, exclusiveOf = "", validation = "") + var setFilteredGtToNocall: Boolean = config("setFilteredGtToNocall", default = false) + + /** Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored. */ + @Argument(fullName = "ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", shortName = "", doc = "Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.", required = false, exclusiveOf = "", validation = "") + var ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES: Boolean = config("ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", default = false) + + /** Forces output VCF to be compliant to up-to-date version */ + @Argument(fullName = "forceValidOutput", shortName = "", doc = "Forces output VCF to be compliant to up-to-date version", required = false, exclusiveOf = "", validation = "") + var forceValidOutput: Boolean = config("forceValidOutput", default = false) + + /** Filter out reads with CIGAR containing the N operator, instead of failing with an error */ + @Argument(fullName = "filter_reads_with_N_cigar", shortName = "filterRNC", doc = "Filter out reads with CIGAR containing the N operator, instead of failing with an error", required = false, exclusiveOf = "", validation = "") + var filter_reads_with_N_cigar: Boolean = config("filter_reads_with_N_cigar", default = false) + + /** Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error */ + @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error", required = false, exclusiveOf = "", validation = "") + var filter_mismatching_base_and_quals: Boolean = config("filter_mismatching_base_and_quals", default = false) + + /** Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error */ + @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error", required = false, exclusiveOf = "", validation = "") + var filter_bases_not_stored: Boolean = config("filter_bases_not_stored", default = false) @Output @Gather(enabled = false) private var outputIndex: File = _ - override def beforeGraph(): Unit = { + override def beforeGraph() { super.beforeGraph() - outputIndex = VcfUtils.getVcfIndexFile(outputFile) - deps :::= inputFiles.filter(_.getName.endsWith("vcf.gz")).map(x => new File(x.getAbsolutePath + ".tbi")) - deps = deps.distinct + if (variant != null) + deps :+= VcfUtils.getVcfIndexFile(variant) + discordance.foreach(deps :+= VcfUtils.getVcfIndexFile(_)) + concordance.foreach(deps :+= VcfUtils.getVcfIndexFile(_)) + if (out != null && !org.broadinstitute.gatk.utils.io.IOUtils.isSpecialFile(out)) + outputIndex = VcfUtils.getVcfIndexFile(out) } override def cmdLine = super.cmdLine + - (for (file <- inputFiles) yield { - inputMap.get(file) match { - case Some(name) => required("-V:" + name, file) - case _ => required("-V", file) - } - }).mkString + - required("-o", outputFile) + - conditional(excludeNonVariants, "--excludeNonVariants") + required(TaggedFile.formatCommandLineParameter("-V", variant), variant, spaceSeparated = true, escape = true, format = "%s") + + optional(TaggedFile.formatCommandLineParameter("-disc", discordance), discordance, spaceSeparated = true, escape = true, format = "%s") + + optional(TaggedFile.formatCommandLineParameter("-conc", concordance), concordance, spaceSeparated = true, escape = true, format = "%s") + + optional("-o", out, spaceSeparated = true, escape = true, format = "%s") + + repeat("-sn", sample_name, spaceSeparated = true, escape = true, format = "%s") + + repeat("-se", sample_expressions, spaceSeparated = true, escape = true, format = "%s") + + repeat("-sf", sample_file, spaceSeparated = true, escape = true, format = "%s") + + repeat("-xl_sn", exclude_sample_name, spaceSeparated = true, escape = true, format = "%s") + + repeat("-xl_sf", exclude_sample_file, spaceSeparated = true, escape = true, format = "%s") + + repeat("-xl_se", exclude_sample_expressions, spaceSeparated = true, escape = true, format = "%s") + + repeat("-select", selectexpressions, spaceSeparated = true, escape = true, format = "%s") + + conditional(invertselect, "-invertSelect", escape = true, format = "%s") + + conditional(excludeNonVariants, "-env", escape = true, format = "%s") + + conditional(excludeFiltered, "-ef", escape = true, format = "%s") + + conditional(preserveAlleles, "-noTrim", escape = true, format = "%s") + + conditional(removeUnusedAlternates, "-trimAlternates", escape = true, format = "%s") + + optional("-restrictAllelesTo", restrictAllelesTo, spaceSeparated = true, escape = true, format = "%s") + + conditional(keepOriginalAC, "-keepOriginalAC", escape = true, format = "%s") + + conditional(keepOriginalDP, "-keepOriginalDP", escape = true, format = "%s") + + conditional(mendelianViolation, "-mv", escape = true, format = "%s") + + conditional(invertMendelianViolation, "-invMv", escape = true, format = "%s") + + optional("-mvq", mendelianViolationQualThreshold, spaceSeparated = true, escape = true, format = mendelianViolationQualThresholdFormat) + + optional("-fraction", select_random_fraction, spaceSeparated = true, escape = true, format = select_random_fractionFormat) + + optional("-fractionGenotypes", remove_fraction_genotypes, spaceSeparated = true, escape = true, format = remove_fraction_genotypesFormat) + + repeat("-selectType", selectTypeToInclude, spaceSeparated = true, escape = true, format = "%s") + + repeat("-xlSelectType", selectTypeToExclude, spaceSeparated = true, escape = true, format = "%s") + + optional("-IDs", keepIDs, spaceSeparated = true, escape = true, format = "%s") + + optional("-xlIDs", excludeIDs, spaceSeparated = true, escape = true, format = "%s") + + conditional(fullyDecode, "--fullyDecode", escape = true, format = "%s") + + conditional(justRead, "--justRead", escape = true, format = "%s") + + optional("--maxIndelSize", maxIndelSize, spaceSeparated = true, escape = true, format = "%s") + + optional("--minIndelSize", minIndelSize, spaceSeparated = true, escape = true, format = "%s") + + optional("--maxFilteredGenotypes", maxFilteredGenotypes, spaceSeparated = true, escape = true, format = "%s") + + optional("--minFilteredGenotypes", minFilteredGenotypes, spaceSeparated = true, escape = true, format = "%s") + + optional("--maxFractionFilteredGenotypes", maxFractionFilteredGenotypes, spaceSeparated = true, escape = true, format = maxFractionFilteredGenotypesFormat) + + optional("--minFractionFilteredGenotypes", minFractionFilteredGenotypes, spaceSeparated = true, escape = true, format = minFractionFilteredGenotypesFormat) + + conditional(setFilteredGtToNocall, "--setFilteredGtToNocall", escape = true, format = "%s") + + conditional(ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES, "--ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", escape = true, format = "%s") + + conditional(forceValidOutput, "--forceValidOutput", escape = true, format = "%s") + + conditional(filter_reads_with_N_cigar, "-filterRNC", escape = true, format = "%s") + + conditional(filter_mismatching_base_and_quals, "-filterMBQ", escape = true, format = "%s") + + conditional(filter_bases_not_stored, "-filterNoBases", escape = true, format = "%s") } diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/UnifiedGenotyper.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/UnifiedGenotyper.scala similarity index 99% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/UnifiedGenotyper.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/UnifiedGenotyper.scala index fbb4423898d3f0c688a71cce40a2f1ad2b632978..0edfe5260fe2fb9101fd92d01fbcebce94ba0441 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/UnifiedGenotyper.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/UnifiedGenotyper.scala @@ -1,11 +1,11 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File -import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import nl.lumc.sasc.biopet.core.ScatterGatherableFunction import nl.lumc.sasc.biopet.utils.VcfUtils +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import org.broadinstitute.gatk.utils.commandline.{ Gather, Input, Output, _ } class UnifiedGenotyper(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantAnnotator.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantAnnotator.scala similarity index 99% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantAnnotator.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantAnnotator.scala index 7fa034cfe6c48b3e4498a7ee4f968404a157a88c..d98a55a49eb6e34c00588fddde66dc00cddfe610 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantAnnotator.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantAnnotator.scala @@ -1,4 +1,4 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantEval.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantEval.scala similarity index 98% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantEval.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantEval.scala index a595ce5c1494101ac9104220db3a2c28d3120d2a..cf1c362c7dd06b2eec25104fdfd3fb5a06dea2f8 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantEval.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantEval.scala @@ -1,13 +1,11 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk + +import java.io.File import nl.lumc.sasc.biopet.utils.VcfUtils import nl.lumc.sasc.biopet.utils.config.Configurable import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile -import java.io.File -import org.broadinstitute.gatk.utils.commandline.Argument -import org.broadinstitute.gatk.utils.commandline.Gather -import org.broadinstitute.gatk.utils.commandline.Input -import org.broadinstitute.gatk.utils.commandline.Output +import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Input, Output } class VariantEval(val root: Configurable) extends CommandLineGATK { def analysis_type = "VariantEval" diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantRecalibrator.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantRecalibrator.scala similarity index 99% rename from biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantRecalibrator.scala rename to biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantRecalibrator.scala index db631449a9e6c59112832247367634ee8d256983..96b5ee4c00fc1350e6298f31d11c2308010b6e23 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantRecalibrator.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantRecalibrator.scala @@ -1,4 +1,4 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad +package nl.lumc.sasc.biopet.extensions.gatk import java.io.File diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CatVariants.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CatVariants.scala deleted file mode 100644 index 6a23df5ff8d1f1a66c7606e826d7a09ad8924cc5..0000000000000000000000000000000000000000 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CatVariants.scala +++ /dev/null @@ -1,59 +0,0 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad - -import java.io.File - -import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction -import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.utils.commandline.Argument -import org.broadinstitute.gatk.utils.commandline.Gather -import org.broadinstitute.gatk.utils.commandline.Input -import org.broadinstitute.gatk.utils.commandline.Output - -class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction { - analysisName = "CatVariants" - javaMainClass = "org.broadinstitute.gatk.tools.CatVariants" - - /** genome reference file <name>.fasta */ - @Input(fullName = "reference", shortName = "R", doc = "genome reference file <name>.fasta", required = true, exclusiveOf = "", validation = "") - var reference: File = _ - - /** Input VCF file/s */ - @Input(fullName = "variant", shortName = "V", doc = "Input VCF file/s", required = true, exclusiveOf = "", validation = "") - var variant: Seq[File] = Nil - - /** output file */ - @Output(fullName = "outputFile", shortName = "out", doc = "output file", required = true, exclusiveOf = "", validation = "") - @Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction]) - var outputFile: File = _ - - /** assumeSorted should be true if the input files are already sorted (based on the position of the variants) */ - @Argument(fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if the input files are already sorted (based on the position of the variants)", required = false, exclusiveOf = "", validation = "") - var assumeSorted: Boolean = _ - - /** which type of IndexCreator to use for VCF/BCF indices */ - @Argument(fullName = "variant_index_type", shortName = "", doc = "which type of IndexCreator to use for VCF/BCF indices", required = false, exclusiveOf = "", validation = "") - var variant_index_type: Option[String] = None - - /** the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator */ - @Argument(fullName = "variant_index_parameter", shortName = "", doc = "the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator", required = false, exclusiveOf = "", validation = "") - var variant_index_parameter: Option[Int] = None - - /** Set the minimum level of logging */ - @Argument(fullName = "logging_level", shortName = "l", doc = "Set the minimum level of logging", required = false, exclusiveOf = "", validation = "") - var logging_level: String = _ - - /** Set the logging location */ - @Output(fullName = "log_to_file", shortName = "log", doc = "Set the logging location", required = false, exclusiveOf = "", validation = "") - @Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction]) - var log_to_file: File = _ - - override def cmdLine = super.cmdLine + - required("-R", reference, spaceSeparated = true, escape = true, format = "%s") + - repeat("-V", variant, spaceSeparated = true, escape = true, format = "%s") + - required("-out", outputFile, spaceSeparated = true, escape = true, format = "%s") + - conditional(assumeSorted, "-assumeSorted", escape = true, format = "%s") + - optional("--variant_index_type", variant_index_type, spaceSeparated = true, escape = true, format = "%s") + - optional("--variant_index_parameter", variant_index_parameter, spaceSeparated = true, escape = true, format = "%s") + - optional("-l", logging_level, spaceSeparated = true, escape = true, format = "%s") + - optional("-log", log_to_file, spaceSeparated = true, escape = true, format = "%s") -} diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CombineVariants.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CombineVariants.scala deleted file mode 100644 index 7873ba3e44a3be042a923024af37a36922bb46d4..0000000000000000000000000000000000000000 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/CombineVariants.scala +++ /dev/null @@ -1,128 +0,0 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad - -import java.io.File - -import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile -import nl.lumc.sasc.biopet.core.ScatterGatherableFunction -import nl.lumc.sasc.biopet.utils.VcfUtils -import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ } - -class CombineVariants(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { - def analysis_type = "CombineVariants" - scatterClass = classOf[LocusScatterFunction] - setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = false } - - /** VCF files to merge together */ - @Input(fullName = "variant", shortName = "V", doc = "VCF files to merge together", required = true, exclusiveOf = "", validation = "") - var variant: Seq[File] = Nil - - /** File to which variants should be written */ - @Output(fullName = "out", shortName = "o", doc = "File to which variants should be written", required = false, exclusiveOf = "", validation = "") - @Gather(classOf[CatVariantsGatherer]) - var out: File = _ - - /** Determines how we should merge genotype records for samples shared across the ROD files */ - @Argument(fullName = "genotypemergeoption", shortName = "genotypeMergeOptions", doc = "Determines how we should merge genotype records for samples shared across the ROD files", required = false, exclusiveOf = "", validation = "") - var genotypemergeoption: Option[String] = config("genotypemergeoption") - - /** Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields */ - @Argument(fullName = "filteredrecordsmergetype", shortName = "filteredRecordsMergeType", doc = "Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required = false, exclusiveOf = "", validation = "") - var filteredrecordsmergetype: Option[String] = config("filteredrecordsmergetype") - - /** Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel) */ - @Argument(fullName = "multipleallelesmergetype", shortName = "multipleAllelesMergeType", doc = "Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required = false, exclusiveOf = "", validation = "") - var multipleallelesmergetype: Option[String] = config("multipleallelesmergetype") - - /** Ordered list specifying priority for merging */ - @Argument(fullName = "rod_priority_list", shortName = "priority", doc = "Ordered list specifying priority for merging", required = false, exclusiveOf = "", validation = "") - var rod_priority_list: Option[String] = config("rod_priority_list") - - /** Emit interesting sites requiring complex compatibility merging to file */ - @Argument(fullName = "printComplexMerges", shortName = "printComplexMerges", doc = "Emit interesting sites requiring complex compatibility merging to file", required = false, exclusiveOf = "", validation = "") - var printComplexMerges: Boolean = config("printComplexMerges", default = false) - - /** Treat filtered variants as uncalled */ - @Argument(fullName = "filteredAreUncalled", shortName = "filteredAreUncalled", doc = "Treat filtered variants as uncalled", required = false, exclusiveOf = "", validation = "") - var filteredAreUncalled: Boolean = config("filteredAreUncalled", default = false) - - /** Emit a sites-only file */ - @Argument(fullName = "minimalVCF", shortName = "minimalVCF", doc = "Emit a sites-only file", required = false, exclusiveOf = "", validation = "") - var minimalVCF: Boolean = config("minimalVCF", default = false) - - /** Exclude sites where no variation is present after merging */ - @Argument(fullName = "excludeNonVariants", shortName = "env", doc = "Exclude sites where no variation is present after merging", required = false, exclusiveOf = "", validation = "") - var excludeNonVariants: Boolean = config("excludeNonVariants", default = false) - - /** Key name for the set attribute */ - @Argument(fullName = "setKey", shortName = "setKey", doc = "Key name for the set attribute", required = false, exclusiveOf = "", validation = "") - var setKey: Option[String] = config("set_key") - - /** Assume input VCFs have identical sample sets and disjoint calls */ - @Argument(fullName = "assumeIdenticalSamples", shortName = "assumeIdenticalSamples", doc = "Assume input VCFs have identical sample sets and disjoint calls", required = false, exclusiveOf = "", validation = "") - var assumeIdenticalSamples: Boolean = config("assumeIdenticalSamples", default = false) - - /** Minimum number of input files the site must be observed in to be included */ - @Argument(fullName = "minimumN", shortName = "minN", doc = "Minimum number of input files the site must be observed in to be included", required = false, exclusiveOf = "", validation = "") - var minimumN: Option[Int] = config("minimumN") - - /** Do not output the command line to the header */ - @Argument(fullName = "suppressCommandLineHeader", shortName = "suppressCommandLineHeader", doc = "Do not output the command line to the header", required = false, exclusiveOf = "", validation = "") - var suppressCommandLineHeader: Boolean = config("suppressCommandLineHeader", default = false) - - /** Use the INFO content of the record with the highest AC */ - @Argument(fullName = "mergeInfoWithMaxAC", shortName = "mergeInfoWithMaxAC", doc = "Use the INFO content of the record with the highest AC", required = false, exclusiveOf = "", validation = "") - var mergeInfoWithMaxAC: Boolean = config("mergeInfoWithMaxAC", default = false) - - /** Filter out reads with CIGAR containing the N operator, instead of failing with an error */ - @Argument(fullName = "filter_reads_with_N_cigar", shortName = "filterRNC", doc = "Filter out reads with CIGAR containing the N operator, instead of failing with an error", required = false, exclusiveOf = "", validation = "") - var filter_reads_with_N_cigar: Boolean = config("filter_reads_with_N_cigar", default = false) - - /** Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error */ - @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error", required = false, exclusiveOf = "", validation = "") - var filter_mismatching_base_and_quals: Boolean = config("filter_mismatching_base_and_quals", default = false) - - /** Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error */ - @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error", required = false, exclusiveOf = "", validation = "") - var filter_bases_not_stored: Boolean = config("filter_bases_not_stored", default = false) - - @Output - @Gather(enabled = false) - private var outputIndex: File = _ - - override def beforeGraph() { - super.beforeGraph() - deps ++= variant.filter(orig => orig != null && (!orig.getName.endsWith(".list"))).map(orig => VcfUtils.getVcfIndexFile(orig)) - if (out != null && !org.broadinstitute.gatk.utils.io.IOUtils.isSpecialFile(out)) - outputIndex = VcfUtils.getVcfIndexFile(out) - } - - override def cmdLine = super.cmdLine + - repeat("-V", variant, formatPrefix = TaggedFile.formatCommandLineParameter, spaceSeparated = true, escape = true, format = "%s") + - optional("-o", out, spaceSeparated = true, escape = true, format = "%s") + - optional("-genotypeMergeOptions", genotypemergeoption, spaceSeparated = true, escape = true, format = "%s") + - optional("-filteredRecordsMergeType", filteredrecordsmergetype, spaceSeparated = true, escape = true, format = "%s") + - optional("-multipleAllelesMergeType", multipleallelesmergetype, spaceSeparated = true, escape = true, format = "%s") + - optional("-priority", rod_priority_list, spaceSeparated = true, escape = true, format = "%s") + - conditional(printComplexMerges, "-printComplexMerges", escape = true, format = "%s") + - conditional(filteredAreUncalled, "-filteredAreUncalled", escape = true, format = "%s") + - conditional(minimalVCF, "-minimalVCF", escape = true, format = "%s") + - conditional(excludeNonVariants, "-env", escape = true, format = "%s") + - optional("-setKey", setKey, spaceSeparated = true, escape = true, format = "%s") + - conditional(assumeIdenticalSamples, "-assumeIdenticalSamples", escape = true, format = "%s") + - optional("-minN", minimumN, spaceSeparated = true, escape = true, format = "%s") + - conditional(suppressCommandLineHeader, "-suppressCommandLineHeader", escape = true, format = "%s") + - conditional(mergeInfoWithMaxAC, "-mergeInfoWithMaxAC", escape = true, format = "%s") + - conditional(filter_reads_with_N_cigar, "-filterRNC", escape = true, format = "%s") + - conditional(filter_mismatching_base_and_quals, "-filterMBQ", escape = true, format = "%s") + - conditional(filter_bases_not_stored, "-filterNoBases", escape = true, format = "%s") -} - -object CombineVariants { - def apply(root: Configurable, input: List[File], output: File): CombineVariants = { - val cv = new CombineVariants(root) - cv.variant = input - cv.out = output - cv - } -} diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkGeneral.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkGeneral.scala deleted file mode 100644 index b8c3e6ba238836cf5a1c6f71a534a0359275e8c8..0000000000000000000000000000000000000000 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkGeneral.scala +++ /dev/null @@ -1,44 +0,0 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad - -import nl.lumc.sasc.biopet.core._ -import org.broadinstitute.gatk.engine.phonehome.GATKRunReport - -/** - * @deprecated - */ -trait GatkGeneral extends org.broadinstitute.gatk.queue.extensions.gatk.CommandLineGATK with CommandLineResources with Reference with Version { - var executable: String = config("java", default = "java", namespace = "java", freeVar = false) - - override def subPath = "gatk" :: super.subPath - - jarFile = config("gatk_jar") - - reference_sequence = referenceFasta() - - override def defaultCoreMemory = 4.0 - override def faiRequired = true - override def dictRequired = true - - if (config.contains("intervals")) intervals = config("intervals").asFileList - if (config.contains("exclude_intervals")) excludeIntervals = config("exclude_intervals").asFileList - - Option(config("et").value) match { - case Some("NO_ET") => et = GATKRunReport.PhoneHomeOption.NO_ET - case Some("AWS") => et = GATKRunReport.PhoneHomeOption.AWS - case Some("STDOUT") => et = GATKRunReport.PhoneHomeOption.STDOUT - case Some(x) => throw new IllegalArgumentException(s"Unknown et option for gatk: $x") - case _ => - } - - if (config.contains("gatk_key")) gatk_key = config("gatk_key") - if (config.contains("pedigree")) pedigree = config("pedigree") - - def versionRegex = """(.*)""".r - override def versionExitcode = List(0, 1) - def versionCommand = "java" + " -jar " + jarFile + " -version" - - override def getVersion = { - BiopetCommandLineFunction.preProcessExecutable(executable).path.foreach(executable = _) - super.getVersion.collect { case v => "Gatk " + v } - } -} diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/SelectVariants.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/SelectVariants.scala deleted file mode 100644 index dd49dd75da5fb6d91f5cc0826338796799a198c3..0000000000000000000000000000000000000000 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/SelectVariants.scala +++ /dev/null @@ -1,262 +0,0 @@ -package nl.lumc.sasc.biopet.extensions.gatk.broad - -import java.io.File - -import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile -import nl.lumc.sasc.biopet.core.ScatterGatherableFunction -import nl.lumc.sasc.biopet.utils.VcfUtils -import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ } - -class SelectVariants(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { - def analysis_type = "SelectVariants" - scatterClass = classOf[LocusScatterFunction] - setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = false } - - /** Input VCF file */ - @Input(fullName = "variant", shortName = "V", doc = "Input VCF file", required = true, exclusiveOf = "", validation = "") - var variant: File = _ - - /** Output variants not called in this comparison track */ - @Input(fullName = "discordance", shortName = "disc", doc = "Output variants not called in this comparison track", required = false, exclusiveOf = "", validation = "") - var discordance: Option[File] = None - - /** Output variants also called in this comparison track */ - @Input(fullName = "concordance", shortName = "conc", doc = "Output variants also called in this comparison track", required = false, exclusiveOf = "", validation = "") - var concordance: Option[File] = None - - /** File to which variants should be written */ - @Output(fullName = "out", shortName = "o", doc = "File to which variants should be written", required = false, exclusiveOf = "", validation = "") - @Gather(classOf[CatVariantsGatherer]) - var out: File = _ - - /** Include genotypes from this sample */ - @Argument(fullName = "sample_name", shortName = "sn", doc = "Include genotypes from this sample", required = false, exclusiveOf = "", validation = "") - var sample_name: List[String] = config("sample_name", default = Nil) - - /** Regular expression to select multiple samples */ - @Argument(fullName = "sample_expressions", shortName = "se", doc = "Regular expression to select multiple samples", required = false, exclusiveOf = "", validation = "") - var sample_expressions: List[String] = config("sample_expressions", default = Nil) - - /** File containing a list of samples to include */ - @Input(fullName = "sample_file", shortName = "sf", doc = "File containing a list of samples to include", required = false, exclusiveOf = "", validation = "") - var sample_file: List[File] = config("sample_file", default = Nil) - - /** Exclude genotypes from this sample */ - @Argument(fullName = "exclude_sample_name", shortName = "xl_sn", doc = "Exclude genotypes from this sample", required = false, exclusiveOf = "", validation = "") - var exclude_sample_name: List[String] = config("exclude_sample_name", default = Nil) - - /** List of samples to exclude */ - @Input(fullName = "exclude_sample_file", shortName = "xl_sf", doc = "List of samples to exclude", required = false, exclusiveOf = "", validation = "") - var exclude_sample_file: List[File] = config("exclude_sample_file", default = Nil) - - /** List of sample expressions to exclude */ - @Input(fullName = "exclude_sample_expressions", shortName = "xl_se", doc = "List of sample expressions to exclude", required = false, exclusiveOf = "", validation = "") - var exclude_sample_expressions: List[File] = config("exclude_sample_expressions", default = Nil) - - /** One or more criteria to use when selecting the data */ - @Argument(fullName = "selectexpressions", shortName = "select", doc = "One or more criteria to use when selecting the data", required = false, exclusiveOf = "", validation = "") - var selectexpressions: List[String] = config("selectexpressions", default = Nil) - - /** Invert the selection criteria for -select */ - @Argument(fullName = "invertselect", shortName = "invertSelect", doc = "Invert the selection criteria for -select", required = false, exclusiveOf = "", validation = "") - var invertselect: Boolean = config("invertselect", default = false) - - /** Don't include non-variant sites */ - @Argument(fullName = "excludeNonVariants", shortName = "env", doc = "Don't include non-variant sites", required = false, exclusiveOf = "", validation = "") - var excludeNonVariants: Boolean = config("excludeNonVariants", default = false) - - /** Don't include filtered sites */ - @Argument(fullName = "excludeFiltered", shortName = "ef", doc = "Don't include filtered sites", required = false, exclusiveOf = "", validation = "") - var excludeFiltered: Boolean = config("excludeFiltered", default = false) - - /** Preserve original alleles, do not trim */ - @Argument(fullName = "preserveAlleles", shortName = "noTrim", doc = "Preserve original alleles, do not trim", required = false, exclusiveOf = "", validation = "") - var preserveAlleles: Boolean = config("preserveAlleles", default = false) - - /** Remove alternate alleles not present in any genotypes */ - @Argument(fullName = "removeUnusedAlternates", shortName = "trimAlternates", doc = "Remove alternate alleles not present in any genotypes", required = false, exclusiveOf = "", validation = "") - var removeUnusedAlternates: Boolean = config("removeUnusedAlternates", default = false) - - /** Select only variants of a particular allelicity */ - @Argument(fullName = "restrictAllelesTo", shortName = "restrictAllelesTo", doc = "Select only variants of a particular allelicity", required = false, exclusiveOf = "", validation = "") - var restrictAllelesTo: Option[String] = config("restrictAllelesTo") - - /** Store the original AC, AF, and AN values after subsetting */ - @Argument(fullName = "keepOriginalAC", shortName = "keepOriginalAC", doc = "Store the original AC, AF, and AN values after subsetting", required = false, exclusiveOf = "", validation = "") - var keepOriginalAC: Boolean = config("keepOriginalAC", default = false) - - /** Store the original DP value after subsetting */ - @Argument(fullName = "keepOriginalDP", shortName = "keepOriginalDP", doc = "Store the original DP value after subsetting", required = false, exclusiveOf = "", validation = "") - var keepOriginalDP: Boolean = config("keepOriginalDP", default = false) - - /** Output mendelian violation sites only */ - @Argument(fullName = "mendelianViolation", shortName = "mv", doc = "Output mendelian violation sites only", required = false, exclusiveOf = "", validation = "") - var mendelianViolation: Boolean = config("mendelianViolation", default = false) - - /** Output non-mendelian violation sites only */ - @Argument(fullName = "invertMendelianViolation", shortName = "invMv", doc = "Output non-mendelian violation sites only", required = false, exclusiveOf = "", validation = "") - var invertMendelianViolation: Boolean = config("invertMendelianViolation", default = false) - - /** Minimum GQ score for each trio member to accept a site as a violation */ - @Argument(fullName = "mendelianViolationQualThreshold", shortName = "mvq", doc = "Minimum GQ score for each trio member to accept a site as a violation", required = false, exclusiveOf = "", validation = "") - var mendelianViolationQualThreshold: Option[Double] = config("mendelianViolationQualThreshold") - - /** Format string for mendelianViolationQualThreshold */ - @Argument(fullName = "mendelianViolationQualThresholdFormat", shortName = "", doc = "Format string for mendelianViolationQualThreshold", required = false, exclusiveOf = "", validation = "") - var mendelianViolationQualThresholdFormat: String = "%s" - - /** Select a fraction of variants at random from the input */ - @Argument(fullName = "select_random_fraction", shortName = "fraction", doc = "Select a fraction of variants at random from the input", required = false, exclusiveOf = "", validation = "") - var select_random_fraction: Option[Double] = config("select_random_fraction") - - /** Format string for select_random_fraction */ - @Argument(fullName = "select_random_fractionFormat", shortName = "", doc = "Format string for select_random_fraction", required = false, exclusiveOf = "", validation = "") - var select_random_fractionFormat: String = "%s" - - /** Select a fraction of genotypes at random from the input and sets them to no-call */ - @Argument(fullName = "remove_fraction_genotypes", shortName = "fractionGenotypes", doc = "Select a fraction of genotypes at random from the input and sets them to no-call", required = false, exclusiveOf = "", validation = "") - var remove_fraction_genotypes: Option[Double] = config("remove_fraction_genotypes") - - /** Format string for remove_fraction_genotypes */ - @Argument(fullName = "remove_fraction_genotypesFormat", shortName = "", doc = "Format string for remove_fraction_genotypes", required = false, exclusiveOf = "", validation = "") - var remove_fraction_genotypesFormat: String = "%s" - - /** Select only a certain type of variants from the input file */ - @Argument(fullName = "selectTypeToInclude", shortName = "selectType", doc = "Select only a certain type of variants from the input file", required = false, exclusiveOf = "", validation = "") - var selectTypeToInclude: List[String] = config("selectTypeToInclude", default = Nil) - - /** Do not select certain type of variants from the input file */ - @Argument(fullName = "selectTypeToExclude", shortName = "xlSelectType", doc = "Do not select certain type of variants from the input file", required = false, exclusiveOf = "", validation = "") - var selectTypeToExclude: Seq[String] = config("selectTypeToExclude", default = Nil) - - /** List of variant IDs to select */ - @Input(fullName = "keepIDs", shortName = "IDs", doc = "List of variant IDs to select", required = false, exclusiveOf = "", validation = "") - var keepIDs: Option[File] = config("keepIDs") - - /** List of variant IDs to select */ - @Argument(fullName = "excludeIDs", shortName = "xlIDs", doc = "List of variant IDs to select", required = false, exclusiveOf = "", validation = "") - var excludeIDs: Option[File] = config("excludeIDs") - - /** If true, the incoming VariantContext will be fully decoded */ - @Argument(fullName = "fullyDecode", shortName = "", doc = "If true, the incoming VariantContext will be fully decoded", required = false, exclusiveOf = "", validation = "") - var fullyDecode: Boolean = config("fullyDecode", default = false) - - /** If true, we won't actually write the output file. For efficiency testing only */ - @Argument(fullName = "justRead", shortName = "", doc = "If true, we won't actually write the output file. For efficiency testing only", required = false, exclusiveOf = "", validation = "") - var justRead: Boolean = config("justRead", default = false) - - /** Maximum size of indels to include */ - @Argument(fullName = "maxIndelSize", shortName = "", doc = "Maximum size of indels to include", required = false, exclusiveOf = "", validation = "") - var maxIndelSize: Option[Int] = config("maxIndelSize") - - /** Minimum size of indels to include */ - @Argument(fullName = "minIndelSize", shortName = "", doc = "Minimum size of indels to include", required = false, exclusiveOf = "", validation = "") - var minIndelSize: Option[Int] = config("minIndelSize") - - /** Maximum number of samples filtered at the genotype level */ - @Argument(fullName = "maxFilteredGenotypes", shortName = "", doc = "Maximum number of samples filtered at the genotype level", required = false, exclusiveOf = "", validation = "") - var maxFilteredGenotypes: Option[Int] = config("maxFilteredGenotypes") - - /** Minimum number of samples filtered at the genotype level */ - @Argument(fullName = "minFilteredGenotypes", shortName = "", doc = "Minimum number of samples filtered at the genotype level", required = false, exclusiveOf = "", validation = "") - var minFilteredGenotypes: Option[Int] = config("minFilteredGenotypes") - - /** Maximum fraction of samples filtered at the genotype level */ - @Argument(fullName = "maxFractionFilteredGenotypes", shortName = "", doc = "Maximum fraction of samples filtered at the genotype level", required = false, exclusiveOf = "", validation = "") - var maxFractionFilteredGenotypes: Option[Double] = config("maxFractionFilteredGenotypes") - - /** Format string for maxFractionFilteredGenotypes */ - @Argument(fullName = "maxFractionFilteredGenotypesFormat", shortName = "", doc = "Format string for maxFractionFilteredGenotypes", required = false, exclusiveOf = "", validation = "") - var maxFractionFilteredGenotypesFormat: String = "%s" - - /** Maximum fraction of samples filtered at the genotype level */ - @Argument(fullName = "minFractionFilteredGenotypes", shortName = "", doc = "Maximum fraction of samples filtered at the genotype level", required = false, exclusiveOf = "", validation = "") - var minFractionFilteredGenotypes: Option[Double] = config("minFractionFilteredGenotypes") - - /** Format string for minFractionFilteredGenotypes */ - @Argument(fullName = "minFractionFilteredGenotypesFormat", shortName = "", doc = "Format string for minFractionFilteredGenotypes", required = false, exclusiveOf = "", validation = "") - var minFractionFilteredGenotypesFormat: String = "%s" - - /** Set filtered genotypes to no-call */ - @Argument(fullName = "setFilteredGtToNocall", shortName = "", doc = "Set filtered genotypes to no-call", required = false, exclusiveOf = "", validation = "") - var setFilteredGtToNocall: Boolean = config("setFilteredGtToNocall", default = false) - - /** Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored. */ - @Argument(fullName = "ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", shortName = "", doc = "Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.", required = false, exclusiveOf = "", validation = "") - var ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES: Boolean = config("ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", default = false) - - /** Forces output VCF to be compliant to up-to-date version */ - @Argument(fullName = "forceValidOutput", shortName = "", doc = "Forces output VCF to be compliant to up-to-date version", required = false, exclusiveOf = "", validation = "") - var forceValidOutput: Boolean = config("forceValidOutput", default = false) - - /** Filter out reads with CIGAR containing the N operator, instead of failing with an error */ - @Argument(fullName = "filter_reads_with_N_cigar", shortName = "filterRNC", doc = "Filter out reads with CIGAR containing the N operator, instead of failing with an error", required = false, exclusiveOf = "", validation = "") - var filter_reads_with_N_cigar: Boolean = config("filter_reads_with_N_cigar", default = false) - - /** Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error */ - @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error", required = false, exclusiveOf = "", validation = "") - var filter_mismatching_base_and_quals: Boolean = config("filter_mismatching_base_and_quals", default = false) - - /** Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error */ - @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error", required = false, exclusiveOf = "", validation = "") - var filter_bases_not_stored: Boolean = config("filter_bases_not_stored", default = false) - - @Output - @Gather(enabled = false) - private var outputIndex: File = _ - - override def beforeGraph() { - super.beforeGraph() - if (variant != null) - deps :+= VcfUtils.getVcfIndexFile(variant) - discordance.foreach(deps :+= VcfUtils.getVcfIndexFile(_)) - concordance.foreach(deps :+= VcfUtils.getVcfIndexFile(_)) - if (out != null && !org.broadinstitute.gatk.utils.io.IOUtils.isSpecialFile(out)) - outputIndex = VcfUtils.getVcfIndexFile(out) - } - - override def cmdLine = super.cmdLine + - required(TaggedFile.formatCommandLineParameter("-V", variant), variant, spaceSeparated = true, escape = true, format = "%s") + - optional(TaggedFile.formatCommandLineParameter("-disc", discordance), discordance, spaceSeparated = true, escape = true, format = "%s") + - optional(TaggedFile.formatCommandLineParameter("-conc", concordance), concordance, spaceSeparated = true, escape = true, format = "%s") + - optional("-o", out, spaceSeparated = true, escape = true, format = "%s") + - repeat("-sn", sample_name, spaceSeparated = true, escape = true, format = "%s") + - repeat("-se", sample_expressions, spaceSeparated = true, escape = true, format = "%s") + - repeat("-sf", sample_file, spaceSeparated = true, escape = true, format = "%s") + - repeat("-xl_sn", exclude_sample_name, spaceSeparated = true, escape = true, format = "%s") + - repeat("-xl_sf", exclude_sample_file, spaceSeparated = true, escape = true, format = "%s") + - repeat("-xl_se", exclude_sample_expressions, spaceSeparated = true, escape = true, format = "%s") + - repeat("-select", selectexpressions, spaceSeparated = true, escape = true, format = "%s") + - conditional(invertselect, "-invertSelect", escape = true, format = "%s") + - conditional(excludeNonVariants, "-env", escape = true, format = "%s") + - conditional(excludeFiltered, "-ef", escape = true, format = "%s") + - conditional(preserveAlleles, "-noTrim", escape = true, format = "%s") + - conditional(removeUnusedAlternates, "-trimAlternates", escape = true, format = "%s") + - optional("-restrictAllelesTo", restrictAllelesTo, spaceSeparated = true, escape = true, format = "%s") + - conditional(keepOriginalAC, "-keepOriginalAC", escape = true, format = "%s") + - conditional(keepOriginalDP, "-keepOriginalDP", escape = true, format = "%s") + - conditional(mendelianViolation, "-mv", escape = true, format = "%s") + - conditional(invertMendelianViolation, "-invMv", escape = true, format = "%s") + - optional("-mvq", mendelianViolationQualThreshold, spaceSeparated = true, escape = true, format = mendelianViolationQualThresholdFormat) + - optional("-fraction", select_random_fraction, spaceSeparated = true, escape = true, format = select_random_fractionFormat) + - optional("-fractionGenotypes", remove_fraction_genotypes, spaceSeparated = true, escape = true, format = remove_fraction_genotypesFormat) + - repeat("-selectType", selectTypeToInclude, spaceSeparated = true, escape = true, format = "%s") + - repeat("-xlSelectType", selectTypeToExclude, spaceSeparated = true, escape = true, format = "%s") + - optional("-IDs", keepIDs, spaceSeparated = true, escape = true, format = "%s") + - optional("-xlIDs", excludeIDs, spaceSeparated = true, escape = true, format = "%s") + - conditional(fullyDecode, "--fullyDecode", escape = true, format = "%s") + - conditional(justRead, "--justRead", escape = true, format = "%s") + - optional("--maxIndelSize", maxIndelSize, spaceSeparated = true, escape = true, format = "%s") + - optional("--minIndelSize", minIndelSize, spaceSeparated = true, escape = true, format = "%s") + - optional("--maxFilteredGenotypes", maxFilteredGenotypes, spaceSeparated = true, escape = true, format = "%s") + - optional("--minFilteredGenotypes", minFilteredGenotypes, spaceSeparated = true, escape = true, format = "%s") + - optional("--maxFractionFilteredGenotypes", maxFractionFilteredGenotypes, spaceSeparated = true, escape = true, format = maxFractionFilteredGenotypesFormat) + - optional("--minFractionFilteredGenotypes", minFractionFilteredGenotypes, spaceSeparated = true, escape = true, format = minFractionFilteredGenotypesFormat) + - conditional(setFilteredGtToNocall, "--setFilteredGtToNocall", escape = true, format = "%s") + - conditional(ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES, "--ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", escape = true, format = "%s") + - conditional(forceValidOutput, "--forceValidOutput", escape = true, format = "%s") + - conditional(filter_reads_with_N_cigar, "-filterRNC", escape = true, format = "%s") + - conditional(filter_mismatching_base_and_quals, "-filterMBQ", escape = true, format = "%s") + - conditional(filter_bases_not_stored, "-filterNoBases", escape = true, format = "%s") -} diff --git a/biopet-tools-extensions/src/test/scala/nl/lumc/sasc/biopet/extensions/tools/VcfFilterTest.scala b/biopet-tools-extensions/src/test/scala/nl/lumc/sasc/biopet/extensions/tools/VcfFilterTest.scala index cc98b0c7ad39f729bae0af7ed8ff90604f82e5b3..5444ade3b60a58c4e94a312e6c4d24a156c42c84 100644 --- a/biopet-tools-extensions/src/test/scala/nl/lumc/sasc/biopet/extensions/tools/VcfFilterTest.scala +++ b/biopet-tools-extensions/src/test/scala/nl/lumc/sasc/biopet/extensions/tools/VcfFilterTest.scala @@ -19,7 +19,7 @@ import java.io.File import org.scalatest.Matchers import org.scalatest.testng.TestNGSuite -import org.testng.annotations.{ DataProvider, Test } +import org.testng.annotations.Test /** * Created by ahbbollen on 2-3-16. @@ -44,34 +44,26 @@ class VcfFilterTest extends TestNGSuite with Matchers { filterer.outputVcfIndex.getAbsolutePath shouldBe oVcf.getAbsolutePath + ".tbi" } - @DataProvider(name = "functions") - def functions = { - Array( - () => testCommand(minSampleDepth = Some(2)), - () => testCommand(minTotalDepth = Some(2)), - () => testCommand(minAlternateDepth = Some(2)), - () => testCommand(minSamplesPass = Some(2)), - () => testCommand(minGenomeQuality = Some(50)), - () => testCommand(filterRefCalls = true), - () => testCommand(invertedOutputVcf = Some(File.createTempFile("vcfFilter", ".vcf"))), - () => testCommand(resToDom = Some("dummy")), - () => testCommand(trioCompound = Some("dummy")), - () => testCommand(deNovoInSample = Some("dummy")), - () => testCommand(deNovoTrio = Some("dummy")), - () => testCommand(trioLossOfHet = Some("dummy")), - () => testCommand(mustHaveVariant = List("sample1", "sample2")), - () => testCommand(calledIn = List("sample1", "sample2")), - () => testCommand(mustHaveGenotype = List("sample1:HET", "sample2:HET")), - () => testCommand(diffGenotype = List("sample1:sample2", "sample2:sample3")), - () => testCommand(minQualScore = Some(50.0)), - () => testCommand(filterHetVarToHomVar = List("dummy")), - () => testCommand(id = List("rs01", "rs02")), - () => testCommand(idFile = Some(File.createTempFile("vcfFilter", ".txt"))) - ).map(Array(_)) - } - - @Test(dataProvider = "functions") - def executer(function0: Function0[Unit]): Unit = function0() + @Test def testMinSampleDepth() = testCommand(minSampleDepth = Some(2)) + @Test def testMinTotalDepth() = testCommand(minTotalDepth = Some(2)) + @Test def testMinAlternateDepth() = testCommand(minAlternateDepth = Some(2)) + @Test def testMinSamplesPass() = testCommand(minSamplesPass = Some(2)) + @Test def testMinGenomeQuality() = testCommand(minGenomeQuality = Some(50)) + @Test def testFilterRefCalls() = testCommand(filterRefCalls = true) + @Test def testInvertedOutputVcf() = testCommand(invertedOutputVcf = Some(File.createTempFile("vcfFilter", ".vcf"))) + @Test def testResToDom() = testCommand(resToDom = Some("dummy")) + @Test def testTrioCompound() = testCommand(trioCompound = Some("dummy")) + @Test def testDeNovoInSample() = testCommand(deNovoInSample = Some("dummy")) + @Test def testDeNovoTrio() = testCommand(deNovoTrio = Some("dummy")) + @Test def testTrioLossOfHet() = testCommand(trioLossOfHet = Some("dummy")) + @Test def testMustHaveVariant() = testCommand(mustHaveVariant = List("sample1", "sample2")) + @Test def testCalledIn() = testCommand(calledIn = List("sample1", "sample2")) + @Test def testMustHaveGenotype() = testCommand(mustHaveGenotype = List("sample1:HET", "sample2:HET")) + @Test def testDiffGenotype() = testCommand(diffGenotype = List("sample1:sample2", "sample2:sample3")) + @Test def testMinQualScore() = testCommand(minQualScore = Some(50.0)) + @Test def testFilterHetVarToHomVar() = testCommand(filterHetVarToHomVar = List("dummy")) + @Test def testId() = testCommand(id = List("rs01", "rs02")) + @Test def testIdFile() = testCommand(idFile = Some(File.createTempFile("vcfFilter", ".txt"))) protected def testCommand(minSampleDepth: Option[Int] = None, minTotalDepth: Option[Int] = None, diff --git a/docs/pipelines/toucan.md b/docs/pipelines/toucan.md index 12f4ea108c54402f51d37e6f673d97e633058d6f..5f362bf13fa083d571f57e03f95351fc1231a16b 100644 --- a/docs/pipelines/toucan.md +++ b/docs/pipelines/toucan.md @@ -83,6 +83,25 @@ The following config values are optional: Annotation queries can be set by the `annotation_queries` config value in the `manwe` config namespace. By default, a global query is returned. + +###Groups +In case you want to add your samples to a specific group in your varda database, you can use the tagging system in your sample config. +Specifically, the `varda_group` tag should be a list of strings pointing to group. + +E.g. : + +```json +{ + "samples": { + "sample1": { + "tags": { + "varda_group": ["group1", "group2"] + } + } + } +} +``` + Running the pipeline --------------- The command to run the pipeline is: diff --git a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala index fc8db7ab30f7581c7638f15c48bba6e9443eb195..3cb06df0e160cb97b98710de74f7ca9fa31ce919 100644 --- a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala +++ b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala @@ -16,6 +16,7 @@ package nl.lumc.sasc.biopet.pipelines.flexiprep import nl.lumc.sasc.biopet.utils.config.Configurable +import scala.collection.JavaConversions._ /** * Cutadapt wrapper specific for Flexiprep. @@ -41,23 +42,26 @@ class Cutadapt(root: Configurable, fastqc: Fastqc) extends nl.lumc.sasc.biopet.e val adapterCounts: Map[String, Any] = initStats.get(adaptersStatsName) match { // "adapters" key found in statistics case Some(m: Map[_, _]) => m.flatMap { - case (seq: String, count) => - seqToNameMap.get(seq) match { + case (adapterSequence: String, adapterStats: Map[_, _]) => + seqToNameMap.get(adapterSequence) match { // adapter sequence is found by FastQC - case Some(n) => Some(n -> Map("sequence" -> seq, "count" -> count)) + case Some(adapterSeqName) => { + Some(adapterSeqName -> + Map("sequence" -> adapterSequence, "stats" -> adapterStats.toMap) + ) + } // adapter sequence is clipped but not found by FastQC ~ should not happen since all clipped adapter // sequences come from FastQC case _ => - throw new IllegalStateException(s"Adapter '$seq' is clipped but not found by FastQC in '$fastqInput'.") + throw new IllegalStateException(s"Adapter '$adapterSequence' is clipped but not found by FastQC in '$fastqInput'.") } // FastQC found no adapters case otherwise => - ; logger.debug(s"No adapters found for summarizing in '$fastqInput'.") None } // "adapters" key not found ~ something went wrong in our part - case _ => throw new RuntimeException(s"Required key 'adapters' not found in stats entry '$fastqInput'.") + case _ => throw new RuntimeException(s"Required key '${adaptersStatsName}' not found in stats entry '${fastqInput}'.") } initStats.updated(adaptersStatsName, adapterCounts) } diff --git a/flexiprep/src/test/resources/ct-test.R1.clip.stats b/flexiprep/src/test/resources/ct-test.R1.clip.stats new file mode 100644 index 0000000000000000000000000000000000000000..4a280ef0a7d2588169c02b5e40432f4f903c69b8 --- /dev/null +++ b/flexiprep/src/test/resources/ct-test.R1.clip.stats @@ -0,0 +1,160 @@ +This is cutadapt 1.9.1 with Python 2.7.6 +Command line parameters: -b CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC -b GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG -b CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC -b GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG --error-rate 0.2 --times 2 -m 15 ct_r1.fq.gz.seqtk.fq --output ct_r1.fq.gz.cutadapt.fq +Trimming 4 adapters with at most 20.0% errors in single-end mode ... +Finished in 0.19 s (189 us/read; 0.32 M reads/minute). + +=== Summary === + +Total reads processed: 1,000 +Reads with adapters: 440 (44.0%) +Reads that were too short: 15 (1.5%) +Reads written (passing filters): 985 (98.5%) + +Total basepairs processed: 100,000 bp +Total written (filtered): 89,423 bp (89.4%) + +=== Adapter 1 === + +Sequence: CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC; Type: variable 5'/3'; Length: 63; Trimmed: 94 times. +18 times, it overlapped the 5' end of a read +76 times, it overlapped the 3' end or was within the read + +No. of allowed errors: +0-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-34 bp: 6; 35-39 bp: 7; 40-44 bp: 8; 45-49 bp: 9; 50-54 bp: 10; 55-59 bp: 11; 60-63 bp: 12 + +Overview of removed sequences (5') +length count expect max.err error counts +3 8 15.6 0 8 +4 3 3.9 0 2 1 +5 2 1.0 1 0 2 +6 4 0.2 1 1 3 +9 1 0.0 1 0 0 1 + + +Overview of removed sequences (3' or within) +length count expect max.err error counts +3 13 15.6 0 13 +4 19 3.9 0 3 16 +5 21 1.0 1 0 21 +6 18 0.2 1 1 17 +7 2 0.1 1 0 2 +9 1 0.0 1 0 0 1 +11 1 0.0 2 0 0 1 +12 1 0.0 2 0 0 1 + +=== Adapter 2 === + +Sequence: GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG; Type: variable 5'/3'; Length: 63; Trimmed: 340 times. +117 times, it overlapped the 5' end of a read +223 times, it overlapped the 3' end or was within the read + +No. of allowed errors: +0-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-34 bp: 6; 35-39 bp: 7; 40-44 bp: 8; 45-49 bp: 9; 50-54 bp: 10; 55-59 bp: 11; 60-63 bp: 12 + +Overview of removed sequences (5') +length count expect max.err error counts +3 14 15.6 0 14 +4 29 3.9 0 6 23 +5 32 1.0 1 3 29 +6 36 0.2 1 0 36 +8 1 0.0 1 0 1 +9 1 0.0 1 0 0 1 +10 1 0.0 2 0 0 1 +11 2 0.0 2 0 0 2 +37 1 0.0 7 0 0 0 0 0 1 + + +Overview of removed sequences (3' or within) +length count expect max.err error counts +3 18 15.6 0 18 +4 9 3.9 0 5 4 +5 15 1.0 1 8 7 +6 10 0.2 1 8 2 +7 7 0.1 1 5 2 +8 10 0.0 1 9 1 +9 6 0.0 1 5 1 +10 8 0.0 2 5 0 3 +11 4 0.0 2 4 +12 4 0.0 2 4 +13 9 0.0 2 9 +14 4 0.0 2 3 0 1 +15 7 0.0 3 7 +16 2 0.0 3 2 +17 4 0.0 3 2 1 0 1 +18 2 0.0 3 2 +19 2 0.0 3 2 +20 2 0.0 4 0 1 1 +21 7 0.0 4 6 1 +22 7 0.0 4 7 +23 2 0.0 4 2 +24 3 0.0 4 3 +25 5 0.0 5 5 +26 5 0.0 5 5 +27 8 0.0 5 8 +28 6 0.0 5 5 1 +29 2 0.0 5 2 +30 5 0.0 6 5 +31 3 0.0 6 3 +32 8 0.0 6 8 +33 1 0.0 6 1 +34 5 0.0 6 0 5 +35 2 0.0 7 0 0 0 0 0 0 2 +36 3 0.0 7 0 0 0 0 0 0 3 +37 4 0.0 7 0 0 0 0 0 0 0 2 2 +38 2 0.0 7 0 0 0 0 0 0 0 0 0 2 +39 4 0.0 7 0 0 0 0 1 0 0 0 0 3 +40 3 0.0 8 0 0 0 0 0 0 0 3 +41 1 0.0 8 0 0 0 0 0 0 0 1 +42 4 0.0 8 0 0 0 0 0 0 0 0 4 +43 5 0.0 8 0 0 0 0 0 0 0 0 0 5 +44 3 0.0 8 0 0 0 0 0 0 0 0 0 0 3 +46 1 0.0 9 0 0 0 0 0 0 0 0 0 0 1 +49 1 0.0 9 0 0 0 0 0 1 + +=== Adapter 3 === + +Sequence: CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC; Type: variable 5'/3'; Length: 63; Trimmed: 0 times. + +=== Adapter 4 === + +Sequence: GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG; Type: variable 5'/3'; Length: 63; Trimmed: 82 times. +15 times, it overlapped the 5' end of a read +67 times, it overlapped the 3' end or was within the read + +No. of allowed errors: +0-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-34 bp: 6; 35-39 bp: 7; 40-44 bp: 8; 45-49 bp: 9; 50-54 bp: 10; 55-59 bp: 11; 60-63 bp: 12 + +Overview of removed sequences (5') +length count expect max.err error counts +26 1 0.0 5 0 1 +61 2 0.0 12 0 0 0 2 +64 11 0.0 12 0 0 0 11 +72 1 0.0 12 0 0 0 0 0 0 0 0 0 0 0 1 + + +Overview of removed sequences (3' or within) +length count expect max.err error counts +45 3 0.0 9 0 0 0 3 +46 2 0.0 9 0 0 0 2 +47 3 0.0 9 0 0 0 3 +48 3 0.0 9 0 0 0 3 +49 2 0.0 9 0 0 0 2 +50 3 0.0 10 0 0 0 3 +51 2 0.0 10 0 0 0 2 +52 6 0.0 10 0 0 0 6 +53 1 0.0 10 0 0 0 1 +54 5 0.0 10 0 0 0 4 0 1 +56 2 0.0 11 0 0 0 2 +57 2 0.0 11 0 0 0 2 +58 2 0.0 11 0 0 0 2 +59 3 0.0 11 0 0 0 2 0 0 0 0 0 1 +61 1 0.0 12 0 0 0 0 0 1 +62 3 0.0 12 0 0 0 2 1 +63 1 0.0 12 0 0 0 0 1 +66 3 0.0 12 0 0 0 3 +67 3 0.0 12 0 0 0 3 +70 1 0.0 12 0 0 0 1 +72 1 0.0 12 0 0 0 1 +80 1 0.0 12 0 0 0 1 +99 14 0.0 12 0 0 0 14 + diff --git a/flexiprep/src/test/resources/fqc_contaminants_v0112.txt b/flexiprep/src/test/resources/fqc_contaminants_v0112.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2c29bee8171e0454994c6d7d6f0f4780efb3921 --- /dev/null +++ b/flexiprep/src/test/resources/fqc_contaminants_v0112.txt @@ -0,0 +1,182 @@ +# This file contains a list of potential contaminants which are +# frequently found in high throughput sequencing reactions. These +# are mostly sequences of adapters / primers used in the various +# sequencing chemistries. +# +# Please DO NOT rely on these sequences to design your own oligos, some +# of them are truncated at ambiguous positions, and none of them are +# definitive sequences from the manufacturers so don't blame us if you +# try to use them and they don't work. +# +# You can add more sequences to the file by putting one line per entry +# and specifying a name[tab]sequence. If the contaminant you add is +# likely to be of use to others please consider sending it to the FastQ +# authors, either via a bug report at www.bioinformatics.babraham.ac.uk/bugzilla/ +# or by directly emailing simon.andrews@babraham.ac.uk so other users of +# the program can benefit. + +Illumina Single End Adapter 1 GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG +Illumina Single End Adapter 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +Illumina Single End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Single End PCR Primer 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +Illumina Single End Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT + +Illumina Paired End Adapter 1 ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Paired End Adapter 2 GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +Illumina Paried End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Paired End PCR Primer 2 CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +Illumina Paried End Sequencing Primer 1 ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Paired End Sequencing Primer 2 CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT + +Illumina DpnII expression Adapter 1 ACAGGTTCAGAGTTCTACAGTCCGAC +Illumina DpnII expression Adapter 2 CAAGCAGAAGACGGCATACGA +Illumina DpnII expression PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina DpnII expression PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina DpnII expression Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +Illumina NlaIII expression Adapter 1 ACAGGTTCAGAGTTCTACAGTCCGACATG +Illumina NlaIII expression Adapter 2 CAAGCAGAAGACGGCATACGA +Illumina NlaIII expression PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina NlaIII expression PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina NlaIII expression Sequencing Primer CCGACAGGTTCAGAGTTCTACAGTCCGACATG + +Illumina Small RNA Adapter 1 GTTCAGAGTTCTACAGTCCGACGATC +Illumina Small RNA Adapter 2 TGGAATTCTCGGGTGCCAAGG +Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA +Illumina Small RNA PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina Small RNA PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina Small RNA Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +Illumina Multiplexing Adapter 1 GATCGGAAGAGCACACGTCT +Illumina Multiplexing Adapter 2 ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Multiplexing PCR Primer 1.01 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Multiplexing PCR Primer 2.01 GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +Illumina Multiplexing Read1 Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Multiplexing Index Sequencing Primer GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +Illumina Multiplexing Read2 Sequencing Primer GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT + +Illumina PCR Primer Index 1 CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC +Illumina PCR Primer Index 2 CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC +Illumina PCR Primer Index 3 CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC +Illumina PCR Primer Index 4 CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC +Illumina PCR Primer Index 5 CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC +Illumina PCR Primer Index 6 CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC +Illumina PCR Primer Index 7 CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC +Illumina PCR Primer Index 8 CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC +Illumina PCR Primer Index 9 CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC +Illumina PCR Primer Index 10 CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC +Illumina PCR Primer Index 11 CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC +Illumina PCR Primer Index 12 CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC + +Illumina DpnII Gex Adapter 1 GATCGTCGGACTGTAGAACTCTGAAC +Illumina DpnII Gex Adapter 1.01 ACAGGTTCAGAGTTCTACAGTCCGAC +Illumina DpnII Gex Adapter 2 CAAGCAGAAGACGGCATACGA +Illumina DpnII Gex Adapter 2.01 TCGTATGCCGTCTTCTGCTTG +Illumina DpnII Gex PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina DpnII Gex PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina DpnII Gex Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +Illumina NlaIII Gex Adapter 1.01 TCGGACTGTAGAACTCTGAAC +Illumina NlaIII Gex Adapter 1.02 ACAGGTTCAGAGTTCTACAGTCCGACATG +Illumina NlaIII Gex Adapter 2.01 CAAGCAGAAGACGGCATACGA +Illumina NlaIII Gex Adapter 2.02 TCGTATGCCGTCTTCTGCTTG +Illumina NlaIII Gex PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina NlaIII Gex PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina NlaIII Gex Sequencing Primer CCGACAGGTTCAGAGTTCTACAGTCCGACATG + +Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA +Illumina 5p RNA Adapter GTTCAGAGTTCTACAGTCCGACGATC +Illumina RNA Adapter1 TGGAATTCTCGGGTGCCAAGG + +Illumina Small RNA 3p Adapter 1 ATCTCGTATGCCGTCTTCTGCTTG +Illumina Small RNA PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina Small RNA PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina Small RNA Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +TruSeq Universal Adapter AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +TruSeq Adapter, Index 1 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 2 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 3 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 4 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 5 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 6 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 7 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 8 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 9 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 10 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 11 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 12 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 13 GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 14 GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 15 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 16 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 18 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 19 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 20 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 21 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 22 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 23 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCACTCTTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 25 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 27 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTCTCGTATGCCGTCTTCTGCTTG + +Illumina RNA RT Primer GCCTTGGCACCCGAGAATTCCA +Illumina RNA PCR Primer AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA + +RNA PCR Primer, Index 1 CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 2 CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 3 CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 4 CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 5 CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 6 CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 7 CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 8 CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 9 CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 10 CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 11 CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 12 CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 13 CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 14 CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 15 CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 16 CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 17 CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 18 CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 19 CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 20 CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 21 CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 22 CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 23 CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 24 CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 25 CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 26 CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 27 CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 28 CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 29 CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 30 CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 31 CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 32 CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 33 CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 34 CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 35 CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 36 CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 37 CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 38 CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 39 CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 40 CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 41 CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 42 CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 43 CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 44 CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 45 CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 46 CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 47 CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 48 CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA + +ABI Dynabead EcoP Oligo CTGATCTAGAGGTACCGGATCCCAGCAGT +ABI Solid3 Adapter A CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG +ABI Solid3 Adapter B CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT +ABI Solid3 5' AMP Primer CCACTACGCCTCCGCTTTCCTCTCTATG +ABI Solid3 3' AMP Primer CTGCCCCGGGTTCCTCATTCT +ABI Solid3 EF1 alpha Sense Primer CATGTGTGTTGAGAGCTTC +ABI Solid3 EF1 alpha Antisense Primer GAAAACCAAAGTGGTCCAC +ABI Solid3 GAPDH Forward Primer TTAGCACCCCTGGCCAAGG +ABI Solid3 GAPDH Reverse Primer CTTACTCCTTGGAGGCCATG diff --git a/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt b/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt index 74938a52b7d505b1185b1962ffe7234ddb304a52..02b9e3f0cbf01c6ce54fa715df93d7cfc6ba4bab 100644 --- a/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt +++ b/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt @@ -1,29 +1,13 @@ -==== - Biopet is built on top of GATK Queue for building bioinformatic - pipelines. It is mainly intended to support LUMC SHARK cluster which is running - SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - should also be able to execute Biopet tools and pipelines. - - Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - - Contact us at: sasc@lumc.nl - - A dual licensing mode is applied. The source code within this project that are - not part of GATK Queue is freely available for non-commercial use under an AGPL - license; For commercial users or users who do not want to follow the AGPL - license, please contact us to obtain a separate license. -==== - -##FastQC 0.10.1 +##FastQC 0.11.2 >>Basic Statistics pass -#Measure Value -Filename ct_r1.fq -File type Conventional base calls -Encoding Sanger / Illumina 1.9 -Total Sequences 1000 -Filtered Sequences 0 -Sequence length 100 -%GC 53 +#Measure Value +Filename ct_r1.fq.gz +File type Conventional base calls +Encoding Sanger / Illumina 1.9 +Total Sequences 1000 +Sequences flagged as poor quality 0 +Sequence length 100 +%GC 53 >>END_MODULE >>Per base sequence quality fail #Base Mean Median Lower Quartile Upper Quartile 10th Percentile 90th Percentile @@ -36,26 +20,111 @@ Sequence length 100 7 35.783 37.0 35.0 37.0 35.0 37.0 8 36.008 37.0 35.0 37.0 35.0 37.0 9 37.706 39.0 37.0 39.0 35.0 39.0 -10-14 37.857600000000005 39.2 37.2 39.4 34.8 39.4 -15-19 38.9788 40.2 38.0 41.0 35.0 41.0 -20-24 38.8246 40.0 38.0 41.0 34.8 41.0 -25-29 38.589600000000004 40.0 38.0 41.0 34.4 41.0 -30-34 38.3568 40.0 38.0 41.0 33.8 41.0 -35-39 38.1592 40.0 37.4 41.0 33.6 41.0 -40-44 37.4808 39.8 36.0 41.0 32.6 41.0 -45-49 36.9478 39.0 35.0 40.8 31.2 41.0 -50-54 35.845600000000005 37.8 34.6 40.0 29.4 41.0 -55-59 34.739 36.6 33.6 40.0 27.4 41.0 -60-64 34.1336 35.4 33.4 38.6 27.2 40.2 -65-69 32.7464 35.0 32.6 37.2 24.6 39.6 -70-74 29.3478 34.0 29.6 35.6 2.0 38.6 -75-79 27.4908 33.2 26.4 35.0 2.0 36.6 -80-84 25.893000000000008 33.0 21.8 35.0 2.0 35.4 -85-89 25.031799999999997 32.4 16.2 34.6 2.0 35.0 -90-94 23.9446 31.4 6.4 34.0 2.0 35.0 -95-99 22.9358 30.4 2.0 34.0 2.0 35.0 +10-11 37.709 39.0 37.0 39.0 35.0 39.0 +12-13 37.6135 39.0 37.0 39.0 35.0 39.0 +14-15 38.793 40.0 38.0 41.0 34.5 41.0 +16-17 39.033500000000004 40.5 38.0 41.0 35.0 41.0 +18-19 38.942 40.0 38.0 41.0 35.0 41.0 +20-21 38.888 40.0 38.0 41.0 35.0 41.0 +22-23 38.807 40.0 38.0 41.0 35.0 41.0 +24-25 38.702 40.0 38.0 41.0 34.0 41.0 +26-27 38.65 40.0 38.0 41.0 34.5 41.0 +28-29 38.4885 40.0 38.0 41.0 34.5 41.0 +30-31 38.307 40.0 38.0 41.0 34.0 41.0 +32-33 38.433499999999995 40.0 38.0 41.0 34.0 41.0 +34-35 38.3425 40.0 38.0 41.0 33.5 41.0 +36-37 38.1185 40.0 37.5 41.0 33.5 41.0 +38-39 38.088499999999996 40.0 37.0 41.0 33.5 41.0 +40-41 37.555 40.0 36.0 41.0 32.5 41.0 +42-43 37.504999999999995 40.0 36.0 41.0 33.0 41.0 +44-45 37.167 39.0 35.5 41.0 32.0 41.0 +46-47 36.980999999999995 39.0 35.0 41.0 31.0 41.0 +48-49 36.8635 39.0 35.0 40.5 31.0 41.0 +50-51 36.4125 38.5 35.0 40.0 30.5 41.0 +52-53 35.528000000000006 37.5 34.5 40.0 28.5 41.0 +54-55 34.925 37.0 33.5 40.0 27.5 41.0 +56-57 34.8735 37.0 34.0 40.0 27.5 41.0 +58-59 34.7225 36.0 33.5 40.0 28.0 41.0 +60-61 34.67400000000001 36.0 34.0 39.0 28.5 40.5 +62-63 33.841499999999996 35.0 33.0 38.5 26.5 40.0 +64-65 33.549 35.0 33.0 38.0 26.0 40.0 +66-67 32.971999999999994 35.0 33.0 37.0 26.0 40.0 +68-69 32.1635 35.0 32.0 37.0 22.5 39.0 +70-71 30.002000000000002 34.0 30.5 36.0 2.0 39.0 +72-73 29.0695 34.0 29.0 35.5 2.0 38.5 +74-75 28.641 34.0 29.0 35.0 2.0 38.0 +76-77 27.8495 33.0 27.5 35.0 2.0 36.0 +78-79 26.5345 33.0 24.0 35.0 2.0 36.5 +80-81 26.140500000000003 33.0 23.0 35.0 2.0 36.0 +82-83 25.784 33.0 21.5 35.0 2.0 35.0 +84-85 25.6115 33.0 20.0 35.0 2.0 35.0 +86-87 25.1755 33.0 17.0 35.0 2.0 35.0 +88-89 24.600499999999997 31.5 13.5 34.0 2.0 35.0 +90-91 24.088 31.5 6.5 34.0 2.0 35.0 +92-93 24.16 32.0 8.5 34.0 2.0 35.0 +94-95 23.02 30.0 2.0 34.0 2.0 35.0 +96-97 23.183 30.5 2.0 34.0 2.0 35.0 +98-99 22.75 30.5 2.0 34.0 2.0 35.0 100 21.984 30.0 2.0 34.0 2.0 35.0 >>END_MODULE +>>Per tile sequence quality pass +#Tile Base Mean +1101 1 0.0 +1101 2 0.0 +1101 3 0.0 +1101 4 0.0 +1101 5 0.0 +1101 6 0.0 +1101 7 0.0 +1101 8 0.0 +1101 9 0.0 +1101 10-11 0.0 +1101 12-13 0.0 +1101 14-15 0.0 +1101 16-17 0.0 +1101 18-19 0.0 +1101 20-21 0.0 +1101 22-23 0.0 +1101 24-25 0.0 +1101 26-27 0.0 +1101 28-29 0.0 +1101 30-31 0.0 +1101 32-33 0.0 +1101 34-35 0.0 +1101 36-37 0.0 +1101 38-39 0.0 +1101 40-41 0.0 +1101 42-43 0.0 +1101 44-45 0.0 +1101 46-47 0.0 +1101 48-49 0.0 +1101 50-51 0.0 +1101 52-53 0.0 +1101 54-55 0.0 +1101 56-57 0.0 +1101 58-59 0.0 +1101 60-61 0.0 +1101 62-63 0.0 +1101 64-65 0.0 +1101 66-67 0.0 +1101 68-69 0.0 +1101 70-71 0.0 +1101 72-73 0.0 +1101 74-75 0.0 +1101 76-77 0.0 +1101 78-79 0.0 +1101 80-81 0.0 +1101 82-83 0.0 +1101 84-85 0.0 +1101 86-87 0.0 +1101 88-89 0.0 +1101 90-91 0.0 +1101 92-93 0.0 +1101 94-95 0.0 +1101 96-97 0.0 +1101 98-99 0.0 +1101 100 0.0 +>>END_MODULE >>Per sequence quality scores pass #Quality Count 11 1.0 @@ -99,57 +168,53 @@ Sequence length 100 7 20.9 24.7 32.6 21.8 8 20.0 27.200000000000003 30.0 22.8 9 24.5 21.5 27.800000000000004 26.200000000000003 -10-14 25.22 23.28 26.26 25.240000000000002 -15-19 26.44 21.34 26.1 26.119999999999997 -20-24 25.240000000000002 22.1 24.6 28.060000000000002 -25-29 24.62 22.06 25.119999999999997 28.199999999999996 -30-34 26.240000000000002 21.44 24.279999999999998 28.04 -35-39 24.8 22.439999999999998 24.34 28.42 -40-44 25.8 22.84 23.9 27.46 -45-49 26.26 22.64 23.66 27.439999999999998 -50-54 26.72 22.58 23.18 27.52 -55-59 25.019999999999996 22.58 24.38 28.02 -60-64 26.251501802162597 22.00640768922707 23.28794553464157 28.454144973968766 -65-69 25.683829444891394 23.873692679002414 23.049074818986323 27.39340305711987 -70-74 25.554134697357206 25.44757033248082 21.717817561807333 27.28047740835465 -75-79 25.818501428257523 23.643155350472423 23.071852340145025 27.466490881125026 -80-84 26.973532796317606 23.95857307249712 21.74913693901036 27.318757192174914 -85-89 25.452016689847014 24.849327770050998 22.624014835419565 27.07464070468243 -90-94 24.547101449275363 22.35054347826087 24.139492753623188 28.962862318840582 -95-99 25.318837549655026 24.231653773782146 23.186284758519758 27.263223918043067 +10-11 25.15 24.0 27.55 23.3 +12-13 26.200000000000003 22.3 24.65 26.85 +14-15 24.75 21.95 26.3 27.0 +16-17 25.4 21.7 26.55 26.35 +18-19 27.650000000000002 21.6 25.85 24.9 +20-21 24.8 21.8 24.3 29.099999999999998 +22-23 25.900000000000002 23.05 24.15 26.900000000000002 +24-25 24.85 21.4 25.900000000000002 27.85 +26-27 24.7 20.849999999999998 25.0 29.45 +28-29 24.4 23.3 24.95 27.35 +30-31 27.35 20.95 25.15 26.55 +32-33 24.9 22.05 23.400000000000002 29.65 +34-35 25.6 22.15 25.900000000000002 26.35 +36-37 24.95 21.2 23.400000000000002 30.45 +38-39 24.8 23.35 23.7 28.15 +40-41 27.0 23.35 23.599999999999998 26.05 +42-43 25.15 22.35 23.799999999999997 28.7 +44-45 26.200000000000003 20.7 24.3 28.799999999999997 +46-47 26.3 24.0 23.150000000000002 26.55 +48-49 25.5 23.3 24.05 27.150000000000002 +50-51 27.55 22.75 23.7 26.0 +52-53 24.45 23.400000000000002 23.1 29.049999999999997 +54-55 27.450000000000003 21.85 23.0 27.700000000000003 +56-57 25.85 22.15 23.5 28.499999999999996 +58-59 24.05 22.75 25.6 27.6 +60-61 25.25 20.95 23.45 30.349999999999998 +62-63 27.3 21.9 23.7 27.1 +64-65 26.178535606820464 24.57372116349047 22.617853560682047 26.629889669007024 +66-67 25.7 23.75 22.05 28.499999999999996 +68-69 25.405679513184587 23.52941176470588 24.036511156186613 27.028397565922923 +70-71 25.159574468085104 23.085106382978722 23.138297872340424 28.617021276595743 +72-73 26.031065881092662 26.513122656668454 20.51419389394751 26.941617568291377 +74-75 25.197680548234054 26.56826568265683 21.929362150764366 26.304691618344755 +76-77 25.911812738160044 23.51660315732172 24.550898203592812 26.02068590092542 +78-79 26.16345062429058 22.985244040862657 21.793416572077184 29.05788876276958 +80-81 26.98324022346369 25.474860335195533 21.005586592178773 26.536312849162012 +82-83 26.46370023419204 24.355971896955502 22.131147540983605 27.049180327868854 +84-85 26.124567474048444 23.18339100346021 22.145328719723185 28.546712802768166 +86-87 25.976331360946748 25.443786982248522 22.36686390532544 26.21301775147929 +88-89 25.503742084052966 23.54634427173287 23.316062176165804 27.63385146804836 +90-91 23.832052040212893 21.525724423418097 25.901833234772326 28.74039030159669 +92-93 24.525139664804467 22.849162011173185 23.743016759776538 28.88268156424581 +94-95 25.161987041036717 24.028077753779698 22.4622030237581 28.347732181425485 +96-97 25.37393162393162 24.412393162393162 23.664529914529915 26.549145299145298 +98-99 25.67703109327984 23.620862587763288 22.71815446339017 27.9839518555667 100 24.0 26.0 21.9 28.1 >>END_MODULE ->>Per base GC content fail -#Base %GC -1 71.01303911735206 -2 64.1 -3 73.3 -4 65.3 -5 55.800000000000004 -6 87.3 -7 42.699999999999996 -8 42.8 -9 50.7 -10-14 50.46000000000001 -15-19 52.559999999999995 -20-24 53.300000000000004 -25-29 52.82 -30-34 54.279999999999994 -35-39 53.22 -40-44 53.26 -45-49 53.7 -50-54 54.24 -55-59 53.04 -60-64 54.70564677613135 -65-69 53.07723250201126 -70-74 52.834612105711855 -75-79 53.28499230938255 -80-84 54.29228998849251 -85-89 52.526657394529444 -90-94 53.509963768115945 -95-99 52.5820614676981 -100 52.1 ->>END_MODULE >>Per sequence GC content fail #GC Content Count 0 0.0 @@ -265,24 +330,51 @@ Sequence length 100 7 0.0 8 0.0 9 0.0 -10-14 0.0 -15-19 0.0 -20-24 0.0 -25-29 0.0 -30-34 0.0 -35-39 0.0 -40-44 0.0 -45-49 0.0 -50-54 0.0 -55-59 0.0 -60-64 0.12 -65-69 0.5599999999999999 -70-74 6.16 -75-79 8.98 -80-84 13.100000000000001 -85-89 13.719999999999999 -90-94 11.68 -95-99 4.34 +10-11 0.0 +12-13 0.0 +14-15 0.0 +16-17 0.0 +18-19 0.0 +20-21 0.0 +22-23 0.0 +24-25 0.0 +26-27 0.0 +28-29 0.0 +30-31 0.0 +32-33 0.0 +34-35 0.0 +36-37 0.0 +38-39 0.0 +40-41 0.0 +42-43 0.0 +44-45 0.0 +46-47 0.0 +48-49 0.0 +50-51 0.0 +52-53 0.0 +54-55 0.0 +56-57 0.0 +58-59 0.0 +60-61 0.0 +62-63 0.0 +64-65 0.3 +66-67 0.0 +68-69 1.4000000000000001 +70-71 6.0 +72-73 6.65 +74-75 5.1499999999999995 +76-77 8.15 +78-79 11.899999999999999 +80-81 10.5 +82-83 14.6 +84-85 13.3 +86-87 15.5 +88-89 13.15 +90-91 15.45 +92-93 10.5 +94-95 7.3999999999999995 +96-97 6.4 +98-99 0.3 100 0.0 >>END_MODULE >>Sequence Length Distribution pass @@ -290,565 +382,85 @@ Sequence length 100 100 1000.0 >>END_MODULE >>Sequence Duplication Levels pass -#Total Duplicate Percentage 3.4 -#Duplication Level Relative count -1 100.0 -2 0.4140786749482402 -3 0.0 -4 0.0 -5 0.0 -6 0.0 -7 0.0 -8 0.0 -9 0.0 -10++ 0.2070393374741201 +#Total Deduplicated Percentage 97.2 +#Duplication Level Percentage of deduplicated Percentage of total +1 99.38271604938271 96.6 +2 0.411522633744856 0.8 +3 0.0 0.0 +4 0.0 0.0 +5 0.0 0.0 +6 0.0 0.0 +7 0.0 0.0 +8 0.0 0.0 +9 0.0 0.0 +>10 0.205761316872428 2.6 +>50 0.0 0.0 +>100 0.0 0.0 +>500 0.0 0.0 +>1k 0.0 0.0 +>5k 0.0 0.0 +>10k+ 0.0 0.0 >>END_MODULE >>Overrepresented sequences fail #Sequence Count Percentage Possible Source -AGATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTAT 14 1.4000000000000001 TruSeq Adapter, Index 1 (97% over 36bp) -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTATG 12 1.2 TruSeq Adapter, Index 1 (97% over 36bp) +AGATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTAT 14 1.4000000000000001 TruSeq Adapter, Index 18 (97% over 37bp) +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTATG 12 1.2 TruSeq Adapter, Index 18 (97% over 37bp) AGGGGGAATGATGGTTGTCTTTGGATATACTACAGCGATGGCTATTGAGG 2 0.2 No Hit GGCTTGTTTTATTTTAATGGCTGATCTATGTAATCACAGAGGCCAGTATG 2 0.2 No Hit GTGGGGTGGTGTTTGTGGGGGACTTCATCATCTCAGGCTTCCCAGGGTCC 2 0.2 No Hit -CGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTATGCCG 2 0.2 TruSeq Adapter, Index 1 (96% over 33bp) +CGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTATGCCG 2 0.2 TruSeq Adapter, Index 18 (97% over 34bp) +>>END_MODULE +>>Adapter Content fail +#Position Illumina Universal Adapter Illumina Small RNA Adapter Nextera Transposase Sequence +1 1.4 0.0 0.0 +2 1.4 0.0 0.0 +3 1.4 0.0 0.0 +4 1.4 0.0 0.0 +5 1.4 0.0 0.0 +6 1.4 0.0 0.0 +7 1.4 0.0 0.0 +8 1.4 0.0 0.0 +9 1.4 0.0 0.0 +10-11 1.4 0.0 0.0 +12-13 1.4 0.0 0.0 +14-15 1.4 0.0 0.0 +16-17 1.4 0.0 0.0 +18-19 1.4 0.0 0.0 +20-21 1.5 0.0 0.0 +22-23 1.5 0.0 0.0 +24-25 1.5 0.0 0.0 +26-27 1.5 0.0 0.0 +28-29 1.6 0.0 0.0 +30-31 1.7 0.0 0.0 +32-33 1.9 0.0 0.0 +34-35 2.4 0.0 0.0 +36-37 2.45 0.0 0.0 +38-39 2.95 0.0 0.0 +40-41 3.25 0.0 0.0 +42-43 3.75 0.0 0.0 +44-45 4.2 0.0 0.0 +46-47 4.9 0.0 0.0 +48-49 5.699999999999999 0.0 0.0 +50-51 6.300000000000001 0.0 0.0 +52-53 6.949999999999999 0.0 0.0 +54-55 7.65 0.0 0.0 +56-57 8.399999999999999 0.0 0.0 +58-59 9.350000000000001 0.0 0.0 +60-61 9.899999999999999 0.0 0.0 +62-63 10.600000000000001 0.0 0.0 +64-65 11.3 0.0 0.0 +66-67 12.0 0.0 0.0 +68-69 13.05 0.0 0.0 +70-71 13.6 0.0 0.0 +72-73 14.5 0.0 0.0 +74-75 15.55 0.0 0.0 +76-77 16.15 0.0 0.0 +78-79 17.2 0.0 0.0 +80-81 17.700000000000003 0.0 0.0 +82-83 18.15 0.0 0.0 +84-85 18.75 0.0 0.0 +86-87 19.799999999999997 0.0 0.0 +88 20.6 0.0 0.0 >>END_MODULE ->>Kmer Content fail -#Sequence Count Obs/Exp Overall Obs/Exp Max Max Obs/Exp Position -AAAAA 385 7.3597403 68.038994 65-69 -AGATC 435 5.4375157 23.135067 1 -GAAGA 375 5.258809 32.443344 6 -GGAAG 420 5.044668 33.345257 5 -TCCAG 475 4.8355613 14.131038 2 -AAGAG 320 4.487517 25.954676 7 -CCAGG 475 4.4180827 17.21471 3 -GAGCA 380 4.3399205 21.1377 9 -AGCAC 395 4.2895336 15.0741825 7 -CTCCA 415 4.0171337 12.105032 95-96 -AGAGC 340 3.883087 21.137697 8 -TTTTT 280 3.8749053 8.964593 10-14 -CTTCT 370 3.8646336 11.598914 55-59 -CTGAA 305 3.812511 13.130004 90-94 -CGGAA 320 3.65467 26.422123 5 -ACCAG 335 3.6379597 10.049457 7 -TCTGA 310 3.6325634 12.308498 90-94 -CACAC 340 3.5108058 14.806036 85-89 -ATCGG 325 3.4795394 24.768969 3 -TCGGA 320 3.426008 19.815174 3 -GATCG 320 3.426008 19.815174 1 -CGTCT 355 3.387832 11.578538 85-89 -CTGCT 355 3.387832 17.662533 3 -GCACA 310 3.3664696 15.0741825 8 -TCTTC 320 3.3423858 7.7326093 50-54 -CAGCA 305 3.3121717 10.049455 6 -GAACT 260 3.2500093 13.130004 90-94 -GTCTG 320 3.2116532 12.65067 90-94 -CAGGA 280 3.197836 15.8532715 3 -AACTC 265 3.1497202 23.781752 95-96 -TGAAC 250 3.125009 13.130004 90-94 -CCAGC 350 3.0954454 6.6359653 95-96 -AGTCA 240 3.0000086 10.41078 25-29 -CACCA 290 2.9945107 6.079907 70-74 -TGCTG 295 2.960743 9.2877 2 -CAGAT 230 2.875008 11.040063 70-74 -CTTCC 315 2.8583732 10.916445 30-34 -CACGT 280 2.8504362 12.351324 85-89 -CAGGG 290 2.8367646 22.630535 9 -ACACG 260 2.8234906 13.175687 85-89 -TTCCA 250 2.7855206 9.279795 30-34 -TTCTT 230 2.765239 6.6755276 50-54 -AGCAG 240 2.7410026 15.853272 2 -TTCTG 240 2.6363494 10.165324 55-59 -ACTCC 270 2.6135564 14.526036 95-96 -GCCAG 280 2.6043434 8.607355 1 -ACGTC 255 2.595933 10.105629 85-89 -GATCT 220 2.5779483 8.675031 40-44 -TCTGC 265 2.5289452 13.2469015 2 -AAGAT 160 2.4557784 12.783248 35-39 -ATCTC 220 2.4512577 9.279794 40-44 -CAGTC 240 2.4432309 8.554544 90-94 -TCCAA 205 2.4365761 10.999062 7 -CTTTT 200 2.4045558 16.688818 6 -TTCCT 230 2.40234 9.665762 7 -CCAGT 235 2.3923304 9.4206915 25-29 -TTTCT 195 2.3444414 16.688818 8 -CTGGG 255 2.3383298 6.004135 80-84 -TGCTT 210 2.3068056 10.165323 4 -TCTTT 190 2.284328 5.5629396 15-19 -TTTTC 190 2.2843277 11.125878 7 -GGGGG 255 2.2468696 16.307867 2 -AGGAA 160 2.2437584 19.466007 5 -GTCAC 220 2.2396283 10.184532 95-96 -TCACT 200 2.2284167 8.360176 95-96 -CACTT 200 2.2284167 10.3108835 30-34 -GAAAA 135 2.2103586 10.606119 60-64 -ACTTC 195 2.172706 9.279794 30-34 -TTGAA 150 2.1582448 11.9834385 60-64 -CTCCT 235 2.1324375 16.794533 4 -TCCTC 235 2.1324372 8.397265 5 -ATCTT 165 2.11616 7.1210704 10-14 -GGGGA 205 2.1089406 14.2801 3 -ACACA 165 2.092039 11.7331705 8 -TGCAG 195 2.0877237 9.907587 5 -GACCA 190 2.0633202 10.049455 6 -AGGGG 200 2.057503 9.520067 1 -CCTCC 260 2.049668 14.590484 5 -AGGAG 170 2.0418897 5.557543 2 -TCCTT 195 2.0367663 14.498643 4 -GTCTT 185 2.032186 15.247986 7 -GCTGG 220 2.0173824 8.485845 1 -CCAGA 185 2.0090222 5.3284492 70-74 -CCTGG 230 2.0054333 8.068818 3 -GCAGG 205 2.005299 9.052214 3 -GGACC 215 1.9997637 8.607355 5 -TTCAT 155 1.987908 5.934226 2 -CCTTT 190 1.9845415 14.498643 5 -TTTCC 190 1.9845415 5.799457 15-19 -TGGCA 185 1.980661 14.861383 2 -TCTTG 180 1.977262 10.165323 5 -CCAAG 180 1.9547247 9.044511 35-39 -CTTCA 175 1.9498644 10.310883 6 -CAAGA 145 1.933477 12.339583 35-39 -CTGGA 180 1.9271295 9.907587 6 -GGCTG 210 1.9256833 16.97169 2 -AATGA 125 1.918577 7.677627 95-96 -TGAAA 125 1.918577 15.623971 60-64 -GCTTC 200 1.9086379 13.2469015 2 -GTCCA 185 1.8833237 14.131036 1 -AGAAA 115 1.882898 7.5757995 7 -TGGGG 195 1.8805519 13.386638 1 -TTCTC 180 1.880092 5.799457 25-29 -CTTGA 160 1.8748715 8.675031 60-64 -ACAAA 120 1.8682072 5.762797 40-44 -TCTCG 195 1.8609219 8.831266 5 -GGGAC 190 1.8585701 9.052216 5 -TGAGG 165 1.8578365 5.209824 2 -TGAAG 140 1.8404517 6.082693 2 -CATCT 165 1.8384434 5.155441 4 -CACTG 180 1.8324232 9.4206915 6 -CTGCA 180 1.8324231 5.3465896 90-94 -GCTGC 210 1.8310483 8.068819 1 -GCAGA 160 1.8273348 10.568848 3 -CCTTC 200 1.8148402 8.397265 9 -AGGGA 150 1.8016673 6.0081544 95-96 -TTTCA 140 1.7955297 7.1210704 15-19 -CACAG 165 1.7918309 5.432139 95-96 -AAACA 115 1.7903653 7.6389136 70-74 -ATTTT 120 1.7715117 13.661307 6 -TTTTG 140 1.7701824 17.551357 7 -GGGGC 210 1.7594293 11.629828 3 -GATTT 130 1.7534488 12.481857 6 -CAAAT 120 1.7513192 6.7527947 50-54 -GAGGG 170 1.7488776 9.520067 1 -GAAGG 145 1.7416117 6.0081544 95-96 -CATTT 135 1.7314036 5.9342256 5 -ATTTC 135 1.7314036 5.9342256 7 -CCTCT 190 1.7240983 8.397266 1 -ATCCA 145 1.7234317 5.49953 4 -GCAGC 185 1.7207267 6.9789357 95-96 -TCCTG 180 1.717774 13.2469 2 -CTCTG 180 1.717774 13.2469 2 -AAAAC 110 1.7125233 7.6389136 70-74 -CTTGG 170 1.7061908 9.2877 2 -AAAAT 95 1.7024158 8.291661 9 -TCACC 175 1.693972 8.957724 8 -TCCAC 175 1.693972 8.957724 5 -GAGAA 120 1.6828189 6.488669 6 -TCTCC 185 1.6787271 5.038359 55-59 -GAGCC 180 1.6742208 8.607355 9 -TCATC 150 1.6713123 5.1554413 2 -AGACA 125 1.6667906 6.169792 2 -TGATG 135 1.6636823 11.404236 9 -GGGAG 160 1.6460025 9.520067 1 -AGCCA 150 1.6289369 6.029673 10-14 -ATGCC 160 1.6288207 8.478622 45-49 -CTCGT 170 1.6223421 8.831266 3 -GAGGA 135 1.6215005 11.115086 3 -TGTTG 140 1.6173534 10.690706 2 -CTCAT 145 1.6156021 5.1554418 2 -CAGGT 150 1.6059413 9.907587 4 -GCTTG 160 1.6058266 9.2877 60-64 -GGGTC 175 1.6047363 12.728768 2 -TCATT 125 1.6031516 5.934226 9 -GTTGA 130 1.6020645 5.702118 1 -ACAGA 120 1.6001189 10.005068 95-96 -GGAGG 155 1.5945649 9.520067 2 -GGGGT 165 1.5912362 13.386638 1 -TGGGA 140 1.5763463 10.419649 2 -GGATG 140 1.5763462 15.629472 6 -GCCTC 190 1.575248 7.672287 2 -CCTGC 190 1.5752479 11.508429 2 -GCTCC 190 1.5752479 11.508429 6 -TCTCT 150 1.5667434 5.224736 95-96 -GGGAA 130 1.561445 11.115086 4 -TCCAT 140 1.5598917 10.3108835 8 -GGCTT 155 1.5556445 13.93155 1 -TTGAT 115 1.5511277 6.240928 4 -CATCA 130 1.5451456 5.49953 2 -AGAGA 110 1.542584 6.488669 9 -AGGAC 135 1.541814 6.341309 55-59 -GTATG 125 1.5404466 9.123388 45-49 -AACAT 105 1.5324043 13.5055895 9 -AGCTC 150 1.5270194 9.4206915 5 -TTTGT 120 1.5172992 17.551357 8 -GATGA 115 1.5117996 6.082693 5 -GAGAT 115 1.5117996 6.082693 4 -AGGAT 115 1.5117996 12.165386 4 -TGAGA 115 1.5117996 6.082693 5 -CTGGT 150 1.5054625 9.2877 4 -GCTGT 150 1.5054625 18.5754 3 -TTCAC 135 1.504181 10.310883 7 -CCCAG 170 1.5035021 12.276537 2 -CAGTG 140 1.4988785 9.907587 5 -CTCCC 190 1.4978343 7.295242 1 -CCCTG 180 1.4923402 11.5084305 2 -CAGAG 130 1.4847097 7.398194 20-24 -CTTTG 135 1.4829465 10.165323 2 -CAAAA 95 1.4789973 7.203496 9 -TCTCA 130 1.4484707 5.1554413 8 -GAATG 110 1.4460692 12.165386 7 -GGAAT 110 1.4460692 12.165386 5 -TTTGG 125 1.4440656 5.345353 7 -GGCCT 165 1.4386805 12.103227 1 -GCTCT 150 1.4314783 6.1818867 20-24 -TCTGT 130 1.4280226 15.247986 3 -CTGTT 130 1.4280226 15.247986 4 -AGGTT 115 1.4172109 11.404235 8 -TTGAG 115 1.4172107 5.702117 4 -TTTGA 105 1.416247 7.4891143 10-14 -ATCTG 120 1.4061534 5.4218936 2 -GGTCT 140 1.4050984 9.287701 6 -TTTTA 95 1.4024467 7.384491 95-96 -GGGTG 145 1.3983592 13.386638 2 -GGCAC 150 1.3951839 8.607355 4 -AAAGA 85 1.3917071 7.5757985 8 -AAGAA 85 1.3917071 5.254889 75-79 -TTGTT 110 1.3908576 5.850453 4 -GGAGA 115 1.3812783 5.557543 3 -ATGAC 110 1.3750039 6.252721 95-96 -TGTTC 125 1.3730987 10.165325 5 -GGGCA 140 1.3694727 9.052216 4 -ATGAT 95 1.3668885 6.6574664 6 -CCACT 140 1.3551775 5.3746343 30-34 -TGGCT 135 1.3549163 13.931552 3 -GATGG 120 1.3511539 10.419648 9 -TCGTA 115 1.3475639 5.421894 40-44 -TGTCA 115 1.3475639 5.421894 5 -GCTGA 125 1.3382844 9.907587 6 -CAGAA 100 1.3334324 5.6025352 90-94 -CCAAA 105 1.3312978 5.8665853 8 -GGGCT 145 1.3296387 12.728768 1 -TAGGA 100 1.3146083 12.165386 4 -GACAG 115 1.313397 5.2844243 1 -GGTCC 150 1.3078917 8.068819 6 -CCATC 135 1.3067783 8.957724 9 -AAATG 85 1.3046323 7.101804 6 -TTCAA 95 1.2997144 6.330293 9 -CGTAT 110 1.2889742 8.675031 45-49 -TGACT 110 1.2889742 5.421894 3 -TATGC 110 1.2889739 8.67503 45-49 -GCCCT 155 1.2850707 7.672287 3 -TGGGC 140 1.283789 8.485846 7 -ACTTT 100 1.2825212 5.9342256 1 -ATGTT 95 1.2813665 6.2409286 1 -ATTTG 95 1.2813663 12.481856 9 -TGGTT 110 1.2707777 5.345353 5 -TGGTG 120 1.2666163 9.767722 7 -GTTTT 100 1.2644161 5.8504534 6 -GCCTG 145 1.2642952 12.103229 1 -TTGCT 115 1.2632507 6.0991945 50-54 -CCACC 150 1.2614243 7.7821474 5 -GGACA 110 1.2562928 15.853274 6 -GAAGC 110 1.2562928 10.568849 9 -TGACA 100 1.2500036 5.7837667 9 -GACAT 100 1.2500035 11.567533 7 -TGGAA 95 1.248878 6.082693 5 -ACAGC 115 1.2488517 10.049455 5 -AATCC 105 1.2480024 5.499531 7 -TGCCT 130 1.2406145 8.831266 3 -AGGTG 110 1.2385577 5.209824 4 -GTGGC 135 1.2379395 12.728768 1 -CATGT 105 1.2303842 5.4218936 1 -TAGAT 85 1.2230055 6.0453725 90-94 -CCCTC 155 1.2219174 7.295242 4 -GCCGT 140 1.2206988 8.068819 3 -AGTTT 90 1.2139261 6.2409286 7 -TTTAG 90 1.213926 6.240928 8 -TTGGG 115 1.2138406 9.767722 2 -ACCTC 125 1.20998 8.957724 1 -AGCAA 90 1.2000892 6.169792 9 -CAAAG 90 1.2000891 6.169791 5 -AAAGC 90 1.2000891 6.169791 6 -ACAGG 105 1.1991886 10.568849 8 -AGGCA 105 1.1991886 5.712891 95-96 -ATCAG 95 1.1875033 5.7837663 6 -ATGAG 90 1.1831475 6.082693 25-29 -CAGTT 100 1.1717947 5.1698627 85-89 -ATGCT 100 1.1717947 5.421894 8 -TCAAT 85 1.1629024 6.3302937 10-14 -TGTGT 100 1.1552525 10.690706 3 -GCCCA 130 1.1497369 12.276536 1 -TGATT 85 1.1464858 12.481857 5 -TGCTC 120 1.1451827 8.831267 4 -TGTCC 120 1.1451827 13.2469015 2 -TCCCC 145 1.143084 7.295242 2 -AAGGC 100 1.1420842 5.493164 65-69 -CAACA 90 1.1411123 5.8665853 8 -CACAA 90 1.1411123 11.7331705 9 -ACATC 95 1.129145 5.4995303 8 -AAGCT 90 1.1250031 6.2527194 95-96 -GAAAG 80 1.1218792 12.977338 7 -AAGGA 80 1.1218792 6.488669 3 -GCACT 110 1.1198142 9.4206915 5 -CCTGA 110 1.119814 9.420691 9 -ACCTT 100 1.1142083 5.1554418 7 -GTCAT 95 1.113205 5.421894 1 -TGATC 95 1.113205 10.843788 5 -TCATG 95 1.113205 5.421894 3 -TGGAT 90 1.1091216 5.702118 9 -GTGGG 115 1.1090435 8.924425 1 -CTGTG 110 1.1040058 9.2877 4 -GCTTT 100 1.0984789 5.4947696 95-96 -TGTCT 100 1.0984789 10.165323 5 -TTGGT 95 1.0974898 5.345353 4 -CTGTC 115 1.0974668 17.662535 4 -CAGAC 100 1.0859579 5.0247273 5 -GGAAC 95 1.0849801 5.2844243 6 -CCTCG 130 1.0778012 7.672287 6 -GCGGC 135 1.075477 7.372196 1 -ATAAA 60 1.0752101 8.291662 7 -GGGAT 95 1.0696635 10.419649 3 -CATCC 110 1.0647823 8.957723 3 -ACAGT 85 1.062503 5.7837663 4 -ACTGA 85 1.062503 11.567533 7 -GTTGG 100 1.0555136 9.767722 1 -TGTGG 100 1.0555136 9.767722 5 -GGAAA 75 1.0517617 19.466007 6 -GTGAA 80 1.0516868 6.082693 1 -GAAGT 80 1.0516866 6.082693 5 -GTCTC 110 1.0497508 8.831267 1 -CGGCT 120 1.046313 8.068818 1 -TTTAT 70 1.0333818 5.4645233 10-14 -GACAC 95 1.0316601 10.049455 7 -GGCAA 90 1.0278759 10.56885 3 -TCATA 75 1.0260904 6.330293 5 -ATTCA 75 1.0260903 6.3302927 7 -TAACA 70 1.0216029 6.7527957 8 -GGTCA 95 1.0170963 9.907589 3 -ATGGC 95 1.0170962 9.907587 1 -TCAGG 95 1.0170962 9.907587 8 -GGTGA 90 1.0133655 15.629474 3 -TGTTT 80 1.0115329 5.8504534 5 -TGAAT 70 1.007181 6.6574664 5 -ATTGA 70 1.0071809 6.6574664 7 -AAGTT 70 1.0071809 6.6574664 6 -TTGCC 105 1.0020349 8.831267 2 -CTTGC 105 1.0020349 8.831267 6 -GCAAA 75 1.0000744 6.169792 4 -CATAG 80 1.0000029 6.2527204 95-96 -GACTT 85 0.99602544 5.421894 1 -CTGAT 85 0.99602544 5.421894 4 -CTTGT 90 0.988631 10.165323 3 -AATGG 75 0.98595625 6.082693 8 -AAGGT 75 0.9859562 6.0826926 4 -GATGT 80 0.98588586 5.7021174 7 -GGATT 80 0.98588586 11.404235 5 -GGCGG 115 0.96349704 7.753219 1 -AGAGG 80 0.9608892 5.557543 8 -GAGGT 85 0.95706743 5.2098246 3 -ATGGG 85 0.9570673 5.209824 1 -CCGTC 115 0.95343953 7.672287 4 -TAGCA 75 0.9375027 5.7837667 1 -ACATG 75 0.9375026 5.7837663 2 -TTGCA 80 0.93743575 5.421894 4 -GTTCA 80 0.93743575 5.421894 6 -ATGTC 80 0.93743575 5.421894 5 -TTCAG 80 0.93743575 5.421894 8 -TTGAC 80 0.9374356 5.4218936 2 -GTTCT 85 0.93370706 5.0826616 1 -TTGTC 85 0.93370706 5.0826616 9 -TTTGC 85 0.93370706 5.0826616 3 -ATGGT 75 0.924268 5.7021174 4 -ATGAA 60 0.920917 7.1018047 9 -AGATG 70 0.92022586 6.082693 5 -GCTCA 90 0.91621155 5.092265 95-96 -AGTGC 85 0.9100334 9.907587 2 -AGGGT 80 0.90076935 10.419649 1 -GTAGG 80 0.90076923 10.419648 6 -AGTGG 80 0.90076923 5.209824 2 -TAAAA 50 0.89600843 8.291662 8 -CACAT 75 0.89143026 5.499531 6 -CCATT 80 0.89136666 10.3108835 9 -ATACT 65 0.8892783 6.330293 9 -ACATT 65 0.88927823 6.3302927 7 -GCGGG 105 0.87971467 7.753219 2 -ACACC 85 0.8777014 9.555587 9 -CATAA 60 0.8756596 6.7527947 6 -ACCCT 90 0.8711856 13.436585 1 -GAACA 65 0.8667311 6.169792 7 -ACTGC 85 0.8653109 5.092265 95-96 -GGTAT 70 0.86265016 17.106354 6 -AGTTG 70 0.86265016 5.702118 7 -GAGAC 75 0.85656327 5.2844243 1 -GTGTC 85 0.8530954 13.93155 1 -GTTGC 85 0.8530954 9.2877 1 -ATAGA 55 0.84417385 7.1018047 8 -GAAAT 55 0.84417385 7.1018047 5 -CATTC 75 0.83565605 5.155441 6 -TCACA 70 0.83200157 5.499531 3 -TGCGG 90 0.8252928 8.485845 3 -GCATT 70 0.8202563 5.421894 4 -GAACC 75 0.8144686 5.0247283 6 -CTCGA 80 0.81441027 9.420691 6 -GAATC 65 0.8125023 5.7837667 6 -TACAG 65 0.81250226 11.567533 7 -TGGTA 65 0.80103225 11.404236 5 -AAGAC 60 0.80005944 6.169791 8 -CAAGG 70 0.7994591 5.2844243 2 -ATGTA 55 0.7913565 6.6574664 4 -AATGT 55 0.7913565 6.6574664 3 -CGGCA 85 0.7906042 8.607354 2 -GAGAG 65 0.7807225 5.557543 8 -ACCAT 65 0.7725729 5.499531 8 -TTCTA 60 0.7695128 5.934226 9 -TAGAA 50 0.7674308 7.1018047 9 -GCATC 75 0.7635097 9.4206915 1 -GTTCC 80 0.76345515 8.831267 6 -AGCTT 65 0.76166654 5.421894 1 -TTAGC 65 0.76166654 5.421894 9 -CTGTA 65 0.76166654 5.421894 2 -ACTTG 65 0.7616664 5.4218936 2 -GTGCT 75 0.7527313 9.287701 3 -ATCAT 55 0.7524662 6.3302927 3 -GTTTG 65 0.7509141 5.345353 9 -GTGTT 65 0.7509141 10.690706 1 -GTCAA 60 0.75000215 11.5675335 6 -AATGC 60 0.75000215 6.252721 95-96 -CAAGT 60 0.7500021 5.7837663 9 -GCAAT 60 0.7500021 5.7837663 4 -GCAAG 65 0.74235487 5.2844243 1 -AGTGT 60 0.7394144 5.7021174 1 -TTAGG 60 0.7394144 5.702118 7 -AGCGG 75 0.73364604 9.052214 1 -ATCCT 65 0.72423524 5.155441 4 -ACTCT 65 0.72423524 5.155441 9 -AGTGA 55 0.7230346 6.082693 6 -AATAA 40 0.71680677 8.291662 6 -AACCT 60 0.71314424 5.4995303 1 -ATTCT 55 0.70538664 5.9342256 7 -AGTCT 60 0.7030768 5.421894 3 -GTGCA 65 0.69590795 9.907589 6 -AAAGT 45 0.69068766 7.101804 8 -AACTG 55 0.6875019 5.7837663 1 -CGAAG 60 0.68525064 5.2844243 4 -GATTG 55 0.67779654 5.702118 6 -GTGAT 55 0.67779654 11.404236 4 -TGTTA 50 0.67440337 12.481857 5 -TTGTA 50 0.6744033 6.240928 9 -TATTG 50 0.6744033 6.240928 7 -CTCTA 60 0.6685249 5.1554413 7 -TACCT 60 0.66852486 10.310882 8 -ATGGA 50 0.65730417 6.082693 8 -ATACA 45 0.6567447 6.7527957 6 -ATCAA 45 0.65674466 6.7527947 9 -TGTAA 45 0.6474735 6.6574664 7 -GCGGT 70 0.6418945 8.485846 4 -GGCCG 80 0.63731974 7.372196 2 -GGTTT 55 0.63538885 10.690706 9 -TTGTG 55 0.63538885 5.345353 1 -TATAT 40 0.62991583 7.2865515 8 -CCTGT 65 0.62030727 8.831266 3 -GTGAG 55 0.6192789 5.2098246 1 -TAGGG 55 0.61927885 5.209824 8 -GAGTT 50 0.6161787 5.7021174 6 -ATGTG 50 0.6161787 5.702118 2 -GAATA 40 0.61394465 7.1018047 6 -CTGCG 70 0.6103493 8.068818 2 -CGGTG 65 0.59604484 8.485845 2 -TAAGG 45 0.5915738 6.082693 9 -AAGTG 45 0.5915737 6.0826926 1 -TATTT 40 0.5905039 6.8306537 8 -GGCAT 55 0.5888452 14.861383 3 -GTATC 50 0.5858973 5.421894 4 -ATAAC 40 0.5837731 13.505591 7 -TTACT 45 0.57713455 5.934226 9 -GTATA 40 0.575532 13.314933 7 -GAGTG 50 0.5629808 5.209824 1 -GTACA 45 0.5625016 5.7837667 6 -ATAGC 45 0.5625016 5.7837667 9 -TCTAC 50 0.5571041 5.1554413 8 -GCGAG 55 0.53800714 9.052216 1 -ACGGG 55 0.5380071 9.052214 1 -GATAA 35 0.5372016 7.1018047 6 -AATAG 35 0.5372016 7.101805 7 -CAACT 45 0.53485817 5.4995303 6 -CATAC 45 0.53485817 5.4995303 5 -GATTC 45 0.52730757 5.421894 6 -AGGTA 40 0.5258433 12.165386 5 -CGGTC 60 0.52315664 8.068819 5 -ACGAG 45 0.51393795 5.2844243 7 -TATTC 40 0.5130085 5.9342256 7 -CTAAA 35 0.51080143 6.7527957 9 -TACAA 35 0.51080143 5.402236 35-39 -CCTTA 45 0.5013937 5.1554413 6 -CAGTA 40 0.50000143 5.7837667 4 -GTGTA 40 0.49294293 5.702118 4 -TAACT 35 0.47884214 6.330293 8 -CTTAA 35 0.47884214 6.330293 7 -CTATA 35 0.47884214 6.330293 4 -TTAAC 35 0.47884214 6.330293 8 -TATCA 35 0.4788421 6.3302927 5 -TCAAC 40 0.47542948 5.499531 7 -ACTCA 40 0.47542942 5.49953 8 -TTAGT 35 0.47208238 10.120425 95-96 -TGTAT 35 0.47208238 6.2409286 3 -ATTGT 35 0.47208235 6.240928 8 -GTTAC 40 0.46871787 5.421894 6 -TGTAC 40 0.46871787 10.843788 7 -AGAGT 35 0.46011293 6.082693 5 -AGTAG 35 0.46011293 6.082693 5 -CTCCG 55 0.45599285 7.672287 6 -GGTAG 40 0.45038468 5.2098246 2 -TTTAC 35 0.44888243 5.9342256 8 -CTACT 40 0.44568333 5.1554418 4 -AACTA 30 0.4378298 6.7527947 9 -TATAG 30 0.43164897 6.6574664 5 -ATATA 25 0.4199739 7.7728767 9 -CTCAA 35 0.41600078 5.499531 9 -TATAC 30 0.4104361 6.3302927 5 -ACTAT 30 0.4104361 6.3302927 6 -TACTA 30 0.4104361 6.3302927 5 -TCGAT 35 0.41012815 10.843788 7 -ACGTT 35 0.41012815 5.421894 4 -CGAAA 30 0.40002972 6.169792 9 -GTAAG 30 0.3943825 6.082693 8 -ATAGG 30 0.3943825 6.082693 3 -TCCTA 35 0.38997287 5.1554413 5 -TTACC 35 0.38997287 5.1554413 7 -ACCGA 35 0.3800853 5.0247273 7 -GCATA 30 0.37500107 5.7837667 1 -TCGAA 30 0.37500107 5.7837667 4 -GCTAA 30 0.37500107 5.7837667 8 -TAGGT 30 0.3697072 5.7021174 7 -GTTAG 30 0.3697072 5.702118 6 -CAATA 25 0.36485815 6.7527947 5 -ATACC 30 0.35657212 5.499531 6 -GACGA 30 0.3426253 5.284424 6 -AAGCG 30 0.3426253 10.568848 7 -GTTTA 25 0.33720168 6.2409286 7 -GTATT 25 0.33720168 12.481857 6 -AGATA 20 0.30697232 7.1018047 5 -CGTCA 30 0.30540386 9.420691 5 -CCTAA 25 0.29714343 5.499531 7 -TACCA 25 0.2971434 5.49953 9 -TGCTA 25 0.29294866 5.421894 7 -TACGT 25 0.29294863 5.4218936 9 -AGACG 25 0.2855211 5.284425 9 -CCTAT 25 0.2785521 5.1554418 3 -TAAGC 20 0.25000072 5.7837667 9 -CTAAG 20 0.25000072 5.7837667 8 -CGATT 20 0.23435894 5.421894 9 -GGGTA 20 0.22519234 5.2098246 2 -ACGCA 20 0.21719159 5.0247273 5 -GCGAA 15 0.17131266 5.284425 3 -CGAAC 15 0.16289368 5.0247273 5 +>>Kmer Content pass >>END_MODULE diff --git a/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/CutadaptTest.scala b/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/CutadaptTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..2b537d9767cbc1ddbd9f2e528a1c122dfe973d7c --- /dev/null +++ b/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/CutadaptTest.scala @@ -0,0 +1,84 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.pipelines.flexiprep + +import java.io.File + +import org.testng.annotations.Test + +class CutadaptTest extends FastqcV0101Test { + /** Mock output file of a Cutadapt 1.9 run */ + private[flexiprep] val cutadaptOut: File = resourceFile("ct-test.R1.clip.stats") + + def testFastQCinstance: Fastqc = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + fqc.contaminants = Option(resourceFile("fqc_contaminants_v0112.txt")) + // fqc.beforeGraph() + fqc + } + + def testCutadaptInst: Cutadapt = { + val caExe = new Cutadapt(null, testFastQCinstance) + caExe.statsOutput = cutadaptOut + caExe + } + + @Test def testAdapterFound() = { + val cutadapt = testCutadaptInst + val adapters = cutadapt.extractClippedAdapters(cutadaptOut) + adapters.keys.size shouldBe 4 + + adapters.get("CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC") shouldBe Some( + Map( + "count" -> 94, + "histogram" -> Map( + "5p" -> Map(5 -> 2, 6 -> 4, 9 -> 1, 3 -> 8, 4 -> 3), + "3p" -> Map(5 -> 21, 6 -> 18, 9 -> 1, 12 -> 1, 7 -> 2, 3 -> 13, 11 -> 1, 4 -> 19) + ) + ) + ) + + adapters.get("CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC") shouldBe Some( + Map( + "count" -> 0, + "histogram" -> Map() + ) + ) + } + + @Test def testSummary() = { + val cutadapt = testCutadaptInst + val summary = cutadapt.summaryStats + + summary.keys shouldBe Set("num_bases_input", "num_reads_input", "num_reads_output", + "num_reads_with_adapters", "num_reads_affected", "num_reads_discarded_too_long", + "adapters", "num_reads_discarded_many_n", "num_reads_discarded_too_short", "num_bases_output") + + summary.keys.size shouldBe 10 + summary("adapters").asInstanceOf[Map[String, Map[String, Any]]].keys.size shouldBe 4 + + summary("num_bases_input") shouldBe 100000 + summary("num_reads_input") shouldBe 1000 + summary("num_reads_output") shouldBe 985 + summary("num_reads_with_adapters") shouldBe 440 + summary("num_reads_affected") shouldBe 425 + summary("num_reads_discarded_too_long") shouldBe 0 + summary("num_reads_discarded_many_n") shouldBe 0 + summary("num_reads_discarded_too_short") shouldBe 15 + summary("num_bases_output") shouldBe 89423 + } +} diff --git a/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala b/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala index 4cb68fdfc44d5a30c3ed76aabc9570d6f62529f3..3cf24e8c60a570e8e51fe528ece4f81d0b66a01a 100644 --- a/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala +++ b/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala @@ -25,14 +25,14 @@ import org.testng.annotations.Test class FastqcV0101Test extends TestNGSuite with Matchers { /** Returns the absolute path to test resource directory as a File object */ - private val resourceDir: File = new File(Paths.get(getClass.getResource("/").toURI).toString) + private[flexiprep] val resourceDir: File = new File(Paths.get(getClass.getResource("/").toURI).toString) /** Given a resource file name, returns the the absolute path to it as a File object */ - private def resourceFile(p: String): File = new File(resourceDir, p) + private[flexiprep] def resourceFile(p: String): File = new File(resourceDir, p) /** Mock output file of a FastQC v0.10.1 run */ // the file doesn't actually exist, we just need it so the outputDir value can be computed correctly - private val outputv0101: File = resourceFile("v0101.fq_fastqc.zip") + private[flexiprep] val outputv0101: File = resourceFile("v0101.fq_fastqc.zip") @Test def testOutputDir() = { val fqc = new Fastqc(null) @@ -44,7 +44,7 @@ class FastqcV0101Test extends TestNGSuite with Matchers { val fqc = new Fastqc(null) fqc.output = outputv0101 // 11 QC modules - fqc.qcModules.size shouldBe 11 + fqc.qcModules.size shouldBe 12 // first module fqc.qcModules.keySet should contain("Basic Statistics") // mid (6th module) @@ -83,4 +83,23 @@ class FastqcV0101Test extends TestNGSuite with Matchers { adapters.last.seq shouldEqual "GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG" } + + @Test def testPerBaseSequenceQuality() = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + + val perBaseSequenceQuality = fqc.perBaseSequenceQuality + perBaseSequenceQuality.size shouldBe 55 + perBaseSequenceQuality.keys should contain("54-55") + } + + @Test def testPerBaseSequenceContent() = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + + val perBaseSequenceContent: Map[String, Map[String, Double]] = fqc.perBaseSequenceContent + perBaseSequenceContent.size shouldBe 55 + perBaseSequenceContent.keys should contain("1") + } + } \ No newline at end of file diff --git a/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/GenerateIndexes.scala b/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/GenerateIndexes.scala index 6e2aa683f8e6e2abe31e2e8307d71db8c41c5258..d7c40fb76197f77ddb944803c113b65cf124a0bf 100644 --- a/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/GenerateIndexes.scala +++ b/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/GenerateIndexes.scala @@ -30,8 +30,8 @@ import nl.lumc.sasc.biopet.extensions.picard.CreateSequenceDictionary import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFaidx import nl.lumc.sasc.biopet.utils.ConfigUtils import org.broadinstitute.gatk.queue.QScript -import scala.language.reflectiveCalls +import scala.language.reflectiveCalls import scala.collection.JavaConversions._ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript { @@ -173,7 +173,7 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript genomeConfig.get("dbsnp_vcf_uri").foreach { dbsnpUri => val cv = new CombineVariants(this) - cv.reference = fastaFile + cv.reference_sequence = fastaFile cv.deps ::= createDict.output def addDownload(uri: String): Unit = { val curl = new Curl(this) @@ -181,7 +181,7 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript curl.output = new File(annotationDir, new File(curl.url).getName) curl.isIntermediate = true add(curl) - cv.inputFiles ::= curl.output + cv.variant :+= curl.output val tabix = new Tabix(this) tabix.input = curl.output @@ -198,7 +198,7 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript case _ => addDownload(dbsnpUri.toString) } - cv.outputFile = new File(annotationDir, "dbsnp.vcf.gz") + cv.out = new File(annotationDir, "dbsnp.vcf.gz") add(cv) } diff --git a/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/measures/Measurement.scala b/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/measures/Measurement.scala index b07d295a88d9044f351f738320c23692f53711bf..8f470ee063d85c5c4b14e6e261c4cc1cb323ff2d 100644 --- a/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/measures/Measurement.scala +++ b/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/measures/Measurement.scala @@ -29,6 +29,7 @@ trait Measurement extends SummaryQScript with Reference { qscript: QScript => /** * Method to add a bamFile to the pipeline + * * @param id Unique id used for this bam file, most likely to be a sampleName * @param file Location of the bam file */ @@ -51,6 +52,8 @@ trait Measurement extends SummaryQScript with Reference { qscript: QScript => require(bamFiles.nonEmpty) } + lazy val mergeCountFiles: Boolean = config("merge_count_files", default = true) + private var extraSummaryFiles: Map[String, File] = Map() def addMergeTableJob(countFiles: List[File], @@ -58,18 +61,22 @@ trait Measurement extends SummaryQScript with Reference { qscript: QScript => name: String, fileExtension: String, args: MergeArgs = mergeArgs): Unit = { - add(MergeTables(this, countFiles, outputFile, - args.idCols, args.valCol, args.numHeaderLines, args.fallback, fileExtension = Some(fileExtension))) - extraSummaryFiles += s"${name}_table" -> outputFile + if (mergeCountFiles) { + add(MergeTables(this, countFiles, outputFile, + args.idCols, args.valCol, args.numHeaderLines, args.fallback, fileExtension = Some(fileExtension))) + extraSummaryFiles += s"${name}_table" -> outputFile + } } def addHeatmapJob(countTable: File, outputFile: File, name: String, countType: Option[String] = None): Unit = { - val job = new PlotHeatmap(qscript) - job.input = countTable - job.output = outputFile - job.countType = countType - add(job) - extraSummaryFiles += s"${name}_heatmap" -> outputFile + if (mergeCountFiles) { + val job = new PlotHeatmap(qscript) + job.input = countTable + job.output = outputFile + job.countType = countType + add(job) + extraSummaryFiles += s"${name}_heatmap" -> outputFile + } } /** Must return a map with used settings for this pipeline */ diff --git a/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala b/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala index afc6bbdc3ba737db63f5c4270009de0a60b8deaa..d2303ac3e014110652af209dbaee180565405ca0 100644 --- a/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala +++ b/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala @@ -82,10 +82,10 @@ class GwasTest(val root: Configurable) extends QScript with BiopetQScript with R gensToVcf.outputVcf = new File(outputDirGens, gen._1.genotypes.getName + s".${gen._2}.vcf.gz") gensToVcf.isIntermediate = true add(gensToVcf) - cvChr.inputFiles :+= gensToVcf.outputVcf + cvChr.variant :+= gensToVcf.outputVcf } add(cvChr) - cvTotal.inputFiles :+= cvChr.outputFile + cvTotal.variant :+= cvChr.outputFile contig -> cvChr.outputFile } add(cvTotal) @@ -105,14 +105,14 @@ class GwasTest(val root: Configurable) extends QScript with BiopetQScript with R bedFile.deleteOnExit() val sv = new SelectVariants(this) - sv.inputFiles :+= chrVcfFiles.getOrElse(region.chr, vcfFile) - sv.outputFile = new File(regionDir, s"$name.vcf.gz") + sv.variant = chrVcfFiles.getOrElse(region.chr, vcfFile) + sv.out = new File(regionDir, s"$name.vcf.gz") sv.intervals :+= bedFile sv.isIntermediate = true add(sv) val snptest = new Snptest(this) - snptest.inputGenotypes :+= sv.outputFile + snptest.inputGenotypes :+= sv.out snptest.inputSampleFiles :+= phenotypeFile snptest.outputFile = Some(new File(regionDir, s"$name.snptest")) add(snptest) @@ -127,7 +127,7 @@ class GwasTest(val root: Configurable) extends QScript with BiopetQScript with R } val cv = new CatVariants(this) - cv.inputFiles = snpTests.map(_._2).toList + cv.variant = snpTests.map(_._2).toList cv.outputFile = new File(outputDir, "snptest" + File.separator + "snptest.vcf.gz") add(cv) } diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/Shiva.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/Shiva.scala index 7bebb491a55a52b7b77ce8141f11f162ab7eb643..ed0e1318d96c615346172b5e1add0df6dc4476d0 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/Shiva.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/Shiva.scala @@ -17,7 +17,7 @@ package nl.lumc.sasc.biopet.pipelines.shiva import nl.lumc.sasc.biopet.core.{ PipelineCommand, Reference } import nl.lumc.sasc.biopet.core.report.ReportBuilderExtension -import nl.lumc.sasc.biopet.extensions.gatk.broad._ +import nl.lumc.sasc.biopet.extensions.gatk._ import nl.lumc.sasc.biopet.pipelines.bammetrics.TargetRegions import nl.lumc.sasc.biopet.pipelines.mapping.MultisampleMappingTrait import nl.lumc.sasc.biopet.pipelines.toucan.Toucan diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcalling.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcalling.scala index b2055a09f725f8a67321cbad36c5bea05a94bb13..4f84d061198ee9449811cd414c72e661ee1e501b 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcalling.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcalling.scala @@ -26,6 +26,7 @@ import nl.lumc.sasc.biopet.pipelines.shiva.variantcallers.{ VarscanCnsSingleSamp import nl.lumc.sasc.biopet.utils.{ BamUtils, Logging } import nl.lumc.sasc.biopet.utils.config.Configurable import org.broadinstitute.gatk.queue.QScript +import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile /** * Implementation of ShivaVariantcalling @@ -86,10 +87,10 @@ class ShivaVariantcalling(val root: Configurable) extends QScript require(callers.nonEmpty, "must select at least 1 variantcaller, choices are: " + callersList.map(_.name).mkString(", ")) val cv = new CombineVariants(qscript) - cv.outputFile = finalFile - cv.setKey = "VariantCaller" - cv.genotypeMergeOptions = Some("PRIORITIZE") - cv.rodPriorityList = callers.map(_.name).mkString(",") + cv.out = finalFile + cv.setKey = Some("VariantCaller") + cv.genotypemergeoption = Some("PRIORITIZE") + cv.rod_priority_list = Some(callers.map(_.name).mkString(",")) for (caller <- callers) { caller.inputBams = inputBams caller.namePrefix = namePrefix @@ -110,17 +111,17 @@ class ShivaVariantcalling(val root: Configurable) extends QScript vtDecompose.inputVcf = vtNormalize.outputVcf vtDecompose.outputVcf = swapExt(caller.outputDir, vtNormalize.outputVcf, ".vcf.gz", ".decompose.vcf.gz") add(vtDecompose, Tabix(this, vtDecompose.outputVcf)) - cv.addInput(vtDecompose.outputVcf, caller.name) + cv.variant :+= TaggedFile(vtDecompose.outputVcf, caller.name) } else if (normalize && !decompose) { vtNormalize.outputVcf = swapExt(caller.outputDir, caller.outputFile, ".vcf.gz", ".normalized.vcf.gz") add(vtNormalize, Tabix(this, vtNormalize.outputVcf)) - cv.addInput(vtNormalize.outputVcf, caller.name) + cv.variant :+= TaggedFile(vtNormalize.outputVcf, caller.name) } else if (!normalize && decompose) { vtDecompose.inputVcf = caller.outputFile vtDecompose.outputVcf = swapExt(caller.outputDir, caller.outputFile, ".vcf.gz", ".decompose.vcf.gz") add(vtDecompose, Tabix(this, vtDecompose.outputVcf)) - cv.addInput(vtDecompose.outputVcf, caller.name) - } else cv.addInput(caller.outputFile, caller.name) + cv.variant :+= TaggedFile(vtDecompose.outputVcf, caller.name) + } else cv.variant :+= TaggedFile(caller.outputFile, caller.name) } add(cv) @@ -139,9 +140,9 @@ class ShivaVariantcalling(val root: Configurable) extends QScript referenceVcf.foreach(referenceVcfFile => { val gc = new GenotypeConcordance(this) - gc.evalFile = vcfFile - gc.compFile = referenceVcfFile - gc.outputFile = new File(vcfFile.getParentFile, s"$namePrefix-genotype_concordance.$name.txt") + gc.eval = vcfFile + gc.comp = referenceVcfFile + gc.out = new File(vcfFile.getParentFile, s"$namePrefix-genotype_concordance.$name.txt") referenceVcfRegions.foreach(gc.intervals ::= _) add(gc) addSummarizable(gc, s"$namePrefix-genotype_concordance-$name") diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/svcallers/Delly.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/svcallers/Delly.scala index 98fe0e0a06342cee60db461acc33f1a64b5c23b2..91f8468b189b878d756554782b956a8a0037ceef 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/svcallers/Delly.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/svcallers/Delly.scala @@ -41,7 +41,7 @@ class Delly(val root: Configurable) extends SvCaller { delly.analysistype = "DEL" delly.outputvcf = new File(dellyDir, sample + ".delly.del.vcf") add(delly) - catVariants.inputFiles :+= delly.outputvcf + catVariants.variant :+= delly.outputvcf } if (dup) { val delly = new DellyCaller(this) @@ -49,7 +49,7 @@ class Delly(val root: Configurable) extends SvCaller { delly.analysistype = "DUP" delly.outputvcf = new File(dellyDir, sample + ".delly.dup.vcf") add(delly) - catVariants.inputFiles :+= delly.outputvcf + catVariants.variant :+= delly.outputvcf } if (inv) { val delly = new DellyCaller(this) @@ -57,18 +57,18 @@ class Delly(val root: Configurable) extends SvCaller { delly.analysistype = "INV" delly.outputvcf = new File(dellyDir, sample + ".delly.inv.vcf") add(delly) - catVariants.inputFiles :+= delly.outputvcf + catVariants.variant :+= delly.outputvcf } if (tra) { val delly = new DellyCaller(this) delly.input = bamFile delly.analysistype = "TRA" delly.outputvcf = new File(dellyDir, sample + ".delly.tra.vcf") - catVariants.inputFiles :+= delly.outputvcf + catVariants.variant :+= delly.outputvcf add(delly) } - require(catVariants.inputFiles.nonEmpty, "Must atleast 1 SV-type be selected for Delly") + require(catVariants.variant.nonEmpty, "Must atleast 1 SV-type be selected for Delly") add(catVariants) addVCF(sample, catVariants.outputFile) diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCaller.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCaller.scala index cd039f946a182b70c23d715b9efc3cf57960f2b4..1224592eb7fb66eb4075eb9aff9215379d6553c4 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCaller.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCaller.scala @@ -5,7 +5,7 @@ */ package nl.lumc.sasc.biopet.pipelines.shiva.variantcallers -import nl.lumc.sasc.biopet.extensions.gatk.broad +import nl.lumc.sasc.biopet.extensions.gatk import nl.lumc.sasc.biopet.utils.config.Configurable /** Default mode for the haplotypecaller */ @@ -14,7 +14,7 @@ class HaplotypeCaller(val root: Configurable) extends Variantcaller { protected def defaultPrio = 1 def biopetScript() { - val hc = broad.HaplotypeCaller(this, inputBams.values.toList, outputFile) + val hc = gatk.HaplotypeCaller(this, inputBams.values.toList, outputFile) add(hc) } } diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerAllele.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerAllele.scala index 3de2234b78317c0182aaf8db6a163c087b4afe34..09e7b5e0286fee0da538c23ce9d8b5f639df1555 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerAllele.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerAllele.scala @@ -5,7 +5,7 @@ */ package nl.lumc.sasc.biopet.pipelines.shiva.variantcallers -import nl.lumc.sasc.biopet.extensions.gatk.broad +import nl.lumc.sasc.biopet.extensions.gatk import nl.lumc.sasc.biopet.utils.config.Configurable /** Allele mode for Haplotypecaller */ @@ -14,7 +14,7 @@ class HaplotypeCallerAllele(val root: Configurable) extends Variantcaller { protected def defaultPrio = 5 def biopetScript() { - val hc = broad.HaplotypeCaller(this, inputBams.values.toList, outputFile) + val hc = gatk.HaplotypeCaller(this, inputBams.values.toList, outputFile) hc.alleles = config("input_alleles") hc.genotyping_mode = Some("GENOTYPE_GIVEN_ALLELES") add(hc) diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerGvcf.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerGvcf.scala index eba59ccdbb73024f3435cda5fbc2e1e40293f98d..585c33d649cf304e32c19e28ee7c6c294a75eca7 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerGvcf.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerGvcf.scala @@ -5,7 +5,7 @@ */ package nl.lumc.sasc.biopet.pipelines.shiva.variantcallers -import nl.lumc.sasc.biopet.extensions.gatk.broad +import nl.lumc.sasc.biopet.extensions.gatk import nl.lumc.sasc.biopet.utils.config.Configurable /** Gvcf mode for haplotypecaller */ @@ -21,13 +21,13 @@ class HaplotypeCallerGvcf(val root: Configurable) extends Variantcaller { def getGvcfs = gVcfFiles def biopetScript() { - gVcfFiles = for ((sample, inputBam) <- inputBams) yield { - val hc = broad.HaplotypeCaller.gvcf(this, inputBam, new File(outputDir, sample + ".gvcf.vcf.gz")) + val gvcfFiles = for ((sample, inputBam) <- inputBams) yield { + val hc = gatk.HaplotypeCaller.gvcf(this, inputBam, new File(outputDir, sample + ".gvcf.vcf.gz")) add(hc) sample -> hc.out } - val genotypeGVCFs = broad.GenotypeGVCFs(this, gVcfFiles.values.toList, outputFile) + val genotypeGVCFs = gatk.GenotypeGVCFs(this, gvcfFiles.toList, outputFile) add(genotypeGVCFs) } } diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/RawVcf.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/RawVcf.scala index 847e671166191da3153cc2df818828c66de37aa1..ec46b9c348b3761786195777057852a96f6b214f 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/RawVcf.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/RawVcf.scala @@ -15,11 +15,9 @@ */ package nl.lumc.sasc.biopet.pipelines.shiva.variantcallers -import java.io.File - import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsMpileup -import nl.lumc.sasc.biopet.extensions.tools.{ VcfFilter, MpileupToVcf } +import nl.lumc.sasc.biopet.extensions.tools.{ MpileupToVcf, VcfFilter } import nl.lumc.sasc.biopet.utils.config.Configurable /** Makes a vcf file from a mpileup without statistics */ @@ -60,9 +58,9 @@ class RawVcf(val root: Configurable) extends Variantcaller { } val cv = new CombineVariants(this) - cv.inputFiles = rawFiles.toList - cv.outputFile = outputFile - cv.setKey = "null" + cv.variant = rawFiles.toList + cv.out = outputFile + cv.setKey = Some("null") cv.excludeNonVariants = !keepRefCalls add(cv) } diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/UnifiedGenotyper.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/UnifiedGenotyper.scala index 96c3821bcb1343163507d9b0e9a950f477a47c6c..43fbe730d4b585edacff62b7a8388a5c82fbe062 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/UnifiedGenotyper.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/UnifiedGenotyper.scala @@ -5,7 +5,7 @@ */ package nl.lumc.sasc.biopet.pipelines.shiva.variantcallers -import nl.lumc.sasc.biopet.extensions.gatk.broad +import nl.lumc.sasc.biopet.extensions.gatk import nl.lumc.sasc.biopet.utils.config.Configurable /** Default mode for UnifiedGenotyper */ @@ -14,7 +14,7 @@ class UnifiedGenotyper(val root: Configurable) extends Variantcaller { protected def defaultPrio = 20 def biopetScript() { - val ug = broad.UnifiedGenotyper(this, inputBams.values.toList, outputFile) + val ug = gatk.UnifiedGenotyper(this, inputBams.values.toList, outputFile) add(ug) } } diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/UnifiedGenotyperAllele.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/UnifiedGenotyperAllele.scala index 8ffdcd962107840ccfbf175310c70035c8c668a1..364691f517c7434a39dc498a58ac1349e7e46d2f 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/UnifiedGenotyperAllele.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/UnifiedGenotyperAllele.scala @@ -5,7 +5,7 @@ */ package nl.lumc.sasc.biopet.pipelines.shiva.variantcallers -import nl.lumc.sasc.biopet.extensions.gatk.broad +import nl.lumc.sasc.biopet.extensions.gatk import nl.lumc.sasc.biopet.utils.config.Configurable /** Allele mode for GenotyperAllele */ @@ -14,7 +14,7 @@ class UnifiedGenotyperAllele(val root: Configurable) extends Variantcaller { protected def defaultPrio = 9 def biopetScript() { - val ug = broad.UnifiedGenotyper(this, inputBams.values.toList, outputFile) + val ug = gatk.UnifiedGenotyper(this, inputBams.values.toList, outputFile) ug.alleles = config("input_alleles") ug.genotyping_mode = Some("GENOTYPE_GIVEN_ALLELES") add(ug) diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/VarscanCnsSingleSample.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/VarscanCnsSingleSample.scala index 9a0fb2839413948de68d3d16101fc4ce912df5b3..cb213f28e6485c04c7bf8f76a7293cf062516d8a 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/VarscanCnsSingleSample.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/VarscanCnsSingleSample.scala @@ -35,7 +35,8 @@ class VarscanCnsSingleSample(val root: Configurable) extends Variantcaller { "disable_baq" -> true, "depth" -> 1000000 ), - "varscanmpileup2cns" -> Map("strand_filter" -> 0) + "varscanmpileup2cns" -> Map("strand_filter" -> 0), + "combinevariants" -> Map("scattercount" -> 20) ) override def fixedValues = Map( @@ -67,9 +68,9 @@ class VarscanCnsSingleSample(val root: Configurable) extends Variantcaller { } val cv = new CombineVariants(this) - cv.inputFiles = sampleVcfs - cv.outputFile = outputFile - cv.setKey = "null" + cv.variant = sampleVcfs + cv.out = outputFile + cv.setKey = Some("null") cv.excludeNonVariants = true add(cv) } diff --git a/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTest.scala b/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTest.scala index 736cff399b1dc09d41ea1df8cc6adfc697503d44..eb1d40ece24ce6862023eae3343b84c840ea3e42 100644 --- a/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTest.scala +++ b/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTest.scala @@ -18,7 +18,7 @@ package nl.lumc.sasc.biopet.pipelines.shiva import java.io.{ File, FileOutputStream } import com.google.common.io.Files -import nl.lumc.sasc.biopet.extensions.gatk.broad._ +import nl.lumc.sasc.biopet.extensions.gatk.{ BaseRecalibrator, IndelRealigner, PrintReads, RealignerTargetCreator } import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates import nl.lumc.sasc.biopet.extensions.tools.VcfStats import nl.lumc.sasc.biopet.utils.ConfigUtils diff --git a/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTest.scala b/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTest.scala index f85137cc27a87069021752d42517e3fe6685359a..8c9dcb1e5496d3e43792bcc83d56644396eaeee6 100644 --- a/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTest.scala +++ b/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTest.scala @@ -11,16 +11,14 @@ import com.google.common.io.Files import nl.lumc.sasc.biopet.core.BiopetPipe import nl.lumc.sasc.biopet.extensions.Freebayes import nl.lumc.sasc.biopet.extensions.bcftools.{ BcftoolsCall, BcftoolsMerge } +import nl.lumc.sasc.biopet.extensions.gatk.{ CombineVariants, HaplotypeCaller, UnifiedGenotyper } import nl.lumc.sasc.biopet.utils.config.Config -import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants -import nl.lumc.sasc.biopet.extensions.gatk.broad.{ HaplotypeCaller, UnifiedGenotyper } import nl.lumc.sasc.biopet.extensions.tools.{ MpileupToVcf, VcfFilter, VcfStats } import nl.lumc.sasc.biopet.utils.ConfigUtils -import org.apache.commons.io.FileUtils import org.broadinstitute.gatk.queue.QSettings import org.scalatest.Matchers import org.scalatest.testng.TestNGSuite -import org.testng.annotations.{ AfterClass, DataProvider, Test } +import org.testng.annotations.{ DataProvider, Test } import scala.collection.mutable.ListBuffer diff --git a/toucan/pom.xml b/toucan/pom.xml index 781e458c31cc8128843b55873781e3aaa9f8b1e0..62a9699f8f63680d1c8b6bce67a7390907ca8b34 100644 --- a/toucan/pom.xml +++ b/toucan/pom.xml @@ -43,5 +43,17 @@ <artifactId>BiopetToolsExtensions</artifactId> <version>${project.version}</version> </dependency> + <dependency> + <groupId>org.testng</groupId> + <artifactId>testng</artifactId> + <version>6.8</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.scalatest</groupId> + <artifactId>scalatest_2.10</artifactId> + <version>2.2.1</version> + <scope>test</scope> + </dependency> </dependencies> </project> diff --git a/toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala b/toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala index 2e3ffe26a1735dbea0f664bf8c4c957bd7262f7b..58dcaf82447daef751eae0d21ac7c43387026149 100644 --- a/toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala +++ b/toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala @@ -40,15 +40,17 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum @Input(doc = "Input GVCF file", shortName = "gvcf", required = false) var inputGvcf: Option[File] = None - var sampleIds: List[String] = Nil + var outputVcf: Option[File] = None + + def sampleInfo: Map[String, Map[String, Any]] = root match { + case m: MultiSampleQScript => m.samples.map { case (sampleId, sample) => sampleId -> sample.sampleTags } + case null => VcfUtils.getSampleIds(inputVCF).map(x => x -> Map[String, Any]()).toMap + case s: SampleLibraryTag => s.sampleId.map(x => x -> Map[String, Any]()).toMap + case _ => throw new IllegalArgumentException("") + } + def init(): Unit = { inputFiles :+= new InputFile(inputVCF) - sampleIds = root match { - case m: MultiSampleQScript => m.samples.keys.toList - case null => VcfUtils.getSampleIds(inputVCF) - case s: SampleLibraryTag => s.sampleId.toList - case _ => throw new IllegalArgumentException("You don't have any samples") - } } override def defaults = Map( @@ -79,29 +81,29 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum val gonlVcfFile: Option[File] = config("gonl_vcf") val exacVcfFile: Option[File] = config("exac_vcf") - var outputFile = normalizer.outputVcf + outputVcf = Some(normalizer.outputVcf) gonlVcfFile match { case Some(gonlFile) => val vcfWithVcf = new VcfWithVcf(this) - vcfWithVcf.input = outputFile + vcfWithVcf.input = outputVcf.getOrElse(new File("")) vcfWithVcf.secondaryVcf = gonlFile vcfWithVcf.output = swapExt(outputDir, normalizer.outputVcf, ".vcf.gz", ".gonl.vcf.gz") vcfWithVcf.fields ::= ("AF", "AF_gonl", None) add(vcfWithVcf) - outputFile = vcfWithVcf.output + outputVcf = Some(vcfWithVcf.output) case _ => } exacVcfFile match { case Some(exacFile) => val vcfWithVcf = new VcfWithVcf(this) - vcfWithVcf.input = outputFile + vcfWithVcf.input = outputVcf.getOrElse(new File("")) vcfWithVcf.secondaryVcf = exacFile - vcfWithVcf.output = swapExt(outputDir, outputFile, ".vcf.gz", ".exac.vcf.gz") + vcfWithVcf.output = swapExt(outputDir, outputVcf.getOrElse(new File("")), ".vcf.gz", ".exac.vcf.gz") vcfWithVcf.fields ::= ("AF", "AF_exac", None) add(vcfWithVcf) - outputFile = vcfWithVcf.output + outputVcf = Some(vcfWithVcf.output) case _ => } @@ -116,7 +118,7 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum * @param annotation: ManweDownloadAnnotateVcf object of annotated vcf * @return */ - def importAndActivateSample(sampleID: String, inputVcf: File, + def importAndActivateSample(sampleID: String, sampleGroups: List[String], inputVcf: File, gVCF: File, annotation: ManweAnnotateVcf): ManweActivateAfterAnnotImport = { val minGQ: Int = config("minimum_genome_quality", default = 20, namespace = "manwe") @@ -165,6 +167,7 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum imported.beds = List(bgzippedBed.output) imported.name = Some(sampleID) imported.public = isPublic + imported.group = sampleGroups imported.waitToComplete = false imported.isIntermediate = true imported.output = swapExt(outputDir, intersected.output, ".vcf.gz", ".manwe.import") @@ -186,7 +189,6 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum def varda(vcf: File, gVcf: File): File = { val annotationQueries: List[String] = config("annotation_queries", default = List("GLOBAL *"), namespace = "manwe") - //TODO: add groups!!! Need sample-specific group tags for this val annotate = new ManweAnnotateVcf(this) annotate.vcf = vcf @@ -202,7 +204,14 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum annotatedVcf.output = swapExt(outputDir, annotate.output, ".manwe.annot", "manwe.annot.vcf.gz") add(annotatedVcf) - val activates = sampleIds map { x => importAndActivateSample(x, vcf, gVcf, annotate) } + val activates = sampleInfo map { x => + val sampleGroup = x._2.getOrElse("varda_group", Nil) match { + case x: List[String] => x + case Nil => Nil + case _ => throw new IllegalArgumentException("Sample tag 'varda_group' is not a list of strings") + } + importAndActivateSample(x._1, sampleGroup, vcf, gVcf, annotate) + } val finalLn = new Ln(this) activates.foreach(x => finalLn.deps :+= x.output)