diff --git a/.idea/misc.xml b/.idea/misc.xml index f413f8bf3b5b8693c73eb94d6f540b7606326d36..8fb54ee42a504d86c34a2800c767c27a0b9318df 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,5 +1,8 @@ <?xml version="1.0" encoding="UTF-8"?> <project version="4"> + <component name="EntryPointsManager"> + <entry_points version="2.0" /> + </component> <component name="MavenProjectsManager"> <option name="originalFiles"> <list> diff --git a/docs/config.md b/docs/config.md index c10419859bed6d3f510c80afe0b27fea913d774f..de3342b195b1dc7acb338729792d9dabc59c5228 100644 --- a/docs/config.md +++ b/docs/config.md @@ -72,16 +72,16 @@ Global setting examples are: #### Example settings config ~~~ { - "reference": "/data/LGTC/projects/vandoorn-melanoma/data/references/hg19_nohap/ucsc.hg19_nohap.fasta", - "dbsnp": "/data/LGTC/projects/vandoorn-melanoma/data/references/hg19_nohap/dbsnp_137.hg19_nohap.vcf", + "reference": "/references/hg19_nohap/ucsc.hg19_nohap.fasta", + "dbsnp": "/references/hg19_nohap/dbsnp_137.hg19_nohap.vcf", "joint_variantcalling": false, "haplotypecaller": { "scattercount": 100 }, "multisample": { "haplotypecaller": { "scattercount": 1000 } }, "picard": { "validationstringency": "LENIENT" }, "library_variantcalling_temp": true, - "target_bed_temp": "/data/LGTC/projects/vandoorn-melanoma/analysis/target.bed", + "target_bed_temp": "analysis/target.bed", "min_dp": 5, - "bedtools": {"exe":"/share/isilon/system/local/BEDtools/bedtools-2.17.0/bin/bedtools"}, + "bedtools": {"exe":"/BEDtools/bedtools-2.17.0/bin/bedtools"}, "bam_to_fastq": true, "baserecalibrator": { "memory_limit": 8, "vmem":"16G" }, "samtofastq": {"memory_limit": 8, "vmem": "16G"}, @@ -95,4 +95,4 @@ Global setting examples are: ### JSON validation To check if the JSON file created is correct we can use multiple options the simplest way is using [this](http://jsonformatter.curiousconcept.com/) -website. It is also possible to use Python or Scala for validating but this requires some more knowledge. 
\ No newline at end of file +website. It is also possible to use Python or Scala for validating but this requires some more knowledge. diff --git a/protected/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala b/protected/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala index 08fcf3608550fc40a2f2d5a5ec184888428e1e3d..5088e44c2eca6f54b23104c7695c3a03d5f762c8 100644 --- a/protected/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala +++ b/protected/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala @@ -12,24 +12,40 @@ import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.extensions.{ RunGubbins, Cat, Raxml } import nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline import nl.lumc.sasc.biopet.tools.BastyGenerateFasta +import nl.lumc.sasc.biopet.utils.ConfigUtils import org.broadinstitute.gatk.queue.QScript class Basty(val root: Configurable) extends QScript with MultiSampleQScript { + qscript => def this() = this(null) - class LibraryOutput extends AbstractLibraryOutput { - } - case class FastaOutput(variants: File, consensus: File, consensusVariants: File) - class SampleOutput extends AbstractSampleOutput { + + override def defaults = ConfigUtils.mergeMaps(Map( + "ploidy" -> 1, + "use_haplotypecaller" -> false, + "use_unifiedgenotyper" -> true, + "joint_variantcalling" -> true + ), super.defaults) + + var gatkPipeline: GatkPipeline = new GatkPipeline(qscript) + + def makeSample(id: String) = new Sample(id) + class Sample(sampleId: String) extends AbstractSample(sampleId) { + def makeLibrary(id: String) = new Library(id) + class Library(libId: String) extends AbstractLibrary(libId) { + protected def addJobs(): Unit = {} + } + var output: FastaOutput = _ var outputSnps: FastaOutput = _ - } - - defaults ++= Map("ploidy" -> 1, "use_haplotypecaller" -> false, "use_unifiedgenotyper" -> true, "joint_variantcalling" -> true) - var gatkPipeline: GatkPipeline = new GatkPipeline(this) 
- gatkPipeline.jointVariantcalling = true + protected def addJobs(): Unit = { + addPerLibJobs() + output = addGenerateFasta(sampleId, sampleDir) + outputSnps = addGenerateFasta(sampleId, sampleDir, snpsOnly = true) + } + } def init() { gatkPipeline.outputDir = outputDir @@ -40,24 +56,26 @@ class Basty(val root: Configurable) extends QScript with MultiSampleQScript { gatkPipeline.biopetScript addAll(gatkPipeline.functions) + addSamplesJobs() + } + + def addMultiSampleJobs(): Unit = { val refVariants = addGenerateFasta(null, outputDir + "reference/", outputName = "reference") val refVariantSnps = addGenerateFasta(null, outputDir + "reference/", outputName = "reference", snpsOnly = true) - runSamplesJobs() - - val catVariants = Cat(this, refVariants.variants :: samplesOutput.map(_._2.output.variants).toList, outputDir + "fastas/variant.fasta") + val catVariants = Cat(this, refVariants.variants :: samples.map(_._2.output.variants).toList, outputDir + "fastas/variant.fasta") add(catVariants) - val catVariantsSnps = Cat(this, refVariantSnps.variants :: samplesOutput.map(_._2.outputSnps.variants).toList, outputDir + "fastas/variant.snps_only.fasta") + val catVariantsSnps = Cat(this, refVariantSnps.variants :: samples.map(_._2.outputSnps.variants).toList, outputDir + "fastas/variant.snps_only.fasta") add(catVariantsSnps) - val catConsensus = Cat(this, refVariants.consensus :: samplesOutput.map(_._2.output.consensus).toList, outputDir + "fastas/consensus.fasta") + val catConsensus = Cat(this, refVariants.consensus :: samples.map(_._2.output.consensus).toList, outputDir + "fastas/consensus.fasta") add(catConsensus) - val catConsensusSnps = Cat(this, refVariantSnps.consensus :: samplesOutput.map(_._2.outputSnps.consensus).toList, outputDir + "fastas/consensus.snps_only.fasta") + val catConsensusSnps = Cat(this, refVariantSnps.consensus :: samples.map(_._2.outputSnps.consensus).toList, outputDir + "fastas/consensus.snps_only.fasta") add(catConsensusSnps) - val 
catConsensusVariants = Cat(this, refVariants.consensusVariants :: samplesOutput.map(_._2.output.consensusVariants).toList, outputDir + "fastas/consensus.variant.fasta") + val catConsensusVariants = Cat(this, refVariants.consensusVariants :: samples.map(_._2.output.consensusVariants).toList, outputDir + "fastas/consensus.variant.fasta") add(catConsensusVariants) - val catConsensusVariantsSnps = Cat(this, refVariantSnps.consensusVariants :: samplesOutput.map(_._2.outputSnps.consensusVariants).toList, outputDir + "fastas/consensus.variant.snps_only.fasta") + val catConsensusVariantsSnps = Cat(this, refVariantSnps.consensusVariants :: samples.map(_._2.outputSnps.consensusVariants).toList, outputDir + "fastas/consensus.variant.snps_only.fasta") add(catConsensusVariantsSnps) val seed: Int = config("seed", default = 12345) @@ -106,38 +124,14 @@ class Basty(val root: Configurable) extends QScript with MultiSampleQScript { val gubbins = new RunGubbins(this) gubbins.fastafile = concensusVariants - gubbins.startingTree = raxmlBi.getBipartitionsFile + gubbins.startingTree = Some(raxmlBi.getBipartitionsFile) gubbins.outputDirectory = outputDir + dirSufixGubbins add(gubbins) } addTreeJobs(catVariantsSnps.output, catConsensusVariantsSnps.output, outputDir + "trees" + File.separator + "snps_only", "snps_only") addTreeJobs(catVariants.output, catConsensusVariants.output, outputDir + "trees" + File.separator + "snps_indels", "snps_indels") - } - - // Called for each sample - def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = { - val sampleOutput = new SampleOutput - val sampleID: String = sampleConfig("ID").toString - val sampleDir = globalSampleDir + sampleID + "/" - - sampleOutput.libraries = runLibraryJobs(sampleConfig) - - sampleOutput.output = addGenerateFasta(sampleID, sampleDir) - sampleOutput.outputSnps = addGenerateFasta(sampleID, sampleDir, snpsOnly = true) - - return sampleOutput - } - - // Called for each run from a sample - def 
runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput = { - val libraryOutput = new LibraryOutput - - val runID: String = runConfig("ID").toString - val sampleID: String = sampleConfig("ID").toString - val runDir: String = globalSampleDir + sampleID + "/run_" + runID + "/" - return libraryOutput } def addGenerateFasta(sampleName: String, outputDir: String, outputName: String = null, @@ -145,15 +139,15 @@ class Basty(val root: Configurable) extends QScript with MultiSampleQScript { val bastyGenerateFasta = new BastyGenerateFasta(this) bastyGenerateFasta.outputName = if (outputName != null) outputName else sampleName bastyGenerateFasta.inputVcf = gatkPipeline.multisampleVariantcalling.scriptOutput.finalVcfFile - if (gatkPipeline.samplesOutput.contains(sampleName)) { - bastyGenerateFasta.bamFile = gatkPipeline.samplesOutput(sampleName).variantcalling.bamFiles.head + if (gatkPipeline.samples.contains(sampleName)) { + bastyGenerateFasta.bamFile = gatkPipeline.samples(sampleName).gatkVariantcalling.scriptOutput.bamFiles.head } bastyGenerateFasta.outputVariants = outputDir + bastyGenerateFasta.outputName + ".variants" + (if (snpsOnly) ".snps_only" else "") + ".fasta" bastyGenerateFasta.outputConsensus = outputDir + bastyGenerateFasta.outputName + ".consensus" + (if (snpsOnly) ".snps_only" else "") + ".fasta" bastyGenerateFasta.outputConsensusVariants = outputDir + bastyGenerateFasta.outputName + ".consensus_variants" + (if (snpsOnly) ".snps_only" else "") + ".fasta" bastyGenerateFasta.sampleName = sampleName bastyGenerateFasta.snpsOnly = snpsOnly - add(bastyGenerateFasta) + qscript.add(bastyGenerateFasta) return FastaOutput(bastyGenerateFasta.outputVariants, bastyGenerateFasta.outputConsensus, bastyGenerateFasta.outputConsensusVariants) } } diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ApplyRecalibration.scala 
b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ApplyRecalibration.scala index 423d9e7899da25d3e6c77be0b02d0f17f59b61dc..8728b6c651824e340556222cb60ef52d7fd9ab0a 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ApplyRecalibration.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/ApplyRecalibration.scala @@ -9,13 +9,17 @@ import java.io.File import nl.lumc.sasc.biopet.core.config.Configurable class ApplyRecalibration(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.ApplyRecalibration with GatkGeneral { + scatterCount = config("scattercount", default = 0) + override def afterGraph { super.afterGraph - if (config.contains("scattercount")) scatterCount = config("scattercount") - nt = Option(getThreads(3)) memoryLimit = Option(nt.getOrElse(1) * 2) + + import org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode + if (mode == Mode.INDEL) ts_filter_level = config("ts_filter_level", default = 99.0) + else if (mode == Mode.SNP) ts_filter_level = config("ts_filter_level", default = 99.5) ts_filter_level = config("ts_filter_level") } } @@ -24,11 +28,9 @@ object ApplyRecalibration { def apply(root: Configurable, input: File, output: File, recal_file: File, tranches_file: File, indel: Boolean = false): ApplyRecalibration = { val ar = if (indel) new ApplyRecalibration(root) { mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL - defaults ++= Map("ts_filter_level" -> 99.0) } else new ApplyRecalibration(root) { mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP - defaults ++= Map("ts_filter_level" -> 99.5) } ar.input :+= input ar.recal_file = recal_file diff --git 
a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala index 3912958a59d8d58a67b4cd615409e4a5c03ec990..c07a2a66c363fe9623ad35225e9f76eb45aa67a4 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala @@ -12,7 +12,7 @@ class BaseRecalibrator(val root: Configurable) extends org.broadinstitute.gatk.q memoryLimit = Option(4) override val defaultVmem = "8G" - if (config.contains("scattercount")) scatterCount = config("scattercount") + if (config.contains("scattercount")) scatterCount = config("scattercount", default = 1) if (config.contains("dbsnp")) knownSites :+= new File(config("dbsnp").asString) if (config.contains("known_sites")) knownSites :+= new File(config("known_sites").asString) } diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GatkGeneral.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GatkGeneral.scala index 5f69823582b793408e3516202ecb85b3eeba59f4..147398ac798c077da0722b2078f7ea21c666b7d1 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GatkGeneral.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GatkGeneral.scala @@ -11,13 +11,15 @@ import org.broadinstitute.gatk.queue.extensions.gatk.CommandLineGATK trait GatkGeneral extends CommandLineGATK with BiopetJavaCommandLineFunction { memoryLimit = Option(3) - if (config.contains("gatk_jar")) jarFile = config("gatk_jar") + override def subPath = "gatk" :: super.subPath + + jarFile = config("gatk_jar", required = true) override val defaultVmem = "7G" - if (config.contains("intervals", submodule 
= "gatk")) intervals = config("intervals", submodule = "gatk").asFileList - if (config.contains("exclude_intervals", submodule = "gatk")) excludeIntervals = config("exclude_intervals", submodule = "gatk").asFileList - reference_sequence = config("reference", submodule = "gatk") - gatk_key = config("gatk_key", submodule = "gatk") - if (config.contains("pedigree", submodule = "gatk")) pedigree = config("pedigree", submodule = "gatk").asFileList + if (config.contains("intervals")) intervals = config("intervals").asFileList + if (config.contains("exclude_intervals")) excludeIntervals = config("exclude_intervals").asFileList + reference_sequence = config("reference", required = true) + if (config.contains("gatk_key")) gatk_key = config("gatk_key") + if (config.contains("pedigree")) pedigree = config("pedigree").asFileList } diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala index 001fb9d17b858320d22549bae321ca0b820eac12..b8f4ea4efa3ea8cfed563c2b82651c0001b794f7 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala @@ -9,40 +9,40 @@ import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType class HaplotypeCaller(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.HaplotypeCaller with GatkGeneral { - override def afterGraph { - super.afterGraph - - min_mapping_quality_score = config("minMappingQualityScore", default = 20) - if (config.contains("scattercount")) scatterCount = config("scattercount") - if (config.contains("dbsnp")) this.dbsnp = config("dbsnp") - this.sample_ploidy = config("ploidy") - nct = config("threads", default = 1) - bamOutput 
= config("bamOutput") - memoryLimit = Option(nct.getOrElse(1) * 2) - if (config.contains("allSitePLs")) this.allSitePLs = config("allSitePLs") - if (config.contains("output_mode")) { - import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode._ - config("output_mode").asString match { - case "EMIT_ALL_CONFIDENT_SITES" => output_mode = EMIT_ALL_CONFIDENT_SITES - case "EMIT_ALL_SITES" => output_mode = EMIT_ALL_SITES - case "EMIT_VARIANTS_ONLY" => output_mode = EMIT_VARIANTS_ONLY - case e => logger.warn("output mode '" + e + "' does not exist") - } + min_mapping_quality_score = config("minMappingQualityScore", default = 20) + scatterCount = config("scattercount", default = 1) + if (config.contains("dbsnp")) this.dbsnp = config("dbsnp") + this.sample_ploidy = config("ploidy") + if (config.contains("bamOutput")) bamOutput = config("bamOutput") + if (config.contains("allSitePLs")) allSitePLs = config("allSitePLs") + if (config.contains("output_mode")) { + import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode._ + config("output_mode").asString match { + case "EMIT_ALL_CONFIDENT_SITES" => output_mode = EMIT_ALL_CONFIDENT_SITES + case "EMIT_ALL_SITES" => output_mode = EMIT_ALL_SITES + case "EMIT_VARIANTS_ONLY" => output_mode = EMIT_VARIANTS_ONLY + case e => logger.warn("output mode '" + e + "' does not exist") } + } - if (config("inputtype", default = "dna").asString == "rna") { - dontUseSoftClippedBases = config("dontusesoftclippedbases", default = true) - stand_call_conf = config("stand_call_conf", default = 5) - stand_emit_conf = config("stand_emit_conf", default = 0) - } else { - dontUseSoftClippedBases = config("dontusesoftclippedbases", default = false) - stand_call_conf = config("stand_call_conf", default = 5) - stand_emit_conf = config("stand_emit_conf", default = 0) - } + if (config("inputtype", default = "dna").asString == "rna") { + dontUseSoftClippedBases = config("dontusesoftclippedbases", default = true) + stand_call_conf = 
config("stand_call_conf", default = 5) + stand_emit_conf = config("stand_emit_conf", default = 0) + } else { + dontUseSoftClippedBases = config("dontusesoftclippedbases", default = false) + stand_call_conf = config("stand_call_conf", default = 5) + stand_emit_conf = config("stand_emit_conf", default = 0) + } + + override def afterGraph { + super.afterGraph if (bamOutput != null && nct.getOrElse(1) > 1) { - nct = Option(1) + threads = 1 logger.warn("BamOutput is on, nct/threads is forced to set on 1, this option is only for debug") } + nct = Some(threads) + memoryLimit = Option(memoryLimit.getOrElse(2.0) * nct.getOrElse(1)) } def useGvcf() { diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantRecalibrator.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantRecalibrator.scala index 002e515c997082825310b367d0ccf874f62d8b73..e8866c2201d85b12eaaab1924be9234e2dac9fe1 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantRecalibrator.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/VariantRecalibrator.scala @@ -27,11 +27,9 @@ object VariantRecalibrator { override def configPath: List[String] = (if (indel) "indel" else "snp") :: super.configPath if (indel) { mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL - defaults ++= Map("ts_filter_level" -> 99.0) if (config.contains("mills")) resource :+= new TaggedFile(config("mills").asString, "known=false,training=true,truth=true,prior=12.0") } else { mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP - defaults ++= Map("ts_filter_level" -> 99.5) if (config.contains("hapmap")) resource +:= new TaggedFile(config("hapmap").asString, "known=false,training=true,truth=true,prior=15.0") if (config.contains("omni")) 
resource +:= new TaggedFile(config("omni").asString, "known=false,training=true,truth=true,prior=12.0") if (config.contains("1000G")) resource +:= new TaggedFile(config("1000G").asString, "known=false,training=true,truth=false,prior=10.0") diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala index 0031e2c15640e9f20d9e15a309379b712d73e84f..c76139133f23cf434884ad69ccc9dafc69b2cac9 100644 --- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala +++ b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala @@ -10,7 +10,6 @@ import nl.lumc.sasc.biopet.core.PipelineCommand import nl.lumc.sasc.biopet.core.config.Configurable import htsjdk.samtools.SamReaderFactory import scala.collection.JavaConversions._ -import java.io.File import nl.lumc.sasc.biopet.extensions.gatk.{ CombineVariants, CombineGVCFs } import nl.lumc.sasc.biopet.extensions.picard.AddOrReplaceReadGroups import nl.lumc.sasc.biopet.extensions.picard.SamToFastq @@ -20,219 +19,192 @@ import org.broadinstitute.gatk.queue.QScript import org.broadinstitute.gatk.utils.commandline.{ Argument } class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScript { + qscript => def this() = this(null) - @Argument(doc = "Only Sample", shortName = "sample", required = false) - val onlySample: List[String] = Nil - @Argument(doc = "Skip Genotyping step", shortName = "skipgenotyping", required = false) - var skipGenotyping: Boolean = false + var skipGenotyping: Boolean = config("skip_genotyping", default = false) - @Argument(doc = "Merge gvcfs", shortName = "mergegvcfs", required = false) - var mergeGvcfs: Boolean = false + /** Merge gvcfs */ + var mergeGvcfs: Boolean = config("merge_gvcfs", default = false) - @Argument(doc = "Joint variantcalling", 
shortName = "jointVariantCalling", required = false) + /** Joint variantcalling */ var jointVariantcalling: Boolean = config("joint_variantcalling", default = false) - @Argument(doc = "Joint genotyping", shortName = "jointGenotyping", required = false) + /** Joint genotyping */ var jointGenotyping: Boolean = config("joint_genotyping", default = false) var singleSampleCalling = config("single_sample_calling", default = true) var reference: File = config("reference", required = true) - var dbsnp: File = config("dbsnp") - var gvcfFiles: List[File] = Nil - var finalBamFiles: List[File] = Nil var useAllelesOption: Boolean = config("use_alleles_option", default = false) + val externalGvcfs = config("external_gvcfs_files", default = Nil).asFileList + + def makeSample(id: String) = new Sample(id) + class Sample(sampleId: String) extends AbstractSample(sampleId) { + def makeLibrary(id: String) = new Library(id) + class Library(libId: String) extends AbstractLibrary(libId) { + val mapping = new Mapping(qscript) + mapping.sampleId = sampleId + mapping.libId = libId + mapping.outputDir = libDir + "/variantcalling/" + + /** Library variantcalling */ + val gatkVariantcalling = new GatkVariantcalling(qscript) + gatkVariantcalling.sampleID = sampleId + gatkVariantcalling.outputDir = libDir + + protected def addJobs(): Unit = { + val bamFile: Option[File] = if (config.contains("R1")) { + mapping.input_R1 = config("R1") + mapping.input_R2 = config("R2") + mapping.init + mapping.biopetScript + addAll(mapping.functions) // Add functions of mapping to curent function pool + Some(mapping.finalBamFile) + } else if (config.contains("bam")) { + var bamFile: File = config("bam") + if (!bamFile.exists) throw new IllegalStateException("Bam in config does not exist, file: " + bamFile) + + if (config("bam_to_fastq", default = false).asBoolean) { + val samToFastq = SamToFastq(qscript, bamFile, libDir + sampleId + "-" + libId + ".R1.fastq", + libDir + sampleId + "-" + libId + ".R2.fastq") + 
samToFastq.isIntermediate = true + qscript.add(samToFastq) + mapping.input_R1 = samToFastq.fastqR1 + mapping.input_R2 = Some(samToFastq.fastqR2) + mapping.init + mapping.biopetScript + addAll(mapping.functions) // Add functions of mapping to curent function pool + Some(mapping.finalBamFile) + } else { + var readGroupOke = true + val inputSam = SamReaderFactory.makeDefault.open(bamFile) + val header = inputSam.getFileHeader.getReadGroups + for (readGroup <- inputSam.getFileHeader.getReadGroups) { + if (readGroup.getSample != sampleId) logger.warn("Sample ID readgroup in bam file is not the same") + if (readGroup.getLibrary != libId) logger.warn("Library ID readgroup in bam file is not the same") + if (readGroup.getSample != sampleId || readGroup.getLibrary != libId) readGroupOke = false + } + inputSam.close + + if (!readGroupOke) { + if (config("correct_readgroups", default = false)) { + logger.info("Correcting readgroups, file:" + bamFile) + val aorrg = AddOrReplaceReadGroups(qscript, bamFile, new File(libDir + sampleId + "-" + libId + ".bam")) + aorrg.RGID = sampleId + "-" + libId + aorrg.RGLB = libId + aorrg.RGSM = sampleId + aorrg.isIntermediate = true + qscript.add(aorrg) + bamFile = aorrg.output + } else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile + + "\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatic fix this") + } + addAll(BamMetrics(qscript, bamFile, libDir + "metrics/").functions) + + Some(bamFile) + } + } else { + logger.error("Sample: " + sampleId + ": No R1 found for run: " + libId) + None + } - class LibraryOutput extends AbstractLibraryOutput { - var mappedBamFile: File = _ - var variantcalling: GatkVariantcalling.ScriptOutput = _ - } + if (bamFile.isDefined) { + gatkVariantcalling.inputBams = List(bamFile.get) + gatkVariantcalling.variantcalling = config("library_variantcalling", default = false) + gatkVariantcalling.preProcesBams = 
true + gatkVariantcalling.init + gatkVariantcalling.biopetScript + addAll(gatkVariantcalling.functions) + } + } + } + + /** sample variantcalling */ + val gatkVariantcalling = new GatkVariantcalling(qscript) + gatkVariantcalling.sampleID = sampleId + gatkVariantcalling.outputDir = sampleDir + "/variantcalling/" - class SampleOutput extends AbstractSampleOutput { - var variantcalling: GatkVariantcalling.ScriptOutput = _ + protected def addJobs(): Unit = { + addPerLibJobs() + gatkVariantcalling.inputBams = libraries.map(_._2.mapping.finalBamFile).toList + gatkVariantcalling.preProcesBams = false + if (!singleSampleCalling) { + gatkVariantcalling.useHaplotypecaller = false + gatkVariantcalling.useUnifiedGenotyper = false + } + gatkVariantcalling.init + gatkVariantcalling.biopetScript + addAll(gatkVariantcalling.functions) + } } def init() { - if (config.contains("gvcfFiles")) - for (file <- config("gvcfFiles").asList) - gvcfFiles :+= file.toString if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module") else if (!outputDir.endsWith("/")) outputDir += "/" } val multisampleVariantcalling = new GatkVariantcalling(this) { override def configName = "gatkvariantcalling" - override def configPath: List[String] = "multisample" :: super.configPath + override def configPath: List[String] = super.configPath ::: "multisample" :: Nil } - def biopetScript() { - if (onlySample.isEmpty) { - runSamplesJobs + def biopetScript(): Unit = { + addSamplesJobs() + } - //SampleWide jobs - if (mergeGvcfs && gvcfFiles.size > 0) { - val newFile = outputDir + "merged.gvcf.vcf.gz" - add(CombineGVCFs(this, gvcfFiles, newFile)) - gvcfFiles = List(newFile) + def addMultiSampleJobs(): Unit = { + val gvcfFiles: List[File] = if (mergeGvcfs && externalGvcfs.size + samples.size > 1) { + val newFile = outputDir + "merged.gvcf.vcf.gz" + add(CombineGVCFs(this, externalGvcfs ++ samples.map(_._2.gatkVariantcalling.scriptOutput.gvcfFile), newFile)) + List(newFile) + } 
else externalGvcfs ++ samples.map(_._2.gatkVariantcalling.scriptOutput.gvcfFile) + + if (!skipGenotyping && gvcfFiles.size > 0) { + if (jointGenotyping) { + val gatkGenotyping = new GatkGenotyping(this) + gatkGenotyping.inputGvcfs = gvcfFiles + gatkGenotyping.outputDir = outputDir + "genotyping/" + gatkGenotyping.init + gatkGenotyping.biopetScript + addAll(gatkGenotyping.functions) + var vcfFile = gatkGenotyping.outputFile } + } else logger.warn("No gVCFs to genotype") - if (!skipGenotyping && gvcfFiles.size > 0) { - if (jointGenotyping) { - val gatkGenotyping = new GatkGenotyping(this) - gatkGenotyping.inputGvcfs = gvcfFiles - gatkGenotyping.outputDir = outputDir + "genotyping/" - gatkGenotyping.init - gatkGenotyping.biopetScript - addAll(gatkGenotyping.functions) - var vcfFile = gatkGenotyping.outputFile - } - } else logger.warn("No gVCFs to genotype") - - if (jointVariantcalling) { - val allBamfiles = for ( - (sampleID, sampleOutput) <- samplesOutput; - file <- sampleOutput.variantcalling.bamFiles - ) yield file - val allRawVcfFiles = for ((sampleID, sampleOutput) <- samplesOutput) yield sampleOutput.variantcalling.rawFilterVcfFile - - val gatkVariantcalling = new GatkVariantcalling(this) { - override def configName = "gatkvariantcalling" - override def configPath: List[String] = "multisample" :: super.configPath - } + if (jointVariantcalling) { + val allBamfiles = samples.map(_._2.gatkVariantcalling.scriptOutput.bamFiles).toList.fold(Nil)(_ ++ _) + val allRawVcfFiles = samples.map(_._2.gatkVariantcalling.scriptOutput.rawVcfFile).filter(_ != null).toList - if (gatkVariantcalling.useMpileup) { - val cvRaw = CombineVariants(this, allRawVcfFiles.toList, outputDir + "variantcalling/multisample.raw.vcf.gz") - add(cvRaw) - gatkVariantcalling.rawVcfInput = cvRaw.out - } - - multisampleVariantcalling.preProcesBams = false - multisampleVariantcalling.doublePreProces = false - multisampleVariantcalling.inputBams = allBamfiles.toList - multisampleVariantcalling.outputDir = 
outputDir + "variantcalling" - multisampleVariantcalling.outputName = "multisample" - multisampleVariantcalling.init - multisampleVariantcalling.biopetScript - addAll(multisampleVariantcalling.functions) - - if (config("inputtype", default = "dna").asString != "rna" && config("recalibration", default = false).asBoolean) { - val recalibration = new GatkVariantRecalibration(this) - recalibration.inputVcf = multisampleVariantcalling.scriptOutput.finalVcfFile - recalibration.bamFiles = finalBamFiles - recalibration.outputDir = outputDir + "recalibration/" - recalibration.init - recalibration.biopetScript - } + val gatkVariantcalling = new GatkVariantcalling(this) { + override def configName = "gatkvariantcalling" + override def configPath: List[String] = super.configPath ::: "multisample" :: Nil } - } else for (sample <- onlySample) runSingleSampleJobs(sample) - } - // Called for each sample - def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = { - val sampleOutput = new SampleOutput - var libraryBamfiles: List[File] = List() - val sampleID: String = sampleConfig("ID").toString - sampleOutput.libraries = runLibraryJobs(sampleConfig) - val sampleDir = globalSampleDir + sampleID - for ((libraryID, libraryOutput) <- sampleOutput.libraries) { - libraryBamfiles ++= libraryOutput.variantcalling.bamFiles - } - - if (libraryBamfiles.size > 0) { - finalBamFiles ++= libraryBamfiles - val gatkVariantcalling = new GatkVariantcalling(this) - gatkVariantcalling.inputBams = libraryBamfiles - gatkVariantcalling.outputDir = sampleDir + "/variantcalling/" - gatkVariantcalling.preProcesBams = false - if (!singleSampleCalling) { - gatkVariantcalling.useHaplotypecaller = false - gatkVariantcalling.useUnifiedGenotyper = false + if (gatkVariantcalling.useMpileup) { + val cvRaw = CombineVariants(this, allRawVcfFiles.toList, outputDir + "variantcalling/multisample.raw.vcf.gz") + add(cvRaw) + gatkVariantcalling.rawVcfInput = cvRaw.out } - gatkVariantcalling.sampleID = 
sampleID - gatkVariantcalling.init - gatkVariantcalling.biopetScript - addAll(gatkVariantcalling.functions) - sampleOutput.variantcalling = gatkVariantcalling.scriptOutput - gvcfFiles :+= gatkVariantcalling.scriptOutput.gvcfFile - } else logger.warn("No bamfiles for variant calling for sample: " + sampleID) - return sampleOutput - } - // Called for each run from a sample - def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput = { - val libraryOutput = new LibraryOutput - val runID: String = runConfig("ID").toString - val sampleID: String = sampleConfig("ID").toString - val runDir: String = globalSampleDir + sampleID + "/run_" + runID + "/" - var inputType = "" - if (runConfig.contains("inputtype")) inputType = runConfig("inputtype").toString - else inputType = config("inputtype", default = "dna").toString - if (runConfig.contains("R1")) { - val mapping = Mapping.loadFromLibraryConfig(this, runConfig, sampleConfig, runDir) - addAll(mapping.functions) // Add functions of mapping to curent function pool - libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile") - } else if (runConfig.contains("bam")) { - var bamFile = new File(runConfig("bam").toString) - if (!bamFile.exists) throw new IllegalStateException("Bam in config does not exist, file: " + bamFile) - - if (config("bam_to_fastq", default = false).asBoolean) { - val samToFastq = SamToFastq(this, bamFile, runDir + sampleID + "-" + runID + ".R1.fastq", - runDir + sampleID + "-" + runID + ".R2.fastq") - add(samToFastq, isIntermediate = true) - val mapping = Mapping.loadFromLibraryConfig(this, runConfig, sampleConfig, runDir, startJobs = false) - mapping.input_R1 = samToFastq.fastqR1 - mapping.input_R2 = samToFastq.fastqR2 - mapping.init - mapping.biopetScript - addAll(mapping.functions) // Add functions of mapping to curent function pool - libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile") - } else { - var readGroupOke = true - val inputSam = 
SamReaderFactory.makeDefault.open(bamFile) - val header = inputSam.getFileHeader.getReadGroups - for (readGroup <- inputSam.getFileHeader.getReadGroups) { - if (readGroup.getSample != sampleID) logger.warn("Sample ID readgroup in bam file is not the same") - if (readGroup.getLibrary != runID) logger.warn("Library ID readgroup in bam file is not the same") - if (readGroup.getSample != sampleID || readGroup.getLibrary != runID) readGroupOke = false - } - inputSam.close - - if (!readGroupOke) { - if (config("correct_readgroups", default = false)) { - logger.info("Correcting readgroups, file:" + bamFile) - val aorrg = AddOrReplaceReadGroups(this, bamFile, new File(runDir + sampleID + "-" + runID + ".bam")) - aorrg.RGID = sampleID + "-" + runID - aorrg.RGLB = runID - aorrg.RGSM = sampleID - if (runConfig.contains("PL")) aorrg.RGPL = runConfig("PL").toString - else aorrg.RGPL = "illumina" - if (runConfig.contains("PU")) aorrg.RGPU = runConfig("PU").toString - else aorrg.RGPU = "na" - if (runConfig.contains("CN")) aorrg.RGCN = runConfig("CN").toString - add(aorrg, isIntermediate = true) - bamFile = aorrg.output - } else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile + - "\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatic fix this") - } - addAll(BamMetrics(this, bamFile, runDir + "metrics/").functions) - - libraryOutput.mappedBamFile = bamFile + multisampleVariantcalling.preProcesBams = false + multisampleVariantcalling.doublePreProces = false + multisampleVariantcalling.inputBams = allBamfiles.toList + multisampleVariantcalling.outputDir = outputDir + "variantcalling" + multisampleVariantcalling.outputName = "multisample" + multisampleVariantcalling.init + multisampleVariantcalling.biopetScript + addAll(multisampleVariantcalling.functions) + + if (config("inputtype", default = "dna").asString != "rna" && config("recalibration", default = false).asBoolean) { + 
val recalibration = new GatkVariantRecalibration(this) + recalibration.inputVcf = multisampleVariantcalling.scriptOutput.finalVcfFile + recalibration.bamFiles = allBamfiles + recalibration.outputDir = outputDir + "recalibration/" + recalibration.init + recalibration.biopetScript } - } else logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig) - - val gatkVariantcalling = new GatkVariantcalling(this) - gatkVariantcalling.inputBams = List(libraryOutput.mappedBamFile) - gatkVariantcalling.outputDir = runDir - gatkVariantcalling.variantcalling = config("library_variantcalling", default = false) - gatkVariantcalling.preProcesBams = true - gatkVariantcalling.sampleID = sampleID - gatkVariantcalling.init - gatkVariantcalling.biopetScript - addAll(gatkVariantcalling.functions) - libraryOutput.variantcalling = gatkVariantcalling.scriptOutput - - return libraryOutput + } } } diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala index c242539f4de05b7636b8b8615ae89297dfcc31cc..8bac4aaf68c33a245da877d460bc26abb9ebe564 100644 --- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala +++ b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala @@ -7,10 +7,11 @@ package nl.lumc.sasc.biopet.pipelines.gatk import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand } import java.io.File -import nl.lumc.sasc.biopet.tools.{ MpileupToVcf, VcfFilter, MergeAlleles } +import nl.lumc.sasc.biopet.tools.{ VcfStats, MpileupToVcf, VcfFilter, MergeAlleles } import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.extensions.gatk.{ AnalyzeCovariates, BaseRecalibrator, GenotypeGVCFs, HaplotypeCaller, IndelRealigner, PrintReads, RealignerTargetCreator, SelectVariants, 
CombineVariants, UnifiedGenotyper } import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates +import nl.lumc.sasc.biopet.utils.ConfigUtils import org.broadinstitute.gatk.queue.QScript import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile import org.broadinstitute.gatk.utils.commandline.{ Input, Argument } @@ -31,9 +32,6 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr @Argument(doc = "Reference", shortName = "R", required = false) var reference: File = config("reference", required = true) - @Argument(doc = "Dbsnp", shortName = "dbsnp", required = false) - var dbsnp: File = config("dbsnp") - @Argument(doc = "OutputName", required = false) var outputName: String = _ @@ -52,7 +50,7 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr def init() { if (outputName == null && sampleID != null) outputName = sampleID - else if (outputName == null) outputName = "noname" + else if (outputName == null) outputName = config("output_name", default = "noname") if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module") else if (!outputDir.endsWith("/")) outputDir += "/" @@ -68,7 +66,8 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr if (files.isEmpty) throw new IllegalStateException("Files can't be empty") if (!doublePreProces.get) return files val markDup = MarkDuplicates(this, files, new File(outputDir + outputName + ".dedup.bam")) - add(markDup, isIntermediate = useIndelRealigner) + markDup.isIntermediate = useIndelRealigner + add(markDup) if (useIndelRealigner) { List(addIndelRealign(markDup.output, outputDir, isIntermediate = false)) } else { @@ -141,11 +140,13 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr add(m2v) scriptOutput.rawVcfFile = m2v.output - val vcfFilter = new VcfFilter(this) - vcfFilter.defaults ++= Map("min_sample_depth" -> 8, - "min_alternate_depth" -> 2, - 
"min_samples_pass" -> 1, - "filter_ref_calls" -> true) + val vcfFilter = new VcfFilter(this) { + override def defaults = ConfigUtils.mergeMaps(Map("min_sample_depth" -> 8, + "min_alternate_depth" -> 2, + "min_samples_pass" -> 1, + "filter_ref_calls" -> true + ), super.defaults) + } vcfFilter.inputVcf = m2v.output vcfFilter.outputVcf = this.swapExt(outputDir, m2v.output, ".vcf", ".filter.vcf.gz") add(vcfFilter) @@ -157,7 +158,8 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr // Allele mode if (useAllelesOption.get) { val mergeAlleles = MergeAlleles(this, mergeList.toList, outputDir + "raw.allele__temp_only.vcf.gz") - add(mergeAlleles, isIntermediate = true) + mergeAlleles.isIntermediate = true + add(mergeAlleles) if (useHaplotypecaller.get) { val hcAlleles = new HaplotypeCaller(this) @@ -187,23 +189,32 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr val sv = SelectVariants(this, input, output) sv.excludeFiltered = true sv.excludeNonVariants = true - add(sv, isIntermediate = true) + sv.isIntermediate = true + add(sv) sv.out } val cvFinal = CombineVariants(this, mergeList.toList, outputDir + outputName + ".final.vcf.gz") cvFinal.genotypemergeoption = org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils.GenotypeMergeType.UNSORTED add(cvFinal) + + val vcfStats = new VcfStats(this) + vcfStats.input = cvFinal.out + vcfStats.setOutputDir(outputDir + File.separator + "vcfstats") + add(vcfStats) + scriptOutput.finalVcfFile = cvFinal.out } } def addIndelRealign(inputBam: File, dir: String, isIntermediate: Boolean = true): File = { val realignerTargetCreator = RealignerTargetCreator(this, inputBam, dir) - add(realignerTargetCreator, isIntermediate = true) + realignerTargetCreator.isIntermediate = true + add(realignerTargetCreator) val indelRealigner = IndelRealigner.apply(this, inputBam, realignerTargetCreator.out, dir) - add(indelRealigner, isIntermediate = isIntermediate) + 
indelRealigner.isIntermediate = isIntermediate + add(indelRealigner) return indelRealigner.o } @@ -227,7 +238,8 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr val printReads = PrintReads(this, inputBam, swapExt(dir, inputBam, ".bam", ".baserecal.bam")) printReads.BQSR = baseRecalibrator.o - add(printReads, isIntermediate = isIntermediate) + printReads.isIntermediate = isIntermediate + add(printReads) return printReads.o } diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVcfSampleCompare.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVcfSampleCompare.scala deleted file mode 100644 index c3f5add4b52092e607e1c4975745e7f58793dd08..0000000000000000000000000000000000000000 --- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVcfSampleCompare.scala +++ /dev/null @@ -1,87 +0,0 @@ -/** - * Due to the license issue with GATK, this part of Biopet can only be used inside the - * LUMC. 
Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions - * on how to use this protected part of biopet or contact us at sasc@lumc.nl - */ -package nl.lumc.sasc.biopet.pipelines.gatk - -import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand } -import java.io.File -import nl.lumc.sasc.biopet.core.config.Configurable -import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants -import nl.lumc.sasc.biopet.extensions.gatk.SelectVariants -import nl.lumc.sasc.biopet.extensions.gatk.VariantEval -import org.broadinstitute.gatk.queue.QScript -import org.broadinstitute.gatk.utils.commandline.{ Input, Argument } - -class GatkVcfSampleCompare(val root: Configurable) extends QScript with BiopetQScript { - def this() = this(null) - - @Input(doc = "Sample vcf file(s)", shortName = "V") - var vcfFiles: List[File] = _ - - @Argument(doc = "Reference", shortName = "R", required = false) - var reference: File = config("reference") - - @Argument(doc = "Target bed", shortName = "targetBed", required = false) - var targetBed: List[File] = Nil - - @Argument(doc = "Samples", shortName = "sample", required = false) - var samples: List[String] = Nil - - var vcfFile: File = _ - var sampleVcfs: Map[String, File] = Map() - def generalSampleDir = outputDir + "samples/" - - def init() { - if (config.contains("target_bed")) - for (bed <- config("target_bed").asList) - targetBed :+= bed.toString - if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module") - else if (!outputDir.endsWith("/")) outputDir += "/" - } - - def biopetScript() { - vcfFile = if (vcfFiles.size > 1) { - val combineVariants = CombineVariants(this, vcfFiles, outputDir + "merge.vcf") - add(combineVariants) - combineVariants.out - } else vcfFiles.head - - for (sample <- samples) { - sampleVcfs += (sample -> new File(generalSampleDir + sample + File.separator + sample + ".vcf")) - val selectVariants = SelectVariants(this, vcfFile, sampleVcfs(sample)) - 
selectVariants.sample_name = Seq(sample) - selectVariants.excludeNonVariants = true - add(selectVariants) - } - - val sampleCompareMetrics = new SampleCompareMetrics(this) - sampleCompareMetrics.samples = samples - sampleCompareMetrics.sampleDir = generalSampleDir - sampleCompareMetrics.snpRelFile = outputDir + "compare.snp.rel.tsv" - sampleCompareMetrics.snpAbsFile = outputDir + "compare.snp.abs.tsv" - sampleCompareMetrics.indelRelFile = outputDir + "compare.indel.rel.tsv" - sampleCompareMetrics.indelAbsFile = outputDir + "compare.indel.abs.tsv" - sampleCompareMetrics.totalFile = outputDir + "total.tsv" - - for ((sample, sampleVcf) <- sampleVcfs) { - val sampleDir = generalSampleDir + sample + File.separator - for ((compareSample, compareSampleVcf) <- sampleVcfs) { - val variantEval = VariantEval(this, - sampleVcf, - compareSampleVcf, - new File(sampleDir + sample + "-" + compareSample + ".eval.txt"), - Seq("VariantType", "CompRod"), - Seq("CompOverlap") - ) - if (targetBed != null) variantEval.L = targetBed - add(variantEval) - sampleCompareMetrics.deps ::= variantEval.out - } - } - add(sampleCompareMetrics) - } -} - -object GatkVcfSampleCompare extends PipelineCommand diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/SampleCompareMetrics.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/SampleCompareMetrics.scala deleted file mode 100644 index 861455fe4d886219813409023d022ea096f413c9..0000000000000000000000000000000000000000 --- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/SampleCompareMetrics.scala +++ /dev/null @@ -1,153 +0,0 @@ -/** - * Due to the license issue with GATK, this part of Biopet can only be used inside the - * LUMC. 
Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions - * on how to use this protected part of biopet or contact us at sasc@lumc.nl - */ -package nl.lumc.sasc.biopet.pipelines.gatk - -import java.io.File -import java.io.PrintWriter -import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction -import nl.lumc.sasc.biopet.core.config.Configurable -import org.broadinstitute.gatk.utils.R.RScriptExecutor -import org.broadinstitute.gatk.utils.commandline.{ Output, Argument } -import scala.io.Source -import org.broadinstitute.gatk.utils.R.{ RScriptLibrary, RScriptExecutor } -import org.broadinstitute.gatk.utils.io.Resource -import scala.collection.mutable.Map -import scala.math._ - -class SampleCompareMetrics(val root: Configurable) extends BiopetJavaCommandLineFunction { - javaMainClass = getClass.getName - - @Argument(doc = "Sample Dir", shortName = "sampleDir", required = true) - var sampleDir: String = _ - - @Argument(doc = "Samples", shortName = "sample", required = true) - var samples: List[String] = Nil - - @Argument(doc = "File sufix", shortName = "sufix", required = false) - var fileSufix: String = _ - - @Output(doc = "snpRelFile", shortName = "snpRelFile", required = true) - var snpRelFile: File = _ - - @Output(doc = "snpAbsFile", shortName = "snpAbsFile", required = true) - var snpAbsFile: File = _ - - @Output(doc = "indelRelFile", shortName = "indelRelFile", required = true) - var indelRelFile: File = _ - - @Output(doc = "indelAbsFile", shortName = "indelAbsFile", required = true) - var indelAbsFile: File = _ - - @Output(doc = "totalFile", shortName = "totalFile", required = true) - var totalFile: File = _ - - override val defaultVmem = "8G" - memoryLimit = Option(4.0) - - override def commandLine = super.commandLine + - required("-sampleDir", sampleDir) + - repeat("-sample", samples) + - optional("-fileSufix", fileSufix) + - required("-snpRelFile", snpRelFile) + - required("-snpAbsFile", snpAbsFile) + - required("-indelRelFile", 
indelRelFile) + - required("-indelAbsFile", indelAbsFile) + - required("-totalFile", totalFile) -} - -object SampleCompareMetrics { - var sampleDir: String = _ - var samples: List[String] = Nil - var fileSufix: String = ".eval.txt" - var snpRelFile: File = _ - var snpAbsFile: File = _ - var indelRelFile: File = _ - var indelAbsFile: File = _ - var totalFile: File = _ - /** - * @param args the command line arguments - */ - def main(args: Array[String]): Unit = { - - for (t <- 0 until args.size) { - args(t) match { - case "-sample" => samples +:= args(t + 1) - case "-sampleDir" => sampleDir = args(t + 1) - case "-fileSufix" => fileSufix = args(t + 1) - case "-snpRelFile" => snpRelFile = new File(args(t + 1)) - case "-snpAbsFile" => snpAbsFile = new File(args(t + 1)) - case "-indelRelFile" => indelRelFile = new File(args(t + 1)) - case "-indelAbsFile" => indelAbsFile = new File(args(t + 1)) - case "-totalFile" => totalFile = new File(args(t + 1)) - case _ => - } - } - if (sampleDir == null) throw new IllegalStateException("No sampleDir, use -sampleDir") - else if (!sampleDir.endsWith("/")) sampleDir += "/" - - val regex = """\W+""".r - val snpsOverlap: Map[(String, String), Int] = Map() - val indelsOverlap: Map[(String, String), Int] = Map() - val snpsTotal: Map[String, Int] = Map() - val indelsTotal: Map[String, Int] = Map() - for (sample1 <- samples; sample2 <- samples) { - val reader = Source.fromFile(new File(sampleDir + sample1 + "/" + sample1 + "-" + sample2 + fileSufix)) - for (line <- reader.getLines) { - regex.split(line) match { - case Array(_, _, _, varType, all, novel, overlap, rate, _*) => { - varType match { - case "SNP" => { - snpsOverlap += (sample1, sample2) -> overlap.toInt - snpsTotal += sample1 -> all.toInt - } - case "INDEL" => { - indelsOverlap += (sample1, sample2) -> overlap.toInt - indelsTotal += sample1 -> all.toInt - } - case _ => - } - } - case _ => - } - } - reader.close() - } - - val snpRelWritter = new PrintWriter(snpRelFile) - val 
snpAbsWritter = new PrintWriter(snpAbsFile) - val indelRelWritter = new PrintWriter(indelRelFile) - val indelAbsWritter = new PrintWriter(indelAbsFile) - - val allWritters = List(snpRelWritter, snpAbsWritter, indelRelWritter, indelAbsWritter) - for (writter <- allWritters) writter.println(samples.mkString("\t", "\t", "")) - for (sample1 <- samples) { - for (writter <- allWritters) writter.print(sample1) - for (sample2 <- samples) { - snpRelWritter.print("\t" + (round((snpsOverlap(sample1, sample2).toDouble / snpsTotal(sample1) * 10000.0)) / 10000.0)) - snpAbsWritter.print("\t" + snpsOverlap(sample1, sample2)) - indelRelWritter.print("\t" + (round((indelsOverlap(sample1, sample2).toDouble / indelsTotal(sample1) * 10000.0)) / 10000.0)) - indelAbsWritter.print("\t" + indelsOverlap(sample1, sample2)) - } - for (writter <- allWritters) writter.println() - } - for (writter <- allWritters) writter.close() - - val totalWritter = new PrintWriter(totalFile) - totalWritter.println("Sample\tSNPs\tIndels") - for (sample <- samples) - totalWritter.println(sample + "\t" + snpsTotal(sample) + "\t" + indelsTotal(sample)) - totalWritter.close() - - def plot(file: File) { - val executor = new RScriptExecutor - executor.addScript(new Resource("plotHeatmap.R", getClass)) - executor.addArgs(file, file.getAbsolutePath.stripSuffix(".tsv") + ".png", file.getAbsolutePath.stripSuffix(".tsv") + ".clustering.png") - executor.exec() - } - plot(snpRelFile) - plot(indelRelFile) - } -} \ No newline at end of file diff --git a/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala b/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala index 9457fd36cfab986c5fdc9d5cbc29836ced4e0962..902e292fca017947affb287249ccff764e943100 100644 --- a/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala +++ 
b/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala @@ -12,7 +12,6 @@ object BiopetExecutableProtected extends BiopetExecutable { nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantcalling, nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline, nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantRecalibration, - nl.lumc.sasc.biopet.pipelines.gatk.GatkVcfSampleCompare, nl.lumc.sasc.biopet.pipelines.basty.Basty) def tools = BiopetExecutablePublic.tools diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/pipelines/gatk/plotHeatmap.R b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/pipelines/gatk/plotHeatmap.R deleted file mode 100644 index 4158db708d58c8cc19b535dcfe871c626fa51ad6..0000000000000000000000000000000000000000 --- a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/pipelines/gatk/plotHeatmap.R +++ /dev/null @@ -1,24 +0,0 @@ -library('gplots') - -args <- commandArgs(TRUE) -inputArg <- args[1] -outputArg <- args[2] -outputArgClustering <- args[3] - -col <- heat.colors(250) -col[250] <- "#00FF00" - -heat<-read.table(inputArg, header = 1, sep= '\t') -rownames(heat) <- heat[,1] -heat<- heat[,-1] - -heat<- as.matrix(heat) - -png(file = outputArg, width = 1000, height = 1000) -heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none") -dev.off() - - -png(file = outputArgClustering, width = 1000, height = 1000) -heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row") -dev.off() diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/sync_paired_end_reads.py b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/sync_paired_end_reads.py deleted file mode 100644 index be4ab0035b7254cfbfd70b18c272cca1fd8c47dd..0000000000000000000000000000000000000000 --- a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/sync_paired_end_reads.py +++ /dev/null @@ -1,126 +0,0 @@ 
-#!/usr/bin/env python -# -# Biopet is built on top of GATK Queue for building bioinformatic -# pipelines. It is mainly intended to support LUMC SHARK cluster which is running -# SGE. But other types of HPC that are supported by GATK Queue (such as PBS) -# should also be able to execute Biopet tools and pipelines. -# -# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center -# -# Contact us at: sasc@lumc.nl -# -# A dual licensing mode is applied. The source code within this project that are -# not part of GATK Queue is freely available for non-commercial use under an AGPL -# license; For commercial users or users who do not want to follow the AGPL -# license, please contact us to obtain a separate license. -# - -""" -(Re-)sync two filtered paired end FASTQ files. - -Given two filtered paired end read files and one of the original read files, -re-sync the filtered reads by filtering out anything that is only present in -one of the two files. - -Usage: - {command} <orig.fq> <reads_1.fq> <reads_2.fq> \\ - <reads_1.synced.fq> <reads_2.synced.fq> - -The synced reads are written to disk as <reads_1.synced.fq> and -<reads_2.synced.fq>. Afterwards some counts are printed. - - -Both Illumina old-style and new-style paired-end header lines are supported. - -The original read file is used to speed up processing: it contains all -possible reads from both edited reads (in all files in the same order) so it -can process all files line by line, not having to read a single file in -memory. Some ideas were taken from [1]. - -[1] https://gist.github.com/588841/ - -2011-11-03, Martijn Vermaat <m.vermaat.hg@lumc.nl> -""" - - -import sys -import re - - -def sync_paired_end_reads(original, reads_a, reads_b, synced_a, synced_b): - """ - Filter out reads from two paired end read files that are not present in - both of them. Do this in a reasonable amount of time by using a file - containing all of the reads for one of the paired ends. 
- - All arguments are open file handles. - - @arg original: File containing all original reads for one of the paired - ends. - @arg reads_a: First from paired end read files. - @arg reads_b: Second from paired end read files. - @arg synced_a: Filtered reads from first paired end read file. - @arg synced_b: Filtered reads from second paired end read file. - - @return: Triple (filtered_a, filtered_b, kept) containing counts - of the number of reads filtered from both input files and - the total number of reads kept in the synced results. - - @todo: Print warnings if obvious things are not right (a or b still has - lines after original is processed). - """ - # This matches 1, 2, or 3 preceded by / _ or whitespace. Its rightmost - # match in a header line is used to identify the read pair. - sep = re.compile('[\s_/][123]') - - def next_record(fh): - return [fh.readline().strip() for i in range(4)] - - def head(record): - return sep.split(record[0])[:-1] - - headers = (sep.split(x.strip())[:-1] for i, x in enumerate(original) - if not (i % 4)) - - filtered_a = filtered_b = kept = 0 - - a, b = next_record(reads_a), next_record(reads_b) - - for header in headers: - if header == head(a) and head(b) != header: - a = next_record(reads_a) - filtered_a += 1 - - if header == head(b) and head(a) != header: - b = next_record(reads_b) - filtered_b += 1 - - if header == head(a) == head(b): - print >>synced_a, '\n'.join(a) - print >>synced_b, '\n'.join(b) - a, b = next_record(reads_a), next_record(reads_b) - kept += 1 - - return filtered_a, filtered_b, kept - - -if __name__ == '__main__': - if len(sys.argv) < 6: - sys.stderr.write(__doc__.split('\n\n\n')[0].strip().format( - command=sys.argv[0]) + '\n') - sys.exit(1) - try: - original = open(sys.argv[1], 'r') - reads_a = open(sys.argv[2], 'r') - reads_b = open(sys.argv[3], 'r') - synced_a = open(sys.argv[4], 'w') - synced_b = open(sys.argv[5], 'w') - filtered_a, filtered_b, kept = \ - sync_paired_end_reads(original, reads_a, 
reads_b, - synced_a, synced_b) - print 'Filtered %i reads from first read file.' % filtered_a - print 'Filtered %i reads from second read file.' % filtered_b - print 'Synced read files contain %i reads.' % kept - except IOError as (_, message): - sys.stderr.write('Error: %s\n' % message) - sys.exit(1) diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R new file mode 100644 index 0000000000000000000000000000000000000000..7f7237e90f6593e3d6cf110da005cd89c154d466 --- /dev/null +++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R @@ -0,0 +1,35 @@ +library('gplots') +library('RColorBrewer') + +args <- commandArgs(TRUE) +inputArg <- args[1] +outputArg <- args[2] +outputArgClustering <- args[3] +outputArgDendrogram <- args[4] + + +heat<-read.table(inputArg, header = 1, sep= '\t', stringsAsFactors = F) +#heat[heat==1] <- NA +rownames(heat) <- heat[,1] +heat<- heat[,-1] +heat<- as.matrix(heat) + +colNumber <- 50 +col <- rev(colorRampPalette(brewer.pal(11, "Spectral"))(colNumber)) +for (i in (colNumber+1):(colNumber+round((dist(range(heat)) - dist(range(heat[heat < 1]))) / dist(range(heat[heat < 1])) * colNumber))) { + col[i] <- col[colNumber] +} +col[length(col)] <- "#00FF00" + +png(file = outputArg, width = 1200, height = 1200) +heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none", margins = c(12, 12), na.color="#00FF00") +dev.off() + +hc <- hclust(d = dist(heat)) +png(file = outputArgDendrogram, width = 1200, height = 1200) +plot(as.dendrogram(hc), horiz=TRUE, asp=0.02) +dev.off() + +png(file = outputArgClustering, width = 1200, height = 1200) +heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row",margins = c(12, 12), na.color="#00FF00") +dev.off() diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotXY.R 
b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotXY.R new file mode 100644 index 0000000000000000000000000000000000000000..63fd7b03262d94094a9f151b22cca812f10cee1f --- /dev/null +++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotXY.R @@ -0,0 +1,20 @@ +library('ggplot2') +library('reshape2') + +args <- commandArgs(TRUE) +inputArg <- args[1] +outputArg <- args[2] + +tsv<-read.table(inputArg, header = 1, sep= '\t', stringsAsFactors = F) + +data <- melt(tsv) + +data$X <- as.numeric(data$X) +data <- na.omit(data) +data <- data[data$value > 0,] + +print("Starting to plot") +png(file = outputArg, width = 1500, height = 1500) +ggplot(data, aes(x=X, y=value, color=variable, group=variable)) + geom_line() +dev.off() +print("plot done") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala index d3474814245d690ce71fc63203e8fa03d3149ff3..4a5b25f7a5b816fe32ad39897a334a6e13fe1eda 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala @@ -15,13 +15,11 @@ */ package nl.lumc.sasc.biopet.core -//import java.io.BufferedInputStream import java.io.File import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.queue.QException import org.broadinstitute.gatk.queue.function.CommandLineFunction import org.broadinstitute.gatk.utils.commandline.{ Input, Argument } -//import scala.io.Source import scala.sys.process.{ Process, ProcessLogger } import scala.util.matching.Regex import java.io.FileInputStream @@ -38,7 +36,7 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab val defaultThreads = 1 @Argument(doc = "Vmem", required = false) - var vmem: String = _ + 
var vmem: Option[String] = None val defaultVmem: String = "" @Argument(doc = "Executable", required = false) @@ -53,17 +51,17 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab override def freezeFieldValues() { checkExecutable afterGraph - jobOutputFile = new File(firstOutput.getParent + "/." + firstOutput.getName + "." + configName + ".out") + if (jobOutputFile == null) jobOutputFile = new File(firstOutput.getParent + "/." + firstOutput.getName + "." + configName + ".out") if (threads == 0) threads = getThreads(defaultThreads) if (threads > 1) nCoresRequest = Option(threads) - if (vmem == null) { + if (vmem.isEmpty) { vmem = config("vmem") - if (vmem == null && !defaultVmem.isEmpty) vmem = defaultVmem + if (vmem.isEmpty && defaultVmem.nonEmpty) vmem = Some(defaultVmem) } - if (vmem != null) jobResourceRequests :+= "h_vmem=" + vmem - jobName = configName + ":" + firstOutput.getName + if (vmem.isDefined) jobResourceRequests :+= "h_vmem=" + vmem.get + jobName = configName + ":" + (if (firstOutput != null) firstOutput.getName else jobOutputFile) super.freezeFieldValues() } @@ -99,14 +97,13 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab val temp = MessageDigest.getInstance("MD5").digest(bytes).map("%02X".format(_)).mkString.toLowerCase BiopetCommandLineFunctionTrait.executableMd5Cache += executable -> temp } - - addJobReportBinding("md5sum_exe", BiopetCommandLineFunctionTrait.executableMd5Cache(executable)) } catch { case ioe: java.io.IOException => logger.warn("Could not use 'which', check on executable skipped: " + ioe) } - } else { - addJobReportBinding("md5sum_exe", BiopetCommandLineFunctionTrait.executableMd5Cache(executable)) } + val md5 = BiopetCommandLineFunctionTrait.executableMd5Cache(executable) + if (md5 == null) addJobReportBinding("md5sum_exe", md5) + else addJobReportBinding("md5sum_exe", "None") } final protected def preCmdInternal { diff --git 
a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala index 7b7dda2135b92747167d12898958bd2c6680c1d6..1a3f565880dc3a2bd5450efd86cfa188e647eef4 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala @@ -17,7 +17,7 @@ package nl.lumc.sasc.biopet.core import java.io.File import java.io.PrintWriter -import nl.lumc.sasc.biopet.core.config.{ Config, Configurable } +import nl.lumc.sasc.biopet.core.config.{ ConfigValueIndex, Config, Configurable } import org.broadinstitute.gatk.utils.commandline.Argument import org.broadinstitute.gatk.queue.QSettings import org.broadinstitute.gatk.queue.function.QFunction @@ -29,11 +29,17 @@ trait BiopetQScript extends Configurable with GatkLogging { @Argument(doc = "JSON config file(s)", fullName = "config_file", shortName = "config", required = false) val configfiles: List[File] = Nil - @Argument(doc = "Output directory", fullName = "output_directory", shortName = "outDir", required = true) - var outputDir: String = _ + var outputDir: String = { + val temp = Config.getValueFromMap(Config.global.map, ConfigValueIndex(this.configName, configPath, "output_dir")) + if (temp.isEmpty) throw new IllegalArgumentException("No output_dir defined in config") + else { + val t = temp.get.value.toString + if (!t.endsWith("/")) t + "/" else t + } + } @Argument(doc = "Disable all scatters", shortName = "DSC", required = false) - var disableScatterDefault: Boolean = false + var disableScatter: Boolean = false var outputFiles: Map[String, File] = Map() @@ -45,11 +51,12 @@ trait BiopetQScript extends Configurable with GatkLogging { var functions: Seq[QFunction] final def script() { + outputDir = config("output_dir", required = true) if (!outputDir.endsWith("/")) outputDir += "/" init biopetScript - if 
(disableScatterDefault) for (function <- functions) function match { + if (disableScatter) for (function <- functions) function match { case f: ScatterGatherableFunction => f.scatterCount = 1 case _ => } @@ -57,13 +64,8 @@ trait BiopetQScript extends Configurable with GatkLogging { case f: BiopetCommandLineFunctionTrait => f.afterGraph case _ => } - val configReport = Config.global.getReport - val configReportFile = new File(outputDir + qSettings.runName + ".configreport.txt") - configReportFile.getParentFile.mkdir - val writer = new PrintWriter(configReportFile) - writer.write(configReport) - writer.close() - for (line <- configReport.split("\n")) logger.debug(line) + + Config.global.writeReport(qSettings.runName, outputDir + ".log/" + qSettings.runName) } def add(functions: QFunction*) // Gets implemeted at org.broadinstitute.sting.queue.QScript diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala index fb5c512635c435c902ffc8018642ca5069a7d023..eb76502c8593a9e970ea2c95b9152569dfffb389 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala @@ -15,166 +15,150 @@ */ package nl.lumc.sasc.biopet.core -import nl.lumc.sasc.biopet.core.config.{ ConfigValue, Config, Configurable } -import nl.lumc.sasc.biopet.utils.ConfigUtils._ +import java.io.File +import nl.lumc.sasc.biopet.core.config.{ Config } +import nl.lumc.sasc.biopet.utils.ConfigUtils +import org.broadinstitute.gatk.utils.commandline.{ Argument } + +/** + * This trait creates a structured way of use multisample pipelines + */ trait MultiSampleQScript extends BiopetQScript { - type LibraryOutput <: AbstractLibraryOutput - type SampleOutput <: AbstractSampleOutput - - abstract class AbstractLibraryOutput - abstract class 
AbstractSampleOutput { - var libraries: Map[String, LibraryOutput] = Map() - def getAllLibraries = libraries - def getLibrary(key: String) = libraries(key) - } + @Argument(doc = "Only Sample", shortName = "sample", required = false) + private val onlySamples: List[String] = Nil - if (!config.contains("samples")) logger.warn("No Samples found in config") + require(Config.global.map.contains("samples"), "No Samples found in config") /** - * Returns a map with all sample configs + * Sample class with basic functions build in + * @param sampleId */ - val getSamplesConfig: Map[String, Any] = config("samples", default = Map()) + abstract class AbstractSample(val sampleId: String) { + /** Overrules config of qscript with default sample */ + val config = new ConfigFunctions(defaultSample = sampleId) + + /** + * Library class with basic functions build in + * @param libId + */ + abstract class AbstractLibrary(val libId: String) { + /** Overrules config of qscript with default sample and default library */ + val config = new ConfigFunctions(defaultSample = sampleId, defaultLibrary = libId) + + /** Adds the library jobs */ + final def addAndTrackJobs(): Unit = { + currentSample = Some(sampleId) + currentLib = Some(libId) + addJobs() + currentLib = None + currentSample = None + } - /** - * Returns a list of all sampleIDs - */ - val getSamples: Set[String] = getSamplesConfig.keySet + /** Creates a library file with given suffix */ + def createFile(suffix: String): File = new File(libDir, sampleId + "-" + libId + suffix) - /** - * Returns the global sample directory - * @return global sample directory - */ - def globalSampleDir: String = outputDir + "samples/" + /** Returns library directory */ + def libDir = sampleDir + "lib_" + libId + File.separator - var samplesOutput: Map[String, SampleOutput] = Map() + /** Function that add library jobs */ + protected def addJobs() + } - /** - * Runs runSingleSampleJobs method for each sample - */ - final def runSamplesJobs() { - for 
((key, value) <- getSamplesConfig) { - var sample = any2map(value) - if (!sample.contains("ID")) sample += ("ID" -> key) - if (sample("ID") == key) { - currentSample = key - samplesOutput += key -> runSingleSampleJobs(sample) - currentSample = null - } else logger.warn("Key is not the same as ID on value for sample") + /** Library type, need implementation in pipeline */ + type Library <: AbstractLibrary + + /** Stores all libraries */ + val libraries: Map[String, Library] = libIds.map(id => id -> makeLibrary(id)).toMap + + /** + * Factory method for Library class + * @param id SampleId + * @return Sample class + */ + def makeLibrary(id: String): Library + + /** returns a set with library names */ + protected def libIds: Set[String] = { + ConfigUtils.getMapFromPath(Config.global.map, List("samples", sampleId, "libraries")).getOrElse(Map()).keySet } - } - def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput + /** Adds sample jobs */ + final def addAndTrackJobs(): Unit = { + currentSample = Some(sampleId) + addJobs() + currentSample = None + } - /** - * Run sample with only sampleID - * @param sample sampleID - * @return - */ - def runSingleSampleJobs(sample: String): SampleOutput = { - var map = any2map(getSamplesConfig(sample)) - if (map.contains("ID") && map("ID") != sample) - throw new IllegalStateException("ID in config not the same as the key") - else map += ("ID" -> sample) - return runSingleSampleJobs(map) - } + /** Function to add sample jobs */ + protected def addJobs() - /** - * Runs runSingleLibraryJobs method for each library found in sampleConfig - * @param sampleConfig sample config - * @return Map with libraryID -> LibraryOutput object - */ - final def runLibraryJobs(sampleConfig: Map[String, Any]): Map[String, LibraryOutput] = { - var output: Map[String, LibraryOutput] = Map() - val sampleID = sampleConfig("ID").toString - if (sampleConfig.contains("libraries")) { - val runs = any2map(sampleConfig("libraries")) - for ((key, value) <- 
runs) { - var library = any2map(value) - if (!library.contains("ID")) library += ("ID" -> key) - if (library("ID") == key) { - currentLibrary = key - output += key -> runSingleLibraryJobs(library, sampleConfig) - currentLibrary = null - } else logger.warn("Key is not the same as ID on value for run of sample: " + sampleID) + /** function add all libraries in one call */ + protected final def addPerLibJobs(): Unit = { + for ((libId, library) <- libraries) { + library.addAndTrackJobs() } - } else logger.warn("No runs found in config for sample: " + sampleID) - return output - } - def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput + } - protected var currentSample: String = null - protected var currentLibrary: String = null + /** + * Creates a sample file with given suffix + * @param suffix + * @return + */ + def createFile(suffix: String) = new File(sampleDir, sampleId + suffix) - /** - * Set current sample manual, only use this when not using runSamplesJobs method - * @param sample - */ - def setCurrentSample(sample: String) { - logger.debug("Manual sample set to: " + sample) - currentSample = sample + /** Returns sample directory */ + def sampleDir = outputDir + "samples" + File.separator + sampleId + File.separator } - /** - * Gets current sample - * @return current sample - */ - def getCurrentSample = currentSample + /** Sample type, need implementation in pipeline */ + type Sample <: AbstractSample /** - * Reset current sample manual, only use this when not using runSamplesJobs method + * Factory method for Sample class + * @param id SampleId + * @return Sample class */ - def resetCurrentSample() { - logger.debug("Manual sample reset") - currentSample = null + def makeSample(id: String): Sample + + /** Stores all samples */ + val samples: Map[String, Sample] = sampleIds.map(id => id -> makeSample(id)).toMap + + /** Returns a list of all sampleIDs */ + protected def sampleIds: Set[String] = 
ConfigUtils.any2map(Config.global.map("samples")).keySet + + /** Runs addAndTrackJobs method for each sample */ + final def addSamplesJobs() { + if (onlySamples.isEmpty) { + samples.foreach { case (sampleId, sample) => sample.addAndTrackJobs() } + addMultiSampleJobs() + } else onlySamples.foreach(sampleId => samples.get(sampleId) match { + case Some(sample) => sample.addAndTrackJobs() + case None => logger.warn("sampleId '" + sampleId + "' not found") + }) } /** - * Set current library manual, only use this when not using runLibraryJobs method - * @param library + * Method where the multisample jobs should be added, this will be executed only when running the -sample argument is not given */ - def setCurrentLibrary(library: String) { - logger.debug("Manual library set to: " + library) - currentLibrary = library - } + def addMultiSampleJobs() - /** - * Gets current library - * @return current library - */ - def getCurrentLibrary = currentLibrary + /** Stores sample state */ + private var currentSample: Option[String] = None - /** - * Reset current library manual, only use this when not using runLibraryJobs method - */ - def resetCurrentLibrary() { - logger.debug("Manual library reset") - currentLibrary = null - } + /** Stores library state */ + private var currentLib: Option[String] = None + /** Prefix full path with sample and library for jobs that's are created in current state */ override protected[core] def configFullPath: List[String] = { - (if (currentSample != null) "samples" :: currentSample :: Nil else Nil) ::: - (if (currentLibrary != null) "libraries" :: currentLibrary :: Nil else Nil) ::: - super.configFullPath - } - - protected class ConfigFunctions extends super.ConfigFunctions { - override def apply(key: String, - default: Any = null, - submodule: String = null, - required: Boolean = false, - freeVar: Boolean = true, - sample: String = currentSample, - library: String = currentLibrary): ConfigValue = { - super.apply(key, default, submodule, required, 
freeVar, sample, library) + val s = currentSample match { + case Some(s) => "samples" :: s :: Nil + case _ => Nil } - - override def contains(key: String, - submodule: String = null, - freeVar: Boolean = true, - sample: String = currentSample, - library: String = currentLibrary) = { - super.contains(key, submodule, freeVar, sample, library) + val l = currentLib match { + case Some(l) => "libraries" :: l :: Nil + case _ => Nil } + s ::: l ::: super.configFullPath } } diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/PipelineCommand.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/PipelineCommand.scala index 94d5b4cc5d18759f97641ae01a7cb0dd64078dc6..60f7112525a92597c8e88c480e87c43bb228105a 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/PipelineCommand.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/PipelineCommand.scala @@ -31,6 +31,20 @@ trait PipelineCommand extends MainCommand with GatkLogging { if (t >= argsSize) throw new IllegalStateException("-config needs a value") Config.global.loadConfigFile(new File(args(t + 1))) } + if (args(t) == "--logging_level" || args(t) == "-l") { + args(t + 1).toLowerCase match { + case "debug" => Logging.logger.setLevel(org.apache.log4j.Level.DEBUG) + case "info" => Logging.logger.setLevel(org.apache.log4j.Level.INFO) + case "warn" => Logging.logger.setLevel(org.apache.log4j.Level.WARN) + case "error" => Logging.logger.setLevel(org.apache.log4j.Level.ERROR) + case _ => + } + } + } + for (t <- 0 until argsSize) { + if (args(t) == "--outputDir" || args(t) == "-outDir") { + throw new IllegalArgumentException("Commandline argument is deprecated, should use config for this now") + } } var argv: Array[String] = Array() diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Config.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Config.scala index 
bf476eac4fcd0f1ae7e664b6304c060dac773858..4ea65cb5a7fc7ceb1226853d47687a0865c818d0 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Config.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Config.scala @@ -15,10 +15,13 @@ */ package nl.lumc.sasc.biopet.core.config -import java.io.File +import java.io.{ PrintWriter, File } import nl.lumc.sasc.biopet.core.Logging +import nl.lumc.sasc.biopet.utils.ConfigUtils import nl.lumc.sasc.biopet.utils.ConfigUtils._ +import scala.reflect.io.Directory + /** * This class can store nested config values * @param map Map with value for new config @@ -97,7 +100,7 @@ class Config(var map: Map[String, Any]) extends Logging { else if (foundCache.contains(requestedIndex)) return true else { val value = Config.getValueFromMap(map, requestedIndex) - if (value.isDefined) { + if (value.isDefined && value.get.value != None) { foundCache += (requestedIndex -> value.get) return true } else { @@ -137,36 +140,47 @@ class Config(var map: Map[String, Any]) extends Logging { } else throw new IllegalStateException("Value in config could not be found but it seems required, index: " + requestedIndex) } - //TODO: New version of report is needed - /** - * Makes report for all used values - * @return Config report - */ - def getReport: String = { - val output: StringBuilder = new StringBuilder - output.append("Config report, sorted on module:\n") - var modules: Map[String, StringBuilder] = Map() - for ((key, value) <- foundCache) { - val module = key.module - if (!modules.contains(module)) modules += (module -> new StringBuilder) - modules(module).append("Found: " + value.toString + "\n") - } - for ((key, value) <- defaultCache) { - val module = key.module - if (!modules.contains(module)) modules += (module -> new StringBuilder) - modules(module).append("Default used: " + value.toString + "\n") + def writeReport(id: String, directory: String): Unit = { + + def 
convertIndexValuesToMap(input: List[(ConfigValueIndex, Any)], forceFreeVar: Option[Boolean] = None): Map[String, Any] = { + input.foldLeft(Map[String, Any]())( + (a: Map[String, Any], x: (ConfigValueIndex, Any)) => { + val v = { + if (forceFreeVar.getOrElse(x._1.freeVar)) Map(x._1.key -> x._2) + else Map(x._1.module -> Map(x._1.key -> x._2)) + } + val newMap = x._1.path.foldRight(v)((p, map) => Map(p -> map)) + ConfigUtils.mergeMaps(a, newMap) + }) } - for (value <- notFoundCache) { - val module = value.module - if (!modules.contains(module)) modules += (module -> new StringBuilder) - if (!defaultCache.contains(value)) modules(module).append("Not Found: " + value.toString + "\n") - } - for ((key, value) <- modules) { - output.append("Config options for module: " + key + "\n") - output.append(value.toString) - output.append("\n") + + def writeMapToJsonFile(map: Map[String, Any], name: String): Unit = { + val file = new File(directory + "/" + id + "." + name + ".json") + file.getParentFile.mkdirs() + val writer = new PrintWriter(file) + writer.write(ConfigUtils.mapToJson(map).spaces2) + writer.close() } - return output.toString + + // Positions where values are found + val found = convertIndexValuesToMap(foundCache.filter(!_._2.default).toList.map(x => (x._2.foundIndex, x._2.value))) + + // Positions where to start searching + val effectiveFound = convertIndexValuesToMap(foundCache.filter(!_._2.default).toList.map(x => (x._2.requestIndex, x._2.value)), Some(false)) + val effectiveDefaultFound = convertIndexValuesToMap(defaultCache.filter(_._2.default).toList.map(x => (x._2.requestIndex, x._2.value)), Some(false)) + val notFound = convertIndexValuesToMap(notFoundCache.map((_, None)), Some(false)) + + // Merged maps + val fullEffective = ConfigUtils.mergeMaps(effectiveFound, effectiveDefaultFound) + val fullEffectiveWithNotFound = ConfigUtils.mergeMaps(fullEffective, notFound) + + writeMapToJsonFile(Config.global.map, "input") + writeMapToJsonFile(found, "found") + 
writeMapToJsonFile(effectiveFound, "effective.found") + writeMapToJsonFile(effectiveDefaultFound, "effective.defaults") + writeMapToJsonFile(notFound, "not.found") + writeMapToJsonFile(fullEffective, "effective.full") + writeMapToJsonFile(fullEffectiveWithNotFound, "effective.full.notfound") } override def toString(): String = map.toString diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Configurable.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Configurable.scala index 7e8aff51eaffd4cb24d3985efa33018a74c714be..54e6dfd170a8cb2e68f5c63dd8ca2675513cee7d 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Configurable.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Configurable.scala @@ -15,39 +15,30 @@ */ package nl.lumc.sasc.biopet.core.config -import java.io.File import nl.lumc.sasc.biopet.core.Logging import nl.lumc.sasc.biopet.utils.ConfigUtils.ImplicitConversions trait Configurable extends ImplicitConversions { - /** - * Should be object of parant object - */ + /** Should be object of parant object */ val root: Configurable - /** - * Get default path to search config values for current object - * @return - */ - def configPath: List[String] = if (root != null) root.configFullPath else List() + /** subfix to the path */ + def subPath: List[String] = Nil - /** - * Gets name of module for config - * @return - */ + /** Get default path to search config values for current object */ + def configPath: List[String] = if (root != null) root.configFullPath ::: subPath else subPath + + /** Gets name of module for config */ protected[core] def configName = getClass.getSimpleName.toLowerCase - /** - * Full path with module in there - * @return - */ + /** ull path with module in there */ protected[core] def configFullPath: List[String] = configPath ::: configName :: Nil - /** - * Map to store defaults for config - */ - var defaults: 
scala.collection.mutable.Map[String, Any] = if (root != null) scala.collection.mutable.Map(root.defaults.toArray: _*) - else scala.collection.mutable.Map() + /** Map to store defaults for config */ + def defaults: Map[String, Any] = { + if (root != null) root.defaults + else Map() + } val config = new ConfigFunctions @@ -62,13 +53,27 @@ trait Configurable extends ImplicitConversions { def path(sample: String = null, library: String = null, submodule: String = null) = { (if (sample != null) "samples" :: sample :: Nil else Nil) ::: (if (library != null) "libraries" :: library :: Nil else Nil) ::: - (if (submodule != null) configName :: configPath else configPath) + (if (submodule != null) configPath ::: configName :: Nil else configPath) } /** * Class is used for retrieval of config values */ - protected class ConfigFunctions { + protected class ConfigFunctions(val defaultSample: Option[String] = None, val defaultLibrary: Option[String] = None) { + def this(defaultSample: String, defaultLibrary: String) = { + this(defaultSample = Some(defaultSample), defaultLibrary = Some(defaultLibrary)) + } + + def this(defaultSample: String) = { + this(defaultSample = Some(defaultSample), defaultLibrary = None) + } + + (defaultSample, defaultLibrary) match { + case (Some(null), _) => throw new IllegalArgumentException("defaultSample can not be null") + case (_, Some(null)) => throw new IllegalArgumentException("defaultLibrary can not be null") + case _ => + } + /** * * @param key Name of value @@ -87,13 +92,15 @@ trait Configurable extends ImplicitConversions { freeVar: Boolean = true, sample: String = null, library: String = null): ConfigValue = { + val s = if (sample != null || defaultSample.isEmpty) sample else defaultSample.get + val l = if (library != null || defaultLibrary.isEmpty) library else defaultLibrary.get val m = if (submodule != null) submodule else configName - val p = path(sample, library, submodule) + val p = path(s, l, submodule) val d = { val value = 
Config.getValueFromMap(defaults.toMap, ConfigValueIndex(m, p, key, freeVar)) if (value.isDefined) value.get.value else default } - if (!contains(key, submodule, freeVar, sample = sample, library = library) && d == null) { + if (!contains(key, submodule, freeVar, sample = s, library = l) && d == null) { if (required) { Logging.logger.error("Value in config could not be found but it is required, key: " + key + " module: " + m + " path: " + p) throw new IllegalStateException("Value in config could not be found but it is required, key: " + key + " module: " + m + " path: " + p) @@ -117,8 +124,10 @@ trait Configurable extends ImplicitConversions { freeVar: Boolean = true, sample: String = null, library: String = null) = { + val s = if (sample != null || defaultSample.isEmpty) sample else defaultSample.get + val l = if (library != null || defaultLibrary.isEmpty) library else defaultLibrary.get val m = if (submodule != null) submodule else configName - val p = path(sample, library, submodule) + val p = path(s, l, submodule) Config.global.contains(m, p, key, freeVar) || !(Config.getValueFromMap(defaults.toMap, ConfigValueIndex(m, p, key, freeVar)) == None) } diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/Bowtie.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala similarity index 92% rename from public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/Bowtie.scala rename to public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala index cc5ff37a35aeffc5e58eec027856f63ac83a59a6..e192a845be6db96d2a754d2d2ff6a5e393d136ca 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/Bowtie.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala @@ -13,12 +13,13 @@ * license; For commercial users or users who do not want to follow the AGPL * license, please contact us to obtain a 
separate license. */ -package nl.lumc.sasc.biopet.extensions.aligners +package nl.lumc.sasc.biopet.extensions + +import java.io.File import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output } -import java.io.File class Bowtie(val root: Configurable) extends BiopetCommandLineFunction { @Input(doc = "Fastq file R1", shortName = "R1") @@ -42,14 +43,14 @@ class Bowtie(val root: Configurable) extends BiopetCommandLineFunction { override val defaultThreads = 8 var sam: Boolean = config("sam", default = true) - var sam_RG: String = config("sam-RG") + var sam_RG: Option[String] = config("sam-RG") var seedlen: Option[Int] = config("seedlen") var seedmms: Option[Int] = config("seedmms") var k: Option[Int] = config("k") var m: Option[Int] = config("m") - var best: Boolean = config("best") + var best: Boolean = config("best", default = false) var maxbts: Option[Int] = config("maxbts") - var strata: Boolean = config("strata") + var strata: Boolean = config("strata", default = false) var maqerr: Option[Int] = config("maqerr") def cmdLine = { diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala index f6e036f33e746aec2124e37fa6d2b1123bb84760..e7a5a319ead2bc797713479de80ff949373bc8ff 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala @@ -42,9 +42,9 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction { var opt_front: Set[String] = Set() if (config.contains("front")) for (adapter <- config("front").asList) opt_front += adapter.toString - var opt_discard: Boolean = config("discard") - var opt_minimum_length: String = config("minimum_length", 1) - var opt_maximum_length: String = 
config("maximum_length") + var opt_discard: Boolean = config("discard", default = false) + var opt_minimum_length: Option[Int] = config("minimum_length", 1) + var opt_maximum_length: Option[Int] = config("maximum_length") def cmdLine = required(executable) + // options diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala index 2b60a6d916330e141ac9bf4152a48af562732c36..2671c05e531a2bc965c31eb92a940f68439051b8 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala @@ -25,10 +25,10 @@ import nl.lumc.sasc.biopet.core.config.Configurable class Fastqc(val root: Configurable) extends BiopetCommandLineFunction { @Input(doc = "Contaminants", required = false) - var contaminants: File = _ + var contaminants: Option[File] = None @Input(doc = "Adapters", required = false) - var adapters: File = _ + var adapters: Option[File] = None @Input(doc = "Fastq file", shortName = "FQ") var fastqfile: File = _ @@ -39,9 +39,9 @@ class Fastqc(val root: Configurable) extends BiopetCommandLineFunction { executable = config("exe", default = "fastqc") var java_exe: String = config("exe", default = "java", submodule = "java", freeVar = false) var kmers: Option[Int] = config("kmers") - var quiet: Boolean = config("quiet") - var noextract: Boolean = config("noextract") - var nogroup: Boolean = config("nogroup") + var quiet: Boolean = config("quiet", default = false) + var noextract: Boolean = config("noextract", default = false) + var nogroup: Boolean = config("nogroup", default = false) var extract: Boolean = config("extract", default = true) override val versionRegex = """FastQC (.*)""".r @@ -50,17 +50,19 @@ class Fastqc(val root: Configurable) extends BiopetCommandLineFunction { override def afterGraph { this.checkExecutable - if 
(contaminants == null) { - val fastqcDir = executable.substring(0, executable.lastIndexOf("/")) - val defaultContams = getVersion match { - case "v0.11.2" => new File(fastqcDir + "/Configuration/contaminant_list.txt") - case _ => new File(fastqcDir + "/Contaminants/contaminant_list.txt") - } - val defaultAdapters = getVersion match { - case "v0.11.2" => new File(fastqcDir + "/Configuration/adapter_list.txt") - case _ => null - } - contaminants = config("contaminants", default = defaultContams) + contaminants = contaminants match { + case None => + val fastqcDir = executable.substring(0, executable.lastIndexOf("/")) + val defaultContams = getVersion match { + case "v0.11.2" => Option(new File(fastqcDir + "/Configuration/contaminant_list.txt")) + case _ => Option(new File(fastqcDir + "/Contaminants/contaminant_list.txt")) + } + val defaultAdapters = getVersion match { + case "v0.11.2" => Option(new File(fastqcDir + "/Configuration/adapter_list.txt")) + case _ => None + } + config("contaminants", default = defaultContams) + case wrapped @ Some(_) => wrapped } } @@ -74,6 +76,6 @@ class Fastqc(val root: Configurable) extends BiopetCommandLineFunction { conditional(noextract, "--noextract") + conditional(extract, "--extract") + conditional(quiet, "--quiet") + - required("-o", output.getParent()) + + required("-o", output.getParent) + required(fastqfile) } diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/PythonCommandLineFunction.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/PythonCommandLineFunction.scala index 68219fd22bbf0a481ba9fa7fe552bc254bf71ef4..ebfaac812bbedec10a14d0daad609d54469f9c03 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/PythonCommandLineFunction.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/PythonCommandLineFunction.scala @@ -28,7 +28,14 @@ trait PythonCommandLineFunction extends BiopetCommandLineFunction { 
executable = config("exe", default = "python", submodule = "python") protected var python_script_name: String = _ - def setPythonScript(script: String) { setPythonScript(script, "") } + def setPythonScript(script: String) { + python_script = new File(script) + if (!python_script.exists()) { + setPythonScript(script, "") + } else { + python_script_name = script + } + } def setPythonScript(script: String, subpackage: String) { python_script_name = script python_script = new File(".queue/tmp/" + subpackage + python_script_name) diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala index f735823718bd87ca5765c925d167143c08494c72..1d7b45ec9617091a457f073e4110ff5e01932e24 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala @@ -50,6 +50,8 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction { @Argument(doc = "Output directory", required = true) var w: String = _ + var noBfgs: Boolean = config("no_bfgs", default = false) + @Input(required = false) var t: File = _ @@ -60,11 +62,11 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction { private var out: List[File] = Nil var executableNonThreads: String = config("exe", default = "raxmlHPC") - var executableThreads: String = config("exe_pthreads") + var executableThreads: Option[String] = config("exe_pthreads") override def afterGraph { if (threads == 0) threads = getThreads(defaultThreads) - executable = if (threads > 1 && executableThreads != null) executableThreads else executableNonThreads + executable = if (threads > 1 && executableThreads.isDefined) executableThreads.get else executableNonThreads super.afterGraph out +:= getInfoFile f match { @@ -101,5 +103,6 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction { 
optional("-f", f) + optional("-t", t) + optional("-z", z) + + conditional(noBfgs, "--no-bfgs") + required("-T", threads) } diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala index cc1dabdfd03c5201f69627fb61b3c1f879b39f38..e732c1023de5f713f4c21ed366da68eddc825285 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala @@ -24,7 +24,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output } class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction { @Input(doc = "Contaminants", required = false) - var startingTree: File = config("starting_tree") + var startingTree: Option[File] = config("starting_tree") @Input(doc = "Fasta file", shortName = "FQ") var fastafile: File = _ @@ -36,21 +36,21 @@ class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction { var outputDirectory: String = _ executable = config("exe", default = "run_gubbins.py") - var outgroup: String = config("outgroup") - var filterPercentage: String = config("filter_percentage") - var treeBuilder: String = config("tree_builder") + var outgroup: Option[String] = config("outgroup") + var filterPercentage: Option[String] = config("filter_percentage") + var treeBuilder: Option[String] = config("tree_builder") var iterations: Option[Int] = config("iterations") var minSnps: Option[Int] = config("min_snps") - var convergeMethod: String = config("converge_method") - var useTimeStamp: Boolean = config("use_time_stamp") - var prefix: String = config("prefix") - var verbose: Boolean = config("verbose") - var noCleanup: Boolean = config("no_cleanup") + var convergeMethod: Option[String] = config("converge_method") + var useTimeStamp: Boolean = config("use_time_stamp", default = false) + var 
prefix: Option[String] = config("prefix") + var verbose: Boolean = config("verbose", default = false) + var noCleanup: Boolean = config("no_cleanup", default = false) override def afterGraph: Unit = { super.afterGraph jobLocalDir = new File(outputDirectory) - if (prefix == null) prefix = fastafile.getName + if (prefix.isEmpty) prefix = Some(fastafile.getName) val out: List[String] = List(".recombination_predictions.embl", ".recombination_predictions.gff", ".branch_base_reconstruction.embl", @@ -59,7 +59,7 @@ class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction { ".filtered_polymorphic_sites.fasta", ".filtered_polymorphic_sites.phylip", ".final_tree.tre") - for (t <- out) outputFiles ::= new File(outputDirectory + File.separator + prefix + t) + for (t <- out) outputFiles ::= new File(outputDirectory + File.separator + prefix.getOrElse("gubbins") + t) } def cmdLine = required("cd", outputDirectory) + " && " + required(executable) + diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Sickle.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Sickle.scala index 19b4e7886623ef7069e6ca8cb82a138cb7d7fb41..5056c70abc0e122fac0b95862f3443faaa48b5c7 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Sickle.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Sickle.scala @@ -43,18 +43,18 @@ class Sickle(val root: Configurable) extends BiopetCommandLineFunction { var fastqc: Fastqc = _ executable = config("exe", default = "sickle", freeVar = false) - var qualityType: String = config("qualitytype") + var qualityType: Option[String] = config("qualitytype") var qualityThreshold: Option[Int] = config("qualityThreshold") var lengthThreshold: Option[Int] = config("lengthThreshold") - var noFiveprime: Boolean = config("noFiveprime") - var discardN: Boolean = config("discardN") - var quiet: Boolean = config("quiet") + var noFiveprime: Boolean = 
config("noFiveprime", default = false) + var discardN: Boolean = config("discardN", default = false) + var quiet: Boolean = config("quiet", default = false) var defaultQualityType: String = config("defaultqualitytype", default = "sanger") override val versionRegex = """sickle version (.*)""".r override def versionCommand = executable + " --version" override def afterGraph { - if (qualityType == null && defaultQualityType != null) qualityType = defaultQualityType + if (qualityType.isEmpty) qualityType = Some(defaultQualityType) } def cmdLine = { diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/Stampy.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Stampy.scala similarity index 98% rename from public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/Stampy.scala rename to public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Stampy.scala index 6156f4a7f1a6aaf55532ca8d3ac6a6080657d5db..a7bfeab78447e62eee1634c73ca09c880679327e 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/Stampy.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Stampy.scala @@ -13,12 +13,13 @@ * license; For commercial users or users who do not want to follow the AGPL * license, please contact us to obtain a separate license. 
*/ -package nl.lumc.sasc.biopet.extensions.aligners +package nl.lumc.sasc.biopet.extensions + +import java.io.File import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output } -import java.io.File class Stampy(val root: Configurable) extends BiopetCommandLineFunction { @Input(doc = "FastQ file R1", shortName = "R1") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/Star.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Star.scala similarity index 97% rename from public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/Star.scala rename to public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Star.scala index 5179b6381ae7765d7f95c80a8b748ffa89b0c447..25fa1876b2abef8a1a6c34555e74d699607f9ba6 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/Star.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Star.scala @@ -13,12 +13,13 @@ * license; For commercial users or users who do not want to follow the AGPL * license, please contact us to obtain a separate license. 
*/ -package nl.lumc.sasc.biopet.extensions.aligners +package nl.lumc.sasc.biopet.extensions + +import java.io.File import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction import nl.lumc.sasc.biopet.core.config.Configurable -import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument } -import java.io.File +import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output } class Star(val root: Configurable) extends BiopetCommandLineFunction { @Input(doc = "The reference file for the bam files.", required = false) diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/TopHat.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/TopHat.scala similarity index 95% rename from public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/TopHat.scala rename to public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/TopHat.scala index bca4a2b23ac71f52a0dc3927afde372ac7ea220f..e0b8bd68bda46a6b3dc6915a54482b093cfd43d6 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/TopHat.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/TopHat.scala @@ -13,12 +13,13 @@ * license; For commercial users or users who do not want to follow the AGPL * license, please contact us to obtain a separate license. 
*/ -package nl.lumc.sasc.biopet.extensions.aligners +package nl.lumc.sasc.biopet.extensions + +import java.io.File import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction import nl.lumc.sasc.biopet.core.config.Configurable -import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument } -import java.io.File +import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output } class TopHat(val root: Configurable) extends BiopetCommandLineFunction { @Input(doc = "FastQ file R1", shortName = "R1") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/WigToBigWig.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/WigToBigWig.scala new file mode 100644 index 0000000000000000000000000000000000000000..531425b7a50bde7a77d62213561cef91095ecdd9 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/WigToBigWig.scala @@ -0,0 +1,38 @@ +package nl.lumc.sasc.biopet.extensions + +import java.io.File + +import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Output, Input } + +/** + * Created by pjvan_thof on 1/29/15. 
+ * Versions from the executable are not reliable, this extension is based on md5 '3d033ff8a1f4ea9cb3ede12939561141' from the executable + */ +class WigToBigWig(val root: Configurable) extends BiopetCommandLineFunction { + @Input(doc = "Input wig file") + var inputWigFile: File = _ + + @Input(doc = "Input chrom sizes file") + var inputChromSizesFile: File = _ + + @Output(doc = "Output BigWig file") + var outputBigWig: File = _ + + executable = config("exe", default = "wigToBigWig") + + var blockSize: Option[Int] = config("blockSize") + var itemsPerSlot: Option[Int] = config("itemsPerSlot") + var clip: Boolean = config("clip", default = false) + var unc: Boolean = config("unc", default = false) + + def cmdLine = required(executable) + + optional("-blockSize=", blockSize, spaceSeparated = false) + + optional("-itemsPerSlot=", itemsPerSlot, spaceSeparated = false) + + conditional(clip, "-clip") + + conditional(unc, "-unc") + + required(inputWigFile) + + required(inputChromSizesFile) + + required(outputBigWig) +} \ No newline at end of file diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/Bedtools.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/Bedtools.scala index a6ea3588b4b147ff1eb679de60208631503b6ceb..cf7f16e381093eb0e4a547434b4e119936cd44b6 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/Bedtools.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/Bedtools.scala @@ -18,6 +18,7 @@ package nl.lumc.sasc.biopet.extensions.bedtools import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction abstract class Bedtools extends BiopetCommandLineFunction { + override def subPath = "bedtools" :: super.subPath executable = config("exe", default = "bedtools", submodule = "bedtools") override def versionCommand = executable + " --version" override val versionRegex = """bedtools (.*)""".r diff --git 
a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/Bwa.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/Bwa.scala new file mode 100644 index 0000000000000000000000000000000000000000..3e0ffc8011a1669b3205157cce5bde5d3d5053f1 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/Bwa.scala @@ -0,0 +1,14 @@ +package nl.lumc.sasc.biopet.extensions.bwa + +import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction + +/** + * Created by pjvan_thof on 1/16/15. + */ +abstract class Bwa extends BiopetCommandLineFunction { + override def subPath = "bwa" :: super.subPath + executable = config("exe", default = "bwa") + override val versionRegex = """Version: (.*)""".r + override val versionExitcode = List(0, 1) + override def versionCommand = executable +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaAln.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaAln.scala new file mode 100644 index 0000000000000000000000000000000000000000..bfd0a5846e6531b47f285453a2d848b1b7f1bafe --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaAln.scala @@ -0,0 +1,74 @@ +package nl.lumc.sasc.biopet.extensions.bwa + +import java.io.File + +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Output, Input } + +/** + * Created by pjvan_thof on 1/16/15. 
+ */ +class BwaAln(val root: Configurable) extends Bwa { + @Input(doc = "Fastq file", required = true) + var fastq: File = _ + + @Input(doc = "The reference file for the bam files.", required = true) + var reference: File = config("reference", required = true) + + @Output(doc = "Output file SAM", required = false) + var output: File = _ + + var n: Option[Int] = config("n") + var o: Option[Int] = config("o") + var e: Option[Int] = config("e") + var i: Option[Int] = config("i") + var d: Option[Int] = config("d") + var l: Option[Int] = config("l") + var k: Option[Int] = config("k") + var m: Option[Int] = config("m") + var M: Option[Int] = config("M") + var O: Option[Int] = config("O") + var E: Option[Int] = config("E") + var R: Option[Int] = config("R") + var q: Option[Int] = config("q") + var B: Option[Int] = config("B") + var L: Boolean = config("L", default = false) + var N: Boolean = config("N", default = false) + var I: Boolean = config("I", default = false) + var b: Boolean = config("b", default = false) + var n0: Boolean = config("0", default = false) + var n1: Boolean = config("1", default = false) + var n2: Boolean = config("2", default = false) + var Y: Boolean = config("Y", default = false) + + override val defaultVmem = "5G" + override val defaultThreads = 8 + + def cmdLine = required(executable) + + required("aln") + + optional("-n", n) + + optional("-o", o) + + optional("-e", e) + + optional("-i", i) + + optional("-d", d) + + optional("-l", l) + + optional("-k", k) + + optional("-m", m) + + optional("-M", M) + + optional("-O", O) + + optional("-E", E) + + optional("-R", R) + + optional("-q", q) + + optional("-B", B) + + conditional(L, "-L") + + conditional(N, "-N") + + conditional(I, "-I") + + conditional(b, "-b") + + conditional(n0, "-0") + + conditional(n1, "-1") + + conditional(n2, "-2") + + conditional(Y, "-Y") + + optional("-f", output) + + required(reference) + + required(fastq) +} diff --git 
a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/Bwa.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaMem.scala similarity index 80% rename from public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/Bwa.scala rename to public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaMem.scala index 7ebca27f3af58f435cc72fa5327469352dfcb385..fc790b183b5bae2922a1b5f89ec66fcf4f5b85b2 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/aligners/Bwa.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaMem.scala @@ -13,14 +13,15 @@ * license; For commercial users or users who do not want to follow the AGPL * license, please contact us to obtain a separate license. */ -package nl.lumc.sasc.biopet.extensions.aligners +package nl.lumc.sasc.biopet.extensions.bwa + +import java.io.File import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output } -import java.io.File -class Bwa(val root: Configurable) extends BiopetCommandLineFunction { +class BwaMem(val root: Configurable) extends Bwa { @Input(doc = "Fastq file R1", shortName = "R1") var R1: File = _ @@ -33,10 +34,10 @@ class Bwa(val root: Configurable) extends BiopetCommandLineFunction { @Output(doc = "Output file SAM", shortName = "output") var output: File = _ - var R: String = config("R") + var R: Option[String] = config("R") var k: Option[Int] = config("k") var r: Option[Float] = config("r") - var S: Boolean = config("S") + var S: Boolean = config("S", default = false) var M: Boolean = config("M", default = true) var w: Option[Int] = config("w") var d: Option[Int] = config("d") @@ -44,33 +45,27 @@ class Bwa(val root: Configurable) extends BiopetCommandLineFunction { var D: Option[Float] = config("D") var W: Option[Int] = config("W") var m: 
Option[Int] = config("m") - var P: Boolean = config("P") - var e: Boolean = config("e") + var P: Boolean = config("P", default = false) + var e: Boolean = config("e", default = false) var A: Option[Int] = config("A") var B: Option[Int] = config("B") - var O: String = config("O") - var E: String = config("E") - var L: String = config("L") + var O: Option[String] = config("O") + var E: Option[String] = config("E") + var L: Option[String] = config("L") var U: Option[Int] = config("U") - var x: String = config("x") - var p: Boolean = config("p") + var x: Option[String] = config("x") + var p: Boolean = config("p", default = false) var v: Option[Int] = config("v") var T: Option[Int] = config("T") var h: Option[Int] = config("h") - var a: Boolean = config("a") - var C: Boolean = config("C") - var Y: Boolean = config("Y") - var I: String = config("I") - - executable = config("exe", default = "bwa", freeVar = false) - override val versionRegex = """Version: (.*)""".r - override val versionExitcode = List(0, 1) + var a: Boolean = config("a", default = false) + var C: Boolean = config("C", default = false) + var Y: Boolean = config("Y", default = false) + var I: Option[String] = config("I") override val defaultVmem = "6G" override val defaultThreads = 8 - override def versionCommand = executable - def cmdLine = { required(executable) + required("mem") + diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaSampe.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaSampe.scala new file mode 100644 index 0000000000000000000000000000000000000000..b857eea014ac52acb9608debb94db9cd75cef929 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaSampe.scala @@ -0,0 +1,62 @@ +package nl.lumc.sasc.biopet.extensions.bwa + +import java.io.File + +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Output, Input } + +/** + * BWA 
sampe wrapper + * + * based on executable version 0.7.10-r789 + * + * @param root Configurable + */ +class BwaSampe(val root: Configurable) extends Bwa { + @Input(doc = "Fastq file R1", required = true) + var fastqR1: File = _ + + @Input(doc = "Fastq file R2", required = true) + var fastqR2: File = _ + + @Input(doc = "Sai file R1", required = true) + var saiR1: File = _ + + @Input(doc = "Sai file R2", required = true) + var saiR2: File = _ + + @Input(doc = "The reference file for the bam files.", required = true) + var reference: File = config("reference", required = true) + + @Output(doc = "Output file SAM", required = false) + var output: File = _ + + var a: Option[Int] = config("a") + var o: Option[Int] = config("o") + var n: Option[Int] = config("n") + var N: Option[Int] = config("N") + var c: Option[Float] = config("c") + var P: Boolean = config("P", default = false) + var s: Boolean = config("s", default = false) + var A: Boolean = config("A", default = false) + + var r: String = _ + + def cmdLine = required(executable) + + required("sampe") + + optional("-a", a) + + optional("-o", o) + + optional("-n", n) + + optional("-N", N) + + optional("-c", c) + + optional("-f", output) + + optional("-r", r) + + conditional(P, "-P") + + conditional(s, "-s") + + conditional(A, "-A") + + required(reference) + + required(saiR1) + + required(saiR2) + + required(fastqR1) + + required(fastqR2) +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaSamse.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaSamse.scala new file mode 100644 index 0000000000000000000000000000000000000000..51f9a0f30eaf73a897fefaa2b6cf69d4e3386b62 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaSamse.scala @@ -0,0 +1,35 @@ +package nl.lumc.sasc.biopet.extensions.bwa + +import java.io.File + +import nl.lumc.sasc.biopet.core.config.Configurable +import 
org.broadinstitute.gatk.utils.commandline.{ Output, Input } + +/** + * Created by pjvan_thof on 1/16/15. + */ +class BwaSamse(val root: Configurable) extends Bwa { + @Input(doc = "Fastq file", required = true) + var fastq: File = _ + + @Input(doc = "Sai file", required = true) + var sai: File = _ + + @Input(doc = "The reference file for the bam files.", required = true) + var reference: File = config("reference", required = true) + + @Output(doc = "Output file SAM", required = false) + var output: File = _ + + var n: Option[Int] = config("n") + var r: String = _ + + def cmdLine = required(executable) + + required("samse") + + optional("-n", n) + + optional("-f", output) + + optional("-r", r) + + required(reference) + + required(sai) + + required(fastq) +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/Conifer.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/Conifer.scala new file mode 100644 index 0000000000000000000000000000000000000000..79fadce176938d293b378599f9f52553636f7aea --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/Conifer.scala @@ -0,0 +1,34 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. 
+ */ +package nl.lumc.sasc.biopet.extensions.conifer + +import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction +import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction + +abstract class Conifer extends PythonCommandLineFunction { + override def subPath = "conifer" :: super.subPath + // executable = config("exe", default = "conifer") + setPythonScript(config("script", default = "conifer")) + override val versionRegex = """(.*)""".r + override val versionExitcode = List(0) + override def versionCommand = executable + " " + python_script + " --version" + + override val defaultVmem = "8G" + override val defaultThreads = 1 + + def cmdLine = getPythonCommand + +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/ConiferAnalyze.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/ConiferAnalyze.scala new file mode 100644 index 0000000000000000000000000000000000000000..7ce27881dec042d466b4b2d0e7bdd3a1d2025704 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/ConiferAnalyze.scala @@ -0,0 +1,49 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. 
+ */ +package nl.lumc.sasc.biopet.extensions.conifer + +import java.io.File + +import nl.lumc.sasc.biopet.core.config.Configurable +import nl.lumc.sasc.biopet.extensions.Ln +import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output } + +class ConiferAnalyze(val root: Configurable) extends Conifer { + + @Input(doc = "Probes / capture kit definition as bed file: chr,start,stop,gene-annot", required = true) + var probes: File = _ + + // @Input(doc = "Path to Conifer RPKM files", required = true) + var rpkmDir: File = _ + + @Output(doc = "Output analyse.hdf5", shortName = "out") + var output: File = _ + + @Argument(doc = "SVD, number of components to remove", minRecommendedValue = 2, maxRecommendedValue = 5, + minValue = 2, maxValue = 20, required = false) + var svd: Option[Int] = config("svd", default = 1) + + @Argument(doc = "Minimum population median RPKM per probe", required = false) + var min_rpkm: Option[Double] = config("min_rpkm") + + override def cmdLine = super.cmdLine + + " analyze " + + " --probes" + required(probes) + + " --rpkm_dir" + required(rpkmDir) + + " --output" + required(output) + + optional("--svd", svd) + + optional("--min_rpkm", min_rpkm) +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/ConiferCall.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/ConiferCall.scala new file mode 100644 index 0000000000000000000000000000000000000000..6583ed64ebef5636e1afcd51b2432f5fc99c9765 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/ConiferCall.scala @@ -0,0 +1,35 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. 
+ * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.conifer + +import java.io.File + +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output } + +class ConiferCall(val root: Configurable) extends Conifer { + + @Input(doc = "Input analysis.hdf5", required = true) + var input: File = _ + + @Output(doc = "Output calls.txt", shortName = "out") + var output: File = _ + + override def cmdLine = super.cmdLine + + " call " + + " --input" + required(input) + + " --output" + required(output) +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/ConiferExport.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/ConiferExport.scala new file mode 100644 index 0000000000000000000000000000000000000000..02c0f09584206770965e67e6afc32765f9997a3f --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/ConiferExport.scala @@ -0,0 +1,39 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. 
The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.conifer + +import java.io.File + +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +class ConiferExport(val root: Configurable) extends Conifer { + + @Input(doc = "Input analysis.hdf5", required = true) + var input: File = _ + + @Output(doc = "Output <sample>.svdzrpkm.bed", shortName = "out", required = true) + var output: File = _ + + override def afterGraph { + this.checkExecutable + } + + override def cmdLine = super.cmdLine + + " export " + + " --input" + required(input) + + " --output" + required(output) +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/ConiferRPKM.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/ConiferRPKM.scala new file mode 100644 index 0000000000000000000000000000000000000000..52c0e48113715133092144a0def6347ec1e6617a --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/conifer/ConiferRPKM.scala @@ -0,0 +1,40 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. 
The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.conifer + +import java.io.File + +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Output, Input } + +class ConiferRPKM(val root: Configurable) extends Conifer { + + @Input(doc = "Bam file", required = true) + var bamFile: File = _ + + @Input(doc = "Probes / capture kit definition as bed file: chr,start,stop,gene-annot", required = true) + var probes: File = _ + + /** The output RPKM should outputted to a directory which contains all the RPKM files from previous experiments */ + @Output(doc = "Output RPKM.txt", shortName = "out") + var output: File = _ + + override def cmdLine = super.cmdLine + + " rpkm " + + " --probes" + required(probes) + + " --input" + required(bamFile) + + " --output" + required(output) +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVTools.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVTools.scala new file mode 100644 index 0000000000000000000000000000000000000000..d017864f6988828100f6bbda421fbf734a9a5878 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVTools.scala @@ -0,0 +1,14 @@ +/** + * Created by wyleung on 5-1-15. 
+ */ + +package nl.lumc.sasc.biopet.extensions.igvtools + +import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction + +abstract class IGVTools extends BiopetCommandLineFunction { + executable = config("exe", default = "igvtools", submodule = "igvtools", freeVar = false) + override def versionCommand = executable + " version" + override val versionRegex = """IGV Version: ([\d\.]+) .*""".r + override val versionExitcode = List(0) +} \ No newline at end of file diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVToolsCount.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVToolsCount.scala new file mode 100644 index 0000000000000000000000000000000000000000..8037616834ecd4de02e9949883b75d20b45c7347 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVToolsCount.scala @@ -0,0 +1,105 @@ + +package nl.lumc.sasc.biopet.extensions.igvtools + +import java.nio.file.InvalidPathException + +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument } +import java.io.{ FileNotFoundException, File } + +/** + * IGVTools `count` wrapper + * + * @constructor create a new IGVTools instance from a `.bam` file + * + */ + +class IGVToolsCount(val root: Configurable) extends IGVTools { + @Input(doc = "Bam File") + var input: File = _ + + @Input(doc = "<genome>.chrom.sizes File") + var genomeChromSizes: File = _ + + @Output + var tdf: Option[File] = _ + + @Output + var wig: Option[File] = _ + + var maxZoom: Option[Int] = config("maxZoom") + var windowSize: Option[Int] = config("windowSize") + var extFactor: Option[Int] = config("extFactor") + + var preExtFactor: Option[Int] = config("preExtFactor") + var postExtFactor: Option[Int] = config("postExtFactor") + + var windowFunctions: Option[String] = config("windowFunctions") + var strands: Option[String] = config("strands") + var bases: Boolean 
= config("bases", default = false) + + var query: Option[String] = config("query") + var minMapQuality: Option[Int] = config("minMapQuality") + var includeDuplicates: Boolean = config("includeDuplicates", default = false) + + var pairs: Boolean = config("pairs", default = false) + + override def afterGraph { + super.afterGraph + if (!input.exists()) throw new FileNotFoundException("Input bam is required for IGVToolsCount") + + if (!wig.isEmpty && !wig.get.getAbsolutePath.endsWith(".wig")) throw new IllegalArgumentException("Wiggle file should have a .wig file-extension") + if (!tdf.isEmpty && !tdf.get.getAbsolutePath.endsWith(".tdf")) throw new IllegalArgumentException("TDF file should have a .tdf file-extension") + } + + def cmdLine = { + required(executable) + + required("count") + + optional("--maxZoom", maxZoom) + + optional("--windowSize", windowSize) + + optional("--extFactor", extFactor) + + optional("--preExtFactor", preExtFactor) + + optional("--postExtFactor", postExtFactor) + + optional("--windowFunctions", windowFunctions) + + optional("--strands", strands) + + conditional(bases, "--bases") + + optional("--query", query) + + optional("--minMapQuality", minMapQuality) + + conditional(includeDuplicates, "--includeDuplicates") + + conditional(pairs, "--pairs") + + required(input) + + required(outputArg) + + required(genomeChromSizes) + } + + /** + * This part should never fail, these values are set within this wrapper + * + */ + private def outputArg: String = { + (tdf, wig) match { + case (None, None) => throw new IllegalArgumentException("Either TDF or WIG should be supplied"); + case (Some(a), None) => a.getAbsolutePath; + case (None, Some(b)) => b.getAbsolutePath; + case (Some(a), Some(b)) => a.getAbsolutePath + "," + b.getAbsolutePath; + } + } +} + +object IGVToolsCount { + /** + * Create an object by specifying the `input` (.bam), + * and the `genomename` (hg18,hg19,mm10) + * + * @param input Bamfile to count reads from + * @return a new 
IGVToolsCount instance + * @throws FileNotFoundException bam File is not found + * @throws IllegalArgumentException tdf or wig not supplied + */ + def apply(root: Configurable, input: File, genomeChromSizes: File): IGVToolsCount = { + val counting = new IGVToolsCount(root) + counting.input = input + counting.genomeChromSizes = genomeChromSizes + return counting + } +} \ No newline at end of file diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2.scala new file mode 100644 index 0000000000000000000000000000000000000000..5c52970ed8cc0c049ef236f571acedbb8a4610dd --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2.scala @@ -0,0 +1,13 @@ +package nl.lumc.sasc.biopet.extensions.macs2 + +import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction + +/** + * Created by sajvanderzeeuw on 12/19/14. + */ +abstract class Macs2 extends BiopetCommandLineFunction { + executable = config("exe", default = "macs2", submodule = "macs2", freeVar = false) + override def versionCommand = executable + " --version" + override val versionRegex = """macs2 (.*)""".r + override val versionExitcode = List(0, 1) +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2CallPeak.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2CallPeak.scala new file mode 100644 index 0000000000000000000000000000000000000000..e6a9c48e925949bdd413a9aca79afbb7e28b91d7 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2CallPeak.scala @@ -0,0 +1,99 @@ +package nl.lumc.sasc.biopet.extensions.macs2 + +import java.io.File + +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Output, Input } + +class Macs2CallPeak(val root: Configurable) extends Macs2 { + 
@Input(doc = "Treatment input", required = true) + var treatment: File = _ + + @Input(doc = "Control input", required = false) + var control: File = _ + + @Output(doc = "Output file NARROWPEAKS") + private var output_narrow: File = _ + + @Output(doc = "Output file BROADPEAKS") + private var output_broad: File = _ + + @Output(doc = "Output in Excel format") + private var output_xls: File = _ + + @Output(doc = "R script with Bimodal model") + private var output_r: File = _ + + @Output(doc = "Output file Bedgraph") + private var output_bdg: File = _ + + @Output(doc = "Output file gappedPeak") + private var output_gapped: File = _ + + var fileformat: Option[String] = config("fileformat") + var gsize: Option[Float] = config("gsize") + var keepdup: Boolean = config("keep-dup", default = false) + var buffersize: Option[Int] = config("buffer-size") + var outputdir: String = _ + var name: Option[String] = config("name") + var bdg: Boolean = config("bdg", default = false) + var verbose: Boolean = config("verbose", default = false) + var tsize: Option[Int] = config("tsize") + var bandwith: Option[Int] = config("bandwith") + var mfold: Option[Float] = config("mfold") + var fixbimodel: Boolean = config("fixbimodel", default = false) + var nomodel: Boolean = config("nomodel", default = false) + var shift: Option[Int] = config("shift") + var qvalue: Option[Float] = config("qvalue") + var pvalue: Option[Float] = config("pvalue") + var tolarge: Boolean = config("tolarge", default = false) + var downsample: Boolean = config("downsample", default = false) + var nolambda: Boolean = config("nolambda", default = false) + var slocal: Option[Int] = config("slocal") + var llocal: Option[Int] = config("llocal") + var broad: Boolean = config("broad", default = false) + var broadcutoff: Option[Int] = config("broadcutoff") + var callsummits: Boolean = config("callsummits", default = false) + + override def afterGraph: Unit = { + if (name.isEmpty) throw new IllegalArgumentException("Name is not 
defined") + if (outputdir == null) throw new IllegalArgumentException("Outputdir is not defined") + output_narrow = new File(outputdir + name.get + ".narrowPeak") + output_broad = new File(outputdir + name.get + ".broadPeak") + output_xls = new File(outputdir + name.get + ".xls") + output_bdg = new File(outputdir + name.get + ".bdg") + output_r = new File(outputdir + name.get + ".r") + output_gapped = new File(outputdir + name.get + ".gappedPeak") + } + + def cmdLine = { + required(executable) + required("callpeak") + + required("--treatment", treatment) + /* Treatment sample */ + optional("--control", control) + /* Control sample */ + optional("-f", fileformat) + /* Input file format */ + required("-g", gsize) + /* Estimated genome size.(format: 2.7e9) (presets: hs, mm, ce, dm) */ + conditional(keepdup, "--keep-dup") + /* Whether to keep duplicates */ + optional("--buffer-size", buffersize) + /* Buffer size */ + required("--outdir", outputdir) + /* The output directory */ + optional("--name", name) + /* prefix name of the output files. (note that also the peak names inside the files will have this name */ + conditional(bdg, "-B") + /* Whether to output in BDG format */ + conditional(verbose, "--verbose") + /* Whether to output verbosely */ + optional("--tsize", tsize) + /* Sets custom tag length, if not specified macs will use first 10 sequences to estimate the size */ + optional("--bw", bandwith) + /* The bandwith to use for model building. Set this parameter as the sonication fragment size estimated in the wetlab */ + optional("--mfold", mfold) + /* The parameter to select regions within the model fold. Must be a upper and lower limit. */ + conditional(fixbimodel, "--fix-bimodal") + /* Whether turn on the auto paired-peak model process. If it's set, when MACS failed to build paired model, it will use the nomodel settings, the '--extsize' parameter to extend each tags. If set, MACS will be terminated if paried-peak model is failed. 
*/ + conditional(nomodel, "--nomodel") + /* While on, MACS will bypass building the shifting model */ + optional("--shift", shift) + /* You can set an arbitrary shift in basepairs here */ + optional("--extsize", shift) + /* While '--nomodel' is set, MACS uses this parameter to extend reads in 5'->3' direction to fix-sized fragments. For example, if the size of binding region for your transcription factor is 200 bp, and you want to bypass the model building by MACS, this parameter can be set as 200. This option is only valid when --nomodel is set or when MACS fails to build model and --fix-bimodal is on. */ + optional("--qvalue", qvalue) + /* the Q-value(FDR) cutoff */ + optional("--pvalue", pvalue) + /* The P-value cutoff, if --pvalue is set no Qvalue is calculated */ + conditional(tolarge, "--to-large") + /* Whether to scale up the smallest input file to the larger one */ + conditional(downsample, "--down-sample") + /* This is the reversed from --to-large */ + conditional(nolambda, "--nolambda") + /* With this flag on, MACS will use the background lambda as local lambda. This means MACS will not consider the local bias at peak candidate regions.*/ + optional("--slocal", slocal) + /* These two parameters control which two levels of regions will be checked around the peak regions to calculate the maximum lambda as local lambda */ + optional("--llocal", llocal) + + conditional(broad, "--broad") + /* whether to enable broad peak calling */ + optional("--broad-cutoff", broadcutoff) + /* Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. */ + conditional(callsummits, "--call-summits") /* MACS will now reanalyze the shape of signal profile (p or q-score depending on cutoff setting) to deconvolve subpeaks within each peak called from general procedure. 
*/ + } +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/CollectAlignmentSummaryMetrics.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/CollectAlignmentSummaryMetrics.scala index 026947277272e1ebe57b82bb866605916edbbe9e..b2553c47e2bf05bc14dd87f667cf80d0165181ce 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/CollectAlignmentSummaryMetrics.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/CollectAlignmentSummaryMetrics.scala @@ -29,7 +29,7 @@ class CollectAlignmentSummaryMetrics(val root: Configurable) extends Picard { var maxInstertSize: Option[Int] = config("maxInstertSize") @Argument(doc = "ADAPTER_SEQUENCE", required = false) - var adapterSequence: List[String] = config("adapterSequence") + var adapterSequence: List[String] = config("adapterSequence", default = Nil) @Argument(doc = "IS_BISULFITE_SEQUENCED", required = false) var isBisulfiteSequenced: Option[Boolean] = config("isBisulfiteSequenced") @@ -44,7 +44,7 @@ class CollectAlignmentSummaryMetrics(val root: Configurable) extends Picard { var assumeSorted: Boolean = config("assumeSorted", default = true) @Argument(doc = "METRIC_ACCUMULATION_LEVEL", required = false) - var metricAccumulationLevel: List[String] = config("metricaccumulationlevel") + var metricAccumulationLevel: List[String] = config("metricaccumulationlevel", default = Nil) @Argument(doc = "STOP_AFTER", required = false) var stopAfter: Option[Long] = config("stopAfter") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/CollectInsertSizeMetrics.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/CollectInsertSizeMetrics.scala index dcda7e9cf305543f5fc77d93a60f61824e8a7aa9..7d7c7ddc2e773601e772ea3e1fc744f97059f5cb 100644 --- 
a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/CollectInsertSizeMetrics.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/CollectInsertSizeMetrics.scala @@ -19,6 +19,8 @@ import java.io.File import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument } +import scala.collection.immutable.Nil + class CollectInsertSizeMetrics(val root: Configurable) extends Picard { javaMainClass = "picard.analysis.CollectInsertSizeMetrics" @@ -47,7 +49,7 @@ class CollectInsertSizeMetrics(val root: Configurable) extends Picard { var stopAfter: Option[Long] = config("stopAfter") @Argument(doc = "METRIC_ACCUMULATION_LEVEL", required = false) - var metricAccumulationLevel: List[String] = config("metricaccumulationlevel") + var metricAccumulationLevel: List[String] = config("metricaccumulationlevel", default = Nil) @Argument(doc = "HISTOGRAM_WIDTH", required = false) var histogramWidth: Option[Int] = config("histogramWidth") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala index f88304a0aa3fe25ad601fd5ad9f75672e03a1965..6df04c12ccb0ad1ca1c4853dac940ec3a304d043 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala @@ -32,19 +32,19 @@ class MarkDuplicates(val root: Configurable) extends Picard { var outputMetrics: File = _ @Argument(doc = "PROGRAM_RECORD_ID", required = false) - var programRecordId: String = config("programrecordid") + var programRecordId: Option[String] = config("programrecordid") @Argument(doc = "PROGRAM_GROUP_VERSION", required = false) - var programGroupVersion: String = config("programgroupversion") + var 
programGroupVersion: Option[String] = config("programgroupversion") @Argument(doc = "PROGRAM_GROUP_COMMAND_LINE", required = false) - var programGroupCommandLine: String = config("programgroupcommandline") + var programGroupCommandLine: Option[String] = config("programgroupcommandline") @Argument(doc = "PROGRAM_GROUP_NAME", required = false) - var programGroupName: String = config("programgroupname") + var programGroupName: Option[String] = config("programgroupname") @Argument(doc = "COMMENT", required = false) - var comment: String = config("comment") + var comment: Option[String] = config("comment") @Argument(doc = "REMOVE_DUPLICATES", required = false) var removeDuplicates: Boolean = config("removeduplicates", default = false) @@ -62,7 +62,7 @@ class MarkDuplicates(val root: Configurable) extends Picard { var sortingCollectionSizeRatio: Option[Double] = config("sortingCollectionSizeRatio") @Argument(doc = "READ_NAME_REGEX", required = false) - var readNameRegex: String = config("readNameRegex") + var readNameRegex: Option[String] = config("readNameRegex") @Argument(doc = "OPTICAL_DUPLICATE_PIXEL_DISTANCE", required = false) var opticalDuplicatePixelDistance: Option[Int] = config("opticalDuplicatePixelDistance") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MergeSamFiles.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MergeSamFiles.scala index b1327bc3aa5d00d8608515897a5fd81ab2d3fbe3..4de143498d9812502a2141facfacc0bbf2b046d5 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MergeSamFiles.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MergeSamFiles.scala @@ -41,7 +41,7 @@ class MergeSamFiles(val root: Configurable) extends Picard { var useThreading: Boolean = config("use_threading", default = false) @Argument(doc = "COMMENT", required = false) - var comment: String = config("comment") + var comment: 
Option[String] = config("comment") override def commandLine = super.commandLine + repeat("INPUT=", input, spaceSeparated = false) + diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/Picard.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/Picard.scala index e1d5ccf4e1e68c0d2078f0dbd3f371cc7bb960bc..e417a85fd65c0d096961a910d20a8c890de28e91 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/Picard.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/Picard.scala @@ -18,27 +18,29 @@ package nl.lumc.sasc.biopet.extensions.picard import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction import org.broadinstitute.gatk.utils.commandline.{ Argument } -trait Picard extends BiopetJavaCommandLineFunction { +abstract class Picard extends BiopetJavaCommandLineFunction { + override def subPath = "picard" :: super.subPath + @Argument(doc = "VERBOSITY", required = false) - var verbosity: String = config("verbosity", submodule = "picard") + var verbosity: Option[String] = config("verbosity") @Argument(doc = "QUIET", required = false) - var quiet: Boolean = config("quiet", default = false, submodule = "picard") + var quiet: Boolean = config("quiet", default = false) @Argument(doc = "VALIDATION_STRINGENCY", required = false) - var stringency: String = config("validationstringency", submodule = "picard") + var stringency: Option[String] = config("validationstringency") @Argument(doc = "COMPRESSION_LEVEL", required = false) - var compression: Option[Int] = config("compressionlevel", submodule = "picard") + var compression: Option[Int] = config("compressionlevel") @Argument(doc = "MAX_RECORDS_IN_RAM", required = false) - var maxRecordsInRam: Option[Int] = config("maxrecordsinram", submodule = "picard") + var maxRecordsInRam: Option[Int] = config("maxrecordsinram") @Argument(doc = "CREATE_INDEX", required = false) - var createIndex: 
Boolean = config("createindex", default = true, submodule = "picard") + var createIndex: Boolean = config("createindex", default = true) @Argument(doc = "CREATE_MD5_FILE", required = false) - var createMd5: Boolean = config("createmd5", default = false, submodule = "picard") + var createMd5: Boolean = config("createmd5", default = false) // override def versionCommand = executable + " " + javaOpts + " " + javaExecutable + " -h" // override val versionRegex = """Version: (.*)""".r diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SamToFastq.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SamToFastq.scala index cb61cc4a8088da7dcbf9b87e5395722dd51f5f01..db784ee2b11767654e8fb2eb2174f9c0d63a11fa 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SamToFastq.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SamToFastq.scala @@ -38,7 +38,7 @@ class SamToFastq(val root: Configurable) extends Picard { var outputPerRg: Boolean = config("outputPerRg", default = false) @Argument(doc = "Output dir", required = false) - var outputDir: String = config("outputDir") + var outputDir: String = _ @Argument(doc = "re reverse", required = false) var reReverse: Boolean = config("reReverse", default = false) @@ -53,7 +53,7 @@ class SamToFastq(val root: Configurable) extends Picard { var clippingAtribute: String = config("clippingAtribute") @Argument(doc = "clippingAction", required = false) - var clippingAction: String = config("clippingAction") + var clippingAction: Option[String] = config("clippingAction") @Argument(doc = "read1Trim", required = false) var read1Trim: Option[Int] = config("read1Trim") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/sambamba/Sambamba.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/sambamba/Sambamba.scala index 
8c6cd2e6fc41a56db4909af198e77f5c3c3d0ff2..a6fe688c2486f46701d6397c8fcffacee4b1c29c 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/sambamba/Sambamba.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/sambamba/Sambamba.scala @@ -21,7 +21,9 @@ abstract class Sambamba extends BiopetCommandLineFunction { override val defaultVmem = "4G" override val defaultThreads = 2 - executable = config("exe", default = "sambamba", submodule = "sambamba", freeVar = false) + override def subPath = "sambamba" :: super.subPath + + executable = config("exe", default = "sambamba", freeVar = false) override def versionCommand = executable override val versionRegex = """sambamba v(.*)""".r override val versionExitcode = List(0, 1) diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/Samtools.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/Samtools.scala index 1cb345a692b05680a2474f3d6169d643ad4f733c..740da10524d33f6e5e370bdc3320defa50f7ee79 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/Samtools.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/Samtools.scala @@ -18,7 +18,8 @@ package nl.lumc.sasc.biopet.extensions.samtools import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction abstract class Samtools extends BiopetCommandLineFunction { - executable = config("exe", default = "samtools", submodule = "samtools", freeVar = false) + override def subPath = "samtools" :: super.subPath + executable = config("exe", default = "samtools") override def versionCommand = executable override val versionRegex = """Version: (.*)""".r override val versionExitcode = List(0, 1) diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsMpileup.scala 
b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsMpileup.scala index f0e6457a491a09e10580ad100dcb1c0622974b4c..b8a3a2d0e6ec913611abe21b00c1dfe200842418 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsMpileup.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsMpileup.scala @@ -30,7 +30,7 @@ class SamtoolsMpileup(val root: Configurable) extends Samtools { var reference: File = config("reference") @Input(doc = "Interval bed") - var intervalBed: File = config("interval_bed") + var intervalBed: Option[File] = config("interval_bed") var disableBaq: Boolean = config("disable_baq") var minMapQuality: Option[Int] = config("min_map_quality") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/Seqtk.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/Seqtk.scala index 8b82228d8727195fc9dc7997fad0f28affaa2c79..a1131371ade82303aca47155593568f2cadbeb2d 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/Seqtk.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/Seqtk.scala @@ -21,7 +21,8 @@ import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction * Abstract class for all seqtk wrappers. 
*/ abstract class Seqtk extends BiopetCommandLineFunction { - executable = config("exe", default = "seqtk", submodule = "seqtk") + override def subPath = "seqtk" :: super.subPath + executable = config("exe", default = "seqtk", freeVar = true) override def versionCommand = executable override val versionRegex = """Version: (.*)""".r override val versionExitcode = List(0, 1) diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/SeqtkSeq.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/SeqtkSeq.scala index a8e94d24a11adb7bce4c652b13f5ed1030bceb09..9838040cc74a5da4cff3073ac0b2123b6de9e954 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/SeqtkSeq.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/SeqtkSeq.scala @@ -37,7 +37,7 @@ class SeqtkSeq(val root: Configurable) extends Seqtk { var q: Option[Int] = config("q") /** masked bases converted to CHAR; 0 for lowercase [0] */ - var n: String = config("n") + var n: Option[String] = config("n") /** number of residues per line; 0 for 2^32-1 [0] */ var l: Option[Int] = config("l") @@ -52,34 +52,34 @@ class SeqtkSeq(val root: Configurable) extends Seqtk { var f: Option[Int] = config("f") /** mask regions in BED or name list FILE [null] */ - var M: File = config("M") + var M: Option[File] = config("M") /** drop sequences with length shorter than INT [0] */ var L: Option[Int] = config("L") /** mask complement region (effective with -M) */ - var c: Boolean = config("c") + var c: Boolean = config("c", default = false) /** reverse complement */ - var r: Boolean = config("r") + var r: Boolean = config("r", default = false) /** force FASTA output (discard quality) */ - var A: Boolean = config("A") + var A: Boolean = config("A", default = false) /** drop comments at the header lines */ - var C: Boolean = config("C") + var C: Boolean = config("C", default = false) /** drop sequences 
containing ambiguous bases */ - var N: Boolean = config("N") + var N: Boolean = config("N", default = false) /** output the 2n-1 reads only */ - var flag1: Boolean = config("1") + var flag1: Boolean = config("1", default = false) /** output the 2n reads only */ - var flag2: Boolean = config("2") + var flag2: Boolean = config("2", default = false) /** shift quality by '(-Q) - 33' */ - var V: Boolean = config("V") + var V: Boolean = config("V", default = false) def cmdLine = { required(executable) + diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/MultisamplePipelineTemplate.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/MultisamplePipelineTemplate.scala new file mode 100644 index 0000000000000000000000000000000000000000..7d46c4b8fb910398f61e9fb3b873f682efaf023f --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/MultisamplePipelineTemplate.scala @@ -0,0 +1,50 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. 
+ */ +package nl.lumc.sasc.biopet.pipelines + +import nl.lumc.sasc.biopet.core.{ PipelineCommand, MultiSampleQScript, BiopetQScript } +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.queue.QScript + +class MultisamplePipelineTemplate(val root: Configurable) extends QScript with MultiSampleQScript { + def this() = this(null) + + def makeSample(id: String) = new Sample(id) + class Sample(sampleId: String) extends AbstractSample(sampleId) { + + def makeLibrary(id: String) = new Library(id) + class Library(libId: String) extends AbstractLibrary(libId) { + protected def addJobs(): Unit = { + // Library jobs + } + } + + protected def addJobs(): Unit = { + // Sample jobs + } + } + + def addMultiSampleJobs(): Unit = { + } + + def init(): Unit = { + } + + def biopetScript() { + } +} + +object MultisamplePipelineTemplate extends PipelineCommand \ No newline at end of file diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/scripts/FastqSync.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/scripts/FastqSync.scala deleted file mode 100644 index 6cada2301456817e3e3c191fbd2b5461788fa47d..0000000000000000000000000000000000000000 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/scripts/FastqSync.scala +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. 
The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. - */ -package nl.lumc.sasc.biopet.scripts - -import java.io.File - -import org.broadinstitute.gatk.utils.commandline.{ Input, Output } - -import argonaut._, Argonaut._ -import scalaz._, Scalaz._ - -import nl.lumc.sasc.biopet.core.config.Configurable -import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction - -import scala.io.Source - -class FastqSync(val root: Configurable) extends PythonCommandLineFunction { - setPythonScript("sync_paired_end_reads.py") - - @Input(doc = "Start fastq") - var input_start_fastq: File = _ - - @Input(doc = "R1 input") - var input_R1: File = _ - - @Input(doc = "R2 input") - var input_R2: File = _ - - @Output(doc = "R1 output") - var output_R1: File = _ - - @Output(doc = "R2 output") - var output_R2: File = _ - - //No output Annotation so file - var output_stats: File = _ - - def cmdLine = { - getPythonCommand + - required(input_start_fastq) + - required(input_R1) + - required(input_R2) + - required(output_R1) + - required(output_R2) + - " > " + - required(output_stats) - } - - def getSummary: Json = { - val R1_filteredR = """Filtered (\d*) reads from first read file.""".r - val R2_filteredR = """Filtered (\d*) reads from second read file.""".r - val readsLeftR = """Synced read files contain (\d*) reads.""".r - - var R1_filtered = 0 - var R2_filtered = 0 - var readsLeft = 0 - - if (output_stats.exists) for (line <- Source.fromFile(output_stats).getLines) { - line match { - case R1_filteredR(m) => R1_filtered = m.toInt - case R2_filteredR(m) => R2_filtered = m.toInt - case readsLeftR(m) => readsLeft = m.toInt - case _ => - } - } - - return ("num_reads_discarded_R1" := R1_filtered) ->: - ("num_reads_discarded_R2" := R2_filtered) ->: - ("num_reads_kept" := readsLeft) ->: 
- jEmptyObject - } -} - -object FastqSync { - def apply(root: Configurable, input_start_fastq: File, input_R1: File, input_R2: File, - output_R1: File, output_R2: File, output_stats: File): FastqSync = { - val fastqSync = new FastqSync(root) - fastqSync.input_start_fastq = input_start_fastq - fastqSync.input_R1 = input_R1 - fastqSync.input_R2 = input_R2 - fastqSync.output_R1 = output_R1 - fastqSync.output_R2 = output_R2 - fastqSync.output_stats = output_stats - return fastqSync - } - - def mergeSummaries(jsons: List[Json]): Json = { - var R1_filtered = 0 - var R2_filtered = 0 - var readsLeft = 0 - - for (json <- jsons) { - R1_filtered += json.field("num_reads_discarded_R1").get.numberOrZero.toInt - R2_filtered += json.field("num_reads_discarded_R2").get.numberOrZero.toInt - readsLeft += json.field("num_reads_kept").get.numberOrZero.toInt - } - - return ("num_reads_discarded_R1" := R1_filtered) ->: - ("num_reads_discarded_R2" := R2_filtered) ->: - ("num_reads_kept" := readsLeft) ->: - jEmptyObject - } -} \ No newline at end of file diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/FastqSync.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/FastqSync.scala new file mode 100644 index 0000000000000000000000000000000000000000..aaa2797b65930d197089aaa72d90dd5e4d9382e4 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/FastqSync.scala @@ -0,0 +1,293 @@ +/** + * Copyright (c) 2014 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl> + * @author Wibowo Arindrarto <w.arindrarto@lumc.nl> + * + * This tool is a port of a Python implementation written by Martijn Vermaat[1] + * + * [1] https://github.com/martijnvermaat/bio-playground/blob/master/sync-paired-end-reads/sync_paired_end_reads.py + */ +package nl.lumc.sasc.biopet.tools + +import java.io.File +import scala.io.Source +import scala.util.matching.Regex + +import scala.annotation.tailrec +import 
scala.collection.JavaConverters._ + +import argonaut._, Argonaut._ +import scalaz._, Scalaz._ +import htsjdk.samtools.fastq.{ AsyncFastqWriter, BasicFastqWriter, FastqReader, FastqRecord } +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction +import nl.lumc.sasc.biopet.core.ToolCommand +import nl.lumc.sasc.biopet.core.config.Configurable + +/** + * FastqSync function class for usage in Biopet pipelines + * + * @param root Configuration object for the pipeline + */ +class FastqSync(val root: Configurable) extends BiopetJavaCommandLineFunction { + + javaMainClass = getClass.getName + + @Input(doc = "Original FASTQ file (read 1 or 2)", shortName = "r", required = true) + var refFastq: File = _ + + @Input(doc = "Input read 1 FASTQ file", shortName = "i", required = true) + var inputFastq1: File = _ + + @Input(doc = "Input read 2 FASTQ file", shortName = "j", required = true) + var inputFastq2: File = _ + + @Output(doc = "Output read 1 FASTQ file", shortName = "o", required = true) + var outputFastq1: File = _ + + @Output(doc = "Output read 2 FASTQ file", shortName = "p", required = true) + var outputFastq2: File = _ + + @Output(doc = "Sync statistics", required = true) + var outputStats: File = _ + + // executed command line + override def commandLine = + super.commandLine + + required("-r", refFastq) + + required("-i", inputFastq1) + + required("-j", inputFastq2) + + required("-o", outputFastq1) + + required("-p", outputFastq2) + " > " + + required(outputStats) + + // summary statistics + def summary: Json = { + + val regex = new Regex("""Filtered (\d*) reads from first read file. + |Filtered (\d*) reads from second read file. 
+ |Synced read files contain (\d*) reads.""".stripMargin, + "R1", "R2", "RL") + + val (countFilteredR1, countFilteredR2, countRLeft) = + if (outputStats.exists) { + val text = Source + .fromFile(outputStats) + .getLines() + .mkString("\n") + regex.findFirstMatchIn(text) match { + case None => (0, 0, 0) + case Some(rmatch) => (rmatch.group("R1").toInt, rmatch.group("R2").toInt, rmatch.group("RL").toInt) + } + } else (0, 0, 0) + + ("num_reads_discarded_R1" := countFilteredR1) ->: + ("num_reads_discarded_R2" := countFilteredR2) ->: + ("num_reads_kept" := countRLeft) ->: + jEmptyObject + } +} + +object FastqSync extends ToolCommand { + + /** + * Implicit class to allow for lazy retrieval of FastqRecord ID + * without any read pair mark + * + * @param fq FastqRecord + */ + private implicit class FastqPair(fq: FastqRecord) { + lazy val fragId = fq.getReadHeader.split("[_/][12]\\s??|\\s")(0) + } + + /** + * Counts from syncing FastqRecords + * + * @param numDiscard1 Number of reads discarded from the initial read 1 + * @param numDiscard2 Number of reads discarded from the initial read 2 + * @param numKept Number of items in result + */ + case class SyncCounts(numDiscard1: Int, numDiscard2: Int, numKept: Int) + + /** + * Filters out FastqRecord that are not present in the input iterators, using + * a reference sequence object + * + * @param pre FastqReader over reference FASTQ file + * @param seqA FastqReader over read 1 + * @param seqB FastqReader over read 2 + * @return + */ + def syncFastq(pre: FastqReader, seqA: FastqReader, seqB: FastqReader): (Stream[(FastqRecord, FastqRecord)], SyncCounts) = { + // counters for discarded A and B seqections + total kept + // NOTE: we are reasigning values to these variables in the recursion below + var (numDiscA, numDiscB, numKept) = (0, 0, 0) + + /** + * Syncs read pairs recursively + * + * @param pre Reference sequence, assumed to be a superset of both seqA and seqB + * @param seqA Sequence over read 1 + * @param seqB Sequence over 
read 2 + * @param acc Stream containing pairs which are present in read 1 and read 2 + * @return + */ + @tailrec def syncIter(pre: Stream[FastqRecord], + seqA: Stream[FastqRecord], seqB: Stream[FastqRecord], + acc: Stream[(FastqRecord, FastqRecord)]): Stream[(FastqRecord, FastqRecord)] = + (pre.headOption, seqA.headOption, seqB.headOption) match { + // recursion base case: both iterators have been exhausted + case (_, None, None) => acc + // illegal state: reference sequence exhausted but not seqA or seqB + case (None, Some(_), _) | (None, _, Some(_)) => + throw new NoSuchElementException("Reference record stream shorter than expected") + // keep recursion going if A still has items (we want to count how many) + case (_, _, None) => + numDiscA += 1 + syncIter(pre.tail, seqA.tail, Stream(), acc) + // like above but for B + case (_, None, _) => + numDiscB += 1 + syncIter(pre.tail, Stream(), seqB.tail, acc) + // where the magic happens! + case (Some(r), Some(a), Some(b)) => + // value of A iterator in the next recursion + val nextA = + // hold A if its head is not equal to reference + if (a.fragId != r.fragId) { + if (b.fragId == r.fragId) numDiscB += 1 + seqA + // otherwise, go to next item + } else seqA.tail + // like A above + val nextB = + if (b.fragId != r.fragId) { + if (a.fragId == r.fragId) numDiscA += 1 + seqB + } else seqB.tail + // value of accumulator in the next recursion + val nextAcc = + // keep accumulator unchanged if any of the two post streams + // have different elements compared to the reference stream + if (a.fragId != r.fragId || b.fragId != r.fragId) acc + // otherwise, grow accumulator + else { + numKept += 1 + acc ++ Stream((a, b)) + } + syncIter(pre.tail, nextA, nextB, nextAcc) + } + + (syncIter(pre.iterator.asScala.toStream, seqA.iterator.asScala.toStream, seqB.iterator.asScala.toStream, + Stream.empty[(FastqRecord, FastqRecord)]), + SyncCounts(numDiscA, numDiscB, numKept)) + } + + def writeSyncedFastq(sync: Stream[(FastqRecord, 
FastqRecord)], + counts: SyncCounts, + outputFastq1: AsyncFastqWriter, + outputFastq2: AsyncFastqWriter): Unit = { + sync.foreach { + case (rec1, rec2) => + outputFastq1.write(rec1) + outputFastq2.write(rec2) + } + println("Filtered %d reads from first read file.".format(counts.numDiscard1)) + println("Filtered %d reads from second read file.".format(counts.numDiscard2)) + println("Synced read files contain %d reads.".format(counts.numKept)) + } + + /** Function to merge this tool's summary with summaries from other objects */ + // TODO: refactor this into the object? At least make it work on the summary object + def mergeSummaries(jsons: List[Json]): Json = { + + val (read1FilteredCount, read2FilteredCount, readsLeftCount) = jsons + // extract the values we require from each JSON object into tuples + .map { + case json => + (json.field("num_reads_discarded_R1").get.numberOrZero.toInt, + json.field("num_reads_discarded_R2").get.numberOrZero.toInt, + json.field("num_reads_kept").get.numberOrZero.toInt) + } + // reduce the tuples + .reduceLeft { + (x: (Int, Int, Int), y: (Int, Int, Int)) => + (x._1 + y._1, x._2 + y._2, x._3 + y._3) + } + + ("num_reads_discarded_R1" := read1FilteredCount) ->: + ("num_reads_discarded_R2" := read2FilteredCount) ->: + ("num_reads_kept" := readsLeftCount) ->: + jEmptyObject + } + + case class Args(refFastq: File = new File(""), + inputFastq1: File = new File(""), + inputFastq2: File = new File(""), + outputFastq1: File = new File(""), + outputFastq2: File = new File("")) extends AbstractArgs + + class OptParser extends AbstractOptParser { + + // TODO: make output format independent from input format? + head( + s""" + |$commandName - Sync paired-end FASTQ files. + | + |This tool works with gzipped or non-gzipped FASTQ files. The output + |file will be gzipped when the input is also gzipped. 
+ """.stripMargin) + + opt[File]('r', "ref") required () valueName "<fastq>" action { (x, c) => + c.copy(refFastq = x) + } validate { + x => if (x.exists) success else failure("Reference FASTQ file not found") + } text "Reference FASTQ file" + + opt[File]('i', "in1") required () valueName "<fastq>" action { (x, c) => + c.copy(inputFastq1 = x) + } validate { + x => if (x.exists) success else failure("Input FASTQ file 1 not found") + } text "Input FASTQ file 1" + + opt[File]('j', "in2") required () valueName "<fastq[.gz]>" action { (x, c) => + c.copy(inputFastq2 = x) + } validate { + x => if (x.exists) success else failure("Input FASTQ file 2 not found") + } text "Input FASTQ file 2" + + opt[File]('o', "out1") required () valueName "<fastq[.gz]>" action { (x, c) => + c.copy(outputFastq1 = x) + } text "Output FASTQ file 1" + + opt[File]('p', "out2") required () valueName "<fastq>" action { (x, c) => + c.copy(outputFastq2 = x) + } text "Output FASTQ file 2" + } + + /** + * Parses the command line argument + * + * @param args Array of arguments + * @return + */ + def parseArgs(args: Array[String]): Args = new OptParser() + .parse(args, Args()) + .getOrElse(sys.exit(1)) + + def main(args: Array[String]): Unit = { + + val commandArgs: Args = parseArgs(args) + + val (synced, counts) = syncFastq( + new FastqReader(commandArgs.refFastq), + new FastqReader(commandArgs.inputFastq1), + new FastqReader(commandArgs.inputFastq2)) + + writeSyncedFastq(synced, counts, + // using 3000 for queue size to approximate NFS buffer + new AsyncFastqWriter(new BasicFastqWriter(commandArgs.outputFastq1), 3000), + new AsyncFastqWriter(new BasicFastqWriter(commandArgs.outputFastq2), 3000) + ) + } +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala index ac5eaa3c9d3b9648fadb1b90dd173c673bb368f5..ae21c34e4b820ebfb0b33e3b033196e03a58ef6e 100644 --- 
a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala @@ -21,6 +21,7 @@ import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction import nl.lumc.sasc.biopet.core.ToolCommand import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsMpileup +import nl.lumc.sasc.biopet.utils.ConfigUtils import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import scala.collection.mutable.ArrayBuffer import scala.io.Source @@ -50,7 +51,8 @@ class MpileupToVcf(val root: Configurable) extends BiopetJavaCommandLineFunction override val defaultVmem = "6G" memoryLimit = Option(2.0) - defaults ++= Map("samtoolsmpileup" -> Map("disable_baq" -> true, "min_map_quality" -> 1)) + override def defaults = ConfigUtils.mergeMaps(Map("samtoolsmpileup" -> Map("disable_baq" -> true, "min_map_quality" -> 1)), + super.defaults) override def afterGraph { super.afterGraph diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala index 7c5c678c741a34ccdc7a9bfb81b70808c8849e41..76f8887cced08a6c2ef0e16ca469f1700a7ee208 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala @@ -26,6 +26,7 @@ import nl.lumc.sasc.biopet.core.ToolCommand import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Output, Input } import scala.collection.JavaConversions._ +import scala.io.Source class VcfFilter(val root: Configurable) extends BiopetJavaCommandLineFunction { javaMainClass = getClass.getName @@ -40,7 +41,7 @@ class VcfFilter(val root: Configurable) extends BiopetJavaCommandLineFunction { var minTotalDepth: Option[Int] = config("min_total_depth") var 
minAlternateDepth: Option[Int] = config("min_alternate_depth") var minSamplesPass: Option[Int] = config("min_samples_pass") - var filterRefCalls: Boolean = config("filter_ref_calls") + var filterRefCalls: Boolean = config("filter_ref_calls", default = false) override val defaultVmem = "8G" memoryLimit = Option(4.0) @@ -58,6 +59,7 @@ class VcfFilter(val root: Configurable) extends BiopetJavaCommandLineFunction { object VcfFilter extends ToolCommand { case class Args(inputVcf: File = null, outputVcf: File = null, + invertedOutputVcf: Option[File] = None, minQualscore: Option[Double] = None, minSampleDepth: Int = -1, minTotalDepth: Int = -1, @@ -69,7 +71,8 @@ object VcfFilter extends ToolCommand { diffGenotype: List[(String, String)] = Nil, filterHetVarToHomVar: List[(String, String)] = Nil, filterRefCalls: Boolean = false, - filterNoCalls: Boolean = false) extends AbstractArgs + filterNoCalls: Boolean = false, + iDset: Set[String] = Set()) extends AbstractArgs class OptParser extends AbstractOptParser { opt[File]('I', "inputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) => @@ -78,6 +81,9 @@ object VcfFilter extends ToolCommand { opt[File]('o', "outputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) => c.copy(outputVcf = x) } text ("Output vcf file") + opt[File]("invertedOutputVcf") maxOccurs (1) valueName ("<file>") action { (x, c) => + c.copy(invertedOutputVcf = Some(x)) + } text ("inverted output vcf file") opt[Int]("minSampleDepth") unbounded () valueName ("<int>") action { (x, c) => c.copy(minSampleDepth = x) } text ("Min value for DP in genotype fields") @@ -116,6 +122,12 @@ object VcfFilter extends ToolCommand { opt[Double]("minQualscore") unbounded () action { (x, c) => c.copy(minQualscore = Some(x)) } text ("Min qual score") + opt[String]("id") unbounded () action { (x, c) => + c.copy(iDset = c.iDset + x) + } text ("Id that may pass the filter") + opt[File]("id-file") unbounded () action { (x, c) => + c.copy(iDset = 
c.iDset ++ Source.fromFile(x).getLines()) + } text ("File that contain list of IDs to get from vcf file") } var commandArgs: Args = _ @@ -124,6 +136,7 @@ object VcfFilter extends ToolCommand { * @param args the command line arguments */ def main(args: Array[String]): Unit = { + logger.info("Start") val argsParser = new OptParser commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1) @@ -132,6 +145,11 @@ object VcfFilter extends ToolCommand { val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().setOutputFile(commandArgs.outputVcf).build) writer.writeHeader(header) + val invertedWriter = commandArgs.invertedOutputVcf.collect { case x => new VariantContextWriterBuilder().setOutputFile(x).build } + invertedWriter.foreach(_.writeHeader(header)) + + var counterTotal = 0 + var counterLeft = 0 for (record <- reader) { if (minQualscore(record) && filterRefCalls(record) && @@ -143,12 +161,20 @@ object VcfFilter extends ToolCommand { mustHaveVariant(record) && notSameGenotype(record) && filterHetVarToHomVar(record) && - denovoInSample(record)) { + denovoInSample(record) && + inIdSet(record)) { writer.add(record) - } + counterLeft += 1 + } else + invertedWriter.foreach(_.add(record)) + counterTotal += 1 + if (counterTotal % 100000 == 0) logger.info(counterTotal + " variants processed, " + counterLeft + " left") } + logger.info(counterTotal + " variants processed, " + counterLeft + " left") reader.close writer.close + invertedWriter.foreach(_.close()) + logger.info("Done") } def minQualscore(record: VariantContext): Boolean = { @@ -241,4 +267,9 @@ object VcfFilter extends ToolCommand { } return true } + + def inIdSet(record: VariantContext): Boolean = { + if (commandArgs.iDset.isEmpty) true + else record.getID.split(",").exists(commandArgs.iDset.contains(_)) + } } \ No newline at end of file diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala 
b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala new file mode 100644 index 0000000000000000000000000000000000000000..11d930d5972897824e341d22d46808d849d7767b --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala @@ -0,0 +1,472 @@ +package nl.lumc.sasc.biopet.tools + +import java.io.{ FileOutputStream, PrintWriter, File } + +import htsjdk.variant.variantcontext.{ VariantContext, Genotype } +import htsjdk.variant.vcf.VCFFileReader +import nl.lumc.sasc.biopet.core.{ BiopetJavaCommandLineFunction, ToolCommand } +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Output, Input } +import scala.collection.JavaConversions._ +import scala.collection.mutable +import scala.sys.process.{ Process, ProcessLogger } +import htsjdk.samtools.util.Interval + +/** + * Created by pjvan_thof on 1/10/15. + */ +class VcfStats(val root: Configurable) extends BiopetJavaCommandLineFunction { + javaMainClass = getClass.getName + + @Input(doc = "Input fastq", shortName = "I", required = true) + var input: File = _ + + protected var outputDir: String = _ + + /** + * Set output dir and a output file + * @param dir + */ + def setOutputDir(dir: String): Unit = { + outputDir = dir + this.jobOutputFile = new File(dir + File.separator + ".vcfstats.out") + } + + /** + * Creates command to execute extension + * @return + */ + override def commandLine = super.commandLine + + required("-I", input) + + required("-o", outputDir) +} + +object VcfStats extends ToolCommand { + /** Commandline argument */ + case class Args(inputFile: File = null, outputDir: String = null, intervals: Option[File] = None) extends AbstractArgs + + /** Parsing commandline arguments */ + class OptParser extends AbstractOptParser { + opt[File]('I', "inputFile") required () unbounded () valueName ("<file>") action { (x, c) => + c.copy(inputFile = x) + } + opt[String]('o', "outputDir") required () 
unbounded () valueName ("<file>") action { (x, c) => + c.copy(outputDir = x) + } + //TODO: add interval argument + /* + opt[File]('i', "intervals") unbounded () valueName ("<file>") action { (x, c) => + c.copy(intervals = Some(x)) + } + */ + } + + /** + * Class to store sample to sample compare stats + * @param genotypeOverlap Number of genotypes match with other sample + * @param alleleOverlap Number of alleles also found in other sample + */ + case class SampleToSampleStats(var genotypeOverlap: Int = 0, + var alleleOverlap: Int = 0) { + /** Add an other class */ + def +=(other: SampleToSampleStats) { + this.genotypeOverlap += other.genotypeOverlap + this.alleleOverlap += other.alleleOverlap + } + } + + /** + * class to store all sample relative stats + * @param genotypeStats Stores all genotype relative stats + * @param sampleToSample Stores sample to sample compare stats + */ + case class SampleStats(val genotypeStats: mutable.Map[String, mutable.Map[Any, Int]] = mutable.Map(), + val sampleToSample: mutable.Map[String, SampleToSampleStats] = mutable.Map()) { + /** Add an other class */ + def +=(other: SampleStats): Unit = { + for ((key, value) <- other.sampleToSample) { + if (this.sampleToSample.contains(key)) this.sampleToSample(key) += value + else this.sampleToSample(key) = value + } + for ((field, fieldMap) <- other.genotypeStats) { + val thisField = this.genotypeStats.get(field) + if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap) + else this.genotypeStats += field -> fieldMap + } + } + } + + /** + * General stats class to store vcf stats + * @param generalStats Stores are general stats + * @param samplesStats Stores all sample/genotype specific stats + */ + case class Stats(val generalStats: mutable.Map[String, mutable.Map[Any, Int]] = mutable.Map(), + val samplesStats: mutable.Map[String, SampleStats] = mutable.Map()) { + /** Add an other class */ + def +=(other: Stats): Stats = { + for ((key, value) <- other.samplesStats) { + if 
(this.samplesStats.contains(key)) this.samplesStats(key) += value + else this.samplesStats(key) = value + } + for ((field, fieldMap) <- other.generalStats) { + val thisField = this.generalStats.get(field) + if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap) + else this.generalStats += field -> fieldMap + } + this + } + } + + /** + * Merge m2 into m1 + * @param m1 + * @param m2 + */ + def mergeStatsMap(m1: mutable.Map[Any, Int], m2: mutable.Map[Any, Int]): Unit = { + for (key <- m2.keySet) + m1(key) = m1.getOrElse(key, 0) + m2(key) + } + + /** + * Merge m2 into m1 + * @param m1 + * @param m2 + */ + def mergeNestedStatsMap(m1: mutable.Map[String, mutable.Map[Any, Int]], m2: Map[String, Map[Any, Int]]): Unit = { + for ((field, fieldMap) <- m2) { + if (m1.contains(field)) { + for ((key, value) <- fieldMap) { + if (m1(field).contains(key)) m1(field)(key) += value + else m1(field)(key) = value + } + } else m1(field) = mutable.Map(fieldMap.toList: _*) + } + } + + protected var commandArgs: Args = _ + + /** + * @param args the command line arguments + */ + def main(args: Array[String]): Unit = { + logger.info("Started") + val argsParser = new OptParser + commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1) + + val reader = new VCFFileReader(commandArgs.inputFile, true) + val header = reader.getFileHeader + val samples = header.getSampleNamesInOrder.toList + + val intervals: List[Interval] = ( + for ( + seq <- header.getSequenceDictionary.getSequences; + chunks = seq.getSequenceLength / 10000000; + i <- 1 until chunks + ) yield { + val size = seq.getSequenceLength / chunks + val begin = size * (i - 1) + 1 + val end = if (i >= chunks) seq.getSequenceLength else size * i + new Interval(seq.getSequenceName, begin, end) + } + ).toList + + val totalBases = intervals.foldRight(0L)(_.length() + _) + + // Reading vcf records + logger.info("Start reading vcf records") + + def createStats: Stats = { + val stats = new Stats + //init stats + for (sample1 <- 
samples) { + stats.samplesStats += sample1 -> new SampleStats + for (sample2 <- samples) { + stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats + } + } + stats + } + + var variantCounter = 0L + var baseCounter = 0L + + def status(count: Int, interval: Interval): Unit = { + variantCounter += count + baseCounter += interval.length() + val fraction = baseCounter.toFloat / totalBases * 100 + logger.info(interval + " done, " + count + " rows processed") + logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done") + } + + val statsChunks = for (interval <- intervals.par) yield { + val reader = new VCFFileReader(commandArgs.inputFile, true) + var chunkCounter = 0 + val stats = createStats + logger.info("Starting on: " + interval) + + for ( + record <- reader.query(interval.getSequence, interval.getStart, interval.getEnd) if record.getStart <= interval.getEnd + ) { + mergeNestedStatsMap(stats.generalStats, checkGeneral(record)) + for (sample1 <- samples) yield { + val genotype = record.getGenotype(sample1) + mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(record, genotype)) + for (sample2 <- samples) { + val genotype2 = record.getGenotype(sample2) + if (genotype.getAlleles == genotype2.getAlleles) + stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1 + stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += genotype.getAlleles.count(allele => genotype2.getAlleles.exists(_.basesMatch(allele))) + } + } + chunkCounter += 1 + } + status(chunkCounter, interval) + stats + } + + val stats = statsChunks.toList.fold(createStats)(_ += _) + + logger.info("Done reading vcf records") + + writeField("QUAL", stats.generalStats.getOrElse("QUAL", mutable.Map())) + writeField("general", stats.generalStats.getOrElse("general", mutable.Map())) + writeGenotypeFields(stats, commandArgs.outputDir + "/genotype_", samples) + writeOverlap(stats, _.genotypeOverlap, 
commandArgs.outputDir + "/sample_compare/genotype_overlap", samples) + writeOverlap(stats, _.alleleOverlap, commandArgs.outputDir + "/sample_compare/allele_overlap", samples) + + logger.info("Done") + } + + /** + * Function to check all general stats, all info expect sample/genotype specific stats + * @param record + * @return + */ + protected def checkGeneral(record: VariantContext): Map[String, Map[Any, Int]] = { + val buffer = mutable.Map[String, Map[Any, Int]]() + + def addToBuffer(key: String, value: Any): Unit = { + val map = buffer.getOrElse(key, Map()) + buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1))) + } + + buffer += "QUAL" -> Map(record.getPhredScaledQual -> 1) + + addToBuffer("general", "Total") + if (record.isBiallelic) addToBuffer("general", "Biallelic") + if (record.isComplexIndel) addToBuffer("general", "ComplexIndel") + if (record.isFiltered) addToBuffer("general", "Filtered") + if (record.isFullyDecoded) addToBuffer("general", "FullyDecoded") + if (record.isIndel) addToBuffer("general", "Indel") + if (record.isMixed) addToBuffer("general", "Mixed") + if (record.isMNP) addToBuffer("general", "MNP") + if (record.isMonomorphicInSamples) addToBuffer("general", "MonomorphicInSamples") + if (record.isNotFiltered) addToBuffer("general", "NotFiltered") + if (record.isPointEvent) addToBuffer("general", "PointEvent") + if (record.isPolymorphicInSamples) addToBuffer("general", "PolymorphicInSamples") + if (record.isSimpleDeletion) addToBuffer("general", "SimpleDeletion") + if (record.isSimpleInsertion) addToBuffer("general", "SimpleInsertion") + if (record.isSNP) addToBuffer("general", "SNP") + if (record.isStructuralIndel) addToBuffer("general", "StructuralIndel") + if (record.isSymbolic) addToBuffer("general", "Symbolic") + if (record.isSymbolicOrSV) addToBuffer("general", "SymbolicOrSV") + if (record.isVariant) addToBuffer("general", "Variant") + + buffer.toMap + } + + /** + * Function to check sample/genotype specific stats + * @param 
record + * @param genotype + * @return + */ + protected def checkGenotype(record: VariantContext, genotype: Genotype): Map[String, Map[Any, Int]] = { + val buffer = mutable.Map[String, Map[Any, Int]]() + + def addToBuffer(key: String, value: Any): Unit = { + val map = buffer.getOrElse(key, Map()) + buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1))) + } + + buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1) + buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1) + + val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList + + addToBuffer("general", "Total") + if (genotype.isHet) addToBuffer("general", "Het") + if (genotype.isHetNonRef) addToBuffer("general", "HetNonRef") + if (genotype.isHom) addToBuffer("general", "Hom") + if (genotype.isHomRef) addToBuffer("general", "HomRef") + if (genotype.isHomVar) addToBuffer("general", "HomVar") + if (genotype.isMixed) addToBuffer("general", "Mixed") + if (genotype.isNoCall) addToBuffer("general", "NoCall") + if (genotype.isNonInformative) addToBuffer("general", "NonInformative") + if (genotype.isAvailable) addToBuffer("general", "Available") + if (genotype.isCalled) addToBuffer("general", "Called") + if (genotype.isFiltered) addToBuffer("general", "Filtered") + + if (genotype.hasAD) { + val ad = genotype.getAD + for (i <- 0 until ad.size if ad(i) > 0) { + addToBuffer("AD", ad(i)) + if (i == 0) addToBuffer("AD-ref", ad(i)) + if (i > 0) addToBuffer("AD-alt", ad(i)) + if (usedAlleles.exists(_ == i)) addToBuffer("AD-used", ad(i)) + else addToBuffer("AD-not_used", ad(i)) + } + } + + buffer.toMap + } + + /** + * Function to write stats to tsv files + * @param stats + * @param prefix + * @param samples + */ + protected def writeGenotypeFields(stats: Stats, prefix: String, samples: List[String]) { + val fields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general") + for (field <- fields) { + 
writeGenotypeField(stats, prefix, samples, field) + } + } + + /** + * Function to write 1 specific genotype field + * @param stats + * @param prefix + * @param samples + * @param field + */ + protected def writeGenotypeField(stats: Stats, prefix: String, samples: List[String], field: String): Unit = { + val file = new File(prefix + field + ".tsv") + file.getParentFile.mkdirs() + val writer = new PrintWriter(file) + writer.println(samples.mkString(field + "\t", "\t", "")) + val keySet = (for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any, Int]()).keySet).fold(Set[Any]())(_ ++ _) + for (key <- keySet.toList.sortWith(sortAnyAny(_, _))) { + val values = for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any, Int]()).getOrElse(key, 0) + writer.println(values.mkString(key + "\t", "\t", "")) + } + writer.close() + plotLine(file) + } + + /** + * Function to write 1 specific general field + * @param prefix + * @param data + * @return + */ + protected def writeField(prefix: String, data: mutable.Map[Any, Int]): File = { + val file = new File(commandArgs.outputDir + "/" + prefix + ".tsv") + println(file) + file.getParentFile.mkdirs() + val writer = new PrintWriter(file) + writer.println("\t" + prefix) + for (key <- data.keySet.toList.sortWith(sortAnyAny(_, _))) { + writer.println(key + "\t" + data(key)) + } + writer.close() + file + } + + /** + * Function to sort Any values + * @param a + * @param b + * @return + */ + def sortAnyAny(a: Any, b: Any): Boolean = { + a match { + case ai: Int => { + b match { + case bi: Int => ai < bi + case bi: Double => ai < bi + case _ => a.toString < b.toString + } + } + case _ => a.toString < b.toString + } + } + + /** + * Function to write sample to sample compare tsv's / heatmaps + * @param stats + * @param function function to extract targeted var in SampleToSampleStats + * @param prefix + * @param samples + */ + def writeOverlap(stats: Stats, function: 
SampleToSampleStats => Int, + prefix: String, samples: List[String]): Unit = { + val absFile = new File(prefix + ".abs.tsv") + val relFile = new File(prefix + ".rel.tsv") + + absFile.getParentFile.mkdirs() + + val absWriter = new PrintWriter(absFile) + val relWriter = new PrintWriter(relFile) + + absWriter.println(samples.mkString("\t", "\t", "")) + relWriter.println(samples.mkString("\t", "\t", "")) + for (sample1 <- samples) { + val values = for (sample2 <- samples) yield function(stats.samplesStats(sample1).sampleToSample(sample2)) + + absWriter.println(values.mkString(sample1 + "\t", "\t", "")) + + val total = function(stats.samplesStats(sample1).sampleToSample(sample1)) + relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", "")) + } + absWriter.close() + relWriter.close() + + plotHeatmap(relFile) + } + + /** + * Plots heatmaps on target tsv file + * @param file + */ + def plotHeatmap(file: File) { + executeRscript("plotHeatmap.R", Array(file.getAbsolutePath, + file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png", + file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png", + file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png")) + } + + /** + * Plots line graph with target tsv file + * @param file + */ + def plotLine(file: File) { + executeRscript("plotXY.R", Array(file.getAbsolutePath, + file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png")) + } + + /** + * Function to execute Rscript as subproces + * @param resource + * @param args + */ + def executeRscript(resource: String, args: Array[String]): Unit = { + val is = getClass.getResourceAsStream(resource) + val file = File.createTempFile("script.", "." 
+ resource) + val os = new FileOutputStream(file) + org.apache.commons.io.IOUtils.copy(is, os) + os.close() + + val command: String = "Rscript " + file + " " + args.mkString(" ") + + logger.info("Starting: " + command) + val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x))) + if (process.exitValue() == 0) logger.info("Done: " + command) + else { + logger.warn("Failed: " + command) + if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info") + } + } +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala index a0a428d60db140abe196b4089069ee8c8f79c643..9f1cddb7913cf7846daa72c7b9e695988b3a14c6 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala @@ -141,7 +141,8 @@ object ConfigUtils extends Logging { val num = json.number.get if (num % 1 > 0) return num.toDouble else return num.toLong - } else throw new IllegalStateException("Config value type not supported, value: " + json) + } else if (json.isNull) return None + else throw new IllegalStateException("Config value type not supported, value: " + json) } /** @@ -166,8 +167,10 @@ object ConfigUtils extends Logging { def anyToJson(any: Any): Json = { any match { case j: Json => j + case None => Json.jNull case m: Map[_, _] => mapToJson(m.map(m => m._1.toString -> anyToJson(m._2))) case l: List[_] => Json.array(l.map(anyToJson(_)): _*) + case b: Boolean => Json.jBool(b) case n: Int => Json.jNumberOrString(n) case n: Double => Json.jNumberOrString(n) case n: Long => Json.jNumberOrString(n) @@ -333,7 +336,18 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2file(value: ConfigValue): File = { - if (value != null && value.value != null) new File(any2string(value.value)) else null + if (value 
!= null && value.value != null && value.value != None) new File(any2string(value.value)) + else throw new IllegalStateException("Value does not exist") + } + + /** + * Convert ConfigValue to File + * @param value Input ConfigValue + * @return + */ + implicit def configValue2optionFile(value: ConfigValue): Option[File] = { + if (value != null && value.value != null && value.value != None) Some(new File(any2string(value.value))) + else None } /** @@ -342,7 +356,18 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2string(value: ConfigValue): String = { - if (value != null) any2string(value.value) else null + if (value != null && value.value != null && value.value != None) any2string(value.value) + else throw new IllegalStateException("Value does not exist") + } + + /** + * Convert ConfigValue to String + * @param value Input ConfigValue + * @return + */ + implicit def configValue2optionString(value: ConfigValue): Option[String] = { + if (value != null && value.value != null && value.value != None) Some(any2string(value.value)) + else None } /** @@ -351,7 +376,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2long(value: ConfigValue): Long = { - if (value != null) any2long(value.value) else 0 + if (value != null && value.value != null && value.value != None) any2long(value.value) + else throw new IllegalStateException("Value does not exist") } /** @@ -360,7 +386,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2optionLong(value: ConfigValue): Option[Long] = { - if (value != null) Option(any2long(value.value)) else None + if (value != null && value.value != null && value.value != None) Option(any2long(value.value)) + else None } /** @@ -369,7 +396,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2int(value: ConfigValue): Int = { - if (value != null) any2int(value.value) else 0 + if (value != null && value.value != null && value.value != None) 
any2int(value.value) + else throw new IllegalStateException("Value does not exist") } /** @@ -378,7 +406,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2optionInt(value: ConfigValue): Option[Int] = { - if (value != null) Option(any2int(value.value)) else None + if (value != null && value.value != null && value.value != None) Option(any2int(value.value)) + else None } /** @@ -387,7 +416,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2double(value: ConfigValue): Double = { - if (value != null) any2double(value.value) else 0 + if (value != null && value.value != null && value.value != None) any2double(value.value) + else throw new IllegalStateException("Value does not exist") } /** @@ -396,7 +426,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2optionDouble(value: ConfigValue): Option[Double] = { - if (value != null) Option(any2double(value.value)) else None + if (value != null && value.value != null && value.value != None) Option(any2double(value.value)) + else None } /** @@ -405,7 +436,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2float(value: ConfigValue): Float = { - if (value != null) any2float(value.value) else 0 + if (value != null && value.value != null && value.value != None) any2float(value.value) + else throw new IllegalStateException("Value does not exist") } /** @@ -414,7 +446,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2optionFloat(value: ConfigValue): Option[Float] = { - if (value != null) Option(any2float(value.value)) else None + if (value != null && value.value != null && value.value != None) Option(any2float(value.value)) + else None } /** @@ -423,7 +456,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2boolean(value: ConfigValue): Boolean = { - if (value != null) any2boolean(value.value) else false + if (value != null && value.value != null && value.value != 
None) any2boolean(value.value) + else throw new IllegalStateException("Value does not exist") } /** @@ -432,7 +466,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2optionBoolean(value: ConfigValue): Option[Boolean] = { - if (value != null) Option(any2boolean(value.value)) else None + if (value != null && value.value != null && value.value != None) Option(any2boolean(value.value)) + else None } /** @@ -441,7 +476,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2list(value: ConfigValue): List[Any] = { - if (value != null) any2list(value.value) else null + if (value != null && value.value != null && value.value != None) any2list(value.value) + else throw new IllegalStateException("Value does not exist") } /** @@ -450,7 +486,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2stringList(value: ConfigValue): List[String] = { - if (value != null) any2stringList(value.value) else null + if (value != null && value.value != null && value.value != None) any2stringList(value.value) + else throw new IllegalStateException("Value does not exist") } /** @@ -459,7 +496,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2stringSet(value: ConfigValue): Set[String] = { - if (value != null && value.value != null) any2stringList(value.value).toSet else null + if (value != null && value.value != null && value.value != None) any2stringList(value.value).toSet + else throw new IllegalStateException("Value does not exist") } /** @@ -468,7 +506,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2map(value: ConfigValue): Map[String, Any] = { - if (value != null) any2map(value.value) else null + if (value != null && value.value != null && value.value != None) any2map(value.value) + else throw new IllegalStateException("Value does not exist") } } } diff --git a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/FastqSyncTest.scala 
b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/FastqSyncTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..1c98a36f2698d23806aa48de151b4cffa20eb056 --- /dev/null +++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/FastqSyncTest.scala @@ -0,0 +1,230 @@ +/** + * Copyright (c) 2014 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl> + * @author Wibowo Arindrarto <w.arindrarto@lumc.nl> + */ +package nl.lumc.sasc.biopet.tools + +import java.io.File +import java.nio.file.Paths +import scala.collection.JavaConverters._ + +import htsjdk.samtools.fastq.{ AsyncFastqWriter, FastqReader, FastqRecord } +import org.mockito.Mockito.{ inOrder => inOrd, when } +import org.scalatest.Matchers +import org.scalatest.mock.MockitoSugar +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.{ DataProvider, Test } + +class FastqSyncTest extends TestNGSuite with MockitoSugar with Matchers { + + import FastqSync._ + + private def resourceFile(p: String): File = + new File(resourcePath(p)) + + private def resourcePath(p: String): String = + Paths.get(getClass.getResource(p).toURI).toString + + // Helper functions to create iterator over FastqRecords given its IDs as Ints + private def recordsOver(ids: String*): java.util.Iterator[FastqRecord] = ids + .map(x => new FastqRecord(x, "A", "", "H")) + .toIterator.asJava + + @DataProvider(name = "mockReaderProvider") + def mockReaderProvider() = + Array( + Array(mock[FastqReader], mock[FastqReader], mock[FastqReader])) + + @Test(dataProvider = "mockReaderProvider") + def testDefault(refMock: FastqReader, aMock: FastqReader, bMock: FastqReader) = { + when(refMock.iterator) thenReturn recordsOver("1", "2", "3") + when(aMock.iterator) thenReturn recordsOver("1", "2", "3") + when(bMock.iterator) thenReturn recordsOver("1", "2", "3") + + val (sync, counts) = syncFastq(refMock, aMock, bMock) + sync.length shouldBe 3 + sync(0) shouldBe (new 
FastqRecord("1", "A", "", "H"), new FastqRecord("1", "A", "", "H")) + sync(1) shouldBe (new FastqRecord("2", "A", "", "H"), new FastqRecord("2", "A", "", "H")) + sync(2) shouldBe (new FastqRecord("3", "A", "", "H"), new FastqRecord("3", "A", "", "H")) + counts.numDiscard1 shouldBe 0 + counts.numDiscard2 shouldBe 0 + counts.numKept shouldBe 3 + } + + @Test(dataProvider = "mockReaderProvider") + def testRefTooShort(refMock: FastqReader, aMock: FastqReader, bMock: FastqReader) = { + when(refMock.iterator) thenReturn recordsOver("1", "2") + when(aMock.iterator) thenReturn recordsOver("1", "2", "3") + when(bMock.iterator) thenReturn recordsOver("1", "2", "3") + + val thrown = intercept[NoSuchElementException] { + syncFastq(refMock, aMock, bMock) + } + thrown.getMessage should ===("Reference record stream shorter than expected") + } + + @Test(dataProvider = "mockReaderProvider") + def testSeqAEmpty(refMock: FastqReader, aMock: FastqReader, bMock: FastqReader) = { + when(refMock.iterator) thenReturn recordsOver("1", "2", "3") + when(aMock.iterator) thenReturn recordsOver() + when(bMock.iterator) thenReturn recordsOver("1", "2", "3") + + val (sync, counts) = syncFastq(refMock, aMock, bMock) + sync.length shouldBe 0 + counts.numDiscard1 shouldBe 0 + counts.numDiscard2 shouldBe 3 + counts.numKept shouldBe 0 + } + + @Test(dataProvider = "mockReaderProvider") + def testSeqBEmpty(refMock: FastqReader, aMock: FastqReader, bMock: FastqReader) = { + when(refMock.iterator) thenReturn recordsOver("1", "2", "3") + when(aMock.iterator) thenReturn recordsOver("1", "2", "3") + when(bMock.iterator) thenReturn recordsOver() + + val (sync, counts) = syncFastq(refMock, aMock, bMock) + sync.length shouldBe 0 + counts.numDiscard1 shouldBe 3 + counts.numDiscard2 shouldBe 0 + counts.numKept shouldBe 0 + } + + @Test(dataProvider = "mockReaderProvider") + def testSeqAShorter(refMock: FastqReader, aMock: FastqReader, bMock: FastqReader) = { + when(refMock.iterator) thenReturn recordsOver("1", "2", 
"3") + when(aMock.iterator) thenReturn recordsOver("2", "3") + when(bMock.iterator) thenReturn recordsOver("1", "2", "3") + + val (sync, counts) = syncFastq(refMock, aMock, bMock) + sync.length shouldBe 2 + sync(0) shouldBe (new FastqRecord("2", "A", "", "H"), new FastqRecord("2", "A", "", "H")) + sync(1) shouldBe (new FastqRecord("3", "A", "", "H"), new FastqRecord("3", "A", "", "H")) + counts.numDiscard1 shouldBe 0 + counts.numDiscard2 shouldBe 1 + counts.numKept shouldBe 2 + } + + @Test(dataProvider = "mockReaderProvider") + def testSeqBShorter(refMock: FastqReader, aMock: FastqReader, bMock: FastqReader) = { + when(refMock.iterator) thenReturn recordsOver("1", "2", "3") + when(aMock.iterator) thenReturn recordsOver("2", "3") + when(bMock.iterator) thenReturn recordsOver("1", "2", "3") + + val (sync, counts) = syncFastq(refMock, aMock, bMock) + sync.length shouldBe 2 + sync(0) shouldBe (new FastqRecord("2", "A", "", "H"), new FastqRecord("2", "A", "", "H")) + sync(1) shouldBe (new FastqRecord("3", "A", "", "H"), new FastqRecord("3", "A", "", "H")) + counts.numDiscard1 shouldBe 0 + counts.numDiscard2 shouldBe 1 + counts.numKept shouldBe 2 + } + + @Test(dataProvider = "mockReaderProvider") + def testSeqABShorter(refMock: FastqReader, aMock: FastqReader, bMock: FastqReader) = { + when(refMock.iterator) thenReturn recordsOver("1", "2", "3") + when(aMock.iterator) thenReturn recordsOver("2", "3") + when(bMock.iterator) thenReturn recordsOver("1", "2") + + val (sync, counts) = syncFastq(refMock, aMock, bMock) + sync.length shouldBe 1 + sync(0) shouldBe (new FastqRecord("2", "A", "", "H"), new FastqRecord("2", "A", "", "H")) + counts.numDiscard1 shouldBe 1 + counts.numDiscard2 shouldBe 1 + counts.numKept shouldBe 1 + } + + @Test(dataProvider = "mockReaderProvider") + def testSeqABShorterPairMarkSlash(refMock: FastqReader, aMock: FastqReader, bMock: FastqReader) = { + when(refMock.iterator) thenReturn recordsOver("1/1", "2/1", "3/1") + when(aMock.iterator) thenReturn 
recordsOver("2/1", "3/1") + when(bMock.iterator) thenReturn recordsOver("1/2", "2/2") + + val (sync, counts) = syncFastq(refMock, aMock, bMock) + sync.length shouldBe 1 + sync(0) shouldBe (new FastqRecord("2/1", "A", "", "H"), new FastqRecord("2/2", "A", "", "H")) + counts.numDiscard1 shouldBe 1 + counts.numDiscard2 shouldBe 1 + counts.numKept shouldBe 1 + } + + @Test(dataProvider = "mockReaderProvider") + def testSeqABShorterPairMarkUnderscore(refMock: FastqReader, aMock: FastqReader, bMock: FastqReader) = { + when(refMock.iterator) thenReturn recordsOver("1_1", "2_1", "3_1") + when(aMock.iterator) thenReturn recordsOver("2_1", "3_1") + when(bMock.iterator) thenReturn recordsOver("1_2", "2_2") + + val (sync, counts) = syncFastq(refMock, aMock, bMock) + sync.length shouldBe 1 + sync(0) shouldBe (new FastqRecord("2_1", "A", "", "H"), new FastqRecord("2_2", "A", "", "H")) + counts.numDiscard1 shouldBe 1 + counts.numDiscard2 shouldBe 1 + counts.numKept shouldBe 1 + } + + @Test(dataProvider = "mockReaderProvider") + def testSeqABShorterWithDesc(refMock: FastqReader, aMock: FastqReader, bMock: FastqReader) = { + when(refMock.iterator) thenReturn recordsOver("1 desc1b", "2 desc2b", "3 desc3b") + when(aMock.iterator) thenReturn recordsOver("2 desc2a", "3 desc3a") + when(bMock.iterator) thenReturn recordsOver("1 desc1b", "2 desc2b") + + val (sync, counts) = syncFastq(refMock, aMock, bMock) + sync.length shouldBe 1 + sync(0) shouldBe (new FastqRecord("2 desc2a", "A", "", "H"), new FastqRecord("2 desc2b", "A", "", "H")) + counts.numDiscard1 shouldBe 1 + counts.numDiscard2 shouldBe 1 + counts.numKept shouldBe 1 + } + + @Test(dataProvider = "mockReaderProvider") + def testComplex(refMock: FastqReader, aMock: FastqReader, bMock: FastqReader) = { + when(refMock.iterator) thenReturn recordsOver("1/2 yep", "2/2 yep", "3/2 yep", "4/2 yep", "5/2 yep") + when(aMock.iterator) thenReturn recordsOver("1/1 yep", "2/1 yep", "4/1 yep") + when(bMock.iterator) thenReturn recordsOver("1/2 
yep", "3/2 yep", "4/2 yep") + + val (sync, counts) = syncFastq(refMock, aMock, bMock) + sync.length shouldBe 2 + sync(0) shouldBe (new FastqRecord("1/1 yep", "A", "", "H"), new FastqRecord("1/2 yep", "A", "", "H")) + sync(1) shouldBe (new FastqRecord("4/1 yep", "A", "", "H"), new FastqRecord("4/2 yep", "A", "", "H")) + counts.numDiscard1 shouldBe 1 + counts.numDiscard2 shouldBe 1 + counts.numKept shouldBe 2 + } + + @Test def testWriteSynced() = { + val aMock = mock[AsyncFastqWriter] + val bMock = mock[AsyncFastqWriter] + val sync = Stream( + (new FastqRecord("1", "A", "", "H"), new FastqRecord("1", "T", "", "E")), + (new FastqRecord("2", "A", "", "H"), new FastqRecord("2", "T", "", "E"))) + val counts = SyncCounts(4, 3, 2) + val obs = inOrd(aMock, bMock) + val stdout = new java.io.ByteArrayOutputStream + Console.withOut(stdout) { + writeSyncedFastq(sync, counts, aMock, bMock) + } + stdout.toString should ===(List( + "Filtered 4 reads from first read file.", + "Filtered 3 reads from second read file.", + "Synced read files contain 2 reads.\n" + ).mkString("\n")) + obs.verify(aMock).write(new FastqRecord("1", "A", "", "H")) + obs.verify(bMock).write(new FastqRecord("1", "T", "", "E")) + obs.verify(aMock).write(new FastqRecord("2", "A", "", "H")) + obs.verify(bMock).write(new FastqRecord("2", "T", "", "E")) + } + + @Test def testArgsMinimum() = { + val args = Array( + "-r", resourcePath("/paired01a.fq"), + "-i", resourcePath("/paired01a.fq"), + "-j", resourcePath("/paired01b.fq"), + "-o", "/tmp/mockout1.fq", + "-p", "/tmp/mockout2.fq") + val parsed = parseArgs(args) + parsed.refFastq shouldBe resourceFile("/paired01a.fq") + parsed.inputFastq1 shouldBe resourceFile("/paired01a.fq") + parsed.inputFastq2 shouldBe resourceFile("/paired01b.fq") + parsed.outputFastq1 shouldBe new File("/tmp/mockout1.fq") + parsed.outputFastq2 shouldBe new File("/tmp/mockout2.fq") + } +} \ No newline at end of file diff --git 
a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..9ac90deecad83fab23d95bfafaf89442899b798e --- /dev/null +++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala @@ -0,0 +1,77 @@ +package nl.lumc.sasc.biopet.tools + +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test +import scala.collection.mutable +import VcfStats._ + +/** + * Created by pjvan_thof on 2/5/15. + */ +class VcfStatsTest extends TestNGSuite with Matchers { + + @Test + def testSampleToSampleStats: Unit = { + val s1 = SampleToSampleStats() + val s2 = SampleToSampleStats() + s1.alleleOverlap shouldBe 0 + s1.genotypeOverlap shouldBe 0 + s2.alleleOverlap shouldBe 0 + s2.genotypeOverlap shouldBe 0 + + s1 += s2 + s1.alleleOverlap shouldBe 0 + s1.genotypeOverlap shouldBe 0 + s2.alleleOverlap shouldBe 0 + s2.genotypeOverlap shouldBe 0 + + s2.alleleOverlap = 2 + s2.genotypeOverlap = 3 + + s1 += s2 + s1.alleleOverlap shouldBe 2 + s1.genotypeOverlap shouldBe 3 + s2.alleleOverlap shouldBe 2 + s2.genotypeOverlap shouldBe 3 + + s1 += s2 + s1.alleleOverlap shouldBe 4 + s1.genotypeOverlap shouldBe 6 + s2.alleleOverlap shouldBe 2 + s2.genotypeOverlap shouldBe 3 + } + + @Test + def testSampleStats: Unit = { + val s1 = SampleStats() + val s2 = SampleStats() + + s1.sampleToSample += "s1" -> SampleToSampleStats() + s1.sampleToSample += "s2" -> SampleToSampleStats() + s2.sampleToSample += "s1" -> SampleToSampleStats() + s2.sampleToSample += "s2" -> SampleToSampleStats() + + s1.sampleToSample("s1").alleleOverlap = 1 + s2.sampleToSample("s2").alleleOverlap = 2 + + s1.genotypeStats += "1" -> mutable.Map(1 -> 1) + s2.genotypeStats += "2" -> mutable.Map(2 -> 2) + + val ss1 = SampleToSampleStats() + val ss2 = SampleToSampleStats() + + s1 += s2 + 
s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 1), "2" -> mutable.Map(2 -> 2)) + ss1.alleleOverlap = 1 + ss2.alleleOverlap = 2 + s1.sampleToSample shouldBe mutable.Map("s1" -> ss1, "s2" -> ss2) + + s1 += s2 + s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 1), "2" -> mutable.Map(2 -> 4)) + + s1 += s1 + s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 2), "2" -> mutable.Map(2 -> 8)) + } + +} diff --git a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/ConfigUtilsTest.scala b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/ConfigUtilsTest.scala index 739850b28c24e8b2bf26a1848baa0f0ab06d172e..26f989c5369c4e138ded82620d1d03f0c076ce7c 100644 --- a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/ConfigUtilsTest.scala +++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/ConfigUtilsTest.scala @@ -169,17 +169,20 @@ class ConfigUtilsTest extends TestNGSuite with Matchers { @Test def testImplicits: Unit = { val index = ConfigValueIndex("test", Nil, "test") new ImplicitConversions { - var map: Map[String, Any] = ConfigValue(index, index, Map()) - map = ConfigValue(index, index, null) - configValue2list(ConfigValue(index, index, List(""))) shouldBe List("") - configValue2list(ConfigValue(index, index, null)) shouldBe null + intercept[IllegalStateException] { + configValue2list(ConfigValue(index, index, null)) + } configValue2stringList(ConfigValue(index, index, List(""))) shouldBe List("") - configValue2stringList(ConfigValue(index, index, null)) shouldBe null + intercept[IllegalStateException] { + configValue2stringList(ConfigValue(index, index, null)) + } configValue2stringSet(ConfigValue(index, index, List(""))) shouldBe Set("") - configValue2stringSet(ConfigValue(index, index, null)) shouldBe null + intercept[IllegalStateException] { + configValue2stringSet(ConfigValue(index, index, null)) + } var int: Int = ConfigValue(index, index, 1) 
intercept[IllegalStateException] { @@ -232,10 +235,14 @@ class ConfigUtilsTest extends TestNGSuite with Matchers { } var string: String = ConfigValue(index, index, "test") - string = ConfigValue(index, index, null) + intercept[IllegalStateException] { + string = ConfigValue(index, index, null) + } var file: File = ConfigValue(index, index, "test") - file = ConfigValue(index, index, null) + intercept[IllegalStateException] { + file = ConfigValue(index, index, null) + } } } } diff --git a/public/biopet-public-package/pom.xml b/public/biopet-public-package/pom.xml index a4670bdd446366697f14a8cd92b67fc61c8b4a43..12e31dad57fe487d238cea749213f78449cf79fb 100644 --- a/public/biopet-public-package/pom.xml +++ b/public/biopet-public-package/pom.xml @@ -75,6 +75,16 @@ <artifactId>Yamsvp</artifactId> <version>${project.version}</version> </dependency> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Kopisu</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Carp</artifactId> + <version>${project.version}</version> + </dependency> <dependency> <groupId>nl.lumc.sasc</groupId> <artifactId>Toucan</artifactId> diff --git a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala index 9dcc6123b390d9886be7be21873dd51203a485d6..b0ad47dcc228580fdfc7dac59bb5fd180ffb0fef 100644 --- a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala +++ b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala @@ -23,16 +23,19 @@ object BiopetExecutablePublic extends BiopetExecutable { nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics, nl.lumc.sasc.biopet.pipelines.yamsvp.Yamsvp, nl.lumc.sasc.biopet.pipelines.sage.Sage, - nl.lumc.sasc.biopet.pipelines.toucan.Toucan + 
nl.lumc.sasc.biopet.pipelines.kopisu.ConiferPipeline, + nl.lumc.sasc.biopet.pipelines.carp.Carp ) def tools: List[MainCommand] = List( nl.lumc.sasc.biopet.tools.WipeReads, nl.lumc.sasc.biopet.tools.ExtractAlignedFastq, + nl.lumc.sasc.biopet.tools.FastqSync, nl.lumc.sasc.biopet.tools.BiopetFlagstat, nl.lumc.sasc.biopet.tools.CheckAllelesVcfInBam, nl.lumc.sasc.biopet.tools.VcfToTsv, nl.lumc.sasc.biopet.tools.VcfFilter, + nl.lumc.sasc.biopet.tools.VcfStats, nl.lumc.sasc.biopet.tools.FindRepeatsPacBio, nl.lumc.sasc.biopet.tools.BedToInterval, nl.lumc.sasc.biopet.tools.MpileupToVcf, diff --git a/public/carp/.gitignore b/public/carp/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a6f89c2da7a029afa02b6e7a2bf80ad34958a311 --- /dev/null +++ b/public/carp/.gitignore @@ -0,0 +1 @@ +/target/ \ No newline at end of file diff --git a/public/carp/pom.xml b/public/carp/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..482b83147312cf332cc7325c6a9196c092dcd91a --- /dev/null +++ b/public/carp/pom.xml @@ -0,0 +1,48 @@ +<!-- + + Biopet is built on top of GATK Queue for building bioinformatic + pipelines. It is mainly intended to support LUMC SHARK cluster which is running + SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + should also be able to execute Biopet tools and pipelines. + + Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + + Contact us at: sasc@lumc.nl + + A dual licensing mode is applied. The source code within this project that are + not part of GATK Queue is freely available for non-commercial use under an AGPL + license; For commercial users or users who do not want to follow the AGPL + license, please contact us to obtain a separate license. 
+ +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>nl.lumc.sasc</groupId> + <artifactId>Carp</artifactId> + <packaging>jar</packaging> + + <parent> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Biopet</artifactId> + <version>0.3.0-DEV</version> + <relativePath>../</relativePath> + </parent> + + <inceptionYear>2014</inceptionYear> + <name>Carp</name> + + <dependencies> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>BiopetFramework</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Mapping</artifactId> + <version>${project.version}</version> + </dependency> + </dependencies> +</project> diff --git a/public/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala b/public/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala new file mode 100644 index 0000000000000000000000000000000000000000..578f3afc79f6cac8d2621399bb577222383ee2bb --- /dev/null +++ b/public/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala @@ -0,0 +1,124 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. 
The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.pipelines.carp + +import java.io.File + +import nl.lumc.sasc.biopet.extensions.Ln +import nl.lumc.sasc.biopet.extensions.macs2.Macs2CallPeak +import nl.lumc.sasc.biopet.extensions.picard.MergeSamFiles +import nl.lumc.sasc.biopet.utils.ConfigUtils +import org.broadinstitute.gatk.queue.QScript +import org.broadinstitute.gatk.utils.commandline.{ Argument, Input } +import org.broadinstitute.gatk.utils.commandline.{ Input, Argument } +import nl.lumc.sasc.biopet.core._ +import nl.lumc.sasc.biopet.core.config._ +import nl.lumc.sasc.biopet.pipelines.mapping.Mapping + +/** + * Carp pipeline + * Chip-Seq analysis pipeline + * This pipeline performs QC,mapping and peak calling + */ +class Carp(val root: Configurable) extends QScript with MultiSampleQScript { + qscript => + def this() = this(null) + + override def defaults = ConfigUtils.mergeMaps(Map( + "mapping" -> Map("skip_markduplicates" -> true) + ), super.defaults) + + def makeSample(id: String) = new Sample(id) + class Sample(sampleId: String) extends AbstractSample(sampleId) { + def makeLibrary(id: String) = new Library(id) + class Library(libId: String) extends AbstractLibrary(libId) { + val mapping = new Mapping(qscript) + + def addJobs(): Unit = { + if (config.contains("R1")) { + mapping.input_R1 = config("R1") + if (config.contains("R2")) mapping.input_R2 = config("R2") + mapping.libId = libId + mapping.sampleId = sampleId + mapping.outputDir = libDir + + mapping.init + mapping.biopetScript + addAll(mapping.functions) + + } else logger.error("Sample: " + sampleId + ": No R1 found for library: " + libId) + } + } + + val bamFile = createFile(".bam") + val controls: List[String] = config("control", default = Nil) + + def 
addJobs(): Unit = { + addPerLibJobs() + val bamFiles = libraries.map(_._2.mapping.finalBamFile).toList + if (bamFiles.length == 1) { + add(Ln(qscript, bamFiles.head, bamFile)) + val oldIndex = new File(bamFiles.head.getAbsolutePath.stripSuffix(".bam") + ".bai") + val newIndex = new File(bamFile.getAbsolutePath.stripSuffix(".bam") + ".bai") + add(Ln(qscript, oldIndex, newIndex)) + } else if (bamFiles.length > 1) { + val merge = new MergeSamFiles(qscript) + merge.input = bamFiles + merge.sortOrder = "coordinate" + merge.output = bamFile + add(merge) + + //TODO: Add BigWIg track + } + + val macs2 = new Macs2CallPeak(qscript) + macs2.treatment = bamFile + macs2.name = Some(sampleId) + macs2.outputdir = sampleDir + "macs2/" + sampleId + "/" + add(macs2) + } + } + + def init() { + } + + def biopetScript() { + // Define what the pipeline should do + // First step is QC, this will be done with Flexiprep + // Second step is mapping, this will be done with the Mapping pipeline + // Third step is calling peaks on the bam files produced with the mapping pipeline, this will be done with MACS2 + logger.info("Starting CArP pipeline") + + addSamplesJobs() + } + + def addMultiSampleJobs(): Unit = { + for ((sampleId, sample) <- samples) { + for (controlId <- sample.controls) { + if (!samples.contains(controlId)) + throw new IllegalStateException("For sample: " + sampleId + " this control: " + controlId + " does not exist") + val macs2 = new Macs2CallPeak(this) + macs2.treatment = sample.bamFile + macs2.control = samples(controlId).bamFile + macs2.name = Some(sampleId + "_VS_" + controlId) + macs2.outputdir = sample.sampleDir + "/" + "macs2/" + macs2.name.get + "/" + add(macs2) + } + } + } +} + +object Carp extends PipelineCommand diff --git a/public/flexiprep/pom.xml b/public/flexiprep/pom.xml index e9b58ab28a615ac8ce4e76063285b125f2b66b1b..86666db29645cd35f9d8f71c6b1aa775d0f22444 100644 --- a/public/flexiprep/pom.xml +++ b/public/flexiprep/pom.xml @@ -39,5 +39,17 @@ 
<artifactId>BiopetFramework</artifactId> <version>${project.version}</version> </dependency> + <dependency> + <groupId>org.testng</groupId> + <artifactId>testng</artifactId> + <version>6.8</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.scalatest</groupId> + <artifactId>scalatest_2.11</artifactId> + <version>2.2.1</version> + <scope>test</scope> + </dependency> </dependencies> </project> diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala index 1bd84bb36e21c8e577adaf5e9ad33d02b1db47fa..9aaca5f66336e38b16b215a9c175781fafc97fe0 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala @@ -33,14 +33,14 @@ class Cutadapt(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Cutada override def beforeCmd() { super.beforeCmd - val foundAdapters = fastqc.getFoundAdapters.map(_.seq) + val foundAdapters = fastqc.foundAdapters.map(_.seq) if (default_clip_mode == "3") opt_adapter ++= foundAdapters else if (default_clip_mode == "5") opt_front ++= foundAdapters else if (default_clip_mode == "both") opt_anywhere ++= foundAdapters } override def cmdLine = { - if (!opt_adapter.isEmpty || !opt_anywhere.isEmpty || !opt_front.isEmpty) { + if (opt_adapter.nonEmpty || opt_anywhere.nonEmpty || opt_front.nonEmpty) { analysisName = getClass.getSimpleName super.cmdLine } else { diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala index 4ee01c2605d5449dac33b19ac9c2ab360b383d45..dc5972ab488caf26cedc51f8362071d6e36cbd90 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala +++ 
b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala @@ -16,82 +16,154 @@ package nl.lumc.sasc.biopet.pipelines.flexiprep -import java.io.File -import nl.lumc.sasc.biopet.core.config.Configurable +import java.io.{ File, FileNotFoundException } + import scala.io.Source import argonaut._, Argonaut._ import scalaz._, Scalaz._ +import nl.lumc.sasc.biopet.core.config.Configurable +import nl.lumc.sasc.biopet.utils.ConfigUtils + +/** + * FastQC wrapper with added functionality for the Flexiprep pipeline + * + * This wrapper implements additional methods for parsing FastQC output files and aggregating everything in a summary + * object. The current implementation is based on FastQC v0.10.1. + */ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(root) { - def getDataBlock(name: String): Array[String] = { // Based on Fastqc v0.10.1 - val outputDir = output.getAbsolutePath.stripSuffix(".zip") - val dataFile = new File(outputDir + "/fastqc_data.txt") - if (!dataFile.exists) return null - val data = Source.fromFile(dataFile).mkString - for (block <- data.split(">>END_MODULE\n")) { - val b = if (block.startsWith("##FastQC")) block.substring(block.indexOf("\n") + 1) else block - if (b.startsWith(">>" + name)) - return for (line <- b.split("\n")) - yield line - } - return null - } - def getEncoding: String = { - val block = getDataBlock("Basic Statistics") - if (block == null) return null - for ( - line <- block if (line.startsWith("Encoding")) - ) return line.stripPrefix("Encoding\t") - return null // Could be default Sanger with a warning in the log + /** Class for storing a single FastQC module result */ + protected case class FastQCModule(name: String, status: String, lines: Seq[String]) + + /** Default FastQC output directory containing actual results */ + // this is a def instead of a val since the value depends on the variable `output`, which is null on class creation + def outputDir: File = new 
File(output.getAbsolutePath.stripSuffix(".zip")) + + /** Default FastQC output data file */ + // this is a def instead of a val since the value depends on the variable `output`, which is null on class creation + def dataFile: File = new File(outputDir, "fastqc_data.txt") + + /** + * FastQC QC modules. + * + * @return Mapping of FastQC module names and its contents as array of strings (one item per line) + * @throws FileNotFoundException if the FastQC data file can not be found. + * @throws IllegalStateException if the module lines have no content or mapping is empty. + */ + @throws(classOf[FileNotFoundException]) + @throws(classOf[IllegalStateException]) + def qcModules: Map[String, FastQCModule] = { + + val fqModules = Source.fromFile(dataFile) + // drop all the characters before the first module delimiter (i.e. '>>') + .dropWhile(_ != '>') + // pull everything into a string + .mkString + // split into modules + .split(">>END_MODULE\n") + // make map of module name -> module lines + .map { + case (modString) => + // module name is in the first line, without '>>' and before the tab character + val Array(firstLine, otherLines) = modString + // drop all '>>' character (start of module) + .dropWhile(_ == '>') + // split first line and others + .split("\n", 2) + // and slice them + .slice(0, 2) + // extract module name and module status + val Array(modName, modStatus) = firstLine + .split("\t", 2) + .slice(0, 2) + modName -> FastQCModule(modName, modStatus, otherLines.split("\n").toSeq) + } + .toMap + + if (fqModules.isEmpty) throw new IllegalStateException("Empty FastQC data file " + dataFile.toString) + else fqModules } - protected case class Sequence(name: String, seq: String) - def getFoundAdapters: List[Sequence] = { - def getSeqs(file: File) = { - if (file != null) { - (for ( - line <- Source.fromFile(file).getLines(); if line.startsWith("#"); - values = line.split("\t*") if values.size >= 2 - ) yield Sequence(values(0), values(1))).toList - } else Nil - } + /** 
+ * Retrieves the FASTQ file encoding as computed by FastQC. + * + * @return encoding name + * @throws NoSuchElementException when the "Basic Statistics" key does not exist in the mapping or + * when a line starting with "Encoding" does not exist. + */ + @throws(classOf[NoSuchElementException]) + def encoding: String = + qcModules("Basic Statistics") + .lines + .dropWhile(!_.startsWith("Encoding")) + .head + .stripPrefix("Encoding\t") + .stripSuffix("\t") + + /** Case class representing a known adapter sequence */ + protected case class AdapterSequence(name: String, seq: String) - val seqs = getSeqs(adapters) ::: getSeqs(contaminants) + /** + * Retrieves overrepresented sequences found by FastQC. + * + * @return a [[Set]] of [[AdapterSequence]] objects. + */ + def foundAdapters: Set[AdapterSequence] = { - val block = getDataBlock("Overrepresented sequences") - if (block == null) return Nil + /** Returns the set of adapter and/or contaminant sequences known to FastQC */ + def getFastqcSeqs(file: Option[File]): Set[AdapterSequence] = file match { + case None => Set.empty[AdapterSequence] + case Some(f) => + (for { + line <- Source.fromFile(f).getLines() + if !line.startsWith("#") + values = line.split("\t+") + if values.size >= 2 + } yield AdapterSequence(values(0), values(1))).toSet + } - val found = for ( - line <- block if !line.startsWith("#"); - values = line.split("\t") if values.size >= 4 - ) yield values(3) + val found = qcModules.get("Overrepresented sequences") match { + case None => Seq.empty[String] + case Some(qcModule) => + for ( + line <- qcModule.lines if !(line.startsWith("#") || line.startsWith(">")); + values = line.split("\t") if values.size >= 4 + ) yield values(3) + } - seqs.filter(x => found.exists(_.startsWith(x.name))) + // select full sequences from known adapters and contaminants + // based on overrepresented sequences results + (getFastqcSeqs(adapters) ++ getFastqcSeqs(contaminants)) + .filter(x => found.exists(_.startsWith(x.name))) } - def 
getSummary: Json = { - val subfixs = Map("plot_duplication_levels" -> "Images/duplication_levels.png", - "plot_kmer_profiles" -> "Images/kmer_profiles.png", - "plot_per_base_gc_content" -> "Images/per_base_gc_content.png", - "plot_per_base_n_content" -> "Images/per_base_n_content.png", - "plot_per_base_quality" -> "Images/per_base_quality.png", - "plot_per_base_sequence_content" -> "Images/per_base_sequence_content.png", - "plot_per_sequence_gc_content" -> "Images/per_sequence_gc_content.png", - "plot_per_sequence_quality" -> "Images/per_sequence_quality.png", - "plot_sequence_length_distribution" -> "Images/sequence_length_distribution.png", - "fastqc_data" -> "fastqc_data.txt") - val dir = output.getAbsolutePath.stripSuffix(".zip") + "/" - var outputMap: Map[String, Map[String, String]] = Map() - for ((k, v) <- subfixs) outputMap += (k -> Map("path" -> (dir + v))) - - val temp = ("" := outputMap) ->: jEmptyObject - return temp.fieldOrEmptyObject("") + /** Summary of the FastQC run, stored in a [[Json]] object */ + def summary: Json = { + + val outputMap = + Map("plot_duplication_levels" -> "Images/duplication_levels.png", + "plot_kmer_profiles" -> "Images/kmer_profiles.png", + "plot_per_base_gc_content" -> "Images/per_base_gc_content.png", + "plot_per_base_n_content" -> "Images/per_base_n_content.png", + "plot_per_base_quality" -> "Images/per_base_quality.png", + "plot_per_base_sequence_content" -> "Images/per_base_sequence_content.png", + "plot_per_sequence_gc_content" -> "Images/per_sequence_gc_content.png", + "plot_per_sequence_quality" -> "Images/per_sequence_quality.png", + "plot_sequence_length_distribution" -> "Images/sequence_length_distribution.png", + "fastqc_data" -> "fastqc_data.txt") + .map { + case (name, relPath) => + name -> Map("path" -> (outputDir + "/" + relPath)) + } + + ConfigUtils.mapToJson(outputMap) } } object Fastqc { + def apply(root: Configurable, fastqfile: File, outDir: String): Fastqc = { val fastqcCommand = new Fastqc(root) 
fastqcCommand.fastqfile = fastqfile @@ -102,6 +174,6 @@ object Fastqc { //if (filename.endsWith(".fq")) filename = filename.substring(0,filename.size - 3) fastqcCommand.output = new File(outDir + "/" + filename + "_fastqc.zip") fastqcCommand.afterGraph - return fastqcCommand + fastqcCommand } } diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala index 75de7488ac27f35784a56a321b3741df2397e53c..9ab16032bd095d40728e91f92a611366b400af2f 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala @@ -21,7 +21,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Argument } import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand } import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat, Seqstat } -import nl.lumc.sasc.biopet.scripts.{ FastqSync } +import nl.lumc.sasc.biopet.tools.FastqSync class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { def this() = this(null) @@ -30,21 +30,25 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { var input_R1: File = _ @Input(doc = "R2 fastq file (gzipped allowed)", shortName = "R2", required = false) - var input_R2: File = _ + var input_R2: Option[File] = None - @Argument(doc = "Skip Trim fastq files", shortName = "skiptrim", required = false) - var skipTrim: Boolean = config("skiptrim", default = false) + /** Skip Trim fastq files */ + var skipTrim: Boolean = config("skip_trim", default = false) - @Argument(doc = "Skip Clip fastq files", shortName = "skipclip", required = false) - var skipClip: Boolean = config("skipclip", default = false) + /** Skip Clip fastq files */ + var skipClip: Boolean = config("skip_clip", default = false) - 
@Argument(doc = "Sample name", shortName = "sample", required = true) - var sampleName: String = _ + // TODO: hide sampleId and libId from the command line so they do not interfere with our config values - @Argument(doc = "Library name", shortName = "library", required = true) - var libraryName: String = _ + /** Sample name */ + @Argument(doc = "Sample ID", shortName = "sample", required = true) + var sampleId: String = _ - var paired: Boolean = (input_R2 != null) + /** Library name */ + @Argument(doc = "Library ID", shortName = "library", required = true) + var libId: String = _ + + var paired: Boolean = input_R2.isDefined var R1_ext: String = _ var R2_ext: String = _ var R1_name: String = _ @@ -58,12 +62,12 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { val summary = new FlexiprepSummary(this) def init() { - if (input_R1 == null) throw new IllegalStateException("Missing R1 on flexiprep module") - if (outputDir == null) throw new IllegalStateException("Missing Output directory on flexiprep module") - if (sampleName == null) throw new IllegalStateException("Missing Sample name on flexiprep module") - if (libraryName == null) throw new IllegalStateException("Missing Library name on flexiprep module") - else if (!outputDir.endsWith("/")) outputDir += "/" - paired = (input_R2 != null) + require(outputDir != null, "Missing output directory on flexiprep module") + require(input_R1 != null, "Missing input R1 on flexiprep module") + require(sampleId != null, "Missing sample ID on flexiprep module") + require(libId != null, "Missing library ID on flexiprep module") + + paired = input_R2.isDefined if (input_R1.endsWith(".gz")) R1_name = input_R1.getName.substring(0, input_R1.getName.lastIndexOf(".gz")) else if (input_R1.endsWith(".gzip")) R1_name = input_R1.getName.substring(0, input_R1.getName.lastIndexOf(".gzip")) @@ -71,15 +75,19 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { R1_ext = 
R1_name.substring(R1_name.lastIndexOf("."), R1_name.size) R1_name = R1_name.substring(0, R1_name.lastIndexOf(R1_ext)) - if (paired) { - if (input_R2.endsWith(".gz")) R2_name = input_R2.getName.substring(0, input_R2.getName.lastIndexOf(".gz")) - else if (input_R2.endsWith(".gzip")) R2_name = input_R2.getName.substring(0, input_R2.getName.lastIndexOf(".gzip")) - else R2_name = input_R2.getName - R2_ext = R2_name.substring(R2_name.lastIndexOf("."), R2_name.size) - R2_name = R2_name.substring(0, R2_name.lastIndexOf(R2_ext)) + input_R2 match { + case Some(fileR2) => { + paired = true + if (fileR2.endsWith(".gz")) R2_name = fileR2.getName.substring(0, fileR2.getName.lastIndexOf(".gz")) + else if (fileR2.endsWith(".gzip")) R2_name = fileR2.getName.substring(0, fileR2.getName.lastIndexOf(".gzip")) + else R2_name = fileR2.getName + R2_ext = R2_name.substring(R2_name.lastIndexOf("."), R2_name.size) + R2_name = R2_name.substring(0, R2_name.lastIndexOf(R2_ext)) + } + case _ => } - summary.out = outputDir + sampleName + "-" + libraryName + ".qc.summary.json" + summary.out = outputDir + sampleId + "-" + libId + ".qc.summary.json" } def biopetScript() { @@ -95,7 +103,7 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { def runInitialJobs() { outputFiles += ("fastq_input_R1" -> extractIfNeeded(input_R1, outputDir)) - if (paired) outputFiles += ("fastq_input_R2" -> extractIfNeeded(input_R2, outputDir)) + if (paired) outputFiles += ("fastq_input_R2" -> extractIfNeeded(input_R2.get, outputDir)) fastqc_R1 = Fastqc(this, input_R1, outputDir + "/" + R1_name + ".fastqc/") add(fastqc_R1) @@ -107,12 +115,12 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { summary.addMd5sum(md5sum_R1, R2 = false, after = false) if (paired) { - fastqc_R2 = Fastqc(this, input_R2, outputDir + "/" + R2_name + ".fastqc/") + fastqc_R2 = Fastqc(this, input_R2.get, outputDir + "/" + R2_name + ".fastqc/") add(fastqc_R2) summary.addFastqc(fastqc_R2, R2 = 
true) outputFiles += ("fastqc_R2" -> fastqc_R2.output) - val md5sum_R2 = Md5sum(this, input_R2, outputDir) + val md5sum_R2 = Md5sum(this, input_R2.get, outputDir) add(md5sum_R2) summary.addMd5sum(md5sum_R2, R2 = true, after = false) } @@ -132,28 +140,32 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { var results: Map[String, File] = Map() var R1: File = new File(R1_in) - var R2: File = new File(R2_in) + var R2: File = if (paired) new File(R2_in) else null var deps: List[File] = if (paired) List(R1, R2) else List(R1) val seqtkSeq_R1 = SeqtkSeq(this, R1, swapExt(outDir, R1, R1_ext, ".sanger" + R1_ext), fastqc_R1) - add(seqtkSeq_R1, isIntermediate = true) + seqtkSeq_R1.isIntermediate = true + add(seqtkSeq_R1) R1 = seqtkSeq_R1.output deps ::= R1 if (paired) { val seqtkSeq_R2 = SeqtkSeq(this, R2, swapExt(outDir, R2, R2_ext, ".sanger" + R2_ext), fastqc_R2) - add(seqtkSeq_R2, isIntermediate = true) + seqtkSeq_R2.isIntermediate = true + add(seqtkSeq_R2) R2 = seqtkSeq_R2.output deps ::= R2 } val seqstat_R1 = Seqstat(this, R1, outDir) - add(seqstat_R1, isIntermediate = true) + seqstat_R1.isIntermediate = true + add(seqstat_R1) summary.addSeqstat(seqstat_R1, R2 = false, after = false, chunk) if (paired) { val seqstat_R2 = Seqstat(this, R2, outDir) - add(seqstat_R2, isIntermediate = true) + seqstat_R2.isIntermediate = true + add(seqstat_R2) summary.addSeqstat(seqstat_R2, R2 = true, after = false, chunk) } @@ -161,7 +173,8 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { val cutadapt_R1 = Cutadapt(this, R1, swapExt(outDir, R1, R1_ext, ".clip" + R1_ext)) cutadapt_R1.fastqc = fastqc_R1 - add(cutadapt_R1, isIntermediate = true) + cutadapt_R1.isIntermediate = true + add(cutadapt_R1) summary.addCutadapt(cutadapt_R1, R2 = false, chunk) R1 = cutadapt_R1.fastq_output deps ::= R1 @@ -171,19 +184,26 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { val cutadapt_R2 = Cutadapt(this, R2, 
swapExt(outDir, R2, R2_ext, ".clip" + R2_ext)) outputFiles += ("cutadapt_R2_stats" -> cutadapt_R2.stats_output) cutadapt_R2.fastqc = fastqc_R2 - add(cutadapt_R2, isIntermediate = true) + cutadapt_R2.isIntermediate = true + add(cutadapt_R2) summary.addCutadapt(cutadapt_R2, R2 = true, chunk) R2 = cutadapt_R2.fastq_output deps ::= R2 - val fastqSync = FastqSync(this, cutadapt_R1.fastq_input, cutadapt_R1.fastq_output, cutadapt_R2.fastq_output, - swapExt(outDir, R1, R1_ext, ".sync" + R1_ext), swapExt(outDir, R2, R2_ext, ".sync" + R2_ext), swapExt(outDir, R1, R1_ext, ".sync.stats")) - fastqSync.deps :::= deps - add(fastqSync, isIntermediate = true) - summary.addFastqcSync(fastqSync, chunk) - outputFiles += ("syncStats" -> fastqSync.output_stats) - R1 = fastqSync.output_R1 - R2 = fastqSync.output_R2 + val fqSync = new FastqSync(this) + fqSync.refFastq = cutadapt_R1.fastq_input + fqSync.inputFastq1 = cutadapt_R1.fastq_output + fqSync.inputFastq2 = cutadapt_R2.fastq_output + fqSync.outputFastq1 = swapExt(outDir, R1, R1_ext, ".sync" + R1_ext) + fqSync.outputFastq2 = swapExt(outDir, R2, R2_ext, ".sync" + R2_ext) + fqSync.outputStats = swapExt(outDir, R1, R1_ext, ".sync.stats") + fqSync.deps :::= deps + add(fqSync) + + summary.addFastqcSync(fqSync, chunk) + outputFiles += ("syncStats" -> fqSync.outputStats) + R1 = fqSync.outputFastq1 + R2 = fqSync.outputFastq2 deps :::= R1 :: R2 :: Nil } } @@ -199,7 +219,8 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { } sickle.output_stats = swapExt(outDir, R1, R1_ext, ".trim.stats") sickle.deps = deps - add(sickle, isIntermediate = true) + sickle.isIntermediate = true + add(sickle) summary.addSickle(sickle, chunk) R1 = sickle.output_R1 if (paired) R2 = sickle.output_R2 @@ -256,7 +277,8 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { } def extractIfNeeded(file: File, runDir: String): File = { - if (file.getName().endsWith(".gz") || file.getName().endsWith(".gzip")) { + if (file 
== null) return file + else if (file.getName().endsWith(".gz") || file.getName().endsWith(".gzip")) { var newFile: File = swapExt(runDir, file, ".gz", "") if (file.getName().endsWith(".gzip")) newFile = swapExt(runDir, file, ".gzip", "") val zcatCommand = Zcat(this, file, newFile) @@ -264,7 +286,7 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { add(zcatCommand) return newFile } else if (file.getName().endsWith(".bz2")) { - var newFile = swapExt(runDir, file, ".bz2", "") + val newFile = swapExt(runDir, file, ".bz2", "") val pbzip2 = Pbzip2(this, file, newFile) pbzip2.isIntermediate = true add(pbzip2) diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala index aa5168c1ab5f5a93775e65cdda22bedbc58bf104..4ff18fb7cc90d7c3a90255c3c9a49ffb0a191eda 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala @@ -18,7 +18,7 @@ package nl.lumc.sasc.biopet.pipelines.flexiprep import java.io.PrintWriter import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.extensions.{ Md5sum, Seqstat } -import nl.lumc.sasc.biopet.scripts.{ FastqSync } +import nl.lumc.sasc.biopet.tools.FastqSync import org.broadinstitute.gatk.queue.function.InProcessFunction import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import java.io.File @@ -112,16 +112,16 @@ class FlexiprepSummary(val root: Configurable) extends InProcessFunction with Co def addFastqcSync(fastqSync: FastqSync, chunk: String = ""): FastqSync = { if (!chunks.contains(chunk)) chunks += (chunk -> new Chunk) chunks(chunk).fastqSync = fastqSync - deps ::= fastqSync.output_stats - return fastqSync + deps ::= fastqSync.outputStats + fastqSync } // format: OFF override def run { 
logger.debug("Start") md5Summary() val summary = - ("samples" := ( flexiprep.sampleName := - ("libraries" := ( flexiprep.libraryName := ( + ("samples" := ( flexiprep.sampleId := + ("libraries" := ( flexiprep.libId := ( ("flexiprep" := ( ("clipping" := !flexiprep.skipClip) ->: ("trimming" := !flexiprep.skipTrim) ->: @@ -201,7 +201,7 @@ class FlexiprepSummary(val root: Configurable) extends InProcessFunction with Co def fastqcSummary(fastqc: Fastqc): Option[Json] = { if (fastqc == null) return None - else return Option(fastqc.getSummary) + else return Option(fastqc.summary) } def clipstatSummary(): Option[Json] = { @@ -223,11 +223,13 @@ class FlexiprepSummary(val root: Configurable) extends InProcessFunction with Co jEmptyObject) } - def syncstatSummary(): Option[Json] = { - if (flexiprep.skipClip || !flexiprep.paired) return None - val s = for ((key, value) <- chunks) yield value.fastqSync.getSummary - return Option(FastqSync.mergeSummaries(s.toList)) - } + def syncstatSummary(): Option[Json] = + if (flexiprep.skipClip || !flexiprep.paired) + None + else { + val s = for ((key, value) <- chunks) yield value.fastqSync.summary + Option(FastqSync.mergeSummaries(s.toList)) + } def trimstatSummary(): Option[Json] = { if (flexiprep.skipTrim) return None diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/SeqtkSeq.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/SeqtkSeq.scala index f6d6ac9d7c2727445723bd26c61d77398d081e95..0fdeee289de9672d917264e3e9dc2d556f6bc48b 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/SeqtkSeq.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/SeqtkSeq.scala @@ -25,7 +25,7 @@ class SeqtkSeq(root: Configurable) extends nl.lumc.sasc.biopet.extensions.seqtk. 
override def beforeCmd { super.beforeCmd if (fastqc != null && Q == None) { - val encoding = fastqc.getEncoding + val encoding = fastqc.encoding Q = encoding match { case null => None case s if (s.contains("Sanger / Illumina 1.9")) => None diff --git a/public/flexiprep/src/test/resources/fqc_contaminants_v0101.txt b/public/flexiprep/src/test/resources/fqc_contaminants_v0101.txt new file mode 100644 index 0000000000000000000000000000000000000000..13c6a999940201a402c3a7f9dd931ab9102de360 --- /dev/null +++ b/public/flexiprep/src/test/resources/fqc_contaminants_v0101.txt @@ -0,0 +1,170 @@ +# This file contains a list of potential contaminants which are +# frequently found in high throughput sequencing reactions. These +# are mostly sequences of adapters / primers used in the various +# sequencing chemistries. +# +# Please DO NOT rely on these sequences to design your own oligos, some +# of them are truncated at ambiguous positions, and none of them are +# definitive sequences from the manufacturers so don't blame us if you +# try to use them and they don't work. +# +# You can add more sequences to the file by putting one line per entry +# and specifying a name[tab]sequence. If the contaminant you add is +# likely to be of use to others please consider sending it to the FastQ +# authors, either via a bug report at www.bioinformatics.bbsrc.ac.uk/bugzilla/ +# or by directly emailing simon.andrews@bbsrc.ac.uk so other users of +# the program can benefit. 
+ +Illumina Single End Adapter 1 GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG +Illumina Single End Adapter 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +Illumina Single End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Single End PCR Primer 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +Illumina Single End Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT + +Illumina Paired End Adapter 1 ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Paired End Adapter 2 GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +Illumina Paried End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Paired End PCR Primer 2 CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +Illumina Paried End Sequencing Primer 1 ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Paired End Sequencing Primer 2 CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT + +Illumina DpnII expression Adapter 1 ACAGGTTCAGAGTTCTACAGTCCGAC +Illumina DpnII expression Adapter 2 CAAGCAGAAGACGGCATACGA +Illumina DpnII expression PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina DpnII expression PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina DpnII expression Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +Illumina NlaIII expression Adapter 1 ACAGGTTCAGAGTTCTACAGTCCGACATG +Illumina NlaIII expression Adapter 2 CAAGCAGAAGACGGCATACGA +Illumina NlaIII expression PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina NlaIII expression PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina NlaIII expression Sequencing Primer CCGACAGGTTCAGAGTTCTACAGTCCGACATG + +Illumina Small RNA Adapter 1 GTTCAGAGTTCTACAGTCCGACGATC +Illumina Small RNA Adapter 2 TGGAATTCTCGGGTGCCAAGG +Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA +Illumina Small RNA PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina Small RNA PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina Small RNA Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +Illumina Multiplexing Adapter 1 GATCGGAAGAGCACACGTCT +Illumina Multiplexing 
Adapter 2 ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Multiplexing PCR Primer 1.01 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Multiplexing PCR Primer 2.01 GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +Illumina Multiplexing Read1 Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Multiplexing Index Sequencing Primer GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +Illumina Multiplexing Read2 Sequencing Primer GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT + +Illumina PCR Primer Index 1 CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC +Illumina PCR Primer Index 2 CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC +Illumina PCR Primer Index 3 CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC +Illumina PCR Primer Index 4 CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC +Illumina PCR Primer Index 5 CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC +Illumina PCR Primer Index 6 CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC +Illumina PCR Primer Index 7 CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC +Illumina PCR Primer Index 8 CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC +Illumina PCR Primer Index 9 CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC +Illumina PCR Primer Index 10 CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC +Illumina PCR Primer Index 11 CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC +Illumina PCR Primer Index 12 CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC + +Illumina DpnII Gex Adapter 1 GATCGTCGGACTGTAGAACTCTGAAC +Illumina DpnII Gex Adapter 1.01 ACAGGTTCAGAGTTCTACAGTCCGAC +Illumina DpnII Gex Adapter 2 CAAGCAGAAGACGGCATACGA +Illumina DpnII Gex Adapter 2.01 TCGTATGCCGTCTTCTGCTTG +Illumina DpnII Gex PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina DpnII Gex PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina DpnII Gex Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +Illumina NlaIII Gex Adapter 1.01 TCGGACTGTAGAACTCTGAAC +Illumina NlaIII Gex Adapter 1.02 ACAGGTTCAGAGTTCTACAGTCCGACATG +Illumina NlaIII Gex Adapter 2.01 CAAGCAGAAGACGGCATACGA +Illumina NlaIII Gex Adapter 2.02 
TCGTATGCCGTCTTCTGCTTG +Illumina NlaIII Gex PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina NlaIII Gex PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina NlaIII Gex Sequencing Primer CCGACAGGTTCAGAGTTCTACAGTCCGACATG + +Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA +Illumina 5p RNA Adapter GTTCAGAGTTCTACAGTCCGACGATC +Illumina RNA Adapter1 TGGAATTCTCGGGTGCCAAGG + +Illumina Small RNA 3p Adapter 1 ATCTCGTATGCCGTCTTCTGCTTG +Illumina Small RNA PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina Small RNA PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina Small RNA Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +TruSeq Universal Adapter AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +TruSeq Adapter, Index 1 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 2 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 3 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 4 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 5 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 6 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 7 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 8 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 9 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 10 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 11 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 12 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG + +Illumina RNA RT Primer GCCTTGGCACCCGAGAATTCCA +Illumina RNA PCR Primer AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA + +RNA PCR Primer, Index 1 
CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 2 CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 3 CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 4 CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 5 CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 6 CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 7 CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 8 CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 9 CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 10 CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 11 CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 12 CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 13 CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 14 CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 15 CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 16 CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 17 CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 18 CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 19 CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 20 CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 21 CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 22 CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 23 
CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 24 CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 25 CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 26 CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 27 CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 28 CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 29 CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 30 CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 31 CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 32 CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 33 CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 34 CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 35 CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 36 CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 37 CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 38 CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 39 CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 40 CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 41 CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 42 CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 43 CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 44 CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 45 
CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 46 CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 47 CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 48 CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA + +ABI Dynabead EcoP Oligo CTGATCTAGAGGTACCGGATCCCAGCAGT +ABI Solid3 Adapter A CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG +ABI Solid3 Adapter B CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT +ABI Solid3 5' AMP Primer CCACTACGCCTCCGCTTTCCTCTCTATG +ABI Solid3 3' AMP Primer CTGCCCCGGGTTCCTCATTCT +ABI Solid3 EF1 alpha Sense Primer CATGTGTGTTGAGAGCTTC +ABI Solid3 EF1 alpha Antisense Primer GAAAACCAAAGTGGTCCAC +ABI Solid3 GAPDH Forward Primer TTAGCACCCCTGGCCAAGG +ABI Solid3 GAPDH Reverse Primer CTTACTCCTTGGAGGCCATG diff --git a/public/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt b/public/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d44bfae6fa962cd5d3e88084107b22efed3b025 --- /dev/null +++ b/public/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt @@ -0,0 +1,838 @@ +##FastQC 0.10.1 +>>Basic Statistics pass +#Measure Value +Filename ct_r1.fq +File type Conventional base calls +Encoding Sanger / Illumina 1.9 +Total Sequences 1000 +Filtered Sequences 0 +Sequence length 100 +%GC 53 +>>END_MODULE +>>Per base sequence quality fail +#Base Mean Median Lower Quartile Upper Quartile 10th Percentile 90th Percentile +1 32.244 33.0 31.0 34.0 30.0 34.0 +2 32.589 34.0 31.0 34.0 31.0 34.0 +3 32.814 34.0 31.0 34.0 31.0 34.0 +4 36.231 37.0 35.0 37.0 35.0 37.0 +5 35.907 37.0 35.0 37.0 35.0 37.0 +6 35.934 37.0 35.0 37.0 35.0 37.0 +7 35.783 37.0 35.0 37.0 35.0 37.0 +8 36.008 37.0 35.0 37.0 35.0 37.0 +9 37.706 39.0 37.0 39.0 35.0 39.0 +10-14 37.857600000000005 39.2 37.2 39.4 34.8 39.4 +15-19 38.9788 40.2 38.0 41.0 35.0 41.0 +20-24 38.8246 40.0 38.0 41.0 34.8 41.0 
+25-29 38.589600000000004 40.0 38.0 41.0 34.4 41.0 +30-34 38.3568 40.0 38.0 41.0 33.8 41.0 +35-39 38.1592 40.0 37.4 41.0 33.6 41.0 +40-44 37.4808 39.8 36.0 41.0 32.6 41.0 +45-49 36.9478 39.0 35.0 40.8 31.2 41.0 +50-54 35.845600000000005 37.8 34.6 40.0 29.4 41.0 +55-59 34.739 36.6 33.6 40.0 27.4 41.0 +60-64 34.1336 35.4 33.4 38.6 27.2 40.2 +65-69 32.7464 35.0 32.6 37.2 24.6 39.6 +70-74 29.3478 34.0 29.6 35.6 2.0 38.6 +75-79 27.4908 33.2 26.4 35.0 2.0 36.6 +80-84 25.893000000000008 33.0 21.8 35.0 2.0 35.4 +85-89 25.031799999999997 32.4 16.2 34.6 2.0 35.0 +90-94 23.9446 31.4 6.4 34.0 2.0 35.0 +95-99 22.9358 30.4 2.0 34.0 2.0 35.0 +100 21.984 30.0 2.0 34.0 2.0 35.0 +>>END_MODULE +>>Per sequence quality scores pass +#Quality Count +11 1.0 +12 4.0 +13 3.0 +14 1.0 +15 4.0 +16 4.0 +17 6.0 +18 7.0 +19 4.0 +20 2.0 +21 7.0 +22 9.0 +23 9.0 +24 17.0 +25 23.0 +26 30.0 +27 52.0 +28 39.0 +29 28.0 +30 23.0 +31 33.0 +32 43.0 +33 47.0 +34 74.0 +35 88.0 +36 148.0 +37 202.0 +38 89.0 +39 3.0 +>>END_MODULE +>>Per base sequence content fail +#Base G A T C +1 52.35707121364093 17.251755265797392 11.735205616850552 18.655967903711137 +2 34.300000000000004 11.1 24.8 29.799999999999997 +3 41.0 6.5 20.200000000000003 32.300000000000004 +4 37.5 8.7 26.0 27.800000000000004 +5 35.4 12.4 31.8 20.4 +6 57.3 11.1 1.6 30.0 +7 20.9 24.7 32.6 21.8 +8 20.0 27.200000000000003 30.0 22.8 +9 24.5 21.5 27.800000000000004 26.200000000000003 +10-14 25.22 23.28 26.26 25.240000000000002 +15-19 26.44 21.34 26.1 26.119999999999997 +20-24 25.240000000000002 22.1 24.6 28.060000000000002 +25-29 24.62 22.06 25.119999999999997 28.199999999999996 +30-34 26.240000000000002 21.44 24.279999999999998 28.04 +35-39 24.8 22.439999999999998 24.34 28.42 +40-44 25.8 22.84 23.9 27.46 +45-49 26.26 22.64 23.66 27.439999999999998 +50-54 26.72 22.58 23.18 27.52 +55-59 25.019999999999996 22.58 24.38 28.02 +60-64 26.251501802162597 22.00640768922707 23.28794553464157 28.454144973968766 +65-69 25.683829444891394 23.873692679002414 
23.049074818986323 27.39340305711987 +70-74 25.554134697357206 25.44757033248082 21.717817561807333 27.28047740835465 +75-79 25.818501428257523 23.643155350472423 23.071852340145025 27.466490881125026 +80-84 26.973532796317606 23.95857307249712 21.74913693901036 27.318757192174914 +85-89 25.452016689847014 24.849327770050998 22.624014835419565 27.07464070468243 +90-94 24.547101449275363 22.35054347826087 24.139492753623188 28.962862318840582 +95-99 25.318837549655026 24.231653773782146 23.186284758519758 27.263223918043067 +100 24.0 26.0 21.9 28.1 +>>END_MODULE +>>Per base GC content fail +#Base %GC +1 71.01303911735206 +2 64.1 +3 73.3 +4 65.3 +5 55.800000000000004 +6 87.3 +7 42.699999999999996 +8 42.8 +9 50.7 +10-14 50.46000000000001 +15-19 52.559999999999995 +20-24 53.300000000000004 +25-29 52.82 +30-34 54.279999999999994 +35-39 53.22 +40-44 53.26 +45-49 53.7 +50-54 54.24 +55-59 53.04 +60-64 54.70564677613135 +65-69 53.07723250201126 +70-74 52.834612105711855 +75-79 53.28499230938255 +80-84 54.29228998849251 +85-89 52.526657394529444 +90-94 53.509963768115945 +95-99 52.5820614676981 +100 52.1 +>>END_MODULE +>>Per sequence GC content fail +#GC Content Count +0 0.0 +1 0.0 +2 0.0 +3 0.0 +4 0.0 +5 0.0 +6 0.0 +7 0.0 +8 0.0 +9 0.0 +10 0.0 +11 0.0 +12 0.0 +13 0.0 +14 0.0 +15 0.0 +16 0.0 +17 0.0 +18 0.0 +19 0.0 +20 0.0 +21 0.0 +22 0.0 +23 0.5 +24 0.5 +25 0.5 +26 1.0 +27 1.5 +28 2.0 +29 3.5 +30 5.5 +31 6.0 +32 6.5 +33 6.0 +34 4.5 +35 6.0 +36 11.0 +37 17.0 +38 21.0 +39 16.5 +40 15.0 +41 24.0 +42 28.5 +43 33.0 +44 35.5 +45 32.5 +46 32.0 +47 32.0 +48 29.5 +49 30.5 +50 30.0 +51 29.5 +52 30.0 +53 27.5 +54 26.5 +55 27.0 +56 29.5 +57 34.0 +58 36.0 +59 36.0 +60 37.0 +61 31.5 +62 24.0 +63 22.5 +64 27.0 +65 28.5 +66 20.5 +67 15.0 +68 17.0 +69 13.5 +70 8.0 +71 7.0 +72 9.0 +73 8.0 +74 5.5 +75 4.5 +76 2.0 +77 2.0 +78 3.0 +79 2.0 +80 1.5 +81 1.0 +82 0.0 +83 0.5 +84 1.0 +85 0.5 +86 0.0 +87 0.0 +88 0.0 +89 0.0 +90 0.0 +91 0.0 +92 0.0 +93 0.0 +94 0.0 +95 0.0 +96 0.0 +97 0.0 +98 0.0 +99 
0.0 +100 0.0 +>>END_MODULE +>>Per base N content warn +#Base N-Count +1 0.3 +2 0.0 +3 0.0 +4 0.0 +5 0.0 +6 0.0 +7 0.0 +8 0.0 +9 0.0 +10-14 0.0 +15-19 0.0 +20-24 0.0 +25-29 0.0 +30-34 0.0 +35-39 0.0 +40-44 0.0 +45-49 0.0 +50-54 0.0 +55-59 0.0 +60-64 0.12 +65-69 0.5599999999999999 +70-74 6.16 +75-79 8.98 +80-84 13.100000000000001 +85-89 13.719999999999999 +90-94 11.68 +95-99 4.34 +100 0.0 +>>END_MODULE +>>Sequence Length Distribution pass +#Length Count +100 1000.0 +>>END_MODULE +>>Sequence Duplication Levels pass +#Total Duplicate Percentage 3.4 +#Duplication Level Relative count +1 100.0 +2 0.4140786749482402 +3 0.0 +4 0.0 +5 0.0 +6 0.0 +7 0.0 +8 0.0 +9 0.0 +10++ 0.2070393374741201 +>>END_MODULE +>>Overrepresented sequences fail +#Sequence Count Percentage Possible Source +AGATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTAT 14 1.4000000000000001 TruSeq Adapter, Index 1 (97% over 36bp) +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTATG 12 1.2 TruSeq Adapter, Index 1 (97% over 36bp) +AGGGGGAATGATGGTTGTCTTTGGATATACTACAGCGATGGCTATTGAGG 2 0.2 No Hit +GGCTTGTTTTATTTTAATGGCTGATCTATGTAATCACAGAGGCCAGTATG 2 0.2 No Hit +GTGGGGTGGTGTTTGTGGGGGACTTCATCATCTCAGGCTTCCCAGGGTCC 2 0.2 No Hit +CGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTATGCCG 2 0.2 TruSeq Adapter, Index 1 (96% over 33bp) +>>END_MODULE +>>Kmer Content fail +#Sequence Count Obs/Exp Overall Obs/Exp Max Max Obs/Exp Position +AAAAA 385 7.3597403 68.038994 65-69 +AGATC 435 5.4375157 23.135067 1 +GAAGA 375 5.258809 32.443344 6 +GGAAG 420 5.044668 33.345257 5 +TCCAG 475 4.8355613 14.131038 2 +AAGAG 320 4.487517 25.954676 7 +CCAGG 475 4.4180827 17.21471 3 +GAGCA 380 4.3399205 21.1377 9 +AGCAC 395 4.2895336 15.0741825 7 +CTCCA 415 4.0171337 12.105032 95-96 +AGAGC 340 3.883087 21.137697 8 +TTTTT 280 3.8749053 8.964593 10-14 +CTTCT 370 3.8646336 11.598914 55-59 +CTGAA 305 3.812511 13.130004 90-94 +CGGAA 320 3.65467 26.422123 5 +ACCAG 335 3.6379597 10.049457 7 +TCTGA 310 3.6325634 12.308498 90-94 +CACAC 340 3.5108058 14.806036 
85-89 +ATCGG 325 3.4795394 24.768969 3 +TCGGA 320 3.426008 19.815174 3 +GATCG 320 3.426008 19.815174 1 +CGTCT 355 3.387832 11.578538 85-89 +CTGCT 355 3.387832 17.662533 3 +GCACA 310 3.3664696 15.0741825 8 +TCTTC 320 3.3423858 7.7326093 50-54 +CAGCA 305 3.3121717 10.049455 6 +GAACT 260 3.2500093 13.130004 90-94 +GTCTG 320 3.2116532 12.65067 90-94 +CAGGA 280 3.197836 15.8532715 3 +AACTC 265 3.1497202 23.781752 95-96 +TGAAC 250 3.125009 13.130004 90-94 +CCAGC 350 3.0954454 6.6359653 95-96 +AGTCA 240 3.0000086 10.41078 25-29 +CACCA 290 2.9945107 6.079907 70-74 +TGCTG 295 2.960743 9.2877 2 +CAGAT 230 2.875008 11.040063 70-74 +CTTCC 315 2.8583732 10.916445 30-34 +CACGT 280 2.8504362 12.351324 85-89 +CAGGG 290 2.8367646 22.630535 9 +ACACG 260 2.8234906 13.175687 85-89 +TTCCA 250 2.7855206 9.279795 30-34 +TTCTT 230 2.765239 6.6755276 50-54 +AGCAG 240 2.7410026 15.853272 2 +TTCTG 240 2.6363494 10.165324 55-59 +ACTCC 270 2.6135564 14.526036 95-96 +GCCAG 280 2.6043434 8.607355 1 +ACGTC 255 2.595933 10.105629 85-89 +GATCT 220 2.5779483 8.675031 40-44 +TCTGC 265 2.5289452 13.2469015 2 +AAGAT 160 2.4557784 12.783248 35-39 +ATCTC 220 2.4512577 9.279794 40-44 +CAGTC 240 2.4432309 8.554544 90-94 +TCCAA 205 2.4365761 10.999062 7 +CTTTT 200 2.4045558 16.688818 6 +TTCCT 230 2.40234 9.665762 7 +CCAGT 235 2.3923304 9.4206915 25-29 +TTTCT 195 2.3444414 16.688818 8 +CTGGG 255 2.3383298 6.004135 80-84 +TGCTT 210 2.3068056 10.165323 4 +TCTTT 190 2.284328 5.5629396 15-19 +TTTTC 190 2.2843277 11.125878 7 +GGGGG 255 2.2468696 16.307867 2 +AGGAA 160 2.2437584 19.466007 5 +GTCAC 220 2.2396283 10.184532 95-96 +TCACT 200 2.2284167 8.360176 95-96 +CACTT 200 2.2284167 10.3108835 30-34 +GAAAA 135 2.2103586 10.606119 60-64 +ACTTC 195 2.172706 9.279794 30-34 +TTGAA 150 2.1582448 11.9834385 60-64 +CTCCT 235 2.1324375 16.794533 4 +TCCTC 235 2.1324372 8.397265 5 +ATCTT 165 2.11616 7.1210704 10-14 +GGGGA 205 2.1089406 14.2801 3 +ACACA 165 2.092039 11.7331705 8 +TGCAG 195 2.0877237 9.907587 5 +GACCA 190 
2.0633202 10.049455 6 +AGGGG 200 2.057503 9.520067 1 +CCTCC 260 2.049668 14.590484 5 +AGGAG 170 2.0418897 5.557543 2 +TCCTT 195 2.0367663 14.498643 4 +GTCTT 185 2.032186 15.247986 7 +GCTGG 220 2.0173824 8.485845 1 +CCAGA 185 2.0090222 5.3284492 70-74 +CCTGG 230 2.0054333 8.068818 3 +GCAGG 205 2.005299 9.052214 3 +GGACC 215 1.9997637 8.607355 5 +TTCAT 155 1.987908 5.934226 2 +CCTTT 190 1.9845415 14.498643 5 +TTTCC 190 1.9845415 5.799457 15-19 +TGGCA 185 1.980661 14.861383 2 +TCTTG 180 1.977262 10.165323 5 +CCAAG 180 1.9547247 9.044511 35-39 +CTTCA 175 1.9498644 10.310883 6 +CAAGA 145 1.933477 12.339583 35-39 +CTGGA 180 1.9271295 9.907587 6 +GGCTG 210 1.9256833 16.97169 2 +AATGA 125 1.918577 7.677627 95-96 +TGAAA 125 1.918577 15.623971 60-64 +GCTTC 200 1.9086379 13.2469015 2 +GTCCA 185 1.8833237 14.131036 1 +AGAAA 115 1.882898 7.5757995 7 +TGGGG 195 1.8805519 13.386638 1 +TTCTC 180 1.880092 5.799457 25-29 +CTTGA 160 1.8748715 8.675031 60-64 +ACAAA 120 1.8682072 5.762797 40-44 +TCTCG 195 1.8609219 8.831266 5 +GGGAC 190 1.8585701 9.052216 5 +TGAGG 165 1.8578365 5.209824 2 +TGAAG 140 1.8404517 6.082693 2 +CATCT 165 1.8384434 5.155441 4 +CACTG 180 1.8324232 9.4206915 6 +CTGCA 180 1.8324231 5.3465896 90-94 +GCTGC 210 1.8310483 8.068819 1 +GCAGA 160 1.8273348 10.568848 3 +CCTTC 200 1.8148402 8.397265 9 +AGGGA 150 1.8016673 6.0081544 95-96 +TTTCA 140 1.7955297 7.1210704 15-19 +CACAG 165 1.7918309 5.432139 95-96 +AAACA 115 1.7903653 7.6389136 70-74 +ATTTT 120 1.7715117 13.661307 6 +TTTTG 140 1.7701824 17.551357 7 +GGGGC 210 1.7594293 11.629828 3 +GATTT 130 1.7534488 12.481857 6 +CAAAT 120 1.7513192 6.7527947 50-54 +GAGGG 170 1.7488776 9.520067 1 +GAAGG 145 1.7416117 6.0081544 95-96 +CATTT 135 1.7314036 5.9342256 5 +ATTTC 135 1.7314036 5.9342256 7 +CCTCT 190 1.7240983 8.397266 1 +ATCCA 145 1.7234317 5.49953 4 +GCAGC 185 1.7207267 6.9789357 95-96 +TCCTG 180 1.717774 13.2469 2 +CTCTG 180 1.717774 13.2469 2 +AAAAC 110 1.7125233 7.6389136 70-74 +CTTGG 170 1.7061908 9.2877 2 
+AAAAT 95 1.7024158 8.291661 9 +TCACC 175 1.693972 8.957724 8 +TCCAC 175 1.693972 8.957724 5 +GAGAA 120 1.6828189 6.488669 6 +TCTCC 185 1.6787271 5.038359 55-59 +GAGCC 180 1.6742208 8.607355 9 +TCATC 150 1.6713123 5.1554413 2 +AGACA 125 1.6667906 6.169792 2 +TGATG 135 1.6636823 11.404236 9 +GGGAG 160 1.6460025 9.520067 1 +AGCCA 150 1.6289369 6.029673 10-14 +ATGCC 160 1.6288207 8.478622 45-49 +CTCGT 170 1.6223421 8.831266 3 +GAGGA 135 1.6215005 11.115086 3 +TGTTG 140 1.6173534 10.690706 2 +CTCAT 145 1.6156021 5.1554418 2 +CAGGT 150 1.6059413 9.907587 4 +GCTTG 160 1.6058266 9.2877 60-64 +GGGTC 175 1.6047363 12.728768 2 +TCATT 125 1.6031516 5.934226 9 +GTTGA 130 1.6020645 5.702118 1 +ACAGA 120 1.6001189 10.005068 95-96 +GGAGG 155 1.5945649 9.520067 2 +GGGGT 165 1.5912362 13.386638 1 +TGGGA 140 1.5763463 10.419649 2 +GGATG 140 1.5763462 15.629472 6 +GCCTC 190 1.575248 7.672287 2 +CCTGC 190 1.5752479 11.508429 2 +GCTCC 190 1.5752479 11.508429 6 +TCTCT 150 1.5667434 5.224736 95-96 +GGGAA 130 1.561445 11.115086 4 +TCCAT 140 1.5598917 10.3108835 8 +GGCTT 155 1.5556445 13.93155 1 +TTGAT 115 1.5511277 6.240928 4 +CATCA 130 1.5451456 5.49953 2 +AGAGA 110 1.542584 6.488669 9 +AGGAC 135 1.541814 6.341309 55-59 +GTATG 125 1.5404466 9.123388 45-49 +AACAT 105 1.5324043 13.5055895 9 +AGCTC 150 1.5270194 9.4206915 5 +TTTGT 120 1.5172992 17.551357 8 +GATGA 115 1.5117996 6.082693 5 +GAGAT 115 1.5117996 6.082693 4 +AGGAT 115 1.5117996 12.165386 4 +TGAGA 115 1.5117996 6.082693 5 +CTGGT 150 1.5054625 9.2877 4 +GCTGT 150 1.5054625 18.5754 3 +TTCAC 135 1.504181 10.310883 7 +CCCAG 170 1.5035021 12.276537 2 +CAGTG 140 1.4988785 9.907587 5 +CTCCC 190 1.4978343 7.295242 1 +CCCTG 180 1.4923402 11.5084305 2 +CAGAG 130 1.4847097 7.398194 20-24 +CTTTG 135 1.4829465 10.165323 2 +CAAAA 95 1.4789973 7.203496 9 +TCTCA 130 1.4484707 5.1554413 8 +GAATG 110 1.4460692 12.165386 7 +GGAAT 110 1.4460692 12.165386 5 +TTTGG 125 1.4440656 5.345353 7 +GGCCT 165 1.4386805 12.103227 1 +GCTCT 150 1.4314783 
6.1818867 20-24 +TCTGT 130 1.4280226 15.247986 3 +CTGTT 130 1.4280226 15.247986 4 +AGGTT 115 1.4172109 11.404235 8 +TTGAG 115 1.4172107 5.702117 4 +TTTGA 105 1.416247 7.4891143 10-14 +ATCTG 120 1.4061534 5.4218936 2 +GGTCT 140 1.4050984 9.287701 6 +TTTTA 95 1.4024467 7.384491 95-96 +GGGTG 145 1.3983592 13.386638 2 +GGCAC 150 1.3951839 8.607355 4 +AAAGA 85 1.3917071 7.5757985 8 +AAGAA 85 1.3917071 5.254889 75-79 +TTGTT 110 1.3908576 5.850453 4 +GGAGA 115 1.3812783 5.557543 3 +ATGAC 110 1.3750039 6.252721 95-96 +TGTTC 125 1.3730987 10.165325 5 +GGGCA 140 1.3694727 9.052216 4 +ATGAT 95 1.3668885 6.6574664 6 +CCACT 140 1.3551775 5.3746343 30-34 +TGGCT 135 1.3549163 13.931552 3 +GATGG 120 1.3511539 10.419648 9 +TCGTA 115 1.3475639 5.421894 40-44 +TGTCA 115 1.3475639 5.421894 5 +GCTGA 125 1.3382844 9.907587 6 +CAGAA 100 1.3334324 5.6025352 90-94 +CCAAA 105 1.3312978 5.8665853 8 +GGGCT 145 1.3296387 12.728768 1 +TAGGA 100 1.3146083 12.165386 4 +GACAG 115 1.313397 5.2844243 1 +GGTCC 150 1.3078917 8.068819 6 +CCATC 135 1.3067783 8.957724 9 +AAATG 85 1.3046323 7.101804 6 +TTCAA 95 1.2997144 6.330293 9 +CGTAT 110 1.2889742 8.675031 45-49 +TGACT 110 1.2889742 5.421894 3 +TATGC 110 1.2889739 8.67503 45-49 +GCCCT 155 1.2850707 7.672287 3 +TGGGC 140 1.283789 8.485846 7 +ACTTT 100 1.2825212 5.9342256 1 +ATGTT 95 1.2813665 6.2409286 1 +ATTTG 95 1.2813663 12.481856 9 +TGGTT 110 1.2707777 5.345353 5 +TGGTG 120 1.2666163 9.767722 7 +GTTTT 100 1.2644161 5.8504534 6 +GCCTG 145 1.2642952 12.103229 1 +TTGCT 115 1.2632507 6.0991945 50-54 +CCACC 150 1.2614243 7.7821474 5 +GGACA 110 1.2562928 15.853274 6 +GAAGC 110 1.2562928 10.568849 9 +TGACA 100 1.2500036 5.7837667 9 +GACAT 100 1.2500035 11.567533 7 +TGGAA 95 1.248878 6.082693 5 +ACAGC 115 1.2488517 10.049455 5 +AATCC 105 1.2480024 5.499531 7 +TGCCT 130 1.2406145 8.831266 3 +AGGTG 110 1.2385577 5.209824 4 +GTGGC 135 1.2379395 12.728768 1 +CATGT 105 1.2303842 5.4218936 1 +TAGAT 85 1.2230055 6.0453725 90-94 +CCCTC 155 1.2219174 7.295242 4 
+GCCGT 140 1.2206988 8.068819 3 +AGTTT 90 1.2139261 6.2409286 7 +TTTAG 90 1.213926 6.240928 8 +TTGGG 115 1.2138406 9.767722 2 +ACCTC 125 1.20998 8.957724 1 +AGCAA 90 1.2000892 6.169792 9 +CAAAG 90 1.2000891 6.169791 5 +AAAGC 90 1.2000891 6.169791 6 +ACAGG 105 1.1991886 10.568849 8 +AGGCA 105 1.1991886 5.712891 95-96 +ATCAG 95 1.1875033 5.7837663 6 +ATGAG 90 1.1831475 6.082693 25-29 +CAGTT 100 1.1717947 5.1698627 85-89 +ATGCT 100 1.1717947 5.421894 8 +TCAAT 85 1.1629024 6.3302937 10-14 +TGTGT 100 1.1552525 10.690706 3 +GCCCA 130 1.1497369 12.276536 1 +TGATT 85 1.1464858 12.481857 5 +TGCTC 120 1.1451827 8.831267 4 +TGTCC 120 1.1451827 13.2469015 2 +TCCCC 145 1.143084 7.295242 2 +AAGGC 100 1.1420842 5.493164 65-69 +CAACA 90 1.1411123 5.8665853 8 +CACAA 90 1.1411123 11.7331705 9 +ACATC 95 1.129145 5.4995303 8 +AAGCT 90 1.1250031 6.2527194 95-96 +GAAAG 80 1.1218792 12.977338 7 +AAGGA 80 1.1218792 6.488669 3 +GCACT 110 1.1198142 9.4206915 5 +CCTGA 110 1.119814 9.420691 9 +ACCTT 100 1.1142083 5.1554418 7 +GTCAT 95 1.113205 5.421894 1 +TGATC 95 1.113205 10.843788 5 +TCATG 95 1.113205 5.421894 3 +TGGAT 90 1.1091216 5.702118 9 +GTGGG 115 1.1090435 8.924425 1 +CTGTG 110 1.1040058 9.2877 4 +GCTTT 100 1.0984789 5.4947696 95-96 +TGTCT 100 1.0984789 10.165323 5 +TTGGT 95 1.0974898 5.345353 4 +CTGTC 115 1.0974668 17.662535 4 +CAGAC 100 1.0859579 5.0247273 5 +GGAAC 95 1.0849801 5.2844243 6 +CCTCG 130 1.0778012 7.672287 6 +GCGGC 135 1.075477 7.372196 1 +ATAAA 60 1.0752101 8.291662 7 +GGGAT 95 1.0696635 10.419649 3 +CATCC 110 1.0647823 8.957723 3 +ACAGT 85 1.062503 5.7837663 4 +ACTGA 85 1.062503 11.567533 7 +GTTGG 100 1.0555136 9.767722 1 +TGTGG 100 1.0555136 9.767722 5 +GGAAA 75 1.0517617 19.466007 6 +GTGAA 80 1.0516868 6.082693 1 +GAAGT 80 1.0516866 6.082693 5 +GTCTC 110 1.0497508 8.831267 1 +CGGCT 120 1.046313 8.068818 1 +TTTAT 70 1.0333818 5.4645233 10-14 +GACAC 95 1.0316601 10.049455 7 +GGCAA 90 1.0278759 10.56885 3 +TCATA 75 1.0260904 6.330293 5 +ATTCA 75 1.0260903 6.3302927 7 
+TAACA 70 1.0216029 6.7527957 8 +GGTCA 95 1.0170963 9.907589 3 +ATGGC 95 1.0170962 9.907587 1 +TCAGG 95 1.0170962 9.907587 8 +GGTGA 90 1.0133655 15.629474 3 +TGTTT 80 1.0115329 5.8504534 5 +TGAAT 70 1.007181 6.6574664 5 +ATTGA 70 1.0071809 6.6574664 7 +AAGTT 70 1.0071809 6.6574664 6 +TTGCC 105 1.0020349 8.831267 2 +CTTGC 105 1.0020349 8.831267 6 +GCAAA 75 1.0000744 6.169792 4 +CATAG 80 1.0000029 6.2527204 95-96 +GACTT 85 0.99602544 5.421894 1 +CTGAT 85 0.99602544 5.421894 4 +CTTGT 90 0.988631 10.165323 3 +AATGG 75 0.98595625 6.082693 8 +AAGGT 75 0.9859562 6.0826926 4 +GATGT 80 0.98588586 5.7021174 7 +GGATT 80 0.98588586 11.404235 5 +GGCGG 115 0.96349704 7.753219 1 +AGAGG 80 0.9608892 5.557543 8 +GAGGT 85 0.95706743 5.2098246 3 +ATGGG 85 0.9570673 5.209824 1 +CCGTC 115 0.95343953 7.672287 4 +TAGCA 75 0.9375027 5.7837667 1 +ACATG 75 0.9375026 5.7837663 2 +TTGCA 80 0.93743575 5.421894 4 +GTTCA 80 0.93743575 5.421894 6 +ATGTC 80 0.93743575 5.421894 5 +TTCAG 80 0.93743575 5.421894 8 +TTGAC 80 0.9374356 5.4218936 2 +GTTCT 85 0.93370706 5.0826616 1 +TTGTC 85 0.93370706 5.0826616 9 +TTTGC 85 0.93370706 5.0826616 3 +ATGGT 75 0.924268 5.7021174 4 +ATGAA 60 0.920917 7.1018047 9 +AGATG 70 0.92022586 6.082693 5 +GCTCA 90 0.91621155 5.092265 95-96 +AGTGC 85 0.9100334 9.907587 2 +AGGGT 80 0.90076935 10.419649 1 +GTAGG 80 0.90076923 10.419648 6 +AGTGG 80 0.90076923 5.209824 2 +TAAAA 50 0.89600843 8.291662 8 +CACAT 75 0.89143026 5.499531 6 +CCATT 80 0.89136666 10.3108835 9 +ATACT 65 0.8892783 6.330293 9 +ACATT 65 0.88927823 6.3302927 7 +GCGGG 105 0.87971467 7.753219 2 +ACACC 85 0.8777014 9.555587 9 +CATAA 60 0.8756596 6.7527947 6 +ACCCT 90 0.8711856 13.436585 1 +GAACA 65 0.8667311 6.169792 7 +ACTGC 85 0.8653109 5.092265 95-96 +GGTAT 70 0.86265016 17.106354 6 +AGTTG 70 0.86265016 5.702118 7 +GAGAC 75 0.85656327 5.2844243 1 +GTGTC 85 0.8530954 13.93155 1 +GTTGC 85 0.8530954 9.2877 1 +ATAGA 55 0.84417385 7.1018047 8 +GAAAT 55 0.84417385 7.1018047 5 +CATTC 75 0.83565605 5.155441 6 
+TCACA 70 0.83200157 5.499531 3 +TGCGG 90 0.8252928 8.485845 3 +GCATT 70 0.8202563 5.421894 4 +GAACC 75 0.8144686 5.0247283 6 +CTCGA 80 0.81441027 9.420691 6 +GAATC 65 0.8125023 5.7837667 6 +TACAG 65 0.81250226 11.567533 7 +TGGTA 65 0.80103225 11.404236 5 +AAGAC 60 0.80005944 6.169791 8 +CAAGG 70 0.7994591 5.2844243 2 +ATGTA 55 0.7913565 6.6574664 4 +AATGT 55 0.7913565 6.6574664 3 +CGGCA 85 0.7906042 8.607354 2 +GAGAG 65 0.7807225 5.557543 8 +ACCAT 65 0.7725729 5.499531 8 +TTCTA 60 0.7695128 5.934226 9 +TAGAA 50 0.7674308 7.1018047 9 +GCATC 75 0.7635097 9.4206915 1 +GTTCC 80 0.76345515 8.831267 6 +AGCTT 65 0.76166654 5.421894 1 +TTAGC 65 0.76166654 5.421894 9 +CTGTA 65 0.76166654 5.421894 2 +ACTTG 65 0.7616664 5.4218936 2 +GTGCT 75 0.7527313 9.287701 3 +ATCAT 55 0.7524662 6.3302927 3 +GTTTG 65 0.7509141 5.345353 9 +GTGTT 65 0.7509141 10.690706 1 +GTCAA 60 0.75000215 11.5675335 6 +AATGC 60 0.75000215 6.252721 95-96 +CAAGT 60 0.7500021 5.7837663 9 +GCAAT 60 0.7500021 5.7837663 4 +GCAAG 65 0.74235487 5.2844243 1 +AGTGT 60 0.7394144 5.7021174 1 +TTAGG 60 0.7394144 5.702118 7 +AGCGG 75 0.73364604 9.052214 1 +ATCCT 65 0.72423524 5.155441 4 +ACTCT 65 0.72423524 5.155441 9 +AGTGA 55 0.7230346 6.082693 6 +AATAA 40 0.71680677 8.291662 6 +AACCT 60 0.71314424 5.4995303 1 +ATTCT 55 0.70538664 5.9342256 7 +AGTCT 60 0.7030768 5.421894 3 +GTGCA 65 0.69590795 9.907589 6 +AAAGT 45 0.69068766 7.101804 8 +AACTG 55 0.6875019 5.7837663 1 +CGAAG 60 0.68525064 5.2844243 4 +GATTG 55 0.67779654 5.702118 6 +GTGAT 55 0.67779654 11.404236 4 +TGTTA 50 0.67440337 12.481857 5 +TTGTA 50 0.6744033 6.240928 9 +TATTG 50 0.6744033 6.240928 7 +CTCTA 60 0.6685249 5.1554413 7 +TACCT 60 0.66852486 10.310882 8 +ATGGA 50 0.65730417 6.082693 8 +ATACA 45 0.6567447 6.7527957 6 +ATCAA 45 0.65674466 6.7527947 9 +TGTAA 45 0.6474735 6.6574664 7 +GCGGT 70 0.6418945 8.485846 4 +GGCCG 80 0.63731974 7.372196 2 +GGTTT 55 0.63538885 10.690706 9 +TTGTG 55 0.63538885 5.345353 1 +TATAT 40 0.62991583 7.2865515 8 +CCTGT 65 
0.62030727 8.831266 3 +GTGAG 55 0.6192789 5.2098246 1 +TAGGG 55 0.61927885 5.209824 8 +GAGTT 50 0.6161787 5.7021174 6 +ATGTG 50 0.6161787 5.702118 2 +GAATA 40 0.61394465 7.1018047 6 +CTGCG 70 0.6103493 8.068818 2 +CGGTG 65 0.59604484 8.485845 2 +TAAGG 45 0.5915738 6.082693 9 +AAGTG 45 0.5915737 6.0826926 1 +TATTT 40 0.5905039 6.8306537 8 +GGCAT 55 0.5888452 14.861383 3 +GTATC 50 0.5858973 5.421894 4 +ATAAC 40 0.5837731 13.505591 7 +TTACT 45 0.57713455 5.934226 9 +GTATA 40 0.575532 13.314933 7 +GAGTG 50 0.5629808 5.209824 1 +GTACA 45 0.5625016 5.7837667 6 +ATAGC 45 0.5625016 5.7837667 9 +TCTAC 50 0.5571041 5.1554413 8 +GCGAG 55 0.53800714 9.052216 1 +ACGGG 55 0.5380071 9.052214 1 +GATAA 35 0.5372016 7.1018047 6 +AATAG 35 0.5372016 7.101805 7 +CAACT 45 0.53485817 5.4995303 6 +CATAC 45 0.53485817 5.4995303 5 +GATTC 45 0.52730757 5.421894 6 +AGGTA 40 0.5258433 12.165386 5 +CGGTC 60 0.52315664 8.068819 5 +ACGAG 45 0.51393795 5.2844243 7 +TATTC 40 0.5130085 5.9342256 7 +CTAAA 35 0.51080143 6.7527957 9 +TACAA 35 0.51080143 5.402236 35-39 +CCTTA 45 0.5013937 5.1554413 6 +CAGTA 40 0.50000143 5.7837667 4 +GTGTA 40 0.49294293 5.702118 4 +TAACT 35 0.47884214 6.330293 8 +CTTAA 35 0.47884214 6.330293 7 +CTATA 35 0.47884214 6.330293 4 +TTAAC 35 0.47884214 6.330293 8 +TATCA 35 0.4788421 6.3302927 5 +TCAAC 40 0.47542948 5.499531 7 +ACTCA 40 0.47542942 5.49953 8 +TTAGT 35 0.47208238 10.120425 95-96 +TGTAT 35 0.47208238 6.2409286 3 +ATTGT 35 0.47208235 6.240928 8 +GTTAC 40 0.46871787 5.421894 6 +TGTAC 40 0.46871787 10.843788 7 +AGAGT 35 0.46011293 6.082693 5 +AGTAG 35 0.46011293 6.082693 5 +CTCCG 55 0.45599285 7.672287 6 +GGTAG 40 0.45038468 5.2098246 2 +TTTAC 35 0.44888243 5.9342256 8 +CTACT 40 0.44568333 5.1554418 4 +AACTA 30 0.4378298 6.7527947 9 +TATAG 30 0.43164897 6.6574664 5 +ATATA 25 0.4199739 7.7728767 9 +CTCAA 35 0.41600078 5.499531 9 +TATAC 30 0.4104361 6.3302927 5 +ACTAT 30 0.4104361 6.3302927 6 +TACTA 30 0.4104361 6.3302927 5 +TCGAT 35 0.41012815 10.843788 7 +ACGTT 35 
0.41012815 5.421894 4 +CGAAA 30 0.40002972 6.169792 9 +GTAAG 30 0.3943825 6.082693 8 +ATAGG 30 0.3943825 6.082693 3 +TCCTA 35 0.38997287 5.1554413 5 +TTACC 35 0.38997287 5.1554413 7 +ACCGA 35 0.3800853 5.0247273 7 +GCATA 30 0.37500107 5.7837667 1 +TCGAA 30 0.37500107 5.7837667 4 +GCTAA 30 0.37500107 5.7837667 8 +TAGGT 30 0.3697072 5.7021174 7 +GTTAG 30 0.3697072 5.702118 6 +CAATA 25 0.36485815 6.7527947 5 +ATACC 30 0.35657212 5.499531 6 +GACGA 30 0.3426253 5.284424 6 +AAGCG 30 0.3426253 10.568848 7 +GTTTA 25 0.33720168 6.2409286 7 +GTATT 25 0.33720168 12.481857 6 +AGATA 20 0.30697232 7.1018047 5 +CGTCA 30 0.30540386 9.420691 5 +CCTAA 25 0.29714343 5.499531 7 +TACCA 25 0.2971434 5.49953 9 +TGCTA 25 0.29294866 5.421894 7 +TACGT 25 0.29294863 5.4218936 9 +AGACG 25 0.2855211 5.284425 9 +CCTAT 25 0.2785521 5.1554418 3 +TAAGC 20 0.25000072 5.7837667 9 +CTAAG 20 0.25000072 5.7837667 8 +CGATT 20 0.23435894 5.421894 9 +GGGTA 20 0.22519234 5.2098246 2 +ACGCA 20 0.21719159 5.0247273 5 +GCGAA 15 0.17131266 5.284425 3 +CGAAC 15 0.16289368 5.0247273 5 +>>END_MODULE diff --git a/public/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala b/public/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala new file mode 100644 index 0000000000000000000000000000000000000000..0951bea84834b611c323c8e0b1b77ae55f0461b1 --- /dev/null +++ b/public/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala @@ -0,0 +1,80 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. 
The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.pipelines.flexiprep + +import java.io.File +import java.nio.file.Paths + +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +class FastqcV0101Test extends TestNGSuite with Matchers { + + /** Returns the absolute path to test resource directory as a File object */ + private val resourceDir: File = new File(Paths.get(getClass.getResource("/").toURI).toString) + + /** Given a resource file name, returns the the absolute path to it as a File object */ + private def resourceFile(p: String): File = new File(resourceDir, p) + + /** Mock output file of a FastQC v0.10.1 run */ + // the file doesn't actually exist, we just need it so the outputDir value can be computed correctly + private val outputv0101: File = resourceFile("v0101.fq_fastqc.zip") + + @Test def testOutputDir() = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + fqc.outputDir shouldBe new File(resourceDir, "v0101.fq_fastqc") + } + + @Test def testQcModules() = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + // 11 QC modules + fqc.qcModules.size shouldBe 11 + // first module + fqc.qcModules.keySet should contain("Basic Statistics") + // mid (6th module) + fqc.qcModules.keySet should contain("Per sequence GC content") + // last module + fqc.qcModules.keySet should contain("Kmer Content") + } + + @Test def testSingleQcModule() = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + fqc.qcModules("Basic Statistics").name should ===("Basic Statistics") + fqc.qcModules("Basic Statistics").status should ===("pass") + fqc.qcModules("Basic Statistics").lines.size shouldBe 8 + } + + @Test def testEncoding() = { + val fqc = new 
Fastqc(null) + fqc.output = outputv0101 + fqc.encoding shouldBe "Sanger / Illumina 1.9" + } + + @Test def testFoundAdapter() = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + fqc.contaminants = Option(resourceFile("fqc_contaminants_v0101.txt")) + val adapters = fqc.foundAdapters + adapters.size shouldBe 1 + adapters.head.name should ===("TruSeq Adapter, Index 1") + // from fqc_contaminants_v0101.txt + adapters.head.seq should ===("GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG") + } +} \ No newline at end of file diff --git a/public/kopisu/.gitignore b/public/kopisu/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a6f89c2da7a029afa02b6e7a2bf80ad34958a311 --- /dev/null +++ b/public/kopisu/.gitignore @@ -0,0 +1 @@ +/target/ \ No newline at end of file diff --git a/public/kopisu/pom.xml b/public/kopisu/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..3eae712aa40f517b182d345f7c7187eb846995f6 --- /dev/null +++ b/public/kopisu/pom.xml @@ -0,0 +1,43 @@ +<!-- + + Biopet is built on top of GATK Queue for building bioinformatic + pipelines. It is mainly intended to support LUMC SHARK cluster which is running + SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + should also be able to execute Biopet tools and pipelines. + + Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + + Contact us at: sasc@lumc.nl + + A dual licensing mode is applied. The source code within this project that are + not part of GATK Queue is freely available for non-commercial use under an AGPL + license; For commercial users or users who do not want to follow the AGPL + license, please contact us to obtain a separate license. 
+ +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>nl.lumc.sasc</groupId> + <artifactId>Kopisu</artifactId> + <packaging>jar</packaging> + + <parent> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Biopet</artifactId> + <version>0.3.0-DEV</version> + <relativePath>../</relativePath> + </parent> + + <inceptionYear>2015</inceptionYear> + <name>Kopisu</name> + + <dependencies> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>BiopetFramework</artifactId> + <version>${project.version}</version> + </dependency> + </dependencies> +</project> diff --git a/public/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/ConiferPipeline.scala b/public/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/ConiferPipeline.scala new file mode 100644 index 0000000000000000000000000000000000000000..c45de4eac904af6b110c7448416d0ce0a90fe8e3 --- /dev/null +++ b/public/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/ConiferPipeline.scala @@ -0,0 +1,121 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. 
+ */ +package nl.lumc.sasc.biopet.pipelines.kopisu + +import java.io.{ BufferedWriter, FileWriter, File } + +import nl.lumc.sasc.biopet.core.{ PipelineCommand, _ } +import nl.lumc.sasc.biopet.core.config._ +import nl.lumc.sasc.biopet.extensions.Ln +import nl.lumc.sasc.biopet.extensions.conifer.{ ConiferAnalyze, ConiferCall, ConiferRPKM } +import org.broadinstitute.gatk.queue.QScript + +import scala.io.Source + +class ConiferPipeline(val root: Configurable) extends QScript with BiopetQScript { + //* + // Kopisu - Coniferpipeline is a pipeline that can run standalone + // */ + def this() = this(null) + + /** Input bamfile */ + @Input(doc = "Bamfile to start from", fullName = "bam", shortName = "bam", required = true) + var inputBam: File = _ + + @Argument(doc = "Label this sample with a name/ID [0-9a-zA-Z] and [-_]", + fullName = "label", + shortName = "label", required = false) + var sampleLabel: String = _ + + /** Exon definitions in bed format */ + @Input(doc = "Exon definition file in bed format", fullName = "exon_bed", shortName = "bed", required = false) + var probeFile: File = config("probeFile") + + @Input(doc = "Previous RPKM files (controls)", fullName = "rpkm_controls", shortName = "rc", required = false) + var controlsDir: File = config("controlsDir") + + @Argument(doc = "Enable RPKM only mode, generate files for reference db", shortName = "rpkmonly", required = false) + var RPKMonly: Boolean = false + + val summary = new ConiferSummary(this) + + def init() { + + } + + def input2RPKM(inputBam: File): String = { + if (!sampleLabel.isEmpty) sampleLabel ++ ".txt" + else swapExt(inputBam.getName, ".bam", ".txt") + } + + def input2HDF5(inputBam: File): String = { + if (!sampleLabel.isEmpty) sampleLabel ++ ".hdf5" + else swapExt(inputBam.getName, ".bam", ".hdf5") + } + def input2Calls(inputBam: File): String = { + if (!sampleLabel.isEmpty) sampleLabel ++ ".calls.txt" + else swapExt(inputBam.getName, ".bam", "calls.txt") + } + + def biopetScript(): Unit = { + + 
/** Setup RPKM directory */ + val sampleDir: String = outputDir + val RPKMdir: File = new File(sampleDir + File.separator + "RPKM" + File.separator) + RPKMdir.mkdir() + + val coniferRPKM = new ConiferRPKM(this) + coniferRPKM.bamFile = this.inputBam.getAbsoluteFile + coniferRPKM.probes = this.probeFile + coniferRPKM.output = new File(RPKMdir + File.separator + input2RPKM(inputBam)) + add(coniferRPKM) + + if (!RPKMonly) { + /** Collect the rpkm_output to a temp directory, where we merge with the control files */ + var refRPKMlist: List[File] = Nil + for (f <- controlsDir.listFiles()) { + var target = new File(RPKMdir + File.separator + f.getName) + if (!target.exists()) { + logger.info("Creating " + target.getAbsolutePath) + add(Ln(this, f, target, true)) + refRPKMlist :+= target + } + } + + val coniferAnalyze = new ConiferAnalyze(this) + coniferAnalyze.deps = List(coniferRPKM.output) ++ refRPKMlist + coniferAnalyze.probes = this.probeFile + coniferAnalyze.rpkmDir = RPKMdir + coniferAnalyze.output = new File(sampleDir + File.separator + input2HDF5(inputBam)) + add(coniferAnalyze) + + val coniferCall = new ConiferCall(this) + coniferCall.input = coniferAnalyze.output + coniferCall.output = new File(sampleDir + File.separator + "calls.txt") + add(coniferCall) + + summary.deps = List(coniferCall.output) + summary.label = sampleLabel + summary.calls = coniferCall.output + summary.out = new File(sampleDir + File.separator + input2Calls(inputBam)) + + add(summary) + } + + } +} + +object ConiferPipeline extends PipelineCommand diff --git a/public/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/ConiferSummary.scala b/public/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/ConiferSummary.scala new file mode 100644 index 0000000000000000000000000000000000000000..78ffcbb29540bbfd801c04b4b5d709b7eaf69191 --- /dev/null +++ b/public/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/ConiferSummary.scala @@ -0,0 +1,65 @@ +/** + * Biopet is built on top 
of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.pipelines.kopisu + +import java.io.{ FileWriter, BufferedWriter, File, PrintWriter } + +import argonaut._ +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.queue.function.InProcessFunction +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +import scala.io.Source + +class ConiferSummary(val root: Configurable) extends InProcessFunction with Configurable { + def filterCalls(callFile: File, outFile: File, sampleName: String): Unit = { + // val filename = callFile.getAbsolutePath + val writer = new BufferedWriter(new FileWriter(outFile)) + + for (line <- Source.fromFile(callFile).getLines()) { + line.startsWith(sampleName) || line.startsWith("sampleID") match { + case true => writer.write(line + "\n"); + case _ => + } + } + writer.close() + } + + this.analysisName = getClass.getSimpleName + + @Input(doc = "deps") + var deps: List[File] = Nil + + @Output(doc = "Summary output", required = true) + var out: File = _ + + @Input(doc = "calls") + var calls: File = _ + + var label: String = _ + + var coniferPipeline: ConiferPipeline = if (root.isInstanceOf[ConiferPipeline]) root.asInstanceOf[ConiferPipeline] else { + throw new IllegalStateException("Root is no instance of ConiferPipeline") + } + + 
var resources: Map[String, Json] = Map() + + override def run { + logger.debug("Start") + filterCalls(calls, out, label) + logger.debug("Stop") + } +} diff --git a/public/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/Kopisu.scala b/public/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/Kopisu.scala new file mode 100644 index 0000000000000000000000000000000000000000..94b3bcbe619ee5b01bd29bea6aa9b9e707934a6d --- /dev/null +++ b/public/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/Kopisu.scala @@ -0,0 +1,54 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. 
+ */ +package nl.lumc.sasc.biopet.pipelines.kopisu + +import nl.lumc.sasc.biopet.core.config.Configurable +import nl.lumc.sasc.biopet.core.{ MultiSampleQScript, PipelineCommand } +import org.broadinstitute.gatk.queue.QScript + +class Kopisu(val root: Configurable) extends QScript with MultiSampleQScript { + def this() = this(null) + + @Input(doc = "Input bamfile", required = true) + var bamFile: File = config("bam") + + def init() { + if (!outputDir.endsWith("/")) outputDir += "/" + } + + def biopetScript() { + addSamplesJobs() + } + + def makeSample(id: String) = new Sample(id) + class Sample(sampleId: String) extends AbstractSample(sampleId) { + def makeLibrary(id: String) = new Library(id) + class Library(libId: String) extends AbstractLibrary(libId) { + def addJobs(): Unit = { + + } + } + + def addJobs(): Unit = { + + } + } + + def addMultiSampleJobs(): Unit = { + } +} + +object Kopisu extends PipelineCommand diff --git a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala index 40c9472ea122e68679f8eaeb76f38938048b6682..9094d38cddbe70d8c7b6787200d2afe94e19d486 100644 --- a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala +++ b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala @@ -19,8 +19,9 @@ import nl.lumc.sasc.biopet.core.config.Configurable import java.io.File import java.util.Date import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand } +import nl.lumc.sasc.biopet.extensions.{ Ln, Star, Stampy, Bowtie } +import nl.lumc.sasc.biopet.extensions.bwa.{ BwaSamse, BwaSampe, BwaAln, BwaMem } import nl.lumc.sasc.biopet.tools.FastqSplitter -import nl.lumc.sasc.biopet.extensions.aligners.{ Bwa, Star, Bowtie, Stampy } import nl.lumc.sasc.biopet.extensions.picard.{ MarkDuplicates, SortSam, MergeSamFiles, AddOrReplaceReadGroups } import 
nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics import nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep @@ -29,83 +30,86 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Argument, ClassType } import scala.math._ class Mapping(val root: Configurable) extends QScript with BiopetQScript { - qscript => def this() = this(null) @Input(doc = "R1 fastq file", shortName = "R1", required = true) var input_R1: File = _ @Input(doc = "R2 fastq file", shortName = "R2", required = false) - var input_R2: File = _ + var input_R2: Option[File] = None - @Argument(doc = "Output name", shortName = "outputName", required = false) + /** Output name */ var outputName: String = _ - @Argument(doc = "Skip flexiprep", shortName = "skipflexiprep", required = false) - var skipFlexiprep: Boolean = false + /** Skip flexiprep */ + protected var skipFlexiprep: Boolean = config("skip_flexiprep", default = false) - @Argument(doc = "Skip mark duplicates", shortName = "skipmarkduplicates", required = false) - var skipMarkduplicates: Boolean = false + /** Skip mark duplicates */ + protected var skipMarkduplicates: Boolean = config("skip_markduplicates", default = false) - @Argument(doc = "Skip metrics", shortName = "skipmetrics", required = false) - var skipMetrics: Boolean = false + /** Skip metrics */ + protected var skipMetrics: Boolean = config("skip_metrics", default = false) - @Argument(doc = "Aligner", shortName = "ALN", required = false) - var aligner: String = config("aligner", default = "bwa") + /** Aligner */ + protected var aligner: String = config("aligner", default = "bwa") - @Argument(doc = "Reference", shortName = "R", required = false) - var reference: File = config("reference") + /** Reference */ + protected var reference: File = config("reference") - @Argument(doc = "Chunking", shortName = "chunking", required = false) - var chunking: Boolean = config("chunking", false) + /** Number of chunks, when not defined pipeline will automatic calculate number of chunks */ + 
protected var numberChunks: Option[Int] = config("number_chunks") - @ClassType(classOf[Int]) - @Argument(doc = "Number of chunks, when not defined pipeline will automatic calculate number of chunks", shortName = "numberChunks", required = false) - var numberChunks: Option[Int] = None + /** Enable chunking */ + protected var chunking: Boolean = config("chunking", numberChunks.getOrElse(1) > 1) // Readgroup items - @Argument(doc = "Readgroup ID", shortName = "RGID", required = false) - var RGID: String = config("RGID") + /** Readgroup ID */ + protected var readgroupId: String = _ - @Argument(doc = "Readgroup Library", shortName = "RGLB", required = false) - var RGLB: String = config("RGLB") + // TODO: hide sampleId and libId from the command line so they do not interfere with our config values - @Argument(doc = "Readgroup Platform", shortName = "RGPL", required = false) - var RGPL: String = config("RGPL", default = "illumina") + /** Readgroup Library */ + @Argument(doc = "Library ID", shortName = "library", required = true) + var libId: String = _ - @Argument(doc = "Readgroup platform unit", shortName = "RGPU", required = false) - var RGPU: String = config("RGPU", default = "na") + /**Readgroup sample */ + @Argument(doc = "Sample ID", shortName = "sample", required = true) + var sampleId: String = _ - @Argument(doc = "Readgroup sample", shortName = "RGSM", required = false) - var RGSM: String = config("RGSM") + /** Readgroup Platform */ + protected var platform: String = config("platform", default = "illumina") - @Argument(doc = "Readgroup sequencing center", shortName = "RGCN", required = false) - var RGCN: String = config("RGCN") + /** Readgroup platform unit */ + protected var platformUnit: String = config("platform_unit", default = "na") - @Argument(doc = "Readgroup description", shortName = "RGDS", required = false) - var RGDS: String = config("RGDS") + /** Readgroup sequencing center */ + protected var readgroupSequencingCenter: Option[String] = 
config("readgroup_sequencing_center") - @Argument(doc = "Readgroup sequencing date", shortName = "RGDT", required = false) - var RGDT: Date = _ + /** Readgroup description */ + protected var readgroupDescription: Option[String] = config("readgroup_description") - @Argument(doc = "Readgroup predicted insert size", shortName = "RGPI", required = false) - var RGPI: Int = config("RGPI") + /** Readgroup sequencing date */ + protected var readgroupDate: Date = _ - var paired: Boolean = false + /** Readgroup predicted insert size */ + protected var predictedInsertsize: Option[Int] = config("predicted_insertsize") + + protected var paired: Boolean = false val flexiprep = new Flexiprep(this) + def finalBamFile: File = outputDir + outputName + ".final.bam" def init() { - if (outputDir == null) throw new IllegalStateException("Missing Output directory on mapping module") - else if (!outputDir.endsWith("/")) outputDir += "/" - if (input_R1 == null) throw new IllegalStateException("Missing FastQ R1 on mapping module") - paired = (input_R2 != null) + require(outputDir != null, "Missing output directory on mapping module") + require(input_R1 != null, "Missing output directory on mapping module") + require(sampleId != null, "Missing sample ID on mapping module") + require(libId != null, "Missing library ID on mapping module") + + paired = input_R2.isDefined - if (RGLB == null) throw new IllegalStateException("Missing Readgroup library on mapping module") - if (RGLB == null) throw new IllegalStateException("Missing Readgroup sample on mapping module") - if (RGID == null && RGSM != null && RGLB != null) RGID = RGSM + "-" + RGLB - else if (RGID == null) throw new IllegalStateException("Missing Readgroup ID on mapping module") + if (readgroupId == null && sampleId != null && libId != null) readgroupId = sampleId + "-" + libId + else if (readgroupId == null) readgroupId = config("readgroup_id") - if (outputName == null) outputName = RGID + if (outputName == null) outputName = 
readgroupId if (chunking) { if (numberChunks.isEmpty) { @@ -122,14 +126,12 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { } def biopetScript() { - var fastq_R1: File = input_R1 - var fastq_R2: File = if (paired) input_R2 else "" if (!skipFlexiprep) { - flexiprep.outputDir = outputDir + "flexiprep/" - flexiprep.input_R1 = fastq_R1 - if (paired) flexiprep.input_R2 = fastq_R2 - flexiprep.sampleName = this.RGSM - flexiprep.libraryName = this.RGLB + flexiprep.outputDir = outputDir + "flexiprep" + File.separator + flexiprep.input_R1 = input_R1 + flexiprep.input_R2 = input_R2 + flexiprep.sampleId = this.sampleId + flexiprep.libId = this.libId flexiprep.init flexiprep.runInitialJobs } @@ -145,23 +147,27 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { var chunks: Map[String, (String, String)] = Map() if (chunking) for (t <- 1 to numberChunks.getOrElse(1)) { val chunkDir = outputDir + "chunks/" + t + "/" - chunks += (chunkDir -> (removeGz(chunkDir + fastq_R1.getName), - if (paired) removeGz(chunkDir + fastq_R2.getName) else "")) + chunks += (chunkDir -> (removeGz(chunkDir + input_R1.getName), + if (paired) removeGz(chunkDir + input_R2.get.getName) else "")) } - else chunks += (outputDir -> (flexiprep.extractIfNeeded(fastq_R1, flexiprep.outputDir), - flexiprep.extractIfNeeded(fastq_R2, flexiprep.outputDir))) + else chunks += (outputDir -> ( + flexiprep.extractIfNeeded(input_R1, flexiprep.outputDir), + if (paired) flexiprep.extractIfNeeded(input_R2.get, flexiprep.outputDir) else "") + ) if (chunking) { val fastSplitter_R1 = new FastqSplitter(this) - fastSplitter_R1.input = fastq_R1 + fastSplitter_R1.input = input_R1 for ((chunkDir, fastqfile) <- chunks) fastSplitter_R1.output :+= fastqfile._1 - add(fastSplitter_R1, isIntermediate = true) + fastSplitter_R1.isIntermediate = true + add(fastSplitter_R1) if (paired) { val fastSplitter_R2 = new FastqSplitter(this) - fastSplitter_R2.input = fastq_R2 + fastSplitter_R2.input 
= input_R2.get for ((chunkDir, fastqfile) <- chunks) fastSplitter_R2.output :+= fastqfile._2 - add(fastSplitter_R2, isIntermediate = true) + fastSplitter_R2.isIntermediate = true + add(fastSplitter_R2) } } @@ -182,7 +188,8 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { val outputBam = new File(chunkDir + outputName + ".bam") bamFiles :+= outputBam aligner match { - case "bwa" => addBwa(R1, R2, outputBam, deps) + case "bwa" => addBwaMem(R1, R2, outputBam, deps) + case "bwa-aln" => addBwaAln(R1, R2, outputBam, deps) case "bowtie" => addBowtie(R1, R2, outputBam, deps) case "stampy" => addStampy(R1, R2, outputBam, deps) case "star" => addStar(R1, R2, outputBam, deps) @@ -207,19 +214,67 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { bamFile = mergeSamFile.output } - if (!skipMetrics) addAll(BamMetrics(this, bamFile, outputDir + "metrics/").functions) + if (!skipMetrics) addAll(BamMetrics(this, bamFile, outputDir + "metrics" + File.separator).functions) + add(Ln(this, swapExt(outputDir, bamFile, ".bam", ".bai"), swapExt(outputDir, finalBamFile, ".bam", ".bai"))) + add(Ln(this, bamFile, finalBamFile)) outputFiles += ("finalBamFile" -> bamFile) } - def addBwa(R1: File, R2: File, output: File, deps: List[File]): File = { - val bwaCommand = new Bwa(this) + def addBwaAln(R1: File, R2: File, output: File, deps: List[File]): File = { + val bwaAlnR1 = new BwaAln(this) + bwaAlnR1.fastq = R1 + bwaAlnR1.deps = deps + bwaAlnR1.output = swapExt(output.getParent, output, ".bam", ".R1.sai") + bwaAlnR1.isIntermediate = true + add(bwaAlnR1) + + val samFile: File = if (paired) { + val bwaAlnR2 = new BwaAln(this) + bwaAlnR2.fastq = R2 + bwaAlnR2.deps = deps + bwaAlnR2.output = swapExt(output.getParent, output, ".bam", ".R2.sai") + bwaAlnR2.isIntermediate = true + add(bwaAlnR2) + + val bwaSampe = new BwaSampe(this) + bwaSampe.fastqR1 = R1 + bwaSampe.fastqR2 = R2 + bwaSampe.saiR1 = bwaAlnR1.output + bwaSampe.saiR2 = 
bwaAlnR2.output + bwaSampe.r = getReadGroup + bwaSampe.output = swapExt(output.getParent, output, ".bam", ".sam") + bwaSampe.isIntermediate = true + add(bwaSampe) + + bwaSampe.output + } else { + val bwaSamse = new BwaSamse(this) + bwaSamse.fastq = R1 + bwaSamse.sai = bwaAlnR1.output + bwaSamse.r = getReadGroup + bwaSamse.output = swapExt(output.getParent, output, ".bam", ".sam") + bwaSamse.isIntermediate = true + add(bwaSamse) + + bwaSamse.output + } + + val sortSam = SortSam(this, samFile, output) + if (chunking || !skipMarkduplicates) sortSam.isIntermediate = true + add(sortSam) + return sortSam.output + } + + def addBwaMem(R1: File, R2: File, output: File, deps: List[File]): File = { + val bwaCommand = new BwaMem(this) bwaCommand.R1 = R1 if (paired) bwaCommand.R2 = R2 bwaCommand.deps = deps - bwaCommand.R = getReadGroup - bwaCommand.output = this.swapExt(output.getParent, output, ".bam", ".sam") - add(bwaCommand, isIntermediate = true) + bwaCommand.R = Some(getReadGroup) + bwaCommand.output = swapExt(output.getParent, output, ".bam", ".sam") + bwaCommand.isIntermediate = true + add(bwaCommand) val sortSam = SortSam(this, bwaCommand.output, output) if (chunking || !skipMarkduplicates) sortSam.isIntermediate = true add(sortSam) @@ -228,15 +283,15 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { def addStampy(R1: File, R2: File, output: File, deps: List[File]): File = { - var RG: String = "ID:" + RGID + "," - RG += "SM:" + RGSM + "," - RG += "LB:" + RGLB + "," - if (RGDS != null) RG += "DS" + RGDS + "," - RG += "PU:" + RGPU + "," - if (RGPI > 0) RG += "PI:" + RGPI + "," - if (RGCN != null) RG += "CN:" + RGCN + "," - if (RGDT != null) RG += "DT:" + RGDT + "," - RG += "PL:" + RGPL + var RG: String = "ID:" + readgroupId + "," + RG += "SM:" + sampleId + "," + RG += "LB:" + libId + "," + if (readgroupDescription != null) RG += "DS" + readgroupDescription + "," + RG += "PU:" + platformUnit + "," + if (predictedInsertsize.getOrElse(0) > 0) RG 
+= "PI:" + predictedInsertsize.get + "," + if (readgroupSequencingCenter.isDefined) RG += "CN:" + readgroupSequencingCenter.get + "," + if (readgroupDate != null) RG += "DT:" + readgroupDate + "," + RG += "PL:" + platform val stampyCmd = new Stampy(this) stampyCmd.R1 = R1 @@ -245,7 +300,8 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { stampyCmd.readgroup = RG stampyCmd.sanger = true stampyCmd.output = this.swapExt(output.getParent, output, ".bam", ".sam") - add(stampyCmd, isIntermediate = true) + stampyCmd.isIntermediate = true + add(stampyCmd) val sortSam = SortSam(this, stampyCmd.output, output) if (chunking || !skipMarkduplicates) sortSam.isIntermediate = true add(sortSam) @@ -258,7 +314,8 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { if (paired) bowtie.R2 = R2 bowtie.deps = deps bowtie.output = this.swapExt(output.getParent, output, ".bam", ".sam") - add(bowtie, isIntermediate = true) + bowtie.isIntermediate = true + add(bowtie) return addAddOrReplaceReadGroups(bowtie.output, output) } @@ -278,13 +335,13 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { val addOrReplaceReadGroups = AddOrReplaceReadGroups(this, input, output) addOrReplaceReadGroups.createIndex = true - addOrReplaceReadGroups.RGID = RGID - addOrReplaceReadGroups.RGLB = RGLB - addOrReplaceReadGroups.RGPL = RGPL - addOrReplaceReadGroups.RGPU = RGPU - addOrReplaceReadGroups.RGSM = RGSM - if (RGCN != null) addOrReplaceReadGroups.RGCN = RGCN - if (RGDS != null) addOrReplaceReadGroups.RGDS = RGDS + addOrReplaceReadGroups.RGID = readgroupId + addOrReplaceReadGroups.RGLB = libId + addOrReplaceReadGroups.RGPL = platform + addOrReplaceReadGroups.RGPU = platformUnit + addOrReplaceReadGroups.RGSM = sampleId + if (readgroupSequencingCenter.isDefined) addOrReplaceReadGroups.RGCN = readgroupSequencingCenter.get + if (readgroupDescription.isDefined) addOrReplaceReadGroups.RGDS = readgroupDescription.get if 
(!skipMarkduplicates) addOrReplaceReadGroups.isIntermediate = true add(addOrReplaceReadGroups) @@ -292,40 +349,18 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { } def getReadGroup(): String = { - var RG: String = "@RG\\t" + "ID:" + RGID + "\\t" - RG += "LB:" + RGLB + "\\t" - RG += "PL:" + RGPL + "\\t" - RG += "PU:" + RGPU + "\\t" - RG += "SM:" + RGSM + "\\t" - if (RGCN != null) RG += "CN:" + RGCN + "\\t" - if (RGDS != null) RG += "DS" + RGDS + "\\t" - if (RGDT != null) RG += "DT" + RGDT + "\\t" - if (RGPI > 0) RG += "PI" + RGPI + "\\t" + var RG: String = "@RG\\t" + "ID:" + readgroupId + "\\t" + RG += "LB:" + libId + "\\t" + RG += "PL:" + platform + "\\t" + RG += "PU:" + platformUnit + "\\t" + RG += "SM:" + sampleId + "\\t" + if (readgroupSequencingCenter.isDefined) RG += "CN:" + readgroupSequencingCenter.get + "\\t" + if (readgroupDescription.isDefined) RG += "DS" + readgroupDescription.get + "\\t" + if (readgroupDate != null) RG += "DT" + readgroupDate + "\\t" + if (predictedInsertsize.isDefined) RG += "PI" + predictedInsertsize.get + "\\t" return RG.substring(0, RG.lastIndexOf("\\t")) } } -object Mapping extends PipelineCommand { - def loadFromLibraryConfig(root: Configurable, runConfig: Map[String, Any], sampleConfig: Map[String, Any], - runDir: String, startJobs: Boolean = true): Mapping = { - val mapping = new Mapping(root) - - logger.debug("Mapping runconfig: " + runConfig) - if (runConfig.contains("R1")) mapping.input_R1 = new File(runConfig("R1").toString) - if (runConfig.contains("R2")) mapping.input_R2 = new File(runConfig("R2").toString) - mapping.paired = (mapping.input_R2 != null) - mapping.RGLB = runConfig("ID").toString - mapping.RGSM = sampleConfig("ID").toString - if (runConfig.contains("PL")) mapping.RGPL = runConfig("PL").toString - if (runConfig.contains("PU")) mapping.RGPU = runConfig("PU").toString - if (runConfig.contains("CN")) mapping.RGCN = runConfig("CN").toString - mapping.outputDir = runDir - - if 
(startJobs) { - mapping.init - mapping.biopetScript - } - return mapping - } -} +object Mapping extends PipelineCommand \ No newline at end of file diff --git a/public/pom.xml b/public/pom.xml index 34cb3b86f1de86d6501d065c2674465e6fc1f934..2820c83511f6d05d1989ef8b0fec97a0214d0d9b 100644 --- a/public/pom.xml +++ b/public/pom.xml @@ -32,8 +32,10 @@ <module>gentrap</module> <module>mapping</module> <module>sage</module> + <module>kopisu</module> <module>yamsvp</module> - <module>toucan</module> + <module>carp</module> + <module>toucan</module> </modules> <properties> @@ -183,4 +185,4 @@ </plugin> </plugins> </build> -</project> \ No newline at end of file +</project> diff --git a/public/sage/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala b/public/sage/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala index c3f88b61f866ea6b62bada4d789b5b43037a8428..ed55ca382d13346b3cbb7bb4862941ef4da591f5 100644 --- a/public/sage/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala +++ b/public/sage/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala @@ -15,7 +15,7 @@ */ package nl.lumc.sasc.biopet.pipelines.sage -import nl.lumc.sasc.biopet.core.{ MultiSampleQScript, PipelineCommand } +import nl.lumc.sasc.biopet.core.{ BiopetQScript, MultiSampleQScript, PipelineCommand } import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.extensions.Cat import nl.lumc.sasc.biopet.extensions.bedtools.BedtoolsCoverage @@ -28,148 +28,128 @@ import nl.lumc.sasc.biopet.scripts.SquishBed import nl.lumc.sasc.biopet.tools.SageCountFastq import nl.lumc.sasc.biopet.tools.SageCreateLibrary import nl.lumc.sasc.biopet.tools.SageCreateTagCounts +import nl.lumc.sasc.biopet.utils.ConfigUtils import org.broadinstitute.gatk.queue.QScript class Sage(val root: Configurable) extends QScript with MultiSampleQScript { + qscript => def this() = this(null) - @Input(doc = "countBed", required = false) - var countBed: File = config("count_bed") + var countBed: 
Option[File] = config("count_bed") + var squishedCountBed: File = _ + var transcriptome: Option[File] = config("transcriptome") + var tagsLibrary: Option[File] = config("tags_library") - @Input(doc = "squishedCountBed, by suppling this file the auto squish job will be skipped", required = false) - var squishedCountBed: File = config("squished_count_bed") - - @Input(doc = "Transcriptome, used for generation of tag library", required = false) - var transcriptome: File = config("transcriptome") - - var tagsLibrary: File = config("tags_library") - - defaults ++= Map("bowtie" -> Map( + override def defaults = ConfigUtils.mergeMaps(Map("bowtie" -> Map( "m" -> 1, "k" -> 1, "best" -> true, "strata" -> true, "seedmms" -> 1 + ), "mapping" -> Map( + "aligner" -> "bowtie", + "skip_flexiprep" -> true, + "skip_markduplicates" -> true + ), "flexiprep" -> Map( + "skip_clip" -> true, + "skip_trim" -> true ) - ) - - class LibraryOutput extends AbstractLibraryOutput { - var mappedBamFile: File = _ - var prefixFastq: File = _ - } - - class SampleOutput extends AbstractSampleOutput { + ), super.defaults) + + def makeSample(id: String) = new Sample(id) + class Sample(sampleId: String) extends AbstractSample(sampleId) { + def makeLibrary(id: String) = new Library(id) + class Library(libId: String) extends AbstractLibrary(libId) { + val inputFastq: File = config("R1", required = true) + val prefixFastq: File = createFile(".prefix.fastq") + + val flexiprep = new Flexiprep(qscript) + flexiprep.sampleId = sampleId + flexiprep.libId = libId + + val mapping = new Mapping(qscript) + mapping.libId = libId + mapping.sampleId = sampleId + + protected def addJobs(): Unit = { + flexiprep.outputDir = libDir + "flexiprep/" + flexiprep.input_R1 = inputFastq + flexiprep.init + flexiprep.biopetScript + qscript.addAll(flexiprep.functions) + + val flexiprepOutput = for ((key, file) <- flexiprep.outputFiles if key.endsWith("output_R1")) yield file + val pf = new PrefixFastq(qscript) + pf.inputFastq = 
flexiprepOutput.head + pf.outputFastq = prefixFastq + pf.prefixSeq = config("sage_tag", default = "CATG") + pf.deps +:= flexiprep.outputFiles("fastq_input_R1") + qscript.add(pf) + + mapping.input_R1 = pf.outputFastq + mapping.outputDir = libDir + mapping.init + mapping.biopetScript + qscript.addAll(mapping.functions) + + if (config("library_counts", default = false).asBoolean) { + addBedtoolsCounts(mapping.finalBamFile, sampleId + "-" + libId, libDir) + addTablibCounts(pf.outputFastq, sampleId + "-" + libId, libDir) + } + } + } + protected def addJobs(): Unit = { + addPerLibJobs() + val libraryBamfiles = libraries.map(_._2.mapping.finalBamFile).toList + val libraryFastqFiles = libraries.map(_._2.prefixFastq).toList + + val bamFile: File = if (libraryBamfiles.size == 1) libraryBamfiles.head + else if (libraryBamfiles.size > 1) { + val mergeSamFiles = MergeSamFiles(qscript, libraryBamfiles, sampleDir) + qscript.add(mergeSamFiles) + mergeSamFiles.output + } else null + val fastqFile: File = if (libraryFastqFiles.size == 1) libraryFastqFiles.head + else if (libraryFastqFiles.size > 1) { + val cat = Cat(qscript, libraryFastqFiles, sampleDir + sampleId + ".fastq") + qscript.add(cat) + cat.output + } else null + + addBedtoolsCounts(bamFile, sampleId, sampleDir) + addTablibCounts(fastqFile, sampleId, sampleDir) + } } def init() { if (!outputDir.endsWith("/")) outputDir += "/" - if (transcriptome == null && tagsLibrary == null) + if (transcriptome.isEmpty && tagsLibrary.isEmpty) throw new IllegalStateException("No transcriptome or taglib found") - if (countBed == null && squishedCountBed == null) - throw new IllegalStateException("No bedfile supplied, please add a countBed or squishedCountBed") + if (countBed.isEmpty) + throw new IllegalStateException("No bedfile supplied, please add a countBed") } def biopetScript() { - if (squishedCountBed == null) { - val squishBed = SquishBed(this, countBed, outputDir) - add(squishBed) - squishedCountBed = squishBed.output - } + val 
squishBed = SquishBed(this, countBed.get, outputDir) + add(squishBed) + squishedCountBed = squishBed.output - if (tagsLibrary == null) { + if (tagsLibrary.isEmpty) { val cdl = new SageCreateLibrary(this) - cdl.input = transcriptome + cdl.input = transcriptome.get cdl.output = outputDir + "taglib/tag.lib" cdl.noAntiTagsOutput = outputDir + "taglib/no_antisense_genes.txt" cdl.noTagsOutput = outputDir + "taglib/no_sense_genes.txt" cdl.allGenesOutput = outputDir + "taglib/all_genes.txt" add(cdl) - tagsLibrary = cdl.output - } - - runSamplesJobs - } - - // Called for each sample - def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = { - val sampleOutput = new SampleOutput - var libraryBamfiles: List[File] = List() - var libraryFastqFiles: List[File] = List() - val sampleID: String = sampleConfig("ID").toString - val sampleDir: String = globalSampleDir + sampleID + "/" - for ((library, libraryFiles) <- runLibraryJobs(sampleConfig)) { - libraryFastqFiles +:= libraryFiles.prefixFastq - libraryBamfiles +:= libraryFiles.mappedBamFile + tagsLibrary = Some(cdl.output) } - val bamFile: File = if (libraryBamfiles.size == 1) libraryBamfiles.head - else if (libraryBamfiles.size > 1) { - val mergeSamFiles = MergeSamFiles(this, libraryBamfiles, sampleDir) - add(mergeSamFiles) - mergeSamFiles.output - } else null - val fastqFile: File = if (libraryFastqFiles.size == 1) libraryFastqFiles.head - else if (libraryFastqFiles.size > 1) { - val cat = Cat.apply(this, libraryFastqFiles, sampleDir + sampleID + ".fastq") - add(cat) - cat.output - } else null - - addBedtoolsCounts(bamFile, sampleID, sampleDir) - addTablibCounts(fastqFile, sampleID, sampleDir) - - return sampleOutput + addSamplesJobs() } - // Called for each run from a sample - def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput = { - val libraryOutput = new LibraryOutput - val runID: String = runConfig("ID").toString - val sampleID: String = 
sampleConfig("ID").toString - val runDir: String = globalSampleDir + sampleID + "/run_" + runID + "/" - if (runConfig.contains("R1")) { - val flexiprep = new Flexiprep(this) - flexiprep.outputDir = runDir + "flexiprep/" - flexiprep.input_R1 = new File(runConfig("R1").toString) - flexiprep.skipClip = true - flexiprep.skipTrim = true - flexiprep.sampleName = sampleID - flexiprep.libraryName = runID - flexiprep.init - flexiprep.biopetScript - addAll(flexiprep.functions) - - val flexiprepOutput = for ((key, file) <- flexiprep.outputFiles if key.endsWith("output_R1")) yield file - val prefixFastq = PrefixFastq(this, flexiprepOutput.head, runDir) - prefixFastq.prefixSeq = config("sage_tag", default = "CATG") - prefixFastq.deps +:= flexiprep.outputFiles("fastq_input_R1") - add(prefixFastq) - libraryOutput.prefixFastq = prefixFastq.outputFastq - - val mapping = new Mapping(this) - mapping.skipFlexiprep = true - mapping.skipMarkduplicates = true - mapping.aligner = config("aligner", default = "bowtie") - mapping.input_R1 = prefixFastq.outputFastq - mapping.RGLB = runConfig("ID").toString - mapping.RGSM = sampleConfig("ID").toString - if (runConfig.contains("PL")) mapping.RGPL = runConfig("PL").toString - if (runConfig.contains("PU")) mapping.RGPU = runConfig("PU").toString - if (runConfig.contains("CN")) mapping.RGCN = runConfig("CN").toString - mapping.outputDir = runDir - mapping.init - mapping.biopetScript - addAll(mapping.functions) - - if (config("library_counts", default = false).asBoolean) { - addBedtoolsCounts(mapping.outputFiles("finalBamFile"), sampleID + "-" + runID, runDir) - addTablibCounts(prefixFastq.outputFastq, sampleID + "-" + runID, runDir) - } - - libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile") - } else this.logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig) - return libraryOutput + def addMultiSampleJobs(): Unit = { } def addBedtoolsCounts(bamFile: File, outputPrefix: String, outputDir: String) { @@ -202,7 
+182,7 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript { val createTagCounts = new SageCreateTagCounts(this) createTagCounts.input = countFastq.output - createTagCounts.tagLib = tagsLibrary + createTagCounts.tagLib = tagsLibrary.get createTagCounts.countSense = outputDir + outputPrefix + ".tagcount.sense.counts" createTagCounts.countAllSense = outputDir + outputPrefix + ".tagcount.all.sense.counts" createTagCounts.countAntiSense = outputDir + outputPrefix + ".tagcount.antisense.counts" @@ -211,4 +191,4 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript { } } -object Sage extends PipelineCommand +object Sage extends PipelineCommand \ No newline at end of file diff --git a/public/yamsvp/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala b/public/yamsvp/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala index 29b299c30e2071df0a631d3cb0c1ec73149bb043..206c92a69453105fb2034d6cd72cce00f5ab75cd 100644 --- a/public/yamsvp/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala +++ b/public/yamsvp/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala @@ -20,8 +20,7 @@ package nl.lumc.sasc.biopet.pipelines.yamsvp import nl.lumc.sasc.biopet.core.config.Configurable -import nl.lumc.sasc.biopet.core.MultiSampleQScript -import nl.lumc.sasc.biopet.core.PipelineCommand +import nl.lumc.sasc.biopet.core.{ BiopetQScript, MultiSampleQScript, PipelineCommand } import nl.lumc.sasc.biopet.extensions.sambamba.{ SambambaIndex, SambambaMerge } import nl.lumc.sasc.biopet.extensions.svcallers.pindel.Pindel @@ -33,12 +32,12 @@ import org.broadinstitute.gatk.queue.QScript import org.broadinstitute.gatk.queue.function._ import org.broadinstitute.gatk.queue.engine.JobRunInfo -class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { +class Yamsvp(val root: Configurable) extends QScript with BiopetQScript { //with MultiSampleQScript { def this() = this(null) var reference: 
File = config("reference", required = true) var finalBamFiles: List[File] = Nil - + /* class LibraryOutput extends AbstractLibraryOutput { var mappedBamFile: File = _ } @@ -47,7 +46,7 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { var vcf: Map[String, List[File]] = Map() var mappedBamFile: File = _ } - +*/ override def init() { if (outputDir == null) throw new IllegalStateException("Output directory is not specified in the config / argument") @@ -61,7 +60,7 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { // read config and set all parameters for the pipeline logger.info("Starting YAM SV Pipeline") - runSamplesJobs + //runSamplesJobs // } @@ -69,19 +68,18 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { override def onExecutionDone(jobs: Map[QFunction, JobRunInfo], success: Boolean) { logger.info("YAM SV Pipeline has run .......................") } - - def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = { + /* + def runSingleSampleJobs(sampleID: String): SampleOutput = { val sampleOutput = new SampleOutput var libraryBamfiles: List[File] = List() var outputFiles: Map[String, List[File]] = Map() var libraryFastqFiles: List[File] = List() - val sampleID: String = sampleConfig("ID").toString val sampleDir: String = outputDir + sampleID + "/" val alignmentDir: String = sampleDir + "alignment/" val svcallingDir: String = sampleDir + "svcalls/" - sampleOutput.libraries = runLibraryJobs(sampleConfig) + sampleOutput.libraries = runLibraryJobs(sampleID) for ((libraryID, libraryOutput) <- sampleOutput.libraries) { // this is extending the libraryBamfiles list like '~=' in D or .append in Python or .push_back in C++ libraryBamfiles ++= List(libraryOutput.mappedBamFile) @@ -126,29 +124,27 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { // Called for each run from a sample - def runSingleLibraryJobs(runConfig: Map[String, 
Any], sampleConfig: Map[String, Any]): LibraryOutput = { + def runSingleLibraryJobs(libId: String, sampleID: String): LibraryOutput = { val libraryOutput = new LibraryOutput - val runID: String = runConfig("ID").toString - val sampleID: String = sampleConfig("ID").toString val alignmentDir: String = outputDir + sampleID + "/alignment/" - val runDir: String = alignmentDir + "run_" + runID + "/" + val runDir: String = alignmentDir + "run_" + libId + "/" - if (runConfig.contains("R1")) { + if (config.contains("R1")) { val mapping = new Mapping(this) mapping.aligner = config("aligner", default = "stampy") mapping.skipFlexiprep = false mapping.skipMarkduplicates = true // we do the dedup marking using Sambamba - if (runConfig.contains("R1")) mapping.input_R1 = new File(runConfig("R1").toString) - if (runConfig.contains("R2")) mapping.input_R2 = new File(runConfig("R2").toString) + mapping.input_R1 = config("R1") + mapping.input_R2 = config("R2") mapping.paired = (mapping.input_R2 != null) - mapping.RGLB = runConfig("ID").toString - mapping.RGSM = sampleConfig("ID").toString - if (runConfig.contains("PL")) mapping.RGPL = runConfig("PL").toString - if (runConfig.contains("PU")) mapping.RGPU = runConfig("PU").toString - if (runConfig.contains("CN")) mapping.RGCN = runConfig("CN").toString + mapping.RGLB = libId + mapping.RGSM = sampleID + mapping.RGPL = config("PL") + mapping.RGPU = config("PU") + mapping.RGCN = config("CN") mapping.outputDir = runDir mapping.init @@ -158,11 +154,12 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { // start sambamba dedup libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile") - } else this.logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig) + } else this.logger.error("Sample: " + sampleID + ": No R1 found for library: " + libId) return libraryOutput // logger.debug(outputFiles) // return outputFiles } + */ } object Yamsvp extends PipelineCommand \ No newline at end of file