diff --git a/protected/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala b/protected/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala index 934224a237b1422e3c2af08fd70d118270c7c754..5088e44c2eca6f54b23104c7695c3a03d5f762c8 100644 --- a/protected/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala +++ b/protected/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala @@ -124,7 +124,7 @@ class Basty(val root: Configurable) extends QScript with MultiSampleQScript { val gubbins = new RunGubbins(this) gubbins.fastafile = concensusVariants - gubbins.startingTree = raxmlBi.getBipartitionsFile + gubbins.startingTree = Some(raxmlBi.getBipartitionsFile) gubbins.outputDirectory = outputDir + dirSufixGubbins add(gubbins) } diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala index 3912958a59d8d58a67b4cd615409e4a5c03ec990..c07a2a66c363fe9623ad35225e9f76eb45aa67a4 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala @@ -12,7 +12,7 @@ class BaseRecalibrator(val root: Configurable) extends org.broadinstitute.gatk.q memoryLimit = Option(4) override val defaultVmem = "8G" - if (config.contains("scattercount")) scatterCount = config("scattercount") + if (config.contains("scattercount")) scatterCount = config("scattercount", default = 1) if (config.contains("dbsnp")) knownSites :+= new File(config("dbsnp").asString) if (config.contains("known_sites")) knownSites :+= new File(config("known_sites").asString) } diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GatkGeneral.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GatkGeneral.scala index dc7cc5e4035b9cbffdeb6db732c06cabd4d52c74..147398ac798c077da0722b2078f7ea21c666b7d1 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GatkGeneral.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/GatkGeneral.scala @@ -19,7 +19,7 @@ trait GatkGeneral extends CommandLineGATK with BiopetJavaCommandLineFunction { if (config.contains("intervals")) intervals = config("intervals").asFileList if (config.contains("exclude_intervals")) excludeIntervals = config("exclude_intervals").asFileList - reference_sequence = config("reference") - gatk_key = config("gatk_key") + reference_sequence = config("reference", required = true) + if (config.contains("gatk_key")) gatk_key = config("gatk_key") if (config.contains("pedigree")) pedigree = config("pedigree").asFileList } diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala index 001fb9d17b858320d22549bae321ca0b820eac12..b8f4ea4efa3ea8cfed563c2b82651c0001b794f7 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala @@ -9,40 +9,40 @@ import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType class HaplotypeCaller(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.HaplotypeCaller with GatkGeneral { - override def afterGraph { - super.afterGraph - - min_mapping_quality_score = config("minMappingQualityScore", default = 20) - if (config.contains("scattercount")) scatterCount = config("scattercount") - if (config.contains("dbsnp")) this.dbsnp = config("dbsnp") - this.sample_ploidy = config("ploidy") - nct = config("threads", default = 1) - bamOutput = config("bamOutput") - memoryLimit = Option(nct.getOrElse(1) * 2) - if (config.contains("allSitePLs")) this.allSitePLs = config("allSitePLs") - if (config.contains("output_mode")) { - import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode._ - config("output_mode").asString match { - case "EMIT_ALL_CONFIDENT_SITES" => output_mode = EMIT_ALL_CONFIDENT_SITES - case "EMIT_ALL_SITES" => output_mode = EMIT_ALL_SITES - case "EMIT_VARIANTS_ONLY" => output_mode = EMIT_VARIANTS_ONLY - case e => logger.warn("output mode '" + e + "' does not exist") - } + min_mapping_quality_score = config("minMappingQualityScore", default = 20) + scatterCount = config("scattercount", default = 1) + if (config.contains("dbsnp")) this.dbsnp = config("dbsnp") + this.sample_ploidy = config("ploidy") + if (config.contains("bamOutput")) bamOutput = config("bamOutput") + if (config.contains("allSitePLs")) allSitePLs = config("allSitePLs") + if (config.contains("output_mode")) { + import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode._ + config("output_mode").asString match { + case "EMIT_ALL_CONFIDENT_SITES" => output_mode = EMIT_ALL_CONFIDENT_SITES + case "EMIT_ALL_SITES" => output_mode = EMIT_ALL_SITES + case "EMIT_VARIANTS_ONLY" => output_mode = EMIT_VARIANTS_ONLY + case e => logger.warn("output mode '" + e + "' does not exist") } + } - if (config("inputtype", default = "dna").asString == "rna") { - dontUseSoftClippedBases = config("dontusesoftclippedbases", default = true) - stand_call_conf = config("stand_call_conf", default = 5) - stand_emit_conf = config("stand_emit_conf", default = 0) - } else { - dontUseSoftClippedBases = config("dontusesoftclippedbases", default = false) - stand_call_conf = config("stand_call_conf", default = 5) - stand_emit_conf = config("stand_emit_conf", default = 0) - } + if (config("inputtype", default = "dna").asString == "rna") { + dontUseSoftClippedBases = config("dontusesoftclippedbases", default = true) + stand_call_conf = config("stand_call_conf", default = 5) + stand_emit_conf = config("stand_emit_conf", default = 0) + } else { + dontUseSoftClippedBases = config("dontusesoftclippedbases", default = false) + stand_call_conf = config("stand_call_conf", default = 5) + stand_emit_conf = config("stand_emit_conf", default = 0) + } + + override def afterGraph { + super.afterGraph if (bamOutput != null && nct.getOrElse(1) > 1) { - nct = Option(1) + threads = 1 logger.warn("BamOutput is on, nct/threads is forced to set on 1, this option is only for debug") } + nct = Some(threads) + memoryLimit = Option(memoryLimit.getOrElse(2.0) * nct.getOrElse(1)) } def useGvcf() { diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala index ee814bd31e4bb7a84f8d6d06f08a5b1beba5fa0e..c76139133f23cf434884ad69ccc9dafc69b2cac9 100644 --- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala +++ b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala @@ -36,7 +36,6 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri var singleSampleCalling = config("single_sample_calling", default = true) var reference: File = config("reference", required = true) - var dbsnp: File = config("dbsnp") var useAllelesOption: Boolean = config("use_alleles_option", default = false) val externalGvcfs = config("external_gvcfs_files", default = Nil).asFileList @@ -72,7 +71,7 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri samToFastq.isIntermediate = true qscript.add(samToFastq) mapping.input_R1 = samToFastq.fastqR1 - mapping.input_R2 = samToFastq.fastqR2 + mapping.input_R2 = Some(samToFastq.fastqR2) mapping.init mapping.biopetScript addAll(mapping.functions) // Add functions of mapping to curent function pool diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala index 759181117116f0df027a2c98243cb5123777fa16..8bac4aaf68c33a245da877d460bc26abb9ebe564 100644 --- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala +++ b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala @@ -7,7 +7,7 @@ package nl.lumc.sasc.biopet.pipelines.gatk import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand } import java.io.File -import nl.lumc.sasc.biopet.tools.{ MpileupToVcf, VcfFilter, MergeAlleles } +import nl.lumc.sasc.biopet.tools.{ VcfStats, MpileupToVcf, VcfFilter, MergeAlleles } import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.extensions.gatk.{ AnalyzeCovariates, BaseRecalibrator, GenotypeGVCFs, HaplotypeCaller, IndelRealigner, PrintReads, RealignerTargetCreator, SelectVariants, CombineVariants, UnifiedGenotyper } import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates @@ -32,9 +32,6 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr @Argument(doc = "Reference", shortName = "R", required = false) var reference: File = config("reference", required = true) - @Argument(doc = "Dbsnp", shortName = "dbsnp", required = false) - var dbsnp: File = config("dbsnp") - @Argument(doc = "OutputName", required = false) var outputName: String = _ @@ -53,7 +50,7 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr def init() { if (outputName == null && sampleID != null) outputName = sampleID - else if (outputName == null) outputName = "noname" + else if (outputName == null) outputName = config("output_name", default = "noname") if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module") else if (!outputDir.endsWith("/")) outputDir += "/" @@ -200,6 +197,12 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr val cvFinal = CombineVariants(this, mergeList.toList, outputDir + outputName + ".final.vcf.gz") cvFinal.genotypemergeoption = org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils.GenotypeMergeType.UNSORTED add(cvFinal) + + val vcfStats = new VcfStats(this) + vcfStats.input = cvFinal.out + vcfStats.setOutputDir(outputDir + File.separator + "vcfstats") + add(vcfStats) + scriptOutput.finalVcfFile = cvFinal.out } } diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVcfSampleCompare.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVcfSampleCompare.scala deleted file mode 100644 index c3f5add4b52092e607e1c4975745e7f58793dd08..0000000000000000000000000000000000000000 --- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVcfSampleCompare.scala +++ /dev/null @@ -1,87 +0,0 @@ -/** - * Due to the license issue with GATK, this part of Biopet can only be used inside the - * LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions - * on how to use this protected part of biopet or contact us at sasc@lumc.nl - */ -package nl.lumc.sasc.biopet.pipelines.gatk - -import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand } -import java.io.File -import nl.lumc.sasc.biopet.core.config.Configurable -import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants -import nl.lumc.sasc.biopet.extensions.gatk.SelectVariants -import nl.lumc.sasc.biopet.extensions.gatk.VariantEval -import org.broadinstitute.gatk.queue.QScript -import org.broadinstitute.gatk.utils.commandline.{ Input, Argument } - -class GatkVcfSampleCompare(val root: Configurable) extends QScript with BiopetQScript { - def this() = this(null) - - @Input(doc = "Sample vcf file(s)", shortName = "V") - var vcfFiles: List[File] = _ - - @Argument(doc = "Reference", shortName = "R", required = false) - var reference: File = config("reference") - - @Argument(doc = "Target bed", shortName = "targetBed", required = false) - var targetBed: List[File] = Nil - - @Argument(doc = "Samples", shortName = "sample", required = false) - var samples: List[String] = Nil - - var vcfFile: File = _ - var sampleVcfs: Map[String, File] = Map() - def generalSampleDir = outputDir + "samples/" - - def init() { - if (config.contains("target_bed")) - for (bed <- config("target_bed").asList) - targetBed :+= bed.toString - if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module") - else if (!outputDir.endsWith("/")) outputDir += "/" - } - - def biopetScript() { - vcfFile = if (vcfFiles.size > 1) { - val combineVariants = CombineVariants(this, vcfFiles, outputDir + "merge.vcf") - add(combineVariants) - combineVariants.out - } else vcfFiles.head - - for (sample <- samples) { - sampleVcfs += (sample -> new File(generalSampleDir + sample + File.separator + sample + ".vcf")) - val selectVariants = SelectVariants(this, vcfFile, sampleVcfs(sample)) - selectVariants.sample_name = Seq(sample) - selectVariants.excludeNonVariants = true - add(selectVariants) - } - - val sampleCompareMetrics = new SampleCompareMetrics(this) - sampleCompareMetrics.samples = samples - sampleCompareMetrics.sampleDir = generalSampleDir - sampleCompareMetrics.snpRelFile = outputDir + "compare.snp.rel.tsv" - sampleCompareMetrics.snpAbsFile = outputDir + "compare.snp.abs.tsv" - sampleCompareMetrics.indelRelFile = outputDir + "compare.indel.rel.tsv" - sampleCompareMetrics.indelAbsFile = outputDir + "compare.indel.abs.tsv" - sampleCompareMetrics.totalFile = outputDir + "total.tsv" - - for ((sample, sampleVcf) <- sampleVcfs) { - val sampleDir = generalSampleDir + sample + File.separator - for ((compareSample, compareSampleVcf) <- sampleVcfs) { - val variantEval = VariantEval(this, - sampleVcf, - compareSampleVcf, - new File(sampleDir + sample + "-" + compareSample + ".eval.txt"), - Seq("VariantType", "CompRod"), - Seq("CompOverlap") - ) - if (targetBed != null) variantEval.L = targetBed - add(variantEval) - sampleCompareMetrics.deps ::= variantEval.out - } - } - add(sampleCompareMetrics) - } -} - -object GatkVcfSampleCompare extends PipelineCommand diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/SampleCompareMetrics.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/SampleCompareMetrics.scala deleted file mode 100644 index 861455fe4d886219813409023d022ea096f413c9..0000000000000000000000000000000000000000 --- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/SampleCompareMetrics.scala +++ /dev/null @@ -1,153 +0,0 @@ -/** - * Due to the license issue with GATK, this part of Biopet can only be used inside the - * LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions - * on how to use this protected part of biopet or contact us at sasc@lumc.nl - */ -package nl.lumc.sasc.biopet.pipelines.gatk - -import java.io.File -import java.io.PrintWriter -import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction -import nl.lumc.sasc.biopet.core.config.Configurable -import org.broadinstitute.gatk.utils.R.RScriptExecutor -import org.broadinstitute.gatk.utils.commandline.{ Output, Argument } -import scala.io.Source -import org.broadinstitute.gatk.utils.R.{ RScriptLibrary, RScriptExecutor } -import org.broadinstitute.gatk.utils.io.Resource -import scala.collection.mutable.Map -import scala.math._ - -class SampleCompareMetrics(val root: Configurable) extends BiopetJavaCommandLineFunction { - javaMainClass = getClass.getName - - @Argument(doc = "Sample Dir", shortName = "sampleDir", required = true) - var sampleDir: String = _ - - @Argument(doc = "Samples", shortName = "sample", required = true) - var samples: List[String] = Nil - - @Argument(doc = "File sufix", shortName = "sufix", required = false) - var fileSufix: String = _ - - @Output(doc = "snpRelFile", shortName = "snpRelFile", required = true) - var snpRelFile: File = _ - - @Output(doc = "snpAbsFile", shortName = "snpAbsFile", required = true) - var snpAbsFile: File = _ - - @Output(doc = "indelRelFile", shortName = "indelRelFile", required = true) - var indelRelFile: File = _ - - @Output(doc = "indelAbsFile", shortName = "indelAbsFile", required = true) - var indelAbsFile: File = _ - - @Output(doc = "totalFile", shortName = "totalFile", required = true) - var totalFile: File = _ - - override val defaultVmem = "8G" - memoryLimit = Option(4.0) - - override def commandLine = super.commandLine + - required("-sampleDir", sampleDir) + - repeat("-sample", samples) + - optional("-fileSufix", fileSufix) + - required("-snpRelFile", snpRelFile) + - required("-snpAbsFile", snpAbsFile) + - required("-indelRelFile", indelRelFile) + - required("-indelAbsFile", indelAbsFile) + - required("-totalFile", totalFile) -} - -object SampleCompareMetrics { - var sampleDir: String = _ - var samples: List[String] = Nil - var fileSufix: String = ".eval.txt" - var snpRelFile: File = _ - var snpAbsFile: File = _ - var indelRelFile: File = _ - var indelAbsFile: File = _ - var totalFile: File = _ - /** - * @param args the command line arguments - */ - def main(args: Array[String]): Unit = { - - for (t <- 0 until args.size) { - args(t) match { - case "-sample" => samples +:= args(t + 1) - case "-sampleDir" => sampleDir = args(t + 1) - case "-fileSufix" => fileSufix = args(t + 1) - case "-snpRelFile" => snpRelFile = new File(args(t + 1)) - case "-snpAbsFile" => snpAbsFile = new File(args(t + 1)) - case "-indelRelFile" => indelRelFile = new File(args(t + 1)) - case "-indelAbsFile" => indelAbsFile = new File(args(t + 1)) - case "-totalFile" => totalFile = new File(args(t + 1)) - case _ => - } - } - if (sampleDir == null) throw new IllegalStateException("No sampleDir, use -sampleDir") - else if (!sampleDir.endsWith("/")) sampleDir += "/" - - val regex = """\W+""".r - val snpsOverlap: Map[(String, String), Int] = Map() - val indelsOverlap: Map[(String, String), Int] = Map() - val snpsTotal: Map[String, Int] = Map() - val indelsTotal: Map[String, Int] = Map() - for (sample1 <- samples; sample2 <- samples) { - val reader = Source.fromFile(new File(sampleDir + sample1 + "/" + sample1 + "-" + sample2 + fileSufix)) - for (line <- reader.getLines) { - regex.split(line) match { - case Array(_, _, _, varType, all, novel, overlap, rate, _*) => { - varType match { - case "SNP" => { - snpsOverlap += (sample1, sample2) -> overlap.toInt - snpsTotal += sample1 -> all.toInt - } - case "INDEL" => { - indelsOverlap += (sample1, sample2) -> overlap.toInt - indelsTotal += sample1 -> all.toInt - } - case _ => - } - } - case _ => - } - } - reader.close() - } - - val snpRelWritter = new PrintWriter(snpRelFile) - val snpAbsWritter = new PrintWriter(snpAbsFile) - val indelRelWritter = new PrintWriter(indelRelFile) - val indelAbsWritter = new PrintWriter(indelAbsFile) - - val allWritters = List(snpRelWritter, snpAbsWritter, indelRelWritter, indelAbsWritter) - for (writter <- allWritters) writter.println(samples.mkString("\t", "\t", "")) - for (sample1 <- samples) { - for (writter <- allWritters) writter.print(sample1) - for (sample2 <- samples) { - snpRelWritter.print("\t" + (round((snpsOverlap(sample1, sample2).toDouble / snpsTotal(sample1) * 10000.0)) / 10000.0)) - snpAbsWritter.print("\t" + snpsOverlap(sample1, sample2)) - indelRelWritter.print("\t" + (round((indelsOverlap(sample1, sample2).toDouble / indelsTotal(sample1) * 10000.0)) / 10000.0)) - indelAbsWritter.print("\t" + indelsOverlap(sample1, sample2)) - } - for (writter <- allWritters) writter.println() - } - for (writter <- allWritters) writter.close() - - val totalWritter = new PrintWriter(totalFile) - totalWritter.println("Sample\tSNPs\tIndels") - for (sample <- samples) - totalWritter.println(sample + "\t" + snpsTotal(sample) + "\t" + indelsTotal(sample)) - totalWritter.close() - - def plot(file: File) { - val executor = new RScriptExecutor - executor.addScript(new Resource("plotHeatmap.R", getClass)) - executor.addArgs(file, file.getAbsolutePath.stripSuffix(".tsv") + ".png", file.getAbsolutePath.stripSuffix(".tsv") + ".clustering.png") - executor.exec() - } - plot(snpRelFile) - plot(indelRelFile) - } -} \ No newline at end of file diff --git a/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala b/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala index 9457fd36cfab986c5fdc9d5cbc29836ced4e0962..902e292fca017947affb287249ccff764e943100 100644 --- a/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala +++ b/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala @@ -12,7 +12,6 @@ object BiopetExecutableProtected extends BiopetExecutable { nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantcalling, nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline, nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantRecalibration, - nl.lumc.sasc.biopet.pipelines.gatk.GatkVcfSampleCompare, nl.lumc.sasc.biopet.pipelines.basty.Basty) def tools = BiopetExecutablePublic.tools diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/pipelines/gatk/plotHeatmap.R b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/pipelines/gatk/plotHeatmap.R deleted file mode 100644 index 4158db708d58c8cc19b535dcfe871c626fa51ad6..0000000000000000000000000000000000000000 --- a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/pipelines/gatk/plotHeatmap.R +++ /dev/null @@ -1,24 +0,0 @@ -library('gplots') - -args <- commandArgs(TRUE) -inputArg <- args[1] -outputArg <- args[2] -outputArgClustering <- args[3] - -col <- heat.colors(250) -col[250] <- "#00FF00" - -heat<-read.table(inputArg, header = 1, sep= '\t') -rownames(heat) <- heat[,1] -heat<- heat[,-1] - -heat<- as.matrix(heat) - -png(file = outputArg, width = 1000, height = 1000) -heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none") -dev.off() - - -png(file = outputArgClustering, width = 1000, height = 1000) -heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row") -dev.off() diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R new file mode 100644 index 0000000000000000000000000000000000000000..7f7237e90f6593e3d6cf110da005cd89c154d466 --- /dev/null +++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R @@ -0,0 +1,35 @@ +library('gplots') +library('RColorBrewer') + +args <- commandArgs(TRUE) +inputArg <- args[1] +outputArg <- args[2] +outputArgClustering <- args[3] +outputArgDendrogram <- args[4] + + +heat<-read.table(inputArg, header = 1, sep= '\t', stringsAsFactors = F) +#heat[heat==1] <- NA +rownames(heat) <- heat[,1] +heat<- heat[,-1] +heat<- as.matrix(heat) + +colNumber <- 50 +col <- rev(colorRampPalette(brewer.pal(11, "Spectral"))(colNumber)) +for (i in (colNumber+1):(colNumber+round((dist(range(heat)) - dist(range(heat[heat < 1]))) / dist(range(heat[heat < 1])) * colNumber))) { + col[i] <- col[colNumber] +} +col[length(col)] <- "#00FF00" + +png(file = outputArg, width = 1200, height = 1200) +heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none", margins = c(12, 12), na.color="#00FF00") +dev.off() + +hc <- hclust(d = dist(heat)) +png(file = outputArgDendrogram, width = 1200, height = 1200) +plot(as.dendrogram(hc), horiz=TRUE, asp=0.02) +dev.off() + +png(file = outputArgClustering, width = 1200, height = 1200) +heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row",margins = c(12, 12), na.color="#00FF00") +dev.off() diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotXY.R b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotXY.R new file mode 100644 index 0000000000000000000000000000000000000000..63fd7b03262d94094a9f151b22cca812f10cee1f --- /dev/null +++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotXY.R @@ -0,0 +1,20 @@ +library('ggplot2') +library('reshape2') + +args <- commandArgs(TRUE) +inputArg <- args[1] +outputArg <- args[2] + +tsv<-read.table(inputArg, header = 1, sep= '\t', stringsAsFactors = F) + +data <- melt(tsv) + +data$X <- as.numeric(data$X) +data <- na.omit(data) +data <- data[data$value > 0,] + +print("Starting to plot") +png(file = outputArg, width = 1500, height = 1500) +ggplot(data, aes(x=X, y=value, color=variable, group=variable)) + geom_line() +dev.off() +print("plot done") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala index 57c63593526f20ba6d94cd169d93af15b54a6328..4a5b25f7a5b816fe32ad39897a334a6e13fe1eda 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala @@ -15,13 +15,11 @@ */ package nl.lumc.sasc.biopet.core -//import java.io.BufferedInputStream import java.io.File import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.queue.QException import org.broadinstitute.gatk.queue.function.CommandLineFunction import org.broadinstitute.gatk.utils.commandline.{ Input, Argument } -//import scala.io.Source import scala.sys.process.{ Process, ProcessLogger } import scala.util.matching.Regex import java.io.FileInputStream @@ -38,7 +36,7 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab val defaultThreads = 1 @Argument(doc = "Vmem", required = false) - var vmem: String = _ + var vmem: Option[String] = None val defaultVmem: String = "" @Argument(doc = "Executable", required = false) @@ -53,17 +51,17 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab override def freezeFieldValues() { checkExecutable afterGraph - jobOutputFile = new File(firstOutput.getParent + "/." + firstOutput.getName + "." + configName + ".out") + if (jobOutputFile == null) jobOutputFile = new File(firstOutput.getParent + "/." + firstOutput.getName + "." + configName + ".out") if (threads == 0) threads = getThreads(defaultThreads) if (threads > 1) nCoresRequest = Option(threads) - if (vmem == null) { + if (vmem.isEmpty) { vmem = config("vmem") - if (vmem == null && !defaultVmem.isEmpty) vmem = defaultVmem + if (vmem.isEmpty && defaultVmem.nonEmpty) vmem = Some(defaultVmem) } - if (vmem != null) jobResourceRequests :+= "h_vmem=" + vmem - jobName = configName + ":" + firstOutput.getName + if (vmem.isDefined) jobResourceRequests :+= "h_vmem=" + vmem.get + jobName = configName + ":" + (if (firstOutput != null) firstOutput.getName else jobOutputFile) super.freezeFieldValues() } diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala index 7f670a44468ee63230f6df282b8027057ae9a08a..e192a845be6db96d2a754d2d2ff6a5e393d136ca 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala @@ -43,7 +43,7 @@ class Bowtie(val root: Configurable) extends BiopetCommandLineFunction { override val defaultThreads = 8 var sam: Boolean = config("sam", default = true) - var sam_RG: String = config("sam-RG") + var sam_RG: Option[String] = config("sam-RG") var seedlen: Option[Int] = config("seedlen") var seedmms: Option[Int] = config("seedmms") var k: Option[Int] = config("k") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala index ba145e2aa8fad1c942c0104a494cf5b43ec1b45c..e7a5a319ead2bc797713479de80ff949373bc8ff 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala @@ -43,8 +43,8 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction { if (config.contains("front")) for (adapter <- config("front").asList) opt_front += adapter.toString var opt_discard: Boolean = config("discard", default = false) - var opt_minimum_length: String = config("minimum_length", 1) - var opt_maximum_length: String = config("maximum_length") + var opt_minimum_length: Option[Int] = config("minimum_length", 1) + var opt_maximum_length: Option[Int] = config("maximum_length") def cmdLine = required(executable) + // options diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala index f735823718bd87ca5765c925d167143c08494c72..4f6236179755659b3b42d74cef6c722713fbf4c9 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala @@ -60,11 +60,11 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction { private var out: List[File] = Nil var executableNonThreads: String = config("exe", default = "raxmlHPC") - var executableThreads: String = config("exe_pthreads") + var executableThreads: Option[String] = config("exe_pthreads") override def afterGraph { if (threads == 0) threads = getThreads(defaultThreads) - executable = if (threads > 1 && executableThreads != null) executableThreads else executableNonThreads + executable = if (threads > 1 && executableThreads.isDefined) executableThreads.get else executableNonThreads super.afterGraph out +:= getInfoFile f match { diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala index 6c8892c9bc66f3ce3cc6997032cd11700d7535d0..e732c1023de5f713f4c21ed366da68eddc825285 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala @@ -24,7 +24,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output } class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction { @Input(doc = "Contaminants", required = false) - var startingTree: File = config("starting_tree") + var startingTree: Option[File] = config("starting_tree") @Input(doc = "Fasta file", shortName = "FQ") var fastafile: File = _ @@ -36,21 +36,21 @@ class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction { var outputDirectory: String = _ executable = config("exe", default = "run_gubbins.py") - var outgroup: String = config("outgroup") - var filterPercentage: String = config("filter_percentage") - var treeBuilder: String = config("tree_builder") + var outgroup: Option[String] = config("outgroup") + var filterPercentage: Option[String] = config("filter_percentage") + var treeBuilder: Option[String] = config("tree_builder") var iterations: Option[Int] = config("iterations") var minSnps: Option[Int] = config("min_snps") - var convergeMethod: String = config("converge_method") + var convergeMethod: Option[String] = config("converge_method") var useTimeStamp: Boolean = config("use_time_stamp", default = false) - var prefix: String = config("prefix") + var prefix: Option[String] = config("prefix") var verbose: Boolean = config("verbose", default = false) var noCleanup: Boolean = config("no_cleanup", default = false) override def afterGraph: Unit = { super.afterGraph jobLocalDir = new File(outputDirectory) - if (prefix == null) prefix = fastafile.getName + if (prefix.isEmpty) prefix = Some(fastafile.getName) val out: List[String] = List(".recombination_predictions.embl", ".recombination_predictions.gff", ".branch_base_reconstruction.embl", @@ -59,7 +59,7 @@ class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction { ".filtered_polymorphic_sites.fasta", ".filtered_polymorphic_sites.phylip", ".final_tree.tre") - for (t <- out) outputFiles ::= new File(outputDirectory + File.separator + prefix + t) + for (t <- out) outputFiles ::= new File(outputDirectory + File.separator + prefix.getOrElse("gubbins") + t) } def cmdLine = required("cd", outputDirectory) + " && " + required(executable) + diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Sickle.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Sickle.scala index 5c068b9eed4a9d9bed5438c47aeb88da93efa63e..5056c70abc0e122fac0b95862f3443faaa48b5c7 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Sickle.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Sickle.scala @@ -43,7 +43,7 @@ class Sickle(val root: Configurable) extends BiopetCommandLineFunction { var fastqc: Fastqc = _ executable = config("exe", default = "sickle", freeVar = false) - var qualityType: String = config("qualitytype") + var qualityType: Option[String] = config("qualitytype") var qualityThreshold: Option[Int] = config("qualityThreshold") var lengthThreshold: Option[Int] = config("lengthThreshold") var noFiveprime: Boolean = config("noFiveprime", default = false) @@ -54,7 +54,7 @@ class Sickle(val root: Configurable) extends BiopetCommandLineFunction { override def versionCommand = executable + " --version" override def afterGraph { - if (qualityType == null && defaultQualityType != null) qualityType = defaultQualityType + if (qualityType.isEmpty) qualityType = Some(defaultQualityType) } def cmdLine = { diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaMem.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaMem.scala index 09401898a9ee9ac8b6fb3df2be45c7b7c66ffa3b..fc790b183b5bae2922a1b5f89ec66fcf4f5b85b2 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaMem.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/bwa/BwaMem.scala @@ -34,7 +34,7 @@ class BwaMem(val root: Configurable) extends Bwa { @Output(doc = "Output file SAM", shortName = "output") var output: File = _ - var R: String = config("R") + var R: Option[String] = config("R") var k: Option[Int] = config("k") var r: Option[Float] = config("r") var S: Boolean = config("S", default = false) @@ -49,11 +49,11 @@ class BwaMem(val root: Configurable) extends Bwa { var e: Boolean = config("e", default = false) var A: Option[Int] = config("A") var B: Option[Int] = config("B") - var O: String = config("O") - var E: String = config("E") - var L: String = config("L") + var O: Option[String] = config("O") + var E: Option[String] = config("E") + var L: Option[String] = config("L") var U: Option[Int] = config("U") - var x: String = config("x") + var x: Option[String] = config("x") var p: Boolean = config("p", default = false) var v: Option[Int] = config("v") var T: Option[Int] = config("T") @@ -61,7 +61,7 @@ class BwaMem(val root: Configurable) extends Bwa { var a: Boolean = config("a", default = false) var C: Boolean = config("C", default = false) var Y: Boolean = config("Y", default = false) - var I: String = config("I") + var I: Option[String] = config("I") override val defaultVmem = "6G" override val defaultThreads = 8 diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVTools.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVTools.scala new file mode 100644 index 0000000000000000000000000000000000000000..d017864f6988828100f6bbda421fbf734a9a5878 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVTools.scala @@ -0,0 +1,14 @@ +/** + * Created by wyleung on 5-1-15. + */ + +package nl.lumc.sasc.biopet.extensions.igvtools + +import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction + +abstract class IGVTools extends BiopetCommandLineFunction { + executable = config("exe", default = "igvtools", submodule = "igvtools", freeVar = false) + override def versionCommand = executable + " version" + override val versionRegex = """IGV Version: ([\d\.]) .*""".r + override val versionExitcode = List(0) +} \ No newline at end of file diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVToolsCount.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVToolsCount.scala new file mode 100644 index 0000000000000000000000000000000000000000..8037616834ecd4de02e9949883b75d20b45c7347 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVToolsCount.scala @@ -0,0 +1,105 @@ + +package nl.lumc.sasc.biopet.extensions.igvtools + +import java.nio.file.InvalidPathException + +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument } +import java.io.{ FileNotFoundException, File } + +/** + * IGVTools `count` wrapper + * + * @constructor create a new IGVTools instance from a `.bam` file + * + */ + +class IGVToolsCount(val root: Configurable) extends IGVTools { + @Input(doc = "Bam File") + var input: File = _ + + @Input(doc = "<genome>.chrom.sizes File") + var genomeChromSizes: File = _ + + @Output + var tdf: Option[File] = _ + + @Output + var wig: Option[File] = _ + + var maxZoom: Option[Int] = config("maxZoom") + var windowSize: Option[Int] = config("windowSize") + var extFactor: Option[Int] = config("extFactor") + + var preExtFactor: Option[Int] = config("preExtFactor") + var postExtFactor: Option[Int] = config("postExtFactor") + + var windowFunctions: Option[String] = config("windowFunctions") + var strands: Option[String] = config("strands") + var bases: Boolean = config("bases", default = false) + + var query: Option[String] = config("query") + var minMapQuality: Option[Int] = config("minMapQuality") + var includeDuplicates: Boolean = config("includeDuplicates", default = false) + + var pairs: Boolean = config("pairs", default = false) + + override def afterGraph { + super.afterGraph + if (!input.exists()) throw new FileNotFoundException("Input bam is required for IGVToolsCount") + + if (!wig.isEmpty && !wig.get.getAbsolutePath.endsWith(".wig")) throw new IllegalArgumentException("Wiggle file should have a .wig file-extension") + if (!tdf.isEmpty && !tdf.get.getAbsolutePath.endsWith(".tdf")) throw new IllegalArgumentException("TDF file should have a .tdf file-extension") + } + + def cmdLine = { + required(executable) + + required("count") + + optional("--maxZoom", maxZoom) + + optional("--windowSize", windowSize) + + optional("--extFactor", extFactor) + + optional("--preExtFactor", preExtFactor) + + optional("--postExtFactor", postExtFactor) + + optional("--windowFunctions", windowFunctions) + + optional("--strands", strands) + + conditional(bases, "--bases") + + optional("--query", query) + + optional("--minMapQuality", minMapQuality) + + conditional(includeDuplicates, "--includeDuplicates") + + conditional(pairs, "--pairs") + + required(input) + + required(outputArg) + + required(genomeChromSizes) + } + + /** + * This part should never fail, these values are set within this wrapper + * + */ + private def outputArg: String = { + (tdf, wig) match { + case (None, None) => throw new IllegalArgumentException("Either TDF or WIG should be supplied"); + case (Some(a), None) => a.getAbsolutePath; + case (None, Some(b)) => b.getAbsolutePath; + case (Some(a), Some(b)) => a.getAbsolutePath + "," + b.getAbsolutePath; + } + } +} + +object IGVToolsCount { + /** + * Create an object by specifying the `input` (.bam), + * and the `genomename` (hg18,hg19,mm10) + * + * @param input Bamfile to count reads from + * @return a new IGVToolsCount instance + * @throws FileNotFoundException bam File is not found + * @throws IllegalArgumentException tdf or wig not supplied + */ + def apply(root: Configurable, input: File, genomeChromSizes: File): IGVToolsCount = { + val counting = new IGVToolsCount(root) + counting.input = input + counting.genomeChromSizes = genomeChromSizes + return counting + } +} \ No newline at end of file diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2CallPeak.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2CallPeak.scala index 0722e99a26f02221bfdf2e2064cc92b08d11950f..e6a9c48e925949bdd413a9aca79afbb7e28b91d7 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2CallPeak.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2CallPeak.scala @@ -30,12 +30,12 @@ class Macs2CallPeak(val root: Configurable) extends Macs2 { @Output(doc = "Output file gappedPeak") private var output_gapped: File = _ - var fileformat: String = config("fileformat") + var fileformat: Option[String] = config("fileformat") var gsize: Option[Float] = config("gsize") var keepdup: Boolean = config("keep-dup", default = false) var buffersize: Option[Int] = config("buffer-size") - var outputdir: String = config("outputDir") - var name: String = config("name") + var outputdir: String = _ + var name: Option[String] = config("name") var bdg: Boolean = config("bdg", default = false) var verbose: Boolean = config("verbose", default = false) var tsize: Option[Int] = config("tsize") @@ -56,12 +56,14 @@ class Macs2CallPeak(val root: Configurable) extends Macs2 { var callsummits: Boolean = config("callsummits", default = false) override def afterGraph: Unit = { - output_narrow = new File(outputdir + name + ".narrowPeak") - output_broad = new File(outputdir + name + ".broadPeak") - output_xls = new File(outputdir + name + ".xls") - output_bdg = new File(outputdir + name + ".bdg") - output_r = new File(outputdir + name + ".r") - output_gapped = new File(outputdir + name + ".gappedPeak") + if (name.isEmpty) throw new IllegalArgumentException("Name is not defined") + if (outputdir == null) throw new IllegalArgumentException("Outputdir is not defined") + output_narrow = new File(outputdir + name.get + ".narrowPeak") + output_broad = new File(outputdir + name.get + ".broadPeak") + output_xls = new File(outputdir + name.get + ".xls") + output_bdg = new File(outputdir + name.get + ".bdg") + output_r = new File(outputdir + name.get + ".r") + output_gapped = new File(outputdir + name.get + ".gappedPeak") } def cmdLine = { diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala index f88304a0aa3fe25ad601fd5ad9f75672e03a1965..6df04c12ccb0ad1ca1c4853dac940ec3a304d043 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala @@ -32,19 +32,19 @@ class MarkDuplicates(val root: Configurable) extends Picard { var outputMetrics: File = _ @Argument(doc = "PROGRAM_RECORD_ID", required = false) - var programRecordId: String = config("programrecordid") + var programRecordId: Option[String] = config("programrecordid") @Argument(doc = "PROGRAM_GROUP_VERSION", required = false) - var programGroupVersion: String = config("programgroupversion") + var programGroupVersion: Option[String] = config("programgroupversion") @Argument(doc = "PROGRAM_GROUP_COMMAND_LINE", required = false) - var programGroupCommandLine: String = config("programgroupcommandline") + var programGroupCommandLine: Option[String] = config("programgroupcommandline") @Argument(doc = "PROGRAM_GROUP_NAME", required = false) - var programGroupName: String = config("programgroupname") + var programGroupName: Option[String] = config("programgroupname") @Argument(doc = "COMMENT", required = false) - var comment: String = config("comment") + var comment: Option[String] = config("comment") @Argument(doc = "REMOVE_DUPLICATES", required = false) var removeDuplicates: Boolean = config("removeduplicates", default = false) @@ -62,7 +62,7 @@ class MarkDuplicates(val root: Configurable) extends Picard { var sortingCollectionSizeRatio: Option[Double] = config("sortingCollectionSizeRatio") @Argument(doc = "READ_NAME_REGEX", required = false) - var readNameRegex: String = config("readNameRegex") + var readNameRegex: Option[String] = config("readNameRegex") @Argument(doc = "OPTICAL_DUPLICATE_PIXEL_DISTANCE", required = false) var opticalDuplicatePixelDistance: Option[Int] = config("opticalDuplicatePixelDistance") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MergeSamFiles.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MergeSamFiles.scala index b1327bc3aa5d00d8608515897a5fd81ab2d3fbe3..4de143498d9812502a2141facfacc0bbf2b046d5 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MergeSamFiles.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MergeSamFiles.scala @@ -41,7 +41,7 @@ class MergeSamFiles(val root: Configurable) extends Picard { var useThreading: Boolean = config("use_threading", default = false) @Argument(doc = "COMMENT", required = false) - var comment: String = config("comment") + var comment: Option[String] = config("comment") override def commandLine = super.commandLine + repeat("INPUT=", input, spaceSeparated = false) + diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/Picard.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/Picard.scala index cc571ca7d6dad5be5c8afccd5c4839a4c167991a..e417a85fd65c0d096961a910d20a8c890de28e91 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/Picard.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/Picard.scala @@ -22,13 +22,13 @@ abstract class Picard extends BiopetJavaCommandLineFunction { override def subPath = "picard" :: super.subPath @Argument(doc = "VERBOSITY", required = false) - var verbosity: String = config("verbosity") + var verbosity: Option[String] = config("verbosity") @Argument(doc = "QUIET", required = false) var quiet: Boolean = config("quiet", default = false) @Argument(doc = "VALIDATION_STRINGENCY", required = false) - var stringency: String = config("validationstringency") + var stringency: Option[String] = config("validationstringency") @Argument(doc = "COMPRESSION_LEVEL", required = false) var compression: Option[Int] = config("compressionlevel") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SamToFastq.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SamToFastq.scala index cb61cc4a8088da7dcbf9b87e5395722dd51f5f01..db784ee2b11767654e8fb2eb2174f9c0d63a11fa 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SamToFastq.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SamToFastq.scala @@ -38,7 +38,7 @@ class SamToFastq(val root: Configurable) extends Picard { var outputPerRg: Boolean = config("outputPerRg", default = false) @Argument(doc = "Output dir", required = false) - var outputDir: String = config("outputDir") + var outputDir: String = _ @Argument(doc = "re reverse", required = false) var reReverse: Boolean = config("reReverse", default = false) @@ -53,7 +53,7 @@ class SamToFastq(val root: Configurable) extends Picard { var clippingAtribute: String = config("clippingAtribute") @Argument(doc = "clippingAction", required = false) - var clippingAction: String = config("clippingAction") + var clippingAction: Option[String] = config("clippingAction") @Argument(doc = "read1Trim", required = false) var read1Trim: Option[Int] = config("read1Trim") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsMpileup.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsMpileup.scala index f0e6457a491a09e10580ad100dcb1c0622974b4c..b8a3a2d0e6ec913611abe21b00c1dfe200842418 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsMpileup.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsMpileup.scala @@ -30,7 +30,7 @@ class SamtoolsMpileup(val root: Configurable) extends Samtools { var reference: File = config("reference") @Input(doc = "Interval bed") - var intervalBed: File = config("interval_bed") + var intervalBed: Option[File] = config("interval_bed") var disableBaq: Boolean = config("disable_baq") var minMapQuality: Option[Int] = config("min_map_quality") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/SeqtkSeq.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/SeqtkSeq.scala index be42104c0562707ea5afca657d6a9b9d04315e8a..9838040cc74a5da4cff3073ac0b2123b6de9e954 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/SeqtkSeq.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/SeqtkSeq.scala @@ -37,7 +37,7 @@ class SeqtkSeq(val root: Configurable) extends Seqtk { var q: Option[Int] = config("q") /** masked bases converted to CHAR; 0 for lowercase [0] */ - var n: String = config("n") + var n: Option[String] = config("n") /** number of residues per line; 0 for 2^32-1 [0] */ var l: Option[Int] = config("l") @@ -52,7 +52,7 @@ class SeqtkSeq(val root: Configurable) extends Seqtk { var f: Option[Int] = config("f") /** mask regions in BED or name list FILE [null] */ - var M: File = config("M") + var M: Option[File] = config("M") /** drop sequences with length shorter than INT [0] */ var L: Option[Int] = config("L") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala new file mode 100644 index 0000000000000000000000000000000000000000..11d930d5972897824e341d22d46808d849d7767b --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala @@ -0,0 +1,472 @@ +package nl.lumc.sasc.biopet.tools + +import java.io.{ FileOutputStream, PrintWriter, File } + +import htsjdk.variant.variantcontext.{ VariantContext, Genotype } +import htsjdk.variant.vcf.VCFFileReader +import nl.lumc.sasc.biopet.core.{ BiopetJavaCommandLineFunction, ToolCommand } +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Output, Input } +import scala.collection.JavaConversions._ +import scala.collection.mutable +import scala.sys.process.{ Process, ProcessLogger } +import htsjdk.samtools.util.Interval + +/** + * Created by pjvan_thof on 1/10/15. + */ +class VcfStats(val root: Configurable) extends BiopetJavaCommandLineFunction { + javaMainClass = getClass.getName + + @Input(doc = "Input fastq", shortName = "I", required = true) + var input: File = _ + + protected var outputDir: String = _ + + /** + * Set output dir and a output file + * @param dir + */ + def setOutputDir(dir: String): Unit = { + outputDir = dir + this.jobOutputFile = new File(dir + File.separator + ".vcfstats.out") + } + + /** + * Creates command to execute extension + * @return + */ + override def commandLine = super.commandLine + + required("-I", input) + + required("-o", outputDir) +} + +object VcfStats extends ToolCommand { + /** Commandline argument */ + case class Args(inputFile: File = null, outputDir: String = null, intervals: Option[File] = None) extends AbstractArgs + + /** Parsing commandline arguments */ + class OptParser extends AbstractOptParser { + opt[File]('I', "inputFile") required () unbounded () valueName ("<file>") action { (x, c) => + c.copy(inputFile = x) + } + opt[String]('o', "outputDir") required () unbounded () valueName ("<file>") action { (x, c) => + c.copy(outputDir = x) + } + //TODO: add interval argument + /* + opt[File]('i', "intervals") unbounded () valueName ("<file>") action { (x, c) => + c.copy(intervals = Some(x)) + } + */ + } + + /** + * Class to store sample to sample compare stats + * @param genotypeOverlap Number of genotypes match with other sample + * @param alleleOverlap Number of alleles also found in other sample + */ + case class SampleToSampleStats(var genotypeOverlap: Int = 0, + var alleleOverlap: Int = 0) { + /** Add an other class */ + def +=(other: SampleToSampleStats) { + this.genotypeOverlap += other.genotypeOverlap + this.alleleOverlap += other.alleleOverlap + } + } + + /** + * class to store all sample relative stats + * @param genotypeStats Stores all genotype relative stats + * @param sampleToSample Stores sample to sample compare stats + */ + case class SampleStats(val genotypeStats: mutable.Map[String, mutable.Map[Any, Int]] = mutable.Map(), + val sampleToSample: mutable.Map[String, SampleToSampleStats] = mutable.Map()) { + /** Add an other class */ + def +=(other: SampleStats): Unit = { + for ((key, value) <- other.sampleToSample) { + if (this.sampleToSample.contains(key)) this.sampleToSample(key) += value + else this.sampleToSample(key) = value + } + for ((field, fieldMap) <- other.genotypeStats) { + val thisField = this.genotypeStats.get(field) + if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap) + else this.genotypeStats += field -> fieldMap + } + } + } + + /** + * General stats class to store vcf stats + * @param generalStats Stores are general stats + * @param samplesStats Stores all sample/genotype specific stats + */ + case class Stats(val generalStats: mutable.Map[String, mutable.Map[Any, Int]] = mutable.Map(), + val samplesStats: mutable.Map[String, SampleStats] = mutable.Map()) { + /** Add an other class */ + def +=(other: Stats): Stats = { + for ((key, value) <- other.samplesStats) { + if (this.samplesStats.contains(key)) this.samplesStats(key) += value + else this.samplesStats(key) = value + } + for ((field, fieldMap) <- other.generalStats) { + val thisField = this.generalStats.get(field) + if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap) + else this.generalStats += field -> fieldMap + } + this + } + } + + /** + * Merge m2 into m1 + * @param m1 + * @param m2 + */ + def mergeStatsMap(m1: mutable.Map[Any, Int], m2: mutable.Map[Any, Int]): Unit = { + for (key <- m2.keySet) + m1(key) = m1.getOrElse(key, 0) + m2(key) + } + + /** + * Merge m2 into m1 + * @param m1 + * @param m2 + */ + def mergeNestedStatsMap(m1: mutable.Map[String, mutable.Map[Any, Int]], m2: Map[String, Map[Any, Int]]): Unit = { + for ((field, fieldMap) <- m2) { + if (m1.contains(field)) { + for ((key, value) <- fieldMap) { + if (m1(field).contains(key)) m1(field)(key) += value + else m1(field)(key) = value + } + } else m1(field) = mutable.Map(fieldMap.toList: _*) + } + } + + protected var commandArgs: Args = _ + + /** + * @param args the command line arguments + */ + def main(args: Array[String]): Unit = { + logger.info("Started") + val argsParser = new OptParser + commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1) + + val reader = new VCFFileReader(commandArgs.inputFile, true) + val header = reader.getFileHeader + val samples = header.getSampleNamesInOrder.toList + + val intervals: List[Interval] = ( + for ( + seq <- header.getSequenceDictionary.getSequences; + chunks = seq.getSequenceLength / 10000000; + i <- 1 until chunks + ) yield { + val size = seq.getSequenceLength / chunks + val begin = size * (i - 1) + 1 + val end = if (i >= chunks) seq.getSequenceLength else size * i + new Interval(seq.getSequenceName, begin, end) + } + ).toList + + val totalBases = intervals.foldRight(0L)(_.length() + _) + + // Reading vcf records + logger.info("Start reading vcf records") + + def createStats: Stats = { + val stats = new Stats + //init stats + for (sample1 <- samples) { + stats.samplesStats += sample1 -> new SampleStats + for (sample2 <- samples) { + stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats + } + } + stats + } + + var variantCounter = 0L + var baseCounter = 0L + + def status(count: Int, interval: Interval): Unit = { + variantCounter += count + baseCounter += interval.length() + val fraction = baseCounter.toFloat / totalBases * 100 + logger.info(interval + " done, " + count + " rows processed") + logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done") + } + + val statsChunks = for (interval <- intervals.par) yield { + val reader = new VCFFileReader(commandArgs.inputFile, true) + var chunkCounter = 0 + val stats = createStats + logger.info("Starting on: " + interval) + + for ( + record <- reader.query(interval.getSequence, interval.getStart, interval.getEnd) if record.getStart <= interval.getEnd + ) { + mergeNestedStatsMap(stats.generalStats, checkGeneral(record)) + for (sample1 <- samples) yield { + val genotype = record.getGenotype(sample1) + mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(record, genotype)) + for (sample2 <- samples) { + val genotype2 = record.getGenotype(sample2) + if (genotype.getAlleles == genotype2.getAlleles) + stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1 + stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += genotype.getAlleles.count(allele => genotype2.getAlleles.exists(_.basesMatch(allele))) + } + } + chunkCounter += 1 + } + status(chunkCounter, interval) + stats + } + + val stats = statsChunks.toList.fold(createStats)(_ += _) + + logger.info("Done reading vcf records") + + writeField("QUAL", stats.generalStats.getOrElse("QUAL", mutable.Map())) + writeField("general", stats.generalStats.getOrElse("general", mutable.Map())) + writeGenotypeFields(stats, commandArgs.outputDir + "/genotype_", samples) + writeOverlap(stats, _.genotypeOverlap, commandArgs.outputDir + "/sample_compare/genotype_overlap", samples) + writeOverlap(stats, _.alleleOverlap, commandArgs.outputDir + "/sample_compare/allele_overlap", samples) + + logger.info("Done") + } + + /** + * Function to check all general stats, all info expect sample/genotype specific stats + * @param record + * @return + */ + protected def checkGeneral(record: VariantContext): Map[String, Map[Any, Int]] = { + val buffer = mutable.Map[String, Map[Any, Int]]() + + def addToBuffer(key: String, value: Any): Unit = { + val map = buffer.getOrElse(key, Map()) + buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1))) + } + + buffer += "QUAL" -> Map(record.getPhredScaledQual -> 1) + + addToBuffer("general", "Total") + if (record.isBiallelic) addToBuffer("general", "Biallelic") + if (record.isComplexIndel) addToBuffer("general", "ComplexIndel") + if (record.isFiltered) addToBuffer("general", "Filtered") + if (record.isFullyDecoded) addToBuffer("general", "FullyDecoded") + if (record.isIndel) addToBuffer("general", "Indel") + if (record.isMixed) addToBuffer("general", "Mixed") + if (record.isMNP) addToBuffer("general", "MNP") + if (record.isMonomorphicInSamples) addToBuffer("general", "MonomorphicInSamples") + if (record.isNotFiltered) addToBuffer("general", "NotFiltered") + if (record.isPointEvent) addToBuffer("general", "PointEvent") + if (record.isPolymorphicInSamples) addToBuffer("general", "PolymorphicInSamples") + if (record.isSimpleDeletion) addToBuffer("general", "SimpleDeletion") + if (record.isSimpleInsertion) addToBuffer("general", "SimpleInsertion") + if (record.isSNP) addToBuffer("general", "SNP") + if (record.isStructuralIndel) addToBuffer("general", "StructuralIndel") + if (record.isSymbolic) addToBuffer("general", "Symbolic") + if (record.isSymbolicOrSV) addToBuffer("general", "SymbolicOrSV") + if (record.isVariant) addToBuffer("general", "Variant") + + buffer.toMap + } + + /** + * Function to check sample/genotype specific stats + * @param record + * @param genotype + * @return + */ + protected def checkGenotype(record: VariantContext, genotype: Genotype): Map[String, Map[Any, Int]] = { + val buffer = mutable.Map[String, Map[Any, Int]]() + + def addToBuffer(key: String, value: Any): Unit = { + val map = buffer.getOrElse(key, Map()) + buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1))) + } + + buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1) + buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1) + + val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList + + addToBuffer("general", "Total") + if (genotype.isHet) addToBuffer("general", "Het") + if (genotype.isHetNonRef) addToBuffer("general", "HetNonRef") + if (genotype.isHom) addToBuffer("general", "Hom") + if (genotype.isHomRef) addToBuffer("general", "HomRef") + if (genotype.isHomVar) addToBuffer("general", "HomVar") + if (genotype.isMixed) addToBuffer("general", "Mixed") + if (genotype.isNoCall) addToBuffer("general", "NoCall") + if (genotype.isNonInformative) addToBuffer("general", "NonInformative") + if (genotype.isAvailable) addToBuffer("general", "Available") + if (genotype.isCalled) addToBuffer("general", "Called") + if (genotype.isFiltered) addToBuffer("general", "Filtered") + + if (genotype.hasAD) { + val ad = genotype.getAD + for (i <- 0 until ad.size if ad(i) > 0) { + addToBuffer("AD", ad(i)) + if (i == 0) addToBuffer("AD-ref", ad(i)) + if (i > 0) addToBuffer("AD-alt", ad(i)) + if (usedAlleles.exists(_ == i)) addToBuffer("AD-used", ad(i)) + else addToBuffer("AD-not_used", ad(i)) + } + } + + buffer.toMap + } + + /** + * Function to write stats to tsv files + * @param stats + * @param prefix + * @param samples + */ + protected def writeGenotypeFields(stats: Stats, prefix: String, samples: List[String]) { + val fields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general") + for (field <- fields) { + writeGenotypeField(stats, prefix, samples, field) + } + } + + /** + * Function to write 1 specific genotype field + * @param stats + * @param prefix + * @param samples + * @param field + */ + protected def writeGenotypeField(stats: Stats, prefix: String, samples: List[String], field: String): Unit = { + val file = new File(prefix + field + ".tsv") + file.getParentFile.mkdirs() + val writer = new PrintWriter(file) + writer.println(samples.mkString(field + "\t", "\t", "")) + val keySet = (for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any, Int]()).keySet).fold(Set[Any]())(_ ++ _) + for (key <- keySet.toList.sortWith(sortAnyAny(_, _))) { + val values = for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any, Int]()).getOrElse(key, 0) + writer.println(values.mkString(key + "\t", "\t", "")) + } + writer.close() + plotLine(file) + } + + /** + * Function to write 1 specific general field + * @param prefix + * @param data + * @return + */ + protected def writeField(prefix: String, data: mutable.Map[Any, Int]): File = { + val file = new File(commandArgs.outputDir + "/" + prefix + ".tsv") + println(file) + file.getParentFile.mkdirs() + val writer = new PrintWriter(file) + writer.println("\t" + prefix) + for (key <- data.keySet.toList.sortWith(sortAnyAny(_, _))) { + writer.println(key + "\t" + data(key)) + } + writer.close() + file + } + + /** + * Function to sort Any values + * @param a + * @param b + * @return + */ + def sortAnyAny(a: Any, b: Any): Boolean = { + a match { + case ai: Int => { + b match { + case bi: Int => ai < bi + case bi: Double => ai < bi + case _ => a.toString < b.toString + } + } + case _ => a.toString < b.toString + } + } + + /** + * Function to write sample to sample compare tsv's / heatmaps + * @param stats + * @param function function to extract targeted var in SampleToSampleStats + * @param prefix + * @param samples + */ + def writeOverlap(stats: Stats, function: SampleToSampleStats => Int, + prefix: String, samples: List[String]): Unit = { + val absFile = new File(prefix + ".abs.tsv") + val relFile = new File(prefix + ".rel.tsv") + + absFile.getParentFile.mkdirs() + + val absWriter = new PrintWriter(absFile) + val relWriter = new PrintWriter(relFile) + + absWriter.println(samples.mkString("\t", "\t", "")) + relWriter.println(samples.mkString("\t", "\t", "")) + for (sample1 <- samples) { + val values = for (sample2 <- samples) yield function(stats.samplesStats(sample1).sampleToSample(sample2)) + + absWriter.println(values.mkString(sample1 + "\t", "\t", "")) + + val total = function(stats.samplesStats(sample1).sampleToSample(sample1)) + relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", "")) + } + absWriter.close() + relWriter.close() + + plotHeatmap(relFile) + } + + /** + * Plots heatmaps on target tsv file + * @param file + */ + def plotHeatmap(file: File) { + executeRscript("plotHeatmap.R", Array(file.getAbsolutePath, + file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png", + file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png", + file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png")) + } + + /** + * Plots line graph with target tsv file + * @param file + */ + def plotLine(file: File) { + executeRscript("plotXY.R", Array(file.getAbsolutePath, + file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png")) + } + + /** + * Function to execute Rscript as subproces + * @param resource + * @param args + */ + def executeRscript(resource: String, args: Array[String]): Unit = { + val is = getClass.getResourceAsStream(resource) + val file = File.createTempFile("script.", "." + resource) + val os = new FileOutputStream(file) + org.apache.commons.io.IOUtils.copy(is, os) + os.close() + + val command: String = "Rscript " + file + " " + args.mkString(" ") + + logger.info("Starting: " + command) + val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x))) + if (process.exitValue() == 0) logger.info("Done: " + command) + else { + logger.warn("Failed: " + command) + if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info") + } + } +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala index 16a00b55157918b6ce92a49ff29d1a22a06110d5..9f1cddb7913cf7846daa72c7b9e695988b3a14c6 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala @@ -336,8 +336,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2file(value: ConfigValue): File = { - //TODO: throw IllegalStateException - if (value != null && value.value != null && value.value != None) new File(any2string(value.value)) else null + if (value != null && value.value != null && value.value != None) new File(any2string(value.value)) + else throw new IllegalStateException("Value does not exist") } /** @@ -346,7 +346,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2optionFile(value: ConfigValue): Option[File] = { - if (value != null && value.value != null && value.value != None) Some(new File(any2string(value.value))) else None + if (value != null && value.value != null && value.value != None) Some(new File(any2string(value.value))) + else None } /** @@ -355,8 +356,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2string(value: ConfigValue): String = { - //TODO: throw IllegalStateException - if (value != null && value.value != null && value.value != None) any2string(value.value) else null + if (value != null && value.value != null && value.value != None) any2string(value.value) + else throw new IllegalStateException("Value does not exist") } /** @@ -365,7 +366,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2optionString(value: ConfigValue): Option[String] = { - if (value != null && value.value != null && value.value != None) Some(any2string(value.value)) else None + if (value != null && value.value != null && value.value != None) Some(any2string(value.value)) + else None } /** @@ -384,7 +386,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2optionLong(value: ConfigValue): Option[Long] = { - if (value != null && value.value != null && value.value != None) Option(any2long(value.value)) else None + if (value != null && value.value != null && value.value != None) Option(any2long(value.value)) + else None } /** @@ -403,7 +406,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2optionInt(value: ConfigValue): Option[Int] = { - if (value != null && value.value != null && value.value != None) Option(any2int(value.value)) else None + if (value != null && value.value != null && value.value != None) Option(any2int(value.value)) + else None } /** @@ -422,7 +426,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2optionDouble(value: ConfigValue): Option[Double] = { - if (value != null && value.value != null && value.value != None) Option(any2double(value.value)) else None + if (value != null && value.value != null && value.value != None) Option(any2double(value.value)) + else None } /** @@ -441,7 +446,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2optionFloat(value: ConfigValue): Option[Float] = { - if (value != null && value.value != null && value.value != None) Option(any2float(value.value)) else None + if (value != null && value.value != null && value.value != None) Option(any2float(value.value)) + else None } /** @@ -460,7 +466,8 @@ object ConfigUtils extends Logging { * @return */ implicit def configValue2optionBoolean(value: ConfigValue): Option[Boolean] = { - if (value != null && value.value != null && value.value != None) Option(any2boolean(value.value)) else None + if (value != null && value.value != null && value.value != None) Option(any2boolean(value.value)) + else None } /** diff --git a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..9ac90deecad83fab23d95bfafaf89442899b798e --- /dev/null +++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala @@ -0,0 +1,77 @@ +package nl.lumc.sasc.biopet.tools + +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test +import scala.collection.mutable +import VcfStats._ + +/** + * Created by pjvan_thof on 2/5/15. + */ +class VcfStatsTest extends TestNGSuite with Matchers { + + @Test + def testSampleToSampleStats: Unit = { + val s1 = SampleToSampleStats() + val s2 = SampleToSampleStats() + s1.alleleOverlap shouldBe 0 + s1.genotypeOverlap shouldBe 0 + s2.alleleOverlap shouldBe 0 + s2.genotypeOverlap shouldBe 0 + + s1 += s2 + s1.alleleOverlap shouldBe 0 + s1.genotypeOverlap shouldBe 0 + s2.alleleOverlap shouldBe 0 + s2.genotypeOverlap shouldBe 0 + + s2.alleleOverlap = 2 + s2.genotypeOverlap = 3 + + s1 += s2 + s1.alleleOverlap shouldBe 2 + s1.genotypeOverlap shouldBe 3 + s2.alleleOverlap shouldBe 2 + s2.genotypeOverlap shouldBe 3 + + s1 += s2 + s1.alleleOverlap shouldBe 4 + s1.genotypeOverlap shouldBe 6 + s2.alleleOverlap shouldBe 2 + s2.genotypeOverlap shouldBe 3 + } + + @Test + def testSampleStats: Unit = { + val s1 = SampleStats() + val s2 = SampleStats() + + s1.sampleToSample += "s1" -> SampleToSampleStats() + s1.sampleToSample += "s2" -> SampleToSampleStats() + s2.sampleToSample += "s1" -> SampleToSampleStats() + s2.sampleToSample += "s2" -> SampleToSampleStats() + + s1.sampleToSample("s1").alleleOverlap = 1 + s2.sampleToSample("s2").alleleOverlap = 2 + + s1.genotypeStats += "1" -> mutable.Map(1 -> 1) + s2.genotypeStats += "2" -> mutable.Map(2 -> 2) + + val ss1 = SampleToSampleStats() + val ss2 = SampleToSampleStats() + + s1 += s2 + s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 1), "2" -> mutable.Map(2 -> 2)) + ss1.alleleOverlap = 1 + ss2.alleleOverlap = 2 + s1.sampleToSample shouldBe mutable.Map("s1" -> ss1, "s2" -> ss2) + + s1 += s2 + s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 1), "2" -> mutable.Map(2 -> 4)) + + s1 += s1 + s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 2), "2" -> mutable.Map(2 -> 8)) + } + +} diff --git a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/ConfigUtilsTest.scala b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/ConfigUtilsTest.scala index 6b6f990bf5938b4adb073812de835c13ea830236..26f989c5369c4e138ded82620d1d03f0c076ce7c 100644 --- a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/ConfigUtilsTest.scala +++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/ConfigUtilsTest.scala @@ -235,10 +235,14 @@ class ConfigUtilsTest extends TestNGSuite with Matchers { } var string: String = ConfigValue(index, index, "test") - string = ConfigValue(index, index, null) + intercept[IllegalStateException] { + string = ConfigValue(index, index, null) + } var file: File = ConfigValue(index, index, "test") - file = ConfigValue(index, index, null) + intercept[IllegalStateException] { + file = ConfigValue(index, index, null) + } } } } diff --git a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala index cd5376a790fcc9ab544c7cdecc0b347ad0f601df..070b1c756b7677541b5835dd272bc1216758e7be 100644 --- a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala +++ b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala @@ -35,6 +35,7 @@ object BiopetExecutablePublic extends BiopetExecutable { nl.lumc.sasc.biopet.tools.CheckAllelesVcfInBam, nl.lumc.sasc.biopet.tools.VcfToTsv, nl.lumc.sasc.biopet.tools.VcfFilter, + nl.lumc.sasc.biopet.tools.VcfStats, nl.lumc.sasc.biopet.tools.FindRepeatsPacBio, nl.lumc.sasc.biopet.tools.BedToInterval, nl.lumc.sasc.biopet.tools.MpileupToVcf, diff --git a/public/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala b/public/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala index 007636a637cdac27d7b66b110ef4ff20613d4b4a..578f3afc79f6cac8d2621399bb577222383ee2bb 100644 --- a/public/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala +++ b/public/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala @@ -86,8 +86,8 @@ class Carp(val root: Configurable) extends QScript with MultiSampleQScript { val macs2 = new Macs2CallPeak(qscript) macs2.treatment = bamFile - macs2.name = sampleId - macs2.outputdir = sampleDir + "macs2/" + macs2.name + "/" + macs2.name = Some(sampleId) + macs2.outputdir = sampleDir + "macs2/" + sampleId + "/" add(macs2) } } @@ -113,8 +113,8 @@ class Carp(val root: Configurable) extends QScript with MultiSampleQScript { val macs2 = new Macs2CallPeak(this) macs2.treatment = sample.bamFile macs2.control = samples(controlId).bamFile - macs2.name = sampleId + "_VS_" + controlId - macs2.outputdir = sample.sampleDir + "/" + "macs2/" + macs2.name + "/" + macs2.name = Some(sampleId + "_VS_" + controlId) + macs2.outputdir = sample.sampleDir + "/" + "macs2/" + macs2.name.get + "/" add(macs2) } } diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala index 91f29322ee89e0df04560677e4dd2b7fcf406481..98daba52d0960fa63b4f33b43117c3fb456bddc4 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala @@ -30,7 +30,7 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { var input_R1: File = _ @Input(doc = "R2 fastq file (gzipped allowed)", shortName = "R2", required = false) - var input_R2: File = _ + var input_R2: Option[File] = _ /** Skip Trim fastq files */ var skipTrim: Boolean = config("skip_trim", default = false) @@ -48,7 +48,7 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { @Argument(doc = "Library ID", shortName = "library", required = true) var libId: String = _ - var paired: Boolean = (input_R2 != null) + var paired: Boolean = input_R2.isDefined var R1_ext: String = _ var R2_ext: String = _ var R1_name: String = _ @@ -63,12 +63,11 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { def init() { require(outputDir != null, "Missing output directory on flexiprep module") - require(input_R1 != null, "Missing output directory on flexiprep module") + require(input_R1 != null, "Missing input R1 on flexiprep module") require(sampleId != null, "Missing sample ID on flexiprep module") require(libId != null, "Missing library ID on flexiprep module") - if (!outputDir.endsWith("/")) outputDir += "/" - paired = (input_R2 != null) + paired = input_R2.isDefined if (input_R1.endsWith(".gz")) R1_name = input_R1.getName.substring(0, input_R1.getName.lastIndexOf(".gz")) else if (input_R1.endsWith(".gzip")) R1_name = input_R1.getName.substring(0, input_R1.getName.lastIndexOf(".gzip")) @@ -76,12 +75,16 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { R1_ext = R1_name.substring(R1_name.lastIndexOf("."), R1_name.size) R1_name = R1_name.substring(0, R1_name.lastIndexOf(R1_ext)) - if (paired) { - if (input_R2.endsWith(".gz")) R2_name = input_R2.getName.substring(0, input_R2.getName.lastIndexOf(".gz")) - else if (input_R2.endsWith(".gzip")) R2_name = input_R2.getName.substring(0, input_R2.getName.lastIndexOf(".gzip")) - else R2_name = input_R2.getName - R2_ext = R2_name.substring(R2_name.lastIndexOf("."), R2_name.size) - R2_name = R2_name.substring(0, R2_name.lastIndexOf(R2_ext)) + input_R2 match { + case Some(fileR2) => { + paired = true + if (fileR2.endsWith(".gz")) R2_name = fileR2.getName.substring(0, fileR2.getName.lastIndexOf(".gz")) + else if (fileR2.endsWith(".gzip")) R2_name = fileR2.getName.substring(0, fileR2.getName.lastIndexOf(".gzip")) + else R2_name = fileR2.getName + R2_ext = R2_name.substring(R2_name.lastIndexOf("."), R2_name.size) + R2_name = R2_name.substring(0, R2_name.lastIndexOf(R2_ext)) + } + case _ => } summary.out = outputDir + sampleId + "-" + libId + ".qc.summary.json" @@ -100,7 +103,7 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { def runInitialJobs() { outputFiles += ("fastq_input_R1" -> extractIfNeeded(input_R1, outputDir)) - if (paired) outputFiles += ("fastq_input_R2" -> extractIfNeeded(input_R2, outputDir)) + if (paired) outputFiles += ("fastq_input_R2" -> extractIfNeeded(input_R2.get, outputDir)) fastqc_R1 = Fastqc(this, input_R1, outputDir + "/" + R1_name + ".fastqc/") add(fastqc_R1) @@ -112,12 +115,12 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { summary.addMd5sum(md5sum_R1, R2 = false, after = false) if (paired) { - fastqc_R2 = Fastqc(this, input_R2, outputDir + "/" + R2_name + ".fastqc/") + fastqc_R2 = Fastqc(this, input_R2.get, outputDir + "/" + R2_name + ".fastqc/") add(fastqc_R2) summary.addFastqc(fastqc_R2, R2 = true) outputFiles += ("fastqc_R2" -> fastqc_R2.output) - val md5sum_R2 = Md5sum(this, input_R2, outputDir) + val md5sum_R2 = Md5sum(this, input_R2.get, outputDir) add(md5sum_R2) summary.addMd5sum(md5sum_R2, R2 = true, after = false) } @@ -283,7 +286,7 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { add(zcatCommand) return newFile } else if (file.getName().endsWith(".bz2")) { - var newFile = swapExt(runDir, file, ".bz2", "") + val newFile = swapExt(runDir, file, ".bz2", "") val pbzip2 = Pbzip2(this, file, newFile) pbzip2.isIntermediate = true add(pbzip2) diff --git a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala index 36809bb19d2df6c72342f528d9e1d714ff7e79d5..9094d38cddbe70d8c7b6787200d2afe94e19d486 100644 --- a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala +++ b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala @@ -36,7 +36,7 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { var input_R1: File = _ @Input(doc = "R2 fastq file", shortName = "R2", required = false) - var input_R2: File = _ + var input_R2: Option[File] = None /** Output name */ var outputName: String = _ @@ -104,7 +104,7 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { require(sampleId != null, "Missing sample ID on mapping module") require(libId != null, "Missing library ID on mapping module") - paired = (input_R2 != null) + paired = input_R2.isDefined if (readgroupId == null && sampleId != null && libId != null) readgroupId = sampleId + "-" + libId else if (readgroupId == null) readgroupId = config("readgroup_id") @@ -129,7 +129,7 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { if (!skipFlexiprep) { flexiprep.outputDir = outputDir + "flexiprep" + File.separator flexiprep.input_R1 = input_R1 - if (paired) flexiprep.input_R2 = input_R2 + flexiprep.input_R2 = input_R2 flexiprep.sampleId = this.sampleId flexiprep.libId = this.libId flexiprep.init @@ -148,10 +148,12 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { if (chunking) for (t <- 1 to numberChunks.getOrElse(1)) { val chunkDir = outputDir + "chunks/" + t + "/" chunks += (chunkDir -> (removeGz(chunkDir + input_R1.getName), - if (paired) removeGz(chunkDir + input_R2.getName) else "")) + if (paired) removeGz(chunkDir + input_R2.get.getName) else "")) } - else chunks += (outputDir -> (flexiprep.extractIfNeeded(input_R1, flexiprep.outputDir), - flexiprep.extractIfNeeded(input_R2, flexiprep.outputDir))) + else chunks += (outputDir -> ( + flexiprep.extractIfNeeded(input_R1, flexiprep.outputDir), + if (paired) flexiprep.extractIfNeeded(input_R2.get, flexiprep.outputDir) else "") + ) if (chunking) { val fastSplitter_R1 = new FastqSplitter(this) @@ -162,7 +164,7 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { if (paired) { val fastSplitter_R2 = new FastqSplitter(this) - fastSplitter_R2.input = input_R2 + fastSplitter_R2.input = input_R2.get for ((chunkDir, fastqfile) <- chunks) fastSplitter_R2.output :+= fastqfile._2 fastSplitter_R2.isIntermediate = true add(fastSplitter_R2) @@ -269,7 +271,7 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { bwaCommand.R1 = R1 if (paired) bwaCommand.R2 = R2 bwaCommand.deps = deps - bwaCommand.R = getReadGroup + bwaCommand.R = Some(getReadGroup) bwaCommand.output = swapExt(output.getParent, output, ".bam", ".sam") bwaCommand.isIntermediate = true add(bwaCommand) diff --git a/public/sage/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala b/public/sage/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala index 1bc0837a32e335f6d6cfc49037e7d4ec291d9085..ed55ca382d13346b3cbb7bb4862941ef4da591f5 100644 --- a/public/sage/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala +++ b/public/sage/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala @@ -35,13 +35,10 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript { qscript => def this() = this(null) - var countBed: File = config("count_bed") - - var squishedCountBed: File = config("squished_count_bed") - - var transcriptome: File = config("transcriptome") - - var tagsLibrary: File = config("tags_library") + var countBed: Option[File] = config("count_bed") + var squishedCountBed: File = _ + var transcriptome: Option[File] = config("transcriptome") + var tagsLibrary: Option[File] = config("tags_library") override def defaults = ConfigUtils.mergeMaps(Map("bowtie" -> Map( "m" -> 1, @@ -127,28 +124,26 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript { def init() { if (!outputDir.endsWith("/")) outputDir += "/" - if (transcriptome == null && tagsLibrary == null) + if (transcriptome.isEmpty && tagsLibrary.isEmpty) throw new IllegalStateException("No transcriptome or taglib found") - if (countBed == null && squishedCountBed == null) - throw new IllegalStateException("No bedfile supplied, please add a countBed or squishedCountBed") + if (countBed.isEmpty) + throw new IllegalStateException("No bedfile supplied, please add a countBed") } def biopetScript() { - if (squishedCountBed == null) { - val squishBed = SquishBed(this, countBed, outputDir) - add(squishBed) - squishedCountBed = squishBed.output - } + val squishBed = SquishBed(this, countBed.get, outputDir) + add(squishBed) + squishedCountBed = squishBed.output - if (tagsLibrary == null) { + if (tagsLibrary.isEmpty) { val cdl = new SageCreateLibrary(this) - cdl.input = transcriptome + cdl.input = transcriptome.get cdl.output = outputDir + "taglib/tag.lib" cdl.noAntiTagsOutput = outputDir + "taglib/no_antisense_genes.txt" cdl.noTagsOutput = outputDir + "taglib/no_sense_genes.txt" cdl.allGenesOutput = outputDir + "taglib/all_genes.txt" add(cdl) - tagsLibrary = cdl.output + tagsLibrary = Some(cdl.output) } addSamplesJobs() @@ -187,7 +182,7 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript { val createTagCounts = new SageCreateTagCounts(this) createTagCounts.input = countFastq.output - createTagCounts.tagLib = tagsLibrary + createTagCounts.tagLib = tagsLibrary.get createTagCounts.countSense = outputDir + outputPrefix + ".tagcount.sense.counts" createTagCounts.countAllSense = outputDir + outputPrefix + ".tagcount.all.sense.counts" createTagCounts.countAntiSense = outputDir + outputPrefix + ".tagcount.antisense.counts"