diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala
index 230eff772ec9506b2f3c2f2fbe3012d03046ee43..10036b6bc88bfb9061a2d6e8e78eb5bc821db4da 100644
--- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala
+++ b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala
@@ -7,7 +7,7 @@ package nl.lumc.sasc.biopet.pipelines.gatk
 
 import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
 import java.io.File
-import nl.lumc.sasc.biopet.tools.{ MpileupToVcf, VcfFilter, MergeAlleles }
+import nl.lumc.sasc.biopet.tools.{ VcfStats, MpileupToVcf, VcfFilter, MergeAlleles }
 import nl.lumc.sasc.biopet.core.config.Configurable
 import nl.lumc.sasc.biopet.extensions.gatk.{ AnalyzeCovariates, BaseRecalibrator, GenotypeGVCFs, HaplotypeCaller, IndelRealigner, PrintReads, RealignerTargetCreator, SelectVariants, CombineVariants, UnifiedGenotyper }
 import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates
@@ -197,6 +197,12 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr
       val cvFinal = CombineVariants(this, mergeList.toList, outputDir + outputName + ".final.vcf.gz")
       cvFinal.genotypemergeoption = org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils.GenotypeMergeType.UNSORTED
       add(cvFinal)
+
+      val vcfStats = new VcfStats(this)
+      vcfStats.input = cvFinal.out
+      vcfStats.setOutputDir(outputDir + File.separator + "vcfstats")
+      add(vcfStats)
+
       scriptOutput.finalVcfFile = cvFinal.out
     }
   }
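The VcfStats step wired in above is also registered as a standalone tool later in this patch (BiopetExecutablePublic). As a minimal sketch, assuming nothing beyond the options its OptParser defines (-I/--inputFile and -o/--outputDir), the same statistics can be produced outside Queue by calling the tool object directly; the paths below are placeholders, and the input VCF must be indexed because the tool opens it with requireIndex = true:

// Hypothetical standalone invocation; file paths are made up for illustration.
nl.lumc.sasc.biopet.tools.VcfStats.main(Array(
  "-I", "/path/to/multisample.final.vcf.gz", // needs an accompanying index
  "-o", "/path/to/vcfstats"                  // output directory for the .tsv and .png files
))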
diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVcfSampleCompare.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVcfSampleCompare.scala
deleted file mode 100644
index c3f5add4b52092e607e1c4975745e7f58793dd08..0000000000000000000000000000000000000000
--- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVcfSampleCompare.scala
+++ /dev/null
@@ -1,87 +0,0 @@
-/**
- * Due to the license issue with GATK, this part of Biopet can only be used inside the
- * LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
- * on how to use this protected part of biopet or contact us at sasc@lumc.nl
- */
-package nl.lumc.sasc.biopet.pipelines.gatk
-
-import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
-import java.io.File
-import nl.lumc.sasc.biopet.core.config.Configurable
-import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
-import nl.lumc.sasc.biopet.extensions.gatk.SelectVariants
-import nl.lumc.sasc.biopet.extensions.gatk.VariantEval
-import org.broadinstitute.gatk.queue.QScript
-import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
-
-class GatkVcfSampleCompare(val root: Configurable) extends QScript with BiopetQScript {
-  def this() = this(null)
-
-  @Input(doc = "Sample vcf file(s)", shortName = "V")
-  var vcfFiles: List[File] = _
-
-  @Argument(doc = "Reference", shortName = "R", required = false)
-  var reference: File = config("reference")
-
-  @Argument(doc = "Target bed", shortName = "targetBed", required = false)
-  var targetBed: List[File] = Nil
-
-  @Argument(doc = "Samples", shortName = "sample", required = false)
-  var samples: List[String] = Nil
-
-  var vcfFile: File = _
-  var sampleVcfs: Map[String, File] = Map()
-  def generalSampleDir = outputDir + "samples/"
-
-  def init() {
-    if (config.contains("target_bed"))
-      for (bed <- config("target_bed").asList)
-        targetBed :+= bed.toString
-    if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
-    else if (!outputDir.endsWith("/")) outputDir += "/"
-  }
-
-  def biopetScript() {
-    vcfFile = if (vcfFiles.size > 1) {
-      val combineVariants = CombineVariants(this, vcfFiles, outputDir + "merge.vcf")
-      add(combineVariants)
-      combineVariants.out
-    } else vcfFiles.head
-
-    for (sample <- samples) {
-      sampleVcfs += (sample -> new File(generalSampleDir + sample + File.separator + sample + ".vcf"))
-      val selectVariants = SelectVariants(this, vcfFile, sampleVcfs(sample))
-      selectVariants.sample_name = Seq(sample)
-      selectVariants.excludeNonVariants = true
-      add(selectVariants)
-    }
-
-    val sampleCompareMetrics = new SampleCompareMetrics(this)
-    sampleCompareMetrics.samples = samples
-    sampleCompareMetrics.sampleDir = generalSampleDir
-    sampleCompareMetrics.snpRelFile = outputDir + "compare.snp.rel.tsv"
-    sampleCompareMetrics.snpAbsFile = outputDir + "compare.snp.abs.tsv"
-    sampleCompareMetrics.indelRelFile = outputDir + "compare.indel.rel.tsv"
-    sampleCompareMetrics.indelAbsFile = outputDir + "compare.indel.abs.tsv"
-    sampleCompareMetrics.totalFile = outputDir + "total.tsv"
-
-    for ((sample, sampleVcf) <- sampleVcfs) {
-      val sampleDir = generalSampleDir + sample + File.separator
-      for ((compareSample, compareSampleVcf) <- sampleVcfs) {
-        val variantEval = VariantEval(this,
-          sampleVcf,
-          compareSampleVcf,
-          new File(sampleDir + sample + "-" + compareSample + ".eval.txt"),
-          Seq("VariantType", "CompRod"),
-          Seq("CompOverlap")
-        )
-        if (targetBed != null) variantEval.L = targetBed
-        add(variantEval)
-        sampleCompareMetrics.deps ::= variantEval.out
-      }
-    }
-    add(sampleCompareMetrics)
-  }
-}
-
-object GatkVcfSampleCompare extends PipelineCommand
diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/SampleCompareMetrics.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/SampleCompareMetrics.scala
deleted file mode 100644
index 861455fe4d886219813409023d022ea096f413c9..0000000000000000000000000000000000000000
--- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/SampleCompareMetrics.scala
+++ /dev/null
@@ -1,153 +0,0 @@
-/**
- * Due to the license issue with GATK, this part of Biopet can only be used inside the
- * LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
- * on how to use this protected part of biopet or contact us at sasc@lumc.nl
- */
-package nl.lumc.sasc.biopet.pipelines.gatk
-
-import java.io.File
-import java.io.PrintWriter
-import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
-import nl.lumc.sasc.biopet.core.config.Configurable
-import org.broadinstitute.gatk.utils.R.RScriptExecutor
-import org.broadinstitute.gatk.utils.commandline.{ Output, Argument }
-import scala.io.Source
-import org.broadinstitute.gatk.utils.R.{ RScriptLibrary, RScriptExecutor }
-import org.broadinstitute.gatk.utils.io.Resource
-import scala.collection.mutable.Map
-import scala.math._
-
-class SampleCompareMetrics(val root: Configurable) extends BiopetJavaCommandLineFunction {
-  javaMainClass = getClass.getName
-
-  @Argument(doc = "Sample Dir", shortName = "sampleDir", required = true)
-  var sampleDir: String = _
-
-  @Argument(doc = "Samples", shortName = "sample", required = true)
-  var samples: List[String] = Nil
-
-  @Argument(doc = "File sufix", shortName = "sufix", required = false)
-  var fileSufix: String = _
-
-  @Output(doc = "snpRelFile", shortName = "snpRelFile", required = true)
-  var snpRelFile: File = _
-
-  @Output(doc = "snpAbsFile", shortName = "snpAbsFile", required = true)
-  var snpAbsFile: File = _
-
-  @Output(doc = "indelRelFile", shortName = "indelRelFile", required = true)
-  var indelRelFile: File = _
-
-  @Output(doc = "indelAbsFile", shortName = "indelAbsFile", required = true)
-  var indelAbsFile: File = _
-
-  @Output(doc = "totalFile", shortName = "totalFile", required = true)
-  var totalFile: File = _
-
-  override val defaultVmem = "8G"
-  memoryLimit = Option(4.0)
-
-  override def commandLine = super.commandLine +
-    required("-sampleDir", sampleDir) +
-    repeat("-sample", samples) +
-    optional("-fileSufix", fileSufix) +
-    required("-snpRelFile", snpRelFile) +
-    required("-snpAbsFile", snpAbsFile) +
-    required("-indelRelFile", indelRelFile) +
-    required("-indelAbsFile", indelAbsFile) +
-    required("-totalFile", totalFile)
-}
-
-object SampleCompareMetrics {
-  var sampleDir: String = _
-  var samples: List[String] = Nil
-  var fileSufix: String = ".eval.txt"
-  var snpRelFile: File = _
-  var snpAbsFile: File = _
-  var indelRelFile: File = _
-  var indelAbsFile: File = _
-  var totalFile: File = _
-  /**
-   * @param args the command line arguments
-   */
-  def main(args: Array[String]): Unit = {
-
-    for (t <- 0 until args.size) {
-      args(t) match {
-        case "-sample" => samples +:= args(t + 1)
-        case "-sampleDir" => sampleDir = args(t + 1)
-        case "-fileSufix" => fileSufix = args(t + 1)
-        case "-snpRelFile" => snpRelFile = new File(args(t + 1))
-        case "-snpAbsFile" => snpAbsFile = new File(args(t + 1))
-        case "-indelRelFile" => indelRelFile = new File(args(t + 1))
-        case "-indelAbsFile" => indelAbsFile = new File(args(t + 1))
-        case "-totalFile" => totalFile = new File(args(t + 1))
-        case _ =>
-      }
-    }
-    if (sampleDir == null) throw new IllegalStateException("No sampleDir, use -sampleDir")
-    else if (!sampleDir.endsWith("/")) sampleDir += "/"
-
-    val regex = """\W+""".r
-    val snpsOverlap: Map[(String, String), Int] = Map()
-    val indelsOverlap: Map[(String, String), Int] = Map()
-    val snpsTotal: Map[String, Int] = Map()
-    val indelsTotal: Map[String, Int] = Map()
-    for (sample1 <- samples; sample2 <- samples) {
-      val reader = Source.fromFile(new File(sampleDir + sample1 + "/" + sample1 + "-" + sample2 + fileSufix))
-      for (line <- reader.getLines) {
-        regex.split(line) match {
-          case Array(_, _, _, varType, all, novel, overlap, rate, _*) => {
-            varType match {
-              case "SNP" => {
-                snpsOverlap += (sample1, sample2) -> overlap.toInt
-                snpsTotal += sample1 -> all.toInt
-              }
-              case "INDEL" => {
-                indelsOverlap += (sample1, sample2) -> overlap.toInt
-                indelsTotal += sample1 -> all.toInt
-              }
-              case _ =>
-            }
-          }
-          case _ =>
-        }
-      }
-      reader.close()
-    }
-
-    val snpRelWritter = new PrintWriter(snpRelFile)
-    val snpAbsWritter = new PrintWriter(snpAbsFile)
-    val indelRelWritter = new PrintWriter(indelRelFile)
-    val indelAbsWritter = new PrintWriter(indelAbsFile)
-
-    val allWritters = List(snpRelWritter, snpAbsWritter, indelRelWritter, indelAbsWritter)
-    for (writter <- allWritters) writter.println(samples.mkString("\t", "\t", ""))
-    for (sample1 <- samples) {
-      for (writter <- allWritters) writter.print(sample1)
-      for (sample2 <- samples) {
-        snpRelWritter.print("\t" + (round((snpsOverlap(sample1, sample2).toDouble / snpsTotal(sample1) * 10000.0)) / 10000.0))
-        snpAbsWritter.print("\t" + snpsOverlap(sample1, sample2))
-        indelRelWritter.print("\t" + (round((indelsOverlap(sample1, sample2).toDouble / indelsTotal(sample1) * 10000.0)) / 10000.0))
-        indelAbsWritter.print("\t" + indelsOverlap(sample1, sample2))
-      }
-      for (writter <- allWritters) writter.println()
-    }
-    for (writter <- allWritters) writter.close()
-
-    val totalWritter = new PrintWriter(totalFile)
-    totalWritter.println("Sample\tSNPs\tIndels")
-    for (sample <- samples)
-      totalWritter.println(sample + "\t" + snpsTotal(sample) + "\t" + indelsTotal(sample))
-    totalWritter.close()
-
-    def plot(file: File) {
-      val executor = new RScriptExecutor
-      executor.addScript(new Resource("plotHeatmap.R", getClass))
-      executor.addArgs(file, file.getAbsolutePath.stripSuffix(".tsv") + ".png", file.getAbsolutePath.stripSuffix(".tsv") + ".clustering.png")
-      executor.exec()
-    }
-    plot(snpRelFile)
-    plot(indelRelFile)
-  }
-}
\ No newline at end of file
diff --git a/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala b/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala
index 9457fd36cfab986c5fdc9d5cbc29836ced4e0962..902e292fca017947affb287249ccff764e943100 100644
--- a/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala
+++ b/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala
@@ -12,7 +12,6 @@ object BiopetExecutableProtected extends BiopetExecutable {
     nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantcalling,
     nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline,
     nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantRecalibration,
-    nl.lumc.sasc.biopet.pipelines.gatk.GatkVcfSampleCompare,
     nl.lumc.sasc.biopet.pipelines.basty.Basty)
 
   def tools = BiopetExecutablePublic.tools
diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/pipelines/gatk/plotHeatmap.R b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/pipelines/gatk/plotHeatmap.R
deleted file mode 100644
index 4158db708d58c8cc19b535dcfe871c626fa51ad6..0000000000000000000000000000000000000000
--- a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/pipelines/gatk/plotHeatmap.R
+++ /dev/null
@@ -1,24 +0,0 @@
-library('gplots')
-
-args <- commandArgs(TRUE)
-inputArg <- args[1]
-outputArg <- args[2]
-outputArgClustering <- args[3]
-
-col <- heat.colors(250)
-col[250] <- "#00FF00"
-
-heat<-read.table(inputArg, header = 1, sep= '\t')
-rownames(heat) <- heat[,1]
-heat<- heat[,-1]
-
-heat<- as.matrix(heat)
-
-png(file = outputArg, width = 1000, height = 1000)
-heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none")
-dev.off()
-
-
-png(file = outputArgClustering, width = 1000, height = 1000)
-heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row")
-dev.off()
diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R
new file mode 100644
index 0000000000000000000000000000000000000000..7f7237e90f6593e3d6cf110da005cd89c154d466
--- /dev/null
+++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R
@@ -0,0 +1,35 @@
+library('gplots')
+library('RColorBrewer')
+
+args <- commandArgs(TRUE)
+inputArg <- args[1]
+outputArg <- args[2]
+outputArgClustering <- args[3]
+outputArgDendrogram <- args[4]
+
+
+heat<-read.table(inputArg, header = 1, sep= '\t', stringsAsFactors = F)
+#heat[heat==1] <- NA
+rownames(heat) <- heat[,1]
+heat<- heat[,-1]
+heat<- as.matrix(heat)
+
+colNumber <- 50
+col <- rev(colorRampPalette(brewer.pal(11, "Spectral"))(colNumber))
+for (i in (colNumber+1):(colNumber+round((dist(range(heat)) - dist(range(heat[heat < 1]))) / dist(range(heat[heat < 1])) * colNumber))) {
+  col[i] <- col[colNumber]
+}
+col[length(col)] <- "#00FF00"
+
+png(file = outputArg, width = 1200, height = 1200)
+heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none", margins = c(12, 12), na.color="#00FF00")
+dev.off()
+
+hc <- hclust(d = dist(heat))
+png(file = outputArgDendrogram, width = 1200, height = 1200)
+plot(as.dendrogram(hc), horiz=TRUE, asp=0.02)
+dev.off()
+
+png(file = outputArgClustering, width = 1200, height = 1200)
+heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row",margins = c(12, 12), na.color="#00FF00")
+dev.off()
diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotXY.R b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotXY.R
new file mode 100644
index 0000000000000000000000000000000000000000..63fd7b03262d94094a9f151b22cca812f10cee1f
--- /dev/null
+++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotXY.R
@@ -0,0 +1,20 @@
+library('ggplot2')
+library('reshape2')
+
+args <- commandArgs(TRUE)
+inputArg <- args[1]
+outputArg <- args[2]
+
+tsv<-read.table(inputArg, header = 1, sep= '\t', stringsAsFactors = F)
+
+data <- melt(tsv)
+
+data$X <- as.numeric(data$X)
+data <- na.omit(data)
+data <- data[data$value > 0,]
+
+print("Starting to plot")
+png(file = outputArg, width = 1500, height = 1500)
+ggplot(data, aes(x=X, y=value, color=variable, group=variable)) + geom_line()
+dev.off()
+print("plot done")
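Both new scripts are driven by the VcfStats tool further down in this patch: plotXY.R receives the per-field genotype tables and plotHeatmap.R the sample-overlap matrices. For reference, the .rel.tsv matrix handed to plotHeatmap.R has sample names on both axes and a leading tab in its header row; a minimal sketch of that layout, with invented sample names and values:

// Prints a 2x2 relative-overlap matrix in the same layout writeOverlap uses below.
val samples = List("sampleA", "sampleB")
val rows = Map("sampleA" -> List(1.0, 0.85), "sampleB" -> List(0.91, 1.0))
println(samples.mkString("\t", "\t", ""))                        // header: leading tab + sample names
for (s <- samples) println(rows(s).mkString(s + "\t", "\t", "")) // one row per sample, row name first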
diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala
index b87e3cb1ec19fc84b6b4852e525e2d18f03d4e77..e732c1023de5f713f4c21ed366da68eddc825285 100644
--- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala
@@ -59,7 +59,7 @@ class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction {
       ".filtered_polymorphic_sites.fasta",
       ".filtered_polymorphic_sites.phylip",
       ".final_tree.tre")
-    for (t <- out) outputFiles ::= new File(outputDirectory + File.separator + prefix + t)
+    for (t <- out) outputFiles ::= new File(outputDirectory + File.separator + prefix.getOrElse("gubbins") + t)
   }
 
   def cmdLine = required("cd", outputDirectory) + " && " + required(executable) +
diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala
new file mode 100644
index 0000000000000000000000000000000000000000..11d930d5972897824e341d22d46808d849d7767b
--- /dev/null
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala
@@ -0,0 +1,472 @@
+package nl.lumc.sasc.biopet.tools
+
+import java.io.{ FileOutputStream, PrintWriter, File }
+
+import htsjdk.variant.variantcontext.{ VariantContext, Genotype }
+import htsjdk.variant.vcf.VCFFileReader
+import nl.lumc.sasc.biopet.core.{ BiopetJavaCommandLineFunction, ToolCommand }
+import nl.lumc.sasc.biopet.core.config.Configurable
+import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+import scala.sys.process.{ Process, ProcessLogger }
+import htsjdk.samtools.util.Interval
+
+/**
+ * Created by pjvan_thof on 1/10/15.
+ */
+class VcfStats(val root: Configurable) extends BiopetJavaCommandLineFunction {
+  javaMainClass = getClass.getName
+
+  @Input(doc = "Input VCF file", shortName = "I", required = true)
+  var input: File = _
+
+  protected var outputDir: String = _
+
+  /**
+   * Set output dir and an output file
+   * @param dir
+   */
+  def setOutputDir(dir: String): Unit = {
+    outputDir = dir
+    this.jobOutputFile = new File(dir + File.separator + ".vcfstats.out")
+  }
+
+  /**
+   * Creates command to execute extension
+   * @return
+   */
+  override def commandLine = super.commandLine +
+    required("-I", input) +
+    required("-o", outputDir)
+}
+
+object VcfStats extends ToolCommand {
+  /** Commandline argument */
+  case class Args(inputFile: File = null, outputDir: String = null, intervals: Option[File] = None) extends AbstractArgs
+
+  /** Parsing commandline arguments */
+  class OptParser extends AbstractOptParser {
+    opt[File]('I', "inputFile") required () unbounded () valueName ("<file>") action { (x, c) =>
+      c.copy(inputFile = x)
+    }
+    opt[String]('o', "outputDir") required () unbounded () valueName ("<file>") action { (x, c) =>
+      c.copy(outputDir = x)
+    }
+    //TODO: add interval argument
+    /*
+    opt[File]('i', "intervals") unbounded () valueName ("<file>") action { (x, c) =>
+      c.copy(intervals = Some(x))
+    }
+    */
+  }
+
+  /**
+   * Class to store sample to sample compare stats
+   * @param genotypeOverlap Number of genotypes that match with the other sample
+   * @param alleleOverlap Number of alleles also found in the other sample
+   */
+  case class SampleToSampleStats(var genotypeOverlap: Int = 0,
+                                 var alleleOverlap: Int = 0) {
+    /** Add another instance */
+    def +=(other: SampleToSampleStats) {
+      this.genotypeOverlap += other.genotypeOverlap
+      this.alleleOverlap += other.alleleOverlap
+    }
+  }
+
+  /**
+   * Class to store all sample-relative stats
+   * @param genotypeStats Stores all genotype relative stats
+   * @param sampleToSample Stores sample to sample compare stats
+   */
+  case class SampleStats(val genotypeStats: mutable.Map[String, mutable.Map[Any, Int]] = mutable.Map(),
+                         val sampleToSample: mutable.Map[String, SampleToSampleStats] = mutable.Map()) {
+    /** Add another instance */
+    def +=(other: SampleStats): Unit = {
+      for ((key, value) <- other.sampleToSample) {
+        if (this.sampleToSample.contains(key)) this.sampleToSample(key) += value
+        else this.sampleToSample(key) = value
+      }
+      for ((field, fieldMap) <- other.genotypeStats) {
+        val thisField = this.genotypeStats.get(field)
+        if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap)
+        else this.genotypeStats += field -> fieldMap
+      }
+    }
+  }
+
+  /**
+   * General stats class to store vcf stats
+   * @param generalStats Stores all general stats
+   * @param samplesStats Stores all sample/genotype specific stats
+   */
+  case class Stats(val generalStats: mutable.Map[String, mutable.Map[Any, Int]] = mutable.Map(),
+                   val samplesStats: mutable.Map[String, SampleStats] = mutable.Map()) {
+    /** Add another instance */
+    def +=(other: Stats): Stats = {
+      for ((key, value) <- other.samplesStats) {
+        if (this.samplesStats.contains(key)) this.samplesStats(key) += value
+        else this.samplesStats(key) = value
+      }
+      for ((field, fieldMap) <- other.generalStats) {
+        val thisField = this.generalStats.get(field)
+        if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap)
+        else this.generalStats += field -> fieldMap
+      }
+      this
+    }
+  }
+
+  /**
+   * Merge m2 into m1
+   * @param m1
+   * @param m2
+   */
+  def mergeStatsMap(m1: mutable.Map[Any, Int], m2: mutable.Map[Any, Int]): Unit = {
+    for (key <- m2.keySet)
+      m1(key) = m1.getOrElse(key, 0) + m2(key)
+  }
+
+  /**
+   * Merge m2 into m1
+   * @param m1
+   * @param m2
+   */
+  def mergeNestedStatsMap(m1: mutable.Map[String, mutable.Map[Any, Int]], m2: Map[String, Map[Any, Int]]): Unit = {
+    for ((field, fieldMap) <- m2) {
+      if (m1.contains(field)) {
+        for ((key, value) <- fieldMap) {
+          if (m1(field).contains(key)) m1(field)(key) += value
+          else m1(field)(key) = value
+        }
+      } else m1(field) = mutable.Map(fieldMap.toList: _*)
+    }
+  }
+
+  protected var commandArgs: Args = _
+
+  /**
+   * @param args the command line arguments
+   */
+  def main(args: Array[String]): Unit = {
+    logger.info("Started")
+    val argsParser = new OptParser
+    commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
+
+    val reader = new VCFFileReader(commandArgs.inputFile, true)
+    val header = reader.getFileHeader
+    val samples = header.getSampleNamesInOrder.toList
+
+    val intervals: List[Interval] = (
+      for (
+        seq <- header.getSequenceDictionary.getSequences;
+        chunks = seq.getSequenceLength / 10000000;
+        i <- 1 until chunks
+      ) yield {
+        val size = seq.getSequenceLength / chunks
+        val begin = size * (i - 1) + 1
+        val end = if (i >= chunks) seq.getSequenceLength else size * i
+        new Interval(seq.getSequenceName, begin, end)
+      }
+    ).toList
+
+    val totalBases = intervals.foldRight(0L)(_.length() + _)
+
+    // Reading vcf records
+    logger.info("Start reading vcf records")
+
+    def createStats: Stats = {
+      val stats = new Stats
+      //init stats
+      for (sample1 <- samples) {
+        stats.samplesStats += sample1 -> new SampleStats
+        for (sample2 <- samples) {
+          stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats
+        }
+      }
+      stats
+    }
+
+    var variantCounter = 0L
+    var baseCounter = 0L
+
+    def status(count: Int, interval: Interval): Unit = {
+      variantCounter += count
+      baseCounter += interval.length()
+      val fraction = baseCounter.toFloat / totalBases * 100
+      logger.info(interval + " done, " + count + " rows processed")
+      logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
+    }
+
+    val statsChunks = for (interval <- intervals.par) yield {
+      val reader = new VCFFileReader(commandArgs.inputFile, true)
+      var chunkCounter = 0
+      val stats = createStats
+      logger.info("Starting on: " + interval)
+
+      for (
+        record <- reader.query(interval.getSequence, interval.getStart, interval.getEnd) if record.getStart <= interval.getEnd
+      ) {
+        mergeNestedStatsMap(stats.generalStats, checkGeneral(record))
+        for (sample1 <- samples) yield {
+          val genotype = record.getGenotype(sample1)
+          mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(record, genotype))
+          for (sample2 <- samples) {
+            val genotype2 = record.getGenotype(sample2)
+            if (genotype.getAlleles == genotype2.getAlleles)
+              stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
+            stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += genotype.getAlleles.count(allele => genotype2.getAlleles.exists(_.basesMatch(allele)))
+          }
+        }
+        chunkCounter += 1
+      }
+      status(chunkCounter, interval)
+      stats
+    }
+
+    val stats = statsChunks.toList.fold(createStats)(_ += _)
+
+    logger.info("Done reading vcf records")
+
+    writeField("QUAL", stats.generalStats.getOrElse("QUAL", mutable.Map()))
+    writeField("general", stats.generalStats.getOrElse("general", mutable.Map()))
+    writeGenotypeFields(stats, commandArgs.outputDir + "/genotype_", samples)
+    writeOverlap(stats, _.genotypeOverlap, commandArgs.outputDir + "/sample_compare/genotype_overlap", samples)
+    writeOverlap(stats, _.alleleOverlap, commandArgs.outputDir + "/sample_compare/allele_overlap", samples)
+
+    logger.info("Done")
+  }
+
+  /**
+   * Function to check all general stats, all info except sample/genotype specific stats
+   * @param record
+   * @return
+   */
+  protected def checkGeneral(record: VariantContext): Map[String, Map[Any, Int]] = {
+    val buffer = mutable.Map[String, Map[Any, Int]]()
+
+    def addToBuffer(key: String, value: Any): Unit = {
+      val map = buffer.getOrElse(key, Map())
+      buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
+    }
+
+    buffer += "QUAL" -> Map(record.getPhredScaledQual -> 1)
+
+    addToBuffer("general", "Total")
+    if (record.isBiallelic) addToBuffer("general", "Biallelic")
+    if (record.isComplexIndel) addToBuffer("general", "ComplexIndel")
+    if (record.isFiltered) addToBuffer("general", "Filtered")
+    if (record.isFullyDecoded) addToBuffer("general", "FullyDecoded")
+    if (record.isIndel) addToBuffer("general", "Indel")
+    if (record.isMixed) addToBuffer("general", "Mixed")
+    if (record.isMNP) addToBuffer("general", "MNP")
+    if (record.isMonomorphicInSamples) addToBuffer("general", "MonomorphicInSamples")
+    if (record.isNotFiltered) addToBuffer("general", "NotFiltered")
+    if (record.isPointEvent) addToBuffer("general", "PointEvent")
+    if (record.isPolymorphicInSamples) addToBuffer("general", "PolymorphicInSamples")
+    if (record.isSimpleDeletion) addToBuffer("general", "SimpleDeletion")
+    if (record.isSimpleInsertion) addToBuffer("general", "SimpleInsertion")
+    if (record.isSNP) addToBuffer("general", "SNP")
+    if (record.isStructuralIndel) addToBuffer("general", "StructuralIndel")
+    if (record.isSymbolic) addToBuffer("general", "Symbolic")
+    if (record.isSymbolicOrSV) addToBuffer("general", "SymbolicOrSV")
+    if (record.isVariant) addToBuffer("general", "Variant")
+
+    buffer.toMap
+  }
+
+  /**
+   * Function to check sample/genotype specific stats
+   * @param record
+   * @param genotype
+   * @return
+   */
+  protected def checkGenotype(record: VariantContext, genotype: Genotype): Map[String, Map[Any, Int]] = {
+    val buffer = mutable.Map[String, Map[Any, Int]]()
+
+    def addToBuffer(key: String, value: Any): Unit = {
+      val map = buffer.getOrElse(key, Map())
+      buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
+    }
+
+    buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
+    buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)
+
+    val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList
+
+    addToBuffer("general", "Total")
+    if (genotype.isHet) addToBuffer("general", "Het")
+    if (genotype.isHetNonRef) addToBuffer("general", "HetNonRef")
+    if (genotype.isHom) addToBuffer("general", "Hom")
+    if (genotype.isHomRef) addToBuffer("general", "HomRef")
+    if (genotype.isHomVar) addToBuffer("general", "HomVar")
+    if (genotype.isMixed) addToBuffer("general", "Mixed")
+    if (genotype.isNoCall) addToBuffer("general", "NoCall")
+    if (genotype.isNonInformative) addToBuffer("general", "NonInformative")
+    if (genotype.isAvailable) addToBuffer("general", "Available")
+    if (genotype.isCalled) addToBuffer("general", "Called")
+    if (genotype.isFiltered) addToBuffer("general", "Filtered")
+
+    if (genotype.hasAD) {
+      val ad = genotype.getAD
+      for (i <- 0 until ad.size if ad(i) > 0) {
+        addToBuffer("AD", ad(i))
+        if (i == 0) addToBuffer("AD-ref", ad(i))
+        if (i > 0) addToBuffer("AD-alt", ad(i))
+        if (usedAlleles.exists(_ == i)) addToBuffer("AD-used", ad(i))
+        else addToBuffer("AD-not_used", ad(i))
+      }
+    }
+
+    buffer.toMap
+  }
+
+  /**
+   * Function to write stats to tsv files
+   * @param stats
+   * @param prefix
+   * @param samples
+   */
+  protected def writeGenotypeFields(stats: Stats, prefix: String, samples: List[String]) {
+    val fields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")
+    for (field <- fields) {
+      writeGenotypeField(stats, prefix, samples, field)
+    }
+  }
+
+  /**
+   * Function to write 1 specific genotype field
+   * @param stats
+   * @param prefix
+   * @param samples
+   * @param field
+   */
+  protected def writeGenotypeField(stats: Stats, prefix: String, samples: List[String], field: String): Unit = {
+    val file = new File(prefix + field + ".tsv")
+    file.getParentFile.mkdirs()
+    val writer = new PrintWriter(file)
+    writer.println(samples.mkString(field + "\t", "\t", ""))
+    val keySet = (for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any, Int]()).keySet).fold(Set[Any]())(_ ++ _)
+    for (key <- keySet.toList.sortWith(sortAnyAny(_, _))) {
+      val values = for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any, Int]()).getOrElse(key, 0)
+      writer.println(values.mkString(key + "\t", "\t", ""))
+    }
+    writer.close()
+    plotLine(file)
+  }
+
+  /**
+   * Function to write 1 specific general field
+   * @param prefix
+   * @param data
+   * @return
+   */
+  protected def writeField(prefix: String, data: mutable.Map[Any, Int]): File = {
+    val file = new File(commandArgs.outputDir + "/" + prefix + ".tsv")
+    println(file)
+    file.getParentFile.mkdirs()
+    val writer = new PrintWriter(file)
+    writer.println("\t" + prefix)
+    for (key <- data.keySet.toList.sortWith(sortAnyAny(_, _))) {
+      writer.println(key + "\t" + data(key))
+    }
+    writer.close()
+    file
+  }
+
+  /**
+   * Function to sort Any values
+   * @param a
+   * @param b
+   * @return
+   */
+  def sortAnyAny(a: Any, b: Any): Boolean = {
+    a match {
+      case ai: Int => {
+        b match {
+          case bi: Int => ai < bi
+          case bi: Double => ai < bi
+          case _ => a.toString < b.toString
+        }
+      }
+      case _ => a.toString < b.toString
+    }
+  }
+
+  /**
+   * Function to write sample to sample compare tsv's / heatmaps
+   * @param stats
+   * @param function function to extract targeted var in SampleToSampleStats
+   * @param prefix
+   * @param samples
+   */
+  def writeOverlap(stats: Stats, function: SampleToSampleStats => Int,
+                   prefix: String, samples: List[String]): Unit = {
+    val absFile = new File(prefix + ".abs.tsv")
+    val relFile = new File(prefix + ".rel.tsv")
+
+    absFile.getParentFile.mkdirs()
+
+    val absWriter = new PrintWriter(absFile)
+    val relWriter = new PrintWriter(relFile)
+
+    absWriter.println(samples.mkString("\t", "\t", ""))
+    relWriter.println(samples.mkString("\t", "\t", ""))
+    for (sample1 <- samples) {
+      val values = for (sample2 <- samples) yield function(stats.samplesStats(sample1).sampleToSample(sample2))
+
+      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))
+
+      val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
+      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
+    }
+    absWriter.close()
+    relWriter.close()
+
+    plotHeatmap(relFile)
+  }
+
+  /**
+   * Plots heatmaps on target tsv file
+   * @param file
+   */
+  def plotHeatmap(file: File) {
+    executeRscript("plotHeatmap.R", Array(file.getAbsolutePath,
+      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
+      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
+      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"))
+  }
+
+  /**
+   * Plots line graph with target tsv file
+   * @param file
+   */
+  def plotLine(file: File) {
+    executeRscript("plotXY.R", Array(file.getAbsolutePath,
+      file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
+  }
+
+  /**
+   * Function to execute Rscript as subprocess
+   * @param resource
+   * @param args
+   */
+  def executeRscript(resource: String, args: Array[String]): Unit = {
+    val is = getClass.getResourceAsStream(resource)
+    val file = File.createTempFile("script.", "." + resource)
+    val os = new FileOutputStream(file)
+    org.apache.commons.io.IOUtils.copy(is, os)
+    os.close()
+
+    val command: String = "Rscript " + file + " " + args.mkString(" ")
+
+    logger.info("Starting: " + command)
+    val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
+    if (process.exitValue() == 0) logger.info("Done: " + command)
+    else {
+      logger.warn("Failed: " + command)
+      if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
+    }
+  }
+}
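To make the two overlap counters in the tool above concrete: for every record and every ordered sample pair, genotypeOverlap is incremented only when the complete allele lists are equal, while alleleOverlap counts how many of the first sample's alleles have a base-matching allele in the second sample. A small illustration with htsjdk objects; the sample names and genotypes below are invented:

import scala.collection.JavaConversions._
import htsjdk.variant.variantcontext.{ Allele, GenotypeBuilder }

// Hypothetical genotypes at one site: sampleA = A/T, sampleB = A/A (A is the reference allele)
val ref = Allele.create("A", true)
val alt = Allele.create("T", false)
val gA = new GenotypeBuilder("sampleA", List(ref, alt)).make()
val gB = new GenotypeBuilder("sampleB", List(ref, ref)).make()

gA.getAlleles == gB.getAlleles                                  // false -> genotypeOverlap is not incremented
gA.getAlleles.count(a => gB.getAlleles.exists(_.basesMatch(a))) // 1 (only the A) -> alleleOverlap += 1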
diff --git a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala
new file mode 100644
index 0000000000000000000000000000000000000000..9ac90deecad83fab23d95bfafaf89442899b798e
--- /dev/null
+++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala
@@ -0,0 +1,77 @@
+package nl.lumc.sasc.biopet.tools
+
+import org.scalatest.Matchers
+import org.scalatest.testng.TestNGSuite
+import org.testng.annotations.Test
+import scala.collection.mutable
+import VcfStats._
+
+/**
+ * Created by pjvan_thof on 2/5/15.
+ */
+class VcfStatsTest extends TestNGSuite with Matchers {
+
+  @Test
+  def testSampleToSampleStats: Unit = {
+    val s1 = SampleToSampleStats()
+    val s2 = SampleToSampleStats()
+    s1.alleleOverlap shouldBe 0
+    s1.genotypeOverlap shouldBe 0
+    s2.alleleOverlap shouldBe 0
+    s2.genotypeOverlap shouldBe 0
+
+    s1 += s2
+    s1.alleleOverlap shouldBe 0
+    s1.genotypeOverlap shouldBe 0
+    s2.alleleOverlap shouldBe 0
+    s2.genotypeOverlap shouldBe 0
+
+    s2.alleleOverlap = 2
+    s2.genotypeOverlap = 3
+
+    s1 += s2
+    s1.alleleOverlap shouldBe 2
+    s1.genotypeOverlap shouldBe 3
+    s2.alleleOverlap shouldBe 2
+    s2.genotypeOverlap shouldBe 3
+
+    s1 += s2
+    s1.alleleOverlap shouldBe 4
+    s1.genotypeOverlap shouldBe 6
+    s2.alleleOverlap shouldBe 2
+    s2.genotypeOverlap shouldBe 3
+  }
+
+  @Test
+  def testSampleStats: Unit = {
+    val s1 = SampleStats()
+    val s2 = SampleStats()
+
+    s1.sampleToSample += "s1" -> SampleToSampleStats()
+    s1.sampleToSample += "s2" -> SampleToSampleStats()
+    s2.sampleToSample += "s1" -> SampleToSampleStats()
+    s2.sampleToSample += "s2" -> SampleToSampleStats()
+
+    s1.sampleToSample("s1").alleleOverlap = 1
+    s2.sampleToSample("s2").alleleOverlap = 2
+
+    s1.genotypeStats += "1" -> mutable.Map(1 -> 1)
+    s2.genotypeStats += "2" -> mutable.Map(2 -> 2)
+
+    val ss1 = SampleToSampleStats()
+    val ss2 = SampleToSampleStats()
+
+    s1 += s2
+    s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 1), "2" -> mutable.Map(2 -> 2))
+    ss1.alleleOverlap = 1
+    ss2.alleleOverlap = 2
+    s1.sampleToSample shouldBe mutable.Map("s1" -> ss1, "s2" -> ss2)
+
+    s1 += s2
+    s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 1), "2" -> mutable.Map(2 -> 4))
+
+    s1 += s1
+    s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 2), "2" -> mutable.Map(2 -> 8))
+  }
+
+}
diff --git a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala
index cd5376a790fcc9ab544c7cdecc0b347ad0f601df..070b1c756b7677541b5835dd272bc1216758e7be 100644
--- a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala
+++ b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala
@@ -35,6 +35,7 @@ object BiopetExecutablePublic extends BiopetExecutable {
     nl.lumc.sasc.biopet.tools.CheckAllelesVcfInBam,
     nl.lumc.sasc.biopet.tools.VcfToTsv,
     nl.lumc.sasc.biopet.tools.VcfFilter,
+    nl.lumc.sasc.biopet.tools.VcfStats,
    nl.lumc.sasc.biopet.tools.FindRepeatsPacBio,
    nl.lumc.sasc.biopet.tools.BedToInterval,
    nl.lumc.sasc.biopet.tools.MpileupToVcf,
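As a closing note on the sample-compare output introduced here: the .abs.tsv matrices hold raw counts, and the .rel.tsv matrices divide each row by that sample's self-overlap, so the diagonal is always 1.0. A tiny sketch of that arithmetic with invented counts:

// One row of genotype_overlap.abs.tsv for sample s1, and its .rel.tsv equivalent.
val absRow = Map("s1" -> 980, "s2" -> 850, "s3" -> 400) // overlap of s1 with s1, s2, s3
val total = absRow("s1")                                // self-overlap is the denominator
val relRow = absRow.mapValues(_.toFloat / total)        // Map(s1 -> 1.0, s2 -> ~0.87, s3 -> ~0.41)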