Commit 1a870a62 authored by bow's avatar bow
Browse files

Merge branch 'feature-vcf_stats' into 'develop'

Feature vcf stats

More functions can be added but this is a first version that is ready to use in the gatk pipeline.

is also fix for #93

See merge request !84
parents 74077f92 74f11c8e
......@@ -7,7 +7,7 @@ package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import java.io.File
import nl.lumc.sasc.biopet.tools.{ MpileupToVcf, VcfFilter, MergeAlleles }
import nl.lumc.sasc.biopet.tools.{ VcfStats, MpileupToVcf, VcfFilter, MergeAlleles }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.gatk.{ AnalyzeCovariates, BaseRecalibrator, GenotypeGVCFs, HaplotypeCaller, IndelRealigner, PrintReads, RealignerTargetCreator, SelectVariants, CombineVariants, UnifiedGenotyper }
import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates
......@@ -197,6 +197,12 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr
val cvFinal = CombineVariants(this, mergeList.toList, outputDir + outputName + ".final.vcf.gz")
cvFinal.genotypemergeoption = org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils.GenotypeMergeType.UNSORTED
add(cvFinal)
val vcfStats = new VcfStats(this)
vcfStats.input = cvFinal.out
vcfStats.setOutputDir(outputDir + File.separator + "vcfstats")
add(vcfStats)
scriptOutput.finalVcfFile = cvFinal.out
}
}
......
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import java.io.File
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
import nl.lumc.sasc.biopet.extensions.gatk.SelectVariants
import nl.lumc.sasc.biopet.extensions.gatk.VariantEval
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
class GatkVcfSampleCompare(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "Sample vcf file(s)", shortName = "V")
var vcfFiles: List[File] = _
@Argument(doc = "Reference", shortName = "R", required = false)
var reference: File = config("reference")
@Argument(doc = "Target bed", shortName = "targetBed", required = false)
var targetBed: List[File] = Nil
@Argument(doc = "Samples", shortName = "sample", required = false)
var samples: List[String] = Nil
var vcfFile: File = _
var sampleVcfs: Map[String, File] = Map()
def generalSampleDir = outputDir + "samples/"
def init() {
if (config.contains("target_bed"))
for (bed <- config("target_bed").asList)
targetBed :+= bed.toString
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
}
def biopetScript() {
vcfFile = if (vcfFiles.size > 1) {
val combineVariants = CombineVariants(this, vcfFiles, outputDir + "merge.vcf")
add(combineVariants)
combineVariants.out
} else vcfFiles.head
for (sample <- samples) {
sampleVcfs += (sample -> new File(generalSampleDir + sample + File.separator + sample + ".vcf"))
val selectVariants = SelectVariants(this, vcfFile, sampleVcfs(sample))
selectVariants.sample_name = Seq(sample)
selectVariants.excludeNonVariants = true
add(selectVariants)
}
val sampleCompareMetrics = new SampleCompareMetrics(this)
sampleCompareMetrics.samples = samples
sampleCompareMetrics.sampleDir = generalSampleDir
sampleCompareMetrics.snpRelFile = outputDir + "compare.snp.rel.tsv"
sampleCompareMetrics.snpAbsFile = outputDir + "compare.snp.abs.tsv"
sampleCompareMetrics.indelRelFile = outputDir + "compare.indel.rel.tsv"
sampleCompareMetrics.indelAbsFile = outputDir + "compare.indel.abs.tsv"
sampleCompareMetrics.totalFile = outputDir + "total.tsv"
for ((sample, sampleVcf) <- sampleVcfs) {
val sampleDir = generalSampleDir + sample + File.separator
for ((compareSample, compareSampleVcf) <- sampleVcfs) {
val variantEval = VariantEval(this,
sampleVcf,
compareSampleVcf,
new File(sampleDir + sample + "-" + compareSample + ".eval.txt"),
Seq("VariantType", "CompRod"),
Seq("CompOverlap")
)
if (targetBed != null) variantEval.L = targetBed
add(variantEval)
sampleCompareMetrics.deps ::= variantEval.out
}
}
add(sampleCompareMetrics)
}
}
object GatkVcfSampleCompare extends PipelineCommand
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import java.io.File
import java.io.PrintWriter
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.R.RScriptExecutor
import org.broadinstitute.gatk.utils.commandline.{ Output, Argument }
import scala.io.Source
import org.broadinstitute.gatk.utils.R.{ RScriptLibrary, RScriptExecutor }
import org.broadinstitute.gatk.utils.io.Resource
import scala.collection.mutable.Map
import scala.math._
class SampleCompareMetrics(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Argument(doc = "Sample Dir", shortName = "sampleDir", required = true)
var sampleDir: String = _
@Argument(doc = "Samples", shortName = "sample", required = true)
var samples: List[String] = Nil
@Argument(doc = "File sufix", shortName = "sufix", required = false)
var fileSufix: String = _
@Output(doc = "snpRelFile", shortName = "snpRelFile", required = true)
var snpRelFile: File = _
@Output(doc = "snpAbsFile", shortName = "snpAbsFile", required = true)
var snpAbsFile: File = _
@Output(doc = "indelRelFile", shortName = "indelRelFile", required = true)
var indelRelFile: File = _
@Output(doc = "indelAbsFile", shortName = "indelAbsFile", required = true)
var indelAbsFile: File = _
@Output(doc = "totalFile", shortName = "totalFile", required = true)
var totalFile: File = _
override val defaultVmem = "8G"
memoryLimit = Option(4.0)
override def commandLine = super.commandLine +
required("-sampleDir", sampleDir) +
repeat("-sample", samples) +
optional("-fileSufix", fileSufix) +
required("-snpRelFile", snpRelFile) +
required("-snpAbsFile", snpAbsFile) +
required("-indelRelFile", indelRelFile) +
required("-indelAbsFile", indelAbsFile) +
required("-totalFile", totalFile)
}
object SampleCompareMetrics {
var sampleDir: String = _
var samples: List[String] = Nil
var fileSufix: String = ".eval.txt"
var snpRelFile: File = _
var snpAbsFile: File = _
var indelRelFile: File = _
var indelAbsFile: File = _
var totalFile: File = _
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
for (t <- 0 until args.size) {
args(t) match {
case "-sample" => samples +:= args(t + 1)
case "-sampleDir" => sampleDir = args(t + 1)
case "-fileSufix" => fileSufix = args(t + 1)
case "-snpRelFile" => snpRelFile = new File(args(t + 1))
case "-snpAbsFile" => snpAbsFile = new File(args(t + 1))
case "-indelRelFile" => indelRelFile = new File(args(t + 1))
case "-indelAbsFile" => indelAbsFile = new File(args(t + 1))
case "-totalFile" => totalFile = new File(args(t + 1))
case _ =>
}
}
if (sampleDir == null) throw new IllegalStateException("No sampleDir, use -sampleDir")
else if (!sampleDir.endsWith("/")) sampleDir += "/"
val regex = """\W+""".r
val snpsOverlap: Map[(String, String), Int] = Map()
val indelsOverlap: Map[(String, String), Int] = Map()
val snpsTotal: Map[String, Int] = Map()
val indelsTotal: Map[String, Int] = Map()
for (sample1 <- samples; sample2 <- samples) {
val reader = Source.fromFile(new File(sampleDir + sample1 + "/" + sample1 + "-" + sample2 + fileSufix))
for (line <- reader.getLines) {
regex.split(line) match {
case Array(_, _, _, varType, all, novel, overlap, rate, _*) => {
varType match {
case "SNP" => {
snpsOverlap += (sample1, sample2) -> overlap.toInt
snpsTotal += sample1 -> all.toInt
}
case "INDEL" => {
indelsOverlap += (sample1, sample2) -> overlap.toInt
indelsTotal += sample1 -> all.toInt
}
case _ =>
}
}
case _ =>
}
}
reader.close()
}
val snpRelWritter = new PrintWriter(snpRelFile)
val snpAbsWritter = new PrintWriter(snpAbsFile)
val indelRelWritter = new PrintWriter(indelRelFile)
val indelAbsWritter = new PrintWriter(indelAbsFile)
val allWritters = List(snpRelWritter, snpAbsWritter, indelRelWritter, indelAbsWritter)
for (writter <- allWritters) writter.println(samples.mkString("\t", "\t", ""))
for (sample1 <- samples) {
for (writter <- allWritters) writter.print(sample1)
for (sample2 <- samples) {
snpRelWritter.print("\t" + (round((snpsOverlap(sample1, sample2).toDouble / snpsTotal(sample1) * 10000.0)) / 10000.0))
snpAbsWritter.print("\t" + snpsOverlap(sample1, sample2))
indelRelWritter.print("\t" + (round((indelsOverlap(sample1, sample2).toDouble / indelsTotal(sample1) * 10000.0)) / 10000.0))
indelAbsWritter.print("\t" + indelsOverlap(sample1, sample2))
}
for (writter <- allWritters) writter.println()
}
for (writter <- allWritters) writter.close()
val totalWritter = new PrintWriter(totalFile)
totalWritter.println("Sample\tSNPs\tIndels")
for (sample <- samples)
totalWritter.println(sample + "\t" + snpsTotal(sample) + "\t" + indelsTotal(sample))
totalWritter.close()
def plot(file: File) {
val executor = new RScriptExecutor
executor.addScript(new Resource("plotHeatmap.R", getClass))
executor.addArgs(file, file.getAbsolutePath.stripSuffix(".tsv") + ".png", file.getAbsolutePath.stripSuffix(".tsv") + ".clustering.png")
executor.exec()
}
plot(snpRelFile)
plot(indelRelFile)
}
}
\ No newline at end of file
......@@ -12,7 +12,6 @@ object BiopetExecutableProtected extends BiopetExecutable {
nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantcalling,
nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline,
nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantRecalibration,
nl.lumc.sasc.biopet.pipelines.gatk.GatkVcfSampleCompare,
nl.lumc.sasc.biopet.pipelines.basty.Basty)
def tools = BiopetExecutablePublic.tools
......
library('gplots')
args <- commandArgs(TRUE)
inputArg <- args[1]
outputArg <- args[2]
outputArgClustering <- args[3]
col <- heat.colors(250)
col[250] <- "#00FF00"
heat<-read.table(inputArg, header = 1, sep= '\t')
rownames(heat) <- heat[,1]
heat<- heat[,-1]
heat<- as.matrix(heat)
png(file = outputArg, width = 1000, height = 1000)
heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none")
dev.off()
png(file = outputArgClustering, width = 1000, height = 1000)
heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row")
dev.off()
library('gplots')
library('RColorBrewer')
args <- commandArgs(TRUE)
inputArg <- args[1]
outputArg <- args[2]
outputArgClustering <- args[3]
outputArgDendrogram <- args[4]
heat<-read.table(inputArg, header = 1, sep= '\t', stringsAsFactors = F)
#heat[heat==1] <- NA
rownames(heat) <- heat[,1]
heat<- heat[,-1]
heat<- as.matrix(heat)
colNumber <- 50
col <- rev(colorRampPalette(brewer.pal(11, "Spectral"))(colNumber))
for (i in (colNumber+1):(colNumber+round((dist(range(heat)) - dist(range(heat[heat < 1]))) / dist(range(heat[heat < 1])) * colNumber))) {
col[i] <- col[colNumber]
}
col[length(col)] <- "#00FF00"
png(file = outputArg, width = 1200, height = 1200)
heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none", margins = c(12, 12), na.color="#00FF00")
dev.off()
hc <- hclust(d = dist(heat))
png(file = outputArgDendrogram, width = 1200, height = 1200)
plot(as.dendrogram(hc), horiz=TRUE, asp=0.02)
dev.off()
png(file = outputArgClustering, width = 1200, height = 1200)
heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row",margins = c(12, 12), na.color="#00FF00")
dev.off()
library('ggplot2')
library('reshape2')
args <- commandArgs(TRUE)
inputArg <- args[1]
outputArg <- args[2]
tsv<-read.table(inputArg, header = 1, sep= '\t', stringsAsFactors = F)
data <- melt(tsv)
data$X <- as.numeric(data$X)
data <- na.omit(data)
data <- data[data$value > 0,]
print("Starting to plot")
png(file = outputArg, width = 1500, height = 1500)
ggplot(data, aes(x=X, y=value, color=variable, group=variable)) + geom_line()
dev.off()
print("plot done")
......@@ -59,7 +59,7 @@ class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction {
".filtered_polymorphic_sites.fasta",
".filtered_polymorphic_sites.phylip",
".final_tree.tre")
for (t <- out) outputFiles ::= new File(outputDirectory + File.separator + prefix + t)
for (t <- out) outputFiles ::= new File(outputDirectory + File.separator + prefix.getOrElse("gubbins") + t)
}
def cmdLine = required("cd", outputDirectory) + " && " + required(executable) +
......
package nl.lumc.sasc.biopet.tools
import java.io.{ FileOutputStream, PrintWriter, File }
import htsjdk.variant.variantcontext.{ VariantContext, Genotype }
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.core.{ BiopetJavaCommandLineFunction, ToolCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.sys.process.{ Process, ProcessLogger }
import htsjdk.samtools.util.Interval
/**
* Created by pjvan_thof on 1/10/15.
*/
class VcfStats(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Input(doc = "Input fastq", shortName = "I", required = true)
var input: File = _
protected var outputDir: String = _
/**
* Set output dir and a output file
* @param dir
*/
def setOutputDir(dir: String): Unit = {
outputDir = dir
this.jobOutputFile = new File(dir + File.separator + ".vcfstats.out")
}
/**
* Creates command to execute extension
* @return
*/
override def commandLine = super.commandLine +
required("-I", input) +
required("-o", outputDir)
}
object VcfStats extends ToolCommand {
/** Commandline argument */
case class Args(inputFile: File = null, outputDir: String = null, intervals: Option[File] = None) extends AbstractArgs
/** Parsing commandline arguments */
class OptParser extends AbstractOptParser {
opt[File]('I', "inputFile") required () unbounded () valueName ("<file>") action { (x, c) =>
c.copy(inputFile = x)
}
opt[String]('o', "outputDir") required () unbounded () valueName ("<file>") action { (x, c) =>
c.copy(outputDir = x)
}
//TODO: add interval argument
/*
opt[File]('i', "intervals") unbounded () valueName ("<file>") action { (x, c) =>
c.copy(intervals = Some(x))
}
*/
}
/**
* Class to store sample to sample compare stats
* @param genotypeOverlap Number of genotypes match with other sample
* @param alleleOverlap Number of alleles also found in other sample
*/
case class SampleToSampleStats(var genotypeOverlap: Int = 0,
var alleleOverlap: Int = 0) {
/** Add an other class */
def +=(other: SampleToSampleStats) {
this.genotypeOverlap += other.genotypeOverlap
this.alleleOverlap += other.alleleOverlap
}
}
/**
* class to store all sample relative stats
* @param genotypeStats Stores all genotype relative stats
* @param sampleToSample Stores sample to sample compare stats
*/
case class SampleStats(val genotypeStats: mutable.Map[String, mutable.Map[Any, Int]] = mutable.Map(),
val sampleToSample: mutable.Map[String, SampleToSampleStats] = mutable.Map()) {
/** Add an other class */
def +=(other: SampleStats): Unit = {
for ((key, value) <- other.sampleToSample) {
if (this.sampleToSample.contains(key)) this.sampleToSample(key) += value
else this.sampleToSample(key) = value
}
for ((field, fieldMap) <- other.genotypeStats) {
val thisField = this.genotypeStats.get(field)
if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap)
else this.genotypeStats += field -> fieldMap
}
}
}
/**
* General stats class to store vcf stats
* @param generalStats Stores are general stats
* @param samplesStats Stores all sample/genotype specific stats
*/
case class Stats(val generalStats: mutable.Map[String, mutable.Map[Any, Int]] = mutable.Map(),
val samplesStats: mutable.Map[String, SampleStats] = mutable.Map()) {
/** Add an other class */
def +=(other: Stats): Stats = {
for ((key, value) <- other.samplesStats) {
if (this.samplesStats.contains(key)) this.samplesStats(key) += value
else this.samplesStats(key) = value
}
for ((field, fieldMap) <- other.generalStats) {
val thisField = this.generalStats.get(field)
if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap)
else this.generalStats += field -> fieldMap
}
this
}
}
/**
* Merge m2 into m1
* @param m1
* @param m2
*/
def mergeStatsMap(m1: mutable.Map[Any, Int], m2: mutable.Map[Any, Int]): Unit = {
for (key <- m2.keySet)
m1(key) = m1.getOrElse(key, 0) + m2(key)
}
/**
* Merge m2 into m1
* @param m1
* @param m2
*/
def mergeNestedStatsMap(m1: mutable.Map[String, mutable.Map[Any, Int]], m2: Map[String, Map[Any, Int]]): Unit = {
for ((field, fieldMap) <- m2) {
if (m1.contains(field)) {
for ((key, value) <- fieldMap) {
if (m1(field).contains(key)) m1(field)(key) += value
else m1(field)(key) = value
}
} else m1(field) = mutable.Map(fieldMap.toList: _*)
}
}
protected var commandArgs: Args = _
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
logger.info("Started")
val argsParser = new OptParser
commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val reader = new VCFFileReader(commandArgs.inputFile, true)
val header = reader.getFileHeader
val samples = header.getSampleNamesInOrder.toList
val intervals: List[Interval] = (
for (
seq <- header.getSequenceDictionary.getSequences;
chunks = seq.getSequenceLength / 10000000;
i <- 1 until chunks
) yield {
val size = seq.getSequenceLength / chunks
val begin = size * (i - 1) + 1
val end = if (i >= chunks) seq.getSequenceLength else size * i
new Interval(seq.getSequenceName, begin, end)
}
).toList
val totalBases = intervals.foldRight(0L)(_.length() + _)
// Reading vcf records
logger.info("Start reading vcf records")