Skip to content
Snippets Groups Projects
Commit 1a870a62 authored by bow's avatar bow
Browse files

Merge branch 'feature-vcf_stats' into 'develop'

Feature vcf stats

More functions can be added but this is a first version that is ready to use in the gatk pipeline.

is also fix for #93

See merge request !84
parents 74077f92 74f11c8e
No related branches found
No related tags found
No related merge requests found
Showing
with 613 additions and 267 deletions
......@@ -7,7 +7,7 @@ package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import java.io.File
import nl.lumc.sasc.biopet.tools.{ MpileupToVcf, VcfFilter, MergeAlleles }
import nl.lumc.sasc.biopet.tools.{ VcfStats, MpileupToVcf, VcfFilter, MergeAlleles }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.gatk.{ AnalyzeCovariates, BaseRecalibrator, GenotypeGVCFs, HaplotypeCaller, IndelRealigner, PrintReads, RealignerTargetCreator, SelectVariants, CombineVariants, UnifiedGenotyper }
import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates
......@@ -197,6 +197,12 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr
val cvFinal = CombineVariants(this, mergeList.toList, outputDir + outputName + ".final.vcf.gz")
cvFinal.genotypemergeoption = org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils.GenotypeMergeType.UNSORTED
add(cvFinal)
val vcfStats = new VcfStats(this)
vcfStats.input = cvFinal.out
vcfStats.setOutputDir(outputDir + File.separator + "vcfstats")
add(vcfStats)
scriptOutput.finalVcfFile = cvFinal.out
}
}
......
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import java.io.File
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
import nl.lumc.sasc.biopet.extensions.gatk.SelectVariants
import nl.lumc.sasc.biopet.extensions.gatk.VariantEval
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
class GatkVcfSampleCompare(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "Sample vcf file(s)", shortName = "V")
var vcfFiles: List[File] = _
@Argument(doc = "Reference", shortName = "R", required = false)
var reference: File = config("reference")
@Argument(doc = "Target bed", shortName = "targetBed", required = false)
var targetBed: List[File] = Nil
@Argument(doc = "Samples", shortName = "sample", required = false)
var samples: List[String] = Nil
var vcfFile: File = _
var sampleVcfs: Map[String, File] = Map()
def generalSampleDir = outputDir + "samples/"
def init() {
if (config.contains("target_bed"))
for (bed <- config("target_bed").asList)
targetBed :+= bed.toString
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
}
def biopetScript() {
vcfFile = if (vcfFiles.size > 1) {
val combineVariants = CombineVariants(this, vcfFiles, outputDir + "merge.vcf")
add(combineVariants)
combineVariants.out
} else vcfFiles.head
for (sample <- samples) {
sampleVcfs += (sample -> new File(generalSampleDir + sample + File.separator + sample + ".vcf"))
val selectVariants = SelectVariants(this, vcfFile, sampleVcfs(sample))
selectVariants.sample_name = Seq(sample)
selectVariants.excludeNonVariants = true
add(selectVariants)
}
val sampleCompareMetrics = new SampleCompareMetrics(this)
sampleCompareMetrics.samples = samples
sampleCompareMetrics.sampleDir = generalSampleDir
sampleCompareMetrics.snpRelFile = outputDir + "compare.snp.rel.tsv"
sampleCompareMetrics.snpAbsFile = outputDir + "compare.snp.abs.tsv"
sampleCompareMetrics.indelRelFile = outputDir + "compare.indel.rel.tsv"
sampleCompareMetrics.indelAbsFile = outputDir + "compare.indel.abs.tsv"
sampleCompareMetrics.totalFile = outputDir + "total.tsv"
for ((sample, sampleVcf) <- sampleVcfs) {
val sampleDir = generalSampleDir + sample + File.separator
for ((compareSample, compareSampleVcf) <- sampleVcfs) {
val variantEval = VariantEval(this,
sampleVcf,
compareSampleVcf,
new File(sampleDir + sample + "-" + compareSample + ".eval.txt"),
Seq("VariantType", "CompRod"),
Seq("CompOverlap")
)
if (targetBed != null) variantEval.L = targetBed
add(variantEval)
sampleCompareMetrics.deps ::= variantEval.out
}
}
add(sampleCompareMetrics)
}
}
object GatkVcfSampleCompare extends PipelineCommand
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import java.io.File
import java.io.PrintWriter
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.R.RScriptExecutor
import org.broadinstitute.gatk.utils.commandline.{ Output, Argument }
import scala.io.Source
import org.broadinstitute.gatk.utils.R.{ RScriptLibrary, RScriptExecutor }
import org.broadinstitute.gatk.utils.io.Resource
import scala.collection.mutable.Map
import scala.math._
class SampleCompareMetrics(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Argument(doc = "Sample Dir", shortName = "sampleDir", required = true)
var sampleDir: String = _
@Argument(doc = "Samples", shortName = "sample", required = true)
var samples: List[String] = Nil
@Argument(doc = "File sufix", shortName = "sufix", required = false)
var fileSufix: String = _
@Output(doc = "snpRelFile", shortName = "snpRelFile", required = true)
var snpRelFile: File = _
@Output(doc = "snpAbsFile", shortName = "snpAbsFile", required = true)
var snpAbsFile: File = _
@Output(doc = "indelRelFile", shortName = "indelRelFile", required = true)
var indelRelFile: File = _
@Output(doc = "indelAbsFile", shortName = "indelAbsFile", required = true)
var indelAbsFile: File = _
@Output(doc = "totalFile", shortName = "totalFile", required = true)
var totalFile: File = _
override val defaultVmem = "8G"
memoryLimit = Option(4.0)
override def commandLine = super.commandLine +
required("-sampleDir", sampleDir) +
repeat("-sample", samples) +
optional("-fileSufix", fileSufix) +
required("-snpRelFile", snpRelFile) +
required("-snpAbsFile", snpAbsFile) +
required("-indelRelFile", indelRelFile) +
required("-indelAbsFile", indelAbsFile) +
required("-totalFile", totalFile)
}
object SampleCompareMetrics {
var sampleDir: String = _
var samples: List[String] = Nil
var fileSufix: String = ".eval.txt"
var snpRelFile: File = _
var snpAbsFile: File = _
var indelRelFile: File = _
var indelAbsFile: File = _
var totalFile: File = _
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
for (t <- 0 until args.size) {
args(t) match {
case "-sample" => samples +:= args(t + 1)
case "-sampleDir" => sampleDir = args(t + 1)
case "-fileSufix" => fileSufix = args(t + 1)
case "-snpRelFile" => snpRelFile = new File(args(t + 1))
case "-snpAbsFile" => snpAbsFile = new File(args(t + 1))
case "-indelRelFile" => indelRelFile = new File(args(t + 1))
case "-indelAbsFile" => indelAbsFile = new File(args(t + 1))
case "-totalFile" => totalFile = new File(args(t + 1))
case _ =>
}
}
if (sampleDir == null) throw new IllegalStateException("No sampleDir, use -sampleDir")
else if (!sampleDir.endsWith("/")) sampleDir += "/"
val regex = """\W+""".r
val snpsOverlap: Map[(String, String), Int] = Map()
val indelsOverlap: Map[(String, String), Int] = Map()
val snpsTotal: Map[String, Int] = Map()
val indelsTotal: Map[String, Int] = Map()
for (sample1 <- samples; sample2 <- samples) {
val reader = Source.fromFile(new File(sampleDir + sample1 + "/" + sample1 + "-" + sample2 + fileSufix))
for (line <- reader.getLines) {
regex.split(line) match {
case Array(_, _, _, varType, all, novel, overlap, rate, _*) => {
varType match {
case "SNP" => {
snpsOverlap += (sample1, sample2) -> overlap.toInt
snpsTotal += sample1 -> all.toInt
}
case "INDEL" => {
indelsOverlap += (sample1, sample2) -> overlap.toInt
indelsTotal += sample1 -> all.toInt
}
case _ =>
}
}
case _ =>
}
}
reader.close()
}
val snpRelWritter = new PrintWriter(snpRelFile)
val snpAbsWritter = new PrintWriter(snpAbsFile)
val indelRelWritter = new PrintWriter(indelRelFile)
val indelAbsWritter = new PrintWriter(indelAbsFile)
val allWritters = List(snpRelWritter, snpAbsWritter, indelRelWritter, indelAbsWritter)
for (writter <- allWritters) writter.println(samples.mkString("\t", "\t", ""))
for (sample1 <- samples) {
for (writter <- allWritters) writter.print(sample1)
for (sample2 <- samples) {
snpRelWritter.print("\t" + (round((snpsOverlap(sample1, sample2).toDouble / snpsTotal(sample1) * 10000.0)) / 10000.0))
snpAbsWritter.print("\t" + snpsOverlap(sample1, sample2))
indelRelWritter.print("\t" + (round((indelsOverlap(sample1, sample2).toDouble / indelsTotal(sample1) * 10000.0)) / 10000.0))
indelAbsWritter.print("\t" + indelsOverlap(sample1, sample2))
}
for (writter <- allWritters) writter.println()
}
for (writter <- allWritters) writter.close()
val totalWritter = new PrintWriter(totalFile)
totalWritter.println("Sample\tSNPs\tIndels")
for (sample <- samples)
totalWritter.println(sample + "\t" + snpsTotal(sample) + "\t" + indelsTotal(sample))
totalWritter.close()
def plot(file: File) {
val executor = new RScriptExecutor
executor.addScript(new Resource("plotHeatmap.R", getClass))
executor.addArgs(file, file.getAbsolutePath.stripSuffix(".tsv") + ".png", file.getAbsolutePath.stripSuffix(".tsv") + ".clustering.png")
executor.exec()
}
plot(snpRelFile)
plot(indelRelFile)
}
}
\ No newline at end of file
......@@ -12,7 +12,6 @@ object BiopetExecutableProtected extends BiopetExecutable {
nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantcalling,
nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline,
nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantRecalibration,
nl.lumc.sasc.biopet.pipelines.gatk.GatkVcfSampleCompare,
nl.lumc.sasc.biopet.pipelines.basty.Basty)
def tools = BiopetExecutablePublic.tools
......
library('gplots')
args <- commandArgs(TRUE)
inputArg <- args[1]
outputArg <- args[2]
outputArgClustering <- args[3]
col <- heat.colors(250)
col[250] <- "#00FF00"
heat<-read.table(inputArg, header = 1, sep= '\t')
rownames(heat) <- heat[,1]
heat<- heat[,-1]
heat<- as.matrix(heat)
png(file = outputArg, width = 1000, height = 1000)
heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none")
dev.off()
png(file = outputArgClustering, width = 1000, height = 1000)
heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row")
dev.off()
library('gplots')
library('RColorBrewer')
args <- commandArgs(TRUE)
inputArg <- args[1]
outputArg <- args[2]
outputArgClustering <- args[3]
outputArgDendrogram <- args[4]
heat<-read.table(inputArg, header = 1, sep= '\t', stringsAsFactors = F)
#heat[heat==1] <- NA
rownames(heat) <- heat[,1]
heat<- heat[,-1]
heat<- as.matrix(heat)
colNumber <- 50
col <- rev(colorRampPalette(brewer.pal(11, "Spectral"))(colNumber))
for (i in (colNumber+1):(colNumber+round((dist(range(heat)) - dist(range(heat[heat < 1]))) / dist(range(heat[heat < 1])) * colNumber))) {
col[i] <- col[colNumber]
}
col[length(col)] <- "#00FF00"
png(file = outputArg, width = 1200, height = 1200)
heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none", margins = c(12, 12), na.color="#00FF00")
dev.off()
hc <- hclust(d = dist(heat))
png(file = outputArgDendrogram, width = 1200, height = 1200)
plot(as.dendrogram(hc), horiz=TRUE, asp=0.02)
dev.off()
png(file = outputArgClustering, width = 1200, height = 1200)
heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row",margins = c(12, 12), na.color="#00FF00")
dev.off()
library('ggplot2')
library('reshape2')
args <- commandArgs(TRUE)
inputArg <- args[1]
outputArg <- args[2]
tsv<-read.table(inputArg, header = 1, sep= '\t', stringsAsFactors = F)
data <- melt(tsv)
data$X <- as.numeric(data$X)
data <- na.omit(data)
data <- data[data$value > 0,]
print("Starting to plot")
png(file = outputArg, width = 1500, height = 1500)
ggplot(data, aes(x=X, y=value, color=variable, group=variable)) + geom_line()
dev.off()
print("plot done")
......@@ -59,7 +59,7 @@ class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction {
".filtered_polymorphic_sites.fasta",
".filtered_polymorphic_sites.phylip",
".final_tree.tre")
for (t <- out) outputFiles ::= new File(outputDirectory + File.separator + prefix + t)
for (t <- out) outputFiles ::= new File(outputDirectory + File.separator + prefix.getOrElse("gubbins") + t)
}
def cmdLine = required("cd", outputDirectory) + " && " + required(executable) +
......
package nl.lumc.sasc.biopet.tools
import java.io.{ FileOutputStream, PrintWriter, File }
import htsjdk.variant.variantcontext.{ VariantContext, Genotype }
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.core.{ BiopetJavaCommandLineFunction, ToolCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.sys.process.{ Process, ProcessLogger }
import htsjdk.samtools.util.Interval
/**
* Created by pjvan_thof on 1/10/15.
*/
class VcfStats(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Input(doc = "Input fastq", shortName = "I", required = true)
var input: File = _
protected var outputDir: String = _
/**
* Set output dir and a output file
* @param dir
*/
def setOutputDir(dir: String): Unit = {
outputDir = dir
this.jobOutputFile = new File(dir + File.separator + ".vcfstats.out")
}
/**
* Creates command to execute extension
* @return
*/
override def commandLine = super.commandLine +
required("-I", input) +
required("-o", outputDir)
}
object VcfStats extends ToolCommand {
/** Commandline argument */
case class Args(inputFile: File = null, outputDir: String = null, intervals: Option[File] = None) extends AbstractArgs
/** Parsing commandline arguments */
class OptParser extends AbstractOptParser {
opt[File]('I', "inputFile") required () unbounded () valueName ("<file>") action { (x, c) =>
c.copy(inputFile = x)
}
opt[String]('o', "outputDir") required () unbounded () valueName ("<file>") action { (x, c) =>
c.copy(outputDir = x)
}
//TODO: add interval argument
/*
opt[File]('i', "intervals") unbounded () valueName ("<file>") action { (x, c) =>
c.copy(intervals = Some(x))
}
*/
}
/**
* Class to store sample to sample compare stats
* @param genotypeOverlap Number of genotypes match with other sample
* @param alleleOverlap Number of alleles also found in other sample
*/
case class SampleToSampleStats(var genotypeOverlap: Int = 0,
var alleleOverlap: Int = 0) {
/** Add an other class */
def +=(other: SampleToSampleStats) {
this.genotypeOverlap += other.genotypeOverlap
this.alleleOverlap += other.alleleOverlap
}
}
/**
* class to store all sample relative stats
* @param genotypeStats Stores all genotype relative stats
* @param sampleToSample Stores sample to sample compare stats
*/
case class SampleStats(val genotypeStats: mutable.Map[String, mutable.Map[Any, Int]] = mutable.Map(),
val sampleToSample: mutable.Map[String, SampleToSampleStats] = mutable.Map()) {
/** Add an other class */
def +=(other: SampleStats): Unit = {
for ((key, value) <- other.sampleToSample) {
if (this.sampleToSample.contains(key)) this.sampleToSample(key) += value
else this.sampleToSample(key) = value
}
for ((field, fieldMap) <- other.genotypeStats) {
val thisField = this.genotypeStats.get(field)
if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap)
else this.genotypeStats += field -> fieldMap
}
}
}
/**
* General stats class to store vcf stats
* @param generalStats Stores are general stats
* @param samplesStats Stores all sample/genotype specific stats
*/
case class Stats(val generalStats: mutable.Map[String, mutable.Map[Any, Int]] = mutable.Map(),
val samplesStats: mutable.Map[String, SampleStats] = mutable.Map()) {
/** Add an other class */
def +=(other: Stats): Stats = {
for ((key, value) <- other.samplesStats) {
if (this.samplesStats.contains(key)) this.samplesStats(key) += value
else this.samplesStats(key) = value
}
for ((field, fieldMap) <- other.generalStats) {
val thisField = this.generalStats.get(field)
if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap)
else this.generalStats += field -> fieldMap
}
this
}
}
/**
* Merge m2 into m1
* @param m1
* @param m2
*/
def mergeStatsMap(m1: mutable.Map[Any, Int], m2: mutable.Map[Any, Int]): Unit = {
for (key <- m2.keySet)
m1(key) = m1.getOrElse(key, 0) + m2(key)
}
/**
* Merge m2 into m1
* @param m1
* @param m2
*/
def mergeNestedStatsMap(m1: mutable.Map[String, mutable.Map[Any, Int]], m2: Map[String, Map[Any, Int]]): Unit = {
for ((field, fieldMap) <- m2) {
if (m1.contains(field)) {
for ((key, value) <- fieldMap) {
if (m1(field).contains(key)) m1(field)(key) += value
else m1(field)(key) = value
}
} else m1(field) = mutable.Map(fieldMap.toList: _*)
}
}
protected var commandArgs: Args = _
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
logger.info("Started")
val argsParser = new OptParser
commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val reader = new VCFFileReader(commandArgs.inputFile, true)
val header = reader.getFileHeader
val samples = header.getSampleNamesInOrder.toList
val intervals: List[Interval] = (
for (
seq <- header.getSequenceDictionary.getSequences;
chunks = seq.getSequenceLength / 10000000;
i <- 1 until chunks
) yield {
val size = seq.getSequenceLength / chunks
val begin = size * (i - 1) + 1
val end = if (i >= chunks) seq.getSequenceLength else size * i
new Interval(seq.getSequenceName, begin, end)
}
).toList
val totalBases = intervals.foldRight(0L)(_.length() + _)
// Reading vcf records
logger.info("Start reading vcf records")
def createStats: Stats = {
val stats = new Stats
//init stats
for (sample1 <- samples) {
stats.samplesStats += sample1 -> new SampleStats
for (sample2 <- samples) {
stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats
}
}
stats
}
var variantCounter = 0L
var baseCounter = 0L
def status(count: Int, interval: Interval): Unit = {
variantCounter += count
baseCounter += interval.length()
val fraction = baseCounter.toFloat / totalBases * 100
logger.info(interval + " done, " + count + " rows processed")
logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
}
val statsChunks = for (interval <- intervals.par) yield {
val reader = new VCFFileReader(commandArgs.inputFile, true)
var chunkCounter = 0
val stats = createStats
logger.info("Starting on: " + interval)
for (
record <- reader.query(interval.getSequence, interval.getStart, interval.getEnd) if record.getStart <= interval.getEnd
) {
mergeNestedStatsMap(stats.generalStats, checkGeneral(record))
for (sample1 <- samples) yield {
val genotype = record.getGenotype(sample1)
mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(record, genotype))
for (sample2 <- samples) {
val genotype2 = record.getGenotype(sample2)
if (genotype.getAlleles == genotype2.getAlleles)
stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += genotype.getAlleles.count(allele => genotype2.getAlleles.exists(_.basesMatch(allele)))
}
}
chunkCounter += 1
}
status(chunkCounter, interval)
stats
}
val stats = statsChunks.toList.fold(createStats)(_ += _)
logger.info("Done reading vcf records")
writeField("QUAL", stats.generalStats.getOrElse("QUAL", mutable.Map()))
writeField("general", stats.generalStats.getOrElse("general", mutable.Map()))
writeGenotypeFields(stats, commandArgs.outputDir + "/genotype_", samples)
writeOverlap(stats, _.genotypeOverlap, commandArgs.outputDir + "/sample_compare/genotype_overlap", samples)
writeOverlap(stats, _.alleleOverlap, commandArgs.outputDir + "/sample_compare/allele_overlap", samples)
logger.info("Done")
}
/**
* Function to check all general stats, all info expect sample/genotype specific stats
* @param record
* @return
*/
protected def checkGeneral(record: VariantContext): Map[String, Map[Any, Int]] = {
val buffer = mutable.Map[String, Map[Any, Int]]()
def addToBuffer(key: String, value: Any): Unit = {
val map = buffer.getOrElse(key, Map())
buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
}
buffer += "QUAL" -> Map(record.getPhredScaledQual -> 1)
addToBuffer("general", "Total")
if (record.isBiallelic) addToBuffer("general", "Biallelic")
if (record.isComplexIndel) addToBuffer("general", "ComplexIndel")
if (record.isFiltered) addToBuffer("general", "Filtered")
if (record.isFullyDecoded) addToBuffer("general", "FullyDecoded")
if (record.isIndel) addToBuffer("general", "Indel")
if (record.isMixed) addToBuffer("general", "Mixed")
if (record.isMNP) addToBuffer("general", "MNP")
if (record.isMonomorphicInSamples) addToBuffer("general", "MonomorphicInSamples")
if (record.isNotFiltered) addToBuffer("general", "NotFiltered")
if (record.isPointEvent) addToBuffer("general", "PointEvent")
if (record.isPolymorphicInSamples) addToBuffer("general", "PolymorphicInSamples")
if (record.isSimpleDeletion) addToBuffer("general", "SimpleDeletion")
if (record.isSimpleInsertion) addToBuffer("general", "SimpleInsertion")
if (record.isSNP) addToBuffer("general", "SNP")
if (record.isStructuralIndel) addToBuffer("general", "StructuralIndel")
if (record.isSymbolic) addToBuffer("general", "Symbolic")
if (record.isSymbolicOrSV) addToBuffer("general", "SymbolicOrSV")
if (record.isVariant) addToBuffer("general", "Variant")
buffer.toMap
}
/**
* Function to check sample/genotype specific stats
* @param record
* @param genotype
* @return
*/
protected def checkGenotype(record: VariantContext, genotype: Genotype): Map[String, Map[Any, Int]] = {
val buffer = mutable.Map[String, Map[Any, Int]]()
def addToBuffer(key: String, value: Any): Unit = {
val map = buffer.getOrElse(key, Map())
buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
}
buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)
val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList
addToBuffer("general", "Total")
if (genotype.isHet) addToBuffer("general", "Het")
if (genotype.isHetNonRef) addToBuffer("general", "HetNonRef")
if (genotype.isHom) addToBuffer("general", "Hom")
if (genotype.isHomRef) addToBuffer("general", "HomRef")
if (genotype.isHomVar) addToBuffer("general", "HomVar")
if (genotype.isMixed) addToBuffer("general", "Mixed")
if (genotype.isNoCall) addToBuffer("general", "NoCall")
if (genotype.isNonInformative) addToBuffer("general", "NonInformative")
if (genotype.isAvailable) addToBuffer("general", "Available")
if (genotype.isCalled) addToBuffer("general", "Called")
if (genotype.isFiltered) addToBuffer("general", "Filtered")
if (genotype.hasAD) {
val ad = genotype.getAD
for (i <- 0 until ad.size if ad(i) > 0) {
addToBuffer("AD", ad(i))
if (i == 0) addToBuffer("AD-ref", ad(i))
if (i > 0) addToBuffer("AD-alt", ad(i))
if (usedAlleles.exists(_ == i)) addToBuffer("AD-used", ad(i))
else addToBuffer("AD-not_used", ad(i))
}
}
buffer.toMap
}
/**
* Function to write stats to tsv files
* @param stats
* @param prefix
* @param samples
*/
protected def writeGenotypeFields(stats: Stats, prefix: String, samples: List[String]) {
val fields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")
for (field <- fields) {
writeGenotypeField(stats, prefix, samples, field)
}
}
/**
* Function to write 1 specific genotype field
* @param stats
* @param prefix
* @param samples
* @param field
*/
protected def writeGenotypeField(stats: Stats, prefix: String, samples: List[String], field: String): Unit = {
val file = new File(prefix + field + ".tsv")
file.getParentFile.mkdirs()
val writer = new PrintWriter(file)
writer.println(samples.mkString(field + "\t", "\t", ""))
val keySet = (for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any, Int]()).keySet).fold(Set[Any]())(_ ++ _)
for (key <- keySet.toList.sortWith(sortAnyAny(_, _))) {
val values = for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any, Int]()).getOrElse(key, 0)
writer.println(values.mkString(key + "\t", "\t", ""))
}
writer.close()
plotLine(file)
}
/**
* Function to write 1 specific general field
* @param prefix
* @param data
* @return
*/
protected def writeField(prefix: String, data: mutable.Map[Any, Int]): File = {
val file = new File(commandArgs.outputDir + "/" + prefix + ".tsv")
println(file)
file.getParentFile.mkdirs()
val writer = new PrintWriter(file)
writer.println("\t" + prefix)
for (key <- data.keySet.toList.sortWith(sortAnyAny(_, _))) {
writer.println(key + "\t" + data(key))
}
writer.close()
file
}
/**
* Function to sort Any values
* @param a
* @param b
* @return
*/
def sortAnyAny(a: Any, b: Any): Boolean = {
a match {
case ai: Int => {
b match {
case bi: Int => ai < bi
case bi: Double => ai < bi
case _ => a.toString < b.toString
}
}
case _ => a.toString < b.toString
}
}
/**
* Function to write sample to sample compare tsv's / heatmaps
* @param stats
* @param function function to extract targeted var in SampleToSampleStats
* @param prefix
* @param samples
*/
def writeOverlap(stats: Stats, function: SampleToSampleStats => Int,
prefix: String, samples: List[String]): Unit = {
val absFile = new File(prefix + ".abs.tsv")
val relFile = new File(prefix + ".rel.tsv")
absFile.getParentFile.mkdirs()
val absWriter = new PrintWriter(absFile)
val relWriter = new PrintWriter(relFile)
absWriter.println(samples.mkString("\t", "\t", ""))
relWriter.println(samples.mkString("\t", "\t", ""))
for (sample1 <- samples) {
val values = for (sample2 <- samples) yield function(stats.samplesStats(sample1).sampleToSample(sample2))
absWriter.println(values.mkString(sample1 + "\t", "\t", ""))
val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
}
absWriter.close()
relWriter.close()
plotHeatmap(relFile)
}
/**
* Plots heatmaps on target tsv file
* @param file
*/
def plotHeatmap(file: File) {
executeRscript("plotHeatmap.R", Array(file.getAbsolutePath,
file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"))
}
/**
* Plots line graph with target tsv file
* @param file
*/
def plotLine(file: File) {
executeRscript("plotXY.R", Array(file.getAbsolutePath,
file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
}
/**
* Function to execute Rscript as subproces
* @param resource
* @param args
*/
def executeRscript(resource: String, args: Array[String]): Unit = {
val is = getClass.getResourceAsStream(resource)
val file = File.createTempFile("script.", "." + resource)
val os = new FileOutputStream(file)
org.apache.commons.io.IOUtils.copy(is, os)
os.close()
val command: String = "Rscript " + file + " " + args.mkString(" ")
logger.info("Starting: " + command)
val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
if (process.exitValue() == 0) logger.info("Done: " + command)
else {
logger.warn("Failed: " + command)
if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
}
}
}
package nl.lumc.sasc.biopet.tools
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.Test
import scala.collection.mutable
import VcfStats._
/**
* Created by pjvan_thof on 2/5/15.
*/
class VcfStatsTest extends TestNGSuite with Matchers {
@Test
def testSampleToSampleStats: Unit = {
val s1 = SampleToSampleStats()
val s2 = SampleToSampleStats()
s1.alleleOverlap shouldBe 0
s1.genotypeOverlap shouldBe 0
s2.alleleOverlap shouldBe 0
s2.genotypeOverlap shouldBe 0
s1 += s2
s1.alleleOverlap shouldBe 0
s1.genotypeOverlap shouldBe 0
s2.alleleOverlap shouldBe 0
s2.genotypeOverlap shouldBe 0
s2.alleleOverlap = 2
s2.genotypeOverlap = 3
s1 += s2
s1.alleleOverlap shouldBe 2
s1.genotypeOverlap shouldBe 3
s2.alleleOverlap shouldBe 2
s2.genotypeOverlap shouldBe 3
s1 += s2
s1.alleleOverlap shouldBe 4
s1.genotypeOverlap shouldBe 6
s2.alleleOverlap shouldBe 2
s2.genotypeOverlap shouldBe 3
}
@Test
def testSampleStats: Unit = {
val s1 = SampleStats()
val s2 = SampleStats()
s1.sampleToSample += "s1" -> SampleToSampleStats()
s1.sampleToSample += "s2" -> SampleToSampleStats()
s2.sampleToSample += "s1" -> SampleToSampleStats()
s2.sampleToSample += "s2" -> SampleToSampleStats()
s1.sampleToSample("s1").alleleOverlap = 1
s2.sampleToSample("s2").alleleOverlap = 2
s1.genotypeStats += "1" -> mutable.Map(1 -> 1)
s2.genotypeStats += "2" -> mutable.Map(2 -> 2)
val ss1 = SampleToSampleStats()
val ss2 = SampleToSampleStats()
s1 += s2
s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 1), "2" -> mutable.Map(2 -> 2))
ss1.alleleOverlap = 1
ss2.alleleOverlap = 2
s1.sampleToSample shouldBe mutable.Map("s1" -> ss1, "s2" -> ss2)
s1 += s2
s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 1), "2" -> mutable.Map(2 -> 4))
s1 += s1
s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 2), "2" -> mutable.Map(2 -> 8))
}
}
......@@ -35,6 +35,7 @@ object BiopetExecutablePublic extends BiopetExecutable {
nl.lumc.sasc.biopet.tools.CheckAllelesVcfInBam,
nl.lumc.sasc.biopet.tools.VcfToTsv,
nl.lumc.sasc.biopet.tools.VcfFilter,
nl.lumc.sasc.biopet.tools.VcfStats,
nl.lumc.sasc.biopet.tools.FindRepeatsPacBio,
nl.lumc.sasc.biopet.tools.BedToInterval,
nl.lumc.sasc.biopet.tools.MpileupToVcf,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment