Commit 0eb984b0 authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Added consensus builder

parent e3065aec
package nl.lumc.sasc.biopet.tools
import htsjdk.samtools.SamReaderFactory
import htsjdk.samtools.reference.FastaSequenceFile
import htsjdk.samtools.reference.IndexedFastaSequenceFile
import htsjdk.samtools.reference.ReferenceSequenceFileWalker
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.vcf.VCFFileReader
import java.io.File
......@@ -49,11 +53,14 @@ class BastyGenerateFasta(val root: Configurable) extends BiopetJavaCommandLineFu
object BastyGenerateFasta extends ToolCommand {
case class Args(inputVcf: File = null,
outputVariants: File = null,
outputConsensus: File = null,
outputConsensusVariants: File = null,
bamFile: File = null,
snpsOnly: Boolean = false,
sampleName: String = null,
minAD: Int = 8,
reference: Boolean = false) extends AbstractArgs
minDepth: Int = 8,
reference: File = null) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('V', "inputVcf") unbounded () valueName ("<file>") action { (x, c) =>
......@@ -65,6 +72,12 @@ object BastyGenerateFasta extends ToolCommand {
opt[File]("outputVariants") unbounded () valueName ("<file>") action { (x, c) =>
c.copy(outputVariants = x)
}
opt[File]("outputConsensus") unbounded () valueName ("<file>") action { (x, c) =>
c.copy(outputConsensus = x)
}
opt[File]("outputConsensusVariants") unbounded () valueName ("<file>") action { (x, c) =>
c.copy(outputConsensusVariants = x)
}
opt[Unit]("snpsOnly") unbounded () action { (x, c) =>
c.copy(snpsOnly = true)
}
......@@ -74,50 +87,122 @@ object BastyGenerateFasta extends ToolCommand {
opt[Int]("minAD") unbounded () action { (x, c) =>
c.copy(minAD = x)
}
opt[Unit]("reference") unbounded () action { (x, c) =>
c.copy(reference = true)
opt[Int]("minDepth") unbounded () action { (x, c) =>
c.copy(minDepth = x)
}
opt[File]("reference") unbounded () action { (x, c) =>
c.copy(reference = x)
}
}
var commandArgs: Args = _
protected var cmdArgs: Args = _
private val chunkSize = 100000
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
val argsParser = new OptParser
commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
cmdArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
if (cmdArgs.outputVariants != null) writeVariantsOnly()
if (cmdArgs.outputConsensus != null || cmdArgs.outputConsensusVariants != null) writeConsensus()
}
if (commandArgs.outputVariants != null) writeVariantsOnly()
protected def writeConsensus() {
if (cmdArgs.reference == null)
throw new IllegalStateException("No reference suplied")
if (cmdArgs.outputConsensusVariants != null) {
if (cmdArgs.inputVcf == null)
throw new IllegalStateException("To write outputVariants input vcf is required, please use --inputVcf option")
if (!cmdArgs.inputVcf.exists)
throw new IllegalStateException("File does not exist: " + cmdArgs.inputVcf)
}
if (cmdArgs.sampleName == null) {
if (cmdArgs.bamFile == null)
throw new IllegalStateException("To write Consensus input bam file is required, please use --bamFile option")
if (!cmdArgs.bamFile.exists)
throw new IllegalStateException("File does not exist: " + cmdArgs.bamFile)
}
logger.info(cmdArgs.reference)
val referenceFile = new IndexedFastaSequenceFile(cmdArgs.reference)
val referenceDict = referenceFile.getSequenceDictionary
for (chr <- referenceDict.getSequences) {
val chunks = (for (chunk <- (0 to (chr.getSequenceLength / chunkSize)).par) yield {
val chrName = chr.getSequenceName
val begin = chunk * chunkSize + 1
val end = {
val e = (chunk + 1) * chunkSize
if (e > chr.getSequenceLength) chr.getSequenceLength else e
}
logger.info("begin on: chrName: " + chrName + " begin: " + begin + " end: " + end)
val referenceSequence = referenceFile.getSubsequenceAt(chrName, begin, end)
val variants: Map[(Int, Int), VariantContext] = if (cmdArgs.inputVcf != null) {
val reader = new VCFFileReader(cmdArgs.inputVcf, true)
(for (variant <- reader.query(chrName, begin, end)) yield {
(variant.getStart, variant.getEnd) -> variant
}).toMap
} else Map()
val coverage: Array[Int] = Array.fill(end - begin + 1)(0)
if (cmdArgs.bamFile != null) {
val inputSam = SamReaderFactory.makeDefault.open(cmdArgs.bamFile)
for (r <- inputSam.query(chr.getSequenceName, begin, end, false)) {
val s = if (r.getAlignmentStart < begin) begin else r.getAlignmentStart
val e = if (r.getAlignmentEnd > end) end else r.getAlignmentEnd
for (t <- s to e) coverage(t - begin) += 1
}
}
val consensus = for (t <- 0 until coverage.length) yield {
if (coverage(t) >= cmdArgs.minDepth) referenceSequence.getBases()(t).toChar
else 'N'
}
(chunk -> consensus)
}).toMap
val writer = new PrintWriter(cmdArgs.outputConsensus)
writer.println(">" + cmdArgs.sampleName)
for (c <- chunks.keySet.toList.sortWith(_ < _)) {
writer.print(chunks(c).map(_.toChar).mkString)
}
writer.close
}
}
def writeVariantsOnly() {
if (commandArgs.inputVcf == null) throw new IllegalStateException("To write outputVariants input vcf is required, please use --outputVariants option")
if (!commandArgs.inputVcf.exists) throw new IllegalStateException("File does not exist: " + commandArgs.inputVcf)
val writer = new PrintWriter(commandArgs.outputVariants)
writer.println(">" + commandArgs.sampleName)
val vcfReader = new VCFFileReader(commandArgs.inputVcf, false)
for (vcfRecord <- vcfReader if (!commandArgs.snpsOnly || vcfRecord.isSNP)) yield {
writer.print(getMaxAllele(vcfRecord, commandArgs.sampleName))
protected def writeVariantsOnly() {
if (cmdArgs.inputVcf == null)
throw new IllegalStateException("To write outputVariants input vcf is required, please use --inputVcf option")
if (!cmdArgs.inputVcf.exists)
throw new IllegalStateException("File does not exist: " + cmdArgs.inputVcf)
val writer = new PrintWriter(cmdArgs.outputVariants)
writer.println(">" + cmdArgs.sampleName)
val vcfReader = new VCFFileReader(cmdArgs.inputVcf, false)
for (vcfRecord <- vcfReader if (!cmdArgs.snpsOnly || vcfRecord.isSNP)) yield {
writer.print(getMaxAllele(vcfRecord))
}
writer.println()
writer.close
vcfReader.close
}
def getMaxAllele(vcfRecord: VariantContext, sample: String): String = {
val genotype = vcfRecord.getGenotype(sample)
protected def getMaxAllele(vcfRecord: VariantContext): String = {
val maxSize = getLongestAllele(vcfRecord).getBases.length
def fill(bases: String) = bases + (Array.fill[Char](maxSize - bases.size)('N')).mkString
if (commandArgs.reference) return fill(vcfRecord.getReference.getBaseString)
if (cmdArgs.sampleName == null) return fillAllele(vcfRecord.getReference.getBaseString, maxSize)
if (genotype == null) return fill("")
val genotype = vcfRecord.getGenotype(cmdArgs.sampleName)
if (genotype == null) return fillAllele("", maxSize)
val AD = genotype.getAD
if (AD == null) return fill("")
if (AD == null) return fillAllele("", maxSize)
val maxADid = AD.zipWithIndex.maxBy(_._1)._2
if (AD(maxADid) < commandArgs.minAD) return fill("")
return fill(vcfRecord.getAlleles()(maxADid).getBaseString)
if (AD(maxADid) < cmdArgs.minAD) return fillAllele("", maxSize)
return fillAllele(vcfRecord.getAlleles()(maxADid).getBaseString, maxSize)
}
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment