CheckAllelesVcfInBam.scala 8.27 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
package nl.lumc.sasc.biopet.tools
Peter van 't Hof's avatar
Peter van 't Hof committed
17 18

import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
19 20 21 22 23

import htsjdk.samtools.{ QueryInterval, SAMRecord, SamReader, SamReaderFactory }
import htsjdk.variant.variantcontext.{ VariantContext, VariantContextBuilder }
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.vcf.{ VCFFileReader, VCFHeaderLineCount, VCFHeaderLineType, VCFInfoHeaderLine }
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.utils.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
25

Peter van 't Hof's avatar
Peter van 't Hof committed
26 27 28
import scala.collection.JavaConversions._
import scala.collection.mutable

Peter van 't Hof's avatar
Peter van 't Hof committed
29 30 31 32 33 34 35 36 37 38 39 40 41 42
//class CheckAllelesVcfInBam(val root: Configurable) extends BiopetJavaCommandLineFunction {
//  javaMainClass = getClass.getName
//  
//  @Input(doc = "Input vcf file", shortName = "V", required = true)
//  var inputFile: File = _
//
//  @Input(doc = "Input bam file", shortName = "bam", required = true)
//  var bamFiles: List[File] = Nil
//  
//  @Output(doc = "Output vcf file", shortName = "o", required = true)
//  var output: File = _
//
//  override def commandLine = super.commandLine + required("-V", inputFile) + repeat("-bam", bamFiles) + required(output)
//}
Peter van 't Hof's avatar
Peter van 't Hof committed
43 44

object CheckAllelesVcfInBam extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
45 46
  case class Args(inputFile: File = null, outputFile: File = null, samples: List[String] = Nil,
                  bamFiles: List[File] = Nil, minMapQual: Int = 1) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
47 48

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
49
    opt[File]('I', "inputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
50 51
      c.copy(inputFile = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
52
    opt[File]('o', "outputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
53 54
      c.copy(outputFile = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
55
    opt[String]('s', "sample") unbounded () minOccurs 1 action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
56 57
      c.copy(samples = x :: c.samples)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
58
    opt[File]('b', "bam") unbounded () minOccurs 1 action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
59 60
      c.copy(bamFiles = x :: c.bamFiles)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
61
    opt[Int]('m', "min_mapping_quality") maxOccurs 1 action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
62 63
      c.copy(minMapQual = c.minMapQual)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
64
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
65 66

  private class CountReport(
Peter van 't Hof's avatar
Peter van 't Hof committed
67 68 69
    var notFound: Int = 0,
    var aCounts: mutable.Map[String, Int] = mutable.Map(),
    var duplicateReads: Int = 0,
Peter van 't Hof's avatar
Peter van 't Hof committed
70 71
    var lowMapQualReads: Int = 0)

Peter van 't Hof's avatar
Peter van 't Hof committed
72 73
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
74
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
Peter van 't Hof's avatar
Peter van 't Hof committed
75

Peter van 't Hof's avatar
Peter van 't Hof committed
76
    if (commandArgs.bamFiles.size != commandArgs.samples.size)
77
      logger.warn("Number of samples is different from number of bam files: additional samples or bam files will not be used")
Peter van 't Hof's avatar
Peter van 't Hof committed
78 79
    val samReaderFactory = SamReaderFactory.makeDefault
    val bamReaders: Map[String, SamReader] = Map(commandArgs.samples zip commandArgs.bamFiles.map(x => samReaderFactory.open(x)): _*)
Peter van 't Hof's avatar
Peter van 't Hof committed
80
    val bamHeaders = bamReaders.map(x => (x._1, x._2.getFileHeader))
Peter van 't Hof's avatar
Peter van 't Hof committed
81

Peter van 't Hof's avatar
Peter van 't Hof committed
82
    val reader = new VCFFileReader(commandArgs.inputFile, false)
Sander Bollen's avatar
Sander Bollen committed
83 84
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().setOutputFile(commandArgs.outputFile).
      setReferenceDictionary(reader.getFileHeader.getSequenceDictionary).build)
Peter van 't Hof's avatar
Peter van 't Hof committed
85

Peter van 't Hof's avatar
Peter van 't Hof committed
86
    val header = reader.getFileHeader
Peter van 't Hof's avatar
Peter van 't Hof committed
87 88 89 90 91
    for ((sample, _) <- bamReaders) {
      header.addMetaDataLine(new VCFInfoHeaderLine("BAM-AD-" + sample,
        VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allele depth, ref and alt on order of vcf file"))
      header.addMetaDataLine(new VCFInfoHeaderLine("BAM-DP-" + sample,
        1, VCFHeaderLineType.Integer, "Total reads on this location"))
Peter van 't Hof's avatar
Peter van 't Hof committed
92
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
93

Peter van 't Hof's avatar
Peter van 't Hof committed
94
    writer.writeHeader(header)
Peter van 't Hof's avatar
Peter van 't Hof committed
95

Peter van 't Hof's avatar
Peter van 't Hof committed
96
    for (vcfRecord <- reader) {
Peter van 't Hof's avatar
Peter van 't Hof committed
97
      val countReports: Map[String, CountReport] = bamReaders.map(x => (x._1, new CountReport))
Peter van 't Hof's avatar
Peter van 't Hof committed
98 99
      val refAllele = vcfRecord.getReference.getBaseString
      for ((sample, bamReader) <- bamReaders) {
100
        val queryInterval = new QueryInterval(bamHeaders(sample).getSequenceIndex(vcfRecord.getContig),
Peter van 't Hof's avatar
Peter van 't Hof committed
101
          vcfRecord.getStart, vcfRecord.getStart + refAllele.length - 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
102
        val bamIter = bamReader.query(Array(queryInterval), false)
Peter van 't Hof's avatar
Peter van 't Hof committed
103 104

        def filterRead(samRecord: SAMRecord): Boolean = {
Peter van 't Hof's avatar
Peter van 't Hof committed
105 106 107 108 109 110 111 112 113 114
          if (samRecord.getDuplicateReadFlag) {
            countReports(sample).duplicateReads += 1
            return true
          }
          if (samRecord.getSupplementaryAlignmentFlag) return true
          if (samRecord.getNotPrimaryAlignmentFlag) return true
          if (samRecord.getMappingQuality < commandArgs.minMapQual) {
            countReports(sample).lowMapQualReads += 1
            return true
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
115
          false
Peter van 't Hof's avatar
Peter van 't Hof committed
116
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
117

Peter van 't Hof's avatar
Peter van 't Hof committed
118
        val counts = for (samRecord <- bamIter if !filterRead(samRecord)) {
119
          checkAlleles(samRecord, vcfRecord) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
120 121
            case Some(a) => if (countReports(sample).aCounts.contains(a)) countReports(sample).aCounts(a) += 1
            else countReports(sample).aCounts += (a -> 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
122 123
            case _ => countReports(sample).notFound += 1
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
124
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
125
        bamIter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
126
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
127

Peter van 't Hof's avatar
Peter van 't Hof committed
128
      val builder = new VariantContextBuilder(vcfRecord)
Peter van 't Hof's avatar
Peter van 't Hof committed
129
      for ((k, v) <- countReports) {
Peter van 't Hof's avatar
Peter van 't Hof committed
130 131 132 133 134 135
        val s = for (allele <- vcfRecord.getAlleles) yield {
          val s = allele.getBaseString
          if (v.aCounts.contains(s)) v.aCounts(s)
          else 0
        }
        builder.attribute("BAM-AD-" + k, s.mkString(","))
Peter van 't Hof's avatar
Peter van 't Hof committed
136
        builder.attribute("BAM-DP-" + k, (0 /: s)(_ + _) + v.notFound)
Peter van 't Hof's avatar
Peter van 't Hof committed
137 138 139
      }
      writer.add(builder.make)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
140 141 142
    for ((_, r) <- bamReaders) r.close()
    reader.close()
    writer.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
143
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
144

145
  def checkAlleles(samRecord: SAMRecord, vcfRecord: VariantContext): Option[String] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
146
    val readStartPos = List.range(0, samRecord.getReadBases.length)
Peter van 't Hof's avatar
Peter van 't Hof committed
147
      .find(x => samRecord.getReferencePositionAtReadPosition(x + 1) == vcfRecord.getStart) getOrElse { return None }
Peter van 't Hof's avatar
Peter van 't Hof committed
148
    val readBases = samRecord.getReadBases
Peter van 't Hof's avatar
Peter van 't Hof committed
149 150 151
    val alleles = vcfRecord.getAlleles.map(x => x.getBaseString)
    val refAllele = alleles.head
    var maxSize = 1
Peter van 't Hof's avatar
Peter van 't Hof committed
152
    for (allele <- alleles if allele.length > maxSize) maxSize = allele.length
Peter van 't Hof's avatar
Peter van 't Hof committed
153
    val readC = for (t <- readStartPos until readStartPos + maxSize if t < readBases.length) yield readBases(t).toChar
Peter van 't Hof's avatar
Peter van 't Hof committed
154
    val allelesInRead = mutable.Set(alleles.filter(readC.mkString.startsWith): _*)
Peter van 't Hof's avatar
Peter van 't Hof committed
155

Peter van 't Hof's avatar
Peter van 't Hof committed
156
    // Removal of insertions that are not really in the cigarstring
Peter van 't Hof's avatar
Peter van 't Hof committed
157 158
    for (allele <- allelesInRead if allele.length > refAllele.length) {
      val refPos = for (t <- refAllele.length until allele.length) yield samRecord.getReferencePositionAtReadPosition(readStartPos + t + 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
159 160
      if (refPos.exists(_ > 0)) allelesInRead -= allele
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
161

Peter van 't Hof's avatar
Peter van 't Hof committed
162
    // Removal of alleles that are not really in the cigarstring
Peter van 't Hof's avatar
Peter van 't Hof committed
163
    for (allele <- allelesInRead) {
Peter van 't Hof's avatar
Peter van 't Hof committed
164 165
      val readPosAfterAllele = samRecord.getReferencePositionAtReadPosition(readStartPos + allele.length + 1)
      val vcfPosAfterAllele = vcfRecord.getStart + refAllele.length
Peter van 't Hof's avatar
Peter van 't Hof committed
166
      if (readPosAfterAllele != vcfPosAfterAllele &&
Peter van 't Hof's avatar
Peter van 't Hof committed
167
        (refAllele.length != allele.length || (refAllele.length == allele.length && readPosAfterAllele < 0))) allelesInRead -= allele
Peter van 't Hof's avatar
Peter van 't Hof committed
168
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
169

Peter van 't Hof's avatar
Peter van 't Hof committed
170
    for (allele <- allelesInRead if allele.length >= refAllele.length) {
Peter van 't Hof's avatar
Peter van 't Hof committed
171
      if (allelesInRead.exists(_.length > allele.length)) allelesInRead -= allele
Peter van 't Hof's avatar
Peter van 't Hof committed
172
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
173
    if (allelesInRead.contains(refAllele) && allelesInRead.exists(_.length < refAllele.length)) allelesInRead -= refAllele
Peter van 't Hof's avatar
Peter van 't Hof committed
174 175
    if (allelesInRead.isEmpty) None
    else if (allelesInRead.size == 1) Some(allelesInRead.head)
Peter van 't Hof's avatar
Peter van 't Hof committed
176 177
    else {
      logger.warn("vcfRecord: " + vcfRecord)
Peter van 't Hof's avatar
Peter van 't Hof committed
178
      logger.warn("samRecord: " + samRecord.getSAMString)
Peter van 't Hof's avatar
Peter van 't Hof committed
179
      logger.warn("Found multiple options: " + allelesInRead.toString)
Peter van 't Hof's avatar
Peter van 't Hof committed
180
      logger.warn("ReadStartPos: " + readStartPos + "  Read Length: " + samRecord.getReadLength)
Peter van 't Hof's avatar
Peter van 't Hof committed
181
      logger.warn("Read skipped, please report this")
Peter van 't Hof's avatar
Peter van 't Hof committed
182
      None
Peter van 't Hof's avatar
Peter van 't Hof committed
183 184 185
    }
  }
}