CheckAllelesVcfInBam.scala 8.56 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
package nl.lumc.sasc.biopet.tools
Peter van 't Hof's avatar
Peter van 't Hof committed
17

Peter van 't Hof's avatar
Peter van 't Hof committed
18
import htsjdk.samtools.{ QueryInterval, SamReaderFactory, SAMRecord, SamReader }
Peter van 't Hof's avatar
Peter van 't Hof committed
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.variantcontext.VariantContextBuilder
import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder
import htsjdk.variant.vcf.VCFFileReader
import htsjdk.variant.vcf.VCFInfoHeaderLine
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.collection.JavaConversions._
import scala.collection.mutable
import htsjdk.variant.vcf.VCFHeaderLineType
import htsjdk.variant.vcf.VCFHeaderLineCount
import scala.math._

Peter van 't Hof's avatar
Peter van 't Hof committed
36
37
38
39
40
41
42
43
44
45
46
47
48
49
//class CheckAllelesVcfInBam(val root: Configurable) extends BiopetJavaCommandLineFunction {
//  javaMainClass = getClass.getName
//  
//  @Input(doc = "Input vcf file", shortName = "V", required = true)
//  var inputFile: File = _
//
//  @Input(doc = "Input bam file", shortName = "bam", required = true)
//  var bamFiles: List[File] = Nil
//  
//  @Output(doc = "Output vcf file", shortName = "o", required = true)
//  var output: File = _
//
//  override def commandLine = super.commandLine + required("-V", inputFile) + repeat("-bam", bamFiles) + required(output)
//}
Peter van 't Hof's avatar
Peter van 't Hof committed
50
51

object CheckAllelesVcfInBam extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
52
53
  case class Args(inputFile: File = null, outputFile: File = null, samples: List[String] = Nil,
                  bamFiles: List[File] = Nil, minMapQual: Int = 1) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
54
55

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
    opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputFile = x)
    }
    opt[File]('o', "outputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputFile = x)
    }
    opt[String]('s', "sample") unbounded () minOccurs (1) action { (x, c) =>
      c.copy(samples = x :: c.samples)
    }
    opt[File]('b', "bam") unbounded () minOccurs (1) action { (x, c) =>
      c.copy(bamFiles = x :: c.bamFiles)
    }
    opt[Int]('m', "min_mapping_quality") maxOccurs (1) action { (x, c) =>
      c.copy(minMapQual = c.minMapQual)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
71
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
72
73

  private class CountReport(
Peter van 't Hof's avatar
Peter van 't Hof committed
74
75
76
    var notFound: Int = 0,
    var aCounts: mutable.Map[String, Int] = mutable.Map(),
    var duplicateReads: Int = 0,
Peter van 't Hof's avatar
Peter van 't Hof committed
77
78
    var lowMapQualReads: Int = 0)

Peter van 't Hof's avatar
Peter van 't Hof committed
79
80
81
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
82

Peter van 't Hof's avatar
Peter van 't Hof committed
83
    if (commandArgs.bamFiles.size != commandArgs.samples.size)
Sander Bollen's avatar
Sander Bollen committed
84
      logger.warn("Number of samples is different from number of bam files: those left over will be removed")
Peter van 't Hof's avatar
Peter van 't Hof committed
85
86
    val samReaderFactory = SamReaderFactory.makeDefault
    val bamReaders: Map[String, SamReader] = Map(commandArgs.samples zip commandArgs.bamFiles.map(x => samReaderFactory.open(x)): _*)
Peter van 't Hof's avatar
Peter van 't Hof committed
87
    val bamHeaders = bamReaders.map(x => (x._1, x._2.getFileHeader))
Peter van 't Hof's avatar
Peter van 't Hof committed
88

Peter van 't Hof's avatar
Peter van 't Hof committed
89
    val reader = new VCFFileReader(commandArgs.inputFile, false)
Sander Bollen's avatar
Sander Bollen committed
90
91
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().setOutputFile(commandArgs.outputFile).
      setReferenceDictionary(reader.getFileHeader.getSequenceDictionary).build)
Peter van 't Hof's avatar
Peter van 't Hof committed
92

Peter van 't Hof's avatar
Peter van 't Hof committed
93
    val header = reader.getFileHeader
Peter van 't Hof's avatar
Peter van 't Hof committed
94
95
96
97
98
    for ((sample, _) <- bamReaders) {
      header.addMetaDataLine(new VCFInfoHeaderLine("BAM-AD-" + sample,
        VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allele depth, ref and alt on order of vcf file"))
      header.addMetaDataLine(new VCFInfoHeaderLine("BAM-DP-" + sample,
        1, VCFHeaderLineType.Integer, "Total reads on this location"))
Peter van 't Hof's avatar
Peter van 't Hof committed
99
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
100

Peter van 't Hof's avatar
Peter van 't Hof committed
101
    writer.writeHeader(header)
Peter van 't Hof's avatar
Peter van 't Hof committed
102

Peter van 't Hof's avatar
Peter van 't Hof committed
103
    for (vcfRecord <- reader) {
Peter van 't Hof's avatar
Peter van 't Hof committed
104
      val countReports: Map[String, CountReport] = bamReaders.map(x => (x._1, new CountReport))
Peter van 't Hof's avatar
Peter van 't Hof committed
105
106
      val refAllele = vcfRecord.getReference.getBaseString
      for ((sample, bamReader) <- bamReaders) {
Peter van 't Hof's avatar
Peter van 't Hof committed
107
108
        val queryInterval = new QueryInterval(bamHeaders(sample).getSequenceIndex(vcfRecord.getChr),
          vcfRecord.getStart, vcfRecord.getStart + refAllele.size - 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
109
        val bamIter = bamReader.query(Array(queryInterval), false)
Peter van 't Hof's avatar
Peter van 't Hof committed
110
111

        def filterRead(samRecord: SAMRecord): Boolean = {
Peter van 't Hof's avatar
Peter van 't Hof committed
112
113
114
115
116
117
118
119
120
121
122
123
          if (samRecord.getDuplicateReadFlag) {
            countReports(sample).duplicateReads += 1
            return true
          }
          if (samRecord.getSupplementaryAlignmentFlag) return true
          if (samRecord.getNotPrimaryAlignmentFlag) return true
          if (samRecord.getMappingQuality < commandArgs.minMapQual) {
            countReports(sample).lowMapQualReads += 1
            return true
          }
          return false
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
124

Peter van 't Hof's avatar
Peter van 't Hof committed
125
126
        val counts = for (samRecord <- bamIter if !filterRead(samRecord)) {
          checkAlles(samRecord, vcfRecord) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
127
128
            case Some(a) => if (countReports(sample).aCounts.contains(a)) countReports(sample).aCounts(a) += 1
            else countReports(sample).aCounts += (a -> 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
129
130
            case _ => countReports(sample).notFound += 1
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
131
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
132
133
        bamIter.close
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
134

Peter van 't Hof's avatar
Peter van 't Hof committed
135
      val builder = new VariantContextBuilder(vcfRecord)
Peter van 't Hof's avatar
Peter van 't Hof committed
136
      for ((k, v) <- countReports) {
Peter van 't Hof's avatar
Peter van 't Hof committed
137
138
139
140
141
142
        val s = for (allele <- vcfRecord.getAlleles) yield {
          val s = allele.getBaseString
          if (v.aCounts.contains(s)) v.aCounts(s)
          else 0
        }
        builder.attribute("BAM-AD-" + k, s.mkString(","))
Peter van 't Hof's avatar
Peter van 't Hof committed
143
        builder.attribute("BAM-DP-" + k, (0 /: s)(_ + _) + v.notFound)
Peter van 't Hof's avatar
Peter van 't Hof committed
144
145
146
147
148
149
150
      }
      writer.add(builder.make)
    }
    for ((_, r) <- bamReaders) r.close
    reader.close
    writer.close
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
151
152

  def checkAlles(samRecord: SAMRecord, vcfRecord: VariantContext): Option[String] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
153
    val readStartPos = List.range(0, samRecord.getReadBases.length)
Peter van 't Hof's avatar
Peter van 't Hof committed
154
      .find(x => samRecord.getReferencePositionAtReadPosition(x + 1) == vcfRecord.getStart) getOrElse { return None }
Peter van 't Hof's avatar
Peter van 't Hof committed
155
156
157
158
159
    val readBases = samRecord.getReadBases()
    val alleles = vcfRecord.getAlleles.map(x => x.getBaseString)
    val refAllele = alleles.head
    var maxSize = 1
    for (allele <- alleles if allele.size > maxSize) maxSize = allele.size
Peter van 't Hof's avatar
Peter van 't Hof committed
160
161
162
    val readC = for (t <- readStartPos until readStartPos + maxSize if t < readBases.length) yield readBases(t).toChar
    val allelesInRead = mutable.Set(alleles.filter(readC.mkString.startsWith(_)): _*)

Peter van 't Hof's avatar
Peter van 't Hof committed
163
164
165
166
167
    // Removal of insertions that are not really in the cigarstring
    for (allele <- allelesInRead if allele.size > refAllele.size) {
      val refPos = for (t <- refAllele.size until allele.size) yield samRecord.getReferencePositionAtReadPosition(readStartPos + t + 1)
      if (refPos.exists(_ > 0)) allelesInRead -= allele
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
168

Peter van 't Hof's avatar
Peter van 't Hof committed
169
    // Removal of alleles that are not really in the cigarstring
Peter van 't Hof's avatar
Peter van 't Hof committed
170
    for (allele <- allelesInRead) {
Peter van 't Hof's avatar
Peter van 't Hof committed
171
172
      val readPosAfterAllele = samRecord.getReferencePositionAtReadPosition(readStartPos + allele.size + 1)
      val vcfPosAfterAllele = vcfRecord.getStart + refAllele.size
Peter van 't Hof's avatar
Peter van 't Hof committed
173
174
      if (readPosAfterAllele != vcfPosAfterAllele &&
        (refAllele.size != allele.size || (refAllele.size == allele.size && readPosAfterAllele < 0))) allelesInRead -= allele
Peter van 't Hof's avatar
Peter van 't Hof committed
175
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
176

Peter van 't Hof's avatar
Peter van 't Hof committed
177
178
179
180
    for (allele <- allelesInRead if allele.size >= refAllele.size) {
      if (allelesInRead.exists(_.size > allele.size)) allelesInRead -= allele
    }
    if (allelesInRead.contains(refAllele) && allelesInRead.exists(_.size < refAllele.size)) allelesInRead -= refAllele
Peter van 't Hof's avatar
Peter van 't Hof committed
181
182
183
184
    if (allelesInRead.isEmpty) return None
    else if (allelesInRead.size == 1) return Some(allelesInRead.head)
    else {
      logger.warn("vcfRecord: " + vcfRecord)
Peter van 't Hof's avatar
Peter van 't Hof committed
185
      logger.warn("samRecord: " + samRecord.getSAMString)
Peter van 't Hof's avatar
Peter van 't Hof committed
186
      logger.warn("Found multiple options: " + allelesInRead.toString)
Peter van 't Hof's avatar
Peter van 't Hof committed
187
      logger.warn("ReadStartPos: " + readStartPos + "  Read Length: " + samRecord.getReadLength)
Peter van 't Hof's avatar
Peter van 't Hof committed
188
189
190
191
192
      logger.warn("Read skipped, please report this")
      return None
    }
  }
}