CheckAllelesVcfInBam.scala 8.24 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
package nl.lumc.sasc.biopet.tools
Peter van 't Hof's avatar
Peter van 't Hof committed
17
18

import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
19
20
21
22
23

import htsjdk.samtools.{ QueryInterval, SAMRecord, SamReader, SamReaderFactory }
import htsjdk.variant.variantcontext.{ VariantContext, VariantContextBuilder }
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.vcf.{ VCFFileReader, VCFHeaderLineCount, VCFHeaderLineType, VCFInfoHeaderLine }
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.core.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
25

Peter van 't Hof's avatar
Peter van 't Hof committed
26
27
28
import scala.collection.JavaConversions._
import scala.collection.mutable

Peter van 't Hof's avatar
Peter van 't Hof committed
29
30
31
32
33
34
35
36
37
38
39
40
41
42
//class CheckAllelesVcfInBam(val root: Configurable) extends BiopetJavaCommandLineFunction {
//  javaMainClass = getClass.getName
//  
//  @Input(doc = "Input vcf file", shortName = "V", required = true)
//  var inputFile: File = _
//
//  @Input(doc = "Input bam file", shortName = "bam", required = true)
//  var bamFiles: List[File] = Nil
//  
//  @Output(doc = "Output vcf file", shortName = "o", required = true)
//  var output: File = _
//
//  override def commandLine = super.commandLine + required("-V", inputFile) + repeat("-bam", bamFiles) + required(output)
//}
Peter van 't Hof's avatar
Peter van 't Hof committed
43
44

object CheckAllelesVcfInBam extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
45
46
  case class Args(inputFile: File = null, outputFile: File = null, samples: List[String] = Nil,
                  bamFiles: List[File] = Nil, minMapQual: Int = 1) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
47
48

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
    opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputFile = x)
    }
    opt[File]('o', "outputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputFile = x)
    }
    opt[String]('s', "sample") unbounded () minOccurs (1) action { (x, c) =>
      c.copy(samples = x :: c.samples)
    }
    opt[File]('b', "bam") unbounded () minOccurs (1) action { (x, c) =>
      c.copy(bamFiles = x :: c.bamFiles)
    }
    opt[Int]('m', "min_mapping_quality") maxOccurs (1) action { (x, c) =>
      c.copy(minMapQual = c.minMapQual)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
64
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
65
66

  private class CountReport(
Peter van 't Hof's avatar
Peter van 't Hof committed
67
68
69
    var notFound: Int = 0,
    var aCounts: mutable.Map[String, Int] = mutable.Map(),
    var duplicateReads: Int = 0,
Peter van 't Hof's avatar
Peter van 't Hof committed
70
71
    var lowMapQualReads: Int = 0)

Peter van 't Hof's avatar
Peter van 't Hof committed
72
73
74
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
75

Peter van 't Hof's avatar
Peter van 't Hof committed
76
    if (commandArgs.bamFiles.size != commandArgs.samples.size)
77
      logger.warn("Number of samples is different from number of bam files: additional samples or bam files will not be used")
Peter van 't Hof's avatar
Peter van 't Hof committed
78
79
    val samReaderFactory = SamReaderFactory.makeDefault
    val bamReaders: Map[String, SamReader] = Map(commandArgs.samples zip commandArgs.bamFiles.map(x => samReaderFactory.open(x)): _*)
Peter van 't Hof's avatar
Peter van 't Hof committed
80
    val bamHeaders = bamReaders.map(x => (x._1, x._2.getFileHeader))
Peter van 't Hof's avatar
Peter van 't Hof committed
81

Peter van 't Hof's avatar
Peter van 't Hof committed
82
    val reader = new VCFFileReader(commandArgs.inputFile, false)
Sander Bollen's avatar
Sander Bollen committed
83
84
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().setOutputFile(commandArgs.outputFile).
      setReferenceDictionary(reader.getFileHeader.getSequenceDictionary).build)
Peter van 't Hof's avatar
Peter van 't Hof committed
85

Peter van 't Hof's avatar
Peter van 't Hof committed
86
    val header = reader.getFileHeader
Peter van 't Hof's avatar
Peter van 't Hof committed
87
88
89
90
91
    for ((sample, _) <- bamReaders) {
      header.addMetaDataLine(new VCFInfoHeaderLine("BAM-AD-" + sample,
        VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allele depth, ref and alt on order of vcf file"))
      header.addMetaDataLine(new VCFInfoHeaderLine("BAM-DP-" + sample,
        1, VCFHeaderLineType.Integer, "Total reads on this location"))
Peter van 't Hof's avatar
Peter van 't Hof committed
92
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
93

Peter van 't Hof's avatar
Peter van 't Hof committed
94
    writer.writeHeader(header)
Peter van 't Hof's avatar
Peter van 't Hof committed
95

Peter van 't Hof's avatar
Peter van 't Hof committed
96
    for (vcfRecord <- reader) {
Peter van 't Hof's avatar
Peter van 't Hof committed
97
      val countReports: Map[String, CountReport] = bamReaders.map(x => (x._1, new CountReport))
Peter van 't Hof's avatar
Peter van 't Hof committed
98
99
      val refAllele = vcfRecord.getReference.getBaseString
      for ((sample, bamReader) <- bamReaders) {
Peter van 't Hof's avatar
Peter van 't Hof committed
100
101
        val queryInterval = new QueryInterval(bamHeaders(sample).getSequenceIndex(vcfRecord.getChr),
          vcfRecord.getStart, vcfRecord.getStart + refAllele.size - 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
102
        val bamIter = bamReader.query(Array(queryInterval), false)
Peter van 't Hof's avatar
Peter van 't Hof committed
103
104

        def filterRead(samRecord: SAMRecord): Boolean = {
Peter van 't Hof's avatar
Peter van 't Hof committed
105
106
107
108
109
110
111
112
113
114
115
116
          if (samRecord.getDuplicateReadFlag) {
            countReports(sample).duplicateReads += 1
            return true
          }
          if (samRecord.getSupplementaryAlignmentFlag) return true
          if (samRecord.getNotPrimaryAlignmentFlag) return true
          if (samRecord.getMappingQuality < commandArgs.minMapQual) {
            countReports(sample).lowMapQualReads += 1
            return true
          }
          return false
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
117

Peter van 't Hof's avatar
Peter van 't Hof committed
118
119
        val counts = for (samRecord <- bamIter if !filterRead(samRecord)) {
          checkAlles(samRecord, vcfRecord) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
120
121
            case Some(a) => if (countReports(sample).aCounts.contains(a)) countReports(sample).aCounts(a) += 1
            else countReports(sample).aCounts += (a -> 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
122
123
            case _ => countReports(sample).notFound += 1
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
124
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
125
126
        bamIter.close
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
127

Peter van 't Hof's avatar
Peter van 't Hof committed
128
      val builder = new VariantContextBuilder(vcfRecord)
Peter van 't Hof's avatar
Peter van 't Hof committed
129
      for ((k, v) <- countReports) {
Peter van 't Hof's avatar
Peter van 't Hof committed
130
131
132
133
134
135
        val s = for (allele <- vcfRecord.getAlleles) yield {
          val s = allele.getBaseString
          if (v.aCounts.contains(s)) v.aCounts(s)
          else 0
        }
        builder.attribute("BAM-AD-" + k, s.mkString(","))
Peter van 't Hof's avatar
Peter van 't Hof committed
136
        builder.attribute("BAM-DP-" + k, (0 /: s)(_ + _) + v.notFound)
Peter van 't Hof's avatar
Peter van 't Hof committed
137
138
139
140
141
142
143
      }
      writer.add(builder.make)
    }
    for ((_, r) <- bamReaders) r.close
    reader.close
    writer.close
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
144
145

  def checkAlles(samRecord: SAMRecord, vcfRecord: VariantContext): Option[String] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
146
    val readStartPos = List.range(0, samRecord.getReadBases.length)
Peter van 't Hof's avatar
Peter van 't Hof committed
147
      .find(x => samRecord.getReferencePositionAtReadPosition(x + 1) == vcfRecord.getStart) getOrElse { return None }
Peter van 't Hof's avatar
Peter van 't Hof committed
148
149
150
151
152
    val readBases = samRecord.getReadBases()
    val alleles = vcfRecord.getAlleles.map(x => x.getBaseString)
    val refAllele = alleles.head
    var maxSize = 1
    for (allele <- alleles if allele.size > maxSize) maxSize = allele.size
Peter van 't Hof's avatar
Peter van 't Hof committed
153
154
155
    val readC = for (t <- readStartPos until readStartPos + maxSize if t < readBases.length) yield readBases(t).toChar
    val allelesInRead = mutable.Set(alleles.filter(readC.mkString.startsWith(_)): _*)

Peter van 't Hof's avatar
Peter van 't Hof committed
156
157
158
159
160
    // Removal of insertions that are not really in the cigarstring
    for (allele <- allelesInRead if allele.size > refAllele.size) {
      val refPos = for (t <- refAllele.size until allele.size) yield samRecord.getReferencePositionAtReadPosition(readStartPos + t + 1)
      if (refPos.exists(_ > 0)) allelesInRead -= allele
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
161

Peter van 't Hof's avatar
Peter van 't Hof committed
162
    // Removal of alleles that are not really in the cigarstring
Peter van 't Hof's avatar
Peter van 't Hof committed
163
    for (allele <- allelesInRead) {
Peter van 't Hof's avatar
Peter van 't Hof committed
164
165
      val readPosAfterAllele = samRecord.getReferencePositionAtReadPosition(readStartPos + allele.size + 1)
      val vcfPosAfterAllele = vcfRecord.getStart + refAllele.size
Peter van 't Hof's avatar
Peter van 't Hof committed
166
167
      if (readPosAfterAllele != vcfPosAfterAllele &&
        (refAllele.size != allele.size || (refAllele.size == allele.size && readPosAfterAllele < 0))) allelesInRead -= allele
Peter van 't Hof's avatar
Peter van 't Hof committed
168
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
169

Peter van 't Hof's avatar
Peter van 't Hof committed
170
171
172
173
    for (allele <- allelesInRead if allele.size >= refAllele.size) {
      if (allelesInRead.exists(_.size > allele.size)) allelesInRead -= allele
    }
    if (allelesInRead.contains(refAllele) && allelesInRead.exists(_.size < refAllele.size)) allelesInRead -= refAllele
Peter van 't Hof's avatar
Peter van 't Hof committed
174
175
176
177
    if (allelesInRead.isEmpty) return None
    else if (allelesInRead.size == 1) return Some(allelesInRead.head)
    else {
      logger.warn("vcfRecord: " + vcfRecord)
Peter van 't Hof's avatar
Peter van 't Hof committed
178
      logger.warn("samRecord: " + samRecord.getSAMString)
Peter van 't Hof's avatar
Peter van 't Hof committed
179
      logger.warn("Found multiple options: " + allelesInRead.toString)
Peter van 't Hof's avatar
Peter van 't Hof committed
180
      logger.warn("ReadStartPos: " + readStartPos + "  Read Length: " + samRecord.getReadLength)
Peter van 't Hof's avatar
Peter van 't Hof committed
181
182
183
184
185
      logger.warn("Read skipped, please report this")
      return None
    }
  }
}