CheckAllelesVcfInBam.scala 8.48 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
package nl.lumc.sasc.biopet.tools
Peter van 't Hof's avatar
Peter van 't Hof committed
17

Peter van 't Hof's avatar
Peter van 't Hof committed
18
import htsjdk.samtools.{ QueryInterval, SamReaderFactory, SAMRecord, SamReader }
Peter van 't Hof's avatar
Peter van 't Hof committed
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.variantcontext.VariantContextBuilder
import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder
import htsjdk.variant.vcf.VCFFileReader
import htsjdk.variant.vcf.VCFInfoHeaderLine
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.collection.JavaConversions._
import scala.collection.mutable
import htsjdk.variant.vcf.VCFHeaderLineType
import htsjdk.variant.vcf.VCFHeaderLineCount
import scala.math._

Peter van 't Hof's avatar
Peter van 't Hof committed
36
37
38
39
40
41
42
43
44
45
46
47
48
49
//class CheckAllelesVcfInBam(val root: Configurable) extends BiopetJavaCommandLineFunction {
//  javaMainClass = getClass.getName
//  
//  @Input(doc = "Input vcf file", shortName = "V", required = true)
//  var inputFile: File = _
//
//  @Input(doc = "Input bam file", shortName = "bam", required = true)
//  var bamFiles: List[File] = Nil
//  
//  @Output(doc = "Output vcf file", shortName = "o", required = true)
//  var output: File = _
//
//  override def commandLine = super.commandLine + required("-V", inputFile) + repeat("-bam", bamFiles) + required(output)
//}
Peter van 't Hof's avatar
Peter van 't Hof committed
50
51

object CheckAllelesVcfInBam extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
52
53
  case class Args(inputFile: File = null, outputFile: File = null, samples: List[String] = Nil,
                  bamFiles: List[File] = Nil, minMapQual: Int = 1) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
54
55

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
    opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputFile = x)
    }
    opt[File]('o', "outputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputFile = x)
    }
    opt[String]('s', "sample") unbounded () minOccurs (1) action { (x, c) =>
      c.copy(samples = x :: c.samples)
    }
    opt[File]('b', "bam") unbounded () minOccurs (1) action { (x, c) =>
      c.copy(bamFiles = x :: c.bamFiles)
    }
    opt[Int]('m', "min_mapping_quality") maxOccurs (1) action { (x, c) =>
      c.copy(minMapQual = c.minMapQual)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
71
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
72
73

  private class CountReport(
Peter van 't Hof's avatar
Peter van 't Hof committed
74
75
76
    var notFound: Int = 0,
    var aCounts: mutable.Map[String, Int] = mutable.Map(),
    var duplicateReads: Int = 0,
Peter van 't Hof's avatar
Peter van 't Hof committed
77
78
    var lowMapQualReads: Int = 0)

Peter van 't Hof's avatar
Peter van 't Hof committed
79
80
81
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
82

Peter van 't Hof's avatar
Peter van 't Hof committed
83
84
    if (commandArgs.bamFiles.size != commandArgs.samples.size)
      logger.warn("Number of samples is diffrent then number of bam files, left over will be removed")
Peter van 't Hof's avatar
Peter van 't Hof committed
85
86
    val samReaderFactory = SamReaderFactory.makeDefault
    val bamReaders: Map[String, SamReader] = Map(commandArgs.samples zip commandArgs.bamFiles.map(x => samReaderFactory.open(x)): _*)
Peter van 't Hof's avatar
Peter van 't Hof committed
87
    val bamHeaders = bamReaders.map(x => (x._1, x._2.getFileHeader))
Peter van 't Hof's avatar
Peter van 't Hof committed
88

Peter van 't Hof's avatar
Peter van 't Hof committed
89
90
    val reader = new VCFFileReader(commandArgs.inputFile, false)
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().setOutputFile(commandArgs.outputFile).build)
Peter van 't Hof's avatar
Peter van 't Hof committed
91

Peter van 't Hof's avatar
Peter van 't Hof committed
92
    val header = reader.getFileHeader
Peter van 't Hof's avatar
Peter van 't Hof committed
93
94
95
96
97
    for ((sample, _) <- bamReaders) {
      header.addMetaDataLine(new VCFInfoHeaderLine("BAM-AD-" + sample,
        VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allele depth, ref and alt on order of vcf file"))
      header.addMetaDataLine(new VCFInfoHeaderLine("BAM-DP-" + sample,
        1, VCFHeaderLineType.Integer, "Total reads on this location"))
Peter van 't Hof's avatar
Peter van 't Hof committed
98
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
99

Peter van 't Hof's avatar
Peter van 't Hof committed
100
    writer.writeHeader(header)
Peter van 't Hof's avatar
Peter van 't Hof committed
101

Peter van 't Hof's avatar
Peter van 't Hof committed
102
    for (vcfRecord <- reader) {
Peter van 't Hof's avatar
Peter van 't Hof committed
103
      val countReports: Map[String, CountReport] = bamReaders.map(x => (x._1, new CountReport))
Peter van 't Hof's avatar
Peter van 't Hof committed
104
105
      val refAllele = vcfRecord.getReference.getBaseString
      for ((sample, bamReader) <- bamReaders) {
Peter van 't Hof's avatar
Peter van 't Hof committed
106
107
        val queryInterval = new QueryInterval(bamHeaders(sample).getSequenceIndex(vcfRecord.getChr),
          vcfRecord.getStart, vcfRecord.getStart + refAllele.size - 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
108
        val bamIter = bamReader.query(Array(queryInterval), false)
Peter van 't Hof's avatar
Peter van 't Hof committed
109
110

        def filterRead(samRecord: SAMRecord): Boolean = {
Peter van 't Hof's avatar
Peter van 't Hof committed
111
112
113
114
115
116
117
118
119
120
121
122
          if (samRecord.getDuplicateReadFlag) {
            countReports(sample).duplicateReads += 1
            return true
          }
          if (samRecord.getSupplementaryAlignmentFlag) return true
          if (samRecord.getNotPrimaryAlignmentFlag) return true
          if (samRecord.getMappingQuality < commandArgs.minMapQual) {
            countReports(sample).lowMapQualReads += 1
            return true
          }
          return false
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
123

Peter van 't Hof's avatar
Peter van 't Hof committed
124
125
        val counts = for (samRecord <- bamIter if !filterRead(samRecord)) {
          checkAlles(samRecord, vcfRecord) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
126
127
            case Some(a) => if (countReports(sample).aCounts.contains(a)) countReports(sample).aCounts(a) += 1
            else countReports(sample).aCounts += (a -> 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
128
129
            case _ => countReports(sample).notFound += 1
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
130
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
131
132
        bamIter.close
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
133

Peter van 't Hof's avatar
Peter van 't Hof committed
134
      val builder = new VariantContextBuilder(vcfRecord)
Peter van 't Hof's avatar
Peter van 't Hof committed
135
      for ((k, v) <- countReports) {
Peter van 't Hof's avatar
Peter van 't Hof committed
136
137
138
139
140
141
        val s = for (allele <- vcfRecord.getAlleles) yield {
          val s = allele.getBaseString
          if (v.aCounts.contains(s)) v.aCounts(s)
          else 0
        }
        builder.attribute("BAM-AD-" + k, s.mkString(","))
Peter van 't Hof's avatar
Peter van 't Hof committed
142
        builder.attribute("BAM-DP-" + k, (0 /: s)(_ + _) + v.notFound)
Peter van 't Hof's avatar
Peter van 't Hof committed
143
144
145
146
147
148
149
      }
      writer.add(builder.make)
    }
    for ((_, r) <- bamReaders) r.close
    reader.close
    writer.close
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
150
151

  def checkAlles(samRecord: SAMRecord, vcfRecord: VariantContext): Option[String] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
152
    val readStartPos = List.range(0, samRecord.getReadBases.length)
Peter van 't Hof's avatar
Peter van 't Hof committed
153
      .find(x => samRecord.getReferencePositionAtReadPosition(x + 1) == vcfRecord.getStart) getOrElse { return None }
Peter van 't Hof's avatar
Peter van 't Hof committed
154
155
156
157
158
    val readBases = samRecord.getReadBases()
    val alleles = vcfRecord.getAlleles.map(x => x.getBaseString)
    val refAllele = alleles.head
    var maxSize = 1
    for (allele <- alleles if allele.size > maxSize) maxSize = allele.size
Peter van 't Hof's avatar
Peter van 't Hof committed
159
160
161
    val readC = for (t <- readStartPos until readStartPos + maxSize if t < readBases.length) yield readBases(t).toChar
    val allelesInRead = mutable.Set(alleles.filter(readC.mkString.startsWith(_)): _*)

Peter van 't Hof's avatar
Peter van 't Hof committed
162
163
164
165
166
    // Removal of insertions that are not really in the cigarstring
    for (allele <- allelesInRead if allele.size > refAllele.size) {
      val refPos = for (t <- refAllele.size until allele.size) yield samRecord.getReferencePositionAtReadPosition(readStartPos + t + 1)
      if (refPos.exists(_ > 0)) allelesInRead -= allele
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
167

Peter van 't Hof's avatar
Peter van 't Hof committed
168
    // Removal of alleles that are not really in the cigarstring
Peter van 't Hof's avatar
Peter van 't Hof committed
169
    for (allele <- allelesInRead) {
Peter van 't Hof's avatar
Peter van 't Hof committed
170
171
      val readPosAfterAllele = samRecord.getReferencePositionAtReadPosition(readStartPos + allele.size + 1)
      val vcfPosAfterAllele = vcfRecord.getStart + refAllele.size
Peter van 't Hof's avatar
Peter van 't Hof committed
172
173
      if (readPosAfterAllele != vcfPosAfterAllele &&
        (refAllele.size != allele.size || (refAllele.size == allele.size && readPosAfterAllele < 0))) allelesInRead -= allele
Peter van 't Hof's avatar
Peter van 't Hof committed
174
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
175

Peter van 't Hof's avatar
Peter van 't Hof committed
176
177
178
179
    for (allele <- allelesInRead if allele.size >= refAllele.size) {
      if (allelesInRead.exists(_.size > allele.size)) allelesInRead -= allele
    }
    if (allelesInRead.contains(refAllele) && allelesInRead.exists(_.size < refAllele.size)) allelesInRead -= refAllele
Peter van 't Hof's avatar
Peter van 't Hof committed
180
181
182
183
    if (allelesInRead.isEmpty) return None
    else if (allelesInRead.size == 1) return Some(allelesInRead.head)
    else {
      logger.warn("vcfRecord: " + vcfRecord)
Peter van 't Hof's avatar
Peter van 't Hof committed
184
      logger.warn("samRecord: " + samRecord.getSAMString)
Peter van 't Hof's avatar
Peter van 't Hof committed
185
      logger.warn("Found multiple options: " + allelesInRead.toString)
Peter van 't Hof's avatar
Peter van 't Hof committed
186
      logger.warn("ReadStartPos: " + readStartPos + "  Read Length: " + samRecord.getReadLength)
Peter van 't Hof's avatar
Peter van 't Hof committed
187
188
189
190
191
      logger.warn("Read skipped, please report this")
      return None
    }
  }
}