CheckAllelesVcfInBam.scala 7.77 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
package nl.lumc.sasc.biopet.tools
Peter van 't Hof's avatar
Peter van 't Hof committed
2

Peter van 't Hof's avatar
Peter van 't Hof committed
3
import htsjdk.samtools.{ QueryInterval, SamReaderFactory, SAMRecord, SamReader }
Peter van 't Hof's avatar
Peter van 't Hof committed
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.variantcontext.VariantContextBuilder
import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder
import htsjdk.variant.vcf.VCFFileReader
import htsjdk.variant.vcf.VCFInfoHeaderLine
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.collection.JavaConversions._
import scala.collection.mutable
import htsjdk.variant.vcf.VCFHeaderLineType
import htsjdk.variant.vcf.VCFHeaderLineCount
import scala.math._

Peter van 't Hof's avatar
Peter van 't Hof committed
21
22
23
24
25
26
27
28
29
30
31
32
33
34
//class CheckAllelesVcfInBam(val root: Configurable) extends BiopetJavaCommandLineFunction {
//  javaMainClass = getClass.getName
//  
//  @Input(doc = "Input vcf file", shortName = "V", required = true)
//  var inputFile: File = _
//
//  @Input(doc = "Input bam file", shortName = "bam", required = true)
//  var bamFiles: List[File] = Nil
//  
//  @Output(doc = "Output vcf file", shortName = "o", required = true)
//  var output: File = _
//
//  override def commandLine = super.commandLine + required("-V", inputFile) + repeat("-bam", bamFiles) + required(output)
//}
Peter van 't Hof's avatar
Peter van 't Hof committed
35
36

object CheckAllelesVcfInBam extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
37
38
  case class Args(inputFile: File = null, outputFile: File = null, samples: List[String] = Nil,
                  bamFiles: List[File] = Nil, minMapQual: Int = 1) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
39
40

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
    opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputFile = x)
    }
    opt[File]('o', "outputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputFile = x)
    }
    opt[String]('s', "sample") unbounded () minOccurs (1) action { (x, c) =>
      c.copy(samples = x :: c.samples)
    }
    opt[File]('b', "bam") unbounded () minOccurs (1) action { (x, c) =>
      c.copy(bamFiles = x :: c.bamFiles)
    }
    opt[Int]('m', "min_mapping_quality") maxOccurs (1) action { (x, c) =>
      c.copy(minMapQual = c.minMapQual)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
56
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
57
58

  private class CountReport(
Peter van 't Hof's avatar
Peter van 't Hof committed
59
60
61
    var notFound: Int = 0,
    var aCounts: mutable.Map[String, Int] = mutable.Map(),
    var duplicateReads: Int = 0,
Peter van 't Hof's avatar
Peter van 't Hof committed
62
63
    var lowMapQualReads: Int = 0)

Peter van 't Hof's avatar
Peter van 't Hof committed
64
65
66
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
67

Peter van 't Hof's avatar
Peter van 't Hof committed
68
69
    if (commandArgs.bamFiles.size != commandArgs.samples.size)
      logger.warn("Number of samples is diffrent then number of bam files, left over will be removed")
Peter van 't Hof's avatar
Peter van 't Hof committed
70
71
    val samReaderFactory = SamReaderFactory.makeDefault
    val bamReaders: Map[String, SamReader] = Map(commandArgs.samples zip commandArgs.bamFiles.map(x => samReaderFactory.open(x)): _*)
Peter van 't Hof's avatar
Peter van 't Hof committed
72
    val bamHeaders = bamReaders.map(x => (x._1, x._2.getFileHeader))
Peter van 't Hof's avatar
Peter van 't Hof committed
73

Peter van 't Hof's avatar
Peter van 't Hof committed
74
75
    val reader = new VCFFileReader(commandArgs.inputFile, false)
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().setOutputFile(commandArgs.outputFile).build)
Peter van 't Hof's avatar
Peter van 't Hof committed
76

Peter van 't Hof's avatar
Peter van 't Hof committed
77
    val header = reader.getFileHeader
Peter van 't Hof's avatar
Peter van 't Hof committed
78
79
80
81
82
    for ((sample, _) <- bamReaders) {
      header.addMetaDataLine(new VCFInfoHeaderLine("BAM-AD-" + sample,
        VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allele depth, ref and alt on order of vcf file"))
      header.addMetaDataLine(new VCFInfoHeaderLine("BAM-DP-" + sample,
        1, VCFHeaderLineType.Integer, "Total reads on this location"))
Peter van 't Hof's avatar
Peter van 't Hof committed
83
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
84

Peter van 't Hof's avatar
Peter van 't Hof committed
85
    writer.writeHeader(header)
Peter van 't Hof's avatar
Peter van 't Hof committed
86

Peter van 't Hof's avatar
Peter van 't Hof committed
87
    for (vcfRecord <- reader) {
Peter van 't Hof's avatar
Peter van 't Hof committed
88
      val countReports: Map[String, CountReport] = bamReaders.map(x => (x._1, new CountReport))
Peter van 't Hof's avatar
Peter van 't Hof committed
89
90
      val refAllele = vcfRecord.getReference.getBaseString
      for ((sample, bamReader) <- bamReaders) {
Peter van 't Hof's avatar
Peter van 't Hof committed
91
92
        val queryInterval = new QueryInterval(bamHeaders(sample).getSequenceIndex(vcfRecord.getChr),
          vcfRecord.getStart, vcfRecord.getStart + refAllele.size - 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
93
        val bamIter = bamReader.query(Array(queryInterval), false)
Peter van 't Hof's avatar
Peter van 't Hof committed
94
95

        def filterRead(samRecord: SAMRecord): Boolean = {
Peter van 't Hof's avatar
Peter van 't Hof committed
96
97
98
99
100
101
102
103
104
105
106
107
          if (samRecord.getDuplicateReadFlag) {
            countReports(sample).duplicateReads += 1
            return true
          }
          if (samRecord.getSupplementaryAlignmentFlag) return true
          if (samRecord.getNotPrimaryAlignmentFlag) return true
          if (samRecord.getMappingQuality < commandArgs.minMapQual) {
            countReports(sample).lowMapQualReads += 1
            return true
          }
          return false
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
108

Peter van 't Hof's avatar
Peter van 't Hof committed
109
110
        val counts = for (samRecord <- bamIter if !filterRead(samRecord)) {
          checkAlles(samRecord, vcfRecord) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
111
112
            case Some(a) => if (countReports(sample).aCounts.contains(a)) countReports(sample).aCounts(a) += 1
            else countReports(sample).aCounts += (a -> 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
113
114
            case _ => countReports(sample).notFound += 1
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
115
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
116
117
        bamIter.close
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
118

Peter van 't Hof's avatar
Peter van 't Hof committed
119
      val builder = new VariantContextBuilder(vcfRecord)
Peter van 't Hof's avatar
Peter van 't Hof committed
120
      for ((k, v) <- countReports) {
Peter van 't Hof's avatar
Peter van 't Hof committed
121
122
123
124
125
126
        val s = for (allele <- vcfRecord.getAlleles) yield {
          val s = allele.getBaseString
          if (v.aCounts.contains(s)) v.aCounts(s)
          else 0
        }
        builder.attribute("BAM-AD-" + k, s.mkString(","))
Peter van 't Hof's avatar
Peter van 't Hof committed
127
        builder.attribute("BAM-DP-" + k, (0 /: s)(_ + _) + v.notFound)
Peter van 't Hof's avatar
Peter van 't Hof committed
128
129
130
131
132
133
134
      }
      writer.add(builder.make)
    }
    for ((_, r) <- bamReaders) r.close
    reader.close
    writer.close
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
135
136

  def checkAlles(samRecord: SAMRecord, vcfRecord: VariantContext): Option[String] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
137
    val readStartPos = List.range(0, samRecord.getReadBases.length)
Peter van 't Hof's avatar
Peter van 't Hof committed
138
      .find(x => samRecord.getReferencePositionAtReadPosition(x + 1) == vcfRecord.getStart) getOrElse { return None }
Peter van 't Hof's avatar
Peter van 't Hof committed
139
140
141
142
143
    val readBases = samRecord.getReadBases()
    val alleles = vcfRecord.getAlleles.map(x => x.getBaseString)
    val refAllele = alleles.head
    var maxSize = 1
    for (allele <- alleles if allele.size > maxSize) maxSize = allele.size
Peter van 't Hof's avatar
Peter van 't Hof committed
144
145
146
    val readC = for (t <- readStartPos until readStartPos + maxSize if t < readBases.length) yield readBases(t).toChar
    val allelesInRead = mutable.Set(alleles.filter(readC.mkString.startsWith(_)): _*)

Peter van 't Hof's avatar
Peter van 't Hof committed
147
148
149
150
151
    // Removal of insertions that are not really in the cigarstring
    for (allele <- allelesInRead if allele.size > refAllele.size) {
      val refPos = for (t <- refAllele.size until allele.size) yield samRecord.getReferencePositionAtReadPosition(readStartPos + t + 1)
      if (refPos.exists(_ > 0)) allelesInRead -= allele
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
152

Peter van 't Hof's avatar
Peter van 't Hof committed
153
    // Removal of alleles that are not really in the cigarstring
Peter van 't Hof's avatar
Peter van 't Hof committed
154
    for (allele <- allelesInRead) {
Peter van 't Hof's avatar
Peter van 't Hof committed
155
156
      val readPosAfterAllele = samRecord.getReferencePositionAtReadPosition(readStartPos + allele.size + 1)
      val vcfPosAfterAllele = vcfRecord.getStart + refAllele.size
Peter van 't Hof's avatar
Peter van 't Hof committed
157
158
      if (readPosAfterAllele != vcfPosAfterAllele &&
        (refAllele.size != allele.size || (refAllele.size == allele.size && readPosAfterAllele < 0))) allelesInRead -= allele
Peter van 't Hof's avatar
Peter van 't Hof committed
159
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
160

Peter van 't Hof's avatar
Peter van 't Hof committed
161
162
163
164
    for (allele <- allelesInRead if allele.size >= refAllele.size) {
      if (allelesInRead.exists(_.size > allele.size)) allelesInRead -= allele
    }
    if (allelesInRead.contains(refAllele) && allelesInRead.exists(_.size < refAllele.size)) allelesInRead -= refAllele
Peter van 't Hof's avatar
Peter van 't Hof committed
165
166
167
168
    if (allelesInRead.isEmpty) return None
    else if (allelesInRead.size == 1) return Some(allelesInRead.head)
    else {
      logger.warn("vcfRecord: " + vcfRecord)
Peter van 't Hof's avatar
Peter van 't Hof committed
169
      logger.warn("samRecord: " + samRecord.getSAMString)
Peter van 't Hof's avatar
Peter van 't Hof committed
170
      logger.warn("Found multiple options: " + allelesInRead.toString)
Peter van 't Hof's avatar
Peter van 't Hof committed
171
      logger.warn("ReadStartPos: " + readStartPos + "  Read Length: " + samRecord.getReadLength)
Peter van 't Hof's avatar
Peter van 't Hof committed
172
173
174
175
176
      logger.warn("Read skipped, please report this")
      return None
    }
  }
}