FindRepeatsPacBio.scala 6.22 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16 17
package nl.lumc.sasc.biopet.tools

18
import java.io.{ PrintWriter, File }
Peter van 't Hof's avatar
Peter van 't Hof committed
19 20

import htsjdk.samtools.{ QueryInterval, SAMRecord, SamReaderFactory, ValidationStringency }
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import nl.lumc.sasc.biopet.utils.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
22

Peter van 't Hof's avatar
Peter van 't Hof committed
23
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import scala.io.Source
Peter van 't Hof's avatar
Peter van 't Hof committed
25 26

object FindRepeatsPacBio extends ToolCommand {
27 28 29
  case class Args(inputBam: File = null,
                  outputFile: Option[File] = None,
                  inputBed: File = null) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
30 31

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
32
    opt[File]('I', "inputBam") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
33
      c.copy(inputBam = x)
Sander Bollen's avatar
Sander Bollen committed
34
    } text "Path to input file"
35 36 37
    opt[File]('o', "outputFile") maxOccurs 1 valueName "<file>" action { (x, c) =>
      c.copy(outputFile = Some(x))
    } text "Path to input file"
Peter van 't Hof's avatar
Peter van 't Hof committed
38
    opt[File]('b', "inputBed") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
39
      c.copy(inputBed = x)
Sander Bollen's avatar
Sander Bollen committed
40
    } text "Path to bed file"
Peter van 't Hof's avatar
Peter van 't Hof committed
41
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
42

Peter van 't Hof's avatar
Peter van 't Hof committed
43 44 45 46
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
47

Peter van 't Hof's avatar
Peter van 't Hof committed
48
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
49
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
Peter van 't Hof's avatar
Peter van 't Hof committed
50 51 52
    val bamReader = SamReaderFactory.makeDefault
      .validationStringency(ValidationStringency.SILENT)
      .open(commandArgs.inputBam)
Peter van 't Hof's avatar
Peter van 't Hof committed
53
    val bamHeader = bamReader.getFileHeader
Peter van 't Hof's avatar
Peter van 't Hof committed
54 55 56 57 58 59

    val header = List("chr", "startPos", "stopPos", "Repeat_seq", "repeatLength",
      "original_Repeat_readLength", "Calculated_repeat_readLength",
      "minLength", "maxLength", "inserts", "deletions", "notSpan")

    for (
Peter van 't Hof's avatar
Peter van 't Hof committed
60
      bedLine <- Source.fromFile(commandArgs.inputBed).getLines();
Peter van 't Hof's avatar
Peter van 't Hof committed
61
      values = bedLine.split("\t"); if values.size >= 3
Peter van 't Hof's avatar
Peter van 't Hof committed
62
    ) {
Peter van 't Hof's avatar
Peter van 't Hof committed
63
      val interval = new QueryInterval(bamHeader.getSequenceIndex(values(0)), values(1).toInt, values(2).toInt)
Peter van 't Hof's avatar
Peter van 't Hof committed
64
      val bamIter = bamReader.query(Array(interval), false)
Peter van 't Hof's avatar
Peter van 't Hof committed
65
      val results = for (samRecord <- bamIter) yield procesSamrecord(samRecord, interval)
Peter van 't Hof's avatar
Peter van 't Hof committed
66 67
      val chr = values(0)
      val startPos = values(1)
68
      val stopPos = values(2)
Peter van 't Hof's avatar
Peter van 't Hof committed
69
      val typeRepeat: String = if (values.size >= 15) values(14) else ""
70
      val repeatLength = typeRepeat.length
Peter van 't Hof's avatar
Peter van 't Hof committed
71 72 73 74 75 76 77
      val oriRepeatLength = values(2).toInt - values(1).toInt + 1
      var calcRepeatLength: List[Int] = Nil
      var minLength = -1
      var maxLength = -1
      var inserts: List[String] = Nil
      var deletions: List[String] = Nil
      var notSpan = 0
Peter van 't Hof's avatar
Peter van 't Hof committed
78

Peter van 't Hof's avatar
Peter van 't Hof committed
79 80 81 82 83
      for (result <- results) {
        if (result.isEmpty) notSpan += 1
        else {
          inserts ::= result.get.ins.map(_.insert).mkString(",")
          deletions ::= result.get.dels.map(_.length).mkString(",")
Peter van 't Hof's avatar
Peter van 't Hof committed
84
          val length = oriRepeatLength - result.get.beginDel - result.get.endDel -
Peter van 't Hof's avatar
Peter van 't Hof committed
85
            (0 /: result.get.dels.map(_.length))(_ + _) + (0 /: result.get.ins.map(_.insert.length))(_ + _)
Peter van 't Hof's avatar
Peter van 't Hof committed
86 87 88 89 90
          calcRepeatLength ::= length
          if (length > maxLength) maxLength = length
          if (length < minLength || minLength == -1) minLength = length
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
91
      bamIter.close()
92 93 94 95 96 97 98 99 100 101 102 103 104 105
      commandArgs.outputFile match {
        case Some(file) => {
          val writer = new PrintWriter(file)
          writer.println(header.mkString("\t"))
          writer.println(List(chr, startPos, stopPos, typeRepeat, repeatLength, oriRepeatLength, calcRepeatLength.mkString(","), minLength,
            maxLength, inserts.mkString("/"), deletions.mkString("/"), notSpan).mkString("\t"))
          writer.close()
        }
        case _ => {
          println(header.mkString("\t"))
          println(List(chr, startPos, stopPos, typeRepeat, repeatLength, oriRepeatLength, calcRepeatLength.mkString(","), minLength,
            maxLength, inserts.mkString("/"), deletions.mkString("/"), notSpan).mkString("\t"))
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
106 107
    }
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
108 109 110 111

  case class Del(pos: Int, length: Int)
  case class Ins(pos: Int, insert: String)

Peter van 't Hof's avatar
Peter van 't Hof committed
112 113 114 115 116 117
  class Result() {
    var beginDel = 0
    var endDel = 0
    var dels: List[Del] = Nil
    var ins: List[Ins] = Nil
    var samRecord: SAMRecord = _
Peter van 't Hof's avatar
Peter van 't Hof committed
118

Peter van 't Hof's avatar
Peter van 't Hof committed
119
    override def toString = {
Peter van 't Hof's avatar
Peter van 't Hof committed
120
      "id: " + samRecord.getReadName + "  beginDel: " + beginDel + "  endDel: " + endDel + "  dels: " + dels + "  ins: " + ins
Peter van 't Hof's avatar
Peter van 't Hof committed
121 122
    }
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
123 124

  def procesSamrecord(samRecord: SAMRecord, interval: QueryInterval): Option[Result] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
125
    val readStartPos = List.range(0, samRecord.getReadBases.length)
Peter van 't Hof's avatar
Peter van 't Hof committed
126
      .find(x => samRecord.getReferencePositionAtReadPosition(x) >= interval.start)
Peter van 't Hof's avatar
Peter van 't Hof committed
127 128 129 130
    var readPos = if (readStartPos.isEmpty) return None else readStartPos.get
    if (samRecord.getAlignmentEnd < interval.end) return None
    if (samRecord.getAlignmentStart > interval.start) return None
    var refPos = samRecord.getReferencePositionAtReadPosition(readPos)
Peter van 't Hof's avatar
Peter van 't Hof committed
131

Peter van 't Hof's avatar
Peter van 't Hof committed
132 133 134 135 136 137 138 139 140
    val result = new Result
    result.samRecord = samRecord
    result.beginDel = interval.start - refPos
    while (refPos < interval.end) {
      val oldRefPos = refPos
      val oldReadPos = readPos
      do {
        readPos += 1
        refPos = samRecord.getReferencePositionAtReadPosition(readPos)
Peter van 't Hof's avatar
Peter van 't Hof committed
141
      } while (refPos < oldReadPos)
Peter van 't Hof's avatar
Peter van 't Hof committed
142 143 144 145 146
      val readDiff = readPos - oldReadPos
      val refDiff = refPos - oldRefPos
      if (refPos > interval.end) {
        result.endDel = interval.end - oldRefPos
      } else if (readDiff > refDiff) { //Insertion
Peter van 't Hof's avatar
Peter van 't Hof committed
147
        val insert = for (t <- oldReadPos + 1 until readPos) yield samRecord.getReadBases()(t - 1).toChar
Peter van 't Hof's avatar
Peter van 't Hof committed
148 149 150 151 152
        result.ins ::= Ins(oldRefPos, insert.mkString)
      } else if (readDiff < refDiff) { // Deletion
        result.dels ::= Del(oldRefPos, refDiff - readDiff)
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
153

Peter van 't Hof's avatar
Peter van 't Hof committed
154
    Some(result)
Peter van 't Hof's avatar
Peter van 't Hof committed
155 156
  }
}