BamUtils.scala 3.12 KB
Newer Older
1
2
3
4
package nl.lumc.sasc.biopet.utils

import java.io.File

Wai Yi Leung's avatar
Wai Yi Leung committed
5
import htsjdk.samtools.{ SamReader, SamReaderFactory }
6
7

import scala.collection.JavaConversions._
Wai Yi Leung's avatar
Wai Yi Leung committed
8
import scala.collection.parallel.immutable
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

/**
 * Created by pjvan_thof on 11/19/15.
 */
object BamUtils {

  /**
   * This method will convert a list of bam files to a Map[<sampleName>, <bamFile>]
   *
   * Each sample may only be once in the list
   *
   * @throws IllegalArgumentException
   * @param bamFiles input bam files
   * @return
   */
  def sampleBamMap(bamFiles: List[File]): Map[String, File] = {
    val temp = bamFiles.map { file =>
      val inputSam = SamReaderFactory.makeDefault.open(file)
      val samples = inputSam.getFileHeader.getReadGroups.map(_.getSample).distinct
      if (samples.size == 1) samples.head -> file
      else throw new IllegalArgumentException("Bam contains multiple sample IDs: " + file)
    }
    if (temp.map(_._1).distinct.size != temp.size) throw new IllegalArgumentException("Samples has been found twice")
    temp.toMap
  }
34

Wai Yi Leung's avatar
Wai Yi Leung committed
35
36
37
38
39
40
41
42
43
  /**
   * Estimate the insertsize of fragments within the given contig.
   * Uses the properly paired reads according to flags set by the aligner
   *
   * @param inputBam input bam file
   * @param contig contig to scan for
   * @param end postion to stop scanning
   * @return Int with insertsize for this contig
   */
44
  def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int, samplingSize: Int = 100000): Option[Int] = {
Wai Yi Leung's avatar
Wai Yi Leung committed
45
    val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam)
46
    val samIterator = inputSam.query(contig, start, end, true)
Wai Yi Leung's avatar
Wai Yi Leung committed
47
48
    val insertsizes: List[Int] = (for {
      read <- samIterator.toStream.takeWhile(rec => {
49
50
51
52
        val paired = rec.getReadPairedFlag && rec.getProperPairFlag
        val bothMapped = (rec.getReadUnmappedFlag == false) && (rec.getMateUnmappedFlag == false)
        paired && bothMapped
      }).take(samplingSize)
53
    } yield {
Wai Yi Leung's avatar
Wai Yi Leung committed
54
55
56
57
58
59
60
61
      read.getInferredInsertSize.asInstanceOf[Int].abs
    })(collection.breakOut)
    val cti = insertsizes.foldLeft((0.0, 0))((t, r) => (t._1 + r, t._2 + 1))

    samIterator.close()
    inputSam.close()
    val ret = if (cti._2 == 0) None else Some((cti._1 / cti._2).toInt)
    ret
62
  }
63

64
  /**
65
   * Estimate the insertsize for one single bamfile and return the insertsize
66
   *
67
   * @param bamFile bamfile to estimate avg insertsize from
68
69
   * @return
   */
70
  def sampleBamInsertSize(bamFile: File): Int = {
Wai Yi Leung's avatar
Wai Yi Leung committed
71
72
    val inputSam: SamReader = SamReaderFactory.makeDefault.open(bamFile)
    val baminsertsizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({
73
      contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength)
Wai Yi Leung's avatar
Wai Yi Leung committed
74
    }).toList
75
76
77
78
79
80
81
82
83
84
85
86
87
88
    val counts = baminsertsizes.flatMap(x => x)
    val sum = counts.reduceLeft(_ + _)
    val n = counts.size
    sum / n
  }

  /**
   * Estimate the insertsize for each bam file and return Map[<sampleBamFile>, <insertSize>]
   *
   * @param bamFiles input bam files
   * @return
   */
  def sampleBamInsertSize(bamFiles: List[File]): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile =>
    bamFile -> sampleBamInsertSize(bamFile)
89
  }.toMap
90

91
}