BamUtils.scala 3.37 KB
Newer Older
1
2
3
4
package nl.lumc.sasc.biopet.utils

import java.io.File

Wai Yi Leung's avatar
Wai Yi Leung committed
5
import htsjdk.samtools.{ SamReader, SamReaderFactory }
6
7

import scala.collection.JavaConversions._
Wai Yi Leung's avatar
Wai Yi Leung committed
8
import scala.collection.parallel.immutable
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

/**
 * Created by pjvan_thof on 11/19/15.
 */
object BamUtils {

  /**
   * This method will convert a list of bam files to a Map[<sampleName>, <bamFile>]
   *
   * Each sample may only be once in the list
   *
   * @throws IllegalArgumentException
   * @param bamFiles input bam files
   * @return
   */
  def sampleBamMap(bamFiles: List[File]): Map[String, File] = {
    val temp = bamFiles.map { file =>
      val inputSam = SamReaderFactory.makeDefault.open(file)
      val samples = inputSam.getFileHeader.getReadGroups.map(_.getSample).distinct
      if (samples.size == 1) samples.head -> file
29
30
      else if (samples.size > 1) throw new IllegalArgumentException("Bam contains multiple sample IDs: " + file)
      else throw new IllegalArgumentException("Bam does not contain sample ID or have no readgroups defined: " + file)
31
32
33
34
    }
    if (temp.map(_._1).distinct.size != temp.size) throw new IllegalArgumentException("Samples has been found twice")
    temp.toMap
  }
35

Wai Yi Leung's avatar
Wai Yi Leung committed
36
37
38
39
40
41
42
43
44
  /**
   * Estimate the insertsize of fragments within the given contig.
   * Uses the properly paired reads according to flags set by the aligner
   *
   * @param inputBam input bam file
   * @param contig contig to scan for
   * @param end postion to stop scanning
   * @return Int with insertsize for this contig
   */
45
  def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int, samplingSize: Int = 100000): Option[Int] = {
Wai Yi Leung's avatar
Wai Yi Leung committed
46
    val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam)
47
    val samIterator = inputSam.query(contig, start, end, true)
Wai Yi Leung's avatar
Wai Yi Leung committed
48
49
    val insertsizes: List[Int] = (for {
      read <- samIterator.toStream.takeWhile(rec => {
50
        val paired = rec.getReadPairedFlag && rec.getProperPairFlag
Wai Yi Leung's avatar
Wai Yi Leung committed
51
        val bothMapped = if (paired) ((rec.getReadUnmappedFlag == false) && (rec.getMateUnmappedFlag == false)) else false
52
53
        paired && bothMapped
      }).take(samplingSize)
54
    } yield {
Wai Yi Leung's avatar
Wai Yi Leung committed
55
56
57
58
59
60
61
62
      read.getInferredInsertSize.asInstanceOf[Int].abs
    })(collection.breakOut)
    val cti = insertsizes.foldLeft((0.0, 0))((t, r) => (t._1 + r, t._2 + 1))

    samIterator.close()
    inputSam.close()
    val ret = if (cti._2 == 0) None else Some((cti._1 / cti._2).toInt)
    ret
63
  }
64

65
  /**
66
   * Estimate the insertsize for one single bamfile and return the insertsize
67
   *
68
   * @param bamFile bamfile to estimate avg insertsize from
69
70
   * @return
   */
71
  def sampleBamInsertSize(bamFile: File, samplingSize: Int = 100000): Int = {
Wai Yi Leung's avatar
Wai Yi Leung committed
72
73
    val inputSam: SamReader = SamReaderFactory.makeDefault.open(bamFile)
    val baminsertsizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({
74
      contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize)
Wai Yi Leung's avatar
Wai Yi Leung committed
75
    }).toList
76
    val counts = baminsertsizes.flatMap(x => x)
Wai Yi Leung's avatar
Wai Yi Leung committed
77
78

    if (counts.size != 0) {
79
      counts.sum / counts.size
Wai Yi Leung's avatar
Wai Yi Leung committed
80
81
82
    } else {
      0
    }
83
84
85
86
87
88
89
90
  }

  /**
   * Estimate the insertsize for each bam file and return Map[<sampleBamFile>, <insertSize>]
   *
   * @param bamFiles input bam files
   * @return
   */
91
92
  def sampleBamsInsertSize(bamFiles: List[File], samplingSize: Int = 100000): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile =>
    bamFile -> sampleBamInsertSize(bamFile, samplingSize)
93
  }.toMap
94

95
}