Skip to content
Snippets Groups Projects
Commit 998687b1 authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Merge branch 'fix-bamutils-insertsize' into 'develop'

BamUtils insert-size estimation is more accurate by sampling from all over the contig.

Updated BamUtils to fix the insertsize estimation used in the PindelConfig tool.

See merge request !371
parents d7177565 26d14d83
No related branches found
No related tags found
No related merge requests found
...@@ -18,8 +18,10 @@ package nl.lumc.sasc.biopet.utils ...@@ -18,8 +18,10 @@ package nl.lumc.sasc.biopet.utils
import java.io.File import java.io.File
import htsjdk.samtools.{ SamReader, SamReaderFactory } import htsjdk.samtools.{ SamReader, SamReaderFactory }
import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
import scala.collection.JavaConversions._ import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.collection.parallel.immutable import scala.collection.parallel.immutable
/** /**
...@@ -57,39 +59,76 @@ object BamUtils { ...@@ -57,39 +59,76 @@ object BamUtils {
* @param end postion to stop scanning * @param end postion to stop scanning
* @return Int with insertsize for this contig * @return Int with insertsize for this contig
*/ */
/**
 * Estimate the mean insert size on a single contig by sampling read pairs
 * from bins spread across the whole contig (rather than only its start).
 *
 * @param inputBam coordinate-sorted, indexed bam file to sample from
 * @param contig contig (reference sequence name) to scan
 * @param start position to start scanning (1-based)
 * @param end position to stop scanning
 * @param samplingSize maximum number of records inspected per bin
 * @param binSize width of each sampling bin along the contig
 * @return Some(mean insert size) for this contig, or None when no usable
 *         read pair was observed
 */
def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int,
                     samplingSize: Int = 10000,
                     binSize: Int = 1000000): Option[Int] = {

  // Divide the contig into bins so the estimate samples the whole contig.
  val insertSizesOnAllFragments = BedRecordList.fromList(Seq(BedRecord(contig, start, end)))
    .scatter(binSize)
    .allRecords.par.flatMap({ bedRecord =>
      // One reader per parallel bin; must be closed or each bin leaks a
      // file handle (the pre-merge code closed it, the merge dropped that).
      val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam)
      try {
        val samIterator = inputSam.query(bedRecord.chr, bedRecord.start, bedRecord.end, true)
        // Histogram of observed insert sizes within this bin.
        val counts: mutable.Map[Int, Int] = mutable.Map()

        for (i <- 0 until samplingSize if samIterator.hasNext) {
          val rec = samIterator.next()
          val isPaired = rec.getReadPairedFlag
          val minQ10 = rec.getMappingQuality >= 10
          val pairOnSameContig = rec.getContig == rec.getMateReferenceName

          // Only well-mapped pairs on the same contig give a meaningful
          // inferred insert size.
          if (isPaired && minQ10 && pairOnSameContig) {
            val insertSize = rec.getInferredInsertSize.abs
            counts(insertSize) = counts.getOrElse(insertSize, 0) + 1
          }
        }
        samIterator.close()

        // Proper weighted mean over the histogram. The previous fold,
        // (old + size * n) / (n + 1), divided by the wrong denominator at
        // every step and did not compute an average.
        val observations = counts.values.sum
        if (observations == 0) None
        else Some((counts.map { case (size, n) => size.toLong * n }.sum / observations).toInt)
      } finally inputSam.close()
    })

  // Mean of the per-bin estimates. A running (old + x) / 2 reduction would
  // weight later bins exponentially more than earlier ones.
  if (insertSizesOnAllFragments.isEmpty) None
  else Some(insertSizesOnAllFragments.sum / insertSizesOnAllFragments.size)
}
/** /**
* Estimate the insertsize for one single bamfile and return the insertsize * Estimate the insertsize for one single bamfile and return the insertsize
* *
* @param bamFile bamfile to estimate avg insertsize from * @param bamFile bamfile to estimate average insertsize from
* @return * @return
*/ */
def sampleBamInsertSize(bamFile: File, samplingSize: Int = 100000): Int = { def sampleBamInsertSize(bamFile: File, samplingSize: Int = 10000, binSize: Int = 1000000): Int = {
val inputSam: SamReader = SamReaderFactory.makeDefault.open(bamFile) val inputSam: SamReader = SamReaderFactory.makeDefault.open(bamFile)
val baminsertsizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({ val bamInsertSizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({
contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize) contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize, binSize)
}).toList }).toList
val counts = baminsertsizes.flatMap(x => x) val counts = bamInsertSizes.flatMap(x => x)
// avoid division by zero
if (counts.size != 0) { if (counts.size != 0) {
counts.sum / counts.size counts.sum / counts.size
} else { } else {
...@@ -103,8 +142,8 @@ object BamUtils { ...@@ -103,8 +142,8 @@ object BamUtils {
* @param bamFiles input bam files * @param bamFiles input bam files
* @return * @return
*/ */
/**
 * Estimate the average insert size for each of the given bam files,
 * processing the files in parallel.
 *
 * @param bamFiles input bam files
 * @param samplingSize maximum number of records inspected per bin
 * @param binSize width of each sampling bin along a contig
 * @return parallel map from each bam file to its estimated insert size
 */
def sampleBamsInsertSize(bamFiles: List[File], samplingSize: Int = 10000, binSize: Int = 1000000): immutable.ParMap[File, Int] = {
  // Pair every bam file with its per-file estimate, then collect into a map.
  val estimates = for (bamFile <- bamFiles.par)
    yield bamFile -> sampleBamInsertSize(bamFile, samplingSize, binSize)
  estimates.toMap
}
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment