From eea29773b47709114d9d2404fb79b8089618de1c Mon Sep 17 00:00:00 2001 From: Wai Yi Leung <w.y.leung@lumc.nl> Date: Mon, 4 Apr 2016 11:35:13 +0200 Subject: [PATCH] BamUtils Insertsize estimation is more acurate by sampling from all over the contig. --- .../nl/lumc/sasc/biopet/utils/BamUtils.scala | 57 ++++++++++++------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala index 254097088..26be068c0 100644 --- a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala +++ b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala @@ -18,6 +18,7 @@ package nl.lumc.sasc.biopet.utils import java.io.File import htsjdk.samtools.{ SamReader, SamReaderFactory } +import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList } import scala.collection.JavaConversions._ import scala.collection.parallel.immutable @@ -57,24 +58,38 @@ object BamUtils { * @param end postion to stop scanning * @return Int with insertsize for this contig */ - def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int, samplingSize: Int = 100000): Option[Int] = { - val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam) - val samIterator = inputSam.query(contig, start, end, true) - val insertsizes: List[Int] = (for { - read <- samIterator.toStream.takeWhile(rec => { - val paired = rec.getReadPairedFlag && rec.getProperPairFlag - val bothMapped = if (paired) ((rec.getReadUnmappedFlag == false) && (rec.getMateUnmappedFlag == false)) else false - paired && bothMapped - }).take(samplingSize) - } yield { - read.getInferredInsertSize.asInstanceOf[Int].abs - })(collection.breakOut) - val cti = insertsizes.foldLeft((0.0, 0))((t, r) => (t._1 + r, t._2 + 1)) + def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int, + samplingSize: Int = 10000, + binSize: Int = 1000000): Option[Int] = { - samIterator.close() - inputSam.close() - val ret = if (cti._2 == 0) None else Some((cti._1 / cti._2).toInt) - ret + // create a bedList to devide the contig into multiple pieces + val insertSizesOnAllFragments = BedRecordList.fromList(Seq(BedRecord(contig, start, end))) + .scatter(binSize) + .allRecords.par.map({ + bedRecord => + // for each scatter, open the bamfile for this specific region-query + val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam) + val samIterator = inputSam.query(bedRecord.chr, bedRecord.start, bedRecord.end, true) + + val insertsizes: List[Int] = (for { + read <- samIterator.toStream.takeWhile(rec => { + val minQ10 = rec.getMappingQuality >= 10 + val paired = rec.getReadPairedFlag && rec.getProperPairFlag + val bothMapped = if (paired) ((rec.getReadUnmappedFlag == false) && (rec.getMateUnmappedFlag == false)) else false + paired && bothMapped && minQ10 + }).take(samplingSize) + } yield { + read.getInferredInsertSize.asInstanceOf[Int].abs + })(collection.breakOut) + val cti = insertsizes.foldLeft((0.0, 0))((t, r) => (t._1 + r, t._2 + 1)) + + samIterator.close() + inputSam.close() + if (cti._2 == 0) None else Some((cti._1 / cti._2).toInt) + }).toList.flatten + + val contigInsertSize = insertSizesOnAllFragments.foldLeft((0.0, 0))((t, r) => (t._1 + r, t._2 + 1)) + if (contigInsertSize._2 == 0) None else Some((contigInsertSize._1 / contigInsertSize._2).toInt) } /** @@ -83,10 +98,10 @@ object BamUtils { * @param bamFile bamfile to estimate avg insertsize from * @return */ - def sampleBamInsertSize(bamFile: File, samplingSize: Int = 100000): Int = { + def sampleBamInsertSize(bamFile: File, samplingSize: Int = 10000, binSize: Int = 1000000): Int = { val inputSam: SamReader = SamReaderFactory.makeDefault.open(bamFile) val baminsertsizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({ - contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize) + contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize, binSize) }).toList val counts = baminsertsizes.flatMap(x => x) @@ -103,8 +118,8 @@ object BamUtils { * @param bamFiles input bam files * @return */ - def sampleBamsInsertSize(bamFiles: List[File], samplingSize: Int = 100000): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile => - bamFile -> sampleBamInsertSize(bamFile, samplingSize) + def sampleBamsInsertSize(bamFiles: List[File], samplingSize: Int = 10000, binSize: Int = 1000000): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile => + bamFile -> sampleBamInsertSize(bamFile, samplingSize, binSize) }.toMap } -- GitLab