diff --git a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala
index 2540970887f7b920cd46e1ecf74bed46e39fb19a..df8068c0c7e28b57e3344d736aa7cc04abe2b8f2 100644
--- a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala
+++ b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala
@@ -18,8 +18,10 @@ package nl.lumc.sasc.biopet.utils
 import java.io.File
 
 import htsjdk.samtools.{ SamReader, SamReaderFactory }
+import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
 
 import scala.collection.JavaConversions._
+import scala.collection.mutable
 import scala.collection.parallel.immutable
 
 /**
@@ -57,39 +59,64 @@
    * @param end postion to stop scanning
    * @return Int with insertsize for this contig
    */
-  def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int, samplingSize: Int = 100000): Option[Int] = {
-    val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam)
-    val samIterator = inputSam.query(contig, start, end, true)
-    val insertsizes: List[Int] = (for {
-      read <- samIterator.toStream.takeWhile(rec => {
-        val paired = rec.getReadPairedFlag && rec.getProperPairFlag
-        val bothMapped = if (paired) ((rec.getReadUnmappedFlag == false) && (rec.getMateUnmappedFlag == false)) else false
-        paired && bothMapped
-      }).take(samplingSize)
-    } yield {
-      read.getInferredInsertSize.asInstanceOf[Int].abs
-    })(collection.breakOut)
-    val cti = insertsizes.foldLeft((0.0, 0))((t, r) => (t._1 + r, t._2 + 1))
-
-    samIterator.close()
-    inputSam.close()
-    val ret = if (cti._2 == 0) None else Some((cti._1 / cti._2).toInt)
-    ret
+  def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int,
+                       samplingSize: Int = 10000,
+                       binSize: Int = 1000000): Option[Int] = {
+
+    // create a bedList to divide the contig into multiple pieces
+    val insertSizesOnAllFragments = BedRecordList.fromList(Seq(BedRecord(contig, start, end)))
+      .scatter(binSize)
+      .allRecords.par.flatMap({
+        bedRecord =>
+          // for each scatter, open the bamfile for this specific region-query
+          val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam)
+          val samIterator = inputSam.query(bedRecord.chr, bedRecord.start, bedRecord.end, true)
+
+          // histogram of the absolute insert sizes observed in this bin
+          val counts: mutable.Map[Int, Int] = mutable.Map()
+
+          for (i <- 0 until samplingSize if samIterator.hasNext) {
+            val rec = samIterator.next()
+            val isPaired = rec.getReadPairedFlag
+            val minQ10 = rec.getMappingQuality >= 10
+            val pairOnSameContig = rec.getContig == rec.getMateReferenceName
+
+            if (isPaired && minQ10 && pairOnSameContig) {
+              val insertSize = rec.getInferredInsertSize.abs
+              counts(insertSize) = counts.getOrElse(insertSize, 0) + 1
+            }
+          }
+
+          // close the per-bin reader and iterator again so file handles are not leaked
+          samIterator.close()
+          inputSam.close()
+
+          // weighted mean for this bin: sum(insertSize * observations) / total observations
+          if (counts.isEmpty) None
+          else Some(counts.map { case (insertSize, observations) => insertSize * observations }.sum / counts.values.sum)
+      })
+
+    // plain mean over all bins that produced an estimate
+    insertSizesOnAllFragments.size match {
+      case 0 => None
+      case size => Some(insertSizesOnAllFragments.sum / size)
+    }
   }
 
   /**
    * Estimate the insertsize for one single bamfile and return the insertsize
    *
-   * @param bamFile bamfile to estimate avg insertsize from
+   * @param bamFile bamfile to estimate average insertsize from
    * @return
    */
-  def sampleBamInsertSize(bamFile: File, samplingSize: Int = 100000): Int = {
+  def sampleBamInsertSize(bamFile: File, samplingSize: Int = 10000, binSize: Int = 1000000): Int = {
     val inputSam: SamReader = SamReaderFactory.makeDefault.open(bamFile)
-    val baminsertsizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({
-      contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize)
+    val bamInsertSizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({
+      contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize, binSize)
     }).toList
-    val counts = baminsertsizes.flatMap(x => x)
+    val counts = bamInsertSizes.flatMap(x => x)
 
+    // avoid division by zero
     if (counts.size != 0) {
       counts.sum / counts.size
     } else {
@@ -103,8 +130,8 @@ object BamUtils {
    * @param bamFiles input bam files
    * @return
    */
-  def sampleBamsInsertSize(bamFiles: List[File], samplingSize: Int = 100000): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile =>
-    bamFile -> sampleBamInsertSize(bamFile, samplingSize)
+  def sampleBamsInsertSize(bamFiles: List[File], samplingSize: Int = 10000, binSize: Int = 1000000): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile =>
+    bamFile -> sampleBamInsertSize(bamFile, samplingSize, binSize)
   }.toMap
 
 }