diff --git a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala index f13230534af0a61fa29f68daa8d48c81a55e1f23..a37b3d54647a3f8175e1935d0c059cb8c34a5384 100644 --- a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala +++ b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala @@ -2,7 +2,7 @@ package nl.lumc.sasc.biopet.utils import java.io.File -import htsjdk.samtools.SamReaderFactory +import htsjdk.samtools.{SamReader, SamReaderFactory} import scala.collection.JavaConversions._ @@ -30,4 +30,33 @@ object BamUtils { if (temp.map(_._1).distinct.size != temp.size) throw new IllegalArgumentException("Samples has been found twice") temp.toMap } + + /** + * Estimate the insertsize for each bam file and return Map[<sampleName>, <insertSize>] + * + * @param bamFiles input bam files + * @return + */ + def sampleBamInsertSize(bamFiles: List[File]): Map[File, Float] = bamFiles.map { file => + + val inputSam: SamReader = SamReaderFactory.makeDefault.open(file) + + val baminsertsizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.map { + contig => + val insertsizes: Iterator[Int] = for { + read <- inputSam.query( contig.getSequenceName, 1, contig.getSequenceLength, true) //.toStream.slice(0, 100).toList + insertsize = read.getInferredInsertSize + paired = read.getReadPairedFlag + bothMapped = (read.getReadUnmappedFlag == false) && (read.getMateUnmappedFlag == false) + if paired && bothMapped + } yield { + insertsize + } + val contigInsertSize = insertsizes.foldLeft((0.0,0))((t, r) => (t._1 + r, t._2 +1)) + contigInsertSize._1 / contigInsertSize._2 + }.foldLeft((0.0,0))((t, r) => (t._1 + r, t._2 +1)) + + file -> baminsertsizes._1 / baminsertsizes._2 + } + }