From 5827c070675f37834625a16acb5210abaab9ef86 Mon Sep 17 00:00:00 2001 From: Wai Yi Leung <w.y.leung@lumc.nl> Date: Mon, 25 Jan 2016 13:26:25 +0100 Subject: [PATCH] Adding sampleBamInsertSize (with some error, too be fixed later) --- .../nl/lumc/sasc/biopet/utils/BamUtils.scala | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala index f13230534..a37b3d546 100644 --- a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala +++ b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala @@ -2,7 +2,7 @@ package nl.lumc.sasc.biopet.utils import java.io.File -import htsjdk.samtools.SamReaderFactory +import htsjdk.samtools.{SamReader, SamReaderFactory} import scala.collection.JavaConversions._ @@ -30,4 +30,33 @@ object BamUtils { if (temp.map(_._1).distinct.size != temp.size) throw new IllegalArgumentException("Samples has been found twice") temp.toMap } + + /** + * Estimate the insertsize for each bam file and return Map[<sampleName>, <insertSize>] + * + * @param bamFiles input bam files + * @return + */ + def sampleBamInsertSize(bamFiles: List[File]): Map[File, Float] = bamFiles.map { file => + + val inputSam: SamReader = SamReaderFactory.makeDefault.open(file) + + val baminsertsizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.map { + contig => + val insertsizes: Iterator[Int] = for { + read <- inputSam.query( contig.getSequenceName, 1, contig.getSequenceLength, true) //.toStream.slice(0, 100).toList + insertsize = read.getInferredInsertSize + paired = read.getReadPairedFlag + bothMapped = (read.getReadUnmappedFlag == false) && (read.getMateUnmappedFlag == false) + if paired && bothMapped + } yield { + insertsize + } + val contigInsertSize = insertsizes.foldLeft((0.0,0))((t, r) => (t._1 + r, t._2 +1)) + contigInsertSize._1 / contigInsertSize._2 + }.foldLeft((0.0,0))((t, r) => (t._1 + r, t._2 +1)) + + file -> baminsertsizes._1 / baminsertsizes._2 + } + } -- GitLab