Commit 2ecba16e authored by Wai Yi Leung

Merge branch 'fix-bamutils-insertsize' into fix-svcallers-vcf

parents 6b55bbde b1c4e031
@@ -276,8 +276,10 @@ object VcfFilter extends ToolCommand {
    * @return
    */
   def minGenomeQuality(record: VariantContext, minGQ: Int, minSamplesPass: Int = 1): Boolean = {
-    record.getGenotypes.count(x => if (!x.hasGQ) false
-      else if (x.getGQ >= minGQ) true else false) >= minSamplesPass
+    record.getGenotypes.count(x =>
+      if (minGQ == 0) true
+      else if (!x.hasGQ) false
+      else if (x.getGQ >= minGQ) true else false) >= minSamplesPass
   }
 
   /**
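For reference, the behavioural change in minGenomeQuality is that a minGQ of 0 now passes every genotype, including genotypes without a GQ annotation, which previously always failed the filter. A minimal sketch of the new predicate, using a hypothetical stand-in for htsjdk's Genotype rather than the real class:

// Hypothetical stand-in for htsjdk's Genotype; only the two members the filter uses.
case class MockGenotype(gq: Option[Int]) {
  def hasGQ: Boolean = gq.isDefined
  def getGQ: Int = gq.getOrElse(-1)
}

def minGenomeQuality(genotypes: Seq[MockGenotype], minGQ: Int, minSamplesPass: Int = 1): Boolean =
  genotypes.count(g =>
    if (minGQ == 0) true      // no threshold requested: every sample counts as passing
    else if (!g.hasGQ) false  // threshold requested but GQ missing: sample fails
    else g.getGQ >= minGQ) >= minSamplesPass

minGenomeQuality(Seq(MockGenotype(None)), minGQ = 0)       // true (failed before this change)
minGenomeQuality(Seq(MockGenotype(None)), minGQ = 20)      // false
minGenomeQuality(Seq(MockGenotype(Some(30))), minGQ = 20)  // true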
@@ -18,6 +18,7 @@ package nl.lumc.sasc.biopet.utils
 import java.io.File
 import htsjdk.samtools.{ SamReader, SamReaderFactory }
+import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
 import scala.collection.JavaConversions._
 import scala.collection.parallel.immutable
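The new import pulls in Biopet's interval utilities: contigInsertSize below splits the queried region into bins of binSize via BedRecordList.scatter before sampling each bin in parallel. A rough sketch of that splitting step, using a plain case class instead of the real BedRecord (the actual scatter implementation may handle edge cases differently):

// Simplified stand-in for BedRecord (the real class lives in nl.lumc.sasc.biopet.utils.intervals).
case class Region(chr: String, start: Int, end: Int)

// Approximation of scattering one region into bins of at most binSize bases.
def scatter(region: Region, binSize: Int): List[Region] =
  (region.start until region.end by binSize).toList
    .map(s => Region(region.chr, s, math.min(s + binSize, region.end)))

scatter(Region("chr1", 1, 2500000), binSize = 1000000)
// List(Region(chr1,1,1000001), Region(chr1,1000001,2000001), Region(chr1,2000001,2500000))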
@@ -57,39 +58,56 @@ object BamUtils {
    * @param end position to stop scanning
    * @return Int with insertsize for this contig
    */
-  def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int, samplingSize: Int = 100000): Option[Int] = {
-    val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam)
-    val samIterator = inputSam.query(contig, start, end, true)
-    val insertsizes: List[Int] = (for {
-      read <- samIterator.toStream.takeWhile(rec => {
-        val paired = rec.getReadPairedFlag && rec.getProperPairFlag
-        val bothMapped = if (paired) ((rec.getReadUnmappedFlag == false) && (rec.getMateUnmappedFlag == false)) else false
-        paired && bothMapped
-      }).take(samplingSize)
-    } yield {
-      read.getInferredInsertSize.asInstanceOf[Int].abs
-    })(collection.breakOut)
-    val cti = insertsizes.foldLeft((0.0, 0))((t, r) => (t._1 + r, t._2 + 1))
-    samIterator.close()
-    inputSam.close()
-    val ret = if (cti._2 == 0) None else Some((cti._1 / cti._2).toInt)
-    ret
+  def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int,
+                       samplingSize: Int = 10000,
+                       binSize: Int = 1000000): Option[Int] = {
+    // create a BedRecordList to divide the contig into multiple pieces
+    val insertSizesOnAllFragments = BedRecordList.fromList(Seq(BedRecord(contig, start, end)))
+      .scatter(binSize)
+      .allRecords.par.map({
+        bedRecord =>
+          // for each scatter, open the bamfile for this specific region-query
+          val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam)
+          val samIterator = inputSam.query(bedRecord.chr, bedRecord.start, bedRecord.end, true)
+          val insertsizes: List[Int] = (for {
+            read <- samIterator.toStream.takeWhile(rec => {
+              // TODO: this threshold is hard-coded; it is unclear whether this is the best practice for selecting reads with a minimum required mapping quality.
+              val minQ10 = rec.getMappingQuality >= 10
+              // the properPairFlag excludes read pairs that span multiple contigs
+              val paired = rec.getReadPairedFlag && rec.getProperPairFlag
+              val bothMapped = if (paired) ((rec.getReadUnmappedFlag == false) && (rec.getMateUnmappedFlag == false)) else false
+              paired && bothMapped && minQ10
+            }).take(samplingSize)
+          } yield {
+            read.getInferredInsertSize.asInstanceOf[Int].abs
+          })(collection.breakOut)
+          val lociInsertSize = insertsizes.foldLeft((0.0, 0))((t, r) => (t._1 + r, t._2 + 1))
+          samIterator.close()
+          inputSam.close()
+          if (lociInsertSize._2 == 0) None else Some((lociInsertSize._1 / lociInsertSize._2).toInt)
+      }).toList.flatten
+
+    val contigInsertSize = insertSizesOnAllFragments.foldLeft((0.0, 0))((t, r) => (t._1 + r, t._2 + 1))
+    if (contigInsertSize._2 == 0) None else Some((contigInsertSize._1 / contigInsertSize._2).toInt)
   }
   /**
    * Estimate the insertsize for one single bamfile and return the insertsize
    *
-   * @param bamFile bamfile to estimate avg insertsize from
+   * @param bamFile bamfile to estimate average insertsize from
    * @return
    */
-  def sampleBamInsertSize(bamFile: File, samplingSize: Int = 100000): Int = {
+  def sampleBamInsertSize(bamFile: File, samplingSize: Int = 10000, binSize: Int = 1000000): Int = {
     val inputSam: SamReader = SamReaderFactory.makeDefault.open(bamFile)
-    val baminsertsizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({
-      contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize)
+    val bamInsertSizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({
+      contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize, binSize)
     }).toList
-    val counts = baminsertsizes.flatMap(x => x)
+    val counts = bamInsertSizes.flatMap(x => x)
     // avoid division by zero
     if (counts.size != 0) {
       counts.sum / counts.size
     } else {
@@ -103,8 +121,8 @@ object BamUtils {
    * @param bamFiles input bam files
    * @return
    */
-  def sampleBamsInsertSize(bamFiles: List[File], samplingSize: Int = 100000): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile =>
-    bamFile -> sampleBamInsertSize(bamFile, samplingSize)
+  def sampleBamsInsertSize(bamFiles: List[File], samplingSize: Int = 10000, binSize: Int = 1000000): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile =>
+    bamFile -> sampleBamInsertSize(bamFile, samplingSize, binSize)
   }.toMap
 }
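A usage sketch of the updated helpers, assuming coordinate-sorted, indexed BAM files; the file paths are placeholders:

import java.io.File
import nl.lumc.sasc.biopet.utils.BamUtils

// Placeholder inputs; any indexed BAM files will do.
val bams = List(new File("sample1.bam"), new File("sample2.bam"))

// Per-file average insert size, sampling up to 10000 proper pairs per 1 Mb bin per contig.
val insertSizes = BamUtils.sampleBamsInsertSize(bams, samplingSize = 10000, binSize = 1000000)
insertSizes.foreach { case (bam, size) => println(s"${bam.getName}: $size") }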