Skip to content
Snippets Groups Projects
Commit 320f4a82 authored by bow's avatar bow
Browse files

Comments update and code style refactor

parent c7b2052e
No related branches found
No related tags found
No related merge requests found
......@@ -4,7 +4,7 @@
*/
package nl.lumc.sasc.biopet.tools
import java.io.{ File, IOException }
import java.io.File
import scala.collection.JavaConverters._
......@@ -38,10 +38,10 @@ class WipeReads(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Input(doc = "Input BAM file (must be indexed)", shortName = "I", required = true)
var inputBAM: File = _
var inputBam: File = _
@Output(doc = "Output BAM", shortName = "o", required = true)
var outputBAM: File = _
var outputBam: File = _
}
......@@ -61,8 +61,10 @@ object WipeReads extends ToolCommand {
else
feat1.getStart <= feat2.getStart && feat1.getEnd >= feat2.getStart
/** Function to merge two overlapping intervals
* NOTE: we assume subtypes of Feature have a constructor with a (chr, start, end) signature */
/**
* Function to merge two overlapping intervals
* NOTE: we assume subtypes of Feature have a constructor with a (chr, start, end) signature
*/
private def merge[T <: Feature](feat1: T, feat2: T): T = {
if (feat1.getChr != feat2.getChr)
throw new IllegalArgumentException("Can not merge features from different chromosomes")
......@@ -72,7 +74,7 @@ object WipeReads extends ToolCommand {
else if (overlaps(feat1, feat2))
new BasicFeature(feat1.getChr, feat1.getStart, feat2.getEnd).asInstanceOf[T]
else
throw new IllegalArgumentException("Can not merge non-overlapping RawInterval objects")
throw new IllegalArgumentException("Can not merge non-overlapping Feature objects")
}
/** Function to create an iterator yielding non-overlapped Feature objects */
......@@ -80,11 +82,13 @@ object WipeReads extends ToolCommand {
ri.toList
.sortBy(x => (x.getChr, x.getStart, x.getEnd))
.foldLeft(List.empty[T]) {
(acc, i) => acc match {
case head :: tail if overlaps(head, i) => merge(head, i) :: tail
case _ => i :: acc
(acc, i) =>
acc match {
case head :: tail if overlaps(head, i) => merge(head, i) :: tail
case _ => i :: acc
}}
}
}
.toIterator
/**
......@@ -97,70 +101,60 @@ object WipeReads extends ToolCommand {
logger.info("Parsing interval file ...")
/** Function to create iterator from BED file */
def makeFeatureFromBED(inFile: File): Iterator[Feature] =
def makeFeatureFromBed(inFile: File): Iterator[Feature] =
asScalaIteratorConverter(getFeatureReader(inFile.toPath.toString, new BEDCodec(), false).iterator).asScala
// TODO: implementation
/** Function to create iterator from refFlat file */
def makeFeatureFromRefFlat(inFile: File): Iterator[Feature] = ???
// convert coordinate to 1-based fully closed
// parse chrom, start blocks, end blocks, strands
// convert coordinate to 1-based fully closed
// parse chrom, start blocks, end blocks, strands
// TODO: implementation
/** Function to create iterator from GTF file */
def makeFeatureFromGTF(inFile: File): Iterator[Feature] = ???
// convert coordinate to 1-based fully closed
// parse chrom, start blocks, end blocks, strands
def makeFeatureFromGtf(inFile: File): Iterator[Feature] = ???
// convert coordinate to 1-based fully closed
// parse chrom, start blocks, end blocks, strands
// detect interval file format from extension
val iterFunc: (File => Iterator[Feature]) =
if (getExtension(inFile.toString.toLowerCase) == "bed")
makeFeatureFromBED
makeFeatureFromBed
else
throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath)
mergeOverlappingIntervals(iterFunc(inFile))
}
/**
 * Checks whether the requested Bloom filter parameters are usable.
 *
 * The backing bit array is addressed with an Int, so the optimal array
 * size computed for (bloomSize, bloomFp) must not reach Int.MaxValue;
 * a value at or beyond that limit is assumed to be a truncated result
 * and the parameter combination is rejected.
 */
def bloomParamsOk(bloomSize: Long, bloomFp: Double): Boolean = {
  val requiredBits = FilterBuilder.optimalM(bloomSize, bloomFp)
  requiredBits <= Int.MaxValue
}
// TODO: set minimum fraction for overlap
/**
* Function to create function to check SAMRecord for exclusion in filtered BAM file.
*
* The returned function evaluates to false for every SAMRecord that is retained (filtered in).
*
* @param iv iterator yielding RawInterval objects
* @param inBAM input BAM file
* @param inBAMIndex input BAM file index
* @param iv iterator yielding Feature objects
* @param inBam input BAM file
* @param inBamIndex input BAM file index
* @param filterOutMulti whether to filter out reads with same name outside target region (default: true)
* @param minMapQ minimum MapQ of reads in target region to filter out (default: 0)
* @param readGroupIDs read group IDs of reads in target region to filter out (default: all IDs)
* @param readGroupIds read group IDs of reads in target region to filter out (default: all IDs)
* @param bloomSize expected size of elements to contain in the Bloom filter
* @param bloomFp expected Bloom filter false positive rate
* @return function that checks whether a SAMRecord or String is to be excluded
*/
def makeFilterOutFunction(iv: Iterator[Feature],
inBAM: File, inBAMIndex: File = null,
inBam: File, inBamIndex: File = null,
filterOutMulti: Boolean = true,
minMapQ: Int = 0, readGroupIDs: Set[String] = Set(),
minMapQ: Int = 0, readGroupIds: Set[String] = Set(),
bloomSize: Long, bloomFp: Double): (SAMRecord => Boolean) = {
logger.info("Building set of reads to exclude ...")
// TODO: implement optional index creation
/** Function to check for BAM file index and return a SAMFileReader given a File */
def prepIndexedInputBAM(): SAMFileReader =
if (inBAMIndex != null)
new SAMFileReader(inBAM, inBAMIndex)
def prepIndexedInputBam(): SAMFileReader =
if (inBamIndex != null)
new SAMFileReader(inBam, inBamIndex)
else {
val sfr = new SAMFileReader(inBAM)
val sfr = new SAMFileReader(inBam)
sfr.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT)
if (!sfr.hasIndex)
throw new IllegalStateException("Input BAM file must be indexed")
......@@ -174,27 +168,28 @@ object WipeReads extends ToolCommand {
* The function still works when only either one of the interval or BAM
* file contig is prepended with "chr"
*
* @param inBAM BAM file to query as SAMFileReader
* @param inBam BAM file to query as SAMFileReader
* @param feat feature object containing query
* @return QueryInterval wrapped in Option
*/
def monadicMakeQueryInterval(inBAM: SAMFileReader, feat: Feature): Option[QueryInterval] =
if (inBAM.getFileHeader.getSequenceIndex(feat.getChr) > -1)
Some(inBAM.makeQueryInterval(feat.getChr, feat.getStart, feat.getEnd))
def monadicMakeQueryInterval(inBam: SAMFileReader, feat: Feature): Option[QueryInterval] =
if (inBam.getFileHeader.getSequenceIndex(feat.getChr) > -1)
Some(inBam.makeQueryInterval(feat.getChr, feat.getStart, feat.getEnd))
else if (feat.getChr.startsWith("chr")
&& inBAM.getFileHeader.getSequenceIndex(feat.getChr.substring(3)) > -1)
Some(inBAM.makeQueryInterval(feat.getChr.substring(3), feat.getStart, feat.getEnd))
&& inBam.getFileHeader.getSequenceIndex(feat.getChr.substring(3)) > -1)
Some(inBam.makeQueryInterval(feat.getChr.substring(3), feat.getStart, feat.getEnd))
else if (!feat.getChr.startsWith("chr")
&& inBAM.getFileHeader.getSequenceIndex("chr" + feat.getChr) > -1)
Some(inBAM.makeQueryInterval("chr" + feat.getChr, feat.getStart, feat.getEnd))
&& inBam.getFileHeader.getSequenceIndex("chr" + feat.getChr) > -1)
Some(inBam.makeQueryInterval("chr" + feat.getChr, feat.getStart, feat.getEnd))
else
None
/** function to make IntervalTree from our RawInterval objects
*
* @param ifeat iterable over feature objects
* @return IntervalTree objects, filled with intervals from RawInterval
*/
/**
* Function to make an IntervalTree from our Feature objects
*
* @param ifeat iterable over feature objects
* @return IntervalTree objects, filled with intervals from Feature
*/
def makeIntervalTree[T <: Feature](ifeat: Iterable[T]): IntervalTree = {
val ivt = new IntervalTree
for (iv <- ifeat)
......@@ -228,39 +223,43 @@ object WipeReads extends ToolCommand {
/** filter function for read IDs */
val rgFilter =
if (readGroupIDs.size == 0)
if (readGroupIds.size == 0)
(r: SAMRecord) => true
else
(r: SAMRecord) => readGroupIDs.contains(r.getReadGroup.getReadGroupId)
(r: SAMRecord) => readGroupIds.contains(r.getReadGroup.getReadGroupId)
/** function to get set element */
val SAMRecordElement =
val SamRecordElement =
if (filterOutMulti)
(r: SAMRecord) => r.getReadName
else
(r: SAMRecord) => r.getReadName + "_" + r.getAlignmentStart.toString
val SAMRecordMateElement =
(r: SAMRecord) => r.getReadName + "_" + r.getMateAlignmentStart.toString
val SamRecordMateElement =
(r: SAMRecord) => r.getReadName + "_" + r.getMateAlignmentStart.toString
val firstBAM = prepIndexedInputBAM()
val readyBam = prepIndexedInputBam()
/* NOTE: the interval vector here should be bypass-able if we can make
the BAM query intervals with Interval objects. This is not possible
at the moment since we cannot retrieve start and end coordinates
of an Interval, so we resort to our own RawInterval vector
at the moment since we can not retrieve start and end coordinates
of an Interval, so we resort to our own Feature vector
*/
lazy val intervals = iv.toVector
lazy val intervalTreeMap: Map[String, IntervalTree] = intervals
.groupBy(x => x.getChr)
.map({ case (key, value) => (key, makeIntervalTree(value)) })
lazy val queryIntervals = intervals
.flatMap(x => monadicMakeQueryInterval(firstBAM, x))
.flatMap(x => monadicMakeQueryInterval(readyBam, x))
// makeQueryInterval only accepts a sorted QueryInterval list
.sortBy(x => (x.referenceIndex, x.start, x.end))
.toArray
val filteredRecords: Iterator[SAMRecord] = firstBAM.queryOverlapping(queryIntervals).asScala
val filteredRecords: Iterator[SAMRecord] = readyBam
// query BAM file with intervals
.queryOverlapping(queryIntervals)
// for compatibility
.asScala
// ensure spliced reads have at least one block overlapping target region
.filter(x => alignmentBlockOverlaps(x, intervalTreeMap))
// filter for MAPQ on target region reads
......@@ -275,11 +274,10 @@ object WipeReads extends ToolCommand {
for (rec <- filteredRecords) {
logger.debug("Adding read " + rec.getReadName + " to set ...")
if ((!filterOutMulti) && rec.getReadPairedFlag) {
filteredOutSet.add(SAMRecordElement(rec))
filteredOutSet.add(SAMRecordMateElement(rec))
}
else
filteredOutSet.add(SAMRecordElement(rec))
filteredOutSet.add(SamRecordElement(rec))
filteredOutSet.add(SamRecordMateElement(rec))
} else
filteredOutSet.add(SamRecordElement(rec))
}
if (filterOutMulti)
......@@ -287,40 +285,39 @@ object WipeReads extends ToolCommand {
else
(rec: SAMRecord) => {
if (rec.getReadPairedFlag)
filteredOutSet.contains(SAMRecordElement(rec)) &&
filteredOutSet.contains(SAMRecordMateElement(rec))
filteredOutSet.contains(SamRecordElement(rec)) &&
filteredOutSet.contains(SamRecordMateElement(rec))
else
filteredOutSet.contains(SAMRecordElement(rec))
filteredOutSet.contains(SamRecordElement(rec))
}
}
// TODO: implement stats file output
/**
* Function to filter input BAM and write its output to the filesystem
*
* @param filterFunc filter function that evaluates true for excluded SAMRecord
* @param inBAM input BAM file
* @param outBAM output BAM file
* @param inBam input BAM file
* @param outBam output BAM file
* @param writeIndex whether to write index for output BAM file
* @param async whether to write asynchronously
* @param filteredOutBAM whether to write excluded SAMRecords to their own BAM file
* @param filteredOutBam whether to write excluded SAMRecords to their own BAM file
*/
def writeFilteredBAM(filterFunc: (SAMRecord => Boolean), inBAM: File, outBAM: File,
def writeFilteredBam(filterFunc: (SAMRecord => Boolean), inBam: File, outBam: File,
writeIndex: Boolean = true, async: Boolean = true,
filteredOutBAM: File = null) = {
filteredOutBam: File = null) = {
val factory = new SAMFileWriterFactory()
.setCreateIndex(writeIndex)
.setUseAsyncIo(async)
val templateBAM = new SAMFileReader(inBAM)
templateBAM.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT)
val templateBam = new SAMFileReader(inBam)
templateBam.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT)
val targetBAM = factory.makeBAMWriter(templateBAM.getFileHeader, true, outBAM)
val targetBam = factory.makeBAMWriter(templateBam.getFileHeader, true, outBam)
val filteredBAM =
if (filteredOutBAM != null)
factory.makeBAMWriter(templateBAM.getFileHeader, true, filteredOutBAM)
val filteredBam =
if (filteredOutBam != null)
factory.makeBAMWriter(templateBam.getFileHeader, true, filteredOutBam)
else
null
......@@ -328,35 +325,41 @@ object WipeReads extends ToolCommand {
try {
var (incl, excl) = (0, 0)
for (rec <- templateBAM.asScala) {
for (rec <- templateBam.asScala) {
if (!filterFunc(rec)) {
targetBAM.addAlignment(rec)
targetBam.addAlignment(rec)
incl += 1
}
else {
} else {
excl += 1
if (filteredBAM != null) filteredBAM.addAlignment(rec)
if (filteredBam != null) filteredBam.addAlignment(rec)
}
}
println(List("input_bam", "output_bam", "count_included", "count_excluded").mkString("\t"))
println(List(inBAM.getName, outBAM.getName, incl, excl).mkString("\t"))
println(List(inBam.getName, outBam.getName, incl, excl).mkString("\t"))
} finally {
templateBAM.close()
targetBAM.close()
if (filteredBAM != null) filteredBAM.close()
templateBam.close()
targetBam.close()
if (filteredBam != null) filteredBam.close()
}
}
/**
 * Parsed command-line arguments for the WipeReads tool.
 *
 * Defaults mirror the option defaults declared in OptParser below.
 *
 * @param inputBAM input BAM file to filter (must be indexed)
 * @param targetRegions interval (BED) file containing the target regions
 * @param outputBAM output BAM file with target-region reads removed
 * @param filteredOutBAM optional BAM file receiving the discarded reads
 * @param readGroupIDs read group IDs to remove (empty set: all read groups)
 * @param minMapQ minimum MAPQ of reads in the target region to remove
 * @param limitToRegion whether removal is limited to reads inside the target regions
 * @param noMakeIndex whether to skip indexing the output BAM file
 * @param bloomSize expected maximum number of reads in the target regions
 * @param bloomFp expected Bloom filter false positive rate
 */
case class Args (inputBAM: File = null,
targetRegions: File = null,
outputBAM: File = null,
filteredOutBAM: File = null,
readGroupIDs: Set[String] = Set.empty[String],
minMapQ: Int = 0,
limitToRegion: Boolean = false,
noMakeIndex: Boolean = false,
bloomSize: Long = 70000000,
bloomFp: Double = 4e-7) extends AbstractArgs
/**
 * Function to check whether the Bloom filter can fulfill its size and
 * false positive guarantees.
 *
 * We are currently limited to maximum integer size: if the optimal
 * array size equals or exceeds Int.MaxValue, we assume the value is a
 * result of a truncation and return false.
 */
def bloomParamsOk(bloomSize: Long, bloomFp: Double): Boolean = {
  val optimalArraySize = FilterBuilder.optimalM(bloomSize, bloomFp)
  optimalArraySize <= Int.MaxValue
}
/**
 * Parsed command-line arguments for the WipeReads tool.
 *
 * Defaults mirror the option defaults declared in OptParser below.
 *
 * @param inputBam input BAM file to filter (must be indexed)
 * @param targetRegions interval (BED) file containing the target regions
 * @param outputBam output BAM file with target-region reads removed
 * @param filteredOutBam optional BAM file receiving the discarded reads
 * @param readGroupIds read group IDs to remove (empty set: all read groups)
 * @param minMapQ minimum MAPQ of reads in the target region to remove
 * @param limitToRegion whether removal is limited to reads inside the target regions
 * @param noMakeIndex whether to skip indexing the output BAM file
 * @param bloomSize expected maximum number of reads in the target regions
 * @param bloomFp expected Bloom filter false positive rate
 */
case class Args(inputBam: File = null,
targetRegions: File = null,
outputBam: File = null,
filteredOutBam: File = null,
readGroupIds: Set[String] = Set.empty[String],
minMapQ: Int = 0,
limitToRegion: Boolean = false,
noMakeIndex: Boolean = false,
bloomSize: Long = 70000000,
bloomFp: Double = 4e-7) extends AbstractArgs
class OptParser extends AbstractOptParser {
......@@ -365,43 +368,53 @@ object WipeReads extends ToolCommand {
|$commandName - Region-based reads removal from an indexed BAM file
""".stripMargin)
opt[File]('I', "input_file") required() valueName "<bam>" action { (x, c) =>
c.copy(inputBAM = x) } validate {
x => if (x.exists) success else failure("Input BAM file not found")
} text "Input BAM file"
opt[File]('r', "interval_file") required() valueName "<bed>" action { (x, c) =>
c.copy(targetRegions = x) } validate {
x => if (x.exists) success else failure("Target regions file not found")
} text "Interval BED file"
opt[File]('o', "output_file") required() valueName "<bam>" action { (x, c) =>
c.copy(outputBAM = x) } text "Output BAM file"
opt[File]('f', "discarded_file") optional() valueName "<bam>" action { (x, c) =>
c.copy(filteredOutBAM = x) } text "Discarded reads BAM file (default: none)"
opt[Int]('Q', "min_mapq") optional() action { (x, c) =>
c.copy(minMapQ = x) } text "Minimum MAPQ of reads in target region to remove (default: 0)"
opt[String]('G', "read_group") unbounded() optional() valueName "<rgid>" action { (x, c) =>
c.copy(readGroupIDs = c.readGroupIDs + x) } text "Read group IDs to be removed (default: remove reads from all read groups)"
opt[Unit]("limit_removal") optional() action { (_, c) =>
c.copy(limitToRegion = true) } text
opt[File]('I', "input_file") required () valueName "<bam>" action { (x, c) =>
c.copy(inputBam = x)
} validate {
x => if (x.exists) success else failure("Input BAM file not found")
} text "Input BAM file"
opt[File]('r', "interval_file") required () valueName "<bed>" action { (x, c) =>
c.copy(targetRegions = x)
} validate {
x => if (x.exists) success else failure("Target regions file not found")
} text "Interval BED file"
opt[File]('o', "output_file") required () valueName "<bam>" action { (x, c) =>
c.copy(outputBam = x)
} text "Output BAM file"
opt[File]('f', "discarded_file") optional () valueName "<bam>" action { (x, c) =>
c.copy(filteredOutBam = x)
} text "Discarded reads BAM file (default: none)"
opt[Int]('Q', "min_mapq") optional () action { (x, c) =>
c.copy(minMapQ = x)
} text "Minimum MAPQ of reads in target region to remove (default: 0)"
opt[String]('G', "read_group") unbounded () optional () valueName "<rgid>" action { (x, c) =>
c.copy(readGroupIds = c.readGroupIds + x)
} text "Read group IDs to be removed (default: remove reads from all read groups)"
opt[Unit]("limit_removal") optional () action { (_, c) =>
c.copy(limitToRegion = true)
} text
"Whether to remove multiple-mapped reads outside the target regions (default: yes)"
opt[Unit]("no_make_index") optional() action { (_, c) =>
c.copy(noMakeIndex = true) } text
opt[Unit]("no_make_index") optional () action { (_, c) =>
c.copy(noMakeIndex = true)
} text
"Whether to index output BAM file or not (default: yes)"
note("\nAdvanced options")
opt[Long]("bloom_size") optional() action { (x, c) =>
c.copy(bloomSize = x) } text "expected maximum number of reads in target regions (default: 7e7)"
opt[Long]("bloom_size") optional () action { (x, c) =>
c.copy(bloomSize = x)
} text "expected maximum number of reads in target regions (default: 7e7)"
opt[Double]("false_positive") optional() action { (x, c) =>
c.copy(bloomFp = x) } text "false positive rate (default: 4e-7)"
opt[Double]("false_positive") optional () action { (x, c) =>
c.copy(bloomFp = x)
} text "false positive rate (default: 4e-7)"
note(
"""
......@@ -410,7 +423,7 @@ object WipeReads extends ToolCommand {
|the given ones, they will also be removed.
""".stripMargin)
checkConfig { c=>
checkConfig { c =>
if (!bloomParamsOk(c.bloomSize, c.bloomFp))
failure("Bloom parameters combination exceed Int limitation")
else
......@@ -426,20 +439,20 @@ object WipeReads extends ToolCommand {
val filterFunc = makeFilterOutFunction(
iv = makeFeatureFromFile(commandArgs.targetRegions),
inBAM = commandArgs.inputBAM,
inBam = commandArgs.inputBam,
filterOutMulti = !commandArgs.limitToRegion,
minMapQ = commandArgs.minMapQ,
readGroupIDs = commandArgs.readGroupIDs,
readGroupIds = commandArgs.readGroupIds,
bloomSize = commandArgs.bloomSize,
bloomFp = commandArgs.bloomFp
)
writeFilteredBAM(
writeFilteredBam(
filterFunc,
commandArgs.inputBAM,
commandArgs.outputBAM,
commandArgs.inputBam,
commandArgs.outputBam,
writeIndex = !commandArgs.noMakeIndex,
filteredOutBAM = commandArgs.filteredOutBAM
filteredOutBam = commandArgs.filteredOutBam
)
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment