Commit cbf4d93c authored by bow
Browse files

Add initial functions to parse interval from file

parent ec703d07
......@@ -5,7 +5,12 @@
package nl.lumc.sasc.biopet.core.apps
import java.io.{ File, IOException }
import scala.io.Source
import htsjdk.samtools.SAMFileReader
import htsjdk.samtools.SAMFileReader.QueryInterval
import htsjdk.samtools.SAMRecord
import org.apache.commons.io.FilenameUtils.getExtension
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
......@@ -28,13 +33,71 @@ class WipeReads(val root: Configurable) extends BiopetJavaCommandLineFunction {
object WipeReads {
// Alias for a string-keyed map holding values of any type.
// NOTE(review): presumably used for parsed command-line options; no usage is visible here -- confirm.
type OptionMap = Map[String, Any]
/**
 * A genomic interval as read from an interval file, before it is resolved
 * against a BAM file's sequence dictionary.
 *
 * The BED parser in this file fills `start`/`end` after converting the BED
 * 0-based, half-open coordinates (it adds 1 to the start and keeps the end),
 * and uses an empty string for `strand` when the record has no strand column.
 *
 * @param chrom  name of the sequence the interval lies on
 * @param start  start coordinate
 * @param end    end coordinate
 * @param strand strand string taken verbatim from the input record
 */
case class RawInterval(
  chrom: String,
  start: Int,
  end: Int,
  strand: String
)
/**
 * Enumeration of the strand choices: `Plus`, `Minus` and `Ignore`.
 *
 * NOTE(review): `Ignore` presumably means strand is not taken into account;
 * no usage is visible in this part of the file -- confirm.
 */
object Strand extends Enumeration {
  type Strand = Value
  // Declaration order fixes the ids: Plus = 0, Minus = 1, Ignore = 2.
  val Plus = Value
  val Minus = Value
  val Ignore = Value
}
def checkInputFile(inFile: File): File =
/**
 * Lazily parses a BED file into [[RawInterval]] values.
 *
 * Blank lines are skipped, and leading `track ` / `browser ` header lines are
 * dropped. Records with three or more tab-separated columns are accepted; the
 * strand is taken from the sixth column when present and is an empty string
 * otherwise.
 *
 * NOTE: the returned iterator reads the file on demand, so the underlying
 * `Source` is not closed by this method.
 *
 * @param inFile BED file to read
 * @return iterator over the intervals, with the start shifted by one so that
 *         the 0-based, half-open BED coordinates become 1-based, closed
 * @throws IllegalArgumentException if a record has fewer than three columns
 * @throws NumberFormatException if a start or end column is not an integer
 */
private def makeRawIntervalFromBED(inFile: File): Iterator[RawInterval] =
  // BED file coordinates are 0-based, half open so we need to do some conversion
  Source.fromFile(inFile)
    .getLines()
    .map(_.trim)
    .filterNot(_.isEmpty)
    // String.matches anchors the whole line, so a prefix check is used to
    // recognise header lines instead of a regular expression.
    .dropWhile(line => line.startsWith("track ") || line.startsWith("browser "))
    .map(line => line.split("\t") match {
      // The six-or-more column case must come first: it is the only one carrying a strand.
      case Array(chrom, start, end, _, _, strand, _*) => RawInterval(chrom, start.toInt + 1, end.toInt, strand)
      // Covers BED3 as well as BED4/BED5, which have no strand column.
      case Array(chrom, start, end, _*)               => RawInterval(chrom, start.toInt + 1, end.toInt, "")
      case _ =>
        throw new IllegalArgumentException("Invalid BED record in " + inFile.getPath + ": " + line)
    })
/**
 * Placeholder for the refFlat interval parser; not implemented yet.
 *
 * The body is `???`, so calling this method throws `scala.NotImplementedError`.
 *
 * @param inFile interval file to parse
 * @return iterator over the parsed intervals, once implemented
 */
private def makeRawIntervalFromRefFlat(inFile: File): Iterator[RawInterval] = ???
// Planned steps for the implementation:
// convert coordinate to 1-based fully closed
// parse chrom, start blocks, end blocks, strands
/**
 * Placeholder for the GTF interval parser; not implemented yet.
 *
 * The body is `???`, so calling this method throws `scala.NotImplementedError`.
 *
 * @param inFile interval file to parse
 * @return iterator over the parsed intervals, once implemented
 */
private def makeRawIntervalFromGTF(inFile: File): Iterator[RawInterval] = ???
// Planned steps for the implementation:
// convert coordinate to 1-based fully closed
// parse chrom, start blocks, end blocks, strands
// TODO: check that interval chrom is in the BAM file (optionally, when prepended with 'chr' too)
/**
 * Builds BAM query intervals from an interval file.
 *
 * The parser is chosen from the lower-cased file extension; only `bed` is
 * recognised. Intervals whose sequence name is not found in the BAM header
 * (sequence index of -1) are silently left out of the result.
 *
 * @param inFile interval file
 * @param inBAM  reader whose header is used to look up sequence names and
 *               which creates the query intervals
 * @return lazy iterator over the query intervals
 * @throws IllegalArgumentException if the file extension is not recognised
 */
def makeQueryIntervalFromFile(inFile: File, inBAM: SAMFileReader): Iterator[QueryInterval] = {
  // detect interval file format from extension
  val parseIntervals: (File => Iterator[RawInterval]) =
    getExtension(inFile.toString.toLowerCase) match {
      case "bed" => makeRawIntervalFromBED
      case _     => throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath)
    }
  for {
    raw <- parseIntervals(inFile)
    if inBAM.getFileHeader.getSequenceIndex(raw.chrom) > -1
  } yield inBAM.makeQueryInterval(raw.chrom, raw.start, raw.end)
}
// TODO: implement optional index creation
/**
 * Opens a BAM file for indexed access.
 *
 * When an index file is given, the reader is created from the BAM file and
 * that index. Otherwise the reader is created from the BAM file alone and
 * must report that it has an index.
 *
 * @param inFile      BAM file to open
 * @param inFileIndex index file for the BAM file, or `null` to rely on the
 *                    reader finding one itself
 * @return reader over `inFile`
 * @throws IllegalStateException if no index file was given and the reader
 *                               reports having no index
 */
private def prepIndexedInputBAM(inFile: File, inFileIndex: File = null): SAMFileReader =
  if (inFileIndex != null)
    new SAMFileReader(inFile, inFileIndex)
  else {
    val sfr = new SAMFileReader(inFile)
    if (!sfr.hasIndex) {
      // Release the file handle before failing; the reader is never handed to the caller.
      sfr.close()
      throw new IllegalStateException("Input BAM file must be indexed")
    }
    sfr
  }
/**
 * Placeholder for querying the records that overlap the target intervals;
 * not implemented yet.
 *
 * The body is `???`, so calling this method throws `scala.NotImplementedError`.
 *
 * @param iv      query intervals
 * @param reader  BAM reader to query
 * @param minMapQ mapping quality threshold, 0 by default
 *                (NOTE(review): presumably a minimum -- confirm once implemented)
 * @return set of matching records, once implemented
 */
def queryTargetRecords(iv: Iterator[QueryInterval], reader: SAMFileReader, minMapQ: Int = 0): Set[SAMRecord] = ???
// Planned behaviour and open items:
// TODO: set minimum fraction for overlap
// TODO: RG filtering
// query BAM files for SAM records overlapping target region
// optional: filter for MapQ value
// conditional: get mates (if records are paired)
/**
 * Placeholder for looking up the mates of the given records; not implemented yet.
 *
 * The body is `???`, so calling this method throws `scala.NotImplementedError`.
 *
 * @param records records whose mates should be looked up
 * @return set of mate records, once implemented
 */
def queryMateRecords(records: Vector[SAMRecord]): Set[SAMRecord] = ???
// query mates
/**
 * Placeholder for writing the wiped BAM output; not implemented yet.
 *
 * The body is `???`, so calling this method throws `scala.NotImplementedError`.
 *
 * @param inBAM       reader over the input BAM file
 * @param targetNames set of target records; despite the name, the parameter
 *                    type is `Set[SAMRecord]`, not a set of read names
 *                    (NOTE(review): presumably the records to remove -- confirm)
 */
private def writeWipedBAM(inBAM: SAMFileReader, targetNames: Set[SAMRecord]): Unit = ???
private def checkInputFile(inFile: File): File =
if (inFile.exists)
inFile
else
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment