From ef2a9acd9915b5086becac579e45b4be6aea7305 Mon Sep 17 00:00:00 2001 From: bow <bow@bow.web.id> Date: Tue, 28 Oct 2014 01:51:59 +0100 Subject: [PATCH] Update to htsjdk use in GATK 3.3 --- .../biopet/tools/ExtractAlignedFastq.scala | 134 +++++++++--------- .../tools/ExtractAlignedFastqUnitTest.scala | 95 +++++++------ 2 files changed, 123 insertions(+), 106 deletions(-) diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastq.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastq.scala index cdbc13e99..83a607de4 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastq.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastq.scala @@ -9,19 +9,20 @@ import java.io.File import scala.collection.mutable.{ Set => MSet } import scala.collection.JavaConverters._ -import htsjdk.samtools.SAMFileReader import htsjdk.samtools.QueryInterval +import htsjdk.samtools.SamReaderFactory +import htsjdk.samtools.ValidationStringency import htsjdk.samtools.fastq.{ BasicFastqWriter, FastqReader, FastqRecord } -import htsjdk.tribble.Feature -import htsjdk.tribble.BasicFeature +import htsjdk.samtools.util.Interval import nl.lumc.sasc.biopet.core.ToolCommand object ExtractAlignedFastq extends ToolCommand { type FastqPair = (FastqRecord, FastqRecord) + /** - * Function to create iterator over features given input interval string + * Function to create iterator over Interval given input interval string * * Valid interval strings are either of these: * - chr5:10000-11000 @@ -38,7 +39,7 @@ object ExtractAlignedFastq extends ToolCommand { * * @param inStrings iterable yielding input interval string */ - def makeFeatureFromString(inStrings: Iterable[String]): Iterator[Feature] = { + def makeIntervalFromString(inStrings: Iterable[String]): Iterator[Interval] = { // FIXME: can we combine these two patterns into one regex? // matches intervals with start and end coordinates @@ -47,15 +48,16 @@ object ExtractAlignedFastq extends ToolCommand { val ptn2 = """([\w_-]+):([\d.,]+)""".r // make ints from coordinate strings // NOTE: while it is possible for coordinates to exceed Int.MaxValue, we are limited - // by the BasicFeature constructor only accepting ints + // by the Interval constructor only accepting ints def intFromCoord(s: String): Int = s.replaceAll(",", "").replaceAll("\\.", "").toInt inStrings.map(x => x match { - case ptn1(chr, start, end) => new BasicFeature(chr, intFromCoord(start), intFromCoord(end)) - case ptn2(chr, start) => val startCoord = intFromCoord(start) - new BasicFeature(chr, startCoord, startCoord) - case _ => throw new IllegalArgumentException("Invalid interval string: " + x) - }) + case ptn1(chr, start, end) => new Interval(chr, intFromCoord(start), intFromCoord(end)) + case ptn2(chr, start) => + val startCoord = intFromCoord(start) + new Interval(chr, startCoord, startCoord) + case _ => throw new IllegalArgumentException("Invalid interval string: " + x) + }) .toIterator } @@ -69,34 +71,30 @@ object ExtractAlignedFastq extends ToolCommand { * @param commonSuffixLength length of suffix common to all read pairs * @return */ - def makeMembershipFunction(iv: Iterator[Feature], + def makeMembershipFunction(iv: Iterator[Interval], inAln: File, minMapQ: Int = 0, - commonSuffixLength: Int = 0 - ): (FastqPair => Boolean) = { - - /** function to make interval queries for BAM files */ - def makeQueryInterval(aln: SAMFileReader, feat: Feature): QueryInterval = - if (aln.getFileHeader.getSequenceIndex(feat.getChr) > -1) - aln.makeQueryInterval(feat.getChr, feat.getStart, feat.getEnd) - else if (feat.getChr.startsWith("chr") - && aln.getFileHeader.getSequenceIndex(feat.getChr.substring(3)) > -1) - aln.makeQueryInterval(feat.getChr.substring(3), feat.getStart, feat.getEnd) - else if (!feat.getChr.startsWith("chr") - && aln.getFileHeader.getSequenceIndex("chr" + feat.getChr) > -1) - aln.makeQueryInterval("chr" + feat.getChr, feat.getStart, feat.getEnd) - else - throw new IllegalStateException("Unexpected feature: " + feat.toString) + commonSuffixLength: Int = 0): (FastqPair => Boolean) = { - val inAlnReader = new SAMFileReader(inAln) + val inAlnReader = SamReaderFactory + .make() + .validationStringency(ValidationStringency.LENIENT) + .open(inAln) require(inAlnReader.hasIndex) + def getSequenceIndex(name: String): Int = inAlnReader.getFileHeader.getSequenceIndex(name) match { + case x if x >= 0 => + x + case otherwise => + throw new IllegalArgumentException("Chromosome " + name + " is not found in the alignment file") + } + val queries: Array[QueryInterval] = iv.toList - // sort features - .sortBy(x => (x.getChr, x.getStart, x.getEnd)) - // turn them into QueryInterval objects - .map(x => makeQueryInterval(inAlnReader, x)) - // return as an array + // sort Interval + .sortBy(x => (x.getSequence, x.getStart, x.getEnd)) + // transform to QueryInterval + .map(x => new QueryInterval(getSequenceIndex(x.getSequence), x.getStart, x.getEnd)) + // cast to array .toArray lazy val selected: MSet[String] = inAlnReader @@ -112,11 +110,11 @@ object ExtractAlignedFastq extends ToolCommand { logger.debug("Adding " + x.getReadName + " to set ...") acc += x.getReadName } - ) + ) (pair: FastqPair) => pair._2 match { - case null => selected.contains(pair._1.getReadHeader) - case otherwise => + case null => selected.contains(pair._1.getReadHeader) + case otherwise => require(commonSuffixLength < pair._1.getReadHeader.length) require(commonSuffixLength < pair._2.getReadHeader.length) selected.contains(pair._1.getReadHeader.dropRight(commonSuffixLength)) @@ -131,8 +129,8 @@ object ExtractAlignedFastq extends ToolCommand { val i1 = inputFastq1.iterator.asScala val i2 = inputFastq2 match { - case null => Iterator.continually(null) - case otherwise => otherwise.iterator.asScala + case null => Iterator.continually(null) + case otherwise => otherwise.iterator.asScala } val o1 = outputFastq1 val o2 = (inputFastq2, outputFastq2) match { @@ -152,18 +150,18 @@ object ExtractAlignedFastq extends ToolCommand { case (rec1, rec2) => o1.write(rec1) o2.write(rec2) - } + } } - case class Args (inputBam: File = null, - intervals: List[String] = List.empty[String], - inputFastq1: File = null, - inputFastq2: File = null, - outputFastq1: File = null, - outputFastq2: File = null, - minMapQ: Int = 0, - commonSuffixLength: Int = 0) extends AbstractArgs + case class Args(inputBam: File = null, + intervals: List[String] = List.empty[String], + inputFastq1: File = null, + inputFastq2: File = null, + outputFastq1: File = null, + outputFastq2: File = null, + minMapQ: Int = 0, + commonSuffixLength: Int = 0) extends AbstractArgs class OptParser extends AbstractOptParser { @@ -172,36 +170,44 @@ object ExtractAlignedFastq extends ToolCommand { |$commandName - Select aligned FASTQ records """.stripMargin) - opt[File]('I', "input_file") required() valueName "<bam>" action { (x, c) => - c.copy(inputBam = x) } validate { + opt[File]('I', "input_file") required () valueName "<bam>" action { (x, c) => + c.copy(inputBam = x) + } validate { x => if (x.exists) success else failure("Input BAM file not found") } text "Input BAM file" - opt[String]('r', "interval") required() unbounded() valueName "<interval>" action { (x, c) => + opt[String]('r', "interval") required () unbounded () valueName "<interval>" action { (x, c) => // yes, we are appending and yes it's O(n) ~ preserving order is more important than speed here - c.copy(intervals = c.intervals :+ x) } text "Interval strings" + c.copy(intervals = c.intervals :+ x) + } text "Interval strings" - opt[File]('i', "in1") required() valueName "<fastq>" action { (x, c) => - c.copy(inputFastq1 = x) } validate { + opt[File]('i', "in1") required () valueName "<fastq>" action { (x, c) => + c.copy(inputFastq1 = x) + } validate { x => if (x.exists) success else failure("Input FASTQ file 1 not found") } text "Input FASTQ file 1" - opt[File]('j', "in2") optional() valueName "<fastq>" action { (x, c) => - c.copy(inputFastq1 = x) } validate { + opt[File]('j', "in2") optional () valueName "<fastq>" action { (x, c) => + c.copy(inputFastq1 = x) + } validate { x => if (x.exists) success else failure("Input FASTQ file 2 not found") } text "Input FASTQ file 2 (default: none)" - opt[File]('o', "out1") required() valueName "<fastq>" action { (x, c) => - c.copy(outputFastq1 = x) } text "Output FASTQ file 1" + opt[File]('o', "out1") required () valueName "<fastq>" action { (x, c) => + c.copy(outputFastq1 = x) + } text "Output FASTQ file 1" - opt[File]('p', "out2") optional() valueName "<fastq>" action { (x, c) => - c.copy(outputFastq1 = x) } text "Output FASTQ file 2 (default: none)" + opt[File]('p', "out2") optional () valueName "<fastq>" action { (x, c) => + c.copy(outputFastq1 = x) + } text "Output FASTQ file 2 (default: none)" - opt[Int]('Q', "min_mapq") optional() action { (x, c) => - c.copy(minMapQ = x) } text "Minimum MAPQ of reads in target region to remove (default: 0)" + opt[Int]('Q', "min_mapq") optional () action { (x, c) => + c.copy(minMapQ = x) + } text "Minimum MAPQ of reads in target region to remove (default: 0)" - opt[Int]('s', "read_suffix_length") optional() action { (x, c) => - c.copy(commonSuffixLength = x) } text "Length of common suffix from each read pair (default: 0)" + opt[Int]('s', "read_suffix_length") optional () action { (x, c) => + c.copy(commonSuffixLength = x) + } text "Length of common suffix from each read pair (default: 0)" note( """ @@ -229,7 +235,7 @@ object ExtractAlignedFastq extends ToolCommand { .getOrElse(sys.exit(1)) val memFunc = makeMembershipFunction( - iv = makeFeatureFromString(commandArgs.intervals), + iv = makeIntervalFromString(commandArgs.intervals), inAln = commandArgs.inputBam, minMapQ = commandArgs.minMapQ, commonSuffixLength = commandArgs.commonSuffixLength) diff --git a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastqUnitTest.scala b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastqUnitTest.scala index d166ad4f7..0dac57875 100644 --- a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastqUnitTest.scala +++ b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastqUnitTest.scala @@ -14,8 +14,8 @@ import org.scalatest.mock.MockitoSugar import org.scalatest.testng.TestNGSuite import org.testng.annotations.{ DataProvider, Test } +import htsjdk.samtools.util.Interval import htsjdk.samtools.fastq.{ BasicFastqWriter, FastqReader, FastqRecord } -import htsjdk.tribble._ class ExtractAlignedFastqUnitTest extends TestNGSuite with MockitoSugar with Matchers { @@ -24,8 +24,8 @@ class ExtractAlignedFastqUnitTest extends TestNGSuite with MockitoSugar with Mat private def resourceFile(p: String): File = new File(Paths.get(getClass.getResource(p).toURI).toString) - private def makeFeature(chr: String, start: Int, end: Int): Feature = - new BasicFeature(chr, start ,end) + private def makeInterval(chr: String, start: Int, end: Int): Interval = + new Interval(chr, start, end) private def makeRecord(header: String): FastqRecord = new FastqRecord(header, "ATGC", "", "HIHI") @@ -40,41 +40,52 @@ class ExtractAlignedFastqUnitTest extends TestNGSuite with MockitoSugar with Mat tName + " on " + f.getName + ", read " + rName + ": " @Test def testIntervalStartEnd() = { - val obs = makeFeatureFromString(List("chr5:1000-1100")).next() - val exp = new BasicFeature("chr5", 1000, 1100) - obs.getChr should === (exp.getChr) + val obs = makeIntervalFromString(List("chr5:1000-1100")).next() + val exp = new Interval("chr5", 1000, 1100) + obs.getSequence should === (exp.getSequence) obs.getStart should === (exp.getStart) obs.getEnd should === (exp.getEnd) } @Test def testIntervalStartEndComma() = { - val obs = makeFeatureFromString(List("chr5:1,000-1,100")).next() - val exp = new BasicFeature("chr5", 1000, 1100) - obs.getChr should === (exp.getChr) + val obs = makeIntervalFromString(List("chr5:1,000-1,100")).next() + val exp = new Interval("chr5", 1000, 1100) + obs.getSequence should === (exp.getSequence) obs.getStart should === (exp.getStart) obs.getEnd should === (exp.getEnd) } @Test def testIntervalStartEndDot() = { - val obs = makeFeatureFromString(List("chr5:1.000-1.100")).next() - val exp = new BasicFeature("chr5", 1000, 1100) - obs.getChr should === (exp.getChr) + val obs = makeIntervalFromString(List("chr5:1.000-1.100")).next() + val exp = new Interval("chr5", 1000, 1100) + obs.getSequence should === (exp.getSequence) obs.getStart should === (exp.getStart) obs.getEnd should === (exp.getEnd) } @Test def testIntervalStart() = { - val obs = makeFeatureFromString(List("chr5:1000")).next() - val exp = new BasicFeature("chr5", 1000, 1000) - obs.getChr should === (exp.getChr) + val obs = makeIntervalFromString(List("chr5:1000")).next() + val exp = new Interval("chr5", 1000, 1000) + obs.getSequence should === (exp.getSequence) obs.getStart should === (exp.getStart) obs.getEnd should === (exp.getEnd) } - @Test def testIntervalError() = - intercept[IllegalArgumentException] { - makeFeatureFromString(List("chr5:1000-")).next() + @Test def testIntervalError() = { + val thrown = intercept[IllegalArgumentException] { + makeIntervalFromString(List("chr5:1000-")).next() + } + thrown.getMessage should === ("Invalid interval string: chr5:1000-") + } + + @Test def testMemFuncIntervalError() = { + val iv = Iterator(new Interval("chrP", 1, 1000)) + val inAln = resourceFile("/single01.bam") + val thrown = intercept[IllegalArgumentException] { + makeMembershipFunction(iv, inAln) } + thrown.getMessage should === ("Chromosome chrP is not found in the alignment file") + } @DataProvider(name = "singleAlnProvider1", parallel = true) def singleAlnProvider1() = { @@ -84,20 +95,20 @@ class ExtractAlignedFastqUnitTest extends TestNGSuite with MockitoSugar with Mat Array( Array("adjacent left", - makeFeature("chrQ", 30, 49), sBam01, sFastq1, sFastq1Default), + makeInterval("chrQ", 30, 49), sBam01, sFastq1, sFastq1Default), Array("adjacent right", - makeFeature("chrQ", 200, 210), sBam01, sFastq1, sFastq1Default), + makeInterval("chrQ", 200, 210), sBam01, sFastq1, sFastq1Default), Array("no overlap", - makeFeature("chrQ", 220, 230), sBam01, sFastq1, sFastq1Default), + makeInterval("chrQ", 220, 230), sBam01, sFastq1, sFastq1Default), Array("partial overlap", - makeFeature("chrQ", 430, 460), sBam01, sFastq1, sFastq1Default.updated("r04", true)), + makeInterval("chrQ", 430, 460), sBam01, sFastq1, sFastq1Default.updated("r04", true)), Array("enveloped", - makeFeature("chrQ", 693, 698), sBam01, sFastq1, sFastq1Default.updated("r03", true)) + makeInterval("chrQ", 693, 698), sBam01, sFastq1, sFastq1Default.updated("r03", true)) ) } @Test(dataProvider = "singleAlnProvider1") - def testSingleBamDefault(name: String, feat: Feature, inAln: File, + def testSingleBamDefault(name: String, feat: Interval, inAln: File, fastqMap: Map[String, FastqPair], resultMap: Map[String, Boolean]) = { require(resultMap.keySet == fastqMap.keySet) val memFunc = makeMembershipFunction(Iterator(feat), inAln) @@ -116,16 +127,16 @@ class ExtractAlignedFastqUnitTest extends TestNGSuite with MockitoSugar with Mat Array( Array("less than minimum MAPQ", - makeFeature("chrQ", 830, 890), sBam02, 60, sFastq2, sFastq2Default), + makeInterval("chrQ", 830, 890), sBam02, 60, sFastq2, sFastq2Default), Array("greater than minimum MAPQ", - makeFeature("chrQ", 830, 890), sBam02, 20, sFastq2, sFastq2Default.updated("r07", true)), + makeInterval("chrQ", 830, 890), sBam02, 20, sFastq2, sFastq2Default.updated("r07", true)), Array("equal to minimum MAPQ", - makeFeature("chrQ", 260, 320), sBam02, 30, sFastq2, sFastq2Default.updated("r01", true)) + makeInterval("chrQ", 260, 320), sBam02, 30, sFastq2, sFastq2Default.updated("r01", true)) ) } @Test(dataProvider = "singleAlnProvider2") - def testSingleBamMinMapQ(name: String, feat: Feature, inAln: File, minMapQ: Int, + def testSingleBamMinMapQ(name: String, feat: Interval, inAln: File, minMapQ: Int, fastqMap: Map[String, FastqPair], resultMap: Map[String, Boolean]) = { require(resultMap.keySet == fastqMap.keySet) val memFunc = makeMembershipFunction(Iterator(feat), inAln, minMapQ) @@ -148,22 +159,22 @@ class ExtractAlignedFastqUnitTest extends TestNGSuite with MockitoSugar with Mat Array( Array("adjacent left", - makeFeature("chrQ", 30, 49), pBam01, pFastq1, pFastq1Default), + makeInterval("chrQ", 30, 49), pBam01, pFastq1, pFastq1Default), Array("adjacent right", - makeFeature("chrQ", 200, 210), pBam01, pFastq1, pFastq1Default), + makeInterval("chrQ", 200, 210), pBam01, pFastq1, pFastq1Default), Array("no overlap", - makeFeature("chrQ", 220, 230), pBam01, pFastq1, pFastq1Default), + makeInterval("chrQ", 220, 230), pBam01, pFastq1, pFastq1Default), Array("partial overlap", - makeFeature("chrQ", 430, 460), pBam01, pFastq1, pFastq1Default.updated("r04", true)), + makeInterval("chrQ", 430, 460), pBam01, pFastq1, pFastq1Default.updated("r04", true)), Array("enveloped", - makeFeature("chrQ", 693, 698), pBam01, pFastq1, pFastq1Default.updated("r03", true)), + makeInterval("chrQ", 693, 698), pBam01, pFastq1, pFastq1Default.updated("r03", true)), Array("in intron", - makeFeature("chrQ", 900, 999), pBam01, pFastq1, pFastq1Default.updated("r05", true)) + makeInterval("chrQ", 900, 999), pBam01, pFastq1, pFastq1Default.updated("r05", true)) ) } @Test(dataProvider = "pairAlnProvider1") - def testPairBamDefault(name: String, feat: Feature, inAln: File, + def testPairBamDefault(name: String, feat: Interval, inAln: File, fastqMap: Map[String, FastqPair], resultMap: Map[String, Boolean]) = { require(resultMap.keySet == fastqMap.keySet) val memFunc = makeMembershipFunction(Iterator(feat), inAln, commonSuffixLength = 2) @@ -175,13 +186,13 @@ class ExtractAlignedFastqUnitTest extends TestNGSuite with MockitoSugar with Mat } @Test def testWriteSingleBamDefault() = { - val memFunc = (recs: FastqPair) => Set("r01", "r03").contains(recs._1.getReadHeader) - val in1 = new FastqReader(resourceFile("/single01.fq")) - val mo1 = mock[BasicFastqWriter] - selectFastqReads(memFunc, in1, mo1) - verify(mo1, times(2)).write(anyObject.asInstanceOf[FastqRecord]) - verify(mo1).write(new FastqRecord("r01", "A", "", "H")) - verify(mo1).write(new FastqRecord("r03", "G", "", "H")) + val memFunc = (recs: FastqPair) => Set("r01", "r03").contains(recs._1.getReadHeader) + val in1 = new FastqReader(resourceFile("/single01.fq")) + val mo1 = mock[BasicFastqWriter] + selectFastqReads(memFunc, in1, mo1) + verify(mo1, times(2)).write(anyObject.asInstanceOf[FastqRecord]) + verify(mo1).write(new FastqRecord("r01", "A", "", "H")) + verify(mo1).write(new FastqRecord("r03", "G", "", "H")) } @Test def testWritePairBamDefault() = { -- GitLab