Skip to content
Snippets Groups Projects
Commit 5414c7ab authored by Wai Yi Leung's avatar Wai Yi Leung
Browse files

Merge branch 'patch-picard_summary' into 'develop'

Fix Picard metrics parsing to use the correct types

Work in progress: don't merge yet.

This merge request is for #151. I implemented a general utility function that should be able to handle arbitrarily large numbers. The trade-off is that it loses the type information of the parsed value. There may be a clever way to retain it, based on the return types of the supplied conversion function(s), but for now this should be sufficient to fix issue #151 (there, we pattern match to check the actual type).

There are still some todos:

- [x] Update Picard parsing to handle this
- [x] Add unit tests for the conversion functions
- [ ] Add BigInt and BigDecimal support for the `anyToJson` function (this is optional, I think)

See merge request !148
parents f7e873b0 ee303c87
No related branches found
No related tags found
No related merge requests found
......@@ -70,7 +70,7 @@ class CollectAlignmentSummaryMetrics(val root: Configurable) extends Picard with
case None => Map()
case Some((header, content)) =>
(for (category <- 0 until content.size) yield {
content(category)(0) -> (
content(category)(0).toString -> (
for (
i <- 1 until header.size if i < content(category).size
) yield {
......
......@@ -103,7 +103,7 @@ class MarkDuplicates(val root: Configurable) extends Picard with Summarizable {
case None => Map()
case Some((header, content)) =>
(for (category <- 0 until content.size) yield {
content(category)(0) -> (
content(category)(0).toString -> (
for (
i <- 1 until header.size if i < content(category).size
) yield {
......
......@@ -16,11 +16,12 @@
package nl.lumc.sasc.biopet.extensions.picard
import java.io.File
import scala.io.Source
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import org.broadinstitute.gatk.utils.commandline.{ Argument }
import org.broadinstitute.gatk.utils.commandline.Argument
import scala.io.Source
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.utils.tryToParseNumber
/**
* General picard extension
......@@ -80,7 +81,7 @@ object Picard {
* @param file input metrics file
* @return (header, content)
*/
def getMetrics(file: File): Option[(Array[String], List[Array[String]])] =
def getMetrics(file: File): Option[(Array[String], List[Array[Any]])] =
if (file.exists) {
val lines = Source.fromFile(file).getLines().toArray
......@@ -88,9 +89,10 @@ object Picard {
val end = lines.indexOf("", start)
val header = lines(start).split("\t")
val content = (for (i <- (start + 1) until end) yield lines(i).split("\t")).toList
val content = (for (i <- (start + 1) until end) yield lines(i).split("\t"))
.map(row => row.map(col => tryToParseNumber(col, true).getOrElse(col)))
Option((header, content))
Option((header, content.toList))
} else {
None
}
......
package nl.lumc.sasc.biopet
import scala.util.{ Failure, Success, Try }
/**
* General utility functions.
*
* @author Wibowo Arindrarto <w.arindrarto@lumc.nl>
*/
package object utils {

  /** Regular expression for matching entire integer numbers (numbers without decimals / fractions) */
  val isInteger: scala.util.matching.Regex = """^([-+]?\d+)L?$""".r

  /** Regular expression for matching entire decimal numbers (compatible with the scientific notation) */
  val isDecimal: scala.util.matching.Regex = """^([-+]?\d*\.?\d+(?:[eE][-+]?[0-9]+)?)$""".r

  /**
   * Tries to convert the given string with the given conversion functions, one after another.
   *
   * The functions are tried in the order they are specified and the result of the first one that does not throw
   * is returned as a [[Success]]. If every function throws, or if no function is given at all, a [[Failure]]
   * is returned.
   *
   * @param raw the string to convert.
   * @param funcs one or more conversion functions to apply.
   * @return a [[Try]] object encapsulating the conversion result.
   */
  def tryToConvert(raw: String, funcs: (String => Any)*): Try[Any] =
    // The iterator is lazy, so conversion stops at the first function that succeeds
    // and later functions are never invoked.
    funcs.iterator
      .map(convert => Try(convert(raw)))
      .collectFirst { case converted @ Success(_) => converted }
      .getOrElse(Failure(new Exception(s"Can not extract value from string $raw")))

  /**
   * Parses a decimal string to a [[Double]], rejecting values that overflow the [[Double]] range.
   *
   * `String.toDouble` does not throw on overflow but silently returns an infinity. Throwing here instead lets
   * [[tryToConvert]] move on to the next, wider conversion function.
   */
  private def toFiniteDouble(text: String): Double = {
    val parsed = text.toDouble
    if (java.lang.Double.isInfinite(parsed))
      throw new NumberFormatException(s"Value $text is out of range for a Double")
    parsed
  }

  /**
   * Tries to convert the given string into the appropriate number representation.
   *
   * The given string must be an entire number without any preceding or succeeding whitespace. This function takes
   * into account the maximum values of the number object to use. For example, if the raw string represents a bigger
   * number than the maximum [[Int]] value, then a [[Long]] will be used. If the number is still bigger than a [[Long]],
   * the [[BigInt]] class will be used. The same is applied for decimal numbers, where the conversion order is first
   * a [[Double]], then a [[BigDecimal]] when the value does not fit in the [[Double]] range.
   *
   * @param raw the string to convert.
   * @param fallBack Allows also to return the string itself when converting fails, default false.
   * @return a [[Try]] object encapsulating the conversion result.
   */
  def tryToParseNumber(raw: String, fallBack: Boolean = false): Try[Any] = raw match {
    case isInteger(i)  => tryToConvert(i, x => x.toInt, x => x.toLong, x => BigInt(x))
    case isDecimal(f)  => tryToConvert(f, x => toFiniteDouble(x), x => BigDecimal(x))
    case _ if fallBack => Success(raw)
    case _             => Failure(new Exception(s"Can not extract number from string $raw"))
  }
}
package nl.lumc.sasc.biopet.utils
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.Test
import scala.util.Try
/**
* Created by pjvan_thof on 4/14/15.
*/
class PackageTest extends TestNGSuite with Matchers {

  /** Checks parsing of small integers, decimals and non-numeric input, with and without the string fallback. */
  @Test def testConvert(): Unit = {
    tryToParseNumber("4") shouldBe Try(4)
    tryToParseNumber("13.37") shouldBe Try(13.37)
    // Assert the failure itself; "not equal to Success(raw)" would also pass for any wrong Success value.
    tryToParseNumber("I'm not a number").isFailure shouldBe true

    tryToParseNumber("4", fallBack = true) shouldBe Try(4)
    tryToParseNumber("13.37", fallBack = true) shouldBe Try(13.37)
    tryToParseNumber("I'm not a number", fallBack = true) shouldBe Try("I'm not a number")
  }

  /** Checks that integers too large for an Int are widened to Long and then to BigInt. */
  @Test def testConvertLargeIntegers(): Unit = {
    tryToParseNumber("3000000000") shouldBe Try(3000000000L)
    tryToParseNumber("9223372036854775808") shouldBe Try(BigInt("9223372036854775808"))
  }

  /** Checks signed and scientific-notation input. */
  @Test def testConvertSignedAndScientific(): Unit = {
    tryToParseNumber("-4") shouldBe Try(-4)
    tryToParseNumber("-1.5e3") shouldBe Try(-1500.0)
  }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment