From 45b54bc61d4bc52c4d7998b03b15ba690fdcc52a Mon Sep 17 00:00:00 2001 From: Wai Yi Leung <w.y.leung@lumc.nl> Date: Thu, 1 Oct 2015 16:06:47 +0200 Subject: [PATCH] Simplified tree management --- .../biopet/tools/KrakenReportToJson.scala | 115 ++++++------------ 1 file changed, 34 insertions(+), 81 deletions(-) diff --git a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/KrakenReportToJson.scala b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/KrakenReportToJson.scala index 580eb0e00..ed5b1f028 100644 --- a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/KrakenReportToJson.scala +++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/KrakenReportToJson.scala @@ -23,6 +23,7 @@ import java.io.{ PrintWriter, File } import nl.lumc.sasc.biopet.utils.ConfigUtils._ import nl.lumc.sasc.biopet.utils.ToolCommand +import scala.collection.mutable.ListBuffer import scala.collection.{ immutable, mutable } import scala.io.Source @@ -34,9 +35,9 @@ case class KrakenHit(taxonomyID: Long, taxonRank: String, cladeLevel: Int, parentTaxonomyID: Long, - children: List[KrakenHit]) { + children: ListBuffer[KrakenHit]) { def toJSON(): Map[String, Any] = { - val childJSON = children.map(entry => entry.toJSON()) + val childJSON = children.toList.map(entry => entry.toJSON()) Map( "name" -> taxonomyName, "taxid" -> taxonomyID, @@ -84,53 +85,33 @@ object KrakenReportToJson extends ToolCommand { .parse(args, Args()) .getOrElse(sys.exit(1)) - def mergeBranch(branchA: Map[Long, KrakenHit], - branchB: KrakenHit): KrakenHit = { - var brA = branchA.head._2 - var children = branchB.children - var cladeCount = branchB.cladeCount - var cladeSize = branchB.cladeSize + def parseLine( krakenRawHit: String ): Map[Long, KrakenHit] = { + val values: Array[String] = krakenRawHit.stripLineEnd.split("\t") + val scientificName: String = values(5) + val cladeLevel = spacePattern.findFirstIn(scientificName).getOrElse("").length / 2 - /* special case for the root node */ - if (brA.taxonomyID == branchB.taxonomyID) { - cladeCount = brA.cladeCount - cladeSize = brA.cladeSize - - } - - /* determine to scan in branchB or return Map containing a because we cannot merge? */ - if (brA.cladeLevel > branchB.cladeLevel) { - /* if brA's cladelevel is deeper than branchB, work on the children if any when it doesn't match it as parent */ - - if (brA.parentTaxonomyID == branchB.taxonomyID) { - children :+= brA - } else { - /* extend in its children */ - // TODO: do preliminary escape, don't check deeper in the tree when we have a hit. - children = children.map(child => { - mergeBranch(branchA, child) - }) - } - } else { - /* Hits are on the same level(have siblings, adding to parent) */ + if (cladeIDs.length <= cladeLevel + 1) { + cladeIDs ++= mutable.ArrayBuffer.fill(10)(0L) } - new KrakenHit( - taxonomyID = branchB.taxonomyID, - taxonomyName = branchB.taxonomyName, - cladeCount = cladeCount, - cladeSize = cladeSize, - taxonRank = branchB.taxonRank, - cladeLevel = branchB.cladeLevel, - parentTaxonomyID = branchB.parentTaxonomyID, - children = children - ) + cladeIDs(cladeLevel + 1) = values(4).toLong + Map( + values(4).toLong -> new KrakenHit( + taxonomyID = values(4).toLong, + taxonomyName = scientificName.trim, + cladeCount = values(2).toLong, + cladeSize = values(1).toLong, + taxonRank = values(3), + cladeLevel = cladeLevel, + parentTaxonomyID = cladeIDs(cladeLevel), + children = ListBuffer() + )) } def reportToJson(reportRaw: File): String = { val reader = Source.fromFile(reportRaw) - val lines = reader.getLines().toList.filter(!_.isEmpty) +// val lines = reader.getLines().toList.filter(!_.isEmpty) /* * http://ccb.jhu.edu/software/kraken/MANUAL.html @@ -143,48 +124,20 @@ object KrakenReportToJson extends ToolCommand { * 6. indented scientific name * */ - /* - * Entries will be formatted to: - * entries[ <taxid> ] = Map( <taxid>, Map(...)) - * */ - val entries: List[Map[Long, KrakenHit]] = for (tsvLine <- lines.tail) yield { - val values = tsvLine.split("\t") - val scientificName: String = values(5) - val cladeLevel = spacePattern.findFirstIn(scientificName).getOrElse("").length / 2 + val lines = reader.getLines() + .map(line => parseLine(line)) + .filter(p => p.head._2.cladeSize > 0) + .foldLeft(Map.empty[Long, KrakenHit])( (a,b) => { + a + b.head + } ) - if (cladeIDs.length <= cladeLevel + 1) { - cladeIDs ++= mutable.ArrayBuffer.fill(10)(0L) - } + lines.keys.foreach(k => { + // append itself to the children attribute of the parent + lines(lines(k).parentTaxonomyID).children += lines(k) + }) + + mapToJson(lines(1).toJSON()).spaces2 - cladeIDs(cladeLevel + 1) = values(4).toLong - Map( - values(4).toLong -> new KrakenHit( - taxonomyID = values(4).toLong, - taxonomyName = scientificName.trim, - cladeCount = values(2).toLong, - cladeSize = values(1).toLong, - taxonRank = values(3), - cladeLevel = cladeLevel, - parentTaxonomyID = cladeIDs(cladeLevel), - children = List() - )) - } - val mm: KrakenHit = entries.foldLeft( - new KrakenHit( - taxonomyID = 1L, - taxonomyName = "root", - cladeCount = 0L, - cladeSize = 0L, - taxonRank = "-", - cladeLevel = 0, - parentTaxonomyID = 0L, - children = List() - )) { (bb: KrakenHit, aa: Map[Long, KrakenHit]) => - { - mergeBranch(aa, bb) - } - } - mapToJson(mm.toJSON()).spaces2 } def main(args: Array[String]): Unit = { -- GitLab