From 8e85dc54a1b9d3f809fbdda8e912b8efb59294a2 Mon Sep 17 00:00:00 2001 From: Wai Yi Leung <w.y.leung@lumc.nl> Date: Tue, 29 Sep 2015 15:54:36 +0200 Subject: [PATCH] Adding KrakenToJSON tool, converting the kraken format to JSON. --- .../sasc/biopet/BiopetExecutablePublic.scala | 3 +- .../sasc/biopet/BiopetToolsExecutable.scala | 3 +- .../biopet/tools/KrakenReportToJson.scala | 204 ++++++++++++++++++ 3 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/KrakenReportToJson.scala diff --git a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutablePublic.scala b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutablePublic.scala index 36770bb18..946922ec7 100644 --- a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutablePublic.scala +++ b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutablePublic.scala @@ -60,5 +60,6 @@ object BiopetExecutablePublic extends BiopetExecutable { nl.lumc.sasc.biopet.tools.SeqStat, nl.lumc.sasc.biopet.tools.VepNormalizer, nl.lumc.sasc.biopet.tools.AnnotateVcfWithBed, - nl.lumc.sasc.biopet.tools.VcfWithVcf) + nl.lumc.sasc.biopet.tools.VcfWithVcf, + nl.lumc.sasc.biopet.tools.KrakenReportToJson) } diff --git a/public/biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala b/public/biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala index f3eae932c..263833058 100644 --- a/public/biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala +++ b/public/biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala @@ -44,5 +44,6 @@ object BiopetToolsExecutable extends BiopetExecutable { nl.lumc.sasc.biopet.tools.SeqStat, nl.lumc.sasc.biopet.tools.VepNormalizer, nl.lumc.sasc.biopet.tools.AnnotateVcfWithBed, - nl.lumc.sasc.biopet.tools.VcfWithVcf) + nl.lumc.sasc.biopet.tools.VcfWithVcf, + nl.lumc.sasc.biopet.tools.KrakenReportToJson) } diff --git a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/KrakenReportToJson.scala b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/KrakenReportToJson.scala new file mode 100644 index 000000000..580eb0e00 --- /dev/null +++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/KrakenReportToJson.scala @@ -0,0 +1,204 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.tools + +/** + * Created by wyleung on 25-9-15. + */ + +import java.io.{ PrintWriter, File } + +import nl.lumc.sasc.biopet.utils.ConfigUtils._ +import nl.lumc.sasc.biopet.utils.ToolCommand +import scala.collection.{ immutable, mutable } + +import scala.io.Source + +case class KrakenHit(taxonomyID: Long, + taxonomyName: String, + cladeCount: Long, + cladeSize: Long, // size of parent - including itself + taxonRank: String, + cladeLevel: Int, + parentTaxonomyID: Long, + children: List[KrakenHit]) { + def toJSON(): Map[String, Any] = { + val childJSON = children.map(entry => entry.toJSON()) + Map( + "name" -> taxonomyName, + "taxid" -> taxonomyID, + "taxonrank" -> taxonRank, + "cladelevel" -> cladeLevel, + "count" -> cladeCount, + "size" -> cladeSize, + "children" -> childJSON + ) + } + +} + +object KrakenReportToJson extends ToolCommand { + + var cladeIDs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill(32)(0) + val spacePattern = "^( +)".r + + case class Args(krakenreport: File = null, outputJson: Option[File] = None) extends AbstractArgs + + class OptParser extends AbstractOptParser { + + head( + s""" + |$commandName - Convert Kraken-report (full) output to JSON + """.stripMargin) + + opt[File]('i', "krakenreport") required () unbounded () valueName "<krakenreport>" action { (x, c) => + c.copy(krakenreport = x) + } validate { + x => if (x.exists) success else failure("Krakenreport not found") + } text "Kraken report to generate stats from" + opt[File]('o', "output") unbounded () valueName "<json>" action { (x, c) => + c.copy(outputJson = Some(x)) + } text "File to write output to, if not supplied output go to stdout" + } + + /** + * Parses the command line argument + * + * @param args Array of arguments + * @return + */ + def parseArgs(args: Array[String]): Args = new OptParser() + .parse(args, Args()) + .getOrElse(sys.exit(1)) + + def mergeBranch(branchA: Map[Long, KrakenHit], + branchB: KrakenHit): KrakenHit = { + + var brA = branchA.head._2 + var children = branchB.children + var cladeCount = branchB.cladeCount + var cladeSize = branchB.cladeSize + + /* special case for the root node */ + if (brA.taxonomyID == branchB.taxonomyID) { + cladeCount = brA.cladeCount + cladeSize = brA.cladeSize + + } + + /* determine to scan in branchB or return Map containing a because we cannot merge? */ + if (brA.cladeLevel > branchB.cladeLevel) { + /* if brA's cladelevel is deeper than branchB, work on the children if any when it doesn't match it as parent */ + + if (brA.parentTaxonomyID == branchB.taxonomyID) { + children :+= brA + } else { + /* extend in its children */ + // TODO: do preliminary escape, don't check deeper in the tree when we have a hit. + children = children.map(child => { + mergeBranch(branchA, child) + }) + } + } else { + /* Hits are on the same level(have siblings, adding to parent) */ + } + + new KrakenHit( + taxonomyID = branchB.taxonomyID, + taxonomyName = branchB.taxonomyName, + cladeCount = cladeCount, + cladeSize = cladeSize, + taxonRank = branchB.taxonRank, + cladeLevel = branchB.cladeLevel, + parentTaxonomyID = branchB.parentTaxonomyID, + children = children + ) + } + + def reportToJson(reportRaw: File): String = { + val reader = Source.fromFile(reportRaw) + val lines = reader.getLines().toList.filter(!_.isEmpty) + + /* + * http://ccb.jhu.edu/software/kraken/MANUAL.html + * The header layout is: + * 1. Percentage of reads covered by the clade rooted at this taxon + * 2. Number of reads covered by the clade rooted at this taxon + * 3. Number of reads assigned directly to this taxon + * 4. A rank code, indicating (U)nclassified, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. All other ranks are simply '-'. + * 5. NCBI taxonomy ID + * 6. indented scientific name + * */ + + /* + * Entries will be formatted to: + * entries[ <taxid> ] = Map( <taxid>, Map(...)) + * */ + val entries: List[Map[Long, KrakenHit]] = for (tsvLine <- lines.tail) yield { + val values = tsvLine.split("\t") + val scientificName: String = values(5) + val cladeLevel = spacePattern.findFirstIn(scientificName).getOrElse("").length / 2 + + if (cladeIDs.length <= cladeLevel + 1) { + cladeIDs ++= mutable.ArrayBuffer.fill(10)(0L) + } + + cladeIDs(cladeLevel + 1) = values(4).toLong + Map( + values(4).toLong -> new KrakenHit( + taxonomyID = values(4).toLong, + taxonomyName = scientificName.trim, + cladeCount = values(2).toLong, + cladeSize = values(1).toLong, + taxonRank = values(3), + cladeLevel = cladeLevel, + parentTaxonomyID = cladeIDs(cladeLevel), + children = List() + )) + } + val mm: KrakenHit = entries.foldLeft( + new KrakenHit( + taxonomyID = 1L, + taxonomyName = "root", + cladeCount = 0L, + cladeSize = 0L, + taxonRank = "-", + cladeLevel = 0, + parentTaxonomyID = 0L, + children = List() + )) { (bb: KrakenHit, aa: Map[Long, KrakenHit]) => + { + mergeBranch(aa, bb) + } + } + mapToJson(mm.toJSON()).spaces2 + } + + def main(args: Array[String]): Unit = { + val commandArgs: Args = parseArgs(args) + + val jsonString: String = reportToJson(commandArgs.krakenreport) + commandArgs.outputJson match { + case Some(file) => { + val writer = new PrintWriter(file) + writer.println(jsonString) + writer.close() + } + case _ => println(jsonString) + } + + } +} -- GitLab