KrakenReportToJson.scala 6.33 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
package nl.lumc.sasc.biopet.tools

/**
 * Created by wyleung on 25-9-15.
 */

Peter van 't Hof's avatar
Peter van 't Hof committed
22
import java.io.{ File, PrintWriter }
23
24
25
26

import nl.lumc.sasc.biopet.utils.ConfigUtils._
import nl.lumc.sasc.biopet.utils.ToolCommand

27
28
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
29
30
import scala.io.Source

31
object KrakenReportToJson extends ToolCommand {
32

33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
  case class KrakenHit(taxonomyID: Long,
                       taxonomyName: String,
                       cladeCount: Long,
                       cladeSize: Long, // size of parent - including itself
                       taxonRank: String,
                       cladeLevel: Int,
                       parentTaxonomyID: Long,
                       children: ListBuffer[KrakenHit]) {
    def toJSON(withChildren: Boolean = false): Map[String, Any] = {
      val childJSON = if (withChildren) children.toList.map(entry => entry.toJSON(withChildren)) else List()
      Map(
        "name" -> taxonomyName,
        "taxid" -> taxonomyID,
        "taxonrank" -> taxonRank,
        "cladelevel" -> cladeLevel,
        "count" -> cladeCount,
        "size" -> cladeSize,
        "children" -> childJSON
      )
    }
53

54
  }
55
56
57

  var cladeIDs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill(32)(0)
  val spacePattern = "^( +)".r
Wai Yi Leung's avatar
Wai Yi Leung committed
58
  private var lines: Map[Long, KrakenHit] = Map.empty
59

60
  case class Args(krakenreport: File = null, outputJson: Option[File] = None, skipNames: Boolean = false) extends AbstractArgs
61
62
63
64
65
66
67
68
69
70
71
72
73

  class OptParser extends AbstractOptParser {

    head(
      s"""
         |$commandName - Convert Kraken-report (full) output to JSON
      """.stripMargin)

    opt[File]('i', "krakenreport") required () unbounded () valueName "<krakenreport>" action { (x, c) =>
      c.copy(krakenreport = x)
    } validate {
      x => if (x.exists) success else failure("Krakenreport not found")
    } text "Kraken report to generate stats from"
74

75
76
77
    opt[File]('o', "output") unbounded () valueName "<json>" action { (x, c) =>
      c.copy(outputJson = Some(x))
    } text "File to write output to, if not supplied output go to stdout"
78
79
80
81
82

    opt[Boolean]('n', "skipnames") unbounded () valueName "<skipnames>" action { (x, c) =>
      c.copy(skipNames = x)
    } text "Don't report the scientific name of the taxon."

83
84
85
86
87
88
89
90
91
92
93
94
  }

  /**
   * Parses the command line argument
   *
   * @param args Array of arguments
   * @return
   */
  def parseArgs(args: Array[String]): Args = new OptParser()
    .parse(args, Args())
    .getOrElse(sys.exit(1))

95
96
97
98
99
100
101
102
103
  /**
   * Takes a line from the kraken report, converts into Map with taxonID and
   * information on this hit as `KrakenHit`. `KrakenHit` is used later on for
   * building the tree
   *
   * @param krakenRawHit Line from the KrakenReport output
   * @param skipNames Specify to skip names in the report output to reduce size of JSON
   * @return
   */
104
  def parseLine(krakenRawHit: String, skipNames: Boolean): Map[Long, KrakenHit] = {
Wai Yi Leung's avatar
Wai Yi Leung committed
105
    val values: Array[String] = krakenRawHit.stripLineEnd.split("\t")
106
107
108

    assert(values.length == 6)

Wai Yi Leung's avatar
Wai Yi Leung committed
109
110
    val scientificName: String = values(5)
    val cladeLevel = spacePattern.findFirstIn(scientificName).getOrElse("").length / 2
111

Wai Yi Leung's avatar
Wai Yi Leung committed
112
113
    if (cladeIDs.length <= cladeLevel + 1) {
      cladeIDs ++= mutable.ArrayBuffer.fill(10)(0L)
114
115
    }

Wai Yi Leung's avatar
Wai Yi Leung committed
116
117
118
119
    cladeIDs(cladeLevel + 1) = values(4).toLong
    Map(
      values(4).toLong -> new KrakenHit(
        taxonomyID = values(4).toLong,
120
        taxonomyName = if (skipNames) "" else scientificName.trim,
Wai Yi Leung's avatar
Wai Yi Leung committed
121
122
123
124
125
126
127
        cladeCount = values(2).toLong,
        cladeSize = values(1).toLong,
        taxonRank = values(3),
        cladeLevel = cladeLevel,
        parentTaxonomyID = cladeIDs(cladeLevel),
        children = ListBuffer()
      ))
128
129
  }

130
131
132
133
134
135
136
137
  /**
   * Read the `KrakenReport` output and transform into `Map` by TaxonID and `KrakenHit`
   * A JSON-string output is given.
   *
   * @param reportRaw The `KrakenReport` output
   * @param skipNames Specify to skip names in the report output to reduce size of JSON
   * @return
   */
138
  def reportToJson(reportRaw: File, skipNames: Boolean): String = {
139
140
141
142
143
144
145
146
147
148
149
150
151
    val reader = Source.fromFile(reportRaw)

    /*
    * http://ccb.jhu.edu/software/kraken/MANUAL.html
    * The header layout is:
    * 1. Percentage of reads covered by the clade rooted at this taxon
    * 2. Number of reads covered by the clade rooted at this taxon
    * 3. Number of reads assigned directly to this taxon
    * 4. A rank code, indicating (U)nclassified, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. All other ranks are simply '-'.
    * 5. NCBI taxonomy ID
    * 6. indented scientific name
    * */

Wai Yi Leung's avatar
Wai Yi Leung committed
152
    lines = reader.getLines()
153
      .map(line => parseLine(line, skipNames))
154
      .filter(p => (p.head._2.cladeSize > 0) || List(0L, 1L).contains(p.head._2.taxonomyID))
155
156
157
      .foldLeft(Map.empty[Long, KrakenHit])((a, b) => {
        a + b.head
      })
158

Wai Yi Leung's avatar
Wai Yi Leung committed
159
160
    lines.keys.foreach(k => {
      // append itself to the children attribute of the parent
Wai Yi Leung's avatar
Wai Yi Leung committed
161
162
163
164
      if (lines(k).parentTaxonomyID > 0L) {
        // avoid the root and unclassified appending to the unclassified node
        lines(lines(k).parentTaxonomyID).children += lines(k)
      }
Wai Yi Leung's avatar
Wai Yi Leung committed
165
166
    })

Wai Yi Leung's avatar
Wai Yi Leung committed
167
    val result = Map("unclassified" -> lines(0).toJSON(withChildren = false),
Wai Yi Leung's avatar
Wai Yi Leung committed
168
169
170
171
      "classified" -> lines(1).toJSON(withChildren = true))
    mapToJson(result).spaces2

  }
172
173
174
175

  def main(args: Array[String]): Unit = {
    val commandArgs: Args = parseArgs(args)

176
    val jsonString: String = reportToJson(commandArgs.krakenreport, skipNames = commandArgs.skipNames)
177
    commandArgs.outputJson match {
Wai Yi Leung's avatar
Wai Yi Leung committed
178
      case Some(file) =>
179
180
181
182
183
184
185
186
        val writer = new PrintWriter(file)
        writer.println(jsonString)
        writer.close()
      case _ => println(jsonString)
    }

  }
}