KrakenReportToJson.scala 5.68 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
package nl.lumc.sasc.biopet.tools

/**
 * Created by wyleung on 25-9-15.
 */

import java.io.{ PrintWriter, File }

import nl.lumc.sasc.biopet.utils.ConfigUtils._
import nl.lumc.sasc.biopet.utils.ToolCommand
Wai Yi Leung's avatar
Wai Yi Leung committed
26
import scala.collection.mutable.ListBuffer
Wai Yi Leung's avatar
Wai Yi Leung committed
27
import scala.collection.mutable
28
29
30
31
32
33
34
35
36
37

import scala.io.Source

case class KrakenHit(taxonomyID: Long,
                     taxonomyName: String,
                     cladeCount: Long,
                     cladeSize: Long, // size of parent - including itself
                     taxonRank: String,
                     cladeLevel: Int,
                     parentTaxonomyID: Long,
Wai Yi Leung's avatar
Wai Yi Leung committed
38
                     children: ListBuffer[KrakenHit]) {
Wai Yi Leung's avatar
Wai Yi Leung committed
39
40
  def toJSON(withChildren: Boolean = false): Map[String, Any] = {
    val childJSON = if (withChildren) children.toList.map(entry => entry.toJSON(withChildren)) else List()
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
    Map(
      "name" -> taxonomyName,
      "taxid" -> taxonomyID,
      "taxonrank" -> taxonRank,
      "cladelevel" -> cladeLevel,
      "count" -> cladeCount,
      "size" -> cladeSize,
      "children" -> childJSON
    )
  }

}

object KrakenReportToJson extends ToolCommand {

  var cladeIDs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill(32)(0)
  val spacePattern = "^( +)".r
Wai Yi Leung's avatar
Wai Yi Leung committed
58
  private var lines: Map[Long, KrakenHit] = Map.empty
59

60
  case class Args(krakenreport: File = null, outputJson: Option[File] = None, skipNames: Boolean = false) extends AbstractArgs
61
62
63
64
65
66
67
68
69
70
71
72
73

  class OptParser extends AbstractOptParser {

    head(
      s"""
         |$commandName - Convert Kraken-report (full) output to JSON
      """.stripMargin)

    opt[File]('i', "krakenreport") required () unbounded () valueName "<krakenreport>" action { (x, c) =>
      c.copy(krakenreport = x)
    } validate {
      x => if (x.exists) success else failure("Krakenreport not found")
    } text "Kraken report to generate stats from"
74

75
76
77
    opt[File]('o', "output") unbounded () valueName "<json>" action { (x, c) =>
      c.copy(outputJson = Some(x))
    } text "File to write output to, if not supplied output go to stdout"
78
79
80
81
82

    opt[Boolean]('n', "skipnames") unbounded () valueName "<skipnames>" action { (x, c) =>
      c.copy(skipNames = x)
    } text "Don't report the scientific name of the taxon."

83
84
85
86
87
88
89
90
91
92
93
94
  }

  /**
   * Parses the command line argument
   *
   * @param args Array of arguments
   * @return
   */
  def parseArgs(args: Array[String]): Args = new OptParser()
    .parse(args, Args())
    .getOrElse(sys.exit(1))

95
  def parseLine(krakenRawHit: String, skipNames: Boolean): Map[Long, KrakenHit] = {
Wai Yi Leung's avatar
Wai Yi Leung committed
96
    val values: Array[String] = krakenRawHit.stripLineEnd.split("\t")
97
98
99

    assert(values.length == 6)

Wai Yi Leung's avatar
Wai Yi Leung committed
100
101
    val scientificName: String = values(5)
    val cladeLevel = spacePattern.findFirstIn(scientificName).getOrElse("").length / 2
102

Wai Yi Leung's avatar
Wai Yi Leung committed
103
104
    if (cladeIDs.length <= cladeLevel + 1) {
      cladeIDs ++= mutable.ArrayBuffer.fill(10)(0L)
105
106
    }

Wai Yi Leung's avatar
Wai Yi Leung committed
107
108
109
110
    cladeIDs(cladeLevel + 1) = values(4).toLong
    Map(
      values(4).toLong -> new KrakenHit(
        taxonomyID = values(4).toLong,
111
        taxonomyName = if (skipNames) "" else scientificName.trim,
Wai Yi Leung's avatar
Wai Yi Leung committed
112
113
114
115
116
117
118
        cladeCount = values(2).toLong,
        cladeSize = values(1).toLong,
        taxonRank = values(3),
        cladeLevel = cladeLevel,
        parentTaxonomyID = cladeIDs(cladeLevel),
        children = ListBuffer()
      ))
119
120
  }

121
  def reportToJson(reportRaw: File, skipNames: Boolean): String = {
122
    val reader = Source.fromFile(reportRaw)
123
    //    val lines = reader.getLines().toList.filter(!_.isEmpty)
124
125
126
127
128
129
130
131
132
133
134
135

    /*
    * http://ccb.jhu.edu/software/kraken/MANUAL.html
    * The header layout is:
    * 1. Percentage of reads covered by the clade rooted at this taxon
    * 2. Number of reads covered by the clade rooted at this taxon
    * 3. Number of reads assigned directly to this taxon
    * 4. A rank code, indicating (U)nclassified, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. All other ranks are simply '-'.
    * 5. NCBI taxonomy ID
    * 6. indented scientific name
    * */

Wai Yi Leung's avatar
Wai Yi Leung committed
136
    lines = reader.getLines()
137
      .map(line => parseLine(line, skipNames))
138
139
140
141
      .filter(p => p.head._2.cladeSize > 0)
      .foldLeft(Map.empty[Long, KrakenHit])((a, b) => {
        a + b.head
      })
142

Wai Yi Leung's avatar
Wai Yi Leung committed
143
144
    lines.keys.foreach(k => {
      // append itself to the children attribute of the parent
Wai Yi Leung's avatar
Wai Yi Leung committed
145
146
147
148
      if (lines(k).parentTaxonomyID > 0L) {
        // avoid the root and unclassified appending to the unclassified node
        lines(lines(k).parentTaxonomyID).children += lines(k)
      }
Wai Yi Leung's avatar
Wai Yi Leung committed
149
150
    })

Wai Yi Leung's avatar
Wai Yi Leung committed
151
    val result = Map("unclassified" -> lines(0).toJSON(withChildren = false),
Wai Yi Leung's avatar
Wai Yi Leung committed
152
153
154
155
      "classified" -> lines(1).toJSON(withChildren = true))
    mapToJson(result).spaces2

  }
156
157
158
159

  def main(args: Array[String]): Unit = {
    val commandArgs: Args = parseArgs(args)

160
    val jsonString: String = reportToJson(commandArgs.krakenreport, skipNames = commandArgs.skipNames)
161
    commandArgs.outputJson match {
Wai Yi Leung's avatar
Wai Yi Leung committed
162
      case Some(file) =>
163
164
165
166
167
168
169
170
        val writer = new PrintWriter(file)
        writer.println(jsonString)
        writer.close()
      case _ => println(jsonString)
    }

  }
}