KrakenReportToJson.scala 5.66 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
package nl.lumc.sasc.biopet.tools

/**
 * Created by wyleung on 25-9-15.
 */

22
import java.io.{File, PrintWriter}
23
24
25
26

import nl.lumc.sasc.biopet.utils.ConfigUtils._
import nl.lumc.sasc.biopet.utils.ToolCommand

27
28
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
29
30
31
32
33
34
35
36
37
import scala.io.Source

case class KrakenHit(taxonomyID: Long,
                     taxonomyName: String,
                     cladeCount: Long,
                     cladeSize: Long, // size of parent - including itself
                     taxonRank: String,
                     cladeLevel: Int,
                     parentTaxonomyID: Long,
Wai Yi Leung's avatar
Wai Yi Leung committed
38
                     children: ListBuffer[KrakenHit]) {
Wai Yi Leung's avatar
Wai Yi Leung committed
39
40
  def toJSON(withChildren: Boolean = false): Map[String, Any] = {
    val childJSON = if (withChildren) children.toList.map(entry => entry.toJSON(withChildren)) else List()
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
    Map(
      "name" -> taxonomyName,
      "taxid" -> taxonomyID,
      "taxonrank" -> taxonRank,
      "cladelevel" -> cladeLevel,
      "count" -> cladeCount,
      "size" -> cladeSize,
      "children" -> childJSON
    )
  }

}

object KrakenReportToJson extends ToolCommand {

  var cladeIDs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill(32)(0)
  val spacePattern = "^( +)".r
Wai Yi Leung's avatar
Wai Yi Leung committed
58
  private var lines: Map[Long, KrakenHit] = Map.empty
59

60
  case class Args(krakenreport: File = null, outputJson: Option[File] = None, skipNames: Boolean = false) extends AbstractArgs
61
62
63
64
65
66
67
68
69
70
71
72
73

  class OptParser extends AbstractOptParser {

    head(
      s"""
         |$commandName - Convert Kraken-report (full) output to JSON
      """.stripMargin)

    opt[File]('i', "krakenreport") required () unbounded () valueName "<krakenreport>" action { (x, c) =>
      c.copy(krakenreport = x)
    } validate {
      x => if (x.exists) success else failure("Krakenreport not found")
    } text "Kraken report to generate stats from"
74

75
76
77
    opt[File]('o', "output") unbounded () valueName "<json>" action { (x, c) =>
      c.copy(outputJson = Some(x))
    } text "File to write output to, if not supplied output go to stdout"
78
79
80
81
82

    opt[Boolean]('n', "skipnames") unbounded () valueName "<skipnames>" action { (x, c) =>
      c.copy(skipNames = x)
    } text "Don't report the scientific name of the taxon."

83
84
85
86
87
88
89
90
91
92
93
94
  }

  /**
   * Parses the command line argument
   *
   * @param args Array of arguments
   * @return
   */
  def parseArgs(args: Array[String]): Args = new OptParser()
    .parse(args, Args())
    .getOrElse(sys.exit(1))

95
  def parseLine(krakenRawHit: String, skipNames: Boolean): Map[Long, KrakenHit] = {
Wai Yi Leung's avatar
Wai Yi Leung committed
96
    val values: Array[String] = krakenRawHit.stripLineEnd.split("\t")
97
98
99

    assert(values.length == 6)

Wai Yi Leung's avatar
Wai Yi Leung committed
100
101
    val scientificName: String = values(5)
    val cladeLevel = spacePattern.findFirstIn(scientificName).getOrElse("").length / 2
102

Wai Yi Leung's avatar
Wai Yi Leung committed
103
104
    if (cladeIDs.length <= cladeLevel + 1) {
      cladeIDs ++= mutable.ArrayBuffer.fill(10)(0L)
105
106
    }

Wai Yi Leung's avatar
Wai Yi Leung committed
107
108
109
110
    cladeIDs(cladeLevel + 1) = values(4).toLong
    Map(
      values(4).toLong -> new KrakenHit(
        taxonomyID = values(4).toLong,
111
        taxonomyName = if (skipNames) "" else scientificName.trim,
Wai Yi Leung's avatar
Wai Yi Leung committed
112
113
114
115
116
117
118
        cladeCount = values(2).toLong,
        cladeSize = values(1).toLong,
        taxonRank = values(3),
        cladeLevel = cladeLevel,
        parentTaxonomyID = cladeIDs(cladeLevel),
        children = ListBuffer()
      ))
119
120
  }

121
  def reportToJson(reportRaw: File, skipNames: Boolean): String = {
122
123
124
125
126
127
128
129
130
131
132
133
134
    val reader = Source.fromFile(reportRaw)

    /*
    * http://ccb.jhu.edu/software/kraken/MANUAL.html
    * The header layout is:
    * 1. Percentage of reads covered by the clade rooted at this taxon
    * 2. Number of reads covered by the clade rooted at this taxon
    * 3. Number of reads assigned directly to this taxon
    * 4. A rank code, indicating (U)nclassified, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. All other ranks are simply '-'.
    * 5. NCBI taxonomy ID
    * 6. indented scientific name
    * */

Wai Yi Leung's avatar
Wai Yi Leung committed
135
    lines = reader.getLines()
136
      .map(line => parseLine(line, skipNames))
137
      .filter(p => (p.head._2.cladeSize > 0) || List(0L, 1L).contains(p.head._2.taxonomyID))
138
139
140
      .foldLeft(Map.empty[Long, KrakenHit])((a, b) => {
        a + b.head
      })
141

Wai Yi Leung's avatar
Wai Yi Leung committed
142
143
    lines.keys.foreach(k => {
      // append itself to the children attribute of the parent
Wai Yi Leung's avatar
Wai Yi Leung committed
144
145
146
147
      if (lines(k).parentTaxonomyID > 0L) {
        // avoid the root and unclassified appending to the unclassified node
        lines(lines(k).parentTaxonomyID).children += lines(k)
      }
Wai Yi Leung's avatar
Wai Yi Leung committed
148
149
    })

Wai Yi Leung's avatar
Wai Yi Leung committed
150
    val result = Map("unclassified" -> lines(0).toJSON(withChildren = false),
Wai Yi Leung's avatar
Wai Yi Leung committed
151
152
153
154
      "classified" -> lines(1).toJSON(withChildren = true))
    mapToJson(result).spaces2

  }
155
156
157
158

  def main(args: Array[String]): Unit = {
    val commandArgs: Args = parseArgs(args)

159
    val jsonString: String = reportToJson(commandArgs.krakenreport, skipNames = commandArgs.skipNames)
160
    commandArgs.outputJson match {
Wai Yi Leung's avatar
Wai Yi Leung committed
161
      case Some(file) =>
162
163
164
165
166
167
168
169
        val writer = new PrintWriter(file)
        writer.println(jsonString)
        writer.close()
      case _ => println(jsonString)
    }

  }
}