KrakenReportToJson.scala 5.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
package nl.lumc.sasc.biopet.tools

/**
 * Created by wyleung on 25-9-15.
 */

import java.io.{ PrintWriter, File }

import nl.lumc.sasc.biopet.utils.ConfigUtils._
import nl.lumc.sasc.biopet.utils.ToolCommand
Wai Yi Leung's avatar
Wai Yi Leung committed
26
import scala.collection.mutable.ListBuffer
27
28
29
30
31
32
33
34
35
36
37
import scala.collection.{ immutable, mutable }

import scala.io.Source

case class KrakenHit(taxonomyID: Long,
                     taxonomyName: String,
                     cladeCount: Long,
                     cladeSize: Long, // size of parent - including itself
                     taxonRank: String,
                     cladeLevel: Int,
                     parentTaxonomyID: Long,
Wai Yi Leung's avatar
Wai Yi Leung committed
38
                     children: ListBuffer[KrakenHit]) {
39
  def toJSON(): Map[String, Any] = {
Wai Yi Leung's avatar
Wai Yi Leung committed
40
    val childJSON = children.toList.map(entry => entry.toJSON())
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
    Map(
      "name" -> taxonomyName,
      "taxid" -> taxonomyID,
      "taxonrank" -> taxonRank,
      "cladelevel" -> cladeLevel,
      "count" -> cladeCount,
      "size" -> cladeSize,
      "children" -> childJSON
    )
  }

}

object KrakenReportToJson extends ToolCommand {

  var cladeIDs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill(32)(0)
  val spacePattern = "^( +)".r

59
  case class Args(krakenreport: File = null, outputJson: Option[File] = None, skipNames: Boolean = true) extends AbstractArgs
60
61
62
63
64
65
66
67
68
69
70
71
72

  class OptParser extends AbstractOptParser {

    head(
      s"""
         |$commandName - Convert Kraken-report (full) output to JSON
      """.stripMargin)

    opt[File]('i', "krakenreport") required () unbounded () valueName "<krakenreport>" action { (x, c) =>
      c.copy(krakenreport = x)
    } validate {
      x => if (x.exists) success else failure("Krakenreport not found")
    } text "Kraken report to generate stats from"
73

74
75
76
    opt[File]('o', "output") unbounded () valueName "<json>" action { (x, c) =>
      c.copy(outputJson = Some(x))
    } text "File to write output to, if not supplied output go to stdout"
77
78
79
80
81

    opt[Boolean]('n', "skipnames") unbounded () valueName "<skipnames>" action { (x, c) =>
      c.copy(skipNames = x)
    } text "Don't report the scientific name of the taxon."

82
83
84
85
86
87
88
89
90
91
92
93
  }

  /**
   * Parses the command line argument
   *
   * @param args Array of arguments
   * @return
   */
  def parseArgs(args: Array[String]): Args = new OptParser()
    .parse(args, Args())
    .getOrElse(sys.exit(1))

94
  def parseLine(krakenRawHit: String, skipNames: Boolean): Map[Long, KrakenHit] = {
Wai Yi Leung's avatar
Wai Yi Leung committed
95
96
97
    val values: Array[String] = krakenRawHit.stripLineEnd.split("\t")
    val scientificName: String = values(5)
    val cladeLevel = spacePattern.findFirstIn(scientificName).getOrElse("").length / 2
98

Wai Yi Leung's avatar
Wai Yi Leung committed
99
100
    if (cladeIDs.length <= cladeLevel + 1) {
      cladeIDs ++= mutable.ArrayBuffer.fill(10)(0L)
101
102
    }

Wai Yi Leung's avatar
Wai Yi Leung committed
103
104
105
106
    cladeIDs(cladeLevel + 1) = values(4).toLong
    Map(
      values(4).toLong -> new KrakenHit(
        taxonomyID = values(4).toLong,
107
        taxonomyName = if (skipNames) "" else scientificName.trim,
Wai Yi Leung's avatar
Wai Yi Leung committed
108
109
110
111
112
113
114
        cladeCount = values(2).toLong,
        cladeSize = values(1).toLong,
        taxonRank = values(3),
        cladeLevel = cladeLevel,
        parentTaxonomyID = cladeIDs(cladeLevel),
        children = ListBuffer()
      ))
115
116
  }

117
  def reportToJson(reportRaw: File, skipNames: Boolean): String = {
118
    val reader = Source.fromFile(reportRaw)
119
    //    val lines = reader.getLines().toList.filter(!_.isEmpty)
120
121
122
123
124
125
126
127
128
129
130
131

    /*
    * http://ccb.jhu.edu/software/kraken/MANUAL.html
    * The header layout is:
    * 1. Percentage of reads covered by the clade rooted at this taxon
    * 2. Number of reads covered by the clade rooted at this taxon
    * 3. Number of reads assigned directly to this taxon
    * 4. A rank code, indicating (U)nclassified, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. All other ranks are simply '-'.
    * 5. NCBI taxonomy ID
    * 6. indented scientific name
    * */

Wai Yi Leung's avatar
Wai Yi Leung committed
132
    val lines = reader.getLines()
133
      .map(line => parseLine(line, skipNames))
134
135
136
137
      .filter(p => p.head._2.cladeSize > 0)
      .foldLeft(Map.empty[Long, KrakenHit])((a, b) => {
        a + b.head
      })
138

Wai Yi Leung's avatar
Wai Yi Leung committed
139
140
141
142
143
144
    lines.keys.foreach(k => {
      // append itself to the children attribute of the parent
      lines(lines(k).parentTaxonomyID).children += lines(k)
    })

    mapToJson(lines(1).toJSON()).spaces2
145
146
147
148
149
150

  }

  def main(args: Array[String]): Unit = {
    val commandArgs: Args = parseArgs(args)

151
    val jsonString: String = reportToJson(commandArgs.krakenreport, skipNames = commandArgs.skipNames)
152
153
154
155
156
157
158
159
160
161
162
    commandArgs.outputJson match {
      case Some(file) => {
        val writer = new PrintWriter(file)
        writer.println(jsonString)
        writer.close()
      }
      case _ => println(jsonString)
    }

  }
}