SeqStat.scala 11 KB
Newer Older
wyleung's avatar
wyleung committed
1
/**
bow's avatar
bow committed
2
3
4
5
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
wyleung's avatar
wyleung committed
6
 *
bow's avatar
bow committed
7
8
9
10
11
12
13
14
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
wyleung's avatar
wyleung committed
15
16
17
18
 */
package nl.lumc.sasc.biopet.tools

import java.io.File
19
import nl.lumc.sasc.biopet.core.summary.Summarizable
Wai Yi Leung's avatar
Wai Yi Leung committed
20
21
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }

wyleung's avatar
wyleung committed
22
import scala.collection.JavaConverters._
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import scala.collection.mutable
24
import scala.collection.immutable.Map
Wai Yi Leung's avatar
Wai Yi Leung committed
25
import scala.io.Source
wyleung's avatar
wyleung committed
26
import scala.language.postfixOps
wyleung's avatar
wyleung committed
27

wyleung's avatar
wyleung committed
28
import htsjdk.samtools.fastq.{ FastqReader, FastqRecord }
29
30
import scalaz._, Scalaz._
import argonaut._, Argonaut._
wyleung's avatar
wyleung committed
31

32
import nl.lumc.sasc.biopet.core.{ ToolCommandFuntion, BiopetJavaCommandLineFunction, ToolCommand }
wyleung's avatar
wyleung committed
33
import nl.lumc.sasc.biopet.core.config.Configurable
34
import nl.lumc.sasc.biopet.utils.ConfigUtils
wyleung's avatar
wyleung committed
35
36
37
38
39
40

/**
 * Seqstat function class for usage in Biopet pipelines
 *
 * @param root Configuration object for the pipeline
 */
bow's avatar
bow committed
41
class SeqStat(val root: Configurable) extends ToolCommandFuntion with Summarizable {
wyleung's avatar
wyleung committed
42
  javaMainClass = getClass.getName
Wai Yi Leung's avatar
Wai Yi Leung committed
43
44

  @Input(doc = "Input FASTQ", shortName = "input", required = true)
45
  var input: File = null
Wai Yi Leung's avatar
Wai Yi Leung committed
46
47

  @Output(doc = "Output JSON", shortName = "output", required = true)
48
  var output: File = null
Wai Yi Leung's avatar
Wai Yi Leung committed
49

Peter van 't Hof's avatar
Peter van 't Hof committed
50
  override val defaultCoreMemory = 2.5
Wai Yi Leung's avatar
Wai Yi Leung committed
51
52
53

  override def commandLine = super.commandLine + required("-i", input) + " > " + required(output)

54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
  def summaryStats: Map[String, Any] = {
    val map = ConfigUtils.fileToConfigMap(output)

    ConfigUtils.any2map(map.getOrElse("stats", Map()))
  }

  def summaryFiles: Map[String, File] = Map()

  override def resolveSummaryConflict(v1: Any, v2: Any, key: String): Any = {
    (v1, v2) match {
      case (v1: Int, v2: Int) if key == "len_min" => if (v1 < v2) v1 else v2
      case (v1: Int, v2: Int) if key == "len_max" => if (v1 > v2) v1 else v2
      case (v1: Int, v2: Int)                     => v1 + v2
      case _                                      => v1
    }
Wai Yi Leung's avatar
Wai Yi Leung committed
69
  }
wyleung's avatar
wyleung committed
70
71
}

Wai Yi Leung's avatar
Wai Yi Leung committed
72
73
74
75
76
77
78
object FqEncoding extends Enumeration {
  type FqEncoding = Value
  val Sanger = Value(33, "Sanger")
  val Solexa = Value(64, "Solexa")
  val Unknown = Value(0, "Unknown")
}

bow's avatar
bow committed
79
80
81
object SeqStat extends ToolCommand {
  def apply(root: Configurable, input: File, output: File): SeqStat = {
    val seqstat = new SeqStat(root)
Wai Yi Leung's avatar
Wai Yi Leung committed
82
    seqstat.input = input
Wai Yi Leung's avatar
Wai Yi Leung committed
83
    seqstat.output = new File(output, input.getName.substring(0, input.getName.lastIndexOf(".")) + ".seqstats.json")
84
    seqstat
Wai Yi Leung's avatar
Wai Yi Leung committed
85
86
  }

bow's avatar
bow committed
87
88
  def apply(root: Configurable, fastqfile: File, outDir: String): SeqStat = {
    val seqstat = new SeqStat(root)
Wai Yi Leung's avatar
Wai Yi Leung committed
89
    seqstat.input = fastqfile
Wai Yi Leung's avatar
Wai Yi Leung committed
90
    seqstat.output = new File(outDir, fastqfile.getName.substring(0, fastqfile.getName.lastIndexOf(".")) + ".seqstats.json")
91
    seqstat
Wai Yi Leung's avatar
Wai Yi Leung committed
92
93
  }

Wai Yi Leung's avatar
Wai Yi Leung committed
94
  import FqEncoding._
wyleung's avatar
wyleung committed
95

Wai Yi Leung's avatar
Wai Yi Leung committed
96
  var phredEncoding: FqEncoding.Value = Sanger
97

98
99
100
101
102
103
104
105
106
107
108
  val reportValues = List(1, 10, 20, 30, 40, 50, 60)

  // the base quality for each position on the reads
  var quals: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
  var nucs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()

  // generate the baseHistogram and readHistogram
  var baseHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
  var readHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()

  var nucleotideHistoMap: mutable.Map[Char, Long] = mutable.Map()
Wai Yi Leung's avatar
Wai Yi Leung committed
109
110
  private var baseQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
  private var readQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
111

wyleung's avatar
wyleung committed
112
113
114
115
116
117
  case class Args(fastq: File = new File("")) extends AbstractArgs

  class OptParser extends AbstractOptParser {

    head(
      s"""
118
        |$commandName - Summarize FastQ
wyleung's avatar
wyleung committed
119
120
121
122
123
124
      """.stripMargin)

    opt[File]('i', "fastq") required () valueName "<fastq>" action { (x, c) =>
      c.copy(fastq = x)
    } validate {
      x => if (x.exists) success else failure("FASTQ file not found")
125
    } text "FastQ file to generate stats from"
wyleung's avatar
wyleung committed
126
127
128
129
130
131
132
133
134
135
136
137
  }

  /**
   * Parses the command line argument
   *
   * @param args Array of arguments
   * @return
   */
  def parseArgs(args: Array[String]): Args = new OptParser()
    .parse(args, Args())
    .getOrElse(sys.exit(1))

Wai Yi Leung's avatar
Wai Yi Leung committed
138
139
140
141
  /**
   *
   * @param quals Computed quality histogram [flat]
   */
142
  def detectPhredEncoding(quals: mutable.ArrayBuffer[Long]): Unit = {
Wai Yi Leung's avatar
Wai Yi Leung committed
143
    // substract 1 on high value, because we start from index 0
144
145
    val l_qual = quals.takeWhile(_ == 0).length
    val h_qual = quals.length - 1
146

Wai Yi Leung's avatar
Wai Yi Leung committed
147
    (l_qual < 59, h_qual > 74) match {
bow's avatar
bow committed
148
149
150
151
152
153
154
155
      case (false, true) => phredEncoding = Solexa
      // TODO: check this later on
      // complex case, we cannot tell wheter this is a sanger or solexa
      // but since the h_qual exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
      case (true, true)  => phredEncoding = Solexa
      // this is definite a sanger sequence, the lower end is sanger only
      case (true, false) => phredEncoding = Sanger
      case (_, _)        => phredEncoding = Unknown
156
157
158
    }
  }

Wai Yi Leung's avatar
Wai Yi Leung committed
159
160
161
  // Setting up the internal storage for the statistics gathered for each read
  // 'nuc' are the nucleotides 'ACTGN', the max ASCII value for this is T, pre-init the ArrayBuffer to this value
  // as we don't expect the have other 'higher' numbered Nucleotides for now.
Peter van 't Hof's avatar
Peter van 't Hof committed
162
  case class BaseStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
Wai Yi Leung's avatar
Wai Yi Leung committed
163
164
165
166
                      nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0))

  case class ReadStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
                      nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0),
Peter van 't Hof's avatar
Peter van 't Hof committed
167
                      var withN: Long = 0L,
Wai Yi Leung's avatar
Wai Yi Leung committed
168
                      lengths: mutable.ArrayBuffer[Int] = mutable.ArrayBuffer())
Peter van 't Hof's avatar
Peter van 't Hof committed
169
170

  val baseStats: mutable.ArrayBuffer[BaseStat] = mutable.ArrayBuffer()
Peter van 't Hof's avatar
Peter van 't Hof committed
171
  val readStats: ReadStat = new ReadStat()
Peter van 't Hof's avatar
Peter van 't Hof committed
172

wyleung's avatar
wyleung committed
173
174
  /**
   * Compute the quality metric per read
Wai Yi Leung's avatar
Wai Yi Leung committed
175
176
177
   * Results are stored in baseStats and readStats
   *
   * @param record FastqRecord
wyleung's avatar
wyleung committed
178
179
   */
  def procesRead(record: FastqRecord): Unit = {
Wai Yi Leung's avatar
Wai Yi Leung committed
180
181
182
183

    // Adjust/expand the length of baseStat case classes to the size of current
    // read if the current list is not long enough to store the data
    if (baseStats.length < record.length) {
Peter van 't Hof's avatar
Peter van 't Hof committed
184
185
186
      baseStats ++= mutable.ArrayBuffer.fill(record.length - baseStats.length)(BaseStat())
    }

Wai Yi Leung's avatar
Wai Yi Leung committed
187
188
189
190
    if (readStats.lengths.length < record.length) {
      readStats.lengths ++= mutable.ArrayBuffer.fill(record.length - readStats.lengths.length + 1)(0)
    }

191
192
    val readQual = record.getBaseQualityString
    val readNucleotides = record.getReadString
wyleung's avatar
wyleung committed
193

194
195
196
    if (record.length >= readStats.lengths.size) // Extends array when length not yet possible
      (0 to (record.length - readStats.lengths.size)).foreach(_ => readStats.lengths.append(0))

Wai Yi Leung's avatar
Wai Yi Leung committed
197
198
    readStats.lengths(record.length) += 1

Peter van 't Hof's avatar
Peter van 't Hof committed
199
    for (t <- 0 until record.length()) {
200
      if (baseStats(t).qual.length <= readQual(t)) {
Wai Yi Leung's avatar
Wai Yi Leung committed
201
        baseStats(t).qual ++= mutable.ArrayBuffer.fill(readQual(t).toInt - baseStats(t).qual.length + 1)(0)
Peter van 't Hof's avatar
Peter van 't Hof committed
202
      }
203
204
      baseStats(t).qual(readQual(t)) += 1
      baseStats(t).nuc(readNucleotides(t)) += 1
Wai Yi Leung's avatar
Wai Yi Leung committed
205
      readStats.nuc(readNucleotides(t)) += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
206
    }
wyleung's avatar
wyleung committed
207
208

    // implicit conversion to Int using foldLeft(0)
bow's avatar
bow committed
209
    val avgQual: Int = readQual.sum / readQual.length
Wai Yi Leung's avatar
Wai Yi Leung committed
210
211
212
213
    if (readStats.qual.length <= avgQual) {
      readStats.qual ++= mutable.ArrayBuffer.fill(avgQual - readStats.qual.length + 1)(0)
    }
    readStats.qual(avgQual) += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
214
    if (readNucleotides.contains("N")) readStats.withN += 1L
Peter van 't Hof's avatar
Peter van 't Hof committed
215
216
  }

Wai Yi Leung's avatar
Wai Yi Leung committed
217
218
219
220
221
222
223
224
  /**
   * seqStat, the compute entrypoint where all statistics collection starts
   *
   * @param fqreader FastqReader
   * @return numReads - number of reads counted
   */
  def seqStat(fqreader: FastqReader): Long = {
    var numReads: Long = 0
225
    for (read <- fqreader.iterator.asScala) {
wyleung's avatar
wyleung committed
226
      procesRead(read)
Wai Yi Leung's avatar
Wai Yi Leung committed
227
      numReads += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
228
    }
229
230
    numReads
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
231

232
  def summarize(): Unit = {
233
    // for every position to the max length of any read
wyleung's avatar
wyleung committed
234
235
    for (pos <- 0 until baseStats.length) {
      // list all qualities at this particular position `pos`
236
      // fix the length of `quals`
wyleung's avatar
wyleung committed
237
      if (quals.length <= baseStats(pos).qual.length) {
Wai Yi Leung's avatar
Wai Yi Leung committed
238
        quals ++= mutable.ArrayBuffer.fill(baseStats(pos).qual.length - quals.length)(0)
239
240
241
      }
      if (nucs.length <= baseStats(pos).nuc.length) {
        for (_ <- nucs.length until baseStats(pos).nuc.length) nucs.append(0)
wyleung's avatar
wyleung committed
242
      }
243
      // count into the quals
wyleung's avatar
wyleung committed
244
      baseStats(pos).qual.zipWithIndex foreach { case (value, index) => quals(index) += value }
245
246
247
248
      // count N into nucs
      baseStats(pos).nuc.zipWithIndex foreach { case (value, index) => nucs(index) += value }
    }
    detectPhredEncoding(quals)
bow's avatar
bow committed
249
    logger.debug("Detected '" + phredEncoding.toString.toLowerCase + "' encoding in fastq file ...")
250
251
252
253
254
255
256
257
258
259
260
261

    for (pos <- 0 until nucs.length) {
      // always export the N-nucleotide
      if (nucs(pos) > 0 || pos.toChar == 'N') {
        nucleotideHistoMap += (pos.toChar -> nucs(pos))
      }
    }

    // init baseHistogram with the bounderies of the report values
    for (pos <- 0 until reportValues.max + 1) {
      baseHistogram.append(0)
      readHistogram.append(0)
wyleung's avatar
wyleung committed
262
263
    }

264
    for (pos <- 0 until quals.length) {
bow's avatar
bow committed
265
      val key: Int = pos - phredEncoding.id
266
267
      if (key >= 0) {
        baseHistogram(key) += quals(pos)
268
269
270
      }
    }

Wai Yi Leung's avatar
Wai Yi Leung committed
271
    for (pos <- 0 until readStats.qual.length) {
Peter van 't Hof's avatar
Peter van 't Hof committed
272
      val key: Int = pos - phredEncoding.id
273
274
      if (key > 0) {
        // count till the max of baseHistogram.length
Wai Yi Leung's avatar
Wai Yi Leung committed
275
        for (histokey <- 0 until key + 1) {
Wai Yi Leung's avatar
Wai Yi Leung committed
276
          readHistogram(histokey) += readStats.qual(pos)
277
278
279
280
281
282
283
        }
      }
    }

    for (pos <- 0 until readHistogram.length) {
      readQualHistoMap += (pos -> readHistogram(pos))
    }
wyleung's avatar
wyleung committed
284

285
286
287
288
289
290
  }

  def main(args: Array[String]): Unit = {

    val commandArgs: Args = parseArgs(args)

Wai Yi Leung's avatar
Wai Yi Leung committed
291
    logger.info("Start seqstat")
292
    seqStat(new FastqReader(commandArgs.fastq))
293
    summarize()
Wai Yi Leung's avatar
Wai Yi Leung committed
294
    logger.info("Seqstat done")
wyleung's avatar
wyleung committed
295

296
297
298
299
    val report: Map[String, Any] = Map(
      ("files",
        Map(
          ("fastq", Map(
bow's avatar
bow committed
300
            ("path", commandArgs.fastq))
301
302
303
304
305
306
          )
        )
      ),
      ("stats", Map(
        ("bases", Map(
          ("num_total", nucleotideHistoMap.values.sum),
307
          ("num_qual", baseHistogram.toList),
308
309
310
          ("nucleotides", nucleotideHistoMap.toMap)
        )),
        ("reads", Map(
Wai Yi Leung's avatar
Wai Yi Leung committed
311
312
313
          ("num_with_n", readStats.withN),
          ("num_total", readStats.qual.sum),
          ("len_min", readStats.lengths.takeWhile(_ == 0).length),
Wai Yi Leung's avatar
Wai Yi Leung committed
314
          ("len_max", readStats.lengths.length - 1),
315
          ("num_avg_qual_gte", readQualHistoMap.toMap),
Wai Yi Leung's avatar
Wai Yi Leung committed
316
          ("qual_encoding", phredEncoding.toString.toLowerCase)
317
318
319
320
        ))
      ))
    )

321
    println(ConfigUtils.mapToJson(report))
wyleung's avatar
wyleung committed
322
323
  }
}