Seqstat.scala 10.9 KB
Newer Older
wyleung's avatar
wyleung committed
1
2
3
4
5
6
7
8
/**
 * Copyright (c) 2014 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl>
 * @author Wai Yi Leung <w.y.leung@lumc.nl>
 *
 */
package nl.lumc.sasc.biopet.tools

import java.io.File
9
import nl.lumc.sasc.biopet.core.summary.Summarizable
Wai Yi Leung's avatar
Wai Yi Leung committed
10
11
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }

wyleung's avatar
wyleung committed
12
import scala.collection.JavaConverters._
Peter van 't Hof's avatar
Peter van 't Hof committed
13
import scala.collection.mutable
14
import scala.collection.immutable.Map
Wai Yi Leung's avatar
Wai Yi Leung committed
15
import scala.io.Source
wyleung's avatar
wyleung committed
16
import scala.language.postfixOps
wyleung's avatar
wyleung committed
17

wyleung's avatar
wyleung committed
18
import htsjdk.samtools.fastq.{ FastqReader, FastqRecord }
19
20
import scalaz._, Scalaz._
import argonaut._, Argonaut._
wyleung's avatar
wyleung committed
21
22
23
24

import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable
25
import nl.lumc.sasc.biopet.utils.ConfigUtils
wyleung's avatar
wyleung committed
26
27
28
29
30
31

/**
 * Seqstat function class for usage in Biopet pipelines
 *
 * @param root Configuration object for the pipeline
 */
32
class Seqstat(val root: Configurable) extends BiopetJavaCommandLineFunction with Summarizable {
wyleung's avatar
wyleung committed
33
  javaMainClass = getClass.getName
Wai Yi Leung's avatar
Wai Yi Leung committed
34
35

  @Input(doc = "Input FASTQ", shortName = "input", required = true)
36
  var input: File = null
Wai Yi Leung's avatar
Wai Yi Leung committed
37
38

  @Output(doc = "Output JSON", shortName = "output", required = true)
39
  var output: File = null
Wai Yi Leung's avatar
Wai Yi Leung committed
40

41
42
  override val defaultVmem = "3G"
  memoryLimit = Option(1.0)
Wai Yi Leung's avatar
Wai Yi Leung committed
43
44
45

  override def commandLine = super.commandLine + required("-i", input) + " > " + required(output)

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
  def summaryStats: Map[String, Any] = {
    val map = ConfigUtils.fileToConfigMap(output)

    ConfigUtils.any2map(map.getOrElse("stats", Map()))
  }

  def summaryFiles: Map[String, File] = Map()

  override def resolveSummaryConflict(v1: Any, v2: Any, key: String): Any = {
    (v1, v2) match {
      case (v1: Int, v2: Int) if key == "len_min" => if (v1 < v2) v1 else v2
      case (v1: Int, v2: Int) if key == "len_max" => if (v1 > v2) v1 else v2
      case (v1: Int, v2: Int)                     => v1 + v2
      case _                                      => v1
    }
Wai Yi Leung's avatar
Wai Yi Leung committed
61
  }
wyleung's avatar
wyleung committed
62
63
}

Wai Yi Leung's avatar
Wai Yi Leung committed
64
65
66
67
68
69
70
object FqEncoding extends Enumeration {
  type FqEncoding = Value
  val Sanger = Value(33, "Sanger")
  val Solexa = Value(64, "Solexa")
  val Unknown = Value(0, "Unknown")
}

wyleung's avatar
wyleung committed
71
object Seqstat extends ToolCommand {
Wai Yi Leung's avatar
Wai Yi Leung committed
72
73
74
  def apply(root: Configurable, input: File, output: File): Seqstat = {
    val seqstat = new Seqstat(root)
    seqstat.input = input
Wai Yi Leung's avatar
Wai Yi Leung committed
75
    seqstat.output = new File(output, input.getName.substring(0, input.getName.lastIndexOf(".")) + ".seqstats.json")
76
    seqstat
Wai Yi Leung's avatar
Wai Yi Leung committed
77
78
79
80
81
  }

  def apply(root: Configurable, fastqfile: File, outDir: String): Seqstat = {
    val seqstat = new Seqstat(root)
    seqstat.input = fastqfile
Wai Yi Leung's avatar
Wai Yi Leung committed
82
    seqstat.output = new File(outDir, fastqfile.getName.substring(0, fastqfile.getName.lastIndexOf(".")) + ".seqstats.json")
83
    seqstat
Wai Yi Leung's avatar
Wai Yi Leung committed
84
85
  }

Wai Yi Leung's avatar
Wai Yi Leung committed
86
  import FqEncoding._
wyleung's avatar
wyleung committed
87

Wai Yi Leung's avatar
Wai Yi Leung committed
88
  var phredEncoding: FqEncoding.Value = Sanger
89

90
91
92
93
94
95
96
97
98
99
100
  val reportValues = List(1, 10, 20, 30, 40, 50, 60)

  // the base quality for each position on the reads
  var quals: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
  var nucs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()

  // generate the baseHistogram and readHistogram
  var baseHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
  var readHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()

  var nucleotideHistoMap: mutable.Map[Char, Long] = mutable.Map()
Wai Yi Leung's avatar
Wai Yi Leung committed
101
102
  private var baseQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
  private var readQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
103

wyleung's avatar
wyleung committed
104
105
106
107
108
109
  case class Args(fastq: File = new File("")) extends AbstractArgs

  class OptParser extends AbstractOptParser {

    head(
      s"""
110
        |$commandName - Summarize FastQ
wyleung's avatar
wyleung committed
111
112
113
114
115
116
      """.stripMargin)

    opt[File]('i', "fastq") required () valueName "<fastq>" action { (x, c) =>
      c.copy(fastq = x)
    } validate {
      x => if (x.exists) success else failure("FASTQ file not found")
117
    } text "FastQ file to generate stats from"
wyleung's avatar
wyleung committed
118
119
120
121
122
123
124
125
126
127
128
129
  }

  /**
   * Parses the command line argument
   *
   * @param args Array of arguments
   * @return
   */
  def parseArgs(args: Array[String]): Args = new OptParser()
    .parse(args, Args())
    .getOrElse(sys.exit(1))

Wai Yi Leung's avatar
Wai Yi Leung committed
130
131
132
133
  /**
   *
   * @param quals Computed quality histogram [flat]
   */
134
  def detectPhredEncoding(quals: mutable.ArrayBuffer[Long]): Unit = {
Wai Yi Leung's avatar
Wai Yi Leung committed
135
    // substract 1 on high value, because we start from index 0
136
137
    val l_qual = quals.takeWhile(_ == 0).length
    val h_qual = quals.length - 1
138

Wai Yi Leung's avatar
Wai Yi Leung committed
139
140
    (l_qual < 59, h_qual > 74) match {
      case (false, true) => {
Wai Yi Leung's avatar
Wai Yi Leung committed
141
        phredEncoding = Solexa
Wai Yi Leung's avatar
Wai Yi Leung committed
142
143
144
145
146
      }
      case (true, true) => {
        // TODO: check this later on
        // complex case, we cannot tell wheter this is a sanger or solexa
        // but since the h_qual exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
Wai Yi Leung's avatar
Wai Yi Leung committed
147
        phredEncoding = Solexa
Wai Yi Leung's avatar
Wai Yi Leung committed
148
149
150
      }
      case (true, false) => {
        // this is definite a sanger sequence, the lower end is sanger only
Wai Yi Leung's avatar
Wai Yi Leung committed
151
        phredEncoding = Sanger
Wai Yi Leung's avatar
Wai Yi Leung committed
152
153
      }
      case (_, _) => {
Wai Yi Leung's avatar
Wai Yi Leung committed
154
        phredEncoding = Unknown
Wai Yi Leung's avatar
Wai Yi Leung committed
155
      }
156
157
158
    }
  }

Wai Yi Leung's avatar
Wai Yi Leung committed
159
160
161
  // Setting up the internal storage for the statistics gathered for each read
  // 'nuc' are the nucleotides 'ACTGN', the max ASCII value for this is T, pre-init the ArrayBuffer to this value
  // as we don't expect the have other 'higher' numbered Nucleotides for now.
Peter van 't Hof's avatar
Peter van 't Hof committed
162
  case class BaseStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
Wai Yi Leung's avatar
Wai Yi Leung committed
163
164
165
166
                      nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0))

  case class ReadStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
                      nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0),
Peter van 't Hof's avatar
Peter van 't Hof committed
167
                      var withN: Long = 0L,
Wai Yi Leung's avatar
Wai Yi Leung committed
168
                      lengths: mutable.ArrayBuffer[Int] = mutable.ArrayBuffer())
Peter van 't Hof's avatar
Peter van 't Hof committed
169
170

  val baseStats: mutable.ArrayBuffer[BaseStat] = mutable.ArrayBuffer()
Peter van 't Hof's avatar
Peter van 't Hof committed
171
  val readStats: ReadStat = new ReadStat()
Peter van 't Hof's avatar
Peter van 't Hof committed
172

wyleung's avatar
wyleung committed
173
174
  /**
   * Compute the quality metric per read
Wai Yi Leung's avatar
Wai Yi Leung committed
175
176
177
   * Results are stored in baseStats and readStats
   *
   * @param record FastqRecord
wyleung's avatar
wyleung committed
178
179
   */
  def procesRead(record: FastqRecord): Unit = {
Wai Yi Leung's avatar
Wai Yi Leung committed
180
181
182
183

    // Adjust/expand the length of baseStat case classes to the size of current
    // read if the current list is not long enough to store the data
    if (baseStats.length < record.length) {
Peter van 't Hof's avatar
Peter van 't Hof committed
184
185
186
      baseStats ++= mutable.ArrayBuffer.fill(record.length - baseStats.length)(BaseStat())
    }

Wai Yi Leung's avatar
Wai Yi Leung committed
187
188
189
190
    if (readStats.lengths.length < record.length) {
      readStats.lengths ++= mutable.ArrayBuffer.fill(record.length - readStats.lengths.length + 1)(0)
    }

191
192
    val readQual = record.getBaseQualityString
    val readNucleotides = record.getReadString
wyleung's avatar
wyleung committed
193

194
195
196
    if (record.length >= readStats.lengths.size) // Extends array when length not yet possible
      (0 to (record.length - readStats.lengths.size)).foreach(_ => readStats.lengths.append(0))

Wai Yi Leung's avatar
Wai Yi Leung committed
197
198
    readStats.lengths(record.length) += 1

Peter van 't Hof's avatar
Peter van 't Hof committed
199
    for (t <- 0 until record.length()) {
200
      if (baseStats(t).qual.length <= readQual(t)) {
Wai Yi Leung's avatar
Wai Yi Leung committed
201
        baseStats(t).qual ++= mutable.ArrayBuffer.fill(readQual(t).toInt - baseStats(t).qual.length + 1)(0)
Peter van 't Hof's avatar
Peter van 't Hof committed
202
      }
203
204
      baseStats(t).qual(readQual(t)) += 1
      baseStats(t).nuc(readNucleotides(t)) += 1
Wai Yi Leung's avatar
Wai Yi Leung committed
205
      readStats.nuc(readNucleotides(t)) += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
206
    }
wyleung's avatar
wyleung committed
207
208

    // implicit conversion to Int using foldLeft(0)
209
    val avgQual: Int = (readQual.sum / readQual.length)
Wai Yi Leung's avatar
Wai Yi Leung committed
210
211
212
213
    if (readStats.qual.length <= avgQual) {
      readStats.qual ++= mutable.ArrayBuffer.fill(avgQual - readStats.qual.length + 1)(0)
    }
    readStats.qual(avgQual) += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
214
    if (readNucleotides.contains("N")) readStats.withN += 1L
Peter van 't Hof's avatar
Peter van 't Hof committed
215
216
  }

Wai Yi Leung's avatar
Wai Yi Leung committed
217
218
219
220
221
222
223
224
  /**
   * seqStat, the compute entrypoint where all statistics collection starts
   *
   * @param fqreader FastqReader
   * @return numReads - number of reads counted
   */
  def seqStat(fqreader: FastqReader): Long = {
    var numReads: Long = 0
225
    for (read <- fqreader.iterator.asScala) {
wyleung's avatar
wyleung committed
226
      procesRead(read)
Wai Yi Leung's avatar
Wai Yi Leung committed
227
      numReads += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
228
    }
229
230
    numReads
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
231

232
  def summarize(): Unit = {
233
    // for every position to the max length of any read
wyleung's avatar
wyleung committed
234
235
    for (pos <- 0 until baseStats.length) {
      // list all qualities at this particular position `pos`
236
      // fix the length of `quals`
wyleung's avatar
wyleung committed
237
      if (quals.length <= baseStats(pos).qual.length) {
Wai Yi Leung's avatar
Wai Yi Leung committed
238
        quals ++= mutable.ArrayBuffer.fill(baseStats(pos).qual.length - quals.length)(0)
239
240
241
      }
      if (nucs.length <= baseStats(pos).nuc.length) {
        for (_ <- nucs.length until baseStats(pos).nuc.length) nucs.append(0)
wyleung's avatar
wyleung committed
242
      }
243
      // count into the quals
wyleung's avatar
wyleung committed
244
      baseStats(pos).qual.zipWithIndex foreach { case (value, index) => quals(index) += value }
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
      // count N into nucs
      baseStats(pos).nuc.zipWithIndex foreach { case (value, index) => nucs(index) += value }
    }
    detectPhredEncoding(quals)

    for (pos <- 0 until nucs.length) {
      // always export the N-nucleotide
      if (nucs(pos) > 0 || pos.toChar == 'N') {
        nucleotideHistoMap += (pos.toChar -> nucs(pos))
      }
    }

    // init baseHistogram with the bounderies of the report values
    for (pos <- 0 until reportValues.max + 1) {
      baseHistogram.append(0)
      readHistogram.append(0)
wyleung's avatar
wyleung committed
261
262
    }

263
    for (pos <- 0 until quals.length) {
Wai Yi Leung's avatar
Wai Yi Leung committed
264
      var key: Int = pos - phredEncoding.id
265
266
      if (key > 0) {
        // count till the max of baseHistogram.length
Wai Yi Leung's avatar
Wai Yi Leung committed
267
        for (histokey <- 0 until key + 1) {
268
269
270
271
272
273
274
275
276
          baseHistogram(histokey) += quals(pos)
        }
      }
    }

    for (pos <- 0 until baseHistogram.length) {
      baseQualHistoMap += (pos -> baseHistogram(pos))
    }

Wai Yi Leung's avatar
Wai Yi Leung committed
277
    for (pos <- 0 until readStats.qual.length) {
Peter van 't Hof's avatar
Peter van 't Hof committed
278
      val key: Int = pos - phredEncoding.id
279
280
      if (key > 0) {
        // count till the max of baseHistogram.length
Wai Yi Leung's avatar
Wai Yi Leung committed
281
        for (histokey <- 0 until key + 1) {
Wai Yi Leung's avatar
Wai Yi Leung committed
282
          readHistogram(histokey) += readStats.qual(pos)
283
284
285
286
287
288
289
        }
      }
    }

    for (pos <- 0 until readHistogram.length) {
      readQualHistoMap += (pos -> readHistogram(pos))
    }
wyleung's avatar
wyleung committed
290

291
292
293
294
295
296
  }

  def main(args: Array[String]): Unit = {

    val commandArgs: Args = parseArgs(args)

Wai Yi Leung's avatar
Wai Yi Leung committed
297
    logger.info("Start seqstat")
298
299

    val reader = new FastqReader(commandArgs.fastq)
Wai Yi Leung's avatar
Wai Yi Leung committed
300
    val numReads = seqStat(reader)
301
302
    summarize()

303
304
    logger.debug(nucs)
    //    logger.debug(baseStats)
Wai Yi Leung's avatar
Wai Yi Leung committed
305
    logger.info("Seqstat done")
wyleung's avatar
wyleung committed
306

307
308
309
310
311
312
313
314
315
316
317
318
    val report: Map[String, Any] = Map(
      ("files",
        Map(
          ("fastq", Map(
            ("path", commandArgs.fastq),
            ("checksum_sha1", "")
          )
          )
        )
      ),
      ("stats", Map(
        ("bases", Map(
Peter van 't Hof's avatar
Peter van 't Hof committed
319
          ("num_n", nucleotideHistoMap.getOrElse('N', 0)),
320
321
322
323
324
          ("num_total", nucleotideHistoMap.values.sum),
          ("num_qual_gte", baseQualHistoMap.toMap),
          ("nucleotides", nucleotideHistoMap.toMap)
        )),
        ("reads", Map(
Wai Yi Leung's avatar
Wai Yi Leung committed
325
326
327
          ("num_with_n", readStats.withN),
          ("num_total", readStats.qual.sum),
          ("len_min", readStats.lengths.takeWhile(_ == 0).length),
Wai Yi Leung's avatar
Wai Yi Leung committed
328
          ("len_max", readStats.lengths.length - 1),
329
          ("num_qual_gte", readQualHistoMap.toMap),
Wai Yi Leung's avatar
Wai Yi Leung committed
330
          ("qual_encoding", phredEncoding.toString.toLowerCase)
331
332
333
334
        ))
      ))
    )

Peter van 't Hof's avatar
Peter van 't Hof committed
335
    println(ConfigUtils.mapToJson(report).spaces2)
wyleung's avatar
wyleung committed
336
337
  }
}