SeqStat.scala 11.1 KB
Newer Older
wyleung's avatar
wyleung committed
1
/**
bow's avatar
bow committed
2
3
4
5
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
wyleung's avatar
wyleung committed
6
 *
bow's avatar
bow committed
7
8
9
10
11
12
13
14
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
wyleung's avatar
wyleung committed
15
16
17
18
 */
package nl.lumc.sasc.biopet.tools

import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
19
20
21

import htsjdk.samtools.fastq.{ FastqReader, FastqRecord }
import nl.lumc.sasc.biopet.core.config.Configurable
22
import nl.lumc.sasc.biopet.core.summary.Summarizable
Peter van 't Hof's avatar
Peter van 't Hof committed
23
24
25
import nl.lumc.sasc.biopet.core.{ ToolCommand, ToolCommandFuntion }
import nl.lumc.sasc.biopet.utils.ConfigUtils
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
Wai Yi Leung's avatar
Wai Yi Leung committed
26

wyleung's avatar
wyleung committed
27
import scala.collection.JavaConverters._
28
import scala.collection.immutable.Map
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import scala.collection.mutable
wyleung's avatar
wyleung committed
30
import scala.language.postfixOps
wyleung's avatar
wyleung committed
31
32
33
34
35
36

/**
 * Seqstat function class for usage in Biopet pipelines
 *
 * @param root Configuration object for the pipeline
 */
bow's avatar
bow committed
37
class SeqStat(val root: Configurable) extends ToolCommandFuntion with Summarizable {
wyleung's avatar
wyleung committed
38
  javaMainClass = getClass.getName
Wai Yi Leung's avatar
Wai Yi Leung committed
39
40

  @Input(doc = "Input FASTQ", shortName = "input", required = true)
41
  var input: File = null
Wai Yi Leung's avatar
Wai Yi Leung committed
42
43

  @Output(doc = "Output JSON", shortName = "output", required = true)
44
  var output: File = null
Wai Yi Leung's avatar
Wai Yi Leung committed
45

Peter van 't Hof's avatar
Peter van 't Hof committed
46
  override def defaultCoreMemory = 2.5
Wai Yi Leung's avatar
Wai Yi Leung committed
47
48
49

  override def commandLine = super.commandLine + required("-i", input) + " > " + required(output)

50
51
52
53
54
55
56
57
58
59
  def summaryStats: Map[String, Any] = {
    val map = ConfigUtils.fileToConfigMap(output)

    ConfigUtils.any2map(map.getOrElse("stats", Map()))
  }

  def summaryFiles: Map[String, File] = Map()

  override def resolveSummaryConflict(v1: Any, v2: Any, key: String): Any = {
    (v1, v2) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
60
61
      case (v1: Array[_], v2: Array[_])           => v1.zip(v2).map(v => resolveSummaryConflict(v._1, v._2, key))
      case (v1: List[_], v2: List[_])             => v1.zip(v2).map(v => resolveSummaryConflict(v._1, v._2, key))
62
63
64
      case (v1: Int, v2: Int) if key == "len_min" => if (v1 < v2) v1 else v2
      case (v1: Int, v2: Int) if key == "len_max" => if (v1 > v2) v1 else v2
      case (v1: Int, v2: Int)                     => v1 + v2
Peter van 't Hof's avatar
Peter van 't Hof committed
65
      case (v1: Long, v2: Long)                   => v1 + v2
66
67
      case _                                      => v1
    }
Wai Yi Leung's avatar
Wai Yi Leung committed
68
  }
wyleung's avatar
wyleung committed
69
70
}

Wai Yi Leung's avatar
Wai Yi Leung committed
71
72
73
74
75
76
77
object FqEncoding extends Enumeration {
  type FqEncoding = Value
  val Sanger = Value(33, "Sanger")
  val Solexa = Value(64, "Solexa")
  val Unknown = Value(0, "Unknown")
}

bow's avatar
bow committed
78
79
80
object SeqStat extends ToolCommand {
  def apply(root: Configurable, input: File, output: File): SeqStat = {
    val seqstat = new SeqStat(root)
Wai Yi Leung's avatar
Wai Yi Leung committed
81
    seqstat.input = input
Wai Yi Leung's avatar
Wai Yi Leung committed
82
    seqstat.output = new File(output, input.getName.substring(0, input.getName.lastIndexOf(".")) + ".seqstats.json")
83
    seqstat
Wai Yi Leung's avatar
Wai Yi Leung committed
84
85
  }

bow's avatar
bow committed
86
87
  def apply(root: Configurable, fastqfile: File, outDir: String): SeqStat = {
    val seqstat = new SeqStat(root)
Wai Yi Leung's avatar
Wai Yi Leung committed
88
    seqstat.input = fastqfile
Wai Yi Leung's avatar
Wai Yi Leung committed
89
    seqstat.output = new File(outDir, fastqfile.getName.substring(0, fastqfile.getName.lastIndexOf(".")) + ".seqstats.json")
90
    seqstat
Wai Yi Leung's avatar
Wai Yi Leung committed
91
92
  }

Wai Yi Leung's avatar
Wai Yi Leung committed
93
  import FqEncoding._
wyleung's avatar
wyleung committed
94

Wai Yi Leung's avatar
Wai Yi Leung committed
95
  var phredEncoding: FqEncoding.Value = Sanger
96

97
98
99
100
101
102
103
104
105
106
107
  val reportValues = List(1, 10, 20, 30, 40, 50, 60)

  // the base quality for each position on the reads
  var quals: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
  var nucs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()

  // generate the baseHistogram and readHistogram
  var baseHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
  var readHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()

  var nucleotideHistoMap: mutable.Map[Char, Long] = mutable.Map()
Wai Yi Leung's avatar
Wai Yi Leung committed
108
109
  private var baseQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
  private var readQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
110

wyleung's avatar
wyleung committed
111
112
113
114
115
116
  case class Args(fastq: File = new File("")) extends AbstractArgs

  class OptParser extends AbstractOptParser {

    head(
      s"""
117
        |$commandName - Summarize FastQ
wyleung's avatar
wyleung committed
118
119
120
121
122
123
      """.stripMargin)

    opt[File]('i', "fastq") required () valueName "<fastq>" action { (x, c) =>
      c.copy(fastq = x)
    } validate {
      x => if (x.exists) success else failure("FASTQ file not found")
124
    } text "FastQ file to generate stats from"
wyleung's avatar
wyleung committed
125
126
127
128
129
130
131
132
133
134
135
136
  }

  /**
   * Parses the command line argument
   *
   * @param args Array of arguments
   * @return
   */
  def parseArgs(args: Array[String]): Args = new OptParser()
    .parse(args, Args())
    .getOrElse(sys.exit(1))

Wai Yi Leung's avatar
Wai Yi Leung committed
137
138
139
140
  /**
   *
   * @param quals Computed quality histogram [flat]
   */
141
  def detectPhredEncoding(quals: mutable.ArrayBuffer[Long]): Unit = {
Wai Yi Leung's avatar
Wai Yi Leung committed
142
    // substract 1 on high value, because we start from index 0
143
144
    val l_qual = quals.takeWhile(_ == 0).length
    val h_qual = quals.length - 1
145

Wai Yi Leung's avatar
Wai Yi Leung committed
146
    (l_qual < 59, h_qual > 74) match {
bow's avatar
bow committed
147
148
149
150
151
152
153
154
      case (false, true) => phredEncoding = Solexa
      // TODO: check this later on
      // complex case, we cannot tell wheter this is a sanger or solexa
      // but since the h_qual exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
      case (true, true)  => phredEncoding = Solexa
      // this is definite a sanger sequence, the lower end is sanger only
      case (true, false) => phredEncoding = Sanger
      case (_, _)        => phredEncoding = Unknown
155
156
157
    }
  }

Wai Yi Leung's avatar
Wai Yi Leung committed
158
159
160
  // Setting up the internal storage for the statistics gathered for each read
  // 'nuc' are the nucleotides 'ACTGN', the max ASCII value for this is T, pre-init the ArrayBuffer to this value
  // as we don't expect the have other 'higher' numbered Nucleotides for now.
Peter van 't Hof's avatar
Peter van 't Hof committed
161
  case class BaseStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
Wai Yi Leung's avatar
Wai Yi Leung committed
162
163
164
165
                      nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0))

  case class ReadStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
                      nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0),
Peter van 't Hof's avatar
Peter van 't Hof committed
166
                      var withN: Long = 0L,
Wai Yi Leung's avatar
Wai Yi Leung committed
167
                      lengths: mutable.ArrayBuffer[Int] = mutable.ArrayBuffer())
Peter van 't Hof's avatar
Peter van 't Hof committed
168
169

  val baseStats: mutable.ArrayBuffer[BaseStat] = mutable.ArrayBuffer()
Peter van 't Hof's avatar
Peter van 't Hof committed
170
  val readStats: ReadStat = new ReadStat()
Peter van 't Hof's avatar
Peter van 't Hof committed
171

wyleung's avatar
wyleung committed
172
173
  /**
   * Compute the quality metric per read
Wai Yi Leung's avatar
Wai Yi Leung committed
174
175
176
   * Results are stored in baseStats and readStats
   *
   * @param record FastqRecord
wyleung's avatar
wyleung committed
177
178
   */
  def procesRead(record: FastqRecord): Unit = {
Wai Yi Leung's avatar
Wai Yi Leung committed
179
180
181
182

    // Adjust/expand the length of baseStat case classes to the size of current
    // read if the current list is not long enough to store the data
    if (baseStats.length < record.length) {
Peter van 't Hof's avatar
Peter van 't Hof committed
183
184
185
      baseStats ++= mutable.ArrayBuffer.fill(record.length - baseStats.length)(BaseStat())
    }

Wai Yi Leung's avatar
Wai Yi Leung committed
186
187
188
189
    if (readStats.lengths.length < record.length) {
      readStats.lengths ++= mutable.ArrayBuffer.fill(record.length - readStats.lengths.length + 1)(0)
    }

190
191
    val readQual = record.getBaseQualityString
    val readNucleotides = record.getReadString
wyleung's avatar
wyleung committed
192

193
194
195
    if (record.length >= readStats.lengths.size) // Extends array when length not yet possible
      (0 to (record.length - readStats.lengths.size)).foreach(_ => readStats.lengths.append(0))

Wai Yi Leung's avatar
Wai Yi Leung committed
196
197
    readStats.lengths(record.length) += 1

Peter van 't Hof's avatar
Peter van 't Hof committed
198
    for (t <- 0 until record.length()) {
199
      if (baseStats(t).qual.length <= readQual(t)) {
Wai Yi Leung's avatar
Wai Yi Leung committed
200
        baseStats(t).qual ++= mutable.ArrayBuffer.fill(readQual(t).toInt - baseStats(t).qual.length + 1)(0)
Peter van 't Hof's avatar
Peter van 't Hof committed
201
      }
202
203
      baseStats(t).qual(readQual(t)) += 1
      baseStats(t).nuc(readNucleotides(t)) += 1
Wai Yi Leung's avatar
Wai Yi Leung committed
204
      readStats.nuc(readNucleotides(t)) += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
205
    }
wyleung's avatar
wyleung committed
206
207

    // implicit conversion to Int using foldLeft(0)
bow's avatar
bow committed
208
    val avgQual: Int = readQual.sum / readQual.length
Wai Yi Leung's avatar
Wai Yi Leung committed
209
210
211
212
    if (readStats.qual.length <= avgQual) {
      readStats.qual ++= mutable.ArrayBuffer.fill(avgQual - readStats.qual.length + 1)(0)
    }
    readStats.qual(avgQual) += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
213
    if (readNucleotides.contains("N")) readStats.withN += 1L
Peter van 't Hof's avatar
Peter van 't Hof committed
214
215
  }

Wai Yi Leung's avatar
Wai Yi Leung committed
216
217
218
219
220
221
222
223
  /**
   * seqStat, the compute entrypoint where all statistics collection starts
   *
   * @param fqreader FastqReader
   * @return numReads - number of reads counted
   */
  def seqStat(fqreader: FastqReader): Long = {
    var numReads: Long = 0
224
    for (read <- fqreader.iterator.asScala) {
wyleung's avatar
wyleung committed
225
      procesRead(read)
Wai Yi Leung's avatar
Wai Yi Leung committed
226
      numReads += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
227
    }
228
229
    numReads
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
230

231
  def summarize(): Unit = {
232
    // for every position to the max length of any read
Peter van 't Hof's avatar
Peter van 't Hof committed
233
    for (pos <- baseStats.indices) {
wyleung's avatar
wyleung committed
234
      // list all qualities at this particular position `pos`
235
      // fix the length of `quals`
wyleung's avatar
wyleung committed
236
      if (quals.length <= baseStats(pos).qual.length) {
Wai Yi Leung's avatar
Wai Yi Leung committed
237
        quals ++= mutable.ArrayBuffer.fill(baseStats(pos).qual.length - quals.length)(0)
238
239
240
      }
      if (nucs.length <= baseStats(pos).nuc.length) {
        for (_ <- nucs.length until baseStats(pos).nuc.length) nucs.append(0)
wyleung's avatar
wyleung committed
241
      }
242
      // count into the quals
wyleung's avatar
wyleung committed
243
      baseStats(pos).qual.zipWithIndex foreach { case (value, index) => quals(index) += value }
244
245
246
247
      // count N into nucs
      baseStats(pos).nuc.zipWithIndex foreach { case (value, index) => nucs(index) += value }
    }
    detectPhredEncoding(quals)
bow's avatar
bow committed
248
    logger.debug("Detected '" + phredEncoding.toString.toLowerCase + "' encoding in fastq file ...")
249

Peter van 't Hof's avatar
Peter van 't Hof committed
250
    for (pos <- nucs.indices) {
251
252
253
254
255
256
257
258
259
260
      // always export the N-nucleotide
      if (nucs(pos) > 0 || pos.toChar == 'N') {
        nucleotideHistoMap += (pos.toChar -> nucs(pos))
      }
    }

    // init baseHistogram with the bounderies of the report values
    for (pos <- 0 until reportValues.max + 1) {
      baseHistogram.append(0)
      readHistogram.append(0)
wyleung's avatar
wyleung committed
261
262
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
263
    for (pos <- quals.indices) {
bow's avatar
bow committed
264
      val key: Int = pos - phredEncoding.id
265
266
      if (key >= 0) {
        baseHistogram(key) += quals(pos)
267
268
269
      }
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
270
    for (pos <- readStats.qual.indices) {
Peter van 't Hof's avatar
Peter van 't Hof committed
271
      val key: Int = pos - phredEncoding.id
272
273
      if (key > 0) {
        // count till the max of baseHistogram.length
Wai Yi Leung's avatar
Wai Yi Leung committed
274
        for (histokey <- 0 until key + 1) {
Wai Yi Leung's avatar
Wai Yi Leung committed
275
          readHistogram(histokey) += readStats.qual(pos)
276
277
278
279
        }
      }
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
280
    for (pos <- readHistogram.indices) {
281
282
      readQualHistoMap += (pos -> readHistogram(pos))
    }
wyleung's avatar
wyleung committed
283

284
285
286
287
288
289
  }

  def main(args: Array[String]): Unit = {

    val commandArgs: Args = parseArgs(args)

Wai Yi Leung's avatar
Wai Yi Leung committed
290
    logger.info("Start seqstat")
291
    seqStat(new FastqReader(commandArgs.fastq))
292
    summarize()
Wai Yi Leung's avatar
Wai Yi Leung committed
293
    logger.info("Seqstat done")
wyleung's avatar
wyleung committed
294

295
296
297
298
    val report: Map[String, Any] = Map(
      ("files",
        Map(
          ("fastq", Map(
bow's avatar
bow committed
299
            ("path", commandArgs.fastq))
300
301
302
303
304
305
          )
        )
      ),
      ("stats", Map(
        ("bases", Map(
          ("num_total", nucleotideHistoMap.values.sum),
306
          ("num_qual", baseHistogram.toList),
307
308
309
          ("nucleotides", nucleotideHistoMap.toMap)
        )),
        ("reads", Map(
Wai Yi Leung's avatar
Wai Yi Leung committed
310
311
312
          ("num_with_n", readStats.withN),
          ("num_total", readStats.qual.sum),
          ("len_min", readStats.lengths.takeWhile(_ == 0).length),
Wai Yi Leung's avatar
Wai Yi Leung committed
313
          ("len_max", readStats.lengths.length - 1),
314
          ("num_avg_qual_gte", readQualHistoMap.toMap),
Wai Yi Leung's avatar
Wai Yi Leung committed
315
          ("qual_encoding", phredEncoding.toString.toLowerCase)
316
317
318
319
        ))
      ))
    )

320
    println(ConfigUtils.mapToJson(report))
wyleung's avatar
wyleung committed
321
322
  }
}