SeqStat.scala 9.84 KB
Newer Older
wyleung's avatar
wyleung committed
1
/**
bow's avatar
bow committed
2
3
4
5
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
wyleung's avatar
wyleung committed
6
 *
bow's avatar
bow committed
7
8
9
10
11
12
13
14
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
wyleung's avatar
wyleung committed
15
16
17
 */
package nl.lumc.sasc.biopet.tools

Wai Yi Leung's avatar
Wai Yi Leung committed
18
import java.io.{ File, PrintWriter }
Peter van 't Hof's avatar
Peter van 't Hof committed
19
20

import htsjdk.samtools.fastq.{ FastqReader, FastqRecord }
Wai Yi Leung's avatar
Wai Yi Leung committed
21
import nl.lumc.sasc.biopet.utils.{ ConfigUtils, ToolCommand }
Wai Yi Leung's avatar
Wai Yi Leung committed
22

wyleung's avatar
wyleung committed
23
import scala.collection.JavaConverters._
24
import scala.collection.immutable.Map
Peter van 't Hof's avatar
Peter van 't Hof committed
25
import scala.collection.mutable
wyleung's avatar
wyleung committed
26
import scala.language.postfixOps
wyleung's avatar
wyleung committed
27
28

/**
Wai Yi Leung's avatar
Wai Yi Leung committed
29
30
 * Created by wyleung on 01/12/14.
 * Modified by pjvanthof and warindrarto on 27/06/2015
wyleung's avatar
wyleung committed
31
 */
bow's avatar
bow committed
32
object SeqStat extends ToolCommand {
Wai Yi Leung's avatar
Wai Yi Leung committed
33

Wai Yi Leung's avatar
Wai Yi Leung committed
34
  import FqEncoding._
wyleung's avatar
wyleung committed
35

Wai Yi Leung's avatar
Wai Yi Leung committed
36
  var phredEncoding: FqEncoding.Value = Sanger
37

38
39
40
41
42
43
44
  val reportValues = List(1, 10, 20, 30, 40, 50, 60)

  // the base quality for each position on the reads
  var quals: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
  var nucs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()

  // generate the baseHistogram and readHistogram
Wai Yi Leung's avatar
Wai Yi Leung committed
45
46
  var baseQualHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
  var readQualHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
47
48

  var nucleotideHistoMap: mutable.Map[Char, Long] = mutable.Map()
Wai Yi Leung's avatar
Wai Yi Leung committed
49
  private var baseQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
50
  private var readQualGTEHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
51

52
  case class Args(fastq: File = null, outputJson: Option[File] = None) extends AbstractArgs
wyleung's avatar
wyleung committed
53
54
55
56
57

  class OptParser extends AbstractOptParser {

    head(
      s"""
58
         |$commandName - Summarize FastQ
wyleung's avatar
wyleung committed
59
60
      """.stripMargin)

Peter van 't Hof's avatar
Peter van 't Hof committed
61
    opt[File]('i', "fastq") required () unbounded () valueName "<fastq>" action { (x, c) =>
wyleung's avatar
wyleung committed
62
63
64
      c.copy(fastq = x)
    } validate {
      x => if (x.exists) success else failure("FASTQ file not found")
65
    } text "FastQ file to generate stats from"
Peter van 't Hof's avatar
Peter van 't Hof committed
66
    opt[File]('o', "output") unbounded () valueName "<json>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
67
      c.copy(outputJson = Some(x))
68
    } text "File to write output to, if not supplied output go to stdout"
wyleung's avatar
wyleung committed
69
70
71
72
73
74
75
76
77
78
79
80
  }

  /**
   * Parses the command line argument
   *
   * @param args Array of arguments
   * @return
   */
  def parseArgs(args: Array[String]): Args = new OptParser()
    .parse(args, Args())
    .getOrElse(sys.exit(1))

Wai Yi Leung's avatar
Wai Yi Leung committed
81
82
83
84
  /**
   *
   * @param quals Computed quality histogram [flat]
   */
85
  def detectPhredEncoding(quals: mutable.ArrayBuffer[Long]): Unit = {
Wai Yi Leung's avatar
Wai Yi Leung committed
86
    // substract 1 on high value, because we start from index 0
Wai Yi Leung's avatar
Wai Yi Leung committed
87
88
    val qual_low_boundery = quals.takeWhile(_ == 0).length
    val qual_high_boundery = quals.length - 1
89

Wai Yi Leung's avatar
Wai Yi Leung committed
90
    (qual_low_boundery < 59, qual_high_boundery > 74) match {
bow's avatar
bow committed
91
      case (false, true) => phredEncoding = Solexa
Peter van 't Hof's avatar
Peter van 't Hof committed
92
93
94
95
      // TODO: check this later on
      // complex case, we cannot tell wheter this is a sanger or solexa
      // but since the qual_high_boundery exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
      // New @ 2016/01/26: Illumina X ten samples can contain Phred=Q42 (qual_high_boundery==75/K)
bow's avatar
bow committed
96
97
98
99
      case (true, true)  => phredEncoding = Solexa
      // this is definite a sanger sequence, the lower end is sanger only
      case (true, false) => phredEncoding = Sanger
      case (_, _)        => phredEncoding = Unknown
100
101
102
    }
  }

Wai Yi Leung's avatar
Wai Yi Leung committed
103
104
105
  // Setting up the internal storage for the statistics gathered for each read
  // 'nuc' are the nucleotides 'ACTGN', the max ASCII value for this is T, pre-init the ArrayBuffer to this value
  // as we don't expect the have other 'higher' numbered Nucleotides for now.
Peter van 't Hof's avatar
Peter van 't Hof committed
106
  case class BaseStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
Wai Yi Leung's avatar
Wai Yi Leung committed
107
                      nucs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0))
Wai Yi Leung's avatar
Wai Yi Leung committed
108
109

  case class ReadStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
Wai Yi Leung's avatar
Wai Yi Leung committed
110
                      nucs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0),
Peter van 't Hof's avatar
Peter van 't Hof committed
111
                      var withN: Long = 0L,
Wai Yi Leung's avatar
Wai Yi Leung committed
112
                      lengths: mutable.ArrayBuffer[Int] = mutable.ArrayBuffer())
Peter van 't Hof's avatar
Peter van 't Hof committed
113
114

  val baseStats: mutable.ArrayBuffer[BaseStat] = mutable.ArrayBuffer()
Peter van 't Hof's avatar
Peter van 't Hof committed
115
  val readStats: ReadStat = new ReadStat()
Peter van 't Hof's avatar
Peter van 't Hof committed
116

Wai Yi Leung's avatar
Wai Yi Leung committed
117
118
  var readLengthHistogram: mutable.Map[String, Long] = mutable.Map.empty

wyleung's avatar
wyleung committed
119
120
  /**
   * Compute the quality metric per read
Wai Yi Leung's avatar
Wai Yi Leung committed
121
122
123
   * Results are stored in baseStats and readStats
   *
   * @param record FastqRecord
wyleung's avatar
wyleung committed
124
125
   */
  def procesRead(record: FastqRecord): Unit = {
Wai Yi Leung's avatar
Wai Yi Leung committed
126
127
128
129

    // Adjust/expand the length of baseStat case classes to the size of current
    // read if the current list is not long enough to store the data
    if (baseStats.length < record.length) {
Peter van 't Hof's avatar
Peter van 't Hof committed
130
131
132
      baseStats ++= mutable.ArrayBuffer.fill(record.length - baseStats.length)(BaseStat())
    }

Wai Yi Leung's avatar
Wai Yi Leung committed
133
134
135
136
    if (readStats.lengths.length < record.length) {
      readStats.lengths ++= mutable.ArrayBuffer.fill(record.length - readStats.lengths.length + 1)(0)
    }

Wai Yi Leung's avatar
Wai Yi Leung committed
137
    val readQuality = record.getBaseQualityString
138
    val readNucleotides = record.getReadString
wyleung's avatar
wyleung committed
139

Wai Yi Leung's avatar
Wai Yi Leung committed
140
141
    readStats.lengths(record.length) += 1

Peter van 't Hof's avatar
Peter van 't Hof committed
142
    for (t <- 0 until record.length()) {
Wai Yi Leung's avatar
Wai Yi Leung committed
143
144
      if (baseStats(t).qual.length <= readQuality(t)) {
        baseStats(t).qual ++= mutable.ArrayBuffer.fill(readQuality(t).toInt - baseStats(t).qual.length + 1)(0)
Peter van 't Hof's avatar
Peter van 't Hof committed
145
      }
Wai Yi Leung's avatar
Wai Yi Leung committed
146
147
148
      baseStats(t).qual(readQuality(t)) += 1
      baseStats(t).nucs(readNucleotides(t)) += 1
      readStats.nucs(readNucleotides(t)) += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
149
    }
wyleung's avatar
wyleung committed
150
151

    // implicit conversion to Int using foldLeft(0)
Wai Yi Leung's avatar
Wai Yi Leung committed
152
    val avgQual: Int = readQuality.sum / readQuality.length
Wai Yi Leung's avatar
Wai Yi Leung committed
153
154
155
156
    if (readStats.qual.length <= avgQual) {
      readStats.qual ++= mutable.ArrayBuffer.fill(avgQual - readStats.qual.length + 1)(0)
    }
    readStats.qual(avgQual) += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
157
    if (readNucleotides.contains("N")) readStats.withN += 1L
Peter van 't Hof's avatar
Peter van 't Hof committed
158
159
  }

Wai Yi Leung's avatar
Wai Yi Leung committed
160
161
162
163
164
165
166
167
  /**
   * seqStat, the compute entrypoint where all statistics collection starts
   *
   * @param fqreader FastqReader
   * @return numReads - number of reads counted
   */
  def seqStat(fqreader: FastqReader): Long = {
    var numReads: Long = 0
168
    for (read <- fqreader.iterator.asScala) {
wyleung's avatar
wyleung committed
169
      procesRead(read)
Wai Yi Leung's avatar
Wai Yi Leung committed
170
      numReads += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
171
    }
172
173
    numReads
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
174

175
  def summarize(): Unit = {
176
    // for every position to the max length of any read
Peter van 't Hof's avatar
Peter van 't Hof committed
177
    for (pos <- baseStats.indices) {
wyleung's avatar
wyleung committed
178
      // list all qualities at this particular position `pos`
179
      // fix the length of `quals`
wyleung's avatar
wyleung committed
180
      if (quals.length <= baseStats(pos).qual.length) {
Wai Yi Leung's avatar
Wai Yi Leung committed
181
        quals ++= mutable.ArrayBuffer.fill(baseStats(pos).qual.length - quals.length)(0)
182
      }
Wai Yi Leung's avatar
Wai Yi Leung committed
183
      if (nucs.length <= baseStats(pos).nucs.length) {
Peter van 't Hof's avatar
Peter van 't Hof committed
184
        nucs ++= mutable.ArrayBuffer.fill(baseStats(pos).nucs.length - nucs.length)(0)
wyleung's avatar
wyleung committed
185
      }
186
      // count into the quals
wyleung's avatar
wyleung committed
187
      baseStats(pos).qual.zipWithIndex foreach { case (value, index) => quals(index) += value }
188
      // count N into nucs
Wai Yi Leung's avatar
Wai Yi Leung committed
189
      baseStats(pos).nucs.zipWithIndex foreach { case (value, index) => nucs(index) += value }
190
191
    }
    detectPhredEncoding(quals)
bow's avatar
bow committed
192
    logger.debug("Detected '" + phredEncoding.toString.toLowerCase + "' encoding in fastq file ...")
193

Wai Yi Leung's avatar
Wai Yi Leung committed
194
195
196
197
198
199
200
    nucleotideHistoMap = nucs.toList
      .foldLeft(mutable.Map[Char, Long]())(
        (output, nucleotideCount) => output + (output.size.toChar -> nucleotideCount)
      )
      // ensure bases: `ACTGN` is always reported even having a zero count.
      // Other chars might be counted also, these are also reported
      .retain((nucleotide, count) => (count > 0 || "ACTGN".contains(nucleotide.toString)))
201

Wai Yi Leung's avatar
Wai Yi Leung committed
202
203
    baseQualHistogram = quals.slice(phredEncoding.id, quals.size)
    baseQualHistogram ++= mutable.ArrayBuffer.fill(reportValues.max + 1 - baseQualHistogram.size)(0L)
wyleung's avatar
wyleung committed
204

Wai Yi Leung's avatar
Wai Yi Leung committed
205
206
    readQualHistogram = readStats.qual.slice(phredEncoding.id, readStats.qual.size)
    readQualHistogram ++= mutable.ArrayBuffer.fill(reportValues.max + 1 - readQualHistogram.size)(0L)
207

208
    readQualGTEHistoMap = readQualHistogram.indices
209
210
211
212
213
214
      .foldLeft(mutable.Map[Int, Long]())(
        (output, index) => {
          output + (output.keys.size -> readQualHistogram.slice(index, readQualHistogram.size).sum)
        }
      )

215
216
  }

Wai Yi Leung's avatar
Wai Yi Leung committed
217
218
  def reportMap(fastqPath: File): Map[String, Any] = {
    Map(
219
220
221
      ("files",
        Map(
          ("fastq", Map(
Wai Yi Leung's avatar
Wai Yi Leung committed
222
            ("path", fastqPath.getAbsolutePath))
223
224
225
226
227
228
          )
        )
      ),
      ("stats", Map(
        ("bases", Map(
          ("num_total", nucleotideHistoMap.values.sum),
Wai Yi Leung's avatar
Wai Yi Leung committed
229
          ("num_qual", baseQualHistogram.toList),
230
231
232
          ("nucleotides", nucleotideHistoMap.toMap)
        )),
        ("reads", Map(
Wai Yi Leung's avatar
Wai Yi Leung committed
233
234
235
          ("num_with_n", readStats.withN),
          ("num_total", readStats.qual.sum),
          ("len_min", readStats.lengths.takeWhile(_ == 0).length),
Wai Yi Leung's avatar
Wai Yi Leung committed
236
          ("len_max", readStats.lengths.length - 1),
237
          ("num_avg_qual_gte", readQualGTEHistoMap.toMap),
Wai Yi Leung's avatar
Wai Yi Leung committed
238
          ("qual_encoding", phredEncoding.toString.toLowerCase),
239
          ("len_histogram", readStats.lengths.toList)
240
241
242
        ))
      ))
    )
Wai Yi Leung's avatar
Wai Yi Leung committed
243
244
245
246
247
248
249
250
251
252
253
254
  }

  def main(args: Array[String]): Unit = {

    val commandArgs: Args = parseArgs(args)

    logger.info("Start seqstat")
    seqStat(new FastqReader(commandArgs.fastq))
    summarize()
    logger.info("Seqstat done")

    val report = reportMap(commandArgs.fastq)
255

256
257
258
259
260
261
262
263
    commandArgs.outputJson match {
      case Some(file) => {
        val writer = new PrintWriter(file)
        writer.println(ConfigUtils.mapToJson(report))
        writer.close()
      }
      case _ => println(ConfigUtils.mapToJson(report))
    }
wyleung's avatar
wyleung committed
264
265
  }
}
266
267
268
269
270
271
272
273

object FqEncoding extends Enumeration {
  type FqEncoding = Value
  val Sanger = Value(33, "Sanger")
  val Solexa = Value(64, "Solexa")
  val Unknown = Value(0, "Unknown")
}