Commit b34fc129 authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Merge branch 'feature-seqstat-reportfragmentlengths' into 'develop'

Seqstat reporting read length histogram

Added reporting of readlength histogram as `stats > reads > len_histogram`
Simplified / Scalafied some of the SeqStat code

Adding more unit tests

Fixes #284 

See merge request !331
parents ddda9ebd d2eea1ba
......@@ -15,10 +15,10 @@
*/
package nl.lumc.sasc.biopet.tools
import java.io.{ PrintWriter, File }
import java.io.{ File, PrintWriter }
import htsjdk.samtools.fastq.{ FastqReader, FastqRecord }
import nl.lumc.sasc.biopet.utils.{ ToolCommand, ConfigUtils }
import nl.lumc.sasc.biopet.utils.{ ConfigUtils, ToolCommand }
import scala.collection.JavaConverters._
import scala.collection.immutable.Map
......@@ -26,7 +26,8 @@ import scala.collection.mutable
import scala.language.postfixOps
/**
* Created by pjvanthof on 11/09/15.
* Created by wyleung on 01/12/14.
* Modified by pjvanthof and warindrarto on 27/06/2015
*/
object SeqStat extends ToolCommand {
......@@ -41,12 +42,12 @@ object SeqStat extends ToolCommand {
var nucs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
// generate the baseHistogram and readHistogram
var baseHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
var readHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
var baseQualHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
var readQualHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
var nucleotideHistoMap: mutable.Map[Char, Long] = mutable.Map()
private var baseQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
private var readQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
private var readQualGTEHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
case class Args(fastq: File = null, outputJson: Option[File] = None) extends AbstractArgs
......@@ -83,14 +84,15 @@ object SeqStat extends ToolCommand {
*/
def detectPhredEncoding(quals: mutable.ArrayBuffer[Long]): Unit = {
// substract 1 on high value, because we start from index 0
val l_qual = quals.takeWhile(_ == 0).length
val h_qual = quals.length - 1
val qual_low_boundery = quals.takeWhile(_ == 0).length
val qual_high_boundery = quals.length - 1
(l_qual < 59, h_qual > 74) match {
(qual_low_boundery < 59, qual_high_boundery > 74) match {
case (false, true) => phredEncoding = Solexa
// TODO: check this later on
// complex case, we cannot tell wheter this is a sanger or solexa
// but since the h_qual exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
// TODO: check this later on
// complex case, we cannot tell wheter this is a sanger or solexa
// but since the qual_high_boundery exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
// New @ 2016/01/26: Illumina X ten samples can contain Phred=Q42 (qual_high_boundery==75/K)
case (true, true) => phredEncoding = Solexa
// this is definite a sanger sequence, the lower end is sanger only
case (true, false) => phredEncoding = Sanger
......@@ -102,16 +104,18 @@ object SeqStat extends ToolCommand {
// 'nuc' are the nucleotides 'ACTGN', the max ASCII value for this is T, pre-init the ArrayBuffer to this value
// as we don't expect the have other 'higher' numbered Nucleotides for now.
case class BaseStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0))
nucs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0))
case class ReadStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0),
nucs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0),
var withN: Long = 0L,
lengths: mutable.ArrayBuffer[Int] = mutable.ArrayBuffer())
val baseStats: mutable.ArrayBuffer[BaseStat] = mutable.ArrayBuffer()
val readStats: ReadStat = new ReadStat()
var readLengthHistogram: mutable.Map[String, Long] = mutable.Map.empty
/**
* Compute the quality metric per read
* Results are stored in baseStats and readStats
......@@ -130,25 +134,22 @@ object SeqStat extends ToolCommand {
readStats.lengths ++= mutable.ArrayBuffer.fill(record.length - readStats.lengths.length + 1)(0)
}
val readQual = record.getBaseQualityString
val readQuality = record.getBaseQualityString
val readNucleotides = record.getReadString
if (record.length >= readStats.lengths.size) // Extends array when length not yet possible
(0 to (record.length - readStats.lengths.size)).foreach(_ => readStats.lengths.append(0))
readStats.lengths(record.length) += 1
for (t <- 0 until record.length()) {
if (baseStats(t).qual.length <= readQual(t)) {
baseStats(t).qual ++= mutable.ArrayBuffer.fill(readQual(t).toInt - baseStats(t).qual.length + 1)(0)
if (baseStats(t).qual.length <= readQuality(t)) {
baseStats(t).qual ++= mutable.ArrayBuffer.fill(readQuality(t).toInt - baseStats(t).qual.length + 1)(0)
}
baseStats(t).qual(readQual(t)) += 1
baseStats(t).nuc(readNucleotides(t)) += 1
readStats.nuc(readNucleotides(t)) += 1
baseStats(t).qual(readQuality(t)) += 1
baseStats(t).nucs(readNucleotides(t)) += 1
readStats.nucs(readNucleotides(t)) += 1
}
// implicit conversion to Int using foldLeft(0)
val avgQual: Int = readQual.sum / readQual.length
val avgQual: Int = readQuality.sum / readQuality.length
if (readStats.qual.length <= avgQual) {
readStats.qual ++= mutable.ArrayBuffer.fill(avgQual - readStats.qual.length + 1)(0)
}
......@@ -179,74 +180,53 @@ object SeqStat extends ToolCommand {
if (quals.length <= baseStats(pos).qual.length) {
quals ++= mutable.ArrayBuffer.fill(baseStats(pos).qual.length - quals.length)(0)
}
if (nucs.length <= baseStats(pos).nuc.length) {
for (_ <- nucs.length until baseStats(pos).nuc.length) nucs.append(0)
if (nucs.length <= baseStats(pos).nucs.length) {
nucs ++= mutable.ArrayBuffer.fill( baseStats(pos).nucs.length - nucs.length )(0)
}
// count into the quals
baseStats(pos).qual.zipWithIndex foreach { case (value, index) => quals(index) += value }
// count N into nucs
baseStats(pos).nuc.zipWithIndex foreach { case (value, index) => nucs(index) += value }
baseStats(pos).nucs.zipWithIndex foreach { case (value, index) => nucs(index) += value }
}
detectPhredEncoding(quals)
logger.debug("Detected '" + phredEncoding.toString.toLowerCase + "' encoding in fastq file ...")
for (pos <- nucs.indices) {
// always export the N-nucleotide
if (nucs(pos) > 0 || pos.toChar == 'N') {
nucleotideHistoMap += (pos.toChar -> nucs(pos))
}
}
nucleotideHistoMap = nucs.toList
.foldLeft(mutable.Map[Char, Long]())(
(output, nucleotideCount) => output + (output.size.toChar -> nucleotideCount)
)
// ensure bases: `ACTGN` is always reported even having a zero count.
// Other chars might be counted also, these are also reported
.retain((nucleotide, count) => (count > 0 || "ACTGN".contains(nucleotide.toString)))
// init baseHistogram with the bounderies of the report values
for (pos <- 0 until reportValues.max + 1) {
baseHistogram.append(0)
readHistogram.append(0)
}
baseQualHistogram = quals.slice(phredEncoding.id, quals.size)
baseQualHistogram ++= mutable.ArrayBuffer.fill(reportValues.max + 1 - baseQualHistogram.size)(0L)
for (pos <- quals.indices) {
val key: Int = pos - phredEncoding.id
if (key >= 0) {
baseHistogram(key) += quals(pos)
}
}
readQualHistogram = readStats.qual.slice(phredEncoding.id, readStats.qual.size)
readQualHistogram ++= mutable.ArrayBuffer.fill(reportValues.max + 1 - readQualHistogram.size)(0L)
for (pos <- readStats.qual.indices) {
val key: Int = pos - phredEncoding.id
if (key > 0) {
// count till the max of baseHistogram.length
for (histokey <- 0 until key + 1) {
readHistogram(histokey) += readStats.qual(pos)
readQualGTEHistoMap = readQualHistogram.indices
.foldLeft(mutable.Map[Int, Long]())(
(output, index) => {
output + (output.keys.size -> readQualHistogram.slice(index, readQualHistogram.size).sum)
}
}
}
for (pos <- readHistogram.indices) {
readQualHistoMap += (pos -> readHistogram(pos))
}
)
}
def main(args: Array[String]): Unit = {
val commandArgs: Args = parseArgs(args)
logger.info("Start seqstat")
seqStat(new FastqReader(commandArgs.fastq))
summarize()
logger.info("Seqstat done")
val report: Map[String, Any] = Map(
def reportMap(fastqPath: File): Map[String, Any] = {
Map(
("files",
Map(
("fastq", Map(
("path", commandArgs.fastq))
("path", fastqPath.getAbsolutePath))
)
)
),
("stats", Map(
("bases", Map(
("num_total", nucleotideHistoMap.values.sum),
("num_qual", baseHistogram.toList),
("num_qual", baseQualHistogram.toList),
("nucleotides", nucleotideHistoMap.toMap)
)),
("reads", Map(
......@@ -254,11 +234,24 @@ object SeqStat extends ToolCommand {
("num_total", readStats.qual.sum),
("len_min", readStats.lengths.takeWhile(_ == 0).length),
("len_max", readStats.lengths.length - 1),
("num_avg_qual_gte", readQualHistoMap.toMap),
("qual_encoding", phredEncoding.toString.toLowerCase)
("num_avg_qual_gte", readQualGTEHistoMap.toMap),
("qual_encoding", phredEncoding.toString.toLowerCase),
("len_histogram", readStats.lengths.toList)
))
))
)
}
def main(args: Array[String]): Unit = {
val commandArgs: Args = parseArgs(args)
logger.info("Start seqstat")
seqStat(new FastqReader(commandArgs.fastq))
summarize()
logger.info("Seqstat done")
val report = reportMap(commandArgs.fastq)
commandArgs.outputJson match {
case Some(file) => {
......
......@@ -88,11 +88,25 @@ class SeqStatTest extends TestNGSuite with MockitoSugar with Matchers {
def testEncodingBaseHistogram(fqMock: FastqReader) = {
val seqstat = SeqStat
baseHistogram(40) shouldEqual 5
baseHistogram(39) shouldEqual 5
baseHistogram(34) shouldEqual 5
baseHistogram(33) shouldEqual 5
baseHistogram.head shouldEqual 5
baseQualHistogram(40) shouldEqual 5
baseQualHistogram(39) shouldEqual 5
baseQualHistogram(34) shouldEqual 5
baseQualHistogram(33) shouldEqual 5
baseQualHistogram.head shouldEqual 5
}
@Test(dataProvider = "mockReaderProvider", groups = Array("report"), singleThreaded = true, dependsOnGroups = Array("basehistogram"))
def testReportOutputScheme(fqMock: FastqReader) = {
when(fqMock.getFile) thenReturn new File("/tmp/test.fq")
when(fqMock.iterator) thenReturn recordsOver("1", "2", "3", "4", "5")
val seqstat = SeqStat
seqstat.seqStat(fqMock)
seqstat.summarize()
val report = seqstat.reportMap(fqMock.getFile)
report should contain key "files"
report should contain key "stats"
}
@Test def testArgsMinimum() = {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment