Commit 6a923d1c authored by Wai Yi Leung's avatar Wai Yi Leung
Browse files

Refactor

parent 573cd8f2
......@@ -48,8 +48,8 @@ object Seqstat extends ToolCommand {
var readHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
var nucleotideHistoMap: mutable.Map[Char, Long] = mutable.Map()
var baseQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
var readQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
private var baseQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
private var readQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
case class Args(fastq: File = new File("")) extends AbstractArgs
......@@ -78,65 +78,118 @@ object Seqstat extends ToolCommand {
.getOrElse(sys.exit(1))
def detectPhredEncoding(quals: mutable.ArrayBuffer[Long]): Unit = {
// substract 1 on high value, because we start from index 0
val l_qual = quals.takeWhile(_ == 0).length
val h_qual = quals.length - 1
if (h_qual > 74) {
phred_correction = 64
phred_encoding = "solexa"
} else if (l_qual < 59) {
phred_correction = 33
phred_encoding = "sanger"
(l_qual < 59, h_qual > 74) match {
case (false, true) => {
phred_correction = 64
phred_encoding = "solexa"
}
case (true, true) => {
// TODO: check this later on
// complex case, we cannot tell wheter this is a sanger or solexa
// but since the h_qual exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
phred_correction = 64
phred_encoding = "solexa"
}
case (true, false) => {
// this is definite a sanger sequence, the lower end is sanger only
phred_correction = 33
phred_encoding = "sanger"
}
case (_, _) => {
phred_correction = 0
phred_encoding = "unknown"
}
}
// if (h_qual > 74) {
// phred_correction = 64
// phred_encoding = "solexa"
// } else if (l_qual < 59) {
// phred_correction = 33
// phred_encoding = "sanger"
// }
}
def transformPhredEncoding(qualMap: Map[Int, Int]): (Map[Int, Int]) = {
qualMap map { case (k, v) => (k - phred_correction, v) }
}
// Setting up the internal storage for the statistics gathered for each read
// 'nuc' are the nucleotides 'ACTGN', the max ASCII value for this is T, pre-init the ArrayBuffer to this value
// as we don't expect the have other 'higher' numbered Nucleotides for now.
case class BaseStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer())
nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0))
case class ReadStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0),
var withN: Long,
lengths: mutable.ArrayBuffer[Int] = mutable.ArrayBuffer())
val baseStats: mutable.ArrayBuffer[BaseStat] = mutable.ArrayBuffer()
val readQuals: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
val readStats: ReadStat = new ReadStat(mutable.ArrayBuffer(),
mutable.ArrayBuffer.fill('T'.toInt + 1)(0),
0L,
mutable.ArrayBuffer())
/**
* Compute the quality metric per read
* Results are stored in baseStats and readQuals
* Results are stored in baseStats and readStats
*
* @param record FastqRecord
*/
def procesRead(record: FastqRecord): Unit = {
if (baseStats.length < record.length()) {
// Adjust/expand the length of baseStat case classes to the size of current
// read if the current list is not long enough to store the data
if (baseStats.length < record.length) {
baseStats ++= mutable.ArrayBuffer.fill(record.length - baseStats.length)(BaseStat())
}
if (readStats.lengths.length < record.length) {
readStats.lengths ++= mutable.ArrayBuffer.fill(record.length - readStats.lengths.length + 1)(0)
}
val readQual = record.getBaseQualityString
val readNucleotides = record.getReadString
readStats.lengths(record.length) += 1
for (t <- 0 until record.length()) {
if (baseStats(t).qual.length <= readQual(t)) {
for (_ <- 0 to readQual(t).toInt - baseStats(t).qual.length) baseStats(t).qual.append(0)
}
if (baseStats(t).nuc.length <= readNucleotides(t)) {
for (_ <- 0 to readNucleotides(t).toInt - baseStats(t).nuc.length) baseStats(t).nuc.append(0)
baseStats(t).qual ++= mutable.ArrayBuffer.fill(readQual(t).toInt - baseStats(t).qual.length + 1)(0)
}
baseStats(t).qual(readQual(t)) += 1
baseStats(t).nuc(readNucleotides(t)) += 1
readStats.nuc(readNucleotides(t)) += 1
}
// implicit conversion to Int using foldLeft(0)
val avgQual: Int = (readQual.foldLeft(0)(_ + _) / readQual.length)
if (readQuals.length <= avgQual) {
for (_ <- 0 to avgQual - readQuals.length) readQuals.append(0)
if (readStats.qual.length <= avgQual) {
readStats.qual ++= mutable.ArrayBuffer.fill(avgQual - readStats.qual.length + 1)(0)
}
readStats.qual(avgQual) += 1
readStats.withN += {
if (readNucleotides.contains("N")) 1L
else 0L
}
readQuals(avgQual) += 1
}
def seqStat(fqreader: FastqReader): (Long) = {
val numReads: Long = fqreader.iterator.size
/**
* seqStat, the compute entrypoint where all statistics collection starts
*
* @param fqreader FastqReader
* @return numReads - number of reads counted
*/
def seqStat(fqreader: FastqReader): Long = {
var numReads: Long = 0
for (read <- fqreader.iterator.asScala) {
procesRead(read)
// numReads += 1
numReads += 1
}
numReads
}
......@@ -147,7 +200,7 @@ object Seqstat extends ToolCommand {
// list all qualities at this particular position `pos`
// fix the length of `quals`
if (quals.length <= baseStats(pos).qual.length) {
for (_ <- 0 until baseStats(pos).qual.length - quals.length) quals.append(0)
quals ++= mutable.ArrayBuffer.fill(baseStats(pos).qual.length - quals.length)(0)
}
if (nucs.length <= baseStats(pos).nuc.length) {
for (_ <- nucs.length until baseStats(pos).nuc.length) nucs.append(0)
......@@ -186,12 +239,12 @@ object Seqstat extends ToolCommand {
baseQualHistoMap += (pos -> baseHistogram(pos))
}
for (pos <- 0 until readQuals.length) {
for (pos <- 0 until readStats.qual.length) {
var key: Int = pos - phred_correction
if (key > 0) {
// count till the max of baseHistogram.length
for (histokey <- 0 until key) {
readHistogram(histokey) += readQuals(pos)
readHistogram(histokey) += readStats.qual(pos)
}
}
}
......@@ -209,7 +262,7 @@ object Seqstat extends ToolCommand {
logger.info("Start")
val reader = new FastqReader(commandArgs.fastq)
seqStat(reader)
val numReads = seqStat(reader)
summarize()
logger.debug(nucs)
......@@ -234,10 +287,10 @@ object Seqstat extends ToolCommand {
("nucleotides", nucleotideHistoMap.toMap)
)),
("reads", Map(
("num_with_n", 0),
("num_total", 0),
("len_min", 0),
("len_max", 0),
("num_with_n", readStats.withN),
("num_total", readStats.qual.sum),
("len_min", readStats.lengths.takeWhile(_ == 0).length),
("len_max", readStats.lengths.length-1),
("num_qual_gte", readQualHistoMap.toMap),
("qual_encoding", phred_encoding)
))
......@@ -245,6 +298,6 @@ object Seqstat extends ToolCommand {
)
val json_report: Json = ConfigUtils.mapToJson(report)
println(json_report.spaces2)
print(json_report.spaces2)
}
}
......@@ -29,11 +29,7 @@ class SeqstatTest extends TestNGSuite with MockitoSugar with Matchers {
// Helper functions to create iterator over FastqRecords given its IDs as Ints
// Record with 'A' and Qual==39 (SangerEncoding)
private def recordsOver(ids: String*): java.util.Iterator[FastqRecord] = ids
.map(x => new FastqRecord(x, "A", "", "H"))
.toIterator.asJava
private def solexaRecordsOver(ids: String*): java.util.Iterator[FastqRecord] = ids
.map(x => new FastqRecord(x, "ACTGTNCGATAG", "", "abcde;ABCDEF"))
.map(x => new FastqRecord(x, "ACGTN", "", "HIBC!"))
.toIterator.asJava
@DataProvider(name = "mockReaderProvider")
......@@ -42,7 +38,7 @@ class SeqstatTest extends TestNGSuite with MockitoSugar with Matchers {
Array(mock[FastqReader])
)
@Test(dataProvider = "mockReaderProvider", groups = Array("sanger"))
@Test(dataProvider = "mockReaderProvider", groups = Array("sanger"), singleThreaded = true)
def testDefault(fqMock: FastqReader) = {
when(fqMock.iterator) thenReturn recordsOver("1", "2", "3")
}
......@@ -51,80 +47,27 @@ class SeqstatTest extends TestNGSuite with MockitoSugar with Matchers {
def testSeqCountReads(fqMock: FastqReader) = {
when(fqMock.iterator) thenReturn recordsOver("1", "2", "3", "4", "5")
val (numReads) = seqStat(fqMock)
numReads shouldBe 5
}
@Test(dataProvider = "mockReaderProvider", groups = Array("sanger"))
def testSeqQuality(fqMock: FastqReader) = {
when(fqMock.iterator) thenReturn recordsOver("1", "2", "3", "4", "5")
val (numReads) = seqStat(fqMock)
val seqstat = Seqstat
val numReads = seqstat.seqStat(fqMock)
numReads shouldBe 5
}
@Test(dataProvider = "mockReaderProvider", groups = Array("sanger"))
@Test(dataProvider = "mockReaderProvider", groups = Array("sanger"), singleThreaded = true)
def testEncodingDetectionSanger(fqMock: FastqReader) = {
when(fqMock.iterator) thenReturn recordsOver("1", "2", "3", "4", "5")
val (numReads) = seqStat(fqMock)
numReads shouldBe 5
when(fqMock.iterator) thenReturn recordsOver("1")
println("In testEncodingDetectionSanger ")
println(phred_correction)
println(phred_encoding)
println(quals)
val seqstat = Seqstat
val numReads = seqstat.seqStat(fqMock)
numReads shouldBe 1
seqstat.summarize()
summarize()
println(phred_correction)
println(phred_encoding)
println(quals)
phred_correction shouldBe 33
phred_encoding shouldBe "sanger"
seqstat.phred_correction shouldBe 33
seqstat.phred_encoding shouldBe "sanger"
// nucleotideHistoMap.values.sum shouldBe 5
// nucleotideHistoMap('N') shouldBe 0
// nucleotideHistoMap('A') shouldBe 5
// nucleotideHistoMap('N') shouldBe 1
// nucleotideHistoMap('A') shouldBe 1
}
// @Test(dataProvider = "mockReaderProvider", groups = Array("solexa"))
// def testEncodingDetectionSolexa(refMock: FastqReader) = {
// when(refMock.iterator) thenReturn solexaRecordsOver("1", "2", "3", "4", "5")
//
// println("In testEncodingDetectionSolexa ")
// val (numReads) = seqStat(refMock)
// numReads shouldBe 5
//
// summarize()
//
// phred_correction shouldBe 64
// phred_encoding shouldBe "solexa"
// }
//
// @Test(dataProvider = "mockReaderProvider", groups = Array("solexa"))
// def testBaseCount(refMock: FastqReader) = {
// when(refMock.iterator) thenReturn solexaRecordsOver("1", "2", "3", "4", "5")
//
// val (numReads) = seqStat(refMock)
// numReads shouldBe 5
//
// println("In testBaseCount ")
// println(phred_correction)
// println(phred_encoding)
// println(quals)
//
// summarize()
//
// println("In testBaseCount ")
// println(phred_correction)
// println(phred_encoding)
// println(quals)
//
// nucleotideHistoMap.values.sum shouldBe 60
// nucleotideHistoMap('N') shouldBe 5
// }
@Test def testArgsMinimum() = {
val args = Array(
"-i", resourcePath("/paired01a.fq"))
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment