Skip to content
Snippets Groups Projects
Commit 6a923d1c authored by Wai Yi Leung's avatar Wai Yi Leung
Browse files

Refactor

parent 573cd8f2
No related branches found
No related tags found
No related merge requests found
......@@ -48,8 +48,8 @@ object Seqstat extends ToolCommand {
var readHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
var nucleotideHistoMap: mutable.Map[Char, Long] = mutable.Map()
var baseQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
var readQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
private var baseQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
private var readQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
case class Args(fastq: File = new File("")) extends AbstractArgs
......@@ -78,65 +78,118 @@ object Seqstat extends ToolCommand {
.getOrElse(sys.exit(1))
def detectPhredEncoding(quals: mutable.ArrayBuffer[Long]): Unit = {
// substract 1 on high value, because we start from index 0
val l_qual = quals.takeWhile(_ == 0).length
val h_qual = quals.length - 1
if (h_qual > 74) {
phred_correction = 64
phred_encoding = "solexa"
} else if (l_qual < 59) {
phred_correction = 33
phred_encoding = "sanger"
(l_qual < 59, h_qual > 74) match {
case (false, true) => {
phred_correction = 64
phred_encoding = "solexa"
}
case (true, true) => {
// TODO: check this later on
// complex case, we cannot tell wheter this is a sanger or solexa
// but since the h_qual exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
phred_correction = 64
phred_encoding = "solexa"
}
case (true, false) => {
// this is definite a sanger sequence, the lower end is sanger only
phred_correction = 33
phred_encoding = "sanger"
}
case (_, _) => {
phred_correction = 0
phred_encoding = "unknown"
}
}
// if (h_qual > 74) {
// phred_correction = 64
// phred_encoding = "solexa"
// } else if (l_qual < 59) {
// phred_correction = 33
// phred_encoding = "sanger"
// }
}
def transformPhredEncoding(qualMap: Map[Int, Int]): (Map[Int, Int]) = {
qualMap map { case (k, v) => (k - phred_correction, v) }
}
// Setting up the internal storage for the statistics gathered for each read
// 'nuc' are the nucleotides 'ACTGN', the max ASCII value for this is T, pre-init the ArrayBuffer to this value
// as we don't expect the have other 'higher' numbered Nucleotides for now.
case class BaseStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer())
nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0))
case class ReadStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0),
var withN: Long,
lengths: mutable.ArrayBuffer[Int] = mutable.ArrayBuffer())
val baseStats: mutable.ArrayBuffer[BaseStat] = mutable.ArrayBuffer()
val readQuals: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
val readStats: ReadStat = new ReadStat(mutable.ArrayBuffer(),
mutable.ArrayBuffer.fill('T'.toInt + 1)(0),
0L,
mutable.ArrayBuffer())
/**
* Compute the quality metric per read
* Results are stored in baseStats and readQuals
* Results are stored in baseStats and readStats
*
* @param record FastqRecord
*/
def procesRead(record: FastqRecord): Unit = {
if (baseStats.length < record.length()) {
// Adjust/expand the length of baseStat case classes to the size of current
// read if the current list is not long enough to store the data
if (baseStats.length < record.length) {
baseStats ++= mutable.ArrayBuffer.fill(record.length - baseStats.length)(BaseStat())
}
if (readStats.lengths.length < record.length) {
readStats.lengths ++= mutable.ArrayBuffer.fill(record.length - readStats.lengths.length + 1)(0)
}
val readQual = record.getBaseQualityString
val readNucleotides = record.getReadString
readStats.lengths(record.length) += 1
for (t <- 0 until record.length()) {
if (baseStats(t).qual.length <= readQual(t)) {
for (_ <- 0 to readQual(t).toInt - baseStats(t).qual.length) baseStats(t).qual.append(0)
}
if (baseStats(t).nuc.length <= readNucleotides(t)) {
for (_ <- 0 to readNucleotides(t).toInt - baseStats(t).nuc.length) baseStats(t).nuc.append(0)
baseStats(t).qual ++= mutable.ArrayBuffer.fill(readQual(t).toInt - baseStats(t).qual.length + 1)(0)
}
baseStats(t).qual(readQual(t)) += 1
baseStats(t).nuc(readNucleotides(t)) += 1
readStats.nuc(readNucleotides(t)) += 1
}
// implicit conversion to Int using foldLeft(0)
val avgQual: Int = (readQual.foldLeft(0)(_ + _) / readQual.length)
if (readQuals.length <= avgQual) {
for (_ <- 0 to avgQual - readQuals.length) readQuals.append(0)
if (readStats.qual.length <= avgQual) {
readStats.qual ++= mutable.ArrayBuffer.fill(avgQual - readStats.qual.length + 1)(0)
}
readStats.qual(avgQual) += 1
readStats.withN += {
if (readNucleotides.contains("N")) 1L
else 0L
}
readQuals(avgQual) += 1
}
def seqStat(fqreader: FastqReader): (Long) = {
val numReads: Long = fqreader.iterator.size
/**
* seqStat, the compute entrypoint where all statistics collection starts
*
* @param fqreader FastqReader
* @return numReads - number of reads counted
*/
def seqStat(fqreader: FastqReader): Long = {
var numReads: Long = 0
for (read <- fqreader.iterator.asScala) {
procesRead(read)
// numReads += 1
numReads += 1
}
numReads
}
......@@ -147,7 +200,7 @@ object Seqstat extends ToolCommand {
// list all qualities at this particular position `pos`
// fix the length of `quals`
if (quals.length <= baseStats(pos).qual.length) {
for (_ <- 0 until baseStats(pos).qual.length - quals.length) quals.append(0)
quals ++= mutable.ArrayBuffer.fill(baseStats(pos).qual.length - quals.length)(0)
}
if (nucs.length <= baseStats(pos).nuc.length) {
for (_ <- nucs.length until baseStats(pos).nuc.length) nucs.append(0)
......@@ -186,12 +239,12 @@ object Seqstat extends ToolCommand {
baseQualHistoMap += (pos -> baseHistogram(pos))
}
for (pos <- 0 until readQuals.length) {
for (pos <- 0 until readStats.qual.length) {
var key: Int = pos - phred_correction
if (key > 0) {
// count till the max of baseHistogram.length
for (histokey <- 0 until key) {
readHistogram(histokey) += readQuals(pos)
readHistogram(histokey) += readStats.qual(pos)
}
}
}
......@@ -209,7 +262,7 @@ object Seqstat extends ToolCommand {
logger.info("Start")
val reader = new FastqReader(commandArgs.fastq)
seqStat(reader)
val numReads = seqStat(reader)
summarize()
logger.debug(nucs)
......@@ -234,10 +287,10 @@ object Seqstat extends ToolCommand {
("nucleotides", nucleotideHistoMap.toMap)
)),
("reads", Map(
("num_with_n", 0),
("num_total", 0),
("len_min", 0),
("len_max", 0),
("num_with_n", readStats.withN),
("num_total", readStats.qual.sum),
("len_min", readStats.lengths.takeWhile(_ == 0).length),
("len_max", readStats.lengths.length-1),
("num_qual_gte", readQualHistoMap.toMap),
("qual_encoding", phred_encoding)
))
......@@ -245,6 +298,6 @@ object Seqstat extends ToolCommand {
)
val json_report: Json = ConfigUtils.mapToJson(report)
println(json_report.spaces2)
print(json_report.spaces2)
}
}
......@@ -29,11 +29,7 @@ class SeqstatTest extends TestNGSuite with MockitoSugar with Matchers {
// Helper functions to create iterator over FastqRecords given its IDs as Ints
// Record with 'A' and Qual==39 (SangerEncoding)
private def recordsOver(ids: String*): java.util.Iterator[FastqRecord] = ids
.map(x => new FastqRecord(x, "A", "", "H"))
.toIterator.asJava
private def solexaRecordsOver(ids: String*): java.util.Iterator[FastqRecord] = ids
.map(x => new FastqRecord(x, "ACTGTNCGATAG", "", "abcde;ABCDEF"))
.map(x => new FastqRecord(x, "ACGTN", "", "HIBC!"))
.toIterator.asJava
@DataProvider(name = "mockReaderProvider")
......@@ -42,7 +38,7 @@ class SeqstatTest extends TestNGSuite with MockitoSugar with Matchers {
Array(mock[FastqReader])
)
@Test(dataProvider = "mockReaderProvider", groups = Array("sanger"))
@Test(dataProvider = "mockReaderProvider", groups = Array("sanger"), singleThreaded = true)
def testDefault(fqMock: FastqReader) = {
when(fqMock.iterator) thenReturn recordsOver("1", "2", "3")
}
......@@ -51,80 +47,27 @@ class SeqstatTest extends TestNGSuite with MockitoSugar with Matchers {
def testSeqCountReads(fqMock: FastqReader) = {
when(fqMock.iterator) thenReturn recordsOver("1", "2", "3", "4", "5")
val (numReads) = seqStat(fqMock)
numReads shouldBe 5
}
@Test(dataProvider = "mockReaderProvider", groups = Array("sanger"))
def testSeqQuality(fqMock: FastqReader) = {
when(fqMock.iterator) thenReturn recordsOver("1", "2", "3", "4", "5")
val (numReads) = seqStat(fqMock)
val seqstat = Seqstat
val numReads = seqstat.seqStat(fqMock)
numReads shouldBe 5
}
@Test(dataProvider = "mockReaderProvider", groups = Array("sanger"))
@Test(dataProvider = "mockReaderProvider", groups = Array("sanger"), singleThreaded = true)
def testEncodingDetectionSanger(fqMock: FastqReader) = {
when(fqMock.iterator) thenReturn recordsOver("1", "2", "3", "4", "5")
val (numReads) = seqStat(fqMock)
numReads shouldBe 5
when(fqMock.iterator) thenReturn recordsOver("1")
println("In testEncodingDetectionSanger ")
println(phred_correction)
println(phred_encoding)
println(quals)
val seqstat = Seqstat
val numReads = seqstat.seqStat(fqMock)
numReads shouldBe 1
seqstat.summarize()
summarize()
println(phred_correction)
println(phred_encoding)
println(quals)
phred_correction shouldBe 33
phred_encoding shouldBe "sanger"
seqstat.phred_correction shouldBe 33
seqstat.phred_encoding shouldBe "sanger"
// nucleotideHistoMap.values.sum shouldBe 5
// nucleotideHistoMap('N') shouldBe 0
// nucleotideHistoMap('A') shouldBe 5
// nucleotideHistoMap('N') shouldBe 1
// nucleotideHistoMap('A') shouldBe 1
}
// @Test(dataProvider = "mockReaderProvider", groups = Array("solexa"))
// def testEncodingDetectionSolexa(refMock: FastqReader) = {
// when(refMock.iterator) thenReturn solexaRecordsOver("1", "2", "3", "4", "5")
//
// println("In testEncodingDetectionSolexa ")
// val (numReads) = seqStat(refMock)
// numReads shouldBe 5
//
// summarize()
//
// phred_correction shouldBe 64
// phred_encoding shouldBe "solexa"
// }
//
// @Test(dataProvider = "mockReaderProvider", groups = Array("solexa"))
// def testBaseCount(refMock: FastqReader) = {
// when(refMock.iterator) thenReturn solexaRecordsOver("1", "2", "3", "4", "5")
//
// val (numReads) = seqStat(refMock)
// numReads shouldBe 5
//
// println("In testBaseCount ")
// println(phred_correction)
// println(phred_encoding)
// println(quals)
//
// summarize()
//
// println("In testBaseCount ")
// println(phred_correction)
// println(phred_encoding)
// println(quals)
//
// nucleotideHistoMap.values.sum shouldBe 60
// nucleotideHistoMap('N') shouldBe 5
// }
@Test def testArgsMinimum() = {
val args = Array(
"-i", resourcePath("/paired01a.fq"))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment