Commit 3ec61eee authored by Wai Yi Leung's avatar Wai Yi Leung
Browse files

Merge branch 'feature-seqstat_updates' into 'develop'

Feature seqstat updates

Updates on SeqStat (per the commit messages).

We still need to rely on FastQC since some other methods require further testing to be implemented. For example, to get the median base quality from all bases is not that straightforward since the container size may well exceet `Int.MaxValue` while the default containers' maximum sizes are all `Int.MaxValue`.

See merge request !182
parents 03c1cfa0 0833e74c
......@@ -38,7 +38,7 @@ import nl.lumc.sasc.biopet.utils.ConfigUtils
*
* @param root Configuration object for the pipeline
*/
class Seqstat(val root: Configurable) extends ToolCommandFuntion with Summarizable {
class SeqStat(val root: Configurable) extends ToolCommandFuntion with Summarizable {
javaMainClass = getClass.getName
@Input(doc = "Input FASTQ", shortName = "input", required = true)
......@@ -76,16 +76,16 @@ object FqEncoding extends Enumeration {
val Unknown = Value(0, "Unknown")
}
object Seqstat extends ToolCommand {
def apply(root: Configurable, input: File, output: File): Seqstat = {
val seqstat = new Seqstat(root)
object SeqStat extends ToolCommand {
def apply(root: Configurable, input: File, output: File): SeqStat = {
val seqstat = new SeqStat(root)
seqstat.input = input
seqstat.output = new File(output, input.getName.substring(0, input.getName.lastIndexOf(".")) + ".seqstats.json")
seqstat
}
def apply(root: Configurable, fastqfile: File, outDir: String): Seqstat = {
val seqstat = new Seqstat(root)
def apply(root: Configurable, fastqfile: File, outDir: String): SeqStat = {
val seqstat = new SeqStat(root)
seqstat.input = fastqfile
seqstat.output = new File(outDir, fastqfile.getName.substring(0, fastqfile.getName.lastIndexOf(".")) + ".seqstats.json")
seqstat
......@@ -145,22 +145,14 @@ object Seqstat extends ToolCommand {
val h_qual = quals.length - 1
(l_qual < 59, h_qual > 74) match {
case (false, true) => {
phredEncoding = Solexa
}
case (true, true) => {
// TODO: check this later on
// complex case, we cannot tell wheter this is a sanger or solexa
// but since the h_qual exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
phredEncoding = Solexa
}
case (true, false) => {
// this is definite a sanger sequence, the lower end is sanger only
phredEncoding = Sanger
}
case (_, _) => {
phredEncoding = Unknown
}
case (false, true) => phredEncoding = Solexa
// TODO: check this later on
// complex case, we cannot tell wheter this is a sanger or solexa
// but since the h_qual exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
case (true, true) => phredEncoding = Solexa
// this is definite a sanger sequence, the lower end is sanger only
case (true, false) => phredEncoding = Sanger
case (_, _) => phredEncoding = Unknown
}
}
......@@ -214,7 +206,7 @@ object Seqstat extends ToolCommand {
}
// implicit conversion to Int using foldLeft(0)
val avgQual: Int = (readQual.sum / readQual.length)
val avgQual: Int = readQual.sum / readQual.length
if (readStats.qual.length <= avgQual) {
readStats.qual ++= mutable.ArrayBuffer.fill(avgQual - readStats.qual.length + 1)(0)
}
......@@ -254,6 +246,7 @@ object Seqstat extends ToolCommand {
baseStats(pos).nuc.zipWithIndex foreach { case (value, index) => nucs(index) += value }
}
detectPhredEncoding(quals)
logger.debug("Detected '" + phredEncoding.toString.toLowerCase + "' encoding in fastq file ...")
for (pos <- 0 until nucs.length) {
// always export the N-nucleotide
......@@ -269,19 +262,12 @@ object Seqstat extends ToolCommand {
}
for (pos <- 0 until quals.length) {
var key: Int = pos - phredEncoding.id
if (key > 0) {
// count till the max of baseHistogram.length
for (histokey <- 0 until key + 1) {
baseHistogram(histokey) += quals(pos)
}
val key: Int = pos - phredEncoding.id
if (key >= 0) {
baseHistogram(key) += quals(pos)
}
}
for (pos <- 0 until baseHistogram.length) {
baseQualHistoMap += (pos -> baseHistogram(pos))
}
for (pos <- 0 until readStats.qual.length) {
val key: Int = pos - phredEncoding.id
if (key > 0) {
......@@ -303,30 +289,22 @@ object Seqstat extends ToolCommand {
val commandArgs: Args = parseArgs(args)
logger.info("Start seqstat")
val reader = new FastqReader(commandArgs.fastq)
val numReads = seqStat(reader)
seqStat(new FastqReader(commandArgs.fastq))
summarize()
logger.debug(nucs)
// logger.debug(baseStats)
logger.info("Seqstat done")
val report: Map[String, Any] = Map(
("files",
Map(
("fastq", Map(
("path", commandArgs.fastq),
("checksum_sha1", "")
)
("path", commandArgs.fastq))
)
)
),
("stats", Map(
("bases", Map(
("num_n", nucleotideHistoMap.getOrElse('N', 0)),
("num_total", nucleotideHistoMap.values.sum),
("num_qual_gte", baseQualHistoMap.toMap),
("num_qual", baseHistogram.toList),
("nucleotides", nucleotideHistoMap.toMap)
)),
("reads", Map(
......@@ -334,12 +312,12 @@ object Seqstat extends ToolCommand {
("num_total", readStats.qual.sum),
("len_min", readStats.lengths.takeWhile(_ == 0).length),
("len_max", readStats.lengths.length - 1),
("num_qual_gte", readQualHistoMap.toMap),
("num_avg_qual_gte", readQualHistoMap.toMap),
("qual_encoding", phredEncoding.toString.toLowerCase)
))
))
)
println(ConfigUtils.mapToJson(report).spaces2)
println(ConfigUtils.mapToJson(report))
}
}
......@@ -27,9 +27,9 @@ import org.testng.annotations.{ DataProvider, Test }
import scala.collection.JavaConverters._
class SeqstatTest extends TestNGSuite with MockitoSugar with Matchers {
class SeqStatTest extends TestNGSuite with MockitoSugar with Matchers {
import nl.lumc.sasc.biopet.tools.Seqstat._
import nl.lumc.sasc.biopet.tools.SeqStat._
import nl.lumc.sasc.biopet.tools.FqEncoding._
private def resourceFile(p: String): File =
......@@ -59,7 +59,7 @@ class SeqstatTest extends TestNGSuite with MockitoSugar with Matchers {
def testSeqCountReads(fqMock: FastqReader) = {
when(fqMock.iterator) thenReturn recordsOver("1", "2", "3", "4", "5")
val seqstat = Seqstat
val seqstat = SeqStat
val numReads = seqstat.seqStat(fqMock)
numReads shouldBe 5
}
......@@ -67,7 +67,7 @@ class SeqstatTest extends TestNGSuite with MockitoSugar with Matchers {
@Test(dataProvider = "mockReaderProvider", groups = Array("phredscore"), singleThreaded = true, dependsOnGroups = Array("read"))
def testEncodingDetectionSanger(fqMock: FastqReader) = {
val seqstat = Seqstat
val seqstat = SeqStat
seqstat.summarize()
seqstat.phredEncoding shouldBe Sanger
......@@ -76,7 +76,7 @@ class SeqstatTest extends TestNGSuite with MockitoSugar with Matchers {
@Test(dataProvider = "mockReaderProvider", groups = Array("nucleocount"), singleThreaded = true, dependsOnGroups = Array("phredscore"))
def testEncodingNucleotideCount(fqMock: FastqReader) = {
val seqstat = Seqstat
val seqstat = SeqStat
nucleotideHistoMap('N') shouldEqual 5
nucleotideHistoMap('A') shouldEqual 5
nucleotideHistoMap('C') shouldEqual 5
......@@ -87,7 +87,7 @@ class SeqstatTest extends TestNGSuite with MockitoSugar with Matchers {
@Test(dataProvider = "mockReaderProvider", groups = Array("basehistogram"), singleThreaded = true, dependsOnGroups = Array("nucleocount"))
def testEncodingBaseHistogram(fqMock: FastqReader) = {
val seqstat = Seqstat
val seqstat = SeqStat
baseHistogram(40) shouldEqual 5
baseHistogram(39) shouldEqual 10
baseHistogram(34) shouldEqual 15
......
......@@ -54,7 +54,7 @@ object BiopetExecutablePublic extends BiopetExecutable {
nl.lumc.sasc.biopet.tools.BastyGenerateFasta,
nl.lumc.sasc.biopet.tools.MergeAlleles,
nl.lumc.sasc.biopet.tools.SamplesTsvToJson,
nl.lumc.sasc.biopet.tools.Seqstat,
nl.lumc.sasc.biopet.tools.SeqStat,
nl.lumc.sasc.biopet.tools.VepNormalizer,
nl.lumc.sasc.biopet.tools.AnnotateVcfWithBed,
nl.lumc.sasc.biopet.tools.VcfWithVcf)
......
......@@ -100,6 +100,50 @@ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(r
else ""
}
protected case class BasePositionStats(mean: Double, median: Double,
lowerQuartile: Double, upperQuartile: Double,
percentile10th: Double, percentile90th: Double) {
def toMap = Map(
"mean" -> mean,
"median" -> median,
"lower_quartile" -> lowerQuartile,
"upper_quartile" -> upperQuartile,
"percentile_10th" -> percentile10th,
"percentile_90th" -> percentile90th)
}
/**
* Retrieves the base quality per position values as computed by FastQc.
*/
def perBaseSequenceQuality: Map[String, Map[String, Double]] =
if (dataFile.exists) {
qcModules.get("Per base sequence quality") match {
case None => Map()
case Some(qcModule) =>
val tableContents = for {
line <- qcModule.lines if !(line.startsWith("#") || line.startsWith(">"));
values = line.split("\t") if values.size == 7
} yield (values(0), BasePositionStats(values(1).toDouble, values(2).toDouble, values(3).toDouble,
values(4).toDouble, values(5).toDouble, values(6).toDouble).toMap)
tableContents.toMap
}
} else Map()
def perBaseSequenceContent: Map[String, Map[String, Double]] =
if (dataFile.exists) {
qcModules.get("Per base sequence content") match {
case None => Map()
case Some(qcModule) =>
val bases = qcModule.lines.head.split("\t").tail
val tableContents = for {
line <- qcModule.lines if !(line.startsWith("#") || line.startsWith(">"));
values = line.split("\t") if values.size == 5
} yield (values(0), bases.zip(values.tail.map(_.toDouble)).toMap)
tableContents.toMap
}
} else Map()
/** Case class representing a known adapter sequence */
protected case class AdapterSequence(name: String, seq: String)
......@@ -159,7 +203,9 @@ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(r
outputFiles ++ Map("fastq_file" -> this.fastqfile)
}
def summaryStats: Map[String, Any] = Map()
def summaryStats: Map[String, Any] = Map(
"per_base_sequence_quality" -> perBaseSequenceQuality,
"per_base_sequence_content" -> perBaseSequenceContent)
}
object Fastqc {
......
......@@ -22,7 +22,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
import nl.lumc.sasc.biopet.core.{ SampleLibraryTag, BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.tools.Seqstat
import nl.lumc.sasc.biopet.tools.SeqStat
import nl.lumc.sasc.biopet.tools.FastqSync
class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with SampleLibraryTag {
......@@ -186,14 +186,14 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
deps_R2 ::= R2.get
}
val seqstat_R1 = Seqstat(this, R1, outDir)
val seqstat_R1 = SeqStat(this, R1, outDir)
seqstat_R1.isIntermediate = true
seqstat_R1.deps = deps_R1
add(seqstat_R1)
addSummarizable(seqstat_R1, "seqstat_R1")
if (paired) {
val seqstat_R2 = Seqstat(this, R2.get, outDir)
val seqstat_R2 = SeqStat(this, R2.get, outDir)
seqstat_R2.isIntermediate = true
seqstat_R2.deps = deps_R2
add(seqstat_R2)
......@@ -259,16 +259,16 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
if (paired) R2 = Some(sickle.output_R2)
}
val seqstat_R1_after = Seqstat(this, R1, outDir)
val seqstat_R1_after = SeqStat(this, R1, outDir)
seqstat_R1_after.deps = deps_R1
add(seqstat_R1_after)
addSummarizable(seqstat_R1_after, "seqstat_R1_after")
addSummarizable(seqstat_R1_after, "seqstat_R1_qc")
if (paired) {
val seqstat_R2_after = Seqstat(this, R2.get, outDir)
val seqstat_R2_after = SeqStat(this, R2.get, outDir)
seqstat_R2_after.deps = deps_R2
add(seqstat_R2_after)
addSummarizable(seqstat_R2_after, "seqstat_R2_after")
addSummarizable(seqstat_R2_after, "seqstat_R2_qc")
}
outputFiles += (chunk + "output_R1" -> R1)
......
......@@ -26,7 +26,7 @@ import org.testng.annotations.{ AfterClass, DataProvider, Test }
import nl.lumc.sasc.biopet.core.config.Config
import nl.lumc.sasc.biopet.extensions.{ Sickle, Gzip, Zcat }
import nl.lumc.sasc.biopet.tools.Seqstat
import nl.lumc.sasc.biopet.tools.SeqStat
import nl.lumc.sasc.biopet.tools.FastqSync
import nl.lumc.sasc.biopet.utils.ConfigUtils
......@@ -78,7 +78,7 @@ class FlexiprepTest extends TestNGSuite with Matchers {
else if (!paired && (skipClip && skipTrim)) 1
else if (paired && !(skipClip && skipTrim)) 4
else if (!paired && !(skipClip && skipTrim)) 2)
flexiprep.functions.count(_.isInstanceOf[Seqstat]) shouldBe (if (paired) 4 else 2)
flexiprep.functions.count(_.isInstanceOf[SeqStat]) shouldBe (if (paired) 4 else 2)
flexiprep.functions.count(_.isInstanceOf[Zcat]) shouldBe (if (zipped) (if (paired) 2 else 1) else 0)
flexiprep.functions.count(_.isInstanceOf[SeqtkSeq]) shouldBe (if (paired) 2 else 1)
flexiprep.functions.count(_.isInstanceOf[Cutadapt]) shouldBe (if (skipClip) 0 else (if (paired) 2 else 1))
......
......@@ -29,7 +29,7 @@ import nl.lumc.sasc.biopet.core.config.Config
import nl.lumc.sasc.biopet.extensions.bwa.{ BwaSamse, BwaSampe, BwaAln, BwaMem }
import nl.lumc.sasc.biopet.extensions.picard.{ MergeSamFiles, AddOrReplaceReadGroups, MarkDuplicates, SortSam }
import nl.lumc.sasc.biopet.pipelines.flexiprep.{ SeqtkSeq, Cutadapt, Fastqc }
import nl.lumc.sasc.biopet.tools.{ Seqstat, FastqSync }
import nl.lumc.sasc.biopet.tools.{ SeqStat, FastqSync }
import nl.lumc.sasc.biopet.utils.ConfigUtils
/**
......@@ -91,7 +91,7 @@ class MappingTest extends TestNGSuite with Matchers {
//Flexiprep
mapping.functions.count(_.isInstanceOf[Fastqc]) shouldBe (if (skipFlexiprep) 0 else if (paired) 4 else 2)
mapping.functions.count(_.isInstanceOf[Zcat]) shouldBe (if (!zipped || (chunks > 1 && skipFlexiprep)) 0 else if (paired) 2 else 1)
mapping.functions.count(_.isInstanceOf[Seqstat]) shouldBe ((if (skipFlexiprep) 0 else if (paired) 4 else 2) * chunks)
mapping.functions.count(_.isInstanceOf[SeqStat]) shouldBe ((if (skipFlexiprep) 0 else if (paired) 4 else 2) * chunks)
mapping.functions.count(_.isInstanceOf[SeqtkSeq]) shouldBe ((if (skipFlexiprep) 0 else if (paired) 2 else 1) * chunks)
mapping.functions.count(_.isInstanceOf[Cutadapt]) shouldBe ((if (skipFlexiprep) 0 else if (paired) 2 else 1) * chunks)
mapping.functions.count(_.isInstanceOf[FastqSync]) shouldBe ((if (skipFlexiprep) 0 else if (paired && !skipFlexiprep) 1 else 0) * chunks)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment