Commit f202827c authored by Peter van 't Hof
Browse files

Merge branch 'develop' into feature-summary

Conflicts:
	public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Seqstat.scala
	public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala
	public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala
	public/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepTest.scala
parents 2b9639f8 f04424c1
......@@ -21,6 +21,9 @@ import scala.io.Source
import nl.lumc.sasc.biopet.core.config.Config
import nl.lumc.sasc.biopet.utils.ConfigUtils._
/**
* This tool can convert a tsv to a json file
*/
object SamplesTsvToJson extends ToolCommand {
case class Args(inputFiles: List[File] = Nil) extends AbstractArgs
......@@ -29,13 +32,18 @@ object SamplesTsvToJson extends ToolCommand {
c.copy(inputFiles = x :: c.inputFiles)
} text ("Input must be a tsv file, first line is seen as header and must at least have a 'sample' column, 'library' column is optional, multiple files allowed")
}
/**
* Executes SamplesTsvToJson
* @param args
*/
def main(args: Array[String]): Unit = {
val argsParser = new OptParser
val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val fileMaps = for (inputFile <- commandArgs.inputFiles) yield {
val reader = Source.fromFile(inputFile)
val lines = reader.getLines.toList
val lines = reader.getLines.toList.filter(!_.isEmpty)
val header = lines.head.split("\t")
val sampleColumn = header.indexOf("sample")
val libraryColumn = header.indexOf("library")
......
/**
* Copyright (c) 2014 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl>
* @author Wai Yi Leung <w.y.leung@lumc.nl>
*
*/
package nl.lumc.sasc.biopet.tools
import java.io.File
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.collection.immutable.Map
import scala.io.Source
import scala.language.postfixOps
import htsjdk.samtools.fastq.{ FastqReader, FastqRecord }
import scalaz._, Scalaz._
import argonaut._, Argonaut._
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.utils.ConfigUtils
/**
* Seqstat function class for usage in Biopet pipelines
*
* @param root Configuration object for the pipeline
*/
/**
 * Seqstat wrapper for use inside Biopet pipelines: runs the Seqstat tool as a
 * java command-line job and redirects its stdout into a JSON report file.
 *
 * @param root Configuration object for the pipeline
 */
class Seqstat(val root: Configurable) extends BiopetJavaCommandLineFunction {
  javaMainClass = getClass.getName

  @Input(doc = "Input FASTQ", shortName = "input", required = true)
  var input: File = null

  @Output(doc = "Output JSON", shortName = "output", required = true)
  var output: File = null

  override val defaultVmem = "4G"
  memoryLimit = Option(3.0)

  /** The tool prints its report on stdout, so the command redirects it into `output`. */
  override def commandLine = super.commandLine + required("-i", input) + " > " + required(output)

  /**
   * Reads the generated JSON report back from `output` and returns its
   * "stats" object; returns jNull when the file cannot be parsed as JSON.
   */
  def summary: Json = {
    Parse.parseOption(Source.fromFile(output).mkString) match {
      case Some(report) => report.fieldOrEmptyObject("stats")
      case None         => jNull
    }
  }
}
/**
 * Known FASTQ quality encodings. The numeric id of each value is the ASCII
 * offset used to decode a raw quality character (0 marks an undetermined
 * encoding).
 */
object FqEncoding extends Enumeration {
  type FqEncoding = Value

  val Sanger  = Value(33, "Sanger")
  val Solexa  = Value(64, "Solexa")
  val Unknown = Value(0, "Unknown")
}
/**
 * Standalone Seqstat tool plus factory methods for the pipeline wrapper.
 *
 * NOTE(review): all statistics are accumulated in mutable fields on this
 * singleton object, so the tool is not re-entrant — a second run in the same
 * JVM would add onto the previous run's counters.
 */
object Seqstat extends ToolCommand {
  /**
   * Creates a configured Seqstat pipeline job.
   *
   * @param root   Configuration object for the pipeline
   * @param input  FASTQ file to collect statistics from
   * @param output Directory in which the JSON report is placed; the file name
   *               is the input's base name (extension stripped) + ".seqstats.json"
   */
  def apply(root: Configurable, input: File, output: File): Seqstat = {
    val seqstat = new Seqstat(root)
    seqstat.input = input
    seqstat.output = new File(output, input.getName.substring(0, input.getName.lastIndexOf(".")) + ".seqstats.json")
    seqstat
  }

  /** Same as the (File, File) overload, but with the output directory given as a path string. */
  def apply(root: Configurable, fastqfile: File, outDir: String): Seqstat = {
    val seqstat = new Seqstat(root)
    seqstat.input = fastqfile
    seqstat.output = new File(outDir, fastqfile.getName.substring(0, fastqfile.getName.lastIndexOf(".")) + ".seqstats.json")
    seqstat
  }

  /**
   * Merges a list of per-file Seqstat summary JSONs into one combined summary.
   *
   * Numeric leaves with the same key are summed, except "len_min" (the
   * minimum is kept) and "len_max" (the maximum is kept).
   *
   * NOTE(review): only one element of the `encoding` set is reported
   * ("encoding.head"); inputs with differing "qual_encoding" values are
   * silently collapsed to a single one. Also, a key missing from every input
   * (e.g. "num_n") will throw NoSuchElementException when rebuilding below.
   */
  def mergeSummaries(jsons: List[Json]): Json = {
    // Recursively folds every numeric field of `json` into `total`, keyed by
    // field name; nested objects are flattened into the same map.
    def addJson(json: Json, total: mutable.Map[String, Long]) {
      for (key <- json.objectFieldsOrEmpty) {
        if (json.field(key).get.isObject) addJson(json.field(key).get, total)
        else if (json.field(key).get.isNumber) {
          val number = json.field(key).get.numberOrZero.toLong
          if (total.contains(key)) {
            if (key == "len_min") {
              if (total(key) > number) total(key) = number
            } else if (key == "len_max") {
              if (total(key) < number) total(key) = number
            } else total(key) += number
          } else total += (key -> number)
        }
      }
    }

    var basesTotal: mutable.Map[String, Long] = mutable.Map()
    var readsTotal: mutable.Map[String, Long] = mutable.Map()
    var encoding: Set[Json] = Set()
    for (json <- jsons) {
      encoding += json.fieldOrEmptyString("qual_encoding")
      val bases = json.fieldOrEmptyObject("bases")
      addJson(bases, basesTotal)
      val reads = json.fieldOrEmptyObject("reads")
      addJson(reads, readsTotal)
    }
    // Rebuild the combined JSON report from the accumulated totals.
    ("bases" := (
      ("num_n" := basesTotal("num_n")) ->:
      ("num_total" := basesTotal("num_total")) ->:
      ("num_qual_gte" := (
        ("1" := basesTotal("1")) ->:
        ("10" := basesTotal("10")) ->:
        ("20" := basesTotal("20")) ->:
        ("30" := basesTotal("30")) ->:
        ("40" := basesTotal("40")) ->:
        ("50" := basesTotal("50")) ->:
        ("60" := basesTotal("60")) ->:
        jEmptyObject)) ->: jEmptyObject)) ->:
      ("reads" := (
        ("num_with_n" := readsTotal("num_with_n")) ->:
        ("num_total" := readsTotal("num_total")) ->:
        ("len_min" := readsTotal("len_min")) ->:
        ("len_max" := readsTotal("len_max")) ->:
        ("num_mean_qual_gte" := (
          ("1" := readsTotal("1")) ->:
          ("10" := readsTotal("10")) ->:
          ("20" := readsTotal("20")) ->:
          ("30" := readsTotal("30")) ->:
          ("40" := readsTotal("40")) ->:
          ("50" := readsTotal("50")) ->:
          ("60" := readsTotal("60")) ->:
          jEmptyObject)) ->: jEmptyObject)) ->:
      ("qual_encoding" := encoding.head) ->:
      jEmptyObject
  }

  import FqEncoding._

  // Detected quality encoding; defaults to Sanger until detectPhredEncoding runs.
  var phredEncoding: FqEncoding.Value = Sanger

  // Quality thresholds reported in the "num_qual_gte" sections of the output.
  val reportValues = List(1, 10, 20, 30, 40, 50, 60)

  // the base quality for each position on the reads
  var quals: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
  var nucs: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()

  // generate the baseHistogram and readHistogram
  var baseHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()
  var readHistogram: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer()

  // Per-nucleotide totals, keyed by the nucleotide character.
  var nucleotideHistoMap: mutable.Map[Char, Long] = mutable.Map()
  private var baseQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
  private var readQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)

  case class Args(fastq: File = new File("")) extends AbstractArgs

  class OptParser extends AbstractOptParser {
    head(
      s"""
|$commandName - Summarize FastQ
""".stripMargin)

    opt[File]('i', "fastq") required () valueName "<fastq>" action { (x, c) =>
      c.copy(fastq = x)
    } validate {
      x => if (x.exists) success else failure("FASTQ file not found")
    } text "FastQ file to generate stats from"
  }

  /**
   * Parses the command line arguments.
   *
   * @param args Array of arguments
   * @return parsed Args; exits the JVM with status 1 on a parse failure
   */
  def parseArgs(args: Array[String]): Args = new OptParser()
    .parse(args, Args())
    .getOrElse(sys.exit(1))

  /**
   * Detects the Phred encoding from the observed quality range and stores the
   * result in `phredEncoding`.
   *
   * @param quals Computed quality histogram [flat]
   */
  def detectPhredEncoding(quals: mutable.ArrayBuffer[Long]): Unit = {
    // subtract 1 on the high value, because we start from index 0
    // l_qual = lowest observed quality index (count of leading zero buckets);
    // h_qual = highest index present in the histogram.
    val l_qual = quals.takeWhile(_ == 0).length
    val h_qual = quals.length - 1

    (l_qual < 59, h_qual > 74) match {
      case (false, true) => {
        phredEncoding = Solexa
      }
      case (true, true) => {
        // TODO: check this later on
        // complex case, we cannot tell whether this is a sanger or solexa
        // but since the h_qual exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
        phredEncoding = Solexa
      }
      case (true, false) => {
        // this is definitely a sanger sequence, the lower end is sanger only
        phredEncoding = Sanger
      }
      case (_, _) => {
        phredEncoding = Unknown
      }
    }
  }

  // Setting up the internal storage for the statistics gathered for each read
  // 'nuc' are the nucleotides 'ACTGN', the max ASCII value for this is T, pre-init the ArrayBuffer to this value
  // as we don't expect to have other 'higher' numbered nucleotides for now.
  case class BaseStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
                      nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0))

  case class ReadStat(qual: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer(),
                      nuc: mutable.ArrayBuffer[Long] = mutable.ArrayBuffer.fill('T'.toInt + 1)(0),
                      var withN: Long,
                      lengths: mutable.ArrayBuffer[Int] = mutable.ArrayBuffer())

  // One BaseStat per read position; a single ReadStat for whole-read metrics.
  val baseStats: mutable.ArrayBuffer[BaseStat] = mutable.ArrayBuffer()
  val readStats: ReadStat = new ReadStat(mutable.ArrayBuffer(),
    mutable.ArrayBuffer.fill('T'.toInt + 1)(0),
    0L,
    mutable.ArrayBuffer())

  /**
   * Compute the quality metric per read
   * Results are stored in baseStats and readStats
   *
   * @param record FastqRecord
   */
  def procesRead(record: FastqRecord): Unit = {
    // Adjust/expand the length of baseStat case classes to the size of current
    // read if the current list is not long enough to store the data
    if (baseStats.length < record.length) {
      baseStats ++= mutable.ArrayBuffer.fill(record.length - baseStats.length)(BaseStat())
    }

    // `lengths` is indexed by the read length itself, so it needs
    // record.length + 1 slots (hence the extra +1 in the fill count).
    if (readStats.lengths.length < record.length) {
      readStats.lengths ++= mutable.ArrayBuffer.fill(record.length - readStats.lengths.length + 1)(0)
    }

    val readQual = record.getBaseQualityString
    val readNucleotides = record.getReadString

    readStats.lengths(record.length) += 1

    for (t <- 0 until record.length()) {
      // Grow the per-position quality histogram so the raw ASCII value of the
      // quality character can be used directly as an index.
      if (baseStats(t).qual.length <= readQual(t)) {
        baseStats(t).qual ++= mutable.ArrayBuffer.fill(readQual(t).toInt - baseStats(t).qual.length + 1)(0)
      }
      baseStats(t).qual(readQual(t)) += 1
      baseStats(t).nuc(readNucleotides(t)) += 1
      readStats.nuc(readNucleotides(t)) += 1
    }

    // implicit conversion to Int using foldLeft(0)
    // Mean raw quality of the read (integer division of summed ASCII values).
    val avgQual: Int = (readQual.sum / readQual.length)
    if (readStats.qual.length <= avgQual) {
      readStats.qual ++= mutable.ArrayBuffer.fill(avgQual - readStats.qual.length + 1)(0)
    }
    readStats.qual(avgQual) += 1
    readStats.withN += {
      if (readNucleotides.contains("N")) 1L
      else 0L
    }
  }

  /**
   * seqStat, the compute entrypoint where all statistics collection starts
   *
   * @param fqreader FastqReader
   * @return numReads - number of reads counted
   */
  def seqStat(fqreader: FastqReader): Long = {
    var numReads: Long = 0
    for (read <- fqreader.iterator.asScala) {
      procesRead(read)
      numReads += 1
    }
    numReads
  }

  /**
   * Folds the per-position stats gathered by procesRead into the flat
   * object-level histograms (quals, nucs, baseHistogram, readHistogram,
   * nucleotideHistoMap and the private quality maps).
   */
  def summarize(): Unit = {
    // for every position to the max length of any read
    for (pos <- 0 until baseStats.length) {
      // list all qualities at this particular position `pos`
      // fix the length of `quals`
      if (quals.length <= baseStats(pos).qual.length) {
        quals ++= mutable.ArrayBuffer.fill(baseStats(pos).qual.length - quals.length)(0)
      }
      if (nucs.length <= baseStats(pos).nuc.length) {
        for (_ <- nucs.length until baseStats(pos).nuc.length) nucs.append(0)
      }
      // count into the quals
      baseStats(pos).qual.zipWithIndex foreach { case (value, index) => quals(index) += value }
      // count N into nucs
      baseStats(pos).nuc.zipWithIndex foreach { case (value, index) => nucs(index) += value }
    }
    detectPhredEncoding(quals)

    for (pos <- 0 until nucs.length) {
      // always export the N-nucleotide
      if (nucs(pos) > 0 || pos.toChar == 'N') {
        nucleotideHistoMap += (pos.toChar -> nucs(pos))
      }
    }

    // init baseHistogram with the boundaries of the report values
    for (pos <- 0 until reportValues.max + 1) {
      baseHistogram.append(0)
      readHistogram.append(0)
    }

    // Build a cumulative histogram: a base with Phred score `key` increments
    // every bucket 0..key, so bucket i ends up counting bases with score >= i.
    // Bases whose score equals the encoding offset (key == 0) are excluded.
    for (pos <- 0 until quals.length) {
      var key: Int = pos - phredEncoding.id
      if (key > 0) {
        // count till the max of baseHistogram.length
        // NOTE(review): a score above reportValues.max (60) would index past
        // the histogram and throw — assumed not to occur; confirm with data.
        for (histokey <- 0 until key + 1) {
          baseHistogram(histokey) += quals(pos)
        }
      }
    }
    for (pos <- 0 until baseHistogram.length) {
      baseQualHistoMap += (pos -> baseHistogram(pos))
    }

    // Same cumulative scheme for whole-read mean qualities.
    for (pos <- 0 until readStats.qual.length) {
      var key: Int = pos - phredEncoding.id
      if (key > 0) {
        // count till the max of baseHistogram.length
        for (histokey <- 0 until key + 1) {
          readHistogram(histokey) += readStats.qual(pos)
        }
      }
    }
    for (pos <- 0 until readHistogram.length) {
      readQualHistoMap += (pos -> readHistogram(pos))
    }
  }

  /**
   * Command-line entry point: reads the FASTQ, summarizes, and prints the
   * JSON report to stdout.
   */
  def main(args: Array[String]): Unit = {
    val commandArgs: Args = parseArgs(args)

    logger.info("Start seqstat")

    val reader = new FastqReader(commandArgs.fastq)
    val numReads = seqStat(reader)
    summarize()

    logger.debug(nucs)
    // logger.debug(baseStats)
    logger.info("Seqstat done")

    // Assemble the report as plain Scala maps; converted to JSON below.
    val report: Map[String, Any] = Map(
      ("files",
        Map(
          ("fastq", Map(
            ("path", commandArgs.fastq),
            ("checksum_sha1", "")
          )
          )
        )
      ),
      ("stats", Map(
        ("bases", Map(
          ("num_n", nucleotideHistoMap('N')),
          ("num_total", nucleotideHistoMap.values.sum),
          ("num_qual_gte", baseQualHistoMap.toMap),
          ("nucleotides", nucleotideHistoMap.toMap)
        )),
        ("reads", Map(
          ("num_with_n", readStats.withN),
          ("num_total", readStats.qual.sum),
          ("len_min", readStats.lengths.takeWhile(_ == 0).length),
          ("len_max", readStats.lengths.length - 1),
          ("num_qual_gte", readQualHistoMap.toMap),
          ("qual_encoding", phredEncoding.toString.toLowerCase)
        ))
      ))
    )

    val jsonReport: Json = {
      ConfigUtils.mapToJson(report)
    }
    println(jsonReport.spaces2)
  }
}
/**
* Copyright (c) 2015 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl>
* @author Wai Yi Leung <w.y.leung@lumc.nl>
*/
package nl.lumc.sasc.biopet.tools
import java.io.File
import java.nio.file.Paths
import htsjdk.samtools.fastq.{ FastqReader, FastqRecord }
import org.mockito.Mockito.{ inOrder => inOrd, when }
import org.scalatest.Matchers
import org.scalatest.mock.MockitoSugar
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.{ DataProvider, Test }
import scala.collection.JavaConverters._
/**
 * Unit tests for the Seqstat tool object.
 *
 * NOTE(review): the Seqstat object accumulates statistics in shared mutable
 * state, so these tests are singleThreaded and chained via dependsOnGroups —
 * later groups assert on state built up by the earlier ones.
 */
class SeqstatTest extends TestNGSuite with MockitoSugar with Matchers {

  import nl.lumc.sasc.biopet.tools.Seqstat._
  import nl.lumc.sasc.biopet.tools.FqEncoding._

  /** Resolves a classpath test resource to a File. */
  private def resourceFile(p: String): File =
    new File(resourcePath(p))

  /** Absolute filesystem path of a classpath test resource. */
  private def resourcePath(p: String): String =
    Paths.get(getClass.getResource(p).toURI).toString

  // Helper function to create an iterator over FastqRecords given their IDs.
  // Every record reads "ACGTN" with quality string "HIBC!" (Sanger encoding).
  private def recordsOver(ids: String*): java.util.Iterator[FastqRecord] = ids
    .map(x => new FastqRecord(x, "ACGTN", "", "HIBC!"))
    .toIterator.asJava

  @DataProvider(name = "mockReaderProvider")
  def mockReaderProvider() =
    Array(
      Array(mock[FastqReader])
    )

  @Test(dataProvider = "mockReaderProvider", groups = Array("sanger"), singleThreaded = true)
  def testDefault(fqMock: FastqReader) = {
    when(fqMock.iterator) thenReturn recordsOver("1", "2", "3")
  }

  // seqStat must count each of the 5 mocked reads exactly once.
  @Test(dataProvider = "mockReaderProvider", groups = Array("read"), singleThreaded = true)
  def testSeqCountReads(fqMock: FastqReader) = {
    when(fqMock.iterator) thenReturn recordsOver("1", "2", "3", "4", "5")

    val seqstat = Seqstat
    val numReads = seqstat.seqStat(fqMock)
    numReads shouldBe 5
  }

  // summarize() runs over the state accumulated by the "read" group above.
  @Test(dataProvider = "mockReaderProvider", groups = Array("phredscore"), singleThreaded = true, dependsOnGroups = Array("read"))
  def testEncodingDetectionSanger(fqMock: FastqReader) = {
    val seqstat = Seqstat
    seqstat.summarize()

    seqstat.phredEncoding shouldBe Sanger
  }

  // 5 reads of "ACGTN" -> every nucleotide is observed exactly 5 times.
  @Test(dataProvider = "mockReaderProvider", groups = Array("nucleocount"), singleThreaded = true, dependsOnGroups = Array("phredscore"))
  def testEncodingNucleotideCount(fqMock: FastqReader) = {
    val seqstat = Seqstat
    nucleotideHistoMap('N') shouldEqual 5
    nucleotideHistoMap('A') shouldEqual 5
    nucleotideHistoMap('C') shouldEqual 5
    nucleotideHistoMap('T') shouldEqual 5
    nucleotideHistoMap('G') shouldEqual 5
  }

  // baseHistogram is cumulative: bucket i counts bases with Phred score >= i
  // (bases scoring 0, the '!' characters, are excluded — hence 20, not 25, at index 0).
  @Test(dataProvider = "mockReaderProvider", groups = Array("basehistogram"), singleThreaded = true, dependsOnGroups = Array("nucleocount"))
  def testEncodingBaseHistogram(fqMock: FastqReader) = {
    val seqstat = Seqstat
    baseHistogram(40) shouldEqual 5
    baseHistogram(39) shouldEqual 10
    baseHistogram(34) shouldEqual 15
    baseHistogram(33) shouldEqual 20
    baseHistogram(0) shouldEqual 20
  }

  // Minimal argument parsing round-trip: -i <fastq> resolves to the resource file.
  @Test def testArgsMinimum() = {
    val args = Array(
      "-i", resourcePath("/paired01a.fq"))
    val parsed = parseArgs(args)
    parsed.fastq shouldBe resourceFile("/paired01a.fq")
  }
}
\ No newline at end of file
......@@ -49,6 +49,7 @@ object BiopetExecutablePublic extends BiopetExecutable {
nl.lumc.sasc.biopet.tools.BastyGenerateFasta,
nl.lumc.sasc.biopet.tools.MergeAlleles,
nl.lumc.sasc.biopet.tools.SamplesTsvToJson,
nl.lumc.sasc.biopet.tools.Seqstat,
nl.lumc.sasc.biopet.tools.VEPNormalizer,
nl.lumc.sasc.biopet.tools.AnnotateVcfWithBed)
}
......@@ -84,7 +84,6 @@ object CarpTest {
val executables = Map(
"reference" -> "test",
"seqstat" -> Map("exe" -> "test"),
"fastqc" -> Map("exe" -> "test"),
"seqtk" -> Map("exe" -> "test"),
"sickle" -> Map("exe" -> "test"),
......
......@@ -21,7 +21,8 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
import nl.lumc.sasc.biopet.core.{ SampleLibraryTag, BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat }
import nl.lumc.sasc.biopet.tools.Seqstat
import nl.lumc.sasc.biopet.tools.FastqSync
class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with SampleLibraryTag {
......
......@@ -11,6 +11,7 @@ import org.testng.annotations.{ AfterClass, DataProvider, Test }
import nl.lumc.sasc.biopet.core.config.Config
import nl.lumc.sasc.biopet.extensions.{ Sickle, Gzip, Zcat }
import nl.lumc.sasc.biopet.tools.Seqstat
import nl.lumc.sasc.biopet.tools.FastqSync
import nl.lumc.sasc.biopet.utils.ConfigUtils
......@@ -62,6 +63,7 @@ class FlexiprepTest extends TestNGSuite with Matchers {
else if (!paired && (skipClip && skipTrim)) 1
else if (paired && !(skipClip && skipTrim)) 4
else if (!paired && !(skipClip && skipTrim)) 2)
flexiprep.functions.count(_.isInstanceOf[Seqstat]) shouldBe (if (paired) 4 else 2)
flexiprep.functions.count(_.isInstanceOf[Zcat]) shouldBe (if (zipped) (if (paired) 2 else 1) else 0)
flexiprep.functions.count(_.isInstanceOf[SeqtkSeq]) shouldBe (if (paired) 2 else 1)
flexiprep.functions.count(_.isInstanceOf[Cutadapt]) shouldBe (if (skipClip) 0 else (if (paired) 2 else 1))
......@@ -86,4 +88,4 @@ object FlexiprepTest {
"sickle" -> Map("exe" -> "test"),
"cutadapt" -> Map("exe" -> "test")
)