Commit b9350819 authored by Wai Yi Leung's avatar Wai Yi Leung
Browse files

Merge branch 'tool-checkFastq-pair' into 'develop'

New tool to validate if paired end sequences are in sync

The tools has been checked with 4 different fastq files and with randomly removing one line. In all cases the tool did what it is supposed to do. Probably some extra features might be build in, but that comes later.

See merge request !317
parents d19d4308 5fde3293
......@@ -111,6 +111,11 @@
<artifactId>Basty</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetToolsPackage</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
......
......@@ -40,29 +40,5 @@ object BiopetExecutablePublic extends BiopetExecutable {
nl.lumc.sasc.biopet.pipelines.basty.Basty
) ::: publicPipelines
def tools: List[MainCommand] = List(
nl.lumc.sasc.biopet.tools.MergeTables,
nl.lumc.sasc.biopet.tools.WipeReads,
nl.lumc.sasc.biopet.tools.ExtractAlignedFastq,
nl.lumc.sasc.biopet.tools.FastqSync,
nl.lumc.sasc.biopet.tools.BiopetFlagstat,
nl.lumc.sasc.biopet.tools.CheckAllelesVcfInBam,
nl.lumc.sasc.biopet.tools.VcfToTsv,
nl.lumc.sasc.biopet.tools.VcfFilter,
nl.lumc.sasc.biopet.tools.VcfStats,
nl.lumc.sasc.biopet.tools.FindRepeatsPacBio,
nl.lumc.sasc.biopet.tools.MpileupToVcf,
nl.lumc.sasc.biopet.tools.FastqSplitter,
nl.lumc.sasc.biopet.tools.BedtoolsCoverageToCounts,
nl.lumc.sasc.biopet.tools.SageCountFastq,
nl.lumc.sasc.biopet.tools.SageCreateLibrary,
nl.lumc.sasc.biopet.tools.SageCreateTagCounts,
nl.lumc.sasc.biopet.tools.BastyGenerateFasta,
nl.lumc.sasc.biopet.tools.MergeAlleles,
nl.lumc.sasc.biopet.tools.SamplesTsvToJson,
nl.lumc.sasc.biopet.tools.SeqStat,
nl.lumc.sasc.biopet.tools.VepNormalizer,
nl.lumc.sasc.biopet.tools.AnnotateVcfWithBed,
nl.lumc.sasc.biopet.tools.VcfWithVcf,
nl.lumc.sasc.biopet.tools.KrakenReportToJson)
def tools: List[MainCommand] = BiopetToolsExecutable.tools
}
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.tools
import java.io.File
import nl.lumc.sasc.biopet.core.{ Reference, ToolCommandFunction }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
class ValidateFastq(val root: Configurable) extends ToolCommandFunction {
def toolObject = nl.lumc.sasc.biopet.tools.ValidateFastq
@Input(doc = "Input R1 fastq file", required = true)
var r1Fastq: File = _
@Input(doc = "Input R1 fastq file", required = false)
var r2Fastq: Option[File] = None
override def cmdLine = super.cmdLine +
required("-i", r1Fastq) +
optional("-j", r2Fastq)
}
......@@ -44,5 +44,7 @@ object BiopetToolsExecutable extends BiopetExecutable {
nl.lumc.sasc.biopet.tools.SeqStat,
nl.lumc.sasc.biopet.tools.VepNormalizer,
nl.lumc.sasc.biopet.tools.AnnotateVcfWithBed,
nl.lumc.sasc.biopet.tools.VcfWithVcf)
nl.lumc.sasc.biopet.tools.VcfWithVcf,
nl.lumc.sasc.biopet.tools.ValidateFastq,
nl.lumc.sasc.biopet.tools.KrakenReportToJson)
}
package nl.lumc.sasc.biopet.tools
import java.io.File
import htsjdk.samtools.fastq.{ FastqRecord, FastqReader }
import nl.lumc.sasc.biopet.utils.ToolCommand
import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer
/**
* Created by sajvanderzeeuw on 2-2-16.
*/
object ValidateFastq extends ToolCommand {
/**
* Args for commandline program
* @param input input first fastq file (R1) (can be zipped)
* @param input2 input second fastq file (R2) (can be zipped)
*/
case class Args(input: File = null, input2: Option[File] = None) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('i', "fastq1") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(input = x)
}
opt[File]('j', "fastq2") maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(input2 = Some(x))
}
}
def main(args: Array[String]): Unit = {
//Start analyses of fastq files
logger.info("Start")
//parse all possible options into OptParser
val argsParser = new OptParser
val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
//read in fastq file 1 and if present fastq file 2
val readFq1 = new FastqReader(cmdArgs.input)
val readFq2 = cmdArgs.input2.map(new FastqReader(_))
//define a counter to track the number of objects passing through the for loop
var counter = 0
try {
//Iterate over the fastq file check for the length of both files if not correct, exit the tool and print error message
var lastRecordR1: Option[FastqRecord] = None
var lastRecordR2: Option[FastqRecord] = None
for (recordR1 <- readFq1.iterator()) {
counter += 1
if (readFq2.map(_.hasNext) == Some(false))
throw new IllegalStateException("R2 contains less reads then R1")
//Getting R2 record, None if it's single end
val recordR2 = readFq2.map(_.next())
validFastqRecord(recordR1)
duplicateCheck(recordR1, lastRecordR1)
//Here we check if the readnames of both files are concordant, and if the sequence content are correct DNA/RNA sequences
recordR2 match {
case Some(recordR2) => // Paired End
validFastqRecord(recordR2)
duplicateCheck(recordR2, lastRecordR2)
checkMate(recordR1, recordR2)
case _ => // Single end
}
if (counter % 1e5 == 0) logger.info(counter + " reads processed")
lastRecordR1 = Some(recordR1)
lastRecordR2 = recordR2
}
//if R2 is longer then R1 print an error code and exit the tool
if (readFq2.map(_.hasNext) == Some(true))
throw new IllegalStateException("R2 contains more reads then R1")
logger.info(s"Possible quality encodings found: ${getPossibleEncodings.mkString(", ")}")
logger.info(s"Done processing ${counter} fastq records, no errors found")
} catch {
case e: IllegalStateException =>
logger.error(s"Error found at readnumber: $counter, linenumber ${(counter * 4) - 3}")
logger.error(e.getMessage)
}
//close both iterators
readFq1.close()
readFq2.foreach(_.close())
}
private[tools] var minQual: Option[Char] = None
private[tools] var maxQual: Option[Char] = None
/**
*
* @param record
* @throws IllegalStateException
*/
private[tools] def checkQualEncoding(record: FastqRecord): Unit = {
val min = record.getBaseQualityString.min
val max = record.getBaseQualityString.max
if (!minQual.exists(_ <= min)) {
minQual = Some(min)
getPossibleEncodings
}
if (!maxQual.exists(_ >= max)) {
maxQual = Some(max)
getPossibleEncodings
}
}
/**
*
* @return
* @throws IllegalStateException
*/
private[tools] def getPossibleEncodings: List[String] = {
val buffer: ListBuffer[String] = ListBuffer()
(minQual, maxQual) match {
case (Some(min), Some(max)) =>
if (min >= '!' && max <= 'I') buffer += "Sanger"
if (min >= ';' && max <= 'h') buffer += "Solexa"
if (min >= '@' && max <= 'h') buffer += "Illumina 1.3+"
if (min >= 'C' && max <= 'h') buffer += "Illumina 1.5+"
if (min >= '!' && max <= 'J') buffer += "Illumina 1.8+"
if (buffer.isEmpty)
throw new IllegalStateException(s"No possible quality encoding found. minQual: '$min', maxQual: '$max'")
case _ =>
}
buffer.toList
}
val allowedBases = """([actgnACTGN+]+)""".r
/**
* This function checks for duplicates.
* @param current
* @param before
* @throws IllegalStateException
*/
def duplicateCheck(current: FastqRecord, before: Option[FastqRecord]): Unit = {
if (before.exists(_.getReadHeader == current.getReadHeader))
throw new IllegalStateException("Duplicate read ID found")
}
/**
*
* @param record
* @throws IllegalStateException
*/
def validFastqRecord(record: FastqRecord): Unit = {
checkQualEncoding(record)
record.getReadString match {
case allowedBases(m) =>
case _ => throw new IllegalStateException(s"Non IUPAC symbols identified")
}
if (record.getReadString.size != record.getBaseQualityString.size)
throw new IllegalStateException(s"Sequence length does not match quality length")
}
/**
*
* @param r1
* @param r2
* @throws IllegalStateException
*/
def checkMate(r1: FastqRecord, r2: FastqRecord): Unit = {
val id1 = r1.getReadHeader.takeWhile(_ != ' ')
val id2 = r2.getReadHeader.takeWhile(_ != ' ')
if (!(id1 == id2 ||
id1.stripSuffix("/1") == id2.stripSuffix("/2") ||
id1.stripSuffix(".1") == id2.stripSuffix(".2")))
throw new IllegalStateException(s"Sequence headers do not match. R1: '${r1.getReadHeader}', R2: '${r2.getReadHeader}'")
}
}
\ No newline at end of file
@r01/2 hello
T
+
I
@r02/2
A
+
H
@r03/2
C
+
I
@r04/2
G
+
H
@r05/2
T
+
I
@r06/2
T
+
I
package nl.lumc.sasc.biopet.tools
import java.io.{OutputStream, PrintStream, ByteArrayOutputStream}
import java.nio.file.Paths
import htsjdk.samtools.fastq.FastqRecord
import nl.lumc.sasc.biopet.utils.Logging
import org.apache.log4j.{FileAppender, Appender}
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.{DataProvider, Test}
import scala.collection.JavaConversions._
/**
* Created by pjvan_thof on 2/17/16.
*/
class ValidateFastqTest extends TestNGSuite with Matchers {
@Test
def testCheckMate: Unit = {
ValidateFastq.checkMate(new FastqRecord("read_1", "ATCG", "", "AAAA"), new FastqRecord("read_1", "ATCG", "", "AAAA"))
intercept[IllegalStateException] {
ValidateFastq.checkMate(new FastqRecord("read_1", "ATCG", "", "AAAA"), new FastqRecord("read_2", "ATCG", "", "AAAA"))
}
}
@Test
def testDuplicateCheck: Unit = {
ValidateFastq.duplicateCheck(new FastqRecord("read_1", "ATCG", "", "AAAA"), None)
ValidateFastq.duplicateCheck(new FastqRecord("read_1", "ATCG", "", "AAAA"), Some(new FastqRecord("read_2", "ATCG", "", "AAAA")))
intercept[IllegalStateException] {
ValidateFastq.duplicateCheck(new FastqRecord("read_1", "ATCG", "", "AAAA"), Some(new FastqRecord("read_1", "ATCG", "", "AAAA")))
}
}
@DataProvider(name = "providerGetPossibleEncodings")
def providerGetPossibleEncodings = Array(
Array(None, None, Nil),
Array(Some('A'), None, Nil),
Array(None, Some('A'), Nil),
Array(Some('E'), Some('E'), List("Sanger", "Solexa", "Illumina 1.3+", "Illumina 1.5+", "Illumina 1.8+")),
Array(Some('+'), Some('+'), List("Sanger", "Illumina 1.8+")),
Array(Some('!'), Some('I'), List("Sanger", "Illumina 1.8+")),
Array(Some('!'), Some('J'), List("Illumina 1.8+")),
Array(Some(';'), Some('h'), List("Solexa")),
Array(Some('@'), Some('h'), List("Solexa", "Illumina 1.3+")),
Array(Some('C'), Some('h'), List("Solexa", "Illumina 1.3+", "Illumina 1.5+"))
)
@Test(dataProvider = "providerGetPossibleEncodings")
def testGetPossibleEncodings(min: Option[Char], max: Option[Char], output: List[String]): Unit = {
ValidateFastq.minQual = min
ValidateFastq.maxQual = max
ValidateFastq.getPossibleEncodings shouldBe output
}
@Test
def testGetPossibleEncodingsFail: Unit = {
intercept[IllegalStateException] {
ValidateFastq.minQual = Some('!')
ValidateFastq.maxQual = Some('h')
ValidateFastq.getPossibleEncodings
}
}
@Test
def testCheckQualEncoding: Unit = {
ValidateFastq.minQual = None
ValidateFastq.maxQual = None
ValidateFastq.checkQualEncoding(new FastqRecord("read_1", "ATCG", "", "AAAA"))
intercept[IllegalStateException] {
ValidateFastq.minQual = None
ValidateFastq.maxQual = None
ValidateFastq.checkQualEncoding(new FastqRecord("read_1", "ATCG", "", "A!hA"))
}
intercept[IllegalStateException] {
ValidateFastq.minQual = None
ValidateFastq.maxQual = None
ValidateFastq.checkQualEncoding(new FastqRecord("read_1", "ATCG", "", "hhhh"))
ValidateFastq.checkQualEncoding(new FastqRecord("read_1", "ATCG", "", "!!!!"))
}
}
@Test
def testValidFastqRecord: Unit = {
ValidateFastq.minQual = None
ValidateFastq.maxQual = None
ValidateFastq.validFastqRecord(new FastqRecord("read_1", "ATCG", "", "AAAA"))
intercept[IllegalStateException] {
ValidateFastq.validFastqRecord(new FastqRecord("read_1", "ATCG", "", "AAA"))
}
intercept[IllegalStateException] {
ValidateFastq.validFastqRecord(new FastqRecord("read_1", "ATYG", "", "AAAA"))
}
}
private def resourcePath(p: String): String =
Paths.get(getClass.getResource(p).toURI).toString
@Test
def testMain: Unit = {
ValidateFastq.minQual = None
ValidateFastq.maxQual = None
val r1 = resourcePath("/paired01a.fq")
val r2 = resourcePath("/paired01b.fq")
ValidateFastq.main(Array("-i", r1, "-j", r2))
//TODO: capture logs
ValidateFastq.minQual = None
ValidateFastq.maxQual = None
val r2fail = resourcePath("/paired01c.fq")
ValidateFastq.main(Array("-i", r1, "-j", r2fail))
}
}
......@@ -69,8 +69,8 @@ class VcfFilterTest extends TestNGSuite with MockitoSugar with Matchers {
@Test def testMustHaveGenotypes() = {
/**
* This should simply not raise an exception
*/
* This should simply not raise an exception
*/
val tmp = File.createTempFile("VCfFilter", ".vcf.gz")
tmp.deleteOnExit()
val tmp_path = tmp.getAbsolutePath
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines.flexiprep
import java.io.File
import nl.lumc.sasc.biopet.core.summary.WriteSummary
import org.broadinstitute.gatk.queue.function.InProcessFunction
import org.broadinstitute.gatk.utils.commandline.{ Argument, Input }
import scala.io.Source
/**
* This class checks md5sums and give an exit code 1 when md5sum is not the same
*
* Created by pjvanthof on 16/08/15.
*/
class CheckValidateFastq extends InProcessFunction {
@Input(required = true)
var inputLogFile: File = _
/** Exits whenever the input md5sum is not the same as the output md5sum */
def run: Unit = {
val reader = Source.fromFile(inputLogFile)
reader.getLines().foreach { line =>
if (line.startsWith("ERROR")) {
logger.error("Corrupt fastq file found, aborting pipeline")
// 130 Simulates a ctr-C
Runtime.getRuntime.halt(130)
}
}
reader.close()
}
}
\ No newline at end of file
......@@ -20,7 +20,7 @@ import nl.lumc.sasc.biopet.core.{ BiopetFifoPipe, PipelineCommand, SampleLibrary
import nl.lumc.sasc.biopet.extensions.{ Zcat, Gzip }
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.utils.IoUtils._
import nl.lumc.sasc.biopet.extensions.tools.{ SeqStat, FastqSync }
import nl.lumc.sasc.biopet.extensions.tools.{ ValidateFastq, SeqStat, FastqSync }
import org.broadinstitute.gatk.queue.QScript
......@@ -114,6 +114,17 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
addSummarizable(fastqc_R1, "fastqc_R1")
outputFiles += ("fastqc_R1" -> fastqc_R1.output)
val validateFastq = new ValidateFastq(this)
validateFastq.r1Fastq = input_R1
validateFastq.r2Fastq = input_R2
validateFastq.jobOutputFile = new File(outputDir, ".validate_fastq.log.out")
add(validateFastq)
val checkValidateFastq = new CheckValidateFastq
checkValidateFastq.inputLogFile = validateFastq.jobOutputFile
checkValidateFastq.jobOutputFile = new File(outputDir, ".check.validate_fastq.log.out")
add(checkValidateFastq)
if (paired) {
fastqc_R2 = Fastqc(this, input_R2.get, new File(outputDir, R2_name + ".fastqc/"))
add(fastqc_R2)
......@@ -254,6 +265,17 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
}
}
val validateFastq = new ValidateFastq(this)
validateFastq.r1Fastq = fastqR1Qc
validateFastq.r2Fastq = fastqR2Qc
validateFastq.jobOutputFile = new File(outputDir, ".validate_fastq.qc.log.out")
add(validateFastq)
val checkValidateFastq = new CheckValidateFastq
checkValidateFastq.inputLogFile = validateFastq.jobOutputFile
checkValidateFastq.jobOutputFile = new File(outputDir, ".check.validate_fastq.qc.log.out")
add(checkValidateFastq)
outputFiles += ("output_R1_gzip" -> fastqR1Qc)
if (paired) outputFiles += ("output_R2_gzip" -> fastqR2Qc.get)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment