diff --git a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/CheckFastqPairs.scala b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/CheckFastqPairs.scala index 4fa87fd05957fc3a0715eea69f122a66ddeac15c..3a8d075bf28cee7bbe7dd47670d7f28811bc5ab0 100644 --- a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/CheckFastqPairs.scala +++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/CheckFastqPairs.scala @@ -2,7 +2,7 @@ package nl.lumc.sasc.biopet.tools import java.io.File -import htsjdk.samtools.fastq.FastqReader +import htsjdk.samtools.fastq.{FastqRecord, FastqReader} import nl.lumc.sasc.biopet.utils.ToolCommand import scala.collection.JavaConversions._ @@ -15,10 +15,9 @@ object CheckFastqPairs extends ToolCommand { * Args for commandline program * @param input input first fastq file (R1) (can be zipped) * @param input2 input second fastq file (R2) (can be zipped) - * @param output output fastq file (can be zipped) */ - case class Args(input: File = null, input2: Option[File] = None, output: File = null) extends AbstractArgs + case class Args(input: File = null, input2: Option[File] = None) extends AbstractArgs class OptParser extends AbstractOptParser { opt[File]('i', "fastq1") required () maxOccurs 1 valueName "<file>" action { (x, c) => @@ -27,9 +26,6 @@ object CheckFastqPairs extends ToolCommand { opt[File]('j', "fastq2") maxOccurs 1 valueName "<file>" action { (x, c) => c.copy(input2 = Some(x)) } - opt[File]('o', "output") required () maxOccurs 1 valueName "<file>" action { (x, c) => - c.copy(output = x) - } } def main(args: Array[String]): Unit = { @@ -45,63 +41,77 @@ object CheckFastqPairs extends ToolCommand { val readFq2 = cmdArgs.input2.map(new FastqReader(_)) //define a counter to track the number of objects passing through the for loop - var counter = 0 - - //Iterate over the fastq file check for the length of both files if not correct, exit the tool and print error message - for (recordR1 <- readFq1.iterator()) { - if (readFq2.map(_.hasNext) == Some(false)) - throw new IllegalStateException("R2 has less reads then R1") - - //Getting R2 record, None if it's single end - val recordR2 = readFq2.map(_.next()) - - //Here we check if the readnames of both files are concordant, and if the sequence content are correct DNA/RNA sequences - recordR2 match { - case Some(recordR2) => // Paired End - val readHeader = recordR1.getReadHeader - val readHeader2 = recordR2.getReadHeader - val readSeq = recordR1.getReadString - val readSeq2 = recordR2.getReadString - val id1 = readHeader.takeWhile(_ != ' ') - val id2 = readHeader2.takeWhile(_ != ' ') - - if (counter % 1e5 == 0) logger.info(counter + " reads processed") + var counter = 0 - val allowedBases = """([actgnACTGN+]+)""".r - - val validBases: Boolean = readSeq match { - case allowedBases(m) => true - case _ => throw new IllegalStateException(s"Non IUPAC symbols identified '${(counter*4)-3}'") - } - - val validBases2: Boolean = readSeq2 match { - case allowedBases(m) => true - case _ => throw new IllegalStateException(s"Non IUPAC symbols identified '${(counter*4)-3}'") - } - - if (id1 == id2){ - - } else if (id1.stripSuffix("/1") == id2.stripSuffix("/2")) { + try { + //Iterate over the fastq file check for the length of both files if not correct, exit the tool and print error message + for (recordR1 <- readFq1.iterator()) { + counter += 1 + if (readFq2.map(_.hasNext) == Some(false)) + throw new IllegalStateException("R2 has less reads then R1") + + //Getting R2 record, None if it's single end + val recordR2 = readFq2.map(_.next()) + + validFastqRecord(recordR1) + + //Here we check if the readnames of both files are concordant, and if the sequence content are correct DNA/RNA sequences + recordR2 match { + case Some(recordR2) => // Paired End + validFastqRecord(recordR2) + checkMate(recordR1, recordR2) + case _ => // Single end + } + if (counter % 1e5 == 0) logger.info(counter + " reads processed") + } - } else if (id1.stripSuffix(".1") == id2.stripSuffix(".2")) { + //if R2 is longer then R1 print an error code and exit the tool + if (readFq2.map(_.hasNext) == Some(true)) + throw new IllegalStateException("R2 has more reads then R1") - } else - throw new IllegalStateException(s"sequenceHeaders does not match at line '${(counter*4)-3}'. R1: '$readHeader', R2: '$readHeader2'") - case _ => // Single end - } - counter += 1 + logger.info(s"Done processing ${counter} fastq records, no errors found") + } catch { + case e:IllegalStateException => + logger.error(s"Error found at readnumber: $counter, linenumber ${(counter*4)-3}") + logger.error(e.getMessage) } - //if R2 is longer then R1 print an error code and exit the tool - if (readFq2.map(_.hasNext) == Some(true)) - throw new IllegalStateException("R2 has more reads then R1") - - logger.info("Done processing the Fastq file(s) no errors found") - logger.info("total reads processed: " + counter) //close both iterators readFq1.close() readFq2.foreach(_.close()) } + val allowedBases = """([actgnACTGN+]+)""".r + + /** + * + * @param record + * @throws IllegalStateException + */ + def validFastqRecord(record: FastqRecord): Unit = { + record.getReadString match { + case allowedBases(m) => + case _ => throw new IllegalStateException(s"Non IUPAC symbols identified") + } + if (record.getReadString.size != record.getBaseQualityString.size) + throw new IllegalStateException(s"Sequence length do not match quality length") + } + + /** + * + * @param r1 + * @param r2 + * @throws IllegalStateException + */ + def checkMate(r1: FastqRecord, r2: FastqRecord): Unit = { + val id1 = r1.getReadHeader.takeWhile(_ != ' ') + val id2 = r2.getReadHeader.takeWhile(_ != ' ') + if (!(id1 == id2 || + id1.stripSuffix("/1") == id2.stripSuffix("/2") || + id1.stripSuffix(".1") == id2.stripSuffix(".2"))) + throw new IllegalStateException(s"sequenceHeaders does not match. R1: '${r1.getReadHeader}', R2: '${r2.getReadHeader}'") + } + + //TODO: check duplicate read ideas in both R1 and R2 } \ No newline at end of file