From 734801f6c59a3e63a759e378e2dcf0ec9a19cdc0 Mon Sep 17 00:00:00 2001
From: sajvanderzeeuw <s.a.j.van_der_zeeuw@lumc.nl>
Date: Fri, 5 Feb 2016 10:48:34 +0100
Subject: [PATCH] add DNA base check (testing still needed)

---
 .../sasc/biopet/tools/CheckFastqPairs.scala   | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/CheckFastqPairs.scala b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/CheckFastqPairs.scala
index d3b45590f..e4c78dd32 100644
--- a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/CheckFastqPairs.scala
+++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/CheckFastqPairs.scala
@@ -55,15 +55,30 @@ object CheckFastqPairs extends ToolCommand {
       //Getting R2 record, None if it's single end
       val recordR2 = readFq2.map(_.next())
 
-      //Here we check if the readnames of both files are concordant
+      //Here we check that the read names of both files are concordant, and that each read sequence contains only the allowed base symbols (see allowedBases)
       recordR2 match {
         case Some(recordR2) => // Paired End
           val readHeader = recordR1.getReadHeader
           val readHeader2 = recordR2.getReadHeader
+          val readSeq = recordR1.getReadString
+          val readSeq2 = recordR2.getReadString
           val id1 = readHeader.takeWhile(_ != ' ')
           val id2 = readHeader2.takeWhile(_ != ' ')
 
-          if (counter % 1e5 == 0) logger.info(counter + " reads processed")
+          if (counter % 1e4 == 0) logger.info(counter + " reads processed")
+
+
+          val allowedBases = """([actgnACTGN+]+)""".r
+
+          val validBases: Boolean = readSeq match {
+            case allowedBases(m) => true
+            case _ => throw new IllegalStateException(s"Non IUPAC symbols identified '${(counter*4)-3}'")
+          }
+
+          val validBases2: Boolean = readSeq2 match {
+            case allowedBases(m) => true
+            case _ => throw new IllegalStateException(s"Non IUPAC symbols identified '${(counter*4)-3}'")
+          }
 
           if (id1 == id2){
 
-- 
GitLab