Added refflat support for base counting

32adeb5e · Peter van 't Hof · 9428f738 · 32adeb5e · 32adeb5e · 32adeb5e
Commit 32adeb5e authored 9 years ago by Peter van 't Hof
--- a/public/biopet-tools/pom.xml
+++ b/public/biopet-tools/pom.xml
@@ -70,5 +70,10 @@
            <artifactId>biojava3-sequencing</artifactId>
            <version>3.1.0</version>
        </dependency>
+        <dependency>
+            <groupId>com.github.broadinstitute</groupId>
+            <artifactId>picard</artifactId>
+            <version>1.141</version>
+        </dependency>
    </dependencies>
 </project>
\ No newline at end of file
--- a/public/biopet-tools/src/main/resources/log4j.properties
+++ b/public/biopet-tools/src/main/resources/log4j.properties
+#
+# Biopet is built on top of GATK Queue for building bioinformatic
+# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
+# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
+# should also be able to execute Biopet tools and pipelines.
+#
+# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
+#
+# Contact us at: sasc@lumc.nl
+#
+# A dual licensing mode is applied. The source code within this project that are
+# not part of GATK Queue is freely available for non-commercial use under an AGPL
+# license; For commercial users or users who do not want to follow the AGPL
+# license, please contact us to obtain a separate license.
+#
+
+# Set root logger level to DEBUG and its only appender to A1.
+log4j.rootLogger=INFO, A1
+
+# A1 is set to be a ConsoleAppender.
+log4j.appender.A1=org.apache.log4j.ConsoleAppender
+log4j.appender.A1.Target=System.err
+
+# A1 uses PatternLayout.
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout
+log4j.appender.A1.layout.ConversionPattern=%-5p [%d] [%C{1}] - %m%n
\ No newline at end of file
--- a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BaseCounter.scala
+++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BaseCounter.scala
@@ -4,49 +4,153 @@ import java.io.File

 import htsjdk.samtools.{SAMRecord, SamReaderFactory}
 import nl.lumc.sasc.biopet.utils.ToolCommand
-import nl.lumc.sasc.biopet.utils.intervals.{BedRecord, BedRecordList}
+import picard.annotation.{Gene, GeneAnnotationReader}

 import scala.collection.JavaConversions._
-import scala.collection.mutable

 /**
  * Created by pjvanthof on 22/01/16.
  */
 object BaseCounter extends ToolCommand {
-  case class Args(bedFile: File = null,
+
+  case class Args(refFlat: File = null,
                  outputDir: File = null,
                  bamFile: File = null) extends AbstractArgs

  class OptParser extends AbstractOptParser {
-    opt[File]("bedFile") required () valueName "<file>" action { (x, c) =>
-      c.copy(bedFile = x)
+    opt[File]("refFlat") required() valueName "<file>" action { (x, c) =>
+      c.copy(refFlat = x)
    }
-    opt[File]('o', "outputDir") required () valueName "<file>" action { (x, c) =>
+    opt[File]('o', "outputDir") required() valueName "<file>" action { (x, c) =>
      c.copy(outputDir = x)
    }
-    opt[File]('b', "bam") required () valueName "<file>" action { (x, c) =>
+    opt[File]('b', "bam") required() valueName "<file>" action { (x, c) =>
      c.copy(bamFile = x)
    }
  }

  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
-    val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse(throw new IllegalArgumentException(""))
+    val cmdArgs: Args = argsParser.parse(args, Args()) match {
+      case Some(x) => x
+      case _ => throw new IllegalArgumentException
+    }
+
+    //Sets picard logging level
+    htsjdk.samtools.util.Log.setGlobalLogLevel(htsjdk.samtools.util.Log.LogLevel.valueOf(logger.getLevel.toString))
+
+    logger.info("Start reading RefFlat file")
+    val bamReader = SamReaderFactory.makeDefault().open(cmdArgs.bamFile)
+    val geneReader = GeneAnnotationReader.loadRefFlat(cmdArgs.refFlat, bamReader.getFileHeader.getSequenceDictionary)
+    bamReader.close()
+    logger.info("Done reading RefFlat file")
+
+    logger.info("Start reading bamFile")
+    val counts = for (gene <- geneReader.getAll.par if gene.getName == "Rpl19") yield bamToGeneCount(cmdArgs.bamFile, gene)
+    logger.info("Done reading bamFile")
+
+    counts.foreach { geneCount =>
+      println(geneCount.gene.getName + "\t" + geneCount.counts.senseBases)
+      geneCount.transcripts.foreach { transcriptCount =>
+        println(transcriptCount.transcript.name + "\t" + transcriptCount.counts.senseBases)
+        transcriptCount.exons.zipWithIndex.foreach { exonCounts =>
+          println(transcriptCount.transcript.name + "_exon_" + exonCounts._2 + "\t" + exonCounts._1.counts.senseBases)
+        }
+      }
+      geneCount.exons.zipWithIndex.foreach { exonCounts =>
+        println(geneCount.gene.getName + "_exon_" + exonCounts._2 + "\t" + exonCounts._1.counts.senseBases)
+      }
+    }
+  }
+
+  def bamToGeneCount(bamFile: File, gene: Gene): GeneCount = {
+    val counts = new GeneCount(gene)
+    val bamReader = SamReaderFactory.makeDefault().open(bamFile)
+
+    for (record <- bamReader.queryOverlapping(gene.getContig, gene.getStart, gene.getEnd)) {
+      //TODO: check if sense this is correct with normal paired end
+      val sense = if (record.getReadPairedFlag && record.getSecondOfPairFlag)
+        record.getReadNegativeStrandFlag != gene.isPositiveStrand
+      else record.getReadNegativeStrandFlag == gene.isPositiveStrand
+      counts.addRecord(record, sense)
+    }
+
+    bamReader.close()
+    logger.debug(s"Done gene '${gene.getName}'")
+    counts
+  }
+
+  def bamRecordBasesOverlap(samRecord: SAMRecord, start: Int, end: Int): Int = {
+    samRecord.getAlignmentBlocks
+      .map { block =>
+        val blockStart = block.getReferenceStart
+        val blockEnd = blockStart + block.getLength - 1
+        if (blockStart <= end && blockStart + blockEnd >= start) {
+          (if (end < blockEnd) end else blockEnd) - (if (start > blockStart) start else blockStart) + 1
+        } else 0
+      }.sum
+  }
+
+  def bamRecordBasesOverlap(samRecord: SAMRecord, start: Int, end: Int, counts: Counts, sense: Boolean): Int = {
+    val overlap = bamRecordBasesOverlap(samRecord, start, end)
+    if (overlap > 0) {
+      if (sense) {
+        counts.senseBases += overlap
+        counts.senseReads += 1
+      } else {
+        counts.antiSenseBases += overlap
+        counts.antiSenseReads += 1
+      }
+    }
+    overlap
+  }

-    val bedrecords = BedRecordList.fromFile(cmdArgs.bedFile).combineOverlap
+  class Counts {
+    var senseBases = 0L
+    var antiSenseBases = 0L
+    def totalBases = senseBases + antiSenseBases
+    var senseReads = 0L
+    var antiSenseReads = 0L
+    def totalReads = senseReads + antiSenseReads
+  }
+
+  class GeneCount(val gene: Gene) {
+    val counts = new Counts
+    val transcripts = gene.iterator().map(new TranscriptCount(_)).toList
+    val exons = {
+      val tempList = gene.iterator().flatMap(_.exons).toList.sortBy(_.start)
+
+      /** This function will remove duplicates exons when having multiple transcripts */
+      def dedupList(input: List[Gene#Transcript#Exon], output: List[Gene#Transcript#Exon] = Nil): List[Gene#Transcript#Exon] = {
+        if (input.isEmpty) output
+        else {
+          val value = if (output.isEmpty || input.head.start != output.head.start || input.head.end != output.head.end) Some(input.head)
+          else None
+          dedupList(input.tail, value.toList ::: output)
+        }
+      }
+      dedupList(tempList).map(new ExonCount(_))
+    }
+    def addRecord(samRecord: SAMRecord, sense: Boolean): Unit = {
+      bamRecordBasesOverlap(samRecord, gene.getStart, gene.getEnd, counts, sense)
+      transcripts.foreach(_.addRecord(samRecord, sense))
+      exons.foreach(_.addRecord(samRecord, sense))
+    }
+  }

-    val counts = for (region <- bedrecords.allRecords) yield {
-      val bamReader = SamReaderFactory.makeDefault.open(cmdArgs.bamFile)
-      val interval = region.toSamInterval
-      val samIterator = bamReader.queryOverlapping(interval.getContig, interval.getStart, interval.getEnd)
-      for (samRecord <- samIterator) samRecordToCounts(samRecord, region.originals())
+  class TranscriptCount(val transcript: Gene#Transcript) {
+    val counts = new Counts
+    val exons = transcript.exons.map(new ExonCount(_))
+    def addRecord(samRecord: SAMRecord, sense: Boolean): Unit = {
+      bamRecordBasesOverlap(samRecord, transcript.start, transcript.end, counts, sense)
+      exons.foreach(_.addRecord(samRecord, sense))
    }
  }

-  def samRecordToCounts(samRecord: SAMRecord, bedRecords: List[BedRecord]): Unit = {
-    samRecord.getAlignmentBlocks.foreach { x =>
-      val bedStart = x.getReferenceStart - 1
-      val bedEnd = bedStart + x.getLength
+  class ExonCount(val exon: Gene#Transcript#Exon) {
+    val counts = new Counts
+    def addRecord(samRecord: SAMRecord, sense: Boolean): Unit = {
+      bamRecordBasesOverlap(samRecord, exon.start, exon.end, counts, sense)
    }
  }
-}
+}
\ No newline at end of file