Commit 32adeb5e authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Added refflat support for base counting

parent 9428f738
......@@ -70,5 +70,10 @@
<artifactId>biojava3-sequencing</artifactId>
<version>3.1.0</version>
</dependency>
<dependency>
<groupId>com.github.broadinstitute</groupId>
<artifactId>picard</artifactId>
<version>1.141</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
# Set root logger level to DEBUG and its only appender to A1.
log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.Target=System.err
# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-5p [%d] [%C{1}] - %m%n
\ No newline at end of file
......@@ -4,49 +4,153 @@ import java.io.File
import htsjdk.samtools.{SAMRecord, SamReaderFactory}
import nl.lumc.sasc.biopet.utils.ToolCommand
import nl.lumc.sasc.biopet.utils.intervals.{BedRecord, BedRecordList}
import picard.annotation.{Gene, GeneAnnotationReader}
import scala.collection.JavaConversions._
import scala.collection.mutable
/**
* Created by pjvanthof on 22/01/16.
*/
object BaseCounter extends ToolCommand {
case class Args(bedFile: File = null,
case class Args(refFlat: File = null,
outputDir: File = null,
bamFile: File = null) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]("bedFile") required () valueName "<file>" action { (x, c) =>
c.copy(bedFile = x)
opt[File]("refFlat") required() valueName "<file>" action { (x, c) =>
c.copy(refFlat = x)
}
opt[File]('o', "outputDir") required () valueName "<file>" action { (x, c) =>
opt[File]('o', "outputDir") required() valueName "<file>" action { (x, c) =>
c.copy(outputDir = x)
}
opt[File]('b', "bam") required () valueName "<file>" action { (x, c) =>
opt[File]('b', "bam") required() valueName "<file>" action { (x, c) =>
c.copy(bamFile = x)
}
}
def main(args: Array[String]): Unit = {
val argsParser = new OptParser
val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse(throw new IllegalArgumentException(""))
val cmdArgs: Args = argsParser.parse(args, Args()) match {
case Some(x) => x
case _ => throw new IllegalArgumentException
}
//Sets picard logging level
htsjdk.samtools.util.Log.setGlobalLogLevel(htsjdk.samtools.util.Log.LogLevel.valueOf(logger.getLevel.toString))
logger.info("Start reading RefFlat file")
val bamReader = SamReaderFactory.makeDefault().open(cmdArgs.bamFile)
val geneReader = GeneAnnotationReader.loadRefFlat(cmdArgs.refFlat, bamReader.getFileHeader.getSequenceDictionary)
bamReader.close()
logger.info("Done reading RefFlat file")
logger.info("Start reading bamFile")
val counts = for (gene <- geneReader.getAll.par if gene.getName == "Rpl19") yield bamToGeneCount(cmdArgs.bamFile, gene)
logger.info("Done reading bamFile")
counts.foreach { geneCount =>
println(geneCount.gene.getName + "\t" + geneCount.counts.senseBases)
geneCount.transcripts.foreach { transcriptCount =>
println(transcriptCount.transcript.name + "\t" + transcriptCount.counts.senseBases)
transcriptCount.exons.zipWithIndex.foreach { exonCounts =>
println(transcriptCount.transcript.name + "_exon_" + exonCounts._2 + "\t" + exonCounts._1.counts.senseBases)
}
}
geneCount.exons.zipWithIndex.foreach { exonCounts =>
println(geneCount.gene.getName + "_exon_" + exonCounts._2 + "\t" + exonCounts._1.counts.senseBases)
}
}
}
def bamToGeneCount(bamFile: File, gene: Gene): GeneCount = {
val counts = new GeneCount(gene)
val bamReader = SamReaderFactory.makeDefault().open(bamFile)
for (record <- bamReader.queryOverlapping(gene.getContig, gene.getStart, gene.getEnd)) {
//TODO: check if sense this is correct with normal paired end
val sense = if (record.getReadPairedFlag && record.getSecondOfPairFlag)
record.getReadNegativeStrandFlag != gene.isPositiveStrand
else record.getReadNegativeStrandFlag == gene.isPositiveStrand
counts.addRecord(record, sense)
}
bamReader.close()
logger.debug(s"Done gene '${gene.getName}'")
counts
}
def bamRecordBasesOverlap(samRecord: SAMRecord, start: Int, end: Int): Int = {
samRecord.getAlignmentBlocks
.map { block =>
val blockStart = block.getReferenceStart
val blockEnd = blockStart + block.getLength - 1
if (blockStart <= end && blockStart + blockEnd >= start) {
(if (end < blockEnd) end else blockEnd) - (if (start > blockStart) start else blockStart) + 1
} else 0
}.sum
}
def bamRecordBasesOverlap(samRecord: SAMRecord, start: Int, end: Int, counts: Counts, sense: Boolean): Int = {
val overlap = bamRecordBasesOverlap(samRecord, start, end)
if (overlap > 0) {
if (sense) {
counts.senseBases += overlap
counts.senseReads += 1
} else {
counts.antiSenseBases += overlap
counts.antiSenseReads += 1
}
}
overlap
}
val bedrecords = BedRecordList.fromFile(cmdArgs.bedFile).combineOverlap
class Counts {
var senseBases = 0L
var antiSenseBases = 0L
def totalBases = senseBases + antiSenseBases
var senseReads = 0L
var antiSenseReads = 0L
def totalReads = senseReads + antiSenseReads
}
class GeneCount(val gene: Gene) {
val counts = new Counts
val transcripts = gene.iterator().map(new TranscriptCount(_)).toList
val exons = {
val tempList = gene.iterator().flatMap(_.exons).toList.sortBy(_.start)
/** This function will remove duplicates exons when having multiple transcripts */
def dedupList(input: List[Gene#Transcript#Exon], output: List[Gene#Transcript#Exon] = Nil): List[Gene#Transcript#Exon] = {
if (input.isEmpty) output
else {
val value = if (output.isEmpty || input.head.start != output.head.start || input.head.end != output.head.end) Some(input.head)
else None
dedupList(input.tail, value.toList ::: output)
}
}
dedupList(tempList).map(new ExonCount(_))
}
def addRecord(samRecord: SAMRecord, sense: Boolean): Unit = {
bamRecordBasesOverlap(samRecord, gene.getStart, gene.getEnd, counts, sense)
transcripts.foreach(_.addRecord(samRecord, sense))
exons.foreach(_.addRecord(samRecord, sense))
}
}
val counts = for (region <- bedrecords.allRecords) yield {
val bamReader = SamReaderFactory.makeDefault.open(cmdArgs.bamFile)
val interval = region.toSamInterval
val samIterator = bamReader.queryOverlapping(interval.getContig, interval.getStart, interval.getEnd)
for (samRecord <- samIterator) samRecordToCounts(samRecord, region.originals())
class TranscriptCount(val transcript: Gene#Transcript) {
val counts = new Counts
val exons = transcript.exons.map(new ExonCount(_))
def addRecord(samRecord: SAMRecord, sense: Boolean): Unit = {
bamRecordBasesOverlap(samRecord, transcript.start, transcript.end, counts, sense)
exons.foreach(_.addRecord(samRecord, sense))
}
}
def samRecordToCounts(samRecord: SAMRecord, bedRecords: List[BedRecord]): Unit = {
samRecord.getAlignmentBlocks.foreach { x =>
val bedStart = x.getReferenceStart - 1
val bedEnd = bedStart + x.getLength
class ExonCount(val exon: Gene#Transcript#Exon) {
val counts = new Counts
def addRecord(samRecord: SAMRecord, sense: Boolean): Unit = {
bamRecordBasesOverlap(samRecord, exon.start, exon.end, counts, sense)
}
}
}
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment