Commit ed444315 authored by bow's avatar bow
Browse files

Initial jobs for base counting

parent 672a25a8
#!/usr/bin/env python
"""
Convert a histogram generated by coverageBed to counts per region.
A histogram file can be generated with the following command:
coverageBed -split -hist -abam sample.bam -b selection.bed > selection.hist
The output consists of four columns:
- Chromosome name.
- Start position.
- End position.
- Number of nucleotides mapped to this region.
- Normalised expression for this region.
If the -c option is used, additional columns can be added.
"""
import argparse
import sys
def hist2count(inputHandle, outputHandle, copy):
"""
Split a fasta file on length.
@arg inputHandle: Open readable handle to a histogram file.
@type inputHandle: stream
@arg outputHandle: Open writable handle to the counts file.
@type outputHandle: stream
@arg outputHandle: List of columns to copy to the output file.
@type outputHandle: list[int]
"""
def __copy():
copyList = ""
for i in copy:
copyList += "\t%s" % data[i]
return copyList
#__copy
def __write():
outputHandle.write("%s\t%i\t%i\t%i\t%f%s\n" % (chromosome, start,
end, count, float(count) / (end - start), copyList))
chromosome = ""
start = 0
end = 0
count = 0
for line in inputHandle.readlines():
data = line.split()
if not data[0] == "all":
start_temp = int(data[1])
end_temp = int(data[2])
if data[0] != chromosome or start_temp != start or end_temp != end:
if chromosome:
__write()
chromosome = data[0]
start = start_temp
end = end_temp
count = 0
copyList = __copy()
#if
count += int(data[-4]) * int(data[-3])
#if
#for
__write()
#hist2count
def main():
"""
Main entry point.
"""
usage = __doc__.split("\n\n\n")
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description=usage[0], epilog=usage[1])
parser.add_argument("-i", dest="input", type=argparse.FileType("r"),
default=sys.stdin, help="histogram input file (default=<stdin>)")
parser.add_argument("-o", dest="output", type=argparse.FileType("w"),
default=sys.stdout, help="file used as output (default=<stdout>)")
parser.add_argument("-c", dest="copy", type=int, nargs="+", default=[],
help="copy a column to the output file")
args = parser.parse_args()
hist2count(args.input, args.output, args.copy)
#main
if __name__ == '__main__':
main()
......@@ -24,6 +24,7 @@ import nl.lumc.sasc.biopet.core.summary._
import nl.lumc.sasc.biopet.extensions.{ Cufflinks, HtseqCount, Ln }
import nl.lumc.sasc.biopet.extensions.picard.{ CollectRnaSeqMetrics, MergeSamFiles, SortSam }
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
import nl.lumc.sasc.biopet.pipelines.gentrap.extensions.RawBaseCounter
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.tools.MergeTables
......@@ -270,6 +271,17 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
job
}
/** Raw base counting job */
private def rawBaseCountJob: Option[RawBaseCounter] =
(expMeasures.contains(BasesPerExon) || expMeasures.contains(BasesPerGene))
.option {
val job = new RawBaseCounter(qscript)
job.inputBoth = alnFile
job.annotationBed = annotationBed.get
job.output = createFile(".raw_base_count")
job
}
/** Case class for containing cufflinks + its output symlink jobs */
private case class CufflinksJobSet(cuffJob: Cufflinks, geneJob: Ln, isoformJob: Ln) {
/** Adds all contained jobs to Queue */
......@@ -376,6 +388,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
idSortingJob.foreach(add(_))
fragmentsPerGeneJob.foreach(add(_))
fragmentsPerExonJob.foreach(add(_))
rawBaseCountJob.foreach(add(_))
// symlink results with distinct extensions ~ actually to make it easier to use MergeTables on these as well
// since the Queue argument parser doesn't play nice with Map[_, _] types
cufflinksStrictJobSet.foreach { case jobSet => jobSet.addAllJobs() }
......
package nl.lumc.sasc.biopet.pipelines.gentrap.extensions
import java.io.File
import scala.language.reflectiveCalls
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.core.config.Configurable
/** Ad-hoc extension for counting bases that involves 3-command pipe */
// FIXME: generalize piping instead of building something by hand like this!
// Better to do everything quick and dirty here rather than something half-implemented with the objects
class RawBaseCounter(val root: Configurable) extends BiopetCommandLineFunction { wrapper =>
@Input(doc = "Reference BED file", required = true)
var annotationBed: File = null
@Input(doc = "Input BAM file from both strands", required = true)
var inputBoth: File = null
@Input(doc = "Input BAM file from the plus strand", required = false)
var inputPlus: File = null
@Input(doc = "Input BAM file from the minus strand", required = false)
var inputMinus: File = null
@Output(doc = "Output base count file", required = true)
var output: File = null
/** Internal flag for mixed strand mode */
private lazy val mixedStrand: Boolean = inputBoth != null && inputPlus == null && inputMinus == null
/** Internal flag for distinct strand / strand-specific mode */
private lazy val distinctStrand: Boolean = inputBoth == null && inputPlus != null && inputMinus != null
private def grepForStrand = new BiopetCommandLineFunction {
var strand: String = null
override val root: Configurable = wrapper.root
executable = "grep"
override def cmdLine: String = required(executable) +
required("-P", """\""" + strand + """$""") +
required(annotationBed)
}
private def bedtoolsCovHist = new BiopetCommandLineFunction {
var bam: File = null
override val root: Configurable = wrapper.root
executable = "coverageBed"
override def cmdLine: String = required(executable) +
required("-split") +
required("-hist") +
required("-abam", bam) +
required("-b", if (mixedStrand) annotationBed else "stdin")
}
private def hist2Count = new PythonCommandLineFunction {
setPythonScript("hist2count.py", "/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/")
override val root: Configurable = wrapper.root
def cmdLine = getPythonCommand + optional("-c", "3")
}
override def beforeGraph: Unit = {
require(annotationBed != null, "Annotation BED must be supplied")
require(output != null, "Output must be defined")
require((mixedStrand && !distinctStrand) || (!mixedStrand && distinctStrand),
"Invalid input BAM combinations for RawBaseCounter")
}
def cmdLine: String =
if (mixedStrand && !distinctStrand) {
val btCov = bedtoolsCovHist
btCov.bam = inputBoth
btCov.commandLine + "|" + hist2Count.commandLine + " > " + output
} else {
val plusGrep = grepForStrand
plusGrep.strand = "+"
val plusBtCov = bedtoolsCovHist
plusBtCov.bam = inputPlus
val minusGrep = grepForStrand
minusGrep.strand = "-"
val minusBtCov = bedtoolsCovHist
minusBtCov.bam = inputMinus
plusGrep.commandLine + "|" + plusBtCov.commandLine + "|" + hist2Count.commandLine + " > " + output + " && " +
minusGrep.commandLine + "|" + minusBtCov.commandLine + "|" + hist2Count.commandLine + " >> " + output
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment