Commit 7d8f335d authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Added tagcounting to Sage pipeline

parent ce8bb5c9
......@@ -10,7 +10,7 @@
<goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
</goals>
<properties>
<exec.args>-classpath %classpath nl.lumc.sasc.biopet.core.apps.BedToInterval /data/DIV5/SASC/project-062-snake/analysis/deps/custom_region.bed /data/DIV5/SASC/project-062-snake/analysis/runs/biopet/110/run_1/110-1.dedup.bam /home/pjvan_thof/pipelines/test/test.interval</exec.args>
<exec.args>-classpath %classpath nl.lumc.sasc.biopet.pipelines.sage.CreateTagCounts -I raw.count -taglib tag.lib -sense sense.counts -allsense all.sense.counts -antisense antisense.counts -allantisense all.antisense.counts</exec.args>
<exec.executable>java</exec.executable>
<exec.workingdir>/home/pjvan_thof/pipelines/test</exec.workingdir>
</properties>
......@@ -25,7 +25,7 @@
<goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
</goals>
<properties>
<exec.args>-Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -classpath %classpath nl.lumc.sasc.biopet.core.apps.BedToInterval /data/DIV5/SASC/project-062-snake/analysis/deps/custom_region.bed /data/DIV5/SASC/project-062-snake/analysis/runs/biopet/110/run_1/110-1.dedup.bam /home/pjvan_thof/pipelines/test/test.interval</exec.args>
<exec.args>-Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -classpath %classpath nl.lumc.sasc.biopet.pipelines.sage.CreateTagCounts -I raw.count -taglib tag.lib -sense sense.counts -allsense all.sense.counts -antisense antisense.counts -allantisense all.antisense.counts</exec.args>
<exec.executable>java</exec.executable>
<jpda.listen>true</jpda.listen>
<exec.workingdir>/home/pjvan_thof/pipelines/test</exec.workingdir>
......@@ -41,7 +41,7 @@
<goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
</goals>
<properties>
<exec.args>-classpath %classpath nl.lumc.sasc.biopet.core.apps.BedToInterval /data/DIV5/SASC/project-062-snake/analysis/deps/custom_region.bed /data/DIV5/SASC/project-062-snake/analysis/runs/biopet/110/run_1/110-1.dedup.bam /home/pjvan_thof/pipelines/test/test.interval</exec.args>
<exec.args>-classpath %classpath nl.lumc.sasc.biopet.pipelines.sage.CreateTagCounts -I raw.count -taglib tag.lib -sense sense.counts -allsense all.sense.counts -antisense antisense.counts -allantisense all.antisense.counts</exec.args>
<exec.executable>java</exec.executable>
<exec.workingdir>/home/pjvan_thof/pipelines/test</exec.workingdir>
</properties>
......
......@@ -17,7 +17,13 @@
<sting.binary-dist.name>SASC-Pipelines</sting.binary-dist.name>
<app.main.class>nl.lumc.sasc.biopet.core.BiopetExecutable</app.main.class>
</properties>
<repositories>
<repository>
<id>biojava-maven-repo</id>
<name>BioJava repository</name>
<url>http://www.biojava.org/download/maven/</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>org.testng</groupId>
......@@ -40,6 +46,16 @@
<artifactId>argonaut_2.11</artifactId>
<version>6.1-M4</version>
</dependency>
<dependency>
<groupId>org.biojava</groupId>
<artifactId>biojava3-core</artifactId>
<version>3.1.0</version>
</dependency>
<dependency>
<groupId>org.biojava</groupId>
<artifactId>biojava3-sequencing</artifactId>
<version>3.1.0</version>
</dependency>
</dependencies>
<build>
<resources>
......
package nl.lumc.sasc.biopet.pipelines.sage
import java.io.File
import java.io.PrintWriter
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import org.biojava3.sequencing.io.fastq.{SangerFastqReader, StreamListener, Fastq}
import scala.collection.JavaConversions._
import scala.collection.SortedMap
import scala.collection.mutable.Map
import java.io.FileReader
class CountFastq(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Input(doc = "Input fasta", shortName = "input", required = true)
var input: File = _
@Output(doc = "Output tag library", shortName = "output", required = true)
var output: File = _
override val defaultVmem = "8G"
memoryLimit = Option(4.0)
override def commandLine = super.commandLine +
required("-I", input) +
required("-o", output)
}
object CountFastq {
var input: File = _
var output: File = _
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
for (t <- 0 until args.size) {
args(t) match {
case "-I" => input = new File(args(t+1))
case "-o" => output = new File(args(t+1))
case _ =>
}
}
if (input == null || !input.exists) throw new IllegalStateException("Input file not found, use -I")
if (output == null) throw new IllegalStateException("Output file not found, use -o")
val counts:Map[String, Long] = Map()
val reader = new SangerFastqReader
var count = 0
System.err.println("Reading fastq file: " + input)
val fileReader = new FileReader(input)
reader.stream(fileReader, new StreamListener {
def fastq(fastq:Fastq) {
val seq = fastq.getSequence
if (counts.contains(seq)) counts(seq) += 1
else counts += (seq -> 1)
count += 1
if (count % 1000000 == 0) System.err.println(count + " sequences done")
}
})
System.err.println(count + " sequences done")
System.err.println("Sorting")
val sortedCounts:SortedMap[String, Long] = SortedMap(counts.toArray:_*)
System.err.println("Writting outputfile: " + output)
val writer = new PrintWriter(output)
for ((seq,count) <- sortedCounts) {
writer.println(seq + "\t" + count)
}
writer.close
}
}
\ No newline at end of file
package nl.lumc.sasc.biopet.pipelines.sage
import java.io.File
import java.io.PrintWriter
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.config.Configurable
import org.biojava3.core.sequence.DNASequence
import org.biojava3.core.sequence.io.FastaReaderHelper
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.collection.SortedMap
import scala.collection.mutable.{Map, Set}
import scala.collection.JavaConversions._
class CreateDeepsageLibrary(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Input(doc = "Input fasta", shortName = "input", required = true)
var input: File = _
@Output(doc = "Output tag library", shortName = "output", required = true)
var output: File = _
@Output(doc = "Output no tags", shortName = "noTagsOutput", required = false)
var noTagsOutput: File = _
@Output(doc = "Output no anti tags library", shortName = "noAntiTagsOutput", required = false)
var noAntiTagsOutput: File = _
@Output(doc = "Output file all genes", shortName = "allGenes", required = false)
var allGenesOutput: File = _
var tag: String = config("tag", default = "CATG")
var length: Option[Int] = config("length", default = 17)
override val defaultVmem = "8G"
memoryLimit = Option(4.0)
override def commandLine = super.commandLine +
required("-I", input) +
optional("-tag", tag) +
optional("-length", length) +
optional("-notag", noTagsOutput) +
optional("-noantitag", noAntiTagsOutput) +
required("-o", output)
}
object CreateDeepsageLibrary {
var tag = "CATG"
var length = 17
var input: File = _
var noTagsOutput: File = _
var noAntiTagsOutput: File = _
var allGenesOutput: File = _
var output: File = _
lazy val tagRegex = (tag + "[CATG]{" + length + "}").r
var geneRegex = """ENSG[0-9]{11}""".r
val tagGenesMap: Map[String, TagGenes] = Map()
val allGenes: Set[String] = Set()
val tagGenes: Set[String] = Set()
val antiTagGenes: Set[String] = Set()
class TagGenes {
val firstTag: Set[String] = Set()
val allTags:Set[String] = Set()
val firstAntiTag: Set[String] = Set()
val allAntiTags:Set[String] = Set()
}
class TagResult( val firstTag: String, val allTags:List[String], val firstAntiTag: String, val allAntiTags:List[String] )
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
for (t <- 0 until args.size) {
args(t) match {
case "-I" => input = new File(args(t+1))
case "-tag" => tag = args(t+1)
case "-length" => length = args(t+1).toInt
case "-o" => output = new File(args(t+1))
case "-notag" => noTagsOutput = new File(args(t+1))
case "-noantitag" => noAntiTagsOutput = new File(args(t+1))
case "-allgenes" => allGenesOutput = new File(args(t+1))
case _ =>
}
}
if (input == null || !input.exists) throw new IllegalStateException("Input file not found, use -I")
if (output == null) throw new IllegalStateException("Output file not found, use -o")
var count = 0
System.err.println("Reading fasta file")
val reader = FastaReaderHelper.readFastaDNASequence(input)
System.err.println("Finding tags")
for ((name, seq) <- reader) {
getTags(name, seq)
count += 1
if (count % 10000 == 0) System.err.println(count + " transcripts done")
}
System.err.println(count + " transcripts done")
System.err.println("Start sorting tags")
val tagGenesMapSorted:SortedMap[String, TagGenes] = SortedMap(tagGenesMap.toArray:_*)
System.err.println("Writting output files")
val writer = new PrintWriter(output)
writer.println("#tag\tfirstTag\tAllTags\tFirstAntiTag\tAllAntiTags")
for ((tag, genes) <- tagGenesMapSorted) {
val line = tag + "\t" + genes.firstTag.mkString(",") +
"\t" + genes.allTags.mkString(",") +
"\t" + genes.firstAntiTag.mkString(",") +
"\t" + genes.allAntiTags.mkString(",")
writer.println(line)
}
writer.close()
if (noTagsOutput != null) {
val writer = new PrintWriter(noTagsOutput)
for (gene <- allGenes if !tagGenes.contains(gene)) {
writer.println(gene)
}
writer.close
}
if (noAntiTagsOutput != null) {
val writer = new PrintWriter(noAntiTagsOutput)
for (gene <- allGenes if !antiTagGenes.contains(gene)) {
writer.println(gene)
}
writer.close
}
if (allGenesOutput != null) {
val writer = new PrintWriter(allGenesOutput)
for (gene <- allGenes) {
writer.println(gene)
}
writer.close
}
}
def addTagresultToTaglib(name:String, tagResult:TagResult) {
val id = name.split(" ").head //.stripPrefix("hg19_ensGene_")
val geneID = geneRegex.findFirstIn(name).getOrElse("unknown_gene")
allGenes.add(geneID)
if (tagResult.firstTag != null) {
if (!tagGenesMap.contains(tagResult.firstTag)) tagGenesMap += (tagResult.firstTag -> new TagGenes)
tagGenesMap(tagResult.firstTag).firstTag.add(geneID)
tagGenes.add(geneID)
}
for (tag <- tagResult.allTags) {
if (!tagGenesMap.contains(tag)) tagGenesMap += (tag -> new TagGenes)
tagGenesMap(tag).allTags.add(geneID)
}
if (tagResult.firstAntiTag != null) {
if (!tagGenesMap.contains(tagResult.firstAntiTag)) tagGenesMap += (tagResult.firstAntiTag -> new TagGenes)
tagGenesMap(tagResult.firstAntiTag).firstAntiTag.add(geneID)
antiTagGenes.add(geneID)
}
for (tag <- tagResult.allAntiTags) {
if (!tagGenesMap.contains(tag)) tagGenesMap += (tag -> new TagGenes)
tagGenesMap(tag).allAntiTags.add(geneID)
}
}
def getTags(name:String, seq: DNASequence): TagResult = {
val allTags: List[String] = for (tag <- tagRegex.findAllMatchIn(seq.getSequenceAsString).toList) yield tag.toString
val firstTag = if (allTags.isEmpty) null else allTags.last
val allAntiTags: List[String] = for (tag <- tagRegex.findAllMatchIn(seq.getReverseComplement.getSequenceAsString).toList) yield tag.toString
val firstAntiTag = if (allAntiTags.isEmpty) null else allAntiTags.head
val result = new TagResult(firstTag, allTags, firstAntiTag, allAntiTags)
addTagresultToTaglib(name, result)
return result
}
}
package nl.lumc.sasc.biopet.pipelines.sage
import java.io.File
import java.io.PrintWriter
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.io.Source
import scala.collection.mutable.Map
import scala.collection.SortedMap
class CreateTagCounts(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Input(doc = "Raw count file", shortName = "input", required = true)
var input: File = _
@Input(doc = "tag library", shortName = "taglib", required = true)
var tagLib: File = _
@Output(doc = "Sense count file", shortName = "sense", required = true)
var countSense: File = _
@Output(doc = "Sense all coun filet", shortName = "allsense", required = true)
var countAllSense: File = _
@Output(doc = "AntiSense count file", shortName = "antisense", required = true)
var countAntiSense: File = _
@Output(doc = "AntiSense all count file", shortName = "allantisense", required = true)
var countAllAntiSense: File = _
override val defaultVmem = "8G"
memoryLimit = Option(4.0)
override def commandLine = super.commandLine +
required("-I", input) +
required("-taglib", tagLib) +
optional("-sense", countSense) +
optional("-allsense", countAllSense) +
optional("-antisense", countAntiSense) +
optional("-allantisense", countAllAntiSense)
}
object CreateTagCounts {
var input: File = _
var tagLib: File = _
var countSense: File = _
var countAllSense: File = _
var countAntiSense: File = _
var countAllAntiSense: File = _
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
for (t <- 0 until args.size) {
args(t) match {
case "-I" => input = new File(args(t+1))
case "-taglib" => tagLib = new File(args(t+1))
case "-sense" => countSense = new File(args(t+1))
case "-allsense" => countAllSense = new File(args(t+1))
case "-antisense" => countAntiSense = new File(args(t+1))
case "-allantisense" => countAllAntiSense = new File(args(t+1))
case _ =>
}
}
if (input == null || !input.exists) throw new IllegalStateException("Input file not found, use -I")
if (tagLib == null) throw new IllegalStateException("Output file not found, use -o")
val rawCounts: Map[String, Long] = Map()
for (line <- Source.fromFile(input).getLines) {
val values = line.split("\t")
val gene = values(0)
val count = values(1).toLong
if (rawCounts.contains(gene)) rawCounts(gene) += count
else rawCounts += gene -> count
}
val senseCounts: Map[String, Long] = Map()
val allSenseCounts: Map[String, Long] = Map()
val antiSenseCounts: Map[String, Long] = Map()
val allAntiSenseCounts: Map[String, Long] = Map()
for (line <- Source.fromFile(tagLib).getLines if !line.startsWith("#")) {
val values = line.split("\t")
val tag = values(0)
val sense = values(1)
val allSense = values(2)
val antiSense = if (values.size > 3) values(3) else ""
val allAntiSense = if (values.size > 4) values(4) else ""
if (!sense.isEmpty && !sense.contains(",")) {
val count = if (rawCounts.contains(tag)) rawCounts(tag) else 0
if (senseCounts.contains(sense)) senseCounts(sense) += count
else senseCounts += sense -> count
}
if (!allSense.isEmpty && !allSense.contains(",")) {
val count = if (rawCounts.contains(tag)) rawCounts(tag) else 0
if (allSenseCounts.contains(allSense)) allSenseCounts(allSense) += count
else allSenseCounts += allSense -> count
}
if (!antiSense.isEmpty && !antiSense.contains(",")) {
val count = if (rawCounts.contains(tag)) rawCounts(tag) else 0
if (antiSenseCounts.contains(antiSense)) antiSenseCounts(antiSense) += count
else antiSenseCounts += antiSense -> count
}
if (!allAntiSense.isEmpty && !allAntiSense.contains(",")) {
val count = if (rawCounts.contains(tag)) rawCounts(tag) else 0
if (allAntiSenseCounts.contains(allAntiSense)) allAntiSenseCounts(allAntiSense) += count
else allAntiSenseCounts += allAntiSense -> count
}
}
def writeFile(file:File, counts:Map[String, Long]) {
val sorted: SortedMap[String,Long] = SortedMap(counts.toArray:_*)
if (file != null) {
val writer = new PrintWriter(file)
for ((gene,count) <- sorted) {
if (count > 0) writer.println(gene + "\t" + count)
}
writer.close
}
}
writeFile(countSense, senseCounts)
writeFile(countAllSense, allSenseCounts)
writeFile(countAntiSense, antiSenseCounts)
writeFile(countAllAntiSense, allAntiSenseCounts)
}
}
\ No newline at end of file
......@@ -21,9 +21,22 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript {
@Input(doc = "squishedCountBed, by suppling this file the auto squish job will be skipped", required = false)
var squishedCountBed : File = _
@Input(doc = "Transcriptome, used for generation of tag library", required = false)
var transcriptome : File = _
var tagsLibrary : File = _
def init() {
for (file <- configfiles) globalConfig.loadConfigFile(file)
if (countBed == null && squishedCountBed == null) throw new IllegalStateException("No bedfile supplied, please add a countBed or squishedCountBed")
if (!outputDir.endsWith("/")) outputDir += "/"
if (countBed == null) countBed = config("count_bed")
if (squishedCountBed == null) squishedCountBed = config("squished_count_bed")
if (tagsLibrary == null) tagsLibrary = config("tags_library")
if (transcriptome == null) transcriptome = config("transcriptome")
if (transcriptome == null && tagsLibrary == null)
throw new IllegalStateException("No transcriptome or taglib found")
if (countBed == null && squishedCountBed == null)
throw new IllegalStateException("No bedfile supplied, please add a countBed or squishedCountBed")
}
def biopetScript() {
......@@ -33,7 +46,16 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript {
squishedCountBed = squishBed.output
}
// Tag library creation
if (tagsLibrary == null) {
val cdl = new CreateDeepsageLibrary(this)
cdl.input = transcriptome
cdl.output = outputDir + "tablib/tab.lib"
cdl.noAntiTagsOutput = outputDir + "tablib/no_antisense_genes.txt"
cdl.noTagsOutput = outputDir + "tablib/no_sense_genes.txt"
cdl.allGenesOutput = outputDir + "tablib/all_genes.txt"
add(cdl)
tagsLibrary = cdl.output
}
runSamplesJobs
}
......@@ -56,14 +78,15 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript {
add(mergeSamFiles)
mergeSamFiles.output
} else null
val fastqFile: File = if (libraryBamfiles.size == 1) libraryBamfiles.head
else if (libraryBamfiles.size > 1) {
val cat = Cat.apply(this, libraryBamfiles, sampleDir + sampleID + ".fastq")
val fastqFile: File = if (libraryFastqFiles.size == 1) libraryFastqFiles.head
else if (libraryFastqFiles.size > 1) {
val cat = Cat.apply(this, libraryFastqFiles, sampleDir + sampleID + ".fastq")
add(cat)
cat.output
} else null
this.addBedtoolsCounts(bamFile, sampleID, sampleDir)
addBedtoolsCounts(bamFile, sampleID, sampleDir)
addTablibCounts(fastqFile, sampleID, sampleDir)
return outputFiles
}
......@@ -109,9 +132,8 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript {
addAll(mapping.functions)
if (config("library_counts", default = false).getBoolean) {
this.addBedtoolsCounts(mapping.outputFiles("finalBamFile"), sampleID + "-" + runID, runDir)
// TODO: Tag counts
addBedtoolsCounts(mapping.outputFiles("finalBamFile"), sampleID + "-" + runID, runDir)
addTablibCounts(prefixFastq.output, sampleID + "-" + runID, runDir)
}
outputFiles += ("FinalBam" -> mapping.outputFiles("finalBamFile"))
......@@ -120,13 +142,29 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript {
}
def addBedtoolsCounts(bamFile:File, outputPrefix: String, outputDir: String) {
add(BedtoolsCoverage(this, bamFile, squishedCountBed, outputDir + outputPrefix + ".sense.count",
add(BedtoolsCoverage(this, bamFile, squishedCountBed, outputDir + outputPrefix + ".genome.sense.counts",
depth = false, sameStrand = true, diffStrand = false))
add(BedtoolsCoverage(this, bamFile, squishedCountBed, outputDir + outputPrefix + ".antisense.count",
add(BedtoolsCoverage(this, bamFile, squishedCountBed, outputDir + outputPrefix + ".genome.antisense.counts",
depth = false, sameStrand = false, diffStrand = true))
add(BedtoolsCoverage(this, bamFile, squishedCountBed, outputDir + outputPrefix + ".count",
add(BedtoolsCoverage(this, bamFile, squishedCountBed, outputDir + outputPrefix + ".genome.counts",
depth = false, sameStrand = false, diffStrand = false))
}
def addTablibCounts(fastq:File, outputPrefix: String, outputDir: String) {
val countFastq = new CountFastq(this)
countFastq.input = fastq
countFastq.output = outputDir + outputPrefix + ".raw.counts"
add(countFastq)
val createTagCounts = new CreateTagCounts(this)
createTagCounts.input = countFastq.output
createTagCounts.tagLib = tagsLibrary
createTagCounts.countSense = outputDir + outputPrefix + ".tagcount.sense.counts"
createTagCounts.countAllSense = outputDir + outputPrefix + ".tagcount.all.sense.counts"
createTagCounts.countAntiSense = outputDir + outputPrefix + ".tagcount.antisense.counts"
createTagCounts.countAllAntiSense = outputDir + outputPrefix + ".tagcount.all.antisense.counts"
add(createTagCounts)
}
}
object Sage extends PipelineCommand {
......
Markdown is supported
0% or .