Commit b7690fca authored by Peter van 't Hof

Added chunking for mapping

parent 6090fa33
package nl.lumc.sasc.biopet.core.apps
import java.io.BufferedInputStream
import java.io.File
import java.io.FileInputStream
import java.io.PrintWriter
import java.util.zip.GZIPInputStream
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import scala.io.Source
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.sting.commandline._
class FastqSplitter(val root:Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = "nl.lumc.sasc.biopet.core.apps.FastqSplitter"
@Input(doc="Input fastq", shortName = "input", required = true)
var input: File = _
@Output(doc="Output fastq files", shortName="output", required = true)
var output: List[File] = Nil
override def commandLine = super.commandLine + required(input) + repeat(output)
}
object FastqSplitter {
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
val groupsize = 100
val input = new File(args.head)
val output:Array[PrintWriter] = new Array[PrintWriter](args.tail.size)
for (t <- 1 to args.tail.size) output(t-1) = new PrintWriter(args(t))
val inputStream = {
if (input.getName.endsWith(".gz") || input.getName.endsWith(".gzip")) Source.fromInputStream(
new GZIPInputStream(
new BufferedInputStream(
new FileInputStream(input)))).bufferedReader
else Source.fromFile(input).bufferedReader
}
while (inputStream.ready) {
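// Round-robin over the output writers: each writer receives a block of `groupsize`
// FASTQ records (4 lines per record) before the loop moves on to the next file.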
for (writer <- output) {
for (_ <- 1 to groupsize) {
for (_ <- 1 to 4) {
if (inputStream.ready) {
writer.write(inputStream.readLine + "\n")
}
}
}
writer.flush()
}
}
for (writer <- output) writer.close()
}
}
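A minimal usage sketch of the splitter above (file names are hypothetical): the first argument is the input FASTQ, plain or gzipped, and every further argument becomes one output chunk, filled round-robin in blocks of 100 records.
object FastqSplitterExample {
  def main(args: Array[String]): Unit = {
    // input first, then one path per chunk; chunks are written as plain (uncompressed) FASTQ
    FastqSplitter.main(Array("reads_R1.fastq.gz", "chunk_1/reads_R1.fastq", "chunk_2/reads_R1.fastq"))
  }
}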
package nl.lumc.sasc.biopet.function
import nl.lumc.sasc.biopet.core._
import nl.lumc.sasc.biopet.core.config._
import org.broadinstitute.sting.commandline._
import java.io.File
class Cat(val root:Configurable) extends BiopetCommandLineFunction {
@Input(doc="Input file", required=true)
var input: List[File] = Nil
@Output(doc="Unzipped file", required=true)
var output: File = _
executeble = config("exe", "cat")
def cmdLine = required(executeble) + repeat(input) + " > " + required(output)
}
object Cat {
def apply(root:Configurable, input:List[File], output:File): Cat = {
val cat = new Cat(root)
cat.input = input
cat.output = output
return cat
}
}
\ No newline at end of file
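A hedged sketch of how this wrapper is used from a QScript; chunkFastqs and mergedFastq are illustrative names, and the pattern mirrors the runFinalize change in Flexiprep below, where the per-chunk FASTQ files are concatenated back into a single file.
// inside a BiopetQScript, with chunkFastqs: List[File] collected per chunk
val mergedFastq = new File(outputDir + R1_name + ".qc" + R1_ext)
add(Cat(this, chunkFastqs, mergedFastq), true) // runs: cat chunk_1 chunk_2 ... > mergedFastq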
@@ -58,8 +58,15 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
}
def biopetScript() {
runInitialFastqc()
runInitialJobs()
if (paired) runTrimClip(outputFiles("fastq_input_R1"), outputFiles("fastq_input_R2"), outputDir)
else runTrimClip(outputFiles("fastq_input_R1"), outputDir)
runFinalize(List(outputFiles("output_R1")), if (outputFiles.contains("output_R2")) List(outputFiles("output_R2")) else List())
}
def runInitialJobs() {
outputFiles += ("fastq_input_R1" -> extractIfNeeded(input_R1,outputDir))
if (paired) outputFiles += ("fastq_input_R2" -> extractIfNeeded(input_R2,outputDir))
@@ -69,13 +76,6 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
addSha1sum(outputFiles("fastq_input_R1"), "sha1_R1")
if (paired) addSha1sum(outputFiles("fastq_input_R2"), "sha1_R2")
if (paired) runTrimClip(outputFiles("fastq_input_R1"), outputFiles("fastq_input_R2"), outputDir)
else runTrimClip(outputFiles("fastq_input_R1"), outputDir)
runFinalize(List(outputFiles("output_R1")), if (outputFiles.contains("output_R2")) List(outputFiles("output_R2")) else List())
}
def runInitialFastqc() {
var fastqc_R1 = runFastqc(input_R1,outputDir + "/" + R1_name + ".fastqc/")
outputFiles += ("fastqc_R1" -> fastqc_R1.output)
outputFiles += ("qualtype_R1" -> getQualtype(fastqc_R1, R1_name))
@@ -115,12 +115,13 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
def runTrimClip(R1_in:File, R2_in:File, outDir:String) {
runTrimClip(R1_in, R2_in, outDir, "")
}
def runTrimClip(R1_in:File, R2_in:File, outDir:String, chunk:String) {
def runTrimClip(R1_in:File, R2_in:File, outDir:String, chunkarg:String) : (File,File) = {
val chunk = if (chunkarg.isEmpty || chunkarg.endsWith("_")) chunkarg else chunkarg + "_"
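// Normalize the chunk prefix: empty (no chunking) or already ending in "_" is kept as-is,
// otherwise "_" is appended so the output keys below become e.g. "chunk_1_output_R1".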
var results: Map[String,File] = Map()
var R1: File = new File(R1_in)
var R2: File = new File(R2_in)
if (!skipClip) { // Adapter clipping
val cutadapt_R1 = new Cutadapt(this)
if (!skipTrim || paired) cutadapt_R1.isIntermediate = true
@@ -157,12 +158,12 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
sickle.input_R1 = R1
sickle.output_R1 = swapExt(outDir, R1, R1_ext, ".trim"+R1_ext)
if (outputFiles.contains("qualtype_R1")) sickle.qualityTypeFile = outputFiles("qualtype_R1")
if (!skipClip) sickle.deps :+= outputFiles(chunk + "fastq_input_R1")
if (!skipClip) sickle.deps :+= R1_in
if (paired) {
sickle.input_R2 = R2
sickle.output_R2 = swapExt(outDir, R2, R2_ext, ".trim"+R2_ext)
sickle.output_singles = swapExt(outDir, R2, R2_ext, ".trim.singles"+R1_ext)
if (!skipClip) sickle.deps :+= outputFiles(chunk + "fastq_input_R2")
if (!skipClip) sickle.deps :+= R2_in
}
sickle.output_stats = swapExt(outDir, R1, R1_ext, ".trim.stats")
add(sickle)
@@ -170,8 +171,9 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
if (paired) R2 = sickle.output_R2
}
outputFiles += ("output_R1" -> R1)
if (paired) outputFiles += ("output_R2" -> R2)
outputFiles += (chunk + "output_R1" -> R1)
if (paired) outputFiles += (chunk + "output_R2" -> R2)
return (R1, R2)
}
def runFinalize(fastq_R1:List[File], fastq_R2:List[File]) {
@@ -182,10 +184,17 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
for (file <- fastq_R1) R1 = file
for (file <- fastq_R2) R2 = file
} else {
throw new IllegalStateException("Not yet implemented") // For chunking
R1 = new File(outputDir + R1_name + ".qc" + R1_ext)
logger.debug(fastq_R1)
add(Cat(this, fastq_R1, R1), true)
if (paired) {
logger.debug(fastq_R2)
R2 = new File(outputDir + R2_name + ".qc" + R2_ext)
add(Cat(this, fastq_R2, R2), true)
}
}
if (!config("skip_native_link", false)) {
if (fastq_R1.length == 1 && !config("skip_native_link", false)) {
val lnR1 = new Ln(this)
lnR1.in = R1
R1 = new File(outputDir + R1_name + ".qc" + R1_ext)
@@ -200,6 +209,9 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
}
}
outputFiles += ("output_R1" -> R1)
if (paired) outputFiles += ("output_R2" -> R2)
if (!skipTrim || !skipClip) {
addSeqstat(R1, "seqstat_qc_R1")
if (paired) addSeqstat(R2, "seqstat_qc_R2")
......
@@ -4,16 +4,19 @@ import nl.lumc.sasc.biopet.function._
import nl.lumc.sasc.biopet.function.aligners._
import java.util.Date
import nl.lumc.sasc.biopet.core._
import nl.lumc.sasc.biopet.core.apps.FastqSplitter
import nl.lumc.sasc.biopet.core.config._
import nl.lumc.sasc.biopet.pipelines.flexiprep._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.MarkDuplicates
import org.broadinstitute.sting.queue.extensions.picard.MergeSamFiles
import org.broadinstitute.sting.queue.extensions.picard.SortSam
import org.broadinstitute.sting.queue.extensions.picard.AddOrReplaceReadGroups
import org.broadinstitute.sting.queue.function._
import scala.util.parsing.json._
import org.broadinstitute.sting.utils.variant._
import scala.math._
class Mapping(val root:Configurable) extends QScript with BiopetQScript {
qscript =>
@@ -40,6 +43,9 @@ class Mapping(val root:Configurable) extends QScript with BiopetQScript {
@Argument(doc="Reference", shortName="R", required=false)
var referenceFile: File = _
@Argument(doc="auto chuning", shortName="chunking", required=false)
var chunking: Boolean = false
// Readgroup items
@Argument(doc="Readgroup ID", shortName="RGID", required=false)
var RGID: String = _
@@ -68,9 +74,8 @@ class Mapping(val root:Configurable) extends QScript with BiopetQScript {
@Argument(doc="Readgroup predicted insert size", shortName="RGPI", required=false)
var RGPI: Int = _
var paired: Boolean = false
var numberChunks = 0
def init() {
for (file <- configfiles) globalConfig.loadConfigFile(file)
@@ -99,42 +104,101 @@ class Mapping(val root:Configurable) extends QScript with BiopetQScript {
if (RGDS == null && configContains("RGDS")) RGDS = config("RGDS")
if (outputName == null) outputName = RGID
if (!chunking && numberChunks > 1) chunking = true
if (!chunking) chunking = config("chunking", false)
if (chunking) {
val chunkSize:Int = config("chunksize", (1 << 30))
val filesize = if (input_R1.getName.endsWith(".gz") || input_R1.getName.endsWith(".gzip")) input_R1.length * 3
else input_R1.length
if (numberChunks == 0 && configContains("numberchunks")) numberChunks = config("numberchunks")
else if (numberChunks == 0) numberChunks = ceil(filesize.toDouble / chunkSize).toInt
logger.debug("Chunks: " + numberChunks)
}
}
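// Worked example of the estimate above (sizes are illustrative, not from the source):
// a 2.5 GiB gzipped R1 is assumed to expand to roughly 7.5 GiB (on-disk length * 3);
// with the default chunksize of 1 GiB that gives ceil(7.5) = 8 chunks, unless
// "numberchunks" is set in the config, in which case that value takes precedence.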
def biopetScript() {
var fastq_R1: String = input_R1
var fastq_R2: String = if (paired) input_R2 else ""
var fastq_R1: File = input_R1
var fastq_R2: File = if (paired) input_R2 else ""
val flexiprep = new Flexiprep(this)
var bamFiles:List[File] = Nil
var fastq_R1_output: List[File] = Nil
var fastq_R2_output: List[File] = Nil
def removeGz(file:String):String = {
if (file.endsWith(".gz")) return file.substring(0, file.lastIndexOf(".gz"))
else if (file.endsWith(".gzip")) return file.substring(0, file.lastIndexOf(".gzip"))
else return file
}
var chunks:Map[String, (String,String)] = Map()
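// chunk name -> (R1 path, R2 path), one pair per chunk directory; the .gz/.gzip extension
// is stripped because FastqSplitter writes plain FASTQ, and without chunking a single ""
// entry keeps pointing at the original input files.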
if (chunking) for (t <- 1 to numberChunks) {
chunks += ("chunk_" + t -> (removeGz(outputDir + "chunk_" + t + "/" + fastq_R1.getName),
if (paired) removeGz(outputDir + "chunk_" + t + "/" + fastq_R2.getName) else ""))
} else chunks += ("" -> (fastq_R1,fastq_R2))
if (chunking) {
val fastSplitter_R1 = new FastqSplitter(this)
fastSplitter_R1.input = fastq_R1
fastSplitter_R1.memoryLimit = 4
fastSplitter_R1.jobResourceRequests :+= "h_vmem=8G"
for ((chunk,fastqfile) <- chunks) fastSplitter_R1.output :+= fastqfile._1
add(fastSplitter_R1)
if (paired) {
val fastSplitter_R2 = new FastqSplitter(this)
fastSplitter_R2.input = fastq_R2
fastSplitter_R2.memoryLimit = 4
fastSplitter_R2.jobResourceRequests :+= "h_vmem=8G"
for ((chunk,fastqfile) <- chunks) fastSplitter_R2.output :+= fastqfile._2
add(fastSplitter_R2)
}
}
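// From here on every chunk is processed independently: trimming/clipping via Flexiprep and
// the alignment itself both run per chunk, and each chunk contributes one BAM to bamFiles.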
for ((chunk,fastqfile) <- chunks) {
var R1 = fastqfile._1
var R2 = fastqfile._2
if (!skipFlexiprep) {
flexiprep.input_R1 = fastq_R1
if (paired) flexiprep.input_R2 = fastq_R2
flexiprep.outputDir = outputDir + "flexiprep/"
flexiprep.init
flexiprep.runInitialJobs
//flexiprep.biopetScript
val flexiout = flexiprep.runTrimClip(R1, R2, outputDir + chunk, chunk)
logger.debug(chunk + " - " + flexiout)
R1 = flexiout._1
if (paired) R2 = flexiout._2
fastq_R1_output :+= R1
fastq_R2_output :+= R2
}
val chunkDir = if (chunk.isEmpty) outputDir else outputDir + chunk + "/"
if (aligner == "bwa") {
val bwaCommand = new Bwa(this)
bwaCommand.R1 = R1
if (paired) bwaCommand.R2 = R2
bwaCommand.RG = getReadGroup
bwaCommand.output = new File(chunkDir + outputName + ".sam")
add(bwaCommand, isIntermediate = true)
bamFiles :+= addSortSam(List(bwaCommand.output), swapExt(chunkDir,bwaCommand.output,".sam",".bam"), chunkDir)
} else if (aligner == "star") {
val starCommand = Star(this, R1, if (paired) R2 else null, outputDir, isIntermediate = true)
add(starCommand)
bamFiles :+= addAddOrReplaceReadGroups(List(starCommand.outputSam), new File(chunkDir + outputName + ".bam"), chunkDir)
} else if (aligner == "star-2pass") {
val star2pass = Star._2pass(this, R1, if (paired) R2 else null, chunkDir, isIntermediate = true)
addAll(star2pass._2)
bamFiles :+= addAddOrReplaceReadGroups(List(star2pass._1), new File(chunkDir + outputName + ".bam"), chunkDir)
} else throw new IllegalStateException("Option aligner: '" + aligner + "' is not valid")
}
if (!skipFlexiprep) {
val flexiprep = new Flexiprep(this)
flexiprep.input_R1 = fastq_R1
if (paired) flexiprep.input_R2 = fastq_R2
flexiprep.outputDir = outputDir + "flexiprep/"
flexiprep.init
flexiprep.biopetScript
flexiprep.runFinalize(fastq_R1_output, fastq_R2_output)
addAll(flexiprep.functions) // Add the functions of flexiprep to the current function pool
fastq_R1 = flexiprep.outputFiles("output_R1")
if (paired) fastq_R2 = flexiprep.outputFiles("output_R2")
}
var bamFile:File = null
if (aligner == "bwa") {
val bwaCommand = new Bwa(this)
bwaCommand.R1 = fastq_R1
if (paired) bwaCommand.R2 = fastq_R2
bwaCommand.RG = getReadGroup
bwaCommand.output = new File(outputDir + outputName + ".sam")
add(bwaCommand, isIntermediate = true)
bamFile = addSortSam(List(bwaCommand.output), swapExt(outputDir,bwaCommand.output,".sam",".bam"), outputDir)
} else if (aligner == "star") {
val starCommand = Star(this, fastq_R1, if (paired) fastq_R2 else null, outputDir, isIntermediate = true)
add(starCommand)
bamFile = addAddOrReplaceReadGroups(List(starCommand.outputSam), new File(outputDir + outputName + ".bam"), outputDir)
} else if (aligner == "star-2pass") {
val star2pass = Star._2pass(this, fastq_R1, if (paired) fastq_R2 else null, outputDir, isIntermediate = true)
addAll(star2pass._2)
bamFile = addAddOrReplaceReadGroups(List(star2pass._1), new File(outputDir + outputName + ".bam"), outputDir)
} else throw new IllegalStateException("Option aligner: '" + aligner + "' is not valid")
if (!skipMarkduplicates) bamFile = addMarkDuplicates(List(bamFile), swapExt(outputDir,bamFile,".bam",".dedup.bam"), outputDir)
var bamFile = ""
if (!skipMarkduplicates) bamFile = addMarkDuplicates(bamFiles, new File(outputDir + outputName + ".dedup.bam"), outputDir)
if (skipMarkduplicates && chunking) bamFile = addMergeBam(bamFiles, new File(outputDir + outputName + ".bam"), outputDir)
outputFiles += ("finalBamFile" -> bamFile)
}
@@ -152,6 +216,22 @@ class Mapping(val root:Configurable) extends QScript with BiopetQScript {
return sortSam.output
}
def addMergeBam(inputSam:List[File], outputFile:File, dir:String) : File = {
val mergeSam = new MergeSamFiles
mergeSam.input = inputSam
mergeSam.createIndex = true
mergeSam.output = outputFile
mergeSam.memoryLimit = 2
mergeSam.nCoresRequest = 2
mergeSam.assumeSorted = true
mergeSam.USE_THREADING = true
mergeSam.jobResourceRequests :+= "h_vmem=4G"
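// When duplicates are marked afterwards the merged BAM is only an intermediate file,
// so Queue may remove it again once the dedup BAM has been produced.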
if (!skipMarkduplicates) mergeSam.isIntermediate = true
add(mergeSam)
return mergeSam.output
}
def addAddOrReplaceReadGroups(inputSam:List[File], outputFile:File, dir:String) : File = {
val addOrReplaceReadGroups = new AddOrReplaceReadGroups
addOrReplaceReadGroups.input = inputSam
......