From eac4cd7be5fae0eaf05601676c6212cef2709e85 Mon Sep 17 00:00:00 2001 From: Peter van 't Hof <p.j.van_t_hof@lumc.nl> Date: Sat, 22 Nov 2014 16:31:43 +0100 Subject: [PATCH] Added mapping as separated artifact --- public/mapping/.gitignore | 1 + public/mapping/pom.xml | 88 +++++ .../biopet/pipelines/mapping/Mapping.scala | 316 ++++++++++++++++++ 3 files changed, 405 insertions(+) create mode 100644 public/mapping/.gitignore create mode 100644 public/mapping/pom.xml create mode 100644 public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala diff --git a/public/mapping/.gitignore b/public/mapping/.gitignore new file mode 100644 index 000000000..a6f89c2da --- /dev/null +++ b/public/mapping/.gitignore @@ -0,0 +1 @@ +/target/ \ No newline at end of file diff --git a/public/mapping/pom.xml b/public/mapping/pom.xml new file mode 100644 index 000000000..9a7f688f4 --- /dev/null +++ b/public/mapping/pom.xml @@ -0,0 +1,88 @@ +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>nl.lumc.sasc</groupId> + <artifactId>Mapping</artifactId> + <version>0.2.0-DEV</version> + <packaging>jar</packaging> + + <inceptionYear>2014</inceptionYear> + <name>Mapping</name> + <url>http://maven.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <sting.unpack.phase>prepare-package</sting.unpack.phase> + <sting.shade.phase>package</sting.shade.phase> + <app.main.class>nl.lumc.sasc.biopet.core.BiopetExecutable</app.main.class> + </properties> + + <dependencies> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>BiopetFramework</artifactId> + <version>0.2.0-DEV</version> + </dependency> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Flexiprep</artifactId> + <version>0.2.0-DEV</version> + </dependency> + </dependencies> + <build> + <resources> + <resource> + <directory>src/main/resources</directory> + <includes> + <include>**/*</include> + </includes> + </resource> + </resources> + <plugins> + <plugin> + <groupId>org.scala-tools</groupId> + <artifactId>maven-scala-plugin</artifactId> + <version>2.15.2</version> + <executions> + <execution> + <id>scala-compile</id> + <goals> + <goal>compile</goal> + <goal>testCompile</goal> + </goals> + <configuration> + <args> + <arg>-dependencyfile</arg> + <arg>${project.build.directory}/.scala_dependencies</arg> + <arg>-deprecation</arg> + <arg>-feature</arg> + </args> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <version>2.5</version> + <configuration> + <archive> + <manifest> + <addDefaultImplementationEntries>true</addDefaultImplementationEntries> + <addDefaultSpecificationEntries>true</addDefaultSpecificationEntries> + </manifest> + </archive> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>2.3.2</version> + <configuration> + <showDeprecation>true</showDeprecation> + </configuration> + </plugin> + </plugins> + </build> +</project> diff --git a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala new file mode 100644 index 000000000..681948583 --- /dev/null +++ b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala @@ -0,0 +1,316 @@ +package nl.lumc.sasc.biopet.pipelines.mapping + +import nl.lumc.sasc.biopet.core.config.Configurable +import java.io.File +import java.util.Date +import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand } +import nl.lumc.sasc.biopet.tools.FastqSplitter +import nl.lumc.sasc.biopet.extensions.aligners.{ Bwa, Star, Bowtie, Stampy } +import nl.lumc.sasc.biopet.extensions.picard.{ MarkDuplicates, SortSam, MergeSamFiles, AddOrReplaceReadGroups } +import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics +import nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep +import org.broadinstitute.gatk.queue.QScript +import org.broadinstitute.gatk.utils.commandline.{ Input, Argument, ClassType } +import scala.math._ + +class Mapping(val root: Configurable) extends QScript with BiopetQScript { + qscript => + def this() = this(null) + + @Input(doc = "R1 fastq file", shortName = "R1", required = true) + var input_R1: File = _ + + @Input(doc = "R2 fastq file", shortName = "R2", required = false) + var input_R2: File = _ + + @Argument(doc = "Output name", shortName = "outputName", required = false) + var outputName: String = _ + + @Argument(doc = "Skip flexiprep", shortName = "skipflexiprep", required = false) + var skipFlexiprep: Boolean = false + + @Argument(doc = "Skip mark duplicates", shortName = "skipmarkduplicates", required = false) + var skipMarkduplicates: Boolean = false + + @Argument(doc = "Skip metrics", shortName = "skipmetrics", required = false) + var skipMetrics: Boolean = false + + @Argument(doc = "Aligner", shortName = "ALN", required = false) + var aligner: String = config("aligner", default = "bwa") + + @Argument(doc = "Reference", shortName = "R", required = false) + var reference: File = config("reference") + + @Argument(doc = "Chunking", shortName = "chunking", required = false) + var chunking: Boolean = config("chunking", false) + + @ClassType(classOf[Int]) + @Argument(doc = "Number of chunks, when not defined pipeline will automatic calculate number of chunks", shortName = "numberChunks", required = false) + var numberChunks: Option[Int] = None + + // Readgroup items + @Argument(doc = "Readgroup ID", shortName = "RGID", required = false) + var RGID: String = config("RGID") + + @Argument(doc = "Readgroup Library", shortName = "RGLB", required = false) + var RGLB: String = config("RGLB") + + @Argument(doc = "Readgroup Platform", shortName = "RGPL", required = false) + var RGPL: String = config("RGPL", default = "illumina") + + @Argument(doc = "Readgroup platform unit", shortName = "RGPU", required = false) + var RGPU: String = config("RGPU", default = "na") + + @Argument(doc = "Readgroup sample", shortName = "RGSM", required = false) + var RGSM: String = config("RGSM") + + @Argument(doc = "Readgroup sequencing center", shortName = "RGCN", required = false) + var RGCN: String = config("RGCN") + + @Argument(doc = "Readgroup description", shortName = "RGDS", required = false) + var RGDS: String = config("RGDS") + + @Argument(doc = "Readgroup sequencing date", shortName = "RGDT", required = false) + var RGDT: Date = _ + + @Argument(doc = "Readgroup predicted insert size", shortName = "RGPI", required = false) + var RGPI: Int = config("RGPI") + + var paired: Boolean = false + val flexiprep = new Flexiprep(this) + + def init() { + if (outputDir == null) throw new IllegalStateException("Missing Output directory on mapping module") + else if (!outputDir.endsWith("/")) outputDir += "/" + if (input_R1 == null) throw new IllegalStateException("Missing FastQ R1 on mapping module") + paired = (input_R2 != null) + + if (RGLB == null) throw new IllegalStateException("Missing Readgroup library on mapping module") + if (RGLB == null) throw new IllegalStateException("Missing Readgroup sample on mapping module") + if (RGID == null && RGSM != null && RGLB != null) RGID = RGSM + "-" + RGLB + else if (RGID == null) throw new IllegalStateException("Missing Readgroup ID on mapping module") + + if (outputName == null) outputName = RGID + + if (chunking) { + if (numberChunks.isEmpty) { + if (config.contains("numberchunks")) numberChunks = config("numberchunks", default = None) + else { + val chunkSize: Int = config("chunksize", (1 << 30)) + val filesize = if (input_R1.getName.endsWith(".gz") || input_R1.getName.endsWith(".gzip")) input_R1.length * 3 + else input_R1.length + numberChunks = Option(ceil(filesize.toDouble / chunkSize).toInt) + } + } + logger.debug("Chunks: " + numberChunks.getOrElse(1)) + } + } + + def biopetScript() { + var fastq_R1: File = input_R1 + var fastq_R2: File = if (paired) input_R2 else "" + if (!skipFlexiprep) { + flexiprep.outputDir = outputDir + "flexiprep/" + flexiprep.input_R1 = fastq_R1 + if (paired) flexiprep.input_R2 = fastq_R2 + flexiprep.sampleName = this.RGSM + flexiprep.libraryName = this.RGLB + flexiprep.init + flexiprep.runInitialJobs + } + var bamFiles: List[File] = Nil + var fastq_R1_output: List[File] = Nil + var fastq_R2_output: List[File] = Nil + + def removeGz(file: String): String = { + if (file.endsWith(".gz")) return file.substring(0, file.lastIndexOf(".gz")) + else if (file.endsWith(".gzip")) return file.substring(0, file.lastIndexOf(".gzip")) + else return file + } + var chunks: Map[String, (String, String)] = Map() + if (chunking) for (t <- 1 to numberChunks.getOrElse(1)) { + val chunkDir = outputDir + "chunks/" + t + "/" + chunks += (chunkDir -> (removeGz(chunkDir + fastq_R1.getName), + if (paired) removeGz(chunkDir + fastq_R2.getName) else "")) + } + else chunks += (outputDir -> (flexiprep.extractIfNeeded(fastq_R1, flexiprep.outputDir), + flexiprep.extractIfNeeded(fastq_R2, flexiprep.outputDir))) + + if (chunking) { + val fastSplitter_R1 = new FastqSplitter(this) + fastSplitter_R1.input = fastq_R1 + for ((chunkDir, fastqfile) <- chunks) fastSplitter_R1.output :+= fastqfile._1 + add(fastSplitter_R1, isIntermediate = true) + + if (paired) { + val fastSplitter_R2 = new FastqSplitter(this) + fastSplitter_R2.input = fastq_R2 + for ((chunkDir, fastqfile) <- chunks) fastSplitter_R2.output :+= fastqfile._2 + add(fastSplitter_R2, isIntermediate = true) + } + } + + for ((chunkDir, fastqfile) <- chunks) { + var R1 = fastqfile._1 + var R2 = fastqfile._2 + var deps: List[File] = Nil + if (!skipFlexiprep) { + val flexiout = flexiprep.runTrimClip(R1, R2, chunkDir + "flexiprep/", chunkDir) + logger.debug(chunkDir + " - " + flexiout) + R1 = flexiout._1 + if (paired) R2 = flexiout._2 + deps = flexiout._3 + fastq_R1_output :+= R1 + fastq_R2_output :+= R2 + } + + val outputBam = new File(chunkDir + outputName + ".bam") + bamFiles :+= outputBam + aligner match { + case "bwa" => addBwa(R1, R2, outputBam, deps) + case "bowtie" => addBowtie(R1, R2, outputBam, deps) + case "stampy" => addStampy(R1, R2, outputBam, deps) + case "star" => addStar(R1, R2, outputBam, deps) + case "star-2pass" => addStar2pass(R1, R2, outputBam, deps) + case _ => throw new IllegalStateException("Option Aligner: '" + aligner + "' is not valid") + } + if (config("chunk_metrics", default = false)) + addAll(BamMetrics(this, outputBam, chunkDir + "metrics/").functions) + } + if (!skipFlexiprep) { + flexiprep.runFinalize(fastq_R1_output, fastq_R2_output) + addAll(flexiprep.functions) // Add function of flexiprep to curent function pool + } + + var bamFile = bamFiles.head + if (!skipMarkduplicates) { + bamFile = new File(outputDir + outputName + ".dedup.bam") + add(MarkDuplicates(this, bamFiles, bamFile)) + } else if (skipMarkduplicates && chunking) { + val mergeSamFile = MergeSamFiles(this, bamFiles, outputDir) + add(mergeSamFile) + bamFile = mergeSamFile.output + } + + if (!skipMetrics) addAll(BamMetrics(this, bamFile, outputDir + "metrics/").functions) + + outputFiles += ("finalBamFile" -> bamFile) + } + + def addBwa(R1: File, R2: File, output: File, deps: List[File]): File = { + val bwaCommand = new Bwa(this) + bwaCommand.R1 = R1 + if (paired) bwaCommand.R2 = R2 + bwaCommand.deps = deps + bwaCommand.R = getReadGroup + bwaCommand.output = this.swapExt(output.getParent, output, ".bam", ".sam") + add(bwaCommand, isIntermediate = true) + val sortSam = SortSam(this, bwaCommand.output, output) + if (chunking || !skipMarkduplicates) sortSam.isIntermediate = true + add(sortSam) + return sortSam.output + } + + def addStampy(R1: File, R2: File, output: File, deps: List[File]): File = { + + var RG: String = "ID:" + RGID + "," + RG += "SM:" + RGSM + "," + RG += "LB:" + RGLB + "," + if (RGDS != null) RG += "DS" + RGDS + "," + RG += "PU:" + RGPU + "," + if (RGPI > 0) RG += "PI:" + RGPI + "," + if (RGCN != null) RG += "CN:" + RGCN + "," + if (RGDT != null) RG += "DT:" + RGDT + "," + RG += "PL:" + RGPL + + val stampyCmd = new Stampy(this) + stampyCmd.R1 = R1 + if (paired) stampyCmd.R2 = R2 + stampyCmd.deps = deps + stampyCmd.readgroup = RG + stampyCmd.sanger = true + stampyCmd.output = this.swapExt(output.getParent, output, ".bam", ".sam") + add(stampyCmd, isIntermediate = true) + val sortSam = SortSam(this, stampyCmd.output, output) + if (chunking || !skipMarkduplicates) sortSam.isIntermediate = true + add(sortSam) + return sortSam.output + } + + def addBowtie(R1: File, R2: File, output: File, deps: List[File]): File = { + val bowtie = new Bowtie(this) + bowtie.R1 = R1 + if (paired) bowtie.R2 = R2 + bowtie.deps = deps + bowtie.output = this.swapExt(output.getParent, output, ".bam", ".sam") + add(bowtie, isIntermediate = true) + return addAddOrReplaceReadGroups(bowtie.output, output) + } + + def addStar(R1: File, R2: File, output: File, deps: List[File]): File = { + val starCommand = Star(this, R1, if (paired) R2 else null, outputDir, isIntermediate = true, deps = deps) + add(starCommand) + return addAddOrReplaceReadGroups(starCommand.outputSam, output) + } + + def addStar2pass(R1: File, R2: File, output: File, deps: List[File]): File = { + val starCommand = Star._2pass(this, R1, if (paired) R2 else null, outputDir, isIntermediate = true, deps = deps) + addAll(starCommand._2) + return addAddOrReplaceReadGroups(starCommand._1, output) + } + + def addAddOrReplaceReadGroups(input: File, output: File): File = { + val addOrReplaceReadGroups = AddOrReplaceReadGroups(this, input, output) + addOrReplaceReadGroups.createIndex = true + + addOrReplaceReadGroups.RGID = RGID + addOrReplaceReadGroups.RGLB = RGLB + addOrReplaceReadGroups.RGPL = RGPL + addOrReplaceReadGroups.RGPU = RGPU + addOrReplaceReadGroups.RGSM = RGSM + if (RGCN != null) addOrReplaceReadGroups.RGCN = RGCN + if (RGDS != null) addOrReplaceReadGroups.RGDS = RGDS + if (!skipMarkduplicates) addOrReplaceReadGroups.isIntermediate = true + add(addOrReplaceReadGroups) + + return addOrReplaceReadGroups.output + } + + def getReadGroup(): String = { + var RG: String = "@RG\\t" + "ID:" + RGID + "\\t" + RG += "LB:" + RGLB + "\\t" + RG += "PL:" + RGPL + "\\t" + RG += "PU:" + RGPU + "\\t" + RG += "SM:" + RGSM + "\\t" + if (RGCN != null) RG += "CN:" + RGCN + "\\t" + if (RGDS != null) RG += "DS" + RGDS + "\\t" + if (RGDT != null) RG += "DT" + RGDT + "\\t" + if (RGPI > 0) RG += "PI" + RGPI + "\\t" + + return RG.substring(0, RG.lastIndexOf("\\t")) + } +} + +object Mapping extends PipelineCommand { + def loadFromLibraryConfig(root: Configurable, runConfig: Map[String, Any], sampleConfig: Map[String, Any], + runDir: String, startJobs: Boolean = true): Mapping = { + val mapping = new Mapping(root) + + logger.debug("Mapping runconfig: " + runConfig) + if (runConfig.contains("R1")) mapping.input_R1 = new File(runConfig("R1").toString) + if (runConfig.contains("R2")) mapping.input_R2 = new File(runConfig("R2").toString) + mapping.paired = (mapping.input_R2 != null) + mapping.RGLB = runConfig("ID").toString + mapping.RGSM = sampleConfig("ID").toString + if (runConfig.contains("PL")) mapping.RGPL = runConfig("PL").toString + if (runConfig.contains("PU")) mapping.RGPU = runConfig("PU").toString + if (runConfig.contains("CN")) mapping.RGCN = runConfig("CN").toString + mapping.outputDir = runDir + + if (startJobs) { + mapping.init + mapping.biopetScript + } + return mapping + } +} -- GitLab