Merge branch 'feature-Chipcap' into 'develop'

Feature chipcap CArp pipeline can be merged into develop! See merge request !77

Merge branch 'feature-Chipcap' into 'develop'
d46ad182 · bow · c874be08 · 51da5c65 · d46ad182 · d46ad182
Commit d46ad182 authored 10 years ago by bow
--- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala
@@ -17,7 +17,7 @@ package nl.lumc.sasc.biopet.core

 import java.io.File
 import java.io.PrintWriter
-import nl.lumc.sasc.biopet.core.config.{ Config, Configurable }
+import nl.lumc.sasc.biopet.core.config.{ ConfigValueIndex, Config, Configurable }
 import org.broadinstitute.gatk.utils.commandline.Argument
 import org.broadinstitute.gatk.queue.QSettings
 import org.broadinstitute.gatk.queue.function.QFunction
@@ -29,7 +29,14 @@ trait BiopetQScript extends Configurable with GatkLogging {
  @Argument(doc = "JSON config file(s)", fullName = "config_file", shortName = "config", required = false)
  val configfiles: List[File] = Nil

-  var outputDir: String = _
+  var outputDir: String = {
+    val temp = Config.getValueFromMap(Config.global.map, ConfigValueIndex(this.configName, configPath, "output_dir"))
+    if (temp.isEmpty) throw new IllegalArgumentException("No output_dir defined in config")
+    else {
+      val t = temp.get.value.toString
+      if (!t.endsWith("/")) t + "/" else t
+    }
+  }

  @Argument(doc = "Disable all scatters", shortName = "DSC", required = false)
  var disableScatter: Boolean = false

--- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala
@@ -108,7 +108,7 @@ trait MultiSampleQScript extends BiopetQScript {
    def createFile(suffix: String) = new File(sampleDir, sampleId + suffix)

    /** Returns sample directory */
-    def sampleDir = outputDir + "samples" + File.pathSeparator + sampleId + File.pathSeparator
+    def sampleDir = outputDir + "samples" + File.separator + sampleId + File.separator
  }

  /** Sample type, need implementation in pipeline */

--- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2.scala
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2.scala
+package nl.lumc.sasc.biopet.extensions.macs2
+
+import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
+
+/**
+ * Created by sajvanderzeeuw on 12/19/14.
+ */
+abstract class Macs2 extends BiopetCommandLineFunction {
+  executable = config("exe", default = "macs2", submodule = "macs2", freeVar = false)
+  override def versionCommand = executable + " --version"
+  override val versionRegex = """macs2 (.*)""".r
+  override val versionExitcode = List(0, 1)
+}
--- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2CallPeak.scala
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2CallPeak.scala
+package nl.lumc.sasc.biopet.extensions.macs2
+
+import java.io.File
+
+import nl.lumc.sasc.biopet.core.config.Configurable
+import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
+
+class Macs2CallPeak(val root: Configurable) extends Macs2 {
+  @Input(doc = "Treatment input", required = true)
+  var treatment: File = _
+
+  @Input(doc = "Control input", required = false)
+  var control: File = _
+
+  @Output(doc = "Output file NARROWPEAKS")
+  private var output_narrow: File = _
+
+  @Output(doc = "Output file BROADPEAKS")
+  private var output_broad: File = _
+
+  @Output(doc = "Output in Excel format")
+  private var output_xls: File = _
+
+  @Output(doc = "R script with Bimodal model")
+  private var output_r: File = _
+
+  @Output(doc = "Output file Bedgraph")
+  private var output_bdg: File = _
+
+  @Output(doc = "Output file gappedPeak")
+  private var output_gapped: File = _
+
+  var fileformat: String = config("fileformat")
+  var gsize: Option[Float] = config("gsize")
+  var keepdup: Boolean = config("keep-dup", default = false)
+  var buffersize: Option[Int] = config("buffer-size")
+  var outputdir: String = config("outputDir")
+  var name: String = config("name")
+  var bdg: Boolean = config("bdg", default = false)
+  var verbose: Boolean = config("verbose", default = false)
+  var tsize: Option[Int] = config("tsize")
+  var bandwith: Option[Int] = config("bandwith")
+  var mfold: Option[Float] = config("mfold")
+  var fixbimodel: Boolean = config("fixbimodel", default = false)
+  var nomodel: Boolean = config("nomodel", default = false)
+  var shift: Option[Int] = config("shift")
+  var qvalue: Option[Float] = config("qvalue")
+  var pvalue: Option[Float] = config("pvalue")
+  var tolarge: Boolean = config("tolarge", default = false)
+  var downsample: Boolean = config("downsample", default = false)
+  var nolambda: Boolean = config("nolambda", default = false)
+  var slocal: Option[Int] = config("slocal")
+  var llocal: Option[Int] = config("llocal")
+  var broad: Boolean = config("broad", default = false)
+  var broadcutoff: Option[Int] = config("broadcutoff")
+  var callsummits: Boolean = config("callsummits", default = false)
+
+  override def afterGraph: Unit = {
+    output_narrow = new File(outputdir + name + ".narrowPeak")
+    output_broad = new File(outputdir + name + ".broadPeak")
+    output_xls = new File(outputdir + name + ".xls")
+    output_bdg = new File(outputdir + name + ".bdg")
+    output_r = new File(outputdir + name + ".r")
+    output_gapped = new File(outputdir + name + ".gappedPeak")
+  }
+
+  def cmdLine = {
+    required(executable) + required("callpeak") +
+      required("--treatment", treatment) + /* Treatment sample */
+      optional("--control", control) + /* Control sample */
+      optional("-f", fileformat) + /* Input file format */
+      required("-g", gsize) + /* Estimated genome size.(format: 2.7e9) (presets: hs, mm, ce, dm) */
+      conditional(keepdup, "--keep-dup") + /* Whether to keep duplicates */
+      optional("--buffer-size", buffersize) + /* Buffer size */
+      required("--outdir", outputdir) + /* The output directory */
+      optional("--name", name) + /* prefix name of the output files. (note that also the peak names inside the files will have this name */
+      conditional(bdg, "-B") + /* Whether to output in BDG format */
+      conditional(verbose, "--verbose") + /* Whether to output verbosely */
+      optional("--tsize", tsize) + /* Sets custom tag length, if not specified macs will use first 10 sequences to estimate the size */
+      optional("--bw", bandwith) + /* The bandwith to use for model building. Set this parameter as the sonication fragment size estimated in the wetlab */
+      optional("--mfold", mfold) + /* The parameter to select regions within the model fold. Must be a upper and lower limit. */
+      conditional(fixbimodel, "--fix-bimodal") + /* Whether turn on the auto paired-peak model process. If it's set, when MACS failed to build paired model, it will use the nomodel settings, the '--extsize' parameter to extend each tags. If set, MACS will be terminated if paried-peak model is failed. */
+      conditional(nomodel, "--nomodel") + /* While on, MACS will bypass building the shifting model */
+      optional("--shift", shift) + /* You can set an arbitrary shift in basepairs here */
+      optional("--extsize", shift) + /* While '--nomodel' is set, MACS uses this parameter to extend reads in 5'->3' direction to fix-sized fragments. For example, if the size of binding region for your transcription factor is 200 bp, and you want to bypass the model building by MACS, this parameter can be set as 200. This option is only valid when --nomodel is set or when MACS fails to build model and --fix-bimodal is on. */
+      optional("--qvalue", qvalue) + /* the Q-value(FDR) cutoff */
+      optional("--pvalue", pvalue) + /* The P-value cutoff, if --pvalue is set no Qvalue is calculated */
+      conditional(tolarge, "--to-large") + /* Whether to scale up the smallest input file to the larger one */
+      conditional(downsample, "--down-sample") + /* This is the reversed from --to-large */
+      conditional(nolambda, "--nolambda") + /* With this flag on, MACS will use the background lambda as local lambda. This means MACS will not consider the local bias at peak candidate regions.*/
+      optional("--slocal", slocal) + /* These two parameters control which two levels of regions will be checked around the peak regions to calculate the maximum lambda as local lambda */
+      optional("--llocal", llocal) +
+      conditional(broad, "--broad") + /* whether to enable broad peak calling */
+      optional("--broad-cutoff", broadcutoff) + /* Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. */
+      conditional(callsummits, "--call-summits") /* MACS will now reanalyze the shape of signal profile (p or q-score depending on cutoff setting) to deconvolve subpeaks within each peak called from general procedure. */
+  }
+}
--- a/public/biopet-public-package/pom.xml
+++ b/public/biopet-public-package/pom.xml
@@ -80,6 +80,11 @@
            <artifactId>Kopisu</artifactId>
            <version>${project.version}</version>
        </dependency>
+        <dependency>
+            <groupId>nl.lumc.sasc</groupId>
+            <artifactId>Carp</artifactId>
+            <version>${project.version}</version>
+        </dependency>
    </dependencies>
    <build>
        <plugins>

--- a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala
+++ b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala
@@ -23,7 +23,8 @@ object BiopetExecutablePublic extends BiopetExecutable {
    nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics,
    nl.lumc.sasc.biopet.pipelines.yamsvp.Yamsvp,
    nl.lumc.sasc.biopet.pipelines.sage.Sage,
-    nl.lumc.sasc.biopet.pipelines.kopisu.ConiferPipeline
+    nl.lumc.sasc.biopet.pipelines.kopisu.ConiferPipeline,
+    nl.lumc.sasc.biopet.pipelines.carp.Carp
  )

  def tools: List[MainCommand] = List(

--- a/public/carp/.gitignore
+++ b/public/carp/.gitignore
+/target/
\ No newline at end of file
--- a/public/carp/pom.xml
+++ b/public/carp/pom.xml
+<!--
+
+    Biopet is built on top of GATK Queue for building bioinformatic
+    pipelines. It is mainly intended to support LUMC SHARK cluster which is running
+    SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
+    should also be able to execute Biopet tools and pipelines.
+
+    Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
+
+    Contact us at: sasc@lumc.nl
+
+    A dual licensing mode is applied. The source code within this project that are
+    not part of GATK Queue is freely available for non-commercial use under an AGPL
+    license; For commercial users or users who do not want to follow the AGPL
+    license, please contact us to obtain a separate license.
+
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>nl.lumc.sasc</groupId>
+    <artifactId>Carp</artifactId>
+    <packaging>jar</packaging>
+    
+    <parent>
+        <groupId>nl.lumc.sasc</groupId>
+        <artifactId>Biopet</artifactId>
+        <version>0.3.0-DEV</version>
+        <relativePath>../</relativePath>
+    </parent>
+    
+    <inceptionYear>2014</inceptionYear>
+    <name>Carp</name>
+
+    <dependencies>
+        <dependency>
+            <groupId>nl.lumc.sasc</groupId>
+            <artifactId>BiopetFramework</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>nl.lumc.sasc</groupId>
+            <artifactId>Mapping</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+    </dependencies>
+</project>
--- a/public/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala
+++ b/public/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala
+/**
+ * Biopet is built on top of GATK Queue for building bioinformatic
+ * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
+ * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
+ * should also be able to execute Biopet tools and pipelines.
+ *
+ * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
+ *
+ * Contact us at: sasc@lumc.nl
+ *
+ * A dual licensing mode is applied. The source code within this project that are
+ * not part of GATK Queue is freely available for non-commercial use under an AGPL
+ * license; For commercial users or users who do not want to follow the AGPL
+ * license, please contact us to obtain a separate license.
+ */
+package nl.lumc.sasc.biopet.pipelines.carp
+
+import java.io.File
+
+import nl.lumc.sasc.biopet.extensions.Ln
+import nl.lumc.sasc.biopet.extensions.macs2.Macs2CallPeak
+import nl.lumc.sasc.biopet.extensions.picard.MergeSamFiles
+import nl.lumc.sasc.biopet.utils.ConfigUtils
+import org.broadinstitute.gatk.queue.QScript
+import org.broadinstitute.gatk.utils.commandline.{ Argument, Input }
+import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
+import nl.lumc.sasc.biopet.core._
+import nl.lumc.sasc.biopet.core.config._
+import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
+
+/**
+ * Carp pipeline
+ * Chip-Seq analysis pipeline
+ * This pipeline performs QC,mapping and peak calling
+ */
+class Carp(val root: Configurable) extends QScript with MultiSampleQScript {
+  qscript =>
+  def this() = this(null)
+
+  override def defaults = ConfigUtils.mergeMaps(Map(
+    "mapping" -> Map("skip_markduplicates" -> true)
+  ), super.defaults)
+
+  def makeSample(id: String) = new Sample(id)
+  class Sample(sampleId: String) extends AbstractSample(sampleId) {
+    def makeLibrary(id: String) = new Library(id)
+    class Library(libraryId: String) extends AbstractLibrary(libraryId) {
+      val mapping = new Mapping(qscript)
+
+      def addJobs(): Unit = {
+        if (config.contains("R1")) {
+          mapping.input_R1 = config("R1")
+          if (config.contains("R2")) mapping.input_R2 = config("R2")
+          mapping.libraryId = libraryId
+          mapping.sampleId = sampleId
+          mapping.outputDir = libDir
+
+          mapping.init
+          mapping.biopetScript
+          addAll(mapping.functions)
+
+        } else logger.error("Sample: " + sampleId + ": No R1 found for library: " + libraryId)
+      }
+    }
+
+    val bamFile = createFile(".bam")
+    val controls: List[String] = config("control", default = Nil)
+
+    def addJobs(): Unit = {
+      addLibsJobs()
+      val bamFiles = libraries.map(_._2.mapping.finalBamFile).toList
+      if (bamFiles.length == 1) {
+        add(Ln(qscript, bamFiles.head, bamFile))
+        val oldIndex = new File(bamFiles.head.getAbsolutePath.stripSuffix(".bam") + ".bai")
+        val newIndex = new File(bamFile.getAbsolutePath.stripSuffix(".bam") + ".bai")
+        add(Ln(qscript, oldIndex, newIndex))
+      } else if (bamFiles.length > 1) {
+        val merge = new MergeSamFiles(qscript)
+        merge.input = bamFiles
+        merge.sortOrder = "coordinate"
+        merge.output = bamFile
+        add(merge)
+
+        //TODO: Add BigWIg track
+      }
+
+      val macs2 = new Macs2CallPeak(qscript)
+      macs2.treatment = bamFile
+      macs2.name = sampleId
+      macs2.outputdir = sampleDir + "macs2/" + macs2.name + "/"
+      add(macs2)
+    }
+  }
+
+  def init() {
+  }
+
+  def biopetScript() {
+    // Define what the pipeline should do
+    // First step is QC, this will be done with Flexiprep
+    // Second step is mapping, this will be done with the Mapping pipeline
+    // Third step is calling peaks on the bam files produced with the mapping pipeline, this will be done with MACS2
+    logger.info("Starting CArP pipeline")
+
+    addSamplesJobs
+
+    for ((sampleId, sample) <- samples) {
+      for (controlId <- sample.controls) {
+        if (!samples.contains(controlId))
+          throw new IllegalStateException("For sample: " + sampleId + " this control: " + controlId + " does not exist")
+        val macs2 = new Macs2CallPeak(this)
+        macs2.treatment = sample.bamFile
+        macs2.control = samples(controlId).bamFile
+        macs2.name = sampleId + "_VS_" + controlId
+        macs2.outputdir = sample.sampleDir + "/" + "macs2/" + macs2.name + "/"
+        add(macs2)
+      }
+    }
+  }
+}
+
+object Carp extends PipelineCommand
--- a/public/pom.xml
+++ b/public/pom.xml
@@ -34,6 +34,7 @@
        <module>sage</module>
        <module>kopisu</module>
        <module>yamsvp</module>
+        <module>carp</module>
    </modules>
    
    <properties>