Gentrap.scala 12 KB
Newer Older
bow's avatar
bow committed
1
/**
2 3 4 5 6 7 8 9 10 11 12 13 14
  * Biopet is built on top of GATK Queue for building bioinformatic
  * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
  * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
  * should also be able to execute Biopet tools and pipelines.
  *
  * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
  *
  * Contact us at: sasc@lumc.nl
  *
  * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
  * license; For commercial users or users who do not want to follow the AGPL
  * license, please contact us to obtain a separate license.
  */
bow's avatar
bow committed
15 16 17
package nl.lumc.sasc.biopet.pipelines.gentrap

import nl.lumc.sasc.biopet.core._
18
import nl.lumc.sasc.biopet.core.annotations.{AnnotationRefFlat, RibosomalRefFlat}
19
import nl.lumc.sasc.biopet.core.report.ReportBuilderExtension
Peter van 't Hof's avatar
Peter van 't Hof committed
20 21 22 23 24 25
import nl.lumc.sasc.biopet.extensions.tools.{
  CheckValidateAnnotation,
  RefflatStats,
  ValidateAnnotation,
  WipeReads
}
26
import nl.lumc.sasc.biopet.pipelines.gentrap.Gentrap.{ExpMeasures, StrandProtocol}
27
import nl.lumc.sasc.biopet.pipelines.gentrap.measures._
28
import nl.lumc.sasc.biopet.pipelines.mapping.MultisampleMappingTrait
29
import nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling
30
import nl.lumc.sasc.biopet.utils.{LazyCheck, Logging}
31
import nl.lumc.sasc.biopet.utils.config._
32
import nl.lumc.sasc.biopet.utils.camelize
Peter van 't Hof's avatar
Peter van 't Hof committed
33 34
import org.broadinstitute.gatk.queue.QScript
import picard.analysis.directed.RnaSeqMetricsCollector.StrandSpecificity
35
import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
36 37

import scala.language.reflectiveCalls
bow's avatar
bow committed
38

bow's avatar
bow committed
39
/**
40 41 42 43 44 45 46 47 48 49 50
  * Gentrap pipeline
  * Generic transcriptome analysis pipeline
  *
  * @author Peter van 't Hof <p.j.van_t_hof@lumc.nl>
  * @author Wibowo Arindrarto <w.arindrarto@lumc.nl>
  */
class Gentrap(val parent: Configurable)
    extends QScript
    with MultisampleMappingTrait
    with AnnotationRefFlat
    with RibosomalRefFlat { qscript =>
bow's avatar
bow committed
51

52
  // alternative constructor for initialization with empty configuration
bow's avatar
bow committed
53 54
  def this() = this(null)

55 56 57
  override def reportClass: Option[ReportBuilderExtension] = {
    val report = new GentrapReport(this)
    report.outputDir = new File(outputDir, "report")
58
    report.summaryDbFile = summaryDbFile
59 60 61
    Some(report)
  }

62
  /** Expression measurement modes */
bow's avatar
bow committed
63
  // see the enumeration below for valid modes
Peter van 't Hof's avatar
Peter van 't Hof committed
64
  lazy val expMeasures = new LazyCheck({
65 66 67 68 69 70 71 72
    config("expression_measures", default = Nil).asStringList
      .map(value =>
        ExpMeasures.values.find(_.toString == camelize(value)) match {
          case Some(v) => v
          case _ =>
            throw new IllegalArgumentException(s"'$value' is not a valid Expression measurement")
      })
      .toSet
Peter van 't Hof's avatar
Peter van 't Hof committed
73
  })
74

75
  /** Strandedness modes */
Peter van 't Hof's avatar
Peter van 't Hof committed
76
  lazy val strandProtocol = new LazyCheck({
77
    val value: String = config("strand_protocol")
78
    StrandProtocol.values.find(_.toString == camelize(value)) match {
79
      case Some(v) => v
Peter van 't Hof's avatar
Peter van 't Hof committed
80
      case other =>
81 82
        Logging.addError(s"'$other' is no strand_protocol or strand_protocol is not given")
        StrandProtocol.NonSpecific
83
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
84
  })
85

86
  /** Whether to remove rRNA regions or not */
87
  lazy val removeRibosomalReads: Boolean = config("remove_ribosomal_reads", default = false)
88

bow's avatar
bow committed
89
  /** Default pipeline config */
Peter van 't Hof's avatar
Peter van 't Hof committed
90
  override def defaults: Map[String, Any] = super.defaults ++ Map(
Peter van 't Hof's avatar
Peter van 't Hof committed
91
    "htseqcount" -> (if (strandProtocol.isSet) Map("stranded" -> (strandProtocol() match {
92 93 94 95 96
                       case StrandProtocol.NonSpecific => "no"
                       case StrandProtocol.Dutp => "reverse"
                       case otherwise => throw new IllegalStateException(otherwise.toString)
                     }))
                     else Map()),
Peter van 't Hof's avatar
Peter van 't Hof committed
97
    "cufflinks" -> (if (strandProtocol.isSet) Map("library_type" -> (strandProtocol() match {
98 99 100 101 102
                      case StrandProtocol.NonSpecific => "fr-unstranded"
                      case StrandProtocol.Dutp => "fr-firststrand"
                      case otherwise => throw new IllegalStateException(otherwise.toString)
                    }))
                    else Map()),
103
    "merge_strategy" -> "preprocessmergesam",
Peter van 't Hof's avatar
Peter van 't Hof committed
104 105
    "gsnap" -> Map(
      "novelsplicing" -> 1,
106
      "batch" -> 4
Peter van 't Hof's avatar
Peter van 't Hof committed
107
    ),
108 109 110 111
    "shivavariantcalling" -> Map(
      "variantcallers" -> List("varscan_cns_singlesample"),
      "name_prefix" -> "multisample"
    ),
Peter van 't Hof's avatar
Peter van 't Hof committed
112
    "bammetrics" -> Map(
113 114
      "wgs_metrics" -> false,
      "rna_metrics" -> true,
Peter van 't Hof's avatar
Peter van 't Hof committed
115 116 117 118 119 120 121 122 123 124 125 126
      "collectrnaseqmetrics" -> (if (strandProtocol.isSet)
                                   Map(
                                     "strand_specificity" -> (strandProtocol() match {
                                       case StrandProtocol.NonSpecific =>
                                         StrandSpecificity.NONE.toString
                                       case StrandProtocol.Dutp =>
                                         StrandSpecificity.SECOND_READ_TRANSCRIPTION_STRAND.toString
                                       case otherwise =>
                                         throw new IllegalStateException(otherwise.toString)
                                     })
                                   )
                                 else Map())
Peter van 't Hof's avatar
Peter van 't Hof committed
127
    ),
Peter van 't Hof's avatar
Peter van 't Hof committed
128 129 130 131 132 133 134
    "cutadapt" -> Map("minimum_length" -> 20),
    // avoid conflicts when merging since the MarkDuplicate tags often cause merges to fail
    "picard" -> Map(
      "programrecordid" -> "null"
    ),
    // disable markduplicates since it may not play well with all aligners (this can still be overriden via config)
    "mapping" -> Map(
135
      "aligner" -> "gsnap",
Peter van 't Hof's avatar
Peter van 't Hof committed
136
      "skip_markduplicates" -> true
Wai Yi Leung's avatar
Wai Yi Leung committed
137 138 139 140
    ),
    "wipereads" -> Map(
      "limit_removal" -> true,
      "no_make_index" -> false
141
    )
Peter van 't Hof's avatar
Peter van 't Hof committed
142
  )
143

144 145
  override def biopetScript(): Unit = {
    val validate = new ValidateAnnotation(this)
146
    validate.refflatFile = Some(annotationRefFlat.get)
147 148 149 150
    fragmentsPerGene.foreach(validate.gtfFile :+= _.annotationGtf)
    cufflinksBlind.foreach(validate.gtfFile :+= _.annotationGtf)
    cufflinksGuided.foreach(validate.gtfFile :+= _.annotationGtf)
    cufflinksStrict.foreach(validate.gtfFile :+= _.annotationGtf)
151
    stringtie.foreach(validate.gtfFile :+= _.annotationGtf)
152 153 154
    validate.jobOutputFile = new File(outputDir, ".validate.annotation.out")
    add(validate)

155
    val check = new CheckValidateAnnotation(this)
156
    check.inputLogFile = validate.jobOutputFile
157
    check.jobOutputFile = new File(outputDir, ".check.validate.out")
158 159 160 161 162
    add(check)

    super.biopetScript()
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
163
  lazy val fragmentsPerGene: Option[FragmentsPerGene] =
164 165 166
    if (expMeasures().contains(ExpMeasures.FragmentsPerGene))
      Some(new FragmentsPerGene(this))
    else None
167

Peter van 't Hof's avatar
Peter van 't Hof committed
168
  lazy val baseCounts: Option[BaseCounts] =
169 170 171
    if (expMeasures().contains(ExpMeasures.BaseCounts))
      Some(new BaseCounts(this))
    else None
172

173 174 175 176 177
  lazy val stringtie: Option[Stringtie] =
    if (expMeasures().contains(ExpMeasures.Stringtie))
      Some(new Stringtie(this))
    else None

Peter van 't Hof's avatar
Peter van 't Hof committed
178
  lazy val cufflinksBlind: Option[CufflinksBlind] =
179 180 181
    if (expMeasures().contains(ExpMeasures.CufflinksBlind))
      Some(new CufflinksBlind(this))
    else None
182

Peter van 't Hof's avatar
Peter van 't Hof committed
183
  lazy val cufflinksGuided: Option[CufflinksGuided] =
184 185 186
    if (expMeasures().contains(ExpMeasures.CufflinksGuided))
      Some(new CufflinksGuided(this))
    else None
187

Peter van 't Hof's avatar
Peter van 't Hof committed
188
  lazy val cufflinksStrict: Option[CufflinksStrict] =
189 190 191
    if (expMeasures().contains(ExpMeasures.CufflinksStrict))
      Some(new CufflinksStrict(this))
    else None
192

Peter van 't Hof's avatar
Peter van 't Hof committed
193
  def executedMeasures: List[QScript with Measurement] =
194
    (fragmentsPerGene :: baseCounts :: cufflinksBlind ::
195
      cufflinksGuided :: cufflinksStrict :: stringtie :: Nil).flatten
196 197

  /** Whether to do simple variant calling on RNA or not */
Peter van 't Hof's avatar
Peter van 't Hof committed
198 199 200 201 202 203
  lazy val shivaVariantcalling: Option[ShivaVariantcalling] =
    if (config("call_variants", default = false)) {
      val pipeline = new ShivaVariantcalling(this)
      pipeline.outputDir = new File(outputDir, "variantcalling")
      Some(pipeline)
    } else None
bow's avatar
bow committed
204

bow's avatar
bow committed
205
  /** Files that will be listed in the summary file */
206 207 208 209
  override def summaryFiles: Map[String, File] =
    super.summaryFiles ++ Map(
      "annotation_refflat" -> annotationRefFlat()
    ) ++ Map(
Peter van 't Hof's avatar
Peter van 't Hof committed
210
      "ribosome_refflat" -> ribosomalRefFlat()
211
    ).collect { case (key, Some(value)) => key -> value }
bow's avatar
bow committed
212 213

  /** Pipeline settings shown in the summary file */
214
  override def summarySettings: Map[String, Any] = super.summarySettings ++ Map(
Peter van 't Hof's avatar
Peter van 't Hof committed
215
    "expression_measures" -> expMeasures().toList.map(_.toString),
Peter van 't Hof's avatar
Peter van 't Hof committed
216
    "strand_protocol" -> strandProtocol().toString,
217 218
    "call_variants" -> shivaVariantcalling.isDefined,
    "remove_ribosomal_reads" -> removeRibosomalReads
bow's avatar
bow committed
219
  )
bow's avatar
bow committed
220

bow's avatar
bow committed
221
  /** Steps to run before biopetScript */
222 223 224
  override def init(): Unit = {
    super.init()

Peter van 't Hof's avatar
Peter van 't Hof committed
225
    if (expMeasures().isEmpty) Logging.addError("'expression_measures' is missing in the config")
Peter van 't Hof's avatar
Peter van 't Hof committed
226 227 228
    require(Gentrap.StrandProtocol.values.contains(strandProtocol()))
    if (removeRibosomalReads && ribosomalRefFlat().isEmpty)
      Logging.addError("removeRibosomalReads is enabled but no ribosomalRefFlat is given")
229

230 231
    executedMeasures.foreach(x =>
      x.outputDir = new File(outputDir, "expression_measures" + File.separator + x.name))
bow's avatar
bow committed
232
  }
233

234
  /** Pipeline run for multiple samples */
235
  override def addMultiSampleJobs(): Unit = {
236
    super.addMultiSampleJobs()
237 238 239 240 241 242 243 244 245

    val refflatStats = new RefflatStats(this)
    refflatStats.refflatFile = this.annotationRefFlat()
    refflatStats.geneOutput =
      new File(outputDir, "expression_measures" + File.separator + "gene.stats")
    refflatStats.transcriptOutput =
      new File(outputDir, "expression_measures" + File.separator + "transcript.stats")
    refflatStats.exonOutput =
      new File(outputDir, "expression_measures" + File.separator + "exon.stats")
246 247
    refflatStats.intronOutput =
      new File(outputDir, "expression_measures" + File.separator + "intron.stats")
248 249 250
    refflatStats.jobOutputFile = new File(outputDir, ".reflatstats.out")
    add(refflatStats)

251
    // merge expression tables
252 253
    executedMeasures.foreach(add)
    shivaVariantcalling.foreach(add)
254
  }
255

256
  /** Returns a [[Sample]] object */
257
  override def makeSample(sampleId: String): Sample = new Sample(sampleId)
bow's avatar
bow committed
258

259
  /**
260 261 262 263
    * Gentrap sample
    *
    * @param sampleId Unique identifier of the sample
    */
264
  class Sample(sampleId: String) extends super.Sample(sampleId) {
265

266
    /** Summary stats of the sample */
267
    override def summaryStats: Map[String, Any] = super.summaryStats ++ Map(
268 269 270
      "all_paired" -> allPaired,
      "all_single" -> allSingle
    )
271

Peter van 't Hof's avatar
Peter van 't Hof committed
272
    override lazy val preProcessBam: Option[File] = if (removeRibosomalReads) {
273 274
      val job = new WipeReads(qscript)
      job.inputBam = bamFile.get
Peter van 't Hof's avatar
Peter van 't Hof committed
275
      ribosomalRefFlat().foreach(job.intervalFile = _)
Peter van 't Hof's avatar
Peter van 't Hof committed
276
      job.outputBam = createFile("cleaned.bam")
Wai Yi Leung's avatar
Wai Yi Leung committed
277
      job.discardedBam = Some(createFile("rrna.bam"))
278 279 280
      add(job)
      Some(job.outputBam)
    } else bamFile
281

282
    /** Whether all libraries are paired or not */
283
    def allPaired: Boolean = libraries.values.forall(_.mapping.forall(_.inputR2.isDefined))
284 285

    /** Whether all libraries are single or not */
286
    def allSingle: Boolean = libraries.values.forall(_.mapping.forall(_.inputR2.isEmpty))
287

288
    /** Adds all jobs for the sample */
289 290
    override def addJobs(): Unit = {
      super.addJobs()
291
      // TODO: this is our requirement since it's easier to calculate base counts when all libraries are either paired or single
292 293
      require(allPaired || allSingle,
              s"Sample $sampleId contains only single-end or paired-end libraries")
bow's avatar
bow committed
294
      // add bigwig output, also per-strand when possible
295 296 297 298 299

      preProcessBam.foreach { file =>
        executedMeasures.foreach(_.addBamfile(sampleId, file))
        shivaVariantcalling.foreach(_.inputBams += sampleId -> file)
      }
300
    }
bow's avatar
bow committed
301
  }
bow's avatar
bow committed
302
}
bow's avatar
bow committed
303

bow's avatar
bow committed
304
object Gentrap extends PipelineCommand {
305

bow's avatar
bow committed
306 307
  /** Enumeration of available expression measures */
  object ExpMeasures extends Enumeration {
Peter van 't Hof's avatar
Peter van 't Hof committed
308 309
    val FragmentsPerGene, BaseCounts, CufflinksStrict, CufflinksGuided, CufflinksBlind, Stringtie =
      Value
310
  }
bow's avatar
bow committed
311

bow's avatar
bow committed
312
  /** Enumeration of available strandedness */
313
  object StrandProtocol extends Enumeration {
bow's avatar
bow committed
314 315 316
    // for now, only non-strand specific and dUTP stranded protocol is supported
    val NonSpecific, Dutp = Value
  }
317
}