Gentrap.scala 36.4 KB
Newer Older
bow's avatar
bow committed
1
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
bow's avatar
bow committed
16
17
package nl.lumc.sasc.biopet.pipelines.gentrap

18
import java.io.File
19
20
import scala.language.reflectiveCalls

bow's avatar
bow committed
21
import org.broadinstitute.gatk.queue.QScript
22
import org.broadinstitute.gatk.queue.function.QFunction
bow's avatar
bow committed
23
import picard.analysis.directed.RnaSeqMetricsCollector.StrandSpecificity
bow's avatar
bow committed
24
import scalaz._, Scalaz._
bow's avatar
bow committed
25

bow's avatar
bow committed
26
import nl.lumc.sasc.biopet.FullVersion
bow's avatar
bow committed
27
28
import nl.lumc.sasc.biopet.core._
import nl.lumc.sasc.biopet.core.config._
bow's avatar
bow committed
29
import nl.lumc.sasc.biopet.core.summary._
30
import nl.lumc.sasc.biopet.extensions.{ HtseqCount, Ln }
bow's avatar
bow committed
31
import nl.lumc.sasc.biopet.extensions.picard.{ CollectRnaSeqMetrics, SortSam, MergeSamFiles }
bow's avatar
bow committed
32
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsView
bow's avatar
bow committed
33
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
bow's avatar
bow committed
34
import nl.lumc.sasc.biopet.pipelines.bamtobigwig.Bam2Wig
bow's avatar
bow committed
35
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
bow's avatar
bow committed
36
import nl.lumc.sasc.biopet.pipelines.gentrap.extensions.{ CustomVarScan, Pdflatex, RawBaseCounter }
bow's avatar
bow committed
37
import nl.lumc.sasc.biopet.pipelines.gentrap.scripts.{ AggrBaseCount, PdfReportTemplateWriter, PlotHeatmap }
38
import nl.lumc.sasc.biopet.utils.ConfigUtils
bow's avatar
bow committed
39
import nl.lumc.sasc.biopet.tools.{ MergeTables, WipeReads }
bow's avatar
bow committed
40

bow's avatar
bow committed
41
42
43
/**
 * Gentrap pipeline
 * Generic transcriptome analysis pipeline
bow's avatar
bow committed
44
45
 *
 * @author Wibowo Arindrarto <w.arindrarto@lumc.nl>
bow's avatar
bow committed
46
 */
bow's avatar
bow committed
47
class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript with SummaryQScript { qscript =>
bow's avatar
bow committed
48

bow's avatar
bow committed
49
50
  import Gentrap._
  import Gentrap.ExpMeasures._
  import Gentrap.StrandProtocol._

  // alternative constructor for initialization with empty configuration
  def this() = this(null)

  /** Split aligner to use */
  var aligner: String = config("aligner", default = "gsnap")

  /** Expression measurement modes; see the ExpMeasures enumeration below for valid values */
  var expMeasures: Set[ExpMeasures.Value] = config("expression_measures")
    .asStringList
    .map(makeExpMeasure)
    .toSet

  /** Strandedness mode */
  var strandProtocol: StrandProtocol.Value = makeStrandProtocol(config("strand_protocol").asString)

  /** GTF reference file */
  var annotationGtf: Option[File] = config("annotation_gtf")

  /** BED reference file */
  var annotationBed: Option[File] = config("annotation_bed")

  /** refFlat reference file */
  var annotationRefFlat: File = config("annotation_refflat")

  /** rRNA refFlat annotation */
  var ribosomalRefFlat: Option[File] = config("ribosome_refflat")

  /** Whether to remove rRNA regions or not */
  var removeRibosomalReads: Boolean = config("remove_ribosomal_reads", default = false)

  /** Whether to do simple variant calling on RNA or not */
  var callVariants: Boolean = config("call_variants", default = false)
86

bow's avatar
bow committed
87
  /**
   * Default pipeline configuration, merged on top of the inherited defaults.
   * Anything set here can still be overridden by the user-supplied config.
   */
  override def defaults = {
    val pipelineDefaults = Map(
      "gsnap" -> Map(
        "novelsplicing" -> 1,
        "batch" -> 4,
        "format" -> "sam"
      ),
      "cutadapt" -> Map("minimum_length" -> 20),
      // avoid conflicts when merging since the MarkDuplicate tags often cause merges to fail
      "picard" -> Map("programrecordid" -> "null"),
      // disable markduplicates since it may not play well with all aligners (can still be overridden via config)
      "mapping" -> Map("skip_markduplicates" -> true)
    )
    ConfigUtils.mergeMaps(pipelineDefaults, super.defaults)
  }

bow's avatar
bow committed
104
105
106
107
108
109
110
111
112
113
114
115
116
  /** Reference FASTA file */
  private val reference: File = config("reference")

  /**
   * Sequence dictionary (.dict) accompanying the reference FASTA.
   * Derived by replacing the reference file's extension with ".dict";
   * the dict file must already exist next to the reference.
   */
  private def referenceDict: File = {
    val refName: String = reference.getName
    require(refName.contains('.'), "Reference file must have an extension")
    val parentDir = Option(reference.getParentFile).getOrElse(new File("."))
    val baseName = refName.take(refName.lastIndexOf('.'))
    val dictFile = new File(parentDir, baseName + ".dict")
    require(dictFile.exists, s"Dict file '$dictFile' must exist")
    dictFile
  }

117
118
119
  /**
   * Creates a job that merges per-sample expression tables into one table for all samples.
   *
   * @param inFunc      extracts the per-sample input table (None when a sample lacks one)
   * @param ext         file extension shared by the input tables and appended to the output name
   * @param idCols      indices of the ID columns, passed to MergeTables as strings
   * @param valCol      index of the value column
   * @param outBaseName base name of the merged output file
   * @param fallback    string written when a sample has no value for a given ID
   * @return the merge job, or None when no sample produced an input table
   */
  // TODO: can we combine the enum with the file extension (to reduce duplication and potential errors)
  private def makeMergeTableJob(inFunc: (Sample => Option[File]), ext: String, idCols: List[Int], valCol: Int,
                                outBaseName: String = "all_samples", fallback: String = "-"): Option[MergeTables] = {
    val tables = samples.values.flatMap(inFunc).toList
    tables.nonEmpty
      .option {
        val job = new MergeTables(qscript)
        job.inputTables = tables
        job.output = new File(outputDir, "expression_estimates" + File.separator + outBaseName + ext)
        job.idColumnIndices = idCols.map(_.toString)
        job.valueColumnIndex = valCol
        job.fileExtension = Option(ext)
        job.fallbackString = Option(fallback)
        // TODO: separate the addition into another function?
        job
      }
  }

bow's avatar
bow committed
136
  /** Expression measures whose merged tables are TMM-normalized during correlation calculation */
  protected lazy val forTmmNormalization: Set[ExpMeasures.Value] =
    Set(FragmentsPerGene, FragmentsPerExon, BasesPerGene, BasesPerExon)

  /**
   * Creates a heatmap-plotting job for a merged expression table.
   *
   * @param mergeJob      merge job whose output table is plotted (None when absent)
   * @param outName       base name for the heatmap PNG
   * @param expMeasure    expression measure the table belongs to; drives TMM normalization
   * @param isCuffIsoform whether the table holds Cufflinks isoform (rather than gene) values
   * @return the plot job, or None when there is no merged table or too few samples
   */
  private def makeHeatmapJob(mergeJob: Option[MergeTables], outName: String,
                             expMeasure: ExpMeasures.Value, isCuffIsoform: Boolean = false): Option[PlotHeatmap] =
    // NOTE(review): this requires strictly more than two samples, while a comment elsewhere in
    // this pipeline mentions plotting when samples > 1 — confirm the intended threshold.
    mergeJob
      .filter(_ => samples.size > 2)
      .map { mj =>
        val job = new PlotHeatmap(qscript)
        job.input = mj.output
        job.output = new File(outputDir, "heatmaps" + File.separator + s"heatmap_$outName.png")
        job.tmmNormalize = forTmmNormalization.contains(expMeasure)
        // TMM-normalized tables are also log-transformed
        job.useLog = job.tmmNormalize
        job.countType =
          if (expMeasure.toString.startsWith("Cufflinks")) {
            if (isCuffIsoform) Option("CufflinksIsoform")
            else Option("CufflinksGene")
          } else Option(expMeasure.toString)
        job
      }

158
  /** Merged gene fragment count table */
  private lazy val geneFragmentsCountJob =
    makeMergeTableJob((s: Sample) => s.geneFragmentsCount, ".fragments_per_gene", List(1), 2, fallback = "0")

  /** Heatmap job for gene fragment counts */
  private lazy val geneFragmentsCountHeatmapJob =
    makeHeatmapJob(geneFragmentsCountJob, "fragments_per_gene", FragmentsPerGene)

  /** Merged exon fragment count table */
  private lazy val exonFragmentsCountJob =
    makeMergeTableJob((s: Sample) => s.exonFragmentsCount, ".fragments_per_exon", List(1), 2, fallback = "0")

  /** Heatmap job for exon fragment counts */
  private lazy val exonFragmentsCountHeatmapJob =
    makeHeatmapJob(exonFragmentsCountJob, "fragments_per_exon", FragmentsPerExon)

  /** Merged gene base count table */
  private lazy val geneBasesCountJob =
    makeMergeTableJob((s: Sample) => s.geneBasesCount, ".bases_per_gene", List(1), 2, fallback = "0")

  /** Heatmap job for gene base counts */
  private lazy val geneBasesCountHeatmapJob =
    makeHeatmapJob(geneBasesCountJob, "bases_per_gene", BasesPerGene)

  /** Merged exon base count table */
  private lazy val exonBasesCountJob =
    makeMergeTableJob((s: Sample) => s.exonBasesCount, ".bases_per_exon", List(1), 2, fallback = "0")

  /** Heatmap job for exon base counts */
  private lazy val exonBasesCountHeatmapJob =
    makeHeatmapJob(exonBasesCountJob, "bases_per_exon", BasesPerExon)

  /** Merged gene FPKM table for Cufflinks, strict mode */
  private lazy val geneFpkmCufflinksStrictJob =
    makeMergeTableJob((s: Sample) => s.geneFpkmCufflinksStrict, ".genes_fpkm_cufflinks_strict", List(1, 7), 10, fallback = "0.0")

  /** Heatmap job for gene FPKM, Cufflinks strict mode */
  private lazy val geneFpkmCufflinksStrictHeatmapJob =
    makeHeatmapJob(geneFpkmCufflinksStrictJob, "genes_fpkm_cufflinks_strict", CufflinksStrict)

  /** Merged isoform FPKM table for Cufflinks, strict mode */
  private lazy val isoFpkmCufflinksStrictJob =
    makeMergeTableJob((s: Sample) => s.isoformFpkmCufflinksStrict, ".isoforms_fpkm_cufflinks_strict", List(1, 7), 10, fallback = "0.0")

  /** Heatmap job for isoform FPKM, Cufflinks strict mode */
  private lazy val isoFpkmCufflinksStrictHeatmapJob =
    makeHeatmapJob(isoFpkmCufflinksStrictJob, "isoforms_fpkm_cufflinks_strict", CufflinksStrict, isCuffIsoform = true)

  /** Merged gene FPKM table for Cufflinks, guided mode */
  private lazy val geneFpkmCufflinksGuidedJob =
    makeMergeTableJob((s: Sample) => s.geneFpkmCufflinksGuided, ".genes_fpkm_cufflinks_guided", List(1, 7), 10, fallback = "0.0")

  /** Heatmap job for gene FPKM, Cufflinks guided mode */
  private lazy val geneFpkmCufflinksGuidedHeatmapJob =
    makeHeatmapJob(geneFpkmCufflinksGuidedJob, "genes_fpkm_cufflinks_guided", CufflinksGuided)

  /** Merged isoform FPKM table for Cufflinks, guided mode */
  private lazy val isoFpkmCufflinksGuidedJob =
    makeMergeTableJob((s: Sample) => s.isoformFpkmCufflinksGuided, ".isoforms_fpkm_cufflinks_guided", List(1, 7), 10, fallback = "0.0")

  /** Heatmap job for isoform FPKM, Cufflinks guided mode */
  private lazy val isoFpkmCufflinksGuidedHeatmapJob =
    makeHeatmapJob(isoFpkmCufflinksGuidedJob, "isoforms_fpkm_cufflinks_guided", CufflinksGuided, isCuffIsoform = true)

  /** Merged gene FPKM table for Cufflinks, blind mode */
  private lazy val geneFpkmCufflinksBlindJob =
    makeMergeTableJob((s: Sample) => s.geneFpkmCufflinksBlind, ".genes_fpkm_cufflinks_blind", List(1, 7), 10, fallback = "0.0")

  /** Heatmap job for gene FPKM, Cufflinks blind mode */
  private lazy val geneFpkmCufflinksBlindHeatmapJob =
    makeHeatmapJob(geneFpkmCufflinksBlindJob, "genes_fpkm_cufflinks_blind", CufflinksBlind)

  /** Merged isoform FPKM table for Cufflinks, blind mode */
  private lazy val isoFpkmCufflinksBlindJob =
    makeMergeTableJob((s: Sample) => s.isoformFpkmCufflinksBlind, ".isoforms_fpkm_cufflinks_blind", List(1, 7), 10, fallback = "0.0")

  /** Heatmap job for isoform FPKM, Cufflinks blind mode */
  private lazy val isoFpkmCufflinksBlindHeatmapJob =
    makeHeatmapJob(isoFpkmCufflinksBlindJob, "isoforms_fpkm_cufflinks_blind", CufflinksBlind, isCuffIsoform = true)
237

bow's avatar
bow committed
238
  /** Container for merge table jobs, keyed by summary label (None for inactive measures) */
  private lazy val mergeTableJobs: Map[String, Option[MergeTables]] = Map(
    "gene_fragments_count" -> geneFragmentsCountJob,
    "exon_fragments_count" -> exonFragmentsCountJob,
    "gene_bases_count" -> geneBasesCountJob,
    "exon_bases_count" -> exonBasesCountJob,
    "gene_fpkm_cufflinks_strict" -> geneFpkmCufflinksStrictJob,
    "isoform_fpkm_cufflinks_strict" -> isoFpkmCufflinksStrictJob,
    "gene_fpkm_cufflinks_guided" -> geneFpkmCufflinksGuidedJob,
    "isoform_fpkm_cufflinks_guided" -> isoFpkmCufflinksGuidedJob,
    "gene_fpkm_cufflinks_blind" -> geneFpkmCufflinksBlindJob,
    "isoform_fpkm_cufflinks_blind" -> isoFpkmCufflinksBlindJob
  )
251

bow's avatar
bow committed
252
  /** Container for heatmap jobs, keyed by summary label (None when no table or too few samples) */
  private lazy val heatmapJobs: Map[String, Option[PlotHeatmap]] = Map(
    "gene_fragments_count_heatmap" -> geneFragmentsCountHeatmapJob,
    "exon_fragments_count_heatmap" -> exonFragmentsCountHeatmapJob,
    "gene_bases_count_heatmap" -> geneBasesCountHeatmapJob,
    "exon_bases_count_heatmap" -> exonBasesCountHeatmapJob,
    "gene_fpkm_cufflinks_strict_heatmap" -> geneFpkmCufflinksStrictHeatmapJob,
    "isoform_fpkm_cufflinks_strict_heatmap" -> isoFpkmCufflinksStrictHeatmapJob,
    "gene_fpkm_cufflinks_guided_heatmap" -> geneFpkmCufflinksGuidedHeatmapJob,
    "isoform_fpkm_cufflinks_guided_heatmap" -> isoFpkmCufflinksGuidedHeatmapJob,
    "gene_fpkm_cufflinks_blind_heatmap" -> geneFpkmCufflinksBlindHeatmapJob,
    "isoform_fpkm_cufflinks_blind_heatmap" -> isoFpkmCufflinksBlindHeatmapJob
  )

bow's avatar
bow committed
266
267
268
269
  /** Output summary file */
  def summaryFile: File = new File(outputDir, "gentrap.summary.json")

  /** Files that will be listed in the summary file */
  def summaryFiles: Map[String, File] = {
    // NOTE: technically we can use different references in each of the samples, but for
    //       simplicity's sake the reference information is stored at the pipeline level now.
    val alwaysPresent = Map(
      "reference_fasta" -> reference,
      "reference_dict" -> referenceDict,
      "annotation_refflat" -> annotationRefFlat)
    // annotation inputs that may be absent from the config
    val optionalAnnotations = Map(
      "annotation_gtf" -> annotationGtf,
      "annotation_bed" -> annotationBed,
      "ribosome_refflat" -> ribosomalRefFlat
    ).collect { case (key, Some(value)) => key -> value }
    val mergedTables = mergeTableJobs.collect { case (key, Some(job)) => key -> job.output }
    val heatmaps = heatmapJobs.collect { case (key, Some(job)) => key -> job.output }
    alwaysPresent ++ optionalAnnotations ++ mergedTables ++ heatmaps
  }
bow's avatar
bow committed
283
284
285
286
287

  /** Statistics shown in the summary file (none at the pipeline level) */
  def summaryStats: Map[String, Any] = Map.empty

  /** Pipeline settings shown in the summary file */
  def summarySettings: Map[String, Any] = Map(
    "aligner" -> aligner,
    "expression_measures" -> expMeasures.toList.map(_.toString),
    "strand_protocol" -> strandProtocol.toString,
    "call_variants" -> callVariants,
    "remove_ribosomal_reads" -> removeRibosomalReads,
    "version" -> FullVersion
  )
bow's avatar
bow committed
296

297
  /** Job for writing the PDF report's LaTeX template */
  protected lazy val pdfTemplateJob: PdfReportTemplateWriter = {
    val writer = new PdfReportTemplateWriter(qscript)
    writer.summaryFile = summaryFile
    writer.output = new File(outputDir, "gentrap_report.tex")
    writer
  }

  /** Job for compiling the LaTeX template into the PDF report */
  protected def pdfReportJob: Pdflatex = {
    val latex = new Pdflatex(qscript)
    latex.input = pdfTemplateJob.output
    latex.outputDir = new File(outputDir, "report")
    latex.name = "gentrap_report"
    latex
  }

314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
  /**
   * General function to create a CollectRnaSeqMetrics job, for per-sample and per-library runs.
   *
   * @param alnFile    input alignment file (assumed sorted)
   * @param outMetrics metrics output file
   * @param outChart   optional chart output file
   */
  protected def makeCollectRnaSeqMetricsJob(alnFile: File, outMetrics: File,
                                            outChart: Option[File] = None): CollectRnaSeqMetrics = {
    val metricsJob = new CollectRnaSeqMetrics(qscript)
    metricsJob.input = alnFile
    metricsJob.output = outMetrics
    metricsJob.refFlat = annotationRefFlat
    metricsJob.chartOutput = outChart
    metricsJob.assumeSorted = true
    // map the pipeline's strand protocol onto Picard's strand-specificity values
    metricsJob.strandSpecificity = strandProtocol match {
      case NonSpecific => Option(StrandSpecificity.NONE.toString)
      case Dutp        => Option(StrandSpecificity.SECOND_READ_TRANSCRIPTION_STRAND.toString)
      case _           => throw new IllegalStateException
    }
    metricsJob.ribosomalIntervals = ribosomalRefFlat
    metricsJob
  }

332
333
  // Ensures the required .dict file is present before the run starts. The path cannot be
  // stored in config because the tools that use it (Picard) derive it from the reference
  // file name. NOTE(review): referenceDict itself already requires the file to exist, so
  // this re-check is redundant but harmless.
  protected def checkDictFile(): Unit =
    require(referenceDict.exists, s"Dict file '$referenceDict' must exist")
336

bow's avatar
bow committed
337
338
  /** Steps to run before biopetScript: sanity checks on the required input files */
  def init(): Unit = {
    checkDictFile()

    // TODO: validate that exons are flattened or not (depending on another option flag?)
    // validate required annotation files
    if (expMeasures.contains(FragmentsPerGene))
      require(annotationGtf.isDefined, "GTF file must be defined for counting fragments per gene")

    // TODO: validate that GTF file contains exon features
    if (expMeasures.contains(FragmentsPerExon))
      require(annotationGtf.isDefined, "GTF file must be defined for counting fragments per exon")

    if (expMeasures.contains(BasesPerGene))
      require(annotationBed.isDefined, "BED file must be defined for counting bases per gene")

    if (expMeasures.contains(BasesPerExon))
      require(annotationBed.isDefined, "BED file must be defined for counting bases per exon")

    // all three Cufflinks modes need a GTF file
    val cufflinksModes: Set[ExpMeasures.Value] = Set(CufflinksBlind, CufflinksGuided, CufflinksStrict)
    if (expMeasures.exists(cufflinksModes.contains))
      require(annotationGtf.isDefined, "GTF file must be defined for Cufflinks-based modes")

    if (removeRibosomalReads)
      require(ribosomalRefFlat.isDefined, "rRNA intervals must be supplied if removeRibosomalReads is set")
  }
362

bow's avatar
bow committed
363
  /** Pipeline run for each sample; delegates all per-sample work to the Sample objects */
  def biopetScript(): Unit = {
    addSamplesJobs()
  }
367

bow's avatar
bow committed
368
  /** Pipeline run for multiple samples: merge tables, plot heatmaps, write the report */
  def addMultiSampleJobs(): Unit = {
    // merge expression tables (entries are None for inactive expression measures)
    mergeTableJobs.values.flatten.foreach(job => add(job))
    // add heatmap jobs (entries are None when no merged table exists or there are too few samples)
    heatmapJobs.values.flatten.foreach(job => add(job))
    // NOTE: removed a no-op `if (samples.size > 1) { geneFragmentsCountJob }` block here; it
    // only referenced the lazy val (already forced via mergeTableJobs above) and discarded
    // the resulting Option, so it never scheduled any job.
    // TODO: use proper notation
    addSummaryJobs
    add(pdfTemplateJob)
    add(pdfReportJob)
  }
383

bow's avatar
bow committed
384
  /** Returns a fresh [[Sample]] object for the given sample ID */
  def makeSample(sampleId: String): Sample = new Sample(sampleId)

bow's avatar
bow committed
387
388
389
390
391
  /**
   * Gentrap sample
   *
   * @param sampleId Unique identifier of the sample
   */
392
393
394
395
  class Sample(sampleId: String) extends AbstractSample(sampleId) with CufflinksProducer {

    /** Shortcut to the enclosing qscript object */
    protected def pipeline: Gentrap = qscript

    /** Sample output directory */
    override def sampleDir: File = new File(outputDir, s"sample_$sampleId")

400
    /** Summary stats of the sample (the allPaired/allSingle flags) */
    def summaryStats: Map[String, Any] = Map(
      "all_paired" -> allPaired,
      "all_single" -> allSingle
    )
405
406
407

    /** Summary files of the sample: the alignment plus any output that is actually produced */
    def summaryFiles: Map[String, File] = {
      val base = Map("alignment" -> alnFile)
      // optional outputs; only present keys are kept
      val optional = Map(
        "gene_fragments_count" -> geneFragmentsCount,
        "exon_fragments_count" -> exonFragmentsCount,
        "gene_bases_count" -> geneBasesCount,
        "exon_bases_count" -> exonBasesCount,
        "gene_fpkm_cufflinks_strict" -> cufflinksStrictJobSet.map(_.geneFpkmJob.output),
        "isoform_fpkm_cufflinks_strict" -> cufflinksStrictJobSet.map(_.isoformFpkmJob.output),
        "gene_fpkm_cufflinks_guided" -> cufflinksGuidedJobSet.map(_.geneFpkmJob.output),
        "isoform_fpkm_cufflinks_guided" -> cufflinksGuidedJobSet.map(_.isoformFpkmJob.output),
        "gene_fpkm_cufflinks_blind" -> cufflinksBlindJobSet.map(_.geneFpkmJob.output),
        "isoform_fpkm_cufflinks_blind" -> cufflinksBlindJobSet.map(_.isoformFpkmJob.output),
        "variant_calls" -> variantCalls
      ).collect { case (key, Some(value)) => key -> value }
      base ++ optional
    }

bow's avatar
bow committed
423
424
425
426
427
    /** Per-sample alignment file, before rRNA cleanup (if that step was chosen) */
    lazy val alnFileDirty: File = sampleAlnJob.output

    /** Per-sample alignment file, after rRNA cleanup when enabled, otherwise the dirty file */
    lazy val alnFile: File = wipeJob.map(_.outputBam).getOrElse(alnFileDirty)
bow's avatar
bow committed
431

432
    /** Read count per gene file, defined when fragments-per-gene counting is active */
    def geneFragmentsCount: Option[File] = fragmentsPerGeneJob.map(_.output)

    /** Read count per exon file, defined when fragments-per-exon counting is active */
    def exonFragmentsCount: Option[File] = fragmentsPerExonJob.map(_.output)

    /** Base count per gene file, defined when bases-per-gene counting is active */
    def geneBasesCount: Option[File] = basesPerGeneJob.map(_.output)

    /** Base count per exon file, defined when bases-per-exon counting is active */
    def exonBasesCount: Option[File] = basesPerExonJob.map(_.output)

448
449
450
451
452
    /** JobSet for Cufflinks strict mode */
    protected lazy val cufflinksStrictJobSet: Option[CufflinksJobSet] =
      expMeasures.find(_ == CufflinksStrict).map(new CufflinksJobSet(_))

    /** Gene tracking file from Cufflinks strict mode */
    def geneFpkmCufflinksStrict: Option[File] = cufflinksStrictJobSet.map(_.geneFpkmJob.output)

    /** Isoforms tracking file from Cufflinks strict mode */
    def isoformFpkmCufflinksStrict: Option[File] = cufflinksStrictJobSet.map(_.isoformFpkmJob.output)

    /** JobSet for Cufflinks guided mode */
    protected lazy val cufflinksGuidedJobSet: Option[CufflinksJobSet] =
      expMeasures.find(_ == CufflinksGuided).map(new CufflinksJobSet(_))

    /** Gene tracking file from Cufflinks guided mode */
    def geneFpkmCufflinksGuided: Option[File] = cufflinksGuidedJobSet.map(_.geneFpkmJob.output)

    /** Isoforms tracking file from Cufflinks guided mode */
    def isoformFpkmCufflinksGuided: Option[File] = cufflinksGuidedJobSet.map(_.isoformFpkmJob.output)

    /** JobSet for Cufflinks blind mode */
    protected lazy val cufflinksBlindJobSet: Option[CufflinksJobSet] =
      expMeasures.find(_ == CufflinksBlind).map(new CufflinksJobSet(_))

    /** Gene tracking file from Cufflinks blind mode */
    def geneFpkmCufflinksBlind: Option[File] = cufflinksBlindJobSet.map(_.geneFpkmJob.output)

    /** Isoforms tracking file from Cufflinks blind mode */
    def isoformFpkmCufflinksBlind: Option[File] = cufflinksBlindJobSet.map(_.isoformFpkmJob.output)
486

487
488
489
490
    /** Raw variant calling file, defined when variant calling is active */
    def variantCalls: Option[File] = varCallJob.map(_.output)

    /** ID-sorting job required by the HTSeq-count jobs, defined when any fragment counting is active */
    private def idSortingJob: Option[SortSam] = (expMeasures.contains(FragmentsPerExon) || expMeasures.contains(FragmentsPerGene))
      .option {
        val sorter = new SortSam(qscript)
        sorter.input = alnFile
        sorter.output = createFile(".idsorted.bam")
        // HTSeq-count consumes the alignment in read-ID (queryname) order
        sorter.sortOrder = "queryname"
        sorter.isIntermediate = true
        sorter
      }
501
502

    /** Read counting job per gene (HTSeq-count), defined when FragmentsPerGene is active */
    private def fragmentsPerGeneJob: Option[HtseqCount] = expMeasures
      .contains(FragmentsPerGene)
      .option {
        // init() guarantees annotationGtf; the sorting job exists whenever counting is active
        require(idSortingJob.nonEmpty)
        val counter = new HtseqCount(qscript)
        counter.inputAnnotation = annotationGtf.get
        counter.inputAlignment = idSortingJob.get.output
        counter.output = createFile(".fragments_per_gene")
        counter.format = Option("bam")
        // We are forcing the sort order to be ID-sorted, since HTSeq-count often chokes when using
        // position-sorting due to its buffer not being large enough.
        counter.order = Option("name")
        counter.stranded = strandProtocol match {
          case NonSpecific => Option("no")
          case Dutp        => Option("reverse")
          case _           => throw new IllegalStateException
        }
        counter
      }
bow's avatar
bow committed
522

523
    /** Read counting job per exon (HTSeq-count), defined when FragmentsPerExon is active */
    private def fragmentsPerExonJob: Option[HtseqCount] = expMeasures
      .contains(FragmentsPerExon)
      .option {
        // init() guarantees annotationGtf; the sorting job exists whenever counting is active
        require(idSortingJob.nonEmpty)
        val counter = new HtseqCount(qscript)
        counter.inputAnnotation = annotationGtf.get
        counter.inputAlignment = idSortingJob.get.output
        counter.output = createFile(".fragments_per_exon")
        counter.format = Option("bam")
        // ID-sorted input; HTSeq-count often chokes on position-sorted input
        counter.order = Option("name")
        counter.stranded = strandProtocol match {
          case NonSpecific => Option("no")
          case Dutp        => Option("reverse")
          case _           => throw new IllegalStateException
        }
        // TODO: ensure that the "exon_id" attributes exist for all exons in the GTF
        counter.idattr = Option("exon_id")
        counter
      }
bow's avatar
bow committed
543

bow's avatar
bow committed
544
545
    /**
     * Container for strand-separation jobs: one or two filtering jobs plus the job that
     * combines their outputs into a single per-strand file.
     * NOTE(review): combineJob uses a structural type — presumably so it can hold different
     * job types that all expose an `output` file; confirm against makeCombineJob.
     */
    private case class StrandSeparationJobSet(pair1Job: SamtoolsView, pair2Job: Option[SamtoolsView],
                                              combineJob: QFunction { def output: File }) {
      /** Adds every job in this set to the pipeline */
      def addAllJobs(): Unit = {
        add(pair1Job)
        pair2Job.foreach(add(_))
        add(combineJob)
      }
    }

    /** Alignment file of reads from the plus strand, only defined when run is strand-specific */
    def alnFilePlusStrand: Option[File] = alnPlusStrandJobs.map(_.combineJob.output)
bow's avatar
bow committed
555

bow's avatar
bow committed
556
    /** Jobs for generating reads from the plus strand, only defined when run is strand-specific */
    private def alnPlusStrandJobs: Option[StrandSeparationJobSet] = strandProtocol match {
      case Dutp =>
        // Paired-end runs get an extra filtering job; SAM flag filters 0x40/0x10 select the
        // mate on the opposite strand. NOTE(review): the variable is named r2Job but 0x40 is
        // the first-of-pair flag in the SAM spec — verify the naming/flag combination.
        val r2Job = this.allPaired
          .option {
            val view = new SamtoolsView(qscript)
            view.input = alnFile
            view.b = true
            view.h = true
            view.f = List("0x40")
            view.F = List("0x10")
            view.output = createFile(".r2.bam")
            view.isIntermediate = true
            view
          }

        val f1Job = new SamtoolsView(qscript)
        f1Job.input = alnFile
        f1Job.b = true
        f1Job.h = true
        f1Job.f = if (this.allSingle) List("0x10") else List("0x50")
        f1Job.output = createFile(".f1.bam")
        // since we are symlinking if the other pair does not exist,
        // we want to keep this job as non-intermediate as well
        f1Job.isIntermediate = r2Job.isDefined

        val perStrandFiles = f1Job.output :: r2Job.map(_.output).toList
        val combineJob = makeCombineJob(perStrandFiles, createFile(".plus_strand.bam"))

        Option(StrandSeparationJobSet(f1Job, r2Job, combineJob))

      case NonSpecific => None
      case _           => throw new IllegalStateException
    }

bow's avatar
bow committed
594
    /** Alignment file of reads from the minus strand, only defined when run is strand-specific */
    // `.map` instead of `.collect { case x => ... }`: the pattern was irrefutable, so this is
    // the idiomatic, equivalent form
    def alnFileMinusStrand: Option[File] = alnMinusStrandJobs.map(_.combineJob.output)
bow's avatar
bow committed
597

bow's avatar
bow committed
598
    /** Jobs for generating reads from the minus strand, only defined when run is strand-specific */
    private def alnMinusStrandJobs: Option[StrandSeparationJobSet] = strandProtocol match {
      case Dutp =>
        // R1 reads mapping forward (0x80 set, 0x10 unset) belong to the minus strand in the
        // dUTP protocol; this job only exists when every library is paired-end
        val r1Job = this.allPaired
          .option {
            val job = new SamtoolsView(qscript)
            job.input = alnFile
            job.b = true
            job.h = true
            job.f = List("0x80")
            job.F = List("0x10")
            job.output = createFile(".r1.bam")
            job.isIntermediate = true
            job
          }

        // second-in-pair (or single-end reverse) reads contributing to the minus strand
        val f2Job = new SamtoolsView(qscript)
        f2Job.input = alnFile
        f2Job.b = true
        f2Job.h = true
        f2Job.output = createFile(".f2.bam")
        // since we are symlinking if the other pair does not exist,
        // we want to keep this job as non-intermediate as well
        f2Job.isIntermediate = r1Job.isDefined
        if (this.allSingle) f2Job.F = List("0x10")
        else f2Job.f = List("0x90")

        // f2 output always present; r1 output only when the r1 job exists
        val perStrandFiles = f2Job.output :: r1Job.map(_.output).toList
        val combineJob = makeCombineJob(perStrandFiles, createFile(".minus_strand.bam"))

        Option(StrandSeparationJobSet(f2Job, r1Job, combineJob))

      case NonSpecific => None
      case _           => throw new IllegalStateException
    }
    /** Raw base counting job, only defined when a per-base expression measure is requested */
    private def rawBaseCountJob: Option[RawBaseCounter] = {
      // hoisted: the same predicate was duplicated in both protocol branches
      val baseCountNeeded = expMeasures.contains(BasesPerExon) || expMeasures.contains(BasesPerGene)
      strandProtocol match {
        case NonSpecific =>
          baseCountNeeded.option {
            // guard the .get below with an explicit message instead of a bare NoSuchElementException
            require(annotationBed.isDefined, "Raw base counting requires an annotation BED file")
            val job = new RawBaseCounter(qscript)
            job.inputBoth = alnFile
            job.annotationBed = annotationBed.get
            job.output = createFile(".raw_base_count")
            job
          }
        case Dutp =>
          baseCountNeeded.option {
            require(alnFilePlusStrand.isDefined && alnFileMinusStrand.isDefined)
            require(annotationBed.isDefined, "Raw base counting requires an annotation BED file")
            val job = new RawBaseCounter(qscript)
            job.inputPlus = alnFilePlusStrand.get
            job.inputMinus = alnFileMinusStrand.get
            job.annotationBed = annotationBed.get
            job.output = createFile(".raw_base_count")
            job
          }
        case _ => throw new IllegalStateException
      }
    }
bow's avatar
bow committed
661

662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
    /** Base counting job per gene */
    private def basesPerGeneJob: Option[AggrBaseCount] =
      expMeasures.contains(BasesPerGene).option {
        require(rawBaseCountJob.nonEmpty)
        val aggrJob = new AggrBaseCount(qscript)
        aggrJob.input = rawBaseCountJob.get.output
        aggrJob.output = createFile(".bases_per_gene")
        aggrJob.inputLabel = sampleId
        aggrJob.mode = "gene"
        aggrJob
      }

    /** Base counting job per exon */
    private def basesPerExonJob: Option[AggrBaseCount] =
      expMeasures.contains(BasesPerExon).option {
        require(rawBaseCountJob.nonEmpty)
        val aggrJob = new AggrBaseCount(qscript)
        aggrJob.input = rawBaseCountJob.get.output
        aggrJob.output = createFile(".bases_per_exon")
        aggrJob.inputLabel = sampleId
        aggrJob.mode = "exon"
        aggrJob
      }

688
689
690
691
692
693
694
695
696
    /** Variant calling job, only defined when variant calling was requested */
    private def varCallJob: Option[CustomVarScan] =
      callVariants.option {
        val varscan = new CustomVarScan(qscript)
        varscan.input = alnFile
        varscan.output = createFile(".raw.vcf.gz")
        varscan
      }

bow's avatar
bow committed
697
698
699
700
701
702
703
704
705
706
    /** General metrics module, only defined when the sample has more than one library */
    private lazy val bamMetricsModule: Option[BamMetrics] =
      (libraries.size > 1).option {
        val metrics = new BamMetrics(qscript)
        metrics.inputBam = alnFile
        metrics.outputDir = new File(sampleDir, "metrics")
        metrics.sampleId = Option(sampleId)
        metrics
      }

707
708
709
710
711
    /** Picard CollectRnaSeqMetrics job, only defined when the sample has more than one library */
    private lazy val collectRnaSeqMetricsJob: Option[CollectRnaSeqMetrics] =
      (libraries.size > 1).option {
        makeCollectRnaSeqMetricsJob(alnFileDirty, createFile(".rna_metrics"), Option(createFile(".coverage_bias.pdf")))
      }
bow's avatar
bow committed
712

bow's avatar
bow committed
713
    /** Job for removing ribosomal reads, only defined when ribosomal read removal was requested */
    private def wipeJob: Option[WipeReads] =
      removeRibosomalReads.option {
        // a ribosomal interval file is mandatory for this step
        require(ribosomalRefFlat.isDefined)
        val wiper = new WipeReads(qscript)
        wiper.inputBam = alnFileDirty
        wiper.intervalFile = ribosomalRefFlat.get
        wiper.outputBam = createFile(".cleaned.bam")
        wiper.discardedBam = createFile(".rrna.bam")
        wiper
      }

bow's avatar
bow committed
725
    /** Super type of Ln and MergeSamFiles */
    // NOTE: structural type — calls to `output` go through reflection (requires
    // scala.language.reflectiveCalls, already imported at the top of this file)
    private type CombineFileFunction = QFunction { def output: File }

    /**
     * Ln or MergeSamFiles job, depending on how many inputs are supplied.
     *
     * @param inFiles non-empty list of BAM files to combine
     * @param outFile path of the combined output file
     * @param mergeSortOrder sort order passed to MergeSamFiles (only used when merging)
     */
    private def makeCombineJob(inFiles: List[File], outFile: File,
                               mergeSortOrder: String = "coordinate"): CombineFileFunction = {
      // fixed grammar of the failure message ("At least one input files for combine job")
      require(inFiles.nonEmpty, "At least one input file is required for the combine job")
      if (inFiles.size == 1) {
        // single input: symlink instead of merging
        val job = new Ln(qscript)
        job.input = inFiles.head
        job.output = outFile
        job
      } else {
        // multiple inputs: merge into one sorted BAM
        val job = new MergeSamFiles(qscript)
        job.input = inFiles
        job.output = outFile
        job.sortOrder = mergeSortOrder
        job
      }
    }

    /** Job for combining all library BAMs */
    private def sampleAlnJob: CombineFileFunction = {
      val libraryBams = libraries.values.map(_.alnFile).toList
      makeCombineJob(libraryBams, createFile(".bam"))
    }
749

750
751
752
753
754
755
    /** Whether all libraries are paired or not */
    def allPaired: Boolean = libraries.values.forall(_.paired)

    /** Whether all libraries are single-end or not (i.e. no library is paired) */
    def allSingle: Boolean = !libraries.values.exists(_.paired)

756
    // TODO: add warnings or other messages for config values that are hard-coded by the pipeline
    /** Adds all jobs for the sample */
    def addJobs(): Unit = {
      // TODO: this is our requirement since it's easier to calculate base counts when all libraries are either paired or single
      // message fixed: the old text described the *valid* state, not the failure — it fired
      // exactly when the sample mixed single-end and paired-end libraries
      require(allPaired || allSingle, s"Sample $sampleId contains a mix of single-end and paired-end libraries")
      // add per-library jobs
      addPerLibJobs()
      // merge or symlink per-library alignments
      add(sampleAlnJob)
      // general RNA-seq metrics, if there are > 1 library
      collectRnaSeqMetricsJob.foreach { j =>
        add(j)
        addSummarizable(j, "rna_metrics")
      }
      bamMetricsModule.foreach { m =>
        m.init()
        m.biopetScript()
        addAll(m.functions)
        addSummaryQScript(m)
      }
      // add bigwig output, also per-strand when possible
      addAll(Bam2Wig(qscript, alnFile).functions)
      alnFilePlusStrand.foreach(f => addAll(Bam2Wig(qscript, f).functions))
      alnFileMinusStrand.foreach(f => addAll(Bam2Wig(qscript, f).functions))
      // add strand-specific jobs if defined
      alnPlusStrandJobs.foreach(_.addAllJobs())
      alnMinusStrandJobs.foreach(_.addAllJobs())
      // add htseq-count jobs, if defined
      idSortingJob.foreach(add(_))
      fragmentsPerGeneJob.foreach(add(_))
      fragmentsPerExonJob.foreach(add(_))
      // add custom base count jobs, if defined
      rawBaseCountJob.foreach(add(_))
      basesPerGeneJob.foreach(add(_))
      basesPerExonJob.foreach(add(_))
      // symlink results with distinct extensions ~ actually to make it easier to use MergeTables on these as well
      // since the Queue argument parser doesn't play nice with Map[_, _] types
      cufflinksStrictJobSet.foreach(_.jobs.foreach(add(_)))
      cufflinksGuidedJobSet.foreach(_.jobs.foreach(add(_)))
      cufflinksBlindJobSet.foreach(_.jobs.foreach(add(_)))
      // add variant calling job if requested
      varCallJob.foreach(add(_))
    }
bow's avatar
bow committed
802

bow's avatar
bow committed
803
    /** Returns a [[Library]] object */
    def makeLibrary(libId: String): Library = new Library(libId)
bow's avatar
bow committed
805

bow's avatar
bow committed
806
807
808
809
810
    /**
     * Gentrap library
     *
     * @param libId Unique identifier of the library
     */
    class Library(libId: String) extends AbstractLibrary(libId) {

      /** Summary stats of the library */
      def summaryStats: Map[String, Any] = Map()

      /** Summary files of the library */
      def summaryFiles: Map[String, File] = Map(
        "alignment" -> mappingJob.outputFiles("finalBamFile")
      )

      /** Convenience method to check whether the library is paired or not */
      def paired: Boolean = config.contains("R2")

      /** Alignment results of this library ~ can only be accessed after addJobs is run! */
      def alnFile: File = mappingJob.outputFiles("finalBamFile")

      /** Library-level RNA-seq metrics job, only when we have more than 1 library in the sample */
      // lazy val (was def): ensures the job added to the graph in addJobs() is the very same
      // instance that gets registered as summarizable, instead of two distinct instances
      lazy val collectRnaSeqMetricsJob: CollectRnaSeqMetrics =
        makeCollectRnaSeqMetricsJob(alnFile, createFile(".rna_metrics"), Option(createFile(".coverage_bias.pdf")))

      /** Wiggle track job */
      private lazy val bam2wigModule: Bam2Wig = Bam2Wig(qscript, alnFile)

      /** Per-library mapping job */
      // lazy val (was def): a def built and scripted a fresh Mapping pipeline on every access,
      // so the functions added in addJobs() and the QScript registered for the summary belonged
      // to different instances; uniform access keeps callers source-compatible
      lazy val mappingJob: Mapping = {
        val job = new Mapping(qscript)
        job.sampleId = Option(sampleId)
        job.libId = Option(libId)
        job.outputDir = libDir
        job.input_R1 = config("R1")
        job.input_R2 = config("R2")
        job.init()
        job.biopetScript()
        job
      }

      /** Adds all jobs for the library */
      def addJobs(): Unit = {
        // create per-library alignment file
        addAll(mappingJob.functions)
        // add bigwig track
        addAll(bam2wigModule.functions)
        // create RNA metrics job
        add(collectRnaSeqMetricsJob)
        addSummarizable(collectRnaSeqMetricsJob, "rna_metrics")
        qscript.addSummaryQScript(mappingJob)
      }
    }
bow's avatar
bow committed
860
  }
bow's avatar
bow committed
861
}
bow's avatar
bow committed
862

bow's avatar
bow committed
863
object Gentrap extends PipelineCommand {

  /** Enumeration of available expression measures */
  object ExpMeasures extends Enumeration {
    val FragmentsPerGene, FragmentsPerExon, BasesPerGene, BasesPerExon, CufflinksStrict, CufflinksGuided, CufflinksBlind = Value
    //Cuffquant,
    //Rsem = Value
  }

  /** Enumeration of available strandedness */
  object StrandProtocol extends Enumeration {
    // for now, only non-strand specific and dUTP stranded protocol is supported
    // TODO: other strandedness protocol?
    val NonSpecific, Dutp = Value
  }

  /** Converts string with underscores into camel-case strings */
  private def camelize(ustring: String): String = ustring
    .split("_")
    .map(_.toLowerCase.capitalize)
    .mkString("")

  /**
   * Conversion from raw user-supplied expression measure string to enum value.
   *
   * @throws IllegalArgumentException if the string does not name a known measure
   */
  private def makeExpMeasure(rawName: String): ExpMeasures.Value =
    try {
      ExpMeasures.withName(camelize(rawName))
    } catch {
      // chain the cause instead of dropping it; the redundant
      // `case e: Exception => throw e` rethrow clause was removed (a no-op)
      case nse: NoSuchElementException =>
        throw new IllegalArgumentException("Invalid expression measure: " + rawName, nse)
    }

  /**
   * Conversion from raw user-supplied strand protocol string to enum value.
   *
   * @throws IllegalArgumentException if the string does not name a known protocol
   */
  private def makeStrandProtocol(rawName: String): StrandProtocol.Value =
    try {
      StrandProtocol.withName(camelize(rawName))
    } catch {
      case nse: NoSuchElementException =>
        throw new IllegalArgumentException("Invalid strand protocol: " + rawName, nse)
    }
}