Flexiprep.scala 12 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
16
17
package nl.lumc.sasc.biopet.pipelines.flexiprep

Peter van 't Hof's avatar
Peter van 't Hof committed
18
import nl.lumc.sasc.biopet.utils.config.Configurable
Peter van 't Hof's avatar
Peter van 't Hof committed
19
20
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.core.{ PipelineCommand, SampleLibraryTag }
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import nl.lumc.sasc.biopet.extensions._
Peter van 't Hof's avatar
Peter van 't Hof committed
22
23
import nl.lumc.sasc.biopet.tools.{ FastqSync, SeqStat }
import org.broadinstitute.gatk.queue.QScript
24

25
class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with SampleLibraryTag {
26
27
28
29
30
31
  def this() = this(null)

  @Input(doc = "R1 fastq file (gzipped allowed)", shortName = "R1", required = true)
  var input_R1: File = _

  @Input(doc = "R2 fastq file (gzipped allowed)", shortName = "R2", required = false)
32
  var input_R2: Option[File] = None
33

34
35
  /** Skip Trim fastq files */
  var skipTrim: Boolean = config("skip_trim", default = false)
36

37
38
  /** Skip Clip fastq files */
  var skipClip: Boolean = config("skip_clip", default = false)
39

40
41
42
  /** Make a final fastq files, by default only when flexiprep is the main pipeline */
  var keepQcFastqFiles: Boolean = config("keepQcFastqFiles", default = root == null)

Peter van 't Hof's avatar
Peter van 't Hof committed
43
  /** Location of summary file */
44
  def summaryFile = new File(outputDir, sampleId.getOrElse("x") + "-" + libId.getOrElse("x") + ".qc.summary.json")
45

Peter van 't Hof's avatar
Peter van 't Hof committed
46
  /** Returns files to store in summary */
Peter van 't Hof's avatar
Peter van 't Hof committed
47
  def summaryFiles: Map[String, File] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
48
49
50
51
52
    if (!skipTrim || !skipClip)
      Map("input_R1" -> input_R1, "output_R1" -> outputFiles("output_R1_gzip")) ++
        (if (paired) Map("input_R2" -> input_R2.get, "output_R2" -> outputFiles("output_R2_gzip")) else Map())
    else Map("input_R1" -> input_R1) ++
      (if (paired) Map("input_R2" -> input_R2.get) else Map())
Peter van 't Hof's avatar
Peter van 't Hof committed
53
  }
54

Peter van 't Hof's avatar
Peter van 't Hof committed
55
  /** returns settings to store in summary */
Peter van 't Hof's avatar
Peter van 't Hof committed
56
  def summarySettings = Map("skip_trim" -> skipTrim, "skip_clip" -> skipClip, "paired" -> paired)
57

58
  var paired: Boolean = input_R2.isDefined
59
60
61
62
63
64
65
66
67
68
  var R1_ext: String = _
  var R2_ext: String = _
  var R1_name: String = _
  var R2_name: String = _

  var fastqc_R1: Fastqc = _
  var fastqc_R2: Fastqc = _
  var fastqc_R1_after: Fastqc = _
  var fastqc_R2_after: Fastqc = _

Peter van 't Hof's avatar
Peter van 't Hof committed
69
70
71
72
73
74
75
76
77
78
  override def reportClass = {
    val flexiprepReport = new FlexiprepReport(this)
    flexiprepReport.outputDir = new File(outputDir, "report")
    flexiprepReport.summaryFile = summaryFile
    flexiprepReport.args = Map(
      "sampleId" -> sampleId.getOrElse("."),
      "libId" -> libId.getOrElse("."))
    Some(flexiprepReport)
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
79
  /** Function that's need to be executed before the script is accessed */
80
  def init() {
81
    require(outputDir != null, "Missing output directory on flexiprep module")
82
    require(input_R1 != null, "Missing input R1 on flexiprep module")
Peter van 't Hof's avatar
Peter van 't Hof committed
83
84
    require(sampleId != null, "Missing sample ID on flexiprep module")
    require(libId != null, "Missing library ID on flexiprep module")
85

86
    paired = input_R2.isDefined
87
88
89
90

    if (input_R1.endsWith(".gz")) R1_name = input_R1.getName.substring(0, input_R1.getName.lastIndexOf(".gz"))
    else if (input_R1.endsWith(".gzip")) R1_name = input_R1.getName.substring(0, input_R1.getName.lastIndexOf(".gzip"))
    else R1_name = input_R1.getName
Peter van 't Hof's avatar
Peter van 't Hof committed
91
    R1_ext = R1_name.substring(R1_name.lastIndexOf("."), R1_name.length)
92
93
    R1_name = R1_name.substring(0, R1_name.lastIndexOf(R1_ext))

Peter van 't Hof's avatar
Peter van 't Hof committed
94
    input_R2 match {
Peter van 't Hof's avatar
Peter van 't Hof committed
95
      case Some(fileR2) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
96
97
98
99
        paired = true
        if (fileR2.endsWith(".gz")) R2_name = fileR2.getName.substring(0, fileR2.getName.lastIndexOf(".gz"))
        else if (fileR2.endsWith(".gzip")) R2_name = fileR2.getName.substring(0, fileR2.getName.lastIndexOf(".gzip"))
        else R2_name = fileR2.getName
Peter van 't Hof's avatar
Peter van 't Hof committed
100
        R2_ext = R2_name.substring(R2_name.lastIndexOf("."), R2_name.length)
Peter van 't Hof's avatar
Peter van 't Hof committed
101
102
        R2_name = R2_name.substring(0, R2_name.lastIndexOf(R2_ext))
      case _ =>
103
104
105
    }
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
106
  /** Script to add jobs */
107
108
109
  def biopetScript() {
    runInitialJobs()

Peter van 't Hof's avatar
Peter van 't Hof committed
110
    val out = if (paired) runTrimClip(outputFiles("fastq_input_R1"), Some(outputFiles("fastq_input_R2")), outputDir)
111
112
113
114
115
116
117
    else runTrimClip(outputFiles("fastq_input_R1"), outputDir)

    val R1_files = for ((k, v) <- outputFiles if k.endsWith("output_R1")) yield v
    val R2_files = for ((k, v) <- outputFiles if k.endsWith("output_R2")) yield v
    runFinalize(R1_files.toList, R2_files.toList)
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
118
  /** Add init non chunkable jobs */
119
120
  def runInitialJobs() {
    outputFiles += ("fastq_input_R1" -> extractIfNeeded(input_R1, outputDir))
Peter van 't Hof's avatar
Peter van 't Hof committed
121
    if (paired) outputFiles += ("fastq_input_R2" -> extractIfNeeded(input_R2.get, outputDir))
122

Peter van 't Hof's avatar
Peter van 't Hof committed
123
    fastqc_R1 = Fastqc(this, input_R1, new File(outputDir, R1_name + ".fastqc/"))
124
    add(fastqc_R1)
125
    addSummarizable(fastqc_R1, "fastqc_R1")
126
127
128
    outputFiles += ("fastqc_R1" -> fastqc_R1.output)

    if (paired) {
Peter van 't Hof's avatar
Peter van 't Hof committed
129
      fastqc_R2 = Fastqc(this, input_R2.get, new File(outputDir, R2_name + ".fastqc/"))
130
      add(fastqc_R2)
131
      addSummarizable(fastqc_R2, "fastqc_R2")
132
133
134
135
      outputFiles += ("fastqc_R2" -> fastqc_R2.output)
    }
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
136
137
  //TODO: Refactor need to combine all this functions

Peter van 't Hof's avatar
Peter van 't Hof committed
138
  /** Adds all chunkable jobs of flexiprep */
Peter van 't Hof's avatar
Peter van 't Hof committed
139
140
  def runTrimClip(R1_in: File, outDir: File, chunk: String): (File, Option[File], List[File]) =
    runTrimClip(R1_in, None, outDir, chunk)
Peter van 't Hof's avatar
Peter van 't Hof committed
141

Peter van 't Hof's avatar
Peter van 't Hof committed
142
  /** Adds all chunkable jobs of flexiprep */
Peter van 't Hof's avatar
Peter van 't Hof committed
143
144
  def runTrimClip(R1_in: File, outDir: File): (File, Option[File], List[File]) =
    runTrimClip(R1_in, None, outDir, "")
Peter van 't Hof's avatar
Peter van 't Hof committed
145

Peter van 't Hof's avatar
Peter van 't Hof committed
146
  /** Adds all chunkable jobs of flexiprep */
Peter van 't Hof's avatar
Peter van 't Hof committed
147
  def runTrimClip(R1_in: File, R2_in: Option[File], outDir: File): (File, Option[File], List[File]) =
148
    runTrimClip(R1_in, R2_in, outDir, "")
Peter van 't Hof's avatar
Peter van 't Hof committed
149

Peter van 't Hof's avatar
Peter van 't Hof committed
150
  /** Adds all chunkable jobs of flexiprep */
Peter van 't Hof's avatar
Peter van 't Hof committed
151
  def runTrimClip(R1_in: File, R2_in: Option[File], outDir: File, chunkarg: String): (File, Option[File], List[File]) = {
152
153
154
    val chunk = if (chunkarg.isEmpty || chunkarg.endsWith("_")) chunkarg else chunkarg + "_"
    var results: Map[String, File] = Map()

Peter van 't Hof's avatar
Peter van 't Hof committed
155
156
157
158
    var R1 = R1_in
    var R2 = R2_in
    var deps_R1 = R1 :: Nil
    var deps_R2 = if (paired) R2.get :: Nil else Nil
159
    def deps: List[File] = deps_R1 ::: deps_R2
160

161
    val seqtkSeq_R1 = SeqtkSeq(this, R1, swapExt(outDir, R1, R1_ext, ".sanger" + R1_ext), fastqc_R1)
162
163
    seqtkSeq_R1.isIntermediate = true
    add(seqtkSeq_R1)
Peter van 't Hof's avatar
Peter van 't Hof committed
164
    addSummarizable(seqtkSeq_R1, "seqtkSeq_R1")
165
    R1 = seqtkSeq_R1.output
166
    deps_R1 ::= R1
167
168

    if (paired) {
Peter van 't Hof's avatar
Peter van 't Hof committed
169
      val seqtkSeq_R2 = SeqtkSeq(this, R2.get, swapExt(outDir, R2.get, R2_ext, ".sanger" + R2_ext), fastqc_R2)
170
171
      seqtkSeq_R2.isIntermediate = true
      add(seqtkSeq_R2)
Peter van 't Hof's avatar
Peter van 't Hof committed
172
      addSummarizable(seqtkSeq_R2, "seqtkSeq_R2")
Peter van 't Hof's avatar
Peter van 't Hof committed
173
174
      R2 = Some(seqtkSeq_R2.output)
      deps_R2 ::= R2.get
175
176
    }

bow's avatar
bow committed
177
    val seqstat_R1 = SeqStat(this, R1, outDir)
178
    seqstat_R1.isIntermediate = true
179
    seqstat_R1.deps = deps_R1
180
    add(seqstat_R1)
Peter van 't Hof's avatar
Peter van 't Hof committed
181
    addSummarizable(seqstat_R1, "seqstat_R1")
182
183

    if (paired) {
bow's avatar
bow committed
184
      val seqstat_R2 = SeqStat(this, R2.get, outDir)
185
      seqstat_R2.isIntermediate = true
186
      seqstat_R2.deps = deps_R2
187
      add(seqstat_R2)
Peter van 't Hof's avatar
Peter van 't Hof committed
188
      addSummarizable(seqstat_R2, "seqstat_R2")
189
190
191
192
193
    }

    if (!skipClip) { // Adapter clipping

      val cutadapt_R1 = Cutadapt(this, R1, swapExt(outDir, R1, R1_ext, ".clip" + R1_ext))
194
      cutadapt_R1.fastqc = fastqc_R1
195
      cutadapt_R1.isIntermediate = true
196
      cutadapt_R1.deps = deps_R1
197
      add(cutadapt_R1)
Peter van 't Hof's avatar
Peter van 't Hof committed
198
      addSummarizable(cutadapt_R1, "clipping_R1")
199
      R1 = cutadapt_R1.fastq_output
200
      deps_R1 ::= R1
201
202
203
      outputFiles += ("cutadapt_R1_stats" -> cutadapt_R1.stats_output)

      if (paired) {
Peter van 't Hof's avatar
Peter van 't Hof committed
204
        val cutadapt_R2 = Cutadapt(this, R2.get, swapExt(outDir, R2.get, R2_ext, ".clip" + R2_ext))
205
        outputFiles += ("cutadapt_R2_stats" -> cutadapt_R2.stats_output)
206
        cutadapt_R2.fastqc = fastqc_R2
207
        cutadapt_R2.isIntermediate = true
208
        cutadapt_R2.deps = deps_R2
209
        add(cutadapt_R2)
Peter van 't Hof's avatar
Peter van 't Hof committed
210
        addSummarizable(cutadapt_R2, "clipping_R2")
Peter van 't Hof's avatar
Peter van 't Hof committed
211
212
        R2 = Some(cutadapt_R2.fastq_output)
        deps_R2 ::= R2.get
213

bow's avatar
bow committed
214
215
216
217
218
        val fqSync = new FastqSync(this)
        fqSync.refFastq = cutadapt_R1.fastq_input
        fqSync.inputFastq1 = cutadapt_R1.fastq_output
        fqSync.inputFastq2 = cutadapt_R2.fastq_output
        fqSync.outputFastq1 = swapExt(outDir, R1, R1_ext, ".sync" + R1_ext)
Peter van 't Hof's avatar
Peter van 't Hof committed
219
        fqSync.outputFastq2 = swapExt(outDir, R2.get, R2_ext, ".sync" + R2_ext)
bow's avatar
bow committed
220
221
222
        fqSync.outputStats = swapExt(outDir, R1, R1_ext, ".sync.stats")
        fqSync.deps :::= deps
        add(fqSync)
Peter van 't Hof's avatar
Peter van 't Hof committed
223
        addSummarizable(fqSync, "fastq_sync")
bow's avatar
bow committed
224
225
        outputFiles += ("syncStats" -> fqSync.outputStats)
        R1 = fqSync.outputFastq1
Peter van 't Hof's avatar
Peter van 't Hof committed
226
        R2 = Some(fqSync.outputFastq2)
227
        deps_R1 ::= R1
Peter van 't Hof's avatar
Peter van 't Hof committed
228
        deps_R2 ::= R2.get
229
230
231
232
233
234
235
236
      }
    }

    if (!skipTrim) { // Quality trimming
      val sickle = new Sickle(this)
      sickle.input_R1 = R1
      sickle.output_R1 = swapExt(outDir, R1, R1_ext, ".trim" + R1_ext)
      if (paired) {
Peter van 't Hof's avatar
Peter van 't Hof committed
237
238
239
        sickle.input_R2 = R2.get
        sickle.output_R2 = swapExt(outDir, R2.get, R2_ext, ".trim" + R2_ext)
        sickle.output_singles = swapExt(outDir, R2.get, R2_ext, ".trim.singles" + R1_ext)
240
241
242
      }
      sickle.output_stats = swapExt(outDir, R1, R1_ext, ".trim.stats")
      sickle.deps = deps
Peter van 't Hof's avatar
Peter van 't Hof committed
243
244
      sickle.isIntermediate = true
      add(sickle)
Peter van 't Hof's avatar
Peter van 't Hof committed
245
      addSummarizable(sickle, "trimming")
246
      R1 = sickle.output_R1
Peter van 't Hof's avatar
Peter van 't Hof committed
247
      if (paired) R2 = Some(sickle.output_R2)
248
249
    }

bow's avatar
bow committed
250
    val seqstat_R1_after = SeqStat(this, R1, outDir)
251
    seqstat_R1_after.deps = deps_R1
252
    add(seqstat_R1_after)
253
    addSummarizable(seqstat_R1_after, "seqstat_R1_qc")
254
255

    if (paired) {
bow's avatar
bow committed
256
      val seqstat_R2_after = SeqStat(this, R2.get, outDir)
257
      seqstat_R2_after.deps = deps_R2
258
      add(seqstat_R2_after)
259
      addSummarizable(seqstat_R2_after, "seqstat_R2_qc")
260
261
262
    }

    outputFiles += (chunk + "output_R1" -> R1)
Peter van 't Hof's avatar
Peter van 't Hof committed
263
    if (paired) outputFiles += (chunk + "output_R2" -> R2.get)
Peter van 't Hof's avatar
Peter van 't Hof committed
264
    (R1, R2, deps)
265
266
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
267
  /** Adds last non chunkable jobs */
268
269
  def runFinalize(fastq_R1: List[File], fastq_R2: List[File]) {
    if (fastq_R1.length != fastq_R2.length && paired) throw new IllegalStateException("R1 and R2 file number is not the same")
Peter van 't Hof's avatar
Peter van 't Hof committed
270
271
    val R1 = new File(outputDir, R1_name + ".qc" + R1_ext + ".gz")
    val R2 = new File(outputDir, R2_name + ".qc" + R2_ext + ".gz")
272

273
274
275
    if (!skipTrim || !skipClip) {
      add(Gzip(this, fastq_R1, R1), !keepQcFastqFiles)
      if (paired) add(Gzip(this, fastq_R2, R2), !keepQcFastqFiles)
276

277
278
      outputFiles += ("output_R1_gzip" -> R1)
      if (paired) outputFiles += ("output_R2_gzip" -> R2)
279

Peter van 't Hof's avatar
Peter van 't Hof committed
280
      fastqc_R1_after = Fastqc(this, R1, new File(outputDir, R1_name + ".qc.fastqc/"))
281
      add(fastqc_R1_after)
282
      addSummarizable(fastqc_R1_after, "fastqc_R1_qc")
Peter van 't Hof's avatar
Peter van 't Hof committed
283

284
      if (paired) {
Peter van 't Hof's avatar
Peter van 't Hof committed
285
        fastqc_R2_after = Fastqc(this, R2, new File(outputDir, R2_name + ".qc.fastqc/"))
286
        add(fastqc_R2_after)
287
        addSummarizable(fastqc_R2_after, "fastqc_R2_qc")
288
289
290
      }
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
291
    addSummaryJobs()
292
293
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
294
  /** Extracts file if file is compressed */
Peter van 't Hof's avatar
Peter van 't Hof committed
295
  def extractIfNeeded(file: File, runDir: File): File = {
Peter van 't Hof's avatar
Peter van 't Hof committed
296
297
    if (file == null) file
    else if (file.getName.endsWith(".gz") || file.getName.endsWith(".gzip")) {
298
      var newFile: File = swapExt(runDir, file, ".gz", "")
Peter van 't Hof's avatar
Peter van 't Hof committed
299
      if (file.getName.endsWith(".gzip")) newFile = swapExt(runDir, file, ".gzip", "")
300
301
302
      val zcatCommand = Zcat(this, file, newFile)
      zcatCommand.isIntermediate = true
      add(zcatCommand)
Peter van 't Hof's avatar
Peter van 't Hof committed
303
304
      newFile
    } else if (file.getName.endsWith(".bz2")) {
Peter van 't Hof's avatar
Peter van 't Hof committed
305
      val newFile = swapExt(runDir, file, ".bz2", "")
306
307
308
      val pbzip2 = Pbzip2(this, file, newFile)
      pbzip2.isIntermediate = true
      add(pbzip2)
Peter van 't Hof's avatar
Peter van 't Hof committed
309
310
      newFile
    } else file
311
312
313
314
  }
}

object Flexiprep extends PipelineCommand