Flexiprep.scala 12.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
16
17
package nl.lumc.sasc.biopet.pipelines.flexiprep

Peter van 't Hof's avatar
Peter van 't Hof committed
18
19
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.core.{ PipelineCommand, SampleLibraryTag }
20
21
22
import nl.lumc.sasc.biopet.extensions.{ Pbzip2, Zcat, Gzip, Sickle }
import nl.lumc.sasc.biopet.extensions.tools.{ SeqStat, FastqSync }
import nl.lumc.sasc.biopet.utils.config.Configurable
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import org.broadinstitute.gatk.queue.QScript
24

25
class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with SampleLibraryTag {
26
27
28
29
30
31
  def this() = this(null)

  @Input(doc = "R1 fastq file (gzipped allowed)", shortName = "R1", required = true)
  var input_R1: File = _

  @Input(doc = "R2 fastq file (gzipped allowed)", shortName = "R2", required = false)
32
  var input_R2: Option[File] = None
33

34
35
  /** Skip Trim fastq files */
  var skipTrim: Boolean = config("skip_trim", default = false)
36

37
38
  /** Skip Clip fastq files */
  var skipClip: Boolean = config("skip_clip", default = false)
39

40
41
42
  /** Make a final fastq files, by default only when flexiprep is the main pipeline */
  var keepQcFastqFiles: Boolean = config("keepQcFastqFiles", default = root == null)

Peter van 't Hof's avatar
Peter van 't Hof committed
43
  /** Location of summary file */
44
  def summaryFile = new File(outputDir, sampleId.getOrElse("x") + "-" + libId.getOrElse("x") + ".qc.summary.json")
45

Peter van 't Hof's avatar
Peter van 't Hof committed
46
  /** Returns files to store in summary */
Peter van 't Hof's avatar
Peter van 't Hof committed
47
  def summaryFiles: Map[String, File] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
48
49
50
51
52
    if (!skipTrim || !skipClip)
      Map("input_R1" -> input_R1, "output_R1" -> outputFiles("output_R1_gzip")) ++
        (if (paired) Map("input_R2" -> input_R2.get, "output_R2" -> outputFiles("output_R2_gzip")) else Map())
    else Map("input_R1" -> input_R1) ++
      (if (paired) Map("input_R2" -> input_R2.get) else Map())
Peter van 't Hof's avatar
Peter van 't Hof committed
53
  }
54

Peter van 't Hof's avatar
Peter van 't Hof committed
55
  /** returns settings to store in summary */
Peter van 't Hof's avatar
Peter van 't Hof committed
56
  def summarySettings = Map("skip_trim" -> skipTrim, "skip_clip" -> skipClip, "paired" -> paired)
57

58
  var paired: Boolean = input_R2.isDefined
59
60
61
62
63
64
65
66
67
68
  var R1_ext: String = _
  var R2_ext: String = _
  var R1_name: String = _
  var R2_name: String = _

  var fastqc_R1: Fastqc = _
  var fastqc_R2: Fastqc = _
  var fastqc_R1_after: Fastqc = _
  var fastqc_R2_after: Fastqc = _

Peter van 't Hof's avatar
Peter van 't Hof committed
69
70
71
72
73
74
75
76
77
78
  override def reportClass = {
    val flexiprepReport = new FlexiprepReport(this)
    flexiprepReport.outputDir = new File(outputDir, "report")
    flexiprepReport.summaryFile = summaryFile
    flexiprepReport.args = Map(
      "sampleId" -> sampleId.getOrElse("."),
      "libId" -> libId.getOrElse("."))
    Some(flexiprepReport)
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
79
  /** Function that's need to be executed before the script is accessed */
80
  def init() {
81
    require(outputDir != null, "Missing output directory on flexiprep module")
82
    require(input_R1 != null, "Missing input R1 on flexiprep module")
Peter van 't Hof's avatar
Peter van 't Hof committed
83
84
    require(sampleId != null, "Missing sample ID on flexiprep module")
    require(libId != null, "Missing library ID on flexiprep module")
85

86
    paired = input_R2.isDefined
87

Peter van 't Hof's avatar
Peter van 't Hof committed
88
89
90
    inputFiles :+= InputFile(input_R1)
    input_R2.foreach(inputFiles :+= InputFile(_))

91
92
93
    if (input_R1.endsWith(".gz")) R1_name = input_R1.getName.substring(0, input_R1.getName.lastIndexOf(".gz"))
    else if (input_R1.endsWith(".gzip")) R1_name = input_R1.getName.substring(0, input_R1.getName.lastIndexOf(".gzip"))
    else R1_name = input_R1.getName
Peter van 't Hof's avatar
Peter van 't Hof committed
94
    R1_ext = R1_name.substring(R1_name.lastIndexOf("."), R1_name.length)
95
96
    R1_name = R1_name.substring(0, R1_name.lastIndexOf(R1_ext))

Peter van 't Hof's avatar
Peter van 't Hof committed
97
    input_R2 match {
Peter van 't Hof's avatar
Peter van 't Hof committed
98
      case Some(fileR2) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
99
100
101
102
        paired = true
        if (fileR2.endsWith(".gz")) R2_name = fileR2.getName.substring(0, fileR2.getName.lastIndexOf(".gz"))
        else if (fileR2.endsWith(".gzip")) R2_name = fileR2.getName.substring(0, fileR2.getName.lastIndexOf(".gzip"))
        else R2_name = fileR2.getName
Peter van 't Hof's avatar
Peter van 't Hof committed
103
        R2_ext = R2_name.substring(R2_name.lastIndexOf("."), R2_name.length)
Peter van 't Hof's avatar
Peter van 't Hof committed
104
105
        R2_name = R2_name.substring(0, R2_name.lastIndexOf(R2_ext))
      case _ =>
106
107
108
    }
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
109
  /** Script to add jobs */
110
111
112
  def biopetScript() {
    runInitialJobs()

Peter van 't Hof's avatar
Peter van 't Hof committed
113
    val out = if (paired) runTrimClip(outputFiles("fastq_input_R1"), Some(outputFiles("fastq_input_R2")), outputDir)
114
115
116
117
118
119
120
    else runTrimClip(outputFiles("fastq_input_R1"), outputDir)

    val R1_files = for ((k, v) <- outputFiles if k.endsWith("output_R1")) yield v
    val R2_files = for ((k, v) <- outputFiles if k.endsWith("output_R2")) yield v
    runFinalize(R1_files.toList, R2_files.toList)
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
121
  /** Add init non chunkable jobs */
122
123
  def runInitialJobs() {
    outputFiles += ("fastq_input_R1" -> extractIfNeeded(input_R1, outputDir))
Peter van 't Hof's avatar
Peter van 't Hof committed
124
    if (paired) outputFiles += ("fastq_input_R2" -> extractIfNeeded(input_R2.get, outputDir))
125

Peter van 't Hof's avatar
Peter van 't Hof committed
126
    fastqc_R1 = Fastqc(this, input_R1, new File(outputDir, R1_name + ".fastqc/"))
127
    add(fastqc_R1)
128
    addSummarizable(fastqc_R1, "fastqc_R1")
129
130
131
    outputFiles += ("fastqc_R1" -> fastqc_R1.output)

    if (paired) {
Peter van 't Hof's avatar
Peter van 't Hof committed
132
      fastqc_R2 = Fastqc(this, input_R2.get, new File(outputDir, R2_name + ".fastqc/"))
133
      add(fastqc_R2)
134
      addSummarizable(fastqc_R2, "fastqc_R2")
135
136
137
138
      outputFiles += ("fastqc_R2" -> fastqc_R2.output)
    }
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
139
140
  //TODO: Refactor need to combine all this functions

Peter van 't Hof's avatar
Peter van 't Hof committed
141
  /** Adds all chunkable jobs of flexiprep */
Peter van 't Hof's avatar
Peter van 't Hof committed
142
143
  def runTrimClip(R1_in: File, outDir: File, chunk: String): (File, Option[File], List[File]) =
    runTrimClip(R1_in, None, outDir, chunk)
Peter van 't Hof's avatar
Peter van 't Hof committed
144

Peter van 't Hof's avatar
Peter van 't Hof committed
145
  /** Adds all chunkable jobs of flexiprep */
Peter van 't Hof's avatar
Peter van 't Hof committed
146
147
  def runTrimClip(R1_in: File, outDir: File): (File, Option[File], List[File]) =
    runTrimClip(R1_in, None, outDir, "")
Peter van 't Hof's avatar
Peter van 't Hof committed
148

Peter van 't Hof's avatar
Peter van 't Hof committed
149
  /** Adds all chunkable jobs of flexiprep */
Peter van 't Hof's avatar
Peter van 't Hof committed
150
  def runTrimClip(R1_in: File, R2_in: Option[File], outDir: File): (File, Option[File], List[File]) =
151
    runTrimClip(R1_in, R2_in, outDir, "")
Peter van 't Hof's avatar
Peter van 't Hof committed
152

Peter van 't Hof's avatar
Peter van 't Hof committed
153
  /** Adds all chunkable jobs of flexiprep */
Peter van 't Hof's avatar
Peter van 't Hof committed
154
  def runTrimClip(R1_in: File, R2_in: Option[File], outDir: File, chunkarg: String): (File, Option[File], List[File]) = {
155
156
157
    val chunk = if (chunkarg.isEmpty || chunkarg.endsWith("_")) chunkarg else chunkarg + "_"
    var results: Map[String, File] = Map()

Peter van 't Hof's avatar
Peter van 't Hof committed
158
159
160
161
    var R1 = R1_in
    var R2 = R2_in
    var deps_R1 = R1 :: Nil
    var deps_R2 = if (paired) R2.get :: Nil else Nil
162
    def deps: List[File] = deps_R1 ::: deps_R2
163

164
    val seqtkSeq_R1 = SeqtkSeq(this, R1, swapExt(outDir, R1, R1_ext, ".sanger" + R1_ext), fastqc_R1)
165
166
    seqtkSeq_R1.isIntermediate = true
    add(seqtkSeq_R1)
Peter van 't Hof's avatar
Peter van 't Hof committed
167
    addSummarizable(seqtkSeq_R1, "seqtkSeq_R1")
168
    R1 = seqtkSeq_R1.output
169
    deps_R1 ::= R1
170
171

    if (paired) {
Peter van 't Hof's avatar
Peter van 't Hof committed
172
      val seqtkSeq_R2 = SeqtkSeq(this, R2.get, swapExt(outDir, R2.get, R2_ext, ".sanger" + R2_ext), fastqc_R2)
173
174
      seqtkSeq_R2.isIntermediate = true
      add(seqtkSeq_R2)
Peter van 't Hof's avatar
Peter van 't Hof committed
175
      addSummarizable(seqtkSeq_R2, "seqtkSeq_R2")
Peter van 't Hof's avatar
Peter van 't Hof committed
176
177
      R2 = Some(seqtkSeq_R2.output)
      deps_R2 ::= R2.get
178
179
    }

bow's avatar
bow committed
180
    val seqstat_R1 = SeqStat(this, R1, outDir)
181
    seqstat_R1.isIntermediate = true
182
    seqstat_R1.deps = deps_R1
183
    add(seqstat_R1)
Peter van 't Hof's avatar
Peter van 't Hof committed
184
    addSummarizable(seqstat_R1, "seqstat_R1")
185
186

    if (paired) {
bow's avatar
bow committed
187
      val seqstat_R2 = SeqStat(this, R2.get, outDir)
188
      seqstat_R2.isIntermediate = true
189
      seqstat_R2.deps = deps_R2
190
      add(seqstat_R2)
Peter van 't Hof's avatar
Peter van 't Hof committed
191
      addSummarizable(seqstat_R2, "seqstat_R2")
192
193
194
195
196
    }

    if (!skipClip) { // Adapter clipping

      val cutadapt_R1 = Cutadapt(this, R1, swapExt(outDir, R1, R1_ext, ".clip" + R1_ext))
197
      cutadapt_R1.fastqc = fastqc_R1
198
      cutadapt_R1.isIntermediate = true
199
      cutadapt_R1.deps = deps_R1
200
      add(cutadapt_R1)
Peter van 't Hof's avatar
Peter van 't Hof committed
201
      addSummarizable(cutadapt_R1, "clipping_R1")
202
      R1 = cutadapt_R1.fastq_output
203
      deps_R1 ::= R1
204
205
206
      outputFiles += ("cutadapt_R1_stats" -> cutadapt_R1.stats_output)

      if (paired) {
Peter van 't Hof's avatar
Peter van 't Hof committed
207
        val cutadapt_R2 = Cutadapt(this, R2.get, swapExt(outDir, R2.get, R2_ext, ".clip" + R2_ext))
208
        outputFiles += ("cutadapt_R2_stats" -> cutadapt_R2.stats_output)
209
        cutadapt_R2.fastqc = fastqc_R2
210
        cutadapt_R2.isIntermediate = true
211
        cutadapt_R2.deps = deps_R2
212
        add(cutadapt_R2)
Peter van 't Hof's avatar
Peter van 't Hof committed
213
        addSummarizable(cutadapt_R2, "clipping_R2")
Peter van 't Hof's avatar
Peter van 't Hof committed
214
215
        R2 = Some(cutadapt_R2.fastq_output)
        deps_R2 ::= R2.get
216

bow's avatar
bow committed
217
218
219
220
221
        val fqSync = new FastqSync(this)
        fqSync.refFastq = cutadapt_R1.fastq_input
        fqSync.inputFastq1 = cutadapt_R1.fastq_output
        fqSync.inputFastq2 = cutadapt_R2.fastq_output
        fqSync.outputFastq1 = swapExt(outDir, R1, R1_ext, ".sync" + R1_ext)
Peter van 't Hof's avatar
Peter van 't Hof committed
222
        fqSync.outputFastq2 = swapExt(outDir, R2.get, R2_ext, ".sync" + R2_ext)
bow's avatar
bow committed
223
        fqSync.outputStats = swapExt(outDir, R1, R1_ext, ".sync.stats")
224
        fqSync.isIntermediate = true
bow's avatar
bow committed
225
226
        fqSync.deps :::= deps
        add(fqSync)
Peter van 't Hof's avatar
Peter van 't Hof committed
227
        addSummarizable(fqSync, "fastq_sync")
bow's avatar
bow committed
228
229
        outputFiles += ("syncStats" -> fqSync.outputStats)
        R1 = fqSync.outputFastq1
Peter van 't Hof's avatar
Peter van 't Hof committed
230
        R2 = Some(fqSync.outputFastq2)
231
        deps_R1 ::= R1
Peter van 't Hof's avatar
Peter van 't Hof committed
232
        deps_R2 ::= R2.get
233
234
235
236
237
238
239
240
      }
    }

    if (!skipTrim) { // Quality trimming
      val sickle = new Sickle(this)
      sickle.input_R1 = R1
      sickle.output_R1 = swapExt(outDir, R1, R1_ext, ".trim" + R1_ext)
      if (paired) {
Peter van 't Hof's avatar
Peter van 't Hof committed
241
242
243
        sickle.input_R2 = R2.get
        sickle.output_R2 = swapExt(outDir, R2.get, R2_ext, ".trim" + R2_ext)
        sickle.output_singles = swapExt(outDir, R2.get, R2_ext, ".trim.singles" + R1_ext)
244
245
246
      }
      sickle.output_stats = swapExt(outDir, R1, R1_ext, ".trim.stats")
      sickle.deps = deps
Peter van 't Hof's avatar
Peter van 't Hof committed
247
248
      sickle.isIntermediate = true
      add(sickle)
Peter van 't Hof's avatar
Peter van 't Hof committed
249
      addSummarizable(sickle, "trimming")
250
      R1 = sickle.output_R1
Peter van 't Hof's avatar
Peter van 't Hof committed
251
      if (paired) R2 = Some(sickle.output_R2)
252
253
    }

bow's avatar
bow committed
254
    val seqstat_R1_after = SeqStat(this, R1, outDir)
255
    seqstat_R1_after.deps = deps_R1
256
    add(seqstat_R1_after)
257
    addSummarizable(seqstat_R1_after, "seqstat_R1_qc")
258
259

    if (paired) {
bow's avatar
bow committed
260
      val seqstat_R2_after = SeqStat(this, R2.get, outDir)
261
      seqstat_R2_after.deps = deps_R2
262
      add(seqstat_R2_after)
263
      addSummarizable(seqstat_R2_after, "seqstat_R2_qc")
264
265
266
    }

    outputFiles += (chunk + "output_R1" -> R1)
Peter van 't Hof's avatar
Peter van 't Hof committed
267
    if (paired) outputFiles += (chunk + "output_R2" -> R2.get)
Peter van 't Hof's avatar
Peter van 't Hof committed
268
    (R1, R2, deps)
269
270
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
271
  /** Adds last non chunkable jobs */
272
273
  def runFinalize(fastq_R1: List[File], fastq_R2: List[File]) {
    if (fastq_R1.length != fastq_R2.length && paired) throw new IllegalStateException("R1 and R2 file number is not the same")
Peter van 't Hof's avatar
Peter van 't Hof committed
274
275
    val R1 = new File(outputDir, R1_name + ".qc" + R1_ext + ".gz")
    val R2 = new File(outputDir, R2_name + ".qc" + R2_ext + ".gz")
276

277
278
279
    if (!skipTrim || !skipClip) {
      add(Gzip(this, fastq_R1, R1), !keepQcFastqFiles)
      if (paired) add(Gzip(this, fastq_R2, R2), !keepQcFastqFiles)
280

281
282
      outputFiles += ("output_R1_gzip" -> R1)
      if (paired) outputFiles += ("output_R2_gzip" -> R2)
283

Peter van 't Hof's avatar
Peter van 't Hof committed
284
      fastqc_R1_after = Fastqc(this, R1, new File(outputDir, R1_name + ".qc.fastqc/"))
285
      add(fastqc_R1_after)
286
      addSummarizable(fastqc_R1_after, "fastqc_R1_qc")
Peter van 't Hof's avatar
Peter van 't Hof committed
287

288
      if (paired) {
Peter van 't Hof's avatar
Peter van 't Hof committed
289
        fastqc_R2_after = Fastqc(this, R2, new File(outputDir, R2_name + ".qc.fastqc/"))
290
        add(fastqc_R2_after)
291
        addSummarizable(fastqc_R2_after, "fastqc_R2_qc")
292
293
294
      }
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
295
    addSummaryJobs()
296
297
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
298
  /** Extracts file if file is compressed */
Peter van 't Hof's avatar
Peter van 't Hof committed
299
  def extractIfNeeded(file: File, runDir: File): File = {
Peter van 't Hof's avatar
Peter van 't Hof committed
300
301
    if (file == null) file
    else if (file.getName.endsWith(".gz") || file.getName.endsWith(".gzip")) {
302
      var newFile: File = swapExt(runDir, file, ".gz", "")
Peter van 't Hof's avatar
Peter van 't Hof committed
303
      if (file.getName.endsWith(".gzip")) newFile = swapExt(runDir, file, ".gzip", "")
304
305
306
      val zcatCommand = Zcat(this, file, newFile)
      zcatCommand.isIntermediate = true
      add(zcatCommand)
Peter van 't Hof's avatar
Peter van 't Hof committed
307
308
      newFile
    } else if (file.getName.endsWith(".bz2")) {
Peter van 't Hof's avatar
Peter van 't Hof committed
309
      val newFile = swapExt(runDir, file, ".bz2", "")
310
311
312
      val pbzip2 = Pbzip2(this, file, newFile)
      pbzip2.isIntermediate = true
      add(pbzip2)
Peter van 't Hof's avatar
Peter van 't Hof committed
313
314
      newFile
    } else file
315
316
317
318
  }
}

object Flexiprep extends PipelineCommand