Flexiprep.scala 11.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
16
17
package nl.lumc.sasc.biopet.pipelines.flexiprep

18
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
19
20
21
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }

22
import nl.lumc.sasc.biopet.core.{ SampleLibraryTag, BiopetQScript, PipelineCommand }
23
24
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat, Seqstat }
bow's avatar
bow committed
25
import nl.lumc.sasc.biopet.tools.FastqSync
26

27
class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with SampleLibraryTag {
28
29
30
31
32
33
  def this() = this(null)

  @Input(doc = "R1 fastq file (gzipped allowed)", shortName = "R1", required = true)
  var input_R1: File = _

  @Input(doc = "R2 fastq file (gzipped allowed)", shortName = "R2", required = false)
34
  var input_R2: Option[File] = None
35

36
37
  /** Skip Trim fastq files */
  var skipTrim: Boolean = config("skip_trim", default = false)
38

39
40
  /** Skip Clip fastq files */
  var skipClip: Boolean = config("skip_clip", default = false)
41

42
  // TODO: hide sampleId and libId from the command line so they do not interfere with our config values
43

44
  def summaryFile = new File(outputDir, sampleId.getOrElse("x") + "-" + libId.getOrElse("x") + ".qc.summary.json")
45

46
  def summaryFiles = Map()
47

48
  def summaryData = Map("skip_trim" -> skipTrim, "skip_clip" -> skipClip)
49

50
  var paired: Boolean = input_R2.isDefined
51
52
53
54
55
56
57
58
59
60
61
62
63
  var R1_ext: String = _
  var R2_ext: String = _
  var R1_name: String = _
  var R2_name: String = _

  var fastqc_R1: Fastqc = _
  var fastqc_R2: Fastqc = _
  var fastqc_R1_after: Fastqc = _
  var fastqc_R2_after: Fastqc = _

  val summary = new FlexiprepSummary(this)

  def init() {
64
    require(outputDir != null, "Missing output directory on flexiprep module")
65
    require(input_R1 != null, "Missing input R1 on flexiprep module")
66
67
    //require(sampleId != null, "Missing sample ID on flexiprep module")
    //require(libId != null, "Missing library ID on flexiprep module")
68

69
    paired = input_R2.isDefined
70
71
72
73
74
75
76

    if (input_R1.endsWith(".gz")) R1_name = input_R1.getName.substring(0, input_R1.getName.lastIndexOf(".gz"))
    else if (input_R1.endsWith(".gzip")) R1_name = input_R1.getName.substring(0, input_R1.getName.lastIndexOf(".gzip"))
    else R1_name = input_R1.getName
    R1_ext = R1_name.substring(R1_name.lastIndexOf("."), R1_name.size)
    R1_name = R1_name.substring(0, R1_name.lastIndexOf(R1_ext))

Peter van 't Hof's avatar
Peter van 't Hof committed
77
78
79
80
81
82
83
84
85
86
    input_R2 match {
      case Some(fileR2) => {
        paired = true
        if (fileR2.endsWith(".gz")) R2_name = fileR2.getName.substring(0, fileR2.getName.lastIndexOf(".gz"))
        else if (fileR2.endsWith(".gzip")) R2_name = fileR2.getName.substring(0, fileR2.getName.lastIndexOf(".gzip"))
        else R2_name = fileR2.getName
        R2_ext = R2_name.substring(R2_name.lastIndexOf("."), R2_name.size)
        R2_name = R2_name.substring(0, R2_name.lastIndexOf(R2_ext))
      }
      case _ =>
87
88
    }

89
    //summary.out = new File(outputDir, sampleId.getOrElse("x") + "-" + libId.getOrElse("x") + ".qc.summary.json")
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
  }

  def biopetScript() {
    runInitialJobs()

    val out = if (paired) runTrimClip(outputFiles("fastq_input_R1"), outputFiles("fastq_input_R2"), outputDir)
    else runTrimClip(outputFiles("fastq_input_R1"), outputDir)

    val R1_files = for ((k, v) <- outputFiles if k.endsWith("output_R1")) yield v
    val R2_files = for ((k, v) <- outputFiles if k.endsWith("output_R2")) yield v
    runFinalize(R1_files.toList, R2_files.toList)
  }

  def runInitialJobs() {
    outputFiles += ("fastq_input_R1" -> extractIfNeeded(input_R1, outputDir))
Peter van 't Hof's avatar
Peter van 't Hof committed
105
    if (paired) outputFiles += ("fastq_input_R2" -> extractIfNeeded(input_R2.get, outputDir))
106

Peter van 't Hof's avatar
Peter van 't Hof committed
107
    fastqc_R1 = Fastqc(this, input_R1, new File(outputDir, R1_name + ".fastqc/"))
108
    add(fastqc_R1)
109
110
    //summary.addFastqc(fastqc_R1)
    addSummarizable(fastqc_R1, "fastqc_R1")
111
112
    outputFiles += ("fastqc_R1" -> fastqc_R1.output)

113
114
115
    //val md5sum_R1 = Md5sum(this, input_R1, outputDir)
    //add(md5sum_R1)
    //summary.addMd5sum(md5sum_R1, R2 = false, after = false)
116
117

    if (paired) {
Peter van 't Hof's avatar
Peter van 't Hof committed
118
      fastqc_R2 = Fastqc(this, input_R2.get, new File(outputDir, R2_name + ".fastqc/"))
119
      add(fastqc_R2)
120
121
      addSummarizable(fastqc_R2, "fastqc_R2")
      //summary.addFastqc(fastqc_R2, R2 = true)
122
123
      outputFiles += ("fastqc_R2" -> fastqc_R2.output)

124
125
126
      //val md5sum_R2 = Md5sum(this, input_R2.get, outputDir)
      //add(md5sum_R2)
      //summary.addMd5sum(md5sum_R2, R2 = true, after = false)
127
128
129
    }
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
130
  def runTrimClip(R1_in: File, outDir: File, chunk: String): (File, File, List[File]) = {
131
132
    runTrimClip(R1_in, new File(""), outDir, chunk)
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
133
  def runTrimClip(R1_in: File, outDir: File): (File, File, List[File]) = {
134
135
    runTrimClip(R1_in, new File(""), outDir, "")
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
136
  def runTrimClip(R1_in: File, R2_in: File, outDir: File): (File, File, List[File]) = {
137
138
    runTrimClip(R1_in, R2_in, outDir, "")
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
139
  def runTrimClip(R1_in: File, R2_in: File, outDir: File, chunkarg: String): (File, File, List[File]) = {
140
141
142
143
    val chunk = if (chunkarg.isEmpty || chunkarg.endsWith("_")) chunkarg else chunkarg + "_"
    var results: Map[String, File] = Map()

    var R1: File = new File(R1_in)
Peter van 't Hof's avatar
Peter van 't Hof committed
144
    var R2: File = if (paired) new File(R2_in) else null
145
146
    var deps: List[File] = if (paired) List(R1, R2) else List(R1)

147
    val seqtkSeq_R1 = SeqtkSeq(this, R1, swapExt(outDir, R1, R1_ext, ".sanger" + R1_ext), fastqc_R1)
148
149
    seqtkSeq_R1.isIntermediate = true
    add(seqtkSeq_R1)
150
151
152
153
    R1 = seqtkSeq_R1.output
    deps ::= R1

    if (paired) {
154
      val seqtkSeq_R2 = SeqtkSeq(this, R2, swapExt(outDir, R2, R2_ext, ".sanger" + R2_ext), fastqc_R2)
155
156
      seqtkSeq_R2.isIntermediate = true
      add(seqtkSeq_R2)
157
158
159
160
161
      R2 = seqtkSeq_R2.output
      deps ::= R2
    }

    val seqstat_R1 = Seqstat(this, R1, outDir)
162
163
    seqstat_R1.isIntermediate = true
    add(seqstat_R1)
164
165
166
167
    summary.addSeqstat(seqstat_R1, R2 = false, after = false, chunk)

    if (paired) {
      val seqstat_R2 = Seqstat(this, R2, outDir)
168
169
      seqstat_R2.isIntermediate = true
      add(seqstat_R2)
170
171
172
173
174
175
      summary.addSeqstat(seqstat_R2, R2 = true, after = false, chunk)
    }

    if (!skipClip) { // Adapter clipping

      val cutadapt_R1 = Cutadapt(this, R1, swapExt(outDir, R1, R1_ext, ".clip" + R1_ext))
176
      cutadapt_R1.fastqc = fastqc_R1
177
178
      cutadapt_R1.isIntermediate = true
      add(cutadapt_R1)
179
180
181
182
183
184
185
186
      summary.addCutadapt(cutadapt_R1, R2 = false, chunk)
      R1 = cutadapt_R1.fastq_output
      deps ::= R1
      outputFiles += ("cutadapt_R1_stats" -> cutadapt_R1.stats_output)

      if (paired) {
        val cutadapt_R2 = Cutadapt(this, R2, swapExt(outDir, R2, R2_ext, ".clip" + R2_ext))
        outputFiles += ("cutadapt_R2_stats" -> cutadapt_R2.stats_output)
187
        cutadapt_R2.fastqc = fastqc_R2
188
189
        cutadapt_R2.isIntermediate = true
        add(cutadapt_R2)
190
191
192
193
        summary.addCutadapt(cutadapt_R2, R2 = true, chunk)
        R2 = cutadapt_R2.fastq_output
        deps ::= R2

bow's avatar
bow committed
194
195
196
197
198
199
200
201
202
203
204
205
206
207
        val fqSync = new FastqSync(this)
        fqSync.refFastq = cutadapt_R1.fastq_input
        fqSync.inputFastq1 = cutadapt_R1.fastq_output
        fqSync.inputFastq2 = cutadapt_R2.fastq_output
        fqSync.outputFastq1 = swapExt(outDir, R1, R1_ext, ".sync" + R1_ext)
        fqSync.outputFastq2 = swapExt(outDir, R2, R2_ext, ".sync" + R2_ext)
        fqSync.outputStats = swapExt(outDir, R1, R1_ext, ".sync.stats")
        fqSync.deps :::= deps
        add(fqSync)

        summary.addFastqcSync(fqSync, chunk)
        outputFiles += ("syncStats" -> fqSync.outputStats)
        R1 = fqSync.outputFastq1
        R2 = fqSync.outputFastq2
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
        deps :::= R1 :: R2 :: Nil
      }
    }

    if (!skipTrim) { // Quality trimming
      val sickle = new Sickle(this)
      sickle.input_R1 = R1
      sickle.output_R1 = swapExt(outDir, R1, R1_ext, ".trim" + R1_ext)
      if (paired) {
        sickle.input_R2 = R2
        sickle.output_R2 = swapExt(outDir, R2, R2_ext, ".trim" + R2_ext)
        sickle.output_singles = swapExt(outDir, R2, R2_ext, ".trim.singles" + R1_ext)
      }
      sickle.output_stats = swapExt(outDir, R1, R1_ext, ".trim.stats")
      sickle.deps = deps
Peter van 't Hof's avatar
Peter van 't Hof committed
223
224
      sickle.isIntermediate = true
      add(sickle)
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
      summary.addSickle(sickle, chunk)
      R1 = sickle.output_R1
      if (paired) R2 = sickle.output_R2
    }

    val seqstat_R1_after = Seqstat(this, R1, outDir)
    seqstat_R1_after.deps = deps
    add(seqstat_R1_after)
    summary.addSeqstat(seqstat_R1_after, R2 = false, after = true, chunk)

    if (paired) {
      val seqstat_R2_after = Seqstat(this, R2, outDir)
      seqstat_R2_after.deps = deps
      add(seqstat_R2_after)
      summary.addSeqstat(seqstat_R2_after, R2 = true, after = true, chunk)
    }

    outputFiles += (chunk + "output_R1" -> R1)
    if (paired) outputFiles += (chunk + "output_R2" -> R2)
    return (R1, R2, deps)
  }

  def runFinalize(fastq_R1: List[File], fastq_R2: List[File]) {
    if (fastq_R1.length != fastq_R2.length && paired) throw new IllegalStateException("R1 and R2 file number is not the same")
Peter van 't Hof's avatar
Peter van 't Hof committed
249
250
    val R1 = new File(outputDir, R1_name + ".qc" + R1_ext + ".gz")
    val R2 = new File(outputDir, R2_name + ".qc" + R2_ext + ".gz")
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266

    add(Gzip(this, fastq_R1, R1))
    if (paired) add(Gzip(this, fastq_R2, R2))

    outputFiles += ("output_R1_gzip" -> R1)
    if (paired) outputFiles += ("output_R2_gzip" -> R2)

    if (!skipTrim || !skipClip) {
      val md5sum_R1 = Md5sum(this, R1, outputDir)
      add(md5sum_R1)
      summary.addMd5sum(md5sum_R1, R2 = false, after = true)
      if (paired) {
        val md5sum_R2 = Md5sum(this, R2, outputDir)
        add(md5sum_R2)
        summary.addMd5sum(md5sum_R2, R2 = true, after = true)
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
267
      fastqc_R1_after = Fastqc(this, R1, new File(outputDir, R1_name + ".qc.fastqc/"))
268
      add(fastqc_R1_after)
269
270
      addSummarizable(fastqc_R1_after, "fastqc_R1_qc")
      //summary.addFastqc(fastqc_R1_after, after = true)
271
      if (paired) {
Peter van 't Hof's avatar
Peter van 't Hof committed
272
        fastqc_R2_after = Fastqc(this, R2, new File(outputDir, R2_name + ".qc.fastqc/"))
273
        add(fastqc_R2_after)
274
275
        addSummarizable(fastqc_R2_after, "fastqc_R2_qc")
        //summary.addFastqc(fastqc_R2_after, R2 = true, after = true)
276
277
278
      }
    }

279
280
    //add(summary)
    addSummaryJobs
281
282
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
283
  def extractIfNeeded(file: File, runDir: File): File = {
Peter van 't Hof's avatar
Peter van 't Hof committed
284
285
    if (file == null) return file
    else if (file.getName().endsWith(".gz") || file.getName().endsWith(".gzip")) {
286
287
288
289
290
291
292
      var newFile: File = swapExt(runDir, file, ".gz", "")
      if (file.getName().endsWith(".gzip")) newFile = swapExt(runDir, file, ".gzip", "")
      val zcatCommand = Zcat(this, file, newFile)
      zcatCommand.isIntermediate = true
      add(zcatCommand)
      return newFile
    } else if (file.getName().endsWith(".bz2")) {
Peter van 't Hof's avatar
Peter van 't Hof committed
293
      val newFile = swapExt(runDir, file, ".bz2", "")
294
295
296
297
298
299
300
301
302
      val pbzip2 = Pbzip2(this, file, newFile)
      pbzip2.isIntermediate = true
      add(pbzip2)
      return newFile
    } else return file
  }
}

object Flexiprep extends PipelineCommand