Mapping.scala 12.5 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
2
package nl.lumc.sasc.biopet.pipelines.mapping

Peter van 't Hof's avatar
Peter van 't Hof committed
3
import nl.lumc.sasc.biopet.core.config.Configurable
4
import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
5
import java.util.Date
bow's avatar
bow committed
6
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
Peter van 't Hof's avatar
Peter van 't Hof committed
7
import nl.lumc.sasc.biopet.core.apps.FastqSplitter
8
9
import nl.lumc.sasc.biopet.extensions.aligners.{ Bwa, Star }
import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates
10
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
Peter van 't Hof's avatar
Peter van 't Hof committed
11
import nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep
Peter van 't Hof's avatar
Peter van 't Hof committed
12
import org.broadinstitute.gatk.queue.QScript
13
import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument, ClassType }
bow's avatar
bow committed
14
import org.broadinstitute.gatk.queue.extensions.picard.{ MergeSamFiles, SortSam, AddOrReplaceReadGroups }
Peter van 't Hof's avatar
Peter van 't Hof committed
15
import scala.math._
Peter van 't Hof's avatar
Peter van 't Hof committed
16

bow's avatar
bow committed
17
class Mapping(val root: Configurable) extends QScript with BiopetQScript {
Peter van 't Hof's avatar
Peter van 't Hof committed
18
  qscript =>
Peter van 't Hof's avatar
Peter van 't Hof committed
19
  def this() = this(null)
bow's avatar
bow committed
20
21

  @Input(doc = "R1 fastq file", shortName = "R1", required = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
22
  var input_R1: File = _
bow's avatar
bow committed
23
24

  @Input(doc = "R2 fastq file", shortName = "R2", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
25
  var input_R2: File = _
bow's avatar
bow committed
26
27

  @Argument(doc = "Output name", shortName = "outputName", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
28
  var outputName: String = _
bow's avatar
bow committed
29
30

  @Argument(doc = "Skip flexiprep", shortName = "skipflexiprep", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
31
  var skipFlexiprep: Boolean = false
bow's avatar
bow committed
32
33

  @Argument(doc = "Skip mark duplicates", shortName = "skipmarkduplicates", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
34
  var skipMarkduplicates: Boolean = false
bow's avatar
bow committed
35
36

  @Argument(doc = "Alginer", shortName = "ALN", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
37
  var aligner: String = _
bow's avatar
bow committed
38
39

  @Argument(doc = "Reference", shortName = "R", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
40
  var referenceFile: File = _
bow's avatar
bow committed
41

42
  @Argument(doc = "Chunking", shortName = "chunking", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
43
  var chunking: Boolean = false
44
45
46
47
  
  @ClassType(classOf[Int])
  @Argument(doc = "Number of chunks, when not defined pipeline will automatic calculate number of chunks", shortName = "numberChunks", required = false)
  var numberChunks: Option[Int] = None
bow's avatar
bow committed
48

Peter van 't Hof's avatar
Peter van 't Hof committed
49
  // Readgroup items
bow's avatar
bow committed
50
  @Argument(doc = "Readgroup ID", shortName = "RGID", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
51
  var RGID: String = _
bow's avatar
bow committed
52
53

  @Argument(doc = "Readgroup Library", shortName = "RGLB", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
54
  var RGLB: String = _
bow's avatar
bow committed
55
56

  @Argument(doc = "Readgroup Platform", shortName = "RGPL", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
57
  var RGPL: String = _
bow's avatar
bow committed
58
59

  @Argument(doc = "Readgroup platform unit", shortName = "RGPU", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
60
  var RGPU: String = _
bow's avatar
bow committed
61
62

  @Argument(doc = "Readgroup sample", shortName = "RGSM", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
63
  var RGSM: String = _
bow's avatar
bow committed
64
65

  @Argument(doc = "Readgroup sequencing center", shortName = "RGCN", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
66
  var RGCN: String = _
bow's avatar
bow committed
67
68

  @Argument(doc = "Readgroup description", shortName = "RGDS", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
69
  var RGDS: String = _
bow's avatar
bow committed
70
71

  @Argument(doc = "Readgroup sequencing date", shortName = "RGDT", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
72
  var RGDT: Date = _
bow's avatar
bow committed
73
74

  @Argument(doc = "Readgroup predicted insert size", shortName = "RGPI", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
75
  var RGPI: Int = _
bow's avatar
bow committed
76

Peter van 't Hof's avatar
Peter van 't Hof committed
77
  var paired: Boolean = false
bow's avatar
bow committed
78

Peter van 't Hof's avatar
Peter van 't Hof committed
79
80
  def init() {
    for (file <- configfiles) globalConfig.loadConfigFile(file)
bow's avatar
bow committed
81
    var inputtype: String = config("inputtype", "dna")
Peter van 't Hof's avatar
Peter van 't Hof committed
82
83
84
85
86
    if (aligner == null) {
      if (inputtype == "rna") aligner = config("aligner", "star-2pass")
      else aligner = config("aligner", "bwa")
    }
    if (referenceFile == null) referenceFile = config("referenceFile")
Peter van 't Hof's avatar
Peter van 't Hof committed
87
88
89
90
    if (outputDir == null) throw new IllegalStateException("Missing Output directory on mapping module")
    else if (!outputDir.endsWith("/")) outputDir += "/"
    if (input_R1 == null) throw new IllegalStateException("Missing Fastq R1 on mapping module")
    paired = (input_R2 != null)
bow's avatar
bow committed
91

Peter van 't Hof's avatar
Peter van 't Hof committed
92
    if (RGLB == null && configContains("RGLB")) RGLB = config("RGLB")
Peter van 't Hof's avatar
Peter van 't Hof committed
93
    else if (RGLB == null) throw new IllegalStateException("Missing Readgroup library on mapping module")
Peter van 't Hof's avatar
Peter van 't Hof committed
94
    if (RGSM == null && configContains("RGSM")) RGSM = config("RGSM")
Peter van 't Hof's avatar
Peter van 't Hof committed
95
    else if (RGLB == null) throw new IllegalStateException("Missing Readgroup sample on mapping module")
Peter van 't Hof's avatar
Peter van 't Hof committed
96
    if (RGID == null && configContains("RGID")) RGID = config("RGID")
Peter van 't Hof's avatar
Peter van 't Hof committed
97
    else if (RGID == null && RGSM != null && RGLB != null) RGID = RGSM + "-" + RGLB
bow's avatar
bow committed
98
99
    else if (RGID == null) throw new IllegalStateException("Missing Readgroup ID on mapping module")

Peter van 't Hof's avatar
Peter van 't Hof committed
100
101
102
103
    if (RGPL == null) RGPL = config("RGPL", "illumina")
    if (RGPU == null) RGPU = config("RGPU", "na")
    if (RGCN == null && configContains("RGCN")) RGCN = config("RGCN")
    if (RGDS == null && configContains("RGDS")) RGDS = config("RGDS")
bow's avatar
bow committed
104

105
    if (outputName == null) outputName = RGID
bow's avatar
bow committed
106

107
    if (!chunking && numberChunks.isDefined) chunking = true
Peter van 't Hof's avatar
Peter van 't Hof committed
108
109
    if (!chunking) chunking = config("chunking", false)
    if (chunking) {
110
111
112
113
114
115
116
117
118
119
      if (numberChunks.isEmpty) {
        if (configContains("numberchunks")) numberChunks = config("numberchunks", default = None)
        else {
          val chunkSize: Int = config("chunksize", (1 << 30))
          val filesize = if (input_R1.getName.endsWith(".gz") || input_R1.getName.endsWith(".gzip")) input_R1.length * 3
                         else input_R1.length
          numberChunks = Option(ceil(filesize.toDouble / chunkSize).toInt)
        }
      }
      logger.debug("Chunks: " + numberChunks.getOrElse(1))
Peter van 't Hof's avatar
Peter van 't Hof committed
120
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
121
  }
bow's avatar
bow committed
122

Peter van 't Hof's avatar
Peter van 't Hof committed
123
  def biopetScript() {
Peter van 't Hof's avatar
Peter van 't Hof committed
124
125
126
    var fastq_R1: File = input_R1
    var fastq_R2: File = if (paired) input_R2 else ""
    val flexiprep = new Flexiprep(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
127
    flexiprep.outputDir = outputDir + "flexiprep/"
bow's avatar
bow committed
128
    var bamFiles: List[File] = Nil
Peter van 't Hof's avatar
Peter van 't Hof committed
129
130
    var fastq_R1_output: List[File] = Nil
    var fastq_R2_output: List[File] = Nil
bow's avatar
bow committed
131
132

    def removeGz(file: String): String = {
Peter van 't Hof's avatar
Peter van 't Hof committed
133
134
135
136
      if (file.endsWith(".gz")) return file.substring(0, file.lastIndexOf(".gz"))
      else if (file.endsWith(".gzip")) return file.substring(0, file.lastIndexOf(".gzip"))
      else return file
    }
bow's avatar
bow committed
137
    var chunks: Map[String, (String, String)] = Map()
138
    if (chunking) for (t <- 1 to numberChunks.getOrElse(1)) {
Peter van 't Hof's avatar
Peter van 't Hof committed
139
      chunks += ("chunk_" + t -> (removeGz(outputDir + "chunk_" + t + "/" + fastq_R1.getName),
bow's avatar
bow committed
140
141
142
143
144
        if (paired) removeGz(outputDir + "chunk_" + t + "/" + fastq_R2.getName) else ""))
    }
    else chunks += ("flexiprep" -> (flexiprep.extractIfNeeded(fastq_R1, flexiprep.outputDir),
      flexiprep.extractIfNeeded(fastq_R2, flexiprep.outputDir)))

Peter van 't Hof's avatar
Peter van 't Hof committed
145
146
147
148
149
    if (chunking) {
      val fastSplitter_R1 = new FastqSplitter(this)
      fastSplitter_R1.input = fastq_R1
      fastSplitter_R1.memoryLimit = 4
      fastSplitter_R1.jobResourceRequests :+= "h_vmem=8G"
bow's avatar
bow committed
150
      for ((chunk, fastqfile) <- chunks) fastSplitter_R1.output :+= fastqfile._1
Peter van 't Hof's avatar
Peter van 't Hof committed
151
      add(fastSplitter_R1)
bow's avatar
bow committed
152

Peter van 't Hof's avatar
Peter van 't Hof committed
153
154
155
156
157
      if (paired) {
        val fastSplitter_R2 = new FastqSplitter(this)
        fastSplitter_R2.input = fastq_R2
        fastSplitter_R2.memoryLimit = 4
        fastSplitter_R2.jobResourceRequests :+= "h_vmem=8G"
bow's avatar
bow committed
158
        for ((chunk, fastqfile) <- chunks) fastSplitter_R2.output :+= fastqfile._2
Peter van 't Hof's avatar
Peter van 't Hof committed
159
160
161
        add(fastSplitter_R2)
      }
    }
bow's avatar
bow committed
162
163

    for ((chunk, fastqfile) <- chunks) {
Peter van 't Hof's avatar
Peter van 't Hof committed
164
165
      var R1 = fastqfile._1
      var R2 = fastqfile._2
166
      val chunkDir = if (chunking) outputDir + chunk + "/" else outputDir
Peter van 't Hof's avatar
Peter van 't Hof committed
167
168
169
      if (!skipFlexiprep) {
        flexiprep.input_R1 = fastq_R1
        if (paired) flexiprep.input_R2 = fastq_R2
170
171
        flexiprep.sampleName = this.RGSM
        flexiprep.libraryName = this.RGLB
Peter van 't Hof's avatar
Peter van 't Hof committed
172
173
174
        flexiprep.init
        flexiprep.runInitialJobs
        //flexiprep.biopetScript
175
        val flexiout = flexiprep.runTrimClip(R1, R2, chunkDir + "flexiprep/", chunk)
Peter van 't Hof's avatar
Peter van 't Hof committed
176
177
178
179
180
181
        logger.debug(chunk + " - " + flexiout)
        R1 = flexiout._1
        if (paired) R2 = flexiout._2
        fastq_R1_output :+= R1
        fastq_R2_output :+= R2
      }
182
      
Peter van 't Hof's avatar
Peter van 't Hof committed
183
184
185
186
187
188
189
      if (aligner == "bwa") {
        val bwaCommand = new Bwa(this)
        bwaCommand.R1 = R1
        if (paired) bwaCommand.R2 = R2
        bwaCommand.RG = getReadGroup
        bwaCommand.output = new File(chunkDir + outputName + ".sam")
        add(bwaCommand, isIntermediate = true)
bow's avatar
bow committed
190
        bamFiles :+= addSortSam(List(bwaCommand.output), swapExt(chunkDir, bwaCommand.output, ".sam", ".bam"), chunkDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
191
192
193
194
195
196
197
198
199
200
      } else if (aligner == "star") {
        val starCommand = Star(this, R1, if (paired) R2 else null, outputDir, isIntermediate = true)
        add(starCommand)
        bamFiles :+= addAddOrReplaceReadGroups(List(starCommand.outputSam), new File(chunkDir + outputName + ".bam"), chunkDir)
      } else if (aligner == "star-2pass") {
        val star2pass = Star._2pass(this, R1, if (paired) R2 else null, chunkDir, isIntermediate = true)
        addAll(star2pass._2)
        bamFiles :+= addAddOrReplaceReadGroups(List(star2pass._1), new File(chunkDir + outputName + ".bam"), chunkDir)
      } else throw new IllegalStateException("Option Alginer: '" + aligner + "' is not valid")
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
201
    if (!skipFlexiprep) {
Peter van 't Hof's avatar
Peter van 't Hof committed
202
      flexiprep.runFinalize(fastq_R1_output, fastq_R2_output)
Peter van 't Hof's avatar
Peter van 't Hof committed
203
204
      addAll(flexiprep.functions) // Add function of flexiprep to curent function pool
    }
bow's avatar
bow committed
205

206
207
208
209
210
    var bamFile = bamFiles.head
    if (!skipMarkduplicates) {
      bamFile = new File(outputDir + outputName + ".dedup.bam")
      add(MarkDuplicates(this, bamFiles, bamFile))
    } else if (skipMarkduplicates && chunking) bamFile = addMergeBam(bamFiles, new File(outputDir + outputName + ".bam"), outputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
211
212
213
    
    addAll(BamMetrics.apply(this, bamFile, outputDir + "metrics/").functions)
    
214
    outputFiles += ("finalBamFile" -> bamFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
215
  }
bow's avatar
bow committed
216
217

  def addSortSam(inputSam: List[File], outputFile: File, dir: String): File = {
218
219
220
221
222
223
224
    val sortSam = new SortSam
    sortSam.input = inputSam
    sortSam.createIndex = true
    sortSam.output = outputFile
    sortSam.memoryLimit = 2
    sortSam.nCoresRequest = 2
    sortSam.jobResourceRequests :+= "h_vmem=4G"
Peter van 't Hof's avatar
Peter van 't Hof committed
225
    if (!skipMarkduplicates) sortSam.isIntermediate = true
Peter van 't Hof's avatar
Peter van 't Hof committed
226
    add(sortSam)
bow's avatar
bow committed
227

Peter van 't Hof's avatar
Peter van 't Hof committed
228
229
    return sortSam.output
  }
bow's avatar
bow committed
230
231

  def addMergeBam(inputSam: List[File], outputFile: File, dir: String): File = {
Peter van 't Hof's avatar
Peter van 't Hof committed
232
233
234
235
236
237
238
239
240
241
242
    val mergeSam = new MergeSamFiles
    mergeSam.input = inputSam
    mergeSam.createIndex = true
    mergeSam.output = outputFile
    mergeSam.memoryLimit = 2
    mergeSam.nCoresRequest = 2
    mergeSam.assumeSorted = true
    mergeSam.USE_THREADING = true
    mergeSam.jobResourceRequests :+= "h_vmem=4G"
    if (!skipMarkduplicates) mergeSam.isIntermediate = true
    add(mergeSam)
bow's avatar
bow committed
243

Peter van 't Hof's avatar
Peter van 't Hof committed
244
245
    return mergeSam.output
  }
bow's avatar
bow committed
246
247

  def addAddOrReplaceReadGroups(inputSam: List[File], outputFile: File, dir: String): File = {
248
    val addOrReplaceReadGroups = new AddOrReplaceReadGroups
Peter van 't Hof's avatar
Peter van 't Hof committed
249
250
251
252
253
254
255
256
257
258
259
260
261
262
    addOrReplaceReadGroups.input = inputSam
    addOrReplaceReadGroups.output = outputFile
    addOrReplaceReadGroups.createIndex = true
    addOrReplaceReadGroups.memoryLimit = 2
    addOrReplaceReadGroups.nCoresRequest = 2
    addOrReplaceReadGroups.jobResourceRequests :+= "h_vmem=4G"

    addOrReplaceReadGroups.RGID = RGID
    addOrReplaceReadGroups.RGLB = RGLB
    addOrReplaceReadGroups.RGPL = RGPL
    addOrReplaceReadGroups.RGPU = RGPU
    addOrReplaceReadGroups.RGSM = RGSM
    if (RGCN != null) addOrReplaceReadGroups.RGCN = RGCN
    if (RGDS != null) addOrReplaceReadGroups.RGDS = RGDS
Peter van 't Hof's avatar
Peter van 't Hof committed
263
    if (!skipMarkduplicates) addOrReplaceReadGroups.isIntermediate = true
Peter van 't Hof's avatar
Peter van 't Hof committed
264
    add(addOrReplaceReadGroups)
bow's avatar
bow committed
265

Peter van 't Hof's avatar
Peter van 't Hof committed
266
267
    return addOrReplaceReadGroups.output
  }
bow's avatar
bow committed
268
269

  def getReadGroup(): String = {
Peter van 't Hof's avatar
Peter van 't Hof committed
270
271
272
273
274
    var RG: String = "@RG\\t" + "ID:" + RGID + "\\t"
    RG += "LB:" + RGLB + "\\t"
    RG += "PL:" + RGPL + "\\t"
    RG += "PU:" + RGPU + "\\t"
    RG += "SM:" + RGSM + "\\t"
275
276
277
278
    if (RGCN != null) RG += "CN:" + RGCN + "\\t"
    if (RGDS != null) RG += "DS" + RGDS + "\\t"
    if (RGDT != null) RG += "DT" + RGDT + "\\t"
    if (RGPI > 0) RG += "PI" + RGPI + "\\t"
bow's avatar
bow committed
279

Peter van 't Hof's avatar
Peter van 't Hof committed
280
281
    return RG.substring(0, RG.lastIndexOf("\\t"))
  }
282
283
284
285
}

object Mapping extends PipelineCommand {
  override val pipeline = "/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.class"
bow's avatar
bow committed
286
287

  def loadFromRunConfig(root: Configurable, runConfig: Map[String, Any], sampleConfig: Map[String, Any], runDir: String): Mapping = {
288
    val mapping = new Mapping(root)
bow's avatar
bow committed
289

Peter van 't Hof's avatar
Peter van 't Hof committed
290
    logger.debug("Mapping runconfig: " + runConfig)
Peter van 't Hof's avatar
Peter van 't Hof committed
291
292
    var inputType = ""
    if (runConfig.contains("inputtype")) inputType = runConfig("inputtype").toString
293
294
295
296
297
298
299
300
301
302
303
    else inputType = root.config("inputtype", "dna").getString
    if (inputType == "rna") mapping.aligner = root.config("rna_aligner", "star-2pass").getString
    if (runConfig.contains("R1")) mapping.input_R1 = new File(runConfig("R1").toString)
    if (runConfig.contains("R2")) mapping.input_R2 = new File(runConfig("R2").toString)
    mapping.paired = (mapping.input_R2 != null)
    mapping.RGLB = runConfig("ID").toString
    mapping.RGSM = sampleConfig("ID").toString
    if (runConfig.contains("PL")) mapping.RGPL = runConfig("PL").toString
    if (runConfig.contains("PU")) mapping.RGPU = runConfig("PU").toString
    if (runConfig.contains("CN")) mapping.RGCN = runConfig("CN").toString
    mapping.outputDir = runDir
bow's avatar
bow committed
304

305
306
307
    mapping.init
    mapping.biopetScript
    return mapping
308
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
309
}