Mapping.scala 12.3 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
2
package nl.lumc.sasc.biopet.pipelines.mapping

3
4
import nl.lumc.sasc.biopet.function._
import nl.lumc.sasc.biopet.function.aligners._
5
import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
6
7
import java.util.Date
import nl.lumc.sasc.biopet.core._
Peter van 't Hof's avatar
Peter van 't Hof committed
8
import nl.lumc.sasc.biopet.core.apps.FastqSplitter
Peter van 't Hof's avatar
Peter van 't Hof committed
9
import nl.lumc.sasc.biopet.core.config._
10
11
import nl.lumc.sasc.biopet.function.picard.MarkDuplicates
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
Peter van 't Hof's avatar
Peter van 't Hof committed
12
import nl.lumc.sasc.biopet.pipelines.flexiprep._
Peter van 't Hof's avatar
Peter van 't Hof committed
13
14
15
16
17
18
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk._
import org.broadinstitute.gatk.queue.extensions.picard.MergeSamFiles
import org.broadinstitute.gatk.queue.extensions.picard.SortSam
import org.broadinstitute.gatk.queue.extensions.picard.AddOrReplaceReadGroups
import org.broadinstitute.gatk.queue.function._
Peter van 't Hof's avatar
Peter van 't Hof committed
19
import scala.util.parsing.json._
Peter van 't Hof's avatar
Peter van 't Hof committed
20
import org.broadinstitute.gatk.utils.variant._
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import scala.math._
Peter van 't Hof's avatar
Peter van 't Hof committed
22

Peter van 't Hof's avatar
Peter van 't Hof committed
23
class Mapping(val root:Configurable) extends QScript with BiopetQScript {
Peter van 't Hof's avatar
Peter van 't Hof committed
24
  qscript =>
Peter van 't Hof's avatar
Peter van 't Hof committed
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
  def this() = this(null)
  
  @Input(doc="R1 fastq file", shortName="R1",required=true)
  var input_R1: File = _
  
  @Input(doc="R2 fastq file", shortName="R2", required=false)
  var input_R2: File = _
  
  @Argument(doc="Output name", shortName="outputName", required=false)
  var outputName: String = _
  
  @Argument(doc="Skip flexiprep", shortName="skipflexiprep", required=false)
  var skipFlexiprep: Boolean = false
  
  @Argument(doc="Skip mark duplicates", shortName="skipmarkduplicates", required=false)
  var skipMarkduplicates: Boolean = false
  
  @Argument(doc="Alginer", shortName="ALN", required=false)
  var aligner: String = _
  
  @Argument(doc="Reference", shortName="R", required=false)
  var referenceFile: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
47
  
Peter van 't Hof's avatar
Peter van 't Hof committed
48
49
50
  @Argument(doc="auto chuning", shortName="chunking", required=false)
  var chunking: Boolean = false
  
Peter van 't Hof's avatar
Peter van 't Hof committed
51
  // Readgroup items
Peter van 't Hof's avatar
Peter van 't Hof committed
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
  @Argument(doc="Readgroup ID", shortName="RGID", required=false)
  var RGID: String = _
  
  @Argument(doc="Readgroup Library", shortName="RGLB", required=false)
  var RGLB: String = _
  
  @Argument(doc="Readgroup Platform", shortName="RGPL", required=false)
  var RGPL: String = _
  
  @Argument(doc="Readgroup platform unit", shortName="RGPU", required=false)
  var RGPU: String = _
  
  @Argument(doc="Readgroup sample", shortName="RGSM", required=false)
  var RGSM: String = _
  
  @Argument(doc="Readgroup sequencing center", shortName="RGCN", required=false)
  var RGCN: String = _
  
  @Argument(doc="Readgroup description", shortName="RGDS", required=false)
  var RGDS: String = _
  
  @Argument(doc="Readgroup sequencing date", shortName="RGDT", required=false)
  var RGDT: Date = _
  
  @Argument(doc="Readgroup predicted insert size", shortName="RGPI", required=false)
  var RGPI: Int = _
  
Peter van 't Hof's avatar
Peter van 't Hof committed
79
  var paired: Boolean = false
Peter van 't Hof's avatar
Peter van 't Hof committed
80
  var numberChunks = 0
Peter van 't Hof's avatar
Peter van 't Hof committed
81
82
83
  
  def init() {
    for (file <- configfiles) globalConfig.loadConfigFile(file)
Peter van 't Hof's avatar
Peter van 't Hof committed
84
85
86
87
88
89
    var inputtype:String = config("inputtype", "dna")
    if (aligner == null) {
      if (inputtype == "rna") aligner = config("aligner", "star-2pass")
      else aligner = config("aligner", "bwa")
    }
    if (referenceFile == null) referenceFile = config("referenceFile")
Peter van 't Hof's avatar
Peter van 't Hof committed
90
91
92
93
94
    if (outputDir == null) throw new IllegalStateException("Missing Output directory on mapping module")
    else if (!outputDir.endsWith("/")) outputDir += "/"
    if (input_R1 == null) throw new IllegalStateException("Missing Fastq R1 on mapping module")
    paired = (input_R2 != null)
    
Peter van 't Hof's avatar
Peter van 't Hof committed
95
    if (RGLB == null && configContains("RGLB")) RGLB = config("RGLB")
Peter van 't Hof's avatar
Peter van 't Hof committed
96
    else if (RGLB == null) throw new IllegalStateException("Missing Readgroup library on mapping module")
Peter van 't Hof's avatar
Peter van 't Hof committed
97
    if (RGSM == null && configContains("RGSM")) RGSM = config("RGSM")
Peter van 't Hof's avatar
Peter van 't Hof committed
98
    else if (RGLB == null) throw new IllegalStateException("Missing Readgroup sample on mapping module")
Peter van 't Hof's avatar
Peter van 't Hof committed
99
    if (RGID == null && configContains("RGID")) RGID = config("RGID")
Peter van 't Hof's avatar
Peter van 't Hof committed
100
101
102
    else if (RGID == null && RGSM != null && RGLB != null) RGID = RGSM + "-" + RGLB
    else if (RGID == null) throw new IllegalStateException("Missing Readgroup ID on mapping module")    
    
Peter van 't Hof's avatar
Peter van 't Hof committed
103
104
105
106
    if (RGPL == null) RGPL = config("RGPL", "illumina")
    if (RGPU == null) RGPU = config("RGPU", "na")
    if (RGCN == null && configContains("RGCN")) RGCN = config("RGCN")
    if (RGDS == null && configContains("RGDS")) RGDS = config("RGDS")
107
108
    
    if (outputName == null) outputName = RGID
Peter van 't Hof's avatar
Peter van 't Hof committed
109
110
111
112
113
114
115
116
117
118
119
    
    if (!chunking && numberChunks > 1) chunking = true
    if (!chunking) chunking = config("chunking", false)
    if (chunking) {
      val chunkSize:Int = config("chunksize", (1 << 30))
      val filesize = if (input_R1.getName.endsWith(".gz") || input_R1.getName.endsWith(".gzip")) input_R1.length * 3 
                     else input_R1.length
      if (numberChunks == 0 && configContains("numberchunks")) numberChunks = config("numberchunks")
      else if (numberChunks == 0) numberChunks = ceil(filesize.toDouble / chunkSize).toInt
      logger.debug("Chunks: " + numberChunks)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
120
121
  }
  
Peter van 't Hof's avatar
Peter van 't Hof committed
122
  def biopetScript() {
Peter van 't Hof's avatar
Peter van 't Hof committed
123
124
125
    var fastq_R1: File = input_R1
    var fastq_R2: File = if (paired) input_R2 else ""
    val flexiprep = new Flexiprep(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
126
    flexiprep.outputDir = outputDir + "flexiprep/"
Peter van 't Hof's avatar
Peter van 't Hof committed
127
128
129
130
131
132
133
134
135
136
137
138
139
    var bamFiles:List[File] = Nil
    var fastq_R1_output: List[File] = Nil
    var fastq_R2_output: List[File] = Nil
    
    def removeGz(file:String):String = {
      if (file.endsWith(".gz")) return file.substring(0, file.lastIndexOf(".gz"))
      else if (file.endsWith(".gzip")) return file.substring(0, file.lastIndexOf(".gzip"))
      else return file
    }
    var chunks:Map[String, (String,String)] = Map()
    if (chunking) for (t <- 1 to numberChunks) {
      chunks += ("chunk_" + t -> (removeGz(outputDir + "chunk_" + t + "/" + fastq_R1.getName),
                                  if (paired) removeGz(outputDir + "chunk_" + t + "/" + fastq_R2.getName) else ""))
140
141
    } else chunks += ("flexiprep" -> (flexiprep.extractIfNeeded(fastq_R1, flexiprep.outputDir),
                                      flexiprep.extractIfNeeded(fastq_R2, flexiprep.outputDir)))
Peter van 't Hof's avatar
Peter van 't Hof committed
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
    
    if (chunking) {
      val fastSplitter_R1 = new FastqSplitter(this)
      fastSplitter_R1.input = fastq_R1
      fastSplitter_R1.memoryLimit = 4
      fastSplitter_R1.jobResourceRequests :+= "h_vmem=8G"
      for ((chunk,fastqfile) <- chunks) fastSplitter_R1.output :+= fastqfile._1
      add(fastSplitter_R1)
      
      if (paired) {
        val fastSplitter_R2 = new FastqSplitter(this)
        fastSplitter_R2.input = fastq_R2
        fastSplitter_R2.memoryLimit = 4
        fastSplitter_R2.jobResourceRequests :+= "h_vmem=8G"
        for ((chunk,fastqfile) <- chunks) fastSplitter_R2.output :+= fastqfile._2
        add(fastSplitter_R2)
      }
    }
    
    for ((chunk,fastqfile) <- chunks) {
      var R1 = fastqfile._1
      var R2 = fastqfile._2
      if (!skipFlexiprep) {
        flexiprep.input_R1 = fastq_R1
        if (paired) flexiprep.input_R2 = fastq_R2
167
        if (chunking) flexiprep.skipSummary = true
Peter van 't Hof's avatar
Peter van 't Hof committed
168
169
170
171
172
173
174
175
176
177
        flexiprep.init
        flexiprep.runInitialJobs
        //flexiprep.biopetScript
        val flexiout = flexiprep.runTrimClip(R1, R2, outputDir + chunk, chunk)
        logger.debug(chunk + " - " + flexiout)
        R1 = flexiout._1
        if (paired) R2 = flexiout._2
        fastq_R1_output :+= R1
        fastq_R2_output :+= R2
      }
178
      val chunkDir = if (chunking) outputDir + chunk + "/" else outputDir
Peter van 't Hof's avatar
Peter van 't Hof committed
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
      if (aligner == "bwa") {
        val bwaCommand = new Bwa(this)
        bwaCommand.R1 = R1
        if (paired) bwaCommand.R2 = R2
        bwaCommand.RG = getReadGroup
        bwaCommand.output = new File(chunkDir + outputName + ".sam")
        add(bwaCommand, isIntermediate = true)
        bamFiles :+= addSortSam(List(bwaCommand.output), swapExt(chunkDir,bwaCommand.output,".sam",".bam"), chunkDir)
      } else if (aligner == "star") {
        val starCommand = Star(this, R1, if (paired) R2 else null, outputDir, isIntermediate = true)
        add(starCommand)
        bamFiles :+= addAddOrReplaceReadGroups(List(starCommand.outputSam), new File(chunkDir + outputName + ".bam"), chunkDir)
      } else if (aligner == "star-2pass") {
        val star2pass = Star._2pass(this, R1, if (paired) R2 else null, chunkDir, isIntermediate = true)
        addAll(star2pass._2)
        bamFiles :+= addAddOrReplaceReadGroups(List(star2pass._1), new File(chunkDir + outputName + ".bam"), chunkDir)
      } else throw new IllegalStateException("Option Alginer: '" + aligner + "' is not valid")
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
197
    if (!skipFlexiprep) {
Peter van 't Hof's avatar
Peter van 't Hof committed
198
      flexiprep.runFinalize(fastq_R1_output, fastq_R2_output)
Peter van 't Hof's avatar
Peter van 't Hof committed
199
200
201
      addAll(flexiprep.functions) // Add function of flexiprep to curent function pool
    }
    
202
203
204
205
206
207
    var bamFile = bamFiles.head
    if (!skipMarkduplicates) {
      bamFile = new File(outputDir + outputName + ".dedup.bam")
      add(MarkDuplicates(this, bamFiles, bamFile))
    } else if (skipMarkduplicates && chunking) bamFile = addMergeBam(bamFiles, new File(outputDir + outputName + ".bam"), outputDir)
    
208
    outputFiles += ("finalBamFile" -> bamFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
209
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
210
  
Peter van 't Hof's avatar
Peter van 't Hof committed
211
  def addSortSam(inputSam:List[File], outputFile:File, dir:String) : File = {
212
213
214
215
216
217
218
    val sortSam = new SortSam
    sortSam.input = inputSam
    sortSam.createIndex = true
    sortSam.output = outputFile
    sortSam.memoryLimit = 2
    sortSam.nCoresRequest = 2
    sortSam.jobResourceRequests :+= "h_vmem=4G"
Peter van 't Hof's avatar
Peter van 't Hof committed
219
    if (!skipMarkduplicates) sortSam.isIntermediate = true
Peter van 't Hof's avatar
Peter van 't Hof committed
220
221
222
223
224
    add(sortSam)
    
    return sortSam.output
  }
  
Peter van 't Hof's avatar
Peter van 't Hof committed
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
  def addMergeBam(inputSam:List[File], outputFile:File, dir:String) : File = {
    val mergeSam = new MergeSamFiles
    mergeSam.input = inputSam
    mergeSam.createIndex = true
    mergeSam.output = outputFile
    mergeSam.memoryLimit = 2
    mergeSam.nCoresRequest = 2
    mergeSam.assumeSorted = true
    mergeSam.USE_THREADING = true
    mergeSam.jobResourceRequests :+= "h_vmem=4G"
    if (!skipMarkduplicates) mergeSam.isIntermediate = true
    add(mergeSam)
    
    return mergeSam.output
  }
  
Peter van 't Hof's avatar
Peter van 't Hof committed
241
  def addAddOrReplaceReadGroups(inputSam:List[File], outputFile:File, dir:String) : File = {
242
    val addOrReplaceReadGroups = new AddOrReplaceReadGroups
Peter van 't Hof's avatar
Peter van 't Hof committed
243
244
245
246
247
248
249
250
251
252
253
254
255
256
    addOrReplaceReadGroups.input = inputSam
    addOrReplaceReadGroups.output = outputFile
    addOrReplaceReadGroups.createIndex = true
    addOrReplaceReadGroups.memoryLimit = 2
    addOrReplaceReadGroups.nCoresRequest = 2
    addOrReplaceReadGroups.jobResourceRequests :+= "h_vmem=4G"

    addOrReplaceReadGroups.RGID = RGID
    addOrReplaceReadGroups.RGLB = RGLB
    addOrReplaceReadGroups.RGPL = RGPL
    addOrReplaceReadGroups.RGPU = RGPU
    addOrReplaceReadGroups.RGSM = RGSM
    if (RGCN != null) addOrReplaceReadGroups.RGCN = RGCN
    if (RGDS != null) addOrReplaceReadGroups.RGDS = RGDS
Peter van 't Hof's avatar
Peter van 't Hof committed
257
    if (!skipMarkduplicates) addOrReplaceReadGroups.isIntermediate = true
Peter van 't Hof's avatar
Peter van 't Hof committed
258
259
    add(addOrReplaceReadGroups)
    
Peter van 't Hof's avatar
Peter van 't Hof committed
260
261
262
263
264
265
266
267
268
    return addOrReplaceReadGroups.output
  }
  
  def getReadGroup() : String = {
    var RG: String = "@RG\\t" + "ID:" + RGID + "\\t"
    RG += "LB:" + RGLB + "\\t"
    RG += "PL:" + RGPL + "\\t"
    RG += "PU:" + RGPU + "\\t"
    RG += "SM:" + RGSM + "\\t"
269
270
271
272
    if (RGCN != null) RG += "CN:" + RGCN + "\\t"
    if (RGDS != null) RG += "DS" + RGDS + "\\t"
    if (RGDT != null) RG += "DT" + RGDT + "\\t"
    if (RGPI > 0) RG += "PI" + RGPI + "\\t"
Peter van 't Hof's avatar
Peter van 't Hof committed
273
274
275
    
    return RG.substring(0, RG.lastIndexOf("\\t"))
  }
276
277
278
279
}

object Mapping extends PipelineCommand {
  override val pipeline = "/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.class"
280
  
281
282
283
  def loadFromRunConfig(root:Configurable, runConfig:Map[String,Any], sampleConfig:Map[String,Any], runDir: String) : Mapping = {
    val mapping = new Mapping(root)
    
Peter van 't Hof's avatar
Peter van 't Hof committed
284
    logger.debug("Mapping runconfig: " + runConfig)
Peter van 't Hof's avatar
Peter van 't Hof committed
285
286
    var inputType = ""
    if (runConfig.contains("inputtype")) inputType = runConfig("inputtype").toString
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
    else inputType = root.config("inputtype", "dna").getString
    if (inputType == "rna") mapping.aligner = root.config("rna_aligner", "star-2pass").getString
    if (runConfig.contains("R1")) mapping.input_R1 = new File(runConfig("R1").toString)
    if (runConfig.contains("R2")) mapping.input_R2 = new File(runConfig("R2").toString)
    mapping.paired = (mapping.input_R2 != null)
    mapping.RGLB = runConfig("ID").toString
    mapping.RGSM = sampleConfig("ID").toString
    if (runConfig.contains("PL")) mapping.RGPL = runConfig("PL").toString
    if (runConfig.contains("PU")) mapping.RGPU = runConfig("PU").toString
    if (runConfig.contains("CN")) mapping.RGCN = runConfig("CN").toString
    mapping.outputDir = runDir
    
    mapping.init
    mapping.biopetScript
    return mapping
302
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
303
}