GenerateIndexes.scala 11.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
package nl.lumc.sasc.biopet.pipelines.generateindexes
17

18
import java.util
19

Peter van 't Hof's avatar
Peter van 't Hof committed
20
import nl.lumc.sasc.biopet.core.extensions.Md5sum
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
22
import nl.lumc.sasc.biopet.extensions._
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import nl.lumc.sasc.biopet.extensions.bowtie.{ Bowtie2Build, BowtieBuild }
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.extensions.bwa.BwaIndex
Peter van 't Hof's avatar
Peter van 't Hof committed
25
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
Peter van 't Hof's avatar
Peter van 't Hof committed
26
import nl.lumc.sasc.biopet.extensions.gmap.GmapBuild
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import nl.lumc.sasc.biopet.extensions.picard.CreateSequenceDictionary
Peter van 't Hof's avatar
Peter van 't Hof committed
28
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFaidx
29
import nl.lumc.sasc.biopet.utils.ConfigUtils
Peter van 't Hof's avatar
Peter van 't Hof committed
30
import nl.lumc.sasc.biopet.utils.config.Configurable
31
import org.broadinstitute.gatk.queue.QScript
32

33
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
34
import scala.language.reflectiveCalls
35
36
37
38

class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript {
  def this() = this(null)

Peter van 't Hof's avatar
Peter van 't Hof committed
39
40
  @Argument(required = true)
  var referenceConfigFiles: List[File] = Nil
41

Peter van 't Hof's avatar
Peter van 't Hof committed
42
  var referenceConfig: Map[String, Any] = null
43

44
  protected var configDeps: List[File] = Nil
Peter van 't Hof's avatar
Peter van 't Hof committed
45

46
47
  /** This is executed before the script starts */
  def init(): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
48
49
    if (referenceConfig == null)
      referenceConfig = referenceConfigFiles.foldLeft(Map[String, Any]())((a, b) => ConfigUtils.mergeMaps(a, ConfigUtils.fileToConfigMap(b)))
50
51
52
53
54
  }

  /** Method where jobs must be added */
  def biopetScript(): Unit = {

Peter van 't Hof's avatar
Peter van 't Hof committed
55
    val outputConfig = for ((speciesName, c) <- referenceConfig) yield speciesName -> {
56
      val speciesConfig = ConfigUtils.any2map(c)
Peter van 't Hof's avatar
Peter van 't Hof committed
57
      val speciesDir = new File(outputDir, speciesName)
Peter van 't Hof's avatar
Peter van 't Hof committed
58
      for ((genomeName, c) <- speciesConfig) yield genomeName -> {
59
        var configDeps: List[File] = Nil
60
        val genomeConfig = ConfigUtils.any2map(c)
61
62
        val fastaUris = genomeConfig.getOrElse("fasta_uri",
          throw new IllegalArgumentException(s"No fasta_uri found for $speciesName - $genomeName")) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
63
            case a: Traversable[_]    => a.map(_.toString).toArray
Peter van 't Hof's avatar
Peter van 't Hof committed
64
            case a: util.ArrayList[_] => a.map(_.toString).toArray
Peter van 't Hof's avatar
Peter van 't Hof committed
65
            case a                    => Array(a.toString)
66
          }
67

Peter van 't Hof's avatar
Peter van 't Hof committed
68
69
        val genomeDir = new File(speciesDir, genomeName)
        val fastaFile = new File(genomeDir, "reference.fa")
Peter van 't Hof's avatar
Peter van 't Hof committed
70
        var outputConfig: Map[String, Any] = Map("reference_fasta" -> fastaFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
71

72
73
74
75
76
77
78
79
80
81
82
83
84
        val fastaFiles = for (fastaUri <- fastaUris) yield {
          val curl = new Curl(this)
          curl.url = fastaUri
          curl.output = if (fastaUris.length > 1 || fastaUri.endsWith(".gz")) {
            curl.isIntermediate = true
            new File(genomeDir, new File(fastaUri).getName)
          } else fastaFile

          add(curl)
          add(Md5sum(this, curl.output, genomeDir))
          curl.output
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
85
86
        val fastaCat = new FastaMerging(this)
        fastaCat.output = fastaFile
87

Peter van 't Hof's avatar
Peter van 't Hof committed
88
        if (fastaUris.length > 1 || fastaFiles.exists(_.getName.endsWith(".gz"))) {
89
90
91
92
          fastaFiles.foreach { file =>
            if (file.getName.endsWith(".gz")) {
              val zcat = new Zcat(this)
              zcat.appending = true
Peter van 't Hof's avatar
Peter van 't Hof committed
93
              zcat.input :+= file
94
95
96
97
98
99
100
101
102
103
104
105
106
              zcat.output = fastaFile
              fastaCat.cmds :+= zcat
              fastaCat.input :+= file
            } else {
              val cat = new Cat(this)
              cat.appending = true
              cat.input :+= file
              cat.output = fastaFile
              fastaCat.cmds :+= cat
              fastaCat.input :+= file
            }
          }
          add(fastaCat)
107
          configDeps :+= fastaCat.output
108
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
109

Peter van 't Hof's avatar
Peter van 't Hof committed
110
111
        val faidx = SamtoolsFaidx(this, fastaFile)
        add(faidx)
112
        configDeps :+= faidx.output
Peter van 't Hof's avatar
Peter van 't Hof committed
113

Peter van 't Hof's avatar
Peter van 't Hof committed
114
115
116
117
118
        val createDict = new CreateSequenceDictionary(this)
        createDict.reference = fastaFile
        createDict.output = new File(genomeDir, fastaFile.getName.stripSuffix(".fa") + ".dict")
        createDict.species = Some(speciesName)
        createDict.genomeAssembly = Some(genomeName)
119
        createDict.uri = Some(fastaUris.mkString(","))
Peter van 't Hof's avatar
Peter van 't Hof committed
120
        add(createDict)
121
        configDeps :+= createDict.output
122

Peter van 't Hof's avatar
Peter van 't Hof committed
123
124
125
126
127
128
129
130
131
132
133
134
135
        def createLinks(dir: File): File = {
          val newFastaFile = new File(dir, fastaFile.getName)
          val newFai = new File(dir, faidx.output.getName)
          val newDict = new File(dir, createDict.output.getName)

          add(Ln(this, faidx.output, newFai))
          add(Ln(this, createDict.output, newDict))
          val lnFasta = Ln(this, fastaFile, newFastaFile)
          lnFasta.deps ++= List(newFai, newDict)
          add(lnFasta)
          newFastaFile
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
136
137
138
        val annotationDir = new File(genomeDir, "annotation")

        genomeConfig.get("vep_cache_uri").foreach { vepCacheUri =>
139
          val vepDir = new File(annotationDir, "vep")
Peter van 't Hof's avatar
Peter van 't Hof committed
140
          val curl = new Curl(this)
141
142
          curl.url = vepCacheUri.toString
          curl.output = new File(vepDir, new File(curl.url).getName)
143
          curl.isIntermediate = true
Peter van 't Hof's avatar
Peter van 't Hof committed
144
145
          add(curl)

146
147
148
149
150
151
152
          val tar = new TarExtract(this)
          tar.inputTar = curl.output
          tar.outputDir = vepDir
          add(tar)

          val regex = """.*\/(.*)_vep_(\d*)_(.*)\.tar\.gz""".r
          vepCacheUri.toString match {
Peter van 't Hof's avatar
Peter van 't Hof committed
153
            case regex(species, version, assembly) if version.forall(_.isDigit) =>
154
155
156
157
158
159
160
161
162
163
164
              outputConfig ++= Map("varianteffectpredictor" -> Map(
                "species" -> species,
                "assembly" -> assembly,
                "cache_version" -> version.toInt,
                "cache" -> vepDir,
                "fasta" -> createLinks(vepDir)))
            case _ => throw new IllegalArgumentException("Cache found but no version was found")
          }
        }

        genomeConfig.get("dbsnp_vcf_uri").foreach { dbsnpUri =>
Peter van 't Hof's avatar
Peter van 't Hof committed
165
166
167
168
169
170
171
          val contigMap = genomeConfig.get("dbsnp_contig_map").map(_.asInstanceOf[Map[String, Any]])
          val contigSed = contigMap.map { map =>
            val sed = new Sed(this)
            sed.expressions = map.map(x => s"""s/^${x._1}\t/${x._2}\t/""").toList
            sed
          }

172
          val cv = new CombineVariants(this)
173
          cv.reference_sequence = fastaFile
174
175
          cv.deps ::= createDict.output
          def addDownload(uri: String): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
176
177
            val isZipped = uri.endsWith(".gz")
            val output = new File(annotationDir, new File(uri).getName + (if (isZipped) "" else ".gz"))
178
179
            val curl = new Curl(this)
            curl.url = uri
Peter van 't Hof's avatar
Peter van 't Hof committed
180
181
182
183
184
185
186
187
188

            val downloadCmd = (isZipped, contigSed) match {
              case (true, Some(sed)) => curl | Zcat(this) | sed | new Bgzip(this) > output
              case (false, Some(sed)) => curl | sed | new Bgzip(this) > output
              case (true, None) => curl > output
              case (false, None) => curl | new Bgzip(this) > output
            }
            downloadCmd.isIntermediate = true
            add(downloadCmd)
189

Peter van 't Hof's avatar
Peter van 't Hof committed
190
191
192
193
194
            val tabix = new Tabix(this)
            tabix.input = output
            tabix.p = Some("vcf")
            tabix.isIntermediate = true
            add(tabix)
Peter van 't Hof's avatar
Peter van 't Hof committed
195
196

            cv.variant :+= output
197
198
199
200
201
202
203
204
          }

          dbsnpUri match {
            case l: Traversable[_]    => l.foreach(x => addDownload(x.toString))
            case l: util.ArrayList[_] => l.foreach(x => addDownload(x.toString))
            case _                    => addDownload(dbsnpUri.toString)
          }

205
          cv.out = new File(annotationDir, "dbsnp.vcf.gz")
206
          add(cv)
Peter van 't Hof's avatar
Peter van 't Hof committed
207
          outputConfig += "dbsnp" -> cv.out
Peter van 't Hof's avatar
Peter van 't Hof committed
208
209
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
210
        val gtfFile: Option[File] = genomeConfig.get("gtf_uri").map { gtfUri =>
Peter van 't Hof's avatar
Peter van 't Hof committed
211
          val outputFile = new File(annotationDir, new File(gtfUri.toString).getName.stripSuffix(".gz"))
Peter van 't Hof's avatar
Peter van 't Hof committed
212
213
          val curl = new Curl(this)
          curl.url = gtfUri.toString
Peter van 't Hof's avatar
Peter van 't Hof committed
214
215
          if (gtfUri.toString.endsWith(".gz")) add(curl | Zcat(this) > outputFile)
          else add(curl > outputFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
216
          outputConfig += "annotation_gtf" -> outputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
217
          outputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
218
219
220
221
        }

        val refFlatFile: Option[File] = gtfFile.map { gtf =>
          val refFlat = new File(gtf + ".refFlat")
Peter van 't Hof's avatar
Peter van 't Hof committed
222
223
          val gtfToGenePred = new GtfToGenePred(this)
          gtfToGenePred.inputGtfs :+= gtf
Peter van 't Hof's avatar
Peter van 't Hof committed
224

Peter van 't Hof's avatar
Peter van 't Hof committed
225
226
227
          add(gtfToGenePred | Awk(this, """{ print $12"\t"$1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10 }""") > refFlat)

          outputConfig += "annotation_refflat" -> refFlat
Peter van 't Hof's avatar
Peter van 't Hof committed
228
229
230
          refFlat
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
231
232
233
234
        // Bwa index
        val bwaIndex = new BwaIndex(this)
        bwaIndex.reference = createLinks(new File(genomeDir, "bwa"))
        add(bwaIndex)
235
        configDeps :+= bwaIndex.jobOutputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
236
        outputConfig += "bwa" -> Map("reference_fasta" -> bwaIndex.reference.getAbsolutePath)
Peter van 't Hof's avatar
Peter van 't Hof committed
237

Peter van 't Hof's avatar
Peter van 't Hof committed
238
239
240
241
242
243
244
        // Gmap index
        val gmapDir = new File(genomeDir, "gmap")
        val gmapBuild = new GmapBuild(this)
        gmapBuild.dir = gmapDir
        gmapBuild.db = genomeName
        gmapBuild.fastaFiles ::= createLinks(gmapDir)
        add(gmapBuild)
245
        configDeps :+= gmapBuild.jobOutputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
246
247
        outputConfig += "gsnap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName)
        outputConfig += "gmap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName)
Peter van 't Hof's avatar
Peter van 't Hof committed
248

Peter van 't Hof's avatar
Peter van 't Hof committed
249
        // STAR index
Peter van 't Hof's avatar
Peter van 't Hof committed
250
        val starDir = new File(genomeDir, "star")
Peter van 't Hof's avatar
Peter van 't Hof committed
251
252
253
254
        val starIndex = new Star(this)
        starIndex.outputDir = starDir
        starIndex.reference = createLinks(starDir)
        starIndex.runmode = "genomeGenerate"
255
        starIndex.sjdbGTFfile = gtfFile
Peter van 't Hof's avatar
Peter van 't Hof committed
256
        add(starIndex)
257
        configDeps :+= starIndex.jobOutputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
258
259
260
261
        outputConfig += "star" -> Map(
          "reference_fasta" -> starIndex.reference.getAbsolutePath,
          "genomeDir" -> starDir.getAbsolutePath
        )
Peter van 't Hof's avatar
Peter van 't Hof committed
262

Peter van 't Hof's avatar
Peter van 't Hof committed
263
        // Bowtie index
Peter van 't Hof's avatar
Peter van 't Hof committed
264
265
266
267
        val bowtieIndex = new BowtieBuild(this)
        bowtieIndex.reference = createLinks(new File(genomeDir, "bowtie"))
        bowtieIndex.baseName = "reference"
        add(bowtieIndex)
268
        configDeps :+= bowtieIndex.jobOutputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
269
270
        outputConfig += "bowtie" -> Map("reference_fasta" -> bowtieIndex.reference.getAbsolutePath)

Peter van 't Hof's avatar
Peter van 't Hof committed
271
        // Bowtie2 index
Peter van 't Hof's avatar
Peter van 't Hof committed
272
        val bowtie2Index = new Bowtie2Build(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
273
274
275
        bowtie2Index.reference = createLinks(new File(genomeDir, "bowtie2"))
        bowtie2Index.baseName = "reference"
        add(bowtie2Index)
276
        configDeps :+= bowtie2Index.jobOutputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
277
        outputConfig += "bowtie2" -> Map("reference_fasta" -> bowtie2Index.reference.getAbsolutePath)
Peter van 't Hof's avatar
Peter van 't Hof committed
278
279
280
        outputConfig += "tophat" -> Map(
          "bowtie_index" -> bowtie2Index.reference.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta")
        )
Peter van 't Hof's avatar
Peter van 't Hof committed
281

Peter van 't Hof's avatar
Peter van 't Hof committed
282
283
284
285
286
        val writeConfig = new WriteConfig
        writeConfig.deps = configDeps
        writeConfig.out = new File(genomeDir, s"$speciesName-$genomeName.json")
        writeConfig.config = Map("references" -> Map(speciesName -> Map(genomeName -> outputConfig)))
        add(writeConfig)
287
288

        this.configDeps :::= configDeps
Peter van 't Hof's avatar
Peter van 't Hof committed
289
        outputConfig
290
291
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
292

Peter van 't Hof's avatar
Peter van 't Hof committed
293
294
295
296
297
    val writeConfig = new WriteConfig
    writeConfig.deps = configDeps
    writeConfig.out = new File(outputDir, "references.json")
    writeConfig.config = Map("references" -> outputConfig)
    add(writeConfig)
298
299
300
301
  }
}

object GenerateIndexes extends PipelineCommand