GenerateIndexes.scala 11.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
package nl.lumc.sasc.biopet.pipelines.generateindexes
17

Peter van 't Hof's avatar
Peter van 't Hof committed
18
import java.io.{ File, PrintWriter }
19
import java.util
20

Peter van 't Hof's avatar
Peter van 't Hof committed
21
import nl.lumc.sasc.biopet.core.extensions.Md5sum
Peter van 't Hof's avatar
Peter van 't Hof committed
22
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
23
import nl.lumc.sasc.biopet.extensions._
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.extensions.bowtie.{ Bowtie2Build, BowtieBuild }
Peter van 't Hof's avatar
Peter van 't Hof committed
25
import nl.lumc.sasc.biopet.extensions.bwa.BwaIndex
Peter van 't Hof's avatar
Peter van 't Hof committed
26
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import nl.lumc.sasc.biopet.extensions.gmap.GmapBuild
Peter van 't Hof's avatar
Peter van 't Hof committed
28
import nl.lumc.sasc.biopet.extensions.picard.CreateSequenceDictionary
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFaidx
30
import nl.lumc.sasc.biopet.utils.ConfigUtils
Peter van 't Hof's avatar
Peter van 't Hof committed
31
import nl.lumc.sasc.biopet.utils.config.Configurable
32
import org.broadinstitute.gatk.queue.QScript
33

34
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
35
import scala.language.reflectiveCalls
36 37 38 39

class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript {
  def this() = this(null)

Peter van 't Hof's avatar
Peter van 't Hof committed
40 41
  @Argument(required = true)
  var referenceConfigFiles: List[File] = Nil
42

Peter van 't Hof's avatar
Peter van 't Hof committed
43
  var referenceConfig: Map[String, Any] = null
44

45
  protected var configDeps: List[File] = Nil
Peter van 't Hof's avatar
Peter van 't Hof committed
46

47 48
  /** This is executed before the script starts */
  def init(): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
49 50
    if (referenceConfig == null)
      referenceConfig = referenceConfigFiles.foldLeft(Map[String, Any]())((a, b) => ConfigUtils.mergeMaps(a, ConfigUtils.fileToConfigMap(b)))
51 52 53 54 55
  }

  /** Method where jobs must be added */
  def biopetScript(): Unit = {

Peter van 't Hof's avatar
Peter van 't Hof committed
56
    val outputConfig = for ((speciesName, c) <- referenceConfig) yield speciesName -> {
57
      val speciesConfig = ConfigUtils.any2map(c)
Peter van 't Hof's avatar
Peter van 't Hof committed
58
      val speciesDir = new File(outputDir, speciesName)
Peter van 't Hof's avatar
Peter van 't Hof committed
59
      for ((genomeName, c) <- speciesConfig) yield genomeName -> {
60
        var configDeps: List[File] = Nil
61
        val genomeConfig = ConfigUtils.any2map(c)
62 63
        val fastaUris = genomeConfig.getOrElse("fasta_uri",
          throw new IllegalArgumentException(s"No fasta_uri found for $speciesName - $genomeName")) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
64
            case a: Traversable[_]    => a.map(_.toString).toArray
Peter van 't Hof's avatar
Peter van 't Hof committed
65
            case a: util.ArrayList[_] => a.map(_.toString).toArray
Peter van 't Hof's avatar
Peter van 't Hof committed
66
            case a                    => Array(a.toString)
67
          }
68

Peter van 't Hof's avatar
Peter van 't Hof committed
69 70
        val genomeDir = new File(speciesDir, genomeName)
        val fastaFile = new File(genomeDir, "reference.fa")
Peter van 't Hof's avatar
Peter van 't Hof committed
71
        var outputConfig: Map[String, Any] = Map("reference_fasta" -> fastaFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
72

73 74 75 76 77 78 79 80 81 82
        val fastaFiles = for (fastaUri <- fastaUris) yield {
          val curl = new Curl(this)
          curl.url = fastaUri
          curl.output = if (fastaUris.length > 1 || fastaUri.endsWith(".gz")) {
            curl.isIntermediate = true
            new File(genomeDir, new File(fastaUri).getName)
          } else fastaFile

          add(curl)
          add(Md5sum(this, curl.output, genomeDir))
83
          configDeps :+= curl.output
84 85 86
          curl.output
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
87 88
        val fastaCat = new FastaMerging(this)
        fastaCat.output = fastaFile
89

Peter van 't Hof's avatar
Peter van 't Hof committed
90
        if (fastaUris.length > 1 || fastaFiles.exists(_.getName.endsWith(".gz"))) {
91 92 93 94
          fastaFiles.foreach { file =>
            if (file.getName.endsWith(".gz")) {
              val zcat = new Zcat(this)
              zcat.appending = true
Peter van 't Hof's avatar
Peter van 't Hof committed
95
              zcat.input :+= file
96 97 98 99 100 101 102 103 104 105 106 107 108
              zcat.output = fastaFile
              fastaCat.cmds :+= zcat
              fastaCat.input :+= file
            } else {
              val cat = new Cat(this)
              cat.appending = true
              cat.input :+= file
              cat.output = fastaFile
              fastaCat.cmds :+= cat
              fastaCat.input :+= file
            }
          }
          add(fastaCat)
109
          configDeps :+= fastaCat.output
110
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
111

Peter van 't Hof's avatar
Peter van 't Hof committed
112 113
        val faidx = SamtoolsFaidx(this, fastaFile)
        add(faidx)
114
        configDeps :+= faidx.output
Peter van 't Hof's avatar
Peter van 't Hof committed
115

Peter van 't Hof's avatar
Peter van 't Hof committed
116 117 118 119 120
        val createDict = new CreateSequenceDictionary(this)
        createDict.reference = fastaFile
        createDict.output = new File(genomeDir, fastaFile.getName.stripSuffix(".fa") + ".dict")
        createDict.species = Some(speciesName)
        createDict.genomeAssembly = Some(genomeName)
121
        createDict.uri = Some(fastaUris.mkString(","))
Peter van 't Hof's avatar
Peter van 't Hof committed
122
        add(createDict)
123
        configDeps :+= createDict.output
124

Peter van 't Hof's avatar
Peter van 't Hof committed
125 126 127 128 129 130 131 132 133 134 135 136 137
        def createLinks(dir: File): File = {
          val newFastaFile = new File(dir, fastaFile.getName)
          val newFai = new File(dir, faidx.output.getName)
          val newDict = new File(dir, createDict.output.getName)

          add(Ln(this, faidx.output, newFai))
          add(Ln(this, createDict.output, newDict))
          val lnFasta = Ln(this, fastaFile, newFastaFile)
          lnFasta.deps ++= List(newFai, newDict)
          add(lnFasta)
          newFastaFile
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
138 139 140
        val annotationDir = new File(genomeDir, "annotation")

        genomeConfig.get("vep_cache_uri").foreach { vepCacheUri =>
141
          val vepDir = new File(annotationDir, "vep")
Peter van 't Hof's avatar
Peter van 't Hof committed
142
          val curl = new Curl(this)
143 144
          curl.url = vepCacheUri.toString
          curl.output = new File(vepDir, new File(curl.url).getName)
145
          curl.isIntermediate = true
Peter van 't Hof's avatar
Peter van 't Hof committed
146 147
          add(curl)

148 149 150 151 152 153 154
          val tar = new TarExtract(this)
          tar.inputTar = curl.output
          tar.outputDir = vepDir
          add(tar)

          val regex = """.*\/(.*)_vep_(\d*)_(.*)\.tar\.gz""".r
          vepCacheUri.toString match {
Peter van 't Hof's avatar
Peter van 't Hof committed
155
            case regex(species, version, assembly) if version.forall(_.isDigit) =>
156 157 158 159 160 161 162 163 164 165 166 167
              outputConfig ++= Map("varianteffectpredictor" -> Map(
                "species" -> species,
                "assembly" -> assembly,
                "cache_version" -> version.toInt,
                "cache" -> vepDir,
                "fasta" -> createLinks(vepDir)))
            case _ => throw new IllegalArgumentException("Cache found but no version was found")
          }
        }

        genomeConfig.get("dbsnp_vcf_uri").foreach { dbsnpUri =>
          val cv = new CombineVariants(this)
168
          cv.reference_sequence = fastaFile
169 170 171 172 173 174 175
          cv.deps ::= createDict.output
          def addDownload(uri: String): Unit = {
            val curl = new Curl(this)
            curl.url = uri
            curl.output = new File(annotationDir, new File(curl.url).getName)
            curl.isIntermediate = true
            add(curl)
176
            cv.variant :+= curl.output
177

178 179 180 181 182 183 184 185
            if (curl.output.getName.endsWith(".vcf.gz")) {
              val tabix = new Tabix(this)
              tabix.input = curl.output
              tabix.p = Some("vcf")
              tabix.isIntermediate = true
              add(tabix)
              configDeps :+= tabix.outputIndex
            }
186 187 188 189 190 191 192 193
          }

          dbsnpUri match {
            case l: Traversable[_]    => l.foreach(x => addDownload(x.toString))
            case l: util.ArrayList[_] => l.foreach(x => addDownload(x.toString))
            case _                    => addDownload(dbsnpUri.toString)
          }

194
          cv.out = new File(annotationDir, "dbsnp.vcf.gz")
195
          add(cv)
Peter van 't Hof's avatar
Peter van 't Hof committed
196
          outputConfig += "dbsnp" -> cv.out
Peter van 't Hof's avatar
Peter van 't Hof committed
197 198
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
199
        val gtfFile: Option[File] = genomeConfig.get("gtf_uri").map { gtfUri =>
Peter van 't Hof's avatar
Peter van 't Hof committed
200
          val outputFile = new File(annotationDir, new File(gtfUri.toString).getName.stripSuffix(".gz"))
Peter van 't Hof's avatar
Peter van 't Hof committed
201 202
          val curl = new Curl(this)
          curl.url = gtfUri.toString
Peter van 't Hof's avatar
Peter van 't Hof committed
203 204
          if (gtfUri.toString.endsWith(".gz")) add(curl | Zcat(this) > outputFile)
          else add(curl > outputFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
205
          outputConfig += "annotation_gtf" -> outputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
206
          outputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
207 208 209 210
        }

        val refFlatFile: Option[File] = gtfFile.map { gtf =>
          val refFlat = new File(gtf + ".refFlat")
Peter van 't Hof's avatar
Peter van 't Hof committed
211 212
          val gtfToGenePred = new GtfToGenePred(this)
          gtfToGenePred.inputGtfs :+= gtf
Peter van 't Hof's avatar
Peter van 't Hof committed
213

Peter van 't Hof's avatar
Peter van 't Hof committed
214 215 216
          add(gtfToGenePred | Awk(this, """{ print $12"\t"$1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10 }""") > refFlat)

          outputConfig += "annotation_refflat" -> refFlat
Peter van 't Hof's avatar
Peter van 't Hof committed
217 218 219
          refFlat
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
220 221 222 223
        // Bwa index
        val bwaIndex = new BwaIndex(this)
        bwaIndex.reference = createLinks(new File(genomeDir, "bwa"))
        add(bwaIndex)
224
        configDeps :+= bwaIndex.jobOutputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
225
        outputConfig += "bwa" -> Map("reference_fasta" -> bwaIndex.reference.getAbsolutePath)
Peter van 't Hof's avatar
Peter van 't Hof committed
226

Peter van 't Hof's avatar
Peter van 't Hof committed
227 228 229 230 231 232 233
        // Gmap index
        val gmapDir = new File(genomeDir, "gmap")
        val gmapBuild = new GmapBuild(this)
        gmapBuild.dir = gmapDir
        gmapBuild.db = genomeName
        gmapBuild.fastaFiles ::= createLinks(gmapDir)
        add(gmapBuild)
234
        configDeps :+= gmapBuild.jobOutputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
235 236
        outputConfig += "gsnap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName)
        outputConfig += "gmap" -> Map("dir" -> gmapBuild.dir.getAbsolutePath, "db" -> genomeName)
Peter van 't Hof's avatar
Peter van 't Hof committed
237

Peter van 't Hof's avatar
Peter van 't Hof committed
238
        // STAR index
Peter van 't Hof's avatar
Peter van 't Hof committed
239
        val starDir = new File(genomeDir, "star")
Peter van 't Hof's avatar
Peter van 't Hof committed
240 241 242 243
        val starIndex = new Star(this)
        starIndex.outputDir = starDir
        starIndex.reference = createLinks(starDir)
        starIndex.runmode = "genomeGenerate"
244
        starIndex.sjdbGTFfile = gtfFile
Peter van 't Hof's avatar
Peter van 't Hof committed
245
        add(starIndex)
246
        configDeps :+= starIndex.jobOutputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
247 248 249 250
        outputConfig += "star" -> Map(
          "reference_fasta" -> starIndex.reference.getAbsolutePath,
          "genomeDir" -> starDir.getAbsolutePath
        )
Peter van 't Hof's avatar
Peter van 't Hof committed
251

Peter van 't Hof's avatar
Peter van 't Hof committed
252
        // Bowtie index
Peter van 't Hof's avatar
Peter van 't Hof committed
253 254 255 256
        val bowtieIndex = new BowtieBuild(this)
        bowtieIndex.reference = createLinks(new File(genomeDir, "bowtie"))
        bowtieIndex.baseName = "reference"
        add(bowtieIndex)
257
        configDeps :+= bowtieIndex.jobOutputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
258 259
        outputConfig += "bowtie" -> Map("reference_fasta" -> bowtieIndex.reference.getAbsolutePath)

Peter van 't Hof's avatar
Peter van 't Hof committed
260
        // Bowtie2 index
Peter van 't Hof's avatar
Peter van 't Hof committed
261
        val bowtie2Index = new Bowtie2Build(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
262 263 264
        bowtie2Index.reference = createLinks(new File(genomeDir, "bowtie2"))
        bowtie2Index.baseName = "reference"
        add(bowtie2Index)
265
        configDeps :+= bowtie2Index.jobOutputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
266
        outputConfig += "bowtie2" -> Map("reference_fasta" -> bowtie2Index.reference.getAbsolutePath)
Peter van 't Hof's avatar
Peter van 't Hof committed
267 268 269
        outputConfig += "tophat" -> Map(
          "bowtie_index" -> bowtie2Index.reference.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta")
        )
Peter van 't Hof's avatar
Peter van 't Hof committed
270

Peter van 't Hof's avatar
Peter van 't Hof committed
271 272 273 274 275
        val writeConfig = new WriteConfig
        writeConfig.deps = configDeps
        writeConfig.out = new File(genomeDir, s"$speciesName-$genomeName.json")
        writeConfig.config = Map("references" -> Map(speciesName -> Map(genomeName -> outputConfig)))
        add(writeConfig)
276 277

        this.configDeps :::= configDeps
Peter van 't Hof's avatar
Peter van 't Hof committed
278
        outputConfig
279 280
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
281

Peter van 't Hof's avatar
Peter van 't Hof committed
282 283 284 285 286
    val writeConfig = new WriteConfig
    writeConfig.deps = configDeps
    writeConfig.out = new File(outputDir, "references.json")
    writeConfig.config = Map("references" -> outputConfig)
    add(writeConfig)
287 288 289 290
  }
}

object GenerateIndexes extends PipelineCommand