DownloadGenomes.scala 11.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
12 13 14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
package nl.lumc.sasc.biopet.pipelines.generateindexes
16

17
import java.io.File
18
import java.util
19

Peter van 't Hof's avatar
Peter van 't Hof committed
20
import nl.lumc.sasc.biopet.core.extensions.Md5sum
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
22 23
import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.extensions.picard.NormalizeFasta
25
import nl.lumc.sasc.biopet.extensions.tools.DownloadNcbiAssembly
26
import nl.lumc.sasc.biopet.utils.ConfigUtils
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import nl.lumc.sasc.biopet.utils.config.Configurable
28
import org.broadinstitute.gatk.queue.QScript
29

30
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
31
import scala.language.reflectiveCalls
32

Peter van 't Hof's avatar
Peter van 't Hof committed
33
class DownloadGenomes(val parent: Configurable) extends QScript with BiopetQScript {
34 35
  def this() = this(null)

36 37
  @Argument(required = true)
  var referenceConfigFiles: List[File] = Nil
38

39
  var referenceConfig: Map[String, Any] = _
40

Peter van 't Hof's avatar
Peter van 't Hof committed
41 42
  override def fixedValues = Map("gffread" -> Map("T" -> true))

Peter van 't Hof's avatar
Peter van 't Hof committed
43
  override def defaults = Map("normalizefasta" -> Map("line_length" -> 60))
Peter van 't Hof's avatar
Peter van 't Hof committed
44

45 46
  val downloadAnnotations: Boolean = config("download_annotations", default = false)

47 48
  /** This is executed before the script starts */
  def init(): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
49 50
    if (referenceConfig == null)
      referenceConfig = referenceConfigFiles.foldLeft(Map[String, Any]())((a, b) => ConfigUtils.mergeMaps(a, ConfigUtils.fileToConfigMap(b)))
51 52 53 54 55
  }

  /** Method where jobs must be added */
  def biopetScript(): Unit = {

56
    for ((speciesName, c) <- referenceConfig) yield speciesName -> {
57
      val speciesConfig = ConfigUtils.any2map(c)
Peter van 't Hof's avatar
Peter van 't Hof committed
58
      val speciesDir = new File(outputDir, speciesName)
Peter van 't Hof's avatar
Peter van 't Hof committed
59
      for ((genomeName, c) <- speciesConfig) yield genomeName -> {
60
        var configDeps: List[File] = Nil
61 62
        val genomeConfig = ConfigUtils.any2map(c)

Peter van 't Hof's avatar
Peter van 't Hof committed
63 64
        val genomeDir = new File(speciesDir, genomeName)
        val fastaFile = new File(genomeDir, "reference.fa")
Peter van 't Hof's avatar
Peter van 't Hof committed
65
        val downloadFastaFile = new File(genomeDir, "download.reference.fa")
Peter van 't Hof's avatar
Peter van 't Hof committed
66

Peter van 't Hof's avatar
Peter van 't Hof committed
67
        genomeConfig.get("ncbi_assembly_report") match {
Peter van 't Hof's avatar
Peter van 't Hof committed
68
          case Some(assemblyReport: String) =>
69
            val downloadAssembly = new DownloadNcbiAssembly(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
70
            downloadAssembly.assemblyReport = new File(assemblyReport)
Peter van 't Hof's avatar
Peter van 't Hof committed
71
            downloadAssembly.output = downloadFastaFile
Peter van 't Hof's avatar
Peter van 't Hof committed
72 73 74 75 76 77 78 79 80 81
            downloadAssembly.outputReport = new File(genomeDir, s"$speciesName-$genomeName.assembly.report")
            downloadAssembly.nameHeader = genomeConfig.get("ncbi_assembly_header_name").map(_.toString)
            downloadAssembly.mustHaveOne = genomeConfig.get("ncbi_assembly_must_have_one")
              .map(_.asInstanceOf[util.ArrayList[util.LinkedHashMap[String, String]]])
              .getOrElse(new util.ArrayList()).flatMap(x => x.map(y => y._1 + "=" + y._2))
              .toList
            downloadAssembly.mustNotHave = genomeConfig.get("ncbi_assembly_must_not_have")
              .map(_.asInstanceOf[util.ArrayList[util.LinkedHashMap[String, String]]])
              .getOrElse(new util.ArrayList()).flatMap(x => x.map(y => y._1 + "=" + y._2))
              .toList
Peter van 't Hof's avatar
Peter van 't Hof committed
82
            downloadAssembly.isIntermediate = true
83
            add(downloadAssembly)
84
          case _ =>
85 86
            val fastaUris = genomeConfig.getOrElse("fasta_uri",
              throw new IllegalArgumentException(s"No fasta_uri found for $speciesName - $genomeName")) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
87 88 89 90
                case a: Traversable[_]    => a.map(_.toString).toArray
                case a: util.ArrayList[_] => a.map(_.toString).toArray
                case a                    => Array(a.toString)
              }
91

92 93 94 95 96 97 98 99 100 101 102 103 104 105
            val fastaFiles = for (fastaUri <- fastaUris) yield {
              val curl = new Curl(this)
              curl.url = fastaUri
              curl.output = if (fastaUris.length > 1 || fastaUri.endsWith(".gz")) {
                curl.isIntermediate = true
                new File(genomeDir, new File(fastaUri).getName)
              } else fastaFile

              add(curl)
              add(Md5sum(this, curl.output, genomeDir))
              curl.output
            }

            val fastaCat = new FastaMerging(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
106 107
            fastaCat.output = downloadFastaFile
            fastaCat.isIntermediate = true
108 109 110 111 112 113
            if (fastaUris.length > 1 || fastaFiles.exists(_.getName.endsWith(".gz"))) {
              fastaFiles.foreach { file =>
                if (file.getName.endsWith(".gz")) {
                  val zcat = new Zcat(this)
                  zcat.appending = true
                  zcat.input :+= file
Peter van 't Hof's avatar
Peter van 't Hof committed
114
                  zcat.output = downloadFastaFile
115 116 117 118 119 120
                  fastaCat.cmds :+= zcat
                  fastaCat.input :+= file
                } else {
                  val cat = new Cat(this)
                  cat.appending = true
                  cat.input :+= file
Peter van 't Hof's avatar
Peter van 't Hof committed
121
                  cat.output = downloadFastaFile
122 123 124 125 126 127
                  fastaCat.cmds :+= cat
                  fastaCat.input :+= file
                }
              }
              add(fastaCat)
              configDeps :+= fastaCat.output
128 129
            }
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
130

Peter van 't Hof's avatar
Peter van 't Hof committed
131 132 133 134 135
        val normalizeFasta = new NormalizeFasta(this)
        normalizeFasta.input = downloadFastaFile
        normalizeFasta.output = fastaFile
        add(normalizeFasta)

136 137 138 139
        val generateIndexes = new GenerateIndexes(this)
        generateIndexes.fastaFile = fastaFile
        generateIndexes.speciesName = speciesName
        generateIndexes.genomeName = genomeName
Peter van 't Hof's avatar
Peter van 't Hof committed
140
        generateIndexes.outputDir = genomeDir
141
        //generateIndexes.fastaUris = fastaUris
142 143 144
        //TODO: add gtf file
        add(generateIndexes)

145 146 147 148
        if (downloadAnnotations) {
          val annotationDir = new File(genomeDir, "annotation")

          def getAnnotation(tag: String): Map[String, Map[String, Any]] = (genomeConfig.get(tag) match {
149
            case Some(s: Map[_, _]) => s.map(x => x._2 match {
150
              case o: Map[_, _] => x._1.toString -> o.map(x => (x._1.toString, x._2))
Peter van 't Hof's avatar
Peter van 't Hof committed
151
              case _            => throw new IllegalStateException(s"values in the tag $tag should be json objects")
152 153
            })
            case None => Map()
Peter van 't Hof's avatar
Peter van 't Hof committed
154
            case x    => throw new IllegalStateException(s"tag $tag should be an object with objects, now $x")
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
          })

          // Download vep caches
          getAnnotation("vep").foreach {
            case (version, vep) =>
              val vepDir = new File(annotationDir, "vep" + File.separator + version)
              val curl = new Curl(this)
              curl.url = vep("cache_uri").toString
              curl.output = new File(vepDir, new File(curl.url).getName)
              curl.isIntermediate = true
              add(curl)

              val tar = new TarExtract(this)
              tar.inputTar = curl.output
              tar.outputDir = vepDir
              add(tar)
          }

          getAnnotation("dbsnp").foreach {
            case (version, dbsnp) =>
              val dbpsnpDir = new File(annotationDir, "dbsnp")
              val contigMap = dbsnp.get("dbsnp_contig_map").map(_.asInstanceOf[Map[String, Any]])
              val contigSed = contigMap.map { map =>
                val sed = new Sed(this)
                sed.expressions = map.map(x => s"""s/^${x._1}\t/${x._2}\t/""").toList
                sed
              }

              val cv = new CombineVariants(this)
              cv.reference_sequence = fastaFile
              def addDownload(uri: String): Unit = {
                val isZipped = uri.endsWith(".gz")
                val output = new File(dbpsnpDir, version + "." + new File(uri).getName + (if (isZipped) "" else ".gz"))
                val curl = new Curl(this)
                curl.url = uri

                val downloadCmd = (isZipped, contigSed) match {
                  case (true, Some(sed))  => curl | Zcat(this) | sed | new Bgzip(this) > output
                  case (false, Some(sed)) => curl | sed | new Bgzip(this) > output
                  case (true, None)       => curl > output
                  case (false, None)      => curl | new Bgzip(this) > output
                }
                downloadCmd.isIntermediate = true
                add(downloadCmd)

                val tabix = new Tabix(this)
                tabix.input = output
                tabix.p = Some("vcf")
                tabix.isIntermediate = true
                add(tabix)

                cv.variant :+= output
              }

              dbsnp.get("vcf_uri") match {
                case Some(l: Traversable[_])    => l.foreach(x => addDownload(x.toString))
                case Some(l: util.ArrayList[_]) => l.foreach(x => addDownload(x.toString))
                case Some(s)                    => addDownload(s.toString)
                case None                       => throw new IllegalStateException("Dbsnp should always have a 'vcf_uri' key")
              }

              cv.out = new File(dbpsnpDir, s"dbsnp.$version.vcf.gz")
              add(cv)
          }

          getAnnotation("gene_annotation").foreach {
            case (version, geneAnnotation) =>
              val dir = new File(annotationDir, version)
              val gffFile: Option[File] = geneAnnotation.get("gff_uri").map { gtfUri =>
                val outputFile = new File(dir, new File(gtfUri.toString).getName.stripSuffix(".gz"))
                val curl = new Curl(this)
                curl.url = gtfUri.toString
                if (gtfUri.toString.endsWith(".gz")) add(curl | Zcat(this) > outputFile)
                else add(curl > outputFile)
                outputFile
              }

232 233 234 235 236 237 238 239 240 241 242 243 244 245 246
              val gtfFile: Option[File] = geneAnnotation.get("gtf_uri") match {
                case Some(gtfUri) =>
                  val outputFile = new File(dir, new File(gtfUri.toString).getName.stripSuffix(".gz"))
                  val curl = new Curl(this)
                  curl.url = gtfUri.toString
                  if (gtfUri.toString.endsWith(".gz")) add(curl | Zcat(this) > outputFile)
                  else add(curl > outputFile)
                  Some(outputFile)
                case _ => gffFile.map { gff =>
                  val gffRead = new GffRead(this)
                  gffRead.input = gff
                  gffRead.output = swapExt(dir, gff, ".gff", ".gtf")
                  add(gffRead)
                  gffRead.output
                }
247 248 249 250 251 252 253 254 255 256 257 258 259
              }

              val refFlatFile: Option[File] = gtfFile.map { gtf =>
                val refFlat = new File(gtf + ".refFlat")
                val gtfToGenePred = new GtfToGenePred(this)
                gtfToGenePred.inputGtfs :+= gtf

                add(gtfToGenePred | Awk(this, """{ print $12"\t"$1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10 }""") > refFlat)

                refFlat
              }
          }
        }
260 261 262 263 264
      }
    }
  }
}

265
object DownloadGenomes extends PipelineCommand