VariantEffectPredictor.scala 13.9 KB
Newer Older
bow's avatar
bow committed
1 2 3 4 5 6 7 8 9 10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
bow's avatar
bow committed
12 13 14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Sander Bollen's avatar
Sander Bollen committed
15 16 17 18
package nl.lumc.sasc.biopet.extensions

import java.io.File

Sander Bollen's avatar
Sander Bollen committed
19
import nl.lumc.sasc.biopet.core.summary.Summarizable
Peter van 't Hof's avatar
Peter van 't Hof committed
20
import nl.lumc.sasc.biopet.utils.{ LazyCheck, Logging, VcfUtils, tryToParseNumber }
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import nl.lumc.sasc.biopet.utils.config.Configurable
Peter van 't Hof's avatar
Peter van 't Hof committed
22 23
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Reference, Version }
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
Sander Bollen's avatar
Sander Bollen committed
24

Sander Bollen's avatar
Sander Bollen committed
25 26
import scala.io.Source

Sander Bollen's avatar
Sander Bollen committed
27
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
28
 * Extension for VariantEffectPredictor
Sander Bollen's avatar
Sander Bollen committed
29 30
 * Created by ahbbollen on 15-1-15.
 */
Sander Bollen's avatar
Sander Bollen committed
31
class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFunction with Reference with Version with Summarizable {
Sander Bollen's avatar
Sander Bollen committed
32

Peter van 't Hof's avatar
Peter van 't Hof committed
33 34 35 36 37 38
  lazy val vepVersion = new LazyCheck({
    val s: Option[String] = config("vep_version")
    s
  })
  vepVersion()

Sander Bollen's avatar
Sander Bollen committed
39
  executable = config("exe", namespace = "perl", default = "perl")
Peter van 't Hof's avatar
Peter van 't Hof committed
40
  var vepScript: String = config("vep_script")
Sander Bollen's avatar
Sander Bollen committed
41 42

  @Input(doc = "input VCF", required = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
43
  var input: File = _
Sander Bollen's avatar
Sander Bollen committed
44 45

  @Output(doc = "output file", required = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
46
  var output: File = _
Sander Bollen's avatar
Sander Bollen committed
47

Peter van 't Hof's avatar
Peter van 't Hof committed
48
  override def subPath = {
Peter van 't Hof's avatar
Peter van 't Hof committed
49
    if (vepVersion.isSet) super.subPath ++ List("vep_settings") ++ vepVersion()
Peter van 't Hof's avatar
Peter van 't Hof committed
50 51
    else super.subPath
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
52

53 54
  def versionRegex = """version (\d*)""".r
  def versionCommand = executable + " " + vepScript + " --help"
Sander Bollen's avatar
Sander Bollen committed
55 56

  //Boolean vars
57 58
  var v: Boolean = config("v", default = true, freeVar = false)
  var q: Boolean = config("q", default = false, freeVar = false)
Sander Bollen's avatar
Sander Bollen committed
59
  var offline: Boolean = config("offline", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
60
  var noProgress: Boolean = config("no_progress", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
61
  var everything: Boolean = config("everything", default = false)
Sander Bollen's avatar
Sander Bollen committed
62
  var force: Boolean = config("force", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
63 64
  var noStats: Boolean = config("no_stats", default = false)
  var statsText: Boolean = config("stats_text", default = true)
Sander Bollen's avatar
Sander Bollen committed
65 66 67 68
  var html: Boolean = config("html", default = false)
  var cache: Boolean = config("cache", default = false)
  var humdiv: Boolean = config("humdiv", default = false)
  var regulatory: Boolean = config("regulatory", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
69
  var cellType: Boolean = config("cell_type", default = false)
Sander Bollen's avatar
Sander Bollen committed
70
  var phased: Boolean = config("phased", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
71
  var alleleNumber: Boolean = config("allele_number", default = false)
Sander Bollen's avatar
Sander Bollen committed
72 73
  var numbers: Boolean = config("numbers", default = false)
  var domains: Boolean = config("domains", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
74
  var noEscape: Boolean = config("no_escape", default = false)
Sander Bollen's avatar
Sander Bollen committed
75 76 77 78 79 80 81 82
  var hgvs: Boolean = config("hgvs", default = false)
  var protein: Boolean = config("protein", default = false)
  var symbol: Boolean = config("symbol", default = false)
  var ccds: Boolean = config("ccds", default = false)
  var uniprot: Boolean = config("uniprot", default = false)
  var tsl: Boolean = config("tsl", default = false)
  var canonical: Boolean = config("canonical", default = false)
  var biotype: Boolean = config("biotype", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
83 84 85 86
  var xrefRefseq: Boolean = config("xref_refseq", default = false)
  var checkExisting: Boolean = config("check_existing", default = false)
  var checkAlleles: Boolean = config("check_alleles", default = false)
  var checkSvs: Boolean = config("svs", default = false)
Sander Bollen's avatar
Sander Bollen committed
87
  var gmaf: Boolean = config("gmaf", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
88 89 90
  var maf1kg: Boolean = config("maf_1kg", default = false)
  var mafEsp: Boolean = config("maf_esp", default = false)
  var oldMap: Boolean = config("old_maf", default = false)
Sander Bollen's avatar
Sander Bollen committed
91
  var pubmed: Boolean = config("pubmed", default = false)
Sander Bollen's avatar
Sander Bollen committed
92

93 94
  var vcf: Boolean = config("vcf", default = true, freeVar = false)
  var json: Boolean = config("json", default = false, freeVar = false)
Sander Bollen's avatar
Sander Bollen committed
95
  var gvf: Boolean = config("gvf", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
96 97 98
  var checkRef: Boolean = config("check_ref", default = false)
  var codingOnly: Boolean = config("coding_only", default = false)
  var noIntergenic: Boolean = config("no_intergenic", default = false)
Sander Bollen's avatar
Sander Bollen committed
99
  var pick: Boolean = config("pick", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
100 101 102 103 104
  var pickAllele: Boolean = config("pick_allele", default = false)
  var flagPick: Boolean = config("flag_pick", default = false)
  var flagPickAllele: Boolean = config("flag_pick_allele", default = false)
  var perGene: Boolean = config("per_gene", default = false)
  var mostSevere: Boolean = config("most_severe", default = false)
Sander Bollen's avatar
Sander Bollen committed
105
  var summary: Boolean = config("summary", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
106 107 108
  var filterCommon: Boolean = config("filter_common", default = false)
  var checkFrequency: Boolean = config("check_frequency", default = false)
  var allowNonVariant: Boolean = config("allow_non_variant", default = false)
Sander Bollen's avatar
Sander Bollen committed
109 110
  var database: Boolean = config("database", default = false)
  var genomes: Boolean = config("genomes", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
111
  var gencodeBasic: Boolean = config("gencode_basic", default = false)
Sander Bollen's avatar
Sander Bollen committed
112 113
  var refseq: Boolean = config("refseq", default = false)
  var merged: Boolean = config("merged", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
114
  var allRefseq: Boolean = config("all_refseq", default = false)
Sander Bollen's avatar
Sander Bollen committed
115
  var lrg: Boolean = config("lrg", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
116 117
  var noWholeGenome: Boolean = config("no_whole_genome", default = false)
  var skibDbCheck: Boolean = config("skip_db_check", default = false)
Sander Bollen's avatar
Sander Bollen committed
118 119

  // Textual args
Peter van 't Hof's avatar
Peter van 't Hof committed
120
  var vepConfigArg: Option[String] = config("config", freeVar = false)
121
  var species: Option[String] = config("species", freeVar = false)
Sander Bollen's avatar
Sander Bollen committed
122 123 124
  var assembly: Option[String] = config("assembly")
  var format: Option[String] = config("format")
  var dir: Option[String] = config("dir")
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
125 126
  var dirCache: Option[String] = config("dir_cache")
  var dirPlugins: Option[String] = config("dir_plugins")
Sander Bollen's avatar
Sander Bollen committed
127 128 129
  var fasta: Option[String] = config("fasta")
  var sift: Option[String] = config("sift")
  var polyphen: Option[String] = config("polyphen")
130
  var custom: List[String] = config("custom", default = Nil)
Sander Bollen's avatar
Sander Bollen committed
131 132 133 134 135 136
  var plugin: List[String] = config("plugin", default = Nil)
  var individual: Option[String] = config("individual")
  var fields: Option[String] = config("fields")
  var convert: Option[String] = config("convert")
  var terms: Option[String] = config("terms")
  var chr: Option[String] = config("chr")
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
137 138 139 140
  var pickOrder: Option[String] = config("pick_order")
  var freqPop: Option[String] = config("check_pop")
  var freqGtLt: Option[String] = config("freq_gt_lt")
  var freqFilter: Option[String] = config("freq_filter")
Sander Bollen's avatar
Sander Bollen committed
141 142 143 144 145 146 147
  var filter: Option[String] = config("filter")
  var host: Option[String] = config("host")
  var user: Option[String] = config("user")
  var password: Option[String] = config("password")
  var registry: Option[String] = config("registry")
  var build: Option[String] = config("build")
  var compress: Option[String] = config("compress")
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
148
  var cacheRegionSize: Option[String] = config("cache_region_size")
Sander Bollen's avatar
Sander Bollen committed
149 150

  // Numeric args
Peter van 't Hof's avatar
Peter van 't Hof committed
151
  override def defaultThreads: Int = config("fork", default = 2)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
152 153
  var cacheVersion: Option[Int] = config("cache_version")
  var freqFreq: Option[Float] = config("freq_freq")
Sander Bollen's avatar
Sander Bollen committed
154
  var port: Option[Int] = config("port")
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
155 156
  var dbVersion: Option[Int] = config("db_version")
  var bufferSize: Option[Int] = config("buffer_size")
Sander Bollen's avatar
Sander Bollen committed
157 158
  // ought to be a flag, but is BUG in VEP; becomes numeric ("1" is true)
  var failed: Option[Int] = config("failed")
Sander Bollen's avatar
Sander Bollen committed
159

160 161
  override def defaultCoreMemory = 4.0

162
  @Output
Peter van 't Hof's avatar
Peter van 't Hof committed
163
  private var _summary: File = _
164

Peter van 't Hof's avatar
Peter van 't Hof committed
165 166
  override def beforeGraph(): Unit = {
    super.beforeGraph()
167
    if (!cache && !database) {
Sander Bollen's avatar
Sander Bollen committed
168
      Logging.addError("Must either set 'cache' or 'database' to true for VariantEffectPredictor")
Sander Bollen's avatar
Sander Bollen committed
169
    } else if (cache && dir.isEmpty && dirCache.isEmpty) {
Sander Bollen's avatar
Sander Bollen committed
170
      Logging.addError("Must supply 'dir_cache' to cache for VariantEffectPredictor")
171
    }
172
    if (statsText) _summary = new File(output.getAbsolutePath + "_summary.txt")
173 174
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
175
  /** Returns command to execute */
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
  def cmdLine = {
    if (input.exists() && VcfUtils.vcfFileIsEmpty(input)) {
      val zcat = Zcat(this, input, output)
      zcat.cmdLine
    } else required(executable) +
      required(vepScript) +
      required("-i", input) +
      required("-o", output) +
      conditional(v, "-v") +
      conditional(q, "-q") +
      conditional(offline, "--offline") +
      conditional(noProgress, "--no_progress") +
      conditional(everything, "--everything") +
      conditional(force, "--force_overwrite") +
      conditional(noStats, "--no_stats") +
      conditional(statsText, "--stats_text") +
      conditional(html, "--html") +
      conditional(cache, "--cache") +
      conditional(humdiv, "--humdiv") +
      conditional(regulatory, "--regulatory") +
      conditional(cellType, "--cel_type") +
      conditional(phased, "--phased") +
      conditional(alleleNumber, "--allele_number") +
      conditional(numbers, "--numbers") +
      conditional(domains, "--domains") +
      conditional(noEscape, "--no_escape") +
      conditional(hgvs, "--hgvs") +
      conditional(protein, "--protein") +
      conditional(symbol, "--symbol") +
      conditional(ccds, "--ccds") +
      conditional(uniprot, "--uniprot") +
      conditional(tsl, "--tsl") +
      conditional(canonical, "--canonical") +
      conditional(biotype, "--biotype") +
      conditional(xrefRefseq, "--xref_refseq") +
      conditional(checkExisting, "--check_existing") +
      conditional(checkAlleles, "--check_alleles") +
      conditional(checkSvs, "--check_svs") +
      conditional(gmaf, "--gmaf") +
      conditional(maf1kg, "--maf_1kg") +
      conditional(mafEsp, "--maf_esp") +
      conditional(pubmed, "--pubmed") +
      conditional(vcf, "--vcf") +
      conditional(json, "--json") +
      conditional(gvf, "--gvf") +
      conditional(checkRef, "--check_ref") +
      conditional(codingOnly, "--coding_only") +
      conditional(noIntergenic, "--no_intergenic") +
      conditional(pick, "--pick") +
      conditional(pickAllele, "--pick_allele") +
      conditional(flagPick, "--flag_pick") +
      conditional(flagPickAllele, "--flag_pick_allele") +
      conditional(perGene, "--per_gene") +
      conditional(mostSevere, "--most_severe") +
      conditional(summary, "--summary") +
      conditional(filterCommon, "--filter_common") +
      conditional(checkFrequency, "--check_frequency") +
      conditional(allowNonVariant, "--allow_non_variant") +
      conditional(database, "--database") +
      conditional(genomes, "--genomes") +
      conditional(gencodeBasic, "--gencode_basic") +
      conditional(refseq, "--refseq") +
      conditional(merged, "--merged") +
      conditional(allRefseq, "--all_refseq") +
      conditional(lrg, "--lrg") +
      conditional(noWholeGenome, "--no_whole_genome") +
      conditional(skibDbCheck, "--skip_db_check") +
Peter van 't Hof's avatar
Peter van 't Hof committed
243
      optional("--config", vepConfigArg) +
244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
      optional("--species", species) +
      optional("--assembly", assembly) +
      optional("--format", format) +
      optional("--dir", dir) +
      optional("--dir_cache", dirCache) +
      optional("--dir_plugins", dirPlugins) +
      optional("--fasta", fasta) +
      optional("--sift", sift) +
      optional("--polyphen", polyphen) +
      repeat("--custom", custom) +
      repeat("--plugin", plugin) +
      optional("--individual", individual) +
      optional("--fields", fields) +
      optional("--convert", convert) +
      optional("--terms", terms) +
      optional("--chr", chr) +
      optional("--pick_order", pickOrder) +
      optional("--freq_pop", freqPop) +
      optional("--freq_gt_lt", freqGtLt) +
      optional("--freq_filter", freqFilter) +
      optional("--filter", filter) +
      optional("--host", host) +
      optional("--user", user) +
      optional("--password", password) +
      optional("--registry", registry) +
      optional("--build", build) +
      optional("--compress", compress) +
      optional("--cache_region_size", cacheRegionSize) +
      optional("--fork", threads) +
      optional("--cache_version", cacheVersion) +
      optional("--freq_freq", freqFreq) +
      optional("--port", port) +
      optional("--db_version", dbVersion) +
      optional("--buffer_size", bufferSize) +
      optional("--failed", failed)
  }
Sander Bollen's avatar
Sander Bollen committed
280

Sander Bollen's avatar
Sander Bollen committed
281 282 283
  def summaryFiles: Map[String, File] = Map()

  def summaryStats: Map[String, Any] = {
284 285
    val statsFile = new File(output.getAbsolutePath + "_summary.txt")
    if (statsText && statsFile.exists()) parseStatsFile(statsFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
286
    else Map()
Sander Bollen's avatar
Sander Bollen committed
287 288
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
289 290
  protected val removeOnConflict = Set("Output_file", "Run_time", "Start_time", "End_time", "Novel_/_existing_variants", "Input_file_(format)")
  protected val nonNumber = Set("VEP_version_(API)", "Cache/Database", "Species", "Command_line_options")
291

292
  override def resolveSummaryConflict(v1: Any, v2: Any, key: String): Any = {
293
    if (removeOnConflict.contains(key)) None
Peter van 't Hof's avatar
Peter van 't Hof committed
294
    else if (nonNumber.contains(key)) v2
295 296 297
    else {
      (v1, v2) match {
        case (x1: Int, x2: Int) => x1 + x2
Peter van 't Hof's avatar
Peter van 't Hof committed
298
        case _                  => throw new IllegalStateException(s"Value are not Int's, unable to sum them up, key: $key, v1: $v1, v2: $v2")
299
      }
300 301 302
    }
  }

Sander Bollen's avatar
Sander Bollen committed
303
  def parseStatsFile(file: File): Map[String, Any] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
304
    val reader = Source.fromFile(file)
Peter van 't Hof's avatar
Peter van 't Hof committed
305
    val contents = reader.getLines().filter(_ != "").toArray
Peter van 't Hof's avatar
Peter van 't Hof committed
306
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
307

Peter van 't Hof's avatar
Peter van 't Hof committed
308
    def isHeader(line: String) = line.startsWith("[") && line.endsWith("]")
Sander Bollen's avatar
Sander Bollen committed
309

Peter van 't Hof's avatar
Peter van 't Hof committed
310 311
    val headers = contents.zipWithIndex
      .filter(x => x._1.startsWith("[") && x._1.endsWith("]"))
Sander Bollen's avatar
Sander Bollen committed
312

Peter van 't Hof's avatar
Peter van 't Hof committed
313 314
    (for ((header, headerIndex) <- headers) yield {
      val name = header.stripPrefix("[").stripSuffix("]")
Peter van 't Hof's avatar
Peter van 't Hof committed
315
      name.replaceAll(" ", "_") -> contents.drop(headerIndex + 1).takeWhile(!isHeader(_)).flatMap { line =>
Peter van 't Hof's avatar
Peter van 't Hof committed
316
        val values = line.split("\t", 2)
Peter van 't Hof's avatar
Peter van 't Hof committed
317 318
        if (values.last.isEmpty || values.last == "-") None
        else Some(values.head.replaceAll(" ", "_") -> tryToParseNumber(values.last).getOrElse(values.last))
Peter van 't Hof's avatar
Peter van 't Hof committed
319
      }.toMap
Peter van 't Hof's avatar
Peter van 't Hof committed
320 321
    }).toMap
  }
Sander Bollen's avatar
Sander Bollen committed
322
}