VariantEffectPredictor.scala 13.6 KB
Newer Older
bow's avatar
bow committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Sander Bollen's avatar
Sander Bollen committed
16
17
18
19
package nl.lumc.sasc.biopet.extensions

import java.io.File

Sander Bollen's avatar
Sander Bollen committed
20
import nl.lumc.sasc.biopet.core.summary.Summarizable
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import nl.lumc.sasc.biopet.utils.{ Logging, VcfUtils, tryToParseNumber }
Peter van 't Hof's avatar
Peter van 't Hof committed
22
import nl.lumc.sasc.biopet.utils.config.Configurable
Peter van 't Hof's avatar
Peter van 't Hof committed
23
24
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Reference, Version }
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
Sander Bollen's avatar
Sander Bollen committed
25

Sander Bollen's avatar
Sander Bollen committed
26
27
import scala.io.Source

Sander Bollen's avatar
Sander Bollen committed
28
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
29
 * Extension for VariantEffectPredictor
Sander Bollen's avatar
Sander Bollen committed
30
31
 * Created by ahbbollen on 15-1-15.
 */
Sander Bollen's avatar
Sander Bollen committed
32
class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFunction with Reference with Version with Summarizable {
Sander Bollen's avatar
Sander Bollen committed
33

Sander Bollen's avatar
Sander Bollen committed
34
  executable = config("exe", namespace = "perl", default = "perl")
Peter van 't Hof's avatar
Peter van 't Hof committed
35
  var vepScript: String = config("vep_script")
Sander Bollen's avatar
Sander Bollen committed
36
37

  @Input(doc = "input VCF", required = true)
38
  var input: File = null
Sander Bollen's avatar
Sander Bollen committed
39
40

  @Output(doc = "output file", required = true)
41
  var output: File = null
Sander Bollen's avatar
Sander Bollen committed
42

43
44
  def versionRegex = """version (\d*)""".r
  def versionCommand = executable + " " + vepScript + " --help"
Sander Bollen's avatar
Sander Bollen committed
45
46

  //Boolean vars
47
48
  var v: Boolean = config("v", default = true, freeVar = false)
  var q: Boolean = config("q", default = false, freeVar = false)
Sander Bollen's avatar
Sander Bollen committed
49
  var offline: Boolean = config("offline", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
50
  var noProgress: Boolean = config("no_progress", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
51
  var everything: Boolean = config("everything", default = false)
Sander Bollen's avatar
Sander Bollen committed
52
  var force: Boolean = config("force", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
53
54
  var noStats: Boolean = config("no_stats", default = false)
  var statsText: Boolean = config("stats_text", default = true)
Sander Bollen's avatar
Sander Bollen committed
55
56
57
58
  var html: Boolean = config("html", default = false)
  var cache: Boolean = config("cache", default = false)
  var humdiv: Boolean = config("humdiv", default = false)
  var regulatory: Boolean = config("regulatory", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
59
  var cellType: Boolean = config("cell_type", default = false)
Sander Bollen's avatar
Sander Bollen committed
60
  var phased: Boolean = config("phased", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
61
  var alleleNumber: Boolean = config("allele_number", default = false)
Sander Bollen's avatar
Sander Bollen committed
62
63
  var numbers: Boolean = config("numbers", default = false)
  var domains: Boolean = config("domains", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
64
  var noEscape: Boolean = config("no_escape", default = false)
Sander Bollen's avatar
Sander Bollen committed
65
66
67
68
69
70
71
72
  var hgvs: Boolean = config("hgvs", default = false)
  var protein: Boolean = config("protein", default = false)
  var symbol: Boolean = config("symbol", default = false)
  var ccds: Boolean = config("ccds", default = false)
  var uniprot: Boolean = config("uniprot", default = false)
  var tsl: Boolean = config("tsl", default = false)
  var canonical: Boolean = config("canonical", default = false)
  var biotype: Boolean = config("biotype", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
73
74
75
76
  var xrefRefseq: Boolean = config("xref_refseq", default = false)
  var checkExisting: Boolean = config("check_existing", default = false)
  var checkAlleles: Boolean = config("check_alleles", default = false)
  var checkSvs: Boolean = config("svs", default = false)
Sander Bollen's avatar
Sander Bollen committed
77
  var gmaf: Boolean = config("gmaf", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
78
79
80
  var maf1kg: Boolean = config("maf_1kg", default = false)
  var mafEsp: Boolean = config("maf_esp", default = false)
  var oldMap: Boolean = config("old_maf", default = false)
Sander Bollen's avatar
Sander Bollen committed
81
  var pubmed: Boolean = config("pubmed", default = false)
Sander Bollen's avatar
Sander Bollen committed
82

83
84
  var vcf: Boolean = config("vcf", default = true, freeVar = false)
  var json: Boolean = config("json", default = false, freeVar = false)
Sander Bollen's avatar
Sander Bollen committed
85
  var gvf: Boolean = config("gvf", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
86
87
88
  var checkRef: Boolean = config("check_ref", default = false)
  var codingOnly: Boolean = config("coding_only", default = false)
  var noIntergenic: Boolean = config("no_intergenic", default = false)
Sander Bollen's avatar
Sander Bollen committed
89
  var pick: Boolean = config("pick", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
90
91
92
93
94
  var pickAllele: Boolean = config("pick_allele", default = false)
  var flagPick: Boolean = config("flag_pick", default = false)
  var flagPickAllele: Boolean = config("flag_pick_allele", default = false)
  var perGene: Boolean = config("per_gene", default = false)
  var mostSevere: Boolean = config("most_severe", default = false)
Sander Bollen's avatar
Sander Bollen committed
95
  var summary: Boolean = config("summary", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
96
97
98
  var filterCommon: Boolean = config("filter_common", default = false)
  var checkFrequency: Boolean = config("check_frequency", default = false)
  var allowNonVariant: Boolean = config("allow_non_variant", default = false)
Sander Bollen's avatar
Sander Bollen committed
99
100
  var database: Boolean = config("database", default = false)
  var genomes: Boolean = config("genomes", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
101
  var gencodeBasic: Boolean = config("gencode_basic", default = false)
Sander Bollen's avatar
Sander Bollen committed
102
103
  var refseq: Boolean = config("refseq", default = false)
  var merged: Boolean = config("merged", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
104
  var allRefseq: Boolean = config("all_refseq", default = false)
Sander Bollen's avatar
Sander Bollen committed
105
  var lrg: Boolean = config("lrg", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
106
107
  var noWholeGenome: Boolean = config("no_whole_genome", default = false)
  var skibDbCheck: Boolean = config("skip_db_check", default = false)
Sander Bollen's avatar
Sander Bollen committed
108
109

  // Textual args
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
110
  var vepConfig: Option[String] = config("config", freeVar = false)
111
  var species: Option[String] = config("species", freeVar = false)
Sander Bollen's avatar
Sander Bollen committed
112
113
114
  var assembly: Option[String] = config("assembly")
  var format: Option[String] = config("format")
  var dir: Option[String] = config("dir")
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
115
116
  var dirCache: Option[String] = config("dir_cache")
  var dirPlugins: Option[String] = config("dir_plugins")
Sander Bollen's avatar
Sander Bollen committed
117
118
119
  var fasta: Option[String] = config("fasta")
  var sift: Option[String] = config("sift")
  var polyphen: Option[String] = config("polyphen")
120
  var custom: List[String] = config("custom", default = Nil)
Sander Bollen's avatar
Sander Bollen committed
121
122
123
124
125
126
  var plugin: List[String] = config("plugin", default = Nil)
  var individual: Option[String] = config("individual")
  var fields: Option[String] = config("fields")
  var convert: Option[String] = config("convert")
  var terms: Option[String] = config("terms")
  var chr: Option[String] = config("chr")
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
127
128
129
130
  var pickOrder: Option[String] = config("pick_order")
  var freqPop: Option[String] = config("check_pop")
  var freqGtLt: Option[String] = config("freq_gt_lt")
  var freqFilter: Option[String] = config("freq_filter")
Sander Bollen's avatar
Sander Bollen committed
131
132
133
134
135
136
137
  var filter: Option[String] = config("filter")
  var host: Option[String] = config("host")
  var user: Option[String] = config("user")
  var password: Option[String] = config("password")
  var registry: Option[String] = config("registry")
  var build: Option[String] = config("build")
  var compress: Option[String] = config("compress")
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
138
  var cacheRegionSize: Option[String] = config("cache_region_size")
Sander Bollen's avatar
Sander Bollen committed
139
140

  // Numeric args
Peter van 't Hof's avatar
Peter van 't Hof committed
141
  override def defaultThreads: Int = config("fork", default = 2)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
142
143
  var cacheVersion: Option[Int] = config("cache_version")
  var freqFreq: Option[Float] = config("freq_freq")
Sander Bollen's avatar
Sander Bollen committed
144
  var port: Option[Int] = config("port")
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
145
146
  var dbVersion: Option[Int] = config("db_version")
  var bufferSize: Option[Int] = config("buffer_size")
Sander Bollen's avatar
Sander Bollen committed
147
148
  // ought to be a flag, but is BUG in VEP; becomes numeric ("1" is true)
  var failed: Option[Int] = config("failed")
Sander Bollen's avatar
Sander Bollen committed
149

150
151
  override def defaultCoreMemory = 4.0

152
153
154
  @Output
  private var _summary: File = null

Peter van 't Hof's avatar
Peter van 't Hof committed
155
156
  override def beforeGraph(): Unit = {
    super.beforeGraph()
157
    if (!cache && !database) {
Sander Bollen's avatar
Sander Bollen committed
158
      Logging.addError("Must either set 'cache' or 'database' to true for VariantEffectPredictor")
Sander Bollen's avatar
Sander Bollen committed
159
    } else if (cache && dir.isEmpty) {
Sander Bollen's avatar
Sander Bollen committed
160
      Logging.addError("Must supply 'dir_cache' to cache for VariantEffectPredictor")
161
    }
162
    if (statsText) _summary = new File(output.getAbsolutePath + "_summary.txt")
163
164
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
165
  /** Returns command to execute */
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
  def cmdLine = {
    if (input.exists() && VcfUtils.vcfFileIsEmpty(input)) {
      val zcat = Zcat(this, input, output)
      zcat.cmdLine
    } else required(executable) +
      required(vepScript) +
      required("-i", input) +
      required("-o", output) +
      conditional(v, "-v") +
      conditional(q, "-q") +
      conditional(offline, "--offline") +
      conditional(noProgress, "--no_progress") +
      conditional(everything, "--everything") +
      conditional(force, "--force_overwrite") +
      conditional(noStats, "--no_stats") +
      conditional(statsText, "--stats_text") +
      conditional(html, "--html") +
      conditional(cache, "--cache") +
      conditional(humdiv, "--humdiv") +
      conditional(regulatory, "--regulatory") +
      conditional(cellType, "--cel_type") +
      conditional(phased, "--phased") +
      conditional(alleleNumber, "--allele_number") +
      conditional(numbers, "--numbers") +
      conditional(domains, "--domains") +
      conditional(noEscape, "--no_escape") +
      conditional(hgvs, "--hgvs") +
      conditional(protein, "--protein") +
      conditional(symbol, "--symbol") +
      conditional(ccds, "--ccds") +
      conditional(uniprot, "--uniprot") +
      conditional(tsl, "--tsl") +
      conditional(canonical, "--canonical") +
      conditional(biotype, "--biotype") +
      conditional(xrefRefseq, "--xref_refseq") +
      conditional(checkExisting, "--check_existing") +
      conditional(checkAlleles, "--check_alleles") +
      conditional(checkSvs, "--check_svs") +
      conditional(gmaf, "--gmaf") +
      conditional(maf1kg, "--maf_1kg") +
      conditional(mafEsp, "--maf_esp") +
      conditional(pubmed, "--pubmed") +
      conditional(vcf, "--vcf") +
      conditional(json, "--json") +
      conditional(gvf, "--gvf") +
      conditional(checkRef, "--check_ref") +
      conditional(codingOnly, "--coding_only") +
      conditional(noIntergenic, "--no_intergenic") +
      conditional(pick, "--pick") +
      conditional(pickAllele, "--pick_allele") +
      conditional(flagPick, "--flag_pick") +
      conditional(flagPickAllele, "--flag_pick_allele") +
      conditional(perGene, "--per_gene") +
      conditional(mostSevere, "--most_severe") +
      conditional(summary, "--summary") +
      conditional(filterCommon, "--filter_common") +
      conditional(checkFrequency, "--check_frequency") +
      conditional(allowNonVariant, "--allow_non_variant") +
      conditional(database, "--database") +
      conditional(genomes, "--genomes") +
      conditional(gencodeBasic, "--gencode_basic") +
      conditional(refseq, "--refseq") +
      conditional(merged, "--merged") +
      conditional(allRefseq, "--all_refseq") +
      conditional(lrg, "--lrg") +
      conditional(noWholeGenome, "--no_whole_genome") +
      conditional(skibDbCheck, "--skip_db_check") +
      optional("--config", vepConfig) +
      optional("--species", species) +
      optional("--assembly", assembly) +
      optional("--format", format) +
      optional("--dir", dir) +
      optional("--dir_cache", dirCache) +
      optional("--dir_plugins", dirPlugins) +
      optional("--fasta", fasta) +
      optional("--sift", sift) +
      optional("--polyphen", polyphen) +
      repeat("--custom", custom) +
      repeat("--plugin", plugin) +
      optional("--individual", individual) +
      optional("--fields", fields) +
      optional("--convert", convert) +
      optional("--terms", terms) +
      optional("--chr", chr) +
      optional("--pick_order", pickOrder) +
      optional("--freq_pop", freqPop) +
      optional("--freq_gt_lt", freqGtLt) +
      optional("--freq_filter", freqFilter) +
      optional("--filter", filter) +
      optional("--host", host) +
      optional("--user", user) +
      optional("--password", password) +
      optional("--registry", registry) +
      optional("--build", build) +
      optional("--compress", compress) +
      optional("--cache_region_size", cacheRegionSize) +
      optional("--fork", threads) +
      optional("--cache_version", cacheVersion) +
      optional("--freq_freq", freqFreq) +
      optional("--port", port) +
      optional("--db_version", dbVersion) +
      optional("--buffer_size", bufferSize) +
      optional("--failed", failed)
  }
Sander Bollen's avatar
Sander Bollen committed
270

Sander Bollen's avatar
Sander Bollen committed
271
272
273
  def summaryFiles: Map[String, File] = Map()

  def summaryStats: Map[String, Any] = {
274
275
    val statsFile = new File(output.getAbsolutePath + "_summary.txt")
    if (statsText && statsFile.exists()) parseStatsFile(statsFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
276
    else Map()
Sander Bollen's avatar
Sander Bollen committed
277
278
  }

279
280
281
  protected val removeOnConflict = Set("Output_file", "Command_line_options", "Run_time", "Start_time", "End_time", "Input_file_(format)", "Novel_/_existing_variants")
  protected val nonNumber = Set("VEP_version_(API)", "Cache/Database", "Species")

282
  override def resolveSummaryConflict(v1: Any, v2: Any, key: String): Any = {
283
284
285
286
287
    if (removeOnConflict.contains(key)) None
    else if (nonNumber.contains(key)) v1
    else {
      (v1, v2) match {
        case (x1: Int, x2: Int) => x1 + x2
Peter van 't Hof's avatar
Peter van 't Hof committed
288
        case _                  => throw new IllegalStateException(s"Value are not Int's, unable to sum them up, key: $key, v1: $v1, v2: $v2")
289
      }
290
291
292
    }
  }

Sander Bollen's avatar
Sander Bollen committed
293
  def parseStatsFile(file: File): Map[String, Any] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
294
    val reader = Source.fromFile(file)
Peter van 't Hof's avatar
Peter van 't Hof committed
295
    val contents = reader.getLines().filter(_ != "").toArray
Peter van 't Hof's avatar
Peter van 't Hof committed
296
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
297

Peter van 't Hof's avatar
Peter van 't Hof committed
298
    def isHeader(line: String) = line.startsWith("[") && line.endsWith("]")
Sander Bollen's avatar
Sander Bollen committed
299

Peter van 't Hof's avatar
Peter van 't Hof committed
300
301
    val headers = contents.zipWithIndex
      .filter(x => x._1.startsWith("[") && x._1.endsWith("]"))
Sander Bollen's avatar
Sander Bollen committed
302

Peter van 't Hof's avatar
Peter van 't Hof committed
303
304
    (for ((header, headerIndex) <- headers) yield {
      val name = header.stripPrefix("[").stripSuffix("]")
Peter van 't Hof's avatar
Peter van 't Hof committed
305
      name.replaceAll(" ", "_") -> (contents.drop(headerIndex + 1).takeWhile(!isHeader(_)).map { line =>
Peter van 't Hof's avatar
Peter van 't Hof committed
306
        val values = line.split("\t", 2)
Peter van 't Hof's avatar
Peter van 't Hof committed
307
        values.head.replaceAll(" ", "_") -> tryToParseNumber(values.last).getOrElse(0)
Peter van 't Hof's avatar
Peter van 't Hof committed
308
309
310
      }.toMap)
    }).toMap
  }
Sander Bollen's avatar
Sander Bollen committed
311
}