VariantEffectPredictor.scala 13.6 KB
Newer Older
bow's avatar
bow committed
1
2
3
4
5
6
7
8
9
10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
bow's avatar
bow committed
12
13
14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Sander Bollen's avatar
Sander Bollen committed
15
16
17
18
package nl.lumc.sasc.biopet.extensions

import java.io.File

Sander Bollen's avatar
Sander Bollen committed
19
import nl.lumc.sasc.biopet.core.summary.Summarizable
Peter van 't Hof's avatar
Peter van 't Hof committed
20
import nl.lumc.sasc.biopet.utils.{ Logging, VcfUtils, tryToParseNumber }
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import nl.lumc.sasc.biopet.utils.config.Configurable
Peter van 't Hof's avatar
Peter van 't Hof committed
22
23
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Reference, Version }
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
Sander Bollen's avatar
Sander Bollen committed
24

Sander Bollen's avatar
Sander Bollen committed
25
26
import scala.io.Source

Sander Bollen's avatar
Sander Bollen committed
27
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
28
 * Extension for VariantEffectPredictor
Sander Bollen's avatar
Sander Bollen committed
29
30
 * Created by ahbbollen on 15-1-15.
 */
Sander Bollen's avatar
Sander Bollen committed
31
class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFunction with Reference with Version with Summarizable {
Sander Bollen's avatar
Sander Bollen committed
32

Sander Bollen's avatar
Sander Bollen committed
33
  executable = config("exe", namespace = "perl", default = "perl")
Peter van 't Hof's avatar
Peter van 't Hof committed
34
  var vepScript: String = config("vep_script")
Sander Bollen's avatar
Sander Bollen committed
35
36

  @Input(doc = "input VCF", required = true)
37
  var input: File = null
Sander Bollen's avatar
Sander Bollen committed
38
39

  @Output(doc = "output file", required = true)
40
  var output: File = null
Sander Bollen's avatar
Sander Bollen committed
41

42
43
  def versionRegex = """version (\d*)""".r
  def versionCommand = executable + " " + vepScript + " --help"
Sander Bollen's avatar
Sander Bollen committed
44
45

  //Boolean vars
46
47
  var v: Boolean = config("v", default = true, freeVar = false)
  var q: Boolean = config("q", default = false, freeVar = false)
Sander Bollen's avatar
Sander Bollen committed
48
  var offline: Boolean = config("offline", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
49
  var noProgress: Boolean = config("no_progress", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
50
  var everything: Boolean = config("everything", default = false)
Sander Bollen's avatar
Sander Bollen committed
51
  var force: Boolean = config("force", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
52
53
  var noStats: Boolean = config("no_stats", default = false)
  var statsText: Boolean = config("stats_text", default = true)
Sander Bollen's avatar
Sander Bollen committed
54
55
56
57
  var html: Boolean = config("html", default = false)
  var cache: Boolean = config("cache", default = false)
  var humdiv: Boolean = config("humdiv", default = false)
  var regulatory: Boolean = config("regulatory", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
58
  var cellType: Boolean = config("cell_type", default = false)
Sander Bollen's avatar
Sander Bollen committed
59
  var phased: Boolean = config("phased", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
60
  var alleleNumber: Boolean = config("allele_number", default = false)
Sander Bollen's avatar
Sander Bollen committed
61
62
  var numbers: Boolean = config("numbers", default = false)
  var domains: Boolean = config("domains", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
63
  var noEscape: Boolean = config("no_escape", default = false)
Sander Bollen's avatar
Sander Bollen committed
64
65
66
67
68
69
70
71
  var hgvs: Boolean = config("hgvs", default = false)
  var protein: Boolean = config("protein", default = false)
  var symbol: Boolean = config("symbol", default = false)
  var ccds: Boolean = config("ccds", default = false)
  var uniprot: Boolean = config("uniprot", default = false)
  var tsl: Boolean = config("tsl", default = false)
  var canonical: Boolean = config("canonical", default = false)
  var biotype: Boolean = config("biotype", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
72
73
74
75
  var xrefRefseq: Boolean = config("xref_refseq", default = false)
  var checkExisting: Boolean = config("check_existing", default = false)
  var checkAlleles: Boolean = config("check_alleles", default = false)
  var checkSvs: Boolean = config("svs", default = false)
Sander Bollen's avatar
Sander Bollen committed
76
  var gmaf: Boolean = config("gmaf", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
77
78
79
  var maf1kg: Boolean = config("maf_1kg", default = false)
  var mafEsp: Boolean = config("maf_esp", default = false)
  var oldMap: Boolean = config("old_maf", default = false)
Sander Bollen's avatar
Sander Bollen committed
80
  var pubmed: Boolean = config("pubmed", default = false)
Sander Bollen's avatar
Sander Bollen committed
81

82
83
  var vcf: Boolean = config("vcf", default = true, freeVar = false)
  var json: Boolean = config("json", default = false, freeVar = false)
Sander Bollen's avatar
Sander Bollen committed
84
  var gvf: Boolean = config("gvf", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
85
86
87
  var checkRef: Boolean = config("check_ref", default = false)
  var codingOnly: Boolean = config("coding_only", default = false)
  var noIntergenic: Boolean = config("no_intergenic", default = false)
Sander Bollen's avatar
Sander Bollen committed
88
  var pick: Boolean = config("pick", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
89
90
91
92
93
  var pickAllele: Boolean = config("pick_allele", default = false)
  var flagPick: Boolean = config("flag_pick", default = false)
  var flagPickAllele: Boolean = config("flag_pick_allele", default = false)
  var perGene: Boolean = config("per_gene", default = false)
  var mostSevere: Boolean = config("most_severe", default = false)
Sander Bollen's avatar
Sander Bollen committed
94
  var summary: Boolean = config("summary", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
95
96
97
  var filterCommon: Boolean = config("filter_common", default = false)
  var checkFrequency: Boolean = config("check_frequency", default = false)
  var allowNonVariant: Boolean = config("allow_non_variant", default = false)
Sander Bollen's avatar
Sander Bollen committed
98
99
  var database: Boolean = config("database", default = false)
  var genomes: Boolean = config("genomes", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
100
  var gencodeBasic: Boolean = config("gencode_basic", default = false)
Sander Bollen's avatar
Sander Bollen committed
101
102
  var refseq: Boolean = config("refseq", default = false)
  var merged: Boolean = config("merged", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
103
  var allRefseq: Boolean = config("all_refseq", default = false)
Sander Bollen's avatar
Sander Bollen committed
104
  var lrg: Boolean = config("lrg", default = false)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
105
106
  var noWholeGenome: Boolean = config("no_whole_genome", default = false)
  var skibDbCheck: Boolean = config("skip_db_check", default = false)
Sander Bollen's avatar
Sander Bollen committed
107
108

  // Textual args
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
109
  var vepConfig: Option[String] = config("config", freeVar = false)
110
  var species: Option[String] = config("species", freeVar = false)
Sander Bollen's avatar
Sander Bollen committed
111
112
113
  var assembly: Option[String] = config("assembly")
  var format: Option[String] = config("format")
  var dir: Option[String] = config("dir")
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
114
115
  var dirCache: Option[String] = config("dir_cache")
  var dirPlugins: Option[String] = config("dir_plugins")
Sander Bollen's avatar
Sander Bollen committed
116
117
118
  var fasta: Option[String] = config("fasta")
  var sift: Option[String] = config("sift")
  var polyphen: Option[String] = config("polyphen")
119
  var custom: List[String] = config("custom", default = Nil)
Sander Bollen's avatar
Sander Bollen committed
120
121
122
123
124
125
  var plugin: List[String] = config("plugin", default = Nil)
  var individual: Option[String] = config("individual")
  var fields: Option[String] = config("fields")
  var convert: Option[String] = config("convert")
  var terms: Option[String] = config("terms")
  var chr: Option[String] = config("chr")
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
126
127
128
129
  var pickOrder: Option[String] = config("pick_order")
  var freqPop: Option[String] = config("check_pop")
  var freqGtLt: Option[String] = config("freq_gt_lt")
  var freqFilter: Option[String] = config("freq_filter")
Sander Bollen's avatar
Sander Bollen committed
130
131
132
133
134
135
136
  var filter: Option[String] = config("filter")
  var host: Option[String] = config("host")
  var user: Option[String] = config("user")
  var password: Option[String] = config("password")
  var registry: Option[String] = config("registry")
  var build: Option[String] = config("build")
  var compress: Option[String] = config("compress")
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
137
  var cacheRegionSize: Option[String] = config("cache_region_size")
Sander Bollen's avatar
Sander Bollen committed
138
139

  // Numeric args
Peter van 't Hof's avatar
Peter van 't Hof committed
140
  override def defaultThreads: Int = config("fork", default = 2)
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
141
142
  var cacheVersion: Option[Int] = config("cache_version")
  var freqFreq: Option[Float] = config("freq_freq")
Sander Bollen's avatar
Sander Bollen committed
143
  var port: Option[Int] = config("port")
Sander van der Zeeuw's avatar
Sander van der Zeeuw committed
144
145
  var dbVersion: Option[Int] = config("db_version")
  var bufferSize: Option[Int] = config("buffer_size")
Sander Bollen's avatar
Sander Bollen committed
146
147
  // ought to be a flag, but is BUG in VEP; becomes numeric ("1" is true)
  var failed: Option[Int] = config("failed")
Sander Bollen's avatar
Sander Bollen committed
148

149
150
  override def defaultCoreMemory = 4.0

151
152
153
  @Output
  private var _summary: File = null

Peter van 't Hof's avatar
Peter van 't Hof committed
154
155
  override def beforeGraph(): Unit = {
    super.beforeGraph()
156
    if (!cache && !database) {
Sander Bollen's avatar
Sander Bollen committed
157
      Logging.addError("Must either set 'cache' or 'database' to true for VariantEffectPredictor")
Sander Bollen's avatar
Sander Bollen committed
158
    } else if (cache && dir.isEmpty) {
Sander Bollen's avatar
Sander Bollen committed
159
      Logging.addError("Must supply 'dir_cache' to cache for VariantEffectPredictor")
160
    }
161
    if (statsText) _summary = new File(output.getAbsolutePath + "_summary.txt")
162
163
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
164
  /** Returns command to execute */
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
  def cmdLine = {
    if (input.exists() && VcfUtils.vcfFileIsEmpty(input)) {
      val zcat = Zcat(this, input, output)
      zcat.cmdLine
    } else required(executable) +
      required(vepScript) +
      required("-i", input) +
      required("-o", output) +
      conditional(v, "-v") +
      conditional(q, "-q") +
      conditional(offline, "--offline") +
      conditional(noProgress, "--no_progress") +
      conditional(everything, "--everything") +
      conditional(force, "--force_overwrite") +
      conditional(noStats, "--no_stats") +
      conditional(statsText, "--stats_text") +
      conditional(html, "--html") +
      conditional(cache, "--cache") +
      conditional(humdiv, "--humdiv") +
      conditional(regulatory, "--regulatory") +
      conditional(cellType, "--cel_type") +
      conditional(phased, "--phased") +
      conditional(alleleNumber, "--allele_number") +
      conditional(numbers, "--numbers") +
      conditional(domains, "--domains") +
      conditional(noEscape, "--no_escape") +
      conditional(hgvs, "--hgvs") +
      conditional(protein, "--protein") +
      conditional(symbol, "--symbol") +
      conditional(ccds, "--ccds") +
      conditional(uniprot, "--uniprot") +
      conditional(tsl, "--tsl") +
      conditional(canonical, "--canonical") +
      conditional(biotype, "--biotype") +
      conditional(xrefRefseq, "--xref_refseq") +
      conditional(checkExisting, "--check_existing") +
      conditional(checkAlleles, "--check_alleles") +
      conditional(checkSvs, "--check_svs") +
      conditional(gmaf, "--gmaf") +
      conditional(maf1kg, "--maf_1kg") +
      conditional(mafEsp, "--maf_esp") +
      conditional(pubmed, "--pubmed") +
      conditional(vcf, "--vcf") +
      conditional(json, "--json") +
      conditional(gvf, "--gvf") +
      conditional(checkRef, "--check_ref") +
      conditional(codingOnly, "--coding_only") +
      conditional(noIntergenic, "--no_intergenic") +
      conditional(pick, "--pick") +
      conditional(pickAllele, "--pick_allele") +
      conditional(flagPick, "--flag_pick") +
      conditional(flagPickAllele, "--flag_pick_allele") +
      conditional(perGene, "--per_gene") +
      conditional(mostSevere, "--most_severe") +
      conditional(summary, "--summary") +
      conditional(filterCommon, "--filter_common") +
      conditional(checkFrequency, "--check_frequency") +
      conditional(allowNonVariant, "--allow_non_variant") +
      conditional(database, "--database") +
      conditional(genomes, "--genomes") +
      conditional(gencodeBasic, "--gencode_basic") +
      conditional(refseq, "--refseq") +
      conditional(merged, "--merged") +
      conditional(allRefseq, "--all_refseq") +
      conditional(lrg, "--lrg") +
      conditional(noWholeGenome, "--no_whole_genome") +
      conditional(skibDbCheck, "--skip_db_check") +
      optional("--config", vepConfig) +
      optional("--species", species) +
      optional("--assembly", assembly) +
      optional("--format", format) +
      optional("--dir", dir) +
      optional("--dir_cache", dirCache) +
      optional("--dir_plugins", dirPlugins) +
      optional("--fasta", fasta) +
      optional("--sift", sift) +
      optional("--polyphen", polyphen) +
      repeat("--custom", custom) +
      repeat("--plugin", plugin) +
      optional("--individual", individual) +
      optional("--fields", fields) +
      optional("--convert", convert) +
      optional("--terms", terms) +
      optional("--chr", chr) +
      optional("--pick_order", pickOrder) +
      optional("--freq_pop", freqPop) +
      optional("--freq_gt_lt", freqGtLt) +
      optional("--freq_filter", freqFilter) +
      optional("--filter", filter) +
      optional("--host", host) +
      optional("--user", user) +
      optional("--password", password) +
      optional("--registry", registry) +
      optional("--build", build) +
      optional("--compress", compress) +
      optional("--cache_region_size", cacheRegionSize) +
      optional("--fork", threads) +
      optional("--cache_version", cacheVersion) +
      optional("--freq_freq", freqFreq) +
      optional("--port", port) +
      optional("--db_version", dbVersion) +
      optional("--buffer_size", bufferSize) +
      optional("--failed", failed)
  }
Sander Bollen's avatar
Sander Bollen committed
269

Sander Bollen's avatar
Sander Bollen committed
270
271
272
  def summaryFiles: Map[String, File] = Map()

  def summaryStats: Map[String, Any] = {
273
274
    val statsFile = new File(output.getAbsolutePath + "_summary.txt")
    if (statsText && statsFile.exists()) parseStatsFile(statsFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
275
    else Map()
Sander Bollen's avatar
Sander Bollen committed
276
277
  }

278
279
280
  protected val removeOnConflict = Set("Output_file", "Command_line_options", "Run_time", "Start_time", "End_time", "Input_file_(format)", "Novel_/_existing_variants")
  protected val nonNumber = Set("VEP_version_(API)", "Cache/Database", "Species")

281
  override def resolveSummaryConflict(v1: Any, v2: Any, key: String): Any = {
282
283
284
285
286
    if (removeOnConflict.contains(key)) None
    else if (nonNumber.contains(key)) v1
    else {
      (v1, v2) match {
        case (x1: Int, x2: Int) => x1 + x2
Peter van 't Hof's avatar
Peter van 't Hof committed
287
        case _                  => throw new IllegalStateException(s"Value are not Int's, unable to sum them up, key: $key, v1: $v1, v2: $v2")
288
      }
289
290
291
    }
  }

Sander Bollen's avatar
Sander Bollen committed
292
  def parseStatsFile(file: File): Map[String, Any] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
293
    val reader = Source.fromFile(file)
Peter van 't Hof's avatar
Peter van 't Hof committed
294
    val contents = reader.getLines().filter(_ != "").toArray
Peter van 't Hof's avatar
Peter van 't Hof committed
295
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
296

Peter van 't Hof's avatar
Peter van 't Hof committed
297
    def isHeader(line: String) = line.startsWith("[") && line.endsWith("]")
Sander Bollen's avatar
Sander Bollen committed
298

Peter van 't Hof's avatar
Peter van 't Hof committed
299
300
    val headers = contents.zipWithIndex
      .filter(x => x._1.startsWith("[") && x._1.endsWith("]"))
Sander Bollen's avatar
Sander Bollen committed
301

Peter van 't Hof's avatar
Peter van 't Hof committed
302
303
    (for ((header, headerIndex) <- headers) yield {
      val name = header.stripPrefix("[").stripSuffix("]")
Peter van 't Hof's avatar
Peter van 't Hof committed
304
      name.replaceAll(" ", "_") -> (contents.drop(headerIndex + 1).takeWhile(!isHeader(_)).map { line =>
Peter van 't Hof's avatar
Peter van 't Hof committed
305
        val values = line.split("\t", 2)
Peter van 't Hof's avatar
Peter van 't Hof committed
306
        values.head.replaceAll(" ", "_") -> tryToParseNumber(values.last).getOrElse(0)
Peter van 't Hof's avatar
Peter van 't Hof committed
307
308
309
      }.toMap)
    }).toMap
  }
Sander Bollen's avatar
Sander Bollen committed
310
}