Gsnap.scala 19.2 KB
Newer Older
bow's avatar
bow committed
1
/**
bow's avatar
bow committed
2
3
4
5
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
bow's avatar
bow committed
6
 *
bow's avatar
bow committed
7
8
9
10
11
12
13
14
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
bow's avatar
bow committed
15
16
17
18
 */
package nl.lumc.sasc.biopet.extensions

import java.io.File
bow's avatar
bow committed
19
20
21

import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument }

Peter van 't Hof's avatar
Peter van 't Hof committed
22
import nl.lumc.sasc.biopet.core.{ Reference, BiopetCommandLineFunction }
bow's avatar
bow committed
23
24
25
26
import nl.lumc.sasc.biopet.core.config.Configurable

/**
 * Wrapper for the gsnap command line tool
bow's avatar
bow committed
27
 * Written based on gsnap version 2014-05-15
bow's avatar
bow committed
28
 */
29
class Gsnap(val root: Configurable) extends BiopetCommandLineFunction with Reference {
bow's avatar
bow committed
30
31

  /** default executable */
bow's avatar
bow committed
32
  executable = config("exe", default = "gsnap", freeVar = false)
bow's avatar
bow committed
33

bow's avatar
bow committed
34
  /** default threads */
Peter van 't Hof's avatar
Peter van 't Hof committed
35
  override def defaultThreads = 8
bow's avatar
bow committed
36
37

  /** default vmem for cluster jobs */
Peter van 't Hof's avatar
Peter van 't Hof committed
38
  override def defaultCoreMemory = 10.0
bow's avatar
bow committed
39

bow's avatar
bow committed
40
  /** input file */
41
  @Input(doc = "Input FASTQ file(s)", required = true) //var input: List[File] = _
bow's avatar
bow committed
42
  var input: List[File] = List.empty[File]
bow's avatar
bow committed
43
44
45

  /** output file */
  @Output(doc = "Output alignment file", required = true)
bow's avatar
bow committed
46
  var output: File = null
bow's avatar
bow committed
47
48

  /** genome directory */
bow's avatar
bow committed
49
50
  @Argument(doc = "Directory of genome database")
  var dir: Option[File] = config("dir")
bow's avatar
bow committed
51
52

  /** genome database */
bow's avatar
bow committed
53
54
  @Argument(doc = "Genome database name", required = true)
  var db: String = config("db")
bow's avatar
bow committed
55
56
57
58
59
60
61
62
63
64
65

  /** whether to use a suffix array, which will give increased speed */
  var use_sarray: Option[Int] = config("use_sarray")

  /** kmer size to use in genome database (allowed values: 16 or less) */
  var kmer: Option[Int] = config("kmer")

  /** sampling to use in genome database */
  var sampling: Option[Int] = config("sampling")

  /** process only the i-th out of every n sequences */
bow's avatar
bow committed
66
  var part: Option[String] = config("part")
bow's avatar
bow committed
67
68
69
70
71
72
73
74

  /** size of input buffer (program reads this many sequences at a time)*/
  var input_buffer_size: Option[Int] = config("input_buffer_size")

  /** amount of barcode to remove from start of read */
  var barcode_length: Option[Int] = config("barcode_length")

  /** orientation of paired-end reads */
bow's avatar
bow committed
75
  var orientation: Option[String] = config("orientation")
bow's avatar
bow committed
76
77
78
79
80
81
82
83

  /** starting position of identifier in fastq header, space-delimited (>= 1) */
  var fastq_id_start: Option[Int] = config("fastq_id_start")

  /** ending position of identifier in fastq header, space-delimited (>= 1) */
  var fastq_id_end: Option[Int] = config("fastq_id_end")

  /** when multiple fastq files are provided on the command line, gsnap assumes */
bow's avatar
bow committed
84
  var force_single_end: Boolean = config("force_single_end", default = false)
bow's avatar
bow committed
85
86

  /** skips reads marked by the illumina chastity program.  expecting a string */
bow's avatar
bow committed
87
  var filter_chastity: Option[String] = config("filter_chastity")
bow's avatar
bow committed
88
89

  /** allows accession names of reads to mismatch in paired-end files */
bow's avatar
bow committed
90
  var allow_pe_name_mismatch: Boolean = config("allow_pe_name_mismatch", default = false)
bow's avatar
bow committed
91
92

  /** uncompress gzipped input files */
bow's avatar
bow committed
93
  var gunzip: Boolean = config("gunzip", default = false)
bow's avatar
bow committed
94
95

  /** uncompress bzip2-compressed input files */
bow's avatar
bow committed
96
  var bunzip2: Boolean = config("bunzip2", default = false)
bow's avatar
bow committed
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143

  /** batch mode (default = 2) */
  var batch: Option[Int] = config("batch")

  /** whether to expand the genomic offsets index */
  var expand_offsets: Option[Int] = config("expand_offsets")

  /** maximum number of mismatches allowed (if not specified, then */
  var max_mismatches: Option[Float] = config("max_mismatches")

  /** whether to count unknown (n) characters in the query as a mismatch */
  var query_unk_mismatch: Option[Int] = config("query_unk_mismatch")

  /** whether to count unknown (n) characters in the genome as a mismatch */
  var genome_unk_mismatch: Option[Int] = config("genome_unk_mismatch")

  /** maximum number of alignments to find (default 1000) */
  var maxsearch: Option[Int] = config("maxsearch")

  /** threshold for computing a terminal alignment (from one end of the */
  var terminal_threshold: Option[Int] = config("terminal_threshold")

  /** threshold alignment length in bp for a terminal alignment result to be printed (in bp) */
  var terminal_output_minlength: Option[Int] = config("terminal_output_minlength")

  /** penalty for an indel (default 2) */
  var indel_penalty: Option[Int] = config("indel_penalty")

  /** minimum length at end required for indel alignments (default 4) */
  var indel_endlength: Option[Int] = config("indel_endlength")

  /** maximum number of middle insertions allowed (default 9) */
  var max_middle_insertions: Option[Int] = config("max_middle_insertions")

  /** maximum number of middle deletions allowed (default 30) */
  var max_middle_deletions: Option[Int] = config("max_middle_deletions")

  /** maximum number of end insertions allowed (default 3) */
  var max_end_insertions: Option[Int] = config("max_end_insertions")

  /** maximum number of end deletions allowed (default 6) */
  var max_end_deletions: Option[Int] = config("max_end_deletions")

  /** report suboptimal hits beyond best hit (default 0) */
  var suboptimal_levels: Option[Int] = config("suboptimal_levels")

  /** method for removing adapters from reads.  currently allowed values: off, paired */
bow's avatar
bow committed
144
  var adapter_strip: Option[String] = config("adapter_strip")
bow's avatar
bow committed
145
146
147
148
149
150
151
152

  /** score to use for mismatches when trimming at ends (default is -3; */
  var trim_mismatch_score: Option[Int] = config("trim_mismatch_score")

  /** score to use for indels when trimming at ends (default is -4; */
  var trim_indel_score: Option[Int] = config("trim_indel_score")

  /** directory for snps index files (created using snpindex) (default is */
bow's avatar
bow committed
153
  var snpsdir: Option[String] = config("snpsdir")
bow's avatar
bow committed
154
155

  /** use database containing known snps (in <string>.iit, built */
bow's avatar
bow committed
156
  var use_snps: Option[String] = config("use_snps")
bow's avatar
bow committed
157
158

  /** directory for methylcytosine index files (created using cmetindex) */
bow's avatar
bow committed
159
  var cmetdir: Option[String] = config("cmetdir")
bow's avatar
bow committed
160
161

  /** directory for a-to-i rna editing index files (created using atoiindex) */
bow's avatar
bow committed
162
  var atoidir: Option[String] = config("atoidir")
bow's avatar
bow committed
163
164

  /** alignment mode: standard (default), cmet-stranded, cmet-nonstranded, */
bow's avatar
bow committed
165
  var mode: Option[String] = config("mode")
bow's avatar
bow committed
166
167

  /** directory for tally iit file to resolve concordant multiple results (default is */
bow's avatar
bow committed
168
  var tallydir: Option[String] = config("tallydir")
bow's avatar
bow committed
169
170

  /** use this tally iit file to resolve concordant multiple results */
bow's avatar
bow committed
171
  var use_tally: Option[String] = config("use_tally")
bow's avatar
bow committed
172
173

  /** directory for runlength iit file to resolve concordant multiple results (default is */
bow's avatar
bow committed
174
  var runlengthdir: Option[String] = config("runlengthdir")
bow's avatar
bow committed
175
176

  /** use this runlength iit file to resolve concordant multiple results */
bow's avatar
bow committed
177
  var use_runlength: Option[String] = config("use_runlength")
bow's avatar
bow committed
178
179

  /** cases to use gmap for complex alignments containing multiple splices or indels */
bow's avatar
bow committed
180
  var gmap_mode: Option[String] = config("gmap_mode")
bow's avatar
bow committed
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206

  /** try gmap pairsearch on nearby genomic regions if best score (the total */
  var trigger_score_for_gmap: Option[Int] = config("trigger_score_for_gmap")

  /** keep gmap hit only if it has this many consecutive matches (default 20) */
  var gmap_min_match_length: Option[Int] = config("gmap_min_match_length")

  /** extra mismatch/indel score allowed for gmap alignments (default 3) */
  var gmap_allowance: Option[Int] = config("gmap_allowance")

  /** perform gmap pairsearch on nearby genomic regions up to this many */
  var max_gmap_pairsearch: Option[Int] = config("max_gmap_pairsearch")

  /** perform gmap terminal on nearby genomic regions up to this many */
  var max_gmap_terminal: Option[Int] = config("max_gmap_terminal")

  /** perform gmap improvement on nearby genomic regions up to this many */
  var max_gmap_improvement: Option[Int] = config("max_gmap_improvement")

  /** allow microexons only if one of the splice site probabilities is */
  var microexon_spliceprob: Option[Float] = config("microexon_spliceprob")

  /** look for novel splicing (0=no (default), 1=yes) */
  var novelsplicing: Option[Int] = config("novelsplicing")

  /** directory for splicing involving known sites or known introns, */
bow's avatar
bow committed
207
  var splicingdir: Option[String] = config("splicingdir")
bow's avatar
bow committed
208
209

  /** look for splicing involving known sites or known introns */
bow's avatar
bow committed
210
  var use_splicing: Option[String] = config("use_splicing")
bow's avatar
bow committed
211
212

  /** for ambiguous known splicing at ends of the read, do not clip at the */
bow's avatar
bow committed
213
  var ambig_splice_noclip: Boolean = config("ambig_splice_noclip", default = false)
bow's avatar
bow committed
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239

  /** definition of local novel splicing event (default 200000) */
  var localsplicedist: Option[Int] = config("localsplicedist")

  /** distance to look for novel splices at the ends of reads (default 50000) */
  var novelend_splicedist: Option[Int] = config("novelend_splicedist")

  /** penalty for a local splice (default 0).  counts against mismatches allowed */
  var local_splice_penalty: Option[Int] = config("local_splice_penalty")

  /** penalty for a distant splice (default 1).  a distant splice is one where */
  var distant_splice_penalty: Option[Int] = config("distant_splice_penalty")

  /** minimum length at end required for distant spliced alignments (default 20, min */
  var distant_splice_endlength: Option[Int] = config("distant_splice_endlength")

  /** minimum length at end required for short-end spliced alignments (default 2, */
  var shortend_splice_endlength: Option[Int] = config("shortend_splice_endlength")

  /** minimum identity at end required for distant spliced alignments (default 0.95) */
  var distant_splice_identity: Option[Float] = config("distant_splice_identity")

  /** (not currently implemented) */
  var antistranded_penalty: Option[Int] = config("antistranded_penalty")

  /** report distant splices on the same chromosome as a single splice, if possible */
bow's avatar
bow committed
240
  var merge_distant_samechr: Boolean = config("merge_distant_samechr", default = false)
bow's avatar
bow committed
241
242
243
244
245
246
247
248
249
250
251
252
253
254

  /** max total genomic length for dna-seq paired reads, or other reads */
  var pairmax_dna: Option[Int] = config("pairmax_dna")

  /** max total genomic length for rna-seq paired reads, or other reads */
  var pairmax_rna: Option[Int] = config("pairmax_rna")

  /** expected paired-end length, used for calling splices in medial part of */
  var pairexpect: Option[Int] = config("pairexpect")

  /** allowable deviation from expected paired-end length, used for */
  var pairdev: Option[Int] = config("pairdev")

  /** protocol for input quality scores.  allowed values: */
bow's avatar
bow committed
255
  var quality_protocol: Option[String] = config("quality_protocol")
bow's avatar
bow committed
256
257
258
259
260
261
262
263
264
265
266

  /** fastq quality scores are zero at this ascii value */
  var quality_zero_score: Option[Int] = config("quality_zero_score")

  /** shift fastq quality scores by this amount in output */
  var quality_print_shift: Option[Int] = config("quality_print_shift")

  /** maximum number of paths to print (default 100) */
  var npaths: Option[Int] = config("npaths")

  /** if more than maximum number of paths are found, */
bow's avatar
bow committed
267
  var quiet_if_excessive: Boolean = config("quiet_if_excessive", default = false)
bow's avatar
bow committed
268
269

  /** print output in same order as input (relevant */
bow's avatar
bow committed
270
  var ordered: Boolean = config("ordered", default = false)
bow's avatar
bow committed
271
272

  /** for gsnap output in snp-tolerant alignment, shows all differences */
bow's avatar
bow committed
273
  var show_refdiff: Boolean = config("show_refdiff", default = false)
bow's avatar
bow committed
274
275

  /** for paired-end reads whose alignments overlap, clip the overlapping region */
bow's avatar
bow committed
276
  var clip_overlap: Boolean = config("clip_overlap", default = false)
bow's avatar
bow committed
277
278

  /** print detailed information about snps in reads (works only if -v also selected) */
bow's avatar
bow committed
279
  var print_snps: Boolean = config("print_snps", default = false)
bow's avatar
bow committed
280
281

  /** print only failed alignments, those with no results */
bow's avatar
bow committed
282
  var failsonly: Boolean = config("failsonly", default = false)
bow's avatar
bow committed
283
284

  /** exclude printing of failed alignments */
bow's avatar
bow committed
285
  var nofails: Boolean = config("nofails", default = false)
bow's avatar
bow committed
286
287

  /** print completely failed alignments as input fasta or fastq format */
bow's avatar
bow committed
288
  var fails_as_input: Boolean = config("fails_as_input", default = false)
bow's avatar
bow committed
289
290

  /** another format type, other than default */
bow's avatar
bow committed
291
  var format: Option[String] = config("format")
bow's avatar
bow committed
292
293

  /** basename for multiple-file output, separately for nomapping, */
bow's avatar
bow committed
294
  var split_output: Option[String] = config("split_output")
bow's avatar
bow committed
295
296

  /** when --split-output is given, this flag will append output to the */
bow's avatar
bow committed
297
  var append_output: Boolean = config("append_output", default = false)
bow's avatar
bow committed
298
299
300
301
302

  /** buffer size, in queries, for output thread (default 1000).  when the number */
  var output_buffer_size: Option[Int] = config("output_buffer_size")

  /** do not print headers beginning with '@' */
bow's avatar
bow committed
303
  var no_sam_headers: Boolean = config("no_sam_headers", default = false)
bow's avatar
bow committed
304
305
306
307
308

  /** print headers only for this batch, as specified by -q */
  var sam_headers_batch: Option[Int] = config("sam_headers_batch")

  /** insert 0m in cigar between adjacent insertions and deletions */
bow's avatar
bow committed
309
  var sam_use_0M: Boolean = config("sam_use_0M", default = false)
bow's avatar
bow committed
310
311

  /** allows multiple alignments to be marked as primary if they */
bow's avatar
bow committed
312
  var sam_multiple_primaries: Boolean = config("sam_multiple_primaries", default = false)
bow's avatar
bow committed
313
314

  /** for rna-seq alignments, disallows xs:a:? when the sense direction */
bow's avatar
bow committed
315
  var force_xs_dir: Boolean = config("force_xs_dir", default = false)
bow's avatar
bow committed
316
317

  /** in md string, when known snps are given by the -v flag, */
bow's avatar
bow committed
318
  var md_lowercase_snp: Boolean = config("md_lowercase_snp", default = false)
bow's avatar
bow committed
319
320

  /** value to put into read-group id (rg-id) field */
bow's avatar
bow committed
321
  var read_group_id: Option[String] = config("read_group_id")
bow's avatar
bow committed
322
323

  /** value to put into read-group name (rg-sm) field */
bow's avatar
bow committed
324
  var read_group_name: Option[String] = config("read_group_name")
bow's avatar
bow committed
325
326

  /** value to put into read-group library (rg-lb) field */
bow's avatar
bow committed
327
  var read_group_library: Option[String] = config("read_group_library")
bow's avatar
bow committed
328
329

  /** value to put into read-group library (rg-pl) field */
bow's avatar
bow committed
330
  var read_group_platform: Option[String] = config("read_group_platform")
bow's avatar
bow committed
331

Peter van 't Hof's avatar
Peter van 't Hof committed
332
  override def versionRegex = """.* version (.*)""".r
bow's avatar
bow committed
333
334
335
  override def versionCommand = executable + " --version"

  def cmdLine = {
336
    required(executable) +
bow's avatar
bow committed
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
      optional("--dir", dir) +
      optional("--db", db) +
      optional("--use-sarray", use_sarray) +
      optional("--kmer", kmer) +
      optional("--sampling", sampling) +
      optional("--part", part) +
      optional("--input-buffer-size", input_buffer_size) +
      optional("--barcode-length", barcode_length) +
      optional("--orientation", orientation) +
      optional("--fastq-id-start", fastq_id_start) +
      optional("--fastq-id-end", fastq_id_end) +
      conditional(force_single_end, "--force-single-end") +
      optional("--filter-chastity", filter_chastity) +
      conditional(allow_pe_name_mismatch, "--allow-pe-name-mismatch") +
      conditional(gunzip, "--gunzip") +
      conditional(bunzip2, "--bunzip2") +
      optional("--batch", batch) +
      optional("--expand-offsets", expand_offsets) +
      optional("--max-mismatches", max_mismatches) +
      optional("--query-unk-mismatch", query_unk_mismatch) +
      optional("--genome-unk-mismatch", genome_unk_mismatch) +
      optional("--maxsearch", maxsearch) +
      optional("--terminal-threshold", terminal_threshold) +
      optional("--terminal-output-minlength", terminal_output_minlength) +
      optional("--indel-penalty", indel_penalty) +
      optional("--indel-endlength", indel_endlength) +
      optional("--max-middle-insertions", max_middle_insertions) +
      optional("--max-middle-deletions", max_middle_deletions) +
      optional("--max-end-insertions", max_end_insertions) +
      optional("--max-end-deletions", max_end_deletions) +
      optional("--suboptimal-levels", suboptimal_levels) +
      optional("--adapter-strip", adapter_strip) +
      optional("--trim-mismatch-score", trim_mismatch_score) +
      optional("--trim-indel-score", trim_indel_score) +
      optional("--snpsdir", snpsdir) +
      optional("--use-snps", use_snps) +
      optional("--cmetdir", cmetdir) +
      optional("--atoidir", atoidir) +
      optional("--mode", mode) +
      optional("--tallydir", tallydir) +
      optional("--use-tally", use_tally) +
      optional("--runlengthdir", runlengthdir) +
      optional("--use-runlength", use_runlength) +
380
      optional("--nthreads", threads) +
bow's avatar
bow committed
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
      optional("--gmap-mode", gmap_mode) +
      optional("--trigger-score-for-gmap", trigger_score_for_gmap) +
      optional("--gmap-min-match-length", gmap_min_match_length) +
      optional("--gmap-allowance", gmap_allowance) +
      optional("--max-gmap-pairsearch", max_gmap_pairsearch) +
      optional("--max-gmap-terminal", max_gmap_terminal) +
      optional("--max-gmap-improvement", max_gmap_improvement) +
      optional("--microexon-spliceprob", microexon_spliceprob) +
      optional("--novelsplicing", novelsplicing) +
      optional("--splicingdir", splicingdir) +
      optional("--use-splicing", use_splicing) +
      conditional(ambig_splice_noclip, "--ambig-splice-noclip") +
      optional("--localsplicedist", localsplicedist) +
      optional("--novelend-splicedist", novelend_splicedist) +
      optional("--local-splice-penalty", local_splice_penalty) +
      optional("--distant-splice-penalty", distant_splice_penalty) +
      optional("--distant-splice-endlength", distant_splice_endlength) +
      optional("--shortend-splice-endlength", shortend_splice_endlength) +
      optional("--distant-splice-identity", distant_splice_identity) +
      optional("--antistranded-penalty", antistranded_penalty) +
      conditional(merge_distant_samechr, "--merge-distant-samechr") +
      optional("--pairmax-dna", pairmax_dna) +
      optional("--pairmax-rna", pairmax_rna) +
      optional("--pairexpect", pairexpect) +
      optional("--pairdev", pairdev) +
      optional("--quality-protocol", quality_protocol) +
      optional("--quality-zero-score", quality_zero_score) +
      optional("--quality-print-shift", quality_print_shift) +
      optional("--npaths", npaths) +
      conditional(quiet_if_excessive, "--quiet-if-excessive") +
      conditional(ordered, "--ordered") +
      conditional(show_refdiff, "--show-refdiff") +
      conditional(clip_overlap, "--clip-overlap") +
      conditional(print_snps, "--print-snps") +
      conditional(failsonly, "--failsonly") +
      conditional(nofails, "--nofails") +
      conditional(fails_as_input, "--fails-as-input") +
      optional("--format", format) +
      optional("--split-output", split_output) +
      conditional(append_output, "--append-output") +
      optional("--output-buffer-size", output_buffer_size) +
      conditional(no_sam_headers, "--no-sam-headers") +
      optional("--sam-headers-batch", sam_headers_batch) +
      conditional(sam_use_0M, "--sam-use-0M") +
bow's avatar
bow committed
425
      conditional(sam_multiple_primaries, "--sam-multiple-primaries") +
bow's avatar
bow committed
426
427
428
429
430
431
      conditional(force_xs_dir, "--force-xs-dir") +
      conditional(md_lowercase_snp, "--md-lowercase-snp") +
      optional("--read-group-id", read_group_id) +
      optional("--read-group-name", read_group_name) +
      optional("--read-group-library", read_group_library) +
      optional("--read-group-platform", read_group_platform) +
bow's avatar
bow committed
432
      repeat(input) +
bow's avatar
bow committed
433
434
435
      " > " + required(output)
  }
}