BaseRecalibrator.scala 15.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
package nl.lumc.sasc.biopet.extensions.gatk
16
17

import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
18

Peter van 't Hof's avatar
Peter van 't Hof committed
19
import nl.lumc.sasc.biopet.utils.VcfUtils
Peter van 't Hof's avatar
Peter van 't Hof committed
20
21
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile
Peter van 't Hof's avatar
Peter van 't Hof committed
22
import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ }
23

Peter van 't Hof's avatar
Peter van 't Hof committed
24
25
//TODO: check gathering
class BaseRecalibrator(val root: Configurable) extends CommandLineGATK /* with ScatterGatherableFunction */ {
26
  def analysis_type = "BaseRecalibrator"
Peter van 't Hof's avatar
Peter van 't Hof committed
27
  //TODO: check gathering
Peter van 't Hof's avatar
Peter van 't Hof committed
28
  //scatterClass = classOf[ContigScatterFunction]
Peter van 't Hof's avatar
Peter van 't Hof committed
29
  //setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = false }
Peter van 't Hof's avatar
Peter van 't Hof committed
30
31
32

  /** A database of known polymorphic sites */
  @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites", required = false, exclusiveOf = "", validation = "")
Peter van 't Hof's avatar
Peter van 't Hof committed
33
  var knownSites: List[File] = config("known_sites", default = Nil)
Peter van 't Hof's avatar
Peter van 't Hof committed
34
35
36
37
38
39

  /** Dependencies on any indexes of knownSites */
  @Input(fullName = "knownSitesIndexes", shortName = "", doc = "Dependencies on any indexes of knownSites", required = false, exclusiveOf = "", validation = "")
  private var knownSitesIndexes: Seq[File] = Nil

  /** The output recalibration table file to create */
Peter van 't Hof's avatar
Peter van 't Hof committed
40
41
  @Output(fullName = "out", shortName = "o", doc = "The output recalibration table file to create", required = true, exclusiveOf = "", validation = "") //TODO: check gathering
  //@Gather(classOf[org.broadinstitute.gatk.engine.recalibration.BQSRGatherer])
Peter van 't Hof's avatar
Peter van 't Hof committed
42
43
44
45
  var out: File = _

  /** One or more covariates to be used in the recalibration. Can be specified multiple times */
  @Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false, exclusiveOf = "", validation = "")
46
  var covariate: List[String] = config("covariate", default = Nil)
Peter van 't Hof's avatar
Peter van 't Hof committed
47
48
49

  /** Do not use the standard set of covariates, but rather just the ones listed using the -cov argument */
  @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false, exclusiveOf = "", validation = "")
50
  var no_standard_covs: Boolean = config("no_standard_covs", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
51
52
53

  /** If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only. */
  @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.", required = false, exclusiveOf = "", validation = "")
54
  var run_without_dbsnp_potentially_ruining_quality: Boolean = config("run_without_dbsnp_potentially_ruining_quality", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
55
56
57

  /** How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS */
  @Argument(fullName = "solid_recal_mode", shortName = "sMode", doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS", required = false, exclusiveOf = "", validation = "")
58
  var solid_recal_mode: Option[String] = config("solid_recal_mode")
Peter van 't Hof's avatar
Peter van 't Hof committed
59
60
61

  /** Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ */
  @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false, exclusiveOf = "", validation = "")
62
  var solid_nocall_strategy: Option[String] = config("solid_nocall_strategy")
Peter van 't Hof's avatar
Peter van 't Hof committed
63
64
65

  /** Size of the k-mer context to be used for base mismatches */
  @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false, exclusiveOf = "", validation = "")
66
  var mismatches_context_size: Option[Int] = config("mismatches_context_size")
Peter van 't Hof's avatar
Peter van 't Hof committed
67
68
69

  /** Size of the k-mer context to be used for base insertions and deletions */
  @Argument(fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false, exclusiveOf = "", validation = "")
70
  var indels_context_size: Option[Int] = config("indels_context_size")
Peter van 't Hof's avatar
Peter van 't Hof committed
71
72
73

  /** The maximum cycle value permitted for the Cycle covariate */
  @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false, exclusiveOf = "", validation = "")
74
  var maximum_cycle_value: Option[Int] = config("maximum_cycle_value")
Peter van 't Hof's avatar
Peter van 't Hof committed
75
76
77

  /** default quality for the base mismatches covariate */
  @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false, exclusiveOf = "", validation = "")
78
  var mismatches_default_quality: Option[String] = config("mismatches_default_quality")
Peter van 't Hof's avatar
Peter van 't Hof committed
79
80
81

  /** default quality for the base insertions covariate */
  @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false, exclusiveOf = "", validation = "")
82
  var insertions_default_quality: Option[String] = config("insertions_default_quality")
Peter van 't Hof's avatar
Peter van 't Hof committed
83
84
85

  /** default quality for the base deletions covariate */
  @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false, exclusiveOf = "", validation = "")
86
  var deletions_default_quality: Option[String] = config("deletions_default_quality")
Peter van 't Hof's avatar
Peter van 't Hof committed
87
88
89

  /** minimum quality for the bases in the tail of the reads to be considered */
  @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false, exclusiveOf = "", validation = "")
90
  var low_quality_tail: Option[String] = config("low_quality_tail")
Peter van 't Hof's avatar
Peter van 't Hof committed
91
92
93

  /** number of distinct quality scores in the quantized output */
  @Argument(fullName = "quantizing_levels", shortName = "ql", doc = "number of distinct quality scores in the quantized output", required = false, exclusiveOf = "", validation = "")
94
  var quantizing_levels: Option[Int] = config("quantizing_levels")
Peter van 't Hof's avatar
Peter van 't Hof committed
95
96
97

  /** the binary tag covariate name if using it */
  @Argument(fullName = "binary_tag_name", shortName = "bintag", doc = "the binary tag covariate name if using it", required = false, exclusiveOf = "", validation = "")
98
  var binary_tag_name: Option[String] = config("binary_tag_name")
Peter van 't Hof's avatar
Peter van 't Hof committed
99
100
101

  /** Sort the rows in the tables of reports */
  @Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false, exclusiveOf = "", validation = "")
102
  var sort_by_all_columns: Boolean = config("sort_by_all_columns", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
103
104
105

  /** If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid. */
  @Argument(fullName = "default_platform", shortName = "dP", doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.", required = false, exclusiveOf = "", validation = "")
106
  var default_platform: Option[String] = config("default_platform")
Peter van 't Hof's avatar
Peter van 't Hof committed
107
108
109

  /** If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid. */
  @Argument(fullName = "force_platform", shortName = "fP", doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.", required = false, exclusiveOf = "", validation = "")
110
  var force_platform: Option[String] = config("force_platform")
Peter van 't Hof's avatar
Peter van 't Hof committed
111
112
113

  /** If provided, the read group of EVERY read will be forced to be the provided String. */
  @Argument(fullName = "force_readgroup", shortName = "fRG", doc = "If provided, the read group of EVERY read will be forced to be the provided String.", required = false, exclusiveOf = "", validation = "")
114
  var force_readgroup: Option[String] = config("force_readgroup")
Peter van 't Hof's avatar
Peter van 't Hof committed
115
116
117
118
119
120
121
122

  /** If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only */
  @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", required = false, exclusiveOf = "", validation = "")
  @Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction])
  var recal_table_update_log: File = _

  /** Max size of the k-mer context to be used for repeat covariates */
  @Argument(fullName = "max_str_unit_length", shortName = "maxstr", doc = "Max size of the k-mer context to be used for repeat covariates", required = false, exclusiveOf = "", validation = "")
123
  var max_str_unit_length: Option[Int] = config("max_str_unit_length")
Peter van 't Hof's avatar
Peter van 't Hof committed
124
125
126

  /** Max number of repetitions to be used for repeat covariates */
  @Argument(fullName = "max_repeat_length", shortName = "maxrep", doc = "Max number of repetitions to be used for repeat covariates", required = false, exclusiveOf = "", validation = "")
127
  var max_repeat_length: Option[Int] = config("max_repeat_length")
Peter van 't Hof's avatar
Peter van 't Hof committed
128
129
130

  /** Reduce memory usage in multi-threaded code at the expense of threading efficiency */
  @Argument(fullName = "lowMemoryMode", shortName = "lowMemoryMode", doc = "Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false, exclusiveOf = "", validation = "")
131
  var lowMemoryMode: Boolean = config("lowMemoryMode", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
132
133
134

  /** BQSR BAQ gap open penalty (Phred Scaled).  Default value is 40.  30 is perhaps better for whole genome call sets */
  @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName = "bqsrBAQGOP", doc = "BQSR BAQ gap open penalty (Phred Scaled).  Default value is 40.  30 is perhaps better for whole genome call sets", required = false, exclusiveOf = "", validation = "")
135
  var bqsrBAQGapOpenPenalty: Option[Double] = config("bqsrBAQGapOpenPenalty")
Peter van 't Hof's avatar
Peter van 't Hof committed
136
137
138
139
140
141
142

  /** Format string for bqsrBAQGapOpenPenalty */
  @Argument(fullName = "bqsrBAQGapOpenPenaltyFormat", shortName = "", doc = "Format string for bqsrBAQGapOpenPenalty", required = false, exclusiveOf = "", validation = "")
  var bqsrBAQGapOpenPenaltyFormat: String = "%s"

  /** Filter out reads with CIGAR containing the N operator, instead of failing with an error */
  @Argument(fullName = "filter_reads_with_N_cigar", shortName = "filterRNC", doc = "Filter out reads with CIGAR containing the N operator, instead of failing with an error", required = false, exclusiveOf = "", validation = "")
143
  var filter_reads_with_N_cigar: Boolean = config("filter_reads_with_N_cigar", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
144
145
146

  /** Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error */
  @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error", required = false, exclusiveOf = "", validation = "")
147
  var filter_mismatching_base_and_quals: Boolean = config("filter_mismatching_base_and_quals", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
148
149
150

  /** Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error */
  @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error", required = false, exclusiveOf = "", validation = "")
151
  var filter_bases_not_stored: Boolean = config("filter_bases_not_stored", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
152

153
154
155
156
  if (config.contains("dbsnp")) knownSites :+= new File(config("dbsnp").asString)

  override def beforeGraph() {
    super.beforeGraph()
Peter van 't Hof's avatar
Peter van 't Hof committed
157
    knownSitesIndexes ++= knownSites.filter(orig => orig != null && (!orig.getName.endsWith(".list"))).map(orig => VcfUtils.getVcfIndexFile(orig))
158
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189

  override def cmdLine = super.cmdLine +
    repeat("-knownSites", knownSites, formatPrefix = TaggedFile.formatCommandLineParameter, spaceSeparated = true, escape = true, format = "%s") +
    required("-o", out, spaceSeparated = true, escape = true, format = "%s") +
    repeat("-cov", covariate, spaceSeparated = true, escape = true, format = "%s") +
    conditional(no_standard_covs, "-noStandard", escape = true, format = "%s") +
    conditional(run_without_dbsnp_potentially_ruining_quality, "-run_without_dbsnp_potentially_ruining_quality", escape = true, format = "%s") +
    optional("-sMode", solid_recal_mode, spaceSeparated = true, escape = true, format = "%s") +
    optional("-solid_nocall_strategy", solid_nocall_strategy, spaceSeparated = true, escape = true, format = "%s") +
    optional("-mcs", mismatches_context_size, spaceSeparated = true, escape = true, format = "%s") +
    optional("-ics", indels_context_size, spaceSeparated = true, escape = true, format = "%s") +
    optional("-maxCycle", maximum_cycle_value, spaceSeparated = true, escape = true, format = "%s") +
    optional("-mdq", mismatches_default_quality, spaceSeparated = true, escape = true, format = "%s") +
    optional("-idq", insertions_default_quality, spaceSeparated = true, escape = true, format = "%s") +
    optional("-ddq", deletions_default_quality, spaceSeparated = true, escape = true, format = "%s") +
    optional("-lqt", low_quality_tail, spaceSeparated = true, escape = true, format = "%s") +
    optional("-ql", quantizing_levels, spaceSeparated = true, escape = true, format = "%s") +
    optional("-bintag", binary_tag_name, spaceSeparated = true, escape = true, format = "%s") +
    conditional(sort_by_all_columns, "-sortAllCols", escape = true, format = "%s") +
    optional("-dP", default_platform, spaceSeparated = true, escape = true, format = "%s") +
    optional("-fP", force_platform, spaceSeparated = true, escape = true, format = "%s") +
    optional("-fRG", force_readgroup, spaceSeparated = true, escape = true, format = "%s") +
    optional("-recal_table_update_log", recal_table_update_log, spaceSeparated = true, escape = true, format = "%s") +
    optional("-maxstr", max_str_unit_length, spaceSeparated = true, escape = true, format = "%s") +
    optional("-maxrep", max_repeat_length, spaceSeparated = true, escape = true, format = "%s") +
    conditional(lowMemoryMode, "-lowMemoryMode", escape = true, format = "%s") +
    optional("-bqsrBAQGOP", bqsrBAQGapOpenPenalty, spaceSeparated = true, escape = true, format = bqsrBAQGapOpenPenaltyFormat) +
    conditional(filter_reads_with_N_cigar, "-filterRNC", escape = true, format = "%s") +
    conditional(filter_mismatching_base_and_quals, "-filterMBQ", escape = true, format = "%s") +
    conditional(filter_bases_not_stored, "-filterNoBases", escape = true, format = "%s")
}
Peter van 't Hof's avatar
Peter van 't Hof committed
190
191
192
193
194
195
196
197
198

object BaseRecalibrator {
  def apply(root: Configurable, input: File, output: File): BaseRecalibrator = {
    val br = new BaseRecalibrator(root)
    br.input_file :+= input
    br.out = output
    br
  }
}