BaseRecalibrator.scala 15 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
package nl.lumc.sasc.biopet.extensions.gatk
2
3

import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
4

Peter van 't Hof's avatar
Peter van 't Hof committed
5
import nl.lumc.sasc.biopet.utils.VcfUtils
Peter van 't Hof's avatar
Peter van 't Hof committed
6
7
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile
Peter van 't Hof's avatar
Peter van 't Hof committed
8
import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ }
9

Peter van 't Hof's avatar
Peter van 't Hof committed
10
11
//TODO: check gathering
class BaseRecalibrator(val root: Configurable) extends CommandLineGATK /* with ScatterGatherableFunction */ {
12
  def analysis_type = "BaseRecalibrator"
Peter van 't Hof's avatar
Peter van 't Hof committed
13
  //TODO: check gathering
Peter van 't Hof's avatar
Peter van 't Hof committed
14
  //scatterClass = classOf[ContigScatterFunction]
Peter van 't Hof's avatar
Peter van 't Hof committed
15
  //setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = false }
Peter van 't Hof's avatar
Peter van 't Hof committed
16
17
18

  /** A database of known polymorphic sites */
  @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites", required = false, exclusiveOf = "", validation = "")
Peter van 't Hof's avatar
Peter van 't Hof committed
19
  var knownSites: List[File] = config("known_sites", default = Nil)
Peter van 't Hof's avatar
Peter van 't Hof committed
20
21
22
23
24
25

  /** Dependencies on any indexes of knownSites */
  @Input(fullName = "knownSitesIndexes", shortName = "", doc = "Dependencies on any indexes of knownSites", required = false, exclusiveOf = "", validation = "")
  private var knownSitesIndexes: Seq[File] = Nil

  /** The output recalibration table file to create */
Peter van 't Hof's avatar
Peter van 't Hof committed
26
27
  @Output(fullName = "out", shortName = "o", doc = "The output recalibration table file to create", required = true, exclusiveOf = "", validation = "") //TODO: check gathering
  //@Gather(classOf[org.broadinstitute.gatk.engine.recalibration.BQSRGatherer])
Peter van 't Hof's avatar
Peter van 't Hof committed
28
29
30
31
  var out: File = _

  /** One or more covariates to be used in the recalibration. Can be specified multiple times */
  @Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false, exclusiveOf = "", validation = "")
32
  var covariate: List[String] = config("covariate", default = Nil)
Peter van 't Hof's avatar
Peter van 't Hof committed
33
34
35

  /** Do not use the standard set of covariates, but rather just the ones listed using the -cov argument */
  @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false, exclusiveOf = "", validation = "")
36
  var no_standard_covs: Boolean = config("no_standard_covs", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
37
38
39

  /** If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only. */
  @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.", required = false, exclusiveOf = "", validation = "")
40
  var run_without_dbsnp_potentially_ruining_quality: Boolean = config("run_without_dbsnp_potentially_ruining_quality", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
41
42
43

  /** How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS */
  @Argument(fullName = "solid_recal_mode", shortName = "sMode", doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS", required = false, exclusiveOf = "", validation = "")
44
  var solid_recal_mode: Option[String] = config("solid_recal_mode")
Peter van 't Hof's avatar
Peter van 't Hof committed
45
46
47

  /** Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ */
  @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false, exclusiveOf = "", validation = "")
48
  var solid_nocall_strategy: Option[String] = config("solid_nocall_strategy")
Peter van 't Hof's avatar
Peter van 't Hof committed
49
50
51

  /** Size of the k-mer context to be used for base mismatches */
  @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false, exclusiveOf = "", validation = "")
52
  var mismatches_context_size: Option[Int] = config("mismatches_context_size")
Peter van 't Hof's avatar
Peter van 't Hof committed
53
54
55

  /** Size of the k-mer context to be used for base insertions and deletions */
  @Argument(fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false, exclusiveOf = "", validation = "")
56
  var indels_context_size: Option[Int] = config("indels_context_size")
Peter van 't Hof's avatar
Peter van 't Hof committed
57
58
59

  /** The maximum cycle value permitted for the Cycle covariate */
  @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false, exclusiveOf = "", validation = "")
60
  var maximum_cycle_value: Option[Int] = config("maximum_cycle_value")
Peter van 't Hof's avatar
Peter van 't Hof committed
61
62
63

  /** default quality for the base mismatches covariate */
  @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false, exclusiveOf = "", validation = "")
64
  var mismatches_default_quality: Option[String] = config("mismatches_default_quality")
Peter van 't Hof's avatar
Peter van 't Hof committed
65
66
67

  /** default quality for the base insertions covariate */
  @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false, exclusiveOf = "", validation = "")
68
  var insertions_default_quality: Option[String] = config("insertions_default_quality")
Peter van 't Hof's avatar
Peter van 't Hof committed
69
70
71

  /** default quality for the base deletions covariate */
  @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false, exclusiveOf = "", validation = "")
72
  var deletions_default_quality: Option[String] = config("deletions_default_quality")
Peter van 't Hof's avatar
Peter van 't Hof committed
73
74
75

  /** minimum quality for the bases in the tail of the reads to be considered */
  @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false, exclusiveOf = "", validation = "")
76
  var low_quality_tail: Option[String] = config("low_quality_tail")
Peter van 't Hof's avatar
Peter van 't Hof committed
77
78
79

  /** number of distinct quality scores in the quantized output */
  @Argument(fullName = "quantizing_levels", shortName = "ql", doc = "number of distinct quality scores in the quantized output", required = false, exclusiveOf = "", validation = "")
80
  var quantizing_levels: Option[Int] = config("quantizing_levels")
Peter van 't Hof's avatar
Peter van 't Hof committed
81
82
83

  /** the binary tag covariate name if using it */
  @Argument(fullName = "binary_tag_name", shortName = "bintag", doc = "the binary tag covariate name if using it", required = false, exclusiveOf = "", validation = "")
84
  var binary_tag_name: Option[String] = config("binary_tag_name")
Peter van 't Hof's avatar
Peter van 't Hof committed
85
86
87

  /** Sort the rows in the tables of reports */
  @Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false, exclusiveOf = "", validation = "")
88
  var sort_by_all_columns: Boolean = config("sort_by_all_columns", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
89
90
91

  /** If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid. */
  @Argument(fullName = "default_platform", shortName = "dP", doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.", required = false, exclusiveOf = "", validation = "")
92
  var default_platform: Option[String] = config("default_platform")
Peter van 't Hof's avatar
Peter van 't Hof committed
93
94
95

  /** If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid. */
  @Argument(fullName = "force_platform", shortName = "fP", doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.", required = false, exclusiveOf = "", validation = "")
96
  var force_platform: Option[String] = config("force_platform")
Peter van 't Hof's avatar
Peter van 't Hof committed
97
98
99

  /** If provided, the read group of EVERY read will be forced to be the provided String. */
  @Argument(fullName = "force_readgroup", shortName = "fRG", doc = "If provided, the read group of EVERY read will be forced to be the provided String.", required = false, exclusiveOf = "", validation = "")
100
  var force_readgroup: Option[String] = config("force_readgroup")
Peter van 't Hof's avatar
Peter van 't Hof committed
101
102
103
104
105
106
107
108

  /** If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only */
  @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", required = false, exclusiveOf = "", validation = "")
  @Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction])
  var recal_table_update_log: File = _

  /** Max size of the k-mer context to be used for repeat covariates */
  @Argument(fullName = "max_str_unit_length", shortName = "maxstr", doc = "Max size of the k-mer context to be used for repeat covariates", required = false, exclusiveOf = "", validation = "")
109
  var max_str_unit_length: Option[Int] = config("max_str_unit_length")
Peter van 't Hof's avatar
Peter van 't Hof committed
110
111
112

  /** Max number of repetitions to be used for repeat covariates */
  @Argument(fullName = "max_repeat_length", shortName = "maxrep", doc = "Max number of repetitions to be used for repeat covariates", required = false, exclusiveOf = "", validation = "")
113
  var max_repeat_length: Option[Int] = config("max_repeat_length")
Peter van 't Hof's avatar
Peter van 't Hof committed
114
115
116

  /** Reduce memory usage in multi-threaded code at the expense of threading efficiency */
  @Argument(fullName = "lowMemoryMode", shortName = "lowMemoryMode", doc = "Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false, exclusiveOf = "", validation = "")
117
  var lowMemoryMode: Boolean = config("lowMemoryMode", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
118
119
120

  /** BQSR BAQ gap open penalty (Phred Scaled).  Default value is 40.  30 is perhaps better for whole genome call sets */
  @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName = "bqsrBAQGOP", doc = "BQSR BAQ gap open penalty (Phred Scaled).  Default value is 40.  30 is perhaps better for whole genome call sets", required = false, exclusiveOf = "", validation = "")
121
  var bqsrBAQGapOpenPenalty: Option[Double] = config("bqsrBAQGapOpenPenalty")
Peter van 't Hof's avatar
Peter van 't Hof committed
122
123
124
125
126
127
128

  /** Format string for bqsrBAQGapOpenPenalty */
  @Argument(fullName = "bqsrBAQGapOpenPenaltyFormat", shortName = "", doc = "Format string for bqsrBAQGapOpenPenalty", required = false, exclusiveOf = "", validation = "")
  var bqsrBAQGapOpenPenaltyFormat: String = "%s"

  /** Filter out reads with CIGAR containing the N operator, instead of failing with an error */
  @Argument(fullName = "filter_reads_with_N_cigar", shortName = "filterRNC", doc = "Filter out reads with CIGAR containing the N operator, instead of failing with an error", required = false, exclusiveOf = "", validation = "")
129
  var filter_reads_with_N_cigar: Boolean = config("filter_reads_with_N_cigar", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
130
131
132

  /** Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error */
  @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error", required = false, exclusiveOf = "", validation = "")
133
  var filter_mismatching_base_and_quals: Boolean = config("filter_mismatching_base_and_quals", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
134
135
136

  /** Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error */
  @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error", required = false, exclusiveOf = "", validation = "")
137
  var filter_bases_not_stored: Boolean = config("filter_bases_not_stored", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
138

139
140
141
142
  if (config.contains("dbsnp")) knownSites :+= new File(config("dbsnp").asString)

  override def beforeGraph() {
    super.beforeGraph()
Peter van 't Hof's avatar
Peter van 't Hof committed
143
    knownSitesIndexes ++= knownSites.filter(orig => orig != null && (!orig.getName.endsWith(".list"))).map(orig => VcfUtils.getVcfIndexFile(orig))
144
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175

  override def cmdLine = super.cmdLine +
    repeat("-knownSites", knownSites, formatPrefix = TaggedFile.formatCommandLineParameter, spaceSeparated = true, escape = true, format = "%s") +
    required("-o", out, spaceSeparated = true, escape = true, format = "%s") +
    repeat("-cov", covariate, spaceSeparated = true, escape = true, format = "%s") +
    conditional(no_standard_covs, "-noStandard", escape = true, format = "%s") +
    conditional(run_without_dbsnp_potentially_ruining_quality, "-run_without_dbsnp_potentially_ruining_quality", escape = true, format = "%s") +
    optional("-sMode", solid_recal_mode, spaceSeparated = true, escape = true, format = "%s") +
    optional("-solid_nocall_strategy", solid_nocall_strategy, spaceSeparated = true, escape = true, format = "%s") +
    optional("-mcs", mismatches_context_size, spaceSeparated = true, escape = true, format = "%s") +
    optional("-ics", indels_context_size, spaceSeparated = true, escape = true, format = "%s") +
    optional("-maxCycle", maximum_cycle_value, spaceSeparated = true, escape = true, format = "%s") +
    optional("-mdq", mismatches_default_quality, spaceSeparated = true, escape = true, format = "%s") +
    optional("-idq", insertions_default_quality, spaceSeparated = true, escape = true, format = "%s") +
    optional("-ddq", deletions_default_quality, spaceSeparated = true, escape = true, format = "%s") +
    optional("-lqt", low_quality_tail, spaceSeparated = true, escape = true, format = "%s") +
    optional("-ql", quantizing_levels, spaceSeparated = true, escape = true, format = "%s") +
    optional("-bintag", binary_tag_name, spaceSeparated = true, escape = true, format = "%s") +
    conditional(sort_by_all_columns, "-sortAllCols", escape = true, format = "%s") +
    optional("-dP", default_platform, spaceSeparated = true, escape = true, format = "%s") +
    optional("-fP", force_platform, spaceSeparated = true, escape = true, format = "%s") +
    optional("-fRG", force_readgroup, spaceSeparated = true, escape = true, format = "%s") +
    optional("-recal_table_update_log", recal_table_update_log, spaceSeparated = true, escape = true, format = "%s") +
    optional("-maxstr", max_str_unit_length, spaceSeparated = true, escape = true, format = "%s") +
    optional("-maxrep", max_repeat_length, spaceSeparated = true, escape = true, format = "%s") +
    conditional(lowMemoryMode, "-lowMemoryMode", escape = true, format = "%s") +
    optional("-bqsrBAQGOP", bqsrBAQGapOpenPenalty, spaceSeparated = true, escape = true, format = bqsrBAQGapOpenPenaltyFormat) +
    conditional(filter_reads_with_N_cigar, "-filterRNC", escape = true, format = "%s") +
    conditional(filter_mismatching_base_and_quals, "-filterMBQ", escape = true, format = "%s") +
    conditional(filter_bases_not_stored, "-filterNoBases", escape = true, format = "%s")
}
Peter van 't Hof's avatar
Peter van 't Hof committed
176
177
178
179
180
181
182
183
184

object BaseRecalibrator {
  def apply(root: Configurable, input: File, output: File): BaseRecalibrator = {
    val br = new BaseRecalibrator(root)
    br.input_file :+= input
    br.out = output
    br
  }
}