IndelRealigner.scala 13.8 KB
Newer Older
1
2
3
4
5
/**
 * Due to the license issue with GATK, this part of Biopet can only be used inside the
 * LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
 * on how to use this protected part of biopet or contact us at sasc@lumc.nl
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
6
package nl.lumc.sasc.biopet.extensions.gatk.broad
7
8

import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
9

Peter van 't Hof's avatar
Peter van 't Hof committed
10
import nl.lumc.sasc.biopet.utils.config.Configurable
11
import org.broadinstitute.gatk.queue.extensions.gatk.{BamGatherFunction, GATKScatterFunction, ReadScatterFunction, TaggedFile}
Peter van 't Hof's avatar
Peter van 't Hof committed
12
import nl.lumc.sasc.biopet.core.ScatterGatherableFunction
13
14
import nl.lumc.sasc.biopet.utils.VcfUtils
import org.broadinstitute.gatk.utils.commandline.{Argument, Gather, Output, _}
15
16

class IndelRealigner(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction {
17
  def analysis_type = "IndelRealigner"
18
19
20
21
22
23
24
25
26
27
28
29
30
  scatterClass = classOf[ReadScatterFunction]
  setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = true }

  /** Input VCF file(s) with known indels */
  @Input(fullName = "knownAlleles", shortName = "known", doc = "Input VCF file(s) with known indels", required = false, exclusiveOf = "", validation = "")
  var knownAlleles: Seq[File] = Nil

  /** Intervals file output from RealignerTargetCreator */
  @Input(fullName = "targetIntervals", shortName = "targetIntervals", doc = "Intervals file output from RealignerTargetCreator", required = true, exclusiveOf = "", validation = "")
  var targetIntervals: File = _

  /** LOD threshold above which the cleaner will clean */
  @Argument(fullName = "LODThresholdForCleaning", shortName = "LOD", doc = "LOD threshold above which the cleaner will clean", required = false, exclusiveOf = "", validation = "")
31
  var LODThresholdForCleaning: Option[Double] = config("LODThresholdForCleaning")
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48

  /** Format string for LODThresholdForCleaning */
  @Argument(fullName = "LODThresholdForCleaningFormat", shortName = "", doc = "Format string for LODThresholdForCleaning", required = false, exclusiveOf = "", validation = "")
  var LODThresholdForCleaningFormat: String = "%s"

  /** Output bam */
  @Output(fullName = "out", shortName = "o", doc = "Output bam", required = false, exclusiveOf = "", validation = "")
  @Gather(classOf[BamGatherFunction])
  var out: File = _

  /** Automatically generated md5 for out */
  @Output(fullName = "outMD5", shortName = "", doc = "Automatically generated md5 for out", required = false, exclusiveOf = "", validation = "")
  @Gather(enabled = false)
  private var outMD5: File = _

  /** Determines how to compute the possible alternate consenses */
  @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "Determines how to compute the possible alternate consenses", required = false, exclusiveOf = "", validation = "")
49
  var consensusDeterminationModel: Option[String] = config("consensusDeterminationModel")
50
51
52

  /** Percentage of mismatches at a locus to be considered having high entropy (0.0 < entropy <= 1.0) */
  @Argument(fullName = "entropyThreshold", shortName = "entropy", doc = "Percentage of mismatches at a locus to be considered having high entropy (0.0 < entropy <= 1.0)", required = false, exclusiveOf = "", validation = "")
53
  var entropyThreshold: Option[Double] = config("entropyThreshold")
54
55
56
57
58
59
60

  /** Format string for entropyThreshold */
  @Argument(fullName = "entropyThresholdFormat", shortName = "", doc = "Format string for entropyThreshold", required = false, exclusiveOf = "", validation = "")
  var entropyThresholdFormat: String = "%s"

  /** max reads allowed to be kept in memory at a time by the SAMFileWriter */
  @Argument(fullName = "maxReadsInMemory", shortName = "maxInMemory", doc = "max reads allowed to be kept in memory at a time by the SAMFileWriter", required = false, exclusiveOf = "", validation = "")
61
  var maxReadsInMemory: Option[Int] = config("maxReadsInMemory")
62
63
64

  /** maximum insert size of read pairs that we attempt to realign */
  @Argument(fullName = "maxIsizeForMovement", shortName = "maxIsize", doc = "maximum insert size of read pairs that we attempt to realign", required = false, exclusiveOf = "", validation = "")
65
  var maxIsizeForMovement: Option[Int] = config("maxIsizeForMovement")
66
67
68

  /** Maximum positional move in basepairs that a read can be adjusted during realignment */
  @Argument(fullName = "maxPositionalMoveAllowed", shortName = "maxPosMove", doc = "Maximum positional move in basepairs that a read can be adjusted during realignment", required = false, exclusiveOf = "", validation = "")
69
  var maxPositionalMoveAllowed: Option[Int] = config("maxPositionalMoveAllowed")
70
71
72

  /** Max alternate consensuses to try (necessary to improve performance in deep coverage) */
  @Argument(fullName = "maxConsensuses", shortName = "maxConsensuses", doc = "Max alternate consensuses to try (necessary to improve performance in deep coverage)", required = false, exclusiveOf = "", validation = "")
73
  var maxConsensuses: Option[Int] = config("maxConsensuses")
74
75
76

  /** Max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage) */
  @Argument(fullName = "maxReadsForConsensuses", shortName = "greedy", doc = "Max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required = false, exclusiveOf = "", validation = "")
77
  var maxReadsForConsensuses: Option[Int] = config("maxReadsForConsensuses")
78
79
80

  /** Max reads allowed at an interval for realignment */
  @Argument(fullName = "maxReadsForRealignment", shortName = "maxReads", doc = "Max reads allowed at an interval for realignment", required = false, exclusiveOf = "", validation = "")
81
  var maxReadsForRealignment: Option[Int] = config("maxReadsForRealignment")
82
83
84

  /** Don't output the original cigar or alignment start tags for each realigned read in the output bam */
  @Argument(fullName = "noOriginalAlignmentTags", shortName = "noTags", doc = "Don't output the original cigar or alignment start tags for each realigned read in the output bam", required = false, exclusiveOf = "", validation = "")
85
  var noOriginalAlignmentTags: Boolean = config("noOriginalAlignmentTags", default = false)
86
87
88

  /** Generate one output file for each input (-I) bam file (not compatible with -output) */
  @Argument(fullName = "nWayOut", shortName = "nWayOut", doc = "Generate one output file for each input (-I) bam file (not compatible with -output)", required = false, exclusiveOf = "", validation = "")
89
  var nWayOut: Option[String] = config("nWayOut")
90
91
92

  /** Generate md5sums for BAMs */
  @Argument(fullName = "generate_nWayOut_md5s", shortName = "", doc = "Generate md5sums for BAMs", required = false, exclusiveOf = "", validation = "")
93
  var generate_nWayOut_md5s: Boolean = config("generate_nWayOut_md5s", default = false)
94
95
96

  /** Do early check of reads against existing consensuses */
  @Argument(fullName = "check_early", shortName = "check_early", doc = "Do early check of reads against existing consensuses", required = false, exclusiveOf = "", validation = "")
97
  var check_early: Boolean = config("check_early", default = false)
98
99
100

  /** Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY.  This option is required in order to pass integration tests. */
  @Argument(fullName = "noPGTag", shortName = "noPG", doc = "Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY.  This option is required in order to pass integration tests.", required = false, exclusiveOf = "", validation = "")
101
  var noPGTag: Boolean = config("noPGTag", default = false)
102
103
104

  /** Keep older PG tags left in the bam header by previous runs of this tool (by default, all these historical tags will be replaced by the latest tag generated in the current run). */
  @Argument(fullName = "keepPGTags", shortName = "keepPG", doc = "Keep older PG tags left in the bam header by previous runs of this tool (by default, all these historical tags will be replaced by the latest tag generated in the current run).", required = false, exclusiveOf = "", validation = "")
105
  var keepPGTags: Boolean = config("keepPGTags", default = false)
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123

  /** Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY */
  @Output(fullName = "indelsFileForDebugging", shortName = "indels", doc = "Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY", required = false, exclusiveOf = "", validation = "")
  @Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction])
  var indelsFileForDebugging: File = _

  /** print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY */
  @Output(fullName = "statisticsFileForDebugging", shortName = "stats", doc = "print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required = false, exclusiveOf = "", validation = "")
  @Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction])
  var statisticsFileForDebugging: File = _

  /** print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY */
  @Output(fullName = "SNPsFileForDebugging", shortName = "snps", doc = "print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required = false, exclusiveOf = "", validation = "")
  @Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction])
  var SNPsFileForDebugging: File = _

  /** Filter out reads with CIGAR containing the N operator, instead of failing with an error */
  @Argument(fullName = "filter_reads_with_N_cigar", shortName = "filterRNC", doc = "Filter out reads with CIGAR containing the N operator, instead of failing with an error", required = false, exclusiveOf = "", validation = "")
124
  var filter_reads_with_N_cigar: Boolean = config("filter_reads_with_N_cigar", default = false)
125
126
127

  /** Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error */
  @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error", required = false, exclusiveOf = "", validation = "")
128
  var filter_mismatching_base_and_quals: Boolean = config("filter_mismatching_base_and_quals", default = false)
129
130
131

  /** Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error */
  @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error", required = false, exclusiveOf = "", validation = "")
132
  var filter_bases_not_stored: Boolean = config("filter_bases_not_stored", default = false)
133
134
135

  override def freezeFieldValues() {
    super.freezeFieldValues()
136
    deps ++= knownAlleles.filter(orig => orig != null && (!orig.getName.endsWith(".list"))).map(orig => VcfUtils.getVcfIndexFile(orig))
137
138
    if (out != null && !org.broadinstitute.gatk.utils.io.IOUtils.isSpecialFile(out))
      if (!disable_bam_indexing)
139
        outputFiles :+= new File(out.getPath.stripSuffix(".bam") + ".bai")
140
141
142
    if (out != null && !org.broadinstitute.gatk.utils.io.IOUtils.isSpecialFile(out))
      if (generate_md5)
        outMD5 = new File(out.getPath + ".md5")
143
  }
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170

  override def cmdLine = super.cmdLine +
    repeat("-known", knownAlleles, formatPrefix = TaggedFile.formatCommandLineParameter, spaceSeparated = true, escape = true, format = "%s") +
    required("-targetIntervals", targetIntervals, spaceSeparated = true, escape = true, format = "%s") +
    optional("-LOD", LODThresholdForCleaning, spaceSeparated = true, escape = true, format = LODThresholdForCleaningFormat) +
    optional("-o", out, spaceSeparated = true, escape = true, format = "%s") +
    optional("-model", consensusDeterminationModel, spaceSeparated = true, escape = true, format = "%s") +
    optional("-entropy", entropyThreshold, spaceSeparated = true, escape = true, format = entropyThresholdFormat) +
    optional("-maxInMemory", maxReadsInMemory, spaceSeparated = true, escape = true, format = "%s") +
    optional("-maxIsize", maxIsizeForMovement, spaceSeparated = true, escape = true, format = "%s") +
    optional("-maxPosMove", maxPositionalMoveAllowed, spaceSeparated = true, escape = true, format = "%s") +
    optional("-maxConsensuses", maxConsensuses, spaceSeparated = true, escape = true, format = "%s") +
    optional("-greedy", maxReadsForConsensuses, spaceSeparated = true, escape = true, format = "%s") +
    optional("-maxReads", maxReadsForRealignment, spaceSeparated = true, escape = true, format = "%s") +
    conditional(noOriginalAlignmentTags, "-noTags", escape = true, format = "%s") +
    optional("-nWayOut", nWayOut, spaceSeparated = true, escape = true, format = "%s") +
    conditional(generate_nWayOut_md5s, "--generate_nWayOut_md5s", escape = true, format = "%s") +
    conditional(check_early, "-check_early", escape = true, format = "%s") +
    conditional(noPGTag, "-noPG", escape = true, format = "%s") +
    conditional(keepPGTags, "-keepPG", escape = true, format = "%s") +
    optional("-indels", indelsFileForDebugging, spaceSeparated = true, escape = true, format = "%s") +
    optional("-stats", statisticsFileForDebugging, spaceSeparated = true, escape = true, format = "%s") +
    optional("-snps", SNPsFileForDebugging, spaceSeparated = true, escape = true, format = "%s") +
    conditional(filter_reads_with_N_cigar, "-filterRNC", escape = true, format = "%s") +
    conditional(filter_mismatching_base_and_quals, "-filterMBQ", escape = true, format = "%s") +
    conditional(filter_bases_not_stored, "-filterNoBases", escape = true, format = "%s")
}
Peter van 't Hof's avatar
Peter van 't Hof committed
171
172
173
174
175
176
177
178
179
180

object IndelRealigner {
  def apply(root: Configurable, input: File, targetIntervals: File, outputDir: File): IndelRealigner = {
    val ir = new IndelRealigner(root)
    ir.input_file :+= input
    ir.targetIntervals = targetIntervals
    ir.out = new File(outputDir, input.getName.stripSuffix(".bam") + ".realign.bam")
    ir
  }
}