GatkPipeline.scala 10.3 KB
Newer Older
1
2
package nl.lumc.sasc.biopet.pipelines.gatk

Peter van 't Hof's avatar
Peter van 't Hof committed
3
4
5
import nl.lumc.sasc.biopet.core.MultiSampleQScript
import nl.lumc.sasc.biopet.core.PipelineCommand
import nl.lumc.sasc.biopet.core.config.Configurable
Peter van 't Hof's avatar
Peter van 't Hof committed
6
import htsjdk.samtools.SamReaderFactory
7
import scala.collection.JavaConversions._
8
import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
9
import nl.lumc.sasc.biopet.extensions.gatk.{ CombineVariants, CombineGVCFs }
10
import nl.lumc.sasc.biopet.extensions.picard.AddOrReplaceReadGroups
Peter van 't Hof's avatar
Peter van 't Hof committed
11
import nl.lumc.sasc.biopet.extensions.picard.SamToFastq
12
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
Peter van 't Hof's avatar
Peter van 't Hof committed
13
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
14
import org.broadinstitute.gatk.queue.QScript
bow's avatar
bow committed
15
import org.broadinstitute.gatk.utils.commandline.{ Argument }
16

bow's avatar
bow committed
17
class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScript {
18
  def this() = this(null)
Peter van 't Hof's avatar
Peter van 't Hof committed
19

bow's avatar
bow committed
20
  @Argument(doc = "Only Sample", shortName = "sample", required = false)
21
  val onlySample: List[String] = Nil
bow's avatar
bow committed
22
23

  @Argument(doc = "Skip Genotyping step", shortName = "skipgenotyping", required = false)
24
  var skipGenotyping: Boolean = false
bow's avatar
bow committed
25
26

  @Argument(doc = "Merge gvcfs", shortName = "mergegvcfs", required = false)
27
  var mergeGvcfs: Boolean = false
bow's avatar
bow committed
28

Peter van 't Hof's avatar
Peter van 't Hof committed
29
  @Argument(doc = "Joint variantcalling", shortName = "jointVariantCalling", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
30
  var jointVariantcalling: Boolean = config("joint_variantcalling", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
31

Peter van 't Hof's avatar
Peter van 't Hof committed
32
  @Argument(doc = "Joint genotyping", shortName = "jointGenotyping", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
33
  var jointGenotyping: Boolean = config("joint_genotyping", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
34

Peter van 't Hof's avatar
Peter van 't Hof committed
35
36
37
  var singleSampleCalling = config("single_sample_calling", default = true)
  var reference: File = config("reference", required = true)
  var dbsnp: File = config("dbsnp")
38
39
  var gvcfFiles: List[File] = Nil
  var finalBamFiles: List[File] = Nil
Peter van 't Hof's avatar
Peter van 't Hof committed
40
  var useAllelesOption: Boolean = config("use_alleles_option", default = false)
bow's avatar
bow committed
41

Peter van 't Hof's avatar
Peter van 't Hof committed
42
43
  class LibraryOutput extends AbstractLibraryOutput {
    var mappedBamFile: File = _
44
    var variantcalling: GatkVariantcalling.ScriptOutput = _
Peter van 't Hof's avatar
Peter van 't Hof committed
45
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
46

47
  class SampleOutput extends AbstractSampleOutput {
48
    var variantcalling: GatkVariantcalling.ScriptOutput = _
Peter van 't Hof's avatar
Peter van 't Hof committed
49
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
50

Peter van 't Hof's avatar
Peter van 't Hof committed
51
  def init() {
52
    if (config.contains("target_bed")) {
Peter van 't Hof's avatar
Peter van 't Hof committed
53
      defaults ++= Map("gatk" -> Map(("intervals" -> config("target_bed").getStringList)))
54
    }
55
    if (config.contains("gvcfFiles"))
bow's avatar
bow committed
56
      for (file <- config("gvcfFiles").getList)
Peter van 't Hof's avatar
Peter van 't Hof committed
57
        gvcfFiles :+= file.toString
58
59
60
    if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
    else if (!outputDir.endsWith("/")) outputDir += "/"
  }
bow's avatar
bow committed
61

62
63
64
  def biopetScript() {
    if (onlySample.isEmpty) {
      runSamplesJobs
Peter van 't Hof's avatar
Peter van 't Hof committed
65

66
67
      //SampleWide jobs
      if (mergeGvcfs && gvcfFiles.size > 0) {
68
        val newFile = outputDir + "merged.gvcf.vcf.gz"
69
        add(CombineGVCFs(this, gvcfFiles, newFile))
70
71
        gvcfFiles = List(newFile)
      }
bow's avatar
bow committed
72

73
      if (!skipGenotyping && gvcfFiles.size > 0) {
74
75
76
77
78
79
80
81
        if (jointGenotyping) {
          val gatkGenotyping = new GatkGenotyping(this)
          gatkGenotyping.inputGvcfs = gvcfFiles
          gatkGenotyping.outputDir = outputDir + "genotyping/"
          gatkGenotyping.init
          gatkGenotyping.biopetScript
          addAll(gatkGenotyping.functions)
          var vcfFile = gatkGenotyping.outputFile
82
83
        }
      } else logger.warn("No gVCFs to genotype")
Peter van 't Hof's avatar
Peter van 't Hof committed
84

85
      if (jointVariantcalling) {
Peter van 't Hof's avatar
Peter van 't Hof committed
86
87
88
89
90
91
        val allBamfiles = for (
          (sampleID, sampleOutput) <- samplesOutput;
          file <- sampleOutput.variantcalling.bamFiles
        ) yield file
        val allRawVcfFiles = for ((sampleID, sampleOutput) <- samplesOutput) yield sampleOutput.variantcalling.rawFilterVcfFile

Peter van 't Hof's avatar
Peter van 't Hof committed
92
        val gatkVariantcalling = new GatkVariantcalling(this) {
Peter van 't Hof's avatar
Peter van 't Hof committed
93
          override protected lazy val configName = "gatkvariantcalling"
Peter van 't Hof's avatar
Peter van 't Hof committed
94
          override def configPath: List[String] = "multisample" :: super.configPath
Peter van 't Hof's avatar
Peter van 't Hof committed
95
        }
96
97
98
99
100
101
102

        if (gatkVariantcalling.useMpileup) {
          val cvRaw = CombineVariants(this, allRawVcfFiles.toList, outputDir + "variantcalling/multisample.raw.vcf.gz")
          add(cvRaw)
          gatkVariantcalling.rawVcfInput = cvRaw.out
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
103
104
        gatkVariantcalling.preProcesBams = Some(false)
        gatkVariantcalling.doublePreProces = Some(false)
Peter van 't Hof's avatar
Peter van 't Hof committed
105
106
107
108
109
110
        gatkVariantcalling.inputBams = allBamfiles.toList
        gatkVariantcalling.outputDir = outputDir + "variantcalling"
        gatkVariantcalling.outputName = "multisample"
        gatkVariantcalling.init
        gatkVariantcalling.biopetScript
        addAll(gatkVariantcalling.functions)
Peter van 't Hof's avatar
Peter van 't Hof committed
111

112
113
        if (config("inputtype", default = "dna").getString != "rna" && config("recalibration", default = false).getBoolean) {
          val recalibration = new GatkVariantRecalibration(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
114
          recalibration.inputVcf = gatkVariantcalling.scriptOutput.finalVcfFile
115
116
117
118
119
          recalibration.bamFiles = finalBamFiles
          recalibration.outputDir = outputDir + "recalibration/"
          recalibration.init
          recalibration.biopetScript
        }
120
      }
121
    } else for (sample <- onlySample) runSingleSampleJobs(sample)
122
  }
bow's avatar
bow committed
123

124
  // Called for each sample
125
126
  def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = {
    val sampleOutput = new SampleOutput
Peter van 't Hof's avatar
Peter van 't Hof committed
127
    var libraryBamfiles: List[File] = List()
128
    val sampleID: String = sampleConfig("ID").toString
129
    sampleOutput.libraries = runLibraryJobs(sampleConfig)
130
    val sampleDir = globalSampleDir + sampleID
131
    for ((libraryID, libraryOutput) <- sampleOutput.libraries) {
132
      libraryBamfiles ++= libraryOutput.variantcalling.bamFiles
133
    }
bow's avatar
bow committed
134

Peter van 't Hof's avatar
Peter van 't Hof committed
135
136
    if (libraryBamfiles.size > 0) {
      finalBamFiles ++= libraryBamfiles
137
      val gatkVariantcalling = new GatkVariantcalling(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
138
      gatkVariantcalling.inputBams = libraryBamfiles
139
      gatkVariantcalling.outputDir = sampleDir + "/variantcalling/"
Peter van 't Hof's avatar
Peter van 't Hof committed
140
      gatkVariantcalling.preProcesBams = Some(false)
141
      if (!singleSampleCalling) {
Peter van 't Hof's avatar
Peter van 't Hof committed
142
143
        gatkVariantcalling.useHaplotypecaller = Some(false)
        gatkVariantcalling.useUnifiedGenotyper = Some(false)
144
      }
145
      gatkVariantcalling.sampleID = sampleID
146
147
148
      gatkVariantcalling.init
      gatkVariantcalling.biopetScript
      addAll(gatkVariantcalling.functions)
149
150
      sampleOutput.variantcalling = gatkVariantcalling.scriptOutput
      gvcfFiles :+= gatkVariantcalling.scriptOutput.gvcfFile
151
    } else logger.warn("No bamfiles for variant calling for sample: " + sampleID)
Peter van 't Hof's avatar
Peter van 't Hof committed
152
    return sampleOutput
153
  }
bow's avatar
bow committed
154

155
  // Called for each run from a sample
Peter van 't Hof's avatar
Peter van 't Hof committed
156
157
  def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput = {
    val libraryOutput = new LibraryOutput
158
159
    val runID: String = runConfig("ID").toString
    val sampleID: String = sampleConfig("ID").toString
160
    val runDir: String = globalSampleDir + sampleID + "/run_" + runID + "/"
161
162
    var inputType = ""
    if (runConfig.contains("inputtype")) inputType = runConfig("inputtype").toString
bow's avatar
bow committed
163
    else inputType = config("inputtype", default = "dna").toString
164
    if (runConfig.contains("R1")) {
Peter van 't Hof's avatar
Peter van 't Hof committed
165
      val mapping = Mapping.loadFromLibraryConfig(this, runConfig, sampleConfig, runDir)
166
      addAll(mapping.functions) // Add functions of mapping to curent function pool
Peter van 't Hof's avatar
Peter van 't Hof committed
167
      libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile")
168
169
170
    } else if (runConfig.contains("bam")) {
      var bamFile = new File(runConfig("bam").toString)
      if (!bamFile.exists) throw new IllegalStateException("Bam in config does not exist, file: " + bamFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
171

Peter van 't Hof's avatar
Peter van 't Hof committed
172
      if (config("bam_to_fastq", default = false).getBoolean) {
Peter van 't Hof's avatar
Peter van 't Hof committed
173
174
        val samToFastq = SamToFastq(this, bamFile, runDir + sampleID + "-" + runID + ".R1.fastq",
          runDir + sampleID + "-" + runID + ".R2.fastq")
sajvanderzeeuw's avatar
sajvanderzeeuw committed
175
        add(samToFastq, isIntermediate = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
176
177
178
179
180
181
182
183
184
        val mapping = Mapping.loadFromLibraryConfig(this, runConfig, sampleConfig, runDir, startJobs = false)
        mapping.input_R1 = samToFastq.fastqR1
        mapping.input_R2 = samToFastq.fastqR2
        mapping.init
        mapping.biopetScript
        addAll(mapping.functions) // Add functions of mapping to curent function pool
        libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile")
      } else {
        var readGroupOke = true
Peter van 't Hof's avatar
Peter van 't Hof committed
185
        val inputSam = SamReaderFactory.makeDefault.open(bamFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
        val header = inputSam.getFileHeader.getReadGroups
        for (readGroup <- inputSam.getFileHeader.getReadGroups) {
          if (readGroup.getSample != sampleID) logger.warn("Sample ID readgroup in bam file is not the same")
          if (readGroup.getLibrary != runID) logger.warn("Library ID readgroup in bam file is not the same")
          if (readGroup.getSample != sampleID || readGroup.getLibrary != runID) readGroupOke = false
        }
        inputSam.close

        if (!readGroupOke) {
          if (config("correct_readgroups", default = false)) {
            logger.info("Correcting readgroups, file:" + bamFile)
            val aorrg = AddOrReplaceReadGroups(this, bamFile, new File(runDir + sampleID + "-" + runID + ".bam"))
            aorrg.RGID = sampleID + "-" + runID
            aorrg.RGLB = runID
            aorrg.RGSM = sampleID
            if (runConfig.contains("PL")) aorrg.RGPL = runConfig("PL").toString
            else aorrg.RGPL = "illumina"
            if (runConfig.contains("PU")) aorrg.RGPU = runConfig("PU").toString
            else aorrg.RGPU = "na"
            if (runConfig.contains("CN")) aorrg.RGCN = runConfig("CN").toString
            add(aorrg, isIntermediate = true)
            bamFile = aorrg.output
Peter van 't Hof's avatar
Peter van 't Hof committed
208
209
          } else throw new IllegalStateException("Readgroup sample and/or library of input bamfile is not correct, file: " + bamFile +
            "\nPossible to set 'correct_readgroups' to true on config to automatic fix this")
Peter van 't Hof's avatar
Peter van 't Hof committed
210
211
212
213
        }
        addAll(BamMetrics(this, bamFile, runDir + "metrics/").functions)

        libraryOutput.mappedBamFile = bamFile
214
215
      }
    } else logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig)
Peter van 't Hof's avatar
Peter van 't Hof committed
216

Peter van 't Hof's avatar
Peter van 't Hof committed
217
    val gatkVariantcalling = new GatkVariantcalling(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
218
    gatkVariantcalling.inputBams = List(libraryOutput.mappedBamFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
219
220
221
    gatkVariantcalling.outputDir = runDir
    gatkVariantcalling.variantcalling = config("library_variantcalling", default = false)
    gatkVariantcalling.preProcesBams = true
222
    gatkVariantcalling.sampleID = sampleID
Peter van 't Hof's avatar
Peter van 't Hof committed
223
224
225
    gatkVariantcalling.init
    gatkVariantcalling.biopetScript
    addAll(gatkVariantcalling.functions)
226
    libraryOutput.variantcalling = gatkVariantcalling.scriptOutput
Peter van 't Hof's avatar
Peter van 't Hof committed
227

Peter van 't Hof's avatar
Peter van 't Hof committed
228
    return libraryOutput
229
230
231
  }
}

232
object GatkPipeline extends PipelineCommand