Toucan.scala 9.59 KB
Newer Older
bow's avatar
bow committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Sander Bollen's avatar
Sander Bollen committed
16 17
package nl.lumc.sasc.biopet.pipelines.toucan

18 19
import java.io.File

Peter van 't Hof's avatar
Peter van 't Hof committed
20 21
import nl.lumc.sasc.biopet.core._
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
Sander Bollen's avatar
Sander Bollen committed
22
import nl.lumc.sasc.biopet.extensions.bcftools.BcftoolsView
Peter van 't Hof's avatar
Peter van 't Hof committed
23 24 25 26 27
import nl.lumc.sasc.biopet.extensions.bedtools.{ BedtoolsIntersect, BedtoolsMerge }
import nl.lumc.sasc.biopet.extensions.gatk.{ CatVariants, SelectVariants }
import nl.lumc.sasc.biopet.extensions.manwe.{ ManweAnnotateVcf, ManweSamplesImport }
import nl.lumc.sasc.biopet.extensions.tools.{ GvcfToBed, VcfWithVcf, VepNormalizer }
import nl.lumc.sasc.biopet.extensions.{ Bgzip, Ln, VariantEffectPredictor }
Peter van 't Hof's avatar
Peter van 't Hof committed
28
import nl.lumc.sasc.biopet.utils.VcfUtils
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import nl.lumc.sasc.biopet.utils.config.Configurable
30
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
Sander Bollen's avatar
Sander Bollen committed
31 32 33
import org.broadinstitute.gatk.queue.QScript

/**
Peter van 't Hof's avatar
Peter van 't Hof committed
34 35
 * Pipeline to annotate a vcf file with VEP
 *
Sander Bollen's avatar
Sander Bollen committed
36 37
 * Created by ahbbollen on 15-1-15.
 */
Sander Bollen's avatar
Sander Bollen committed
38
class Toucan(val root: Configurable) extends QScript with BiopetQScript with SummaryQScript with Reference {
Sander Bollen's avatar
Sander Bollen committed
39 40 41
  def this() = this(null)

  @Input(doc = "Input VCF file", shortName = "Input", required = true)
42
  var inputVcf: File = _
Sander Bollen's avatar
Sander Bollen committed
43

Peter van 't Hof's avatar
Peter van 't Hof committed
44
  @Input(doc = "Input GVCF file", shortName = "gvcf", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
45 46
  var inputGvcf: Option[File] = None

47 48
  lazy val gonlVcfFile: Option[File] = config("gonl_vcf")
  lazy val exacVcfFile: Option[File] = config("exac_vcf")
49
  lazy val doVarda: Boolean = config("use_varda", default = false)
50

Sander Bollen's avatar
Sander Bollen committed
51
  var sampleIds: List[String] = Nil
Sander Bollen's avatar
Sander Bollen committed
52
  def init(): Unit = {
53 54
    require(inputVcf != null, "No Input vcf given")
    inputFiles :+= new InputFile(inputVcf)
Sander Bollen's avatar
Sander Bollen committed
55 56
    sampleIds = root match {
      case m: MultiSampleQScript => m.samples.keys.toList
57
      case null                  => VcfUtils.getSampleIds(inputVcf)
Sander Bollen's avatar
Sander Bollen committed
58 59
      case s: SampleLibraryTag   => s.sampleId.toList
      case _                     => throw new IllegalArgumentException("You don't have any samples")
Sander Bollen's avatar
Sander Bollen committed
60
    }
Sander Bollen's avatar
Sander Bollen committed
61 62
  }

63
  override def defaults = Map(
Sander Bollen's avatar
Sander Bollen committed
64
    "varianteffectpredictor" -> Map("everything" -> true, "failed" -> 1, "allow_non_variant" -> true)
65
  )
Sander Bollen's avatar
Sander Bollen committed
66 67

  def biopetScript(): Unit = {
68
    val useVcf: File = if (doVarda) {
Peter van 't Hof's avatar
Peter van 't Hof committed
69
      inputGvcf match {
70
        case Some(s) => varda(inputVcf, s)
71 72
        case _       => throw new IllegalArgumentException("You have not specified a GVCF file")
      }
73
    } else inputVcf
74 75 76 77 78

    val outputVcfFiles = BedRecordList.fromReference(referenceFasta())
      .scatter(config("bin_size", default = 50000000))
      .allRecords.map { region =>

Peter van 't Hof's avatar
Peter van 't Hof committed
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
        val chunkName = s"${region.chr}-${region.start}-${region.end}"
        val chunkDir = new File(outputDir, "chunk" + File.separator + chunkName)
        val sv = new SelectVariants(this)
        sv.inputFiles :+= useVcf
        sv.outputFile = new File(chunkDir, chunkName + ".vcf.gz")
        sv.isIntermediate = true
        add(sv)

        val vep = new VariantEffectPredictor(this)
        vep.input = sv.outputFile
        vep.output = new File(chunkDir, chunkName + ".vep.vcf")
        vep.isIntermediate = true
        add(vep)
        addSummarizable(vep, "variant_effect_predictor")

        val normalizer = new VepNormalizer(this)
        normalizer.inputVCF = vep.output
        normalizer.outputVcf = new File(chunkDir, chunkName + ".normalized.vcf.gz")
97
        normalizer.isIntermediate = true
Peter van 't Hof's avatar
Peter van 't Hof committed
98 99 100 101 102 103 104 105 106 107 108
        add(normalizer)

        var outputFile = normalizer.outputVcf

        gonlVcfFile match {
          case Some(gonlFile) =>
            val vcfWithVcf = new VcfWithVcf(this)
            vcfWithVcf.input = outputFile
            vcfWithVcf.secondaryVcf = gonlFile
            vcfWithVcf.output = swapExt(chunkDir, normalizer.outputVcf, ".vcf.gz", ".gonl.vcf.gz")
            vcfWithVcf.fields ::= ("AF", "AF_gonl", None)
109
            vcfWithVcf.isIntermediate = true
Peter van 't Hof's avatar
Peter van 't Hof committed
110 111 112 113 114 115 116 117 118 119 120 121
            add(vcfWithVcf)
            outputFile = vcfWithVcf.output
          case _ =>
        }

        exacVcfFile match {
          case Some(exacFile) =>
            val vcfWithVcf = new VcfWithVcf(this)
            vcfWithVcf.input = outputFile
            vcfWithVcf.secondaryVcf = exacFile
            vcfWithVcf.output = swapExt(chunkDir, outputFile, ".vcf.gz", ".exac.vcf.gz")
            vcfWithVcf.fields ::= ("AF", "AF_exac", None)
122
            vcfWithVcf.isIntermediate = true
Peter van 't Hof's avatar
Peter van 't Hof committed
123 124 125 126 127 128
            add(vcfWithVcf)
            outputFile = vcfWithVcf.output
          case _ =>
        }

        outputFile
129 130 131 132 133
      }

    val cv = new CatVariants(this)
    cv.inputFiles = outputVcfFiles.toList
    cv.outputFile = (gonlVcfFile, exacVcfFile) match {
134 135 136 137
      case (Some(_), Some(_)) => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.gonl.exac.vcf.gz")
      case (Some(_), _)       => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.gonl.vcf.gz")
      case (_, Some(_))       => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.exac.vcf.gz")
      case _                  => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.vcf.gz")
Peter van 't Hof's avatar
Peter van 't Hof committed
138
    }
139
    add(cv)
140 141

    addSummaryJobs()
Sander Bollen's avatar
Sander Bollen committed
142 143
  }

144 145
  /**
   * Performs the varda import and activate for one sample
146 147
   *
   * @param sampleID the sampleID to be used
148 149 150 151 152 153 154 155
   * @param inputVcf the input VCF
   * @param gVCF the gVCF for coverage
   * @param annotation: ManweDownloadAnnotateVcf object of annotated vcf
   * @return
   */
  def importAndActivateSample(sampleID: String, inputVcf: File,
                              gVCF: File, annotation: ManweAnnotateVcf): ManweActivateAfterAnnotImport = {

Sander Bollen's avatar
Sander Bollen committed
156 157
    val minGQ: Int = config("minimum_genome_quality", default = 20, namespace = "manwe")
    val isPublic: Boolean = config("varda_is_public", default = true, namespace = "manwe")
158 159 160 161 162 163 164 165 166

    val bedTrack = new GvcfToBed(this)
    bedTrack.inputVcf = gVCF
    bedTrack.outputBed = swapExt(outputDir, gVCF, ".vcf.gz", s""".$sampleID.bed""")
    bedTrack.minQuality = minGQ
    bedTrack.isIntermediate = true
    bedTrack.sample = Some(sampleID)
    add(bedTrack)

167 168 169 170 171 172
    val mergedBed = new BedtoolsMerge(this)
    mergedBed.input = bedTrack.outputBed
    mergedBed.dist = 5
    mergedBed.output = swapExt(outputDir, bedTrack.outputBed, ".bed", ".merged.bed")
    add(mergedBed)

173
    val bgzippedBed = new Bgzip(this)
174 175
    bgzippedBed.input = List(mergedBed.output)
    bgzippedBed.output = swapExt(outputDir, mergedBed.output, ".bed", ".bed.gz")
176 177 178
    add(bgzippedBed)

    val singleVcf = new BcftoolsView(this)
179 180
    singleVcf.input = inputVcf
    singleVcf.output = swapExt(outputDir, inputVcf, ".vcf.gz", s""".$sampleID.vcf.gz""")
181 182 183 184 185 186 187 188
    singleVcf.samples = List(sampleID)
    singleVcf.minAC = Some(1)
    singleVcf.isIntermediate = true
    add(singleVcf)

    val intersected = new BedtoolsIntersect(this)
    intersected.input = singleVcf.output
    intersected.intersectFile = bgzippedBed.output
189
    intersected.output = swapExt(outputDir, singleVcf.output, ".vcf.gz", ".intersected.vcf")
190 191
    add(intersected)

192 193 194 195 196
    val bgzippedIntersect = new Bgzip(this)
    bgzippedIntersect.input = List(intersected.output)
    bgzippedIntersect.output = swapExt(outputDir, intersected.output, ".vcf", ".vcf.gz")
    add(bgzippedIntersect)

197
    val imported = new ManweSamplesImport(this)
198
    imported.vcfs = List(bgzippedIntersect.output)
199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
    imported.beds = List(bgzippedBed.output)
    imported.name = Some(sampleID)
    imported.public = isPublic
    imported.waitToComplete = false
    imported.isIntermediate = true
    imported.output = swapExt(outputDir, intersected.output, ".vcf.gz", ".manwe.import")
    add(imported)

    val active = new ManweActivateAfterAnnotImport(this, annotation, imported)
    active.output = swapExt(outputDir, imported.output, ".import", ".activated")
    add(active)
    active

  }

Sander Bollen's avatar
Sander Bollen committed
214 215
  /**
   * Perform varda analysis
216 217
   *
   * @param vcf input vcf
218
   * @param gVcf The gVCF to be used for coverage calculations
Sander Bollen's avatar
Sander Bollen committed
219 220
   * @return return vcf
   */
221
  def varda(vcf: File, gVcf: File): File = {
222

Sander Bollen's avatar
Sander Bollen committed
223
    val annotationQueries: List[String] = config("annotation_queries", default = List("GLOBAL *"), namespace = "manwe")
224
    //TODO: add groups!!! Need sample-specific group tags for this
Sander Bollen's avatar
Sander Bollen committed
225 226

    val annotate = new ManweAnnotateVcf(this)
227
    annotate.vcf = vcf
228 229 230
    if (annotationQueries.nonEmpty) {
      annotate.queries = annotationQueries
    }
Sander Bollen's avatar
Sander Bollen committed
231
    annotate.waitToComplete = true
232 233
    annotate.output = swapExt(outputDir, vcf, ".vcf.gz", ".manwe.annot")
    annotate.isIntermediate = true
Sander Bollen's avatar
Sander Bollen committed
234 235 236
    add(annotate)

    val annotatedVcf = new ManweDownloadAfterAnnotate(this, annotate)
237
    annotatedVcf.output = swapExt(outputDir, annotate.output, ".manwe.annot", "manwe.annot.vcf.gz")
238
    add(annotatedVcf)
Sander Bollen's avatar
Sander Bollen committed
239

240
    val activates = sampleIds map { x => importAndActivateSample(x, vcf, gVcf, annotate) }
Sander Bollen's avatar
Sander Bollen committed
241 242

    val finalLn = new Ln(this)
Sander Bollen's avatar
Sander Bollen committed
243
    activates.foreach(x => finalLn.deps :+= x.output)
Sander Bollen's avatar
Sander Bollen committed
244
    finalLn.input = annotatedVcf.output
245
    finalLn.output = swapExt(outputDir, annotatedVcf.output, "manwe.annot.vcf.gz", ".varda_annotated.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
246
    finalLn.relative = true
Sander Bollen's avatar
Sander Bollen committed
247
    add(finalLn)
Sander Bollen's avatar
Sander Bollen committed
248 249 250 251

    finalLn.output
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
252 253 254 255 256
  def summaryFile = new File(outputDir, "Toucan.summary.json")

  def summaryFiles = Map()

  def summarySettings = Map()
Sander Bollen's avatar
Sander Bollen committed
257 258 259
}

object Toucan extends PipelineCommand