Toucan.scala 9.94 KB
Newer Older
bow's avatar
bow committed
1
/**
2 3 4 5 6 7 8 9 10 11 12 13 14 15
  * Biopet is built on top of GATK Queue for building bioinformatic
  * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
  * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
  * should also be able to execute Biopet tools and pipelines.
  *
  * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
  *
  * Contact us at: sasc@lumc.nl
  *
  * A dual licensing mode is applied. The source code within this project that are
  * not part of GATK Queue is freely available for non-commercial use under an AGPL
  * license; For commercial users or users who do not want to follow the AGPL
  * license, please contact us to obtain a separate license.
  */
Sander Bollen's avatar
Sander Bollen committed
16 17
package nl.lumc.sasc.biopet.pipelines.toucan

18 19
import java.io.File

Peter van 't Hof's avatar
Peter van 't Hof committed
20 21
import nl.lumc.sasc.biopet.core._
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
Sander Bollen's avatar
Sander Bollen committed
22
import nl.lumc.sasc.biopet.extensions.bcftools.BcftoolsView
Peter van 't Hof's avatar
Peter van 't Hof committed
23 24 25 26 27
import nl.lumc.sasc.biopet.extensions.bedtools.{ BedtoolsIntersect, BedtoolsMerge }
import nl.lumc.sasc.biopet.extensions.gatk.{ CatVariants, SelectVariants }
import nl.lumc.sasc.biopet.extensions.manwe.{ ManweAnnotateVcf, ManweSamplesImport }
import nl.lumc.sasc.biopet.extensions.tools.{ GvcfToBed, VcfWithVcf, VepNormalizer }
import nl.lumc.sasc.biopet.extensions.{ Bgzip, Ln, VariantEffectPredictor }
Peter van 't Hof's avatar
Peter van 't Hof committed
28
import nl.lumc.sasc.biopet.utils.VcfUtils
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import nl.lumc.sasc.biopet.utils.config.Configurable
30
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
Sander Bollen's avatar
Sander Bollen committed
31 32 33
import org.broadinstitute.gatk.queue.QScript

/**
34 35 36 37
  * Pipeline to annotate a vcf file with VEP
  *
  * Created by ahbbollen on 15-1-15.
  */
Sander Bollen's avatar
Sander Bollen committed
38
class Toucan(val root: Configurable) extends QScript with BiopetQScript with SummaryQScript with Reference {
Sander Bollen's avatar
Sander Bollen committed
39 40 41
  def this() = this(null)

  @Input(doc = "Input VCF file", shortName = "Input", required = true)
42
  var inputVcf: File = _
Sander Bollen's avatar
Sander Bollen committed
43

Peter van 't Hof's avatar
Peter van 't Hof committed
44
  @Input(doc = "Input GVCF file", shortName = "gvcf", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
45 46
  var inputGvcf: Option[File] = None

47 48
  var outputVcf: Option[File] = None

Sander Bollen's avatar
Sander Bollen committed
49
  def sampleInfo: Map[String, Map[String, Any]] = root match {
Sander Bollen's avatar
Sander Bollen committed
50
    case m: MultiSampleQScript => m.samples.map { case (sampleId, sample) => sampleId -> sample.sampleTags }
51
    case null                  => VcfUtils.getSampleIds(inputVcf).map(x => x -> Map[String, Any]()).toMap
Sander Bollen's avatar
Sander Bollen committed
52 53
    case s: SampleLibraryTag   => s.sampleId.map(x => x -> Map[String, Any]()).toMap
    case _                     => throw new IllegalArgumentException("")
Sander Bollen's avatar
Sander Bollen committed
54 55
  }

56 57
  lazy val gonlVcfFile: Option[File] = config("gonl_vcf")
  lazy val exacVcfFile: Option[File] = config("exac_vcf")
58
  lazy val doVarda: Boolean = config("use_varda", default = false)
59

Sander Bollen's avatar
Sander Bollen committed
60
  def init(): Unit = {
61 62
    require(inputVcf != null, "No Input vcf given")
    inputFiles :+= new InputFile(inputVcf)
Sander Bollen's avatar
Sander Bollen committed
63 64
  }

65
  override def defaults = Map(
Sander Bollen's avatar
Sander Bollen committed
66
    "varianteffectpredictor" -> Map("everything" -> true, "failed" -> 1, "allow_non_variant" -> true)
67
  )
Sander Bollen's avatar
Sander Bollen committed
68 69

  def biopetScript(): Unit = {
70
    val useVcf: File = if (doVarda) {
Peter van 't Hof's avatar
Peter van 't Hof committed
71
      inputGvcf match {
72
        case Some(s) => varda(inputVcf, s)
73 74
        case _       => throw new IllegalArgumentException("You have not specified a GVCF file")
      }
75
    } else inputVcf
76 77 78 79 80

    val outputVcfFiles = BedRecordList.fromReference(referenceFasta())
      .scatter(config("bin_size", default = 50000000))
      .allRecords.map { region =>

81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
      val chunkName = s"${region.chr}-${region.start}-${region.end}"
      val chunkDir = new File(outputDir, "chunk" + File.separator + chunkName)
      val sv = new SelectVariants(this)
      sv.inputFiles :+= useVcf
      sv.outputFile = new File(chunkDir, chunkName + ".vcf.gz")
      sv.isIntermediate = true
      add(sv)

      val vep = new VariantEffectPredictor(this)
      vep.input = sv.outputFile
      vep.output = new File(chunkDir, chunkName + ".vep.vcf")
      vep.isIntermediate = true
      add(vep)
      addSummarizable(vep, "variant_effect_predictor")

      val normalizer = new VepNormalizer(this)
      normalizer.inputVCF = vep.output
      normalizer.outputVcf = new File(chunkDir, chunkName + ".normalized.vcf.gz")
      normalizer.isIntermediate = true
      add(normalizer)

      var outputFile = normalizer.outputVcf

      gonlVcfFile match {
        case Some(gonlFile) =>
          val vcfWithVcf = new VcfWithVcf(this)
          vcfWithVcf.input = outputFile
          vcfWithVcf.secondaryVcf = gonlFile
          vcfWithVcf.output = swapExt(chunkDir, normalizer.outputVcf, ".vcf.gz", ".gonl.vcf.gz")
          vcfWithVcf.fields ::= ("AF", "AF_gonl", None)
          vcfWithVcf.isIntermediate = true
          add(vcfWithVcf)
          outputFile = vcfWithVcf.output
        case _ =>
      }

      exacVcfFile match {
        case Some(exacFile) =>
          val vcfWithVcf = new VcfWithVcf(this)
          vcfWithVcf.input = outputFile
          vcfWithVcf.secondaryVcf = exacFile
          vcfWithVcf.output = swapExt(chunkDir, outputFile, ".vcf.gz", ".exac.vcf.gz")
          vcfWithVcf.fields ::= ("AF", "AF_exac", None)
          vcfWithVcf.isIntermediate = true
          add(vcfWithVcf)
          outputFile = vcfWithVcf.output
        case _ =>
128 129
      }

130
      outputFile
Peter van 't Hof's avatar
Peter van 't Hof committed
131 132
    }

133 134 135
    val cv = new CatVariants(this)
    cv.inputFiles = outputVcfFiles.toList
    cv.outputFile = (gonlVcfFile, exacVcfFile) match {
136 137 138 139
      case (Some(_), Some(_)) => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.gonl.exac.vcf.gz")
      case (Some(_), _)       => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.gonl.vcf.gz")
      case (_, Some(_))       => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.exac.vcf.gz")
      case _                  => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.vcf.gz")
Peter van 't Hof's avatar
Peter van 't Hof committed
140
    }
141
    add(cv)
142 143

    addSummaryJobs()
Sander Bollen's avatar
Sander Bollen committed
144 145
  }

146
  /**
147 148 149 150 151 152 153 154
    * Performs the varda import and activate for one sample
    *
    * @param sampleID the sampleID to be used
    * @param inputVcf the input VCF
    * @param gVCF the gVCF for coverage
    * @param annotation: ManweDownloadAnnotateVcf object of annotated vcf
    * @return
    */
Sander Bollen's avatar
Sander Bollen committed
155
  def importAndActivateSample(sampleID: String, sampleGroups: List[String], inputVcf: File,
156 157
                              gVCF: File, annotation: ManweAnnotateVcf): ManweActivateAfterAnnotImport = {

Sander Bollen's avatar
Sander Bollen committed
158 159
    val minGQ: Int = config("minimum_genome_quality", default = 20, namespace = "manwe")
    val isPublic: Boolean = config("varda_is_public", default = true, namespace = "manwe")
160 161 162 163 164 165 166 167 168

    val bedTrack = new GvcfToBed(this)
    bedTrack.inputVcf = gVCF
    bedTrack.outputBed = swapExt(outputDir, gVCF, ".vcf.gz", s""".$sampleID.bed""")
    bedTrack.minQuality = minGQ
    bedTrack.isIntermediate = true
    bedTrack.sample = Some(sampleID)
    add(bedTrack)

169 170 171 172 173 174
    val mergedBed = new BedtoolsMerge(this)
    mergedBed.input = bedTrack.outputBed
    mergedBed.dist = 5
    mergedBed.output = swapExt(outputDir, bedTrack.outputBed, ".bed", ".merged.bed")
    add(mergedBed)

175
    val bgzippedBed = new Bgzip(this)
176 177
    bgzippedBed.input = List(mergedBed.output)
    bgzippedBed.output = swapExt(outputDir, mergedBed.output, ".bed", ".bed.gz")
178 179 180
    add(bgzippedBed)

    val singleVcf = new BcftoolsView(this)
181 182
    singleVcf.input = inputVcf
    singleVcf.output = swapExt(outputDir, inputVcf, ".vcf.gz", s""".$sampleID.vcf.gz""")
183 184 185 186 187 188 189 190
    singleVcf.samples = List(sampleID)
    singleVcf.minAC = Some(1)
    singleVcf.isIntermediate = true
    add(singleVcf)

    val intersected = new BedtoolsIntersect(this)
    intersected.input = singleVcf.output
    intersected.intersectFile = bgzippedBed.output
191
    intersected.output = swapExt(outputDir, singleVcf.output, ".vcf.gz", ".intersected.vcf")
192 193
    add(intersected)

194 195 196 197 198
    val bgzippedIntersect = new Bgzip(this)
    bgzippedIntersect.input = List(intersected.output)
    bgzippedIntersect.output = swapExt(outputDir, intersected.output, ".vcf", ".vcf.gz")
    add(bgzippedIntersect)

199
    val imported = new ManweSamplesImport(this)
200
    imported.vcfs = List(bgzippedIntersect.output)
201 202 203
    imported.beds = List(bgzippedBed.output)
    imported.name = Some(sampleID)
    imported.public = isPublic
Sander Bollen's avatar
Sander Bollen committed
204
    imported.group = sampleGroups
205 206 207 208 209 210 211 212 213 214 215 216
    imported.waitToComplete = false
    imported.isIntermediate = true
    imported.output = swapExt(outputDir, intersected.output, ".vcf.gz", ".manwe.import")
    add(imported)

    val active = new ManweActivateAfterAnnotImport(this, annotation, imported)
    active.output = swapExt(outputDir, imported.output, ".import", ".activated")
    add(active)
    active

  }

Sander Bollen's avatar
Sander Bollen committed
217
  /**
218 219 220 221 222 223
    * Perform varda analysis
    *
    * @param vcf input vcf
    * @param gVcf The gVCF to be used for coverage calculations
    * @return return vcf
    */
224
  def varda(vcf: File, gVcf: File): File = {
225

Sander Bollen's avatar
Sander Bollen committed
226
    val annotationQueries: List[String] = config("annotation_queries", default = List("GLOBAL *"), namespace = "manwe")
Sander Bollen's avatar
Sander Bollen committed
227 228

    val annotate = new ManweAnnotateVcf(this)
229
    annotate.vcf = vcf
230 231 232
    if (annotationQueries.nonEmpty) {
      annotate.queries = annotationQueries
    }
Sander Bollen's avatar
Sander Bollen committed
233
    annotate.waitToComplete = true
234 235
    annotate.output = swapExt(outputDir, vcf, ".vcf.gz", ".manwe.annot")
    annotate.isIntermediate = true
Sander Bollen's avatar
Sander Bollen committed
236 237 238
    add(annotate)

    val annotatedVcf = new ManweDownloadAfterAnnotate(this, annotate)
239
    annotatedVcf.output = swapExt(outputDir, annotate.output, ".manwe.annot", "manwe.annot.vcf.gz")
240
    add(annotatedVcf)
Sander Bollen's avatar
Sander Bollen committed
241

Sander Bollen's avatar
Sander Bollen committed
242 243 244 245 246 247 248 249
    val activates = sampleInfo map { x =>
      val sampleGroup = x._2.getOrElse("varda_group", Nil) match {
        case x: List[String] => x
        case Nil             => Nil
        case _               => throw new IllegalArgumentException("Sample tag 'varda_group' is not a list of strings")
      }
      importAndActivateSample(x._1, sampleGroup, vcf, gVcf, annotate)
    }
Sander Bollen's avatar
Sander Bollen committed
250 251

    val finalLn = new Ln(this)
Sander Bollen's avatar
Sander Bollen committed
252
    activates.foreach(x => finalLn.deps :+= x.output)
Sander Bollen's avatar
Sander Bollen committed
253
    finalLn.input = annotatedVcf.output
254
    finalLn.output = swapExt(outputDir, annotatedVcf.output, "manwe.annot.vcf.gz", ".varda_annotated.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
255
    finalLn.relative = true
Sander Bollen's avatar
Sander Bollen committed
256
    add(finalLn)
Sander Bollen's avatar
Sander Bollen committed
257 258 259 260

    finalLn.output
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
261 262 263 264 265
  def summaryFile = new File(outputDir, "Toucan.summary.json")

  def summaryFiles = Map()

  def summarySettings = Map()
Sander Bollen's avatar
Sander Bollen committed
266 267 268
}

object Toucan extends PipelineCommand