Toucan.scala 10.6 KB
Newer Older
bow's avatar
bow committed
1
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
2 3 4 5 6 7 8 9 10 11 12 13 14 15
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Sander Bollen's avatar
Sander Bollen committed
16 17
package nl.lumc.sasc.biopet.pipelines.toucan

18 19
import java.io.File

20
import htsjdk.samtools.reference.FastaSequenceFile
Peter van 't Hof's avatar
Peter van 't Hof committed
21 22
import nl.lumc.sasc.biopet.core._
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
Sander Bollen's avatar
Sander Bollen committed
23
import nl.lumc.sasc.biopet.extensions.bcftools.BcftoolsView
24 25 26 27 28
import nl.lumc.sasc.biopet.extensions.bedtools.{BedtoolsIntersect, BedtoolsMerge}
import nl.lumc.sasc.biopet.extensions.gatk.{CatVariants, SelectVariants}
import nl.lumc.sasc.biopet.extensions.manwe.{ManweAnnotateVcf, ManweSamplesImport}
import nl.lumc.sasc.biopet.extensions.tools.{GvcfToBed, VcfWithVcf, VepNormalizer}
import nl.lumc.sasc.biopet.extensions.{Bgzip, Ln, VariantEffectPredictor}
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import nl.lumc.sasc.biopet.utils.VcfUtils
Peter van 't Hof's avatar
Peter van 't Hof committed
30
import nl.lumc.sasc.biopet.utils.config.Configurable
31
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
Sander Bollen's avatar
Sander Bollen committed
32 33 34
import org.broadinstitute.gatk.queue.QScript

/**
Peter van 't Hof's avatar
Peter van 't Hof committed
35 36 37 38
 * Pipeline to annotate a vcf file with VEP
 *
 * Created by ahbbollen on 15-1-15.
 */
Sander Bollen's avatar
Sander Bollen committed
39
class Toucan(val root: Configurable) extends QScript with BiopetQScript with SummaryQScript with Reference {
Sander Bollen's avatar
Sander Bollen committed
40 41 42
  def this() = this(null)

  @Input(doc = "Input VCF file", shortName = "Input", required = true)
43
  var inputVcf: File = _
Sander Bollen's avatar
Sander Bollen committed
44

Peter van 't Hof's avatar
Peter van 't Hof committed
45
  @Input(doc = "Input GVCF file", shortName = "gvcf", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
46 47
  var inputGvcf: Option[File] = None

48 49
  var outputVcf: Option[File] = None

50 51 52 53 54 55 56 57 58
  lazy val minScatterGenomeSize: Long = config("min_scatter_genome_size", default = 75000000)

  lazy val enableScatter: Boolean = config("enable_scater", default = {
    val ref = new FastaSequenceFile(referenceFasta(), true)
    val refLenght = ref.getSequenceDictionary.getReferenceLength
    ref.close()
    refLenght > minScatterGenomeSize
  })

Sander Bollen's avatar
Sander Bollen committed
59
  def sampleInfo: Map[String, Map[String, Any]] = root match {
Sander Bollen's avatar
Sander Bollen committed
60
    case m: MultiSampleQScript => m.samples.map { case (sampleId, sample) => sampleId -> sample.sampleTags }
61
    case null                  => VcfUtils.getSampleIds(inputVcf).map(x => x -> Map[String, Any]()).toMap
Sander Bollen's avatar
Sander Bollen committed
62 63
    case s: SampleLibraryTag   => s.sampleId.map(x => x -> Map[String, Any]()).toMap
    case _                     => throw new IllegalArgumentException("")
Sander Bollen's avatar
Sander Bollen committed
64 65
  }

66 67
  lazy val gonlVcfFile: Option[File] = config("gonl_vcf")
  lazy val exacVcfFile: Option[File] = config("exac_vcf")
68
  lazy val doVarda: Boolean = config("use_varda", default = false)
69

Sander Bollen's avatar
Sander Bollen committed
70
  def init(): Unit = {
71 72
    require(inputVcf != null, "No Input vcf given")
    inputFiles :+= new InputFile(inputVcf)
Sander Bollen's avatar
Sander Bollen committed
73 74
  }

75
  override def defaults = Map(
Sander Bollen's avatar
Sander Bollen committed
76
    "varianteffectpredictor" -> Map("everything" -> true, "failed" -> 1, "allow_non_variant" -> true)
77
  )
Sander Bollen's avatar
Sander Bollen committed
78 79

  def biopetScript(): Unit = {
80
    val useVcf: File = if (doVarda) {
Peter van 't Hof's avatar
Peter van 't Hof committed
81
      inputGvcf match {
82
        case Some(s) => varda(inputVcf, s)
83 84
        case _       => throw new IllegalArgumentException("You have not specified a GVCF file")
      }
85
    } else inputVcf
86

87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
    if (enableScatter) {
      val outputVcfFiles = BedRecordList.fromReference(referenceFasta())
        .scatter(config("bin_size", default = 50000000))
        .allRecords.map { region =>

          val chunkName = s"${region.chr}-${region.start}-${region.end}"
          val chunkDir = new File(outputDir, "chunk" + File.separator + chunkName)
          chunkDir.mkdirs()
          val bedFile = new File(chunkDir, chunkName + ".bed")
          BedRecordList.fromList(List(region)).writeToFile(bedFile)
          bedFile.deleteOnExit()
          val sv = new SelectVariants(this)
          sv.variant = useVcf
          sv.out = new File(chunkDir, chunkName + ".vcf.gz")
          sv.intervals :+= bedFile
          sv.isIntermediate = true
          add(sv)

          runChunk(sv.out, chunkDir, chunkName)
Peter van 't Hof's avatar
Peter van 't Hof committed
106 107
        }

108 109 110 111 112 113 114
      val cv = new CatVariants(this)
      cv.variant = outputVcfFiles.toList
      cv.outputFile = (gonlVcfFile, exacVcfFile) match {
        case (Some(_), Some(_)) => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.gonl.exac.vcf.gz")
        case (Some(_), _) => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.gonl.vcf.gz")
        case (_, Some(_)) => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.exac.vcf.gz")
        case _ => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.vcf.gz")
115
      }
116 117 118 119 120
      add(cv)
    } else runChunk(useVcf, outputDir, "toucan")

    addSummaryJobs()
  }
121

122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
  def runChunk(file: File, chunkDir: File, chunkName: String): File = {
    val vep = new VariantEffectPredictor(this)
    vep.input = file
    vep.output = new File(chunkDir, chunkName + ".vep.vcf")
    vep.isIntermediate = true
    add(vep)
    addSummarizable(vep, "variant_effect_predictor")

    val normalizer = new VepNormalizer(this)
    normalizer.inputVCF = vep.output
    normalizer.outputVcf = new File(chunkDir, chunkName + ".vep.normalized.vcf.gz")
    normalizer.isIntermediate = true
    add(normalizer)

    var outputFile = normalizer.outputVcf

    gonlVcfFile match {
      case Some(gonlFile) =>
        val vcfWithVcf = new VcfWithVcf(this)
        vcfWithVcf.input = outputFile
        vcfWithVcf.secondaryVcf = gonlFile
        vcfWithVcf.output = swapExt(chunkDir, normalizer.outputVcf, ".vcf.gz", ".gonl.vcf.gz")
        vcfWithVcf.fields ::= ("AF", "AF_gonl", None)
        vcfWithVcf.isIntermediate = true
        add(vcfWithVcf)
        outputFile = vcfWithVcf.output
      case _ =>
Peter van 't Hof's avatar
Peter van 't Hof committed
149
    }
150

151 152 153 154 155 156 157 158 159 160 161 162 163 164
    exacVcfFile match {
      case Some(exacFile) =>
        val vcfWithVcf = new VcfWithVcf(this)
        vcfWithVcf.input = outputFile
        vcfWithVcf.secondaryVcf = exacFile
        vcfWithVcf.output = swapExt(chunkDir, outputFile, ".vcf.gz", ".exac.vcf.gz")
        vcfWithVcf.fields ::= ("AF", "AF_exac", None)
        vcfWithVcf.isIntermediate = true
        add(vcfWithVcf)
        outputFile = vcfWithVcf.output
      case _ =>
    }

    outputFile
Sander Bollen's avatar
Sander Bollen committed
165 166
  }

167
  /**
Peter van 't Hof's avatar
Peter van 't Hof committed
168 169 170 171 172 173 174 175
   * Performs the varda import and activate for one sample
   *
   * @param sampleID the sampleID to be used
   * @param inputVcf the input VCF
   * @param gVCF the gVCF for coverage
   * @param annotation: ManweDownloadAnnotateVcf object of annotated vcf
   * @return
   */
Sander Bollen's avatar
Sander Bollen committed
176
  def importAndActivateSample(sampleID: String, sampleGroups: List[String], inputVcf: File,
177 178
                              gVCF: File, annotation: ManweAnnotateVcf): ManweActivateAfterAnnotImport = {

Sander Bollen's avatar
Sander Bollen committed
179 180
    val minGQ: Int = config("minimum_genome_quality", default = 20, namespace = "manwe")
    val isPublic: Boolean = config("varda_is_public", default = true, namespace = "manwe")
181 182 183 184 185 186 187 188 189

    val bedTrack = new GvcfToBed(this)
    bedTrack.inputVcf = gVCF
    bedTrack.outputBed = swapExt(outputDir, gVCF, ".vcf.gz", s""".$sampleID.bed""")
    bedTrack.minQuality = minGQ
    bedTrack.isIntermediate = true
    bedTrack.sample = Some(sampleID)
    add(bedTrack)

190 191 192 193 194 195
    val mergedBed = new BedtoolsMerge(this)
    mergedBed.input = bedTrack.outputBed
    mergedBed.dist = 5
    mergedBed.output = swapExt(outputDir, bedTrack.outputBed, ".bed", ".merged.bed")
    add(mergedBed)

196
    val bgzippedBed = new Bgzip(this)
197 198
    bgzippedBed.input = List(mergedBed.output)
    bgzippedBed.output = swapExt(outputDir, mergedBed.output, ".bed", ".bed.gz")
199 200 201
    add(bgzippedBed)

    val singleVcf = new BcftoolsView(this)
202 203
    singleVcf.input = inputVcf
    singleVcf.output = swapExt(outputDir, inputVcf, ".vcf.gz", s""".$sampleID.vcf.gz""")
204 205 206 207 208 209 210 211
    singleVcf.samples = List(sampleID)
    singleVcf.minAC = Some(1)
    singleVcf.isIntermediate = true
    add(singleVcf)

    val intersected = new BedtoolsIntersect(this)
    intersected.input = singleVcf.output
    intersected.intersectFile = bgzippedBed.output
212
    intersected.output = swapExt(outputDir, singleVcf.output, ".vcf.gz", ".intersected.vcf")
213 214
    add(intersected)

215 216 217 218 219
    val bgzippedIntersect = new Bgzip(this)
    bgzippedIntersect.input = List(intersected.output)
    bgzippedIntersect.output = swapExt(outputDir, intersected.output, ".vcf", ".vcf.gz")
    add(bgzippedIntersect)

220
    val imported = new ManweSamplesImport(this)
221
    imported.vcfs = List(bgzippedIntersect.output)
222 223 224
    imported.beds = List(bgzippedBed.output)
    imported.name = Some(sampleID)
    imported.public = isPublic
Sander Bollen's avatar
Sander Bollen committed
225
    imported.group = sampleGroups
226 227 228 229 230 231 232 233 234 235 236 237
    imported.waitToComplete = false
    imported.isIntermediate = true
    imported.output = swapExt(outputDir, intersected.output, ".vcf.gz", ".manwe.import")
    add(imported)

    val active = new ManweActivateAfterAnnotImport(this, annotation, imported)
    active.output = swapExt(outputDir, imported.output, ".import", ".activated")
    add(active)
    active

  }

Sander Bollen's avatar
Sander Bollen committed
238
  /**
Peter van 't Hof's avatar
Peter van 't Hof committed
239 240 241 242 243 244
   * Perform varda analysis
   *
   * @param vcf input vcf
   * @param gVcf The gVCF to be used for coverage calculations
   * @return return vcf
   */
245
  def varda(vcf: File, gVcf: File): File = {
246

Sander Bollen's avatar
Sander Bollen committed
247
    val annotationQueries: List[String] = config("annotation_queries", default = List("GLOBAL *"), namespace = "manwe")
Sander Bollen's avatar
Sander Bollen committed
248 249

    val annotate = new ManweAnnotateVcf(this)
250
    annotate.vcf = vcf
251 252 253
    if (annotationQueries.nonEmpty) {
      annotate.queries = annotationQueries
    }
Sander Bollen's avatar
Sander Bollen committed
254
    annotate.waitToComplete = true
255 256
    annotate.output = swapExt(outputDir, vcf, ".vcf.gz", ".manwe.annot")
    annotate.isIntermediate = true
Sander Bollen's avatar
Sander Bollen committed
257 258 259
    add(annotate)

    val annotatedVcf = new ManweDownloadAfterAnnotate(this, annotate)
260
    annotatedVcf.output = swapExt(outputDir, annotate.output, ".manwe.annot", "manwe.annot.vcf.gz")
261
    add(annotatedVcf)
Sander Bollen's avatar
Sander Bollen committed
262

Sander Bollen's avatar
Sander Bollen committed
263 264 265 266 267 268 269 270
    val activates = sampleInfo map { x =>
      val sampleGroup = x._2.getOrElse("varda_group", Nil) match {
        case x: List[String] => x
        case Nil             => Nil
        case _               => throw new IllegalArgumentException("Sample tag 'varda_group' is not a list of strings")
      }
      importAndActivateSample(x._1, sampleGroup, vcf, gVcf, annotate)
    }
Sander Bollen's avatar
Sander Bollen committed
271 272

    val finalLn = new Ln(this)
Sander Bollen's avatar
Sander Bollen committed
273
    activates.foreach(x => finalLn.deps :+= x.output)
Sander Bollen's avatar
Sander Bollen committed
274
    finalLn.input = annotatedVcf.output
275
    finalLn.output = swapExt(outputDir, annotatedVcf.output, "manwe.annot.vcf.gz", ".varda_annotated.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
276
    finalLn.relative = true
Sander Bollen's avatar
Sander Bollen committed
277
    add(finalLn)
Sander Bollen's avatar
Sander Bollen committed
278 279 280 281

    finalLn.output
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
282 283 284 285 286
  def summaryFile = new File(outputDir, "Toucan.summary.json")

  def summaryFiles = Map()

  def summarySettings = Map()
Sander Bollen's avatar
Sander Bollen committed
287 288 289
}

object Toucan extends PipelineCommand