Toucan.scala 10.2 KB
Newer Older
bow's avatar
bow committed
1
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
2 3 4 5 6 7 8 9 10 11 12 13 14 15
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Sander Bollen's avatar
Sander Bollen committed
16 17
package nl.lumc.sasc.biopet.pipelines.toucan

18 19
import java.io.File

Peter van 't Hof's avatar
Peter van 't Hof committed
20 21
import nl.lumc.sasc.biopet.core._
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
Sander Bollen's avatar
Sander Bollen committed
22
import nl.lumc.sasc.biopet.extensions.bcftools.BcftoolsView
Peter van 't Hof's avatar
Peter van 't Hof committed
23 24 25 26 27
import nl.lumc.sasc.biopet.extensions.bedtools.{ BedtoolsIntersect, BedtoolsMerge }
import nl.lumc.sasc.biopet.extensions.gatk.{ CatVariants, SelectVariants }
import nl.lumc.sasc.biopet.extensions.manwe.{ ManweAnnotateVcf, ManweSamplesImport }
import nl.lumc.sasc.biopet.extensions.tools.{ GvcfToBed, VcfWithVcf, VepNormalizer }
import nl.lumc.sasc.biopet.extensions.{ Bgzip, Ln, VariantEffectPredictor }
Peter van 't Hof's avatar
Peter van 't Hof committed
28
import nl.lumc.sasc.biopet.utils.VcfUtils
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import nl.lumc.sasc.biopet.utils.config.Configurable
30
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
Sander Bollen's avatar
Sander Bollen committed
31 32 33
import org.broadinstitute.gatk.queue.QScript

/**
Peter van 't Hof's avatar
Peter van 't Hof committed
34 35 36 37
 * Pipeline to annotate a vcf file with VEP
 *
 * Created by ahbbollen on 15-1-15.
 */
Sander Bollen's avatar
Sander Bollen committed
38
class Toucan(val root: Configurable) extends QScript with BiopetQScript with SummaryQScript with Reference {
Sander Bollen's avatar
Sander Bollen committed
39 40 41
  def this() = this(null)

  @Input(doc = "Input VCF file", shortName = "Input", required = true)
42
  var inputVcf: File = _
Sander Bollen's avatar
Sander Bollen committed
43

Peter van 't Hof's avatar
Peter van 't Hof committed
44
  @Input(doc = "Input GVCF file", shortName = "gvcf", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
45 46
  var inputGvcf: Option[File] = None

47 48
  var outputVcf: Option[File] = None

Sander Bollen's avatar
Sander Bollen committed
49
  def sampleInfo: Map[String, Map[String, Any]] = root match {
Sander Bollen's avatar
Sander Bollen committed
50
    case m: MultiSampleQScript => m.samples.map { case (sampleId, sample) => sampleId -> sample.sampleTags }
51
    case null                  => VcfUtils.getSampleIds(inputVcf).map(x => x -> Map[String, Any]()).toMap
Sander Bollen's avatar
Sander Bollen committed
52 53
    case s: SampleLibraryTag   => s.sampleId.map(x => x -> Map[String, Any]()).toMap
    case _                     => throw new IllegalArgumentException("")
Sander Bollen's avatar
Sander Bollen committed
54 55
  }

56 57
  lazy val gonlVcfFile: Option[File] = config("gonl_vcf")
  lazy val exacVcfFile: Option[File] = config("exac_vcf")
58
  lazy val doVarda: Boolean = config("use_varda", default = false)
59

Sander Bollen's avatar
Sander Bollen committed
60
  def init(): Unit = {
61 62
    require(inputVcf != null, "No Input vcf given")
    inputFiles :+= new InputFile(inputVcf)
Sander Bollen's avatar
Sander Bollen committed
63 64
  }

65
  override def defaults = Map(
Sander Bollen's avatar
Sander Bollen committed
66
    "varianteffectpredictor" -> Map("everything" -> true, "failed" -> 1, "allow_non_variant" -> true)
67
  )
Sander Bollen's avatar
Sander Bollen committed
68 69

  def biopetScript(): Unit = {
70
    val useVcf: File = if (doVarda) {
Peter van 't Hof's avatar
Peter van 't Hof committed
71
      inputGvcf match {
72
        case Some(s) => varda(inputVcf, s)
73 74
        case _       => throw new IllegalArgumentException("You have not specified a GVCF file")
      }
75
    } else inputVcf
76 77 78 79 80

    val outputVcfFiles = BedRecordList.fromReference(referenceFasta())
      .scatter(config("bin_size", default = 50000000))
      .allRecords.map { region =>

Peter van 't Hof's avatar
Peter van 't Hof committed
81 82
        val chunkName = s"${region.chr}-${region.start}-${region.end}"
        val chunkDir = new File(outputDir, "chunk" + File.separator + chunkName)
Peter van 't Hof's avatar
Peter van 't Hof committed
83
        chunkDir.mkdirs()
Peter van 't Hof's avatar
Peter van 't Hof committed
84 85 86
        val bedFile = new File(chunkDir, chunkName + ".bed")
        BedRecordList.fromList(List(region)).writeToFile(bedFile)
        bedFile.deleteOnExit()
Peter van 't Hof's avatar
Peter van 't Hof committed
87
        val sv = new SelectVariants(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
88 89
        sv.variant = useVcf
        sv.out = new File(chunkDir, chunkName + ".vcf.gz")
Peter van 't Hof's avatar
Peter van 't Hof committed
90
        sv.intervals :+= bedFile
Peter van 't Hof's avatar
Peter van 't Hof committed
91 92 93 94
        sv.isIntermediate = true
        add(sv)

        val vep = new VariantEffectPredictor(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
95
        vep.input = sv.out
Peter van 't Hof's avatar
Peter van 't Hof committed
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
        vep.output = new File(chunkDir, chunkName + ".vep.vcf")
        vep.isIntermediate = true
        add(vep)
        addSummarizable(vep, "variant_effect_predictor")

        val normalizer = new VepNormalizer(this)
        normalizer.inputVCF = vep.output
        normalizer.outputVcf = new File(chunkDir, chunkName + ".normalized.vcf.gz")
        normalizer.isIntermediate = true
        add(normalizer)

        var outputFile = normalizer.outputVcf

        gonlVcfFile match {
          case Some(gonlFile) =>
            val vcfWithVcf = new VcfWithVcf(this)
            vcfWithVcf.input = outputFile
            vcfWithVcf.secondaryVcf = gonlFile
            vcfWithVcf.output = swapExt(chunkDir, normalizer.outputVcf, ".vcf.gz", ".gonl.vcf.gz")
            vcfWithVcf.fields ::= ("AF", "AF_gonl", None)
            vcfWithVcf.isIntermediate = true
            add(vcfWithVcf)
            outputFile = vcfWithVcf.output
          case _ =>
        }

        exacVcfFile match {
          case Some(exacFile) =>
            val vcfWithVcf = new VcfWithVcf(this)
            vcfWithVcf.input = outputFile
            vcfWithVcf.secondaryVcf = exacFile
            vcfWithVcf.output = swapExt(chunkDir, outputFile, ".vcf.gz", ".exac.vcf.gz")
            vcfWithVcf.fields ::= ("AF", "AF_exac", None)
            vcfWithVcf.isIntermediate = true
            add(vcfWithVcf)
            outputFile = vcfWithVcf.output
          case _ =>
        }

        outputFile
136 137
      }

138
    val cv = new CatVariants(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
139
    cv.variant = outputVcfFiles.toList
140
    cv.outputFile = (gonlVcfFile, exacVcfFile) match {
141 142 143 144
      case (Some(_), Some(_)) => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.gonl.exac.vcf.gz")
      case (Some(_), _)       => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.gonl.vcf.gz")
      case (_, Some(_))       => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.exac.vcf.gz")
      case _                  => swapExt(outputDir, inputVcf, ".vcf.gz", ".vep.normalized.vcf.gz")
Peter van 't Hof's avatar
Peter van 't Hof committed
145
    }
146
    add(cv)
147 148

    addSummaryJobs()
Sander Bollen's avatar
Sander Bollen committed
149 150
  }

151
  /**
Peter van 't Hof's avatar
Peter van 't Hof committed
152 153 154 155 156 157 158 159
   * Performs the varda import and activate for one sample
   *
   * @param sampleID the sampleID to be used
   * @param inputVcf the input VCF
   * @param gVCF the gVCF for coverage
   * @param annotation: ManweDownloadAnnotateVcf object of annotated vcf
   * @return
   */
Sander Bollen's avatar
Sander Bollen committed
160
  def importAndActivateSample(sampleID: String, sampleGroups: List[String], inputVcf: File,
161 162
                              gVCF: File, annotation: ManweAnnotateVcf): ManweActivateAfterAnnotImport = {

Sander Bollen's avatar
Sander Bollen committed
163 164
    val minGQ: Int = config("minimum_genome_quality", default = 20, namespace = "manwe")
    val isPublic: Boolean = config("varda_is_public", default = true, namespace = "manwe")
165 166 167 168 169 170 171 172 173

    val bedTrack = new GvcfToBed(this)
    bedTrack.inputVcf = gVCF
    bedTrack.outputBed = swapExt(outputDir, gVCF, ".vcf.gz", s""".$sampleID.bed""")
    bedTrack.minQuality = minGQ
    bedTrack.isIntermediate = true
    bedTrack.sample = Some(sampleID)
    add(bedTrack)

174 175 176 177 178 179
    val mergedBed = new BedtoolsMerge(this)
    mergedBed.input = bedTrack.outputBed
    mergedBed.dist = 5
    mergedBed.output = swapExt(outputDir, bedTrack.outputBed, ".bed", ".merged.bed")
    add(mergedBed)

180
    val bgzippedBed = new Bgzip(this)
181 182
    bgzippedBed.input = List(mergedBed.output)
    bgzippedBed.output = swapExt(outputDir, mergedBed.output, ".bed", ".bed.gz")
183 184 185
    add(bgzippedBed)

    val singleVcf = new BcftoolsView(this)
186 187
    singleVcf.input = inputVcf
    singleVcf.output = swapExt(outputDir, inputVcf, ".vcf.gz", s""".$sampleID.vcf.gz""")
188 189 190 191 192 193 194 195
    singleVcf.samples = List(sampleID)
    singleVcf.minAC = Some(1)
    singleVcf.isIntermediate = true
    add(singleVcf)

    val intersected = new BedtoolsIntersect(this)
    intersected.input = singleVcf.output
    intersected.intersectFile = bgzippedBed.output
196
    intersected.output = swapExt(outputDir, singleVcf.output, ".vcf.gz", ".intersected.vcf")
197 198
    add(intersected)

199 200 201 202 203
    val bgzippedIntersect = new Bgzip(this)
    bgzippedIntersect.input = List(intersected.output)
    bgzippedIntersect.output = swapExt(outputDir, intersected.output, ".vcf", ".vcf.gz")
    add(bgzippedIntersect)

204
    val imported = new ManweSamplesImport(this)
205
    imported.vcfs = List(bgzippedIntersect.output)
206 207 208
    imported.beds = List(bgzippedBed.output)
    imported.name = Some(sampleID)
    imported.public = isPublic
Sander Bollen's avatar
Sander Bollen committed
209
    imported.group = sampleGroups
210 211 212 213 214 215 216 217 218 219 220 221
    imported.waitToComplete = false
    imported.isIntermediate = true
    imported.output = swapExt(outputDir, intersected.output, ".vcf.gz", ".manwe.import")
    add(imported)

    val active = new ManweActivateAfterAnnotImport(this, annotation, imported)
    active.output = swapExt(outputDir, imported.output, ".import", ".activated")
    add(active)
    active

  }

Sander Bollen's avatar
Sander Bollen committed
222
  /**
Peter van 't Hof's avatar
Peter van 't Hof committed
223 224 225 226 227 228
   * Perform varda analysis
   *
   * @param vcf input vcf
   * @param gVcf The gVCF to be used for coverage calculations
   * @return return vcf
   */
229
  def varda(vcf: File, gVcf: File): File = {
230

Sander Bollen's avatar
Sander Bollen committed
231
    val annotationQueries: List[String] = config("annotation_queries", default = List("GLOBAL *"), namespace = "manwe")
Sander Bollen's avatar
Sander Bollen committed
232 233

    val annotate = new ManweAnnotateVcf(this)
234
    annotate.vcf = vcf
235 236 237
    if (annotationQueries.nonEmpty) {
      annotate.queries = annotationQueries
    }
Sander Bollen's avatar
Sander Bollen committed
238
    annotate.waitToComplete = true
239 240
    annotate.output = swapExt(outputDir, vcf, ".vcf.gz", ".manwe.annot")
    annotate.isIntermediate = true
Sander Bollen's avatar
Sander Bollen committed
241 242 243
    add(annotate)

    val annotatedVcf = new ManweDownloadAfterAnnotate(this, annotate)
244
    annotatedVcf.output = swapExt(outputDir, annotate.output, ".manwe.annot", "manwe.annot.vcf.gz")
245
    add(annotatedVcf)
Sander Bollen's avatar
Sander Bollen committed
246

Sander Bollen's avatar
Sander Bollen committed
247 248 249 250 251 252 253 254
    val activates = sampleInfo map { x =>
      val sampleGroup = x._2.getOrElse("varda_group", Nil) match {
        case x: List[String] => x
        case Nil             => Nil
        case _               => throw new IllegalArgumentException("Sample tag 'varda_group' is not a list of strings")
      }
      importAndActivateSample(x._1, sampleGroup, vcf, gVcf, annotate)
    }
Sander Bollen's avatar
Sander Bollen committed
255 256

    val finalLn = new Ln(this)
Sander Bollen's avatar
Sander Bollen committed
257
    activates.foreach(x => finalLn.deps :+= x.output)
Sander Bollen's avatar
Sander Bollen committed
258
    finalLn.input = annotatedVcf.output
259
    finalLn.output = swapExt(outputDir, annotatedVcf.output, "manwe.annot.vcf.gz", ".varda_annotated.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
260
    finalLn.relative = true
Sander Bollen's avatar
Sander Bollen committed
261
    add(finalLn)
Sander Bollen's avatar
Sander Bollen committed
262 263 264 265

    finalLn.output
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
266 267 268 269 270
  def summaryFile = new File(outputDir, "Toucan.summary.json")

  def summaryFiles = Map()

  def summarySettings = Map()
Sander Bollen's avatar
Sander Bollen committed
271 272 273
}

object Toucan extends PipelineCommand