MergeAlleles.scala 5.54 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16 17 18
package nl.lumc.sasc.biopet.tools

import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
19 20 21 22 23

import htsjdk.samtools.reference.FastaSequenceFile
import htsjdk.variant.variantcontext.{ Allele, VariantContext, VariantContextBuilder }
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.vcf.{ VCFFileReader, VCFHeader }
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.core.config.Configurable
Peter van 't Hof's avatar
Peter van 't Hof committed
25
import nl.lumc.sasc.biopet.core.{ ToolCommand, ToolCommandFuntion }
Peter van 't Hof's avatar
Peter van 't Hof committed
26
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
Peter van 't Hof's avatar
Peter van 't Hof committed
27

Peter van 't Hof's avatar
Peter van 't Hof committed
28
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import scala.collection.{ mutable, SortedMap }
Peter van 't Hof's avatar
Peter van 't Hof committed
30 31
import scala.collection.mutable.{ Map, Set }

32
class MergeAlleles(val root: Configurable) extends ToolCommandFuntion {
Peter van 't Hof's avatar
Peter van 't Hof committed
33 34 35 36 37
  javaMainClass = getClass.getName

  @Input(doc = "Input vcf files", shortName = "input", required = true)
  var input: List[File] = Nil

Peter van 't Hof's avatar
Peter van 't Hof committed
38
  @Output(doc = "Output vcf file", shortName = "output", required = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
39 40
  var output: File = _

41 42 43
  @Output(doc = "Output vcf file index", shortName = "output", required = true)
  private var outputIndex: File = _

Peter van 't Hof's avatar
Peter van 't Hof committed
44 45
  var reference: File = config("reference")

Peter van 't Hof's avatar
Peter van 't Hof committed
46
  override def defaultCoreMemory = 1.0
Peter van 't Hof's avatar
Peter van 't Hof committed
47

Peter van 't Hof's avatar
Peter van 't Hof committed
48 49
  override def beforeGraph() {
    super.beforeGraph()
50 51 52 53
    if (output.getName.endsWith(".gz")) outputIndex = new File(output.getAbsolutePath + ".tbi")
    if (output.getName.endsWith(".vcf")) outputIndex = new File(output.getAbsolutePath + ".idx")
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
54 55 56 57 58
  override def commandLine = super.commandLine +
    repeat("-I", input) +
    required("-o", output) +
    required("-R", reference)
}
Peter van 't Hof's avatar
Peter van 't Hof committed
59 60

object MergeAlleles extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
61 62 63 64
  def apply(root: Configurable, input: List[File], output: File): MergeAlleles = {
    val mergeAlleles = new MergeAlleles(root)
    mergeAlleles.input = input
    mergeAlleles.output = output
Peter van 't Hof's avatar
Peter van 't Hof committed
65
    mergeAlleles
Peter van 't Hof's avatar
Peter van 't Hof committed
66 67
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
68 69 70
  case class Args(inputFiles: List[File] = Nil, outputFile: File = null, reference: File = null) extends AbstractArgs

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
71
    opt[File]('I', "inputVcf") minOccurs 2 required () unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
72 73
      c.copy(inputFiles = x :: c.inputFiles)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
74
    opt[File]('o', "outputVcf") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
75 76
      c.copy(outputFile = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
77
    opt[File]('R', "reference") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
78 79 80 81 82 83 84 85 86 87 88 89 90
      c.copy(reference = x)
    }
  }

  private val chunkSize = 50000

  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

Peter van 't Hof's avatar
Peter van 't Hof committed
91
    val readers = commandArgs.inputFiles.map(new VCFFileReader(_, true))
Peter van 't Hof's avatar
Peter van 't Hof committed
92
    val referenceFile = new FastaSequenceFile(commandArgs.reference, true)
Sander Bollen's avatar
Sander Bollen committed
93 94 95 96
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
      setReferenceDictionary(referenceFile.getSequenceDictionary).
      setOutputFile(commandArgs.outputFile).
      build)
Peter van 't Hof's avatar
Peter van 't Hof committed
97 98 99 100 101
    val header = new VCFHeader
    val referenceDict = referenceFile.getSequenceDictionary
    header.setSequenceDictionary(referenceDict)
    writer.writeHeader(header)

Peter van 't Hof's avatar
Peter van 't Hof committed
102 103
    for (chr <- referenceDict.getSequences; chunk <- 0 to (chr.getSequenceLength / chunkSize)) {
      val output: mutable.Map[Int, List[VariantContext]] = mutable.Map()
Peter van 't Hof's avatar
Peter van 't Hof committed
104 105 106 107 108 109 110 111

      val chrName = chr.getSequenceName
      val begin = chunk * chunkSize + 1
      val end = {
        val e = (chunk + 1) * chunkSize
        if (e > chr.getSequenceLength) chr.getSequenceLength else e
      }

Peter van 't Hof's avatar
Peter van 't Hof committed
112
      for (reader <- readers; variant <- reader.query(chrName, begin, end)) {
Peter van 't Hof's avatar
Peter van 't Hof committed
113 114 115 116 117
        val start = variant.getStart
        if (output.contains(start)) output += variant.getStart -> (variant :: output(start))
        else output += variant.getStart -> List(variant)
      }

Peter van 't Hof's avatar
Peter van 't Hof committed
118 119
      for ((k, v) <- SortedMap(output.toSeq: _*)) {
        writer.add(mergeAlleles(v))
Peter van 't Hof's avatar
Peter van 't Hof committed
120 121
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
122
    writer.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
123
    readers.foreach(_.close)
Peter van 't Hof's avatar
Peter van 't Hof committed
124 125 126 127 128
  }

  def mergeAlleles(records: List[VariantContext]): VariantContext = {
    val longestRef = {
      var l: Array[Byte] = Array()
Peter van 't Hof's avatar
Peter van 't Hof committed
129
      for (a <- records.map(_.getReference.getBases) if a.length > l.length) l = a
Peter van 't Hof's avatar
Peter van 't Hof committed
130 131
      Allele.create(l, true)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
132
    val alleles: mutable.Set[Allele] = mutable.Set()
Peter van 't Hof's avatar
Peter van 't Hof committed
133
    val builder = new VariantContextBuilder
134
    builder.chr(records.head.getContig)
Peter van 't Hof's avatar
Peter van 't Hof committed
135 136 137 138 139 140 141 142 143
    builder.start(records.head.getStart)

    for (record <- records) {
      if (record.getReference == longestRef) alleles ++= record.getAlternateAlleles
      else {
        val suffix = longestRef.getBaseString.stripPrefix(record.getReference.getBaseString)
        for (r <- record.getAlternateAlleles) alleles += Allele.create(r.getBaseString + suffix)
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
144 145
    builder.alleles(longestRef :: alleles.toList)
    builder.computeEndFromAlleles(longestRef :: alleles.toList, records.head.getStart)
Peter van 't Hof's avatar
Peter van 't Hof committed
146 147 148
    builder.make
  }
}