BastyGenerateFasta.scala 11.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
16 17
package nl.lumc.sasc.biopet.tools

Peter van 't Hof's avatar
Peter van 't Hof committed
18 19
import java.io.{ File, PrintWriter }

Peter van 't Hof's avatar
Peter van 't Hof committed
20 21
import htsjdk.samtools.SamReaderFactory
import htsjdk.samtools.reference.IndexedFastaSequenceFile
22 23 24
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.core.config.Configurable
25
import nl.lumc.sasc.biopet.core.{ Reference, ToolCommand, ToolCommandFuntion }
Peter van 't Hof's avatar
Peter van 't Hof committed
26
import nl.lumc.sasc.biopet.utils.VcfUtils._
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
Peter van 't Hof's avatar
Peter van 't Hof committed
28

29
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
30
import scala.collection.mutable.ListBuffer
31

Peter van 't Hof's avatar
Peter van 't Hof committed
32
class BastyGenerateFasta(val root: Configurable) extends ToolCommandFuntion with Reference {
33 34 35 36 37 38 39 40
  javaMainClass = getClass.getName

  @Input(doc = "Input vcf file", required = false)
  var inputVcf: File = _

  @Input(doc = "Bam File", required = false)
  var bamFile: File = _

Peter van 't Hof's avatar
Peter van 't Hof committed
41
  @Input(doc = "reference", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
42
  var reference: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
43

44 45 46
  @Output(doc = "Output fasta, variants only", required = false)
  var outputVariants: File = _

Peter van 't Hof's avatar
Peter van 't Hof committed
47 48
  @Output(doc = "Output fasta, variants only", required = false)
  var outputConsensus: File = _
49

Peter van 't Hof's avatar
Peter van 't Hof committed
50 51
  @Output(doc = "Output fasta, variants only", required = false)
  var outputConsensusVariants: File = _
52

Peter van 't Hof's avatar
Peter van 't Hof committed
53 54
  var snpsOnly: Boolean = config("snps_only", default = false)
  var sampleName: String = _
55
  var minAD: Int = config("min_ad", default = 8)
Peter van 't Hof's avatar
Peter van 't Hof committed
56 57
  var minDepth: Int = config("min_depth", default = 8)
  var outputName: String = _
58

Peter van 't Hof's avatar
Peter van 't Hof committed
59
  override val defaultCoreMemory = 4.0
60

Peter van 't Hof's avatar
Peter van 't Hof committed
61 62
  override def beforeGraph(): Unit = {
    super.beforeGraph()
Peter van 't Hof's avatar
Peter van 't Hof committed
63 64 65
    reference = referenceFasta()
  }

66 67 68 69
  override def commandLine = super.commandLine +
    optional("--inputVcf", inputVcf) +
    optional("--bamFile", bamFile) +
    optional("--outputVariants", outputVariants) +
Peter van 't Hof's avatar
Peter van 't Hof committed
70 71
    optional("--outputConsensus", outputConsensus) +
    optional("--outputConsensusVariants", outputConsensusVariants) +
72 73
    conditional(snpsOnly, "--snpsOnly") +
    optional("--sampleName", sampleName) +
Peter van 't Hof's avatar
Peter van 't Hof committed
74
    required("--outputName", outputName) +
75
    optional("--minAD", minAD) +
Peter van 't Hof's avatar
Peter van 't Hof committed
76 77
    optional("--minDepth", minDepth) +
    optional("--reference", reference)
78 79 80 81 82
}

object BastyGenerateFasta extends ToolCommand {
  case class Args(inputVcf: File = null,
                  outputVariants: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
83 84
                  outputConsensus: File = null,
                  outputConsensusVariants: File = null,
85 86 87
                  bamFile: File = null,
                  snpsOnly: Boolean = false,
                  sampleName: String = null,
88
                  outputName: String = null,
89
                  minAD: Int = 8,
Peter van 't Hof's avatar
Peter van 't Hof committed
90 91
                  minDepth: Int = 8,
                  reference: File = null) extends AbstractArgs
92 93

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
94
    opt[File]('V', "inputVcf") unbounded () valueName "<file>" action { (x, c) =>
95
      c.copy(inputVcf = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
96
    } text "vcf file, needed for outputVariants and outputConsensusVariants" validate { x =>
Peter van 't Hof's avatar
Peter van 't Hof committed
97 98
      if (x.exists) success else failure("File does not exist: " + x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
99
    opt[File]("bamFile") unbounded () valueName "<file>" action { (x, c) =>
100
      c.copy(bamFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
101
    } text "bam file, needed for outputConsensus and outputConsensusVariants" validate { x =>
Peter van 't Hof's avatar
Peter van 't Hof committed
102 103
      if (x.exists) success else failure("File does not exist: " + x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
104
    opt[File]("outputVariants") maxOccurs 1 unbounded () valueName "<file>" action { (x, c) =>
105
      c.copy(outputVariants = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
106 107
    } text "fasta with only variants from vcf file"
    opt[File]("outputConsensus") maxOccurs 1 unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
108
      c.copy(outputConsensus = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
109 110
    } text "Consensus fasta from bam, always reference bases else 'N'"
    opt[File]("outputConsensusVariants") maxOccurs 1 unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
111
      c.copy(outputConsensusVariants = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
112
    } text "Consensus fasta from bam with variants from vcf file, always reference bases else 'N'"
113 114
    opt[Unit]("snpsOnly") unbounded () action { (x, c) =>
      c.copy(snpsOnly = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
115
    } text "Only use snps from vcf file"
116 117
    opt[String]("sampleName") unbounded () action { (x, c) =>
      c.copy(sampleName = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
118
    } text "Sample name in vcf file"
119 120
    opt[String]("outputName") required () unbounded () action { (x, c) =>
      c.copy(outputName = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
121
    } text "Output name in fasta file header"
122 123
    opt[Int]("minAD") unbounded () action { (x, c) =>
      c.copy(minAD = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
124
    } text "min AD value in vcf file for sample. Defaults to: 8"
Peter van 't Hof's avatar
Peter van 't Hof committed
125 126
    opt[Int]("minDepth") unbounded () action { (x, c) =>
      c.copy(minDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
127
    } text "min depth in bam file. Defaults to: 8"
Peter van 't Hof's avatar
Peter van 't Hof committed
128 129
    opt[File]("reference") unbounded () action { (x, c) =>
      c.copy(reference = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
130
    } text "Indexed reference fasta file" validate { x =>
Peter van 't Hof's avatar
Peter van 't Hof committed
131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
      if (x.exists) success else failure("File does not exist: " + x)
    }

    checkConfig { c =>
      {
        val err: ListBuffer[String] = ListBuffer()
        if (c.outputConsensus != null || c.outputConsensusVariants != null) {
          if (c.reference == null)
            err.add("No reference suplied")
          else {
            val index = new File(c.reference.getAbsolutePath + ".fai")
            if (!index.exists) err.add("Reference does not have index")
          }
          if (c.outputConsensusVariants != null && c.inputVcf == null)
            err.add("To write outputVariants input vcf is required, please use --inputVcf option")
          if (c.sampleName != null && c.bamFile == null)
            err.add("To write Consensus input bam file is required, please use --bamFile option")
        }
        if (c.outputVariants != null && c.inputVcf == null)
          err.add("To write outputVariants input vcf is required, please use --inputVcf option")
        if (c.outputVariants == null && c.outputConsensus == null && c.outputConsensusVariants == null)
          err.add("No output file selected")
        if (err.isEmpty) success else failure(err.mkString("", "\nError: ", "\n"))
      }
    }
156 157
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
158 159
  protected var cmdArgs: Args = _
  private val chunkSize = 100000
160 161 162 163 164 165

  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
166 167 168 169 170
    cmdArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    if (cmdArgs.outputVariants != null) writeVariantsOnly()
    if (cmdArgs.outputConsensus != null || cmdArgs.outputConsensusVariants != null) writeConsensus()
  }
171

Peter van 't Hof's avatar
Peter van 't Hof committed
172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
  protected def writeConsensus() {
    val referenceFile = new IndexedFastaSequenceFile(cmdArgs.reference)
    val referenceDict = referenceFile.getSequenceDictionary

    for (chr <- referenceDict.getSequences) {
      val chunks = (for (chunk <- (0 to (chr.getSequenceLength / chunkSize)).par) yield {
        val chrName = chr.getSequenceName
        val begin = chunk * chunkSize + 1
        val end = {
          val e = (chunk + 1) * chunkSize
          if (e > chr.getSequenceLength) chr.getSequenceLength else e
        }

        logger.info("begin on: chrName: " + chrName + "  begin: " + begin + "  end: " + end)

        val referenceSequence = referenceFile.getSubsequenceAt(chrName, begin, end)

        val variants: Map[(Int, Int), VariantContext] = if (cmdArgs.inputVcf != null) {
          val reader = new VCFFileReader(cmdArgs.inputVcf, true)
Peter van 't Hof's avatar
Peter van 't Hof committed
191
          (for (variant <- reader.query(chrName, begin, end) if !cmdArgs.snpsOnly || variant.isSNP) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
192 193 194 195 196 197 198 199 200 201 202 203
            (variant.getStart, variant.getEnd) -> variant
          }).toMap
        } else Map()

        val coverage: Array[Int] = Array.fill(end - begin + 1)(0)
        if (cmdArgs.bamFile != null) {
          val inputSam = SamReaderFactory.makeDefault.open(cmdArgs.bamFile)
          for (r <- inputSam.query(chr.getSequenceName, begin, end, false)) {
            val s = if (r.getAlignmentStart < begin) begin else r.getAlignmentStart
            val e = if (r.getAlignmentEnd > end) end else r.getAlignmentEnd
            for (t <- s to e) coverage(t - begin) += 1
          }
204
        } else {
Peter van 't Hof's avatar
Peter van 't Hof committed
205
          for (t <- coverage.indices) coverage(t) = cmdArgs.minDepth
Peter van 't Hof's avatar
Peter van 't Hof committed
206 207
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
208
        val consensus = for (t <- coverage.indices) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
209 210 211
          if (coverage(t) >= cmdArgs.minDepth) referenceSequence.getBases()(t).toChar
          else 'N'
        }
212 213 214 215 216 217 218 219 220 221 222 223 224

        val buffer: StringBuilder = new StringBuilder()
        if (cmdArgs.outputConsensusVariants != null) {
          var consensusPos = 0
          while (consensusPos < consensus.size) {
            val genomePos = consensusPos + begin
            val variant = variants.find(a => a._1._1 >= genomePos && a._1._2 <= genomePos)
            if (variant.isDefined) {
              logger.info(variant.get._2)
              val stripPrefix = if (variant.get._1._1 < begin) begin - variant.get._1._1 else 0
              val stripSufix = if (variant.get._1._2 > end) variant.get._1._2 - end else 0
              val allele = getMaxAllele(variant.get._2)
              consensusPos += variant.get._2.getReference.getBases.length
Peter van 't Hof's avatar
Peter van 't Hof committed
225
              buffer.append(allele.substring(stripPrefix, allele.length - stripSufix))
226 227 228 229 230 231 232
            } else {
              buffer.append(consensus(consensusPos))
              consensusPos += 1
            }
          }
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
233
        chunk -> (consensus.mkString.toUpperCase, buffer.toString().toUpperCase)
Peter van 't Hof's avatar
Peter van 't Hof committed
234
      }).toMap
235 236 237 238 239 240
      if (cmdArgs.outputConsensus != null) {
        val writer = new PrintWriter(cmdArgs.outputConsensus)
        writer.println(">" + cmdArgs.outputName)
        for (c <- chunks.keySet.toList.sortWith(_ < _)) {
          writer.print(chunks(c)._1)
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
241
        writer.println()
Peter van 't Hof's avatar
Peter van 't Hof committed
242
        writer.close()
243 244 245 246 247 248 249
      }
      if (cmdArgs.outputConsensusVariants != null) {
        val writer = new PrintWriter(cmdArgs.outputConsensusVariants)
        writer.println(">" + cmdArgs.outputName)
        for (c <- chunks.keySet.toList.sortWith(_ < _)) {
          writer.print(chunks(c)._2)
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
250
        writer.println()
Peter van 't Hof's avatar
Peter van 't Hof committed
251
        writer.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
252 253
      }
    }
254 255
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
256 257
  protected def writeVariantsOnly() {
    val writer = new PrintWriter(cmdArgs.outputVariants)
Peter van 't Hof's avatar
Peter van 't Hof committed
258
    writer.println(">" + cmdArgs.outputName)
Peter van 't Hof's avatar
Peter van 't Hof committed
259
    val vcfReader = new VCFFileReader(cmdArgs.inputVcf, false)
Peter van 't Hof's avatar
Peter van 't Hof committed
260
    for (vcfRecord <- vcfReader if !cmdArgs.snpsOnly || vcfRecord.isSNP) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
261
      writer.print(getMaxAllele(vcfRecord))
262 263
    }
    writer.println()
Peter van 't Hof's avatar
Peter van 't Hof committed
264 265
    writer.close()
    vcfReader.close()
266 267
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
268
  protected def getMaxAllele(vcfRecord: VariantContext): String = {
Peter van 't Hof's avatar
Peter van 't Hof committed
269
    val maxSize = getLongestAllele(vcfRecord).getBases.length
270

Peter van 't Hof's avatar
Peter van 't Hof committed
271
    if (cmdArgs.sampleName == null) return fillAllele(vcfRecord.getReference.getBaseString, maxSize)
272

Peter van 't Hof's avatar
Peter van 't Hof committed
273 274
    val genotype = vcfRecord.getGenotype(cmdArgs.sampleName)
    if (genotype == null) return fillAllele("", maxSize)
Peter van 't Hof's avatar
Peter van 't Hof committed
275
    val AD = if (genotype.hasAD) genotype.getAD else Array.fill(vcfRecord.getAlleles.size())(cmdArgs.minAD)
Peter van 't Hof's avatar
Peter van 't Hof committed
276
    if (AD == null) return fillAllele("", maxSize)
277
    val maxADid = AD.zipWithIndex.maxBy(_._1)._2
Peter van 't Hof's avatar
Peter van 't Hof committed
278
    if (AD(maxADid) < cmdArgs.minAD) return fillAllele("", maxSize)
Peter van 't Hof's avatar
Peter van 't Hof committed
279
    fillAllele(vcfRecord.getAlleles()(maxADid).getBaseString, maxSize)
280 281
  }
}