BastyGenerateFasta.scala 10.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
16 17
package nl.lumc.sasc.biopet.tools

Peter van 't Hof's avatar
Peter van 't Hof committed
18 19
import java.io.{ File, PrintWriter }

20
import htsjdk.samtools.SamReaderFactory
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import htsjdk.samtools.reference.IndexedFastaSequenceFile
22 23
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.vcf.VCFFileReader
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.utils.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
25 26
import nl.lumc.sasc.biopet.utils.VcfUtils._

27
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
28
import scala.collection.mutable.ListBuffer
29 30 31 32

object BastyGenerateFasta extends ToolCommand {
  case class Args(inputVcf: File = null,
                  outputVariants: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
33 34
                  outputConsensus: File = null,
                  outputConsensusVariants: File = null,
35 36 37
                  bamFile: File = null,
                  snpsOnly: Boolean = false,
                  sampleName: String = null,
38
                  outputName: String = null,
39
                  minAD: Int = 8,
Peter van 't Hof's avatar
Peter van 't Hof committed
40 41
                  minDepth: Int = 8,
                  reference: File = null) extends AbstractArgs
42 43

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
44
    opt[File]('V', "inputVcf") unbounded () valueName "<file>" action { (x, c) =>
45
      c.copy(inputVcf = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
46
    } text "vcf file, needed for outputVariants and outputConsensusVariants" validate { x =>
Peter van 't Hof's avatar
Peter van 't Hof committed
47 48
      if (x.exists) success else failure("File does not exist: " + x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
49
    opt[File]("bamFile") unbounded () valueName "<file>" action { (x, c) =>
50
      c.copy(bamFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
51
    } text "bam file, needed for outputConsensus and outputConsensusVariants" validate { x =>
Peter van 't Hof's avatar
Peter van 't Hof committed
52 53
      if (x.exists) success else failure("File does not exist: " + x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
54
    opt[File]("outputVariants") maxOccurs 1 unbounded () valueName "<file>" action { (x, c) =>
55
      c.copy(outputVariants = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
56 57
    } text "fasta with only variants from vcf file"
    opt[File]("outputConsensus") maxOccurs 1 unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
58
      c.copy(outputConsensus = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
59 60
    } text "Consensus fasta from bam, always reference bases else 'N'"
    opt[File]("outputConsensusVariants") maxOccurs 1 unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
61
      c.copy(outputConsensusVariants = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
62
    } text "Consensus fasta from bam with variants from vcf file, always reference bases else 'N'"
63 64
    opt[Unit]("snpsOnly") unbounded () action { (x, c) =>
      c.copy(snpsOnly = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
65
    } text "Only use snps from vcf file"
66 67
    opt[String]("sampleName") unbounded () action { (x, c) =>
      c.copy(sampleName = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
68
    } text "Sample name in vcf file"
69 70
    opt[String]("outputName") required () unbounded () action { (x, c) =>
      c.copy(outputName = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
71
    } text "Output name in fasta file header"
72 73
    opt[Int]("minAD") unbounded () action { (x, c) =>
      c.copy(minAD = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
74
    } text "min AD value in vcf file for sample. Defaults to: 8"
Peter van 't Hof's avatar
Peter van 't Hof committed
75 76
    opt[Int]("minDepth") unbounded () action { (x, c) =>
      c.copy(minDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
77
    } text "min depth in bam file. Defaults to: 8"
Peter van 't Hof's avatar
Peter van 't Hof committed
78 79
    opt[File]("reference") unbounded () action { (x, c) =>
      c.copy(reference = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
80
    } text "Indexed reference fasta file" validate { x =>
Peter van 't Hof's avatar
Peter van 't Hof committed
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
      if (x.exists) success else failure("File does not exist: " + x)
    }

    checkConfig { c =>
      {
        val err: ListBuffer[String] = ListBuffer()
        if (c.outputConsensus != null || c.outputConsensusVariants != null) {
          if (c.reference == null)
            err.add("No reference suplied")
          else {
            val index = new File(c.reference.getAbsolutePath + ".fai")
            if (!index.exists) err.add("Reference does not have index")
          }
          if (c.outputConsensusVariants != null && c.inputVcf == null)
            err.add("To write outputVariants input vcf is required, please use --inputVcf option")
          if (c.sampleName != null && c.bamFile == null)
            err.add("To write Consensus input bam file is required, please use --bamFile option")
        }
        if (c.outputVariants != null && c.inputVcf == null)
          err.add("To write outputVariants input vcf is required, please use --inputVcf option")
        if (c.outputVariants == null && c.outputConsensus == null && c.outputConsensusVariants == null)
          err.add("No output file selected")
        if (err.isEmpty) success else failure(err.mkString("", "\nError: ", "\n"))
      }
    }
106 107
  }

Sander Bollen's avatar
Sander Bollen committed
108
  protected implicit var cmdArgs: Args = _
Peter van 't Hof's avatar
Peter van 't Hof committed
109
  private val chunkSize = 100000
110 111 112 113 114 115

  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
116
    cmdArgs = argsParser.parse(args, Args()) getOrElse(throw new IllegalArgumentException)
Peter van 't Hof's avatar
Peter van 't Hof committed
117

118 119 120 121 122 123
    if (cmdArgs.outputVariants != null) {
      writeVariantsOnly()
    }
    if (cmdArgs.outputConsensus != null || cmdArgs.outputConsensusVariants != null) {
      writeConsensus()
    }
124 125

    //FIXME: what to do if outputcConsensus is set, but not outputConsensusVariants (and vice versa)?
Peter van 't Hof's avatar
Peter van 't Hof committed
126
  }
127

Peter van 't Hof's avatar
Peter van 't Hof committed
128
  protected def writeConsensus() {
129
    //FIXME: preferably split this up in functions, so that they can be unit tested
Peter van 't Hof's avatar
Peter van 't Hof committed
130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
    val referenceFile = new IndexedFastaSequenceFile(cmdArgs.reference)
    val referenceDict = referenceFile.getSequenceDictionary

    for (chr <- referenceDict.getSequences) {
      val chunks = (for (chunk <- (0 to (chr.getSequenceLength / chunkSize)).par) yield {
        val chrName = chr.getSequenceName
        val begin = chunk * chunkSize + 1
        val end = {
          val e = (chunk + 1) * chunkSize
          if (e > chr.getSequenceLength) chr.getSequenceLength else e
        }

        logger.info("begin on: chrName: " + chrName + "  begin: " + begin + "  end: " + end)

        val referenceSequence = referenceFile.getSubsequenceAt(chrName, begin, end)

        val variants: Map[(Int, Int), VariantContext] = if (cmdArgs.inputVcf != null) {
          val reader = new VCFFileReader(cmdArgs.inputVcf, true)
Peter van 't Hof's avatar
Peter van 't Hof committed
148
          (for (variant <- reader.query(chrName, begin, end) if !cmdArgs.snpsOnly || variant.isSNP) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
149 150 151 152 153 154 155 156 157 158 159 160
            (variant.getStart, variant.getEnd) -> variant
          }).toMap
        } else Map()

        val coverage: Array[Int] = Array.fill(end - begin + 1)(0)
        if (cmdArgs.bamFile != null) {
          val inputSam = SamReaderFactory.makeDefault.open(cmdArgs.bamFile)
          for (r <- inputSam.query(chr.getSequenceName, begin, end, false)) {
            val s = if (r.getAlignmentStart < begin) begin else r.getAlignmentStart
            val e = if (r.getAlignmentEnd > end) end else r.getAlignmentEnd
            for (t <- s to e) coverage(t - begin) += 1
          }
161
        } else {
Peter van 't Hof's avatar
Peter van 't Hof committed
162
          for (t <- coverage.indices) coverage(t) = cmdArgs.minDepth
Peter van 't Hof's avatar
Peter van 't Hof committed
163 164
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
165
        val consensus = for (t <- coverage.indices) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
166 167 168
          if (coverage(t) >= cmdArgs.minDepth) referenceSequence.getBases()(t).toChar
          else 'N'
        }
169 170 171 172 173 174 175 176 177 178 179 180 181

        val buffer: StringBuilder = new StringBuilder()
        if (cmdArgs.outputConsensusVariants != null) {
          var consensusPos = 0
          while (consensusPos < consensus.size) {
            val genomePos = consensusPos + begin
            val variant = variants.find(a => a._1._1 >= genomePos && a._1._2 <= genomePos)
            if (variant.isDefined) {
              logger.info(variant.get._2)
              val stripPrefix = if (variant.get._1._1 < begin) begin - variant.get._1._1 else 0
              val stripSufix = if (variant.get._1._2 > end) variant.get._1._2 - end else 0
              val allele = getMaxAllele(variant.get._2)
              consensusPos += variant.get._2.getReference.getBases.length
Peter van 't Hof's avatar
Peter van 't Hof committed
182
              buffer.append(allele.substring(stripPrefix, allele.length - stripSufix))
183 184 185 186 187 188 189
            } else {
              buffer.append(consensus(consensusPos))
              consensusPos += 1
            }
          }
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
190
        chunk -> (consensus.mkString.toUpperCase, buffer.toString().toUpperCase)
Peter van 't Hof's avatar
Peter van 't Hof committed
191
      }).toMap
192 193 194 195 196 197
      if (cmdArgs.outputConsensus != null) {
        val writer = new PrintWriter(cmdArgs.outputConsensus)
        writer.println(">" + cmdArgs.outputName)
        for (c <- chunks.keySet.toList.sortWith(_ < _)) {
          writer.print(chunks(c)._1)
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
198
        writer.println()
Peter van 't Hof's avatar
Peter van 't Hof committed
199
        writer.close()
200 201 202 203 204 205 206
      }
      if (cmdArgs.outputConsensusVariants != null) {
        val writer = new PrintWriter(cmdArgs.outputConsensusVariants)
        writer.println(">" + cmdArgs.outputName)
        for (c <- chunks.keySet.toList.sortWith(_ < _)) {
          writer.print(chunks(c)._2)
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
207
        writer.println()
Peter van 't Hof's avatar
Peter van 't Hof committed
208
        writer.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
209 210
      }
    }
211 212
  }

Sander Bollen's avatar
Sander Bollen committed
213
  protected[tools] def writeVariantsOnly() {
Peter van 't Hof's avatar
Peter van 't Hof committed
214
    val writer = new PrintWriter(cmdArgs.outputVariants)
Peter van 't Hof's avatar
Peter van 't Hof committed
215
    writer.println(">" + cmdArgs.outputName)
Peter van 't Hof's avatar
Peter van 't Hof committed
216
    val vcfReader = new VCFFileReader(cmdArgs.inputVcf, false)
Peter van 't Hof's avatar
Peter van 't Hof committed
217
    for (vcfRecord <- vcfReader if !cmdArgs.snpsOnly || vcfRecord.isSNP) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
218
      writer.print(getMaxAllele(vcfRecord))
219 220
    }
    writer.println()
Peter van 't Hof's avatar
Peter van 't Hof committed
221 222
    writer.close()
    vcfReader.close()
223 224
  }

Sander Bollen's avatar
Sander Bollen committed
225 226 227 228
  // TODO: what does this do?
  // Seems to me it finds the allele in a sample with the highest AD value
  // if this allele is shorter than the largest allele, it will append '-' to the string
  protected[tools] def getMaxAllele(vcfRecord: VariantContext)(implicit cmdArgs: Args): String = {
Peter van 't Hof's avatar
Peter van 't Hof committed
229
    val maxSize = getLongestAllele(vcfRecord).getBases.length
230

231 232 233
    if (cmdArgs.sampleName == null) {
      return fillAllele(vcfRecord.getReference.getBaseString, maxSize)
    }
234

Peter van 't Hof's avatar
Peter van 't Hof committed
235
    val genotype = vcfRecord.getGenotype(cmdArgs.sampleName)
236 237 238 239 240

    if (genotype == null) {
      return fillAllele("", maxSize)
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
241
    val AD = if (genotype.hasAD) genotype.getAD else Array.fill(vcfRecord.getAlleles.size())(cmdArgs.minAD)
242 243 244 245 246

    if (AD == null) {
      return fillAllele("", maxSize)
    }

247
    val maxADid = AD.zipWithIndex.maxBy(_._1)._2
248 249 250 251 252

    if (AD(maxADid) < cmdArgs.minAD) {
      return fillAllele("", maxSize)
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
253
    fillAllele(vcfRecord.getAlleles()(maxADid).getBaseString, maxSize)
254
  }
255
}