BastyGenerateFasta.scala 11.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
16
17
package nl.lumc.sasc.biopet.tools

Peter van 't Hof's avatar
Peter van 't Hof committed
18
19
import java.io.{ File, PrintWriter }

Peter van 't Hof's avatar
Peter van 't Hof committed
20
21
import htsjdk.samtools.SamReaderFactory
import htsjdk.samtools.reference.IndexedFastaSequenceFile
22
23
24
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.core.config.Configurable
25
import nl.lumc.sasc.biopet.core.{ Reference, ToolCommand, ToolCommandFuntion }
Peter van 't Hof's avatar
Peter van 't Hof committed
26
import nl.lumc.sasc.biopet.utils.VcfUtils._
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
Peter van 't Hof's avatar
Peter van 't Hof committed
28

29
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
30
import scala.collection.mutable.ListBuffer
31

Peter van 't Hof's avatar
Peter van 't Hof committed
32
class BastyGenerateFasta(val root: Configurable) extends ToolCommandFuntion with Reference {
33
34
35
36
37
38
39
40
  javaMainClass = getClass.getName

  @Input(doc = "Input vcf file", required = false)
  var inputVcf: File = _

  @Input(doc = "Bam File", required = false)
  var bamFile: File = _

Peter van 't Hof's avatar
Peter van 't Hof committed
41
  @Input(doc = "reference", required = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
42
  var reference: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
43

44
45
46
  @Output(doc = "Output fasta, variants only", required = false)
  var outputVariants: File = _

Peter van 't Hof's avatar
Peter van 't Hof committed
47
48
  @Output(doc = "Output fasta, variants only", required = false)
  var outputConsensus: File = _
49

Peter van 't Hof's avatar
Peter van 't Hof committed
50
51
  @Output(doc = "Output fasta, variants only", required = false)
  var outputConsensusVariants: File = _
52

Peter van 't Hof's avatar
Peter van 't Hof committed
53
54
  var snpsOnly: Boolean = config("snps_only", default = false)
  var sampleName: String = _
55
  var minAD: Int = config("min_ad", default = 8)
Peter van 't Hof's avatar
Peter van 't Hof committed
56
57
  var minDepth: Int = config("min_depth", default = 8)
  var outputName: String = _
58

Peter van 't Hof's avatar
Peter van 't Hof committed
59
  override val defaultCoreMemory = 4.0
60

Peter van 't Hof's avatar
Peter van 't Hof committed
61
62
  override def beforeGraph(): Unit = {
    super.beforeGraph()
Peter van 't Hof's avatar
Peter van 't Hof committed
63
64
65
    reference = referenceFasta()
  }

66
67
68
69
  override def commandLine = super.commandLine +
    optional("--inputVcf", inputVcf) +
    optional("--bamFile", bamFile) +
    optional("--outputVariants", outputVariants) +
Peter van 't Hof's avatar
Peter van 't Hof committed
70
71
    optional("--outputConsensus", outputConsensus) +
    optional("--outputConsensusVariants", outputConsensusVariants) +
72
73
    conditional(snpsOnly, "--snpsOnly") +
    optional("--sampleName", sampleName) +
Peter van 't Hof's avatar
Peter van 't Hof committed
74
    required("--outputName", outputName) +
75
    optional("--minAD", minAD) +
Peter van 't Hof's avatar
Peter van 't Hof committed
76
77
    optional("--minDepth", minDepth) +
    optional("--reference", reference)
78
79
80
81
82
}

object BastyGenerateFasta extends ToolCommand {
  case class Args(inputVcf: File = null,
                  outputVariants: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
83
84
                  outputConsensus: File = null,
                  outputConsensusVariants: File = null,
85
86
87
                  bamFile: File = null,
                  snpsOnly: Boolean = false,
                  sampleName: String = null,
88
                  outputName: String = null,
89
                  minAD: Int = 8,
Peter van 't Hof's avatar
Peter van 't Hof committed
90
91
                  minDepth: Int = 8,
                  reference: File = null) extends AbstractArgs
92
93

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
94
    opt[File]('V', "inputVcf") unbounded () valueName "<file>" action { (x, c) =>
95
      c.copy(inputVcf = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
96
    } text "vcf file, needed for outputVariants and outputConsensusVariants" validate { x =>
Peter van 't Hof's avatar
Peter van 't Hof committed
97
98
      if (x.exists) success else failure("File does not exist: " + x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
99
    opt[File]("bamFile") unbounded () valueName "<file>" action { (x, c) =>
100
      c.copy(bamFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
101
    } text "bam file, needed for outputConsensus and outputConsensusVariants" validate { x =>
Peter van 't Hof's avatar
Peter van 't Hof committed
102
103
      if (x.exists) success else failure("File does not exist: " + x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
104
    opt[File]("outputVariants") maxOccurs 1 unbounded () valueName "<file>" action { (x, c) =>
105
      c.copy(outputVariants = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
106
107
    } text "fasta with only variants from vcf file"
    opt[File]("outputConsensus") maxOccurs 1 unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
108
      c.copy(outputConsensus = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
109
110
    } text "Consensus fasta from bam, always reference bases else 'N'"
    opt[File]("outputConsensusVariants") maxOccurs 1 unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
111
      c.copy(outputConsensusVariants = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
112
    } text "Consensus fasta from bam with variants from vcf file, always reference bases else 'N'"
113
114
    opt[Unit]("snpsOnly") unbounded () action { (x, c) =>
      c.copy(snpsOnly = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
115
    } text "Only use snps from vcf file"
116
117
    opt[String]("sampleName") unbounded () action { (x, c) =>
      c.copy(sampleName = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
118
    } text "Sample name in vcf file"
119
120
    opt[String]("outputName") required () unbounded () action { (x, c) =>
      c.copy(outputName = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
121
    } text "Output name in fasta file header"
122
123
    opt[Int]("minAD") unbounded () action { (x, c) =>
      c.copy(minAD = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
124
    } text "min AD value in vcf file for sample. Defaults to: 8"
Peter van 't Hof's avatar
Peter van 't Hof committed
125
126
    opt[Int]("minDepth") unbounded () action { (x, c) =>
      c.copy(minDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
127
    } text "min depth in bam file. Defaults to: 8"
Peter van 't Hof's avatar
Peter van 't Hof committed
128
129
    opt[File]("reference") unbounded () action { (x, c) =>
      c.copy(reference = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
130
    } text "Indexed reference fasta file" validate { x =>
Peter van 't Hof's avatar
Peter van 't Hof committed
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
      if (x.exists) success else failure("File does not exist: " + x)
    }

    checkConfig { c =>
      {
        val err: ListBuffer[String] = ListBuffer()
        if (c.outputConsensus != null || c.outputConsensusVariants != null) {
          if (c.reference == null)
            err.add("No reference suplied")
          else {
            val index = new File(c.reference.getAbsolutePath + ".fai")
            if (!index.exists) err.add("Reference does not have index")
          }
          if (c.outputConsensusVariants != null && c.inputVcf == null)
            err.add("To write outputVariants input vcf is required, please use --inputVcf option")
          if (c.sampleName != null && c.bamFile == null)
            err.add("To write Consensus input bam file is required, please use --bamFile option")
        }
        if (c.outputVariants != null && c.inputVcf == null)
          err.add("To write outputVariants input vcf is required, please use --inputVcf option")
        if (c.outputVariants == null && c.outputConsensus == null && c.outputConsensusVariants == null)
          err.add("No output file selected")
        if (err.isEmpty) success else failure(err.mkString("", "\nError: ", "\n"))
      }
    }
156
157
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
158
159
  protected var cmdArgs: Args = _
  private val chunkSize = 100000
160
161
162
163
164
165

  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
166
167
168
169
170
    cmdArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    if (cmdArgs.outputVariants != null) writeVariantsOnly()
    if (cmdArgs.outputConsensus != null || cmdArgs.outputConsensusVariants != null) writeConsensus()
  }
171

Peter van 't Hof's avatar
Peter van 't Hof committed
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
  protected def writeConsensus() {
    val referenceFile = new IndexedFastaSequenceFile(cmdArgs.reference)
    val referenceDict = referenceFile.getSequenceDictionary

    for (chr <- referenceDict.getSequences) {
      val chunks = (for (chunk <- (0 to (chr.getSequenceLength / chunkSize)).par) yield {
        val chrName = chr.getSequenceName
        val begin = chunk * chunkSize + 1
        val end = {
          val e = (chunk + 1) * chunkSize
          if (e > chr.getSequenceLength) chr.getSequenceLength else e
        }

        logger.info("begin on: chrName: " + chrName + "  begin: " + begin + "  end: " + end)

        val referenceSequence = referenceFile.getSubsequenceAt(chrName, begin, end)

        val variants: Map[(Int, Int), VariantContext] = if (cmdArgs.inputVcf != null) {
          val reader = new VCFFileReader(cmdArgs.inputVcf, true)
Peter van 't Hof's avatar
Peter van 't Hof committed
191
          (for (variant <- reader.query(chrName, begin, end) if !cmdArgs.snpsOnly || variant.isSNP) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
192
193
194
195
196
197
198
199
200
201
202
203
            (variant.getStart, variant.getEnd) -> variant
          }).toMap
        } else Map()

        val coverage: Array[Int] = Array.fill(end - begin + 1)(0)
        if (cmdArgs.bamFile != null) {
          val inputSam = SamReaderFactory.makeDefault.open(cmdArgs.bamFile)
          for (r <- inputSam.query(chr.getSequenceName, begin, end, false)) {
            val s = if (r.getAlignmentStart < begin) begin else r.getAlignmentStart
            val e = if (r.getAlignmentEnd > end) end else r.getAlignmentEnd
            for (t <- s to e) coverage(t - begin) += 1
          }
204
        } else {
Peter van 't Hof's avatar
Peter van 't Hof committed
205
          for (t <- coverage.indices) coverage(t) = cmdArgs.minDepth
Peter van 't Hof's avatar
Peter van 't Hof committed
206
207
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
208
        val consensus = for (t <- coverage.indices) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
209
210
211
          if (coverage(t) >= cmdArgs.minDepth) referenceSequence.getBases()(t).toChar
          else 'N'
        }
212
213
214
215
216
217
218
219
220
221
222
223
224

        val buffer: StringBuilder = new StringBuilder()
        if (cmdArgs.outputConsensusVariants != null) {
          var consensusPos = 0
          while (consensusPos < consensus.size) {
            val genomePos = consensusPos + begin
            val variant = variants.find(a => a._1._1 >= genomePos && a._1._2 <= genomePos)
            if (variant.isDefined) {
              logger.info(variant.get._2)
              val stripPrefix = if (variant.get._1._1 < begin) begin - variant.get._1._1 else 0
              val stripSufix = if (variant.get._1._2 > end) variant.get._1._2 - end else 0
              val allele = getMaxAllele(variant.get._2)
              consensusPos += variant.get._2.getReference.getBases.length
Peter van 't Hof's avatar
Peter van 't Hof committed
225
              buffer.append(allele.substring(stripPrefix, allele.length - stripSufix))
226
227
228
229
230
231
232
            } else {
              buffer.append(consensus(consensusPos))
              consensusPos += 1
            }
          }
        }

Peter van 't Hof's avatar
Peter van 't Hof committed
233
        chunk -> (consensus.mkString.toUpperCase, buffer.toString().toUpperCase)
Peter van 't Hof's avatar
Peter van 't Hof committed
234
      }).toMap
235
236
237
238
239
240
      if (cmdArgs.outputConsensus != null) {
        val writer = new PrintWriter(cmdArgs.outputConsensus)
        writer.println(">" + cmdArgs.outputName)
        for (c <- chunks.keySet.toList.sortWith(_ < _)) {
          writer.print(chunks(c)._1)
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
241
        writer.println()
Peter van 't Hof's avatar
Peter van 't Hof committed
242
        writer.close()
243
244
245
246
247
248
249
      }
      if (cmdArgs.outputConsensusVariants != null) {
        val writer = new PrintWriter(cmdArgs.outputConsensusVariants)
        writer.println(">" + cmdArgs.outputName)
        for (c <- chunks.keySet.toList.sortWith(_ < _)) {
          writer.print(chunks(c)._2)
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
250
        writer.println()
Peter van 't Hof's avatar
Peter van 't Hof committed
251
        writer.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
252
253
      }
    }
254
255
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
256
257
  protected def writeVariantsOnly() {
    val writer = new PrintWriter(cmdArgs.outputVariants)
Peter van 't Hof's avatar
Peter van 't Hof committed
258
    writer.println(">" + cmdArgs.outputName)
Peter van 't Hof's avatar
Peter van 't Hof committed
259
    val vcfReader = new VCFFileReader(cmdArgs.inputVcf, false)
Peter van 't Hof's avatar
Peter van 't Hof committed
260
    for (vcfRecord <- vcfReader if !cmdArgs.snpsOnly || vcfRecord.isSNP) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
261
      writer.print(getMaxAllele(vcfRecord))
262
263
    }
    writer.println()
Peter van 't Hof's avatar
Peter van 't Hof committed
264
265
    writer.close()
    vcfReader.close()
266
267
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
268
  protected def getMaxAllele(vcfRecord: VariantContext): String = {
Peter van 't Hof's avatar
Peter van 't Hof committed
269
    val maxSize = getLongestAllele(vcfRecord).getBases.length
270

Peter van 't Hof's avatar
Peter van 't Hof committed
271
    if (cmdArgs.sampleName == null) return fillAllele(vcfRecord.getReference.getBaseString, maxSize)
272

Peter van 't Hof's avatar
Peter van 't Hof committed
273
274
    val genotype = vcfRecord.getGenotype(cmdArgs.sampleName)
    if (genotype == null) return fillAllele("", maxSize)
Peter van 't Hof's avatar
Peter van 't Hof committed
275
    val AD = if (genotype.hasAD) genotype.getAD else Array.fill(vcfRecord.getAlleles.size())(cmdArgs.minAD)
Peter van 't Hof's avatar
Peter van 't Hof committed
276
    if (AD == null) return fillAllele("", maxSize)
277
    val maxADid = AD.zipWithIndex.maxBy(_._1)._2
Peter van 't Hof's avatar
Peter van 't Hof committed
278
    if (AD(maxADid) < cmdArgs.minAD) return fillAllele("", maxSize)
Peter van 't Hof's avatar
Peter van 't Hof committed
279
    fillAllele(vcfRecord.getAlleles()(maxADid).getBaseString, maxSize)
280
281
  }
}