BastyGenerateFasta.scala 9.64 KB
Newer Older
1
2
package nl.lumc.sasc.biopet.tools

Peter van 't Hof's avatar
Peter van 't Hof committed
3
4
import htsjdk.samtools.SamReaderFactory
import htsjdk.samtools.reference.IndexedFastaSequenceFile
5
6
7
8
9
10
11
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.vcf.VCFFileReader
import java.io.File
import java.io.PrintWriter
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable
Peter van 't Hof's avatar
Peter van 't Hof committed
12
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
13
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
14
import nl.lumc.sasc.biopet.utils.VcfUtils._
15
16
17
18
19
20
21
22
23
24

class BastyGenerateFasta(val root: Configurable) extends BiopetJavaCommandLineFunction {
  javaMainClass = getClass.getName

  @Input(doc = "Input vcf file", required = false)
  var inputVcf: File = _

  @Input(doc = "Bam File", required = false)
  var bamFile: File = _

Peter van 't Hof's avatar
Peter van 't Hof committed
25
26
27
  @Input(doc = "reference", required = false)
  var reference: File = config("reference")

28
29
30
  @Output(doc = "Output fasta, variants only", required = false)
  var outputVariants: File = _

Peter van 't Hof's avatar
Peter van 't Hof committed
31
32
  @Output(doc = "Output fasta, variants only", required = false)
  var outputConsensus: File = _
33

Peter van 't Hof's avatar
Peter van 't Hof committed
34
35
  @Output(doc = "Output fasta, variants only", required = false)
  var outputConsensusVariants: File = _
36

Peter van 't Hof's avatar
Peter van 't Hof committed
37
38
  var snpsOnly: Boolean = config("snps_only", default = false)
  var sampleName: String = _
39
  var minAD: Int = config("min_ad", default = 8)
Peter van 't Hof's avatar
Peter van 't Hof committed
40
41
  var minDepth: Int = config("min_depth", default = 8)
  var outputName: String = _
42
43
44
45
46
47
48
49

  override val defaultVmem = "8G"
  memoryLimit = Option(4.0)

  override def commandLine = super.commandLine +
    optional("--inputVcf", inputVcf) +
    optional("--bamFile", bamFile) +
    optional("--outputVariants", outputVariants) +
Peter van 't Hof's avatar
Peter van 't Hof committed
50
51
    optional("--outputVariants", outputConsensus) +
    optional("--outputVariants", outputConsensusVariants) +
52
53
    conditional(snpsOnly, "--snpsOnly") +
    optional("--sampleName", sampleName) +
Peter van 't Hof's avatar
Peter van 't Hof committed
54
    required("--outputName", outputName) +
55
    optional("--minAD", minAD) +
Peter van 't Hof's avatar
Peter van 't Hof committed
56
57
    optional("--minDepth", minDepth) +
    optional("--reference", reference)
58
59
60
61
62
}

object BastyGenerateFasta extends ToolCommand {
  case class Args(inputVcf: File = null,
                  outputVariants: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
63
64
                  outputConsensus: File = null,
                  outputConsensusVariants: File = null,
65
66
67
                  bamFile: File = null,
                  snpsOnly: Boolean = false,
                  sampleName: String = null,
68
                  outputName: String = null,
69
                  minAD: Int = 8,
Peter van 't Hof's avatar
Peter van 't Hof committed
70
71
                  minDepth: Int = 8,
                  reference: File = null) extends AbstractArgs
72
73
74
75
76
77
78
79
80
81
82

  class OptParser extends AbstractOptParser {
    opt[File]('V', "inputVcf") unbounded () valueName ("<file>") action { (x, c) =>
      c.copy(inputVcf = x)
    }
    opt[File]("bamFile") unbounded () valueName ("<file>") action { (x, c) =>
      c.copy(bamFile = x)
    }
    opt[File]("outputVariants") unbounded () valueName ("<file>") action { (x, c) =>
      c.copy(outputVariants = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
83
84
85
86
87
88
    opt[File]("outputConsensus") unbounded () valueName ("<file>") action { (x, c) =>
      c.copy(outputConsensus = x)
    }
    opt[File]("outputConsensusVariants") unbounded () valueName ("<file>") action { (x, c) =>
      c.copy(outputConsensusVariants = x)
    }
89
90
91
92
93
94
    opt[Unit]("snpsOnly") unbounded () action { (x, c) =>
      c.copy(snpsOnly = true)
    }
    opt[String]("sampleName") unbounded () action { (x, c) =>
      c.copy(sampleName = x)
    }
95
96
97
    opt[String]("outputName") required () unbounded () action { (x, c) =>
      c.copy(outputName = x)
    }
98
99
100
    opt[Int]("minAD") unbounded () action { (x, c) =>
      c.copy(minAD = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
101
102
103
104
105
    opt[Int]("minDepth") unbounded () action { (x, c) =>
      c.copy(minDepth = x)
    }
    opt[File]("reference") unbounded () action { (x, c) =>
      c.copy(reference = x)
106
    }
107
108
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
109
110
  protected var cmdArgs: Args = _
  private val chunkSize = 100000
111
112
113
114
115
116

  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
117
118
119
120
121
    cmdArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    if (cmdArgs.outputVariants != null) writeVariantsOnly()
    if (cmdArgs.outputConsensus != null || cmdArgs.outputConsensusVariants != null) writeConsensus()
  }
122

Peter van 't Hof's avatar
Peter van 't Hof committed
123
124
125
126
127
128
129
130
131
  protected def writeConsensus() {
    if (cmdArgs.reference == null)
      throw new IllegalStateException("No reference suplied")
    if (cmdArgs.outputConsensusVariants != null) {
      if (cmdArgs.inputVcf == null)
        throw new IllegalStateException("To write outputVariants input vcf is required, please use --inputVcf option")
      if (!cmdArgs.inputVcf.exists)
        throw new IllegalStateException("File does not exist: " + cmdArgs.inputVcf)
    }
132
    if (cmdArgs.sampleName != null) {
Peter van 't Hof's avatar
Peter van 't Hof committed
133
134
135
136
137
      if (cmdArgs.bamFile == null)
        throw new IllegalStateException("To write Consensus input bam file is required, please use --bamFile option")
      if (!cmdArgs.bamFile.exists)
        throw new IllegalStateException("File does not exist: " + cmdArgs.bamFile)
    }
138

Peter van 't Hof's avatar
Peter van 't Hof committed
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
    logger.info(cmdArgs.reference)

    val referenceFile = new IndexedFastaSequenceFile(cmdArgs.reference)
    val referenceDict = referenceFile.getSequenceDictionary

    for (chr <- referenceDict.getSequences) {
      val chunks = (for (chunk <- (0 to (chr.getSequenceLength / chunkSize)).par) yield {
        val chrName = chr.getSequenceName
        val begin = chunk * chunkSize + 1
        val end = {
          val e = (chunk + 1) * chunkSize
          if (e > chr.getSequenceLength) chr.getSequenceLength else e
        }

        logger.info("begin on: chrName: " + chrName + "  begin: " + begin + "  end: " + end)

        val referenceSequence = referenceFile.getSubsequenceAt(chrName, begin, end)

        val variants: Map[(Int, Int), VariantContext] = if (cmdArgs.inputVcf != null) {
          val reader = new VCFFileReader(cmdArgs.inputVcf, true)
159
          (for (variant <- reader.query(chrName, begin, end) if (!cmdArgs.snpsOnly || variant.isSNP)) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
160
161
162
163
164
165
166
167
168
169
170
171
            (variant.getStart, variant.getEnd) -> variant
          }).toMap
        } else Map()

        val coverage: Array[Int] = Array.fill(end - begin + 1)(0)
        if (cmdArgs.bamFile != null) {
          val inputSam = SamReaderFactory.makeDefault.open(cmdArgs.bamFile)
          for (r <- inputSam.query(chr.getSequenceName, begin, end, false)) {
            val s = if (r.getAlignmentStart < begin) begin else r.getAlignmentStart
            val e = if (r.getAlignmentEnd > end) end else r.getAlignmentEnd
            for (t <- s to e) coverage(t - begin) += 1
          }
172
173
        } else {
          for (t <- 0 until coverage.length) coverage(t) = cmdArgs.minDepth
Peter van 't Hof's avatar
Peter van 't Hof committed
174
175
176
177
178
179
        }

        val consensus = for (t <- 0 until coverage.length) yield {
          if (coverage(t) >= cmdArgs.minDepth) referenceSequence.getBases()(t).toChar
          else 'N'
        }
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201

        val buffer: StringBuilder = new StringBuilder()
        if (cmdArgs.outputConsensusVariants != null) {
          var consensusPos = 0
          while (consensusPos < consensus.size) {
            val genomePos = consensusPos + begin
            val variant = variants.find(a => a._1._1 >= genomePos && a._1._2 <= genomePos)
            if (variant.isDefined) {
              logger.info(variant.get._2)
              val stripPrefix = if (variant.get._1._1 < begin) begin - variant.get._1._1 else 0
              val stripSufix = if (variant.get._1._2 > end) variant.get._1._2 - end else 0
              val allele = getMaxAllele(variant.get._2)
              consensusPos += variant.get._2.getReference.getBases.length
              buffer.append(allele.substring(stripPrefix, allele.size - stripSufix))
            } else {
              buffer.append(consensus(consensusPos))
              consensusPos += 1
            }
          }
        }

        (chunk -> (consensus.mkString.toUpperCase, buffer.toString.toUpperCase))
Peter van 't Hof's avatar
Peter van 't Hof committed
202
      }).toMap
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
      if (cmdArgs.outputConsensus != null) {
        val writer = new PrintWriter(cmdArgs.outputConsensus)
        writer.println(">" + cmdArgs.outputName)
        for (c <- chunks.keySet.toList.sortWith(_ < _)) {
          writer.print(chunks(c)._1)
        }
        writer.close
      }
      if (cmdArgs.outputConsensusVariants != null) {
        val writer = new PrintWriter(cmdArgs.outputConsensusVariants)
        writer.println(">" + cmdArgs.outputName)
        for (c <- chunks.keySet.toList.sortWith(_ < _)) {
          writer.print(chunks(c)._2)
        }
        writer.close
Peter van 't Hof's avatar
Peter van 't Hof committed
218
219
      }
    }
220
221
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
222
223
224
225
226
227
228
229
230
231
  protected def writeVariantsOnly() {
    if (cmdArgs.inputVcf == null)
      throw new IllegalStateException("To write outputVariants input vcf is required, please use --inputVcf option")
    if (!cmdArgs.inputVcf.exists)
      throw new IllegalStateException("File does not exist: " + cmdArgs.inputVcf)
    val writer = new PrintWriter(cmdArgs.outputVariants)
    writer.println(">" + cmdArgs.sampleName)
    val vcfReader = new VCFFileReader(cmdArgs.inputVcf, false)
    for (vcfRecord <- vcfReader if (!cmdArgs.snpsOnly || vcfRecord.isSNP)) yield {
      writer.print(getMaxAllele(vcfRecord))
232
233
234
235
236
237
    }
    writer.println()
    writer.close
    vcfReader.close
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
238
  protected def getMaxAllele(vcfRecord: VariantContext): String = {
Peter van 't Hof's avatar
Peter van 't Hof committed
239
    val maxSize = getLongestAllele(vcfRecord).getBases.length
240

Peter van 't Hof's avatar
Peter van 't Hof committed
241
    if (cmdArgs.sampleName == null) return fillAllele(vcfRecord.getReference.getBaseString, maxSize)
242

Peter van 't Hof's avatar
Peter van 't Hof committed
243
244
    val genotype = vcfRecord.getGenotype(cmdArgs.sampleName)
    if (genotype == null) return fillAllele("", maxSize)
245
    val AD = genotype.getAD
Peter van 't Hof's avatar
Peter van 't Hof committed
246
    if (AD == null) return fillAllele("", maxSize)
247
    val maxADid = AD.zipWithIndex.maxBy(_._1)._2
Peter van 't Hof's avatar
Peter van 't Hof committed
248
249
    if (AD(maxADid) < cmdArgs.minAD) return fillAllele("", maxSize)
    return fillAllele(vcfRecord.getAlleles()(maxADid).getBaseString, maxSize)
250
251
  }
}