VcfToTsv.scala 8.11 KB
Newer Older
1
2
3
4
5
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
Peter van 't Hof's avatar
Peter van 't Hof committed
6
 *
7
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
Peter van 't Hof's avatar
Peter van 't Hof committed
8
 *
9
 * Contact us at: sasc@lumc.nl
Peter van 't Hof's avatar
Peter van 't Hof committed
10
 *
11
12
13
14
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
Peter van 't Hof's avatar
Peter van 't Hof committed
15
16
17
 */
package nl.lumc.sasc.biopet.tools

18
19
20
import java.text.DecimalFormat
import java.util

Peter van 't Hof's avatar
Peter van 't Hof committed
21
22
23
24
25
import htsjdk.variant.vcf.VCFFileReader
import java.io.File
import java.io.PrintStream
import nl.lumc.sasc.biopet.core.ToolCommand
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
26
import scala.collection.mutable.{ Map, ListBuffer }
Peter van 't Hof's avatar
Peter van 't Hof committed
27
28
29
30
31
32

class VcfToTsv {
  // TODO: Queue wrapper
}

object VcfToTsv extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
33
  case class Args(inputFile: File = null, outputFile: File = null, fields: List[String] = Nil, infoFields: List[String] = Nil,
34
                  sampleFields: List[String] = Nil, disableDefaults: Boolean = false,
35
                  allInfo: Boolean = false, allFormat: Boolean = false,
36
                  separator: String = "\t", listSeparator: String = ",", maxDecimals: Int = 2) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
37
38

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
    opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputFile = x)
    }
    opt[File]('o', "outputFile") maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputFile = x)
    } text ("output file, default to stdout")
    opt[String]('f', "field") unbounded () action { (x, c) =>
      c.copy(fields = x :: c.fields)
    }
    opt[String]('i', "info_field") unbounded () action { (x, c) =>
      c.copy(infoFields = x :: c.infoFields)
    }
    opt[Unit]("all_info") unbounded () action { (x, c) =>
      c.copy(allInfo = true)
    }
    opt[Unit]("all_format") unbounded () action { (x, c) =>
      c.copy(allFormat = true)
    }
    opt[String]('s', "sample_field") unbounded () action { (x, c) =>
58
      c.copy(sampleFields = x :: c.sampleFields)
Peter van 't Hof's avatar
Peter van 't Hof committed
59
60
61
62
    }
    opt[Unit]('d', "disable_defaults") unbounded () action { (x, c) =>
      c.copy(disableDefaults = true)
    }
63
64
65
66
67
68
    opt[String]("separator") maxOccurs (1) action { (x, c) =>
      c.copy(separator = x)
    } text ("Optional separator. Default is tab-delimited")
    opt[String]("list_separator") maxOccurs (1) action { (x, c) =>
      c.copy(listSeparator = x)
    } text ("Optional list separator. By default, lists are separated by a comma")
Peter van 't Hof's avatar
Peter van 't Hof committed
69
    opt[Int]("max_decimals") maxOccurs (1) action { (x, c) =>
70
71
      c.copy(maxDecimals = x)
    } text ("Number of decimal places for numbers. Default is 2")
Peter van 't Hof's avatar
Peter van 't Hof committed
72
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
73

74
  val defaultFields = List("CHROM", "POS", "ID", "REF", "ALT", "QUAL")
Peter van 't Hof's avatar
Peter van 't Hof committed
75

Peter van 't Hof's avatar
Peter van 't Hof committed
76
77
78
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
79

80
81
    // Throw exception if separator and listSeparator are identical
    if (commandArgs.separator == commandArgs.listSeparator) throw new IllegalArgumentException(
Peter van 't Hof's avatar
Peter van 't Hof committed
82
      "Separator and list_separator should not be identical"
83
84
85
86
    )

    val formatter = createFormatter(commandArgs.maxDecimals)

Peter van 't Hof's avatar
Peter van 't Hof committed
87
88
89
    val reader = new VCFFileReader(commandArgs.inputFile, false)
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder
Peter van 't Hof's avatar
Peter van 't Hof committed
90

91
92
    val allInfoFields = header.getInfoHeaderLines.map(_.getID).toList
    val allFormatFields = header.getFormatHeaderLines.map(_.getID).toList
Peter van 't Hof's avatar
Peter van 't Hof committed
93
94
95
96
97

    val fields: Set[String] = (if (commandArgs.disableDefaults) Nil else defaultFields).toSet[String] ++
      commandArgs.fields.toSet[String] ++
      (if (commandArgs.allInfo) allInfoFields else commandArgs.infoFields).map("INFO-" + _) ++ {
        val buffer: ListBuffer[String] = ListBuffer()
98
        for (f <- (if (commandArgs.allFormat) allFormatFields else commandArgs.sampleFields); sample <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
99
100
101
          buffer += sample + "-" + f
        }
        buffer.toSet[String]
Peter van 't Hof's avatar
Peter van 't Hof committed
102
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
103

104
    val sortedFields = sortFields(fields, samples.toList)
Peter van 't Hof's avatar
Peter van 't Hof committed
105

Sander Bollen's avatar
Sander Bollen committed
106
    val writer = if (commandArgs.outputFile != null) new PrintStream(commandArgs.outputFile)
107
    else sys.process.stdout
Peter van 't Hof's avatar
Peter van 't Hof committed
108

109
    writer.println(sortedFields.mkString("#", commandArgs.separator, ""))
Peter van 't Hof's avatar
Peter van 't Hof committed
110
111
    for (vcfRecord <- reader) {
      val values: Map[String, Any] = Map()
112
113
114
115
116
      values += "CHROM" -> vcfRecord.getChr
      values += "POS" -> vcfRecord.getStart
      values += "ID" -> vcfRecord.getID
      values += "REF" -> vcfRecord.getReference.getBaseString
      values += "ALT" -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
117
        val t = for (a <- vcfRecord.getAlternateAlleles) yield a.getBaseString
118
        t.mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
119
      }
120
      values += "QUAL" -> (if (vcfRecord.getPhredScaledQual == -10) "." else formatter.format(vcfRecord.getPhredScaledQual))
121
      values += "INFO" -> vcfRecord.getFilters
Peter van 't Hof's avatar
Peter van 't Hof committed
122
      for ((field, content) <- vcfRecord.getAttributes) {
Peter van 't Hof's avatar
Peter van 't Hof committed
123
        values += "INFO-" + field -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
124
          content match {
125
126
127
            case a: List[_]                => a.mkString(commandArgs.listSeparator)
            case a: Array[_]               => a.mkString(commandArgs.listSeparator)
            case a: java.util.ArrayList[_] => a.mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
128
            case _                         => content
Peter van 't Hof's avatar
Peter van 't Hof committed
129
130
131
          }
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
132

Peter van 't Hof's avatar
Peter van 't Hof committed
133
134
      for (sample <- samples) {
        val genotype = vcfRecord.getGenotype(sample)
Peter van 't Hof's avatar
Peter van 't Hof committed
135
        values += sample + "-GT" -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
136
137
138
          val l = for (g <- genotype.getAlleles) yield vcfRecord.getAlleleIndex(g)
          l.map(x => if (x < 0) "." else x).mkString("/")
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
139
140
141
142
        if (genotype.hasAD) values += sample + "-AD" -> List(genotype.getAD: _*).mkString(commandArgs.listSeparator)
        if (genotype.hasDP) values += sample + "-DP" -> genotype.getDP
        if (genotype.hasGQ) values += sample + "-GQ" -> genotype.getGQ
        if (genotype.hasPL) values += sample + "-PL" -> List(genotype.getPL: _*).mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
143
        for ((field, content) <- genotype.getExtendedAttributes) {
Peter van 't Hof's avatar
Peter van 't Hof committed
144
          values += sample + "-" + field -> content
Peter van 't Hof's avatar
Peter van 't Hof committed
145
146
        }
      }
147
      val line = for (f <- sortedFields) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
148
149
        if (values.contains(f)) {
          values(f)
Peter van 't Hof's avatar
Peter van 't Hof committed
150
151
        } else ""
      }
152
      writer.println(line.mkString(commandArgs.separator))
Peter van 't Hof's avatar
Peter van 't Hof committed
153
154
    }
  }
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182

  /**
   *  This function creates a correct DecimalFormat for a specific length of decimals
   * @param len number of decimal places
   * @return DecimalFormat formatter
   */
  def createFormatter(len: Int): DecimalFormat = {
    val patternString = "###." + (for (x <- (1 to len)) yield "#").mkString("")
    new DecimalFormat(patternString)
  }

  /**
   * This fields sorts fields, such that non-info and non-sample specific fields (e.g. general ones) are on front
   * followed by info fields
   * followed by sample-specific fields
   * @param fields fields
   * @param samples samples
   * @return sorted samples
   */
  def sortFields(fields: Set[String], samples: List[String]): List[String] = {
    def fieldType(x: String) = x match {
      case _ if x.startsWith("INFO-") => 'i'
      case _ if (samples.exists(y => x.startsWith(y + "-"))) => 'f'
      case _ => 'g'
    }

    fields.toList.sortWith((a, b) => {
      (fieldType(a), fieldType(b)) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
183
        case ('g', 'g') => {
184
185
186
187
188
189
190
191
          val ai = defaultFields.indexOf(a)
          val bi = defaultFields.indexOf(b)
          if (bi < 0) true else ai <= bi
        }
        case ('f', 'f') => {
          val sampleA = a.split("-").head
          val sampleB = b.split("-").head
          sampleA.compareTo(sampleB) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
192
            case 0            => !(a.compareTo(b) > 0)
193
            case i if (i > 0) => false
Peter van 't Hof's avatar
Peter van 't Hof committed
194
            case _            => true
195
196
          }
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
197
198
        case ('g', _)         => true
        case (_, 'g')         => false
199
        case (a, b) if a == b => !(a.compareTo(b) > 0)
Peter van 't Hof's avatar
Peter van 't Hof committed
200
201
        case ('i', _)         => true
        case _                => false
202
203
204
      }
    })
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
205
}