VcfToTsv.scala 8.05 KB
Newer Older
1
2
3
4
5
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
Peter van 't Hof's avatar
Peter van 't Hof committed
6
 *
7
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
Peter van 't Hof's avatar
Peter van 't Hof committed
8
 *
9
 * Contact us at: sasc@lumc.nl
Peter van 't Hof's avatar
Peter van 't Hof committed
10
 *
11
12
13
14
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
Peter van 't Hof's avatar
Peter van 't Hof committed
15
16
17
 */
package nl.lumc.sasc.biopet.tools

18
19
20
import java.text.DecimalFormat
import java.util

Peter van 't Hof's avatar
Peter van 't Hof committed
21
22
23
24
25
import htsjdk.variant.vcf.VCFFileReader
import java.io.File
import java.io.PrintStream
import nl.lumc.sasc.biopet.core.ToolCommand
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
26
import scala.collection.mutable.{ Map, ListBuffer }
Peter van 't Hof's avatar
Peter van 't Hof committed
27
28
29
30
31
32

class VcfToTsv {
  // TODO: Queue wrapper
}

object VcfToTsv extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
33
  case class Args(inputFile: File = null, outputFile: File = null, fields: List[String] = Nil, infoFields: List[String] = Nil,
34
                  sampleFields: List[String] = Nil, disableDefaults: Boolean = false,
35
                  allInfo: Boolean = false, allFormat: Boolean = false,
36
                  separator: String = "\t", listSeparator: String = ",", maxDecimals: Int = 2) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
37
38

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
    opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputFile = x)
    }
    opt[File]('o', "outputFile") maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputFile = x)
    } text ("output file, default to stdout")
    opt[String]('f', "field") unbounded () action { (x, c) =>
      c.copy(fields = x :: c.fields)
    }
    opt[String]('i', "info_field") unbounded () action { (x, c) =>
      c.copy(infoFields = x :: c.infoFields)
    }
    opt[Unit]("all_info") unbounded () action { (x, c) =>
      c.copy(allInfo = true)
    }
    opt[Unit]("all_format") unbounded () action { (x, c) =>
      c.copy(allFormat = true)
    }
    opt[String]('s', "sample_field") unbounded () action { (x, c) =>
58
      c.copy(sampleFields = x :: c.sampleFields)
Peter van 't Hof's avatar
Peter van 't Hof committed
59
60
61
62
    }
    opt[Unit]('d', "disable_defaults") unbounded () action { (x, c) =>
      c.copy(disableDefaults = true)
    }
63
64
65
66
67
68
    opt[String]("separator") maxOccurs (1) action { (x, c) =>
      c.copy(separator = x)
    } text ("Optional separator. Default is tab-delimited")
    opt[String]("list_separator") maxOccurs (1) action { (x, c) =>
      c.copy(listSeparator = x)
    } text ("Optional list separator. By default, lists are separated by a comma")
69
70
71
    opt[Int]("max_decimals") maxOccurs(1) action { (x, c) =>
      c.copy(maxDecimals = x)
    } text ("Number of decimal places for numbers. Default is 2")
Peter van 't Hof's avatar
Peter van 't Hof committed
72
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
73

74
  val defaultFields = List("CHROM", "POS", "ID", "REF", "ALT", "QUAL")
Peter van 't Hof's avatar
Peter van 't Hof committed
75

Peter van 't Hof's avatar
Peter van 't Hof committed
76
77
78
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
79

80
81
82
83
84
85
86
    // Throw exception if separator and listSeparator are identical
    if (commandArgs.separator == commandArgs.listSeparator) throw new IllegalArgumentException(
    "Separator and list_separator should not be identical"
    )

    val formatter = createFormatter(commandArgs.maxDecimals)

Peter van 't Hof's avatar
Peter van 't Hof committed
87
88
89
    val reader = new VCFFileReader(commandArgs.inputFile, false)
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder
Peter van 't Hof's avatar
Peter van 't Hof committed
90

91
92
    val allInfoFields = header.getInfoHeaderLines.map(_.getID).toList
    val allFormatFields = header.getFormatHeaderLines.map(_.getID).toList
Peter van 't Hof's avatar
Peter van 't Hof committed
93
94
95
96
97

    val fields: Set[String] = (if (commandArgs.disableDefaults) Nil else defaultFields).toSet[String] ++
      commandArgs.fields.toSet[String] ++
      (if (commandArgs.allInfo) allInfoFields else commandArgs.infoFields).map("INFO-" + _) ++ {
        val buffer: ListBuffer[String] = ListBuffer()
98
        for (f <- (if (commandArgs.allFormat) allFormatFields else commandArgs.sampleFields); sample <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
99
100
101
          buffer += sample + "-" + f
        }
        buffer.toSet[String]
Peter van 't Hof's avatar
Peter van 't Hof committed
102
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
103

104
105

    val sortedFields = sortFields(fields, samples.toList)
Peter van 't Hof's avatar
Peter van 't Hof committed
106

Sander Bollen's avatar
Sander Bollen committed
107
    val writer = if (commandArgs.outputFile != null) new PrintStream(commandArgs.outputFile)
108
    else sys.process.stdout
Peter van 't Hof's avatar
Peter van 't Hof committed
109

110
    writer.println(sortedFields.mkString("#", commandArgs.separator, ""))
Peter van 't Hof's avatar
Peter van 't Hof committed
111
112
    for (vcfRecord <- reader) {
      val values: Map[String, Any] = Map()
113
114
115
116
117
      values += "CHROM" -> vcfRecord.getChr
      values += "POS" -> vcfRecord.getStart
      values += "ID" -> vcfRecord.getID
      values += "REF" -> vcfRecord.getReference.getBaseString
      values += "ALT" -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
118
        val t = for (a <- vcfRecord.getAlternateAlleles) yield a.getBaseString
119
        t.mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
120
      }
121
      values += "QUAL" -> (if (vcfRecord.getPhredScaledQual == -10) "." else formatter.format(vcfRecord.getPhredScaledQual))
122
      values += "INFO" -> vcfRecord.getFilters
Peter van 't Hof's avatar
Peter van 't Hof committed
123
      for ((field, content) <- vcfRecord.getAttributes) {
Peter van 't Hof's avatar
Peter van 't Hof committed
124
        values += "INFO-" + field -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
125
          content match {
126
127
128
            case a: List[_]                => a.mkString(commandArgs.listSeparator)
            case a: Array[_]               => a.mkString(commandArgs.listSeparator)
            case a: java.util.ArrayList[_] => a.mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
129
            case _                         => content
Peter van 't Hof's avatar
Peter van 't Hof committed
130
131
132
          }
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
133

Peter van 't Hof's avatar
Peter van 't Hof committed
134
135
      for (sample <- samples) {
        val genotype = vcfRecord.getGenotype(sample)
Peter van 't Hof's avatar
Peter van 't Hof committed
136
        values += sample + "-GT" -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
137
138
139
          val l = for (g <- genotype.getAlleles) yield vcfRecord.getAlleleIndex(g)
          l.map(x => if (x < 0) "." else x).mkString("/")
        }
140
141
142
143
        if (genotype.hasAD) values += sample + "-AD"   -> List(genotype.getAD: _*).mkString(commandArgs.listSeparator)
        if (genotype.hasDP) values += sample + "-DP"   -> genotype.getDP
        if (genotype.hasGQ) values += sample + "-GQ"   -> genotype.getGQ
        if (genotype.hasPL) values += sample + "-PL"   -> List(genotype.getPL: _*).mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
144
        for ((field, content) <- genotype.getExtendedAttributes) {
Peter van 't Hof's avatar
Peter van 't Hof committed
145
          values += sample + "-" + field -> content
Peter van 't Hof's avatar
Peter van 't Hof committed
146
147
        }
      }
148
      val line = for (f <- sortedFields) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
149
150
        if (values.contains(f)) {
          values(f)
Peter van 't Hof's avatar
Peter van 't Hof committed
151
152
        } else ""
      }
153
      writer.println(line.mkString(commandArgs.separator))
Peter van 't Hof's avatar
Peter van 't Hof committed
154
155
    }
  }
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206

  /**
   *  This function creates a correct DecimalFormat for a specific length of decimals
   * @param len number of decimal places
   * @return DecimalFormat formatter
   */
  def createFormatter(len: Int): DecimalFormat = {
    val patternString = "###." + (for (x <- (1 to len)) yield "#").mkString("")
    new DecimalFormat(patternString)
  }


  /**
   * This fields sorts fields, such that non-info and non-sample specific fields (e.g. general ones) are on front
   * followed by info fields
   * followed by sample-specific fields
   * @param fields fields
   * @param samples samples
   * @return sorted samples
   */
  def sortFields(fields: Set[String], samples: List[String]): List[String] = {
    def fieldType(x: String) = x match {
      case _ if x.startsWith("INFO-") => 'i'
      case _ if (samples.exists(y => x.startsWith(y + "-"))) => 'f'
      case _ => 'g'
    }

    fields.toList.sortWith((a, b) => {
      (fieldType(a), fieldType(b)) match {
        case ('g','g') => {
          val ai = defaultFields.indexOf(a)
          val bi = defaultFields.indexOf(b)
          if (bi < 0) true else ai <= bi
        }
        case ('f', 'f') => {
          val sampleA = a.split("-").head
          val sampleB = b.split("-").head
          sampleA.compareTo(sampleB) match {
            case 0 => !(a.compareTo(b) > 0)
            case i if (i > 0) => false
            case _  => true
          }
        }
        case ('g', _) => true
        case (_, 'g') => false
        case (a, b) if a == b => !(a.compareTo(b) > 0)
        case ('i', _) => true
        case _ => false
      }
    })
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
207
}