VcfToTsv.scala 5.98 KB
Newer Older
1
2
3
4
5
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
Peter van 't Hof's avatar
Peter van 't Hof committed
6
 *
7
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
Peter van 't Hof's avatar
Peter van 't Hof committed
8
 *
9
 * Contact us at: sasc@lumc.nl
Peter van 't Hof's avatar
Peter van 't Hof committed
10
 *
11
12
13
14
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
Peter van 't Hof's avatar
Peter van 't Hof committed
15
16
17
18
19
20
21
22
 */
package nl.lumc.sasc.biopet.tools

import htsjdk.variant.vcf.VCFFileReader
import java.io.File
import java.io.PrintStream
import nl.lumc.sasc.biopet.core.ToolCommand
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import scala.collection.mutable.{ Map, ListBuffer }
Peter van 't Hof's avatar
Peter van 't Hof committed
24
25
26
27
28
29

class VcfToTsv {
  // TODO: Queue wrapper
}

object VcfToTsv extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
30
31
32
  case class Args(inputFile: File = null, outputFile: File = null, fields: List[String] = Nil, infoFields: List[String] = Nil,
                  sampleFileds: List[String] = Nil, disableDefaults: Boolean = false,
                  allInfo: Boolean = false, allFormat: Boolean = false) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
33
34

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
    opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputFile = x)
    }
    opt[File]('o', "outputFile") maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputFile = x)
    } text ("output file, default to stdout")
    opt[String]('f', "field") unbounded () action { (x, c) =>
      c.copy(fields = x :: c.fields)
    }
    opt[String]('i', "info_field") unbounded () action { (x, c) =>
      c.copy(infoFields = x :: c.infoFields)
    }
    opt[Unit]("all_info") unbounded () action { (x, c) =>
      c.copy(allInfo = true)
    }
    opt[Unit]("all_format") unbounded () action { (x, c) =>
      c.copy(allFormat = true)
    }
    opt[String]('s', "sample_field") unbounded () action { (x, c) =>
      c.copy(sampleFileds = x :: c.sampleFileds)
    }
    opt[Unit]('d', "disable_defaults") unbounded () action { (x, c) =>
      c.copy(disableDefaults = true)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
59
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
60

Peter van 't Hof's avatar
Peter van 't Hof committed
61
  val defaultFields = List("chr", "pos", "id", "ref", "alt", "qual")
Peter van 't Hof's avatar
Peter van 't Hof committed
62

Peter van 't Hof's avatar
Peter van 't Hof committed
63
64
65
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
66

Peter van 't Hof's avatar
Peter van 't Hof committed
67
68
69
    val reader = new VCFFileReader(commandArgs.inputFile, false)
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder
Peter van 't Hof's avatar
Peter van 't Hof committed
70

71
72
    val allInfoFields = header.getInfoHeaderLines.map(_.getID).toList
    val allFormatFields = header.getFormatHeaderLines.map(_.getID).toList
Peter van 't Hof's avatar
Peter van 't Hof committed
73
74
75
76
77
78
79
80
81

    val fields: Set[String] = (if (commandArgs.disableDefaults) Nil else defaultFields).toSet[String] ++
      commandArgs.fields.toSet[String] ++
      (if (commandArgs.allInfo) allInfoFields else commandArgs.infoFields).map("INFO-" + _) ++ {
        val buffer: ListBuffer[String] = ListBuffer()
        for (f <- (if (commandArgs.allFormat) allFormatFields else commandArgs.sampleFileds); sample <- samples) {
          buffer += sample + "-" + f
        }
        buffer.toSet[String]
Peter van 't Hof's avatar
Peter van 't Hof committed
82
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
83
84
85
86

    val sortedFields = fields.toList.sortWith((a, b) => {
      val aT = if (a.startsWith("INFO-")) 'i' else if (samples.exists(x => a.startsWith(x + "-"))) 'f' else 'g'
      val bT = if (b.startsWith("INFO-")) 'i' else if (samples.exists(x => b.startsWith(x + "-"))) 'f' else 'g'
87
88
89
90
91
      if (aT == 'g' && bT == 'g') {
        val ai = defaultFields.indexOf(a)
        val bi = defaultFields.indexOf(b)
        if (bi < 0) true
        else ai <= bi
Peter van 't Hof's avatar
Peter van 't Hof committed
92
      } else if (aT == 'g') true
93
94
95
96
97
      else if (bT == 'g') false
      else if (aT == bT) (if (a.compareTo(b) > 0) false else true)
      else if (aT == 'i') true
      else false
    })
Peter van 't Hof's avatar
Peter van 't Hof committed
98

Peter van 't Hof's avatar
Peter van 't Hof committed
99
    val witter = if (commandArgs.outputFile != null) new PrintStream(commandArgs.outputFile)
100
    else sys.process.stdout
Peter van 't Hof's avatar
Peter van 't Hof committed
101

102
    witter.println(sortedFields.mkString("#", "\t", ""))
Peter van 't Hof's avatar
Peter van 't Hof committed
103
104
105
106
107
108
109
110
111
112
    for (vcfRecord <- reader) {
      val values: Map[String, Any] = Map()
      values += "chr" -> vcfRecord.getChr
      values += "pos" -> vcfRecord.getStart
      values += "id" -> vcfRecord.getID
      values += "ref" -> vcfRecord.getReference.getBaseString
      values += "alt" -> {
        val t = for (a <- vcfRecord.getAlternateAlleles) yield a.getBaseString
        t.mkString(",")
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
113
      values += "qual" -> (if (vcfRecord.getPhredScaledQual == -10) "." else scala.math.round(vcfRecord.getPhredScaledQual * 100.0) / 100.0)
Peter van 't Hof's avatar
Peter van 't Hof committed
114
115
      values += "filter" -> vcfRecord.getFilters
      for ((field, content) <- vcfRecord.getAttributes) {
Peter van 't Hof's avatar
Peter van 't Hof committed
116
        values += "INFO-" + field -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
117
          content match {
Peter van 't Hof's avatar
Peter van 't Hof committed
118
119
120
121
            case a: List[_]                => a.mkString(",")
            case a: Array[_]               => a.mkString(",")
            case a: java.util.ArrayList[_] => a.mkString(",")
            case _                         => content
Peter van 't Hof's avatar
Peter van 't Hof committed
122
123
124
          }
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
125

Peter van 't Hof's avatar
Peter van 't Hof committed
126
127
      for (sample <- samples) {
        val genotype = vcfRecord.getGenotype(sample)
Peter van 't Hof's avatar
Peter van 't Hof committed
128
        values += sample + "-GT" -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
129
130
131
          val l = for (g <- genotype.getAlleles) yield vcfRecord.getAlleleIndex(g)
          l.map(x => if (x < 0) "." else x).mkString("/")
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
132
133
134
135
        if (genotype.hasAD) values += sample + "-AD" -> List(genotype.getAD: _*).mkString(",")
        if (genotype.hasDP) values += sample + "-DP" -> genotype.getDP
        if (genotype.hasGQ) values += sample + "-GQ" -> genotype.getGQ
        if (genotype.hasPL) values += sample + "-PL" -> List(genotype.getPL: _*).mkString(",")
Peter van 't Hof's avatar
Peter van 't Hof committed
136
        for ((field, content) <- genotype.getExtendedAttributes) {
Peter van 't Hof's avatar
Peter van 't Hof committed
137
          values += sample + "-" + field -> content
Peter van 't Hof's avatar
Peter van 't Hof committed
138
139
        }
      }
140
      val line = for (f <- sortedFields) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
141
142
        if (values.contains(f)) {
          values(f)
Peter van 't Hof's avatar
Peter van 't Hof committed
143
144
145
146
147
148
        } else ""
      }
      witter.println(line.mkString("\t"))
    }
  }
}