VcfWithVcf.scala 4.84 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
package nl.lumc.sasc.biopet.tools

import java.io.File

import scala.collection.JavaConversions._
import htsjdk.variant.variantcontext.{ VariantContextBuilder, VariantContext }
7
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
Sander Bollen's avatar
Sander Bollen committed
8
9
10
import htsjdk.variant.vcf._
import nl.lumc.sasc.biopet.core.ToolCommand

Sander Bollen's avatar
Sander Bollen committed
11
12
import scala.collection.immutable

Sander Bollen's avatar
Sander Bollen committed
13
14
15
/**
 * Created by ahbbollen on 11-2-15.
 */
16
object VcfWithVcf extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
17
  case class Fields(inputField: String, outputField: String, t: String = "Array")
18
19
20

  case class Args(inputFile: File = null,
                  outputFile: File = null,
21
                  secondaryVcf: File = null,
22
23
                  fields: List[Fields] = Nil,
                  matchAllele: Boolean = true) extends AbstractArgs
Sander Bollen's avatar
Sander Bollen committed
24
25
26
27
28
29
30
31

  class OptParser extends AbstractOptParser {
    opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputFile = x)
    }
    opt[File]('O', "outputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputFile = x)
    }
32
33
    opt[File]('S', "SecondaryVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(secondaryVcf = x)
Sander Bollen's avatar
Sander Bollen committed
34
    }
35
36
    opt[String]('f', "field") unbounded () valueName ("<input_field:output_field>") action { (x, c) =>
      val values = x.split(":")
Peter van 't Hof's avatar
Peter van 't Hof committed
37
38
      if (values.size > 2) c.copy(fields = Fields(values(0), values(1), values(2)) :: c.fields)
      else if (values.size > 1) c.copy(fields = Fields(values(0), values(1)) :: c.fields)
39
      else c.copy(fields = Fields(x, x) :: c.fields)
Sander Bollen's avatar
Sander Bollen committed
40
    }
41
    opt[Boolean]("match") valueName ("<Boolean>") maxOccurs (1) action { (x, c) =>
42
      c.copy(matchAllele = x)
43
    } text ("Match alternative alleles; default true")
Sander Bollen's avatar
Sander Bollen committed
44
45
46
47
48
49
50
  }

  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    val reader = new VCFFileReader(commandArgs.inputFile)
51
    val secondaryReader = new VCFFileReader(commandArgs.secondaryVcf)
Sander Bollen's avatar
Sander Bollen committed
52
53

    val header = reader.getFileHeader
54
    val secondHeader = secondaryReader.getFileHeader
Sander Bollen's avatar
Sander Bollen committed
55
56
57
58
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
      setOutputFile(commandArgs.outputFile).
      setReferenceDictionary(header.getSequenceDictionary).
      build)
Sander Bollen's avatar
Sander Bollen committed
59
60

    for (x <- commandArgs.fields) {
61
62
63
64
65
      if (header.hasInfoLine(x.outputField))
        throw new IllegalArgumentException("Field '" + x.outputField + "' already exist in input vcf")
      if (!secondHeader.hasInfoLine(x.inputField))
        throw new IllegalArgumentException("Field '" + x.inputField + "' does not exist in secondary vcf")

66
67
68
69
70
      val oldHeaderLine = secondHeader.getInfoHeaderLine(x.inputField)

      val newHeaderLine = new VCFInfoHeaderLine(x.outputField, VCFHeaderLineCount.UNBOUNDED,
        oldHeaderLine.getType, oldHeaderLine.getDescription)
      header.addMetaDataLine(newHeaderLine)
Sander Bollen's avatar
Sander Bollen committed
71
72
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
73
74
    writer.writeHeader(header)

Sander Bollen's avatar
Sander Bollen committed
75
76
    var idx = 0

77
    for (record <- reader) {
78
      val secondaryRecords = if (commandArgs.matchAllele) {
79
80
81
82
83
84
        secondaryReader.query(record.getChr, record.getStart, record.getEnd).toList.
          filter(x => record.getAlternateAlleles.exists(x.hasAlternateAllele(_)))
      } else {
        secondaryReader.query(record.getChr, record.getStart, record.getEnd).toList
      }

85
86
87
88
89
90
91
92
      val fieldMap = (for (
        f <- commandArgs.fields;
        if secondaryRecords.exists(_.hasAttribute(f.inputField))
      ) yield {
        f.outputField -> (for (
          secondRecord <- secondaryRecords;
          if (secondRecord.hasAttribute(f.inputField))
        ) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
            val values = {
              secondRecord.getAttribute(f.inputField) match {
                case l: List[_] => l
                case x          => List(x)
              }
            }

            values match {
            case l: List[_] if f.t == "max" => {
              header.getInfoHeaderLine(f.outputField).getType match {
                case VCFHeaderLineType.Integer => List(l.map(_.toString.toInt).max)
                case VCFHeaderLineType.Float => List(l.map(_.toString.toFloat).max)
                case _ => throw new IllegalArgumentException("Type not fit for max function")
              }
            }
108
109
            case l: List[_] => l
            case x          => List(x)
Sander Bollen's avatar
Sander Bollen committed
110
          }
111
112
113
114
115
116
        }).fold(Nil)(_ ::: _)
      }).toMap

      writer.add(fieldMap.foldLeft(new VariantContextBuilder(record))((builder, attribute) => {
        builder.attribute(attribute._1, attribute._2.toArray)
      }).make())
117

Sander Bollen's avatar
Sander Bollen committed
118
      idx += 1
119
120
121
      if (idx % 100000 == 0) {
        logger.info(s"""Processed $idx records""")
      }
Sander Bollen's avatar
Sander Bollen committed
122
    }
123
    logger.info(s"""Processed $idx records""")
Sander Bollen's avatar
Sander Bollen committed
124

Sander Bollen's avatar
Sander Bollen committed
125
    logger.debug("Closing readers")
Sander Bollen's avatar
Sander Bollen committed
126
127
128
    writer.close()
    reader.close()
    secondaryReader.close()
Sander Bollen's avatar
Sander Bollen committed
129
    logger.info("Done. Goodbye")
Sander Bollen's avatar
Sander Bollen committed
130
131
132
  }

}