VcfWithVcf.scala 4.15 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
package nl.lumc.sasc.biopet.tools

import java.io.File

import scala.collection.JavaConversions._
import htsjdk.variant.variantcontext.{ VariantContextBuilder, VariantContext }
7
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
Sander Bollen's avatar
Sander Bollen committed
8
9
10
import htsjdk.variant.vcf._
import nl.lumc.sasc.biopet.core.ToolCommand

Sander Bollen's avatar
Sander Bollen committed
11
12
import scala.collection.immutable

Sander Bollen's avatar
Sander Bollen committed
13
14
15
/**
 * Created by ahbbollen on 11-2-15.
 */
16
object VcfWithVcf extends ToolCommand {
17
18
19
20
  case class Fields(inputField: String, outputField: String)

  case class Args(inputFile: File = null,
                  outputFile: File = null,
21
                  secondaryVcf: File = null,
22
23
                  fields: List[Fields] = Nil,
                  matchAllele: Boolean = true) extends AbstractArgs
Sander Bollen's avatar
Sander Bollen committed
24
25
26
27
28
29
30
31

  class OptParser extends AbstractOptParser {
    opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputFile = x)
    }
    opt[File]('O', "outputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputFile = x)
    }
32
33
    opt[File]('S', "SecondaryVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(secondaryVcf = x)
Sander Bollen's avatar
Sander Bollen committed
34
    }
35
36
37
38
    opt[String]('f', "field") unbounded () valueName ("<input_field:output_field>") action { (x, c) =>
      val values = x.split(":")
      if (values.size > 1) c.copy(fields = Fields(values(0), values(1)) :: c.fields)
      else c.copy(fields = Fields(x, x) :: c.fields)
Sander Bollen's avatar
Sander Bollen committed
39
    }
40
    opt[Boolean]("match") valueName ("<Boolean>") maxOccurs (1) action { (x, c) =>
41
      c.copy(matchAllele = x)
42
    } text ("Match alternative alleles; default true")
Sander Bollen's avatar
Sander Bollen committed
43
44
45
46
47
48
49
  }

  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    val reader = new VCFFileReader(commandArgs.inputFile)
50
    val secondaryReader = new VCFFileReader(commandArgs.secondaryVcf)
Sander Bollen's avatar
Sander Bollen committed
51
52

    val header = reader.getFileHeader
53
    val secondHeader = secondaryReader.getFileHeader
Sander Bollen's avatar
Sander Bollen committed
54
55
56
57
58
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
      setOutputFile(commandArgs.outputFile).
      setReferenceDictionary(header.getSequenceDictionary).
      build)
    writer.writeHeader(header)
Sander Bollen's avatar
Sander Bollen committed
59
60

    for (x <- commandArgs.fields) {
61
62
63
64
65
      if (header.hasInfoLine(x.outputField))
        throw new IllegalArgumentException("Field '" + x.outputField + "' already exist in input vcf")
      if (!secondHeader.hasInfoLine(x.inputField))
        throw new IllegalArgumentException("Field '" + x.inputField + "' does not exist in secondary vcf")

66
67
68
69
70
      val oldHeaderLine = secondHeader.getInfoHeaderLine(x.inputField)

      val newHeaderLine = new VCFInfoHeaderLine(x.outputField, VCFHeaderLineCount.UNBOUNDED,
        oldHeaderLine.getType, oldHeaderLine.getDescription)
      header.addMetaDataLine(newHeaderLine)
Sander Bollen's avatar
Sander Bollen committed
71
72
    }

Sander Bollen's avatar
Sander Bollen committed
73
74
    var idx = 0

75
    for (record <- reader) {
76
      val secondaryRecords = if (commandArgs.matchAllele) {
77
78
79
80
81
82
        secondaryReader.query(record.getChr, record.getStart, record.getEnd).toList.
          filter(x => record.getAlternateAlleles.exists(x.hasAlternateAllele(_)))
      } else {
        secondaryReader.query(record.getChr, record.getStart, record.getEnd).toList
      }

83
84
85
86
87
88
89
90
91
92
93
      val fieldMap = (for (
        f <- commandArgs.fields;
        if secondaryRecords.exists(_.hasAttribute(f.inputField))
      ) yield {
        f.outputField -> (for (
          secondRecord <- secondaryRecords;
          if (secondRecord.hasAttribute(f.inputField))
        ) yield {
          secondRecord.getAttribute(f.inputField) match {
            case l: List[_] => l
            case x          => List(x)
Sander Bollen's avatar
Sander Bollen committed
94
          }
95
96
97
98
99
100
        }).fold(Nil)(_ ::: _)
      }).toMap

      writer.add(fieldMap.foldLeft(new VariantContextBuilder(record))((builder, attribute) => {
        builder.attribute(attribute._1, attribute._2.toArray)
      }).make())
101

Sander Bollen's avatar
Sander Bollen committed
102
      idx += 1
103
104
105
      if (idx % 100000 == 0) {
        logger.info(s"""Processed $idx records""")
      }
Sander Bollen's avatar
Sander Bollen committed
106
    }
107
    logger.info(s"""Processed $idx records""")
Sander Bollen's avatar
Sander Bollen committed
108

Sander Bollen's avatar
Sander Bollen committed
109
    logger.debug("Closing readers")
Sander Bollen's avatar
Sander Bollen committed
110
111
112
    writer.close()
    reader.close()
    secondaryReader.close()
Sander Bollen's avatar
Sander Bollen committed
113
    logger.info("Done. Goodbye")
Sander Bollen's avatar
Sander Bollen committed
114
115
116
  }

}