AnnotateVcfWithBed.scala 4.37 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
package nl.lumc.sasc.biopet.tools

import java.io.File

import htsjdk.tribble.AbstractFeatureReader.getFeatureReader
import htsjdk.tribble.bed.BEDCodec
import htsjdk.variant.variantcontext.VariantContextBuilder
import htsjdk.variant.variantcontext.writer.{ VariantContextWriterBuilder, AsyncVariantContextWriter }
import htsjdk.variant.vcf.{ VCFHeaderLineType, VCFHeaderLineCount, VCFInfoHeaderLine, VCFFileReader }
import nl.lumc.sasc.biopet.core.ToolCommand
import scala.collection.JavaConverters.asScalaIteratorConverter
import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.io.Source

/**
 * Created by pjvan_thof on 1/10/15.
 */
class AnnotateVcfWithBed {
  // TODO: Queue wrapper
}

object AnnotateVcfWithBed extends ToolCommand {

  case class Args(inputFile: File = null,
                  bedFile: File = null,
                  outputFile: File = null,
                  fieldName: String = null,
                  fieldDescription: String = "",
                  fieldType: String = "String") extends AbstractArgs

  class OptParser extends AbstractOptParser {
    opt[File]('I', "inputFile") required () unbounded () valueName ("<vcf file>") action { (x, c) =>
      c.copy(inputFile = x)
    } text ("out is a required file property")
    opt[File]('B', "bedFile") required () unbounded () valueName ("<bed file>") action { (x, c) =>
      c.copy(bedFile = x)
    } text ("out is a required file property")
    opt[File]('o', "output") required () unbounded () valueName ("<vcf file>") action { (x, c) =>
      c.copy(outputFile = x)
    } text ("out is a required file property")
    opt[String]('f', "fieldName") required () unbounded () valueName ("<name of field in vcf file>") action { (x, c) =>
      c.copy(fieldName = x)
    } text ("Name of info field in new vcf file")
    opt[String]('d', "fieldDescription") unbounded () valueName ("<name of field in vcf file>") action { (x, c) =>
      c.copy(fieldDescription = x)
    } text ("Description of field in new vcf file")
    opt[String]('t', "fieldType") unbounded () valueName ("<name of field in vcf file>") action { (x, c) =>
      c.copy(fieldType = x)
    } text ("Description of field in new vcf file")
  }

  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    val bedRecords: mutable.Map[String, List[(Int, Int, String)]] = mutable.Map()
    // Read bed file
    /*
    // function bedRecord.getName will not compile, not clear why
    for (bedRecord <- asScalaIteratorConverter(getFeatureReader(commandArgs.bedFile.toPath.toString, new BEDCodec(), false).iterator()).asScala) {
      logger.debug(bedRecord)
      bedRecords(bedRecord.getChr) = (bedRecord.getStart, bedRecord.getEnd, bedRecord.getName) :: bedRecords.getOrElse(bedRecord.getChr, Nil)
    }
    */
    for (line <- Source.fromFile(commandArgs.bedFile).getLines()) {
      val values = line.split("\t")
      if (values.size >= 4)
        bedRecords(values(0)) = (values(1).toInt, values(2).toInt, values(3)) :: bedRecords.getOrElse(values(0), Nil)
    }

    // Sort records when needed
    for ((chr, record) <- bedRecords) {
Peter van 't Hof's avatar
Peter van 't Hof committed
74
      bedRecords(chr) = record.sortBy(x => (x._1, x._2))
Peter van 't Hof's avatar
Peter van 't Hof committed
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
    }

    val reader = new VCFFileReader(commandArgs.inputFile, false)
    val header = reader.getFileHeader

    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().setOutputFile(commandArgs.outputFile).build)
    val fieldType = commandArgs.fieldType match {
      case "Integer"   => VCFHeaderLineType.Integer
      case "Flag"      => VCFHeaderLineType.Flag
      case "Character" => VCFHeaderLineType.Character
      case "Float"     => VCFHeaderLineType.Float
      case _           => VCFHeaderLineType.String
    }
    header.addMetaDataLine(new VCFInfoHeaderLine(commandArgs.fieldName,
      VCFHeaderLineCount.UNBOUNDED, fieldType, commandArgs.fieldDescription))
    writer.writeHeader(header)

    for (record <- reader) {
      val overlaps = bedRecords.getOrElse(record.getChr, Nil).filter(x => {
        record.getStart <= x._2 && record.getEnd >= x._1
      })
      if (overlaps.isEmpty) {
        writer.add(record)
      } else {
        val builder = new VariantContextBuilder(record)
        builder.attribute(commandArgs.fieldName, overlaps.map(_._3).mkString(","))
        writer.add(builder.make)
      }
    }
    reader.close
    writer.close
  }
}