AnnotateVcfWithBed.scala 6.2 KB
Newer Older
bow's avatar
bow committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
17
18
19
20
21
22
23
24
package nl.lumc.sasc.biopet.tools

import java.io.File

import htsjdk.tribble.AbstractFeatureReader.getFeatureReader
import htsjdk.tribble.bed.BEDCodec
import htsjdk.variant.variantcontext.VariantContextBuilder
import htsjdk.variant.variantcontext.writer.{ VariantContextWriterBuilder, AsyncVariantContextWriter }
import htsjdk.variant.vcf.{ VCFHeaderLineType, VCFHeaderLineCount, VCFInfoHeaderLine, VCFFileReader }
25
import nl.lumc.sasc.biopet.core.{ ToolCommandFuntion, ToolCommand }
Peter van 't Hof's avatar
Peter van 't Hof committed
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import scala.collection.JavaConverters.asScalaIteratorConverter
import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.io.Source

/**
 * Created by pjvan_thof on 1/10/15.
 */
class AnnotateVcfWithBed {
  // TODO: Queue wrapper
}

object AnnotateVcfWithBed extends ToolCommand {

Peter van 't Hof's avatar
Peter van 't Hof committed
40
41
42
43
44
45
46
47
48
  /**
   * Args for the commandline tool
   * @param inputFile input vcf file
   * @param bedFile bed file to annotate to vcf file
   * @param outputFile output vcf file
   * @param fieldName Info field that should be used
   * @param fieldDescription Description at field if needed
   * @param fieldType Type of filed, can be: "Integer", "Flag", "Character", "Float"
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
49
50
51
52
53
54
55
56
57
58
  case class Args(inputFile: File = null,
                  bedFile: File = null,
                  outputFile: File = null,
                  fieldName: String = null,
                  fieldDescription: String = "",
                  fieldType: String = "String") extends AbstractArgs

  class OptParser extends AbstractOptParser {
    opt[File]('I', "inputFile") required () unbounded () valueName ("<vcf file>") action { (x, c) =>
      c.copy(inputFile = x)
59
    } text ("Input is a required file property")
Peter van 't Hof's avatar
Peter van 't Hof committed
60
61
    opt[File]('B', "bedFile") required () unbounded () valueName ("<bed file>") action { (x, c) =>
      c.copy(bedFile = x)
62
    } text ("Bedfile is a required file property")
Peter van 't Hof's avatar
Peter van 't Hof committed
63
64
65
66
67
68
69
70
71
72
73
74
75
76
    opt[File]('o', "output") required () unbounded () valueName ("<vcf file>") action { (x, c) =>
      c.copy(outputFile = x)
    } text ("out is a required file property")
    opt[String]('f', "fieldName") required () unbounded () valueName ("<name of field in vcf file>") action { (x, c) =>
      c.copy(fieldName = x)
    } text ("Name of info field in new vcf file")
    opt[String]('d', "fieldDescription") unbounded () valueName ("<name of field in vcf file>") action { (x, c) =>
      c.copy(fieldDescription = x)
    } text ("Description of field in new vcf file")
    opt[String]('t', "fieldType") unbounded () valueName ("<name of field in vcf file>") action { (x, c) =>
      c.copy(fieldType = x)
    } text ("Description of field in new vcf file")
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
77
78
79
80
81
  /**
   * Program will Annotate a vcf file with the overlapping regions of a bed file, 4e column of the bed file we in a info tag in the vcf file
   *
   * @param args
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
82
  def main(args: Array[String]): Unit = {
83
84
85

    logger.info("Start")

Peter van 't Hof's avatar
Peter van 't Hof committed
86
87
88
89
90
91
92
93
94
95
96
97
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    val bedRecords: mutable.Map[String, List[(Int, Int, String)]] = mutable.Map()
    // Read bed file
    /*
    // function bedRecord.getName will not compile, not clear why
    for (bedRecord <- asScalaIteratorConverter(getFeatureReader(commandArgs.bedFile.toPath.toString, new BEDCodec(), false).iterator()).asScala) {
      logger.debug(bedRecord)
      bedRecords(bedRecord.getChr) = (bedRecord.getStart, bedRecord.getEnd, bedRecord.getName) :: bedRecords.getOrElse(bedRecord.getChr, Nil)
    }
    */
98
99
100
101
102
103
104
105
106
107
108

    val fieldType = commandArgs.fieldType match {
      case "Integer"   => VCFHeaderLineType.Integer
      case "Flag"      => VCFHeaderLineType.Flag
      case "Character" => VCFHeaderLineType.Character
      case "Float"     => VCFHeaderLineType.Float
      case _           => VCFHeaderLineType.String
    }

    logger.info("Reading bed file")

Peter van 't Hof's avatar
Peter van 't Hof committed
109
110
111
112
    for (line <- Source.fromFile(commandArgs.bedFile).getLines()) {
      val values = line.split("\t")
      if (values.size >= 4)
        bedRecords(values(0)) = (values(1).toInt, values(2).toInt, values(3)) :: bedRecords.getOrElse(values(0), Nil)
113
114
      else (values.size >= 3 && fieldType == VCFHeaderLineType.Flag)
      bedRecords(values(0)) = (values(1).toInt, values(2).toInt, "") :: bedRecords.getOrElse(values(0), Nil)
Peter van 't Hof's avatar
Peter van 't Hof committed
115
116
    }

117
118
    logger.info("Sorting bed records")

Peter van 't Hof's avatar
Peter van 't Hof committed
119
120
    // Sort records when needed
    for ((chr, record) <- bedRecords) {
Peter van 't Hof's avatar
Peter van 't Hof committed
121
      bedRecords(chr) = record.sortBy(x => (x._1, x._2))
Peter van 't Hof's avatar
Peter van 't Hof committed
122
123
    }

124
125
    logger.info("Starting output file")

Peter van 't Hof's avatar
Peter van 't Hof committed
126
127
128
    val reader = new VCFFileReader(commandArgs.inputFile, false)
    val header = reader.getFileHeader

Sander Bollen's avatar
Sander Bollen committed
129
130
131
132
133
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
      setOutputFile(commandArgs.outputFile).
      setReferenceDictionary(header.getSequenceDictionary).
      build)

Peter van 't Hof's avatar
Peter van 't Hof committed
134
135
136
137
    header.addMetaDataLine(new VCFInfoHeaderLine(commandArgs.fieldName,
      VCFHeaderLineCount.UNBOUNDED, fieldType, commandArgs.fieldDescription))
    writer.writeHeader(header)

138
139
    logger.info("Start reading vcf records")

Peter van 't Hof's avatar
Peter van 't Hof committed
140
141
142
143
144
145
146
147
    for (record <- reader) {
      val overlaps = bedRecords.getOrElse(record.getChr, Nil).filter(x => {
        record.getStart <= x._2 && record.getEnd >= x._1
      })
      if (overlaps.isEmpty) {
        writer.add(record)
      } else {
        val builder = new VariantContextBuilder(record)
148
149
        if (fieldType == VCFHeaderLineType.Flag) builder.attribute(commandArgs.fieldName, true)
        else builder.attribute(commandArgs.fieldName, overlaps.map(_._3).mkString(","))
Peter van 't Hof's avatar
Peter van 't Hof committed
150
151
152
153
154
        writer.add(builder.make)
      }
    }
    reader.close
    writer.close
155
156

    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
157
158
  }
}