CleverFixVCF.scala 7.38 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
15
16
17
18
19
20
package nl.lumc.sasc.biopet.extensions.clever

/**
 * Created by wyleung on 4-4-16.
 */

Wai Yi Leung's avatar
Style    
Wai Yi Leung committed
21
import java.io.{ File, PrintWriter }
22
23

import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
24
import nl.lumc.sasc.biopet.utils.ToolCommand
25
import nl.lumc.sasc.biopet.utils.config.Configurable
Wai Yi Leung's avatar
Style    
Wai Yi Leung committed
26
import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output }
27
28
29

import scala.io.Source

Peter van 't Hof's avatar
Peter van 't Hof committed
30
class CleverFixVCF(val parent: Configurable) extends BiopetJavaCommandLineFunction {
31
32
33
34
35
36
37
38
39
40
  javaMainClass = getClass.getName
  @Input(doc = "Input Clever VCF")
  var input: File = _

  @Output(doc = "Output fixed VCF")
  var output: File = _

  @Argument(doc = "Samplename")
  var sampleName: String = _

Ruben Vorderman's avatar
Ruben Vorderman committed
41
42
  override def defaultCoreMemory = 4.0

43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
  override def cmdLine = super.cmdLine +
    required("-i", input) +
    required("-o", output) +
    required("-s", sampleName)
}

object CleverFixVCF extends ToolCommand {
  case class Args(inputVCF: File = null, sampleLabel: String = "",
                  outputVCF: File = null) extends AbstractArgs

  class OptParser extends AbstractOptParser {
    opt[File]('i', "inputvcf") required () valueName "<vcffile/path>" action { (x, c) =>
      c.copy(inputVCF = x)
    } text "Please specify the input Clever VCF file"
    opt[String]('s', "samplelabel") valueName "<sample label>" action { (x, c) =>
      c.copy(sampleLabel = x)
    } text "Sample label is missing"
    opt[File]('o', "outputvcf") valueName "<output>" action { (x, c) =>
      c.copy(outputVCF = x)
    } text "Output path is missing"
  }

65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
  def replaceHeaderLine(inHeaderLine: String, toCheckFor: String, replacement: String, extraHeader: String): String = {
    (inHeaderLine == toCheckFor) match {
      case true => {
        extraHeader + "\n" + replacement + "\n"
      }
      case _ => {
        // We have to deal with matching records
        // these don't start with #

        inHeaderLine.startsWith("#") match {
          case true =>
            inHeaderLine + "\n"
          case _ => {
            // this should be a record
            // Ensure the REF field is at least an N
            val cols = inHeaderLine.split("\t")
            cols(3) = "N"
            cols.mkString("\t") + "\n"
          }
        }
      }
    }
  }
88

89
  val extraHeader = """##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
##INFO=<ID=NOVEL,Number=0,Type=Flag,Description="Indicates a novel structural variation">
##INFO=<ID=SVEND,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
##INFO=<ID=HOMLEN,Number=.,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints">
##INFO=<ID=HOMSEQ,Number=.,Type=String,Description="Sequence of base pair identical micro-homology at event breakpoints">
##INFO=<ID=BKPTID,Number=.,Type=String,Description="ID of the assembled alternate allele in the assembly file">
##INFO=<ID=MEINFO,Number=4,Type=String,Description="Mobile element info of the form NAME,START,END,POLARITY">
##INFO=<ID=METRANS,Number=4,Type=String,Description="Mobile element transduction info of the form CHR,START,END,POLARITY">
##INFO=<ID=DGVID,Number=1,Type=String,Description="ID of this element in Database of Genomic Variation">
##INFO=<ID=DBVARID,Number=1,Type=String,Description="ID of this element in DBVAR">
##INFO=<ID=DBRIPID,Number=1,Type=String,Description="ID of this element in DBRIP">
##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">
##INFO=<ID=PARID,Number=1,Type=String,Description="ID of partner breakend">
##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend">
111
##INFO=<ID=BPWINDOW,Number=2,Type=Integer,Description="Window of breakpoints">
112
113
114
115
116
##INFO=<ID=CILEN,Number=2,Type=Integer,Description="Confidence interval around the inserted material between breakends">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Read Depth of segment containing breakend">
##INFO=<ID=DPADJ,Number=.,Type=Integer,Description="Read Depth of adjacency">
##INFO=<ID=CN,Number=1,Type=Integer,Description="Copy number of segment containing breakend">
##INFO=<ID=CNADJ,Number=.,Type=Integer,Description="Copy number of adjacency">
Moustakas's avatar
Moustakas committed
117
##INFO=<ID=ESUPPORT,Number=1,Type=Float,Description="Support of event, see into clever python script for more: scripts/postprocess-predictions">
118
119
120
121
122
123
124
125
126
127
128
129
##INFO=<ID=CICN,Number=2,Type=Integer,Description="Confidence interval around copy number for the segment">
##INFO=<ID=CICNADJ,Number=.,Type=Integer,Description="Confidence interval around copy number for the adjacency">
##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">
##FORMAT=<ID=CNQ,Number=1,Type=Float,Description="Copy number genotype quality for imprecise events">
##FORMAT=<ID=CNL,Number=.,Type=Float,Description="Copy number genotype likelihood for imprecise events">
##FORMAT=<ID=NQ,Number=1,Type=Integer,Description="Phred style probability score that the variant is novel">
##FORMAT=<ID=HAP,Number=1,Type=Integer,Description="Unique haplotype identifier">
##FORMAT=<ID=AHAP,Number=1,Type=Integer,Description="Unique identifier of ancestral haplotype">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">"""

130
131
132
133
134
135
136
137
  val vcfColHeader = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tdefault"

  val vcfColReplacementHeader = s"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
138
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
139
140
141
142

    val input: File = commandArgs.inputVCF
    val output: File = commandArgs.outputVCF

143
144
    val inputVCF = Source.fromFile(input)
    val writer = new PrintWriter(output)
Wai Yi Leung's avatar
Style    
Wai Yi Leung committed
145
146
147
    inputVCF.getLines().foreach(x =>
      writer.write(replaceHeaderLine(x, vcfColHeader, vcfColReplacementHeader + commandArgs.sampleLabel, extraHeader))
    )
148
149
150
151
152
    writer.close()
    inputVCF.close()
  }
}