Commit 246f4031 authored by Peter van 't Hof

Merge branch 'develop' into feature-summary

parents 6fccbe6c ce4d4d49
@@ -146,4 +146,4 @@ object BiopetExecutable {
val stream = getClass.getClassLoader.getResourceAsStream("nl/lumc/sasc/biopet/License.txt")
Source.fromInputStream(stream).getLines().mkString("\n", "\n", "\n")
}
}
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
/**
* Created by ahbbollen on 15-1-15.
*/
class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFunction {
executable = config("exe", submodule = "perl", default = "perl")
var vep_script: String = config("vep_script")
@Input(doc = "input VCF", required = true)
var input: File = null
@Output(doc = "output file", required = true)
var output: File = null
override val versionRegex = """version (\d*)""".r
override def versionCommand = executable + " " + vep_script + " --help"
//Boolean vars
var v: Boolean = config("v", default = true)
var q: Boolean = config("q", default = false)
var offline: Boolean = config("offline", default = false)
var no_progress: Boolean = config("no_progress", default = false)
var everything: Boolean = config("everything", default = true)
var force: Boolean = config("force", default = false)
var no_stats: Boolean = config("no_stats", default = false)
var stats_text: Boolean = config("stats_text", default = false)
var html: Boolean = config("html", default = false)
var cache: Boolean = config("cache", default = false)
var humdiv: Boolean = config("humdiv", default = false)
var regulatory: Boolean = config("regulatory", default = false)
var cell_type: Boolean = config("cell_type", default = false)
var phased: Boolean = config("phased", default = false)
var allele_number: Boolean = config("allele_number", default = false)
var numbers: Boolean = config("numbers", default = false)
var domains: Boolean = config("domains", default = false)
var no_escape: Boolean = config("no_escape", default = false)
var hgvs: Boolean = config("hgvs", default = false)
var protein: Boolean = config("protein", default = false)
var symbol: Boolean = config("symbol", default = false)
var ccds: Boolean = config("ccds", default = false)
var uniprot: Boolean = config("uniprot", default = false)
var tsl: Boolean = config("tsl", default = false)
var canonical: Boolean = config("canonical", default = false)
var biotype: Boolean = config("biotype", default = false)
var xref_refseq: Boolean = config("xref_refseq", default = false)
var check_existing: Boolean = config("check_existing", default = false)
var check_alleles: Boolean = config("check_alleles", default = false)
var check_svs: Boolean = config("check_svs", default = false)
var gmaf: Boolean = config("gmaf", default = false)
var maf_1kg: Boolean = config("maf_1kg", default = false)
var maf_esp: Boolean = config("maf_esp", default = false)
var old_maf: Boolean = config("old_maf", default = false)
var pubmed: Boolean = config("pubmed", default = false)
var failed: Boolean = config("failed", default = false)
var vcf: Boolean = config("vcf", default = true)
var json: Boolean = config("json", default = false)
var gvf: Boolean = config("gvf", default = false)
var check_ref: Boolean = config("check_ref", default = false)
var coding_only: Boolean = config("coding_only", default = false)
var no_intergenic: Boolean = config("no_intergenic", default = false)
var pick: Boolean = config("pick", default = false)
var pick_allele: Boolean = config("pick_allele", default = false)
var flag_pick: Boolean = config("flag_pick", default = false)
var flag_pick_allele: Boolean = config("flag_pick_allele", default = false)
var per_gene: Boolean = config("per_gene", default = false)
var most_severe: Boolean = config("most_severe", default = false)
var summary: Boolean = config("summary", default = false)
var filter_common: Boolean = config("filter_common", default = false)
var check_frequency: Boolean = config("check_frequency", default = false)
var allow_non_variant: Boolean = config("allow_non_variant", default = false)
var database: Boolean = config("database", default = false)
var genomes: Boolean = config("genomes", default = false)
var gencode_basic: Boolean = config("gencode_basic", default = false)
var refseq: Boolean = config("refseq", default = false)
var merged: Boolean = config("merged", default = false)
var all_refseq: Boolean = config("all_refseq", default = false)
var lrg: Boolean = config("lrg", default = false)
var no_whole_genome: Boolean = config("no_whole_genome", default = false)
var skip_db_check: Boolean = config("skip_db_check", default = false)
// Textual args
var vep_config: Option[String] = config("config")
var species: Option[String] = config("species")
var assembly: Option[String] = config("assembly")
var format: Option[String] = config("format")
var dir: Option[String] = config("dir")
var dir_cache: Option[String] = config("dir_cache")
var dir_plugins: Option[String] = config("dir_plugins")
var fasta: Option[String] = config("fasta")
var sift: Option[String] = config("sift")
var polyphen: Option[String] = config("polyphen")
var custom: Option[String] = config("custom")
var plugin: List[String] = config("plugin", default = Nil)
var individual: Option[String] = config("individual")
var fields: Option[String] = config("fields")
var convert: Option[String] = config("convert")
var terms: Option[String] = config("terms")
var chr: Option[String] = config("chr")
var pick_order: Option[String] = config("pick_order")
var freq_pop: Option[String] = config("freq_pop")
var freq_gt_lt: Option[String] = config("freq_gt_lt")
var freq_filter: Option[String] = config("freq_filter")
var filter: Option[String] = config("filter")
var host: Option[String] = config("host")
var user: Option[String] = config("user")
var password: Option[String] = config("password")
var registry: Option[String] = config("registry")
var build: Option[String] = config("build")
var compress: Option[String] = config("compress")
var cache_region_size: Option[String] = config("cache_region_size")
// Numeric args
var fork: Option[Int] = config("fork")
var cache_version: Option[Int] = config("cache_version")
var freq_freq: Option[Float] = config("freq_freq")
var port: Option[Int] = config("port")
var db_version: Option[Int] = config("db_version")
var buffer_size: Option[Int] = config("buffer_size")
override def beforeGraph: Unit = {
super.beforeGraph
if (!cache && !database) {
throw new IllegalArgumentException("Must supply either cache or database")
} else if (cache && dir.isEmpty) {
throw new IllegalArgumentException("Must supply dir to cache")
}
}
def cmdLine = required(executable) +
required(vep_script) +
required("-i", input) +
required("-o", output) +
conditional(v, "-v") +
conditional(q, "-q") +
conditional(offline, "--offline") +
conditional(no_progress, "--no_progress") +
conditional(everything, "--everything") +
conditional(force, "--force_overwrite") +
conditional(no_stats, "--no_stats") +
conditional(stats_text, "--stats_text") +
conditional(html, "--html") +
conditional(cache, "--cache") +
conditional(humdiv, "--humdiv") +
conditional(regulatory, "--regulatory") +
conditional(cell_type, "--cell_type") +
conditional(phased, "--phased") +
conditional(allele_number, "--allele_number") +
conditional(numbers, "--numbers") +
conditional(domains, "--domains") +
conditional(no_escape, "--no_escape") +
conditional(hgvs, "--hgvs") +
conditional(protein, "--protein") +
conditional(symbol, "--symbol") +
conditional(ccds, "--ccds") +
conditional(uniprot, "--uniprot") +
conditional(tsl, "--tsl") +
conditional(canonical, "--canonical") +
conditional(biotype, "--biotype") +
conditional(xref_refseq, "--xref_refseq") +
conditional(check_existing, "--check_existing") +
conditional(check_alleles, "--check_alleles") +
conditional(check_svs, "--check_svs") +
conditional(gmaf, "--gmaf") +
conditional(maf_1kg, "--maf_1kg") +
conditional(maf_esp, "--maf_esp") +
conditional(old_maf, "--old_maf") +
conditional(pubmed, "--pubmed") +
conditional(failed, "--failed") +
conditional(vcf, "--vcf") +
conditional(json, "--json") +
conditional(gvf, "--gvf") +
conditional(check_ref, "--check_ref") +
conditional(coding_only, "--coding_only") +
conditional(no_intergenic, "--no_intergenic") +
conditional(pick, "--pick") +
conditional(pick_allele, "--pick_allele") +
conditional(flag_pick, "--flag_pick") +
conditional(flag_pick_allele, "--flag_pick_allele") +
conditional(per_gene, "--per_gene") +
conditional(most_severe, "--most_severe") +
conditional(summary, "--summary") +
conditional(filter_common, "--filter_common") +
conditional(check_frequency, "--check_frequency") +
conditional(allow_non_variant, "--allow_non_variant") +
conditional(database, "--database") +
conditional(genomes, "--genomes") +
conditional(gencode_basic, "--gencode_basic") +
conditional(refseq, "--refseq") +
conditional(merged, "--merged") +
conditional(all_refseq, "--all_refseq") +
conditional(lrg, "--lrg") +
conditional(no_whole_genome, "--no_whole_genome") +
conditional(skip_db_check, "--skip_db_check") +
optional("--config", vep_config) +
optional("--species", species) +
optional("--assembly", assembly) +
optional("--format", format) +
optional("--dir", dir) +
optional("--dir_cache", dir_cache) +
optional("--dir_plugins", dir_plugins) +
optional("--fasta", fasta) +
optional("--sift", sift) +
optional("--polyphen", polyphen) +
optional("--custom", custom) +
repeat("--plugin", plugin) +
optional("--individual", individual) +
optional("--fields", fields) +
optional("--convert", convert) +
optional("--terms", terms) +
optional("--chr", chr) +
optional("--pick_order", pick_order) +
optional("--freq_pop", freq_pop) +
optional("--freq_gt_lt", freq_gt_lt) +
optional("--freq_filter", freq_filter) +
optional("--filter", filter) +
optional("--host", host) +
optional("--user", user) +
optional("--password", password) +
optional("--registry", registry) +
optional("--build", build) +
optional("--compress", compress) +
optional("--cache_region_size", cache_region_size) +
optional("--fork", fork) +
optional("--cache_version", cache_version) +
optional("--freq_freq", freq_freq) +
optional("--port", port) +
optional("--db_version", db_version) +
optional("--buffer_size", buffer_size)
}
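A minimal usage sketch for this wrapper, assuming a typical Biopet/Queue QScript; the pipeline reference, file paths and the add(...) call are illustrative and not part of this commit. The wrapper resolves the Perl executable, vep_script and all flag/option values from the Biopet config, so a pipeline only needs to set the input and output files:
// Hypothetical QScript fragment (sketch only, assumptions noted above)
val vep = new VariantEffectPredictor(this) // `this` is assumed to be a Configurable, e.g. the pipeline class
vep.input = new File("sample.vcf") // illustrative input VCF
vep.output = new File("sample.vep.vcf") // illustrative output path
add(vep) // assumed Queue-style scheduling of the job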
package nl.lumc.sasc.biopet.tools
import java.io.{ File, IOException }
import scala.collection.JavaConversions._
import scala.collection.mutable.{ Map => MMap }
import htsjdk.tribble.TribbleException
import htsjdk.variant.variantcontext.{ VariantContextBuilder, VariantContext }
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.vcf._
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
import nl.lumc.sasc.biopet.core.{ BiopetJavaCommandLineFunction, ToolCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
/**
* This tool parses a VEP-annotated VCF into a standard VCF file.
* VEP puts all its annotations for each variant in a single CSQ INFO field, in which the annotations per transcript are comma-separated
* and the fields within each transcript annotation are pipe-separated.
* This tool has two modes:
* 1) explode - explodes all transcripts such that each is on a unique line
* 2) standard - parse as a standard VCF, where multiple transcripts occur in the same line
* Created by ahbbollen on 10/27/14.
*/
class VEPNormalizer(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Input(doc = "Input VCF, may be indexed", shortName = "InputFile", required = true)
var inputVCF: File = null
@Output(doc = "Output VCF", shortName = "OutputFile", required = true)
var outputVCF: File = null
var mode: String = config("mode", default = "explode")
override def commandLine = super.commandLine +
required("-I", inputVCF) +
required("-O", outputVCF) +
required("-m", mode)
}
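// A sketch of the command line this wrapper is expected to assemble, assuming the usual
// BiopetJavaCommandLineFunction behaviour (the java invocation comes from super.commandLine);
// the jar path, memory setting and file names below are illustrative, not taken from this commit:
//   java -Xmx4G -cp Biopet.jar nl.lumc.sasc.biopet.tools.VEPNormalizer \
//     -I input.vep.vcf -O output.vcf -m explode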
object VEPNormalizer extends ToolCommand {
def main(args: Array[String]): Unit = {
val commandArgs: Args = new OptParser()
.parse(args, Args())
.getOrElse(sys.exit(1))
val input = commandArgs.inputVCF
val output = commandArgs.outputVCF
logger.info(s"""Input VCF is $input""")
logger.info(s"""Output VCF is $output""")
val reader = try {
new VCFFileReader(input, false)
} catch {
case e: TribbleException.MalformedFeatureFile =>
logger.error("Malformed VCF file! Are you sure this isn't a VCFv3 file?")
throw e
}
val header = reader.getFileHeader
logger.debug("Checking for CSQ tag")
csqCheck(header)
logger.debug("CSQ tag OK")
logger.debug("Checkion VCF version")
versionCheck(header)
logger.debug("VCF version OK")
logger.debug("Parsing header")
val new_infos = parseCsq(header)
header.setWriteCommandLine(true)
for (info <- new_infos) {
val tmpheaderline = new VCFInfoHeaderLine(info, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "A VEP annotation")
header.addMetaDataLine(tmpheaderline)
}
logger.debug("Header parsing done")
logger.debug("Writing header to file")
val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
setOutputFile(output).
build())
writer.writeHeader(header)
logger.debug("Wrote header to file")
normalize(reader, writer, new_infos, commandArgs.mode, commandArgs.removeCSQ)
writer.close()
logger.debug("Closed writer")
reader.close()
logger.debug("Closed reader")
logger.info("Done. Goodbye")
}
/**
* Iterates over all records in the input VCF and writes the normalized records to the output VCF
* @param reader VCFFileReader of the input VCF
* @param writer AsyncVariantContextWriter for the output VCF
* @param newInfos array of strings containing the names of the new info fields
* @param mode normalizer mode (explode or standard)
* @param removeCsq whether to remove the CSQ tag (Boolean)
*/
def normalize(reader: VCFFileReader, writer: AsyncVariantContextWriter,
newInfos: Array[String], mode: String, removeCsq: Boolean) = {
logger.info(s"""You have selected mode $mode""")
logger.info("Start processing records")
for (record <- reader) {
mode match {
case "explode" => explodeTranscripts(record, newInfos, removeCsq).foreach(vc => writer.add(vc))
case "standard" => writer.add(standardTranscripts(record, newInfos, removeCsq))
case _ => throw new IllegalArgumentException(s"Unsupported mode: $mode")
}
}
}
/**
* Checks whether the header has a CSQ tag
* @param header VCF header
*/
def csqCheck(header: VCFHeader) = {
if (!header.hasInfoLine("CSQ")) {
//logger.error("No CSQ info tag found! Is this file VEP-annotated?")
throw new IllegalArgumentException("No CSQ info tag found! Is this file VEP-annotated?")
}
}
/**
* Checks whether the version of the input VCF is at least 4.0
* VEP is known to cause issues with VCF versions below 4.0
* Throws an exception if the version is lower
* @param header VCFHeader of input VCF
*/
def versionCheck(header: VCFHeader) = {
var format = ""
//HACK: getMetaDataLine does not work for fileformat
for (line <- header.getMetaDataInInputOrder) {
if (line.getKey == "fileformat" || line.getKey == "format") {
format = line.getValue
}
}
val version = VCFHeaderVersion.toHeaderVersion(format)
if (!version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0)) {
throw new IllegalArgumentException(s"""version $version is not supported""")
}
}
/**
* Parses the CSQ tag in the header
* @param header the VCF header
* @return list of strings with new info fields
*/
def parseCsq(header: VCFHeader): Array[String] = {
header.getInfoHeaderLine("CSQ").getDescription.
split(':')(1).trim.split('|').map("VEP_" + _)
}
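// A worked example of what parseCsq yields, assuming a typical VEP CSQ header description
// (the exact field list differs per VEP run; this one is illustrative):
//   Description = "Consequence annotations from Ensembl VEP. Format: Allele|Gene|Feature|Consequence"
// Splitting on ':' (taking the part after "Format"), then on '|', and prefixing with "VEP_" gives:
//   Array("VEP_Allele", "VEP_Gene", "VEP_Feature", "VEP_Consequence")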
/**
* Explode a single VEP-annotated record to multiple normal records
* Based on the number of annotated transcripts in the CSQ tag
* @param record the record as a VariantContext object
* @param csq_infos An array with names of new info tags
* @param remove_CSQ Whether to remove the original CSQ tag from the exploded records
* @return An array with the new records
*/
def explodeTranscripts(record: VariantContext, csq_infos: Array[String], remove_CSQ: Boolean): Array[VariantContext] = {
val csq = record.getAttributeAsString("CSQ", "unknown")
val attributes = if (remove_CSQ) record.getAttributes.toMap - "CSQ" else record.getAttributes.toMap
csq.
stripPrefix("[").
stripSuffix("]").
split(",").
map(x => attributes ++ csq_infos.zip(x.split("""\|""", -1))).
map(x => new VariantContextBuilder(record).attributes(x).make())
}
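// A worked example of explode mode, reusing the illustrative CSQ fields from the parseCsq example
// above and a record whose CSQ value holds two transcript annotations:
//   CSQ=A|ENSG01|ENST01|missense_variant,A|ENSG01|ENST02|intron_variant
// explodeTranscripts turns this into two records that share all other attributes and differ only in:
//   record 1: VEP_Allele=A, VEP_Gene=ENSG01, VEP_Feature=ENST01, VEP_Consequence=missense_variant
//   record 2: VEP_Allele=A, VEP_Gene=ENSG01, VEP_Feature=ENST02, VEP_Consequence=intron_variant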
def standardTranscripts(record: VariantContext, csqInfos: Array[String], removeCsq: Boolean): VariantContext = {
val csq = record.getAttributeAsString("CSQ", "unknown")
val attributes = if (removeCsq) record.getAttributes.toMap - "CSQ" else record.getAttributes.toMap
val newAttrs = attributes ++ csqInfos.zip(csq.
stripPrefix("[").
stripSuffix("]").
split(",").
// This folds the transcript annotations into one list per CSQ field, so each top-level element holds that field's values for all transcripts
foldLeft(List.fill(csqInfos.length) { List.empty[String] })(
(acc, x) => {
val broken = x.split("""\|""", -1)
acc.zip(broken).map(x => x._2 :: x._1)
}
).
map(x => x.mkString(",")))
new VariantContextBuilder(record).attributes(newAttrs).make()
}
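// The same illustrative record in standard mode: both transcripts stay on one record and every
// VEP_* field collects the per-transcript values comma-separated (note that the foldLeft prepends,
// so the values end up in reverse input order):
//   VEP_Feature=ENST02,ENST01
//   VEP_Consequence=intron_variant,missense_variant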
case class Args(inputVCF: File = null,
outputVCF: File = null,
mode: String = null,
removeCSQ: Boolean = true) extends AbstractArgs
class OptParser extends AbstractOptParser {
head(s"$commandName - Parse VEP-annotated VCF to standard VCF format")
opt[File]('I', "InputFile") required () valueName "<vcf>" action { (x, c) =>
c.copy(inputVCF = x)
} validate {
x => if (x.exists) success else failure("Input VCF not found")
} text "Input VCF file"
opt[File]('O', "OutputFile") required () valueName "<vcf>" action { (x, c) =>
c.copy(outputVCF = x)
} validate {
x =>
if (!x.getName.endsWith(".vcf") && (!x.getName.endsWith(".vcf.gz")) && (!x.getName.endsWith(".bcf")))
failure("Unsupported output file type") else success
} text "Output VCF file"
opt[String]('m', "mode") required () valueName "<mode>" action { (x, c) =>
c.copy(mode = x)
} validate {
x => if (x == "explode") success else if (x == "standard") success else failure("Unsupported mode")
} text "Mode"
opt[Unit]("do-not-remove") action { (x, c) =>
c.copy(removeCSQ = false)
} text "Do not remove CSQ tag"
}
}
@@ -454,4 +454,4 @@ object WipeReads extends ToolCommand {
commandArgs.filteredOutBam.map(x => prepOutBam(x, commandArgs.inputBam, writeIndex = !commandArgs.noMakeIndex))
)
}
}
##fileformat=VCFv3.3
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=1000GenomesPilot-NCBI36
##phasing=partial
##INFO=NS,1,Integer,"Number of Samples With Data"
##INFO=DP,1,Integer,"Total Depth"
##INFO=AF,-1,Float,"Allele Frequency"
##INFO=AA,1,String,"Ancestral Allele"
##INFO=DB,0,Flag,"dbSNP membership, build 129"
##INFO=H2,0,Flag,"HapMap2 membership"
##FILTER=q10,"Quality below 10"
##FILTER=s50,"Less than 50% of samples have data"
##FORMAT=GT,1,String,"Genotype"
##FORMAT=GQ,1,Integer,"Genotype Quality"
##FORMAT=DP,1,Integer,"Read Depth"
##FORMAT=HQ,2,Integer,"Haplotype Quality"
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
20 14370 rs6054257 G A 29 0 NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:-1,-1
20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:-1,-1
20 1110696 rs6040355 A G,T 67 0 NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:-1,-1
20 1230237 . T . 47 0 NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2:-1,-1
20 1234567 microsat1 G D4,IGA 50 0 NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
package nl.lumc.sasc.biopet.tools
import org.scalatest.testng.TestNGSuite
import org.scalatest.mock.MockitoSugar
import org.scalatest.Matchers
import java.io.File
import java.nio.file.Paths
import org.testng.annotations.Test
import htsjdk.variant.vcf.VCFFileReader
import htsjdk.tribble.TribbleException
import scala.collection.JavaConversions._
import htsjdk.variant.variantcontext.VariantContext
/**
* This class tests the VEPNormalizer
* Created by ahbbollen on 11/24/14.
*/
class VEPNormalizerTest extends TestNGSuite with MockitoSugar with Matchers {
import VEPNormalizer._
private def resourcePath(p: String): String = {
Paths.get(getClass.getResource(p).toURI).toString
}
val vcf3 = new File(resourcePath("/VCFv3.vcf"))
val vepped = new File(resourcePath("/VEP_oneline.vcf"))
val unvepped = new File(resourcePath("/unvepped.vcf"))
@Test def testVEPHeaderLength() = {
val reader = new VCFFileReader(vepped, false)