Commit d84fc9cc authored by bow's avatar bow
Browse files

Merge branch 'feature-scatter_annotation' into 'develop'

Added scattering on annotation

Fixes #305 and #324.

See merge request !374
parents bffd796d 35dca269
......@@ -18,10 +18,9 @@ package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.summary.Summarizable
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.{ Logging, VcfUtils, tryToParseNumber }
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction, Reference }
import nl.lumc.sasc.biopet.utils.tryToParseNumber
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Reference, Version }
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.io.Source
......@@ -164,147 +163,149 @@ class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFu
}
/** Returns command to execute */
def cmdLine = required(executable) +
required(vepScript) +
required("-i", input) +
required("-o", output) +
conditional(v, "-v") +
conditional(q, "-q") +
conditional(offline, "--offline") +
conditional(noProgress, "--no_progress") +
conditional(everything, "--everything") +
conditional(force, "--force_overwrite") +
conditional(noStats, "--no_stats") +
conditional(statsText, "--stats_text") +
conditional(html, "--html") +
conditional(cache, "--cache") +
conditional(humdiv, "--humdiv") +
conditional(regulatory, "--regulatory") +
conditional(cellType, "--cel_type") +
conditional(phased, "--phased") +
conditional(alleleNumber, "--allele_number") +
conditional(numbers, "--numbers") +
conditional(domains, "--domains") +
conditional(noEscape, "--no_escape") +
conditional(hgvs, "--hgvs") +
conditional(protein, "--protein") +
conditional(symbol, "--symbol") +
conditional(ccds, "--ccds") +
conditional(uniprot, "--uniprot") +
conditional(tsl, "--tsl") +
conditional(canonical, "--canonical") +
conditional(biotype, "--biotype") +
conditional(xrefRefseq, "--xref_refseq") +
conditional(checkExisting, "--check_existing") +
conditional(checkAlleles, "--check_alleles") +
conditional(checkSvs, "--check_svs") +
conditional(gmaf, "--gmaf") +
conditional(maf1kg, "--maf_1kg") +
conditional(mafEsp, "--maf_esp") +
conditional(pubmed, "--pubmed") +
conditional(vcf, "--vcf") +
conditional(json, "--json") +
conditional(gvf, "--gvf") +
conditional(checkRef, "--check_ref") +
conditional(codingOnly, "--coding_only") +
conditional(noIntergenic, "--no_intergenic") +
conditional(pick, "--pick") +
conditional(pickAllele, "--pick_allele") +
conditional(flagPick, "--flag_pick") +
conditional(flagPickAllele, "--flag_pick_allele") +
conditional(perGene, "--per_gene") +
conditional(mostSevere, "--most_severe") +
conditional(summary, "--summary") +
conditional(filterCommon, "--filter_common") +
conditional(checkFrequency, "--check_frequency") +
conditional(allowNonVariant, "--allow_non_variant") +
conditional(database, "--database") +
conditional(genomes, "--genomes") +
conditional(gencodeBasic, "--gencode_basic") +
conditional(refseq, "--refseq") +
conditional(merged, "--merged") +
conditional(allRefseq, "--all_refseq") +
conditional(lrg, "--lrg") +
conditional(noWholeGenome, "--no_whole_genome") +
conditional(skibDbCheck, "--skip_db_check") +
optional("--config", vepConfig) +
optional("--species", species) +
optional("--assembly", assembly) +
optional("--format", format) +
optional("--dir", dir) +
optional("--dir_cache", dirCache) +
optional("--dir_plugins", dirPlugins) +
optional("--fasta", fasta) +
optional("--sift", sift) +
optional("--polyphen", polyphen) +
repeat("--custom", custom) +
repeat("--plugin", plugin) +
optional("--individual", individual) +
optional("--fields", fields) +
optional("--convert", convert) +
optional("--terms", terms) +
optional("--chr", chr) +
optional("--pick_order", pickOrder) +
optional("--freq_pop", freqPop) +
optional("--freq_gt_lt", freqGtLt) +
optional("--freq_filter", freqFilter) +
optional("--filter", filter) +
optional("--host", host) +
optional("--user", user) +
optional("--password", password) +
optional("--registry", registry) +
optional("--build", build) +
optional("--compress", compress) +
optional("--cache_region_size", cacheRegionSize) +
optional("--fork", threads) +
optional("--cache_version", cacheVersion) +
optional("--freq_freq", freqFreq) +
optional("--port", port) +
optional("--db_version", dbVersion) +
optional("--buffer_size", bufferSize) +
optional("--failed", failed)
/**
 * Builds the VEP command line.
 * When the input VCF exists but contains no variant records, VEP is skipped
 * entirely and the file is passed through unchanged via a zcat copy.
 */
def cmdLine = {
// Empty-input short circuit: copy input to output instead of running VEP.
if (input.exists() && VcfUtils.vcfFileIsEmpty(input)) {
val zcat = Zcat(this, input, output)
zcat.cmdLine
} else required(executable) +
required(vepScript) +
required("-i", input) +
required("-o", output) +
// Boolean flags, each emitted only when the corresponding config switch is set.
conditional(v, "-v") +
conditional(q, "-q") +
conditional(offline, "--offline") +
conditional(noProgress, "--no_progress") +
conditional(everything, "--everything") +
conditional(force, "--force_overwrite") +
conditional(noStats, "--no_stats") +
conditional(statsText, "--stats_text") +
conditional(html, "--html") +
conditional(cache, "--cache") +
conditional(humdiv, "--humdiv") +
conditional(regulatory, "--regulatory") +
// NOTE(review): VEP's documented flag is "--cell_type"; "--cel_type" looks like a
// typo — confirm against the VEP version in use before changing this string.
conditional(cellType, "--cel_type") +
conditional(phased, "--phased") +
conditional(alleleNumber, "--allele_number") +
conditional(numbers, "--numbers") +
conditional(domains, "--domains") +
conditional(noEscape, "--no_escape") +
conditional(hgvs, "--hgvs") +
conditional(protein, "--protein") +
conditional(symbol, "--symbol") +
conditional(ccds, "--ccds") +
conditional(uniprot, "--uniprot") +
conditional(tsl, "--tsl") +
conditional(canonical, "--canonical") +
conditional(biotype, "--biotype") +
conditional(xrefRefseq, "--xref_refseq") +
conditional(checkExisting, "--check_existing") +
conditional(checkAlleles, "--check_alleles") +
conditional(checkSvs, "--check_svs") +
conditional(gmaf, "--gmaf") +
conditional(maf1kg, "--maf_1kg") +
conditional(mafEsp, "--maf_esp") +
conditional(pubmed, "--pubmed") +
conditional(vcf, "--vcf") +
conditional(json, "--json") +
conditional(gvf, "--gvf") +
conditional(checkRef, "--check_ref") +
conditional(codingOnly, "--coding_only") +
conditional(noIntergenic, "--no_intergenic") +
conditional(pick, "--pick") +
conditional(pickAllele, "--pick_allele") +
conditional(flagPick, "--flag_pick") +
conditional(flagPickAllele, "--flag_pick_allele") +
conditional(perGene, "--per_gene") +
conditional(mostSevere, "--most_severe") +
conditional(summary, "--summary") +
conditional(filterCommon, "--filter_common") +
conditional(checkFrequency, "--check_frequency") +
conditional(allowNonVariant, "--allow_non_variant") +
conditional(database, "--database") +
conditional(genomes, "--genomes") +
conditional(gencodeBasic, "--gencode_basic") +
conditional(refseq, "--refseq") +
conditional(merged, "--merged") +
conditional(allRefseq, "--all_refseq") +
conditional(lrg, "--lrg") +
conditional(noWholeGenome, "--no_whole_genome") +
// NOTE(review): field name "skibDbCheck" looks like a typo for "skipDbCheck";
// renaming would touch config keys elsewhere, so flagging only.
conditional(skibDbCheck, "--skip_db_check") +
// Valued options; omitted from the command when the underlying Option is empty.
optional("--config", vepConfig) +
optional("--species", species) +
optional("--assembly", assembly) +
optional("--format", format) +
optional("--dir", dir) +
optional("--dir_cache", dirCache) +
optional("--dir_plugins", dirPlugins) +
optional("--fasta", fasta) +
optional("--sift", sift) +
optional("--polyphen", polyphen) +
// Repeatable options: one flag instance per configured value.
repeat("--custom", custom) +
repeat("--plugin", plugin) +
optional("--individual", individual) +
optional("--fields", fields) +
optional("--convert", convert) +
optional("--terms", terms) +
optional("--chr", chr) +
optional("--pick_order", pickOrder) +
optional("--freq_pop", freqPop) +
optional("--freq_gt_lt", freqGtLt) +
optional("--freq_filter", freqFilter) +
optional("--filter", filter) +
optional("--host", host) +
optional("--user", user) +
optional("--password", password) +
optional("--registry", registry) +
optional("--build", build) +
optional("--compress", compress) +
optional("--cache_region_size", cacheRegionSize) +
// VEP's own parallelism flag; wired to the job's thread count.
optional("--fork", threads) +
optional("--cache_version", cacheVersion) +
optional("--freq_freq", freqFreq) +
optional("--port", port) +
optional("--db_version", dbVersion) +
optional("--buffer_size", bufferSize) +
optional("--failed", failed)
}
def summaryFiles: Map[String, File] = Map()
def summaryStats: Map[String, Any] = {
if (statsText) {
val statsFile: File = new File(output.getAbsolutePath + "_summary.txt")
parseStatsFile(statsFile)
} else {
Map()
}
val statsFile = new File(output.getAbsolutePath + "_summary.txt")
if (statsText && statsFile.exists()) parseStatsFile(statsFile)
else Map()
}
/**
 * Parses a VEP "--stats_text" summary file into a map keyed by block header.
 *
 * @param file the "<output>_summary.txt" stats file written by VEP
 * @return map of header name (spaces replaced by underscores) -> parsed block contents
 */
def parseStatsFile(file: File): Map[String, Any] = {
  // Fix: the Source was never closed, leaking a file handle per parsed stats file.
  val source = Source.fromFile(file)
  val contents = try source.getLines().toList finally source.close()
  val headers = getHeadersFromStatsFile(contents)
  headers.foldLeft(Map.empty[String, Any])((acc, x) => acc + (x.replace(" ", "_") -> getBlockFromStatsFile(contents, x)))
}
protected val removeOnConflict = Set("Output_file", "Command_line_options", "Run_time", "Start_time", "End_time", "Input_file_(format)", "Novel_/_existing_variants")
protected val nonNumber = Set("VEP_version_(API)", "Cache/Database", "Species")
def getBlockFromStatsFile(contents: List[String], header: String): Map[String, Any] = {
var inBlock = false
var theMap: Map[String, Any] = Map()
for (x <- contents) {
val stripped = x.stripPrefix("[").stripSuffix("]")
if (stripped == header) {
inBlock = true
} else {
if (inBlock) {
val key = stripped.split('\t').head.replace(" ", "_")
val value = stripped.split('\t').last
theMap ++= Map(key -> tryToParseNumber(value, fallBack = true).getOrElse(value))
}
}
if (stripped == "") {
inBlock = false
override def resolveSummaryConflict(v1: Any, v2: Any, key: String): Any = {
if (removeOnConflict.contains(key)) None
else if (nonNumber.contains(key)) v1
else {
(v1, v2) match {
case (x1: Int, x2: Int) => x1 + x2
case _ => throw new IllegalStateException(s"Value are not Int's, unable to sum them up, key: $key, v1: $v1, v2: $v2")
}
}
theMap
}
/**
 * Extracts block header names from a VEP stats file.
 * Header lines have the form '[block name]'; brackets are stripped from the result.
 *
 * @param contents all lines of the stats file
 * @return header names in file order
 */
def getHeadersFromStatsFile(contents: List[String]): List[String] =
  contents.collect {
    case line if line.startsWith("[") && line.endsWith("]") =>
      line.stripPrefix("[").stripSuffix("]")
  }
/**
 * Parses a VEP "--stats_text" summary file into a map of block name -> stats map.
 * Blank lines are dropped; each '[Header]' line starts a block that runs until
 * the next header, and each block line is a tab-separated key/value pair.
 *
 * @param file the "<output>_summary.txt" stats file written by VEP
 * @return map of header name (spaces replaced by underscores) -> Map(key -> value)
 */
def parseStatsFile(file: File): Map[String, Any] = {
  val reader = Source.fromFile(file)
  val contents = reader.getLines().filter(_ != "").toArray
  reader.close()
  // Block headers are of format '[block]'
  def isHeader(line: String) = line.startsWith("[") && line.endsWith("]")
  // Consistency fix: reuse the local isHeader instead of duplicating its predicate.
  val headers = contents.zipWithIndex.filter(x => isHeader(x._1))
  (for ((header, headerIndex) <- headers) yield {
    val name = header.stripPrefix("[").stripSuffix("]")
    // NOTE(review): non-numeric values collapse to 0 here (no string fallback),
    // unlike the previous implementation which kept the raw string — confirm intended.
    name.replaceAll(" ", "_") -> (contents.drop(headerIndex + 1).takeWhile(!isHeader(_)).map { line =>
      val values = line.split("\t", 2)
      values.head.replaceAll(" ", "_") -> tryToParseNumber(values.last).getOrElse(0)
    }.toMap)
  }).toMap
}
}
......@@ -2,11 +2,11 @@ package nl.lumc.sasc.biopet.extensions.gatk
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.{ BiopetJavaCommandLineFunction, Reference }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Input, Output }
class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction {
class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction with Reference {
analysisName = "CatVariants"
javaMainClass = "org.broadinstitute.gatk.tools.CatVariants"
......@@ -44,6 +44,11 @@ class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction
@Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction])
var log_to_file: File = _
/** Pre-graph hook: fills in the reference FASTA from config when none was set explicitly. */
override def beforeGraph() = {
super.beforeGraph()
// Fall back to the pipeline-wide reference (via the Reference trait) so -R is never null.
if (reference == null) reference = referenceFasta()
}
override def cmdLine = super.cmdLine +
required("-R", reference, spaceSeparated = true, escape = true, format = "%s") +
repeat("-V", variant, spaceSeparated = true, escape = true, format = "%s") +
......
......@@ -24,6 +24,7 @@ import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, Variant
import htsjdk.variant.vcf._
import nl.lumc.sasc.biopet.utils.ToolCommand
import nl.lumc.sasc.biopet.utils.VcfUtils.scalaListToJavaObjectArrayList
import nl.lumc.sasc.biopet.utils.BamUtils.SamDictCheck
import scala.collection.JavaConversions._
......@@ -89,14 +90,14 @@ object VcfWithVcf extends ToolCommand {
val header = reader.getFileHeader
val vcfDict = header.getSequenceDictionary match {
case r if r != null =>
r.assertSameDictionary(referenceDict)
r.assertSameDictionary(referenceDict, true)
r
case _ => referenceDict
}
val secondHeader = secondaryReader.getFileHeader
secondHeader.getSequenceDictionary match {
case r if r != null => r.assertSameDictionary(referenceDict)
case r if r != null => r.assertSameDictionary(referenceDict, true)
case _ =>
}
......@@ -123,6 +124,7 @@ object VcfWithVcf extends ToolCommand {
var counter = 0
for (record <- reader) {
require(vcfDict.getSequence(record.getContig) != null, s"Contig ${record.getContig} does not exist on reference")
val secondaryRecords = getSecondaryRecords(secondaryReader, record, commandArgs.matchAllele)
val fieldMap = createFieldMap(commandArgs.fields, secondaryRecords)
......@@ -201,7 +203,6 @@ object VcfWithVcf extends ToolCommand {
}
case FieldMethod.unique => scalaListToJavaObjectArrayList(attribute._2.distinct)
case _ => {
print(attribute._2.getClass.toString)
scalaListToJavaObjectArrayList(attribute._2)
}
})
......
......@@ -24,7 +24,6 @@ import htsjdk.variant.vcf._
import nl.lumc.sasc.biopet.utils.ToolCommand
import scala.collection.JavaConversions._
import scala.collection.mutable.{ Map => MMap }
/**
* This tool parses a VEP annotated VCF into a standard VCF file.
......@@ -57,31 +56,37 @@ object VepNormalizer extends ToolCommand {
}
val header = reader.getFileHeader
logger.debug("Checking for CSQ tag")
csqCheck(header)
logger.debug("CSQ tag OK")
logger.debug("Checkion VCF version")
versionCheck(header)
logger.debug("VCF version OK")
logger.debug("Parsing header")
val newInfos = parseCsq(header)
header.setWriteCommandLine(true)
val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
setOutputFile(output).setReferenceDictionary(header.getSequenceDictionary)
build ())
for (info <- newInfos) {
val tmpheaderline = new VCFInfoHeaderLine(info, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "A VEP annotation")
header.addMetaDataLine(tmpheaderline)
}
logger.debug("Header parsing done")
if (reader.iterator().hasNext) {
logger.debug("Checking for CSQ tag")
csqCheck(header)
logger.debug("CSQ tag OK")
logger.debug("Checkion VCF version")
versionCheck(header)
logger.debug("VCF version OK")
logger.debug("Parsing header")
val newInfos = parseCsq(header)
header.setWriteCommandLine(true)
for (info <- newInfos) {
val tmpheaderline = new VCFInfoHeaderLine(info, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "A VEP annotation")
header.addMetaDataLine(tmpheaderline)
}
logger.debug("Header parsing done")
logger.debug("Writing header to file")
logger.debug("Writing header to file")
writer.writeHeader(header)
logger.debug("Wrote header to file")
writer.writeHeader(header)
logger.debug("Wrote header to file")
normalize(reader, writer, newInfos, commandArgs.mode, commandArgs.removeCSQ)
normalize(reader, writer, newInfos, commandArgs.mode, commandArgs.removeCSQ)
} else {
logger.debug("No variants found, skipping normalize step")
writer.writeHeader(header)
}
writer.close()
logger.debug("Closed writer")
reader.close()
......@@ -91,6 +96,7 @@ object VepNormalizer extends ToolCommand {
/**
* Normalizer
*
* @param reader input VCF VCFFileReader
* @param writer output VCF AsyncVariantContextWriter
* @param newInfos array of string containing names of new info fields
......@@ -118,6 +124,7 @@ object VepNormalizer extends ToolCommand {
/**
* Checks whether header has a CSQ tag
*
* @param header VCF header
*/
def csqCheck(header: VCFHeader) = {
......@@ -131,6 +138,7 @@ object VepNormalizer extends ToolCommand {
* Checks whether version of input VCF is at least 4.0
* VEP is known to cause issues below 4.0
* Throws exception if not
*
* @param header VCFHeader of input VCF
*/
def versionCheck(header: VCFHeader) = {
......@@ -149,6 +157,7 @@ object VepNormalizer extends ToolCommand {
/**
* Parses the CSQ tag in the header
*
* @param header the VCF header
* @return list of strings with new info fields
*/
......@@ -160,6 +169,7 @@ object VepNormalizer extends ToolCommand {
/**
* Explode a single VEP-annotated record to multiple normal records
* Based on the number of annotated transcripts in the CSQ tag
*
* @param record the record as a VariantContext object
* @param csqInfos An array with names of new info tags
* @return An array with the new records
......
......@@ -17,7 +17,7 @@ package nl.lumc.sasc.biopet.utils
import java.io.File
import htsjdk.samtools.{ SamReader, SamReaderFactory }
import htsjdk.samtools.{ SAMSequenceDictionary, SamReader, SamReaderFactory }
import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
import scala.collection.JavaConversions._
......@@ -129,11 +129,8 @@ object BamUtils {
val counts = bamInsertSizes.flatMap(x => x)
// avoid division by zero
if (counts.size != 0) {
counts.sum / counts.size
} else {
0
}
if (counts.size != 0) counts.sum / counts.size
else 0
}
/**
......@@ -146,4 +143,21 @@ object BamUtils {
bamFile -> sampleBamInsertSize(bamFile, samplingSize, binSize)
}.toMap
/** This class adds extra checks to [[SAMSequenceDictionary]] */
implicit class SamDictCheck(samDics: SAMSequenceDictionary) {
  /**
   * Checks that both dictionaries describe the same contigs with the same sizes.
   *
   * @param that dictionary to compare to
   * @param ignoreOrder when true the order of the contigs does not matter
   * @throws AssertionError when the dictionaries differ
   */
  def assertSameDictionary(that: SAMSequenceDictionary, ignoreOrder: Boolean): Unit = {
    if (ignoreOrder) {
      assert(samDics.getReferenceLength == that.getReferenceLength,
        s"Reference lengths differ: ${samDics.getReferenceLength} != ${that.getReferenceLength}")
      // Compare (name, length) pairs as sets; the original's `.sorted` before
      // `.toSet` was redundant since sets are unordered. Both sides are now
      // built symmetrically and named for a readable assertion message.
      val thisContigs = samDics.getSequences.map(x => (x.getSequenceName, x.getSequenceLength)).toSet
      val thatContigs = that.getSequences.map(x => (x.getSequenceName, x.getSequenceLength)).toSet
      assert(thisContigs == thatContigs, s"Contigs differ: $thisContigs != $thatContigs")
    } else samDics.assertSameDictionary(that)
  }
}
}
......@@ -134,4 +134,11 @@ object VcfUtils {
else if (name.endsWith(".vcf.gz")) new File(name + ".tbi")
else throw new IllegalArgumentException(s"File given is no vcf file: $vcfFile")
}
/**
 * Checks whether a VCF file contains no variant records.
 *
 * @param file VCF file to inspect (index not required)
 * @return true when the file has no records
 */
def vcfFileIsEmpty(file: File): Boolean = {
  val reader = new VCFFileReader(file, false)
  try {
    // Fix: close the CloseableIterator and always close the reader, even when
    // iteration throws (e.g. on a malformed file); the original leaked both paths.
    val it = reader.iterator()
    try !it.hasNext
    finally it.close()
  } finally reader.close()
}
}
......@@ -178,7 +178,7 @@ class Shiva(val root: Configurable) extends QScript with MultisampleMappingTrait
annotation.foreach { toucan =>
toucan.outputDir = new File(outputDir, "annotation")
toucan.inputVCF = vc.finalFile
toucan.inputVcf = vc.finalFile
add(toucan)
}
})
......
......@@ -15,15 +15,20 @@
*/
package nl.lumc.sasc.biopet.pipelines.toucan
import java.io.File
import htsjdk.samtools.reference.FastaSequenceFile
import nl.lumc.sasc.biopet.core._
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.extensions.bcftools.BcftoolsView
import nl.lumc.sasc.biopet.extensions.bedtools.{ BedtoolsIntersect, BedtoolsMerge }
import nl.lumc.sasc.biopet.extensions.gatk.{ CatVariants, SelectVariants }
import nl.lumc.sasc.biopet.extensions.manwe.{ ManweAnnotateVcf, ManweSamplesImport }
import nl.lumc.sasc.biopet.extensions.tools.{ GvcfToBed, VcfWithVcf, VepNormalizer }
import nl.lumc.sasc.biopet.extensions.{ Bgzip, Ln, VariantEffectPredictor }
import nl.lumc.sasc.biopet.utils.VcfUtils
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
import org.broadinstitute.gatk.queue.QScript
/**
......@@ -35,22 +40,43 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum
def this() = this(null)
@Input(doc = "Input VCF file", shortName = "Input", required = true)
var inputVCF: File = _
var inputVcf: File = _
@Input(doc = "Input GVCF file", shortName = "gvcf", required = false)
var inputGvcf: Option[File] = None
var outputVcf: Option[F