Commit 3a1af12e authored by Peter van 't Hof's avatar Peter van 't Hof

Fix for BIOPET-395

parent e6199e01
......@@ -81,7 +81,7 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
this match {
case r: Reference =>
if (r.dictRequired) deps :+= r.referenceDict
if (r.dictRequired) deps :+= r.referenceDictFile
if (r.faiRequired) deps :+= r.referenceFai
deps = deps.distinct
case _ =>
......
......@@ -16,9 +16,10 @@ package nl.lumc.sasc.biopet.core
import java.io.File
import htsjdk.samtools.reference.IndexedFastaSequenceFile
import nl.lumc.sasc.biopet.core.summary.{ SummaryQScript, Summarizable }
import nl.lumc.sasc.biopet.utils.{ ConfigUtils, Logging }
import htsjdk.samtools.SAMSequenceDictionary
import htsjdk.samtools.reference.{ FastaSequenceFile, IndexedFastaSequenceFile }
import nl.lumc.sasc.biopet.core.summary.{ Summarizable, SummaryQScript }
import nl.lumc.sasc.biopet.utils.{ BamUtils, ConfigUtils, FastaUtils, Logging }
import nl.lumc.sasc.biopet.utils.config.{ Config, Configurable }
import scala.collection.JavaConversions._
......@@ -49,6 +50,8 @@ trait Reference extends Configurable {
}
}
def referenceDict = FastaUtils.getCachedDict(referenceFasta())
/** All config values will get a prefix */
override def subPath = {
referenceConfigPath ::: super.subPath
......@@ -66,7 +69,7 @@ trait Reference extends Configurable {
def dictRequired = this.isInstanceOf[Summarizable] || this.isInstanceOf[SummaryQScript]
/** Returns the dict file belonging to the fasta file */
def referenceDict = new File(referenceFasta().getAbsolutePath
def referenceDictFile = new File(referenceFasta().getAbsolutePath
.stripSuffix(".fa")
.stripSuffix(".fasta")
.stripSuffix(".fna") + ".dict")
......@@ -108,9 +111,8 @@ trait Reference extends Configurable {
/** Create summary part for reference */
def referenceSummary: Map[String, Any] = {
Reference.requireDict(referenceFasta())
val file = new IndexedFastaSequenceFile(referenceFasta())
Map("contigs" ->
(for (seq <- file.getSequenceDictionary.getSequences) yield seq.getSequenceName -> {
(for (seq <- referenceDict.getSequences) yield seq.getSequenceName -> {
val md5 = Option(seq.getAttribute("M5"))
Map("md5" -> md5, "length" -> seq.getSequenceLength)
}).toMap,
......
......@@ -37,7 +37,7 @@ class SortVcf(val root: Configurable) extends Picard with Reference {
override def beforeGraph(): Unit = {
super.beforeGraph()
if (sequenceDictionary == null) sequenceDictionary = referenceDict
if (sequenceDictionary == null) sequenceDictionary = referenceDictFile
}
/** Returns command to execute */
......
......@@ -21,7 +21,7 @@ import htsjdk.samtools.reference.{ FastaSequenceFile, ReferenceSequenceFileFacto
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.variantcontext.{ Allele, GenotypeBuilder, VariantContextBuilder }
import htsjdk.variant.vcf._
import nl.lumc.sasc.biopet.utils.ToolCommand
import nl.lumc.sasc.biopet.utils.{ FastaUtils, ToolCommand }
import scala.collection.JavaConversions._
import scala.io.Source
......@@ -82,12 +82,11 @@ object GensToVcf extends ToolCommand {
metaLines.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, ""))
metaLines.add(new VCFFormatHeaderLine("GP", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Float, ""))
val reference = new FastaSequenceFile(cmdArgs.referenceFasta, true)
require(reference.getSequenceDictionary.getSequence(cmdArgs.contig) != null,
require(FastaUtils.getCachedDict(cmdArgs.referenceFasta).getSequence(cmdArgs.contig) != null,
s"contig '${cmdArgs.contig}' not found on reference")
val header = new VCFHeader(metaLines, samples.toList)
header.setSequenceDictionary(reference.getSequenceDictionary)
header.setSequenceDictionary(FastaUtils.getCachedDict(cmdArgs.referenceFasta))
val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder()
.setOutputFile(cmdArgs.outputVcf)
.setReferenceDictionary(header.getSequenceDictionary)
......
......@@ -20,7 +20,7 @@ import htsjdk.samtools.reference.FastaSequenceFile
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.variantcontext.{ Allele, VariantContext, VariantContextBuilder }
import htsjdk.variant.vcf.{ VCFFileReader, VCFHeader }
import nl.lumc.sasc.biopet.utils.ToolCommand
import nl.lumc.sasc.biopet.utils.{ FastaUtils, ToolCommand }
import nl.lumc.sasc.biopet.utils.config.Configurable
import scala.collection.JavaConversions._
......@@ -52,13 +52,12 @@ object MergeAlleles extends ToolCommand {
val commandArgs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
val readers = commandArgs.inputFiles.map(new VCFFileReader(_, true))
val referenceFile = new FastaSequenceFile(commandArgs.reference, true)
val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
setReferenceDictionary(referenceFile.getSequenceDictionary).
setReferenceDictionary(FastaUtils.getCachedDict(commandArgs.reference)).
setOutputFile(commandArgs.outputFile).
build)
val header = new VCFHeader
val referenceDict = referenceFile.getSequenceDictionary
val referenceDict = FastaUtils.getCachedDict(commandArgs.reference)
header.setSequenceDictionary(referenceDict)
writer.writeHeader(header)
......
......@@ -21,7 +21,7 @@ import htsjdk.samtools.reference.FastaSequenceFile
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, Options, VariantContextWriterBuilder }
import htsjdk.variant.variantcontext.{ Allele, VariantContextBuilder }
import htsjdk.variant.vcf._
import nl.lumc.sasc.biopet.utils.ToolCommand
import nl.lumc.sasc.biopet.utils.{ FastaUtils, ToolCommand }
import scala.collection.JavaConversions._
import scala.io.Source
......@@ -71,9 +71,8 @@ object SnptestToVcf extends ToolCommand {
}
def writeEmptyVcf(outputVcf: File, referenceFasta: File): Unit = {
val reference = new FastaSequenceFile(referenceFasta, true)
val vcfHeader = new VCFHeader()
vcfHeader.setSequenceDictionary(reference.getSequenceDictionary)
vcfHeader.setSequenceDictionary(FastaUtils.getCachedDict(referenceFasta))
val writer = new VariantContextWriterBuilder()
.setOutputFile(outputVcf)
.setReferenceDictionary(vcfHeader.getSequenceDictionary)
......@@ -92,12 +91,11 @@ object SnptestToVcf extends ToolCommand {
key <- headerKeys if key != "rsid" if key != "chromosome" if key != "position" if key != "alleleA" if key != "alleleB" if key != "alleleA"
) metaLines.add(new VCFInfoHeaderLine(s"ST_$key", 1, VCFHeaderLineType.String, ""))
val reference = new FastaSequenceFile(cmdArgs.referenceFasta, true)
require(reference.getSequenceDictionary.getSequence(cmdArgs.contig) != null,
require(FastaUtils.getCachedDict(cmdArgs.referenceFasta).getSequence(cmdArgs.contig) != null,
s"contig '${cmdArgs.contig}' not found on reference")
val vcfHeader = new VCFHeader(metaLines)
vcfHeader.setSequenceDictionary(reference.getSequenceDictionary)
vcfHeader.setSequenceDictionary(FastaUtils.getCachedDict(cmdArgs.referenceFasta))
val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder()
.setOutputFile(cmdArgs.outputVcf)
.setReferenceDictionary(vcfHeader.getSequenceDictionary)
......
......@@ -20,7 +20,7 @@ import htsjdk.samtools.reference.FastaSequenceFile
import htsjdk.samtools.util.Interval
import htsjdk.variant.variantcontext.{ Allele, Genotype, VariantContext }
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.utils.ToolCommand
import nl.lumc.sasc.biopet.utils.{ FastaUtils, ToolCommand }
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
......@@ -330,7 +330,7 @@ object VcfStats extends ToolCommand {
writeField(stats, "general", cmdArgs.outputDir)
for (field <- adInfoTags.distinct.par) {
writeField(stats, field, infoOutputDir)
for (line <- new FastaSequenceFile(cmdArgs.referenceFile, true).getSequenceDictionary.getSequences) {
for (line <- FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences) {
val chr = line.getSequenceName
writeField(stats, field, new File(infoOutputDir, "chrs" + File.separator + chr), chr = chr)
}
......@@ -341,7 +341,7 @@ object VcfStats extends ToolCommand {
writeGenotypeField(stats, samples, "general", cmdArgs.outputDir, prefix = "genotype")
for (field <- adGenotypeTags.distinct.par) {
writeGenotypeField(stats, samples, field, genotypeOutputDir)
for (line <- new FastaSequenceFile(cmdArgs.referenceFile, true).getSequenceDictionary.getSequences) {
for (line <- FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences) {
val chr = line.getSequenceName
writeGenotypeField(stats, samples, field, new File(genotypeOutputDir, "chrs" + File.separator + chr), chr = chr)
}
......
......@@ -21,7 +21,7 @@ import htsjdk.samtools.reference.FastaSequenceFile
import htsjdk.variant.variantcontext.{ VariantContext, VariantContextBuilder }
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.vcf._
import nl.lumc.sasc.biopet.utils.ToolCommand
import nl.lumc.sasc.biopet.utils.{ FastaUtils, ToolCommand }
import nl.lumc.sasc.biopet.utils.VcfUtils.scalaListToJavaObjectArrayList
import nl.lumc.sasc.biopet.utils.BamUtils.SamDictCheck
......@@ -84,7 +84,7 @@ object VcfWithVcf extends ToolCommand {
val reader = new VCFFileReader(commandArgs.inputFile)
val secondaryReader = new VCFFileReader(commandArgs.secondaryVcf)
val referenceDict = new FastaSequenceFile(commandArgs.referenceFasta, true).getSequenceDictionary
val referenceDict = FastaUtils.getCachedDict(commandArgs.referenceFasta)
val header = reader.getFileHeader
val vcfDict = header.getSequenceDictionary match {
......
......@@ -20,7 +20,7 @@ import java.util.concurrent.TimeoutException
import htsjdk.samtools.reference.FastaSequenceFile
import htsjdk.samtools.{ SAMSequenceDictionary, SamReaderFactory }
import nl.lumc.sasc.biopet.utils.BamUtils.SamDictCheck
import nl.lumc.sasc.biopet.utils.ToolCommand
import nl.lumc.sasc.biopet.utils.{ FastaUtils, ToolCommand }
import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
import scala.collection.JavaConversions._
......@@ -82,11 +82,8 @@ object BamStats extends ToolCommand {
val samHeader = samReader.getFileHeader
samReader.close()
referenceFasta.map { f =>
val referenceReader = new FastaSequenceFile(f, true)
val referenceDict = referenceReader.getSequenceDictionary
samHeader.getSequenceDictionary.assertSameDictionary(referenceDict, false)
referenceReader.close()
referenceDict
samHeader.getSequenceDictionary.assertSameDictionary(FastaUtils.getCachedDict(f), false)
FastaUtils.getCachedDict(f)
}.getOrElse(samHeader.getSequenceDictionary)
}
......
package nl.lumc.sasc.biopet.utils
import java.io.File
import htsjdk.samtools.SAMSequenceDictionary
import htsjdk.samtools.reference.FastaSequenceFile
/**
* Created by pjvan_thof on 25-10-16.
*/
object FastaUtils {
/**
* This method will get a dict from the fasta file. This will not use the cache
*
* @param fastaFile Fasta file
* @return sequence dict
*/
def getDictFromFasta(fastaFile: File): SAMSequenceDictionary = {
val referenceFile = new FastaSequenceFile(fastaFile, true)
val dict = referenceFile.getSequenceDictionary
referenceFile.close()
dictCache += fastaFile -> dict
dict
}
private var dictCache: Map[File, SAMSequenceDictionary] = Map()
/** This will clear the dict cache */
def clearCache() = dictCache = Map()
/**
* This method will get a dict from the fasta file. If it's already in the cache file will not opened again.
*
* @param fastaFile Fasta file
* @return sequence dict
*/
def getCachedDict(fastaFile: File): SAMSequenceDictionary = {
if (!dictCache.contains(fastaFile)) dictCache += fastaFile -> getDictFromFasta(fastaFile)
dictCache(fastaFile)
}
}
......@@ -23,7 +23,7 @@ import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.io.Source
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.{ FastaUtils, Logging }
/**
* Created by pjvan_thof on 8/20/15.
......@@ -100,8 +100,7 @@ case class BedRecordList(val chrRecords: Map[String, List[BedRecord]], val heade
)
def validateContigs(reference: File) = {
val referenceFile = new FastaSequenceFile(reference, true)
val dict = referenceFile.getSequenceDictionary
val dict = FastaUtils.getCachedDict(reference)
val notExisting = chrRecords.keys.filter(dict.getSequence(_) == null).toList
require(notExisting.isEmpty, s"Contigs found in bed records but are not existing in reference: ${notExisting.mkString(",")}")
this
......@@ -152,12 +151,7 @@ object BedRecordList {
}
}
def fromReference(file: File) = {
val referenceFile = new FastaSequenceFile(file, true)
val dict = referenceFile.getSequenceDictionary
referenceFile.close()
fromDict(dict)
}
def fromReference(file: File) = fromDict(FastaUtils.getCachedDict(file))
def fromDict(dict: SAMSequenceDictionary) = {
fromList(for (contig <- dict.getSequences) yield {
......
......@@ -56,7 +56,6 @@ class Impute2Vcf(val root: Configurable) extends QScript with BiopetQScript with
/** Init for pipeline */
def init(): Unit = {
inputGens.foreach { g =>
val referenceDict = new FastaSequenceFile(referenceFasta(), true).getSequenceDictionary
if (referenceDict.getSequenceIndex(g.contig) == -1)
Logging.addError(s"Contig '${g.contig}' does not exist on reference: ${referenceFasta()}")
}
......
......@@ -150,7 +150,7 @@ trait MultisampleMappingTrait extends MultiSampleQScript
val dictOke: Boolean = {
var oke = true
try {
header.getSequenceDictionary.assertSameDictionary(referenceFile.getSequenceDictionary)
header.getSequenceDictionary.assertSameDictionary(referenceDict)
} catch {
case e: AssertionError =>
logger.error(e.getMessage)
......
......@@ -56,12 +56,7 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum
lazy val minScatterGenomeSize: Long = config("min_scatter_genome_size", default = 75000000)
lazy val enableScatter: Boolean = config("enable_scatter", default = {
val ref = new FastaSequenceFile(referenceFasta(), true)
val refLength = ref.getSequenceDictionary.getReferenceLength
ref.close()
refLength > minScatterGenomeSize
})
lazy val enableScatter: Boolean = config("enable_scatter", default = referenceDict.getReferenceLength > minScatterGenomeSize)
def sampleInfo: Map[String, Map[String, Any]] = root match {
case m: MultiSampleQScript => m.samples.map { case (sampleId, sample) => sampleId -> sample.sampleTags }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment