Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
biopet.biopet
Commits
18dd271e
Commit
18dd271e
authored
Oct 26, 2016
by
Sander Bollen
Browse files
Merge branch 'feature-dict_cache' into 'develop'
Fix for BIOPET-395 See merge request !463
parents
e79bb410
5c50b65a
Changes
14
Hide whitespace changes
Inline
Side-by-side
biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunction.scala
View file @
18dd271e
...
...
@@ -81,7 +81,7 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
this
match
{
case
r
:
Reference
=>
if
(
r
.
dictRequired
)
deps
:+=
r
.
referenceDict
if
(
r
.
dictRequired
)
deps
:+=
r
.
referenceDict
File
if
(
r
.
faiRequired
)
deps
:+=
r
.
referenceFai
deps
=
deps
.
distinct
case
_
=>
...
...
biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/Reference.scala
View file @
18dd271e
...
...
@@ -16,9 +16,10 @@ package nl.lumc.sasc.biopet.core
import
java.io.File
import
htsjdk.samtools.reference.IndexedFastaSequenceFile
import
nl.lumc.sasc.biopet.core.summary.
{
SummaryQScript
,
Summarizable
}
import
nl.lumc.sasc.biopet.utils.
{
ConfigUtils
,
Logging
}
import
htsjdk.samtools.SAMSequenceDictionary
import
htsjdk.samtools.reference.
{
FastaSequenceFile
,
IndexedFastaSequenceFile
}
import
nl.lumc.sasc.biopet.core.summary.
{
Summarizable
,
SummaryQScript
}
import
nl.lumc.sasc.biopet.utils.
{
BamUtils
,
ConfigUtils
,
FastaUtils
,
Logging
}
import
nl.lumc.sasc.biopet.utils.config.
{
Config
,
Configurable
}
import
scala.collection.JavaConversions._
...
...
@@ -49,6 +50,8 @@ trait Reference extends Configurable {
}
}
def
referenceDict
=
FastaUtils
.
getCachedDict
(
referenceFasta
())
/** All config values will get a prefix */
override
def
subPath
=
{
referenceConfigPath
:::
super
.
subPath
...
...
@@ -66,7 +69,7 @@ trait Reference extends Configurable {
def
dictRequired
=
this
.
isInstanceOf
[
Summarizable
]
||
this
.
isInstanceOf
[
SummaryQScript
]
/** Returns the dict file belonging to the fasta file */
def
referenceDict
=
new
File
(
referenceFasta
().
getAbsolutePath
def
referenceDict
File
=
new
File
(
referenceFasta
().
getAbsolutePath
.
stripSuffix
(
".fa"
)
.
stripSuffix
(
".fasta"
)
.
stripSuffix
(
".fna"
)
+
".dict"
)
...
...
@@ -108,9 +111,8 @@ trait Reference extends Configurable {
/** Create summary part for reference */
def
referenceSummary
:
Map
[
String
,
Any
]
=
{
Reference
.
requireDict
(
referenceFasta
())
val
file
=
new
IndexedFastaSequenceFile
(
referenceFasta
())
Map
(
"contigs"
->
(
for
(
seq
<-
file
.
getSequ
enceDict
ionary
.
getSequences
)
yield
seq
.
getSequenceName
->
{
(
for
(
seq
<-
refer
enceDict
.
getSequences
)
yield
seq
.
getSequenceName
->
{
val
md5
=
Option
(
seq
.
getAttribute
(
"M5"
))
Map
(
"md5"
->
md5
,
"length"
->
seq
.
getSequenceLength
)
}).
toMap
,
...
...
biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SortVcf.scala
View file @
18dd271e
...
...
@@ -37,7 +37,7 @@ class SortVcf(val root: Configurable) extends Picard with Reference {
override
def
beforeGraph
()
:
Unit
=
{
super
.
beforeGraph
()
if
(
sequenceDictionary
==
null
)
sequenceDictionary
=
referenceDict
if
(
sequenceDictionary
==
null
)
sequenceDictionary
=
referenceDict
File
}
/** Returns command to execute */
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GensToVcf.scala
View file @
18dd271e
...
...
@@ -21,7 +21,7 @@ import htsjdk.samtools.reference.{ FastaSequenceFile, ReferenceSequenceFileFacto
import
htsjdk.variant.variantcontext.writer.
{
AsyncVariantContextWriter
,
VariantContextWriterBuilder
}
import
htsjdk.variant.variantcontext.
{
Allele
,
GenotypeBuilder
,
VariantContextBuilder
}
import
htsjdk.variant.vcf._
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
scala.collection.JavaConversions._
import
scala.io.Source
...
...
@@ -82,12 +82,11 @@ object GensToVcf extends ToolCommand {
metaLines
.
add
(
new
VCFFormatHeaderLine
(
"GT"
,
1
,
VCFHeaderLineType
.
String
,
""
))
metaLines
.
add
(
new
VCFFormatHeaderLine
(
"GP"
,
VCFHeaderLineCount
.
UNBOUNDED
,
VCFHeaderLineType
.
Float
,
""
))
val
reference
=
new
FastaSequenceFile
(
cmdArgs
.
referenceFasta
,
true
)
require
(
reference
.
getSequenceDictionary
.
getSequence
(
cmdArgs
.
contig
)
!=
null
,
require
(
FastaUtils
.
getCachedDict
(
cmdArgs
.
referenceFasta
).
getSequence
(
cmdArgs
.
contig
)
!=
null
,
s
"contig '${cmdArgs.contig}' not found on reference"
)
val
header
=
new
VCFHeader
(
metaLines
,
samples
.
toList
)
header
.
setSequenceDictionary
(
reference
.
getSequenceDictionary
)
header
.
setSequenceDictionary
(
FastaUtils
.
getCachedDict
(
cmdArgs
.
referenceFasta
)
)
val
writer
=
new
AsyncVariantContextWriter
(
new
VariantContextWriterBuilder
()
.
setOutputFile
(
cmdArgs
.
outputVcf
)
.
setReferenceDictionary
(
header
.
getSequenceDictionary
)
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/MergeAlleles.scala
View file @
18dd271e
...
...
@@ -20,7 +20,7 @@ import htsjdk.samtools.reference.FastaSequenceFile
import
htsjdk.variant.variantcontext.writer.
{
AsyncVariantContextWriter
,
VariantContextWriterBuilder
}
import
htsjdk.variant.variantcontext.
{
Allele
,
VariantContext
,
VariantContextBuilder
}
import
htsjdk.variant.vcf.
{
VCFFileReader
,
VCFHeader
}
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
nl.lumc.sasc.biopet.utils.config.Configurable
import
scala.collection.JavaConversions._
...
...
@@ -52,13 +52,12 @@ object MergeAlleles extends ToolCommand {
val
commandArgs
:
Args
=
argsParser
.
parse
(
args
,
Args
())
getOrElse
(
throw
new
IllegalArgumentException
)
val
readers
=
commandArgs
.
inputFiles
.
map
(
new
VCFFileReader
(
_
,
true
))
val
referenceFile
=
new
FastaSequenceFile
(
commandArgs
.
reference
,
true
)
val
writer
=
new
AsyncVariantContextWriter
(
new
VariantContextWriterBuilder
().
setReferenceDictionary
(
referenceF
il
e
.
get
SequenceDictionary
).
setReferenceDictionary
(
FastaUt
il
s
.
get
CachedDict
(
commandArgs
.
reference
)
).
setOutputFile
(
commandArgs
.
outputFile
).
build
)
val
header
=
new
VCFHeader
val
referenceDict
=
referenceF
il
e
.
get
SequenceDictionary
val
referenceDict
=
FastaUt
il
s
.
get
CachedDict
(
commandArgs
.
reference
)
header
.
setSequenceDictionary
(
referenceDict
)
writer
.
writeHeader
(
header
)
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/SnptestToVcf.scala
View file @
18dd271e
...
...
@@ -21,7 +21,7 @@ import htsjdk.samtools.reference.FastaSequenceFile
import
htsjdk.variant.variantcontext.writer.
{
AsyncVariantContextWriter
,
Options
,
VariantContextWriterBuilder
}
import
htsjdk.variant.variantcontext.
{
Allele
,
VariantContextBuilder
}
import
htsjdk.variant.vcf._
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
scala.collection.JavaConversions._
import
scala.io.Source
...
...
@@ -71,9 +71,8 @@ object SnptestToVcf extends ToolCommand {
}
def
writeEmptyVcf
(
outputVcf
:
File
,
referenceFasta
:
File
)
:
Unit
=
{
val
reference
=
new
FastaSequenceFile
(
referenceFasta
,
true
)
val
vcfHeader
=
new
VCFHeader
()
vcfHeader
.
setSequenceDictionary
(
reference
.
getSequenceDictionary
)
vcfHeader
.
setSequenceDictionary
(
FastaUtils
.
getCachedDict
(
referenceFasta
)
)
val
writer
=
new
VariantContextWriterBuilder
()
.
setOutputFile
(
outputVcf
)
.
setReferenceDictionary
(
vcfHeader
.
getSequenceDictionary
)
...
...
@@ -92,12 +91,11 @@ object SnptestToVcf extends ToolCommand {
key
<-
headerKeys
if
key
!=
"rsid"
if
key
!=
"chromosome"
if
key
!=
"position"
if
key
!=
"alleleA"
if
key
!=
"alleleB"
if
key
!=
"alleleA"
)
metaLines
.
add
(
new
VCFInfoHeaderLine
(
s
"ST_$key"
,
1
,
VCFHeaderLineType
.
String
,
""
))
val
reference
=
new
FastaSequenceFile
(
cmdArgs
.
referenceFasta
,
true
)
require
(
reference
.
getSequenceDictionary
.
getSequence
(
cmdArgs
.
contig
)
!=
null
,
require
(
FastaUtils
.
getCachedDict
(
cmdArgs
.
referenceFasta
).
getSequence
(
cmdArgs
.
contig
)
!=
null
,
s
"contig '${cmdArgs.contig}' not found on reference"
)
val
vcfHeader
=
new
VCFHeader
(
metaLines
)
vcfHeader
.
setSequenceDictionary
(
reference
.
getSequenceDictionary
)
vcfHeader
.
setSequenceDictionary
(
FastaUtils
.
getCachedDict
(
cmdArgs
.
referenceFasta
)
)
val
writer
=
new
AsyncVariantContextWriter
(
new
VariantContextWriterBuilder
()
.
setOutputFile
(
cmdArgs
.
outputVcf
)
.
setReferenceDictionary
(
vcfHeader
.
getSequenceDictionary
)
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala
View file @
18dd271e
...
...
@@ -20,7 +20,7 @@ import htsjdk.samtools.reference.FastaSequenceFile
import
htsjdk.samtools.util.Interval
import
htsjdk.variant.variantcontext.
{
Allele
,
Genotype
,
VariantContext
}
import
htsjdk.variant.vcf.VCFFileReader
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
nl.lumc.sasc.biopet.utils.config.Configurable
import
nl.lumc.sasc.biopet.utils.intervals.BedRecordList
...
...
@@ -330,7 +330,7 @@ object VcfStats extends ToolCommand {
writeField
(
stats
,
"general"
,
cmdArgs
.
outputDir
)
for
(
field
<-
adInfoTags
.
distinct
.
par
)
{
writeField
(
stats
,
field
,
infoOutputDir
)
for
(
line
<-
new
Fasta
SequenceFile
(
cmdArgs
.
referenceFile
,
true
).
getSequenceDictionary
.
getSequences
)
{
for
(
line
<-
Fasta
Utils
.
getCachedDict
(
cmdArgs
.
referenceFile
)
.
getSequences
)
{
val
chr
=
line
.
getSequenceName
writeField
(
stats
,
field
,
new
File
(
infoOutputDir
,
"chrs"
+
File
.
separator
+
chr
),
chr
=
chr
)
}
...
...
@@ -341,7 +341,7 @@ object VcfStats extends ToolCommand {
writeGenotypeField
(
stats
,
samples
,
"general"
,
cmdArgs
.
outputDir
,
prefix
=
"genotype"
)
for
(
field
<-
adGenotypeTags
.
distinct
.
par
)
{
writeGenotypeField
(
stats
,
samples
,
field
,
genotypeOutputDir
)
for
(
line
<-
new
Fasta
SequenceFile
(
cmdArgs
.
referenceFile
,
true
).
getSequenceDictionary
.
getSequences
)
{
for
(
line
<-
Fasta
Utils
.
getCachedDict
(
cmdArgs
.
referenceFile
)
.
getSequences
)
{
val
chr
=
line
.
getSequenceName
writeGenotypeField
(
stats
,
samples
,
field
,
new
File
(
genotypeOutputDir
,
"chrs"
+
File
.
separator
+
chr
),
chr
=
chr
)
}
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfWithVcf.scala
View file @
18dd271e
...
...
@@ -21,7 +21,7 @@ import htsjdk.samtools.reference.FastaSequenceFile
import
htsjdk.variant.variantcontext.
{
VariantContext
,
VariantContextBuilder
}
import
htsjdk.variant.variantcontext.writer.
{
AsyncVariantContextWriter
,
VariantContextWriterBuilder
}
import
htsjdk.variant.vcf._
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
nl.lumc.sasc.biopet.utils.VcfUtils.scalaListToJavaObjectArrayList
import
nl.lumc.sasc.biopet.utils.BamUtils.SamDictCheck
...
...
@@ -84,7 +84,7 @@ object VcfWithVcf extends ToolCommand {
val
reader
=
new
VCFFileReader
(
commandArgs
.
inputFile
)
val
secondaryReader
=
new
VCFFileReader
(
commandArgs
.
secondaryVcf
)
val
referenceDict
=
new
Fasta
SequenceFile
(
commandArgs
.
referenceFasta
,
true
).
getSequenceDictionary
val
referenceDict
=
Fasta
Utils
.
getCachedDict
(
commandArgs
.
referenceFasta
)
val
header
=
reader
.
getFileHeader
val
vcfDict
=
header
.
getSequenceDictionary
match
{
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStats.scala
View file @
18dd271e
...
...
@@ -20,7 +20,7 @@ import java.util.concurrent.TimeoutException
import
htsjdk.samtools.reference.FastaSequenceFile
import
htsjdk.samtools.
{
SAMSequenceDictionary
,
SamReaderFactory
}
import
nl.lumc.sasc.biopet.utils.BamUtils.SamDictCheck
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
nl.lumc.sasc.biopet.utils.intervals.
{
BedRecord
,
BedRecordList
}
import
scala.collection.JavaConversions._
...
...
@@ -82,11 +82,8 @@ object BamStats extends ToolCommand {
val
samHeader
=
samReader
.
getFileHeader
samReader
.
close
()
referenceFasta
.
map
{
f
=>
val
referenceReader
=
new
FastaSequenceFile
(
f
,
true
)
val
referenceDict
=
referenceReader
.
getSequenceDictionary
samHeader
.
getSequenceDictionary
.
assertSameDictionary
(
referenceDict
,
false
)
referenceReader
.
close
()
referenceDict
samHeader
.
getSequenceDictionary
.
assertSameDictionary
(
FastaUtils
.
getCachedDict
(
f
),
false
)
FastaUtils
.
getCachedDict
(
f
)
}.
getOrElse
(
samHeader
.
getSequenceDictionary
)
}
...
...
biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/FastaUtils.scala
0 → 100644
View file @
18dd271e
package
nl.lumc.sasc.biopet.utils
import
java.io.File
import
htsjdk.samtools.SAMSequenceDictionary
import
htsjdk.samtools.reference.FastaSequenceFile
/**
* Created by pjvan_thof on 25-10-16.
*/
object
FastaUtils
{
/**
* This method will get a dict from the fasta file. This will not use the cache
*
* @param fastaFile Fasta file
* @return sequence dict
*/
def
getDictFromFasta
(
fastaFile
:
File
)
:
SAMSequenceDictionary
=
{
val
referenceFile
=
new
FastaSequenceFile
(
fastaFile
,
true
)
val
dict
=
referenceFile
.
getSequenceDictionary
referenceFile
.
close
()
dictCache
+=
fastaFile
->
dict
dict
}
private
var
dictCache
:
Map
[
File
,
SAMSequenceDictionary
]
=
Map
()
/** This will clear the dict cache */
def
clearCache
()
=
{
dictCache
=
Map
()
}
/**
* This method will get a dict from the fasta file. If it's already in the cache file will not opened again.
*
* @param fastaFile Fasta file
* @return sequence dict
*/
def
getCachedDict
(
fastaFile
:
File
)
:
SAMSequenceDictionary
=
{
if
(!
dictCache
.
contains
(
fastaFile
))
getDictFromFasta
(
fastaFile
)
else
dictCache
(
fastaFile
)
}
}
biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala
View file @
18dd271e
...
...
@@ -23,7 +23,7 @@ import scala.collection.JavaConversions._
import
scala.collection.mutable
import
scala.collection.mutable.ListBuffer
import
scala.io.Source
import
nl.lumc.sasc.biopet.utils.Logging
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
Logging
}
/**
* Created by pjvan_thof on 8/20/15.
...
...
@@ -100,8 +100,7 @@ case class BedRecordList(val chrRecords: Map[String, List[BedRecord]], val heade
)
def
validateContigs
(
reference
:
File
)
=
{
val
referenceFile
=
new
FastaSequenceFile
(
reference
,
true
)
val
dict
=
referenceFile
.
getSequenceDictionary
val
dict
=
FastaUtils
.
getCachedDict
(
reference
)
val
notExisting
=
chrRecords
.
keys
.
filter
(
dict
.
getSequence
(
_
)
==
null
).
toList
require
(
notExisting
.
isEmpty
,
s
"Contigs found in bed records but are not existing in reference: ${notExisting.mkString("
,
")}"
)
this
...
...
@@ -152,12 +151,7 @@ object BedRecordList {
}
}
def
fromReference
(
file
:
File
)
=
{
val
referenceFile
=
new
FastaSequenceFile
(
file
,
true
)
val
dict
=
referenceFile
.
getSequenceDictionary
referenceFile
.
close
()
fromDict
(
dict
)
}
def
fromReference
(
file
:
File
)
=
fromDict
(
FastaUtils
.
getCachedDict
(
file
))
def
fromDict
(
dict
:
SAMSequenceDictionary
)
=
{
fromList
(
for
(
contig
<-
dict
.
getSequences
)
yield
{
...
...
gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Impute2Vcf.scala
View file @
18dd271e
...
...
@@ -56,7 +56,6 @@ class Impute2Vcf(val root: Configurable) extends QScript with BiopetQScript with
/** Init for pipeline */
def
init
()
:
Unit
=
{
inputGens
.
foreach
{
g
=>
val
referenceDict
=
new
FastaSequenceFile
(
referenceFasta
(),
true
).
getSequenceDictionary
if
(
referenceDict
.
getSequenceIndex
(
g
.
contig
)
==
-
1
)
Logging
.
addError
(
s
"Contig '${g.contig}' does not exist on reference: ${referenceFasta()}"
)
}
...
...
mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/MultisampleMapping.scala
View file @
18dd271e
...
...
@@ -150,7 +150,7 @@ trait MultisampleMappingTrait extends MultiSampleQScript
val
dictOke
:
Boolean
=
{
var
oke
=
true
try
{
header
.
getSequenceDictionary
.
assertSameDictionary
(
reference
File
.
getSequenceDictionary
)
header
.
getSequenceDictionary
.
assertSameDictionary
(
reference
Dict
)
}
catch
{
case
e
:
AssertionError
=>
logger
.
error
(
e
.
getMessage
)
...
...
toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala
View file @
18dd271e
...
...
@@ -56,12 +56,7 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum
lazy
val
minScatterGenomeSize
:
Long
=
config
(
"min_scatter_genome_size"
,
default
=
75000000
)
lazy
val
enableScatter
:
Boolean
=
config
(
"enable_scatter"
,
default
=
{
val
ref
=
new
FastaSequenceFile
(
referenceFasta
(),
true
)
val
refLength
=
ref
.
getSequenceDictionary
.
getReferenceLength
ref
.
close
()
refLength
>
minScatterGenomeSize
})
lazy
val
enableScatter
:
Boolean
=
config
(
"enable_scatter"
,
default
=
referenceDict
.
getReferenceLength
>
minScatterGenomeSize
)
def
sampleInfo
:
Map
[
String
,
Map
[
String
,
Any
]]
=
root
match
{
case
m
:
MultiSampleQScript
=>
m
.
samples
.
map
{
case
(
sampleId
,
sample
)
=>
sampleId
->
sample
.
sampleTags
}
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment