Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
biopet.biopet
Commits
3a1af12e
Commit
3a1af12e
authored
Oct 25, 2016
by
Peter van 't Hof
Browse files
Fix for BIOPET-395
parent
e6199e01
Changes
14
Hide whitespace changes
Inline
Side-by-side
biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunction.scala
View file @
3a1af12e
...
...
@@ -81,7 +81,7 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
this
match
{
case
r
:
Reference
=>
if
(
r
.
dictRequired
)
deps
:+=
r
.
referenceDict
if
(
r
.
dictRequired
)
deps
:+=
r
.
referenceDict
File
if
(
r
.
faiRequired
)
deps
:+=
r
.
referenceFai
deps
=
deps
.
distinct
case
_
=>
...
...
biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/Reference.scala
View file @
3a1af12e
...
...
@@ -16,9 +16,10 @@ package nl.lumc.sasc.biopet.core
import
java.io.File
import
htsjdk.samtools.reference.IndexedFastaSequenceFile
import
nl.lumc.sasc.biopet.core.summary.
{
SummaryQScript
,
Summarizable
}
import
nl.lumc.sasc.biopet.utils.
{
ConfigUtils
,
Logging
}
import
htsjdk.samtools.SAMSequenceDictionary
import
htsjdk.samtools.reference.
{
FastaSequenceFile
,
IndexedFastaSequenceFile
}
import
nl.lumc.sasc.biopet.core.summary.
{
Summarizable
,
SummaryQScript
}
import
nl.lumc.sasc.biopet.utils.
{
BamUtils
,
ConfigUtils
,
FastaUtils
,
Logging
}
import
nl.lumc.sasc.biopet.utils.config.
{
Config
,
Configurable
}
import
scala.collection.JavaConversions._
...
...
@@ -49,6 +50,8 @@ trait Reference extends Configurable {
}
}
def
referenceDict
=
FastaUtils
.
getCachedDict
(
referenceFasta
())
/** All config values will get a prefix */
override
def
subPath
=
{
referenceConfigPath
:::
super
.
subPath
...
...
@@ -66,7 +69,7 @@ trait Reference extends Configurable {
def
dictRequired
=
this
.
isInstanceOf
[
Summarizable
]
||
this
.
isInstanceOf
[
SummaryQScript
]
/** Returns the dict file belonging to the fasta file */
def
referenceDict
=
new
File
(
referenceFasta
().
getAbsolutePath
def
referenceDict
File
=
new
File
(
referenceFasta
().
getAbsolutePath
.
stripSuffix
(
".fa"
)
.
stripSuffix
(
".fasta"
)
.
stripSuffix
(
".fna"
)
+
".dict"
)
...
...
@@ -108,9 +111,8 @@ trait Reference extends Configurable {
/** Create summary part for reference */
def
referenceSummary
:
Map
[
String
,
Any
]
=
{
Reference
.
requireDict
(
referenceFasta
())
val
file
=
new
IndexedFastaSequenceFile
(
referenceFasta
())
Map
(
"contigs"
->
(
for
(
seq
<-
file
.
getSequ
enceDict
ionary
.
getSequences
)
yield
seq
.
getSequenceName
->
{
(
for
(
seq
<-
refer
enceDict
.
getSequences
)
yield
seq
.
getSequenceName
->
{
val
md5
=
Option
(
seq
.
getAttribute
(
"M5"
))
Map
(
"md5"
->
md5
,
"length"
->
seq
.
getSequenceLength
)
}).
toMap
,
...
...
biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SortVcf.scala
View file @
3a1af12e
...
...
@@ -37,7 +37,7 @@ class SortVcf(val root: Configurable) extends Picard with Reference {
override
def
beforeGraph
()
:
Unit
=
{
super
.
beforeGraph
()
if
(
sequenceDictionary
==
null
)
sequenceDictionary
=
referenceDict
if
(
sequenceDictionary
==
null
)
sequenceDictionary
=
referenceDict
File
}
/** Returns command to execute */
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GensToVcf.scala
View file @
3a1af12e
...
...
@@ -21,7 +21,7 @@ import htsjdk.samtools.reference.{ FastaSequenceFile, ReferenceSequenceFileFacto
import
htsjdk.variant.variantcontext.writer.
{
AsyncVariantContextWriter
,
VariantContextWriterBuilder
}
import
htsjdk.variant.variantcontext.
{
Allele
,
GenotypeBuilder
,
VariantContextBuilder
}
import
htsjdk.variant.vcf._
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
scala.collection.JavaConversions._
import
scala.io.Source
...
...
@@ -82,12 +82,11 @@ object GensToVcf extends ToolCommand {
metaLines
.
add
(
new
VCFFormatHeaderLine
(
"GT"
,
1
,
VCFHeaderLineType
.
String
,
""
))
metaLines
.
add
(
new
VCFFormatHeaderLine
(
"GP"
,
VCFHeaderLineCount
.
UNBOUNDED
,
VCFHeaderLineType
.
Float
,
""
))
val
reference
=
new
FastaSequenceFile
(
cmdArgs
.
referenceFasta
,
true
)
require
(
reference
.
getSequenceDictionary
.
getSequence
(
cmdArgs
.
contig
)
!=
null
,
require
(
FastaUtils
.
getCachedDict
(
cmdArgs
.
referenceFasta
).
getSequence
(
cmdArgs
.
contig
)
!=
null
,
s
"contig '${cmdArgs.contig}' not found on reference"
)
val
header
=
new
VCFHeader
(
metaLines
,
samples
.
toList
)
header
.
setSequenceDictionary
(
reference
.
getSequenceDictionary
)
header
.
setSequenceDictionary
(
FastaUtils
.
getCachedDict
(
cmdArgs
.
referenceFasta
)
)
val
writer
=
new
AsyncVariantContextWriter
(
new
VariantContextWriterBuilder
()
.
setOutputFile
(
cmdArgs
.
outputVcf
)
.
setReferenceDictionary
(
header
.
getSequenceDictionary
)
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/MergeAlleles.scala
View file @
3a1af12e
...
...
@@ -20,7 +20,7 @@ import htsjdk.samtools.reference.FastaSequenceFile
import
htsjdk.variant.variantcontext.writer.
{
AsyncVariantContextWriter
,
VariantContextWriterBuilder
}
import
htsjdk.variant.variantcontext.
{
Allele
,
VariantContext
,
VariantContextBuilder
}
import
htsjdk.variant.vcf.
{
VCFFileReader
,
VCFHeader
}
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
nl.lumc.sasc.biopet.utils.config.Configurable
import
scala.collection.JavaConversions._
...
...
@@ -52,13 +52,12 @@ object MergeAlleles extends ToolCommand {
val
commandArgs
:
Args
=
argsParser
.
parse
(
args
,
Args
())
getOrElse
(
throw
new
IllegalArgumentException
)
val
readers
=
commandArgs
.
inputFiles
.
map
(
new
VCFFileReader
(
_
,
true
))
val
referenceFile
=
new
FastaSequenceFile
(
commandArgs
.
reference
,
true
)
val
writer
=
new
AsyncVariantContextWriter
(
new
VariantContextWriterBuilder
().
setReferenceDictionary
(
referenceF
il
e
.
get
SequenceDictionary
).
setReferenceDictionary
(
FastaUt
il
s
.
get
CachedDict
(
commandArgs
.
reference
)
).
setOutputFile
(
commandArgs
.
outputFile
).
build
)
val
header
=
new
VCFHeader
val
referenceDict
=
referenceF
il
e
.
get
SequenceDictionary
val
referenceDict
=
FastaUt
il
s
.
get
CachedDict
(
commandArgs
.
reference
)
header
.
setSequenceDictionary
(
referenceDict
)
writer
.
writeHeader
(
header
)
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/SnptestToVcf.scala
View file @
3a1af12e
...
...
@@ -21,7 +21,7 @@ import htsjdk.samtools.reference.FastaSequenceFile
import
htsjdk.variant.variantcontext.writer.
{
AsyncVariantContextWriter
,
Options
,
VariantContextWriterBuilder
}
import
htsjdk.variant.variantcontext.
{
Allele
,
VariantContextBuilder
}
import
htsjdk.variant.vcf._
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
scala.collection.JavaConversions._
import
scala.io.Source
...
...
@@ -71,9 +71,8 @@ object SnptestToVcf extends ToolCommand {
}
def
writeEmptyVcf
(
outputVcf
:
File
,
referenceFasta
:
File
)
:
Unit
=
{
val
reference
=
new
FastaSequenceFile
(
referenceFasta
,
true
)
val
vcfHeader
=
new
VCFHeader
()
vcfHeader
.
setSequenceDictionary
(
reference
.
getSequenceDictionary
)
vcfHeader
.
setSequenceDictionary
(
FastaUtils
.
getCachedDict
(
referenceFasta
)
)
val
writer
=
new
VariantContextWriterBuilder
()
.
setOutputFile
(
outputVcf
)
.
setReferenceDictionary
(
vcfHeader
.
getSequenceDictionary
)
...
...
@@ -92,12 +91,11 @@ object SnptestToVcf extends ToolCommand {
key
<-
headerKeys
if
key
!=
"rsid"
if
key
!=
"chromosome"
if
key
!=
"position"
if
key
!=
"alleleA"
if
key
!=
"alleleB"
if
key
!=
"alleleA"
)
metaLines
.
add
(
new
VCFInfoHeaderLine
(
s
"ST_$key"
,
1
,
VCFHeaderLineType
.
String
,
""
))
val
reference
=
new
FastaSequenceFile
(
cmdArgs
.
referenceFasta
,
true
)
require
(
reference
.
getSequenceDictionary
.
getSequence
(
cmdArgs
.
contig
)
!=
null
,
require
(
FastaUtils
.
getCachedDict
(
cmdArgs
.
referenceFasta
).
getSequence
(
cmdArgs
.
contig
)
!=
null
,
s
"contig '${cmdArgs.contig}' not found on reference"
)
val
vcfHeader
=
new
VCFHeader
(
metaLines
)
vcfHeader
.
setSequenceDictionary
(
reference
.
getSequenceDictionary
)
vcfHeader
.
setSequenceDictionary
(
FastaUtils
.
getCachedDict
(
cmdArgs
.
referenceFasta
)
)
val
writer
=
new
AsyncVariantContextWriter
(
new
VariantContextWriterBuilder
()
.
setOutputFile
(
cmdArgs
.
outputVcf
)
.
setReferenceDictionary
(
vcfHeader
.
getSequenceDictionary
)
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala
View file @
3a1af12e
...
...
@@ -20,7 +20,7 @@ import htsjdk.samtools.reference.FastaSequenceFile
import
htsjdk.samtools.util.Interval
import
htsjdk.variant.variantcontext.
{
Allele
,
Genotype
,
VariantContext
}
import
htsjdk.variant.vcf.VCFFileReader
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
nl.lumc.sasc.biopet.utils.config.Configurable
import
nl.lumc.sasc.biopet.utils.intervals.BedRecordList
...
...
@@ -330,7 +330,7 @@ object VcfStats extends ToolCommand {
writeField
(
stats
,
"general"
,
cmdArgs
.
outputDir
)
for
(
field
<-
adInfoTags
.
distinct
.
par
)
{
writeField
(
stats
,
field
,
infoOutputDir
)
for
(
line
<-
new
Fasta
SequenceFile
(
cmdArgs
.
referenceFile
,
true
).
getSequenceDictionary
.
getSequences
)
{
for
(
line
<-
Fasta
Utils
.
getCachedDict
(
cmdArgs
.
referenceFile
)
.
getSequences
)
{
val
chr
=
line
.
getSequenceName
writeField
(
stats
,
field
,
new
File
(
infoOutputDir
,
"chrs"
+
File
.
separator
+
chr
),
chr
=
chr
)
}
...
...
@@ -341,7 +341,7 @@ object VcfStats extends ToolCommand {
writeGenotypeField
(
stats
,
samples
,
"general"
,
cmdArgs
.
outputDir
,
prefix
=
"genotype"
)
for
(
field
<-
adGenotypeTags
.
distinct
.
par
)
{
writeGenotypeField
(
stats
,
samples
,
field
,
genotypeOutputDir
)
for
(
line
<-
new
Fasta
SequenceFile
(
cmdArgs
.
referenceFile
,
true
).
getSequenceDictionary
.
getSequences
)
{
for
(
line
<-
Fasta
Utils
.
getCachedDict
(
cmdArgs
.
referenceFile
)
.
getSequences
)
{
val
chr
=
line
.
getSequenceName
writeGenotypeField
(
stats
,
samples
,
field
,
new
File
(
genotypeOutputDir
,
"chrs"
+
File
.
separator
+
chr
),
chr
=
chr
)
}
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfWithVcf.scala
View file @
3a1af12e
...
...
@@ -21,7 +21,7 @@ import htsjdk.samtools.reference.FastaSequenceFile
import
htsjdk.variant.variantcontext.
{
VariantContext
,
VariantContextBuilder
}
import
htsjdk.variant.variantcontext.writer.
{
AsyncVariantContextWriter
,
VariantContextWriterBuilder
}
import
htsjdk.variant.vcf._
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
nl.lumc.sasc.biopet.utils.VcfUtils.scalaListToJavaObjectArrayList
import
nl.lumc.sasc.biopet.utils.BamUtils.SamDictCheck
...
...
@@ -84,7 +84,7 @@ object VcfWithVcf extends ToolCommand {
val
reader
=
new
VCFFileReader
(
commandArgs
.
inputFile
)
val
secondaryReader
=
new
VCFFileReader
(
commandArgs
.
secondaryVcf
)
val
referenceDict
=
new
Fasta
SequenceFile
(
commandArgs
.
referenceFasta
,
true
).
getSequenceDictionary
val
referenceDict
=
Fasta
Utils
.
getCachedDict
(
commandArgs
.
referenceFasta
)
val
header
=
reader
.
getFileHeader
val
vcfDict
=
header
.
getSequenceDictionary
match
{
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStats.scala
View file @
3a1af12e
...
...
@@ -20,7 +20,7 @@ import java.util.concurrent.TimeoutException
import
htsjdk.samtools.reference.FastaSequenceFile
import
htsjdk.samtools.
{
SAMSequenceDictionary
,
SamReaderFactory
}
import
nl.lumc.sasc.biopet.utils.BamUtils.SamDictCheck
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
nl.lumc.sasc.biopet.utils.intervals.
{
BedRecord
,
BedRecordList
}
import
scala.collection.JavaConversions._
...
...
@@ -82,11 +82,8 @@ object BamStats extends ToolCommand {
val
samHeader
=
samReader
.
getFileHeader
samReader
.
close
()
referenceFasta
.
map
{
f
=>
val
referenceReader
=
new
FastaSequenceFile
(
f
,
true
)
val
referenceDict
=
referenceReader
.
getSequenceDictionary
samHeader
.
getSequenceDictionary
.
assertSameDictionary
(
referenceDict
,
false
)
referenceReader
.
close
()
referenceDict
samHeader
.
getSequenceDictionary
.
assertSameDictionary
(
FastaUtils
.
getCachedDict
(
f
),
false
)
FastaUtils
.
getCachedDict
(
f
)
}.
getOrElse
(
samHeader
.
getSequenceDictionary
)
}
...
...
biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/FastaUtils.scala
0 → 100644
View file @
3a1af12e
package
nl.lumc.sasc.biopet.utils
import
java.io.File
import
htsjdk.samtools.SAMSequenceDictionary
import
htsjdk.samtools.reference.FastaSequenceFile
/**
* Created by pjvan_thof on 25-10-16.
*/
object
FastaUtils
{
/**
* This method will get a dict from the fasta file. This will not use the cache
*
* @param fastaFile Fasta file
* @return sequence dict
*/
def
getDictFromFasta
(
fastaFile
:
File
)
:
SAMSequenceDictionary
=
{
val
referenceFile
=
new
FastaSequenceFile
(
fastaFile
,
true
)
val
dict
=
referenceFile
.
getSequenceDictionary
referenceFile
.
close
()
dictCache
+=
fastaFile
->
dict
dict
}
private
var
dictCache
:
Map
[
File
,
SAMSequenceDictionary
]
=
Map
()
/** This will clear the dict cache */
def
clearCache
()
=
dictCache
=
Map
()
/**
* This method will get a dict from the fasta file. If it's already in the cache file will not opened again.
*
* @param fastaFile Fasta file
* @return sequence dict
*/
def
getCachedDict
(
fastaFile
:
File
)
:
SAMSequenceDictionary
=
{
if
(!
dictCache
.
contains
(
fastaFile
))
dictCache
+=
fastaFile
->
getDictFromFasta
(
fastaFile
)
dictCache
(
fastaFile
)
}
}
biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala
View file @
3a1af12e
...
...
@@ -23,7 +23,7 @@ import scala.collection.JavaConversions._
import
scala.collection.mutable
import
scala.collection.mutable.ListBuffer
import
scala.io.Source
import
nl.lumc.sasc.biopet.utils.Logging
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
Logging
}
/**
* Created by pjvan_thof on 8/20/15.
...
...
@@ -100,8 +100,7 @@ case class BedRecordList(val chrRecords: Map[String, List[BedRecord]], val heade
)
def
validateContigs
(
reference
:
File
)
=
{
val
referenceFile
=
new
FastaSequenceFile
(
reference
,
true
)
val
dict
=
referenceFile
.
getSequenceDictionary
val
dict
=
FastaUtils
.
getCachedDict
(
reference
)
val
notExisting
=
chrRecords
.
keys
.
filter
(
dict
.
getSequence
(
_
)
==
null
).
toList
require
(
notExisting
.
isEmpty
,
s
"Contigs found in bed records but are not existing in reference: ${notExisting.mkString("
,
")}"
)
this
...
...
@@ -152,12 +151,7 @@ object BedRecordList {
}
}
def
fromReference
(
file
:
File
)
=
{
val
referenceFile
=
new
FastaSequenceFile
(
file
,
true
)
val
dict
=
referenceFile
.
getSequenceDictionary
referenceFile
.
close
()
fromDict
(
dict
)
}
def
fromReference
(
file
:
File
)
=
fromDict
(
FastaUtils
.
getCachedDict
(
file
))
def
fromDict
(
dict
:
SAMSequenceDictionary
)
=
{
fromList
(
for
(
contig
<-
dict
.
getSequences
)
yield
{
...
...
gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Impute2Vcf.scala
View file @
3a1af12e
...
...
@@ -56,7 +56,6 @@ class Impute2Vcf(val root: Configurable) extends QScript with BiopetQScript with
/** Init for pipeline */
def
init
()
:
Unit
=
{
inputGens
.
foreach
{
g
=>
val
referenceDict
=
new
FastaSequenceFile
(
referenceFasta
(),
true
).
getSequenceDictionary
if
(
referenceDict
.
getSequenceIndex
(
g
.
contig
)
==
-
1
)
Logging
.
addError
(
s
"Contig '${g.contig}' does not exist on reference: ${referenceFasta()}"
)
}
...
...
mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/MultisampleMapping.scala
View file @
3a1af12e
...
...
@@ -150,7 +150,7 @@ trait MultisampleMappingTrait extends MultiSampleQScript
val
dictOke
:
Boolean
=
{
var
oke
=
true
try
{
header
.
getSequenceDictionary
.
assertSameDictionary
(
reference
File
.
getSequenceDictionary
)
header
.
getSequenceDictionary
.
assertSameDictionary
(
reference
Dict
)
}
catch
{
case
e
:
AssertionError
=>
logger
.
error
(
e
.
getMessage
)
...
...
toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala
View file @
3a1af12e
...
...
@@ -56,12 +56,7 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum
lazy
val
minScatterGenomeSize
:
Long
=
config
(
"min_scatter_genome_size"
,
default
=
75000000
)
lazy
val
enableScatter
:
Boolean
=
config
(
"enable_scatter"
,
default
=
{
val
ref
=
new
FastaSequenceFile
(
referenceFasta
(),
true
)
val
refLength
=
ref
.
getSequenceDictionary
.
getReferenceLength
ref
.
close
()
refLength
>
minScatterGenomeSize
})
lazy
val
enableScatter
:
Boolean
=
config
(
"enable_scatter"
,
default
=
referenceDict
.
getReferenceLength
>
minScatterGenomeSize
)
def
sampleInfo
:
Map
[
String
,
Map
[
String
,
Any
]]
=
root
match
{
case
m
:
MultiSampleQScript
=>
m
.
samples
.
map
{
case
(
sampleId
,
sample
)
=>
sampleId
->
sample
.
sampleTags
}
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment