Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
biopet.biopet
Commits
2fd247f5
Commit
2fd247f5
authored
Jul 18, 2017
by
pjvan_thof
Browse files
Remove the old version and replace it with the Spark version
parent
7d053e65
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStats.scala
View file @
2fd247f5
This diff is collapsed.
Click to expand it.
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStatsArgs.scala
View file @
2fd247f5
...
...
@@ -2,12 +2,7 @@ package nl.lumc.sasc.biopet.tools.vcfstats
import
java.io.File
import
nl.lumc.sasc.biopet.tools.vcfstats.VcfStats.
{
defaultGenotypeFields
,
defaultInfoFields
,
generalWiggleOptions
,
genotypeWiggleOptions
}
import
nl.lumc.sasc.biopet.tools.vcfstats.VcfStats.
{
defaultGenotypeFields
,
defaultInfoFields
}
import
nl.lumc.sasc.biopet.utils.AbstractOptParser
/**
...
...
@@ -26,8 +21,6 @@ case class VcfStatsArgs(inputFile: File = null,
binSize
:
Int
=
10000000
,
maxContigsInSingleJob
:
Int
=
250
,
writeBinStats
:
Boolean
=
false
,
generalWiggle
:
List
[
String
]
=
Nil
,
genotypeWiggle
:
List
[
String
]
=
Nil
,
localThreads
:
Int
=
1
,
sparkMaster
:
Option
[
String
]
=
None
,
contigSampleOverlapPlots
:
Boolean
=
false
)
...
...
@@ -80,18 +73,6 @@ class VcfStatsOptParser(cmdName: String) extends AbstractOptParser[VcfStatsArgs]
opt
[
Unit
](
"writeBinStats"
)
unbounded
()
action
{
(
_
,
c
)
=>
c
.
copy
(
writeBinStats
=
true
)
}
text
"Write bin statistics. Default False"
opt
[
String
](
"generalWiggle"
)
unbounded
()
action
{
(
x
,
c
)
=>
c
.
copy
(
generalWiggle
=
x
::
c
.
generalWiggle
,
writeBinStats
=
true
)
}
validate
{
x
=>
if
(
generalWiggleOptions
.
contains
(
x
))
success
else
failure
(
s
"""Nonexistent field $x"""
)
}
text
s
"""Create a wiggle track with bin size <binSize> for any of the following statistics:
|${generalWiggleOptions.mkString(", ")}"""
.
stripMargin
opt
[
String
](
"genotypeWiggle"
)
unbounded
()
action
{
(
x
,
c
)
=>
c
.
copy
(
genotypeWiggle
=
x
::
c
.
genotypeWiggle
,
writeBinStats
=
true
)
}
validate
{
x
=>
if
(
genotypeWiggleOptions
.
contains
(
x
))
success
else
failure
(
s
"""Non-existent field $x"""
)
}
text
s
"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
|${genotypeWiggleOptions.mkString(", ")}"""
.
stripMargin
opt
[
Int
](
't'
,
"localThreads"
)
unbounded
()
action
{
(
x
,
c
)
=>
c
.
copy
(
localThreads
=
x
)
}
text
s
"Number of local threads to use"
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStatsSpark.scala
deleted
100644 → 0
View file @
7d053e65
package
nl.lumc.sasc.biopet.tools.vcfstats
import
java.io.
{
File
,
PrintWriter
}
import
java.net.URLClassLoader
import
htsjdk.variant.vcf.VCFFileReader
import
nl.lumc.sasc.biopet.tools.vcfstats.VcfStats._
import
nl.lumc.sasc.biopet.utils.intervals.
{
BedRecord
,
BedRecordList
}
import
nl.lumc.sasc.biopet.utils.
{
ConfigUtils
,
ToolCommand
,
VcfUtils
}
import
org.apache.spark.
{
SparkConf
,
SparkContext
}
import
scala.collection.JavaConversions._
/**
* Created by pjvanthof on 14/07/2017.
*/
/**
 * Spark-based driver for computing VCF statistics.
 *
 * Scatters the reference (or a supplied interval list) into bins, computes
 * per-bin [[Stats]] in parallel on a Spark cluster (or `local[n]`), merges
 * them, and writes the aggregate as JSON plus per-sample overlap tables.
 */
object VcfStatsSpark extends ToolCommand {

  // Alias so helper signatures below can refer to the parsed CLI arguments.
  type Args = VcfStatsArgs

  /**
   * Tool entry point: parse arguments, validate requested info/genotype tags
   * against the VCF header, run the binned statistics job on Spark, and write
   * all outputs to `cmdArgs.outputDir`.
   *
   * @param args raw command-line arguments
   * @throws IllegalArgumentException when argument parsing fails
   */
  def main(args: Array[String]): Unit = {
    logger.info("Started")
    val argsParser = new VcfStatsOptParser(commandName)
    // Parsing failure aborts the run with an exception (scopt already printed usage).
    val cmdArgs = argsParser.parse(args, VcfStatsArgs()) getOrElse (throw new IllegalArgumentException)

    // Open the (indexed) VCF only to capture its header; the header object
    // stays usable after the reader is closed.
    val reader = new VCFFileReader(cmdArgs.inputFile, true)
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder.toList
    reader.close()

    // Info tags to track: explicitly requested tags (validated against the
    // header), plus — when --allInfoTags is set — every header tag not already
    // covered, plus the built-in defaults.
    val adInfoTags = {
      (for (infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)) yield {
        require(header.getInfoHeaderLine(infoTag) != null,
                "Info tag '" + infoTag + "' not found in header of vcf file")
        infoTag
      }) ::: (for (line <- header.getInfoHeaderLines
                   if cmdArgs.allInfoTags
                   if !defaultInfoFields.contains(line.getID)
                   if !cmdArgs.infoTags.contains(line.getID)) yield {
        line.getID
      }).toList ::: defaultInfoFields
    }

    // Same selection logic for FORMAT/genotype tags; "PL" is always excluded
    // from the --allGenotypeTags expansion.
    val adGenotypeTags = (for (genotypeTag <- cmdArgs.genotypeTags
                               if !defaultGenotypeFields.contains(genotypeTag)) yield {
      require(header.getFormatHeaderLine(genotypeTag) != null,
              "Format tag '" + genotypeTag + "' not found in header of vcf file")
      genotypeTag
    }) ::: (for (line <- header.getFormatHeaderLines
                 if cmdArgs.allGenotypeTags
                 if !defaultGenotypeFields.contains(line.getID)
                 if !cmdArgs.genotypeTags.contains(line.getID)
                 if line.getID != "PL") yield {
      line.getID
    }).toList ::: defaultGenotypeFields

    logger.info("Init spark context")

    // Ship the current classpath to the executors so the tool's own classes
    // resolve on the cluster.
    // NOTE(review): casting the system class loader to URLClassLoader only
    // works on Java 8 — this throws ClassCastException on Java 9+.
    val jars = ClassLoader.getSystemClassLoader.asInstanceOf[URLClassLoader].getURLs.map(_.getFile)
    // Default to an in-process local master sized by --localThreads when no
    // --sparkMaster is given.
    val conf = new SparkConf()
      .setAppName(commandName)
      .setMaster(cmdArgs.sparkMaster.getOrElse(s"local[${cmdArgs.localThreads}]"))
      .setJars(jars)
    val sc = new SparkContext(conf)
    logger.info("Spark context is up")

    // Work units: either the supplied interval file (validated against the
    // reference) or the whole reference, overlap-merged and scattered into
    // bins of --binSize, capped at --maxContigsInSingleJob contigs per job.
    val regions = (cmdArgs.intervals match {
      case Some(i) =>
        BedRecordList.fromFile(i).validateContigs(cmdArgs.referenceFile)
      case _ => BedRecordList.fromReference(cmdArgs.referenceFile)
    }).combineOverlap
      .scatter(cmdArgs.binSize, maxContigsInSingleJob = Some(cmdArgs.maxContigsInSingleJob))
    val contigs = regions.flatMap(_.map(_.chr)).distinct

    // One Spark partition per bin; each partition computes Stats for its bin.
    val regionStats = sc.parallelize(regions, regions.size).map(readBin(_, samples, cmdArgs, adInfoTags, adGenotypeTags))

    // NOTE(review): `+=` appears to merge in place and return the mutated
    // left operand; acceptable inside reduce since each partition's Stats is
    // private to the job — confirm against Stats' contract.
    val totalStats = regionStats.reduce(_ += _)

    val allWriter = new PrintWriter(new File(cmdArgs.outputDir, "stats.json"))
    val json = ConfigUtils.mapToJson(totalStats.getAllStats(contigs, samples, adGenotypeTags, adInfoTags, sampleDistributions))
    allWriter.println(json.nospaces)
    allWriter.close()

    //TODO: write wig files

    // Pairwise sample comparison matrices.
    writeOverlap(totalStats, _.genotypeOverlap, cmdArgs.outputDir + "/sample_compare/genotype_overlap", samples)
    writeOverlap(totalStats, _.alleleOverlap, cmdArgs.outputDir + "/sample_compare/allele_overlap", samples)

    sc.stop
    logger.info("Done")
  }

  /**
   * Computes [[Stats]] for one scatter bin. Runs on a Spark executor, so it
   * opens its own reader against the (indexed) input VCF.
   *
   * For an empty region the default/zero statistics are still merged in so
   * every bin contributes a complete record. For a non-empty region every
   * overlapping variant updates the general stats, each sample's genotype
   * stats, and the pairwise sample-overlap counters.
   *
   * @param bedRecords     regions belonging to this bin
   * @param samples        sample names, in header order
   * @param cmdArgs        parsed CLI arguments (provides the input VCF path)
   * @param adInfoTags     INFO tags to aggregate
   * @param adGenotypeTags FORMAT tags to aggregate
   * @return accumulated statistics for the whole bin
   */
  def readBin(bedRecords: List[BedRecord],
              samples: List[String],
              cmdArgs: Args,
              adInfoTags: List[String],
              adGenotypeTags: List[String]): Stats = {
    val reader = new VCFFileReader(cmdArgs.inputFile, true)

    // NOTE(review): chunkCounter is incremented but never read — dead state.
    var chunkCounter = 0
    val stats = Stats.emptyStats(samples)

    for (bedRecord <- bedRecords) {
      logger.info("Starting on: " + bedRecord)

      val samInterval = bedRecord.toSamInterval

      val query = reader.query(samInterval.getContig, samInterval.getStart, samInterval.getEnd)
      if (!query.hasNext) {
        // No variants in this region: still merge the empty/default maps so
        // the bin's stats structure is fully populated.
        Stats.mergeNestedStatsMap(stats.generalStats, fillGeneral(adInfoTags))
        for (sample <- samples) yield {
          Stats.mergeNestedStatsMap(stats.samplesStats(sample).genotypeStats, fillGenotype(adGenotypeTags))
        }
        chunkCounter += 1
      }

      // Guard on getStart keeps variants that merely overlap the interval's
      // tail from being counted in two adjacent bins.
      for (record <- query if record.getStart <= samInterval.getEnd) {
        Stats.mergeNestedStatsMap(stats.generalStats, checkGeneral(record, adInfoTags))
        for (sample1 <- samples) yield {
          val genotype = record.getGenotype(sample1)
          Stats.mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats,
                                    checkGenotype(record, genotype, adGenotypeTags))
          for (sample2 <- samples) {
            val genotype2 = record.getGenotype(sample2)
            // Brace-less if: only the genotypeOverlap increment is
            // conditional; alleleOverlap accumulates for every sample pair.
            if (genotype.getAlleles == genotype2.getAlleles)
              stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
            stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += VcfUtils
              .alleleOverlap(genotype.getAlleles.toList, genotype2.getAlleles.toList)
          }
        }
        chunkCounter += 1
      }
    }
    reader.close()

    stats
  }
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment