Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
biopet.biopet
Commits
2c2e3865
Commit
2c2e3865
authored
Jul 15, 2017
by
Peter van 't Hof
Browse files
Removing some meta files from jar
parent
38103728
Changes
4
Hide whitespace changes
Inline
Side-by-side
biopet-package/pom.xml
View file @
2c2e3865
...
...
@@ -155,6 +155,14 @@
</transformer>
</transformers>
<filters>
<filter>
<artifact>
*:*
</artifact>
<excludes>
<exclude>
META-INF/*.SF
</exclude>
<exclude>
META-INF/*.DSA
</exclude>
<exclude>
META-INF/*.RSA
</exclude>
</excludes>
</filter>
</filters>
</configuration>
<executions>
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/Stats.scala
View file @
2c2e3865
...
...
@@ -163,13 +163,20 @@ case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.M
"info"
->
infoFields
.
map
(
f
=>
f
->
getField
(
f
,
contig
)).
toMap
,
"sample_distributions"
->
sampleDistributions
.
map
(
f
=>
f
->
getField
(
"SampleDistribution-"
+
f
,
contig
))
.
toMap
,
"sample_compare"
->
Map
(
"samples"
->
samples
,
"genotype_overlap"
->
samples
.
map
(
sample1
=>
samples
.
map
(
sample2
=>
samplesStats
(
sample1
).
sampleToSample
(
sample2
).
genotypeOverlap
)),
"allele_overlap"
->
samples
.
map
(
sample1
=>
samples
.
map
(
sample2
=>
samplesStats
(
sample1
).
sampleToSample
(
sample2
).
alleleOverlap
))
)
)
.
toMap
)
++
(
if
(
contig
==
"total"
)
Map
(
"sample_compare"
->
Map
(
"samples"
->
samples
,
"genotype_overlap"
->
samples
.
map
(
sample1
=>
samples
.
map
(
sample2
=>
samplesStats
(
sample1
).
sampleToSample
(
sample2
).
genotypeOverlap
)),
"allele_overlap"
->
samples
.
map
(
sample1
=>
samples
.
map
(
sample2
=>
samplesStats
(
sample1
).
sampleToSample
(
sample2
).
alleleOverlap
))
)
)
else
Map
())
}
/** This will generate stats for total */
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStats.scala
View file @
2c2e3865
...
...
@@ -81,18 +81,18 @@ object VcfStats extends ToolCommand {
)
val
genotypeWiggleOptions
=
List
(
"Total"
,
"Het"
,
"HetNonRef"
,
"Hom"
,
"HomRef"
,
"HomVar"
,
"Mixed"
,
"NoCall"
,
"NonInformative"
,
"Available"
,
"Called"
,
"Filtered"
,
"Variant"
)
"Het"
,
"HetNonRef"
,
"Hom"
,
"HomRef"
,
"HomVar"
,
"Mixed"
,
"NoCall"
,
"NonInformative"
,
"Available"
,
"Called"
,
"Filtered"
,
"Variant"
)
/** Parsing commandline arguments */
class
OptParser
extends
AbstractOptParser
{
...
...
@@ -149,16 +149,16 @@ object VcfStats extends ToolCommand {
if
(
genotypeWiggleOptions
.
contains
(
x
))
success
else
failure
(
s
"""Non-existent field $x"""
)
}
text
s
"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
|${genotypeWiggleOptions.mkString(", ")}"""
.
stripMargin
opt
[
Int
](
't'
,
"localThreads"
)
unbounded
()
action
{
(
x
,
c
)
=>
opt
[
Int
](
't'
,
"localThreads"
)
unbounded
()
action
{
(
x
,
c
)
=>
c
.
copy
(
localThreads
=
x
)
}
text
s
"Number of local threads to use"
opt
[
String
](
"sparkMaster"
)
unbounded
()
action
{
(
x
,
c
)
=>
opt
[
String
](
"sparkMaster"
)
unbounded
()
action
{
(
x
,
c
)
=>
c
.
copy
(
sparkMaster
=
Some
(
x
))
}
text
s
"Spark master to use"
}
protected
var
cmdArgs
:
Args
=
_
//
protected var cmdArgs: Args = _
val
defaultGenotypeFields
=
List
(
"DP"
,
"GQ"
,
"AD"
,
"AD-ref"
,
"AD-alt"
,
"AD-used"
,
"AD-not_used"
,
"general"
)
...
...
@@ -184,7 +184,7 @@ object VcfStats extends ToolCommand {
def
main
(
args
:
Array
[
String
])
:
Unit
=
{
logger
.
info
(
"Started"
)
val
argsParser
=
new
OptParser
cmdArgs
=
argsParser
.
parse
(
args
,
Args
())
getOrElse
(
throw
new
IllegalArgumentException
)
val
cmdArgs
=
argsParser
.
parse
(
args
,
Args
())
getOrElse
(
throw
new
IllegalArgumentException
)
logger
.
info
(
"Init spark context"
)
...
...
@@ -334,14 +334,14 @@ object VcfStats extends ToolCommand {
// Write general wiggle tracks
for
(
field
<-
cmdArgs
.
generalWiggle
)
{
val
file
=
new
File
(
cmdArgs
.
outputDir
,
"wigs"
+
File
.
separator
+
"general-"
+
field
+
".wig"
)
writeWiggle
(
intervals
,
field
,
"count"
,
file
,
genotype
=
false
)
writeWiggle
(
intervals
,
field
,
"count"
,
file
,
genotype
=
false
,
cmdArgs
.
outputDir
)
}
// Write sample wiggle tracks
for
(
field
<-
cmdArgs
.
genotypeWiggle
;
sample
<-
samples
)
{
val
file
=
new
File
(
cmdArgs
.
outputDir
,
"wigs"
+
File
.
separator
+
"genotype-"
+
sample
+
"-"
+
field
+
".wig"
)
writeWiggle
(
intervals
,
field
,
sample
,
file
,
genotype
=
true
)
writeWiggle
(
intervals
,
field
,
sample
,
file
,
genotype
=
true
,
cmdArgs
.
outputDir
)
}
writeOverlap
(
stats
,
...
...
@@ -362,7 +362,8 @@ object VcfStats extends ToolCommand {
row
:
String
,
column
:
String
,
outputFile
:
File
,
genotype
:
Boolean
)
:
Unit
=
{
genotype
:
Boolean
,
outputDir
:
File
)
:
Unit
=
{
val
groupedIntervals
=
intervals
.
groupBy
(
_
.
getContig
).
map
{
case
(
k
,
v
)
=>
k
->
v
.
sortBy
(
_
.
getStart
)
}
outputFile
.
getParentFile
.
mkdirs
()
...
...
@@ -375,11 +376,11 @@ object VcfStats extends ToolCommand {
val
file
=
{
if
(
genotype
)
new
File
(
cmdArgs
.
outputDir
,
outputDir
,
"bins"
+
File
.
separator
+
chr
+
File
.
separator
+
"genotype-"
+
interval
.
getStart
+
"-"
+
interval
.
getEnd
+
"-general.tsv"
)
else
new
File
(
cmdArgs
.
outputDir
,
outputDir
,
"bins"
+
File
.
separator
+
chr
+
File
.
separator
+
interval
.
getStart
+
"-"
+
interval
.
getEnd
+
"-general.tsv"
)
}
writer
.
println
(
valueFromTsv
(
file
,
row
,
column
).
getOrElse
(
0
))
...
...
@@ -639,7 +640,8 @@ object VcfStats extends ToolCommand {
def
writeOverlap
(
stats
:
Stats
,
function
:
SampleToSampleStats
=>
Int
,
prefix
:
String
,
samples
:
List
[
String
])
:
Unit
=
{
samples
:
List
[
String
],
plots
:
Boolean
=
true
)
:
Unit
=
{
val
absFile
=
new
File
(
prefix
+
".abs.tsv"
)
val
relFile
=
new
File
(
prefix
+
".rel.tsv"
)
...
...
@@ -662,7 +664,7 @@ object VcfStats extends ToolCommand {
absWriter
.
close
()
relWriter
.
close
()
plotHeatmap
(
relFile
)
if
(
plots
)
plotHeatmap
(
relFile
)
}
/** Plots heatmaps on target tsv file */
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStatsSpark.scala
View file @
2c2e3865
...
...
@@ -9,7 +9,9 @@ import nl.lumc.sasc.biopet.utils.intervals.{BedRecord, BedRecordList}
import
org.apache.spark.
{
SparkConf
,
SparkContext
}
import
scala.collection.JavaConversions._
import
scala.concurrent.
{
Await
,
Future
}
import
scala.concurrent.ExecutionContext.Implicits.global
import
scala.concurrent.duration.Duration
/**
* Created by pjvanthof on 14/07/2017.
...
...
@@ -73,15 +75,14 @@ object VcfStatsSpark extends ToolCommand {
if
(
genotypeWiggleOptions
.
contains
(
x
))
success
else
failure
(
s
"""Non-existent field $x"""
)
}
text
s
"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
|${genotypeWiggleOptions.mkString(", ")}"""
.
stripMargin
opt
[
Int
](
't'
,
"localThreads"
)
unbounded
()
action
{
(
x
,
c
)
=>
opt
[
Int
](
't'
,
"localThreads"
)
unbounded
()
action
{
(
x
,
c
)
=>
c
.
copy
(
localThreads
=
x
)
}
text
s
"Number of local threads to use"
opt
[
String
](
"sparkMaster"
)
unbounded
()
action
{
(
x
,
c
)
=>
opt
[
String
](
"sparkMaster"
)
unbounded
()
action
{
(
x
,
c
)
=>
c
.
copy
(
sparkMaster
=
Some
(
x
))
}
text
s
"Spark master to use"
}
def
main
(
args
:
Array
[
String
])
:
Unit
=
{
logger
.
info
(
"Started"
)
...
...
@@ -97,7 +98,7 @@ object VcfStatsSpark extends ToolCommand {
val
adInfoTags
=
{
(
for
(
infoTag
<-
cmdArgs
.
infoTags
if
!
defaultInfoFields
.
contains
(
infoTag
))
yield
{
require
(
header
.
getInfoHeaderLine
(
infoTag
)
!=
null
,
"Info tag '"
+
infoTag
+
"' not found in header of vcf file"
)
"Info tag '"
+
infoTag
+
"' not found in header of vcf file"
)
infoTag
})
:::
(
for
(
line
<-
header
.
getInfoHeaderLines
if
cmdArgs
.
allInfoTags
if
!
defaultInfoFields
.
contains
(
line
.
getID
)
...
...
@@ -109,7 +110,7 @@ object VcfStatsSpark extends ToolCommand {
val
adGenotypeTags
=
(
for
(
genotypeTag
<-
cmdArgs
.
genotypeTags
if
!
defaultGenotypeFields
.
contains
(
genotypeTag
))
yield
{
require
(
header
.
getFormatHeaderLine
(
genotypeTag
)
!=
null
,
"Format tag '"
+
genotypeTag
+
"' not found in header of vcf file"
)
"Format tag '"
+
genotypeTag
+
"' not found in header of vcf file"
)
genotypeTag
})
:::
(
for
(
line
<-
header
.
getFormatHeaderLines
if
cmdArgs
.
allGenotypeTags
if
!
defaultGenotypeFields
.
contains
(
line
.
getID
)
...
...
@@ -118,7 +119,6 @@ object VcfStatsSpark extends ToolCommand {
line
.
getID
}).
toList
:::
defaultGenotypeFields
logger
.
info
(
"Init spark context"
)
val
conf
=
new
SparkConf
()
...
...
@@ -135,11 +135,31 @@ object VcfStatsSpark extends ToolCommand {
.
scatter
(
cmdArgs
.
binSize
)
.
flatten
val
regionStats
=
sc
.
parallelize
(
regions
,
regions
.
size
).
groupBy
(
_
.
chr
).
map
{
case
(
contig
,
records
)
=>
contig
->
records
.
map
(
readBin
(
_
,
samples
,
cmdArgs
,
adInfoTags
,
adGenotypeTags
))}
val
regionStats
=
sc
.
parallelize
(
regions
,
regions
.
size
).
groupBy
(
_
.
chr
).
map
{
case
(
contig
,
records
)
=>
contig
->
records
.
map
(
readBin
(
_
,
samples
,
cmdArgs
,
adInfoTags
,
adGenotypeTags
))
}
val
chrStats
=
regionStats
.
map
{
case
(
contig
,
stats
)
=>
contig
->
stats
.
reduce
(
_
+=
_
)}
val
chrStats
=
regionStats
.
map
{
case
(
contig
,
stats
)
=>
contig
->
stats
.
reduce
(
_
+=
_
)
}.
cache
()
val
contigOverlap
=
chrStats
.
map
{
case
(
contig
,
stats
)
=>
writeOverlap
(
stats
,
_
.
genotypeOverlap
,
cmdArgs
.
outputDir
+
s
"/sample_compare/contigs/$contig/genotype_overlap"
,
samples
,
cmdArgs
.
contigSampleOverlapPlots
)
writeOverlap
(
stats
,
_
.
alleleOverlap
,
cmdArgs
.
outputDir
+
s
"/sample_compare/contigs/$contig/allele_overlap"
,
samples
,
cmdArgs
.
contigSampleOverlapPlots
)
}
val
totalStats
=
chrStats
.
values
.
reduce
(
_
+=
_
)
val
totalStats
=
chrStats
.
values
.
reduce
(
_
+=
_
)
// Blocking
//Await.ready(contigOverlap, Duration.Inf)
val
allWriter
=
new
PrintWriter
(
new
File
(
cmdArgs
.
outputDir
,
"stats.json"
))
val
json
=
ConfigUtils
.
mapToJson
(
...
...
@@ -155,19 +175,21 @@ object VcfStatsSpark extends ToolCommand {
//TODO: write wig files
writeOverlap
(
totalStats
,
_
.
genotypeOverlap
,
cmdArgs
.
outputDir
+
"/sample_compare/genotype_overlap"
,
samples
)
_
.
genotypeOverlap
,
cmdArgs
.
outputDir
+
"/sample_compare/genotype_overlap"
,
samples
)
writeOverlap
(
totalStats
,
_
.
alleleOverlap
,
cmdArgs
.
outputDir
+
"/sample_compare/allele_overlap"
,
samples
)
_
.
alleleOverlap
,
cmdArgs
.
outputDir
+
"/sample_compare/allele_overlap"
,
samples
)
Thread
.
sleep
(
1000000
)
sc
.
stop
logger
.
info
(
"Done"
)
}
def
readBin
(
bedRecord
:
BedRecord
,
samples
:
List
[
String
],
def
readBin
(
bedRecord
:
BedRecord
,
samples
:
List
[
String
],
cmdArgs
:
Args
,
adInfoTags
:
List
[
String
],
adGenotypeTags
:
List
[
String
])
:
Stats
=
{
...
...
@@ -184,7 +206,7 @@ object VcfStatsSpark extends ToolCommand {
Stats
.
mergeNestedStatsMap
(
stats
.
generalStats
,
fillGeneral
(
adInfoTags
))
for
(
sample
<-
samples
)
yield
{
Stats
.
mergeNestedStatsMap
(
stats
.
samplesStats
(
sample
).
genotypeStats
,
fillGenotype
(
adGenotypeTags
))
fillGenotype
(
adGenotypeTags
))
}
chunkCounter
+=
1
}
...
...
@@ -194,7 +216,7 @@ object VcfStatsSpark extends ToolCommand {
for
(
sample1
<-
samples
)
yield
{
val
genotype
=
record
.
getGenotype
(
sample1
)
Stats
.
mergeNestedStatsMap
(
stats
.
samplesStats
(
sample1
).
genotypeStats
,
checkGenotype
(
record
,
genotype
,
adGenotypeTags
))
checkGenotype
(
record
,
genotype
,
adGenotypeTags
))
for
(
sample2
<-
samples
)
{
val
genotype2
=
record
.
getGenotype
(
sample2
)
if
(
genotype
.
getAlleles
==
genotype2
.
getAlleles
)
...
...
@@ -213,16 +235,17 @@ object VcfStatsSpark extends ToolCommand {
/** Commandline argument */
case
class
VcfStatsArgs
(
inputFile
:
File
=
null
,
outputDir
:
File
=
null
,
referenceFile
:
File
=
null
,
intervals
:
Option
[
File
]
=
None
,
infoTags
:
List
[
String
]
=
Nil
,
genotypeTags
:
List
[
String
]
=
Nil
,
allInfoTags
:
Boolean
=
false
,
allGenotypeTags
:
Boolean
=
false
,
binSize
:
Int
=
10000000
,
writeBinStats
:
Boolean
=
false
,
generalWiggle
:
List
[
String
]
=
Nil
,
genotypeWiggle
:
List
[
String
]
=
Nil
,
localThreads
:
Int
=
1
,
sparkMaster
:
Option
[
String
]
=
None
)
outputDir
:
File
=
null
,
referenceFile
:
File
=
null
,
intervals
:
Option
[
File
]
=
None
,
infoTags
:
List
[
String
]
=
Nil
,
genotypeTags
:
List
[
String
]
=
Nil
,
allInfoTags
:
Boolean
=
false
,
allGenotypeTags
:
Boolean
=
false
,
binSize
:
Int
=
10000000
,
writeBinStats
:
Boolean
=
false
,
generalWiggle
:
List
[
String
]
=
Nil
,
genotypeWiggle
:
List
[
String
]
=
Nil
,
localThreads
:
Int
=
1
,
sparkMaster
:
Option
[
String
]
=
None
,
contigSampleOverlapPlots
:
Boolean
=
false
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment