Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
Mirrors
biopet.biopet
Commits
8ead9c4f
Commit
8ead9c4f
authored
Jan 05, 2017
by
Sander Bollen
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'feature-singlefile' into 'develop'
Feature singlefile Fixing BIOPET-309 and BIOPET-503 See merge request !504
parents
725ed53a
a61f1bdf
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
368 additions
and
342 deletions
+368
-342
bammetrics/src/main/scala/nl/lumc/sasc/biopet/pipelines/bammetrics/BammetricsReport.scala
...c/sasc/biopet/pipelines/bammetrics/BammetricsReport.scala
+4
-4
biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/BamStats.scala
...scala/nl/lumc/sasc/biopet/extensions/tools/BamStats.scala
+0
-6
biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/VcfStats.scala
...scala/nl/lumc/sasc/biopet/extensions/tools/VcfStats.scala
+7
-35
biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala
...ain/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala
+1
-1
biopet-tools/src/main/resources/nl/lumc/sasc/biopet/tools/vcfstats/plotHeatmap.R
...esources/nl/lumc/sasc/biopet/tools/vcfstats/plotHeatmap.R
+0
-0
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStats.scala
...n/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStats.scala
+12
-41
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Histogram.scala
.../scala/nl/lumc/sasc/biopet/tools/bamstats/Histogram.scala
+23
-0
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Stats.scala
...main/scala/nl/lumc/sasc/biopet/tools/bamstats/Stats.scala
+13
-0
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/flagstat/FlagstatCollector.scala
...l/lumc/sasc/biopet/tools/flagstat/FlagstatCollector.scala
+6
-0
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/SampleStats.scala
...cala/nl/lumc/sasc/biopet/tools/vcfstats/SampleStats.scala
+26
-0
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/SampleToSampleStats.scala
...lumc/sasc/biopet/tools/vcfstats/SampleToSampleStats.scala
+15
-0
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/Stats.scala
...main/scala/nl/lumc/sasc/biopet/tools/vcfstats/Stats.scala
+149
-0
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStats.scala
...n/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStats.scala
+31
-204
biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala
...c/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala
+9
-30
biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStatsTest.scala
...ala/nl/lumc/sasc/biopet/tools/bamstats/BamStatsTest.scala
+11
-9
biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/VcfUtils.scala
...s/src/main/scala/nl/lumc/sasc/biopet/utils/VcfUtils.scala
+16
-2
biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/package.scala
...ls/src/main/scala/nl/lumc/sasc/biopet/utils/package.scala
+13
-0
biopet-utils/src/test/scala/VcfUtilsTest.scala
biopet-utils/src/test/scala/VcfUtilsTest.scala
+22
-0
shiva/src/main/resources/nl/lumc/sasc/biopet/pipelines/shiva/sampleVariants.ssp
...es/nl/lumc/sasc/biopet/pipelines/shiva/sampleVariants.ssp
+1
-1
shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaReport.scala
...ala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaReport.scala
+9
-9
No files found.
bammetrics/src/main/scala/nl/lumc/sasc/biopet/pipelines/bammetrics/BammetricsReport.scala
View file @
8ead9c4f
...
...
@@ -208,8 +208,8 @@ object BammetricsReport extends ReportBuilder {
val
pngFile
=
new
File
(
outputDir
,
prefix
+
".png"
)
def
paths
(
name
:
String
)
=
Map
(
"mapping_quality"
->
List
(
"bammetrics"
,
"stats"
,
"bamstats"
,
"mapping_quality"
,
"histogram"
,
"value"
),
name
->
List
(
"bammetrics"
,
"stats"
,
"bamstats"
,
"mapping_quality"
,
"histogram"
,
"count"
)
"mapping_quality"
->
List
(
"bammetrics"
,
"stats"
,
"bamstats"
,
"mapping_quality"
,
"histogram"
,
"value
s
"
),
name
->
List
(
"bammetrics"
,
"stats"
,
"bamstats"
,
"mapping_quality"
,
"histogram"
,
"count
s
"
)
)
val
tables
=
getSampleLibraries
(
summary
,
sampleId
,
libId
,
libraryLevel
)
...
...
@@ -236,8 +236,8 @@ object BammetricsReport extends ReportBuilder {
val
pngFile
=
new
File
(
outputDir
,
prefix
+
".png"
)
def
paths
(
name
:
String
)
=
Map
(
"clipping"
->
List
(
"bammetrics"
,
"stats"
,
"bamstats"
,
"clipping"
,
"histogram"
,
"value"
),
name
->
List
(
"bammetrics"
,
"stats"
,
"bamstats"
,
"clipping"
,
"histogram"
,
"count"
)
"clipping"
->
List
(
"bammetrics"
,
"stats"
,
"bamstats"
,
"clipping"
,
"histogram"
,
"value
s
"
),
name
->
List
(
"bammetrics"
,
"stats"
,
"bamstats"
,
"clipping"
,
"histogram"
,
"count
s
"
)
)
val
tables
=
getSampleLibraries
(
summary
,
sampleId
,
libId
,
libraryLevel
)
...
...
biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/BamStats.scala
View file @
8ead9c4f
...
...
@@ -40,17 +40,11 @@ class BamStats(val root: Configurable) extends ToolCommandFunction with Referenc
private
var
outputFiles
:
List
[
File
]
=
Nil
def
bamstatsSummary
:
File
=
new
File
(
outputDir
,
"bamstats.summary.json"
)
def
flagstatSummaryFile
(
contig
:
Option
[
String
]
=
None
)
:
File
=
getOutputFile
(
"flagstats.summary.json"
,
contig
)
def
mappingQualityFile
(
contig
:
Option
[
String
]
=
None
)
:
File
=
getOutputFile
(
"mapping_quality.tsv"
,
contig
)
def
clipingFile
(
contig
:
Option
[
String
]
=
None
)
:
File
=
getOutputFile
(
"clipping.tsv"
,
contig
)
override
def
beforeGraph
()
{
super
.
beforeGraph
()
deps
:+=
new
File
(
bamFile
.
getAbsolutePath
.
replaceAll
(
".bam$"
,
".bai"
))
outputFiles
:+=
bamstatsSummary
outputFiles
:+=
flagstatSummaryFile
()
outputFiles
:+=
mappingQualityFile
()
outputFiles
:+=
clipingFile
()
jobOutputFile
=
new
File
(
outputDir
,
".bamstats.out"
)
if
(
reference
==
null
)
reference
=
referenceFasta
()
}
...
...
biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/VcfStats.scala
View file @
8ead9c4f
...
...
@@ -18,8 +18,9 @@ import java.io.File
import
nl.lumc.sasc.biopet.core.summary.
{
Summarizable
,
SummaryQScript
}
import
nl.lumc.sasc.biopet.core.
{
Reference
,
ToolCommandFunction
}
import
nl.lumc.sasc.biopet.tools.vcfstats.VcfStats
import
nl.lumc.sasc.biopet.utils.config.Configurable
import
nl.lumc.sasc.biopet.utils.tryToParseNumber
import
nl.lumc.sasc.biopet.utils.
{
ConfigUtils
,
tryToParseNumber
}
import
org.broadinstitute.gatk.utils.commandline.
{
Input
,
Output
}
import
scala.io.Source
...
...
@@ -30,7 +31,7 @@ import scala.io.Source
* Created by pjvan_thof on 1/10/15.
*/
class
VcfStats
(
val
root
:
Configurable
)
extends
ToolCommandFunction
with
Summarizable
with
Reference
{
def
toolObject
=
nl
.
lumc
.
sasc
.
biopet
.
tools
.
VcfStats
def
toolObject
=
VcfStats
mainFunction
=
false
...
...
@@ -41,10 +42,7 @@ class VcfStats(val root: Configurable) extends ToolCommandFunction with Summariz
protected
var
index
:
File
=
null
@Output
protected
var
generalStats
:
File
=
null
@Output
protected
var
genotypeStats
:
File
=
null
protected
var
statsFile
:
File
=
null
override
def
defaultCoreMemory
=
3.0
override
def
defaultThreads
=
3
...
...
@@ -66,8 +64,7 @@ class VcfStats(val root: Configurable) extends ToolCommandFunction with Summariz
/** Set output dir and a output file */
def
setOutputDir
(
dir
:
File
)
:
Unit
=
{
outputDir
=
dir
generalStats
=
new
File
(
dir
,
"general.tsv"
)
genotypeStats
=
new
File
(
dir
,
"genotype-general.tsv"
)
statsFile
=
new
File
(
dir
,
"stats.json"
)
jobOutputFile
=
new
File
(
dir
,
".vcfstats.out"
)
}
...
...
@@ -83,35 +80,10 @@ class VcfStats(val root: Configurable) extends ToolCommandFunction with Summariz
optional
(
"--intervals"
,
intervals
)
/** Returns general stats to the summary */
def
summaryStats
:
Map
[
String
,
Any
]
=
{
Map
(
"info"
->
(
for
(
line
<-
Source
.
fromFile
(
generalStats
).
getLines
().
toList
.
tail
;
values
=
line
.
split
(
"\t"
)
if
values
.
size
>=
2
&&
!
values
(
0
).
isEmpty
)
yield
values
(
0
)
->
tryToParseNumber
(
values
(
1
)).
getOrElse
(
None
)
).
toMap
)
}
def
summaryStats
:
Map
[
String
,
Any
]
=
ConfigUtils
.
fileToConfigMap
(
statsFile
)
/** return only general files to summary */
def
summaryFiles
:
Map
[
String
,
File
]
=
Map
(
"general_stats"
->
generalStats
,
"genotype_stats"
->
genotypeStats
"stats"
->
statsFile
)
override
def
addToQscriptSummary
(
qscript
:
SummaryQScript
,
name
:
String
)
:
Unit
=
{
val
data
=
Source
.
fromFile
(
genotypeStats
).
getLines
().
map
(
_
.
split
(
"\t"
)).
toArray
for
(
s
<-
1
until
data
(
0
).
size
)
{
val
sample
=
data
(
0
)(
s
)
val
stats
=
Map
(
"genotype"
->
(
for
(
f
<-
1
until
data
.
length
)
yield
{
data
(
f
)(
0
)
->
tryToParseNumber
(
data
(
f
)(
s
)).
getOrElse
(
None
)
}).
toMap
)
val
sum
=
new
Summarizable
{
override
def
summaryFiles
:
Map
[
String
,
File
]
=
Map
()
override
def
summaryStats
:
Map
[
String
,
Any
]
=
stats
}
qscript
.
addSummarizable
(
sum
,
name
,
Some
(
sample
))
}
}
}
biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala
View file @
8ead9c4f
...
...
@@ -47,7 +47,7 @@ object BiopetToolsExecutable extends BiopetExecutable {
nl
.
lumc
.
sasc
.
biopet
.
tools
.
ValidateFastq
,
nl
.
lumc
.
sasc
.
biopet
.
tools
.
ValidateVcf
,
nl
.
lumc
.
sasc
.
biopet
.
tools
.
VcfFilter
,
nl
.
lumc
.
sasc
.
biopet
.
tools
.
VcfStats
,
nl
.
lumc
.
sasc
.
biopet
.
tools
.
vcfstats
.
VcfStats
,
nl
.
lumc
.
sasc
.
biopet
.
tools
.
VcfToTsv
,
nl
.
lumc
.
sasc
.
biopet
.
tools
.
VcfWithVcf
,
nl
.
lumc
.
sasc
.
biopet
.
tools
.
VepNormalizer
,
...
...
biopet-tools/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R
→
biopet-tools/src/main/resources/nl/lumc/sasc/biopet/tools/
vcfstats/
plotHeatmap.R
View file @
8ead9c4f
File moved
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStats.scala
View file @
8ead9c4f
...
...
@@ -98,50 +98,25 @@ object BamStats extends ToolCommand {
*/
def
init
(
outputDir
:
File
,
bamFile
:
File
,
referenceDict
:
SAMSequenceDictionary
,
binSize
:
Int
,
threadBinSize
:
Int
)
:
Unit
=
{
val
contigsFutures
=
BedRecordList
.
fromDict
(
referenceDict
).
allRecords
.
map
{
contig
=>
processContig
(
contig
,
bamFile
,
binSize
,
threadBinSize
,
outputDir
)
}
val
stats
=
waitOnFutures
(
processUnmappedReads
(
bamFile
)
::
contigsFutures
.
toList
)
stats
.
writeStatsToFiles
(
outputDir
)
contig
.
chr
->
processContig
(
contig
,
bamFile
,
binSize
,
threadBinSize
,
outputDir
)
}.
toList
val
clippingHistogram
=
tsvToMap
(
new
File
(
outputDir
,
"clipping.tsv"
))
val
mappingQualityHistogram
=
tsvToMap
(
new
File
(
outputDir
,
"mapping_quality.tsv"
))
val
stats
=
waitOnFutures
(
processUnmappedReads
(
bamFile
)
::
contigsFutures
.
map
(
_
.
_2
))
val
summary
=
Map
(
"flagstats"
->
ConfigUtils
.
fileToConfigMap
(
new
File
(
outputDir
,
"flagstats.summary.json"
)),
"flagstats_per_contig"
->
referenceDict
.
getSequences
.
map
{
c
=>
c
.
getSequenceName
->
ConfigUtils
.
fileToConfigMap
(
new
File
(
outputDir
,
"contigs"
+
File
.
separator
+
c
.
getSequenceName
+
File
.
separator
+
"flagstats.summary.json"
))
}.
toMap
,
"mapping_quality"
->
Map
(
"general"
->
aggregateStats
(
mappingQualityHistogram
),
"histogram"
->
mappingQualityHistogram
),
"clipping"
->
Map
(
"general"
->
aggregateStats
(
clippingHistogram
),
"histogram"
->
clippingHistogram
)
val
statsWriter
=
new
PrintWriter
(
new
File
(
outputDir
,
"bamstats.json"
))
val
totalStats
=
stats
.
toSummaryMap
val
statsMap
=
Map
(
"total"
->
totalStats
,
"contigs"
->
contigsFutures
.
map
(
x
=>
x
.
_1
->
Await
.
result
(
x
.
_2
,
Duration
.
Zero
).
toSummaryMap
).
toMap
)
statsWriter
.
println
(
ConfigUtils
.
mapToJson
(
statsMap
).
nospaces
)
statsWriter
.
close
()
val
summaryWriter
=
new
PrintWriter
(
new
File
(
outputDir
,
"bamstats.summary.json"
))
summaryWriter
.
println
(
ConfigUtils
.
mapToJson
(
summary
).
spaces
2
)
summaryWriter
.
println
(
ConfigUtils
.
mapToJson
(
totalStats
).
no
spaces
)
summaryWriter
.
close
()
}
def
aggregateStats
(
table
:
Map
[
String
,
Array
[
Long
]])
:
Map
[
String
,
Any
]
=
{
val
values
=
table
(
"value"
)
val
counts
=
table
(
"count"
)
require
(
values
.
size
==
counts
.
size
)
if
(
values
.
nonEmpty
)
{
val
modal
=
values
(
counts
.
indexOf
(
counts
.
max
))
val
totalCounts
=
counts
.
sum
val
mean
:
Double
=
values
.
zip
(
counts
).
map
(
x
=>
x
.
_1
*
x
.
_2
).
sum
.
toDouble
/
totalCounts
val
median
:
Long
=
values
(
values
.
zip
(
counts
).
zipWithIndex
.
sortBy
(
_
.
_1
.
_1
).
foldLeft
((
0L
,
0
))
{
case
(
a
,
b
)
=>
val
total
=
a
.
_1
+
b
.
_1
.
_2
if
(
total
>=
totalCounts
/
2
)
(
total
,
a
.
_2
)
else
(
total
,
b
.
_2
)
}.
_2
)
Map
(
"min"
->
values
.
min
,
"max"
->
values
.
max
,
"median"
->
median
,
"mean"
->
mean
,
"modal"
->
modal
)
}
else
Map
()
}
/**
* This will start the subjobs for each contig and collect [[Stats]] on contig level
*
...
...
@@ -157,11 +132,7 @@ object BamStats extends ToolCommand {
.
grouped
((
region
.
length
.
toDouble
/
binSize
).
ceil
.
toInt
/
(
region
.
length
.
toDouble
/
threadBinSize
).
ceil
.
toInt
)
.
map
(
scatters
=>
processThread
(
scatters
,
bamFile
))
.
toList
val
stats
=
waitOnFutures
(
scattersFutures
,
Some
(
region
.
chr
))
val
contigDir
=
new
File
(
outputDir
,
"contigs"
+
File
.
separator
+
region
.
chr
)
contigDir
.
mkdirs
()
stats
.
writeStatsToFiles
(
contigDir
)
stats
waitOnFutures
(
scattersFutures
,
Some
(
region
.
chr
))
}
/**
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Histogram.scala
View file @
8ead9c4f
...
...
@@ -15,6 +15,7 @@
package
nl.lumc.sasc.biopet.tools.bamstats
import
java.io.
{
File
,
PrintWriter
}
import
nl.lumc.sasc.biopet.utils.sortAnyAny
import
scala.collection.mutable
...
...
@@ -49,6 +50,11 @@ class Counts[T](_counts: Map[T, Long] = Map[T, Long]())(implicit ord: Ordering[T
writer
.
close
()
}
def
toSummaryMap
=
{
val
values
=
counts
.
keySet
.
toList
.
sortWith
(
sortAnyAny
)
Map
(
"values"
->
values
,
"counts"
->
values
.
map
(
counts
(
_
)))
}
override
def
equals
(
other
:
Any
)
:
Boolean
=
{
other
match
{
case
c
:
Counts
[
T
]
=>
this
.
counts
==
c
.
counts
...
...
@@ -58,5 +64,22 @@ class Counts[T](_counts: Map[T, Long] = Map[T, Long]())(implicit ord: Ordering[T
}
class
Histogram
[
T
](
_counts
:
Map
[
T
,
Long
]
=
Map
[
T
,
Long
]())(
implicit
ord
:
Numeric
[
T
])
extends
Counts
[
T
](
_counts
)
{
def
aggregateStats
:
Map
[
String
,
Any
]
=
{
val
values
=
this
.
counts
.
keys
.
toList
val
counts
=
this
.
counts
.
values
.
toList
require
(
values
.
size
==
counts
.
size
)
if
(
values
.
nonEmpty
)
{
val
modal
=
values
(
counts
.
indexOf
(
counts
.
max
))
val
totalCounts
=
counts
.
sum
val
mean
:
Double
=
values
.
zip
(
counts
).
map
(
x
=>
ord
.
toDouble
(
x
.
_1
)
*
x
.
_2
).
sum
/
totalCounts
val
median
=
values
(
values
.
zip
(
counts
).
zipWithIndex
.
sortBy
(
_
.
_1
.
_1
).
foldLeft
((
0L
,
0
))
{
case
(
a
,
b
)
=>
val
total
=
a
.
_1
+
b
.
_1
.
_2
if
(
total
>=
totalCounts
/
2
)
(
total
,
a
.
_2
)
else
(
total
,
b
.
_2
)
}.
_2
)
Map
(
"min"
->
values
.
min
,
"max"
->
values
.
max
,
"median"
->
median
,
"mean"
->
mean
,
"modal"
->
modal
)
}
else
Map
()
}
}
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Stats.scala
View file @
8ead9c4f
...
...
@@ -58,4 +58,17 @@ case class Stats(flagstat: FlagstatCollector = new FlagstatCollector(),
this
.
_5_ClippingHistogram
.
writeToTsv
(
new
File
(
outputDir
,
"5_prime_clipping.tsv"
))
this
.
_3_ClippingHistogram
.
writeToTsv
(
new
File
(
outputDir
,
"3_prime_clipping.tsv"
))
}
def
toSummaryMap
=
{
Map
(
"flagstats"
->
flagstat
.
toSummaryMap
,
"mapping_quality"
->
Map
(
"histogram"
->
mappingQualityHistogram
.
toSummaryMap
,
"general"
->
mappingQualityHistogram
.
aggregateStats
),
"insert_size"
->
Map
(
"histogram"
->
insertSizeHistogram
.
toSummaryMap
,
"general"
->
insertSizeHistogram
.
aggregateStats
),
"clipping"
->
Map
(
"histogram"
->
clippingHistogram
.
toSummaryMap
,
"general"
->
clippingHistogram
.
aggregateStats
),
"left_clipping"
->
Map
(
"histogram"
->
leftClippingHistogram
.
toSummaryMap
,
"general"
->
leftClippingHistogram
.
aggregateStats
),
"right_clipping"
->
Map
(
"histogram"
->
rightClippingHistogram
.
toSummaryMap
,
"general"
->
rightClippingHistogram
.
aggregateStats
),
"5_prime_clipping"
->
Map
(
"histogram"
->
_5_ClippingHistogram
.
toSummaryMap
,
"general"
->
_5_ClippingHistogram
.
aggregateStats
),
"3_prime_clipping"
->
Map
(
"histogram"
->
_3_ClippingHistogram
.
toSummaryMap
,
"general"
->
_3_ClippingHistogram
.
aggregateStats
)
)
}
}
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/flagstat/FlagstatCollector.scala
View file @
8ead9c4f
...
...
@@ -201,6 +201,12 @@ class FlagstatCollector {
buffer
.
toString
()
}
def
toSummaryMap
=
{
val
sortedKeys
=
names
.
keys
.
toArray
.
sorted
sortedKeys
.
map
(
x
=>
names
(
x
)
->
totalCounts
(
x
)).
toMap
++
Map
(
"cross_counts"
->
crossCounts
)
}
def
+=
(
other
:
FlagstatCollector
)
:
FlagstatCollector
=
{
require
(
this
.
names
==
other
.
names
)
//require(this.functions == other.functions)
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/SampleStats.scala
0 → 100644
View file @
8ead9c4f
package
nl.lumc.sasc.biopet.tools.vcfstats
import
scala.collection.mutable
/**
* class to store all sample relative stats
*
* @param genotypeStats Stores all genotype relative stats
* @param sampleToSample Stores sample to sample compare stats
*/
case
class
SampleStats
(
genotypeStats
:
mutable.Map
[
String
,
mutable.Map
[
String
,
mutable.Map
[
Any
,
Int
]]]
=
mutable
.
Map
(),
sampleToSample
:
mutable.Map
[
String
,
SampleToSampleStats
]
=
mutable
.
Map
())
{
/** Add an other class */
def
+=
(
other
:
SampleStats
)
:
Unit
=
{
for
((
key
,
value
)
<-
other
.
sampleToSample
)
{
if
(
this
.
sampleToSample
.
contains
(
key
))
this
.
sampleToSample
(
key
)
+=
value
else
this
.
sampleToSample
(
key
)
=
value
}
for
((
chr
,
chrMap
)
<-
other
.
genotypeStats
;
(
field
,
fieldMap
)
<-
chrMap
)
{
if
(!
this
.
genotypeStats
.
contains
(
chr
))
genotypeStats
+=
(
chr
->
mutable
.
Map
[
String
,
mutable.Map
[
Any
,
Int
]]())
val
thisField
=
this
.
genotypeStats
(
chr
).
get
(
field
)
if
(
thisField
.
isDefined
)
Stats
.
mergeStatsMap
(
thisField
.
get
,
fieldMap
)
else
this
.
genotypeStats
(
chr
)
+=
field
->
fieldMap
}
}
}
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/SampleToSampleStats.scala
0 → 100644
View file @
8ead9c4f
package
nl.lumc.sasc.biopet.tools.vcfstats
/**
* Class to store sample to sample compare stats
* @param genotypeOverlap Number of genotypes match with other sample
* @param alleleOverlap Number of alleles also found in other sample
*/
case
class
SampleToSampleStats
(
var
genotypeOverlap
:
Int
=
0
,
var
alleleOverlap
:
Int
=
0
)
{
/** Add an other class */
def
+=
(
other
:
SampleToSampleStats
)
{
this
.
genotypeOverlap
+=
other
.
genotypeOverlap
this
.
alleleOverlap
+=
other
.
alleleOverlap
}
}
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/Stats.scala
0 → 100644
View file @
8ead9c4f
package
nl.lumc.sasc.biopet.tools.vcfstats
import
java.io.
{
File
,
PrintWriter
}
import
scala.collection.mutable
import
nl.lumc.sasc.biopet.utils.sortAnyAny
/**
* General stats class to store vcf stats
*
* @param generalStats Stores are general stats
* @param samplesStats Stores all sample/genotype specific stats
*/
case
class
Stats
(
generalStats
:
mutable.Map
[
String
,
mutable.Map
[
String
,
mutable.Map
[
Any
,
Int
]]]
=
mutable
.
Map
(),
samplesStats
:
mutable.Map
[
String
,
SampleStats
]
=
mutable
.
Map
())
{
/** Add an other class */
def
+=
(
other
:
Stats
)
:
Stats
=
{
for
((
key
,
value
)
<-
other
.
samplesStats
)
{
if
(
this
.
samplesStats
.
contains
(
key
))
this
.
samplesStats
(
key
)
+=
value
else
this
.
samplesStats
(
key
)
=
value
}
for
((
chr
,
chrMap
)
<-
other
.
generalStats
;
(
field
,
fieldMap
)
<-
chrMap
)
{
if
(!
this
.
generalStats
.
contains
(
chr
))
generalStats
+=
(
chr
->
mutable
.
Map
[
String
,
mutable.Map
[
Any
,
Int
]]())
val
thisField
=
this
.
generalStats
(
chr
).
get
(
field
)
if
(
thisField
.
isDefined
)
Stats
.
mergeStatsMap
(
thisField
.
get
,
fieldMap
)
else
this
.
generalStats
(
chr
)
+=
field
->
fieldMap
}
this
}
/** Function to write 1 specific general field */
def
writeField
(
field
:
String
,
outputDir
:
File
,
prefix
:
String
=
""
,
chr
:
String
=
"total"
)
:
File
=
{
val
file
=
(
prefix
,
chr
)
match
{
case
(
""
,
"total"
)
=>
new
File
(
outputDir
,
field
+
".tsv"
)
case
(
_
,
"total"
)
=>
new
File
(
outputDir
,
prefix
+
"-"
+
field
+
".tsv"
)
case
(
""
,
_
)
=>
new
File
(
outputDir
,
chr
+
"-"
+
field
+
".tsv"
)
case
_
=>
new
File
(
outputDir
,
prefix
+
"-"
+
chr
+
"-"
+
field
+
".tsv"
)
}
val
data
=
this
.
generalStats
.
getOrElse
(
chr
,
mutable
.
Map
[
String
,
mutable.Map
[
Any
,
Int
]]()).
getOrElse
(
field
,
mutable
.
Map
[
Any
,
Int
]())
file
.
getParentFile
.
mkdirs
()
val
writer
=
new
PrintWriter
(
file
)
writer
.
println
(
"value\tcount"
)
for
(
key
<-
data
.
keySet
.
toList
.
sortWith
(
sortAnyAny
))
{
writer
.
println
(
key
+
"\t"
+
data
(
key
))
}
writer
.
close
()
file
}
/** Function to write 1 specific general field */
def
getField
(
field
:
String
,
chr
:
String
=
"total"
)
:
Map
[
String
,
Array
[
Any
]]
=
{
val
data
=
this
.
generalStats
.
getOrElse
(
chr
,
mutable
.
Map
[
String
,
mutable.Map
[
Any
,
Int
]]()).
getOrElse
(
field
,
mutable
.
Map
[
Any
,
Int
]())
val
rows
=
for
(
key
<-
data
.
keySet
.
toArray
.
sortWith
(
sortAnyAny
))
yield
{
(
key
,
data
(
key
))
}
Map
(
"value"
->
rows
.
map
(
_
.
_1
),
"count"
->
rows
.
map
(
_
.
_2
))
}
/** Function to write 1 specific genotype field */
def
writeGenotypeField
(
samples
:
List
[
String
],
field
:
String
,
outputDir
:
File
,
prefix
:
String
=
""
,
chr
:
String
=
"total"
)
:
Unit
=
{
val
file
=
(
prefix
,
chr
)
match
{
case
(
""
,
"total"
)
=>
new
File
(
outputDir
,
field
+
".tsv"
)
case
(
_
,
"total"
)
=>
new
File
(
outputDir
,
prefix
+
"-"
+
field
+
".tsv"
)
case
(
""
,
_
)
=>
new
File
(
outputDir
,
chr
+
"-"
+
field
+
".tsv"
)
case
_
=>
new
File
(
outputDir
,
prefix
+
"-"
+
chr
+
"-"
+
field
+
".tsv"
)
}
file
.
getParentFile
.
mkdirs
()
val
writer
=
new
PrintWriter
(
file
)
writer
.
println
(
samples
.
mkString
(
field
+
"\t"
,
"\t"
,
""
))
val
keySet
=
(
for
(
sample
<-
samples
)
yield
this
.
samplesStats
(
sample
).
genotypeStats
.
getOrElse
(
chr
,
Map
[
String
,
Map
[
Any
,
Int
]]()).
getOrElse
(
field
,
Map
[
Any
,
Int
]()).
keySet
).
fold
(
Set
[
Any
]())(
_
++
_
)
for
(
key
<-
keySet
.
toList
.
sortWith
(
sortAnyAny
))
{
val
values
=
for
(
sample
<-
samples
)
yield
this
.
samplesStats
(
sample
).
genotypeStats
.
getOrElse
(
chr
,
Map
[
String
,
Map
[
Any
,
Int
]]()).
getOrElse
(
field
,
Map
[
Any
,
Int
]()).
getOrElse
(
key
,
0
)
writer
.
println
(
values
.
mkString
(
key
+
"\t"
,
"\t"
,
""
))
}
writer
.
close
()
}
/** Function to write 1 specific genotype field */
def
getGenotypeField
(
samples
:
List
[
String
],
field
:
String
,
chr
:
String
=
"total"
)
:
Map
[
String
,
Map
[
String
,
Any
]]
=
{
val
keySet
=
(
for
(
sample
<-
samples
)
yield
this
.
samplesStats
(
sample
).
genotypeStats
.
getOrElse
(
chr
,
Map
[
String
,
Map
[
Any
,
Int
]]()).
getOrElse
(
field
,
Map
[
Any
,
Int
]()).
keySet
).
fold
(
Set
[
Any
]())(
_
++
_
)
(
for
(
sample
<-
samples
)
yield
sample
->
{
keySet
.
map
(
key
=>
key
.
toString
->
this
.
samplesStats
(
sample
).
genotypeStats
.
getOrElse
(
chr
,
Map
[
String
,
Map
[
Any
,
Int
]]()).
getOrElse
(
field
,
Map
[
Any
,
Int
]()).
get
(
key
)
).
filter
(
_
.
_2
.
isDefined
).
toMap
}).
toMap
}
/** This will generate stats for one contig */
def
getContigStats
(
contig
:
String
,
samples
:
List
[
String
],
genotypeFields
:
List
[
String
]
=
Nil
,
infoFields
:
List
[
String
]
=
Nil
,
sampleDistributions
:
List
[
String
]
=
Nil
)
:
Map
[
String
,
Any
]
=
{
Map
(
"genotype"
->
genotypeFields
.
map
(
f
=>
f
->
getGenotypeField
(
samples
,
f
,
contig
)).
toMap
,
"info"
->
infoFields
.
map
(
f
=>
f
->
getField
(
f
,
contig
)).
toMap
,
"sample_distributions"
->
sampleDistributions
.
map
(
f
=>
f
->
getField
(
"SampleDistribution-"
+
f
,
contig
)).
toMap
)
}
/** This will generate stats for total */
def
getTotalStats
(
samples
:
List
[
String
],
genotypeFields
:
List
[
String
]
=
Nil
,
infoFields
:
List
[
String
]
=
Nil
,
sampleDistributions
:
List
[
String
]
=
Nil
)
=
getContigStats
(
"total"
,
samples
,
genotypeFields
,
infoFields
,
sampleDistributions
)
/** This will generate stats for total and contigs separated */
def
getAllStats
(
contigs
:
List
[
String
],
samples
:
List
[
String
],
genotypeFields
:
List
[
String
]
=
Nil
,
infoFields
:
List
[
String
]
=
Nil
,
sampleDistributions
:
List
[
String
]
=
Nil
)
:
Map
[
String
,
Any
]
=
{
Map
(
"contigs"
->
contigs
.
map
(
c
=>
c
->
getContigStats
(
c
,
samples
,
genotypeFields
,
infoFields
,
sampleDistributions
)).
toMap
,
"total"
->
getTotalStats
(
samples
,
genotypeFields
,
infoFields
,
sampleDistributions
)
)
}
}
object
Stats
{
/** Merge m2 into m1 */
def
mergeStatsMap
(
m1
:
mutable.Map
[
Any
,
Int
],
m2
:
mutable.Map
[
Any
,
Int
])
:
Unit
=
{
for
(
key
<-
m2
.
keySet
)
m1
(
key
)
=
m1
.
getOrElse
(
key
,
0
)
+
m2
(
key
)
}
/** Merge m2 into m1 */
def
mergeNestedStatsMap
(
m1
:
mutable.Map
[
String
,
mutable.Map
[
String
,
mutable.Map
[
Any
,
Int
]]],
m2
:
Map
[
String
,
Map
[
String
,
Map
[
Any
,
Int
]]])
:
Unit
=
{
for
((
chr
,
chrMap
)
<-
m2
;
(
field
,
fieldMap
)
<-
chrMap
)
{
if
(
m1
.
contains
(
chr
))
{
if
(
m1
(
chr
).
contains
(
field
))
{
for
((
key
,
value
)
<-
fieldMap
)
{
if
(
m1
(
chr
)(
field
).
contains
(
key
))
m1
(
chr
)(
field
)(
key
)
+=
value
else
m1
(
chr
)(
field
)(
key
)
=
value
}
}
else
m1
(
chr
)(
field
)
=
mutable
.
Map
(
fieldMap
.
toList
:
_
*
)
}
else
m1
(
chr
)
=
mutable
.
Map
(
field
->
mutable
.
Map
(
fieldMap
.
toList
:
_
*
))
}
}
}
\ No newline at end of file
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala
→
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/
vcfstats/
VcfStats.scala
View file @
8ead9c4f
...
...
@@ -12,23 +12,24 @@
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package
nl.lumc.sasc.biopet.tools
package
nl.lumc.sasc.biopet.tools
.vcfstats
import
java.io.
{
File
,
FileOutputStream
,
PrintWriter
}
import
htsjdk.samtools.reference.FastaSequenceFile
import
htsjdk.samtools.util.Interval
import
htsjdk.variant.variantcontext.
{
Allele
,
Genotype
,
VariantContext
}
import
htsjdk.variant.variantcontext.
{
Genotype
,
VariantContext
}
import
htsjdk.variant.vcf.VCFFileReader
import
nl.lumc.sasc.biopet.utils.
{
FastaUtils
,
ToolCommand
}
import
nl.lumc.sasc.biopet.utils.config.Configurable
import
nl.lumc.sasc.biopet.utils.intervals.BedRecordList
import
nl.lumc.sasc.biopet.utils.
{
ConfigUtils
,
FastaUtils
,
ToolCommand
,
VcfUtils
}
import
scala.collection.JavaConversions._
import
scala.collection.mutable
import
scala.io.Source
import
scala.sys.process.
{
Process
,
ProcessLogger
}
import
scala.util.Random
import
scala.concurrent.ExecutionContext.Implicits.global
import
scala.concurrent.duration._
import
scala.concurrent.
{
Await
,
Future
}
/**
* This tool will generate statistics from a vcf file
...
...
@@ -75,15 +76,15 @@ object VcfStats extends ToolCommand {
}
validate
{
x
=>
if
(
x
==
null
)
failure
(
"Output directory required"
)
else
success
}
text
"Path to directory for output (required)"
opt
[
File
](
'i'
,
"intervals"
)
unbounded
()
valueName
(
"<file>"
)
action
{
(
x
,
c
)
=>
opt
[
File
](
'i'
,
"intervals"
)
unbounded
()
valueName
"<file>"
action
{
(
x
,
c
)
=>
c
.
copy
(
intervals
=
Some
(
x
))
}
text
"Path to interval (BED) file (optional)"
opt
[
String
](
"infoTag"
)
unbounded
()
valueName
"<tag>"
action
{
(
x
,
c
)
=>
c
.
copy
(
infoTags
=
x
::
c
.
infoTags
)
}
text
"Summarize these info tags. Default is
all tags
"
}
text
s
"Summarize these info tags. Default is
(${defaultInfoFields.mkString("
,
")})
"
opt
[
String
](
"genotypeTag"
)
unbounded
()
valueName
"<tag>"
action
{
(
x
,
c
)
=>
c
.
copy
(
genotypeTags
=
x
::
c
.
genotypeTags
)
}
text
"Summarize these genotype tags. Default is
all tags
"
}
text
s
"Summarize these genotype tags. Default is
(${defaultGenotypeFields.mkString("
,
")})
"
opt
[
Unit
](
"allInfoTags"
)
unbounded
()
action
{
(
x
,
c
)
=>
c
.
copy
(
allInfoTags
=
true
)
}
text
"Summarize all info tags. Default false"
...
...
@@ -110,86 +111,6 @@ object VcfStats extends ToolCommand {
|${genotypeWiggleOptions.mkString(", ")}"""
.
stripMargin
}
/**
* Class to store sample to sample compare stats
* @param genotypeOverlap Number of genotypes match with other sample
* @param alleleOverlap Number of alleles also found in other sample
*/
case
class
SampleToSampleStats
(
var
genotypeOverlap
:
Int
=
0
,
var
alleleOverlap
:
Int
=
0
)
{
/** Add an other class */
def
+=
(
other
:
SampleToSampleStats
)
{
this
.
genotypeOverlap
+=
other
.
genotypeOverlap
this
.
alleleOverlap
+=
other
.
alleleOverlap
}
}
/**
* class to store all sample relative stats