biopet.biopet / Commits / 8ead9c4f

Commit 8ead9c4f, authored Jan 05, 2017 by Sander Bollen

Merge branch 'feature-singlefile' into 'develop'

Feature singlefile: fixing BIOPET-309 and BIOPET-503. See merge request !504

Parents: 725ed53a, a61f1bdf
Changes: 20

bammetrics/src/main/scala/nl/lumc/sasc/biopet/pipelines/bammetrics/BammetricsReport.scala
@@ -208,8 +208,8 @@ object BammetricsReport extends ReportBuilder {
     val pngFile = new File(outputDir, prefix + ".png")
 
     def paths(name: String) = Map(
-      "mapping_quality" -> List("bammetrics", "stats", "bamstats", "mapping_quality", "histogram", "value"),
-      name -> List("bammetrics", "stats", "bamstats", "mapping_quality", "histogram", "count")
+      "mapping_quality" -> List("bammetrics", "stats", "bamstats", "mapping_quality", "histogram", "values"),
+      name -> List("bammetrics", "stats", "bamstats", "mapping_quality", "histogram", "counts")
     )
 
     val tables = getSampleLibraries(summary, sampleId, libId, libraryLevel)
@@ -236,8 +236,8 @@ object BammetricsReport extends ReportBuilder {
     val pngFile = new File(outputDir, prefix + ".png")
 
     def paths(name: String) = Map(
-      "clipping" -> List("bammetrics", "stats", "bamstats", "clipping", "histogram", "value"),
-      name -> List("bammetrics", "stats", "bamstats", "clipping", "histogram", "count")
+      "clipping" -> List("bammetrics", "stats", "bamstats", "clipping", "histogram", "values"),
+      name -> List("bammetrics", "stats", "bamstats", "clipping", "histogram", "counts")
     )
 
     val tables = getSampleLibraries(summary, sampleId, libId, libraryLevel)
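The renamed path elements ("values"/"counts" instead of "value"/"count") follow the key names emitted by the new Counts.toSummaryMap added further down in this commit. As a minimal, hypothetical sketch (the summary literal and the lookup helper below are illustration only, not code from this commit), such a path selects a series out of the nested summary map roughly like this:

// Illustration only: a hand-written stand-in for the nested summary structure.
object PathLookupSketch extends App {
  val summary: Map[String, Any] = Map(
    "bammetrics" -> Map("stats" -> Map("bamstats" -> Map("mapping_quality" -> Map("histogram" -> Map(
      "values" -> List(0, 30, 60),   // was "value" before this commit
      "counts" -> List(5L, 20L, 75L) // was "count" before this commit
    ))))))

  // Walk a key path through nested Map[String, Any] values.
  def lookup(node: Any, path: List[String]): Option[Any] = path match {
    case Nil => Some(node)
    case key :: tail => node match {
      case m: Map[_, _] => m.asInstanceOf[Map[String, Any]].get(key).flatMap(lookup(_, tail))
      case _            => None
    }
  }

  println(lookup(summary, List("bammetrics", "stats", "bamstats", "mapping_quality", "histogram", "values")))
  // Some(List(0, 30, 60))
}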
biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/BamStats.scala
@@ -40,17 +40,11 @@ class BamStats(val root: Configurable) extends ToolCommandFunction with Referenc
   private var outputFiles: List[File] = Nil
 
   def bamstatsSummary: File = new File(outputDir, "bamstats.summary.json")
-  def flagstatSummaryFile(contig: Option[String] = None): File = getOutputFile("flagstats.summary.json", contig)
-  def mappingQualityFile(contig: Option[String] = None): File = getOutputFile("mapping_quality.tsv", contig)
-  def clipingFile(contig: Option[String] = None): File = getOutputFile("clipping.tsv", contig)
 
   override def beforeGraph() {
     super.beforeGraph()
     deps :+= new File(bamFile.getAbsolutePath.replaceAll(".bam$", ".bai"))
     outputFiles :+= bamstatsSummary
-    outputFiles :+= flagstatSummaryFile()
-    outputFiles :+= mappingQualityFile()
-    outputFiles :+= clipingFile()
     jobOutputFile = new File(outputDir, ".bamstats.out")
     if (reference == null) reference = referenceFasta()
   }
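A short consumer-side sketch of the consolidated output (not part of this commit; it assumes the biopet-utils module on the classpath and that ConfigUtils.fileToConfigMap parses a JSON file into a map, as its use elsewhere in this diff suggests):

import java.io.File

import nl.lumc.sasc.biopet.utils.ConfigUtils

// Illustration only: read the single summary file that bamstatsSummary now points to.
object ReadBamstatsSummary extends App {
  val outputDir = new File(args(0))
  val summary = ConfigUtils.fileToConfigMap(new File(outputDir, "bamstats.summary.json"))
  // One file with top-level keys, instead of the per-metric TSV/JSON files removed above.
  println(summary.keySet)
}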
biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/VcfStats.scala
@@ -18,8 +18,9 @@ import java.io.File
 import nl.lumc.sasc.biopet.core.summary.{ Summarizable, SummaryQScript }
 import nl.lumc.sasc.biopet.core.{ Reference, ToolCommandFunction }
+import nl.lumc.sasc.biopet.tools.vcfstats.VcfStats
 import nl.lumc.sasc.biopet.utils.config.Configurable
-import nl.lumc.sasc.biopet.utils.tryToParseNumber
+import nl.lumc.sasc.biopet.utils.{ ConfigUtils, tryToParseNumber }
 import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
 
 import scala.io.Source
@@ -30,7 +31,7 @@ import scala.io.Source
  * Created by pjvan_thof on 1/10/15.
  */
 class VcfStats(val root: Configurable) extends ToolCommandFunction with Summarizable with Reference {
-  def toolObject = nl.lumc.sasc.biopet.tools.VcfStats
+  def toolObject = VcfStats
 
   mainFunction = false
@@ -41,10 +42,7 @@ class VcfStats(val root: Configurable) extends ToolCommandFunction with Summariz
   protected var index: File = null
 
-  @Output
-  protected var generalStats: File = null
-
-  @Output
-  protected var genotypeStats: File = null
+  protected var statsFile: File = null
 
   override def defaultCoreMemory = 3.0
   override def defaultThreads = 3
@@ -66,8 +64,7 @@ class VcfStats(val root: Configurable) extends ToolCommandFunction with Summariz
   /** Set output dir and a output file */
   def setOutputDir(dir: File): Unit = {
     outputDir = dir
-    generalStats = new File(dir, "general.tsv")
-    genotypeStats = new File(dir, "genotype-general.tsv")
+    statsFile = new File(dir, "stats.json")
     jobOutputFile = new File(dir, ".vcfstats.out")
   }
@@ -83,35 +80,10 @@ class VcfStats(val root: Configurable) extends ToolCommandFunction with Summariz
     optional("--intervals", intervals)
 
   /** Returns general stats to the summary */
-  def summaryStats: Map[String, Any] = {
-    Map("info" -> (
-      for (
-        line <- Source.fromFile(generalStats).getLines().toList.tail;
-        values = line.split("\t") if values.size >= 2 && !values(0).isEmpty
-      ) yield values(0) -> tryToParseNumber(values(1)).getOrElse(None)
-    ).toMap)
-  }
+  def summaryStats: Map[String, Any] = ConfigUtils.fileToConfigMap(statsFile)
 
   /** return only general files to summary */
   def summaryFiles: Map[String, File] = Map(
-    "general_stats" -> generalStats,
-    "genotype_stats" -> genotypeStats
+    "stats" -> statsFile
   )
 
-  override def addToQscriptSummary(qscript: SummaryQScript, name: String): Unit = {
-    val data = Source.fromFile(genotypeStats).getLines().map(_.split("\t")).toArray
-
-    for (s <- 1 until data(0).size) {
-      val sample = data(0)(s)
-      val stats = Map("genotype" -> (for (f <- 1 until data.length) yield {
-        data(f)(0) -> tryToParseNumber(data(f)(s)).getOrElse(None)
-      }).toMap)
-
-      val sum = new Summarizable {
-        override def summaryFiles: Map[String, File] = Map()
-        override def summaryStats: Map[String, Any] = stats
-      }
-
-      qscript.addSummarizable(sum, name, Some(sample))
-    }
-  }
 }
biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala
@@ -47,7 +47,7 @@ object BiopetToolsExecutable extends BiopetExecutable {
     nl.lumc.sasc.biopet.tools.ValidateFastq,
     nl.lumc.sasc.biopet.tools.ValidateVcf,
     nl.lumc.sasc.biopet.tools.VcfFilter,
-    nl.lumc.sasc.biopet.tools.VcfStats,
+    nl.lumc.sasc.biopet.tools.vcfstats.VcfStats,
     nl.lumc.sasc.biopet.tools.VcfToTsv,
     nl.lumc.sasc.biopet.tools.VcfWithVcf,
     nl.lumc.sasc.biopet.tools.VepNormalizer,
biopet-tools/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R → biopet-tools/src/main/resources/nl/lumc/sasc/biopet/tools/vcfstats/plotHeatmap.R
File moved
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStats.scala
@@ -98,50 +98,25 @@ object BamStats extends ToolCommand {
    */
   def init(outputDir: File, bamFile: File, referenceDict: SAMSequenceDictionary, binSize: Int, threadBinSize: Int): Unit = {
     val contigsFutures = BedRecordList.fromDict(referenceDict).allRecords.map { contig =>
-      processContig(contig, bamFile, binSize, threadBinSize, outputDir)
-    }
-
-    val stats = waitOnFutures(processUnmappedReads(bamFile) :: contigsFutures.toList)
-    stats.writeStatsToFiles(outputDir)
+      contig.chr -> processContig(contig, bamFile, binSize, threadBinSize, outputDir)
+    }.toList
 
-    val clippingHistogram = tsvToMap(new File(outputDir, "clipping.tsv"))
-    val mappingQualityHistogram = tsvToMap(new File(outputDir, "mapping_quality.tsv"))
+    val stats = waitOnFutures(processUnmappedReads(bamFile) :: contigsFutures.map(_._2))
 
-    val summary = Map(
-      "flagstats" -> ConfigUtils.fileToConfigMap(new File(outputDir, "flagstats.summary.json")),
-      "flagstats_per_contig" -> referenceDict.getSequences.map { c =>
-        c.getSequenceName -> ConfigUtils.fileToConfigMap(
-          new File(outputDir, "contigs" + File.separator + c.getSequenceName + File.separator + "flagstats.summary.json"))
-      }.toMap,
-      "mapping_quality" -> Map("general" -> aggregateStats(mappingQualityHistogram), "histogram" -> mappingQualityHistogram),
-      "clipping" -> Map("general" -> aggregateStats(clippingHistogram), "histogram" -> clippingHistogram)
-    )
+    val statsWriter = new PrintWriter(new File(outputDir, "bamstats.json"))
+    val totalStats = stats.toSummaryMap
+    val statsMap = Map(
+      "total" -> totalStats,
+      "contigs" -> contigsFutures.map(x => x._1 -> Await.result(x._2, Duration.Zero).toSummaryMap).toMap
+    )
+    statsWriter.println(ConfigUtils.mapToJson(statsMap).nospaces)
+    statsWriter.close()
 
     val summaryWriter = new PrintWriter(new File(outputDir, "bamstats.summary.json"))
-    summaryWriter.println(ConfigUtils.mapToJson(summary).spaces2)
+    summaryWriter.println(ConfigUtils.mapToJson(totalStats).nospaces)
     summaryWriter.close()
   }
 
-  def aggregateStats(table: Map[String, Array[Long]]): Map[String, Any] = {
-    val values = table("value")
-    val counts = table("count")
-    require(values.size == counts.size)
-    if (values.nonEmpty) {
-      val modal = values(counts.indexOf(counts.max))
-      val totalCounts = counts.sum
-      val mean: Double = values.zip(counts).map(x => x._1 * x._2).sum.toDouble / totalCounts
-      val median: Long = values(values.zip(counts).zipWithIndex.sortBy(_._1._1).foldLeft((0L, 0)) {
-        case (a, b) =>
-          val total = a._1 + b._1._2
-          if (total >= totalCounts / 2) (total, a._2)
-          else (total, b._2)
-      }._2)
-      Map("min" -> values.min, "max" -> values.max, "median" -> median, "mean" -> mean, "modal" -> modal)
-    } else Map()
-  }
   /**
    * This will start the subjobs for each contig and collect [[Stats]] on contig level
    *
@@ -157,11 +132,7 @@ object BamStats extends ToolCommand {
       .grouped((region.length.toDouble / binSize).ceil.toInt / (region.length.toDouble / threadBinSize).ceil.toInt)
       .map(scatters => processThread(scatters, bamFile))
       .toList
 
-    val stats = waitOnFutures(scattersFutures, Some(region.chr))
-
-    val contigDir = new File(outputDir, "contigs" + File.separator + region.chr)
-    contigDir.mkdirs()
-
-    stats.writeStatsToFiles(contigDir)
-
-    stats
+    waitOnFutures(scattersFutures, Some(region.chr))
   }
 
   /**
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Histogram.scala
@@ -15,6 +15,7 @@
 package nl.lumc.sasc.biopet.tools.bamstats
 
 import java.io.{ File, PrintWriter }
 
+import nl.lumc.sasc.biopet.utils.sortAnyAny
+
 import scala.collection.mutable
@@ -49,6 +50,11 @@ class Counts[T](_counts: Map[T, Long] = Map[T, Long]())(implicit ord: Ordering[T
     writer.close()
   }
 
+  def toSummaryMap = {
+    val values = counts.keySet.toList.sortWith(sortAnyAny)
+    Map("values" -> values, "counts" -> values.map(counts(_)))
+  }
+
   override def equals(other: Any): Boolean = {
     other match {
       case c: Counts[T] => this.counts == c.counts
@@ -58,5 +64,22 @@ class Counts[T](_counts: Map[T, Long] = Map[T, Long]())(implicit ord: Ordering[T
 }
 
 class Histogram[T](_counts: Map[T, Long] = Map[T, Long]())(implicit ord: Numeric[T]) extends Counts[T](_counts) {
+  def aggregateStats: Map[String, Any] = {
+    val values = this.counts.keys.toList
+    val counts = this.counts.values.toList
+    require(values.size == counts.size)
+    if (values.nonEmpty) {
+      val modal = values(counts.indexOf(counts.max))
+      val totalCounts = counts.sum
+      val mean: Double = values.zip(counts).map(x => ord.toDouble(x._1) * x._2).sum / totalCounts
+      val median = values(values.zip(counts).zipWithIndex.sortBy(_._1._1).foldLeft((0L, 0)) {
+        case (a, b) =>
+          val total = a._1 + b._1._2
+          if (total >= totalCounts / 2) (total, a._2)
+          else (total, b._2)
+      }._2)
+      Map("min" -> values.min, "max" -> values.max, "median" -> median, "mean" -> mean, "modal" -> modal)
+    } else Map()
+  }
 }
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Stats.scala
@@ -58,4 +58,17 @@ case class Stats(flagstat: FlagstatCollector = new FlagstatCollector(),
     this._5_ClippingHistogram.writeToTsv(new File(outputDir, "5_prime_clipping.tsv"))
     this._3_ClippingHistogram.writeToTsv(new File(outputDir, "3_prime_clipping.tsv"))
   }
 
+  def toSummaryMap = {
+    Map(
+      "flagstats" -> flagstat.toSummaryMap,
+      "mapping_quality" -> Map("histogram" -> mappingQualityHistogram.toSummaryMap, "general" -> mappingQualityHistogram.aggregateStats),
+      "insert_size" -> Map("histogram" -> insertSizeHistogram.toSummaryMap, "general" -> insertSizeHistogram.aggregateStats),
+      "clipping" -> Map("histogram" -> clippingHistogram.toSummaryMap, "general" -> clippingHistogram.aggregateStats),
+      "left_clipping" -> Map("histogram" -> leftClippingHistogram.toSummaryMap, "general" -> leftClippingHistogram.aggregateStats),
+      "right_clipping" -> Map("histogram" -> rightClippingHistogram.toSummaryMap, "general" -> rightClippingHistogram.aggregateStats),
+      "5_prime_clipping" -> Map("histogram" -> _5_ClippingHistogram.toSummaryMap, "general" -> _5_ClippingHistogram.aggregateStats),
+      "3_prime_clipping" -> Map("histogram" -> _3_ClippingHistogram.toSummaryMap, "general" -> _3_ClippingHistogram.aggregateStats)
+    )
+  }
 }
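The BammetricsReport change at the top of this commit reads exactly this structure (".../histogram/values" and ".../histogram/counts"). A hypothetical, hand-written value showing the shape toSummaryMap produces for one metric (illustration only; the numbers are made up):

// Illustration only: the nested shape produced by Stats.toSummaryMap for one metric.
object SummaryShapeSketch extends App {
  val mappingQuality: Map[String, Any] = Map(
    "histogram" -> Map(
      "values" -> List(0, 30, 60),   // from Counts.toSummaryMap
      "counts" -> List(5L, 20L, 75L)
    ),
    "general" -> Map(                // from Histogram.aggregateStats
      "min" -> 0, "max" -> 60, "median" -> 60, "mean" -> 51.0, "modal" -> 60)
  )

  println(mappingQuality("histogram"))
}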
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/flagstat/FlagstatCollector.scala
@@ -201,6 +201,12 @@ class FlagstatCollector {
     buffer.toString()
   }
 
+  def toSummaryMap = {
+    val sortedKeys = names.keys.toArray.sorted
+    sortedKeys.map(x => names(x) -> totalCounts(x)).toMap ++ Map("cross_counts" -> crossCounts)
+  }
+
   def +=(other: FlagstatCollector): FlagstatCollector = {
     require(this.names == other.names)
     //require(this.functions == other.functions)
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/SampleStats.scala
new file mode 100644
package nl.lumc.sasc.biopet.tools.vcfstats

import scala.collection.mutable

/**
 * class to store all sample relative stats
 *
 * @param genotypeStats Stores all genotype relative stats
 * @param sampleToSample Stores sample to sample compare stats
 */
case class SampleStats(genotypeStats: mutable.Map[String, mutable.Map[String, mutable.Map[Any, Int]]] = mutable.Map(),
                       sampleToSample: mutable.Map[String, SampleToSampleStats] = mutable.Map()) {
  /** Add an other class */
  def +=(other: SampleStats): Unit = {
    for ((key, value) <- other.sampleToSample) {
      if (this.sampleToSample.contains(key)) this.sampleToSample(key) += value
      else this.sampleToSample(key) = value
    }
    for ((chr, chrMap) <- other.genotypeStats; (field, fieldMap) <- chrMap) {
      if (!this.genotypeStats.contains(chr)) genotypeStats += (chr -> mutable.Map[String, mutable.Map[Any, Int]]())
      val thisField = this.genotypeStats(chr).get(field)
      if (thisField.isDefined) Stats.mergeStatsMap(thisField.get, fieldMap)
      else this.genotypeStats(chr) += field -> fieldMap
    }
  }
}
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/SampleToSampleStats.scala
new file mode 100644
package nl.lumc.sasc.biopet.tools.vcfstats

/**
 * Class to store sample to sample compare stats
 * @param genotypeOverlap Number of genotypes match with other sample
 * @param alleleOverlap Number of alleles also found in other sample
 */
case class SampleToSampleStats(var genotypeOverlap: Int = 0,
                               var alleleOverlap: Int = 0) {
  /** Add an other class */
  def +=(other: SampleToSampleStats) {
    this.genotypeOverlap += other.genotypeOverlap
    this.alleleOverlap += other.alleleOverlap
  }
}
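A usage sketch for the class above (illustration only; it assumes this case class is on the classpath), showing how the mutable counters accumulate across processing chunks via +=:

import nl.lumc.sasc.biopet.tools.vcfstats.SampleToSampleStats

// Illustration only: accumulate overlap counters from two processing chunks.
object SampleToSampleSketch extends App {
  val total = SampleToSampleStats()
  total += SampleToSampleStats(genotypeOverlap = 10, alleleOverlap = 18)
  total += SampleToSampleStats(genotypeOverlap = 7, alleleOverlap = 12)
  println((total.genotypeOverlap, total.alleleOverlap)) // (17, 30)
}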
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/Stats.scala
new file mode 100644
package nl.lumc.sasc.biopet.tools.vcfstats

import java.io.{ File, PrintWriter }

import scala.collection.mutable

import nl.lumc.sasc.biopet.utils.sortAnyAny

/**
 * General stats class to store vcf stats
 *
 * @param generalStats Stores are general stats
 * @param samplesStats Stores all sample/genotype specific stats
 */
case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.Map[Any, Int]]] = mutable.Map(),
                 samplesStats: mutable.Map[String, SampleStats] = mutable.Map()) {
  /** Add an other class */
  def +=(other: Stats): Stats = {
    for ((key, value) <- other.samplesStats) {
      if (this.samplesStats.contains(key)) this.samplesStats(key) += value
      else this.samplesStats(key) = value
    }
    for ((chr, chrMap) <- other.generalStats; (field, fieldMap) <- chrMap) {
      if (!this.generalStats.contains(chr)) generalStats += (chr -> mutable.Map[String, mutable.Map[Any, Int]]())
      val thisField = this.generalStats(chr).get(field)
      if (thisField.isDefined) Stats.mergeStatsMap(thisField.get, fieldMap)
      else this.generalStats(chr) += field -> fieldMap
    }
    this
  }

  /** Function to write 1 specific general field */
  def writeField(field: String, outputDir: File, prefix: String = "", chr: String = "total"): File = {
    val file = (prefix, chr) match {
      case ("", "total") => new File(outputDir, field + ".tsv")
      case (_, "total")  => new File(outputDir, prefix + "-" + field + ".tsv")
      case ("", _)       => new File(outputDir, chr + "-" + field + ".tsv")
      case _             => new File(outputDir, prefix + "-" + chr + "-" + field + ".tsv")
    }

    val data = this.generalStats.getOrElse(chr, mutable.Map[String, mutable.Map[Any, Int]]()).getOrElse(field, mutable.Map[Any, Int]())

    file.getParentFile.mkdirs()
    val writer = new PrintWriter(file)
    writer.println("value\tcount")
    for (key <- data.keySet.toList.sortWith(sortAnyAny)) {
      writer.println(key + "\t" + data(key))
    }
    writer.close()
    file
  }

  /** Function to write 1 specific general field */
  def getField(field: String, chr: String = "total"): Map[String, Array[Any]] = {
    val data = this.generalStats.getOrElse(chr, mutable.Map[String, mutable.Map[Any, Int]]()).getOrElse(field, mutable.Map[Any, Int]())
    val rows = for (key <- data.keySet.toArray.sortWith(sortAnyAny)) yield {
      (key, data(key))
    }
    Map("value" -> rows.map(_._1), "count" -> rows.map(_._2))
  }

  /** Function to write 1 specific genotype field */
  def writeGenotypeField(samples: List[String], field: String, outputDir: File,
                         prefix: String = "", chr: String = "total"): Unit = {
    val file = (prefix, chr) match {
      case ("", "total") => new File(outputDir, field + ".tsv")
      case (_, "total")  => new File(outputDir, prefix + "-" + field + ".tsv")
      case ("", _)       => new File(outputDir, chr + "-" + field + ".tsv")
      case _             => new File(outputDir, prefix + "-" + chr + "-" + field + ".tsv")
    }

    file.getParentFile.mkdirs()
    val writer = new PrintWriter(file)
    writer.println(samples.mkString(field + "\t", "\t", ""))
    val keySet = (for (sample <- samples) yield this.samplesStats(sample).genotypeStats.getOrElse(chr, Map[String, Map[Any, Int]]()).getOrElse(field, Map[Any, Int]()).keySet).fold(Set[Any]())(_ ++ _)
    for (key <- keySet.toList.sortWith(sortAnyAny)) {
      val values = for (sample <- samples) yield this.samplesStats(sample).genotypeStats.getOrElse(chr, Map[String, Map[Any, Int]]()).getOrElse(field, Map[Any, Int]()).getOrElse(key, 0)
      writer.println(values.mkString(key + "\t", "\t", ""))
    }
    writer.close()
  }

  /** Function to write 1 specific genotype field */
  def getGenotypeField(samples: List[String], field: String, chr: String = "total"): Map[String, Map[String, Any]] = {
    val keySet = (for (sample <- samples) yield this.samplesStats(sample).genotypeStats.getOrElse(chr, Map[String, Map[Any, Int]]()).getOrElse(field, Map[Any, Int]()).keySet).fold(Set[Any]())(_ ++ _)

    (for (sample <- samples) yield sample -> {
      keySet.map(key =>
        key.toString -> this.samplesStats(sample).genotypeStats.getOrElse(chr, Map[String, Map[Any, Int]]()).getOrElse(field, Map[Any, Int]()).get(key)
      ).filter(_._2.isDefined).toMap
    }).toMap
  }

  /** This will generate stats for one contig */
  def getContigStats(contig: String,
                     samples: List[String],
                     genotypeFields: List[String] = Nil,
                     infoFields: List[String] = Nil,
                     sampleDistributions: List[String] = Nil): Map[String, Any] = {
    Map(
      "genotype" -> genotypeFields.map(f => f -> getGenotypeField(samples, f, contig)).toMap,
      "info" -> infoFields.map(f => f -> getField(f, contig)).toMap,
      "sample_distributions" -> sampleDistributions.map(f => f -> getField("SampleDistribution-" + f, contig)).toMap
    )
  }

  /** This will generate stats for total */
  def getTotalStats(samples: List[String],
                    genotypeFields: List[String] = Nil,
                    infoFields: List[String] = Nil,
                    sampleDistributions: List[String] = Nil) =
    getContigStats("total", samples, genotypeFields, infoFields, sampleDistributions)

  /** This will generate stats for total and contigs separated */
  def getAllStats(contigs: List[String],
                  samples: List[String],
                  genotypeFields: List[String] = Nil,
                  infoFields: List[String] = Nil,
                  sampleDistributions: List[String] = Nil): Map[String, Any] = {
    Map(
      "contigs" -> contigs.map(c => c -> getContigStats(c, samples, genotypeFields, infoFields, sampleDistributions)).toMap,
      "total" -> getTotalStats(samples, genotypeFields, infoFields, sampleDistributions)
    )
  }
}

object Stats {
  /** Merge m2 into m1 */
  def mergeStatsMap(m1: mutable.Map[Any, Int], m2: mutable.Map[Any, Int]): Unit = {
    for (key <- m2.keySet)
      m1(key) = m1.getOrElse(key, 0) + m2(key)
  }

  /** Merge m2 into m1 */
  def mergeNestedStatsMap(m1: mutable.Map[String, mutable.Map[String, mutable.Map[Any, Int]]],
                          m2: Map[String, Map[String, Map[Any, Int]]]): Unit = {
    for ((chr, chrMap) <- m2; (field, fieldMap) <- chrMap) {
      if (m1.contains(chr)) {
        if (m1(chr).contains(field)) {
          for ((key, value) <- fieldMap) {
            if (m1(chr)(field).contains(key)) m1(chr)(field)(key) += value
            else m1(chr)(field)(key) = value
          }
        } else m1(chr)(field) = mutable.Map(fieldMap.toList: _*)
      } else m1(chr) = mutable.Map(field -> mutable.Map(fieldMap.toList: _*))
    }
  }
}
\ No newline at end of file
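A usage sketch for the merge helpers above (illustration only; it assumes the vcfstats package is on the classpath): mergeStatsMap adds the counts of m2 into m1 in place.

import scala.collection.mutable

import nl.lumc.sasc.biopet.tools.vcfstats.Stats

// Illustration only: merge two count maps, as done when combining per-chunk Stats.
object MergeStatsSketch extends App {
  val m1 = mutable.Map[Any, Int]("PASS" -> 10, "LowQual" -> 2)
  val m2 = mutable.Map[Any, Int]("PASS" -> 5, "q10" -> 1)
  Stats.mergeStatsMap(m1, m2)
  println(m1) // counts are summed per key: PASS -> 15, LowQual -> 2, q10 -> 1
}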
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala → biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStats.scala
@@ -12,23 +12,24 @@
  * license; For commercial users or users who do not want to follow the AGPL
  * license, please contact us to obtain a separate license.
  */
-package nl.lumc.sasc.biopet.tools
+package nl.lumc.sasc.biopet.tools.vcfstats
 
 import java.io.{ File, FileOutputStream, PrintWriter }
 
 import htsjdk.samtools.reference.FastaSequenceFile
 import htsjdk.samtools.util.Interval
-import htsjdk.variant.variantcontext.{ Allele, Genotype, VariantContext }
+import htsjdk.variant.variantcontext.{ Genotype, VariantContext }
 import htsjdk.variant.vcf.VCFFileReader
-import nl.lumc.sasc.biopet.utils.{ FastaUtils, ToolCommand }
 import nl.lumc.sasc.biopet.utils.config.Configurable
 import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
+import nl.lumc.sasc.biopet.utils.{ ConfigUtils, FastaUtils, ToolCommand, VcfUtils }
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable
 import scala.io.Source
 import scala.sys.process.{ Process, ProcessLogger }
 import scala.util.Random
 import scala.concurrent.ExecutionContext.Implicits.global
 import scala.concurrent.duration._
 import scala.concurrent.{ Await, Future }
 
 /**
  * This tool will generate statistics from a vcf file
@@ -75,15 +76,15 @@ object VcfStats extends ToolCommand {
     } validate { x =>
       if (x == null) failure("Output directory required")
       else success
     } text "Path to directory for output (required)"
-    opt[File]('i', "intervals") unbounded () valueName ("<file>") action { (x, c) =>
+    opt[File]('i', "intervals") unbounded () valueName "<file>" action { (x, c) =>
       c.copy(intervals = Some(x))
     } text "Path to interval (BED) file (optional)"
     opt[String]("infoTag") unbounded () valueName "<tag>" action { (x, c) =>
       c.copy(infoTags = x :: c.infoTags)
-    } text "Summarize these info tags. Default is all tags"
+    } text s"Summarize these info tags. Default is (${defaultInfoFields.mkString(", ")})"
     opt[String]("genotypeTag") unbounded () valueName "<tag>" action { (x, c) =>
       c.copy(genotypeTags = x :: c.genotypeTags)
-    } text "Summarize these genotype tags. Default is all tags"
+    } text s"Summarize these genotype tags. Default is (${defaultGenotypeFields.mkString(", ")})"
     opt[Unit]("allInfoTags") unbounded () action { (x, c) =>
       c.copy(allInfoTags = true)
     } text "Summarize all info tags. Default false"
@@ -110,86 +111,6 @@ object VcfStats extends ToolCommand {
        |${genotypeWiggleOptions.mkString(", ")}""".stripMargin
   }
 
-  /**
-   * Class to store sample to sample compare stats
-   * @param genotypeOverlap Number of genotypes match with other sample
-   * @param alleleOverlap Number of alleles also found in other sample
-   */
-  case class SampleToSampleStats(var genotypeOverlap: Int = 0,
-                                 var alleleOverlap: Int = 0) {
-    /** Add an other class */
-    def +=(other: SampleToSampleStats) {
-      this.genotypeOverlap += other.genotypeOverlap
-      this.alleleOverlap += other.alleleOverlap
-    }
-  }
-
-  /**
-   * class to store all sample relative stats
-   * @param genotypeStats Stores all genotype relative stats