Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
biopet.biopet
Commits
14dcba60
Commit
14dcba60
authored
Jan 02, 2017
by
Peter van 't Hof
Browse files
Fixed agregated stats
parent
575110d4
Changes
11
Hide whitespace changes
Inline
Side-by-side
biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/VcfStats.scala
View file @
14dcba60
...
...
@@ -16,12 +16,12 @@ package nl.lumc.sasc.biopet.extensions.tools
import
java.io.File
import
nl.lumc.sasc.biopet.core.summary.
{
Summarizable
,
SummaryQScript
}
import
nl.lumc.sasc.biopet.core.
{
Reference
,
ToolCommandFunction
}
import
nl.lumc.sasc.biopet.core.summary.
{
Summarizable
,
SummaryQScript
}
import
nl.lumc.sasc.biopet.core.
{
Reference
,
ToolCommandFunction
}
import
nl.lumc.sasc.biopet.tools.vcfstats.VcfStats
import
nl.lumc.sasc.biopet.utils.config.Configurable
import
nl.lumc.sasc.biopet.utils.
{
ConfigUtils
,
tryToParseNumber
}
import
org.broadinstitute.gatk.utils.commandline.
{
Input
,
Output
}
import
nl.lumc.sasc.biopet.utils.
{
ConfigUtils
,
tryToParseNumber
}
import
org.broadinstitute.gatk.utils.commandline.
{
Input
,
Output
}
import
scala.io.Source
...
...
biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala
View file @
14dcba60
...
...
@@ -15,7 +15,7 @@
package
nl.lumc.sasc.biopet
import
nl.lumc.sasc.biopet.tools.vcfstats.VcfStats
import
nl.lumc.sasc.biopet.utils.
{
BiopetExecutable
,
MainCommand
}
import
nl.lumc.sasc.biopet.utils.
{
BiopetExecutable
,
MainCommand
}
object
BiopetToolsExecutable
extends
BiopetExecutable
{
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStats.scala
View file @
14dcba60
...
...
@@ -103,54 +103,20 @@ object BamStats extends ToolCommand {
val
stats
=
waitOnFutures
(
processUnmappedReads
(
bamFile
)
::
contigsFutures
.
map
(
_
.
_2
))
val
statsWriter
=
new
PrintWriter
(
new
File
(
outputDir
,
"stats.json"
))
val
statsWriter
=
new
PrintWriter
(
new
File
(
outputDir
,
"bamstats.json"
))
val
totalStats
=
stats
.
toSummaryMap
val
statsMap
=
Map
(
"total"
->
stats
.
toSummaryMap
,
"total"
->
totalStats
,
"contigs"
->
contigsFutures
.
map
(
x
=>
x
.
_1
->
x
.
_2
.
value
.
get
.
get
.
toSummaryMap
).
toMap
)
statsWriter
.
println
(
ConfigUtils
.
mapToJson
(
statsMap
).
spaces
2
)
statsWriter
.
println
(
ConfigUtils
.
mapToJson
(
statsMap
).
no
spaces
)
statsWriter
.
close
()
//stats.writeStatsToFiles(outputDir)
// FIXME: getting aggregated stats
val
clippingHistogram
=
tsvToMap
(
new
File
(
outputDir
,
"clipping.tsv"
))
val
mappingQualityHistogram
=
tsvToMap
(
new
File
(
outputDir
,
"mapping_quality.tsv"
))
val
summary
=
Map
(
"flagstats"
->
ConfigUtils
.
fileToConfigMap
(
new
File
(
outputDir
,
"flagstats.summary.json"
)),
"flagstats_per_contig"
->
referenceDict
.
getSequences
.
map
{
c
=>
c
.
getSequenceName
->
ConfigUtils
.
fileToConfigMap
(
new
File
(
outputDir
,
"contigs"
+
File
.
separator
+
c
.
getSequenceName
+
File
.
separator
+
"flagstats.summary.json"
))
}.
toMap
,
"mapping_quality"
->
Map
(
"general"
->
aggregateStats
(
mappingQualityHistogram
),
"histogram"
->
mappingQualityHistogram
),
"clipping"
->
Map
(
"general"
->
aggregateStats
(
clippingHistogram
),
"histogram"
->
clippingHistogram
)
)
val
summaryWriter
=
new
PrintWriter
(
new
File
(
outputDir
,
"bamstats.summary.json"
))
summaryWriter
.
println
(
ConfigUtils
.
mapToJson
(
summary
).
spaces
2
)
summaryWriter
.
println
(
ConfigUtils
.
mapToJson
(
totalStats
).
no
spaces
)
summaryWriter
.
close
()
}
def
aggregateStats
(
table
:
Map
[
String
,
Array
[
Long
]])
:
Map
[
String
,
Any
]
=
{
val
values
=
table
(
"value"
)
val
counts
=
table
(
"count"
)
require
(
values
.
size
==
counts
.
size
)
if
(
values
.
nonEmpty
)
{
val
modal
=
values
(
counts
.
indexOf
(
counts
.
max
))
val
totalCounts
=
counts
.
sum
val
mean
:
Double
=
values
.
zip
(
counts
).
map
(
x
=>
x
.
_1
*
x
.
_2
).
sum
.
toDouble
/
totalCounts
val
median
:
Long
=
values
(
values
.
zip
(
counts
).
zipWithIndex
.
sortBy
(
_
.
_1
.
_1
).
foldLeft
((
0L
,
0
))
{
case
(
a
,
b
)
=>
val
total
=
a
.
_1
+
b
.
_1
.
_2
if
(
total
>=
totalCounts
/
2
)
(
total
,
a
.
_2
)
else
(
total
,
b
.
_2
)
}.
_2
)
Map
(
"min"
->
values
.
min
,
"max"
->
values
.
max
,
"median"
->
median
,
"mean"
->
mean
,
"modal"
->
modal
)
}
else
Map
()
}
/**
* This will start the subjobs for each contig and collect [[Stats]] on contig level
*
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Histogram.scala
View file @
14dcba60
...
...
@@ -64,5 +64,22 @@ class Counts[T](_counts: Map[T, Long] = Map[T, Long]())(implicit ord: Ordering[T
}
class
Histogram
[
T
](
_counts
:
Map
[
T
,
Long
]
=
Map
[
T
,
Long
]())(
implicit
ord
:
Numeric
[
T
])
extends
Counts
[
T
](
_counts
)
{
def
aggregateStats
:
Map
[
String
,
Any
]
=
{
val
values
=
this
.
counts
.
keys
.
toList
val
counts
=
this
.
counts
.
values
.
toList
require
(
values
.
size
==
counts
.
size
)
if
(
values
.
nonEmpty
)
{
val
modal
=
values
(
counts
.
indexOf
(
counts
.
max
))
val
totalCounts
=
counts
.
sum
val
mean
:
Double
=
values
.
zip
(
counts
).
map
(
x
=>
ord
.
toDouble
(
x
.
_1
)
*
x
.
_2
).
sum
/
totalCounts
val
median
=
values
(
values
.
zip
(
counts
).
zipWithIndex
.
sortBy
(
_
.
_1
.
_1
).
foldLeft
((
0L
,
0
))
{
case
(
a
,
b
)
=>
val
total
=
a
.
_1
+
b
.
_1
.
_2
if
(
total
>=
totalCounts
/
2
)
(
total
,
a
.
_2
)
else
(
total
,
b
.
_2
)
}.
_2
)
Map
(
"min"
->
values
.
min
,
"max"
->
values
.
max
,
"median"
->
median
,
"mean"
->
mean
,
"modal"
->
modal
)
}
else
Map
()
}
}
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Stats.scala
View file @
14dcba60
...
...
@@ -62,13 +62,13 @@ case class Stats(flagstat: FlagstatCollector = new FlagstatCollector(),
def
toSummaryMap
=
{
Map
(
"flagstats"
->
flagstat
.
toSummaryMap
,
"mapping_quality"
->
mappingQualityHistogram
.
toSummaryMap
,
"insert_size"
->
insertSizeHistogram
.
toSummaryMap
,
"clipping"
->
clippingHistogram
.
toSummaryMap
,
"left_clipping"
->
leftClippingHistogram
.
toSummaryMap
,
"right_clipping"
->
rightClippingHistogram
.
toSummaryMap
,
"5_prime_clipping"
->
_5_ClippingHistogram
.
toSummaryMap
,
"3_prime_clipping"
->
_3_ClippingHistogram
.
toSummaryMap
"mapping_quality"
->
Map
(
"histrogram"
->
mappingQualityHistogram
.
toSummaryMap
,
"general"
->
mappingQualityHistogram
.
aggregateStats
),
"insert_size"
->
Map
(
"histrogram"
->
insertSizeHistogram
.
toSummaryMap
,
"general"
->
insertSizeHistogram
.
aggregateStats
),
"clipping"
->
Map
(
"histrogram"
->
clippingHistogram
.
toSummaryMap
,
"general"
->
clippingHistogram
.
aggregateStats
),
"left_clipping"
->
Map
(
"histrogram"
->
leftClippingHistogram
.
toSummaryMap
,
"general"
->
leftClippingHistogram
.
aggregateStats
),
"right_clipping"
->
Map
(
"histrogram"
->
rightClippingHistogram
.
toSummaryMap
,
"general"
->
rightClippingHistogram
.
aggregateStats
),
"5_prime_clipping"
->
Map
(
"histrogram"
->
_5_ClippingHistogram
.
toSummaryMap
,
"general"
->
_5_ClippingHistogram
.
aggregateStats
),
"3_prime_clipping"
->
Map
(
"histrogram"
->
_3_ClippingHistogram
.
toSummaryMap
,
"general"
->
_3_ClippingHistogram
.
aggregateStats
)
)
}
}
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/SampleStats.scala
View file @
14dcba60
...
...
@@ -3,11 +3,11 @@ package nl.lumc.sasc.biopet.tools.vcfstats
import
scala.collection.mutable
/**
* class to store all sample relative stats
*
* @param genotypeStats Stores all genotype relative stats
* @param sampleToSample Stores sample to sample compare stats
*/
* class to store all sample relative stats
*
* @param genotypeStats Stores all genotype relative stats
* @param sampleToSample Stores sample to sample compare stats
*/
case
class
SampleStats
(
genotypeStats
:
mutable.Map
[
String
,
mutable.Map
[
String
,
mutable.Map
[
Any
,
Int
]]]
=
mutable
.
Map
(),
sampleToSample
:
mutable.Map
[
String
,
SampleToSampleStats
]
=
mutable
.
Map
())
{
/** Add an other class */
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/SampleToSampleStats.scala
View file @
14dcba60
package
nl.lumc.sasc.biopet.tools.vcfstats
/**
* Class to store sample to sample compare stats
* @param genotypeOverlap Number of genotypes match with other sample
* @param alleleOverlap Number of alleles also found in other sample
*/
* Class to store sample to sample compare stats
* @param genotypeOverlap Number of genotypes match with other sample
* @param alleleOverlap Number of alleles also found in other sample
*/
case
class
SampleToSampleStats
(
var
genotypeOverlap
:
Int
=
0
,
var
alleleOverlap
:
Int
=
0
)
{
/** Add an other class */
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/Stats.scala
View file @
14dcba60
package
nl.lumc.sasc.biopet.tools.vcfstats
import
java.io.
{
File
,
PrintWriter
}
import
java.io.
{
File
,
PrintWriter
}
import
scala.collection.mutable
import
nl.lumc.sasc.biopet.utils.sortAnyAny
/**
* General stats class to store vcf stats
*
* @param generalStats Stores are general stats
* @param samplesStats Stores all sample/genotype specific stats
*/
* General stats class to store vcf stats
*
* @param generalStats Stores are general stats
* @param samplesStats Stores all sample/genotype specific stats
*/
case
class
Stats
(
generalStats
:
mutable.Map
[
String
,
mutable.Map
[
String
,
mutable.Map
[
Any
,
Int
]]]
=
mutable
.
Map
(),
samplesStats
:
mutable.Map
[
String
,
SampleStats
]
=
mutable
.
Map
())
{
/** Add an other class */
...
...
@@ -33,9 +33,9 @@ case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.M
def
writeField
(
field
:
String
,
outputDir
:
File
,
prefix
:
String
=
""
,
chr
:
String
=
"total"
)
:
File
=
{
val
file
=
(
prefix
,
chr
)
match
{
case
(
""
,
"total"
)
=>
new
File
(
outputDir
,
field
+
".tsv"
)
case
(
_
,
"total"
)
=>
new
File
(
outputDir
,
prefix
+
"-"
+
field
+
".tsv"
)
case
(
""
,
_
)
=>
new
File
(
outputDir
,
chr
+
"-"
+
field
+
".tsv"
)
case
_
=>
new
File
(
outputDir
,
prefix
+
"-"
+
chr
+
"-"
+
field
+
".tsv"
)
case
(
_
,
"total"
)
=>
new
File
(
outputDir
,
prefix
+
"-"
+
field
+
".tsv"
)
case
(
""
,
_
)
=>
new
File
(
outputDir
,
chr
+
"-"
+
field
+
".tsv"
)
case
_
=>
new
File
(
outputDir
,
prefix
+
"-"
+
chr
+
"-"
+
field
+
".tsv"
)
}
val
data
=
this
.
generalStats
.
getOrElse
(
chr
,
mutable
.
Map
[
String
,
mutable.Map
[
Any
,
Int
]]()).
getOrElse
(
field
,
mutable
.
Map
[
Any
,
Int
]())
...
...
@@ -65,9 +65,9 @@ case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.M
prefix
:
String
=
""
,
chr
:
String
=
"total"
)
:
Unit
=
{
val
file
=
(
prefix
,
chr
)
match
{
case
(
""
,
"total"
)
=>
new
File
(
outputDir
,
field
+
".tsv"
)
case
(
_
,
"total"
)
=>
new
File
(
outputDir
,
prefix
+
"-"
+
field
+
".tsv"
)
case
(
""
,
_
)
=>
new
File
(
outputDir
,
chr
+
"-"
+
field
+
".tsv"
)
case
_
=>
new
File
(
outputDir
,
prefix
+
"-"
+
chr
+
"-"
+
field
+
".tsv"
)
case
(
_
,
"total"
)
=>
new
File
(
outputDir
,
prefix
+
"-"
+
field
+
".tsv"
)
case
(
""
,
_
)
=>
new
File
(
outputDir
,
chr
+
"-"
+
field
+
".tsv"
)
case
_
=>
new
File
(
outputDir
,
prefix
+
"-"
+
chr
+
"-"
+
field
+
".tsv"
)
}
file
.
getParentFile
.
mkdirs
()
...
...
@@ -95,7 +95,7 @@ case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.M
/** This will generate stats for one contig */
def
getContigStats
(
contig
:
String
,
samples
:
List
[
String
],
genotypeFields
:
List
[
String
]
=
Nil
,
genotypeFields
:
List
[
String
]
=
Nil
,
infoFields
:
List
[
String
]
=
Nil
,
sampleDistributions
:
List
[
String
]
=
Nil
)
:
Map
[
String
,
Any
]
=
{
Map
(
...
...
@@ -107,7 +107,7 @@ case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.M
/** This will generate stats for total */
def
getTotalStats
(
samples
:
List
[
String
],
genotypeFields
:
List
[
String
]
=
Nil
,
genotypeFields
:
List
[
String
]
=
Nil
,
infoFields
:
List
[
String
]
=
Nil
,
sampleDistributions
:
List
[
String
]
=
Nil
)
=
getContigStats
(
"total"
,
samples
,
genotypeFields
,
infoFields
,
sampleDistributions
)
...
...
@@ -115,7 +115,7 @@ case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.M
/** This will generate stats for total and contigs separated */
def
getAllStats
(
contigs
:
List
[
String
],
samples
:
List
[
String
],
genotypeFields
:
List
[
String
]
=
Nil
,
genotypeFields
:
List
[
String
]
=
Nil
,
infoFields
:
List
[
String
]
=
Nil
,
sampleDistributions
:
List
[
String
]
=
Nil
)
:
Map
[
String
,
Any
]
=
{
Map
(
...
...
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStats.scala
View file @
14dcba60
...
...
@@ -14,22 +14,22 @@
*/
package
nl.lumc.sasc.biopet.tools.vcfstats
import
java.io.
{
File
,
FileOutputStream
,
PrintWriter
}
import
java.io.
{
File
,
FileOutputStream
,
PrintWriter
}
import
htsjdk.samtools.util.Interval
import
htsjdk.variant.variantcontext.
{
Genotype
,
VariantContext
}
import
htsjdk.variant.variantcontext.
{
Genotype
,
VariantContext
}
import
htsjdk.variant.vcf.VCFFileReader
import
nl.lumc.sasc.biopet.utils.intervals.BedRecordList
import
nl.lumc.sasc.biopet.utils.
{
ConfigUtils
,
FastaUtils
,
ToolCommand
,
VcfUtils
}
import
nl.lumc.sasc.biopet.utils.
{
ConfigUtils
,
FastaUtils
,
ToolCommand
,
VcfUtils
}
import
scala.collection.JavaConversions._
import
scala.collection.mutable
import
scala.io.Source
import
scala.sys.process.
{
Process
,
ProcessLogger
}
import
scala.sys.process.
{
Process
,
ProcessLogger
}
import
scala.util.Random
import
scala.concurrent.ExecutionContext.Implicits.global
import
scala.concurrent.duration._
import
scala.concurrent.
{
Await
,
Future
}
import
scala.concurrent.
{
Await
,
Future
}
/**
* This tool will generate statistics from a vcf file
...
...
@@ -243,7 +243,7 @@ object VcfStats extends ToolCommand {
}
chunkStats
.
toList
.
fold
(
createStats
)(
_
+=
_
)
}
val
stats
=
statsFutures
.
foldLeft
(
createStats
)
{
case
(
a
,
b
)
=>
a
+=
Await
.
result
(
b
,
Duration
.
Inf
)
}
val
stats
=
statsFutures
.
foldLeft
(
createStats
)
{
case
(
a
,
b
)
=>
a
+=
Await
.
result
(
b
,
Duration
.
Inf
)
}
logger
.
info
(
"Done reading vcf records"
)
...
...
biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala
View file @
14dcba60
...
...
@@ -15,11 +15,10 @@
package
nl.lumc.sasc.biopet.tools
import
java.io.File
import
java.nio.file.
{
Files
,
Paths
}
import
java.nio.file.
{
Files
,
Paths
}
import
htsjdk.variant.variantcontext.Allele
import
htsjdk.variant.vcf.VCFFileReader
import
nl.lumc.sasc.biopet.tools.vcfstats.
{
SampleStats
,
SampleToSampleStats
,
Stats
,
VcfStats
}
import
nl.lumc.sasc.biopet.tools.vcfstats.
{
SampleStats
,
SampleToSampleStats
,
Stats
,
VcfStats
}
import
nl.lumc.sasc.biopet.tools.vcfstats.VcfStats._
import
org.scalatest.Matchers
import
org.scalatest.testng.TestNGSuite
...
...
biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/VcfUtils.scala
View file @
14dcba60
...
...
@@ -17,8 +17,8 @@ package nl.lumc.sasc.biopet.utils
import
java.io.File
import
java.util
import
htsjdk.variant.variantcontext.
{
Allele
,
Genotype
,
VariantContext
}
import
htsjdk.variant.vcf.
{
VCFFileReader
,
VCFFilterHeaderLine
,
VCFHeader
}
import
htsjdk.variant.variantcontext.
{
Allele
,
Genotype
,
VariantContext
}
import
htsjdk.variant.vcf.
{
VCFFileReader
,
VCFFilterHeaderLine
,
VCFHeader
}
import
scala.collection.JavaConversions._
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment