Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
SASC
sentinel-legacy
Commits
46b18dbc
Commit
46b18dbc
authored
Jun 29, 2015
by
bow
Browse files
Update gentrap v0.4 schema: flexiprep section
parent
c9d9587f
Changes
2
Hide whitespace changes
Inline
Side-by-side
src/main/resources/schemas/biopet/v0.4/gentrap.json
View file @
46b18dbc
...
...
@@ -576,7 +576,7 @@
"bases"
:
{
"description"
:
"Base pair-level statistics"
,
"type"
:
"object"
,
"required"
:
[
"nucleotides"
,
"num_total"
,
"num_
by_
qual"
],
"required"
:
[
"nucleotides"
,
"num_total"
,
"num_qual"
],
"properties"
:
{
...
...
@@ -595,7 +595,7 @@
"type"
:
"integer"
},
"num_
by_
qual"
:
{
"num_qual"
:
{
"description"
:
"Array of base counts by quality, from quality 0 to 60"
,
"type"
:
"array"
,
"items"
:
{
"type"
:
"integer"
}
...
...
@@ -606,7 +606,7 @@
"reads"
:
{
"description"
:
"Read-level statistics"
,
"type"
:
"object"
,
"required"
:
[
"len_max"
,
"len_min"
,
"num_with_n"
,
"num_total"
,
"qual_encoding"
],
"required"
:
[
"len_max"
,
"len_min"
,
"num_with_n"
,
"num_total"
,
"num_avg_qual_gte"
,
"qual_encoding"
],
"properties"
:
{
...
...
@@ -630,6 +630,16 @@
"type"
:
"integer"
},
"num_avg_qual_gte"
:
{
"description"
:
"Number of reads with quality greater than a certain number"
,
"type"
:
"object"
,
"additionalProperties"
:
false
,
"patternProperties"
:
{
"^[0-9]+$"
:
{
"type"
:
"integer"
}
}
},
"qual_encoding"
:
{
"description"
:
"Quality encoding; which can be sanger (ASCII offset 33), solexa, or illumina (both with ASCII offset 64)"
,
"type"
:
"string"
,
...
...
@@ -643,29 +653,49 @@
"statsFastqc"
:
{
"description"
:
"Sequence statistics created by FastQC (part of the Flexiprep pipeline)"
,
"type"
:
"object"
,
"required"
:
[
"
median_qual_by_position"
,
"content_by_position
"
],
"required"
:
[
"
per_base_sequence_quality"
,
"per_base_sequence_content
"
],
"properties"
:
{
"median_qual_by_position"
:
{
"description"
:
"Array of median base quality per position (aggregated from all reads)"
,
"type"
:
"array"
,
"items"
:
{
"type"
:
"number"
},
"minItems"
:
1
"per_base_sequence_quality"
:
{
"description"
:
"Quality statistics per base position"
,
"type"
:
"object"
,
"additionalProperties"
:
false
,
"patternProperties"
:
{
"^([0-9]+|[0-9]+-[0-9]+)$"
:
{
"description"
:
"Aggregate statistics per base position or per group of base positions"
,
"type"
:
"object"
,
"required"
:
[
"mean"
,
"median"
,
"lower_quartile"
,
"upper_quartile"
,
"percentile_10th"
,
"percentile_90th"
],
"properties"
:
{
"mean"
:
{
"type"
:
"number"
},
"median"
:
{
"type"
:
"number"
},
"lower_quartile"
:
{
"type"
:
"number"
},
"upper_quartile"
:
{
"type"
:
"number"
},
"percentile_10th"
:
{
"type"
:
"number"
},
"percentile_90th"
:
{
"type"
:
"number"
}
}
}
}
},
"content_by_position"
:
{
"description"
:
"Array of base composition per position (aggregated from all reads)"
,
"type"
:
"array"
,
"items"
:
{
"type"
:
"object"
,
"additionalProperties"
:
false
,
"per_base_sequence_content"
:
{
"description"
:
"Composition statistics per base position"
,
"type"
:
"object"
,
"additionalProperties"
:
false
,
"patternProperties"
:
{
"^([0-9]+|[0-9]+-[0-9]+)$"
:
{
"description"
:
"Base composition per base position or per group of base positions"
,
"type"
:
"object"
,
"additionalProperties"
:
false
,
"patternProperties"
:
{
"^[ACGTURYMKSWHBVDN]$"
:
{
"type"
:
"number"
}
"patternProperties"
:
{
"^[ACGTURYMKSWHBVDN]$"
:
{
"type"
:
"number"
}
}
}
},
"minItems"
:
1
}
}
}
},
...
...
src/main/scala/nl/lumc/sasc/sentinel/processors/gentrap/GentrapV04InputProcessor.scala
View file @
46b18dbc
...
...
@@ -126,10 +126,36 @@ class GentrapV04InputProcessor(protected val mongo: MongodbAccessObject)
/**
 * Extracts the file entry stored under `fileKey` in the `flexiprep.files.pipeline` section of a library JSON.
 *
 * @param libJson JSON of a single library entry.
 * @param fileKey Name of the entry to look up inside `flexiprep.files.pipeline`.
 * @return The entry extracted as a [[FileDocument]].
 */
private[processors] def extractReadFile(libJson: JValue, fileKey: String): FileDocument = {
  // Section that holds all file entries recorded for the library.
  val pipelineFiles = libJson \ "flexiprep" \ "files" \ "pipeline"
  (pipelineFiles \ fileKey).extract[FileDocument]
}
/**
 * Container pairing a statistic value with the base position it was computed for.
 *
 * @param index Base position of the statistic.
 * @param value Value of the statistic at that position.
 * @tparam T Type of the statistic value.
 */
private[processors] case class PerBaseStat[T](
  index: Int,
  value: T
)
/**
 * Extracts FastQC module statistics which are spread out per base position or per group of base positions.
 *
 * Only keys that parse as a single integer position are used; any other key (for example a position range)
 * is ignored. Positions are converted to 0-based indices and sorted. The result is the longest run of values
 * that starts at index 0, has no gap between indices, and in which every position contains the requested
 * statistic.
 *
 * @param fastqcJson JSON value containing the FastQC modules.
 * @param fastqcModuleName Name of the module to read; its value is extracted as a map from position key to a
 *                         map from statistic name to value.
 * @param statPerPositionName Name of the statistic to pick from each position.
 * @return The statistic values ordered by base position, starting from the first position.
 */
private[processors] def extractFastqcStats(fastqcJson: JValue, fastqcModuleName: String,
                                           statPerPositionName: String): Seq[Double] = {
  val statsPerPosition = (fastqcJson \ fastqcModuleName).extract[Map[String, Map[String, Double]]]

  val sortedByPosition = statsPerPosition.toSeq
    // keep only keys which are single base positions (not ranges), parsing each key exactly once,
    // and turn the base position into a 0-based index
    .flatMap {
      case (key, stats) =>
        Try(key.toInt).toOption.map { position => (position - 1, stats.get(statPerPositionName)) }
    }
    // sort on the base position
    .sortBy { case (index, _) => index }

  sortedByPosition
    // stop at the first position which does not have the requested statistic
    .takeWhile { case (_, stat) => stat.isDefined }
    // every remaining value is defined, so this unwraps the values without calling Option.get
    .flatMap { case (index, stat) => stat.map { value => PerBaseStat(index, value) } }
    // pair with the expected index
    .zipWithIndex
    // and keep only items whose base position matches the expected index
    // (so we only take consecutive stats from the first position onwards)
    .takeWhile { case (PerBaseStat(actualIdx, _), expectedIdx) => actualIdx == expectedIdx }
    .map { case (PerBaseStat(_, value), _) => value }
}
/** Extracts a single set of read statistics from a library entry in a Gentrap summary. */
private
[
processors
]
def
extractReadStats
(
libJson
:
JValue
,
seqStatKey
:
String
,
fastqcKey
:
String
)
:
ReadStats
=
{
val
flexStats
=
libJson
\
"flexiprep"
\
"stats"
val
nuclCounts
=
flexStats
\
seqStatKey
\
"bases"
\
"nucleotides"
ReadStats
(
nBases
=
(
flexStats
\
seqStatKey
\
"bases"
\
"num_total"
).
extract
[
Long
],
nBasesA
=
(
nuclCounts
\
"A"
).
extract
[
Long
],
...
...
@@ -137,8 +163,8 @@ class GentrapV04InputProcessor(protected val mongo: MongodbAccessObject)
nBasesG
=
(
nuclCounts
\
"G"
).
extract
[
Long
],
nBasesC
=
(
nuclCounts
\
"C"
).
extract
[
Long
],
nBasesN
=
(
nuclCounts
\
"N"
).
extract
[
Long
],
nBasesByQual
=
(
flexStats
\
seqStatKey
\
"bases"
\
"num_
by_
qual"
).
extract
[
Seq
[
Long
]],
medianQualByPosition
=
(
flexStats
\
fastqcKey
\
"median_qual_by_position"
).
extract
[
Seq
[
Double
]]
,
nBasesByQual
=
(
flexStats
\
seqStatKey
\
"bases"
\
"num_qual"
).
extract
[
Seq
[
Long
]],
medianQualByPosition
=
extractFastqcStats
(
flexStats
\
fastqcKey
,
"per_base_sequence_quality"
,
"median"
)
,
nReads
=
(
flexStats
\
seqStatKey
\
"reads"
\
"num_total"
).
extract
[
Long
])
}
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment