Commit 46b18dbc authored by bow's avatar bow
Browse files

Update gentrap v0.4 schema: flexiprep section

parent c9d9587f
......@@ -576,7 +576,7 @@
"bases": {
"description": "Base pair-level statistics",
"type": "object",
"required": [ "nucleotides", "num_total", "num_by_qual" ],
"required": [ "nucleotides", "num_total", "num_qual" ],
"properties": {
......@@ -595,7 +595,7 @@
"type": "integer"
},
"num_by_qual": {
"num_qual": {
"description": "Array of base counts by quality, from quality 0 to 60",
"type": "array",
"items": { "type": "integer" }
......@@ -606,7 +606,7 @@
"reads": {
"description": "Read-level statistics",
"type": "object",
"required": [ "len_max", "len_min", "num_with_n", "num_total", "qual_encoding" ],
"required": [ "len_max", "len_min", "num_with_n", "num_total", "num_avg_qual_gte", "qual_encoding" ],
"properties": {
......@@ -630,6 +630,16 @@
"type": "integer"
},
"num_avg_qual_gte": {
"description": "Number of reads with quality greater than a certain number",
"type": "object",
"additionalProperties": false,
"patternProperties": {
"^[0-9]+$": { "type": "integer" }
}
},
"qual_encoding": {
"description": "Quality encoding; which can be sanger (ASCII offset 33), solexa, or illumina (both with ASCII offset 64)",
"type": "string",
......@@ -643,29 +653,49 @@
"statsFastqc": {
"description": "Sequence statistics created by FastQC (part of the Flexiprep pipeline)",
"type": "object",
"required": [ "median_qual_by_position", "content_by_position" ],
"required": [ "per_base_sequence_quality", "per_base_sequence_content" ],
"properties": {
"median_qual_by_position": {
"description": "Array of median base quality per position (aggregated from all reads)",
"type": "array",
"items": { "type": "number" },
"minItems": 1
"per_base_sequence_quality": {
"description": "Quality statistics per base position",
"type": "object",
"additionalProperties": false,
"patternProperties": {
"^([0-9]+|[0-9]+-[0-9]+)$": {
"description": "Aggregate statistics per base position or per group of base positions",
"type": "object",
"required": [ "mean", "median", "lower_quartile", "upper_quartile", "percentile_10th", "percentile_90th" ],
"properties": {
"mean": { "type": "number" },
"median": { "type": "number" },
"lower_quartile": { "type": "number" },
"upper_quartile": { "type": "number" },
"percentile_10th": { "type": "number" },
"percentile_90th": { "type": "number" }
}
}
}
},
"content_by_position": {
"description": "Array of base composition per position (aggregated from all reads)",
"type": "array",
"items": {
"type": "object",
"additionalProperties": false,
"per_base_sequence_content": {
"description": "Composition statistics per base position",
"type": "object",
"additionalProperties": false,
"patternProperties": {
"^([0-9]+|[0-9]+-[0-9]+)$": {
"description": "Base composition per base position or per group of base positions",
"type": "object",
"additionalProperties": false,
"patternProperties": {
"^[ACGTURYMKSWHBVDN]$": { "type": "number" }
"patternProperties": {
"^[ACGTURYMKSWHBVDN]$": { "type": "number" }
}
}
},
"minItems": 1
}
}
}
},
......
......@@ -126,10 +126,36 @@ class GentrapV04InputProcessor(protected val mongo: MongodbAccessObject)
/**
 * Extracts a Flexiprep pipeline file entry of a library as a [[FileDocument]].
 *
 * @param libJson JSON of a single library entry.
 * @param fileKey Key of the file entry to read under `flexiprep.files.pipeline`.
 * @return The file entry deserialized into a [[FileDocument]].
 */
private[processors] def extractReadFile(libJson: JValue, fileKey: String): FileDocument = {
  val pipelineFiles = libJson \ "flexiprep" \ "files" \ "pipeline"
  (pipelineFiles \ fileKey).extract[FileDocument]
}
/**
 * Case class for containing per-base position statistics.
 *
 * @param index Position of the base the statistic belongs to.
 * @param value Statistic recorded for that position.
 * @tparam T Type of the statistic value.
 */
private[processors] case class PerBaseStat[T](index: Int, value: T)
/**
 * Extracts one statistic of a FastQC module whose values are spread out per base position
 * or per group of base positions.
 *
 * Only keys naming a single base position are considered; keys that do not parse as an integer
 * (for example ranges such as "10-14") are ignored. The result is the longest run of values that
 * starts at the first base position, has no gap in the positions, and in which every position
 * defines the requested statistic.
 *
 * @param fastqcJson JSON holding the FastQC modules.
 * @param fastqcModuleName Name of the FastQC module to read.
 * @param statPerPositionName Name of the per-position statistic to extract.
 * @return Statistic values ordered by base position, starting from the first position.
 */
private[processors] def extractFastqcStats(fastqcJson: JValue, fastqcModuleName: String,
                                           statPerPositionName: String): Seq[Double] = {
  val statsByKey = (fastqcJson \ fastqcModuleName).extract[Map[String, Map[String, Double]]]

  // keep single-position keys only, shift them to 0-based indexing and order them by position
  val singlePositions = statsByKey.toSeq
    .flatMap { case (key, stats) =>
      Try(key.toInt).toOption.map { position => PerBaseStat(position - 1, stats.get(statPerPositionName)) }
    }
    .sortBy(_.index)

  singlePositions
    .zipWithIndex
    // stop at the first position which lacks the statistic or which breaks the consecutive run
    .takeWhile { case (PerBaseStat(actualIdx, stat), expectedIdx) => stat.isDefined && actualIdx == expectedIdx }
    .collect { case (PerBaseStat(_, Some(stat)), _) => stat }
}
/** Extracts a single read statistics from a library entry in a Gentrap summary. */
private[processors] def extractReadStats(libJson: JValue, seqStatKey: String, fastqcKey: String): ReadStats = {
val flexStats = libJson \ "flexiprep" \ "stats"
val nuclCounts = flexStats \ seqStatKey \ "bases" \ "nucleotides"
ReadStats(
nBases = (flexStats \ seqStatKey \ "bases" \ "num_total").extract[Long],
nBasesA = (nuclCounts \ "A").extract[Long],
......@@ -137,8 +163,8 @@ class GentrapV04InputProcessor(protected val mongo: MongodbAccessObject)
nBasesG = (nuclCounts \ "G").extract[Long],
nBasesC = (nuclCounts \ "C").extract[Long],
nBasesN = (nuclCounts \ "N").extract[Long],
nBasesByQual = (flexStats \ seqStatKey \ "bases" \ "num_by_qual").extract[Seq[Long]],
medianQualByPosition = (flexStats \ fastqcKey \ "median_qual_by_position").extract[Seq[Double]],
nBasesByQual = (flexStats \ seqStatKey \ "bases" \ "num_qual").extract[Seq[Long]],
medianQualByPosition = extractFastqcStats(flexStats \ fastqcKey, "per_base_sequence_quality", "median"),
nReads = (flexStats \ seqStatKey \ "reads" \ "num_total").extract[Long])
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment