Commit 20993a69 authored by bow
Browse files

Update gentrap v0.4 schema and parsing: bam metrics section

parent 7d6e11c2
...@@ -155,7 +155,7 @@ ...@@ -155,7 +155,7 @@
"gentrap": { "gentrap": {
"description": "Sample-level Gentrap information", "description": "Sample-level Gentrap information",
"required": [ "files", "stats" ], "required": [ "files" ],
"properties": { "properties": {
...@@ -171,26 +171,9 @@ ...@@ -171,26 +171,9 @@
"type": "object", "type": "object",
"required": [ "alignment" ], "required": [ "alignment" ],
"additionalProperties": { "$ref": "#/definitions/file" } "additionalProperties": { "$ref": "#/definitions/file" }
},
"rna_metrics": {
"description": "Sample-level RNA-seq metrics",
"type": "object",
"required": [ "annotation", "metrics" ],
"additionalProperties": { "$ref": "#/definitions/file" }
} }
}, },
"additionalProperties": { "$ref": "#/definitions/fileGroup" } "additionalProperties": { "$ref": "#/definitions/fileGroup" }
},
"stats": {
"description": "Sample-level Gentrap statistics",
"type": "object",
"properties": {
"rna_metrics": { "$ref": "#/definitions/statsRnaSeqMetrics" }
}
} }
} }
}, },
...@@ -227,15 +210,17 @@ ...@@ -227,15 +210,17 @@
"stats": { "stats": {
"description": "Statistics gathered by BamMetrics", "description": "Statistics gathered by BamMetrics",
"type": "object", "type": "object",
"required": [ "alignment_metrics", "biopet_flagstat" ], "required": [ "CollectAlignmentSummaryMetrics", "biopet_flagstat", "rna" ],
"properties": { "properties": {
"alignment_metrics": { "$ref": "#/definitions/statsAlignmentSummary" }, "CollectAlignmentSummaryMetrics": { "$ref": "#/definitions/statsAlignmentSummary" },
"biopet_flagstat": { "$ref": "#/definitions/statsBiopetFlagstat" }, "biopet_flagstat": { "$ref": "#/definitions/statsBiopetFlagstat" },
"insert_size_metrics": { "$ref": "#/definitions/statsInsertSizeMetrics" } "rna": { "$ref": "#/definitions/statsRnaSeqMetrics" },
"CollectInsertSizeMetrics": { "$ref": "#/definitions/statsInsertSizeMetrics" }
} }
} }
} }
...@@ -462,15 +447,17 @@ ...@@ -462,15 +447,17 @@
"stats": { "stats": {
"description": "Statistics gathered by BamMetrics", "description": "Statistics gathered by BamMetrics",
"type": "object", "type": "object",
"required": [ "alignment_metrics", "biopet_flagstat" ], "required": [ "CollectAlignmentSummaryMetrics", "biopet_flagstat", "rna" ],
"properties": { "properties": {
"alignment_metrics": { "$ref": "#/definitions/statsAlignmentSummary" }, "CollectAlignmentSummaryMetrics": { "$ref": "#/definitions/statsAlignmentSummary" },
"biopet_flagstat": { "$ref": "#/definitions/statsBiopetFlagstat" }, "biopet_flagstat": { "$ref": "#/definitions/statsBiopetFlagstat" },
"insert_size_metrics": { "$ref": "#/definitions/statsInsertSizeMetrics" } "rna": { "$ref": "#/definitions/statsRnaSeqMetrics" },
"CollectInsertSizeMetrics": { "$ref": "#/definitions/statsInsertSizeMetrics" }
} }
} }
} }
...@@ -486,7 +473,7 @@ ...@@ -486,7 +473,7 @@
"files": { "files": {
"description": "File groups tracked by Gentrap", "description": "File groups tracked by Gentrap",
"type": "object", "type": "object",
"required": [ "pipeline", "rna_metrics" ], "required": [ "pipeline" ],
"properties": { "properties": {
...@@ -495,27 +482,9 @@ ...@@ -495,27 +482,9 @@
"type": "object", "type": "object",
"required": [ "alignment" ], "required": [ "alignment" ],
"additionalProperties": { "$ref": "#/definitions/file" } "additionalProperties": { "$ref": "#/definitions/file" }
},
"rna_metrics": {
"description": "Library-level RNA-seq metrics",
"type": "object",
"required": [ "annotation", "metrics" ],
"additionalProperties": { "$ref": "#/definitions/file" }
} }
}, },
"additionalProperties": { "$ref": "#/definitions/fileGroup" } "additionalProperties": { "$ref": "#/definitions/fileGroup" }
},
"stats": {
"description": "Library-level Gentrap statistics",
"type": "object",
"required": [ "rna_metrics" ],
"properties": {
"rna_metrics": { "$ref": "#/definitions/statsRnaSeqMetrics" }
}
} }
} }
} }
...@@ -747,32 +716,32 @@ ...@@ -747,32 +716,32 @@
"statsAlignmentSummaryPair": { "statsAlignmentSummaryPair": {
"description": "Alignment statistics gathered by Picard CollectAlignmentSummaryMetrics, per pair", "description": "Alignment statistics gathered by Picard CollectAlignmentSummaryMetrics, per pair",
"type": "object", "type": "object",
"required": [ "pct_chimeras", "pf_hq_aligned_reads", "pf_hq_error_rate", "pf_indel_rate", "pf_mismatch_rate", "required": [ "PCT_CHIMERAS", "PF_HQ_ALIGNED_READS", "PF_HQ_ERROR_RATE", "PF_INDEL_RATE", "PF_MISMATCH_RATE",
"pf_reads", "total_reads" ], "PF_READS", "TOTAL_READS" ],
"properties": { "properties": {
"bad_cycles": { "type": "integer" }, "BAD_CYCLES": { "type": "integer" },
"mean_read_length": { "type": "number" }, "MEAN_READ_LENGTH": { "type": "number" },
"pct_adapter": { "type": "number" }, "PCT_ADAPTER": { "type": "number" },
"pct_chimeras": { "type": "number" }, "PCT_CHIMERAS": { "type": "number" },
"pct_pf_reads": { "type": "number" }, "PCT_PF_READS": { "type": "number" },
"pct_pf_reads_aligned": { "type": "number" }, "PCT_PF_READS_ALIGNED": { "type": "number" },
"pct_reads_aligned_in_pairs": { "type": "number" }, "PCT_READS_ALIGNED_IN_PAIRS": { "type": "number" },
"pf_aligned_bases": { "type": "integer" }, "PF_ALIGNED_BASES": { "type": "integer" },
"pf_hq_aligned_bases": { "type": "integer" }, "PF_HQ_ALIGNED_BASES": { "type": "integer" },
"pf_hq_aligned_q20_bases": { "type": "integer" }, "PF_HQ_ALIGNED_Q20_BASES": { "type": "integer" },
"pf_hq_aligned_reads": { "type": "integer" }, "PF_HQ_ALIGNED_READS": { "type": "integer" },
"pf_hq_error_rate": { "type": "number" }, "PF_HQ_ERROR_RATE": { "type": "number" },
"pf_hq_median_mismatches": { "type": "number" }, "PF_HQ_MEDIAN_MISMATCHES": { "type": "number" },
"pf_indel_rate": { "type": "number" }, "PF_INDEL_RATE": { "type": "number" },
"pf_mismatch_rate": { "type": "number" }, "PF_MISMATCH_RATE": { "type": "number" },
"pf_noise_reads": { "type": "integer" }, "PF_NOISE_READS": { "type": "integer" },
"pf_reads": { "type": "integer" }, "PF_READS": { "type": "integer" },
"pf_reads_aligned": { "type": "integer" }, "PF_READS_ALIGNED": { "type": "integer" },
"reads_aligned_in_pairs": { "type": "integer" }, "READS_ALIGNED_IN_PAIRS": { "type": "integer" },
"strand_balance": { "type": "number" }, "STRAND_BALANCE": { "type": "number" },
"total_reads": { "type": "integer" } "TOTAL_READS": { "type": "integer" }
} }
}, },
...@@ -816,66 +785,100 @@ ...@@ -816,66 +785,100 @@
"statsInsertSizeMetrics": { "statsInsertSizeMetrics": {
"description": "Alignment statistics gathered by Picard CollectInsertSizeMetrics tool", "description": "Alignment statistics gathered by Picard CollectInsertSizeMetrics tool",
"type": "object", "type": "object",
"required": [ "max_insert_size", "mean_insert_size", "median_insert_size", "min_insert_size", "required": [ "metrics" ],
"pair_orientation", "read_pairs", "standard_deviation" ],
"properties": { "properties": {
"max_insert_size": { "type": "integer" },
"mean_insert_size": { "type": "number" }, "metrics": {
"median_absolute_deviation": { "type": "number" }, "description": "Metrics values",
"median_insert_size": { "type": "number" }, "type": "object",
"min_insert_size": { "type": "integer" }, "required": [ "MAX_INSERT_SIZE", "MEAN_INSERT_SIZE", "MEDIAN_INSERT_SIZE", "MIN_INSERT_SIZE",
"pair_orientation": { "type": "string" }, "PAIR_ORIENTATION", "READ_PAIRS", "STANDARD_DEVIATION" ],
"read_pairs": { "type": "integer" },
"standard_deviation": { "type": "number" }, "properties": {
"width_of_10_percent": { "type": "integer" }, "MAX_INSERT_SIZE": { "type": "integer" },
"width_of_20_percent": { "type": "integer" }, "MEAN_INSERT_SIZE": { "type": "number" },
"width_of_30_percent": { "type": "integer" }, "MEDIAN_ABSOLUTE_DEVIATION": { "type": "number" },
"width_of_40_percent": { "type": "integer" }, "MEDIAN_INSERT_SIZE": { "type": "number" },
"width_of_50_percent": { "type": "integer" }, "MIN_INSERT_SIZE": { "type": "integer" },
"width_of_60_percent": { "type": "integer" }, "PAIR_ORIENTATION": { "type": "string" },
"width_of_70_percent": { "type": "integer" }, "READ_PAIRS": { "type": "integer" },
"width_of_80_percent": { "type": "integer" }, "STANDARD_DEVIATION": { "type": "number" },
"width_of_90_percent": { "type": "integer" }, "WIDTH_OF_10_PERCENT": { "type": "integer" },
"width_of_99_percent": { "type": "integer" } "WIDTH_OF_20_PERCENT": { "type": "integer" },
"WIDTH_OF_30_PERCENT": { "type": "integer" },
"WIDTH_OF_40_PERCENT": { "type": "integer" },
"WIDTH_OF_50_PERCENT": { "type": "integer" },
"WIDTH_OF_60_PERCENT": { "type": "integer" },
"WIDTH_OF_70_PERCENT": { "type": "integer" },
"WIDTH_OF_80_PERCENT": { "type": "integer" },
"WIDTH_OF_90_PERCENT": { "type": "integer" },
"WIDTH_OF_99_PERCENT": { "type": "integer" }
}
}
} }
}, },
"statsRnaSeqMetrics": { "statsRnaSeqMetrics": {
"description": "RNA-seq statistics gathered by Picard CollectRnaSeqMetrics tool", "description": "RNA-seq statistics gathered by Picard CollectRnaSeqMetrics tool",
"type": "object", "type": "object",
"required": [ "coding_bases", "correct_strand_reads", "ignored_reads", "incorrect_strand_reads", "required": [ "metrics" ],
"intergenic_bases", "intronic_bases", "median_3prime_bias", "median_5prime_bias",
"median_5prime_to_3prime_bias", "normalized_transcript_cov", "pf_aligned_bases", "pf_bases",
"utr_bases" ],
"properties": { "properties": {
"coding_bases": { "type": "integer" },
"correct_strand_reads": { "type": "integer" }, "metrics": {
"ignored_reads": { "type": "integer" }, "description": "Metrics values",
"incorrect_strand_reads": { "type": "integer" }, "type": "object",
"intergenic_bases": { "type": "integer" },
"intronic_bases": { "type": "integer" }, "required": [ "CODING_BASES", "CORRECT_STRAND_READS", "IGNORED_READS", "INCORRECT_STRAND_READS",
"median_3prime_bias": { "type": "number" }, "INTERGENIC_BASES", "INTRONIC_BASES", "MEDIAN_3PRIME_BIAS", "MEDIAN_5PRIME_BIAS",
"median_5prime_bias": { "type": "number" }, "MEDIAN_5PRIME_TO_3PRIME_BIAS", "PF_ALIGNED_BASES", "PF_BASES", "UTR_BASES" ],
"median_5prime_to_3prime_bias": { "type": ["number", "string"] },
"median_cv_coverage": { "type": "number" }, "properties": {
"normalized_transcript_cov": { "CODING_BASES": { "type": "integer" },
"type": "array", "CORRECT_STRAND_READS": { "type": "integer" },
"items": { "type": "number" } "IGNORED_READS": { "type": "integer" },
"INCORRECT_STRAND_READS": { "type": "integer" },
"INTERGENIC_BASES": { "type": "integer" },
"INTRONIC_BASES": { "type": "integer" },
"MEDIAN_3PRIME_BIAS": { "type": "number" },
"MEDIAN_5PRIME_BIAS": { "type": "number" },
"MEDIAN_5PRIME_TO_3PRIME_BIAS": { "type": ["number", "string"] },
"MEDIAN_CV_COVERAGE": { "type": "number" },
"PCT_CODING_BASES": { "type": "number" },
"PCT_CORRECT_STRAND_READS": { "type": "number" },
"PCT_INTERGENIC_BASES": { "type": "number" },
"PCT_INTRONIC_BASES": { "type": "number" },
"PCT_MRNA_BASES": { "type": "number" },
"PCT_RIBOSOMAL_BASES": { "type": ["number", "string"] },
"PCT_USABLE_BASES": { "type": "number" },
"PCT_UTR_BASES": { "type": "number" },
"PF_ALIGNED_BASES": { "type": "integer" },
"PF_BASES": { "type": "integer" },
"RIBOSOMAL_BASES": { "type": ["integer", "string"] },
"UTR_BASES": { "type": "integer" }
}
}, },
"pct_coding_bases": { "type": "number" },
"pct_correct_strand_reads": { "type": "number" }, "histogram": {
"pct_intergenic_bases": { "type": "number" },
"pct_intronic_bases": { "type": "number" }, "description": "Histogram values (used for plotting).",
"pct_mrna_bases": { "type": "number" }, "type": "object",
"pct_ribosomal_bases": { "type": "number" }, "required": [ "All_Reads.normalized_coverage" ],
"pct_usable_bases": { "type": "number" },
"pct_utr_bases": { "type": "number" }, "properties": {
"pf_aligned_bases": { "type": "integer" },
"pf_bases": { "type": "integer" }, "normalized_position": {
"ribosomal_bases": { "type": "integer" }, "type": "array",
"utr_bases": { "type": "integer" } "items": { "type": "number" }
},
"All_Reads.normalized_coverage": {
"type": "array",
"items": { "type": "number" }
}
}
}
} }
} }
} }
......
...@@ -91,34 +91,35 @@ class GentrapV04InputProcessor(protected val mongo: MongodbAccessObject) ...@@ -91,34 +91,35 @@ class GentrapV04InputProcessor(protected val mongo: MongodbAccessObject)
/** Extracts alignment statistics from a sample or library entry in a Gentrap summary. */ /** Extracts alignment statistics from a sample or library entry in a Gentrap summary. */
private[processors] def extractAlnStats(effJson: JValue): GentrapAlignmentStats = { private[processors] def extractAlnStats(effJson: JValue): GentrapAlignmentStats = {
val isPaired = (effJson \ "bammetrics" \ "stats" \ "alignment_metrics" \ "PAIR") != JNothing val isPaired = (effJson \ "bammetrics" \ "stats" \ "CollectAlignmentSummaryMetrics" \ "PAIR") != JNothing
val alnMetrics = effJson \ "bammetrics" \ "stats" \ "alignment_metrics" \ val alnMetrics = effJson \ "bammetrics" \ "stats" \ "CollectAlignmentSummaryMetrics" \
(if (isPaired) "PAIR" else "UNPAIRED") (if (isPaired) "PAIR" else "UNPAIRED")
val bpFlagstat = effJson \ "bammetrics" \ "stats" \ "biopet_flagstat" val bpFlagstat = effJson \ "bammetrics" \ "stats" \ "biopet_flagstat"
val insMetrics = effJson \ "bammetrics" \ "stats" \ "insert_size_metrics" val insMetrics = effJson \ "bammetrics" \ "stats" \ "CollectInsertSizeMetrics" \ "metrics"
val rnaMetrics = effJson \ "gentrap" \ "stats" \ "rna_metrics" val rnaMetrics = effJson \ "bammetrics" \ "stats" \ "rna" \ "metrics"
val rnaHisto = effJson \ "bammetrics" \ "stats" \ "rna" \ "histogram"
GentrapAlignmentStats( GentrapAlignmentStats(
nReadsTotal = (alnMetrics \ "pf_reads").extract[Long], nReadsTotal = (alnMetrics \ "PF_READS").extract[Long],
nReadsAligned = (alnMetrics \ "pf_reads_aligned").extract[Long], nReadsAligned = (alnMetrics \ "PF_READS_ALIGNED").extract[Long],
nReadsSingleton = isPaired.option { (bpFlagstat \ "MateUnmapped").extract[Long] }, nReadsSingleton = isPaired.option { (bpFlagstat \ "MateUnmapped").extract[Long] },
nReadsProperPair = isPaired.option { (bpFlagstat \ "ProperPair").extract[Long] }, nReadsProperPair = isPaired.option { (bpFlagstat \ "ProperPair").extract[Long] },
rateReadsMismatch = (alnMetrics \ "pf_mismatch_rate").extract[Double], rateReadsMismatch = (alnMetrics \ "PF_MISMATCH_RATE").extract[Double],
rateIndel = (alnMetrics \ "pf_indel_rate").extract[Double], rateIndel = (alnMetrics \ "PF_INDEL_RATE").extract[Double],
pctChimeras = isPaired.option { (alnMetrics \ "pct_chimeras").extract[Double] }, pctChimeras = isPaired.option { (alnMetrics \ "PCT_CHIMERAS").extract[Double] },
maxInsertSize = (insMetrics \ "max_insert_size").extractOpt[Long], maxInsertSize = (insMetrics \ "MAX_INSERT_SIZE").extractOpt[Long],
medianInsertSize = (insMetrics \ "median_insert_size").extractOpt[Long], medianInsertSize = (insMetrics \ "MEDIAN_INSERT_SIZE").extractOpt[Long],
stdevInsertSize = (insMetrics \ "standard_deviation").extractOpt[Double], stdevInsertSize = (insMetrics \ "STANDARD_DEVIATION").extractOpt[Double],
nBasesAligned = (alnMetrics \ "pf_aligned_bases").extract[Long], nBasesAligned = (alnMetrics \ "PF_ALIGNED_BASES").extract[Long],
nBasesUtr = (rnaMetrics \ "utr_bases").extract[Long], nBasesUtr = (rnaMetrics \ "UTR_BASES").extract[Long],
nBasesCoding = (rnaMetrics \ "coding_bases").extract[Long], nBasesCoding = (rnaMetrics \ "CODING_BASES").extract[Long],
nBasesIntron = (rnaMetrics \ "intronic_bases").extract[Long], nBasesIntron = (rnaMetrics \ "INTRONIC_BASES").extract[Long],
nBasesIntergenic = (rnaMetrics \ "intergenic_bases").extract[Long], nBasesIntergenic = (rnaMetrics \ "INTERGENIC_BASES").extract[Long],
nBasesRibosomal = (rnaMetrics \ "ribosomal_bases").extractOpt[Long], nBasesRibosomal = (rnaMetrics \ "RIBOSOMAL_BASES").extractOpt[Long],
median5PrimeBias = (rnaMetrics \ "median_5prime_bias").extract[Double], median5PrimeBias = (rnaMetrics \ "MEDIAN_5PRIME_BIAS").extract[Double],
median3PrimeBias = (rnaMetrics \ "median_3prime_bias").extract[Double], median3PrimeBias = (rnaMetrics \ "MEDIAN_3PRIME_BIAS").extract[Double],
median5PrimeTo3PrimeBias = (rnaMetrics \ "median_5prime_to_3prime_bias").extractOpt[Double], median5PrimeTo3PrimeBias = (rnaMetrics \ "MEDIAN_5PRIME_TO_3PRIME_BIAS").extractOpt[Double],
normalizedTranscriptCoverage = (rnaMetrics \ "normalized_transcript_cov").extract[Seq[Double]]) normalizedTranscriptCoverage = (rnaHisto \ "All_Reads.normalized_coverage").extract[Seq[Double]])
} }
/** Extracts an input sequencing file from a library entry in a Gentrap summary. */ /** Extracts an input sequencing file from a library entry in a Gentrap summary. */
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment