diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala index 5fe30c554d3c9ba40efb687629d50fff1c6a235c..54a23127c6037e115e5586c70a6b206eb277cd13 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala @@ -93,12 +93,12 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript => if (outputDir.getParentFile.canWrite || (outputDir.exists && outputDir.canWrite)) globalConfig.writeReport(qSettings.runName, new File(outputDir, ".log/" + qSettings.runName)) - else Logging.addError("Parent of output dir: '" + outputDir.getParent + "' is not writeable, outputdir can not be created") + else Logging.addError("Parent of output dir: '" + outputDir.getParent + "' is not writeable, output directory cannot be created") inputFiles.foreach { i => if (!i.file.exists()) Logging.addError(s"Input file does not exist: ${i.file}") if (!i.file.canRead) Logging.addError(s"Input file can not be read: ${i.file}") - if (!i.file.isAbsolute) Logging.addError(s"Input file should be an absulute path: ${i.file}") + if (!i.file.isAbsolute) Logging.addError(s"Input file should be an absolute path: ${i.file}") } functions.filter(_.jobOutputFile == null).foreach(f => { diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala index 42f504e191f205782e861a463adb0ad975332f45..97b4c74134a59ce94fea03ce0e843a5b6ee3adbc 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala @@ -17,16 +17,20 @@ package nl.lumc.sasc.biopet.extensions import java.io.File +import nl.lumc.sasc.biopet.core.summary.Summarizable import nl.lumc.sasc.biopet.utils.Logging import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction, Reference } +import nl.lumc.sasc.biopet.utils.tryToParseNumber import org.broadinstitute.gatk.utils.commandline.{ Input, Output } +import scala.io.Source + /** * Extension for VariantEffectPredictor * Created by ahbbollen on 15-1-15. */ -class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFunction with Reference with Version { +class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFunction with Reference with Version with Summarizable { executable = config("exe", submodule = "perl", default = "perl") var vepScript: String = config("vep_script") @@ -48,7 +52,7 @@ class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFu var everything: Boolean = config("everything", default = false) var force: Boolean = config("force", default = false) var no_stats: Boolean = config("no_stats", default = false) - var stats_text: Boolean = config("stats_text", default = false) + var stats_text: Boolean = config("stats_text", default = true) var html: Boolean = config("html", default = false) var cache: Boolean = config("cache", default = false) var humdiv: Boolean = config("humdiv", default = false) @@ -254,4 +258,47 @@ class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFu optional("--buffer_size", buffer_size) + optional("--failed", failed) + def summaryFiles: Map[String, File] = Map() + + def summaryStats: Map[String, Any] = { + if (stats_text) { + val stats_file: File = new File(output.getAbsolutePath + "_summary.txt") + parseStatsFile(stats_file) + } else { + Map() + } + } + + def parseStatsFile(file: File): Map[String, Any] = { + val contents = Source.fromFile(file).getLines().toList + val headers = getHeadersFromStatsFile(contents) + headers.foldLeft(Map.empty[String, Any])((acc, x) => acc + (x.replace(" ", "_") -> getBlockFromStatsFile(contents, x))) + } + + def getBlockFromStatsFile(contents: List[String], header: String): Map[String, Any] = { + var inBlock = false + var theMap: Map[String, Any] = Map() + for (x <- contents) { + val stripped = x.stripPrefix("[").stripSuffix("]") + if (stripped == header) { + inBlock = true + } else { + if (inBlock) { + val key = stripped.split('\t').head.replace(" ", "_") + val value = stripped.split('\t').last + theMap ++= Map(key -> tryToParseNumber(value, fallBack = true).getOrElse(value)) + } + } + if (stripped == "") { + inBlock = false + } + } + theMap + } + + def getHeadersFromStatsFile(contents: List[String]): List[String] = { + // block headers are of format '[block]' + contents.filter(_.startsWith("[")).filter(_.endsWith("]")).map(_.stripPrefix("[")).map(_.stripSuffix("]")) + } + } diff --git a/public/biopet-extensions/src/test/resources/vep.metrics b/public/biopet-extensions/src/test/resources/vep.metrics new file mode 100644 index 0000000000000000000000000000000000000000..c036c2b7dae5017c995132e3e2dd35d7d9cbcc13 --- /dev/null +++ b/public/biopet-extensions/src/test/resources/vep.metrics @@ -0,0 +1,349 @@ +[VEP run statistics] +VEP version (API) 77 (77) +Cache/Database /usr/local/Genomes/H.Sapiens/hg19/vep_cache_75/homo_sapiens/75 +Species homo_sapiens +Command line options -i test.vcf -o test.vep.vcf -v --everything --stats_text --cache --vcf --allow_non_variant --species homo_sapiens --dir /usr/local/Genomes/H.Sapiens/hg19/vep_cache_75 --fasta /usr/local/Genomes/H.Sapiens/hg19/vep_cache_75/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa --fork 2 --cache_version 75 --db_version 75 --failed 1 --offline +Start time 2016-03-03 14:37:44 +End time 2016-03-03 14:37:54 +Run time 10 seconds +Input file (format) test.vcf (VCF) +Output file test.vep.vcf [text] + +[General statistics] +Lines of input read 1000 +Variants processed 874 +Variants remaining after filtering 874 +Lines of output written 874 +Novel / existing variants 105 (12.0%) / 769 (88.0%) +Overlapped genes 132 +Overlapped transcripts 661 +Overlapped regulatory features 164 + +[Variant classes] +insertion 29 +deletion 51 +SNV 794 + +[Consequences (most severe)] +splice_donor_variant 3 +frameshift_variant 1 +inframe_insertion 2 +inframe_deletion 1 +missense_variant 39 +splice_region_variant 11 +synonymous_variant 46 +5_prime_UTR_variant 19 +3_prime_UTR_variant 36 +non_coding_transcript_exon_variant 95 +intron_variant 518 +upstream_gene_variant 48 +downstream_gene_variant 27 +intergenic_variant 28 + +[Consequences (all)] +splice_donor_variant 6 +frameshift_variant 2 +inframe_insertion 15 +inframe_deletion 6 +missense_variant 101 +splice_region_variant 60 +synonymous_variant 173 +coding_sequence_variant 1 +5_prime_UTR_variant 33 +3_prime_UTR_variant 108 +non_coding_transcript_exon_variant 279 +intron_variant 3311 +NMD_transcript_variant 437 +non_coding_transcript_variant 1102 +upstream_gene_variant 1486 +downstream_gene_variant 1779 +TF_binding_site_variant 12 +feature_elongation 97 +regulatory_region_variant 253 +feature_truncation 278 +intergenic_variant 28 + +[Coding consequences] +frameshift_variant 2 +inframe_insertion 15 +inframe_deletion 6 +missense_variant 101 +synonymous_variant 173 +coding_sequence_variant 1 + +[SIFT summary] +deleterious 22 +tolerated 76 + +[PolyPhen summary] +possibly damaging 7 +unknown 12 +probably damaging 12 +benign 70 + +[Variants by chromosome] +1 874 + +[Distribution of variants on chromosome 1] +0 225 +1 407 +2 242 +3 0 +4 0 +5 0 +6 0 +7 0 +8 0 +9 0 +10 0 +11 0 +12 0 +13 0 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +21 0 +22 0 +23 0 +24 0 +25 0 +26 0 +27 0 +28 0 +29 0 +30 0 +31 0 +32 0 +33 0 +34 0 +35 0 +36 0 +37 0 +38 0 +39 0 +40 0 +41 0 +42 0 +43 0 +44 0 +45 0 +46 0 +47 0 +48 0 +49 0 +50 0 +51 0 +52 0 +53 0 +54 0 +55 0 +56 0 +57 0 +58 0 +59 0 +60 0 +61 0 +62 0 +63 0 +64 0 +65 0 +66 0 +67 0 +68 0 +69 0 +70 0 +71 0 +72 0 +73 0 +74 0 +75 0 +76 0 +77 0 +78 0 +79 0 +80 0 +81 0 +82 0 +83 0 +84 0 +85 0 +86 0 +87 0 +88 0 +89 0 +90 0 +91 0 +92 0 +93 0 +94 0 +95 0 +96 0 +97 0 +98 0 +99 0 +100 0 +101 0 +102 0 +103 0 +104 0 +105 0 +106 0 +107 0 +108 0 +109 0 +110 0 +111 0 +112 0 +113 0 +114 0 +115 0 +116 0 +117 0 +118 0 +119 0 +120 0 +121 0 +122 0 +123 0 +124 0 +125 0 +126 0 +127 0 +128 0 +129 0 +130 0 +131 0 +132 0 +133 0 +134 0 +135 0 +136 0 +137 0 +138 0 +139 0 +140 0 +141 0 +142 0 +143 0 +144 0 +145 0 +146 0 +147 0 +148 0 +149 0 +150 0 +151 0 +152 0 +153 0 +154 0 +155 0 +156 0 +157 0 +158 0 +159 0 +160 0 +161 0 +162 0 +163 0 +164 0 +165 0 +166 0 +167 0 +168 0 +169 0 +170 0 +171 0 +172 0 +173 0 +174 0 +175 0 +176 0 +177 0 +178 0 +179 0 +180 0 +181 0 +182 0 +183 0 +184 0 +185 0 +186 0 +187 0 +188 0 +189 0 +190 0 +191 0 +192 0 +193 0 +194 0 +195 0 +196 0 +197 0 +198 0 +199 0 +200 0 +201 0 +202 0 +203 0 +204 0 +205 0 +206 0 +207 0 +208 0 +209 0 +210 0 +211 0 +212 0 +213 0 +214 0 +215 0 +216 0 +217 0 +218 0 +219 0 +220 0 +221 0 +222 0 +223 0 +224 0 +225 0 +226 0 +227 0 +228 0 +229 0 +230 0 +231 0 +232 0 +233 0 +234 0 +235 0 +236 0 +237 0 +238 0 +239 0 +240 0 +241 0 +242 0 +243 0 +244 0 +245 0 +246 0 +247 0 +248 0 +249 0 + +[Position in protein] +00-10% 29 +10-20% 63 +20-30% 34 +30-40% 20 +40-50% 23 +50-60% 25 +60-70% 31 +70-80% 11 +80-90% 32 +90-100% 30 diff --git a/public/biopet-extensions/src/test/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictorTest.scala b/public/biopet-extensions/src/test/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictorTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..54e5ef6733edd578653940eaa92eb77e7ea0f3c0 --- /dev/null +++ b/public/biopet-extensions/src/test/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictorTest.scala @@ -0,0 +1,37 @@ +package nl.lumc.sasc.biopet.extensions + +import java.io.File +import java.nio.file.Paths + +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +/** + * Created by ahbbollen on 3-3-16. + */ +class VariantEffectPredictorTest extends TestNGSuite with Matchers { + + + @Test + def testSummaryStats = { + val file = new File(Paths.get(getClass.getResource("/vep.metrics").toURI).toString) + + val vep = new VariantEffectPredictor(null) + val stats = vep.parseStatsFile(file) + + stats.contains("VEP_run_statistics") shouldBe true + stats.contains("General_statistics") shouldBe true + stats.contains("Consequences_(most_severe)") shouldBe true + stats.contains("Consequences_(all)") shouldBe true + stats.contains("Coding_consequences") shouldBe true + stats.contains("SIFT_summary") shouldBe true + stats.contains("PolyPhen_summary") shouldBe true + stats.contains("Variants_by_chromosome") shouldBe true + stats.contains("Distribution_of_variants_on_chromosome_1") shouldBe true + stats.contains("Position_in_protein") shouldBe true + + + } + +} diff --git a/public/toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala b/public/toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala index 7464366e69387664569796b1b523191abec7bb80..1ccb0ea95fedc96d6e967e073ef80a6c3abc0195 100644 --- a/public/toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala +++ b/public/toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala @@ -68,6 +68,7 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum vep.output = new File(outputDir, inputVCF.getName.stripSuffix(".gz").stripSuffix(".vcf") + ".vep.vcf") vep.isIntermediate = true add(vep) + addSummarizable(vep, "variant_effect_predictor") val normalizer = new VepNormalizer(this) normalizer.inputVCF = vep.output