From ea03c8809adf5241d19bc72a3e5d610a6d62ac32 Mon Sep 17 00:00:00 2001 From: Peter van 't Hof <p.j.van_t_hof@lumc.nl> Date: Fri, 25 Mar 2016 09:51:11 +0100 Subject: [PATCH] Added spec validation --- .../biopet/pipelines/gwastest/GwasTest.scala | 40 ++++++++--- .../gwastest/impute/ImputeOutput.scala | 69 +++++++++++++++++++ .../pipelines/gwastest/impute/Spec.scala | 64 +++++++++++++++++ 3 files changed, 163 insertions(+), 10 deletions(-) create mode 100644 public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/ImputeOutput.scala create mode 100644 public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Spec.scala diff --git a/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala b/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala index 9c659718e..d0627bfbc 100644 --- a/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala +++ b/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala @@ -4,12 +4,15 @@ import java.io.File import java.util import nl.lumc.sasc.biopet.core.{ PipelineCommand, Reference, BiopetQScript } +import nl.lumc.sasc.biopet.extensions.Cat import nl.lumc.sasc.biopet.extensions.gatk.{ SelectVariants, CombineVariants } import nl.lumc.sasc.biopet.extensions.tools.GensToVcf import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.utils.intervals.BedRecordList import org.broadinstitute.gatk.queue.QScript +import nl.lumc.sasc.biopet.pipelines.gwastest.impute.ImputeOutput + import scala.collection.JavaConversions._ /** @@ -18,13 +21,15 @@ import scala.collection.JavaConversions._ class GwasTest(val root: Configurable) extends QScript with BiopetQScript with Reference { def this() = this(null) + import GwasTest._ + val inputVcf: Option[File] = config("input_vcf") val phenotypeFile: File = config("phenotype_file") - case class GensInput(genotypes: File, info: Option[File], contig: String) + val specsFile: Option[File] = config("imute_specs_file") - val inputBlaGens: List[GensInput] = if (inputVcf.isDefined) List[GensInput]() + val inputGens: List[GensInput] = if (inputVcf.isDefined) List[GensInput]() else config("input_gens", default = Nil).asList.map(x => x match { case value: Map[String, Any] => GensInput(new File(value("genotypes").toString), @@ -35,6 +40,9 @@ class GwasTest(val root: Configurable) extends QScript with BiopetQScript with R value.toMap.get("info").map(x => new File(x.toString)), value.get("contig").toString) case _ => throw new IllegalArgumentException + }) ++ (specsFile match { + case Some(file) => imputeSpecsToGensInput(file, config("validate_specs", default = true)) + case _ => Nil }) /** Init for pipeline */ @@ -44,18 +52,19 @@ class GwasTest(val root: Configurable) extends QScript with BiopetQScript with R /** Pipeline itself */ def biopetScript(): Unit = { val vcfFile: File = inputVcf.getOrElse { - require(inputBlaGens.nonEmpty, "No vcf file or gens files defined in config") + require(inputGens.nonEmpty, "No vcf file or gens files defined in config") val outputDirGens = new File(outputDir, "gens_to_vcf") val cv = new CombineVariants(this) cv.outputFile = new File(outputDirGens, "merge.gens.vcf.gz") cv.setKey = "null" - inputBlaGens.foreach { gen => + cv.genotypeMergeOptions = Some("UNSORTED") //TODO: should be a default + inputGens.zipWithIndex.foreach { gen => val gensToVcf = new GensToVcf(this) - gensToVcf.inputGens = gen.genotypes - gensToVcf.inputInfo = gen.info - gensToVcf.contig = gen.contig + gensToVcf.inputGens = gen._1.genotypes + gensToVcf.inputInfo = gen._1.info + gensToVcf.contig = gen._1.contig gensToVcf.samplesFile = phenotypeFile - gensToVcf.outputVcf = new File(outputDirGens, gen.genotypes.getName + ".vcf.gz") + gensToVcf.outputVcf = new File(outputDirGens, gen._1.genotypes.getName + s".${gen._2}.vcf.gz") gensToVcf.isIntermediate = true add(gensToVcf) cv.inputFiles :+= gensToVcf.outputVcf @@ -81,9 +90,20 @@ class GwasTest(val root: Configurable) extends QScript with BiopetQScript with R add(sv) //TODO: snptest - (region -> "") + region -> sv.outputFile } + val cat = new Cat(this) + cat.input = snpTests.map(_._2).toList + cat.output = new File(outputDir, "merge.txt") + add(cat) } } -object GwasTest extends PipelineCommand \ No newline at end of file +object GwasTest extends PipelineCommand { + case class GensInput(genotypes: File, info: Option[File], contig: String) + + def imputeSpecsToGensInput(specsFile: File, validate: Boolean = true): List[GensInput] = { + ImputeOutput.readSpecsFile(specsFile, validate) + .map(x => GensInput(x.gens, Some(x.gensInfo), x.chromosome)) + } +} \ No newline at end of file diff --git a/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/ImputeOutput.scala b/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/ImputeOutput.scala new file mode 100644 index 000000000..7ebc602ac --- /dev/null +++ b/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/ImputeOutput.scala @@ -0,0 +1,69 @@ +package nl.lumc.sasc.biopet.pipelines.gwastest.impute + +import java.io.File + +import nl.lumc.sasc.biopet.utils.Logging + +import Spec.SpecDecoderOps + +import scala.io.Source + +/** + * Created by pjvan_thof on 3/25/16. + */ +object ImputeOutput { + + case class Chunk(chromosome: String, summary: File, warnings: File, + gens: File, gensInfo: File, gensInfoBySample: File) + + def expandChunk(chromosome: String, basename: String) = Chunk( + chromosome, + new File(basename + ".gens_summary"), + new File(basename + ".gens_warnings"), + new File(basename + ".gens"), + new File(basename + ".gens_info"), + new File(basename + ".gens_info_by_sample") + ) + + def readSpecsFile(specsFile: File, validate: Boolean = true): List[Chunk] = { + val content = Source.fromFile(specsFile).mkString + val chunks = content.decode[List[Spec.ImputeOutput]] + .map(x => expandChunk(x.chromosome, specsFile.getParent + File.separator + x.name.split(File.separator).last)) + chunks.flatMap(validateChunk(_, validate)) + } + + val ASSESSMENT_HEADER = "-{32}\\n Imputation accuracy assessment \\n-{32}".r + val TOO_FEW_SNPS = "There are no SNPs in the imputation interval, so " + + "there is nothing for IMPUTE2 to analyze; the program will quit now." + + def validateChunk(chunk: Chunk, raiseErrors: Boolean = true): Option[Chunk] = { + + def addError(msg: String) = if (raiseErrors) Logging.addError(msg) else Logging.logger.warn(msg) + + if (!chunk.summary.exists()) { + Logging.addError(s"Summary file '${chunk.summary}' does not exist, please check Impute output") + None + } else if (!chunk.warnings.exists()) { + addError(s"Warnings file '${chunk.warnings}' does not exist, please check Impute output") + None + } else if (chunk.summary.canRead()) { + val summaryReader = Source.fromFile(chunk.summary) + val summaryLines = summaryReader.getLines().toList + summaryReader.close() + if (summaryLines.contains(TOO_FEW_SNPS)) None + else if (summaryLines.exists(ASSESSMENT_HEADER.findFirstIn(_).isDefined)) None + else if (!chunk.gens.exists()) { + addError(s"Gens file '${chunk.gens}' does not exist, please check Impute output") + None + } else if (!chunk.gensInfo.exists()) { + addError(s"GensInfo file '${chunk.gensInfo}' does not exist, please check Impute output") + None + } else if (!chunk.gensInfoBySample.exists()) { + addError(s"GensInfoBySample file '${chunk.gensInfoBySample}' does not exist, please check Impute output") + None + } else { + Some(chunk) + } + } else None + } +} diff --git a/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Spec.scala b/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Spec.scala new file mode 100644 index 000000000..e8ea1e49b --- /dev/null +++ b/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Spec.scala @@ -0,0 +1,64 @@ +package nl.lumc.sasc.biopet.pipelines.gwastest.impute + +import scala.util.parsing.combinator.JavaTokenParsers + +/** + * .spec file parser combinator. + * + * TODO: clean up. + * + * @author Matthijs Moed <M.H.Moed@lumc.nl> + */ +object Spec extends JavaTokenParsers { + def obj: Parser[Map[String, Any]] = + "{" ~> rep(member) <~ "}" ^^ (Map() ++ _) + + def arr: Parser[List[Any]] = "(" ~> repsep(value, ",") <~ ")" + + def key: Parser[String] = """([\w+])*+""".r + + def member: Parser[(String, Any)] = key ~ "=" ~ value <~ literal(";") ^^ { + case name ~ "=" ~ value => (name, value) + } + + // This is a hack to get around the fact that JavaTokenParsers' + // stringLiterals do not have their double quotes removed. They're stripped + // here, without any error checking. + // TODO: implement as token parser + + def string: Parser[String] = stringLiteral ^^ { + case s => s.substring(1, s.length - 1) + } + + def value: Parser[Any] = ( + obj + | arr + | string + | wholeNumber ^^ (_.toInt) + | floatingPointNumber ^^ (_.toDouble) + ) + + // + + trait SpecDecoder[T] { + def decode(obj: Map[String, Any]): T + } + + case class ImputeOutput(chromosome: String, name: String, orig: String) + + class ImputeOutputMappingDecoder extends SpecDecoder[List[ImputeOutput]] { + override def decode(obj: Map[String, Any]): List[ImputeOutput] = { + val dicts = obj("files").asInstanceOf[List[Map[String, String]]] + dicts.map(d => ImputeOutput(d("chromosome"), d("name"), d("orig"))) + } + } + + implicit val imputeOutputMappingDecoder = new ImputeOutputMappingDecoder + + implicit class SpecDecoderOps(string: String) { + implicit def decode[T](implicit decoder: SpecDecoder[T]): T = { + decoder.decode(Spec.parseAll(Spec.obj, string).get) + } + } +} + -- GitLab