Commit 75834095 authored by bow's avatar bow
Browse files

Merge branch 'feature-imptute_to_vcf' into 'develop'

Extract impute to vcf as separated pipeline



See merge request !413
parents 4c3c746b 0e2beeb1
......@@ -34,6 +34,7 @@ object BiopetExecutableMain extends BiopetExecutable {
nl.lumc.sasc.biopet.pipelines.gears.GearsSingle,
nl.lumc.sasc.biopet.pipelines.gears.Gears,
nl.lumc.sasc.biopet.pipelines.gwastest.GwasTest,
nl.lumc.sasc.biopet.pipelines.gwastest.impute.Impute2Vcf,
nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling,
nl.lumc.sasc.biopet.pipelines.basty.Basty,
nl.lumc.sasc.biopet.pipelines.shiva.Shiva,
......
......@@ -15,97 +15,35 @@
package nl.lumc.sasc.biopet.pipelines.gwastest
import java.io.File
import java.util
import htsjdk.samtools.reference.FastaSequenceFile
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand, Reference }
import nl.lumc.sasc.biopet.extensions.Snptest
import nl.lumc.sasc.biopet.extensions.gatk.{ CatVariants, SelectVariants }
import nl.lumc.sasc.biopet.extensions.tools.{ GensToVcf, SnptestToVcf }
import nl.lumc.sasc.biopet.pipelines.gwastest.impute.ImputeOutput
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.extensions.tools.SnptestToVcf
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
import org.broadinstitute.gatk.queue.QScript
import scala.collection.JavaConversions._
/**
* Created by pjvanthof on 16/03/16.
*/
class GwasTest(val root: Configurable) extends QScript with BiopetQScript with Reference {
def this() = this(null)
import GwasTest._
val inputVcf: Option[File] = config("input_vcf")
val inputVcf: File = config("input_vcf")
val phenotypeFile: File = config("phenotype_file")
val specsFile: Option[File] = config("imute_specs_file")
val inputGens: List[GensInput] = if (inputVcf.isDefined) List[GensInput]()
else config("input_gens", default = Nil).asList.map {
case value: Map[String, Any] =>
GensInput(new File(value("genotypes").toString),
value.get("info").map(x => new File(x.toString)),
value("contig").toString)
case value: util.LinkedHashMap[String, _] =>
GensInput(new File(value.get("genotypes").toString),
value.toMap.get("info").map(x => new File(x.toString)),
value.get("contig").toString)
case _ => throw new IllegalArgumentException
} ++ (specsFile match {
case Some(file) => imputeSpecsToGensInput(file, config("validate_specs", default = true))
case _ => Nil
})
override def dictRequired = true
override def defaults = Map("snptest" -> Map("genotype_field" -> "GP"))
/** Init for pipeline */
def init(): Unit = {
inputGens.foreach { g =>
val referenceDict = new FastaSequenceFile(referenceFasta(), true).getSequenceDictionary
if (referenceDict.getSequenceIndex(g.contig) == -1)
Logging.addError(s"Contig '${g.contig}' does not exist on reference: ${referenceFasta()}")
}
}
/** Pipeline itself */
def biopetScript(): Unit = {
val (vcfFile, chrVcfFiles): (File, Map[String, File]) = inputVcf.map((_, Map[String, File]())).getOrElse {
require(inputGens.nonEmpty, "No vcf file or gens files defined in config")
val outputDirGens = new File(outputDir, "gens_to_vcf")
val cvTotal = new CatVariants(this)
cvTotal.assumeSorted = true
cvTotal.outputFile = new File(outputDirGens, "merge.gens.vcf.gz")
val chrGens = inputGens.groupBy(_.contig).map {
case (contig, gens) =>
val cvChr = new CatVariants(this)
cvChr.assumeSorted = true
//cvChr.isIntermediate = true
cvChr.outputFile = new File(outputDirGens, s"$contig.merge.gens.vcf.gz")
gens.zipWithIndex.foreach { gen =>
val gensToVcf = new GensToVcf(this)
gensToVcf.inputGens = gen._1.genotypes
gensToVcf.inputInfo = gen._1.info
gensToVcf.contig = gen._1.contig
gensToVcf.samplesFile = phenotypeFile
gensToVcf.outputVcf = new File(outputDirGens, gen._1.genotypes.getName + s".${gen._2}.vcf.gz")
gensToVcf.isIntermediate = true
add(gensToVcf)
cvChr.variant :+= gensToVcf.outputVcf
}
add(cvChr)
cvTotal.variant :+= cvChr.outputFile
contig -> cvChr.outputFile
}
add(cvTotal)
(cvTotal.outputFile, chrGens)
}
val snpTests = BedRecordList.fromReference(referenceFasta())
.scatter(config("bin_size", default = 1000000))
.allRecords.map { region =>
......@@ -119,7 +57,7 @@ class GwasTest(val root: Configurable) extends QScript with BiopetQScript with R
bedFile.deleteOnExit()
val sv = new SelectVariants(this)
sv.variant = chrVcfFiles.getOrElse(region.chr, vcfFile)
sv.variant = inputVcf
sv.out = new File(regionDir, s"$name.vcf.gz")
sv.intervals :+= bedFile
sv.isIntermediate = true
......@@ -147,11 +85,4 @@ class GwasTest(val root: Configurable) extends QScript with BiopetQScript with R
}
}
object GwasTest extends PipelineCommand {
case class GensInput(genotypes: File, info: Option[File], contig: String)
def imputeSpecsToGensInput(specsFile: File, validate: Boolean = true): List[GensInput] = {
ImputeOutput.readSpecsFile(specsFile, validate)
.map(x => GensInput(x.gens, Some(x.gensInfo), x.chromosome))
}
}
\ No newline at end of file
object GwasTest extends PipelineCommand
\ No newline at end of file
package nl.lumc.sasc.biopet.pipelines.gwastest.impute
import java.io.File
import java.util
import htsjdk.samtools.reference.FastaSequenceFile
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand, Reference }
import nl.lumc.sasc.biopet.extensions.gatk.CatVariants
import nl.lumc.sasc.biopet.extensions.tools.GensToVcf
import nl.lumc.sasc.biopet.pipelines.gwastest.impute.Impute2Vcf.GensInput
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript
import scala.collection.JavaConversions._
/**
* Created by pjvan_thof on 27-5-16.
*/
class Impute2Vcf(val root: Configurable) extends QScript with BiopetQScript with Reference {
def this() = this(null)
val specsFile: Option[File] = config("imute_specs_file")
val phenotypeFile: File = config("phenotype_file")
val inputGens: List[GensInput] = config("input_gens", default = Nil).asList.map {
case value: Map[String, Any] =>
GensInput(new File(value("genotypes").toString),
value.get("info").map(x => new File(x.toString)),
value("contig").toString)
case value: util.LinkedHashMap[String, _] =>
GensInput(new File(value.get("genotypes").toString),
value.toMap.get("info").map(x => new File(x.toString)),
value.get("contig").toString)
case _ => throw new IllegalArgumentException
} ++ (specsFile match {
case Some(file) => Impute2Vcf.imputeSpecsToGensInput(file, config("validate_specs", default = true))
case _ => Nil
})
/** Init for pipeline */
def init(): Unit = {
inputGens.foreach { g =>
val referenceDict = new FastaSequenceFile(referenceFasta(), true).getSequenceDictionary
if (referenceDict.getSequenceIndex(g.contig) == -1)
Logging.addError(s"Contig '${g.contig}' does not exist on reference: ${referenceFasta()}")
}
}
/** Pipeline itself */
def biopetScript(): Unit = {
require(inputGens.nonEmpty, "No vcf file or gens files defined in config")
val cvTotal = new CatVariants(this)
cvTotal.assumeSorted = true
cvTotal.outputFile = new File(outputDir, "merge.gens.vcf.gz")
val chrGens = inputGens.groupBy(_.contig).map {
case (contig, gens) =>
val cvChr = new CatVariants(this)
cvChr.assumeSorted = true
//cvChr.isIntermediate = true
cvChr.outputFile = new File(outputDir, s"$contig.merge.gens.vcf.gz")
gens.zipWithIndex.foreach { gen =>
val gensToVcf = new GensToVcf(this)
gensToVcf.inputGens = gen._1.genotypes
gensToVcf.inputInfo = gen._1.info
gensToVcf.contig = gen._1.contig
gensToVcf.samplesFile = phenotypeFile
gensToVcf.outputVcf = new File(outputDir, gen._1.genotypes.getName + s".${gen._2}.vcf.gz")
gensToVcf.isIntermediate = true
add(gensToVcf)
cvChr.variant :+= gensToVcf.outputVcf
}
add(cvChr)
cvTotal.variant :+= cvChr.outputFile
contig -> cvChr.outputFile
}
add(cvTotal)
}
}
object Impute2Vcf extends PipelineCommand {
case class GensInput(genotypes: File, info: Option[File], contig: String)
def imputeSpecsToGensInput(specsFile: File, validate: Boolean = true): List[GensInput] = {
ImputeOutput.readSpecsFile(specsFile, validate)
.map(x => GensInput(x.gens, Some(x.gensInfo), x.chromosome))
}
}
\ No newline at end of file
......@@ -18,6 +18,7 @@ import java.io.File
import java.nio.file.Paths
import com.google.common.io.Files
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.config.Config
import org.broadinstitute.gatk.queue.QSettings
import org.scalatest.Matchers
......@@ -39,6 +40,7 @@ class GwasTestTest extends TestNGSuite with Matchers {
@Test
def testFromVcf: Unit = {
Logging.errors.clear()
val pipeline = initPipeline(GwasTestTest.config ++
Map("input_vcf" -> GwasTestTest.vcfFile.toString
)
......@@ -46,34 +48,6 @@ class GwasTestTest extends TestNGSuite with Matchers {
pipeline.script()
}
@Test
def testFromGens: Unit = {
val pipeline = initPipeline(GwasTestTest.config ++
Map("input_gens" -> List(Map("genotypes" -> GwasTestTest.vcfFile, "contig" -> "chrQ"))
)
)
pipeline.script()
}
@Test
def testWrongContig: Unit = {
val pipeline = initPipeline(GwasTestTest.config ++
Map("input_gens" -> List(Map("genotypes" -> GwasTestTest.vcfFile, "contig" -> "chrBla"))
)
)
intercept[IllegalStateException] {
pipeline.script()
}
}
@Test
def testFromSpecs: Unit = {
val pipeline = initPipeline(GwasTestTest.config ++
Map("imute_specs_file" -> GwasTestTest.resourcePath("/specs/files.specs"))
)
pipeline.script()
}
@Test
def testEmpty: Unit = {
val pipeline = initPipeline(GwasTestTest.config)
......@@ -101,8 +75,8 @@ object GwasTestTest {
}
val config = Map(
"reference_fasta" -> GwasTestTest.reference.toString,
"phenotype_file" -> GwasTestTest.phenotypeFile.toString,
"reference_fasta" -> reference.toString,
"phenotype_file" -> phenotypeFile.toString,
"output_dir" -> outputDir,
"snptest" -> Map("exe" -> "test"),
"md5sum" -> Map("exe" -> "test"),
......
package nl.lumc.sasc.biopet.pipelines.gwastest.impute
import java.io.File
import java.nio.file.Paths
import com.google.common.io.Files
import nl.lumc.sasc.biopet.utils.config.Config
import org.broadinstitute.gatk.queue.QSettings
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.Test
/**
* Created by pjvan_thof on 31-5-16.
*/
class Impute2VcfTest extends TestNGSuite with Matchers {
def initPipeline(map: Map[String, Any]): Impute2Vcf = {
new Impute2Vcf {
override def configNamespace = "impute2vcf"
override def globalConfig = new Config(map)
qSettings = new QSettings
qSettings.runName = "test"
}
}
@Test
def testFromGens: Unit = {
val pipeline = initPipeline(Impute2VcfTest.config ++
Map("input_gens" -> List(Map("genotypes" -> Impute2VcfTest.vcfFile, "contig" -> "chrQ"))
)
)
pipeline.script()
}
@Test
def testWrongContig: Unit = {
val pipeline = initPipeline(Impute2VcfTest.config ++
Map("input_gens" -> List(Map("genotypes" -> Impute2VcfTest.vcfFile, "contig" -> "chrBla"))
)
)
intercept[IllegalStateException] {
pipeline.script()
}
}
@Test
def testFromSpecs: Unit = {
val pipeline = initPipeline(Impute2VcfTest.config ++
Map("imute_specs_file" -> Impute2VcfTest.resourcePath("/specs/files.specs"))
)
pipeline.script()
}
@Test
def testEmpty: Unit = {
val pipeline = initPipeline(Impute2VcfTest.config)
intercept[IllegalArgumentException] {
pipeline.script()
}
}
}
object Impute2VcfTest {
val vcfFile = File.createTempFile("gwas.", ".vcf")
Files.touch(vcfFile)
vcfFile.deleteOnExit()
val phenotypeFile = File.createTempFile("gwas.", ".txt")
phenotypeFile.deleteOnExit()
val outputDir = Files.createTempDir()
outputDir.deleteOnExit()
val reference = new File(resourcePath("/fake_chrQ.fa"))
private def resourcePath(p: String): String = {
Paths.get(getClass.getResource(p).toURI).toString
}
val config = Map(
"reference_fasta" -> reference.toString,
"phenotype_file" -> phenotypeFile.toString,
"output_dir" -> outputDir,
"md5sum" -> Map("exe" -> "test"),
"gatk_jar" -> "test"
)
}
......@@ -20,6 +20,7 @@
package nl.lumc.sasc.biopet.pipelines.shiva.variantcallers
import nl.lumc.sasc.biopet.extensions.gatk
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.config.Configurable
/** Allele mode for Haplotypecaller */
......@@ -27,9 +28,11 @@ class HaplotypeCallerAllele(val root: Configurable) extends Variantcaller {
val name = "haplotypecaller_allele"
protected def defaultPrio = 5
val alleles: File = config("input_alleles")
def biopetScript() {
val hc = gatk.HaplotypeCaller(this, inputBams.values.toList, outputFile)
hc.alleles = config("input_alleles")
hc.alleles = Some(alleles)
hc.genotyping_mode = Some("GENOTYPE_GIVEN_ALLELES")
add(hc)
}
......
......@@ -27,9 +27,11 @@ class UnifiedGenotyperAllele(val root: Configurable) extends Variantcaller {
val name = "unifiedgenotyper_allele"
protected def defaultPrio = 9
val alleles: File = config("input_alleles")
def biopetScript() {
val ug = gatk.UnifiedGenotyper(this, inputBams.values.toList, outputFile)
ug.alleles = config("input_alleles")
ug.alleles = Some(alleles)
ug.genotyping_mode = Some("GENOTYPE_GIVEN_ALLELES")
add(ug)
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment