Commit 8a322f5c authored by bow

Merge branch 'feature-snp_test' into 'develop'

Snp test pipeline



See merge request !360
parents 0a110eed b3454f23
@@ -28,6 +28,7 @@
<module>../public/bam2wig</module>
<module>../public/carp</module>
<module>../public/toucan</module>
<module>../public/gwas-test</module>
<module>../public/shiva</module>
<module>../public/basty</module>
<module>../public/tinycap</module>
@@ -50,9 +50,16 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
writer.println("set -eubf")
writer.println("set -o pipefail")
lines.foreach(writer.println)
jobDelayTime.foreach(x => writer.println(s"sleep $x"))
writer.close()
}
/**
 * This value is used to let a job wait x number of seconds after it finishes.
 * This is only needed when there are storage delay issues.
 */
var jobDelayTime: Option[Int] = config("job_delay_time")
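As a rough illustration of what this does (not part of the merge request itself; the standalone object and names are made up for the sketch), the optional delay from the config simply becomes a trailing sleep line in the wrapper script written above:
// Standalone sketch, for illustration only: mirrors the writer logic above.
import java.io.{ File, PrintWriter }
object JobDelaySketch {
  def writeWrapper(script: File, commandLines: List[String], jobDelayTime: Option[Int]): Unit = {
    val writer = new PrintWriter(script)
    writer.println("set -eubf")
    writer.println("set -o pipefail")
    commandLines.foreach(writer.println)
    jobDelayTime.foreach(x => writer.println(s"sleep $x")) // let the job wait x seconds after it finishes
    writer.close()
  }
  def main(args: Array[String]): Unit =
    writeWrapper(new File("wrapper.sh"), List("echo done"), jobDelayTime = Some(10))
}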
// This overrides the default "sh" from queue. For Biopet the default is "bash"
updateJobRun = {
case jt: JobTemplate =>
@@ -174,6 +181,27 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
this
}
/**
 * This method can handle args that take multiple values for one arg name
 * @param argName Name of the arg, like "-h" or "--help"
 * @param values Values for this arg
 * @param groupSize Values must come in groups of this size, default is 1
 * @param minGroups Minimal number of groups required, default is 0; when no values are given the method returns an empty string
 * @param maxGroups Maximum number of groups allowed, default is 0 which means no limit
 * @return Command part for this arg
 */
def multiArg(argName: String, values: Iterable[Any], groupSize: Int = 1, minGroups: Int = 0, maxGroups: Int = 0): String = {
if (values.size % groupSize != 0)
Logging.addError(s"Arg '${argName}' values '${values}' do not fit a groupSize of ${groupSize}")
val groups = values.size / groupSize
if (groups < minGroups)
Logging.addError(s"Arg '${argName}' needs at least $minGroups group(s) of size $groupSize")
if (maxGroups > 0 && groups > maxGroups)
Logging.addError(s"Arg '${argName}' may have at most $maxGroups group(s) of size $groupSize")
if (values.nonEmpty) required(argName) + values.map(required(_)).mkString
else ""
}
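For illustration, a self-contained sketch of the grouping checks this method performs; the Biopet required() and Logging helpers are swapped for plain string formatting and sys.error here, which is an assumption for the example only:
object MultiArgSketch {
  def multiArg(argName: String, values: Seq[Any], groupSize: Int = 1, minGroups: Int = 0, maxGroups: Int = 0): String = {
    if (values.size % groupSize != 0)
      sys.error(s"Arg '$argName' values '$values' do not fit a groupSize of $groupSize")
    val groups = values.size / groupSize
    if (groups < minGroups) sys.error(s"Arg '$argName' needs at least $minGroups group(s) of size $groupSize")
    if (maxGroups > 0 && groups > maxGroups) sys.error(s"Arg '$argName' may have at most $maxGroups group(s) of size $groupSize")
    if (values.nonEmpty) (argName +: values.map(_.toString)).map(v => s" '$v'").mkString else ""
  }
  def main(args: Array[String]): Unit = {
    // -data takes (genotypes, samples) file pairs, hence groupSize = 2
    println(multiArg("-data", Seq("chr1.gens", "chr1.samples", "chr2.gens", "chr2.samples"), groupSize = 2))
  }
}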
@Output(required = false)
private[core] var stdoutFile: Option[File] = None
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.{ Version, Reference, BiopetCommandLineFunction }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
import scala.util.matching.Regex
/**
* Created by pjvan_thof on 3/25/16.
*/
class Snptest(val root: Configurable) extends BiopetCommandLineFunction with Reference with Version {
@Input(required = true)
var inputGenotypes: List[File] = Nil
@Input(required = true)
var inputSampleFiles: List[File] = Nil
@Output(required = false)
var logFile: Option[File] = None
@Output(required = false)
var outputFile: Option[File] = None
@Output(required = false)
var outputDatabaseFile: Option[File] = None
var assumeChromosome: Option[String] = config("assume_chromosome")
var genotypeField: Option[String] = config("genotype_field")
var genotypeProbabilityScale: Option[String] = config("genotype_probability_scale")
var haploidGenotypeCoding: Option[String] = config("haploid_genotype_coding")
var missingCode: Option[String] = config("missing_code")
var totalProbLimit: Option[Double] = config("total_prob_limit")
var tableName: Option[String] = config("table_name")
var useLongColumnNamingScheme: Boolean = config("use_long_column_naming_scheme", default = false)
var excludeSamples: List[String] = config("exclude_samples", default = Nil)
@Input(required = false)
val excludeSnp: Option[File] = config("exclude_snp")
var missThresh: Option[Double] = config("miss_thresh")
var baselinePhenotype: Option[String] = config("baseline_phenotype")
var bayesian: List[String] = config("bayesian", default = Nil)
var callThresh: Option[Double] = config("call_thresh")
var frequentist: List[String] = config("frequentist", default = Nil)
var fullParameterEstimates: Boolean = config("full_parameter_estimates", default = false)
var method: Option[String] = config("method")
var pheno: Option[String] = config("pheno")
var summaryStatsOnly: Boolean = config("summary_stats_only", default = false)
var covAll: Boolean = config("cov_all", default = false)
var covAllContinuous: Boolean = config("cov_all_continuous", default = false)
var covAllDiscrete: Boolean = config("cov_all_discrete", default = false)
var covNames: List[String] = config("cov_names", default = Nil)
var sexColumn: Option[String] = config("sex_column")
var stratifyOn: Option[String] = config("stratify_on")
var conditionOn: List[String] = config("condition_on", default = Nil)
var priorAdd: List[String] = config("prior_add", default = Nil)
var priorCov: List[String] = config("prior_cov", default = Nil)
var priorDom: List[String] = config("prior_dom", default = Nil)
var priorGen: List[String] = config("prior_gen", default = Nil)
var priorHet: List[String] = config("prior_het", default = Nil)
var priorRec: List[String] = config("prior_rec", default = Nil)
var tDf: Option[String] = config("t_df")
var tPrior: Boolean = config("t_prior", default = false)
var priorMqtQ: Option[String] = config("prior_mqt_Q")
var priorQtVb: Option[String] = config("prior_qt_V_b")
var priorQtVq: Option[String] = config("prior_qt_V_q")
var priorQtA: Option[String] = config("prior_qt_a")
var priorQtB: Option[String] = config("prior_qt_b")
var priorQtMeanB: Option[String] = config("prior_qt_mean_b")
var priorQtMeanQ: Option[String] = config("prior_qt_mean_q")
var meanBf: List[String] = config("mean_bf", default = Nil)
var analysisDescription: Option[String] = config("analysis_description")
var chunk: Option[Int] = config("chunk")
var debug: Boolean = config("debug", default = false)
var hwe: Boolean = config("hwe", default = false)
var lowerSampleLimit: Option[Int] = config("lower_sample_limit")
var overlap: Boolean = config("overlap", default = false)
var printids: Boolean = config("printids", default = false)
var quantileNormalisePhenotypes: Boolean = config("quantile_normalise_phenotypes", default = false)
var range: List[String] = config("range", default = Nil)
var renorm: Boolean = config("renorm", default = false)
var snpid: List[String] = config("snpid", default = Nil)
var useRawCovariates: Boolean = config("use_raw_covariates", default = false)
var useRawPhenotypes: Boolean = config("use_raw_phenotypes", default = false)
var noClobber: Boolean = config("no_clobber", default = false)
executable = config("exe", default = "snptest")
def versionCommand: String = executable + " -help"
def versionRegex: Regex = "Welcome to SNPTEST (.*)".r
override def beforeGraph(): Unit = {
super.beforeGraph()
require(inputGenotypes.length == inputSampleFiles.length)
}
def cmdLine: String = {
val data = inputGenotypes.zip(inputSampleFiles).flatMap(x => List(x._1, x._2))
required(executable) +
optional("-assume_chromosome", assumeChromosome) +
multiArg("-data", data, groupSize = 2) +
optional("-genotype_field", genotypeField) +
optional("-genotype_probability_scale", genotypeProbabilityScale) +
optional("-haploid_genotype_coding", haploidGenotypeCoding) +
optional("-missing_code", missingCode) +
optional("-total_prob_limit", totalProbLimit) +
optional("-log", logFile) +
optional("-o", outputFile) +
optional("-odb", outputDatabaseFile) +
optional("-table_name", tableName) +
conditional(useLongColumnNamingScheme, "-use_long_column_naming_scheme") +
multiArg("-exclude_samples", excludeSamples) +
optional("-exclude_snp", excludeSnp) +
optional("-miss_thresh", missThresh) +
optional("-baseline_phenotype", baselinePhenotype) +
multiArg("-bayesian", bayesian) +
optional("-call_thresh", callThresh) +
multiArg("-frequentist", frequentist) +
conditional(fullParameterEstimates, "-full_parameter_estimates") +
optional("-method", method) +
optional("-pheno", pheno) +
conditional(summaryStatsOnly, "-summary_stats_only") +
conditional(covAll, "-cov_all") +
conditional(covAllContinuous, "-cov_all_continuous") +
conditional(covAllDiscrete, "-cov_all_discrete") +
multiArg("-cov_names", covNames) +
optional("-sex_column", sexColumn) +
optional("-stratify_on", stratifyOn) +
multiArg("-condition_on", conditionOn) +
multiArg("-prior_add", priorAdd, groupSize = 4, maxGroups = 1) +
multiArg("-prior_cov", priorCov, groupSize = 2, maxGroups = 1) +
multiArg("-prior_dom", priorDom, groupSize = 4, maxGroups = 1) +
multiArg("-prior_gen", priorGen, groupSize = 6, maxGroups = 1) +
multiArg("-prior_het", priorHet, groupSize = 4, maxGroups = 1) +
multiArg("-prior_rec", priorRec, groupSize = 4, maxGroups = 1) +
optional("-t_df", tDf) +
conditional(tPrior, "-t_prior") +
optional("-prior_mqt_Q", priorMqtQ) +
optional("-prior_qt_V_b", priorQtVb) +
optional("-prior_qt_V_q", priorQtVq) +
optional("-prior_qt_a", priorQtA) +
optional("-prior_qt_b", priorQtB) +
optional("-prior_qt_mean_b", priorQtMeanB) +
optional("-prior_qt_mean_q", priorQtMeanQ) +
multiArg("-mean_bf", meanBf, groupSize = 2, maxGroups = 1) +
optional("-analysis_description", analysisDescription) +
optional("-analysis_name", analysisName) +
optional("-chunk", chunk) +
conditional(debug, "-debug") +
conditional(hwe, "-hwe") +
optional("-lower_sample_limit", lowerSampleLimit) +
conditional(overlap, "-overlap") +
conditional(printids, "-printids") +
conditional(quantileNormalisePhenotypes, "-quantile_normalise_phenotypes") +
multiArg("-range", range, groupSize = 2) +
conditional(renorm, "-renorm") +
multiArg("-snpid", snpid) +
conditional(useRawCovariates, "-use_raw_covariates") +
conditional(useRawPhenotypes, "-use_raw_phenotypes") +
conditional(noClobber, "-no_clobber")
}
}
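The -data argument explains the equal-length requirement in beforeGraph: every genotypes file is paired with its sample file and the pairs are flattened into one list, which multiArg then validates with groupSize = 2. A small self-contained sketch with made-up file names:
import java.io.File
object SnptestDataPairs {
  def main(args: Array[String]): Unit = {
    val inputGenotypes = List(new File("chr21.gens"), new File("chr22.gens"))
    val inputSampleFiles = List(new File("chr21.samples"), new File("chr22.samples"))
    require(inputGenotypes.length == inputSampleFiles.length, "each genotypes file needs a matching sample file")
    // zip + flatMap interleaves the lists: chr21.gens chr21.samples chr22.gens chr22.samples
    val data = inputGenotypes.zip(inputSampleFiles).flatMap(x => List(x._1, x._2))
    println(data.map(_.getName).mkString(" "))
  }
}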
@@ -23,7 +23,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction with Reference {
javaMainClass = classOf[org.broadinstitute.gatk.tools.CatVariants].getClass.getName
javaMainClass = classOf[org.broadinstitute.gatk.tools.CatVariants].getName
@Input(required = true)
var inputFiles: List[File] = Nil
@@ -34,6 +34,8 @@ class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction
@Input
var reference: File = null
var assumeSorted = false
override def beforeGraph(): Unit = {
super.beforeGraph()
if (reference == null) reference = referenceFasta()
@@ -42,7 +44,8 @@ class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction
override def cmdLine = super.cmdLine +
repeat("-V", inputFiles) +
required("-out", outputFile) +
required("-R", reference)
required("-R", reference) +
conditional(assumeSorted, "--assumeSorted")
}
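The javaMainClass change in this file swaps .getClass.getName for .getName; a quick sketch of why the old expression was wrong (any class works for the demonstration):
object JavaMainClassSketch {
  def main(args: Array[String]): Unit = {
    // classOf[X] is already a Class instance, so .getClass returns java.lang.Class itself
    println(classOf[String].getClass.getName) // prints "java.lang.Class"
    println(classOf[String].getName)          // prints "java.lang.String"
  }
}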
object CatVariants {
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.gatk
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
* Extension for SelectVariants from GATK
*
* Created by pjvan_thof on 2/26/15.
*/
class SelectVariants(val root: Configurable) extends Gatk {
val analysisType = "SelectVariants"
@Input(doc = "", required = true)
var inputFiles: List[File] = Nil
@Output(doc = "", required = true)
var outputFile: File = null
var excludeNonVariants: Boolean = false
var inputMap: Map[File, String] = Map()
def addInput(file: File, name: String): Unit = {
inputFiles :+= file
inputMap += file -> name
}
override def beforeGraph(): Unit = {
super.beforeGraph()
if (outputFile.getName.endsWith(".vcf.gz")) outputFiles :+= new File(outputFile.getAbsolutePath + ".tbi")
deps :::= inputFiles.filter(_.getName.endsWith("vcf.gz")).map(x => new File(x.getAbsolutePath + ".tbi"))
deps = deps.distinct
}
override def cmdLine = super.cmdLine +
(for (file <- inputFiles) yield {
inputMap.get(file) match {
case Some(name) => required("-V:" + name, file)
case _ => required("-V", file)
}
}).mkString +
required("-o", outputFile) +
conditional(excludeNonVariants, "--excludeNonVariants")
}
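A standalone sketch of the tagging logic in cmdLine above: inputs registered through addInput get a named -V:<name> tag, anything else falls back to a plain -V (file names and the simple string quoting are illustrative only):
import java.io.File
object SelectVariantsTagging {
  var inputFiles: List[File] = Nil
  var inputMap: Map[File, String] = Map()
  def addInput(file: File, name: String): Unit = {
    inputFiles :+= file
    inputMap += file -> name
  }
  def main(args: Array[String]): Unit = {
    addInput(new File("snps.vcf.gz"), "snp")
    inputFiles :+= new File("extra.vcf.gz") // added without a tag
    val vArgs = inputFiles.map { file =>
      inputMap.get(file) match {
        case Some(name) => s" -V:$name ${file.getPath}"
        case _          => s" -V ${file.getPath}"
      }
    }.mkString
    println(vArgs) // " -V:snp snps.vcf.gz -V extra.vcf.gz"
  }
}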
@@ -74,16 +74,14 @@
<artifactId>Sage</artifactId>
<version>${project.version}</version>
</dependency>
<!--
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Yamsvp</artifactId>
<artifactId>Kopisu</artifactId>
<version>${project.version}</version>
</dependency>
-->
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Kopisu</artifactId>
<artifactId>GwasTest</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
@@ -32,7 +32,8 @@ object BiopetExecutablePublic extends BiopetExecutable {
nl.lumc.sasc.biopet.pipelines.toucan.Toucan,
nl.lumc.sasc.biopet.pipelines.shiva.ShivaSvCalling,
nl.lumc.sasc.biopet.pipelines.gears.GearsSingle,
nl.lumc.sasc.biopet.pipelines.gears.Gears
nl.lumc.sasc.biopet.pipelines.gears.Gears,
nl.lumc.sasc.biopet.pipelines.gwastest.GwasTest
)
def pipelines: List[MainCommand] = List(
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.tools
import java.io.File
import nl.lumc.sasc.biopet.core.{ Reference, ToolCommandFunction }
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
/**
 * Biopet extension for the GensToVcf tool, which converts impute genotype (.gens) files to VCF
 */
class GensToVcf(val root: Configurable) extends ToolCommandFunction with Reference {
def toolObject = nl.lumc.sasc.biopet.tools.GensToVcf
@Input(doc = "Input genotypes file", required = true)
var inputGens: File = _
@Input(doc = "input Info file", required = false)
var inputInfo: Option[File] = None
@Input(required = true)
var samplesFile: File = _
@Input(required = true)
var reference: File = _
@Output(required = true)
var outputVcf: File = _
var contig: String = _
var sortInput: Boolean = false
override def defaultCoreMemory = 6.0
override def beforeGraph(): Unit = {
super.beforeGraph()
if (reference == null) reference = referenceFasta()
if (contig == null) throw new IllegalStateException("Contig is missing")
if (outputVcf.getName.endsWith(".vcf.gz")) outputFiles :+= new File(outputVcf.getAbsolutePath + ".tbi")
}
override def setupRetry(): Unit = {
super.setupRetry()
sortInput = true
}
override def cmdLine = super.cmdLine +
required("--inputGenotypes", inputGens) +
required("--inputInfo", inputInfo) +
required("--outputVcf", outputVcf) +
optional("--contig", contig) +
required("--referenceFasta", reference) +
required("--samplesFile", samplesFile) +
conditional(sortInput, "--sortInput")
}
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.tools
import java.io.File
import nl.lumc.sasc.biopet.core.{ Reference, ToolCommandFunction }
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
 * Biopet extension for the SnptestToVcf tool, which converts a SNPTEST info file to VCF
 */
class SnptestToVcf(val root: Configurable) extends ToolCommandFunction with Reference {
def toolObject = nl.lumc.sasc.biopet.tools.SnptestToVcf
@Input(doc = "input Info file", required = true)
var inputInfo: File = null
@Input(required = true)
var reference: File = _
@Output(required = true)
var outputVcf: File = _
var contig: String = _
override def beforeGraph(): Unit = {
super.beforeGraph()
if (reference == null) reference = referenceFasta()
if (contig == null) throw new IllegalStateException("Contig is missing")
//if (outputVcf.getName.endsWith(".vcf.gz")) outputFiles :+= new File(outputVcf.getAbsolutePath + ".tbi")
}
override def cmdLine = super.cmdLine +
required("--inputInfo", inputInfo) +
required("--outputVcf", outputVcf) +
optional("--contig", contig) +
required("--referenceFasta", reference)
}
package nl.lumc.sasc.biopet.tools
import java.io.File
import java.util
import htsjdk.samtools.reference.{ FastaSequenceFile, ReferenceSequenceFileFactory }
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.variantcontext.{ Allele, GenotypeBuilder, VariantContextBuilder }
import htsjdk.variant.vcf._
import nl.lumc.sasc.biopet.utils.ToolCommand
import scala.collection.JavaConversions._
import scala.io.Source
/**
* Created by pjvanthof on 15/03/16.
*/
object GensToVcf extends ToolCommand {
case class Args(inputGenotypes: File = null,
inputInfo: Option[File] = None,
outputVcf: File = null,
sampleFile: File = null,
referenceFasta: File = null,
contig: String = null,
sortInput: Boolean = false) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('g', "inputGenotypes") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(inputGenotypes = x)
} text "Input genotypes"
opt[File]('i', "inputInfo") maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(inputInfo = Some(x))
} text "Input info fields"
opt[File]('o', "outputVcf") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(outputVcf = x)
} text "Output vcf file"
opt[File]('s', "samplesFile") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(sampleFile = x)
} text "Samples file"
opt[File]('R', "referenceFasta") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(referenceFasta = x)
} text "reference fasta file"
opt[String]('c', "contig") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(contig = x)
} text "contig of impute file"
opt[Unit]("sortInput") maxOccurs 1 action { (x, c) =>
c.copy(sortInput = true)
} text "In memory sorting"
}
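// For reference only (not part of the tool): an argument vector this parser would
// accept; all paths below are made up, and --inputInfo / --sortInput are optional.
//
//   --inputGenotypes chr22.gens --inputInfo chr22_info.txt --samplesFile cohort.samples \
//   --referenceFasta reference.fa --contig chr22 --outputVcf chr22.vcf.gz --sortInput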
def main(args: Array[String]): Unit = {
logger.info("Start")
val argsParser = new OptParser
val cmdArgs = argsParser.parse(args, Args()).getOrElse(throw new IllegalArgumentException)
val samples = Source.fromFile(cmdArgs.sampleFile).getLines().toArray.drop(2).map(_.split("\t").take(2).mkString("_"))
val infoIt = cmdArgs.inputInfo.map(Source.fromFile(_).getLines())
val infoHeaderKeys = infoIt.map(_.next().split(" ").filterNot(x => x == "rs_id" || x == "position"))
val infoHeaderMap = infoHeaderKeys.map(_.zipWithIndex.toMap)
val metaLines = new util.HashSet[VCFHeaderLine]()
for (keys <- infoHeaderKeys; key <- keys)
metaLines.add(new VCFInfoHeaderLine(s"GENS_$key", 1, VCFHeaderLineType.String, ""))
metaLines.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, ""))
metaLines.add(new VCFFormatHeaderLine("GP", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Float, ""))
val reference = new FastaSequenceFile(cmdArgs.referenceFasta, true)