diff --git a/biopet-aggregate/pom.xml b/biopet-aggregate/pom.xml index d105b1ef54883992abee5b83277df325c62a0686..ca3283801777ad853c1f690cdadd9557cc296602 100644 --- a/biopet-aggregate/pom.xml +++ b/biopet-aggregate/pom.xml @@ -28,6 +28,7 @@ <module>../public/bam2wig</module> <module>../public/carp</module> <module>../public/toucan</module> + <module>../public/gwas-test</module> <module>../public/shiva</module> <module>../public/basty</module> <module>../public/tinycap</module> diff --git a/docs/developer/example-pipeline.md b/docs/developer/example-pipeline.md index f4f6bb9d76da32118d6a89c3026165bd1669211e..432c4ec5711e09ab20539c0527f0efa2175bd125 100644 --- a/docs/developer/example-pipeline.md +++ b/docs/developer/example-pipeline.md @@ -138,7 +138,7 @@ Setting up the pipeline is done within the pipeline itself, fine-tuning is alway For our new pipeline, one should setup the (default) config options. -Since our pipeline is called `HelloPipeline`, the root of the configoptions will called `hellopipeline` (lowercaps). +Since our pipeline is called `HelloPipeline`, the root of the namespace for our pipeline will be called `hellopipeline` (lowercaps). ```json { diff --git a/docs/general/memory.md b/docs/general/memory.md index 95e38862e440238f0502fae40f1371cbc38459c8..c0fc6033c656550454fe11a27541c338d261ca9e 100644 --- a/docs/general/memory.md +++ b/docs/general/memory.md @@ -20,7 +20,7 @@ We assume here that the cluster will amplify those values by the number of threa - **residentFactor**: 1.2 - **vmemFactor**: 1.4, 2.0 for java jobs -This are de defaults of biopet but each extension in biopet can set their own defaults. As example the *bwa mem* tools +This are the defaults of biopet but each extension in biopet can set their own defaults. As example the *bwa mem* tools use by default 8 `threads` and `core_memory` of 6.0. ### Config diff --git a/docs/pipelines/basty.md b/docs/pipelines/basty.md index fdb21ae3ca1ed6cdcb1f373aa3265433b30f03e6..7da41338f2fc7f8327ca558eec9ca6d00948ad57 100644 --- a/docs/pipelines/basty.md +++ b/docs/pipelines/basty.md @@ -34,7 +34,7 @@ Please refer [to our mapping pipeline](mapping.md) for information about how the #### Required configuration values -| Submodule | Name | Type | Default | Function | +| namespace | Name | Type | Default | Function | | --------- | ---- | ---- | ------- | -------- | | shiva | variantcallers | List[String] | | Which variant caller to use | | - | output_dir | Path | Path to output directory | @@ -44,7 +44,7 @@ Please refer [to our mapping pipeline](mapping.md) for information about how the Specific configuration options additional to Basty are: -| Submodule | Name | Type | Default | Function | +| namespace | Name | Type | Default | Function | | --------- | ---- | ---- | ------- | -------- | | raxml | seed | Integer | 12345 | RAxML Random seed| | raxml | raxml_ml_model | String | GTRGAMMAX | RAxML model | diff --git a/docs/pipelines/shiva.md b/docs/pipelines/shiva.md index ead5f91acca3309831b4c00c56a3e17e3127d431..3c2bbaa98978f352ea5cd4ac5fe4a0b75286eb6c 100644 --- a/docs/pipelines/shiva.md +++ b/docs/pipelines/shiva.md @@ -105,7 +105,7 @@ To view all possible config options please navigate to our Gitlab wiki page <a href="https://git.lumc.nl/biopet/biopet/wikis/GATK-Variantcalling-Pipeline" target="_blank">Config</a> ### Required settings -| Namespace | Name | Type | Default | Function | +| Confignamespace | Name | Type | Default | Function | | ----------- | ---- | ---- | ------- | -------- | | - | output_dir | String | | Path to output directory | | Shiva | variantcallers | List[String] | | Which variant callers to use | @@ -113,7 +113,7 @@ To view all possible config options please navigate to our Gitlab wiki page ### Config options -| Namespace | Name | Type | Default | Function | +| ConfignNamespace | Name | Type | Default | Function | | ----------- | ---- | ----- | ------- | -------- | | shiva | species | String | unknown_species | Name of species, like H.sapiens | | shiva | reference_name | String | unknown_reference_name | Name of reference, like hg19 | @@ -161,7 +161,7 @@ The other mode, `library_variantcalling`, will call simultaneously call all libr The config for these therefore is: -| Namespace | Name | Type | Default | Function | +| namespace | Name | Type | Default | Function | | ----------- | ---- | ---- | ------- | -------- | | shiva | multisample_variantcalling | Boolean | true | Default, multisample calling | | shiva | single_sample_variantcalling | Boolean | false | Not-recommended, single sample, merged bam | diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantRecalibrator.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantRecalibrator.scala index 11560ea25e216a49515b618f03bbb9604accc731..e4fdf3d2d26875e7795802fbda50c00ac118c875 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantRecalibrator.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantRecalibrator.scala @@ -26,7 +26,7 @@ class VariantRecalibrator(val root: Configurable) extends org.broadinstitute.gat object VariantRecalibrator { def apply(root: Configurable, input: File, recal_file: File, tranches_file: File, indel: Boolean = false): VariantRecalibrator = { val vr = new VariantRecalibrator(root) { - override lazy val configName = "variantrecalibrator" + override lazy val configNamespace = "variantrecalibrator" override def configPath: List[String] = (if (indel) "indel" else "snp") :: super.configPath if (indel) { mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/Shiva.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/Shiva.scala index 86e51353983b35d5248fa7958b1f5c3c8ebffac0..244fae67a31938f4eadaa5228c979bf4a4cd9011 100644 --- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/Shiva.scala +++ b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/Shiva.scala @@ -20,15 +20,15 @@ class Shiva(val root: Configurable) extends QScript with ShivaTrait { qscript => def this() = this(null) - /** Make variantcalling submodule, this with the gatk modes in there */ + /** Make variantcalling namespace, this with the gatk modes in there */ override def makeVariantcalling(multisample: Boolean = false) = { if (multisample) new ShivaVariantcalling(qscript) { override def namePrefix = "multisample" - override def configName = "shivavariantcalling" + override def configNamespace = "shivavariantcalling" override def configPath: List[String] = super.configPath ::: "multisample" :: Nil } else new ShivaVariantcalling(qscript) { - override def configName = "shivavariantcalling" + override def configNamespace = "shivavariantcalling" } } diff --git a/protected/biopet-gatk-pipelines/src/test/scala/nl/lumc/sasc/biopet/pipelines/gatk/ShivaTest.scala b/protected/biopet-gatk-pipelines/src/test/scala/nl/lumc/sasc/biopet/pipelines/gatk/ShivaTest.scala index f1b326b7017b129eaf6edf0d44014af4f9507b7b..8be9ae9a79066c233c1c92b6dfb2f9cb1432385f 100644 --- a/protected/biopet-gatk-pipelines/src/test/scala/nl/lumc/sasc/biopet/pipelines/gatk/ShivaTest.scala +++ b/protected/biopet-gatk-pipelines/src/test/scala/nl/lumc/sasc/biopet/pipelines/gatk/ShivaTest.scala @@ -27,7 +27,7 @@ import org.testng.annotations.{ DataProvider, Test } class ShivaTest extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): Shiva = { new Shiva() { - override def configName = "shiva" + override def configNamespace = "shiva" override def globalConfig = new Config(ConfigUtils.mergeMaps(map, ShivaTest.config)) qSettings = new QSettings qSettings.runName = "test" diff --git a/protected/biopet-gatk-pipelines/src/test/scala/nl/lumc/sasc/biopet/pipelines/gatk/ShivaVariantcallingTest.scala b/protected/biopet-gatk-pipelines/src/test/scala/nl/lumc/sasc/biopet/pipelines/gatk/ShivaVariantcallingTest.scala index 14207e5a829de7fd7cae4738a10fd9f6b28c5546..f4576e5a10d380e572891cae1160d665ebbe8746 100644 --- a/protected/biopet-gatk-pipelines/src/test/scala/nl/lumc/sasc/biopet/pipelines/gatk/ShivaVariantcallingTest.scala +++ b/protected/biopet-gatk-pipelines/src/test/scala/nl/lumc/sasc/biopet/pipelines/gatk/ShivaVariantcallingTest.scala @@ -29,7 +29,7 @@ import scala.collection.mutable.ListBuffer class ShivaVariantcallingTest extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): ShivaVariantcalling = { new ShivaVariantcalling() { - override def configName = "shivavariantcalling" + override def configNamespace = "shivavariantcalling" override def globalConfig = new Config(ConfigUtils.mergeMaps(map, ShivaVariantcallingTest.config)) qSettings = new QSettings qSettings.runName = "test" diff --git a/public/bammetrics/src/test/scala/nl/lumc/sasc/biopet/pipelines/bammetrics/BamMetricsTest.scala b/public/bammetrics/src/test/scala/nl/lumc/sasc/biopet/pipelines/bammetrics/BamMetricsTest.scala index 1482e2afeb2633f6ed3000c2f399dfc5a0f1bad6..4a75d99e00d3a21324c8d5459d891e6435691185 100644 --- a/public/bammetrics/src/test/scala/nl/lumc/sasc/biopet/pipelines/bammetrics/BamMetricsTest.scala +++ b/public/bammetrics/src/test/scala/nl/lumc/sasc/biopet/pipelines/bammetrics/BamMetricsTest.scala @@ -37,7 +37,7 @@ class BamMetricsTest extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): BamMetrics = { new BamMetrics() { - override def configName = "bammetrics" + override def configNamespace = "bammetrics" override def globalConfig = new Config(map) qSettings = new QSettings qSettings.runName = "test" diff --git a/public/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/BastyTrait.scala b/public/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/BastyTrait.scala index 675d4d7557be2edf84c41494edfaa30af9d13a78..3202d4f37db17d1bb9b824c89e948e44d82fb829 100644 --- a/public/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/BastyTrait.scala +++ b/public/basty/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/BastyTrait.scala @@ -131,11 +131,11 @@ trait BastyTrait extends MultiSampleQScript { raxmlMl.p = Some(seed) raxmlMl.n = outputName + "_ml" raxmlMl.w = dirSufixRaxml - raxmlMl.N = config("ml_runs", default = 20, submodule = "raxml") + raxmlMl.N = config("ml_runs", default = 20, namespace = "raxml") add(raxmlMl) val r = new scala.util.Random(seed) - val numBoot = config("boot_runs", default = 100, submodule = "raxml").asInt + val numBoot = config("boot_runs", default = 100, namespace = "raxml").asInt val bootList = for (t <- 0 until numBoot) yield { val raxmlBoot = new Raxml(this) raxmlBoot.input = variants diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunction.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunction.scala index a413c8bad83199c38a1b156a67118d2e7262b50c..f367abc4c5c864215277872818ff306fb4c93a14 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunction.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunction.scala @@ -30,7 +30,7 @@ import scala.collection.JavaConversions._ /** Biopet command line trait to auto check executable and cluster values */ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction => - analysisName = configName + analysisName = configNamespace @Input(doc = "deps", required = false) var deps: List[File] = Nil @@ -50,9 +50,16 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction => writer.println("set -eubf") writer.println("set -o pipefail") lines.foreach(writer.println) + jobDelayTime.foreach(x => writer.println(s"sleep $x")) writer.close() } + /** + * This value is used to let you job wait a x number of second after it finish. + * This is ionly used when having storage delay issues + */ + var jobDelayTime: Option[Int] = config("job_delay_time") + // This overrides the default "sh" from queue. For Biopet the default is "bash" updateJobRun = { case jt: JobTemplate => @@ -73,6 +80,15 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction => def beforeGraph() {} override def freezeFieldValues() { + + this match { + case r: Reference => + if (r.dictRequired) deps :+= r.referenceDict + if (r.faiRequired) deps :+= r.referenceFai + deps = deps.distinct + case _ => + } + preProcessExecutable() beforeGraph() internalBeforeGraph() @@ -165,6 +181,27 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction => this } + /** + * This method can handle args that have multiple args for 1 arg name + * @param argName Name of the arg like "-h" or "--help" + * @param values Values for this arg + * @param groupSize Values must come in groups of x number, default is 1 + * @param minGroups Minimal groups that are required, default is 0, when 0 the method return en empty string + * @param maxGroups Max number of groups that can be filled here + * @return Command part of this arg + */ + def multiArg(argName: String, values: Iterable[Any], groupSize: Int = 1, minGroups: Int = 0, maxGroups: Int = 0): String = { + if (values.size % groupSize != 0) + Logging.addError(s"Arg '${argName}' values: '${values}' does not fit to a groupSize of ${groupSize}") + val groups = values.size / groupSize + if (groups < minGroups) + Logging.addError(s"Args '${argName}' need atleast $minGroups with size $groupSize") + if (maxGroups > 0 && groups > maxGroups) + Logging.addError(s"Args '${argName}' may only have $maxGroups with size $groupSize") + if (values.nonEmpty) required(argName) + values.map(required(_)).mkString + else "" + } + @Output(required = false) private[core] var stdoutFile: Option[File] = None diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetJavaCommandLineFunction.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetJavaCommandLineFunction.scala index 5722871028d92aeddd2723c22971a2e08e51107a..be3d05fcfa0dfb0a8a39d200cfa4c64f6802bfd9 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetJavaCommandLineFunction.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetJavaCommandLineFunction.scala @@ -19,7 +19,7 @@ import org.broadinstitute.gatk.queue.function.JavaCommandLineFunction /** Biopet commandline class for java based programs */ trait BiopetJavaCommandLineFunction extends JavaCommandLineFunction with BiopetCommandLineFunction { - executable = config("java", default = "java", submodule = "java", freeVar = false) + executable = config("java", default = "java", namespace = "java", freeVar = false) javaGCThreads = config("java_gc_threads", default = 4) javaGCHeapFreeLimit = config("java_gc_heap_freelimit", default = 10) diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetPipe.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetPipe.scala index af98f00969a0c0b0b77c59db903f7b3346ce3274..936096971632109dea09bfac739ae5e3f8c1e5e1 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetPipe.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetPipe.scala @@ -74,7 +74,7 @@ class BiopetPipe(val commands: List[BiopetCommandLineFunction]) extends BiopetCo override def defaultThreads = 0 val root: Configurable = commands.head.root - override def configName = commands.map(_.configName).mkString("-") + override def configNamespace = commands.map(_.configNamespace).mkString("-") def cmdLine: String = { "(" + commands.head.cmdLine + (if (commands.head.stdinFile.isDefined) { " < " + required(commands.head.stdinFile.map(_.getAbsoluteFile)) diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala index 54a23127c6037e115e5586c70a6b206eb277cd13..9545911b9a4de84f11a6fe0742ca2877aed7ae20 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala @@ -31,7 +31,7 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript => @Argument(doc = "JSON / YAML config file(s)", fullName = "config_file", shortName = "config", required = false) val configfiles: List[File] = Nil - @Argument(doc = "Config values, value should be formatted like 'key=value' or 'path:path:key=value'", fullName = "config_value", shortName = "cv", required = false) + @Argument(doc = "Config values, value should be formatted like 'key=value' or 'namespace:namespace:key=value'", fullName = "config_value", shortName = "cv", required = false) val configValues: List[String] = Nil /** Output directory of pipeline */ diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/CommandLineResources.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/CommandLineResources.scala index 8ef7eb746cd744021656c56ae28f3226c800a0a7..ac01cfa15a963e851cad1cc81ff627d030d39a9a 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/CommandLineResources.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/CommandLineResources.scala @@ -88,7 +88,7 @@ trait CommandLineResources extends CommandLineFunction with Configurable { else residentLimit = Some((_coreMemory + (0.5 * retryMultipler)) * residentFactor) if (!config.contains("vmem")) vmem = Some((_coreMemory * (vmemFactor + (0.5 * retryMultipler))) + "G") - jobName = configName + ":" + (if (firstOutput != null) firstOutput.getName else jobOutputFile) + jobName = configNamespace + ":" + (if (firstOutput != null) firstOutput.getName else jobOutputFile) } override def setupRetry(): Unit = { diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala index 86804903c16a0cbda64bf338a902fa6ac2dc5bb1..840a67d0abec0ba4dbb8b784d8a99ee1190dfb20 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala @@ -70,7 +70,7 @@ trait MultiSampleQScript extends SummaryQScript { qscript: QScript => def libDir = new File(sampleDir, "lib_" + libId) lazy val libTags: Map[String, Any] = - config("tags", default = Map(), freeVar = false, submodule = libId, path = List("samples", sampleId, "libraries")) + config("tags", default = Map(), freeVar = false, namespace = libId, path = List("samples", sampleId, "libraries")) def sampleId = sample.sampleId @@ -91,7 +91,7 @@ trait MultiSampleQScript extends SummaryQScript { qscript: QScript => val libraries: Map[String, Library] = libIds.map(id => id -> makeLibrary(id)).toMap lazy val sampleTags: Map[String, Any] = - config("tags", default = Map(), freeVar = false, submodule = sampleId, path = List("samples")) + config("tags", default = Map(), freeVar = false, namespace = sampleId, path = List("samples")) lazy val gender = { val g: Option[String] = sampleTags.get("gender").map(_.toString) diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/Reference.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/Reference.scala index 6a39ca3eb9c68205cf1b5354829b4bb87944bfa9..f00de9e17af5564d23a73bac0ad6645d1535ab24 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/Reference.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/Reference.scala @@ -61,25 +61,25 @@ trait Reference extends Configurable { } /** When set override this on true the pipeline with raise an exception when fai index is not found */ - protected def faiRequired = false + def faiRequired = false /** When set override this on true the pipeline with raise an exception when dict index is not found */ - protected def dictRequired = this.isInstanceOf[Summarizable] || this.isInstanceOf[SummaryQScript] + def dictRequired = this.isInstanceOf[Summarizable] || this.isInstanceOf[SummaryQScript] + + /** Returns the dict file belonging to the fasta file */ + def referenceDict = new File(referenceFasta().getAbsolutePath + .stripSuffix(".fa") + .stripSuffix(".fasta") + .stripSuffix(".fna") + ".dict") + + /** Returns the fai file belonging to the fasta file */ + def referenceFai = new File(referenceFasta().getAbsolutePath + ".fai") /** Returns the fasta file */ def referenceFasta(): File = { val file: File = config("reference_fasta") - if (config.contains("reference_fasta")) { - checkFasta(file) - - val dict = new File(file.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta").stripSuffix(".fna") + ".dict") - val fai = new File(file.getAbsolutePath + ".fai") - - this match { - case c: BiopetCommandLineFunction => c.deps :::= dict :: fai :: Nil - case _ => - } - } else { + if (config.contains("reference_fasta")) checkFasta(file) + else { val defaults = ConfigUtils.mergeMaps(this.defaults, this.internalDefaults) def getReferences(map: Map[String, Any]): Set[(String, String)] = (for ( diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/WriteDependencies.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/WriteDependencies.scala index 108ee694e54fc4dd126dfc82f90adb7470602cde..a43cb3890bcf3521e3be6645b4e175c59f84d143 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/WriteDependencies.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/WriteDependencies.scala @@ -36,7 +36,7 @@ object WriteDependencies extends Logging with Configurable { val cache: mutable.Map[String, Int] = mutable.Map() for (function <- functions) { val baseName = function match { - case f: Configurable => f.configName + case f: Configurable => f.configNamespace case f => f.getClass.getSimpleName } cache += baseName -> (cache.getOrElse(baseName, 0) + 1) diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/extensions/PythonCommandLineFunction.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/extensions/PythonCommandLineFunction.scala index e4c0b269568dbabbea6231edece9d76b5dc38dc2..40d8cd32108fc5babfb95bf5cb4b81b526f34f47 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/extensions/PythonCommandLineFunction.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/extensions/PythonCommandLineFunction.scala @@ -24,7 +24,7 @@ trait PythonCommandLineFunction extends BiopetCommandLineFunction { @Input(doc = "Python script", required = false) var pythonScript: File = _ - executable = config("exe", default = "python", submodule = "python", freeVar = false) + executable = config("exe", default = "python", namespace = "python", freeVar = false) protected var pythonScriptName: String = _ diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/report/ReportBuilder.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/report/ReportBuilder.scala index 2bfbe8f6afdc65b78e1eead4148a375425029575..41a974fcc7d7fdc4da13d0f8e47ae621c3b8ac13 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/report/ReportBuilder.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/report/ReportBuilder.scala @@ -24,6 +24,7 @@ import org.broadinstitute.gatk.utils.commandline.Input import org.fusesource.scalate.{ TemplateEngine, TemplateSource } import scala.collection.mutable +import scala.language.postfixOps /** * This trait is meant to make an extension for a report object @@ -66,15 +67,28 @@ trait ReportBuilderExtension extends ToolCommandFunction { trait ReportBuilder extends ToolCommand { - case class Args(summary: File = null, outputDir: File = null, pageArgs: mutable.Map[String, Any] = mutable.Map()) extends AbstractArgs + case class Args(summary: File = null, + outputDir: File = null, + pageArgs: mutable.Map[String, Any] = mutable.Map()) extends AbstractArgs class OptParser extends AbstractOptParser { + + head( + s""" + |$commandName - Generate HTML formatted report from a biopet summary.json + """.stripMargin + ) + opt[File]('s', "summary") unbounded () required () maxOccurs 1 valueName "<file>" action { (x, c) => c.copy(summary = x) - } + } validate { + x => if (x.exists) success else failure("Summary JSON file not found!") + } text "Biopet summary JSON file" + opt[File]('o', "outputDir") unbounded () required () maxOccurs 1 valueName "<file>" action { (x, c) => c.copy(outputDir = x) - } + } text "Output HTML report files to this directory" + opt[Map[String, String]]('a', "args") unbounded () action { (x, c) => c.copy(pageArgs = c.pageArgs ++ x) } @@ -142,9 +156,14 @@ trait ReportBuilder extends ToolCommand { // Static files that will be copied to the output folder, then file is added to [resourceDir] it's need to be added here also val extOutputDir: File = new File(cmdArgs.outputDir, "ext") - for (resource <- extFiles.par) { - IoUtils.copyStreamToFile(getClass.getResourceAsStream(resource.resourcePath), new File(extOutputDir, resource.targetPath), createDirs = true) - } + // Copy each resource files out to the report destination + extFiles.par.foreach( + resource => + IoUtils.copyStreamToFile( + getClass.getResourceAsStream(resource.resourcePath), + new File(extOutputDir, resource.targetPath), + createDirs = true) + ) logger.info("Parsing summary") setSummary = new Summary(cmdArgs.summary) @@ -165,7 +184,7 @@ trait ReportBuilder extends ToolCommand { /** This must be implemented, this will be the root page of the report */ def indexPage: ReportPage - /** This must be implemented, this will because the title of the report */ + /** This must be implemented, this will become the title of the report */ def reportName: String /** @@ -194,8 +213,9 @@ trait ReportBuilder extends ToolCommand { ) // Generating subpages - val jobs = for ((name, subPage) <- page.subPages.par) yield { - generatePage(summary, subPage, outputDir, path ::: name :: Nil, pageArgs) + val jobs = page.subPages.par.flatMap { + case (name, subPage) => Some(generatePage(summary, subPage, outputDir, path ::: name :: Nil, pageArgs)) + case _ => None } val output = ReportBuilder.renderTemplate("/nl/lumc/sasc/biopet/core/report/main.ssp", diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/summary/SummaryQScript.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/summary/SummaryQScript.scala index 0e947ba6c7d3bbf144c98feb1b7cb21601c6629f..619a31dacbdeeea9272198efc3f343ba109ccdcb 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/summary/SummaryQScript.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/summary/SummaryQScript.scala @@ -37,7 +37,7 @@ trait SummaryQScript extends BiopetQScript { qscript: QScript => private[summary] var summaryQScripts: List[SummaryQScript] = Nil /** Name of the pipeline in the summary */ - var summaryName = configName + var summaryName = configNamespace /** Must return a map with used settings for this pipeline */ def summarySettings: Map[String, Any] @@ -103,7 +103,7 @@ trait SummaryQScript extends BiopetQScript { qscript: QScript => if (writeSummary.md5sum) { if (!SummaryQScript.md5sumCache.contains(file)) { val md5sum = new Md5sum(this) { - override def configName = "md5sum" + override def configNamespace = "md5sum" override def cmdLine: String = super.cmdLine + " || " + required("echo") + required("error_on_capture " + input.toString) + " > " + required(output) diff --git a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/summary/WriteSummary.scala b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/summary/WriteSummary.scala index b888438d7ba2150fdb156067fbdc39bfe5db9b7f..56cab45bc8555b6cf88a40ac145054cd526e5c2e 100644 --- a/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/summary/WriteSummary.scala +++ b/public/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/summary/WriteSummary.scala @@ -75,16 +75,16 @@ class WriteSummary(val root: Configurable) extends InProcessFunction with Config def fetchVersion(f: QFunction): Option[(String, Any)] = { f match { case f: BiopetJavaCommandLineFunction with Version => - Some(f.configName -> Map("version" -> f.getVersion.getOrElse(None), + Some(f.configNamespace -> Map("version" -> f.getVersion.getOrElse(None), "java_md5" -> BiopetCommandLineFunction.executableMd5Cache.getOrElse(f.executable, None), "java_version" -> f.getJavaVersion, "jar_path" -> f.jarFile)) case f: BiopetCommandLineFunction with Version => - Some(f.configName -> Map("version" -> f.getVersion.getOrElse(None), + Some(f.configNamespace -> Map("version" -> f.getVersion.getOrElse(None), "md5" -> BiopetCommandLineFunction.executableMd5Cache.getOrElse(f.executable, None), "path" -> f.executable)) case f: Configurable with Version => - Some(f.configName -> Map("version" -> f.getVersion.getOrElse(None))) + Some(f.configNamespace -> Map("version" -> f.getVersion.getOrElse(None))) case _ => None } } diff --git a/public/biopet-core/src/test/scala/nl/lumc/sasc/biopet/core/summary/WriteSummaryTest.scala b/public/biopet-core/src/test/scala/nl/lumc/sasc/biopet/core/summary/WriteSummaryTest.scala index 5d4f89f8de4a1fcdebc69ab5ad81ad2f0881ce50..e4c31c51aa07581ba0bd6a3781af205e5b15238c 100644 --- a/public/biopet-core/src/test/scala/nl/lumc/sasc/biopet/core/summary/WriteSummaryTest.scala +++ b/public/biopet-core/src/test/scala/nl/lumc/sasc/biopet/core/summary/WriteSummaryTest.scala @@ -310,7 +310,7 @@ object WriteSummaryTest { stats: Map[String, Any] = Map(), c: Map[String, Any] = Map()) = new BiopetJavaCommandLineFunction with Summarizable with Version { override def globalConfig = new Config(c) - override def configName = "java_command" + override def configNamespace = "java_command" def root: Configurable = null def summaryStats: Map[String, Any] = stats def summaryFiles: Map[String, File] = files @@ -335,7 +335,7 @@ object WriteSummaryTest { c: Map[String, Any] = Map()) = new CommandLineFunction with Configurable with Summarizable with Version { override def globalConfig = new Config(c) - override def configName = "version_command" + override def configNamespace = "version_command" def root: Configurable = null def summaryFiles: Map[String, File] = files diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala index ee375163166ca858393bb95ee9f3bc7850896f84..6242da5d7378caf98add3d3c2888245621762895 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala @@ -40,7 +40,7 @@ class Fastqc(val root: Configurable) extends BiopetCommandLineFunction with Vers var output: File = null executable = config("exe", default = "fastqc") - var javaExe: String = config("exe", default = "java", submodule = "java", freeVar = false) + var javaExe: String = config("exe", default = "java", namespace = "java", freeVar = false) var kmers: Option[Int] = config("kmers") var quiet: Boolean = config("quiet", default = false) var noextract: Boolean = config("noextract", default = false) diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Pysvtools.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Pysvtools.scala index d9ee8e68a5e9237341a3838718523741b9c90465..e7b87696ba1b3556113ad0a97a34321b8fef08a7 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Pysvtools.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Pysvtools.scala @@ -33,7 +33,7 @@ class Pysvtools(val root: Configurable) extends BiopetCommandLineFunction { @Argument(doc = "Set flanking amount") var flanking: Option[Int] = config("flanking") - var exclusionRegions: List[File] = config("exclusion_regions") + var exclusionRegions: List[File] = config("exclusion_regions", default = Nil) var translocationsOnly: Boolean = config("translocations_only", default = false) @Output(doc = "Unzipped file", required = true) diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Snptest.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Snptest.scala new file mode 100644 index 0000000000000000000000000000000000000000..d11cfceb9e8b8aaeda7351897f6b58e5e634137c --- /dev/null +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Snptest.scala @@ -0,0 +1,172 @@ +package nl.lumc.sasc.biopet.extensions + +import java.io.File + +import nl.lumc.sasc.biopet.core.{ Version, Reference, BiopetCommandLineFunction } +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Output, Input } + +import scala.util.matching.Regex + +/** + * Created by pjvan_thof on 3/25/16. + */ +class Snptest(val root: Configurable) extends BiopetCommandLineFunction with Reference with Version { + @Input(required = true) + var inputGenotypes: List[File] = Nil + + @Input(required = true) + var inputSampleFiles: List[File] = Nil + + @Output(required = false) + var logFile: Option[File] = None + + @Output(required = false) + var outputFile: Option[File] = None + + @Output(required = false) + var outputDatabaseFile: Option[File] = None + + var assumeChromosome: Option[String] = config("assume_chromosome") + var genotypeField: Option[String] = config("genotype_field") + var genotypeProbabilityScale: Option[String] = config("genotype_probability_scale") + var haploidGenotypeCoding: Option[String] = config("haploid_genotype_coding") + var missingCode: Option[String] = config("missing_code") + var totalProbLimit: Option[Double] = config("total_prob_limit") + var tableName: Option[String] = config("table_name") + var useLongColumnNamingScheme: Boolean = config("use_long_column_naming_scheme", default = false) + + var excludeSamples: List[String] = config("exclude_samples", default = Nil) + + @Input(required = false) + val excludeSnp: Option[File] = config("exclude_snp") + + var missThresh: Option[Double] = config("miss_thresh") + var baselinePhenotype: Option[String] = config("baseline_phenotype") + var bayesian: List[String] = config("bayesian", default = Nil) + var callThresh: Option[Double] = config("call_thresh") + var frequentist: List[String] = config("frequentist", default = Nil) + var fullParameterEstimates: Boolean = config("full_parameter_estimates", default = false) + var method: Option[String] = config("method") + var pheno: Option[String] = config("pheno") + var summaryStatsOnly: Boolean = config("summary_stats_only", default = false) + + var covAll: Boolean = config("cov_all", default = false) + var covAllContinuous: Boolean = config("cov_all_continuous", default = false) + var covAllDiscrete: Boolean = config("cov_all_discrete", default = false) + var covNames: List[String] = config("cov_names", default = Nil) + var sexColumn: Option[String] = config("sex_column") + var stratifyOn: Option[String] = config("stratify_on") + + var conditionOn: List[String] = config("condition_on", default = Nil) + + var priorAdd: List[String] = config("prior_add", default = Nil) + var priorCov: List[String] = config("prior_cov", default = Nil) + var priorDom: List[String] = config("prior_dom", default = Nil) + var priorGen: List[String] = config("prior_gen", default = Nil) + var priorHet: List[String] = config("prior_het", default = Nil) + var priorRec: List[String] = config("prior_rec", default = Nil) + var tDf: Option[String] = config("t_df") + var tPrior: Boolean = config("t_prior", default = false) + + var priorMqtQ: Option[String] = config("prior_mqt_Q") + var priorQtVb: Option[String] = config("prior_qt_V_b") + var priorQtVq: Option[String] = config("prior_qt_V_q") + var priorQtA: Option[String] = config("prior_qt_a") + var priorQtB: Option[String] = config("prior_qt_b") + var priorQtMeanB: Option[String] = config("prior_qt_mean_b") + var priorQtMeanQ: Option[String] = config("prior_qt_mean_q") + + var meanBf: List[String] = config("mean_bf", default = Nil) + + var analysisDescription: Option[String] = config("analysis_description") + var chunk: Option[Int] = config("chunk") + var debug: Boolean = config("debug", default = false) + var hwe: Boolean = config("hwe", default = false) + var lowerSampleLimit: Option[Int] = config("lower_sample_limit") + var overlap: Boolean = config("overlap", default = false) + var printids: Boolean = config("printids", default = false) + var quantileNormalisePhenotypes: Boolean = config("quantile_normalise_phenotypes", default = false) + var range: List[String] = config("range", default = Nil) + var renorm: Boolean = config("renorm", default = false) + var snpid: List[String] = config("snpid", default = Nil) + var useRawCovariates: Boolean = config("use_raw_covariates", default = false) + var useRawPhenotypes: Boolean = config("use_raw_phenotypes", default = false) + var noClobber: Boolean = config("no_clobber", default = false) + + executable = config("exe", default = "snptest") + + def versionCommand: String = executable + " -help" + def versionRegex: Regex = "Welcome to SNPTEST (.*)".r + + override def beforeGraph(): Unit = { + super.beforeGraph() + require(inputGenotypes.length == inputSampleFiles.length) + } + + def cmdLine: String = { + val data = inputGenotypes.zip(inputSampleFiles).flatMap(x => List(x._1, x._2)) + required(executable) + + optional("-assume_chromosome", assumeChromosome) + + multiArg("-data", data, groupSize = 2) + + optional("-genotype_field", genotypeField) + + optional("-genotype_probability_scale", genotypeProbabilityScale) + + optional("-haploid_genotype_coding", haploidGenotypeCoding) + + optional("-missing_code", missingCode) + + optional("-total_prob_limit", totalProbLimit) + + optional("-log", logFile) + + optional("-o", outputFile) + + optional("-odb", outputDatabaseFile) + + optional("-table_name", tableName) + + conditional(useLongColumnNamingScheme, "-use_long_column_naming_scheme") + + multiArg("-exclude_samples", excludeSamples) + + optional("-exclude_snp", excludeSnp) + + optional("-miss_thresh", missThresh) + + optional("-baseline_phenotype", baselinePhenotype) + + multiArg("-bayesian", bayesian) + + optional("-call_thresh", callThresh) + + multiArg("-frequentist", frequentist) + + conditional(fullParameterEstimates, "-full_parameter_estimates") + + optional("-method", method) + + optional("-pheno", pheno) + + conditional(summaryStatsOnly, "-summary_stats_only") + + conditional(covAll, "-cov_all") + + conditional(covAllContinuous, "-cov_all_continuous") + + conditional(covAllDiscrete, "-cov_all_discrete") + + multiArg("-cov_names", covNames) + + optional("-sex_column", sexColumn) + + optional("-stratify_on", stratifyOn) + + multiArg("-condition_on", conditionOn) + + multiArg("-prior_add", priorAdd, groupSize = 4, maxGroups = 1) + + multiArg("-prior_cov", priorCov, groupSize = 2, maxGroups = 1) + + multiArg("-prior_dom", priorDom, groupSize = 4, maxGroups = 1) + + multiArg("-prior_gen", priorGen, groupSize = 6, maxGroups = 1) + + multiArg("-prior_het", priorHet, groupSize = 4, maxGroups = 1) + + multiArg("-prior_rec", priorRec, groupSize = 4, maxGroups = 1) + + optional("-t_df", tDf) + + conditional(tPrior, "-t_prior") + + optional("-prior_mqt_Q", priorMqtQ) + + optional("-prior_qt_V_b", priorQtVb) + + optional("-prior_qt_V_q", priorQtVq) + + optional("-prior_qt_a", priorQtA) + + optional("-prior_qt_b", priorQtB) + + optional("-prior_qt_mean_b", priorQtMeanB) + + optional("-prior_qt_mean_q", priorQtMeanQ) + + multiArg("-mean_bf", meanBf, groupSize = 2, maxGroups = 1) + + optional("-analysis_description", analysisDescription) + + optional("-analysis_name", analysisName) + + optional("-chunk", chunk) + + conditional(debug, "-debug") + + conditional(hwe, "-hwe") + + optional("-lower_sample_limit", lowerSampleLimit) + + conditional(overlap, "-overlap") + + conditional(printids, "-printids") + + conditional(quantileNormalisePhenotypes, "quantile_normalise_phenotypes") + + multiArg("-range", range, groupSize = 2) + + conditional(renorm, "-renorm") + + multiArg("-snpid", snpid) + + conditional(useRawCovariates, "-use_raw_covariates") + + conditional(useRawPhenotypes, "-use_raw_phenotypes") + + conditional(noClobber, "-no_clobber") + } +} diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala index 2017d4c30da42cce13d4b10ac7ca3142e3f5f178..bc8ae1e7a3072db7afeaa346425c38bb83a8e633 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala @@ -32,7 +32,7 @@ import scala.io.Source */ class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFunction with Reference with Version with Summarizable { - executable = config("exe", submodule = "perl", default = "perl") + executable = config("exe", namespace = "perl", default = "perl") var vepScript: String = config("vep_script") @Input(doc = "input VCF", required = true) diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bcftools/Bcftools.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bcftools/Bcftools.scala index 2291df063b26293132bbd95c9f8ccf56c8ddfd13..56a76ebbf8e443a4524433bf66b0271d3f4e5c6a 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bcftools/Bcftools.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bcftools/Bcftools.scala @@ -19,7 +19,7 @@ import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction } abstract class Bcftools extends BiopetCommandLineFunction with Version { override def subPath = "bcftools" :: super.subPath - executable = config("exe", default = "bcftools", submodule = "bcftools", freeVar = false) + executable = config("exe", default = "bcftools", namespace = "bcftools", freeVar = false) def versionCommand = executable def versionRegex = """Version: (.*)""".r override def versionExitcode = List(0, 1) diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/Bedtools.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/Bedtools.scala index 1933cbea798a11c314f267df611aa09bccb5f92c..02c1ed90a5e5007f9232e0a0523ae4b362c8dae2 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/Bedtools.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/Bedtools.scala @@ -20,7 +20,7 @@ import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction } /** General abstract class for bedtools extensions */ abstract class Bedtools extends BiopetCommandLineFunction with Version { override def subPath = "bedtools" :: super.subPath - executable = config("exe", default = "bedtools", submodule = "bedtools") + executable = config("exe", default = "bedtools", namespace = "bedtools") def versionCommand = executable + " --version" def versionRegex = """bedtools (.*)""".r } \ No newline at end of file diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala index d5e3fffcfaeb7d842743ecc9e8896e7de860bbed..ceceed5f64ba51a75b098e4cd1b18beaa4f1894d 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala @@ -23,7 +23,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output } class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction with Reference { - javaMainClass = classOf[org.broadinstitute.gatk.tools.CatVariants].getClass.getName + javaMainClass = classOf[org.broadinstitute.gatk.tools.CatVariants].getName @Input(required = true) var inputFiles: List[File] = Nil @@ -34,6 +34,8 @@ class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction @Input var reference: File = null + var assumeSorted = false + override def beforeGraph(): Unit = { super.beforeGraph() if (reference == null) reference = referenceFasta() @@ -42,7 +44,8 @@ class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction override def cmdLine = super.cmdLine + repeat("-V", inputFiles) + required("-out", outputFile) + - required("-R", reference) + required("-R", reference) + + conditional(assumeSorted, "--assumeSorted") } object CatVariants { diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/SelectVariants.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/SelectVariants.scala new file mode 100644 index 0000000000000000000000000000000000000000..65abc60985a05edcd07aee8646f5b0c04ca839ed --- /dev/null +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/SelectVariants.scala @@ -0,0 +1,62 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.gatk + +import java.io.File + +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +/** + * Extension for CombineVariants from GATK + * + * Created by pjvan_thof on 2/26/15. + */ +class SelectVariants(val root: Configurable) extends Gatk { + val analysisType = "SelectVariants" + + @Input(doc = "", required = true) + var inputFiles: List[File] = Nil + + @Output(doc = "", required = true) + var outputFile: File = null + + var excludeNonVariants: Boolean = false + + var inputMap: Map[File, String] = Map() + + def addInput(file: File, name: String): Unit = { + inputFiles :+= file + inputMap += file -> name + } + + override def beforeGraph(): Unit = { + super.beforeGraph() + if (outputFile.getName.endsWith(".vcf.gz")) outputFiles :+= new File(outputFile.getAbsolutePath + ".tbi") + deps :::= inputFiles.filter(_.getName.endsWith("vcf.gz")).map(x => new File(x.getAbsolutePath + ".tbi")) + deps = deps.distinct + } + + override def cmdLine = super.cmdLine + + (for (file <- inputFiles) yield { + inputMap.get(file) match { + case Some(name) => required("-V:" + name, file) + case _ => required("-V", file) + } + }).mkString + + required("-o", outputFile) + + conditional(excludeNonVariants, "--excludeNonVariants") +} diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkGeneral.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkGeneral.scala index 56f3060b1eb917019d9f6cfe40f353778c3fb7c4..f7d4b3313582c72893a02aae7ebbfa1b833d903d 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkGeneral.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/GatkGeneral.scala @@ -25,7 +25,7 @@ import org.broadinstitute.gatk.engine.phonehome.GATKRunReport import org.broadinstitute.gatk.queue.extensions.gatk.CommandLineGATK trait GatkGeneral extends CommandLineGATK with CommandLineResources with Reference with Version { - var executable: String = config("java", default = "java", submodule = "java", freeVar = false) + var executable: String = config("java", default = "java", namespace = "java", freeVar = false) override def subPath = "gatk" :: super.subPath diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVTools.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVTools.scala index a509f2e7b77149501091170b36087ed067971797..21f3019e19f56711aa1a35cdc3ac556a62a6baf8 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVTools.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/igvtools/IGVTools.scala @@ -24,7 +24,7 @@ import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction } * Created by wyleung on 5-1-15 */ abstract class IGVTools extends BiopetCommandLineFunction with Version { - executable = config("exe", default = "igvtools", submodule = "igvtools", freeVar = false) + executable = config("exe", default = "igvtools", namespace = "igvtools", freeVar = false) def versionCommand = executable + " version" def versionRegex = """IGV Version:? ([\w\.]*) .*""".r override def versionExitcode = List(0) diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2.scala index 5f8365efc793ca1903ee6eba609d6b87f3ebbf65..68e7e205a5213b058b18ecf85b73936ff8506b64 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/macs2/Macs2.scala @@ -23,7 +23,7 @@ import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction } * Created by sajvanderzeeuw on 12/19/14. */ abstract class Macs2 extends BiopetCommandLineFunction with Version { - executable = config("exe", default = "macs2", submodule = "macs2", freeVar = false) + executable = config("exe", default = "macs2", namespace = "macs2", freeVar = false) def versionCommand = executable + " --version" def versionRegex = """macs2 (.*)""".r override def versionExitcode = List(0, 1) diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/manwe/Manwe.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/manwe/Manwe.scala index b7a3c161a2e3992751923c06332205845e0403ff..09c518f617fe75023663ff4871e1caae678d4c6e 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/manwe/Manwe.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/manwe/Manwe.scala @@ -26,7 +26,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Output, Argument } * manwe [subcommand] */ abstract class Manwe extends BiopetCommandLineFunction { - executable = config("exe", default = "manwe", submodule = "manwe") + executable = config("exe", default = "manwe", namespace = "manwe") var manweConfig: File = createManweConfig(None) diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SortVcf.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SortVcf.scala new file mode 100644 index 0000000000000000000000000000000000000000..eb36ba9faad4980d28df0308162237b655e7ad4c --- /dev/null +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SortVcf.scala @@ -0,0 +1,61 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.picard + +import java.io.File + +import nl.lumc.sasc.biopet.core.Reference +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +/** Extension for picard SortVcf */ +class SortVcf(val root: Configurable) extends Picard with Reference { + javaMainClass = new picard.vcf.SortVcf().getClass.getName + + @Input(doc = "Input VCF(s) to be sorted. Multiple inputs must have the same sample names (in order)", required = true) + var input: File = _ + + @Output(doc = "Output VCF to be written.", required = true) + var output: File = _ + + @Input(doc = "Sequence dictionary to use", required = true) + var sequenceDictionary: File = _ + + override val dictRequired = true + + override def beforeGraph(): Unit = { + super.beforeGraph() + if (sequenceDictionary == null) sequenceDictionary = referenceDict + } + + /** Returns command to execute */ + override def cmdLine = super.cmdLine + + (if (inputAsStdin) required("INPUT=", new File("/dev/stdin"), spaceSeparated = false) + else required("INPUT=", input, spaceSeparated = false)) + + (if (outputAsStsout) required("OUTPUT=", new File("/dev/stdout"), spaceSeparated = false) + else required("OUTPUT=", output, spaceSeparated = false)) + + required("SEQUENCE_DICTIONARY=", sequenceDictionary, spaceSeparated = false) +} + +object SortVcf { + /** Returns default SortSam */ + def apply(root: Configurable, input: File, output: File): SortVcf = { + val sortVcf = new SortVcf(root) + sortVcf.input = input + sortVcf.output = output + sortVcf + } +} \ No newline at end of file diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/sambamba/Sambamba.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/sambamba/Sambamba.scala index b12ff9584da1090aa90970dbbd137c0bb6b792e9..f5153a987b7cbb78e59022bb601c0d816b1e3675 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/sambamba/Sambamba.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/sambamba/Sambamba.scala @@ -24,7 +24,7 @@ abstract class Sambamba extends BiopetCommandLineFunction with Version { override def subPath = "sambamba" :: super.subPath - executable = config("exe", default = "sambamba", submodule = "sambamba", freeVar = false) + executable = config("exe", default = "sambamba", namespace = "sambamba", freeVar = false) def versionCommand = executable def versionRegex = """sambamba v(.*)""".r override def versionExitcode = List(0, 1) diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/Samtools.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/Samtools.scala index f0703990369edb6791ffb11b61bdbbf0927a270a..ca8133896411ea979117dfcc0251e41625f45d36 100644 --- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/Samtools.scala +++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/Samtools.scala @@ -20,7 +20,7 @@ import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction } /** General class for samtools extensions */ abstract class Samtools extends BiopetCommandLineFunction with Version { override def subPath = "samtools" :: super.subPath - executable = config("exe", default = "samtools", submodule = "samtools", freeVar = false) + executable = config("exe", default = "samtools", namespace = "samtools", freeVar = false) def versionCommand = executable def versionRegex = """Version: (.*)""".r override def versionExitcode = List(0, 1) diff --git a/public/biopet-public-package/pom.xml b/public/biopet-public-package/pom.xml index 23f18d9b40a5d323fc8c4a387359d383436007ae..2272ea90129dd466c9eeaf09dad4a244b2a97965 100644 --- a/public/biopet-public-package/pom.xml +++ b/public/biopet-public-package/pom.xml @@ -74,16 +74,14 @@ <artifactId>Sage</artifactId> <version>${project.version}</version> </dependency> - <!-- <dependency> <groupId>nl.lumc.sasc</groupId> - <artifactId>Yamsvp</artifactId> + <artifactId>Kopisu</artifactId> <version>${project.version}</version> </dependency> - --> <dependency> <groupId>nl.lumc.sasc</groupId> - <artifactId>Kopisu</artifactId> + <artifactId>GwasTest</artifactId> <version>${project.version}</version> </dependency> <dependency> diff --git a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutablePublic.scala b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutablePublic.scala index a6553001fb63493fda08ab069cba37e1e8a2ebe5..8b21fd806b6ed32cf81d02f7adce43bad41e2562 100644 --- a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutablePublic.scala +++ b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutablePublic.scala @@ -32,7 +32,8 @@ object BiopetExecutablePublic extends BiopetExecutable { nl.lumc.sasc.biopet.pipelines.toucan.Toucan, nl.lumc.sasc.biopet.pipelines.shiva.ShivaSvCalling, nl.lumc.sasc.biopet.pipelines.gears.GearsSingle, - nl.lumc.sasc.biopet.pipelines.gears.Gears + nl.lumc.sasc.biopet.pipelines.gears.Gears, + nl.lumc.sasc.biopet.pipelines.gwastest.GwasTest ) def pipelines: List[MainCommand] = List( diff --git a/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/BaseCounter.scala b/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/BaseCounter.scala index d15d26a368f25080e7c0e54f668a2af1b2831699..751053b55bac3e60c09e0b3c0566648cbe997f49 100644 --- a/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/BaseCounter.scala +++ b/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/BaseCounter.scala @@ -37,7 +37,7 @@ class BaseCounter(val root: Configurable) extends ToolCommandFunction { var prefix: String = "output" - override def defaultCoreMemory = 3.0 + override def defaultCoreMemory = 5.0 override def defaultThreads = 4 def transcriptTotalCounts = new File(outputDir, s"$prefix.base.transcript.counts") @@ -89,6 +89,7 @@ class BaseCounter(val root: Configurable) extends ToolCommandFunction { nonStrandedMetaExonCounts, strandedMetaExonCounts, strandedSenseMetaExonCounts, strandedAntiSenseMetaExonCounts) jobOutputFile = new File(outputDir, s".$prefix.basecounter.out") + if (bamFile != null) deps :+= new File(bamFile.getAbsolutePath.stripSuffix(".bam") + ".bai") super.beforeGraph() } diff --git a/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/GensToVcf.scala b/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/GensToVcf.scala new file mode 100644 index 0000000000000000000000000000000000000000..10e5eca751baceeb659e3f602efa0fdc5093b9d0 --- /dev/null +++ b/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/GensToVcf.scala @@ -0,0 +1,74 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.tools + +import java.io.File + +import nl.lumc.sasc.biopet.core.{ Reference, ToolCommandFunction } +import nl.lumc.sasc.biopet.utils.Logging +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Output, Input } + +/** + * + */ +class GensToVcf(val root: Configurable) extends ToolCommandFunction with Reference { + def toolObject = nl.lumc.sasc.biopet.tools.GensToVcf + + @Input(doc = "Input genotypes file", required = true) + var inputGens: File = _ + + @Input(doc = "input Info file", required = false) + var inputInfo: Option[File] = None + + @Input(required = true) + var samplesFile: File = _ + + @Input(required = true) + var reference: File = _ + + @Output(required = true) + var outputVcf: File = _ + + var contig: String = _ + + var sortInput: Boolean = false + + override def defaultCoreMemory = 6.0 + + override def beforeGraph(): Unit = { + super.beforeGraph() + if (reference == null) reference = referenceFasta() + if (contig == null) throw new IllegalStateException("Contig is missing") + if (outputVcf.getName.endsWith(".vcf.gz")) outputFiles :+= new File(outputVcf.getAbsolutePath + ".tbi") + } + + override def setupRetry(): Unit = { + super.setupRetry() + sortInput = true + } + + override def cmdLine = super.cmdLine + + required("--inputGenotypes", inputGens) + + required("--inputInfo", inputInfo) + + required("--outputVcf", outputVcf) + + optional("--contig", contig) + + required("--referenceFasta", reference) + + required("--samplesFile", samplesFile) + + conditional(sortInput, "--sortInput") + +} + diff --git a/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/SnptestToVcf.scala b/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/SnptestToVcf.scala new file mode 100644 index 0000000000000000000000000000000000000000..e2f938872d070abcd14f889d562efa27da345e4f --- /dev/null +++ b/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/SnptestToVcf.scala @@ -0,0 +1,56 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.tools + +import java.io.File + +import nl.lumc.sasc.biopet.core.{ Reference, ToolCommandFunction } +import nl.lumc.sasc.biopet.utils.Logging +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +/** + * + */ +class SnptestToVcf(val root: Configurable) extends ToolCommandFunction with Reference { + def toolObject = nl.lumc.sasc.biopet.tools.SnptestToVcf + + @Input(doc = "input Info file", required = true) + var inputInfo: File = null + + @Input(required = true) + var reference: File = _ + + @Output(required = true) + var outputVcf: File = _ + + var contig: String = _ + + override def beforeGraph(): Unit = { + super.beforeGraph() + if (reference == null) reference = referenceFasta() + if (contig == null) throw new IllegalStateException("Contig is missing") + //if (outputVcf.getName.endsWith(".vcf.gz")) outputFiles :+= new File(outputVcf.getAbsolutePath + ".tbi") + } + + override def cmdLine = super.cmdLine + + required("--inputInfo", inputInfo) + + required("--outputVcf", outputVcf) + + optional("--contig", contig) + + required("--referenceFasta", reference) + +} + diff --git a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BaseCounter.scala b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BaseCounter.scala index 9e78a324b61fe1525f3f7ce6fdd26596819c6ed7..0add274bfbc477ab771466f9b4b5cdfc2024a1f1 100644 --- a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BaseCounter.scala +++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BaseCounter.scala @@ -70,7 +70,9 @@ object BaseCounter extends ToolCommand { logger.info("Finding overlapping genes") val overlapGenes = groupGenesOnOverlap(geneReader.getAll) - logger.info("Start reading bamFile") + counter = 0 + + logger.info(s"Start reading bamFile divided over ${overlapGenes.values.flatten.size} chunks") val counts = (for (genes <- overlapGenes.values.flatten.par) yield runThread(cmdArgs.bamFile, genes)).toList logger.info("Done reading bamFile") @@ -292,6 +294,8 @@ object BaseCounter extends ToolCommand { else samRecord.getReadNegativeStrandFlag == strand } + private[tools] var counter = 0 + private[tools] case class ThreadOutput(geneCounts: List[GeneCount], nonStrandedMetaExonCounts: List[(String, RegionCount)], strandedMetaExonCounts: List[(String, RegionCount)]) @@ -315,6 +319,8 @@ object BaseCounter extends ToolCommand { } bamReader.close() + counter += 1 + if (counter % 1000 == 0) logger.info(s"${counter} chunks done") ThreadOutput(counts.values.toList, metaExons, plusMetaExons ::: minMetaExons) } diff --git a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GensToVcf.scala b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GensToVcf.scala new file mode 100644 index 0000000000000000000000000000000000000000..96b5212ab06be4c0c1243f8d80015093c68c93f1 --- /dev/null +++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GensToVcf.scala @@ -0,0 +1,166 @@ +package nl.lumc.sasc.biopet.tools + +import java.io.File +import java.util + +import htsjdk.samtools.reference.{ FastaSequenceFile, ReferenceSequenceFileFactory } +import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder } +import htsjdk.variant.variantcontext.{ Allele, GenotypeBuilder, VariantContextBuilder } +import htsjdk.variant.vcf._ +import nl.lumc.sasc.biopet.utils.ToolCommand + +import scala.collection.JavaConversions._ +import scala.io.Source + +/** + * Created by pjvanthof on 15/03/16. + */ +object GensToVcf extends ToolCommand { + + case class Args(inputGenotypes: File = null, + inputInfo: Option[File] = None, + outputVcf: File = null, + sampleFile: File = null, + referenceFasta: File = null, + contig: String = null, + sortInput: Boolean = false) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('g', "inputGenotypes") required () maxOccurs 1 valueName "<file>" action { (x, c) => + c.copy(inputGenotypes = x) + } text "Input genotypes" + opt[File]('i', "inputInfo") maxOccurs 1 valueName "<file>" action { (x, c) => + c.copy(inputInfo = Some(x)) + } text "Input info fields" + opt[File]('o', "outputVcf") required () maxOccurs 1 valueName "<file>" action { (x, c) => + c.copy(outputVcf = x) + } text "Output vcf file" + opt[File]('s', "samplesFile") required () maxOccurs 1 valueName "<file>" action { (x, c) => + c.copy(sampleFile = x) + } text "Samples file" + opt[File]('R', "referenceFasta") required () maxOccurs 1 valueName "<file>" action { (x, c) => + c.copy(referenceFasta = x) + } text "reference fasta file" + opt[String]('c', "contig") required () maxOccurs 1 valueName "<file>" action { (x, c) => + c.copy(contig = x) + } text "contig of impute file" + opt[Unit]("sortInput") maxOccurs 1 action { (x, c) => + c.copy(sortInput = true) + } text "In memory sorting" + } + + def main(args: Array[String]): Unit = { + logger.info("Start") + + val argsParser = new OptParser + val cmdArgs = argsParser.parse(args, Args()).getOrElse(throw new IllegalArgumentException) + + val samples = Source.fromFile(cmdArgs.sampleFile).getLines().toArray.drop(2).map(_.split("\t").take(2).mkString("_")) + + val infoIt = cmdArgs.inputInfo.map(Source.fromFile(_).getLines()) + val infoHeaderKeys = infoIt.map(_.next().split(" ").filterNot(x => x == "rs_id" || x == "position")) + val infoHeaderMap = infoHeaderKeys.map(_.zipWithIndex.toMap) + + val metaLines = new util.HashSet[VCFHeaderLine]() + for (keys <- infoHeaderKeys; key <- keys) + metaLines.add(new VCFInfoHeaderLine(s"GENS_$key", 1, VCFHeaderLineType.String, "")) + + metaLines.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "")) + metaLines.add(new VCFFormatHeaderLine("GP", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Float, "")) + + val reference = new FastaSequenceFile(cmdArgs.referenceFasta, true) + require(reference.getSequenceDictionary.getSequence(cmdArgs.contig) != null, + s"contig '${cmdArgs.contig}' not found on reference") + + val header = new VCFHeader(metaLines, samples.toList) + header.setSequenceDictionary(reference.getSequenceDictionary) + val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder() + .setOutputFile(cmdArgs.outputVcf) + .setReferenceDictionary(header.getSequenceDictionary) + .build) + writer.writeHeader(header) + + val genotypeIt = Source.fromFile(cmdArgs.inputGenotypes).getLines() + + lazy val fastaFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(cmdArgs.referenceFasta, true, true) + + case class Line(genotype: String, info: Option[String]) + val lineIt: Iterator[Line] = { + val it = infoIt match { + case Some(x) => genotypeIt.zip(x).map(x => Line(x._1, Some(x._2))) + case _ => genotypeIt.map(x => Line(x, None)) + } + + if (cmdArgs.sortInput) { + logger.info("Start Sorting input files") + val list = it.toList + val pos = list.map { line => + val values = line.genotype.split(" ") + val p = values(2).toInt + val alt = values(4) + if (alt == "-") p - 1 + else p + } + list.zip(pos).sortBy(_._2).map(_._1).toIterator + } else it + } + + logger.info("Start processing genotypes") + var count = 0L + for (line <- lineIt) { + val genotypeValues = line.genotype.split(" ") + val infoValues = line.info.map(_.split(" ")) + + val (start, end, ref, alt) = { + val start = genotypeValues(2).toInt + if (genotypeValues(4) == "-") { + val seq = fastaFile.getSubsequenceAt(cmdArgs.contig, start - 1, start + genotypeValues(4).length - 1) + (start - 1, start + genotypeValues(4).length - 1, + Allele.create(new String(seq.getBases), true), Allele.create(new String(Array(seq.getBases.head)))) + } else { + val ref = Allele.create(genotypeValues(3), true) + (start, ref.length - 1 + start, Allele.create(genotypeValues(3), true), Allele.create(genotypeValues(4))) + } + } + val genotypes = samples.toList.zipWithIndex.map { + case (sampleName, index) => + val gps = Array( + genotypeValues(5 + (index * 3)), + genotypeValues(5 + (index * 3) + 1), + genotypeValues(5 + (index * 3) + 2) + ).map(_.toDouble) + val alleles = gps.indexOf(gps.max) match { + case 0 => List(ref, ref) + case 1 => List(ref, alt) + case 2 => List(alt, alt) + } + new GenotypeBuilder() + .name(sampleName) + .alleles(alleles) + .attribute("GP", gps) + .make() + } + + val infoMap = infoHeaderKeys.map(_.map(x => ("GENS_" + x) -> infoValues.get(infoHeaderMap.get(x))).toMap).getOrElse(Map()) + + val builder = (new VariantContextBuilder) + .chr(cmdArgs.contig) + .alleles(List(ref, alt)) + .attributes(infoMap) + .start(start) + .stop(end) + .genotypes(genotypes) + val id = genotypeValues(1) + if (id.startsWith(cmdArgs.contig + ":")) writer.add(builder.make()) + else writer.add(builder.id(id).make()) + count += 1 + if (count % 10000 == 0) logger.info(s"$count lines processed") + } + + logger.info(s"$count lines processed") + + writer.close() + + logger.info("Done") + } +} diff --git a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala index c93eaed3993830d9e1860b827d772b2c925b6983..91761c1965510f240af328bb54f4408d6388a75b 100644 --- a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala +++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala @@ -17,6 +17,8 @@ package nl.lumc.sasc.biopet.tools import java.io.{ File, PrintWriter } +import cern.jet.random.Binomial +import cern.jet.random.engine.RandomEngine import nl.lumc.sasc.biopet.utils.ToolCommand import scala.collection.mutable @@ -26,7 +28,7 @@ import scala.math.{ floor, round } object MpileupToVcf extends ToolCommand { case class Args(input: File = null, output: File = null, sample: String = null, minDP: Int = 8, minAP: Int = 2, - homoFraction: Double = 0.8, ploidy: Int = 2) extends AbstractArgs + homoFraction: Double = 0.8, ploidy: Int = 2, seqError: Double = 0.005) extends AbstractArgs class OptParser extends AbstractOptParser { opt[File]('I', "input") valueName "<file>" action { (x, c) => @@ -50,6 +52,9 @@ object MpileupToVcf extends ToolCommand { opt[Int]("ploidy") action { (x, c) => c.copy(ploidy = x) } + opt[Double]("seqError") action { (x, c) => + c.copy(seqError = x) + } } /** @@ -69,8 +74,10 @@ object MpileupToVcf extends ToolCommand { writer.println("##FORMAT=<ID=FREQ,Number=A,Type=Float,Description=\"Allele Frequency\">") writer.println("##FORMAT=<ID=RFC,Number=1,Type=Integer,Description=\"Reference Forward Reads\">") writer.println("##FORMAT=<ID=RRC,Number=1,Type=Integer,Description=\"Reference Reverse Reads\">") - writer.println("##FORMAT=<ID=AFC,Number=A,Type=Integer,Description=\"Alternetive Forward Reads\">") - writer.println("##FORMAT=<ID=ARC,Number=A,Type=Integer,Description=\"Alternetive Reverse Reads\">") + writer.println("##FORMAT=<ID=AFC,Number=A,Type=Integer,Description=\"Alternative Forward Reads\">") + writer.println("##FORMAT=<ID=ARC,Number=A,Type=Integer,Description=\"Alternative Reverse Reads\">") + writer.println("##FORMAT=<ID=SEQ-ERR,Number=.,Type=Float,Description=\"Probability to not be a sequence error with error rate " + commandArgs.seqError + "\">") + writer.println("##FORMAT=<ID=MA-SEQ-ERR,Number=1,Type=Float,Description=\"Minimal probability for all alternative alleles to not be a sequence error with error rate " + commandArgs.seqError + "\">") writer.println("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">") writer.println("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + commandArgs.sample) val inputStream = if (commandArgs.input != null) { @@ -94,7 +101,7 @@ object MpileupToVcf extends ToolCommand { } val reads = values(3).toInt val mpileup = values(4) - val qual = values(5) + //val qual = values(5) val counts: mutable.Map[String, Counts] = mutable.Map(ref.toUpperCase -> new Counts(0, 0)) @@ -137,23 +144,37 @@ object MpileupToVcf extends ToolCommand { } } + val binomial = new Binomial(reads, commandArgs.seqError, RandomEngine.makeDefault()) val info: ArrayBuffer[String] = ArrayBuffer("DP=" + reads) - val format: mutable.Map[String, String] = mutable.Map("DP" -> reads.toString) + val format: mutable.Map[String, Any] = mutable.Map("DP" -> reads.toString) val alt: ArrayBuffer[String] = new ArrayBuffer + var maSeqErr: Option[Double] = None format += ("RFC" -> counts(ref.toUpperCase).forward.toString) format += ("RRC" -> counts(ref.toUpperCase).reverse.toString) format += ("AD" -> (counts(ref.toUpperCase).forward + counts(ref.toUpperCase).reverse).toString) + format += ("SEQ-ERR" -> (1.0 - binomial.cdf(counts(ref.toUpperCase).forward + counts(ref.toUpperCase).reverse)).toString) if (reads >= commandArgs.minDP) for ((key, value) <- counts if key != ref.toUpperCase if value.forward + value.reverse >= commandArgs.minAP) { alt += key format += ("AD" -> (format("AD") + "," + (value.forward + value.reverse).toString)) + val seqErr = 1.0 - binomial.cdf(value.forward + value.reverse) + maSeqErr match { + case Some(x) if x < seqErr => + case _ => maSeqErr = Some(seqErr) + } + format += ("SEQ-ERR" -> (format("SEQ-ERR") + "," + seqErr.toString)) format += ("AFC" -> ((if (format.contains("AFC")) format("AFC") + "," else "") + value.forward)) format += ("ARC" -> ((if (format.contains("ARC")) format("ARC") + "," else "") + value.reverse)) format += ("FREQ" -> ((if (format.contains("FREQ")) format("FREQ") + "," else "") + round((value.forward + value.reverse).toDouble / reads * 1E4).toDouble / 1E2)) } + maSeqErr match { + case Some(x) => format += ("MA-SEQ-ERR" -> x) + case _ => + } + if (alt.nonEmpty) { - val ad = for (ad <- format("AD").split(",")) yield ad.toInt + val ad = for (ad <- format("AD").toString.split(",")) yield ad.toInt var left = reads - dels val gt = ArrayBuffer[Int]() diff --git a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/SnptestToVcf.scala b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/SnptestToVcf.scala new file mode 100644 index 0000000000000000000000000000000000000000..dc18f8770dce76c1f8193486f94b64934e098b26 --- /dev/null +++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/SnptestToVcf.scala @@ -0,0 +1,126 @@ +package nl.lumc.sasc.biopet.tools + +import java.io.File +import java.util + +import htsjdk.samtools.reference.FastaSequenceFile +import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, Options, VariantContextWriterBuilder } +import htsjdk.variant.variantcontext.{ Allele, VariantContextBuilder } +import htsjdk.variant.vcf._ +import nl.lumc.sasc.biopet.utils.ToolCommand + +import scala.collection.JavaConversions._ +import scala.io.Source + +/** + * Created by pjvanthof on 15/03/16. + */ +object SnptestToVcf extends ToolCommand { + + case class Args(inputInfo: File = null, + outputVcf: File = null, + referenceFasta: File = null, + contig: String = null) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('i', "inputInfo") required () maxOccurs 1 valueName "<file>" action { (x, c) => + c.copy(inputInfo = x) + } text "Input info fields" + opt[File]('o', "outputVcf") required () maxOccurs 1 valueName "<file>" action { (x, c) => + c.copy(outputVcf = x) + } text "Output vcf file" + opt[File]('R', "referenceFasta") required () maxOccurs 1 valueName "<file>" action { (x, c) => + c.copy(referenceFasta = x) + } text "reference fasta file" + opt[String]('c', "contig") required () maxOccurs 1 valueName "<file>" action { (x, c) => + c.copy(contig = x) + } text "contig of impute file" + } + + def main(args: Array[String]): Unit = { + logger.info("Start") + + val argsParser = new OptParser + val cmdArgs = argsParser.parse(args, Args()).getOrElse(throw new IllegalArgumentException) + + val infoIt = Source.fromFile(cmdArgs.inputInfo).getLines() + val infoHeader = infoIt.find(!_.startsWith("#")) + + infoHeader match { + case Some(header) => parseLines(header, infoIt, cmdArgs) + case _ => + writeEmptyVcf(cmdArgs.outputVcf, cmdArgs.referenceFasta) + logger.info("No header and records found in file") + } + + logger.info("Done") + } + + def writeEmptyVcf(outputVcf: File, referenceFasta: File): Unit = { + val reference = new FastaSequenceFile(referenceFasta, true) + val vcfHeader = new VCFHeader() + vcfHeader.setSequenceDictionary(reference.getSequenceDictionary) + val writer = new VariantContextWriterBuilder() + .setOutputFile(outputVcf) + .setReferenceDictionary(vcfHeader.getSequenceDictionary) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build + writer.writeHeader(vcfHeader) + writer.close() + } + + def parseLines(header: String, lineIt: Iterator[String], cmdArgs: Args): Unit = { + val headerKeys = header.split(" ") + val headerMap = headerKeys.zipWithIndex.toMap + require(headerKeys.size == headerMap.size, "Duplicates header keys found") + val metaLines = new util.HashSet[VCFHeaderLine]() + for ( + key <- headerKeys if key != "rsid" if key != "chromosome" if key != "position" if key != "alleleA" if key != "alleleB" if key != "alleleA" + ) metaLines.add(new VCFInfoHeaderLine(s"ST_$key", 1, VCFHeaderLineType.String, "")) + + val reference = new FastaSequenceFile(cmdArgs.referenceFasta, true) + require(reference.getSequenceDictionary.getSequence(cmdArgs.contig) != null, + s"contig '${cmdArgs.contig}' not found on reference") + + val vcfHeader = new VCFHeader(metaLines) + vcfHeader.setSequenceDictionary(reference.getSequenceDictionary) + val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder() + .setOutputFile(cmdArgs.outputVcf) + .setReferenceDictionary(vcfHeader.getSequenceDictionary) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build) + writer.writeHeader(vcfHeader) + + val infoKeys = for ( + key <- headerKeys if key != "rsid" if key != "chromosome" if key != "position" if key != "alleleA" if key != "alleleB" if key != "alleleA" + ) yield key + + var counter = 0 + for (line <- lineIt if !line.startsWith("#")) { + val values = line.split(" ") + require(values.size == headerKeys.size, "Number of values are not the same as number of header keys") + val alleles = List(Allele.create(values(headerMap("alleleA")), true), Allele.create(values(headerMap("alleleB")))) + val start = values(headerMap("position")).toLong + val end = alleles.head.length() + start - 1 + val rsid = values(headerMap("rsid")) + val builder = (new VariantContextBuilder) + .chr(cmdArgs.contig) + .alleles(alleles) + .start(start) + .stop(end) + .noGenotypes() + + val infoBuilder = infoKeys.foldLeft(builder) { case (a, b) => a.attribute("ST_" + b, values(headerMap(b)).replaceAll(";", ",")) } + + writer.add(infoBuilder.id(rsid.replaceAll(";", ",")).make()) + + counter += 1 + if (counter % 10000 == 0) logger.info(s"$counter lines processed") + } + + logger.info(s"$counter lines processed") + + writer.close() + + } +} diff --git a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala index d08b05c534896b33324297dde94d386bc2c863c7..cffe91f625699a517d0a7b07a60cd171d805917f 100644 --- a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala +++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala @@ -21,7 +21,6 @@ import htsjdk.variant.variantcontext.{ GenotypeType, VariantContext } import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder } import htsjdk.variant.vcf.VCFFileReader import nl.lumc.sasc.biopet.utils.ToolCommand -import nl.lumc.sasc.biopet.utils.config.Configurable import scala.collection.JavaConversions._ import scala.io.Source @@ -264,7 +263,7 @@ object VcfFilter extends ToolCommand { def minAlternateDepth(record: VariantContext, minAlternateDepth: Int, minSamplesPass: Int = 1): Boolean = { record.getGenotypes.count(genotype => { val AD = if (genotype.hasAD) List(genotype.getAD: _*) else Nil - if (AD.nonEmpty) AD.tail.count(_ >= minAlternateDepth) > 0 else true + if (AD.nonEmpty && minAlternateDepth >= 0) AD.tail.count(_ >= minAlternateDepth) > 0 else true }) >= minSamplesPass } @@ -276,8 +275,10 @@ object VcfFilter extends ToolCommand { * @return */ def minGenomeQuality(record: VariantContext, minGQ: Int, minSamplesPass: Int = 1): Boolean = { - record.getGenotypes.count(x => if (!x.hasGQ) false - else if (x.getGQ >= minGQ) true else false) >= minSamplesPass + record.getGenotypes.count(x => + if (minGQ == 0) true + else if (!x.hasGQ) false + else if (x.getGQ >= minGQ) true else false) >= minSamplesPass } /** diff --git a/public/biopet-tools/src/test/resources/gens.samples b/public/biopet-tools/src/test/resources/gens.samples new file mode 100644 index 0000000000000000000000000000000000000000..1789a409e45a4b78fd83d84808c60e5ba9a494f7 --- /dev/null +++ b/public/biopet-tools/src/test/resources/gens.samples @@ -0,0 +1,5 @@ +ID_1 ID_2 missing hip knee hand knee_or_hip sex age bmi +0 0 0 D D D D D P P +sample-1 fam-1 0.0 2 NA 2 2 1 59.0767123287671 27.1314117909924 +sample-2 fam-1 0.0 2 NA NA 2 2 48.4958904109589 23.8754325259516 +sample-3 fam-1 0.0 NA NA NA NA 2 57.0876712328767 45.8842609074008 diff --git a/public/biopet-tools/src/test/resources/test.empty.snptest b/public/biopet-tools/src/test/resources/test.empty.snptest new file mode 100644 index 0000000000000000000000000000000000000000..a5ca28471bb5f836dcc2bd61ca091f1a293c1a0f --- /dev/null +++ b/public/biopet-tools/src/test/resources/test.empty.snptest @@ -0,0 +1 @@ +# Completed successfully at 2016-04-08 13:47:05 diff --git a/public/biopet-tools/src/test/resources/test.gens b/public/biopet-tools/src/test/resources/test.gens new file mode 100644 index 0000000000000000000000000000000000000000..bec79dab840537cac62c1c009391297ae38e6fee --- /dev/null +++ b/public/biopet-tools/src/test/resources/test.gens @@ -0,0 +1,3 @@ +--- rs1 30 A C 1 0 0 0 1 0 0 0 1 +--- rs2 40 A - 1 0 0 0 1 0 0 0 1 +--- rs3 50 A AT 1 0 0 0 1 0 0 0 1 \ No newline at end of file diff --git a/public/biopet-tools/src/test/resources/test.gens_info b/public/biopet-tools/src/test/resources/test.gens_info new file mode 100644 index 0000000000000000000000000000000000000000..250a9677f1bf86cff13d7dee2eba0b9878963b8b --- /dev/null +++ b/public/biopet-tools/src/test/resources/test.gens_info @@ -0,0 +1,4 @@ +snp_id rs_id position exp_freq_a1 info certainty type info_type0 concord_type0 r2_type0 +--- rs1 30 0.037 0.535 0.958 0 -1 -1 -1 +--- rs2 40 0.023 0.621 0.978 0 -1 -1 -1 +--- rs3 50 0.000 0.114 0.999 0 -1 -1 -1 diff --git a/public/biopet-tools/src/test/resources/test.snptest b/public/biopet-tools/src/test/resources/test.snptest new file mode 100644 index 0000000000000000000000000000000000000000..4ddb3a25b54a638a0d3476670c17bd6cd57ff0ad --- /dev/null +++ b/public/biopet-tools/src/test/resources/test.snptest @@ -0,0 +1,14 @@ +# Analysis: "snptest" +# started: 2016-04-08 13:44:20 +# +# Analysis properties: +# -analysis_name snptest (user-supplied) +# -data /exports/sasc/project-171/analysis/snptest/18/18-22307785-33461676.vcf.gz /exports/sasc/project-171/src/Phenotypes_knee_hip_hand.fixed2.txt (user-supplied) +# -o /exports/sasc/project-171/analysis/snptest/18/18-22307785-33461676.snptest (user-supplied) +# -summary_stats_only (user-supplied) +# +alternate_ids rsid chromosome position alleleA alleleB index average_maximum_posterior_call info cohort_1_AA cohort_1_AB cohort_1_BB cohort_1_NULL all_AA all_AB all_BB all_NULL all_total all_maf missing_data_proportion comment +rs184556815 rs184556815 18 22307810 G T 1 1 1 2869 0 0 0 2869 0 0 0 2869 0 0 NA +rs143248971 rs143248971 18 22307875 C T 2 1 1 2869 0 0 0 2869 0 0 0 2869 0 0 NA +rs74666159 rs74666159 18 22307884 C T 3 1 1 2690 177 2 0 2690 177 2 0 2869 0.0315441 0 NA +# Completed successfully at 2016-04-08 13:47:05 diff --git a/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/GensToVcfTest.scala b/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/GensToVcfTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..5d4bbc7d8b84434293c6128ef594023909047a62 --- /dev/null +++ b/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/GensToVcfTest.scala @@ -0,0 +1,48 @@ +package nl.lumc.sasc.biopet.tools + +import java.io.File +import java.nio.file.Paths + +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +/** + * Created by pjvan_thof on 4/11/16. + */ +class GensToVcfTest extends TestNGSuite with Matchers { + @Test + def testGensOnly(): Unit = { + val output = File.createTempFile("test.", ".vcf.gz") + output.deleteOnExit() + GensToVcf.main(Array( + "--inputGenotypes", GensToVcfTest.resourcePath("/test.gens"), + "--outputVcf", output.getAbsolutePath, + "--referenceFasta", GensToVcfTest.resourcePath("/fake_chrQ.fa"), + "--contig", "chrQ", + "--samplesFile", GensToVcfTest.resourcePath("/gens.samples") + )) + } + + @Test + def testGensInfo(): Unit = { + val output = File.createTempFile("test.", ".vcf") + output.deleteOnExit() + GensToVcf.main(Array( + "--inputGenotypes", GensToVcfTest.resourcePath("/test.gens"), + "--inputInfo", GensToVcfTest.resourcePath("/test.gens_info"), + "--outputVcf", output.getAbsolutePath, + "--referenceFasta", GensToVcfTest.resourcePath("/fake_chrQ.fa"), + "--contig", "chrQ", + "--samplesFile", GensToVcfTest.resourcePath("/gens.samples"), + "--sortInput" + )) + } + +} + +object GensToVcfTest { + private def resourcePath(p: String): String = { + Paths.get(getClass.getResource(p).toURI).toString + } +} diff --git a/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/SnptestToVcfTest.scala b/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/SnptestToVcfTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..e1506b043dcb9623fcc5a3367e53c9a18dff59bd --- /dev/null +++ b/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/SnptestToVcfTest.scala @@ -0,0 +1,43 @@ +package nl.lumc.sasc.biopet.tools + +import java.io.File +import java.nio.file.Paths + +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +/** + * Created by pjvan_thof on 4/11/16. + */ +class SnptestToVcfTest extends TestNGSuite with Matchers { + @Test + def testSnptest(): Unit = { + val output = File.createTempFile("test.", ".vcf.gz") + output.deleteOnExit() + SnptestToVcf.main(Array( + "--inputInfo", SnptestToVcfTest.resourcePath("/test.snptest"), + "--outputVcf", output.getAbsolutePath, + "--referenceFasta", SnptestToVcfTest.resourcePath("/fake_chrQ.fa"), + "--contig", "chrQ" + )) + } + + @Test + def testEmptySnptest(): Unit = { + val output = File.createTempFile("test.", ".vcf.gz") + output.deleteOnExit() + SnptestToVcf.main(Array( + "--inputInfo", SnptestToVcfTest.resourcePath("/test.empty.snptest"), + "--outputVcf", output.getAbsolutePath, + "--referenceFasta", SnptestToVcfTest.resourcePath("/fake_chrQ.fa"), + "--contig", "chrQ" + )) + } +} + +object SnptestToVcfTest { + private def resourcePath(p: String): String = { + Paths.get(getClass.getResource(p).toURI).toString + } +} diff --git a/public/biopet-utils/pom.xml b/public/biopet-utils/pom.xml index 722d1e6c629a51fd7f962d14dd5019e8e9201e06..cb80b9429d757389ab3936a99355606f035bb635 100644 --- a/public/biopet-utils/pom.xml +++ b/public/biopet-utils/pom.xml @@ -31,6 +31,11 @@ <packaging>jar</packaging> <dependencies> + <dependency> + <groupId>colt</groupId> + <artifactId>colt</artifactId> + <version>1.2.0</version> + </dependency> <dependency> <groupId>org.testng</groupId> <artifactId>testng</artifactId> diff --git a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala index 2540970887f7b920cd46e1ecf74bed46e39fb19a..df8068c0c7e28b57e3344d736aa7cc04abe2b8f2 100644 --- a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala +++ b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala @@ -18,8 +18,10 @@ package nl.lumc.sasc.biopet.utils import java.io.File import htsjdk.samtools.{ SamReader, SamReaderFactory } +import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList } import scala.collection.JavaConversions._ +import scala.collection.mutable import scala.collection.parallel.immutable /** @@ -57,39 +59,76 @@ object BamUtils { * @param end postion to stop scanning * @return Int with insertsize for this contig */ - def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int, samplingSize: Int = 100000): Option[Int] = { - val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam) - val samIterator = inputSam.query(contig, start, end, true) - val insertsizes: List[Int] = (for { - read <- samIterator.toStream.takeWhile(rec => { - val paired = rec.getReadPairedFlag && rec.getProperPairFlag - val bothMapped = if (paired) ((rec.getReadUnmappedFlag == false) && (rec.getMateUnmappedFlag == false)) else false - paired && bothMapped - }).take(samplingSize) - } yield { - read.getInferredInsertSize.asInstanceOf[Int].abs - })(collection.breakOut) - val cti = insertsizes.foldLeft((0.0, 0))((t, r) => (t._1 + r, t._2 + 1)) - - samIterator.close() - inputSam.close() - val ret = if (cti._2 == 0) None else Some((cti._1 / cti._2).toInt) - ret + def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int, + samplingSize: Int = 10000, + binSize: Int = 1000000): Option[Int] = { + + // create a bedList to devide the contig into multiple pieces + val insertSizesOnAllFragments = BedRecordList.fromList(Seq(BedRecord(contig, start, end))) + .scatter(binSize) + .allRecords.par.flatMap({ + bedRecord => + // for each scatter, open the bamfile for this specific region-query + val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam) + val samIterator = inputSam.query(bedRecord.chr, bedRecord.start, bedRecord.end, true) + + val counts: mutable.Map[Int, Int] = mutable.Map() + + for (i <- 0 until samplingSize if samIterator.hasNext) { + val rec = samIterator.next() + val isPaired = rec.getReadPairedFlag + val minQ10 = rec.getMappingQuality >= 10 + val pairOnSameContig = rec.getContig == rec.getMateReferenceName + + if (isPaired && minQ10 && pairOnSameContig) { + val insertSize = rec.getInferredInsertSize.abs + counts(insertSize) = counts.getOrElse(insertSize, 0) + 1 + } + } + + counts.keys.size match { + case 1 => Some(counts.keys.head) + case 0 => None + case _ => { + Some(counts.foldLeft(0)((old, observation) => { + observation match { + case (insertSize: Int, observations: Int) => { + (old + (insertSize * observations)) / (observations + 1) + } + case _ => 0 + } + })) + } + } + }) + + insertSizesOnAllFragments.size match { + case 1 => Some(insertSizesOnAllFragments.head) + case 0 => None + case _ => { + Some( + insertSizesOnAllFragments.foldLeft(0)((old, observation) => { + (old + observation) / 2 + })) + } + + } } /** * Estimate the insertsize for one single bamfile and return the insertsize * - * @param bamFile bamfile to estimate avg insertsize from + * @param bamFile bamfile to estimate average insertsize from * @return */ - def sampleBamInsertSize(bamFile: File, samplingSize: Int = 100000): Int = { + def sampleBamInsertSize(bamFile: File, samplingSize: Int = 10000, binSize: Int = 1000000): Int = { val inputSam: SamReader = SamReaderFactory.makeDefault.open(bamFile) - val baminsertsizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({ - contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize) + val bamInsertSizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({ + contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize, binSize) }).toList - val counts = baminsertsizes.flatMap(x => x) + val counts = bamInsertSizes.flatMap(x => x) + // avoid division by zero if (counts.size != 0) { counts.sum / counts.size } else { @@ -103,8 +142,8 @@ object BamUtils { * @param bamFiles input bam files * @return */ - def sampleBamsInsertSize(bamFiles: List[File], samplingSize: Int = 100000): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile => - bamFile -> sampleBamInsertSize(bamFile, samplingSize) + def sampleBamsInsertSize(bamFiles: List[File], samplingSize: Int = 10000, binSize: Int = 1000000): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile => + bamFile -> sampleBamInsertSize(bamFile, samplingSize, binSize) }.toMap } diff --git a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala index f0da112c99bea4bb016d13ce8b5b04edc691f430..01f5f37c34f02cddefa335dbdb1257ac1f16b3bf 100644 --- a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala +++ b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/ConfigUtils.scala @@ -319,7 +319,7 @@ object ConfigUtils extends Logging { val exist = valueExists(value) if (!exist) Logging.addError("Value does not exist but is required, key: " + value.requestIndex.key + - " module: " + value.requestIndex.module, + " namespace: " + value.requestIndex.module, if (value.requestIndex.path != Nil) " path: " + value.requestIndex.path.mkString("->") else null) exist } diff --git a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/config/Configurable.scala b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/config/Configurable.scala index bc0f207df80f003d7120d466a3eea7c0ffcc040c..786d1a1b02db257f7d28a74ab384f83c416ae63c 100644 --- a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/config/Configurable.scala +++ b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/config/Configurable.scala @@ -30,10 +30,10 @@ trait Configurable extends ImplicitConversions { def configPath: List[String] = if (root != null) root.configFullPath else Nil /** Gets name of module for config */ - def configName = getClass.getSimpleName.toLowerCase + def configNamespace = getClass.getSimpleName.toLowerCase /** ull path with module in there */ - def configFullPath: List[String] = configPath ::: configName :: Nil + def configFullPath: List[String] = configPath ::: configNamespace :: Nil /** Map to store defaults for config */ def defaults: Map[String, Any] = Map() @@ -69,7 +69,7 @@ trait Configurable extends ImplicitConversions { def getConfigPath(sample: String = null, library: String = null, submodule: String = null) = { (if (sample != null) "samples" :: sample :: Nil else Nil) ::: (if (library != null) "libraries" :: library :: Nil else Nil) ::: - (if (submodule != null) configPath ::: configName :: Nil else configPath) + (if (submodule != null) configPath ::: configNamespace :: Nil else configPath) } /** Class is used for retrieval of config values */ @@ -92,7 +92,7 @@ trait Configurable extends ImplicitConversions { * * @param key Name of value * @param default Default value if not found - * @param submodule Adds to the path + * @param namespace Adds to the path * @param freeVar Default true, if set false value must exist in module * @param sample Default null, when set path is prefixed with "samples" -> "sampleID" * @param library Default null, when set path is prefixed with "libraries" -> "libraryID" @@ -100,15 +100,15 @@ trait Configurable extends ImplicitConversions { */ def apply(key: String, default: Any = null, - submodule: String = null, + namespace: String = null, freeVar: Boolean = true, sample: String = null, library: String = null, path: List[String] = null): ConfigValue = { val s = if (sample != null || defaultSample.isEmpty) sample else defaultSample.get val l = if (library != null || defaultLibrary.isEmpty) library else defaultLibrary.get - val m = if (submodule != null) submodule else configName - val p = if (path == null) getConfigPath(s, l, submodule) ::: subPath else path + val m = if (namespace != null) namespace else configNamespace + val p = if (path == null) getConfigPath(s, l, namespace) ::: subPath else path val d = { val value = Config.getValueFromMap(internalDefaults, ConfigValueIndex(m, p, key, freeVar)) if (value.isDefined) value.get.value else default @@ -120,22 +120,22 @@ trait Configurable extends ImplicitConversions { /** * Check if value exist in config * @param key Name of value - * @param submodule Adds to the path + * @param namespace Adds to the path * @param freeVar Default true, if set false value must exist in module * @param sample Default null, when set path is prefixed with "samples" -> "sampleID" * @param library Default null, when set path is prefixed with "libraries" -> "libraryID" * @return true when value is found in config */ def contains(key: String, - submodule: String = null, + namespace: String = null, freeVar: Boolean = true, sample: String = null, library: String = null, path: List[String] = null) = { val s = if (sample != null || defaultSample.isEmpty) sample else defaultSample.get val l = if (library != null || defaultLibrary.isEmpty) library else defaultLibrary.get - val m = if (submodule != null) submodule else configName - val p = if (path == null) getConfigPath(s, l, submodule) ::: subPath else path + val m = if (namespace != null) namespace else configNamespace + val p = if (path == null) getConfigPath(s, l, namespace) ::: subPath else path globalConfig.contains(m, p, key, freeVar, internalFixedValues) || Config.getValueFromMap(internalDefaults, ConfigValueIndex(m, p, key, freeVar)).isDefined } diff --git a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/rscript/Rscript.scala b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/rscript/Rscript.scala index aad5042bd7ad944e5daa2ce2ecb79006807b7bcc..0901d6c8022c74c0b5adcb9f0c43f6e1529cc29e 100644 --- a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/rscript/Rscript.scala +++ b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/rscript/Rscript.scala @@ -30,7 +30,7 @@ import scala.sys.process.{ Process, ProcessLogger } trait Rscript extends Configurable { protected var script: File - def rscriptExecutable: String = config("exe", default = "Rscript", submodule = "rscript") + def rscriptExecutable: String = config("exe", default = "Rscript", namespace = "rscript") /** This is the defaul implementation, to add arguments override this */ def cmd: Seq[String] = Seq(rscriptExecutable, script.getAbsolutePath) diff --git a/public/biopet-utils/src/test/scala/nl/lumc/sasc/biopet/utils/config/ConfigurableTest.scala b/public/biopet-utils/src/test/scala/nl/lumc/sasc/biopet/utils/config/ConfigurableTest.scala index 37e851cf1572848a648bcb45ff5b7a51edeb0a68..a252e65366bc60952e0244d8574b4008ce6d3f1a 100644 --- a/public/biopet-utils/src/test/scala/nl/lumc/sasc/biopet/utils/config/ConfigurableTest.scala +++ b/public/biopet-utils/src/test/scala/nl/lumc/sasc/biopet/utils/config/ConfigurableTest.scala @@ -29,11 +29,11 @@ class ConfigurableTest extends TestNGSuite with Matchers { abstract class Cfg extends Configurable { def get(key: String, default: String = null, - submodule: String = null, + configNamespace: String = null, freeVar: Boolean = true, sample: String = null, library: String = null) = { - config(key, default, submodule, freeVar = freeVar, sample = sample, library = library) + config(key, default, configNamespace, freeVar = freeVar, sample = sample, library = library) } } @@ -52,7 +52,7 @@ class ConfigurableTest extends TestNGSuite with Matchers { @Test def testConfigurable(): Unit = { val classC = new ClassC { - override def configName = "classc" + override def configNamespace = "classc" override val globalConfig = new Config(ConfigurableTest.map) override val fixedValues = Map("fixed" -> "fixed") } diff --git a/public/carp/src/test/scala/nl/lumc/sasc/biopet/pipelines/carp/CarpTest.scala b/public/carp/src/test/scala/nl/lumc/sasc/biopet/pipelines/carp/CarpTest.scala index da5d79939fd3729fb8838ab26bbe1a2d2d2bfb00..77345a0858f4ae2a5926b497c8df89114ffb64a5 100644 --- a/public/carp/src/test/scala/nl/lumc/sasc/biopet/pipelines/carp/CarpTest.scala +++ b/public/carp/src/test/scala/nl/lumc/sasc/biopet/pipelines/carp/CarpTest.scala @@ -37,7 +37,7 @@ import org.testng.annotations.{ AfterClass, DataProvider, Test } class CarpTest extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): Carp = { new Carp() { - override def configName = "carp" + override def configNamespace = "carp" override def globalConfig = new Config(map) qSettings = new QSettings qSettings.runName = "test" diff --git a/public/flexiprep/src/main/resources/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepReadSummary.ssp b/public/flexiprep/src/main/resources/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepReadSummary.ssp index 182760305b9b8c0df8f1c133f21adbaf63efabf7..fc8b8ce55816d6b45b9937157ca6ae048e8f7a3c 100644 --- a/public/flexiprep/src/main/resources/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepReadSummary.ssp +++ b/public/flexiprep/src/main/resources/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepReadSummary.ssp @@ -144,12 +144,7 @@ val afterTotal = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "seqstat_" + read + "_qc", "reads", "num_total") val clippingDiscardedToShort = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "clipping_" + read, "num_reads_discarded_too_short").getOrElse(0).toString.toLong val clippingDiscardedToLong = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "clipping_" + read, "num_reads_discarded_too_long").getOrElse(0).toString.toLong - var trimmingDiscarded: Long = 0 - if(paired == true) { - trimmingDiscarded = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "trimming_" + read, "num_reads_discarded_" + read).getOrElse(0).toString.toLong - } else { - trimmingDiscarded = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "trimming_" + read, "num_reads_discarded_total").getOrElse(0).toString.toLong - } + var trimmingDiscarded = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "trimming_" + read, "num_reads_discarded_total").getOrElse(0).toString.toLong }# <td>${read}</td> <td>${beforeTotal}</td> diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala index ab87244ee6329c0163d91660df6e8f57f866949a..c1a04ed85efe48a27fff3fe14cb789e98d3504a7 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala @@ -204,7 +204,7 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with fqSync.outputStats = new File(outDir, s"${sampleId.getOrElse("x")}-${libId.getOrElse("x")}.sync.stats") val pipe = new BiopetFifoPipe(this, fqSync :: Nil) { - override def configName = "qc-cmd" + override def configNamespace = "qc-cmd" override def beforeGraph(): Unit = { fqSync.beforeGraph() diff --git a/public/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepTest.scala b/public/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepTest.scala index 1f00c07a01e073d29e63c21e246d1c3c2dbc69d0..2af324fad3d2815be4c51d191e4b849692c81945 100644 --- a/public/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepTest.scala +++ b/public/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepTest.scala @@ -36,7 +36,7 @@ class FlexiprepTest extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): Flexiprep = { new Flexiprep() { - override def configName = "flexiprep" + override def configNamespace = "flexiprep" override def globalConfig = new Config(map) qSettings = new QSettings qSettings.runName = "test" diff --git a/public/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsSingleTest.scala b/public/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsSingleTest.scala index 49e2eb4ec48baca659afe4559a9943507bf16aab..ae1f8d7400da3ecd24934cc73fef362f0c278cf4 100644 --- a/public/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsSingleTest.scala +++ b/public/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsSingleTest.scala @@ -39,7 +39,7 @@ import org.testng.annotations._ class GearsSingleTest extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): GearsSingle = { new GearsSingle { - override def configName = "gearssingle" + override def configNamespace = "gearssingle" override def globalConfig = new Config(map) diff --git a/public/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsTest.scala b/public/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsTest.scala index dddd88458379feb532e02dd4b7990b5a63e4d201..cda46755da89d7af2679a77f4e23edb142567fcb 100644 --- a/public/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsTest.scala +++ b/public/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsTest.scala @@ -32,7 +32,7 @@ import org.testng.annotations.{ DataProvider, Test, AfterClass } class GearsTest extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): Gears = { new Gears { - override def configName = "gears" + override def configNamespace = "gears" override def globalConfig = new Config(map) diff --git a/public/gentrap/src/test/scala/nl/lumc/sasc/biopet/pipelines/gentrap/GentrapTest.scala b/public/gentrap/src/test/scala/nl/lumc/sasc/biopet/pipelines/gentrap/GentrapTest.scala index 81e97ab6a22aa7ea93e7dec4769219540ef60dff..73a25ef561cb2363f7b4e930b8e0fb5187a8827a 100644 --- a/public/gentrap/src/test/scala/nl/lumc/sasc/biopet/pipelines/gentrap/GentrapTest.scala +++ b/public/gentrap/src/test/scala/nl/lumc/sasc/biopet/pipelines/gentrap/GentrapTest.scala @@ -31,7 +31,7 @@ abstract class GentrapTestAbstract(val expressionMeasure: String) extends TestNG def initPipeline(map: Map[String, Any]): Gentrap = { new Gentrap() { - override def configName = "gentrap" + override def configNamespace = "gentrap" override def globalConfig = new Config(map) // disable dict file check since it is based on the reference file name (which we can't modify here since // we use the mock /usr/bin/test file diff --git a/public/gwas-test/pom.xml b/public/gwas-test/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..619df649c57eb393b9e29e4873e4019e67003f93 --- /dev/null +++ b/public/gwas-test/pom.xml @@ -0,0 +1,62 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + + Biopet is built on top of GATK Queue for building bioinformatic + pipelines. It is mainly intended to support LUMC SHARK cluster which is running + SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + should also be able to execute Biopet tools and pipelines. + + Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + + Contact us at: sasc@lumc.nl + + A dual licensing mode is applied. The source code within this project that are + not part of GATK Queue is freely available for non-commercial use under an AGPL + license; For commercial users or users who do not want to follow the AGPL + license, please contact us to obtain a separate license. + +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <artifactId>GwasTest</artifactId> + <packaging>jar</packaging> + + <parent> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Biopet</artifactId> + <version>0.7.0-SNAPSHOT</version> + <relativePath>../</relativePath> + </parent> + + <inceptionYear>2014</inceptionYear> + <name>GwasTest</name> + + <dependencies> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>BiopetCore</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>BiopetToolsExtensions</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.testng</groupId> + <artifactId>testng</artifactId> + <version>6.8</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.scalatest</groupId> + <artifactId>scalatest_2.10</artifactId> + <version>2.2.1</version> + <scope>test</scope> + </dependency> + </dependencies> + +</project> \ No newline at end of file diff --git a/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala b/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..afc6bbdc3ba737db63f5c4270009de0a60b8deaa --- /dev/null +++ b/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala @@ -0,0 +1,143 @@ +package nl.lumc.sasc.biopet.pipelines.gwastest + +import java.io.File +import java.util + +import htsjdk.samtools.reference.FastaSequenceFile +import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand, Reference } +import nl.lumc.sasc.biopet.extensions.Snptest +import nl.lumc.sasc.biopet.extensions.gatk.{ CatVariants, SelectVariants } +import nl.lumc.sasc.biopet.extensions.tools.{ GensToVcf, SnptestToVcf } +import nl.lumc.sasc.biopet.pipelines.gwastest.impute.ImputeOutput +import nl.lumc.sasc.biopet.utils.Logging +import nl.lumc.sasc.biopet.utils.config.Configurable +import nl.lumc.sasc.biopet.utils.intervals.BedRecordList +import org.broadinstitute.gatk.queue.QScript + +import scala.collection.JavaConversions._ + +/** + * Created by pjvanthof on 16/03/16. + */ +class GwasTest(val root: Configurable) extends QScript with BiopetQScript with Reference { + def this() = this(null) + + import GwasTest._ + + val inputVcf: Option[File] = config("input_vcf") + + val phenotypeFile: File = config("phenotype_file") + + val specsFile: Option[File] = config("imute_specs_file") + + val inputGens: List[GensInput] = if (inputVcf.isDefined) List[GensInput]() + else config("input_gens", default = Nil).asList.map { + case value: Map[String, Any] => + GensInput(new File(value("genotypes").toString), + value.get("info").map(x => new File(x.toString)), + value("contig").toString) + case value: util.LinkedHashMap[String, _] => + GensInput(new File(value.get("genotypes").toString), + value.toMap.get("info").map(x => new File(x.toString)), + value.get("contig").toString) + case _ => throw new IllegalArgumentException + } ++ (specsFile match { + case Some(file) => imputeSpecsToGensInput(file, config("validate_specs", default = true)) + case _ => Nil + }) + + override def dictRequired = true + + override def defaults = Map("snptest" -> Map("genotype_field" -> "GP")) + + /** Init for pipeline */ + def init(): Unit = { + inputGens.foreach { g => + val referenceDict = new FastaSequenceFile(referenceFasta(), true).getSequenceDictionary + if (referenceDict.getSequenceIndex(g.contig) == -1) + Logging.addError(s"Contig '${g.contig}' does not exist on reference: ${referenceFasta()}") + } + } + + /** Pipeline itself */ + def biopetScript(): Unit = { + val (vcfFile, chrVcfFiles): (File, Map[String, File]) = inputVcf.map((_, Map[String, File]())).getOrElse { + require(inputGens.nonEmpty, "No vcf file or gens files defined in config") + val outputDirGens = new File(outputDir, "gens_to_vcf") + val cvTotal = new CatVariants(this) + cvTotal.assumeSorted = true + cvTotal.outputFile = new File(outputDirGens, "merge.gens.vcf.gz") + val chrGens = inputGens.groupBy(_.contig).map { + case (contig, gens) => + val cvChr = new CatVariants(this) + cvChr.assumeSorted = true + //cvChr.isIntermediate = true + cvChr.outputFile = new File(outputDirGens, s"$contig.merge.gens.vcf.gz") + gens.zipWithIndex.foreach { gen => + val gensToVcf = new GensToVcf(this) + gensToVcf.inputGens = gen._1.genotypes + gensToVcf.inputInfo = gen._1.info + gensToVcf.contig = gen._1.contig + gensToVcf.samplesFile = phenotypeFile + gensToVcf.outputVcf = new File(outputDirGens, gen._1.genotypes.getName + s".${gen._2}.vcf.gz") + gensToVcf.isIntermediate = true + add(gensToVcf) + cvChr.inputFiles :+= gensToVcf.outputVcf + } + add(cvChr) + cvTotal.inputFiles :+= cvChr.outputFile + contig -> cvChr.outputFile + } + add(cvTotal) + (cvTotal.outputFile, chrGens) + } + + val snpTests = BedRecordList.fromReference(referenceFasta()) + .scatter(config("bin_size", default = 1000000)) + .allRecords.map { region => + val name = s"${region.chr}-${region.start + 1}-${region.end}" + + val regionDir = new File(outputDir, "snptest" + File.separator + region.chr) + val bedDir = new File(outputDir, ".queue" + File.separator + "regions" + File.separator + region.chr) + bedDir.mkdirs() + val bedFile = new File(bedDir, s"$name.bed") + BedRecordList.fromList(List(region)).writeToFile(bedFile) + bedFile.deleteOnExit() + + val sv = new SelectVariants(this) + sv.inputFiles :+= chrVcfFiles.getOrElse(region.chr, vcfFile) + sv.outputFile = new File(regionDir, s"$name.vcf.gz") + sv.intervals :+= bedFile + sv.isIntermediate = true + add(sv) + + val snptest = new Snptest(this) + snptest.inputGenotypes :+= sv.outputFile + snptest.inputSampleFiles :+= phenotypeFile + snptest.outputFile = Some(new File(regionDir, s"$name.snptest")) + add(snptest) + + val snptestToVcf = new SnptestToVcf(this) + snptestToVcf.inputInfo = snptest.outputFile.get + snptestToVcf.outputVcf = new File(regionDir, s"$name.snptest.vcf.gz") + snptestToVcf.contig = region.chr + add(snptestToVcf) + + region -> snptestToVcf.outputVcf + } + + val cv = new CatVariants(this) + cv.inputFiles = snpTests.map(_._2).toList + cv.outputFile = new File(outputDir, "snptest" + File.separator + "snptest.vcf.gz") + add(cv) + } +} + +object GwasTest extends PipelineCommand { + case class GensInput(genotypes: File, info: Option[File], contig: String) + + def imputeSpecsToGensInput(specsFile: File, validate: Boolean = true): List[GensInput] = { + ImputeOutput.readSpecsFile(specsFile, validate) + .map(x => GensInput(x.gens, Some(x.gensInfo), x.chromosome)) + } +} \ No newline at end of file diff --git a/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/ImputeOutput.scala b/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/ImputeOutput.scala new file mode 100644 index 0000000000000000000000000000000000000000..41574f0cd0aaf2dd0501ab98d5273148f968dd90 --- /dev/null +++ b/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/ImputeOutput.scala @@ -0,0 +1,80 @@ +package nl.lumc.sasc.biopet.pipelines.gwastest.impute + +import java.io.File + +import nl.lumc.sasc.biopet.utils.Logging + +import Spec.SpecDecoderOps + +import scala.io.Source + +/** + * Created by pjvan_thof on 3/25/16. + */ +object ImputeOutput { + + case class Chunk(chromosome: String, summary: File, warnings: File, + gens: File, gensInfo: File, gensInfoBySample: File) + + def expandChunk(chromosome: String, basename: String) = Chunk( + chromosome, + new File(basename + ".gens_summary"), + new File(basename + ".gens_warnings"), + new File(basename + ".gens"), + new File(basename + ".gens_info"), + new File(basename + ".gens_info_by_sample") + ) + + def readSpecsFile(specsFile: File, validate: Boolean = true): List[Chunk] = { + val content = Source.fromFile(specsFile).mkString + val chunks = content.decode[List[Spec.ImputeOutput]] + .map(x => expandChunk(x.chromosome, specsFile.getParent + File.separator + x.name.split(File.separator).last)) + chunks.flatMap(validateChunk(_, validate)) + } + + val ASSESSMENT_HEADER = "-{32}\\n Imputation accuracy assessment \\n-{32}".r + val TOO_FEW_SNPS = "There are no SNPs in the imputation interval, so " + + "there is nothing for IMPUTE2 to analyze; the program will quit now." + val NO_TYPE_2 = "ERROR: There are no type 2 SNPs after applying the command-line settings for this run, which makes it impossible to perform imputation. One possible reason is that you have specified an analysis interval (-int) that contains reference panel SNPs but not inference panel SNPs -- e.g., this can happen at the ends of chromosomes. Another possibility is that your genotypes and the reference panel are mapped to different genome builds, which can lead the same SNPs to be assigned different positions in different panels. If you need help fixing this error, please contact the authors." + val NO_ANALYSIS = "Your current command-line settings imply that there will not be any SNPs in the output file, so IMPUTE2 will not perform any analysis or print output files." + val CORRECT_LOG = " Imputation accuracy assessment " + + def validateChunk(chunk: Chunk, raiseErrors: Boolean = true): Option[Chunk] = { + + def addError(msg: String) = if (raiseErrors) Logging.addError(msg) else Logging.logger.warn(msg) + + if (!chunk.summary.exists()) { + Logging.addError(s"Summary file '${chunk.summary}' does not exist, please check Impute output") + None + } else if (!chunk.warnings.exists()) { + addError(s"Warnings file '${chunk.warnings}' does not exist, please check Impute output") + None + } else if (chunk.summary.canRead()) { + val summaryReader = Source.fromFile(chunk.summary) + val summaryLines = summaryReader.getLines().toList + summaryReader.close() + if (summaryLines.contains(NO_ANALYSIS)) None + else if (summaryLines.contains(TOO_FEW_SNPS)) None + else if (summaryLines.contains(NO_TYPE_2)) { + Logging.logger.warn(s"No Type 2 SNPs found, skipping this chunk: '${chunk.summary}'") + None + } else if (summaryLines.exists(ASSESSMENT_HEADER.findFirstIn(_).isDefined)) None + else if (!chunk.gens.exists()) { + addError(s"Gens file '${chunk.gens}' does not exist, please check Impute output") + None + } else if (!chunk.gensInfo.exists()) { + addError(s"GensInfo file '${chunk.gensInfo}' does not exist, please check Impute output") + None + } else if (!chunk.gensInfoBySample.exists()) { + addError(s"GensInfoBySample file '${chunk.gensInfoBySample}' does not exist, please check Impute output") + None + } else { + if (!summaryLines.contains(CORRECT_LOG)) { + Logging.logger.warn(s"Impute says it did not run but the gens files are there, pipeline will still continue") + Logging.logger.warn(s" Please check: ${chunk.summary}") + } + Some(chunk) + } + } else None + } +} diff --git a/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Spec.scala b/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Spec.scala new file mode 100644 index 0000000000000000000000000000000000000000..e8ea1e49b537b4794473d7490610e75d831d355e --- /dev/null +++ b/public/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Spec.scala @@ -0,0 +1,64 @@ +package nl.lumc.sasc.biopet.pipelines.gwastest.impute + +import scala.util.parsing.combinator.JavaTokenParsers + +/** + * .spec file parser combinator. + * + * TODO: clean up. + * + * @author Matthijs Moed <M.H.Moed@lumc.nl> + */ +object Spec extends JavaTokenParsers { + def obj: Parser[Map[String, Any]] = + "{" ~> rep(member) <~ "}" ^^ (Map() ++ _) + + def arr: Parser[List[Any]] = "(" ~> repsep(value, ",") <~ ")" + + def key: Parser[String] = """([\w+])*+""".r + + def member: Parser[(String, Any)] = key ~ "=" ~ value <~ literal(";") ^^ { + case name ~ "=" ~ value => (name, value) + } + + // This is a hack to get around the fact that JavaTokenParsers' + // stringLiterals do not have their double quotes removed. They're stripped + // here, without any error checking. + // TODO: implement as token parser + + def string: Parser[String] = stringLiteral ^^ { + case s => s.substring(1, s.length - 1) + } + + def value: Parser[Any] = ( + obj + | arr + | string + | wholeNumber ^^ (_.toInt) + | floatingPointNumber ^^ (_.toDouble) + ) + + // + + trait SpecDecoder[T] { + def decode(obj: Map[String, Any]): T + } + + case class ImputeOutput(chromosome: String, name: String, orig: String) + + class ImputeOutputMappingDecoder extends SpecDecoder[List[ImputeOutput]] { + override def decode(obj: Map[String, Any]): List[ImputeOutput] = { + val dicts = obj("files").asInstanceOf[List[Map[String, String]]] + dicts.map(d => ImputeOutput(d("chromosome"), d("name"), d("orig"))) + } + } + + implicit val imputeOutputMappingDecoder = new ImputeOutputMappingDecoder + + implicit class SpecDecoderOps(string: String) { + implicit def decode[T](implicit decoder: SpecDecoder[T]): T = { + decoder.decode(Spec.parseAll(Spec.obj, string).get) + } + } +} + diff --git a/public/gwas-test/src/test/resources/fake_chrQ.dict b/public/gwas-test/src/test/resources/fake_chrQ.dict new file mode 100644 index 0000000000000000000000000000000000000000..e2b0e2af7579a994afedb68a5c495ba794a445df --- /dev/null +++ b/public/gwas-test/src/test/resources/fake_chrQ.dict @@ -0,0 +1,2 @@ +@HD VN:1.4 SO:unsorted +@SQ SN:chrQ LN:16571 M5:94445ec460a68206ae9781f71697d3db UR:file:/home/ahbbollen/fake_chrQ.fa diff --git a/public/gwas-test/src/test/resources/fake_chrQ.fa b/public/gwas-test/src/test/resources/fake_chrQ.fa new file mode 100644 index 0000000000000000000000000000000000000000..171b3737bd458bba03d7c457b6ba51e5ef4f774f --- /dev/null +++ b/public/gwas-test/src/test/resources/fake_chrQ.fa @@ -0,0 +1,2 @@ +>chrQ +TCGGCATCTAGAAACTCCAAGTCTGCAGATCTATAACACACGAGGGTATTCAGCCCTAAAGCATTGCGTCTAGGCATGGGTCTTTAATCTCGCCCCGGCAACCTCTACGCGAGCTCCTACCAGTCAACGTGATTGATCCCTCGTACCTAGCATATTCCACGGCGTTTCATAGCCTTGACGGGCACCTTATCCATTTTTACCTAGTTTGACATTAATTGCTTCAAGACTACTGCGGTACGGGTCCCTGCATCCAGAAAGGATGACAAAAAGTGTGACAGTGCGTTCAATCCATAGATTCCCGTCGGTGCGAGGGCTGAAGTTGAGCCCTACAACCCGGCGATAAAAATTGTAACCCTATGCATTGCGTCAAGTTAAGACCCTCCATCTTCGCCACGCGATAGGAATTTGAAACATCGGCGTGCTTAGAAAGGCTATTAGCGCGGGTGGCCTGATCGGTATCCGATAAAGGCGTTGCCACAAATTTTGCCGATTGACCTGCAATTACATCGGGTAAGTGCATTTCCGCCGTTAGAGTATTAAGTCATGACGTTTTCGACATGGGCATTTAATTGGCCCAAGGTGGAATATTCCTGATGTGCCTAAGTATGAGCTCCCCATGAGAATTTGAAGATCATGAAGACTTGAGTCAATTCTAAATGTAGTTACCATCACTTAAGTTCGTTCGCCTTCTGGGCAGGCTATGTCTAACGAGCGACGTCCGCAGTGTGACAATCTAGATATTGGTTGAGCAGGAATACCAAGGAGGTGCTTGAAGTTTCTCTTATGGAACCAACTTCAATCAATAGGTAGTCTCCGTTTCGTTTCCACTGAGACATTTGTGCAGTGGCACAGTATGTGGTCGTAAGCGCATGGTGTTGTTGGAACGAAGACTCTCACTTTTGTTTCCTTTGGTAGTTTAATTTAGCAGTATCCTGGTCGTTGACCAATTTGTGAATCTCCACGGTCTCTCTTTATCAGTCATACCTCTTAAACCGGCTTCCGCTCCTGCGCACATTTATATCCTACTTTCCGTCAATGTGAAAGAGAGATCAACATAATTTCGCGCACGATCTGTCCCATTTCAGACGCCATCGACGGGTCCCGCCTTCACAAAAAACCTGCCACCATGCAGTAGGTACCTCCATATGCTGAGCGTCCGTTACCGGAAGGGTTAAGTCACTGTTTAAAGGATAAGGCGAGGTTTCCCTTGGTGGTGTACTACTGCTCACGTTCCTGCTTTACTCTCCGAGTATTGTTTTAAAAGTGGGCACGCTGGAACGCAGCACCCTATAAGAAGACTCAGTTAGGTCCCCTTCACGAGCATGTTTGAGCTCTCGGACCTAAGACGTCCGAACTCGGTTACGACGGTTAAAGACAAAGCTCCCAGCATGGTATCAGAAGCCACCACGCGTGAGGATGGCGGGGCCCTGGGCACATTGGCCCCGAATTCTTCGCCCACTTAAGTCGGACTCACCACGGGAGACTACGCCCTAACCGGATAATATCTTTTAGATACCATACAGGGTGCAACCATCGCGGCGTCAGCTGAATGTGAGGACAGAGCCCCACACACAGCACTAACAGATCCATGATTTTACTTCGTTTGGGCGACGTGCGAGCCTATTCGGCCTGTCGGACTTTGTGTAGATTCGGGATTTGACCTAGCGTATAGGCCTTTGGTATACTCGAATTATCAGGGTCAAACTACCCTCAAGGAATCCTATTTCAGACGGACGTTTGCCACCCCGGAACGTTGTCGGATCGCTTCTTGCCGCAGGAGATTCATGGAGTGATAATCGCTGGACTCACAAGTGTCAGCGGACTTTCGGTGTCTTGTGGCCTACTTGCAGTGAACACCACCAAACGAGTAGTATTATGGATTGCCGGCGTGTGTTTGTGGCCATGATTGGTTGATGCGACGGCCTAGATTCTCCTACGGGAATCTACCAGGCCCAAAAGCGCTGACTTTTATGTATTTGAGGGGCCGAAATTACATAGTAACCCAGAACAAATACCCGTTAGTTATAAAGTGAGCGCATAAGTTTGGTCGATCCGGCAGTCGAACCATTGCGGTCGGACATATCCGCAGTAGTACACTAAGGCGGAATAGACTGCCGAGTCAACGCTCCCTCATTCTTGCTACCTTAGATCTCGCAGGTTCGACCATTGCTGAAGCCGCTGAATTACACGAGTTGTTTTGTTAACCCCCGGAATGTAGTTCGTACGCCTCAACTGATTCTTCAAAAGCTCACTGCACGTGACTTGTCATGTGTTCCTAAAACATACCTCATCTGTGGGTCTGGTCCCATAAGCATGGAATGTCCGTCGACGCAACATGGAAACCCACTCGCTCGCTATACGTTTATGGTGAGACAGAAACACACTGTATTAGACTGCCACTGATAGCCCCAGTAGCAAGGTGATGTGGCAGGCATGGTACCCAAACGTCTATCGTTTTGGGAACCAAGGGGAGTGCTAATAGGTCCGGCCACGTAGAATGACATAACCTCCAGAGGAGCGCAGGAGTTGATGCATTAAAAGATCCATACTAAACGTTAGCTTAATGCCTTCAAGGCACCAGCTACCTCCATGACAAGGAGATTTCGGAAGGGGTAAGATTTACTTCTGTCCCAAAAGGGTAATGACCCGTAGGGATGGAATCATTGATGAACTCTAAAGGGACTCAGCCGACTAGCCGAGAGGGCTGGACGATCATTTGATGGGAGAATACGCATACATCTAAGTGTCAAGTATTAAATCGGTAATCTCCGTAAAGGCGTGAAGTTCACAGGGCGCAGTTTCCAGTTACTCCAAGAAACTACCGGTTCAGTTATCGCTTCCGGTGCCTTCACAGCAAACATCATTCGCTCAAGAAAGTGCTGTCGCTGCGTGTGGATATTTCCCCCTCGTTCTGGAAAGTGCTCTCAGCGACGCACGTAAACATGCTGGCGACGAGAGAGGCGTCAACACGGTCCGTTACCAAACTGCGGCATTTACCACGAACCTGATTGCAAAGTGAGATTTCCGTAAGGAGGTTAGCCAAATATTACGTAGAGTGTTCCACACCAAATCCGTCGTCCACATTCGCGACGGCAGTCTAGACGTGTAATTCCCCGGATAATCCAGTTACTACATGCTGATGCAGTCATAGTGCACGCAAATGCGCAACTTAACAAGCACGACCTGAAACAGAGAACCCCTGTGTAGTCAATATAGGATGACGGACACACACACTTGCTGCTGCAATCTTACATTCTGCGAACGAGTGCAAAGTTGAAATCATGACGAACAGCCTTGCTTTTCAGAGTCTCTATCGAACTCCTTTACACCTCCATATCTACTTGCAAATCACACTAGAGGGGCGCAGCTTACTCACTGAGAGATGGTCTACCTAATCGATTTTCGGTGAACTTTGAGTACAGCATTGAGTCTGGAGGGTTCCACTACTTTATCGTACCGGTCCGACATGATTTCTTATCGAATAGATGTTGAGATGGACATTAATAAGCATAGTACGTCTCGATCGATGGCTACCTTTACGTCTATGAGTGCTTACATAAGGTCTCTCGTAAGTCATGGTCCCGCGGGGCTCGCGCAACATTGTGGATTAATGACTCCAGTGACGCATGTTCGATTCGCATGAAGTAGGTGGCGCGTATTCATACATGAATAGTAGGCAGAACGAGCACATTGGACCGATCTTGGAGGTTGGGCTTGAGGTCCCGCACTGATAGTTTACGGCCATGAAGACGACAATTGTCAATACTTCTCTATCCTGAGCGAATGCTGACGGGGGCCAGGCGGAAAAGTGCGACACAGTCTACTGATTACGTGTAGTACGTGGCTAAGCATATTCCCGGGTTCGCAAACATAGGTCTCTGATGGGGTATGGGTAAGAAATCTGAAGGTTGCGTCTCTCACCACGGTCAGGATACCGAATCAGCTCATAAGAGTTACAAACGCGCAAATGAAGGCCTAGTCCACAGGGGTGTCATTCGCACGAGCTGGGCTTAGAATCACTGTTTTCGGTCGCACCTGTAGGGGGACATGGGACGGATTTCACAACAAAAAAGCATCTCTCATGATTCGATTCAGATAGAGGAGAGGAGGTAAATGCCAACAAATCTATTCTTTAGTACCGCCTGACGGAACGTATTTAATCCTCGCCTCAGATCGACACCGCAGGGTAGCTGAAGACGTCCGTTCTTAGACATTTAGTCGATCATTTGTTATGAAACAGAACTAGGAGTCCGTGCCCTTCAGGCCGGCTCAGGGGCACCTACCTCCAGATCGCCCAGGTTGGGTTTATTAGGCGCCGAAAAGTTACTGCCCTATCAGCCCCTCCAATCCGACCTACGGACATCATCCCACTGGCTCGCAAAATATAAATTGCGGATGGGAAAGGATAAGGAAAATCATTACCTACACAGAAGGACAATGTCAGTTCCAAATAACACTGATACTTTCGGAGCAACTTGGTCCGGAAATGTAAGTACGACTATAGCCCTTTCGACCAACGCCGACAGTCCTATTTGGACGCCGAGAGAGGCGACGGGTAGCCGAATGTAAAGCTCTCGGGTCGCTCTTGGCGGAATGCGCTGCGGGTCCTACCCTAAACCCTTACCACCACCAACTTCGTTAGGAGCCGTATAGATTACAGCTCCCGCAAAATTAGAGAGGAATCTGAGTTATTAGCTGAGGACCCCGCATTTTCTGCGACGGCGTAGCTGCAGTGACGTACGATATGAGTTCCCGACTGTGAGGGAGTCCCAGTCGTGACTCCCTACAACGGCTCCAGATATTGTTACTTATGGTCAATATGCCCCGACCGCCCATTGTCTCGAGTACAGTCTTCCCCAAAGTTAAGCTGTGCATTACCTTACCGTTTTAGGTCCAGCTGGTAGCACCGAATGCTGCGCAATCCGAGCCCCCGAAATAGACTACGTGTCCACGGTCAATTGTCATGGGTAGCAGAGCTCAAAGAGGAGAAACGTGCCCCGTAAACCTATTAGATCTCGGTTGATAAATATCAGGCCACAGCAGGCTGCCCGATGCTTGTTTGAACAACAACTTCGGGAGCCGCGGTCCTTGGTTCTCCCGATATTCGGCCGCACCGAACGGTACGCGTCATCGCGAGGTGCGTTCTCGCAGCAAGAAATATTTGTTGTTGTTGTCTTCCTTCCGCATAGGAAACCTTAAGCGGTACCTTTCTACGAAGTTGAACCCTAGAAGCACGTGTAACAATTTTTTTTACGCTACACCCGGATCTGCTTCCATCTGTTGATCATATGAGCCTAATGTGACTAATCTGTGCCGTCGATTGAAAATTCGTTCTGAACCTAATCACATGAATTAAAATTAGGGCGAGAATTGGCTCCTTTTGGGCCGTAATCCTTCAAAGGGTTAACCGAATTTAGCCTCCACGGTGACACAAACTCCCATAGGTAAGGCAAACCCAATAACGAGGAAGCCTTGCCCACAGCATGTTTGATAAATACCCTTAGGGTAATATCGCGTGCAATACTGAAGCCGCTCTTCTAGCATCCGTGTTTGACATACTATGACCTTGAAGCCTGCCGCAGCTTCTAGGTCATCCAAGTAGATCAAAACGCCATGTTGTGGATCCATGCATCTTCCCAGTGAACATGGATCTTAGTGTGACAGGCGAGGAGCGGCGAACACTATCGGTGTGGCAAGCTCGGGCCTTCGTACGTTGTGGAAGTATGCGAATAAGGAGACCGTAATGTATCAAGTTCTTAAGAGCCTTGGTACCGTTGCAATTCGGCATGTTCCTACAGAGACACTCCGTGTTTGTCATCCGTCATAGATCTATGGCGTAGTTAGCGCCTCTGAAGTAGTTGTCCATTCAGCAGGCATTGCTTAGGGAGTTTCTGGCGCTTGCCGCTCAAGATGCTCACGGGCCTAAGTAGCACGGCAACCTTTTGACAAAGCATTTTATAAACTGAGCATATTGGCCCGAAACTAATCCAGCAAAGGGTGAAGACCTGTCAGCGGGCCCAGAGTGTGAACGGTCTACTGCGCGGTACATAAGTGGCGTAATCCATCAACAAGACCTACACGACCTGAATGATTTCCAACAACTTTATATGCTTTTCCGCATCTCGAGAGTACCGGAATCTATGCAATCTCCCAAGGATCCGTAGATTTGAAATTCAATCCGACGGGGTAAGGTTGCCGCGCCGGTTAGCTAATGTGCGGATTTATAGTCTTTTTCCCAGAAGGCGTAGTTAGTTTCGCACCTAACTACGACACATACTTGGGTCGACTGTTGAAGGTGGTAAGTTGCGAGCAGTCCGCCGCTCTCACGCGCCGAACCACGTTCATATCGGCAAAGTTGCGCGATGACCTATAGGTGTGCAAAGCTCGTCCGACATTGGGATTGGATTCACGTACATACGTTAGTATCATGGGTAAGCTTCCATGTCAGCCTCGTGTATAGCACCGGTGCGCCGCGCGTTAAGGATTCTATGCCCAGCAAATGTGCCAACGTTGTGGGGAGAAAAGTGTAGTTGGATGCGATCGTGACATCGGCACACCGAAACTCTGCAGCCAGTCCCGCTAATCTCATTGGCACCGGGTAAGAGATTACCTTTGGTTAGGAATCGCGTGCGACGTACTGCACGAAAACAGTGCCTGAACCGAGGTGTTTACTTAGATGGTTCTAGACCCAGCATGTTCCTCACTGGAACCTGACGTCGGTACGTGATCCTCTATACCTCCTTTTCGGTATTGGCCTGGCAGCTACTCTAACTGTTTGGGCCGCGCCGATTTCTCGAGTCCACACGGCGAGGTCAGCAAAATTGCCAGTTAGTGGATGTTGGGATCTCAACGCATTACCATGAGAGTTCTTGGTTTACCCGTTAACATCGCTGCGCACGGTGTGAAAAGCCTGTTTCTTTGGCCCCCATCATCTTCGGCCCGCAGATCTCAGATCAATGATGTAAGGTTGCGGCGGCAAAGACTAGACTTGAGTCGTGAGATGGTGCTTTGCTGAGGCCGTCTCCTATAGCTTATTCTAGGACTTTCCGCAAACCACCCGACGTGCGGCTGTCCACGATCGGATTCCATTCTGTCTCGGAGCATACAGCACTAGATTTGCCGCTTGAAAAATGTTCCATAACCATGATTTCAACCCCATCTAGTCGGCAGGCACAGCTGAGAACAGCGAAGGGCGTCGTGAAGGGCATTGCCCGTAGTGTTTCAGACGTGCTAGAGACTAAATCAACTATCTGCACTCGTAGCCTGGCGTGTGAGATGTCACCACGATGTGCCTAGAGGAGTGATTATGAACATGTATTACCACGTCCGGGTGTCGACGGCTATATGGCTAACATTTCTTATGGCTAGACGTGCTTGGAAAGGTTCCCCAGCCTTCTGTTTCCCGGTGCTTTCCACGAGTCTGGAGTTCTGGTAATTAACTACATGGCGTTAACGCGGAGGTAACCCCCAGTCATTGCATTGCAGGTAGGGCTTAGGTGCAATATAATTCACCAAGGCGCGGATTCCTCACGATTGTTACGAAGACACCCGGAGGGTTTCAGTATGGCTTGAGAAGTGTACGTTTTTCCGGCCAGGGTGTAACTATAACCAACACATGTTTGGCCACGGGCTAAGTCGGTCCGCACGACTGATTTCCCCCGCCCATGTGTTTGGGAGCAATAAACTGCGTCTGCCAAGAGTAACAACTCGAGTAGAGAAGGGAAGTCTCAGACTATTTTGCAAATCAGACTGTAAGGCTCAACAGCCATACAGCTTGCCCTACTACTGAATACTAGCGTAGCGTGGCCACATAGGAAAGACTTCATGTCTTCTAATAACCTTTTACCTCCAACGTCCCCGCCGTCTTCACGCGGTCCAACGATGAGGAAACAACCACCCCTATCTTCCGCGGAGTGGTTCACACGACCCCCGGCGTTAACGCGCACGTTGTTGTCTTTCGGGACGGCACTACCCCCAAATGCCCAGACCCAGTGCTAGCGATATTCAAACGCCGTCCGGTAAGTCCTGACGTTTTTCAACTGGATGCACTGGCGACACGTAGTTCGCAAGGCGTCCATGAGAGGTTTTAACCGTCATGTTTCCGTATCACGTCTTATGTCTGTCTCTATTCTCAGCGAAATTCTCATCATAGGGCGGAGACTATCTGAAGGCCAGCGAATACAAGATTTAATATCAAATATAGCATGGGGGCCAACAGAGGCCCCCCTGGTGCTGACGAATTATCGTGATATTAGTACAGCTGTCTGCAATGCCATTTCGAAGGCTTTTTGTTCGTATCACTGCTCTATGCATAGCGGTCACTATGACCTCTCAGCTTGACTCACCCGAATGACCAATTGTGGTCCAGCACTCCCTCATCTTCCCCCATTAACGATACGTTGGGCACCATCGGTGTGAGCTACCCGTTACAGTCATAGAATCGTTCTTTGCGTTGTACGCGGCACGGAGGTGACCGGGAAAAGCGCCGCGAAGGCCCCGCACTGAATAAAGCTAGTATTAGCGTCTGTCAAAGTGTTTTGACACCTAATTCGCTTCCAAGTCCCAATATCTAATCTAGCCTGCTTTGGGCCAACATCTCATTGCGTTATGCTAATGAAGAGGGTGCGGGATCACATCCGCTCTTCTCTTCCTATACACAGCGGACATTCGGGTTGGACGTTTGGAGTGATAATTTATCGTTAGGGATAAGTATGTCGGCGCTTAGTAGTATAGCCCGCTGACCAGCGTTCGATTTCGAACCTTACTGGACATTCTCAATAACTACTGATCATGACGTTTTCCTCAGTTCCTAGCCTTGACAACTAGCCACAGTCAGCATGGTAGAGAGCGTTGAGCCGGGGATAGCCAGGCTATTAAGACAAAGACCCTCGGGCCCCTTAATGCGCGTCAAGTCTGACGGTTTGAGTGCGGAGCAGTAAGCGCTTTGGTATAACCGTGACGTAGCAGATCCATGCTTCGCCCGCTTCCACCTGAGAGATACTAGCCTCTTTCGCACTTTGTAGGATTACGGGCAGCGAAATATTTATCCTGTGCGGCGAGCCCGCTTCGGTTTCGAGCTCTATCAGTGCGCGGTTGGCACTCCAACGCACGATAACATATACCCGCCCACAAGGCCATGCAGGTTTAACCTCCTATTCTGATTGTACCTGGCTGACTTTACGGTACCCACCAGCGCAGGATTAATAGCCTAATTATGCTAACCGGTGCTAGTCTAACTGCTGTTACTAGTCCGCCCCAGCTACCCCACGGGTCAGTAACTGCACCAGCAAGCATGGTTCTCCTCCTGAAGTTGTACGTTCGAGAACCCCGTATCGAGTTGGTATATAAATTAAGGGTTGTCTAAAACAGAAGCCTATTCCGCTATCATCGGTGTAATAACTGATCGCGCCGTGGTTAAATGGAGGAGCACCCGCATGGATACATCGCTAGCGTCTTGTAACTCTCTGGGGGCCTAGTATGGAACGGAACAATGACATCATTGCTTACGGGGCCCGCACTTAGCTGTCGCGTATCGCAAATCATATGGCATGTCAGTCCCGACATCACGAAAATGACCCCATCTGAGGTGGTCGGGAGGCGAACAGTCGAATATGATGTATGCACCCGCAACTTAATGTTCAAAGGCGGGCGAAATGCCTTCTCCCGTCCGGACTATCCTGAGTGCTAGCCGCGAGTCTGTAAAGGTTGACGCAACCATATAGCACGCAGAAAAATCACTCTCACACCATGAGAACCATGGCGGCACGCTGTCTACTTTGTCTGACAGGCTACGGAAGGAATGGTACATACGTACAAACGGATGATATGATATCGGTCATTGCCTATTGTGACGCTACCCTACTGCATCACCCCCTTAGAATGCGTTGGACGCTCTATAGCAGATCCTCCATCCAGTGGAAGTCTCGTCGCCGTGGTTTGCCTTAACGACCGTTGGAGAGAGCAGGACAGAAATATCGCCCTTTTGAGCGCATTATTTGGAATCGAGGTAAGTCAGTGCGGCATAATCGCGCCTCGTGAGCGGAACAGTTTTTGATCCCACCCGCTAAATGCCAAGGTGCTGTAACCTGGGCGCGACACCAAAAGACCACGTGCTGTATGAAGCATGTGTTCTAGCGCACTCTCAACCGTTACCCCGAGAGTAAAATGTTAGTTGTAGGCCGATTCTGCAATGGTAATTGGCGGAGTGTCTAGGGAAATGTTTCGGTCATACTTAACCGGCTACCTCTTCCTCCCTCAGATTCGGTCTGAGATGAGATATACTGGGTGAGTTGAGTCGCCCTGTATCGTTGCGGCGCTCGTGGACCAGACAGACAGTTCCCGTTTATCTCTGCTTCTAGATGGAGGGTCGCCTCCGTGTTAACGCCGGCGAAGGTAGTCGCAGCTGAAGTTGTGATGCACAATCAGGTGAGCCTTTTAAGTATGGTCCTACGGACGTGAACAGCTGGGCCCAGTCATTTAGTACGGGGGGTTTACCTATAAGGATACGGTAAGAACGTCATCTATCCGTCCCACTGGAGTCCGAGGGGTTCGTGTCTACACGGATTACTTATCATGCACACACGTCTACGGTCATGCATAAAGTTGTGCAGCGCAGCAATCGGAGCGGAGTTACACCATCTCCCTATTAACAAGGCACTTATTAGTACTTACCCCGTTATAGAGCTCTCATCTTATCGATAGAGCGCAGTCCTAAGTATTGGCTCGAGTGATTCGCTCCTCAGCCCTTGATTGTAACTCCCCCGATTGCAGGTTGTATGGTGAGTAAAATCTCTGCGCCCTTCTGTTCGGATAAAGAACCCCGACCACTAATGCCCGCCTGCTTGTTGGGCGGTAAATGGGTAACGGAACATGGACTATGAGTGCGATGATGGTCAATAGAATTACCTTATTACGCAGTAAAAGGAATGACGCAGACAGGTATTTGTCGACGATTGCTTCGAACCTGGCAAAATGGGGAGGTATCCTGTCATGTTCATCTGTAAAACAACTCCTGCCTCTTCGTAGAGGACACACACTGTGGGCCTTTAGCCTTTAGCAGCCCATTGGGGCTTACCAGCTGTCGTCATGGGGTATCATTAAGATCCATGCGCCCCCGAAACTTACTGCAAAACAATATGGCTTAAAGGTAAAGGGACCATCAGGAGAATGCTTAAGAGCGACATATAGATACGTATTTAATTAATTTATGTTAACGCAACCATCTCGCAGGAGTCGCATAGCATATTGCCGGGTGATAGTTAATGCACTGTGCTTCCGTGTTTATATAAAATAAGCAGTAACCTCTGACAGGTTGAGACTCCAACAAGTGCTCCGGGTATTTACCTTCTACCATGGCGTTCTAATATCACGAAAGAGAAATTGTGTGTACCGATGCCAGGTGACCGCCCGCGTGCGCCAACGACGCAATCTAGAGCATCCACGCTGAATTGGGGAACTCTTGCCGTTCGTCGCATGGTGTACTTGGTACCACTCGATATGCCTGATTAGGTTTGGCCGTAGCACGTAAGGTAGTGACTTTCCATTCAAGCTAGCGAAGCGACACCACCACAGTGCCCGGTCAAAATAACCCACACCTGGCCAGCATAGAGGCTAAAATAGCTACAGTGCGCTAATCGAGTGTTTTTGCATCGGCTCGTGGCTGGTGGACTCGGGACAGCTTAGAACTAACTCTGGTGTACAAACGCGATCGTAGCTCTCGCGACTTACTCACCGGAGTAGGTTAGATGGACAAGACCTAACCCGAAGCCTAAATCGCCCTGAGTGTTAGCCGCCATTCAATTCTATGGTTTATCGGGGGCGTCTATGGCTGCGACAGTATGGAGGCCCGTTATGGGCACCCGAGTATCGTACCATAGTAATCCCATATTCCTCTTCGAGCGACTATTGGATCAACATACCTACAGGGTAGTATGAATGTTCTTGATTACAGAAACCATGGAATCGGCGCATTCTATGTTTCACTTCCGAATAACAGTGAGCAAGGCATGCCCTTGACAAGGATCATCCCGACAGCAAGCCGATCGGGCCCTAGAGCCCGACCCCCAAACAGAACACCGGCCACGTAGTTGCTGGGACTAAACAAAGGTGTGTTTCCATAAAAGGAAATCTTCAAGTGTATTGTTGAGTCGTAACGCTTATATTTATGGCCCAATGGGCGTTGCGAGCACAGTAGCAGGCCTAGATGAATGCCTAGGCCACGATCGGGGGGAGGCTCATTGAACGTACTGCCATACCAAGCCCCCGTATGCTATGGCAGGAGGGGTTCTCTTCGTATAGAGCGAGGGTCTCTACGCCAAGCAGCATTCCCGTGTTGGGTGGCCAATGGGGCTCACTAGAAACTCGGTTTTTTTAGCGAAGGAATGAGCAAACTCGTGAAAGGTGGTACACACCAGTTGCGGCCGATTTGTTGTAGCAACAAGGTTTGAAGAATTGAGTAGATGGGCCAATTTACCTCCTATTTAGCGAGTGAGATGGCGCATGTTTATTCAGACTCCATGTGGGGTAGAGGCTAATCGTTTAGTAGCAATAACCCCGCGGGGCAAGAGACCGTAATAACTTGAATCTGTGGTAGCTATGAATATGTGCTTCGCCCTAAGTGTTATGTAACAAGAGTGATCCAGGGGCTCAGATCACACTTAGTACGATCCGCTACTGAAATGCGGCCGCGGGCTTGCACGCTGGACATAAGTCGGATAATCAATTGCCTACGACAGGTTCAGCCATAAGGCTTGGCTCCTAACACACTCATGATGTCTGGCTTTTACTCGTGCCCGGACATAAACGTATGCTCAAACGCGAGACAGGGGAGGGTCAGCACCGTTTAGATCTATAAGGCCTACCGGTAATATGGATCGACAACAAACAGATGCTATAGGGATACCTACTCCTTTGGACCCACATGTAGATGAAGGCAAACACGCAGAGCAAAGGAGAGTAGTCCACCCGGTATAAGTTTGTGCTTTGAATTCTGGCTACGCAGACTTGCACTCTGTCCCGGCATTCACTATACTTCTCCGGAAGTCCTTTAAGAAATGTCCGCGCTCATGTGGTTCCCGTTGCTCAGGGGCCAACTCAAGTAGATCTTTAAGGCGCAGTCGACCACAGGCTACTAGATACGAGTTATACTTATCCGGACATCTGGCTAAATACTTGGATACGATACTTCCCCAGTCGTGAGAACGAAGCTAATACAGATCGAATTTCGATGGTTCAGGCAGGCAGTTCTCAGGAGGCAAGGTGTTAAATAGTTTCGGAGGCTCTTTCGTACGATCAGGGTCTACTACCCTAGGGCATTTTGACTTTGGATTAAATATGCAAAATGCAAGGCCGATTGTGATCAGTACTGATACTCCAACTGGACCACCTTCAGACCCTTCGAGGGGACCTAGACGACGGGAACCCTTCCAGCGGGTGATACCAGTTAGAGCAAGTCACAAACACGATTCAGCCCCCGGGGTTTATGACGTACCATGCGAGTAATAATGCACGTATACGGAGCTCTTCCACCGAGCGATGGCATTTCGGGGCGAGGTAGTTGTCTTTCATTGGCATCGCACAACCCCCATCCTCTTAATTGGCATCGTCTCCAGCTGGAAAGAATTTGAGTGAGCATGTCGCCCCTATTATTCCGTTGCCAATAAAGTGTCTCAACTTTTGGCGAAGGTTTTAACGCATACAAGGAGAAGCCGCGAGACGTCTGTACCGCTGATCTGGACGCAAAGTGCTCGGACTGCCGCTGAGTTATCCTGGACGCCATGATTAGAGCCGTCGTCACTACCTGCATACATGGGCCGATAGAGTACTGCAACCAACAACTCACTTAAGCTCCACAACGGCTGGACACTTCCGAGAGCGGTCTTACACAAACGTTAGGTCCTGGGCCGCCGACCTTACCGCTAGTTAGTGAGAGCCAGTTAAAATTATGAACGCTCGGAACCTTCCCAACAGTGGCCGCAGCCTTCCTTGACGCCTAGCACATCTGGTTTATACTCGGGTATGCCGTAGATCGGTAACCTAGGGAACGACCCTGTGGGTTTAACACCCGAGTGCGTAATCAAGCCTAGAGGCCATCTCAACTCGAGAGGTCTCCTGACAAAGAGGCGCCCGATGAATCATCCAGAGGCGTCTGGCGGTCCTACGAGAGTGGCTTTGGATGCCTGCCCCTTGGATGGATCTGTCTTTAATCGGCGCCAATACCTAGCACTGCTAGGCTCCAGACTGTGTTTACATGCCGTAACCCTGATACTCGCAGAAACGTTGCTGGAAATTCCTAGCAGCTGAAACCATTCCCCGTAACGTACTAGTACGCTAAGAGAGAGTCTCTCCTGGCCCTGATGAGTGTGTTCTCATCTGGGGCACGATACAAGAATCGGAACGAACGCAATGCCGAAGTCCCTTGTACCTTAATTTGGGCGACGCAGATAGACCCAAAGATCGCGGACTACGGAAACTAGCATAGGACCTGTGTCGAGAAGGTTCGAGCAGGTAGTGACACGCAGCGCGGTGGCCGGCGGGGTGGCACATTGCGGGTCAATACTGGTAGTAGCCACTCTTTGGACATAGCGGCGGACCAGCGCCTAGAATGTCTCATTCTCATTTTGTTCCGTGGCACGTTACGTAATGACGGCCCGCCAGCACCTGTGTATGGACTTGTAGCTCGGGCCTCTGGTCCTGGCACGACAAGGCACCAGCCAGTAATCTCTCCTAAGGCGCTAGCGTGCATAGCGCGTCTGCCTACCGCCAGAGAACGCGTCATCTGCAAGACGTCCCAGCGTAGTGAATTGTAACTGCAAGCGTTCTCTTACGGTCATAGTGCCGATTTTGAGCAGTAATGGAAGCAGCAAAATGCCGCCCAAGCGATTCGCAAACTTCTAACAGAGCTACAGCCGGACACGACGCGGTGGTGCTCGCGGTTGGTGATCTTATGATATTAACGCCCATAGCGGCCATCTTAATCGACACCATGTTCGTTTTGGCAGGCCTTGTGGTAAACACGTGCTAGTGGCACCACCCATGCCCGTGCCCATACATCCAAACCGAGAGAAAGCCTATTTAAGCGAAAACCACAACTTCGAGGTTTCACCCCCTGCCATTGATAAAGCGAGGAGTACCCCCGATGCCGGGAAGCGTCCGCACCCATTTCTTTCGTTCTGGAATCCTCGGGCGACTTCTCGAAGATACTGTGCTCACGACCTGGAGTATCATGAACAATCGGAGGAAAATGAGTAATTGTCGAGTCGTTGTTAGACGGCACTTCCGTCCGGCCCAACTGTTCTCGGATACGTGTCCCGTGGTCAATGCTCTAAACCGGCTGCCGGCGACTCAGTTCACTGAGACAAATTCTGATGCTTTCGAAGCAAGGATGCGCCCAGAGCAGAGCTGCCCAGATGAGGTTAAGAACGTAACTATAATCGATCAGCCATTCGGCTTAAGGGGCCCCGGCGAAACGCGAAACACTTGGCACATGGACGCTTCACGCGCAACAGTAGTTGTCTCTTTCGTGAGCCACCGTAGCAGCTAGAAAGGCCTATCCAGTGATGCTTTATGACTGAGTGTCGAATCTAGGTATAGCATAGACTGGCTGATCGGGCGGGTCGGCCCACCCGTCTCGGTCGAGCGGTTCTGACTTTGGGTGGCTGTGTGAACCCAACTGCAGATGGAGTTGAATGGGTACACCCTATGCGAGGCCTCGTCTTTACACCAAATCGGGGCCCTGTGAAGTGCCACTCTTTTCCAGCCGGCAGCCGCTCAGTCTGATTTTGCTTGTACATGTCGTGTGCGAACGTTCCGGGAGGCTTCCGTGTTCCAAATACCGTGTTCTCATATTCGGTCCATCTACCGACGGAGAGTTGGGATGCCCGGGCCCGGAAATATAATTTAAACTCGTGGCCAAGAATTTAGCATGTTGTAAACATGAGAGACAGGGCCGGGCTAAAACATTACCCCTGAGTAATGTAGAGCCACAACTGAACATAACATTGGGATCTAACGCACGCAATCAGTGTAGCTTCAGCCCACCCTCTAAATTTCCCCCGGACAACTGGATTATCACCTGCGTCACGCGATAATTGCTCGCATCTCACCAACACACTTCGACAAATCTGGAGTCTCCCTGGTCCGTACGTCCAAAACCGTTTAAATGGGCGGGTGTGTCGTGAACCAATCTCCTCTTCCATTTGTCACATACTGGCGATGACATCCTTTTACTTGAATTATTCATCCGGGCACCAGCCGCTTTCCCTACGATCCCCGACACTCGGGGCTTCGGGAGTTGCCCGCCAAAAAACCGACAAACCAAACTATACAATCAATCCCATCTAGATGTAGGGGACTGAGGCTCTAAGCTATGCGCCTACTATACTTTGTAGGTATCAAACTACGCTTGAAGATAGTTGATAAGGAAGCGAATTGATCGAGTACCGTATCTTCAGTCCGACTCCCGTTCGAACGCAGCACGCTAACATGGTCCACTGGCATTCTTACTAAATACCTAGTTCACTTCTACATGAGGAGTGTCTGGGCCGGACTCACCTTTGATTAGATAACTGAAG diff --git a/public/gwas-test/src/test/resources/fake_chrQ.fa.fai b/public/gwas-test/src/test/resources/fake_chrQ.fa.fai new file mode 100644 index 0000000000000000000000000000000000000000..b7a558fdb3b3c0e85f6e3c634cc3ae80c601336d --- /dev/null +++ b/public/gwas-test/src/test/resources/fake_chrQ.fa.fai @@ -0,0 +1 @@ +chrQ 16571 6 16571 16572 diff --git a/public/gwas-test/src/test/resources/log4j.properties b/public/gwas-test/src/test/resources/log4j.properties new file mode 100644 index 0000000000000000000000000000000000000000..501af67582a546db584c8538b28cb6f9e07f1692 --- /dev/null +++ b/public/gwas-test/src/test/resources/log4j.properties @@ -0,0 +1,25 @@ +# +# Biopet is built on top of GATK Queue for building bioinformatic +# pipelines. It is mainly intended to support LUMC SHARK cluster which is running +# SGE. But other types of HPC that are supported by GATK Queue (such as PBS) +# should also be able to execute Biopet tools and pipelines. +# +# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center +# +# Contact us at: sasc@lumc.nl +# +# A dual licensing mode is applied. The source code within this project that are +# not part of GATK Queue is freely available for non-commercial use under an AGPL +# license; For commercial users or users who do not want to follow the AGPL +# license, please contact us to obtain a separate license. +# + +# Set root logger level to DEBUG and its only appender to A1. +log4j.rootLogger=ERROR, A1 + +# A1 is set to be a ConsoleAppender. +log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-5p [%d] [%C{1}] - %m%n \ No newline at end of file diff --git a/public/gwas-test/src/test/resources/specs/files.specs b/public/gwas-test/src/test/resources/specs/files.specs new file mode 100644 index 0000000000000000000000000000000000000000..6385b4296366055c42ecf14e4d6443af7f8222b3 --- /dev/null +++ b/public/gwas-test/src/test/resources/specs/files.specs @@ -0,0 +1,26 @@ +{ + files = ( + { + chromosome = "chrQ"; + name = "imputation_06/test"; + orig = "imputation_03/LLS_Offspring_Partners_Final_37_Overlap_chr1"; + }, + { + chromosome = "chrQ"; + name = "imputation_06/test"; + orig = "imputation_03/LLS_Offspring_Partners_Final_37_Overlap_chr1"; + }, + { + chromosome = "chrQ"; + name = "imputation_06/test"; + orig = "imputation_03/LLS_Offspring_Partners_Final_37_Overlap_chr22"; + }, + { + chromosome = "chrQ"; + name = "imputation_06/test"; + orig = "imputation_03/LLS_Offspring_Partners_Final_37_Overlap_chr22"; + } + ); + jids = ( "2102064" ); + type = "tped3col"; +} \ No newline at end of file diff --git a/public/gwas-test/src/test/resources/specs/test.gens b/public/gwas-test/src/test/resources/specs/test.gens new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/public/gwas-test/src/test/resources/specs/test.gens_info b/public/gwas-test/src/test/resources/specs/test.gens_info new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/public/gwas-test/src/test/resources/specs/test.gens_info_by_sample b/public/gwas-test/src/test/resources/specs/test.gens_info_by_sample new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/public/gwas-test/src/test/resources/specs/test.gens_summary b/public/gwas-test/src/test/resources/specs/test.gens_summary new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/public/gwas-test/src/test/resources/specs/test.gens_warnings b/public/gwas-test/src/test/resources/specs/test.gens_warnings new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/public/gwas-test/src/test/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTestTest.scala b/public/gwas-test/src/test/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTestTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..7455c60183fd9cffc0d6ebc4d44b7503b1a24d49 --- /dev/null +++ b/public/gwas-test/src/test/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTestTest.scala @@ -0,0 +1,98 @@ +package nl.lumc.sasc.biopet.pipelines.gwastest + +import java.io.File +import java.nio.file.Paths + +import com.google.common.io.Files +import nl.lumc.sasc.biopet.utils.config.Config +import org.broadinstitute.gatk.queue.QSettings +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +/** + * Created by pjvan_thof on 4/11/16. + */ +class GwasTestTest extends TestNGSuite with Matchers { + def initPipeline(map: Map[String, Any]): GwasTest = { + new GwasTest { + override def configNamespace = "gwastest" + override def globalConfig = new Config(map) + qSettings = new QSettings + qSettings.runName = "test" + } + } + + @Test + def testFromVcf: Unit = { + val pipeline = initPipeline(GwasTestTest.config ++ + Map("input_vcf" -> GwasTestTest.vcfFile.toString + ) + ) + pipeline.script() + } + + @Test + def testFromGens: Unit = { + val pipeline = initPipeline(GwasTestTest.config ++ + Map("input_gens" -> List(Map("genotypes" -> GwasTestTest.vcfFile, "contig" -> "chrQ")) + ) + ) + pipeline.script() + } + + @Test + def testWrongContig: Unit = { + val pipeline = initPipeline(GwasTestTest.config ++ + Map("input_gens" -> List(Map("genotypes" -> GwasTestTest.vcfFile, "contig" -> "chrBla")) + ) + ) + intercept[IllegalStateException] { + pipeline.script() + } + } + + @Test + def testFromSpecs: Unit = { + val pipeline = initPipeline(GwasTestTest.config ++ + Map("imute_specs_file" -> GwasTestTest.resourcePath("/specs/files.specs")) + ) + pipeline.script() + } + + @Test + def testEmpty: Unit = { + val pipeline = initPipeline(GwasTestTest.config) + intercept[IllegalArgumentException] { + pipeline.script() + } + } +} + +object GwasTestTest { + val vcfFile = File.createTempFile("gwas.", ".vcf") + Files.touch(vcfFile) + vcfFile.deleteOnExit() + + val phenotypeFile = File.createTempFile("gwas.", ".txt") + phenotypeFile.deleteOnExit() + + val outputDir = Files.createTempDir() + outputDir.deleteOnExit() + + val reference = new File(resourcePath("/fake_chrQ.fa")) + + private def resourcePath(p: String): String = { + Paths.get(getClass.getResource(p).toURI).toString + } + + val config = Map( + "reference_fasta" -> GwasTestTest.reference.toString, + "phenotype_file" -> GwasTestTest.phenotypeFile.toString, + "output_dir" -> outputDir, + "snptest" -> Map("exe" -> "test"), + "md5sum" -> Map("exe" -> "test"), + "gatk_jar" -> "test" + ) + +} diff --git a/public/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingTest.scala b/public/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingTest.scala index cce528aaa5f10f796dd06c34f45860b689ccd12e..7b8f289df558dfa2667fd44cbdcfc0b3874926b6 100644 --- a/public/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingTest.scala +++ b/public/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingTest.scala @@ -35,7 +35,7 @@ import org.testng.annotations.{ BeforeClass, AfterClass, DataProvider, Test } abstract class AbstractTestMapping(val aligner: String) extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): Mapping = { new Mapping { - override def configName = "mapping" + override def configNamespace = "mapping" override def globalConfig = new Config(map) qSettings = new QSettings qSettings.runName = "test" diff --git a/public/pom.xml b/public/pom.xml index 4171089d31ef05ec2c544c29154b809638004dd5..63bde77e79b63f61366f06a7d472a98b0cd9db0d 100644 --- a/public/pom.xml +++ b/public/pom.xml @@ -46,6 +46,7 @@ <module>biopet-tools-extensions</module> <module>biopet-extensions</module> <module>biopet-tools-package</module> + <module>gwas-test</module> </modules> <properties> diff --git a/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTrait.scala b/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTrait.scala index f0fe2c1291d647815f8d336902fbd944f6312f3f..76aee99b4f3b1c857edc4f402aaae1133f313a55 100644 --- a/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTrait.scala +++ b/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTrait.scala @@ -36,15 +36,15 @@ trait ShivaTrait extends MultisampleMappingTrait with Reference with TargetRegio Some(shiva) } - /** Method to make the variantcalling submodule of shiva */ + /** Method to make the variantcalling namespace of shiva */ def makeVariantcalling(multisample: Boolean = false): ShivaVariantcallingTrait with QScript = { if (multisample) new ShivaVariantcalling(qscript) { override def namePrefix = "multisample" - override def configName: String = "shivavariantcalling" + override def configNamespace: String = "shivavariantcalling" override def configPath: List[String] = super.configPath ::: "multisample" :: Nil } else new ShivaVariantcalling(qscript) { - override def configName = "shivavariantcalling" + override def configNamespace = "shivavariantcalling" } } diff --git a/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTrait.scala b/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTrait.scala index f08b4cc18402e995375f14f1095e922883b8dc98..06f876979fb3ae541be76758d031174cb6c93e8b 100644 --- a/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTrait.scala +++ b/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTrait.scala @@ -92,8 +92,8 @@ trait ShivaVariantcallingTrait extends SummaryQScript caller.outputDir = new File(outputDir, caller.name) add(caller) addStats(caller.outputFile, caller.name) - val normalize: Boolean = config("execute_vt_normalize", default = false, submodule = caller.configName) - val decompose: Boolean = config("execute_vt_decompose", default = false, submodule = caller.configName) + val normalize: Boolean = config("execute_vt_normalize", default = false, namespace = caller.configNamespace) + val decompose: Boolean = config("execute_vt_decompose", default = false, namespace = caller.configNamespace) val vtNormalize = new VtNormalize(this) vtNormalize.inputVcf = caller.outputFile diff --git a/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/RawVcf.scala b/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/RawVcf.scala index 8c1613526e56e5f1d4d77ce6cd336ab2fee4578a..847e671166191da3153cc2df818828c66de37aa1 100644 --- a/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/RawVcf.scala +++ b/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/RawVcf.scala @@ -29,11 +29,13 @@ class RawVcf(val root: Configurable) extends Variantcaller { // This caller is designed as fallback when other variantcallers fails to report protected def defaultPrio = Int.MaxValue + val keepRefCalls: Boolean = config("keep_ref_calls", default = false) + def biopetScript { val rawFiles = inputBams.map { case (sample, bamFile) => val mp = new SamtoolsMpileup(this) { - override def configName = "samtoolsmpileup" + override def configNamespace = "samtoolsmpileup" override def defaults = Map("samtoolsmpileup" -> Map("disable_baq" -> true, "min_map_quality" -> 1)) } mp.input :+= bamFile @@ -44,11 +46,11 @@ class RawVcf(val root: Configurable) extends Variantcaller { add(mp | m2v) val vcfFilter = new VcfFilter(this) { - override def configName = "vcffilter" + override def configNamespace = "vcffilter" override def defaults = Map("min_sample_depth" -> 8, "min_alternate_depth" -> 2, "min_samples_pass" -> 1, - "filter_ref_calls" -> true + "filter_ref_calls" -> !keepRefCalls ) } vcfFilter.inputVcf = m2v.output @@ -61,7 +63,7 @@ class RawVcf(val root: Configurable) extends Variantcaller { cv.inputFiles = rawFiles.toList cv.outputFile = outputFile cv.setKey = "null" - cv.excludeNonVariants = true + cv.excludeNonVariants = !keepRefCalls add(cv) } } diff --git a/public/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaSvCallingTest.scala b/public/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaSvCallingTest.scala index 7e3f1ccdfa0b7d2f4b842f247476c9f5028c5a23..b0eaaa4303b9725ce88c2856d3d050060196140f 100644 --- a/public/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaSvCallingTest.scala +++ b/public/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaSvCallingTest.scala @@ -40,7 +40,7 @@ import scala.collection.mutable.ListBuffer class ShivaSvCallingTest extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): ShivaSvCalling = { new ShivaSvCalling { - override def configName = "shivasvcalling" + override def configNamespace = "shivasvcalling" override def globalConfig = new Config(ConfigUtils.mergeMaps(map, ShivaSvCallingTest.config)) qSettings = new QSettings qSettings.runName = "test" diff --git a/public/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTest.scala b/public/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTest.scala index 31777668d11df48a3ad99a663d60aad6e7666966..c2249f15511d241428ef6f7d8f39b4bbc41e1143 100644 --- a/public/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTest.scala +++ b/public/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTest.scala @@ -36,7 +36,7 @@ import org.testng.annotations.{ DataProvider, Test } class ShivaTest extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): Shiva = { new Shiva() { - override def configName = "shiva" + override def configNamespace = "shiva" override def globalConfig = new Config(ConfigUtils.mergeMaps(map, ShivaTest.config)) qSettings = new QSettings qSettings.runName = "test" diff --git a/public/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTest.scala b/public/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTest.scala index b1699c7b97132b1df20c88d49c6202164fa89227..d86e46c7e9ef21dfcdd5a67f779e46adc2b23483 100644 --- a/public/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTest.scala +++ b/public/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaVariantcallingTest.scala @@ -40,7 +40,7 @@ import scala.collection.mutable.ListBuffer class ShivaVariantcallingTest extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): ShivaVariantcalling = { new ShivaVariantcalling { - override def configName = "shivavariantcalling" + override def configNamespace = "shivavariantcalling" override def globalConfig = new Config(ConfigUtils.mergeMaps(map, ShivaVariantcallingTest.config)) qSettings = new QSettings qSettings.runName = "test" diff --git a/public/tinycap/.gitignore b/public/tinycap/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..1de565933b05f74c75ff9a6520af5f9f8a5a2f1d --- /dev/null +++ b/public/tinycap/.gitignore @@ -0,0 +1 @@ +target \ No newline at end of file diff --git a/public/tinycap/src/test/resources/log4j.properties b/public/tinycap/src/test/resources/log4j.properties new file mode 100644 index 0000000000000000000000000000000000000000..501af67582a546db584c8538b28cb6f9e07f1692 --- /dev/null +++ b/public/tinycap/src/test/resources/log4j.properties @@ -0,0 +1,25 @@ +# +# Biopet is built on top of GATK Queue for building bioinformatic +# pipelines. It is mainly intended to support LUMC SHARK cluster which is running +# SGE. But other types of HPC that are supported by GATK Queue (such as PBS) +# should also be able to execute Biopet tools and pipelines. +# +# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center +# +# Contact us at: sasc@lumc.nl +# +# A dual licensing mode is applied. The source code within this project that are +# not part of GATK Queue is freely available for non-commercial use under an AGPL +# license; For commercial users or users who do not want to follow the AGPL +# license, please contact us to obtain a separate license. +# + +# Set root logger level to DEBUG and its only appender to A1. +log4j.rootLogger=ERROR, A1 + +# A1 is set to be a ConsoleAppender. +log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-5p [%d] [%C{1}] - %m%n \ No newline at end of file diff --git a/public/tinycap/src/test/scala/nl/lucm/sasc/biopet/pipelines/tinycap/TinyCapTest.scala b/public/tinycap/src/test/scala/nl/lumc/sasc/biopet/pipelines/tinycap/TinyCapTest.scala similarity index 98% rename from public/tinycap/src/test/scala/nl/lucm/sasc/biopet/pipelines/tinycap/TinyCapTest.scala rename to public/tinycap/src/test/scala/nl/lumc/sasc/biopet/pipelines/tinycap/TinyCapTest.scala index b8ec5c3ab159b584c5935bc87e29892799216a71..7275b4349ec04a0e39cc45693173201bff1cc17f 100644 --- a/public/tinycap/src/test/scala/nl/lucm/sasc/biopet/pipelines/tinycap/TinyCapTest.scala +++ b/public/tinycap/src/test/scala/nl/lumc/sasc/biopet/pipelines/tinycap/TinyCapTest.scala @@ -35,7 +35,7 @@ class TinyCapTest extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): TinyCap = { new TinyCap() { - override def configName = "tinycap" + override def configNamespace = "tinycap" override def globalConfig = new Config(map) diff --git a/public/toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala b/public/toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala index 1ccb0ea95fedc96d6e967e073ef80a6c3abc0195..2e3ffe26a1735dbea0f664bf8c4c957bd7262f7b 100644 --- a/public/toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala +++ b/public/toucan/src/main/scala/nl/lumc/sasc/biopet/pipelines/toucan/Toucan.scala @@ -119,8 +119,8 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum def importAndActivateSample(sampleID: String, inputVcf: File, gVCF: File, annotation: ManweAnnotateVcf): ManweActivateAfterAnnotImport = { - val minGQ: Int = config("minimum_genome_quality", default = 20, submodule = "manwe") - val isPublic: Boolean = config("varda_is_public", default = true, submodule = "manwe") + val minGQ: Int = config("minimum_genome_quality", default = 20, namespace = "manwe") + val isPublic: Boolean = config("varda_is_public", default = true, namespace = "manwe") val bedTrack = new GvcfToBed(this) bedTrack.inputVcf = gVCF @@ -185,7 +185,7 @@ class Toucan(val root: Configurable) extends QScript with BiopetQScript with Sum */ def varda(vcf: File, gVcf: File): File = { - val annotationQueries: List[String] = config("annotation_queries", default = List("GLOBAL *"), submodule = "manwe") + val annotationQueries: List[String] = config("annotation_queries", default = List("GLOBAL *"), namespace = "manwe") //TODO: add groups!!! Need sample-specific group tags for this val annotate = new ManweAnnotateVcf(this)