Commit 659a271f authored by Peter van 't Hof's avatar Peter van 't Hof Committed by GitHub

Merge pull request #44 from biopet/tarmac-base-pipeline

Tarmac base pipeline
parents afbc2c46 224ef05b
......@@ -108,8 +108,8 @@ trait MultiSampleQScript extends SummaryQScript { qscript: QScript =>
val g: Option[String] = sampleTags.get("father").map(_.toString)
g.foreach { father =>
if (sampleId != father) Logging.addError(s"Father for $sampleId can not be itself")
if (samples.contains(father)) if (samples(father).gender == Gender.Male)
Logging.addError(s"Father of $sampleId is not a female")
if (samples.contains(father)) if (samples(father).gender != Gender.Male)
Logging.addError(s"Father of $sampleId is not a male")
else logger.warn(s"For sample '$sampleId' is father '$father' not found in config")
}
g
......@@ -119,13 +119,17 @@ trait MultiSampleQScript extends SummaryQScript { qscript: QScript =>
val g: Option[String] = sampleTags.get("mother").map(_.toString)
g.foreach { mother =>
if (sampleId != mother) Logging.addError(s"mother for $sampleId can not be itself")
if (samples.contains(mother)) if (samples(mother).gender == Gender.Female)
if (samples.contains(mother)) if (samples(mother).gender != Gender.Female)
Logging.addError(s"Mother of $sampleId is not a female")
else logger.warn(s"For sample '$sampleId' is mother '$mother' not found in config")
}
g
}
lazy val family: Option[String] = {
sampleTags.get("family").map(_.toString)
}
lazy val sampleGroups: List[String] = sampleTags.get("groups") match {
case Some(g: List[_]) => g.map(_.toString)
case Some(g: String) => List(g)
......
package nl.lumc.sasc.biopet.core
import java.io.PrintWriter
import nl.lumc.sasc.biopet.core.PedigreeQscript.PedMergeStrategy
import nl.lumc.sasc.biopet.utils.Logging
import org.broadinstitute.gatk.queue.QScript
import scala.io.Source
/**
* Created by Sander Bollen on 28-3-17.
*
* A multi-sample Qscript with additional Pedigree information.
* Pedigrees follow the PED standard.
* See: http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml for the format
*
* Pedigrees may be parsed from the sample config and/or a supplied PED file.
*/
trait PedigreeQscript extends MultiSampleQScript { qscript: QScript =>
/* Optionally parse from ped file */
def ped: Option[File] = None
/* The merge stategy to use when we have both a ped file and sample tag information */
def mergeStrategy: PedMergeStrategy.Value = PedMergeStrategy.Concatenate
/**
* Case class representing a PED samples
* For the PED format, see:
* http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml
* @param familyId family id
* @param individualId individual id
* @param paternalId Optional paternal id
* @param maternalId Optional maternal id
* @param gender gender
* @param affectedPhenotype Optional boolean
* @param genotypeFields optional genotype fileds
*/
case class PedSample(familyId: String, individualId: String,
paternalId: Option[String],
maternalId: Option[String],
gender: MultiSampleQScript.Gender.Value,
affectedPhenotype: Option[Boolean],
genotypeFields: List[String] = Nil)
lazy val pedSamples: List[PedSample] = {
mergeStrategy match {
case PedMergeStrategy.Concatenate => pedSamplesFromConfig() ::: parsePedFile()
case _ => throw new NotImplementedError() // todo
}
}
/**
* Get affected samples
*
* @return List[PedSample]
*/
def getIndexSamples: List[PedSample] = {
pedSamples.filter(x => x.affectedPhenotype)
}
/**
* Get pedSamples from sample tags in config
* May return empty list if no pedigree can be constructed
* PedSamples can only be constructed for those samples where family is defined
* Furthermore, if a father or mother is given, another sample with this id must also exist
*
* @return
*/
def pedSamplesFromConfig(): List[PedSample] = {
val withFam = samples.values.filter(_.family.isDefined)
val sampleIds = withFam.map(_.sampleId).toSet
val fathers = withFam.flatMap(_.father)
val mothers = withFam.flatMap(_.mother)
fathers.foreach { f =>
if (!sampleIds.contains(f)) {
Logging.addError(s"Father $f does not exist in samples")
}
}
mothers.foreach { m =>
if (!sampleIds.contains(m)) {
Logging.addError(s"Mother $m does not exist in samples")
}
}
withFam.map { s =>
PedSample(s.family.get, s.sampleId, s.father, s.mother, s.gender, None)
}.toList
}
/* Parse ped file to list of PedSamples */
def parsePedFile(): List[PedSample] = {
ped match {
case Some(p) => Source.fromFile(p).getLines().map { x => parseSinglePedLine(x) }.toList
case _ => Nil
}
}
/* Parse a single Ped line to a PedSample */
def parseSinglePedLine(line: String): PedSample = {
val arr = line.split("\\s")
var genotypeFields: List[String] = Nil
if (arr.size < 6) {
throw new IllegalArgumentException("Ped file contains less than 6 columns")
} else if (arr.length == 6) {
genotypeFields = Nil
} else {
genotypeFields = arr.drop(6).toList
}
val gender = arr(4) match {
case "1" => MultiSampleQScript.Gender.Male
case "2" => MultiSampleQScript.Gender.Female
case _ => MultiSampleQScript.Gender.Unknown
}
val affected = arr(5) match {
case "1" => Some(false)
case "2" => Some(true)
case _ => None
}
val paternalId = arr(2) match {
case "0" => None
case otherwise => Some(otherwise)
}
val maternalId = arr(3) match {
case "0" => None
case otherwise => Some(otherwise)
}
PedSample(arr(0), arr(1), paternalId, maternalId, gender, affected, genotypeFields)
}
/* Check whether sample is a mother */
def isMother(pedSample: PedSample): Boolean = {
val motherIds = pedSamples.flatMap(_.maternalId)
motherIds.contains(pedSample.individualId)
}
/* Check whether sample is a father */
def isFather(pedSample: PedSample): Boolean = {
val fatherIds = pedSamples.flatMap(_.paternalId)
fatherIds.contains(pedSample.individualId)
}
/**
* Convenience method for checking whether current pedigree constitutes a single patient pedigree
*/
def isSingle: Boolean = {
pedSamples.size == 1 && pedSamples.head.maternalId.isEmpty && pedSamples.head.paternalId.isEmpty
}
/**
* Convenience method for checking whether current pedigree constitutes a trio pedigree
*/
def isTrio: Boolean = {
getIndexSamples.size == 1 &&
pedSamples.size == 3 &&
(getIndexSamples.head.maternalId.isDefined && pedSamples.map(_.individualId).contains(getIndexSamples.head.maternalId.get)) &&
(getIndexSamples.head.paternalId.isDefined && pedSamples.map(_.individualId).contains(getIndexSamples.head.paternalId.get))
}
/**
* Write list of ped samples to a PED file
*
* For the PED format see:
* http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml
*/
def writeToPedFile(outFile: File): Unit = {
val writer = new PrintWriter(outFile)
pedSamples.foreach { p =>
val paternalField = p.paternalId match {
case Some(s) => s
case _ => "0"
}
val maternalField = p.maternalId match {
case Some(s) => s
case _ => "0"
}
val genderField = p.gender match {
case MultiSampleQScript.Gender.Male => "1"
case MultiSampleQScript.Gender.Female => "2"
case _ => "-9"
}
val affectedField = p.affectedPhenotype match {
case Some(b) => b match {
case true => "2"
case _ => "1"
}
case _ => "0"
}
val line: String = s"${p.familyId}\t${p.individualId}\t$paternalField\t$maternalField\t$genderField\t$affectedField\t" +
p.genotypeFields.mkString("\t")
writer.write(line + "\n")
}
writer.close()
}
}
object PedigreeQscript {
object PedMergeStrategy extends Enumeration {
val Concatenate, Update, ConcatenatedAndUpdate = Value
}
}
......@@ -38,6 +38,7 @@
<module>mapping</module>
<module>sage</module>
<module>shiva</module>
<module>tarmac</module>
<module>tinycap</module>
<module>toucan</module>
<module>biopet-core</module>
......
<!--
Biopet is built on top of GATK Queue for building bioinformatic
pipelines. It is mainly intended to support LUMC SHARK cluster which is running
SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
should also be able to execute Biopet tools and pipelines.
Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
Contact us at: sasc@lumc.nl
A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
license; For commercial users or users who do not want to follow the AGPL
license, please contact us to obtain a separate license.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<artifactId>Tarmac</artifactId>
<packaging>jar</packaging>
<parent>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Biopet</artifactId>
<version>0.9.0-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>
<inceptionYear>2017</inceptionYear>
<name>Tarmac</name>
<dependencies>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetCore</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetExtensions</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetToolsExtensions</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.10</artifactId>
<version>2.2.1</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
package nl.lumc.sasc.biopet.pipelines.tarmac
import java.io.File
import nl.lumc.sasc.biopet.core.{ PedigreeQscript, PipelineCommand, Reference }
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript
/**
* Created by Sander Bollen on 23-3-17.
*/
class Tarmac(val root: Configurable) extends QScript with PedigreeQscript with SummaryQScript with Reference {
qscript =>
def this() = this(null)
def init() = {
}
def biopetScript() = {
addSamplesJobs()
addSummaryJobs()
}
def addMultiSampleJobs() = {
}
class Sample(name: String) extends AbstractSample(name) {
val inputCountFile: Option[File] = config("count_file")
val bamFile: Option[File] = config("bam")
/** Function to add sample jobs */
def addJobs(): Unit = {}
/* This is necesary for compile reasons, but library does not in fact exist for this pipeline */
def makeLibrary(id: String) = new Library(id)
class Library(id: String) extends AbstractLibrary(id) {
def addJobs(): Unit = {}
def summaryFiles: Map[String, File] = Map()
def summaryStats: Any = Map()
}
/** Must return files to store into summary */
def summaryFiles: Map[String, File] = Map()
/** Must returns stats to store into summary */
def summaryStats: Any = Map()
}
def makeSample(sampleId: String) = new Sample(sampleId)
def summarySettings: Map[String, Any] = Map()
def summaryFiles: Map[String, File] = Map()
def summaryFile: File = new File(outputDir, "tarmac.summary.json")
}
object Tarmac extends PipelineCommand
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment