Commit b031a6e7 authored by Peter van 't Hof's avatar Peter van 't Hof Committed by GitHub

Merge pull request #78 from biopet/fix-BIOPET-368

Adding gender aware variantcalling for gvcf mode
parents 7729de26 6132f59d
......@@ -17,7 +17,6 @@ package nl.lumc.sasc.biopet.utils.intervals
import java.io.{ File, PrintWriter }
import htsjdk.samtools.SAMSequenceDictionary
import htsjdk.samtools.reference.FastaSequenceFile
import scala.collection.JavaConversions._
import scala.collection.mutable
......@@ -112,6 +111,15 @@ case class BedRecordList(val chrRecords: Map[String, List[BedRecord]], val heade
allRecords.foreach(writer.println)
writer.close()
}
/** This return the fraction of the regions comparing to a length */
def fractionOf(length: Long): Double = length.toDouble / length
/** This return the fraction of the regions comparing to a reference */
def fractionOfReference(dict: SAMSequenceDictionary): Double = fractionOf(dict.getReferenceLength)
/** This return the fraction of the regions comparing to a reference */
def fractionOfReference(file: File): Double = fractionOfReference(FastaUtils.getCachedDict(file))
}
object BedRecordList {
......@@ -131,6 +139,32 @@ object BedRecordList {
def fromList(records: TraversableOnce[BedRecord]): BedRecordList = fromListWithHeader(records, Nil)
/**
* This creates a [[BedRecordList]] based on multiple files. This method combines overlapping regions
*
* @param bedFiles Input bed files
* @return
*/
def fromFilesCombine(bedFiles: File*) = {
fromFiles(bedFiles, combine = true)
}
/**
* This creates a [[BedRecordList]] based on multiple files
*
* @param bedFiles Input bed files
* @param combine When true overlaping regions are merged
* @return
*/
def fromFiles(bedFiles: Seq[File], combine: Boolean = false) = {
val list = bedFiles.foldLeft(empty)((a, b) => fromList(fromFile(b).allRecords ++ a.allRecords))
if (combine) list.combineOverlap
else list
}
/** This created a empty [[BedRecordList]] */
def empty = fromList(Nil)
def fromFile(bedFile: File) = {
val reader = Source.fromFile(bedFile)
val all = reader.getLines().toList
......
......@@ -118,6 +118,11 @@ At this moment the following variant callers can be used
| shiva | correct_readgroups | Boolean | false | Attempt to correct read groups | Only used when input is a bam file |
| shiva | amplicon_bed | Path | | Path to target bed file | all |
| shiva | regions_of_interest | Array of paths | | Array of paths to region of interest (e.g. gene panels) bed files | all |
| shivavariantcalling | gender_aware_calling | Boolean | false | Enables gander aware variantcalling | haplotypecaller_gvcf |
| shivavariantcalling | hap̦loid_regions | Bed file | | Haploid regions for all genders | haplotypecaller_gvcf |
| shivavariantcalling | hap̦loid_regions_male | Bed file | | Haploid regions for males | haplotypecaller_gvcf |
| shivavariantcalling | hap̦loid_regions_female | Bed file | | Haploid regions for females | haplotypecaller_gvcf |
| shiva | amplicon_bed | Path | | Path to target bed file | all |
| vcffilter | min_sample_depth | Integer | 8 | Filter variants with at least x coverage | raw |
| vcffilter | min_alternate_depth | Integer | 2 | Filter variants with at least x depth on the alternate allele | raw |
| vcffilter | min_samples_pass | Integer | 1 | Minimum amount of samples which pass custom filter (requires additional flags) | raw |
......@@ -130,6 +135,14 @@ For all the options, please see the corresponding documentation for the mapping
## Advanced usage
### Gender aware variantcalling
In Shiva and ShivaVariantcalling while using haplotypecaller_gvcf it is possible to do gender aware variantcalling. In this mode it required to supply bed files to define haploid regions (see config values).
- For males, `hap̦loid_regions` and `hap̦loid_regions_male` is used.
- For females, `hap̦loid_regions` and `hap̦loid_regions_female` is used.
The pipeline will use a union of those files. At least 1 file is required while using this mode.
### Reporting modes
Shiva furthermore supports three modes. The default and recommended option is `multisample_variantcalling`.
......
......@@ -60,11 +60,13 @@ class Shiva(val parent: Configurable) extends QScript with MultisampleMappingTra
override def namePrefix = "multisample"
override def configNamespace: String = "shivavariantcalling"
override def configPath: List[String] = super.configPath ::: "multisample" :: Nil
genders = samples.map { case (sampleName, sample) => sampleName -> sample.gender }
}
else new ShivaVariantcalling(qscript) {
override def configNamespace = "shivavariantcalling"
sampleId = sample
libId = library
genders = sample.map(x => x -> samples(x).gender).toMap
}
}
......
......@@ -14,7 +14,8 @@
*/
package nl.lumc.sasc.biopet.pipelines.shiva
import nl.lumc.sasc.biopet.core.{ PipelineCommand, Reference, SampleLibraryTag }
import nl.lumc.sasc.biopet.core.{ MultiSampleQScript, PipelineCommand, Reference, SampleLibraryTag }
import MultiSampleQScript.Gender
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.extensions.Tabix
import nl.lumc.sasc.biopet.extensions.gatk.{ CombineVariants, GenotypeConcordance }
......@@ -48,9 +49,22 @@ class ShivaVariantcalling(val parent: Configurable) extends QScript
var inputBqsrFiles: Map[String, File] = Map()
var genders: Map[String, Gender.Value] = _
/** Executed before script */
def init(): Unit = {
if (inputBamsArg.nonEmpty) inputBams = BamUtils.sampleBamMap(inputBamsArg)
if (Option(genders).isEmpty) genders = {
val samples: Map[String, Any] = config("genders", default = Map())
samples.map {
case (sampleName, gender) =>
sampleName -> (gender.toString.toLowerCase match {
case "male" => Gender.Male
case "female" => Gender.Female
case _ => Gender.Unknown
})
}
}
}
var referenceVcf: Option[File] = config("reference_vcf")
......@@ -100,6 +114,7 @@ class ShivaVariantcalling(val parent: Configurable) extends QScript
caller.inputBqsrFiles = inputBqsrFiles
caller.namePrefix = namePrefix
caller.outputDir = new File(outputDir, caller.name)
caller.genders = genders
add(caller)
addStats(caller.outputFile, caller.name)
val normalize: Boolean = config("execute_vt_normalize", default = false, namespace = caller.configNamespace)
......
......@@ -19,8 +19,12 @@
*/
package nl.lumc.sasc.biopet.pipelines.shiva.variantcallers
import nl.lumc.sasc.biopet.core.MultiSampleQScript.Gender
import nl.lumc.sasc.biopet.extensions.gatk
import nl.lumc.sasc.biopet.extensions.gatk.CombineGVCFs
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
/** Gvcf mode for haplotypecaller */
class HaplotypeCallerGvcf(val parent: Configurable) extends Variantcaller {
......@@ -34,14 +38,76 @@ class HaplotypeCallerGvcf(val parent: Configurable) extends Variantcaller {
def getGvcfs = gVcfFiles
val genderAwareCalling: Boolean = config("gender_aware_calling", default = false)
val haploidRegions: Option[File] = config("hap̦loid_regions")
val haploidRegionsMale: Option[File] = config("haploid_regions_male")
val haploidRegionsFemale: Option[File] = config("haploid_regions_female")
lazy val fractionMale: Double = BedRecordList.fromFiles(Seq(haploidRegions, haploidRegionsMale).flatten, true).fractionOfReference(referenceDict)
lazy val fractionFemale: Double = BedRecordList.fromFiles(Seq(haploidRegions, haploidRegionsFemale).flatten, true).fractionOfReference(referenceDict)
lazy val fractionUnknown: Double = BedRecordList.fromFiles(Seq(haploidRegions).flatten, true).fractionOfReference(referenceDict)
override def fixedValues = Map("haplotypecaller" -> Map("emitRefConfidence" -> "GVCF"))
override def defaults = Map("haplotypecaller" -> Map(
"variant_index_type" -> "LINEAR",
"variant_index_parameter" -> 128000)
)
override def init(): Unit = {
super.init()
if (genderAwareCalling && haploidRegions.isEmpty && haploidRegionsMale.isEmpty && haploidRegionsFemale.isEmpty)
Logging.addError("Gender aware variantcalling is enabled but no haploid bed files are given")
}
def biopetScript() {
gVcfFiles = for ((sample, inputBam) <- inputBams) yield {
val hc = gatk.HaplotypeCaller(this, List(inputBam), new File(outputDir, sample + ".gvcf.vcf.gz"))
hc.BQSR = inputBqsrFiles.get(sample)
add(hc)
sample -> hc.out
if (genderAwareCalling) {
val finalFile = new File(outputDir, sample + ".gvcf.vcf.gz")
val gender = genders.getOrElse(sample, Gender.Unknown)
val haploidBedFiles: List[File] = gender match {
case Gender.Female => haploidRegions.toList ::: haploidRegionsFemale.toList ::: Nil
case Gender.Male => haploidRegions.toList ::: haploidRegionsMale.toList ::: Nil
case _ => haploidRegions.toList
}
val fraction: Double = gender match {
case Gender.Female => fractionFemale
case Gender.Male => fractionMale
case _ => fractionUnknown
}
val haploidGvcf = if (haploidBedFiles.nonEmpty) {
val hc = gatk.HaplotypeCaller(this, List(inputBam), new File(outputDir, sample + ".haploid.gvcf.vcf.gz"))
hc.BQSR = inputBqsrFiles.get(sample)
hc.intervals = haploidBedFiles
hc.scatterCount = (hc.scatterCount * fraction).toInt
hc.sample_ploidy = Some(1)
add(hc)
Some(hc.out)
} else None
val hcDiploid = gatk.HaplotypeCaller(this, List(inputBam), new File(outputDir, sample + ".diploid.gvcf.vcf.gz"))
hcDiploid.BQSR = inputBqsrFiles.get(sample)
hcDiploid.excludeIntervals = haploidBedFiles
hcDiploid.scatterCount = (hcDiploid.scatterCount * (1 - fraction)).toInt
add(hcDiploid)
haploidGvcf match {
case Some(file) =>
val combine = new CombineGVCFs(this)
combine.variant = Seq(hcDiploid.out, file)
combine.out = new File(outputDir, sample + ".gvcf.vcf.gz")
add(combine)
sample -> combine.out
case _ => sample -> hcDiploid.out
}
} else {
val hc = gatk.HaplotypeCaller(this, List(inputBam), new File(outputDir, sample + ".gvcf.vcf.gz"))
hc.BQSR = inputBqsrFiles.get(sample)
add(hc)
sample -> hc.out
}
}
val genotypeGVCFs = gatk.GenotypeGVCFs(this, gVcfFiles.values.toList, outputFile)
......
......@@ -14,6 +14,7 @@
*/
package nl.lumc.sasc.biopet.pipelines.shiva.variantcallers
import nl.lumc.sasc.biopet.core.MultiSampleQScript.Gender
import nl.lumc.sasc.biopet.core.{ BiopetQScript, Reference }
import org.broadinstitute.gatk.queue.QScript
......@@ -27,6 +28,8 @@ trait Variantcaller extends QScript with BiopetQScript with Reference {
var namePrefix: String = _
var genders: Map[String, Gender.Value] = _
val mergeVcfResults: Boolean = config("merge_vcf_results", default = true)
/**
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment