Commit 050af98d authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Remove old code

parent 88d47fcf
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import org.broadinstitute.gatk.queue.QScript
import scala.util.Random
class GatkBenchmarkGenotyping(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "Sample gvcf file")
var sampleGvcf: File = _
@Argument(doc = "SampleName", required = true)
var sampleName: String = _
@Input(doc = "Gvcf files", shortName = "I", required = false)
var gvcfFiles: List[File] = Nil
var reference: File = config("reference")
@Argument(doc = "Dbsnp", shortName = "dbsnp", required = false)
var dbsnp: File = config("dbsnp")
def init() {
if (config.contains("gvcffiles")) for (file <- config("gvcffiles").asList)
gvcfFiles ::= file.toString
}
def biopetScript() {
var todoGvcfs = gvcfFiles
var gvcfPool: List[File] = Nil
addGenotypingPipeline(gvcfPool)
while (todoGvcfs.nonEmpty) {
val index = Random.nextInt(todoGvcfs.size)
gvcfPool ::= todoGvcfs(index)
addGenotypingPipeline(gvcfPool)
todoGvcfs = todoGvcfs.filter(b => b != todoGvcfs(index))
}
}
def addGenotypingPipeline(gvcfPool: List[File]) {
val gatkGenotyping = new GatkGenotyping(this)
gatkGenotyping.inputGvcfs = sampleGvcf :: gvcfPool
gatkGenotyping.samples :+= sampleName
gatkGenotyping.outputDir = new File(outputDir, "samples_" + gvcfPool.size)
gatkGenotyping.init()
gatkGenotyping.biopetScript()
addAll(gatkGenotyping.functions)
}
}
object GatkBenchmarkGenotyping extends PipelineCommand
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.extensions.gatk.broad.{ GenotypeGVCFs, SelectVariants }
import org.broadinstitute.gatk.queue.QScript
class GatkGenotyping(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "Gvcf files", shortName = "I")
var inputGvcfs: List[File] = Nil
@Argument(doc = "Reference", shortName = "R", required = false)
var reference: File = config("reference")
@Argument(doc = "Dbsnp", shortName = "dbsnp", required = false)
var dbsnp: File = config("dbsnp")
@Argument(doc = "OutputName", required = false)
var outputName: String = "genotype"
@Output(doc = "OutputFile", shortName = "O", required = false)
var outputFile: File = _
@Argument(doc = "Samples", shortName = "sample", required = false)
var samples: List[String] = Nil
def init() {
require(outputName != null, "Outputname is null")
if (outputFile == null) outputFile = new File(outputDir, outputName + ".vcf.gz")
}
def biopetScript() {
addGenotypeGVCFs(inputGvcfs, outputFile)
if (samples.nonEmpty) {
if (samples.size > 1) addSelectVariants(outputFile, samples, new File(outputDir, "samples/"), "all")
for (sample <- samples) addSelectVariants(outputFile, List(sample), new File(outputDir, "samples/"), sample)
}
}
def addGenotypeGVCFs(gvcfFiles: List[File], outputFile: File): File = {
val genotypeGVCFs = GenotypeGVCFs(this, gvcfFiles, outputFile)
add(genotypeGVCFs)
genotypeGVCFs.out
}
def addSelectVariants(inputFile: File, samples: List[String], outputDir: File, name: String) {
val selectVariants = SelectVariants(this, inputFile, new File(outputDir, name + ".vcf.gz"))
selectVariants.excludeNonVariants = true
for (sample <- samples) selectVariants.sample_name :+= sample
add(selectVariants)
}
}
object GatkGenotyping extends PipelineCommand
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import htsjdk.samtools.SamReaderFactory
import nl.lumc.sasc.biopet.core.{ MultiSampleQScript, PipelineCommand }
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.extensions.gatk.broad.{ CombineGVCFs, CombineVariants }
import nl.lumc.sasc.biopet.extensions.picard.{ AddOrReplaceReadGroups, SamToFastq }
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
import nl.lumc.sasc.biopet.pipelines.bamtobigwig.Bam2Wig
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
import org.broadinstitute.gatk.queue.QScript
import scala.collection.JavaConversions._
class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScript with SummaryQScript {
qscript =>
def this() = this(null)
@Argument(doc = "Skip Genotyping step", shortName = "skipgenotyping", required = false)
var skipGenotyping: Boolean = config("skip_genotyping", default = false)
/** Merge gvcfs */
var mergeGvcfs: Boolean = config("merge_gvcfs", default = false)
/** Joint variantcalling */
var jointVariantcalling: Boolean = config("joint_variantcalling", default = false)
/** Joint genotyping */
var jointGenotyping: Boolean = config("joint_genotyping", default = false)
var singleSampleCalling = config("single_sample_calling", default = true)
var reference: File = config("reference")
var useAllelesOption: Boolean = config("use_alleles_option", default = false)
val externalGvcfs = config("external_gvcfs_files", default = Nil).asFileList
def summaryFile = new File(outputDir, "GatkPipeline.summary.json")
//TODO: Add summary
def summaryFiles = Map()
//TODO: Add summary
def summarySettings = Map()
def makeSample(id: String) = new Sample(id)
class Sample(sampleId: String) extends AbstractSample(sampleId) {
//TODO: Add summary
def summaryFiles: Map[String, File] = Map()
//TODO: Add summary
def summaryStats: Map[String, Any] = Map()
def makeLibrary(id: String) = new Library(id)
class Library(libId: String) extends AbstractLibrary(libId) {
//TODO: Add summary
def summaryFiles: Map[String, File] = Map()
//TODO: Add summary
def summaryStats: Map[String, Any] = Map()
val mapping = new Mapping(qscript)
mapping.sampleId = Some(sampleId)
mapping.libId = Some(libId)
mapping.outputDir = libDir
/** Library variantcalling */
val gatkVariantcalling = new GatkVariantcalling(qscript)
gatkVariantcalling.doublePreProces = false
gatkVariantcalling.sampleID = sampleId
gatkVariantcalling.outputDir = new File(libDir, "variantcalling")
protected def addJobs(): Unit = {
val bamFile: Option[File] = if (config.contains("R1")) {
mapping.input_R1 = config("R1")
mapping.input_R2 = config("R2")
mapping.init()
mapping.biopetScript()
addAll(mapping.functions) // Add functions of mapping to curent function pool
Some(mapping.finalBamFile)
} else if (config.contains("bam")) {
var bamFile: File = config("bam")
if (!bamFile.exists) throw new IllegalStateException("Bam in config does not exist, file: " + bamFile)
if (config("bam_to_fastq", default = false).asBoolean) {
val samToFastq = SamToFastq(qscript, bamFile, libDir + sampleId + "-" + libId + ".R1.fastq",
libDir + sampleId + "-" + libId + ".R2.fastq")
samToFastq.isIntermediate = true
qscript.add(samToFastq)
mapping.input_R1 = samToFastq.fastqR1
mapping.input_R2 = Some(samToFastq.fastqR2)
mapping.init()
mapping.biopetScript()
addAll(mapping.functions) // Add functions of mapping to curent function pool
Some(mapping.finalBamFile)
} else {
var readGroupOke = true
val inputSam = SamReaderFactory.makeDefault.open(bamFile)
val header = inputSam.getFileHeader.getReadGroups
for (readGroup <- inputSam.getFileHeader.getReadGroups) {
if (readGroup.getSample != sampleId) logger.warn("Sample ID readgroup in bam file is not the same")
if (readGroup.getLibrary != libId) logger.warn("Library ID readgroup in bam file is not the same")
if (readGroup.getSample != sampleId || readGroup.getLibrary != libId) readGroupOke = false
}
inputSam.close()
if (!readGroupOke) {
if (config("correct_readgroups", default = false)) {
logger.info("Correcting readgroups, file:" + bamFile)
val aorrg = AddOrReplaceReadGroups(qscript, bamFile, new File(libDir + sampleId + "-" + libId + ".bam"))
aorrg.RGID = sampleId + "-" + libId
aorrg.RGLB = libId
aorrg.RGSM = sampleId
aorrg.isIntermediate = true
qscript.add(aorrg)
bamFile = aorrg.output
} else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile +
"\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatic fix this")
}
addAll(BamMetrics(qscript, bamFile, libDir + "metrics/").functions)
Some(bamFile)
}
} else {
logger.error("Sample: " + sampleId + ": No R1 found for run: " + libId)
None
}
if (bamFile.isDefined) {
gatkVariantcalling.inputBams = List(bamFile.get)
gatkVariantcalling.variantcalling = config("library_variantcalling", default = false)
gatkVariantcalling.init()
gatkVariantcalling.biopetScript()
addAll(gatkVariantcalling.functions)
}
addSummaryQScript(mapping)
}
}
/** sample variantcalling */
val gatkVariantcalling = new GatkVariantcalling(qscript)
gatkVariantcalling.sampleID = sampleId
gatkVariantcalling.outputDir = new File(sampleDir, "variantcalling")
protected def addJobs(): Unit = {
addPerLibJobs()
gatkVariantcalling.inputBams = libraries.map(_._2.mapping.finalBamFile).toList
gatkVariantcalling.preProcesBams = false
if (!singleSampleCalling) {
gatkVariantcalling.useHaplotypecaller = false
gatkVariantcalling.useUnifiedGenotyper = false
}
gatkVariantcalling.init()
gatkVariantcalling.biopetScript()
addAll(gatkVariantcalling.functions)
gatkVariantcalling.inputBams.foreach(x => addAll(Bam2Wig(qscript, x).functions))
}
}
def init() {
}
val multisampleVariantcalling = new GatkVariantcalling(this) {
override def configName = "gatkvariantcalling"
override def configPath: List[String] = super.configPath ::: "multisample" :: Nil
}
def biopetScript(): Unit = {
addSamplesJobs()
addSummaryJobs()
}
def addMultiSampleJobs(): Unit = {
val gvcfFiles: List[File] = if (mergeGvcfs && externalGvcfs.size + samples.size > 1) {
val newFile = new File(outputDir, "merged.gvcf.vcf.gz")
add(CombineGVCFs(this, externalGvcfs ++ samples.map(_._2.gatkVariantcalling.scriptOutput.gvcfFile), newFile))
List(newFile)
} else externalGvcfs ++ samples.map(_._2.gatkVariantcalling.scriptOutput.gvcfFile)
if (!skipGenotyping && gvcfFiles.nonEmpty) {
if (jointGenotyping) {
val gatkGenotyping = new GatkGenotyping(this)
gatkGenotyping.inputGvcfs = gvcfFiles
gatkGenotyping.outputDir = new File(outputDir, "genotyping")
gatkGenotyping.init()
gatkGenotyping.biopetScript()
addAll(gatkGenotyping.functions)
var vcfFile = gatkGenotyping.outputFile
}
} else logger.warn("No gVCFs to genotype")
if (jointVariantcalling) {
val allBamfiles = samples.map(_._2.gatkVariantcalling.scriptOutput.bamFiles).toList.fold(Nil)(_ ++ _)
val allRawVcfFiles = samples.map(_._2.gatkVariantcalling.scriptOutput.rawVcfFile).filter(_ != null).toList
val gatkVariantcalling = new GatkVariantcalling(this) {
override def configName = "gatkvariantcalling"
override def configPath: List[String] = super.configPath ::: "multisample" :: Nil
}
if (gatkVariantcalling.useMpileup) {
val cvRaw = CombineVariants(this, allRawVcfFiles.toList, new File(outputDir, "variantcalling/multisample.raw.vcf.gz"))
add(cvRaw)
gatkVariantcalling.rawVcfInput = cvRaw.out
}
multisampleVariantcalling.preProcesBams = false
multisampleVariantcalling.doublePreProces = false
multisampleVariantcalling.inputBams = allBamfiles.toList
multisampleVariantcalling.outputDir = new File(outputDir, "variantcalling")
multisampleVariantcalling.outputName = "multisample"
multisampleVariantcalling.init()
multisampleVariantcalling.biopetScript()
addAll(multisampleVariantcalling.functions)
if (config("inputtype", default = "dna").asString != "rna" && config("recalibration", default = false).asBoolean) {
val recalibration = new GatkVariantRecalibration(this)
recalibration.inputVcf = multisampleVariantcalling.scriptOutput.finalVcfFile
recalibration.bamFiles = allBamfiles
recalibration.outputDir = new File(outputDir, "recalibration")
recalibration.init()
recalibration.biopetScript()
}
}
}
}
object GatkPipeline extends PipelineCommand
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.extensions.gatk.broad.{ ApplyRecalibration, VariantAnnotator, VariantRecalibrator }
import org.broadinstitute.gatk.queue.QScript
class GatkVariantRecalibration(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "input vcf file", shortName = "I")
var inputVcf: File = _
@Input(doc = "input vcf file", shortName = "BAM", required = false)
var bamFiles: List[File] = Nil
@Output(doc = "output vcf file", shortName = "out")
var outputVcf: File = _
def init() {
require(inputVcf != null, "Missing Output directory on gatk module")
}
def biopetScript() {
var vcfFile: File = if (bamFiles.nonEmpty) addVariantAnnotator(inputVcf, bamFiles, outputDir) else inputVcf
vcfFile = addSnpVariantRecalibrator(vcfFile, outputDir)
vcfFile = addIndelVariantRecalibrator(vcfFile, outputDir)
}
def addSnpVariantRecalibrator(inputVcf: File, dir: File): File = {
val snpRecal = VariantRecalibrator(this, inputVcf, swapExt(dir, inputVcf, ".vcf", ".indel.recal"),
swapExt(dir, inputVcf, ".vcf", ".indel.tranches"), indel = false)
if (snpRecal.resource.nonEmpty) {
add(snpRecal)
val snpApply = ApplyRecalibration(this, inputVcf, swapExt(dir, inputVcf, ".vcf", ".indel.recal.vcf"),
snpRecal.recal_file, snpRecal.tranches_file, indel = false)
add(snpApply)
snpApply.out
} else {
logger.warn("Skipped snp Recalibration, resource is missing")
inputVcf
}
}
def addIndelVariantRecalibrator(inputVcf: File, dir: File): File = {
val indelRecal = VariantRecalibrator(this, inputVcf, swapExt(dir, inputVcf, ".vcf", ".indel.recal"),
swapExt(dir, inputVcf, ".vcf", ".indel.tranches"), indel = true)
if (indelRecal.resource.nonEmpty) {
add(indelRecal)
val indelApply = ApplyRecalibration(this, inputVcf, swapExt(dir, inputVcf, ".vcf", ".indel.recal.vcf"),
indelRecal.recal_file, indelRecal.tranches_file, indel = true)
add(indelApply)
indelApply.out
} else {
logger.warn("Skipped indel Recalibration, resource is missing")
inputVcf
}
}
def addVariantAnnotator(inputvcf: File, bamfiles: List[File], dir: File): File = {
val variantAnnotator = VariantAnnotator(this, inputvcf, bamfiles, swapExt(dir, inputvcf, ".vcf", ".anotated.vcf"))
add(variantAnnotator)
variantAnnotator.out
}
}
object GatkVariantRecalibration extends PipelineCommand
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.extensions.Ln
import nl.lumc.sasc.biopet.extensions.gatk.broad._
import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates
import nl.lumc.sasc.biopet.extensions.tools.{ MergeAlleles, MpileupToVcf, VcfFilter, VcfStats }
import nl.lumc.sasc.biopet.utils.ConfigUtils
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile
import scala.collection.SortedMap
import scala.language.reflectiveCalls
class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
val scriptOutput = new GatkVariantcalling.ScriptOutput
@Input(doc = "Bam files (should be deduped bams)", shortName = "BAM")
var inputBams: List[File] = Nil
@Input(doc = "Raw vcf file", shortName = "raw")
var rawVcfInput: File = _
@Argument(doc = "Reference", shortName = "R", required = false)
var reference: File = config("reference")
@Argument(doc = "OutputName", required = false)
var outputName: String = _
@Argument(doc = "Sample name", required = false)
var sampleID: String = _
var preProcesBams: Boolean = config("pre_proces_bams", default = true)
var variantcalling: Boolean = true
var doublePreProces: Boolean = config("double_pre_proces", default = true)
var useHaplotypecaller: Boolean = config("use_haplotypecaller", default = true)
var useUnifiedGenotyper: Boolean = config("use_unifiedgenotyper", default = false)
var useAllelesOption: Boolean = config("use_alleles_option", default = false)
var useMpileup: Boolean = config("use_mpileup", default = true)
var useIndelRealigner: Boolean = config("use_indel_realign", default = true)
var useBaseRecalibration: Boolean = config("use_base_recalibration", default = true)
def init() {
if (outputName == null && sampleID != null) outputName = sampleID
else if (outputName == null) outputName = config("output_name", default = "noname")
val baseRecalibrator = new BaseRecalibrator(this)
if (preProcesBams && useBaseRecalibration && baseRecalibrator.knownSites.isEmpty) {
logger.warn("No Known site found, skipping base recalibration")
useBaseRecalibration = false
}
}
private def doublePreProces(files: List[File]): List[File] = {
if (files.isEmpty) throw new IllegalStateException("Files can't be empty")
else if (!doublePreProces) files
else if (files.size == 1) {
val bamFile = new File(outputDir, files.head.getName)
if (bamFile != files.head) {
val oldIndex: File = new File(files.head.getAbsolutePath.stripSuffix(".bam") + ".bai")
val newIndex: File = swapExt(outputDir, bamFile, ".bam", ".bai")
val baiLn = Ln(this, oldIndex, newIndex)
add(baiLn)
val bamLn = Ln(this, files.head, bamFile)
bamLn.deps :+= baiLn.output
add(bamLn)
}
List(bamFile)
} else {
val markDup = MarkDuplicates(this, files, new File(outputDir, outputName + ".dedup.bam"))
markDup.isIntermediate = useIndelRealigner
add(markDup)
if (useIndelRealigner) {
List(addIndelRealign(markDup.output, outputDir, isIntermediate = false))
} else {
List(markDup.output)
}
}
}
def biopetScript() {
scriptOutput.bamFiles = {
doublePreProces(if (preProcesBams) {
for (inputBam <- inputBams) yield {
var bamFile = inputBam
if (useIndelRealigner)
bamFile = addIndelRealign(bamFile, outputDir, isIntermediate = useBaseRecalibration)
if (useBaseRecalibration)
bamFile = addBaseRecalibrator(bamFile, outputDir, isIntermediate = inputBams.size > 1)
bamFile
}
} else {
inputBams
})
}
if (variantcalling) {