Commit 1ddd45ad authored by Peter van 't Hof

Added Sage pipeline

parent 6b69a6d5
{
  "bedtools": {"exe": "test"},
  "samtools": {"exe": "test"},
  "reference": "/blabla/blabla.fa",
  "fastqc": {"exe": "/home/pjvan_thof/Downloads/FastQC/fastqc"},
  "bwa": {"exe": "test"},
  "flexiprep": {
    "fastqc": {"exe": "/home/pjvan_thof/Downloads/FastQC/fastqc"},
    "seqtk": {"exe": "/data/DIV5/SASC/common/programs/seqtk/seqtk/seqtk"},
    "cutadapt": {"exe": "/home/pjvan_thof/.local/bin/cutadapt"},
    "sickle": {"exe": "/data/DIV5/SASC/pjvan_thof/bin/sickle"}
  }
}
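The Sage pipeline added in this commit reads two of its own settings through the same config mechanism: sage_tag, the tag prefix passed to PrefixFastq (default "CATG"), and library_counts, which toggles per-library bedtools counts (default false). A minimal sketch of what that could look like in a settings file like the one above — treating the nesting under a "sage" key as an assumption, not something this commit shows:

{
  "sage": {
    "sage_tag": "CATG",
    "library_counts": false
  }
}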
@@ -91,6 +91,8 @@
<filter>
<artifact>org.broadinstitute.gatk:gatk-queue-package-distribution</artifact>
<includes>
+<include>com/sun/**</include>
+<include>org/ggf/**</include>
<include>picard/**</include>
<include>htsjdk/**</include>
<include>javassist/**</include>
......
@@ -9,6 +9,7 @@ import nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantcalling
import nl.lumc.sasc.biopet.pipelines.gatk.GatkVcfSampleCompare
import nl.lumc.sasc.biopet.pipelines.gentrap.Gentrap
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
+import nl.lumc.sasc.biopet.pipelines.sage.Sage
object BiopetExecutable {
@@ -21,7 +22,8 @@ object BiopetExecutable {
"gatk-genotyping" -> GatkGenotyping,
"gatk-variantcalling" -> GatkVariantcalling,
"gatk-pipeline" -> GatkPipeline,
"gatk-vcf-sample-compare" -> GatkVcfSampleCompare
"gatk-vcf-sample-compare" -> GatkVcfSampleCompare,
"sage" -> Sage
)
/**
......
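These two hunks register the new pipeline under the key sage, next to the existing GATK pipelines, so it can be selected by name when launching the Biopet executable — for example something along the lines of java -jar <biopet.jar> pipeline sage -config <settings.json> -run, where the subcommand and flag spelling are assumptions rather than something this diff shows.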
@@ -73,9 +73,9 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
def biopetScript() {
runInitialJobs()
-if (paired) runTrimClip(outputFiles("fastq_input_R1"), outputFiles("fastq_input_R2"), outputDir)
-else runTrimClip(outputFiles("fastq_input_R1"), outputDir)
+val out = if (paired) runTrimClip(outputFiles("fastq_input_R1"), outputFiles("fastq_input_R2"), outputDir)
+else runTrimClip(outputFiles("fastq_input_R1"), outputDir)
runFinalize(List(outputFiles("output_R1")), if (outputFiles.contains("output_R2")) List(outputFiles("output_R2")) else List())
}
@@ -115,13 +115,13 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
return fastqcToContams.out
}
-def runTrimClip(R1_in: File, outDir: String, chunk: String) {
+def runTrimClip(R1_in: File, outDir: String, chunk: String): (File, File, List[File]) = {
runTrimClip(R1_in, new File(""), outDir, chunk)
}
-def runTrimClip(R1_in: File, outDir: String) {
+def runTrimClip(R1_in: File, outDir: String): (File, File, List[File]) = {
runTrimClip(R1_in, new File(""), outDir, "")
}
-def runTrimClip(R1_in: File, R2_in: File, outDir: String) {
+def runTrimClip(R1_in: File, R2_in: File, outDir: String): (File, File, List[File]) = {
runTrimClip(R1_in, R2_in, outDir, "")
}
def runTrimClip(R1_in: File, R2_in: File, outDir: String, chunkarg: String): (File, File, List[File]) = {
@@ -227,8 +227,8 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
add(Gzip(this, fastq_R1, R1))
if (paired) add(Gzip(this, fastq_R2, R2))
-outputFiles += ("output_R1" -> R1)
-if (paired) outputFiles += ("output_R2" -> R2)
+outputFiles += ("output_R1_gzip" -> R1)
+if (paired) outputFiles += ("output_R2_gzip" -> R2)
if (!skipTrim || !skipClip) {
val md5sum_R1 = Md5sum(this, R1, outputDir)
@@ -239,15 +239,13 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
add(md5sum_R2)
summary.addMd5sum(md5sum_R2, R2 = true, after = true)
}
-fastqc_R1_after = Fastqc(this, outputFiles("output_R1"), outputDir + "/" + R1_name + ".qc.fastqc/")
+fastqc_R1_after = Fastqc(this, R1, outputDir + "/" + R1_name + ".qc.fastqc/")
add(fastqc_R1_after)
summary.addFastqc(fastqc_R1_after, after = true)
outputFiles += ("fastqc_R1_final" -> fastqc_R1_after.output)
if (paired) {
-fastqc_R2_after = Fastqc(this, outputFiles("output_R2"), outputDir + "/" + R2_name + ".qc.fastqc/")
+fastqc_R2_after = Fastqc(this, R2, outputDir + "/" + R2_name + ".qc.fastqc/")
add(fastqc_R2_after)
summary.addFastqc(fastqc_R2_after, R2 = true, after = true)
outputFiles += ("fastqc_R2_final" -> fastqc_R2_after.output)
}
}
......
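The Flexiprep hunks above are all one change: runTrimClip now returns its results as a (File, File, List[File]) tuple instead of only registering them in outputFiles, which is what lets biopetScript capture the call in val out. A minimal sketch of the call pattern this enables — the variable names are hypothetical, and reading the tuple as the R1 output, the R2 output, and the remaining output files is inferred from the signature rather than stated in the commit:

// Hypothetical caller: capture the trim/clip outputs directly from the call.
val (outputR1, outputR2, extraFiles) = runTrimClip(inputR1, inputR2, outputDir)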
package nl.lumc.sasc.biopet.pipelines.sage
import nl.lumc.sasc.biopet.core.{ MultiSampleQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.Cat
import nl.lumc.sasc.biopet.extensions.bedtools.BedtoolsCoverage
import nl.lumc.sasc.biopet.extensions.picard.MergeSamFiles
import nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
import nl.lumc.sasc.biopet.scripts.PrefixFastq
import nl.lumc.sasc.biopet.scripts.SquishBed
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.function._
class Sage(val root: Configurable) extends QScript with MultiSampleQScript {
def this() = this(null)
@Input(doc = "countBed", required = false)
var countBed: File = _
@Input(doc = "squishedCountBed, by supplying this file the auto squish job will be skipped", required = false)
var squishedCountBed: File = _
def init() {
for (file <- configfiles) globalConfig.loadConfigFile(file)
if (countBed == null && squishedCountBed == null) throw new IllegalStateException("No bedfile supplied, please add a countBed or squishedCountBed")
}
def biopetScript() {
if (squishedCountBed == null) {
val squishBed = SquishBed.apply(this, countBed, outputDir)
add(squishBed)
squishedCountBed = squishBed.output
}
// Tag library creation
runSamplesJobs
}
// Called for each sample
def runSingleSampleJobs(sampleConfig: Map[String, Any]): Map[String, List[File]] = {
var outputFiles: Map[String, List[File]] = Map()
var libraryBamfiles: List[File] = List()
var libraryFastqFiles: List[File] = List()
val sampleID: String = sampleConfig("ID").toString
val sampleDir: String = outputDir + sampleID + "/"
for ((library, libraryFiles) <- runLibraryJobs(sampleConfig)) {
libraryFastqFiles +:= libraryFiles("prefix_fastq")
libraryBamfiles +:= libraryFiles("FinalBam")
}
val bamFile: File = if (libraryBamfiles.size == 1) libraryBamfiles.head
else if (libraryBamfiles.size > 1) {
val mergeSamFiles = MergeSamFiles(this, libraryBamfiles, sampleDir)
add(mergeSamFiles)
mergeSamFiles.output
} else null
val fastqFile: File = if (libraryFastqFiles.size == 1) libraryFastqFiles.head
else if (libraryFastqFiles.size > 1) {
val cat = Cat.apply(this, libraryFastqFiles, sampleDir + sampleID + ".fastq")
add(cat)
cat.output
} else null
this.addBedtoolsCounts(bamFile, sampleID, sampleDir)
return outputFiles
}
// Called for each run from a sample
def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): Map[String, File] = {
var outputFiles: Map[String, File] = Map()
val runID: String = runConfig("ID").toString
val sampleID: String = sampleConfig("ID").toString
val runDir: String = outputDir + sampleID + "/run_" + runID + "/"
if (runConfig.contains("R1")) {
val flexiprep = new Flexiprep(this)
flexiprep.outputDir = runDir + "flexiprep/"
flexiprep.input_R1 = new File(runConfig("R1").toString)
flexiprep.skipClip = true
flexiprep.skipTrim = true
flexiprep.sampleName = sampleID
flexiprep.libraryName = runID
flexiprep.init
flexiprep.biopetScript
addAll(flexiprep.functions)
val flexiprepOutput = for ((key,file) <- flexiprep.outputFiles if key.endsWith("output_R1")) yield file
val prefixFastq = PrefixFastq.apply(this, flexiprepOutput.head, runDir)
prefixFastq.prefix = config("sage_tag", default = "CATG")
add(prefixFastq)
outputFiles += ("prefix_fastq" -> prefixFastq.output)
val mapping = new Mapping(this)
mapping.skipFlexiprep = true
mapping.input_R1 = prefixFastq.output
mapping.RGLB = runConfig("ID").toString
mapping.RGSM = sampleConfig("ID").toString
if (runConfig.contains("PL")) mapping.RGPL = runConfig("PL").toString
if (runConfig.contains("PU")) mapping.RGPU = runConfig("PU").toString
if (runConfig.contains("CN")) mapping.RGCN = runConfig("CN").toString
mapping.outputDir = runDir
mapping.init
mapping.biopetScript
addAll(mapping.functions)
if (config("library_counts", default = false).getBoolean)
this.addBedtoolsCounts(mapping.outputFiles("finalBamFile"), sampleID + "-" + runID, runDir)
outputFiles += ("FinalBam" -> mapping.outputFiles("finalBamFile"))
} else this.logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig)
return outputFiles
}
def addBedtoolsCounts(bamFile: File, outputPrefix: String, outputDir: String) {
add(BedtoolsCoverage(this, bamFile, squishedCountBed, outputDir + outputPrefix + ".sense.count",
depth = false, sameStrand = true, diffStrand = false))
add(BedtoolsCoverage(this, bamFile, squishedCountBed, outputDir + outputPrefix + ".antisense.count",
depth = false, sameStrand = false, diffStrand = true))
add(BedtoolsCoverage(this, bamFile, squishedCountBed, outputDir + outputPrefix + ".count",
depth = false, sameStrand = false, diffStrand = false))
}
}
object Sage extends PipelineCommand {
override val pipeline = "/nl/lumc/sasc/biopet/pipelines/sage/Sage.class"
}
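For orientation, runSingleLibraryJobs above reads per-run keys R1 (required), plus the optional PL, PU and CN read-group fields. A sketch of a matching sample section in the JSON settings file, assuming the samples-and-runs nesting conventionally walked by MultiSampleQScript — the nesting keys themselves and the paths are not shown in this commit:

{
  "samples": {
    "sampleA": {
      "runs": {
        "1": {"R1": "/path/to/sampleA_run1_R1.fastq.gz", "PL": "illumina"}
      }
    }
  }
}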
package nl.lumc.sasc.biopet.scripts
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import java.io.File
class SquishBed(val root: Configurable) extends PythonCommandLineFunction {
setPythonScript("bed_squish.py")
@Input(doc = "Input file")
var input: File = _
@Output(doc = "Output file")
var output: File = _
def cmdLine = getPythonCommand +
required(input) +
required(output)
}
object SquishBed {
def apply(root: Configurable, input: File, outputDir: String): SquishBed = {
val squishBed = new SquishBed(root)
squishBed.input = input
squishBed.output = new File(outputDir, input.getName.stripSuffix(".bed") + ".squish.bed")
return squishBed
}
}
package nl.lumc.sasc.biopet.scripts
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument }
import java.io.File
class PrefixFastq(val root: Configurable) extends PythonCommandLineFunction {
setPythonScript("prefixFastq.py")
@Input(doc = "Input file")
var input: File = _
@Output(doc = "Output file")
var output: File = _
@Argument(doc = "Prefix sequence")
var prefix: String = "CATG"
@Argument(doc = "Input file is gziped", required = false)
var gzip: Boolean = _
override def beforeCmd {
if (input.getName.endsWith(".gzip") || input.getName.endsWith("gz")) gzip = true
}
def cmdLine = getPythonCommand +
required("-o", output) +
required("--prefix", prefix) +
required(input)
}
object PrefixFastq {
def apply(root: Configurable, input: File, outputDir: String): PrefixFastq = {
val prefixFastq = new PrefixFastq(root)
prefixFastq.input = input
prefixFastq.output = new File(outputDir, input.getName + ".prefix.fastq")
return prefixFastq
}
}
#!/usr/bin/env python2
"""
Overlapping regions removal in a single BED file.
The script will adjust feature coordinates so that no overlaps are present. If
a feature is enveloped entirely by another feature, the smaller feature will be
removed and the enveloping feature split into two.
Input BED files must be position-sorted and only contain the first six fields.
Strands are taken into account when removing regions.
Requirements:
* Python == 2.7.x
* track >= 1.1.0 <http://xapple.github.io/track/>
Copyright (c) 2013 Wibowo Arindrarto <w.arindrarto@lumc.nl>
Copyright (c) 2013 LUMC Sequencing Analysis Support Core <sasc@lumc.nl>
MIT License <http://opensource.org/licenses/MIT>
"""
import argparse
import os
import track
class BEDCoord(object):

    """Class representing a BED feature coordinate."""

    __slots__ = ('feature', 'kind', 'point', 'start', 'strand')

    def __init__(self, feature, kind, point, start, strand):
        """
        :param feature: name of BED feature
        :type feature: str
        :param kind: type of coordinate, 'start' or 'end'
        :type kind: str
        :param point: coordinate point
        :type point: int
        :param start: start coordinate of the feature this coordinate
            belongs to
        :type start: int
        :param strand: strand of the feature, 1 or -1
        :type strand: int
        """
        self.feature = feature
        assert kind in ('start', 'end')
        self.kind = kind
        self.point = point
        self.start = start
        self.strand = strand

    def __repr__(self):
        return '{0}{1}'.format(self.point, self.kind[0].upper())

    def __gt__(self, other):
        if self.point == other.point:
            return self.start > other.start
        return self.point > other.point

    def __lt__(self, other):
        if self.point == other.point:
            return self.start < other.start
        return self.point < other.point

    def __ge__(self, other):
        return self.point >= other.point

    def __le__(self, other):
        return self.point <= other.point
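# Ordering example (hypothetical values): coordinates compare by point first
# and fall back to the owning feature's start on ties, so
# BEDCoord('A', 'end', 30, 10, 1) < BEDCoord('B', 'end', 30, 20, 1) is True.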
def squish_track_records(chrom_recs):
    """Given an iterator for `track` records, yield squished `track` features.

    :param chrom_recs: iterator returning `track` records for one chromosome
    :type chrom_recs: iterator
    :returns: (generator) single `track` records
    :rtype: `track.pyrow.SuperRow`
    """
    # Algorithm:
    # 1. Flatten all coordinate points into a single list
    # 2. Sort by point, resolve same point by comparing feature starts
    #    (already defined in BEDCoord's `__lt__` and `__gt__`)
    # 3. Walk through the sorted points while keeping track of overlaps using
    #    a 'level' counter for each strand
    # 4. Start coordinates increase level counters, end coordinates decrease
    #    them
    # 5. Start coordinates of the squished features are:
    #    * start coordinates in the array when level == 1
    #    * end coordinates in the array when level == 1
    # 6. End coordinates of the squished features are:
    #    * start coordinates in the array when level == 2
    #    * end coordinates in the array when level == 0
    # 7. As additional checks, make sure that:
    #    * when yielding a record, its start coordinate <= its end coordinate
    #    * the level counter never falls below 0 (this doesn't make sense)
    #    * after all iterations are finished, the level counter == 0
    # Assumes:
    # 1. Input BED file is position-sorted
    # 2. Coordinate points all denote closed intervals (this is handled by
    #    `track` for BED files already)
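    # Worked example (hypothetical coordinates): two overlapping plus-strand
    # features (10, 50) and (30, 70) flatten and sort to 10S, 30S, 50E, 70E.
    # 10S raises the level to 1 (squished start = 10), 30S raises it to 2 and
    # yields (10, 30), 50E drops it back to 1 (squished start = 50), and 70E
    # drops it to 0, yielding (50, 70); the overlapping stretch (30, 50) is
    # dropped.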
    flat_coords = []
    for rec in chrom_recs:
        flat_coords.append(BEDCoord(rec[2], 'start', rec[0], rec[0], rec[4]))
        flat_coords.append(BEDCoord(rec[2], 'end', rec[1], rec[0], rec[4]))
    flat_coords.sort()

    plus_level, minus_level = 0, 0
    plus_row = [0, 0, "", 0, 1]
    minus_row = [0, 0, "", 0, -1]
    for coord in flat_coords:
        if coord.strand == 1:
            if coord.kind == 'start':
                plus_level += 1
                if plus_level == 1:
                    plus_row[0] = coord.point
                    plus_row[2] = coord.feature
                elif plus_level == 2:
                    plus_row[1] = coord.point
                    # track uses closed coordinates already
                    assert plus_row[0] <= plus_row[1]
                    yield plus_row
            else:
                plus_level -= 1
                if plus_level == 0:
                    plus_row[1] = coord.point
                    plus_row[2] = coord.feature
                    assert plus_row[0] <= plus_row[1]
                    yield plus_row
                elif plus_level == 1:
                    plus_row[0] = coord.point
            assert plus_level >= 0, 'Unexpected feature level: {0}'.format(
                plus_level)
        elif coord.strand == -1:
            if coord.kind == 'start':
                minus_level += 1
                if minus_level == 1:
                    minus_row[0] = coord.point
                    minus_row[2] = coord.feature
                elif minus_level == 2:
                    minus_row[1] = coord.point
                    assert minus_row[0] <= minus_row[1]
                    yield minus_row
            else:
                minus_level -= 1
                if minus_level == 0:
                    minus_row[1] = coord.point
                    minus_row[2] = coord.feature
                    assert minus_row[0] <= minus_row[1]
                    yield minus_row
                elif minus_level == 1:
                    minus_row[0] = coord.point
            assert minus_level >= 0, 'Unexpected feature level: {0}'.format(
                minus_level)

    assert plus_level == 0, 'Unexpected end plus feature level: ' \
        '{0}'.format(plus_level)
    assert minus_level == 0, 'Unexpected end minus feature level: ' \
        '{0}'.format(minus_level)
def squish_bed(in_file, out_file):
    """Removes all overlapping regions in the input BED file, writing to the
    output BED file.

    :param in_file: path to input BED file
    :type in_file: str
    :param out_file: path to output BED file
    :type out_file: str
    """
    # check for input file presence, remove output file if it already exists
    assert os.path.exists(in_file), 'Required input file {0} does not ' \
        'exist'.format(in_file)
    if os.path.exists(out_file):
        os.unlink(out_file)

    with track.load(in_file, readonly=True) as in_track, \
            track.new(out_file, format='bed') as out_track:
        for chrom in in_track.chromosomes:
            chrom_rec = in_track.read(chrom)
            out_track.write(chrom, squish_track_records(chrom_rec))
if __name__ == '__main__':

    usage = __doc__.split('\n\n\n')

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=usage[0], epilog=usage[1])
    parser.add_argument('input', type=str, help='Path to input BED file')
    parser.add_argument('output', type=str, help='Path to output BED file')

    args = parser.parse_args()

    squish_bed(args.input, args.output)
#!/usr/bin/env python
# prep_deepsage.py
#
# Script for preprocessing deepSAGE sequencing samples.
#
# Adapted from: http://galaxy.nbic.nl/u/pacthoen/w/deepsagelumc
import argparse
from Bio import SeqIO
PREFIX = 'CATG'
def prep_deepsage(input_fastq, output_fastq, is_gzip, seq):
    if is_gzip:
        import gzip
        opener = gzip.GzipFile
    else:
        opener = open
    with opener(input_fastq, 'r') as in_handle, open(output_fastq, 'w') as \
            out_handle:
        # adapted from fastools.fastools.add
        # not importing since we also need to cut away parts of the sequence
        for rec in SeqIO.parse(in_handle, 'fastq'):
            qual = rec.letter_annotations['phred_quality']
            rec.letter_annotations = {}
            rec.seq = seq + rec.seq
            rec.letter_annotations = {'phred_quality': [40] * len(seq) +
                                      qual}
            SeqIO.write(rec, out_handle, "fastq")
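# For example (hypothetical record, assuming seq='CATG'): a read with sequence
# 'ACGTACGT' and qualities [30] * 8 comes out as 'CATGACGTACGT' with qualities
# [40] * 4 + [30] * 8, i.e. the prefixed bases get a fixed Phred score of 40.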
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input_fastq', type=str,
                        help="Path to input FASTQ file")
    parser.add_argument('-o', '--output', type=str,
                        dest='output_fastq',
                        default="prep_deepsage_output.fq",
                        help="Path to output FASTQ file")
    parser.add_argument('--gzip', dest='gzip',
                        action='store_true',
                        help="Whether input FASTQ file is gzipped or not.")
    parser.add_argument('--prefix', dest='prefix',
                        type=str,
                        default=PREFIX,
                        help="Prefix sequence to prepend to each read.")
    args = parser.parse_args()
    prep_deepsage(args.input_fastq, args.output_fastq, args.gzip, args.prefix)