Commit 259e3372 authored by bow's avatar bow
Browse files

Merge branch 'feature-prefix_fastq' into 'develop'

Feature prefix fastq

Fix for #48

See merge request !68
parents 6e3aabd0 dfc55377
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
# prep_deepsage.py
#
# Script for preprocessing deepSAGE sequencing samples.
#
# Adapted from: http://galaxy.nbic.nl/u/pacthoen/w/deepsagelumc
import argparse
from Bio import SeqIO
PREFIX = 'CATG'
def prep_deepsage(input_fastq, output_fastq, is_gzip, seq):
if is_gzip:
import gzip
opener = gzip.GzipFile
else:
opener = open
with opener(input_fastq, 'r') as in_handle, open(output_fastq, 'w') as \
out_handle:
# adapted from fastools.fastools.add
# not importing since we also need to cut away parts of the sequence
for rec in SeqIO.parse(in_handle, 'fastq'):
qual = rec.letter_annotations['phred_quality']
rec.letter_annotations = {}
rec.seq = seq + rec.seq
rec.letter_annotations = {'phred_quality': [40] * len(seq) +
qual}
SeqIO.write(rec, out_handle, "fastq")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('input_fastq', type=str,
help="Path to input FASTQ file")
parser.add_argument('-o', '--output', type=str,
dest='output_fastq',
default="prep_deepsage_output.fq",
help="Path to output FASTQ file")
parser.add_argument('--gzip', dest='gzip',
action='store_true',
help="Whether input FASTQ file is gzipped or not.")
parser.add_argument('--prefix', dest='prefix', type=str,
default=PREFIX,
help="Whether input FASTQ file is gzipped or not.")
args = parser.parse_args()
prep_deepsage(args.input_fastq, args.output_fastq, args.gzip, args.prefix)
......@@ -30,6 +30,10 @@ trait BiopetJavaCommandLineFunction extends JavaCommandLineFunction with BiopetC
memoryLimit = config("memory_limit")
}
/**
* Creates command to execute extension
* @return
*/
override def commandLine: String = {
preCmdInternal
val cmd = super.commandLine
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.scripts
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument }
import java.io.File
class PrefixFastq(val root: Configurable) extends PythonCommandLineFunction {
setPythonScript("prefixFastq.py")
@Input(doc = "Input file")
var input: File = _
@Output(doc = "output File")
var output: File = _
@Argument(doc = "Prefix sequence")
var prefix: String = "CATG"
@Argument(doc = "Input file is gziped", required = false)
var gzip: Boolean = _
override def beforeCmd {
if (input.getName.endsWith(".gzip") || input.getName.endsWith("gz")) gzip = true
}
def cmdLine = getPythonCommand +
required("-o", output) +
required("--prefix", prefix) +
required(input)
}
object PrefixFastq {
def apply(root: Configurable, input: File, outputDir: String): PrefixFastq = {
val prefixFastq = new PrefixFastq(root)
prefixFastq.input = input
prefixFastq.output = new File(outputDir, input.getName + ".prefix.fastq")
return prefixFastq
}
}
package nl.lumc.sasc.biopet.tools
import java.io.File
import nl.lumc.sasc.biopet.core.{ BiopetJavaCommandLineFunction, ToolCommand }
import htsjdk.samtools.fastq.{ FastqRecord, AsyncFastqWriter, FastqReader, BasicFastqWriter }
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Argument, Output, Input }
import scala.collection.JavaConversions._
/**
* Queue class for PrefixFastq tool
*
* Created by pjvan_thof on 1/13/15.
*/
class PrefixFastq(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Input(doc = "Input fastq", shortName = "I", required = true)
var inputFastq: File = _
@Output(doc = "Output fastq", shortName = "o", required = true)
var outputFastq: File = _
@Argument(doc = "Prefix seq", required = true)
var prefixSeq: String = _
/**
* Creates command to execute extension
* @return
*/
override def commandLine = super.commandLine +
required("-i", inputFastq) +
required("-o", outputFastq) +
optional("-s", prefixSeq)
}
object PrefixFastq extends ToolCommand {
/**
* Create a PrefixFastq class object with a sufix ".prefix.fastq" in the output folder
*
* @param root parent object
* @param input input file
* @param outputDir outputFolder
* @return PrefixFastq class object
*/
def apply(root: Configurable, input: File, outputDir: String): PrefixFastq = {
val prefixFastq = new PrefixFastq(root)
prefixFastq.inputFastq = input
prefixFastq.outputFastq = new File(outputDir, input.getName + ".prefix.fastq")
return prefixFastq
}
/**
* Args for commandline program
* @param input input fastq file (can be zipper)
* @param output output fastq file (can be zipper)
* @param seq Seq to prefix the reads with
*/
case class Args(input: File = null, output: File = null, seq: String = null) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('i', "input") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
c.copy(input = x)
}
opt[File]('o', "output") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
c.copy(output = x)
}
opt[String]('s', "seq") required () maxOccurs (1) valueName ("<prefix seq>") action { (x, c) =>
c.copy(seq = x)
}
}
/**
* Program will prefix reads with a given seq
*
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
logger.info("Start")
val argsParser = new OptParser
val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val writer = new AsyncFastqWriter(new BasicFastqWriter(cmdArgs.output), 3000)
val reader = new FastqReader(cmdArgs.input)
var counter = 0
while (reader.hasNext) {
val read = reader.next()
val maxQuality = read.getBaseQualityString.max
val readHeader = read.getReadHeader
val readSeq = cmdArgs.seq + read.getReadString
val baseQualityHeader = read.getBaseQualityHeader
val baseQuality = Array.fill(cmdArgs.seq.size)(maxQuality).mkString + read.getBaseQualityString
writer.write(new FastqRecord(readHeader, readSeq, baseQualityHeader, baseQuality))
counter += 1
if (counter % 1e6 == 0) logger.info(counter + " reads processed")
}
if (counter % 1e6 != 0) logger.info(counter + " reads processed")
writer.close()
reader.close()
logger.info("Done")
}
}
......@@ -36,23 +36,15 @@ class VcfFilter(val root: Configurable) extends BiopetJavaCommandLineFunction {
@Output(doc = "Output vcf", shortName = "o", required = false)
var outputVcf: File = _
var minSampleDepth: Option[Int] = _
var minTotalDepth: Option[Int] = _
var minAlternateDepth: Option[Int] = _
var minSamplesPass: Option[Int] = _
var filterRefCalls: Boolean = _
var minSampleDepth: Option[Int] = config("min_sample_depth")
var minTotalDepth: Option[Int] = config("min_total_depth")
var minAlternateDepth: Option[Int] = config("min_alternate_depth")
var minSamplesPass: Option[Int] = config("min_samples_pass")
var filterRefCalls: Boolean = config("filter_ref_calls")
override val defaultVmem = "8G"
memoryLimit = Option(4.0)
override def afterGraph {
minSampleDepth = config("min_sample_depth")
minTotalDepth = config("min_total_depth")
minAlternateDepth = config("min_alternate_depth")
minSamplesPass = config("min_samples_pass")
filterRefCalls = config("filter_ref_calls")
}
override def commandLine = super.commandLine +
required("-I", inputVcf) +
required("-o", outputVcf) +
......
......@@ -22,14 +22,13 @@ import nl.lumc.sasc.biopet.extensions.bedtools.BedtoolsCoverage
import nl.lumc.sasc.biopet.extensions.picard.MergeSamFiles
import nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
import nl.lumc.sasc.biopet.scripts.PrefixFastq
import nl.lumc.sasc.biopet.tools.PrefixFastq
import nl.lumc.sasc.biopet.tools.BedtoolsCoverageToCounts
import nl.lumc.sasc.biopet.scripts.SquishBed
import nl.lumc.sasc.biopet.tools.SageCountFastq
import nl.lumc.sasc.biopet.tools.SageCreateLibrary
import nl.lumc.sasc.biopet.tools.SageCreateTagCounts
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.function._
class Sage(val root: Configurable) extends QScript with MultiSampleQScript {
def this() = this(null)
......@@ -142,17 +141,17 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript {
addAll(flexiprep.functions)
val flexiprepOutput = for ((key, file) <- flexiprep.outputFiles if key.endsWith("output_R1")) yield file
val prefixFastq = PrefixFastq.apply(this, flexiprepOutput.head, runDir)
prefixFastq.prefix = config("sage_tag", default = "CATG")
val prefixFastq = PrefixFastq(this, flexiprepOutput.head, runDir)
prefixFastq.prefixSeq = config("sage_tag", default = "CATG")
prefixFastq.deps +:= flexiprep.outputFiles("fastq_input_R1")
add(prefixFastq)
libraryOutput.prefixFastq = prefixFastq.output
libraryOutput.prefixFastq = prefixFastq.outputFastq
val mapping = new Mapping(this)
mapping.skipFlexiprep = true
mapping.skipMarkduplicates = true
mapping.aligner = config("aligner", default = "bowtie")
mapping.input_R1 = prefixFastq.output
mapping.input_R1 = prefixFastq.outputFastq
mapping.RGLB = runConfig("ID").toString
mapping.RGSM = sampleConfig("ID").toString
if (runConfig.contains("PL")) mapping.RGPL = runConfig("PL").toString
......@@ -165,7 +164,7 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript {
if (config("library_counts", default = false).asBoolean) {
addBedtoolsCounts(mapping.outputFiles("finalBamFile"), sampleID + "-" + runID, runDir)
addTablibCounts(prefixFastq.output, sampleID + "-" + runID, runDir)
addTablibCounts(prefixFastq.outputFastq, sampleID + "-" + runID, runDir)
}
libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment