Commit 13430b2e authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Added flexiprep as separated artifact

parent e671341e
/target/
\ No newline at end of file
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Flexiprep</artifactId>
<version>0.2.0-DEV</version>
<packaging>jar</packaging>
<inceptionYear>2014</inceptionYear>
<name>Flexiprep</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<sting.unpack.phase>prepare-package</sting.unpack.phase>
<sting.shade.phase>package</sting.shade.phase>
<app.main.class>nl.lumc.sasc.biopet.core.BiopetExecutable</app.main.class>
</properties>
<dependencies>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetFramework</artifactId>
<version>0.2.0-DEV</version>
</dependency>
</dependencies>
<build>
<resources>
<resource>
<directory>src/main/resources</directory>
<includes>
<include>**/*</include>
</includes>
</resource>
</resources>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<id>scala-compile</id>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
<arg>-deprecation</arg>
<arg>-feature</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>2.5</version>
<configuration>
<archive>
<manifest>
<addDefaultImplementationEntries>true</addDefaultImplementationEntries>
<addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
</manifest>
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<showDeprecation>true</showDeprecation>
</configuration>
</plugin>
</plugins>
</build>
</project>
package nl.lumc.sasc.biopet.pipelines.flexiprep
import scala.io.Source
import nl.lumc.sasc.biopet.extensions.Ln
import org.broadinstitute.gatk.utils.commandline.{ Input }
import argonaut._, Argonaut._
import scalaz._, Scalaz._
import java.io.File
import nl.lumc.sasc.biopet.core.config.Configurable
import scala.collection.mutable.Map
class Cutadapt(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Cutadapt(root) {
@Input(doc = "Fastq contams file", required = false)
var contams_file: File = _
override def beforeCmd() {
super.beforeCmd
getContamsFromFile
}
override def cmdLine = {
if (!opt_adapter.isEmpty || !opt_anywhere.isEmpty || !opt_front.isEmpty) {
analysisName = getClass.getSimpleName
super.cmdLine
} else {
analysisName = getClass.getSimpleName + "-ln"
Ln(this, fastq_input, fastq_output, relative = true).cmd
}
}
def getContamsFromFile {
if (contams_file != null) {
if (contams_file.exists()) {
for (line <- Source.fromFile(contams_file).getLines) {
var s: String = line.substring(line.lastIndexOf("\t") + 1, line.size)
if (default_clip_mode == "3") opt_adapter += s
else if (default_clip_mode == "5") opt_front += s
else if (default_clip_mode == "both") opt_anywhere += s
else {
opt_adapter += s
logger.warn("Option default_clip_mode should be '3', '5' or 'both', falling back to default: '3'")
}
logger.info("Adapter: " + s + " found in: " + fastq_input)
}
} else logger.warn("File : " + contams_file + " does not exist")
}
}
def getSummary: Json = {
val trimR = """.*Trimmed reads: *(\d*) .*""".r
val tooShortR = """.*Too short reads: *(\d*) .*""".r
val tooLongR = """.*Too long reads: *(\d*) .*""".r
val adapterR = """Adapter '([C|T|A|G]*)'.*trimmed (\d*) times.""".r
var stats: Map[String, Int] = Map("trimmed" -> 0, "tooshort" -> 0, "toolong" -> 0)
var adapter_stats: Map[String, Int] = Map()
if (stats_output.exists) for (line <- Source.fromFile(stats_output).getLines) {
line match {
case trimR(m) => stats += ("trimmed" -> m.toInt)
case tooShortR(m) => stats += ("tooshort" -> m.toInt)
case tooLongR(m) => stats += ("toolong" -> m.toInt)
case adapterR(adapter, count) => adapter_stats += (adapter -> count.toInt)
case _ =>
}
}
return ("num_reads_affected" := stats("trimmed")) ->:
("num_reads_discarded_too_short" := stats("tooshort")) ->:
("num_reads_discarded_too_long" := stats("toolong")) ->:
("adapters" := adapter_stats.toMap) ->:
jEmptyObject
}
}
object Cutadapt {
def apply(root: Configurable, input: File, output: File): Cutadapt = {
val cutadapt = new Cutadapt(root)
cutadapt.fastq_input = input
cutadapt.fastq_output = output
cutadapt.stats_output = new File(output.getAbsolutePath.substring(0, output.getAbsolutePath.lastIndexOf(".")) + ".stats")
return cutadapt
}
def mergeSummaries(jsons: List[Json]): Json = {
var affected = 0
var tooShort = 0
var tooLong = 0
var adapter_stats: Map[String, Int] = Map()
for (json <- jsons) {
affected += json.field("num_reads_affected").get.numberOrZero.toInt
tooShort += json.field("num_reads_discarded_too_short").get.numberOrZero.toInt
tooLong += json.field("num_reads_discarded_too_long").get.numberOrZero.toInt
val adapters = json.fieldOrEmptyObject("adapters")
for (key <- adapters.objectFieldsOrEmpty) {
val number = adapters.field(key).get.numberOrZero.toInt
if (adapter_stats.contains(key)) adapter_stats(key) += number
else adapter_stats += (key -> number)
}
}
return ("num_reads_affected" := affected) ->:
("num_reads_discarded_too_short" := tooShort) ->:
("num_reads_discarded_too_long" := tooLong) ->:
("adapters" := adapter_stats.toMap) ->:
jEmptyObject
}
}
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nl.lumc.sasc.biopet.pipelines.flexiprep
import java.io.File
import nl.lumc.sasc.biopet.core.config.Configurable
import scala.io.Source
import argonaut._, Argonaut._
import scalaz._, Scalaz._
class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(root) {
def getDataBlock(name: String): Array[String] = { // Based on Fastqc v0.10.1
val outputDir = output.getAbsolutePath.stripSuffix(".zip")
val dataFile = new File(outputDir + "/fastqc_data.txt")
if (!dataFile.exists) return null
val data = Source.fromFile(dataFile).mkString
for (block <- data.split(">>END_MODULE\n")) {
val b = if (block.startsWith("##FastQC")) block.substring(block.indexOf("\n") + 1) else block
if (b.startsWith(">>" + name))
return for (line <- b.split("\n"))
yield line
}
return null
}
def getEncoding: String = {
val block = getDataBlock("Basic Statistics")
if (block == null) return null
for (
line <- block if (line.startsWith("Encoding"))
) return line.stripPrefix("Encoding\t")
return null // Could be default Sanger with a warning in the log
}
def getSummary: Json = {
val subfixs = Map("plot_duplication_levels" -> "Images/duplication_levels.png",
"plot_kmer_profiles" -> "Images/kmer_profiles.png",
"plot_per_base_gc_content" -> "Images/per_base_gc_content.png",
"plot_per_base_n_content" -> "Images/per_base_n_content.png",
"plot_per_base_quality" -> "Images/per_base_quality.png",
"plot_per_base_sequence_content" -> "Images/per_base_sequence_content.png",
"plot_per_sequence_gc_content" -> "Images/per_sequence_gc_content.png",
"plot_per_sequence_quality" -> "Images/per_sequence_quality.png",
"plot_sequence_length_distribution" -> "Images/sequence_length_distribution.png",
"fastqc_data" -> "fastqc_data.txt")
val dir = output.getAbsolutePath.stripSuffix(".zip") + "/"
var outputMap: Map[String, Map[String, String]] = Map()
for ((k, v) <- subfixs) outputMap += (k -> Map("path" -> (dir + v)))
val temp = ("" := outputMap) ->: jEmptyObject
return temp.fieldOrEmptyObject("")
}
}
object Fastqc {
def apply(root: Configurable, fastqfile: File, outDir: String): Fastqc = {
val fastqcCommand = new Fastqc(root)
fastqcCommand.fastqfile = fastqfile
var filename: String = fastqfile.getName()
if (filename.endsWith(".gz")) filename = filename.substring(0, filename.size - 3)
if (filename.endsWith(".gzip")) filename = filename.substring(0, filename.size - 5)
if (filename.endsWith(".fastq")) filename = filename.substring(0, filename.size - 6)
//if (filename.endsWith(".fq")) filename = filename.substring(0,filename.size - 3)
fastqcCommand.output = new File(outDir + "/" + filename + "_fastqc.zip")
fastqcCommand.afterGraph
return fastqcCommand
}
}
package nl.lumc.sasc.biopet.pipelines.flexiprep
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat, Seqstat }
import nl.lumc.sasc.biopet.scripts.{ FastqSync, FastqcToContams }
class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "R1 fastq file (gzipped allowed)", shortName = "R1", required = true)
var input_R1: File = _
@Input(doc = "R2 fastq file (gzipped allowed)", shortName = "R2", required = false)
var input_R2: File = _
@Argument(doc = "Skip Trim fastq files", shortName = "skiptrim", required = false)
var skipTrim: Boolean = config("skiptrim", default = false)
@Argument(doc = "Skip Clip fastq files", shortName = "skipclip", required = false)
var skipClip: Boolean = config("skipclip", default = false)
@Argument(doc = "Sample name", shortName = "sample", required = true)
var sampleName: String = _
@Argument(doc = "Library name", shortName = "library", required = true)
var libraryName: String = _
var paired: Boolean = (input_R2 != null)
var R1_ext: String = _
var R2_ext: String = _
var R1_name: String = _
var R2_name: String = _
var fastqc_R1: Fastqc = _
var fastqc_R2: Fastqc = _
var fastqc_R1_after: Fastqc = _
var fastqc_R2_after: Fastqc = _
val summary = new FlexiprepSummary(this)
def init() {
if (input_R1 == null) throw new IllegalStateException("Missing R1 on flexiprep module")
if (outputDir == null) throw new IllegalStateException("Missing Output directory on flexiprep module")
if (sampleName == null) throw new IllegalStateException("Missing Sample name on flexiprep module")
if (libraryName == null) throw new IllegalStateException("Missing Library name on flexiprep module")
else if (!outputDir.endsWith("/")) outputDir += "/"
paired = (input_R2 != null)
if (input_R1.endsWith(".gz")) R1_name = input_R1.getName.substring(0, input_R1.getName.lastIndexOf(".gz"))
else if (input_R1.endsWith(".gzip")) R1_name = input_R1.getName.substring(0, input_R1.getName.lastIndexOf(".gzip"))
else R1_name = input_R1.getName
R1_ext = R1_name.substring(R1_name.lastIndexOf("."), R1_name.size)
R1_name = R1_name.substring(0, R1_name.lastIndexOf(R1_ext))
if (paired) {
if (input_R2.endsWith(".gz")) R2_name = input_R2.getName.substring(0, input_R2.getName.lastIndexOf(".gz"))
else if (input_R2.endsWith(".gzip")) R2_name = input_R2.getName.substring(0, input_R2.getName.lastIndexOf(".gzip"))
else R2_name = input_R2.getName
R2_ext = R2_name.substring(R2_name.lastIndexOf("."), R2_name.size)
R2_name = R2_name.substring(0, R2_name.lastIndexOf(R2_ext))
}
summary.out = outputDir + sampleName + "-" + libraryName + ".qc.summary.json"
}
def biopetScript() {
runInitialJobs()
val out = if (paired) runTrimClip(outputFiles("fastq_input_R1"), outputFiles("fastq_input_R2"), outputDir)
else runTrimClip(outputFiles("fastq_input_R1"), outputDir)
val R1_files = for ((k, v) <- outputFiles if k.endsWith("output_R1")) yield v
val R2_files = for ((k, v) <- outputFiles if k.endsWith("output_R2")) yield v
runFinalize(R1_files.toList, R2_files.toList)
}
def runInitialJobs() {
outputFiles += ("fastq_input_R1" -> extractIfNeeded(input_R1, outputDir))
if (paired) outputFiles += ("fastq_input_R2" -> extractIfNeeded(input_R2, outputDir))
fastqc_R1 = Fastqc(this, input_R1, outputDir + "/" + R1_name + ".fastqc/")
add(fastqc_R1)
summary.addFastqc(fastqc_R1)
outputFiles += ("fastqc_R1" -> fastqc_R1.output)
outputFiles += ("contams_R1" -> getContams(fastqc_R1, R1_name))
val md5sum_R1 = Md5sum(this, input_R1, outputDir)
add(md5sum_R1)
summary.addMd5sum(md5sum_R1, R2 = false, after = false)
if (paired) {
fastqc_R2 = Fastqc(this, input_R2, outputDir + "/" + R2_name + ".fastqc/")
add(fastqc_R2)
summary.addFastqc(fastqc_R2, R2 = true)
outputFiles += ("fastqc_R2" -> fastqc_R2.output)
outputFiles += ("contams_R2" -> getContams(fastqc_R2, R2_name))
val md5sum_R2 = Md5sum(this, input_R2, outputDir)
add(md5sum_R2)
summary.addMd5sum(md5sum_R2, R2 = true, after = false)
}
}
def getContams(fastqc: Fastqc, pairname: String): File = {
val fastqcToContams = new FastqcToContams(this)
fastqcToContams.fastqc_output = fastqc.output
fastqcToContams.out = new File(outputDir + pairname + ".contams.txt")
fastqcToContams.contams_file = fastqc.contaminants
add(fastqcToContams)
return fastqcToContams.out
}
def runTrimClip(R1_in: File, outDir: String, chunk: String): (File, File, List[File]) = {
runTrimClip(R1_in, new File(""), outDir, chunk)
}
def runTrimClip(R1_in: File, outDir: String): (File, File, List[File]) = {
runTrimClip(R1_in, new File(""), outDir, "")
}
def runTrimClip(R1_in: File, R2_in: File, outDir: String): (File, File, List[File]) = {
runTrimClip(R1_in, R2_in, outDir, "")
}
def runTrimClip(R1_in: File, R2_in: File, outDir: String, chunkarg: String): (File, File, List[File]) = {
val chunk = if (chunkarg.isEmpty || chunkarg.endsWith("_")) chunkarg else chunkarg + "_"
var results: Map[String, File] = Map()
var R1: File = new File(R1_in)
var R2: File = new File(R2_in)
var deps: List[File] = if (paired) List(R1, R2) else List(R1)
val seqtkSeq_R1 = SeqtkSeq.apply(this, R1, swapExt(outDir, R1, R1_ext, ".sanger" + R1_ext), fastqc_R1)
add(seqtkSeq_R1, isIntermediate = true)
R1 = seqtkSeq_R1.output
deps ::= R1
if (paired) {
val seqtkSeq_R2 = SeqtkSeq.apply(this, R2, swapExt(outDir, R2, R2_ext, ".sanger" + R2_ext), fastqc_R2)
add(seqtkSeq_R2, isIntermediate = true)
R2 = seqtkSeq_R2.output
deps ::= R2
}
val seqstat_R1 = Seqstat(this, R1, outDir)
add(seqstat_R1, isIntermediate = true)
summary.addSeqstat(seqstat_R1, R2 = false, after = false, chunk)
if (paired) {
val seqstat_R2 = Seqstat(this, R2, outDir)
add(seqstat_R2, isIntermediate = true)
summary.addSeqstat(seqstat_R2, R2 = true, after = false, chunk)
}
if (!skipClip) { // Adapter clipping
val cutadapt_R1 = Cutadapt(this, R1, swapExt(outDir, R1, R1_ext, ".clip" + R1_ext))
if (outputFiles.contains("contams_R1")) cutadapt_R1.contams_file = outputFiles("contams_R1")
add(cutadapt_R1, isIntermediate = true)
summary.addCutadapt(cutadapt_R1, R2 = false, chunk)
R1 = cutadapt_R1.fastq_output
deps ::= R1
outputFiles += ("cutadapt_R1_stats" -> cutadapt_R1.stats_output)
if (paired) {
val cutadapt_R2 = Cutadapt(this, R2, swapExt(outDir, R2, R2_ext, ".clip" + R2_ext))
outputFiles += ("cutadapt_R2_stats" -> cutadapt_R2.stats_output)
if (outputFiles.contains("contams_R2")) cutadapt_R2.contams_file = outputFiles("contams_R2")
add(cutadapt_R2, isIntermediate = true)
summary.addCutadapt(cutadapt_R2, R2 = true, chunk)
R2 = cutadapt_R2.fastq_output
deps ::= R2
val fastqSync = FastqSync(this, cutadapt_R1.fastq_input, cutadapt_R1.fastq_output, cutadapt_R2.fastq_output,
swapExt(outDir, R1, R1_ext, ".sync" + R1_ext), swapExt(outDir, R2, R2_ext, ".sync" + R2_ext), swapExt(outDir, R1, R1_ext, ".sync.stats"))
fastqSync.deps :::= deps
add(fastqSync, isIntermediate = true)
summary.addFastqcSync(fastqSync, chunk)
outputFiles += ("syncStats" -> fastqSync.output_stats)
R1 = fastqSync.output_R1
R2 = fastqSync.output_R2
deps :::= R1 :: R2 :: Nil
}
}
if (!skipTrim) { // Quality trimming
val sickle = new Sickle(this)
sickle.input_R1 = R1
sickle.output_R1 = swapExt(outDir, R1, R1_ext, ".trim" + R1_ext)
if (paired) {
sickle.input_R2 = R2
sickle.output_R2 = swapExt(outDir, R2, R2_ext, ".trim" + R2_ext)
sickle.output_singles = swapExt(outDir, R2, R2_ext, ".trim.singles" + R1_ext)
}
sickle.output_stats = swapExt(outDir, R1, R1_ext, ".trim.stats")
sickle.deps = deps
add(sickle, isIntermediate = true)
summary.addSickle(sickle, chunk)
R1 = sickle.output_R1
if (paired) R2 = sickle.output_R2
}
val seqstat_R1_after = Seqstat(this, R1, outDir)
seqstat_R1_after.deps = deps
add(seqstat_R1_after)
summary.addSeqstat(seqstat_R1_after, R2 = false, after = true, chunk)
if (paired) {
val seqstat_R2_after = Seqstat(this, R2, outDir)
seqstat_R2_after.deps = deps
add(seqstat_R2_after)
summary.addSeqstat(seqstat_R2_after, R2 = true, after = true, chunk)
}
outputFiles += (chunk + "output_R1" -> R1)
if (paired) outputFiles += (chunk + "output_R2" -> R2)
return (R1, R2, deps)
}
def runFinalize(fastq_R1: List[File], fastq_R2: List[File]) {
if (fastq_R1.length != fastq_R2.length && paired) throw new IllegalStateException("R1 and R2 file number is not the same")
val R1 = new File(outputDir + R1_name + ".qc" + R1_ext + ".gz")
val R2 = new File(outputDir + R2_name + ".qc" + R2_ext + ".gz")
add(Gzip(this, fastq_R1, R1))
if (paired) add(Gzip(this, fastq_R2, R2))
outputFiles += ("output_R1_gzip" -> R1)
if (paired) outputFiles += ("output_R2_gzip" -> R2)
if (!skipTrim || !skipClip) {
val md5sum_R1 = Md5sum(this, R1, outputDir)
add(md5sum_R1)
summary.addMd5sum(md5sum_R1, R2 = false, after = true)
if (paired) {
val md5sum_R2 = Md5sum(this, R2, outputDir)
add(md5sum_R2)
summary.addMd5sum(md5sum_R2, R2 = true, after = true)
}
fastqc_R1_after = Fastqc(this, R1, outputDir + "/" + R1_name + ".qc.fastqc/")
add(fastqc_R1_after)
summary.addFastqc(fastqc_R1_after, after = true)
if (paired) {
fastqc_R2_after = Fastqc(this, R2, outputDir + "/" + R2_name + ".qc.fastqc/")
add(fastqc_R2_after)
summary.addFastqc(fastqc_R2_after, R2 = true, after = true)
}
}
add(summary)
}
def extractIfNeeded(file: File, runDir: String): File = {
if (file.getName().endsWith(".gz") || file.getName().endsWith(".gzip")) {
var newFile: File = swapExt(runDir, file, ".gz", "")
if (file.getName().endsWith(".gzip")) newFile = swapExt(runDir, file, ".gzip", "")
val zcatCommand = Zcat(this, file, newFile)
zcatCommand.isIntermediate = true
add(zcatCommand)
return newFile
} else if (file.getName().endsWith(".bz2")) {
var newFile = swapExt(runDir, file, ".bz2", "")
val pbzip2 = Pbzip2(this, file, newFile)
pbzip2.isIntermediate = true
add(pbzip2)
return newFile
} else return file
}
}
object Flexiprep extends PipelineCommand
package nl.lumc.sasc.biopet.pipelines.flexiprep
import java.io.PrintWriter
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.{ Md5sum, Seqstat }
import nl.lumc.sasc.biopet.scripts.{ FastqSync }
import org.broadinstitute.gatk.queue.function.InProcessFunction
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import java.io.File
import argonaut._, Argonaut._