Commit 63c6517c authored by Peter van 't Hof
Browse files

Merge branch 'fix-238' into 'develop'

Fix for #238

The issue is caused by the Cutadapt summary not being aware of the
clipped sequence names. This merge request extends the existing Cutadapt
wrapper in the Flexiprep module and makes it so that the wrapper is
aware of FastQC.

See merge request !288
parents ac389d5d a482f22f
......@@ -43,6 +43,9 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su
/** Command line used to query the tool version: the executable followed by `--version`. */
def versionCommand = executable + " --version"
/** Regex with a single capture group spanning the whole match; presumably applied to the output of versionCommand. */
def versionRegex = """(.*)""".r
/** Name of the key containing clipped adapters information in the summary stats. */
def adaptersStatsName = "adapters"
/** Value of the config key "default_clip_mode"; the default passed to config is "3". */
var default_clip_mode: String = config("default_clip_mode", default = "3")
/** Value of the config key "adapter"; the default passed to config is Nil (no adapters). */
var opt_adapter: Set[String] = config("adapter", default = Nil)
/** Value of the config key "anywhere"; the default passed to config is Nil. */
var opt_anywhere: Set[String] = config("anywhere", default = Nil)
......@@ -89,7 +92,7 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su
Map("num_reads_affected" -> stats("trimmed"),
"num_reads_discarded_too_short" -> stats("tooshort"),
"num_reads_discarded_too_long" -> stats("toolong"),
"adapters" -> adapter_stats.toMap
adaptersStatsName -> adapter_stats.toMap
)
}
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines.flexiprep
import nl.lumc.sasc.biopet.utils.config.Configurable
/**
 * Cutadapt wrapper specific for Flexiprep.
 *
 * This wrapper overrides the summary part so that instead of only displaying the clipped adapters, the sequence names
 * are also displayed. In Flexiprep the sequence will always have names since they are detected by FastQC from a list
 * of known adapters / contaminants.
 *
 * @param root Configurable object from which this wrapper is initialized.
 * @param fastqc Fastqc wrapper that contains adapter information.
 */
class Cutadapt(root: Configurable, fastqc: Fastqc) extends nl.lumc.sasc.biopet.extensions.Cutadapt(root) {

  /** Mapping of adapter sequence to adapter name, built from the adapters found by FastQC. */
  protected def seqToName = fastqc.foundAdapters
    .map(adapter => adapter.seq -> adapter.name).toMap

  /**
   * Summary statistics of the parent wrapper, with the adapters entry (stored under `adaptersStatsName`) re-keyed
   * by adapter name. Each adapter name maps to a map holding its "sequence" and its "count".
   *
   * @return the parent's summary stats with the adapters entry replaced.
   * @throws IllegalStateException if a clipped adapter sequence is not among the adapters found by FastQC.
   * @throws RuntimeException if the parent's stats hold no map under `adaptersStatsName`.
   */
  override def summaryStats: Map[String, Any] = {
    val initStats = super.summaryStats

    // seqToName rebuilds its map on every call, so evaluate it once instead of once per adapter entry
    val nameBySeq = seqToName

    // Map of adapter name to the clipped sequence and how many times it is found
    val adapterCounts: Map[String, Any] = initStats.get(adaptersStatsName) match {
      // adapters key found in statistics and holding a map
      case Some(m: Map[_, _]) => m.flatMap {
        case (seq: String, count) =>
          nameBySeq.get(seq) match {
            // adapter sequence is found by FastQC
            case Some(n) => Some(n -> Map("sequence" -> seq, "count" -> count))
            // adapter sequence is clipped but not found by FastQC ~ should not happen since all clipped adapter
            // sequences come from FastQC
            case _ =>
              throw new IllegalStateException(s"Adapter '$seq' is clipped but not found by FastQC in '$fastq_input'.")
          }
        // entry whose key is not a String ~ cannot be looked up by sequence, so it is logged and dropped
        case _ =>
          logger.debug(s"No adapters found for summarizing in '$fastq_input'.")
          None
      }
      // adapters key missing or not holding a map ~ something went wrong in our part
      case _ =>
        throw new RuntimeException(s"Required key '$adaptersStatsName' not found in stats entry '$fastq_input'.")
    }

    initStats.updated(adaptersStatsName, adapterCounts)
  }
}
......@@ -19,7 +19,7 @@ import java.io.File
import nl.lumc.sasc.biopet.core.summary.{ SummaryQScript, Summarizable }
import nl.lumc.sasc.biopet.core.{ BiopetFifoPipe, BiopetCommandLineFunction }
import nl.lumc.sasc.biopet.extensions.{ Cat, Gzip, Sickle, Cutadapt }
import nl.lumc.sasc.biopet.extensions.{ Cat, Gzip, Sickle }
import nl.lumc.sasc.biopet.extensions.seqtk.SeqtkSeq
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
......@@ -101,7 +101,7 @@ class QcCommand(val root: Configurable, val fastqc: Fastqc) extends BiopetComman
clip = if (!flexiprep.skipClip) {
val foundAdapters = fastqc.foundAdapters.map(_.seq)
if (foundAdapters.nonEmpty) {
val cutadapt = new Cutadapt(root)
val cutadapt = new Cutadapt(root, fastqc)
cutadapt.fastq_input = seqtk.output
cutadapt.fastq_output = new File(output.getParentFile, input.getName + ".cutadapt.fq")
cutadapt.stats_output = new File(flexiprep.outputDir, s"${flexiprep.sampleId.getOrElse("x")}-${flexiprep.libId.getOrElse("x")}.$read.clip.stats")
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment