Fastqc.scala 10.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
12
13
14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
15
16
package nl.lumc.sasc.biopet.pipelines.flexiprep

17
import java.io.{ File, FileNotFoundException }
bow's avatar
bow committed
18

19
import nl.lumc.sasc.biopet.core.summary.Summarizable
20
import nl.lumc.sasc.biopet.utils.config.Configurable
21

22
import scala.io.Source
23
import htsjdk.samtools.util.SequenceUtil.reverseComplement
24
import org.broadinstitute.gatk.utils.commandline.Output
25

26
27
28
29
30
31
/**
 * FastQC wrapper with added functionality for the Flexiprep pipeline
 *
 * This wrapper implements additional methods for parsing FastQC output files and aggregating everything in a summary
 * object. The current implementation is based on FastQC v0.10.1.
 */
32
class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(root) with Summarizable {
33

34
35
  /** Allow reporting of all found (potentially adapter) sequences in the FastQC */
  var sensitiveAdapterSearch: Boolean = config("sensitiveAdapterSearch", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
36
  var enableRCtrimming: Boolean = config("enableRCtrimming", default = false)
37

bow's avatar
bow committed
38
  /** Class for storing a single FastQC module result */
39
40
  protected case class FastQCModule(name: String, status: String, lines: Seq[String])

41
42
43
44
45
46
47
48
  /** Default FastQC output directory containing actual results */
  // this is a def instead of a val since the value depends on the variable `output`, which is null on class creation
  def outputDir: File = new File(output.getAbsolutePath.stripSuffix(".zip"))

  /** Default FastQC output data file */
  // this is a def instead of a val since the value depends on the variable `output`, which is null on class creation
  def dataFile: File = new File(outputDir, "fastqc_data.txt")

49
50
51
52
53
  /**
   * FastQC QC modules.
   *
   * @return Mapping of FastQC module names and its contents as array of strings (one item per line)
   * @throws FileNotFoundException if the FastQC data file can not be found.
54
   * @throws IllegalStateException if the module lines have no content or mapping is empty.
55
   */
56
  def qcModules: Map[String, FastQCModule] = {
57
58
    val fastQCLog = Source.fromFile(dataFile)
    val fqModules: Map[String, FastQCModule] = fastQCLog
Peter van 't Hof's avatar
Peter van 't Hof committed
59
60
61
62
63
64
65
66
      // drop all the characters before the first module delimiter (i.e. '>>')
      .dropWhile(_ != '>')
      // pull everything into a string
      .mkString
      // split into modules
      .split(">>END_MODULE\n")
      // make map of module name -> module lines
      .map {
67
68
        case (modString) =>
          // module name is in the first line, without '>>' and before the tab character
69
70
          val Array(firstLine, otherLines) = modString
            // drop all '>>' character (start of module)
71
            .dropWhile(_ == '>')
72
73
74
75
76
77
78
79
80
            // split first line and others
            .split("\n", 2)
            // and slice them
            .slice(0, 2)
          // extract module name and module status
          val Array(modName, modStatus) = firstLine
            .split("\t", 2)
            .slice(0, 2)
          modName -> FastQCModule(modName, modStatus, otherLines.split("\n").toSeq)
81
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
82
      .toMap
83

84
    fastQCLog.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
85
86
    if (fqModules.isEmpty) throw new IllegalStateException("Empty FastQC data file " + dataFile.toString)
    else fqModules
87
88
  }

bow's avatar
bow committed
89
90
91
92
93
94
95
  /**
   * Retrieves the FASTQ file encoding as computed by FastQC.
   *
   * @return encoding name
   * @throws NoSuchElementException when the "Basic Statistics" key does not exist in the mapping or
   *                                when a line starting with "Encoding" does not exist.
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
96
  def encoding: String = {
Peter van 't Hof's avatar
Peter van 't Hof committed
97
    if (dataFile.exists) // On a dry run this file does not yet exist
Peter van 't Hof's avatar
Peter van 't Hof committed
98
      qcModules("Basic Statistics") //FIXME: not save
Peter van 't Hof's avatar
Peter van 't Hof committed
99
100
101
102
103
        .lines
        .dropWhile(!_.startsWith("Encoding"))
        .head
        .stripPrefix("Encoding\t")
        .stripSuffix("\t")
Peter van 't Hof's avatar
Peter van 't Hof committed
104
105
    else ""
  }
bow's avatar
bow committed
106

bow's avatar
bow committed
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
  protected case class BasePositionStats(mean: Double, median: Double,
                                         lowerQuartile: Double, upperQuartile: Double,
                                         percentile10th: Double, percentile90th: Double) {

    def toMap = Map(
      "mean" -> mean,
      "median" -> median,
      "lower_quartile" -> lowerQuartile,
      "upper_quartile" -> upperQuartile,
      "percentile_10th" -> percentile10th,
      "percentile_90th" -> percentile90th)
  }

  /**
   * Retrieves the base quality per position values as computed by FastQc.
   */
  def perBaseSequenceQuality: Map[String, Map[String, Double]] =
    if (dataFile.exists) {
      qcModules.get("Per base sequence quality") match {
        case None => Map()
        case Some(qcModule) =>
          val tableContents = for {
Peter van 't Hof's avatar
Peter van 't Hof committed
129
            line <- qcModule.lines if !(line.startsWith("#") || line.startsWith(">"))
bow's avatar
bow committed
130
131
            values = line.split("\t") if values.size == 7
          } yield (values(0), BasePositionStats(values(1).toDouble, values(2).toDouble, values(3).toDouble,
Peter van 't Hof's avatar
Peter van 't Hof committed
132
            values(4).toDouble, values(5).toDouble, values(6).toDouble).toMap)
bow's avatar
bow committed
133
134
135
136
137
138
139
140
141
142
143
          tableContents.toMap
      }
    } else Map()

  def perBaseSequenceContent: Map[String, Map[String, Double]] =
    if (dataFile.exists) {
      qcModules.get("Per base sequence content") match {
        case None => Map()
        case Some(qcModule) =>
          val bases = qcModule.lines.head.split("\t").tail
          val tableContents = for {
Peter van 't Hof's avatar
Peter van 't Hof committed
144
            line <- qcModule.lines if !(line.startsWith("#") || line.startsWith(">"))
bow's avatar
bow committed
145
146
147
148
149
150
            values = line.split("\t") if values.size == 5
          } yield (values(0), bases.zip(values.tail.map(_.toDouble)).toMap)
          tableContents.toMap
      }
    } else Map()

151
152
153
154
155
  /**
   * Retrieves overrepresented sequences found by FastQ.
   *
   * @return a [[Set]] of [[AdapterSequence]] objects.
   */
bow's avatar
bow committed
156
  def foundAdapters: Set[AdapterSequence] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
157
    if (dataFile.exists) { // On a dry run this file does not yet exist
158
159
      val modules = qcModules

Peter van 't Hof's avatar
Peter van 't Hof committed
160
161
162
163
164
165
166
167
168
169
170
171
      /** Returns a list of adapter and/or contaminant sequences known to FastQC */
      def getFastqcSeqs(file: Option[File]): Set[AdapterSequence] = file match {
        case None => Set.empty[AdapterSequence]
        case Some(f) =>
          (for {
            line <- Source.fromFile(f).getLines()
            if !line.startsWith("#")
            values = line.split("\t+")
            if values.size >= 2
          } yield AdapterSequence(values(0), values(1))).toSet
      }

172
173
      val adapterSet = getFastqcSeqs(adapters)
      val contaminantSet = getFastqcSeqs(contaminants)
174

175
      val foundAdapterNames: Seq[String] = modules.get("Overrepresented sequences") match {
Peter van 't Hof's avatar
Peter van 't Hof committed
176
177
178
179
180
181
182
        case None => Seq.empty[String]
        case Some(qcModule) =>
          for (
            line <- qcModule.lines if !(line.startsWith("#") || line.startsWith(">"));
            values = line.split("\t") if values.size >= 4
          ) yield values(3)
      }
183

Peter van 't Hof's avatar
Peter van 't Hof committed
184
185
      // select full sequences from known adapters and contaminants
      // based on overrepresented sequences results
186
      val fromKnownList: Set[AdapterSequence] = contaminantSet
187
188
        .filter(x => foundAdapterNames.exists(_.startsWith(x.name)))

Peter van 't Hof's avatar
Peter van 't Hof committed
189
      val fromKnownListRC: Set[AdapterSequence] = if (enableRCtrimming) fromKnownList.map {
190
        x => AdapterSequence(x.name + "_RC", reverseComplement(x.seq))
Peter van 't Hof's avatar
Peter van 't Hof committed
191
192
      }
      else Set.empty
193

194
      // list all sequences found by FastQC
195
      val fastQCFoundSequences: Seq[AdapterSequence] = if (sensitiveAdapterSearch) {
196
        modules.get("Overrepresented sequences") match {
197
198
199
200
201
202
203
          case None => Seq.empty
          case Some(qcModule) =>
            for (
              line <- qcModule.lines if !(line.startsWith("#") || line.startsWith(">"));
              values = line.split("\t") if values.size >= 4
            ) yield AdapterSequence(values(3), values(0))
        }
204
      } else Seq()
205

Peter van 't Hof's avatar
Typo    
Peter van 't Hof committed
206
      val foundAdapters = modules.get("Adapter Content").map { x =>
207
208
        val header = x.lines.head.split("\t").tail.zipWithIndex
        val lines = x.lines.tail.map(_.split("\t").tail)
Peter van 't Hof's avatar
Peter van 't Hof committed
209
        val found = header.filter(h => lines.exists(x => x(h._2).toFloat > 0)).map(_._1)
210
211
        adapterSet.filter(x => found.contains(x.name))
      }
212

Peter van 't Hof's avatar
Typo    
Peter van 't Hof committed
213
      fromKnownList ++ fastQCFoundSequences ++ fromKnownListRC ++ foundAdapters.getOrElse(Seq())
Peter van 't Hof's avatar
Peter van 't Hof committed
214
    } else Set()
215
  }
216

217
218
219
  @Output
  private var outputFiles: List[File] = Nil

220
  def summaryFiles: Map[String, File] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
221
    val outputFiles = Map("plot_duplication_levels" -> ("Images" + File.separator + "duplication_levels.png"),
222
223
224
225
226
227
228
229
      "plot_kmer_profiles" -> ("Images" + File.separator + "kmer_profiles.png"),
      "plot_per_base_gc_content" -> ("Images" + File.separator + "per_base_gc_content.png"),
      "plot_per_base_n_content" -> ("Images" + File.separator + "per_base_n_content.png"),
      "plot_per_base_quality" -> ("Images" + File.separator + "per_base_quality.png"),
      "plot_per_base_sequence_content" -> ("Images" + File.separator + "per_base_sequence_content.png"),
      "plot_per_sequence_gc_content" -> ("Images" + File.separator + "per_sequence_gc_content.png"),
      "plot_per_sequence_quality" -> ("Images" + File.separator + "per_sequence_quality.png"),
      "plot_sequence_length_distribution" -> ("Images" + File.separator + "sequence_length_distribution.png"),
Peter van 't Hof's avatar
Peter van 't Hof committed
230
231
      "fastqc_data" -> "fastqc_data.txt")
      .map(x => x._1 -> new File(outputDir, x._2))
Peter van 't Hof's avatar
Peter van 't Hof committed
232
233
234
235

    outputFiles.foreach(this.outputFiles :+= _._2)

    outputFiles ++ Map("fastq_file" -> this.fastqfile)
236
  }
237

bow's avatar
bow committed
238
239
  def summaryStats: Map[String, Any] = Map(
    "per_base_sequence_quality" -> perBaseSequenceQuality,
Peter van 't Hof's avatar
Peter van 't Hof committed
240
241
    "per_base_sequence_content" -> perBaseSequenceContent,
    "adapters" -> foundAdapters.map(x => x.name -> x.seq).toMap)
242
243
244
}

object Fastqc {
245

Peter van 't Hof's avatar
Peter van 't Hof committed
246
  def apply(root: Configurable, fastqfile: File, outDir: File): Fastqc = {
247
248
    val fastqcCommand = new Fastqc(root)
    fastqcCommand.fastqfile = fastqfile
Peter van 't Hof's avatar
Peter van 't Hof committed
249
250
251
252
    var filename: String = fastqfile.getName
    if (filename.endsWith(".gz")) filename = filename.substring(0, filename.length - 3)
    if (filename.endsWith(".gzip")) filename = filename.substring(0, filename.length - 5)
    if (filename.endsWith(".fastq")) filename = filename.substring(0, filename.length - 6)
253
    //if (filename.endsWith(".fq")) filename = filename.substring(0,filename.size - 3)
Peter van 't Hof's avatar
Peter van 't Hof committed
254
    fastqcCommand.output = new File(outDir, filename + "_fastqc.zip")
Peter van 't Hof's avatar
Peter van 't Hof committed
255
    fastqcCommand.beforeGraph()
256
    fastqcCommand
257
258
  }
}