Cutadapt.scala 3.99 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
16
17
18
package nl.lumc.sasc.biopet.extensions

import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
19

20
import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction }
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import nl.lumc.sasc.biopet.utils.config.Configurable
Peter van 't Hof's avatar
Peter van 't Hof committed
22
23
import nl.lumc.sasc.biopet.core.summary.Summarizable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
24

25
26
27
import scala.collection.mutable
import scala.io.Source

Peter van 't Hof's avatar
Peter van 't Hof committed
28
29
30
31
/**
 * Extension for cutadept
 * Based on version 1.5
 */
32
class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Summarizable with Version {
33
34
35
  @Input(doc = "Input fastq file")
  var fastq_input: File = _

Peter van 't Hof's avatar
Peter van 't Hof committed
36
37
  @Output
  var fastq_output: File = _
38
39
40
41
42

  @Output(doc = "Output statistics file")
  var stats_output: File = _

  executable = config("exe", default = "cutadapt")
43
44
  def versionCommand = executable + " --version"
  def versionRegex = """(.*)""".r
45

46
47
48
  /** Name of the key containing clipped adapters information in the summary stats. */
  def adaptersStatsName = "adapters"

49
  var default_clip_mode: String = config("default_clip_mode", default = "3")
Peter van 't Hof's avatar
Peter van 't Hof committed
50
51
52
  var opt_adapter: Set[String] = config("adapter", default = Nil)
  var opt_anywhere: Set[String] = config("anywhere", default = Nil)
  var opt_front: Set[String] = config("front", default = Nil)
53

Peter van 't Hof's avatar
Peter van 't Hof committed
54
  var opt_discard: Boolean = config("discard", default = false)
55
  var opt_minimum_length: Int = config("minimum_length", 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
56
  var opt_maximum_length: Option[Int] = config("maximum_length")
57

Peter van 't Hof's avatar
Peter van 't Hof committed
58
  /** return commandline to execute */
59
60
61
62
63
64
65
66
67
68
  def cmdLine = required(executable) +
    // options
    repeat("-a", opt_adapter) +
    repeat("-b", opt_anywhere) +
    repeat("-g", opt_front) +
    conditional(opt_discard, "--discard") +
    optional("-m", opt_minimum_length) +
    optional("-M", opt_maximum_length) +
    // input / output
    required(fastq_input) +
Peter van 't Hof's avatar
Peter van 't Hof committed
69
    (if (outputAsStsout) "" else required("--output", fastq_output) +
Peter van 't Hof's avatar
Peter van 't Hof committed
70
      " > " + required(stats_output))
71

Peter van 't Hof's avatar
Peter van 't Hof committed
72
  /** Output summary stats */
73
  def summaryStats: Map[String, Any] = {
74
75
76
77
78
79
80
81
    val trimR = """.*Trimmed reads: *(\d*) .*""".r
    val tooShortR = """.*Too short reads: *(\d*) .*""".r
    val tooLongR = """.*Too long reads: *(\d*) .*""".r
    val adapterR = """Adapter '([C|T|A|G]*)'.*trimmed (\d*) times.""".r

    val stats: mutable.Map[String, Int] = mutable.Map("trimmed" -> 0, "tooshort" -> 0, "toolong" -> 0)
    val adapter_stats: mutable.Map[String, Int] = mutable.Map()

Peter van 't Hof's avatar
Peter van 't Hof committed
82
    if (stats_output.exists) for (line <- Source.fromFile(stats_output).getLines()) {
83
84
85
86
87
88
89
90
91
      line match {
        case trimR(m)                 => stats += ("trimmed" -> m.toInt)
        case tooShortR(m)             => stats += ("tooshort" -> m.toInt)
        case tooLongR(m)              => stats += ("toolong" -> m.toInt)
        case adapterR(adapter, count) => adapter_stats += (adapter -> count.toInt)
        case _                        =>
      }
    }

92
    Map("num_reads_affected" -> stats("trimmed"),
93
94
      "num_reads_discarded_too_short" -> stats("tooshort"),
      "num_reads_discarded_too_long" -> stats("toolong"),
95
      adaptersStatsName -> adapter_stats.toMap
96
97
98
    )
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
99
  /** Merges values that can be merged for the summary */
100
101
102
103
104
105
106
107
  override def resolveSummaryConflict(v1: Any, v2: Any, key: String): Any = {
    (v1, v2) match {
      case (v1: Int, v2: Int) => v1 + v2
      case _                  => v1
    }
  }

  def summaryFiles: Map[String, File] = Map()
Zeeuw's avatar
Zeeuw committed
108
}