Merge branch 'feature-fix_contams' into 'develop'

Feature fix contams Fix for #11 See merge request !69

Merge branch 'feature-fix_contams' into 'develop'
86e3b9eb · bow · 259e3372 · 1f864153 · 259e3372 · 259e3372
Commit 86e3b9eb authored 10 years ago by bow
--- a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/fastqc_contam.py
+++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/fastqc_contam.py
-#!/usr/bin/env python
-#
-# Biopet is built on top of GATK Queue for building bioinformatic
-# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
-# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
-# should also be able to execute Biopet tools and pipelines.
-#
-# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
-#
-# Contact us at: sasc@lumc.nl
-#
-# A dual licensing mode is applied. The source code within this project that are
-# not part of GATK Queue is freely available for non-commercial use under an AGPL
-# license; For commercial users or users who do not want to follow the AGPL
-# license, please contact us to obtain a separate license.
-#
-
-
-import argparse
-import os
-
-from pyfastqc import load_from_dir
-
-
-def parse_contam_file(contam_file, delimiter='\t'):
-    """Given a contaminant file, return a dictionary of contaminant sequences
-    names and their sequences.
-
-    Args:
-        contam_file -- path to contaminants file
-        delimiter -- ID, sequence delimiter in the contaminants file
-    """
-    assert os.path.exists(contam_file), "Contaminant file %r does not exist" % \
-            contam_file
-    with open(contam_file, 'r') as source:
-        # read only lines not beginning with '#' and discard empty lines
-        lines = filter(None, (line.strip() for line in source if not
-            line.startswith('#')))
-        # parse contam seq lines into lists of [id, sequence]
-        parse = lambda line: filter(None, line.split(delimiter))
-        parsed = (parse(line) for line in lines)
-        # and create a dictionary, key=sequence id and value=sequence
-        contam_ref = {name: seq for name, seq in parsed}
-
-    return contam_ref
-
-
-def get_contams_present(results_dir, contam_ref):
-    """Given a path to a FastQC HTML results file, return the <div> tag of the
-    overrepresented sequences list.
-
-    Args:
-        results_dir -- Path to FastQC results directory.
-    """
-    assert os.path.exists(results_dir), "Directory {0} not " \
-            "found.".format(results_dir)
-
-    fastqc = load_from_dir(results_dir)
-    contam_names = set([x[3] for x in fastqc.overrepresented_sequences.data])
-    in_sample = lambda rid: any([cid.startswith(rid) for cid in contam_names])
-    contams_present = {x: y for x, y in contam_ref.items() if in_sample(x)}
-
-    return contams_present
-
-
-if __name__ == '__main__':
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('results_dir', type=str, 
-            help='Path to FastQC result directory file')
-    parser.add_argument('-c', '--contam_file', type=str,
-            dest='contam_file',
-            help='Path to contaminant file')
-    parser.add_argument('-o', '--output', type=str,
-            dest='output',
-            help='Path to output file')
-    parser.add_argument('--seq-only',dest='seq_only',
-            action='store_true',
-            help='Whether to output contaminant sequences only or not')
-
-    args = parser.parse_args()
-
-    contam_ref = parse_contam_file(args.contam_file)
-    contam_ids = get_contams_present(args.results_dir, contam_ref)
-
-    if args.seq_only:
-       fmt_out = lambda cid, seq: seq
-    else:
-       fmt_out = lambda cid, seq: "{0}\t{1}".format(cid, seq)
-
-    if args.output is None:
-        for cid, seq in contam_ids.items():
-            print fmt_out(cid, seq)
-    else:
-        with open(args.output, 'w') as target:
-            for cid, seq in contam_ids.items():
-                target.write(fmt_out(cid, seq) + '\n')
--- a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/pyfastqc/__init__.py
+++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/pyfastqc/__init__.py
-#!/usr/bin/env python
-#
-# Biopet is built on top of GATK Queue for building bioinformatic
-# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
-# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
-# should also be able to execute Biopet tools and pipelines.
-#
-# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
-#
-# Contact us at: sasc@lumc.nl
-#
-# A dual licensing mode is applied. The source code within this project that are
-# not part of GATK Queue is freely available for non-commercial use under an AGPL
-# license; For commercial users or users who do not want to follow the AGPL
-# license, please contact us to obtain a separate license.
-#
-
-
-"""
--------
-pyfastqc
--------
-
-Parser for FastQC results.
-
-Provides a method and classes for parsing a FastQC run results.
-
-Tested with FastQC 0.9.5 output.
-
-"""
-
-import os
-
-
-def load_file(fname, mode='r', **kwargs):
-    """Given a path to a FastQC data file or an open file object pointing to it,
-    return a `FastQC` object.
-    """
-    if isinstance(fname, basestring):
-        with open(fname, mode, **kwargs) as fp:
-            return FastQC(fp)
-    else:
-        return FastQC(fname)
-
-
-def load_from_dir(dirname, data_fname='fastqc_data.txt', mode='r', **kwargs):
-    """Given a path to a FastQC results directory, return a `FastQC` object."""
-    assert os.path.exists(dirname), "Directory %r does not exist" % dirname
-    fqc_path = os.path.join(dirname, os.walk(dirname).next()[1][0], data_fname)
-    return load_file(fqc_path, mode, **kwargs)
-
-
-class FastQCModule(object):
-
-    """Class representing a FastQC analysis module."""
-
-    def __init__(self, raw_lines, end_mark='>>END_MODULE'):
-        self.raw_lines = raw_lines
-        self.end_mark = end_mark
-        self._status = None
-        self._name = None
-        self._data = self.parse()
-
-    def __repr__(self):
-        return '%s(%s)' % (self.__class__.__name__,
-                '[%r, ...]' % self.raw_lines[0])
-
-    def __str__(self):
-        return ''.join(self.raw_lines)
-
-    @property
-    def name(self):
-        """Name of the module."""
-        return self._name
-
-    @property
-    def columns(self):
-        """Columns in the module."""
-        return self._columns
-
-    @property
-    def data(self):
-        """FastQC data."""
-        return self._data
-
-    @property
-    def status(self):
-        """FastQC run status."""
-        return self._status
-
-    def parse(self):
-        """Common parser for a FastQC module."""
-        # check that the last line is a proper end mark
-        assert self.raw_lines[-1].startswith(self.end_mark)
-        # parse name and status from first line
-        tokens = self.raw_lines[0].strip().split('\t')
-        name = tokens[0][2:]
-        self._name = name
-        status = tokens[-1]
-        assert status in ('pass', 'fail', 'warn'), "Unknown module status: %r" \
-            % status
-        self._status = status
-        # and column names from second line
-        columns = self.raw_lines[1][1:].strip().split('\t')
-        self._columns = columns
-        # the rest of the lines except the last one
-        data = []
-        for line in self.raw_lines[2:-1]:
-            cols = line.strip().split('\t')
-            data.append(cols)
-
-        # optional processing for different modules
-        if self.name == 'Basic Statistics':
-            data = {k: v for k, v in data}
-
-        return data
-
-
-class FastQC(object):
-
-    """Class representing results from a FastQC run."""
-
-    # module name -- attribute name mapping
-    _mod_map = {
-        '>>Basic Statistics': 'basic_statistics',
-        '>>Per base sequence quality': 'per_base_sequence_quality',
-        '>>Per sequence quality scores': 'per_sequence_quality_scores',
-        '>>Per base sequence content': 'per_base_sequence_content',
-        '>>Per base GC content': 'per_base_gc_content',
-        '>>Per sequence GC content': 'per_sequence_gc_content',
-        '>>Per base N content': 'per_base_n_content',
-        '>>Sequence Length Distribution': 'sequence_length_distribution',
-        '>>Sequence Duplication Levels': 'sequence_duplication_levels',
-        '>>Overrepresented sequences': 'overrepresented_sequences',
-        '>>Kmer content': 'kmer_content',
-    }
-
-    def __init__(self, fp):
-        # get file name
-        self.fname = fp.name
-        self._modules = {}
-
-        line = fp.readline()
-        while True:
-
-            tokens = line.strip().split('\t')
-            # break on EOF
-            if not line:
-                break
-            # parse version
-            elif line.startswith('##FastQC'):
-                self.version = line.strip().split()[1]
-            # parse individual modules
-            elif tokens[0] in self._mod_map:
-                attr = self._mod_map[tokens[0]]
-                raw_lines = self.read_module(fp, line, tokens[0])
-                self._modules[attr] = FastQCModule(raw_lines)
-
-            line = fp.readline()
-
-    def __repr__(self):
-        return '%s(%r)' % (self.__class__.__name__, self.fname)
-
-    def _filter_by_status(self, status):
-        return [x.name for x in self._modules.values() if x.status == status]
-
-    def read_module(self, fp, line, start_mark):
-        raw = [line]
-        while not line.startswith('>>END_MODULE'):
-            line = fp.readline()
-            raw.append(line)
-
-            if not line:
-                raise ValueError("Unexpected end of file in module %r" % line)
-
-        return raw
-
-    @property
-    def modules(self):
-        return self._modules
-
-    @property
-    def passes(self):
-        return self._filter_by_status('pass')
-
-    @property
-    def passes_num(self):
-        return len(self.passes)
-
-    @property
-    def warns(self):
-        return self._filter_by_status('warn')
-
-    @property
-    def warns_num(self):
-        return len(self.warns)
-
-    @property
-    def fails(self):
-        return self._filter_by_status('fail')
-
-    @property
-    def fails_num(self):
-        return len(self.fails)
-
-    @property
-    def basic_statistics(self):
-        return self._modules['basic_statistics']
-
-    @property
-    def per_base_sequence_quality(self):
-        return self._modules['per_base_sequence_quality']
-
-    @property
-    def per_sequence_quality_scores(self):
-        return self._modules['per_sequence_quality_scores']
-
-    @property
-    def per_base_sequence_content(self):
-        return self._modules['per_base_sequence_content']
-
-    @property
-    def per_base_gc_content(self):
-        return self._modules['per_base_gc_content']
-
-    @property
-    def per_sequence_gc_content(self):
-        return self._modules['per_sequence_gc_content']
-
-    @property
-    def per_base_n_content(self):
-        return self._modules['per_base_n_content']
-
-    @property
-    def sequence_length_distribution(self):
-        return self._modules['sequence_length_distribution']
-
-    @property
-    def sequence_duplication_levels(self):
-        return self._modules['sequence_duplication_levels']
-
-    @property
-    def overrepresented_sequences(self):
-        return self._modules['overrepresented_sequences']
-
-    @property
-    def kmer_content(self):
-        return self._modules['kmer_content']
--- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/scripts/FastqcToContams.scala
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/scripts/FastqcToContams.scala
-/**
- * Biopet is built on top of GATK Queue for building bioinformatic
- * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
- * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
- * should also be able to execute Biopet tools and pipelines.
- *
- * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
- *
- * Contact us at: sasc@lumc.nl
- *
- * A dual licensing mode is applied. The source code within this project that are
- * not part of GATK Queue is freely available for non-commercial use under an AGPL
- * license; For commercial users or users who do not want to follow the AGPL
- * license, please contact us to obtain a separate license.
- */
-package nl.lumc.sasc.biopet.scripts
-
-import java.io.File
-
-import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
-
-import nl.lumc.sasc.biopet.core.config.Configurable
-import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
-
-class FastqcToContams(val root: Configurable) extends PythonCommandLineFunction {
-  setPythonScript("__init__.py", "pyfastqc/")
-  setPythonScript("fastqc_contam.py")
-
-  @Input(doc = "Fastqc output", shortName = "fastqc", required = true)
-  var fastqc_output: File = _
-
-  @Input(doc = "Contams input", shortName = "fastqc", required = false)
-  var contams_file: File = _
-
-  @Output(doc = "Output file", shortName = "out", required = true)
-  var out: File = _
-
-  def cmdLine = {
-    getPythonCommand +
-      required(fastqc_output.getParent()) +
-      required("-c", contams_file) +
-      " > " +
-      required(out)
-  }
-}
--- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala
+++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala
@@ -28,12 +28,15 @@ import nl.lumc.sasc.biopet.core.config.Configurable
 import scala.collection.mutable.Map

 class Cutadapt(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Cutadapt(root) {
-  @Input(doc = "Fastq contams file", required = false)
-  var contams_file: File = _
+  var fastqc: Fastqc = _

  override def beforeCmd() {
    super.beforeCmd
-    getContamsFromFile
+
+    val foundAdapters = fastqc.getFoundAdapters.map(_.seq)
+    if (default_clip_mode == "3") opt_adapter ++= foundAdapters
+    else if (default_clip_mode == "5") opt_front ++= foundAdapters
+    else if (default_clip_mode == "both") opt_anywhere ++= foundAdapters
  }

  override def cmdLine = {
@@ -46,24 +49,6 @@ class Cutadapt(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Cutada
    }
  }

-  def getContamsFromFile {
-    if (contams_file != null) {
-      if (contams_file.exists()) {
-        for (line <- Source.fromFile(contams_file).getLines) {
-          var s: String = line.substring(line.lastIndexOf("\t") + 1, line.size)
-          if (default_clip_mode == "3") opt_adapter += s
-          else if (default_clip_mode == "5") opt_front += s
-          else if (default_clip_mode == "both") opt_anywhere += s
-          else {
-            opt_adapter += s
-            logger.warn("Option default_clip_mode should be '3', '5' or 'both', falling back to default: '3'")
-          }
-          logger.info("Adapter: " + s + " found in: " + fastq_input)
-        }
-      } else logger.warn("File : " + contams_file + " does not exist")
-    }
-  }
-
  def getSummary: Json = {
    val trimR = """.*Trimmed reads: *(\d*) .*""".r
    val tooShortR = """.*Too short reads: *(\d*) .*""".r

--- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala
+++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala
@@ -13,11 +13,6 @@
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
-/*
- * To change this license header, choose License Headers in Project Properties.
- * To change this template file, choose Tools | Templates
- * and open the template in the editor.
- */

 package nl.lumc.sasc.biopet.pipelines.flexiprep

@@ -52,6 +47,30 @@ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(r
    return null // Could be default Sanger with a warning in the log
  }

+  protected case class Sequence(name: String, seq: String)
+  def getFoundAdapters: List[Sequence] = {
+    def getSeqs(file: File) = {
+      if (file != null) {
+        (for (
+          line <- Source.fromFile(file).getLines(); if line.startsWith("#");
+          values = line.split("\t*") if values.size >= 2
+        ) yield Sequence(values(0), values(1))).toList
+      } else Nil
+    }
+
+    val seqs = getSeqs(adapters) ::: getSeqs(contaminants)
+
+    val block = getDataBlock("Overrepresented sequences")
+    if (block == null) return Nil
+
+    val found = for (
+      line <- block if !line.startsWith("#");
+      values = line.split("\t") if values.size >= 4
+    ) yield values(3)
+
+    seqs.filter(x => found.exists(_.startsWith(x.name)))
+  }
+
  def getSummary: Json = {
    val subfixs = Map("plot_duplication_levels" -> "Images/duplication_levels.png",
      "plot_kmer_profiles" -> "Images/kmer_profiles.png",

--- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala
+++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala
@@ -21,7 +21,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
 import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
 import nl.lumc.sasc.biopet.core.config.Configurable
 import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat, Seqstat }
-import nl.lumc.sasc.biopet.scripts.{ FastqSync, FastqcToContams }
+import nl.lumc.sasc.biopet.scripts.{ FastqSync }

 class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
  def this() = this(null)
@@ -101,7 +101,6 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
    add(fastqc_R1)
    summary.addFastqc(fastqc_R1)
    outputFiles += ("fastqc_R1" -> fastqc_R1.output)
-    outputFiles += ("contams_R1" -> getContams(fastqc_R1, R1_name))

    val md5sum_R1 = Md5sum(this, input_R1, outputDir)
    add(md5sum_R1)
@@ -112,7 +111,6 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
      add(fastqc_R2)
      summary.addFastqc(fastqc_R2, R2 = true)
      outputFiles += ("fastqc_R2" -> fastqc_R2.output)
-      outputFiles += ("contams_R2" -> getContams(fastqc_R2, R2_name))

      val md5sum_R2 = Md5sum(this, input_R2, outputDir)
      add(md5sum_R2)
@@ -120,15 +118,6 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
    }
  }

-  def getContams(fastqc: Fastqc, pairname: String): File = {
-    val fastqcToContams = new FastqcToContams(this)
-    fastqcToContams.fastqc_output = fastqc.output
-    fastqcToContams.out = new File(outputDir + pairname + ".contams.txt")
-    fastqcToContams.contams_file = fastqc.contaminants
-    add(fastqcToContams)
-    return fastqcToContams.out
-  }
-
  def runTrimClip(R1_in: File, outDir: String, chunk: String): (File, File, List[File]) = {
    runTrimClip(R1_in, new File(""), outDir, chunk)
  }
@@ -146,13 +135,13 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
    var R2: File = new File(R2_in)
    var deps: List[File] = if (paired) List(R1, R2) else List(R1)

-    val seqtkSeq_R1 = SeqtkSeq.apply(this, R1, swapExt(outDir, R1, R1_ext, ".sanger" + R1_ext), fastqc_R1)
+    val seqtkSeq_R1 = SeqtkSeq(this, R1, swapExt(outDir, R1, R1_ext, ".sanger" + R1_ext), fastqc_R1)
    add(seqtkSeq_R1, isIntermediate = true)
    R1 = seqtkSeq_R1.output
    deps ::= R1

    if (paired) {
-      val seqtkSeq_R2 = SeqtkSeq.apply(this, R2, swapExt(outDir, R2, R2_ext, ".sanger" + R2_ext), fastqc_R2)
+      val seqtkSeq_R2 = SeqtkSeq(this, R2, swapExt(outDir, R2, R2_ext, ".sanger" + R2_ext), fastqc_R2)
      add(seqtkSeq_R2, isIntermediate = true)
      R2 = seqtkSeq_R2.output
      deps ::= R2
@@ -171,7 +160,7 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
    if (!skipClip) { // Adapter clipping

      val cutadapt_R1 = Cutadapt(this, R1, swapExt(outDir, R1, R1_ext, ".clip" + R1_ext))
-      if (outputFiles.contains("contams_R1")) cutadapt_R1.contams_file = outputFiles("contams_R1")
+      cutadapt_R1.fastqc = fastqc_R1
      add(cutadapt_R1, isIntermediate = true)
      summary.addCutadapt(cutadapt_R1, R2 = false, chunk)
      R1 = cutadapt_R1.fastq_output
@@ -181,7 +170,7 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
      if (paired) {
        val cutadapt_R2 = Cutadapt(this, R2, swapExt(outDir, R2, R2_ext, ".clip" + R2_ext))
        outputFiles += ("cutadapt_R2_stats" -> cutadapt_R2.stats_output)
-        if (outputFiles.contains("contams_R2")) cutadapt_R2.contams_file = outputFiles("contams_R2")
+        cutadapt_R2.fastqc = fastqc_R2
        add(cutadapt_R2, isIntermediate = true)
        summary.addCutadapt(cutadapt_R2, R2 = true, chunk)
        R2 = cutadapt_R2.fastq_output