Remove old script

1f864153 · Peter van 't Hof · 020ffb1f · 020ffb1f · 020ffb1f · 020ffb1f
Commit 1f864153 authored 10 years ago by Peter van 't Hof
--- a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/fastqc_contam.py
+++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/fastqc_contam.py
-#!/usr/bin/env python
-#
-# Biopet is built on top of GATK Queue for building bioinformatic
-# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
-# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
-# should also be able to execute Biopet tools and pipelines.
-#
-# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
-#
-# Contact us at: sasc@lumc.nl
-#
-# A dual licensing mode is applied. The source code within this project that are
-# not part of GATK Queue is freely available for non-commercial use under an AGPL
-# license; For commercial users or users who do not want to follow the AGPL
-# license, please contact us to obtain a separate license.
-#
-
-
-import argparse
-import os
-
-from pyfastqc import load_from_dir
-
-
-def parse_contam_file(contam_file, delimiter='\t'):
-    """Given a contaminant file, return a dictionary of contaminant sequences
-    names and their sequences.
-
-    Args:
-        contam_file -- path to contaminants file
-        delimiter -- ID, sequence delimiter in the contaminants file
-    """
-    assert os.path.exists(contam_file), "Contaminant file %r does not exist" % \
-            contam_file
-    with open(contam_file, 'r') as source:
-        # read only lines not beginning with '#' and discard empty lines
-        lines = filter(None, (line.strip() for line in source if not
-            line.startswith('#')))
-        # parse contam seq lines into lists of [id, sequence]
-        parse = lambda line: filter(None, line.split(delimiter))
-        parsed = (parse(line) for line in lines)
-        # and create a dictionary, key=sequence id and value=sequence
-        contam_ref = {name: seq for name, seq in parsed}
-
-    return contam_ref
-
-
-def get_contams_present(results_dir, contam_ref):
-    """Given a path to a FastQC HTML results file, return the <div> tag of the
-    overrepresented sequences list.
-
-    Args:
-        results_dir -- Path to FastQC results directory.
-    """
-    assert os.path.exists(results_dir), "Directory {0} not " \
-            "found.".format(results_dir)
-
-    fastqc = load_from_dir(results_dir)
-    contam_names = set([x[3] for x in fastqc.overrepresented_sequences.data])
-    in_sample = lambda rid: any([cid.startswith(rid) for cid in contam_names])
-    contams_present = {x: y for x, y in contam_ref.items() if in_sample(x)}
-
-    return contams_present
-
-
-if __name__ == '__main__':
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('results_dir', type=str, 
-            help='Path to FastQC result directory file')
-    parser.add_argument('-c', '--contam_file', type=str,
-            dest='contam_file',
-            help='Path to contaminant file')
-    parser.add_argument('-o', '--output', type=str,
-            dest='output',
-            help='Path to output file')
-    parser.add_argument('--seq-only',dest='seq_only',
-            action='store_true',
-            help='Whether to output contaminant sequences only or not')
-
-    args = parser.parse_args()
-
-    contam_ref = parse_contam_file(args.contam_file)
-    contam_ids = get_contams_present(args.results_dir, contam_ref)
-
-    if args.seq_only:
-       fmt_out = lambda cid, seq: seq
-    else:
-       fmt_out = lambda cid, seq: "{0}\t{1}".format(cid, seq)
-
-    if args.output is None:
-        for cid, seq in contam_ids.items():
-            print fmt_out(cid, seq)
-    else:
-        with open(args.output, 'w') as target:
-            for cid, seq in contam_ids.items():
-                target.write(fmt_out(cid, seq) + '\n')
--- a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/pyfastqc/__init__.py
+++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/pyfastqc/__init__.py
-#!/usr/bin/env python
-#
-# Biopet is built on top of GATK Queue for building bioinformatic
-# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
-# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
-# should also be able to execute Biopet tools and pipelines.
-#
-# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
-#
-# Contact us at: sasc@lumc.nl
-#
-# A dual licensing mode is applied. The source code within this project that are
-# not part of GATK Queue is freely available for non-commercial use under an AGPL
-# license; For commercial users or users who do not want to follow the AGPL
-# license, please contact us to obtain a separate license.
-#
-
-
-"""
--------
-pyfastqc
--------
-
-Parser for FastQC results.
-
-Provides a method and classes for parsing a FastQC run results.
-
-Tested with FastQC 0.9.5 output.
-
-"""
-
-import os
-
-
-def load_file(fname, mode='r', **kwargs):
-    """Given a path to a FastQC data file or an open file object pointing to it,
-    return a `FastQC` object.
-    """
-    if isinstance(fname, basestring):
-        with open(fname, mode, **kwargs) as fp:
-            return FastQC(fp)
-    else:
-        return FastQC(fname)
-
-
-def load_from_dir(dirname, data_fname='fastqc_data.txt', mode='r', **kwargs):
-    """Given a path to a FastQC results directory, return a `FastQC` object."""
-    assert os.path.exists(dirname), "Directory %r does not exist" % dirname
-    fqc_path = os.path.join(dirname, os.walk(dirname).next()[1][0], data_fname)
-    return load_file(fqc_path, mode, **kwargs)
-
-
-class FastQCModule(object):
-
-    """Class representing a FastQC analysis module."""
-
-    def __init__(self, raw_lines, end_mark='>>END_MODULE'):
-        self.raw_lines = raw_lines
-        self.end_mark = end_mark
-        self._status = None
-        self._name = None
-        self._data = self.parse()
-
-    def __repr__(self):
-        return '%s(%s)' % (self.__class__.__name__,
-                '[%r, ...]' % self.raw_lines[0])
-
-    def __str__(self):
-        return ''.join(self.raw_lines)
-
-    @property
-    def name(self):
-        """Name of the module."""
-        return self._name
-
-    @property
-    def columns(self):
-        """Columns in the module."""
-        return self._columns
-
-    @property
-    def data(self):
-        """FastQC data."""
-        return self._data
-
-    @property
-    def status(self):
-        """FastQC run status."""
-        return self._status
-
-    def parse(self):
-        """Common parser for a FastQC module."""
-        # check that the last line is a proper end mark
-        assert self.raw_lines[-1].startswith(self.end_mark)
-        # parse name and status from first line
-        tokens = self.raw_lines[0].strip().split('\t')
-        name = tokens[0][2:]
-        self._name = name
-        status = tokens[-1]
-        assert status in ('pass', 'fail', 'warn'), "Unknown module status: %r" \
-            % status
-        self._status = status
-        # and column names from second line
-        columns = self.raw_lines[1][1:].strip().split('\t')
-        self._columns = columns
-        # the rest of the lines except the last one
-        data = []
-        for line in self.raw_lines[2:-1]:
-            cols = line.strip().split('\t')
-            data.append(cols)
-
-        # optional processing for different modules
-        if self.name == 'Basic Statistics':
-            data = {k: v for k, v in data}
-
-        return data
-
-
-class FastQC(object):
-
-    """Class representing results from a FastQC run."""
-
-    # module name -- attribute name mapping
-    _mod_map = {
-        '>>Basic Statistics': 'basic_statistics',
-        '>>Per base sequence quality': 'per_base_sequence_quality',
-        '>>Per sequence quality scores': 'per_sequence_quality_scores',
-        '>>Per base sequence content': 'per_base_sequence_content',
-        '>>Per base GC content': 'per_base_gc_content',
-        '>>Per sequence GC content': 'per_sequence_gc_content',
-        '>>Per base N content': 'per_base_n_content',
-        '>>Sequence Length Distribution': 'sequence_length_distribution',
-        '>>Sequence Duplication Levels': 'sequence_duplication_levels',
-        '>>Overrepresented sequences': 'overrepresented_sequences',
-        '>>Kmer content': 'kmer_content',
-    }
-
-    def __init__(self, fp):
-        # get file name
-        self.fname = fp.name
-        self._modules = {}
-
-        line = fp.readline()
-        while True:
-
-            tokens = line.strip().split('\t')
-            # break on EOF
-            if not line:
-                break
-            # parse version
-            elif line.startswith('##FastQC'):
-                self.version = line.strip().split()[1]
-            # parse individual modules
-            elif tokens[0] in self._mod_map:
-                attr = self._mod_map[tokens[0]]
-                raw_lines = self.read_module(fp, line, tokens[0])
-                self._modules[attr] = FastQCModule(raw_lines)
-
-            line = fp.readline()
-
-    def __repr__(self):
-        return '%s(%r)' % (self.__class__.__name__, self.fname)
-
-    def _filter_by_status(self, status):
-        return [x.name for x in self._modules.values() if x.status == status]
-
-    def read_module(self, fp, line, start_mark):
-        raw = [line]
-        while not line.startswith('>>END_MODULE'):
-            line = fp.readline()
-            raw.append(line)
-
-            if not line:
-                raise ValueError("Unexpected end of file in module %r" % line)
-
-        return raw
-
-    @property
-    def modules(self):
-        return self._modules
-
-    @property
-    def passes(self):
-        return self._filter_by_status('pass')
-
-    @property
-    def passes_num(self):
-        return len(self.passes)
-
-    @property
-    def warns(self):
-        return self._filter_by_status('warn')
-
-    @property
-    def warns_num(self):
-        return len(self.warns)
-
-    @property
-    def fails(self):
-        return self._filter_by_status('fail')
-
-    @property
-    def fails_num(self):
-        return len(self.fails)
-
-    @property
-    def basic_statistics(self):
-        return self._modules['basic_statistics']
-
-    @property
-    def per_base_sequence_quality(self):
-        return self._modules['per_base_sequence_quality']
-
-    @property
-    def per_sequence_quality_scores(self):
-        return self._modules['per_sequence_quality_scores']
-
-    @property
-    def per_base_sequence_content(self):
-        return self._modules['per_base_sequence_content']
-
-    @property
-    def per_base_gc_content(self):
-        return self._modules['per_base_gc_content']
-
-    @property
-    def per_sequence_gc_content(self):
-        return self._modules['per_sequence_gc_content']
-
-    @property
-    def per_base_n_content(self):
-        return self._modules['per_base_n_content']
-
-    @property
-    def sequence_length_distribution(self):
-        return self._modules['sequence_length_distribution']
-
-    @property
-    def sequence_duplication_levels(self):
-        return self._modules['sequence_duplication_levels']
-
-    @property
-    def overrepresented_sequences(self):
-        return self._modules['overrepresented_sequences']
-
-    @property
-    def kmer_content(self):
-        return self._modules['kmer_content']
--- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/scripts/FastqcToContams.scala
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/scripts/FastqcToContams.scala
-/**
- * Biopet is built on top of GATK Queue for building bioinformatic
- * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
- * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
- * should also be able to execute Biopet tools and pipelines.
- *
- * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
- *
- * Contact us at: sasc@lumc.nl
- *
- * A dual licensing mode is applied. The source code within this project that are
- * not part of GATK Queue is freely available for non-commercial use under an AGPL
- * license; For commercial users or users who do not want to follow the AGPL
- * license, please contact us to obtain a separate license.
- */
-package nl.lumc.sasc.biopet.scripts
-
-import java.io.File
-
-import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
-
-import nl.lumc.sasc.biopet.core.config.Configurable
-import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
-
-class FastqcToContams(val root: Configurable) extends PythonCommandLineFunction {
-  setPythonScript("__init__.py", "pyfastqc/")
-  setPythonScript("fastqc_contam.py")
-
-  @Input(doc = "Fastqc output", shortName = "fastqc", required = true)
-  var fastqc_output: File = _
-
-  @Input(doc = "Contams input", shortName = "fastqc", required = false)
-  var contams_file: File = _
-
-  @Output(doc = "Output file", shortName = "out", required = true)
-  var out: File = _
-
-  def cmdLine = {
-    getPythonCommand +
-      required(fastqc_output.getParent()) +
-      required("-c", contams_file) +
-      " > " +
-      required(out)
-  }
-}
--- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala
+++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala
@@ -21,7 +21,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
 import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
 import nl.lumc.sasc.biopet.core.config.Configurable
 import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat, Seqstat }
-import nl.lumc.sasc.biopet.scripts.{ FastqSync, FastqcToContams }
+import nl.lumc.sasc.biopet.scripts.{ FastqSync }

 class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
  def this() = this(null)