Commit 32d55edd authored by bow

Remove old sync script and its wrapper

parent de5cb181
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="EntryPointsManager">
    <entry_points version="2.0" />
  </component>
  <component name="MavenProjectsManager">
    <option name="originalFiles">
      <list>
......
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support the LUMC SHARK cluster, which
# runs SGE, but other types of HPC that are supported by GATK Queue (such as
# PBS) should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that
# is not part of GATK Queue is freely available for non-commercial use under
# an AGPL license; for commercial users, or users who do not want to follow
# the AGPL license, please contact us to obtain a separate license.
#
"""
(Re-)sync two filtered paired end FASTQ files.
Given two filtered paired end read files and one of the original read files,
re-sync the filtered reads by filtering out anything that is only present in
one of the two files.
Usage:
{command} <orig.fq> <reads_1.fq> <reads_2.fq> \\
<reads_1.synced.fq> <reads_2.synced.fq>
The synced reads are written to disk as <reads_1.synced.fq> and
<reads_2.synced.fq>. Afterwards some counts are printed.
Both Illumina old-style and new-style paired-end header lines are supported.
The original read file is used to speed up processing: it contains all
possible reads from both edited reads (in all files in the same order) so it
can process all files line by line, not having to read a single file in
memory. Some ideas were taken from [1].
[1] https://gist.github.com/588841/
2011-11-03, Martijn Vermaat <m.vermaat.hg@lumc.nl>
"""
import sys
import re


def sync_paired_end_reads(original, reads_a, reads_b, synced_a, synced_b):
    """
    Filter out reads from two paired-end read files that are not present in
    both of them. Do this in a reasonable amount of time by using a file
    containing all of the reads for one of the paired ends.

    All arguments are open file handles.

    @arg original: File containing all original reads for one of the paired
        ends.
    @arg reads_a: First of the paired-end read files.
    @arg reads_b: Second of the paired-end read files.
    @arg synced_a: Filtered reads from the first paired-end read file.
    @arg synced_b: Filtered reads from the second paired-end read file.

    @return: Triple (filtered_a, filtered_b, kept) containing counts of the
        number of reads filtered from both input files and the total number
        of reads kept in the synced results.

    @todo: Print warnings if obvious things are not right (a or b still has
        lines after original is processed).
    """
    # This matches 1, 2, or 3 preceded by /, _, or whitespace. Its rightmost
    # match in a header line is used to identify the read pair.
    sep = re.compile(r'[\s_/][123]')

    def next_record(fh):
        # A FASTQ record is four lines: header, sequence, plus line, quals.
        return [fh.readline().strip() for i in range(4)]

    def head(record):
        # Pair identifier: the header split on sep, minus the last piece.
        return sep.split(record[0])[:-1]

    # Pair identifiers of all original reads, in order (every fourth line).
    headers = (sep.split(x.strip())[:-1] for i, x in enumerate(original)
               if not (i % 4))

    filtered_a = filtered_b = kept = 0

    a, b = next_record(reads_a), next_record(reads_b)
    for header in headers:
        if header == head(a) and head(b) != header:
            a = next_record(reads_a)
            filtered_a += 1
        if header == head(b) and head(a) != header:
            b = next_record(reads_b)
            filtered_b += 1
        if header == head(a) == head(b):
            print >>synced_a, '\n'.join(a)
            print >>synced_b, '\n'.join(b)
            a, b = next_record(reads_a), next_record(reads_b)
            kept += 1

    return filtered_a, filtered_b, kept


if __name__ == '__main__':
    if len(sys.argv) < 6:
        sys.stderr.write(__doc__.split('\n\n\n')[0].strip().format(
            command=sys.argv[0]) + '\n')
        sys.exit(1)
    try:
        original = open(sys.argv[1], 'r')
        reads_a = open(sys.argv[2], 'r')
        reads_b = open(sys.argv[3], 'r')
        synced_a = open(sys.argv[4], 'w')
        synced_b = open(sys.argv[5], 'w')
        filtered_a, filtered_b, kept = \
            sync_paired_end_reads(original, reads_a, reads_b,
                                  synced_a, synced_b)
        print 'Filtered %i reads from first read file.' % filtered_a
        print 'Filtered %i reads from second read file.' % filtered_b
        print 'Synced read files contain %i reads.' % kept
    except IOError as (_, message):
        sys.stderr.write('Error: %s\n' % message)
        sys.exit(1)
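
A note on the header matching above: head() keys each record on its header
split at every separator-plus-pair-number match, with the trailing piece
dropped, so mates pair up under both old-style /1, /2 suffixes and new-style
space-separated pair fields. A minimal sketch of that behavior (not part of
the removed script; the headers are invented examples), runnable under
Python 2 or 3:

import re

sep = re.compile(r'[\s_/][123]')

def pair_key(header):
    # Same trick as head() above: split on each separator+digit match and
    # drop the last piece; two mates share the remaining prefix pieces.
    return sep.split(header)[:-1]

# Old-style Illumina headers: pair number as a /1 or /2 suffix.
old_1 = '@HWUSI-EAS100R:6:73:941:1973#0/1'
old_2 = '@HWUSI-EAS100R:6:73:941:1973#0/2'
print(pair_key(old_1) == pair_key(old_2))  # True

# New-style (CASAVA 1.8+) headers: pair number after a space.
new_1 = '@EAS139:136:FC706VJ:2:2104:15343:197393 1:N:0:ATCACG'
new_2 = '@EAS139:136:FC706VJ:2:2104:15343:197393 2:N:0:ATCACG'
print(pair_key(new_1) == pair_key(new_2))  # True
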
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support the LUMC SHARK cluster, which
 * runs SGE, but other types of HPC that are supported by GATK Queue (such as
 * PBS) should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that
 * is not part of GATK Queue is freely available for non-commercial use under
 * an AGPL license; for commercial users, or users who do not want to follow
 * the AGPL license, please contact us to obtain a separate license.
 */
package nl.lumc.sasc.biopet.scripts

import java.io.File

import org.broadinstitute.gatk.utils.commandline.{ Input, Output }

import argonaut._, Argonaut._
import scalaz._, Scalaz._

import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction

import scala.io.Source

class FastqSync(val root: Configurable) extends PythonCommandLineFunction {
  setPythonScript("sync_paired_end_reads.py")

  @Input(doc = "Start fastq")
  var input_start_fastq: File = _

  @Input(doc = "R1 input")
  var input_R1: File = _

  @Input(doc = "R2 input")
  var input_R2: File = _

  @Output(doc = "R1 output")
  var output_R1: File = _

  @Output(doc = "R2 output")
  var output_R2: File = _

  // No @Output annotation on this file: it is filled by redirecting the
  // script's stdout (see the " > " in cmdLine below).
  var output_stats: File = _
  def cmdLine = {
    getPythonCommand +
      required(input_start_fastq) +
      required(input_R1) +
      required(input_R2) +
      required(output_R1) +
      required(output_R2) +
      " > " +
      required(output_stats)
  }
  def getSummary: Json = {
    val R1_filteredR = """Filtered (\d*) reads from first read file.""".r
    val R2_filteredR = """Filtered (\d*) reads from second read file.""".r
    val readsLeftR = """Synced read files contain (\d*) reads.""".r
    var R1_filtered = 0
    var R2_filtered = 0
    var readsLeft = 0
    if (output_stats.exists) for (line <- Source.fromFile(output_stats).getLines) {
      line match {
        case R1_filteredR(m) => R1_filtered = m.toInt
        case R2_filteredR(m) => R2_filtered = m.toInt
        case readsLeftR(m)   => readsLeft = m.toInt
        case _               =>
      }
    }
    ("num_reads_discarded_R1" := R1_filtered) ->:
      ("num_reads_discarded_R2" := R2_filtered) ->:
      ("num_reads_kept" := readsLeft) ->:
      jEmptyObject
  }
}
object FastqSync {
  def apply(root: Configurable, input_start_fastq: File, input_R1: File, input_R2: File,
            output_R1: File, output_R2: File, output_stats: File): FastqSync = {
    val fastqSync = new FastqSync(root)
    fastqSync.input_start_fastq = input_start_fastq
    fastqSync.input_R1 = input_R1
    fastqSync.input_R2 = input_R2
    fastqSync.output_R1 = output_R1
    fastqSync.output_R2 = output_R2
    fastqSync.output_stats = output_stats
    fastqSync
  }

  def mergeSummaries(jsons: List[Json]): Json = {
    var R1_filtered = 0
    var R2_filtered = 0
    var readsLeft = 0
    for (json <- jsons) {
      R1_filtered += json.field("num_reads_discarded_R1").get.numberOrZero.toInt
      R2_filtered += json.field("num_reads_discarded_R2").get.numberOrZero.toInt
      readsLeft += json.field("num_reads_kept").get.numberOrZero.toInt
    }
    ("num_reads_discarded_R1" := R1_filtered) ->:
      ("num_reads_discarded_R2" := R2_filtered) ->:
      ("num_reads_kept" := readsLeft) ->:
      jEmptyObject
  }
}
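
For reference, everything the wrapper does around the script boils down to:
run sync_paired_end_reads.py with five positional arguments, capture its
stdout in a stats file, and pull the three counts back out with the same
patterns used in getSummary. A minimal stand-alone sketch of that flow in
Python (the file paths are hypothetical, and a Python 2 interpreter is
assumed since the script itself uses Python 2 print statements):

import re
import subprocess

# Hypothetical inputs; the wrapper passes these in the same order.
cmd = ['python', 'sync_paired_end_reads.py',
       'orig.fq', 'reads_1.fq', 'reads_2.fq',
       'reads_1.synced.fq', 'reads_2.synced.fq']

# Equivalent of cmdLine's " > " redirection into output_stats.
with open('sync.stats', 'w') as stats:
    subprocess.check_call(cmd, stdout=stats)

# Equivalent of getSummary: match each stats line against the three
# messages printed by the script and collect the counts.
patterns = {
    'num_reads_discarded_R1': re.compile(r'Filtered (\d+) reads from first read file\.'),
    'num_reads_discarded_R2': re.compile(r'Filtered (\d+) reads from second read file\.'),
    'num_reads_kept': re.compile(r'Synced read files contain (\d+) reads\.'),
}
summary = dict.fromkeys(patterns, 0)
with open('sync.stats') as stats:
    for line in stats:
        for key, pattern in patterns.items():
            match = pattern.match(line.strip())
            if match:
                summary[key] = int(match.group(1))
print(summary)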