Commit 32d55edd authored by bow

Remove old sync script and its wrapper

parent de5cb181
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="EntryPointsManager">
<entry_points version="2.0" />
</component>
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
......
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatics
# pipelines. It is mainly intended to support the LUMC SHARK cluster, which
# runs SGE, but other HPC systems supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that
# is not part of GATK Queue is freely available for non-commercial use under
# an AGPL license; commercial users, or users who do not want to follow the
# AGPL license, should contact us to obtain a separate license.
#
"""
(Re-)sync two filtered paired-end FASTQ files.

Given two filtered paired-end read files and one of the original read files,
re-sync the filtered reads by filtering out anything that is present in only
one of the two files.

Usage:
  {command} <orig.fq> <reads_1.fq> <reads_2.fq> \\
      <reads_1.synced.fq> <reads_2.synced.fq>

The synced reads are written to disk as <reads_1.synced.fq> and
<reads_2.synced.fq>. Afterwards, some counts are printed.


Both Illumina old-style and new-style paired-end header lines are supported.

The original read file is used to speed up processing: it contains every
read that can occur in either filtered file, in the same order as in all the
files, so everything can be processed line by line without reading any file
into memory as a whole. Some ideas were taken from [1].

[1] https://gist.github.com/588841/

2011-11-03, Martijn Vermaat <m.vermaat.hg@lumc.nl>
"""
import sys
import re

def sync_paired_end_reads(original, reads_a, reads_b, synced_a, synced_b):
    """
    Filter out reads from two paired-end read files that are not present in
    both of them. Do this in a reasonable amount of time by using a file
    containing all of the reads for one of the paired ends.

    All arguments are open file handles.

    @arg original: File containing all original reads for one of the paired
                   ends.
    @arg reads_a: First of the two paired-end read files.
    @arg reads_b: Second of the two paired-end read files.
    @arg synced_a: Filtered reads from the first paired-end read file.
    @arg synced_b: Filtered reads from the second paired-end read file.

    @return: Triple (filtered_a, filtered_b, kept) containing counts of the
             number of reads filtered from both input files and the total
             number of reads kept in the synced results.

    @todo: Print warnings if obvious things are not right (e.g. reads_a or
           reads_b still has lines after original is processed).
    """
    # This matches 1, 2, or 3 preceded by /, _, or whitespace. Its rightmost
    # match in a header line is used to identify the read pair.
    sep = re.compile(r'[\s_/][123]')
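    # For example (hypothetical header lines), everything before the
    # rightmost match identifies the fragment:
    #   sep.split('@HWUSI-EAS100R:6:73:941:1973#0/1')[:-1]
    #     -> ['@HWUSI-EAS100R:6:73:941:1973#0']
    #   sep.split('@EAS139:136:FC706VJ:2:5:1000:12850 1:Y:18:ATCACG')[:-1]
    #     -> ['@EAS139:136:FC706VJ:2:5:1000:12850']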
    def next_record(fh):
        return [fh.readline().strip() for i in range(4)]

    def head(record):
        return sep.split(record[0])[:-1]

    headers = (sep.split(x.strip())[:-1] for i, x in enumerate(original)
               if not (i % 4))

    filtered_a = filtered_b = kept = 0
    a, b = next_record(reads_a), next_record(reads_b)

    for header in headers:
        if header == head(a) and head(b) != header:
            a = next_record(reads_a)
            filtered_a += 1
        if header == head(b) and head(a) != header:
            b = next_record(reads_b)
            filtered_b += 1
        if header == head(a) == head(b):
            print >>synced_a, '\n'.join(a)
            print >>synced_b, '\n'.join(b)
            a, b = next_record(reads_a), next_record(reads_b)
            kept += 1

    return filtered_a, filtered_b, kept

if __name__ == '__main__':
    if len(sys.argv) < 6:
        sys.stderr.write(__doc__.split('\n\n\n')[0].strip().format(
            command=sys.argv[0]) + '\n')
        sys.exit(1)
    try:
        original = open(sys.argv[1], 'r')
        reads_a = open(sys.argv[2], 'r')
        reads_b = open(sys.argv[3], 'r')
        synced_a = open(sys.argv[4], 'w')
        synced_b = open(sys.argv[5], 'w')
        filtered_a, filtered_b, kept = \
            sync_paired_end_reads(original, reads_a, reads_b,
                                  synced_a, synced_b)
        print 'Filtered %i reads from first read file.' % filtered_a
        print 'Filtered %i reads from second read file.' % filtered_b
        print 'Synced read files contain %i reads.' % kept
    except IOError as (_, message):
        sys.stderr.write('Error: %s\n' % message)
        sys.exit(1)
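
A minimal usage sketch (an illustration, not part of the removed script): it
exercises sync_paired_end_reads on in-memory data, assuming the file is
importable as the module sync_paired_end_reads and runs under the Python 2
interpreter it targets; the read headers are hypothetical.

    from StringIO import StringIO
    from sync_paired_end_reads import sync_paired_end_reads

    # reads_b lost read r1 during filtering, so r1 must also be dropped
    # from reads_a to keep the two files in sync.
    original = StringIO('@r1/1\nACGT\n+\nIIII\n@r2/1\nTTTT\n+\nIIII\n')
    reads_a = StringIO('@r1/1\nACGT\n+\nIIII\n@r2/1\nTTTT\n+\nIIII\n')
    reads_b = StringIO('@r2/2\nAAAA\n+\nIIII\n')
    synced_a, synced_b = StringIO(), StringIO()

    print sync_paired_end_reads(original, reads_a, reads_b, synced_a, synced_b)
    # -> (1, 0, 1): one read filtered from reads_a, none from reads_b,
    #    and one pair kept in the synced output.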
/**
 * Biopet is built on top of GATK Queue for building bioinformatics
 * pipelines. It is mainly intended to support the LUMC SHARK cluster, which
 * runs SGE, but other HPC systems supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
 * A dual licensing mode is applied. The source code within this project that
 * is not part of GATK Queue is freely available for non-commercial use under
 * an AGPL license; commercial users, or users who do not want to follow the
 * AGPL license, should contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.scripts
import java.io.File
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import argonaut._, Argonaut._
import scalaz._, Scalaz._
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
import scala.io.Source
class FastqSync(val root: Configurable) extends PythonCommandLineFunction {
  setPythonScript("sync_paired_end_reads.py")

  @Input(doc = "Start fastq")
  var input_start_fastq: File = _

  @Input(doc = "R1 input")
  var input_R1: File = _

  @Input(doc = "R2 input")
  var input_R2: File = _

  @Output(doc = "R1 output")
  var output_R1: File = _

  @Output(doc = "R2 output")
  var output_R2: File = _

  // No @Output annotation, so Queue does not track this stats file as an output.
  var output_stats: File = _
  def cmdLine = {
    getPythonCommand +
      required(input_start_fastq) +
      required(input_R1) +
      required(input_R2) +
      required(output_R1) +
      required(output_R2) +
      " > " +
      required(output_stats)
  }
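
  // For reference, the assembled command looks roughly like (names illustrative):
  //   python sync_paired_end_reads.py orig.fq r1.fq r2.fq r1.synced.fq r2.synced.fq > stats.txt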
  def getSummary: Json = {
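    // These three patterns match the count lines the Python script prints on
    // stdout, e.g. "Filtered 10 reads from first read file."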
    val R1_filteredR = """Filtered (\d*) reads from first read file.""".r
    val R2_filteredR = """Filtered (\d*) reads from second read file.""".r
    val readsLeftR = """Synced read files contain (\d*) reads.""".r

    var R1_filtered = 0
    var R2_filtered = 0
    var readsLeft = 0

    if (output_stats.exists) for (line <- Source.fromFile(output_stats).getLines) {
      line match {
        case R1_filteredR(m) => R1_filtered = m.toInt
        case R2_filteredR(m) => R2_filtered = m.toInt
        case readsLeftR(m)   => readsLeft = m.toInt
        case _               =>
      }
    }

    ("num_reads_discarded_R1" := R1_filtered) ->:
      ("num_reads_discarded_R2" := R2_filtered) ->:
      ("num_reads_kept" := readsLeft) ->:
      jEmptyObject
  }
}
object FastqSync {
  def apply(root: Configurable, input_start_fastq: File, input_R1: File, input_R2: File,
            output_R1: File, output_R2: File, output_stats: File): FastqSync = {
    val fastqSync = new FastqSync(root)
    fastqSync.input_start_fastq = input_start_fastq
    fastqSync.input_R1 = input_R1
    fastqSync.input_R2 = input_R2
    fastqSync.output_R1 = output_R1
    fastqSync.output_R2 = output_R2
    fastqSync.output_stats = output_stats
    fastqSync
  }
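
  // Sums the counts from several FastqSync summaries into a single combined
  // summary object with the same three fields.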
  def mergeSummaries(jsons: List[Json]): Json = {
    var R1_filtered = 0
    var R2_filtered = 0
    var readsLeft = 0

    for (json <- jsons) {
      R1_filtered += json.field("num_reads_discarded_R1").get.numberOrZero.toInt
      R2_filtered += json.field("num_reads_discarded_R2").get.numberOrZero.toInt
      readsLeft += json.field("num_reads_kept").get.numberOrZero.toInt
    }

    ("num_reads_discarded_R1" := R1_filtered) ->:
      ("num_reads_discarded_R2" := R2_filtered) ->:
      ("num_reads_kept" := readsLeft) ->:
      jEmptyObject
  }
}
\ No newline at end of file