Commit 32d55edd authored by bow

Remove old sync script and its wrapper

parent de5cb181
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="EntryPointsManager">
    <entry_points version="2.0" />
  </component>
  <component name="MavenProjectsManager">
    <option name="originalFiles">
      <list>
......
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support the LUMC SHARK cluster, which
# runs SGE, but other types of HPC that are supported by GATK Queue (such as
# PBS) should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that
# is not part of GATK Queue is freely available for non-commercial use under
# an AGPL license; for commercial users, or users who do not want to follow
# the AGPL license, please contact us to obtain a separate license.
#
"""
(Re-)sync two filtered paired end FASTQ files.
Given two filtered paired end read files and one of the original read files,
re-sync the filtered reads by filtering out anything that is only present in
one of the two files.
Usage:
{command} <orig.fq> <reads_1.fq> <reads_2.fq> \\
<reads_1.synced.fq> <reads_2.synced.fq>
The synced reads are written to disk as <reads_1.synced.fq> and
<reads_2.synced.fq>. Afterwards some counts are printed.
Both Illumina old-style and new-style paired-end header lines are supported.
The original read file is used to speed up processing: it contains all
possible reads from both edited reads (in all files in the same order) so it
can process all files line by line, not having to read a single file in
memory. Some ideas were taken from [1].
[1] https://gist.github.com/588841/
2011-11-03, Martijn Vermaat <m.vermaat.hg@lumc.nl>
"""
import sys
import re


def sync_paired_end_reads(original, reads_a, reads_b, synced_a, synced_b):
    """
    Filter out reads from two paired-end read files that are not present in
    both of them. Do this in a reasonable amount of time by using a file
    containing all of the reads for one of the paired ends.

    All arguments are open file handles.

    @arg original: File containing all original reads for one of the paired
        ends.
    @arg reads_a: First of the paired-end read files.
    @arg reads_b: Second of the paired-end read files.
    @arg synced_a: Filtered reads from the first paired-end read file.
    @arg synced_b: Filtered reads from the second paired-end read file.

    @return: Triple (filtered_a, filtered_b, kept) containing counts of the
        number of reads filtered from both input files and the total number
        of reads kept in the synced results.

    @todo: Print warnings if obvious things are not right (a or b still has
        lines after original is processed).
    """
    # This matches 1, 2, or 3 preceded by /, _, or whitespace. Its rightmost
    # match in a header line is used to identify the read pair.
    sep = re.compile(r'[\s_/][123]')

    def next_record(fh):
        # A FASTQ record is four lines: header, sequence, plus line, quals.
        return [fh.readline().strip() for i in range(4)]

    def head(record):
        # Pair identifier: the header split on sep, minus the last piece.
        return sep.split(record[0])[:-1]

    # Pair identifiers of all original reads, in order (every fourth line).
    headers = (sep.split(x.strip())[:-1] for i, x in enumerate(original)
               if not (i % 4))

    filtered_a = filtered_b = kept = 0

    a, b = next_record(reads_a), next_record(reads_b)
    for header in headers:
        if header == head(a) and head(b) != header:
            a = next_record(reads_a)
            filtered_a += 1
        if header == head(b) and head(a) != header:
            b = next_record(reads_b)
            filtered_b += 1
        if header == head(a) == head(b):
            print >>synced_a, '\n'.join(a)
            print >>synced_b, '\n'.join(b)
            a, b = next_record(reads_a), next_record(reads_b)
            kept += 1

    return filtered_a, filtered_b, kept


if __name__ == '__main__':
    if len(sys.argv) < 6:
        sys.stderr.write(__doc__.split('\n\n\n')[0].strip().format(
            command=sys.argv[0]) + '\n')
        sys.exit(1)
    try:
        original = open(sys.argv[1], 'r')
        reads_a = open(sys.argv[2], 'r')
        reads_b = open(sys.argv[3], 'r')
        synced_a = open(sys.argv[4], 'w')
        synced_b = open(sys.argv[5], 'w')
        filtered_a, filtered_b, kept = \
            sync_paired_end_reads(original, reads_a, reads_b,
                                  synced_a, synced_b)
        print 'Filtered %i reads from first read file.' % filtered_a
        print 'Filtered %i reads from second read file.' % filtered_b
        print 'Synced read files contain %i reads.' % kept
    except IOError as (_, message):
        sys.stderr.write('Error: %s\n' % message)
        sys.exit(1)
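
A note on the header matching above: head() keys each record on its header
split at every separator-plus-pair-number match, with the trailing piece
dropped, so mates pair up under both old-style /1, /2 suffixes and new-style
space-separated pair fields. A minimal sketch of that behavior (not part of
the removed script; the headers are invented examples), runnable under
Python 2 or 3:

import re

sep = re.compile(r'[\s_/][123]')

def pair_key(header):
    # Same trick as head() above: split on each separator+digit match and
    # drop the last piece; two mates share the remaining prefix pieces.
    return sep.split(header)[:-1]

# Old-style Illumina headers: pair number as a /1 or /2 suffix.
old_1 = '@HWUSI-EAS100R:6:73:941:1973#0/1'
old_2 = '@HWUSI-EAS100R:6:73:941:1973#0/2'
print(pair_key(old_1) == pair_key(old_2))  # True

# New-style (CASAVA 1.8+) headers: pair number after a space.
new_1 = '@EAS139:136:FC706VJ:2:2104:15343:197393 1:N:0:ATCACG'
new_2 = '@EAS139:136:FC706VJ:2:2104:15343:197393 2:N:0:ATCACG'
print(pair_key(new_1) == pair_key(new_2))  # True
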
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support the LUMC SHARK cluster, which
 * runs SGE, but other types of HPC that are supported by GATK Queue (such as
 * PBS) should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that
 * is not part of GATK Queue is freely available for non-commercial use under
 * an AGPL license; for commercial users, or users who do not want to follow
 * the AGPL license, please contact us to obtain a separate license.
 */
package nl.lumc.sasc.biopet.scripts

import java.io.File

import org.broadinstitute.gatk.utils.commandline.{ Input, Output }

import argonaut._, Argonaut._
import scalaz._, Scalaz._

import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction

import scala.io.Source

class FastqSync(val root: Configurable) extends PythonCommandLineFunction {
  setPythonScript("sync_paired_end_reads.py")

  @Input(doc = "Start fastq")
  var input_start_fastq: File = _

  @Input(doc = "R1 input")
  var input_R1: File = _

  @Input(doc = "R2 input")
  var input_R2: File = _

  @Output(doc = "R1 output")
  var output_R1: File = _

  @Output(doc = "R2 output")
  var output_R2: File = _

  // No @Output annotation on this file: it is filled by redirecting the
  // script's stdout (see the " > " in cmdLine below).
  var output_stats: File = _
  def cmdLine = {
    getPythonCommand +
      required(input_start_fastq) +
      required(input_R1) +
      required(input_R2) +
      required(output_R1) +
      required(output_R2) +
      " > " +
      required(output_stats)
  }
  def getSummary: Json = {
    val R1_filteredR = """Filtered (\d*) reads from first read file.""".r
    val R2_filteredR = """Filtered (\d*) reads from second read file.""".r
    val readsLeftR = """Synced read files contain (\d*) reads.""".r
    var R1_filtered = 0
    var R2_filtered = 0
    var readsLeft = 0
    if (output_stats.exists) for (line <- Source.fromFile(output_stats).getLines) {
      line match {
        case R1_filteredR(m) => R1_filtered = m.toInt
        case R2_filteredR(m) => R2_filtered = m.toInt
        case readsLeftR(m)   => readsLeft = m.toInt
        case _               =>
      }
    }
    ("num_reads_discarded_R1" := R1_filtered) ->:
      ("num_reads_discarded_R2" := R2_filtered) ->:
      ("num_reads_kept" := readsLeft) ->:
      jEmptyObject
  }
}
object FastqSync {
  def apply(root: Configurable, input_start_fastq: File, input_R1: File, input_R2: File,
            output_R1: File, output_R2: File, output_stats: File): FastqSync = {
    val fastqSync = new FastqSync(root)
    fastqSync.input_start_fastq = input_start_fastq
    fastqSync.input_R1 = input_R1
    fastqSync.input_R2 = input_R2
    fastqSync.output_R1 = output_R1
    fastqSync.output_R2 = output_R2
    fastqSync.output_stats = output_stats
    fastqSync
  }

  def mergeSummaries(jsons: List[Json]): Json = {
    var R1_filtered = 0
    var R2_filtered = 0
    var readsLeft = 0
    for (json <- jsons) {
      R1_filtered += json.field("num_reads_discarded_R1").get.numberOrZero.toInt
      R2_filtered += json.field("num_reads_discarded_R2").get.numberOrZero.toInt
      readsLeft += json.field("num_reads_kept").get.numberOrZero.toInt
    }
    ("num_reads_discarded_R1" := R1_filtered) ->:
      ("num_reads_discarded_R2" := R2_filtered) ->:
      ("num_reads_kept" := readsLeft) ->:
      jEmptyObject
  }
}
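
For reference, everything the wrapper does around the script boils down to:
run sync_paired_end_reads.py with five positional arguments, capture its
stdout in a stats file, and pull the three counts back out with the same
patterns used in getSummary. A minimal stand-alone sketch of that flow in
Python (the file paths are hypothetical, and a Python 2 interpreter is
assumed since the script itself uses Python 2 print statements):

import re
import subprocess

# Hypothetical inputs; the wrapper passes these in the same order.
cmd = ['python', 'sync_paired_end_reads.py',
       'orig.fq', 'reads_1.fq', 'reads_2.fq',
       'reads_1.synced.fq', 'reads_2.synced.fq']

# Equivalent of cmdLine's " > " redirection into output_stats.
with open('sync.stats', 'w') as stats:
    subprocess.check_call(cmd, stdout=stats)

# Equivalent of getSummary: match each stats line against the three
# messages printed by the script and collect the counts.
patterns = {
    'num_reads_discarded_R1': re.compile(r'Filtered (\d+) reads from first read file\.'),
    'num_reads_discarded_R2': re.compile(r'Filtered (\d+) reads from second read file\.'),
    'num_reads_kept': re.compile(r'Synced read files contain (\d+) reads\.'),
}
summary = dict.fromkeys(patterns, 0)
with open('sync.stats') as stats:
    for line in stats:
        for key, pattern in patterns.items():
            match = pattern.match(line.strip())
            if match:
                summary[key] = int(match.group(1))
print(summary)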