From 32d55edd2b35dabf2d4fcff421e55cb274665427 Mon Sep 17 00:00:00 2001 From: bow <bow@bow.web.id> Date: Thu, 29 Jan 2015 17:02:11 +0100 Subject: [PATCH] Remove old sync script and its wrapper --- .idea/misc.xml | 3 + .../biopet/scripts/sync_paired_end_reads.py | 126 ------------------ .../lumc/sasc/biopet/scripts/FastqSync.scala | 116 ---------------- 3 files changed, 3 insertions(+), 242 deletions(-) delete mode 100644 public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/sync_paired_end_reads.py delete mode 100644 public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/scripts/FastqSync.scala diff --git a/.idea/misc.xml b/.idea/misc.xml index e10c5f30b..59f0fb2d0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,5 +1,8 @@ <?xml version="1.0" encoding="UTF-8"?> <project version="4"> + <component name="EntryPointsManager"> + <entry_points version="2.0" /> + </component> <component name="MavenProjectsManager"> <option name="originalFiles"> <list> diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/sync_paired_end_reads.py b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/sync_paired_end_reads.py deleted file mode 100644 index be4ab0035..000000000 --- a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/sync_paired_end_reads.py +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env python -# -# Biopet is built on top of GATK Queue for building bioinformatic -# pipelines. It is mainly intended to support LUMC SHARK cluster which is running -# SGE. But other types of HPC that are supported by GATK Queue (such as PBS) -# should also be able to execute Biopet tools and pipelines. -# -# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center -# -# Contact us at: sasc@lumc.nl -# -# A dual licensing mode is applied. The source code within this project that are -# not part of GATK Queue is freely available for non-commercial use under an AGPL -# license; For commercial users or users who do not want to follow the AGPL -# license, please contact us to obtain a separate license. -# - -""" -(Re-)sync two filtered paired end FASTQ files. - -Given two filtered paired end read files and one of the original read files, -re-sync the filtered reads by filtering out anything that is only present in -one of the two files. - -Usage: - {command} <orig.fq> <reads_1.fq> <reads_2.fq> \\ - <reads_1.synced.fq> <reads_2.synced.fq> - -The synced reads are written to disk as <reads_1.synced.fq> and -<reads_2.synced.fq>. Afterwards some counts are printed. - - -Both Illumina old-style and new-style paired-end header lines are supported. - -The original read file is used to speed up processing: it contains all -possible reads from both edited reads (in all files in the same order) so it -can process all files line by line, not having to read a single file in -memory. Some ideas were taken from [1]. - -[1] https://gist.github.com/588841/ - -2011-11-03, Martijn Vermaat <m.vermaat.hg@lumc.nl> -""" - - -import sys -import re - - -def sync_paired_end_reads(original, reads_a, reads_b, synced_a, synced_b): - """ - Filter out reads from two paired end read files that are not present in - both of them. Do this in a reasonable amount of time by using a file - containing all of the reads for one of the paired ends. - - All arguments are open file handles. - - @arg original: File containing all original reads for one of the paired - ends. - @arg reads_a: First from paired end read files. - @arg reads_b: Second from paired end read files. - @arg synced_a: Filtered reads from first paired end read file. - @arg synced_b: Filtered reads from second paired end read file. - - @return: Triple (filtered_a, filtered_b, kept) containing counts - of the number of reads filtered from both input files and - the total number of reads kept in the synced results. - - @todo: Print warnings if obvious things are not right (a or b still has - lines after original is processed). - """ - # This matches 1, 2, or 3 preceded by / _ or whitespace. Its rightmost - # match in a header line is used to identify the read pair. - sep = re.compile('[\s_/][123]') - - def next_record(fh): - return [fh.readline().strip() for i in range(4)] - - def head(record): - return sep.split(record[0])[:-1] - - headers = (sep.split(x.strip())[:-1] for i, x in enumerate(original) - if not (i % 4)) - - filtered_a = filtered_b = kept = 0 - - a, b = next_record(reads_a), next_record(reads_b) - - for header in headers: - if header == head(a) and head(b) != header: - a = next_record(reads_a) - filtered_a += 1 - - if header == head(b) and head(a) != header: - b = next_record(reads_b) - filtered_b += 1 - - if header == head(a) == head(b): - print >>synced_a, '\n'.join(a) - print >>synced_b, '\n'.join(b) - a, b = next_record(reads_a), next_record(reads_b) - kept += 1 - - return filtered_a, filtered_b, kept - - -if __name__ == '__main__': - if len(sys.argv) < 6: - sys.stderr.write(__doc__.split('\n\n\n')[0].strip().format( - command=sys.argv[0]) + '\n') - sys.exit(1) - try: - original = open(sys.argv[1], 'r') - reads_a = open(sys.argv[2], 'r') - reads_b = open(sys.argv[3], 'r') - synced_a = open(sys.argv[4], 'w') - synced_b = open(sys.argv[5], 'w') - filtered_a, filtered_b, kept = \ - sync_paired_end_reads(original, reads_a, reads_b, - synced_a, synced_b) - print 'Filtered %i reads from first read file.' % filtered_a - print 'Filtered %i reads from second read file.' % filtered_b - print 'Synced read files contain %i reads.' % kept - except IOError as (_, message): - sys.stderr.write('Error: %s\n' % message) - sys.exit(1) diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/scripts/FastqSync.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/scripts/FastqSync.scala deleted file mode 100644 index 6cada2301..000000000 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/scripts/FastqSync.scala +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. - */ -package nl.lumc.sasc.biopet.scripts - -import java.io.File - -import org.broadinstitute.gatk.utils.commandline.{ Input, Output } - -import argonaut._, Argonaut._ -import scalaz._, Scalaz._ - -import nl.lumc.sasc.biopet.core.config.Configurable -import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction - -import scala.io.Source - -class FastqSync(val root: Configurable) extends PythonCommandLineFunction { - setPythonScript("sync_paired_end_reads.py") - - @Input(doc = "Start fastq") - var input_start_fastq: File = _ - - @Input(doc = "R1 input") - var input_R1: File = _ - - @Input(doc = "R2 input") - var input_R2: File = _ - - @Output(doc = "R1 output") - var output_R1: File = _ - - @Output(doc = "R2 output") - var output_R2: File = _ - - //No output Annotation so file - var output_stats: File = _ - - def cmdLine = { - getPythonCommand + - required(input_start_fastq) + - required(input_R1) + - required(input_R2) + - required(output_R1) + - required(output_R2) + - " > " + - required(output_stats) - } - - def getSummary: Json = { - val R1_filteredR = """Filtered (\d*) reads from first read file.""".r - val R2_filteredR = """Filtered (\d*) reads from second read file.""".r - val readsLeftR = """Synced read files contain (\d*) reads.""".r - - var R1_filtered = 0 - var R2_filtered = 0 - var readsLeft = 0 - - if (output_stats.exists) for (line <- Source.fromFile(output_stats).getLines) { - line match { - case R1_filteredR(m) => R1_filtered = m.toInt - case R2_filteredR(m) => R2_filtered = m.toInt - case readsLeftR(m) => readsLeft = m.toInt - case _ => - } - } - - return ("num_reads_discarded_R1" := R1_filtered) ->: - ("num_reads_discarded_R2" := R2_filtered) ->: - ("num_reads_kept" := readsLeft) ->: - jEmptyObject - } -} - -object FastqSync { - def apply(root: Configurable, input_start_fastq: File, input_R1: File, input_R2: File, - output_R1: File, output_R2: File, output_stats: File): FastqSync = { - val fastqSync = new FastqSync(root) - fastqSync.input_start_fastq = input_start_fastq - fastqSync.input_R1 = input_R1 - fastqSync.input_R2 = input_R2 - fastqSync.output_R1 = output_R1 - fastqSync.output_R2 = output_R2 - fastqSync.output_stats = output_stats - return fastqSync - } - - def mergeSummaries(jsons: List[Json]): Json = { - var R1_filtered = 0 - var R2_filtered = 0 - var readsLeft = 0 - - for (json <- jsons) { - R1_filtered += json.field("num_reads_discarded_R1").get.numberOrZero.toInt - R2_filtered += json.field("num_reads_discarded_R2").get.numberOrZero.toInt - readsLeft += json.field("num_reads_kept").get.numberOrZero.toInt - } - - return ("num_reads_discarded_R1" := R1_filtered) ->: - ("num_reads_discarded_R2" := R2_filtered) ->: - ("num_reads_kept" := readsLeft) ->: - jEmptyObject - } -} \ No newline at end of file -- GitLab