Commit 6eec9fed authored by wyleung

Remove the Python version of seqstat and add the wrapper for the dQual/FastDtools version

parent 5165c95f
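
As a rough illustration of the change (a sketch for orientation only, not part of the commit; the file names and the surrounding QScript are hypothetical), the new extensions.Seqstat wrapper below replaces the bundled Python script by shelling out to the external fastq-seqstat binary and redirecting its JSON report to a file:

    import java.io.File
    import nl.lumc.sasc.biopet.extensions.Seqstat

    // Inside a pipeline QScript (any Configurable works as root); names are made up.
    val seqstat = Seqstat(this, new File("sample_R1.fastq"), new File("sample_R1.seqstats.json"))
    add(seqstat) // builds roughly: fastq-seqstat sample_R1.fastq > sample_R1.seqstats.json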
package nl.lumc.sasc.biopet.extensions

/*
 * Wrapper around the seqstat implemented in D
 *
 */

import argonaut._, Argonaut._
import scalaz._, Scalaz._

import scala.io.Source
import scala.collection.mutable.Map

import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import java.io.File

class Seqstat(val root: Configurable) extends BiopetCommandLineFunction {
  override val defaultVmem = "4G"

  @Input(doc = "Input FastQ", required = true)
  var input: File = _

  @Output(doc = "JSON summary", required = true)
  var output: File = _

  executable = config("exe", default = "fastq-seqstat")

  def cmdLine = required(executable) + required(input) + " > " + required(output)

  /** Parses the JSON report written by the tool and returns its "stats" object (jNull if parsing fails). */
  def getSummary: Json = {
    val json = Parse.parseOption(Source.fromFile(output).mkString)
    if (json.isEmpty) jNull
    else json.get.fieldOrEmptyObject("stats")
  }
}
object Seqstat {
  def apply(root: Configurable, input: File, output: File): Seqstat = {
    val seqstat = new Seqstat(root)
    seqstat.input = input
    seqstat.output = output
    seqstat
  }

  def apply(root: Configurable, fastqfile: File, outDir: String): Seqstat = {
    val seqstat = new Seqstat(root)
    seqstat.input = fastqfile
    seqstat.output = new File(outDir + fastqfile.getName.substring(0, fastqfile.getName.lastIndexOf(".")) + ".seqstats.json")
    seqstat
  }

  /** Merges a list of per-file (or per-chunk) "stats" objects into one summary. */
  def mergeSummaries(jsons: List[Json]): Json = {
    /** Recursively adds all numeric fields of `json` to `total`; len_min/len_max are taken as min/max instead of summed. */
    def addJson(json: Json, total: Map[String, Long]) {
      for (key <- json.objectFieldsOrEmpty) {
        if (json.field(key).get.isObject) addJson(json.field(key).get, total)
        else if (json.field(key).get.isNumber) {
          val number = json.field(key).get.numberOrZero.toLong
          if (total.contains(key)) {
            if (key == "len_min") {
              if (total(key) > number) total(key) = number
            } else if (key == "len_max") {
              if (total(key) < number) total(key) = number
            } else total(key) += number
          } else total += (key -> number)
        }
      }
    }

    var basesTotal: Map[String, Long] = Map()
    var readsTotal: Map[String, Long] = Map()
    var encoding: Set[Json] = Set()
    for (json <- jsons) {
      encoding += json.fieldOrEmptyString("qual_encoding")
      val bases = json.fieldOrEmptyObject("bases")
      addJson(bases, basesTotal)
      val reads = json.fieldOrEmptyObject("reads")
      addJson(reads, readsTotal)
    }

    ("bases" := (
      ("num_n" := basesTotal("num_n")) ->:
      ("num_total" := basesTotal("num_total")) ->:
      ("num_qual_gte" := (
        ("1" := basesTotal("1")) ->:
        ("10" := basesTotal("10")) ->:
        ("20" := basesTotal("20")) ->:
        ("30" := basesTotal("30")) ->:
        ("40" := basesTotal("40")) ->:
        ("50" := basesTotal("50")) ->:
        ("60" := basesTotal("60")) ->:
        jEmptyObject
      )) ->: jEmptyObject)) ->:
    ("reads" := (
      ("num_with_n" := readsTotal("num_with_n")) ->:
      ("num_total" := readsTotal("num_total")) ->:
      ("len_min" := readsTotal("len_min")) ->:
      ("len_max" := readsTotal("len_max")) ->:
      ("num_mean_qual_gte" := (
        ("1" := readsTotal("1")) ->:
        ("10" := readsTotal("10")) ->:
        ("20" := readsTotal("20")) ->:
        ("30" := readsTotal("30")) ->:
        ("40" := readsTotal("40")) ->:
        ("50" := readsTotal("50")) ->:
        ("60" := readsTotal("60")) ->:
        jEmptyObject
      )) ->: jEmptyObject)) ->:
    ("qual_encoding" := encoding.head) ->:
    jEmptyObject
  }
}
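
A minimal usage sketch (not part of the commit; root, the fastq path, and the output directory are hypothetical) of the companion object's directory-based factory and of reading the report back once the job has run:

    import java.io.File
    import argonaut.Json
    import nl.lumc.sasc.biopet.extensions.Seqstat

    // Output lands in the given directory as <basename>.seqstats.json (hypothetical paths).
    val seqstat = Seqstat(root, new File("/data/sample_R1.fastq"), "/data/flexiprep/")
    // ... after the job has finished ...
    val stats: Json = seqstat.getSummary // the "stats" object of the JSON report, or jNull on parse failure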
@@ -5,8 +5,8 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
 import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
 import nl.lumc.sasc.biopet.core.config.Configurable
-import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat }
-import nl.lumc.sasc.biopet.scripts._
+import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat, Seqstat }
+import nl.lumc.sasc.biopet.scripts.{ FastqSync , FastqcToContams}

 class Flexiprep(val root: Configurable) extends QScript with BiopetQScript {
   def this() = this(null)
@@ -2,8 +2,8 @@ package nl.lumc.sasc.biopet.pipelines.flexiprep

 import java.io.PrintWriter
 import nl.lumc.sasc.biopet.core.config.Configurable
-import nl.lumc.sasc.biopet.extensions.Md5sum
-import nl.lumc.sasc.biopet.scripts.{ FastqSync, Seqstat }
+import nl.lumc.sasc.biopet.extensions.{ Md5sum, Seqstat }
+import nl.lumc.sasc.biopet.scripts.{ FastqSync }
 import org.broadinstitute.gatk.queue.function.InProcessFunction
 import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
 import java.io.File
@@ -75,7 +75,7 @@ class FlexiprepSummary(val root: Configurable) extends InProcessFunction with Co
     else if (!R2 && after) chunks(chunk).seqstatR1after = seqstat
     else if (R2 && !after) chunks(chunk).seqstatR2 = seqstat
     else if (R2 && after) chunks(chunk).seqstatR2after = seqstat
-    deps ::= seqstat.out
+    deps ::= seqstat.output
     return seqstat
   }
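
The hunk above keeps one Seqstat instance per chunk and read direction and now depends on its output file. How the per-chunk reports are combined afterwards is not shown here; a hedged sketch of one way to fold them together with the new wrapper (the chunks map and field names follow the code above, the mergeSummaries call itself is an assumption about the summary step):

    // Hypothetical: merge the per-chunk post-trim R1 "stats" objects into a single summary.
    val mergedR1after: Json = Seqstat.mergeSummaries(chunks.values.map(_.seqstatR1after.getSummary).toList)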
#!/usr/bin/env python2
"""
Simple sequencing file statistics.

Gathers various statistics of a FASTQ file.


Requirements:
    * Python == 2.7.x
    * Biopython >= 1.60

Copyright (c) 2013 Wibowo Arindrarto <w.arindrarto@lumc.nl>
Copyright (c) 2013 LUMC Sequencing Analysis Support Core <sasc@lumc.nl>

MIT License <http://opensource.org/licenses/MIT>
"""

RELEASE = False

__version_info__ = ('0', '2', )
__version__ = '.'.join(__version_info__)
__version__ += '-dev' if not RELEASE else ''

import argparse
import json
import os
import sys

from Bio import SeqIO

# quality points we want to measure
QVALS = [1] + range(10, 70, 10)


def dict2json(d_input, f_out):
    """Dump the given dictionary as a JSON file."""
    if isinstance(f_out, basestring):
        target = open(f_out, 'w')
    else:
        target = f_out
    json.dump(d_input, target, sort_keys=True, indent=4,
              separators=(',', ': '))
    target.close()


def gather_stat(in_fastq, out_json, fmt):
    total_bases, total_reads = 0, 0
    bcntd = dict.fromkeys(QVALS, 0)
    rcntd = dict.fromkeys(QVALS, 0)
    len_max, len_min = 0, None
    n_bases, reads_with_n = 0, 0

    # adjust format name to Biopython-compatible name
    for rec in SeqIO.parse(in_fastq, 'fastq-' + fmt):
        read_quals = rec.letter_annotations['phred_quality']
        read_len = len(read_quals)

        # compute quality metrics
        avg_qual = sum(read_quals) / len(read_quals)
        for qval in QVALS:
            bcntd[qval] += len([q for q in read_quals if q >= qval])
            if avg_qual >= qval:
                rcntd[qval] += 1

        # compute length metrics
        if read_len > len_max:
            len_max = read_len
        if len_min is None:
            len_min = read_len
        elif read_len < len_min:
            len_min = read_len

        # compute n metrics
        n_count = rec.seq.lower().count('n')
        n_bases += n_count
        if n_count > 0:
            reads_with_n += 1

        total_bases += read_len
        total_reads += 1

    pctd = {
        'files': {
            'fastq': {
                'path': os.path.abspath(in_fastq),
                'checksum_sha1': None,
            },
        },
        'stats': {
            'qual_encoding': fmt,
            'bases': {
                'num_qual_gte': {},
                'num_n': n_bases,
                'num_total': total_bases,
            },
            'reads': {
                'num_mean_qual_gte': {},
                'len_max': len_max,
                'len_min': len_min,
                'num_with_n': reads_with_n,
                'num_total': total_reads,
            },
        },
    }

    for qval in QVALS:
        pctd['stats']['bases']['num_qual_gte'][qval] = bcntd[qval]
        pctd['stats']['reads']['num_mean_qual_gte'][qval] = rcntd[qval]

    dict2json(pctd, out_json)


if __name__ == '__main__':

    usage = __doc__.split('\n\n\n')
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=usage[0], epilog=usage[1])
    parser.add_argument('input', type=str, help='Path to input FASTQ file')
    parser.add_argument('-o', '--output', type=str, default=sys.stdout,
                        help='Path to output JSON file')
    parser.add_argument('--fmt', type=str, choices=['sanger', 'illumina',
                        'solexa'], default='sanger', help='FASTQ quality encoding')
    parser.add_argument('--version', action='version', version='%(prog)s ' +
                        __version__)
    args = parser.parse_args()

    gather_stat(args.input, args.output, args.fmt)