Commit 85e3f91d authored by bow's avatar bow
Browse files

Add gentrap scripts

parent b840a916
#!/usr/bin/env Rscript
# aggr_base_count.R
#
# Given a count file, write tab-delimited file(s) aggregating the counts
# at gene and/or exon level.
#
# (c) 2013 by Wibowo Arindrarto [LUMC - SASC]
# Adapted from Peter-Bram 't Hoen's script: 'merge_table_script_shark_PH3.r'
## FLAGS ##
LEVELS <- c('gene', 'exon')
OUT.DIR <- getwd()
DEBUG <- FALSE
if (DEBUG) {
message("## DEBUG MODE ##")
}
## FUNCTIONS ##
CheckCountFiles <- function(count.files, DEBUG=FALSE) {
# Given a vector of sample names, checks whether the .count files exist.
#
# Count files are the input files used to analyze the RNA-Seq expression
# levels. They must conform to the following file name:
# '{sample_name}/{sample_name}.count'
#
# Args:
# - paths: string vector of file paths
for (cfile in count.files) {
if (!file.exists(cfile)) {
stop(paste("Path '", cfile, "' does not exist. Exiting.", sep=""))
}
if (DEBUG) {
message("Path '", cfile, "' exists.", sep="")
}
}
}
CountBaseExons <- function(count.files, count.names,
col.header=c("gene", "chr", "start", "stop")) {
# Given a list of count files, return a data frame containing their values.
#
# The count files must be a tab-separate file containing the following
# columns in the same order:
# 1. chromosome
# 2. start position
# 3. stop position
# 4. total nucleotide counts
# 5. nucleotide counts per exon
# 6. gene name
#
# The returned data frame has the following columns:
#
# 1. gene name
# 2. chromosome
# 3. start position
# 4. stop position
# 5... total nucleotide counts for each sample
#
# This function assumes that for all count files, the values of the first
# three columns are the same for each row.
#
# Args:
# - count.files: string vector of count file paths
# - col.headers: string vector of default data frame output headers
# given a count file path, extract its fourth column
GetNucCount <- function(x) {
read.table(x, as.is=TRUE)[4]
}
# initial data frame is from the first file
exon.counts <- read.table(count.files[1], as.is=TRUE)
exon.counts <- exon.counts[, c(6, 1:3, 4)]
colnames(exon.counts)[1:5] <- append(col.header, count.names[1])
if (length(count.files) > 1) {
# why doesn't R provide a nice way to slice remaining items??
remaining.files <- count.files[2: length(count.files)]
remaining.names <- count.names[2: length(count.names)]
# append all nucleotide counts from remaining files to exon.counts
exon.counts <- cbind(exon.counts, lapply(remaining.files, GetNucCount))
# and rename these columns accordingly
end.idx <- 5 + length(remaining.files)
colnames(exon.counts)[6:end.idx] <- remaining.names
}
return(exon.counts)
}
CountExons <- function(exon.df) {
# Given a data frame containing exon counts, return a data frame consisting of
# compacted exon counts.
#
# In a compacted exon count data frame, each exon has its own unique name
# consisting of its gene source and its start-stop coordinates.
#
# Args:
# - exon.df: data frame of complete exon counts
# create new data frame of the exon counts, concatenating gene name, and the
# exon start-stop coordinates
exon.dis.counts <- cbind(paste(paste(exon.df$gene, exon.df$start,
sep=":"), exon.df$stop, sep="-"),
exon.df[5: length(exon.df)])
colnames(exon.dis.counts)[1] <- "exon"
counts.in.samples <- as.matrix(exon.dis.counts[2:ncol(exon.dis.counts)])
exon.counts <- aggregate(counts.in.samples ~ exon, data=exon.dis.counts, FUN=sum,
na.rm=TRUE)
colnames(exon.counts)[2:ncol(exon.counts)] <- colnames(counts.in.samples)
return (exon.counts)
}
CountGenes <- function(exon.df) {
# Given a data frame containing exon counts, return a data frame of gene
# counts.
#
# See CountBaseExons for the input data frame format.
#
# Args:
# - exon.df: data frame of complete exon counts
# basically an aggregate of exon counts with the same gene name
counts.in.samples <- as.matrix(exon.df[5:ncol(exon.df)])
gene.counts <- aggregate(counts.in.samples ~ gene, data=exon.df, FUN=sum,
na.rm=TRUE)
# first column is gene
colnames(gene.counts)[2:ncol(gene.counts)] <- colnames(counts.in.samples)
return(gene.counts)
}
# load package for arg parsing
library('getopt')
# create spec for arg parsing
spec <- matrix(c(
# colon-separated paths to each count files
'count-file', 'I', 1, 'character',
# colon-separated paths of each count file label; order must be the same
# as the count files
'count-name', 'N', 1, 'character',
# output file for gene level counts
'gene-count', 'G', 1, 'character',
# output file for exon level counts
'exon-count', 'E', 1, 'character',
# help
'help', 'H', 0, 'logical'
), byrow=TRUE, ncol=4)
opt <- getopt(spec)
# print help if requested
if (!is.null(opt[['help']])) {
cat(getopt(spec, usage=TRUE))
q(status=1)
}
# we need gene-count and/or exon-count flag
if (is.null(opt[['gene-count']]) & is.null(opt[['exon-count']])) {
message("Error: Either '--gene-count' and/or '--exon-count' must have a value.")
q(status=1)
}
# set fallback values for optional args
if (!is.null(opt[['output-dir']])) {
OUT.DIR <- normalizePath(opt[['output-dir']])
# create directory if it doesn't exist
dir.create(OUT.DIR, showWarnings=FALSE)
}
# parse the input file paths and check their presence
if (!is.null(opt[['count-file']])) {
count.files <- opt[['count-file']]
count.files <- unlist(strsplit(gsub(' ', '', count.files), ':'))
CheckCountFiles(count.files, DEBUG)
} else {
stop("Required input count file path(s) not present. Exiting.")
}
# parse the input count labels and check if its length is the same as the input
# files
if (!is.null(opt[['count-name']])) {
count.names <- opt[['count-name']]
count.names <- unlist(strsplit(gsub(' ', '', count.names), ':'))
if (length(count.names) != length(count.files)) {
stop("Mismatched count file paths and labels. Exiting.")
}
} else {
stop("Required input count file label(s) not present. Exiting.")
}
# set output file name for gene counts
if (!is.null(opt[['gene-count']])) {
gene.out <- opt[['gene-count']]
} else {
gene.out <- NULL
}
# set output file name for exon counts
if (!is.null(opt[['exon-count']])) {
exon.out <- opt[['exon-count']]
} else {
exon.out <- NULL
}
# count base exons (complete with coordinates)
base.exon.counts <- CountBaseExons(count.files, count.names)
# and write output files, depending on the flags
if (!is.null(gene.out)) {
gene.counts <- CountGenes(base.exon.counts)
write.table(gene.counts, file = gene.out, sep = "\t", quote = FALSE, row.names = FALSE)
}
if (!is.null(exon.out)) {
exon.counts <- CountExons(base.exon.counts)
write.table(exon.counts, file = exon.out, sep = "\t", quote = FALSE, row.names = FALSE)
}
#!/usr/bin/env python2.7
# bam_rna.py
#
# Generate read pair and alignment statistics for a BAM file.
#
# Part of the Gentrap pipeline
#
# (c) 2013 Wibowo Arindrarto [SASC - LUMC]
import argparse
import json
import locale
import os
from collections import OrderedDict
from functools import partial
import pysam
# set locale to group digits
locale.setlocale(locale.LC_ALL, '')
# formatters for output
int_fmt = partial(locale.format, grouping=True, percent='%i')
float_fmt = partial(locale.format, grouping=True, percent='%.2f')
# -F 0x4
func_nmap = lambda rec: not rec.flag & 0x4
# -f 0xC
func_nunmap = lambda rec: (rec.flag & 0x4) and (rec.flag & 0x8)
# -F 0xC
func_nmap_pair = lambda rec: not rec.flag & 0xC
# -f 0x2
func_nmap_pair_ok = lambda rec: rec.flag & 0x2
# mapped to different chromosomes / reference sequences
func_nmap_diffchr = lambda rec: rec.rnext != rec.tid
# -F 0x4 -f 0x8
func_nmap_sgltn = lambda rec: (not rec.flag & 0x4) and (rec.flag & 0x8)
# check if spliced
func_splice = lambda rec: 'N' in rec.cigarstring
# check if pass QC
# not bool since we always need to run each read through it
func_qc = lambda rec: 1 if rec.flag & 0x200 else 0
FLAGS = OrderedDict((
('total', 'Total'),
('unmapped', 'Unmapped'),
('mapped', 'Mapped'),
('mappedPair', 'Mapped pairs'),
('mappedPairProper', 'Properly mapped pairs'),
('mappedDiffChr', 'Different chromosomes'),
('mappedDiffChrQ', 'Different chromosomes, MAPQ >=5'),
('singleton', 'Singletons'),
('totalSplice', 'Split reads, total'),
('splicePairProper', 'Split reads, properly mapped'),
('spliceSingleton', 'Split reads, singletons'),
))
class BarnStat(object):
"""Class representing a collection of BAM statistics for RNA-seq data."""
def __init__(self, bamfile, read_pair_suffix_len=0, id_sorted=False,
validate=False):
assert os.path.exists(bamfile), "BAM file %r not found." % bamfile
self.validate = validate
self.bamfile = bamfile
self.flags = FLAGS.keys()
# length of read pair suffix (e.g. '/2' has len == 2)
self.suflen = read_pair_suffix_len
if not id_sorted:
self._count_unsorted()
else:
self._count_sorted()
self._format_counts()
def _adjust_counts(self, alns, reads):
"""Adjusts the alignment and read counts."""
# we need to adjust the unmapped counts for alignments as each
# alignments consists of two reads (one read pair) which may be mapped
# multiple times as singletons and/or whole read pairs.
# so unmapped alns is always == unmapped read pairs + singleton *reads*
# counts (proxy for the set of unmapped singletons)
if 'unmapped' in alns:
alns['unmapped'] += reads['singleton']
else:
# tophat, splits unmapped reads into its own BAM file
alns['unmapped'] = 0
return alns, reads
def _count_sorted(self):
"""Counts read and alignment statistics for ID-sorted BAM file."""
flags = self.flags
reads, alns, alns_qc = {}, {}, {}
read_flags = dict.fromkeys(flags, False)
cur_qname = None
# init counts
for flag in flags:
reads[flag], alns[flag], alns_qc[flag] = 0, 0, 0
# iterate over each record
# index for suffix removal, if suffix exist (> 0)
if self.suflen:
sufslice = slice(-self.suflen)
else:
sufslice = slice(None)
for rec in pysam.Samfile(self.bamfile, 'rb'):
# different qname mean we've finished parsing each unique read
# so reset the flags and add them to the counters appropriately
if cur_qname != rec.qname[sufslice]:
for flag in flags:
reads[flag] += int(read_flags[flag])
# reset the qname tracker
cur_qname = rec.qname[sufslice]
# and the read flag tracker
read_flags = dict.fromkeys(flags, False)
# total, total splice
alns['total'] += 1
alns_qc['total'] += func_qc(rec)
read_flags['total'] = True
if func_splice(rec):
alns['totalSplice'] += 1
alns_qc['totalSplice'] += func_qc(rec)
read_flags['totalSplice'] = True
# unmapped
if func_nunmap(rec):
alns['unmapped'] += 1
alns_qc['unmapped'] += func_qc(rec)
read_flags['unmapped'] = True
else:
# mapped
if func_nmap(rec):
alns['mapped'] += 1
alns_qc['mapped'] += func_qc(rec)
read_flags['mapped'] = True
# mapped pairs
if func_nmap_pair(rec):
alns['mappedPair'] += 1
alns_qc['mappedPair'] += func_qc(rec)
read_flags['mappedPair'] = True
# proper pairs, proper pairs splice
if func_nmap_pair_ok(rec):
alns['mappedPairProper'] += 1
alns_qc['mappedPairProper'] += func_qc(rec)
read_flags['mappedPairProper'] = True
if func_splice(rec):
alns['splicePairProper'] += 1
alns_qc['splicePairProper'] += func_qc(rec)
read_flags['splicePairProper'] = True
# mapped to different chromosomes
elif func_nmap_diffchr(rec):
alns['mappedDiffChr'] += 1
alns_qc['mappedDiffChr'] += func_qc(rec)
read_flags['mappedDiffChr'] = True
if rec.mapq >= 5:
alns['mappedDiffChrQ'] += 1
alns_qc['mappedDiffChrQ'] += func_qc(rec)
read_flags['mappedDiffChrQ'] = True
# singletons, singletons splice
elif func_nmap_sgltn(rec):
alns['singleton'] += 1
alns_qc['singleton'] += func_qc(rec)
read_flags['singleton'] = True
if func_splice(rec):
alns['spliceSingleton'] += 1
alns_qc['spliceSingleton'] += func_qc(rec)
read_flags['spliceSingleton'] = True
# for the last read, since we don't pass the qname check again
for flag in flags:
reads[flag] += int(read_flags[flag])
self.aln_counts, self.read_counts = self._adjust_counts(alns, reads)
self.aln_qc_counts = alns_qc
if self.validate:
assert self.validate_counts()
def _count_unsorted(self):
"""Counts read and alignment statistics for non-ID-sorted BAM file."""
flags = self.flags
reads_total, reads_unmap, reads_map, reads_pair_map, reads_pair_proper, \
reads_sgltn, reads_total_splice, reads_pair_proper_splice, \
reads_sgltn_splice, reads_pair_diffchr, reads_pair_diffchr_h = \
set(), set(), set(), set(), set(), set(), set(), set(), set(), \
set(), set()
reads, alns, alns_qc = {}, {}, {}
for flag in flags:
reads[flag], alns[flag], alns_qc[flag] = 0, 0, 0
# index for suffix removal, if suffix exist (> 0)
if self.suflen:
sufslice = slice(-self.suflen)
else:
sufslice = slice(None)
for rec in pysam.Samfile(self.bamfile, 'rb'):
# remove '/1' or '/2' suffixes, to collapse read pair counts
pass
# do countings on alns and reads directly
hname = hash(rec.qname[sufslice])
# total, total splice
alns['total'] += 1
alns_qc['total'] += func_qc(rec)
if hname not in reads_total:
reads_total.add(hname)
if func_splice(rec):
alns['totalSplice'] += 1
alns_qc['totalSplice'] += func_qc(rec)
reads_total_splice.add(hname)
# unmapped
if func_nunmap(rec):
alns['unmapped'] += 1
alns_qc['unmapped'] += func_qc(rec)
reads_unmap.add(hname)
else:
# mapped
if func_nmap(rec):
alns['mapped'] += 1
alns_qc['mapped'] += func_qc(rec)
reads_map.add(hname)
# mapped pairs
if func_nmap_pair(rec):
alns['mappedPair'] += 1
alns_qc['mappedPair'] += func_qc(rec)
reads_pair_map.add(hname)
# proper pairs, proper pairs splice
if func_nmap_pair_ok(rec):
alns['mappedPairProper'] += 1
alns_qc['mappedPairProper'] += func_qc(rec)
reads_pair_proper.add(hname)
if func_splice(rec):
alns['splicePairProper'] += 1
alns_qc['splicePairProper'] += func_qc(rec)
reads_pair_proper_splice.add(hname)
# mapped to different chromosomes
elif func_nmap_diffchr(rec):
alns['mappedDiffChr'] += 1
alns_qc['mappedDiffChr'] += func_qc(rec)
reads_pair_diffchr.add(hname)
if rec.mapq >= 5:
alns['mappedDiffChrQ'] += 1
alns_qc['mappedDiffChrQ'] += func_qc(rec)
reads_pair_diffchr_h.add(hname)
# singletons, singletons splice
elif func_nmap_sgltn(rec):
alns['singleton'] += 1
alns_qc['singleton'] += func_qc(rec)
reads_sgltn.add(hname)
if func_splice(rec):
alns['spliceSingleton'] += 1
alns_qc['spliceSingleton'] += func_qc(rec)
reads_sgltn_splice.add(hname)
# set counts for reads
reads['total'] = len(reads_total)
reads['totalSplice'] = len(reads_total_splice)
reads['unmapped'] = len(reads_unmap)
reads['mapped'] = len(reads_map)
reads['mappedPair'] = len(reads_pair_map)
reads['mappedPairProper'] = len(reads_pair_proper)
reads['mappedDiffChr'] = len(reads_pair_diffchr)
reads['mappedDiffChrQ'] = len(reads_pair_diffchr_h)
reads['splicePairProper'] = len(reads_pair_proper_splice)
reads['singleton'] = len(reads_sgltn)
reads['spliceSingleton'] = len(reads_sgltn_splice)
# free the memory
del reads_total, reads_map, reads_pair_map, reads_pair_proper, \
reads_sgltn, reads_total_splice, reads_pair_proper_splice, \
reads_sgltn_splice, reads_unmap, reads_pair_diffchr, \
reads_pair_diffchr_h
self.aln_counts, self.read_counts = self._adjust_counts(alns, reads)
self.aln_qc_counts = alns_qc
if self.validate:
assert self.validate_counts()
def validate_counts(self):
"""Checks whether all reads and alignment counts add up."""
for ctype in ('read_counts', 'aln_counts'):
count = getattr(self, ctype)
ntotal = count['total']
nmap = count['mapped']
nunmap = count['unmapped']
nmap_pair = count['mappedPair']
nmap_pair_ok = count['mappedPairProper']
nmap_pair_diffchr = count['mappedDiffChr']
nmap_sgltn = count['singleton']
nsplice_total = count['totalSplice']
nsplice_pair_ok = count['splicePairProper']
nsplice_sgltn = count['spliceSingleton']
assert nmap == nmap_pair + nmap_sgltn, \
"Mismatch: %r == %r + %r" % (nmap, nmap_pair, nmap_sgltn)
assert nmap_pair_ok + nmap_pair_diffchr <= nmap_pair
assert nsplice_total <= ntotal
assert nsplice_pair_ok <= nmap_pair_ok
assert nsplice_sgltn <= nmap_sgltn
# total is always == unmapped + mapped pair + singletons
assert ntotal == nunmap + nmap_pair + nmap_sgltn, "Mismatch: " \
"%r == %r + %r + %r" % (ntotal, nunmap, nmap_pair,
nmap_sgltn)
return True
def show(self):
"""Prints the read and alignment counts in human-readable format to
stdout."""
import pprint
pprint.pprint(dict(self.counts.items()))
def write_json(self, out_file):
"""Writes JSON to the output file."""
with open(out_file, 'w') as outfile:
json.dump(self.counts, outfile, sort_keys=True, indent=4,
separators=(',', ': '))
def _format_counts(self):
"""Formats read and alignment counts into nice-looking numbers."""
counts = OrderedDict()
flags = self.flags
for ctype in ('read_counts', 'aln_counts', 'aln_qc_counts'):
count = getattr(self, ctype)
ntotal = count['total']
cont = {}
lvl = 'aln' if ctype == 'aln_counts' else 'read'
if ctype == 'aln_qc_counts':
lvl = 'aln_qc'
else:
pct = lambda x: x * 100.0 / ntotal
if ctype == 'read_counts':
lvl = 'read'
elif ctype == 'aln_counts':
lvl = 'aln'
for flag in flags: