Commit 85e3f91d authored by bow

Add gentrap scripts

parent b840a916
#!/usr/bin/env Rscript
# aggr_base_count.R
#
# Given a count file, write tab-delimited file(s) aggregating the counts
# at gene and/or exon level.
#
# (c) 2013 by Wibowo Arindrarto [LUMC - SASC]
# Adapted from Peter-Bram 't Hoen's script: 'merge_table_script_shark_PH3.r'
## FLAGS ##
LEVELS <- c('gene', 'exon')
OUT.DIR <- getwd()
DEBUG <- FALSE
if (DEBUG) {
message("## DEBUG MODE ##")
}
## FUNCTIONS ##
CheckCountFiles <- function(count.files, DEBUG=FALSE) {
# Given a vector of count file paths, check that each file exists.
#
# Count files are the input files used to analyze the RNA-Seq expression
# levels. They must conform to the following file name pattern:
# '{sample_name}/{sample_name}.count'
#
# Args:
# - count.files: string vector of count file paths
# - DEBUG: if TRUE, emit a message for each existing file
for (cfile in count.files) {
if (!file.exists(cfile)) {
stop(paste("Path '", cfile, "' does not exist. Exiting.", sep=""))
}
if (DEBUG) {
message("Path '", cfile, "' exists.", sep="")
}
}
}
CountBaseExons <- function(count.files, count.names,
col.header=c("gene", "chr", "start", "stop")) {
# Given a vector of count file paths, return a data frame containing their
# values.
#
# Each count file must be a tab-separated file containing the following
# columns, in this order:
# 1. chromosome
# 2. start position
# 3. stop position
# 4. total nucleotide counts
# 5. nucleotide counts per exon
# 6. gene name
#
# The returned data frame has the following columns:
#
# 1. gene name
# 2. chromosome
# 3. start position
# 4. stop position
# 5... total nucleotide counts for each sample
#
# This function assumes that for all count files, the values of the first
# three columns are the same for each row.
#
# Args:
# - count.files: string vector of count file paths
# - count.names: string vector of sample labels, one per count file
# - col.header: string vector of default data frame output headers
# given a count file path, extract its fourth column
GetNucCount <- function(x) {
read.table(x, as.is=TRUE)[4]
}
# initial data frame is from the first file
exon.counts <- read.table(count.files[1], as.is=TRUE)
exon.counts <- exon.counts[, c(6, 1:3, 4)]
colnames(exon.counts)[1:5] <- append(col.header, count.names[1])
if (length(count.files) > 1) {
# negative indexing drops the first item, leaving the remaining ones
remaining.files <- count.files[-1]
remaining.names <- count.names[-1]
# append all nucleotide counts from remaining files to exon.counts
exon.counts <- cbind(exon.counts, lapply(remaining.files, GetNucCount))
# and rename these columns accordingly
end.idx <- 5 + length(remaining.files)
colnames(exon.counts)[6:end.idx] <- remaining.names
}
return(exon.counts)
}
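# Example (hypothetical values): merging 'sampleA/sampleA.count' and
# 'sampleB/sampleB.count' yields a frame like:
#
#   gene   chr  start  stop  sampleA  sampleB
#   GENE1  1    100    200        53       47
#   GENE1  1    300    400        12        9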
CountExons <- function(exon.df) {
# Given a data frame containing exon counts, return a data frame consisting of
# compacted exon counts.
#
# In a compacted exon count data frame, each exon has its own unique name
# consisting of its gene source and its start-stop coordinates.
#
# Args:
# - exon.df: data frame of complete exon counts
# create a new data frame of the exon counts, naming each exon by
# concatenating its gene name with its start-stop coordinates
exon.dis.counts <- cbind(paste(paste(exon.df$gene, exon.df$start,
sep=":"), exon.df$stop, sep="-"),
exon.df[5: length(exon.df)])
colnames(exon.dis.counts)[1] <- "exon"
counts.in.samples <- as.matrix(exon.dis.counts[2:ncol(exon.dis.counts)])
exon.counts <- aggregate(counts.in.samples ~ exon, data=exon.dis.counts, FUN=sum,
na.rm=TRUE)
colnames(exon.counts)[2:ncol(exon.counts)] <- colnames(counts.in.samples)
return (exon.counts)
}
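# Example (hypothetical values): after compacting, each exon row is keyed by
# its 'gene:start-stop' name:
#
#   exon            sampleA  sampleB
#   GENE1:100-200        53       47
#   GENE1:300-400        12        9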
CountGenes <- function(exon.df) {
# Given a data frame containing exon counts, return a data frame of gene
# counts.
#
# See CountBaseExons for the input data frame format.
#
# Args:
# - exon.df: data frame of complete exon counts
# basically an aggregate of exon counts with the same gene name
counts.in.samples <- as.matrix(exon.df[5:ncol(exon.df)])
gene.counts <- aggregate(counts.in.samples ~ gene, data=exon.df, FUN=sum,
na.rm=TRUE)
# first column is gene
colnames(gene.counts)[2:ncol(gene.counts)] <- colnames(counts.in.samples)
return(gene.counts)
}
# load package for arg parsing
library('getopt')
# create spec for arg parsing
spec <- matrix(c(
# colon-separated paths to each count files
'count-file', 'I', 1, 'character',
# colon-separated paths of each count file label; order must be the same
# as the count files
'count-name', 'N', 1, 'character',
# output file for gene level counts
'gene-count', 'G', 1, 'character',
# output file for exon level counts
'exon-count', 'E', 1, 'character',
# output directory (optional; defaults to the current working directory)
'output-dir', 'O', 1, 'character',
# help
'help', 'H', 0, 'logical'
), byrow=TRUE, ncol=4)
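# example invocation (hypothetical paths and labels):
#   Rscript aggr_base_count.R -I sampleA/sampleA.count:sampleB/sampleB.count \
#     -N sampleA:sampleB -G gene_counts.tsv -E exon_counts.tsv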
opt <- getopt(spec)
# print help if requested
if (!is.null(opt[['help']])) {
cat(getopt(spec, usage=TRUE))
q(status=1)
}
# we need gene-count and/or exon-count flag
if (is.null(opt[['gene-count']]) && is.null(opt[['exon-count']])) {
message("Error: at least one of '--gene-count' or '--exon-count' must have a value.")
q(status=1)
}
# set fallback values for optional args
if (!is.null(opt[['output-dir']])) {
OUT.DIR <- normalizePath(opt[['output-dir']])
# create directory if it doesn't exist
dir.create(OUT.DIR, showWarnings=FALSE)
}
# parse the input file paths and check their presence
if (!is.null(opt[['count-file']])) {
count.files <- opt[['count-file']]
count.files <- unlist(strsplit(gsub(' ', '', count.files), ':'))
CheckCountFiles(count.files, DEBUG)
} else {
stop("Required input count file path(s) not present. Exiting.")
}
# parse the input count labels and check that their number matches the input
# files
if (!is.null(opt[['count-name']])) {
count.names <- opt[['count-name']]
count.names <- unlist(strsplit(gsub(' ', '', count.names), ':'))
if (length(count.names) != length(count.files)) {
stop("Mismatched count file paths and labels. Exiting.")
}
} else {
stop("Required input count file label(s) not present. Exiting.")
}
# set output file name for gene counts
if (!is.null(opt[['gene-count']])) {
gene.out <- opt[['gene-count']]
} else {
gene.out <- NULL
}
# set output file name for exon counts
if (!is.null(opt[['exon-count']])) {
exon.out <- opt[['exon-count']]
} else {
exon.out <- NULL
}
# count base exons (complete with coordinates)
base.exon.counts <- CountBaseExons(count.files, count.names)
# and write output files, depending on the flags
if (!is.null(gene.out)) {
gene.counts <- CountGenes(base.exon.counts)
write.table(gene.counts, file = gene.out, sep = "\t", quote = FALSE, row.names = FALSE)
}
if (!is.null(exon.out)) {
exon.counts <- CountExons(base.exon.counts)
write.table(exon.counts, file = exon.out, sep = "\t", quote = FALSE, row.names = FALSE)
}
#!/usr/bin/env python
#
# gc_dist.py
#
# Given a path to a FASTQ file, create plots of GC percentages.
#
# Part of the Gentrap pipeline.
#
# (c) 2013 Wibowo Arindrarto [SASC - LUMC]
import argparse
import locale
import os
import textwrap
import numpy as np
# for headless matplotlib
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.gridspec as gs
from matplotlib.ticker import FuncFormatter, MultipleLocator
# set locale and formatter to do digit grouping
locale.setlocale(locale.LC_ALL, '')
groupdig = lambda x, pos: locale.format('%d', x, grouping=True)
major_formatter = FuncFormatter(groupdig)
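# e.g. groupdig(1234567, None) returns '1,234,567' under an en_US locale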
def read_seq(fp):
"""Given a FASTQ file, yield its sequences."""
if isinstance(fp, basestring):
assert os.path.exists(fp)
fp = open(fp, 'r')
for counter, line in enumerate(fp):
if (counter + 3) % 4 == 0:
yield line.strip()
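# read_seq assumes standard four-line FASTQ records, where the sequence is
# always the second line of each record, e.g.:
#   @read1
#   ACGTACGT
#   +
#   IIIIIIII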
def drange(start, stop, step):
"""Like `range` but for floats."""
cur = start
while cur < stop:
yield cur
cur += step
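# e.g. list(drange(2.5, 100, 5)) gives the bin midpoints 2.5, 7.5, ..., 97.5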
def graph_gc(fname, outname='test.png'):
"""Graphs the GC percentages of the given FASTQ file."""
# count GC percentages per sequence
gcs = []
for seq in read_seq(fname):
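# 'g', 'c', and 's' all count towards GC ('s' is the IUPAC code for G or C)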
gc = sum(seq.lower().count(x) for x in ('g', 'c', 's'))
gcs.append(gc * 100.0 / len(seq))
# grab mean and std dev for plotting
mean = np.mean(gcs)
stdev = np.std(gcs)
# set the subplots in the figure; top is histogram, bottom is boxplot
fig = plt.figure(figsize=(8, 8))
grids = gs.GridSpec(2, 1, height_ratios=[5, 1])
ax0 = plt.subplot(grids[0])
# set title and adjust distance to plot
title = 'Distribution of GC Percentage'
t = plt.title('\n'.join([title] + textwrap.wrap('%r' %
os.path.basename(fname), 50)), fontsize=15)
t.set_y(1.05)
# start counting bins for width measurement
total = len(gcs)
min_hist = min(gcs)
max_hist = max(gcs)
low = high = np.median(gcs)
step = 1
widths = dict.fromkeys(range(20, 100, 20) + [99], (0, 0))
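# expand a window around the median one step at a time, recording the first
# window that covers >= 20/40/60/80/99% of all reads; these windows drive the
# shaded background spans drawn below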
while low >= min_hist or high <= max_hist:
# cap the width marker at min or max gc values
if high > max_hist: high = max_hist
if low < min_hist: low = min_hist
range_count = len([x for x in gcs if low < x < high])
coverage = float(range_count) / total
if coverage >= 0.2 and not any(widths[20]):
widths[20] = (low, high)
if coverage >= 0.4 and not any(widths[40]):
widths[40] = (low, high)
if coverage >= 0.6 and not any(widths[60]):
widths[60] = (low, high)
if coverage >= 0.8 and not any(widths[80]):
widths[80] = (low, high)
if coverage >= 0.99 and not any(widths[99]):
widths[99] = (low, high)
low -= step
high += step
# use the bin coordinates for partial background coloring
for hstart, hend in widths.values():
plt.axvspan(hstart, hend, facecolor='#0099ff', linestyle='dotted',
linewidth=2.0, edgecolor='black', alpha=0.2)
# plot the histogram
bins = [0] + list(drange(2.5, 100, 5)) + [100]
n, bins, patches = ax0.hist(gcs, bins=bins, facecolor='#009933', alpha=0.9)
# set Y-axis ticks label formatting
ax0.yaxis.set_major_formatter(major_formatter)
ax0.yaxis.grid(True)
plt.ylabel('Read count')
ax0.text(0.02, 0.9, 'Mean: %.2f\nStdev: %.2f' % (mean, stdev),
transform=ax0.transAxes, bbox=dict(facecolor='grey', alpha=0.5,
edgecolor='none'), size=14)
# plot the boxplot
# shared X-axis, but invisible
ax1 = plt.subplot(grids[1], sharex=ax0)
plt.setp(ax1.get_xticklabels(), visible=False)
# and set the Y-axis to be invisible completely
ax1.axes.get_yaxis().set_visible(False)
plot = ax1.boxplot(gcs, vert=False, widths=0.6, sym='r.')
# line width and color settings for boxplot
plot['fliers'][0].set_color('#e62e00')
plot['fliers'][1].set_color('#e62e00')
plot['boxes'][0].set_color('black')
plot['boxes'][0].set_linewidth(1.2)
plot['medians'][0].set_linewidth(1.2)
plot['medians'][0].set_color('black')
plot['whiskers'][0].set_color('black')
plot['whiskers'][0].set_linewidth(1.2)
plot['whiskers'][1].set_color('black')
plot['whiskers'][1].set_linewidth(1.2)
plot['caps'][0].set_linewidth(1.2)
plot['caps'][1].set_linewidth(1.2)
# set X-axis label and ticks
ax0.xaxis.set_major_locator(MultipleLocator(10))
ax0.xaxis.set_minor_locator(MultipleLocator(5))
plt.xlabel('% GC')
grids.update(hspace=0.075)
plt.savefig(outname, bbox_inches='tight')
return gcs
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('input', nargs='?', default='reads.fq', help='input FASTQ file')
parser.add_argument('output', nargs='?', default='test.png', help='output image file')
args = parser.parse_args()
gcs = graph_gc(args.input, args.output)
#!/usr/bin/env python
#
# insert_dist.py
#
# Given path to a text file containing Picard's CollectInsertSizeMetrics
# results, create a new graph.
#
# (c) 2013 Wibowo Arindrarto [SASC - LUMC]
import argparse
import locale
import os
import re
import textwrap
from collections import namedtuple
from functools import partial
# for headless matplotlib
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
# set locale and formatter for axis ticks
locale.setlocale(locale.LC_ALL, '')
groupdig = lambda x, pos: locale.format('%d', x, grouping=True)
major_formatter = FuncFormatter(groupdig)
int_fmt = partial(locale.format, grouping=True, percent='%i')
def multi_annotate(ax, title, xy_arr=[], *args, **kwargs):
"""Axis annotation function that targets multiple data points."""
ans = []
an = ax.annotate(title, xy_arr[0], *args, **kwargs)
ans.append(an)
d = {}
if 'xycoords' in kwargs:
d['xycoords'] = kwargs['xycoords']
if 'arrowprops' in kwargs:
d['arrowprops'] = kwargs['arrowprops']
for xy in xy_arr[1:]:
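# anchor later annotations to the first one and hide their text (alpha=0),
# so a single visible label gets an arrow to every point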
an = ax.annotate(title, xy, alpha=0.0, xytext=(0, 0), textcoords=an, **d)
ans.append(an)
return ans
def parse_insert_sizes_histogram(fname):
"""Given a filename or a file object of a Picard COllectInsertSizeMetrics
output, return the filename, the histogram column names, and the histogram
data."""
if isinstance(fname, basestring):
fp = open(fname, 'r')
else:
fp = fname
line = fp.readline()
while True:
if not line:
raise ValueError("Unexpected end of file")
# try to get the original bam file name
elif 'net.sf.picard.analysis.CollectInsertSizeMetrics' in line:
input = re.search(r'INPUT=([^\s]*)', line).group(1)
bamname = os.path.basename(input)
elif line.startswith('## HISTOGRAM'):
break
line = fp.readline()
# get column names
colnames = fp.readline().strip().split('\t')
# iterate over the histogram data lines
# and fill up missing data with 0s
data = []
counter = 0
for line in fp:
if not line.strip():
break
# bin number starts at 1
tokens = [int(x) for x in line.split('\t')]
numcol = len(tokens) - 1
if counter == tokens[0] - 1:
data.append(tokens[1:])
counter += 1
else:
while tokens[0] - counter != 1:
data.append([0] * numcol)
counter += 1
data.append(tokens[1:])
counter += 1
return bamname, colnames, data
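# The parsed section of the Picard output looks roughly like this
# (abbreviated, hypothetical counts):
#   ## HISTOGRAM    java.lang.Integer
#   insert_size     All_Reads.fr_count
#   1       0
#   2       3
#   ...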
def graph_insert_sizes(fname, outname='test.png'):
"""Given a Picard CollectInsertSizes text output filename, write graph(s)
for the histogram."""
bamname, colnames, hist = parse_insert_sizes_histogram(fname)
# map Picard's insert type (based on its column name)
# to our own name and color
InsType = namedtuple('InsType', ['label', 'color'])
design_map = {
# 5' --F--> <--R-- 5'
'fr_count': InsType('inward', '#009933'),
# <--R-- 5' 5' --F-->
'rf_count': InsType('outward', 'orange'),
# 5' --F--> 5' --F--> or <--R-- 5' <--R-- 5'
'tandem_count': InsType('same directions', '#e62e00'),
}
fig = plt.figure()
ax = plt.subplot(111)
for idx, col in enumerate(colnames[1:]):
pcd_name = col.split('.')[-1]
try:
label = design_map[pcd_name].label
color = design_map[pcd_name].color
except KeyError:
raise ValueError("Unexpected column name: %r" % col)
data = [m[idx] for m in hist]
plt.bar(range(len(hist)), data, width=1, linewidth=0, color=color,
alpha=0.6, label=label)
max_val = max(data)
max_val_size = data.index(max_val)
highest_points = [(size, max_val) for size, val in enumerate(data) if val == max_val]
x_adj = int(len(data) * 0.1)
y_adj = int(max_val * 0.1)
bbox_props = dict(boxstyle="round", fc="w", edgecolor='black', alpha=1.0)
multi_annotate(ax,
'max count: {0}\nsize: {1} bp'.format(int_fmt(value=max_val),
', '.join([str(x[0]) for x in highest_points])),
xy_arr=highest_points,
xytext=(max_val_size + x_adj, max_val + y_adj),
fontsize=9, bbox=bbox_props,
horizontalalignment='left', verticalalignment='center',
arrowprops=dict(color='black', shrink=0.1, width=0.5, headwidth=2.5, ),)
# adjust ylim to account for annotation box
init_ylim = ax.get_ylim()
ax.set_ylim(0, init_ylim[1] * 1.08)
# set title and its spacing
title = 'Insert Size Distribution'
t = plt.title('\n'.join([title] + textwrap.wrap('%r' % bamname, 50)),
fontsize=15)
t.set_y(1.05)
plt.legend()
plt.xlabel("Insert Size")
plt.ylabel("Alignment Count")
ax.yaxis.set_major_formatter(major_formatter)
ax.grid(True)
plt.savefig(outname, bbox_inches='tight')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('input', help='input file')
parser.add_argument('output', nargs='?', default='test.png', help='output image file')
args = parser.parse_args()
graph_insert_sizes(args.input, args.output)
#!/usr/bin/env python
#
# parse_cuffcmp.py
#
# Parses cuffcompare's cuffcmp.stats output into a JSON file.
#
# Part of the Gentrap pipeline.
#
# (c) 2013 by Wibowo Arindrarto [LUMC - SASC]
import argparse
import json
import locale
import os
import re
# set locale to group digits
locale.setlocale(locale.LC_ALL, '')
# precompiled regex patterns
_base_qr = r'\s+(\d+)/(\d+)'
_base_table = r'\s+(.*?)\s+(.*?)\s+(.*?)\s+(.*?)\s+'
# source transcripts gtf
_re_dataset = re.compile(r'Summary for dataset:\s+(.*?)\s+:')
# ref exons not covered by query exons, total ref exons
_re_rexons = re.compile(r'Missed exons:%s' % _base_qr)
# query exons not covered by ref exons, total query exons
_re_qexons = re.compile(r'Novel exons:%s' % _base_qr)
# ref introns not covered by query introns, total ref introns
_re_rintrons = re.compile(r'Missed introns:%s' % _base_qr)
# query introns not covered by ref introns, total query introns
_re_qintrons = re.compile(r'Novel introns:%s' % _base_qr)
# ref loci not covered by query loci, total ref loci
_re_rloci = re.compile(r'Missed loci:%s' % _base_qr)
# query loci not covered by ref loci, total query loci
_re_qloci = re.compile(r'Novel loci:%s' % _base_qr)
# base level metrics
_re_base = re.compile(r'Base level:%s' % _base_table)
# exon level metrics
_re_exon = re.compile(r'Exon level:%s' % _base_table)
# intron level metrics
_re_intron = re.compile(r'Intron level:%s' % _base_table)
# intron chain level metrics
_re_intron_chain = re.compile(r'Intron chain level:%s' % _base_table)
# transcript level metrics
_re_transcript = re.compile(r'Transcript level:%s' % _base_table)
# locus level metrics
_re_locus = re.compile(r'Locus level:%s' % _base_table)
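# The patterns above target cuffcmp.stats lines that look roughly like
# (abbreviated, hypothetical numbers):
#   Base level:     91.3    60.4    ...
#   Missed exons:   1456/187559     (  0.8%)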
def _fallback_search(re_pattern, string, match_type, fallback_str, group,
replacement=None):
"""Function to handle cases when the regex match is of a different type,
e.g. '-' instead of an integer."""
match = re.search(re_pattern, string).group(group)
if match == fallback_str:
return replacement
else:
return match_type(match)
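# e.g. when cuffcompare prints '-' for an undefined metric, the float() cast
# would fail, so _fallback_search(..., float, '-', 1) returns None instead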
def parse_cuffcmp_stats(stat_file):
"""Parses the statistics in the given cuffcmp.stats file into a
dictionary."""
assert os.path.exists(stat_file), "File %r not found" % stat_file
with open(stat_file, 'r') as source:
# not expecting a huge output, we can store everything in memory
stat_str = source.read()
stats = {
'dataSet': re.search(_re_dataset, stat_str).group(1),
'refExonsNotInQuery': int(re.search(_re_rexons, stat_str).group(1)),
'refExonsTotal': int(re.search(_re_rexons, stat_str).group(2)),
'queryExonsNotInRef': int(re.search(_re_qexons, stat_str).group(1)),
'queryExonsTotal': int(re.search(_re_qexons, stat_str).group(2)),
'refIntronsNotInQuery': int(re.search(_re_rintrons, stat_str).group(1)),
'refIntronsTotal': int(re.search(_re_rintrons, stat_str).group(2)),
'queryIntronsNotInRef': int(re.search(_re_qintrons, stat_str).group(1)),
'queryIntronsTotal': int(re.search(_re_qintrons, stat_str).group(2)),
'refLociNotInQuery': int(re.search(_re_rloci, stat_str).group(1)),
'refLociTotal': int(re.search(_re_rloci, stat_str).group(2)),
'queryLociNotInRef': int(re.search(_re_qloci, stat_str).group(1)),
'queryLociTotal': int(re.search(_re_qloci, stat_str).group(2)),
'baseLevelSn': _fallback_search(_re_base, stat_str, float, '-', 1),
'baseLevelSp': _fallback_search(_re_base, stat_str, float, '-', 2),
'baseLevelFSn': _fallback_search(_re_base, stat_str, float, '-', 3),
'baseLevelFSp': _fallback_search(_re_base, stat_str, float, '-', 4),