Commit 59b8f961 authored by Sander Bollen's avatar Sander Bollen
Browse files

Merge branch 'fix-picard' into 'develop'

Added picard R scipts as workaround till issue is resolved

Workaround for #194 

Will first test this before merge.

See merge request !221
parents 9a2881e9 000e2882
# Script to generate a chart of the base distribution by cycle
# @author Nils Homer
# Parse the arguments
args <- commandArgs(trailing=T);
metricsFile <- args[1];
outputFile <- args[2];
bamFile <- args[3];
subtitle <- ifelse(length(args) < 4, "", args[4]);
# Figure out where the metrics and the histogram are in the file and parse them out
startFinder <- scan(metricsFile, what="character", sep="\n", quiet=TRUE, blank.lines.skip=FALSE);
firstBlankLine=0;
for (i in 1:length(startFinder)) {
if (startFinder[i] == "") {
if (firstBlankLine==0) {
firstBlankLine=i+1;
} else {
secondBlankLine=i+1;
break;
}
}
}
metrics <- read.table(metricsFile, header=T, sep="\t", skip=firstBlankLine);
# Then plot the histogram as a PDF
pdf(outputFile);
plot(x=c(1, 20+nrow(metrics)),
y=c(0, max(metrics[,3:7])),
main=paste("Base Distribution by Cycle\nin file ",bamFile," ",ifelse(subtitle == "","",paste("(",subtitle,")",sep="")),sep=""),
xlab="Cycle",
ylab="Base Percentage",
type="n");
colors = c("red", "orange", "blue", "purple", "black");
for (i in 1:5) {
lines(x=1:nrow(metrics),
y=metrics[,2+i],
col=colors[i],
type="l",
lty=1);
}
legend("bottomright", lwd=1, legend=c("PCT_A", "PCT_C", "PCT_G", "PCT_T", "PCT_N"), col=colors);
dev.off();
# Script to generate a chart to display GC bias based upon read starts observed
# in windows along the genome.
#
# @author Tim Fennell
# Parse the arguments
args <- commandArgs(trailing=T)
metricsFile <- args[1]
outputFile <- args[2]
datasetName <- args[3]
subtitle <- args[4]
windowSize <- args[5]
# Figure out where the metrics and the histogram are in the file and parse them out
startFinder <- scan(metricsFile, what="character", sep="\n", quiet=TRUE, blank.lines.skip=FALSE)
firstBlankLine=0
for (i in 1:length(startFinder)) {
if (startFinder[i] == "") {
if (firstBlankLine==0) {
firstBlankLine=i+1
} else {
secondBlankLine=i+1
break
}
}
}
metrics <- read.table(metricsFile, header=T, sep="\t", skip=firstBlankLine)
pdf(outputFile)
# Some constants that are used below
Y_AXIS_LIM = 2;
MAX_QUALITY_SCORE = 40;
COLORS = c("royalblue", "#FFAAAA", "palegreen3");
# Adjust to give more margin on the right hand side
par(mar = c(5, 4, 4, 4));
# Do the main plot of the normalized coverage by GC
plot(type="p", x=metrics$GC, y=metrics$NORMALIZED_COVERAGE,
xlab=paste(c("GC% of", windowSize, "base windows"), sep=" ", collapse=" "),
ylab="Fraction of normalized coverage",
xlim=c(0,100),
ylim=c(0, Y_AXIS_LIM),
col=COLORS[1],
main=paste(datasetName, "GC Bias Plot", "\n", subtitle)
);
# Add lines at the 50% GC and coverage=1
abline(h=1, v=50, col="lightgrey");
# Add error bars
arrows(metrics$GC,
metrics$NORMALIZED_COVERAGE - metrics$ERROR_BAR_WIDTH,
metrics$GC,
metrics$NORMALIZED_COVERAGE + metrics$ERROR_BAR_WIDTH,
code = 3, angle = 90, length = 0.05, col="grey");
# Plot count of windows as a separate series near the bottom
window_ratio = 0.5 / max(metrics$WINDOWS);
scaled_windows = metrics$WINDOWS * window_ratio;
lines(metrics$GC, scaled_windows, type="h", col=COLORS[2], lwd=3);
# Plot the quality series
lines(metrics$GC, metrics$MEAN_BASE_QUALITY * Y_AXIS_LIM / MAX_QUALITY_SCORE, type="l", col=COLORS[3]);
axis(4,
at=c(0, Y_AXIS_LIM/4, Y_AXIS_LIM/4*2, Y_AXIS_LIM/4*3, Y_AXIS_LIM),
labels=c(0, MAX_QUALITY_SCORE/4, MAX_QUALITY_SCORE/4*2, MAX_QUALITY_SCORE/4*3, MAX_QUALITY_SCORE)
);
mtext("Mean base quality", side=4, line=2.5);
# And finally add a legend
legend("topleft", pch=c(1,15, 45), legend=c("Normalized Coverage", "Windows at GC%", "Base Quality at GC%"), col=COLORS)
dev.off();
\ No newline at end of file
## script to generate histogram of insert sizes from metrics file
## expecting 3 arguments:
## first is the metrics file with the histogram info
## second is the output file
## third is a name for the plot
args <- commandArgs(trailing=TRUE)
metricsFile <- args[1]
pdfFile <- args[2]
bamName <- args[3]
histoWidth <- ifelse(length(args) < 4, 0, as.numeric(args[4]))
startFinder <- scan(metricsFile, what="character", sep="\n", quiet=TRUE, blank.lines.skip=FALSE)
firstBlankLine=0
for (i in 1:length(startFinder)) {
if (startFinder[i] == "") {
if (firstBlankLine==0) {
firstBlankLine=i+1
} else {
secondBlankLine=i+1
break
}
}
}
histogram <- read.table(metricsFile, header=TRUE, sep="\t", skip=secondBlankLine, comment.char="", quote='', check.names=FALSE)
## The histogram has a fr_count/rf_count/tandem_count for each metric "level"
## This code parses out the distinct levels so we can output one graph per level
headers <- sapply(sub(".fr_count","",names(histogram),fixed=TRUE), "[[" ,1)
headers <- sapply(sub(".rf_count","",headers,fixed=TRUE), "[[" ,1)
headers <- sapply(sub(".tandem_count","",headers,fixed=TRUE), "[[" ,1)
## Duplicated header names cause this to barf. KT & Yossi report that this is going to be extremely difficult to
## resolve and it's unlikely that anyone cares anyways. Trap this situation and avoid the PDF so it won't cause
## the workflow to fail
if (any(duplicated(headers))) {
print(paste("Not creating insert size PDF as there are duplicated header names:", headers[which(duplicated(headers))]))
} else {
levels <- c()
for (i in 2:length(headers)) {
if (!(headers[i] %in% levels)) {
levels[length(levels)+1] <- headers[i]
}
}
pdf(pdfFile)
for (i in 1:length(levels)) {
## Reconstitutes the histogram column headers for this level
fr <- paste(levels[i], "fr_count", sep=".")
rf <- paste(levels[i], "rf_count", sep=".")
tandem <- paste(levels[i], "tandem_count", sep=".")
frrange = ifelse(fr %in% names(histogram), max(histogram[fr]), 0)
rfrange = ifelse(rf %in% names(histogram), max(histogram[rf]), 0)
tandemrange = ifelse(tandem %in% names(histogram), max(histogram[tandem]), 0)
yrange <- max(frrange, rfrange, tandemrange)
xrange <- ifelse(histoWidth > 0, histoWidth, max(histogram$insert_size))
plot(x=NULL, y=NULL,
type="n",
main=paste("Insert Size Histogram for", levels[i], "\nin file", bamName),
xlab="Insert Size",
ylab="Count",
xlim=range(0, xrange),
ylim=range(0, yrange))
colors <- c()
labels <- c()
if (fr %in% names(histogram) ) {
lines(histogram$insert_size, as.matrix(histogram[fr]), type="h", col="red")
colors <- c(colors, "red")
labels <- c(labels, "FR")
}
if (rf %in% names(histogram)) {
lines(histogram$insert_size, as.matrix(histogram[rf]), type="h", col="blue")
colors <- c(colors, "blue")
labels <- c(labels, "RF")
}
if (tandem %in% names(histogram)) {
lines(histogram$insert_size, as.matrix(histogram[tandem]), type="h", col="orange")
colors <- c(colors, "orange")
labels <- c(labels, "TANDEM")
}
## Create the legend
legend("topright", labels, fill=colors, col=colors, cex=0.7)
}
dev.off()
}
# Script to generate a chart of mean quality by cycle from a BAM file
# @author Tim Fennell
# Parse the arguments
args <- commandArgs(trailing=T)
metricsFile <- args[1]
outputFile <- args[2]
bamFile <- args[3]
subtitle <- ifelse(length(args) < 4, "", args[4])
# Figure out where the metrics and the histogram are in the file and parse them out
startFinder <- scan(metricsFile, what="character", sep="\n", quiet=TRUE, blank.lines.skip=FALSE)
firstBlankLine=0
for (i in 1:length(startFinder))
{
if (startFinder[i] == "") {
if (firstBlankLine==0) {
firstBlankLine=i+1
} else {
secondBlankLine=i+1
break
}
}
}
metrics <- read.table(metricsFile, header=T, nrows=1, sep="\t", skip=firstBlankLine)
histogram <- read.table(metricsFile, header=T, sep="\t", skip=secondBlankLine)
# Then plot the histogram as a PDF
pdf(outputFile)
plot(histogram$CYCLE,
histogram$MEAN_QUALITY,
type="n",
main=paste("Quality by Cycle\nin file ",bamFile," ",ifelse(subtitle == "","",paste("(",subtitle,")",sep="")),sep=""),
xlab="Cycle",
ylab="Mean Quality",
ylim=range(0,50))
qColor <- "darkblue"
oqColor <- rgb(1, 0.25, 0.25, 0.75)
# Plot OQ first so that it's "behind" the regular qualities
if (!is.null(histogram$MEAN_ORIGINAL_QUALITY)) {
lines(histogram$CYCLE, histogram$MEAN_ORIGINAL_QUALITY, type="l", col=oqColor, lty=1);
}
# Then plot the regular qualities
lines(histogram$CYCLE, histogram$MEAN_QUALITY, type="h", col=qColor, lty=1);
# And add a legend
legend("topleft", pch=c(15,15), legend=c("Mean Quality", "Mean Original Quality"), col=c(qColor, oqColor))
dev.off()
# Script to generate a chart of quality score distribution in a file
# @author Tim Fennell
# Parse the arguments
args <- commandArgs(trailing=T)
metricsFile <- args[1]
outputFile <- args[2]
bamFile <- args[3]
subtitle <- ifelse(length(args) < 4, "", args[4])
# Figure out where the metrics and the histogram are in the file and parse them out
startFinder <- scan(metricsFile, what="character", sep="\n", quiet=TRUE, blank.lines.skip=FALSE)
firstBlankLine=0
for (i in 1:length(startFinder))
{
if (startFinder[i] == "") {
if (firstBlankLine==0) {
firstBlankLine=i+1
} else {
secondBlankLine=i+1
break
}
}
}
metrics <- read.table(metricsFile, header=T, nrows=1, sep="\t", skip=firstBlankLine)
histogram <- read.table(metricsFile, header=T, sep="\t", skip=secondBlankLine)
# Then plot the histogram as a PDF
pdf(outputFile)
plot(histogram$QUALITY,
histogram$COUNT_OF_Q,
type="n",
main=paste("Quality Score Distribution\nin file ",bamFile," ",ifelse(subtitle == "","",paste("(",subtitle,")",sep="")),sep=""),
xlab="Quality Score",
ylab="Observations")
qColor <- "blue"
oqColor <- "lightcyan2"
width <- 5
# Plot OQ first so that it's "behind" the regular qualities
if (!is.null(histogram$COUNT_OF_OQ)) {
lines(histogram$QUALITY+0.25, histogram$COUNT_OF_OQ, type="h", col=oqColor, lty=1, lwd=width, lend="square");
}
# Then plot the regular qualities
lines(histogram$QUALITY, histogram$COUNT_OF_Q, type="h", col=qColor, lty=1, lwd=width, lend="square");
# And add a legend
legend("topleft", pch=c(15,15), legend=c("Quality Scores", "Original Quality Scores"), col=c(qColor, oqColor))
dev.off()
# Script to generate a normalized coverage vs. position along transcript plot.
#
# @author Tim Fennell
# Parse the arguments
args <- commandArgs(trailing = TRUE)
metricsFile <- args[1]
outputFile <- args[2]
bamName <- args[3]
subtitle <- ifelse(length(args) < 4, "", args[4])
# Figure out where the metrics and the histogram are in the file and parse them out
startFinder <- scan(metricsFile, what="character", sep="\n", quiet=TRUE, blank.lines.skip=FALSE)
firstBlankLine=0
for (i in 1:length(startFinder)) {
if (startFinder[i] == "") {
if (firstBlankLine==0) {
firstBlankLine=i+1
} else {
secondBlankLine=i+1
break
}
}
}
data <- read.table(metricsFile, header=T, sep="\t", skip=secondBlankLine, check.names=FALSE)
# The histogram has a normalized_position and normalized_coverage column for each metric "level"
# This code parses out the distinct levels so we can output one graph per level
headers <- sapply(sub(".normalized_coverage","",names(data),fixed=TRUE), "[[" ,1)
## Duplicated header names cause this to barf. KT & Yossi report that this is going to be extremely difficult to
## resolve and it's unlikely that anyone cares anyways. Trap this situation and avoid the PDF so it won't cause
## the workflow to fail
if (any(duplicated(headers))) {
print(paste("Not creating insert size PDF as there are duplicated header names:", headers[which(duplicated(headers))]))
} else {
pdf(outputFile)
levels <- c()
for (i in 2:length(headers)) {
if (!(headers[i] %in% levels)) {
levels[length(levels)+1] <- headers[i]
}
}
# Some constants that are used below
COLORS = c("royalblue", "#FFAAAA", "palegreen3");
# For each level, plot of the normalized coverage by GC
for (i in 1:length(levels)) {
# Reconstitutes the histogram column header for this level
nc <- paste(levels[i], "normalized_coverage", sep=".")
plot(x=data$normalized_position, y=as.matrix(data[nc]),
type="o",
xlab="Normalized Distance Along Transcript",
ylab="Normalized Coverage",
xlim=c(0, 100),
ylim=range(0, max(data[nc])),
col="royalblue",
main=paste("RNA-Seq Coverage vs. Transcript Position\n", levels[i], " ", ifelse(subtitle=="", "", paste("(", subtitle, ")", sep="")), "\nin file ", bamName,sep=""))
# Add a horizontal line at coverage=1
abline(h=1, col="lightgrey");
}
dev.off();
}
\ No newline at end of file
##
## The MIT License
##
## Copyright (c) 2013 The Broad Institute
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to deal
## in the Software without restriction, including without limitation the rights
## to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
## copies of the Software, and to permit persons to whom the Software is
## furnished to do so, subject to the following conditions:
##
## The above copyright notice and this permission notice shall be included in
## all copies or substantial portions of the Software.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
## THE SOFTWARE.
##
args = commandArgs(trailingOnly=TRUE)
opt = list(details.fn=args[1], summary.fn=args[2], output.fn=args[3])
read_metrics_file = function(metrics.fn) {
contents = read.delim(metrics.fn, comment.char="#", stringsAsFactors=FALSE)
return(contents)
}
equals_or_is_na = function(x1, x2) {
if (is.na(x1)) {
return(is.na(x2))
} else {
return(x1 == x2)
}
}
details = read_metrics_file(opt$details.fn)
summary = read_metrics_file(opt$summary.fn)
pdf(opt$output.fn)
par(mfrow=c(2,2), oma=c(0,0,2,0))
for (i in seq_len(nrow(summary))) {
cur_summary = summary[i, ]
cur_sample = cur_summary[1, "SAMPLE"]
cur_library = cur_summary[1, "LIBRARY"]
cur_read_group = cur_summary[1, "READ_GROUP"]
cur_details = details[which((equals_or_is_na(cur_library, details[, "LIBRARY"]) &
(equals_or_is_na(cur_sample, details[, "SAMPLE"])) &
(equals_or_is_na(cur_read_group, details[, "READ_GROUP"])))), ]
## Plot conversion rates
cpg.converted = sum(cur_details$CONVERTED_SITES)
cpg.seen = sum(cur_details$TOTAL_SITES)
cpg.conversion = cpg.converted / cpg.seen
total.conversion = (cpg.converted + cur_summary$NON_CPG_CONVERTED_BASES) / (cpg.seen + cur_summary$NON_CPG_BASES)
barplot(c("non-CpG"=cur_summary$PCT_NON_CPG_BASES_CONVERTED, "Combined"=total.conversion, "CpG"=cpg.conversion),
ylim=c(0.95, 1), ylab="% Conversion", xlab="Distribution", main="Bisulfite Conversion Rate",
col="blue", xpd=FALSE)
abline(h=0.995, col="grey")
## Plot histogram of CpG counts by conversion rate
hist(cur_details$PCT_CONVERTED, 10, xlab="Conversion Rate Of CpGs", ylab="# CpGs",
main="CpG Conversion Rate Distribution", col="blue")
## Plot pie chart showing distribution of CpG coverage
coverage_breaks = c(0, 1, 5, 10, 25, 50, 100, Inf)
coverage_cut = cut(cur_details$TOTAL_SITES, coverage_breaks)
cpg_coverage = split(cur_details$TOTAL_SITES, coverage_cut)
coverages = sapply(cpg_coverage, length)[2:7]
names(coverages) = paste(">=", c(1, 5, 10, 25, 50, 100), sep="")
## If we have 0s all across the pie chart will be effectively meaningless but put in a 100% >= 0 field instead
## to avoid an error on pie(). Normally it'd just be a pain to see these, but ...
if (all(coverages == 0)) {
coverages = c("No Coverage"=1)
}
color_ramp = colorRampPalette(c("white", "#538ED5", "blue"), bias=1, space="Lab")
colors = color_ramp(length(coverages))[2:length(coverages)]
pie(coverages, main="Distribution Of CpGs By Coverage", col=colors, clockwise=TRUE)
discards = log10(c("Mismatches"=cur_summary$READS_IGNORED_MISMATCHES, "Size"=cur_summary$READS_IGNORED_SHORT))
## Protect against -Inf in the case where we had 0 discards
discards = ifelse(is.finite(discards), discards, 0)
barplot(discards, ylab="Number Discarded (log10)", xlab="Reason",
main="Reads Discarded", col="blue", ylim=c(0, ceiling(max(discards))))
header_txt = character()
if (!is.na(cur_sample) && cur_sample != "") {
header_txt = paste(header_txt, " SAMPLE=", cur_sample, sep="")
}
if (!is.na(cur_library) && cur_library != "") {
header_txt = paste(header_txt, " LIBRARY=", cur_library, sep="")
}
if (!is.na(cur_read_group) && cur_read_group != "") {
header_txt = paste(header_txt, " READ GROUP=", cur_read_group, sep="")
}
if (length(header_txt) > 0) {
mtext(header_txt, outer=TRUE, line=1)
}
}
dev.off()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment