diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/aggr_base_count.R b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/aggr_base_count.R deleted file mode 100755 index df4e0bd02efe530164e34ebf3cfac4982362065a..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/aggr_base_count.R +++ /dev/null @@ -1,237 +0,0 @@ -#!/usr/bin/env Rscript - -# aggr_base_count.R -# -# Given a count file, write tab-delimited file(s) aggregating the counts -# at gene and/or exon level. -# -# (c) 2013 by Wibowo Arindrarto [LUMC - SASC] -# Adapted from Peter-Bram 't Hoen's script: 'merge_table_script_shark_PH3.r' - -# General function to install a package if it does not exist; -# otherwise, it only loads the package -usePackage <- function(p) { - r <- getOption("repos") - r["CRAN"] <- "http://cran.us.r-project.org" - options(repos = r) - rm(r) - if (!is.element(p, installed.packages()[,1])) - install.packages(p, dep = TRUE) - require(p, character.only = TRUE) -} - -usePackage("getopt") - -## FLAGS ## -LEVELS <- c('gene', 'exon') -OUT.DIR <- getwd() -DEBUG <- FALSE -if (DEBUG) { - message("## DEBUG MODE ##") -} - - -## FUNCTIONS ## -CheckCountFiles <- function(count.files, DEBUG=FALSE) { - # Given a vector of count file paths, checks whether each file exists. - # - # Count files are the input files used to analyze the RNA-Seq expression - # levels. They must conform to the following file name: - # '{sample_name}/{sample_name}.count' - # - # Args: - # - count.files: string vector of count file paths - # - DEBUG: whether to emit a message for each existing path - for (cfile in count.files) { - - if (!file.exists(cfile)) { - stop(paste("Path '", cfile, "' does not exist. Exiting.", sep="")) - } - - if (DEBUG) { - message("Path '", cfile, "' exists.") - } - } -} - -CountBaseExons <- function(count.files, count.names, - col.header=c("gene", "chr", "start", "stop")) { - # Given a list of count files, return a data frame containing their values. - # - # Each count file must be a tab-separated file containing the following - # columns in the same order: - # 1. chromosome - # 2. start position - # 3. stop position - # 4. total nucleotide counts - # 5. nucleotide counts per exon - # 6. gene name - # - # The returned data frame has the following columns: - # - # 1. gene name - # 2. chromosome - # 3. start position - # 4. stop position - # 5... total nucleotide counts for each sample - # - # This function assumes that for all count files, the values of the first - # three columns are the same for each row. - # - # Args: - # - count.files: string vector of count file paths - # - count.names: string vector of count file labels, in the same order - # as count.files - # - col.header: string vector of default data frame output headers - - # given a count file path, extract its fourth column - GetNucCount <- function(x) { - read.table(x, as.is=TRUE)[4] - } - - # initial data frame is from the first file - exon.counts <- read.table(count.files[1], as.is=TRUE) - exon.counts <- exon.counts[, c(6, 1:3, 4)] - colnames(exon.counts)[1:5] <- append(col.header, count.names[1]) - - if (length(count.files) > 1) { - # slice out the remaining items; R has no dedicated shorthand for this 
- remaining.files <- count.files[2: length(count.files)] - remaining.names <- count.names[2: length(count.names)] - # append all nucleotide counts from remaining files to exon.counts - exon.counts <- cbind(exon.counts, lapply(remaining.files, GetNucCount)) - # and rename these columns accordingly - end.idx <- 5 + length(remaining.files) - colnames(exon.counts)[6:end.idx] <- remaining.names - } - - return(exon.counts) -} - -CountExons <- function(exon.df) { - # Given a data frame containing exon counts, return a data frame consisting of - # compacted exon counts. - # - # In a compacted exon count data frame, each exon has its own unique name - # consisting of its gene source and its start-stop coordinates. - # - # Args: - # - exon.df: data frame of complete exon counts - - - # create a new data frame of the exon counts, concatenating the gene name - # and the exon start-stop coordinates - exon.dis.counts <- cbind(paste(paste(exon.df$gene, exon.df$start, - sep=":"), exon.df$stop, sep="-"), - exon.df[5: length(exon.df)]) - colnames(exon.dis.counts)[1] <- "exon" - counts.in.samples <- as.matrix(exon.dis.counts[2:ncol(exon.dis.counts)]) - exon.counts <- aggregate(counts.in.samples ~ exon, data=exon.dis.counts, FUN=sum, - na.rm=TRUE) - colnames(exon.counts)[2:ncol(exon.counts)] <- colnames(counts.in.samples) - - return (exon.counts) -} - -CountGenes <- function(exon.df) { - # Given a data frame containing exon counts, return a data frame of gene - # counts. - # - # See CountBaseExons for the input data frame format. - # - # Args: - # - exon.df: data frame of complete exon counts - - # basically an aggregate of exon counts with the same gene name - counts.in.samples <- as.matrix(exon.df[5:ncol(exon.df)]) - gene.counts <- aggregate(counts.in.samples ~ gene, data=exon.df, FUN=sum, - na.rm=TRUE) - # first column is gene - colnames(gene.counts)[2:ncol(gene.counts)] <- colnames(counts.in.samples) - - return(gene.counts) -} - - -# load package for arg parsing -library('getopt') - -# create spec for arg parsing -spec <- matrix(c( - # colon-separated paths to each count file - 'count-file', 'I', 1, 'character', - # colon-separated labels for each count file; order must be the same - # as the count files - 'count-name', 'N', 1, 'character', - # output file for gene level counts - 'gene-count', 'G', 1, 'character', - # output file for exon level counts - 'exon-count', 'E', 1, 'character', - # output directory (optional; defaults to the working directory) - 'output-dir', 'O', 1, 'character', - # help - 'help', 'H', 0, 'logical' -), byrow=TRUE, ncol=4) -opt <- getopt(spec) - -# print help if requested -if (!is.null(opt[['help']])) { - cat(getopt(spec, usage=TRUE)) - q(status=1) -} - -# we need the gene-count and/or exon-count flag -if (is.null(opt[['gene-count']]) & is.null(opt[['exon-count']])) { - message("Error: at least one of '--gene-count' or '--exon-count' must have a value.") - q(status=1) -} - -# set fallback values for optional args -if (!is.null(opt[['output-dir']])) { - OUT.DIR <- normalizePath(opt[['output-dir']]) - # create directory if it doesn't exist - dir.create(OUT.DIR, showWarnings=FALSE) -} - -# parse the input file paths and check their presence -if (!is.null(opt[['count-file']])) { - count.files <- opt[['count-file']] - count.files <- unlist(strsplit(gsub(' ', '', count.files), ':')) - CheckCountFiles(count.files, DEBUG) -} else { - stop("Required input count file path(s) not present. 
Exiting.") -} - -# parse the input count labels and check if its length is the same as the input -# files -if (!is.null(opt[['count-name']])) { - count.names <- opt[['count-name']] - count.names <- unlist(strsplit(gsub(' ', '', count.names), ':')) - if (length(count.names) != length(count.files)) { - stop("Mismatched count file paths and labels. Exiting.") - } -} else { - stop("Required input count file label(s) not present. Exiting.") -} - -# set output file name for gene counts -if (!is.null(opt[['gene-count']])) { - gene.out <- opt[['gene-count']] -} else { - gene.out <- NULL -} - -# set output file name for exon counts -if (!is.null(opt[['exon-count']])) { - exon.out <- opt[['exon-count']] -} else { - exon.out <- NULL -} - -# count base exons (complete with coordinates) -base.exon.counts <- CountBaseExons(count.files, count.names) - -# and write output files, depending on the flags -if (!is.null(gene.out)) { - gene.counts <- CountGenes(base.exon.counts) - write.table(gene.counts, file = gene.out, sep = "\t", quote = FALSE, row.names = FALSE) -} -if (!is.null(exon.out)) { - exon.counts <- CountExons(base.exon.counts) - write.table(exon.counts, file = exon.out, sep = "\t", quote = FALSE, row.names = FALSE) -} diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/bam_rna.py b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/bam_rna.py deleted file mode 100755 index d2c14cff2588f01c7a9e3ba87ab5558244cb856e..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/bam_rna.py +++ /dev/null @@ -1,391 +0,0 @@ -#!/usr/bin/env python2.7 -# -# Biopet is built on top of GATK Queue for building bioinformatic -# pipelines. It is mainly intended to support LUMC SHARK cluster which is running -# SGE. But other types of HPC that are supported by GATK Queue (such as PBS) -# should also be able to execute Biopet tools and pipelines. -# -# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center -# -# Contact us at: sasc@lumc.nl -# -# A dual licensing mode is applied. The source code within this project that are -# not part of GATK Queue is freely available for non-commercial use under an AGPL -# license; For commercial users or users who do not want to follow the AGPL -# license, please contact us to obtain a separate license. -# - - -# bam_rna.py -# -# Generate read pair and alignment statistics for a BAM file. 
-# -# Part of the Gentrap pipeline -# -# (c) 2013 Wibowo Arindrarto [SASC - LUMC] - -import argparse -import json -import locale -import os -from collections import OrderedDict -from functools import partial - -import pysam - -# set locale to group digits -locale.setlocale(locale.LC_ALL, '') -# formatters for output -int_fmt = partial(locale.format, grouping=True, percent='%i') -float_fmt = partial(locale.format, grouping=True, percent='%.2f') - -# -F 0x4 -func_nmap = lambda rec: not rec.flag & 0x4 -# -f 0xC -func_nunmap = lambda rec: (rec.flag & 0x4) and (rec.flag & 0x8) -# -F 0xC -func_nmap_pair = lambda rec: not rec.flag & 0xC -# -f 0x2 -func_nmap_pair_ok = lambda rec: rec.flag & 0x2 -# mapped to different chromosomes / reference sequences -func_nmap_diffchr = lambda rec: rec.rnext != rec.tid -# -F 0x4 -f 0x8 -func_nmap_sgltn = lambda rec: (not rec.flag & 0x4) and (rec.flag & 0x8) -# check if spliced -func_splice = lambda rec: 'N' in rec.cigarstring -# check if pass QC -# not bool since we always need to run each read through it -func_qc = lambda rec: 1 if rec.flag & 0x200 else 0 - -FLAGS = OrderedDict(( - ('total', 'Total'), - ('unmapped', 'Unmapped'), - ('mapped', 'Mapped'), - ('mappedPair', 'Mapped pairs'), - ('mappedPairProper', 'Properly mapped pairs'), - ('mappedDiffChr', 'Different chromosomes'), - ('mappedDiffChrQ', 'Different chromosomes, MAPQ >=5'), - ('singleton', 'Singletons'), - ('totalSplice', 'Split reads, total'), - ('splicePairProper', 'Split reads, properly mapped'), - ('spliceSingleton', 'Split reads, singletons'), -)) - -class BarnStat(object): - - """Class representing a collection of BAM statistics for RNA-seq data.""" - - def __init__(self, bamfile, read_pair_suffix_len=0, id_sorted=False, - validate=False): - assert os.path.exists(bamfile), "BAM file %r not found." % bamfile - self.validate = validate - self.bamfile = bamfile - - self.flags = FLAGS.keys() - # length of read pair suffix (e.g. '/2' has len == 2) - self.suflen = read_pair_suffix_len - if not id_sorted: - self._count_unsorted() - else: - self._count_sorted() - - self._format_counts() - - def _adjust_counts(self, alns, reads): - """Adjusts the alignment and read counts.""" - # we need to adjust the unmapped counts for alignments as each - # alignments consists of two reads (one read pair) which may be mapped - # multiple times as singletons and/or whole read pairs. 
- # so unmapped alns is always == unmapped read pairs + singleton *reads* - # counts (proxy for the set of unmapped singletons) - if 'unmapped' in alns: - alns['unmapped'] += reads['singleton'] - else: - # tophat, splits unmapped reads into its own BAM file - alns['unmapped'] = 0 - - return alns, reads - - def _count_sorted(self): - """Counts read and alignment statistics for ID-sorted BAM file.""" - flags = self.flags - reads, alns, alns_qc = {}, {}, {} - read_flags = dict.fromkeys(flags, False) - cur_qname = None - - # init counts - for flag in flags: - reads[flag], alns[flag], alns_qc[flag] = 0, 0, 0 - - # iterate over each record - # index for suffix removal, if suffix exist (> 0) - if self.suflen: - sufslice = slice(-self.suflen) - else: - sufslice = slice(None) - for rec in pysam.Samfile(self.bamfile, 'rb'): - # different qname mean we've finished parsing each unique read - # so reset the flags and add them to the counters appropriately - if cur_qname != rec.qname[sufslice]: - for flag in flags: - reads[flag] += int(read_flags[flag]) - # reset the qname tracker - cur_qname = rec.qname[sufslice] - # and the read flag tracker - read_flags = dict.fromkeys(flags, False) - # total, total splice - alns['total'] += 1 - alns_qc['total'] += func_qc(rec) - read_flags['total'] = True - if func_splice(rec): - alns['totalSplice'] += 1 - alns_qc['totalSplice'] += func_qc(rec) - read_flags['totalSplice'] = True - # unmapped - if func_nunmap(rec): - alns['unmapped'] += 1 - alns_qc['unmapped'] += func_qc(rec) - read_flags['unmapped'] = True - else: - # mapped - if func_nmap(rec): - alns['mapped'] += 1 - alns_qc['mapped'] += func_qc(rec) - read_flags['mapped'] = True - # mapped pairs - if func_nmap_pair(rec): - alns['mappedPair'] += 1 - alns_qc['mappedPair'] += func_qc(rec) - read_flags['mappedPair'] = True - # proper pairs, proper pairs splice - if func_nmap_pair_ok(rec): - alns['mappedPairProper'] += 1 - alns_qc['mappedPairProper'] += func_qc(rec) - read_flags['mappedPairProper'] = True - if func_splice(rec): - alns['splicePairProper'] += 1 - alns_qc['splicePairProper'] += func_qc(rec) - read_flags['splicePairProper'] = True - # mapped to different chromosomes - elif func_nmap_diffchr(rec): - alns['mappedDiffChr'] += 1 - alns_qc['mappedDiffChr'] += func_qc(rec) - read_flags['mappedDiffChr'] = True - if rec.mapq >= 5: - alns['mappedDiffChrQ'] += 1 - alns_qc['mappedDiffChrQ'] += func_qc(rec) - read_flags['mappedDiffChrQ'] = True - # singletons, singletons splice - elif func_nmap_sgltn(rec): - alns['singleton'] += 1 - alns_qc['singleton'] += func_qc(rec) - read_flags['singleton'] = True - if func_splice(rec): - alns['spliceSingleton'] += 1 - alns_qc['spliceSingleton'] += func_qc(rec) - read_flags['spliceSingleton'] = True - - # for the last read, since we don't pass the qname check again - for flag in flags: - reads[flag] += int(read_flags[flag]) - - self.aln_counts, self.read_counts = self._adjust_counts(alns, reads) - self.aln_qc_counts = alns_qc - if self.validate: - assert self.validate_counts() - - def _count_unsorted(self): - """Counts read and alignment statistics for non-ID-sorted BAM file.""" - flags = self.flags - reads_total, reads_unmap, reads_map, reads_pair_map, reads_pair_proper, \ - reads_sgltn, reads_total_splice, reads_pair_proper_splice, \ - reads_sgltn_splice, reads_pair_diffchr, reads_pair_diffchr_h = \ - set(), set(), set(), set(), set(), set(), set(), set(), set(), \ - set(), set() - - reads, alns, alns_qc = {}, {}, {} - for flag in flags: - reads[flag], alns[flag], alns_qc[flag] 
= 0, 0, 0 - # index for suffix removal, if suffix exist (> 0) - if self.suflen: - sufslice = slice(-self.suflen) - else: - sufslice = slice(None) - for rec in pysam.Samfile(self.bamfile, 'rb'): - # remove '/1' or '/2' suffixes, to collapse read pair counts - pass - # do countings on alns and reads directly - hname = hash(rec.qname[sufslice]) - # total, total splice - alns['total'] += 1 - alns_qc['total'] += func_qc(rec) - if hname not in reads_total: - reads_total.add(hname) - if func_splice(rec): - alns['totalSplice'] += 1 - alns_qc['totalSplice'] += func_qc(rec) - reads_total_splice.add(hname) - # unmapped - if func_nunmap(rec): - alns['unmapped'] += 1 - alns_qc['unmapped'] += func_qc(rec) - reads_unmap.add(hname) - else: - # mapped - if func_nmap(rec): - alns['mapped'] += 1 - alns_qc['mapped'] += func_qc(rec) - reads_map.add(hname) - # mapped pairs - if func_nmap_pair(rec): - alns['mappedPair'] += 1 - alns_qc['mappedPair'] += func_qc(rec) - reads_pair_map.add(hname) - # proper pairs, proper pairs splice - if func_nmap_pair_ok(rec): - alns['mappedPairProper'] += 1 - alns_qc['mappedPairProper'] += func_qc(rec) - reads_pair_proper.add(hname) - if func_splice(rec): - alns['splicePairProper'] += 1 - alns_qc['splicePairProper'] += func_qc(rec) - reads_pair_proper_splice.add(hname) - # mapped to different chromosomes - elif func_nmap_diffchr(rec): - alns['mappedDiffChr'] += 1 - alns_qc['mappedDiffChr'] += func_qc(rec) - reads_pair_diffchr.add(hname) - if rec.mapq >= 5: - alns['mappedDiffChrQ'] += 1 - alns_qc['mappedDiffChrQ'] += func_qc(rec) - reads_pair_diffchr_h.add(hname) - # singletons, singletons splice - elif func_nmap_sgltn(rec): - alns['singleton'] += 1 - alns_qc['singleton'] += func_qc(rec) - reads_sgltn.add(hname) - if func_splice(rec): - alns['spliceSingleton'] += 1 - alns_qc['spliceSingleton'] += func_qc(rec) - reads_sgltn_splice.add(hname) - - # set counts for reads - reads['total'] = len(reads_total) - reads['totalSplice'] = len(reads_total_splice) - reads['unmapped'] = len(reads_unmap) - reads['mapped'] = len(reads_map) - reads['mappedPair'] = len(reads_pair_map) - reads['mappedPairProper'] = len(reads_pair_proper) - reads['mappedDiffChr'] = len(reads_pair_diffchr) - reads['mappedDiffChrQ'] = len(reads_pair_diffchr_h) - reads['splicePairProper'] = len(reads_pair_proper_splice) - reads['singleton'] = len(reads_sgltn) - reads['spliceSingleton'] = len(reads_sgltn_splice) - - # free the memory - del reads_total, reads_map, reads_pair_map, reads_pair_proper, \ - reads_sgltn, reads_total_splice, reads_pair_proper_splice, \ - reads_sgltn_splice, reads_unmap, reads_pair_diffchr, \ - reads_pair_diffchr_h - - self.aln_counts, self.read_counts = self._adjust_counts(alns, reads) - self.aln_qc_counts = alns_qc - if self.validate: - assert self.validate_counts() - - def validate_counts(self): - """Checks whether all reads and alignment counts add up.""" - for ctype in ('read_counts', 'aln_counts'): - count = getattr(self, ctype) - ntotal = count['total'] - nmap = count['mapped'] - nunmap = count['unmapped'] - nmap_pair = count['mappedPair'] - nmap_pair_ok = count['mappedPairProper'] - nmap_pair_diffchr = count['mappedDiffChr'] - nmap_sgltn = count['singleton'] - nsplice_total = count['totalSplice'] - nsplice_pair_ok = count['splicePairProper'] - nsplice_sgltn = count['spliceSingleton'] - - assert nmap == nmap_pair + nmap_sgltn, \ - "Mismatch: %r == %r + %r" % (nmap, nmap_pair, nmap_sgltn) - assert nmap_pair_ok + nmap_pair_diffchr <= nmap_pair - assert nsplice_total <= ntotal - assert 
nsplice_pair_ok <= nmap_pair_ok - assert nsplice_sgltn <= nmap_sgltn - # total is always == unmapped + mapped pair + singletons - assert ntotal == nunmap + nmap_pair + nmap_sgltn, "Mismatch: " \ - "%r == %r + %r + %r" % (ntotal, nunmap, nmap_pair, - nmap_sgltn) - - return True - - def show(self): - """Prints the read and alignment counts in human-readable format to - stdout.""" - import pprint - pprint.pprint(dict(self.counts.items())) - - def write_json(self, out_file): - """Writes JSON to the output file.""" - with open(out_file, 'w') as outfile: - json.dump(self.counts, outfile, sort_keys=True, indent=4, - separators=(',', ': ')) - - def _format_counts(self): - """Formats read and alignment counts into nice-looking numbers.""" - counts = OrderedDict() - flags = self.flags - for ctype in ('read_counts', 'aln_counts', 'aln_qc_counts'): - count = getattr(self, ctype) - - ntotal = count['total'] - cont = {} - lvl = 'aln' if ctype == 'aln_counts' else 'read' - - if ctype == 'aln_qc_counts': - lvl = 'aln_qc' - else: - pct = lambda x: x * 100.0 / ntotal - if ctype == 'read_counts': - lvl = 'read' - elif ctype == 'aln_counts': - lvl = 'aln' - - for flag in flags: - # format all counts - cont[flag] = int_fmt(value=count[flag]) - # and add percentage values - if lvl != 'aln_qc': - if flag == 'total': - cont['totalPct'] = '100' - else: - cont[flag + 'Pct'] = float_fmt(value=pct(count[flag])) - - counts[lvl] = cont - - self.counts = counts - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('bamfile', help='Path to BAM file') - parser.add_argument('--id-sorted', action='store_true', - dest='id_sorted', help='Whether the BAM file is ID-sorted or not') - parser.add_argument('--suffix-len', type=int, dest='suffix_len', default=0, - help='Length of read pair suffix, if present') - parser.add_argument('-o', '--outfile', dest='out_file', type=str, - help='Path to output file') - parser.add_argument('-f', '--outfmt', dest='out_fmt', type=str, - choices=['json'], default='json', - help='Format of output file') - args = parser.parse_args() - - bamstat = BarnStat(args.bamfile, args.suffix_len, args.id_sorted) - - if args.out_file is None: - bamstat.show() - elif args.out_fmt == 'json': - bamstat.write_json(args.out_file) diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/gc_dist.py b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/gc_dist.py deleted file mode 100755 index 27f2d482867f0250b1ed8df9ff6f921de103191f..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/gc_dist.py +++ /dev/null @@ -1,173 +0,0 @@ -#!/usr/bin/env python -# -# Biopet is built on top of GATK Queue for building bioinformatic -# pipelines. It is mainly intended to support LUMC SHARK cluster which is running -# SGE. But other types of HPC that are supported by GATK Queue (such as PBS) -# should also be able to execute Biopet tools and pipelines. -# -# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center -# -# Contact us at: sasc@lumc.nl -# -# A dual licensing mode is applied. The source code within this project that are -# not part of GATK Queue is freely available for non-commercial use under an AGPL -# license; For commercial users or users who do not want to follow the AGPL -# license, please contact us to obtain a separate license. 
-# - -# -# gc_dist.py -# -# Given a path to a FASTQ file, create plots of GC percentages. -# -# Part of the Gentrap pipeline. -# -# (c) 2013 Wibowo Arindrarto [SASC - LUMC] - -import argparse -import locale -import os -import textwrap - -import numpy as np -# for headless matplotlib -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -import matplotlib.gridspec as gs - -from matplotlib.ticker import FuncFormatter, MultipleLocator - - -# set locale and formatter to do digit grouping -locale.setlocale(locale.LC_ALL, '') -groupdig = lambda x, pos: locale.format('%d', x, grouping=True) -major_formatter = FuncFormatter(groupdig) - - -def read_seq(fp): - """Given a FASTQ file, yield its sequences.""" - if isinstance(fp, basestring): - assert os.path.exists(fp) - fp = open(fp, 'r') - for counter, line in enumerate(fp): - if (counter + 3) % 4 == 0: - yield line.strip() - - -def drange(start, stop, step): - """Like `range` but for floats.""" - cur = start - while cur < stop: - yield cur - cur += step - - -def graph_gc(fname, outname='test.png'): - """Graphs the GC percentages of the given FASTQ file.""" - # count GC percentages per sequence - gcs = [] - for seq in read_seq(fname): - gc = sum(seq.lower().count(x) for x in ('g', 'c', 's')) - gcs.append(gc * 100.0 / len(seq)) - # grab mean and std dev for plotting - mean = np.mean(gcs) - stdev = np.std(gcs) - - # set the subplots in the figure; top is histogram, bottom is boxplot - fig = plt.figure(figsize=(8, 8)) - grids = gs.GridSpec(2, 1, height_ratios=[5, 1]) - - ax0 = plt.subplot(grids[0]) - # set title and adjust distance to plot - title = 'Distribution of GC Percentage' - t = plt.title('\n'.join([title] + textwrap.wrap('%r' % - os.path.basename(fname), 50)), fontsize=15) - t.set_y(1.05) - - # start counting bins for width measurement - total = len(gcs) - min_hist = min(gcs) - max_hist = max(gcs) - low = high = np.median(gcs) - step = 1 - widths = dict.fromkeys(range(20, 100, 20) + [99], (0, 0)) - - while low >= min_hist or high <= max_hist: - # cap the width marker at min or max gc values - if high > max_hist: high = max_hist - if low < min_hist: low = min_hist - - range_count = len([x for x in gcs if low < x < high]) - coverage = float(range_count) / total - - if coverage >= 0.2 and not any(widths[20]): - widths[20] = (low, high) - if coverage >= 0.4 and not any(widths[40]): - widths[40] = (low, high) - if coverage >= 0.6 and not any(widths[60]): - widths[60] = (low, high) - if coverage >= 0.8 and not any(widths[80]): - widths[80] = (low, high) - if coverage >= 0.99 and not any(widths[99]): - widths[99] = (low, high) - - low -= step - high += step - - # use the bin coordinates for partial background coloring - for hstart, hend in widths.values(): - plt.axvspan(hstart, hend, facecolor='#0099ff', linestyle='dotted', - linewidth=2.0, edgecolor='black', alpha=0.2) - - # plot the histogram - bins = [0] + list(drange(2.5, 100, 5)) + [100] - n, bins, patches = ax0.hist(gcs, bins=bins, facecolor='#009933', alpha=0.9) - # set Y-axis ticks label formatting - ax0.yaxis.set_major_formatter(major_formatter) - ax0.yaxis.grid(True) - plt.ylabel('Read count') - ax0.text(0.02, 0.9, 'Mean: %.2f\nStdev: %.2f' % (mean, stdev), - transform=ax0.transAxes, bbox=dict(facecolor='grey', alpha=0.5, - edgecolor='none'), size=14) - - # plot the boxplot - # shared X-axis, but invisible - ax1 = plt.subplot(grids[1], sharex=ax0) - plt.setp(ax1.get_xticklabels(), visible=False) - # and set the Y-axis to be invisible completely - 
ax1.axes.get_yaxis().set_visible(False) - plot = ax1.boxplot(gcs, vert=False, widths=0.6, sym='r.') - # line width and color settings for boxplot - plot['fliers'][0].set_color('#e62e00') - plot['fliers'][1].set_color('#e62e00') - plot['boxes'][0].set_color('black') - plot['boxes'][0].set_linewidth(1.2) - plot['medians'][0].set_linewidth(1.2) - plot['medians'][0].set_color('black') - plot['whiskers'][0].set_color('black') - plot['whiskers'][0].set_linewidth(1.2) - plot['whiskers'][1].set_color('black') - plot['whiskers'][1].set_linewidth(1.2) - plot['caps'][0].set_linewidth(1.2) - plot['caps'][1].set_linewidth(1.2) - # set X-axis label and ticks - ax0.xaxis.set_major_locator(MultipleLocator(10)) - ax0.xaxis.set_minor_locator(MultipleLocator(5)) - plt.xlabel('% GC') - - grids.update(hspace=0.075) - plt.savefig(outname, bbox_inches='tight') - - return gcs - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('input', help='input FASTQ file', default='reads.fq') - parser.add_argument('output', help='output image file', default='test.png') - - args = parser.parse_args() - - gcs = graph_gc(args.input, args.output) diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/hist2count.py b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/hist2count.py deleted file mode 100644 index e91519742c77c6d2bd7fdce66728ac927771118e..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/hist2count.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python -# -# Biopet is built on top of GATK Queue for building bioinformatic -# pipelines. It is mainly intended to support LUMC SHARK cluster which is running -# SGE. But other types of HPC that are supported by GATK Queue (such as PBS) -# should also be able to execute Biopet tools and pipelines. -# -# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center -# -# Contact us at: sasc@lumc.nl -# -# A dual licensing mode is applied. The source code within this project that are -# not part of GATK Queue is freely available for non-commercial use under an AGPL -# license; For commercial users or users who do not want to follow the AGPL -# license, please contact us to obtain a separate license. -# - -""" -Convert a histogram generated by coverageBed to counts per region. - - -A histogram file can be generated with the following command: -coverageBed -split -hist -abam sample.bam -b selection.bed > selection.hist - -The output consists of five columns: -- Chromosome name. -- Start position. -- End position. -- Number of nucleotides mapped to this region. -- Normalised expression for this region. - -If the -c option is used, additional columns can be added. -""" - -import argparse -import sys - -def hist2count(inputHandle, outputHandle, copy): - """ - Convert a coverageBed histogram to counts per region. - - @arg inputHandle: Open readable handle to a histogram file. - @type inputHandle: stream - @arg outputHandle: Open writable handle to the counts file. - @type outputHandle: stream - @arg copy: List of columns to copy to the output file. 
- @type copy: list[int] - """ - def __copy(): - copyList = "" - for i in copy: - copyList += "\t%s" % data[i] - return copyList - #__copy - - def __write(): - outputHandle.write("%s\t%i\t%i\t%i\t%f%s\n" % (chromosome, start, - end, count, float(count) / (end - start), copyList)) - - chromosome = "" - start = 0 - end = 0 - count = 0 - - for line in inputHandle.readlines(): - data = line.split() - - if not data[0] == "all": - start_temp = int(data[1]) - end_temp = int(data[2]) - - if data[0] != chromosome or start_temp != start or end_temp != end: - if chromosome: - __write() - chromosome = data[0] - start = start_temp - end = end_temp - count = 0 - copyList = __copy() - #if - count += int(data[-4]) * int(data[-3]) - #if - #for - __write() -#hist2count - -def main(): - """ - Main entry point. - """ - usage = __doc__.split("\n\n\n") - parser = argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description=usage[0], epilog=usage[1]) - parser.add_argument("-i", dest="input", type=argparse.FileType("r"), - default=sys.stdin, help="histogram input file (default=<stdin>)") - parser.add_argument("-o", dest="output", type=argparse.FileType("w"), - default=sys.stdout, help="file used as output (default=<stdout>)") - parser.add_argument("-c", dest="copy", type=int, nargs="+", default=[], - help="copy a column to the output file") - args = parser.parse_args() - - hist2count(args.input, args.output, args.copy) -#main - -if __name__ == '__main__': - main() diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/insert_dist.py b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/insert_dist.py deleted file mode 100755 index 332e7313409b84847ce899ebc3e51ba033fe78fc..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/insert_dist.py +++ /dev/null @@ -1,187 +0,0 @@ -#!/usr/bin/env python -# -# Biopet is built on top of GATK Queue for building bioinformatic -# pipelines. It is mainly intended to support LUMC SHARK cluster which is running -# SGE. But other types of HPC that are supported by GATK Queue (such as PBS) -# should also be able to execute Biopet tools and pipelines. -# -# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center -# -# Contact us at: sasc@lumc.nl -# -# A dual licensing mode is applied. The source code within this project that are -# not part of GATK Queue is freely available for non-commercial use under an AGPL -# license; For commercial users or users who do not want to follow the AGPL -# license, please contact us to obtain a separate license. -# - -# -# insert_dist.py -# -# Given the path to a text file containing Picard's CollectInsertSizeMetrics -# results, create a new graph. 
-# -# (c) 2013 Wibowo Arindrarto [SASC - LUMC] - -import argparse -import locale -import os -import re -import textwrap - -from collections import namedtuple -from functools import partial - -# for headless matplotlib -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt - -from matplotlib.ticker import FuncFormatter - -# set locale and formatter for axis ticks -locale.setlocale(locale.LC_ALL, '') -groupdig = lambda x, pos: locale.format('%d', x, grouping=True) -major_formatter = FuncFormatter(groupdig) -int_fmt = partial(locale.format, grouping=True, percent='%i') - - -def multi_annotate(ax, title, xy_arr=[], *args, **kwargs): - """Axis annotation function that targets multiple data points.""" - ans = [] - an = ax.annotate(title, xy_arr[0], *args, **kwargs) - ans.append(an) - d = {} - if 'xycoords' in kwargs: - d['xycoords'] = kwargs['xycoords'] - if 'arrowprops' in kwargs: - d['arrowprops'] = kwargs['arrowprops'] - for xy in xy_arr[1:]: - an = ax.annotate(title, xy, alpha=0.0, xytext=(0, 0), textcoords=an, **d) - ans.append(an) - - return ans - - -def parse_insert_sizes_histogram(fname): - """Given a filename or a file object of a Picard COllectInsertSizeMetrics - output, return the filename, the histogram column names, and the histogram - data.""" - - if isinstance(fname, basestring): - fp = open(fname, 'r') - else: - fp = fname - - line = fp.readline() - while True: - if not line: - raise ValueError("Unexpected end of file") - # try to get the original bam file name - elif 'net.sf.picard.analysis.CollectInsertSizeMetrics' in line: - input = re.search('INPUT=([^\s]*)', line).group(1) - bamname = os.path.basename(input) - elif line.startswith('## HISTOGRAM'): - break - line = fp.readline() - - # get column names - colnames = fp.readline().strip().split('\t') - - # iterate over the histogram data lines - # and fill up missing data with 0s - data = [] - counter = 0 - for line in fp: - if not line.strip(): - break - # bin number starts at 1 - tokens = [int(x) for x in line.split('\t')] - numcol = len(tokens) - 1 - if counter == tokens[0] - 1: - data.append(tokens[1:]) - counter += 1 - else: - while tokens[0] - counter != 1: - data.append([0] * numcol) - counter += 1 - data.append(tokens[1:]) - counter += 1 - - histogram = data - - return bamname, colnames, histogram - - -def graph_insert_sizes(fname, outname='test.png'): - """Given a Picard CollectInsertSizes text output filename, write graph(s) - for the histogram.""" - bamname, colnames, hist = parse_insert_sizes_histogram(fname) - - # map Picard's insert type (based on its column name) - # to our own name and color - InsType = namedtuple('InsType', ['label', 'color']) - design_map = { - # 5' --F--> <--R-- 5 - 'fr_count': InsType('inward', '#009933'), - # <--R-- 5' 5' --F--> - 'rf_count': InsType('outward', 'orange'), - # 5' --F--> 5' --F--> or <--R-- 5' <--R-- 5' - 'tandem_count': InsType('same directions', '#e62e00'), - } - - fig = plt.figure() - ax = plt.subplot(111) - for idx, col in enumerate(colnames[1:]): - pcd_name = col.split('.')[-1] - try: - label = design_map[pcd_name].label - color = design_map[pcd_name].color - except KeyError: - raise ValueError("Unexpected column name: %r" % col) - - data = [m[idx] for m in hist] - plt.bar(range(len(hist)), data, width=1, linewidth=0, color=color, - alpha=0.6, label=label) - - max_val = max(data) - max_val_size = data.index(max_val) - highest_points = [(idx, max_val) for idx, val in enumerate(data) if val == max_val] - x_adj = int(len(data) * 0.1) - y_adj = int(max_val 
* 0.1) - bbox_props = dict(boxstyle="round", fc="w", edgecolor='black', alpha=1.0) - multi_annotate(ax, - 'max count: {0}\nsize: {1} bp'.format(int_fmt(value=max_val), - ', '.join([str(x[0]) for x in highest_points])), - xy_arr=highest_points, - xytext=(max_val_size + x_adj, max_val + y_adj), - fontsize=9, bbox=bbox_props, - horizontalalignment='left', verticalalignment='center', - arrowprops=dict(color='black', shrink=0.1, width=0.5, headwidth=2.5, ),) - - # adjust ylim to account for annotation box - init_ylim = ax.get_ylim() - ax.set_ylim(0, init_ylim[1] * 1.08) - - # set title and its spacing - title = 'Insert Sizes Distribution' - t = plt.title('\n'.join([title] + textwrap.wrap('%r' % bamname, 50)), - fontsize=15) - t.set_y(1.05) - plt.legend() - plt.xlabel("Insert Size") - plt.ylabel("Alignment Count") - ax.yaxis.set_major_formatter(major_formatter) - ax.grid(True) - plt.savefig(outname, bbox_inches='tight') - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('input', help='input file') - parser.add_argument('output', help='output image file', default='test.png') - - args = parser.parse_args() - - graph_insert_sizes(args.input, args.output) diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/parse_cuffcmp.py b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/parse_cuffcmp.py deleted file mode 100755 index 81ff276046b29ccfa269a9eca8d9293edba58067..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/parse_cuffcmp.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python -# -# Biopet is built on top of GATK Queue for building bioinformatic -# pipelines. It is mainly intended to support LUMC SHARK cluster which is running -# SGE. But other types of HPC that are supported by GATK Queue (such as PBS) -# should also be able to execute Biopet tools and pipelines. -# -# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center -# -# Contact us at: sasc@lumc.nl -# -# A dual licensing mode is applied. The source code within this project that are -# not part of GATK Queue is freely available for non-commercial use under an AGPL -# license; For commercial users or users who do not want to follow the AGPL -# license, please contact us to obtain a separate license. -# - -# -# parse_cuffcmp.py -# -# Parses cuffcompare's cuffcmp.stats output into a JSON file. -# -# Part of the Gentrap pipeline. 
-# -# (c) 2013 by Wibowo Arindrarto [LUMC - SASC] - -import argparse -import json -import locale -import os -import re - - -# set locale to group digits -locale.setlocale(locale.LC_ALL, '') - -# precompiled regex patterns -_base_qr = '\s+(\d+)/(\d+)' -_base_table = '\s+(.*?)\s+(.*?)\s+(.*?)\s+(.*?)\s+' -# source transcripts gtf -_re_dataset = re.compile(r'Summary for dataset:\s+(.*?)\s+:') -# ref exons not covered by query exons, total ref exons -_re_rexons = re.compile(r'Missed exons:%s' % _base_qr) -# query exons not covered by ref exons, total query exons -_re_qexons = re.compile(r'Novel exons:%s' % _base_qr) -# ref introns not covered by query introns, total ref introns -_re_rintrons = re.compile(r'Missed introns:%s' % _base_qr) -# query introns not covered by ref introns, total query introns -_re_qintrons = re.compile(r'Novel introns:%s' % _base_qr) -# ref loci not covered by query loci, total ref loci -_re_rloci = re.compile(r'Missed loci:%s' % _base_qr) -# query loci not covered by ref loci, total query loci -_re_qloci = re.compile(r'Novel loci:%s' % _base_qr) -# base level metrics -_re_base = re.compile(r'Base level:%s' % _base_table) -# exon level metrics -_re_exon = re.compile(r'Exon level:%s' % _base_table) -# intron level metrics -_re_intron = re.compile(r'Intron level:%s' % _base_table) -# intron chain level metrics -_re_intron_chain = re.compile(r'Intron chain level:%s' % _base_table) -# transcript level metrics -_re_transcript = re.compile(r'Transcript level:%s' % _base_table) -# locus level metrics -_re_locus = re.compile(r'Locus level:%s' % _base_table) - - -def _fallback_search(re_pattern, string, match_type, fallback_str, group, - replacement=None): - """Function to handle cases when the regex match is of a different type, - e.g. 
'-' instead of an integer.""" - match = re.search(re_pattern, string).group(group) - - if match == fallback_str: - return replacement - else: - return match_type(match) - - -def parse_cuffcmp_stats(stat_file): - """Parses the statistics in the given cuffcmp.stats file into a - dictionary.""" - assert os.path.exists(stat_file), "File %r not found" % stat_file - - with open(stat_file, 'r') as source: - # not expecting a huge output, we can store everything in memory - stat_str = source.read() - - stats = { - 'dataSet': re.search(_re_dataset, stat_str).group(1), - 'refExonsNotInQuery': int(re.search(_re_rexons, stat_str).group(1)), - 'refExonsTotal': int(re.search(_re_rexons, stat_str).group(2)), - 'queryExonsNotInRef': int(re.search(_re_qexons, stat_str).group(1)), - 'queryExonsTotal': int(re.search(_re_qexons, stat_str).group(2)), - - 'refIntronsNotInQuery': int(re.search(_re_rintrons, stat_str).group(1)), - 'refIntronsTotal': int(re.search(_re_rintrons, stat_str).group(2)), - 'queryIntronsNotInRef': int(re.search(_re_qintrons, stat_str).group(1)), - 'queryIntronsTotal': int(re.search(_re_qintrons, stat_str).group(2)), - - 'refLociNotInQuery': int(re.search(_re_rloci, stat_str).group(1)), - 'refLociTotal': int(re.search(_re_rloci, stat_str).group(2)), - 'queryLociNotInRef': int(re.search(_re_qloci, stat_str).group(1)), - 'queryLociTotal': int(re.search(_re_qloci, stat_str).group(2)), - - 'baseLevelSn': _fallback_search(_re_base, stat_str, float, '-', 1), - 'baseLevelSp': _fallback_search(_re_base, stat_str, float, '-', 2), - 'baseLevelFSn': _fallback_search(_re_base, stat_str, float, '-', 3), - 'baseLevelFSp': _fallback_search(_re_base, stat_str, float, '-', 4), - - 'exonLevelSn': _fallback_search(_re_exon, stat_str, float, '-', 1), - 'exonLevelSp': _fallback_search(_re_exon, stat_str, float, '-', 2), - 'exonLevelFSn': _fallback_search(_re_exon, stat_str, float, '-', 3), - 'exonLevelFSp': _fallback_search(_re_exon, stat_str, float, '-', 4), - - 'intronLevelSn': _fallback_search(_re_intron, stat_str, float, '-', 1), - 'intronLevelSp': _fallback_search(_re_intron, stat_str, float, '-', 2), - 'intronLevelFSn': _fallback_search(_re_intron, stat_str, float, '-', 3), - 'intronLevelFSp': _fallback_search(_re_intron, stat_str, float, '-', 4), - - 'intronChainLevelSn': _fallback_search(_re_intron_chain, stat_str, float, '-', 1), - 'intronChainLevelSp': _fallback_search(_re_intron_chain, stat_str, float, '-', 2), - 'intronChainLevelFSn': _fallback_search(_re_intron_chain, stat_str, float, '-', 3), - 'intronChainLevelFSp': _fallback_search(_re_intron_chain, stat_str, float, '-', 4), - - 'transcriptLevelSn': _fallback_search(_re_transcript, stat_str, float, '-', 1), - 'transcriptLevelSp': _fallback_search(_re_transcript, stat_str, float, '-', 2), - 'transcriptLevelFSn': _fallback_search(_re_transcript, stat_str, float, '-', 3), - 'transcriptLevelFSp': _fallback_search(_re_transcript, stat_str, float, '-', 4), - - 'locusLevelSn': _fallback_search(_re_locus, stat_str, float, '-', 1), - 'locusLevelSp': _fallback_search(_re_locus, stat_str, float, '-', 2), - 'locusLevelFSn': _fallback_search(_re_locus, stat_str, float, '-', 3), - 'locusLevelFSp': _fallback_search(_re_locus, stat_str, float, '-', 4), - } - - return stats - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('input', type=str, - help='Path to input cuffcmp.stats file') - parser.add_argument('-o', '--output-json', dest='output', type=str, - help='Path to JSON output file', default=None) - args = 
parser.parse_args() - - stats = parse_cuffcmp_stats(args.input) - - if args.output is not None: - with open(args.output, 'w') as jsonfile: - json.dump(stats, jsonfile, sort_keys=True, indent=4, - separators=(',', ': ')) - else: - print json.dumps(stats, sort_keys=True, indent=4, - separators=(',', ': ')) diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/pdf_report.py b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/pdf_report.py deleted file mode 100755 index b0c2b2e82e431d6340c74575ad07c0e48a19b623..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/pdf_report.py +++ /dev/null @@ -1,586 +0,0 @@ -#!/usr/bin/env python -# -# Biopet is built on top of GATK Queue for building bioinformatic -# pipelines. It is mainly intended to support LUMC SHARK cluster which is running -# SGE. But other types of HPC that are supported by GATK Queue (such as PBS) -# should also be able to execute Biopet tools and pipelines. -# -# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center -# -# Contact us at: sasc@lumc.nl -# -# A dual licensing mode is applied. The source code within this project that are -# not part of GATK Queue is freely available for non-commercial use under an AGPL -# license; For commercial users or users who do not want to follow the AGPL -# license, please contact us to obtain a separate license. -# - -from __future__ import print_function - -import argparse -import json -import locale -import os -import re -import sys -from os import path - -from jinja2 import Environment, FileSystemLoader - - -# set locale for digit grouping -locale.setlocale(locale.LC_ALL, "") - - -class FastQCModule(object): - - """Class representing a FastQC analysis module.""" - - def __init__(self, raw_lines, end_mark='>>END_MODULE'): - """ - - :param raw_lines: list of lines in the module - :type raw_lines: list of str - :param end_mark: mark of the end of the module - :type end_mark: str - - """ - self.raw_lines = raw_lines - self.end_mark = end_mark - self._status = None - self._name = None - self._data = self._parse() - - def __repr__(self): - return '%s(%s)' % (self.__class__.__name__, - '[%r, ...]' % self.raw_lines[0]) - - def __str__(self): - return ''.join(self.raw_lines) - - @property - def name(self): - """Name of the module.""" - return self._name - - @property - def columns(self): - """Columns in the module.""" - return self._columns - - @property - def data(self): - """FastQC data.""" - return self._data - - @property - def status(self): - """FastQC run status.""" - return self._status - - def _parse(self): - """Common parser for a FastQC module.""" - # check that the last line is a proper end mark - assert self.raw_lines[-1].startswith(self.end_mark) - # parse name and status from first line - tokens = self.raw_lines[0].strip().split('\t') - name = tokens[0][2:] - self._name = name - status = tokens[-1] - assert status in ('pass', 'fail', 'warn'), "Unknown module status: %r" \ - % status - self._status = status - # and column names from second line - columns = self.raw_lines[1][1:].strip().split('\t') - self._columns = columns - # the rest of the lines except the last one - data = [] - for line in self.raw_lines[2:-1]: - cols = line.strip().split('\t') - data.append(cols) - - # optional processing for different modules - if self.name == 'Basic Statistics': - data = {k: v for k, v in data} - - return data - - -class 
FastQC(object): - - """Class representing results from a FastQC run.""" - - # module name -- attribute name mapping - _mod_map = { - '>>Basic Statistics': 'basic_statistics', - '>>Per base sequence quality': 'per_base_sequence_quality', - '>>Per sequence quality scores': 'per_sequence_quality_scores', - '>>Per base sequence content': 'per_base_sequence_content', - '>>Per base GC content': 'per_base_gc_content', - '>>Per sequence GC content': 'per_sequence_gc_content', - '>>Per base N content': 'per_base_n_content', - '>>Sequence Length Distribution': 'sequence_length_distribution', - '>>Sequence Duplication Levels': 'sequence_duplication_levels', - '>>Overrepresented sequences': 'overrepresented_sequences', - '>>Kmer content': 'kmer_content', - } - - def __init__(self, fname): - """ - - :param fname: path to the FastQC data file - :type fname: str - - """ - # store the file name - self.fname = fname - self._modules = {} - - with open(fname, "r") as fp: - line = fp.readline() - while True: - - tokens = line.strip().split('\t') - # break on EOF - if not line: - break - # parse version - elif line.startswith('##FastQC'): - self.version = line.strip().split()[1] - # parse individual modules - elif tokens[0] in self._mod_map: - attr = self._mod_map[tokens[0]] - raw_lines = self._read_module(fp, line, tokens[0]) - self._modules[attr] = FastQCModule(raw_lines) - - line = fp.readline() - - def __repr__(self): - return '%s(%r)' % (self.__class__.__name__, self.fname) - - def _filter_by_status(self, status): - """Filter out modules whose status is different from the given status. - - :param status: module status - :type status: str - :returns: a list of FastQC module names with the given status - :rtype: list of str - - """ - return [x.name for x in self._modules.values() if x.status == status] - - def _read_module(self, fp, line, start_mark): - """Returns a list of lines in a module. 
- - :param fp: open file handle pointing to the FastQC data file - :type fp: file handle - :param line: first line in the module - :type line: str - :param start_mark: string denoting start of the module - :type start_mark: str - :returns: a list of lines in the module - :rtype: list of str - - """ - raw = [line] - while not line.startswith('>>END_MODULE'): - line = fp.readline() - raw.append(line) - - if not line: - raise ValueError("Unexpected end of file in module %r" % line) - - return raw - - @property - def modules(self): - """All modules in the FastQC results.""" - return self._modules - - @property - def passes(self): - """All module names that pass QC.""" - return self._filter_by_status('pass') - - @property - def passes_num(self): - """How many modules have pass status.""" - return len(self.passes) - - @property - def warns(self): - """All module names with warning status.""" - return self._filter_by_status('warn') - - @property - def warns_num(self): - """How many modules have warn status.""" - return len(self.warns) - - @property - def fails(self): - """All names of failed modules.""" - return self._filter_by_status('fail') - - @property - def fails_num(self): - """How many modules failed.""" - return len(self.fails) - - @property - def basic_statistics(self): - """Basic statistics module results.""" - return self._modules['basic_statistics'] - - @property - def per_base_sequence_quality(self): - """Per base sequence quality module results.""" - return self._modules['per_base_sequence_quality'] - - @property - def per_sequence_quality_scores(self): - """Per sequence quality scores module results.""" - return self._modules['per_sequence_quality_scores'] - - @property - def per_base_sequence_content(self): - """Per base sequence content module results.""" - return self._modules['per_base_sequence_content'] - - @property - def per_base_gc_content(self): - """Per base GC content module results.""" - return self._modules['per_base_gc_content'] - - @property - def per_sequence_gc_content(self): - """Per sequence GC content module results.""" - return self._modules['per_sequence_gc_content'] - - @property - def per_base_n_content(self): - """Per base N content module results.""" - return self._modules['per_base_n_content'] - - @property - def sequence_length_distribution(self): - """Per sequence length distribution module results.""" - return self._modules['sequence_length_distribution'] - - @property - def sequence_duplication_levels(self): - """Sequence duplication module results.""" - return self._modules['sequence_duplication_levels'] - - @property - def overrepresented_sequences(self): - """Overrepresented sequences module results.""" - return self._modules['overrepresented_sequences'] - - @property - def kmer_content(self): - """Kmer content module results.""" - return self._modules['kmer_content'] - - -# HACK: remove this and use jinja2 only for templating -class LongTable(object): - - """Class representing a longtable in LaTeX.""" - - def __init__(self, caption, label, header, aln, colnum): - self.lines = [ - "\\begin{center}", - "\\captionof{table}{%s}" % caption, - "\\label{%s}" % label, - "\\begin{longtable}{%s}" % aln, - "\\hline", - "%s" % header, - "\\hline \\hline", - "\\endhead", - "\\hline \\multicolumn{%i}{c}{\\textit{Continued on next page}}\\\\" % \ - colnum, - "\\hline", - "\\endfoot", - "\\hline", - "\\endlastfoot", - ] - - def __str__(self): - return "\n".join(self.lines) - - def add_row(self, row): - self.lines.append(row) - - def end(self): - 
self.lines.extend(["\\end{longtable}", "\\end{center}", - "\\addtocounter{table}{-1}"]) - - -# filter functions for the jinja environment -def nice_int(num, default="None"): - if num is None: - return default - try: - return locale.format("%i", int(num), grouping=True) - except: - return default - - -def nice_flt(num, default="None"): - if num is None: - return default - try: - return locale.format("%.2f", float(num), grouping=True) - except: - return default - - -def float2nice_pct(num, default="None"): - if num is None: - return default - try: - return locale.format("%.2f", float(num) * 100.0, grouping=True) - except: - return default - - -# and some handy functions -def natural_sort(inlist): - key = lambda x: [int(a) if a.isdigit() else a.lower() for a in - re.split("([0-9]+)", x)] - inlist.sort(key=key) - return inlist - - -def write_template(run, template_file, logo_file): - - template_file = path.abspath(path.realpath(template_file)) - template_dir = path.dirname(template_file) - # spawn the template environment - env = Environment(loader=FileSystemLoader(template_dir)) - - # change delimiters since LaTeX may use "{{", "{%", or "{#" - env.block_start_string = "((*" - env.block_end_string = "*))" - env.variable_start_string = "(((" - env.variable_end_string = ")))" - env.comment_start_string = "((=" - env.comment_end_string = "=))" - - # trim all block-related whitespaces - env.trim_blocks = True - env.lstrip_blocks = True - - # put in our filter functions - env.filters["nice_int"] = nice_int - env.filters["nice_flt"] = nice_flt - env.filters["float2nice_pct"] = float2nice_pct - env.filters["basename"] = path.basename - - # write tex template for pdflatex - jinja_template = env.get_template(path.basename(template_file)) - run.logo = logo_file - render_vars = { - "run": run, - } - rendered = jinja_template.render(**render_vars) - - print(rendered, file=sys.stdout) - - -class GentrapLib(object): - - def __init__(self, run, sample, name, summary): - assert isinstance(run, GentrapRun) - assert isinstance(sample, GentrapSample) - self.run = run - self.sample = sample - self.name = name - self._raw = summary - # flexiprep settings - self.flexiprep = summary.get("flexiprep", {}) - self.flexiprep_files = summary.get("flexiprep", {}).get("files", {}).get("pipeline", {}) - self.clipping = not self.flexiprep["settings"]["skip_clip"] - self.trimming = not self.flexiprep["settings"]["skip_trim"] - self.is_paired_end = self.flexiprep["settings"]["paired"] - if "fastqc_R1" in self.flexiprep["files"]: - self.fastqc_r1_files = self.flexiprep["files"]["fastqc_R1"] - self.fastqc_r1 = FastQC(self.fastqc_r1_files["fastqc_data"]["path"]) - if "fastqc_R2" in self.flexiprep["files"]: - self.fastqc_r2_files = self.flexiprep["files"]["fastqc_R2"] - self.fastqc_r2 = FastQC(self.fastqc_r2_files["fastqc_data"]["path"]) - if "fastqc_R1_qc" in self.flexiprep["files"]: - self.fastqc_r1_qc_files = self.flexiprep["files"]["fastqc_R1_qc"] - self.fastqc_r1_qc = FastQC(self.fastqc_r1_qc_files["fastqc_data"]["path"]) - if "fastqc_R2_qc" in self.flexiprep["files"]: - self.fastqc_r2_qc_files = self.flexiprep["files"]["fastqc_R2_qc"] - self.fastqc_r2_qc = FastQC(self.fastqc_r2_qc_files["fastqc_data"]["path"]) - # mapping metrics settings - self.aln_metrics = summary.get("bammetrics", {}).get("stats", {}).get("CollectAlignmentSummaryMetrics", {}) - for k, v in self.aln_metrics.items(): - self.aln_metrics[k] = {a.lower(): b for a, b in v.items()} - # insert size metrics files - self.inserts_metrics_files = \ - 
summary.get("bammetrics", {}).get("files", {}).get("multi_metrics", {}) - # rna metrics files and stats - self.rna_metrics_files = summary.get("bammetrics", {}).get("files", {}).get("rna", {}) - _rmetrics = summary.get("bammetrics", {}).get("stats", {}).get("rna", {}) - if _rmetrics: - if "metrics" in _rmetrics: - _rmetrics = _rmetrics["metrics"] - if _rmetrics: - _rmetrics = {k.lower(): v for k, v in _rmetrics.items() } - self.rna_metrics = _rmetrics - pf_bases = float(_rmetrics["pf_bases"]) - exonic_bases = int(_rmetrics.get("coding_bases", 0)) + int(_rmetrics.get("utr_bases", 0)) - # picard uses pct_ but it's actually ratio ~ we follow their convention - pct_exonic_bases_all = exonic_bases / float(_rmetrics["pf_bases"]) - pct_exonic_bases = exonic_bases / float(_rmetrics.get("pf_aligned_bases", 0)) - self.rna_metrics.update({ - "exonic_bases": exonic_bases, - "pct_exonic_bases_all": pct_exonic_bases_all, - "pct_exonic_bases": pct_exonic_bases, - "pct_aligned_bases": 1.0, - "pct_aligned_bases_all": float(_rmetrics.get("pf_aligned_bases", 0.0)) / pf_bases, - "pct_coding_bases_all": float(_rmetrics.get("coding_bases", 0.0)) / pf_bases, - "pct_utr_bases_all": float(_rmetrics.get("utr_bases", 0.0)) / pf_bases, - "pct_intronic_bases_all": float(_rmetrics.get("intronic_bases", 0.0)) / pf_bases, - "pct_intergenic_bases_all": float(_rmetrics.get("intergenic_bases", 0.0)) / pf_bases, - }) - if _rmetrics.get("ribosomal_bases", "") != "": - self.rna_metrics["pct_ribosomal_bases_all"] = float(_rmetrics.get("pf_ribosomal_bases", 0.0)) / pf_bases - - def __repr__(self): - return "{0}(sample=\"{1}\", lib=\"{2}\")".format( - self.__class__.__name__, self.sample.name, self.name) - - -class GentrapSample(object): - - def __init__(self, run, name, summary): - assert isinstance(run, GentrapRun) - self.run = run - self.name = name - self._raw = summary - self.is_paired_end = summary.get("gentrap", {}).get("stats", {}).get("pipeline", {})["all_paired"] - # mapping metrics settings - self.aln_metrics = summary.get("bammetrics", {}).get("stats", {}).get("CollectAlignmentSummaryMetrics", {}) - for k, v in self.aln_metrics.items(): - self.aln_metrics[k] = {a.lower(): b for a, b in v.items()} - # insert size metrics files - self.inserts_metrics_files = \ - summary.get("bammetrics", {}).get("files", {}).get("multi_metrics", {}) - # rna metrics files and stats - self.rna_metrics_files = summary.get("bammetrics", {}).get("files", {}).get("rna", {}) - _rmetrics = summary.get("bammetrics", {}).get("stats", {}).get("rna", {}) - if _rmetrics: - if "metrics" in _rmetrics: - _rmetrics = _rmetrics["metrics"] - if _rmetrics: - _rmetrics = {k.lower(): v for k, v in _rmetrics.items() } - self.rna_metrics = _rmetrics - pf_bases = float(_rmetrics["pf_bases"]) - exonic_bases = int(_rmetrics.get("coding_bases", 0)) + int(_rmetrics.get("utr_bases", 0)) - # picard uses pct_ but it's actually ratio ~ we follow their convention - pct_exonic_bases_all = exonic_bases / float(_rmetrics["pf_bases"]) - pct_exonic_bases = exonic_bases / float(_rmetrics.get("pf_aligned_bases", 0)) - self.rna_metrics.update({ - "exonic_bases": exonic_bases, - "pct_exonic_bases_all": pct_exonic_bases_all, - "pct_exonic_bases": pct_exonic_bases, - "pct_aligned_bases": 1.0, - "pct_aligned_bases_all": float(_rmetrics.get("pf_aligned_bases", 0.0)) / pf_bases, - "pct_coding_bases_all": float(_rmetrics.get("coding_bases", 0.0)) / pf_bases, - "pct_utr_bases_all": float(_rmetrics.get("utr_bases", 0.0)) / pf_bases, - "pct_intronic_bases_all": 
float(_rmetrics.get("intronic_bases", 0.0)) / pf_bases, - "pct_intergenic_bases_all": float(_rmetrics.get("intergenic_bases", 0.0)) / pf_bases, - }) - if _rmetrics.get("ribosomal_bases", "") != "": - self.rna_metrics["pct_ribosomal_bases_all"] = float(_rmetrics.get("pf_ribosomal_bases", 0.0)) / pf_bases - - self.lib_names = sorted(summary["libraries"].keys()) - self.libs = \ - {l: GentrapLib(self.run, self, l, summary["libraries"][l]) \ - for l in self.lib_names} - - def __repr__(self): - return "{0}(\"{1}\")".format(self.__class__.__name__, self.name) - - -class GentrapRun(object): - - def __init__(self, summary_file): - - with open(summary_file, "r") as src: - summary = json.load(src) - - self._raw = summary - self.summary_file = summary_file - - self.files = summary["gentrap"].get("files", {}).get("pipeline", {}) - self.settings = summary["gentrap"]["settings"] - self.version = self.settings.get("version", "unknown") - # list containing all exes - self.all_executables = summary["gentrap"]["executables"] - # list containing exes we want to display - executables = [ - ("cutadapt", "adapter clipping"), - ("sickle", "base quality trimming"), - ("fastqc", "sequence metrics collection"), - ("gsnap", "alignment"), - ("tophat", "alignment"), - ("star", "alignment"), - ("htseqcount", "fragment counting"), - ] - self.executables = {} - for k, desc in executables: - in_summary = self.all_executables.get(k) - if in_summary is not None: - self.executables[k] = in_summary - self.executables[k]["desc"] = desc - # since we get the version from the tools we use - if self.all_executables.get("collectalignmentsummarymetrics") is not None: - self.executables["picard"] = self.all_executables["collectalignmentsummarymetrics"] - self.executables["picard"]["desc"] = "alignment_metrics_collection" - # None means we are using the Queue built in Picard - if self.executables["picard"].get("version") is None: - self.executables["picard"]["version"] = "built-in" - # since we get the version from the sub tools we use - if self.all_executables.get("samtoolsview") is not None: - self.executables["samtools"] = self.all_executables["samtoolsview"] - self.executables["samtools"]["desc"] = "various post-alignment processing" - - self.sample_names = sorted(summary["samples"].keys()) - self.samples = \ - {s: GentrapSample(self, s, summary["samples"][s]) \ - for s in self.sample_names} - self.libs = [] - for sample in self.samples.values(): - self.libs.extend(sample.libs.values()) - if all([s.is_paired_end for s in self.samples.values()]): - self.lib_type = "all paired end" - elif all([not s.is_paired_end for s in self.samples.values()]): - self.lib_type = "all single end" - else: - self.lib_type = "mixed (single end and paired end)" - - def __repr__(self): - return "{0}(\"{1}\")".format(self.__class__.__name__, - self.summary_file) - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - parser.add_argument("summary_file", type=str, - help="Path to Gentrap summary file") - parser.add_argument("template_file", type=str, - help="Path to main template file") - parser.add_argument("logo_file", type=str, - help="Path to main logo file") - args = parser.parse_args() - - run = GentrapRun(args.summary_file) - write_template(run, args.template_file, args.logo_file) - diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/plot_pca.R b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/plot_pca.R deleted file mode 100755 index 
419ec5b2cb30baf2cbde032b7e5ba11abba6c305..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/plot_pca.R +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env Rscript -# -# Script for plotting PCA plots for the Gentrap pipeline - - -# General function to install a package if it does not exist -# Otherwise, it only loads the package -usePackage <- function(p) { - r <- getOption("repos") - r["CRAN"] <- "http://cran.us.r-project.org" - options(repos = r) - rm(r) - if (!is.element(p, installed.packages()[,1])) - install.packages(p, dep = TRUE) - require(p, character.only = TRUE) -} - -usePackage("getopt") -usePackage("edgeR") -usePackage("ggplot2") -usePackage("gplots") -usePackage("grid") -usePackage("jsonlite") -usePackage("reshape2") -usePackage("MASS") -usePackage("RColorBrewer") - -# create spec for arg parsing -spec <- matrix(c( - # input table (merge of all samples) - 'input-table', 'I', 1, 'character', - # output plot file - 'output-plot', 'O', 1, 'character', - # perform TMM-normalization (only if we are dealing with count data) - 'tmm-normalize', 'T', 0, 'logical', - # help - 'help', 'H', 0, 'logical' -), byrow=TRUE, ncol=4) -opt <- getopt(spec) - -# print help if requested -if (!is.null(opt[['help']])) { - cat(getopt(spec, usage=TRUE)) - q(status=1) -} diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/rna_metrics.py b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/rna_metrics.py deleted file mode 100755 index 862bf7e0134a0b403cc925d69ce21f8d713ccedd..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/rna_metrics.py +++ /dev/null @@ -1,517 +0,0 @@ -#!/usr/bin/env python2 -# -# Biopet is built on top of GATK Queue for building bioinformatic -# pipelines. It is mainly intended to support LUMC SHARK cluster which is running -# SGE. But other types of HPC that are supported by GATK Queue (such as PBS) -# should also be able to execute Biopet tools and pipelines. -# -# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center -# -# Contact us at: sasc@lumc.nl -# -# A dual licensing mode is applied. The source code within this project that are -# not part of GATK Queue is freely available for non-commercial use under an AGPL -# license; For commercial users or users who do not want to follow the AGPL -# license, please contact us to obtain a separate license.
-# - - -# rna_metrics.py -# -# Given a sorted, indexed BAM file from an RNA seq experiment, -# output the annotation metrics using Picard CollectRnaSeqMetrics per chromosome - -import argparse -import json -import functools -import locale -import os -import subprocess -import threading -import time -import tempfile -import warnings -import Queue - -# valid column names -# from http://picard.sourceforge.net/picard-metric-definitions.shtml#RnaSeqMetrics -COL_NAMES = { - 'PF_BASES': 'pfBases', - 'PF_ALIGNED_BASES': 'pfAlignedBases', - 'RIBOSOMAL_BASES': 'ribosomalBases', - 'CODING_BASES': 'codingBases', - 'UTR_BASES': 'utrBases', - 'INTRONIC_BASES': 'intronicBases', - 'INTERGENIC_BASES': 'intergenicBases', - 'IGNORED_READS': 'ignoredReads', - 'CORRECT_STRAND_READS': 'correctStrandReads', - 'INCORRECT_STRAND_READS': 'incorrectStrandReads', - 'PCT_RIBOSOMAL_BASES': 'pctRibosomalBases', - 'PCT_CODING_BASES': 'pctCodingBases', - 'PCT_UTR_BASES': 'pctUtrBases', - 'PCT_INTRONIC_BASES': 'pctIntronicBases', - 'PCT_INTERGENIC_BASES': 'pctIntergenicBases', - 'PCT_MRNA_BASES': 'pctMrnaBases', - 'PCT_USABLE_BASES': 'pctUsableBases', - 'PCT_CORRECT_STRAND_READS': 'pctCorrectStrandReads', - 'MEDIAN_CV_COVERAGE': 'medianCvCoverage', - 'MEDIAN_5PRIME_BIAS': 'median5PrimeBias', - 'MEDIAN_3PRIME_BIAS': 'median3PrimeBias', - 'MEDIAN_5PRIME_TO_3PRIME_BIAS': 'median5PrimeTo3PrimeBias', -} -# executables, default to ones in PATH -EXE_SAMTOOLS = 'samtools' -EXE_JAVA = 'java' - -# set locale to group digits -locale.setlocale(locale.LC_ALL, '') -int_fmt = functools.partial(locale.format, grouping=True, percent='%i') -float_fmt = functools.partial(locale.format, grouping=True, percent='%.2f') - - -class MetricsTracker(object): - - """Class to track metrics file output.""" - - def __init__(self, main_bams, chrs): - self.lock = threading.Lock() - self.chrs = chrs - self.files = {} - for rtype, bam in main_bams.items(): - self.files[bam] = { - 'chrs': dict.fromkeys(chrs), - 'strand': rtype, - } - - def add_stat_file(self, main_bam, chr, chr_stat): - self.lock.acquire() - self.files[main_bam]['chrs'][chr] = chr_stat - self.lock.release() - - def check_files(self): - # only for strand-specific rna-seq - for bam, data in self.files.items(): - for chr, path in data['chrs'].items(): - assert path is not None, "Missing statistics file for {0}, " \ - "chromosome {1}".format(bam, chr) - - -class Worker(threading.Thread): - - """Class representing worker to execute jobs.""" - - def __init__(self, queue, group=None, target=None, name=None, args=(), - kwargs={}): - threading.Thread.__init__(self, group, target, name, args, kwargs) - self.queue = queue - self.setDaemon(True) - self.start() - - def run(self): - while True: - func, args, kwargs = self.queue.get() - func(*args, **kwargs) - self.queue.task_done() - - -class ThreadPool(object): - - """Class representing thread pool to execute.""" - - def __init__(self, num_threads): - self.queue = Queue.Queue() - for _ in range(num_threads): - Worker(self.queue) - - def add_task(self, func, *args, **kwargs): - self.queue.put((func, args, kwargs)) - - def wait_completion(self): - self.queue.join() - - -def picard_metrics_worker(in_bam, chr, tracker, annot, jar, - samtools_exe, java_exe): - """Worker for collecting RNA-seq metrics.""" - # check if index exists - assert os.path.exists(in_bam + '.bai') - # create output directory, contains all chr stats - out_dir = os.path.splitext(in_bam)[0] + '.rna_metrics' - # create output directory - try: - os.makedirs(out_dir) - except OSError: - if 
not os.path.exists(out_dir): - raise - # if chr is None, do stat on all regions - if chr is None: - chr = 'ALL' - out_stat = os.path.join(out_dir, chr + '.rna_metrics.txt') - # split BAM file per chr, write to tmp file - if chr != 'ALL': - bam = tempfile.NamedTemporaryFile(prefix='tmp_rna_metrics_', delete=True) - name = bam.name - tokens = [samtools_exe, 'view', '-bh', '-o', bam.name, in_bam, chr] - proc = subprocess.Popen(tokens, stdout=bam) - while proc.poll() is None: - time.sleep(1) - else: - name = in_bam - picard_toks = [java_exe, '-jar', jar] - for key, value in os.environ.items(): - if key.startswith('OPT_PICARD_COLLECTRNASEQMETRICS_'): - # input, output, and annotation are handled separately - if key.endswith('INPUT') or key.endswith('OUTPUT') or \ - key.endswith('REF_FLAT'): - continue - if value: - picard_toks.append('%s=%s' % - (key.replace('OPT_PICARD_COLLECTRNASEQMETRICS_', ''), value)) - - picard_toks += ['REF_FLAT={0}'.format(annot), - 'STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND', - 'I={0}'.format(name), 'O={0}'.format(out_stat)] - picard = subprocess.Popen(picard_toks) - while picard.poll() is None: - time.sleep(1) - assert os.path.exists(out_stat) - if chr != 'ALL': - bam.close() - tracker.add_stat_file(in_bam, chr, out_stat) - - -def samtools_reads_per_region_count(bam_files, chrs, samtools_exe): - """Counts reads per chromosome using samtools (simple count of mapped - reads per region).""" - tokens = [samtools_exe, 'view', '-c', '-F', '0x4'] - keys = ['fwd', 'rev', 'mix'] - all_dict = dict.fromkeys(keys) - for rtype, bam in bam_files.items(): - assert rtype in keys, "Unknown key: {0}".format(rtype) - aggr_dict = {} - for chr in chrs: - if chr == 'ALL': - continue - proc = subprocess.Popen(tokens + [bam, chr], stdout=subprocess.PIPE) - count = int(proc.stdout.read()) - aggr_dict[chr] = { - 'metrics': {'countMapped': count} - } - name = os.path.basename(os.path.splitext(bam)[0]) - all_dict[rtype] = {} - all_dict[rtype]['bamFile'] = name - all_dict[rtype]['allMetrics'] = aggr_dict - - return all_dict - - -def picard_reads_per_region_count(bam_files, chrs, annot, jar, samtools_exe, java_exe, threads=1): - """Counts reads per chromosome using Picard and annotation files.""" - assert os.path.exists(annot), "Annotation file {0} not found".format(annot) - # only analyze sense and antisense reads - bam_files = {'fwd': bam_files['fwd'], 'rev': bam_files['rev']} - # create tracker for metric files - metrics_tracker = MetricsTracker(bam_files, chrs) - # create main task pool - metrics_pool = ThreadPool(threads) - # add tasks to the pool - for bam in bam_files.values(): - for chr in chrs: - metrics_pool.add_task(picard_metrics_worker, in_bam=bam, chr=chr, - tracker=metrics_tracker, annot=annot, jar=jar, - samtools_exe=samtools_exe, java_exe=java_exe) - metrics_pool.wait_completion() - # checks whether all required stat files are present - metrics_tracker.check_files() - return aggregate_metrics(metrics_tracker, chrs) - - -def prep_bam_file(bams, strand_spec, samtools_exe): - """Index input BAM files and return a dictionary of BAM files to process.""" - for in_bam in bams.values(): - bam = os.path.abspath(in_bam) - assert os.path.exists(bam), "File {0} does not exist".format(in_bam) - if not os.path.exists(bam + '.bai'): - subprocess.call([samtools_exe, 'index', bam]) - return bams - - -def parse_metrics_file(metrics_path): - """Given a path to a Picard CollectRnaSeqMetrics output file, return a - dictionary consisting of its column, value mappings.
- """ - data_mark = 'PF_BASES' - tokens = [] - with open(metrics_path, 'r') as source: - line = source.readline().strip() - fsize = os.fstat(source.fileno()).st_size - while True: - if not line.startswith(data_mark): - # encountering EOF before metrics is an error - if source.tell() == fsize: - raise ValueError("Metrics not found inside %r" % \ - metrics_path) - line = source.readline().strip() - else: - break - - assert line.startswith(data_mark) - # split header line and append to tokens - tokens.append(line.split('\t')) - # and the values (one row after) - tokens.append(source.readline().strip().split('\t')) - data = {} - for col, value in zip(tokens[0], tokens[1]): - if not value: - data[COL_NAMES[col]] = None - elif col.startswith('PCT') or col.startswith('MEDIAN'): - if value != '?': - data[COL_NAMES[col]] = float(value) - else: - warnings.warn("Undefined value for %s in %s: %s" % (col, - metrics_path, value)) - data[COL_NAMES[col]] = None - else: - assert col in COL_NAMES, 'Unknown column: %s' % col - data[COL_NAMES[col]] = int(value) - - return data - - -def write_json(out_file, data, **kwargs): - with open(out_file, 'w') as jsonfile: - json.dump(data, jsonfile, sort_keys=True, indent=4, - separators=(',', ': ')) - - -def write_html(out_file, data, chrs, is_strand_spec): - if is_strand_spec: - table_func = build_table_html_ss - tpl = open(prog_path('rna_metrics.html')).read() - else: - table_func = build_table_html_nonss - tpl = open(prog_path('rna_metrics_nonss.html')).read() - - html_data = table_func(data, chrs) - with open(out_file, 'w') as htmlfile: - htmlfile.write(tpl.format(**html_data)) - - -def get_base_counts(metrics): - res = { - 'total': metrics['pfAlignedBases'], - 'exonic': metrics['utrBases'] + metrics['codingBases'], - 'intronic': metrics['intronicBases'], - 'intergenic': metrics['intergenicBases'], - } - # count percentages - for reg in ('exonic', 'intronic', 'intergenic'): - res[reg + '_pct'] = res[reg] * 100.0 / res['total'] - # format for display - for key, value in res.items(): - if key.endswith('_pct'): - res[key] = float_fmt(value=value) - else: - res[key] = int_fmt(value=value) - - return res - - -def build_table_html_nonss(data, chrs): - assert data['mix'] is not None and (data['fwd'] is None and data['rev'] is - None), "Invalid data %r" % data - mix = data['mix']['allMetrics'] - read_table = [ - '<table>', - '<tr>', - '<th>Chromosome</th>', - '<th>Reads</th>', - '</tr>' - ] - rrow_tpl = '<tr><td>{0}</td><td>{1}</td>' - for chr in chrs: - # not showing all stats in table, per chr only - if chr == 'ALL': - continue - count = int_fmt(value=mix[chr]['metrics']['countMapped']) - read_table.append(rrow_tpl.format(chr, count)) - read_table.append('</table>') - - return {'table_read_count': '\n'.join(read_table), - 'css': open(prog_path('rna_metrics.css')).read()} - - -def build_table_html_ss(data, chrs): - read_table = [ - '<table>', - '<tr>', - '<th rowspan="2">Chromosome</th>', - '<th>Mapped</th>', - '<th>Sense Annotation Only</th>', - '<th>Antisense Annotation Only</th>', - '</tr>' - '<tr>', - '<th>Both strands</th>', - '<th><green>Sense</green> / <red>Antisense</red> Reads</th>', - '<th><green>Antisense</green> / <red>Sense</red> Reads</th>', - '</tr>' - ] - rrow_tpl = '<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td></tr>' - rcell_tpl = '<green>{0}</green> / <red>{1}</red>' - - fwd_data, rev_data = data['fwd']['allMetrics'], data['rev']['allMetrics'] - mix_data = data['mix']['allMetrics'] - for chr in chrs: - # not showing all stats in table, per chr 
only - if chr == 'ALL': - continue - cur_fwd = fwd_data[chr]['metrics'] - cur_rev = rev_data[chr]['metrics'] - mix_count = mix_data[chr]['metrics']['countMapped'] - sense = rcell_tpl.format( - int_fmt(value=cur_fwd['correctStrandReads']), - int_fmt(value=cur_fwd['incorrectStrandReads'])) - antisense = rcell_tpl.format( - int_fmt(value=cur_rev['correctStrandReads']), - int_fmt(value=cur_rev['incorrectStrandReads'])) - read_table.append(rrow_tpl.format(chr, int_fmt(value=mix_count), - sense, antisense)) - read_table.append('</table>') - - base_table = [ - '<table>', - '<tr>', - '<th rowspan="2">Region</th>', - '<th colspan="2">Sense Annotation</th>', - '<th colspan="2">Antisense Annotation</th>', - '</tr>' - '<tr>', - '<th>Count</th>', '<th>%</th>', - '<th>Count</th>', '<th>%</th>', - '</tr>' - ] - crow_tpl = [ - '<tr>', - '<td>{0}</td>', '<td>{1}</td>', - '<td>{2}</td>', '<td>{3}</td>', - '<td>{4}</td>', - '</tr>', - ] - crow_tpl = ''.join(crow_tpl) - fwd_bcounts = get_base_counts(fwd_data['ALL']['metrics']) - rev_bcounts = get_base_counts(rev_data['ALL']['metrics']) - for reg in ('exonic', 'intronic', 'intergenic'): - pct = reg + '_pct' - base_table.append( - crow_tpl.format(reg.capitalize(), fwd_bcounts[reg], - fwd_bcounts[pct], rev_bcounts[reg], - rev_bcounts[pct])) - base_table.append('</table>') - - return {'table_read_count': '\n'.join(read_table), - 'table_base_count': '\n'.join(base_table), - 'css': open(prog_path('rna_metrics.css')).read()} - - -def aggregate_metrics(tracker, chrs): - """Aggregates all RNA seq metrics data into a single file.""" - all_dict = {} - keys = ['fwd', 'rev', 'mix'] - all_dict = dict.fromkeys(keys) - for bam, stats in tracker.files.items(): - assert stats['strand'] in keys - aggr_dict = {} - for chr, source in stats['chrs'].items(): - aggr_dict[chr] = { - 'fileName': os.path.basename(source), - 'metrics': parse_metrics_file(source), - } - name = os.path.basename(os.path.splitext(bam)[0]) - all_dict[stats['strand']] = {} - all_dict[stats['strand']]['bamFile'] = name - all_dict[stats['strand']]['allMetrics'] = aggr_dict - - return all_dict - - -def prog_path(fname): - prog_dir = os.path.join(os.path.dirname(os.path.abspath(__file__))) - return os.path.join(prog_dir, fname) - - -if __name__ == '__main__': - # params: - # main bam file - # thread num - # option to compare with annotation source (produces correct/incorrect counts) - # strand specific or not strand specific - # path to picard's collectrnaseqmetrics - # json output path - # samtools binary (default: environment samtools) - # java binary (default: environment java) - # picard options: - parser = argparse.ArgumentParser() - - parser.add_argument('m_bam', type=str, - help='Path to BAM file containing both sense and antisense reads') - parser.add_argument('--s-bam', dest='s_bam', type=str, - help='Path to BAM file containing sense reads') - parser.add_argument('--as-bam', dest='as_bam', type=str, - help='Path to BAM file containing antisense reads') - parser.add_argument('-o', '--outfile', dest='out_file', type=str, - help='Path to output file') - parser.add_argument('-t', '--threads', dest='threads', - default=1, type=int, help='Number of threads to use') - parser.add_argument('--chrs', dest='chrs', - default=prog_path('chrs.txt'), - help='Path to file containing chromosome names') - parser.add_argument('-a', '--annotation', dest='annot', - help='Annotation source') - parser.add_argument('--html', dest='is_html', - action='store_true', - help='Output HTML file') - parser.add_argument('--jar', 
dest='jar', type=str, - help='Path to Picard\'s CollectRnaSeqMetrics.jar') - parser.add_argument('--java', dest='java', type=str, - default=EXE_JAVA, - help='Path to java executable') - parser.add_argument('--samtools', dest='samtools', type=str, - default=EXE_SAMTOOLS, - help='Path to samtools executable') - - args = parser.parse_args() - - if args.s_bam is not None and args.as_bam is not None: - is_strand_spec = True - in_bams = {'mix': args.m_bam, 'fwd': args.s_bam, 'rev': args.as_bam} - elif args.s_bam is None and args.as_bam is None: - is_strand_spec = False - in_bams = {'mix': args.m_bam} - else: - raise ValueError("Incomplete argument: either the sense or the antisense " - "BAM file is not specified.") - - chrs = [line.strip() for line in open(args.chrs, 'r')] + ['ALL'] - # check for paths and indices - bam_files = prep_bam_file(in_bams, is_strand_spec, args.samtools) - # use picard and samtools if it's strand-specific - if is_strand_spec: - aggr_data = picard_reads_per_region_count(bam_files, chrs, args.annot, - args.jar, args.samtools, args.java, - threads=args.threads) - sam_data = samtools_reads_per_region_count(bam_files, chrs, - args.samtools) - aggr_data['mix'] = sam_data['mix'] - # otherwise use samtools only - else: - aggr_data = samtools_reads_per_region_count(bam_files, chrs, - args.samtools) - - # write to output file - if args.out_file is None: - ext = '.html' if args.is_html else '.json' - out_file = 'rna_metrics_out' + ext - else: - out_file = args.out_file - - write_func = write_html if args.is_html else write_json - write_func(out_file, aggr_data, chrs=chrs, is_strand_spec=is_strand_spec) diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/gentrap_front.png b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/gentrap_front.png deleted file mode 100644 index 7ecf40f6998c6b6ce833d1f352efe672b4e32859..0000000000000000000000000000000000000000 Binary files a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/gentrap_front.png and /dev/null differ diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/lib.tex b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/lib.tex deleted file mode 100644 index 7650cbc6bb0125aadcdaa78ece93436cde2c0551..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/lib.tex +++ /dev/null @@ -1,32 +0,0 @@ -\section{Library "((( lib.name )))" Results} -\label{lib:(((lib.name)))} - -\subsection{Input information} -\label{sec:input-((( lib.sample.name )))-((( lib.name )))} - -\begin{center} - \captionof{table}{Input files} - \label{tab:inputfiles-((( lib.sample.name )))-((( lib.name )))} - \begin{longtable}{ l l p{0.4\textwidth} } - \hline - File & Checksum & Name\\ - \hline \hline - \endhead - \hline - \multicolumn{3}{c}{\textit{Continued on next page}}\\ - \hline - \endfoot - \hline - \endlastfoot - Read 1 file & ((( lib.flexiprep_files.input_R1.md5|truncate(7, True, "") ))) & ((( lib.flexiprep_files.input_R1.path|basename )))\\ - ((* if lib.flexiprep_files.input_R2 *)) - Read 2 file & ((( lib.flexiprep_files.input_R2.md5|truncate(7, True, "") ))) & ((( lib.flexiprep_files.input_R2.path|basename )))\\ - ((* endif *)) - \end{longtable} -\end{center} -% HACK: to keep table counters in sync -\addtocounter{table}{-1} - -((* include "lib_mapping.tex" *)) -\clearpage -((* include "lib_seqeval.tex" *)) diff --git
a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/lib_mapping.tex b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/lib_mapping.tex deleted file mode 100644 index 39ec179065ba964b6c9cf31af5451ee6227f827f..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/lib_mapping.tex +++ /dev/null @@ -1,112 +0,0 @@ -\subsection{Mapping} -\label{sec:map-((( lib.sample.name )))-((( lib.name )))} - -\subsubsection{Mapping statistics} - -\indent - -% number + percentage of reads mapped to genome -% number + percentage of properly paired reads -\begin{center} - \captionof{table}{Mapping Overview} - \label{tab:bamstat-((( lib.sample.name )))-((( lib.name )))} - \setlength{\tabcolsep}{11pt} - ((* if lib.is_paired_end *)) - \begin{tabular}{ l r r r } - \hline - \multirow{2}{*}{Parameter} & \multicolumn{1}{c}{All Pairs} & \multicolumn{1}{c}{First in Pairs} & \multicolumn{1}{c}{Second in Pairs} \\ - & Value & Value & Value \\ - \hline \hline - Total reads & ((( lib.aln_metrics.PAIR.total_reads|nice_int ))) & ((( lib.aln_metrics.FIRST_OF_PAIR.total_reads|nice_int ))) & ((( lib.aln_metrics.SECOND_OF_PAIR.total_reads|nice_int ))) \\ - Mean read length & ((( lib.aln_metrics.PAIR.mean_read_length|nice_flt ))) & ((( lib.aln_metrics.FIRST_OF_PAIR.mean_read_length|nice_flt ))) & ((( lib.aln_metrics.SECOND_OF_PAIR.mean_read_length|nice_flt ))) \\ - Strand balance & ((( lib.aln_metrics.PAIR.strand_balance|nice_flt ))) & ((( lib.aln_metrics.FIRST_OF_PAIR.strand_balance|nice_flt ))) & ((( lib.aln_metrics.SECOND_OF_PAIR.strand_balance|nice_flt ))) \\ - \% Mapped to reference & ((( lib.aln_metrics.PAIR.pct_pf_reads_aligned|float2nice_pct )))\% & ((( lib.aln_metrics.FIRST_OF_PAIR.pct_pf_reads_aligned|float2nice_pct )))\% & ((( lib.aln_metrics.SECOND_OF_PAIR.pct_pf_reads_aligned|float2nice_pct )))\% \\ - \% Mapped to reference (MAPQ >= 20) & ((( lib.aln_metrics.PAIR.pct_pf_reads_aligned|float2nice_pct )))\% & ((( lib.aln_metrics.FIRST_OF_PAIR.pct_pf_reads_aligned|float2nice_pct )))\% & ((( lib.aln_metrics.SECOND_OF_PAIR.pct_pf_reads_aligned|float2nice_pct )))\% \\ - Mismatch rate & ((( lib.aln_metrics.PAIR.pf_mismatch_rate|float2nice_pct )))\% & ((( lib.aln_metrics.FIRST_OF_PAIR.pf_mismatch_rate|float2nice_pct )))\% & ((( lib.aln_metrics.SECOND_OF_PAIR.pf_mismatch_rate|float2nice_pct )))\% \\ - Indel rate & ((( lib.aln_metrics.PAIR.pf_indel_rate|float2nice_pct )))\% & ((( lib.aln_metrics.FIRST_OF_PAIR.pf_indel_rate|float2nice_pct )))\% & ((( lib.aln_metrics.SECOND_OF_PAIR.pf_indel_rate|float2nice_pct )))\% \\ - Chimeras & ((( lib.aln_metrics.PAIR.pct_chimeras|float2nice_pct )))\% & ((( lib.aln_metrics.FIRST_OF_PAIR.pct_chimeras|float2nice_pct )))\% & ((( lib.aln_metrics.SECOND_OF_PAIR.pct_chimeras|float2nice_pct )))\% \\ - \hline - ((* else *)) - \begin{tabular}{ l r } - \hline - \multirow{1}{*}{Parameter} & \multicolumn{1}{c}{Value} \\ - \hline \hline - Total reads & ((( lib.aln_metrics.UNPAIRED.total_reads|nice_int ))) \\ - Mean read length & ((( lib.aln_metrics.UNPAIRED.mean_read_length|nice_flt ))) \\ - Strand balance & ((( lib.aln_metrics.UNPAIRED.strand_balance|nice_flt ))) \\ - \% Mapped to reference & ((( lib.aln_metrics.UNPAIRED.pct_pf_reads_aligned|float2nice_pct )))\% \\ - \% Mapped to reference (MAPQ >= 20) & ((( lib.aln_metrics.UNPAIRED.pct_pf_reads_aligned|float2nice_pct )))\% \\ - Mismatch rate & ((( lib.aln_metrics.UNPAIRED.pf_mismatch_rate|float2nice_pct 
)))\% \\ - Indel rate & ((( lib.aln_metrics.UNPAIRED.pf_indel_rate|float2nice_pct )))\% \\ - \hline - ((* endif *)) - \end{tabular} -\end{center} - -((* if lib.is_paired_end *)) -% inferred insert size distribution -\subsubsection{Insert size distribution} - -\IfFileExists{((( lib.inserts_metrics_files.insert_size_histogram.path )))} -{ - \begin{figure}[h!] - \centering - \includegraphics[width=0.7\textwidth]{((( lib.inserts_metrics_files.insert_size_histogram.path )))} - \caption{Distribution of insert size length of paired-end reads mapped to opposite strands.} - \end{figure} -} -((= TODO: strand-specific stats -%{ -% \IfFileExists{((( vars['OUT_DIR'] )))/((( vars['SAMPLE'] ))).f.insertsizes.png} -% { -% \begin{figure}[h!] -% \centering -% \includegraphics[width=0.7\textwidth]{((( vars['OUT_DIR'] )))/((( vars['SAMPLE'] ))).f.insertsizes.png} -% \caption{Distribution of insert size length of paired-end reads whose first read maps to the minus strand.} -% \end{figure} -% }{} -% \IfFileExists{((( vars['OUT_DIR'] )))/((( vars['SAMPLE'] ))).r.insertsizes.png} -% { -% \begin{figure}[h!] -% \centering -% \includegraphics[width=0.7\textwidth]{((( vars['OUT_DIR'] )))/((( vars['SAMPLE'] ))).r.insertsizes.png} -% \caption{Distribution of insert size length of paired-end reads whose first read maps to the plus strand.} -% \end{figure} -% }{} -%} -=)) -((* endif *)) - -\subsubsection{RNA-specific metrics} - -\IfFileExists{((( lib.rna_metrics_files.output_chart.path )))} -{ - \begin{figure}[h!] - \centering - \includegraphics[width=0.7\textwidth]{((( lib.rna_metrics_files.output_chart.path )))} - \caption{Normalized coverage bias plot.} - \end{figure} -} - -\begin{center} - \captionof{table}{Functional annotation metrics} - \label{tab:fannot-((( lib.sample.name )))-((( lib.name )))} - \setlength{\tabcolsep}{11pt} - \begin{tabular}{ l r r r } - \hline - \multirow{2}{*}{Parameter} & \multicolumn{3}{c}{Value} \\ - & Count & \% of all & \% of aligned \\ - \hline \hline - Total bases & ((( lib.rna_metrics.pf_bases|nice_int ))) & 100\% & - \\ - Aligned bases & ((( lib.rna_metrics.pf_aligned_bases|nice_int ))) & ((( lib.rna_metrics.pct_aligned_bases_all|float2nice_pct )))\% & ((( lib.rna_metrics.pct_aligned_bases|float2nice_pct )))\% \\ - Exonic bases & ((( lib.rna_metrics.exonic_bases|nice_int ))) & ((( lib.rna_metrics.pct_exonic_bases_all|float2nice_pct )))\% & ((( lib.rna_metrics.pct_exonic_bases|float2nice_pct )))\% \\ - \hspace*{4mm}Coding bases & ((( lib.rna_metrics.coding_bases|nice_int ))) & ((( lib.rna_metrics.pct_coding_bases_all|float2nice_pct )))\% & ((( lib.rna_metrics.pct_coding_bases|float2nice_pct )))\% \\ - \hspace*{4mm}UTR bases & ((( lib.rna_metrics.utr_bases|nice_int ))) & ((( lib.rna_metrics.pct_utr_bases_all|float2nice_pct )))\% & ((( lib.rna_metrics.pct_utr_bases|float2nice_pct )))\% \\ - Intronic bases & ((( lib.rna_metrics.intronic_bases|nice_int ))) & ((( lib.rna_metrics.pct_intronic_bases_all|float2nice_pct )))\% & ((( lib.rna_metrics.pct_intronic_bases|float2nice_pct )))\% \\ - Intergenic bases & ((( lib.rna_metrics.intergenic_bases|nice_int ))) & ((( lib.rna_metrics.pct_intergenic_bases_all|float2nice_pct )))\% & ((( lib.rna_metrics.pct_intergenic_bases|float2nice_pct )))\% \\ - ((* if lib.rna_metrics.ribosomal_bases != "" *)) - Ribosomal bases & ((( lib.rna_metrics.ribosomal_bases|nice_int ))) & ((( lib.rna_metrics.pct_ribosomal_bases_all|float2nice_pct )))\% & ((( lib.rna_metrics.pct_ribosomal_bases|float2nice_pct )))\% \\ - ((* endif *)) - \hline - \end{tabular} -\end{center}
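The "\% of all" column above is not taken directly from Picard: it is derived in the report script's GentrapLib/GentrapSample classes earlier in this diff by dividing each region's base count by the total PF bases. A minimal sketch of that derivation, using hypothetical metric values:

# Sketch of how the "% of all" ratios are derived from Picard
# CollectRnaSeqMetrics output; the numbers below are made up.
picard = {
    "pf_bases": 1000000,
    "pf_aligned_bases": 900000,
    "coding_bases": 500000,
    "utr_bases": 200000,
}
pf_bases = float(picard["pf_bases"])
exonic_bases = picard["coding_bases"] + picard["utr_bases"]  # 700000
ratios = {
    # picard uses a pct_ prefix although these are ratios in [0, 1]
    "pct_exonic_bases_all": exonic_bases / pf_bases,                       # 0.70
    "pct_aligned_bases_all": picard["pf_aligned_bases"] / pf_bases,        # 0.90
    "pct_exonic_bases": exonic_bases / float(picard["pf_aligned_bases"]),  # ~0.78
}
# the float2nice_pct filter then multiplies by 100 for display, e.g. "70.00"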
diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/lib_seqeval.tex b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/lib_seqeval.tex deleted file mode 100644 index 8e28e56e9defea0295be0a38005b4bcc6b68e59c..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/lib_seqeval.tex +++ /dev/null @@ -1,456 +0,0 @@ -\subsection{Sequencing Results Evaluation} -\label{sec:seqeval-((( lib.sample.name )))-((( lib.name )))} - -This section contains statistics of the raw sequencing results. Statistics of -the preprocessing step(s) may be shown as well, depending on which preprocessing -steps were performed (Table~\ref{tab:runparams}). - -\indent - -All statistics, except for the per sequence \%GC graph, were collected using -FastQC. Visit the -\href{http://www.bioinformatics.babraham.ac.uk/projects/fastqc/}{FastQC website} -for more detailed explanations of these statistics. - -\subsubsection{Overview} -\label{subsec:seq-overview-((( lib.sample.name )))-((( lib.name )))} -There are four types of preprocessing that may be done: - -\begin{description} - \item[\textit{none}] \hfill \\ - No preprocessing step. - \item[\textit{adapter clipping}] \hfill \\ - Removal of known adapter sequences present in the sequencing - reads. The list of sequences is retrieved from the FastQC contaminant - list, which is packaged with the FastQC release used in this pipeline. - \item[\textit{quality trimming}] \hfill \\ - Removal of low-quality bases, which are often found at the 5' - or 3' ends of the reads. - \item[\textit{adapter clipping followed by quality trimming}] \hfill \\ - Both adapter clipping and base quality trimming. -\end{description} -\indent -Your chosen preprocessing method was:\textbf{ -((* if lib.clipping and lib.trimming *))adapter clipping followed by quality trimming -((* elif lib.clipping *))adapter clipping -((* elif lib.trimming *))quality trimming((* endif *))}. - -((* if lib.clipping *)) -\subsubsection{Adapter removal} -Known adapter sequences found in the raw data are listed in -Table~\ref{tab:adapters}. For each adapter sequence, the count of its -occurrences (partial or whole) is also listed. The presence of these -adapters does not always result in the FASTQ records being discarded. -FASTQ records are only discarded if clipping of the adapter sequence -results in sequences shorter than the threshold set in cutadapt. - -\indent - -For the complete list of known adapter sequences, consult the -\href{http://www.bioinformatics.babraham.ac.uk/projects/fastqc/}{official FastQC website}. -More information about clipping using cutadapt is available on the -\href{https://code.google.com/p/cutadapt/}{official cutadapt website}.
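To make the discard rule above concrete, here is a toy sketch (not the pipeline's actual code; cutadapt applies this logic internally via its minimum-length setting). The adapter, reads, and threshold are hypothetical examples:

# Toy illustration of the discard rule: a record is dropped only when
# clipping leaves fewer bases than a minimum-length threshold.
MIN_LENGTH = 20  # hypothetical threshold

def clip_adapter(read, adapter):
    """Remove a 3' adapter occurrence and everything after it, if present."""
    idx = read.find(adapter)
    return read if idx == -1 else read[:idx]

adapter = "AGATCGGAAGAG"
reads = [
    "ACGTACGTACGTACGTACGTACGT" + adapter,  # 24 bases remain -> kept
    "ACGTACGT" + adapter,                  # 8 bases remain -> discarded
]
kept = [clipped for clipped in (clip_adapter(r, adapter) for r in reads)
        if len(clipped) >= MIN_LENGTH]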
- -%\ClipContamTable -\begin{center} - \captionof{table}{Adapter Sequences Present in the Sample} - \label{tab:adapters} - \begin{longtable}{ p{14mm} r p{0.4\textwidth} r } - \hline - Read & Discarded & Adapter & Occurrence\\ - \hline \hline - \endhead - \hline - \multicolumn{4}{c}{\textit{Continued on next page}}\\ - \hline - \endfoot - \hline - \endlastfoot - ((* if lib.flexiprep.stats.clipping_R1 *)) - ((* if lib.flexiprep.stats.clipping_R1.adapters *)) - Read 1 & ((( lib.flexiprep.stats.clipping_R1.num_reads_affected|nice_int ))) - ((* for adapter, stat in lib.flexiprep.stats.clipping_R1.adapters.iteritems() *)) - ((* if loop.first *)) - & ((( adapter ))) & ((( stat|nice_int )))\\ - ((* else *)) - & & ((( adapter ))) & ((( stat|nice_int )))\\ - ((* endif *)) - ((* endfor *)) - ((* else *)) - Read 1 & 0 & \textit{none found} & 0\\ - ((* endif *)) - ((* if lib.is_paired_end *)) - ((* if lib.flexiprep.stats.clipping_R2.adapters *)) - Read 2 & ((( lib.flexiprep.stats.clipping_R2.num_reads_affected|nice_int ))) - ((* for adapter, stat in lib.flexiprep.stats.clipping_R2.adapters.iteritems() *)) - ((* if loop.first *)) - & ((( adapter ))) & ((( stat|nice_int )))\\ - ((* else *)) - & & ((( adapter ))) & ((( stat|nice_int )))\\ - ((* endif *)) - ((* endfor *)) - ((* else *)) - Read 2 & 0 & \textit{none found} & 0\\ - ((* endif *)) - ((* endif *)) - ((* endif *)) - \end{longtable} -\end{center} -\addtocounter{table}{-1} - -((* endif *)) - -\vspace{2mm} -((* if lib.trimming and lib.is_paired_end *)) -\subsubsection{Base quality trimming} - A summary of the trimming step is available in Table~\ref{tab:trim}. In short, - sickle is used to trim the 5' and 3' ends of each FASTQ record so that low - quality bases are trimmed off. If after trimming the FASTQ record becomes - shorter than the threshold set by sickle, the entire sequence is discarded. For this - step, a read pair completeness check is done along with trimming. - - \indent - - More information about sickle is available on the - \href{https://github.com/najoshi/sickle}{official sickle website}. - - \begin{center} - \captionof{table}{Summary of quality trimming step} - \label{tab:trim} - \begin{tabular}{ l r } - \hline - Parameter & Count\\ \hline \hline - Discarded FASTQ records from read 1 & ((( lib.flexiprep.stats.trimming_R1.num_reads_discarded_total|nice_int )))\\ - Discarded FASTQ records from read 2 & ((( lib.flexiprep.stats.trimming_R2.num_reads_discarded_total|nice_int )))\\ - \hline - \end{tabular} - \end{center} -((* endif *)) -%\vspace{2mm} - -\subsubsection{Basic statistics} -%\label{subsec:seq_basic} - -Basic statistics on the FASTQ files are shown below. For paired-end reads, the -read counts of read 1 and read 2, both before and after preprocessing, -must match. Read lengths are likely to vary after preprocessing, due to selective -clipping and trimming of the reads.
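The R1/R2 parity requirement stated above can be checked directly from the FastQC "Total Sequences" fields that the table below renders. A small hedged sketch, where fastqc_r1 and fastqc_r2 stand in for the report script's FastQC parser objects:

# Sketch of the paired-end consistency check implied above: read 1 and
# read 2 must contain the same number of records at every stage.
def assert_pair_counts(fastqc_r1, fastqc_r2, stage="raw"):
    n1 = int(fastqc_r1.basic_statistics.data["Total Sequences"])
    n2 = int(fastqc_r2.basic_statistics.data["Total Sequences"])
    assert n1 == n2, "{0}: read 1 has {1} records, read 2 has {2}".format(
        stage, n1, n2)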
- -\begin{center} - \captionof{table}{Basic Run Statistics} - %\label{tab:basestats} -((* if lib.clipping or lib.trimming *)) - \begin{longtable}{ l r r } - \hline - Parameter & Raw & Preprocessed \\ \hline \hline - \endhead - \hline - \multicolumn{3}{c}{\textit{Continued on next page}}\\ - \hline - \endfoot - \hline - \endlastfoot - Read 1 count & ((( lib.fastqc_r1.basic_statistics.data["Total Sequences"] ))) & ((( lib.fastqc_r1_qc.basic_statistics.data["Total Sequences"] )))\\ - Read 1 overall \%GC & ((( lib.fastqc_r1.basic_statistics.data["%GC"] ))) & ((( lib.fastqc_r1_qc.basic_statistics.data["%GC"] )))\\ - Read 1 length range & ((( lib.fastqc_r1.basic_statistics.data["Sequence length"] ))) & ((( lib.fastqc_r1_qc.basic_statistics.data["Sequence length"] )))\\ - \hline - ((* if lib.is_paired_end *)) - Read 2 count & ((( lib.fastqc_r2.basic_statistics.data["Total Sequences"] ))) & ((( lib.fastqc_r2_qc.basic_statistics.data["Total Sequences"] )))\\ - Read 2 overall \%GC & ((( lib.fastqc_r2.basic_statistics.data["%GC"] ))) & ((( lib.fastqc_r2_qc.basic_statistics.data["%GC"] )))\\ - Read 2 length range & ((( lib.fastqc_r2.basic_statistics.data["Sequence length"] ))) & ((( lib.fastqc_r2_qc.basic_statistics.data["Sequence length"] )))\\ - \hline - ((* endif *)) -((* else *)) - \begin{longtable}{ l r } - \hline - Parameter & Raw \\ \hline \hline - \endhead - \hline - \multicolumn{2}{c}{\textit{Continued on next page}}\\ - \hline - \endfoot - \hline - \endlastfoot - Read 1 count & ((( lib.fastqc_r1.basic_statistics.data["Total Sequences"] )))\\ - Read 1 overall \%GC & ((( lib.fastqc_r1.basic_statistics.data["%GC"] )))\\ - Read 1 length range & ((( lib.fastqc_r1.basic_statistics.data["Sequence length"] )))\\ - \hline - ((* if lib.is_paired_end *)) - Read 2 count & ((( lib.fastqc_r2.basic_statistics.data["Total Sequences"] )))\\ - Read 2 overall \%GC & ((( lib.fastqc_r2.basic_statistics.data["%GC"] )))\\ - Read 2 length range & ((( lib.fastqc_r2.basic_statistics.data["Sequence length"] )))\\ - \hline - ((* endif *)) -((* endif *)) - \end{longtable} -\end{center} - -% sequence length distribution -\subsubsection{Read length distribution} - The read length distribution for the raw read pair data is shown below. - Depending on your chosen preprocessing step, the length distribution for the - preprocessed data may be shown as well. The length distribution for the - preprocessed data usually becomes less uniform compared to the raw data due - to the variable removal of bases from each read. - \begin{figure}[h!] - \centering - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Raw read 1]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r1_files.plot_sequence_length_distribution.path )))} - } - \end{minipage} - \begin{minipage}[b]{0.48\textwidth} - \centering - ((* if lib.trimming or lib.clipping *)) - \subfloat[Preprocessed read 1]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r1_qc_files.plot_sequence_length_distribution.path )))} - } - ((* endif *)) - \end{minipage} - \caption{Read length distribution for read 1.} - %\label{fig:length_dist_before_and_after_1} - \end{figure} - -((* if lib.is_paired_end *)) - \begin{figure}[h!]
- \centering - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Raw read 2]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r2_files.plot_sequence_length_distribution.path )))} - } - \end{minipage} - \begin{minipage}[b]{0.48\textwidth} - \centering - ((* if lib.trimming or lib.clipping *)) - \subfloat[Preprocessed read 2]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r2_qc_files.plot_sequence_length_distribution.path )))} - } - ((* endif *)) - \end{minipage} - \caption{Read length distribution for read 2.} - %\label{fig:length_dist_before_and_after_2} - \end{figure} -((* endif *)) - -% per base sequence quality -\subsubsection{Per base sequence quality} - Here, the sequence quality for different base positions in all read pairs - is shown. For each figure, the central line represents the median value, - the blue line represents the mean, the yellow box represents the inter-quartile - range (25\%-75\%), and the upper and lower whiskers represent the 10\% and - 90\% points, respectively. The green-shaded region marks good quality calls, - orange-shaded regions mark reasonable quality calls, and red-shaded regions - mark poor quality calls. - - \indent - - Note that the latter base positions shown in the figures are - sometimes aggregates of multiple positions. - \begin{figure}[h!] - \centering - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Raw read 1]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r1_files.plot_per_base_quality.path )))} - } - \end{minipage} - \begin{minipage}[b]{0.48\textwidth} - \centering - ((* if lib.trimming or lib.clipping *)) - \subfloat[Preprocessed read 1]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r1_qc_files.plot_per_base_quality.path )))} - } - ((* endif *)) - \end{minipage} - \caption{Per base quality before and after processing for read 1.} - %\label{fig:per_base_qual_before_and_after_1} - \end{figure} - -((* if lib.is_paired_end *)) - \begin{figure}[h!] - \centering - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Raw read 2]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r2_files.plot_per_base_quality.path )))} - } - \end{minipage} - \begin{minipage}[b]{0.48\textwidth} - \centering - ((* if lib.trimming or lib.clipping *)) - \subfloat[Preprocessed read 2]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r2_qc_files.plot_per_base_quality.path )))} - } - ((* endif *)) - \end{minipage} - \caption{Per base quality before and after processing for read 2.} - %\label{fig:per_base_qual_before_and_after_2} - \end{figure} -((* endif *)) - -% per sequence quality scores -\subsubsection{Per sequence quality scores} - The read quality score distributions are shown below. - \begin{figure}[h!] - \centering - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Raw read 1]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r1_files.plot_per_sequence_quality.path )))} - } - \end{minipage} - \begin{minipage}[b]{0.48\textwidth} - \centering - ((* if lib.trimming or lib.clipping *)) - \subfloat[Preprocessed read 1]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r1_qc_files.plot_per_sequence_quality.path )))} - } - ((* endif *)) - \end{minipage} - \caption{Per sequence quality scores before and after preprocessing for read 1.} - %\label{fig:per_seq_qual_before_and_after_1} - \end{figure} - -((* if lib.is_paired_end *)) - \begin{figure}[h!]
- \centering - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Raw read 2]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r2_files.plot_per_sequence_quality.path )))} - } - \end{minipage} - \begin{minipage}[b]{0.48\textwidth} - \centering - ((* if lib.trimming or lib.clipping *)) - \subfloat[Preprocessed read 2]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r2_qc_files.plot_per_sequence_quality.path )))} - } - ((* endif *)) - \end{minipage} - \caption{Per sequence quality scores before and after preprocessing for read 2.} - %\label{fig:per_seq_qual_before_and_after_2} - \end{figure} -((* endif *)) - -% per base sequence content -\subsubsection{Per base sequence content} - The figures below plot the occurrence of each nucleotide at each position in - the reads. In a completely random library, you would expect the differences - among the four nucleotides to be minor. You may sometimes see a skewed - proportion of nucleotides near the start of the read due to the use of - non-random cDNA during sample preparation. - - \indent - - Note that the latter base positions shown in the figures are - sometimes aggregates of multiple positions. - \begin{figure}[h!] - \centering - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Raw read 1]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r1_files.plot_per_base_sequence_content.path )))} - } - \end{minipage} - \begin{minipage}[b]{0.48\textwidth} - \centering - ((* if lib.trimming or lib.clipping *)) - \subfloat[Preprocessed read 1]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r1_qc_files.plot_per_base_sequence_content.path )))} - } - ((* endif *)) - \end{minipage} - \caption{Per base sequence content before and after preprocessing for read 1.} - %\label{fig:per_base_content_before_and_after_1} - \end{figure} - -((* if lib.is_paired_end *)) - \begin{figure}[h!] - \centering - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Raw read 2]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r2_files.plot_per_base_sequence_content.path )))} - } - \end{minipage} - \begin{minipage}[b]{0.48\textwidth} - \centering - ((* if lib.trimming or lib.clipping *)) - \subfloat[Preprocessed read 2]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r2_qc_files.plot_per_base_sequence_content.path )))} - } - ((* endif *)) - \end{minipage} - \caption{Per base sequence content before and after preprocessing for read 2.} - %\label{fig:per_base_content_before_and_after_2} - \end{figure} -((* endif *)) - - -% per sequence GC content -\subsubsection{Per sequence GC content} - The figures below show the GC percentage distribution of all the read pair - data. - \begin{figure}[h!]
- \centering - ((* if lib.trimming or lib.clipping *)) - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Raw read 1]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r1_files.plot_per_sequence_gc_content.path )))} - } - \end{minipage} - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Preprocessed read 1]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r1_qc_files.plot_per_sequence_gc_content.path )))} - } - \end{minipage} - ((* else *)) - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Raw read 1]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r1_files.plot_per_sequence_gc_content.path )))} - } - \end{minipage} - ((* endif *)) - \caption{Per sequence GC content before and after preprocessing for read 1.} - %\label{fig:per_base_content_before_and_after_1} - \end{figure} - -((* if lib.is_paired_end *)) - \begin{figure}[h!] - \centering - ((* if lib.trimming or lib.clipping *)) - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Raw read 2]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r2_files.plot_per_sequence_gc_content.path )))} - } - \end{minipage} - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Preprocessed read 2]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r2_qc_files.plot_per_sequence_gc_content.path )))} - } - \end{minipage} - ((* else *)) - \begin{minipage}[b]{0.48\textwidth} - \centering - \subfloat[Raw read 2]{ - \includegraphics[width=\textwidth]{((( lib.fastqc_r2_files.plot_per_sequence_gc_content.path )))} - } - \end{minipage} - ((* endif *)) - \caption{Per sequence GC content before and after preprocessing for read 2.} - %\label{fig:per_base_content_before_and_after_2} - \end{figure} -((* endif *)) - - diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/main.tex b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/main.tex deleted file mode 100644 index d9365883b62103c1272e330628b6d772b5fd1632..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/main.tex +++ /dev/null @@ -1,307 +0,0 @@ -\documentclass[a4paper,12pt]{article} -\usepackage[a4paper,margin=1in]{geometry} -\usepackage[T1]{fontenc} -\usepackage[usenames,dvipsnames]{xcolor} -\usepackage{longtable} -\usepackage{graphicx} -\usepackage{subfig} -\usepackage{listings} -\usepackage{verbatim} -\usepackage{multirow} -\usepackage{url} -\usepackage{grffile} -\usepackage[superscript]{cite} -% must be located here, so we can handle filenames with underscores -\newcommand{\UnderscoreCommands}{\do\IfFileExists \do\verbatiminput% - \do\verbatimtabinput \do\citeNP \do\citeA \do\citeANP \do\citeN% - \do\shortcite \do\shortciteNP \do\shortciteA \do\shortciteANP% - \do\shortciteN \do\citeyear \do\citeyearNP% -} -\usepackage[strings]{underscore} - -\usepackage{fancyhdr} -\usepackage{hyperref} - -\setlength{\tabcolsep}{20pt} -\renewcommand{\arraystretch}{1.3} -\renewcommand{\familydefault}{\sfdefault} - -% requires the titling package, for subtitles -%\newcommand{\subtitle}[1]{ -% \posttitle{ -% \par\end{center} -% \begin{center}\large#1\end{center} -% \vskip0.5em}} - -\pagestyle{fancy} -\setlength{\headheight}{15.2pt} - -\fancyhf{} -\fancyhead[LE,RO]{\thepage} -\fancyhead[RE]{\textit{\nouppercase{\leftmark}}} -\fancyhead[LO]{\textit{\nouppercase{\rightmark}}} - -\begin{document} -\setlength{\parindent}{0in} -%\title{\Huge Gentrap Run Report}
-\title{\resizebox{0.7\linewidth}{!}{\itshape Gentrap Run Report}} -\author{LUMC Sequencing Analysis Support Core} -\maketitle -\begin{center} - {\LARGE version ((( run.version )))} -\end{center} -\begin{figure}[h!] - \centering - \includegraphics[width=0.8\textwidth]{((( run.logo )))} -\end{figure} -\thispagestyle{empty} -\clearpage - -\addtocontents{toc}{\protect\hypertarget{toc}{}} -\tableofcontents -\clearpage - - -\part{Overview} -\label{sec:overview} - -This document outlines the results obtained from running Gentrap, a generic -pipeline for transcriptome analysis. The pipeline itself is composed of several -programs, listed in Table~\ref{tab:programs}. Note that the list only contains -the programs used in this pipeline run. General pipeline settings that apply -to all samples are shown in Table~\ref{tab:runparams}, while general annotation -files are shown in Table~\ref{tab:annotfiles}. - -\begin{center} - \captionof{table}{Programs in Gentrap} - \label{tab:programs} - \begin{longtable}{ l l l p{0.2\textwidth} } - \hline - Program & Version & Checksum & Usage\\ - \hline \hline - \endhead - \hline - \multicolumn{4}{c}{\textit{Continued on next page}}\\ - \hline - \endfoot - \hline - \endlastfoot - Gentrap & ((( run.version ))) & - & the full pipeline\\ - ((* for program, info in run.executables.items()|sort *)) - ((( program ))) & ((( info.version ))) & ((( info.md5|truncate(7, True, "") ))) & ((( info.desc )))\\ - ((* endfor *)) - \end{longtable} -\end{center} -% HACK: to keep table counters in sync -\addtocounter{table}{-1} - -\begin{center} - \captionof{table}{General Run Parameters} - \label{tab:runparams} - \begin{longtable}{ p{0.4\textwidth} p{0.4\textwidth} } - \hline - Parameter & Value\\ - \hline \hline - \endhead - \hline - \multicolumn{2}{c}{\textit{Continued on next page}}\\ - \hline - \endfoot - \hline - \endlastfoot - Number of samples & ((( run.samples|length )))\\ - Number of libraries & ((( run.libs|length )))\\ - Library types & ((( run.lib_type )))\\ - Expression value measures & ((( run.settings.expression_measures|join(", ") )))\\ - Strand protocol & ((( run.settings.strand_protocol|lower )))\\ - Variant calling & ((* if run.settings.variant_calling *))enabled((* else *))disabled((* endif *))\\ - Ribosomal reads removal & ((* if run.settings.remove_ribosomal_reads *))enabled((* else *))disabled((* endif *))\\ - \end{longtable} -\end{center} -\addtocounter{table}{-1} - - -\begin{center} - \captionof{table}{Annotation Files} - \label{tab:annotfiles} - \begin{longtable}{ l l p{0.4\textwidth} } - \hline - File & Checksum & Name\\ - \hline \hline - \endhead - \hline - \multicolumn{3}{c}{\textit{Continued on next page}}\\ - \hline - \endfoot - \hline - \endlastfoot - General refFlat file & ((( run.files.annotation_refflat.md5|truncate(7, True, "") ))) & ((( run.files.annotation_refflat.path|basename )))\\ - ((* if run.files.annotation_gtf *)) - General GTF file & ((( run.files.annotation_gtf.md5|truncate(7, True, "") ))) & ((( run.files.annotation_gtf.path|basename )))\\ - ((* endif *)) - ((* if run.files.annotation_bed *)) - General BED file & ((( run.files.annotation_bed.md5|truncate(7, True, "") ))) & ((( run.files.annotation_bed.path|basename )))\\ - ((* endif *)) - ((* if run.files.ribosome_refflat *)) - Ribosome refFlat & ((( run.files.ribosome_refflat.md5|truncate(7, True, "") ))) & ((( run.files.ribosome_refflat.path|basename )))\\ - ((* endif *)) - \end{longtable} -\end{center} -% HACK: to keep table counters in sync -\addtocounter{table}{-1} - -\clearpage
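All of the ((( ... ))) and ((* ... *)) markers in main.tex and the templates it includes are rendered by the customized Jinja2 environment set up in the report script earlier in this diff. A minimal, self-contained sketch of that rendering, using an in-memory template and a made-up version string:

# Render a LaTeX-flavored Jinja2 snippet with the same delimiters the
# report script configures; the version value here is hypothetical.
from jinja2 import Environment

env = Environment()
env.block_start_string = "((*"
env.block_end_string = "*))"
env.variable_start_string = "((("
env.variable_end_string = ")))"

row = r"Gentrap & ((( version ))) & - & the full pipeline\\"
print(env.from_string(row).render(version="0.4.0"))
# -> Gentrap & 0.4.0 & - & the full pipeline\\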
- -((* if run.samples|length > 2 and run.settings.expression_measures|length > 0 *)) -\part{Multi Sample Results} -\label{sec:msr} -This section shows results that are computed from multiple samples. - -\begin{center} - \captionof{table}{Multi Sample Result Files} - \label{tab:msrfiles} - \begin{longtable}{ l l p{0.4\textwidth} } - \hline - File & Checksum & Name\\ - \hline \hline - \endhead - \hline - \multicolumn{3}{c}{\textit{Continued on next page}}\\ - \hline - \endfoot - \hline - \endlastfoot - ((* if run.files.gene_fragments_count *)) - Fragments per gene & ((( run.files.gene_fragments_count.md5|truncate(7, True, "") ))) & ((( run.files.gene_fragments_count.path|basename )))\\ - ((* endif *)) - - ((* if run.files.exon_fragments_count *)) - Fragments per exon & ((( run.files.exon_fragments_count.md5|truncate(7, True, "") ))) & ((( run.files.exon_fragments_count.path|basename )))\\ - ((* endif *)) - - ((* if run.files.gene_bases_count *)) - Bases per gene & ((( run.files.gene_bases_count.md5|truncate(7, True, "") ))) & ((( run.files.gene_bases_count.path|basename )))\\ - ((* endif *)) - - ((* if run.files.exon_bases_count *)) - Bases per exon & ((( run.files.exon_bases_count.md5|truncate(7, True, "") ))) & ((( run.files.exon_bases_count.path|basename )))\\ - ((* endif *)) - - ((* if run.files.gene_fpkm_cufflinks_strict *)) - Cufflinks (strict, gene) & ((( run.files.gene_fpkm_cufflinks_strict.md5|truncate(7, True, "") ))) & ((( run.files.gene_fpkm_cufflinks_strict.path|basename )))\\ - ((* endif *)) - ((* if run.files.isoform_fpkm_cufflinks_strict *)) - Cufflinks (strict, isoform) & ((( run.files.isoform_fpkm_cufflinks_strict.md5|truncate(7, True, "") ))) & ((( run.files.isoform_fpkm_cufflinks_strict.path|basename )))\\ - ((* endif *)) - - ((* if run.files.gene_fpkm_cufflinks_guided *)) - Cufflinks (guided, gene) & ((( run.files.gene_fpkm_cufflinks_guided.md5|truncate(7, True, "") ))) & ((( run.files.gene_fpkm_cufflinks_guided.path|basename )))\\ - ((* endif *)) - ((* if run.files.isoform_fpkm_cufflinks_guided *)) - Cufflinks (guided, isoform) & ((( run.files.isoform_fpkm_cufflinks_guided.md5|truncate(7, True, "") ))) & ((( run.files.isoform_fpkm_cufflinks_guided.path|basename )))\\ - ((* endif *)) - - ((* if run.files.gene_fpkm_cufflinks_blind *)) - Cufflinks (blind, gene) & ((( run.files.gene_fpkm_cufflinks_blind.md5|truncate(7, True, "") ))) & ((( run.files.gene_fpkm_cufflinks_blind.path|basename )))\\ - ((* endif *)) - ((* if run.files.isoform_fpkm_cufflinks_blind *)) - Cufflinks (blind, isoform) & ((( run.files.isoform_fpkm_cufflinks_blind.md5|truncate(7, True, "") ))) & ((( run.files.isoform_fpkm_cufflinks_blind.path|basename )))\\ - ((* endif *)) - \end{longtable} -\end{center} -% HACK: to keep table counters in sync -\addtocounter{table}{-1} - -((* if run.files.gene_fragments_count_heatmap *)) -\begin{figure}[h!] - \centering - \includegraphics[width=0.65\textwidth]{((( run.files.gene_fragments_count_heatmap.path )))} - \caption{Between-sample correlation of fragment count per gene.} -\end{figure} -((* endif *)) - -((* if run.files.exon_fragments_count_heatmap *)) -\begin{figure}[h!] - \centering - \includegraphics[width=0.65\textwidth]{((( run.files.exon_fragments_count_heatmap.path )))} - \caption{Between-sample correlation of fragment count per exon.} -\end{figure} -((* endif *)) - -((* if run.files.gene_bases_count_heatmap *)) -\begin{figure}[h!]
- \centering - \includegraphics[width=0.65\textwidth]{((( run.files.gene_bases_count_heatmap.path )))} - \caption{Between-sample correlation of base count per gene.} -\end{figure} -((* endif *)) - -((* if run.files.exon_bases_count_heatmap *)) -\begin{figure}[h!] - \centering - \includegraphics[width=0.65\textwidth]{((( run.files.exon_bases_count_heatmap.path )))} - \caption{Between-sample correlation of base count per exon.} -\end{figure} -((* endif *)) - -((* if run.files.gene_fpkm_cufflinks_strict_heatmap *)) -\begin{figure}[h!] - \centering - \includegraphics[width=0.65\textwidth]{((( run.files.gene_fpkm_cufflinks_strict_heatmap.path )))} - \caption{Between-sample correlation of the gene-level FPKM (Cufflinks strict mode).} -\end{figure} -((* endif *)) - -((* if run.files.gene_fpkm_cufflinks_guided_heatmap *)) -\begin{figure}[h!] - \centering - \includegraphics[width=0.65\textwidth]{((( run.files.gene_fpkm_cufflinks_guided_heatmap.path )))} - \caption{Between-sample correlation of the gene-level FPKM (Cufflinks guided mode).} -\end{figure} -((* endif *)) - -((* if run.files.gene_fpkm_cufflinks_blind_heatmap *)) -\begin{figure}[h!] - \centering - \includegraphics[width=0.65\textwidth]{((( run.files.gene_fpkm_cufflinks_blind_heatmap.path )))} - \caption{Between-sample correlation of the gene-level FPKM (Cufflinks blind mode).} -\end{figure} -((* endif *)) - -((* endif *)) - -\clearpage - -((* for sample in run.samples.values()|sort *)) -((* include "sample.tex" *)) -\clearpage -((* endfor *)) - - -\part{About Gentrap} -\label{apx:about} - -The Generic Transcriptome Analysis Pipeline (Gentrap) is a -generic pipeline for analyzing transcripts from RNA-seq experiments. \\ - -Gentrap was developed by Wibowo Arindrarto (\href{mailto:w.arindrarto@lumc.nl}{w.arindrarto@lumc.nl}) -based on raw scripts written by Jeroen Laros -(\href{mailto:j.f.j.laros@lumc.nl}{j.f.j.laros@lumc.nl}) and -Peter-Bram 't Hoen -(\href{mailto:p.a.c._t_hoen@lumc.nl}{p.a.c._t_hoen@lumc.nl}) as part of the -\href{https://git.lumc.nl/biopet/biopet}{Biopet framework}. \\ - -The Biopet framework is developed by the -\href{http://sasc.lumc.nl}{Sequencing Analysis Support Core} of the -\href{http://lumc.nl}{Leiden University Medical Center}, by extending the -\href{http://gatkforums.broadinstitute.org/discussion/1306/overview-of-queue}{Queue framework}. -Please see the respective web sites for licensing information. - -\indent - -Cover page image: T7 RNA Polymerase and a dsDNA template (PDB ID \texttt{1msw}). -Created by Thomas Splettstoesser, taken from -\href{http://commons.wikimedia.org/wiki/File:T7_RNA_polymerase.jpg}{Wikimedia Commons}.
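The between-sample correlation heatmaps in the multi-sample part above are drawn by a separate R plotting script that is not included in this diff. Purely as an illustration of the quantity being plotted, here is a rough Python sketch with invented counts:

# Rough sketch: the heatmaps visualize a sample-by-sample correlation matrix
# computed from an expression table (rows: genes, columns: samples).
# The count values below are invented for illustration only.
import numpy as np

counts = np.array([
    [120, 130,  10],
    [300, 310,  25],
    [ 50,  55, 200],
])
corr = np.corrcoef(counts, rowvar=False)  # 3x3 between-sample correlations
print(np.round(corr, 2))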
- - -\end{document} diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/sample.tex b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/sample.tex deleted file mode 100644 index b5f31e50efa563c748fa03ada85b61a6139510fe..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/sample.tex +++ /dev/null @@ -1,11 +0,0 @@ -\part{Sample "((( sample.name )))" Results} -\label{sample:(((sample.name)))} - -((* if sample.libs|length > 1 *)) -((* include "sample_mapping.tex" *)) -((* endif *)) - -((* for lib in sample.libs.values() *)) -((* include "lib.tex" *)) -\clearpage -((* endfor *)) diff --git a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/sample_mapping.tex b/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/sample_mapping.tex deleted file mode 100644 index 4f4da850da797c3fb1a355bd32b94955d367f5b2..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/resources/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf/sample_mapping.tex +++ /dev/null @@ -1,122 +0,0 @@ -\section{Mapping} -\label{sec:map-((( sample.name )))} - -\subsection{Mapping statistics} - -\indent - -% number + percentage of reads mapped to genome -% number + percentage of properly paired reads -\begin{center} - \captionof{table}{Mapping Overview} - \label{tab:bamstat-((( sample.name )))} - \setlength{\tabcolsep}{11pt} - ((* if sample.is_paired_end *)) - \begin{tabular}{ l r r r } - \hline - \multirow{2}{*}{Parameter} & \multicolumn{1}{c}{All Pairs} & \multicolumn{1}{c}{First in Pairs} & \multicolumn{1}{c}{Second in Pairs} \\ - & Value & Value & Value \\ - \hline \hline - Total reads & ((( sample.aln_metrics.PAIR.total_reads|nice_int ))) & ((( sample.aln_metrics.FIRST_OF_PAIR.total_reads|nice_int ))) & ((( sample.aln_metrics.SECOND_OF_PAIR.total_reads|nice_int ))) \\ - Mean read length & ((( sample.aln_metrics.PAIR.mean_read_length|nice_flt ))) & ((( sample.aln_metrics.FIRST_OF_PAIR.mean_read_length|nice_flt ))) & ((( sample.aln_metrics.SECOND_OF_PAIR.mean_read_length|nice_flt ))) \\ - Strand balance & ((( sample.aln_metrics.PAIR.strand_balance|nice_flt ))) & ((( sample.aln_metrics.FIRST_OF_PAIR.strand_balance|nice_flt ))) & ((( sample.aln_metrics.SECOND_OF_PAIR.strand_balance|nice_flt ))) \\ - \% Mapped to reference & ((( sample.aln_metrics.PAIR.pct_pf_reads_aligned|float2nice_pct )))\% & ((( sample.aln_metrics.FIRST_OF_PAIR.pct_pf_reads_aligned|float2nice_pct )))\% & ((( sample.aln_metrics.SECOND_OF_PAIR.pct_pf_reads_aligned|float2nice_pct )))\% \\ - \% Mapped to reference (MAPQ >= 20) & ((( sample.aln_metrics.PAIR.pct_pf_reads_aligned|float2nice_pct )))\% & ((( sample.aln_metrics.FIRST_OF_PAIR.pct_pf_reads_aligned|float2nice_pct )))\% & ((( sample.aln_metrics.SECOND_OF_PAIR.pct_pf_reads_aligned|float2nice_pct )))\% \\ - Mismatch rate & ((( sample.aln_metrics.PAIR.pf_mismatch_rate|float2nice_pct )))\% & ((( sample.aln_metrics.FIRST_OF_PAIR.pf_mismatch_rate|float2nice_pct )))\% & ((( sample.aln_metrics.SECOND_OF_PAIR.pf_mismatch_rate|float2nice_pct )))\% \\ - Indel rate & ((( sample.aln_metrics.PAIR.pf_indel_rate|float2nice_pct )))\% & ((( sample.aln_metrics.FIRST_OF_PAIR.pf_indel_rate|float2nice_pct )))\% & ((( sample.aln_metrics.SECOND_OF_PAIR.pf_indel_rate|float2nice_pct )))\% \\ - Chimeras & ((( sample.aln_metrics.PAIR.pct_chimeras|float2nice_pct )))\% & ((( 
sample.aln_metrics.FIRST_OF_PAIR.pct_chimeras|float2nice_pct )))\% & ((( sample.aln_metrics.SECOND_OF_PAIR.pct_chimeras|float2nice_pct )))\% \\ - \hline - ((* else *)) - \begin{tabular}{ l r } - \hline - \multirow{1}{*}{Parameter} & \multicolumn{1}{c}{Value} \\ - \hline \hline - Total reads & ((( sample.aln_metrics.UNPAIRED.total_reads|nice_int ))) \\ - Mean read length & ((( sample.aln_metrics.UNPAIRED.mean_read_length|nice_flt ))) \\ - Strand balance & ((( sample.aln_metrics.UNPAIRED.strand_balance|nice_flt ))) \\ - \% Mapped to reference & ((( sample.aln_metrics.UNPAIRED.pct_pf_reads_aligned|float2nice_pct )))\% \\ - \% Mapped to reference (MAPQ >= 20) & ((( sample.aln_metrics.UNPAIRED.pct_pf_reads_aligned|float2nice_pct )))\% \\ - Mismatch rate & ((( sample.aln_metrics.UNPAIRED.pf_mismatch_rate|float2nice_pct )))\% \\ - Indel rate & ((( sample.aln_metrics.UNPAIRED.pf_indel_rate|float2nice_pct )))\% \\ - \hline - ((* endif *)) - \end{tabular} -\end{center} - -((* if sample.is_paired_end *)) -% inferred insert size distribution -\subsubsection{Insert size distribution} - -\IfFileExists{((( sample.inserts_metrics_files.insert_size_histogram.path )))} -{ - \begin{figure}[h!] - \centering - \includegraphics[width=0.7\textwidth]{((( sample.inserts_metrics_files.insert_size_histogram.path )))} - \caption{Distribution of insert size length of paired-end reads mapped to opposite strands.} - \end{figure} -} -((= TODO: strand-specific stats -%{ -% \IfFileExists{((( vars['OUT_DIR'] )))/((( vars['SAMPLE'] ))).f.insertsizes.png} -% { -% \begin{figure}[h!] -% \centering -% \includegraphics[width=0.7\textwidth]{((( vars['OUT_DIR'] )))/((( vars['SAMPLE'] ))).f.insertsizes.png} -% \caption{Distribution of insert size length of paired-end reads whose first read maps to the minus strand.} -% \end{figure} -% }{} -% \IfFileExists{((( vars['OUT_DIR'] )))/((( vars['SAMPLE'] ))).r.insertsizes.png} -% { -% \begin{figure}[h!] -% \centering -% \includegraphics[width=0.7\textwidth]{((( vars['OUT_DIR'] )))/((( vars['SAMPLE'] ))).r.insertsizes.png} -% \caption{Distribution of insert size length of paired-end reads whose first read maps to the plus strand.} -% \end{figure} -% }{} -%} -=)) -((* endif *)) - - -\subsection{RNA-specific metrics} - -\IfFileExists{((( sample.rna_metrics_files.output_chart.path )))} -{ - \begin{figure}[h!] 
- \centering - \includegraphics[width=0.7\textwidth]{((( sample.rna_metrics_files.output_chart.path )))} - \caption{Normalized coverage bias plot.} - \end{figure} -} - -\begin{center} - \captionof{table}{Functional annotation metrics} - \label{tab:fannot-((( sample.name )))} - \setlength{\tabcolsep}{11pt} - \begin{tabular}{ l r r r } - \hline - \multirow{2}{*}{Parameter} & \multicolumn{3}{c}{Value} \\ - & Count & \% of all & \% of aligned \\ - \hline \hline - Total bases & ((( sample.rna_metrics.pf_bases|nice_int ))) & 100\% & - \\ - Aligned bases & ((( sample.rna_metrics.pf_aligned_bases|nice_int ))) & ((( sample.rna_metrics.pct_aligned_bases_all|float2nice_pct )))\% & ((( sample.rna_metrics.pct_aligned_bases|float2nice_pct )))\% \\ - Exonic bases & ((( sample.rna_metrics.exonic_bases|nice_int ))) & ((( sample.rna_metrics.pct_exonic_bases_all|float2nice_pct )))\% & ((( sample.rna_metrics.pct_exonic_bases|float2nice_pct )))\% \\ - \hspace*{4mm}Coding bases & ((( sample.rna_metrics.coding_bases|nice_int ))) & ((( sample.rna_metrics.pct_coding_bases_all|float2nice_pct )))\% & ((( sample.rna_metrics.pct_coding_bases|float2nice_pct )))\% \\ - \hspace*{4mm}UTR bases & ((( sample.rna_metrics.utr_bases|nice_int ))) & ((( sample.rna_metrics.pct_utr_bases_all|float2nice_pct )))\% & ((( sample.rna_metrics.pct_utr_bases|float2nice_pct )))\% \\ - Intronic bases & ((( sample.rna_metrics.intronic_bases|nice_int ))) & ((( sample.rna_metrics.pct_intronic_bases_all|float2nice_pct )))\% & ((( sample.rna_metrics.pct_intronic_bases|float2nice_pct )))\% \\ - Intergenic bases & ((( sample.rna_metrics.intergenic_bases|nice_int ))) & ((( sample.rna_metrics.pct_intergenic_bases_all|float2nice_pct )))\% & ((( sample.rna_metrics.pct_intergenic_bases|float2nice_pct )))\% \\ - ((* if sample.rna_metrics.ribosomal_bases != "" *)) - Ribosomal bases & ((( sample.rna_metrics.ribosomal_bases|nice_int ))) & ((( sample.rna_metrics.pct_ribosomal_bases_all|float2nice_pct )))\% & ((( sample.rna_metrics.pct_ribosomal_bases|float2nice_pct )))\% \\ - ((* endif *)) - \hline - Median 5' bias & ((( sample.rna_metrics.median_5prime_bias ))) & - & - \\ - Median 3' bias & ((( sample.rna_metrics.median_3prime_bias ))) & - & - \\ - Median 5' to 3' bias & ((( sample.rna_metrics.median_5prime_to_3prime_bias ))) & - & - \\ - \hline - ((* if sample.run.settings.strand_protocol != "non_specific" *)) - Correct strand reads & ((( sample.rna_metrics.correct_strand_reads|nice_int ))) & - & - \\ - Incorrect strand reads & ((( sample.rna_metrics.incorrect_strand_reads|nice_int ))) & - & - \\ - ((* endif *)) - \hline - \end{tabular} -\end{center} diff --git a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/CufflinksProducer.scala b/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/CufflinksProducer.scala deleted file mode 100644 index 0a2e48c00688c6cf097066498cd4f6a62f54b107..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/CufflinksProducer.scala +++ /dev/null @@ -1,103 +0,0 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. 
The source code within this project that are
- * not part of GATK Queue is freely available for non-commercial use under an AGPL
- * license; For commercial users or users who do not want to follow the AGPL
- * license, please contact us to obtain a separate license.
- */
-package nl.lumc.sasc.biopet.pipelines.gentrap
-
-import java.io.File
-
-import nl.lumc.sasc.biopet.extensions.{ Cufflinks, Ln }
-
-/** General trait for containing Cufflinks results */
-trait CufflinksProducer {
-
-  import Gentrap.ExpMeasures._
-  import Gentrap.StrandProtocol._
-  import Gentrap._
-
-  // TODO: move vars that are used in Gentrap
-  protected def sampleDir: File
-  protected def sampleId: String
-  protected def pipeline: Gentrap
-  protected def alnFile: File
-
-  /** Valid Cufflinks measure types */
-  protected val cufflinksMeasures = Set(CufflinksStrict, CufflinksGuided, CufflinksBlind)
-
-  /** Cufflinks' terms for strand specificity */
-  lazy val strandedness: String = {
-    //require(pipeline.config.contains("strand_protocol"))
-    pipeline.strandProtocol match {
-      case NonSpecific => "fr-unstranded"
-      case Dutp        => "fr-firststrand"
-      case otherwise   => throw new IllegalStateException("Unexpected strand type for cufflinks: " + otherwise.toString)
-    }
-  }
-
-  /** Case class containing a Cufflinks job plus its output symlink jobs */
-  protected case class CufflinksJobSet(cuffType: ExpMeasures.Value) {
-
-    require(cufflinksMeasures.contains(cuffType),
-      s"Cufflinks measurement type must be one of ${cufflinksMeasures.mkString(", ")}, not $cuffType")
-
-    /** Base name for output file extensions and config path */
-    lazy val name: String = cuffType match {
-      case CufflinksStrict => "cufflinks_strict"
-      case CufflinksGuided => "cufflinks_guided"
-      case CufflinksBlind  => "cufflinks_blind"
-      case otherwise       => throw new IllegalStateException("Unexpected cufflinks type: " + otherwise.toString)
-    }
-
-    /** Container for all jobs in this job set */
-    def jobs = Seq(cufflinksJob, geneFpkmJob, isoformFpkmJob)
-
-    /** The main Cufflinks job */
-    lazy val cufflinksJob: Cufflinks = {
-      val job = new Cufflinks(pipeline) {
-        override def configName = "cufflinks"
-        override def configPath: List[String] = super.configPath ::: name :: Nil
-      }
-      job.input = alnFile
-      job.library_type = Option(strandedness)
-      job.output_dir = new File(sampleDir, name)
-      /*
-      job.GTF = cuffType match {
-        case CufflinksStrict => pipeline.annotationGtf
-        case otherwise       => None
-      }
-      job.GTF_guide = cuffType match {
-        case CufflinksGuided => pipeline.annotationGtf
-        case otherwise       => None
-      }
-      */
-      job
-    }
-
-    /** Job for symlinking gene FPKM results so that they have a standard filename (with the sample ID) */
-    lazy val geneFpkmJob: Ln = {
-      val job = new Ln(pipeline)
-      job.input = cufflinksJob.outputGenesFpkm
-      job.output = new File(cufflinksJob.output_dir, s"$sampleId.genes_fpkm_$name")
-      job
-    }
-
-    /** Job for symlinking isoform FPKM results so that they have a standard filename (with the sample ID) */
-    lazy val isoformFpkmJob: Ln = {
-      val job = new Ln(pipeline)
-      job.input = cufflinksJob.outputIsoformsFpkm
-      job.output = new File(cufflinksJob.output_dir, s"$sampleId.isoforms_fpkm_$name")
-      job
-    }
-  }
-}
diff --git a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/extensions/CustomVarScan.scala b/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/extensions/CustomVarScan.scala
deleted file mode 100644
index f2f11a03e775ebe5a7fa4161813a0157c8f7976a..0000000000000000000000000000000000000000
--- a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/extensions/CustomVarScan.scala
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Biopet is built on top of GATK Queue for building bioinformatic
- * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
- * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
- * should also be able to execute Biopet tools and pipelines.
- *
- * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
- *
- * Contact us at: sasc@lumc.nl
- *
- * A dual licensing mode is applied. The source code within this project that are
- * not part of GATK Queue is freely available for non-commercial use under an AGPL
- * license; For commercial users or users who do not want to follow the AGPL
- * license, please contact us to obtain a separate license.
- */
-package nl.lumc.sasc.biopet.pipelines.gentrap.extensions
-
-import java.io.File
-
-import nl.lumc.sasc.biopet.core.{ Reference, BiopetCommandLineFunction }
-import nl.lumc.sasc.biopet.core.extensions.PythonCommandLineFunction
-import nl.lumc.sasc.biopet.utils.config.Configurable
-import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsMpileup
-import nl.lumc.sasc.biopet.extensions.varscan.Mpileup2cns
-import nl.lumc.sasc.biopet.extensions.{ Bgzip, Tabix }
-import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
-
-/** Ad-hoc extension for VarScan variant calling that involves a six-command pipe */
-// FIXME: generalize piping instead of building something by hand like this!
-// Better to do everything quick and dirty here than to leave something half-implemented with the objects
-class CustomVarScan(val root: Configurable) extends BiopetCommandLineFunction with Reference { wrapper =>
-
-  override def configName = "customvarscan"
-
-  @Input(doc = "Input BAM file", required = true)
-  var input: File = null
-
-  @Output(doc = "Output VCF file", required = true)
-  var output: File = null
-
-  @Output(doc = "Output VCF file index", required = true)
-  lazy val outputIndex: File = new File(output.toString + ".tbi")
-
-  // Pipe components, in order: mpileup | fix_mpileup.py | grep (drop empty piles) | varscan, then bgzip and tabix
-  private def mpileup = new SamtoolsMpileup(wrapper.root) {
-    this.input = List(wrapper.input)
-    override def configName = wrapper.configName
-    disableBaq = true
-    depth = Option(1000000)
-    outputMappingQuality = true
-  }
-
-  private def fixMpileup = new PythonCommandLineFunction {
-    setPythonScript("fix_mpileup.py", "/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/")
-    override val root: Configurable = wrapper.root
-    override def configName = wrapper.configName
-    def cmdLine = getPythonCommand
-  }
-
-  private def removeEmptyPile() = new BiopetCommandLineFunction {
-    override val root: Configurable = wrapper.root
-    override def configName = wrapper.configName
-    executable = config("exe", default = "grep", freeVar = false)
-    override def cmdLine: String = required(executable) + required("-vP") + required("""\t\t""")
-  }
-
-  private val varscan = new Mpileup2cns(wrapper.root) {
-    override def configName = wrapper.configName
-    strandFilter = Option(0)
-    outputVcf = Option(1)
-  }
-
-  private val compress = new Bgzip(wrapper.root)
-
-  private val index = new Tabix(wrapper.root) {
-    override def configName = wrapper.configName
-    p = Option("vcf")
-  }
-
-  override def freezeFieldValues(): Unit = {
-    varscan.output = Option(new File(wrapper.output.toString.stripSuffix(".gz")))
-    compress.input = List(varscan.output.get)
-    compress.output = this.output
-    index.input
= compress.output - super.freezeFieldValues() - varscan.qSettings = this.qSettings - varscan.freezeFieldValues() - } - - override def beforeGraph(): Unit = { - super.beforeGraph() - require(output.toString.endsWith(".gz"), "Output must have a .gz file extension") - deps :+= referenceFasta() - } - - def cmdLine: String = { - // FIXME: manual trigger of commandLine for version retrieval - mpileup.commandLine - (mpileup | fixMpileup | removeEmptyPile() | varscan).commandLine + " && " + compress.commandLine + " && " + index.commandLine - } -} diff --git a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/extensions/Pdflatex.scala b/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/extensions/Pdflatex.scala deleted file mode 100644 index 4747856d6f58943b33f0407b63c37311ab1f67fb..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/extensions/Pdflatex.scala +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. - */ -package nl.lumc.sasc.biopet.pipelines.gentrap.extensions - -import java.io.File - -import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction -import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output } - -/** - * Wrapper for the pdflatex executable - */ -class Pdflatex(val root: Configurable) extends BiopetCommandLineFunction { - - executable = config("exe", default = "pdflatex", freeVar = false) - override val executableToCanonicalPath = false - - @Input(doc = "Input LaTeX template", required = true) - var input: File = null - - @Output(doc = "Output directory", required = true) - var outputDir: File = null - - @Argument(doc = "Job name", required = true) - var name: String = null - - @Output(doc = "Output PDF file") - lazy val outputPdf: File = { - require(name != null && outputDir != null, "Job name and output directory must be defined") - new File(outputDir, name + ".pdf") - } - - def cmdLine = { - // repeating command 3x times to get internal references working correctly - val singleCommand = required(executable) + - required("-output-directory", outputDir) + - required("-jobname", name) + - required(input) - Seq(singleCommand, singleCommand, singleCommand).mkString(" && ") - } -} diff --git a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/extensions/RawBaseCounter.scala b/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/extensions/RawBaseCounter.scala deleted file mode 100644 index 8be307a9d1323f0252d5b7f2e24f52c07148adcc..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/extensions/RawBaseCounter.scala +++ /dev/null @@ -1,114 +0,0 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * 
pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. - */ -package nl.lumc.sasc.biopet.pipelines.gentrap.extensions - -import java.io.File - -import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction -import nl.lumc.sasc.biopet.utils.Logging -import nl.lumc.sasc.biopet.utils.config.Configurable -import nl.lumc.sasc.biopet.core.extensions.PythonCommandLineFunction -import org.broadinstitute.gatk.utils.commandline.{ Input, Output } - -import scala.language.reflectiveCalls - -/** Ad-hoc extension for counting bases that involves 3-command pipe */ -// FIXME: generalize piping instead of building something by hand like this! -// Better to do everything quick and dirty here rather than something half-implemented with the objects -class RawBaseCounter(val root: Configurable) extends BiopetCommandLineFunction { wrapper => - - override def configName = "rawbasecounter" - - @Input(doc = "Reference BED file", required = true) - var annotationBed: File = null - - @Input(doc = "Input BAM file from both strands", required = false) - var inputBoth: File = null - - @Input(doc = "Input BAM file from the plus strand", required = false) - var inputPlus: File = null - - @Input(doc = "Input BAM file from the minus strand", required = false) - var inputMinus: File = null - - @Output(doc = "Output base count file", required = true) - var output: File = null - - /** Internal flag for mixed strand mode */ - private lazy val mixedStrand: Boolean = inputBoth != null && inputPlus == null && inputMinus == null - - /** Internal flag for distinct strand / strand-specific mode */ - private lazy val distinctStrand: Boolean = inputBoth == null && inputPlus != null && inputMinus != null - - private def grepForStrand = new BiopetCommandLineFunction { - var strand: String = null - override val root: Configurable = wrapper.root - override def configName = wrapper.configName - executable = config("exe", default = "grep", freeVar = false) - override def cmdLine: String = required(executable) + - required("-P", """\""" + strand + """$""") + - required(annotationBed) - } - - private def bedtoolsCovHist = new BiopetCommandLineFunction { - var bam: File = null - override def configName = "bedtoolscoverage" - override val root: Configurable = wrapper.root - executable = config("exe", default = "coverageBed", freeVar = false) - override def cmdLine: String = required(executable) + - required("-split") + - required("-hist") + - required("-abam", bam) + - required("-b", if (mixedStrand) annotationBed else "stdin") - } - - private def hist2Count = new PythonCommandLineFunction { - setPythonScript("hist2count.py", "/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/") - override def configName = wrapper.configName - override val root: Configurable = wrapper.root - def cmdLine = getPythonCommand + optional("-c", "3") - } - - override def beforeGraph(): Unit = { - if (annotationBed == null) Logging.addError("Annotation BED must be supplied") - 
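// Validate the wiring before the job graph is built: an output must be set, and
-    // exactly one strand mode chosen (inputBoth alone, or inputPlus together with inputMinus).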
require(output != null, "Output must be defined") - require((mixedStrand && !distinctStrand) || (!mixedStrand && distinctStrand), - "Invalid input BAM combinations for RawBaseCounter") - } - - def cmdLine: String = - if (mixedStrand && !distinctStrand) { - - val btCov = bedtoolsCovHist - btCov.bam = inputBoth - btCov.commandLine + "|" + hist2Count.commandLine + " > " + output - - } else { - - val plusGrep = grepForStrand - plusGrep.strand = "+" - val plusBtCov = bedtoolsCovHist - plusBtCov.bam = inputPlus - - val minusGrep = grepForStrand - minusGrep.strand = "-" - val minusBtCov = bedtoolsCovHist - minusBtCov.bam = inputMinus - - plusGrep.commandLine + "|" + plusBtCov.commandLine + "|" + hist2Count.commandLine + " > " + required(output) + " && " + - minusGrep.commandLine + "|" + minusBtCov.commandLine + "|" + hist2Count.commandLine + " >> " + required(output) - } -} diff --git a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/AggrBaseCount.scala b/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/AggrBaseCount.scala deleted file mode 100644 index 84941f6beeb080c1ffa2b7681eb8bb9e404d6449..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/AggrBaseCount.scala +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. 
- */ -package nl.lumc.sasc.biopet.pipelines.gentrap.scripts - -import java.io.File - -import nl.lumc.sasc.biopet.utils.config.Configurable -import nl.lumc.sasc.biopet.pipelines.gentrap.extensions.RScriptCommandLineFunction -import org.broadinstitute.gatk.utils.commandline.{ Input, Output } - -/** - * Wrapper for the aggr_base_count.R script, used internally in Gentrap - */ -class AggrBaseCount(val root: Configurable) extends RScriptCommandLineFunction { - - setRScript("aggr_base_count.R", "/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/") - - @Input(doc = "Raw base count files", required = true) - var input: File = null - - @Output(doc = "Output count file", required = false) - var output: File = null - - var inputLabel: String = null - var mode: String = null - - override def beforeGraph(): Unit = { - require(mode == "exon" || mode == "gene", "Mode must be either exon or gene") - require(input != null, "Input raw base count table must be defined") - } - - def cmdLine = { - RScriptCommand + - required("-I", input) + - required("-N", inputLabel) + - optional(if (mode == "gene") "-G" else "-E", output) - } -} diff --git a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/Hist2Count.scala b/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/Hist2Count.scala deleted file mode 100644 index 4a8002c9defcf843bfeb6f69b062d68dc6d3b7b6..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/Hist2Count.scala +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. 
- */ -package nl.lumc.sasc.biopet.pipelines.gentrap.scripts - -import java.io.File - -import nl.lumc.sasc.biopet.utils.config.Configurable -import nl.lumc.sasc.biopet.core.extensions.PythonCommandLineFunction -import org.broadinstitute.gatk.utils.commandline.{ Input, Output } - -/** - * Wrapper for the hist2count.py script, used internally in Gentrap - */ -class Hist2Count(val root: Configurable) extends PythonCommandLineFunction { - - setPythonScript("hist2count.py") - - @Input(doc = "Input histogram file (generated by bedtools coverage -hist)", required = true) - var input: File = null - - @Output(doc = "Output count file", required = false) - var outputGeneLevelCount: File = null - - /** index of column to copy to output file from input file */ - var copyColumn: List[Int] = List.empty[Int] - - def cmdLine = { - getPythonCommand + - required("-i", input) + - required("-o", outputGeneLevelCount) + - optional("-c", repeat(copyColumn), escape = false) - } -} diff --git a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/PdfReportTemplateWriter.scala b/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/PdfReportTemplateWriter.scala deleted file mode 100644 index eeab93c18279cf3e0510e810ef1199dbb27ee58a..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/PdfReportTemplateWriter.scala +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. 
- */ -package nl.lumc.sasc.biopet.pipelines.gentrap.scripts - -import java.io.{ File, FileOutputStream } - -import nl.lumc.sasc.biopet.utils.config.Configurable -import nl.lumc.sasc.biopet.core.extensions.PythonCommandLineFunction -import org.broadinstitute.gatk.utils.commandline.{ Input, Output } - -/** - * Wrapper for the pdf_report.py script, used internally in Gentrap - */ -class PdfReportTemplateWriter(val root: Configurable) extends PythonCommandLineFunction { - - @Input(doc = "Input summary file", required = true) - var summaryFile: File = null - - @Input(doc = "Main report template", required = true) // def since we hard-code the template - def mainTemplateFile: File = new File(templateWorkDir, "main.tex") - - @Input(doc = "Main report logo", required = true) // def since we hard-code the logo - def logoFile: File = new File(templateWorkDir, "gentrap_front.png") - - @Output(doc = "Output file", required = true) - var output: File = null - - val templateWorkDir: File = new File(".queue/tmp/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf") - val templateFiles: Seq[String] = Seq( - "main.tex", "gentrap_front.png", - "sample.tex", "sample_mapping.tex", - "lib.tex", "lib_seqeval.tex", "lib_mapping.tex" - ) - - protected def prepTemplate(name: String, - subPackage: String = "/nl/lumc/sasc/biopet/pipelines/gentrap/templates/pdf"): Unit = { - val target = new File(".queue/tmp" + subPackage, name) - if (!target.getParentFile.exists) target.getParentFile.mkdirs() - val is = getClass.getResourceAsStream(subPackage + "/" + name) - val os = new FileOutputStream(target) - org.apache.commons.io.IOUtils.copy(is, os) - os.close() - - //python_script_name = script - //python_script = new File(".queue/tmp/" + subpackage + python_script_name) - //if (!python_script.getParentFile.exists) python_script.getParentFile.mkdirs - //val is = getClass.getResourceAsStream(subpackage + python_script_name) - //val os = new FileOutputStream(python_script) - //org.apache.commons.io.IOUtils.copy(is, os) - //os.close() - } - - def cmdLine = { - getPythonCommand + - required(summaryFile) + - required(mainTemplateFile) + - required(logoFile.getAbsoluteFile) + - " > " + required(output) - } - - setPythonScript("pdf_report.py") - templateFiles.foreach(t => prepTemplate(t)) -} diff --git a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/PlotPca.scala b/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/PlotPca.scala deleted file mode 100644 index dd13421069fd331c819653a9fcf2cb4cf3b7c851..0000000000000000000000000000000000000000 --- a/public/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/PlotPca.scala +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Biopet is built on top of GATK Queue for building bioinformatic - * pipelines. It is mainly intended to support LUMC SHARK cluster which is running - * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - * should also be able to execute Biopet tools and pipelines. - * - * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - * - * Contact us at: sasc@lumc.nl - * - * A dual licensing mode is applied. The source code within this project that are - * not part of GATK Queue is freely available for non-commercial use under an AGPL - * license; For commercial users or users who do not want to follow the AGPL - * license, please contact us to obtain a separate license. 
- */ -package nl.lumc.sasc.biopet.pipelines.gentrap.scripts - -import java.io.File - -import nl.lumc.sasc.biopet.utils.config.Configurable -import nl.lumc.sasc.biopet.pipelines.gentrap.extensions.RScriptCommandLineFunction -import org.broadinstitute.gatk.utils.commandline.{ Input, Output } - -/** - * Wrapper for the plot_pca.R script, used internally in Gentrap - */ -class PlotPca(val root: Configurable) extends RScriptCommandLineFunction { - - setRScript("plot_pca.R", "/nl/lumc/sasc/biopet/pipelines/gentrap/scripts/") - - @Input(doc = "Input table", required = true) - var input: File = null - - @Output(doc = "Output plot", required = false) - var output: File = null - - var tmmNormalize: Boolean = config("tmm_normalize", default = false) - - def cmdLine = { - RScriptCommand + - conditional(tmmNormalize, "-T") + - required("-I", input) + - required("-O", output) - } -}
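
Usage sketch (illustrative only, not part of the deleted sources): the wrappers above are thin Queue command-line builders that a QScript schedules with add(). A minimal, hypothetical example of wiring the PlotPca wrapper from a Gentrap-style QScript; the output directory and file names below are assumptions made up for illustration:

  // Inside a QScript that is also Configurable (e.g. a Biopet pipeline class)
  import java.io.File
  import nl.lumc.sasc.biopet.pipelines.gentrap.scripts.PlotPca

  val mergedTable = new File(outputDir, "all_samples.fragments_per_gene") // assumed merged count table
  val pca = new PlotPca(this)  // `this` supplies the Configurable config context
  pca.input = mergedTable      // passed to plot_pca.R as -I
  pca.output = new File(outputDir, "all_samples.fragments_per_gene.pca.png") // passed as -O
  pca.tmmNormalize = true      // adds the -T flag handled in cmdLine above
  add(pca)                     // Queue runs the job when the graph executes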