From 2db37a64b8229b8fc6cf0e0224c22d2805d7f7a4 Mon Sep 17 00:00:00 2001 From: bow <bow@bow.web.id> Date: Sat, 27 Jun 2015 17:32:47 +0200 Subject: [PATCH] Add step for merging unmapped and mapped tophat alignment files --- .../mapping/scripts/tophat-recondition.py | 185 ++++++++++++++++++ .../biopet/pipelines/mapping/Mapping.scala | 31 ++- .../mapping/scripts/TophatRecondition.scala | 50 +++++ 3 files changed, 265 insertions(+), 1 deletion(-) create mode 100755 public/mapping/src/main/resources/nl/lumc/sasc/biopet/pipelines/mapping/scripts/tophat-recondition.py create mode 100644 public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/scripts/TophatRecondition.scala diff --git a/public/mapping/src/main/resources/nl/lumc/sasc/biopet/pipelines/mapping/scripts/tophat-recondition.py b/public/mapping/src/main/resources/nl/lumc/sasc/biopet/pipelines/mapping/scripts/tophat-recondition.py new file mode 100755 index 000000000..2f44e54c3 --- /dev/null +++ b/public/mapping/src/main/resources/nl/lumc/sasc/biopet/pipelines/mapping/scripts/tophat-recondition.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -* +# +# Copyright (c) 2013-2015 Christian Brueffer (ORCID: 0000-0002-3826-0989) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +""" +This script modifies the reads in a TopHat unmapped.bam file to make them +compatible with downstream tools (e.g., Picard, samtools or GATK). + +Homepage: https://github.com/cbrueffer/tophat-recondition +""" + +from __future__ import print_function + +import errno +import os +import sys +try: + import pysam +except ImportError: + print('Cannot import the pysam package; please make sure it is installed.\n') + sys.exit(1) + +VERSION = "0.3" + + +def get_index_pos(index, read): + """Returns the position of a read in the index or None.""" + + if read.qname in index: + return index[read.qname] + else: + return None + + +def fix_unmapped_reads(path, outdir, mapped_file="accepted_hits.bam", + unmapped_file="unmapped.bam", cmdline=""): + # Fix things that relate to all unmapped reads. + unmapped_dict = {} + unmapped_index = {} + with pysam.Samfile(os.path.join(path, unmapped_file)) as bam_unmapped: + unmapped_reads = list(bam_unmapped.fetch(until_eof=True)) + unmapped_header = bam_unmapped.header + for i in range(len(unmapped_reads)): + read = unmapped_reads[i] + + # remove /1 and /2 suffixes + if read.qname.find("/") != -1: + read.qname = read.qname[:-2] + + unmapped_index[read.qname] = i + + # work around "mate is unmapped" bug in TopHat + if read.qname in unmapped_dict: + unmapped_reads[unmapped_dict[read.qname]].mate_is_unmapped = True + read.mate_is_unmapped = True + else: + unmapped_dict[read.qname] = i + + read.mapq = 0 + + unmapped_reads[i] = read + + # Fix things that relate only to unmapped reads with a mapped mate. + with pysam.Samfile(os.path.join(path, mapped_file)) as bam_mapped: + for mapped in bam_mapped: + if mapped.mate_is_unmapped: + i = get_index_pos(unmapped_index, mapped) + if i is not None: + unmapped = unmapped_reads[i] + + # map chromosome TIDs from mapped to unmapped file + mapped_rname = bam_mapped.getrname(mapped.tid) + unmapped_new_tid = bam_unmapped.gettid(mapped_rname) + + unmapped.tid = unmapped_new_tid + unmapped.rnext = unmapped_new_tid + unmapped.pos = mapped.pos + unmapped.pnext = 0 + + unmapped_reads[i] = unmapped + + # for the output file, take the headers from the unmapped file + base, _ = os.path.splitext(unmapped_file) + out_filename = "".join([base, "_fixup.sam"]) # since BAM bin values may be messed up after processing + + fixup_header = unmapped_header + fixup_header['PG'].append({'ID': 'TopHat-Recondition', + 'VN': VERSION, + 'CL': cmdline}) + with pysam.Samfile(os.path.join(outdir, out_filename), "wh", + header=fixup_header) as bam_out: + for read in unmapped_reads: + bam_out.write(read) + + +def usage(scriptname, errcode=errno.EINVAL): + print("Usage:\n") + print(scriptname, "[-hv] tophat_output_dir [result_dir]\n") + print("-h print this usage text and exit") + print("-v print the script name and version, and exit") + print("tophat_output_dir: directory containing accepted_hits.bam and unmapped.bam") + print("result_dir: directory to write unmapped_fixup.bam to (default: tophat_output_dir)") + sys.exit(errcode) + + +if __name__ == "__main__": + import getopt + + scriptname = os.path.basename(sys.argv[0]) + cmdline = " ".join(sys.argv) + + try: + opts, args = getopt.gnu_getopt(sys.argv[1:], "dhv") + except getopt.GetoptError as err: + # print help information and exit + print(str(err), file=sys.stderr) + usage(scriptname, errcode=errno.EINVAL) + + debug = False + for o, a in opts: + if o in "-d": + debug = True + elif o in "-h": + usage(scriptname, errcode=0) + elif o in "-v": + print(scriptname, VERSION) + sys.exit(0) + else: + assert False, "unhandled option" + sys.exit(errno.EINVAL) + + if len(args) == 0 or len(args) > 2: + usage(scriptname, errcode=errno.EINVAL) + + bamdir = args.pop(0) + if not os.path.isdir(bamdir): + print("Specified tophat_output_dir does not exist or is not a directory: %s" % bamdir, file=sys.stderr) + sys.exit(errno.EINVAL) + + if len(args) > 0: + resultdir = args.pop(0) + if not os.path.isdir(resultdir): + print("Specified result_dir does not exist or is not a directory: %s" % resultdir, file=sys.stderr) + sys.exit(errno.EINVAL) + else: + resultdir = bamdir + + try: + fix_unmapped_reads(bamdir, resultdir, cmdline=cmdline) + except KeyboardInterrupt: + print("Program interrupted by user, exiting.") + sys.exit(errno.EINTR) + except Exception as e: + if debug: + import traceback + print(traceback.format_exc(), file=sys.stderr) + + print("Error: %s" % str(e), file=sys.stderr) + if hasattr(e, "errno"): + sys.exit(e.errno) + else: + sys.exit(1) diff --git a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala index b427348dc..dd8cc009b 100644 --- a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala +++ b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala @@ -17,6 +17,8 @@ package nl.lumc.sasc.biopet.pipelines.mapping import java.util.Date import java.io.File +import nl.lumc.sasc.biopet.pipelines.mapping.scripts.TophatRecondition + import scala.math._ import org.broadinstitute.gatk.queue.QScript @@ -362,7 +364,34 @@ class Mapping(val root: Configurable) extends QScript with SummaryQScript with S tophat.keep_fasta_order = true add(tophat) - addAddOrReplaceReadGroups(tophat.outputAcceptedHits, output) + // fix unmapped file coordinates + val fixedUnmapped = new File(tophat.output_dir, "unmapped_fixup.sam") + val fixer = new TophatRecondition(this) + fixer.inputBam = tophat.outputAcceptedHits + fixer.outputSam = fixedUnmapped + fixer.isIntermediate = true + add(fixer) + + // sort fixed SAM file + val sorter = SortSam(this, fixer.outputSam, swapExt(fixer.outputSam, ".sam", ".sorted.bam")) + sorter.sortOrder = "coordinate" + sorter.isIntermediate = true + add(sorter) + + // merge with mapped file + val mergeSamFile = MergeSamFiles(this, List(tophat.outputAcceptedHits, sorter.output), + tophat.output_dir, "coordinate") + mergeSamFile.createIndex = true + mergeSamFile.isIntermediate = true + add(mergeSamFile) + + // make sure header coordinates are correct + val reorderSam = new ReorderSam(this) + reorderSam.input = mergeSamFile.output + reorderSam.output = swapExt(output.getParent, output, ".merge.bam", ".reordered.bam") + add(reorderSam) + + addAddOrReplaceReadGroups(reorderSam.output, output) } /** * Adds stampy jobs diff --git a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/scripts/TophatRecondition.scala b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/scripts/TophatRecondition.scala new file mode 100644 index 000000000..8e71ea232 --- /dev/null +++ b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/scripts/TophatRecondition.scala @@ -0,0 +1,50 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.pipelines.mapping.scripts + +import java.io.File + +import nl.lumc.sasc.biopet.core.config.Configurable +import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction + +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +/** + * Wrapper for the tophat-recondition.py script. + * + * NOTE: we are modifying the input and output to be the BAM files directly so the wrapper works nice with Queue. + */ +class TophatRecondition(val root: Configurable) extends PythonCommandLineFunction { + + setPythonScript("tophat-recondition.py") + + @Input(doc = "Path to input accepted_hits.bam", required = true) + var inputBam: File = null + + @Output(doc = "Path to output unmapped_fixup.bam", required = false) + var outputSam: File = null + + private def inputDir: File = inputBam.getAbsoluteFile.getParentFile + + private def outputDir: File = outputSam.getAbsoluteFile.getParentFile + + override def beforeGraph: Unit = { + require(inputBam != null, "Input must be defined.") + require(outputSam != null, "Output must be defined.") + } + + def cmdLine = getPythonCommand + required(inputDir) + required(outputDir) +} -- GitLab