From 2db37a64b8229b8fc6cf0e0224c22d2805d7f7a4 Mon Sep 17 00:00:00 2001
From: bow <bow@bow.web.id>
Date: Sat, 27 Jun 2015 17:32:47 +0200
Subject: [PATCH] Add step for merging unmapped and mapped tophat alignment
 files

---
 .../mapping/scripts/tophat-recondition.py     | 185 ++++++++++++++++++
 .../biopet/pipelines/mapping/Mapping.scala    |  31 ++-
 .../mapping/scripts/TophatRecondition.scala   |  50 +++++
 3 files changed, 265 insertions(+), 1 deletion(-)
 create mode 100755 public/mapping/src/main/resources/nl/lumc/sasc/biopet/pipelines/mapping/scripts/tophat-recondition.py
 create mode 100644 public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/scripts/TophatRecondition.scala

diff --git a/public/mapping/src/main/resources/nl/lumc/sasc/biopet/pipelines/mapping/scripts/tophat-recondition.py b/public/mapping/src/main/resources/nl/lumc/sasc/biopet/pipelines/mapping/scripts/tophat-recondition.py
new file mode 100755
index 000000000..2f44e54c3
--- /dev/null
+++ b/public/mapping/src/main/resources/nl/lumc/sasc/biopet/pipelines/mapping/scripts/tophat-recondition.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*
+#
+# Copyright (c) 2013-2015 Christian Brueffer (ORCID: 0000-0002-3826-0989)
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+"""
+This script modifies the reads in a TopHat unmapped.bam file to make them
+compatible with downstream tools (e.g., Picard, samtools or GATK).
+
+Homepage: https://github.com/cbrueffer/tophat-recondition
+"""
+
+from __future__ import print_function
+
+import errno
+import os
+import sys
+try:
+    import pysam
+except ImportError:
+    print('Cannot import the pysam package; please make sure it is installed.\n')
+    sys.exit(1)
+
+VERSION = "0.3"
+
+
+def get_index_pos(index, read):
+    """Returns the position of a read in the index or None."""
+
+    if read.qname in index:
+        return index[read.qname]
+    else:
+        return None
+
+
+def fix_unmapped_reads(path, outdir, mapped_file="accepted_hits.bam",
+                       unmapped_file="unmapped.bam", cmdline=""):
+    # Fix things that relate to all unmapped reads.
+    unmapped_dict = {}
+    unmapped_index = {}
+    with pysam.Samfile(os.path.join(path, unmapped_file)) as bam_unmapped:
+        unmapped_reads = list(bam_unmapped.fetch(until_eof=True))
+        unmapped_header = bam_unmapped.header
+        for i in range(len(unmapped_reads)):
+            read = unmapped_reads[i]
+
+            # remove /1 and /2 suffixes
+            if read.qname.find("/") != -1:
+                read.qname = read.qname[:-2]
+
+            unmapped_index[read.qname] = i
+
+            # work around "mate is unmapped" bug in TopHat
+            if read.qname in unmapped_dict:
+                unmapped_reads[unmapped_dict[read.qname]].mate_is_unmapped = True
+                read.mate_is_unmapped = True
+            else:
+                unmapped_dict[read.qname] = i
+
+            read.mapq = 0
+
+            unmapped_reads[i] = read
+
+        # Fix things that relate only to unmapped reads with a mapped mate.
+        with pysam.Samfile(os.path.join(path, mapped_file)) as bam_mapped:
+            for mapped in bam_mapped:
+                if mapped.mate_is_unmapped:
+                    i = get_index_pos(unmapped_index, mapped)
+                    if i is not None:
+                        unmapped = unmapped_reads[i]
+
+                        # map chromosome TIDs from mapped to unmapped file
+                        mapped_rname = bam_mapped.getrname(mapped.tid)
+                        unmapped_new_tid = bam_unmapped.gettid(mapped_rname)
+
+                        unmapped.tid = unmapped_new_tid
+                        unmapped.rnext = unmapped_new_tid
+                        unmapped.pos = mapped.pos
+                        unmapped.pnext = 0
+
+                        unmapped_reads[i] = unmapped
+
+        # for the output file, take the headers from the unmapped file
+        base, _ = os.path.splitext(unmapped_file)
+        out_filename = "".join([base, "_fixup.sam"]) # since BAM bin values may be messed up after processing
+
+    fixup_header = unmapped_header
+    fixup_header['PG'].append({'ID': 'TopHat-Recondition',
+                               'VN': VERSION,
+                               'CL': cmdline})
+    with pysam.Samfile(os.path.join(outdir, out_filename), "wh",
+                       header=fixup_header) as bam_out:
+        for read in unmapped_reads:
+            bam_out.write(read)
+
+
+def usage(scriptname, errcode=errno.EINVAL):
+    print("Usage:\n")
+    print(scriptname, "[-hv] tophat_output_dir [result_dir]\n")
+    print("-h                 print this usage text and exit")
+    print("-v                 print the script name and version, and exit")
+    print("tophat_output_dir: directory containing accepted_hits.bam and unmapped.bam")
+    print("result_dir:        directory to write unmapped_fixup.bam to (default: tophat_output_dir)")
+    sys.exit(errcode)
+
+
+if __name__ == "__main__":
+    import getopt
+
+    scriptname = os.path.basename(sys.argv[0])
+    cmdline = " ".join(sys.argv)
+
+    try:
+        opts, args = getopt.gnu_getopt(sys.argv[1:], "dhv")
+    except getopt.GetoptError as err:
+        # print help information and exit
+        print(str(err), file=sys.stderr)
+        usage(scriptname, errcode=errno.EINVAL)
+
+    debug = False
+    for o, a in opts:
+        if o in "-d":
+            debug = True
+        elif o in "-h":
+            usage(scriptname, errcode=0)
+        elif o in "-v":
+            print(scriptname, VERSION)
+            sys.exit(0)
+        else:
+            assert False, "unhandled option"
+            sys.exit(errno.EINVAL)
+
+    if len(args) == 0 or len(args) > 2:
+        usage(scriptname, errcode=errno.EINVAL)
+
+    bamdir = args.pop(0)
+    if not os.path.isdir(bamdir):
+        print("Specified tophat_output_dir does not exist or is not a directory: %s" % bamdir, file=sys.stderr)
+        sys.exit(errno.EINVAL)
+
+    if len(args) > 0:
+        resultdir = args.pop(0)
+        if not os.path.isdir(resultdir):
+            print("Specified result_dir does not exist or is not a directory: %s" % resultdir, file=sys.stderr)
+            sys.exit(errno.EINVAL)
+    else:
+        resultdir = bamdir
+
+    try:
+        fix_unmapped_reads(bamdir, resultdir, cmdline=cmdline)
+    except KeyboardInterrupt:
+        print("Program interrupted by user, exiting.")
+        sys.exit(errno.EINTR)
+    except Exception as e:
+        if debug:
+            import traceback
+            print(traceback.format_exc(), file=sys.stderr)
+
+        print("Error: %s" % str(e), file=sys.stderr)
+        if hasattr(e, "errno"):
+            sys.exit(e.errno)
+        else:
+            sys.exit(1)
diff --git a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala
index b427348dc..dd8cc009b 100644
--- a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala
+++ b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala
@@ -17,6 +17,8 @@ package nl.lumc.sasc.biopet.pipelines.mapping
 
 import java.util.Date
 import java.io.File
+import nl.lumc.sasc.biopet.pipelines.mapping.scripts.TophatRecondition
+
 import scala.math._
 
 import org.broadinstitute.gatk.queue.QScript
@@ -362,7 +364,34 @@ class Mapping(val root: Configurable) extends QScript with SummaryQScript with S
     tophat.keep_fasta_order = true
     add(tophat)
 
-    addAddOrReplaceReadGroups(tophat.outputAcceptedHits, output)
+    // fix unmapped file coordinates
+    val fixedUnmapped = new File(tophat.output_dir, "unmapped_fixup.sam")
+    val fixer = new TophatRecondition(this)
+    fixer.inputBam = tophat.outputAcceptedHits
+    fixer.outputSam = fixedUnmapped
+    fixer.isIntermediate = true
+    add(fixer)
+
+    // sort fixed SAM file
+    val sorter = SortSam(this, fixer.outputSam, swapExt(fixer.outputSam, ".sam", ".sorted.bam"))
+    sorter.sortOrder = "coordinate"
+    sorter.isIntermediate = true
+    add(sorter)
+    
+    // merge with mapped file
+    val mergeSamFile = MergeSamFiles(this, List(tophat.outputAcceptedHits, sorter.output),
+      tophat.output_dir, "coordinate")
+    mergeSamFile.createIndex = true
+    mergeSamFile.isIntermediate = true
+    add(mergeSamFile)
+
+    // make sure header coordinates are correct
+    val reorderSam = new ReorderSam(this)
+    reorderSam.input = mergeSamFile.output
+    reorderSam.output = swapExt(output.getParent, output, ".merge.bam", ".reordered.bam")
+    add(reorderSam)
+
+    addAddOrReplaceReadGroups(reorderSam.output, output)
   }
   /**
    * Adds stampy jobs
diff --git a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/scripts/TophatRecondition.scala b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/scripts/TophatRecondition.scala
new file mode 100644
index 000000000..8e71ea232
--- /dev/null
+++ b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/scripts/TophatRecondition.scala
@@ -0,0 +1,50 @@
+/**
+ * Biopet is built on top of GATK Queue for building bioinformatic
+ * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
+ * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
+ * should also be able to execute Biopet tools and pipelines.
+ *
+ * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
+ *
+ * Contact us at: sasc@lumc.nl
+ *
+ * A dual licensing mode is applied. The source code within this project that are
+ * not part of GATK Queue is freely available for non-commercial use under an AGPL
+ * license; For commercial users or users who do not want to follow the AGPL
+ * license, please contact us to obtain a separate license.
+ */
+package nl.lumc.sasc.biopet.pipelines.mapping.scripts
+
+import java.io.File
+
+import nl.lumc.sasc.biopet.core.config.Configurable
+import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
+
+import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
+
+/**
+ * Wrapper for the tophat-recondition.py script.
+ * 
+ * NOTE: we are modifying the input and output to be the BAM files directly so the wrapper works nice with Queue.
+ */
+class TophatRecondition(val root: Configurable) extends PythonCommandLineFunction {
+
+  setPythonScript("tophat-recondition.py")
+
+  @Input(doc = "Path to input accepted_hits.bam", required = true)
+  var inputBam: File = null
+
+  @Output(doc = "Path to output unmapped_fixup.bam", required = false)
+  var outputSam: File = null
+
+  private def inputDir: File = inputBam.getAbsoluteFile.getParentFile
+
+  private def outputDir: File = outputSam.getAbsoluteFile.getParentFile
+
+  override def beforeGraph: Unit = {
+    require(inputBam != null, "Input must be defined.")
+    require(outputSam != null, "Output must be defined.")
+  }
+  
+  def cmdLine = getPythonCommand + required(inputDir) + required(outputDir)
+}
-- 
GitLab