Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
biopet.biopet
Commits
2db37a64
Commit
2db37a64
authored
Jun 27, 2015
by
bow
Browse files
Add step for merging unmapped and mapped tophat alignment files
parent
a9beb867
Changes
3
Hide whitespace changes
Inline
Side-by-side
public/mapping/src/main/resources/nl/lumc/sasc/biopet/pipelines/mapping/scripts/tophat-recondition.py
0 → 100755
View file @
2db37a64
#!/usr/bin/env python
# -*- coding: utf-8 -*
#
# Copyright (c) 2013-2015 Christian Brueffer (ORCID: 0000-0002-3826-0989)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
"""
This script modifies the reads in a TopHat unmapped.bam file to make them
compatible with downstream tools (e.g., Picard, samtools or GATK).
Homepage: https://github.com/cbrueffer/tophat-recondition
"""
from
__future__
import
print_function
import
errno
import
os
import
sys
try
:
import
pysam
except
ImportError
:
print
(
'Cannot import the pysam package; please make sure it is installed.
\n
'
)
sys
.
exit
(
1
)
VERSION
=
"0.3"
def
get_index_pos
(
index
,
read
):
"""Returns the position of a read in the index or None."""
if
read
.
qname
in
index
:
return
index
[
read
.
qname
]
else
:
return
None
def
fix_unmapped_reads
(
path
,
outdir
,
mapped_file
=
"accepted_hits.bam"
,
unmapped_file
=
"unmapped.bam"
,
cmdline
=
""
):
# Fix things that relate to all unmapped reads.
unmapped_dict
=
{}
unmapped_index
=
{}
with
pysam
.
Samfile
(
os
.
path
.
join
(
path
,
unmapped_file
))
as
bam_unmapped
:
unmapped_reads
=
list
(
bam_unmapped
.
fetch
(
until_eof
=
True
))
unmapped_header
=
bam_unmapped
.
header
for
i
in
range
(
len
(
unmapped_reads
)):
read
=
unmapped_reads
[
i
]
# remove /1 and /2 suffixes
if
read
.
qname
.
find
(
"/"
)
!=
-
1
:
read
.
qname
=
read
.
qname
[:
-
2
]
unmapped_index
[
read
.
qname
]
=
i
# work around "mate is unmapped" bug in TopHat
if
read
.
qname
in
unmapped_dict
:
unmapped_reads
[
unmapped_dict
[
read
.
qname
]].
mate_is_unmapped
=
True
read
.
mate_is_unmapped
=
True
else
:
unmapped_dict
[
read
.
qname
]
=
i
read
.
mapq
=
0
unmapped_reads
[
i
]
=
read
# Fix things that relate only to unmapped reads with a mapped mate.
with
pysam
.
Samfile
(
os
.
path
.
join
(
path
,
mapped_file
))
as
bam_mapped
:
for
mapped
in
bam_mapped
:
if
mapped
.
mate_is_unmapped
:
i
=
get_index_pos
(
unmapped_index
,
mapped
)
if
i
is
not
None
:
unmapped
=
unmapped_reads
[
i
]
# map chromosome TIDs from mapped to unmapped file
mapped_rname
=
bam_mapped
.
getrname
(
mapped
.
tid
)
unmapped_new_tid
=
bam_unmapped
.
gettid
(
mapped_rname
)
unmapped
.
tid
=
unmapped_new_tid
unmapped
.
rnext
=
unmapped_new_tid
unmapped
.
pos
=
mapped
.
pos
unmapped
.
pnext
=
0
unmapped_reads
[
i
]
=
unmapped
# for the output file, take the headers from the unmapped file
base
,
_
=
os
.
path
.
splitext
(
unmapped_file
)
out_filename
=
""
.
join
([
base
,
"_fixup.sam"
])
# since BAM bin values may be messed up after processing
fixup_header
=
unmapped_header
fixup_header
[
'PG'
].
append
({
'ID'
:
'TopHat-Recondition'
,
'VN'
:
VERSION
,
'CL'
:
cmdline
})
with
pysam
.
Samfile
(
os
.
path
.
join
(
outdir
,
out_filename
),
"wh"
,
header
=
fixup_header
)
as
bam_out
:
for
read
in
unmapped_reads
:
bam_out
.
write
(
read
)
def
usage
(
scriptname
,
errcode
=
errno
.
EINVAL
):
print
(
"Usage:
\n
"
)
print
(
scriptname
,
"[-hv] tophat_output_dir [result_dir]
\n
"
)
print
(
"-h print this usage text and exit"
)
print
(
"-v print the script name and version, and exit"
)
print
(
"tophat_output_dir: directory containing accepted_hits.bam and unmapped.bam"
)
print
(
"result_dir: directory to write unmapped_fixup.bam to (default: tophat_output_dir)"
)
sys
.
exit
(
errcode
)
if
__name__
==
"__main__"
:
import
getopt
scriptname
=
os
.
path
.
basename
(
sys
.
argv
[
0
])
cmdline
=
" "
.
join
(
sys
.
argv
)
try
:
opts
,
args
=
getopt
.
gnu_getopt
(
sys
.
argv
[
1
:],
"dhv"
)
except
getopt
.
GetoptError
as
err
:
# print help information and exit
print
(
str
(
err
),
file
=
sys
.
stderr
)
usage
(
scriptname
,
errcode
=
errno
.
EINVAL
)
debug
=
False
for
o
,
a
in
opts
:
if
o
in
"-d"
:
debug
=
True
elif
o
in
"-h"
:
usage
(
scriptname
,
errcode
=
0
)
elif
o
in
"-v"
:
print
(
scriptname
,
VERSION
)
sys
.
exit
(
0
)
else
:
assert
False
,
"unhandled option"
sys
.
exit
(
errno
.
EINVAL
)
if
len
(
args
)
==
0
or
len
(
args
)
>
2
:
usage
(
scriptname
,
errcode
=
errno
.
EINVAL
)
bamdir
=
args
.
pop
(
0
)
if
not
os
.
path
.
isdir
(
bamdir
):
print
(
"Specified tophat_output_dir does not exist or is not a directory: %s"
%
bamdir
,
file
=
sys
.
stderr
)
sys
.
exit
(
errno
.
EINVAL
)
if
len
(
args
)
>
0
:
resultdir
=
args
.
pop
(
0
)
if
not
os
.
path
.
isdir
(
resultdir
):
print
(
"Specified result_dir does not exist or is not a directory: %s"
%
resultdir
,
file
=
sys
.
stderr
)
sys
.
exit
(
errno
.
EINVAL
)
else
:
resultdir
=
bamdir
try
:
fix_unmapped_reads
(
bamdir
,
resultdir
,
cmdline
=
cmdline
)
except
KeyboardInterrupt
:
print
(
"Program interrupted by user, exiting."
)
sys
.
exit
(
errno
.
EINTR
)
except
Exception
as
e
:
if
debug
:
import
traceback
print
(
traceback
.
format_exc
(),
file
=
sys
.
stderr
)
print
(
"Error: %s"
%
str
(
e
),
file
=
sys
.
stderr
)
if
hasattr
(
e
,
"errno"
):
sys
.
exit
(
e
.
errno
)
else
:
sys
.
exit
(
1
)
public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala
View file @
2db37a64
...
...
@@ -17,6 +17,8 @@ package nl.lumc.sasc.biopet.pipelines.mapping
import
java.util.Date
import
java.io.File
import
nl.lumc.sasc.biopet.pipelines.mapping.scripts.TophatRecondition
import
scala.math._
import
org.broadinstitute.gatk.queue.QScript
...
...
@@ -362,7 +364,34 @@ class Mapping(val root: Configurable) extends QScript with SummaryQScript with S
tophat
.
keep_fasta_order
=
true
add
(
tophat
)
addAddOrReplaceReadGroups
(
tophat
.
outputAcceptedHits
,
output
)
// fix unmapped file coordinates
val
fixedUnmapped
=
new
File
(
tophat
.
output_dir
,
"unmapped_fixup.sam"
)
val
fixer
=
new
TophatRecondition
(
this
)
fixer
.
inputBam
=
tophat
.
outputAcceptedHits
fixer
.
outputSam
=
fixedUnmapped
fixer
.
isIntermediate
=
true
add
(
fixer
)
// sort fixed SAM file
val
sorter
=
SortSam
(
this
,
fixer
.
outputSam
,
swapExt
(
fixer
.
outputSam
,
".sam"
,
".sorted.bam"
))
sorter
.
sortOrder
=
"coordinate"
sorter
.
isIntermediate
=
true
add
(
sorter
)
// merge with mapped file
val
mergeSamFile
=
MergeSamFiles
(
this
,
List
(
tophat
.
outputAcceptedHits
,
sorter
.
output
),
tophat
.
output_dir
,
"coordinate"
)
mergeSamFile
.
createIndex
=
true
mergeSamFile
.
isIntermediate
=
true
add
(
mergeSamFile
)
// make sure header coordinates are correct
val
reorderSam
=
new
ReorderSam
(
this
)
reorderSam
.
input
=
mergeSamFile
.
output
reorderSam
.
output
=
swapExt
(
output
.
getParent
,
output
,
".merge.bam"
,
".reordered.bam"
)
add
(
reorderSam
)
addAddOrReplaceReadGroups
(
reorderSam
.
output
,
output
)
}
/**
* Adds stampy jobs
...
...
public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/scripts/TophatRecondition.scala
0 → 100644
View file @
2db37a64
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package
nl.lumc.sasc.biopet.pipelines.mapping.scripts
import
java.io.File
import
nl.lumc.sasc.biopet.core.config.Configurable
import
nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
import
org.broadinstitute.gatk.utils.commandline.
{
Input
,
Output
}
/**
* Wrapper for the tophat-recondition.py script.
*
* NOTE: we are modifying the input and output to be the BAM files directly so the wrapper works nice with Queue.
*/
class
TophatRecondition
(
val
root
:
Configurable
)
extends
PythonCommandLineFunction
{
setPythonScript
(
"tophat-recondition.py"
)
@Input
(
doc
=
"Path to input accepted_hits.bam"
,
required
=
true
)
var
inputBam
:
File
=
null
@Output
(
doc
=
"Path to output unmapped_fixup.bam"
,
required
=
false
)
var
outputSam
:
File
=
null
private
def
inputDir
:
File
=
inputBam
.
getAbsoluteFile
.
getParentFile
private
def
outputDir
:
File
=
outputSam
.
getAbsoluteFile
.
getParentFile
override
def
beforeGraph
:
Unit
=
{
require
(
inputBam
!=
null
,
"Input must be defined."
)
require
(
outputSam
!=
null
,
"Output must be defined."
)
}
def
cmdLine
=
getPythonCommand
+
required
(
inputDir
)
+
required
(
outputDir
)
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment