Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Klinische Genetica
capture-lumc
hutspot
Commits
cb516e24
Commit
cb516e24
authored
Nov 23, 2017
by
Sander Bollen
Browse files
covstats
parent
903a86f2
Changes
2
Hide whitespace changes
Inline
Side-by-side
Snakefile
View file @
cb516e24
import json
from os.path import join
from os.path import join
, basename
from os import mkdir
from pyfaidx import Fasta
...
...
@@ -12,8 +12,8 @@ DBSNP = config.get("DBSNP")
ONETHOUSAND = config.get("ONETHOUSAND")
HAPMAP = config.get("HAPMAP")
QUEUE = config.get("QUEUE")
BED = config.get("BED")
REFFLAT = config.get("REFFLAT")
BED = config.get("BED")
# comma-separated list of BED files
REFFLAT = config.get("REFFLAT")
# comma-separated list of refFlat files
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
FASTQ_COUNT = config.get("FASTQ_COUNT")
...
...
@@ -24,11 +24,18 @@ env_dir = join(_this_dir, "envs")
main_env = join(_this_dir, "environment.yml")
settings_template = join(join(_this_dir, "templates"), "pipeline_settings.md.j2")
covpy = join(join(_this_dir, "src"), "covstats.py")
with open(config.get("SAMPLE_CONFIG")) as handle:
SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()
BEDS = BED.split(",")
REFFLATS = REFFLAT.split(",")
BASE_BEDS = [basename(x) for x in BEDS]
BASE_REFFLATS = [basename(x) for x in BEDS]
def split_genome(ref, approx_n_chunks=100):
fa = Fasta(ref)
...
...
@@ -76,6 +83,14 @@ def get_r2(wildcards):
return r2
def get_bedpath(wildcards):
return [x for x in BEDS if basename(x) == wildcards.bed][0]
def get_refflatpath(wildcards):
return [x for x in REFFLATS if basename(x) == wildcards.refflat][0]
def sample_gender(wildcards):
sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
return sam.get("gender", "null")
...
...
@@ -104,7 +119,10 @@ def metrics(do_metrics=True):
fnumpos = expand(out_path("{sample}/pre_process/{sample}.postqc_count.json"),
sample=SAMPLES)
return mnum + mbnum + unum + ubnum + fqcr + fqcm + fqcp + fnumpre + fnumpos
covs = expand(out_path("{sample}/coverage/{bed}.covstats.json"),
sample=SAMPLES, bed=BASE_BEDS)
return mnum + mbnum + unum + ubnum + fqcr + fqcm + fqcp + fnumpre + fnumpos + covs
rule all:
...
...
@@ -376,4 +394,22 @@ rule fqcount_postqc:
fastqcount=FASTQ_COUNT
output:
out_path("{sample}/pre_process/{sample}.postqc_count.json")
shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"
\ No newline at end of file
shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"
## coverages
rule covstats:
input:
bam=out_path("{sample}/bams/{sample}.markdup.bam"),
genome=out_path("current.genome"),
covpy=covpy,
bed=get_bedpath
params:
subt="Sample {sample}"
output:
covj=out_path("{sample}/coverage/{bed}.covstats.json"),
covp=out_path("{sample}/cpverage/{bed}.covstats.png")
shell: "bedtools coverage -sorted -a {input.bed} -b {input.bam} " \
"-d | python {input.covpy} - --plot {output.covp} " \
"--title 'Targets coverage' --subtitle '{params.subt}' > {output.covj}"
src/covstats.py
0 → 100644
View file @
cb516e24
#!/usr/bin/env python2.7
"""
Plot output of coverageBed / "bedtools coverage" when run with the -d flag.
This script plots a bar graph showing how many times a base covered X times
are found. An additional box plot is also plotted to show the general trend
of the coverage.
By default, the bar graph only shows the data up to the 98th percentile.
Counts for bases that are not covered (0x coverage) are shown in lighter
shade. The median, quartile, and whiskers values shown in the boxplot are
still computed from the complete data set (0x coverage included).
Requirements:
* Python == 2.7.x
* Matplotlib >= 1.3.0
* Numpy >= 1.8.0
Copyright (c) 2013 Wibowo Arindrarto <w.arindrarto@lumc.nl>
Copyright (c) 2013 LUMC Sequencing Analysis Support Core <sasc@lumc.nl>
MIT License <http://opensource.org/licenses/MIT>
"""
RELEASE
=
False
__version_info__
=
(
'0'
,
'1'
,)
__version__
=
'.'
.
join
(
__version_info__
)
__version__
+=
'-dev'
if
not
RELEASE
else
''
import
argparse
import
collections
import
itertools
import
json
import
locale
import
os
import
sys
import
matplotlib
matplotlib
.
use
(
'Agg'
)
import
matplotlib.gridspec
as
gs
import
matplotlib.pyplot
as
plt
import
matplotlib.ticker
as
tkr
import
numpy
as
np
BLUE
=
'#2166AC'
RED
=
'#1A9850'
GREEN
=
'#D6604D'
locale
.
setlocale
(
locale
.
LC_ALL
,
''
)
group_digits
=
lambda
x
,
pos
:
locale
.
format
(
'%d'
,
x
,
grouping
=
True
)
major_formatter
=
tkr
.
FuncFormatter
(
group_digits
)
def
cachedproperty
(
func
):
"""Decorator for cached property loading."""
attr_name
=
func
.
__name__
@
property
def
cached
(
self
):
if
not
hasattr
(
self
,
'_cache'
):
setattr
(
self
,
'_cache'
,
{})
try
:
return
self
.
_cache
[
attr_name
]
except
KeyError
:
result
=
self
.
_cache
[
attr_name
]
=
func
(
self
)
return
result
return
cached
class
Coverage
(
object
):
"""Class representing coverage metrics from a coverageBed -d output."""
def
__init__
(
self
,
cvg_list
,
name
=
'?'
):
"""
:param cvg_list: iterator yielding coverage per position
:type cvg_list: iterable
"""
assert
cvg_list
counter
=
collections
.
Counter
()
total_bases
,
nonzero_bases
=
0
,
0
for
cvg
in
cvg_list
:
counter
[
cvg
]
+=
1
total_bases
+=
1
if
cvg
>
0
:
nonzero_bases
+=
1
self
.
_counter
=
counter
self
.
total_bases
=
total_bases
self
.
nonzero_bases
=
nonzero_bases
def
__iter__
(
self
):
return
self
.
_counter
.
iteritems
()
def
__repr__
(
self
):
return
"{0}(...)"
.
format
(
self
.
__class__
.
__name__
)
@
cachedproperty
def
total
(
self
):
"""Total coverage."""
return
sum
([
cvg
*
count
for
cvg
,
count
in
self
])
@
cachedproperty
def
horizontal
(
self
):
"""Horizontal coverage (0 - 1)."""
return
float
(
self
.
nonzero_bases
)
/
self
.
total_bases
@
cachedproperty
def
mean
(
self
):
"""Average coverage."""
return
float
(
self
.
total
)
/
self
.
total_bases
@
cachedproperty
def
max
(
self
):
"""Maximum coverage."""
return
max
(
self
.
_counter
.
keys
())
@
cachedproperty
def
median
(
self
):
"""Median coverage."""
return
np
.
percentile
(
self
.
cov_counts
,
50
)
@
cachedproperty
def
mode
(
self
):
"""Modal coverage."""
count
=
collections
.
Counter
(
self
.
cov_counts
)
# return self.cov_counts[self.cov_counts.index(self.max)]
return
count
.
most_common
(
1
)[
0
][
0
]
@
cachedproperty
def
cov_counts
(
self
):
"""List of coverage for each base position (unsorted)."""
# get list of each base coverage
cov_counts
=
([
key
]
*
value
for
key
,
value
in
self
.
_counter
.
items
())
# and flatten the list
return
[
item
for
sublist
in
cov_counts
for
item
in
sublist
]
def
at_least
(
self
,
n
):
"""Return the percentages of bases covered at least n times."""
x
=
sum
([
count
for
cvg
,
count
in
self
if
cvg
>=
n
])
return
float
(
x
)
/
self
.
total_bases
def
get_quick_stats
(
self
):
"""Returns a dictionary containing quick coverage statistics."""
return
{
'max'
:
self
.
max
,
'median'
:
self
.
median
,
'mode'
:
self
.
mode
,
'mean'
:
self
.
mean
,
'horizontal'
:
self
.
horizontal
,
'width'
:
self
.
total_bases
,
'width_nonzero'
:
self
.
nonzero_bases
,
'total'
:
self
.
total
,
'frac_min_10x'
:
self
.
at_least
(
10
),
'frac_min_20x'
:
self
.
at_least
(
20
),
'frac_min_30x'
:
self
.
at_least
(
30
),
'frac_min_40x'
:
self
.
at_least
(
40
),
'frac_min_50x'
:
self
.
at_least
(
50
),
'frac_min_100x'
:
self
.
at_least
(
100
),
}
def
plot
(
self
,
min_cov_ok
=
7
,
percentile_show
=
98
,
title
=
None
,
out_img
=
None
):
"""Plots the coverage object.
:param min_cov_ok: Minimum coverage value to show in the bar graph
(default: 7).
:type min_cov_ok: int
:param percentile_show: Maximum percentile to show in the bar graph
(default: 98).
:type percentile_show: int
:param title: Plot title
:type title: list of strings (one item per line)
:param out_img: Output image filename.
:type out_img: str
"""
plt
.
figure
(
figsize
=
(
8
,
8
))
grids
=
gs
.
GridSpec
(
2
,
1
,
height_ratios
=
[
5
,
1
])
ax0
=
plt
.
subplot
(
grids
[
0
])
if
title
is
None
:
title
=
[
'Coverage Plot'
]
elif
isinstance
(
title
,
basestring
):
title
=
[
title
]
t
=
plt
.
title
(
'
\n
'
.
join
(
title
),
fontsize
=
20
)
t
.
set_y
(
1.05
)
x_data
,
y_data
=
[],
[]
x_data_shade
,
y_data_shade
=
[],
[]
for
coverage
,
count
in
self
:
if
coverage
>=
min_cov_ok
:
x_data
.
append
(
coverage
)
y_data
.
append
(
count
)
else
:
x_data_shade
.
append
(
coverage
)
y_data_shade
.
append
(
count
)
ax0
.
bar
(
x_data
,
y_data
,
width
=
1
,
linewidth
=
0
,
color
=
BLUE
,
align
=
'center'
)
ax0
.
yaxis
.
set_major_formatter
(
major_formatter
)
ax0
.
grid
(
True
)
plt
.
ylabel
(
'Counts'
)
ax0
.
text
(
0.5
,
0.95
,
'Mean: %.2fx Median: %.2fx '
'Horizontal: %.2f%%'
%
(
self
.
mean
,
self
.
median
,
100.0
*
\
self
.
horizontal
),
verticalalignment
=
'center'
,
horizontalalignment
=
'center'
,
transform
=
ax0
.
transAxes
,
bbox
=
dict
(
facecolor
=
'wheat'
,
alpha
=
0.5
,
boxstyle
=
'round'
),
size
=
12
)
ax1
=
plt
.
subplot
(
grids
[
1
],
sharex
=
ax0
)
ax1
.
axes
.
get_yaxis
().
set_visible
(
False
)
bp
=
ax1
.
boxplot
(
self
.
cov_counts
,
vert
=
False
,
widths
=
0.6
,
sym
=
'+'
)
for
x
in
itertools
.
chain
(
bp
[
'boxes'
],
bp
[
'medians'
],
bp
[
'whiskers'
],
bp
[
'caps'
]):
x
.
set
(
color
=
BLUE
,
linewidth
=
1.6
)
for
flier
in
bp
[
'fliers'
]:
flier
.
set
(
color
=
RED
,
alpha
=
0.5
)
# bp['fliers'][0].set(color=RED, alpha=0.5)
# bp['fliers'][1].set(color=GREEN, alpha=0.5)
upper_limit
=
np
.
percentile
(
self
.
cov_counts
,
percentile_show
)
if
x_data
:
space
=
(
upper_limit
-
min
(
x_data
))
/
40
else
:
space
=
0
# truncate plot if we're not displaying maximum value
if
upper_limit
!=
self
.
max
:
ax1
.
set_xlim
([
min
(
self
.
cov_counts
)
-
space
-
0.5
,
upper_limit
+
0.5
])
# otherwise, give some space
else
:
ax1
.
set_xlim
([
min
(
self
.
cov_counts
)
-
space
-
0.5
,
upper_limit
+
space
+
0.5
])
# plot shaded values
if
x_data_shade
:
ylim
=
ax0
.
get_ylim
()
ax0
.
bar
(
x_data_shade
,
y_data_shade
,
width
=
1
,
linewidth
=
0
,
color
=
BLUE
,
alpha
=
0.3
,
align
=
'center'
)
ax0
.
set_ylim
(
ylim
)
plt
.
xlabel
(
'Coverage'
)
grids
.
update
(
hspace
=
0.075
)
if
out_img
is
not
None
:
plt
.
savefig
(
out_img
,
bbox_inches
=
'tight'
)
if
__name__
==
'__main__'
:
usage
=
__doc__
.
split
(
'
\n\n\n
'
)
parser
=
argparse
.
ArgumentParser
(
formatter_class
=
argparse
.
RawDescriptionHelpFormatter
,
description
=
usage
[
0
],
epilog
=
usage
[
1
])
parser
.
add_argument
(
'input'
,
type
=
str
,
help
=
'Path to input file '
'(coverageBed output) or
\'
-
\'
for stdin'
)
parser
.
add_argument
(
'--plot'
,
dest
=
'plot'
,
type
=
str
,
help
=
'Path to output PNG file'
)
parser
.
add_argument
(
'--min-cov-show'
,
dest
=
'min_cov_ok'
,
type
=
int
,
default
=
6
,
help
=
'Minimum coverage to show in bar graph'
)
parser
.
add_argument
(
'--max-percentile-show'
,
dest
=
'max_pct_show'
,
type
=
int
,
default
=
98
,
help
=
'Maximum percentile to show in bar graph'
)
parser
.
add_argument
(
'--title'
,
dest
=
'title'
,
type
=
str
,
default
=
'Coverage Plot'
,
help
=
'Plot title'
)
parser
.
add_argument
(
'--subtitle'
,
dest
=
'subtitle'
,
type
=
str
,
help
=
'Plot subtitle'
)
args
=
parser
.
parse_args
()
if
args
.
input
==
'-'
:
instream
=
sys
.
stdin
else
:
instream
=
open
(
args
.
input
,
'r'
)
title
=
[
args
.
title
]
if
args
.
subtitle
is
None
:
title
.
append
(
"'"
+
args
.
input
+
"'"
)
else
:
title
.
append
(
args
.
subtitle
)
coverages
=
{}
all_covs
,
cur_covs
=
[],
[]
cur_chrom
,
prev_chrom
=
None
,
None
for
line
in
(
l
.
strip
()
for
l
in
instream
):
cols
=
line
.
split
(
'
\t
'
)
cur_chrom
=
cols
[
0
]
cvg
=
int
(
cols
[
-
1
])
# coverage for all positions
all_covs
.
append
(
cvg
)
# coverage per chromosome
if
cur_chrom
!=
prev_chrom
:
if
prev_chrom
is
not
None
:
coverages
[
prev_chrom
]
=
cur_covs
cur_covs
=
[]
prev_chrom
=
cur_chrom
cur_covs
.
append
(
cvg
)
# also append the last chromosome from the file
coverages
[
cur_chrom
]
=
cur_covs
coverages
[
'_all'
]
=
all_covs
for
cname
,
clist
in
coverages
.
items
():
coverages
[
cname
]
=
Coverage
(
clist
)
if
args
.
input
!=
'-'
:
instream
.
close
()
if
args
.
plot
is
not
None
:
coverages
[
'_all'
].
plot
(
min_cov_ok
=
args
.
min_cov_ok
,
percentile_show
=
args
.
max_pct_show
,
title
=
title
,
out_img
=
args
.
plot
)
stats
=
{
'coverage'
:
{
k
:
v
.
get_quick_stats
()
for
k
,
v
in
coverages
.
items
()}}
if
args
.
plot
is
not
None
:
files
=
{
'plot_coverage'
:
{
'path'
:
os
.
path
.
abspath
(
args
.
plot
),
'checksum_sha1'
:
None
,
}
}
else
:
files
=
{}
json
.
dump
({
'stats'
:
stats
,
'files'
:
files
},
sys
.
stdout
,
sort_keys
=
True
,
indent
=
4
,
separators
=
(
','
,
': '
))
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment