Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
hutspot
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Klinische Genetica
capture-lumc
hutspot
Commits
cb516e24
Commit
cb516e24
authored
7 years ago
by
Sander Bollen
Browse files
Options
Downloads
Patches
Plain Diff
covstats
parent
903a86f2
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
Snakefile
+41
-5
41 additions, 5 deletions
Snakefile
src/covstats.py
+328
-0
328 additions, 0 deletions
src/covstats.py
with
369 additions
and
5 deletions
Snakefile
+
41
−
5
View file @
cb516e24
import json
from os.path import join
from os.path import join
, basename
from os import mkdir
from pyfaidx import Fasta
...
...
@@ -12,8 +12,8 @@ DBSNP = config.get("DBSNP")
ONETHOUSAND = config.get("ONETHOUSAND")
HAPMAP = config.get("HAPMAP")
QUEUE = config.get("QUEUE")
BED = config.get("BED")
REFFLAT = config.get("REFFLAT")
BED = config.get("BED")
# comma-separated list of BED files
REFFLAT = config.get("REFFLAT")
# comma-separated list of refFlat files
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
FASTQ_COUNT = config.get("FASTQ_COUNT")
...
...
@@ -24,11 +24,18 @@ env_dir = join(_this_dir, "envs")
main_env = join(_this_dir, "environment.yml")
settings_template = join(join(_this_dir, "templates"), "pipeline_settings.md.j2")
covpy = join(join(_this_dir, "src"), "covstats.py")
with open(config.get("SAMPLE_CONFIG")) as handle:
SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()
BEDS = BED.split(",")
REFFLATS = REFFLAT.split(",")
BASE_BEDS = [basename(x) for x in BEDS]
BASE_REFFLATS = [basename(x) for x in BEDS]
def split_genome(ref, approx_n_chunks=100):
fa = Fasta(ref)
...
...
@@ -76,6 +83,14 @@ def get_r2(wildcards):
return r2
def get_bedpath(wildcards):
return [x for x in BEDS if basename(x) == wildcards.bed][0]
def get_refflatpath(wildcards):
return [x for x in REFFLATS if basename(x) == wildcards.refflat][0]
def sample_gender(wildcards):
sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
return sam.get("gender", "null")
...
...
@@ -104,7 +119,10 @@ def metrics(do_metrics=True):
fnumpos = expand(out_path("{sample}/pre_process/{sample}.postqc_count.json"),
sample=SAMPLES)
return mnum + mbnum + unum + ubnum + fqcr + fqcm + fqcp + fnumpre + fnumpos
covs = expand(out_path("{sample}/coverage/{bed}.covstats.json"),
sample=SAMPLES, bed=BASE_BEDS)
return mnum + mbnum + unum + ubnum + fqcr + fqcm + fqcp + fnumpre + fnumpos + covs
rule all:
...
...
@@ -376,4 +394,22 @@ rule fqcount_postqc:
fastqcount=FASTQ_COUNT
output:
out_path("{sample}/pre_process/{sample}.postqc_count.json")
shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"
\ No newline at end of file
shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"
## coverages
rule covstats:
input:
bam=out_path("{sample}/bams/{sample}.markdup.bam"),
genome=out_path("current.genome"),
covpy=covpy,
bed=get_bedpath
params:
subt="Sample {sample}"
output:
covj=out_path("{sample}/coverage/{bed}.covstats.json"),
covp=out_path("{sample}/cpverage/{bed}.covstats.png")
shell: "bedtools coverage -sorted -a {input.bed} -b {input.bam} " \
"-d | python {input.covpy} - --plot {output.covp} " \
"--title 'Targets coverage' --subtitle '{params.subt}' > {output.covj}"
This diff is collapsed.
Click to expand it.
src/covstats.py
0 → 100644
+
328
−
0
View file @
cb516e24
#!/usr/bin/env python2.7
"""
Plot output of coverageBed /
"
bedtools coverage
"
when run with the -d flag.
This script plots a bar graph showing how many times a base covered X times
are found. An additional box plot is also plotted to show the general trend
of the coverage.
By default, the bar graph only shows the data up to the 98th percentile.
Counts for bases that are not covered (0x coverage) are shown in lighter
shade. The median, quartile, and whiskers values shown in the boxplot are
still computed from the complete data set (0x coverage included).
Requirements:
* Python == 2.7.x
* Matplotlib >= 1.3.0
* Numpy >= 1.8.0
Copyright (c) 2013 Wibowo Arindrarto <w.arindrarto@lumc.nl>
Copyright (c) 2013 LUMC Sequencing Analysis Support Core <sasc@lumc.nl>
MIT License <http://opensource.org/licenses/MIT>
"""
RELEASE
=
False
__version_info__
=
(
'
0
'
,
'
1
'
,)
__version__
=
'
.
'
.
join
(
__version_info__
)
__version__
+=
'
-dev
'
if
not
RELEASE
else
''
import
argparse
import
collections
import
itertools
import
json
import
locale
import
os
import
sys
import
matplotlib
matplotlib
.
use
(
'
Agg
'
)
import
matplotlib.gridspec
as
gs
import
matplotlib.pyplot
as
plt
import
matplotlib.ticker
as
tkr
import
numpy
as
np
BLUE
=
'
#2166AC
'
RED
=
'
#1A9850
'
GREEN
=
'
#D6604D
'
locale
.
setlocale
(
locale
.
LC_ALL
,
''
)
group_digits
=
lambda
x
,
pos
:
locale
.
format
(
'
%d
'
,
x
,
grouping
=
True
)
major_formatter
=
tkr
.
FuncFormatter
(
group_digits
)
def
cachedproperty
(
func
):
"""
Decorator for cached property loading.
"""
attr_name
=
func
.
__name__
@property
def
cached
(
self
):
if
not
hasattr
(
self
,
'
_cache
'
):
setattr
(
self
,
'
_cache
'
,
{})
try
:
return
self
.
_cache
[
attr_name
]
except
KeyError
:
result
=
self
.
_cache
[
attr_name
]
=
func
(
self
)
return
result
return
cached
class
Coverage
(
object
):
"""
Class representing coverage metrics from a coverageBed -d output.
"""
def
__init__
(
self
,
cvg_list
,
name
=
'
?
'
):
"""
:param cvg_list: iterator yielding coverage per position
:type cvg_list: iterable
"""
assert
cvg_list
counter
=
collections
.
Counter
()
total_bases
,
nonzero_bases
=
0
,
0
for
cvg
in
cvg_list
:
counter
[
cvg
]
+=
1
total_bases
+=
1
if
cvg
>
0
:
nonzero_bases
+=
1
self
.
_counter
=
counter
self
.
total_bases
=
total_bases
self
.
nonzero_bases
=
nonzero_bases
def
__iter__
(
self
):
return
self
.
_counter
.
iteritems
()
def
__repr__
(
self
):
return
"
{0}(...)
"
.
format
(
self
.
__class__
.
__name__
)
@cachedproperty
def
total
(
self
):
"""
Total coverage.
"""
return
sum
([
cvg
*
count
for
cvg
,
count
in
self
])
@cachedproperty
def
horizontal
(
self
):
"""
Horizontal coverage (0 - 1).
"""
return
float
(
self
.
nonzero_bases
)
/
self
.
total_bases
@cachedproperty
def
mean
(
self
):
"""
Average coverage.
"""
return
float
(
self
.
total
)
/
self
.
total_bases
@cachedproperty
def
max
(
self
):
"""
Maximum coverage.
"""
return
max
(
self
.
_counter
.
keys
())
@cachedproperty
def
median
(
self
):
"""
Median coverage.
"""
return
np
.
percentile
(
self
.
cov_counts
,
50
)
@cachedproperty
def
mode
(
self
):
"""
Modal coverage.
"""
count
=
collections
.
Counter
(
self
.
cov_counts
)
# return self.cov_counts[self.cov_counts.index(self.max)]
return
count
.
most_common
(
1
)[
0
][
0
]
@cachedproperty
def
cov_counts
(
self
):
"""
List of coverage for each base position (unsorted).
"""
# get list of each base coverage
cov_counts
=
([
key
]
*
value
for
key
,
value
in
self
.
_counter
.
items
())
# and flatten the list
return
[
item
for
sublist
in
cov_counts
for
item
in
sublist
]
def
at_least
(
self
,
n
):
"""
Return the percentages of bases covered at least n times.
"""
x
=
sum
([
count
for
cvg
,
count
in
self
if
cvg
>=
n
])
return
float
(
x
)
/
self
.
total_bases
def
get_quick_stats
(
self
):
"""
Returns a dictionary containing quick coverage statistics.
"""
return
{
'
max
'
:
self
.
max
,
'
median
'
:
self
.
median
,
'
mode
'
:
self
.
mode
,
'
mean
'
:
self
.
mean
,
'
horizontal
'
:
self
.
horizontal
,
'
width
'
:
self
.
total_bases
,
'
width_nonzero
'
:
self
.
nonzero_bases
,
'
total
'
:
self
.
total
,
'
frac_min_10x
'
:
self
.
at_least
(
10
),
'
frac_min_20x
'
:
self
.
at_least
(
20
),
'
frac_min_30x
'
:
self
.
at_least
(
30
),
'
frac_min_40x
'
:
self
.
at_least
(
40
),
'
frac_min_50x
'
:
self
.
at_least
(
50
),
'
frac_min_100x
'
:
self
.
at_least
(
100
),
}
def
plot
(
self
,
min_cov_ok
=
7
,
percentile_show
=
98
,
title
=
None
,
out_img
=
None
):
"""
Plots the coverage object.
:param min_cov_ok: Minimum coverage value to show in the bar graph
(default: 7).
:type min_cov_ok: int
:param percentile_show: Maximum percentile to show in the bar graph
(default: 98).
:type percentile_show: int
:param title: Plot title
:type title: list of strings (one item per line)
:param out_img: Output image filename.
:type out_img: str
"""
plt
.
figure
(
figsize
=
(
8
,
8
))
grids
=
gs
.
GridSpec
(
2
,
1
,
height_ratios
=
[
5
,
1
])
ax0
=
plt
.
subplot
(
grids
[
0
])
if
title
is
None
:
title
=
[
'
Coverage Plot
'
]
elif
isinstance
(
title
,
basestring
):
title
=
[
title
]
t
=
plt
.
title
(
'
\n
'
.
join
(
title
),
fontsize
=
20
)
t
.
set_y
(
1.05
)
x_data
,
y_data
=
[],
[]
x_data_shade
,
y_data_shade
=
[],
[]
for
coverage
,
count
in
self
:
if
coverage
>=
min_cov_ok
:
x_data
.
append
(
coverage
)
y_data
.
append
(
count
)
else
:
x_data_shade
.
append
(
coverage
)
y_data_shade
.
append
(
count
)
ax0
.
bar
(
x_data
,
y_data
,
width
=
1
,
linewidth
=
0
,
color
=
BLUE
,
align
=
'
center
'
)
ax0
.
yaxis
.
set_major_formatter
(
major_formatter
)
ax0
.
grid
(
True
)
plt
.
ylabel
(
'
Counts
'
)
ax0
.
text
(
0.5
,
0.95
,
'
Mean: %.2fx Median: %.2fx
'
'
Horizontal: %.2f%%
'
%
(
self
.
mean
,
self
.
median
,
100.0
*
\
self
.
horizontal
),
verticalalignment
=
'
center
'
,
horizontalalignment
=
'
center
'
,
transform
=
ax0
.
transAxes
,
bbox
=
dict
(
facecolor
=
'
wheat
'
,
alpha
=
0.5
,
boxstyle
=
'
round
'
),
size
=
12
)
ax1
=
plt
.
subplot
(
grids
[
1
],
sharex
=
ax0
)
ax1
.
axes
.
get_yaxis
().
set_visible
(
False
)
bp
=
ax1
.
boxplot
(
self
.
cov_counts
,
vert
=
False
,
widths
=
0.6
,
sym
=
'
+
'
)
for
x
in
itertools
.
chain
(
bp
[
'
boxes
'
],
bp
[
'
medians
'
],
bp
[
'
whiskers
'
],
bp
[
'
caps
'
]):
x
.
set
(
color
=
BLUE
,
linewidth
=
1.6
)
for
flier
in
bp
[
'
fliers
'
]:
flier
.
set
(
color
=
RED
,
alpha
=
0.5
)
# bp['fliers'][0].set(color=RED, alpha=0.5)
# bp['fliers'][1].set(color=GREEN, alpha=0.5)
upper_limit
=
np
.
percentile
(
self
.
cov_counts
,
percentile_show
)
if
x_data
:
space
=
(
upper_limit
-
min
(
x_data
))
/
40
else
:
space
=
0
# truncate plot if we're not displaying maximum value
if
upper_limit
!=
self
.
max
:
ax1
.
set_xlim
([
min
(
self
.
cov_counts
)
-
space
-
0.5
,
upper_limit
+
0.5
])
# otherwise, give some space
else
:
ax1
.
set_xlim
([
min
(
self
.
cov_counts
)
-
space
-
0.5
,
upper_limit
+
space
+
0.5
])
# plot shaded values
if
x_data_shade
:
ylim
=
ax0
.
get_ylim
()
ax0
.
bar
(
x_data_shade
,
y_data_shade
,
width
=
1
,
linewidth
=
0
,
color
=
BLUE
,
alpha
=
0.3
,
align
=
'
center
'
)
ax0
.
set_ylim
(
ylim
)
plt
.
xlabel
(
'
Coverage
'
)
grids
.
update
(
hspace
=
0.075
)
if
out_img
is
not
None
:
plt
.
savefig
(
out_img
,
bbox_inches
=
'
tight
'
)
if
__name__
==
'
__main__
'
:
usage
=
__doc__
.
split
(
'
\n\n\n
'
)
parser
=
argparse
.
ArgumentParser
(
formatter_class
=
argparse
.
RawDescriptionHelpFormatter
,
description
=
usage
[
0
],
epilog
=
usage
[
1
])
parser
.
add_argument
(
'
input
'
,
type
=
str
,
help
=
'
Path to input file
'
'
(coverageBed output) or
\'
-
\'
for stdin
'
)
parser
.
add_argument
(
'
--plot
'
,
dest
=
'
plot
'
,
type
=
str
,
help
=
'
Path to output PNG file
'
)
parser
.
add_argument
(
'
--min-cov-show
'
,
dest
=
'
min_cov_ok
'
,
type
=
int
,
default
=
6
,
help
=
'
Minimum coverage to show in bar graph
'
)
parser
.
add_argument
(
'
--max-percentile-show
'
,
dest
=
'
max_pct_show
'
,
type
=
int
,
default
=
98
,
help
=
'
Maximum percentile to show in bar graph
'
)
parser
.
add_argument
(
'
--title
'
,
dest
=
'
title
'
,
type
=
str
,
default
=
'
Coverage Plot
'
,
help
=
'
Plot title
'
)
parser
.
add_argument
(
'
--subtitle
'
,
dest
=
'
subtitle
'
,
type
=
str
,
help
=
'
Plot subtitle
'
)
args
=
parser
.
parse_args
()
if
args
.
input
==
'
-
'
:
instream
=
sys
.
stdin
else
:
instream
=
open
(
args
.
input
,
'
r
'
)
title
=
[
args
.
title
]
if
args
.
subtitle
is
None
:
title
.
append
(
"'"
+
args
.
input
+
"'"
)
else
:
title
.
append
(
args
.
subtitle
)
coverages
=
{}
all_covs
,
cur_covs
=
[],
[]
cur_chrom
,
prev_chrom
=
None
,
None
for
line
in
(
l
.
strip
()
for
l
in
instream
):
cols
=
line
.
split
(
'
\t
'
)
cur_chrom
=
cols
[
0
]
cvg
=
int
(
cols
[
-
1
])
# coverage for all positions
all_covs
.
append
(
cvg
)
# coverage per chromosome
if
cur_chrom
!=
prev_chrom
:
if
prev_chrom
is
not
None
:
coverages
[
prev_chrom
]
=
cur_covs
cur_covs
=
[]
prev_chrom
=
cur_chrom
cur_covs
.
append
(
cvg
)
# also append the last chromosome from the file
coverages
[
cur_chrom
]
=
cur_covs
coverages
[
'
_all
'
]
=
all_covs
for
cname
,
clist
in
coverages
.
items
():
coverages
[
cname
]
=
Coverage
(
clist
)
if
args
.
input
!=
'
-
'
:
instream
.
close
()
if
args
.
plot
is
not
None
:
coverages
[
'
_all
'
].
plot
(
min_cov_ok
=
args
.
min_cov_ok
,
percentile_show
=
args
.
max_pct_show
,
title
=
title
,
out_img
=
args
.
plot
)
stats
=
{
'
coverage
'
:
{
k
:
v
.
get_quick_stats
()
for
k
,
v
in
coverages
.
items
()}}
if
args
.
plot
is
not
None
:
files
=
{
'
plot_coverage
'
:
{
'
path
'
:
os
.
path
.
abspath
(
args
.
plot
),
'
checksum_sha1
'
:
None
,
}
}
else
:
files
=
{}
json
.
dump
({
'
stats
'
:
stats
,
'
files
'
:
files
},
sys
.
stdout
,
sort_keys
=
True
,
indent
=
4
,
separators
=
(
'
,
'
,
'
:
'
))
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment