Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
biopet.biopet
Commits
1f864153
Commit
1f864153
authored
Jan 14, 2015
by
Peter van 't Hof
Browse files
Remove old script
parent
020ffb1f
Changes
4
Hide whitespace changes
Inline
Side-by-side
public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/fastqc_contam.py
deleted
100755 → 0
View file @
020ffb1f
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
import
argparse
import
os
from
pyfastqc
import
load_from_dir
def parse_contam_file(contam_file, delimiter='\t'):
    """Parse a contaminants file into a dictionary of names and sequences.

    Lines beginning with '#' and lines that are empty after stripping are
    skipped. Every remaining line is split on ``delimiter``; empty fields
    (e.g. those produced by consecutive delimiters) are dropped and the two
    fields left over are used as the contaminant name and its sequence.
    When a name occurs more than once, the last occurrence wins.

    Args:
        contam_file -- path to contaminants file
        delimiter -- ID, sequence delimiter in the contaminants file

    Returns:
        Dictionary with contaminant names as keys and sequences as values.

    Raises:
        AssertionError -- if ``contam_file`` does not exist.
        ValueError -- if a line does not contain exactly two non-empty fields.
    """
    assert os.path.exists(contam_file), \
        "Contaminant file %r does not exist" % contam_file

    contam_ref = {}
    # Everything is consumed while the file is still open. The previous
    # implementation relied on filter() being eager, which only holds on
    # Python 2; on Python 3 it ended up reading from a closed file.
    with open(contam_file, 'r') as source:
        for raw_line in source:
            # the comment check is done on the raw (unstripped) line
            if raw_line.startswith('#'):
                continue
            stripped = raw_line.strip()
            if not stripped:
                continue
            fields = [field for field in stripped.split(delimiter) if field]
            # unpacking enforces the "exactly two fields" rule (ValueError)
            name, seq = fields
            contam_ref[name] = seq

    return contam_ref
def get_contams_present(results_dir, contam_ref):
    """Return the reference contaminants reported by a FastQC run.

    The FastQC results are loaded from ``results_dir`` and the fourth field
    (index 3) of every row in the overrepresented sequences module is
    collected. A reference contaminant is considered present when its name is
    a prefix of at least one of the collected values.

    Args:
        results_dir -- Path to FastQC results directory.
        contam_ref -- Dictionary of contaminant names and their sequences, as
                      returned by `parse_contam_file`.

    Returns:
        Dictionary holding the subset of ``contam_ref`` found in the sample.

    Raises:
        AssertionError -- if ``results_dir`` does not exist.
    """
    assert os.path.exists(results_dir), \
        "Directory {0} not found.".format(results_dir)

    fastqc = load_from_dir(results_dir)
    # NOTE(review): index 3 is presumably FastQC's "Possible Source" column;
    # confirm against the FastQC version in use.
    reported = {row[3] for row in fastqc.overrepresented_sequences.data}

    return {
        name: seq
        for name, seq in contam_ref.items()
        if any(source.startswith(name) for source in reported)
    }
if __name__ == '__main__':
    # Command-line entry point: report the reference contaminants that are
    # present in a FastQC result directory, on stdout or in an output file.

    parser = argparse.ArgumentParser()
    parser.add_argument('results_dir', type=str,
                        help='Path to FastQC result directory file')
    # The contaminant file is always parsed below, so a missing value can
    # only end in a traceback; let argparse report it as a usage error.
    parser.add_argument('-c', '--contam_file', type=str, dest='contam_file',
                        required=True, help='Path to contaminant file')
    parser.add_argument('-o', '--output', type=str, dest='output',
                        help='Path to output file')
    parser.add_argument('--seq-only', dest='seq_only', action='store_true',
                        help='Whether to output contaminant sequences only '
                        'or not')

    args = parser.parse_args()

    contam_ref = parse_contam_file(args.contam_file)
    contam_ids = get_contams_present(args.results_dir, contam_ref)

    def fmt_out(cid, seq):
        """Format one contaminant as an output line (without newline)."""
        if args.seq_only:
            return seq
        return "{0}\t{1}".format(cid, seq)

    out_lines = [fmt_out(cid, seq) for cid, seq in contam_ids.items()]

    if args.output is None:
        for out_line in out_lines:
            # single-argument call form is valid on both Python 2 and 3
            print(out_line)
    else:
        with open(args.output, 'w') as target:
            target.writelines(out_line + '\n' for out_line in out_lines)
public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/scripts/pyfastqc/__init__.py
deleted
100644 → 0
View file @
020ffb1f
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
"""
--------
pyfastqc
--------
Parser for FastQC results.
Provides functions and classes for parsing the results of a FastQC run.
Tested with FastQC 0.9.5 output.
"""
import
os
def load_file(fname, mode='r', **kwargs):
    """Given a path to a FastQC data file or an open file object pointing to
    it, return a `FastQC` object.

    Args:
        fname -- path to the FastQC data file, or an already opened file
                 object. Anything that is not a string is passed to `FastQC`
                 unchanged.
        mode -- mode used to open the file when ``fname`` is a path
        **kwargs -- extra keyword arguments passed on to ``open`` when
                    ``fname`` is a path

    When a path is given, the file is opened here and closed again once the
    `FastQC` object has been built. A file object given by the caller is left
    open.
    """
    try:
        string_types = basestring  # Python 2: covers str and unicode
    except NameError:
        string_types = str  # Python 3: basestring no longer exists

    if isinstance(fname, string_types):
        with open(fname, mode, **kwargs) as fp:
            return FastQC(fp)

    return FastQC(fname)
def load_from_dir(dirname, data_fname='fastqc_data.txt', mode='r', **kwargs):
    """Given a path to a FastQC results directory, return a `FastQC` object.

    The data file is looked up one level down, i.e. at
    ``<dirname>/<subdirectory>/<data_fname>``. Subdirectories are tried in
    the order reported by ``os.walk`` and the first one containing
    ``data_fname`` is used.

    Args:
        dirname -- path to the FastQC results directory
        data_fname -- name of the FastQC data file inside the subdirectory
        mode -- mode used to open the data file
        **kwargs -- extra keyword arguments passed on to `load_file`

    Raises:
        AssertionError -- if ``dirname`` does not exist.
        IOError -- if no subdirectory of ``dirname`` contains ``data_fname``.
    """
    assert os.path.exists(dirname), "Directory %r does not exist" % dirname

    # The next() builtin works on Python 2.6+ and 3 (generator.next() is
    # Python 2 only). The default covers os.walk yielding nothing, which
    # happens when dirname is not a directory.
    _, subdirs, _ = next(os.walk(dirname), (dirname, [], []))

    for subdir in subdirs:
        fqc_path = os.path.join(dirname, subdir, data_fname)
        if os.path.exists(fqc_path):
            return load_file(fqc_path, mode, **kwargs)

    raise IOError("No %r found in any subdirectory of %r"
                  % (data_fname, dirname))
class FastQCModule(object):

    """Class representing a FastQC analysis module.

    A module is built from its raw text lines: a first line holding the
    module name (prefixed with two characters, e.g. '>>') and its status,
    an optional column header line (prefixed with one character, e.g. '#')
    followed by tab-separated data rows, and a final end mark line.
    """

    def __init__(self, raw_lines, end_mark='>>END_MODULE'):
        """Store the raw lines and parse them.

        Args:
            raw_lines -- list of lines making up the module, from its first
                         line up to and including the end mark line
            end_mark -- prefix expected at the start of the last line
        """
        self.raw_lines = raw_lines
        self.end_mark = end_mark
        self._status = None
        self._name = None
        self._columns = []
        self._data = self.parse()

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                           '[%r, ...]' % self.raw_lines[0])

    def __str__(self):
        return ''.join(self.raw_lines)

    @property
    def name(self):
        """Name of the module."""
        return self._name

    @property
    def columns(self):
        """Columns in the module."""
        return self._columns

    @property
    def data(self):
        """FastQC data."""
        return self._data

    @property
    def status(self):
        """FastQC run status."""
        return self._status

    def parse(self):
        """Common parser for a FastQC module.

        Sets the module name, status and column names as a side effect.

        Returns:
            A list with one list of tab-separated fields per data row. For
            the 'Basic Statistics' module the rows are turned into a
            dictionary instead (first field as key, second as value).

        Raises:
            AssertionError -- if the last line does not start with the end
                              mark or the status is not pass/fail/warn.
        """
        # check that the last line is a proper end mark
        assert self.raw_lines[-1].startswith(self.end_mark)

        # parse name and status from first line
        tokens = self.raw_lines[0].strip().split('\t')
        self._name = tokens[0][2:]
        status = tokens[-1]
        assert status in ('pass', 'fail', 'warn'), \
            "Unknown module status: %r" % status
        self._status = status

        # and column names from second line; a module without any data
        # consists of the first line and the end mark only, in which case
        # the second line is the end mark and must not be read as a header
        if len(self.raw_lines) > 2:
            self._columns = self.raw_lines[1][1:].strip().split('\t')
        else:
            self._columns = []

        # the rest of the lines except the last one
        data = [line.strip().split('\t') for line in self.raw_lines[2:-1]]

        # optional processing for different modules
        if self.name == 'Basic Statistics':
            data = dict(data)

        return data
class FastQC(object):

    """Class representing results from a FastQC run.

    The results are read from an open FastQC data file. Every recognised
    module is stored as a `FastQCModule` and is reachable through the
    ``modules`` dictionary or through the property named after it. The
    per-module properties raise ``KeyError`` when the module was not present
    in the parsed file.
    """

    # module name -- attribute name mapping
    _mod_map = {
        '>>Basic Statistics': 'basic_statistics',
        '>>Per base sequence quality': 'per_base_sequence_quality',
        '>>Per sequence quality scores': 'per_sequence_quality_scores',
        '>>Per base sequence content': 'per_base_sequence_content',
        '>>Per base GC content': 'per_base_gc_content',
        '>>Per sequence GC content': 'per_sequence_gc_content',
        '>>Per base N content': 'per_base_n_content',
        '>>Sequence Length Distribution': 'sequence_length_distribution',
        '>>Sequence Duplication Levels': 'sequence_duplication_levels',
        '>>Overrepresented sequences': 'overrepresented_sequences',
        '>>Kmer content': 'kmer_content',
    }

    def __init__(self, fp):
        """Parse the FastQC data available from the open file object ``fp``.

        Args:
            fp -- open file object pointing to a FastQC data file
        """
        # get file name; in-memory file objects may not have one
        self.fname = getattr(fp, 'name', None)
        # stays None when the file has no '##FastQC' line
        self.version = None
        self._modules = {}

        # readline() is used instead of iterating over fp because
        # read_module() pulls lines from the same file object
        while True:
            line = fp.readline()
            # break on EOF
            if not line:
                break
            tokens = line.strip().split('\t')
            # parse version
            if line.startswith('##FastQC'):
                self.version = line.strip().split()[1]
            # parse individual modules
            elif tokens[0] in self._mod_map:
                attr = self._mod_map[tokens[0]]
                raw_lines = self.read_module(fp, line, tokens[0])
                self._modules[attr] = FastQCModule(raw_lines)

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.fname)

    def _filter_by_status(self, status):
        """Return the names of the parsed modules having the given status."""
        return [x.name for x in self._modules.values() if x.status == status]

    def read_module(self, fp, line, start_mark):
        """Collect the raw lines of one module.

        Args:
            fp -- open file object positioned right after the module's
                  first line
            line -- the module's first line, already read from ``fp``
            start_mark -- module identifier, used in the error message

        Returns:
            List of lines starting with ``line`` and ending with the line
            that starts with '>>END_MODULE'.

        Raises:
            ValueError -- if the file ends before the end mark is found.
        """
        raw = [line]
        while not line.startswith('>>END_MODULE'):
            line = fp.readline()
            if not line:
                # report the module being read; `line` is empty at this point
                raise ValueError("Unexpected end of file in module %r"
                                 % start_mark)
            raw.append(line)

        return raw

    @property
    def modules(self):
        """Dictionary of attribute names and their parsed modules."""
        return self._modules

    @property
    def passes(self):
        """Names of the modules with status 'pass'."""
        return self._filter_by_status('pass')

    @property
    def passes_num(self):
        """Number of modules with status 'pass'."""
        return len(self.passes)

    @property
    def warns(self):
        """Names of the modules with status 'warn'."""
        return self._filter_by_status('warn')

    @property
    def warns_num(self):
        """Number of modules with status 'warn'."""
        return len(self.warns)

    @property
    def fails(self):
        """Names of the modules with status 'fail'."""
        return self._filter_by_status('fail')

    @property
    def fails_num(self):
        """Number of modules with status 'fail'."""
        return len(self.fails)

    @property
    def basic_statistics(self):
        return self._modules['basic_statistics']

    @property
    def per_base_sequence_quality(self):
        return self._modules['per_base_sequence_quality']

    @property
    def per_sequence_quality_scores(self):
        return self._modules['per_sequence_quality_scores']

    @property
    def per_base_sequence_content(self):
        return self._modules['per_base_sequence_content']

    @property
    def per_base_gc_content(self):
        return self._modules['per_base_gc_content']

    @property
    def per_sequence_gc_content(self):
        return self._modules['per_sequence_gc_content']

    @property
    def per_base_n_content(self):
        return self._modules['per_base_n_content']

    @property
    def sequence_length_distribution(self):
        return self._modules['sequence_length_distribution']

    @property
    def sequence_duplication_levels(self):
        return self._modules['sequence_duplication_levels']

    @property
    def overrepresented_sequences(self):
        return self._modules['overrepresented_sequences']

    @property
    def kmer_content(self):
        return self._modules['kmer_content']
public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/scripts/FastqcToContams.scala
deleted
100644 → 0
View file @
020ffb1f
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package
nl.lumc.sasc.biopet.scripts
import
java.io.File
import
org.broadinstitute.gatk.utils.commandline.
{
Input
,
Output
}
import
nl.lumc.sasc.biopet.core.config.Configurable
import
nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
/**
 * Queue command line function wrapping the `fastqc_contam.py` script.
 *
 * The generated command passes the parent directory of `fastqc_output` as
 * the results directory, the contaminants file through `-c`, and redirects
 * the script's standard output to `out`.
 */
class FastqcToContams(val root: Configurable) extends PythonCommandLineFunction {
  // The script imports the pyfastqc package, so that package's __init__.py
  // is registered under "pyfastqc/" in addition to the script itself.
  setPythonScript("__init__.py", "pyfastqc/")
  setPythonScript("fastqc_contam.py")

  @Input(doc = "Fastqc output", shortName = "fastqc", required = true)
  var fastqc_output: File = _

  // shortName used to be "fastqc" as well, duplicating the field above.
  // NOTE(review): cmdLine treats this file as required although it is
  // declared optional here -- confirm which of the two is intended.
  @Input(doc = "Contams input", shortName = "contams", required = false)
  var contams_file: File = _

  @Output(doc = "Output file", shortName = "out", required = true)
  var out: File = _

  def cmdLine = {
    getPythonCommand +
      required(fastqc_output.getParent()) +
      required("-c", contams_file) +
      " > " +
      required(out)
  }
}
public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala
View file @
1f864153
...
...
@@ -21,7 +21,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
import
nl.lumc.sasc.biopet.core.
{
BiopetQScript
,
PipelineCommand
}
import
nl.lumc.sasc.biopet.core.config.Configurable
import
nl.lumc.sasc.biopet.extensions.
{
Gzip
,
Pbzip2
,
Md5sum
,
Zcat
,
Seqstat
}
import
nl.lumc.sasc.biopet.scripts.
{
FastqSync
,
FastqcToContams
}
import
nl.lumc.sasc.biopet.scripts.
{
FastqSync
}
class
Flexiprep
(
val
root
:
Configurable
)
extends
QScript
with
BiopetQScript
{
def
this
()
=
this
(
null
)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment