Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sam Nooij
Jovian screener
Commits
c8aa56d3
Commit
c8aa56d3
authored
Jul 28, 2020
by
Sam Nooij
Browse files
Reformat Python code with Black
parent
6ce6ee4e
Changes
11
Expand all
Hide whitespace changes
Inline
Side-by-side
bin/collect_species_statistics.py
View file @
c8aa56d3
#! /usr/bin/env python3
# Collect scaffold information for a species filtered from Jovian's
# classification table. (Which scaffolds, their length and number
# of reads mapped to them.)
#
# Example use:
...
...
@@ -9,19 +9,19 @@
# -i data/all_taxClassified-Escherichia_coli.tsv data/Mapped_read_counts.tsv \
# -o results/Escherichia_coli-scaffolds_per_sample.tsv results/Escherichia_coli-stats_per_sample.tsv \
#
# The two required arguments are:
# -i/--input for the two required tables (see example)
# -o/--output for the result tables (see example)
## Import required libraries
import
sys
#to abort the script when an error occurs
import
argparse
#to parse command-line arguments
import
pandas
as
pd
#to work with dataframes (tabular data)
import
numpy
as
np
#for calculations (summing contig stats)
import
re
#to extract sample names from multiqc table
import
sys
#
to abort the script when an error occurs
import
argparse
#
to parse command-line arguments
import
pandas
as
pd
#
to work with dataframes (tabular data)
import
numpy
as
np
#
for calculations (summing contig stats)
import
re
#
to extract sample names from multiqc table
## Define functions:
# 1. Parse command-line arguments
def parse_arguments(argv=None):
    """
    Parse the arguments from the command line, i.e.:
     -i/--input = 2 input files (classified scaffolds table, mapped read counts)
     -o/--output = 2 output files (per-scaffold table, summed statistics)
     -h/--help = show help

    Parameters:
        argv: optional list of argument strings. When None (the default),
            argparse reads the arguments from sys.argv[1:].

    Returns:
        argparse.Namespace with the attributes `input` and `output`, each a
        list of two strings.
    """
    parser = argparse.ArgumentParser(
        prog="collect species statistics",
        description="Collect scaffold statistics for each species filtered from Jovian's results",
        usage="collect_species_statistics.py -i file1 file2 -o file3 file4"
        " [-h / --help]",
        # The help option is registered by hand below, in its own group.
        add_help=False,
    )

    required = parser.add_argument_group("Required arguments")

    required.add_argument(
        "-i",
        "--input",
        dest="input",
        metavar="file1 file2",
        required=True,
        nargs=2,
        type=str,
        help="Classified scaffolds table and mapped read counts.",
    )

    required.add_argument(
        "-o",
        "--output",
        dest="output",
        metavar="file3 file4",
        required=True,
        nargs=2,
        type=str,
        help="Output tables (per scaffold, and summed statistics).",
    )

    optional = parser.add_argument_group("Optional arguments")

    optional.add_argument(
        "-h", "--help", action="help", help="Show this message and exit."
    )

    # parse_known_args tolerates unrecognised arguments; they are discarded.
    args, _unrecognised = parser.parse_known_args(argv)

    return args
def main():
    """
    Main execution of the script.

    Reads the two tab-separated input tables, left-merges the mapped read
    counts onto the classified scaffolds (on scaffold and sample name), and
    writes two tab-separated tables: one row per scaffold, and summed
    statistics per sample.
    """
    # 1. Parse and print command-line arguments
    arguments = parse_arguments()

    message = (
        "\n"
        "These are the arguments you have provided:\n"
        " INPUT:\n"
        "{0}\n"
        " OUTPUT:\n"
        "{1}\n".format(arguments.input, arguments.output)
    )

    print(message)

    # 2. Open input files
    species_scaffolds_df = pd.read_csv(arguments.input[0], sep="\t")
    mapped_reads_df = pd.read_csv(arguments.input[1], sep="\t")

    # 3. Merge input files together
    merged_df = pd.merge(
        species_scaffolds_df,
        mapped_reads_df,
        on=["scaffold_name", "Sample_name"],
        how="left",
    )

    # 4. Extract columns of interest and rename them. rename() returns a new
    #    dataframe, so nothing is modified in place on a slice of merged_df.
    quantified_scaffolds_df = merged_df[
        ["Sample_name", "scaffold_name", "Length", "mapped_reads"]
    ].rename(
        columns={
            "Sample_name": "Sample",
            "scaffold_name": "Scaffold",
            "Length": "Scaffold_length",
            "mapped_reads": "Mapped_reads",
        }
    )

    quantified_scaffolds_df.to_csv(arguments.output[0], sep="\t", index=False)

    # 5. Aggregate statistics per sample. Every scaffold counts as 1, and only
    #    the numeric columns of interest are summed (the scaffold names are
    #    left out of the aggregation).
    statistics_per_sample = (
        quantified_scaffolds_df.assign(Number_of_scaffolds=1)
        .groupby("Sample")[["Number_of_scaffolds", "Scaffold_length", "Mapped_reads"]]
        .sum()
    )

    statistics_per_sample.to_csv(arguments.output[1], sep="\t", index=True)
    # requires index for sample names
## Execute script
if __name__ == "__main__":
    main()
bin/concatenate_depth_tables.py
View file @
c8aa56d3
#! /usr/bin/env python3
# Concatenate per-sample tables of depth of coverage into an overall table.
# Required input:
#  - A number of per-sample tables as tsv files
#
# The output file will be a concatenated table with an additional "Sample_name"
# column. Entering an output file name on the command line is required.
#
# Example use:
#  python concatenate_depth_tables.py -i Depth_of_coverage-sample1-paired.tsv Depth_of_coverage-sample2-paired.tsv Depth_of_coverage-sample3-paired.tsv -o Depth_of_coverage-paired.tsv

# IMPORT required libraries---------------------------------
import
pandas
as
pd
import
argparse
# Define FUNCTIONS------------------------------------------
def parse_arguments(argv=None):
    """
    Parse the arguments from the command line, i.e.:
     -i/--input = one or more input files (tab-separated tables)
     -o/--output = output file (tab-separated table)
     -h/--help = show help

    Parameters:
        argv: optional list of argument strings. When None (the default),
            argparse reads the arguments from sys.argv[1:].

    Returns:
        argparse.Namespace with `input` (list of strings) and `output`
        (string).
    """
    parser = argparse.ArgumentParser(
        prog="concatenate depth tables",
        description="Concatenate depth of coverage tables",
        usage="concatenate_depth_tables.py -i [input] -o [output]"
        " [-h / --help]",
        # The help option is registered by hand below, in its own group.
        add_help=False,
    )

    required = parser.add_argument_group("Required arguments")

    required.add_argument(
        "-i",
        "--input",
        dest="input",
        metavar="",
        required=True,
        type=str,
        nargs="+",
        help="List of input files (depth of coverage per sample).",
    )

    required.add_argument(
        "-o",
        "--output",
        dest="output",
        metavar="",
        required=True,
        type=str,
        help="Output file name (and directory).",
    )

    optional = parser.add_argument_group("Optional arguments")

    # With add_help=False the -h/--help option advertised in the usage line
    # only exists if it is added explicitly.
    optional.add_argument(
        "-h", "--help", action="help", help="Show this message and exit."
    )

    # parse_known_args tolerates unrecognised arguments; they are discarded.
    args, _unrecognised = parser.parse_known_args(argv)

    return args
def extract_sample_name(filename):
    """
    Extract the sample name from the file name of a depth of coverage table.

    Any directory path (separated by "/") is dropped first. Then
    "Depth_of_coverage-" is removed, and everything from the first "_to_"
    until the end is cut off.
    E.g. "data/Depth_of_coverage-A_to_ref.tsv" gives "A".

    Parameters:
        filename: file name as a string, with or without directory path.

    Returns:
        The sample name as a string.

    Raises:
        ValueError: when the file name (without path) contains no "_to_".
    """
    # split() returns the whole string when there is no "/" in it, so no
    # separate check for a directory path is needed.
    without_path = filename.split("/")[-1]

    without_prefix = without_path.replace("Depth_of_coverage-", "")

    sample, separator, _remainder = without_prefix.partition("_to_")
    if not separator:
        raise ValueError(
            'Cannot extract a sample name from "{0}": '
            'the file name contains no "_to_".'.format(filename)
        )

    return sample
def main():
    """
    Main execution of the script.

    Reads every input table, adds a "Sample_name" column holding the sample
    name taken from the file name, and writes all tables concatenated into
    one tab-separated output file.
    """
    # 1. Parse and show arguments
    arguments = parse_arguments()

    message = (
        "\n"
        "These are the arguments you have provided:\n"
        " INPUT:\n"
        "{0},\n"
        " OUTPUT:\n"
        "{1}\n".format(arguments.input, arguments.output)
    )

    print(message)

    # 2. Read input files and make into one dataframe.
    #    The tables are collected first and concatenated once, rather than
    #    growing a dataframe (and copying all earlier rows) on every file.
    sample_tables = []

    for file in arguments.input:
        sample = extract_sample_name(file)
        df = pd.read_csv(file, sep="\t")
        df["Sample_name"] = sample
        sample_tables.append(df)

    # -i requires at least one file, so the list is never empty here.
    concat_df = pd.concat(sample_tables)

    # 3. Write table to a tsv file
    concat_df.to_csv(arguments.output, sep="\t", index=False)
# EXECUTE script--------------------------------------------
if __name__ == "__main__":
    main()
bin/concatenate_read_counts.py
View file @
c8aa56d3
#! /usr/bin/env python3
# Concatenate per-sample tables of read counts into an overall table.
# Required input:
#  - A number of per-sample tables as tsv files
#
# The output file will be a concatenated table with additional "Sample_name"
# and "Reference_name" columns. Entering an output file name on the
# command line is required.
#
# Example use:
#  python concatenate_read_counts.py -i sample1-paired.tsv sample2-paired.tsv sample3-paired.tsv -o All_samples-paired.tsv

# IMPORT required libraries---------------------------------
import
pandas
as
pd
import
argparse
# Define FUNCTIONS------------------------------------------
def parse_arguments(argv=None):
    """
    Parse the arguments from the command line, i.e.:
     -i/--input = one or more input files (tab-separated tables)
     -o/--output = output file (tab-separated table)
     -h/--help = show help

    Parameters:
        argv: optional list of argument strings. When None (the default),
            argparse reads the arguments from sys.argv[1:].

    Returns:
        argparse.Namespace with `input` (list of strings) and `output`
        (string).
    """
    parser = argparse.ArgumentParser(
        prog="concatenate mapped read counts",
        description="Concatenate mapped read count tables",
        usage="concatenate_read_counts.py -i [input] -o [output]"
        " [-h / --help]",
        # The help option is registered by hand below, in its own group.
        add_help=False,
    )

    required = parser.add_argument_group("Required arguments")

    required.add_argument(
        "-i",
        "--input",
        dest="input",
        metavar="",
        required=True,
        type=str,
        nargs="+",
        help="List of input files (counts per sample).",
    )

    required.add_argument(
        "-o",
        "--output",
        dest="output",
        metavar="",
        required=True,
        type=str,
        help="Output file name (and directory).",
    )

    optional = parser.add_argument_group("Optional arguments")

    # With add_help=False the -h/--help option advertised in the usage line
    # only exists if it is added explicitly.
    optional.add_argument(
        "-h", "--help", action="help", help="Show this message and exit."
    )

    # parse_known_args tolerates unrecognised arguments; they are discarded.
    args, _unrecognised = parser.parse_known_args(argv)

    return args
def extract_sample_and_reference_name(filename):
    """
    Extract the sample and reference name from the file name of a mapped
    read counts table.

    Any directory path (separated by "/") is dropped first and
    "Mapped_read_counts-" is removed. The sample name is everything before
    the first "_to_". The reference name is what follows it, minus the last
    4 characters (the assumed ".tsv" extension) and minus a "-paired" or
    "-unpaired" suffix. When neither suffix is present a notice is printed
    and the reference name is returned as is.
    E.g. "out/Mapped_read_counts-A_to_NC_0001-paired.tsv" gives
    ("A", "NC_0001").

    Parameters:
        filename: file name as a string, with or without directory path.

    Returns:
        Tuple (sample, reference) of two strings.

    Raises:
        ValueError: when the file name (without path) contains no "_to_".
    """
    # split() returns the whole string when there is no "/" in it, so no
    # separate check for a directory path is needed.
    without_path = filename.split("/")[-1]

    without_prefix = without_path.replace("Mapped_read_counts-", "")

    # Locate "_to_" once and use it for both the sample and the reference.
    sample, separator, remainder = without_prefix.partition("_to_")
    if not separator:
        raise ValueError(
            'Cannot extract sample and reference names from "{0}": '
            'the file name contains no "_to_".'.format(filename)
        )

    # Assume the .tsv file extension; hence strip the last 4 characters
    reference = remainder[:-4]

    # "-unpaired" is checked first; both branches remove their own suffix.
    if "-unpaired" in reference:
        reference = reference.replace("-unpaired", "")
    elif "-paired" in reference:
        reference = reference.replace("-paired", "")
    else:
        print(
            "There might be an unexpected suffix in the reference name:"
            " %s" % reference
        )

    return (sample, reference)
def main():
    """
    Main execution of the script.

    Reads every input table, adds "Sample_name" and "Reference_name" columns
    holding the names taken from the file name, and writes all tables
    concatenated into one tab-separated output file.
    """
    # 1. Parse and show arguments
    arguments = parse_arguments()

    message = (
        "\n"
        "These are the arguments you have provided:\n"
        " INPUT:\n"
        "{0},\n"
        " OUTPUT:\n"
        "{1}\n".format(arguments.input, arguments.output)
    )

    print(message)

    # 2. Read input files and make into one dataframe.
    #    The tables are collected first and concatenated once, rather than
    #    growing a dataframe (and copying all earlier rows) on every file.
    sample_tables = []

    for file in arguments.input:
        sample, reference = extract_sample_and_reference_name(file)
        df = pd.read_csv(file, sep="\t")
        df["Sample_name"] = sample
        df["Reference_name"] = reference
        sample_tables.append(df)

    # -i requires at least one file, so the list is never empty here.
    concat_df = pd.concat(sample_tables)

    # 3. Write table to a tsv file
    concat_df.to_csv(arguments.output, sep="\t", index=False)
# EXECUTE script--------------------------------------------
if __name__ == "__main__":
    main()
bin/create_per_sample_fasta.py
View file @
c8aa56d3
#! /usr/bin/env python3
# From the species-filtered Jovian output table, create fasta files per sample
# for each species. E.g. 'all_taxClassified-Escherichia_coli.tsv' becomes
# 'A-Escherichia_coli-scaffolds.fasta', 'B-Escherichia_coli-scaffolds.fasta',
# and 'C-Escherichia_coli-scaffolds.fasta' if your sample names are A, B and
# C.
# Resulting fasta files are written to the same directory as the input file.
#
# Example use:
#  python create_per_sample_fasta.py -i all_taxClassified-Escherichia_coli.tsv
from
pathlib
import
Path
import
argparse
def
parse_arguments
():
"""
Parse the arguments from the command line, i.e.:
...
...
@@ -20,41 +21,46 @@ def parse_arguments():
-s/--samples = sample names for which to generate output (necessary for snakemake)
-h/--help = show help
"""
parser
=
argparse
.
ArgumentParser
(
prog
=
"create per sample fasta"
,
description
=
"Create a fasta file per sample with scaffolds from the specified species"
,
usage
=
"create_per_sample_fasta.py -i file -s samples"
" [-h / --help]"
,
add_help
=
False
)
parser
=
argparse
.
ArgumentParser
(
prog
=
"create per sample fasta"
,
description
=
"Create a fasta file per sample with scaffolds from the specified species"
,
usage
=
"create_per_sample_fasta.py -i file -s samples"
" [-h / --help]"
,
add_help
=
False
,
)
required
=
parser
.
add_argument_group
(
"Required arguments"
)
required
.
add_argument
(
"-i"
,
"--input"
,
dest
=
"input"
,
metavar
=
"file"
,
required
=
True
,
type
=
str
,
help
=
"Classified scaffolds table for species of interest."
)
required
.
add_argument
(
"-i"
,
"--input"
,
dest
=
"input"
,
metavar
=
"file"
,
required
=
True
,