Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sam Nooij
Jovian screener
Commits
c8aa56d3
Commit
c8aa56d3
authored
Jul 28, 2020
by
Sam Nooij
Browse files
Reformat Python code with Black
parent
6ce6ee4e
Changes
11
Expand all
Show whitespace changes
Inline
Side-by-side
bin/collect_species_statistics.py
View file @
c8aa56d3
#! /usr/bin/env python3
#Collect scaffold information for a species filtered from Jovian's
#
Collect scaffold information for a species filtered from Jovian's
# classification table. (Which scaffolds, their length and number
# of reads mapped to them.)
#
...
...
@@ -9,19 +9,19 @@
# -i data/all_taxClassified-Escherichia_coli.tsv data/Mapped_read_counts.tsv \
# -o results/Escherichia_coli-scaffolds_per_sample.tsv results/Escherichia_coli-stats_per_sample.tsv \
#
#Three required input arguments are:
#
Three required input arguments are:
# -i/--input for the two required tables (see example)
# -o/--output for the result tables (see example)
## Import required libraries
import
sys
#to abort the script when an error occurs
import
argparse
#to parse command-line arguments
import
pandas
as
pd
#to work with dataframes (tabular data)
import
numpy
as
np
#for calculations (summing contig stats)
import
re
#to extract sample names from multiqc table
import
sys
#
to abort the script when an error occurs
import
argparse
#
to parse command-line arguments
import
pandas
as
pd
#
to work with dataframes (tabular data)
import
numpy
as
np
#
for calculations (summing contig stats)
import
re
#
to extract sample names from multiqc table
## Define functions:
#1. Parse command-line arguments
#
1. Parse command-line arguments
def
parse_arguments
():
"""
Parse the arguments from the command line, i.e.:
...
...
@@ -29,98 +29,111 @@ def parse_arguments():
-o/--output = 2 output files
-h/--help = show help
"""
parser
=
argparse
.
ArgumentParser
(
prog
=
"collect species statistics"
,
parser
=
argparse
.
ArgumentParser
(
prog
=
"collect species statistics"
,
description
=
"Collect scaffold statistics for each species filtered from Jovian's results"
,
usage
=
"collect_species_statistics.py -i file1 file2 -o file3 file4"
" [-h / --help]"
,
add_help
=
False
)
add_help
=
False
,
)
required
=
parser
.
add_argument_group
(
"Required arguments"
)
required
.
add_argument
(
"-i"
,
required
.
add_argument
(
"-i"
,
"--input"
,
dest
=
"input"
,
metavar
=
"file1 file2"
,
required
=
True
,
nargs
=
2
,
type
=
str
,
help
=
"Classified scaffolds table and mapped read counts."
)
help
=
"Classified scaffolds table and mapped read counts."
,
)
required
.
add_argument
(
"-o"
,
required
.
add_argument
(
"-o"
,
"--output"
,
dest
=
"output"
,
metavar
=
"file3 file4"
,
required
=
True
,
nargs
=
2
,
type
=
str
,
help
=
"Output tables (per scaffold, and summed statistics)."
)
help
=
"Output tables (per scaffold, and summed statistics)."
,
)
optional
=
parser
.
add_argument_group
(
"Optional arguments"
)
optional
.
add_argument
(
"-h"
,
"--help"
,
action
=
"help"
,
help
=
"Show this message and exit."
)
optional
.
add_argument
(
"-h"
,
"--help"
,
action
=
"help"
,
help
=
"Show this message and exit."
)
(
args
,
extra_args
)
=
parser
.
parse_known_args
()
return
(
args
)
return
args
def
main
():
"""
Main execution of the script
"""
#1. Parse and print command-line arguments
#
1. Parse and print command-line arguments
arguments
=
parse_arguments
()
message
=
(
"
\n
"
message
=
(
"
\n
"
"These are the arguments you have provided:
\n
"
" INPUT:
\n
"
"{0}
\n
"
" OUTPUT:
\n
"
"{1}
\n
"
.
format
(
arguments
.
input
,
arguments
.
output
)
)
"{1}
\n
"
.
format
(
arguments
.
input
,
arguments
.
output
)
)
print
(
message
)
#2. Open input files
species_scaffolds_df
=
pd
.
read_csv
(
arguments
.
input
[
0
],
sep
=
"
\t
"
)
# 2. Open input files
species_scaffolds_df
=
pd
.
read_csv
(
arguments
.
input
[
0
],
sep
=
"
\t
"
)
mapped_reads_df
=
pd
.
read_csv
(
arguments
.
input
[
1
],
sep
=
"
\t
"
)
mapped_reads_df
=
pd
.
read_csv
(
arguments
.
input
[
1
],
sep
=
"
\t
"
)
#3. Merge input files together
quantified_scaffolds_df
=
pd
.
merge
(
species_scaffolds_df
,
mapped_reads_df
,
# 3. Merge input files together
quantified_scaffolds_df
=
pd
.
merge
(
species_scaffolds_df
,
mapped_reads_df
,
on
=
[
"scaffold_name"
,
"Sample_name"
],
how
=
"left"
)
#4. Extract columns of interest
quantified_scaffolds_df
=
quantified_scaffolds_df
[[
"Sample_name"
,
"scaffold_name"
,
"Length"
,
"mapped_reads"
]]
how
=
"left"
,
)
#And rename them
quantified_scaffolds_df
.
rename
(
columns
=
{
"Sample_name"
:
"Sample"
,
"scaffold_name"
:
"Scaffold"
,
"Length"
:
"Scaffold_length"
,
"mapped_reads"
:
"Mapped_reads"
},
inplace
=
True
)
# 4. Extract columns of interest
quantified_scaffolds_df
=
quantified_scaffolds_df
[
[
"Sample_name"
,
"scaffold_name"
,
"Length"
,
"mapped_reads"
]
]
# And rename them
quantified_scaffolds_df
.
rename
(
columns
=
{
"Sample_name"
:
"Sample"
,
"scaffold_name"
:
"Scaffold"
,
"Length"
:
"Scaffold_length"
,
"mapped_reads"
:
"Mapped_reads"
,
},
inplace
=
True
,
)
quantified_scaffolds_df
.
to_csv
(
arguments
.
output
[
0
],
sep
=
"
\t
"
,
index
=
False
)
quantified_scaffolds_df
.
to_csv
(
arguments
.
output
[
0
],
sep
=
"
\t
"
,
index
=
False
)
#5. Aggregate statistics per sample
#
5. Aggregate statistics per sample
quantified_scaffolds_df
[
"Number_of_scaffolds"
]
=
1
statistics_per_sample
=
pd
.
DataFrame
(
quantified_scaffolds_df
.
groupby
(
[
"Sample"
]).
sum
()[[
"Number_of_scaffolds"
,
"Scaffold_length"
,
"Mapped_reads"
]]
statistics_per_sample
=
pd
.
DataFrame
(
quantified_scaffolds_df
.
groupby
([
"Sample"
]).
sum
()[
[
"Number_of_scaffolds"
,
"Scaffold_length"
,
"Mapped_reads"
]
]
)
statistics_per_sample
.
to_csv
(
arguments
.
output
[
1
],
sep
=
"
\t
"
,
index
=
True
)
#requires index for sample names
statistics_per_sample
.
to_csv
(
arguments
.
output
[
1
],
sep
=
"
\t
"
,
index
=
True
)
# requires index for sample names
return
None
return
(
None
)
## Execute script
if
__name__
==
"__main__"
:
...
...
bin/concatenate_depth_tables.py
View file @
c8aa56d3
#! /usr/bin/env python3
#Concatenate per-sample tables of depth of coverage into an overall table.
#Required input:
#
Concatenate per-sample tables of depth of coverage into an overall table.
#
Required input:
# - A number of per-sample tables as tsv files
#
#The output file will be a concatenated table with additional "Sample_name"
#
The output file will be a concatenated table with additional "Sample_name"
# column. Entering a name on the command-line is required.
#
#Example use:
#
Example use:
# python concatenate_depth_tables.py -i Depth_of_coverage-sample1-paired.tsv Depth_of_coverage-sample2-paired.tsv Depth_of_coverage-sample3-paired.tsv -o Depth_of_coverage-paired.tsv
#IMPORT required libraries---------------------------------
#
IMPORT required libraries---------------------------------
import
pandas
as
pd
import
argparse
#Define FUNCTIONS------------------------------------------
#
Define FUNCTIONS------------------------------------------
def
parse_arguments
():
"""
Parse the arguments from the command line, i.e.:
...
...
@@ -22,34 +22,41 @@ def parse_arguments():
-o/--output = output file (tab-separated table)
-h/--help = show help
"""
parser
=
argparse
.
ArgumentParser
(
prog
=
"concatenate mapped read counts"
,
parser
=
argparse
.
ArgumentParser
(
prog
=
"concatenate mapped read counts"
,
description
=
"Concatenate mapped read count tables"
,
usage
=
"concatenate_mapped_read_counts.py -i [input] -o [output]"
" [-h / --help]"
,
add_help
=
False
)
add_help
=
False
,
)
required
=
parser
.
add_argument_group
(
"Required arguments"
)
required
.
add_argument
(
'-i'
,
'--input'
,
required
.
add_argument
(
"-i"
,
"--input"
,
dest
=
"input"
,
metavar
=
''
,
metavar
=
""
,
required
=
True
,
type
=
str
,
nargs
=
'+'
,
help
=
"List of input files (counts per sample)."
)
nargs
=
"+"
,
help
=
"List of input files (counts per sample)."
,
)
required
.
add_argument
(
'-o'
,
'--output'
,
required
.
add_argument
(
"-o"
,
"--output"
,
dest
=
"output"
,
metavar
=
''
,
metavar
=
""
,
required
=
True
,
type
=
str
,
help
=
"Output file name (and directory)."
)
help
=
"Output file name (and directory)."
,
)
(
args
,
extra_args
)
=
parser
.
parse_known_args
()
return
(
args
)
return
args
def
extract_sample_name
(
filename
):
"""
...
...
@@ -59,34 +66,36 @@ To extract the sample name, remove "Mapped_read_counts-", and
everything from "_to_" until the end.
"""
if
"/"
in
filename
:
#If the directory path is attached to the filename, remove it
#
If the directory path is attached to the filename, remove it
without_path
=
filename
.
split
(
"/"
)[
-
1
]
else
:
without_path
=
filename
without_prefix
=
without_path
.
replace
(
"Depth_of_coverage-"
,
""
)
sample
=
without_prefix
[:
without_prefix
.
index
(
"_to_"
)]
sample
=
without_prefix
[:
without_prefix
.
index
(
"_to_"
)]
return
sample
return
(
sample
)
def
main
():
"""
Main execution of the script
"""
#1. Parse and show arguments
#
1. Parse and show arguments
arguments
=
parse_arguments
()
message
=
(
"
\n
"
message
=
(
"
\n
"
"These are the arguments you have provided:
\n
"
" INPUT:
\n
"
"{0},
\n
"
" OUTPUT:
\n
"
"{1}
\n
"
.
format
(
arguments
.
input
,
arguments
.
output
)
)
"{1}
\n
"
.
format
(
arguments
.
input
,
arguments
.
output
)
)
print
(
message
)
#2. Read input files and make into one dataframe
#
2. Read input files and make into one dataframe
concat_df
=
pd
.
DataFrame
()
for
file
in
arguments
.
input
:
...
...
@@ -96,12 +105,12 @@ def main():
concat_df
=
pd
.
concat
([
concat_df
,
df
])
#3. Write table to a tsv file
concat_df
.
to_csv
(
arguments
.
output
,
sep
=
'
\t
'
,
index
=
False
)
# 3. Write table to a tsv file
concat_df
.
to_csv
(
arguments
.
output
,
sep
=
"
\t
"
,
index
=
False
)
return
None
return
(
None
)
#EXECUTE script--------------------------------------------
#
EXECUTE script--------------------------------------------
if
__name__
==
"__main__"
:
main
()
bin/concatenate_read_counts.py
View file @
c8aa56d3
#! /usr/bin/env python3
#Concatenate per-sample tables of read counts into an overall table.
#Required input:
#
Concatenate per-sample tables of read counts into an overall table.
#
Required input:
# - A number of per-sample tables as tsv files
#
#The output file will be a concatenated table with additional "Sample_name"
#
The output file will be a concatenated table with additional "Sample_name"
# and "Reference" columns. Entering a name on the command-line is required.
#
#Example use:
#
Example use:
# python concatenate_read_counts.py -i sample1-paired.tsv sample2-paired.tsv sample3-paired.tsv -o All_samples-paired.tsv
#IMPORT required libraries---------------------------------
#
IMPORT required libraries---------------------------------
import
pandas
as
pd
import
argparse
#Define FUNCTIONS------------------------------------------
#
Define FUNCTIONS------------------------------------------
def
parse_arguments
():
"""
Parse the arguments from the command line, i.e.:
...
...
@@ -22,34 +22,41 @@ def parse_arguments():
-o/--output = output file (tab-separated table)
-h/--help = show help
"""
parser
=
argparse
.
ArgumentParser
(
prog
=
"concatenate mapped read counts"
,
parser
=
argparse
.
ArgumentParser
(
prog
=
"concatenate mapped read counts"
,
description
=
"Concatenate mapped read count tables"
,
usage
=
"concatenate_mapped_read_counts.py -i [input] -o [output]"
" [-h / --help]"
,
add_help
=
False
)
add_help
=
False
,
)
required
=
parser
.
add_argument_group
(
"Required arguments"
)
required
.
add_argument
(
'-i'
,
'--input'
,
required
.
add_argument
(
"-i"
,
"--input"
,
dest
=
"input"
,
metavar
=
''
,
metavar
=
""
,
required
=
True
,
type
=
str
,
nargs
=
'+'
,
help
=
"List of input files (counts per sample)."
)
nargs
=
"+"
,
help
=
"List of input files (counts per sample)."
,
)
required
.
add_argument
(
'-o'
,
'--output'
,
required
.
add_argument
(
"-o"
,
"--output"
,
dest
=
"output"
,
metavar
=
''
,
metavar
=
""
,
required
=
True
,
type
=
str
,
help
=
"Output file name (and directory)."
)
help
=
"Output file name (and directory)."
,
)
(
args
,
extra_args
)
=
parser
.
parse_known_args
()
return
(
args
)
return
args
def
extract_sample_and_reference_name
(
filename
):
"""
...
...
@@ -59,44 +66,48 @@ To extract the sample name, remove "Mapped_read_counts-", and
everything from "_to_" until the end.
"""
if
"/"
in
filename
:
#If the directory path is attached to the filename, remove it
#
If the directory path is attached to the filename, remove it
without_path
=
filename
.
split
(
"/"
)[
-
1
]
else
:
without_path
=
filename
without_prefix
=
without_path
.
replace
(
"Mapped_read_counts-"
,
""
)
sample
=
without_prefix
[:
without_prefix
.
index
(
"_to_"
)]
sample
=
without_prefix
[:
without_prefix
.
index
(
"_to_"
)]
reference
=
without_prefix
[
without_prefix
.
index
(
"_to_"
)
+
4
:
-
4
]
#Assume the .tsv file extension; hence strip the last 4 characters
#
Assume the .tsv file extension; hence strip the last 4 characters
if
"-unpaired"
in
reference
:
reference
=
reference
.
replace
(
"-unpaired"
,
""
)
elif
"-paired"
in
reference
:
reference
=
reference
.
replace
(
"-paired"
,
""
)
else
:
print
(
"There might be an unexpected suffix in the reference name:"
" %s"
%
reference
)
print
(
"There might be an unexpected suffix in the reference name:"
" %s"
%
reference
)
pass
return
(
sample
,
reference
)
return
(
sample
,
reference
)
def
main
():
"""
Main execution of the script
"""
#1. Parse and show arguments
#
1. Parse and show arguments
arguments
=
parse_arguments
()
message
=
(
"
\n
"
message
=
(
"
\n
"
"These are the arguments you have provided:
\n
"
" INPUT:
\n
"
"{0},
\n
"
" OUTPUT:
\n
"
"{1}
\n
"
.
format
(
arguments
.
input
,
arguments
.
output
)
)
"{1}
\n
"
.
format
(
arguments
.
input
,
arguments
.
output
)
)
print
(
message
)
#2. Read input files and make into one dataframe
#
2. Read input files and make into one dataframe
concat_df
=
pd
.
DataFrame
()
for
file
in
arguments
.
input
:
...
...
@@ -107,12 +118,12 @@ def main():
concat_df
=
pd
.
concat
([
concat_df
,
df
])
#3. Write table to a tsv file
concat_df
.
to_csv
(
arguments
.
output
,
sep
=
'
\t
'
,
index
=
False
)
# 3. Write table to a tsv file
concat_df
.
to_csv
(
arguments
.
output
,
sep
=
"
\t
"
,
index
=
False
)
return
None
return
(
None
)
#EXECUTE script--------------------------------------------
#
EXECUTE script--------------------------------------------
if
__name__
==
"__main__"
:
main
()
bin/create_per_sample_fasta.py
View file @
c8aa56d3
#! /usr/bin/env python3
#From the species-filtered Jovian output table, create fasta files per sample
#
From the species-filtered Jovian output table, create fasta files per sample
# for each species. E.g. 'all_taxClassified-Escherichia_coli.tsv' becomes
# 'A-Escherichia_coli-scaffolds.fasta', 'B-Escherichia_coli-scaffolds.fasta',
# and 'C-Escherichia_coli-scaffolds.fasta' if your sample names are A, B and
# C.
#Resulting fasta files are written to the same directory as the input file.
#
Resulting fasta files are written to the same directory as the input file.
#
#Example use:
#
Example use:
# python create_per_sample_fasta.py -i all_taxClassified-Escherichia_coli.tsv
from
pathlib
import
Path
import
argparse
def
parse_arguments
():
"""
Parse the arguments from the command line, i.e.:
...
...
@@ -20,41 +21,46 @@ def parse_arguments():
-s/--samples = sample names for which to generate output (necessary for snakemake)
-h/--help = show help
"""
parser
=
argparse
.
ArgumentParser
(
prog
=
"create per sample fasta"
,
parser
=
argparse
.
ArgumentParser
(
prog
=
"create per sample fasta"
,
description
=
"Create a fasta file per sample with scaffolds from the specified species"
,
usage
=
"create_per_sample_fasta.py -i file -s samples"
" [-h / --help]"
,
add_help
=
False
)
usage
=
"create_per_sample_fasta.py -i file -s samples"
" [-h / --help]"
,
add_help
=
False
,
)
required
=
parser
.
add_argument_group
(
"Required arguments"
)
required
.
add_argument
(
"-i"
,
required
.
add_argument
(
"-i"
,
"--input"
,
dest
=
"input"
,
metavar
=
"file"
,
required
=
True
,
type
=
str
,
help
=
"Classified scaffolds table for species of interest."
)
help
=
"Classified scaffolds table for species of interest."
,
)
optional
=
parser
.
add_argument_group
(
"Optional arguments"
)
optional
.
add_argument
(
"-s"
,
optional
.
add_argument
(
"-s"
,
"--samples"
,
dest
=
"samples"
,
metavar
=
"samples"
,
type
=
str
,
nargs
=
"+"
,
default
=
"None"
,
help
=
"Samples for which to create fasta files."
)
help
=
"Samples for which to create fasta files."
,
)
optional
.
add_argument
(
"-h"
,
"--help"
,
action
=
"help"
,
help
=
"Show this message and exit."
)
optional
.
add_argument
(
"-h"
,
"--help"
,
action
=
"help"
,
help
=
"Show this message and exit."
)
(
args
,
extra_args
)
=
parser
.
parse_known_args
()
return
(
args
)
return
args
def
read_and_create_fastas
(
input_file
,
samples
):
"""
...
...
@@ -63,90 +69,95 @@ def read_and_create_fastas(input_file, samples):
"""
species
=
input_file
.
split
(
"-"
)[
1
][:
-
4
]
#species is the second part of the file name, after the dash,
#
species is the second part of the file name, after the dash,
# and excluding the extension (last 4 characters)
input_dir
=
Path
(
input_file
).
parent
samples_seen
=
[]
#Keep a list of which samples have been seen, so that
#
Keep a list of which samples have been seen, so that
# for each sample a new empty file can be initiated.
with
open
(
input_file
,
'r'
)
as
read_file
:
next
(
read_file
)
#
skip the header
with
open
(
input_file
,
"r"
)
as
read_file
:
next
(
read_file
)
#
skip the header
for
line
in
read_file
:
line
=
line
.
split
()
#split the line in separate elements
#
split the line in separate elements
sample
=
line
[
0
]
#
sample is the first element
scaffold_id
=
line
[
1
]
#
scaffold id is the second element
sequence
=
line
[
-
1
]
#
sequence is the last element
sample
=
line
[
0
]
#
sample is the first element
scaffold_id
=
line
[
1
]
#
scaffold id is the second element
sequence
=
line
[
-
1
]
#
sequence is the last element
output_file
=
Path
(
input_dir
/
(
"%s-%s-scaffolds.fasta"
%
(
sample
,
species
)))
output_file
=
Path
(
input_dir
/
(
"%s-%s-scaffolds.fasta"
%
(
sample
,
species
))
)
if
not
output_file
.
exists
():
#
If the file is not there yet, create an empty file
#
If the file is not there yet, create an empty file
output_file
.
touch
()
else
:
#
And if it does exist
#
And if it does exist
if
sample
in
samples_seen
:
#
Check whether this had been seen already
#
Check whether this had been seen already
pass
else
:
# and if not, empty the file and add it to the seen list
open
(
output_file
,
'w'
).
close
()
open
(
output_file
,
"w"
).
close
()