Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sam Nooij
Jovian screener
Commits
c8aa56d3
Commit
c8aa56d3
authored
Jul 28, 2020
by
Sam Nooij
Browse files
Reformat Python code with Black
parent
6ce6ee4e
Changes
11
Expand all
Hide whitespace changes
Inline
Side-by-side
bin/collect_species_statistics.py
View file @
c8aa56d3
#! /usr/bin/env python3
#! /usr/bin/env python3
#Collect scaffold information for a species filtered from Jovian's
#
Collect scaffold information for a species filtered from Jovian's
# classification table. (Which scaffolds, their length and number
# classification table. (Which scaffolds, their length and number
# of reads mapped to them.)
# of reads mapped to them.)
#
#
# Example use:
# Example use:
...
@@ -9,19 +9,19 @@
...
@@ -9,19 +9,19 @@
# -i data/all_taxClassified-Escherichia_coli.tsv data/Mapped_read_counts.tsv \
# -i data/all_taxClassified-Escherichia_coli.tsv data/Mapped_read_counts.tsv \
# -o results/Escherichia_coli-scaffolds_per_sample.tsv results/Escherichia_coli-stats_per_sample.tsv \
# -o results/Escherichia_coli-scaffolds_per_sample.tsv results/Escherichia_coli-stats_per_sample.tsv \
#
#
#Three required input arguments are:
#
Three required input arguments are:
# -i/--input for the two required tables (see example)
# -i/--input for the two required tables (see example)
# -o/--output for the result tables (see example)
# -o/--output for the result tables (see example)
## Import required libraries
## Import required libraries
import
sys
#to abort the script when an error occurs
import
sys
#
to abort the script when an error occurs
import
argparse
#to parse command-line arguments
import
argparse
#
to parse command-line arguments
import
pandas
as
pd
#to work with dataframes (tabular data)
import
pandas
as
pd
#
to work with dataframes (tabular data)
import
numpy
as
np
#for calculations (summing contig stats)
import
numpy
as
np
#
for calculations (summing contig stats)
import
re
#to extract sample names from multiqc table
import
re
#
to extract sample names from multiqc table
## Define functions:
## Define functions:
#1. Parse command-line arguments
#
1. Parse command-line arguments
def
parse_arguments
():
def
parse_arguments
():
"""
"""
Parse the arguments from the command line, i.e.:
Parse the arguments from the command line, i.e.:
...
@@ -29,99 +29,112 @@ def parse_arguments():
...
@@ -29,99 +29,112 @@ def parse_arguments():
-o/--output = 2 output files
-o/--output = 2 output files
-h/--help = show help
-h/--help = show help
"""
"""
parser
=
argparse
.
ArgumentParser
(
prog
=
"collect species statistics"
,
parser
=
argparse
.
ArgumentParser
(
description
=
"Collect scaffold statistics for each species filtered from Jovian's results"
,
prog
=
"collect species statistics"
,
usage
=
"collect_species_statistics.py -i file1 file2 -o file3 file4"
description
=
"Collect scaffold statistics for each species filtered from Jovian's results"
,
" [-h / --help]"
,
usage
=
"collect_species_statistics.py -i file1 file2 -o file3 file4"
add_help
=
False
)
" [-h / --help]"
,
add_help
=
False
,
)
required
=
parser
.
add_argument_group
(
"Required arguments"
)
required
=
parser
.
add_argument_group
(
"Required arguments"
)
required
.
add_argument
(
"-i"
,
required
.
add_argument
(
"--input"
,
"-i"
,
dest
=
"input"
,
"--input"
,
metavar
=
"file1 file2"
,
dest
=
"input"
,
required
=
True
,
metavar
=
"file1 file2"
,
nargs
=
2
,
required
=
True
,
type
=
str
,
nargs
=
2
,
help
=
"Classified scaffolds table and mapped read counts."
)
type
=
str
,
help
=
"Classified scaffolds table and mapped read counts."
,
required
.
add_argument
(
"-o"
,
)
"--output"
,
dest
=
"output"
,
required
.
add_argument
(
metavar
=
"file3 file4"
,
"-o"
,
required
=
True
,
"--output"
,
nargs
=
2
,
dest
=
"output"
,
type
=
str
,
metavar
=
"file3 file4"
,
help
=
"Output tables (per scaffold, and summed statistics)."
)
required
=
True
,
nargs
=
2
,
type
=
str
,
help
=
"Output tables (per scaffold, and summed statistics)."
,
)
optional
=
parser
.
add_argument_group
(
"Optional arguments"
)
optional
=
parser
.
add_argument_group
(
"Optional arguments"
)
optional
.
add_argument
(
"-h"
,
optional
.
add_argument
(
"--help"
,
"-h"
,
"--help"
,
action
=
"help"
,
help
=
"Show this message and exit."
action
=
"help"
,
)
help
=
"Show this message and exit."
)
(
args
,
extra_args
)
=
parser
.
parse_known_args
()
(
args
,
extra_args
)
=
parser
.
parse_known_args
()
return
(
args
)
return
args
def
main
():
def
main
():
"""
"""
Main execution of the script
Main execution of the script
"""
"""
#1. Parse and print command-line arguments
#
1. Parse and print command-line arguments
arguments
=
parse_arguments
()
arguments
=
parse_arguments
()
message
=
(
"
\n
"
message
=
(
"These are the arguments you have provided:
\n
"
"
\n
"
" INPUT:
\n
"
"These are the arguments you have provided:
\n
"
"{0}
\n
"
" INPUT:
\n
"
" OUTPUT:
\n
"
"{0}
\n
"
"{1}
\n
"
.
format
(
arguments
.
input
,
" OUTPUT:
\n
"
arguments
.
output
))
"{1}
\n
"
.
format
(
arguments
.
input
,
arguments
.
output
)
)
print
(
message
)
print
(
message
)
#2. Open input files
# 2. Open input files
species_scaffolds_df
=
pd
.
read_csv
(
arguments
.
input
[
0
],
species_scaffolds_df
=
pd
.
read_csv
(
arguments
.
input
[
0
],
sep
=
"
\t
"
)
sep
=
"
\t
"
)
mapped_reads_df
=
pd
.
read_csv
(
arguments
.
input
[
1
],
sep
=
"
\t
"
)
mapped_reads_df
=
pd
.
read_csv
(
arguments
.
input
[
1
],
sep
=
"
\t
"
)
# 3. Merge input files together
quantified_scaffolds_df
=
pd
.
merge
(
#3. Merge input files together
species_scaffolds_df
,
quantified_scaffolds_df
=
pd
.
merge
(
species_scaffolds_df
,
mapped_reads_df
,
mapped_reads_df
,
on
=
[
"scaffold_name"
,
"Sample_name"
],
on
=
[
"scaffold_name"
,
"Sample_name"
],
how
=
"left"
)
how
=
"left"
,
)
#4. Extract columns of interest
quantified_scaffolds_df
=
quantified_scaffolds_df
[[
"Sample_name"
,
# 4. Extract columns of interest
"scaffold_name"
,
"Length"
,
"mapped_reads"
]]
quantified_scaffolds_df
=
quantified_scaffolds_df
[
[
"Sample_name"
,
"scaffold_name"
,
"Length"
,
"mapped_reads"
]
#And rename them
]
quantified_scaffolds_df
.
rename
(
columns
=
{
"Sample_name"
:
"Sample"
,
"scaffold_name"
:
"Scaffold"
,
# And rename them
"Length"
:
"Scaffold_length"
,
quantified_scaffolds_df
.
rename
(
"mapped_reads"
:
"Mapped_reads"
},
columns
=
{
inplace
=
True
)
"Sample_name"
:
"Sample"
,
"scaffold_name"
:
"Scaffold"
,
quantified_scaffolds_df
.
to_csv
(
arguments
.
output
[
0
],
"Length"
:
"Scaffold_length"
,
sep
=
"
\t
"
,
index
=
False
)
"mapped_reads"
:
"Mapped_reads"
,
},
#5. Aggregate statistics per sample
inplace
=
True
,
)
quantified_scaffolds_df
.
to_csv
(
arguments
.
output
[
0
],
sep
=
"
\t
"
,
index
=
False
)
# 5. Aggregate statistics per sample
quantified_scaffolds_df
[
"Number_of_scaffolds"
]
=
1
quantified_scaffolds_df
[
"Number_of_scaffolds"
]
=
1
statistics_per_sample
=
pd
.
DataFrame
(
quantified_scaffolds_df
.
groupby
(
statistics_per_sample
=
pd
.
DataFrame
(
[
"Sample"
]).
sum
()[[
"Number_of_scaffolds"
,
"Scaffold_length"
,
quantified_scaffolds_df
.
groupby
([
"Sample"
]).
sum
()[
"Mapped_reads"
]]
[
"Number_of_scaffolds"
,
"Scaffold_length"
,
"Mapped_reads"
]
]
)
)
statistics_per_sample
.
to_csv
(
arguments
.
output
[
1
],
statistics_per_sample
.
to_csv
(
arguments
.
output
[
1
],
sep
=
"
\t
"
,
index
=
True
)
sep
=
"
\t
"
,
index
=
True
)
# requires index for sample names
#requires index for sample names
return
None
return
(
None
)
## Execute script
## Execute script
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
main
()
main
()
\ No newline at end of file
bin/concatenate_depth_tables.py
View file @
c8aa56d3
#! /usr/bin/env python3
#! /usr/bin/env python3
#Concatenate per-sample tables of depth of coverage into an overall table.
#
Concatenate per-sample tables of depth of coverage into an overall table.
#Required input:
#
Required input:
# - A number of per-sample tables as tsv files
# - A number of per-sample tables as tsv files
#
#
#The output file will be a concatenated table with additional "Sample_name"
#
The output file will be a concatenated table with additional "Sample_name"
# column. Entering a name on the command-line is required.
# column. Entering a name on the command-line is required.
#
#
#Example use:
#
Example use:
# python concatenate_depth_tables.py -i Depth_of_coverage-sample1-paired.tsv Depth_of_coverage-sample2-paired.tsv Depth_of_coverage-sample3-paired.tsv -o Depth_of_coverage-paired.tsv
# python concatenate_depth_tables.py -i Depth_of_coverage-sample1-paired.tsv Depth_of_coverage-sample2-paired.tsv Depth_of_coverage-sample3-paired.tsv -o Depth_of_coverage-paired.tsv
#IMPORT required libraries---------------------------------
#
IMPORT required libraries---------------------------------
import
pandas
as
pd
import
pandas
as
pd
import
argparse
import
argparse
#Define FUNCTIONS------------------------------------------
#
Define FUNCTIONS------------------------------------------
def
parse_arguments
():
def
parse_arguments
():
"""
"""
Parse the arguments from the command line, i.e.:
Parse the arguments from the command line, i.e.:
...
@@ -22,34 +22,41 @@ def parse_arguments():
...
@@ -22,34 +22,41 @@ def parse_arguments():
-o/--output = output file (tab-separated table)
-o/--output = output file (tab-separated table)
-h/--help = show help
-h/--help = show help
"""
"""
parser
=
argparse
.
ArgumentParser
(
prog
=
"concatenate mapped read counts"
,
parser
=
argparse
.
ArgumentParser
(
description
=
"Concatenate mapped read count tables"
,
prog
=
"concatenate mapped read counts"
,
usage
=
"concatenate_mapped_read_counts.py -i [input] -o [output]"
description
=
"Concatenate mapped read count tables"
,
" [-h / --help]"
,
usage
=
"concatenate_mapped_read_counts.py -i [input] -o [output]"
add_help
=
False
)
" [-h / --help]"
,
add_help
=
False
,
)
required
=
parser
.
add_argument_group
(
"Required arguments"
)
required
=
parser
.
add_argument_group
(
"Required arguments"
)
required
.
add_argument
(
'-i'
,
required
.
add_argument
(
'--input'
,
"-i"
,
dest
=
"input"
,
"--input"
,
metavar
=
''
,
dest
=
"input"
,
required
=
True
,
metavar
=
""
,
type
=
str
,
required
=
True
,
nargs
=
'+'
,
type
=
str
,
help
=
"List of input files (counts per sample)."
)
nargs
=
"+"
,
help
=
"List of input files (counts per sample)."
,
required
.
add_argument
(
'-o'
,
)
'--output'
,
dest
=
"output"
,
required
.
add_argument
(
metavar
=
''
,
"-o"
,
required
=
True
,
"--output"
,
type
=
str
,
dest
=
"output"
,
help
=
"Output file name (and directory)."
)
metavar
=
""
,
required
=
True
,
type
=
str
,
help
=
"Output file name (and directory)."
,
)
(
args
,
extra_args
)
=
parser
.
parse_known_args
()
(
args
,
extra_args
)
=
parser
.
parse_known_args
()
return
(
args
)
return
args
def
extract_sample_name
(
filename
):
def
extract_sample_name
(
filename
):
"""
"""
...
@@ -59,49 +66,51 @@ To extract the sample name, remove "Mapped_read_counts-", and
...
@@ -59,49 +66,51 @@ To extract the sample name, remove "Mapped_read_counts-", and
everything from "_to_" until the end.
everything from "_to_" until the end.
"""
"""
if
"/"
in
filename
:
if
"/"
in
filename
:
#If the directory path is attached to the filename, remove it
#
If the directory path is attached to the filename, remove it
without_path
=
filename
.
split
(
"/"
)[
-
1
]
without_path
=
filename
.
split
(
"/"
)[
-
1
]
else
:
else
:
without_path
=
filename
without_path
=
filename
without_prefix
=
without_path
.
replace
(
"Depth_of_coverage-"
,
""
)
without_prefix
=
without_path
.
replace
(
"Depth_of_coverage-"
,
""
)
sample
=
without_prefix
[:
without_prefix
.
index
(
"_to_"
)]
sample
=
without_prefix
[:
without_prefix
.
index
(
"_to_"
)]
return
(
sample
)
return
sample
def
main
():
def
main
():
"""
"""
Main execution of the script
Main execution of the script
"""
"""
#1. Parse and show arguments
#
1. Parse and show arguments
arguments
=
parse_arguments
()
arguments
=
parse_arguments
()
message
=
(
"
\n
"
message
=
(
"These are the arguments you have provided:
\n
"
"
\n
"
" INPUT:
\n
"
"These are the arguments you have provided:
\n
"
"{0},
\n
"
" INPUT:
\n
"
" OUTPUT:
\n
"
"{0},
\n
"
"{1}
\n
"
.
format
(
arguments
.
input
,
" OUTPUT:
\n
"
arguments
.
output
))
"{1}
\n
"
.
format
(
arguments
.
input
,
arguments
.
output
)
)
print
(
message
)
print
(
message
)
#2. Read input files and make into one dataframe
#
2. Read input files and make into one dataframe
concat_df
=
pd
.
DataFrame
()
concat_df
=
pd
.
DataFrame
()
for
file
in
arguments
.
input
:
for
file
in
arguments
.
input
:
sample
=
extract_sample_name
(
file
)
sample
=
extract_sample_name
(
file
)
df
=
pd
.
read_csv
(
file
,
sep
=
"
\t
"
)
df
=
pd
.
read_csv
(
file
,
sep
=
"
\t
"
)
df
[
"Sample_name"
]
=
sample
df
[
"Sample_name"
]
=
sample
concat_df
=
pd
.
concat
([
concat_df
,
df
])
concat_df
=
pd
.
concat
([
concat_df
,
df
])
#3. Write table to a tsv file
concat_df
.
to_csv
(
arguments
.
output
,
sep
=
'
\t
'
,
index
=
False
)
return
(
None
)
# 3. Write table to a tsv file
concat_df
.
to_csv
(
arguments
.
output
,
sep
=
"
\t
"
,
index
=
False
)
return
None
#EXECUTE script--------------------------------------------
#
EXECUTE script--------------------------------------------
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
main
()
main
()
\ No newline at end of file
bin/concatenate_read_counts.py
View file @
c8aa56d3
#! /usr/bin/env python3
#! /usr/bin/env python3
#Concatenate per-sample tables of read counts into an overall table.
#
Concatenate per-sample tables of read counts into an overall table.
#Required input:
#
Required input:
# - A number of per-sample tables as tsv files
# - A number of per-sample tables as tsv files
#
#
#The output file will be a concatenated table with additional "Sample_name"
#
The output file will be a concatenated table with additional "Sample_name"
# and "Reference" columns. Entering a name on the command-line is required.
# and "Reference" columns. Entering a name on the command-line is required.
#
#
#Example use:
#
Example use:
# python concatenate_read_counts.py -i sample1-paired.tsv sample2-paired.tsv sample3-paired.tsv -o All_samples-paired.tsv
# python concatenate_read_counts.py -i sample1-paired.tsv sample2-paired.tsv sample3-paired.tsv -o All_samples-paired.tsv
#IMPORT required libraries---------------------------------
#
IMPORT required libraries---------------------------------
import
pandas
as
pd
import
pandas
as
pd
import
argparse
import
argparse
#Define FUNCTIONS------------------------------------------
#
Define FUNCTIONS------------------------------------------
def
parse_arguments
():
def
parse_arguments
():
"""
"""
Parse the arguments from the command line, i.e.:
Parse the arguments from the command line, i.e.:
...
@@ -22,34 +22,41 @@ def parse_arguments():
...
@@ -22,34 +22,41 @@ def parse_arguments():
-o/--output = output file (tab-separated table)
-o/--output = output file (tab-separated table)
-h/--help = show help
-h/--help = show help
"""
"""
parser
=
argparse
.
ArgumentParser
(
prog
=
"concatenate mapped read counts"
,
parser
=
argparse
.
ArgumentParser
(
description
=
"Concatenate mapped read count tables"
,
prog
=
"concatenate mapped read counts"
,
usage
=
"concatenate_mapped_read_counts.py -i [input] -o [output]"
description
=
"Concatenate mapped read count tables"
,
" [-h / --help]"
,
usage
=
"concatenate_mapped_read_counts.py -i [input] -o [output]"
add_help
=
False
)
" [-h / --help]"
,
add_help
=
False
,
)
required
=
parser
.
add_argument_group
(
"Required arguments"
)
required
=
parser
.
add_argument_group
(
"Required arguments"
)
required
.
add_argument
(
'-i'
,
required
.
add_argument
(
'--input'
,
"-i"
,
dest
=
"input"
,
"--input"
,
metavar
=
''
,
dest
=
"input"
,
required
=
True
,
metavar
=
""
,
type
=
str
,
required
=
True
,
nargs
=
'+'
,
type
=
str
,
help
=
"List of input files (counts per sample)."
)
nargs
=
"+"
,
help
=
"List of input files (counts per sample)."
,
required
.
add_argument
(
'-o'
,
)
'--output'
,
dest
=
"output"
,
required
.
add_argument
(
metavar
=
''
,
"-o"
,
required
=
True
,
"--output"
,
type
=
str
,
dest
=
"output"
,
help
=
"Output file name (and directory)."
)
metavar
=
""
,
required
=
True
,
type
=
str
,
help
=
"Output file name (and directory)."
,
)
(
args
,
extra_args
)
=
parser
.
parse_known_args
()
(
args
,
extra_args
)
=
parser
.
parse_known_args
()
return
(
args
)
return
args
def
extract_sample_and_reference_name
(
filename
):
def
extract_sample_and_reference_name
(
filename
):
"""
"""
...
@@ -59,60 +66,64 @@ To extract the sample name, remove "Mapped_read_counts-", and
...
@@ -59,60 +66,64 @@ To extract the sample name, remove "Mapped_read_counts-", and
everything from "_to_" until the end.
everything from "_to_" until the end.
"""
"""
if
"/"
in
filename
:
if
"/"
in
filename
:
#If the directory path is attached to the filename, remove it
#
If the directory path is attached to the filename, remove it
without_path
=
filename
.
split
(
"/"
)[
-
1
]
without_path
=
filename
.
split
(
"/"
)[
-
1
]
else
:
else
:
without_path
=
filename
without_path
=
filename
without_prefix
=
without_path
.
replace
(
"Mapped_read_counts-"
,
""
)
without_prefix
=
without_path
.
replace
(
"Mapped_read_counts-"
,
""
)
sample
=
without_prefix
[:
without_prefix
.
index
(
"_to_"
)]
sample
=
without_prefix
[:
without_prefix
.
index
(
"_to_"
)]
reference
=
without_prefix
[
without_prefix
.
index
(
"_to_"
)
+
4
:
-
4
]
reference
=
without_prefix
[
without_prefix
.
index
(
"_to_"
)
+
4
:
-
4
]
#Assume the .tsv file extension; hence strip the last 4 characters
#
Assume the .tsv file extension; hence strip the last 4 characters
if
"-unpaired"
in
reference
:
if
"-unpaired"
in
reference
:
reference
=
reference
.
replace
(
"-unpaired"
,
""
)
reference
=
reference
.
replace
(
"-unpaired"
,
""
)
elif
"-paired"
in
reference
:
elif
"-paired"
in
reference
:
reference
=
reference
.
replace
(
"-paired"
,
""
)
reference
=
reference
.
replace
(
"-paired"
,
""
)
else
:
else
:
print
(
"There might be an unexpected suffix in the reference name:"
print
(
" %s"
%
reference
)
"There might be an unexpected suffix in the reference name:"
" %s"
%
reference
)
pass
pass
return
(
sample
,
reference
)
return
(
sample
,
reference
)
def
main
():
def
main
():
"""
"""
Main execution of the script
Main execution of the script
"""
"""
#1. Parse and show arguments
#
1. Parse and show arguments
arguments
=
parse_arguments
()
arguments
=
parse_arguments
()
message
=
(
"
\n
"
message
=
(
"These are the arguments you have provided:
\n
"
"
\n
"
" INPUT:
\n
"
"These are the arguments you have provided:
\n
"
"{0},
\n
"
" INPUT:
\n
"
" OUTPUT:
\n
"
"{0},
\n
"
"{1}
\n
"
.
format
(
arguments
.
input
,
" OUTPUT:
\n
"
arguments
.
output
))
"{1}
\n
"
.
format
(
arguments
.
input
,
arguments
.
output
)
)
print
(
message
)