...
 
Commits (20)
#!/bin/bash
#$ -S /bin/bash
#$ -q all.q
#$ -N centrifuge-build_bacteria
#$ -l h_vmem=12G
#$ -cwd
#$ -j Y
#$ -V
#$ -pe BWA 8
#
# Grid Engine job: build the Centrifuge index "bacteria" from the filtered
# bacterial sequences.
#
# Abort on the first failing command, unset variable or failed pipeline stage,
# so the job's exit status reflects a failed build instead of the status of
# the final echo.
set -euo pipefail

echo "Start time : $(date)"
# Log the end time on every exit path, including an aborted build.
trap 'echo "End time : $(date)"' EXIT

# Thread count passed to centrifuge-build; keep in sync with "-pe BWA 8" above.
readonly THREADS=8
readonly PROJECT_DIR=/exports/sascstudent/brian/project-RefSeqRelease90
readonly PROJECT_DIR2=/exports/sascstudent/project-RefseqRelease
readonly FILTERED_DIR=$PROJECT_DIR/analysis/complete90/library_filtered
readonly INDEX_DIR=$PROJECT_DIR/analysis/indexes

# The index is written as "$INDEX_DIR/bacteria.*"; make sure the directory
# exists before starting the build.
mkdir -p "$INDEX_DIR"

# Positional arguments: reference FASTA, then the index base name.
centrifuge-build -p "$THREADS" \
  --conversion-table "$PROJECT_DIR/refseqtools/seqid2taxid.DNA.map" \
  --taxonomy-tree "$PROJECT_DIR2/data/taxonomy/nodes.dmp" \
  --name-table "$PROJECT_DIR2/data/taxonomy/names.dmp" \
  "$FILTERED_DIR/bacterial_filtered/bacterial_sequences.fna" \
  "$INDEX_DIR/bacteria"
#!/bin/bash
#$ -S /bin/bash
#$ -q all.q
#$ -N centrifuge-build_fungi-bacteria
#$ -l h_vmem=27G
#$ -cwd
#$ -j Y
#$ -V
#$ -pe BWA 8
#
# Grid Engine job: build the combined Centrifuge index "fungi-bacteria" from
# the filtered bacterial and fungal sequences.
#
# Abort on the first failing command, unset variable or failed pipeline stage,
# so the job's exit status reflects a failed build instead of the status of
# the final echo.
set -euo pipefail

echo "Start time : $(date)"
# Log the end time on every exit path, including an aborted build.
trap 'echo "End time : $(date)"' EXIT

# Thread count passed to centrifuge-build; keep in sync with "-pe BWA 8" above.
readonly THREADS=8
readonly PROJECT_DIR=/exports/sascstudent/brian/project-RefSeqRelease90
readonly PROJECT_DIR2=/exports/sascstudent/project-RefseqRelease
readonly FILTERED_DIR=$PROJECT_DIR/analysis/complete90/library_filtered
readonly INDEX_DIR=$PROJECT_DIR/analysis/indexes

# centrifuge-build takes its reference FASTA files as ONE comma-separated
# argument. Build it explicitly so that no whitespace can sneak in between the
# two paths (which would turn the second path into the index base name).
readonly REFERENCE_FASTAS="$FILTERED_DIR/bacterial_filtered/bacterial_sequences.fna,$FILTERED_DIR/fungi_filtered/fungi_sequences.fna"

# The index is written as "$INDEX_DIR/fungi-bacteria.*"; make sure the
# directory exists before starting the build.
mkdir -p "$INDEX_DIR"

# Positional arguments: reference FASTA list, then the index base name.
centrifuge-build -p "$THREADS" \
  --conversion-table "$PROJECT_DIR/refseqtools/seqid2taxid.DNA.map" \
  --taxonomy-tree "$PROJECT_DIR2/data/taxonomy/nodes.dmp" \
  --name-table "$PROJECT_DIR2/data/taxonomy/names.dmp" \
  "$REFERENCE_FASTAS" \
  "$INDEX_DIR/fungi-bacteria"
#!/bin/bash
#$ -S /bin/bash
#$ -q all.q
#$ -N centrifuge_BioPool_bacteria
#$ -l h_vmem=2G
#$ -cwd
#$ -j Y
#$ -V
#$ -pe BWA 8
#
# Grid Engine job: classify the paired-end BioPool test reads against the
# "bacteria" Centrifuge index and convert the result into a kreport.
#
# Abort on the first failing command, unset variable or failed pipeline stage.
# Without this, centrifuge-kreport would still run after a failed
# classification and the job would report success.
set -euo pipefail

echo "Start time : $(date)"
# Log the end time on every exit path, including an aborted run.
trap 'echo "End time : $(date)"' EXIT

# Thread count passed to centrifuge; keep in sync with "-pe BWA 8" above.
readonly THREADS=8
readonly PROJECT_DIR=/exports/sascstudent/brian/project-RefSeqRelease90
readonly TESTDATA_DIR=$PROJECT_DIR/data/test_datasets
readonly OUTPUT_DIR=$PROJECT_DIR/analysis/BioPool_sample

# Index base name, shared by the classification and the kreport step.
readonly INDEX=$PROJECT_DIR/analysis/indexes/bacteria
# Common prefix of the R1/R2 FASTQ file names.
readonly SAMPLE=BioPool_BioPool_1_Cycle_02042016_CTGAAGCT-TATAGCCT_L001
# Classification output; also the input of centrifuge-kreport.
readonly CLASSIFICATION=$OUTPUT_DIR/output_BioPool_bacteria

# Both tools write into OUTPUT_DIR; make sure it exists first.
mkdir -p "$OUTPUT_DIR"

centrifuge -p "$THREADS" \
  -x "$INDEX" \
  -1 "$TESTDATA_DIR/${SAMPLE}_R1_001.fastq.gz" \
  -2 "$TESTDATA_DIR/${SAMPLE}_R2_001.fastq.gz" \
  -S "$CLASSIFICATION" \
  --report-file "$OUTPUT_DIR/report_BioPool_bacteria.tsv"

centrifuge-kreport -x "$INDEX" "$CLASSIFICATION" \
  > "$OUTPUT_DIR/kreport_BioPool_bacteria"
#!/bin/bash
#$ -S /bin/bash
#$ -q all.q
#$ -N centrifuge_BioPool_fungi-bacteria
#$ -l h_vmem=3G
#$ -cwd
#$ -j Y
#$ -V
#$ -pe BWA 8
#
# Grid Engine job: classify the paired-end BioPool test reads against the
# "fungi-bacteria" Centrifuge index and convert the result into a kreport.
#
# Abort on the first failing command, unset variable or failed pipeline stage.
# Without this, centrifuge-kreport would still run after a failed
# classification and the job would report success.
set -euo pipefail

echo "Start time : $(date)"
# Log the end time on every exit path, including an aborted run.
trap 'echo "End time : $(date)"' EXIT

# Thread count passed to centrifuge; keep in sync with "-pe BWA 8" above.
readonly THREADS=8
readonly PROJECT_DIR=/exports/sascstudent/brian/project-RefSeqRelease90
readonly TESTDATA_DIR=$PROJECT_DIR/data/test_datasets
readonly OUTPUT_DIR=$PROJECT_DIR/analysis/BioPool_sample

# Index base name, shared by the classification and the kreport step.
readonly INDEX=$PROJECT_DIR/analysis/indexes/fungi-bacteria
# Common prefix of the R1/R2 FASTQ file names.
readonly SAMPLE=BioPool_BioPool_1_Cycle_02042016_CTGAAGCT-TATAGCCT_L001
# Classification output; also the input of centrifuge-kreport.
readonly CLASSIFICATION=$OUTPUT_DIR/output_BioPool_fungi-bacteria

# Both tools write into OUTPUT_DIR; make sure it exists first.
mkdir -p "$OUTPUT_DIR"

centrifuge -p "$THREADS" \
  -x "$INDEX" \
  -1 "$TESTDATA_DIR/${SAMPLE}_R1_001.fastq.gz" \
  -2 "$TESTDATA_DIR/${SAMPLE}_R2_001.fastq.gz" \
  -S "$CLASSIFICATION" \
  --report-file "$OUTPUT_DIR/report_BioPool_fungi-bacteria.tsv"

centrifuge-kreport -x "$INDEX" "$CLASSIFICATION" \
  > "$OUTPUT_DIR/kreport_BioPool_fungi-bacteria"
#!/bin/bash
#$ -S /bin/bash
#$ -q all.q
#$ -N centrifuge_BioPool_new_nt
#$ -l h_vmem=16G
#$ -cwd
#$ -j Y
#$ -V
#$ -pe BWA 8
#
# Grid Engine job: classify the paired-end BioPool test reads against the
# shared "nt-20180303" Centrifuge index and convert the result into a kreport.
#
# Abort on the first failing command, unset variable or failed pipeline stage.
# Without this, centrifuge-kreport would still run after a failed
# classification and the job would report success.
set -euo pipefail

echo "Start time : $(date)"
# Log the end time on every exit path, including an aborted run.
trap 'echo "End time : $(date)"' EXIT

# Thread count passed to centrifuge; keep in sync with "-pe BWA 8" above.
readonly THREADS=8
readonly PROJECT_DIR=/exports/sascstudent/brian/project-RefSeqRelease90
readonly TESTDATA_DIR=$PROJECT_DIR/data/test_datasets
readonly OUTPUT_DIR=$PROJECT_DIR/analysis/BioPool_sample

# Index base name, shared by the classification and the kreport step.
readonly INDEX=/exports/genomes/metagenomics/centrifuge/nt-20180303/nt
# Common prefix of the R1/R2 FASTQ file names.
readonly SAMPLE=BioPool_BioPool_1_Cycle_02042016_CTGAAGCT-TATAGCCT_L001
# Classification output; also the input of centrifuge-kreport.
readonly CLASSIFICATION=$OUTPUT_DIR/output_BioPool_new_nt

# Both tools write into OUTPUT_DIR; make sure it exists first.
mkdir -p "$OUTPUT_DIR"

centrifuge -p "$THREADS" \
  -x "$INDEX" \
  -1 "$TESTDATA_DIR/${SAMPLE}_R1_001.fastq.gz" \
  -2 "$TESTDATA_DIR/${SAMPLE}_R2_001.fastq.gz" \
  -S "$CLASSIFICATION" \
  --report-file "$OUTPUT_DIR/report_BioPool_new_nt.tsv"

centrifuge-kreport -x "$INDEX" "$CLASSIFICATION" \
  > "$OUTPUT_DIR/kreport_BioPool_new_nt"
#!/bin/bash
#$ -S /bin/bash
#$ -q all.q
#$ -N centrifuge_BioPool_nt
#$ -l h_vmem=15G
#$ -cwd
#$ -j Y
#$ -V
#$ -pe BWA 8
#
# Grid Engine job: classify the paired-end BioPool test reads against the
# shared "nt" Centrifuge index and convert the result into a kreport.
#
# Abort on the first failing command, unset variable or failed pipeline stage.
# Without this, centrifuge-kreport would still run after a failed
# classification and the job would report success.
set -euo pipefail

echo "Start time : $(date)"
# Log the end time on every exit path, including an aborted run.
trap 'echo "End time : $(date)"' EXIT

# Thread count passed to centrifuge; keep in sync with "-pe BWA 8" above.
readonly THREADS=8
readonly PROJECT_DIR=/exports/sascstudent/brian/project-RefSeqRelease90
readonly TESTDATA_DIR=$PROJECT_DIR/data/test_datasets
readonly OUTPUT_DIR=$PROJECT_DIR/analysis/BioPool_sample

# Index base name, shared by the classification and the kreport step.
readonly INDEX=/exports/genomes/metagenomics/centrifuge/nt/nt
# Common prefix of the R1/R2 FASTQ file names.
readonly SAMPLE=BioPool_BioPool_1_Cycle_02042016_CTGAAGCT-TATAGCCT_L001
# Classification output; also the input of centrifuge-kreport.
readonly CLASSIFICATION=$OUTPUT_DIR/output_BioPool_nt

# Both tools write into OUTPUT_DIR; make sure it exists first.
mkdir -p "$OUTPUT_DIR"

centrifuge -p "$THREADS" \
  -x "$INDEX" \
  -1 "$TESTDATA_DIR/${SAMPLE}_R1_001.fastq.gz" \
  -2 "$TESTDATA_DIR/${SAMPLE}_R2_001.fastq.gz" \
  -S "$CLASSIFICATION" \
  --report-file "$OUTPUT_DIR/report_BioPool_nt.tsv"

centrifuge-kreport -x "$INDEX" "$CLASSIFICATION" \
  > "$OUTPUT_DIR/kreport_BioPool_nt"
# Create indexes with RefSeq Release 90
#
# Recipe that prepares the inputs for the index builds:
#   1. clone refseqtools and derive the accession/taxID tables from the
#      RefSeq release 90 catalog;
#   2. filter the bacteria and fungi FASTA libraries with the WDL pipeline;
#   3. decompress each filtered FASTA next to its gzipped original.

# Activate ETE3 environment. This happens before strict mode is switched on
# because the activation script is outside our control.
source activate ete3 || { echo "could not activate the ete3 environment" >&2; exit 1; }

# From here on, stop at the first failing command or pipeline stage instead of
# carrying on with missing inputs.
set -eo pipefail

BASE_DIR=/exports/sascstudent/brian
# NOTE: "RefSeq" with a capital S. The decompression steps below previously
# used "project-RefseqRelease90", which is a different path on a
# case-sensitive filesystem.
PROJECT_DIR=$BASE_DIR/project-RefSeqRelease90
FILTERED_DIR=$PROJECT_DIR/analysis/complete90/library_filtered
CROMWELL_CONF=/usr/local/sasc/config/cromwell/SGE_36.conf
# NOTE(review): this jar is referenced outside the project directory that is
# created below — confirm that this is where cromwell-36.jar really lives.
CROMWELL_JAR=$BASE_DIR/refseqtools/cromwell-36.jar

# Run the FilterDomain.wdl workflow with the given inputs JSON ($1).
# Must be called from the wdlfilter/ directory.
run_filter_domain() {
  local inputs_json=$1
  java -Dsystem.input-read-limits.lines=200000 \
    -Dconfig.file="$CROMWELL_CONF" \
    -jar "$CROMWELL_JAR" run FilterDomain.wdl -i "$inputs_json"
}

# Create the project directory (no error if it already exists) and go there.
mkdir -p "$PROJECT_DIR"
cd "$PROJECT_DIR"

# Clone git refseqtools repository, unless a previous run already did.
if [[ ! -d refseqtools/.git ]]; then
  git clone --single-branch -b old_master https://github.com/papanikos/refseqtools.git
fi

# Go to refseqtools directory.
cd refseqtools/

# Retrieve the RefSeq Release catalog.
wget ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/RefSeq-release90.catalog.gz

# Create a gzipped table with the accession (column 3), its taxID (column 1) and its size (column 6).
zgrep -e "AC_" -e "NC_" -e "NW_" -e "NT_" -e "NZ_" -e "NG_" -e "NR_" RefSeq-release90.catalog.gz \
  | awk -F "\t" '{print $3"\t"$1"\t"$6}' \
  | gzip -c > Refseq90.DNA.gz

# Extract the mapping file that contains a mapping of each sequence to its taxID.
zcat Refseq90.DNA.gz | cut -f1,2 > seqid2taxid.DNA.map

# Create a JSON file where a taxID is used as a primary key and all accessions
# and sequence sizes are stored under it. The output is named RefSeq90.DNA.json
# (capital S) because that is the file name the workflow's
# "FilterDomainFastas.refseqJson" input points at.
python convert_acc2taxid_to_json.py -i Refseq90.DNA.gz -o RefSeq90.DNA.json

# Filter bacteria FASTA files with WDL pipeline.
cd wdlfilter/
run_filter_domain bacteria.json

# After filtering of the bacteria FASTA files unzip 'dustmasked.filtered.fna.gz' to 'bacterial_sequences.fna'.
zcat "$FILTERED_DIR/bacterial_filtered/dustmasked.filtered.fna.gz" \
  > "$FILTERED_DIR/bacterial_filtered/bacterial_sequences.fna"

# Filter fungi FASTA files with WDL pipeline.
run_filter_domain fungi.json

# After filtering of the fungi FASTA files unzip 'dustmasked.filtered.fna.gz' to 'fungi_sequences.fna'.
zcat "$FILTERED_DIR/fungi_filtered/dustmasked.filtered.fna.gz" \
  > "$FILTERED_DIR/fungi_filtered/fungi_sequences.fna"
{
"FilterDomainFastas.filterScriptPath": "/exports/sascstudent/brian/project-RefSeqRelease90/refseqtools/filter_fasta.py",
"FilterDomainFastas.includeAccFile": "/exports/sascstudent/project-RefseqRelease/data/accession_sets/bacteria_rep/accessions.txt",
"FilterDomainFastas.domainInputDir": "/exports/sascstudent/project-RefseqRelease/data/release90/library/bacteria",
"FilterDomainFastas.taxDbPath": "/exports/sascstudent/project-RefseqRelease/data/taxonomy/ete3_taxadb",
"FilterDomainFastas.outputDir": "/exports/sascstudent/brian/project-RefSeqRelease90/analysis/complete90/library_filtered/bacterial_filtered",
"FilterDomainFastas.FilterFasta.preCommand": "source activate ete3",
"FilterDomainFastas.dustmaskerExe": "/home/bpiepenbroek/ncbi-blast-2.7.1+/bin/dustmasker",
"FilterDomainFastas.refseqJson": "/exports/sascstudent/brian/project-RefSeqRelease90/refseqtools/RefSeq90.DNA.json"
}
{
"FilterDomainFastas.filterScriptPath": "/exports/sascstudent/brian/project-RefSeqRelease90/refseqtools/filter_fasta.py",
"FilterDomainFastas.domainInputDir": "/exports/sascstudent/project-RefseqRelease/data/release90/library/fungi",
"FilterDomainFastas.taxDbPath": "/exports/sascstudent/project-RefseqRelease/data/taxonomy/ete3_taxadb",
"FilterDomainFastas.outputDir": "/exports/sascstudent/brian/project-RefSeqRelease90/analysis/complete90/library_filtered/fungi_filtered",
"FilterDomainFastas.FilterFasta.preCommand": "source activate ete3",
"FilterDomainFastas.dustmaskerExe": "/home/bpiepenbroek/ncbi-blast-2.7.1+/bin/dustmasker",
"FilterDomainFastas.refseqJson": "/exports/sascstudent/brian/project-RefSeqRelease90/refseqtools/RefSeq90.DNA.json",
"FilterDomainFastas.dustmaskOnly": true
}