Commit e2ba2397 authored by brpiepenbroek

Upload New File

parent 3284fd89
# Create indexes with RefSeq Release 90.
#
# Steps:
#   1. Clone refseqtools (branch old_master) into a project directory.
#   2. Download the RefSeq release 90 catalog and derive from it a gzipped
#      accession/taxID/size table, a sequence-to-taxID map and a JSON index.
#   3. Run the FilterDomain WDL workflow with Cromwell for bacteria and fungi
#      and decompress each filtered FASTA.
#
# Run this as a script (e.g. `bash <file>`) rather than sourcing it: any
# failing step terminates the shell so that later steps never run in the wrong
# directory or on missing/partial inputs.

# Activate ETE3 environment.
# Done before errexit is enabled so the activation script itself runs with the
# shell's default options; a failed activation is still fatal.
source activate ete3 \
  || { printf 'error: cannot activate the ete3 environment\n' >&2; exit 1; }

# Abort on the first failing command, including any stage of a pipeline.
set -eo pipefail

readonly home_dir=/exports/sascstudent/brian
readonly project_dir="${home_dir}/project-RefSeqRelease90"
# NOTE(review): the Cromwell jar is read from a refseqtools directory directly
# under the home directory, not from the clone made below - confirm it exists.
readonly cromwell_jar="${home_dir}/refseqtools/cromwell-36.jar"
# NOTE(review): this path spells the project directory 'project-RefseqRelease90'
# (lowercase 's'), unlike the directory created below. Kept exactly as it was;
# confirm it matches the output location configured in the workflow inputs.
readonly library_dir="${home_dir}/project-RefseqRelease90/analysis/complete90/library_filtered"

# Run the FilterDomain workflow with Cromwell for one inputs file.
# Arguments: $1 - Cromwell inputs JSON, relative to the current directory
run_filter_domain() {
  java -Dsystem.input-read-limits.lines=200000 \
    -Dconfig.file=/usr/local/sasc/config/cromwell/SGE_36.conf \
    -jar "${cromwell_jar}" run FilterDomain.wdl -i "$1"
}

# Decompress 'dustmasked.filtered.fna.gz' of one domain next to the archive.
# Arguments: $1 - domain directory name below the filtered library directory
#            $2 - file name of the uncompressed FASTA to write
unpack_filtered() {
  zcat "${library_dir}/$1/dustmasked.filtered.fna.gz" > "${library_dir}/$1/$2"
}

# Go to home directory, create project directory and go to project directory.
# -p keeps a rerun from failing when the directory already exists.
cd "${home_dir}/"
mkdir -p "${project_dir}/"
cd "${project_dir}/"

# Clone git refseqtools repository (skipped when a clone is already present,
# so the script can be rerun).
if [[ ! -d refseqtools/.git ]]; then
  git clone --single-branch -b old_master https://github.com/papanikos/refseqtools.git
fi
# Go to refseqtools directory.
cd refseqtools/

# Retrieve the RefSeq Release catalog.
wget ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/RefSeq-release90.catalog.gz

# Create a gzipped table with the accession (column 3), its taxID (column 1) and its size (column 6).
# NOTE(review): the prefixes are matched anywhere on the line, not only in the
# accession column - confirm no other column can contain e.g. "NC_".
zgrep -e "AC_" -e "NC_" -e "NW_" -e "NT_" -e "NZ_" -e "NG_" -e "NR_" RefSeq-release90.catalog.gz \
  | awk -F "\t" '{print $3"\t"$1"\t"$6}' \
  | gzip -c > Refseq90.DNA.gz

# Extract the mapping file that contains a mapping of each sequence to its taxID.
zcat Refseq90.DNA.gz | cut -f1,2 > seqid2taxid.DNA.map

# Create a JSON file where a taxID is used as a primary key and all accessions and sequence sizes are stored under it.
python convert_acc2taxid_to_json.py -i Refseq90.DNA.gz -o Refseq90.DNA.json

# Filter bacteria FASTA files with WDL pipeline.
cd wdlfilter/
run_filter_domain bacteria.json
# After filtering of the bacteria FASTA files unzip 'dustmasked.filtered.fna.gz' to 'bacterial_sequences.fna'.
unpack_filtered bacterial_filtered bacterial_sequences.fna

# Filter fungi FASTA files with WDL pipeline.
run_filter_domain fungi.json
# After filtering of the fungi FASTA files unzip 'dustmasked.filtered.fna.gz' to 'fungi_sequences.fna'.
unpack_filtered fungi_filtered fungi_sequences.fna
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment