Commit 4e7fe3ab authored by Sander Bollen's avatar Sander Bollen
Browse files

until genotyping

parent 26862cc0
import json
from os.path import join
from os import mkdir
import re
_run_re = re.compile('([\w\d-]+)_([\w\d-]+)')
OUT_DIR = config.get("OUTPUT_DIR")
REFERENCE = config.get("REFERENCE")
JAVA = config.get("JAVA")
GATK = config.get("GATK")
DBSNP = config.get("DBSNP")
ONETHOUSAND = config.get("ONETHOUSAND")
HAPMAP = config.get("HAPMAP")
QUEUE = config.get("QUEUE")
BED = config.get("BED")
REFFLAT = config.get("REFFLAT")
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
_this_dir = workflow.current_basedir
GSC = join(join(_this_dir, "src"), "gc.sc")
CGSC = join(join(_this_dir, "src"), "cg.sc")
env_dir = join(_this_dir, "envs")
main_env = join(_this_dir, "environment.yml")
settings_template = join(join(_this_dir, "templates"), "pipeline_settings.md.j2")
with open(config.get("SAMPLE_CONFIG")) as handle:
SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()
RUN_NAME = SAMPLE_CONFIG['run_name'] # this should match regex ([\w\d-]+)_([\w\d-]+)
def out_path(path):
return join(OUT_DIR, path)
try:
mkdir(out_path("tmp"))
except OSError:
pass
def get_r1(wildcards):
s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
r1 = [x['R1'] for _, x in s['libraries'].items()]
return r1
def get_r2(wildcards):
s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
r2 = [x['R2'] for _, x in s['libraries'].items()]
return r2
def sample_gender(wildcards):
sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
return sam.get("gender", "null")
rule all:
input:
combined=out_path("multisample/genotyped.vcf.gz")
rule genome:
input: REFERENCE
output: out_path("current.genome")
shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"
rule merge_r1:
input: get_r1
output: temp(out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"))
shell: "cat {input} > {output}"
rule merge_r2:
input: get_r2
output: temp(out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"))
shell: "cat {input} > {output}"
rule sickle:
input:
r1 = out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
r2 = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
output:
r1 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq")),
r2 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")),
s = out_path("{sample}/pre_process/{sample}.trimmed_singles.fastq"),
conda: "envs/sickle.yml"
shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} " \
"-p {output.r2} -s {output.s}"
rule cutadapt:
input:
r1 = out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
r2 = out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")
output:
r1 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq")),
r2 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"))
conda: "envs/cutadapt.yml"
shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} " \
"{input.r1} -p {output.r2} {input.r2}"
rule align:
input:
r1 = out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
r2 = out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"),
ref = REFERENCE
params:
rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
output: temp(out_path("{sample}/bams/{sample}.sorted.bam"))
conda: "envs/bwa.yml"
shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} " \
"| picard SortSam CREATE_INDEX=TRUE TMP_DIR=null " \
"INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"
rule markdup:
input:
bam = out_path("{sample}/bams/{sample}.sorted.bam"),
params:
tmp = out_path("tmp")
output:
bam = temp(out_path("{sample}/bams/{sample}.markdup.bam")),
metrics = out_path("{sample}/bams/{sample}.markdup.metrics")
conda: "envs/picard.yml"
shell: "picard MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={params.tmp} " \
"INPUT={input.bam} OUTPUT={output.bam} " \
"METRICS_FILE={output.metrics} " \
"MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"
rule baserecal:
input:
bam = out_path("{sample}/bams/{sample}.markdup.bam"),
java = JAVA,
gatk = GATK,
ref = REFERENCE,
dbsnp = DBSNP,
one1kg = ONETHOUSAND,
hapmap = HAPMAP
output:
grp = out_path("{sample}/bams/{sample}.baserecal.grp")
conda: "envs/gatk.yml"
shell: "{input.java} -jar {input.gatk} -T BaseRecalibrator " \
"-I {input.bam} -o {output.grp} -nct 8 -R {input.ref} " \
"-cov ReadGroupCovariate -cov QualityScoreCovariate " \
"-cov CycleCovariate -cov ContextCovariate -knownSites " \
"{input.dbsnp} -knownSites {input.one1kg} " \
"-knownSites {input.hapmap}"
rule printreads:
input:
grp=out_path("{sample}/bams/{sample}.baserecal.grp"),
bam=out_path("{sample}/bams/{sample}.markdup.bam"),
java=JAVA,
gatk=GATK,
ref=REFERENCE
output:
bam=out_path("{sample}/bams/{sample}.baserecal.bam"),
bai=out_path("{sample}/bams/{sample}.baserecal.bai")
conda: "envs/gatk.yml"
shell: "{input.java} -jar {input.gatk} -T PrintReads -I {input.bam} "\
"-o {output.bam} -R {input.ref} -BQSR {input.grp}"
rule qdir:
params:
qdir=out_path("{sample}/.qdir")
output:
aux=out_path("{sample}/.qdir/.aux")
shell: "touch {output.aux}"
rule gvcf:
input:
bam=out_path("{sample}/bams/{sample}.baserecal.bam"),
queue=QUEUE,
dbsnp=DBSNP,
ref=REFERENCE,
gc=GSC,
qaux=out_path("{sample}/.qdir/.aux")
params:
qdir=out_path("{sample}/.qdir")
output:
gvcf=out_path("{sample}/vcf/{sample}.g.vcf.gz")
conda: "envs/gatk.yml"
shell: "java -jar {input.queue} -S {nput.gc} -I {input.bam} "\
"-D {input.dbsnp} -R {input.ref} -o {output.gvcf} "\
"-jobResReq 'h_vmem=10G' -run -qsub -jobParaEnv BWA "\
"-runDir {params.qdir}"
rule genotype:
input:
gvcfs=expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"), sample=SAMPLES),
queue=QUEUE,
ref=REFERENCE,
cg=CGSC
params:
li=" -I ".join(expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"), sample=SAMPLES))
output:
combined=out_path("multisample/genotyped.vcf.gz")
conda: "envs/gatk.yml"
shell: "java -jar {input.queue} -S {input.cg} -I {params.li} "\
"-R {inut.ref} -o {output.combined} "\
"-jobResReq 'h_vmem=10G' -run -qsub -jobParaEnv BWA"
name: hutspot
channels:
- bioconda
- conda-forge
- defaults
- r
dependencies:
- cyvcf2=0.8.0=py36_0
- pyvcf=0.6.8=py36_0
- backports=1.0=py36_1
- backports.functools_lru_cache=1.4=py36_1
- ca-certificates=2017.7.27.1=0
- certifi=2017.7.27.1=py36_0
- click=6.7=py36_0
- coloredlogs=7.1=py36_0
- cycler=0.10.0=py36_0
- dbus=1.10.22=0
- expat=2.2.1=0
- fontconfig=2.12.1=5
- freetype=2.7=1
- gettext=0.19.7=1
- glib=2.53.5=0
- gst-plugins-base=1.8.0=0
- gstreamer=1.8.0=1
- humanfriendly=4.4=py36_0
- icu=58.1=1
- jpeg=9b=2
- libffi=3.2.1=3
- libiconv=1.14=4
- libpng=1.6.28=1
- libxcb=1.12=1
- libxml2=2.9.5=0
- matplotlib=2.1.0=py36_1
- ncurses=5.9=10
- openssl=1.0.2l=0
- pandas=0.21.0=py36_0
- patsy=0.4.1=py36_0
- pcre=8.39=0
- pip=9.0.1=py36_0
- pyparsing=2.2.0=py36_0
- pyqt=5.6.0=py36_4
- python=3.6.3=0
- python-dateutil=2.6.1=py36_0
- pytz=2017.3=py_1
- qt=5.6.2=3
- readline=6.2=0
- seaborn=0.8.1=py36_0
- setuptools=36.6.0=py36_1
- sip=4.18=py36_1
- six=1.11.0=py36_1
- sqlite=3.13.0=1
- statsmodels=0.8.0=py36_0
- tk=8.5.19=2
- tornado=4.5.2=py36_0
- wheel=0.30.0=py_1
- xorg-libxau=1.0.8=3
- xorg-libxdmcp=1.1.2=3
- xz=5.2.3=0
- zlib=1.2.8=3
- libgcc=5.2.0=0
- libgfortran=3.0.0=1
- mkl=2017.0.3=0
- numpy=1.13.1=py36_0
- scipy=0.19.1=np113py36_0
- pip:
- backports.functools-lru-cache==1.4
name: hutspot-bwa
channels:
- bioconda
- conda-forge
- defaults
- r
dependencies:
- bwa=0.7.16=pl5.22.0_0
- picard=2.14=py36_0
- ca-certificates=2017.7.27.1=0
- certifi=2017.7.27.1=py36_0
- ncurses=5.9=10
- openssl=1.0.2l=0
- perl=5.22.0.1=0
- pip=9.0.1=py36_0
- python=3.6.3=0
- readline=6.2=0
- setuptools=36.6.0=py36_1
- sqlite=3.13.0=1
- tk=8.5.19=2
- wheel=0.30.0=py_1
- xz=5.2.3=0
- zlib=1.2.8=3
- libgcc=5.2.0=0
- openjdk=8.0.121=1
prefix: /home/ahbbollen/miniconda3/envs/hutspot-bwa
name: hutspot-cutadapt
channels:
- bioconda
- conda-forge
- defaults
- r
dependencies:
- cutadapt=1.14=py36_0
- xopen=0.1.1=py36_0
- ca-certificates=2017.7.27.1=0
- certifi=2017.7.27.1=py36_0
- ncurses=5.9=10
- openssl=1.0.2l=0
- pip=9.0.1=py36_0
- python=3.6.3=1
- readline=6.2=0
- setuptools=36.6.0=py36_1
- sqlite=3.13.0=1
- tk=8.5.19=2
- wheel=0.30.0=py_1
- xz=5.2.3=0
- zlib=1.2.11=0
prefix: /home/ahbbollen/miniconda3/envs/hutspot-cutadapt
name: hutspot-gatk
channels:
- bioconda
- conda-forge
- defaults
- r
dependencies:
- gatk=3.7=py36_1
- bzip2=1.0.6=1
- ca-certificates=2017.7.27.1=0
- certifi=2017.7.27.1=py36_0
- ncurses=5.9=10
- openssl=1.0.2l=0
- pip=9.0.1=py36_0
- python=3.6.3=1
- readline=6.2=0
- setuptools=36.6.0=py36_1
- sqlite=3.13.0=1
- tk=8.5.19=2
- wheel=0.30.0=py_1
- xz=5.2.3=0
- zlib=1.2.11=0
- openjdk=8.0.121=1
prefix: /home/ahbbollen/miniconda3/envs/hutspot-gatk
name: hutspot-picard
channels:
- bioconda
- conda-forge
- defaults
- r
dependencies:
- picard=2.14=py36_0
- ca-certificates=2017.7.27.1=0
- certifi=2017.7.27.1=py36_0
- ncurses=5.9=10
- openssl=1.0.2l=0
- pip=9.0.1=py36_0
- python=3.6.3=1
- readline=6.2=0
- setuptools=36.6.0=py36_1
- sqlite=3.13.0=1
- tk=8.5.19=2
- wheel=0.30.0=py_1
- xz=5.2.3=0
- zlib=1.2.11=0
- openjdk=8.0.121=1
name: hutspot-sickle
channels:
- bioconda
- conda-forge
- defaults
- r
dependencies:
- sickle-trim=1.33=1
- zlib=1.2.8=3
prefix: /home/ahbbollen/miniconda3/envs/hutspot-sickle
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk._
import org.broadinstitute.gatk.queue.extensions.gatk.CombineGVCFs
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.ReferenceConfidenceMode
import org.broadinstitute.gatk.utils.commandline.Output
import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType
class CombineOurGs extends QScript {
@Input(doc="The input bam file", shortName = "I")
var bamFile: List[File] = Nil
@Input(doc="The reference", shortName="R")
var referenceFile: File = _
@Output(doc="output", shortName="O")
var outFile: File = _
def script(): Unit = {
val genotyper = new CombineGVCFs
genotyper.scatterCount = 200
genotyper.variant = this.bamFile
genotyper.reference_sequence = this.referenceFile
genotyper.out = outFile
add(genotyper)
}
}
\ No newline at end of file
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk._
import org.broadinstitute.gatk.queue.extensions.gatk.HaplotypeCaller
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.ReferenceConfidenceMode
import org.broadinstitute.gatk.utils.commandline.Output
import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType
class GvcfWholeGenome extends QScript {
@Input(doc="The input bam file", shortName = "I")
var bamFile: List[File] = Nil
@Input(doc="The reference", shortName="R")
var referenceFile: File = _
@Input(doc="dbSNP vcf", shortName="D")
var dbSNPFile: File = _
@Output(doc="output", shortName="O")
var outFile: File = _
def script(): Unit = {
val genotyper = new HaplotypeCaller
genotyper.scatterCount = 200
genotyper.input_file = this.bamFile
genotyper.reference_sequence = this.referenceFile
genotyper.dbsnp = this.dbSNPFile
genotyper.out = outFile
genotyper.stand_call_conf = 30
genotyper.emitRefConfidence = ReferenceConfidenceMode.GVCF
genotyper.variant_index_type = GATKVCFIndexType.LINEAR
genotyper.variant_index_parameter = 128000
add(genotyper)
}
}
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment