# common.smk: shared helpers for the workflow. Snakemake provides `config`,
# `srcdir`, `expand`, `checkpoints` and `glob_wildcards` when this file is
# included from the Snakefile.
import itertools
import json
import os

import jsonschema

containers = {
   'bcftools': 'docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4',
   'bedtools-2.26-python-2.7': 'docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0',
   'biopet-scatterregions': 'docker://quay.io/biocontainers/biopet-scatterregions:0.2--0',
   'bwa-0.7.17-picard-2.22.8': 'docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:76d16eabff506ac13338d7f14644a0ad301b9d7e-0',
   'cutadapt': 'docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0',
   'debian': 'docker://debian:buster-slim',
   'fastqc': 'docker://quay.io/biocontainers/fastqc:0.11.7--4',
   'gatk': 'docker://broadinstitute/gatk3:3.7-0',
   'gvcf2coverage': 'docker://lumc/gvcf2coverage:0.1-dirty-2',
   'multiqc': 'docker://quay.io/biocontainers/multiqc:1.8--py_2',
   'picard': 'docker://quay.io/biocontainers/picard:2.22.8--0',
   'python3': 'docker://python:3.6-slim',
   'samtools-1.7-python-3.6': 'docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0',
   'vtools': 'docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0'
}
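
# A minimal sketch (assumption: the actual rules live in other *.smk files and
# may differ) of how an entry from `containers` is typically consumed, via the
# `singularity:` (or, in newer Snakemake, `container:`) directive that accepts
# these docker:// URIs:
#
#     rule example_rule:                      # hypothetical rule name
#         ...
#         singularity: containers['bcftools']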

def process_config():
    """ Process the config file and set the default values """

    def set_default(key, value):
        """Set default config values"""
        if key not in config:
            config[key] = value

    # Read the json schema
    with open(srcdir('config/schema.json'), 'rt') as fin:
        schema = json.load(fin)

    # Validate the config against the schema
    try:
        jsonschema.validate(config, schema)
    except jsonschema.ValidationError as e:
        raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')

    # If you specify a baitsfile, you also have to specify a targets file for
    # picard
    if 'baitsfile' in config and 'targetsfile' not in config:
        msg = 'Invalid --configfile: "baitsfile" specified without "targetsfile"'
        raise jsonschema.ValidationError(msg)

    # A sample name cannot be a substring of another sample name, since that
    # breaks the parsing of the Picard metrics by MultiQC
    msg = 'Invalid --configfile: sample names should not overlap ("{s1}" is contained in "{s2}")'
    for s1, s2 in itertools.permutations(config['samples'], 2):
        if s1 in s2:
            raise jsonschema.ValidationError(msg.format(s1=s1, s2=s2))
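    # For illustration (hypothetical sample names): "sample1" and "sample10"
    # would be rejected by the check above, since "sample1" is contained in
    # "sample10".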

    # Set the default config
    set_default('scatter_size', 1000000000)
    set_default('female_threshold', 0.6)

    # Hide the absolute path so the snakemake linter doesn't cry about it
    set_default('gatk_jar', os.path.join(os.path.sep,'usr','GenomeAnalysisTK.jar'))

    # Set the script paths
    set_default('covstats', srcdir('src/covstats.py'))
    set_default('collect_stats', srcdir('src/collect_stats.py'))
    set_default('merge_stats', srcdir('src/merge_stats.py'))
    set_default('stats_to_tsv', srcdir('src/stats_to_tsv.py'))
    set_default('py_wordcount', srcdir('src/pywc.py'))
    set_default('cutadapt_summary', srcdir('src/cutadapt_summary.py'))


def coverage_stats(wildcards):
    files = expand("{sample}/coverage/refFlat_coverage.tsv",
                   sample=config["samples"])
    return files if "refflat" in config else []
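
# A minimal usage sketch (assumption: the consuming rule lives in another *.smk
# file and may differ). Input functions such as coverage_stats are passed to a
# rule's input section and called by Snakemake with the rule's wildcards:
#
#     rule multiqc:                           # hypothetical rule name
#         input:
#             coverage = coverage_stats,
#             ...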

def coverage_files(wildcards):
    """ Return a list of all coverage files

    The coverage is calculated for each sample, for each specified threshold
    """

    # We only calculate the coverage when this is specified in the
    # configuration
    if 'coverage_threshold' not in config:
        return list()

    # Fetch the values we need from the configuration
    samples = config['samples']
    thresholds = config['coverage_threshold']

    files = list()
    for sample, threshold in itertools.product(samples, thresholds):
        files.append(f'{sample}/vcf/{sample}_{threshold}.bed')
    return files
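
# For illustration (hypothetical config): with samples {'s1': ...} and
# coverage_threshold [10, 20], coverage_files returns
# ['s1/vcf/s1_10.bed', 's1/vcf/s1_20.bed'].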

def sample_bamfiles(wildcards):
    """ Determine the bam files for a sample (one for each readgroup)
    """
    files = list()
    sample = config['samples'][wildcards.sample]
    sample_name = wildcards.sample
    for read_group in sample['read_groups']:
        files.append(f'{sample_name}/bams/{sample_name}-{read_group}.sorted.bam')
    return files
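
# For illustration (hypothetical sample): a sample "s1" with read groups
# ["lib1", "lib2"] yields
# ['s1/bams/s1-lib1.sorted.bam', 's1/bams/s1-lib2.sorted.bam'].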

def gather_gvcf(wildcards):
    """ Gather the gvcf files based on the scatterregions checkpoint

    This depends on the 'scatter_size' parameter and the reference genome
    used
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def gather_gvcf_tbi(wildcards):
    """ Gather the gvcf index files based on the scatterregions checkpoint
    This depends on the 'scatter_size' parameter and the reference genome
    used
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def gather_vcf(wildcards):
    """ Gather the vcf files based on the scatterregions checkpoint
    This depends on the 'scatter_size' parameter and the reference genome
    used
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def gather_vcf_tbi(wildcards):
    """ Gather the vcf index files based on the scatterregions checkpoint
    This depends on the 'scatter_size' parameter and the reference genome
    used
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
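
# A minimal sketch (assumption: the actual gather rules live in other *.smk
# files and may differ) of how these functions are consumed once the
# scatterregions checkpoint has produced the scatter-{i}.bed files:
#
#     rule gvcf_gather:                       # hypothetical rule name
#         input:
#             gvcfs = gather_gvcf,
#             tbis = gather_gvcf_tbi,
#         output:
#             gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
#         ...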

def sample_cutadapt_files(wildcards):
    """ Determine the cutadapt log files files for a sample (one for each
    readgroup).
    """
    files = list()
    sample = config['samples'][wildcards.sample]
    sample_name = wildcards.sample
    for read_group in sample['read_groups']:
        files.append(f'{sample_name}/pre_process/{sample_name}-{read_group}.txt')
    return files

def all_raw_fastqc(wildcards):
    """ Determine the raw fastq files for each sample """
    fastq_files = list()
    for sample in config['samples']:
        for read_group in config['samples'][sample]['read_groups']:
            fastq_files.append(f"{sample}/pre_process/raw-{sample}-{read_group}/.done")
    return fastq_files

def all_trimmed_fastqc(wildcards):
    """ Determine the raw fastq files for each sample """
    fastq_files = list()
    for sample in config['samples']:
        for read_group in config['samples'][sample]['read_groups']:
            fastq_files.append(f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done")
    return fastq_files