Commit bfcd47f5 authored by Sander Bollen's avatar Sander Bollen
Browse files

use bwa in stead of blat

parent 3a543ec9
......@@ -17,9 +17,10 @@ from Bio.Alphabet import generic_dna
from fastools.fastools import get_reference
from pyfaidx import Fasta
from pysam import AlignmentFile
from .models import Primer, Region, BlatLine
from .utils import NoPrimersException, calc_gc, NEW_VCF
from .utils import NoPrimersException, calc_gc, NEW_VCF, generate_fastq_from_primers
PRIMER3_SCRIPT = os.path.join(os.path.join(os.path.dirname(__file__) ,"static"), 'getprimers.sh')
......@@ -72,80 +73,177 @@ def run_primer3(sequence, region, padding=True,
return primers
def blat_primers(primers, blat_exe=None, ref=None):
tmp = NamedTemporaryFile()
records = []
for primer in primers:
records += [SeqRecord(Seq(primer.left, generic_dna), primer.left)]
records += [SeqRecord(Seq(primer.right, generic_dna), primer.right)]
def aln_primers(primers, bwa_exe=None, samtools_exe=None, ref=None, output_bam=None):
"""
Align primers with BWA.
This only works with BWA-ALN due to the short read length.
:param primers: List of primers
:param bwa_exe: Path to BWA
:param samtools_exe: Path to samtools
:param ref: Path to reference fasta
:param output_bam: Path to final bam file
:return: instance of pysam.AlignmentFile
"""
fq1 = NamedTemporaryFile()
fq2 = NamedTemporaryFile()
sai1 = NamedTemporaryFile()
sai2 = NamedTemporaryFile()
generate_fastq_from_primers(primers, fq1.name, fq2.name)
aln_args1 = [bwa_exe, 'aln', ref, fq1.name]
aln_args2 = [bwa_exe, 'aln', ref, fq2.name]
r = check_call(aln_args1, stdout=sai1)
if r != 0:
raise ValueError("bwa aln crashed with error code {0}".format(r))
r = check_call(aln_args2, stdout=sai2)
if r != 0:
raise ValueError("bwa aln crashed with error code {0}".format(r))
sam = NamedTemporaryFile()
sam_args = [bwa_exe, 'sampe', ref, sai1.name, sai2.name, fq1.name, fq2.name]
r = check_call(sam_args, stdout=sam)
if r != 0:
raise ValueError("bwa sampe crashed with error code {0}".format(r))
bam = NamedTemporaryFile()
bam_args = [samtools_exe, 'view', '-Shb', sam.name]
r = check_call(bam_args, stdout=bam)
if r != 0:
raise ValueError("samtools view crashed with error code {0}".format(r))
final_args = [samtools_exe, 'sort', "-f", bam.name, output_bam]
r = check_call(final_args)
if r != 0:
raise ValueError("samtools sort crashed with error code {0}".format(r))
index_args = [samtools_exe, "index", output_bam]
r = check_call(index_args)
if r != 0:
raise ValueError("samtools index crashed with error code {0}".format(r))
# cleanup
for i in [fq1, fq2, sai1, sai2, sam, bam]:
i.close()
return AlignmentFile(output_bam, "rb")
def _get_match_fraction(aligned_segment):
"""Get the fraction of a read matching the reference"""
matching_bases = aligned_segment.cigartuples[0][1]
return float(matching_bases)/aligned_segment.query_length
def _read_on_same_chrom(aligned_segment, variant=None, region=None):
"""Return true if read is on same chromosome as variant or region"""
read_chrom = aligned_segment.reference_name.split("chr")[-1]
if variant is not None:
variant_chrom = variant.chromosome.split("chr")[-1]
elif region is not None:
variant_chrom = region.chr.split("chr")[-1]
else:
raise ValueError
return read_chrom == variant_chrom
SeqIO.write(records, tmp, "fasta")
tmp.seek(0)
out = NamedTemporaryFile()
args = [blat_exe, '-stepSize=5', '-repMatch=2253', '-minScore=0', '-minIdentity=0', ref, tmp.name, out.name]
retval = check_call(args)
if retval != 0:
raise ValueError("Blat crashed")
blat_out = out.readlines()
out.close()
tmp.close()
return blat_out
def find_best(primers, blat_out, accept_snp=False, field=None, max_freq=None, dbsnp=None):
# should match minimally 90%
# minimum amount of hits
# correct strand
sanitized = _sanitize_blat(blat_out)
primers = _primers_with_blatlines(primers, sanitized)
# check for existence of correct strand
n_prims = []
for primer in primers:
any_left = any([x.strand == "+" for x in primer.blathits_left])
any_right = any([x.strand == "-" for x in primer.blathits_right])
if any_left and any_right:
n_prims.append(primer)
primers = n_prims
# check for match percentage
n_prims = []
for primer in primers:
any_left = any([float(x.Q_size)/float(x.match) >= 0.9 for x in primer.blathits_left])
any_right = any([float(x.Q_size)/float(x.match) >= 0.9 for x in primer.blathits_right])
if any_left and any_right:
n_prims.append(primer)
primers = n_prims
blat_lens = [len(x.blathits_left) + len(x.blathits_right) for x in primers]
# get primers with least hits
primers = [x for x in primers if (len(x.blathits_right) + len(x.blathits_left)) == min(blat_lens)]
# get correct chromosome and location
for primer in primers:
if "chr" in primer.chromosome and "chr" not in primer.blathits_left[0].T_name:
chrom = primer.chromosome.split("chr")[1]
elif "chr" not in primer.chromosome and "chr" in primer.blathits_left[0].T_name:
chrom = "chr" + primer.chromosome
def _has_alternative_alignments(aligned_segment):
"""Return true if read has alternative alignments"""
tags = aligned_segment.get_tags()
return "XA" in [x[0] for x in tags]
def create_primer_from_pair(read1, read2, position=0):
"""Create primer from read pair"""
return Primer(
chromosome=read1.reference_name,
position=position,
left=read1.query_sequence,
right=read2.query_sequence,
left_pos=read1.reference_start,
right_pos=read2.reference_start,
left_len=read1.query_length,
right_len=read2.query_length,
left_gc=calc_gc(read1.query_sequence),
right_gc=calc_gc(read2.query_sequence)
)
def create_primers_bwa(bam_handle, variant=None, region=None):
"""
Find correct pairs in a bam file containing primers for a variant or region
This requires to hold the bam file in memory. Use only with small bam files
:param bam_handle: pysam.AlignmentFile instance
:param variant: variant
:param region: region
:return: generator of primers
"""
if variant is not None and region is not None:
raise ValueError("Either variant _or_ region must be set")
elif variant is None and region is None:
raise ValueError("Either variant _or_ region must be set")
pairs = {} # bucket to hold pairs
for read in bam_handle:
if read.is_read1:
if read.query_name in pairs:
pairs[read.query_name].update({"r1": read})
else:
pairs[read.query_name] = {"r1": read}
elif read.is_read2:
if read.query_name in pairs:
pairs[read.query_name].update({"r2": read})
else:
pairs[read.query_name] = {"r2": read}
for pair in pairs.values():
# must match at least 90%
# must be on same chromosome as variant
# may not have alternative alignments
# pair must be complete
if "r1" not in pair or "r2" not in pair:
continue
if _get_match_fraction(pair['r1']) < 0.9:
continue
if _get_match_fraction(pair['r2']) < 0.9:
continue
if variant is not None:
if not _read_on_same_chrom(pair['r1'], variant=variant):
continue
if not _read_on_same_chrom(pair['r2'], variant=variant):
continue
else:
chrom = primer.chromosome
the_lines_left = [x for x in primer.blathits_left if x.T_name == chrom]
the_lines_right = [x for x in primer.blathits_right if x.T_name == chrom]
if len(the_lines_left) == 0 or len(the_lines_right) == 0:
raise NoPrimersException("No primers could be detected")
primer.left_pos = int(the_lines_left[0].T_start)
primer.right_pos = int(the_lines_right[0].T_start)
# find snps
if not accept_snp and max_freq:
primers = [find_snps(x, dbsnp, field) for x in primers]
primers = [x for x in primers if x.snp_freq <= max_freq]
if not _read_on_same_chrom(pair['r1'], region=region):
continue
if not _read_on_same_chrom(pair['r2'], region=region):
continue
if _has_alternative_alignments(pair['r1']):
continue
if _has_alternative_alignments(pair['r2']):
continue
if variant is not None:
yield create_primer_from_pair(pair['r1'], pair['r2'], variant.position_g_start)
else:
position = int(region.start) + int((float((int(region.stop) - int(region.start)))/2))
yield create_primer_from_pair(pair['r1'], pair['r2'], position)
def find_best_bwa(bam, variant=None, region=None, accept_snp=False, field=None, max_freq=None, dbsnp=None):
primers = []
for primer in create_primers_bwa(bam, variant=variant, region=region):
if not accept_snp and max_freq is not None:
prim = find_snps(primer, dbsnp, field)
if prim.snp_freq <= max_freq:
primers.append(prim)
else:
primers.append(primer)
if len(primers) == 0:
raise NoPrimersException("No suitable primers could be detected")
return primers[0]
else:
return primers
def _freq_in_query(query, field):
......@@ -219,45 +317,11 @@ def _sanitize_p3(handle):
return [x for x in handle if "#" not in x.decode()]
def _sanitize_blat(blat_out):
# sanitize blat output
# lines should start with a number
regex = re.compile("^\d+\t")
return [BlatLine(*x.split("\t")) for x in blat_out if regex.match(x.decode())]
def _get_shortest(x, y, max_v):
small = min(len(x), len(y), max_v)
return x[:small], y[:small]
def _primers_with_blatlines(primers, blat_lines):
for primer in primers:
correct_lines_left = [x for x in blat_lines if x.Q_name == primer.left]
correct_lines_right = [x for x in blat_lines if x.Q_name == primer.right]
primer.blathits_left = [x for x in correct_lines_left]
primer.blathits_right = [x for x in correct_lines_right]
return primers
def _minimize_hits(blat_lines):
groups = {}
for line in blat_lines:
q_name = int(line.strip().split("\t")[9])
try:
groups[q_name] += [line]
except KeyError:
groups[q_name] = [line]
mini = min(map(len, groups.values()))
minimized = []
for x in groups:
if len(groups[x]) == mini:
minimized += groups[x]
return minimized
def chop_region(region, size):
"""
Chop a region in multiple regions with max size `size`
......@@ -287,7 +351,8 @@ def chop_region(region, size):
def get_primer_from_region(region, reference, product_size, n_prims,
blat_exe, primer3_exe, dbsnp=None, field=None,
bwa_exe, samtools_exe, primer3_exe,
output_bam=None, dbsnp=None, field=None,
max_freq=None, strict=False, min_margin=10):
min_length, max_length = list(map(int, product_size.split("-")))
......@@ -298,31 +363,35 @@ def get_primer_from_region(region, reference, product_size, n_prims,
primers = []
return_regions = []
# TODO change such that blat is only run ONCE
for reg in regions:
sequence = get_sequence_fasta(reg, reference=reference)
raw_primers = run_primer3(sequence, reg, padding=True,
product_size=product_size,
n_primers=n_prims, prim3_exe=primer3_exe)
blat_out = blat_primers(raw_primers, blat_exe=blat_exe, ref=reference)
primer = find_best(raw_primers, blat_out, dbsnp=dbsnp, field=field, max_freq=max_freq)
bam = aln_primers(raw_primers, bwa_exe=bwa_exe, samtools_exe=samtools_exe,
ref=reference, output_bam=output_bam)
prims = find_best_bwa(bam, region=reg, dbsnp=dbsnp, field=field, max_freq=max_freq)
tmp_reg = []
tmp_prim = []
# filter out primers too close to the variant
if not (int(primer.left_pos) + int(primer.left_len) + min_margin) < int(reg.start):
continue
if not (int(primer.right_pos) - min_margin) > int(reg.stop):
continue
n_region = Region(start=int(primer.left_pos), stop=int(primer.right_pos)+len(primer.right),
chromosome=primer.chromosome, padding_left=0, padding_right=0,
acc_nr="NA", other="NA")
primer.fragment_sequence = get_sequence_fasta(n_region, reference=reference, padding=False)
if strict and len(primer.fragment_sequence) > max_length:
pass
else:
primers.append(primer)
for primer in prims:
if not (int(primer.left_pos) + int(primer.left_len) + min_margin) < int(reg.start):
continue
if not (int(primer.right_pos) - min_margin) > int(reg.stop):
continue
n_region = Region(start=int(primer.left_pos), stop=int(primer.right_pos)+len(primer.right),
chromosome=primer.chromosome, padding_left=0, padding_right=0,
acc_nr="NA", other="NA")
primer.fragment_sequence = get_sequence_fasta(n_region, reference=reference, padding=False)
if strict and len(primer.fragment_sequence) > max_length:
continue
else:
tmp_prim.append(primer)
tmp_reg.append(reg)
if len(tmp_prim) > 0 and len(tmp_prim) > 0:
primers.append(tmp_prim[0])
return_regions.append(reg)
if len(primers) == 0 or len(return_regions) == 0:
raise NoPrimersException
......
......@@ -243,7 +243,6 @@ def vars_and_primers_to_xml(variants, primers, xml_path=None, xsd=DEFAULT_XSD, o
prims = etree.SubElement(variant, "PRIMERS")
if prim is not None:
prims = primer_to_xml(prims, prim, var)
frag_len = etree.SubElement(prims, "FRAGMENT_LENGTH")
# fragment is the region between the end of the forward primer, and the start of the reverse primer
......@@ -255,6 +254,7 @@ def vars_and_primers_to_xml(variants, primers, xml_path=None, xsd=DEFAULT_XSD, o
gc_perc.text = str(int(calc_gc(prim.fragment_sequence)))
except ValueError:
gc_perc.text = '0'
_ = primer_to_xml(prims, prim, var)
else:
comment = etree.SubElement(uitslag, "OPMERKING")
comment.text = "NO PRIMERS FOUND"
......
......@@ -118,6 +118,10 @@ class Primer(object):
left_name = ''
right_name = ''
def __init__(self, **kwargs):
for kw, kv in kwargs.items():
setattr(self, kw, kv)
@classmethod
def from_p3(cls, forward, reverse, fragment=None, chromosome=None, position=None):
# line looks like this:
......
......@@ -17,9 +17,9 @@ __author__ = 'ahbbollen'
def primers_from_lovd(lovd_file, padding, product_size, n_prims, reference,
blat_exe, primer3_exe, dbsnp, field, max_freq,
m13=False, m13_f="", m13_r="", strict=False,
min_margin=10, ignore_errors=False):
bwa_exe, samtools_exe, primer3_exe, output_bam, dbsnp,
field, max_freq, m13=False, m13_f="", m13_r="",
strict=False, min_margin=10, ignore_errors=False):
variants = var_from_lovd(lovd_file)
primers = []
......@@ -27,11 +27,12 @@ def primers_from_lovd(lovd_file, padding, product_size, n_prims, reference,
region = Region.from_variant(var, padding_l=padding, padding_r=padding)
try:
_, prims = get_primer_from_region(region, reference, product_size,
n_prims, blat_exe, primer3_exe,
dbsnp=dbsnp, field=field,
max_freq=max_freq,
strict=strict,
min_margin=min_margin)
n_prims, bwa_exe, samtools_exe,
primer3_exe,
output_bam=output_bam,
dbsnp=dbsnp, field=field,
max_freq=max_freq, strict=strict,
min_margin=min_margin)
primers.append(prims[0])
except NoPrimersException:
if ignore_errors:
......@@ -45,9 +46,9 @@ def primers_from_lovd(lovd_file, padding, product_size, n_prims, reference,
def primers_from_region(bed_path, padding, product_size, n_prims, reference,
blat_exe, primer3_exe, dbsnp, field, max_freq,
m13=False, m13_f="", m13_r="", strict=False,
min_margin=10):
bwa_exe, samtools_exe, primer3_exe, output_bam,
dbsnp, field, max_freq, m13=False, m13_f="",
m13_r="", strict=False, min_margin=10):
regions = []
primers = []
with open(bed_path, "rb") as bed:
......@@ -58,7 +59,8 @@ def primers_from_region(bed_path, padding, product_size, n_prims, reference,
padding_r=padding)
regs, prims = get_primer_from_region(region, reference,
product_size, n_prims,
blat_exe, primer3_exe,
bwa_exe, samtools_exe,
primer3_exe, output_bam=output_bam,
dbsnp=dbsnp, field=field,
max_freq=max_freq,
strict=strict,
......@@ -125,6 +127,8 @@ def main():
output_group.add_argument('-x', '--xml', help="Output Miracle XML file")
output_group.add_argument('-t', '--tsv', help="Output TSV file")
parser.add_argument('-b', '--bam', help="Path to output BAM file containing primers", required=True)
parser.add_argument('-s', '--sample', help="Same ID for regions")
parser.add_argument('--product_size', help="Size range of desired product. Defaults to 200-450 \
......@@ -158,7 +162,8 @@ def main():
parser.add_argument('-R', '--reference', help="Path to reference fasta file", default=None, required=True)
parser.add_argument('--dbsnp', help="Path to DBSNP vcf", default=None, required=True)
parser.add_argument('--primer3', help="Path to primer3_core exe", default=None, required=True)
parser.add_argument('--blat', help="Path to blat exe", default=None, required=True)
parser.add_argument('--bwa', help="Path to BWA exe", default="bwa")
parser.add_argument('--samtools', help="Path to samtools exe", default="samtools")
parser.add_argument("--ignore-errors", help="Ignore errors", action="store_true")
......@@ -181,8 +186,9 @@ def main():
variants, primers = primers_from_lovd(args.lovd, args.padding,
args.product_size,
args.n_raw_primers,
args.reference, args.blat,
args.primer3, args.dbsnp,
args.reference, args.bwa,
args.samtools, args.primer3,
args.bam, args.dbsnp,
args.field, args.allele_freq,
args.m13, args.m13_forward,
args.m13_reverse,
......@@ -195,8 +201,9 @@ def main():
regions, primers = primers_from_region(args.region, args.padding,
args.product_size,
args.n_raw_primers,
args.reference, args.blat,
args.primer3, args.dbsnp,
args.reference, args.bwa,
args.samtools, args.primer3,
args.bam, args.dbsnp,
args.field, args.allele_freq,
args.m13, args.m13_forward,
args.m13_reverse,
......@@ -208,8 +215,9 @@ def main():
variants, primers = primers_from_lovd(args.lovd, args.padding,
args.product_size,
args.n_raw_primers,
args.reference, args.blat,
args.primer3, args.dbsnp,
args.reference, args.bwa,
args.samtools, args.primer3,
args.bam, args.dbsnp,
args.field, args.allele_freq,
args.m13, args.m13_forward,
args.m13_reverse,
......@@ -222,8 +230,9 @@ def main():
regions, primers = primers_from_region(args.region, args.padding,
args.product_size,
args.n_raw_primers,
args.reference, args.blat,
args.primer3, args.dbsnp,
args.reference, args.bwa,
args.samtools, args.primer3,
args.bam, args.dbsnp,
args.field, args.allele_freq,
args.m13, args.m13_forward,
args.m13_reverse,
......
......@@ -65,7 +65,7 @@ def primer_to_seq_record(primer):
:return: 2-tuple of (SeqRecord forward, SeqRecord reverse)
"""
id_str = datehash()
forward_id = "{0}".format(id_str)
forward_id = "{0}/1".format(id_str)
reverse_id = "{0}/2".format(id_str)
forward_seq = Seq(primer.left, alphabet=NucleotideAlphabet)
reverse_seq = Seq(primer.right, alphabet=NucleotideAlphabet)
......
......@@ -4,7 +4,8 @@ setup(
name='prinia',
version='0.1',
packages=['prinia'],
install_required=["pyfaidx", "fastools", "lxml"],
install_requires=["biopython", "pysam", "pyfaidx",
"fastools", "lxml", "pyvcf"],
url='',
license='',
author='sander bollen',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment