Skip to content
Snippets Groups Projects
Commit 312b7029 authored by Vermaat's avatar Vermaat
Browse files

Merge pull request #54 from mutalyzer/read-dna

Usability improvements in reading DNA for description extractor
parents 7f009cbd 4a26ad95
No related branches found
No related tags found
No related merge requests found
......@@ -357,15 +357,28 @@ def guess_file_type(handle):
"""
Guess the file type of an NGS data file.
We assume that the stream is rewound before use; after use, the input
stream will be rewound.
:arg stream handle: Open readable handle to an NGS data file.
:arg file handle: Open readable handle to an NGS data file.
:returns unicode: Either 'fasta', 'fastq' or 'text'.
"""
try:
extension = getattr(handle, 'name').split('.')[-1]
except AttributeError:
pass
else:
if extension in ('fastq', 'fq'):
return 'fastq'
elif extension in ('fasta', 'fa'):
return 'fasta'
try:
position = handle.tell()
handle.seek(0)
except IOError:
return 'text'
token = handle.read(1)
handle.seek(0)
handle.seek(position)
if token == '>':
return 'fasta'
......@@ -379,7 +392,8 @@ def read_dna(handle):
Read the first record in an NGS data file.
If the format is not recognised as FASTA or FASTQ, we assume that the input
is in plain text. In this case, all non-DNA characters are removed.
is in plain text. In this case, DNA is converted to uppercase and all
non-DNA characters are removed.
:arg stream handle: Open readable handle to an NGS data file.
......@@ -390,7 +404,7 @@ def read_dna(handle):
if file_format != 'text':
return unicode(SeqIO.parse(handle, file_format).next().seq)
return ''.join(x for x in unicode(handle.read()) if x in 'ATCG')
return ''.join(x for x in unicode(handle.read()).upper() if x in 'ATCG')
def in_frame_description(s1, s2) :
......
......@@ -244,6 +244,13 @@ Please supply a reference sequence and an observed sequence.
{% endfor %}
</tbody>
</table>
<p>
Please note that the generated descriptions use one-based coordinates,
counted from the start of the supplied sequence. This
means that for genomic reference sequences, the result will be
in <code>g.</code> coordinates, while for transcripts the result will be
in <code>n.</code> coordinates.
</p>
{% endif %}{# not errors #}
{% endif %}{# description #}
......
......@@ -207,6 +207,18 @@ class TestWebsite(MutalyzerTest):
assert 'Input sequences are restricted to ' in r.data
assert '1 Error, 0 Warnings.' in r.data
def test_description_extractor_lowercase(self):
    """
    Post raw reference and sample sequences to the variant description
    extractor, where the sample sequence contains one lowercase base, and
    expect the description ``2T>A`` in the response.
    """
    # Both sequences are supplied inline via the 'raw_method' input option.
    form = {
        'reference_method': 'raw_method',
        'sample_method': 'raw_method',
        'reference_sequence': 'TTT',
        'sample_sequence': 'TaT',
    }
    response = self.app.post('/description-extractor', data=form)

    # The description appears HTML-escaped in the rendered page.
    expected = '<pre class="description">2T&gt;A</pre>'
    assert expected in response.data
def test_checksyntax_valid(self):
"""
Submit the check syntax form with a valid variant.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment