Merge pull request #54 from mutalyzer/read-dna

Usability improvements in reading DNA for description extractor

Merge pull request #54 from mutalyzer/read-dna
312b7029 · Vermaat · 7f009cbd · 4a26ad95 · 312b7029 · 312b7029
Commit 312b7029 authored 9 years ago by Vermaat
--- a/mutalyzer/util.py
+++ b/mutalyzer/util.py
@@ -357,15 +357,28 @@ def guess_file_type(handle):
    """
    Guess the file type of an NGS data file.

-    We assume that the stream is rewinded before use, after use, the input
-    stream will be rewinded.
-
-    :arg stream handle: Open readable handle to an NGS data file.
+    :arg file handle: Open readable handle to an NGS data file.

    :returns unicode: Either 'fasta', 'fastq' or 'text'.
    """
+    try:
+        extension = getattr(handle, 'name').split('.')[-1]
+    except AttributeError:
+        pass
+    else:
+        if extension in ('fastq', 'fq'):
+            return 'fastq'
+        elif extension in ('fasta', 'fa'):
+            return 'fasta'
+
+    try:
+        position = handle.tell()
+        handle.seek(0)
+    except IOError:
+        return 'text'
+
    token = handle.read(1)
-    handle.seek(0)
+    handle.seek(position)

    if token == '>':
        return 'fasta'
@@ -379,7 +392,8 @@ def read_dna(handle):
    Read the first record in an NGS data file.

    If the format is not recognised as FASTA or FASTQ, we assume that the input
-    is in plain text. In this case, all non-DNA characters are removed.
+    is in plain text. In this case, the input is converted to uppercase and
+    all characters other than A, T, C and G are removed.

    :arg stream handle: Open readable handle to an NGS data file.

@@ -390,7 +404,7 @@ def read_dna(handle):
    if file_format != 'text':
        return unicode(SeqIO.parse(handle, file_format).next().seq)

-    return ''.join(x for x in unicode(handle.read()) if x in 'ATCG')
+    return ''.join(x for x in unicode(handle.read()).upper() if x in 'ATCG')


 def in_frame_description(s1, s2) :

--- a/mutalyzer/website/templates/description-extractor.html
+++ b/mutalyzer/website/templates/description-extractor.html
@@ -244,6 +244,13 @@ Please supply a reference sequence and an observed sequence.
      {% endfor %}
      </tbody>
    </table>
+    <p>
+      Please note that the generated descriptions use one-based coordinates
+      where we start counting at the start of the supplied sequence. This
+      means that for genomic reference sequences, the result will be
+      in <code>g.</code> coordinates, while for transcripts the result will be
+      in <code>n.</code> coordinates.
+    </p>
  {% endif %}{# not errors #}
 {% endif %}{# description #}


--- a/tests/test_website.py
+++ b/tests/test_website.py
@@ -207,6 +207,18 @@ class TestWebsite(MutalyzerTest):
        assert 'Input sequences are restricted to ' in r.data
        assert '1 Error, 0 Warnings.' in r.data

+    def test_description_extractor_lowercase(self):
+        """
+        Submit a sample sequence with a base in lowercase to the variant
+        description extractor.
+        """
+        r = self.app.post('/description-extractor', data={
+            'reference_method': 'raw_method',
+            'sample_method': 'raw_method',
+            'reference_sequence': 'TTT',
+            'sample_sequence': 'TaT'})
+        assert '<pre class="description">2T&gt;A</pre>' in r.data
+
    def test_checksyntax_valid(self):
        """
        Submit the check syntax form with a valid variant.