diff --git a/mutalyzer/util.py b/mutalyzer/util.py
index 4a3ac13c1dc73d165cff23a68571341629e29dfa..ae709477bbb789b625014046f8c1f4c336981417 100644
--- a/mutalyzer/util.py
+++ b/mutalyzer/util.py
@@ -357,15 +357,37 @@ def guess_file_type(handle):
     """
     Guess the file type of an NGS data file.
 
-    We assume that the stream is rewinded before use, after use, the input
-    stream will be rewinded.
-
-    :arg stream handle: Open readable handle to an NGS data file.
+    If the handle has a name ending in a recognised extension, the type is
+    taken from that extension. Otherwise the first character of the data is
+    inspected, after which the handle is put back at its original position.
+    If the position cannot be queried or reset (IOError), 'text' is returned.
+
+    :arg file handle: Open readable handle to an NGS data file.
 
     :returns unicode: Either 'fasta', 'fastq' or 'text'.
     """
+    try:
+        _, dot, extension = handle.name.rpartition('.')
+    except AttributeError:
+        # No usable name on this handle; fall through to content sniffing.
+        pass
+    else:
+        # Only trust a real dot-separated suffix, compared case-insensitively:
+        # 'reads.FASTQ' is recognised, a file merely named 'fasta' is not.
+        extension = extension.lower()
+        if dot and extension in ('fastq', 'fq'):
+            return 'fastq'
+        elif dot and extension in ('fasta', 'fa'):
+            return 'fasta'
+
+    try:
+        position = handle.tell()
+        handle.seek(0)
+    except IOError:
+        return 'text'
+
     token = handle.read(1)
-    handle.seek(0)
+    handle.seek(position)
 
     if token == '>':
         return 'fasta'
@@ -379,7 +401,8 @@ def read_dna(handle):
     Read the first record in an NGS data file.
 
     If the format is not recognised as FASTA or FASTQ, we assume that the input
-    is in plain text. In this case, all non-DNA characters are removed.
+    is in plain text. In this case, DNA is converted to uppercase and all
+    non-DNA characters are removed.
 
     :arg stream handle: Open readable handle to an NGS data file.
 
@@ -390,7 +413,7 @@ def read_dna(handle):
     if file_format != 'text':
         return unicode(SeqIO.parse(handle, file_format).next().seq)
 
-    return ''.join(x for x in unicode(handle.read()) if x in 'ATCG')
+    return ''.join(x for x in unicode(handle.read()).upper() if x in 'ATCG')
 
 
 def in_frame_description(s1, s2) :
diff --git a/mutalyzer/website/templates/description-extractor.html b/mutalyzer/website/templates/description-extractor.html
index 631de1d4796f38dfae44f8dfb513c0053451a473..5134139369d1452a7c2f07bb24df97c0b8e2cd10 100644
--- a/mutalyzer/website/templates/description-extractor.html
+++ b/mutalyzer/website/templates/description-extractor.html
@@ -244,6 +244,13 @@ Please supply a reference sequence and an observed sequence.
     {% endfor %}
     </tbody>
   </table>
+  <p>
+    Please note that the generated descriptions use one-based coordinates
+    where we start counting at the start of the supplied sequence. This
+    means that for genomic reference sequences, the result will be
+    in <code>g.</code> coordinates, while for transcripts the result will be
+    in <code>n.</code> coordinates.
+  </p>
   {% endif %}{# not errors #}
 {% endif %}{# description #}
 
diff --git a/tests/test_website.py b/tests/test_website.py
index 3392d1faa4ec1391797627f881a46003f7f590dc..faf336ba89b84ea96dee1ac7d472fc28429b86a8 100644
--- a/tests/test_website.py
+++ b/tests/test_website.py
@@ -207,6 +207,18 @@ class TestWebsite(MutalyzerTest):
         assert 'Input sequences are restricted to ' in r.data
         assert '1 Error, 0 Warnings.' in r.data
 
+    def test_description_extractor_lowercase(self):
+        """
+        Submit a sample sequence with a base in lowercase to the variant
+        description extractor.
+        """
+        r = self.app.post('/description-extractor', data={
+            'reference_method': 'raw_method',
+            'sample_method': 'raw_method',
+            'reference_sequence': 'TTT',
+            'sample_sequence': 'TaT'})
+        assert '<pre class="description">2T>A</pre>' in r.data
+
     def test_checksyntax_valid(self):
         """
         Submit the check syntax form with a valid variant.