Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mutalyzer
Manage
Activity
Members
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Analyze
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Mirrors
mutalyzer
Commits
6f5c69bf
Commit
6f5c69bf
authored
10 years ago
by
Vermaat
Browse files
Options
Downloads
Patches
Plain Diff
Correctly handle reference file encodings
parent
8acb0970
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
mutalyzer/Retriever.py
+44
-36
44 additions, 36 deletions
mutalyzer/Retriever.py
mutalyzer/parsers/lrg.py
+1
-1
1 addition, 1 deletion
mutalyzer/parsers/lrg.py
mutalyzer/services/rpc.py
+6
-0
6 additions, 0 deletions
mutalyzer/services/rpc.py
mutalyzer/util.py
+8
-0
8 additions, 0 deletions
mutalyzer/util.py
with
59 additions
and
37 deletions
mutalyzer/Retriever.py
+
44
−
36
View file @
6f5c69bf
...
...
@@ -12,14 +12,13 @@ Public classes:
from
__future__
import
unicode_literals
import
codecs
import
io
import
os
# path.isfile(), link() path.isdir(), path.mkdir(),
# walk(), path.getsize(), path.join(), stat(), remove()
import
time
import
bz2
# BZ2Compressor(), BZ2File()
import
hashlib
# md5(), update(), hexdigest()
import
urllib2
# urlopen()
import
StringIO
# StringIO()
from
Bio
import
SeqIO
# read()
from
Bio
import
Entrez
# efetch(), read(), esearch(), esummary()
from
Bio.Seq
import
UnknownSeq
...
...
@@ -28,6 +27,7 @@ from xml.dom import DOMException, minidom
from
xml.parsers
import
expat
from
httplib
import
HTTPException
,
IncompleteRead
from
sqlalchemy.orm.exc
import
NoResultFound
import
cchardet
as
chardet
from
mutalyzer
import
util
from
mutalyzer.config
import
settings
...
...
@@ -100,27 +100,33 @@ class Retriever(object) :
Write raw data to a compressed file.
@arg raw_data: The raw_data to be compressed and written
@type raw_data: string
@type raw_data:
byte
string
@arg filename: The intended name of the outfile
@type filename: unicode
@return: outfile ; The full path and name of the file written
@rtype: unicode
"""
# Todo: Should we write a utf-8 encoded genbank file? Not even sure
# what type `raw_data` is...
result
=
chardet
.
detect
(
raw_data
)
if
result
[
'
confidence
'
]
>
0.5
:
encoding
=
result
[
'
encoding
'
]
else
:
encoding
=
'
utf-8
'
if
not
util
.
is_utf8_alias
(
encoding
):
raw_data
=
raw_data
.
decode
(
encoding
).
encode
(
'
utf-8
'
)
# Compress the data to save disk space.
comp
=
bz2
.
BZ2Compressor
()
data
=
comp
.
compress
(
raw_data
)
data
+=
comp
.
flush
()
out_handle
=
open
(
self
.
_nametofile
(
filename
),
"
w
"
)
out_handle
=
open
(
self
.
_nametofile
(
filename
),
"
w
b
"
)
out_handle
.
write
(
data
)
out_handle
.
close
()
return
out_handle
.
name
# return the full path to the file
#_write
# Todo: check callers; argument should be a byte string
def
_calcHash
(
self
,
content
)
:
"""
Calculate the md5sum of a piece of text.
...
...
@@ -241,7 +247,7 @@ class Retriever(object) :
'
IncompleteRead: %s
'
%
unicode
(
e
))
return
[]
if
response_text
==
'
\n
'
:
if
response_text
.
strip
()
==
b
'
\n
'
:
# This is apparently what dbSNP returns for non-existing dbSNP id
self
.
_output
.
addMessage
(
__file__
,
4
,
'
EENTREZ
'
,
'
ID rs%s could not be found in dbSNP.
'
\
...
...
@@ -259,14 +265,14 @@ class Retriever(object) :
self
.
_output
.
addMessage
(
__file__
,
-
1
,
'
INFO
'
,
'
ExpatError: %s
'
%
unicode
(
e
))
self
.
_output
.
addMessage
(
__file__
,
-
1
,
'
INFO
'
,
'
Result from dbSNP: %s
'
%
response_text
)
'
Result from dbSNP: %s
'
%
unicode
(
response_text
,
'
utf-8
'
)
)
return
[]
except
IndexError
:
# The expected root element is not present.
self
.
_output
.
addMessage
(
__file__
,
4
,
'
EENTREZ
'
,
'
Unknown dbSNP
'
\
'
error. Result XML was not as expected.
'
)
self
.
_output
.
addMessage
(
__file__
,
-
1
,
'
INFO
'
,
'
Result from dbSNP: %s
'
%
response_text
)
'
Result from dbSNP: %s
'
%
unicode
(
response_text
,
'
utf-8
'
)
)
return
[]
snps
=
[]
...
...
@@ -292,7 +298,6 @@ class GenBankRetriever(Retriever):
# Child specific init
#__init__
# todo: raw_data must always be a byte string
def
write
(
self
,
raw_data
,
filename
,
extract
)
:
"""
Write raw data to a file. The data is parsed before writing, if a
...
...
@@ -305,7 +310,7 @@ class GenBankRetriever(Retriever):
database).
@arg raw_data: The data
@type raw_data: string
@type raw_data:
byte
string
@arg filename: The intended name of the file.
@type filename: unicode
@arg extract: Flag that indicates whether to extract the record ID and
...
...
@@ -320,26 +325,24 @@ class GenBankRetriever(Retriever):
@rtype: tuple (unicode, unicode)
"""
if
raw_data
==
"
\n
Nothing has been found
\n
"
:
if
raw_data
.
strip
()
==
b
'
Nothing has been found
'
:
self
.
_output
.
addMessage
(
__file__
,
4
,
"
ENORECORD
"
,
"
The record could not be retrieved.
"
)
return
None
#if
fakehandle
=
StringIO
.
StringIO
()
# Unfortunately, BioPython needs a
fakehandle
.
write
(
raw_data
)
# file handle.
fakehandle
=
io
.
BytesIO
()
# Unfortunately, BioPython needs a
fakehandle
.
write
(
raw_data
)
# file handle.
fakehandle
.
seek
(
0
)
try
:
record
=
SeqIO
.
read
(
fakehandle
,
"
genbank
"
)
except
(
ValueError
,
AttributeError
):
# An error occurred while parsing.
self
.
_output
.
addMessage
(
__file__
,
4
,
"
ENOPARSE
"
,
"
The file could not be parsed.
"
)
fakehandle
.
close
()
return
None
#except
if
type
(
record
.
seq
)
==
UnknownSeq
:
fakehandle
.
close
()
self
.
_output
.
addMessage
(
__file__
,
4
,
"
ENOSEQ
"
,
"
This record contains no sequence. Chromosomal or contig
"
\
"
records should be uploaded with the GenBank uploader.
"
)
...
...
@@ -349,12 +352,12 @@ class GenBankRetriever(Retriever):
outfile
=
filename
GI
=
None
if
extract
:
outfile
=
record
.
id
GI
=
record
.
annotations
[
"
gi
"
]
outfile
=
unicode
(
record
.
id
)
GI
=
unicode
(
record
.
annotations
[
"
gi
"
]
)
if
outfile
!=
filename
:
# Add the reference (incl version) to the reference output
# This differs if the original reference lacks a version
self
.
_output
.
addOutput
(
"
reference
"
,
record
.
id
)
self
.
_output
.
addOutput
(
"
reference
"
,
unicode
(
record
.
id
)
)
self
.
_output
.
addOutput
(
"
BatchFlags
"
,
(
"
A1
"
,(
filename
,
...
...
@@ -362,9 +365,8 @@ class GenBankRetriever(Retriever):
filename
+
"
.
"
)))
self
.
_output
.
addMessage
(
__file__
,
2
,
"
WNOVER
"
,
"
No version number is given, using %s. Please use this
"
\
"
number to reduce downloading overhead.
"
%
record
.
id
)
"
number to reduce downloading overhead.
"
%
unicode
(
record
.
id
)
)
#if
fakehandle
.
close
()
self
.
_write
(
raw_data
,
outfile
)
...
...
@@ -390,7 +392,7 @@ class GenBankRetriever(Retriever):
'
Could not retrieve %s.
'
%
name
)
return
None
if
raw_data
==
'
\n
'
:
# Check if the file is empty or not.
if
raw_data
.
strip
()
==
b
''
:
# Check if the file is empty or not.
self
.
_output
.
addMessage
(
__file__
,
4
,
'
ERETR
'
,
'
Could not retrieve %s.
'
%
name
)
return
None
...
...
@@ -398,10 +400,10 @@ class GenBankRetriever(Retriever):
# This is a hack to detect constructed references, the proper way to
# do this would be to check the data_file_division attribute of the
# parsed GenBank file (it would be 'CON').
if
'
\n
CONTIG
'
in
raw_data
:
if
b
'
\n
CONTIG
'
in
raw_data
:
try
:
# Get the length in base pairs
length
=
int
(
raw_data
[:
raw_data
.
index
(
'
bp
'
,
0
,
500
)].
split
()[
-
1
])
length
=
int
(
raw_data
[:
raw_data
.
index
(
b
'
bp
'
,
0
,
500
)].
split
()[
-
1
])
except
ValueError
,
IndexError
:
self
.
_output
.
addMessage
(
__file__
,
4
,
'
ERETR
'
,
'
Could not retrieve %s.
'
%
name
)
...
...
@@ -583,24 +585,24 @@ class GenBankRetriever(Retriever):
'
Could not get mapping information for gene %s.
'
%
gene
)
return
None
if
summary
[
0
][
"
NomenclatureSymbol
"
].
lower
()
==
gene
.
lower
()
:
# Found it.
if
unicode
(
summary
[
0
][
"
NomenclatureSymbol
"
]
)
.
lower
()
==
gene
.
lower
()
:
# Found it.
if
not
summary
[
0
][
"
GenomicInfo
"
]
:
self
.
_output
.
addMessage
(
__file__
,
4
,
"
ENOMAPPING
"
,
"
No mapping information found for gene %s.
"
%
gene
)
return
None
#if
ChrAccVer
=
summary
[
0
][
"
GenomicInfo
"
][
0
][
"
ChrAccVer
"
]
ChrLoc
=
summary
[
0
][
"
GenomicInfo
"
][
0
][
"
ChrLoc
"
]
ChrStart
=
summary
[
0
][
"
GenomicInfo
"
][
0
][
"
ChrStart
"
]
ChrStop
=
summary
[
0
][
"
GenomicInfo
"
][
0
][
"
ChrStop
"
]
break
;
ChrAccVer
=
unicode
(
summary
[
0
][
"
GenomicInfo
"
][
0
][
"
ChrAccVer
"
]
)
ChrLoc
=
unicode
(
summary
[
0
][
"
GenomicInfo
"
][
0
][
"
ChrLoc
"
]
)
ChrStart
=
unicode
(
summary
[
0
][
"
GenomicInfo
"
][
0
][
"
ChrStart
"
]
)
ChrStop
=
unicode
(
summary
[
0
][
"
GenomicInfo
"
][
0
][
"
ChrStop
"
]
)
break
#if
# Collect official symbols that have this gene as an alias in case we
# cannot find anything.
if
gene
in
summary
[
0
][
"
OtherAliases
"
]
and
\
if
gene
in
[
unicode
(
a
)
for
a
in
summary
[
0
][
"
OtherAliases
"
]
]
and
\
summary
[
0
][
"
NomenclatureSymbol
"
]
:
aliases
.
append
(
summary
[
0
][
"
NomenclatureSymbol
"
])
;
aliases
.
append
(
unicode
(
summary
[
0
][
"
NomenclatureSymbol
"
])
)
#for
if
not
ChrAccVer
:
# We did not find any genes.
...
...
@@ -643,6 +645,13 @@ class GenBankRetriever(Retriever):
@return: UD or None
@rtype: unicode
"""
if
not
(
url
.
startswith
(
'
http://
'
)
or
url
.
startswith
(
'
https://
'
)
or
url
.
startswith
(
'
ftp://
'
)):
self
.
_output
.
addMessage
(
__file__
,
4
,
"
ERECPARSE
"
,
"
Only HTTP(S) or FTP locations are allowed.
"
)
return
None
handle
=
urllib2
.
urlopen
(
url
)
info
=
handle
.
info
()
if
info
[
"
Content-Type
"
]
==
"
text/plain
"
:
...
...
@@ -688,7 +697,7 @@ class GenBankRetriever(Retriever):
If the downloaded file is recognised by its hash, the old UD number
is used.
@arg raw_data: A GenBank record
@arg raw_data: A GenBank record
.
@type raw_data: byte string
@return: Accession number for the uploaded file.
...
...
@@ -857,7 +866,6 @@ class LRGRetriever(Retriever):
# Now we have the file, so we can parse it.
file_handle
=
bz2
.
BZ2File
(
filename
,
"
r
"
)
file_handle
=
codecs
.
getreader
(
'
utf-8
'
)(
file_handle
)
#create GenRecord.Record from LRG file
record
=
lrg
.
create_record
(
file_handle
.
read
())
...
...
@@ -978,7 +986,7 @@ class LRGRetriever(Retriever):
if a parse error occurs None is returned.
@arg raw_data: The data
@type raw_data: string
@type raw_data:
byte
string
@arg filename: The intended name of the file
@type filename: unicode
...
...
This diff is collapsed.
Click to expand it.
mutalyzer/parsers/lrg.py
+
1
−
1
View file @
6f5c69bf
...
...
@@ -112,7 +112,7 @@ def create_record(data):
Create a GenRecord.Record of a LRG <xml> formatted string.
@arg data: Content of LRG file
@type data: string
@type data:
byte
string
@return: GenRecord.Record instance
@rtype: object
...
...
This diff is collapsed.
Click to expand it.
mutalyzer/services/rpc.py
+
6
−
0
View file @
6f5c69bf
...
...
@@ -1058,6 +1058,12 @@ class MutalyzerService(ServiceBase):
output
.
addMessage
(
__file__
,
-
1
,
'
INFO
'
,
'
Received request uploadGenBankLocalFile()
'
)
# The Python type for `data` should be a sequence of `str` objects,
# but it seems we sometimes just get one `str` object. Perhaps only in
# the unit tests, but let's fix that anyway.
if
isinstance
(
data
,
str
):
data
=
[
data
]
# Note that the max file size check below might be bogus, since Spyne
# first checks the total request size, which by default has a maximum
# of 2 megabytes.
...
...
This diff is collapsed.
Click to expand it.
mutalyzer/util.py
+
8
−
0
View file @
6f5c69bf
...
...
@@ -93,6 +93,14 @@ def reverse_complement(sequence):
return
''
.
join
(
reversed
(
sequence
.
translate
(
table
)))
def is_utf8_alias(encoding):
    """
    Returns `True` if the given encoding is recognized as UTF-8.

    The name is compared case-insensitively, with hyphens treated as
    underscores, against a fixed list of UTF-8 aliases (`utf_8`, `u8`, `utf`
    and `utf8`).

    @arg encoding: Encoding name, for example as reported by a character set
        detector. May be `None` or empty if no encoding could be determined.
    @type encoding: unicode

    @return: `True` if `encoding` names UTF-8, `False` otherwise (including
        when no encoding is given).
    @rtype: bool
    """
    # Character set detectors may report `None` when detection fails; treat
    # that as "not known to be UTF-8" instead of failing on `None.lower()`.
    if not encoding:
        return False

    aliases = ('utf_8', 'u8', 'utf', 'utf8')
    normalized = encoding.lower().replace('-', '_')
    return normalized in aliases
def
grouper
(
iterable
,
n
=
2
,
fillvalue
=
None
):
"""
Make an iterator that takes {n} elements at a time from {iterable}, using
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment