Commit f299d5e6 authored by Laros's avatar Laros
Browse files

Rewrote the GBparser module to cope with multiple linking methods (to link

mRNA and CDS fields). Added a SNP converter. Implemented a first check for
variants that hit splice sites. Added a chromosomal position for NC slices.
Added a check for the use of intronic positions in a transcript reference 
sequence.

mutalyzer.conf:
- Added variables for splice site mutation detection.

Mutator.py:
- Merged the restriction sites (added, deleted) in one object.

Output.py:
- Changed the return type of getIndexedOutput() and getOutput() from None to
  an empty list for convenience.

Config.py:
- Added variables for splice site mutation detection.

GBparser.py:
- Completely rewrote this module.
  - Will now collect all CDS, mRNA and exon information.
  - Tries to match a CDS and mRNA based upon the range.
    - When this succeeds, try to match on protein, locus tag or product name.
    - If all fails and there is only one option left, link it.
  - Will remove genes that are not fully annotated (half outside the record
    for example).

Retriever.py:
- Added a snpConvert() function.
- Added a check for uploaded records or slices that have no sequence in them
  (a complete contig for example).

GenRecord.py:
- Added variables to cope with chromosomal coordinates.
- Added a toChromPos() function to convert a g. notation to a g. on a
  chromosome.
- Added an addToChromDescription() function to generate a chromosomal
  description of a variant.
- Modified some warnings concerning missing mRNA or missing CDS.
- Added a checkIntron() function that gives a warning when a variant hits a
  splice site.

Web.py:
- Added a nomenclature version variable.
- Added a urlEncode() function to generate valid links.

File.py:
- Made the getMimeType() function public.

Mutalyzer.py:
- Added a __intronicPosition() function that checks whether the user used an
  intronic position.
- Added checks for illegal use of intronic positions.
- Fixed a bug in the __toProtDescr() function.
- Added more checks for the translation and transcription of transcripts.
- Added a better CDS start site mutation detection.
- Added a chromosomal description if available.
- Added more information to the legend (product and linking method).
- Merged the restriction sites (added, deleted) in one object.

handler.py:
- Added guessing of mime types for downloadable files (it used to default to
  text/plain).

index.py:
- Added a snp() function to interface with the snp() function of the Retriever.
- Added a checkForward() function to accommodate HTTP GET links.
- Removed the `version' variable from all functions, it is now moved to the
  menu.

check.html:
- Modified to cope with the new functionality.

gbupload.html:
- Restructured the layout.

snp.html:
- New page for the SNP converter.

menu.html:
- Modified to cope with the new functionality.

style.css:
- Modified to make some better tables and helper boxes.

index.html:
- Completely rewrote it.

parse.html:
- Restructured the layout to make it uniform with the name checker.



git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/trunk@73 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1
parent 253bcd3b
......@@ -135,7 +135,8 @@ CREATE TABLE GBInfo (
CREATE TABLE BatchQueue (
QueueID INT(5) PRIMARY KEY AUTO_INCREMENT,
JobID CHAR(20) NOT NULL,
Input CHAR(255) NOT NULL
Input CHAR(255) NOT NULL,
Flags CHAR(20)
);
CREATE TABLE BatchJob (
......
......@@ -147,3 +147,7 @@ upstream = 5000
# Number of downstream nucleotides when searching for a transcript.
downstream = 2000
spliceAlarm = 2
spliceWarn = 5
......@@ -143,6 +143,9 @@ class Config() :
pass
#File
class GenRecord() :
pass
def __init__(self) :
"""
Initialise the class with variables read from the configuration
......@@ -213,6 +216,8 @@ class Config() :
## Set the variables needed by the File module.
#self.File.upstream = int(config["upstream"])
#self.File.downstream = int(config["downstream"])
self.GenRecord.spliceAlarm = int(config["spliceAlarm"])
self.GenRecord.spliceWarn = int(config["spliceWarn"])
#__init__
#Config
......
......@@ -90,10 +90,10 @@ class File() :
return ret
#__tempFileWrapper
def __getMimeType(self, handle) :
def getMimeType(self, handle) :
"""
Get the mime type of a stream by inspecting a fixed number of bytes.
The stream is not rewinded after use.
The stream is rewinded after use.
Arguments:
handle ; A handle to a stream.
......@@ -116,9 +116,10 @@ class File() :
MagicInstance.load()
description = MagicInstance.buffer(buf)
del MagicInstance
handle.seek(0)
return mimeType, description
#__getMimeType
#getMimeType
def __parseCsvFile(self, handle) :
"""
......@@ -315,7 +316,7 @@ class File() :
list ; A list of lists, None if an error occured.
"""
mimeType = self.__getMimeType(handle)
mimeType = self.getMimeType(handle)
if mimeType[0] == "text/plain" :
return self.__parseCsvFile(handle)
if mimeType[0] == "application/vnd.ms-office" :
......
......@@ -8,6 +8,20 @@ import Db
mutalyzer GenRecord.Record populated with data from a GenBank file.
"""
class tempGene() :
    """
    Temporary container that groups the raw features of one gene.

    Public variables:
        name    ; The name this container was created with.
        rnaList ; Starts empty; the parser appends the mRNA features of the
                  gene to it.
        cdsList ; Starts empty; the parser appends the CDS features of the
                  gene to it.
    """

    def __init__(self, name) :
        """
        Set up an empty container.

        Arguments:
            name ; Name of the gene.
        """

        # Each instance gets its own, independent lists.
        self.cdsList = []
        self.rnaList = []
        self.name = name
    #__init__
#tempGene
class GBparser() :
"""
"""
......@@ -95,6 +109,9 @@ class GBparser() :
result = Entrez.read(handle)
handle.close()
if not result[0]["LinkSetDb"] :
return None
proteinGI = result[0]["LinkSetDb"][0]["Link"][0]["Id"]
handle = Entrez.efetch(db = "protein", id = proteinGI,
......@@ -109,6 +126,206 @@ class GBparser() :
return proteinAcc
#__transcriptToProtein
def __findMismatch(self, productList, direction) :
"""
"""
i = 0
while i < productList[0].count(' ') + 1 :
for j in range(1, len(productList)) :
if productList[0][::direction].split(' ')[i] != \
productList[j][::direction].split(' ')[i] :
return i
i += 1
#while
return 0
#__findMismatch
def __tagByDict(self, locus, key) :
"""
"""
if locus.qualifiers.has_key(key) :
setattr(locus, key, locus.qualifiers[key][0])
else :
setattr(locus, key, None)
#__tagByDict
def __tagLocus(self, locusList) :
    """
    Annotate every feature in a list with the attributes used for linking.

    The following attributes are set on each feature:
        locus_tag, transcript_id, protein_id, gene, product ; The first
                       value of the qualifier with the same name, or None.
        proteinLink  ; The protein accession (without version) used to pair
                       an mRNA with a CDS: looked up via
                       __transcriptToProtein() when a transcript_id is
                       present, otherwise taken from protein_id, otherwise
                       None.
        linked       ; False (nothing is linked yet).
        positionList ; Result of __locationList2posList() for the feature.
        location     ; Replaced by the result of __location2pos().
        usable       ; True if positionList is not empty.
        productTag   ; The words of the product name that remain after
                       removing the leading and trailing words shared by
                       all product names in the list. None if the feature
                       has no product.

    Arguments:
        locusList ; A list of features (the link() method passes either the
                    mRNA or the CDS features of one gene).
    """

    productList = []
    for locus in locusList :
        for key in ("locus_tag", "transcript_id", "protein_id", "gene",
                    "product") :
            self.__tagByDict(locus, key)

        # Exactly one entry per feature keeps productList aligned with
        # locusList. A missing product becomes "" instead of raising a
        # KeyError.
        productList.append(locus.product or "")

        locus.proteinLink = None
        locus.linked = False
        if locus.transcript_id :
            locus.proteinLink = self.__transcriptToProtein(
                locus.transcript_id.split('.')[0])
        elif locus.protein_id :
            locus.proteinLink = locus.protein_id.split('.')[0]

        locus.positionList = self.__locationList2posList(locus)
        # FIXME (from the original code): this overwrites the location
        # object of the feature with the converted positions.
        locus.location = self.__location2pos(locus.location)
        locus.usable = bool(locus.positionList)
    #for

    # Only real product names take part in finding the shared words, so a
    # feature without a product does not disturb the tags of the others.
    namedProducts = [product for product in productList if product]
    a = b = 0
    if namedProducts :
        a = self.__findMismatch(namedProducts, 1)
        b = namedProducts[0].count(' ') - \
            self.__findMismatch(namedProducts, -1) + 1
    #if

    for locus, product in zip(locusList, productList) :
        if product :
            locus.productTag = ' '.join(product.split(' ')[a:b])
        else :
            locus.productTag = None
    #for
#__tagLocus
def __checkTags(self, locusList, tagName) :
"""
Check whether all tags in a locus list are unique.
Arguments:
locusList ; A list of loci.
tagName ; Name of the tag to be checked.
Returns:
boolean ; True if the tags are unique, False otherwise.
"""
tags = []
for i in locusList :
tags.append(getattr(i, tagName))
badTags = []
for i in locusList :
myTag = getattr(i, tagName)
numberOfTags = tags.count(myTag)
if numberOfTags > 1 :
badTags.append(myTag)
#for
for i in locusList :
if getattr(i, tagName) in badTags :
setattr(i, tagName, None)
#for
#__checkTags
def __matchByRange(self, mrna, cds) :
"""
Match the mRNA list to the CDS list.
Arguments:
mrnaList ; List of splice sites.
cdsList ; CDS list (including internal splice sites).
Returns:
integer ; -1 : False.
0 : Don't know.
1 : Maybe true.
2 : Probably true.
"""
if not cds or not mrna :
return 0 # No information -> Don't know.
mrnaList = mrna.positionList
if not mrnaList :
mrnaList = mrna.location
cdsList = cds.positionList
if not cdsList :
cdsList = cds.location
if not cdsList or not mrnaList :
return 0 # No information -> Don't know.
if cdsList[0] < mrnaList[0] or cdsList[-1] > mrnaList[-1] :
return -1 # CDS is outside transcript range -> False.
if len(cdsList) > 2 : # The CDS spans more than one exon.
if not cdsList[1] in mrnaList :
return -1 # At least one splice site doesn't match -> False.
x = mrnaList.index(cdsList[1])
y = x + len(cdsList) - 2
if mrnaList[x:y] == cdsList[1:-1] :
return 2 # All splice sites match -> Probably true.
return -1 # At least one splice site doesn't match -> False.
#if
return 1 # Everything matches, but there is little information.
#__matchByRange
def link(self, rnaList, cdsList) :
    """
    Link the mRNA features of a gene to its CDS features.

    Both lists are first tagged (see __tagLocus()) and tags that are not
    unique within a list are removed (see __checkTags()). After that every
    mRNA gets two attributes:
        link       ; The CDS it is linked to, or None.
        linkMethod ; How the link was made: "locus", "protein", "product"
                     or "exhaustion"; None if there is no link.
    A CDS that gets linked has its `linked' attribute set to True.

    An mRNA is linked to the first CDS that matches by range (see
    __matchByRange()) and has the same locus tag, the same protein link or
    the same product tag, tried in that order. An mRNA that is still
    unlinked afterwards is linked by exhaustion when exactly one unlinked
    CDS matches its range.

    Arguments:
        rnaList ; A list of mRNA features.
        cdsList ; A list of CDS features.
    """

    self.__tagLocus(rnaList)
    self.__tagLocus(cdsList)

    for tagName in ("locus_tag", "proteinLink", "productTag") :
        self.__checkTags(rnaList, tagName)
        self.__checkTags(cdsList, tagName)
    #for

    for transcript in rnaList :
        transcript.link = None
        transcript.linkMethod = None
        for cds in cdsList :
            if self.__matchByRange(transcript, cds) <= 0 :
                continue

            # The order of these checks is the priority of the methods.
            if transcript.locus_tag and \
               transcript.locus_tag == cds.locus_tag :
                method = "locus"
            elif transcript.proteinLink and \
                 transcript.proteinLink == cds.proteinLink :
                method = "protein"
            elif transcript.productTag and \
                 transcript.productTag == cds.productTag :
                method = "product"
            else :
                continue

            # No debugging output here: this code should not write to
            # stdout.
            transcript.link = cds
            transcript.linkMethod = method
            cds.linked = True
            break
        #for
    #for

    # One *could* also do exhaustion per matched range...
    for transcript in rnaList :
        if transcript.link :
            continue

        # `linked' is updated while looping, so a CDS claimed by an earlier
        # transcript is no longer a candidate for a later one.
        candidates = []
        for cds in cdsList :
            if not cds.linked and self.__matchByRange(transcript, cds) > 0 :
                candidates.append(cds)
        #for

        if len(candidates) == 1 :
            transcript.link = candidates[0]
            transcript.linkMethod = "exhaustion"
            candidates[0].linked = True
        #if
    #for
#link
def createGBRecord(self, filename):
"""
Create a GenRecord.Record from a GenBank file
......@@ -130,21 +347,20 @@ class GBparser() :
record.version = biorecord.id.split('.')[1]
#mRNAProducts = []
#CDSProducts = []
#for i in biorecord.features :
# if i.qualifiers :
# if i.qualifiers.has_key("gene") :
# if i.type == "mRNA" :
# if i.qualifiers.has_key("product") :
# mRNAProducts.append(i.qualifiers["product"][0])
# if i.type == "CDS" :
# if i.qualifiers.has_key("product") :
# CDSProducts.append(i.qualifiers["product"][0])
# #if
#print mRNAProducts
#print CDSProducts
exonList = []
geneDict = {}
accInfo = biorecord.annotations['accessions']
if len(accInfo) >= 3 :
region = accInfo[2]
if "complement" in region :
record.orientation = -1
record.chromOffset = int(region.split('.')[2][:-1])
#if
else :
record.chromOffset = int(accInfo[2].split('.')[0])
#if
record.recordId = biorecord.id
for i in biorecord.features :
if i.qualifiers :
if i.type == "source" :
......@@ -167,109 +383,108 @@ class GBparser() :
#if
if i.qualifiers.has_key("gene") :
gene = i.qualifiers["gene"][0]
GeneInstance = record.findGene(gene)
if not GeneInstance :
GeneInstance = Gene(gene)
record.geneList.append(GeneInstance)
#if
if i.type == "gene" :
if i.strand :
GeneInstance.orientation = i.strand
GeneInstance.location = self.__location2pos(i.location)
if not GeneInstance.location :
GeneInstance.transcribe = False
geneName = i.qualifiers["gene"][0]
if not geneDict.has_key(geneName) :
myGene = Gene(geneName)
record.geneList.append(myGene)
if i.strand :
myGene.orientation = i.strand
myGene.location = self.__location2pos(i.location)
geneDict[geneName] = tempGene(geneName)
#if
# RESOLV
LocusInstance = None
locusTag = None
if i.qualifiers.has_key("locus_tag") :
locusTag = i.qualifiers["locus_tag"][0]
#locusName = locusTag[-3:]
LocusInstance = GeneInstance.findLocus(locusTag[-3:])
#if
else :
if i.qualifiers.has_key("transcript_id") :
LocusInstance = GeneInstance.findLink(
self.__transcriptToProtein(
i.qualifiers["transcript_id"][0].split('.')[0]))
if i.qualifiers.has_key("protein_id") :
LocusInstance = GeneInstance.findLink(
i.qualifiers["protein_id"][0].split('.')[0])
#else
if not LocusInstance and (i.type == "mRNA" or i.type == "CDS") :
if record.molType != 'n' :
if locusTag :
LocusInstance = Locus(locusTag[-3:])
else :
LocusInstance = Locus(GeneInstance.newLocusTag())
GeneInstance.transcriptList.append(LocusInstance)
else :
if GeneInstance.transcriptList :
LocusInstance = GeneInstance.transcriptList[0]
else :
LocusInstance = Locus(GeneInstance.newLocusTag())
GeneInstance.transcriptList.append(LocusInstance)
if not LocusInstance and i.type == "exon" :
if GeneInstance.transcriptList :
LocusInstance = GeneInstance.transcriptList[0]
else :
LocusInstance = Locus(GeneInstance.newLocusTag())
GeneInstance.transcriptList.append(LocusInstance)
# /RESOLV
if i.type == "mRNA" :
PListInstance = PList()
LocusInstance.mRNA = PListInstance
posList = self.__locationList2posList(i)
if posList != None :
PListInstance.location = \
self.__location2pos(i.location)
PListInstance.positionList = posList
#if
if i.qualifiers.has_key("transcript_id") :
LocusInstance.transcriptID = \
i.qualifiers["transcript_id"][0]
LocusInstance.link = self.__transcriptToProtein(
LocusInstance.transcriptID.split('.')[0])
LocusInstance.locusTag = locusTag
LocusInstance.transcribe = True
#if
geneDict[geneName].rnaList.append(i)
if i.type == "CDS" :
PListInstance = PList()
LocusInstance.CDS = PListInstance
PListInstance.location = self.__location2pos(i.location)
PListInstance.positionList = \
self.__locationList2posList(i)
if i.qualifiers.has_key("transl_table") :
LocusInstance.txTable = \
int(i.qualifiers["transl_table"][0])
if i.qualifiers.has_key("protein_id") :
LocusInstance.proteinID = \
i.qualifiers["protein_id"][0]
LocusInstance.link = \
LocusInstance.proteinID.split('.')[0]
LocusInstance.locusTag = locusTag
LocusInstance.translate = True
#if
geneDict[geneName].cdsList.append(i)
if i.type == "exon" :
if not LocusInstance.exon :
LocusInstance.exon = PList()
LocusInstance.exon.positionList.extend(
self.__location2pos(i.location))
#if
exonList.extend(self.__location2pos(i.location))
#if
#if
#for
if record.molType == 'g' :
for j in geneDict.keys() :
myGene = geneDict[j]
self.link(myGene.rnaList, myGene.cdsList)
for i in myGene.rnaList :
if i.usable :
myRealGene = record.findGene(i.gene)
if i.locus_tag :
myTranscript = Locus(i.locus_tag[-3:])
else :
myTranscript = Locus(myRealGene.newLocusTag())
myTranscript.mRNA = PList()
myTranscript.mRNA.positionList = i.positionList
myTranscript.mRNA.location = i.location
myTranscript.transcribe = True
myTranscript.transcriptID = i.transcript_id
myTranscript.transcriptProduct = i.product
myTranscript.locusTag = i.locus_tag
if i.link :
myTranscript.CDS = PList()
myTranscript.CDS.positionList = i.link.positionList
myTranscript.CDS.location = i.link.location
myTranscript.translate = True
myTranscript.proteinID = i.link.protein_id
myTranscript.linkMethod = i.linkMethod
myTranscript.proteinProduct = i.link.product
if i.link.qualifiers.has_key("transl_table") :
myTranscript.txTable = \
int(i.qualifiers["transl_table"][0])
#if
myRealGene.transcriptList.append(myTranscript)
#if
#for
for i in myGene.cdsList :
if not i.linked and i.usable :
myRealGene = record.findGene(i.gene)
if i.locus_tag :
myTranscript = Locus(i.locus_tag[-3:])
else :
myTranscript = Locus(myRealGene.newLocusTag())
myTranscript.CDS = PList()
myTranscript.CDS.positionList = i.positionList
myTranscript.CDS.location = i.location
myTranscript.proteinID = i.protein_id
myTranscript.proteinProduct = i.product
if i.qualifiers.has_key("transl_table") :
myTranscript.txTable = \
int(i.qualifiers["transl_table"][0])
myRealGene.transcriptList.append(myTranscript)
#if
#if
#for
#for
#if
else :
myGene = geneDict[geneDict.keys()[0]]
myRealGene = record.geneList[0]
myCDS = myGene.cdsList[0]
self.__tagByDict(myCDS, "protein_id")
self.__tagByDict(myCDS, "product")
myTranscript = Locus("001")
myTranscript.exon = PList()
if exonList :
myTranscript.exon.positionList = exonList
else :
myTranscript.exon.location = myRealGene.location
myTranscript.CDS = PList()
myTranscript.CDS.location = self.__location2pos(myCDS.location)
myTranscript.transcriptID = biorecord.id
myTranscript.proteinID = myCDS.protein_id
myTranscript.proteinProduct = myCDS.product
myTranscript.linkMethod = "exhaustion"
myTranscript.transcribe = True
if myCDS.qualifiers.has_key("transl_table") :
myTranscript.txTable = \
int(i.qualifiers["transl_table"][0])
myRealGene.transcriptList.append(myTranscript)
#else
for i in record.geneList :
if not i.transcriptList :
record.geneList.remove(i)
return record
#parseRecord
#GBparser
......@@ -98,6 +98,9 @@ class Locus(object) :
self.protLongName = ""
self.transcribe = False
self.translate = False
self.linkMethod = None
self.transcriptProduct = None
self.proteinProduct = None
#__init__
def addToDescription(self, rawVariant) :
......@@ -228,6 +231,10 @@ class Record(object) :
self.description = ""
self._sourcetype = None #LRG or GB
self.version = None
self.chromOffset = 0
self.chromDescription = ""
self.orientation = 1
self.recordId = None
#__init__
def findGene(self, name) :
......@@ -259,6 +266,28 @@ class Record(object) :
else :
self.description = rawVariant
#addToDescription
def toChromPos(self, i) :
    """
    Convert a position on this record to a position on the chromosome.

    Position 1 of the record maps to chromOffset. Larger positions count
    upwards when the orientation is 1 and downwards for any other
    orientation.

    Arguments:
        i ; A position on the record.

    Returns:
        integer ; The corresponding chromosomal position.
    """

    distance = i - 1
    if self.orientation != 1 :
        return self.chromOffset - distance
    return self.chromOffset + distance
#toChromPos
def addToChromDescription(self, rawVariant) :
    """
    Append a raw variant to the chromosomal description of this record.

    Nothing happens when chromOffset is not set (0). The first variant is
    stored as is; each following variant is appended, separated by a
    semicolon.

    Arguments:
        rawVariant ; Description of one raw variant.
    """

    if not self.chromOffset :
        return

    if not self.chromDescription :
        self.chromDescription = rawVariant
        return
    #if

    self.chromDescription = "%s;%s" % (self.chromDescription, rawVariant)
#addToChromDescription
#Record
class GenRecord() :
......@@ -269,11 +298,12 @@ class GenRecord() :
checkRecord() ; Check and repair self.record
"""
def __init__(self, output) :
def __init__(self, output, config) :
"""
"""
self.__output = output
self.__config = config