From 48bb332db0d8e6b89dea8957d1c4cdf16ce5c964 Mon Sep 17 00:00:00 2001 From: "J.F.J. Laros" <j.f.j.laros@lumc.nl> Date: Mon, 14 Jun 2010 08:40:40 +0000 Subject: [PATCH] Commit to do a merge with web_dev. This version is not suitable for distribution as it is under heavy development. Most modules will have minor changes because of a difference in set up of both the Db and Config module. install.sh: - Added functionality to enable the cron restart of the Batch Checker. - Added the auto-generation of a .htaccess file. - Added permission settings. mutalyzer.conf: - Added configuration options for the Scheduler, File and GenRecord modules. Db.txt: - Described how to make the new ChrName tables for hg18 and hg19. errorcodes.txt: - Added classifications to the messages. doc: - Made a set up for the documentation. TechnicalReference: - This will be a technical document that describes the internals of the project. It is only meant for developers. API: - This is a description of the API, it is auto generated by the mkapidoc.sh script. Also only meant for developers. Mutalyzer.py: - Added a new roll function that will always find both boundaries. - Implemented a new protein naming scheme. - Fixed the trimming of a delins. - Rewrote the processing of a variant. - Moved post processing of the GenBank record to the GenRecord module. - Moved the crossmapper instance to the GenBank module, to make one instance per transcript variant. - Moved the naming of a variant to the GenBank module, as it strongly interacts with the crossmapper instance. - Moved the constructCDS function to the GenRecord module. handler.py: - Added functionality for the batch checker (retrieve results). - Added functionality for the genbank uploader (retrieve GenBank files). webservice.py: - Modified to work with the new Db module. UCSC_update.py: - Modified to work with the new Db module. GenRecord.py: - Replaced the dictionary structure with a nested list structure to make iteration more convenient. 
- Added names to the Locus and Gene objects. - Added all information needed to do a crossmapping in the Locus object. - Wrote functions to find Loci and Genes. - Wrote a function that expands a description of a variant (coupled to a Locus). Mutator.py: - Added documentation. Parser.py: - Added documentation. Web.py: - Added documentation. - Added a function that checks whether a string is an e-mail address. Scheduler.py: - Implemented a batch scheduler that uses a MySQL database for queueing. File.py: - Implemented a CSV, XLS and ODS parser for use in the Scheduler module. Output.py: - Added documentation. Mapper.py: - Modified the complex object initialisation. Config.py: - Made subclasses to configure the separate modules. Db.py: - Added documentation. - Split the Db modules into different classes, according to functionality, they all inherit the query function from the Db base class. - Added chromosome accession number to name conversion functions and vice versa. - Added functionality for the batch checker. Crossmap.py: - Added documentation. Retriever.py: - Added documentation. - Added fall back functionality when searching for a gene. index.py: - Added a batch submit interface. batch.html: - The layout of the batch submit interface. 
git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/trunk@30 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1 --- Db.txt | 78 +- doc/API/api.conf | 105 ++ doc/API/mkapidoc.sh | 5 + doc/TechnicalReference/Makefile | 37 + doc/TechnicalReference/TechnicalReference.tex | 69 ++ doc/TechnicalReference/bibliography.bib | 0 errorcodes.txt | 59 +- install.sh | 46 +- mutalyzer.conf | 48 +- src/BatchChecker.py | 4 +- src/Modules/Config.py | 186 ++-- src/Modules/Crossmap.py | 10 + src/Modules/Db.py | 660 ++++++++----- src/Modules/File.py | 322 ++++++ src/Modules/GenRecord.py | 373 +++++-- src/Modules/Mapper.py | 86 +- src/Modules/Misc.py | 5 + src/Modules/Mutator.py | 154 +-- src/Modules/Output.py | 319 +++--- src/Modules/Parser.py | 32 +- src/Modules/Retriever.py | 213 ++-- src/Modules/Scheduler.py | 85 +- src/Modules/Web.py | 18 + src/Modules/__init__.py | 14 + src/Mutalyzer.py | 927 ++++++++---------- src/UCSC_update.py | 26 +- src/VarInfo.py | 4 +- src/handler.py | 47 +- src/index.py | 50 +- src/webservice.py | 26 +- templates/batch.html | 40 + 31 files changed, 2566 insertions(+), 1482 deletions(-) create mode 100644 doc/API/api.conf create mode 100644 doc/API/mkapidoc.sh create mode 100644 doc/TechnicalReference/Makefile create mode 100644 doc/TechnicalReference/TechnicalReference.tex create mode 100644 doc/TechnicalReference/bibliography.bib create mode 100644 src/Modules/File.py create mode 100644 templates/batch.html diff --git a/Db.txt b/Db.txt index 3867c448..fda594a6 100644 --- a/Db.txt +++ b/Db.txt @@ -38,14 +38,15 @@ CREATE TABLE BatchQueue ( QueueID INT(5) PRIMARY KEY AUTO_INCREMENT, JobID CHAR(20) NOT NULL, AccNo CHAR(13) NOT NULL, - Gene CHAR(20) NOT NULL, + Gene CHAR(20), Variant CHAR(255) NOT NULL ); CREATE TABLE BatchJob ( JobID CHAR(20) PRIMARY KEY, Filter CHAR(20) NOT NULL, - EMail CHAR(255) NOT NULL + EMail CHAR(255) NOT NULL, + FromHost Char(255) NOT NULL ); CREATE TABLE Var ( @@ -54,3 +55,76 @@ CREATE TABLE Var ( ); INSERT INTO Var VALUES ("WatchDog", 0); --- 
+ +CREATE TABLE ChrName ( + AccNo CHAR(20) PRIMARY KEY, + name CHAR(20) NOT NULL +); + +# hg19: +INSERT INTO ChrName (AccNo, name) VALUES + ("NC_000001.10", "chr1"), + ("NC_000002.11", "chr2"), + ("NC_000003.11", "chr3"), + ("NC_000004.11", "chr4"), + ("NC_000005.9", "chr5"), + ("NC_000006.11", "chr6"), + ("NC_000007.13", "chr7"), + ("NC_000008.10", "chr8"), + ("NC_000009.11", "chr9"), + ("NC_000010.10", "chr10"), + ("NC_000011.9", "chr11"), + ("NC_000012.11", "chr12"), + ("NC_000013.10", "chr13"), + ("NC_000014.8", "chr14"), + ("NC_000015.9", "chr15"), + ("NC_000016.9", "chr16"), + ("NC_000017.10", "chr17"), + ("NC_000018.9", "chr18"), + ("NC_000019.9", "chr19"), + ("NC_000020.10", "chr20"), + ("NC_000021.8", "chr21"), + ("NC_000022.10", "chr22"), + ("NC_000023.10", "chrX"), + ("NC_000024.9", "chrY"), + ("NT_167244.1", "chr6_apd_hap1"), + ("NT_113891.2", "chr6_cox_hap2"), + ("NT_167245.1", "chr6_dbb_hap3"), + ("NT_167246.1", "chr6_mann_hap4"), + ("NT_167247.1", "chr6_mcf_hap5"), + ("NT_167248.1", "chr6_qbl_hap6"), + ("NT_167249.1", "chr6_ssto_hap7"), + ("NT_167250.1", "chr4_ctg9_hap1"), + ("NT_167251.1", "chr17_ctg5_hap1") +; + +# hg18: +INSERT INTO ChrName (AccNo, name) VALUES + ("NC_000001.9", "chr1"), + ("NC_000002.10", "chr2"), + ("NC_000003.10", "chr3"), + ("NC_000004.10", "chr4"), + ("NC_000005.8", "chr5"), + ("NC_000006.10", "chr6"), + ("NC_000007.12", "chr7"), + ("NC_000008.9", "chr8"), + ("NC_000009.10", "chr9"), + ("NC_000010.9", "chr10"), + ("NC_000011.8", "chr11"), + ("NC_000012.10", "chr12"), + ("NC_000013.9", "chr13"), + ("NC_000014.7", "chr14"), + ("NC_000015.8", "chr15"), + ("NC_000016.8", "chr16"), + ("NC_000017.9", "chr17"), + ("NC_000018.8", "chr18"), + ("NC_000019.8", "chr19"), + ("NC_000020.9", "chr20"), + ("NC_000021.7", "chr21"), + ("NC_000022.9", "chr22"), + ("NC_000023.9", "chrX"), + ("NC_000024.8", "chrY"), + ("NC_001807.4", "chrM"), + ("NT_113891.1", "chr6_cox_hap1"), + ("NT_113959.1", "chr22_h2_hap1") +; diff --git a/doc/API/api.conf 
b/doc/API/api.conf new file mode 100644 index 00000000..f44efb94 --- /dev/null +++ b/doc/API/api.conf @@ -0,0 +1,105 @@ +[epydoc] # Epydoc section marker (required by ConfigParser) + +# modules +# The list of objects to document. Objects can be named using +# dotted names, module filenames, or package directory names. +# Aliases for this option include "objects" and "values". +modules: ../../src/Modules, ../../src/*.py + +# output +# The type of output that should be generated. Should be one +# of: html, text, latex, dvi, ps, pdf. +output: pdf + +# target +# The path to the output directory. May be relative or absolute. +target: api/ + +# docformat +# The default markup language for docstrings, for modules that do +# not define __docformat__. Defaults to epytext. +docformat: epytext + +# css +# The CSS stylesheet for HTML output. Can be the name of a builtin +# stylesheet, or the name of a file. +#css: white + +# name +# The documented project's name. +name: Mutalyzer 2.0 + +# url +# The documented project's URL. +url: http://www.mutalyzer.nl/2.0/ + +# link +# HTML code for the project link in the navigation bar. If left +# unspecified, the project link will be generated based on the +# project's name and URL. +#link: <a href="somewhere">My Cool Project</a> + +# top +# The "top" page for the documentation. Can be a URL, the name +# of a module or class, or one of the special names "trees.html", +# "indices.html", or "help.html" +top: ../../src + +# help +# An alternative help file. The named file should contain the +# body of an HTML file; navigation bars will be added to it. +#help: my_helpfile.html + +# frames +# Whether or not to include a frames-based table of contents. +#frames: yes + +# private +# Whether or not to include private variables. (Even if included, +# private variables will be hidden by default.) +private: yes + +# imports +# Whether or not to list each module's imports. 
+imports: no + +# verbosity +# An integer indicating how verbose epydoc should be. The default +# value is 0; negative values will suppress warnings and errors; +# positive values will give more verbose output. +verbosity: 0 + +# parse +# Whether or not parsing should be used to examine objects. +parse: yes + +# introspect +# Whether or not introspection should be used to examine objects. +introspect: no + +# graph +# The list of graph types that should be automatically included +# in the output. Graphs are generated using the Graphviz "dot" +# executable. Graph types include: "classtree", "callgraph", +# "umlclass". Use "all" to include all graph types +graph: all + +# dotpath +# The path to the Graphviz "dot" executable, used to generate +# graphs. +dotpath: /usr/local/bin/dot + +# sourcecode +# Whether or not to include syntax highlighted source code in +# the output (HTML only). +#sourcecode: yes + +# pstat +# The name of one or more pstat files (generated by the profile +# or hotshot module). These are used to generate call graphs. +pstat: profile.out + +# separate-classes +# Whether each class should be listed in its own section when +# generating LaTeX or PDF output. +separate-classes: no diff --git a/doc/API/mkapidoc.sh b/doc/API/mkapidoc.sh new file mode 100644 index 00000000..9ae6c00f --- /dev/null +++ b/doc/API/mkapidoc.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +epydoc --config api.conf +mv api/api.pdf . +rm -rf api/ diff --git a/doc/TechnicalReference/Makefile b/doc/TechnicalReference/Makefile new file mode 100644 index 00000000..50239034 --- /dev/null +++ b/doc/TechnicalReference/Makefile @@ -0,0 +1,37 @@ +# General Makefile for LaTeX documents by J. F. J. Laros. +# Last alteration on 15-10-2009. +# +# The packages texlive-base-bin, texlive-latex-base and ghostscript should +# be installed. 
+# + +LATEX = latex +BIBTEX = bibtex +DVIPS = dvips +PS2PDF = ps2pdf14 + +SRC := $(shell grep -H '\\begin{document' *.tex | cut -f 1 -d '.') + +BIB := $(shell grep '\\bibliography{' $(SRC).tex > /dev/null && \ + grep '\\cite{' $(SRC).tex) + +all: $(SRC) + +$(SRC): $(SRC).tex + $(LATEX) $^ +ifdef BIB + $(BIBTEX) $(SRC) +endif + $(LATEX) $^ + $(LATEX) $^ + $(DVIPS) $(SRC).dvi -o $(SRC).ps + +release: $(SRC) clean + $(PS2PDF) $(SRC).ps + rm -f $(SRC).ps + +clean: + rm -f *.aux $(SRC).bbl $(SRC).blg $(SRC).dvi $(SRC).log $(SRC).toc + +distclean: clean + rm -f $(SRC).ps $(SRC).pdf diff --git a/doc/TechnicalReference/TechnicalReference.tex b/doc/TechnicalReference/TechnicalReference.tex new file mode 100644 index 00000000..b0e832b5 --- /dev/null +++ b/doc/TechnicalReference/TechnicalReference.tex @@ -0,0 +1,69 @@ +\newcommand{\thisversion}{2.0} + +\documentclass{article} +\usepackage{amssymb, amsthm, graphicx, float, textcomp} +\title{\Huge Mutalyzer \thisversion\ Technical Reference Manual} +\author{Jeroen F. J. Laros + \vspace{10pt}\\ + Department of Human Genetics\\ + Center for Human and Clinical Genetics\\ + \texttt{j.f.j.laros@lumc.nl}} +\date{\today} +\frenchspacing + +\newtheorem{theorem}{Theorem} +\newtheorem{lemma}[theorem]{Lemma} +\newtheorem{corollary}[theorem]{Corollary} + +\theoremstyle{definition} +\newtheorem{example}[theorem]{Example} +\newtheorem{definition}[]{Definition} +\newtheorem{remark}[theorem]{Remark} +\newtheorem{conjecture}[theorem]{Conjecture} + +\begin{document} + +\maketitle +\thispagestyle{empty} +\newpage + +\pagenumbering{roman} +\tableofcontents +\newpage + +\pagenumbering{arabic} + +\section{Introduction}\label{sec:introduction} +This document is intended for developers. + +\section{Modules}\label{sec:modules} +Mutalyzer \thisversion\ blabla. 
+\subsection{Config}\label{subsec:config} +\subsection{Output}\label{subsec:output} +\subsection{Db}\label{subsec:db} +\subsection{Retriever}\label{subsec:retriever} +\subsection{Mutator}\label{subsec:mutator} +\subsection{Scheduler}\label{subsec:scheduler} +\subsection{GenRecord}\label{subsec:genrecord} +\subsection{Web}\label{subsec:web} +\subsection{Misc}\label{subsec:misc} + +\section{Programs}\label{sec:programs} +\subsection{Mutalyzer}\label{subsec:mutalyzer} +\subsection{VarInfo}\label{subsec:varinfo} +\subsection{UCSC\_Update}\label{subsec:ucsc_update} + +\section{Interfaces}\label{sec:interfaces} +% handler.py +\subsection{Web}\label{subsec:webinterface} +% index.py +\subsubsection{TAL} +% templates/ +\subsection{Webservices}\label{subsec:webservinterface} +% webservice.py +\subsection{Command line}\label{subsec:commandline} + +\bibliography{bibliography}{} +\bibliographystyle{plain} + +\end{document} diff --git a/doc/TechnicalReference/bibliography.bib b/doc/TechnicalReference/bibliography.bib new file mode 100644 index 00000000..e69de29b diff --git a/errorcodes.txt b/errorcodes.txt index e54c44a1..b8d0a590 100644 --- a/errorcodes.txt +++ b/errorcodes.txt @@ -1,34 +1,39 @@ Information: -INFO | Information. +INFO | N | Information. Warnings: -WSTART | Mutation in the start codon. -WTXSTART | Mutation hits transcription start. -WSPLDON | Mutation hits a splice donor site. -WSPLACC | Mutation hits a splice acceptor site. -WROLL | Variant position is ambiguous and not the last one was given. -WINSDUP | Variant was described as an insertion, but it is a duplication. -WNOMRNA | No mRNA field was found in the GenBank record. -WNOCDS | No CDS field was found in the GenBank record. -WNOVER | No accession version number was given. -WHASH | Hash of a GenBank record has changed. +WSTART | E | Mutation in the start codon. +WTXSTART | E | Mutation hits transcription start. +WSPLDON | E | Mutation hits a splice donor site. 
+WSPLACC | E | Mutation hits a splice acceptor site. +WROLL | D | Variant position is ambiguous and not the last one was given. +WINSDUP | D | Variant was described as an insertion, but it is a + duplication. +WNOMRNA | R | No mRNA field was found in the GenBank record. +WNOCDS | R | No CDS field was found in the GenBank record. +WNOCDSLIST | R | No CDS list was found in the GenBank record. +WNOVER | R | No accession version number was given. +WHASH | N | Hash of a GenBank record has changed. +WNOCHANGE | D | Variant equals reference sequence. +WNOTMINIMAL | D | A shorter description of a raw variant is possible. Errors: -EARGLEN | There was a discrepancy between the range and the length of the - | optional argument. -EREF | There was a discrepancy between the reference sequence and the - | optional argument. -ENOCHANGE | Mutation has no effect. -ENOTMINIMAL | A shorter description of a raw variant is possible. -EINSRANGE | The positions of an insertion are not consecutive. -ENOCDS | No CDS field was found in the GenBank record and none could be - | constructed. +ENOVAR | D | No mutation given. +EARGLEN | D | There was a discrepancy between the range and the length of + the optional argument. +EREF | D | There was a discrepancy between the reference sequence and the + optional argument. +EINSRANGE | D | The positions of an insertion are not consecutive. +ENOCDS | R | No CDS field was found in the GenBank record and none could + be constructed. +ENOGENE | R | Gene not found. +ESTOP | R | In frame stop codon found. Fatal errors: -EPARSE | Nomenclature parse error. -ERECPARSE | GenBank record parse error. -EARG | -EFILESIZE | The filesize is either too large or too small. -ERETR | Could not retrieve a GenBank record. -EARG | Error in the arguments (of a webservice). -ERANGE | Position out of range (webservice). +EPARSE | D | Nomenclature parse error. +EBPARSE | D | Parse error in the submitted batch file. +ERECPARSE | R | GenBank record parse error. 
+EFILESIZE | N | The filesize is either too large or too small. +ERETR | R | Could not retrieve a GenBank record. +EARG | N | Error in the arguments (of a webservice). +ERANGE | D | Position out of range (webservice). diff --git a/install.sh b/install.sh index 1b0fc907..71f3a7e2 100644 --- a/install.sh +++ b/install.sh @@ -1,22 +1,40 @@ #!/bin/sh -script="UCSC_update.py" -cron_entry="25 6 \* \* \* python `pwd`/src/$script" +updateCron() { + cron_entry="$1 python `pwd`/src/$2.py" + + if ! `crontab -l | grep "$cron_entry" > /dev/null`; then + echo "Updating cron entry." + if `crontab -l | grep $2 > /dev/null`; then + echo "Removing old entry." + crontab -l | grep -v $2 | crontab + fi + echo "Installing new entry." + ( + crontab -l + echo $cron_entry + ) | crontab + fi +} if `echo $0 | grep '/' > /dev/null`; then echo "Please run this script from the installation directory." exit 1 fi -if ! `crontab -l | grep "$cron_entry" > /dev/null`; then - echo "Updating cron entry." - if `crontab -l | grep $script > /dev/null`; then - echo "Removing old entry." - crontab -l | grep -v $script | crontab - fi - echo "Installing new entry." - ( - crontab -l - echo $cron_entry - ) | crontab -fi +updateCron "25 6 \* \* \*" "UCSC_update" +updateCron "*/1 \* \* \* \*" "BatchChecker" + +cat << EOF > .htaccess +SetHandler mod_python +PythonHandler src/handler +PythonPath "sys.path + ['`pwd`/src']" +PythonDebug On + +RewriteEngine on +RewriteRule Variant_info.php Variant_info +EOF + +chmod go+rx . src src/Modules templates +chmod go+r .htaccess mutalyzer.conf src/*.py src/Modules/*.py templates/* +chmod go+rw var diff --git a/mutalyzer.conf b/mutalyzer.conf index 6e3112b5..6a7950dc 100644 --- a/mutalyzer.conf +++ b/mutalyzer.conf @@ -3,6 +3,7 @@ # +# # These settings are used by the Retriever module. # @@ -22,6 +23,7 @@ maxDldSize = 10 minDldSize = 512 +# # These settings are used by the Db module. 
# @@ -34,6 +36,9 @@ dbNames = "hg18", "hg19" # MySQL username for the local databases (inernalDb and dnNames). LocalMySQLuser = "mutalyzer" +# Host name for the local databases. +LocalMySQLhost = "localhost" + # MySQL username for the UCSC database. RemoteMySQLuser = "genome" @@ -47,6 +52,7 @@ UpdateInterval = 7 TempFile = "./var/UCSC_Update.txt" +# # These settings are used by the Output module. # @@ -72,6 +78,7 @@ loglevel = 3 outputlevel = 1 +# # These settings are used by the Mutator module. # @@ -85,11 +92,46 @@ maxvissize = 25 flankclipsize = 6 +# # These settings are used by the Scheduler module. # -# Watchdog timeout in seconds. -watchDogTimeOut = 60 - # Name of the batch process. processName = "MutalyzerBatch2" + +# Return e-mail address. +mailFrom = "noreply@humgen.nl" + +# Location of the mail template. +mailMessage = "./mail.txt" + +# Subject of the message. +mailSubject = "Result of Mutalyzer batch check." + +# Location of the results. +resultsDir = "./var/cache" + + +# +# These settings are used by the File module. +# + +# Amount of bytes to be read for determining the file type. +bufSize = 32768 + +# The obligatory header in batch request files. +header = "AccNo", "Genesymbol", "Mutation" + +# Directory for temporary files. +tempDir = "./var" + + +# +# These settings are used by the GenRecord module. +# + +# Number of upstream nucleotides when searching for a transcript. +upstream = 5000 + +# Number of downstream nucleotides when searching for a transcript. 
+downstream = 2000 diff --git a/src/BatchChecker.py b/src/BatchChecker.py index da992a89..b22e740f 100644 --- a/src/BatchChecker.py +++ b/src/BatchChecker.py @@ -7,11 +7,11 @@ if len(sys.argv[0].split('/')) > 2 : os.chdir(sys.argv[0].rsplit('/', 2)[0]) from Modules import Config -from Modules import Db +from Modules.Db import Batch from Modules import Scheduler C = Config.Config() -D = Db.Db("local", C.Db.internalDb, C.Db) +D = Batch(C.Db) S = Scheduler.Scheduler(C.Scheduler, D) if not S.isDaemonRunning() : diff --git a/src/Modules/Config.py b/src/Modules/Config.py index 7e3ffa36..f8ff8d40 100644 --- a/src/Modules/Config.py +++ b/src/Modules/Config.py @@ -1,62 +1,137 @@ #!/usr/bin/python +""" + Module for reading the config file and splitting up the variables into + subclasses. Each of these subclasses are used to configure a specific + module. + + Public classes: + Config ; Read the configuration file and store the data in subclasses. +""" + class Config() : """ - Read the configuration file and store the data. - - Public variables: - # Used by the Retriever module: - email ; Email address used for Entrez. - cache ; Location of the cache directory. - cachesize ; Maximum size of the cache directory in bytes. - maxDldSize ; Maximum size of a GenBank record in bytes. - minDldSize ; Minimum size of a GenBank record in bytes. - - # Used by the Db module: - internalDb ; Name of the internal database. - dbNames ; Name of the mapping databases - LocalMySQLuser ; Username for the local databases. - RemoteMySQLuser ; Username for the remote UCSC database. - RemoteMySQLhost ; Hostname of the UCSC database server. - UpdateInterval ; Time window (in days) to search for updates. - TempFile ; Location for downloaded updates. - - # Used by the Output module: - log ; Name and location of the logfile. - datestring ; Prefix for log messages. - - # Used by the Mutator module: - flanksize ; Length of the flanking sequences in the - visualisation. 
- maxvissize ; Maximum length of the variation in the - visualisation. - flankclipsize ; Length of the inserted/deleted flanks. + Read the configuration file and store the data in subclasses. + Public subclasses: + Retriever ; Container for the Retriever configuration variables. + Db ; Container for the Db configuration variables. + Output ; Container for the Output configuration variables. + Mutator ; Container for the Mutator configuration variables. + Scheduler ; Container for the Scheduler configuration variables. + File ; Container for the File configuration variables. + GenRecord ; Container for the File configuration variables. Special Methods: - __init__ ; Read the configuration file. + __init__ ; Read the configuration file and initialise the + subclasses. """ class Retriever() : + """ + Container class for the Retriever configuration variables. + + Public variables: + email ; Email address used for Entrez. + cache ; Location of the cache directory. + cachesize ; Maximum size of the cache directory in bytes. + maxDldSize ; Maximum size of a GenBank record in bytes. + minDldSize ; Minimum size of a GenBank record in bytes. + """ + pass #Retriever class Db() : - pass + """ + Container class for the Db configuration variables. + + Public variables: + internalDb ; Name of the internal database. + dbNames ; Name of the mapping databases + LocalMySQLuser ; Username for the local databases. + LocalMySQLhost ; Hostname of the local databases. + + RemoteMySQLuser ; Username for the remote UCSC database. + RemoteMySQLhost ; Hostname of the UCSC database server. + UpdateInterval ; Time window (in days) to search for + updates. + TempFile ; Location for downloaded updates. + """ #Db class Output() : + """ + Container class for the Output configuration variables. + + Public variables: + log ; Name and location of the logfile. + datestring ; Prefix for log messages. + loglevel ; Default level for logging. + outputlevel ; Default level for output. 
+ """ + pass #Output class Mutator() : + """ + Container class for the Mutator configuration variables. + + Public variables: + flanksize ; Length of the flanking sequences in the + visualisation. + maxvissize ; Maximum length of the variation in the + visualisation. + flankclipsize ; Length of the inserted/deleted flanks. + """ + pass #Mutator class Scheduler() : + """ + Container class for the Scheduler configuration variables. + + Public variables: + processName ; Name of the scheduler in the process list. + mailFrom ; Return e-mail address. + mailMessage ; Template e-mail. + mailSubject ; Subject of the e-mail. + resultsDir ; Location of the results. + """ + pass #Scheduler + class File() : + """ + Container class for the File configuration variables. + + Public variables: + bufSize ; Amount of bytes to be read for determining the file + type. + header ; The obligatory header in batch request files. + tempDir ; Directory for temporary files. + """ + + pass + #File + + class GenRecord() : + """ + Container class for the GenRecord configuration variables. + + Public variables: + upstream ; Number of upstream nucleotides when searching for a + transcript. + downstream ; Number of downstream nucleotides when searching for a + transcript. + """ + + pass + #File + def __init__(self) : """ Initialise the class with variables read from the configuration @@ -64,33 +139,12 @@ class Config() : hard coded constant is used (the name and path to the configuration file). - Public variables (altered): - # Used by the Retriever module: - email ; Email address used for Entrez. - cache ; Location of the cache directory. - cachesize ; Maximum size of the cache directory in bytes. - maxDldSize ; Maximum size of a GenBank record in bytes. - minDldSize ; Minimum size of a GenBank record in bytes. - - # Used by the Db module: - internalDb ; Name of the internal database. - dbNames ; Name of the mapping databases - LocalMySQLuser ; Username for the local databases. 
- RemoteMySQLuser ; Username for the remote UCSC database. - RemoteMySQLhost ; Hostname of the UCSC database server. - UpdateInterval ; Time window (in days) to search for updates. - TempFile ; Location for downloaded updates. - - # Used by the Output module: - log ; Name and location of the logfile. - datestring ; Prefix for log messages. - - # Used by the Mutator module: - flanksize ; Length of the flanking sequences in the - visualisation. - maxvissize ; Maximum length of the variation in the - visualisation. - flankclipsize ; Length of the inserted/deleted flanks. + Public subclasses (altered): + Retriever ; Initialised with Retriever configuration variables. + Db ; Initialised with Db configuration variables. + Output ; Initialised with Output configuration variables. + Mutator ; Initialised with Mutator configuration variables. + Scheduler ; Initialised with Scheduler configuration variables. """ from configobj import ConfigObj # ConfigObj() @@ -106,7 +160,9 @@ class Config() : # Set the variables needed by the Db module. self.Db.internalDb = config["internalDb"] self.Db.dbNames = config["dbNames"] + self.Db.LocalMySQLuser = config["LocalMySQLuser"] + self.Db.LocalMySQLhost = config["LocalMySQLhost"] self.Db.RemoteMySQLuser = config["RemoteMySQLuser"] self.Db.RemoteMySQLhost = config["RemoteMySQLhost"] self.Db.UpdateInterval = int(config["UpdateInterval"]) @@ -124,8 +180,20 @@ class Config() : self.Mutator.flankclipsize = int(config["flankclipsize"]) # Set the variables needed by the Scheduler module. - self.Scheduler.watchDogTimeOut = int(config["watchDogTimeOut"]) self.Scheduler.processName = config["processName"] + self.Scheduler.mailFrom = config["mailFrom"] + self.Scheduler.mailMessage = config["mailMessage"] + self.Scheduler.mailSubject = config["mailSubject"] + self.Scheduler.resultsDir = config["resultsDir"] + + # Set the variables needed by the File module. 
+ self.File.bufSize = int(config["bufSize"]) + self.File.header = config["header"] + self.File.tempDir = config["tempDir"] + + # Set the variables needed by the GenRecord module. + self.File.upstream = int(config["upstream"]) + self.File.downstream = int(config["downstream"]) #__init__ #Config diff --git a/src/Modules/Crossmap.py b/src/Modules/Crossmap.py index 58e336b5..37ccf01e 100644 --- a/src/Modules/Crossmap.py +++ b/src/Modules/Crossmap.py @@ -1,5 +1,15 @@ #!/usr/bin/python +""" + Module for conversion from genomic coordinates to coding sequence + orientated coordinates and vice versa. + The conversions are done based upon a list of splice sites, the CDS start + and stop and the orientation of a transcript. + + Public classes: + Crossmap ; Convert from g. to c. or n. notation or vice versa. +""" + class Crossmap() : """ Convert from g. to c. or n. notation or vice versa. diff --git a/src/Modules/Db.py b/src/Modules/Db.py index a1abc211..ada5573a 100644 --- a/src/Modules/Db.py +++ b/src/Modules/Db.py @@ -1,13 +1,34 @@ #!/usr/bin/python +""" + Module for database access. + The Db class is a superclass of the rest of the classes and should not be + used as such. The superclass mainly consists of a wrapper for SQL + statements. + + + Public classes: + Db ; Log in to a database and keep it open for queries. + Mapping ; Mapping of transcripts and genes. + Remote ; Retrieving updates for the mapping databases. + Update ; Updating the mapping databases. + Cache ; Cache administration. + Batch ; Batch checker. +""" + import MySQLdb # connect(), escape_string() import types # TupleType -import time +import time # strftime() +import os # os.remove() -#from Output import Output -import os # os.remove() +from Modules import Misc # ID() -from Modules import Misc +# +# Note that compound queries are split into single queries because of a bug +# in MySQLdb. 
The functions load_Update(), merge_cdsUpdates() and +# merge_Update (search for MYSQL_BUG in this file) are affected and may be +# rewritten when this bug is fixed. +# class Db() : """ @@ -17,116 +38,30 @@ class Db() : __db ; Interface to the database. Special methods: - __init__(config, where) ; Do the login. - - Private methods: - __query(statement) ; General query function. + __init__(dbName, mySqlUser, mySqlHost) ; Do the login. Public methods: - # For mapping. - get_protAcc(mrnaAcc) ; Query the database for a protein ID. - get_NM_info(mrnaAcc) ; Retrieve various data for an NM number. - get_NM_version(mrnaAcc) ; Get the version number of an accession - number. - get_Transcripts(chrom, ; Get a list of transcripts, given a - position, chromosome and a range. - overlap) - get_GeneName(mrnaAcc) ; Get the gene name, given an NM number. - isChrom(name) ; Check whether we know this name to be - a chromosome name. - - # For updating mapping information - get_Update() ; Retrieve new mapping info from the UCSC. - load_Update() ; Load new mapping info into the local - database. - count_Updates() ; Count the number of entries in the new - mapping info table. - backup_cdsUpdates() ; Make a backup of updates that overwrite - the old mapping info. - count_cdsUpdates() ; Count the number of updates that - overwrite the old mapping info. - merge_cdsUpdates() ; Merge the backup of old mapping info - with the other old info. - merge_Update() ; Merge the new mapping info from the - UCSC with what we already have. - - # For cache administration. - insertGB(AccNo, GI, md5, ; Insert info about a GenBank record. - ChrAccVer, - ChrStart, - ChrStop, - orientation, - url) - updateHash(AccNo, md5) ; Update the hash of an accession number. - getGBFromLoc(ChrAccVer, ; Get the accession number from slicing - ChrStart, information. - ChrStop, - orientation) - getGBFromHash(md5) ; Get the accession number from its hash. - getGBFromGI(GI) ; Get the accession number from its GI - number. 
- getLoc(AccNo) ; Get the slicing information of an - accession number. - getHash(AccNo) ; Get the hash of a GenBank record. - getUrl(AccNo) ; Get the URL of an accession number. - - Inherited from Output.Config: - internalDb ; Name of the internal database. - RemoteMySQLuser ; MySQL username for the UCSC database. - RemoteMySQLhost ; Host name for the UCSC database. - LocalMySQLuser ; MySQL username for the local databases. - UpdateInterval ; The size of the time window. - TempFile ; The name and location of the temporary file. This - file is created if it doesn't exist and is - overwritten if it does exist. The function - load_Update() will remove this file. + query(statement) ; General query function. """ - # - # Note that compound queries are split into single queries because of a bug - # in MySQLdb. The functions load_Update(), merge_cdsUpdates() and - # merge_Update (search for MYSQL_BUG in this file) are affected and may be - # rewritten when this bug is fixed. - # - - def __init__(self, where, dbName, config) : + def __init__(self, dbName, mySqlUser, mySqlHost) : """ - Log in to the database. The username and the name of the - database are given in the configuration file. + Log in to the database. Arguments: - where ; A switch to see which database to use: - local ; Use the database on localhost. - remote ; Use the UCSC database. - dbName ; The name of the database to use (hg18 or hg19). + dbName ; The name of the database to use. + mySqlUser ; User name for the database. + mySqlHost ; Host name for the database. Private variables (altered): __db ; The interface to the database. - - Inherited variables from Output.Config: - internalDb ; Name of the internal database. - RemoteMySQLuser ; MySQL username for the UCSC database. - RemoteMySQLhost ; Host name for the UCSC database. - LocalMySQLuser ; MySQL username for the local databases. 
""" - #Output.__init__(self, __file__) - self.__config = config - - self.opened = False - if dbName in self.__config.dbNames or dbName == self.__config.internalDb : - if where == "remote" : - self.__db = MySQLdb.connect(user = self.__config.RemoteMySQLuser, - db = dbName, - host = self.__config.RemoteMySQLhost) - else : - self.__db = MySQLdb.connect(user = self.__config.LocalMySQLuser, - db = dbName) - self.opened = True - #if + self.__db = MySQLdb.connect(user = mySqlUser, db = dbName, + host = mySqlHost) #__init__ - def __query(self, statement) : + def query(self, statement) : """ Query the database. @@ -167,11 +102,47 @@ class Db() : cursor.close() return result - #__query + #query +#Db + +class Mapping(Db) : + """ + Database functions for mapping of transcripts and genes. + + Special methods: + __init__(build, config) ; Initialise the class. + + Public methods: + get_protAcc(mrnaAcc) ; Query the database for a protein ID. + get_NM_info(mrnaAcc) ; Retrieve various data for an NM number. + get_NM_version(mrnaAcc) ; Get the version number of an accession + number. + get_Transcripts(chrom, ; Get a list of transcripts, given a + position, chromosome and a range. + overlap) + get_GeneName(mrnaAcc) ; Get the gene name, given an NM number. + isChrom(name) ; Check whether we know this name to be + a chromosome name. + + Inherited methods from Db: + query(statement) ; General query function. + + SQL tables from dbNames: + map ; Accumulated mapping info. + """ + + def __init__(self, build, config) : + """ + Initialise the Db parent class. Use the local database for a + certain build. - # - # These methods are used for mapping. - # + Arguments: + build ; The version of the mapping database. + config ; Configuration variables. 
+ """ + + Db.__init__(self, build, config.LocalMySQLuser, config.LocalMySQLhost) + #__init__ def get_protAcc(self, mrnaAcc) : """ @@ -193,7 +164,7 @@ class Db() : WHERE acc = %s; """, mrnaAcc - return self.__query(statement)[0][0] + return self.query(statement)[0][0] #get_protAcc def get_NM_info(self, mrnaAcc) : @@ -222,7 +193,7 @@ class Db() : WHERE acc = %s; """, mrnaAcc - return self.__query(statement)[0] + return self.query(statement)[0] #get_NM_info def get_NM_version(self, mrnaAcc) : @@ -245,7 +216,7 @@ class Db() : WHERE acc = %s; """, mrnaAcc - ret = self.__query(statement) + ret = self.query(statement) if ret : return int(ret[0][0]) return 0 @@ -295,7 +266,7 @@ class Db() : #else ret = [] # Convert the results to a normal list. - for i in self.__query(statement) : + for i in self.query(statement) : ret.append(i[0] + '.' + str(self.get_NM_version(i[0]))) return ret #get_Transcripts @@ -320,7 +291,7 @@ class Db() : WHERE acc = %s; """, mrnaAcc - return self.__query(statement)[0][0] + return self.query(statement)[0][0] #get_GeneName def isChrom(self, name) : @@ -344,14 +315,96 @@ class Db() : WHERE chrom = %s; """, name - if int(self.__query(statement)[0][0]) > 0 : + if int(self.query(statement)[0][0]) > 0 : return True return False #isChrom - # - # These methods are used for updating the mapping information. - # + def chromName(self, accNo) : + """ + Get the name of a chromosome, given an accession number. + + Arguments: + accNo ; The accession number of a chromosome. + + SQL tables from dbNames: + ChrName ; Assembly release notes. + + Returns: + string ; The name of a chromosome. + """ + + statement = """ + SELECT name + FROM ChrName + WHERE AccNo = %s; + """, accNo + + return self.query(statement)[0][0] + #chromName + + def chromAcc(self, name) : + """ + Get the accession number of a chromosome, given a name. + + Arguments: + name ; The name of a chromosome. + + SQL tables from dbNames: + ChrName ; Assembly release notes. 
+ + Returns: + string ; The accession number of a chromosome. + """ + + statement = """ + SELECT AccNo + FROM ChrName + WHERE name = %s; + """, name + + return self.query(statement)[0][0] + #chromAcc +#Mapper + +class Remote(Db) : + """ + Database functions for retrieving updates for the mapping databases. + + Special methods: + __init__(config) ; Initialise the class. + + Public methods: + get_Update() ; Retrieve new mapping info from the UCSC. + + Inherited methods from Db: + query(statement) ; General query function. + + SQL tables from dbNames: + gbStatus ; acc -> version mapping (NM to NM + version), + type, modDate + refGene ; name -> geneName mapping (NM to gene name), + txStart, txEnd, cdsStart, cdsEnd, exonStarts, + exonEnds, chrom, strand. + refLink ; mrnaAcc -> protAcc mapping (NM to NP). + """ + + def __init__(self, build, config) : + """ + Initialise the Db parent class. Use the remote database for a + certain build. + + Arguments: + build ; The version of the mapping database. + config ; Configuration variables. + + Private variables (altered): + __config ; Configuration variables. + """ + + self.__config = config + Db.__init__(self, build, config.RemoteMySQLuser, config.RemoteMySQLhost) + #__init__ def get_Update(self) : """ @@ -364,13 +417,6 @@ class Db() : the load_Update() function. - Inherited variables from Output.Config: - UpdateInterval ; The size of the time window. - TempFile ; The name and location of the temporary file. - This file is created if it doesn't exist and - is overwritten if it does exist. The function - load_Update() will remove this file. - SQL tables from dbNames: gbStatus ; acc -> version mapping (NM to NM + version), type, modDate @@ -394,7 +440,7 @@ class Db() : handle = open(self.__config.TempFile, "w") # Convert the results to a tab delimited file. - for i in self.__query(statement) : + for i in self.query(statement) : for j in i : handle.write(str(j) + chr(0x09)) # 0x09 is a TAB. 
handle.write('\n') @@ -402,6 +448,53 @@ class Db() : handle.close() #get_Update +#Remote + +class Update(Db) : + """ + Database functions for updating the mapping databases. + + Public methods: + load_Update() ; Load new mapping info into the local database. + count_Updates() ; Count the number of entries in the new + mapping info table. + backup_cdsUpdates() ; Make a backup of updates that overwrite the + old mapping info. + count_cdsUpdates() ; Count the number of updates that overwrite + the old mapping info. + merge_cdsUpdates() ; Merge the backup of old mapping info with the + other old info. + merge_Update() ; Merge the new mapping info from the UCSC with + what we already have. + + Inherited methods from Db: + query(statement) ; General query function. + + SQL tables from dbNames: + map ; Accumulated mapping info. + map_temp ; Newly found data. + map_new ; Merge of map_temp and map. + map_cdsBackup_temp ; Entries that were updated without an increment + of the version number. + map_cdsBackup ; Merge of map_cdsBackup_temp and itself. + """ + + def __init__(self, build, config) : + """ + Initialise the Db parent class. Use the remote database for a + certain build. + + Arguments: + build ; The version of the mapping database. + config ; Configuration variables. + + Private variables (altered): + __config ; Configuration variables. + """ + + self.__config = config + Db.__init__(self, build, config.LocalMySQLuser, config.LocalMySQLhost) + #__init__ def load_Update(self) : """ @@ -409,12 +502,6 @@ class Db() : configuration file) created by the get_Update() function and import it in the local database. - Inherited variables from Config: - TempFile ; The name and location of the temporary file. This - file is created by the get_Update() function. After - the local import is complete, this file will be - removed. - SQL tables from dbNames (altered): map_temp ; Created and loaded with data from TempFile. 
@@ -428,13 +515,13 @@ class Db() : statement = """ CREATE TABLE map_temp LIKE map; """, None - self.__query(statement) + self.query(statement) statement = """ LOAD DATA LOCAL INFILE %s INTO TABLE map_temp; """, self.__config.TempFile - self.__query(statement) + self.query(statement) os.remove(self.__config.TempFile) #load_Update @@ -457,7 +544,7 @@ class Db() : FROM map_temp; """, None - return int(self.__query(statement)[0][0]) + return int(self.query(statement)[0][0]) #count_Updates def backup_cdsUpdates(self) : @@ -490,7 +577,7 @@ class Db() : ); """, None - self.__query(statement) + self.query(statement) #backup_cdsUpdates def count_cdsUpdates(self) : @@ -514,7 +601,7 @@ class Db() : FROM map_cdsBackup_temp; """, None - return int(self.__query(statement)[0][0]) + return int(self.query(statement)[0][0]) #count_cdsUpdates def merge_cdsUpdates(self) : @@ -537,12 +624,12 @@ class Db() : SELECT * FROM map_cdsBackup_temp; """, None - self.__query(statement) + self.query(statement) statement = """ DROP TABLE map_cdsBackup_temp; """, None - self.__query(statement) + self.query(statement) #merge_cdsUpdates def merge_Update(self) : @@ -574,38 +661,83 @@ class Db() : AND map.txStart = map_temp.txStart ); """, None - self.__query(statement) + self.query(statement) statement = """ DROP TABLE map; """, None - self.__query(statement) + self.query(statement) statement = """ CREATE TABLE map SELECT * FROM map_new; """, None - self.__query(statement) + self.query(statement) statement = """ DROP TABLE map_new; """, None - self.__query(statement) + self.query(statement) statement = """ DROP TABLE map_temp; """, None - self.__query(statement) + self.query(statement) #merge_Update +#Update - # - # These methods are used for cache administration. - # +class Cache(Db) : + """ + Database functions for cache administration. - def insertGB(self, AccNo, GI, md5, ChrAccVer, ChrStart, + Special methods: + __init__(config) ; Initialise the class. 
+ + Public methods: + insertGB(accNo, GI, ; Insert info about a GenBank record. + fileHash, + ChrAccVer, + ChrStart, + ChrStop, + orientation, + url) + updateHash(accNo, ; Update the hash of an accession number. + fileHash) + getGBFromLoc(ChrAccVer, ; Get the accession number from slicing + ChrStart, information. + ChrStop, + orientation) + getGBFromHash(fileHash) ; Get the accession number from its hash. + getGBFromGI(GI) ; Get the accession number from its GI + number. + getLoc(accNo) ; Get the slicing information of an + accession number. + getHash(accNo) ; Get the hash of a GenBank record. + getUrl(accNo) ; Get the URL of an accession number. + + Inherited methods from Db: + query(statement) ; General query function. + + SQL tables from internalDb: + GBInfo ; Information about cached and uploaded GenBank files. + """ + + def __init__(self, config) : + """ + Initialise the Db parent class. Use the internalDb. + + Arguments: + config ; Configuration variables. + """ + + Db.__init__(self, config.internalDb, config.LocalMySQLuser, + config.LocalMySQLhost) + #__init__ + + def insertGB(self, accNo, GI, fileHash, ChrAccVer, ChrStart, ChrStop, orientation, url) : """ Insert information about a GenBank record in the internal database. - The AccNo and md5 arguments are mandatory. + The accNo and fileHash arguments are mandatory. - If the record is a normal RefSeq, then the GI number should be provided. - If the record is a chromosome slice, then the ChrAccVer, @@ -616,9 +748,9 @@ class Db() : is assumed to be uploaded. Arguments: - AccNo ; The name associated with this record. + accNo ; The name associated with this record. GI ; The GI number (if available). - md5 ; The md5sum of the content of the record. + fileHash ; The hash of the content of the record. ChrAccVer ; The accession number of the chromosome (if available). 
ChrStart ; The start of the record in chromosomal @@ -637,18 +769,19 @@ class Db() : statement = """ INSERT INTO GBInfo VALUES (%s, %s, %s, %s, %s, %s, %s, %s); - """, (AccNo, GI, md5, ChrAccVer, ChrStart, ChrStop, orientation, url) + """, (accNo, GI, fileHash, ChrAccVer, ChrStart, ChrStop, orientation, + url) - self.__query(statement) + self.query(statement) #insertGB - def updateHash(self, AccNo, md5) : + def updateHash(self, accNo, fileHash) : """ Update the hash of an accession number. Arguments: - AccNo ; The accession number of a GenBank record. - hash ; The hash of a GenBank record. + accNo ; The accession number of a GenBank record. + fileHash ; The hash of a GenBank record. SQL tables from internalDb (altered): GBInfo ; Information about cached and uploaded GenBank files. @@ -658,9 +791,9 @@ class Db() : UPDATE GBInfo SET hash = %s WHERE AccNo = %s; - """, (md5, AccNo) + """, (fileHash, accNo) - self.__query(statement) + self.query(statement) #updateHash def getGBFromLoc(self, ChrAccVer, ChrStart, ChrStop, orientation) : @@ -692,18 +825,18 @@ class Db() : AND orientation = %s; """, (ChrAccVer, ChrStart, ChrStop, orientation) - ret = self.__query(statement) + ret = self.query(statement) if ret : return ret[0][0] return None #getGBFromLoc - def getGBFromHash(self, md5) : + def getGBFromHash(self, fileHash) : """ Get the accession number from its hash. Arguments: - hash ; The hash of a GenBank record. + fileHash ; The hash of a GenBank record. SQL tables from internalDb: GBInfo ; Information about cached and uploaded GenBank files. 
@@ -716,9 +849,9 @@ class Db() : SELECT AccNo FROM GBInfo WHERE hash = %s; - """, md5 + """, fileHash - ret = self.__query(statement) + ret = self.query(statement) if ret : return ret[0][0] return None @@ -745,19 +878,19 @@ class Db() : WHERE GI = %s; """, GI - ret = self.__query(statement) + ret = self.query(statement) if ret : return ret[0][0] return None #getGBFromGI - def getLoc(self, AccNo) : + def getLoc(self, accNo) : """ Get the slicing information of an accession number, typically this only affects UD numbers. Arguments: - AccNo ; The accession number of a genbank record. + accNo ; The accession number of a genbank record. SQL tables from internalDb: GBInfo ; Information about cached and uploaded GenBank files. @@ -775,20 +908,20 @@ class Db() : SELECT ChrAccVer, ChrStart, ChrStop, orientation FROM GBInfo WHERE AccNo = %s; - """, AccNo + """, accNo - ret = self.__query(statement) + ret = self.query(statement) if ret : return list(ret[0]) return None #getLoc - def getHash(self, AccNo) : + def getHash(self, accNo) : """ Get the hash of a GenBank record identified by an accession number. Arguments: - AccNo ; The accession number of a genbank record. + accNo ; The accession number of a genbank record. SQL tables from internalDb: GBInfo ; Information about cached and uploaded GenBank files. @@ -801,21 +934,21 @@ class Db() : SELECT hash FROM GBInfo WHERE AccNo = %s; - """, AccNo + """, accNo - ret = self.__query(statement) + ret = self.query(statement) if ret : return ret[0][0] return None #getHash - def getUrl(self, AccNo) : + def getUrl(self, accNo) : """ Get the URL of an accession number, typically this only affects uploaded UD numbers. Arguments: - AccNo ; The accession number of a genbank record. + accNo ; The accession number of a genbank record. SQL tables from internalDb: GBInfo ; Information about cached and uploaded GenBank files. 
@@ -828,61 +961,89 @@ class Db() : SELECT url FROM GBInfo WHERE AccNo = %s; - """, AccNo + """, accNo - ret = self.__query(statement) + ret = self.query(statement) if ret : return ret[0][0] return None #getHash - def getGI(self, AccNo) : + def getGI(self, accNo) : """ + Get the GI number that is connected to the accession number. + + Arguments: + accNo ; The accession number. + + SQL tables from internalDb: + GBInfo ; Information about cached and uploaded GenBank files. """ + statement = """ SELECT GI FROM GBInfo WHERE AccNo = %s; - """, AccNo + """, accNo - ret = self.__query(statement) + ret = self.query(statement) if ret : return ret[0][0] return None #getGI +#Cache - # - # These methods are for the batch checker. - # +class Batch(Db) : + """ + Database functions for the batch checker. - def getWatchDogTimer(self) : - """ - """ + Special methods: + __init__(config) ; Initialise the class. - statement = """ - SELECT Value - FROM Var - WHERE Name = "WatchDog"; - """, None + Public methods: + isJobListEmpty() ; See if there are active jobs. + addJob(outputFilter, ; Add a job and give it a unique ID. + email, + fromHost) + getJobs() ; Get a list of active jobs. + removeJob(jobID) ; Remove a job and return information about + the job submitter. + addToQueue(jobID, ; Add a request belonging to a certain job to + accNo, the queue. + gene, + variant) + getFromQueue(jobID) ; Get a request belonging to a certain job + from the queue. + + Inherited methods from Db: + query(statement) ; General query function. - return int(self.__query(statement)[0][0]) - #getWatchDogTimer + SQL tables from internalDb: + BatchJob ; Job information. + BatchQueue ; Requests. + """ - def setWatchDogTimer(self) : - """ + def __init__(self, config) : """ + Initialise the Db parent class. Use the internalDb. - statement = """ - UPDATE Var - SET Value = %s - WHERE Name = "WatchDog"; - """, time.strftime("%s") + Arguments: + config ; Configuration variables. 
+ """ - self.__query(statement) - #setWatchDogTimer + Db.__init__(self, config.internalDb, config.LocalMySQLuser, + config.LocalMySQLhost) + #__init__ def isJobListEmpty(self) : """ + See if there are active jobs. + + SQL tables from internalDb: + BatchJob ; Job information. + + Returns: + boolean ; False if there are active jobs, True otherwise. """ statement = """ @@ -890,13 +1051,24 @@ class Db() : FROM BatchJob; """, None - if int(self.__query(statement)[0][0]) : + if int(self.query(statement)[0][0]) : return False return True #isJobListEmpty - def addJob(self, output_filter, email) : + def addJob(self, outputFilter, email, fromHost) : """ + Add a job and give it a unique ID. + + Arguments: + outputFilter ; Output settings for all requests in this job. + email ; Contact information of the submitter. + + SQL tables from internalDb (altered): + BatchJob ; Job information. + + Returns: + int ; A job ID. """ M = Misc.Misc() @@ -904,15 +1076,22 @@ class Db() : del M statement = """ INSERT INTO BatchJob - VALUES (%s, %s, %s); - """, (jobID, output_filter, email) + VALUES (%s, %s, %s, %s); + """, (jobID, outputFilter, email, fromHost) - self.__query(statement) + self.query(statement) return jobID #addJob def getJobs(self) : """ + Get a list of active jobs. + + SQL tables from internalDb: + BatchJob ; Job information. + + Returns: + list ; List of job IDs. """ statement = """ @@ -921,46 +1100,85 @@ class Db() : """, None ret = [] - for i in self.__query(statement) : + for i in self.query(statement) : ret.append(i[0]) return ret #getJobs def removeJob(self, jobID) : """ - """ + Remove a job (because the queue for this job is empty) and return + information needed to alert the job submitter. + + Arguments: + jobID ; Identifier of a job. + + SQL tables from internalDb (altered): + BatchJob ; Job information. + Returns: + triple ; Data for the job submitter. + """ + + # First retrieve all information about this job. 
statement = """ - SELECT EMail + SELECT EMail, Filter, FromHost FROM BatchJob WHERE JobID = %s; """, jobID - eMail = self.__query(statement)[0][0] + data = self.query(statement)[0] + # Remove the job. statement = """ DELETE FROM BatchJob WHERE JobID = %s; """, jobID - self.__query(statement) - return eMail + self.query(statement) + return data #removeJob - def addToQueue(self, jobID, AccNo, Gene, Variant) : + def addToQueue(self, jobID, accNo, gene, variant) : """ + Add a request belonging to a certain job to the queue. + + Arguments: + jobID ; Identifier of a job. + accNo ; The accession number of a request. + gene ; The gene and transcript variant information. + variant ; The variant. + + SQL tables from internalDb (altered): + BatchQueue ; Requests. """ + # The first value (QueueID) will be auto increased by MySQL. statement = """ INSERT INTO BatchQueue VALUES (%s, %s, %s, %s, %s); - """, (None, jobID, AccNo, Gene, Variant) + """, (None, jobID, accNo, gene, variant) - self.__query(statement) + self.query(statement) #addToQueue def getFromQueue(self, jobID) : """ + Get a request belonging to a certain job from the queue. If a + request is found, remove it from the queue and return it. Otherwise + return nothing. + + Arguments: + jobID ; Identifier of a job. + + SQL tables from internalDb (altered): + BatchQueue ; Requests. + + Returns: + triple: + accNo ; The accession number of a request. + gene ; The gene and transcript variant information. + variant ; The variant. """ statement = """ @@ -971,35 +1189,27 @@ class Db() : LIMIT 1; """, jobID - results = self.__query(statement) + results = self.query(statement) if results : - queueID, accNo, gene, variant = results[0] + jobID, accNo, gene, variant = results[0] else : return None + # We have found a request, so remove it from the queue. 
statement = """ DELETE FROM BatchQueue WHERE QueueID = %s; - """, queueID + """, jobID - self.__query(statement) + self.query(statement) return accNo, gene, variant #getFromQueue -#Db +#Batch # # Unit test. # if __name__ == "__main__" : - # Get the username / db from the config file. - D = Db("local") - - # Do some basic testing (will crash if MySQL is not set up properly. - D.get_protAcc("NM_002001") - D.get_NM_info("NM_002001") - D.get_NM_version("NM_002001") - D.get_Transcripts("chr1", 159272155, 159272155, 0) - D.get_GeneName("NM_002001") - del D + pass #if diff --git a/src/Modules/File.py b/src/Modules/File.py new file mode 100644 index 00000000..713b399b --- /dev/null +++ b/src/Modules/File.py @@ -0,0 +1,322 @@ +#!/usr/bin/python + +""" + Module for parsing CSV files and spreadsheets. + + Public classes: + File ; Parse CSV files and spreadsheets. +""" + +import magic # open(), MAGIC_MIME, MAGIC_NONE +import csv # Sniffer(), reader(), Error +import xlrd # open_workbook() +import zipfile # ZipFile() +import xml.dom.minidom # parseString() +import os # remove() +import types # UnicodeType + +from Modules import Misc + +class File() : + """ + Parse CSV files and spreadsheets. + + Private variables: + __config ; Configuration variables. + __output ; The Output object. + + Special methods: + __init__(config, output) ; Initialse the class. + + Private methods: + __tempFileWrapper(func, ; Call func() with a filename. + handle) + __getMimeType(handle) ; Get the mime type of a stream. + __parseCsvFile(handle) ; Parse a CSV file. + __parseXlsFile(handle) ; Parse an Excel file. + __parseOdsFile(handle) ; Parse an OpenDocument Spreadsheet file. + __checkBatchFormat(job) ; Check a batch job and sanitize it. + + Public methods: + parseFileRaw(handle) ; Parse a stream with the appropriate parser. + parseBatchFile(handle) ; Parse a stream with the appropriate parser + and sanitize the output. + """ + + def __init__(self, config, output) : + """ + Initialise the class. 
+ + Private variables (altered): + __config ; Initialised with configuration variables. + __output ; Set to the Output object. + """ + + self.__config = config + self.__output = output + #__init__ + + def __tempFileWrapper(self, func, handle) : + """ + Make a temporary file, put the content of a stream in it and pass + the filename to a general function. Return whatever this function + returns. + + Arguments: + func ; A general function that needs a file name as argument. + handle ; A stream. + + Returns: + unknown ; The output of func(). + """ + + # Generate an unique filename in the tempDir directory. + MiscInstance = Misc.Misc() + fileName = self.__config.tempDir + '/' + str(MiscInstance.ID()) + del MiscInstance + + # Dump the content of the stream pointed to by handle into the file. + handle.seek(0) + writeHandle = open(fileName, "w") + writeHandle.write(handle.read()) + writeHandle.close() + + # Open the file with func(). + ret = func(fileName) + os.remove(fileName) + + return ret + #__tempFileWrapper + + def __getMimeType(self, handle) : + """ + Get the mime type of a stream by inspecting a fixed number of bytes. + The stream is not rewinded after use. + + Arguments: + handle ; A handle to a stream. + + Private variables: + __config ; The bufSize configuration variables. + + Returns: + string ; The mime type of a file. + """ + + handle.seek(0) + buf = handle.read(self.__config.bufSize) + + MagicInstance = magic.open(magic.MAGIC_MIME) + MagicInstance.load() + mimeType = MagicInstance.buffer(buf).split(';')[0] + MagicInstance.close() + MagicInstance = magic.open(magic.MAGIC_NONE) + MagicInstance.load() + description = MagicInstance.buffer(buf) + del MagicInstance + + return mimeType, description + #__getMimeType + + def __parseCsvFile(self, handle) : + """ + Parse a CSV file. + The stream is not rewinded after use. + + Arguments: + handle ; A handle to a stream. + + Private variables: + __config ; The bufSize configuration variables. 
+ + Returns: + list ; A list of lists. + """ + + handle.seek(0) + buf = handle.read(self.__config.bufSize) + + try : + dialect = csv.Sniffer().sniff(buf) + except csv.Error, e : + self.__output.addMessage(__file__, 4, "EBPARSE", e) + return None + #except + + handle.seek(0) + reader = csv.reader(handle, dialect) + + ret = [] + for i in reader : + ret.append(i) + + return ret + #__parseCsvFile + + def __parseXlsFile(self, handle) : + """ + Parse an Excel file. + The stream is not rewinded after use. + + Arguments: + handle ; A handle to a stream. + + Returns: + list ; A list of lists. + """ + + workBook = self.__tempFileWrapper(xlrd.open_workbook, handle) + sheet = workBook.sheet_by_index(0) + + ret = [] + for i in range(sheet.nrows) : + row = [] + for j in sheet.row_values(i) : + if type(j) == types.UnicodeType : # Convert the data to strings. + row.append(j.encode("utf8")) + else : + row.append(str(j)) + #for + ret.append(row) + #for + + del sheet, workBook + + return ret + #__parseXlsFile + + def __parseOdsFile(self, handle) : + """ + Parse an OpenDocument Spreadsheet file. + The stream is not rewinded after use. + + Arguments: + handle ; A handle to a stream. + + Returns: + list ; A list of lists. + + """ + + zipFile = self.__tempFileWrapper(zipfile.ZipFile, handle) + doc = xml.dom.minidom.parseString(zipFile.read("content.xml")) + zipFile.close() + + ret = [] + for i in doc.getElementsByTagName("table:table-row") : + row = [] + for j in i.getElementsByTagName("table:table-cell") : + c = j.getElementsByTagName("text:p") + if c : + row.append(c[0].lastChild.data.encode("utf8")) + #if + #for + ret.append(row) + #for + + return ret + #__parseOdsFile + + def __checkBatchFormat(self, job) : + """ + Check if a job is of the correct format. + - Each row should consist of three elements. + - The first and the last element should be non-zero. + - The first line should be the header defined in the config file. + - Silently ignore all empty lines. 
+ + Arguments: + job ; list of lists. + + Private variables: + __config ; The header configuration variable. + + Returns: + list ; A sanitised list of lists (without a header or empty + lines). + """ + + if job[0] != self.__config.header : + self.__output.addMessage(__file__, 4, "EBPARSE", + "Header not valid.") + return None + #if + + for i in range(0, len(job)) : + if job[i] : # Non empty line. + if len(job[i]) == 3 : + if job[i][0] or job[i][1] or job[i][2] : # Non empty line. + if not job[i][0] : + self.__output.addMessage(__file__, 4, "EBPARSE", + "The first column may not be empty in line " \ + "%i." % i) + return None + #if + if not job[i][2] : + self.__output.addMessage(__file__, 4, "EBPARSE", + "The last column may not be empty in line " \ + "%i." % i) + return None + #if + #if + #if + else : + self.__output.addMessage(__file__, 4, "EBPARSE", + "Wrong amount of columns in line %i.\n" % i) + return None + #else + #if + #for + + # All tests are passed, now we do some trimming. + ret = [] + for i in range(1, len(job)) : + if job[i] and job[i] != ['', '', ''] : + ret.append(job[i]) + + return ret + #__checkBatchFormat + + def parseFileRaw(self, handle) : + """ + Check which format a stream has and parse it with the appropriate + parser if the stream is recognised. + + Arguments: + handle ; A handle to a stream. + + Returns: + list ; A list of lists, None if an error occured. + """ + + mimeType = self.__getMimeType(handle) + if mimeType[0] == "text/plain" : + return self.__parseCsvFile(handle) + if mimeType[0] == "application/vnd.ms-office" : + return self.__parseXlsFile(handle) + if mimeType == ("application/octet-stream", + "OpenDocument Spreadsheet") : + return self.__parseOdsFile(handle) + + return None + #parseFile + + def parseBatchFile(self, handle) : + """ + Check which format a stream has and parse it with the appropriate + parser if the stream is recognised. + + Arguments: + handle ; A handle to a stream. 
+ + Returns: + list ; A sanitised list of lists (without a header or empty + lines), or None if an error occured. + """ + + job = self.parseFileRaw(handle) + if job : + return self.__checkBatchFormat(job) + return None + #parseBatchFile +#File diff --git a/src/Modules/GenRecord.py b/src/Modules/GenRecord.py index e8d21872..9af23978 100644 --- a/src/Modules/GenRecord.py +++ b/src/Modules/GenRecord.py @@ -1,6 +1,23 @@ #!/usr/bin/python -class Plist(object) : +import Crossmap +import Bio + +""" + Module to convert a GenBank record to a nested dictionary consisting of + a list of genes, which itself consists of a list of loci. This structure + makes it possible to iterate over genes and transcripts without having to + search for them each time. + + Public classes: + PList ; Store a general location and a list of splice sites. + Locus ; Store data about the mRNA and CDS splice sites. + Gene ; Store a list of Locus objects and the orientation. + Record ; Store a geneList and other additional information. + GenRecord ; Convert a GenBank record to a nested dictionary. +""" + +class PList(object) : """ A position list object, to store a general location and a list of specific splice sites (if available). @@ -30,9 +47,9 @@ class Plist(object) : """ self.location = [] - self.list = [] + self.positionList = [] #__init__ -#plist +#PList class Locus(object) : """ @@ -47,26 +64,42 @@ class Locus(object) : exon ; A position list object. """ - def __init__(self) : + def __init__(self, name) : """ Initialise the class. Public variables (altered): - mRNA ; A position list object. - CDS ; A position list object. - exon ; A position list object. - CM ; A Crossmap object. + mRNA ; A position list object. + CDS ; A position list object. + location ; + exon ; A position list object. + txTable ; The translation table. + CM ; A Crossmap object. 
""" + self.name = name self.mRNA = None self.CDS = None self.location = [] self.exon = None self.txTable = 1 self.CM = None - + self.transcriptID = None + self.proteinID = None + self.molType = 'c' + self.description = "" #__init__ -#locus + + def addToDescription(self, rawVariant) : + """ + """ + + if self.description : + self.description = "%s;%s" % (self.description, rawVariant) + else : + self.description = rawVariant + #addToDescription +#Locus class Gene(object) : """ @@ -82,7 +115,7 @@ class Gene(object) : list ; A list of Locus objects. """ - def __init__(self) : + def __init__(self, name) : """ Initialise the class. @@ -91,21 +124,32 @@ class Gene(object) : list ; A list of Locus objects. """ - self.orientation = 0 - self.list = {} + self.name = name + self.orientation = 1 + self.transcriptList = [] #__init__ -#gene -class RecordObj(object) : + def findLocus(self, name) : + """ + """ + + for i in self.transcriptList : + if i.name == name : + return i + return None + #findLocus +#Gene + +class Record(object) : """ - A RecordObj object, to store a genelist and other additional + A Record object, to store a geneList and other additional information. Special methods: __init__() ; Initialise the class. Public variables: - genelist ; List of Gene objects. + geneList ; List of Gene objects. mol_type ; Variable to indicate the sequence type (DNA, RNA, ...) organelle ; Variable to indicate whether the sequence is from the nucleus or from an onganelle (if so, also from which @@ -120,7 +164,7 @@ class RecordObj(object) : Public variables (altered): - genelist ; List of Gene objects. + geneList ; List of Gene objects. mol_type ; Variable to indicate the sequence type (DNA, RNA, ...) organelle ; Variable to indicate whether the sequence is from @@ -130,16 +174,36 @@ class RecordObj(object) : information is present. 
""" - self.genelist = {} + self.geneList = [] self.mol_type = None self.organelle = None - self.source = Gene() + self.source = Gene(None) #__init__ -#RecordObj + + def hasGene(self, name) : + """ + """ + + for i in self.geneList : + if i.name == name : + return True + return False + #hasGene + + def findGene(self, name) : + """ + """ + + for i in self.geneList : + if i.name == name : + return i + return None + #findGene +#Record class GenRecord() : """ - Hmmmm. + Convert a GenBank record to a nested dictionary. Private methods: __location2pos(location) ; @@ -150,6 +214,15 @@ class GenRecord() : structured dictionary. """ + def __init__(self, config, output) : + """ + """ + + self.__config = config + self.__output = output + self.record = None + #__init__ + def __location2pos(self, location) : """ Convert a location object to a tuple of integers. @@ -184,6 +257,10 @@ class GenRecord() : ret = [] for i in locationList.sub_features : + if i.ref : # This is a workaround for a bug in BioPython. 
+ ret = None + break + #if temp = self.__location2pos(i.location) ret.append(temp[0]) ret.append(temp[1]) @@ -192,51 +269,74 @@ class GenRecord() : return ret #__locationList2posList - """ - def __sortins(self, position, posList) : - last = 0 - - for i in range(0, len(posList), 2) : - if position[0] == posList[i] : - return posList - if position[0] > last and position[0] < posList[i] : - return posList[:i] + position + posList[i:] - last = posList[i] - #for - return posList + position - #__sortins - """ + def __constructCDS(self, mRNA, CDSpos) : + """ + """ + + i = 1 + ret = [CDSpos[0]] + + while CDSpos[0] > mRNA[i] : + i += 2 + + j = i + while CDSpos[1] > mRNA[j] : + j += 2 + + ret.extend(mRNA[i:j]) + ret.append(CDSpos[1]) - def record2dict(self, record) : - recordDict = RecordObj() - #recordDict.genelist = {} + return ret + #__constructCDS + + def __maybeInvert(self, gene, string) : + """ + """ + + if gene.orientation == -1 : + return Bio.Seq.reverse_complement(string) + return string + #__maybeInvert + + def parseRecord(self, record) : + """ + Convert a GenBank record to a nested dictionary. + + Arguments: + record ; A GenBank record. + + Returns: + dict ; A nested dictionary. 
+ """ + + self.record = Record() for i in record.features : if i.qualifiers : if i.type == "source" : if i.qualifiers.has_key("organelle") : - recordDict.organelle = i.qualifiers["organelle"][0] + self.record.organelle = i.qualifiers["organelle"][0] if i.qualifiers.has_key("mol_type") : - recordDict.mol_type = i.qualifiers["mol_type"][0] + self.record.mol_type = i.qualifiers["mol_type"][0] - #recordDict["null"] = Gene() - recordDict.source.orientation = 1 - recordDict.source.list["001"] = Locus() - recordDict.source.list["001"].CDS = Plist() - recordDict.source.list["001"].CDS.location = \ - self.__location2pos(i.location) + fakeGene = Locus("001") + self.record.source.transcriptList.append(fakeGene) + fakeGene.CDS = PList() + fakeGene.CDS.location = self.__location2pos(i.location) + #if if i.qualifiers.has_key("gene") : gene = i.qualifiers["gene"][0] - if not recordDict.genelist.has_key(gene) : - recordDict.genelist[gene] = Gene() + + GeneInstance = self.record.findGene(gene) + if not GeneInstance : + GeneInstance = Gene(gene) + self.record.geneList.append(GeneInstance) + #if + if i.type == "gene" : if i.strand : - recordDict.genelist[gene].orientation = i.strand - else : - recordDict.genelist[gene].orientation = 1 - - recordDict.genelist[gene].location = \ - self.__location2pos(i.location) + GeneInstance.orientation = i.strand + GeneInstance.location = self.__location2pos(i.location) #if # Look if there is a locus tag present, if not, give it the @@ -244,73 +344,152 @@ class GenRecord() : locus_tag = "001" if i.qualifiers.has_key("locus_tag") : locus_tag = i.qualifiers["locus_tag"][0][-3:] - if not recordDict.genelist[gene].list.has_key(locus_tag) : - recordDict.genelist[gene].list[locus_tag] = Locus() + + LocusInstance = GeneInstance.findLocus(locus_tag) + if not LocusInstance : + LocusInstance = Locus(locus_tag) + GeneInstance.transcriptList.append(LocusInstance) + #if if i.type == "mRNA" : - recordDict.genelist[gene].list[locus_tag].mRNA = Plist() - 
recordDict.genelist[gene].list[locus_tag].mRNA.location = \ - self.__location2pos(i.location) - recordDict.genelist[gene].list[locus_tag].mRNA.list = \ - self.__locationList2posList(i) + PListInstance = PList() + LocusInstance.mRNA = PListInstance + + posList = self.__locationList2posList(i) + if posList != None : + PListInstance.location = \ + self.__location2pos(i.location) + PListInstance.positionList = posList + #if + #if if i.type == "CDS" : - recordDict.genelist[gene].list[locus_tag].CDS = Plist() - recordDict.genelist[gene].list[locus_tag].CDS.location = \ - self.__location2pos(i.location) - recordDict.genelist[gene].list[locus_tag].CDS.list = \ + PListInstance = PList() + LocusInstance.CDS = PListInstance + + PListInstance.location = self.__location2pos(i.location) + PListInstance.positionList = \ self.__locationList2posList(i) + if i.qualifiers.has_key("transl_table") : - recordDict.genelist[gene].list[locus_tag].txTable = \ + LocusInstance.txTable = \ int(i.qualifiers["transl_table"][0]) #if if i.type == "exon" : - if not recordDict.genelist[gene].list[locus_tag].exon : - recordDict.genelist[gene].list[locus_tag].exon = Plist() - recordDict.genelist[gene].list[locus_tag].exon.list.extend( + if not LocusInstance.exon : + PListInstance = PList() + LocusInstance.exon = PListInstance + #if + PListInstance.positionList.extend( self.__location2pos(i.location)) + #if #if #if #for - return recordDict - #record2dict - - def printRecordDict(self, d, record) : - for i in d : - print i - print " Orientation: " + str(d[i].orientation) - for j in d[i].list : - print " Locus: " + str(j) - if d[i].list[j].mRNA : - print " mRNA: " - print " " + str(d[i].list[j].mRNA.location) - if d[i].list[j].mRNA.list : - print " " + str(d[i].list[j].mRNA.list) - print splice(record, d[i].list[j].mRNA.list) + # Now we have gathered all information. 
+ for i in self.record.geneList : + for j in i.transcriptList : + if not j.mRNA : + if not j.exon: + self.__output.addMessage(__file__, 2, "WNOMRNA", + "No mRNA field found for gene %s, transcript " \ + "variant %s in GenBank record %s, constructing " \ + "it from CDS." % (i.name, j.name, record.id)) + if j.CDS : + if not j.CDS.positionList : + self.__output.addMessage(__file__, 2, + "WNOCDSLIST", "No CDS list found for " \ + "gene %s, transcript variant %s in " \ + "GenBank record %s, constructing it from " \ + "CDS location." % (i.name, j.name, + record.id)) + j.mRNA = j.CDS + j.mRNA.positionList = j.CDS.location + #if + else : + j.mRNA = j.CDS + #if + else : + self.__output.addMessage(__file__, 2, "WNOCDS", + "No CDS found for gene %s, transcript " \ + "variant %s in GenBank record %s, " \ + "constructing it from genelocation." % ( + i.name, j.name, record.id)) + j.CDS = GenRecord.Locus() + j.CDS.location = j.location + j.mRNA = j.CDS + j.mRNA.positionList = i.location + j.molType = 'n' + #else #if else : - print splice(record, d[i].list[j].mRNA.location) + self.__output.addMessage(__file__, 2, "WNOMRNA", + "No mRNA field found for gene %s, transcript " \ + "variant %s in GenBank record %s, constructing " \ + "it from gathered exon information." % ( + i.name, j.name, record.id)) + j.mRNA = j.exon + #else + #if + if not j.mRNA.positionList : + j.mRNA.positionList = j.mRNA.location + if j.CDS : + if not j.CDS.positionList : + self.__output.addMessage(__file__, 2, "WNOCDS", + "No CDS list found for gene %s, transcript " \ + "variant %s in GenBank record %s, constructing " \ + "it from mRNA list and CDS location." 
% (i.name, + j.name, record.id)) + if j.mRNA.positionList : + j.CDS.positionList = self.__constructCDS( + j.mRNA.positionList, j.CDS.location) + else : + j.CDS.positionList = self.__constructCDS( + j.mRNA.location, j.CDS.location) + #if + j.CM = Crossmap.Crossmap(j.mRNA.positionList, + j.CDS.location, i.orientation) #if - if d[i].list[j].CDS : - print " CDS: " - print " " + str(d[i].list[j].CDS.location) - if d[i].list[j].CDS.list : - print " " + str(d[i].list[j].CDS.list) - print splice(record, d[i].list[j].CDS.list) + else : + j.molType = 'n' + if j.mRNA.positionList : + j.CM = Crossmap.Crossmap(j.mRNA.positionList, + [], i.orientation) + else : + j.description = '?' + #else + #for + #for + #parseRecord + + def name(self, start, stop, varType, arg1, arg2) : + """ + """ + + for i in self.record.geneList : + for j in i.transcriptList : + if j.CM : + if varType != "subst" : + if start != stop : + j.addToDescription("%s_%s%s%s" % (j.CM.g2c(start), + j.CM.g2c(stop), varType, + self.__maybeInvert(i, arg1))) + else : + j.addToDescription("%s%s%s" % (j.CM.g2c(start), + varType, self.__maybeInvert(i, arg1))) #if else : - print splice(record, d[i].list[j].CDS.location) + j.addToDescription("%s%c>%c" % (j.CM.g2c(start), + self.__maybeInvert(i, arg1), + self.__maybeInvert(i, arg2))) #if #for #for - #printRecordDict + #name #GenRecord if __name__ == "__main__" : R = GenRecord() - bla = R._GenRecord__sortins([10, 20], [4, 5]) - print R._GenRecord__sortins([1, 2], bla) - print R._GenRecord__sortins([8, 9], bla) del R #if diff --git a/src/Modules/Mapper.py b/src/Modules/Mapper.py index dbb836c1..a03e7005 100644 --- a/src/Modules/Mapper.py +++ b/src/Modules/Mapper.py @@ -30,7 +30,7 @@ from soaplib.serializers.primitive import String, Integer from soaplib.serializers.clazz import ClassSerializer class Mapping(ClassSerializer) : - ''' + """ Extended ClassSerializer object with mixed types of attributes Attributes: @@ -41,49 +41,69 @@ class Mapping(ClassSerializer) : start_g ; 
Define the type of start_g value. end_g ; Define the type of end_g value. mutationType ; Define the type of mutation type - ''' - class types : - startmain = Integer - startoffset = Integer - endmain = Integer - endoffset = Integer - start_g = Integer - end_g = Integer + """ + + class types() : + """ + Types are defined here for the soaplib module. + """ + + startmain = Integer + startoffset = Integer + endmain = Integer + endoffset = Integer + start_g = Integer + end_g = Integer mutationType = String #types -#Mapping -# Any comments on the following statement?? -Mapping.typecode = TC.Struct(Mapping, - [ TC.Integer('startmain'), - TC.Integer('startoffset'), - TC.Integer('endmain'), - TC.Integer('endoffset'), - TC.Integer('start_g'), - TC.Integer('end_g'), - TC.String('mutationType') ], - 'Mapping') + def __init__(self) : + """ + Types are defined here for the TC module. + """ + + self.typecode = TC.Struct(Mapping, [ + TC.Integer('startmain'), + TC.Integer('startoffset'), + TC.Integer('endmain'), + TC.Integer('endoffset'), + TC.Integer('start_g'), + TC.Integer('end_g'), + TC.String('mutationType') + ], 'Mapping') + #__init__ +#Mapping class Transcript(ClassSerializer) : - ''' + """ Extended ClassSerializer object with mixed types of attributes Attributes: trans_start ; Define the type of trans_start trans_stop ; Define the type of trans_stop CDS_stop ; Define the type of CDS_stop - ''' - class types : + """ + + class types() : + """ + """ + trans_start = Integer - trans_stop = Integer - CDS_stop = Integer + trans_stop = Integer + CDS_stop = Integer #types + + def __init__(self) : + """ + """ + + self.typecode = TC.Struct(Transcript, [ + TC.Integer('trans_start'), + TC.Integer('trans_stop'), + TC.Integer('CDS_stop') + ], 'Transcript') + #__init__ #Transcript -Transcript.typecode = TC.Struct(Transcript, - [ TC.Integer('trans_start'), - TC.Integer('trans_stop'), - TC.Integer('CDS_stop') ], - 'Transcript') def __sl2il(l) : """ @@ -132,7 +152,7 @@ def __process(LOVD_ver,
build, acc, var, Conf, O) : # Make a connection to the MySQL database with the username / db # information from the configuration file. #Database = Db.Db("local", O) # Open the database. - Database = Db.Db("local", build, Conf.Db) + Database = Db.Mapping(build, Conf.Db) # Get the rest of the input variables. @@ -199,7 +219,7 @@ def __process(LOVD_ver, build, acc, var, Conf, O) : #__process def conversionToCoding(offset, main, trans_start, trans_stop, CDS_stop) : - ''' + """ Converts c. (non-star) positions to c. numbered (star and +-) positions Arguments: @@ -216,7 +236,7 @@ def conversionToCoding(offset, main, trans_start, trans_stop, CDS_stop) : (intronic position, +- notation) cMain ; The main coordinate of a position in c. (star) notation. - ''' + """ cOffset = "" cMain = main if offset != "0" : diff --git a/src/Modules/Misc.py b/src/Modules/Misc.py index 98c08953..8f76dfc0 100644 --- a/src/Modules/Misc.py +++ b/src/Modules/Misc.py @@ -1,3 +1,8 @@ +#!/usr/bin/python + +""" +""" + import time class Misc() : diff --git a/src/Modules/Mutator.py b/src/Modules/Mutator.py index bdafa981..2893365c 100644 --- a/src/Modules/Mutator.py +++ b/src/Modules/Mutator.py @@ -1,6 +1,18 @@ #!/usr/bin/python -#from Output import Output +""" + Module for mutating a string. + + Mutations are described in the original coordinates. These coordinates are + transferred to the mutated coordinates with the aid of an internal shift + list, which keeps track of the sizes of changes. Using the original + coordinates greatly simplifies combined mutations in a variant. + + The original as well as the mutated string are stored. + + Public classes: + Mutator ; Mutate a string and register all shift points. +""" class Mutator() : """ @@ -50,25 +62,26 @@ class Mutator() : Initialise the class with the original string. Arguments: - orig ; The original string before mutation. + orig ; The original string before mutation. + config ; Configuration variables. + output ; The output object.
Private variables (altered): - __shift ; Initialised to the empty list. + __config ; Initialised with the configuration variables. + __output ; Initialised with the output object. + __shift ; Initialised to the empty list. Public variables (altered): orig ; Initialised to the parameter orig. mutated ; Initialised to the parameter orig. """ - #Output.__init__(self, __file__) self.__config = config self.__output = output - self.__shift = [] + self.orig = orig self.mutated = orig - - #self.output.createOutputNode("visualisation", 1) # Info message. #__init__ def __sortins(self, tuple) : @@ -122,6 +135,11 @@ class Mutator() : pos2 ; The second interbase position of the deletion. ins ; The insertion. + Private variables: + __config ; The variables maxvissize, flanksize and flankclipsize + are used in the visualisation. + __output ; Visualisation information is added. + Public variables (altered): mutated ; This string will reflect the result of the given delins. @@ -135,24 +153,21 @@ class Mutator() : odel = self.orig[pos1:pos2] if len(odel) > self.__config.maxvissize : odel = "%s [%ibp] %s" % (odel[:self.__config.flankclipsize], - len(odel) - self.__config.flankclipsize * 2, - odel[-self.__config.flankclipsize:]) + len(odel) - self.__config.flankclipsize * 2, + odel[-self.__config.flankclipsize:]) bp1 = self.shiftpos(pos1) bp2 = self.shiftpos(pos2) lmflank = self.mutated[max(bp1 - self.__config.flanksize, 0):bp1] rmflank = self.mutated[bp2:bp2 + self.__config.flanksize] - #print insvis = ins if len(ins) > self.__config.maxvissize : insvis = "%s [%ibp] %s" % (ins[:self.__config.flankclipsize], - len(ins) - self.__config.flankclipsize * 2, - ins[-self.__config.flankclipsize:]) + len(ins) - self.__config.flankclipsize * 2, + ins[-self.__config.flankclipsize:]) fill = abs(len(odel) - len(insvis)) if len(odel) > len(ins) : - #print "%s %s %s" % (loflank, odel, roflank) - #print "%s %s%s %s" % (lmflank, insvis, '-' * fill, rmflank) self.__output.addOutput("visualisation", 
"%s %s %s" % (loflank, odel, roflank)) self.__output.addOutput("visualisation", @@ -160,8 +175,6 @@ class Mutator() : rmflank)) #if else : - #print "%s %s%s %s" % (loflank, odel, '-' * fill, roflank) - #print "%s %s %s" % (lmflank, insvis, rmflank) self.__output.addOutput("visualisation", "%s %s%s %s" % (loflank, odel, '-' * fill, roflank)) @@ -175,19 +188,6 @@ class Mutator() : self.mutated = self.mutated[:self.shiftpos(pos1)] + ins + \ self.mutated[self.shiftpos(pos2):] self.__sortins([pos1 + 1, len(ins) + pos1 - pos2]) - - """ - from Bio import pairwise2 - po1 = max(pos1 - 25, 0) # Bug fix for mutations at the - pm1 = max(self.shiftpos(pos1) - 25, 0) # start of a sequence. - - alignments = pairwise2.align.globalms(self.orig[po1:pos2 + 25], - self.mutated[pm1:self.shiftpos(pos2) + 25], - 1, -1, -2, -1) - print - print alignments[0][0] - print alignments[0][1] - """ #__mutate def shiftpos(self, position) : @@ -248,13 +248,16 @@ class Mutator() : Arguments: pos1 ; The first nucleotide of the range to be deleted. pos2 ; The last nucleotide of the range to be deleted. + + Private variables: + __output ; Visualisation information is added. """ if pos1 == pos2 : self.__output.addOutput("visualisation", "deletion of %i" % pos1) else : self.__output.addOutput("visualisation", "deletion of %i to %i" % ( - pos1, pos2)) + pos1, pos2)) self.__mutate(pos1 - 1, pos2, '') #delM @@ -266,10 +269,13 @@ class Mutator() : pos ; The interbase position where the insertion should take place. ins ; The insertion, a string. + + Private variables: + __output ; Visualisation information is added. """ - self.__output.addOutput("visualisation", "insertion between %i and %i" % ( - pos, pos + 1)) + self.__output.addOutput("visualisation", + "insertion between %i and %i" % (pos, pos + 1)) self.__mutate(pos, pos, ins) #insM @@ -294,6 +300,9 @@ class Mutator() : Arguments: pos ; The position where the substitution should take place. nuc ; The new nucleotide. 
+ + Private variables: + __output ; Visualisation information is added. """ self.__output.addOutput("visualisation", "substitution at %i" % pos) @@ -340,84 +349,3 @@ class Mutator() : if __name__ == "__main__" : pass #if -""" -import sys - -def ladder() : - length = 79 - - for i in range(length) : - sys.stdout.write(str((i + 1) / 10)) - sys.stdout.write("\n") - for i in range(length) : - sys.stdout.write(str((i + 1) % 10)) - sys.stdout.write("\n") -#ladder - -M = Mutator("AAAGCCACCAGTTTCTTCCATGTGTTTTCACTCGCTTCGAAAAATTTAGGTAGGCTCTAGATATC") - -M.invM(44, 50) -print "Inv 44 50" -ladder() -print M.orig -print M.mutated - -M.delinsM(34, 38, "TTTAAAATTTTAA") -print "Delins 34 38 TTTAAAATTTTAA" -ladder() -print M.orig -print M.mutated - -M.invM(24, 30) -print "Inv 24 30" -ladder() -print M.orig -print M.mutated - -M.delM(10, 10) -print "Del 10" -ladder() -print M.orig -print M.mutated - -M.subM(5, 'T') -print "Sub 5 T" -ladder() -print M.orig -print M.mutated - -M.insM(7, 'G') -print "Ins 7_8 G" -ladder() -print M.orig -print M.mutated -print M._Mutator__shift - -M.delM(4, 8) -print "Del 4 8" -ladder() -print M.orig -print M.mutated -M.insM(4, "TTTA") -print "Ins 4 TTTA" -ladder() -print M.orig -print M.mutated -M.delM(4, 8) -print "Del 4 8" -ladder() -print M.orig -print M.mutated -M.delinsM(4, 8, "TTTAAAATTTTAA") -print "Delins 4 8 TTTAAAATTTTAA" -ladder() -print M.orig -print M.mutated -M.invM(24, 30,) -print "Inv 24 30" -ladder() -print M.orig -print M.mutated - -print M.newSplice([1, 10, 20, 30, 40]) -""" diff --git a/src/Modules/Output.py b/src/Modules/Output.py index 84cbe63f..44cddd08 100644 --- a/src/Modules/Output.py +++ b/src/Modules/Output.py @@ -1,23 +1,60 @@ #!/usr/bin/python -#from Config import Config -from time import strftime +""" + Module for storing output and messages. + Output is stored as a named list that can be expanded. + Messages can be retrieved at a later time to provide flexibility. 
Message + levels are defined to increase or decrease the amount of logging and output. + The position of the log file, as well as the levels are defined in the + configuration file. + + Message levels: + -1 : Log ; Specifically log a message. + 0 : Debug ; Debug information. + 1 : Info ; Info. + 2 : Warning ; Regular warnings. + 3 : Error ; Serious errors that can be compensated for. + 4 : Fatal ; Errors that are not recoverable. + 5 : Off ; Can be used as a log/output level to turn off output. + + Public classes: + Message ; Container class for message variables. + Output ; Output interface for errors, warnings and logging. +""" + +import time # strftime() -class Node() : - """ +class Message() : """ + Container class for message variables. - def __init__(self, level) : - self.message = [] - self.level = level - #__init__ -#Node + Special methods: + __init__(origin, level, code, description) ; Make a message object. -class Message() : - """ + Public variables: + origin ; Name of the module creating this object. + level ; Importance of the message. + code ; The error code of the message. + description ; A description of the message. """ def __init__(self, origin, level, code, description) : + """ + Make a new message object. + + Arguments: + origin ; Name of the module creating this object. + level ; Importance of the message. + code ; The error code of the message. + description ; A description of the message. + + Public variables (altered): + origin ; Name of the module creating this object. + level ; Importance of the message. + code ; The error code of the message. + description ; A description of the message. + """ + self.origin = origin self.level = level self.code = code @@ -25,39 +62,41 @@ class Message() : #__init__ #Message -#class Empty() : -# def __len__(self) : -# return 0 - class Output() : """ Provide an output interface for errors, warnings and logging purposes. Private variables: + __config ; Configuration variables.
+ __outputdata ; The output dictionary. + __messages ; The messages list. __instance ; The name of the module that made this object. __loghandle ; The handle of the log file. - __datestring ; Format of the prefix for log messages. __errors ; The number of errors that have been processed. __warnings ; The number of warnings that have been processed. Special methods: - __init__(config, instance) ; Initialise the class with variables + __init__(instance, config) ; Initialise the class with variables from the config file and the calling module. - __del__() ; Close the logfile. + __del__() ; Close the logfile and clean up. Private methods: __niceName(filename) ; Strip the path and the extention from a filename. + __levelToName(level) ; Convert a log level to a readable string. Public methods: - ErrorMsg(filename, message) ; Print an error message to standard - output and log it. - WarningMsg(filename, message) ; Print an error message to standard - output. - LogMsg(filename, message) ; Log a message. - Summary() ; Print a summary of the number of - errors and warnings. + addMessage(filename, ; Add a message to the message list. + level, + code, + description) + getMessages() ; Print all messages that exceed the + configured output level. + addOutput(name, data) ; Add output to the output dictionary. + getOutput(name) ; Retrieve data from the output dictionary. + Summary() ; Print a summary of the number of errors + and warnings. """ def __init__(self, instance, config) : @@ -66,26 +105,21 @@ class Output() : config file and the calling module. Arguments: - config ; The configuration object. instance ; The filename of the module that created this object. - - Public variables(altered): - outputdata ; The output list. + config ; The configuration object. Private variables (altered): + __config ; Configuration variables. + __outputdata ; The output dictionary. + __messages ; The messages list. __instance ; Initialised with the name of the module that created this object. 
__loghandle ; Initialised as the handle of the log file defined in the configuration file. - __datestring ; Format of the prefix for log messages. __errors ; Initialised to 0. __warnings ; Initialised to 0. - - Inherited variables from Config: - log ; Location of the log file. """ - #Config.__init__(self) self.__config = config self.__outputData = {} self.__messages = [] @@ -93,26 +127,18 @@ class Output() : self.__loghandle = open(self.__config.log, "a") self.__errors = 0 self.__warnings = 0 - - - #self.createOutputNode("debug", 0) - #self.createOutputNode("info", 1) - #self.createOutputNode("warnings", 2) - #self.createOutputNode("errors", 3) - #self.createOutputNode("fatalerrors", 4) - #self.createOutputNode("log", 5) #__init__ def __del__(self) : """ - Clean up the output list and close the log file. + Clean up the output dictionary, the messages list and close the log + file. - Public variables(altered): - outputdata ; The output list. - Private variables(altered): - __loghandle ; The handle of the log file defined in the - configuration file. + __loghandle ; The handle of the log file defined in the + configuration file. + __outputdata ; The output dictionary. + __messages ; The messages list. """ self.__loghandle.close() @@ -138,6 +164,13 @@ class Output() : def __levelToName(self, level) : """ + Convert a log level to a readable string. + + Arguments: + level ; A log level (an integer between -1 and 5). + + Returns: + string ; A readable description of the log level. 
""" if level == 0 : @@ -153,63 +186,79 @@ class Output() : return "" #__levelToName - #def addToOutputNode(self, filename, name, code, message) : - # """ - # """ - - # niceName = self.__niceName(filename) - - # self.__outputData[name].message.append(Message(niceName, code, message)) - - # level = self.__outputData[name].level - # if level >= self.__config.loglevel : - # prefix = "" - # if level == 2 : - # prefix = "Warning: " - # if level == 3 : - # prefix = "Error: " - # if level == 4 : - # prefix = "Fatal: " - # self.__loghandle.write(strftime(self.__config.datestring + ' ') + \ - # "%s (%s) %s: %s%s\n" % (self.__instance, - # niceName, code, prefix, message)) - # self.__loghandle.flush() - # #if - ##addToOutputNode - def addMessage(self, filename, level, code, description) : """ + Add a message to the message list. + If the level exceeds the configured loglevel or if the level is -1, + then the message is also logged. + If the severity equals 2, then the number of warnings is inreased, + if it exceeds 2, then the number of errors is increased. + + Arguments: + filename ; Name of the calling module. + level ; Severity of the message. + code ; Error code of the message. + description ; Description of the message. + + Private variables: + __messages ; The messages list. + __instance ; Module that created the Output object. + __config ; The variables loglevel and datestring are used. + __loghandle ; Handle to the log file. + + Private variables (altered): + __warnings ; Increased by one if the severity equals 2. + __errors ; Increased by one if the severity exceeds 2. """ niceName = self.__niceName(filename) + # Append a new message object to the messages list. self.__messages.append(Message(niceName, level, code, description)) if level == 2 : self.__warnings += 1 if level > 2 : self.__errors += 1 + + # Log the message if the message is important enough, or if it is only + # meant to be logged (level -1). 
if level > self.__config.loglevel or level == -1 : - self.__loghandle.write(strftime(self.__config.datestring + ' ') + \ - "%s (%s) %s: %s%s\n" % (self.__instance, - niceName, code, self.__levelToName(level), - description)) + self.__loghandle.write(time.strftime( + self.__config.datestring + ' ') + "%s (%s) %s: %s%s\n" % ( + self.__instance, niceName, code, self.__levelToName(level), + description)) self.__loghandle.flush() #if #addMessage def getMessages(self) : """ + Print all messages that exceed the configured output level. + + Private variables: + __messages ; The messages list. + __config ; The variable outputlevel is used. """ for i in self.__messages : if i.level > self.__config.outputlevel : - print "%s (%s): %s" % (self.__levelToName(i.level), i.origin, - i.description) + print "%s(%s): %s" % (self.__levelToName(i.level), i.origin, + i.description) #getMessages def addOutput(self, name, data) : """ + If the output dictionary already has a node with the specified + name, the list that this name points to is expanded with the data. + Otherwise create a node and assign a list containing the data. + + Arguments: + name ; Name of a node in the output dictionary. + data ; The data to be stored at this node. + + Private variables: + __outputData ; The output dictionary. """ if self.__outputData.has_key(name) : @@ -220,6 +269,13 @@ class Output() : def getOutput(self, name) : """ + Return a list of data from the output dictionary. + + Arguments: + name ; Name of a node in the output dictionary. + + Private variables: + __outputData ; The output dictionary. 
""" if self.__outputData.has_key(name) : @@ -227,94 +283,6 @@ class Output() : return None #getOutput - - #def createOutputNode(self, name, level) : - # """ - # """ - - # self.__outputData[name] = Node(level) - ##createOutputNode - - #def getData(self, name) : - # """ - # """ - - # if self.__outputData.has_key(name) and \ - # self.__outputData[name].level >= self.__config.outputlevel : - # return self.__outputData[name].message - # return [] - ##getdata - - #''' - #def getMsg(self, name) : - # """ - # """ - - # if self.__outputData[name].serverity >= self.__config.outputlevel : - # return - ##getMsg - - #def ErrorMsg(self, filename, message) : - # """ - # Print an error message to standard output and log it. - - # Arguments: - # filename ; The file where the error originated. - # message ; The error message. - - # Private variables (altered): - # __errors ; Increased by one. - # """ - - # print "Error (%s): %s" % (self.__niceName(filename), message) - # self.LogMsg(filename, "Error: " + message) - # #self.addData(filename, "test", "error", "5", message) - # self.__errors += 1 - ##ErrorMsg - - #def WarningMsg(self, filename, message) : - # """ - # Print an error message to standard output. - - # Arguments: - # filename ; The file where the warning originated. - # message ; The warning message. - - # Private variables (altered): - # __warnings ; Increased by one. - # """ - - # print "Warning (%s): %s" % (self.__niceName(filename), message) - # #self.addData(filename, "test", "warning", "5", message) - # self.__warnings += 1 - ##WarningMsg - - #def LogMsg(self, filename, message) : - # """ - # Log a message to the log file defined in the configuration file. - - # Arguments: - # filename ; The file where the logging request originated. - # message ; The message to be logged. - - # Private variables: - # __loghandle ; The handle of the log file defined in the - # configuration file. - # __instance ; The name of the module that created this output - # object. 
- # Inherited variables from Config: - # datestring ; Format of the prefix for log messages. - # """ - - - # self.__loghandle.write(strftime(self.__config.datestring + ' ') + \ - # "%s (%s): %s\n" % (self.__instance, self.__niceName(filename), - # message)) - # self.__loghandle.flush() - ##LogMsg - #''' - def Summary(self) : """ Print a summary of the number of errors and warnings. @@ -322,6 +290,12 @@ class Output() : Private variables: __errors ; The number of errors. __warnings ; The number of warnings. + + Returns: + triple: + integer ; Number of errors. + integer ; Number of warnings. + string ; Summary. """ e_s = 's' @@ -331,8 +305,8 @@ class Output() : if self.__warnings == 1 : w_s = '' - print "%i Error%s, %i Warning%s." % (self.__errors, e_s, - self.__warnings, w_s) + return self.__errors, self.__warnings, "%i Error%s, %i Warning%s." % ( + self.__errors, e_s, self.__warnings, w_s) #Summary #Output @@ -340,12 +314,5 @@ class Output() : # Unit test. # if __name__ == "__main__" : - import Config - - C = Config.Config() - - O = Output(__file__, C.Output) - - O.WarningMsg(__file__, "Ja, er ging wat mis.") - del O + pass #if diff --git a/src/Modules/Parser.py b/src/Modules/Parser.py index 09726036..b3e18dea 100644 --- a/src/Modules/Parser.py +++ b/src/Modules/Parser.py @@ -1,6 +1,16 @@ #!/usr/bin/python -#from Output import Output +""" + Module for parsing a variant described using the HGVS nomenclature. + + A context-free parser is defined here, the nomenclature rules are specified + in BNF, which is used (with some minor modifications) as source of this + module. + + Public classes: + Nomenclatureparser ; Parse an input string. +""" + from pyparsing import * class Nomenclatureparser() : @@ -285,10 +295,7 @@ class Nomenclatureparser() : __output ; Set to the output object. """ - #self.__output = output - #Output.__init__(self, __file__) - self.output = output - + self.__output = output ParserElement.enablePackrat() # Speed up parsing considerably.
#__init__ @@ -314,19 +321,14 @@ class Nomenclatureparser() : try : return self.Var.parseString(variant, parseAll = True) except ParseException, err : - self.output.addMessage(__file__, 4, "EPARSE", str(err)) + self.__output.addMessage(__file__, 4, "EPARSE", str(err)) - # Print the input. - #print variant - #self.output.createOutputNode("parseError", 4) # Fatal error. - #self.output.addOutput("nomenclatureparser", variant) - self.output.addMessage(__file__, 4, "EPARSE", variant) + # Log the input. + self.__output.addMessage(__file__, 4, "EPARSE", variant) - # And print the position where the parsing error occurred. + # And log the position where the parsing error occurred. pos = int(str(err).split(':')[-1][:-1]) - 1 - #print pos * ' ' + '^' - #self.output.addOutput("nomenclatureparser", pos * ' ' + '^') - self.output.addMessage(__file__, 4, "EPARSE", pos * ' ' + '^') + self.__output.addMessage(__file__, 4, "EPARSE", pos * ' ' + '^') return None #except diff --git a/src/Modules/Retriever.py b/src/Modules/Retriever.py index 3b1c2e6d..08b2ca75 100644 --- a/src/Modules/Retriever.py +++ b/src/Modules/Retriever.py @@ -1,5 +1,16 @@ #!/usr/bin/python +""" + Module for retrieving files from either the cache or the NCBI. + + A hash of every retrieved file is stored in the internal database. If a + requested file is not found, but its hash is, we use additional information + to re-download the file. + + Public classes: + Retriever ; Retrieve a record from either the cache or the NCBI. +""" + import os # path.isfile(), link() path.isdir(), path.mkdir(), # walk(), path.getsize(), path.join(), stat(), remove() import bz2 # BZ2Compressor(), BZ2File() @@ -9,10 +20,7 @@ import StringIO # StringIO() from Bio import SeqIO # read() from Bio import Entrez # efetch(), read(), esearch(), esummary() -import Misc - -#from Output import Output -#from Db import Db +from Modules import Misc class Retriever() : """ @@ -24,8 +32,10 @@ class Retriever() : cachesize ; Maximum size of the cache. 
Special methods: - __init__(config) ; Use variables from the configuration file to - initialise the class private variables. + __init__(config, ; Use variables from the configuration file to + output, initialise the class private variables. + database) + Private methods: __foldersize(folder) ; Return the size of a folder. @@ -68,8 +78,6 @@ class Retriever() : cache ; The directory where the records are stored. """ - #Db.__init__(self, "local", "mutalyzer") - self.__config = config self.__output = output self.__database = database @@ -91,8 +99,8 @@ class Retriever() : folder_size = 0 for (path, dirs, files) in os.walk(folder) : - for file in files : - folder_size += os.path.getsize(os.path.join(path, file)) + for fileName in files : + folder_size += os.path.getsize(os.path.join(path, fileName)) return folder_size #__foldersize @@ -259,13 +267,14 @@ class Retriever() : name, GI = self.__write(raw_data, name, 1) if name : # Processing went okay. currentmd5sum = self.__database.getHash(name) - md5sum = self.__calcHash(raw_data) - if md5sum != currentmd5sum : - self.__output.addMessage(__file__, -1, "WHASH", - "Warning: Hash of %s changed from %s to %s." % ( - name, currentmd5sum, md5sum)) - self.__database.updateHash(name, md5sum) - #if + if currentmd5sum : + md5sum = self.__calcHash(raw_data) + if md5sum != currentmd5sum : + self.__output.addMessage(__file__, -1, "WHASH", + "Warning: Hash of %s changed from %s to %s." % ( + name, currentmd5sum, md5sum)) + self.__database.updateHash(name, md5sum) + #if else : self.__database.insertGB(name, GI, self.__calcHash(raw_data), None, 0, 0, 0, None) @@ -353,6 +362,9 @@ class Retriever() : organism ; The organism in which we search. upstream ; Number of upstream nucleotides for the slice. downstream ; Number of downstream nucleotides for the slice. + + Returns: + """ # Search the NCBI for a specific gene in an organism. 
@@ -361,41 +373,53 @@ class Retriever() : searchresult = Entrez.read(handle) handle.close() - # FIXME - if len(searchresult["IdList"]) > 1 : - print "Hmmmmm." - return None - #if + ChrAccVer = None # We did not find anything yet. + aliases = [] # A list of aliases in case we find them. + for i in searchresult["IdList"] : # Inspect all results. + handle = Entrez.esummary(db = "gene", id = i) + summary = Entrez.read(handle) + handle.close() + if summary[0]["NomenclatureSymbol"] == gene : # Found it. + ChrAccVer = summary[0]["GenomicInfo"][0]["ChrAccVer"] + ChrLoc = summary[0]["GenomicInfo"][0]["ChrLoc"] + ChrStart = summary[0]["GenomicInfo"][0]["ChrStart"] + ChrStop = summary[0]["GenomicInfo"][0]["ChrStop"] + break; + #if - # Get summary information for the first search hit. - handle = Entrez.esummary(db = "gene", id = searchresult["IdList"][0]) - summary = Entrez.read(handle) - handle.close() - if not len(summary[0]["GenomicInfo"]) : - print "No mapping information found." - #FIXME Output and stuff. - return + # Collect official symbols that has this gene as alias in case we + # can not find anything. + if gene in summary[0]["OtherAliases"] and \ + summary[0]["NomenclatureSymbol"] : + aliases.append(summary[0]["NomenclatureSymbol"]); + #for + + if not ChrAccVer : # We did not find any genes. + if aliases : + self.__output.addMessage(__file__, 4, "ENOGENE", + "Gene %s not found, found aliases: %s" % (gene, aliases)) + return None + #if + self.__output.addMessage(__file__, 4, "ENOGENE", + "Gene %s not found." % gene) + return None #if - ChrAccVer = summary[0]["GenomicInfo"][0]["ChrAccVer"] # Extract the mapping - ChrLoc = summary[0]["GenomicInfo"][0]["ChrLoc"] # information. - ChrStart = summary[0]["GenomicInfo"][0]["ChrStart"] - ChrStop = summary[0]["GenomicInfo"][0]["ChrStop"] - + # Figure out the orientation of the gene. orientation = "1" if ChrStart > ChrStop : # Swap start and stop. 
orientation = "2" temp = ChrStart ChrStart = ChrStop - downstream # Also take care of the flanking - ChrStop = temp + upstream # sequences. + ChrStop = temp + upstream + 1 # sequences. #if else : - ChrStart -= upstream - ChrStop += downstream + ChrStart -= upstream - 1 + ChrStop += downstream + 2 #else # And retrieve the slice. - self.retrieveslice(ChrAccVer, ChrStart, ChrStop, orientation) + return self.retrieveslice(ChrAccVer, ChrStart, ChrStop, orientation) #retrievegene def downloadrecord(self, url) : @@ -544,120 +568,11 @@ class Retriever() : return record #loadrecord - - #''' - #def loadrecord_old(self, identifier) : - # """ - # Return a record from either the cache or the NCBI. - # If a file is retrieved from the NCBI, a hard link is made to its - # alternative name (GI when an accession number is given and vice - # versa). If no version is given, it will be retrieved and the - # record will be renamed. - # After downloading a file, the cache is checked for overflows by - # calling the __cleancache() function. - # The files are stored in compressed format in the cache. - - # Variables: - # identifier ; Either an accession number or a GI number. - - # Inherited variables from Config: - # cache ; The directory where the record is stored. - # email ; The email address which we give to the NCBI. - # output ; The output object. - - # Returns: - # SeqRecord ; The record that was requested. - # """ - - # # If a GI is given, remove the "GI" or "GI:" part. - # if (identifier[:2] == "GI") : - # if (identifier[2] == ':') : - # name = identifier[3:] - # else : - # name = identifier[2:] - # #if - # else : - # name = identifier - # - # # Make a filename based upon the identifier. - # filename = self.__nametofile(name) - # - # # If the filename is not present, retrieve it from the NCBI. - # if not os.path.isfile(filename) : - # Loc = self.__database.getLoc(name) # Look in the UD database - # if not Loc : # Never seen this name before. 
- # net_handle = \ - # Entrez.efetch(db = "nucleotide", id = name, rettype = "gb") - # raw_data = net_handle.read() - # net_handle.close() - # # Check if the record is empty or not. - # if raw_data != "\n" : - # self.__write(raw_data, filename, 1) - # else : - # self.__output.addMessage(__file__, 4, "ERETR", - # "Could not retrieve %s." % name) - # return None - # #if - # else : # We know the name. - # self.retrieveslice(*Loc) - # #if - # - # # Now we have the file, so we can parse it. - # file_handle = bz2.BZ2File(filename, "r") - # try : - # record = SeqIO.read(file_handle, "genbank") - # except ValueError : - # self.__output.addMessage(__file__, 4, "ERECPARSE", - # "Could not parse %s, purging." % filename) - # os.remove(filename) - # file_handle.close() - # return None - # #except - - # file_handle.close() - - # if name[:3] == "UD_" : # No renaming is needed. - # return record - - # # If a GI is supplied, find out the accession number (plus version) - # # and vice versa. - # if name != record.annotations["gi"] : - # altfilename = self.__nametofile(record.annotations["gi"]) - # altfilename2 = self.__nametofile(record.id) - # #if - # else : - # altfilename = self.__nametofile(record.id) - # - # # If the alternative filename is not present yet, make a hard link. - # if not os.path.isfile(altfilename) : - # os.link(filename, altfilename) - - # # If the other alternative filename is not present (no version was - # # given), rename the file. If it already exists, remove the file. - # if filename != altfilename2 : - # if not os.path.isfile(altfilename2) : - # os.rename(filename, altfilename2) - # else : - # os.remove(filename) - - # self.__output.addMessage(__file__, 2, "WNOVRE", - # "No version number is given, using %s. Please use version numbers to reduce " \ - # "downloading overhead." % record.id) - # #if - - # return record - ##loadrecord - #''' #Retriever # # Unit test. 
# if __name__ == "__main__" : - # Get the location of the cache, the cachesize and the email address from - # the config file. - R = Retriever() - - R.loadrecord("AB026906.1") # Retrieve a GenBank record. - del R + pass #if diff --git a/src/Modules/Scheduler.py b/src/Modules/Scheduler.py index 5899ada1..d83d9090 100644 --- a/src/Modules/Scheduler.py +++ b/src/Modules/Scheduler.py @@ -1,27 +1,79 @@ #!/usr/bin/python +""" + Public classes: + Scheduler ; +""" + import time -import Config -import Db import subprocess import psutil import os +import smtplib +from email.mime.text import MIMEText + +from Modules import Config +from Modules import Output +from Modules import Db + +import Mutalyzer class Scheduler() : """ + Special methods: + __init__(config, database) ; + + Public methods: + isDaemonRunning() ; + process() ; + addJob(outputFilter, eMail, queue, fromHost) ; + """ def __init__(self, config, database) : """ + Arguments: + config ; + database ; """ self.__config = config self.__database = database #__init__ + def __sendMail(self, mailTo, url) : + """ + Send an e-mail containing an url to a batch job submitter. + + Arguments: + mailTo ; The batch job submitter. + url ; The url containing the results. + + Private variables: + __config ; The variables mailMessage, mailSubject and mailFrom + are used. + """ + + handle = open(self.__config.mailMessage) + message = MIMEText(handle.read() % url) + handle.close() + + message["Subject"] = self.__config.mailSubject + message["From"] = self.__config.mailFrom + message["To"] = mailTo + + smtpInstance = smtplib.SMTP() + smtpInstance.connect() + smtpInstance.sendmail(self.__config.mailFrom, mailTo, + message.as_string()) + smtpInstance.quit() + #__sendMail + def isDaemonRunning(self) : """ + Returns: + True if an other scheduler is already running, False otherwise. 
""" myPid = os.getpid() @@ -42,23 +94,40 @@ class Scheduler() : for i in jobList : results = self.__database.getFromQueue(i) if results : - print i, results + if results[1] : + cmd = "%s(%s):%s" % results + else : + cmd = "%s:%s" % (results[0], results[2]) + C = Config.Config() + O = Output.Output(__file__, C.Output) + Mutalyzer.process(cmd, C, O) + handle = open("%s/Results_%s.txt" % ( + self.__config.resultsDir, i), "a") + handle.write(str(O.getOutput("variantdescription"))) + handle.close() + del O, C + #if else : - eMail = self.__database.removeJob(i) - print "Job %s finished, email %s file %s" % (i, eMail, i) - time.sleep(1) + eMail, stuff, fromHost = self.__database.removeJob(i) + #print "Job %s finished, email %s file %s" % (i, eMail, i) + self.__sendMail(eMail, "%sResults_%s.txt" % (fromHost, i)) + #else #for jobList = self.__database.getJobs() #while #process - def addJob(self, outputFilter, eMail, queue) : + def addJob(self, outputFilter, eMail, queue, fromHost) : """ + Arguments: + outputFilter ; + eMail ; + queue ; """ print "called addjob" jobList = self.__database.getJobs() - jobID = self.__database.addJob(outputFilter, eMail) + jobID = self.__database.addJob(outputFilter, eMail, fromHost) for i in queue : self.__database.addToQueue(jobID, *i) subprocess.Popen([self.__config.processName, "src/BatchChecker.py"], diff --git a/src/Modules/Web.py b/src/Modules/Web.py index 99c398f2..95bb681d 100644 --- a/src/Modules/Web.py +++ b/src/Modules/Web.py @@ -1,6 +1,14 @@ #!/usr/bin/python +""" + Module that provides general functions used by the web interfaces. + + Public classes: + Web ; General functions used by the web interfaces. 
+""" + import sys # sys.stdout +import re # match from cStringIO import StringIO # StringIO() getvalue() class Web() : @@ -153,4 +161,14 @@ class Web() : return s #read + + def isEMail(self, eMail) : + """ + """ + + if re.match("^[a-zA-Z0-9._%-]+@[a-zA-Z0-9._%-]+.[a-zA-Z]{2,6}$", + eMail) : + return True + return False + #isEmail #Web diff --git a/src/Modules/__init__.py b/src/Modules/__init__.py index e69de29b..10629c42 100644 --- a/src/Modules/__init__.py +++ b/src/Modules/__init__.py @@ -0,0 +1,14 @@ +""" + Public modules: + Config ; + Crossmap ; + Db ; + GenRecord ; + Misc ; + Mutator ; + Output ; + Parser ; + Retriever ; + Scheduler ; + Web ; +""" diff --git a/src/Mutalyzer.py b/src/Mutalyzer.py index 15c6693c..b46379a6 100644 --- a/src/Mutalyzer.py +++ b/src/Mutalyzer.py @@ -1,17 +1,28 @@ #!/usr/bin/python +""" + The nomenclature checker. +""" + +import sys +import math +import types +import Bio + +import Bio.Seq +from Bio.Seq import Seq +from Bio.Alphabet import IUPAC +from Bio.SeqUtils import seq3 + from Modules import Retriever from Modules import GenRecord from Modules import Crossmap from Modules import Parser from Modules import Db - -import types +from Modules import Mutator from Modules import Output from Modules import Config -import Bio.Seq - class newMut() : def __init__(self) : self.c = "" @@ -26,6 +37,9 @@ def __order(a, b) : #__order def __roll(string, start, stop, orientation) : + """ + """ + pattern = string[start:stop] if orientation == 1 : i = stop - 1 @@ -49,8 +63,34 @@ def __roll(string, start, stop, orientation) : #else #__roll +def roll2(ref, start, stop) : + """ + """ + + pattern = ref[start:stop] + patternLength = len(pattern) + + g_min = start - 1 + j = patternLength - 1 + while g_min > -1 and ref[g_min] == pattern[j % patternLength] : + j -= 1 + g_min -= 1 + #while + g_min += 1 + + g_max = stop + j = 0 + while g_max < len(ref) and ref[g_max] == pattern[j % patternLength] : + j += 1 + g_max += 1 + #while + + return g_min, g_max - 
patternLength +#roll2 + def __palinsnoop(string) : - import math + """ + """ revcomp = Bio.Seq.reverse_complement(string) @@ -61,7 +101,8 @@ def __palinsnoop(string) : #__palinsnoop def __bprint(s) : - import math + """ + """ if not s : return @@ -82,6 +123,9 @@ def __bprint(s) : #__bprint def __PtLoc2main(Loc) : + """ + """ + main = int(Loc.Main) if Loc.MainSgn == '-' : main = -main @@ -90,6 +134,9 @@ def __PtLoc2main(Loc) : #__PtLoc2main def __PtLoc2offset(Loc) : + """ + """ + if Loc.Offset : offset = int(Loc.Offset) if Loc.OffSgn == '-' : @@ -101,22 +148,6 @@ def __PtLoc2offset(Loc) : return offset #__PtLoc2offset -""" -def IsInt(string) : - try : - num = int(string) - return 1 - #try - except ValueError : - return 0 -#IsInt -""" - -""" -def printp(string, depth) : - print (depth * " ") + str(string) -""" - def __splice(string, splice_sites) : """ Construct the transcript or the coding sequence from a record and @@ -140,6 +171,9 @@ def __splice(string, splice_sites) : #__splice def __nsplice(string, splice_sites, CDS, orientation) : + """ + """ + transcript = "" if orientation == 1 : @@ -148,94 +182,45 @@ def __nsplice(string, splice_sites, CDS, orientation) : transcript += string[CDS[0] - 1:splice_sites[i + 1]] else : if splice_sites[i] > CDS[0] : - transcript += string[splice_sites[i] - 1:splice_sites[i + 1]] + transcript += \ + string[splice_sites[i] - 1:splice_sites[i + 1]] + #for + #if else : for i in range(0, len(splice_sites), 2) : if CDS[1] >= splice_sites[i] and CDS[1] <= splice_sites[i + 1] : transcript += string[splice_sites[i] - 1: CDS[1]] else : if splice_sites[i] < CDS[1] : - transcript += string[splice_sites[i] - 1:splice_sites[i + 1]] - + transcript += \ + string[splice_sites[i] - 1:splice_sites[i + 1]] + #for + #else return transcript #__nsplice - -def __toProtDescr(orig, trans) : - from Bio.SeqUtils import seq3 - - if str(trans) == str(orig) : - return "p.=" - - if len(trans) > len(orig) : - ext = abs(len(orig) - len(trans)) - return 
"p.*%i%sext*%i" % (len(orig) + 1, seq3(trans[len(orig)]), ext) - - if len(orig) > len(trans) : - return "p.%s%i*" % (seq3(orig[len(trans)]), len(trans) + 1) - - i = 0 - while i < len(orig) - 1 and orig[i] == trans[i] : - i += 1 - - return "p.%s%i%s" % (seq3(orig[i]), i + 1, seq3(trans[i])) -#__toProtDescr - -def __constructCDS(mRNA, CDSpos) : - #print mRNA - #print CDSpos - i = 1 - ret = [CDSpos[0]] - - while CDSpos[0] > mRNA[i] : - i += 2 - - j = i - while CDSpos[1] > mRNA[j] : - j += 2 - - ret.extend(mRNA[i:j]) - ret.append(CDSpos[1]) - - #print ret - return ret -#__constructCDS - -""" -def __isStringThere(ref, p1, p2, string) : - if ref[p1 - 1:p2] == string : - return True - return False -#__isStringThere - -def __checkStringLength(p1, p2, length) : - if p2 - p1 + 1 == int(length) : - return True - return False -#__checkStringLength -""" - def __checkOptArg(ref, p1, p2, arg, M, O) : + """ + """ + if arg : if arg.isdigit() : length = int(arg) interval = p2 - p1 + 1 if length != interval : - O.addMessage(__file__, 3, "EARGLEN", "The length (%i) differed from that of " \ - "the range (%i)." % (length, interval)) + O.addMessage(__file__, 3, "EARGLEN", + "The length (%i) differed from that of the range (%i)." % ( + length, interval)) return False #if #if else : - #revcomp = reverse_complement(string) - ref_slice = str(ref[p1 - 1:p2]) - #if M.orientation == -1 : - # ref_slice = Bio.Seq.reverse_complement(ref_slice) if ref_slice != str(arg) : # FIXME more informative. - O.addMessage(__file__, 3, "EREF", "%s not found at position c.%s (g.%i), " \ - "found %s instead." % (arg, M.g2c(p1), p1, ref_slice)) + O.addMessage(__file__, 3, "EREF", + "%s not found at position c.%s (g.%i), found %s instead." \ + % (arg, M.g2c(p1), p1, ref_slice)) return False #if #else @@ -284,97 +269,186 @@ def __lcs(str1, str2) : return __lcp(t1, t2) #__lcs +def findInFrameDescription(str1, str2) : + """ + """ + + # Nothing happened. 
+ if str1 == str2 : + return "p.(=)" + + lcp = __lcp(str1, str2) + lcs = __lcs(str1[lcp:], str2[lcp:]) + str1_end = len(str1) - lcs + str2_end = len(str2) - lcs + + # Insertion / Duplication. + if not str1_end - lcp : + inLen = str2_end - lcp + + if lcp - inLen >= 0 and str1[lcp - inLen:lcp] == str2[lcp:str2_end] : + if inLen == 1 : + return "p.(%s%idup)" % (seq3(str1[lcp - inLen]), + lcp - inLen + 1) + return "p.(%s%i_%s%idup)" % (seq3(str1[lcp - inLen]), + lcp - inLen + 1, + seq3(str1[lcp - 1], lcp)) + #if + return "p.(%s%i_%s%iins%s)" % (seq3(str1[lcp - 1]), lcp, + seq3(str1[lcp]), lcp + 1, + seq3(str2[lcp:str2_end])) + #if + + # Deletion. + if not str2_end - lcp : + if lcp + 1 == str1_end : + return "p.(%s%idel)" % (seq3(str1[lcp], lcp + 1)) + return "p.(%s%i_%s%idel)" % (seq3(str1[lcp - 1]), lcp + 1, + seq3(str1[str1_end - 1]), str1_end) + #if + + # Substitution. + if str1_end == str2_end and str1_end == lcp + 1 : + if len(str1) > len(str2) : + return "p.(*%i%sext*%i)" % (len(str1) + 1, seq3(str2[len(str1)]), + abs(len(str1) - len(str2))) + if len(str1) > len(str2) : + return "p.(%s%i*)" % (seq3(str1[len(str2)]), len(str2) + 1) + return "p.(%s%i%s)" % (seq3(str1[lcp]), lcp + 1, seq3(str2[lcp])) + #if + + # InDel. 
+ if lcp + 1 == str1_end : + return "p.(%s%idelins%s)" % (seq3(str1[lcp]), lcp + 1, + seq3(str2[lcp:str2_end])) + return "p.(%s%i_%s%idelins%s)" % (seq3(str1[lcp]), lcp + 1, + seq3(str1[str1_end - 1]), + str1_end, seq3(str2[lcp:str2_end])) +#findInFrameDescription + +def findFrameShift(str1, str2) : + """ + """ + + lcp = __lcp(str1, str2) + + return "p.(%s%i%sfs*%i)" % (seq3(str1[lcp]), lcp + 1, seq3(str2[lcp]), + len(str2) - lcp) +#findFrameShift + +def __toProtDescr(CDSStop, orig, trans) : + """ + """ + + if CDSStop % 3 : + return findFrameShift(str(orig), str(trans)) + return findInFrameDescription(str(orig), str(trans)) +#__toProtDescr + def __trim(string, lcp, lcs) : - if lcp and lcs : - return string[lcp:-lcs] - if lcp : - return string[lcp:] - if lcs : - return string[:-lcs] - return string + """ + """ + + return string[lcp:len(string) - lcs] #__trim def __rangeToC(M, g1, g2) : + """ + """ + if M.orientation == -1 : return M.g2c(g2), M.g2c(g1) return M.g2c(g1), M.g2c(g2) #__rangeToC def __maybeInvert(M, string) : + """ + """ + if M.orientation == -1 : return Bio.Seq.reverse_complement(string) return string #__maybeInvert -def __rv(MUU, record, GeneSymbol, RawVar, M, RefType, O, NM) : - #start_main = __PtLoc2main(RawVar.StartLoc.PtLoc) - start_main = M.main2int(RawVar.StartLoc.PtLoc.MainSgn + - RawVar.StartLoc.PtLoc.Main) - start_offset = __PtLoc2offset(RawVar.StartLoc.PtLoc) - end_main = start_main - end_offset = start_offset - if RawVar.EndLoc : - #end_main = __PtLoc2main(RawVar.EndLoc.PtLoc) - end_main = M.main2int(RawVar.EndLoc.PtLoc.MainSgn + - RawVar.EndLoc.PtLoc.Main) - end_offset = __PtLoc2offset(RawVar.EndLoc.PtLoc) - #if - - start_g = int(start_main) - end_g = int(end_main) - - Arg1 = RawVar.Arg1 - Arg2 = RawVar.Arg2 - if RefType in ['c', 'n'] : - start_g = M.x2g(start_main, start_offset) - end_g = M.x2g(end_main, end_offset) - if M.orientation == -1 : - Arg1 = Bio.Seq.reverse_complement(RawVar.Arg1) - Arg2 = Bio.Seq.reverse_complement(RawVar.Arg2) 
- #if - #if - - start_g, end_g = __order(start_g, end_g) - - start_c, end_c = __rangeToC(M, start_g, end_g) - if start_c.isdigit() and 1 <= int(start_c) <= 3 or \ - end_c.isdigit() and 1 <= int(end_c) <= 3 : - O.addMessage(__file__, 2, "WSTART", "Mutation in start codon.") - - # start_offset has to be calculated (not accepted from the parser) - start_t_m, start_t_o = M.g2x(start_g) - t_s, t_e, c_s = M.info() - if start_t_o == -1 or start_t_o == -2 : - if start_t_m == t_s : - O.addMessage(__file__, 2, "WTXSTART", "Mutation hits transcription start.") - else : - O.addMessage(__file__, 2, "WSPLDON", "Mutation hits a splice donor site.") - if start_t_o == 1 or start_t_o == 2 : - O.addMessage(__file__, 2, "WSPLACC", "Mutation hits a splice acceptor site.") - - #print str(record.seq[start_g - 20:start_g + 20]) - - if RawVar.MutationType in ["del", "dup", "subst", "delins"] : - __checkOptArg(record.seq, start_g, end_g, Arg1, M, O) +def __searchFrameShift(orig, mutated) : + pass +#__searchFrameShift + +def __rv(MUU, record, RawVar, GenRecordInstance, RefType, O, NM) : + """ + """ + + for i in GenRecordInstance.record.geneList : + for j in i.transcriptList : + M = j.CM + if j.CM : + start_main = j.CM.main2int(RawVar.StartLoc.PtLoc.MainSgn + + RawVar.StartLoc.PtLoc.Main) + start_offset = __PtLoc2offset(RawVar.StartLoc.PtLoc) + end_main = start_main + end_offset = start_offset + if RawVar.EndLoc : + end_main = j.CM.main2int(RawVar.EndLoc.PtLoc.MainSgn + + RawVar.EndLoc.PtLoc.Main) + end_offset = __PtLoc2offset(RawVar.EndLoc.PtLoc) + #if + + start_g = int(start_main) + end_g = int(end_main) + + Arg1 = RawVar.Arg1 + Arg2 = RawVar.Arg2 + if RefType in ['c', 'n'] : + start_g = j.CM.x2g(start_main, start_offset) + end_g = j.CM.x2g(end_main, end_offset) + if j.CM.orientation == -1 : + Arg1 = Bio.Seq.reverse_complement(RawVar.Arg1) + Arg2 = Bio.Seq.reverse_complement(RawVar.Arg2) + #if + #if + + start_g, end_g = __order(start_g, end_g) + + start_c, end_c = __rangeToC(M, start_g, 
end_g) + if start_c.isdigit() and 1 <= int(start_c) <= 3 or \ + end_c.isdigit() and 1 <= int(end_c) <= 3 : + O.addMessage(__file__, 2, "WSTART", + "Mutation in start codon.") + + # start_offset has to be calculated (not accepted from the + # parser) + start_t_m, start_t_o = j.CM.g2x(start_g) + t_s, t_e, c_s = j.CM.info() + if start_t_o == -1 or start_t_o == -2 : + if start_t_m == t_s : + O.addMessage(__file__, 2, "WTXSTART", + "Mutation hits transcription start.") + else : + O.addMessage(__file__, 2, "WSPLDON", + "Mutation hits a splice donor site.") + #if + if start_t_o == 1 or start_t_o == 2 : + O.addMessage(__file__, 2, "WSPLACC", + "Mutation hits a splice acceptor site.") + + if RawVar.MutationType in ["del", "dup", "subst", "delins"] : + __checkOptArg(record.seq, start_g, end_g, Arg1, M, O) + #if + #for + #for - global protDescr - protDescr = False # Substitution. if RawVar.MutationType == "subst" : if RawVar.Arg1 == RawVar.Arg2 : - O.addMessage(__file__, 3, "ENOCHANGE", "No mutation given (%c>%c) at position " \ - "c.%s (g.%i)." % (RawVar.Arg1, RawVar.Arg1, M.g2c(start_g), - start_g)) + O.addMessage(__file__, 3, "ENOVAR", + "No mutation given (%c>%c) at position c.%s (g.%i)." % ( + RawVar.Arg1, RawVar.Arg1, M.g2c(start_g), start_g)) MUU.subM(start_g, Arg2) - NM.c += str(M.g2c(start_g)) + \ - __maybeInvert(M, record.seq[start_g - 1]) + \ - '>' + __maybeInvert(M, Arg2) - NM.g += str(start_g) + \ - record.seq[start_g - 1] + \ - '>' + Arg2 - protDescr = True + GenRecordInstance.name(start_g, 0, "subst", record.seq[start_g - 1], + Arg2) + NM.g += str(start_g) + record.seq[start_g - 1] + '>' + Arg2 #if # Deletion / Duplication. 
@@ -383,10 +457,11 @@ def __rv(MUU, record, GeneSymbol, RawVar, M, RefType, O, NM) : M.orientation) if rollposstart != start_g : rollposend = rollposstart + (end_g - start_g) - O.addMessage(__file__, 2, "WROLL", "Sequence %s at position c.%s (g.%i) was " \ - "given, however, the HGVS notation prescribes that it should " \ - "be %s at position c.%s (g.%i)." % ( - str(record.seq[start_g - 1:end_g]), M.g2c(start_g), start_g, + O.addMessage(__file__, 2, "WROLL", + "Sequence %s at position c.%s (g.%i) was given, however, " \ + "the HGVS notation prescribes that it should be %s at " \ + "position c.%s (g.%i)." % (str(record.seq[start_g - 1:end_g]), + M.g2c(start_g), start_g, str(record.seq[rollposstart - 1:rollposend]), M.g2c(rollposstart), rollposstart)) start_g = rollposstart @@ -406,7 +481,7 @@ def __rv(MUU, record, GeneSymbol, RawVar, M, RefType, O, NM) : cpos = str(M.g2c(start_g)) gpos = str(start_g) #else - NM.c += cpos + RawVar.MutationType + GenRecordInstance.name(start_g, end_g, RawVar.MutationType, "", "") NM.g += gpos + RawVar.MutationType #if @@ -415,20 +490,20 @@ def __rv(MUU, record, GeneSymbol, RawVar, M, RefType, O, NM) : snoop = __palinsnoop(record.seq[start_g - 1:end_g]) if snoop : if snoop == -1 : - O.addMessage(__file__, 3, "ENOCHANGE", "Sequence %s at position c.%s (g.%i) " \ - "is a palindrome (its own reverse complement)." % ( + O.addMessage(__file__, 2, "WNOCHANGE", + "Sequence %s at position c.%s (g.%i) is a palindrome " \ + "(its own reverse complement)." % ( str(record.seq[start_g - 1:end_g]), M.g2c(start_g), start_g)) - # Do nothing... + return NM else : - O.addMessage(__file__, 2, "ENOTMINIMAL", "Sequence %s at position c.%s (g.%i) " \ - "is a partial palindrome (the first %i " \ - "nucleotide(s) are the reverse complement of " \ - "the last one(s)), the HGVS notation " \ - "prescribes that it should be %s at position " \ - "c.%s (g.%i)." 
% ( - str(record.seq[start_g - 1:end_g]), M.g2c(start_g), - start_g, snoop, + O.addMessage(__file__, 2, "WNOTMINIMAL", + "Sequence %s at position c.%s (g.%i) is a partial " \ + "palindrome (the first %i nucleotide(s) are the reverse " \ + "complement of the last one(s)), the HGVS notation " \ + "prescribes that it should be %s at position c.%s " \ + "(g.%i)." % (str(record.seq[start_g - 1:end_g]), + M.g2c(start_g), start_g, snoop, str(record.seq[start_g + snoop - 1: end_g - snoop]), M.g2c(start_g + snoop), start_g + snoop)) start_g += snoop @@ -437,21 +512,18 @@ def __rv(MUU, record, GeneSymbol, RawVar, M, RefType, O, NM) : MUU.invM(start_g, end_g) c1, c2 = __rangeToC(M, start_g, end_g) - NM.c += c1 + '_' + c2 + "inv" + #NM.c += c1 + '_' + c2 + "inv" + GenRecordInstance.name(start_g, end_g, "inv", "", "") NM.g += str(start_g) + '_' + str(end_g) + "inv" #if # Insertion. if RawVar.MutationType == "ins" : if start_g + 1 != end_g : - O.addMessage(__file__, 3, "EINSRANGE", "c.%s (g.%i) and c.%s (g.%i) are not " \ - "consecutive positions." % ( - M.g2c(start_g), start_g, M.g2c(end_g), end_g)) + O.addMessage(__file__, 3, "EINSRANGE", + "c.%s (g.%i) and c.%s (g.%i) are not consecutive positions." 
\ + % (M.g2c(start_g), start_g, M.g2c(end_g), end_g)) - #inserted = RawVar.Arg1 - #if M.orientation == -1 : - # inserted = Bio.Seq.reverse_complement(RawVar.Arg1) - MUU.insM(start_g, Arg1) way = M.orientation @@ -459,14 +531,9 @@ def __rv(MUU, record, GeneSymbol, RawVar, M, RefType, O, NM) : rs1 = MUU.shiftpos(start_g) re1 = MUU.shiftpos(start_g) + l - #rs1, re1 = __order(rs1, re1) - #print "+++", MUU.mutated[rs1:re1], rs1, re1 - rs2 = __roll(MUU.mutated, rs1, re1, way) - 1 - #shiftlen = ((rs2 - rs1 - l + 1) * way) shiftlen = (((rs2 - rs1) * way) - l + 1) * way - #print "+++", rs2, shiftlen c1 = rs2 c2 = rs2 + ((l - 1) * way) @@ -474,12 +541,12 @@ def __rv(MUU, record, GeneSymbol, RawVar, M, RefType, O, NM) : corr = 0 if rs1 != rs2 or \ str(MUU.mutated[c1:c2 + 1]) == str(MUU.mutated[c1-l:(c2-l)+1]) : - O.addMessage(__file__, 2, "WINSDUP", "Insertion of %s at position c.%s (g.%i) " \ - "was given, however, the HGVS notation prescribes that it " \ - "should be a duplication of %s at position c.%s (g.%i)." % ( - RawVar.Arg1, M.g2c(start_g), start_g, - str(MUU.mutated[c1:c2 + 1]), M.g2c(start_g + shiftlen), - start_g + shiftlen)) + O.addMessage(__file__, 2, "WINSDUP", + "Insertion of %s at position c.%s (g.%i) was given, " \ + "however, the HGVS notation prescribes that it should be a " \ + "duplication of %s at position c.%s (g.%i)." 
% (RawVar.Arg1, + M.g2c(start_g), start_g, str(MUU.mutated[c1:c2 + 1]), + M.g2c(start_g + shiftlen), start_g + shiftlen)) start_g += shiftlen end_g += shiftlen corr = 1 @@ -487,29 +554,41 @@ def __rv(MUU, record, GeneSymbol, RawVar, M, RefType, O, NM) : c1, c2 = __rangeToC(M, start_g, end_g) if corr : - NM.c += c1 + '_' + c2 + "dup" + #NM.c += c1 + '_' + c2 + "dup" + GenRecordInstance.name(start_g, end_g, "dup", "", "") NM.g += str(start_g) + '_' + str(end_g) + "dup" else : - NM.c += c1 + '_' + c2 + "ins" + RawVar.Arg1 + #NM.c += c1 + '_' + c2 + "ins" + RawVar.Arg1 + GenRecordInstance.name(start_g, end_g, "ins", "", "") NM.g += str(start_g) + '_' + str(end_g) + "ins" + Arg1 #if + # DelIns. if RawVar.MutationType == "delins" : - lcp = __lcp(RawVar.Arg1, RawVar.Arg2) - lcs = -__lcs(RawVar.Arg1, RawVar.Arg2) - + Arg1 = RawVar.Arg1 + if not RawVar.Arg1 : + Arg1 = MUU.orig[start_g - 1:end_g] + + lcp = __lcp(Arg1, RawVar.Arg2) + lcs = __lcs(Arg1, RawVar.Arg2) + + if str(Arg1) == str(RawVar.Arg2) : + O.addMessage(__file__, 2, "WNOCHANGE", + "Sequence %s at position c.%s (g.%i) is identical to the " \ + "variant." % ( + str(record.seq[start_g - 1:end_g]), + M.g2c(start_g), start_g)) + return NM + ins_part = RawVar.Arg2 if lcp or lcs : - del_part = __trim(RawVar.Arg1, lcp, lcs) + del_part = __trim(Arg1, lcp, lcs) ins_part = __trim(RawVar.Arg2, lcp, lcs) - # FIXME Output stuff and such. 
- print start_g + lcp, end_g - lcs - print M.g2c(start_g + lcp), M.g2c(end_g - lcs) - print "del%sins%s" % (del_part, ins_part) - - MUU.delinsM(start_g, end_g, Arg2) + #O.addMessage(__file__, 2, "WNOTMINIMAL", + # "") - #NM.c += str(M.g2c(start_g)) + '_' + str(M.g2c(end_g)) + "delins" + RawVar.Arg2 - #NM.g += str(start_g) + '_' + str(end_g) + "delins" + RawVar.Arg2 + start_g += lcp + end_g -= lcs + MUU.delinsM(start_g, end_g, ins_part) if start_g != end_g : c1, c2 = __rangeToC(M, start_g, end_g) @@ -520,303 +599,111 @@ def __rv(MUU, record, GeneSymbol, RawVar, M, RefType, O, NM) : cpos = str(M.g2c(start_g)) gpos = str(start_g) #else - #NM.c += cpos + "delins" + RawVar.Arg2 - #NM.g += gpos + "delins" + RawVar.Arg2 - NM.c += cpos + "delins" + __maybeInvert(M, Arg2) - NM.g += gpos + "delins" + Arg2 + #NM.c += cpos + "delins" + __maybeInvert(M, ins_part) + GenRecordInstance.name(start_g, end_g, "delins", ins_part, "") + NM.g += gpos + "delins" + ins_part #if - #print MUU.mutated[start_g - 20:start_g + 20] return NM #__rv -def __ppp(MUU, record, parts, recordDict, refseq, depth, O) : - #printp("+++ recurse +++", depth) - #printp(repr(parts), depth) - #printp(parts, depth) - """ - printp(parts[4], depth) - printp(repr(parts[4]), depth) - printp(parts[4][0][0][0].Main, 2) - printp(parts, depth) - """ +def __ppp(MUU, record, parts, GenRecordInstance, refseq, depth, O) : if parts.RefSeqAcc : refseq = parts.RefSeqAcc - #printp("RefSeqAcc: " + str(refseq), depth) - #printp("RefType: " + str(parts.RefType), depth) - - #printp("Version: " + str(parts.Version), depth) - if parts.Gene : - print "Gene Symbol: " + str(parts.Gene.GeneSymbol) - #printp("Transcript variant: " + str(parts.Gene.TransVar), depth) - #printp("Protein isoform: " + str(parts.Gene.ProtIso), depth) - #if - """ - if parts.ChimeronSet : - #printp(str(parts.ChimeronSet), depth) - for i in parts.ChimeronSet : - printp("ChimeronSet", depth) - __ppp(MUU, record, i, recordDict, refseq, depth + 1, O) - #for - #if - if 
parts.MosaicSet : - #printp(str(parts.MosaicSet), depth) - for i in parts.MosaicSet : - printp("MosaicSet", depth) - __ppp(MUU, record, i, recordDict, refseq, depth + 1, O) - #for - #if - if parts.SimpleAlleleVarSet : - #printp(str(parts.SimpleAlleleVarSet), depth) - for i in parts.SimpleAlleleVarSet : - printp("SimpleAlleleVarSet", depth) - __ppp(MUU, record, i, recordDict, refseq, depth + 1, O) - #for - #__ppp(MUU, record, i, recordDict, refseq, depth + 1) - #if - if parts.MultiAlleleVars : - printp(str(parts.MultiAlleleVars), depth) - for i in parts.MultiAlleleVars : - printp("MultiAlleleVars", depth) - print repr(i) - __ppp(MUU, record, i, recordDict, refseq, depth + 1, O) - #for - #if - """ if parts.RawVar or parts.SingleAlleleVarSet : - GS = "" - if recordDict.genelist : - GS = recordDict.genelist.keys()[0] - - if parts.Gene and parts.Gene.GeneSymbol : - GS = parts.Gene.GeneSymbol - #print "Gene Name: " + GS - #O.createOutputNode("genename", 1) # Info message. - #O.addToOutputNode(__file__, "genename", "INFO", GS) - O.addOutput("genename", GS) - - transcriptvariant = "001" - if parts.Gene and parts.Gene.TransVar : - transcriptvariant = parts.Gene.TransVar - #print "Transcript variant: " + transcriptvariant - #O.createOutputNode("transcriptvariant", 1) # Info message. - #O.addToOutputNode(__file__, "transcriptvariant", "INFO", transcriptvariant) - O.addOutput("transcriptvariant", transcriptvariant) - #print - - if recordDict.genelist : - if recordDict.genelist.has_key(GS) : - currentGene = recordDict.genelist[GS] - else : - print "No such gene %s in record." % GS - # FIXME Output an stuff. - return - #else - else : - currentGene = recordDict.source - W = currentGene.list[transcriptvariant] - - noTrans = False - if not W.mRNA : - if not W.exon: - O.addMessage(__file__, 2, "WNOMRNA", "No mRNA field found for gene %s, " \ - "transcript variant %s in GenBank record %s, " \ - "constructing it from CDS." 
% (GS, transcriptvariant, - record.id)) - if W.CDS : - if not W.CDS.list : - print "Extra warning" - # FIXME Output and stuff. - W.mRNA = W.CDS - W.mRNA.list = W.CDS.location - noTrans = True - else : - W.mRNA = W.CDS - #if - else : - print currentGene.location - # FIXME Output and stuff. - W.CDS = GenRecord.Locus() - W.CDS.location = W.location - W.mRNA = W.CDS - W.mRNA.list = currentGene.location - noTrans = True - #if - else : - O.addMessage(__file__, 2, "WNOMRNA", "No mRNA field found for gene %s, " \ - "transcript variant %s in GenBank record %s, " \ - "constructing it from gathered exon information." % (GS, - transcriptvariant, record.id)) - W.mRNA = W.exon - #if - #print W.mRNA.list - if not W.mRNA.list : - W.mRNA.list = W.mRNA.location - if W.CDS : - if not W.CDS.list : - O.addMessage(__file__, 2, "WNOCDS", "No CDS list found for gene %s, " \ - "transcript variant %s in GenBank record %s, " \ - "constructing it from mRNA list and CDS location." % (GS, - transcriptvariant, record.id)) - if W.mRNA.list : - W.CDS.list = __constructCDS(W.mRNA.list, W.CDS.location) - #print W.mRNA.list, W.CDS.location - #print W.CDS.list - else : - W.CDS.list = __constructCDS(W.mRNA.location, W.CDS.location) - #else : - # pass # Noncoding RNA? - - if parts.RefType == 'n' : - M = Crossmap.Crossmap( - W.mRNA.list, - [], - currentGene.orientation) - else : - if not W.CDS : - O.addMessage(__file__, 3, "ENOCDS", "No CDS information found for gene %s, " \ - "transcript variant %s in GenBank " \ - "record %s." % (GS, transcriptvariant, - record.id)) - return - #if - M = Crossmap.Crossmap( - W.mRNA.list, - W.CDS.location, - currentGene.orientation) - #else - #print W.mRNA - - #print recordDict["organelle"], recordDict["mol_type"] NM = newMut() - NM.c = parts.RefSeqAcc - NM.g = parts.RefSeqAcc - if parts.Version : - NM.c += '.' + parts.Version - NM.g += '.' + parts.Version - #if - if parts.RefType == 'n' : - NM.c += ":n." - else : - NM.c += ":c." 
- if recordDict.organelle and recordDict.organelle == "mitochondrion" : - NM.g += ":m." + + print GenRecordInstance.record.mol_type + #if parts.RefType == 'n' : + # #NM.c += "n." + #else : + # #NM.c += "c." + if GenRecordInstance.record.organelle and \ + GenRecordInstance.record.organelle == "mitochondrion" : + NM.g += "m." else : - if recordDict.genelist : - NM.g += ":g." - else : # EST - NM.g += ":" + if GenRecordInstance.record.geneList : + NM.g += "g." + #else : # EST + # NM.g += "" #else + GS = GenRecordInstance.record.geneList[0].name if parts.SingleAlleleVarSet : - NM.c += '[' + #NM.c += '[' NM.g += '[' for i in parts.SingleAlleleVarSet : - __rv(MUU, record, GS, i.RawVar, M, parts.RefType, O, NM) - NM.c += ';' + __rv(MUU, record, i.RawVar, GenRecordInstance, + parts.RefType, O, NM) + #NM.c += ';' NM.g += ';' #for - NM.c = NM.c[0:-1] + ']' + #NM.c = NM.c[0:-1] + ']' NM.g = NM.g[0:-1] + ']' #if else : - NM = __rv(MUU, record, GS, parts.RawVar, M, parts.RefType, O, NM) - - #print - #print "+++", NM.c - #print "+++", NM.g - #O.createOutputNode("variantdescription", 1) # Info message. - #O.addToOutputNode(__file__, "variantdescription", "INFO", NM.c) - #O.addToOutputNode(__file__, "variantdescription", "INFO", NM.g) - O.addOutput("variantdescription", NM.c) + NM = __rv(MUU, record, parts.RawVar, GenRecordInstance, + parts.RefType, O, NM) + + #O.addOutput("variantdescription", NM.c) O.addOutput("variantdescription", NM.g) del NM + W = GenRecordInstance.record.geneList[0].transcriptList[0] if not W.CDS : # Noncoding. 
return - if noTrans : - return - if not recordDict.genelist : # EST + #if noTrans : + # return + if not GenRecordInstance.record.geneList : # EST return - import Bio - from Bio.Seq import Seq - from Bio.Alphabet import IUPAC - - cds = Seq(str(__splice(MUU.orig, W.CDS.list)), IUPAC.unambiguous_dna) - cdsm = Seq(str(__nsplice(MUU.mutated, MUU.newSplice(W.mRNA.list), - MUU.newSplice(W.CDS.location), M.orientation)), + cds = Seq(str(__splice(MUU.orig, W.CDS.positionList)), + IUPAC.unambiguous_dna) + cdsm = Seq(str(__nsplice(MUU.mutated, + MUU.newSplice(W.mRNA.positionList), + MUU.newSplice(W.CDS.location), + W.CM.orientation)), IUPAC.unambiguous_dna) - if M.orientation == -1 : + if W.CM.orientation == -1 : cds = Bio.Seq.reverse_complement(cds) cdsm = Bio.Seq.reverse_complement(cdsm) - del M - #print "\n<b>Old protein:</b>" if '*' in cds.translate()[:-1] : - print "In frame stop codon found." + O.addMessage(__file__, 3, "ESTOP", "In frame stop codon found.") return #if orig = cds.translate(table = W.txTable, cds = True, to_stop = True) - #__bprint(orig + '*') - #O.createOutputNode("oldprotein", 1) # Info message. - #O.addToOutputNode(__file__, "oldprotein", "INFO", orig + '*') O.addOutput("oldprotein", orig + '*') - #print "\n\n<b>New protein:</b>" trans = cdsm.translate(table = W.txTable, to_stop = True) if not trans or trans[0] != 'M' : if str(cdsm[0:3]) in \ Bio.Data.CodonTable.unambiguous_dna_by_id[ W.txTable].start_codons : - __bprint('?') - print "\n\n<b>Alternative protein using start codon %s:</b>" % \ - str(cdsm[0:3]) - __bprint('M' + trans[1:] + '*') - #O.createOutputNode("altprotein", 1) # Info message. - #O.addToOutputNode(__file__, "altprotein", "INFO", 'M' + trans[1:] + '*') + O.addOutput("newprotein", '?') + O.addOutput("altstart", str(cdsm[0:3])) O.addOutput("altprotein", 'M' + trans[1:] + '*') else : __bprint('?') - #O.createOutputNode("newprotein", 1) # Info message. 
- #O.addToOutputNode(__file__, "newprotein", "INFO", '?') O.addOutput("newprotein", '?') else : - #__bprint(trans + '*') - #O.createOutputNode("newprotein", 1) # Info message. - #O.addToOutputNode(__file__, "newprotein", "INFO", trans + '*') O.addOutput("newprotein", trans + '*') - if protDescr : - #print - #print - #O.createOutputNode("proteindescription", 1) # Info message. - #O.addToOutputNode(__file__, "proteindescription", "INFO", - # __toProtDescr(orig, trans)) - O.addOutput("proteindescription", __toProtDescr(orig, trans)) - #if + if not parts.SingleAlleleVarSet : + O.addOutput("proteindescription", __toProtDescr( + W.CM.g2x(MUU.newSplice(W.CDS.location)[1])[0], orig, trans)) + else : + O.addOutput("proteindescription", "p.?") + + del W.CM #if #__ppp -def main(cmd) : - C = Config.Config() - O = Output.Output(__file__, C.Output) - - #O.LogMsg(__file__, "Received variant " + cmd) - O.addMessage(__file__, -1, "INFO", "Received variant " + cmd) - +def process(cmd, C, O) : parser = Parser.Nomenclatureparser(O) - #print cmd - #O.createOutputNode("inputvariant", 1) # Info message. - #O.addToOutputNode(__file__, "inputvariant", "INFO", cmd) O.addOutput("inputvariant", cmd) - #print ParseObj = parser.parse(cmd) - #print - #for i in parser.outputData : # HMMMM - # #print i.origin, i.name, i.msgType, i.severity, i.message - # print i, parser.outputData[i].message del parser if ParseObj : @@ -824,10 +711,9 @@ def main(cmd) : RetrieveRecord = ParseObj.RefSeqAcc + '.' + ParseObj.Version else : RetrieveRecord = ParseObj.RefSeqAcc + O.addOutput("reference", RetrieveRecord) - #print "Retrieving..." - - D = Db.Db("local", C.Db.internalDb, C.Db) + D = Db.Cache(C.Db) retriever = Retriever.Retriever(C.Retriever, O, D) record = retriever.loadrecord(RetrieveRecord) if not record : @@ -835,44 +721,27 @@ def main(cmd) : del retriever del D - #print "Dicting..." - D = GenRecord.GenRecord() - d = D.record2dict(record) - del D - #print "Printing..." 
- #D.printRecordDict(d, record) - - from Modules import Mutator - + GenRecordInstance = GenRecord.GenRecord(C.GenRecord, O) + GenRecordInstance.parseRecord(record) + MUU = Mutator.Mutator(record.seq, C.Mutator, O) + __ppp(MUU, record, ParseObj, GenRecordInstance, "", 0, O) + del MUU + return GenRecordInstance + #if +#process - #MUU.createOutputNode(__file__, "errors", 3) # Error message. - #MUU.createOutputNode(__file__, "warnings", 3) # Error message. +def main(cmd) : + C = Config.Config() + O = Output.Output(__file__, C.Output) - __ppp(MUU, record, ParseObj, d, "", 0, O) - #for i in MUU.outputData : # HMMMM - # #print i.origin, i.name, i.msgType, i.severity, i.message - # print i, MUU.outputData[i].message + O.addMessage(__file__, -1, "INFO", "Received variant " + cmd) + RD = process(cmd, C, O) - del MUU - #if - #print "\n\n" - O.Summary() - #O.LogMsg(__file__, "Finished processing variant " + cmd) O.addMessage(__file__, -1, "INFO", "Finished processing variant " + cmd) - #for i in O.outputData : # HMMMM - # #print i.origin, i.name, i.msgType, i.severity, i.message - # print i, O.outputData[i].message ### OUTPUT BLOCK ### - print "NEW OUTPUT BLOCK" - vd = O.getOutput("variantdescription") - if vd : - print vd[0] - print vd[1] - print - #if gn = O.getOutput("genename") if gn : print "Gene Name: " + gn[0] @@ -882,54 +751,70 @@ def main(cmd) : print #if - #for i in O.getOutput("fatalerrors") : - # print "Fatal (%s) %s: %s" % (i.origin, i.code, i.description) - #for i in O.getOutput("errors") : - # print "Error (%s) %s: %s" % (i.origin, i.code, i.description) - #for i in O.getOutput("warnings") : - # print "Warning (%s) %s: %s" % (i.origin, i.code, i.description) - #for i in O.getOutput("info") : - # print "Info (%s) %s: %s" % (i.origin, i.code, i.description) - #for i in O.getOutput("debug") : - # print "Debug (%s) %s: %s" % (i.origin, i.code, i.description) O.getMessages() + errors, warnings, summary = O.Summary() + print summary + print + + if not errors : + 
visualisation = O.getOutput("visualisation") + if visualisation : + for i in range(len(visualisation)) : + if i and not i % 3 : + print + print visualisation[i] + #for + print + #if - visualisation = O.getOutput("visualisation") - if visualisation : - for i in range(len(visualisation)) : - if i and not i % 3 : - print - print visualisation[i] - #for - print - #if - vd = O.getOutput("variantdescription") - if vd : - for i in vd : - print i - op = O.getOutput("oldprotein") - if op : - print "\n<b>Old protein:</b>" - __bprint(op[0]) - print - #if - np = O.getOutput("newprotein") - if np : - print "\n<b>New protein:</b>" - __bprint(np[0]) - print + reference = O.getOutput("reference")[0] + + for i in RD.record.geneList : + for j in i.transcriptList : + if ';' in j.description : + print "%s(%s_%s):%c.[%s]" % (reference, i.name, j.name, + j.molType, j.description) + else : + print "%s(%s_%s):%c.%s" % (reference, i.name, j.name, + j.molType, j.description) + + vd = O.getOutput("variantdescription") + if vd : + for i in vd : + print "%s:%s" % (reference, i) + + pd = O.getOutput("proteindescription") + if pd : + if O.getOutput("altprotein") : + print "%s:p.(0)" % reference + else : + print "%s:%s" % (reference, pd[0]) + #if + + op = O.getOutput("oldprotein") + if op : + print "\n<b>Old protein:</b>" + __bprint(op[0]) + print + #if + np = O.getOutput("newprotein") + if np : + print "\n<b>New protein:</b>" + __bprint(np[0]) + print + #if + ap = O.getOutput("altprotein") + if ap : + print "\n<b>Alternative protein using start codon %s:</b>" % \ + O.getOutput("altstart")[0] + __bprint(ap[0]) + print + #if #if - pd = O.getOutput("proteindescription") - if pd : - print - print pd[0] - print "/NEW OUTPUT BLOCK" ### OUTPUT BLOCK ### del O #main if __name__ == "__main__" : - import sys - main(sys.argv[1]) #if diff --git a/src/UCSC_update.py b/src/UCSC_update.py index 8125d63d..73cb0e29 100644 --- a/src/UCSC_update.py +++ b/src/UCSC_update.py @@ -3,26 +3,29 @@ """ Get updates on 
mapping information from the UCSC. - This program is intended to be run dayly from cron. + This program is intended to be run daily from cron. """ -import sys -import os -os.chdir(sys.argv[0].rsplit('/', 2)[0]) +import sys # sys.argv +import os # os.chdir() from Modules import Config from Modules import Output -from Modules import Db +from Modules.Db import Remote +from Modules.Db import Update + +os.chdir(sys.argv[0].rsplit('/', 2)[0]) C = Config.Config() O = Output.Output(__file__, C.Output) O.addMessage(__file__, -1, "INFO", "Starting UCSC mapping data update") for i in C.Db.dbNames : - RemoteDb = Db.Db("remote", i, C.Db) - LocalDb = Db.Db("local", i, C.Db) - + RemoteDb = Remote(i, C.Db) RemoteDb.get_Update() + del RemoteDb + + LocalDb = Update(i, C.Db) LocalDb.load_Update() count_Updates = LocalDb.count_Updates() @@ -36,11 +39,10 @@ for i in C.Db.dbNames : LocalDb.merge_cdsUpdates() #if LocalDb.merge_Update() + + del LocalDb #for O.addMessage(__file__, -1, "INFO", "UCSC mapping data update end") -del LocalDb -del RemoteDb -del O -del C +del O, C diff --git a/src/VarInfo.py b/src/VarInfo.py index 4df7273c..cda88548 100644 --- a/src/VarInfo.py +++ b/src/VarInfo.py @@ -72,12 +72,12 @@ def __getcoords(C, Loc, Type) : def __process(LOVD_ver, build, acc, var, C, O) : # Make a connection to the MySQL database with the username / db # information from the configuration file. - Database = Db.Db("local", build, C.Db) # Open the database. - if not Database.opened : + if not build in C.Db.dbNames : O.addMessage(__file__, 4, "EARG", "Database %s not found." % build) print "Error (Variant_info): Database %s not found." % build return #if + Database = Db.Mapping(build, C.Db) # Open the database. # Get the rest of the input variables. accno = acc diff --git a/src/handler.py b/src/handler.py index 45bfc73d..84b3ad39 100644 --- a/src/handler.py +++ b/src/handler.py @@ -11,16 +11,14 @@ """ import os +import bz2 from ZSI import dispatch - -# This is a workaround for pydoc. 
Aparently it crashes when a module can not be -# imported. -try : - from mod_python import apache, publisher -except ImportError : - pass +from soaplib.client import make_service_client +from mod_python import apache, publisher from Modules import Web +from Modules import Config + import webservice def handler(req): @@ -76,16 +74,43 @@ def handler(req): reqFile = req.uri.split('/')[-1] req.content_type = 'text/plain' req.headers_out["Content-Disposition"] = \ - "attachment; filename = \"%s\"" % reqFile # To force downloading. - args = {"path" : reqPath} # Replace the path variable. + "attachment; filename = \"%s\"" % reqFile # To force downloading. + args = {"path" : reqPath} # Replace the path. req.write(W.tal("HTML", "templates/" + reqFile, args)) return apache.OK #if + # Return raw content (for batch checker results). + if "Results" in req.uri : + reqFile = req.uri.split('/')[-1] + C = Config.Config() + req.write(open("%s/%s" % (C.Scheduler.resultsDir, reqFile)).read()) + del C + return apache.OK + #if + + # Return uncompressed GenBank files from the cache. + if "GenBank" in req.uri : + reqFile = req.uri.split('/')[-1] + C = Config.Config() + fileName = "%s/%s.bz2" % (C.Retriever.cache, reqFile) + if os.path.isfile(fileName) : + handle = bz2.BZ2File("%s/%s.bz2" % (C.Retriever.cache, reqFile), + "r") + req.content_type = 'text/plain' + req.headers_out["Content-Disposition"] = \ + "attachment; filename = \"%s\"" % reqFile # Force downloading. + req.write(handle.read()) + handle.close() + del C + #if + else : + return apache.HTTP_FORBIDDEN + return apache.OK + #if + # Generate the WSDL file from the MutalyzerService class. 
if ".wsdl" in req.uri : - from soaplib.client import make_service_client - servicepath = "http://" + reqPath + "/services" client = make_service_client(servicepath, webservice.MutalyzerService()) req.content_type = 'text/xml' diff --git a/src/index.py b/src/index.py index a42649bf..b535fbc0 100644 --- a/src/index.py +++ b/src/index.py @@ -15,12 +15,18 @@ import Mutalyzer import VarInfo -from Modules import Web -from mod_python import apache -from Modules import Config import pydoc import webservice +from mod_python import apache + +from Modules import Web +from Modules import Config +from Modules import Output +from Modules import Db +from Modules import Scheduler +from Modules import File + def index(req) : """ The mutation checker page. @@ -111,6 +117,9 @@ def download(req) : #download def upload(req) : + """ + """ + C = Config.Config() maxUploadSize = C.Retriever.maxDldSize del C @@ -136,6 +145,41 @@ def upload(req) : return ret #upload +def batch(req) : + """ + """ + + W = Web.Web() + eMail = "" + if req.form : + eMail = req.form['eMail'] + fileUpload = req.form['file'] + + if fileUpload.filename and W.isEMail(eMail) : + C = Config.Config() + D = Db.Batch(C.Db) + S = Scheduler.Scheduler(C.Scheduler, D) + O = Output.Output(__file__, C.Output) + FileInstance = File.File(C.File, O) + + job = FileInstance.parseBatchFile(fileUpload.file) + S.addJob("1231243", eMail, job, "http://%s%s" % (req.hostname, + req.uri)) + + del FileInstance, S, D, C + #if + #if + + args = { + "version" : W.version, + "lastEMail" : eMail + } + + ret = W.tal("HTML", "templates/batch.html", args) + del W + return ret +#batch + def documentation(req) : """ Generate documentation for the webservice. diff --git a/src/webservice.py b/src/webservice.py index 510f1b82..b776dac5 100644 --- a/src/webservice.py +++ b/src/webservice.py @@ -1,5 +1,11 @@ #!/usr/bin/python +""" + Mutalyzer webservices. + + Public classes: + MutalyzerService ; Mutalyzer webservices. 
+""" from soaplib.wsgi_soap import SimpleWSGISoapApp from soaplib.service import soapmethod @@ -9,7 +15,7 @@ from ZSI import TC from ZSI.fault import Fault from Modules import Web -from Modules import Db +from Modules.Db import Mapping from Modules import Output from Modules import Config from Modules import Parser @@ -48,7 +54,7 @@ class MutalyzerService(SimpleWSGISoapApp) : gTocConversion(self, build, variant) ; Convert g. to c. """ - def __checkBuild(self, L, D, build) : + def __checkBuild(self, build, config) : """ Check if the build is supported (hg18 or hg19). @@ -61,7 +67,7 @@ class MutalyzerService(SimpleWSGISoapApp) : Nothing (but raises an EARG exception). """ - if not D.opened : + if not build in config.dbNames : L.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault(Fault.Client, "EARG", detail = "The build argument (%s) was not a valid " \ @@ -178,8 +184,8 @@ class MutalyzerService(SimpleWSGISoapApp) : "Received request getTranscripts(%s %s %s)" % (build, chrom, pos)) - D = Db.Db("local", build, C.Db) - self.__checkBuild(L, D, build) + self.__checkBuild(build, C.Db) + D = Mapping(build, C.Db) self.__checkChrom(L, D, chrom) self.__checkPos(L, pos) @@ -220,8 +226,8 @@ class MutalyzerService(SimpleWSGISoapApp) : "Received request getTranscriptsRange(%s %s %s %s %s)" % (build, chrom, pos1, pos2, method)) - D = Db.Db("local", build, C.Db) - self.__checkBuild(L, D, build) + D = Mapping(build, C.Db) + self.__checkBuild(build, C.Db) ret = D.get_Transcripts(chrom, pos1, pos2, method) L.addMessage(__file__, -1, "INFO", @@ -252,8 +258,8 @@ class MutalyzerService(SimpleWSGISoapApp) : L.addMessage(__file__, -1, "INFO", "Received request getGeneName(%s %s)" % (build, accno)) - D = Db.Db("local", build, C.Db) - self.__checkBuild(L, D, build) + D = Mapping(build, C.Db) + self.__checkBuild(build, C.Db) ret = D.get_GeneName(accno.split('.')[0]) L.addMessage(__file__, -1, "INFO", @@ -424,7 +430,7 @@ class MutalyzerService(SimpleWSGISoapApp) : """ Conf = 
Config.Config() # Read the configuration file. - Database = Db.Db("local", build, Conf.Db) + D = Mapping(build, C.Db) O = Output.Output(__file__, Conf.Output) diff --git a/templates/batch.html b/templates/batch.html new file mode 100644 index 00000000..c25e6098 --- /dev/null +++ b/templates/batch.html @@ -0,0 +1,40 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" + "http://www.w3.org/TR/html4/loose.dtd"> +<html> + <head> + <meta http-equiv = "Content-type" content = "text/html; charset=UTF-8"> + <title tal:content = "structure string:Mutalyzer ${version}"></title> + <script type = "text/javascript" + language = "javascript" + src = "test.js"> + </script> + </head> + <body onload = "yo()"> +<!-- + <div metal:use-macro = "sitemacros/macros/menu"></div> +--> + <center> + <big tal:content = "structure string:Mutalyzer ${version} batch checker."> + </big> + </center><br> + Blablabla + <br> + <form action = "" method = "post" enctype = "multipart/form-data"> + <input + type = "file" + name = "file" + size = "100%" + ><br> + <input + type = "text" + name = "eMail" + tal:attributes = "value lastEMail" + size = "100%" + ><br> + <input + type="submit" + value="Submit" + > + </form> + </body> +</html> -- GitLab