diff --git a/doc/abstractESHG/Makefile b/doc/abstractESHG/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..26fe678bea8e32cb1bb435b6dddb335fd353be8b
--- /dev/null
+++ b/doc/abstractESHG/Makefile
@@ -0,0 +1,61 @@
+# Makefile
+#
+# Builds every LaTeX document in this directory (each *.tex that contains
+# \begin{document}) via latex -> bibtex -> dvips -> ps2pdf, and runs gnuplot
+# on every *.gnp script to obtain the matching .eps.
+
+LATEX := latex
+BIBTEX := bibtex
+DVIPS := dvips
+PS2PDF := ps2pdf14
+GNUPLOT := gnuplot
+
+# Target lists are computed once at parse time (:=), not on every expansion.
+# PDF: documents; BIB: files that use \cite or \nocite; EPS: gnuplot scripts.
+PDF := $(addsuffix .pdf, $(basename $(shell grep -l '\\begin{document' *.tex)))
+BIB := $(addsuffix .bbl, $(basename $(shell grep -l '\\nocite{\|\\cite{' *.tex)))
+EPS := $(addsuffix .eps, $(basename $(wildcard *.gnp)))
+
+# These goals are commands, not files.
+.PHONY: all clean release distclean
+
+all: $(EPS) $(BIB) $(PDF)
+
+# Remove LaTeX/BibTeX by-products; final documents and figures are kept.
+clean:
+	rm -f *.blg *.log *.nav *.out *.snm *.toc *.dvi *.aux *.vrb
+
+# Build everything, then tidy up. clean runs from the recipe rather than as a
+# prerequisite so it can never start before (or, under -j, alongside) all.
+release: all
+	$(MAKE) clean
+
+# Also remove the generated documents and figures.
+distclean: clean
+	rm -f $(PDF) $(EPS)
+
+# One latex pass to obtain the .aux; the .dvi from this pass is removed
+# (presumably so that the %.dvi rule below always regenerates it).
+%.aux: %.tex
+	$(LATEX) $<
+	rm -f $*.dvi
+
+%.bbl: %.aux
+	$(BIBTEX) $*
+
+# latex is run three times (the usual way to let references settle).
+%.dvi: %.tex
+	$(LATEX) $<
+	$(LATEX) $<
+	$(LATEX) $<
+
+%.ps: %.dvi
+	$(DVIPS) $< -o $@
+
+%.pdf: %.ps
+	$(PS2PDF) $< $@
+
+# NOTE(review): assumes each .gnp script writes <stem>.eps itself, since
+# gnuplot's stdout is not redirected here -- confirm against the scripts.
+%.eps: %.gnp
+	$(GNUPLOT) < $<
diff --git a/doc/abstractESHG/abstract.bbl b/doc/abstractESHG/abstract.bbl
new file mode 100644
index 0000000000000000000000000000000000000000..932593530772999d73b325a8c2ff42e99c8d6dd4
--- /dev/null
+++ b/doc/abstractESHG/abstract.bbl
@@ -0,0 +1,15 @@
+\begin{thebibliography}{1}
+
+\bibitem{NOM1}
+J.T. {den}~Dunnen and S.E. Antonarakis.
+\newblock Mutation nomenclature extensions and suggestions to describe complex
+  mutations: {A} discussion.
+\newblock {\em Human Mutation}, 15:7--12, 2000.
+
+\bibitem{hgvs_bnf}
+J.F.J. Laros, A.~Blavier, J.T. den Dunnen, and P.E.M. Taschner.
+\newblock A formalized description of the standard human variant nomenclature
+  in extended backus-naur form.
+\newblock {\em BMC Bioinformatics}, 12(Suppl 4):S5, 2011.
+ +\end{thebibliography} diff --git a/doc/abstractESHG/abstract.tex b/doc/abstractESHG/abstract.tex new file mode 100644 index 0000000000000000000000000000000000000000..f38c3642ce9ff9f067acdae9d88a695a9f74fbf0 --- /dev/null +++ b/doc/abstractESHG/abstract.tex @@ -0,0 +1,66 @@ +\documentclass{article} +\usepackage{fullpage} + +\author{J.F.J. Laros \and M. Vermaat \and J.T. den Dunnen \and P.E.M. Taschner} +\title{Disambiguating complex HGVS variant descriptions} + +\frenchspacing + +\begin{document} + +\maketitle + +\begin{abstract} \noindent +The \emph{Human Genome Variation Society} (HGVS)~\cite{NOM1} nomenclature for +the description of sequence variations \ldots + +\paragraph{Background} +The recent formalisation of the HGVS nomenclature syntax~\cite{hgvs_bnf} makes +it possible to automatically interpret the variant description and reconstruct +the observed sequence. This formalisation, however, tells us nothing about how +to make such a description. + +\paragraph{Problem description} +Formally, a variant description is, together with the reference sequence, the +input of a function that transforms the reference sequence into the observed +sequence. This function is not injective; multiple descriptions can generate +the same observed sequence. If, for example, we observe a change from +\texttt{ATGCTTCAGG} to \texttt{CTGAAGCATT}, the untrained eye might see this +change as \texttt{1\_10delinsCTGAAGCATT}, while the preferred description would +be \texttt{1\_9inv;10G>T}. We call the set of descriptions that result in the +same observed sequence the set of \emph{equivalent descriptions}. + +\paragraph{Solution} +We present an algorithm that, given a reference sequence and an observed +sequence, will generate the HGVS description of the variant.
Because there is +no direct link between the variant description that is used to reconstruct the +observed sequence and the generated variant description, this algorithm will +always generate the same description, no matter which description in the set of +equivalent descriptions is used. + +\paragraph{Implementation} +We start by finding the smallest indel that describes the change by removing +the longest common prefix and the longest common suffix from the reference and +the observed sequence. Next, we recursively try to find a shorter description +using the following strategy: + +First we determine the \emph{longest common substring} (LCS) in both the +forward and the reverse strand. If the LCS is found on the forward strand, we +split the description into two parts and recursively describe the separate parts. +If the LCS is found on the reverse strand, we split the description into three +parts, the same two parts that we would get in the former case, plus an +inversion in between. + +The recursion ends if an elementary description (substitution, insertion, +deletion, etc.) is found. If a variant was split, the length of the description +is compared to the length of the indel that was split and the shorter of the +two is returned. + +\paragraph{Conclusion} +It works. + +\bibliographystyle{plain} +\bibliography{/home/jfjlaros/projects/bibliography.bib} +\end{abstract} + +\end{document}