Commit 48810a3e authored by Laros's avatar Laros
Browse files

Final draft of data sharing lecture.

parent 72937681
......@@ -42,12 +42,18 @@
\end{itemize}
\end{pframe}
%\section{Bulk data}
\section{Bulk data}
\begin{pframe}
\begin{center}
\includegraphics[height=0.7\textheight]{frequency}
\end{center}
\end{pframe}
\subsection{Requirements}
\begin{pframe}
We aim for a solution that has the following properties:
\begin{itemize}
\item Not only frequencies, but also \emph{supporting evidence} (numerator,
\item Not only counts, but also \emph{reference calls} (numerator,
denominator).
\item Support for \emph{labels}.
\item Support for \emph{timestamps}.
......@@ -118,23 +124,6 @@
\end{minipage}
\end{pframe}
%\subsection{Options}
%\begin{pframe}
% Choices for the database layout:
% \begin{itemize}
% \item \emph{Sample} oriented.
% \item \emph{Variant} oriented.
% \end{itemize}
% \bigskip
%
% Choices for the infrastructure:
% \begin{itemize}
% \item Centralised.
% \item Decentralised.
% \end{itemize}
%\end{pframe}
%\section{Sample oriented server}
\subsection{Database layout}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}\begin{figure}[]
......@@ -156,22 +145,6 @@
\end{minipage}
\end{pframe}
\subsection{Features and concerns}
\begin{pframe}
All requirements are met:
\begin{itemize}
\item Frequencies and supporting evidence.
\item Labels.
\item Timestamps.
\end{itemize}
\bigskip
We store more information than strictly necessary:
\begin{itemize}
\item Grouping by sample.
\end{itemize}
\end{pframe}
\subsection{Security requirements}
\begin{pframe}
General security measures:
......@@ -191,8 +164,24 @@
\end{itemize}
\end{pframe}
%\section{Variant oriented server}
\subsection{Database layout}
\subsection{Features and concerns}
\begin{pframe}
All requirements are met:
\begin{itemize}
\item Frequencies and supporting evidence.
\item Labels.
\item Timestamps.
\end{itemize}
\bigskip
Privacy concerns:
\begin{itemize}
\item Variants are grouped by sample.
\item The samples have labels.
\end{itemize}
\end{pframe}
\subsection{Alternative: variant oriented database}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}\begin{figure}[]
\begin{center}
......@@ -207,193 +196,131 @@
\begin{itemize}
\item Metadata goes into the \emph{Observation} table.
\end{itemize}
\bigskip
Similar layout for genotype quality data.
\end{minipage}
\end{pframe}
\subsection{Features and concerns}
\begin{pframe}
Requirements that are met:
\begin{itemize}
\item Frequencies and supporting evidence.
\item Labels.
\end{itemize}
\bigskip
But not:
Samples can still be identified:
\begin{itemize}
\item Timestamps.
\item Timestamps are a unique label for a group of variants.
\item Any unique combination of labels.
\begin{itemize}
\item New gene panels, very rare diseases.
\end{itemize}
\end{itemize}
\bigskip
Timestamps are a unique label for a group of variants.
\end{pframe}
\subsection{Security requirements}
\subsection{Alternative: pooling}
\begin{pframe}
No general security measures or restrictions on the interface are needed.
Instead of variants or samples, only \emph{aggregated data} is shared.
\bigskip
Pitfall: A \emph{unique combination of labels} may still be associated with one
sample.
\begin{itemize}
\item New gene panels.
\item Very rare diseases.
\end{itemize}
\end{pframe}
\subsection{Additional notes}
\begin{pframe}
Pros:
Requirements:
\begin{itemize}
\item Database can be completely open.
\item Additional local infrastructure.
\end{itemize}
\bigskip
Cons:
Concerns:
\begin{itemize}
\item No duplicate detection.
\item No sample removal.
\begin{itemize}
\item Rolling back a submission is possible.
\end{itemize}
\item Limited data integrity checks.
\item Requires more resources.
\item Works on paper, but needs to be developed.
\item Potentially long waiting times.
\end{itemize}
\end{pframe}
\bigskip
\subsection{Additional notes}
\begin{pframe}
Database content:
Use case ($1500$ samples, $50$ gene panels):
\begin{itemize}
\item Duplicate detection.
\item Removal of samples.
\item Quality control at submission time.
\item Pool size $100$, average waiting time of $3$ years.
\item Pool size $10$, waiting time of years for some of our IDPs.
\end{itemize}
\end{pframe}
\subsection{Technical issues}
\subsection{Hybrid solution}
\begin{pframe}
Very important to ``speak the same language''.
We use a centralised, sample oriented server.
\bigskip
Make sure we use common names:
Additionally, each institute can set up their own server, which can be
queried by the centralised server.
\begin{itemize}
\item Reference sequence (hg19, hg38).
\item Labels.
\begin{itemize}
\item Gene panels.
\item Disease/syndrome names.
\end{itemize}
\item Conceptually the same as client side pooling.
\item Data is available instantaneously.
\end{itemize}
\end{pframe}
\bigskip
\begin{pframe}
\begin{figure}[]
{\Large\texttt{
\begin{tabular}{llll}
\#CHROM & POS & REF & ALT\\
\onslide<1->{1 & 884551 & GAGA\color{red}AAGA & GAGA}\\
\onslide<2->{1 & 884552 & AGA\color{red}AAGA & AGA}\\
\onslide<3->{1 & 884553 & GA\color{red}AAGA & GA}\\
\onslide<4->{1 & 884554 & A\color{red}AAGA & A}
\end{tabular}
}}
\caption{Deletion of \bt{AAGA}.}
\end{figure}
About $80$\% of the institutes are content with using only the centralised
server.
\end{pframe}
%\section{Pooling}
\subsection{Concept}
\subsection{Technical issues}
\begin{pframe}
Instead of submitting variants or samples, only \emph{aggregated data} is
shared.
Very important to use the same metadata and naming.
\bigskip
Requirements:
Genome build.
\begin{itemize}
\item Additional local infrastructure.
\item Reference allele is checked by the database.
\end{itemize}
\bigskip
Concerns:
Ontologies and automatic checking of metadata.
\begin{itemize}
\item Potentially long waiting times.
\item Gene panels.
\item Disease/syndrome names.
\end{itemize}
\bigskip
Use case ($1500$ samples, $50$ gene panels):
\begin{itemize}
\item Pool size $100$, average waiting time of $3$ years.
\item Pool size $10$, waiting time of $4$ years for some of our IDPs.
\end{itemize}
Variant calls.
\vfill
\permfoot{\url{http://varda.readthedocs.io/en/latest/}}
\end{pframe}
%\section{Decentralised servers}
\subsection{Concept}
\begin{pframe}
Each institute gets their own server; a centralised \emph{query interface}
queries these databases.
Beware of the freedom allowed in VCF files.
\bigskip
Requirements:
\begin{itemize}
\item Development of a centralised query interface.
\item Installation and maintenance of several servers.
\end{itemize}
\begin{minipage}{\textwidth}
\begin{figure}[]
{\Large\texttt{
\begin{tabular}{llll}
\#CHROM & POS & REF & ALT\\
\onslide<1->{1 & 884551 & GAGA\color{red}AAGA & GAGA}\\
\onslide<2->{1 & 884552 & AGA\color{red}AAGA & AGA}\\
\onslide<3->{1 & 884553 & GA\color{red}AAGA & GA}\\
\onslide<4->{1 & 884554 & A\color{red}AAGA & A}
\end{tabular}
}}
\caption{Deletion of \bt{AAGA}.}
\end{figure}
\end{minipage}
\bigskip
Concerns:
\begin{itemize}
\item Does not solve any of the security concerns.
\end{itemize}
\end{pframe}
\onslide<4->{Disambiguation is done by the database.}
\vfill
%\section{Conclusions}
%\subsection{Summary}
%\begin{pframe}
% \begin{table}[]
% \begin{center}
% \begin{tabular}{lccc}
% type & requirements & development & maintenance\\
% \hline
% sample & yes & no & low\\
% sample + pooling & yes & yes & high\\
% variant & no & yes & low\\
% distributed & yes & yes & high\\
% \end{tabular}
% \end{center}
% \caption{Solutions and consequences.}
% \end{table}
%
% Decentralised solutions will require a substantial amount of additional
% effort:
% \begin{itemize}
% \item Development.
% \item Maintenance.
% \end{itemize}
%\end{pframe}
\permfoot{\url{http://varda.readthedocs.io/en/latest/}}
\end{pframe}
\section{Interpreted data}
\begin{pframe}
\begin{center}
\includegraphics[height=0.7\textheight]{diagnostics}
\end{center}
\end{pframe}
\subsection{HGVS nomenclature}
\begin{pframe}
Recommendations for the description of sequence variants.
\bigskip
\textit{HGVS-nomenclature is used to report and exchange information
regarding variants found in DNA, RNA and protein sequences and serves as an
international standard in DNA diagnostics. HGVS-nomenclature is authorised by
the Human Genome Variation Society (HGVS), the Human Variome Project (HVP)
and the HUman Genome Organization (HUGO).}
regarding variants found in DNA, RNA and protein sequences ...}
\bigskip
The famous \textbf{g.} and \textbf{c.} notation:
The famous \textbf{g.}, \textbf{c.} and \textbf{p.} notation:
\begin{itemize}
\item {\Large\texttt{NC\_000011.9:g.111959695G>T}}.
\item {\Large\texttt{NM\_003002.3:c.274G>T}}.
\item \bt{NC\_000011.9:g.111959695G>T}
\item \bt{NM\_003002.3:c.274G>T}
\item \bt{NP\_002993.1:p.Asp92Tyr}
\end{itemize}
\vfill
......@@ -404,10 +331,10 @@
\begin{figure}[]
\begin{center}
{\Large\texttt{
\only<1>{TTGAATCCT \color{red}TGCTC \color{black}TGCGATGGA\\}%
\only<2>{TTGAATCCTT \color{red}GCTCT \color{black}GCGATGGA\\}%
\only<3>{TTGAATCCTTG \color{red}CTCTG \color{black}CGATGGA\\}%
\only<4->{TTGAATCCTTGC \color{red}TCTGC \color{black}GATGGA\\}%
\only<1>{...TTGAATCCT \color{red}TGCTC \color{black}TGCGATGGA...\\}%
\only<2>{...TTGAATCCTT \color{red}GCTCT \color{black}GCGATGGA...\\}%
\only<3>{...TTGAATCCTTG \color{red}CTCTG \color{black}CGATGGA...\\}%
\only<4->{...TTGAATCCTTGC \color{red}TCTGC \color{black}GATGGA...\\}%
\vspace{1cm}
\begin{tabular}{l}
\onslide<1->{NM\_003002.2:c.262\_266del}\\
......@@ -419,6 +346,9 @@
\end{center}
\caption{Deletion of \bt{TCTGC}.}
\end{figure}
\bigskip
\onslide<4->{$44$\% of all insertions and deletions suffer from this.}
\end{pframe}
\begin{pframe}
......@@ -426,29 +356,36 @@
\begin{center}
Genome\\
{\Large\texttt{%
3' AACTTAGGA \color{red}ACGAG \color{black}ACGCTACCT 5'\\
5' TTGAATCCTTGC \color{red}TCTGC \color{black}GATGGA 3'\\
3' ...AACTTAGGA \color{red}ACGAG \color{black}ACGCTACCT... 5'\\
5' ...TTGAATCCTTGC \color{red}TCTGC \color{black}GATGGA... 3'\\
}}
Transcript
\end{center}
\caption{3' rule.}
\end{figure}
\bigskip
Commonly used approach (almost all annotation tools):
\begin{itemize}
\item Calculate the coordinates based on the mapping of the transcript.
\item Calculate the \emph{reverse complement} of the inserted sequence.
\end{itemize}
\end{pframe}
\begin{pframe}
\begin{figure}[]
{\Large\texttt{%
\only<1>{%
ATGGCTCC \color{red}TG- \color{black}CCATGGAA\\
ATGGCTCC \color{red}TGC \color{black}CCATGGAA\\
...ATGGCTCC \color{red}TG- \color{black}CCATGGAA...\\
...ATGGCTCC \color{red}TGC \color{black}CCATGGAA...\\
}%
\only<2>{%
ATGGCTCCTG \color{red}- \color{black}CCATGGAA\\
ATGGCTCCTG \color{red}C \color{black}CCATGGAA\\
...ATGGCTCCTG \color{red}- \color{black}CCATGGAA...\\
...ATGGCTCCTG \color{red}C \color{black}CCATGGAA...\\
}%
\only<3->{%
ATGGCTCCTGCC \color{red}- \color{black}ATGGAA\\
ATGGCTCCTGCC \color{red}C \color{black}ATGGAA\\
...ATGGCTCCTGCC \color{red}- \color{black}ATGGAA...\\
...ATGGCTCCTGCC \color{red}C \color{black}ATGGAA...\\
}%
\vspace{1cm}
\begin{tabular}{l}
......@@ -458,17 +395,19 @@
\onslide<4->{NM\_002001.2:c.12dup}
\end{tabular}
}}
\caption{}
\caption{Deletion of a \bt{C}.}
\end{figure}
\smallskip
A more extreme example.
\end{pframe}
\subsection{Mutalyzer}
\begin{pframe}
Mutalyzer: a curational tool for Locus Specific Mutation Databases (LSDBs).
A curational tool for Locus Specific Mutation Databases.
\bigskip
Variant nomenclature checker applying Human Genome Variation Society (HGVS)
guidelines.
Variant nomenclature checker applying the HGVS guidelines:
\begin{itemize}
\item Is the syntax of the variant description valid?
\item Does the reference sequence exist?
......@@ -492,8 +431,7 @@
\begin{center}
\includegraphics[width=\textwidth]{mutalyzer_disambiguation}
\end{center}
\caption{}
\label{}
\caption{\bt{NM\_002001.2:c.9\_10delinsTGC}.}
\end{figure}
\end{pframe}
......@@ -502,15 +440,64 @@
\begin{center}
\includegraphics[width=\textwidth]{mutalyzer_disambiguation_result}
\end{center}
\caption{}
\label{}
\caption{\bt{NM\_002001.2:c.12dup}.}
\end{figure}
\end{pframe}
\begin{pframe}
Mutalyzer is free:
\begin{itemize}
\item Open source (AGPL license).
\item Available via GitHub.
\end{itemize}
\bigskip
Available via the web:
\begin{itemize}
\item Interactive user interface.
\item Batch interface:
\begin{itemize}
\item CSV, XLS, ODS.
\end{itemize}
\item Webservices (to integrate in a pipeline):
\begin{itemize}
\item SOAP.
\item HTTP/RPC+JSON.
\end{itemize}
\end{itemize}
\vfill
\permfoot{\url{https://mutalyzer.nl}}\\
\permfoot{\url{https://github.com/mutalyzer/mutalyzer}}
\end{pframe}
\section{Conclusions}
\subsection{Pitfalls}
\begin{pframe}
Mismatches in annotation are problematic:
\begin{itemize}
\item Genome build.
\item RefSeq references (with version numbers).
\item Effect description (pathogenic, class 5, class A, \ldots).
\end{itemize}
\bigskip
A lot of variant descriptions are ambiguous:
\begin{itemize}
\item VCF indels.
\item HGVS insertions, deletions, inversions, deletion-insertions,
duplications.
\end{itemize}
\end{pframe}
% Make the acknowledgements slide.
\makeAcknowledgementsSlide{
\begin{tabular}{l}
Martijn Vermaat\\
Sander Bollen\\
Jonathan Vis\\
Guy Allard\\
Johan den Dunnen
\end{tabular}
}
......
../../submodules/presentation-pics/pics/diagnostics.jpg
\ No newline at end of file
../../submodules/presentation-pics/pics/frequency.jpg
\ No newline at end of file
Subproject commit d2389a47ec4397a925918434f7e1466b4f91cf4e
Subproject commit f6f9fc191b7eab0c1bae13c77f9614f18648af62
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment