Commit b0d0334a authored by Laros's avatar Laros
Browse files

Some rough ideas for the data sharing lecture.

parent 2f384197
......@@ -13,3 +13,6 @@
[submodule "submodules/kPAL"]
path = submodules/kPAL
url = https://github.com/LUMC/kPAL.git
[submodule "submodules/vkgl-datashare"]
path = submodules/vkgl-datashare
url = https://git.lumc.nl/j.f.j.laros/vkgl-datashare.git
../../submodules/presentation/Makefile
\ No newline at end of file
../../submodules/presentation/beamerthemelumc.sty
\ No newline at end of file
../../submodules/vkgl-datashare/presentations/2016-09-12/client.dia
\ No newline at end of file
% Slide deck "Data sharing", built on the beamer class with the lumc theme.
\documentclass[slidestop]{beamer}
\author{Jeroen F.J. Laros}
\title{Data sharing}
% Talk metadata. \providecommand only defines a macro that is not defined yet,
% so values set before this point win.
% NOTE(review): presumably these \my* macros are read by the lumc theme for
% the title slide -- confirm in beamerthemelumc.sty.
\providecommand{\mySubTitle}{Conceptual and nomenclature issues}
\providecommand{\myConference}{\"OGH symposium}
\providecommand{\myDate}{29-09-2016}
\providecommand{\myGroup}{Leiden Genome Technology Center}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}
\usetheme{lumc}
\begin{document}
% This disables the \pause command, handy in the editing phase.
% Uncomment the next line to turn \pause into a no-op.
%\renewcommand{\pause}{}
% Make the title slide.
% The argument is the image `sharing', included at a width of 3.5cm.
\makeTitleSlide{\includegraphics[width=3.5cm]{sharing}}
% First page of the presentation.
\section{Introduction}
% NOTE(review): \makeTableOfContents is not defined in this file; presumably
% it comes from the lumc theme.
\makeTableOfContents
% Slide: the `gq' image, scaled to 70% of the text height.
\begin{pframe}
  \begin{figure}[]
    % \centering instead of a center environment: no extra vertical space
    % inside the float.
    \centering
    \includegraphics[height=0.7\textheight]{gq}
    % TODO: fill in the caption (currently empty).
    \caption{}
    % A non-empty, unique key; the empty \label{} used before was repeated on
    % another slide, which makes the label multiply defined.
    \label{fig:gq}
  \end{figure}
\end{pframe}
% Slide: the `samples' image, scaled to 70% of the text height.
\begin{pframe}
  \begin{figure}[]
    % \centering instead of a center environment: no extra vertical space
    % inside the float.
    \centering
    \includegraphics[height=0.7\textheight]{samples}
    % TODO: fill in the caption (currently empty).
    \caption{}
    % A non-empty, unique key; the empty \label{} used before was repeated on
    % another slide, which makes the label multiply defined.
    \label{fig:samples}
  \end{figure}
\end{pframe}
\subsection{Types of variants}
% TODO: placeholder frame. Its body holds only the commented-out outline
% below, so nothing is typeset inside it yet.
\begin{pframe}
% Data sharing at the LUMC:
% - Variant interpretations.
% - Variant frequencies.
\end{pframe}
% Issues with sharing frequencies:
% - See VKGL discussion.
\subsection{Requirements}
% Slide: required properties of the solution, with example label queries.
\begin{pframe}
  We aim for a solution that has the following properties:
  \begin{itemize}
    \item Not only frequencies, but also \emph{supporting evidence}
      (numerator, denominator).
    \item Support for \emph{labels}.
    \item Support for \emph{timestamps}.
  \end{itemize}
  \bigskip
  % Example queries that rely on labels.
  Labels allow for queries like:
  \begin{itemize}
    \item How many times have we seen this variant in non-diabetes patients?
    \item How many times was this variant observed in Nijmegen?
  \end{itemize}
  \bigskip
  Timestamps are essential for \emph{reproducibility}.
\end{pframe}
\subsection{Options}
% Slide: the two design axes, database layout and infrastructure.
\begin{pframe}
  % Axis 1: how the database is organised.
  Choices for the database layout:
  \begin{itemize}
    \item \emph{Sample} oriented.
    \item \emph{Variant} oriented.
  \end{itemize}
  \bigskip
  % Axis 2: where the servers live.
  Choices for the infrastructure:
  \begin{itemize}
    \item Centralised.
    \item Decentralised.
  \end{itemize}
\end{pframe}
\subsection{Basic client set up}
% Slide: two columns, the `client' diagram next to what is uploaded.
\begin{pframe}
  % Left column: diagram with caption.
  \begin{minipage}[t]{0.47\textwidth}
    \begin{figure}[]
      \begin{center}
        \includegraphics[width=\textwidth]{client}
      \end{center}
      \caption{Upload abstraction.}
    \end{figure}
  \end{minipage}
  \hfill
  % Right column: data taken from the gVCF, plus metadata.
  \begin{minipage}[t]{0.47\textwidth}
    From a \emph{gVCF}:
    \begin{itemize}
      \item Variants.
      \item Regions with sufficient ``coverage''.
        \begin{itemize}
          \item We use \emph{genotype quality}.
        \end{itemize}
    \end{itemize}
    \bigskip
    Metadata:
    \begin{itemize}
      \item Labels.
    \end{itemize}
  \end{minipage}
\end{pframe}
%\subsection{Concerns}
%\begin{pframe}
% Sample identification:
% \begin{itemize}
% \item The Netherlands: $17 \times 10^6$ inhabitants, $24$ SNPs.
% \item Global: $108 \times 10^9$ inhabitants, $37$ SNPs.
% \end{itemize}
%\end{pframe}
\section{Sample oriented server}
\subsection{Database layout}
% Slide: two columns, schema figure (image still commented out) and notes.
\begin{pframe}
  % Left column: figure; only the caption is typeset while the image is
  % disabled.
  \begin{minipage}[t]{0.47\textwidth}%
    \begin{figure}[]
      \begin{center}
        %\includegraphics[width=\textwidth]{sample_db}
      \end{center}
      \caption{Basic structure.}
    \end{figure}
  \end{minipage}
  \hfill
  % Right column: where the data goes.
  \begin{minipage}[t]{0.47\textwidth}
    Variants grouped by sample.
    \begin{itemize}
      \item Metadata goes into the \emph{Sample} table.
    \end{itemize}
    \bigskip
    Similar layout for ``coverage'' data.
  \end{minipage}
\end{pframe}
\subsection{Features and concerns}
% Slide: requirement coverage of the sample oriented layout and its downside.
\begin{pframe}
  All requirements are met:
  \begin{itemize}
    \item Frequencies and supporting evidence.
    \item Labels.
    \item Timestamps.
  \end{itemize}
  \bigskip
  % Downside.
  We store more information than strictly necessary:
  \begin{itemize}
    \item Grouping by sample.
  \end{itemize}
\end{pframe}
\subsection{Security requirements}
% Slide: security measures for the sample oriented server.
\begin{pframe}
  General security measures:
  \begin{itemize}
    \item Access restriction by authentication.
    \item Encrypted communication.
    \item Optional:
      \begin{itemize}
        \item Dedicated (VPN) network.
      \end{itemize}
  \end{itemize}
  \bigskip
  % Restrictions on what the query interface exposes.
  Interface restrictions:
  \begin{itemize}
    \item Only variant oriented queries are allowed.
  \end{itemize}
\end{pframe}
\subsection{Additional notes}
% Slide: database content items listed for the sample oriented server.
\begin{pframe}
  Database content:
  \begin{itemize}
    \item Duplicate detection.
    \item Removal of samples.
    \item Quality control at submission time.
  \end{itemize}
\end{pframe}
\section{Variant oriented server}
\subsection{Database layout}
% Slide: two columns, schema figure (image still commented out) and notes.
\begin{pframe}
  % Left column: figure; only the caption is typeset while the image is
  % disabled.
  \begin{minipage}[t]{0.47\textwidth}%
    \begin{figure}[]
      \begin{center}
        %\includegraphics[width=\textwidth]{variant_db}
      \end{center}
      \caption{Basic structure.}
    \end{figure}
  \end{minipage}
  \hfill
  % Right column: where the data goes.
  \begin{minipage}[t]{0.47\textwidth}
    Variants not grouped.
    \begin{itemize}
      \item Metadata goes into the \emph{Observation} table.
    \end{itemize}
    \bigskip
    Similar layout for ``coverage'' data.
  \end{minipage}
\end{pframe}
\subsection{Features and concerns}
% Slide: which requirements the variant oriented layout meets and misses.
\begin{pframe}
  Requirements that are met:
  \begin{itemize}
    \item Frequencies and supporting evidence.
    \item Labels.
  \end{itemize}
  \bigskip
  % The one requirement that is not met.
  But not:
  \begin{itemize}
    \item Timestamps.
  \end{itemize}
  \bigskip
  Timestamps are a unique label for a group of variants.
\end{pframe}
\subsection{Security requirements}
% Slide: security implications of the variant oriented layout.
\begin{pframe}
  No general security measures or restrictions on the interface are needed.

  % The blank line above ends the paragraph. Without it, \bigskip is issued
  % in horizontal mode and "Pitfall:" continues the same paragraph instead of
  % starting a new one after the skip.
  \bigskip
  % The key term is emphasised with \emph, as on the other slides.
  Pitfall: A \emph{unique combination of labels} may still be associated with
  one sample.
  \begin{itemize}
    \item New gene panels.
    \item Very rare diseases.
  \end{itemize}
\end{pframe}
\subsection{Additional notes}
% Slide: pros and cons of the variant oriented server.
\begin{pframe}
  Pros:
  \begin{itemize}
    \item Database can be completely open.
  \end{itemize}
  \bigskip
  Cons:
  \begin{itemize}
    \item No duplicate detection.
    \item No sample removal.
      % Mitigation for the item above.
      \begin{itemize}
        \item Rolling back a submission is possible.
      \end{itemize}
    \item Limited data integrity checks.
    \item Requires more resources.
    \item Works on paper, but needs to be developed.
  \end{itemize}
\end{pframe}
\section{Pooling}
\subsection{Concept}
% Slide: pooling, i.e. sharing aggregated data only; cost and a worked use case.
\begin{pframe}
  Instead of submitting variants or samples, only \emph{aggregated data} is
  shared.

  % The blank line above ends the paragraph. Without it, \bigskip is issued
  % in horizontal mode and "Requirements:" runs on in the same paragraph.
  \bigskip
  Requirements:
  \begin{itemize}
    \item Additional local infrastructure.
  \end{itemize}
  \bigskip
  Concerns:
  \begin{itemize}
    \item Potentially long waiting times.
  \end{itemize}
  \bigskip
  Use case ($1500$ samples, $50$ gene panels):
  \begin{itemize}
    \item Pool size $100$, average waiting time of $3$ years.
    \item Pool size $10$, waiting time of $4$ years for some of our IDPs.
  \end{itemize}
\end{pframe}
\section{Decentralised servers}
\subsection{Concept}
% Slide: one server per institute behind a central query interface.
\begin{pframe}
  Each institute gets its own server; a centralised \emph{query interface}
  queries these databases.

  % The blank line above ends the paragraph. Without it, \bigskip is issued
  % in horizontal mode and "Requirements:" runs on in the same paragraph.
  \bigskip
  Requirements:
  \begin{itemize}
    \item Development of a centralised query interface.
    \item Installation and maintenance of several servers.
  \end{itemize}
  \bigskip
  Concerns:
  \begin{itemize}
    \item Does not solve any of the security concerns.
  \end{itemize}
\end{pframe}
\section{Conclusions}
\subsection{Summary}
% Slide: comparison table of the four solutions, followed by the take-away.
\begin{pframe}
  \begin{table}[]
    % \centering instead of a center environment: no extra vertical space
    % inside the float.
    \centering
    % Columns: solution type, requirements met, development needed,
    % maintenance effort.
    \begin{tabular}{lccc}
      type             & requirements & development & maintenance\\
      \hline
      sample           & yes          & no          & low\\
      sample + pooling & yes          & yes         & high\\
      variant          & no           & yes         & low\\
      % Named "decentralised" to match the section title and the sentence
      % below the table.
      decentralised    & yes          & yes         & high\\
    \end{tabular}
    \caption{Solutions and consequences.}
  \end{table}
  Decentralised solutions will require a substantial amount of additional
  effort:
  \begin{itemize}
    \item Development.
    \item Maintenance.
  \end{itemize}
\end{pframe}
% Issues with sharing interpretations:
% - ...
% Nomenclature issues:
% -
% Make the acknowledgements slide.
% TODO: the single-column table passed to the slide has no rows yet.
\makeAcknowledgementsSlide{
  \begin{tabular}{l}
  \end{tabular}
}
\end{document}
# Plot the values in gq_1.dat as solid boxes and overlay a horizontal line at
# GQ = 30 on the same axes.
# NOTE(review): no terminal or output file is set here; presumably the caller
# (e.g. the Makefile) supplies them -- confirm.

# Settings for the first layer: filled boxes.
set style data boxes
set style fill solid

# Axis ranges and labels; both plot commands below use them.
set xrange[0:2000]
set yrange[0:60]
set xlabel "position"
set ylabel "GQ"

# Multiplot mode lets the second plot command draw over the first one.
set multiplot
# `unset key' replaces the deprecated `set nokey' spelling.
unset key

# Layer 1: the data file.
plot "gq_1.dat" notitle

# Layer 2: the constant 30, drawn as a thick line.
set style data lines
plot 30 lc 0 lw 8

# Leave multiplot mode explicitly so the page is completed.
unset multiplot
#!/usr/bin/env python
"""Print a 2000-step random walk that never drops below zero.

One value is written to standard output per step.
"""
import random

y = 0
for i in range(2000):
    # Move by -1, 0 or +1 (randint is inclusive at both ends) and clamp at 0.
    y = max(0, y + random.randint(-1, 1))
    # The function-call form of print runs under both Python 2 and Python 3.
    print(y)
1
2
1
0
0
0
0
0
1
2
3
4
5
6
6
5
6
6
5
5
6
5
5
6
7
7
8
7
6
7
6
6
7
6
7
8
7
6
7
8
9
9
9
9
8
8
7
8
9
10
10
11
11
12
12
13
12
12
13
12
12
13
12
13
13
12
12
12
12
12
12
11
12
13
14
14
14
15
15
16
17
18
19
19
20
19
20
21
21
22
22
21
21
20
19
18
17
17
16
15
16
15
16
15
15
15
16
16
15
14
14
15
15
16
15
15
15
14
14
14
15
14
14
15
15
16
15
16
16
16
16
17
17
17
16
17
18
18
18
17
18
17
17
16
17
17
17
17
16
17
18
18
18
18
18
19
20
19
18
18
19
19
20
21
20
21
20
20
21
20
21
22
22
23
24
25
25
25
25
24
25
26
27
27
28
29
28
29
30
31
30
30
30
31
32
31
31
32
32
33
34
33
33
33
32
31
30
31
30
31
30
30
31
30
30
31
32
33
32
32
31
31
31
32
33
34
33
34
33
33
33
33
33
34
33
32
33
34
35
34
33
34
35
34
33