% liacs_ai.tex
\documentclass[slidestop]{beamer}

\author{Jeroen F.J. Laros}
\title{Artificial Intelligence in Genetics}
\providecommand{\mySubTitle}{Requirements and examples}
\providecommand{\myConference}{ML/AI meeting LUMC - LIACS}
\providecommand{\myDate}{27-06-2018}
\providecommand{\myGroup}{Research Software Engineering}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}

\usetheme{lumc}
\usepackage{fancyvrb}

\begin{document}

% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}

% Make the title slide.
\makeTitleSlide{\includegraphics[height=2.7cm]{sanger_trace}}

% First page of the presentation.
\section{Introduction}
\makeTableOfContents


\section{Sequencing}
\subsection{From DNA to sequences}
\begin{pframe}
  \begin{minipage}[t]{0.47\textwidth}\begin{figure}[]
    \begin{center}
      \includegraphics[width=\textwidth]{NovaSeq6000}
    \end{center}
    \caption{Illumina NovaSeq 6000.}
  \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}[t]{0.47\textwidth}
    \begin{Verbatim}[fontsize=\tiny]
      @K0187:120:GFLBXX:1:1101:2950:1191 1:N:0:CTTGAG
      TTAGGAAATAATAAATTGTAGTTTTTTTTATGATTTGGTTGAATTGATT
      +
      AAAFFJJJJJJ-AJFFFJ-FF<F<JJJAFJFFFJAFJJ<-FFFA<FFAA
      @K0187:120:GFLBXX:1:1101:7710:1209 1:N:0:CTTGAG
      GTCCACTAGAACTTGTAGAGCTGGAACCAACTGATTTGCAAAAGCAAAA
      +
      AA-FFFJAAAF-FFAJJJFJJJ--AFFJJFFJJFAFA-FJJJJFJJJJF
      @K0187:120:GFLBXX:1:1101:17269:1209 1:N:0:CTTGAG
      GTACTTGATGAATATTTACTAAAGAATGGAAGGAAAAAAGAAAGAAGGA
      +
      AAFFFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
      @K0187:120:GFLBXX:1:1101:28067:1209 1:N:0:CTTGAG
      TGGGTGTGTTAAGAATGACATTCAACGCAGTCTCTTTACCACCTCCCCA
      +
      AAA-FJFJFJFAJJJJFFJFAFJJJJFFJFAJJJJFFJJJAAJ7FFJJA
      @K0187:120:GFLBXX:1:1101:1367:1226 1:N:0:CTTGAG
      NTAGGTGTAAAAACATGATCATCGAAGAGACAAGAAATAGAACGTATTT
      +
      #AAFFFJFJJJJJJJJJJJJJJJJFJFJFAFJJJJJJJJJJJJJJJAFF
      @K0187:120:GFLBXX:1:1101:8410:1226 1:N:0:CTTGAG
      AGTGAGACCCTCTCTCTAAAAAGAAAAAAGAAAAAAAATTATGATGTTT
      +
      AAFAFFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
    \end{Verbatim}
  \end{minipage}
\end{pframe}


\section{Background}
\subsection{Current status and requirements}
\begin{pframe}
  We work a lot with \emph{sequences} (over an alphabet of four letters).
  \begin{itemize}
    \item Alignment / variant calling.
    \item \textit{De novo} assembly / gene prediction.
    \item $k$-mer profiling.
  \end{itemize}
  \bigskip

  For most of these problems, there are efficient deterministic algorithms
  available, with the exception of \textit{de novo} assembly.
  \bigskip
  \pause

  We have a strong need to be able to \emph{explain} results.
  \begin{itemize}
    \item Techniques like decision trees.
    \item Optimisation of procedures, which are in the end very clear.
  \end{itemize}
\end{pframe}


\section{Pharmacogenetics}
\subsection{Personalised Medicine}
\begin{pframe}
  Predict drug response from patient genetics.
  \begin{itemize}
    \item The current gold standard: $r^2 \approx 0.55$.
  \end{itemize}
  \bigskip

  Over the years, we have collected quite a lot of data.
  \begin{itemize}
    \item High quality genotypes from sequencing.
    \item $550$ \textit{in vivo} metabolite measurements.
  \end{itemize}
  \bigskip
  \pause

  The approach, a neural network with:
  \begin{itemize}
    \item Binary vectors indicating presence/absence of genetic features (two
      sets of inputs due to ploidy) as input.
    \item Predicted metabolite measurement (scalar) as output.
  \end{itemize}

  \vfill
  \permfoot{Guy Allard}
\end{pframe}

\begin{pframe}
  \vspace{-0.5cm}
  \begin{figure}[]
    \begin{center}
      \includegraphics[height=0.8\textheight]{pm_network}
    \end{center}
    \caption{Model architecture.}
  \end{figure}
\end{pframe}

\begin{pframe}
  Techniques and libraries:
  \begin{itemize}
    \item Model constructed and trained using TensorFlow.
    \item Allele submodules share the same layers (same model).
  \end{itemize}

  \begin{figure}[]
    \begin{center}
      \includegraphics[width=\textwidth]{allele}
    \end{center}
    \caption{Visualising contribution of features.}
  \end{figure}
  \vspace{-0.5cm}

  Results:
  \begin{itemize}
    \item $10$-fold cross-validation, $r^2 \approx 0.7$.
    \item Allele submodule ($p_{a1}$) can be interrogated and the effects of
      genetic features explored.
  \end{itemize}

  \vfill
  \permfoot{Guy Allard}
\end{pframe}


%\section{Single cell sequencing}
%\subsection{???}
%\begin{pframe}
%  Slides here.
%\end{pframe}


\section{Clinical Genetics}
% Use case 2: Variant classification for the purposes of clinical diagnostics
% in the clinic is currently a tedious manual task. Clinicians have to go
% through lists of dozens of variants, and manually curate them. This task can
% take up to several hours per patient. It would greatly speed up diagnosis if
% the clinician could have an automated ranking, which would select the best
% candidate hit first. As variants are automatically annotated with dozens of
% annotation fields, we reasoned we could very well use some machine learning
% methods for this purpose. Our first attempt was to use a decision tree, or
% ensemble of decision trees, as this is a white-box model. A confounding
% factor here is that the vast majority of variants (millions) are
% unclassified, with just a few thousand classified variants. This is currently
% an ongoing project in cooperation with the ICT department.
\subsection{Variant classification}
\begin{pframe}
  From the tens of thousands of genetic variants, we are only interested in the
  one that causes a disease.
  \begin{itemize}
    \item Filtering strategies narrow it down to a few dozen variants.
    \item Manual curation takes a few hours per patient.
  \end{itemize}
  \bigskip

  Can this be done automatically?
  \bigskip
  \pause

  Challenges:
  \begin{itemize}
    \item Make the selection explainable (decision trees).
    \item Large number of unclassified variants.
  \end{itemize}

  \vfill
  \permfoot{Sander Bollen}
\end{pframe}

% Use case 1: Classifying reads to the lab protocol they were sequenced with.
% Within the department of clinical genetics, several lab protocols are used
% for the generation of Next-Generation Sequencing (NGS) data. Two different
% capture kits are used (WES and IDP), as well as two different sequencers
% (HiSeq 4000 and NextSeq 500). Downstream analysis can be slightly different
% for each protocol used, which requires run metadata to be correct.
% Unfortunately, this is not always the case, and a way of validating the
% input metadata is required. We have used machine learning to accomplish this
% task. Briefly, k-mers were counted for each NGS read, after which an SVM was
% trained on the dataset of several hundred thousand reads. NextSeq reads
% separate well from all other classes, but both HiSeq 4000 classes were more
% difficult to separate. This method may also be useful for other purposes
% where separating reads into classes is required, e.g. separating virus from
% host reads.
\subsection{Detecting lab protocol from raw data}
\begin{pframe}
  Differences in preprocessing (lab) steps:
  \begin{itemize}
    \item Capture kit (selection of part of the genome).
    \item Different sequencers.
  \end{itemize}
  \bigskip

  Validation of this metadata is needed because downstream analysis depends on
  it.
  \bigskip
  \pause

  Approach:
  \begin{itemize}
    \item $k$-mer counts of every read.
    \item Training on over $100,\!000$ examples:
    \begin{itemize}
      \item SVM.
      \item Random forest.
    \end{itemize}
  \end{itemize}

  \vfill
  \permfoot{Sander Bollen}
\end{pframe}

\begin{pframe}
  Distinguishing the sequencing platform works very well.

  \begin{figure}[]
    \begin{center}
      \includegraphics[width=0.48\textwidth]{svm_error}
      \hfill
      \includegraphics[width=0.48\textwidth]{random_forest_error}
    \end{center}
    \caption{Overall error (left: SVM, right: Random forest).}
  \end{figure}

  Potential applications in contamination and \emph{infection} detection.

  \vfill
  \permfoot{Sander Bollen}
\end{pframe}


% Make the acknowledgements slide.
\makeAcknowledgementsSlide{
  \begin{tabular}{l}
    Guy Allard\\
    %Peter van 't Hof\\
    Sander Bollen\\
    Leon Mei\\
  \end{tabular}
}

\end{document}