Commit 713c0e63 authored by Laros's avatar Laros
Browse files

Added skeleton for extended k-mer lecture.

parent dce618bb
......@@ -10,3 +10,6 @@
[submodule "submodules/eyr4"]
path = submodules/eyr4
url = https://git.lumc.nl/j.f.j.laros/eyr4.git
[submodule "submodules/kPAL"]
path = submodules/kPAL
url = https://github.com/LUMC/kPAL.git
../../submodules/presentation-pics/pics/Cell.svg
\ No newline at end of file
../../submodules/presentation-pics/pics/DNA_chemical_structure.svg
\ No newline at end of file
../../submodules/presentation-pics/pics/DNA_orbit_static.png
\ No newline at end of file
../../submodules/presentation-pics/pics/base-pacbio.ppm
\ No newline at end of file
../../submodules/presentation-pics/pics/bridge_amplification.jpg
\ No newline at end of file
../../submodules/presentation-pics/pics/flowcell_2000.ppm
\ No newline at end of file
../../submodules/presentation-pics/pics/hiseq_2000.jpg
\ No newline at end of file
../../submodules/kPAL/presentations/theory_implementation/k-mer.dat
\ No newline at end of file
../../submodules/kPAL/presentations/theory_implementation/k-mer.gnp
\ No newline at end of file
../../submodules/kPAL/presentations/theory_implementation/k-mer2.dat
\ No newline at end of file
../../submodules/kPAL/presentations/theory_implementation/k-mer2.gnp
\ No newline at end of file
../../submodules/presentation-pics/pics/k_align.png
\ No newline at end of file
../../submodules/presentation-pics/pics/k_basecall.png
\ No newline at end of file
../../submodules/presentation-pics/pics/k_flowcell.png
\ No newline at end of file
../../submodules/presentation-pics/pics/k_image.jpg
\ No newline at end of file
\documentclass[slidestop]{beamer}
\author{Jeroen F.J. Laros}
\title{\LaTeX\ Presentation Template}
\providecommand{\mySubTitle}{Subtitle}
\providecommand{\myConference}{Work discussion}
\providecommand{\myDate}{12-nov-2015}
\providecommand{\myGroup}{Leiden Genome Technology Center}
\title{Applications of $k$-mer profiling in genomics}
\providecommand{\mySubTitle}{}
\providecommand{\myConference}{WIC Midwintermeeting, TU/e, Eindhoven}
\providecommand{\myDate}{1-feb-2016}
\providecommand{\myGroup}{}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}
......@@ -17,79 +17,669 @@
%\renewcommand{\pause}{}
% Make the title slide.
\makeTitleSlide{\includegraphics[height=3.5cm]{logos/lumc_logo_small}}
\makeTitleSlide{\includegraphics[height=3.5cm]{kmer}}
% First page of the presentation.
\section{Short introduction}
\section{Introduction}
\makeTableOfContents
\subsection{Titles and subtitles}
\begin{pframe}
Notice the title and subtitle:
\begin{itemize}
\item The \emph{section} command controls the title.
\item The \emph{subsection} command controls the frametitle.
\end{itemize}
\bigskip
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth]{Cell}
\end{center}
\caption{Schematic overview of the cell.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
DNA can be found mainly in the \emph{nucleus}.
\begin{itemize}
\item The autosomes.
\item Chromosome X and Y.
\end{itemize}
\bigskip
But also the \emph{mitochondria} contain DNA.
\bigskip
Testing \hl{highlight}\ colour.
RNA can be found everywhere in the cell.
\end{minipage}
\end{pframe}
\subsection{DNA}
\begin{pframe}
The titles are retained until overwritten.
\bigskip
\pause
Notice the footer is present on all overlays.
\vfill
\permfoot{\url{https://git.lumc.nl/j.f.j.laros/presentation/tree/master}}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{DNA_orbit_static}
\hspace{1cm}
\includegraphics[height=0.7\textheight]{DNA_chemical_structure}
\end{center}
\caption{Chemical structure of DNA.}
\end{figure}
\end{pframe}
\section{New section title which is rather long}
\subsection{Code}
\section{Sequencing}
\subsection{DNA synthesis}
\begin{pframe}
A \emph{pframe} does not need to be declared fragile.
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth]{sequencing_principle}
\end{center}
\caption{Sequencing by synthesis.}
\end{figure}
\begin{lstlisting}[caption={Example input}]
print "Hello"
\end{lstlisting}
Other methods include \emph{ligation} and \emph{direct measurement}.
\end{pframe}
\subsection{Two columns}
\subsection{Illumina platforms}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}
\includegraphics[width=\textwidth]{hiseq_2000}
\caption{HiSeq 2500.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
Characteristics:
\begin{itemize}
\item High throughput ($3$~genomes).
\item Paired end.
\item High accuracy.
\item Read length $2 \times 125$bp.
\item Relatively long run time ($6$~days).
\item Relatively expensive.
\end{itemize}
\end{minipage}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{flowcell_2000}
\hfill
\includegraphics[height=0.7\textheight]{k_flowcell}
\end{center}
\caption{Flowcell.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth]{logos/lumc_logo_small}
\includegraphics[trim=11.3cm 1.1cm 0cm 11.7cm, clip,
height=0.7\textheight]{bridge_amplification}
\end{center}
\caption{Some picture.}
\caption{Clusters.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
Sometimes it is convenient to have two columns.
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth]{k_image}
\end{center}
\caption{A tile (part of a lane).}
\end{figure}
\end{minipage}
\end{pframe}
\subsection{Base calling}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{k_basecall}
\end{center}
\caption{Base calling.}
\end{figure}
\end{pframe}
\subsection{Pacific Biosciences}
\begin{pframe}
\begin{figure}
\includegraphics[height=0.7\textheight]{pacbio}
\caption{PacBio RS.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.43\textwidth]{pacbio_cell}
\hfill
\includegraphics[height=0.43\textwidth, trim=1cm 9cm 1cm 7cm, clip]{pacbio_cell_hand}
\end{center}
\caption{SMRT cell.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\hspace{0.3cm}
\includegraphics[angle=90, height=0.7\textheight]{smrtbell}
\hfill
\includegraphics[width=0.7\textwidth]{zmw}
\hspace{0.3cm}
\end{center}
\caption{Real time polymerase monitoring.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth]{base-pacbio}
\end{center}
\caption{Base calling on the PacBio.}
\end{figure}
\end{pframe}
\section{Data analysis}
\subsection{Next generation sequencing data}
\begin{pframe}
\begin{lstlisting}[language=none, caption={A FastQ file.}]
@SGGPP:4:101
TTCGGGGGCTGGCAAATCCACTTCCGTGACACGCTACCATTCGCTGGTG
+
-'+4589,53330-0&07+03:54/2362-+.488587>@/25440++0
@SGGPP:4:102
CGGTAAACCACCCTGCTGACGGAACCCTAATGCGCCTGAAAGACAGCGT
+
34/--0'+.000(.55:;:99(0(+2(22(0316;185;;0;:<<>=AA
@SGGPP:4:106
TCGTTAACGACTTTGTTCGCCACCGCAACCGCCTGTTTCGGGTCACAGG
+
09875;5?<;?@A4?B:BBB<AA>CCC>C>BB0.->=0488+3444:@5
@SGGPP:4:112
TTGATGAATATATTATTTCAGGGAATAATTATGACACCTTTAGAACGCA
+
70<<@::5:<;==7;>>/79<:.:494.8(,,8:753/5@5??C>B???
\end{lstlisting}
\end{pframe}
\section{Alignment}
\subsection{The best match to the reference genome}
\begin{pframe}
\begin{minipage}[t]{0.7\textheight}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{k_align}
\end{center}
\caption{Alignment visualisation.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.45\textwidth}
Very efficient.
\begin{itemize}
\item Pictures can be too high.
\item You may have multiple small pictures.
\item \ldots
\item The reference genome needs to be \emph{indexed}.
\item Finding an alignment is as easy as looking up a word in a
dictionary.
\end{itemize}
\end{minipage}
\end{pframe}
\section{Variant calling}
\subsection{Consistent deviations from the reference}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[width=0.9\textwidth]{varcall}
\end{center}
\caption{Result of an alignment.}
\end{figure}
\end{pframe}
\subsection{Raw sequencing data}
\begin{pframe}
When do we work with \emph{raw data}:
\begin{itemize}
\item Unknown reference.
\item No time for analysis.
\end{itemize}
\bigskip
\pause
If the reference sequence is unknown, we can still do:
\begin{itemize}
\item Quality control.
\item Coverage estimation.
\end{itemize}
\bigskip
\pause
Compare raw datasets:
\begin{itemize}
\item Quality control.
\item Phylogeny.
\item Metagenomics.
\end{itemize}
\end{pframe}
\subsection{Counting $k$-mers}
\begin{pframe}
We choose a $k$ and count all occurrences of substrings of length $k$.
\bigskip
\pause
The counts of these substrings serve as a fingerprint of the dataset.
\bigskip
\pause
But, these counts contain a lot more information.
\end{pframe}
\subsection{A $2$-mer profile} \begin{pframe}
\begin{figure}[]
\begin{center}
\fbox{
\newline
\phantom{X}
\newline
% A>AC; C>T; T>AG; G>C
\only<1>{\bt{
\underline{AC}TAGACCACTTACTAGAGACTAGACCACCACTAGACCA\ldots}}\newline
\only<2>{\bt{
A\underline{CT}AGACCACTTACTAGAGACTAGACCACCACTAGACCA\ldots}}\newline
\only<3>{\bt{
AC\underline{TA}GACCACTTACTAGAGACTAGACCACCACTAGACCA\ldots}}\newline
\only<4>{\bt{
ACT\underline{AG}ACCACTTACTAGAGACTAGACCACCACTAGACCA\ldots}}\newline
\only<5>{\bt{
ACTA\underline{GA}CCACTTACTAGAGACTAGACCACCACTAGACCA\ldots}}\newline
\only<6>{\bt{
ACTAG\underline{AC}CACTTACTAGAGACTAGACCACCACTAGACCA\ldots}}\newline
\phantom{X}
\newline
}
\end{center}
\caption{Indexing.}
\end{figure}
We use a sliding window to find all $k$-mers ($k = 2$).
\begin{table}[]
\begin{center}
\begin{tabular}{ll|ll|ll|ll}
AA & $0$ & CA & $0$ & GA &
\only<-4>{$0$}\only<5->{\hl{$1$}} & TA &
\only<-2>{$0$}\only<3->{\hl{$1$}}\\
AC & \only<-5>{\hl{$1$}}\only<6->{\hl{$2$}} & CC &
$0$ & GC & $0$ & TC & $0$\\
AG & \only<-3>{$0$}\only<4->{\hl{$1$}} & CG &
$0$ & GG & $0$ & TG & $0$\\
AT & $0$ & CT &
\only<1>{$0$}\only<2->{\hl{$1$}} & GT & $0$ &
TT & $0$\\
\end{tabular}
\end{center}
\caption{$2$-mer profile.}
\end{table}
Observations are stored in a table.
\end{pframe}
\subsection{Counting $k$-mers}
\begin{pframe}
We choose a $k$ and count all occurrences of substrings of length $k$.
\pause
\begin{figure}
\colorbox{white}{
\includegraphics[width = \textwidth]{k-mer}
}
\vspace{-0.5cm}
\caption{A profile of $3$-mer counts.} \label{fig:kmerprofile}
\end{figure}
In Figure~\ref{fig:kmerprofile} we see a part of $3$-mer counts; \bt{AAA}
occurs $10$ times, \bt{AAC} occurs 20 times, etc.
\end{pframe}
\subsection{Choosing $k$}
\begin{pframe}
$k$ can not be too small:
\begin{itemize}
\item $k = 1$ will result in loss of all subsequence information.
\item $k = 2$ will give you information about di-nucleotides.
\item There is only one unique $10$-mer in Human (hg18).
\end{itemize}
\bigskip
\pause
But, $k$ can not be too large either:
\begin{itemize}
\item Almost all $18$-mers are unique in the Human genome.
\item Since this is not error tolerant:
\begin{itemize}
\item Read errors.
\item Assembly errors.
\end{itemize}
\end{itemize}
\bigskip
\pause
We have a solution for this (explained later in this presentation).
\end{pframe}
\section{$k$-mer profiles}
\subsection{Indexing}
\begin{pframe}
\begin{table}[]
\begin{center}
\begin{tabular}{c|c|c}
nucleotide & binary & decimal\\
\hline
\bt{A} & \bt{00} & $0$\\
\bt{C} & \bt{01} & $1$\\
\bt{G} & \bt{10} & $2$\\
\bt{T} & \bt{11} & $3$\\
\end{tabular}
\end{center}
\label{}
\caption{Nucleotide encoding table.}
\end{table}
We use an additional trick to store profiles efficiently.
\bigskip
\pause
First notice that we can encode a nucleotide in two bits.
\end{pframe}
\begin{pframe}
\begin{table}[]
\begin{center}
\begin{tabular}{c|c|r}
substring & binary & decimal\\
\hline
\bt{AAAA} & \bt{00 00 00 00} & $0$\\
\bt{AAAC} & \bt{00 00 00 01} & $1$\\
\vdots & \vdots & \vdots\\
\bt{GATC} & \bt{10 00 11 01} & $141$\\
\vdots & \vdots & \vdots\\
\bt{TTTT} & \bt{11 11 11 11} & $255$\\
\end{tabular}
\end{center}
\label{}
\caption{Encoding strings.}
\end{table}
We can concatenate the \emph{binary encoding}.
\bigskip
\pause
There is no need to store the substrings.
\end{pframe}
\subsection{Comparing $k$-mer profiles}
\begin{pframe}
\vspace{-0.5cm}
\begin{figure}
\colorbox{white}{
\includegraphics[width = \textwidth]{k-mer}
}
\smallskip
\colorbox{white}{
\includegraphics[width = \textwidth]{k-mer2}
}
\vspace{-0.5cm}
\caption{Two profiles of $k$-mer counts.}
\end{figure}
\pause
How to express this difference with one value.
\end{pframe}
\subsection{Pairwise distance function}
\begin{pframe}
We use the following function:
\begin{displaymath}
f(x, y) = \frac{|x - y|}{(x + 1) (y + 1)}
\end{displaymath}
\pause
Properties:
\begin{itemize}
\item $f(0, 1) = \frac12$
\item $f(0, 1) > f(7, 8)$
\end{itemize}
\bigskip
\bigskip
\pause
This is desirable:
\begin{itemize}
\item The fact that a $k$-mer is not present is more important than the
number of times it is present.
\item Differences in the low end of the spectrum are more important than
ones at the high end.
\end{itemize}
\end{pframe}
\subsection{Multiset distance function}
\begin{pframe}
Let $f$ be a function $f : \mathbb{R}_{\ge 0} \times \mathbb{R}_{\ge 0}
\to \mathbb{R}_{\ge 0}$
with finite supremum $M$ and the following properties:
\begin{align*}
f(x, y) &= f(y, x) & &\mathrm{\ for\ all\ } & x, y &\in \mathbb{R}_{\ge 0}\\
f(x, x) &= 0 & &\mathrm{\ for\ all\ } & x &\in \mathbb{R}_{\ge 0}\\
f(x, 0) &\ge {M}/2 & &\mathrm{\ for\ all\ } & x &\in \mathbb{R}_{> 0}\\
f(x, y) &\le f(x, z) + f(z, y) & &\mathrm{\ for\ all\ } & x, y, z &\in \mathbb{R}_{\ge 0}
\end{align*}
\smallskip
\pause
For a multiset $X$, let $S(X)$ denote its underlying set. For multisets $X,
Y$ with $S(X),S(Y) \subseteq \{1, 2, \ldots, n\}$ we define
\smallskip
\begin{displaymath}
d_f(X, Y) = \frac{\sum_{i = 1}^n f(x_i, y_i)}{|S(X) \cup S(Y)| + 1}
\end{displaymath}
\bigskip
\end{pframe}
\subsection{Strand balance}
\begin{pframe}
When analysing a dataset:
\begin{itemize}
\item We either see a $k$-mer or its reverse complement ($50\%$ chance of
either).
\item If sequenced in sufficient depth, we expect a balance between forward
and reverse complement $k$-mers.
\end{itemize}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\fbox{
\begin{tabular}{l}
\bt{ACTAGACCACTTAC\only<2->{\color{LUMCRood}}TAGAGACTAGA\only<2->{\color{black}}CCACCACTAGACCA\ldots}\\
\bt{TGATCTGGTGAATG\only<3->{\color{LUMCRood}}ATCTCTGATCT\only<3->{\color{black}}GGTGGTGATCTGGT\ldots}\\
\end{tabular}
}
\end{center}
\caption{Double stranded DNA.}
\label{}
\end{figure}
\onslide<4->{So, the number of times we see \bt{TAGAGACTAGA} must be roughly
the same as for \bt{TCTAGTCTCTA}.}
\end{pframe}
\begin{pframe}
How to calculate it:
\begin{itemize}
\item We can split a profile into a forward and a reverse complement
profile.
\item We can calculate the balance between these sub-profiles.
\end{itemize}
\bigskip
\pause
This is an estimation of ``sufficient coverage'':
\begin{itemize}
\item If the balance is good, the coverage is good.
\begin{itemize}
\item Does not work for strand specific protocols.
\end{itemize}
\end{itemize}