Commit e00b3ac5 authored by Laros's avatar Laros
Browse files

Updated lecture.

parent 713c0e63
../../submodules/presentation-pics/pics/cost_per_genome.jpg
\ No newline at end of file
......@@ -23,7 +23,7 @@
\section{Introduction}
\makeTableOfContents
\subsection{Titles and subtitles}
\subsection{The cell}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}[]
......@@ -78,7 +78,7 @@
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}
\includegraphics[width=\textwidth]{hiseq_2000}
\includegraphics[width=\textwidth, trim=0 40 0 0, clip]{hiseq_2000}
\caption{HiSeq 2500.}
\end{figure}
\end{minipage}
......@@ -86,12 +86,12 @@
\begin{minipage}[t]{0.47\textwidth}
Characteristics:
\begin{itemize}
\item High throughput ($3$~genomes).
\item High throughput ($10$~genomes).
\item Paired end.
\item High accuracy.
\item Read length $2 \times 125$~bp.
\item Relatively long run time ($6$~days).
\item Relatively expensive.
\item Run time ($6$~days).
\item Output $0.5$~Terabyte.
\end{itemize}
\end{minipage}
\end{pframe}
......@@ -101,7 +101,7 @@
\begin{center}
\includegraphics[height=0.7\textheight]{flowcell_2000}
\hfill
\includegraphics[height=0.7\textheight]{k_flowcell}
\includegraphics[height=0.7\textheight,trim=2 2 2 2,clip]{k_flowcell}
\end{center}
\caption{Flowcell.}
\end{figure}
......@@ -119,6 +119,8 @@
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
This sequencer basically takes pictures.
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth]{k_image}
......@@ -128,11 +130,10 @@
\end{minipage}
\end{pframe}
\subsection{Base calling}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{k_basecall}
\includegraphics[height=0.7\textheight,trim=2 2 2 2,clip]{k_basecall}
\end{center}
\caption{Base calling.}
\end{figure}
......@@ -141,17 +142,33 @@
\subsection{Pacific Biosciences}
\begin{pframe}
\begin{figure}
\includegraphics[height=0.7\textheight]{pacbio}
\includegraphics[height=0.7\textheight,trim=0 10 0 30, clip]{pacbio}
\caption{PacBio RS.}
\end{figure}
\end{pframe}
\begin{pframe}
Characteristics:
\begin{itemize}
\item Low throughput (one genome would take dozens of runs).
\item Low accuracy for long reads, but high accuracy for short ones.
\item Read length up to $60$~kb.
\begin{itemize}
\item $50$\% is larger than $20$~kb.
\item $5$\% is larger than $30$~kb.
\end{itemize}
\item Run time $30$~minutes to $4$~hours.
\item Output $1$~Gigabyte.
\end{itemize}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.43\textwidth]{pacbio_cell}
\hfill
\includegraphics[height=0.43\textwidth, trim=1cm 9cm 1cm 7cm, clip]{pacbio_cell_hand}
\includegraphics[height=0.43\textwidth,trim=1cm 9cm 1cm 7cm,clip]
{pacbio_cell_hand}
\end{center}
\caption{SMRT cell.}
\end{figure}
......@@ -179,6 +196,16 @@
\end{figure}
\end{pframe}
\subsection{Data}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{cost_per_genome}
\end{center}
\caption{Reducing cost and increasing throughput.}
\end{figure}
\end{pframe}
\section{Data analysis}
\subsection{Next generation sequencing data}
\begin{pframe}
......@@ -202,13 +229,12 @@
\end{lstlisting}
\end{pframe}
\section{Alignment}
\subsection{The best match to the reference genome}
\subsection{Alignment: the best match to a reference genome}
\begin{pframe}
\begin{minipage}[t]{0.7\textheight}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{k_align}
\includegraphics[height=0.7\textheight,trim=2 2 2 2,clip]{k_align}
\end{center}
\caption{Alignment visualisation.}
\end{figure}
......@@ -224,8 +250,7 @@
\end{minipage}
\end{pframe}
\section{Variant calling}
\subsection{Consistent deviations from the reference}
\subsection{Variant calling: consistent deviations from a reference}
\begin{pframe}
\begin{figure}[]
\begin{center}
......@@ -235,32 +260,65 @@
\end{figure}
\end{pframe}
\subsection{Raw sequencing data}
\section{Metagenomics}
\subsection{Metagenomics, a different ballpark}
\begin{pframe}
When do we work with \emph{raw data}:
\emph{Metagenomics} is the study of genetic material recovered directly from
\emph{environmental samples}.
\begin{itemize}
\item Unknown reference.
\item No time for analysis.
\item Soil / skeletons.
\item Infected wounds.
\item Skin.
\end{itemize}
\bigskip
\pause
If the reference sequence is unknown, we can still do:
Applications in:
\begin{itemize}
\item Quality control.
\item Coverage estimation.
\item Food science.
\item Medical microbiology.
\item \emph{Forensics}.
\end{itemize}
\end{pframe}
\begin{pframe}
Sequencing is mostly the same, but data analysis is challenging.
\begin{itemize}
\item Potentially thousands of species in one sample.
\item Unknown relative abundances.
\item Computationally expensive.
\end{itemize}
\bigskip
\pause
Compare raw datasets:
There is a huge blind spot.
\begin{itemize}
\item Quality control.
\item Phylogeny.
\item Metagenomics.
\item Many species do not have a reference sequence.
\begin{itemize}
\item Some are impossible to culture.
\end{itemize}
\item For some datasets, $97$\% of the data is of unknown origin.
\end{itemize}
\end{pframe}
\begin{pframe}
For forensic applications, we want to compare these datasets.
\begin{itemize}
\item Identification of samples.
\item Following a sample through time.
\item Phylogenetic reconstruction (relatedness of samples).
\end{itemize}
\bigskip
We have to do without reference sequences in order to utilise the complete
dataset.
\bigskip
\pause
$k$-mer profiling.
\end{pframe}
\section{$k$-mer profiling}
\subsection{Counting $k$-mers}
\begin{pframe}
We choose a $k$ and count all occurrences of substrings of length $k$.
......@@ -371,7 +429,6 @@
We have a solution for this (explained later in this presentation).
\end{pframe}
\section{$k$-mer profiles}
\subsection{Indexing}
\begin{pframe}
\begin{table}[]
......@@ -410,15 +467,11 @@
\bt{TTTT} & \bt{11 11 11 11} & $255$\\
\end{tabular}
\end{center}
\label{}
\caption{Encoding strings.}
\end{table}
We can concatenate the \emph{binary encoding}.
\bigskip
\pause
There is no need to store the substrings.
We can concatenate the \emph{binary encoding}; there is no need to store the
substrings.
\end{pframe}
\subsection{Comparing $k$-mer profiles}
......@@ -441,6 +494,27 @@
How can we express this difference with a single value?
\end{pframe}
\subsection{Multiset distance function}
\begin{pframe}
Let $f$ be a function $f : \mathbb{R}_{\ge 0} \times \mathbb{R}_{\ge 0}
\to \mathbb{R}_{\ge 0}$
with finite supremum $M$ and the following properties:
\begin{align*}
f(x, y) &= f(y, x) & &\mathrm{\ for\ all\ } & x, y &\in \mathbb{R}_{\ge 0}\\
f(x, x) &= 0 & &\mathrm{\ for\ all\ } & x &\in \mathbb{R}_{\ge 0}\\
f(x, 0) &\ge {M}/2 & &\mathrm{\ for\ all\ } & x &\in \mathbb{R}_{> 0}\\
f(x, y) &\le f(x, z) + f(z, y) & &\mathrm{\ for\ all\ } & x, y, z &\in \mathbb{R}_{\ge 0}
\end{align*}
\pause
For a multiset $X$, let $S(X)$ denote its underlying set. For multisets $X,
Y$ with $S(X),S(Y) \subseteq \{1, 2, \ldots, n\}$ we define
\begin{displaymath}
d_f(X, Y) = \frac{\sum_{i = 1}^n f(x_i, y_i)}{|S(X) \cup S(Y)| + 1}
\end{displaymath}
\end{pframe}
\subsection{Pairwise distance function}
\begin{pframe}
We use the following function:
......@@ -454,8 +528,7 @@
\item $f(0, 1) = \frac12$
\item $f(0, 1) > f(7, 8)$
\end{itemize}
\bigskip
\bigskip
\smallskip
\pause
This is desirable:
......@@ -467,30 +540,6 @@
\end{itemize}
\end{pframe}
\subsection{Multiset distance function}
\begin{pframe}
Let $f$ be a function $f : \mathbb{R}_{\ge 0} \times \mathbb{R}_{\ge 0}
\to \mathbb{R}_{\ge 0}$
with finite supremum $M$ and the following properties:
\begin{align*}
f(x, y) &= f(y, x) & &\mathrm{\ for\ all\ } & x, y &\in \mathbb{R}_{\ge 0}\\
f(x, x) &= 0 & &\mathrm{\ for\ all\ } & x &\in \mathbb{R}_{\ge 0}\\
f(x, 0) &\ge {M}/2 & &\mathrm{\ for\ all\ } & x &\in \mathbb{R}_{> 0}\\
f(x, y) &\le f(x, z) + f(z, y) & &\mathrm{\ for\ all\ } & x, y, z &\in \mathbb{R}_{\ge 0}
\end{align*}
\smallskip
\pause
For a multiset $X$, let $S(X)$ denote its underlying set. For multisets $X,
Y$ with $S(X),S(Y) \subseteq \{1, 2, \ldots, n\}$ we define
\smallskip
\begin{displaymath}
d_f(X, Y) = \frac{\sum_{i = 1}^n f(x_i, y_i)}{|S(X) \cup S(Y)| + 1}
\end{displaymath}
\bigskip
\end{pframe}
\subsection{Strand balance}
\begin{pframe}
When analysing a dataset:
......@@ -579,7 +628,6 @@
}
\caption{Shrinking a profile.}
\end{figure}
\pause
Works fine if the indexed sequences are large compared to $k$.
\end{pframe}
......@@ -644,15 +692,62 @@
optimal size when comparing.
\end{pframe}
\section{Applications in metagenomics}
\subsection{Fingers and keyboards}
\begin{pframe}
Experimental setup.
\begin{itemize}
\item Three people.
\item Samples of each finger.
\item Samples of different keys of their keyboard.
\end{itemize}
\bigskip
\pause
Results.
\begin{itemize}
\item Clear clusters per person.
\item Skin samples and keyboard samples were very close together.
\item The keys could even be associated with the fingers.
\end{itemize}
\vfill
\permfoot{Data from: Fierer et~al., 2010.}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{kmer_keyboard}
\includegraphics[height=0.7\textheight,trim=0 0 0 65, clip]
{kmer_keyboard}
\end{center}
\caption{}
\caption{Principal component analysis of distance matrix.}
\end{figure}
\end{pframe}
\subsubsection{Read classification within one dataset}
\begin{pframe}
Experimental setup.
\begin{itemize}
\item Mixture of three bacteria.
\item Simulated sequencing on PacBio (reads of over $20,\!000$
nucleotides).
\item $k$-mer profiling of \emph{each read}.
\item PCA on pairwise distance matrix.
\end{itemize}
\bigskip
\pause
Results.
\begin{itemize}
\item Good separation of species.
\item Clustering with DBSCAN (density based).
\end{itemize}
\vfill
\permfoot{L. Khachatryan, 2015.}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
......@@ -660,12 +755,33 @@
\end{center}
\caption{}
\end{figure}
\vspace{-5pt}
\permfoot{L. Khachatryan, 2015.}
\end{pframe}
\section{Conclusions}
\subsection{Take home message}
\begin{pframe}
My conclusions.
Metagenomics is computationally expensive.
\begin{itemize}
\item Identification and relatedness analysis can be done efficiently.
\end{itemize}
\bigskip
Metagenomics suffers from \emph{reference bias}.
\begin{itemize}
\item Can be avoided by using \emph{reference free} methods.
\end{itemize}
\bigskip
$k$-mer profiling can also be used for exploration of one dataset.
\begin{itemize}
\item Partitioning of dataset for downstream analysis.
\begin{itemize}
\item \textit{De novo} assembly.
\end{itemize}
\end{itemize}
\end{pframe}
% Make the acknowledgements slide.
......
Subproject commit 80b20d3c36d22a83a78bff28049fd8723b310fb3
Subproject commit 8d5710c016ca54a224951e53ae779fe6f67e3629
Subproject commit dfede6fb0c743ed0c78f5b2107ca031035818d5b
Subproject commit c4a46638a9eba2aeb59dfdb061f546c43c4e78fb
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment