Commit 1f15e4cd authored by Laros's avatar Laros
Browse files

Added functional_annotation_metagenomes lecture.

parent 08f463a2
../../submodules/presentation-pics/pics/MG_RAST.ppm
\ No newline at end of file
../../submodules/presentation/Makefile
\ No newline at end of file
../../submodules/presentation/beamerthemelumc.sty
\ No newline at end of file
../../submodules/presentation-pics/pics/ecoli.eps
\ No newline at end of file
../../submodules/presentation-pics/pics/ecoli_coverage.xcf
\ No newline at end of file
% Beamer slide deck for the lecture named in \title below.
\documentclass[slidestop]{beamer}
\title{Functional annotation of metagenomes}
% Event name and date. \providecommand only defines a macro when it is not
% already defined, so a definition made earlier takes precedence.
\providecommand{\myConference}{Metagenomics course}
\providecommand{\myDate}{Thursday, 7 February 2013}
\author{Jeroen F. J. Laros}
% Affiliation strings.
% NOTE(review): presumably read by the lumc theme loaded below -- confirm
% against beamerthemelumc.sty.
\providecommand{\myGroup}{Leiden Genome Technology Center}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}
% Logo material: the LGTC logo at 1cm height, lowered by 0.1cm; the NGI logo
% is commented out.
% NOTE(review): where this macro is typeset is decided outside this file --
% confirm against the theme.
\providecommand{\lastCenterLogo}{
\raisebox{-0.1cm}{
\includegraphics[height = 1cm]{lgtc_logo}
%\includegraphics[height = 0.7cm]{ngi_logo}
}
}
% Second logo macro: every candidate logo is commented out, so the body
% currently contains whitespace only.
\providecommand{\lastRightLogo}{
%\includegraphics[height = 0.7cm]{nbic_logo}
%\includegraphics[height = 0.8cm]{nwo_logo_en}
%\hspace{1.5cm}\includegraphics[height = 0.7cm]{gen2phen_logo}
}
\usetheme{lumc}
% NOTE(review): presumably provides \coveragepic, which is used in the
% "Full genome analysis" frame -- confirm in horizontal_coverage.tex.
\input{horizontal_coverage}
\begin{document}
% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}
% Make the title page.
\bodytemplate
% First page of the presentation.
\section{Introduction}
\begin{frame}
\frametitle{Functional analysis}
Objectives:
\begin{itemize}
\item Find the functional repertoire \ldots
\begin{itemize}
\item of the identified species (taxonomic analysis).
\end{itemize}
\end{itemize}
\bigskip
\pause
Challenges:
\begin{itemize}
\item Incomplete coverage.
\item Abundance and diversity of species.
\begin{itemize}
\item Homologies between species.
\end{itemize}
\item NGS data:
\begin{itemize}
\item Large volume of raw data.
\item Short reads.
\end{itemize}
\item Proteins with unknown functions.
\item Proteins with no known homologues.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Alignment}
One reference genome:
\begin{itemize}
\item Variant calling.
\begin{itemize}
\item Strain identification (MLST).
\end{itemize}
\item Functional consequences of a variant.
\end{itemize}
\bigskip
\pause
Multiple reference genomes:
\begin{itemize}
\item Targeted identification.
\item Related species.
\end{itemize}
\bigskip
\pause
Other datasets:
\begin{itemize}
\item Shotgun datasets.
\item 16S ribosomal RNA.
\item Every known reference sequence (BLASTN).
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Alignment}
\begin{minipage}[t]{0.48\textwidth}
\begin{figure}[h]
\begin{center}
\includegraphics[height=0.9\textheight]{k_align}
\end{center}
\caption{Alignment example.}
\end{figure}
\end{minipage}
\hfill
\pause
\begin{minipage}[t]{0.48\textwidth}
Also useful for filtering:
\begin{itemize}
\item Remove contamination.
\item Reduce the size of the dataset.
\end{itemize}
\bigskip
\pause
But beware:
\begin{itemize}
\item It also removes homologous areas in other species.
\end{itemize}
\end{minipage}
\end{frame}
\section{Targeted identification}
\begin{frame}
\frametitle{Use case: \emph{E. coli} plasmid and gene identification}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.9\textheight]{ecoli}
\end{center}
\caption{\emph{Escherichia coli}.}
\label{fig:ecoli}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Some figures on \emph{E. coli}}
Genome published in 1997.
\begin{itemize}
\item Genome size $4.6 \times 10^6$ basepairs.
\item $4,\!288$ genes in the assembly.
\item $2,\!584$ operons in the assembly.
\end{itemize}
\bigskip
\bigskip
\pause
However, per individual strain:
\begin{itemize}
\item Between $4,\!000$ and $5,\!500$ genes.
\item $16,\!000$ genes in total (pangenome).
\end{itemize}
\bigskip
\bigskip
\pause
Very diverse, only $20\%$ of the genome is shared between all strains.
\bigskip
We could view this as a simple metagenome.
\end{frame}
\begin{frame}
\frametitle{Plasmids}
\begin{figure}[]
\begin{center}
\vspace{-0.5cm}
\colorbox{white}{
\includegraphics[width=0.5\textwidth]{plasmid_en}
}
\end{center}
\caption{Schematic overview of a cell containing plasmids.}
\label{fig:plasmid}
\end{figure}
\pause
Plasmids are small DNA molecules.
\begin{itemize}
\item Separate and independent from the chromosome.
\item Can be transferred to other species.
\item Size between $1 \times 10^3$ and $1 \times 10^6$ basepairs.
\item Copy number between $1$ and $1,\!000$.
\item Variable between strains and individuals.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Profiling}
\pause
Plasmids:
\begin{itemize}
\item May carry antibiotic resistance genes.
\item Not all of them are known.
\item May be highly similar to other plasmids.
\end{itemize}
\bigskip
\bigskip
\pause
Genes:
\begin{itemize}
\item Multi Locus Sequence Typing (MLST).
\begin{itemize}
\item Uses housekeeping genes (genomic).
\item Fragments of $450$ to $500$ basepairs.
\end{itemize}
\pause
\item Antibiotic resistance.
\begin{itemize}
\item The gene may be known, the plasmid may not be.
\end{itemize}
\pause
\item Efflux pumps.
\item \ldots
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Sequencers: Ion Torrent}
\begin{minipage}[t]{0.48\textwidth}
\begin{figure}
\includegraphics[width=\textwidth]{ion-torrent-sequencer-300x235}
\caption{Ion Torrent.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.48\textwidth}
Characteristics:
\begin{itemize}
\item $3$ hours per run.
\item $1$ day sample prep, $1$ day emulsion PCR.
\item $4\times10^6$ reads.
\item Read length ${\sim}300$\,bp.
\item $2$ \emph{E. coli} per run.
\end{itemize}
\end{minipage}
\bigskip
\pause
Fast and inexpensive.
\end{frame}
\begin{frame}
\frametitle{General overview}
We screen for $130$ known plasmids and $400$ genes.
\bigskip
\pause
Output:
\begin{itemize}
\item MLST.
\item List of plasmids.
\begin{itemize}
\item Otherwise, similar plasmids.
\end{itemize}
\item List of genes of interest.
\end{itemize}
\bigskip
\pause
For the MLST, we need a list of variants.
\begin{itemize}
\item Covered in the \emph{NGS introduction course} \ldots
\item and the previous talk.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Plasmid detection}
Pipeline:
\begin{itemize}
\item Select all reads that do not map to the genome.
\item Map these reads to each plasmid individually.
\item Calculate the \emph{horizontal coverage}.
\end{itemize}
\bigskip
\pause
Some notes:
\begin{itemize}
\item This overestimates the number of plasmids.
\item Should be used as an indication of presence.
\begin{itemize}
\item E.g., $80\%$ of a plasmid can be found.
\end{itemize}
\item Homologies between plasmids should be known.
\item Recombination can be an issue.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Coverage}
\begin{figure}[h]
\vspace{-0.5cm}
\begin{center}
\includegraphics[width=\textwidth]{ecoli_coverage}
\end{center}
\caption{Coverage / depth histogram.}
\pause
\begin{picture}(0, 0)(0, 0)
\setlength{\linethickness}{2pt}
\color{red}
\put(-116, 60){\line(1, 0){267}}
\end{picture}
\end{figure}
\pause
\begin{figure}[h]
\vspace{-0.5cm}
\fbox{
\begin{picture}(300, 17)(0, 0)
\put(35, 9){\line(1, 0){40}}
\put(100, 9){\line(1, 0){20}}
\put(145, 9){\line(1, 0){150}}
\end{picture}
}
\caption{Coverage summary.}
\end{figure}
From this, we can easily calculate the percentage of the gene we found.
\end{frame}
\begin{frame}
\frametitle{Plasmid detection}
\begin{table}[]
\begin{center}
\begin{tabular}{l|rrr@{.}lrr@{$.$}l}
Plasmid & Size & Reads & \multicolumn{2}{r}{\#3/\#2} & Cov &
\multicolumn{2}{r}{\#5/\#2}\\
\hline
\bt{NC\_001537} & $3895$ & $18728$ & $4$ & $808$ & $1418$ & $0$
& $364$\\
\bt{NC\_002119} & $9957$ & $6130$ & $0$ & $615$ & $789$ & $0$
& $079$\\
\bt{NC\_002127} & $3306$ & $11749$ & $3$ & $553$ & $1068$ & $0$
& $323$\\
\bt{NC\_002128} & $92721$ & $11824$ & $0$ & $127$ & $35783$ & $0$
& $385$\\
\bt{NC\_002142} & $68817$ & $8163$ & $0$ & $118$ & $15938$ & $0$
& $231$\\
\bt{NC\_002145} & $1549$ & $46141$ & $29$ & $787$ & $1549$ & $1$
& $000$\\
\bt{NC\_002487} & $5847$ & $11669$ & $1$ & $995$ & $1735$ & $0$
& $296$\\
\bt{NC\_002525} & $75582$ & $420$ & $0$ & $005$ & $1325$ & $0$
& $017$\\
\bt{NC\_004429} & $6349$ & $961$ & $0$ & $151$ & $1858$ & $0$
& $292$\\
\end{tabular}
\end{center}
\caption{Part of the plasmids table.}
\label{tab:plasmids}
\end{table}
\end{frame}
\begin{frame}
\frametitle{Gene detection}
\begin{table}[]
\begin{center}
\begin{tabular}{l|rrrr@{$.$}l}
Reference & Gene & Length & Cov & \multicolumn{2}{r}{\#4/\#3}\\
\hline
\bt{AB699171} & CMY-87 & $959$ & $90$ & $0$ & $093$\\
\bt{AB715422} & IMP-34 & $742$ & $125$ & $0$ & $168$\\
\bt{AB737978} & ACT-16 & $1062$ & $202$ & $0$ & $190$\\
\bt{AB753456} & IMP-42 & $739$ & $417$ & $0$ & $564$\\
\bt{AB753457} & IMP-40 & $739$ & $414$ & $0$ & $560$\\
\bt{AB753458} & IMP-41 & $731$ & $364$ & $0$ & $497$\\
\bt{AC\_000091.1} & accD & $915$ & $915$ & $1$ & $000$\\
\bt{AC\_000091.1} & acrA & $1194$ & $1194$ & $1$ & $000$\\
\bt{AC\_000091.1} & acrB & $3150$ & $3150$ & $1$ & $000$\\
\end{tabular}
\end{center}
\caption{Part of the genes table.}
\label{tab:genes}
\end{table}
\end{frame}
\section{Semi-targeted approach}
\begin{fframe}
\frametitle{Full genome analysis}
\begin{figure}
\begin{center}
\only<1>{\coveragepic{0}}
\only<2>{\coveragepic{1}}
\end{center}
\caption{Horizontal coverage.}
\end{figure}
\vfill
\end{fframe}
\begin{frame}
\frametitle{Full genome analysis}
\begin{figure}
\colorbox{white}{
\includegraphics[width=\textwidth]{soil}
}
\caption{Horizontal coverage of ranked genomes.}
\end{figure}
\end{frame}
\section{Functional analysis}
\begin{frame}
\frametitle{An ``unbiased'' approach}
Use every available reference sequence.
\begin{itemize}
\item Focus on finding genes.
\item Try to identify processes based on gene information.
\begin{itemize}
\item The processes are not limited to one species.
\end{itemize}
\end{itemize}
\bigskip
\pause
Identify genes.
\begin{itemize}
\item Looking at the best BLAST hits.
\begin{itemize}
\item More sophisticated methods use weighted BLAST information.
\end{itemize}
\item Do we have all components for a certain pathway?
\end{itemize}
\bigskip
\pause
Still biased towards the content of the databases used.
\end{frame}
\begin{frame}
\frametitle{De novo assembly}
Assemble reads.
\begin{itemize}
\item Covered in the \emph{De novo assembly course}.
\item Can be optimised for \emph{open reading frames}.
\end{itemize}
\bigskip
\pause
Find open reading frames.
\begin{itemize}
\item Glimmer.
\item GeneMark.
\item ORF-Finder.
\item \ldots
\end{itemize}
\bigskip
\pause
Blast these open reading frames.
\begin{itemize}
\item Longer sequences align more easily.
\item May find \emph{homologous} genes.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Pathways}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.9\textheight]{pathway}
\end{center}
\caption{Example pathway (Ye et al. 2009).}
\label{fig:pathway}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Identifying pathways}
In general, a pathway has been found if all the genes involved in that
pathway have been found.
\bigskip
\pause
This approach may lead to overestimation of:
\begin{itemize}
\item The number of pathways.
\item The size of the pathways.
\end{itemize}
\bigskip
But also the underestimation of the size of a pathway.
\bigskip
\pause
Several approaches to solve these issues:
\begin{itemize}
\item Find the minimum number of pathways that explain the observed genes
(MinPath).
\item Smoothing or ``gap filling''.
\item Taxonomic limitation.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{MinPath}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.9\textheight]{minpath}
\end{center}
\caption{(Ye et al. 2009).}
\label{fig:minpath}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Pipelines}
\begin{minipage}[t]{0.48\textwidth}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.9\textheight]{mg_pipeline}
\end{center}
\caption{Prakash et al. 2012.}
\label{fig:mg-pipeline}
\end{figure}
\end{minipage}
\hfill
\pause
\begin{minipage}[t]{0.48\textwidth}
Some examples:
\begin{itemize}
\item HMP Unified Metabolic Analysis (HUMAnN).
\item MetaGenomics Rapid Annotation using Subsystems Technology
(MG-RAST). % Rast is the German word for speed.
\end{itemize}
\end{minipage}
\end{frame}
\begin{frame}
\frametitle{HUMAnN: Human Microbiome}
\begin{minipage}[t]{0.48\textwidth}
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth]{humann_pipeline}
\end{center}
\caption{Abubucker et al. 2012.}
\label{fig:humann-pipeline}
\end{figure}
\end{minipage}
\hfill
\pause
\begin{minipage}[t]{0.48\textwidth}
This pipeline combines many tools:
\begin{itemize}
\item Data cleaning.
\item Blasting (identify organisms).
\item Functional translation / pathways.
\item Taxonomic limitation.
\item \ldots
\end{itemize}
\end{minipage}
\end{frame}
%\begin{frame}
% \frametitle{MG-RAST}
%
% Quality control, protein prediction, clustering and similarity-based