Skip to content
Snippets Groups Projects
Commit 79db5c8f authored by Laros's avatar Laros
Browse files

Added introduction lecture (extracted from extended introduction).

parent 1495d0b1
No related branches found
No related tags found
No related merge requests found
Showing
with 1 addition and 374 deletions
File moved
File moved
File moved
File moved
File moved
\documentclass[slidestop]{beamer}
\title{Introduction to NGS data analysis}
\providecommand{\myConference}{Workshop NGS, Hogeschool Leiden}
\providecommand{\myConference}{Introduction NGS Data Analysis}
\providecommand{\myDate}{Thursday, May 22, 2014}
\author{Jeroen F. J. Laros}
\providecommand{\myGroup}{Leiden Genome Technology Center}
......@@ -340,93 +340,7 @@
\end{itemize}
\end{pframe}
\section{Resequencing}
\subsection{Data analysis}
\begin{pframe}
Resequencing pipelines can roughly be divided in five steps.
\pause
\begin{enumerate}
\item Pre-alignment.
\begin{itemize}
\item Quality control.
\item Data cleaning.
\end{itemize}
\pause
\item Alignment.
\begin{itemize}
\item Post-alignment quality control.
\end{itemize}
\pause
\item Variant calling.
\pause
\item Filtering.
\begin{itemize}
\item Post-variant calling quality control.
\end{itemize}
\pause
\item Annotation.
\end{enumerate}
\end{pframe}
\section{Pre-alignment}
\subsection{Data cleaning}
\begin{pframe}
Depending on the sequencing platform, parts of the reads need to be removed.
\begin{itemize}
\item Remove linker sequences (\emph{Cutadapt}, \emph{FASTX toolkit}).
\item Clip low quality bases at the end of the read (\emph{Sickle},
\emph{Trimmomatic}, \emph{FASTX toolkit}).
\item Length filtering (\emph{Fastools}).
\end{itemize}
\vfill
\permfoot{http://code.google.com/p/cutadapt/}
\permfoot{http://hannonlab.cshl.edu/fastx\_toolkit/}
\permfoot{https://github.com/najoshi/sickle}
\permfoot{http://www.usadellab.org/cms/index.php?page=trimmomatic}
\permfoot{https://pypi.python.org/pypi/fastools}
\end{pframe}
\subsection{Trimming}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.85\textheight]{pretrimmed_qscores}
\end{center}
\caption{Quality score per position.}
\end{figure}
\end{pframe}
\subsection{Clipping}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.85\textheight]{linker-clip}
\end{center}
\caption{Sequencing linkers.}
\end{figure}
\end{pframe}
\subsection{Quality control}
\begin{pframe}
The \emph{FastQC toolkit} can be used for quality control (both before and
after the data cleaning step).
\begin{itemize}
\item GC content.
\item GC distribution.
\item Quality scores distribution.
\item \ldots
\end{itemize}
\vfill
\permfoot{http://www.bioinformatics.babraham.ac.uk/projects/fastqc/}
\end{pframe}
\subsection{Example QC output}
\begin{pframe}
\begin{figure}
\includegraphics[width=\textwidth, height=0.35\textheight]
......@@ -464,55 +378,6 @@
\end{minipage}
\end{pframe}
\subsection{Choose an aligner}
\begin{pframe}
Not all aligners can deal with indels.
\begin{itemize}
\item Until a couple of years ago, only SNPs were considered.
\begin{itemize}
\item \emph{Bowtie}.
\end{itemize}
\end{itemize}
\medskip
\pause
Few aligners can work with large deletions.
\begin{itemize}
\item Spliced RNA.
\begin{itemize}
\item \emph{GMAP} / \emph{GSNAP}.
\item \emph{Tophat}.
\end{itemize}
\item \emph{BWA-MEM}.
\end{itemize}
\vfill
\permfoot{http://bowtie-bio.sourceforge.net/index.shtml}
\permfoot{http://research-pub.gene.com/gmap/}
\permfoot{http://tophat.cbcb.umd.edu/}
\permfoot{http://bio-bwa.sourceforge.net/}
\end{pframe}
\begin{pframe}
The choice of aligner may be restricted by the sequencer.
\begin{itemize}
\item For the Ion Torrent: \emph{Tmap}.
\begin{itemize}
\item Combination of three different aligners.
\item Deals with errors in homopolymer stretches.
\end{itemize}
\item For the PacBio: \emph{BLASR}.
\end{itemize}
\vfill
\permfoot{https://github.com/iontorrent/TS/tree/master/Analysis/TMAP}
\permfoot{https://github.com/PacificBiosciences/blasr}
\end{pframe}
\section{Variant calling}
\subsection{Consistent deviations from the reference}
\begin{pframe}
......@@ -524,181 +389,8 @@
\end{figure}
\end{pframe}
\subsection{Some considerations}
\begin{pframe}
Things a variant caller might take into account:
\begin{itemize}
\item Strand balance.
\item Base quality.
\item Mapping quality.
\begin{itemize}
\item Distribution within the reads.
\end{itemize}
\item Ploidy of the organism in question.
\end{itemize}
\medskip
\pause
Complicating factors:
\begin{itemize}
\item Pooled samples.
\pause
\item RNA.
\begin{itemize}
\item Allele specific expression.
\item RNA editing.
\end{itemize}
\pause
\item Strand specific sample prep.
\end{itemize}
\end{pframe}
\subsection{Choice of variant caller}
\begin{pframe}
Rules of thumb:
\begin{itemize}
\item Well known organism and experiment: Statistical model.
\item Use a simpler variant caller otherwise.
\end{itemize}
\bigskip
\pause
Popular variant callers:
\begin{itemize}
\item \emph{Samtools}.
\item \emph{GATK}.
\item \emph{VarScan}.
\end{itemize}
\vfill
\permfoot{http://samtools.sourceforge.net/}
\permfoot{https://www.broadinstitute.org/gatk/}
\permfoot{http://varscan.sourceforge.net/}
\end{pframe}
\section{Variant filtering}
\subsection{Filtering on coverage}
\begin{pframe}
We can set some thresholds:
\begin{itemize}
\item Minimum.
\item Maximum.
\end{itemize}
\bigskip
\pause
We filter for a maximum coverage because of copy number variation.
\bigskip
\pause
A good way to calculate the maximum:
\begin{itemize}
\item Calculate the mean coverage.
\begin{itemize}
\item Only of the covered (targeted) regions.
\end{itemize}
\item Multiply this number by a reasonable factor, e.g., $2.5$.
\end{itemize}
\end{pframe}
\section{Annotation}
\subsection{What is already known about a variant}
\begin{pframe}
A selection of SeattleSeq annotation:
\begin{itemize}
\item Is the variant known?
\item Does it hit a gene?
\pause
\begin{itemize}
\item Is it in an intron?
\begin{itemize}
\item Does it hit a splice site?
\end{itemize}
\pause
\item Is it in the coding region?
\begin{itemize}
\item Is there a gain/loss of a stop codon?
\item Does the variant result in a frameshift?
\item \ldots
\end{itemize}
\pause
\item Is it in the 5'/3' UTR of a gene?
\item \ldots
\end{itemize}
\pause
\item Is it in a regulatory region?
\item \ldots
\end{itemize}
\end{pframe}
\section{Full genome sequencing}
\subsection{Copy number variation}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[trim=0 5cm 0 0, clip, height=0.9\textheight, width=\textwidth]{cnv}
\end{center}
\caption{Coverage patterns over a whole chromosome.}
\end{figure}
\end{pframe}
\begin{pframe}
Per sample:
\begin{itemize}
\item The reference needs to be very good.
\item Sequencability biases.
\item Mapping biases.
\end{itemize}
\bigskip
\pause
Within a population:
\begin{itemize}
\item Mixture of distributions.
\item Not sensitive to aforementioned biases.
\item Needs a lot of controls.
\end{itemize}
\end{pframe}
\subsection{Structural variation}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.9\textheight, width=\textwidth]{poorly_mapped}
\end{center}
\caption{Multiple issues while mapping.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[trim=0 19cm 0 0, clip, width=\textwidth]{discordant}
\end{center}
\caption{Discordant and split reads.}
\end{figure}
\vfill
\permfoot{http://breakdancer.sourceforge.net/}
\permfoot{http://sourceforge.net/projects/pindel/}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.9\textheight]{sv}
\end{center}
\caption{Different types of structural variation.}
\end{figure}
\end{pframe}
\section{De novo assembly}
\subsection{Assembly}
\begin{pframe}
\begin{figure}[]
\begin{center}
......@@ -708,35 +400,6 @@
\end{figure}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth]{contig}
\end{center}
\caption{Overlaps are used to reconstruct a genome.}
\end{figure}
\end{pframe}
\subsection{Scaffolding}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.9\textheight]{scaffold}
\end{center}
\caption{Paired end or mate pair reads can be used.}
\end{figure}
\end{pframe}
\subsection{Easier assembly with PacBio}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.9\textheight]{hgap}
\end{center}
\caption{Correcting PacBio reads.}
\end{figure}
\end{pframe}
\section{Pipelines}
\subsection{Pipelines}
\begin{pframe}
......@@ -797,42 +460,6 @@
\end{lstlisting}
\end{pframe}
\section{Graphical interfaces}
\subsection{Galaxy}
\begin{pframe}
Galaxy, a graphical user interface:
\begin{itemize}
\item Wrapper for command line utilities.
\item User friendly.
\item Point and click.
\pause
\item Workflows.
\begin{itemize}
\item Save all the steps you did in your analysis.
\item Rerun the entire analysis on a new dataset.
\item Share your workflow with other people.
\item \ldots
\end{itemize}
\end{itemize}
\vfill
\permfoot{http://galaxy.psu.edu/}
\end{pframe}
\begin{pframe}
\begin{figure}
\includegraphics[trim=0 0 0 2cm, clip, width=\textwidth]{galaxy}
\caption{Galaxy main user interface.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{figure}
\includegraphics[width=\textwidth, height=0.9\textheight]{galaxy_mpileup}
\caption{User friendly interface with Galaxy.}
\end{figure}
\end{pframe}
\subsection{Workflow of a parallel pipeline}
\begin{pframe}
\begin{figure}
......
File moved
File moved
File moved
File moved
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment