% Select Git revision
% combining_tools.tex
%
% Laros authored
% combining_tools.tex 10.86 KiB
% Beamer presentation. The \my... macros below are not used in this file
% itself; presumably the `lumc' theme loaded further down reads them for the
% title and closing slides -- TODO confirm against the theme.
\documentclass[slidestop]{beamer}
% Title of the talk.
\title{Combining tools into a pipeline}
% Event and date. \providecommand only defines a macro that does not exist
% yet, so a definition made earlier takes precedence over these values.
\providecommand{\myConference}{NGS data analysis, 8th edition}
\providecommand{\myDate}{Monday, September 1, 2014}
% Speaker.
\author{Jeroen F. J. Laros}
% Affiliation, from the most specific unit to the most general one.
\providecommand{\myGroup}{Leiden Genome Technology Center}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}
% Logo material for \lastCenterLogo: the LGTC logo at a height of 1cm, lowered
% by 0.1cm through \raisebox. The NGI logo is kept but commented out.
% Presumably placed by the `lumc' theme on the closing slide -- TODO confirm.
% NOTE(review): the line ends after `{' and `}' are not protected with `%', so
% the expansion contains extra spaces around the image; confirm the layout does
% not rely on them before removing them.
\providecommand{\lastCenterLogo}{
\raisebox{-0.1cm}{
\includegraphics[height = 1cm]{lgtc_logo}
%\includegraphics[height = 0.7cm]{ngi_logo}
}
}
% Logo material for \lastRightLogo. Every logo is commented out at the moment,
% so the macro expands to nothing but the single space produced by the
% unprotected line end after the opening brace.
% Presumably placed by the `lumc' theme on the closing slide -- TODO confirm.
\providecommand{\lastRightLogo}{
%\includegraphics[height = 0.7cm]{nbic_logo}
%\includegraphics[height = 0.8cm]{nwo_logo_en}
%\hspace{1.5cm}\includegraphics[height = 0.7cm]{gen2phen_logo}
}
\usetheme{lumc}
\begin{document}
% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}
% Make the title page.
\bodytemplate
% First page of the presentation.
\section{Introduction}
\subsection{Pipelines}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.85\textheight]{pipeline}
\end{center}
\caption{A real-life pipeline.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.85\textheight]{assemblyline}
\end{center}
\caption{Scene from ``Modern Times''.}
\end{figure}
\end{pframe}
\begin{pframe}
Combining tools:
\begin{itemize}
\item The output of one tool can serve as the input for another.
\item Not necessarily linear.
\item \ldots
\end{itemize}
\bigskip
\pause
Running various different tools:
\begin{itemize}
\item Two or three different aligners.
\item A couple of variant callers.
\item \ldots
\end{itemize}
\end{pframe}
\subsection{Running example: Exome sequencing}
\begin{pframe}
In \emph{exome sequencing}, we select genomic regions of interest using a
\emph{target-enrichment strategy}.
\begin{itemize}
\item PCR.
\item On array capture.
\item \textcolor{yellow}{In-solution capture}.
\end{itemize}
\medskip
\pause
Overview of an in-solution capture.
\begin{itemize}
\item Fragmentation.
\item Size selection.
\item Linker ligation.
\item Capture.
\end{itemize}
\medskip
\pause
These regions are then \emph{sequenced}.
\end{pframe}
\subsection{Sequencers: HiSeq}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}
\includegraphics[width=\textwidth]{hiseq_2000}
\caption{HiSeq 2000.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
Characteristics:
\begin{itemize}
\item High throughput.
\item Paired end.
\item High accuracy.
\item Read length $2 \times 150$\,bp.
\item Relatively long run time.
\item Relatively expensive.
\end{itemize}
\end{minipage}
\end{pframe}
\subsection{Sequencers: Ion Torrent}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}
\includegraphics[width=\textwidth]{ion-torrent}
\caption{Ion Torrent.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
Characteristics:
\begin{itemize}
\item Moderate throughput.
\item Single end (for now).
\item High accuracy.
\item Read length $\sim 200$\,bp.
\item Short run time.
\item Cheap runs.
\end{itemize}
\end{minipage}
\end{pframe}
\subsection{Data analysis}
\begin{pframe}
Resequencing pipelines can roughly be divided into five steps.
\pause
\begin{enumerate}
\item Pre-alignment.
\begin{itemize}
\item Quality control.
\item Data cleaning.
\end{itemize}
\pause
\item Alignment.
\begin{itemize}
\item Post-alignment quality control.
\end{itemize}
\pause
\item Variant calling.
\pause
\item Filtering.
\begin{itemize}
\item Post-variant calling quality control.
\end{itemize}
\pause
\item Annotation.
\end{enumerate}
\end{pframe}
\section{Pre-alignment}
\subsection{Trimming}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.85\textheight]{pretrimmed_qscores}
\end{center}
\caption{Quality score per position.}
\end{figure}
\end{pframe}
\subsection{Clipping}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.85\textheight]{linker-clip}
\end{center}
\caption{Sequencing linkers.}
\end{figure}
\end{pframe}
\subsection{Data cleaning and QC}
\begin{pframe}
Depending on the sequencing platform, parts of the reads need to be removed.
\begin{itemize}
\item Remove linker sequences (\emph{Cutadapt}, \emph{FASTX toolkit}).
\item Trim low quality bases at the end of the read (\emph{Sickle},
\emph{Trimmomatic}, \emph{FASTX toolkit}).
\item Length filtering (\emph{Fastools}).
\end{itemize}
\medskip
\pause
The \emph{FastQC toolkit} can be used for quality control (both before and
after the data cleaning step).
\begin{itemize}
\item Positional nucleotide content.
\item GC distribution.
\item Sequence quality distribution.
\item \ldots
\end{itemize}
\end{pframe}
\subsection{Example QC output}
\begin{pframe}
\begin{figure}
\includegraphics[width=\textwidth, height=0.35\textheight]
{per_base_sequence_content}
\caption{Positional nucleotide content.}
\end{figure}
\vspace{-0.7cm}
\begin{figure}
\includegraphics[width=\textwidth, height=0.35\textheight]
{per_sequence_quality}
\caption{Sequence quality distribution.}
\end{figure}
\end{pframe}
\section{Alignment}
\subsection{Choose an aligner}
\begin{pframe}
Alignment needs to be fault-tolerant.

\medskip
\pause
Not all aligners can deal with indels.
\begin{itemize}
\item Older aligners only allowed substitutions.
\end{itemize}
\medskip
\pause
Few aligners can work with large deletions.
\begin{itemize}
\item Spliced RNA.
\begin{itemize}
\item \emph{GMAP} / \emph{GSNAP}.
\item \emph{Tophat}.
\end{itemize}
\item \emph{BWA-MEM}.
\end{itemize}
\medskip
\pause
The choice of aligner may be restricted by the sequencer.
\begin{itemize}
\item For the Ion Torrent: \emph{Tmap}.
\item For the PacBio: \emph{BLASR}.
\end{itemize}
\end{pframe}
\section{Variant calling}
\subsection{Pileup}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[width=0.9\textwidth]{varcall}
\end{center}
\caption{Result of an alignment.}
\end{figure}
\end{pframe}
\subsection{Some considerations}
\begin{pframe}
Things a variant caller might take into account:
\begin{itemize}
\item Strand balance.
\item Base quality.
\item Mapping quality.
\begin{itemize}
\item Distribution within the reads.
\end{itemize}
\item Ploidy of the organism in question.
\end{itemize}
\medskip
\pause
Complicating factors:
\begin{itemize}
\item Pooled samples.
\pause
\item RNA.
\begin{itemize}
\item Allele-specific expression.
\item RNA editing.
\end{itemize}
\pause
\item Strand-specific sample prep.
\end{itemize}
\end{pframe}
\subsection{Choice of variant caller}
\begin{pframe}
Rules of thumb:
\begin{itemize}
\item Well known organism and experiment: Statistical model.
\item Use a simpler variant caller otherwise.
\end{itemize}
\bigskip
\pause
Popular variant callers:
\begin{itemize}
\item \emph{Samtools}.
\item \emph{GATK}.
\item \emph{VarScan}.
\end{itemize}
\end{pframe}
\section{Variant filtering}
\subsection{Filtering on coverage}
\begin{pframe}
We can set some thresholds:
\begin{itemize}
\item Minimum.
\item Maximum.
\end{itemize}
\bigskip
\pause
We filter for a maximum coverage because of copy number variation.

\bigskip
\pause
A good way to calculate the maximum:
\begin{itemize}
\item Calculate the mean coverage.
\begin{itemize}
\item Only of the covered (targeted) regions.
\end{itemize}
\item Multiply this number by a reasonable factor, e.g., $2.5$.
\end{itemize}
\end{pframe}
\section{Annotation}
\subsection{What is already known about a variant}
\begin{pframe}
A selection of SeattleSeq annotation:
\begin{itemize}
\item Is the variant known?
\item Does it hit a gene?
\pause
\begin{itemize}
\item Is it in an intron?
\begin{itemize}
\item Does it hit a splice site?
\end{itemize}
\pause
\item Is it in the coding region?
\begin{itemize}
\item Is there a gain/loss of a stop codon?
\item Does the variant result in a frameshift?
\item \ldots
\end{itemize}
\pause
\item Is it in the 5'/3' UTR of a gene?
\item \ldots
\end{itemize}
\pause
\item Is it in a regulatory region?
\item \ldots
\end{itemize}
\end{pframe}
\section{Pipelines}
\subsection{Combining tools}
\begin{pframe}
% The pipeline written as a plain shell script: `bwa aln' is run on $i and
% writes $i.sai, `bwa samse' turns that into $i.sam, and `samtools view'
% writes $i.bam. Neither $reference nor $i is set inside the listing.
\begin{lstlisting}[language=bash, caption=Shell script]
bwa aln -t 8 $reference $i > $i.sai
bwa samse $reference $i.sai $i > $i.sam
samtools view -bt $reference -o $i.bam $i.sam
\end{lstlisting}
\medskip
\pause
% Makefile pattern rules for the chain .fq -> .sai -> .sam -> .bam.
% BWA, SAMTOOLS, THREADS and MKREF are not defined inside the listing.
% Each recipe line starts with a literal tab, which make requires for the
% rules to be valid; tabsize=2 keeps the indented lines narrow on the slide.
\begin{lstlisting}[language=make, caption=Makefile, tabsize=2]
%.sai: %.fq
	$(BWA) aln -t $(THREADS) $(call MKREF, $@) $< > $@
%.sam: %.sai %.fq
	$(BWA) samse $(call MKREF, $@) $^ > $@
%.bam: %.sam
	$(SAMTOOLS) view -bt $(call MKREF, $@) -o $@ $<
\end{lstlisting}
\end{pframe}
\section{Graphical interfaces}
\subsection{Galaxy}
\begin{pframe}
Galaxy, a graphical user interface:
\begin{itemize}
\item Wrapper for command line utilities.
\item User friendly.
\item Point and click.
\pause
\item Workflows.
\begin{itemize}
\item Save all the steps you did in your analysis.
\item Rerun the entire analysis on a new dataset.
\item Share your workflow with other people.
\item \ldots
\end{itemize}
\end{itemize}
\vfill
\permfoot{http://galaxy.psu.edu/}
\end{pframe}
\begin{pframe}
\begin{figure}
\includegraphics[trim=0 0 0 2cm, clip, width=\textwidth]{galaxy}
\caption{Galaxy main user interface.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{figure}
\includegraphics[width=\textwidth, height=0.9\textheight]{galaxy_mpileup}
\caption{User friendly interface with Galaxy.}
\end{figure}
\end{pframe}
\subsection{Workflow of a parallel pipeline}
\begin{pframe}
\begin{figure}
\includegraphics[width=\textwidth, height=0.9\textheight]{gapss3}
\caption{Dependency diagram.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{figure}
\includegraphics[trim=320 0 100 70, clip, width=\textwidth,
height=0.9\textheight]{gapss3}
\caption{Zoomed in.}
\end{figure}
\end{pframe}
\section{Questions?}
\lastpagetemplate
\begin{pframe}
\begin{center}
\bigskip
\bigskip
\bigskip
\bigskip
Michiel van Galen\\
Martijn Vermaat\\
Johan den Dunnen
\end{center}
\end{pframe}
\end{document}