\documentclass[slidestop]{beamer}

\title{Combining tools into a pipeline}
\providecommand{\myConference}{NGS data analysis, 8th edition}
\providecommand{\myDate}{Monday, September 1, 2014}
\author{Jeroen F. J. Laros}
\providecommand{\myGroup}{Leiden Genome Technology Center}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}
% Centre logo: the LGTC logo, 1cm high, lowered by 0.1cm.
% \providecommand only takes effect when \lastCenterLogo is not yet defined.
% NOTE(review): presumably consumed by the lumc theme's last-page template --
% confirm against the theme.
% Every line break inside the definition ends in `%' so that it does not turn
% into a spurious space before, after or inside the \raisebox.
\providecommand{\lastCenterLogo}{%
  \raisebox{-0.1cm}{%
    \includegraphics[height = 1cm]{lgtc_logo}%
    %\includegraphics[height = 0.7cm]{ngi_logo}
  }%
}
% Right-hand logo: intentionally empty, all candidate logos are commented out.
% \providecommand only takes effect when \lastRightLogo is not yet defined.
% NOTE(review): presumably consumed by the lumc theme's last-page template --
% confirm against the theme.
% The `%' after the opening brace keeps the line break from becoming a space,
% so the macro expands to nothing at all instead of a single blank.
\providecommand{\lastRightLogo}{%
  %\includegraphics[height = 0.7cm]{nbic_logo}
  %\includegraphics[height = 0.8cm]{nwo_logo_en}
  %\hspace{1.5cm}\includegraphics[height = 0.7cm]{gen2phen_logo}
}

\usetheme{lumc}

\begin{document}

% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}

% Make the title page.
\bodytemplate

% First page of the presentation.
\section{Introduction}
\subsection{Pipelines}
\begin{pframe}
  \begin{figure}[]
    \begin{center}
      \includegraphics[height=0.85\textheight]{pipeline}
    \end{center}
    \caption{A real-life pipeline.}
  \end{figure}
\end{pframe}

\begin{pframe}
  \begin{figure}[]
    \begin{center}
      \includegraphics[height=0.85\textheight]{assemblyline}
    \end{center}
    \caption{Scene from ``Modern Times''.}
  \end{figure}
\end{pframe}

\begin{pframe}
  Combining tools:
  \begin{itemize}
    \item The output of one tool can serve as the input for another.
    \item Not necessarily linear.
    \item \ldots
  \end{itemize}
  \bigskip
  \pause

  Running various different tools:
  \begin{itemize}
    \item Two or three different aligners.
    \item A couple of variant callers.
    \item \ldots
  \end{itemize}
\end{pframe}

\subsection{Running example: Exome sequencing}
\begin{pframe}
  In \emph{exome sequencing}, we select genomic regions of interest using a 
  \emph{target-enrichment strategy}.
  
  \begin{itemize}
    \item PCR.
    \item On array capture.
    \item \color{yellow}In-solution capture\color{white}.
  \end{itemize}
  \medskip
  \pause

  Overview of an in-solution capture.
  \begin{itemize}
    \item Fragmentation.
    \item Size selection.
    \item Linker ligation.
    \item Capture.
  \end{itemize}
  \medskip
  \pause

  These regions are then \emph{sequenced}.
\end{pframe}

\subsection{Sequencers: HiSeq}
\begin{pframe}
  \begin{minipage}[t]{0.47\textwidth}
    \begin{figure}
      \includegraphics[width=\textwidth]{hiseq_2000}
      \caption{HiSeq 2000.}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}[t]{0.47\textwidth}
    Characteristics:
    \begin{itemize}
      \item High throughput.
      \item Paired end.
      \item High accuracy.
      \item Read length $2 \times 150$bp.
      \item Relatively long run time.
      \item Relatively expensive.
    \end{itemize}
  \end{minipage}
\end{pframe}

\subsection{Sequencers: Ion Torrent}
\begin{pframe}
  \begin{minipage}[t]{0.47\textwidth}
    \begin{figure}
      \includegraphics[width=\textwidth]{ion-torrent}
      \caption{Ion Torrent.}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}[t]{0.47\textwidth}
    Characteristics:
    \begin{itemize}
      \item Moderate throughput.
      \item Single end (for now).
      \item High accuracy.
      \item Read length $\pm 200$bp.
      \item Short run time.
      \item Cheap runs.
    \end{itemize}
  \end{minipage}
\end{pframe}

\subsection{Data analysis}
\begin{pframe}
  Resequencing pipelines can roughly be divided into five steps.
  \pause
  \begin{enumerate}
    \item Pre-alignment.
    \begin{itemize}
      \item Quality control.
      \item Data cleaning.
    \end{itemize}
    \pause
    \item Alignment.
    \begin{itemize}
      \item Post-alignment quality control.
    \end{itemize}
    \pause
    \item Variant calling.
    \pause
    \item Filtering.
    \begin{itemize}
      \item Post-variant calling quality control.
    \end{itemize}
    \pause
    \item Annotation.
  \end{enumerate}
\end{pframe}

\section{Pre-alignment}
\subsection{Trimming}
\begin{pframe}
  \begin{figure}[]
    \begin{center}
      \includegraphics[height=0.85\textheight]{pretrimmed_qscores}
    \end{center}
    \caption{Quality score per position.}
  \end{figure}
\end{pframe}

\subsection{Clipping}
\begin{pframe}
  \begin{figure}[]
    \begin{center}
      \includegraphics[height=0.85\textheight]{linker-clip}
    \end{center}
    \caption{Sequencing linkers.}
  \end{figure}
\end{pframe}

\subsection{Data cleaning and QC}
\begin{pframe}
  Depending on the sequencing platform, parts of the reads need to be removed.
  \begin{itemize}
    \item Remove linker sequences (\emph{Cutadapt}, \emph{FASTX toolkit}).
    \item Trim low quality bases at the end of the read (\emph{Sickle},
      \emph{Trimmomatic}, \emph{FASTX toolkit}).
    \item Length filtering (\emph{Fastools}).
  \end{itemize}
  \medskip
  \pause

  The \emph{FastQC toolkit} can be used for quality control (both before and
  after the data cleaning step).
  \begin{itemize}
    \item Positional nucleotide content.
    \item GC distribution.
    \item Sequence quality distribution.
    \item \ldots
  \end{itemize}
\end{pframe}

\subsection{Example QC output}
\begin{pframe}
  \begin{figure}
    \includegraphics[width=\textwidth, height=0.35\textheight]
      {per_base_sequence_content}
    \caption{Positional nucleotide content.}
  \end{figure}
  \vspace{-0.7cm}

  \begin{figure}
    \includegraphics[width=\textwidth, height=0.35\textheight]
      {per_sequence_quality}
    \caption{Sequence quality distribution.}
  \end{figure}
\end{pframe}

\section{Alignment}
\subsection{Choose an aligner}
\begin{pframe}
  Alignment needs to be fault-tolerant.
  \medskip
  \pause

  Not all aligners can deal with indels.
  \begin{itemize}
    \item Older aligners only allowed substitutions.
  \end{itemize}
  \medskip
  \pause

  Few aligners can work with large deletions.
  \begin{itemize}
    \item Spliced RNA.
    \begin{itemize}
      \item \emph{GMAP} / \emph{GSNAP}.
      \item \emph{Tophat}.
    \end{itemize}
    \item \emph{BWA-MEM}.
  \end{itemize}
  \medskip
  \pause

  The choice of aligner may be restricted by the sequencer.
  \begin{itemize}
    \item For the Ion Torrent: \emph{Tmap}.
    \item For the PacBio: \emph{BLASR}.
  \end{itemize}
\end{pframe}

\section{Variant calling}
\subsection{Pileup}
\begin{pframe}
  \begin{figure}[]
    \begin{center}
      \includegraphics[width=0.9\textwidth]{varcall}
    \end{center}
    \caption{Result of an alignment.}
  \end{figure}
\end{pframe}

\subsection{Some considerations}
\begin{pframe}
  Things a variant caller might take into account:
  \begin{itemize}
    \item Strand balance.
    \item Base quality.
    \item Mapping quality.
    \begin{itemize}
      \item Distribution within the reads.
    \end{itemize}
    \item Ploidy of the organism in question.
  \end{itemize}
  \medskip
  \pause

  Complicating factors:
  \begin{itemize}
    \item Pooled samples.
    \pause
    \item RNA.
    \begin{itemize}
      \item Allele specific expression.
      \item RNA editing.
    \end{itemize}
    \pause
    \item Strand specific sample preparation.
  \end{itemize}
\end{pframe}

\subsection{Choice of variant caller}
\begin{pframe}
  Rules of thumb:
  \begin{itemize}
    \item Well known organism and experiment: Statistical model.
    \item Use a simpler variant caller otherwise.
  \end{itemize}
  \bigskip
  \pause

  Popular variant callers:
  \begin{itemize}
    \item \emph{Samtools}.
    \item \emph{GATK}.
    \item \emph{VarScan}.
  \end{itemize}
\end{pframe}

\section{Variant filtering}
\subsection{Filtering on coverage}
\begin{pframe}
  We can set some thresholds:
  \begin{itemize}
    \item Minimum.
    \item Maximum.
  \end{itemize}
  \bigskip
  \pause

  We filter for a maximum coverage because of copy number variation.
  \bigskip
  \pause

  A good way to calculate the maximum:
  \begin{itemize}
    \item Calculate the mean coverage.
    \begin{itemize}
      \item Only of the covered (targeted) regions.
    \end{itemize}
    \item Multiply this number by a reasonable factor, e.g., $2.5$.
  \end{itemize}
\end{pframe}

\section{Annotation}
\subsection{What is already known about a variant}
\begin{pframe}
  A selection of SeattleSeq annotation:
  \begin{itemize}
    \item Is the variant known?
    \item Does it hit a gene?
    \pause
    \begin{itemize}
      \item Is it in an intron?
      \begin{itemize}
        \item Does it hit a splice site?
      \end{itemize}
      \pause
      \item Is it in the coding region?
      \begin{itemize}
        \item Is there a gain/loss of a stop codon?
        \item Does the variant result in a frameshift?
        \item \ldots
      \end{itemize}
      \pause
      \item Is it in the 5'/3' UTR of a gene?
      \item \ldots
    \end{itemize}
    \pause
    \item Is it in a regulatory region?
    \item \ldots
  \end{itemize}
\end{pframe}

\section{Pipelines}
\subsection{Combining tools}
\begin{pframe}
  \begin{lstlisting}[language=bash, caption=Shell script]
    bwa aln -t 8 $reference $i > $i.sai
    bwa samse $reference $i.sai $i > $i.sam
    samtools view -bt $reference -o $i.bam $i.sam
  \end{lstlisting}
  \medskip
  \pause

  \begin{lstlisting}[language=make, caption=Makefile]
    %.sai: %.fq
      $(BWA) aln -t $(THREADS) $(call MKREF, $@) $< > $@

    %.sam: %.sai %.fq
      $(BWA) samse $(call MKREF, $@) $^ > $@

    %.bam: %.sam
      $(SAMTOOLS) view -bt $(call MKREF, $@) -o $@ $<
  \end{lstlisting}
\end{pframe}

\section{Graphical interfaces}
\subsection{Galaxy}
\begin{pframe}
  Galaxy: a graphical user interface:
  \begin{itemize}
    \item Wrapper for command line utilities.
    \item User friendly.
    \item Point and click.
    \pause
    \item Workflows.
    \begin{itemize}
      \item Save all the steps you did in your analysis.
      \item Rerun the entire analysis on a new dataset.
      \item Share your workflow with other people.
      \item \ldots
    \end{itemize}
  \end{itemize}

  \vfill
  \permfoot{http://galaxy.psu.edu/}
\end{pframe}

\begin{pframe}
  \begin{figure}
    \includegraphics[trim=0 0 0 2cm, clip, width=\textwidth]{galaxy}
    \caption{Galaxy main user interface.}
  \end{figure}
\end{pframe}

\begin{pframe}
  \begin{figure}
    \includegraphics[width=\textwidth, height=0.9\textheight]{galaxy_mpileup}
    \caption{User friendly interface with Galaxy.}
  \end{figure}
\end{pframe}

\subsection{Workflow of a parallel pipeline}
\begin{pframe}
  \begin{figure}
    \includegraphics[width=\textwidth, height=0.9\textheight]{gapss3}
    \caption{Dependency diagram.}
  \end{figure}
\end{pframe}

\begin{pframe}
  \begin{figure}
    \includegraphics[trim=320 0 100 70, clip, width=\textwidth,
      height=0.9\textheight]{gapss3}
    \caption{Zoomed in.}
  \end{figure}
\end{pframe}

\section{Questions?}
\lastpagetemplate
\begin{pframe}
  \begin{center}
    \bigskip
    \bigskip
    \bigskip
    \bigskip

    Michiel van Galen

    Martijn Vermaat

    Johan den Dunnen
  \end{center}
\end{pframe}
\end{document}