Skip to content
Snippets Groups Projects
Select Git revision
  • a70c3019f45ac712c539b580d1bfa62bd426855e
  • master default
  • leiden_2017
  • workshop_HS_2014
  • LOVD_plus
  • 3gb_test
  • ngs_data_analysis
  • lgtc_intro
  • leiden_2014
  • GCC_2013
  • breda_2014
11 results

combining_tools.tex

Blame
  • combining_tools.tex 10.86 KiB
    \documentclass[slidestop]{beamer}
    
    % Title-page metadata for this talk.
    % \providecommand only defines a macro when it does not exist yet, so the
    % \my... values below act as defaults that an earlier definition overrides.
    % NOTE(review): these macros are presumably consumed by the lumc theme that
    % is loaded with \usetheme further down -- confirm against the theme sources.
    \title{Combining tools into a pipeline}
    \providecommand{\myConference}{NGS data analysis, 8th edition}
    \providecommand{\myDate}{Monday, September 1, 2014}
    \author{Jeroen F. J. Laros}
    \providecommand{\myGroup}{Leiden Genome Technology Center}
    \providecommand{\myDepartment}{Department of Human Genetics}
    \providecommand{\myCenter}{Center for Human and Clinical Genetics}
    % Default centre logo: the LGTC logo at a height of 1cm, lowered by 0.1cm
    % through \raisebox. The NGI logo is kept in the source but disabled.
    % NOTE(review): judging by the name this is placed on the closing slide by
    % the lumc theme -- confirm. The line ends inside the body are not protected
    % with '%', so they contribute inter-word spaces around the logo.
    \providecommand{\lastCenterLogo}{
      \raisebox{-0.1cm}{
        \includegraphics[height = 1cm]{lgtc_logo}
        %\includegraphics[height = 0.7cm]{ngi_logo}
      }
    }
    % Default right-hand logo slot. Every candidate logo (NBIC, NWO, Gen2Phen)
    % is commented out, so the macro currently produces no graphics at all.
    % NOTE(review): presumably placed on the closing slide by the lumc theme,
    % like \lastCenterLogo -- confirm.
    \providecommand{\lastRightLogo}{
      %\includegraphics[height = 0.7cm]{nbic_logo}
      %\includegraphics[height = 0.8cm]{nwo_logo_en}
      %\hspace{1.5cm}\includegraphics[height = 0.7cm]{gen2phen_logo}
    }
    
    \usetheme{lumc}
    
    \begin{document}
    
    % This disables the \pause command, handy in the editing phase.
    %\renewcommand{\pause}{}
    
    % Make the title page.
    \bodytemplate
    
    % First page of the presentation.
    \section{Introduction}
    \subsection{Pipelines}
    \begin{pframe}
      \begin{figure}[]
        \begin{center}
          \includegraphics[height=0.85\textheight]{pipeline}
        \end{center}
        \caption{A real-life pipeline.}
      \end{figure}
    \end{pframe}
    
    \begin{pframe}
      \begin{figure}[]
        \begin{center}
          \includegraphics[height=0.85\textheight]{assemblyline}
        \end{center}
        \caption{Scene from ``Modern Times''.}
      \end{figure}
    \end{pframe}
    
    \begin{pframe}
      Combining tools:
      \begin{itemize}
        \item The output of one tool can serve as the input for another.
        \item Not necessarily linear.
        \item \ldots
      \end{itemize}
      \bigskip
      \pause
    
      Running various different tools:
      \begin{itemize}
        \item Two or three different aligners.
        \item A couple of variant callers.
        \item \ldots
      \end{itemize}
    \end{pframe}
    
    \subsection{Running example: Exome sequencing}
    \begin{pframe}
      In \emph{exome sequencing}, we select genomic regions of interest using a 
      \emph{target-enrichment strategy}.
      
      \begin{itemize}
        \item PCR.
        \item On-array capture.
        \item \textcolor{yellow}{In-solution capture}.
      \end{itemize}
      \medskip
      \pause
    
      Overview of an in-solution capture.
      \begin{itemize}
        \item Fragmentation.
        \item Size selection.
        \item Linker ligation.
        \item Capture.
      \end{itemize}
      \medskip
      \pause
    
      These regions are then \emph{sequenced}.
    \end{pframe}
    
    \subsection{Sequencers: HiSeq}
    \begin{pframe}
      \begin{minipage}[t]{0.47\textwidth}
        \begin{figure}
          \includegraphics[width=\textwidth]{hiseq_2000}
          \caption{HiSeq 2000.}
        \end{figure}
      \end{minipage}
      \hfill
      \begin{minipage}[t]{0.47\textwidth}
        Characteristics:
        \begin{itemize}
          \item High throughput.
          \item Paired end.
          \item High accuracy.
          \item Read length $2 \times 150$\,bp.
          \item Relatively long run time.
          \item Relatively expensive.
        \end{itemize}
      \end{minipage}
    \end{pframe}
    
    \subsection{Sequencers: Ion Torrent}
    \begin{pframe}
      \begin{minipage}[t]{0.47\textwidth}
        \begin{figure}
          \includegraphics[width=\textwidth]{ion-torrent}
          \caption{Ion Torrent.}
        \end{figure}
      \end{minipage}
      \hfill
      \begin{minipage}[t]{0.47\textwidth}
        Characteristics:
        \begin{itemize}
          \item Moderate throughput.
          \item Single end (for now).
          \item High accuracy.
          \item Read length $\sim 200$\,bp.
          \item Short run time.
          \item Cheap runs.
        \end{itemize}
      \end{minipage}
    \end{pframe}
    
    \subsection{Data analysis}
    \begin{pframe}
      Resequencing pipelines can roughly be divided into five steps.
      \pause
      \begin{enumerate}
        \item Pre-alignment.
        \begin{itemize}
          \item Quality control.
          \item Data cleaning.
        \end{itemize}
        \pause
        \item Alignment.
        \begin{itemize}
          \item Post-alignment quality control.
        \end{itemize}
        \pause
        \item Variant calling.
        \pause
        \item Filtering.
        \begin{itemize}
          \item Post-variant calling quality control.
        \end{itemize}
        \pause
        \item Annotation.
      \end{enumerate}
    \end{pframe}
    
    \section{Pre-alignment}
    \subsection{Trimming}
    \begin{pframe}
      \begin{figure}[]
        \begin{center}
          \includegraphics[height=0.85\textheight]{pretrimmed_qscores}
        \end{center}
        \caption{Quality score per position.}
      \end{figure}
    \end{pframe}
    
    \subsection{Clipping}
    \begin{pframe}
      \begin{figure}[]
        \begin{center}
          \includegraphics[height=0.85\textheight]{linker-clip}
        \end{center}
        \caption{Sequencing linkers.}
      \end{figure}
    \end{pframe}
    
    \subsection{Data cleaning and QC}
    \begin{pframe}
      Depending on the sequencing platform, parts of the reads need to be removed.
      \begin{itemize}
        \item Remove linker sequences (\emph{Cutadapt}, \emph{FASTX toolkit}).
        \item Trim low-quality bases at the end of the read (\emph{Sickle},
          \emph{Trimmomatic}, \emph{FASTX toolkit}).
        \item Length filtering (\emph{Fastools}).
      \end{itemize}
      \medskip
      \pause
    
      The \emph{FastQC toolkit} can be used for quality control (both before and
      after the data cleaning step).
      \begin{itemize}
        \item Positional nucleotide content.
        \item GC distribution.
        \item Sequence quality distribution.
        \item \ldots
      \end{itemize}
    \end{pframe}
    
    \subsection{Example QC output}
    \begin{pframe}
      \begin{figure}
        \includegraphics[width=\textwidth, height=0.35\textheight]
          {per_base_sequence_content}
         \caption{Positional nucleotide content.}
      \end{figure}
      \vspace{-0.7cm}
    
      \begin{figure}
        \includegraphics[width=\textwidth, height=0.35\textheight]
          {per_sequence_quality}
        \caption{Sequence quality distribution.}
      \end{figure}
    \end{pframe}
    
    \section{Alignment}
    \subsection{Choose an aligner}
    \begin{pframe}
      Alignment needs to be fault-tolerant.
      \medskip
      \pause
    
      Not all aligners can deal with indels.
      \begin{itemize}
        \item Older aligners only allowed substitutions.
      \end{itemize}
      \medskip
      \pause
    
      Few aligners can work with large deletions.
      \begin{itemize}
        \item Spliced RNA.
        \begin{itemize}
          \item \emph{GMAP} / \emph{GSNAP}.
          \item \emph{Tophat}.
        \end{itemize}
        \item \emph{BWA-MEM}.
      \end{itemize}
      \medskip
      \pause
    
      The choice of aligner may be restricted by the sequencer.
      \begin{itemize}
        \item For the Ion Torrent: \emph{Tmap}.
        \item For the PacBio: \emph{BLASR}.
      \end{itemize}
    \end{pframe}
    
    \section{Variant calling}
    \subsection{Pileup}
    \begin{pframe}
      \begin{figure}[]
        \begin{center}
          \includegraphics[width=0.9\textwidth]{varcall}
        \end{center}
        \caption{Result of an alignment.}
      \end{figure}
    \end{pframe}
    
    \subsection{Some considerations}
    \begin{pframe}
      Things a variant caller might take into account:
      \begin{itemize}
        \item Strand balance.
        \item Base quality.
        \item Mapping quality.
        \begin{itemize}
          \item Distribution within the reads.
        \end{itemize}
        \item Ploidy of the organism in question.
      \end{itemize}
      \medskip
      \pause
    
      Complicating factors:
      \begin{itemize}
        \item Pooled samples.
        \pause
        \item RNA.
        \begin{itemize}
          \item Allele-specific expression.
          \item RNA editing.
        \end{itemize}
        \pause
        \item Strand-specific sample prep.
      \end{itemize}
    \end{pframe}
    
    \subsection{Choice of variant caller}
    \begin{pframe}
      Rules of thumb:
      \begin{itemize}
        \item Well-known organism and experiment: statistical model.
        \item Use a simpler variant caller otherwise.
      \end{itemize}
      \bigskip
      \pause
    
      Popular variant callers:
      \begin{itemize}
        \item \emph{Samtools}.
        \item \emph{GATK}.
        \item \emph{VarScan}.
      \end{itemize}
    \end{pframe}
    
    \section{Variant filtering}
    \subsection{Filtering on coverage}
    \begin{pframe}
      We can set some thresholds:
      \begin{itemize}
        \item Minimum.
        \item Maximum.
      \end{itemize}
      \bigskip
      \pause
    
      We filter for a maximum coverage because of copy number variation.
      \bigskip
      \pause
    
      A good way to calculate the maximum:
      \begin{itemize}
        \item Calculate the mean coverage.
        \begin{itemize}
          \item Only of the covered (targeted) regions.
        \end{itemize}
        \item Multiply this number by a reasonable factor, e.g., $2.5$.
      \end{itemize}
    \end{pframe}
    
    \section{Annotation}
    \subsection{What is already known about a variant}
    \begin{pframe}
      A selection of SeattleSeq annotations:
      \begin{itemize}
        \item Is the variant known?
        \item Does it hit a gene?
        \pause
        \begin{itemize}
          \item Is it in an intron?
          \begin{itemize}
            \item Does it hit a splice site?
          \end{itemize}
          \pause
          \item Is it in the coding region?
          \begin{itemize}
            \item Is there a gain/loss of a stop codon?
            \item Does the variant result in a frameshift?
            \item \ldots
          \end{itemize}
          \pause
          \item Is it in the 5'/3' UTR of a gene?
          \item \ldots
        \end{itemize}
        \pause
        \item Is it in a regulatory region?
        \item \ldots
      \end{itemize}
    \end{pframe}
    
    \section{Pipelines}
    \subsection{Combining tools}
    \begin{pframe}
      \begin{lstlisting}[language=bash, caption=Shell script]
        bwa aln -t 8 $reference $i > $i.sai
        bwa samse $reference $i.sai $i > $i.sam
        samtools view -bt $reference -o $i.bam $i.sam
      \end{lstlisting}
      \medskip
      \pause
    
      \begin{lstlisting}[language=make, caption=Makefile]
        %.sai: %.fq
          $(BWA) aln -t $(THREADS) $(call MKREF, $@) $< > $@
    
        %.sam: %.sai %.fq
          $(BWA) samse $(call MKREF, $@) $^ > $@
    
        %.bam: %.sam
          $(SAMTOOLS) view -bt $(call MKREF, $@) -o $@ $<
      \end{lstlisting}
    \end{pframe}
    
    \section{Graphical interfaces}
    \subsection{Galaxy}
    \begin{pframe}
      Galaxy, a graphical user interface:
      \begin{itemize}
        \item Wrapper for command line utilities.
        \item User-friendly.
        \item Point and click.
        \pause
        \item Workflows.
        \begin{itemize}
          \item Save all the steps you did in your analysis.
          \item Rerun the entire analysis on a new dataset.
          \item Share your workflow with other people.
          \item \ldots
        \end{itemize}
      \end{itemize}
    
      \vfill
      \permfoot{http://galaxy.psu.edu/}
    \end{pframe}
    
    \begin{pframe}
      \begin{figure}
        \includegraphics[trim=0 0 0 2cm, clip, width=\textwidth]{galaxy}
        \caption{Galaxy main user interface.}
      \end{figure}
    \end{pframe}
    
    \begin{pframe}
      \begin{figure}
        \includegraphics[width=\textwidth, height=0.9\textheight]{galaxy_mpileup}
        \caption{User-friendly interface with Galaxy.}
      \end{figure}
    \end{pframe}
    
    \subsection{Workflow of a parallel pipeline}
    \begin{pframe}
      \begin{figure}
        \includegraphics[width=\textwidth, height=0.9\textheight]{gapss3}
        \caption{Dependency diagram.}
      \end{figure}
    \end{pframe}
    
    \begin{pframe}
      \begin{figure}
        \includegraphics[trim=320 0 100 70, clip, width=\textwidth,
          height=0.9\textheight]{gapss3}
        \caption{Zoomed in.}
      \end{figure}
    \end{pframe}
    
    \section{Questions?}
    \lastpagetemplate
    \begin{pframe}
      \begin{center}
        \bigskip
        \bigskip
        \bigskip
        \bigskip
    
        Michiel van Galen
    
        Martijn Vermaat
    
        Johan den Dunnen
      \end{center}
    \end{pframe}
    \end{document}