Commit 801c2835 authored by Laros's avatar Laros
Browse files

Updated variant calling lecture.

parent e0eb85f0
../../submodules/presentation-pics/pics/Ion_Proton_s.jpg
\ No newline at end of file
../../submodules/presentation-pics/pics/ion-torrent.jpg
\ No newline at end of file
../../submodules/presentation-pics/pics/miseq.jpg
\ No newline at end of file
../../shared/positionpicture.tex
\ No newline at end of file
\documentclass[slidestop]{beamer}
\author{Jeroen F.J. Laros}
\title{Variant Calling}
\providecommand{\mySubTitle}{}
\providecommand{\myConference}{Hogeschool Leiden}
\providecommand{\myDate}{Thursday, October 15, 2015}
\author{Jeroen F. J. Laros}
\providecommand{\myGroup}{Leiden Genome Technology Center}
\providecommand{\myDate}{02-11-2016}
\providecommand{\myGroup}{}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}
\providecommand{\lastCenterLogo}{
\raisebox{-0.1cm}{
\includegraphics[height = 1cm]{logos/lgtc_logo}
%\includegraphics[height = 0.7cm]{ngi_logo}
}
}
\providecommand{\lastRightLogo}{
%\includegraphics[height = 0.7cm]{nbic_logo}
%\includegraphics[height = 0.8cm]{nwo_logo_en}
%\hspace{1.5cm}\includegraphics[height = 0.7cm]{gen2phen_logo}
}
\providecommand{\myConference}{Hogeschool Leiden}
\usetheme{lumc}
\input{positionpicture.tex}
\begin{document}
% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}
% Make the title page.
\bodytemplate
% Make the title slide.
\makeTitleSlide{\includegraphics[width=3.5cm, trim=2 2 2 2, clip]{k_align}}
% First page of the presentation.
\section{Introduction}
\begin{frame}
\frametitle{Sequencers: Illumina}
\makeTableOfContents
\begin{minipage}[t]{0.48\textwidth}
\subsection{Illumina platforms}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}
\includegraphics[width=\textwidth]{hiseq_2000}
\includegraphics[width=\textwidth, trim=0 40 0 0, clip]{hiseq_2000}
\caption{HiSeq 2500.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.48\textwidth}
\begin{minipage}[t]{0.47\textwidth}
Characteristics:
\begin{itemize}
\item High throughput.
\item High throughput ($3$~genomes).
\item Paired end.
\item High accuracy.
\item Read length $2 \times 125$bp.
\item Relatively long run time.
\item Relatively long run time ($6$~days).
\item Relatively expensive.
\end{itemize}
\end{minipage}
\end{frame}
\begin{frame}
\frametitle{Sequencers: Life Technologies}
\begin{minipage}[t]{0.48\textwidth}
\begin{figure}
\includegraphics[width=\textwidth]{ion-torrent}
\caption{Ion torrent.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.48\textwidth}
Characteristics:
\begin{itemize}
\item Low throughput.
\item Single end (for now).
\item High accuracy.
\item Read length $\pm 400$bp.
\item Short run time.
\item Cheap runs.
\end{itemize}
\end{minipage}
\end{frame}
\begin{frame}
\frametitle{Sequencers: Life Technologies}
\end{pframe}
\begin{minipage}[t]{0.48\textwidth}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}
\includegraphics[width=\textwidth]{Ion_Proton_s}
\caption{Ion proton.}
\includegraphics[width=\textwidth]{miseq}
\vspace{-0.5cm}
\caption{MiSeq.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.48\textwidth}
\begin{minipage}[t]{0.47\textwidth}
Characteristics:
\begin{itemize}
\item Moderate throughput.
\item Single end (for now).
\item Moderate throughput ($3$~exomes).
\item Paired end.
\item High accuracy.
\item Read length $\pm 200$bp.
\item Short run time.
\item Cheap runs.
\item Read length $2 \times 300$bp.
\item Relatively short run time ($3$~days).
\item Relatively expensive.
\end{itemize}
\end{minipage}
\end{frame}
\begin{frame}
\frametitle{Data analysis}
\end{pframe}
\subsection{Next generation sequencing data}
\begin{pframe}
\begin{lstlisting}[language=none, caption={A FastQ file.}]
@SGGPP:4:101
TTCGGGGGCTGGCAAATCCACTTCCGTGACACGCTACCATTCGCTGGTG
+
-'+4589,53330-0&07+03:54/2362-+.488587>@/25440++0
@SGGPP:4:102
CGGTAAACCACCCTGCTGACGGAACCCTAATGCGCCTGAAAGACAGCGT
+
34/--0'+.000(.55:;:99(0(+2(22(0316;185;;0;:<<>=AA
@SGGPP:4:106
TCGTTAACGACTTTGTTCGCCACCGCAACCGCCTGTTTCGGGTCACAGG
+
09875;5?<;?@A4?B:BBB<AA>CCC>C>BB0.->=0488+3444:@5
@SGGPP:4:112
TTGATGAATATATTATTTCAGGGAATAATTATGACACCTTTAGAACGCA
+
70<<@::5:<;==7;>>/79<:.:494.8(,,8:753/5@5??C>B???
\end{lstlisting}
\end{pframe}
\subsection{Data analysis}
\begin{pframe}
Resequencing pipelines can roughly be divided in five steps.
\pause
\begin{enumerate}
......@@ -121,62 +108,59 @@
\pause
\item Variant calling.
\pause
\item Filtering.
\item Annotation and Filtering.
\begin{itemize}
\item Post-variant calling quality control.
\end{itemize}
\pause
\item Annotation.
\item Effect prediction.
\end{enumerate}
\end{frame}
\begin{frame}
\frametitle{Alignment and variant calling}
\end{pframe}
\subsection{Alignment and variant calling}
\begin{pframe}
Alignment needs to be fault-tolerant.
\bigskip
\pause
Not all aligners can deal with indels.
\begin{itemize}
\item Only a couple of years ago, only substitutions were considered.
\begin{itemize}
\item Bowtie.
\end{itemize}
\item Older aligners (\emph{Bowtie}), only consider substitutions.
\end{itemize}
\bigskip
\pause
\medskip
Few aligners can work with large deletions.
Some aligners can work with large deletions.
\begin{itemize}
\item Spliced RNA.
\begin{itemize}
\item Gmap / Gsnap.
\item Tophat.
\item \emph{GMAP} / \emph{GSNAP}.
\item \emph{Tophat}.
\end{itemize}
\item \emph{BWA-MEM}.
\end{itemize}
\bigskip
\pause
The choice of aligner may be restricted by the sequencer.
\end{frame}
\vfill
\permfoot{\url{http://bowtie-bio.sourceforge.net/index.shtml}}
\permfoot{\url{http://research-pub.gene.com/gmap/}}
\permfoot{\url{http://tophat.cbcb.umd.edu/}}
\begin{frame}
\frametitle{Principle of variant calling}
\permfoot{\url{http://bio-bwa.sourceforge.net/}}
\end{pframe}
\section{Variant Calling}
\subsection{Principle of variant calling}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[width=0.9\textwidth]{varcall}
\end{center}
\caption{Result of an alignment.}
\label{}
\end{figure}
\end{frame}
\section{Variant Calling}
\begin{frame}
\frametitle{Principle of variant calling}
\end{pframe}
\begin{pframe}
In principle, we call a variant when we are confident we have seen one.
\bigskip
\pause
......@@ -194,11 +178,10 @@
\item Fixed settings.
\item Statistical models.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Some considerations}
\end{pframe}
\subsection{Some considerations}
\begin{pframe}
Things a variant caller might take into account:
\begin{itemize}
\item Strand balance.
......@@ -209,30 +192,25 @@
\end{itemize}
\item Ploidity of the organism in question.
\end{itemize}
\bigskip
\medskip
\pause
Complicating factors:
\begin{itemize}
\item Pooled samples.
\begin{itemize}
\item More or less the same as ploidity.
\end{itemize}
\pause
\item RNA.
\begin{itemize}
\item Allele specific expression.
\item Tissue specific expression.
\item RNA editing.
\end{itemize}
\pause
\item Strand specific sampleprep.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Choice of variant caller}
\end{pframe}
\subsection{Choice of variant caller}
\begin{pframe}
Rules of thumb:
\begin{itemize}
\item Well known organism and experiment: Statistical model.
......@@ -241,31 +219,26 @@
\bigskip
\pause
For diploid, non pooled samples:
Popular variant callers:
\begin{itemize}
\item Samtools.
\item GATK.
\item \emph{Samtools}.
\item \emph{GATK} (no longer free).
\item \emph{VarScan}.
\end{itemize}
\bigskip
For diploid, non pooled RNA samples:
\begin{itemize}
\item SNVMix.
\end{itemize}
\bigskip
\vfill
\permfoot{\url{http://samtools.sourceforge.net/}}
For other samples:
\begin{itemize}
\item VarScan.
\end{itemize}
\end{frame}
\permfoot{\url{https://www.broadinstitute.org/gatk/}}
\permfoot{\url{http://varscan.sourceforge.net/}}
\end{pframe}
\section{Small indel detection}
\begin{frame}
\frametitle{Indels}
If we choose an aligner that allows for indels, we can also call them.
\subsection{Indels}
\begin{pframe}
Choose an aligner that allows for indels.
\bigskip
\pause
Deletions are easier:
\begin{itemize}
......@@ -273,7 +246,6 @@
\item In principle, a deletion of arbitrary length can be detected.
\end{itemize}
\bigskip
\pause
For insertions:
\begin{itemize}
......@@ -286,11 +258,10 @@
\item Local assembly in combination with anchoring.
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{False positive substitutions.}
\end{pframe}
\subsection{False positive substitutions.}
\begin{pframe}
Can occur because of misalignment near (large) deletions or insertions.
\bigskip
......@@ -307,96 +278,91 @@
No way around this, except for correction afterwards.
\begin{itemize}
\item Realignment.
\item Realignment.
\item BAQ ``realignment''.
\end{itemize}
\end{frame}
\end{pframe}
\begin{frame}
\begin{pframe}
\vspace{-0.5cm}
\begin{figure}[]
\begin{center}
\includegraphics[trim=12cm 9cm 9cm 3.3cm, clip, height=\textheight]
\includegraphics[trim=12cm 9cm 9cm 4cm, clip, height=0.7\textheight]
{27bpDel}
\end{center}
\caption{False positive substitutions.}
\label{}
\end{figure}
\end{frame}
\begin{frame}
\emph{Base Alignment Quality} (BAQ) realignment:
Remove substitutions around indels.
\end{pframe}
\begin{center}
\fbox{
\setlength{\unitlength}{1pt}
\begin{picture}(300, 60)(0, 0)
\put(0, 10){\line(1, 0){300}} % Genomic sequence.
\put(0, 14){{\scriptsize reference}}
\put(80, 20){\line(1, 0){60}} % Read with a deletion.
\put(160, 20){\line(1, 0){60}}
\put(80, 24){{\scriptsize read1}}
\put(148, 27.5){xx}
\put(160, 30){\line(1, 0){110}}
\put(250, 34){{\scriptsize read2}}
\end{picture}
}
\pause
\bigskip
$\Downarrow$
\bigskip
\fbox{
\setlength{\unitlength}{1pt}
\begin{picture}(300, 60)(0, 0)
\put(0, 10){\line(1, 0){300}} % Genomic sequence.
\put(0, 14){{\scriptsize reference}}
\put(80, 20){\line(1, 0){60}} % Read with a deletion.
\put(160, 20){\line(1, 0){60}}
\put(80, 24){{\scriptsize read1}}
\put(130, 30){\line(1, 0){10}}
\put(160, 30){\line(1, 0){110}}
\put(250, 34){{\scriptsize read2}}
\end{picture}
}
\end{center}
\end{frame}
\begin{pframe}
\begin{figure}[]
\begin{center}
\fbox{
\setlength{\unitlength}{0.8pt}
\begin{picture}(300, 60)(0, 0)
\put(0, 10){\line(1, 0){300}} % Genomic sequence.
\put(0, 14){{\scriptsize reference}}
\put(80, 20){\line(1, 0){60}} % Read with a deletion.
\put(160, 20){\line(1, 0){60}}
\put(80, 24){{\scriptsize read1}}
\put(148, 27.5){xx}
\put(160, 30){\line(1, 0){110}}
\put(250, 34){{\scriptsize read2}}
\end{picture}
}
\onslide<2->{
\bigskip
$\Downarrow$
\bigskip
\fbox{
\setlength{\unitlength}{0.8pt}
\begin{picture}(300, 60)(0, 0)
\put(0, 10){\line(1, 0){300}} % Genomic sequence.
\put(0, 14){{\scriptsize reference}}
\put(80, 20){\line(1, 0){60}} % Read with a deletion.
\put(160, 20){\line(1, 0){60}}
\put(80, 24){{\scriptsize read1}}
\put(130, 30){\line(1, 0){10}}
\put(160, 30){\line(1, 0){110}}
\put(250, 34){{\scriptsize read2}}
\end{picture}
}
}
\end{center}
\caption{Realignment.}
\end{figure}
\end{pframe}
\section{Tools}
\begin{frame}
\frametitle{Variant caller input}
\subsection{Variant caller input}
\begin{pframe}
Output of aligner:
\begin{itemize}
\item SAM Sequence Alignment/Map.
\item BAM Binary Alignment/Map (compressed SAM).
\end{itemize}
\bigskip
\bigskip
\pause
The following popular variant callers work (indirectly) with BAM format:
Almost all modern variant callers work (indirectly) with BAM format:
\begin{itemize}
\item Samtools.
\item GATK (no longer free).
\item GATK.
\item VarScan.
\end{itemize}
\bigskip
\bigskip
All of these variant callers produce (indirectly) VCF as output.
\end{frame}
\begin{frame}[fragile]
\frametitle{Pileup}
\end{pframe}
\subsection{Pileup}
\begin{pframe}
Variant calling is done on a \emph{pileup} file.
\bigskip
\pause
......@@ -406,17 +372,16 @@
\begin{lstlisting}[language=none, caption=Make a pileup file.]
samtools view -bt <reference> -o out.bam -i in.sam
samtools sort out.bam out.sorted
samtools mpileup -uf <reference> out.sorted.bam > \
samtools sort out.bam out.sort
samtools mpileup -uf <reference> out.sort.bam > \
out.pileup
\end{lstlisting}
Some variant callers work on the BAM file directly.
\end{frame}
\begin{frame}[fragile]
\frametitle{Converting a SAM file}
\end{pframe}
\subsection{Converting a SAM file}
\begin{pframe}
Use the following command to make a BAM file out of a SAM file.
\bigskip
......@@ -434,13 +399,11 @@
\end{tabular}
\end{center}
\caption{\bt{samtools view} options}
\label{}
\end{table}
\end{frame}
\begin{frame}[fragile]
\frametitle{Indexing the reference sequence}
\end{pframe}
\subsection{Indexing the reference sequence}
\begin{pframe}
This needs to be done only once.
\bigskip
......@@ -456,11 +419,10 @@
\item Length of the reference.
\item Defines the order of the reference sequences in sorting.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Sorting a BAM file}
\end{pframe}
\subsection{Sorting a BAM file}
\begin{pframe}
Sort alignments by leftmost coordinates.
\bigskip
......@@ -473,16 +435,15 @@
\pause
After these steps, the alignment output is ready for the pileup step.
\end{frame}
\begin{frame}[fragile]
\frametitle{Creating a pileup file}
\end{pframe}
\subsection{Creating a pileup file}
\begin{pframe}
Use the following command to make a pileup file.
\bigskip
\begin{lstlisting}[language=none, caption=Make a pileup file.]
samtools mpileup -uf <reference> out.sorted.bam > \
samtools mpileup -uf <reference> out.sort.bam > \
out.pileup
\end{lstlisting}
......@@ -496,29 +457,22 @@
\end{tabular}
\end{center}
\caption{\bt{samtools mpileup} options.}
\label{}
\end{table}
\bigskip
\pause
The \bt{mpileup} command does base quality recalibration.
\end{frame}
\begin{frame}[fragile]
\frametitle{Pileup output}
\end{pframe}
\subsection{Pileup output}
\begin{pframe}
\begin{minipage}[t]{0.45\textwidth}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.8\textheight]{k_align}