Commit 6263bdcd authored by Laros's avatar Laros
Browse files

Added qc_lab_practices lecture.

parent 8e7ee14f
../../submodules/presentation/Makefile
\ No newline at end of file
../../submodules/presentation/beamerthemelumc.sty
\ No newline at end of file
../../submodules/presentation/gen2phen_logo.eps
\ No newline at end of file
../../submodules/presentation-pics/pics/hiseq_2000.jpg
\ No newline at end of file
../../submodules/presentation-pics/pics/illuminagenomeanalyzer.eps
\ No newline at end of file
../../submodules/presentation/lgtc_logo.eps
\ No newline at end of file
../../submodules/presentation/lumc_logo.eps
\ No newline at end of file
../../submodules/presentation/lumc_logo_small.eps
\ No newline at end of file
../../submodules/presentation/nbic_logo.eps
\ No newline at end of file
../../submodules/presentation/ngi_logo.eps
\ No newline at end of file
../../submodules/presentation/nwo_logo_en.eps
\ No newline at end of file
../../submodules/presentation/nwo_logo_nl.eps
\ No newline at end of file
../../submodules/presentation-pics/pics/per_base_quality.eps
\ No newline at end of file
../../submodules/presentation-pics/pics/per_base_sequence_content.png
\ No newline at end of file
../../submodules/presentation-pics/pics/per_sequence_gc_content.eps
\ No newline at end of file
../../submodules/presentation-pics/pics/per_sequence_quality.png
\ No newline at end of file
% Beamer slides: "QC and accounting in GAPSS3".
\documentclass[slidestop]{beamer}

% Talk metadata. The \my... macros are defined with \providecommand so that
% an earlier definition wins; presumably they are read by the lumc theme
% (not visible here -- confirm against beamerthemelumc.sty).
\title{QC and accounting in GAPSS3}
\providecommand{\myConference}{KeyGene/BioAssist NGS Workshop}
% English month name; 4 July 2011 was indeed a Monday.
\providecommand{\myDate}{Monday, 4 July 2011}
\author{Jeroen F. J. Laros}
\providecommand{\myGroup}{Leiden Genome Technology Center}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}

% Logos for the closing slide. The line breaks inside the definitions are
% kept as in the original so the logo placement does not shift.
\providecommand{\lastCenterLogo}{
  \raisebox{-0.1cm}{
    \includegraphics[scale = 0.055]{lgtc_logo}
  }
}
\providecommand{\lastRightLogo}{
  \includegraphics[scale = 0.1]{nbic_logo}
}

\usetheme{lumc}

\begin{document}
% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}
% Make the title page.
\bodytemplate
% First page of the presentation.
\section{Introduction}
% Frame: target-enrichment strategies, capture steps and sequencers used.
\begin{frame}
  In \emph{exome sequencing}, we select genomic regions of interest using a
  \emph{target-enrichment strategy}.
  \begin{itemize}
    \item PCR.
    \item On-array capture.
    % \textcolor scopes the highlight, so the following text returns to the
    % theme's own text colour instead of a hard-coded white.
    \item \textcolor{yellow}{In-solution capture}.
  \end{itemize}
  \bigskip
  \pause
  Overview of an in-solution capture.
  \begin{itemize}
    \item Fragmentation.
    \item Size selection.
    \item Linker ligation.
    \item Capture.
  \end{itemize}
  \bigskip
  \pause
  These regions are then \emph{sequenced}.
  \begin{itemize}
    % Product name is spelled "Genome Analyzer".
    \item Illumina Genome Analyzer II (GAII).
    \item Illumina HiSeq 2000.
  \end{itemize}
\end{frame}
% Frame: picture and key figures of the Illumina Genome Analyzer II.
\begin{frame}
  The Illumina Genome Analyzer II.

  % The blank line above ends the heading paragraph, so the two minipages
  % form their own line: picture on the left, specifications on the right.
  \begin{minipage}{0.45\textwidth}
    \begin{center}
      \includegraphics[scale = 0.2]{illuminagenomeanalyzer}
    \end{center}
  \end{minipage}
  \hfill
  \pause
  \begin{minipage}{0.45\textwidth}
    \begin{itemize}
      \item Manufacturer: Illumina, Inc.
      \item Commercially available since 2005.
      \item Per cycle, one base is read.
      \item Reads up to $100 \times 2$ base pairs.
      \item Takes about $8$ days.
      \item Produces about $40$ gigabases per run.
    \end{itemize}
  \end{minipage}

  % New paragraph: keeps "Pros:" off the line holding the minipages.
  \pause
  Pros:
  \begin{itemize}
    \item Does paired-end sequencing.
    \item Cheap.
  \end{itemize}
\end{frame}
% Frame: picture and key figures of the Illumina HiSeq 2000.
\begin{frame}
  The Illumina HiSeq 2000.

  % The blank line above ends the heading paragraph, so the two minipages
  % form their own line: picture on the left, specifications on the right.
  \begin{minipage}{0.45\textwidth}
    \begin{center}
      \includegraphics[scale = 0.5]{hiseq_2000}
    \end{center}
  \end{minipage}
  \hfill
  \pause
  \begin{minipage}{0.45\textwidth}
    \begin{itemize}
      \item Manufacturer: Illumina, Inc.
      \item Commercially available since 2010.
      \item Per cycle, one base is read.
      \item Reads up to $150 \times 2$ base pairs.
      \item Takes about $8$ days.
      \item Produces about $150$ gigabases per run.
    \end{itemize}
  \end{minipage}

  % New paragraph: keeps "Pros:" off the line holding the minipages.
  \pause
  Pros:
  \begin{itemize}
    \item Even higher throughput.
  \end{itemize}
\end{frame}
\section{General layout}
% Frame: the five pipeline stages, uncovered one at a time with \pause.
\begin{frame}
  Exome sequencing pipelines can roughly be divided into five steps.
  \bigskip
  \begin{enumerate}
    \item Pre-alignment.
      \begin{itemize}
        \item Quality control.
        \item Data cleaning.
      \end{itemize}
      \pause
    \item Alignment.
      \begin{itemize}
        \item Post-alignment quality control.
      \end{itemize}
      \pause
    \item Variant calling.
      \pause
    \item Filtering.
      \begin{itemize}
        \item Post-variant calling quality control.
      \end{itemize}
      \pause
    \item Annotation.
      \begin{itemize}
        \item Post-annotation quality control.
      \end{itemize}
  \end{enumerate}
\end{frame}
\section{Pre-alignment}
% Frame: data cleaning (FASTX) and quality control (FASTQC) before alignment.
\begin{frame}
  We use the FASTX toolkit for data cleaning.
  \bigskip
  \begin{itemize}
    \item Remove linker sequences.
    % Individual low-quality bases are clipped from the read end; the
    % remaining read is judged in the next step.
    \item Clip low-quality bases at the end of the read.
    \item Judge the read that is left over.
  \end{itemize}
  \bigskip
  \bigskip
  \pause
  The FASTQC toolkit is used for quality control (both before and after the
  data cleaning step).
  \bigskip
  \begin{itemize}
    \item GC content.
    \item GC distribution.
    \item Quality scores distribution.
    \item \ldots
  \end{itemize}
\end{frame}
% Frame: two QC plots, each shown in a left-hand minipage with its caption
% in a right-hand minipage.
\begin{frame}
  % Row 1: per base sequence content.
  \begin{minipage}{0.45\textwidth}
    \begin{center}
      \includegraphics[width = 5.5cm, height = 3cm]{per_base_sequence_content}
    \end{center}
  \end{minipage}
  \hfill
  \begin{minipage}{0.45\textwidth}
    Per base sequence content.
  \end{minipage}
  \bigskip
  % Row 2: per sequence GC content; this image is rotated by 270 degrees,
  % with width and height given the other way round from row 1.
  \begin{minipage}{0.45\textwidth}
    \begin{center}
      \includegraphics[width = 3cm, height = 5.5cm, angle = 270]{per_sequence_gc_content}
    \end{center}
  \end{minipage}
  \hfill
  \begin{minipage}{0.45\textwidth}
    Per sequence GC content.
  \end{minipage}
\end{frame}
% Frame: two more QC plots, each shown in a left-hand minipage with its
% caption in a right-hand minipage.
\begin{frame}
  % Row 1: per sequence quality.
  \begin{minipage}{0.45\textwidth}
    \begin{center}
      \includegraphics[width = 5.5cm, height = 3cm]{per_sequence_quality}
    \end{center}
  \end{minipage}
  \hfill
  \begin{minipage}{0.45\textwidth}
    Per sequence quality.
  \end{minipage}
  \bigskip
  % Row 2: sequence length distribution; this image is rotated by 270
  % degrees, with width and height given the other way round from row 1.
  \begin{minipage}{0.45\textwidth}
    \begin{center}
      \includegraphics[width = 3cm, height = 5.5cm, angle = 270]{sequence_length_distribution}
    \end{center}
  \end{minipage}
  \hfill
  \begin{minipage}{0.45\textwidth}
    Sequence length distribution.
  \end{minipage}
\end{frame}
\section{Alignment}
% Frame: the Stampy aligner, its recalibration step and its use of BWA.
\begin{frame}
  Stampy: A statistical algorithm for sensitive and fast mapping of Illumina
  sequence reads.

  % The blank line above ends the description, so "Some features:" starts
  % on its own line below the \bigskip gap.
  \bigskip
  \pause
  Some features:
  \begin{itemize}
    \item Base quality recalibration.
      \begin{itemize}
        \item First map $1\%$ of the input.
        \item Recalibrate the Fastq quality scores.
        \item Redo the alignment with the recalibrated scores.
      \end{itemize}
      \pause
    \item Uses BWA for the hard work.
      \begin{itemize}
        \item Switches to its accurate built-in aligner when BWA fails.
      \end{itemize}
  \end{itemize}
  \bigskip
  \emph{Burrows-Wheeler Aligner} (BWA) is a short-read aligner that allows
  small insertions and deletions.
\end{frame}
% Frame: a single centred plot (rotated by 270 degrees) for the base
% quality recalibration step.
\begin{frame}
  Base quality recalibration.
  \begin{center}
    \includegraphics[scale = 0.4, angle = 270]{per_base_quality}
  \end{center}
\end{frame}
\section{Variant calling}
% Frame: the chain of file format conversions from SAM to VCF.
% Blank lines separate the sentences and the table into paragraphs;
% without them everything is set as one run-on paragraph with the tabular
% inline.
\begin{frame}
  Variant calling is done by Samtools, BCFtools / VCFutils.

  \bigskip
  The output of most modern aligners is in \emph{Sequence Alignment / Map}
  (SAM) format.

  \bigskip
  \pause
  Mainly file format conversions.

  \bigskip
  % Two-column table: the @{...} column specifications put a bullet in
  % front of each row and an arrow between "from" and "to" format.
  % \textcolor scopes each highlight instead of resetting to a hard-coded
  % white afterwards.
  \begin{tabular}{@{\,\ \ \ $\bullet$\ \,}l@{\ \ $\rightarrow$\ \ }l}
    \textcolor{yellow}{SAM} & BAM.\\
    BAM & BAM.sorted.\\
    BAM.sorted & BAM.sorted.index.\\
    BAM.sorted & mpileup (\textcolor{yellow}{BAQ realignment}).\\
    BAM.sorted & BCF.\\
    BCF & \textcolor{yellow}{VCF}.\\
  \end{tabular}

  \bigskip
  We end up with a list in \emph{Variant Call Format} (VCF).
\end{frame}
% Frame: before/after sketch of BAQ realignment, drawn as two framed
% picture environments (300 x 60 units of 1pt) with an arrow in between.
\begin{frame}
  \emph{Base Alignment Quality} (BAQ) realignment:
  Remove SNPs around indels.
  \begin{center}
    % First drawing: read2 is one long segment, with "xx" marked next to
    % the gap in read1.
    \fbox{
      \setlength{\unitlength}{1pt}
      \begin{picture}(300, 60)(0, 0)
        \put(0, 10){\line(1, 0){300}} % Reference line, full width.
        \put(0, 14){{\scriptsize reference}}
        \put(80, 20){\line(1, 0){60}} % read1, left of the gap.
        \put(160, 20){\line(1, 0){60}}
        \put(80, 24){{\scriptsize read1}}
        \put(148, 27.5){xx}
        \put(160, 30){\line(1, 0){110}}
        \put(250, 34){{\scriptsize read2}}
      \end{picture}
    }
    \pause
    \bigskip
    $\Downarrow$
    \bigskip
    % Second drawing: the "xx" marker is gone and read2 gets an extra
    % short segment left of the gap.
    \fbox{
      \setlength{\unitlength}{1pt}
      \begin{picture}(300, 60)(0, 0)
        \put(0, 10){\line(1, 0){300}} % Reference line, full width.
        \put(0, 14){{\scriptsize reference}}
        \put(80, 20){\line(1, 0){60}} % read1, left of the gap.
        \put(160, 20){\line(1, 0){60}}
        \put(80, 24){{\scriptsize read1}}
        \put(130, 30){\line(1, 0){10}}
        \put(160, 30){\line(1, 0){110}}
        \put(250, 34){{\scriptsize read2}}
      \end{picture}
    }
  \end{center}
\end{frame}
\section{Filtering}
% Frame: variant filters, expected Ts/Tv values and work in progress.
\begin{frame}
  Samtools varfilter.
  \begin{itemize}
    \item Minimum coverage threshold.
    \item Strand bias.
    \item Quality scores.
    % \textcolor limits the highlight to this phrase; the nested list
    % below keeps the theme's own text colour.
    \item \textcolor{yellow}{Maximum coverage threshold}.
      \begin{itemize}
        \item Copy number variation.
        \item Alignment artefacts.
      \end{itemize}
  \end{itemize}
  \bigskip
  \pause
  Transition/transversion ratio:
  \begin{itemize}
    \item Around $2.1$ human full genome.
    \item Around $2.8$ to $3.0$ human exome.
  \end{itemize}
  \bigskip
  \pause
  Still working on:
  \begin{itemize}
    \item Maximum coverage per region.
      \begin{itemize}
        \item Probe affinity can vary greatly.
      \end{itemize}
  \end{itemize}
\end{frame}
\section{Annotation}
% Frame: the five annotation sources; the in-house database is expanded in
% two further \pause steps.
\begin{frame}
  We use five annotation sources.
  \begin{itemize}
    \item Seattle Seq.
    \item Ensembl.
    \item Mutalyzer / SVEP.
    \item LOVD.
    \item In-house database.
      \pause
      \begin{itemize}
        \item HGMD data.
        \item 1000 genomes project.
        % Parent-offspring families are "trios", not "triplets".
        \item Genome of the Netherlands (250 trios).
        \item All variants called by this pipeline.
          \pause
          \begin{itemize}
            \item Coverage per variant.
            \item Number of reads supporting the variant.
            \item Horizontal coverage per sample.
          \end{itemize}
      \end{itemize}
  \end{itemize}
  \bigskip
  \pause
  dbSNP rate should be around $90\%$ (human exome).
\end{frame}
\section{Technical details}
% Frame: implementation notes, uncovered one top-level item at a time.
\begin{frame}
  Some implementation details.
  \begin{itemize}
    \item Framework in \emph{bash}.
      \begin{itemize}
        \item Stand-alone scripts written in other languages (Perl, Python,
          \ldots).
      \end{itemize}
      % \pause placed after the nested list, matching the later \pause
      % commands in this frame.
      \pause
    \item \emph{Sun grid engine} to submit jobs to our local cluster.
      \pause
    \item Database to keep track of the versions of all used tools and custom
      scripts.
      \begin{itemize}
        \item If one or more tools are upgraded, the new versions are stored.
        \item The version number of the pipeline is incremented.
        \item The versions of all tools of all pipeline versions can be
          retrieved from this database.
      \end{itemize}
      \pause
    \item \LaTeX\ documentation is automatically generated.
      \begin{itemize}
        \item Compiled to PDF that can be handed over to the customer.
      \end{itemize}
      \pause
    \item All individual commands are logged.
  \end{itemize}
\end{frame}
\section{Questions?}
% \lastpagetemplate comes from the theme; presumably it switches to the
% closing-slide layout with the logos -- confirm in beamerthemelumc.sty.
\lastpagetemplate
% Frame: acknowledgements, one name per line, followed by the group URL.
\begin{frame}
  \begin{center}
    Acknowledgements

    \bigskip
    \bigskip
    % Explicit line breaks: without them the names are set as one run-on
    % paragraph.
    Michiel van Galen\\
    Yu-Ching Lai\\
    Martijn Vermaat\\
    Bradley ten Broeke\\
    Jaap van der Heijden\\
    Michel Villerius\\
    Matthew Hestand\\
    Johan den Dunnen

    \bigskip
    \bigskip
    \bigskip
    \bigskip
    \bigskip
    \bigskip
    \bt{http://www.lgtc.nl}
  \end{center}
\end{frame}
\end{document}
../../submodules/presentation-pics/pics/sequence_length_distribution.eps
\ No newline at end of file
../../submodules/presentation/ul_logo.eps
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment