Skip to content
Snippets Groups Projects
Commit 1495d0b1 authored by Laros's avatar Laros
Browse files

Merge branch 'leiden_2014' of git.lumc.nl:humgen/ngs-intro-course into leiden_2014

parents 7e21f0ee 51b66d40
No related branches found
No related tags found
No related merge requests found
\documentclass[slidestop]{beamer}
\title{Quality control}
\providecommand{\myConference}{NGS introduction}
\providecommand{\myDate}{Thursday, 22 May 2014}
\author{Michiel van Galen}
\providecommand{\myGroup}{Leiden Genome Technology Center}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}
\providecommand{\lastCenterLogo}{
\raisebox{-0.1cm}{
%\includegraphics[height=1cm]{lgtc_logo}
%\includegraphics[height=0.7cm]{ngi_logo}
}
}
\providecommand{\lastRightLogo}{
%\includegraphics[height=0.7cm]{nbic_logo}
%\includegraphics[height=0.8cm]{nwo_logo_en}
%\hspace{1.5cm}\includegraphics[height=0.7cm]{gen2phen_logo}
}
\usetheme{lumc}
\begin{document}
% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}
% Make the title page.
\bodytemplate
\section{Introduction}
\subsection{Overview}
\begin{pframe}
\begin{itemize}
\item Data and the flaws
\item Quality control basics
\item Tools and advanced methods
\end{itemize}
\end{pframe}
\subsection{The data}
\begin{pframe}
\begin{itemize}
\item FastQ: Expanded two line Fasta format
\item Four lines per entry
\item Sequence and per base phred quality combined
\item Beware of different score offsets
\end{itemize}
\begin{lstlisting}[caption={FastQ format}]
@SEQ_ID
GATTTGGGGTTCAAAGCAGTA
+
!''*((((***+))%%%++)(
\end{lstlisting}
\end{pframe}
\subsection{The flaws}
\begin{pframe}
At any point from the start of the experiment until the beginning of the
analyses, quality can be jeopardized.
\bigskip
\begin{itemize}
\item Gathering material and sample prep
\begin{itemize}
\item Contamination, degradation, sample swap
\end{itemize}
\item Sequencing
\begin{itemize}
\item Exhausted chemicals, technical issues
\end{itemize}
\item Data integrity
\begin{itemize}
\item File corruption
\end{itemize}
\item Many other unexpected external factors
\end{itemize}
\end{pframe}
\subsection{The consequence}
\begin{pframe}
\bigskip
Low quality greatly influences the downstream analyses.
\bigskip
\begin{figure}
\caption{Garbage in, garbage out}
\centering
\includegraphics[width=0.5\textwidth]{garbage}
\end{figure}
\end{pframe}
\section{Quality control basics}
\subsection{Quality assessment}
\begin{pframe}
\begin{itemize}
\item FastQC: A quality control tool for high throughput sequence data.
\item Assess the quality of your data in a fastq file
\end{itemize}
\begin{figure}
\caption{FastQC}
\centering
\includegraphics[width=0.5\textwidth]{pretrimmed_qscores}
\end{figure}
\end{pframe}
\subsection{Data properties}
\begin{pframe}
Properties which can indicate possible biases in your data:
\begin{itemize}
\item Quality scores - Higher is better
\item GC content - Expected vs observed
\item Duplication rate - Lower is usually better
\item N content - Less is more
\item Adapter contaminants - More adapter, less sample
\item kMer statistics - Expected vs observed
\end{itemize}
\end{pframe}
\subsection{Improving your data}
\begin{pframe}
After identification of some issues, correction may be possible
\bigskip
\begin{itemize}
\item Low quality bases can be discarded
\item Adapter sequences can be removed
\item Downstream analyses can be tailored to identified problems
\end{itemize}
\end{pframe}
\subsection{Quality trimming}
\begin{pframe}
\begin{itemize}
\item Getting rid of low quality bases
\item Only want to maintain the high-quality bases
\end{itemize}
\begin{lstlisting}[language=none, caption={}]
@Header
ACGTACGTACGT
+
!#II!JJJI##!
Will result in:
--GTACGTA---
\end{lstlisting}
\end{pframe}
\subsection{Clipping adapters}
\begin{pframe}
\begin{itemize}
\item FastQC can identify adapter contaminants which can hamper later analyses
\item Specific tools can remove these specific sequences
\end{itemize}
\begin{figure}
\caption{Adapter Sequencing}
\centering
\includegraphics[width=0.8\textwidth]{adapter_sequencing}
\end{figure}
\end{pframe}
\subsection{Digital data quality}
\begin{pframe}
Digital data can also be of low quality
\bigskip
\begin{itemize}
\item Hardware failure
\begin{itemize}
\item Data corruption, insufficient disk space
\end{itemize}
\item Human failure
\begin{itemize}
\item Sample swaps, unclear file names, incomplete copies
\end{itemize}
\end{itemize}
\end{pframe}
\section{Tools and advanced methods}
\subsection{kMer analysis}
\begin{pframe}
\begin{itemize}
\item Analyzing the frequencies of words of length K
\item Proven to detect all sorts of factors which influence the data
\begin{itemize}
\item Contamination, quality, duplication
\end{itemize}
\item Also used to determine sample complexity
\end{itemize}
\end{pframe}
\subsection{Overview of tools}
\begin{pframe}
\begin{itemize}
\item{Quality assessment}
\begin{itemize}
\item FastQC, kMer, QCDB
\end{itemize}
\item{Trimming}
\begin{itemize}
\item Sickle: A windowed adaptive trimming tool
\end{itemize}
\item{Adapter clipping}
\begin{itemize}
\item Cutadapt
\end{itemize}
\item{File integrity}
\begin{itemize}
\item MD5 checksums, GRP
\end{itemize}
\end{itemize}
\end{pframe}
\subsection{QC process}
\begin{pframe}
Good QC practice can be performed following the next steps:
\begin{itemize}
\item Assess the quality of raw data
\item Identify possible factors that impact the data
\item Apply the tools to improve the data
\item Assess the quality again and evaluate the results
\end{itemize}
\bigskip
Preferably this can be done in a precompiled pipeline
\end{pframe}
\section{Questions?}
\lastpagetemplate
\begin{pframe}
\begin{center}
Acknowledgements:
\bigskip
\bigskip
Jeroen Laros
\bigskip
Martijn Vermaat
\bigskip
Jeroen Frank
\bigskip
LGTC
\end{center}
\end{pframe}
\end{document}
\documentclass[slidestop]{beamer}
\title{Quality control}
\providecommand{\myConference}{Work discussion}
\providecommand{\myDate}{Thursday, 22 May 2014}
\author{Michiel van Galen}
\providecommand{\myGroup}{Leiden Genome Technology Center}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}
\providecommand{\lastCenterLogo}{
\raisebox{-0.1cm}{
%\includegraphics[height=1cm]{lgtc_logo}
%\includegraphics[height=0.7cm]{ngi_logo}
}
}
\providecommand{\lastRightLogo}{
%\includegraphics[height=0.7cm]{nbic_logo}
%\includegraphics[height=0.8cm]{nwo_logo_en}
%\hspace{1.5cm}\includegraphics[height=0.7cm]{gen2phen_logo}
}
\usetheme{lumc}
\begin{document}
% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}
% Make the title page.
\bodytemplate
\section{Introduction}
\subsection{Overview}
\begin{pframe}
\begin{itemize}
\item Data and the flaws
\item Quality control basics
\item Tools and advanced methods
\end{itemize}
\end{pframe}
\subsection{The data}
\begin{pframe}
\begin{itemize}
\item FastQ: Expanded two line Fasta format
\item Four lines per entry
\item Sequence and per base phred quality combined
\item Beware of different score offsets
\end{itemize}
\begin{lstlisting}[caption={FastQ format}]
@SEQ_ID
GATTTGGGGTTCAAAGCAGTA
+
!''*((((***+))%%%++)(
\end{lstlisting}
\end{pframe}
\subsection{The flaws}
\begin{pframe}
At any point from the start of the experiment until the beginning of the
analyses, quality can be jeopardized.
\bigskip
\begin{itemize}
\item Gathering material and sample prep
\begin{itemize}
\item Contamination, degradation, sample swap
\end{itemize}
\item Sequencing
\begin{itemize}
\item Exhausted chemicals, technical issues
\end{itemize}
\item Data integrity
\begin{itemize}
\item File corruption
\end{itemize}
\item Many other unexpected external factors
\end{itemize}
\end{pframe}
\subsection{The consequence}
\begin{pframe}
\bigskip
Low quality greatly influences the downstream analyses.
\bigskip
\begin{figure}
\caption{Garbage in, garbage out}
\centering
\includegraphics[width=0.5\textwidth]{garbage}
\end{figure}
\end{pframe}
\section{Quality control basics}
\subsection{Quality assessment}
\begin{pframe}
\begin{itemize}
\item FastQC: A quality control tool for high throughput sequence data.
\item Assess the quality of your data in a fastq file
\end{itemize}
\begin{figure}
\caption{FastQC}
\centering
\includegraphics[width=0.5\textwidth]{pretrimmed_qscores}
\end{figure}
\end{pframe}
\subsection{Data properties}
\begin{pframe}
Properties which can indicate possible biases in your data:
\begin{itemize}
\item Quality scores - Higher is better
\item GC content - Expected vs observed
\item Duplication rate - Lower is usually better
\item N content - Less is more
\item Adapter contaminants - More adapter, less sample
\item kMer statistics - Expected vs observed
\end{itemize}
\end{pframe}
\subsection{Improving your data}
\begin{pframe}
After identification of some issues, correction may be possible
\bigskip
\begin{itemize}
\item Low quality bases can be discarded
\item Adapter sequences can be removed
\item Downstream analyses can be tailored to identified problems
\end{itemize}
\end{pframe}
\subsection{Quality trimming}
\begin{pframe}
\begin{itemize}
\item Getting rid of low quality bases
\item Only want to maintain the high-quality bases
\end{itemize}
\begin{lstlisting}[language=none, caption={}]
@Header
ACGTACGTACGT
+
!#II!JJJI##!
Will result in:
--GTACGTA---
\end{lstlisting}
\end{pframe}
\subsection{Clipping adapters}
\begin{pframe}
\begin{itemize}
\item FastQC can identify adapter contaminants which can hamper later analyses
\item Specific tools can remove these specific sequences
\end{itemize}
\begin{figure}
\caption{Adapter Sequencing}
\centering
\includegraphics[width=0.8\textwidth]{adapter_sequencing}
\end{figure}
\end{pframe}
\subsection{Digital data quality}
\begin{pframe}
Digital data can also be of low quality
\bigskip
\begin{itemize}
\item Hardware failure
\begin{itemize}
\item Data corruption, insufficient disk space
\end{itemize}
\item Human failure
\begin{itemize}
\item Sample swaps, unclear filenames, incomplete copies
\end{itemize}
\end{itemize}
\end{pframe}
\section{Tools and advanced methods}
\subsection{kMer analysis}
\begin{pframe}
\begin{itemize}
\item Analyzing the frequencies of words of length K
\item Proven to detect all sorts of factors which influence the data
\begin{itemize}
\item Contamination, quality, duplication
\end{itemize}
\item Also used to determine sample complexity
\end{itemize}
\end{pframe}
\subsection{Overview of tools}
\begin{pframe}
\begin{itemize}
\item{Quality assessment}
\begin{itemize}
\item FastQC, kMer, QCDB
\end{itemize}
\item{Trimming}
\begin{itemize}
\item Sickle: A windowed adaptive trimming tool
\end{itemize}
\item{Adapter clipping}
\begin{itemize}
\item Cutadapt
\end{itemize}
\item{File integrity}
\begin{itemize}
\item MD5 checksums, GRP
\end{itemize}
\end{itemize}
\end{pframe}
\subsection{QC process}
\begin{pframe}
Good QC practice can be performed following the next steps:
\begin{itemize}
\item Assess the quality of raw data
\item Identify possible factors that impact the data
\item Apply the tools to improve the data
\item Assess the quality again and evaluate the results
\end{itemize}
\bigskip
Preferably this can be done in a precompiled pipeline
\end{pframe}
\section{Questions?}
\lastpagetemplate
\begin{pframe}
\begin{center}
Acknowledgements:
\bigskip
\bigskip
Jeroen Laros
\bigskip
Martijn Vermaat
\bigskip
Jeroen Frank
\bigskip
LGTC
\end{center}
\end{pframe}
\end{document}
../presentation/ul_logo.eps
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment