diff --git a/grp_for_data_analysis/Makefile b/grp_for_data_analysis/Makefile new file mode 120000 index 0000000000000000000000000000000000000000..199dff7226a84dcdd0c281699009a50ee16432d5 --- /dev/null +++ b/grp_for_data_analysis/Makefile @@ -0,0 +1 @@ +../presentation/Makefile \ No newline at end of file diff --git a/grp_for_data_analysis/beamerthemelumc.sty b/grp_for_data_analysis/beamerthemelumc.sty new file mode 120000 index 0000000000000000000000000000000000000000..999deb4e197ab5cd127d06a12d63c324f88ec711 --- /dev/null +++ b/grp_for_data_analysis/beamerthemelumc.sty @@ -0,0 +1 @@ +../presentation/beamerthemelumc.sty \ No newline at end of file diff --git a/grp_for_data_analysis/gen2phen_logo.eps b/grp_for_data_analysis/gen2phen_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..0f2636661f1c186560f22691e6a5172e1bc3f7a8 --- /dev/null +++ b/grp_for_data_analysis/gen2phen_logo.eps @@ -0,0 +1 @@ +../presentation/gen2phen_logo.eps \ No newline at end of file diff --git a/grp_for_data_analysis/grp_for_data_analysis.tex b/grp_for_data_analysis/grp_for_data_analysis.tex new file mode 100644 index 0000000000000000000000000000000000000000..075f8aa5b0d8014b2318baf79900547bec342ec0 --- /dev/null +++ b/grp_for_data_analysis/grp_for_data_analysis.tex @@ -0,0 +1,65 @@ +\documentclass[slidestop]{beamer} + +\title{Phylogenetic reconstruction} +\providecommand{\myConference}{NGS introduction} +\providecommand{\myDate}{Thursday, 22 May 2014} +\author{Michiel van Galen} +\providecommand{\myGroup}{Leiden Genome Technology Center} +\providecommand{\myDepartment}{Department of Human Genetics} +\providecommand{\myCenter}{Center for Human and Clinical Genetics} +\providecommand{\lastCenterLogo}{ + \raisebox{-0.1cm}{ + %\includegraphics[height=1cm]{lgtc_logo} + %\includegraphics[height=0.7cm]{ngi_logo} + } +} +\providecommand{\lastRightLogo}{ + %\includegraphics[height=0.7cm]{nbic_logo} + %\includegraphics[height=0.8cm]{nwo_logo_en} + 
%\hspace{1.5cm}\includegraphics[height=0.7cm]{gen2phen_logo} +} + +\usetheme{lumc} + +\begin{document} + +% This disables the \pause command, handy in the editing phase. +%\renewcommand{\pause}{} + +% Make the title page. +\bodytemplate + +% First page of the presentation. +\section{Introduction} +\subsection{Some slide} +\begin{pframe} + + \begin{itemize} + \item The \emph{section} command controls the title. + \item The \emph{subsection} command controls the frametitle. + \end{itemize} +\end{pframe} + +\section{Questions?} +\lastpagetemplate +\begin{pframe} + \begin{center} + Acknowledgements: + \bigskip + \bigskip + + Jeroen Laros + \bigskip + + Martijn Vermaat + \bigskip + + Jeroen Frank + \bigskip + + LGTC + + \end{center} +\end{pframe} + +\end{document} diff --git a/grp_for_data_analysis/lgtc_logo.eps b/grp_for_data_analysis/lgtc_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..1732a7e9e5917e6840c9e5977fcff78334758d07 --- /dev/null +++ b/grp_for_data_analysis/lgtc_logo.eps @@ -0,0 +1 @@ +../presentation/lgtc_logo.eps \ No newline at end of file diff --git a/grp_for_data_analysis/lumc_logo.eps b/grp_for_data_analysis/lumc_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..28075f649e2e97c354f0edcb8499aaa716802566 --- /dev/null +++ b/grp_for_data_analysis/lumc_logo.eps @@ -0,0 +1 @@ +../presentation/lumc_logo.eps \ No newline at end of file diff --git a/grp_for_data_analysis/lumc_logo_small.eps b/grp_for_data_analysis/lumc_logo_small.eps new file mode 120000 index 0000000000000000000000000000000000000000..a5544fe55e1326788ad0ab37c560dc40c9adf29e --- /dev/null +++ b/grp_for_data_analysis/lumc_logo_small.eps @@ -0,0 +1 @@ +../presentation/lumc_logo_small.eps \ No newline at end of file diff --git a/grp_for_data_analysis/nbic_logo.eps b/grp_for_data_analysis/nbic_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..8780a0131e16e899fa17c12d886c4135f36e933d --- /dev/null +++ 
b/grp_for_data_analysis/nbic_logo.eps @@ -0,0 +1 @@ +../presentation/nbic_logo.eps \ No newline at end of file diff --git a/grp_for_data_analysis/ngi_logo.eps b/grp_for_data_analysis/ngi_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..2a2e1ea9b13da5f50916f33c34e8b5607022962d --- /dev/null +++ b/grp_for_data_analysis/ngi_logo.eps @@ -0,0 +1 @@ +../presentation/ngi_logo.eps \ No newline at end of file diff --git a/grp_for_data_analysis/nwo_logo_en.eps b/grp_for_data_analysis/nwo_logo_en.eps new file mode 120000 index 0000000000000000000000000000000000000000..adcf12fd16e511dbe8213f21ba684611f13291bf --- /dev/null +++ b/grp_for_data_analysis/nwo_logo_en.eps @@ -0,0 +1 @@ +../presentation/nwo_logo_en.eps \ No newline at end of file diff --git a/grp_for_data_analysis/nwo_logo_nl.eps b/grp_for_data_analysis/nwo_logo_nl.eps new file mode 120000 index 0000000000000000000000000000000000000000..67830c9ef8ee053f74c8510ac87c2e8a4a4f88dc --- /dev/null +++ b/grp_for_data_analysis/nwo_logo_nl.eps @@ -0,0 +1 @@ +../presentation/nwo_logo_nl.eps \ No newline at end of file diff --git a/grp_for_data_analysis/ul_logo.eps b/grp_for_data_analysis/ul_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..cd04dcb9c72ebdde50fda7ee41e375e053fc31b9 --- /dev/null +++ b/grp_for_data_analysis/ul_logo.eps @@ -0,0 +1 @@ +../presentation/ul_logo.eps \ No newline at end of file diff --git a/phylogenetic_reconstruction/Makefile b/phylogenetic_reconstruction/Makefile new file mode 120000 index 0000000000000000000000000000000000000000..199dff7226a84dcdd0c281699009a50ee16432d5 --- /dev/null +++ b/phylogenetic_reconstruction/Makefile @@ -0,0 +1 @@ +../presentation/Makefile \ No newline at end of file diff --git a/phylogenetic_reconstruction/beamerthemelumc.sty b/phylogenetic_reconstruction/beamerthemelumc.sty new file mode 120000 index 0000000000000000000000000000000000000000..999deb4e197ab5cd127d06a12d63c324f88ec711 --- /dev/null +++ 
b/phylogenetic_reconstruction/beamerthemelumc.sty @@ -0,0 +1 @@ +../presentation/beamerthemelumc.sty \ No newline at end of file diff --git a/phylogenetic_reconstruction/clusterall_bw.ps b/phylogenetic_reconstruction/clusterall_bw.ps new file mode 120000 index 0000000000000000000000000000000000000000..6c6991fd9c2e19d7e4e5323b286333741a2d0d33 --- /dev/null +++ b/phylogenetic_reconstruction/clusterall_bw.ps @@ -0,0 +1 @@ +../presentation-pics/pics/clusterall_bw.ps \ No newline at end of file diff --git a/phylogenetic_reconstruction/gen2phen_logo.eps b/phylogenetic_reconstruction/gen2phen_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..0f2636661f1c186560f22691e6a5172e1bc3f7a8 --- /dev/null +++ b/phylogenetic_reconstruction/gen2phen_logo.eps @@ -0,0 +1 @@ +../presentation/gen2phen_logo.eps \ No newline at end of file diff --git a/phylogenetic_reconstruction/lgtc_logo.eps b/phylogenetic_reconstruction/lgtc_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..1732a7e9e5917e6840c9e5977fcff78334758d07 --- /dev/null +++ b/phylogenetic_reconstruction/lgtc_logo.eps @@ -0,0 +1 @@ +../presentation/lgtc_logo.eps \ No newline at end of file diff --git a/phylogenetic_reconstruction/lumc_logo.eps b/phylogenetic_reconstruction/lumc_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..28075f649e2e97c354f0edcb8499aaa716802566 --- /dev/null +++ b/phylogenetic_reconstruction/lumc_logo.eps @@ -0,0 +1 @@ +../presentation/lumc_logo.eps \ No newline at end of file diff --git a/phylogenetic_reconstruction/lumc_logo_small.eps b/phylogenetic_reconstruction/lumc_logo_small.eps new file mode 120000 index 0000000000000000000000000000000000000000..a5544fe55e1326788ad0ab37c560dc40c9adf29e --- /dev/null +++ b/phylogenetic_reconstruction/lumc_logo_small.eps @@ -0,0 +1 @@ +../presentation/lumc_logo_small.eps \ No newline at end of file diff --git a/phylogenetic_reconstruction/nbic_logo.eps 
b/phylogenetic_reconstruction/nbic_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..8780a0131e16e899fa17c12d886c4135f36e933d --- /dev/null +++ b/phylogenetic_reconstruction/nbic_logo.eps @@ -0,0 +1 @@ +../presentation/nbic_logo.eps \ No newline at end of file diff --git a/phylogenetic_reconstruction/ngi_logo.eps b/phylogenetic_reconstruction/ngi_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..2a2e1ea9b13da5f50916f33c34e8b5607022962d --- /dev/null +++ b/phylogenetic_reconstruction/ngi_logo.eps @@ -0,0 +1 @@ +../presentation/ngi_logo.eps \ No newline at end of file diff --git a/phylogenetic_reconstruction/nwo_logo_en.eps b/phylogenetic_reconstruction/nwo_logo_en.eps new file mode 120000 index 0000000000000000000000000000000000000000..adcf12fd16e511dbe8213f21ba684611f13291bf --- /dev/null +++ b/phylogenetic_reconstruction/nwo_logo_en.eps @@ -0,0 +1 @@ +../presentation/nwo_logo_en.eps \ No newline at end of file diff --git a/phylogenetic_reconstruction/nwo_logo_nl.eps b/phylogenetic_reconstruction/nwo_logo_nl.eps new file mode 120000 index 0000000000000000000000000000000000000000..67830c9ef8ee053f74c8510ac87c2e8a4a4f88dc --- /dev/null +++ b/phylogenetic_reconstruction/nwo_logo_nl.eps @@ -0,0 +1 @@ +../presentation/nwo_logo_nl.eps \ No newline at end of file diff --git a/phylogenetic_reconstruction/phylogenetic_reconstruction.tex b/phylogenetic_reconstruction/phylogenetic_reconstruction.tex new file mode 100644 index 0000000000000000000000000000000000000000..2a19d7d428a97a743ae7f56f98e96d118eadc101 --- /dev/null +++ b/phylogenetic_reconstruction/phylogenetic_reconstruction.tex @@ -0,0 +1,207 @@ +\documentclass[slidestop]{beamer} +\title{Phylogenetic reconstruction} +\providecommand{\myConference}{NGS introduction} +\providecommand{\myDate}{Thursday, 22 May 2014} +\author{Michiel van Galen} +\providecommand{\myGroup}{Leiden Genome Technology Center} +\providecommand{\myDepartment}{Department of Human Genetics} 
+\providecommand{\myCenter}{Center for Human and Clinical Genetics} +\providecommand{\lastCenterLogo}{ + \raisebox{-0.1cm}{ + %\includegraphics[height=1cm]{lgtc_logo} + %\includegraphics[height=0.7cm]{ngi_logo} + } +} +\providecommand{\lastRightLogo}{ + %\includegraphics[height=0.7cm]{nbic_logo} + %\includegraphics[height=0.8cm]{nwo_logo_en} + %\hspace{1.5cm}\includegraphics[height=0.7cm]{gen2phen_logo} +} + +\usetheme{lumc} + +\begin{document} + +% This disables the \pause command, handy in the editing phase. +%\renewcommand{\pause}{} + +% Make the title page. +\bodytemplate + +% First page of the presentation. +\section{Material} +\subsection{Input and goal} +\begin{pframe} + \begin{itemize} + \item Sequence data available for different strains of bacteria + \item One FastQ file per strain + \end{itemize} + \bigskip + + NGS throughput is much higher compared to conventional methods (Sanger +sequencing), increasing the chances of new insights. + \bigskip + + However, there are few solutions available to accommodate the magnitude in +the field of phylogenetic reconstruction. 
+\end{pframe} + +\section{Methods} +\subsection{Naive approach} +\begin{pframe} + \begin{figure} + \centering + \includegraphics[width=0.75\textwidth]{previous} + \end{figure} +\end{pframe} + +\subsection{Naive approach} +\begin{pframe} + Early workflow adapted from Sanger suffered from some limitations: + \bigskip + + \begin{itemize} + \item Difficult to reproduce + \item Poorly documented + \item Using unconventional methods + \item Not parallelized + \item Susceptible to errors + \item Customization or modification nearly impossible + \item Stops at the tree construction + \end{itemize} +\end{pframe} + +\subsection{From bundle of scripts to pipeline} +\begin{pframe} + Refactor the workflow into a complete pipeline + \bigskip + + \begin{itemize} + \item Convert the workflow to an automated pipeline + \item Replace custom scripts with maintained existing tools and methods + \item Include cluster support + \item Improve usability and customization + \end{itemize} +\end{pframe} + +\section{Pipeline} +\subsection{Breakdown of the pipeline} +\begin{pframe} + The workflow can be roughly broken down into two parts + \bigskip + + \begin{itemize} + \item Per sample part - Analyze the samples separately + \item Merged part - Combine output for each sample + \end{itemize} +\end{pframe} + +\subsection{Per sample part} +\begin{pframe} + These steps are the same for each sample and can be parallelized + \begin{itemize} + \item Add QC - Standard tools + \item Alignment to canonical reference - BWA + \item Variant calling and filtering - Samtools + \item Mask variants in repeated regions - BEDtools + \end{itemize} +\end{pframe} + +\subsection{Merged part, combining the output} +\begin{pframe} + \begin{itemize} + \item Compare the variants between strains - Python + \begin{itemize} + \item Merge the variant files into one matrix - VCFtools + \end{itemize} + \bigskip + + \item Use PHYLIP to infer an evolutionary tree + \begin{itemize} + \item Create distance matrix (dnadist) + 
\item Create a phylogenetic tree + \end{itemize} + \end{itemize} +\end{pframe} + +\section{Current situation} +\subsection{Implementation} +\begin{pframe} + The pipeline is designed to run on the LUMC Shark cluster + \begin{itemize} + \item All tools are available and maintained + \item Pipeline is written in Make, compatible to run in parallel + \item Reduced the number of custom scripts to just one + \begin{itemize} + \item Not reinventing the wheel, outsource support for tools + \end{itemize} + \end{itemize} +\end{pframe} + +\section{Future work} +\subsection{Possible expansions} +\begin{pframe} + \begin{itemize} + \item Improve usability even more + \begin{itemize} + \item User friendly interface + \item More automation + \end{itemize} + \bigskip + + \item kMer analysis + \begin{itemize} + \item Proven to work on metagenomic datasets + \end{itemize} + \end{itemize} +\end{pframe} + +\subsection{kMer} +\begin{pframe} + \begin{itemize} + \item Calculate distance between samples based on occurrences of words of length k + \end{itemize} + \begin{figure} + \centering + \includegraphics[width=0.40\textwidth]{clusterall_bw.ps} + \end{figure} +\end{pframe} + +\section{Conclusion} +\begin{pframe} + Summarizing: + \bigskip + + \begin{itemize} + \item Much room for pipeline development and automation + \item Applying existing tools where possible reduces development time + \item Data is relatively small compared to human data, making our + infrastructure well prepared + \end{itemize} +\end{pframe} + +\section{Questions?} +\lastpagetemplate +\begin{pframe} + \begin{center} + Acknowledgements: + \bigskip + \bigskip + + Wilco Knetsch + \bigskip + + Jeroen Laros + \bigskip + + Martijn Vermaat + \bigskip + + Jeroen Frank + \bigskip + + LGTC + \end{center} +\end{pframe} + +\end{document} diff --git a/phylogenetic_reconstruction/phylogenetic_reconstruction.tex.bak b/phylogenetic_reconstruction/phylogenetic_reconstruction.tex.bak new file mode 100644 index 
0000000000000000000000000000000000000000..e8046edb35b9de04b4aa1db3540d338ae9a0ba93 --- /dev/null +++ b/phylogenetic_reconstruction/phylogenetic_reconstruction.tex.bak @@ -0,0 +1,207 @@ +\documentclass[slidestop]{beamer} +\title{Phylogenetic reconstruction} +\providecommand{\myConference}{NGS introduction} +\providecommand{\myDate}{Thursday, 22 May 2014} +\author{Michiel van Galen} +\providecommand{\myGroup}{Leiden Genome Technology Center} +\providecommand{\myDepartment}{Department of Human Genetics} +\providecommand{\myCenter}{Center for Human and Clinical Genetics} +\providecommand{\lastCenterLogo}{ + \raisebox{-0.1cm}{ + %\includegraphics[height=1cm]{lgtc_logo} + %\includegraphics[height=0.7cm]{ngi_logo} + } +} +\providecommand{\lastRightLogo}{ + %\includegraphics[height=0.7cm]{nbic_logo} + %\includegraphics[height=0.8cm]{nwo_logo_en} + %\hspace{1.5cm}\includegraphics[height=0.7cm]{gen2phen_logo} +} + +\usetheme{lumc} + +\begin{document} + +% This disables the \pause command, handy in the editing phase. +%\renewcommand{\pause}{} + +% Make the title page. +\bodytemplate + +% First page of the presentation. +\section{Material} +\subsection{Input and goal} +\begin{pframe} + \begin{itemize} + \item Sequence data available for different strains of bacteria + \item One FastQ file per strain + \end{itemize} + \bigskip + + NGS throughput is much higher compared to conventional methods (Sanger +sequencing). Increasing the chances on new insights. + \bigskip + + However, there is little solutions available to accomodate the magnitude in +the field of phylogenetic reconstruction. 
+\end{pframe} + +\section{Methods} +\subsection{Naive approach} +\begin{pframe} + \begin{figure} + \centering + \includegraphics[width=0.75\textwidth]{previous} + \end{figure} +\end{pframe} + +\subsection{Naive approach} +\begin{pframe} + Early workflow adapted from Sanger suffered from some limitations: + \bigskip + + \begin{itemize} + \item Difficult to reproduce + \item Poorly documented + \item Using unconventional methods + \item Not parallelized + \item Susceptible to errors + \item Customization or modification nearly impossible + \item Stops at the tree construction + \end{itemize} +\end{pframe} + +\subsection{From bundle of scripts to pipeline} +\begin{pframe} + Refactor the workflow into a complete pipeline + \bigskip + + \begin{itemize} + \item Convert the workflow to an automated pipeline + \item Replace custom scripts with maintained existing tools and methods + \item Include cluster support + \item Improve usability and customization + \end{itemize} +\end{pframe} + +\section{Pipeline} +\subsection{Breakdown of the pipeline} +\begin{pframe} + The workflow can be roughly broken down into two parts + \bigskip + + \begin{itemize} + \item Per sample part - Analyze the samples seperately + \item Merged part - Combine output for each sample + \end{itemize} +\end{pframe} + +\subsection{Per sample part} +\begin{pframe} + These steps are for each sample the same and can be parallelized + \begin{itemize} + \item Add QC - Standard tools + \item Alignment to canonical reference - BWA + \item Variant calling and filtering - Samtools + \item Mask variants in repeated regions - BEDtools + \end{itemize} +\end{pframe} + +\subsection{Merged part, combining the output} +\begin{pframe} + \begin{itemize} + \item Compare the variants between strains - Python + \begin{itemize} + \item Merge the variant files into one matrix - VCFtools + \end{itemize} + \bigskip + + \item Use PHYLIP to infer a evolutionary tree + \begin{itemize} + \item Create distance matrix (dnadist) + 
\item Create a phylogenetic tree + \end{itemize} + \end{itemize} +\end{pframe} + +\section{Current situation} +\subsection{Implementation} +\begin{pframe} + The pipeline is designed to run on the LUMC Shark cluster + \begin{itemize} + \item All tools are available and maintained + \item Pipeline is written in Make, compatible to run in parallel + \item Reduced the number of custom scripts to just one + \begin{itemize} + \item Not reinventing the wheel, outsource support for tools + \end{itemize} + \end{itemize} +\end{pframe} + +\section{Future work} +\subsection{Possible expansions} +\begin{pframe} + \begin{itemize} + \item Improve usabilty even more + \begin{itemize} + \item User friendly interface + \item More automation + \end{itemize} + \bigskip + + \item kMer analysis + \begin{itemize} + \item Proven to work on metagenomic datasets + \end{itemize} + \end{itemize} +\end{pframe} + +\subsection{kMer} +\begin{pframe} + \begin{itemize} + \item Calculate distance between samples based on occurences of words of length k + \end{itemize} + \begin{figure} + \centering + \includegraphics[width=0.40\textwidth]{clusterall_bw.ps} + \end{figure} +\end{pframe} + +\section{Conclusion} +\begin{pframe} + Summarizing: + \bigskip + + \begin{itemize} + \item Much room for pipeline development and automation + \item Apply existing tools where possible reduce development time + \item Data is relatively small compared to human data making our + infrastructure well prepared + \end{itemize} +\end{pframe} + +\section{Questions?} +\lastpagetemplate +\begin{pframe} + \begin{center} + Acknowledgements: + \bigskip + \bigskip + + Wilco Knetsch + \bigskip + + Jeroen Laros + \bigskip + + Martijn Vermaat + \bigskip + + Jeroen Frank + \bigskip + + LGTC + \end{center} +\end{pframe} + +\end{document} diff --git a/phylogenetic_reconstruction/previous.png b/phylogenetic_reconstruction/previous.png new file mode 100644 index 
0000000000000000000000000000000000000000..2ce6b948e06330ee81cbedfbdcdb8ec31de057d4 Binary files /dev/null and b/phylogenetic_reconstruction/previous.png differ diff --git a/phylogenetic_reconstruction/ul_logo.eps b/phylogenetic_reconstruction/ul_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..cd04dcb9c72ebdde50fda7ee41e375e053fc31b9 --- /dev/null +++ b/phylogenetic_reconstruction/ul_logo.eps @@ -0,0 +1 @@ +../presentation/ul_logo.eps \ No newline at end of file diff --git a/quality_control/Makefile b/quality_control/Makefile new file mode 120000 index 0000000000000000000000000000000000000000..199dff7226a84dcdd0c281699009a50ee16432d5 --- /dev/null +++ b/quality_control/Makefile @@ -0,0 +1 @@ +../presentation/Makefile \ No newline at end of file diff --git a/quality_control/adapter_sequencing.png b/quality_control/adapter_sequencing.png new file mode 120000 index 0000000000000000000000000000000000000000..bfe79ed062e512378c5338fee817cd2ccbfd45a9 --- /dev/null +++ b/quality_control/adapter_sequencing.png @@ -0,0 +1 @@ +../presentation-pics/pics/adapter_sequencing.png \ No newline at end of file diff --git a/quality_control/beamerthemelumc.sty b/quality_control/beamerthemelumc.sty new file mode 120000 index 0000000000000000000000000000000000000000..999deb4e197ab5cd127d06a12d63c324f88ec711 --- /dev/null +++ b/quality_control/beamerthemelumc.sty @@ -0,0 +1 @@ +../presentation/beamerthemelumc.sty \ No newline at end of file diff --git a/quality_control/garbage.jpg b/quality_control/garbage.jpg new file mode 120000 index 0000000000000000000000000000000000000000..61a423cbb01cb64188a533e57b960af3f6aa0a7c --- /dev/null +++ b/quality_control/garbage.jpg @@ -0,0 +1 @@ +../presentation-pics/pics/garbage-in-garbage-out.jpg \ No newline at end of file diff --git a/quality_control/gen2phen_logo.eps b/quality_control/gen2phen_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..0f2636661f1c186560f22691e6a5172e1bc3f7a8 --- 
/dev/null +++ b/quality_control/gen2phen_logo.eps @@ -0,0 +1 @@ +../presentation/gen2phen_logo.eps \ No newline at end of file diff --git a/quality_control/lgtc_logo.eps b/quality_control/lgtc_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..1732a7e9e5917e6840c9e5977fcff78334758d07 --- /dev/null +++ b/quality_control/lgtc_logo.eps @@ -0,0 +1 @@ +../presentation/lgtc_logo.eps \ No newline at end of file diff --git a/quality_control/lumc_logo.eps b/quality_control/lumc_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..28075f649e2e97c354f0edcb8499aaa716802566 --- /dev/null +++ b/quality_control/lumc_logo.eps @@ -0,0 +1 @@ +../presentation/lumc_logo.eps \ No newline at end of file diff --git a/quality_control/lumc_logo_small.eps b/quality_control/lumc_logo_small.eps new file mode 120000 index 0000000000000000000000000000000000000000..a5544fe55e1326788ad0ab37c560dc40c9adf29e --- /dev/null +++ b/quality_control/lumc_logo_small.eps @@ -0,0 +1 @@ +../presentation/lumc_logo_small.eps \ No newline at end of file diff --git a/quality_control/nbic_logo.eps b/quality_control/nbic_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..8780a0131e16e899fa17c12d886c4135f36e933d --- /dev/null +++ b/quality_control/nbic_logo.eps @@ -0,0 +1 @@ +../presentation/nbic_logo.eps \ No newline at end of file diff --git a/quality_control/ngi_logo.eps b/quality_control/ngi_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..2a2e1ea9b13da5f50916f33c34e8b5607022962d --- /dev/null +++ b/quality_control/ngi_logo.eps @@ -0,0 +1 @@ +../presentation/ngi_logo.eps \ No newline at end of file diff --git a/quality_control/nwo_logo_en.eps b/quality_control/nwo_logo_en.eps new file mode 120000 index 0000000000000000000000000000000000000000..adcf12fd16e511dbe8213f21ba684611f13291bf --- /dev/null +++ b/quality_control/nwo_logo_en.eps @@ -0,0 +1 @@ +../presentation/nwo_logo_en.eps \ No newline at end of 
file diff --git a/quality_control/nwo_logo_nl.eps b/quality_control/nwo_logo_nl.eps new file mode 120000 index 0000000000000000000000000000000000000000..67830c9ef8ee053f74c8510ac87c2e8a4a4f88dc --- /dev/null +++ b/quality_control/nwo_logo_nl.eps @@ -0,0 +1 @@ +../presentation/nwo_logo_nl.eps \ No newline at end of file diff --git a/quality_control/pretrimmed_qscores.png b/quality_control/pretrimmed_qscores.png new file mode 120000 index 0000000000000000000000000000000000000000..5fe738b8450d4dd0ee4d30f4dffc386f3f0fc5d5 --- /dev/null +++ b/quality_control/pretrimmed_qscores.png @@ -0,0 +1 @@ +../presentation-pics/pics/pretrimmed_qscores.png \ No newline at end of file diff --git a/quality_control/quality_control.tex b/quality_control/quality_control.tex new file mode 100644 index 0000000000000000000000000000000000000000..877b9000832f29eaa726f8d0b946b5e48a741138 --- /dev/null +++ b/quality_control/quality_control.tex @@ -0,0 +1,251 @@ +\documentclass[slidestop]{beamer} +\title{Quality control} +\providecommand{\myConference}{NGS introduction} +\providecommand{\myDate}{Thursday, 22 May 2014} +\author{Michiel van Galen} +\providecommand{\myGroup}{Leiden Genome Technology Center} +\providecommand{\myDepartment}{Department of Human Genetics} +\providecommand{\myCenter}{Center for Human and Clinical Genetics} +\providecommand{\lastCenterLogo}{ + \raisebox{-0.1cm}{ + %\includegraphics[height=1cm]{lgtc_logo} + %\includegraphics[height=0.7cm]{ngi_logo} + } +} +\providecommand{\lastRightLogo}{ + %\includegraphics[height=0.7cm]{nbic_logo} + %\includegraphics[height=0.8cm]{nwo_logo_en} + %\hspace{1.5cm}\includegraphics[height=0.7cm]{gen2phen_logo} +} + +\usetheme{lumc} + +\begin{document} + +% This disables the \pause command, handy in the editing phase. +%\renewcommand{\pause}{} + +% Make the title page. 
+\bodytemplate + +\section{Introduction} +\subsection{Overview} +\begin{pframe} + \begin{itemize} + \item Data and the flaws + \item Quality control basics + \item Tools and advanced methods + \end{itemize} +\end{pframe} + +\subsection{The data} +\begin{pframe} + \begin{itemize} + \item FastQ: Expanded two line Fasta format + \item Four lines per entry + \item Sequence and per base phred quality combined + \item Beware of different score offsets + \end{itemize} + \begin{lstlisting}[caption={FastQ format}] + @SEQ_ID + GATTTGGGGTTCAAAGCAGTA + + + !''*((((***+))%%%++)( + \end{lstlisting} +\end{pframe} + +\subsection{The flaws} +\begin{pframe} + At any point from the start of the experiment until beginning analyses, + quality can be jeopardized. + \bigskip + + \begin{itemize} + \item Gathering material and sample prep + \begin{itemize} + \item Contamination, degradation, sample swap + \end{itemize} + \item Sequencing + \begin{itemize} + \item Exhausted chemicals, technical issues + \end{itemize} + \item Data integrity + \begin{itemize} + \item File corruption + \end{itemize} + \item Many other unexpected external factors + \end{itemize} +\end{pframe} + +\subsection{The consequence} +\begin{pframe} + \bigskip + + Low quality greatly influences the downstream analyses. + \bigskip + + \begin{figure} + \caption{Garbage in garbage out} + \centering + \includegraphics[width=0.5\textwidth]{garbage} + \end{figure} +\end{pframe} + +\section{Quality control basics} +\subsection{Quality assessment} +\begin{pframe} + \begin{itemize} + \item FastQC: A quality control tool for high throughput sequence data. 
+ \item Assess the quality of your data in a fastq file + \end{itemize} + \begin{figure} + \caption{FastQC} + \centering + \includegraphics[width=0.5\textwidth]{pretrimmed_qscores} + \end{figure} +\end{pframe} + +\subsection{Data properties} +\begin{pframe} + Properties which can indicate possible biases in your data: + \begin{itemize} + \item Quality scores - Higher is better + \item GC content - Expected vs observed + \item Duplication rate - Lower is usually better + \item N content - Less is more + \item Adapter contaminants - More adapter, less sample + \item kMer statistics - Expected vs observed + \end{itemize} +\end{pframe} + +\subsection{Improving your data} +\begin{pframe} + After identification of some issues, correction may be possible + \bigskip + + \begin{itemize} + \item Low quality bases can be discarded + \item Adapter sequences can be removed + \item Downstream analyses can be tailored to identified problems + \end{itemize} +\end{pframe} + +\subsection{Quality trimming} +\begin{pframe} + \begin{itemize} + \item Getting rid of low quality bases + \item Only want to maintain the high-quality bases + \end{itemize} + \begin{lstlisting}[language=none, caption={}] + @Header + ACGTACGTACGT + + + !#II!JJJI##! 
+ + Will result in: + --GTACGTA--- + \end{lstlisting} +\end{pframe} + +\subsection{Clipping adapters} +\begin{pframe} + \begin{itemize} + \item FastQC can identify adapter contaminants which can hamper later analyses + \item Specific tools can remove these specific sequences + \end{itemize} + \begin{figure} + \caption{Adapter Sequencing} + \centering + \includegraphics[width=0.8\textwidth]{adapter_sequencing} + \end{figure} +\end{pframe} + +\subsection{Digital data quality} +\begin{pframe} + Digital data can also be of low quality + \bigskip + + \begin{itemize} + \item Hardware failure + \begin{itemize} + \item Data corruption, insufficient disk space + \end{itemize} + \item Human failure + \begin{itemize} + \item Sample swaps, unclear file names, incomplete copies + \end{itemize} + \end{itemize} +\end{pframe} + +\section{Tools and advanced methods} +\subsection{kMer analysis} +\begin{pframe} + \begin{itemize} + \item Analyzing the frequencies of words of length K + \item Proven to detect all sorts of factors which influence the data + \begin{itemize} + \item Contamination, quality, duplication + \end{itemize} + \item Also used to determine sample complexity + \end{itemize} +\end{pframe} + +\subsection{Overview of tools} +\begin{pframe} + \begin{itemize} + \item{Quality assessment} + \begin{itemize} + \item FastQC, kMer, QCDB + \end{itemize} + \item{Trimming} + \begin{itemize} + \item Sickle: A windowed adaptive trimming tool + \end{itemize} + \item{Adapter clipping} + \begin{itemize} + \item Cutadapt + \end{itemize} + \item{File integrity} + \begin{itemize} + \item Md5checksums, GRP + \end{itemize} + \end{itemize} +\end{pframe} + +\subsection{QC process} +\begin{pframe} + Good QC practice can be performed following the next steps: + \begin{itemize} + \item Assess the quality of raw data + \item Identify possible factors that impact the data + \item Apply the tools to improve the data + \item Assess the quality again and evaluate the results + \end{itemize} + 
\bigskip + + Preferably this can be done in a precompiled pipeline +\end{pframe} + +\section{Questions?} +\lastpagetemplate +\begin{pframe} + \begin{center} + Acknowledgements: + \bigskip + \bigskip + + Jeroen Laros + \bigskip + + Martijn Vermaat + \bigskip + + Jeroen Frank + \bigskip + + LGTC + + \end{center} +\end{pframe} + +\end{document} diff --git a/quality_control/quality_control.tex.bak b/quality_control/quality_control.tex.bak new file mode 100644 index 0000000000000000000000000000000000000000..5b3c16ab9ae58ca5fc863529b5c8627e0bd09d01 --- /dev/null +++ b/quality_control/quality_control.tex.bak @@ -0,0 +1,251 @@ +\documentclass[slidestop]{beamer} +\title{Quality control} +\providecommand{\myConference}{Work discussion} +\providecommand{\myDate}{Thursday, 22 May 2014} +\author{Michiel van Galen} +\providecommand{\myGroup}{Leiden Genome Technology Center} +\providecommand{\myDepartment}{Department of Human Genetics} +\providecommand{\myCenter}{Center for Human and Clinical Genetics} +\providecommand{\lastCenterLogo}{ + \raisebox{-0.1cm}{ + %\includegraphics[height=1cm]{lgtc_logo} + %\includegraphics[height=0.7cm]{ngi_logo} + } +} +\providecommand{\lastRightLogo}{ + %\includegraphics[height=0.7cm]{nbic_logo} + %\includegraphics[height=0.8cm]{nwo_logo_en} + %\hspace{1.5cm}\includegraphics[height=0.7cm]{gen2phen_logo} +} + +\usetheme{lumc} + +\begin{document} + +% This disables the \pause command, handy in the editing phase. +%\renewcommand{\pause}{} + +% Make the title page. 
+\bodytemplate + +\section{Introduction} +\subsection{Overview} +\begin{pframe} + \begin{itemize} + \item Data and the flaws + \item Quality control basics + \item Tools and advanced methods + \end{itemize} +\end{pframe} + +\subsection{The data} +\begin{pframe} + \begin{itemize} + \item FastQ: Expanded two line Fasta format + \item Four lines per entry + \item Sequence and per base phred quality combined + \item Beware of different score offsets + \end{itemize} + \begin{lstlisting}[caption={FastQ format}] + @SEQ_ID + GATTTGGGGTTCAAAGCAGTA + + + !''*((((***+))%%%++)( + \end{lstlisting} +\end{pframe} + +\subsection{The flaws} +\begin{pframe} + At any point from the start of the experiment until beginning analyses, + quality can be jeopardized. + \bigskip + + \begin{itemize} + \item Gathering material and sample prep + \begin{itemize} + \item Contamination, degradation, sample swap + \end{itemize} + \item Sequencing + \begin{itemize} + \item Exhausted chemicals, technical issues + \end{itemize} + \item Data integrity + \begin{itemize} + \item File corruption + \end{itemize} + \item Many other unexpected external factors + \end{itemize} +\end{pframe} + +\subsection{The consequence} +\begin{pframe} + \bigskip + + Low quality greatly influences the downstream analyses. + \bigskip + + \begin{figure} + \caption{Garbage in garbage out} + \centering + \includegraphics[width=0.5\textwidth]{garbage} + \end{figure} +\end{pframe} + +\section{Quality control basics} +\subsection{Quality assessment} +\begin{pframe} + \begin{itemize} + \item FastQC: A quality control tool for high throughput sequence data. 
+ \item Assess the quality of your data in a fastq file + \end{itemize} + \begin{figure} + \caption{FastQC} + \centering + \includegraphics[width=0.5\textwidth]{pretrimmed_qscores} + \end{figure} +\end{pframe} + +\subsection{Data properties} +\begin{pframe} + Properties which can indicate possible biases in your data: + \begin{itemize} + \item Quality scores - Higher is better + \item GC content - Expected vs observed + \item Duplication rate - Lower is usually better + \item N content - Less is more + \item Adapter contaminants - More adapter, less sample + \item kMer statistics - Expected vs observed + \end{itemize} +\end{pframe} + +\subsection{Improving your data} +\begin{pframe} + After identification of some issues, correction may be possible + \bigskip + + \begin{itemize} + \item Low quality bases can be discarded + \item Adapter sequences can be removed + \item Downstream analyses can be tailored to identified problems + \end{itemize} +\end{pframe} + +\subsection{Quality trimming} +\begin{pframe} + \begin{itemize} + \item Getting rid of low quality bases + \item Only want to maintain the high-quality bases + \end{itemize} + \begin{lstlisting}[language=none, caption={}] + @Header + ACGTACGTACGT + + + !#II!JJJI##! 
+ + Will result in: + --GTACGTA--- + \end{lstlisting} +\end{pframe} + +\subsection{Clipping adapters} +\begin{pframe} + \begin{itemize} + \item FastQC can identify adapter contaminants which can hamper later analyses + \item Specific tools can remove these specific sequences + \end{itemize} + \begin{figure} + \caption{Adapter Sequencing} + \centering + \includegraphics[width=0.8\textwidth]{adapter_sequencing} + \end{figure} +\end{pframe} + +\subsection{Digital data quality} +\begin{pframe} + Digital data can also be of low quality + \bigskip + + \begin{itemize} + \item Hardware failure + \begin{itemize} + \item Data corruption, insufficient disk space + \end{itemize} + \item Human failure + \begin{itemize} + \item Sample swaps, unclear filenames, incomplete copies + \end{itemize} + \end{itemize} +\end{pframe} + +\section{Tools and advanced methods} +\subsection{kMer analysis} +\begin{pframe} + \begin{itemize} + \item Analyzing the frequencies of words of length K + \item Proven to detect all sorts of factors which influence the data + \begin{itemize} + \item Contamination, quality, duplication + \end{itemize} + \item Also used to determine sample complexity + \end{itemize} +\end{pframe} + +\subsection{Overview of tools} +\begin{pframe} + \begin{itemize} + \item{Quality assessment} + \begin{itemize} + \item FastQC, kMer, QCDB + \end{itemize} + \item{Trimming} + \begin{itemize} + \item Sickle: A windowed adaptive trimming tool + \end{itemize} + \item{Adapter clipping} + \begin{itemize} + \item Cutadapt + \end{itemize} + \item{File integrity} + \begin{itemize} + \item Md5checksums, GRP + \end{itemize} + \end{itemize} +\end{pframe} + +\subsection{QC process} +\begin{pframe} + Good QC practice can be performed following the next steps: + \begin{itemize} + \item Assess the quality of raw data + \item Identify possible factors that impact the data + \item Apply the tools to improve the data + \item Assess the quality again and evaluate the results + \end{itemize} + 
\bigskip + + Preferably this can be done in a precompiled pipeline +\end{pframe} + +\section{Questions?} +\lastpagetemplate +\begin{pframe} + \begin{center} + Acknowledgements: + \bigskip + \bigskip + + Jeroen Laros + \bigskip + + Martijn Vermaat + \bigskip + + Jeroen Frank + \bigskip + + LGTC + + \end{center} +\end{pframe} + +\end{document} diff --git a/quality_control/ul_logo.eps b/quality_control/ul_logo.eps new file mode 120000 index 0000000000000000000000000000000000000000..cd04dcb9c72ebdde50fda7ee41e375e053fc31b9 --- /dev/null +++ b/quality_control/ul_logo.eps @@ -0,0 +1 @@ +../presentation/ul_logo.eps \ No newline at end of file