Merge branch 'leiden_2014' of git.lumc.nl:humgen/ngs-intro-course into leiden_2014

1495d0b1 · Laros · 7e21f0ee · 51b66d40 · 1495d0b1 · 1495d0b1
Commit 1495d0b1 authored 10 years ago by Laros
--- a/phylogenetic_reconstruction/ngi_logo.eps
+++ b/phylogenetic_reconstruction/ngi_logo.eps
+../presentation/ngi_logo.eps
\ No newline at end of file
--- a/phylogenetic_reconstruction/nwo_logo_en.eps
+++ b/phylogenetic_reconstruction/nwo_logo_en.eps
+../presentation/nwo_logo_en.eps
\ No newline at end of file
--- a/phylogenetic_reconstruction/nwo_logo_nl.eps
+++ b/phylogenetic_reconstruction/nwo_logo_nl.eps
+../presentation/nwo_logo_nl.eps
\ No newline at end of file
--- a/phylogenetic_reconstruction/phylogenetic_reconstruction.tex
+++ b/phylogenetic_reconstruction/phylogenetic_reconstruction.tex
+\documentclass[slidestop]{beamer}
+\title{Phylogenetic reconstruction}
+\providecommand{\myConference}{NGS introduction}
+\providecommand{\myDate}{Thursday, 22 May 2014}
+\author{Michiel van Galen}
+\providecommand{\myGroup}{Leiden Genome Technology Center}
+\providecommand{\myDepartment}{Department of Human Genetics}
+\providecommand{\myCenter}{Center for Human and Clinical Genetics}
+\providecommand{\lastCenterLogo}{
+  \raisebox{-0.1cm}{
+    %\includegraphics[height=1cm]{lgtc_logo}
+    %\includegraphics[height=0.7cm]{ngi_logo}
+  }
+}
+\providecommand{\lastRightLogo}{
+  %\includegraphics[height=0.7cm]{nbic_logo}
+  %\includegraphics[height=0.8cm]{nwo_logo_en}
+  %\hspace{1.5cm}\includegraphics[height=0.7cm]{gen2phen_logo}
+}
+\usetheme{lumc}
+\begin{document}
+% This disables the \pause command, handy in the editing phase.
+%\renewcommand{\pause}{}
+% Make the title page.
+\bodytemplate
+% First page of the presentation.
+\section{Material}
+\subsection{Input and goal}
+\begin{pframe}
+  \begin{itemize}
+    \item Sequence data available for different strains of bacteria
+    \item One FastQ file per strain
+  \end{itemize}
+    \bigskip
+    NGS throughput is much higher than that of conventional methods (Sanger
+sequencing), increasing the chances of new insights.
+    \bigskip
+    However, there are few solutions available to accommodate this magnitude in
+the field of phylogenetic reconstruction.
+\end{pframe}
+\section{Methods}
+\subsection{Naive approach}
+\begin{pframe}
+  \begin{figure}
+    \centering
+    \includegraphics[width=0.75\textwidth]{previous}
+  \end{figure}
+\end{pframe}
+\subsection{Naive approach}
+\begin{pframe}
+  The early workflow, adapted from Sanger sequencing, suffered from some limitations:
+  \bigskip
+  \begin{itemize}
+    \item Difficult to reproduce
+    \item Poorly documented
+    \item Using unconventional methods 
+    \item Not parallelized
+    \item Susceptible to errors
+    \item Customization or modification nearly impossible
+    \item Stops at the tree construction
+  \end{itemize}
+\end{pframe}
+\subsection{From bundle of scripts to pipeline}
+\begin{pframe}
+  Re-factor the workflow into a complete pipeline
+  \bigskip
+  \begin{itemize}
+    \item Convert the workflow to an automated pipeline
+    \item Replace custom scripts with maintained existing tools and methods
+    \item Include cluster support
+    \item Improve usability and customization 
+  \end{itemize}
+\end{pframe}
+\section{Pipeline}
+\subsection{Breakdown of the pipeline}
+\begin{pframe}
+  The workflow can be roughly broken down into two parts
+  \bigskip
+  \begin{itemize}
+    \item Per sample part - Analyze the samples separately
+    \item Merged part - Combine output for each sample
+  \end{itemize}
+\end{pframe}
+\subsection{Per sample part}
+\begin{pframe}
+  These steps are the same for each sample and can be parallelized
+  \begin{itemize}
+    \item Add QC - Standard tools
+    \item Alignment to canonical reference - BWA
+    \item Variant calling and filtering - Samtools
+    \item Mask variants in repeated regions - BEDtools
+  \end{itemize}
+\end{pframe}
+\subsection{Merged part, combining the output}
+\begin{pframe}
+  \begin{itemize}
+    \item Compare the variants between strains - Python
+    \begin{itemize}
+      \item Merge the variant files into one matrix - VCFtools
+    \end{itemize}
+    \bigskip
+    \item Use PHYLIP to infer an evolutionary tree
+    \begin{itemize}
+      \item Create distance matrix (dnadist)
+      \item Create a phylogenetic tree
+    \end{itemize}
+  \end{itemize}
+\end{pframe}
+\section{Current situation}
+\subsection{Implementation}
+\begin{pframe}
+  The pipeline is designed to run on the LUMC Shark cluster
+  \begin{itemize}
+    \item All tools are available and maintained 
+    \item Pipeline is written in Make, compatible to run in parallel
+    \item Reduced the number of custom scripts to just one
+    \begin{itemize}
+      \item Not reinventing the wheel, outsource support for tools
+    \end{itemize}
+  \end{itemize}
+\end{pframe}
+\section{Future work}
+\subsection{Possible expansions}
+\begin{pframe}
+  \begin{itemize}
+    \item Improve usability even more
+    \begin{itemize}
+      \item User friendly interface 
+      \item More automation
+    \end{itemize}
+    \bigskip
+    \item kMer analysis
+    \begin{itemize}
+      \item Proven to work on meta-genomic datasets
+    \end{itemize}
+  \end{itemize}
+\end{pframe}
+\subsection{kMer}
+\begin{pframe}
+  \begin{itemize}
+    \item Calculate distance between samples based on occurrences of words of length k
+  \end{itemize}
+  \begin{figure}
+    \centering
+    \includegraphics[width=0.40\textwidth]{clusterall_bw.ps}
+  \end{figure}
+\end{pframe}
+\section{Conclusion}
+\begin{pframe}
+  Summarizing: 
+  \bigskip
+  \begin{itemize}
+    \item Much room for pipeline development and automation
+    \item Apply existing tools where possible to reduce development time
+    \item Data is relatively small compared to human data, making our
+    infrastructure well prepared
+  \end{itemize}
+\end{pframe}
+\section{Questions?}
+\lastpagetemplate
+\begin{pframe}
+  \begin{center}
+    Acknowledgements:
+    \bigskip
+    \bigskip
+    Wilco Knetsch
+    \bigskip
+    Jeroen Laros
+    \bigskip
+    Martijn Vermaat
+    \bigskip
+    Jeroen Frank
+    \bigskip
+    LGTC    
+  \end{center}
+\end{pframe}
+\end{document}
--- a/phylogenetic_reconstruction/phylogenetic_reconstruction.tex.bak
+++ b/phylogenetic_reconstruction/phylogenetic_reconstruction.tex.bak
+\documentclass[slidestop]{beamer}
+\title{Phylogenetic reconstruction}
+\providecommand{\myConference}{NGS introduction}
+\providecommand{\myDate}{Thursday, 22 May 2014}
+\author{Michiel van Galen}
+\providecommand{\myGroup}{Leiden Genome Technology Center}
+\providecommand{\myDepartment}{Department of Human Genetics}
+\providecommand{\myCenter}{Center for Human and Clinical Genetics}
+\providecommand{\lastCenterLogo}{
+  \raisebox{-0.1cm}{
+    %\includegraphics[height=1cm]{lgtc_logo}
+    %\includegraphics[height=0.7cm]{ngi_logo}
+  }
+}
+\providecommand{\lastRightLogo}{
+  %\includegraphics[height=0.7cm]{nbic_logo}
+  %\includegraphics[height=0.8cm]{nwo_logo_en}
+  %\hspace{1.5cm}\includegraphics[height=0.7cm]{gen2phen_logo}
+}
+\usetheme{lumc}
+\begin{document}
+% This disables the \pause command, handy in the editing phase.
+%\renewcommand{\pause}{}
+% Make the title page.
+\bodytemplate
+% First page of the presentation.
+\section{Material}
+\subsection{Input and goal}
+\begin{pframe}
+  \begin{itemize}
+    \item Sequence data available for different strains of bacteria
+    \item One FastQ file per strain
+  \end{itemize}
+    \bigskip
+    NGS throughput is much higher than that of conventional methods (Sanger
+sequencing), increasing the chances of new insights.
+    \bigskip
+    However, there are few solutions available to accommodate this magnitude in
+the field of phylogenetic reconstruction.
+\end{pframe}
+\section{Methods}
+\subsection{Naive approach}
+\begin{pframe}
+  \begin{figure}
+    \centering
+    \includegraphics[width=0.75\textwidth]{previous}
+  \end{figure}
+\end{pframe}
+\subsection{Naive approach}
+\begin{pframe}
+  Early workflow adapted from Sanger suffered from some limitations:
+  \bigskip
+  \begin{itemize}
+    \item Difficult to reproduce
+    \item Poorly documented
+    \item Using unconventional methods 
+    \item Not parallelized
+    \item Susceptible to errors
+    \item Customization or modification nearly impossible
+    \item Stops at the tree construction
+  \end{itemize}
+\end{pframe}
+\subsection{From bundle of scripts to pipeline}
+\begin{pframe}
+  Refactor the workflow into a complete pipeline
+  \bigskip
+  \begin{itemize}
+    \item Convert the workflow to an automated pipeline
+    \item Replace custom scripts with maintained existing tools and methods
+    \item Include cluster support
+    \item Improve usability and customization 
+  \end{itemize}
+\end{pframe}
+\section{Pipeline}
+\subsection{Breakdown of the pipeline}
+\begin{pframe}
+  The workflow can be roughly broken down into two parts
+  \bigskip
+  \begin{itemize}
+    \item Per sample part - Analyze the samples separately
+    \item Merged part - Combine output for each sample
+  \end{itemize}
+\end{pframe}
+\subsection{Per sample part}
+\begin{pframe}
+  These steps are the same for each sample and can be parallelized
+  \begin{itemize}
+    \item Add QC - Standard tools
+    \item Alignment to canonical reference - BWA
+    \item Variant calling and filtering - Samtools
+    \item Mask variants in repeated regions - BEDtools
+  \end{itemize}
+\end{pframe}
+\subsection{Merged part, combining the output}
+\begin{pframe}
+  \begin{itemize}
+    \item Compare the variants between strains - Python
+    \begin{itemize}
+      \item Merge the variant files into one matrix - VCFtools
+    \end{itemize}
+    \bigskip
+    \item Use PHYLIP to infer an evolutionary tree
+    \begin{itemize}
+      \item Create distance matrix (dnadist)
+      \item Create a phylogenetic tree
+    \end{itemize}
+  \end{itemize}
+\end{pframe}
+\section{Current situation}
+\subsection{Implementation}
+\begin{pframe}
+  The pipeline is designed to run on the LUMC Shark cluster
+  \begin{itemize}
+    \item All tools are available and maintained 
+    \item Pipeline is written in Make, compatible to run in parallel
+    \item Reduced the number of custom scripts to just one
+    \begin{itemize}
+      \item Not reinventing the wheel, outsource support for tools
+    \end{itemize}
+  \end{itemize}
+\end{pframe}
+\section{Future work}
+\subsection{Possible expansions}
+\begin{pframe}
+  \begin{itemize}
+    \item Improve usability even more
+    \begin{itemize}
+      \item User friendly interface 
+      \item More automation
+    \end{itemize}
+    \bigskip
+    \item kMer analysis
+    \begin{itemize}
+      \item Proven to work on metagenomic datasets
+    \end{itemize}
+  \end{itemize}
+\end{pframe}
+\subsection{kMer}
+\begin{pframe}
+  \begin{itemize}
+    \item Calculate distance between samples based on occurrences of words of length k
+  \end{itemize}
+  \begin{figure}
+    \centering
+    \includegraphics[width=0.40\textwidth]{clusterall_bw.ps}
+  \end{figure}
+\end{pframe}
+\section{Conclusion}
+\begin{pframe}
+  Summarizing: 
+  \bigskip
+  \begin{itemize}
+    \item Much room for pipeline development and automation
+    \item Apply existing tools where possible to reduce development time
+    \item Data is relatively small compared to human data making our
+    infrastructure well prepared
+  \end{itemize}
+\end{pframe}
+\section{Questions?}
+\lastpagetemplate
+\begin{pframe}
+  \begin{center}
+    Acknowledgements:
+    \bigskip
+    \bigskip
+    Wilco Knetsch
+    \bigskip
+    Jeroen Laros
+    \bigskip
+    Martijn Vermaat
+    \bigskip
+    Jeroen Frank
+    \bigskip
+    LGTC    
+  \end{center}
+\end{pframe}
+\end{document}
--- a/phylogenetic_reconstruction/previous.png
+++ b/phylogenetic_reconstruction/previous.png
--- a/phylogenetic_reconstruction/ul_logo.eps
+++ b/phylogenetic_reconstruction/ul_logo.eps
+../presentation/ul_logo.eps
\ No newline at end of file
--- a/quality_control/Makefile
+++ b/quality_control/Makefile
+../presentation/Makefile
\ No newline at end of file
--- a/quality_control/adapter_sequencing.png
+++ b/quality_control/adapter_sequencing.png
+../presentation-pics/pics/adapter_sequencing.png
\ No newline at end of file
--- a/quality_control/beamerthemelumc.sty
+++ b/quality_control/beamerthemelumc.sty
+../presentation/beamerthemelumc.sty
\ No newline at end of file
--- a/quality_control/garbage.jpg
+++ b/quality_control/garbage.jpg
+../presentation-pics/pics/garbage-in-garbage-out.jpg
\ No newline at end of file
--- a/quality_control/gen2phen_logo.eps
+++ b/quality_control/gen2phen_logo.eps
+../presentation/gen2phen_logo.eps
\ No newline at end of file
--- a/quality_control/lgtc_logo.eps
+++ b/quality_control/lgtc_logo.eps
+../presentation/lgtc_logo.eps
\ No newline at end of file
--- a/quality_control/lumc_logo.eps
+++ b/quality_control/lumc_logo.eps
+../presentation/lumc_logo.eps
\ No newline at end of file
--- a/quality_control/lumc_logo_small.eps
+++ b/quality_control/lumc_logo_small.eps
+../presentation/lumc_logo_small.eps
\ No newline at end of file
--- a/quality_control/nbic_logo.eps
+++ b/quality_control/nbic_logo.eps
+../presentation/nbic_logo.eps
\ No newline at end of file
--- a/quality_control/ngi_logo.eps
+++ b/quality_control/ngi_logo.eps
+../presentation/ngi_logo.eps
\ No newline at end of file
--- a/quality_control/nwo_logo_en.eps
+++ b/quality_control/nwo_logo_en.eps
+../presentation/nwo_logo_en.eps
\ No newline at end of file
--- a/quality_control/nwo_logo_nl.eps
+++ b/quality_control/nwo_logo_nl.eps
+../presentation/nwo_logo_nl.eps
\ No newline at end of file
--- a/quality_control/pretrimmed_qscores.png
+++ b/quality_control/pretrimmed_qscores.png
+../presentation-pics/pics/pretrimmed_qscores.png
\ No newline at end of file