Commit bdf85b3b authored by Laros's avatar Laros
Browse files

Added lecture.

parent 6f767e39
\documentclass[slidestop]{beamer}
\author{Jeroen F.J. Laros}
\title{High throughput NGS data analysis}
\providecommand{\mySubTitle}{}
\providecommand{\myConference}{SIG Bioinformatics in Medical Microbiology NL}
\providecommand{\myDate}{17-09-2018}
\providecommand{\myGroup}{}
\providecommand{\myDepartment}{Research Software Engineering, LUMC}
\providecommand{\myCenter}{Program manager bioinformatics, RIVM}
\usetheme{lumc}
\begin{document}
\lstset{language=make, mathescape=false}
\input{petrinet}
% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}
% Make the title slide.
\makeTitleSlide{} %\includegraphics[height=3.5cm]{logos/lumc_logo_small}}
% First page of the presentation.
\section{Introduction}
\makeTableOfContents
\section{Pipelines}
\subsection{Early NGS sequencing}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}[]
\begin{center}
\includegraphics[width=0.8\textwidth]{illuminagenomeanalyzer}
\end{center}
\pause
\caption{Genome Analyzer II.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}[]
\begin{center}
\includegraphics[trim=1cm 1.5cm 1cm 1, clip, width=0.5\textwidth]
{k_marjolein}
\end{center}
\caption{Marjolein Kriek.}
\end{figure}
Sequencing took nine months.
\begin{itemize}
\item Data analysis mostly by hand.
\end{itemize}
\end{minipage}
\end{pframe}
\begin{pframe}
Routine data analysis:
\begin{itemize}
\item First pipelines were written in Perl.
\item Pipelines were executed on a single machine.
\end{itemize}
\bigskip
Drawbacks:
\begin{itemize}
\item Single threaded.
\item Use of \lstinline{system()} calls.
\end{itemize}
\bigskip
Later pipelines in Bash.
\begin{itemize}
\item Easier to work with external programs.
\end{itemize}
\end{pframe}
\subsection{NGS for routine diagnostics}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}
\includegraphics[width=\textwidth, trim=0 40 0 0, clip]{hiseq_2000}
\caption{HiSeq 2500.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
Characteristics:
\begin{itemize}
\item High throughput ($3$~genomes).
\item Paired end.
\item High accuracy.
\item Read length $2 \times 125$~bp.
\item Run time ($6$~days).
\end{itemize}
\end{minipage}
\end{pframe}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}
\includegraphics[width=0.95\textwidth]{DellBlade4}
\caption{Dell M610 blade server.}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
Necessity to run multiple analyses at the same time.
\begin{itemize}
\item Linux SGE cluster.
\item Central (Isilon) storage.
\end{itemize}
\bigskip
Current specifications:
\begin{itemize}
\item $51$ nodes providing $908$ cores.
\item $1$~PB HPC storage.
\item $1.5$~PB archive.
\end{itemize}
\end{minipage}
\end{pframe}
\begin{pframe}
Limitations of conventional scripting languages.
\begin{itemize}
\item Manual identification of parallel steps.
\item Explicit job submission.
\item Explicit job monitoring.
\end{itemize}
\bigskip
Pipelines became large and unmanageable.
\bigskip
Need for a framework that takes care of parallelisation.
Separation of the workflow from the computational infrastructure.
\end{pframe}
\begin{pframe}
%\frametitle{A different viewpoint}
What if\ldots
\begin{itemize}
\item All commands are atomic.
\begin{itemize}
\item We describe input and output.
\end{itemize}
\item We build a \emph{dependency graph}.
\item Trace a path in this graph to find a workflow.
\end{itemize}
\bigskip
\pause
This way we do not need to:
\begin{itemize}
\item Design a workflow.
\item Figure out which parts can be run in parallel.
\end{itemize}
\end{pframe}
\begin{pframe}
\begin{figure}
\petrinet{\bs{fastq}}{\bs{fq.f}}{\bs{stats}}{\bs{bam}}{\bs{report}}{
\underline{\bsi{fastx}}}{\underline{\bsi{bwa}}}{\underline{\bsi{latex}}}{
red}
\caption{A parallel workflow.}
\end{figure}
\end{pframe}
\begin{pframe}
\lstinline{make} is an implementation of this idea.
\begin{itemize}
\item Works on a single machine as well as on a cluster.
\item Used for the clinical genetics pipeline until a few months ago.
\end{itemize}
\bigskip
\pause
Drawbacks:
\begin{itemize}
\item Old language with awkward syntax.
\item No scoping.
\item Difficult to debug.
\end{itemize}
\bigskip
\pause
We now use \lstinline{Snakemake} for all of our diagnostics pipelines.
\vfill
\permfoot{\url{https://www.gnu.org/software/make/}}
\permfoot{\url{https://snakemake.readthedocs.io/}}
\end{pframe}
\section{Stable pipelines}
\subsection{Virtual environments}
\begin{pframe}
To have stable pipelines in a shared environment, some form of isolation is
needed.
\bigskip
Package manager / environment management system.
\begin{itemize}
\item Conda.
\item GNU Guix.
\end{itemize}
\bigskip
Compartmentalisation:
\begin{itemize}
\item Docker.
\item Singularity.
\end{itemize}
\bigskip
Separate the pipeline and its dependencies from the computational
infrastructure.
\end{pframe}
\subsection{Version control}
\begin{pframe}
Our pipelines are stored in Git.
\bigskip
Configuration of virtual environments.
\begin{itemize}
\item Fixed version numbers.
\end{itemize}
\bigskip
For reproducibility:
\begin{itemize}
\item Use releases.
\item Store the Git commit hash for every pipeline run.
\end{itemize}
\vfill
\permfoot{\url{https://git.lumc.nl/}}
\end{pframe}
\subsection{Summary}
\begin{pframe}
Manageable pipelines:
\begin{itemize}
\item Separate the workflow from the computational infrastructure.
\end{itemize}
\bigskip
Stable pipelines:
\begin{itemize}
\item Separate the pipeline and its dependencies from the computational
infrastructure.
\item Full control of dependencies.
\end{itemize}
\bigskip
\end{pframe}
\section{Fast analysis: NIPT}
\subsection{Detection of fetal chromosomal aberrations}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
Large events:
\begin{itemize}
\item Trisomies.
\begin{itemize}
\item 13, 18 and 21.
\end{itemize}
\item Large deletions or duplications.
\end{itemize}
\bigskip
Primary targets:
\begin{itemize}
\item Patau syndrome (13).
\item Edwards syndrome (18).
\item Down syndrome (21).
\end{itemize}
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}[]
\begin{center}
\includegraphics[width=0.8\textwidth]{down_syndrome}
\end{center}
\caption{Down syndrome.}
\end{figure}
\end{minipage}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{NIPT_story}
\end{center}
\caption{Free-floating DNA in the maternal bloodstream.}
\end{figure}
\end{pframe}
\subsection{Sample handling}
\begin{pframe}
Isolation:
\begin{itemize}
\item From blood plasma instead of white blood cells.
\end{itemize}
\bigskip
Sample preparation:
\begin{itemize}
\item Whole genome sequencing.
\end{itemize}
\bigskip
Sequencing on HiSeq 4000:
\begin{itemize}
\item Low pass ($\pm15$M reads).
\item Batches of $96$ samples.
\item One or two batches on one flowcell.
\end{itemize}
\end{pframe}
\subsection{Visualisation}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight, trim=25 0 0 0, clip]{NIPT_visualisation}
\end{center}
\caption{WISECONDOR CNV calls.}
\end{figure}
\end{pframe}
\subsection{Original pipeline}
\begin{pframe}
The pipeline is straightforward:
\begin{itemize}
\item Alignment to the reference genome (GRCh37, hg19).
\item Deduplication.
\item CNV calling.
\end{itemize}
\bigskip
Original run time: $6$ hours, but with up to $500$ samples per week.
\begin{itemize}
\item $20$ minutes per sample.
\end{itemize}
\bigskip
\pause
Additional constraint: $8$ (office) hours turnaround time.
\begin{itemize}
\item $2.5$ minutes per sample ($2$ batches of $96$ samples).
\pause
\item Implementation time of two months.
\end{itemize}
\end{pframe}
\subsection{Optimisation}
\begin{pframe}
Improvements on the pipeline:
\begin{itemize}
\item Other aligner (\lstinline{BWA mem}).
\item Other computational framework (\lstinline{Snakemake}).
\end{itemize}
\bigskip
These improvements reduced the run time to around $40$ minutes per sample.
\bigskip
\pause
Parallel processing on the LUMC cluster.
\begin{itemize}
\item Up to $200$ cores for this project.
\item Between $90$ and $180$ minutes per batch.
\begin{itemize}
\item Depending on the number of batches per flowcell.
\pause
\item One batch in $1.5$ to $3$ hours.
\end{itemize}
\end{itemize}
\end{pframe}
\section{Production data analysis}
\subsection{Fully automatic pipeline runs}
\begin{pframe}
Prerequisites:
\begin{itemize}
\item Input data.
\item Metadata of the input data.
\begin{itemize}
\item Which files belong to which sample.
\item Read pair information.
\end{itemize}
\item Additional sample information (LIMS).
\begin{itemize}
\item Pedigree information.
\item Gene panel.
\end{itemize}
\item Pipeline.
\end{itemize}
\bigskip
\end{pframe}
\subsection{Computational infrastructure}
\begin{pframe}
Observation:
\begin{itemize}
\item Much of the automated data analysis is extremely complex.
\begin{itemize}
\item Many implicit dependencies and workflows.
\end{itemize}
\item Virtually impossible to transfer to other people.
\begin{itemize}
\item Problematic in diagnostics.
\end{itemize}
\end{itemize}
\bigskip
Our approach:
\begin{itemize}
\item Highly specialised microservices.
\begin{itemize}
\item Do only one thing and do it well.
\end{itemize}
\end{itemize}
\end{pframe}
\begin{pframe}
Reproducibility and automation.
\bigskip
Computational infrastructure:
\begin{itemize}
\item Transfer: gatekeeper for \emph{production data}.
\item Cerana: Project conductor.
\item Florea: API adapter for the HPC cluster.
\item Amegilla: API adapter for legacy / proprietary systems.
\end{itemize}
\vfill
\permfoot{\url{https://git.lumc.nl/groups/apis}}
\end{pframe}
\subsection{Transfer server}
\begin{pframe}
Previous situation:
\begin{itemize}
\item We receive data on hard disks or via an sFTP server.
\item Frequently missing or wrong metadata, mixups.
\end{itemize}
\bigskip
Current situation:
\begin{itemize}
\item The \emph{consumer} of the data decides what is to be sent.
\begin{itemize}
\item Sample IDs and grouping, QC metrics, \ldots
\end{itemize}
\item The data is only accepted when this metadata is valid.
\end{itemize}
\bigskip
\pause
Gain:
\begin{itemize}
\item $24$\% rejected transfers.
\end{itemize}
\end{pframe}
\subsection{Cerana, the project conductor}
\begin{pframe}
Gather data needed to run a data analysis.
\begin{itemize}
\item Files (e.g., from the transfer server).
\item Metadata (e.g., trio information from a LIMS system).
\end{itemize}
\bigskip
About the metadata:
\begin{itemize}
\item The order is not relevant.
\item The data can come from any (authorised) source.
\end{itemize}
\bigskip
When all data is available, a signal is sent to an actor that runs the
analysis.
\end{pframe}
\subsection{Florea, the pipeline runner}
\begin{pframe}
Start a pipeline on the cluster.
\begin{itemize}
\item Get the pipeline configuration from our GitLab system.
\item Start the pipeline.
\item Monitor the pipeline progress.
\item Keep track of the status (query via the API).
\end{itemize}
\bigskip
Once the pipeline is finished, a status update is reported back to the
submitter of the pipeline run.
\end{pframe}
\subsection{Underlying infrastructure}
\begin{pframe}
Security:
\begin{itemize}
\item Encryption and identity management with X.509.
\end{itemize}
\bigskip
Interfaces:
\begin{itemize}
\item Fully documented APIs.
\begin{itemize}
\item Only open standards.
\end{itemize}
\end{itemize}
\bigskip
All actions are stored in a database that can be queried via the API.
\end{pframe}
\begin{pframe}
Any of these services may fail.
\bigskip
Well-defined interface:
\begin{itemize}
\item Easy to make backup procedures.
\end{itemize}
\bigskip
Full (online) documentation.
\begin{itemize}
\item Bypass any of the microservices.
\item Bypass everything.
\end{itemize}
\end{pframe}
\subsection{Use case: NIPT}
\begin{pframe}
\begin{lstlisting}[language=none, caption={Summary of run $36$.}]
d2e88cea-9d7c-4c1c-b788-b20385d830d8
Name: 103033-036
Delivered on: 14-06-2017 20:40:53
Started on: 14-06-2017 18:58:45
Duration: 1:42:08
State: successful
Samples: 96
Remarks:
\end{lstlisting}
Overviews can be made by querying the relevant systems.
\bigskip
\pause
Over $21,\!000$ samples per year ($28$\% of the Dutch total).
\end{pframe}
\subsection{Exome sequencing}
\begin{pframe}
We run four production pipelines at the moment.
\bigskip
Whole exome sequencing:
\begin{itemize}
\item $20$~GB per sample ($50$ million reads).
\item $5,\!000$ samples per year (LUMC, EMC).
\item $100$ samples per week on average.
\item Takes half a year of computation time.
\item We deliver in $24$ hours.
\end{itemize}
\end{pframe}
% Make the acknowledgements slide.
\makeAcknowledgementsSlide{
\begin{tabular}{ll}
Jonathan Vis\\
Mark Santcroos\\
Martijn Vermaat\\
Michel Villerius\\
Sander Bollen\\
Johan den Dunnen\\
\end{tabular}
}
\end{document}
../../submodules/presentation-pics/pics/DellBlade4.png
\ No newline at end of file
../../submodules/presentation/Makefile
\ No newline at end of file
../../submodules/presentation-pics/pics/NIPT_story.jpg
\ No newline at end of file
../../submodules/presentation-pics/pics/NIPT_visualisation.png
\ No newline at end of file
../../submodules/presentation/beamerthemelumc.sty
\ No newline at end of file
../../submodules/presentation-pics/pics/down_syndrome.jpg
\ No newline at end of file
../../submodules/presentation-pics/pics/hiseq_2000.jpg