Commit c472296b authored by Laros's avatar Laros
Browse files

Added NIPT data analysis lecture.

parent 612f85c4
../../submodules/presentation/Makefile
\ No newline at end of file
../../submodules/presentation-pics/pics/NIPT_story.jpg
\ No newline at end of file
../../submodules/presentation-pics/pics/NIPT_visualisation.png
\ No newline at end of file
../../submodules/presentation/beamerthemelumc.sty
\ No newline at end of file
../../submodules/presentation-pics/pics/down_syndrome.jpg
\ No newline at end of file
../../submodules/presentation/logos
\ No newline at end of file
% Preamble of the beamer presentation "NIPT data analysis".
\documentclass[slidestop]{beamer}
\author{Jeroen F.J. Laros}
\title{NIPT data analysis}
% Presentation metadata. \providecommand only defines a macro when it does not
% exist yet, so a value defined earlier (e.g., by a wrapper) is kept.
% NOTE(review): the \my... macros are presumably read by the lumc theme loaded
% below (title and footer material) -- confirm in beamerthemelumc.sty.
\providecommand{\mySubTitle}{and high throughput automation}
\providecommand{\myConference}{GenomeScan lecture series}
\providecommand{\myDate}{19-06-2017}
% \myGroup is defined but left empty.
\providecommand{\myGroup}{}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}
% Load the theme named "lumc" (file beamerthemelumc.sty).
\usetheme{lumc}
\begin{document}
% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}
% Make the title slide.
\makeTitleSlide{\includegraphics[height=2.5cm]{NIPT_story}}
% First page of the presentation.
\section{Introduction}
\makeTableOfContents
\section{Background}
\subsection{Detection of fetal chromosomal aberrations}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
Large events:
\begin{itemize}
\item Trisomies.
\begin{itemize}
\item 13, 18 and 21.
\end{itemize}
\item Large deletions or duplications.
\end{itemize}
\bigskip
Primary targets:
\begin{itemize}
\item Patau syndrome (13).
\item Edwards syndrome (18).
\item Down syndrome (21).
\end{itemize}
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}[]
\begin{center}
\includegraphics[width=0.8\textwidth]{down_syndrome}
\end{center}
\caption{Down syndrome.}
\end{figure}
\end{minipage}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{NIPT_story}
\end{center}
\caption{Free floating DNA in the maternal bloodstream.}
\end{figure}
\end{pframe}
\subsection{Sample handling}
\begin{pframe}
Isolation:
\begin{itemize}
\item From blood plasma instead of white blood cells.
\end{itemize}
\bigskip
Sample preparation:
\begin{itemize}
\item Whole genome sequencing.
\end{itemize}
\bigskip
Sequencing on HiSeq 4000:
\begin{itemize}
\item Low pass (${\sim}15$M reads).
\item Batches of $96$ samples.
\item One or two batches on one flowcell.
\end{itemize}
\end{pframe}
\section{Data analysis}
\subsection{WISECONDOR}
\begin{pframe}
Within sample comparison:
\begin{itemize}
\item Slight increase in coverage for one of the chromosomes.
\item $z$-scores.
\end{itemize}
\bigskip
Two stages:
\begin{itemize}
\item Reference set (once).
\item Per sample analysis.
\end{itemize}
\vfill
\permfoot{Straver et al., WISECONDOR: detection of fetal aberrations from
shallow sequencing maternal plasma based on a within-sample comparison
scheme, NAR, 2014.}
\end{pframe}
\begin{pframe}
Standard score ($z$-score):
\begin{displaymath}
z = \frac{x - \mu}{\sigma}
\end{displaymath}
where:
\begin{itemize}
\item $\mu$ is the mean.
\item $\sigma$ is the standard deviation.
\end{itemize}
\bigskip
Now we can choose \emph{one} threshold for calling.
\end{pframe}
% Frame showing the z-transformation figure; an extra line is drawn on top of
% the image on the second overlay.
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{z_transform}
% The overlay below is only shown on slide 2 of this frame.
\onslide<2>{
% Zero-sized picture: it takes up no room, so the image is not displaced.
\begin{picture}(0, 0)(0, 0)
\linethickness{2pt}
% Horizontal line of length 80 (in \unitlength units) starting 80 units to the
% left of and 21 units above the picture origin.
% NOTE(review): the coordinates look hand-tuned to the z_transform image;
% re-check them when the image or its height changes.
\put(-80, 21){\line(1, 0){80}}
\end{picture}
}
\end{center}
\caption{$z$-transformation.}
\end{figure}
\end{pframe}
\begin{pframe}
This idea is applied to \emph{bins}.
\bigskip
Preparation using the reference set:
\begin{itemize}
\item The genome is divided into equally sized bins.
\item The reference set is used to identify bins that behave similarly.
\end{itemize}
\bigskip
Analysis for a sample:
\begin{itemize}
\item For each bin, get the set of ``similar'' bins.
\item See if the $z$-score of this bin is higher than $3$.
\end{itemize}
\end{pframe}
\section{Results}
\subsection{Visualisation}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight, trim=25 0 0 0, clip]{NIPT_visualisation}
\end{center}
\caption{WISECONDOR CNV calls.}
\end{figure}
\end{pframe}
\begin{pframe}
We see a call in almost every bin in chromosome 21.
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth, trim=100 30 500 610, clip]{NIPT_visualisation}
\end{center}
\caption{WISECONDOR CNV calls for chromosomes 19 and 21.}
\end{figure}
\end{pframe}
\section{Pipeline}
\subsection{Original pipeline}
\begin{pframe}
The pipeline is straightforward:
\begin{itemize}
\item Alignment to the reference genome (GRCh37, hg19).
\item Deduplication.
\item CNV calling.
\end{itemize}
\bigskip
Original run time: $5$ to $6$ hours.
\bigskip
We expect up to $500$ samples per week.
\begin{itemize}
\item $20$ minutes per sample.
\item No room for delays.
\end{itemize}
\end{pframe}
\subsection{Updates}
\begin{pframe}
Improvements on the pipeline:
\begin{itemize}
\item Other aligner (\lstinline{BWA mem}).
\item Other computational framework (\lstinline{Snakemake}).
\end{itemize}
These improvements brought the runtime back to around $40$ minutes per sample.
\bigskip
Parallel processing on the LUMC cluster.
\begin{itemize}
\item Up to $200$ cores for this project.
\item Between $90$ and $180$ minutes per batch.
\begin{itemize}
\item Depending on the number of batches per flowcell.
\end{itemize}
\end{itemize}
\end{pframe}
\section{Production data analysis}
\subsection{Computational infrastructure}
\begin{pframe}
Observation:
\begin{itemize}
\item Much of the automated data analysis is extremely complex.
\begin{itemize}
\item Many implicit dependencies and workflows.
\end{itemize}
\item Virtually impossible to transfer to other people.
\begin{itemize}
\item Problematic in diagnostics.
\end{itemize}
\end{itemize}
\bigskip
Our approach:
\begin{itemize}
\item Highly specialised microservices.
\begin{itemize}
\item Do only one thing and do it well.
\end{itemize}
\end{itemize}
\end{pframe}
\begin{pframe}
Reproducibility and automation.
\bigskip
Computational infrastructure:
\begin{itemize}
\item Transfer: gatekeeper for \emph{production data}.
\item Cerana: Project conductor.
\item Florea: API adapter for the HPC cluster.
\item Amegilla: API adapter for legacy / proprietary systems.
\end{itemize}
%\bigskip
%Common framework:
%\begin{itemize}
% \item Nginx, REST.
% \item X.509 certificates.
%\end{itemize}
\vfill
\permfoot{\url{https://git.lumc.nl/groups/apis}}
\end{pframe}
\subsection{Transfer server}
\begin{pframe}
Previous situation:
\begin{itemize}
\item We receive data on hard disks or via an sFTP server.
\item Frequently missing or wrong metadata, mixups.
\end{itemize}
\bigskip
Current situation:
\begin{itemize}
\item The \emph{consumer} of the data decides what is to be sent.
\begin{itemize}
\item Sample IDs and grouping, QC metrics, \ldots
\end{itemize}
\item The data is only accepted when this metadata is valid.
\end{itemize}
\bigskip
\pause
Some statistics:
\begin{itemize}
\item $17,\!600$ files sent in $131$ transfers.
\item $41$ ($24$\%) rejected transfers.
\end{itemize}
\end{pframe}
\subsection{Cerana, the project conductor}
\begin{pframe}
Gather data needed to run a data analysis.
\begin{itemize}
\item Files (e.g., from the transfer server).
\item Metadata (e.g., trio information from a LIMS system).
\end{itemize}
\bigskip
About the metadata:
\begin{itemize}
\item The order is not relevant.
\item The data can come from any (authorised) source.
\end{itemize}
\bigskip
When all data is available, a signal is sent to an actor that runs the
analysis.
\end{pframe}
\subsection{Florea, the pipeline runner}
\begin{pframe}
Start a pipeline on the cluster.
\begin{itemize}
\item Get the pipeline configuration from our GitLab system.
\item Start the pipeline.
\item Monitor the pipeline progress.
\item Keep track of the status (query via the API).
\end{itemize}
\end{pframe}
\subsection{Underlying infrastructure}
\begin{pframe}
Security:
\begin{itemize}
\item Encryption and identity management with X.509.
\end{itemize}
\bigskip
Interfaces:
\begin{itemize}
\item Fully documented APIs.
\begin{itemize}
\item Only open standards.
\end{itemize}
\end{itemize}
\bigskip
All actions are stored in a database that can be queried via the API.
\end{pframe}
\begin{pframe}
Any of these services may fail.
\bigskip
Well defined interface:
\begin{itemize}
\item Easy to make backup procedures.
\end{itemize}
\bigskip
Full (online) documentation.
\begin{itemize}
\item Bypass any of the microservices.
\item Bypass everything.
\end{itemize}
\end{pframe}
\subsection{Use case: NIPT}
\begin{pframe}
\begin{lstlisting}[language=none, caption={Summary of run $36$.}]
d2e88cea-9d7c-4c1c-b788-b20385d830d8
Name: 103033-036
Delivered on: 14-06-2017 20:40:53
Started on: 14-06-2017 18:58:45
Duration: 1:42:08
State: successful
Samples: 96
Remarks:
\end{lstlisting}
Overviews can be made by querying the relevant systems.
\end{pframe}
\begin{pframe}
Our first run using the API system.
\medskip
\begin{tabular}{l@{:}l@{:}ll}
16 & 30 & 54 & Data transfer is initiated.\\
18 & 59 & 39 & Data transfer finished.\\
19 & 00 & 15 & Notification is sent to Cerana.\\
19 & 00 & 15 & Cerana receives notification.\\
19 & 00 & 15 & Notification is sent to Florea.\\
19 & 00 & 15 & Florea receives notification.\\
19 & 00 & 21 & Pipeline starts.\\
19 & 55 & 06 & Pipeline ends.\\
20 & 00 & 09 & Data transfer to ErasmusMC and notification e-mail.\\
20 & 00 & 13 & Florea updates status to `finished'.
\end{tabular}
\end{pframe}
% Make the acknowledgements slide.
% The argument holds a three-column table of collaborators per institute (bold
% header row), followed by a full-width tabularx whose empty X column pushes
% the logo to the right.
\makeAcknowledgementsSlide{
\begin{tabular}{lll}
% \textbf replaces the obsolete \bf font switch; the output is the same.
\textbf{LUMC} & \textbf{GenomeScan} & \textbf{ErasmusMC} \\
Sander Bollen & Niels de Water & Marjan Boter \\
Wibowo Arindrarto & Floor Pepers & Frank Sleutels \\
Jonathan Vis & Mark de Jong
\end{tabular}
\bigskip
\begin{tabularx}{\textwidth}{Xl}
& \includegraphics[height=1cm]{logos/genomescan_logo}
\end{tabularx}
}
\end{document}
# Draw a 2x2 grid of scatter panels, each showing 100 normally distributed
# points in blue; the two left-hand panels also get one red point at (0, 4).
# NOTE(review): presumably this generates the z_transform figure used in the
# presentation, with terminal and output set by the caller -- confirm.
set samples 100
set multiplot layout 2,2
unset xtics

# First two panels: untransformed values, shared y range.
set yrange [0:9]
# rand(-1) resets the random generator seeds to their standard value, so the
# same points are drawn on every run; the returned value is printed.
print rand(-1)
# invnorm(rand(0)) yields a standard normal sample; scaled to sd 0.5, mean 2.
plot 0.5 * invnorm(rand(0)) + 2 with points pointtype 7 pointsize 2 linecolor rgb "blue" notitle, \
     "<echo 0 4" with points pointtype 7 pointsize 2 linecolor rgb "red" notitle
# Same construction with sd 1.5 and mean 5, without the red point.
plot 1.5 * invnorm(rand(0)) + 5 with points pointtype 7 pointsize 2 linecolor rgb "blue" notitle

# Last two panels: standard normal values (mean 0, sd 1), labelled per
# chromosome.
set yrange [-5:5]
print rand(-1)
set xlabel "chr1"
plot invnorm(rand(0)) with points pointtype 7 pointsize 2 linecolor rgb "blue" notitle, \
     "<echo 0 4" with points pointtype 7 pointsize 2 linecolor rgb "red" notitle
set xlabel "chr21"
plot invnorm(rand(0)) with points pointtype 7 pointsize 2 linecolor rgb "blue" notitle
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment