Commit 846a91c9 authored by Laros's avatar Laros
Browse files

Added new lecture.

parent 74e774f7
../../submodules/presentation/Makefile
\ No newline at end of file
../../submodules/presentation-pics/pics/Mz80k.jpg
\ No newline at end of file
../../submodules/presentation/beamerthemelumc.sty
\ No newline at end of file
../../submodules/presentation-pics/pics/fam_re.ps
\ No newline at end of file
../../submodules/presentation-pics/pics/gitlab_diff.xcf
\ No newline at end of file
../../submodules/presentation-pics/pics/gitlab_doc.xcf
\ No newline at end of file
../../submodules/presentation-pics/pics/gitlab_network.xcf
\ No newline at end of file
../data_sharing/gq.gnp
\ No newline at end of file
../data_sharing/gq_1.dat
\ No newline at end of file
../data_sharing/gq_2.dat
\ No newline at end of file
../data_sharing/gq_3.dat
\ No newline at end of file
../../submodules/presentation-pics/pics/ipynb.xcf
\ No newline at end of file
../../submodules/presentation/logos
\ No newline at end of file
../../submodules/presentation-pics/pics/mutalyzer_disambiguation.xcf
\ No newline at end of file
../../submodules/presentation-pics/pics/mutalyzer_disambiguation_result.xcf
\ No newline at end of file
../../submodules/presentation-pics/pics/nested.png
\ No newline at end of file
\documentclass[slidestop]{beamer}
\author{Jeroen F.J. Laros}
\title{Research Software Engineering}
\providecommand{\mySubTitle}{}
\providecommand{\myConference}{Staff meeting}
\providecommand{\myDate}{09-04-2017}
\providecommand{\myGroup}{}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}
\usetheme{lumc}
\begin{document}
% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}
% Make the title slide.
\makeTitleSlide{\includegraphics[height=3.0cm]{Mz80k}}
% First page of the presentation.
\section{Introduction}
\makeTableOfContents
\subsection{Research software}
\begin{pframe}
Specialised software for research purposes.
\begin{itemize}
\item Method development.
\begin{itemize}
\item Novel \emph{data analysis}.
\item Efficiency.
\item The method itself.
\end{itemize}
\item Production (bulk analysis).
\begin{itemize}
\item Large studies.
\item Diagnostics.
\end{itemize}
\end{itemize}
\bigskip
Not to be confused with general software:
\begin{itemize}
\item Spreadsheets.
\item Programming languages like Matlab.
\end{itemize}
\end{pframe}
\begin{pframe}
Why do we develop such software?
\begin{itemize}
\item No interest from industry (yet).
\item Expert knowledge required.
\item Very expensive when development is outsourced.
\end{itemize}
\bigskip
Frequently put on the market years later:
\begin{itemize}
\item NextGene.
\item Genalice.
\item HiX.
\end{itemize}
\end{pframe}
\begin{pframe}
\begin{minipage}[t]{0.47\textwidth}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{will_it_work}
\end{center}
\caption{Will it work?}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.47\textwidth}
We aim for high grade, \emph{professional} software.
\begin{itemize}
\item Mutalyzer (over $270,\!000,\!000$ requests processed).
\item TSSV (used by the FLDO and NFI).
\item Varda (going national via VKGL).
\item \ldots
\end{itemize}
\end{minipage}
\end{pframe}
\subsection{Partners / Customers}
\begin{pframe}
Within the LUMC:
\begin{itemize}
\item Human Genetics.
\begin{itemize}
\item LGTC.
\item FLDO.
\end{itemize}
\item Clinical Genetics.
\item Haematology.
\item Medical Microbiology.
\end{itemize}
\bigskip
External:
\begin{itemize}
\item VKGL.
\item ErasmusMC.
\item GenomeScan.
\end{itemize}
\end{pframe}
\section{Research software}
\subsection{Mutalyzer}
\begin{pframe}
A curational tool for Locus Specific Mutation Databases.
\bigskip
Variant nomenclature checker applying the HGVS guidelines:
\begin{itemize}
\item Is the syntax of the variant description valid?
\item Does the reference sequence exist?
\item Is the variant possible on this reference sequence?
\item Is this variant description the recommended one?
\end{itemize}
\bigskip
Basic effect prediction.
\begin{itemize}
\item Is the description of the transcript product as expected?
\item Is the predicted protein as expected?
\end{itemize}
\vfill
\permfoot{\url{https://mutalyzer.nl}}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth]{mutalyzer_disambiguation}
\end{center}
\caption{\bt{NM\_002001.2:c.9\_10delinsTGC}.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth]{mutalyzer_disambiguation_result}
\end{center}
\caption{\bt{NM\_002001.2:c.12dup}.}
\end{figure}
\end{pframe}
\begin{pframe}
Mutalyzer is free:
\begin{itemize}
\item Open source (AGPL license).
\item Available via GitHub.
\end{itemize}
\bigskip
Available via the web:
\begin{itemize}
\item Interactive user interface.
\item Batch interface:
\begin{itemize}
\item CSV, XLS, ODS.
\end{itemize}
\item Webservices (to integrate in a pipeline):
\begin{itemize}
\item SOAP.
\item HTTP/RPC+JSON.
\end{itemize}
\end{itemize}
\vfill
\permfoot{\url{https://mutalyzer.nl}}\\
\permfoot{\url{https://github.com/mutalyzer/mutalyzer}}
\end{pframe}
\subsection{Varda}
\begin{pframe}
A database for bulk variant data.
\begin{itemize}
\item There are a lot of databases available for sharing variants.
\begin{itemize}
\item Locus specific databases like LOVD.
\item Large number of healthy individuals (ExAC).
\end{itemize}
\end{itemize}
\bigskip
So why build yet another database?
\begin{itemize}
\item Missing values.
\end{itemize}
\end{pframe}
\begin{pframe}
We aim for a solution that has the following properties:
\begin{itemize}
\item Not only counts, but also \emph{reference calls} (numerator,
denominator).
\item Support for \emph{labels}.
\item Support for \emph{timestamps}.
\end{itemize}
\bigskip
Labels allow for queries like:
\begin{itemize}
\item How many times have we seen this variant in non-diabetes patients?
\item How many times was this variant observed in institute X?
\end{itemize}
\bigskip
Timestamps are essential for \emph{reproducibility}.
\end{pframe}
\begin{pframe}
\begin{figure}[]
\vspace{-0.5cm}
\begin{center}
\includegraphics[height=0.8\textheight]{samples}
\begin{picture}(0, 0)
\only<2>{\put(-45, 9.5){\line(0, 1){50}}}
\only<3>{\put(-23, 9.5){\line(0, 1){50}}}
\end{picture}
\end{center}
\vspace{-0.3cm}
\caption{Sharing observations is not enough.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\vspace{-0.5cm}
\begin{center}
\includegraphics[height=0.8\textheight]{gq}
\end{center}
\vspace{-0.3cm}
\caption{Callable region ($584$--$1596$).}
\end{figure}
\end{pframe}
\subsection{Nested}
\begin{pframe}
Pedigree editor.
\begin{itemize}
\item Cyrillic was no longer usable.
\item We have hundreds of pedigrees we can no longer access.
\end{itemize}
\bigskip
Goals:
\begin{itemize}
\item Portable.
\item Can be used online.
\item Can be embedded.
\begin{itemize}
\item Miracle.
\item LOVD.
\end{itemize}
\end{itemize}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth]{nested}
\end{center}
\caption{Nested online editor.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{fam_re}
\end{center}
\caption{A Cyrillic FAM file.}
\end{figure}
\end{pframe}
\begin{pframe}
\begin{lstlisting}[language=none, caption={Structure snippet.}]
- name: members
for: number_of_members
structure:
- name: surname
- name: forenames
- name: address
type: comment
- name: spouses
for: number_of_spouses
structure:
- name: id
type: int
- name: flags
type: relationship
- name: name
\end{lstlisting}
\end{pframe}
\begin{pframe}
\begin{lstlisting}[language=none, caption={Output snippet.}]
members:
- surname: 'Gambolputty'
forenames: 'Johann'
address: 'Ulm'
spouses:
- flags:
consanguineous: false
divorced: false
informal: false
separated: false
id: 2
name: ''
\end{lstlisting}
\end{pframe}
\section{Research infrastructure}
\subsection{HPC cluster}
\begin{pframe}
Used by almost $300$ people:
\begin{itemize}
\item Human Genetics (LGTC, LIPIDEN, FLDO).
\item Medical Statistics (MolEpi).
\item Medical Microbiology.
\item Clinical Genetics.
\item LUMC wide (SASC, Research ICT).
\end{itemize}
\bigskip
But also from external parties:
\begin{itemize}
\item Leiden University (FSW).
\item Generade.
\item GenomeScan.
\end{itemize}
\end{pframe}
%\begin{pframe}
% We participate in national projects (NFU D4LS WP7).
% \bigskip
%
% {\bf Securely scaling out computational intensive data analyses from UMCs
% to SURF}
% \medskip
%
% \textit{Successful pilot LUMC, SURFnet and SURFsara}
% \medskip
%
% SURF and the Leiden University Medical Center (LUMC) have successfully
% established a federated data analysis pipeline infrastructure connecting the
% high performance computing (HPC) cluster of LUMC and SURFsara's HPC Cloud
% service. Researchers are using this connection for extra computing capacity.
%
% \vfill
% \permfoot{\url{https://www.surf.nl/en/news/2016/10/}}
%\end{pframe}
\subsection{Git}
\begin{pframe}
\emph{The management of changes to documents, computer programs, large web
sites, and other collections of information.} --- Wikipedia.
\bigskip
\pause
General features:
\begin{itemize}
\item Keeping track of files in an orderly manner.
\begin{itemize}
\item Hiding old versions.
\item Recording who made changes and when.
\end{itemize}
\item Enables collaboration.
\end{itemize}
\vfill
\permfoot{\url{http://www.git-scm.com/}}
\permfoot{\url{https://github.com/}}
\end{pframe}
\subsection{Why version control?}
\begin{pframe}
For a single user:
\begin{itemize}
\item Revert files to a previous state.
\item Revert the entire project back to a previous state.
\item Review changes made over time.
\item Backup.
\end{itemize}
\bigskip
\pause
For multiple users:
\begin{itemize}
\item A reliable way to share files between people/computers.
\item Allow multiple people to work on the same project at the same time.
\item Conflict resolution.
\item See who made which changes at which time.
\end{itemize}
\end{pframe}
\subsection{Collaboration}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{gitlab_network}
\end{center}
\caption{Collaboration with many people.}
\end{figure}
\end{pframe}
\subsection{Tracking of changes}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[width=\textwidth]{gitlab_diff}
\end{center}
\caption{Compare versions.}
\end{figure}
Not limited to the previous version and the latest one.
\begin{itemize}
\item Different authors.
\item Any two versions.
\end{itemize}
\end{pframe}
\subsection{Documentation}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.7\textheight]{gitlab_doc}
\end{center}
\caption{Documentation and programs in one place.}
\end{figure}
\end{pframe}
\subsection{Interactive computational environments}
\begin{pframe}
Combine code execution, text, mathematics, plots and rich media into a single
document.
\bigskip
Ideal for exploration of data.
\begin{itemize}
\item Documentation and code are interwoven.
\item Results are displayed inline.
\item Web based.
\item Versions.
\end{itemize}
\bigskip
\pause
Integration with GitLab.
\vfill
\permfoot{\url{http://ipython.org/notebook.html}}
\end{pframe}
\subsection{iPython notebook}
\begin{pframe}
\begin{figure}[]
\begin{center}
\includegraphics[height=0.85\textheight]{ipynb}
\end{center}
\caption{iPython notebook.}
\end{figure}
\end{pframe}
\section{Education}
\subsection{Overview}
\begin{pframe}
LUMC wide courses:
\begin{itemize}
\item Practical Linux.
\item Shark introduction (also for partner institutes).
\item Advanced Linux and command line scripting.
\item GitLab as a Collaborative Working Environment.
\item Code and Data Management with Git.
\item Scientific programming using Python.
\end{itemize}
\bigskip
(Inter)national:
\begin{itemize}
\item NGS introduction.
\item NGS data analysis.
\end{itemize}
\end{pframe}
\subsection{Practical Linux}
\begin{pframe}
Three hour course, given monthly.
\bigskip
Introduction to the Linux operating system.
\bigskip
Goals:
\begin{itemize}
\item The terminal, basic commands.
\item The file system, users, groups and permissions.
\item Connecting to other machines.
\end{itemize}
\vfill
\permfoot{M. Lefter, J. Vis}
\permfoot{\url{https://git.lumc.nl/courses/practical-linux-course}}
\end{pframe}
\subsection{Shark introduction}
\begin{pframe}
Three hour course, given monthly.
\bigskip
An overview of the available infrastructure.
\bigskip
In particular:
\begin{itemize}
\item Using clusters.
\begin{itemize}
\item The Sun Grid Engine.
\item Queues.
\end{itemize}
\item Where to store your data.
\item Do's and don'ts.
\end{itemize}
\vfill
\permfoot{M. Villerius}
\permfoot{\url{https://git.lumc.nl/shark/SHARK/wikis/home}}
\end{pframe}
\subsection{Code and data management with Git}
\begin{pframe}
Full day course, given twice a year.
\bigskip
Everyone in the Bioinformatics field:
\begin{itemize}
\item Software development.
\item Project management.
\item Collaboration.
\end{itemize}
\bigskip
\pause
Topics:
\begin{itemize}
\item Git Basics.
\item Remotes.
\item Writing Markdown files.
\end{itemize}
\vfill
\permfoot{M. Vermaat, W. Arindrarto}
\permfoot{\url{https://git.lumc.nl/courses/gitcourse}}
\end{pframe}
\subsection{Scientific programming using Python}
\begin{pframe}
Four day course, given yearly.
\bigskip
Topics:
\begin{itemize}
\item Python basics.
\item Standard data structures.
\item Working with NumPy arrays.
\item Plotting with matplotlib.
\item Object-oriented programming.
\item The Biopython library.
\end{itemize}
\vfill
\permfoot{M. Vermaat, W. Arindrarto, Z. Tatum, W.Y. Leung}
\permfoot{\url{https://git.lumc.nl/courses/programming-course}}
\end{pframe}
\section{Production data analysis}
\subsection{Computational infrastructure}
\begin{pframe}
Reproducibility and automation.
\bigskip
Computational infrastructure:
\begin{itemize}
\item Transfer server: gatekeeper for \emph{production data}.