\documentclass[slidestop]{beamer}

\title{Analysis projects skeleton}
\providecommand{\myConference}{Git course}
\providecommand{\myDate}{Monday, June 23, 2014}
\author{Jeroen F. J. Laros}
\providecommand{\myGroup}{Leiden Genome Technology Center}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}
\providecommand{\lastCenterLogo}{
  \raisebox{-0.1cm}{
    \includegraphics[height=1cm]{lgtc_logo}
    %\includegraphics[height=0.7cm]{ngi_logo}
  }
}
\providecommand{\lastRightLogo}{
  %\includegraphics[height=0.7cm]{nbic_logo}
  %\includegraphics[height=0.8cm]{nwo_logo_en}
  %\hspace{1.5cm}\includegraphics[height=0.7cm]{gen2phen_logo}
}

\usetheme{lumc}

\begin{document}

% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}

% Make the title page.
\bodytemplate

% First page of the presentation.
\section{Introduction}
\subsection{Shared projects}
\begin{pframe}
  Most of us work on multiple projects with multiple people.
  \bigskip

  That is why it is convenient to:
  \begin{itemize}
    \item Have everything in one place.
    \begin{itemize}
      \item Data.
      \item Code.
      \item Documentation.
    \end{itemize}
    \pause
    \item Have the same structure for all projects.
  \end{itemize}
\end{pframe}

\section{Starting a project}
\subsection{Project skeleton}
\begin{pframe}
  Usage:
  \begin{itemize}
    \item Make a clone of the skeleton project.
    \item Rename the project.
    \item Create a new project on the server.
    \item Change the remote \bt{origin} to your new project.
  \end{itemize}
  \bigskip
  \pause

  Configure your project.
  \begin{itemize}
    \item Choose to make your project public or not.
    \begin{itemize}
      \item Public by default.
      \item Public really means public.
    \end{itemize}
    \item Add the people that work on this project.
  \end{itemize}
  \vfill
  \permfoot{https://git.lumc.nl/lgtc-bioinformatics/project-skeleton}
\end{pframe}

\section{Project structure}
\subsection{Global overview}
\begin{pframe}
  Project layout:
  \begin{itemize}
    \item analysis
    \item data
    \item doc
    \item src
  \end{itemize}
  \bigskip

  Ideally, every directory in the project has a \bt{README.md} file.
\end{pframe}

\subsection{Markdown files}
\begin{pframe}
  \begin{lstlisting}[language=none, caption=Markdown snippet.]
# Installation
To install [Git](http://www.git-scm.com/):

    apt-get install git

Now you can do the following:

- Make a new repository with `git init`.
- Clone an existing repository with `git clone`.
  \end{lstlisting}
\end{pframe}

\begin{pframe}
  \begin{figure}[]
    \begin{center}
      \includegraphics[width=\textwidth]{markdown}
    \end{center}
    \caption{Rendered markdown page.}
  \end{figure}
\end{pframe}

\subsection{The top-level ``README.md'' file}
\begin{pframe}
  This file contains general information about the project, for example:
  \begin{itemize}
    \item Who leads the project.
    \item Who participates in the project.
    \item The number of hours people have spent on this project.
  \end{itemize}
\end{pframe}

\subsection{The ``doc'' directory}
\begin{pframe}
  Documentation on the project:
  \begin{itemize}
    \item Annotation of the data.
    \item Goal of the project.
    \item Related work and literature.
    \begin{itemize}
      \item You may want to note who provided the documentation.
    \end{itemize}
  \end{itemize}
\end{pframe}

\subsection{The ``data'' directory}
\begin{pframe}
  Used to store all raw data.
  \bigskip

  The \bt{README.md} contains:
  \begin{itemize}
    \item Description of the delivered data.
    \begin{itemize}
      \item Sequencing centre.
      \item Platform.
      \item Molecular type.
      \item Owner.
      \item Gatherer.
    \end{itemize}
    \item Description of other data.
    \begin{itemize}
      \item Perhaps you already got BAM files.
      \begin{itemize}
        \item Who aligned it?
        \item Which aligner?
      \end{itemize}
    \end{itemize}
  \end{itemize}
\end{pframe}

\subsection{The ``analysis'' directory}
\begin{pframe}
  All analysis-related files are stored here:
  \begin{itemize}
    \item Symlinks to the actual data.
    \item Run scripts.
    \item Make files.
    \item Result files.
  \end{itemize}
  \bigskip

  Try to separate self-contained parts of the analysis in their own
  subdirectories and document dependencies in a \bt{README.md} file.
  \begin{itemize}
    \item Normal data analysis.
    \item $k$-mer analysis.
  \end{itemize}
\end{pframe}

\subsection{The ``src'' directory}
\begin{pframe}
  Any custom scripts and specific software versions for this project.
  \bigskip

  When these scripts are useful for other projects, move them to their own
  repository.
\end{pframe}

\section{Working with large files}
\subsection{Git is not designed for massive files}
\begin{pframe}
  Some problems with large files:
  \begin{itemize}
    \item Limited storage on the server.
    \item Checking out a repository would take a long time.
  \end{itemize}
  \bigskip

  It also does not make much sense:
  \begin{itemize}
    \item These files are usually \emph{static}.
    \item And probably \emph{binary}.
  \end{itemize}
  \bigskip
  \pause

  We do want to have some way to track our input and output data. This can be
  done with \bt{git-annex}.
  \vfill
  \permfoot{http://git-annex.branchable.com/}
\end{pframe}

\subsection{Git annex}
\begin{pframe}
  Manage files with git, without checking their contents in.
  \begin{itemize}
    \item Manage large files without storing them.
    \item Store file checksums.
    \item Prevent files from being deleted accidentally.
  \end{itemize}
  \bigskip
  \pause

  You first have to enable this for your repository.
  \bigskip
  \begin{lstlisting}[language=none, caption=Enable git-annex.]
$ git annex init ""
  \end{lstlisting}
\end{pframe}

\subsection{Adding big files}
\begin{pframe}
  In our master repository, we annex a file.
  \bigskip
  \begin{lstlisting}[language=none, caption=Adding files.]
$ git annex add
$ git commit
  \end{lstlisting}
  \bigskip
  \pause

  In a clone, this file will be visible, but not really present.
  \bigskip
  \begin{lstlisting}[language=none, caption=Make a file available.]
$ file
: broken symbolic link to ...
$ git annex get
  \end{lstlisting}
\end{pframe}

\subsection{Modifying files}
\begin{pframe}
  Sometimes we need to change the content of a file.
  \bigskip
  \begin{lstlisting}[language=none, caption=Unlocking a file.]
$ git annex edit
unlock (copying...) ok
  \end{lstlisting}
  \bigskip

  You can use \bt{git annex add} when you are done.
\end{pframe}

\subsection{Removing files}
\begin{pframe}
  As long as there are enough copies available, you can remove files.
  \bigskip
  \begin{lstlisting}[language=none, caption=A failing drop command.]
$ git annex drop
drop bigfile (unsafe)
git-annex: drop: 1 failed
  \end{lstlisting}
  \bigskip
  \pause

  It is actually quite well protected.
  \bigskip
  \begin{lstlisting}[language=none, caption=rm fails too.]
$ rm -rf
rm: cannot remove /.git/annex/objects/...
  \end{lstlisting}
\end{pframe}

\subsection{Synchronise your results}
\begin{pframe}
  Let the other repositories know what you have done.
  \bigskip
  \begin{lstlisting}[language=none, caption=Synchronise with all repositories.]
$ git annex sync
  \end{lstlisting}
  \bigskip
  \pause

  You can choose to sync with a selection of repositories.
  \bigskip
  \begin{lstlisting}[language=none, caption=Synchronise with a selection.]
$ git annex sync origin
  \end{lstlisting}
\end{pframe}

\subsection{Cleaning your repository}
\begin{pframe}
  You can clean your repository with one command.
  \bigskip
  \begin{lstlisting}[language=none, caption=Remove untracked files.]
$ git clean -f -x
  \end{lstlisting}
  \begin{table}[]
    \begin{center}
      \begin{tabular}{ll}
        option & description\\
        \hline
        \bt{-f} & Force (really remove).\\
        \bt{-x} & Also remove \emph{ignored} files.\\
        \bt{-n} & Do a \emph{dry run}.\\
      \end{tabular}
    \end{center}
    \caption{Common options.}
  \end{table}
\end{pframe}

\subsection{Working together on the same clone}
\begin{pframe}
  Sometimes you need to work with other people on the same repository clone.
  \begin{itemize}
    \item Where the large files are stored.
  \end{itemize}
  \bigskip

  Use the following command to give group access:
  \bigskip
  \begin{lstlisting}[language=none, caption=Make everything group writable.]
$ find -type d -exec chmod 775 {} \;
$ find -type f -exec chmod 664 {} \;
  \end{lstlisting}
\end{pframe}

\section{Questions?}
\lastpagetemplate
\begin{pframe}
  \begin{center}
    Acknowledgements:
    \bigskip
    \bigskip

    Martijn Vermaat

    Wibowo Arindrarto

    Zuotian Tatum
  \end{center}
  \vfill
  \permfoot{http://git-annex.branchable.com/}
  \permfoot{https://git.lumc.nl/lgtc-bioinformatics/project-skeleton}
\end{pframe}

\end{document}