\documentclass[slidestop]{beamer}

3
\title{Analysis projects skeleton}
4
\providecommand{\myConference}{Git course}
5
\providecommand{\myDate}{Monday, June 23, 2014}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
\author{Jeroen F. J. Laros}
\providecommand{\myGroup}{Leiden Genome Technology Center}
\providecommand{\myDepartment}{Department of Human Genetics}
\providecommand{\myCenter}{Center for Human and Clinical Genetics}
\providecommand{\lastCenterLogo}{
  \raisebox{-0.1cm}{
    \includegraphics[height=1cm]{lgtc_logo}
    %\includegraphics[height=0.7cm]{ngi_logo}
  }
}
\providecommand{\lastRightLogo}{
  %\includegraphics[height=0.7cm]{nbic_logo}
  %\includegraphics[height=0.8cm]{nwo_logo_en}
  %\hspace{1.5cm}\includegraphics[height=0.7cm]{gen2phen_logo}
}

\usetheme{lumc}

\begin{document}

% This disables the \pause command, handy in the editing phase.
%\renewcommand{\pause}{}

% Make the title page.
\bodytemplate

% First page of the presentation.
\section{Introduction}
34 35
\subsection{Shared projects}
\begin{pframe}
36 37 38 39 40 41 42 43 44 45 46
  Most of us work on multiple projects with multiple people.
  \bigskip

  That is why it is convenient to:
  \begin{itemize}
    \item Have everything in one place.
    \begin{itemize}
      \item Data.
      \item Code.
      \item Documentation.
    \end{itemize}
47
    \pause
48 49
    \item Have the same structure for all projects.
  \end{itemize}
50
\end{pframe}
51

52
\section{Starting a project}
53 54
\subsection{Project skeleton}
\begin{pframe}
55 56
  Usage:
  \begin{itemize}
57
    \item Make a clone of the skeleton project.
58
    \item Rename the project.
59 60
    \item Create a new project on the server.
    \item Change the remote \bt{origin} to your new project.
61
  \end{itemize}
62 63
  \bigskip
  \pause
64 65 66 67 68

  Configure your project.
  \begin{itemize}
    \item Choose to make your project public or not.
    \begin{itemize}
      \item Private by default.
70 71 72 73
      \item Public really means public.
    \end{itemize}
    \item Add the people that work on this project.
  \end{itemize}
74 75 76

  \vfill
  \permfoot{https://git.lumc.nl/lgtc-bioinformatics/project-skeleton}
77
\end{pframe}
78

79
\section{Project structure}
80 81
\subsection{Global overview}
\begin{pframe}
82 83 84 85 86 87 88 89 90
  Project layout:
  \begin{itemize}
    \item analysis
    \item data
    \item doc
    \item src
  \end{itemize}
  \bigskip

91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
  Ideally, every directory in the project has a \bt{README.md} file.
\end{pframe}

\subsection{Markdown files}
\begin{pframe}
  \begin{lstlisting}[language=none, caption=Markdown snippet.]
    # Installation

    To install [Git](http://www.git-scm.com/):

        apt-get install git

    Now you can do the following:

    - Make a new repository with `git init`.
    - Clone an existing repository with `git clone`.
  \end{lstlisting}
\end{pframe}
  
\begin{pframe}
  \begin{figure}[]
    \begin{center}
      \includegraphics[width=\textwidth]{markdown}
    \end{center}
    \caption{Rendered markdown page.}
  \end{figure}
117
\end{pframe}
118

119
\subsection{The toplevel ``README.md'' file}
120
\begin{pframe}
121 122 123 124 125 126
  This file contains general information about the project, for example:
  \begin{itemize}
    \item Who leads the project.
    \item Who participates in the project.
    \item The amount of hours people have spent on this project.
  \end{itemize}
127
\end{pframe}
128

129 130
\subsection{The ``doc'' directory}
\begin{pframe}
131 132
  Documentation on the project:
  \begin{itemize}
133
    \item Annotation of the data.
134 135 136 137 138 139
    \item Goal of the project.
    \item Related work and literature.
    \begin{itemize}
      \item You may want to note who provided the documentation.
    \end{itemize}
  \end{itemize}
140
\end{pframe}
141

142 143
\subsection{The ``data'' directory}
\begin{pframe}
144 145 146
  Used to store all raw data.
  \bigskip

147
  The \bt{README.md} contains:
148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
  \begin{itemize}
    \item Description of the delivered data.
    \begin{itemize}
      \item Sequencing centre.
      \item Platform.
      \item Molecular type.
      \item Owner.
      \item Gatherer.
    \end{itemize}
    \item Description of other data.
    \begin{itemize}
      \item Perhaps you already got BAM files.
      \begin{itemize}
        \item Who aligned it?
        \item Which aligner?
      \end{itemize}
    \end{itemize}
  \end{itemize}

167
\end{pframe}
168

169 170
\subsection{The ``analysis'' directory}
\begin{pframe}
171 172
  All analysis related files are stored here:
  \begin{itemize}
173
    \item Symlinks to the actual data.
174 175 176 177 178 179 180
    \item Run scripts.
    \item Make files.
    \item Result files.
  \end{itemize}
  \bigskip

  Try to separate self-contained parts of the analysis in their own
181
  subdirectories and document dependencies in a \bt{README.md} file.
182 183 184 185
  \begin{itemize}
    \item Normal data analysis.
    \item $k$-mer analysis.
  \end{itemize}
186
\end{pframe}
187

188 189
\subsection{The ``src'' directory}
\begin{pframe}
190 191 192 193 194
  Any custom scripts and specific software versions for this project.
  \bigskip

  When these scripts are useful for other projects, move them to their own
  repository.
195
\end{pframe}
196 197

\section{Working with large files}
198 199
\subsection{Git is not designed for massive files}
\begin{pframe}
200 201 202 203 204 205 206
  Some problems with large files:
  \begin{itemize}
    \item Limited storage on the server.
    \item Checking out a repository would take a long time.
  \end{itemize}
  \bigskip

207 208 209 210 211 212 213 214
  It also does not make much sense:
  \begin{itemize}
    \item These files are usually \emph{static}.
    \item And probably \emph{binary}.
  \end{itemize}
  \bigskip
  \pause

215 216 217 218 219
  We do want to have some way to track our input and output data. This can be
  done with \bt{git-annex}.

  \vfill
  \permfoot{http://git-annex.branchable.com/}
220
\end{pframe}
221

222 223
\subsection{Git annex}
\begin{pframe}
224
  Manage files with git, without checking their contents in.
225 226 227 228 229 230 231 232 233
  \begin{itemize}
    \item Manage large files without storing them.
    \item Store file checksums.
    \item Prevent files from being deleted accidentally.
  \end{itemize}
  \bigskip
  \pause

  You first have to enable this for your repository.
234
  \bigskip
235 236 237 238

  \begin{lstlisting}[language=none, caption=Enable git-annex.]
    $ git annex init "<name>"
  \end{lstlisting}
239
\end{pframe}
240

241 242
\subsection{Adding big files}
\begin{pframe}
243 244 245
  In our master repository, we annex a file.
  \bigskip

246 247 248 249 250
  \begin{lstlisting}[language=none, caption=Adding files.]
    $ git annex add <filename>
    $ git commit
  \end{lstlisting}
  \bigskip
251
  \pause
252 253

  In a clone, this file will be visible, but not really present.
254 255
  \bigskip

256 257 258 259 260
  \begin{lstlisting}[language=none, caption=Make a file available.]
    $ file <filename>
    <filename>: broken symbolic link to ...
    $ git annex get <filename>
  \end{lstlisting}
261
\end{pframe}
262

263 264 265 266 267 268 269 270 271 272 273 274 275 276
\subsection{Modifying files}
\begin{pframe}
  Sometimes we need to change the content of a file.
  \bigskip

  \begin{lstlisting}[language=none, caption=Unlocking a file.]
    $ git annex edit <filename>
    unlock <filename> (copying...) ok
  \end{lstlisting}
  \bigskip

  You can use \bt{git annex add} when you are done.
\end{pframe}

277 278
\subsection{Removing files}
\begin{pframe}
279
  As long as there are enough copies available, you can remove files.
280 281
  \bigskip

282 283 284 285 286 287
  \begin{lstlisting}[language=none, caption=A failing drop command.]
    $ git annex drop <filename>
    drop <filename> (unsafe)
    git-annex: drop: 1 failed
  \end{lstlisting}
  \bigskip
288
  \pause
289 290

  It is actually quite well protected.
291 292
  \bigskip

293 294 295 296
  \begin{lstlisting}[language=none, caption=rm fails too.]
    $ rm -rf <repository>
    rm: cannot remove <repository>/.git/annex/objects/...
  \end{lstlisting}
297
\end{pframe}
298

299 300
\subsection{Synchronise your results}
\begin{pframe}
301
  Let the other repositories know what you have done.
302 303 304
  \bigskip

  \begin{lstlisting}[language=none, caption=Synchronise with all repositories.]
305 306
    $ git annex sync
  \end{lstlisting}
307 308 309 310 311 312 313 314 315
  \bigskip
  \pause

  You can choose to sync with a selection of repositories.
  \bigskip

  \begin{lstlisting}[language=none, caption=Synchronise with a selection.]
    $ git annex sync origin
  \end{lstlisting}
316
\end{pframe}
317

318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
\subsection{Cleaning your repository}
\begin{pframe}
  You can clean your repository with one command.
  \bigskip

  \begin{lstlisting}[language=none, caption=Remove untracked files.]
    $ git clean -f -x
  \end{lstlisting}

  \begin{table}[]
    \begin{center}
      \begin{tabular}{ll}
        option & description\\
        \hline
        \bt{-f} & Force (really remove).\\
        \bt{-x} & Also remove \emph{ignored} files.\\
        \bt{-n} & Do a \emph{dry run}.\\
      \end{tabular}
    \end{center}
    \caption{Common options.}
  \end{table}
\end{pframe}

341 342
\subsection{Working together on the same clone}
\begin{pframe}
343 344 345 346 347 348 349 350 351 352
  Sometimes you need to work with other people on the same repository clone.
  \begin{itemize}
    \item Where the large files are stored.
  \end{itemize}
  \bigskip

  Use the following command to give group access:
  \bigskip

  \begin{lstlisting}[language=none, caption=Make everything group writable.]
353 354 355
    $ find . -type d -exec chmod 775 {} \;
    $ find . -type f -exec chmod 664 {} \;
  \end{lstlisting}
356
\end{pframe}
357 358 359

\section{Questions?}
\lastpagetemplate
360
\begin{pframe}
361 362 363 364 365 366 367
  \begin{center}
    Acknowledgements:
    \bigskip
    \bigskip

    Martijn Vermaat

368
    Wibowo Arindrarto
369

370
    Zuotian Tatum
371
  \end{center}
372 373 374

  \vfill
  \permfoot{http://git-annex.branchable.com/}
375

376 377
  \permfoot{https://git.lumc.nl/lgtc-bioinformatics/project-skeleton}
\end{pframe}
378
\end{document}