Added gene profiling lecture (old IDA lecture).

019c9a24 · Laros · 32bba3ac · 019c9a24 · 019c9a24 · 019c9a24
Commit 019c9a24 authored 10 years ago by Laros
--- a/gene_profiling/DellBlade4.png
+++ b/gene_profiling/DellBlade4.png
+../presentation-pics/pics/DellBlade4.png
\ No newline at end of file
--- a/gene_profiling/Makefile
+++ b/gene_profiling/Makefile
+../presentation/Makefile
\ No newline at end of file
--- a/gene_profiling/antibiotic_test.jpg
+++ b/gene_profiling/antibiotic_test.jpg
+../presentation-pics/pics/antibiotic_test.jpg
\ No newline at end of file
--- a/gene_profiling/beamerthemelumc.sty
+++ b/gene_profiling/beamerthemelumc.sty
+../presentation/beamerthemelumc.sty
\ No newline at end of file
--- a/gene_profiling/ecoli.eps
+++ b/gene_profiling/ecoli.eps
+../presentation-pics/pics/ecoli.eps
\ No newline at end of file
--- a/gene_profiling/gen2phen_logo.eps
+++ b/gene_profiling/gen2phen_logo.eps
+../presentation/gen2phen_logo.eps
\ No newline at end of file
--- a/gene_profiling/gene_profiling.tex
+++ b/gene_profiling/gene_profiling.tex
+\documentclass[slidestop]{beamer}
+
+\title{\emph{E. coli} plasmid and gene profiling\\
+  {\small using Next Generation Sequencing}}
+\providecommand{\myConference}{IDA NGS lectures}
+\providecommand{\myDate}{Tuesday, 10 December 2012}
+\author{Jeroen F. J. Laros}
+\providecommand{\myGroup}{Leiden Genome Technology Center}
+\providecommand{\myDepartment}{Department of Human Genetics}
+\providecommand{\myCenter}{Center for Human and Clinical Genetics}
+\providecommand{\lastCenterLogo}{
+  \raisebox{-0.1cm}{
+    \includegraphics[height = 1cm]{lgtc_logo}
+    %\includegraphics[height = 0.7cm]{ngi_logo}
+  }
+}
+\providecommand{\lastRightLogo}{
+  %\includegraphics[height = 0.7cm]{nbic_logo}
+  %\includegraphics[height = 0.8cm]{nwo_logo_en}
+  %\hspace{1.5cm}\includegraphics[height = 0.7cm]{gen2phen_logo}
+}
+
+\usetheme{lumc}
+
+\begin{document}
+
+% This disables the \pause command, handy in the editing phase.
+%\renewcommand{\pause}{}
+
+% Make the title page.
+\bodytemplate
+
+% First page of the presentation.
+\section{Introduction}
+\begin{frame}
+  \frametitle{General overview}
+
+  \begin{figure}[]
+    \begin{center}
+      \includegraphics[height=0.9\textheight]{ecoli}
+    \end{center}
+    \caption{\emph{Escherichia coli}.}
+    \label{}
+  \end{figure}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Some figures on \emph{E. coli}}
+
+  Genome published in 1997.
+  \begin{itemize}
+    \item Genome size $4.6 \times 10^6$ basepairs.
+    \item $4,\!288$ genes in the assembly.
+    \begin{itemize}
+      \item $86$ tRNA genes.  % Not interesting.
+    \end{itemize}
+    \item $2,\!584$ operons in the assembly.
+    \begin{itemize}
+      \item $7$ rRNA operons. % Not interesting.
+    \end{itemize}
+  \end{itemize}
+  \bigskip
+  \bigskip
+  \pause
+
+  However, per individual strain:
+  \begin{itemize}
+    \item Between $4,\!000$ and $5,\!500$ genes.
+    \item $16,\!000$ genes in total (pangenome).
+  \end{itemize}
+  \bigskip
+  \bigskip
+  \pause
+
+  Very diverse: only $20\%$ of the genome is shared between all strains.
+\end{frame}
+
+\begin{frame}
+  \frametitle{Plasmids}
+
+  \begin{figure}[]
+    \begin{center}
+      \vspace{-0.5cm}
+      \colorbox{white}{
+        \includegraphics[width=0.5\textwidth]{plasmid_en}
+      }
+    \end{center}
+    \caption{Schematic overview of a cell containing plasmids.}
+    \label{}
+  \end{figure}
+  \pause
+
+  Plasmids are small DNA molecules.
+  \begin{itemize}
+    \item Separate and independent from the chromosome.
+    \item Can be transferred to other species.
+    \item Size between $1 \times 10^3$ and $1 \times 10^6$ basepairs.
+    \item Copy number between $1$ and $1,\!000$.
+    \item Variable between strains and individuals.
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Profiling}
+  \pause
+
+  Plasmids:
+  \begin{itemize}
+    \item May carry antibiotic resistance genes.
+    \item Not all of them are known.
+    \item May be highly similar to other plasmids.
+  \end{itemize}
+  \bigskip
+  \bigskip
+  \pause
+
+  Genes:
+  \begin{itemize}
+    \item Multi Locus Sequence Typing (MLST).
+    \begin{itemize}
+      \item Uses housekeeping genes.
+      \item Fragments of $450$ to $500$ basepairs.
+    \end{itemize}
+    \pause
+    \item Antibiotic resistance.
+    \begin{itemize}
+      \item The gene may be known, the plasmid may not be.
+    \end{itemize}
+    \pause
+    \item Efflux pumps.
+    \item \ldots
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Antibiotic resistance testing}
+
+  \begin{figure}[]
+    \begin{center}
+      \includegraphics[height=0.9\textheight]{antibiotic_test}
+    \end{center}
+    \caption{Classical antibiotic resistance test.}
+    \label{Antibiotic test.}
+  \end{figure}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Goals}
+  \bigskip
+  \pause
+
+  Clinical:
+  \begin{itemize}
+    \item Strain identification (MLST).
+    \item Antibiotic resistance testing.
+    \item Identifying efflux pumps.
+    \item Finding other important genes.
+  \end{itemize}
+  \bigskip
+  \bigskip
+  \pause
+
+  Technical limitations:
+  \begin{itemize}
+    \item The result must be delivered fast.
+  \end{itemize}
+  \bigskip
+  \bigskip
+  \pause
+
+  Next Generation Sequencing.
+\end{frame}
+
+\section{Next Generation Sequencing}
+\begin{frame}
+  \frametitle{Why Next Generation Sequencing?}
+  \bigskip
+  \pause
+
+  We analyse \emph{everything} in one go.
+  \begin{itemize}
+    \item The genome and all plasmids are sequenced.
+    \item Known but also \emph{unknown} DNA is sequenced.
+    \item Data can be re-analysed.
+    \begin{itemize}
+      \item Is gene X also in there?
+    \end{itemize}
+  \end{itemize}
+  \bigskip
+  \bigskip
+  \pause
+
+  We did a pilot on the HiSeq 2000.
+  \begin{itemize}
+    \item Successful.
+    \item A bit slow (it takes two weeks for a HiSeq to finish).
+    \item Way too much data per sample.
+    \begin{itemize}
+      \item Over $200$ times more data per sample than needed.
+    \end{itemize}
+    \item Found a contamination (\emph{Streptococcus}).
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Sequencers: Ion Torrent}
+
+  \begin{minipage}[t]{0.48\textwidth}
+    \begin{figure}
+      \includegraphics[width=\textwidth]{ion-torrent}
+      \caption{Ion Torrent.}
+    \end{figure}
+  \end{minipage}
+  \hfill
+  \begin{minipage}[t]{0.48\textwidth}
+    Characteristics:
+    \begin{itemize}
+      \item $3$ hours per run.
+      \item $1$ day sample prep, $1$ day emulsion PCR.
+      \item $4\times10^6$ reads.
+      \item Read length $\sim 300$\,bp.
+      \item $2$ \emph{E. coli} per run.
+    \end{itemize}
+  \end{minipage}
+  \bigskip
+  \pause
+
+  Fast and inexpensive.
+\end{frame}
+
+\section{Data analysis}
+\begin{frame}
+  \frametitle{General overview}
+
+  We screen for $130$ known plasmids and $400$ genes.
+  \bigskip
+  \pause
+
+  Output:
+  \begin{itemize}
+    \item MLST.
+    \item List of plasmids.
+    \begin{itemize}
+      \item Otherwise, similar plasmids.
+    \end{itemize}
+    \item List of genes of interest.
+  \end{itemize}
+  \bigskip
+  \pause
+
+  For the MLST, we need a \emph{consensus sequence}.
+  \begin{itemize}
+    \item As opposed to a list of variants, which we normally use.
+  \end{itemize}
+  \bigskip
+  \pause
+
+  For the list of plasmids and genes, we want a list we can open in Excel.
+  \bigskip
+\end{frame}
+
+\begin{frame}
+  \frametitle{Alignment}
+
+  \vspace{-0.75cm}
+  \begin{figure}[h]
+    \begin{center}
+      \includegraphics[height = \textheight]{k_align}
+    \end{center}
+    \caption{Variant calling.}
+  \end{figure}
+\end{frame}
+
+\begin{frame}
+  \frametitle{MLST}
+
+  Pipeline:
+  \begin{itemize}
+    \item Map all reads to the genome.
+    \item Make a consensus sequence.
+    \item Select genes.
+  \end{itemize}
+  \bigskip
+  \pause
+
+  Tools:
+  \begin{itemize}
+    \item \bt{tmap} for alignment.
+    \item \bt{samtools}/\bt{bcftools} for building a consensus sequence.
+    \item In-house program to select a region.
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Plasmid detection}
+
+  Pipeline:
+  \begin{itemize}
+    \item Select all reads that do not map to the genome.
+    \item Map these reads to each plasmid individually.
+    \item Calculate the \emph{horizontal coverage}.
+  \end{itemize}
+  \bigskip
+  \pause
+
+  Tools:
+  \begin{itemize}
+    \item \bt{samtools} to extract unmapped reads.
+    \item \bt{tmap} for alignment.
+    \item In-house program to make a \bt{wiggle} track.
+    \item In-house program to find \emph{covered regions}.
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Coverage}
+
+  \begin{figure}[h]
+    \vspace{-0.5cm}
+    \begin{center}
+      \includegraphics[height = \textwidth, angle = 270]{wiggle}
+    \end{center}
+    \caption{Coverage / depth histogram.}
+    \pause
+    \begin{picture}(0, 0)(-54, -49)
+      \color{black}
+      \put(-100, -26){\line(1, 0){100}}
+    \end{picture}
+  \end{figure}
+  \pause
+
+  \begin{figure}[h]
+    \vspace{-0.5cm}
+    \fbox{
+      \begin{picture}(100, 5)(0, 0)
+        \put(22, 2){\line(1, 0){1}}
+        \put(25, 2){\line(1, 0){2}}
+        \put(29, 2){\line(1, 0){1}}
+        \put(89, 2){\line(1, 0){1}}
+      \end{picture}
+    }
+    \caption{Coverage summary.}
+  \end{figure}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Antibiotic resistance gene detection}
+
+  Pipeline:
+  \begin{itemize}
+    \item Select genes from the genome or plasmids.
+    \item Calculate the non-\bt{N} content of the consensus sequence.
+  \end{itemize}
+  \bigskip
+  \pause
+
+  Tools:
+  \begin{itemize}
+    \item In-house program to select a region.
+    \item In-house program to calculate the non-\bt{N} percentage.
+  \end{itemize}
+\end{frame}
+
+\section{Challenges}
+\begin{frame}
+  \frametitle{Technical issues}
+
+  Between $66\%$ and $80\%$ of the reads map to the genome.
+  \bigskip
+  \pause
+
+  The remaining reads need to be mapped to the $130$ plasmids and $278$
+  additional genes.
+  \begin{itemize}
+    \item Alignment is not much faster for small reference sequences.
+  \end{itemize}
+  \bigskip
+  \pause
+
+  In total, the analysis would take around $\frac{130 + 278}3 = 136$ times
+  longer than the initial alignment.
+\end{frame}
+
+\begin{frame}
+  \frametitle{Clusters}
+
+  \begin{figure}
+    \includegraphics[width=0.95\textwidth]{DellBlade4}
+    \caption{Dell M610 blade server.}
+  \end{figure}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Automatic scheduling on a cluster}
+
+  \begin{lstlisting}[caption={Makefile snippet.}]
+    %.bam: %.sam
+      $(SAMTOOLS) view -bt $(call MAKEREF, $@) -o $@ $<
+
+    %.flagstat: %.bam
+      $(SAMTOOLS) flagstat $< > $@
+  \end{lstlisting}
+  \bigskip
+  \pause
+
+  To fully exploit a cluster, we use the \emph{Make} language.
+  \begin{itemize}
+    \item Only describe dependencies.
+    \item Implicit workflow.
+    \item Error control.
+  \end{itemize}
+  \bigskip
+  \pause
+
+  The pipeline we made is only $122$ lines long.
+  \begin{itemize}
+    \item Maintainable.
+  \end{itemize}
+\end{frame}
+
+\section{Results}
+\begin{frame}[fragile]
+  \frametitle{MLST}
+
+  \begin{lstlisting}[caption={Part of the consensus sequence of acrB.}]
+    CAATGATGATCGACAGTATGGCTGTGCTCGATATCTTCATTCTTGCGGCT
+    AAAGCGGCGGCGAACCACCACAAAGAATACCGGAACGAAGAAGATTGCCA
+    GTACCGTTGCGGTCACCATCCCGCCCATTACACCGGTACCTACTGCGTTC
+    TGCGCGCCGGAACCAGCACCAGTACTGATAACCAGCGGCATAACGCCGAG
+    GATAAACGCCAGCGAGGTCATCAGGATCGGACGTAAACGCATCCGCACCG
+    CATCAAGCGTCGCTTCAATCAGACCTTTACCTTCTTTATCCATCAAGTCT
+    TTGGCGAATTCGACGATAAGGATCGCGTTCTTCGCCGACAACCCAATGGT
+    TGTGAGCAGGCCTACCTGGAAGTAAACGTCATTGGTCAGGCCACGGAAGG
+  \end{lstlisting}
+  \bigskip
+  \pause
+
+  These sequences can be analysed directly by existing MLST classification
+  software.
+\end{frame}
+
+\begin{frame}
+  \frametitle{Plasmid detection}
+
+  \begin{table}[]
+    \begin{center}
+      \begin{tabular}{l|rrr@{$.$}lrr@{$.$}l}
+        Plasmid & Size & Reads & \multicolumn{2}{r}{\#3/\#2} & Cov &
+          \multicolumn{2}{r}{\#5/\#2}\\
+        \hline
+        \bt{NC\_001537} &  $3895$ & $18728$ &  $4$ & $808$ &  $1418$ & $0$
+          & $364$\\
+        \bt{NC\_002119} &  $9957$ &  $6130$ &  $0$ & $615$ &   $789$ & $0$
+          & $079$\\
+        \bt{NC\_002127} &  $3306$ & $11749$ &  $3$ & $553$ &  $1068$ & $0$
+          & $323$\\
+        \bt{NC\_002128} & $92721$ & $11824$ &  $0$ & $127$ & $35783$ & $0$
+          & $385$\\
+        \bt{NC\_002142} & $68817$ &  $8163$ &  $0$ & $118$ & $15938$ & $0$
+          & $231$\\
+        \bt{NC\_002145} &  $1549$ & $46141$ & $29$ & $787$ &  $1549$ & $1$
+          & $000$\\
+        \bt{NC\_002487} &  $5847$ & $11669$ &  $1$ & $995$ &  $1735$ & $0$
+          & $296$\\
+        \bt{NC\_002525} & $75582$ &   $420$ &  $0$ & $005$ &  $1325$ & $0$
+          & $017$\\
+        \bt{NC\_004429} &  $6349$ &   $961$ &  $0$ & $151$ &  $1858$ & $0$
+          & $292$\\
+      \end{tabular}
+    \end{center}
+    \caption{Part of the plasmids Excel file.}
+    \label{}
+  \end{table}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Gene detection}
+
+  \begin{table}[]
+    \begin{center}
+      \begin{tabular}{l|rrrr@{$.$}l}
+        Reference & Gene & Length & Cov & \multicolumn{2}{r}{\#4/\#3}\\
+        \hline
+        \bt{AB699171}     & CMY-87 &  $959$ &   $90$ & $0$ & $093$\\
+        \bt{AB715422}     & IMP-34 &  $742$ &  $125$ & $0$ & $168$\\
+        \bt{AB737978}     & ACT-16 & $1062$ &  $202$ & $0$ & $190$\\
+        \bt{AB753456}     & IMP-42 &  $739$ &  $417$ & $0$ & $564$\\
+        \bt{AB753457}     & IMP-40 &  $739$ &  $414$ & $0$ & $560$\\
+        \bt{AB753458}     & IMP-41 &  $731$ &  $364$ & $0$ & $497$\\
+        \bt{AC\_000091.1} &   accD &  $915$ &  $915$ & $1$ & $000$\\
+        \bt{AC\_000091.1} &   acrA & $1194$ & $1194$ & $1$ & $000$\\
+        \bt{AC\_000091.1} &   acrB & $3150$ & $3150$ & $1$ & $000$\\
+      \end{tabular}
+    \end{center}
+    \caption{Part of the genes Excel file.}
+    \label{}
+  \end{table}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Reusability}
+
+  Plasmids and genes can be added easily.
+  \bigskip
+  \pause
+
+  Plasmids:
+  \begin{itemize}
+    \item Download a reference sequence.
+    \item Index the reference sequence.
+    \item Put the files in the right folder.
+  \end{itemize}
+  \bigskip
+  \pause
+
+  Genes:
+  \begin{itemize}
+    \item Download a reference sequence.
+    \item Find the gene in this reference sequence.
+    \item Write down the coordinates of the gene.
+  \end{itemize}
+  \bigskip
+
+  This part is automated.
+\end{frame}
+
+\section{Questions?}
+\lastpagetemplate
+\begin{frame}
+  \begin{center}
+    Acknowledgements:
+    \bigskip
+    \bigskip
+
+    Sunita Paltansing
+
+    Henk Buermans
+
+    Sandra Bernards
+
+    Johan den Dunnen
+  \end{center}
+\end{frame}
+
+\end{document}
--- a/gene_profiling/ion-torrent.jpg
+++ b/gene_profiling/ion-torrent.jpg
+../presentation-pics/pics/ion-torrent.jpg
\ No newline at end of file
--- a/gene_profiling/k_align.png
+++ b/gene_profiling/k_align.png
+../presentation-pics/pics/k_align.png
\ No newline at end of file
--- a/gene_profiling/lgtc_logo.eps
+++ b/gene_profiling/lgtc_logo.eps
+../presentation/lgtc_logo.eps
\ No newline at end of file
--- a/gene_profiling/lumc_logo.eps
+++ b/gene_profiling/lumc_logo.eps
+../presentation/lumc_logo.eps
\ No newline at end of file
--- a/gene_profiling/lumc_logo_small.eps
+++ b/gene_profiling/lumc_logo_small.eps
+../presentation/lumc_logo_small.eps
\ No newline at end of file
--- a/gene_profiling/nbic_logo.eps
+++ b/gene_profiling/nbic_logo.eps
+../presentation/nbic_logo.eps
\ No newline at end of file
--- a/gene_profiling/ngi_logo.eps
+++ b/gene_profiling/ngi_logo.eps
+../presentation/ngi_logo.eps
\ No newline at end of file
--- a/gene_profiling/nwo_logo_en.eps
+++ b/gene_profiling/nwo_logo_en.eps
+../presentation/nwo_logo_en.eps
\ No newline at end of file
--- a/gene_profiling/nwo_logo_nl.eps
+++ b/gene_profiling/nwo_logo_nl.eps
+../presentation/nwo_logo_nl.eps
\ No newline at end of file
--- a/gene_profiling/plasmid_en.eps
+++ b/gene_profiling/plasmid_en.eps
+../presentation-pics/pics/plasmid_en.eps
\ No newline at end of file
--- a/gene_profiling/ul_logo.eps
+++ b/gene_profiling/ul_logo.eps
+../presentation/ul_logo.eps
\ No newline at end of file
--- a/gene_profiling/wiggle.eps
+++ b/gene_profiling/wiggle.eps
+../presentation-pics/pics/wiggle.eps
\ No newline at end of file