diff --git a/01 - Welcome.ipynb b/01 - Welcome.ipynb index 0adb27f6d88837561f1a724dd4c4f64383c44298..80542ed8d2d7aefddb9ce67cc736847651427646 100644 --- a/01 - Welcome.ipynb +++ b/01 - Welcome.ipynb @@ -2,7 +2,7 @@ "metadata": { "celltoolbar": "Slideshow", "name": "", - "signature": "sha256:6c570ac428c0fbfa0115d6c05d52e3c05410cd3de42ae35e7ee7feb0f435a641" + "signature": "sha256:a016c0815699df8dae5155425df47f34e59e2aac64d4fa7f4edaa9c43a33ad3f" }, "nbformat": 3, "nbformat_minor": 0, @@ -259,8 +259,8 @@ { "html": [ "<style>/* Remove the vertical scrollbar added by nbconvert. */\n", - "html {\n", - " overflow-y: hidden;\n", + ".reveal {\n", + " overflow: hidden;\n", "}\n", "\n", "/* Workaround some highlight.js bugs in language autodetection. */\n", @@ -287,7 +287,7 @@ "output_type": "pyout", "prompt_number": 1, "text": [ - "<IPython.core.display.HTML at 0x33caa50>" + "<IPython.core.display.HTML at 0x1f71a50>" ] } ], diff --git a/02 - Introduction to Python (1).ipynb b/02 - Introduction to Python (1).ipynb index 2e37bf916ba9bcb9889b30ac88fc076785c6e3e4..fdc809c37ffd5a2409c0a724772151d878652595 100644 --- a/02 - Introduction to Python (1).ipynb +++ b/02 - Introduction to Python (1).ipynb @@ -2,7 +2,7 @@ "metadata": { "celltoolbar": "Slideshow", "name": "", - "signature": "sha256:06632e3aab84ba95070db9bd6588d5a6fa8e7440ebe85cac2ce1a3c91911db47" + "signature": "sha256:48b5e4d61de5dfa8cebfe8fe0c78f8587586ee15c8ea50fe5c1d67f8ea5caa4a" }, "nbformat": 3, "nbformat_minor": 0, @@ -893,8 +893,8 @@ { "html": [ "<style>/* Remove the vertical scrollbar added by nbconvert. */\n", - "html {\n", - " overflow-y: hidden;\n", + ".reveal {\n", + " overflow: hidden;\n", "}\n", "\n", "/* Workaround some highlight.js bugs in language autodetection. 
*/\n", @@ -919,13 +919,13 @@ ], "metadata": {}, "output_type": "pyout", - "prompt_number": 25, + "prompt_number": 1, "text": [ - "<IPython.core.display.HTML at 0x7f39d4043c90>" + "<IPython.core.display.HTML at 0x3463a50>" ] } ], - "prompt_number": 25 + "prompt_number": 1 } ], "metadata": {} diff --git a/02 - Introduction to Python (2).ipynb b/02 - Introduction to Python (2).ipynb index ff39ed75c0ddd9fee4d78864eb313a5965a61f31..d7010ad5a4bc1c27708e86ce779188c6d3977840 100644 --- a/02 - Introduction to Python (2).ipynb +++ b/02 - Introduction to Python (2).ipynb @@ -2,7 +2,7 @@ "metadata": { "celltoolbar": "Slideshow", "name": "", - "signature": "sha256:57d29de0d77287096e49247676a76ea06e3730e1e5f819ecfc6b4b3f6d85e4e9" + "signature": "sha256:7b4c4307ccc127e749bcc6fae78706177f4e98973022d13424e5a50a8325f24e" }, "nbformat": 3, "nbformat_minor": 0, @@ -1842,8 +1842,8 @@ { "html": [ "<style>/* Remove the vertical scrollbar added by nbconvert. */\n", - "html {\n", - " overflow-y: hidden;\n", + ".reveal {\n", + " overflow: hidden;\n", "}\n", "\n", "/* Workaround some highlight.js bugs in language autodetection. 
*/\n", @@ -1868,13 +1868,13 @@ ], "metadata": {}, "output_type": "pyout", - "prompt_number": 60, + "prompt_number": 1, "text": [ - "<IPython.core.display.HTML at 0x24f0cd0>" + "<IPython.core.display.HTML at 0x1873a50>" ] } ], - "prompt_number": 60 + "prompt_number": 1 } ], "metadata": {} diff --git a/02 - Introduction to Python (3).ipynb b/02 - Introduction to Python (3).ipynb index 9360d5c169dd44b0294b85c6f20168b871e12730..792c0926b7c98083cfb53aa045d31ddf4fff937a 100644 --- a/02 - Introduction to Python (3).ipynb +++ b/02 - Introduction to Python (3).ipynb @@ -2,7 +2,7 @@ "metadata": { "celltoolbar": "Slideshow", "name": "", - "signature": "sha256:74676c0cad25c0954fa4f8352dda1c9010bd747de1752b4ccc667d9132427fdd" + "signature": "sha256:255f999074050e52200e04b7179ca89a373daf14311e382c4ab05b4444cf392e" }, "nbformat": 3, "nbformat_minor": 0, @@ -851,8 +851,8 @@ { "html": [ "<style>/* Remove the vertical scrollbar added by nbconvert. */\n", - "html {\n", - " overflow-y: hidden;\n", + ".reveal {\n", + " overflow: hidden;\n", "}\n", "\n", "/* Workaround some highlight.js bugs in language autodetection. */\n", @@ -877,13 +877,13 @@ ], "metadata": {}, "output_type": "pyout", - "prompt_number": 24, + "prompt_number": 1, "text": [ - "<IPython.core.display.HTML at 0x1965a50>" + "<IPython.core.display.HTML at 0x2e0ea50>" ] } ], - "prompt_number": 24 + "prompt_number": 1 } ], "metadata": {} diff --git a/03 - More Python goodness (1).ipynb b/03 - More Python goodness (1).ipynb index b5f1303c0433cecab87762440284253b4124e561..74e20a78f7bbe5f23090547a99bfb628b8da7947 100644 --- a/03 - More Python goodness (1).ipynb +++ b/03 - More Python goodness (1).ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:70bfb8ad86899454fd23b5db2e3184de272ec729cf9cdc477127a1e7d7f0f60b" + "signature": "sha256:bc50d15e4618afafc7cbb3feab475a110d7e039ac562addd73b3cc9d6a5b1137" }, "nbformat": 3, "nbformat_minor": 0, @@ -33,7 +33,12 @@ "3. [String methods](#stringmethods)\n", "4. 
[Comments and docstrings](#docstrings)\n", "5. [Detour: PEP8 and other PEPs](#peps)\n", - "6. [Errors and exceptions](#exceptions)" + "6. [Errors and exceptions](#exceptions)\n", + "7. Working with modules\n", + "8. Examples from the standard library\n", + "9. Reading and writing files\n", + "10. Assignment: Finding the most common 7-mer in a FASTA file\n", + "11. Further reading" ] }, { diff --git a/03 - More Python goodness (2).ipynb b/03 - More Python goodness (2).ipynb index 26928b254adde1a9565bde8bc2fe54262514cb01..eb1c032df1f5dcd95948a2fe2c3fe9c63c884f40 100644 --- a/03 - More Python goodness (2).ipynb +++ b/03 - More Python goodness (2).ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:449f59d356a6f72e179ef39e58da7265eae4ecb8253da3c52cc6b41277fe2909" + "signature": "sha256:0892ec60b3a9f342f46559142d68b8c002cd1f86043508cf29b9fb892b41d031" }, "nbformat": 3, "nbformat_minor": 0, @@ -28,6 +28,12 @@ "source": [ "## Table of contents\n", "\n", + "1. Working with scripts\n", + "2. The standard library\n", + "3. String methods\n", + "4. Comments and docstrings\n", + "5. Detour: PEP8 and other PEPs\n", + "6. Errors and exceptions\n", "7. [Working with modules](#modules)\n", "8. [Examples from the standard library](#stdlib-examples)\n", "9. 
[Reading and writing files](#io)\n", diff --git a/05 - IPython Notebook.ipynb b/05 - IPython Notebook.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7d8a8e98b1bedd068895ff3d1d8db9341a721066 --- /dev/null +++ b/05 - IPython Notebook.ipynb @@ -0,0 +1,1637 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:ffac64912e6dd086a01e7335bc70811bb3d120474fe85a1a87e60f80994db347" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "-" + } + }, + "source": [ + "\n", + "***\n", + "\n", + "[Michiel van Galen](mailto:m.van_galen@lumc.nl), [Department of Human Genetics, Leiden University Medical Center](http://humgen.nl)\n", + "\n", + "Examples and ideas taken from: [IPython Documentation](http://nbviewer.ipython.org/github/ipython/ipython/blob/2.x/examples/Index.ipynb) and [The role of computing in science](http://www.socrates.if.usp.br/~wtc/?q=role-computing-science)\n", + "\n", + "License: [Creative Commons Attribution 3.0 License (CC-by)](http://creativecommons.org/licenses/by/3.0)" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Foreword" + ] + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Requirements on scientific computing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"High-profile journals have called for increased openness in computational sciences. Some prestigious journals, including Science, have even started to demand of authors to provide the source code for simulation software used in publications to readers upon request.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Reproducible Research in Computational Science, Roger D. Peng, Science 334, 1226 (2011).\n", + "- Shining Light into Black Boxes, A. 
Morin et al., Science 336, 159-160 (2012).\n", + "- The case for open computer programs, D.C. Ince, Nature 482, 485 (2012)." + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "The cornerstone of the scientific method:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Replication & Reproduction" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "To achieve this in scientific computing (programming):" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Any source code which generates data should be:\n", + " - Tracked!\n", + " - Backed up and secured\n", + " - Ideally published online\n", + " - Additionally: Also track external software versions and settings\n" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Revision Control System" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Systems such as \n", + " - Git\n", + " - SVN\n", + " \n", + " \n", + " \n", + "- Online repository\n", + " - Github" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Check out our one day Git course:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Git course @ LUMC HumGen](https://humgenprojects.lumc.nl/trac/humgenprojects/wiki/gitcourse)" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Python in scientific computing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Large community of users\n", + "- Extensive ecosystem of scientific libraries and environments\n", + "- Parallel processing with processes and threads\n", + "- Interprocess communication (MPI)\n", + "- GPU computing (OpenCL and CUDA)\n", + "- Readily available and suitable for use on high-performance computing clusters.\n", + "- No license costs, no unnecessary use of research budget." 
+ ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Lots of reasons to python!\n" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "How to python?" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "The python interpreter" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Standard way of running code\n", + "- Reads and runs the code in a file\n", + "\n", + "$ python my-program.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Alternatively, start the interpreter interactively by simply typing python\n", + "- Not very convenient due to a number of limitations\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$ python" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "IPython (Interactive Python)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Interactive shell with improved user friendliness" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Command history\n", + "- Tab auto-completion\n", + "- In-line editing of code\n", + "- Object introspection, and automatic extract of documentation strings\n", + "- Good interaction with operating system shell\n" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "IPython Notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Started by typing 'ipython notebook' in the directory where you want to store notebooks\n", + "- Opens a new browser window with an index page where existing notebooks are shown\n", + "- New notebooks can be created in that same directory" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "IPython Notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Cell based executable workflow\n", + "- Interactive computational documents\n", + "- Add elegant explanatory text\n", + "- Direct output of computations\n", + "- Add figures and video\n", + "- Integration in Git\n", + "- Export your notebooks to PDF or HTML (nbconvert)\n", + "- Share notebooks easily with nbviewer" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Why Notebook?" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "\"Web-based interactive computational environment where you can combine code execution, text, mathematics, plots and rich media into a single document.\"" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "\"It is based on the IPython shell, but provides a cell-based environment with great interactivity, where calculations can be organized documented in a structured way.\"" + ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "<br></br><br></br><br></br><br></br>\n", + "<font color='darkgreen'>Track, store and share elegant code using IPython Notebooks & Git!</font>\n" + ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "<font color='darkgreen'>Replicability & Reproducibility</font>\n", + "<br></br><br></br><br></br><br></br>" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Getting started with IPython Notebook" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Where to Notebook?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Install locally\n", + " - See instructions in this course\n", + " - Linux, Windows\n", + " \n", + " \n", + "- Remotely, Shark cluster LUMC\n", + " - Connect to shark\n", + " - Type 'notebook' and click the given URL" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "The notebook user interface" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Cell based workflow\n", + "- Move between cells with the arrow keys or clicks with the mouse" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "There are two different modes from which always one is active" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Edit mode\n", + " - Hit ENTER or click the edit area to change to edit mode\n", + " - Indicated with a <font color='green'>green</font> cell border\n", + " - Edit your cell as if a normal text editor\n", + " \n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Command mode\n", + " - Hit ESCAPE to change into command mode\n", + " - Indicated with a <font color='grey'>grey</font> cell border\n", + " - Edit the notebook as a whole\n", + "\n", + "Note: Different shortcuts apply in both modes!\n", + "\n", + "\n" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "For example. 
Press 'b' to add a cell below in command mode to add a cell" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "The toolbar is clickable" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Edit mode shortcuts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "- esc : command mode\n", + "- shift+enter : run cell\n", + "- ctrl+enter : run cell, select below\n" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Command mode shortcuts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- enter : edit mode\n", + "- shift+enter : run cell\n", + "- ctrl+enter : run cell, select below\n", + "\n", + "\n", + "- y : to code\n", + "- m : to markdown\n", + "\n", + "\n", + "- ctrl+k : move cell up\n", + "- ctrl+j : move cell down\n", + "\n", + "\n", + "- a : insert cell above\n", + "- b : insert cell below\n", + "\n", + "\n", + "- x : cut cell\n", + "- c : copy cell\n", + "- v : paste cell below\n", + "\n", + "\n", + "- z : undo last delete\n", + "- d : delete cell (press twice)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Press 'h' to show the help." + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Saving your notebook\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- The notebook automatically saves\n", + "- You can manually click the 'save' icon or press CTRL+s" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Checkpoints**\n", + "\n", + "Currently only single checkpoints are stored, but multiple checkpoints will be enabled for future versions of IPython and Bookstore. 
" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Two different cell types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Code cell\n", + " - Contains code \n", + " - Inline comments\n", + " - Python or system calls\n" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# This is a code cell\n", + "x = 10" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 42 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Markdown cell\n", + " - Format your notebook\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is a markdown cell which allows to format your document **nicely** and add *context* to code cells." + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Set the cell type of the selected cell" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Select the type from the dropdown box in the toolbar\n", + "- In command mode only: Press 'y' for code or 'm' for markdown" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Code cells" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Mainly contain Python code" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "All of the python goodness works inside code cells!" 
+ ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Running cells\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Cells run asynchronous\n", + "- Results are persistent\n", + "- Save time by running intensive code blocks only once" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run a code cell by pressing the play button or shift+enter" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# Assign a value\n", + "a = 12\n", + "# Output a to the power of 3\n", + "print a**3" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "1728\n" + ] + } + ], + "prompt_number": 10 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the 'Cell' option in the toolbar to run multiple cells at once." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- If the kernel is busy running it states 'Busy' in the header of the window or tab\n", + "- The indicator in the top right corner shows the kernel state" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Other uses of code cells\n" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Except for code, some system aliases are available such as 'ls':" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "ls" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "biopython.ipynb \u001b[0m\u001b[01;34mimages\u001b[0m/ person.py \u001b[01;34msolutions\u001b[0m/\r\n", + "classes.ipynb INSTALL.md python.ipynb \u001b[01;34mstyles\u001b[0m/\r\n", + "\u001b[01;34mdata\u001b[0m/ matplotlib.ipynb README.md welcome.ipynb\r\n", + "\u001b[01;34mexamples\u001b[0m/ more-python.ipynb \u001b[01;32mseq_toolbox.py\u001b[0m*\r\n", + "exercises.ipynb 
Notebook.ipynb sequencer.ipynb\r\n", + "git.ipynb numpy.ipynb sequencer_old.py\r\n" + ] + } + ], + "prompt_number": 69 + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Any command line program can be run using '!' with string interpolation from Python variables:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "message = 'Some text'\n", + "!echo $message" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "Some text\r\n" + ] + } + ], + "prompt_number": 70 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "!samtools" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\r\n", + "Program: samtools (Tools for alignments in the SAM format)\r\n", + "Version: 0.1.19-44428cd\r\n", + "\r\n", + "Usage: samtools <command> [options]\r\n", + "\r\n", + "Command: view SAM<->BAM conversion\r\n", + " sort sort alignment file\r\n", + " mpileup multi-way pileup\r\n", + " depth compute the depth\r\n", + " faidx index/extract FASTA\r\n", + " tview text alignment viewer\r\n", + " index index alignment\r\n", + " idxstats BAM index stats (r595 or later)\r\n", + " fixmate fix mate information\r\n", + " flagstat simple stats\r\n", + " calmd recalculate MD/NM tags and '=' bases\r\n", + " merge merge sorted alignments\r\n", + " rmdup remove PCR duplicates\r\n", + " reheader replace BAM header\r\n", + " cat concatenate BAMs\r\n", + " bedcov read depth per BED region\r\n", + " targetcut cut fosmid regions (for fosmid pool only)\r\n", + " phase phase heterozygotes\r\n", + " bamshuf shuffle and group alignments by name\r\n", + "\r\n" + ] + } + ], + "prompt_number": 45 + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Autocompletion works also by pressing TAB:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + 
"import numpy\n", + "numpy.random.random()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 47, + "text": [ + "0.7211185564561403" + ] + } + ], + "prompt_number": 47 + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "\u00a7 Exercise : My first Notebook (1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Start a Notebook session.\n", + " - $ ipython notebook\n", + "\n", + "This opens a web browser and shows existing notebooks.\n", + " - Create a new notebook by clicking the 'New notebook' button.\n", + "\n", + "A new tab will open with a fresh notebook. Rename your notebook to something useful\n", + " - Click on the current name (Untitled1) and edit this\n", + "\n", + "Add the code shown below to some **_code_** cells\n", + " - Add cells by pressing the '+' button or ALT+ENTER\n", + " - Remember the keyboard shortcuts or the help function ('h')\n", + "\n", + "\n", + "Notice the last cell produced output! We will continue to develop this notebook later in this session." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def complement(seq):\n", + " complements = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C'}\n", + " c_seq = ''\n", + " for n in seq:\n", + " c_seq = c_seq + complements[n]\n", + " return c_seq" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 40 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "complement('ACGT')" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 41, + "text": [ + "'TGCA'" + ] + } + ], + "prompt_number": 41 + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Output from code cells" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- When a cell is run it can generate output\n", + "- This is shown below the cell in the output area\n", + "- Output is asynchronous\n", + "- Outputs are objects and the last one is available under the variable '_'\n", + " " + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Example: Show some output and use it in another function" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "numpy.random.rand(3)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 86, + "text": [ + "array([ 0.96757675, 0.54000181, 0.63633455])" + ] + } + ], + "prompt_number": 86 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "numpy.sin(_)\n" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 89, + "text": [ + "array([ 0.73353825, 0.49178408, 0.55988864])" + ] + } + ], + "prompt_number": 89 + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Scrollbars are automatically added for large outputs" + ] + }, + { + "cell_type": "code", + 
"collapsed": false, + "input": [ + "for i in range(2):\n", + " print i" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "0\n", + "1\n" + ] + } + ], + "prompt_number": 115 + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Exceptions are formatted nicely:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# Let's distribute a few cookies to nobody\n", + "cookies = 3\n", + "persons = 0\n", + "share = cookies / persons\n", + "print share" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "ename": "ZeroDivisionError", + "evalue": "integer division or modulo by zero", + "output_type": "pyerr", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mZeroDivisionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m<ipython-input-51-0d805d7db48f>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mcookies\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m3\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mpersons\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mshare\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcookies\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0mpersons\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[0mshare\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mZeroDivisionError\u001b[0m: integer division or modulo by zero" + ] + } + ], + "prompt_number": 51 + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Markdown cells" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Formatted **text** _can_ **_be_** added to IPython Notebooks using Markdown cells. 
\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Markdown is a popular markup language that also accepts inline HTML. \n", + "- Its specification can be found here:\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "http://daringfireball.net/projects/markdown/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To create a markdown cell:\n", + "- First focus on the cell you want to format\n", + "- Then select 'Markdown' from the dropdown box in the toolbar or press 'm' in command mode\n", + "- Note that you need to run a markdown cell to show the result\n", + " - Click 'play' or SHIFT+ENTER" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Some markdown examples: " + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Headers can be put by using hashes '#'" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# Header 1\n", + "## Header 2\n", + "### Header 3" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Header 1\n", + "## Header 2\n", + "### Header 3" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Lists" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "- A list\n", + " - A sublist\n", + " - Yet another level\n", + " - And so on" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- A list\n", + " - A sublist\n", + " - Yet another level\n", + " - And so on" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "General HTML" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since markdown accepts inline HTML, this will also work" + ] + }, + { + "cell_type": "code", + "collapsed": false, + 
"input": [ + "<table border=\"1\" style=\"width:200px\">\n", + "<tr>\n", + " <td>Jill</td>\n", + " <td>Smith</td> \n", + " <td>50</td>\n", + "</tr>\n", + "<tr>\n", + " <td>Eve</td>\n", + " <td>Jackson</td> \n", + " <td>94</td>\n", + "</tr>\n", + "</table>" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<table border=\"1\" style=\"width:200px\">\n", + "<tr>\n", + " <td>Jill</td>\n", + " <td>Smith</td> \n", + " <td>50</td>\n", + "</tr>\n", + "<tr>\n", + " <td>Eve</td>\n", + " <td>Jackson</td> \n", + " <td>94</td>\n", + "</tr>\n", + "</table>" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Markdown also supports formulas" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nicely formatted" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "\\begin{equation*} \\left( \\sum_{k=1}^n a_k b_k \\right)^2 \\leq \\left( \\sum_{k=1}^n a_k^2 \\right) \\left( \\sum_{k=1}^n b_k^2 \\right) \\end{equation*}\n" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\\begin{equation*} \\left( \\sum_{k=1}^n a_k b_k \\right)^2 \\leq \\left( \\sum_{k=1}^n a_k^2 \\right) \\left( \\sum_{k=1}^n b_k^2 \\right) \\end{equation*}\n" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "We can also show figures. These can be locally or linked from the www." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Notebook has access to all the files down in the directory" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "<br></br><br></br>\n", + "Some other examples:" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Python supports videos also. Note this is not a markdown cell, but a code cell using a python package." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from IPython.display import YouTubeVideo\n", + "# a talk about IPython at Sage Days at U. Washington, Seattle.\n", + "# Video credit: William Stein.\n", + "YouTubeVideo('1j_HxD4iLn8')" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "\n", + " <iframe\n", + " width=\"400\"\n", + " height=300\"\n", + " src=\"https://www.youtube.com/embed/1j_HxD4iLn8\"\n", + " frameborder=\"0\"\n", + " allowfullscreen\n", + " ></iframe>\n", + " " + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 1, + "text": [ + "<IPython.lib.display.YouTubeVideo at 0x3224890>" + ] + } + ], + "prompt_number": 1 + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Plotting with matplotlib" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "More on this later in the course." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "plt.plot([x**2 for x in range(100)])\n", + "plt.show()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "display_data", + "png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAEACAYAAABYq7oeAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHX1JREFUeJzt3XmYFNW5x/HviCASEQQDsgZEUHFBwF2UUREQZVEQEBcS\nkCSiEhNXjDfMjcarPmhERKKICERG9hGQHWlBYFhkERzZBWWQQfZFYLa+f7w1djOydvdMVXf9Ps9T\nT1efqep+p2D67bPUOSAiIiIiIiIiIiIiIiIiIiIiIiIiIuJJHwJZwMqwsgrADGAtMB0oH/az3sA6\nYDXQPKy8sfMa64B+YeVnASOd8nTgd7ENX0REisLNQEOOTg6vA886+88Brzr79YHlQEmgFrAeSHJ+\ntgi41tmfDLR09nsC7zr7nYBPYhq9iIgUmVocnRxWA5Wd/Quc52C1hufCjpsKXA9UAb4NK+8M/Cfs\nmOuc/TOBn2IVtIiIRO6MCM6pjDU14TwWJIqqwJaw47YA1Y5RnumU4zz+4OznAnuxZisREXFRJMkh\nXNDZREQkgZwZwTlZWHPSNqzJaLtTngnUCDuuOlZjyHT2C5cXnFMT2OrEUg7YVfgN69SpE9ywYUME\noYqI+NoG4KJIToyk5jAB6OrsdwXSwso7A6WA2kBdrCN6G7AP61tIAh4CPj3Ga3UAZh3rDTds2EAw\nGNQWDNKnTx/XY/DKpmuha6FrceINqBPBZzxw8ppDKtAUOB/rG/gHNjppFNAd2AR0dI7NcMozsP6D\nnoSanHoCHwFnY6OVpjrlg4Hh2FDWnVhyERERl50sOdx/nPJmxyl/xdkK+wq44hjlRwglFxERicLg\nwXDvvXDeedG/VrQd0lLMkpOT3Q7BM3QtQnQtQvx6LUaPhldegaSkkx97KmL0MkUu6LSfiYhIIWvX\nQpMmMGUKNG4cKk+yTBHR57xqDiIicezQIbjvPvjnP49ODNFSzUFEJI517w6HD8N///vrJqVoag6R\n3OcgIiIeMGQIzJ8PixfHrq+hgGoOIiJxaPlyuOMO+OILqF//2Meoz0FExEf27IEOHeDtt4+fGKKl\nmoOISBwJBu1ehmrV4J13Tnys+hxERHyib1/YuhU+KeLVb1RzEBGJE4EAdO4MixZBzZonP159DiIi\nCW7rVujSBYYNO7XEEC0lBxERj8vJgY4doWdPaN68eN5TzUoiIh735JOwYQN8+imccRpf6dUhLSKS\noFJTYeJEWLLk9BJDtFRzEBHxqJUr4bbbYOZMaNDg9M9Xh7SISILZs8fuZ/j3vyNLDNFSzUFExGPy\n86FtW6hVC/r3j/x11OcgIpJAXn4Zdu+GsWPdi0HJQUTEQz77DN5/32ZaLVXKvTjUrCQi4hHr1sFN\nN0FaGtx4Y/Svpw5pEZE4d+AAtGtnK7rFIjFESzUHERGXBYN2B/S558IHH8Ru4R51SIuIxLHXXoPv\nv7eFe2K9oluklBxERFw0ZYoNV124EEqXdjuaEI/kqJNSs5KIJJz1661/Ydw4aNIk9q+vDmkRkTiz\nf3+oA7ooEkO0VHMQESlm+fnQvj389rfw3ntF18+gDmkRkTjy0kuwfTuMHOmdDujClBxERIrR+PEw\neLAt9enmHdAn49Gc9StqVhKRuLdqFdx6q41Quvrq
on8/dUiLiHjcjh3Qpo1NwV0ciSFaqjmIiBSx\nnBxo0QKuucZueCsu0dQclBxERIrY44/Dd9/BhAlQokTxva9GK4mIeNR778GsWZCeXryJIVqqOYiI\nFJFAADp1gi+/hLp1i//91SEtIuIxGzdC587w8cfuJIZoRZMcegPfACuBEcBZQAVgBrAWmA6UL3T8\nOmA10DysvLHzGuuAflHEIyLiCfv2QevW8D//A82auR1NZCJNDrWAHkAj4AqgBNAZeB5LDvWAWc5z\ngPpAJ+exJfAuoarOQKA7UNfZWkYYk4iI6/LyoEsXaNoUHnvM7WgiF2ly2AfkAGWwTu0ywFagDTDU\nOWYo0M7ZbwukOudsAtYD1wFVgLLAIue4YWHniIjEneeeg0OHoF+ct4NEOlppF/AG8D1wCJiG1Rgq\nA1nOMVnOc4CqQHrY+VuAaliy2BJWnumUi4jEncGDbbhqejqULOl2NNGJNDnUAZ7Empf2AqOBBwsd\nE3S2mEhJSfllPzk5meTk5Fi9tIhI1AIBeOEFmDsXKlRwK4YAgUAgJq8V6VDWTsAdwCPO84eA64Hb\ngFuBbViT0WzgEkJ9D686j1OBPsBm55hLnfL7gabAnwu9n4ayiohnrV9vazJ8/DHcfrvb0YS4MZR1\nNZYMznbeuBmQAUwEujrHdAXSnP0JWId1KaA21vG8CEsi+7D+hyQsyRScIyLiebt3w913Q0qKtxJD\ntCJtVlqBdR4vAfKBpcD7WOfyKGz00Sago3N8hlOeAeQCPQk1OfUEPsISzWSsViEi4nk5OdChA9x5\nJ/y5cHtHnNMd0iIiEQgG4U9/gh9/hLQ0b06NobmVRESK2ZtvwsKFNjWGFxNDtJQcREROU1qarcuw\nYAGULet2NEVDzUoiIqdhyRLrY5g6FRo3djuaE9PEeyIixeD776FtW/jgA+8nhmgpOYiInIK9e23I\n6lNPWYJIdGpWEhE5iZwcuOsuuOgiGDAAkuLkk1PLhIqIFJFgEHr0gG3brCP6zDgaxqOhrCIiReT/\n/g+WLoU5c+IrMUTLR7+qiMjpGTHC1oBesADOOcftaIqXmpVERI4hEICOHeHzz+Hyy92OJjIayioi\nEkPffAOdOsEnn8RvYoiWkoOISJitW21k0htvwG23uR2Ne5QcREQc+/ZZYujRAx4svHyZz6jPQUQE\nyM62m9wuvBAGDoyfexlORPc5iIhEIRiE3//eFu4ZNy5xhqzqPgcRkSi8+CKsWWMjkxIlMURLl0FE\nfG3AABg9GubNgzJl3I7GO5QcRMS3xo6FV16BuXPht791OxpvUZ+DiPjSnDm2/vO0adCwodvRFA3d\nBCcichpWrYL77oPU1MRNDNFSchARX9m82VZy69cPbr/d7Wi8S8lBRHxjxw5o3hyefRY6d3Y7Gm9T\nn4OI+MKBAzYdxh13wL/+5XY0xUM3wYmInEB2NrRuDTVqwKBBiXH386lQchAROY68PHjgATh8GMaM\n8ddNbrpDWkTkGIJBeOIJW+Jz6lR/JYZo6VKJSMJKSYGFC2H2bChd2u1o4ouSg4gkpH797D6GL7+E\nc891O5r4o+QgIgln6FBbrGfuXKhUye1o4pM6pEUkoaSlwaOPWlPSJZe4HY271CEtIoJNuf3HP8KU\nKUoM0VJyEJGEkJ4OnTrZcNXGjd2OJv5p+gwRiXsrVkDbttbX0LSp29EkBiUHEYlra9faRHrvvAOt\nWrkdTeJQchCRuLV5s82V9PLLNgW3xE40yaE8MAb4FsgArgMqADOAtcB055gCvYF1wGqgeVh5Y2Cl\n87N+UcQjIj6ydatNuf3UU9Ctm9vRJJ5okkM/YDJwKXAl9qH/PJYc6gGznOcA9YFOzmNL4F1Cw6sG\nAt2Bus7WMoqYRMQHfvrJagzdu0OvXm5Hk5giTQ7lgJuBD53nucBeoA0w1CkbCrRz9tsCqUAOsAlY\nj9U0qgBlgUXO
ccPCzhER+ZU9e6BFC7jnHujd2+1oElekyaE28BMwBFgKDAJ+A1QGspxjspznAFWB\nLWHnbwGqHaM80ykXEfmVffugZUsbkfTSS25Hk9giTQ5nAo2w5qFGwEFCTUgFgs4mIhK1gwfhrrug\nUSN4803/rMnglkhvgtvibIud52OwDudtwAXOYxVgu/PzTKBG2PnVnfMznf3w8sxjvWFKSsov+8nJ\nySQnJ0cYuojEm0OHoE0bqFvXhqwqMRxbIBAgEAjE5LWiucRzgEewkUkpQBmnfCfwGlaTKO881gdG\nANdizUYzgYuwmsVCoBfW7/AZ8DYwtdB7aW4lEZ86fNj6F847D4YPhxIl3I4ofri1ElwD4AOgFLAB\n+ANQAhgF1MQ6njsCe5zjXwC6YZ3XfwGmOeWNgY+As7HRT8cae6DkIOJD2dlw771QpgyMGKHFek6X\nlgkVkYSTkwMdO1oT0siRULKk2xHFH83KKiIJJScHunSx9Z/HjFFicIOSg4h4Sm4uPPCAjU4aPx5K\nlXI7In9SchARzyhIDPv3W2I46yy3I/IvJQcR8YTcXHjoIdi711ZzK13a7Yj8TclBRFxXkBh27YJP\nP1Vi8AIlBxFxVUFTkmoM3qLkICKuyckJ9TEoMXiLkoOIuCInB+6/36bGGD9eicFrlBxEpNgdOQKd\nOkF+Powbp1FJXqRlQkWkWB0+DO3bwxln2A1uSgzepOQgIsXm0CFo187mSho5Uje4eZmSg4gUiwMH\nbD2GihVtEj1NieFtSg4iUuT27rUV3GrXhmHDNLtqPFByEJEitWsX3HEHXHklDBqk9RjihZKDiBSZ\n7dvh1lvh5pthwADrhJb4oH8qESkSW7bALbfYKm59+2ppz3ij5CAiMbdxoyWG7t0hJUWJIR4pOYhI\nTGVkQNOm8PTT8MwzbkcjkdKYARGJma++suGqffvCgw+6HY1EQ8lBRGJizhzo0AHef99udJP4pmYl\nEYnaZ5/ZlBgjRigxJAolBxGJyogR1vE8aRI0a+Z2NBIralYSkYgNGACvvgqzZsFll7kdjcSSkoOI\nnLZgEP75Txg+3Poaatd2OyKJNSUHETkteXnQqxfMnw/z5kHlym5HJEVByUFETtmRI/DwwzYtRiAA\n5cq5HZEUFXVIi8gp2bfP7mHIzYUpU5QYEp2Sg4ic1LZtkJwMdevCqFFa79kPlBxE5ITWrYObboJ7\n74V339WU236hPgcROa7Fi6FtWxuZ9MgjbkcjxUnJQUSOadIk6NYNBg+G1q3djkaKm5qVRORXBg2C\nHj1g4kQlBr9SzUFEfhEMwj/+YVNizJljHdDiT0oOIgJAdrb1K6xZAwsWQKVKbkckblKzkoiwZw+0\nbAn798Ps2UoMouQg4nubNtlQ1csvhzFjoEwZtyMSL4g2OZQAlgETnecVgBnAWmA6UD7s2N7AOmA1\n0DysvDGw0vlZvyjjEZHTsGiRJYY//Qneflv3MEhItMnhL0AGEHSeP48lh3rALOc5QH2gk/PYEngX\nKFhyfCDQHajrbC2jjElETsH48TYdxsCBNpGeSLhokkN1oBXwAaEP+jbAUGd/KFCwJlRbIBXIATYB\n64HrgCpAWWCRc9ywsHNEpAgEg/D66/DEEzB1KrRp43ZE4kXRjFb6N/AMcG5YWWUgy9nPcp4DVAXS\nw47bAlTDksWWsPJMp1xEikB2Njz6KCxdCunpUL262xGJV0Vac7gb2I71NyQd55ggoeYmEXHZrl3Q\nogXs2AFz5yoxyIlFWnO4EWtCagWUxmoPw7HawgXANqzJaLtzfCZQI+z86liNIdPZDy/PPNYbpqSk\n/LKfnJxMcnJyhKGL+M/q1Xanc9u28Npr6nhOVIFAgEAgEJPXOt63/tPRFHgaaA28DuwEXsM6o8s7\nj/WBEcC1WLPRTOAirGaxEOiF9Tt8BrwNTC30HsFgUJUQkUhMnw4PPmhrPXfr5n
Y0UpySkpIgws/5\nWN0hXfDJ/SowCht9tAno6JRnOOUZQC7QM+ycnsBHwNnAZH6dGEQkAsEgvPMOvPKK3b9wyy1uRyTx\nJBY1h+KgmoPIacjOhscft2kwJkyA2rXdjkjc4IWag4h4xPbt0L49VKwI8+dD2bJuRyTxSNNniCSQ\nZcvg2mttSc9x45QYJHKqOYgkiNRUu9N5wADo2PHkx4uciJKDSJzLy4MXXoDRo2HmTGjQwO2IJBEo\nOYjEsZ07oUsXyM219Z4rVnQ7IkkU6nMQiVPLl8M118CVV8K0aUoMEluqOYjEoY8/hiefhP79oXNn\nt6ORRKTkIBJHsrPhqadgyhSYNctqDSJFQclBJE5kZsJ998H558OSJVC+/MnPEYmU+hxE4sDs2da/\ncPfdkJamxCBFTzUHEQ/Lz7cJ8/r3h+HDoVkztyMSv1ByEPGoXbvg4Ydh924bpqr1F6Q4qVlJxIMW\nLoRGjaBePQgElBik+KnmIOIhwSD062fTbL/3Htxzj9sRiV8pOYh4xO7dthjPDz/Y+s4XXuh2ROJn\nalYS8YAFC6BhQ6hZE+bNU2IQ96nmIOKi/Hzo2xfeeAPef9/WeBbxAiUHEZds22ajkQ4etNFINWu6\nHZFIiJqVRFwwZYo1I11/PXzxhRKDeI9qDiLF6MgR6N3b1l5ITbUV20S8SMlBpJhkZNjaCxdeaNNt\na4pt8TI1K4kUsWAQBg6Epk3h8cdh7FglBvE+1RxEitC2bdC9O2RlwZdfwsUXux2RyKlRzUGkiKSl\nwVVXWcfzggVKDBJfVHMQibF9+2yVtkDAmpBuusntiEROn2oOIjEUCNjqbCVLwooVSgwSv1RzEImB\nn3+Gv/8dRo2CQYOgVSu3IxKJjmoOIlGaP9/6FrKy4OuvlRgkMajmIBKhQ4egTx8YNgwGDID27d2O\nSCR2VHMQicD8+TYK6bvvrLagxCCJRjUHkdPw88/w4os29UX//tChg9sRiRQN1RxETtHnn8MVV1jf\nwsqVSgyS2FRzEDmJ3bvhmWdg+nSbBuOuu9yOSKToqeYgchzBoM2eevnlUKoUrFqlxCD+oZqDyDF8\n/z089hhs3Gj3LuhmNvEb1RxEwuTmwptvQqNGcN11sGyZEoP4U6TJoQYwG/gGWAX0csorADOAtcB0\noHzYOb2BdcBqoHlYeWNgpfOzfhHGIxK1RYvgmmtg8mQbqvrii9acJOJHkSaHHOCvwGXA9cBjwKXA\n81hyqAfMcp4D1Ac6OY8tgXeBJOdnA4HuQF1naxlhTCIR2bULHn0U2raFp5+GGTOgXj23oxJxV6TJ\nYRuw3Nk/AHwLVAPaAEOd8qFAO2e/LZCKJZVNwHrgOqAKUBZY5Bw3LOwckSKVnw9DhkD9+nDGGbZS\n2wMPQFLSyc8VSXSx6JCuBTQEFgKVgSynPMt5DlAVSA87ZwuWTHKc/QKZTrlIkVq2DJ54ArKzYdIk\nuPpqtyMS8ZZok8M5wFjgL8D+Qj8LOltMpKSk/LKfnJxMslZmlwjs2mV9CWPHwssvQ7duUKKE21GJ\nxEYgECAQCMTktaKpQJcEJgFTgLecstVAMtbsVAXrtL6EUN/Dq87jVKAPsNk55lKn/H6gKfDnQu8V\nDAZjlmfEh/LybCrtPn3szuaXXoIKFdyOSqRoJVkbaUSf85H2OSQBg4EMQokBYALQ1dnvCqSFlXcG\nSgG1sY7nRVgS2Yf1PyQBD4WdIxITX3xhQ1NTU2HaNJtBVYlB5MQirTk0AeYAXxNqOuqNfeCPAmpi\nHc8dgT3Oz18AugG5WDPUNKe8MfARcDYwmdCw2HCqOchp27gRnn0WFi+Gvn2txqDOZvGTaGoO8fKn\nouQgp2zvXvjXv+DDD+Gvf4W//Q3OPtvtqESKnxvNSiKek5NjTUYXXww7d9rMqX//uxKDSCQ0t5LE\nvWAQ0tLg+efhd7+zfoUGDdyOSiS+KTlIXJ
s3z/oV9u+Ht9+GFi3cjkgkMSg5SFzKyIDevWH5chuW\n+sADul9BJJbU5yBxZdMm6NoVbr0VbrkF1qyBhx9WYhCJNSUHiQs//mjTXTRuDLVqwbp18NRTULq0\n25GJJCYlB/G0n36ymVILVmP79lv43/+Fc891OzKRxKbkIJ60Y4f1KVxyCRw+bMNS33gDKlVyOzIR\nf1ByEE8pSAoXXwx79tjsqe+8A1Wruh2ZiL8oOYgn/PijNR+FJ4WBA6FmTbcjE/EnJQdx1ebN8Pjj\ncNlldofzihVKCiJeoOQgrsjIsCGpjRrBb35jz/v1g+rV3Y5MREA3wUkxmzcPXn8d0tOhVy/YsAHK\nl3c7KhEpTMlBilxeHkyYYNNmb9tmfQupqVCmjNuRicjxKDlIkTlwAD76CN56CypWtKRw7726m1kk\nHig5SMxt3mzDT4cMsSkuhg6FG2/UQjsi8UQd0hITwSAEAtC+vXUy5+fDokUwbhzcdJMSg0i8Uc1B\norJ/P/z3v7bITn6+DUsdOhTOOcftyEQkGkoOEpEVK+A//4FPPrEZUvv3h+Rk1RBEEoWSg5yygwdh\n5EgYNAi2bIEePWDVKqhWze3IRCTW4uV7XjAYDLodgy8Fg7BkCQweDKNGQZMm8Mgj0KoVnKmvFiKe\nlmRV+Yg+5/XnLceUlWV9CUOGwKFD8Ic/2MyoqiWI+INqDvKLQ4fsZrVhw+xO5nbtLCncfDOcoXFt\nInEnmpqDkoPP5eXB7NkwYgSkpcHVV9uym/fcY3MeiUj8UnKQ05Kfb3MbjRwJo0fbWgldukCnTmo2\nEkkk6nOQk8rPh4ULYexY61guW9aSwezZtoaCiEg4JYcElpsLc+fC+PF2p3K5ctChA3z2GVxxhdvR\niYiXKTkkmP37YcYM61ieNAlq1bL+gxkz4NJL3Y5OROKF+hwSwIYNMHmyJYP58+GGG6B1axttVKOG\n29GJiFvUIe0zBw/CnDkwbZolhf37oWVLuPtuaN7c+hNERJQcElxuLnz1FXz+OcycabOdNm5sieDO\nO6FBA92HICK/puSQYHJzYdky+OIL2+bOhZo14fbbbWvaVLUDETk5JYc4d/Cg1Qa+/NK29HTrK2ja\n1LbkZKhUye0oRSTeKDnEkbw8WLMGFi+2JJCeDmvXWtNQkya23XgjnH++25GKSLxTcvCo3FxLBEuX\nWjPRV1/ZY6VKcM01cP31tl11FZx1ltvRikiiSYTk0BJ4CygBfAC8Vujnnk4OwSBkZsI339i2ciV8\n/TV8+61NR9GokW0NG9rcRRUquB2xiPhBvCeHEsAaoBmQCSwG7ge+DTvGE8lh3z67p2D9emsKWrMG\nVq+2xzJloH59uOwyu/v4yittP9bLZQYCAZKTk2P7onFK1yJE1yJE1yIk3udWuhZYD2xynn8CtOXo\n5FDk8vNhxw5b4eyHH0Lbpk3w3Xf2+PPPUKcOXHQR1K1ry2M++qjNTVRctQH9xw/RtQjRtQjRtYgN\nLySHasAPYc+3ANdF84J5eTYCaO9e2LPHHnftsm3nTtuysmD7dnv88Ud7LFfOZiitUSO0tWsHtWvb\nNBSVK2uNZBHxBy8kh1NqL2rVyr7d5+dDTk5oO3LEFqkp2A4cgOxsa+YpX94+8MuVg/POg4oVQ9sN\nN1jHcKVKUKUKXHCBOoVFRAp44Xvw9UAK1ikN0BvI5+hO6fVAneINS0Qk7m0ALnI7iEidif0CtYBS\nwHJA84eKiAh3YiOW1mM1BxERERERkdPTElgNrAOeczmW4lYDmA18A6wCejnlFYAZwFpgOlDeleiK\nXwlgGTDRee7X61AeGIMN987ARvf59Vr0xv4+VgIjgLPwz7X4EMjCfvcCJ/rde2Ofo6uB5sUUY5Ep\ngTU11QJK4r/+iAuAq5z9c7Cmt0uB14FnnfLngFeLPzRX/A34GJjgPPfrdRgKdHP2zwTK4c9rUQvY\niCUEgJ
FAV/xzLW4GGnJ0cjje714f+/wsiV239UBcT/R/AzA17PnzzuZXadid5KuByk7ZBc7zRFcd\nmAncSqjm4MfrUA77QCzMj9eiAvaF6TwsSU4E7sBf16IWRyeH4/3uvTm65WUqNlL0uLyeOY51g1w1\nl2JxWy3sW8JC7B8/yynPIvSfIZH9G3gGG+ZcwI/XoTbwEzAEWAoMAn6DP6/FLuAN4HtgK7AHa1Lx\n47UocLzfvSr2+VngpJ+lXk8O7k+o5A3nAGOBvwD7C/0sSOJfp7uB7Vh/w/HuzfHDdQD7htwIeNd5\nPMiva9N+uRZ1gCexL05Vsb+TBwsd45drcSwn+91PeF28nhwysU7ZAjU4Ovv5QUksMQzHmpXAvhFc\n4OxXwT44E9mNQBvgOyAVuA27Hn67DmD//7dgE1SCdUw3Arbhv2txNTAf2AnkAuOwpmg/XosCx/ub\nKPxZWt0pOy6vJ4clQF1CN8h1ItQZ6QdJwGBsRMpbYeUTsI43nMc0EtsL2H/s2kBn4HPgIfx3HcA+\n+H4A6jnPm2GjdSbiv2uxGms3Pxv7W2mG/a348VoUON7fxATsb6cU9ndUF1hU7NHFmJ9vkGuCtbEv\nx5pUlmFDeytgnbOJPlTvWJoS+oLg1+vQAKs5rMC+LZfDv9fiWUJDWYdiNW2/XItUrK8lG/vC8AdO\n/Lu/gH2OrgZaFGukIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIisfT/aY4DA839B3AAAAAASUVO\nRK5CYII=\n", + "text": [ + "<matplotlib.figure.Figure at 0x333b590>" + ] + } + ], + "prompt_number": 36 + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "\u00a7 Excercise : My first Notebook (2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's add some markdown cells to the notebook you created earlier:\n", + " - Select the top code cell\n", + " - Press ESC to go into command mode\n", + " - Press 'a', this will add a cell above the selected cell\n", + " - Notice the focus is on the new cell\n", + " - Now press 'm' to set the celltype to 'Markdown'\n", + " - Press ENTER and add some code (see below for example)\n", + " - Run the cell and see if it worked\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Example markdown code:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### My next notebook\n", + "\n", + "This notebook is my first notebook with some experimental code.\n", + "\n", + "Author: [Michiel van Galen](mailto:m.van_galen@lumc.nl)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, try to add more python code to the notebook:\n", + " - Make sure to 
set the cell type to code\n", + " - Add the 'reverse' function to your notebook as shown below" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def reverse(seq):\n", + " rev = seq[::-1]\n", + " return rev" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 42 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Try using both the 'reverse' and 'translate' functions." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "reverse('AACGT')" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 45, + "text": [ + "'TGCAA'" + ] + } + ], + "prompt_number": 45 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "translate(_)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 46, + "text": [ + "'ACGTT'" + ] + } + ], + "prompt_number": 46 + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "\u00a7 Exercise : My first Notebook (3)\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A palindromic sequence is a nucleic acid sequence (DNA or RNA) that is the same whether read 5' (five-prime) to 3' (three-prime) on one strand or 5' to 3' on the complementary strand with which it forms a double helix. 
Palindromic sequences play an important role in molecular biology:\n", + "\n", + "http://en.wikipedia.org/wiki/Palindromic_sequence\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "<img src=\"https://git.lumc.nl/humgen/programming-course/raw/master/images/1590px-DNA_palindrome.svg.png\" align=\"center\" width=\"400\" >\n" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Take some time to develop your notebook further and add the following features to your notebook:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " - Think of a function which can test if a sequence is palindromic\n", + " - Use the functions 'complement' and 'reverse'\n", + " - Nicely formatted markdown cell(s) explaining the notebook\n", + " - Add links as references like the one above\n", + "\n", + "**Bonus: Write a function which can test if there are short palindromic sequences in a longer piece of DNA**\n", + "\n", + "- Try to find the palindromic sequences of at least length 6 in the sequence : GGGAGACATGTCTAACCGTTGTAAAA\n", + "- Implement your current functions in a new function\n", + "- Begin to iterate over a sequence and find a palindrome of size 2\n", + "- Continue to work from there and expand your test\n", + "- Can a palindromic sequence have an odd length?\n" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "<br></br><br></br><br></br><br></br>\n", + "<br></br><br></br><br></br><br></br>\n", + "<br></br><br></br><br></br><br></br>\n", + "<br></br><br></br><br></br><br></br>\n", + "<br></br><br></br><br></br><br></br>\n", + "Ignore following code for layout" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from IPython.core.display import HTML\n", + "def custom_style():\n", + " style = open('styles/notebook.css', 'r').read()\n", + " return HTML('<style>' + style + '</style>')\n", + "def custom_script():\n", + " script = 
open('styles/notebook.js', 'r').read()\n", + " return HTML('<script>' + script + '</script>')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 1 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "custom_style()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "<style>/*\n", + " https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers\n", + "*/\n", + "@font-face {\n", + " font-family: \"Computer Modern\";\n", + " src: url('http://mirrors.ctan.org/fonts/cm-unicode/fonts/otf/cmunss.otf');\n", + "}\n", + "div.cell{\n", + " width:800px;\n", + " margin-left:16% !important;\n", + " margin-right:auto;\n", + "}\n", + "h1 {\n", + " font-family: Helvetica, serif;\n", + "}\n", + "h4{\n", + " margin-top:12px;\n", + " margin-bottom: 3px;\n", + " }\n", + "div.text_cell_render{\n", + " font-family: Computer Modern, \"Helvetica Neue\", Arial, Helvetica, Geneva, sans-serif;\n", + " line-height: 145%;\n", + " font-size: 130%;\n", + " width:800px;\n", + " margin-left:auto;\n", + " margin-right:auto;\n", + "}\n", + ".CodeMirror{\n", + " font-family: \"Source Code Pro\", source-code-pro,Consolas, monospace;\n", + "}\n", + ".prompt{\n", + " display: None;\n", + "}\n", + ".text_cell_render .exercise {\n", + " font-weight: 300;\n", + " /*font-size: 22pt;*/\n", + " color: #4057A1;\n", + " font-style: italic;\n", + " /*margin-bottom: .5em;\n", + " margin-top: 0.5em;\n", + " display: block;*/\n", + "}\n", + ".text_cell_render .example {\n", + " font-weight: 300;\n", + " color: #40A157;\n", + " font-style: italic;\n", + "}\n", + "\n", + ".warning{\n", + " color: rgb( 240, 20, 20 )\n", + "}\n", + "</style>" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 2, + "text": [ + "<IPython.core.display.HTML at 0x1d02710>" + ] + } + ], + "prompt_number": 2 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "custom_script()" + ], + "language": 
"python", + "metadata": {}, + "outputs": [ + { + "html": [ + "<script>// https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers\n", + "MathJax.Hub.Config({\n", + " TeX: {\n", + " extensions: [\"AMSmath.js\"]\n", + " },\n", + " tex2jax: {\n", + " inlineMath: [ ['$','$'], [\"\\\\(\",\"\\\\)\"] ],\n", + " displayMath: [ ['$$','$$'], [\"\\\\[\",\"\\\\]\"] ]\n", + " },\n", + " displayAlign: 'center', // Change this to 'center' to center equations.\n", + " \"HTML-CSS\": {\n", + " styles: {'.MathJax_Display': {\"margin\": 4}}\n", + " }\n", + " });\n", + "</script>" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 3, + "text": [ + "<IPython.core.display.HTML at 0x1d02a90>" + ] + } + ], + "prompt_number": 3 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/09 - A sip of Biopython (1).ipynb b/09 - A sip of Biopython (1).ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b79a8d351e86c1f29843b77c9f76247e7e82cc02 --- /dev/null +++ b/09 - A sip of Biopython (1).ipynb @@ -0,0 +1,761 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:379897678563a5d1a894e27455668e010f747d9dd35befb0c0ab7f547bdd85ec" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# A sip of Biopython (1)\n", + "***\n", + "\n", + "[Wibowo Arindrarto](mailto:w.arindrarto@lumc.nl), [Sequencing Analysis Support Core, Leiden University Medical Center](http://sasc.lumc.nl)\n", + "\n", + "[Martijn Vermaat](mailto:m.vermaat.hg@lumc.nl), [Department of Human Genetics, Leiden University Medical Center](http://humgen.nl)\n", + "\n", + "License: [Creative Commons Attribution 3.0 License (CC-by)](http://creativecommons.org/licenses/by/3.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of contents\n", + "\n", + "1. 
[Prelude: Python packages and their installation](#packages)\n", + "2. [Biopython](#biopython)\n", + "3. [Working with sequences](#sequences)\n", + "4. File I/O with Biopython\n", + "5. Fetching from online resources: NCBI's Entrez\n", + "6. Performing a remote BLAST search\n", + "7. Beyond Biopython" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a id=\"packages\"></a>\n", + "## Prelude: Python packages and their installation\n", + "\n", + "### Python Package Index (PyPI)\n", + "\n", + "[The Python Package Index](https://pypi.python.org/) is *the* place to find 3rd-party Python libraries (and to upload your own too).\n", + "\n", + "Remember the `pip install biopython` (or `numpy`, etc.) command you ran? That source package was stored in PyPI.\n", + "\n", + "Some of the Python packages developed at our department are also in there:\n", + "\n", + "- [kMer](https://pypi.python.org/pypi/kMer): Analysis toolkit and programming library for k-mer profiles.\n", + "- [TSSV](https://pypi.python.org/pypi/tssv): Targeted characterisation of short structural variation.\n", + "- [fastools](https://pypi.python.org/pypi/fastools): Various tools for the analysis and manipulation of FASTA/FASTQ files.\n", + "- [piletools](https://pypi.python.org/pypi/piletools): Various tools for the analysis of mpileup files.\n", + "- [barcode](https://pypi.python.org/pypi/barcode): For designing NGS barcodes.\n", + "- [wiggelen](http://wiggelen.readthedocs.org/): Working with Wiggle (WIG) tracks.\n", + "- [monoseq](https://monoseq.readthedocs.org/): Pretty-printing of DNA and protein sequences.\n", + "\n", + "For example, to install our library for working with Wiggle tracks:\n", + "\n", + " pip install wiggelen" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Python virtual environments\n", + "\n", + "[`virtualenv`](http://www.virtualenv.org/) (with its frontend [`virtualenvwrapper`](http://virtualenvwrapper.readthedocs.org/)) is a tool 
for managing isolated Python environments. Its benefits are:\n", + "\n", + "1. You can have specific versions of packages installed, per environment.\n", + "2. You can install packages as a non-root user.\n", + "\n", + "We list some commands to manage your virtual environments. First, creating a new one:\n", + "\n", + " $ mkvirtualenv my-environment\n", + "\n", + "Activating an environment:\n", + "\n", + " $ workon my-environment\n", + "\n", + "Whenever you have an environment activated, it is indicated by prefixing your bash prompt with the name of the environment surrounded by brackets.\n", + "\n", + "Deactivating an environment:\n", + "\n", + " $ deactivate\n", + "\n", + "Normally, `pip install` will try to install a package system-wide, for which you'd need administrator permissions. Whenever you have a virtual environment activated, `pip install` will install the package in the virtual environment automatically." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a id=\"biopython\"></a>\n", + "## Biopython\n", + "\n", + "<img src=\"http://biopython.org/DIST/docs/tutorial/images/biopython.jpg\" />\n", + "\n", + "### About Biopython\n", + "\n", + "Biopython is one of the big libraries for working with bioinformatics-related data (but not the only one). It is *Open Source* and run by a team of developers from around the world under the OBF umbrella.\n", + "\n", + "Development started in 1998 and it is still actively maintained with a new release every 3-4 months. 
The library is mature and has had multiple publications (the whole library itself and sometimes its submodules).\n", + "\n", + "Biopython is compatible with Python 2.x and 3.x and tested on multiple operating systems and Python implementations.\n", + "\n", + "Some links with more information:\n", + "\n", + "- [Biopython homepage](http://biopython.org)\n", + "- [Git development repository](http://github.com/biopython/biopython)\n", + "- [Mailing list](http://lists.open-bio.org/pipermail/biopython/)\n", + "- [Biopython Tutorial and Cookbook](http://biopython.org/DIST/docs/tutorial/Tutorial.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inside Biopython\n", + "\n", + "As one of the larger Python packages, these are some of the things provided by Biopython:\n", + "\n", + "- Rich objects representing various concepts (e.g., sequences, alignments, motifs).\n", + "\n", + "- File parsers and writers.\n", + " * Sequence files: fasta, fastq, genbank, abi, sff, etc.\n", + " * Alignment files: clustal, emboss, phylip, nexus, etc.\n", + " * Sequence search outputs: BLAST, HMMER, BLAT, etc.\n", + " * Phylogenetic trees: newick, nexus, phyloxml, etc.\n", + " * Sequence motifs: AlignAce, TRANSFAC, etc.\n", + " * Others: PDB files, etc.\n", + "\n", + "- Access to remote resources (e.g., Entrez, NCBI BLAST).\n", + "\n", + "- Application wrappers.\n", + "\n", + "- A simple graphing tool.\n", + "\n", + "- Simple algorithms (e.g., pairwise alignment, cluster analysis).\n", + "\n", + "- References such as codon tables and IUPAC sequences." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Characterizing Biopython\n", + "\n", + "Strengths:\n", + "\n", + "- Based on Python (readability, expressive constructs).\n", + "- Wide range of parsers with common interfaces.\n", + "- Access to online resources.\n", + "\n", + "Weaknesses:\n", + "\n", + "- Based on Python (?) 
(no error checks until runtime).\n", + "- Not always the fastest parsers." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a id=\"sequences\"></a>\n", + "## Working with sequences\n", + "\n", + "The `Seq` object is Biopython's main representation of nucleotide or protein sequences. It is essentially a string with alphabet information. Its constructor is available in the `Bio.Seq` module." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from Bio.Seq import Seq" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 1 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sequence alphabets\n", + "\n", + "Let's make our first `Seq` object." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_seq = Seq('GGGTACGATAAA')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 2 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_seq" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 3, + "text": [ + "Seq('GGGTACGATAAA', Alphabet())" + ] + } + ], + "prompt_number": 3 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice the bit about the alphabet. Biopython never tries to guess what alphabet your sequence is in. You have to be explicit yourself (recall `import this`)." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from Bio.Alphabet import generic_dna" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 4 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_dna = Seq('GGGTACGATAAA', generic_dna)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 5 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_dna" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 6, + "text": [ + "Seq('GGGTACGATAAA', DNAAlphabet())" + ] + } + ], + "prompt_number": 6 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Seq` objects are almost the same as native Python `str` objects. They have similar methods and can be used with almost the same set of operators." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_dna.lower()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 7, + "text": [ + "Seq('gggtacgataaa', DNAAlphabet())" + ] + } + ], + "prompt_number": 7 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_dna.endswith('N')" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 8, + "text": [ + "False" + ] + } + ], + "prompt_number": 8 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_dna + my_dna" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 9, + "text": [ + "Seq('GGGTACGATAAAGGGTACGATAAA', DNAAlphabet())" + ] + } + ], + "prompt_number": 9 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "str(my_dna)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": 
"pyout", + "prompt_number": 10, + "text": [ + "'GGGTACGATAAA'" + ] + } + ], + "prompt_number": 10 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Operations on sequences\n", + "\n", + "The plus feature of the `Seq` type is that we get to use additional functions from 'molecular biology'." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_rna = my_dna.transcribe()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 11 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_rna" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 12, + "text": [ + "Seq('GGGUACGAUAAA', RNAAlphabet())" + ] + } + ], + "prompt_number": 12 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we see that running `transcribe()` returns a new RNA sequence. Biopython uses alphabet information to determine whether a `Seq` member function can be invoked or not." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are more, of course." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_dna.complement()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 13, + "text": [ + "Seq('CCCATGCTATTT', DNAAlphabet())" + ] + } + ], + "prompt_number": 13 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_dna.reverse_complement()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 14, + "text": [ + "Seq('TTTATCGTACCC', DNAAlphabet())" + ] + } + ], + "prompt_number": 14 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_protein = my_dna.translate()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 15 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "my_protein" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 16, + "text": [ + "Seq('GYDK', ExtendedIUPACProtein())" + ] + } + ], + "prompt_number": 16 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now try running `my_protein.transcribe()` in your interpreter. What happens?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### <span class=\"exercise\">Exercise: Six-frame translation</span>\n", + "\n", + "Write a function that takes a `Seq` object and prints all possible translation frames from it. For example, when using `CGATCGTAGCTGTAGCGCGATATATACTAGGG` as the input sequence, the output is (not necessarily in this order):\n", + "\n", + " RS*L*RDIY*\n", + " P*YISRYSYD\n", + " DRSCSAIYTR\n", + " PSIYRATATI\n", + " IVAVARYILG\n", + " LVYIALQLRS\n", + "\n", + "Additionally, try to see how to use an alternative translation table (hint: `Bio.Data` module)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Storing additional information\n", + "\n", + "`Seq` objects are good for storing the sequence itself. But where do we store metadata such as sequence ID or interesting regions in the sequence?\n", + "\n", + "For this, we use the `SeqRecord` object from the `Bio.SeqRecord` module. It is essentially a thin wrapper around the `Seq` object that also stores sequence metadata." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from Bio.SeqRecord import SeqRecord" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 17 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "record = SeqRecord(my_dna)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 18 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "record" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 19, + "text": [ + "SeqRecord(seq=Seq('GGGTACGATAAA', DNAAlphabet()), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])" + ] + } + ], + "prompt_number": 19 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print record" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "ID: <unknown id>\n", + "Name: <unknown name>\n", + "Description: <unknown description>\n", + "Number of features: 0\n", + "Seq('GGGTACGATAAA', DNAAlphabet())\n" + ] + } + ], + "prompt_number": 20 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We saw that `SeqRecord` stores a `Seq` object and other metadata such as:\n", + "\n", + "* `id`: Sequence ID.\n", + "* `name`: Sequence name, usually the same as `id`.\n", + "* `description`: Sequence description.\n", + "* `dbxrefs`: A list of database cross references.\n", + "\n", 
+ "There is also other metadata not shown here, such as:\n", + "\n", + "* `letter_annotations`: Annotation per sequence position.\n", + "\n", + "It's enough to supply only a `Seq` object when creating `SeqRecord`. However, as we saw, it's not that useful to have `<unknown id>` and `<unknown description>` as metadata. " + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "SeqRecord(my_dna, id='my precious', description='my precious sequence')" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 21, + "text": [ + "SeqRecord(seq=Seq('GGGTACGATAAA', DNAAlphabet()), id='my precious', name='<unknown name>', description='my precious sequence', dbxrefs=[])" + ] + } + ], + "prompt_number": 21 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from IPython.core.display import HTML\n", + "def custom_style():\n", + " style = open('styles/notebook.css', 'r').read()\n", + " return HTML('<style>' + style + '</style>')\n", + "def custom_script():\n", + " script = open('styles/notebook.js', 'r').read()\n", + " return HTML('<script>' + script + '</script>')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 22 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "custom_style()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "<style>/*\n", + " https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers\n", + "*/\n", + "@font-face {\n", + " font-family: \"Computer Modern\";\n", + " src: url('http://mirrors.ctan.org/fonts/cm-unicode/fonts/otf/cmunss.otf');\n", + "}\n", + "div.cell{\n", + " width:800px;\n", + " margin-left:16% !important;\n", + " margin-right:auto;\n", + "}\n", + "h1 {\n", + " font-family: Helvetica, serif;\n", + "}\n", + "h4{\n", + " margin-top:12px;\n", + " margin-bottom: 3px;\n", + " }\n", + "div.text_cell_render{\n", + " font-family: 
Computer Modern, \"Helvetica Neue\", Arial, Helvetica, Geneva, sans-serif;\n", + " line-height: 145%;\n", + " font-size: 130%;\n", + " width:800px;\n", + " margin-left:auto;\n", + " margin-right:auto;\n", + "}\n", + ".CodeMirror{\n", + " font-family: \"Source Code Pro\", source-code-pro,Consolas, monospace;\n", + "}\n", + ".prompt{\n", + " display: None;\n", + "}\n", + ".text_cell_render .exercise {\n", + " font-weight: 300;\n", + " /*font-size: 22pt;*/\n", + " color: #4057A1;\n", + " font-style: italic;\n", + " /*margin-bottom: .5em;\n", + " margin-top: 0.5em;\n", + " display: block;*/\n", + "}\n", + ".text_cell_render .example {\n", + " font-weight: 300;\n", + " color: #40A157;\n", + " font-style: italic;\n", + "}\n", + "\n", + ".warning{\n", + " color: rgb( 240, 20, 20 )\n", + "}\n", + "</style>" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 23, + "text": [ + "<IPython.core.display.HTML at 0x23c7350>" + ] + } + ], + "prompt_number": 23 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "custom_script()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "<script>// https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers\n", + "MathJax.Hub.Config({\n", + " TeX: {\n", + " extensions: [\"AMSmath.js\"]\n", + " },\n", + " tex2jax: {\n", + " inlineMath: [ ['$','$'], [\"\\\\(\",\"\\\\)\"] ],\n", + " displayMath: [ ['$$','$$'], [\"\\\\[\",\"\\\\]\"] ]\n", + " },\n", + " displayAlign: 'center', // Change this to 'center' to center equations.\n", + " \"HTML-CSS\": {\n", + " styles: {'.MathJax_Display': {\"margin\": 4}}\n", + " }\n", + " });\n", + "</script>" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 24, + "text": [ + "<IPython.core.display.HTML at 0x23c7450>" + ] + } + ], + "prompt_number": 24 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/09 - A sip of Biopython (2).ipynb b/09 - A sip of Biopython (2).ipynb 
new file mode 100644 index 0000000000000000000000000000000000000000..7d565a7dbc2a2c9e71fc8865be357fe76a7ec4d0 --- /dev/null +++ b/09 - A sip of Biopython (2).ipynb @@ -0,0 +1,1199 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:1c343fc8627b1d9dee68bb0544d2c4b19a14bca05167b161c6574365818a0631" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# A sip of Biopython (2)\n", + "***\n", + "\n", + "[Wibowo Arindrarto](mailto:w.arindrarto@lumc.nl), [Sequencing Analysis Support Core, Leiden University Medical Center](http://sasc.lumc.nl)\n", + "\n", + "[Martijn Vermaat](mailto:m.vermaat.hg@lumc.nl), [Department of Human Genetics, Leiden University Medical Center](http://humgen.nl)\n", + "\n", + "License: [Creative Commons Attribution 3.0 License (CC-by)](http://creativecommons.org/licenses/by/3.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of contents\n", + "\n", + "1. Prelude: Python packages and their installation\n", + "2. Biopython\n", + "3. Working with sequences\n", + "4. [File I/O with Biopython](#files)\n", + "5. [Fetching from online resources: NCBI's Entrez](#entrez)\n", + "6. [Performing a remote BLAST search](#blast)\n", + "7. [Beyond Biopython](#beyond)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a id=\"files\"></a>\n", + "## File I/O with Biopython\n", + "\n", + "As mentioned before, one of Biopython's strengths is its wide range of parsing support. 
Briefly:\n", + "\n", + "* Sequence files: `Bio.SeqIO`\n", + "* Alignment files: `Bio.AlignIO`\n", + "* Sequence search files: `Bio.Blast` (soon to be `Bio.SearchIO`)\n", + "* Phylogenetic trees: `Bio.Phylo`\n", + "* Sequence motifs: `Bio.motifs`\n", + "* Protein structures: `Bio.PDB`\n", + "\n", + "These parsers may not be the fastest available, but they provide a common interface of objects within their domain.\n", + "\n", + "As an example, let's take a look at parsing sequence files." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reading sequence files with `SeqIO`\n", + "\n", + "The main sequence input/output functions are all contained in the `Bio.SeqIO` module. Two of the most commonly used are `Bio.SeqIO.read` and `Bio.SeqIO.parse`. They provide the same functionality, except that the former is for files containing a single sequence and the latter is for files containing multiple sequences." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `SeqIO.read`\n", + "\n", + "A simple function call that takes as input:\n", + "\n", + "1. The file name *or* a file handle object pointing to your sequence.\n", + "2. The sequence file format name.\n", + "\n", + "It returns a single `SeqRecord` object containing our sequence of interest." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from Bio import SeqIO" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 1 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "fasta_record = SeqIO.read('data/simple.fa', 'fasta')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 2 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "fasta_record" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 3, + "text": [ + "SeqRecord(seq=Seq('MGIKQYSQEELKEMALVEIAHELFEEHKKPVPFQELLNEIASLLGVKKEELGDR...EIK', SingleLetterAlphabet()), id='sp|P12464|RPOE_BACSU', name='sp|P12464|RPOE_BACSU', description='sp|P12464|RPOE_BACSU DNA-directed RNA polymerase subunit delta OS=Bacillus subtilis (strain 168) GN=rpoE PE=1 SV=1', dbxrefs=[])" + ] + } + ], + "prompt_number": 3 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print fasta_record" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "ID: sp|P12464|RPOE_BACSU\n", + "Name: sp|P12464|RPOE_BACSU\n", + "Description: sp|P12464|RPOE_BACSU DNA-directed RNA polymerase subunit delta OS=Bacillus subtilis (strain 168) GN=rpoE PE=1 SV=1\n", + "Number of features: 0\n", + "Seq('MGIKQYSQEELKEMALVEIAHELFEEHKKPVPFQELLNEIASLLGVKKEELGDR...EIK', SingleLetterAlphabet())\n" + ] + } + ], + "prompt_number": 4 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The nice thing about this method is that it abstracts over all the different file formats. If we want to parse another file which is a FASTQ instead of a FASTA file, we simply need to change the file name and the format name." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "fastq_record = SeqIO.read('data/easy.fastq', 'fastq')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 5 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "fastq_record" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 6, + "text": [ + "SeqRecord(seq=Seq('CCGCGACCTCTGTTCTGCAGCCCCTTCCCTTCCCCGCCTCCTGCTCTGCCGGGA...CCA', SingleLetterAlphabet()), id='HWI-ST1019:196:D121WACXX:5:1101:1538:2300/1', name='HWI-ST1019:196:D121WACXX:5:1101:1538:2300/1', description='HWI-ST1019:196:D121WACXX:5:1101:1538:2300/1', dbxrefs=[])" + ] + } + ], + "prompt_number": 6 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print fastq_record" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "ID: HWI-ST1019:196:D121WACXX:5:1101:1538:2300/1\n", + "Name: HWI-ST1019:196:D121WACXX:5:1101:1538:2300/1\n", + "Description: HWI-ST1019:196:D121WACXX:5:1101:1538:2300/1\n", + "Number of features: 0\n", + "Per letter annotation for: phred_quality\n", + "Seq('CCGCGACCTCTGTTCTGCAGCCCCTTCCCTTCCCCGCCTCCTGCTCTGCCGGGA...CCA', SingleLetterAlphabet())\n" + ] + } + ], + "prompt_number": 7 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Of course, since the FASTQ file has additional quality information, we can now access it. In this case, the qualities are stored in `letter_annotations`." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print fastq_record.letter_annotations" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "{'phred_quality': [16, 10, 10, 10, 19, 8, 27, 31, 27, 32, 27, 10, 17, 32, 24, 32, 17, 10, 10, 25, 18, 34, 23, 25, 8, 16, 30, 33, 35, 33, 35, 33, 33, 35, 34, 31, 25, 25, 21, 31, 7, 13, 23, 13, 13, 22, 8, 22, 22, 22, 25, 27, 30, 31, 31, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]}\n" + ] + } + ], + "prompt_number": 8 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even more, `SeqIO` has support not only for plain text formats but also binary formats. Here's an example of `SeqIO` reading a Sanger sequencing trace file. Note the additional information (e.g. sequencing well) stored in the file is also parsed." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "abi_record = SeqIO.read('data/sanger.ab1', 'abi')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 9 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print abi_record" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "ID: 226032_C-ME-18_pCAGseqF\n", + "Name: sanger\n", + "Number of features: 0\n", + "/polymer=POP7 \n", + "/run_finish=2009-12-12 11:44:49\n", + "/sample_well=B9\n", + "/run_start=2009-12-12 09:56:53\n", + "/machine_model=3730\n", + "/dye=Z-BigDyeV3\n", + "Per letter annotation for: phred_quality\n", + "Seq('GGGCGAGCKYYAYATTTTGGCAAGAATTGAGCTCTATGGCCACAACCATGGTGA...TTC', IUPACAmbiguousDNA())\n" + ] + } + ], + "prompt_number": 10 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Head over to the [official Biopython API documentation](http://biopython.org/DIST/docs/api/) for 
a complete list of supported formats." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `SeqIO.parse`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`SeqIO.parse` is similar to `SeqIO.read`, but instead of returning a single `SeqRecord` object, we can iterate over the return value to get consecutive `SeqRecord` objects.\n", + "\n", + "This is similar to how we iterated over a filehandle to get consecutive lines:\n", + "\n", + " fh = open('my_file')\n", + " for line in fh:\n", + " print line\n", + " fh.close()\n", + "\n", + "However, instead of returning each line of the file, `SeqIO.parse` returns a single `SeqRecord` object per iteration and we do not need to call `close` afterwards because it is handled automatically by Biopython.\n", + "\n", + " for record in SeqIO.parse('my_sequence.fa', 'fasta'):\n", + " print record" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for record in SeqIO.parse('data/parse.fastq', 'fastq'):\n", + " print record.seq" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "CCGCGACCTCTGTTCTGCAGCCCCTTCCCTTCCCCGCCTCCTGCTCTGCCGGGACTACGCACCGGCCTGATTGGTTACCCCCGGGGTGTCCTCGGTCACCA\n", + "CCGCGACCTCTGTTCTGCAGCCCCTTCCCTTCCCCGCCTCCTGCTCTGCCGGGACTACGCACCGGCCTGATTGGTTACCCCCGGGGTGTCCTCGGTCACCA\n" + ] + } + ], + "prompt_number": 11 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The advantage of `SeqIO.parse` is that it allows you to handle large, multi-sequence files gracefully. Because it lets us process the records one by one, the file may even be larger than your available memory.\n", + "\n", + "`SeqIO.parse` supports the same set of formats that `SeqIO.read` supports; one example is shown below." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for record in SeqIO.parse('data/roche.sff', 'sff'):\n", + " print record.id, len(record), record.seq[:10]" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "E3MFGYR02JWQ7T 265 tcagGGTCTA\n", + "E3MFGYR02JA6IL 271 tcagTTTTTT\n", + "E3MFGYR02JHD4H 310 tcagAAAGAC\n", + "E3MFGYR02GFKUC 299 tcagCGGCCG\n", + "E3MFGYR02FTGED 281 tcagTGGTAA\n", + "E3MFGYR02FR9G7 261 tcagCTCCGT\n", + "E3MFGYR02GAZMS 278 tcagAAAGAA\n", + "E3MFGYR02HHZ8O 221 tcagACTTTC\n", + "E3MFGYR02GPGB1 269 tcagAAGCAG\n", + "E3MFGYR02F7Z7G 219 tcagAATCAT\n" + ] + } + ], + "prompt_number": 12 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### <span class=\"exercise\">Exercise: Trimming sequences from a mangled FASTQ file</span>\n", + "\n", + "* From the interleaved FASTQ file `data/mangled.fq`, print all sequences of the first read pairs after trimming off their first five nucleotides.\n", + "* **Hint:** first pair and second pair records are marked with `/1` and `/2`, respectively." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### <span class=\"example\">Example: A poor man's FastQC</span>\n", + "\n", + "Those of you working with NGS data probably know the [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) tool. It summarizes the quality of your raw sequencing reads over different metrics.\n", + "\n", + "<img src=\"files/images/fastqc.png\">\n", + "\n", + "Let's try to mimic the plot showing the base quality score distributions over the read length." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "%pylab inline" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "Populating the interactive namespace from numpy and matplotlib\n" + ] + }, + { + "output_type": "stream", + "stream": "stderr", + "text": [ + "WARNING: pylab import has clobbered these variables: ['record']\n", + "`%matplotlib` prevents importing * from pylab and numpy\n" + ] + } + ], + "prompt_number": 13 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "qualities = array([r.letter_annotations['phred_quality'] for r in\n", + " SeqIO.parse('data/mangled.fq', 'fastq')])" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 14 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now have a (20, 36) shaped array of quality scores (only 20 sequences of 36 bases each). Conveniently, we can pass this directly to the matplotlib `boxplot` function.\n", + "\n", + "While we're at it, let's also draw some visual indication that we consider everything below 60 as suboptimal (using `axhspan`) and add proper labels." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "figsize(10, 6)\n", + "axhspan(60, 100, facecolor='green', alpha=0.1, lw=0)\n", + "axhspan(0, 60, facecolor='red', alpha=0.1, lw=0)\n", + "boxplot(qualities, sym='')\n", + "ylim(qualities.min() // 10 * 10, qualities.max() // 10 * 10 + 10)\n", + "xlabel('Read position')\n", + "ylabel('Phred-based quality score')\n", + "title('Base quality score distributions over the read length');" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "display_data", + "png": "iVBORw0KGgoAAAANSUhEUgAAAloAAAGJCAYAAABSGZ32AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xm4JFV98PHvZVBZhwFFYK7ooAYRo6BRzCDIiKiIC2rU\nJORVkESJ5kWjxIDby2AiIsmMGoxLFAGJuBE3jDs6iJlRXNhRcYEgM2yyL24w9/3jd9qu23R1n+q+\n1ev38zz93K7qPn1OVZ9b9etzTp0CSZIkSZIkSZIkSZIkSZIkSZIkSZIkSZIkSRpba4C/Ts//CvjK\n8IoycFcC+6fnbwQ+uICffTuwLD0/FfinBfzs9wFvXsDPGzdraNbZUbcReGjJa2sY3nZ0KpemzCbD\nLoCm3pXAXcSJ8ybgC8CDhlmgBTaXHgAfBZ5ReG3SD8ZzhefHAy/PSLOGvJPj1kTdaeQzV/7Wjg4D\nzm1Z90rgn3v8vHGzEji9ZV0/+3OUDGo71jA+gamGwEBLwzYHPJs4ce4EXAecNNQSDdbMsAtQYtMh\n5dvtxFhWrlHdj6Nk0ZDzH1adqtskBKWqkYGWRslvgf8Cdi+sexZwPnArcBVwbOG1zYD/BH4F3Ayc\nBzwwvbYNcDKwAbia6Foqq++bE91PNwGXAq8Hfll4vbXl6VSaXVXbEq1w16f0ZwGzJfkcRrP15Fvp\n74XAbcCLgYuJoLPhPmnb9mjzWQ9I+d4M3Jg+rxFs7Ax8OpXpVzQD102ILrEriYD2NGBxem1Z2s7D\ngf8Fvp7WHw5clrbty8CDS7YN4CUp7a+IrsKilTRbTsq+t7cB+wLvIVo4/y29fyPwKuCnwE8K64rf\nyQOArxL7ck2hnI3tKn73a4gWiN2A9wPLabaowr27Il+e8r4R+Bzxg6BhI3AEcHnalvcUXns4cA5w\nC3AD8HHKPZeoezcD30xlAzga+FTLe9+dHtC5nh8G/A+wmtjXxzLfgcAbgD8ntv/8wmvLgG8T+/Mr\nwP0Lr/0psDaV9QJgvw7bdSXwj8BFKY9NuqR/GVHfbgN+Dryi5fNeX9jWwzvk206nutzpe9wEWEV8\nh78A/m96/yLK6yzA00o+T5IG6grgqen5FsTJ/9TC6/sBj0rPHw1cCxyclo8APk+cuGeAxxItYwCf\nIcbabA5sD3yXex+0G04gTohLiG7LS4igrqH1pH4K8Nb0fDvg+akMWwGfTHk3fJPmCeEw5ndTtX7u\n65l/Mj6YCMTaeTuxfYvS40lp/aKUZhWx7fcD9k6vHU4EDMuALYmg9iPptWWpPKemdJul/H8KPII4\n2byJOHG3sztxotkHuG/K//c0x2gdW8ir0/dW3F8NG4mT/ZK0PY11jX13KnFibuT9Lpr7ubFdxUCr\nmMeh3LvrsPj
97k+cYPdMn/1vRF0plu3zRMC6MxHcPj299jEikCGl3Zv2dgXuIP4PFhH14KdEC9BD\ngDuJukV6fQOwV1ruVM8PI76Dv0vbv1mbvIvfS8Ma4GdEoLgZsb/enl6bJYK2A9PyAWn5ASXbdiXw\nw5Tufh3SNwK5g4Bd0vMnp21/bFo+kPj/3504VpxB5+734vfcrS63+x4b3fx/SwTBS4k6+HXgHpp1\nqqzOln2eJA3UlcQJ+mbgd8Qv1T/u8P53Eb/QIX79/g8RgBXtAPyG+SeWvwS+UfKZP6d5coRowejU\nonUK5YOv96TZMgLVAq2lxL5onFTPBP6hJJ/jgM8CD2tZv5w4qLdrvTubOGk07Ers801oBiTLCq9/\nifknkE2IE9/ObT77/xEnvoYtiBbKRqC1kmaLVtn3BrG/Wse7bARWtFlXDLSKeW8J3E2c1JfROdA6\njM6B1slEIF787N/RbA3ZyPwA6hNECw7Ej4YPUN7C2fAW5gfYM8T/wZPT8rlEayFEK8nP0vNu9fww\nooWxk5Xce4zWN5nfIvlKoi5AtLC1BmZfBl5a8vlXpHI0VE3/GeDV6fmHibF+DX9EfqDVrS53+h6/\nwfzxhU9lfp0qq7Otn3d0STk14ew61LDNEb82tyV+8R5JtBjskF5/InEgu57ogjmC5q/f04mWjo8D\n64F30GwFuA9wDRHA3Ux0EW1fUoalzA+srip5XztbECfTK4nuzXOI7pxexgxtIAKQFxK/nA8kBtC3\n8y/ECferRKDYOIjvTJxcN7ZJsxPzT7xXEftrh8K64n54CNFF1diHN6b17QKHnYjgoOGuwvtblX1v\nDe3GvPyyzbri+4t530kEu0s7pMnVus/uJLaruA+uLTy/i2br3D8S9eA8opX0ZR3yKNa5OWJ7G3mc\nQQRQAIfQrBM59bzTfuukuE2/phn8PwR4USG/m4nW1B07fFZrneqU/pnAd4h9fDPRwtX4f9+J3v9P\nc+py6/fY2ObWfIt1raFdnS37PE0ZAy2NkjniF+w9NLvCziBabh5EBB/vp1lv7yZaHh5F/Hp8NvHL\n+CqiNeX+RAC3LRH8tGtBgThRFcdrtI5DuosIqBp2onlgPYpoGdor5bEfcXLtdXD2acD/IU5Ga1PZ\n2rmDaO16GDG+53VE69FVqfztBj5vYH6L1YOJfXhdYV3xhHEV0Q21beGxJXEibHUN81u6tmD+uJ6i\nsu+tNf+iTgOOZ1ry3oro0t1ABEaN8jQUg4JuA5lb99mWxHat75IOYr++gjiZHwG8l/atLxuIQKCh\nsT2NPM4kWvRmgefRbL37Jd3rebftaxeQd3IVESgX68TWwIkd0rTWqbL09yO6s08kxuxtC3yR5v9S\nt//TbuXOrcutWut2a4uug+HVkYGWRsFM4W+jdetHad1WNLsV9yJ+0TcObCuIk8oiosvt90SQdi3R\n0rOaOIhvQgQkja6YVp8kxtI0xmgdyfyD5wXEHFiLiFam4udsRfziv5U4ubcOOO7kOu7d9fcZ4HFE\nd0lrF0vRs4gxNDPE+KR70uM84sRwAhFcbEazC+NjwGuJwGErohvm45SfbN9PdCE1Lk7YhggA2zmT\nCJieRIxHeivlx5cVtP/eoP0+yXFQIe9/AtYRgcoN6e9LUn6Ht3z+dcR3fp/CumKg/DGiJWoPIhA4\nnjg5l7WmFAPsF9GcquQWok6129efJL7P/VM5jiK6BNem128gxk2dSgzGblwQcA3V6nk71xH1ofWH\nQdkPhf8EnkN0tS8i6tcKuneP5qS/b3r8ithPz2R+l/4niW7IRxJ1u8r/WpW6DPPrwCeB19Aco3U0\n848POXXWq2KnmIGWRsFZxAn3VuIk+VKagdariJP2bcRYlk8U0u1IXJF1K3E10Rqa401eShy0G1cZ\nfYry7o3jiO6hK4jxIh9h/oHxNcTJ4WYi0CsOdn8XMRD5V8SJ8Ut0bpUpvraSa
MG6meguhDjBfpo4\n+X265HMgxqd8jdhva4F/J7otN6ayPpwIBn5JXNEIMcbldOIKxV8QLXVHtpSv6LNEt97HiX18MeUD\nei8jBl2fQbTQ3MT87pbitnf63t5N7IubiH1bZq7l+UeJE++NxODp/1N4/eXEAPNfESfa4iDos4mB\nztcS3dOtZT2bqHf/lbZrF+AvSsrRmvbxRFB2O3G14qtpzv1VdHkq70lEUPUs4ju8u/CeM4ixQWe0\npO1Uz3PmkWpc0Xgj8P2S7Sp+ztXEj6E3EvvrKiIwzD2XlKWfIfbTq4nA5iaiu/RzhbRfJurEN4h9\ndnbG9jV0q8udvscPEgHtRcAPgP8mfhg0guacOjspc5NpBD2CuGS48biV+EfajjhJXE5U4CXDKqDU\nxgp6H9uyEN5C59YsScPzTNoHzNLQbUKzr/tEmld0HM38q3qkYVvB8AKt7YiWtX2GlL+k+TYjuqY3\nJbo4v0PzymdppDyd5mXUP6Z5pdOOaVkaFSuodkXTQnk5Mcj9vUPIW1J7mxNjH28jxmOdjFcQakR9\nmBhvAzEmpWGmZVmSJEkV3JcY5NmY36U1sLoJSZKkCTOom3w+k7ha44a0fB3RZXgtMSfR9a0Jdv/j\n3ecuu+SyARVPkiSpLxcSdweZZ1Bze3ycuOz9tLR8InE58TuAY4irDo9pSTO3/rbyOQFXHb+Ko954\nVE+F6SeteZu3eZu3eZu3eZt3q9nFs9AmrhrEPFpbEjcOLc4JdALNO5vvj1cdSpKkCTSIrsM7ufed\n3W8igi9JkqSJ1e5+aKNi5VFv6Ny8t/NDWm85la+ftOZt3uZt3uZt3uZt3kWr374a4k4j84zy/Zc6\njtGSJEkaFcMcoyVJkjSVDLQkSZJqYqAlSZJUEwMtSZKkmhhoSZIk1cRAS5IkqSYGWpIkSTUx0JIk\nSaqJgZYkSVJNDLQkSZJqYqAlSZJUEwMtSZKkmhhoSZIk1cRAS5IkqSYGWpIkSTUx0JIkSaqJgZYk\nSVJNDLQkSZJqYqAlSZJUEwMtSZKkmhhoSZIk1cRAS5IkqSYGWpIkSTUx0JIkSaqJgZYkSVJNDLQk\nSZJqYqAlSZJUEwMtSZKkmhhoSZIk1cRAS5IkqSYGWpIkSTUZRKC1BDgT+BFwGfCnwErgauD89Dhw\nAOWQJEkaqE0HkMe7gS8CL0z5bQk8A1idHpIkSROp7kBrG2Bf4NC0fDdwa3o+U3PekiRJQ1V31+Eu\nwA3AKcAPgQ8CW6TXjgQuBE4muhclSZImSt2B1qbA44D3pr93Asek5V2APYFrgFU1l0OSJGng6u46\nvDo9vpeWzyQCrRsK7/kQcFa7xKuOb8Zfy/ddzt777t1TIZYunu36ng23re/ps7t51IN35JZbyuPZ\nJUs2culV15a+3q3sncrdb96d0veTtu68+7UQec+2+d7W11THFlpr2auUu9ftHmZdaxjGdpu3eQ86\n73643fPzXnvuWtadu65r+kGMk/oW8DfA5cTVhpsD7wQaR77XAk8ADmlJN7dQO3F28VLW37ah59eH\nmXen1+v87HHOu18LmXfdZR1VC7mf6q5rkrQQUkB2r7hqENM7HAl8lBiP9Rjg7cCJwEVp3X5EsCVp\nhKw6futhF2Eo+tnufveZeZv3oPLuh9tdzSACrQuJFqs9gBcAtwAvJYKuPYDnAdcNoBySKlh9wnQG\nWv1sd7/7zLzNe1B598PtrmYQ82hJlc0xA4vLXoMNjMd4J0nSdDPQ0kiaYa7zmB0ccyNJGn0GWhmG\neQWcqluoq9AWKu/ZxUuz8x7XutbvdkvSpDLQynDLLZt0bF3RaOn0fUG931m/eY9rXRvmPpekUTY2\ngVY/80lB5wP9kiUbO6btZ7xQp7Q56aG87P2UOydvt3th09add7/zxbWmn4N5ZemUdpz3eafWuF5a\nIPttwczNu9v3BdXn2TPv0f2+h9laPs7b3
UxTfQ6vhcp7bAKtfiYU7XcOnX7ybk1bdU6ffsreb979\njJPqlDYn/UJud7/pq+y3frd7IetaVcP8voeZdz+tiMNswRzmdk9r3sP8vs27t7wbGoFVleP5QuU9\nNoGWhqPXViWpKuuapElkoKVSrZG8M2yrLtY1SZPKQEvSVJvWOduGud3Tus+HadhjbqeZgZakqTat\nc7YNc7undZ8P0zDHQU47A60Be90xt09l3uPM/SZJ46mfKx4XioHWgB31xuGdtIeZ9zhzv0nSeBqF\nuQkHcVNpTQhbdjQo1jVJk8IWrQFpnSwN8iZMG6W8q7bs9DNJ7CjoZYK7SDe+2z0qUyxMS13rd1D4\nuG73MDkQf/CmfZ8baA3IoIKqUcl7Ei7X72W/jfN2j2vZx7Xc0N8A5XHZxlHjQPzBm/Z9bqAlaUHY\nuiJJ92agJalv49yqJEl1MtCSpB45keN0Geb3Pa11bRK220BL2VYdv7VTHWggxqWuOZHjdBnm9z2t\ndW0SttvpHZRt9Qlb95x2Wi/XH+ftHmbZrWvqZHbx0tKH4wE1amzR0kCMQ+tEHcZ5u8e17ONabuVx\nPKDGjYGWuirOJzW7eLhTVYyLYc6b1q9e5w8r6rVVybo2WKMyb9q0GPZ8UtP4fQ97n4OBljJ4sqtu\nnPfZQpS911alcd5v48aWocEb5nxS0/rdjsIcXgZakqbeNP7SlzQYBlqSppotO5Lq5FWHGohVx/d+\nFZlUhXVtuniVqeoyxwxLF8+WPuaYyfocAy0NRD+X60tVWNemi1eZqi4zzLHhtvWljxnmsj7HQEvS\ngrNVafzYMiTVw0BL0oKzVWn82DIk1cPB8JIWTOs8WDC4KRsWKm9bdiQtJAMtSQtmmPNgLVTetuxI\nWkgGWhoIWwmkyVI29xgMdv6xcbkB+UIYlX2uagy0NBDTciCUpsEozT22+oTpCLRGaZ+rmkEMhl8C\nnAn8CLgMeCKwHfA14HLgq+k9kqQh8UpRqR6DCLTeDXwReCTwGODHwDFEoLUrcHZaliQNiVeKSvWo\nO9DaBtgX+HBavhu4FXgucFpadxrwvJrLIUlZbNmRtJDqDrR2AW4ATgF+CHwQ2BLYAbguvee6tCxJ\nQ2fLjqSFVHegtSnwOOC96e+d3LubcC49NMFsJZAm1zCvKp7WK5qndbvHUd1XHV6dHt9Ly2cCbwCu\nBXZMf3cCrm+X+D9WrvrD8xXLl7Ni773rLKtqMDPbnERy9Qnxd2798OZaknIsrXAO63TJ/bZLNnb9\nrLL0OWkXQvF/dHZxb/+fq468HSqWtd/t7ufY0unzu+Vd5+tVvu9e9nk/hrndo5v3Gv5j5VmdP4D6\nA61rgV8Sg94vBw4ALk2PQ4F3pL+fbZd45VFH1Vw81c2gSpNsbv38y+tnZpfea12d6RfCMP5HF2K7\nPbZo+Faw8qhd/7B03OrVbd81iHm0jgQ+CtwX+DnwMmAR8Engr4ErgRcPoBySJGnMzMx2bjUedYMI\ntC4EntBm/QEDyFuSKjn2dY59kUbFKLT69suZ4SWpYOVRBlrSJClrERtUa9ggJiyVJE2wlau8qnjQ\n3Od55tZvmPcorrvp0msHUgYDLUlaIP12O45rt+Vxq/s76Y/rdg9Tv/tcg2OgJUkLpN9ux2nttpzW\n7dZ0cIyWJDF/XiaoNn1Aa9qq6aVcxbo2k54Ooq6NypV/49j6aaAlSfR3sjKo0qCM67xnC2UcWz8N\ntCRJGpBhXwE37YbRImagJUnqyzh25wzDKLUMTathtIg5GF6SRsS4XrLf90UAY7rdUg4DLUkaEdN6\nyf60bremg4GWJEnKNsyu4nFs/TTQkiRJ2YZ55d84tn4aaEmShmJmdjbNCzXTdi6ySedFBIM3jBYx\nAy1JUl96PXnNrV8/7zFtxnFOqHE3jBYxAy1JGhHj2sIxjt050qDkBlrLgAPS8y2AxbWURpKmmC0c\n0uTJC
bReAXwK+EBafhDwmdpKJEmSRtYwr/wbx1bfnEDr74B9gNvS8uXAA2srkSRJNZqZXdr24W1w\n8gyzq3gcW31zAq3fpkfDpsBcPcWRJKk+c+s3zHsU19106bUDLcs4zgk17obRIpYTaJ0DvIkYm/U0\nohvxrDoLJUkaH+PYnTMKvIhg8Eb1XodHAzcAFwNHAF8E3lxnoSRpGo1rC8c4dudIg7JpxuuXALsB\n/1F/cSRp+hQn6zxuNVM5p5Q0qbq1aN0N/AR4yADKIklTadon7tR48V6H1eR0HW4HXAp8gxibdRbw\n+ToLJUnSIDi+rDrvdVhNt65DgLekv40rDWfwqkNJ0gQYZtBgkDd4K1dtPfDvPKdFaw3wY2I2+K2B\ny4grESVJGsvunFEwTRcRjMrcZaN6r8MXA98FXpSen5eeS5I0lt05GpxRmrtsGHK6Dt8MPAG4Pi1v\nD5xNzKclSZKkEjktWjPEPFoNN6Z1kiRpynivw2pyWrS+DHwFOIMIsP4c+FKdhZIkjb7i/F8z6em4\nTE9RLDuMT7lHwXGrBz+gvHWuORif7ywn0Ho98GfAk9LyB4DP1FYiSdJYGJcTXTujUPZhXAE3rhbq\n+xrVex3uQtx253Xp8WVgWY1lkiRp4k3rRQRDnfB0RO91eCZwT2F5Y1onSZJUybS14uV0HS4CfldY\n/i1wnwp5XAncRgRrvwf2AlYCf0NzkP0biJYySZKkkTAzu7T0tdw5wHICrV8BBwOfS8sHp3W55oAV\nwE0t61anhyRJGhPjeOVfLxpzfjXMzC6917ocOYHW3wIfBd6Tlq8GXlIxn3bTQThFhCRJY2bauv76\nlTNG62fAE4HdgUcCy9O6XHPA14HvAy8vrD8SuBA4GVhS4fMkSRp709IyNEqGMQdYTqD198R9Du8A\n3g38EHhGhTyeBDwWeCbwd8C+wPuIqxn3BK4BVlX4PEmSxt60tgwNc8LTYVzpmdN1eDjwLiK42g54\nKXA6MYlpjmvS3xuI+bf2As4tvP4h4Kx2CVeuasZfK5YvZ8Xee2dmKUmSRtGwJzydmV2YebnWrF3L\nmnXruued8VkXA48G/g1YA3waOJ9opepmC+KqxduBLYGvAscBFwGNO0m+lriX4iEtaedGYUI5SZKg\n+2DoXgdLT5tx3U/dJphNwdy94qqcrsMfEAHSQcQUDIuJubRy7EC0Xl0AfBf4QvqsE4lg60JgPyLY\nkiRJI26YXX/D1GsrXE6L1iJiLNXPgVuA+wOzRKBUJ1u0JEkjwxat0O92Tup+6qdF6x6iVeuWtHwj\n9QdZkiRNtGltGZo2OYGWJElaYN7rcDoYaEmSpIGZtmktcgKt1cCj6i6IJEnSqOq1qzcn0PoR8B/A\necTteLbpKSdJksTM7GwaOD0zb36ncTFtXX8NvXb1Vrnf4G7AYcR8V98GPgh8s6dc83jVoSRpZHjV\n4XTr/v33ftUhxBQPuxH3OryBmP/qdcAnqhZUkiRpWuQEWu8EfkJMWPo24E+AdwDPIebXkiRJyjJt\n01rkBFoXAXsAryDGaRU9ccFLJEmSJta0TWuRE2i9BLizZd3Z6e8tSJIkTbheLwLoFGhtTtxu5wHA\ndoXHMuIWPJIkacpMW9dfQ6/zf3UKtI4Avg88grgFT+PxeeA9PeUmSZLG2rR1/fVr0w6vvSs9jgRO\nGkxxJEnSJCrOGTYzC9MyhVOnQGt/4BvABuAFbV7/dC0lkiRJE2daAqtWnQKt/YhA6znAXJvXDbQk\nSZI66BRoHZv+HjaAckiSJI2slau27mlAfKdA66g26+aI6eXniJtNS5KkKTLN9zpc6EBra9p3Gc6U\nrJckSROu12kOplWnQGvloAohSZI0iToFWg2bA38N7J6eN1qzDq+rUJIkSZMg5xY8pwM7AAcCa4Cd\ngTtqLJMkSdJEyGnRejjwQuBg4DTgDODbdRZKkiSNluKEow3TNDdWrxc
B5ARav0t/bwUeDVwLbN9T\nbpIkaSxNU1DVTq8XAeQEWh8kbib9ZuI+h1sBb+kpN0mSpCmSG2gBnAPsUmNZJEmSJkpOoHVs4Xlx\n/qy3LnBZJEmSJkpOoHUnzQBrc+DZwGW1lUiSJGlC5ARa/9qy/C/AV2soiyRJ0kjq9V6HOfNotdoS\nuPc1npIkSRPquNVb95Qup0Xr4sLzTYAH4vgsSZKkrnICrecUnt8NXAf8vp7iSJIkTY6cQOu2luXW\ntrObFqgskiRJEyUn0Poh8GDg5rS8LXAVcSXiHPDQeoomSZI03nIGw3+NmNLh/unxLOKqw13IC7Ku\nBC4CzgfOS+u2S597efqsJVUKLUmSNEi93uswJ9BaDnyxsPwlYO8KecwBK4DHAnuldccQgdauwNlp\nWZIkaST1eq/DnEBrA3Gfw2VEK9abgKp3lpxpWX4ucFp6fhrwvIqfJ0mSNPJyAq2/JKZ0+Azw6fT8\nLyvkMQd8Hfg+8PK0bgfi6kXS3x0qfJ4kSdJYyBkMfyPw6j7yeBJwDbA90V3445bXG4Pq72XlqlV/\neL5i+XJW7F2lx1KSJKkea9auZc26dV3f19qlV7djgTuIlq0VwLXATsA3gd1a3js3t75qD6UkSfWY\nmV3K3PoNPb+uyTYzOwtt4qpebsFTxRY0593aEng6MdP854FD0/pDgc/WXA5JkqSerVzV2y146g60\ndgDOBS4Avgt8gZjO4QTgacT0DvunZUmSpJFUx70OTyo8n2N+c9gceeO2rgD2bLP+JuCAjPSSJElj\nq1OL1g/S437A44jWp58SgdN96y+aJEnSeOvUonVq+vtKYB+aN5J+H/DtGsskSZI0EXLGaC0BFheW\nt8Zb5kiSJHWVM4/WCcSNpdek5f2AlTWVR5IkaeT0eq/D3Hm0dqJ5n8LvEvNf1c15tCRJI8N5tNRJ\nP/NobUJcIbgH8DliIPxeHVNIkiQpK9B6L7Cc5v0N70jrJEmS1EHOGK0nAo8Fzk/LNwH3qa1EkiRJ\nEyKnRet3wKLC8vbAxnqKI0mSNDlyAq2TgM8ADwSOB/4HeHudhZIkSRolvd7rMPeqw0cCT03PzwZ+\n1FNu1XjVoSRpZHjV4XTr/v33ftXhw4h7Fr4HuJS4GbQTlkqSJHWRE2h9GrgbeDjwAWBn4Iw6CyVJ\nkjQKZmZnU2vVTOF5vpyrDjcSgdYLiPFaJ9G8AlGSJGli9TuMKfeqw0OAlwJfSOuc3kGSJKmLnEDr\ncGLC0rcRY7UeCvxnnYWSJEmaBDldh5cCRxaWf0HcaFqSJEkd5ARauxLzZ+0ObJ7WzREtW5IkSSqR\n03V4CvB+YkD8CuA04KM1lkmSJGki5ARamwNfJybh+l9gJfCsGsskSZI0EXK6Dn9D3OvwZ8D/BTYA\nW9ZZKEmSpEmQE2j9PbAF8Grgn4DFwKF1FkqSJGkS5ARa56W/M0SwdVt9xZEkSZocOWO0ngBcXHhc\nCDy+zkJJkiRNgpwWrQ8DrwLOTcv7pHWPqatQkiRJkyCnRetumkEWwLfTOkmSJHXQqUXrT9Lfc4AP\nAB9Ly3+e1kmSJKmDToHWKmIG+IZj09+ZlvWSJElqo1OgtWJQhZAkSZpEOWO0ir5QSykkSZImUNVA\na7aWUkiSJE2gqoHWBbWUQpIkaQJVDbRe1kMei4DzgbPS8krg6rTufODAHj5TkiRp5HUaDH9xh9fm\nyJ+w9DXAZcDWhbSr00OSJGlidQq0npP+vir9PZ2Y2uGvKnz+g4CDgLcBr0vrZtJDkiRponUKtK5M\nf58O7FlYfxHR5Xd0xue/E3g9sLiwbg44Engp8H3gKOCWvOJKkiSNj5wxWjPE/Q0bnkRei9SzgeuJ\noKz4/vcg7eSmAAATZ0lEQVQBuxDB2zXExKiSJEkTJ+em0ocDpwDbpOVbyBsUvzfwXKLrcDOiVesj\nREtWw4doDpK/l5WrmjHYiuXLWbH
33hnZSpIk1WvN2rWsWbeu6/uqjJXaJr2/l26+/YB/IMZ97US0\nZAG8FngCcEibNHNz69f3kJUkSQtvZnYpc+s39Py6JtvM7Cy0iatyWrR2JAazzxJTMewOLAdOrpI/\nzfsjngjskZavAI6o8DmSJEljIyfQOpXoOnxTWv4p8EmqBVpr0gPgJRXSSZIkja2cwfAPAD4B3JOW\nfw/cXVuJJEmSJkROoHUHcP/C8p8Ct9ZTHEmSpMmR03V4FHFl4EOBtcD2wAvrLJQkSdIk6BZoLQKe\nnB67EYPafwL8ruZySZIkjb1uXYf3EFMv3A1cQtz/0CBLkiQpQ07X4beB9xAD4u+kOVXDD2sslyRJ\n0tjLCbQeSwRWb21Z/5SFL44kSdLkyAm0VtRdCEmSpEmUE2htBvwZsIwYHN/oOmxt4ZIkSVJBTqD1\nOeL+hj8AflNvcSRJkiZHTqA1Czyj7oJIkiRNmpxAay3wGOCimssiSdJIm5ldWvratks2DrAkGhcz\nHV67OP1dBPwRcAXw27Rujgi+6jQ3t359zVlIktSbmdmlzK3fMOxiaETMzM5Cm7iqU4vWc4iAqm1C\nSZIkddYp0LoO+Fvg4US34cnEDPGSJEnK0OkWPKcBf0J0IR4ErBpIiSRJkiZEpxatRwKPTs8/BHyv\n/uJIkiRNjk4tWneXPJckaeod+7rbh10EjYFOg9zvAe4qLG8O/Do9nwMW11WoRh5edShJksZBL1cd\nLqqtNJIkSVOgU9ehJEmS+mCgJUmSVBMDLUmSpJoYaEmS1IOVq7YedhE0Bkb51jpedShJGlne61BF\nZVcd2qIlSZJUEwMtSZKkmhhoSZIk1cRAS5IkqSYGWpIk9cB7HSqHVx1KkiT1yasOJUmSBsxAS5Ik\nqSaDCLQWAecDZ6Xl7YCvAZcDXwWWDKAMkiRJAzeIQOs1wGXAXFo+hgi0dgXOTsuSJEkTp+5A60HA\nQcCHaA4Qey5wWnp+GvC8mssgSdKC816HylF3oPVO4PXAxsK6HYDr0vPr0rIkSWPluNUGWuquzkDr\n2cD1xPissmkk5mh2KUqSJE2UTWv87L2JbsKDgM2AxcDpRCvWjsC1wE5EMNbWylWr/vB8xfLlrNh7\n7xqLK0mSlGfN2rWsWbeu6/sGNWHpfsA/AM8BTgRuBN5BDIRfQvsB8U5YKkkaWTOzS5lbv2HYxdCI\nGIUJSxtdhCcATyOmd9g/LUuSJE2cOrsOi85JD4CbgAMGlK8kSbXwXofK4b0OJUmS+jQKXYeSJElT\nxUBLkiSpJgZakiRJNTHQkiRJqomBliRJPfBeh8rhVYeSJPXACUtV5FWHkiRJA2agJUmSVBMDLUmS\npJoYaEmSJNXEQEuSpB54r0Pl8KpDSZKkPnnVoSRJ0oAZaEmSJNXEQEuSJKkmBlqSJEk1MdCSJKkH\n3utQObzqUJKkHnivQxV51aEkSdKAGWhJkiTVxEBLkiSpJgZakiRJNTHQkiSpB97rUDm86lCSJKlP\nXnUoSZI0YAZakiRJNTHQkiRJqomBliRJUk0MtCRJ6oH3OlQOrzqUJKkH3utQRV51KEmSNGAGWpIk\nSTWpO9DaDPgucAFwGfD2tH4lcDVwfnocWHM5JEmSBm7Tmj//N8BTgLtSXt8G9gHmgNXpIUmSNJEG\n0XV4V/p7X2ARcHNaHuWB+JIkdeS9DpVjEIHWJkTX4XXAN4FL0/ojgQuBk4ElAyiHJEkLZuVRBlrq\nbhCB1kZgT+BBwJOBFcD7gF3S+muAVQMohyRJ0kDVPUar6Fbgv4HHA2sK6z8EnNUuwcpVzfhrxfLl\nrNh77xqLJ0mSlGfN2rWsWbeu6/vqHif1AOBu4BZgc+ArwHFE9+G16T2vBZ4AHNKS1glLJUnSWCib\nsLTuFq2dgNOILspNgNOBs4GPEN2Gc8AVwBE1l0OSJGngRvnKP1u0JEkja+WqrR0Qrz8oa9Ey0JIk\
nqQfe61BF3utQkiRpwAy0JEmSamKgJUmSVBMDLUmSpJoYaEmS1APvdagcXnUoSZLUJ686lCRJGjAD\nLUmSpJoYaEmSJNXEQEuSJKkmBlqSJPVg5aqth10EjQGvOpQkqQfe61BFZVcdbjr4okiSNL7SCTU9\nj782DKiMgZYkSRUYVKkKx2hJkiTVxEBLkiSpJgZakiRJNTHQkiRJqomBliRJUk0MtCRJkmpioCVJ\nklQTAy1JkqSaGGhJkiTVxEBLkiSpJgZakiRJNTHQkiRJqomBliRJUk0MtCRJkmpioCVJklQTAy1J\nkqSaGGhJkiTVxEBLkiSpJnUGWpsB3wUuAC4D3p7Wbwd8Dbgc+CqwpMYySJIkDU2dgdZvgKcAewKP\nSc/3AY4hAq1dgbPTcmVr1q7tuWD9pDVv8zZv8zZv8zZv885Vd9fhXenvfYFFwM3Ac4HT0vrTgOf1\n8sFr1q3ruVD9pDVv8zZv8zZv8zZv885Vd6C1CdF1eB3wTeBSYIe0TPq7Q81lkCRJGopNa/78jUTX\n4TbAV4juw6K59JAkSZo4MwPM6y3Ar4G/AVYA1wI7ES1du7V5/wXAHoMqnCRJUh8uJBqXBuYBNK8o\n3Bz4FvBU4ETg6LT+GOCEQRZKkiRpEjwa+CHRMnUR8Pq0fjvg6zi9gyRJkiRJkqbBh4krFS/uMf3O\nNK9+vAR4dYW0ZROwVrEIOB84q4e0VxItg+cD5/WQfglwJvAjovx/mpnuESnPxuNWqu03gDcQ+/xi\n4AzgfhXSvialuyQ976ZdHcmdJLdd2helst8DPK6HvP+F2OcXAp8mLgzJTftPKd0FxJxzO1fMu+Eo\n4sKU7SqkXQlcTfN7P7CHvI8ktv0S4B0V0n68kO8V6W+VvPci/kfOB74HPKFC2j2AdcT/2ueBrUvS\nlh1LcutaWfrc+laWPqe+laXNqW/djqHd6lpZ+pV0r2+d8s6pa2XpP0H3+laWNreulaXPqW/9Tv5d\nlj63rpWlz6lrZWlz6lq3c263ulaWfiXd61qnvHPq2ljbF3gsvQdaO9IcqLYV8BPgkRXSb5H+bgp8\nh5iAtYrXAR8l/qGquoLyCpXjNODw9HxTyk/4nWwCXEPnE36rZcAvaAZXnwAOzUz7x8R3vRkRpH4N\neFiXNO3qyInAP6bnR1M+LrBd2t2IyXW/SfdAq136p9GcRuWEinkXD7pHAh+qmDfEd/VlOtefdmmP\nJeprjnbpn0J8X/dJy9tXLHfDvwJvrpj3GuAZ6fkzie8uN+330nqAlwFvLUlbdizJrWtl6XPrW1n6\nnPpWljanvnU6hubUtbL0OfWtLG1uXcs5/pfVt7K0a8ira2Xpc+tbu3NPbl0rS1/l2NYufe6xrV3a\n3GNb2Tk3p66Vpc89trVLm1vX5hm3ex2eS0x62qtriQgV4A4iKl1aIX3rBKw3VUj7IOAgokL1erVn\nr+m2If6ZP5yW7yZapqo6APg58MsKaW4Dfk9U2k3T3/WZaXcjflX8hvjVdQ7wgi5p2tWR3Ely26X9\nMfGLMUe79F8jfnVBbMuDKqS9vfB8K+BXFfMGWE3zYFw1bW59a5f+lcSvwN+n5Rsq5t3I/8XAxyrm\nfQ3NHxJLKK9v7dL+UVoPMZb0z0rStjuWzJJf18qORbn1rSx9Tn0rS5tT3zodQ3PqWtl+g+71rSzt\n35JX17od/zvVt7K8c+taWfrc+tbv5N/tzl1Vjm3t0uce29qlzT22lZ1zc+pau/SN//ecY1u7tLl1\nbewto/cWrdbP+V/iS87VmID1duLXRBWfIn4970dvXYe/IJo5vw+8vGLaPYl/hFOICxQ+SDNar+LD\nwKt6SPcKYp9dD5xeId1uxC+/7YjyrgPenZFuGfPrSPFkOkPnYL01bUPOr75O6SG+90Mqpn0bcBVx\nUOx24Uhr+oOBd6bn3X75taY9luiuvhA4uYe8zyea6L9D/Op/f
IW0DU8mfvF305r+IcSPgauILoJO\nLbCtaf+H2G8Qv3pvy8z/f4lf6VXqWmv64rEot76VpYfu9a1d2qr1rZG2Sl1rl76X+tbY51XqWru8\nG6rUt0baKnWtNf3W5Ne3dueeKnWt07krp651O/d1qmtlaXPqWru0Vepau/S5da1d2l7q2lhaRv+B\n1lZEwNLT7X+IXzDfIeYDy/Fs4N/T8xX0FmjtlP5uT3z5+3Z4b6vHExF4Y/zAuyhvoi5zXyJ6z2oq\nLXgY0cd9f6JF6zPAX1VIfzjxXZ0DvJfmP1gnyygPtKBzS2Rr2oZ+A603Af/VY1qIqVBOqZB+CyK4\nXpyWryC+g9y8H0gcvGeAfyYOSLl5k543guInED8UctM2vA94bZd826X/OvD89PxFxC/v3LSPICZX\n/j7w/+jcighxLPkBzWNJlbrWSN/uWJRb38rS59S3TsfBbvWtmLZqXWuXd5X61pq2Sl1rl74hp761\npq1S19qlr1rfGueep1C9rhXTryisqxLUt0ufU9fK0kLesa2R9qD0t0pda8276rGtmLZqXRtby+gv\n0LoPUbH/vs9yvAX4h8z3Hk/86rmCaGq+E/hIH3kfSwwEzLVjyrthH+ALFfM8mOgTr+rPmd///hKa\nQWdVxxNNt90sY34d+TGxDyAC1h9XSNvQT6B1GPHLdbMe0jY8mBh8mZv+0cRA7yvS4/fEr7gH9pB3\np9fK3vMlovW24WeUHxDbff6mRHdLTtd+a/piq8AMnbvJ2+XdsCsRQJRpdyypUtc6HYty6ltZ+sPo\nXt+6HQc71bfWtFXrWre8l1H+nbRLW6WuleWdU9/apa1S17ptd7f61tA491Spa+3SN1QJtFrTH0be\nsa0sb8g7tjXSvplqda1b3svIiycaaavUtT8YtzFa/ZohotfLiFadKlonYH0ana+GKnoj0aS8C/AX\nwDeAl1bIewuagwe3BJ5OtWDzWiLQ2zUtH0BcbVLFX9J5rEyZHxNXOG5O7P8DiP2fq/EP9GDil+MZ\nPZTh8zQH4B8KfLaHz4DexsgdSMwhdzAx1qyKPyo8P5j8+gZRP3Yg6twuRLfG44ju2xw7FZ4/n+o/\nbj4L7J+e70q0iN5YIf0BxDiWDRXzhTj4NQ6G+5M/DgWaLbabEAf195W8r+xYklvXco5FnepbWfqc\n+laWNqe+tUtbpa6V5Z1T38rS5ta1Tvu8W30rS5tb18rS59S3snNPbl3LOXd1qmtl6XPqWlnahxfe\nU1bX2qVdR35dK8t7x8J7yupaWdp+j2tj4WPEP8JvicDhZRXT70MM3ruAvMvWi8omYK1qP6pfdbhL\nyvcCIvJ/Qw/57kGMP+g2zUA7WxJN2mWXunfzjzSndziN5hUbOb6V0l7Ave+V2U6jjvyOZh3JnSS3\nNe3hRBP/L4nbR11L/KLJzftw4KfEeIxGfXtvhbRnEvvsAqJpvtOvtm7/G7+gfCxDu7w/QtTzC4mD\nS6ebv7fL+z7EeLyLia61FRXLfQoxtq+bdt/342lemr2OGBuZk/Zw4rL7n6TH8R3yLTuW5Na1dumf\nSX59K0ufU9/K0ubUt7K0RZ3qWln6nPpWts9z61qn43+3+lZW7ty6VpY+p771O/l3Wfrnk1fXytLn\n1LWytDl1Leec26mulaXPqWtlaXPrmiRJkiRJkiRJkiRJkiRJkiRJkiRJkiRJkiRpMt1DzLlzETGf\nW5V7jXZyJXn31+vFc4Cj0/PnAY8svHYc8NSa8pUkSark9sLzU6l2K6lOcm9k3K9TgT8bQD6SJsS0\n3YJH0uhYR9x0nPT3S8TNdb9F3GwXojXpO8QszV+jOYP0/YnZsC8BPkj5LUTuAFan932duLUGwJ7p\ncxt3SmjMqv1q4k4EF9K83dNhwEnA8lSef0nleSjzA6+npvUXEbdbuW9afyWwkphJ+qLCtkmSJC2o\nRovWIuLWG69Ky2fTvP/ZE
9MyzL+tyN8A/5qe/xtxbziAg4jbm7Rr0dpI3KcT4sawJ6XnFwH7pufH\nAe9Mz9fTvEXU4vT30EK6U4AXFD6/sbwZcFVhG04DXpOeXwH8XXr+SiIwlCRJWnB3E2O0rgfOI1rV\ntwLuonnPtPNp3vT80UTL1UXEDcq/mNafDywrfO6NtA+07qbZcr9LSreYuEdbw0OJ1iaIVrVPAX9F\n3OMT7h1oFbsOG8t7AOcU1u9PBJIQgVbjpslPJFrmJE0Juw4lDdKviRvvPgT4DXAw0e13S1rfeDwq\nvf8kovXqMcARwOaFzyrrLiwzA8yVrG94FvDvwOOIm7AvapNPu89oXdea12/T33uATTPLK2kCGGhJ\nGoZfE+Oh3kaMo7oCeGF6bYYIrCBanzak54cV0n8LOCQ9fyawbUk+mwAvSs8PAc4FbgNuBvZJ618C\nrEn5Pjg9PwbYhntfFXk7zS7FhjngJ0QLW2PM2UuY38IlaUoZaEkapGIrzwXAz4AXE111f53WXQI8\nN71nJdGV933ghkL644Anp/c+n/ldgUV3AnsBFwMrgLem9YcSg9ovJIK6txItTacT3ZQ/BN4N3Jry\nbOT7ceD1RFfjQwv5/BZ4WSrrRUSX5fvbbPMc7VvEJEmSxs7t3d8iSfWxRUvSJLP1SJIkSZIkSZIk\nSZIkSZIkSZIkSZIkSZIkSZIm1/8H4h9yRaLbelYAAAAASUVORK5CYII=\n", + "text": [ + "<matplotlib.figure.Figure at 0x4279e90>" + ] + } + ], + "prompt_number": 15 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Writing sequence files with `SeqIO`\n", + "\n", + "Often, after manipulating parsed sequences, we need to write it back to a file. This is accomplished using `SeqIO.write` in Biopython. The function takes as its input:\n", + "\n", + "1. An iterable returning `SeqRecord` objects (generators and lists are examples of iterables).\n", + "2. A filename to write to *or* a file-like handle.\n", + "3. The format to write to.\n", + "\n", + "It returns the number of sequences written." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "records = SeqIO.parse('data/parse.fastq', 'fastq')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 16 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "SeqIO.write(records, 'my_sequences.fa', 'fasta')" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 17, + "text": [ + "2" + ] + } + ], + "prompt_number": 17 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that we're free to write the output in any format supported by `SeqIO.write` so long as the information is adequate. In this case, reading `my_sequences.fa` and writing it back to `parse.fastq` would not be possible since we have discarded the quality information.\n", + "\n", + "Additionally, the sequence records are written one-by-one to the file (courtesy of `SeqIO.parse`). So even if the `data/parse.fastq` is 10GB large, we can run the commands with only 1GB of free memory (for example)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Additional `SeqIO` methods\n", + "\n", + "There are several more useful `SeqIO` functions, which you are free to try out on your own:\n", + "\n", + "* `SeqIO.index`: For efficiently fetching random records from a large sequence file.\n", + "* `SeqIO.index_db`: Similar to `SeqIO.index`, but with a persistent index.\n", + "* `SeqIO.convert`: Shortcut for converting between file formats." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a id=\"entrez\"></a>\n", + "## Fetching from online resources: NCBI's Entrez\n", + "\n", + "The `Bio.Entrez` library provides an interface to [NCBI's Entrez e-utilities](https://www.ncbi.nlm.nih.gov/books/NBK25500/). One example we are demonstrating today is the `Entrez.efetch` utility to retrieve various records from one of NCBI's databases." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from Bio import Entrez" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 18 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To monitor potential excessive use of their services, NCBI requests you to specify your email address with each request. With Biopython, you can set it once for your session like this:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "Entrez.email = 'python@lumc.nl'" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 19 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fetching with `efetch`\n", + "\n", + "The `Entrez.efetch` function returns a file-like handle that instead of pointing to a local file, points to a remote resource. This file handle is similar to the local file handle we saw on Tuesday." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "efetch_handle = Entrez.efetch(db=\"nucleotide\", id=\"NM_005804\",\n", + " rettype=\"gb\", retmode=\"text\")" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 20 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can of course do an `efetch_handle.read()` and see the entire contents of the genbank file. But we know how to work with `SeqIO.read` now, so let's use that instead (recall that `SeqIO.read` works equally well with file handles and local file names)." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "ncbi_record = SeqIO.read(efetch_handle, 'genbank')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 21 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print ncbi_record" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "ID: NM_005804.3\n", + "Name: NM_005804\n", + "Description: Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 39A (DDX39A), transcript variant 1, mRNA.\n", + "Number of features: 24\n", + "/comment=REVIEWED REFSEQ: This record has been curated by NCBI staff. The\n", + "reference sequence was derived from DA432925.1, BC001009.2 and\n", + "BM792110.1.\n", + "This sequence is a reference standard in the RefSeqGene project.\n", + "On Oct 14, 2010 this sequence version replaced gi:21040370.\n", + "Summary: This gene encodes a member of the DEAD box protein family.\n", + "These proteins are characterized by the conserved motif\n", + "Asp-Glu-Ala-Asp (DEAD) and are putative RNA helicases. They are\n", + "implicated in a number of cellular processes involving alteration\n", + "of RNA secondary structure, such as translation initiation, nuclear\n", + "and mitochondrial splicing, and ribosome and spliceosome assembly.\n", + "Based on their distribution patterns, some members of the DEAD box\n", + "protein family are believed to be involved in embryogenesis,\n", + "spermatogenesis, and cellular growth and division. This gene is\n", + "thought to play a role in the prognosis of patients with\n", + "gastrointestinal stromal tumors. A pseudogene of this gene is\n", + "present on chromosome 13. Alternate splicing results in multiple\n", + "transcript variants. Additional alternatively spliced transcript\n", + "variants of this gene have been described, but their full-length\n", + "nature is not known. 
[provided by RefSeq, Sep 2013].\n", + "Transcript Variant: This variant (1) represents the longer\n", + "transcript.\n", + "Publication Note: This RefSeq record includes a subset of the\n", + "publications that are available for this gene. Please see the Gene\n", + "record to access additional publications.\n", + "##Evidence-Data-START##\n", + "Transcript exon combination :: U90426.1, BC001009.2 [ECO:0000332]\n", + "RNAseq introns :: mixed/partial sample support\n", + " ERS025081, ERS025082 [ECO:0000350]\n", + "##Evidence-Data-END##\n", + "COMPLETENESS: complete on the 3' end.\n", + "/sequence_version=3\n", + "/source=Homo sapiens (human)\n", + "/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']\n", + "/keywords=['RefSeq']\n", + "/references=[Reference(title='Up-regulation of DDX39 in human malignant pleural mesothelioma cell lines compared to normal pleural mesothelial cells', ...), Reference(title='A whole-genome association study of major determinants for allopurinol-related Stevens-Johnson syndrome and toxic epidermal necrolysis in Japanese patients', ...), Reference(title='DDX39 acts as a suppressor of invasion for bladder cancer', ...), Reference(title='Clinical proteomics identified ATP-dependent RNA helicase DDX39 as a novel biomarker to predict poor prognosis of patients with gastrointestinal stromal tumor', ...), Reference(title='Interferon-induced antiviral protein MxA interacts with the cellular RNA helicases UAP56 and URH49', ...), Reference(title='The closely related RNA helicases, UAP56 and URH49, preferentially form distinct mRNA export machineries and coordinately regulate mitotic progression', ...), Reference(title='Hcc-1 is a novel component of the nuclear matrix with growth inhibitory function', ...), Reference(title='Growth-regulated expression and G0-specific turnover of the mRNA that encodes URH49, a 
mammalian DExH/D box protein that is highly related to the mRNA export protein UAP56', ...), Reference(title='Analysis of a high-throughput yeast two-hybrid system and its use to predict the function of intracellular proteins encoded within the human MHC class III region', ...), Reference(title='The BAT1 gene in the MHC encodes an evolutionarily conserved putative nuclear RNA helicase of the DEAD family', ...)]\n", + "/accessions=['NM_005804']\n", + "/data_file_division=PRI\n", + "/date=25-MAY-2014\n", + "/organism=Homo sapiens\n", + "/gi=308522777\n", + "Seq('AGCAGCAGCCCGACGCAAGAGGCAGGAAGCGCAGCAACTCGTGTCTGAGCGCCC...AAA', IUPACAmbiguousDNA())\n" + ] + } + ], + "prompt_number": 22 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Entrez.efetch` also allows you to fetch multiple records in one go:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "efetch_handle = Entrez.efetch(db=\"nucleotide\", id=[\"NM_005804\",\"NM_000967\"],\n", + " rettype=\"gb\", retmode=\"text\")" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 23 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for record in SeqIO.parse(efetch_handle, 'genbank'):\n", + " print record.id, record.description" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "NM_005804.3 Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 39A (DDX39A), transcript variant 1, mRNA.\n", + "NM_000967.3 Homo sapiens ribosomal protein L3 (RPL3), transcript variant 1, mRNA.\n" + ] + } + ], + "prompt_number": 24 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For a full argument list of `Entrez.efetch`, consult its [documentation page](https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Other `Bio.Entrez` utilities\n", + "\n", + "Biopython has the entire Entrez suite supported. A complete list of the Entrez services is available in its [documentation](https://www.ncbi.nlm.nih.gov/books/NBK25500/)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a id=\"blast\"></a>\n", + "## Performing a remote BLAST search\n", + "\n", + "A common action for bioinformaticians is to perform a BLAST search. Biopython provides a way to automate the search and helps you interpret the results." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using NCBI's BLAST\n", + "\n", + "Entrez is not the only online service Biopython interacts with. We can also submit BLAST searches to NCBI using the `qblast` function in the `Bio.Blast.NCBIWWW` module.\n", + "\n", + "Similar to Entrez, Biopython tries to conform to the NCBI's remote BLAST required parameters in the function call. The official NCBI documentation is [here](https://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html).\n", + "\n", + "Let's do a short BLAST search using our earlier fetched `ncbi_record` (NM_005804). " + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from Bio.Blast.NCBIWWW import qblast" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 25 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "blast_handle = qblast('blastn', 'refseq_mrna', ncbi_record.seq)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 26 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the command returns the BLAST result in an XML file (one of the available formats to download from an interactive BLAST session). 
We will use `blast_handle` in the next section, but of course we could also write the results to a file for viewing later:\n", + "\n", + " blast_file = open('my_blast_output.xml', 'w')\n", + " blast_file.write(blast_handle.read())\n", + " blast_file.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parsing the BLAST search results\n", + "\n", + "Of course, Biopython has parsing capabilities for the results as well. As mentioned briefly above, parsing for these types of file is available in the `Bio.Blast.NCBIXML` and `Bio.SearchIO` submodules. The former is an old module that will be deprecated soon and replaced by the latter (which is still in an experimental stage, but already stable enough).\n", + "\n", + "Here's a short example using `Bio.SearchIO`:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from Bio import SearchIO" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stderr", + "text": [ + "/home/martijn/.virtualenvs/programming-course/local/lib/python2.7/site-packages/Bio/SearchIO/__init__.py:213: BiopythonExperimentalWarning: Bio.SearchIO is an experimental submodule which may undergo significant changes prior to its future official release.\n", + " BiopythonExperimentalWarning)\n" + ] + } + ], + "prompt_number": 27 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "qresult = SearchIO.read(blast_handle, 'blast-xml')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 28 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "qresult" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 29, + "text": [ + "QueryResult(id='37723', 50 hits)" + ] + } + ], + "prompt_number": 29 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print qresult" + ], + "language": "python", + 
"metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "Program: blastn (2.2.29+)\n", + " Query: 37723 (1558)\n", + " No definition line\n", + " Target: refseq_mrna\n", + " Hits: ---- ----- ----------------------------------------------------------\n", + " # # HSP ID + description \n", + " ---- ----- ----------------------------------------------------------\n", + " 0 1 gi|308522777|ref|NM_005804.3| Homo sapiens DEAD (Asp-G...\n", + " 1 1 gi|397471065|ref|XM_003807080.1| PREDICTED: Pan panisc...\n", + " 2 1 gi|426387514|ref|XM_004060164.1| PREDICTED: Gorilla go...\n", + " 3 1 gi|395750601|ref|XM_002828787.2| PREDICTED: Pongo abel...\n", + " 4 1 gi|402904531|ref|XM_003915048.1| PREDICTED: Papio anub...\n", + " 5 1 gi|544509259|ref|XM_005588244.1| PREDICTED: Macaca fas...\n", + " 6 1 gi|426387518|ref|XM_004060166.1| PREDICTED: Gorilla go...\n", + " 7 1 gi|635036575|ref|XM_007995524.1| PREDICTED: Chlorocebu...\n", + " 8 1 gi|544509261|ref|XM_005588245.1| PREDICTED: Macaca fas...\n", + " 9 1 gi|403302190|ref|XM_003941697.1| PREDICTED: Saimiri bo...\n", + " 10 1 gi|301601638|ref|NM_001193491.1| Macaca mulatta DEAD (...\n", + " 11 1 gi|562865790|ref|XM_006161013.1| PREDICTED: Tupaia chi...\n", + " 12 1 gi|640797366|ref|XM_008056626.1| PREDICTED: Tarsius sy...\n", + " 13 1 gi|586527809|ref|XM_006918771.1| PREDICTED: Pteropus a...\n", + " 14 1 gi|585156628|ref|XM_006730572.1| PREDICTED: Leptonycho...\n", + " 15 1 gi|478537026|ref|XM_004442573.1| PREDICTED: Ceratother...\n", + " 16 1 gi|591345874|ref|XM_007098135.1| PREDICTED: Panthera t...\n", + " 17 1 gi|586977054|ref|XM_003981942.2| PREDICTED: Felis catu...\n", + " 18 1 gi|558177074|ref|XM_006100331.1| PREDICTED: Myotis luc...\n", + " 19 1 gi|545534433|ref|XM_533895.4| PREDICTED: Canis lupus f...\n", + " 20 1 gi|593748472|ref|XM_007129429.1| PREDICTED: Physeter c...\n", + " 21 1 gi|594668171|ref|XM_007182056.1| PREDICTED: Balaenopte...\n", + " 22 1 
gi|472358840|ref|XM_004398985.1| PREDICTED: Odobenus r...\n", + " 23 1 gi|554542571|ref|XM_005865845.1| PREDICTED: Myotis bra...\n", + " 24 1 gi|395850754|ref|XM_003797893.1| PREDICTED: Otolemur g...\n", + " 25 1 gi|511847827|ref|XM_004748233.1| PREDICTED: Mustela pu...\n", + " 26 1 gi|466046657|ref|XM_004277418.1| PREDICTED: Orcinus or...\n", + " 27 1 gi|470600673|ref|XM_004312273.1| PREDICTED: Tursiops t...\n", + " 28 1 gi|301771295|ref|XM_002920972.1| PREDICTED: Ailuropoda...\n", + " 29 1 gi|602717846|ref|XM_007469048.1| PREDICTED: Lipotes ve...\n", + " ~~~\n", + " 47 1 gi|545534427|ref|XM_005632789.1| PREDICTED: Canis lupu...\n", + " 48 1 gi|511847823|ref|XM_004748231.1| PREDICTED: Mustela pu...\n", + " 49 1 gi|511847825|ref|XM_004748232.1| PREDICTED: Mustela pu...\n" + ] + } + ], + "prompt_number": 30 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the function call is very similar to `SeqIO`. However, the returned `QueryResult` object is something we have not seen before. In general, this is a container object for all our results.\n", + "\n", + "The main principle of `Bio.SearchIO` objects is that for all sequence database searches, we have at least three layers of containers:\n", + "\n", + "1. The query itself (`QueryResult`).\n", + "2. All the database hits from the query (`Hit`).\n", + "3. All the locations in a database record where alignments are found (`HSP`, for high-scoring pair)." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# for every hit in the first 10 query result\n", + "for hit in qresult[:10]:\n", + " # for every hsp in hit\n", + " for hsp in hit:\n", + " # show the hit ID and how long the match spans the hit (ungapped)\n", + " print hit.id, hsp.hit_span" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "gi|308522777|ref|NM_005804.3| 1558\n", + "gi|397471065|ref|XM_003807080.1| 1540\n", + "gi|426387514|ref|XM_004060164.1| 1539\n", + "gi|395750601|ref|XM_002828787.2| 1536\n", + "gi|402904531|ref|XM_003915048.1| 1562\n", + "gi|544509259|ref|XM_005588244.1| 1537\n", + "gi|426387518|ref|XM_004060166.1| 1428\n", + "gi|635036575|ref|XM_007995524.1| 1537\n", + "gi|544509261|ref|XM_005588245.1| 1430\n", + "gi|403302190|ref|XM_003941697.1| 1539\n" + ] + } + ], + "prompt_number": 31 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And all these containers are indexable." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# get the Hit object with the ID 'gi|308522777|ref|NM_005804.3|'\n", + "qresult['gi|308522777|ref|NM_005804.3|']" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 32, + "text": [ + "Hit(id='gi|308522777|ref|NM_005804.3|', query_id='37723', 1 hsps)" + ] + } + ], + "prompt_number": 32 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# get the first Hit object\n", + "qresult[0]" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 33, + "text": [ + "Hit(id='gi|308522777|ref|NM_005804.3|', query_id='37723', 1 hsps)" + ] + } + ], + "prompt_number": 33 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# get the first HSP of the first Hit object\n", + "qresult[0][0]" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 34, + "text": [ + "HSP(hit_id='gi|308522777|ref|NM_005804.3|', query_id='37723', 1 fragments)" + ] + } + ], + "prompt_number": 34 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is more to these objects than can be shown here. You can consult the [official API documentation](http://biopython.org/DIST/docs/api/) for a full reference." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### <span class=\"exercise\">Exercise: Navigating through a BLAST result</span>\n", + "\n", + "From the last BLAST result we have, fetch the full sequences of the top 10 hits that are not predicted sequences." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a id=\"beyond\"></a>\n", + "## Beyond Biopython\n", + "\n", + "Biopython is not the only useful 3rd party Python library out there. 
There is still much ground it does not cover, which is covered by other modules. Some of the modules in the bioinformatics space you may find useful:\n", + "\n", + "- [pysam](http://wwwfgu.anat.ox.ac.uk/~andreas/documentation/samtools/api.html): A samtools wrapper for parsing and writing SAM/BAM alignment files.\n", + "- [PyVCF](http://pyvcf.readthedocs.org/en/latest/): For working with VCF files.\n", + "- [track](http://xapple.github.io/track/): For working with genome tracks (e.g. BED tracks).\n", + "- [pybedtools](http://pythonhosted.org/pybedtools/): A bedtools wrapper for working with genome tracks.\n", + "- [metaseq](http://pythonhosted.org/metaseq/): A framework for exploring genomic data.\n", + "- [GEMINI](http://gemini.readthedocs.org/): A flexible framework for exploring genome variation.\n", + "- ..." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from IPython.core.display import HTML\n", + "def custom_style():\n", + " style = open('styles/notebook.css', 'r').read()\n", + " return HTML('<style>' + style + '</style>')\n", + "def custom_script():\n", + " script = open('styles/notebook.js', 'r').read()\n", + " return HTML('<script>' + script + '</script>')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 35 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "custom_style()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "<style>/*\n", + " https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers\n", + "*/\n", + "@font-face {\n", + " font-family: \"Computer Modern\";\n", + " src: url('http://mirrors.ctan.org/fonts/cm-unicode/fonts/otf/cmunss.otf');\n", + "}\n", + "div.cell{\n", + " width:800px;\n", + " margin-left:16% !important;\n", + " margin-right:auto;\n", + "}\n", + "h1 {\n", + " font-family: Helvetica, serif;\n", + "}\n", + "h4{\n", + " margin-top:12px;\n", + " margin-bottom: 3px;\n", + " }\n", + 
"div.text_cell_render{\n", + " font-family: Computer Modern, \"Helvetica Neue\", Arial, Helvetica, Geneva, sans-serif;\n", + " line-height: 145%;\n", + " font-size: 130%;\n", + " width:800px;\n", + " margin-left:auto;\n", + " margin-right:auto;\n", + "}\n", + ".CodeMirror{\n", + " font-family: \"Source Code Pro\", source-code-pro,Consolas, monospace;\n", + "}\n", + ".prompt{\n", + " display: None;\n", + "}\n", + ".text_cell_render .exercise {\n", + " font-weight: 300;\n", + " /*font-size: 22pt;*/\n", + " color: #4057A1;\n", + " font-style: italic;\n", + " /*margin-bottom: .5em;\n", + " margin-top: 0.5em;\n", + " display: block;*/\n", + "}\n", + ".text_cell_render .example {\n", + " font-weight: 300;\n", + " color: #40A157;\n", + " font-style: italic;\n", + "}\n", + "\n", + ".warning{\n", + " color: rgb( 240, 20, 20 )\n", + "}\n", + "</style>" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 36, + "text": [ + "<IPython.core.display.HTML at 0x7fb0a0b7e790>" + ] + } + ], + "prompt_number": 36 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "custom_script()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "<script>// https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers\n", + "MathJax.Hub.Config({\n", + " TeX: {\n", + " extensions: [\"AMSmath.js\"]\n", + " },\n", + " tex2jax: {\n", + " inlineMath: [ ['$','$'], [\"\\\\(\",\"\\\\)\"] ],\n", + " displayMath: [ ['$$','$$'], [\"\\\\[\",\"\\\\]\"] ]\n", + " },\n", + " displayAlign: 'center', // Change this to 'center' to center equations.\n", + " \"HTML-CSS\": {\n", + " styles: {'.MathJax_Display': {\"margin\": 4}}\n", + " }\n", + " });\n", + "</script>" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 37, + "text": [ + "<IPython.core.display.HTML at 0x7fb0a0b7eb10>" + ] + } + ], + "prompt_number": 37 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/INSTALL.md 
b/INSTALL.md index e984346d5d9a7fd0d9ea71ed7f4dfa37117ba3cd..ec8efb66f9f271dcb48fb1a20ea151d53675903e 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -23,8 +23,8 @@ We need some system packages to be installed. For the following command, you need sudo rights: sudo apt-get install -y \ - curl python-qt4 libfreetype6-dev libpng12-dev python-cairo \ - python-gtk2 python-gtk2-dev git gfortran + curl gfortran git libblas-dev libfreetype6-dev liblapack-dev \ + libpng12-dev python-cairo python-gtk2 python-gtk2-dev python-qt4 From here on, everything is local for the current user. diff --git a/README.md b/README.md index 7777dd8cc41b5ca7dbb7f6d535c9474271b9906b..b7787252e9ecfc16d4c35059247df2e0251209fb 100644 --- a/README.md +++ b/README.md @@ -28,13 +28,26 @@ Materials The top-level directory contains materials for the following lessons: -1. Welcome (slides) [view](http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/welcome.ipynb) -2. Introduction to Python (slides) [view](http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/python.ipynb) -3. More Python Goodness (notebook) [view](http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/more-python.ipynb) -4. Working with NumPy arrays (slides) [view](http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/numpy.ipynb) -5. Plotting with matplotlib (slides) [view](http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/matplotlib.ipynb) -6. Object-oriented programming (slides) [view](http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/classes.ipynb) -7. A sip of Biopython (notebook) [view](http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/biopython.ipynb) +1. Welcome ([slides][lesson_01]) +2. Introduction to Python ([slides 1][lesson_02_01], [slides 2][lesson_02_02], + [slides 3][lesson_02_03]) +3. 
More Python Goodness ([notebook 1][lesson_03_01], [notebook 2][lesson_03_02]) +4. Working with NumPy arrays +5. IPython Notebook ([notebook][lesson_05]) +6. Plotting with matplotlib +7. Python for data analysis +8. Object-oriented programming +9. A sip of Biopython ([notebook 1][lesson_09_01], [notebook 2][lesson_09_02]) + +[lesson_01]: http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/01%20-%20Welcome.ipynb +[lesson_02_01]: http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/02%20-%20Introduction%20to%20Python%20(1).ipynb +[lesson_02_02]: http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/02%20-%20Introduction%20to%20Python%20(2).ipynb +[lesson_02_03]: http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/02%20-%20Introduction%20to%20Python%20(3).ipynb +[lesson_03_01]: http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/03%20-%20More%20Python%20goodness%20(1).ipynb +[lesson_03_02]: http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/03%20-%20More%20Python%20goodness%20(2).ipynb +[lesson_05]: http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/05%20-%20IPython%20Notebook.ipynb +[lesson_09_01]: http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/09%20-%20A%20sip%20of%20Biopython%20(1).ipynb +[lesson_09_02]: http://nbviewer.ipython.org/urls/git.lumc.nl/humgen/programming-course/raw/master/09%20-%20A%20sip%20of%20Biopython%20(2).ipynb As indicated, some of the lessons are slideshows, whereas others are just notebooks we scroll through during class. 
The links above are all one-page diff --git a/biopython.ipynb b/biopython.ipynb deleted file mode 100644 index a4bcc613c3dec585c3c91a658ee04a269f32de68..0000000000000000000000000000000000000000 --- a/biopython.ipynb +++ /dev/null @@ -1,1768 +0,0 @@ -{ - "metadata": { - "name": "" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# A sip of Biopython\n", - "***\n", - "\n", - "[Wibowo Arindrarto](mailto:w.arindrarto@lumc.nl), [Jeroen Laros](mailto:j.f.j.laros@lumc.nl), [Zuotian Tatum](mailto:z.tatum@lumc.nl), [Martijn Vermaat](mailto:m.vermaat.hg@lumc.nl)\n", - "\n", - "[Department of Human Genetics, Leiden University Medical Center](http://humgen.nl)\n", - "\n", - "[Sequencing Analysis Support Core, Leiden University Medical Center](http://sasc.lumc.nl)\n", - "\n", - "License: [Creative Commons Attribution 3.0 License (CC-by)](http://creativecommons.org/licenses/by/3.0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Table of contents\n", - "\n", - "1. [Prelude: Python packages and their installation](#packages)\n", - "2. [Biopython](#biopython)\n", - "3. [Working with sequences](#sequences)\n", - "4. [File I/O with Biopython](#files)\n", - "5. [Fetching from online resources: NCBI's Entrez](#entrez)\n", - "6. [Performing a remote BLAST search](#blast)\n", - "7. [Beyond Biopython](#beyond)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "<a id=\"packages\"></a>\n", - "## Prelude: Python packages and their installation\n", - "\n", - "### Python Package Index (PyPI)\n", - "\n", - "[The Python Package Index](https://pypi.python.org/) is *the* place to find 3rd-party Python libraries (and to upload your own too).\n", - "\n", - "Remember the `pip install biopython` (or `numpy`, etc). command you ran? 
That source package was stored in PyPI.\n", - "\n", - "Some of the Python packages developed at our department are also in there:\n", - "\n", - "- [kMer](https://pypi.python.org/pypi/kMer): Analysis toolkit and programming library for k-mer profiles.\n", - "- [TSSV](https://pypi.python.org/pypi/tssv): Targeted characterisation of short structural variation.\n", - "- [fastools](https://pypi.python.org/pypi/fastools): Various tools for the analysis and manipulation of FASTA/FASTQ files.\n", - "- [piletools](https://pypi.python.org/pypi/piletools): Various tools for the analysis of mpileup files.\n", - "- [barcode](https://pypi.python.org/pypi/barcode): For designing NGS barcodes.\n", - "- [wiggelen](http://wiggelen.readthedocs.org/): Working with Wiggle (WIG) tracks.\n", - "- [monoseq](https://monoseq.readthedocs.org/): Pretty-printing of DNA and protein sequences.\n", - "\n", - "For example, to install our library for working with Wiggle tracks:\n", - "\n", - " pip install wiggelen" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Python virtual environments\n", - "\n", - "[`virtualenv`](http://www.virtualenv.org/) (with its frontend [`virtualenvwrapper`](http://virtualenvwrapper.readthedocs.org/)) is a tool for managing isolated Python environments. Its benefits are:\n", - "\n", - "1. You can have specific versions of packages installed, per environment.\n", - "2. You can install packages as non-root user.\n", - "\n", - "We list some commands to manage your virtual environments. 
First, creating a new one:\n", - "\n", - " $ mkvirtualenv my-environment\n", - "\n", - "Activating an environment:\n", - "\n", - " $ workon my-environment\n", - "\n", - "Whenever you have an environment activated, it is indicated by prefixing your bash prompt with the name of the environment surrounded by brackets.\n", - "\n", - "Deactivating an environment:\n", - "\n", - " $ deactivate\n", - "\n", - "Normally, `pip install` will try to install a package system-wide, for which you'd need administrator permissions. Whenever you have a virtual environment activated, `pip install` will install the package in the virtual environment automatically." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "<a id=\"biopython\"></a>\n", - "## Biopython\n", - "\n", - "<img src=\"http://biopython.org/DIST/docs/tutorial/images/biopython.jpg\" />\n", - "\n", - "### About Biopython\n", - "\n", - "Biopython is one of the big libraries for working with bioinformatics-related data (but not the only one). It is *Open Source* and run by a team of developers from around the world under the OBF umbrella.\n", - "\n", - "Development started in 1998 and it is still actively maintained with a new release every 3-4 months. 
The library is mature and has had multiple publications (the whole library itself and sometimes its submodules).\n", - "\n", - "Biopython is compatible with Python 2.x and 3.x and tested on multiple operating systems and Python implementations.\n", - "\n", - "Some links with more information:\n", - "\n", - "- [Biopython homepage](http://biopython.org)\n", - "- [Git development repository](http://github.com/biopython/biopython)\n", - "- [Mailing list](http://lists.open-bio.org/pipermail/biopython/)\n", - "- [Biopython Tutorial and Cookbook](http://biopython.org/DIST/docs/tutorial/Tutorial.html)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Inside Biopython\n", - "\n", - "As one of the larger Python packages, these are some of the things provided by Biopython:\n", - "\n", - "- Rich objects representing various concepts (e.g., sequences, alignments, motifs).\n", - "\n", - "- File parsers and writers.\n", - " * Sequence files: fasta, fastq, genbank, abi, sff, etc.\n", - " * Alignment files: clustal, emboss, phylip, nexus, etc.\n", - " * Sequence search outputs: BLAST, HMMER, BLAT, etc.\n", - " * Phylogenetic trees: newick, nexus, phyloxml, etc.\n", - " * Sequence motifs: AlignAce, TRANSFAC, etc.\n", - " * Others: PDB files, etc.\n", - "\n", - "- Access to remote resources (e.g., Entrez, NCBI BLAST).\n", - "\n", - "- Application wrappers.\n", - "\n", - "- A simple graphing tool.\n", - "\n", - "- Simple algorithms (e.g., pairwise alignment, cluster analysis).\n", - "\n", - "- References such as codon tables and IUPAC sequences." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Characterizing Biopython\n", - "\n", - "Strengths:\n", - "\n", - "- Based on Python (readability, expressive constructs).\n", - "- Wide range of parsers with common interfaces.\n", - "- Access to online resources.\n", - "\n", - "Weaknesses:\n", - "\n", - "- Based on Python (?) 
(no error checks until runtime).\n", - "- Not always the fastest parsers." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "<a id=\"sequences\"></a>\n", - "## Working with sequences\n", - "\n", - "The `Seq` object is Biopython's main representation of nucleotide or protein sequences. It is essentially a string with alphabet information. Its constructor is available in the `Bio.Seq` module." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from Bio.Seq import Seq" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 8 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sequence alphabets\n", - "\n", - "Let's make our first `Seq` object." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_seq = Seq('GGGTACGATAAA')" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 51 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_seq" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 52, - "text": [ - "Seq('GGGTACGATAAA', Alphabet())" - ] - } - ], - "prompt_number": 52 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice the bit about the alphabet. Biopython never tries to guess what alphabet your sequence is in. You have to be explicit yourself (recall `import this`)." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from Bio.Alphabet import generic_dna" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 53 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_dna = Seq('GGGTACGATAAA', generic_dna)" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 55 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_dna" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 56, - "text": [ - "Seq('GGGTACGATAAA', DNAAlphabet())" - ] - } - ], - "prompt_number": 56 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`Seq` objects are almost the same as native Python `str` objects. They have similar methods and can be used with almost the same set of operators." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_dna.lower()" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 57, - "text": [ - "Seq('gggtacgataaa', DNAAlphabet())" - ] - } - ], - "prompt_number": 57 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_dna.endswith('N')" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 58, - "text": [ - "False" - ] - } - ], - "prompt_number": 58 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_dna + my_dna" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 66, - "text": [ - "Seq('GGGTACGATAAAGGGTACGATAAA', DNAAlphabet())" - ] - } - ], - "prompt_number": 66 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "str(my_dna)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - 
"output_type": "pyout", - "prompt_number": 67, - "text": [ - "'GGGTACGATAAA'" - ] - } - ], - "prompt_number": 67 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Operations on sequences\n", - "\n", - "The plus feature of the `Seq` type is that we get to use additional functions from 'molecular biology'." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_rna = my_dna.transcribe()" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 59 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_rna" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 60, - "text": [ - "Seq('GGGUACGAUAAA', RNAAlphabet())" - ] - } - ], - "prompt_number": 60 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we see that running `transcribe()` returns a new RNA sequence. Biopython uses alphabet information to determine whether a `Seq` member function can be invoked or not." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are more, of course." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_dna.complement()" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 61, - "text": [ - "Seq('CCCATGCTATTT', DNAAlphabet())" - ] - } - ], - "prompt_number": 61 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_dna.reverse_complement()" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 62, - "text": [ - "Seq('TTTATCGTACCC', DNAAlphabet())" - ] - } - ], - "prompt_number": 62 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_protein = my_dna.translate()" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 63 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_protein" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 64, - "text": [ - "Seq('GYDK', ExtendedIUPACProtein())" - ] - } - ], - "prompt_number": 64 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now try running `my_protein.transcribe()` in your interpreter. What happens?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### <span class=\"exercise\">Exercise: Six-frame translation</span>\n", - "\n", - "Write a function that takes a `Seq` object and prints all possible translation frames from it. For example, when using `CGATCGTAGCTGTAGCGCGATATATACTAGGG` as the input sequence, the output is (not necessarily in this order):\n", - "\n", - " RS*L*RDIY*\n", - " P*YISRYSYD\n", - " DRSCSAIYTR\n", - " PSIYRATATI\n", - " IVAVARYILG\n", - " LVYIALQLRS\n", - "\n", - "Additionally, try to see how to use an alternative translation table (hint: `Bio.Data` module)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Storing additional information\n", - "\n", - "`Seq` objects are good for storing the sequence itself. But where do we store metadata such as sequence ID or interesting regions in the sequence?\n", - "\n", - "For this, we use the `SeqRecord` object from the `Bio.SeqRecord` module. It is essentially a thin wrap around the `Seq` object that also stores sequence metadata." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from Bio.SeqRecord import SeqRecord" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 69 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "record = SeqRecord(my_dna)" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 87 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "record" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 88, - "text": [ - "SeqRecord(seq=Seq('GGGTACGATAAA', DNAAlphabet()), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])" - ] - } - ], - "prompt_number": 88 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print record" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "ID: <unknown id>\n", - "Name: <unknown name>\n", - "Description: <unknown description>\n", - "Number of features: 0\n", - "Seq('GGGTACGATAAA', DNAAlphabet())\n" - ] - } - ], - "prompt_number": 89 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We saw that `SeqRecord` stores a `Seq` object and other metadata such as:\n", - "\n", - "* `id`: Sequence ID.\n", - "* `name`: Sequence name, usually the same as `id`.\n", - "* `description`: Sequence description.\n", - "* `dbxrefs`: A list of database cross references.\n", - "\n", 
- "There is also other metadata not shown here such as:\n", - "\n", - "* `letter_annotations`: Annotation per sequence position.\n", - "\n", - "It's enough to supply only a `Seq` object when creating `SeqRecord`. However, as we saw, it's not that useful to have `<unknown id>` and `<unknown description>` as metadata. " - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "SeqRecord(my_dna, id='my precious', description='my precious sequence')" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 73, - "text": [ - "SeqRecord(seq=Seq('GGGTACGATAAA', DNAAlphabet()), id='my precious', name='<unknown name>', description='my precious sequence', dbxrefs=[])" - ] - } - ], - "prompt_number": 73 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "<a id=\"files\"></a>\n", - "## File I/O with Biopython\n", - "\n", - "As mentioned before, one of Biopython's strengths is its wide range of parsing support. Briefly:\n", - "\n", - "* Sequence files: `Bio.SeqIO`\n", - "* Alignment files: `Bio.AlignIO`\n", - "* Sequence search files: `Bio.Blast` (soon to be `Bio.SearchIO`)\n", - "* Phylogenetic trees: `Bio.Phylo`\n", - "* Sequence motifs: `Bio.motifs`\n", - "* Protein structures: `Bio.PDB`\n", - "\n", - "These parsers may not be the fastest available, but they provide a common interface of objects within their domain.\n", - "\n", - "As an example, let's take a look at parsing sequence files." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reading sequence files with `SeqIO`\n", - "\n", - "The main sequence input/output functions are all contained in the `Bio.SeqIO` module. Two of the most commonly used are `Bio.SeqIO.read` and `Bio.SeqIO.parse`. They provide the same functionality, except that the former is for files containing a single sequence and the latter is for files containing multiple sequences." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `SeqIO.read`\n", - "\n", - "A simple function call that takes as input:\n", - "\n", - "1. The file name *or* a file handle object pointing to your sequence.\n", - "2. The sequence file format name.\n", - "\n", - "It returns a single `SeqRecord` object containing our sequence of interest." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from Bio import SeqIO" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 4 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fasta_record = SeqIO.read('data/simple.fa', 'fasta')" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 153 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fasta_record" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 154, - "text": [ - "SeqRecord(seq=Seq('MGIKQYSQEELKEMALVEIAHELFEEHKKPVPFQELLNEIASLLGVKKEELGDR...EIK', SingleLetterAlphabet()), id='sp|P12464|RPOE_BACSU', name='sp|P12464|RPOE_BACSU', description='sp|P12464|RPOE_BACSU DNA-directed RNA polymerase subunit delta OS=Bacillus subtilis (strain 168) GN=rpoE PE=1 SV=1', dbxrefs=[])" - ] - } - ], - "prompt_number": 154 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print fasta_record" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "ID: sp|P12464|RPOE_BACSU\n", - "Name: sp|P12464|RPOE_BACSU\n", - "Description: sp|P12464|RPOE_BACSU DNA-directed RNA polymerase subunit delta OS=Bacillus subtilis (strain 168) GN=rpoE PE=1 SV=1\n", - "Number of features: 0\n", - "Seq('MGIKQYSQEELKEMALVEIAHELFEEHKKPVPFQELLNEIASLLGVKKEELGDR...EIK', SingleLetterAlphabet())\n" - ] - } - ], - "prompt_number": 155 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The nice 
thing about this method, is that it abstracts over all the different file formats. If we want to parse another file which is a FASTQ instead of a FASTA file, we simply need to change the file name and the format name." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fastq_record = SeqIO.read('data/easy.fastq', 'fastq')" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 156 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fastq_record" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 157, - "text": [ - "SeqRecord(seq=Seq('CCGCGACCTCTGTTCTGCAGCCCCTTCCCTTCCCCGCCTCCTGCTCTGCCGGGA...CCA', SingleLetterAlphabet()), id='HWI-ST1019:196:D121WACXX:5:1101:1538:2300/1', name='HWI-ST1019:196:D121WACXX:5:1101:1538:2300/1', description='HWI-ST1019:196:D121WACXX:5:1101:1538:2300/1', dbxrefs=[])" - ] - } - ], - "prompt_number": 157 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print fastq_record" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "ID: HWI-ST1019:196:D121WACXX:5:1101:1538:2300/1\n", - "Name: HWI-ST1019:196:D121WACXX:5:1101:1538:2300/1\n", - "Description: HWI-ST1019:196:D121WACXX:5:1101:1538:2300/1\n", - "Number of features: 0\n", - "Per letter annotation for: phred_quality\n", - "Seq('CCGCGACCTCTGTTCTGCAGCCCCTTCCCTTCCCCGCCTCCTGCTCTGCCGGGA...CCA', SingleLetterAlphabet())\n" - ] - } - ], - "prompt_number": 158 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Of course, since the FASTQ file has additional quality information, we can now access it. In this case, the qualities are stored in `letter_annotations`." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print fastq_record.letter_annotations" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "{'phred_quality': [16, 10, 10, 10, 19, 8, 27, 31, 27, 32, 27, 10, 17, 32, 24, 32, 17, 10, 10, 25, 18, 34, 23, 25, 8, 16, 30, 33, 35, 33, 35, 33, 33, 35, 34, 31, 25, 25, 21, 31, 7, 13, 23, 13, 13, 22, 8, 22, 22, 22, 25, 27, 30, 31, 31, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]}\n" - ] - } - ], - "prompt_number": 159 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Even more, `SeqIO` has support not only for plain text formats but also binary formats. Here's an example of `SeqIO` reading a Sanger sequencing trace file. Note the additional information (e.g. sequencing well) stored in the file is also parsed." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "abi_record = SeqIO.read('data/sanger.ab1', 'abi')" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 160 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print abi_record" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "ID: 226032_C-ME-18_pCAGseqF\n", - "Name: sanger\n", - "Number of features: 0\n", - "/polymer=POP7 \n", - "/run_finish=2009-12-12 11:44:49\n", - "/sample_well=B9\n", - "/run_start=2009-12-12 09:56:53\n", - "/machine_model=3730\n", - "/dye=Z-BigDyeV3\n", - "Per letter annotation for: phred_quality\n", - "Seq('GGGCGAGCKYYAYATTTTGGCAAGAATTGAGCTCTATGGCCACAACCATGGTGA...TTC', IUPACAmbiguousDNA())\n" - ] - } - ], - "prompt_number": 161 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Head over to the [official Biopython API 
documentation](http://biopython.org/DIST/docs/api/) for a complete list of supported formats." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `SeqIO.parse`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`SeqIO.parse` is similar to `SeqIO.read`, but instead of returning a single `SeqRecord` object, we can iterate over the return value to get consecutive `SeqRecord` objects.\n", - "\n", - "This is similar to how we iterated over a filehandle to get consecutive lines:\n", - "\n", - " fh = open('my_file')\n", - " for line in fh:\n", - " print line\n", - " fh.close()\n", - "\n", - "However, instead of returning each line of the file, `SeqIO.parse` returns a single `SeqRecord` object per iteration and we do not need to call `close` afterwards because it is handled automatically by Biopython.\n", - "\n", - " for record in SeqIO.parse('my_sequence.fa', 'fasta'):\n", - " print record" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for record in SeqIO.parse('data/parse.fastq', 'fastq'):\n", - " print record.seq" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "CCGCGACCTCTGTTCTGCAGCCCCTTCCCTTCCCCGCCTCCTGCTCTGCCGGGACTACGCACCGGCCTGATTGGTTACCCCCGGGGTGTCCTCGGTCACCA\n", - "CCGCGACCTCTGTTCTGCAGCCCCTTCCCTTCCCCGCCTCCTGCTCTGCCGGGACTACGCACCGGCCTGATTGGTTACCCCCGGGGTGTCCTCGGTCACCA\n" - ] - } - ], - "prompt_number": 163 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The advantage of `SeqIO.parse` is that it allows you to handle large, multi-sequence files gracefully. The file size may well exceed your memory, since it allows us to process the records one by one.\n", - "\n", - "`SeqIO.parse` supports the same set of formats that `SeqIO.read` supports, one example is shown below." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for record in SeqIO.parse('data/roche.sff', 'sff'):\n", - " print record.id, len(record), record.seq[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "E3MFGYR02JWQ7T 265 tcagGGTCTA\n", - "E3MFGYR02JA6IL 271 tcagTTTTTT\n", - "E3MFGYR02JHD4H 310 tcagAAAGAC\n", - "E3MFGYR02GFKUC 299 tcagCGGCCG\n", - "E3MFGYR02FTGED 281 tcagTGGTAA\n", - "E3MFGYR02FR9G7 261 tcagCTCCGT\n", - "E3MFGYR02GAZMS 278 tcagAAAGAA\n", - "E3MFGYR02HHZ8O 221 tcagACTTTC\n", - "E3MFGYR02GPGB1 269 tcagAAGCAG\n", - "E3MFGYR02F7Z7G 219 tcagAATCAT\n" - ] - } - ], - "prompt_number": 164 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### <span class=\"exercise\">Exercise: Trimming sequences from a mangled FASTQ file</span>\n", - "\n", - "* From the interleaved FASTQ file `data/mangled.fq`, print all sequences of the first read pairs after trimming off their first five nucleotides.\n", - "* **Hint:** first pair and second pair records are marked with `/1` and `/2`, respectively." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### <span class=\"example\">Example: A poor man's FastQC</span>\n", - "\n", - "Those of you working with NGS data probably know the [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) tool. It summarizes the quality of your raw sequencing reads over different metrics.\n", - "\n", - "<img src=\"files/images/fastqc.png\">\n", - "\n", - "Let's try to mimic the plot showing the base quality score distributions over the read length." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pylab inline" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "Populating the interactive namespace from numpy and matplotlib\n" - ] - } - ], - "prompt_number": 3 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "qualities = array([r.letter_annotations['phred_quality'] for r in\n", - " SeqIO.parse('data/mangled.fq', 'fastq')])" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 5 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We now have a (20, 36) shaped array of quality scores (only 20 sequences of 36 bases each). Conveniently, we can pass this directly to the matplotlib `boxplot` function.\n", - "\n", - "While we're at it, let's also draw some visual indication that we consider everything below 60 as suboptimal (using `axhspan`) and add proper labels." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "figsize(10, 6)\n", - "axhspan(60, 100, facecolor='green', alpha=0.1, lw=0)\n", - "axhspan(0, 60, facecolor='red', alpha=0.1, lw=0)\n", - "boxplot(qualities, sym='')\n", - "ylim(qualities.min() // 10 * 10, qualities.max() // 10 * 10 + 10)\n", - "xlabel('Read position')\n", - "ylabel('Phred-based quality score')\n", - "title('Base quality score distributions over the read length');" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "display_data", - "png": 
"iVBORw0KGgoAAAANSUhEUgAAAloAAAGJCAYAAABSGZ32AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3XlcVOX+B/DPAC6pIIvIKmIaJquouZALhqB1I5fcDXEt\n9ZfXwlcueS28mqKGXfXeSk1c6lpXW1TcMlPcUxNRy11xQ8QFWQQUgef3h5e5gHNmzgycWeDzfr18\nxTDzPc/3nHmY+fac8zxHJYQQICIiIqIqZ2XqBIiIiIiqKxZaRERERAphoUVERESkEBZaRERERAph\noUVERESkEBZaRERERAphoUWkoNDQUKxcuRIA8O9//xs9e/Y0cUbG4+3tjd27dwMA5s6di7Fjx1bZ\ntm1tbXH16lUAwIgRIzBz5swq2/b48eMxZ86cKtuepSnbZ82dlZUVrly5ovE5U+6Htryo5mGhRSbl\n7e2NevXqwdbWFo6Ojnj99ddx8+ZNU6dVZVQqFVQqFQBg2LBh+Pnnn9XPVfcP49L9BoAPP/wQK1as\n0Bkj98sxNzcX3t7e6nbKtqWP1atXo0uXLuV+98UXX+Bvf/ubQduzNLGxsYiKiir3u8ocT3NirP2w\npMKUTIOFFpmUSqXCli1bkJubi/T0dLi4uGDixImmTstozHW94KKiIpO0q+uLUSovcz2O5qS4uNik\n7ZuqTymtOhSlpCwWWmQ26tSpgzfffBNnzpxR/27r1q0IDg5Gw4YN4eXlhVmzZqmfe/ToEd566y00\natQIDg4OaN++Pe7cuQMAyM7OxujRo+Hu7g5PT0/MnDkTJSUlGtstKCjAiBEj4OjoCD8/PyxcuBBN\nmjRRP19x5KnsqaoHDx7g9ddfR+PGjeHo6IjIyEikpaVpbKfs6EnXrl0BAEFBQbCzs8P69esREBCA\nLVu2qF//5MkTNGrUCCdPnnxmW/fu3cPrr78OBwcHODk5oWvXrupi48aNG+jXrx8aN26MRo0aqQvX\nkpISzJkzB97e3nBxcUF0dDRycnIAAFevXoWVlRUSEhLQtGlT9OjRAwCQkJAAX19fODo6olevXrh+\n/brGfQOAr7/+Gk2bNkWjRo0wd+7ccs+VHTmRet9mzJiB/fv3491334WtrS3++te/qo//559/jhde\neAEtW7bU+J7cu3cPERERsLOzQ2hoqDrP0v0q+96XjkCcO3cO48aNw+HDh9UjqhXfXwBYsWIFXnjh\nBTg5OaF3795IT09XP2dlZYVly5bBx8cHDg4OePfdd9XPXbp0Cd26dYO9vT2cnZ0xePBgyWO3efNm\n+Pn5wcHBAd27d8e5c+cAAPPnz8eAAQPKvXbSpEmYNGkSAO39fPXq1Xj55ZcRExODRo0alfvbAYAd\nO3Zg3rx5+M9//gNbW1sEBwern7t69So6d+4MOzs79OzZE/fv31c/99tvvyEkJAQODg5o3bo19u7d\nK7lf3t7eWLBgAQIDA2Fra4uSkhKt8atWrYKvry/s7OzQvHlzLF++vNz2Fi5cqN7XhIQEyXY10daX\ntb2PJSUlmDx5MpydnfH888/jn//8J6ysrFBcXCzZZwHgl19+0bg9qoEEkQl5e3uLXbt2CSGEyMvL\nE8OHDxfR0dHq55OSksQff/whhBDi1KlTwsXFRWzcuFEIIcSXX34pIiMjRUFBgSgpKRHJyckiJydH\nCCFEnz59xLhx40R+fr64c+eOaN++vVi2bJnGHKZOnSq6du0qHjx4IG7cuCH8/PxEkyZN1M+rVCpx\n+fJl9eMRI0aImTNnCiGEuH//vvjxxx9FQUGByM3NFQMGDBB9+vRRvzY0NFSsXLlSCCHEqlWrROfO\nnSW3u2DBAjFo0CD1440bN4rAwECNOU+bNk2MGzdOFBUViaKiInHgwAEhhBBFRUUiMDBQxMTEiPz8\nfPHo0SNx8OBBIYQQK1euFC1atBCpqani4cOHol+/fiIqKkoII
URqaqpQqVQiOjpa5Ofni4KCArFx\n40bRokULce7cOVFcXCzmzJkjQkJCNObz559/igYNGoj9+/eLx48fi5iYGGFjYyN+/fVXIYQQsbGx\n6ra0vW9lj1fZ4xQRESEePHggHj169Myxi46OFra2tuq2J02apD7OpftVXFys8T1ZvXp1ufdEiPLv\n76+//ioaNWokTpw4IR4/fiwmTpwounbtWi63yMhIkZ2dLa5fvy6cnZ3Fzz//LIQQYvDgwWLu3LlC\nCCEeP36sfh8qOn/+vKhfv77YtWuXKCoqEgsWLBAtWrQQT548EVevXhX16tUTubm56vfXzc1NHDly\nRAihvZ+vWrVK2NjYiH/+85+iuLhYFBQUPNN22felVLdu3UTz5s3FxYsXRUFBgQgNDRXTpk0TQghx\n8+ZN4eTkJLZv3y6EEOKXX34RTk5O4u7duxr3rWnTpiI4OFjcvHlTPHr0SDL+3r17Qgghtm7dKq5c\nuSKEEGLv3r2iXr16Ijk5WQghxPbt24WLi4v4888/RV5enhgyZMgzf0NllX2fdfVlTe/jjh07hBBC\nfPHFF8LX11ekpaWJBw8eiLCwMGFlZaXuU1J9Vmp7VPOw0CKTatq0qWjQoIGwt7cXtWrVEh4eHuL0\n6dOSr580aZJ4//33hRBCJCQkiJCQEHHq1Klyr7l9+7aoU6dOuS+WdevWie7du2vc5vPPP6/+chRC\niOXLlwtPT0/1Y02F1t/+9jeN2zpx4oRwcHBQP9an0EpLSxMNGjRQf6m++eabYuHChRrb+eijj0Tv\n3r3FpUuXyv3+0KFDwtnZuVxhUeqVV14RX3zxhfrx+fPnRa1atURxcbG6IElNTVU/36tXr3JfIMXF\nxaJevXri+vXrz2x71qxZYsiQIerHeXl5onbt2upC6+OPPxZvvfWWEEL6fRPi6fH66quvyv1OpVKJ\nPXv2PPO7soVW2bYfPnworK2txc2bN3UWWhXfEyHKF1qjRo0SU6dOLbftWrVqiWvXrqnzKFtADRw4\nUMyfP18IIcTw4cPF22+/LW7evPnMfpb197//vVyBXVJSIjw8PMTevXuFEEJ07txZrF27VgghxM6d\nO0Xz5s2FELr7+apVq4SXl5fWtsu+L2WPzyeffKJ+/Pnnn4tevXoJIYSIi4t7pjDr2bOnWLNmjcbt\ne3t7i1WrVqkf6xvfp08fsXjxYiGEECNHjhTTp09XP3fhwgXZhZauvqztfezevbtYvny5+rldu3aV\n61NSfbbi9uLi4jTmSdUfTx2SSalUKmzatAkPHjzA48ePsXTpUnTr1g0ZGRkAgCNHjqB79+5o3Lgx\n7O3tsWzZMvVpjKioKPTs2RODBw+Gh4cHpk6diqKiIly7dg1PnjyBm5sbHBwc4ODggHHjxuHu3bsa\nc7h161a5U4VeXl6y88/Pz8c777wDb29vNGzYEN26dUN2drZB1wy5u7vj5Zdfxvfff4+srCzs2LED\nw4YN0/jaDz74AC1atEBERASaN2+O+fPnA3h62rBp06awsnr2Tzs9PR1NmzZVP/by8kJRUZH6WAMo\ndxyuXbuGSZMmqY+hk5MTAGg8NZqeng5PT0/143r16qlfX5HU+1ZK0zUvZfOqSKVSlWu7fv36cHR0\nxK1btyRj5Kp4zOrXrw8nJ6dyx8DV1VX9c7169ZCbmwsAWLBgAYQQaN++Pfz9/bFq1SrJNsr2OZVK\nhSZNmqjbGDp0KL799lsAwLp169R9Qk4/13bctCm7T8899xwePnyobnPDhg3q9hwcHHDw4EHcvn1b\nclsV+5S2+O3bt6Njx45wcnKCg4MDtm3bpv57T09PN/jvVE5frvg+lu5zxXbL9rVSmvqs1Pao5mGh\nRWZDpVKhb9++sLa2xsGDBwE8/ZLp06cPbt68iaysLIwbN059DYqNjQ0++ugj/Pnnnzh06BC2bNmC\ntWvXwsvLC3Xq1MH9+/fx4
MEDPHjwANnZ2Th9+rTGdt3c3Mpdr1HxOqR69eohPz9f/Tg9PV39wRof\nH48LFy7g6NGjyM7Oxt69eyGejhQbdAyio6PxzTffYMOGDQgJCYGbm5vG1zVo0ACffvopLl++jM2b\nN2PRokXYvXs3vLy8cP36dY0XPru7u6uXRCjdTxsbG7i4uKh/V/YLw8vLC8uXL1cfwwcPHiAvLw8d\nO3Z8Zttubm64ceOG+nF+fn6563rKknrfKrZflrYLjoUQ5dp++PAhMjMz4e7ujvr166vzKVW2KNB1\nIXPFY5aXl4f79+/Dw8NDaxwAuLi4YPny5UhLS8OyZcswYcIEjbNM3d3dce3atWf2p7SN/v37Iykp\nCWlpadi4cSOGDh0K4GkBo6uf69o/TQW5Nl5eXoiKiirXJ3JzczFlyhTJmIp9Sir+8ePHePPNNzFl\nyhTcuXMHDx48wGuvvab+W9L1d6orb7l9uaKKfbvszxX3j0gTFlpkcqUfpEII9ehWq1atADz90nRw\ncEDt2rVx9OhRrFu3Tv3BlpSUhNOnT6O4uBi2traoVasWrK2t4erqioiICMTExCA3NxclJSW4fPky\n9u3bp7H9gQMHYt68ecjKysLNmzexdOnSch+erVu3xr///W8UFxdjx44d5bbz8OFDPPfcc2jYsCEy\nMzOfueBYGxcXF1y+fLnc7/r27Yvk5GQsWbIEw4cPl4zdunUrLl26BCEE7OzsYG1tDWtra7Rv3x5u\nbm6YNm0a8vPz8ejRIxw6dAgAMGTIEHz22We4evUqHj58iA8//BCDBw+W/LIdN24c5s6dq56ckJ2d\njQ0bNmh8bf/+/bFlyxYcPHgQhYWF+OijjyQnH0i9b1LHRI5t27ap2545cyY6deoEDw8PODs7w8PD\nA19//TWKi4uRkJBQbvsuLi64efMmnjx5ov5d2UJ5yJAhWLVqFU6ePInHjx/jww8/RMeOHSVHU8oW\n2Bs2bFAvVWJvbw+VSqXxWA8cOBBbt27F7t278eTJE8THx6Nu3boICQkBADg7OyM0NBQjRozA888/\nr54Q4Obmplc/18TFxQVXr1595n8MpP5H4a233kJiYiJ27tyJ4uJiPHr0SF0EyqEtvrCwEIWFhWjU\nqBGsrKywfft27Ny5s9xxWr16Nc6ePYv8/Hy9/tb06cul+196DAYOHIjFixfj1q1byMrKwvz588t9\nPsjps4b+jxdVDyy0yOQiIyNha2uLhg0bYubMmVi7dq260Pr888/x0Ucfwc7ODrNnz8agQYPUcbdv\n38aAAQPQsGFD+Pr6IjQ0VD2zbe3atSgsLFTPMhowYIDk6Y2PP/4YTZs2RbNmzdCrVy8MHz683Afj\n4sWLkZiYCAcHB6xbtw59+/ZVP/fee++hoKAAjRo1QkhICF599VWtozJln4uNjUV0dDQcHBzw/fff\nAwDq1q2Lfv364erVq+jXr5/kMbt48SLCw8Nha2uLkJAQ/N///R+6desGKysrJCYm4tKlS/Dy8kKT\nJk2wfv16AMCoUaMQFRWFrl274vnnn0e9evWwdOnScvmV1adPH0ydOhWDBw9Gw4YNERAQUG4dsLJ8\nfX3xr3/9C0OHDoW7uzscHR3LnW4pu+/a3rdJkybh+++/h6OjI9577z3J/S+bq0qlwrBhwzBr1iw4\nOTnhxIkT+Oabb9TPr1ixAgsXLkSjRo1w5swZvPzyy+rnwsLC4OfnB1dXVzRu3PiZXMPCwjB79my8\n+eabcHd3R2pqKr777jvJY1Y29vfff0fHjh1ha2uL3r17Y8mSJeq1v8ry8fHBN998g4kTJ8LZ2Rlb\nt25FYmIibGxs1K8ZOnQofv31V/VoVilt/VzOOlKlMxqdnJzQrl07yeNb+tjT0xObNm3C3Llz0bhx\nY3h5eSE+Pl6yqK5IKl4IAVtbWyxZsgQDBw6Eo6Mjvv32W/Tu3Vsd26tXL7z33nt45ZVX4OP
jg7Cw\nMNmjSbr6srb3cezYsYiIiEBgYCDatm2Lv/zlL7C2tlYXzXL6bHVZm4wMoxIKltrnz58vN6X5ypUr\nmD17Nt566y0MGjQI165dg7e3N9avXw97e3ul0iDSS1JSEqKiop45RWAss2fPxsWLF9Wn04jIfGzf\nvh3jx48vd0qZSBtFR7RatmyJEydO4MSJEzh+/Djq1auHvn37Ii4uDuHh4bhw4QLCwsIQFxenZBpE\nFiMzMxMJCQl4++23TZ0KEeHpum/btm1DUVER0tLSMGvWLK2jzUQVGe3U4a5du9CiRQs0adIEmzdv\nRnR0NICnF/9u3LjRWGkQyWKKYf4VK1bAy8sLr776Kjp37mz09onoWUIIxMbGwtHREW3atIGfnx/+\n/ve/mzotsiCKnjosa9SoUWjXrh0mTJgABwcHPHjwAMDTTuzo6Kh+TERERFRdGKXQKiwshIeHB86c\nOQNnZ+dyhRYAODo6IjMzU+k0iIiIiIzKRvdLKm/79u1o27YtnJ2dATydDnv79m24uroiPT1dPdun\nLL8AP5z548wzvyciIiIyN0FBQUhJSXnm90YptL799lsMGTJE/fiNN97AmjVrMHXqVKxZswZ9+vR5\nJubMH2eQliO9Nkv83HhM/nCyQflUJpZts222zbbZNttm22y7Ig87zQsZK34xfF5eHnbt2lVulsa0\nadPUdzbfvXs3pk2bpnQaREREREan+IhW/fr1ce/evXK/c3R0xK5du5RumoiIiMikrGNjY2NNnYQm\ns2bNwuTp2of3mjQ17IaplY1l22ybbbNtts222TbbLmvRvEXQVFIZbXkHfalUKq3XaBERERGZCw87\nD433teS9DomIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiIiIgU\nwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiI\niIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEs\ntIiIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiI\nSCEstIiIiIgUonihlZWVhf79+6NVq1bw9fXFb7/9htjYWHh6eiI4OBjBwcHYsWOH0mkQERERGZ1K\nCCGUbCA6OhrdunXDqFGjUFRUhLy8PPzjH/+Ara0tYmJipBNTqZCWk6ZkakRERERVwsPOA5pKKhsl\nG83Ozsb+/fuxZs2ap43Z2KBhw4YAoDEZIiIioupE0VOHqampcHZ2xsiRI9GmTRuMHTsW+fn5AICl\nS5ciKCgIo0ePRlZWlpJpEBEREZmEooVWUVERkpOTMWHCBCQnJ6N+/fqIi4vDhAkTkJqaipSUFLi5\nuWHy5MlKpkFERERkEoqeOvT09ISnpydeeuklAED//v0RFxcHZ2dn9WvGjBmDyMhIjfHxc+PVP3fq\n0gkhXUIMysPdzkPna24pdD2Yn5crsrKk61l7+xL8ef225PO6cteWd2Xb1hZfmVil266sqmjbQ8P7\nZinXHFbMXZ+8Dd1vU/a1UqbYb7bNto3ddmVwv8u3fWj/IRzef1hnvOIXw3ft2hVfffUVfHx8EBsb\ni4KCArz//vtwdXUFAHz22Wc4duwY1q1bVz6xKrwY3sPOHWk5twx+3pRta3teyW1bctuVVZVtK52r\nuarK46R0XyMiqgpSF8MrvrzD0qVLMWzYMAQFBeHUqVOYPn06pkyZgsDAQAQFBWHv3r347LPPlE6D\niPQUP9fW1CmYRGX2u7LHjG2zbWO1XRncb/0oPqJlKI5o6X7enEeVOKJl2OvNSWVyt+QRLWPuN9tm\n26ZquzK431LPm2B5ByJDCagAO6nngFuwjOudiIioZmO
hRWZJBaF9hAOWOUpEREQ1CwstGUw5A470\nV1Wz0KqqbQ87d9ltW2pfq+x+ExFVVyy0ZMjKstI6ukLmRdv7BSj7nlW2bUvta6Y85kRE5sxiCq3K\nrCcFaP+gt7cv0RpbmeuFtMXKiQekc69M3nLa5n5XbazSbVd2vbiK8QIol4u2WEs+5tpG4wwZgazs\nCKbctnW9X4D+6+yxbfN9v005Wm7J+/2/GP3X8Kqqti2m0KrMgqKVnaFQmbYrxuo7Y6IyuVe27cpc\nJ6UtVk58Ve53ZeP1OW6V3e+q7Gv6MuX7bcq2KzOKaMoRTFPud01t25TvN9s2rO1SpYWVPp/nVdW2\nxRRaZBqGjioR6Yt9jYiqIxZaJKliJW/Ja0KReWNfI6LqioUWEdVoNXXNNlPud0095qZk6mtuazIW\nWkRUo9XUNdtMud819Zibkimvg6zpWGgZWcy03BrZtiXjcSMiskyVmfFYVVhoGdnkD033pW3Kti0Z\njxsRkWUyh7UJpZfPJqqAIztkLOxrRFRdcETLSCoulgbIWzDNnNrWd2SnMovEmgNDFrh7Gme5+20u\nSyzUlL5W2YvCLXW/TYkX4htfTT/mLLSMxFhFlbm0XR2m6xty3Cx5vy01d0vNG6jcBcqWso/mhhfi\nG19NP+YstIioSnB0hYjoWSy0iKjSLHlUiYhISSy0iIgMxIUcaxZTvt81ta9Vh/1moUWyxc+15VIH\nZBSW0te4kGPNYsr3u6b2teqw31zegWRbFGdrcGxNna5vyfttytzZ10gbDzt3yX+8HpDMDUe0yCgs\nYXRCCZa835aau6XmTfLwekCyNCy0SKey60l52Jl2qQpLYcp10yrL0PXDyjJ0VIl9zbjMZd20msLU\n60nVxPfb1MccYKFFMvDLTn+WfMyqIndDR5Us+bhZGo4MGZ8p15Oqqe+tOazhxUKLiGq8mvh/+kRk\nHCy0iKhG48gOESmJsw7JKOLnGj6LjEgf7Gs1C2eZklIEVHC385D8J6CStR0WWmQUlZmuT6QP9rWa\nhbNMSSkqCNzKSZP8p4KQtR0WWkRU5TiqZHk4MkSkDBZaRFTlOKpkeTgyRKQMXgxPRFWm4jpYgPGW\nbKiqtjmyQ0RViYUWEVUZU66DVVVtc2SHiKoSCy0yCo4SEFUvUmuPAcZdf8xSbkBeFczlmJN+WGiR\nUdSUD0KimsCc1h5bFFczCi1zOuakH8Uvhs/KykL//v3RqlUr+Pr64siRI8jMzER4eDh8fHwQERGB\nrKwspdMgIiItOFOUSBmKF1qTJk3Ca6+9hrNnz+LUqVN48cUXERcXh/DwcFy4cAFhYWGIi4tTOg0i\nItKCM0WJlKFooZWdnY39+/dj1KhRAAAbGxs0bNgQmzdvRnR0NAAgOjoaGzduVDINIiLZOLJDRFVJ\n0UIrNTUVzs7OGDlyJNq0aYOxY8ciLy8PGRkZcHFxAQC4uLggIyNDyTSIiGTjyA4RVSVFC62ioiIk\nJydjwoQJSE5ORv369Z85TahSqaBSybtfEFkujhIQVV+mnFVcU2c019T9tkSKzjr09PSEp6cnXnrp\nJQBA//79MW/ePLi6uuL27dtwdXVFeno6GjdurDF+eWy8+ufQTp0QGhKiZLqkAJXH/xaRXPTfGluk\nmW6tJSI53PX4DtM25d7BvkTntqTi5cRWhbJ/ox52hv19xk/MBfTMtbL7XZnPFm3b19W2ks/r834b\ncswrw5T7bb5tJ2F5bKL2DUDhQsvV1RVNmjTBhQsX4OPjg127dsHPzw9+fn5Ys2YNpk6dijVr1qBP\nnz4a42MnT1YyPTICFlVUnYm08tPrVR7uz/xOyfiqYIq/0arYb362kOmFInayj/rRrEWLNL5K8XW0\nli5dimHDhqGwsBDNmzfHqlWrUFxcjIEDB2LlypXw9vbG+vXrlU6DiIiILJDKQ/uosblTvNAKCgrC\nsWPHnvn9rl27lG6
aiEhvH8fw2hcic2EOo76VxZXhiYjKiJ3MQouoOpEaETPWaJjiC5YSEVH1FhvP\nWcXGxmMuj0i7Ve5f2d9l/nnbKDmw0CIiqiKVPe1oqactZy2q3Je+pe63KVX2mJPxsNAiIqoilT3t\nWFNPW9bU/aaagddoERGh/LpMgH7LB1SM1TeeSK6yfU313x+N0dfMZeafJY5+stAiIkLlvqxYVJGx\nWOq6Z1XFEkc/WWgREREZialnwNV0phgRY6FFRESVYomnc0zBnEaGaipTjIjxYngiIjNhqVP2Kz0J\nwEL3m0gOFlpERGaipk7Zr6n7TTUDCy0iIiKSzZSnii1x9JOFFhEREclmypl/ljj6yUKLiIhMQuXh\n8d91oVQa1yKr7jiJwPhMMSLGQouIiCrF0C8vkZZW7l9NY4lrQlk6U4yIsdAiIjITljrCYYmnc4iM\nRVahdfXqVezatQsAkJ+fj5ycHEWTIiKqiTjCQVT96Cy0li9fjgEDBuCdd94BANy8eRN9+/ZVPDEi\nIiIyP6ac+WeJo746C61//etfOHDgAOzs7AAAPj4+uHPnjuKJERERKUHl4a7xH2+DI48pTxVb4qiv\nzkKrTp06qFOnjvpxUVERVCqVokkREREpQaTdKvev7O8y/7xt1FwscU0oS2eKETGdhVa3bt3wySef\nID8/H7/88gsGDBiAyMhIY+RGREQWwBJP55gDTiIwPrO81+H8+fPh7OyMgIAALFu2DK+99hrmzJlj\njNyIiGoUSx3hsMTTOUTGYqPtyaKiIvj7++PcuXN4++23jZUTEVGNUnaxzlmLUCPXlCKqrrSOaNnY\n2KBly5a4du2asfIhIqpxavrCnWRZeK9D/Wgd0QKAzMxM+Pn5oX379qhfvz4AQKVSYfPmzYonR0RE\npCReX6Y/U9/r0NJOVesstGbPng0A6pmGQgjOOiQiomrBlF/aLPKMLzbe+IWazovhQ0ND8eKLLyIn\nJwe5ubnw9fVFt27djJEbERFZAEs8nWMOLG1kpjLMZe0ys7zX4fr169GhQwds2LAB69evR/v27bFh\nwwZj5EZERBaAyxSQNua0dpkp6Dx1OGfOHBw7dgyNGzcGANy9exdhYWEYMGCA4skRERERWTKdI1pC\nCDg7O6sfOzk5QQihaFJERERknnivQ/3oHNHq1asXevbsiaFDh0IIgf/85z949dVXjZEbERGZsbLr\nf6n++6OlLE9RNnfAcvI2B6aY+VdxrTnAct4znYXWwoUL8cMPP+DgwYMAgHfeeQd9+/ZVPDEiIjJv\nlvJFp4lXQZScAAAgAElEQVQ55G6KGXCWqqreL1OMiKmEjvOAqampcHV1xXPPPQcAKCgoQEZGBry9\nvZVNTKUyiz8EIiIi4OnMudKLuQ15vrKvNxeVzdtSCkx932+Vh4fGS6t0XqPVv39/WFtb/y/Aygr9\n+/fXN18iIiIiiyiyqpLOU4fFxcWoXbu2+nGdOnXw5MkT2Q14e3vDzs4O1tbWqFWrFo4ePYrY2Fh8\n9dVX6ovs582bh169ehmQPhEREZEyVB7uks/JXQNMZ6HVqFEjbNq0Cb179wYAbNq0CY0aNZKZ4tNT\ngElJSXB0dCz3u5iYGMTExMjeDhEREZmeJc78M0TF04aGnjLVWWh9+eWXGDZsGN59910AgKenJ77+\n+mu9GtF0zpJLRBAREVmemnbqr7J0XqPVokULHDlyBGfOnMHZs2dx+PBhtGjRQnYDKpUKPXr0QLt2\n7bBixQr175cuXYqgoCCMHj0aWVlZhmVPRERkoWrKyJA5McUaYDoLrX/84x/IyclBgwYNMGnSJLRp\n0wY///yz7AYOHjyIEydOYPv27fjXv/6F/fv3Y/z48UhNTUVKSgrc3NwwefLkSu0EERGRpampI0Om\nXPDUFLeL0nnqMCEhAe+99x5+/vlnZGZmYu3atYiKikLPnj1lNeDm5gYAcHZ2Rt++f
XH06FF06dJF\n/fyYMWMQGRmpMTY2Pl79c2inTggNCZHVJhEREZknUy94qvKomnW5kg4dQtLhwzpfp7PQKr2WauvW\nrYiKioK/v7/sJPLz81FcXAxbW1vk5eVh586d+Pjjj3H79m24uroCAH766ScEBARojI/lSBcRERFV\nUlUUVhVP9YaGhJQbAJq1aJHGOJ2FVtu2bREREYErV64gLi4OOTk5sLLSecYRAJCRkaFeRb6oqAjD\nhg1DREQEhg8fjpSUFKhUKjRr1gzLli2TtT0iIiIyLUtZcLSqGbrPOleGLy4uRkpKCpo3bw57e3vc\nv38faWlpCAwMNKhB2YlxZXgiIjIjVb0yvKWq7H5W1+Nk8Mrw1tbWaNu2Lezt7QEATk5OihdZRERE\n1Z0pLwon45F3DpCIiIiqlClmwJmDmrasBQstIiIiMpqadn2XzkIrJiYGf/75pzFyISIiIjJLhp7q\n1VlotWrVCm+//Tbat2+PL7/8EtnZ2QY1RERERE8vmn66rpOq3PpOlqKmnforZeipXp2zDkudO3cO\nq1evxrp169C5c2eMHTsW3bt3N6hRWYlx1iEREZkRzjqs2XS//wbOOgSeLvFw7tw5nD17Fs7OzggK\nCsKiRYswaNAgwzMmIiIiquZ0Flrvv/8+WrZsiW3btmHGjBk4fvw4pk6disTERKSkpBgjRyIiIqom\natqyFjoLrcDAQJw8eRLLly9H+/btyz135MgRxRIjIiKi6qemLWuhs9D6+uuvUb9+/XK/CwsLAwD1\nIqZERERE1ZmhkwAkC62CggLcv38f9+7dQ2Zmpvrf1atXkcaL1ImIiGqkmnbqr5Sh639J3lR62bJl\nWLx4MW7duoW2bduqf29ra4t3333XoMaIiIjIss1aVDNvKm0oncs7LF26FBMnTjRWPmpc3oGIiMwJ\nl3d4ytD9rLhmWHX7jpda3kFyRGv37t145ZVX4O7ujh9//PGZ5/v161e1GRIREVG1Vd0KK7kkC629\ne/filVdeQWJiIlQq1TPPs9AiIiIi0k72yvDGxlOHRERkTnjq8Kmasp8VxcZrvzZN71OH8fHxz25E\npYIQAiqVCjExMQamSkRERJaqJt/r0JBJAJKFVm5ursZThqWFFhEREdU8nHGoH8lCKzY21ohpEBER\nEVU/koVWqYKCAqxcuRJnzpxBQUGBejQrISFB8eSIiIiILJnOW/BERUUhIyMDO3bsQGhoKG7cuIEG\nDRoYIzciIiIii6ZzROvSpUv4/vvvsWnTJkRHR2Po0KHo3LmzMXIjIiIiM1FxwVGgZq2NZegkAJ2F\nVu3atQEADRs2xOnTp+Hq6oq7d+8a1BgRERFZpppUVGlS5fc6LDV27FhkZmZizpw5eOONN/Dw4UPM\nnj3boMaIiIiIahIuWEpERCQDFywlbfResLTUrFmz/reRMutnffTRR1WUGhEREVH1pLPQql+/vrrA\nKigowJYtW+Dr66t4YkRERESWTu9Th48fP0ZERAT27t2rVE4AeOqQiIjMC08d1myG3utQ5zpaFeXl\n5SGNBRARERHVILMW2RoUp/PUYUBAgPrnkpIS3Llzh9dnEREREcmgs9BKTEz834ttbODi4oJatWop\nmhQRERFRdaCz0LKzsyv3ODe3/PlJR0fHqs2IiIiIqJrQWWi1adMG169fh4ODAwDgwYMH8PLygkql\ngkqlwpUrVxRPkoiIiMgS6bwYPjw8HFu2bMH9+/dx//59bN26FREREUhNTZVVZHl7eyMwMBDBwcFo\n3749ACAzMxPh4eHw8fFBREQEsrKyKr8nRERERAox9F6HOpd38Pf3xx9//KHzd1KaNWuG48ePlzvF\nOGXKFDRq1AhTpkzB/Pnz8eDBA8TFxZVPjMs7EBGRGeHyDqSNwcs7uLu7Y86cObh69SpSU1PxySef\nwEPDHby1qdjw5s2bER0dDQCIjo7Gxo0b9doeE
RERkSXQWWh9++23uHPnDvr27Yt+/frhzp07+Pbb\nb2U3oFKp0KNHD7Rr1w4rVqwAAGRkZMDFxQUA4OLigoyMDAPTJyIiIjJfOi+Gd3JywpIlSwxu4ODB\ng3Bzc8Pdu3cRHh6OF198sdzzpRfVaxIbH6/+ObRTJ4SGhBicBxEREVFVSTp0CEmHD+t8nd634KmM\nWbNmoUGDBlixYgWSkpLg6uqK9PR0dO/eHefOnSufGK/RIiIiM8JrtEibKrsFjz7y8/PV627l5eVh\n586dCAgIwBtvvIE1a9YAANasWYM+ffoomQYRERFRpcTGG3YLHkVHtFJTU9G3b18AQFFREYYNG4bp\n06cjMzMTAwcOxPXr1+Ht7Y3169fD3t6+fGIc0SIiIjPCEa2aTff7r3lES7LQmjhx4v9epFKVC1ap\nVJW6bksOFlpERGROWGjVbIYWWpKnDtu2bYu2bdvi8ePHSE5Oho+PD1544QWkpKSgsLCwarImIiIi\nqsZ0njrs0KEDDhw4oL6R9JMnT9C5c2ccOXJE2cQ4okVERGaEI1o1W5WPaJXKyspCTk6O+nFubi5v\nmUNEREQkg851tKZNm4Y2bdogNDQUALB3717ExsYqnBYRERGR+VDsXocAkJ6ejqNHjwJ4eirR1dXV\noMb0SoynDomIyIzw1CFpY/Cpw5KSEuzatQsnT55E7969UVhYqC66iIiIiEiazkJrwoQJOHz4sPr+\nhg0aNMCECRMUT4yIiIjI0um8RuvIkSM4ceIEgoODAQCOjo548uSJ4okRERERWTqdI1q1a9dGcXGx\n+vHdu3dhZaXonXuIiIiIqgWdFdPEiRPRt29f3LlzBx9++CFefvllTJ8+3Ri5EREREZkFRe91ePbs\nWfz6668AgLCwMLRq1cqgxvRKjLMOiYjIjHDWYc2m2IKlly9fRrNmzfDuu+/Cz88Pv/zyCxcsJSIi\nIpJBZ6HVr18/2NjY4NKlS3jnnXdw48YNDB061Bi5EREREZmUysMDKg8PAKoyP8unc9ahlZUVbGxs\n8OOPP2LixImYOHGiegYiERERUXVW2cuYZM06XLduHdauXYvXX38dALi8AxEREZEMOguthIQEHD58\nGDNmzECzZs1w5coVvPXWW8bIjYiIiMiiyZp1aAqcdUhEROaEsw5JG6lZhzqv0bpw4QI+/PBDnDlz\nBgUFBU83plLhypUrVZ8lERERUTWi89ThyJEjMW7cONjY2CApKQnR0dEYNmyYMXIjIiIismg6C62C\nggL06NEDQgg0bdoUsbGx2Lp1qzFyIyIiIrJoOk8d1q1bF8XFxWjRogX++c9/wt3dHXl5ecbIjYiI\niMii6Sy0/vGPfyA/Px9LlizBzJkzkZOTgzVr1hgjNyIiIiKLJnvWYU5ODgDAzs5O0YRKcdYhERGZ\nE846JG0MvtfhsWPHEBAQoP4XFBSE33//XZEkiYiIiKoTnacOR40ahc8//xxdunQBABw4cACjRo3C\nqVOnFE+OiIiIyJLpHNGysbFRF1kA0LlzZ9jY6KzPiIiIiGo8yYrp+PHjAIBu3brhnXfewZAhQwAA\n//nPf9CtWzfjZEdERERkwSQvhg8NDYVKpXrm90IIqFQq7NmzR9nEeDE8ERGZEV4MT9rofQuepKQk\nJfMhIiIiqvZ0XqNV1uuvv65UHkRERETVjl6FVhpP5RERERHJpleh1bp1a6XyICIiIqp29Cq0Vq1a\npXcDxcXFCA4ORmRkJAAgNjYWnp6eCA4ORnBwMHbs2KH3NomIiIgsgeTF8AEBAZJBKpVK9oKlixcv\nhq+vL3Jzc9WxMTExiImJ0TNVIiIiIssiWWglJiYCAD7//HMAQFRUFIQQ+Pe//y174zdv3sS2bdsw\nY8YMLFq0CMDT5SFk3l6RiIiIyKJJFlre3t4AgJ07dyIlJUX9+8DAQAQHB2P+/Pk6N/7+++9j4cKF\n6htSA09Ht
JYuXYq1a9eiXbt2iI+Ph729fSV2gYiIiMg86bxGSwiBAwcOqB8fPHhQ1ojUli1b0Lhx\nYwQHB5d7/fjx45GamoqUlBS4ublh8uTJBqZOREREZN503rQwISEBI0eORHZ2NgDA3t5e1kXxhw4d\nwubNm7Ft2zY8evQIOTk5GD58ONauXat+zZgxY9QXyWsSGx+v/jm0UyeEhoTobJeIiIhIaUmHDiHp\n8GGdr5O8BU9F2dnZEEIYdJpv7969+PTTT5GYmIj09HS4ubkBAD777DMcO3YM69atezYx3oKHiIjM\nCG/BQ9rofQueUrdv38aMGTOQlpaGHTt24MyZMzh8+DBGjx4tu/HS+yMCwJQpU3Dy5EmoVCo0a9YM\ny5Yt02M3iIiIiCyHzhGtXr16YeTIkfjkk09w6tQpPHnyBMHBwfjjjz+UTYwjWkREZEY4okXaSI1o\n6bwY/t69exg0aBCsra0BALVq1YKNjc6BMCIiIqIaT2eh1aBBA9y/f1/9+LfffkPDhg0VTYqIiIio\nOtA5NBUfH4/IyEhcuXIFISEhuHv3Lr7//ntj5EZERERk0bQWWsXFxdi3bx/27duHc+fOQQiBli1b\nonbt2sbKj4iIiMhiaT11aG1tjXXr1sHGxgb+/v4ICAhgkUVEREQkk85Th507d8a7776LQYMGoX79\n+uqlGtq0aWOM/IiIiIgsls5C68SJE1CpVPjoo4/K/X7Pnj2KJUVERERUHegstJKSkoyQBhEREVH1\no7PQevToEX744QdcvXoVxcXF6lOHFUe4iIiIiKg8nYVW7969YW9vj7Zt26Ju3brGyImIiIioWtBZ\naKWlpeHnn382Ri5ERERE1YrOQiskJASnTp1CYGCgMfIhIiIyWyoPd8nnHOxLjJgJWQrJm0oHBAQA\neLpo6cWLF9GsWTPUqVPnaZBKhVOnTimbGG8qTUREZow3kaaypG4qLTmilZiYCJVKBQAaA4mIiIhI\nO8lCy8XFBV9++SUuXbqEwMBAjB49GjY2Os80EhEREdF/Sd6CJzo6GsePH0dAQAC2bduGyZMnGzMv\nIiIiIosnOUR19uxZnD59GgAwZswYvPTSS0ZLioiIiKg6kBzRKnuakKcMiYiIyvs4JtfUKZAFkJx1\naG1tjXr16qkfFxQU4LnnnnsapFIhJydH2cQ465CIiIgshN6zDouLixVNiIiIiKi6kzx1SERERESV\nw0KLiIiISCEstIiIiIgUwkKLiIjIALHxtqZOgSyA5KxDU+OsQyIiMme81yGVJTXrkCNaRERERAph\noUVERESkEBZaRERERAphoUVERESkEBZaREREBuC9DkkOzjokIiIiqiTOOiQiIiIyMhZaRERERApR\nvNAqLi5GcHAwIiMjAQCZmZkIDw+Hj48PIiIikJWVpXQKRERERCaheKG1ePFi+Pr6QqVSAQDi4uIQ\nHh6OCxcuICwsDHFxcUqnQERERGQSihZaN2/exLZt2zBmzBj1BWKbN29GdHQ0ACA6OhobN25UMgUi\nIiJF8F6HJIeihdb777+PhQsXwsrqf81kZGTAxcUFAODi4oKMjAwlUyAiIlLErEUstEg3xQqtLVu2\noHHjxggODtY43RF4uoRD6SlFIiIiourGRqkNHzp0CJs3b8a2bdvw6NEj5OTkICoqCi4uLrh9+zZc\nXV2Rnp6Oxo0bS24jNj5e/XNop04IDQlRKl0iIiIi2ZIOHULS4cM6X2eUBUv37t2LTz/9FImJiZgy\nZQqcnJwwdepUxMXFISsrS+MF8VywlIiIzJnKwx0i7Zap0yAzYfIFS0tPEU6bNg2//PILfHx8sHv3\nbkybNs1YKRAREREZFW/BQ0REZIDYeFvETub9DukpqREtFlpERERElWTyU4dERERENQ0LLSIiIiKF\nsNAiIiIiUggLLSIiIiKFsNAiIiIyAO91SHJw1iEREZEBuGAplcVZh0RERER
GxkKLiIiISCEstIiI\niIgUwkKLiIiISCEstIiIiAzwcQzvc0i6cdYhERERUSVx1iERERGRkbHQIiIiIlIICy0iIiIihbDQ\nIiIiIlIICy0iIiID8F6HJAdnHRIRERmA9zqksjjrkIiIiMjIWGgRERERKYSFFhEREZFCWGgRERER\nKYSFFhERkQF4r0OSg7MOiYiIiCqJsw6JiIiIjIyFFhEREZFCWGgRERERKYSFFhEREZFCWGgREREZ\ngPc6JDk465CIiMgAvNchlcVZh0RERERGxkKLiIiISCGKFlqPHj1Chw4d0Lp1a/j6+mL69OkAgNjY\nWHh6eiI4OBjBwcHYsWOHkmkQERERmYSNkhuvW7cu9uzZg3r16qGoqAidO3fGgQMHoFKpEBMTg5iY\nGCWbJyIiIjIpxU8d1qtXDwBQWFiI4uJiODg4AIDGC8aIiIgsBe91SHIoXmiVlJSgdevWcHFxQffu\n3eHn5wcAWLp0KYKCgjB69GhkZWUpnQYREVGVip3MQot0U7zQsrKyQkpKCm7evIl9+/YhKSkJ48eP\nR2pqKlJSUuDm5obJkycrnQYRERGR0Sl6jVZZDRs2xF/+8hf8/vvvCA0NVf9+zJgxiIyM1BgTGx+v\n/jm0UyeEhoQonSYRERGRTkmHDiHp8GGdr1N0wdJ79+7BxsYG9vb2KCgoQM+ePfHxxx/Dz88Prq6u\nAIDPPvsMx44dw7p168onxgVLiYiIyEJILViq6IhWeno6oqOjUVJSgpKSEkRFRSEsLAzDhw9HSkoK\nVCoVmjVrhmXLlimZBhEREZFJ8BY8REREBoiNt+UF8aQmNaLFQouIiMgAvNchlcV7HRIREREZGQst\nIiIiIoWw0CIiIiJSCAstIiIiIoWw0CIiIjIA73VIcnDWIREREVElcdYhERERkZGx0CIiIiJSCAst\nIiIiIoWw0CIiIiJSCAstIiIiA8TG25o6BbIAnHVIRERkAN7rkMqSmnVoY4JciIiILJbKw6PMz0//\ny4EBksJCi4iISA8sqkgfvEaLiIiISCEstIiIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiI\niIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEs\ntIiIiIgUwkKLiIiISCEstIiIiIgUwkKLiIiISCEstIiIiIgUolih9ejRI3To0AGtW7eGr68vpk+f\nDgDIzMxEeHg4fHx8EBERgaysLKVSICIiIjIpxQqtunXrYs+ePUhJScGpU6ewZ88eHDhwAHFxcQgP\nD8eFCxcQFhaGuLg4g7afdOiQwblVJpZts222zbbZNttm22xbLkVPHdarVw8AUFhYiOLiYjg4OGDz\n5s2Ijo4GAERHR2Pjxo0GbTvp8GGD86pMLNtm22ybbbNtts222bZcihZaJSUlaN26NVxcXNC9e3f4\n+fkhIyMDLi4uAAAXFxdkZGQomQIRERGRydgouXErKyukpKQgOzsbPXv2xJ49e8o9r1KpoFKplEyB\niIiIyGRUQghhjIZmz56N5557Dl999RWSkpLg6uqK9PR0dO/eHefOnXvm9a1bt8bJkyeNkRoRERFR\npQQFBSElJeWZ3ytWaN27dw82Njawt7dHQUEBevbsiY8//hg///wznJycMHXqVMTFxSErK8vgC+KJ\niIiIzJlihdbp06cRHR2NkpISlJSUICoqCh988AEyMzMxcOBAXL9+Hd7e3li/fj3s7e2VSIGIiIjI\npIx26pCIiIioprGoleFHjRoFFxcXBAQEGBR/48YN9exHf39/LFmyRHas1AKs+iguLkZwcDAiIyP1\njvX29kZgYCCCg4PRvn17veOzsrLQv39/tGrVCr6+vvjtt99kxZ0/fx7BwcHqfw0bNtTruAHAvHnz\n4Ofnh4CAAAwdOhSPHz+WHbt48WIEBAT
A398fixcv1vl6TX1E7iK5mmI3bNgAPz8/WFtbIzk5We+2\nP/jgA7Rq1QpBQUHo168fsrOzZcfOnDkTQUFBaN26NcLCwnDjxg292i4VHx8PKysrZGZmyo6NjY2F\np6en+n3fsWOH3m0vXboUrVq1gr+/P6ZOnSo7dvDgwep2mzVrhuDgYL3aPnr0KNq3b4/g4GC89NJL\nOHbsmOzYkydPolOnTggMDMQbb7yB3NxcjbFSnyVy+5pUvNz+JhUvp79Jxcrpb7o+Q3X1Nal4Of1N\nW9ty+ppU/KBBg3T2N6lYuX1NKl5Of6vs4t9S8XL7mlS8nL4mFSunr+n6ztXV16Ti5fQ1bW3L6WvP\nEBZk3759Ijk5Wfj7+xsUn56eLk6cOCGEECI3N1f4+PiIM2fOyI7Py8sTQgjx5MkT0aFDB7F//369\n2o+PjxdDhw4VkZGResUJIYS3t7e4f/++3nGlhg8fLlauXCmEeJp/VlaW3tsoLi4Wrq6u4vr167Jj\nUlNTRbNmzcSjR4+EEEIMHDhQrF69Wlbs6dOnhb+/vygoKBBFRUWiR48e4tKlS1pjNPWRDz74QMyf\nP18IIURcXJyYOnWq7NizZ8+K8+fPi9DQUHH8+HG92965c6coLi4WQggxdepUvdrOyclR/7xkyRIx\nevRovdoWQojr16+Lnj17au0/mmJjY2NFfHy8lr3VHr97927Ro0cPUVhYKIQQ4s6dO3rlXWry5Mli\n9uzZerXdrVs3sWPHDiGEENu2bROhoaGyY9u1ayf27dsnhBAiISFBzJw5U2Os1GeJ3L4mFS+3v0nF\ny+lvUrFy+pu2z1A5fU0qXk5/k4qV29fkfP5L9TepWLl9TSpebn/T9N0jt69Jxevz2aYpXu5nm6ZY\nuZ9tUt+5cvqaVLzczzZNsXL7WkUWNaLVpUsXODg4GBzv6uqK1q1bAwAaNGiAVq1a4datW7LjKy7A\n6ujoKDv25s2b2LZtG8aMGQNh4NlaQ+Oys7Oxf/9+jBo1CgBgY2ODhg0b6r2dXbt2oXnz5mjSpIns\nGDs7O9SqVQv5+fkoKipCfn4+PDw8ZMWeO3cOHTp0QN26dWFtbY1u3brhxx9/1BqjqY/IXSRXU+yL\nL74IHx8fWflqig8PD4eV1dM/sw4dOuDmzZuyY21tbdU/P3z4EI0aNdKrbQCIiYnBggUL9M4bkN/f\nNMV/8cUXmD59OmrVqgUAcHZ21qvt0vbXr1+PIUOG6NW2m5ub+v+us7KyJPubptiLFy+iS5cuAIAe\nPXrghx9+0Bir6bMkLS1Ndl+T+iyS29+k4uX0N6lYOf1N22eonL4mddwA3f1NKvbLL7+U1dd0ff5r\n629Sbcvta1LxcvtbZRf/1vTdpc9nm6Z4uZ9tmmLlfrZJfefK6Wua4kv/3uV8tmmKldvXKrKoQqsq\nXb16FSdOnECHDh1kx1RcgNXX11d27Pvvv4+FCxeqO6a+VCoVevTogXbt2mHFihV6xaampsLZ2Rkj\nR45EmzZtMHbsWOTn5+udw3fffYehQ4fqFePo6IjJkyfDy8sL7u7usLe3R48ePWTF+vv7Y//+/cjM\nzER+fj62bt0q+cesjbkskpuQkIDXXntNr5gZM2bAy8sLa9aswbRp0/SK3bRpEzw9PREYGKhXXKml\nS5ciKCgIo0eP1vuepBcvXsS+ffvQsWNHhIaG4vfff9e7/f3798PFxQXNmzfXKy4uLk7d5z744APM\nmzdPdqyfnx82bdoE4OmpFW2na0uV/SwxpK8Z8lkkJ15Of6sYq09/KxtrSF8rje/YsSMA/fpb2bYv\nXLigd1/TdMzk9reyeRvS18q2Lbe/VXbx78p8d8mJ19bXpGLl9DVNsfr0NU3HDZDX1zTFGtLXAFjW\nqUMhnp6KMvTUYanc3FzRtm1b8dNPPxkUn5WVJTp06CD27Nkj6/WJiYliwoQJQggh9uzZI15//XW9\n27x
165YQ4ulQZVBQkHq4WY5jx44JGxsbcfToUSGEEJMmTZIcopby+PFj0ahRI9lDpaUuXbokWrVq\nJe7duyeePHki+vTpI7755hvZ8StXrhRt27YVXbt2FePHjxfvvfeezpiKfcTe3r7c8w4ODrJjS8kZ\nXtcWP2fOHNGvXz+DYoUQYt68eWLEiBGy4/Py8kT79u1Fdna2EOLpqed79+7JbjsjI0OUlJSIkpIS\nMWPGDDFq1Ci9cvf39xd//etfhRBCHD16VDRr1kx2bKlx48aJRYsWaW1XU3xYWJj48ccfhRBCrF+/\nXvTo0UN27Llz50RERIRo27atmDVrlnByctLadm5urmjTpo36s0SfvlYar+mzSG5/k4qX09+0fQ7q\n6m9lY/Xta5ra1qe/VYzVp69p2285/a1irD59TVO8vv2t9Ltn9+7deve1svFlv7vk9jWpeDl9TSpW\nCHmfbaWxW7duFR06dNCrr1VsW9/PtrKx+va1UjWu0CosLBQRERHis88+q1Qef//738XChQtlvXb6\n9FlE8hEAAAmzSURBVOnC09NTeHt7C1dXV1GvXj0RFRVlcNuxsbHi008/lf369PR04e3trX68f/9+\n8Ze//EWvNjdu3Ch69uypV4wQQnz33Xflzr+vXbtWXXTqa/r06eKLL77Q+bqKfaRly5YiPT1dCPG0\nYG3ZsqXs2FKVKbRWrVolQkJCREFBgd6xpa5duyb8/Pxkx586dUo0btxYeHt7C29vb2FjYyOaNm0q\nMjIy9G5bzt9cxdf06tVLJCUlqR83b95c8gNR0/afPHkiXFxcRFpamtZ2NcXb2tqqfy4pKRF2dnay\nY8s6f/68aN++vWSsps8Sffqats8iOf1NKl5Of9P1Oaitv1WM1bev6Wpb23uiKVafvibVtpz+pilW\nn76ma7919bdSpd89+vQ1TfGl9Cm0KsbL/WyTalsIeZ9tpbGzZ8/Wq6/paltuPVEaq09fK6tGnToU\nQmD06NHw9fXFe++9p1fsvXv31EOMBQUF+OWXX7TOhipr7ty5uHHjBlJTU/Hdd9/hlVdewdq1a2W3\nnZ+fr56NkpeXh507d+o189LV1RVNmjTBhQsXADy91qp0CFWub7/9Vuu1MlJefPFF/PbbbygoKIAQ\nArt27dJr2PrOnTsAgOvXr+Onn37S+9QlALzxxhtYs2YNAGDNmjXo06eP3tsADLtGbseOHVi4cCE2\nbdqEunXr6hV78eJF9c+bNm2S3d8AICAgABkZGUhNTUVqaio8PT2RnJyMxo0by4pPT09X//zTTz/p\nPdO3T58+2L17NwDgwoULKCwshJOTk+z4Xbt2oVWrVnB3d9erXQBo0aIF9u7dCwDYvXu37OtQAODu\n3bsAnp42mDNnDsaPH6/xdVKfJXL7mpzPIm39TSpeTn+TipXT3zTF6tPXpNqW09+kYuX2NW3HXFd/\nk4qV29ek4uX0N6nvHrl9Tc53l7a+JhUvp69JxV66dEn9Gqm+pim2U6dOsvuaVNu3b99Wv0aqr0nF\nGvy5prMUMyODBw8Wbm5uonbt2sLT01MkJCToFb9//36hUqlEUFCQaN26tWjdurXYvn27rNhTp06J\n4OBgERQUJAICAsSCBQsM2QWRlJSk96zDK1euiKCgIBEUFCT8/PzE3Llz9W43JSVFtGvXTgQGBoq+\nffvqNevw4cOHwsnJqdxMEX3Mnz9f+Pr6Cn9/fzF8+HD1jA05unTpInx9fUVQUJDYvXu3zteX9pFa\ntWqp+8j9+/dFWFiYeOGFF0R4eLh48OCBrNiVK1eKn376SXh6eoq6desKFxcX0atXL9ltr1y5UrRo\n0UJ4eXmp+9v48eNlx7755pvC399fBAUFiX79+mn9vzZdfxvNmjWTnJ2jqe2oqCgREBAgAgMDRe/e\nvcXt27f1aruwsFC89dZbwt/fX7Rp00byNLtU3iNGjBDLli2TbFMq9
4SEBHHs2DHRvn17ERQUJDp2\n7CiSk5Nl7/fixYuFj4+P8PHxEdOnT5dsV+qzRG5f0xS/bds22f1NKl5Of5OKldPfpGLL0tbXpOLl\n9DepYy63r2n7/NfV36TyltvXpOLl9Dep7x65fU0q/scff5TV16Ti5fQ1qVg5fU3Od662viYVL6ev\nScXK7WsVccFSIiIiIoXUqFOHRERERMbEQouIiIhIISy0iIiIiBTCQouIiIhIISy0iIiIiBTCQouI\niIhIISy0iMhorK2tERwcjMDAQPTr1w8PHz6sku16e3sjMzOzSrZVUWJiIubPnw8A2LhxI86ePat+\n7uOPP8avv/6qSLtEVD1wHS0iMhpbW1v1XQ5GjBiBgIAATJ48udLbbdasGY4fPw5HR8dKb0ubESNG\nIDIyEm+++aai7RBR9cERLSIyiU6dOuHy5csAgMuXL+PVV19Fu3bt0LVrV5w/fx7A09Gkjh07ok2b\nNggPD1ffkun+/fuIiIiAv78/xo4dK3kLkQYNGiAmJgb+/v7o0aMH7t27BwBISUlBx44dERQUhH79\n+qlvt7FkyRL4+fkhKChIfbun1atXY+LEiTh8+DASExPxwQcfoE2bNrhy5QpGjBiBH374AQDw66+/\nok2bNggMDMTo0aNRWFgI4OloW2xsLNq2bYvAwED1vhFRzcBCi4iMrri4GDt37oS/vz8A4O2338bS\npUvx+++/Y+HChZgwYQIAoEuXLvjtt9+QnJyMQYMGYcGCBQCAWbNmoWvXrvjjjz/Qt29fXL9+XWM7\n+fn5eOmll/DHH3+gW7dumDVrFgBg+PDhWLhwIU6ePImAgAD17+fPn4+UlBScPHkSX375JQBApVIB\neFoYvvHGG/j000+RnJyM559/HiqVCiqVCo8ePcLIkSOxfv16nDp1CkVFRfjiiy/U8c7Ozjh+/DjG\njx+PTz/9VKGjSkTmiIUWERlNQUEBgoOD4ebmhhs3bmDcuHF4+PAh/r+9u3dpHQzDOPzzA2rxs4OL\nixQKLjXWqmO1k4OjEBSpuBZEnZyE6ijo5OYgIqUdqiAK4iq1IDhYCCi4SNU/QJrFpZoziMHTdpFD\nFD33BVleyPMkmW4e3iQXFxeYpsng4CDJZNL98evj4yPj4+MYhsHm5iY3NzcAnJ+fk0gkAJiYmCAQ\nCNTt19jYyNTUFACJRIJCoYBt25TLZWKxGABzc3Pk83kADMNgZmaGTCZDU1NT3ZrV0zPHcbi9vSUY\nDBIKhWpqAkxOTgIQjUYplUqffm4i8nMpaInIl/H7/RSLRe7v72lpaeHo6AjHcejq6qJYLLrH9fU1\nAAsLCywuLmJZFtvb2zw/P7u1Pru91HEcdzpVvf7u5OSE+fl5rq6uGBkZ4eXlpaZPvRrVa9W9fD4f\n8PYyQKVS+dR1i8jPpqAlIl/O7/eztbXFysoKbW1tBINBDg4OgLeQYlkWALZt09PTA7ztlXo3OjpK\nNpsF4PT0lKenp7p9Xl9f2d/fByCbzRKLxejo6CAQCFAoFABIp9PE43Ecx+Hh4YF4PM76+jrlcrnm\nrcj29nZs2/5rraGhgb6+PkqlkrvnLJ1OMzY29i+PSER+CQUtEfkyH6c8kUiEUChELpcjk8mws7ND\nJBIhHA5zfHwMwNraGqZpMjw8THd3t3v+6uoq+XyecDjM4eEhvb29dfu1trZyeXlJf38/Z2dnpFIp\nAPb29lheXmZgYADLskilUlQqFWZnZzEMg2g0ytLSEp2dne4+LIDp6Wk2NjYYGhri7u7O7ePz+djd\n3cU0TQzDoLm5mWQyWXPPH2uJyP9Bn3cQkV/r4+ckRES+gyZaIvJraXokIt9NEy0RERERj2iiJSIi\nIuIRBS0RERERjyhoiYiIiHhEQUtERETEIwpaIiIiIh5R0BIRERHxyB8CAICS+JK/JAAAAABJRU5E\nrkJggg==\n", - "text": [ 
- "<matplotlib.figure.Figure at 0x2e616d0>" - ] - } - ], - "prompt_number": 6 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Writing sequence files with `SeqIO`\n", - "\n", - "Often, after manipulating parsed sequences, we need to write it back to a file. This is accomplished using `SeqIO.write` in Biopython. The function takes as its input:\n", - "\n", - "1. An iterable returning `SeqRecord` objects (generators and lists are examples of iterables).\n", - "2. A filename to write to *or* a file-like handle.\n", - "3. The format to write to.\n", - "\n", - "It returns the number of sequences written." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "records = SeqIO.parse('data/parse.fastq', 'fastq')" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 169 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "SeqIO.write(records, 'my_sequences.fa', 'fasta')" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 170, - "text": [ - "2" - ] - } - ], - "prompt_number": 170 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that we're free to write the output format as any format supported by `SeqIO.write` so long as the information is adequate. In this case, reading the `my_sequences.fa` and writing it back to `parse.fastq` would not be possible since we have discarded the quality information.\n", - "\n", - "Additionally, the sequence records are written one-by-one to the file (courtesy of `SeqIO.parse`). So even if the `data/parse.fastq` is 10GB large, we can run the commands with only 1GB of free memory (for example)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Additional `SeqIO` methods\n", - "\n", - "There are several more useful `SeqIO` functions, which you are free to try out on your own:\n", - "\n", - "* `SeqIO.index`: For efficiently fetching random records from a large sequence file.\n", - "* `SeqIO.index_db`: Similar to `SeqIO.index`, but with a persistent index.\n", - "* `SeqIO.convert`: Shortcut for converting between file formats." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "<a id=\"entrez\"></a>\n", - "## Fetching from online resources: NCBI's Entrez\n", - "\n", - "The `Bio.Entrez` library provides interface to [NCBI's Entrez e-utilities](https://www.ncbi.nlm.nih.gov/books/NBK25500/). One example we are demonstrating today is the `Entrez.efetch` utility to retrieve various records from one of NCBI's databases." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from Bio import Entrez" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 171 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To monitor potential excessive use of their services, NCBI requests you to specify your email address with each request. With Biopython, you can set it once for your session like this:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "Entrez.email = 'python@lumc.nl'" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 172 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fetching with `efetch`\n", - "\n", - "The `Entrez.efetch` function returns a file-like handle that instead of pointing to a local file, points to a remote resource. This file handle is similar to the local file handle we saw on Tuesday." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "efetch_handle = Entrez.efetch(db=\"nucleotide\", id=\"NM_005804\",\n", - " rettype=\"gb\", retmode=\"text\")" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 127 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can of course do an `efetch_handle.read()` and see the entire contents of the genbank file. But we know how to work with `SeqIO.read` now, so let's use that instead (recall that `SeqIO.read` works equally well with file handles and local file names)." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ncbi_record = SeqIO.read(efetch_handle, 'genbank')" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 128 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print ncbi_record" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "ID: NM_005804.3\n", - "Name: NM_005804\n", - "Description: Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 39A (DDX39A), transcript variant 1, mRNA.\n", - "Number of features: 13\n", - "/comment=REVIEWED REFSEQ: This record has been curated by NCBI staff. The\n", - "reference sequence was derived from DA432925.1, BC001009.2 and\n", - "BM792110.1.\n", - "This sequence is a reference standard in the RefSeqGene project.\n", - "On Oct 14, 2010 this sequence version replaced gi:21040370.\n", - "Summary: This gene encodes a member of the DEAD box protein family.\n", - "These proteins are characterized by the conserved motif\n", - "Asp-Glu-Ala-Asp (DEAD) and are putative RNA helicases. 
They are\n", - "implicated in a number of cellular processes involving alteration\n", - "of RNA secondary structure, such as translation initiation, nuclear\n", - "and mitochondrial splicing, and ribosome and spliceosome assembly.\n", - "Based on their distribution patterns, some members of the DEAD box\n", - "protein family are believed to be involved in embryogenesis,\n", - "spermatogenesis, and cellular growth and division. Alternatively\n", - "spliced transcript variants encoding different isoforms have been\n", - "found. [provided by RefSeq, Feb 2011].\n", - "Transcript Variant: This variant (1) represents the predominant,\n", - "protein-coding transcript.\n", - "Publication Note: This RefSeq record includes a subset of the\n", - "publications that are available for this gene. Please see the Gene\n", - "record to access additional publications.\n", - "##Evidence-Data-START##\n", - "Transcript exon combination :: U90426.1, BC001009.2 [ECO:0000332]\n", - "RNAseq introns :: single sample supports all introns\n", - " ERS025084 [ECO:0000348]\n", - "##Evidence-Data-END##\n", - "COMPLETENESS: complete on the 3' end.\n", - "/sequence_version=3\n", - "/source=Homo sapiens (human)\n", - "/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']\n", - "/keywords=['RefSeq']\n", - "/references=[Reference(title='DDX39 acts as a suppressor of invasion for bladder cancer', ...), Reference(title='Clinical proteomics identified ATP-dependent RNA helicase DDX39 as a novel biomarker to predict poor prognosis of patients with gastrointestinal stromal tumor', ...), Reference(title='Meta-analysis of genome-wide association studies identifies three new risk loci for atopic dermatitis', ...), Reference(title='Interferon-induced antiviral protein MxA interacts with the cellular RNA helicases UAP56 and URH49', ...), Reference(title='The cellular 
DExD/H-box RNA-helicases UAP56 and URH49 exhibit a CRM1-independent nucleocytoplasmic shuttling activity', ...), Reference(title='Growth-regulated expression and G0-specific turnover of the mRNA that encodes URH49, a mammalian DExH/D box protein that is highly related to the mRNA export protein UAP56', ...), Reference(title='Analysis of a high-throughput yeast two-hybrid system and its use to predict the function of intracellular proteins encoded within the human MHC class III region', ...), Reference(title='TREX is a conserved complex coupling transcription with messenger RNA export', ...), Reference(title='Directed proteomic analysis of the human nucleolus', ...), Reference(title='The BAT1 gene in the MHC encodes an evolutionarily conserved putative nuclear RNA helicase of the DEAD family', ...)]\n", - "/accessions=['NM_005804']\n", - "/data_file_division=PRI\n", - "/date=07-JUL-2013\n", - "/organism=Homo sapiens\n", - "/gi=308522777\n", - "Seq('AGCAGCAGCCCGACGCAAGAGGCAGGAAGCGCAGCAACTCGTGTCTGAGCGCCC...AAA', IUPACAmbiguousDNA())\n" - ] - } - ], - "prompt_number": 129 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`Entrez.efetch` also allows you to fetch multiple records in one go:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "efetch_handle = Entrez.efetch(db=\"nucleotide\", id=[\"NM_005804\",\"NM_000967\"],\n", - " rettype=\"gb\", retmode=\"text\")" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 120 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for record in SeqIO.parse(efetch_handle, 'genbank'):\n", - " print record.id, record.description" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "NM_005804.3 Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 39A (DDX39A), transcript variant 1, mRNA.\n", - "NM_000967.3 Homo sapiens ribosomal protein L3 (RPL3), transcript variant 1, 
mRNA.\n" - ] - } - ], - "prompt_number": 122 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For a full argument list of `Entrez.efetch`, consult its [documentation page](https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Other `Bio.Entrez` utilities\n", - "\n", - "Biopython has the entire Entrez suite supported. A complete list of the Entrez services is available in its [documentation](https://www.ncbi.nlm.nih.gov/books/NBK25500/)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "<a id=\"blast\"><a/>\n", - "## Performing a remote BLAST search\n", - "\n", - "A common action for bioinformaticians is to perform a BLAST search. Biopython provides a way to automate the search and helps you interpret the results." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using NCBI's BLAST\n", - "\n", - "Entrez is not the only online service Biopython interacts with. We can also submit BLAST searches to NCBI using the `qblast` function in the `Bio.Blast.NCBIWWW` module.\n", - "\n", - "Similar to Entrez, Biopython tries to conform to the NCBI's remote BLAST required parameters in the function call. The official NCBI documentation is [here](https://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html).\n", - "\n", - "Let's do a short BLAST search using our earlier fetched `ncbi_record` (NM_005804). 
" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from Bio.Blast.NCBIWWW import qblast" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 131 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "blast_handle = qblast('blastn', 'refseq_mrna', ncbi_record.seq)" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 132 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By default, the command returns the BLAST result in an XML file (one of the available formats to download from an interactive BLAST session). We will use `blast_handle` in the next section, but of course we could also write the results to a file for viewing later:\n", - "\n", - " blast_file = open('my_blast_output.xml', 'w')\n", - " blast_file.write(blast_handle.read())\n", - " blast_file.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Parsing the BLAST search results\n", - "\n", - "Of course, Biopython has parsing capabilities for the results as well. As mentioned briefly above, parsing for these types of file is available in the `Bio.Blast.NCBIXML` submodule and `Bio.SearchIO` submodule. 
The former is an old module that will be deprecated soon and replaced by the latter (which is still in experimental stage, but already stable enough).\n", - "\n", - "Here's a short example using `Bio.SearchIO`:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from Bio import SearchIO" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 136 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "qresult = SearchIO.read(blast_handle, 'blast-xml')" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 137 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "qresult" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 138, - "text": [ - "QueryResult(id='22127', 50 hits)" - ] - } - ], - "prompt_number": 138 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print qresult" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "Program: blastn (2.2.28+)\n", - " Query: 22127 (1558)\n", - " No definition line\n", - " Target: refseq_mrna\n", - " Hits: ---- ----- ----------------------------------------------------------\n", - " # # HSP ID + description \n", - " ---- ----- ----------------------------------------------------------\n", - " 0 1 gi|308522777|ref|NM_005804.3| Homo sapiens DEAD (Asp-G...\n", - " 1 1 gi|397471065|ref|XM_003807080.1| PREDICTED: Pan panisc...\n", - " 2 1 gi|426387514|ref|XM_004060164.1| PREDICTED: Gorilla go...\n", - " 3 1 gi|395750601|ref|XM_002828787.2| PREDICTED: Pongo abel...\n", - " 4 1 gi|402904531|ref|XM_003915048.1| PREDICTED: Papio anub...\n", - " 5 1 gi|426387518|ref|XM_004060166.1| PREDICTED: Gorilla go...\n", - " 6 1 gi|403302190|ref|XM_003941697.1| PREDICTED: Saimiri bo...\n", - " 7 1 gi|301601638|ref|NM_001193491.1| Macaca mulatta DEAD (...\n", - " 8 1 
gi|441628855|ref|XM_003275677.2| PREDICTED: Nomascus l...\n", - " 9 1 gi|478537026|ref|XM_004442573.1| PREDICTED: Ceratother...\n", - " 10 1 gi|472358840|ref|XM_004398985.1| PREDICTED: Odobenus r...\n", - " 11 1 gi|410950597|ref|XM_003981942.1| PREDICTED: Felis catu...\n", - " 12 1 gi|395850754|ref|XM_003797893.1| PREDICTED: Otolemur g...\n", - " 13 1 gi|511847827|ref|XM_004748233.1| PREDICTED: Mustela pu...\n", - " 14 1 gi|466046657|ref|XM_004277418.1| PREDICTED: Orcinus or...\n", - " 15 1 gi|470600673|ref|XM_004312273.1| PREDICTED: Tursiops t...\n", - " 16 1 gi|301771295|ref|XM_002920972.1| PREDICTED: Ailuropoda...\n", - " 17 1 gi|345787717|ref|XM_533895.3| PREDICTED: Canis lupus f...\n", - " 18 1 gi|344283234|ref|XM_003413330.1| PREDICTED: Loxodonta ...\n", - " 19 1 gi|471416597|ref|XM_004389765.1| PREDICTED: Trichechus...\n", - " 20 1 gi|426228875|ref|XM_004008473.1| PREDICTED: Ovis aries...\n", - " 21 1 gi|511847823|ref|XM_004748231.1| PREDICTED: Mustela pu...\n", - " 22 1 gi|511847825|ref|XM_004748232.1| PREDICTED: Mustela pu...\n", - " 23 1 gi|77736448|ref|NM_001034752.1| Bos taurus DEAD (Asp-G...\n", - " 24 1 gi|507964305|ref|XM_004688236.1| PREDICTED: Condylura ...\n", - " 25 2 gi|335282777|ref|XM_003123370.2| PREDICTED: Sus scrofa...\n", - " 26 1 gi|514445080|ref|XM_004999847.1| PREDICTED: Cavia porc...\n", - " 27 2 gi|513022593|ref|XM_004872489.1| PREDICTED: Heteroceph...\n", - " 28 1 gi|395837043|ref|XM_003791407.1| PREDICTED: Otolemur g...\n", - " 29 2 gi|335282779|ref|XM_003354105.1| PREDICTED: Sus scrofa...\n", - " ~~~\n", - " 47 1 gi|213511505|ref|NM_001141379.1| Salmo salar ATP-depen...\n", - " 48 1 gi|524974020|ref|XM_005086794.1| PREDICTED: Mesocricet...\n", - " 49 1 gi|524974018|ref|XM_005086793.1| PREDICTED: Mesocricet...\n" - ] - } - ], - "prompt_number": 139 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that the function call is very similar to `SeqIO`. 
However, the returned `QueryResult` object is something we have not seen before. In general, this is a container object for all our results.\n", - "\n", - "The main principle of `Bio.SearchIO` objects is that for all sequence database searches, we have at least three layers of containers:\n", - "\n", - "1. The query itself (`QueryResult`).\n", - "2. All the database hits from the query (`Hit`).\n", - "3. All the locations in a database record where alignments are found (`HSP`, for high-scoring pair)." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# for every hit in the first 10 query result\n", - "for hit in qresult[:10]:\n", - " # for every hsp in hit\n", - " for hsp in hit:\n", - " # show the hit ID and how long the match spans the hit (ungapped)\n", - " print hit.id, hsp.hit_span" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "gi|308522777|ref|NM_005804.3| 1558\n", - "gi|397471065|ref|XM_003807080.1| 1540\n", - "gi|426387514|ref|XM_004060164.1| 1539\n", - "gi|395750601|ref|XM_002828787.2| 1536\n", - "gi|402904531|ref|XM_003915048.1| 1562\n", - "gi|426387518|ref|XM_004060166.1| 1428\n", - "gi|403302190|ref|XM_003941697.1| 1539\n", - "gi|301601638|ref|NM_001193491.1| 1300\n", - "gi|441628855|ref|XM_003275677.2| 1419\n", - "gi|478537026|ref|XM_004442573.1| 1402\n" - ] - } - ], - "prompt_number": 148 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And all these containers are indexable." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# get the Hit object with the ID 'gi|308522777|ref|NM_005804.3|'\n", - "qresult['gi|308522777|ref|NM_005804.3|']" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 149, - "text": [ - "Hit(id='gi|308522777|ref|NM_005804.3|', query_id='22127', 1 hsps)" - ] - } - ], - "prompt_number": 149 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# get the first Hit object\n", - "qresult[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 150, - "text": [ - "Hit(id='gi|308522777|ref|NM_005804.3|', query_id='22127', 1 hsps)" - ] - } - ], - "prompt_number": 150 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# get the first HSP of the first Hit object\n", - "qresult[0][0]" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 151, - "text": [ - "HSP(hit_id='gi|308522777|ref|NM_005804.3|', query_id='22127', 1 fragments)" - ] - } - ], - "prompt_number": 151 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There is more to these objects than can be shown here. You can consult the [official API documentation](http://biopython.org/DIST/docs/api/) for a full reference." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### <span class=\"exercise\">Exercise: Navigating through a BLAST result</span>\n", - "\n", - "From the last BLAST result we have, fetch the full sequences of the top 10 hits that are not predicted sequences." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "<a id=\"beyond\"></a>\n", - "## Beyond Biopython\n", - "\n", - "Biopython is not the only useful 3rd party Python library out there. 
There is still much ground it does not cover, which is covered by other modules. Some of the modules in the bioinformatics space you may find useful:\n", - "\n", - "- [pysam](http://wwwfgu.anat.ox.ac.uk/~andreas/documentation/samtools/api.html): A samtools wrapper for parsing and writing SAM/BAM alignment files.\n", - "- [PyVCF](http://pyvcf.readthedocs.org/en/latest/): For working with VCF files.\n", - "- [track](http://xapple.github.io/track/): For working with genome tracks (e.g. BED tracks).\n", - "- [pybedtools](http://pythonhosted.org/pybedtools/): A bedtools wrapper for working with genome tracks.\n", - "- [metaseq](http://pythonhosted.org/metaseq/): A framework for exploring genomic data.\n", - "- [GEMINI](http://gemini.readthedocs.org/): A flexible framework for exploring genome variation.\n", - "- ..." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from IPython.core.display import HTML\n", - "def custom_style():\n", - " style = open('styles/notebook.css', 'r').read()\n", - " return HTML('<style>' + style + '</style>')\n", - "def custom_script():\n", - " script = open('styles/notebook.js', 'r').read()\n", - " return HTML('<script>' + script + '</script>')" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 7 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "custom_style()" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "html": [ - "<style>/*\n", - " https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers\n", - "*/\n", - "@font-face {\n", - " font-family: \"Computer Modern\";\n", - " src: url('http://mirrors.ctan.org/fonts/cm-unicode/fonts/otf/cmunss.otf');\n", - "}\n", - "div.cell{\n", - " width:800px;\n", - " margin-left:16% !important;\n", - " margin-right:auto;\n", - "}\n", - "h1 {\n", - " font-family: Helvetica, serif;\n", - "}\n", - "h4{\n", - " margin-top:12px;\n", - " margin-bottom: 3px;\n", - " }\n", - 
"div.text_cell_render{\n", - " font-family: Computer Modern, \"Helvetica Neue\", Arial, Helvetica, Geneva, sans-serif;\n", - " line-height: 145%;\n", - " font-size: 130%;\n", - " width:800px;\n", - " margin-left:auto;\n", - " margin-right:auto;\n", - "}\n", - ".CodeMirror{\n", - " font-family: \"Source Code Pro\", source-code-pro,Consolas, monospace;\n", - "}\n", - ".prompt{\n", - " display: None;\n", - "}\n", - ".text_cell_render .exercise {\n", - " font-weight: 300;\n", - " /*font-size: 22pt;*/\n", - " color: #4057A1;\n", - " font-style: italic;\n", - " /*margin-bottom: .5em;\n", - " margin-top: 0.5em;\n", - " display: block;*/\n", - "}\n", - ".text_cell_render .example {\n", - " font-weight: 300;\n", - " color: #40A157;\n", - " font-style: italic;\n", - "}\n", - "\n", - ".warning{\n", - " color: rgb( 240, 20, 20 )\n", - "}\n", - "</style>" - ], - "metadata": {}, - "output_type": "pyout", - "prompt_number": 8, - "text": [ - "<IPython.core.display.HTML at 0x2e55bd0>" - ] - } - ], - "prompt_number": 8 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "custom_script()" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "html": [ - "<script>// https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers\n", - "MathJax.Hub.Config({\n", - " TeX: {\n", - " extensions: [\"AMSmath.js\"]\n", - " },\n", - " tex2jax: {\n", - " inlineMath: [ ['$','$'], [\"\\\\(\",\"\\\\)\"] ],\n", - " displayMath: [ ['$$','$$'], [\"\\\\[\",\"\\\\]\"] ]\n", - " },\n", - " displayAlign: 'center', // Change this to 'center' to center equations.\n", - " \"HTML-CSS\": {\n", - " styles: {'.MathJax_Display': {\"margin\": 4}}\n", - " }\n", - " });\n", - "</script>" - ], - "metadata": {}, - "output_type": "pyout", - "prompt_number": 8, - "text": [ - "<IPython.core.display.HTML at 0x351cc10>" - ] - } - ], - "prompt_number": 8 - } - ], - "metadata": {} - } - ] -} \ No newline at end of file diff --git 
a/images/1590px-DNA_palindrome.svg.png b/images/1590px-DNA_palindrome.svg.png new file mode 100644 index 0000000000000000000000000000000000000000..0c05871f4cf5c1da20e4e24157df4f79c0e82e4a Binary files /dev/null and b/images/1590px-DNA_palindrome.svg.png differ diff --git a/images/ipynblogo.png b/images/ipynblogo.png new file mode 100644 index 0000000000000000000000000000000000000000..f7c7dd42fd6c4438b3b2b41dbec3eec292e88900 Binary files /dev/null and b/images/ipynblogo.png differ diff --git a/images/ipython-notebook-screenshot.jpg b/images/ipython-notebook-screenshot.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e4e3f4e4f30525596d59673af54c2101be156b0a Binary files /dev/null and b/images/ipython-notebook-screenshot.jpg differ diff --git a/images/ipython-screenshot.jpg b/images/ipython-screenshot.jpg new file mode 100644 index 0000000000000000000000000000000000000000..dba3e5ecb43bdf26b4543a434e4fe54d61b44d35 Binary files /dev/null and b/images/ipython-screenshot.jpg differ diff --git a/images/menubar_toolbar.png b/images/menubar_toolbar.png new file mode 100644 index 0000000000000000000000000000000000000000..a27b2759d956281b8a03081e75916c039da1d71b Binary files /dev/null and b/images/menubar_toolbar.png differ diff --git a/images/python-screenshot.jpg b/images/python-screenshot.jpg new file mode 100644 index 0000000000000000000000000000000000000000..01a9e6b93020f853a10dcc51fb6d035932e51430 Binary files /dev/null and b/images/python-screenshot.jpg differ diff --git a/styles/custom.css b/styles/custom.css index 2bfc5ba144979594e9c70763a667633d278f2254..b327eff98e8985fdddc5623f8e7d04badd9ba1da 100644 --- a/styles/custom.css +++ b/styles/custom.css @@ -1,6 +1,6 @@ /* Remove the vertical scrollbar added by nbconvert. */ -html { - overflow-y: hidden; +.reveal { + overflow: hidden; } /* Workaround some highlight.js bugs in language autodetection. */