From d264b7246182bdaafa0dd35e310708ceb440db85 Mon Sep 17 00:00:00 2001
From: Martin Larralde <martin.larralde@embl.de>
Date: Mon, 2 Sep 2024 16:42:40 +0200
Subject: [PATCH] Improve documentation of `lightmotif` Python module

---
 lightmotif-py/docs/api/loader.rst |   4 +-
 lightmotif-py/docs/conf.py        |   3 +-
 lightmotif-py/lightmotif/io.rs    | 151 ++++++++++++++++++++----------
 lightmotif-py/lightmotif/lib.rs   |  67 +++++++++++++
 4 files changed, 175 insertions(+), 50 deletions(-)

diff --git a/lightmotif-py/docs/api/loader.rst b/lightmotif-py/docs/api/loader.rst
index ba907d2..35e3021 100644
--- a/lightmotif-py/docs/api/loader.rst
+++ b/lightmotif-py/docs/api/loader.rst
@@ -3,4 +3,6 @@ Loader
 
 .. currentmodule:: lightmotif
 
-.. autoclass:: lightmotif.Loader
\ No newline at end of file
+.. autoclass:: lightmotif.Loader
+    :special-members: __init__, __iter__, __next__
+    :members:
\ No newline at end of file
diff --git a/lightmotif-py/docs/conf.py b/lightmotif-py/docs/conf.py
index 863dd34..57c3e8c 100644
--- a/lightmotif-py/docs/conf.py
+++ b/lightmotif-py/docs/conf.py
@@ -189,7 +189,7 @@ napoleon_use_rtype = False
 
 # -- Options for autodoc extension -------------------------------------------
 
-autoclass_content = "init"
+autoclass_content = "class"
 autodoc_member_order = 'groupwise'
 autosummary_generate = []
 
@@ -199,7 +199,6 @@ autosummary_generate = []
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3/", None),
     "psutil": ("https://psutil.readthedocs.io/en/latest/", None),
-    "biopython": ("https://biopython.org/docs/latest/api/", None),
     "numpy": ("https://numpy.org/doc/stable/", None)
 }
 
diff --git a/lightmotif-py/lightmotif/io.rs b/lightmotif-py/lightmotif/io.rs
index 4e256ca..2202619 100644
--- a/lightmotif-py/lightmotif/io.rs
+++ b/lightmotif-py/lightmotif/io.rs
@@ -31,9 +31,16 @@ fn convert_error(error: Error) -> PyErr {
 
 // --- JASPAR motif ------------------------------------------------------------
 
+/// A motif loaded from a JASPAR or JASPAR16 file.
+///
+/// The JASPAR database stores motifs with a FASTA-like header line containing
+/// the motif name and description, and one line per matrix column prefixed
+/// by the alphabet symbol that contains the count matrix.
+///
 #[pyclass(module = "lightmotif.lib", extends = Motif)]
 pub struct JasparMotif {
     #[pyo3(get)]
+    /// `str` or `None`: The description of the motif, if any.
     description: Option<String>,
 }
 
@@ -73,6 +80,7 @@ impl JasparMotif {
 
 // --- UniPROBE motif ----------------------------------------------------------
 
+/// A motif loaded from a UniPROBE file.
 #[pyclass(module = "lightmotif.lib", extends = Motif)]
 pub struct UniprobeMotif {}
 
@@ -98,13 +106,22 @@ impl UniprobeMotif {
 
 // --- TRANSFAC records --------------------------------------------------------
 
+/// A motif loaded from a TRANSFAC file.
+///
+/// The TRANSFAC database stores motif information in an EMBL-like file that
+/// contains the count matrix for the motif, as well as optional metadata
+/// such as accession, description, creation date or bibliography.
+///
 #[pyclass(module = "lightmotif.lib", extends = Motif)]
 pub struct TransfacMotif {
     #[pyo3(get)]
+    /// `str` or `None`: The identifier of the motif, if any.
     id: Option<String>,
     #[pyo3(get)]
+    /// `str` or `None`: The accession of the motif, if any.
     accession: Option<String>,
     #[pyo3(get)]
+    /// `str` or `None`: The description of the motif, if any.
     description: Option<String>,
 }
 
@@ -140,6 +157,7 @@ impl TransfacMotif {
 
 // --- Loader ------------------------------------------------------------------
 
+/// An iterator for loading motifs from a file.
 #[pyclass(module = "lightmotif.lib")]
 pub struct Loader {
     reader: Box<dyn Iterator<Item = PyResult<PyObject>> + Send>,
@@ -147,6 +165,72 @@ pub struct Loader {
 
 #[pymethods]
 impl Loader {
+    #[new]
+    #[pyo3(signature = (file, format="jaspar", *, protein=false))]
+    /// Create a new loader from the given parameters.
+    ///
+    /// Arguments:
+    ///     file (`os.PathLike` or file-like object): The file containing the
+    ///         motifs to load, as either a path to a filesystem location, or
+    ///         a file-like object open in binary mode.
+    ///     format (`str`): The format of the motif file. Supported formats
+    ///         are ``jaspar``, ``jaspar16``, ``uniprobe`` and ``transfac``.
+    ///     protein(`bool`): Set to `True` if the loader should be expecting
+    ///         a protein motif rather than a DNA motif.
+    ///
+    pub fn __init__(
+        file: Bound<PyAny>,
+        format: &str,
+        protein: bool,
+    ) -> PyResult<PyClassInitializer<Self>> {
+        let py = file.py();
+        let pathlike = py
+            .import_bound(pyo3::intern!(py, "os"))?
+            .call_method1(pyo3::intern!(py, "fsdecode"), (&file,));
+        let b: Box<dyn BufRead + Send> = if let Ok(path) = pathlike {
+            // NOTE(@althonos): In theory this is safe because `os.fsencode` encodes
+            //                  the PathLike object into the OS prefered encoding,
+            //                  which is was OsStr wants. In practice, there may be
+            //                  some weird bugs if that encoding is incorrect, idk...
+            let decoded = path.downcast::<PyString>()?;
+            std::fs::File::open(decoded.to_str()?)
+                .map(std::io::BufReader::new)
+                .map(Box::new)?
+        } else {
+            PyFileRead::from_ref(&file)
+                .map(std::io::BufReader::new)
+                .map(Box::new)?
+        };
+        let reader: Box<dyn Iterator<Item = PyResult<PyObject>> + Send> = match format {
+            "jaspar" if protein => {
+                return Err(PyValueError::new_err(
+                    "cannot read protein motifs from JASPAR format",
+                ))
+            }
+            "jaspar16" if protein => {
+                Box::new(lightmotif_io::jaspar16::read::<_, Protein>(b).map(JasparMotif::convert16))
+            }
+            "transfac" if protein => {
+                Box::new(lightmotif_io::transfac::read::<_, Protein>(b).map(TransfacMotif::convert))
+            }
+            "uniprobe" if protein => {
+                Box::new(lightmotif_io::uniprobe::read::<_, Protein>(b).map(UniprobeMotif::convert))
+            }
+            "jaspar" => Box::new(lightmotif_io::jaspar::read(b).map(JasparMotif::convert)),
+            "jaspar16" => {
+                Box::new(lightmotif_io::jaspar16::read::<_, Dna>(b).map(JasparMotif::convert16))
+            }
+            "transfac" => {
+                Box::new(lightmotif_io::transfac::read::<_, Dna>(b).map(TransfacMotif::convert))
+            }
+            "uniprobe" => {
+                Box::new(lightmotif_io::uniprobe::read::<_, Dna>(b).map(UniprobeMotif::convert))
+            }
+            _ => return Err(PyValueError::new_err(format!("invalid format: {}", format))),
+        };
+        Ok(PyClassInitializer::from(Loader { reader }))
+    }
+
     fn __iter__(slf: PyRef<Self>) -> PyRef<Self> {
         slf
     }
@@ -157,53 +241,26 @@ impl Loader {
 }
 
 /// Load the motifs contained in a file.
+///
+/// Arguments:
+///     file (`os.PathLike` or file-like object): The file containing the
+///         motifs to load, as either a path to a filesystem location, or
+///         a file-like object open in binary mode.
+///     format (`str`): The format of the motif file. Supported formats
+///         are ``jaspar``, ``jaspar16``, ``uniprobe`` and ``transfac``.
+///     protein(`bool`): Set to `True` if the loader should be expecting
+///         a protein motif rather than a DNA motif.
+///
+/// Returns:
+///     `~lightmotif.Loader`: A loader configured to load one or more `Motif`
+///     from the given file.
+///
 #[pyfunction]
 #[pyo3(signature = (file, format="jaspar", *, protein=false))]
-pub fn load(file: Bound<PyAny>, format: &str, protein: bool) -> PyResult<Loader> {
-    let py = file.py();
-    let pathlike = py
-        .import_bound(pyo3::intern!(py, "os"))?
-        .call_method1(pyo3::intern!(py, "fsdecode"), (&file,));
-    let b: Box<dyn BufRead + Send> = if let Ok(path) = pathlike {
-        // NOTE(@althonos): In theory this is safe because `os.fsencode` encodes
-        //                  the PathLike object into the OS prefered encoding,
-        //                  which is was OsStr wants. In practice, there may be
-        //                  some weird bugs if that encoding is incorrect, idk...
-        let decoded = path.downcast::<PyString>()?;
-        std::fs::File::open(decoded.to_str()?)
-            .map(std::io::BufReader::new)
-            .map(Box::new)?
-    } else {
-        PyFileRead::from_ref(&file)
-            .map(std::io::BufReader::new)
-            .map(Box::new)?
-    };
-    let reader: Box<dyn Iterator<Item = PyResult<PyObject>> + Send> = match format {
-        "jaspar" if protein => {
-            return Err(PyValueError::new_err(
-                "cannot read protein motifs from JASPAR format",
-            ))
-        }
-        "jaspar16" if protein => {
-            Box::new(lightmotif_io::jaspar16::read::<_, Protein>(b).map(JasparMotif::convert16))
-        }
-        "transfac" if protein => {
-            Box::new(lightmotif_io::transfac::read::<_, Protein>(b).map(TransfacMotif::convert))
-        }
-        "uniprobe" if protein => {
-            Box::new(lightmotif_io::uniprobe::read::<_, Protein>(b).map(UniprobeMotif::convert))
-        }
-        "jaspar" => Box::new(lightmotif_io::jaspar::read(b).map(JasparMotif::convert)),
-        "jaspar16" => {
-            Box::new(lightmotif_io::jaspar16::read::<_, Dna>(b).map(JasparMotif::convert16))
-        }
-        "transfac" => {
-            Box::new(lightmotif_io::transfac::read::<_, Dna>(b).map(TransfacMotif::convert))
-        }
-        "uniprobe" => {
-            Box::new(lightmotif_io::uniprobe::read::<_, Dna>(b).map(UniprobeMotif::convert))
-        }
-        _ => return Err(PyValueError::new_err(format!("invalid format: {}", format))),
-    };
-    Ok(Loader { reader })
+pub fn load<'py>(
+    file: Bound<'py, PyAny>,
+    format: &str,
+    protein: bool,
+) -> PyResult<Bound<'py, Loader>> {
+    Bound::new(file.py(), Loader::__init__(file, format, protein)?)
 }
diff --git a/lightmotif-py/lightmotif/lib.rs b/lightmotif-py/lightmotif/lib.rs
index 54453a7..bef9b87 100644
--- a/lightmotif-py/lightmotif/lib.rs
+++ b/lightmotif-py/lightmotif/lib.rs
@@ -265,6 +265,11 @@ impl EncodedSequence {
     }
 
     /// Convert this sequence into a striped matrix.
+    ///
+    /// Returns:
+    ///     `~lightmotif.StripedSequence`: The input sequence, stored in a
+    ///     column-major matrix.
+    ///
     pub fn stripe(&self) -> PyResult<StripedSequence> {
         match &self.data {
             EncodedSequenceData::Dna(dna) => Ok(StripedSequenceData::from(dna.to_striped()).into()),
@@ -1073,16 +1078,24 @@ impl From<lightmotif::scores::StripedScores<f32, C>> for StripedScores {
 
 // --- Motif -------------------------------------------------------------------
 
+/// A base object storing information about a motif.
 #[pyclass(module = "lightmotif.lib", subclass)]
 #[derive(Debug)]
 pub struct Motif {
     #[pyo3(get)]
+    /// `CountMatrix` or `None`: The count matrix for this motif.
+    ///
+    /// This may be `None` if the motif was loaded from a format that does
+    /// not store counts but frequencies, such as the ``uniprobe`` format.
     counts: Option<Py<CountMatrix>>,
     #[pyo3(get)]
+    /// `WeightMatrix`: The weight matrix for this motif.
     pwm: Py<WeightMatrix>,
     #[pyo3(get)]
+    /// `ScoringMatrix`: The scoring matrix for this motif.
     pssm: Py<ScoringMatrix>,
     #[pyo3(get)]
+    /// `str` or `None`: An optional name for the motif.
     name: Option<String>,
 }
 
@@ -1135,6 +1148,18 @@ impl Motif {
 
 // --- Scanner -----------------------------------------------------------------
 
+/// A fast scanner for identifying high scoring positions in a sequence.
+///
+/// This class internally uses a discretized version of the matrix to
+/// identify candidate positions, and then rescores blocks with the full
+/// algorithm only if needed. Using a `Scanner` is likely faster than
+/// using the `~ScoringMatrix.calculate` method for rare sites or high
+/// thresholds.
+///
+/// Note:
+///     This algorithm is only available for DNA motifs because of
+///     implementation requirements.
+///
 #[pyclass(module = "lightmotif.lib")]
 #[derive(Debug)]
 pub struct Scanner {
@@ -1200,12 +1225,15 @@ impl Scanner {
     }
 }
 
+/// A hit found by a `~lightmotif.Scanner`.
 #[pyclass(module = "lightmotif.lib")]
 #[derive(Debug)]
 pub struct Hit {
     #[pyo3(get)]
+    /// `int`: The index of the hit, zero-based.
     position: usize,
     #[pyo3(get)]
+    /// `float`: The score of the scoring matrix at the hit position.
     score: f32,
 }
 
@@ -1281,6 +1309,19 @@ pub fn create(sequences: Bound<PyAny>, protein: bool, name: Option<String>) -> P
 }
 
 /// Encode and stripe a text sequence.
+///
+/// Arguments:
+///     sequence (`str`): The sequence to encode and stripe.
+///     protein (`bool`): Pass `True` to treat the sequence as a protein
+///         sequence.
+///
+/// Returns:
+///     `~lightmotif.StripedSequence`: A striped sequence containing the
+///     sequence data.
+///
+/// Raises:
+///     `ValueError`: When the sequences contains an invalid character.
+///
 #[pyfunction]
 #[pyo3(signature = (sequence, protein=false))]
 pub fn stripe(sequence: Bound<PyString>, protein: bool) -> PyResult<StripedSequence> {
@@ -1291,6 +1332,32 @@ pub fn stripe(sequence: Bound<PyString>, protein: bool) -> PyResult<StripedSeque
 }
 
 /// Scan a sequence using a fast scanner implementation to identify hits.
+///
+/// See `~lightmotif.Scanner` for more information.
+///
+/// Arguments:
+///     pssm (`~lightmotif.ScoringMatrix`): The scoring matrix to use to
+///         score the sequence with.
+///     sequence (`~lightmotif.StripedSequence`): The striped sequence to
+///         process with the scanner. Longer sequences benifit more from the
+///         scanning implementation.
+///     threshold (`float`): The score threshold to use for filtering hits.
+///         Use `ScoringMatrix.score` to compute a score threshold from a
+///         target *p-value*. A higher threshold will result in less
+///         candidate hits and total runtime.
+///     
+/// Returns:
+///     `~lightmotif.Scanner`: The scanner for finding candidate hits in
+///     the target sequence.
+///  
+/// Raises:
+///     `ValueError`: When either ``pssm`` or ``sequence`` are not using
+///         the DNA alphabet.
+///
+/// Note:
+///     This algorithm is only available for DNA motifs because of
+///     implementation requirements.  
+///
 #[pyfunction]
 #[pyo3(signature = (pssm, sequence, threshold = 0.0, block_size = 256))]
 pub fn scan<'py>(
-- 
GitLab