From d264b7246182bdaafa0dd35e310708ceb440db85 Mon Sep 17 00:00:00 2001 From: Martin Larralde <martin.larralde@embl.de> Date: Mon, 2 Sep 2024 16:42:40 +0200 Subject: [PATCH] Improve documentation of `lightmotif` Python module --- lightmotif-py/docs/api/loader.rst | 4 +- lightmotif-py/docs/conf.py | 3 +- lightmotif-py/lightmotif/io.rs | 151 ++++++++++++++++++++---------- lightmotif-py/lightmotif/lib.rs | 67 +++++++++++++ 4 files changed, 175 insertions(+), 50 deletions(-) diff --git a/lightmotif-py/docs/api/loader.rst b/lightmotif-py/docs/api/loader.rst index ba907d2..35e3021 100644 --- a/lightmotif-py/docs/api/loader.rst +++ b/lightmotif-py/docs/api/loader.rst @@ -3,4 +3,6 @@ Loader .. currentmodule:: lightmotif -.. autoclass:: lightmotif.Loader \ No newline at end of file +.. autoclass:: lightmotif.Loader + :special-members: __init__, __iter__, __next__ + :members: \ No newline at end of file diff --git a/lightmotif-py/docs/conf.py b/lightmotif-py/docs/conf.py index 863dd34..57c3e8c 100644 --- a/lightmotif-py/docs/conf.py +++ b/lightmotif-py/docs/conf.py @@ -189,7 +189,7 @@ napoleon_use_rtype = False # -- Options for autodoc extension ------------------------------------------- -autoclass_content = "init" +autoclass_content = "class" autodoc_member_order = 'groupwise' autosummary_generate = [] @@ -199,7 +199,6 @@ autosummary_generate = [] intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), "psutil": ("https://psutil.readthedocs.io/en/latest/", None), - "biopython": ("https://biopython.org/docs/latest/api/", None), "numpy": ("https://numpy.org/doc/stable/", None) } diff --git a/lightmotif-py/lightmotif/io.rs b/lightmotif-py/lightmotif/io.rs index 4e256ca..2202619 100644 --- a/lightmotif-py/lightmotif/io.rs +++ b/lightmotif-py/lightmotif/io.rs @@ -31,9 +31,16 @@ fn convert_error(error: Error) -> PyErr { // --- JASPAR motif ------------------------------------------------------------ +/// A motif loaded from a JASPAR or JASPAR16 file. +/// +/// The JASPAR database stores motifs with a FASTA-like header line containing +/// the motif name and description, and one line per matrix column prefixed +/// by the alphabet symbol that contains the count matrix. +/// #[pyclass(module = "lightmotif.lib", extends = Motif)] pub struct JasparMotif { #[pyo3(get)] + /// `str` or `None`: The description of the motif, if any. description: Option<String>, } @@ -73,6 +80,7 @@ impl JasparMotif { // --- UniPROBE motif ---------------------------------------------------------- +/// A motif loaded from a UniPROBE file. #[pyclass(module = "lightmotif.lib", extends = Motif)] pub struct UniprobeMotif {} @@ -98,13 +106,22 @@ impl UniprobeMotif { // --- TRANSFAC records -------------------------------------------------------- +/// A motif loaded from a TRANSFAC file. +/// +/// The TRANSFAC database stores motif information in an EMBL-like file that +/// contains the count matrix for the motif, as well as optional metadata +/// such as accession, description, creation date or bibliography. +/// #[pyclass(module = "lightmotif.lib", extends = Motif)] pub struct TransfacMotif { #[pyo3(get)] + /// `str` or `None`: The identifier of the motif, if any. id: Option<String>, #[pyo3(get)] + /// `str` or `None`: The accession of the motif, if any. accession: Option<String>, #[pyo3(get)] + /// `str` or `None`: The description of the motif, if any. description: Option<String>, } @@ -140,6 +157,7 @@ impl TransfacMotif { // --- Loader ------------------------------------------------------------------ +/// An iterator for loading motifs from a file. #[pyclass(module = "lightmotif.lib")] pub struct Loader { reader: Box<dyn Iterator<Item = PyResult<PyObject>> + Send>, @@ -147,6 +165,72 @@ pub struct Loader { #[pymethods] impl Loader { + #[new] + #[pyo3(signature = (file, format="jaspar", *, protein=false))] + /// Create a new loader from the given parameters. + /// + /// Arguments: + /// file (`os.PathLike` or file-like object): The file containing the + /// motifs to load, as either a path to a filesystem location, or + /// a file-like object open in binary mode. + /// format (`str`): The format of the motif file. Supported formats + /// are ``jaspar``, ``jaspar16``, ``uniprobe`` and ``transfac``. + /// protein(`bool`): Set to `True` if the loader should be expecting + /// a protein motif rather than a DNA motif. + /// + pub fn __init__( + file: Bound<PyAny>, + format: &str, + protein: bool, + ) -> PyResult<PyClassInitializer<Self>> { + let py = file.py(); + let pathlike = py + .import_bound(pyo3::intern!(py, "os"))? + .call_method1(pyo3::intern!(py, "fsdecode"), (&file,)); + let b: Box<dyn BufRead + Send> = if let Ok(path) = pathlike { + // NOTE(@althonos): In theory this is safe because `os.fsencode` encodes + // the PathLike object into the OS prefered encoding, + // which is was OsStr wants. In practice, there may be + // some weird bugs if that encoding is incorrect, idk... + let decoded = path.downcast::<PyString>()?; + std::fs::File::open(decoded.to_str()?) + .map(std::io::BufReader::new) + .map(Box::new)? + } else { + PyFileRead::from_ref(&file) + .map(std::io::BufReader::new) + .map(Box::new)? + }; + let reader: Box<dyn Iterator<Item = PyResult<PyObject>> + Send> = match format { + "jaspar" if protein => { + return Err(PyValueError::new_err( + "cannot read protein motifs from JASPAR format", + )) + } + "jaspar16" if protein => { + Box::new(lightmotif_io::jaspar16::read::<_, Protein>(b).map(JasparMotif::convert16)) + } + "transfac" if protein => { + Box::new(lightmotif_io::transfac::read::<_, Protein>(b).map(TransfacMotif::convert)) + } + "uniprobe" if protein => { + Box::new(lightmotif_io::uniprobe::read::<_, Protein>(b).map(UniprobeMotif::convert)) + } + "jaspar" => Box::new(lightmotif_io::jaspar::read(b).map(JasparMotif::convert)), + "jaspar16" => { + Box::new(lightmotif_io::jaspar16::read::<_, Dna>(b).map(JasparMotif::convert16)) + } + "transfac" => { + Box::new(lightmotif_io::transfac::read::<_, Dna>(b).map(TransfacMotif::convert)) + } + "uniprobe" => { + Box::new(lightmotif_io::uniprobe::read::<_, Dna>(b).map(UniprobeMotif::convert)) + } + _ => return Err(PyValueError::new_err(format!("invalid format: {}", format))), + }; + Ok(PyClassInitializer::from(Loader { reader })) + } + fn __iter__(slf: PyRef<Self>) -> PyRef<Self> { slf } @@ -157,53 +241,26 @@ impl Loader { } /// Load the motifs contained in a file. +/// +/// Arguments: +/// file (`os.PathLike` or file-like object): The file containing the +/// motifs to load, as either a path to a filesystem location, or +/// a file-like object open in binary mode. +/// format (`str`): The format of the motif file. Supported formats +/// are ``jaspar``, ``jaspar16``, ``uniprobe`` and ``transfac``. +/// protein(`bool`): Set to `True` if the loader should be expecting +/// a protein motif rather than a DNA motif. +/// +/// Returns: +/// `~lightmotif.Loader`: A loader configured to load one or more `Motif` +/// from the given file. +/// #[pyfunction] #[pyo3(signature = (file, format="jaspar", *, protein=false))] -pub fn load(file: Bound<PyAny>, format: &str, protein: bool) -> PyResult<Loader> { - let py = file.py(); - let pathlike = py - .import_bound(pyo3::intern!(py, "os"))? - .call_method1(pyo3::intern!(py, "fsdecode"), (&file,)); - let b: Box<dyn BufRead + Send> = if let Ok(path) = pathlike { - // NOTE(@althonos): In theory this is safe because `os.fsencode` encodes - // the PathLike object into the OS prefered encoding, - // which is was OsStr wants. In practice, there may be - // some weird bugs if that encoding is incorrect, idk... - let decoded = path.downcast::<PyString>()?; - std::fs::File::open(decoded.to_str()?) - .map(std::io::BufReader::new) - .map(Box::new)? - } else { - PyFileRead::from_ref(&file) - .map(std::io::BufReader::new) - .map(Box::new)? - }; - let reader: Box<dyn Iterator<Item = PyResult<PyObject>> + Send> = match format { - "jaspar" if protein => { - return Err(PyValueError::new_err( - "cannot read protein motifs from JASPAR format", - )) - } - "jaspar16" if protein => { - Box::new(lightmotif_io::jaspar16::read::<_, Protein>(b).map(JasparMotif::convert16)) - } - "transfac" if protein => { - Box::new(lightmotif_io::transfac::read::<_, Protein>(b).map(TransfacMotif::convert)) - } - "uniprobe" if protein => { - Box::new(lightmotif_io::uniprobe::read::<_, Protein>(b).map(UniprobeMotif::convert)) - } - "jaspar" => Box::new(lightmotif_io::jaspar::read(b).map(JasparMotif::convert)), - "jaspar16" => { - Box::new(lightmotif_io::jaspar16::read::<_, Dna>(b).map(JasparMotif::convert16)) - } - "transfac" => { - Box::new(lightmotif_io::transfac::read::<_, Dna>(b).map(TransfacMotif::convert)) - } - "uniprobe" => { - Box::new(lightmotif_io::uniprobe::read::<_, Dna>(b).map(UniprobeMotif::convert)) - } - _ => return Err(PyValueError::new_err(format!("invalid format: {}", format))), - }; - Ok(Loader { reader }) +pub fn load<'py>( + file: Bound<'py, PyAny>, + format: &str, + protein: bool, +) -> PyResult<Bound<'py, Loader>> { + Bound::new(file.py(), Loader::__init__(file, format, protein)?) } diff --git a/lightmotif-py/lightmotif/lib.rs b/lightmotif-py/lightmotif/lib.rs index 54453a7..bef9b87 100644 --- a/lightmotif-py/lightmotif/lib.rs +++ b/lightmotif-py/lightmotif/lib.rs @@ -265,6 +265,11 @@ impl EncodedSequence { } /// Convert this sequence into a striped matrix. + /// + /// Returns: + /// `~lightmotif.StripedSequence`: The input sequence, stored in a + /// column-major matrix. + /// pub fn stripe(&self) -> PyResult<StripedSequence> { match &self.data { EncodedSequenceData::Dna(dna) => Ok(StripedSequenceData::from(dna.to_striped()).into()), @@ -1073,16 +1078,24 @@ impl From<lightmotif::scores::StripedScores<f32, C>> for StripedScores { // --- Motif ------------------------------------------------------------------- +/// A base object storing information about a motif. #[pyclass(module = "lightmotif.lib", subclass)] #[derive(Debug)] pub struct Motif { #[pyo3(get)] + /// `CountMatrix` or `None`: The count matrix for this motif. + /// + /// This may be `None` if the motif was loaded from a format that does + /// not store counts but frequencies, such as the ``uniprobe`` format. counts: Option<Py<CountMatrix>>, #[pyo3(get)] + /// `WeightMatrix`: The weight matrix for this motif. pwm: Py<WeightMatrix>, #[pyo3(get)] + /// `ScoringMatrix`: The scoring matrix for this motif. pssm: Py<ScoringMatrix>, #[pyo3(get)] + /// `str` or `None`: An optional name for the motif. name: Option<String>, } @@ -1135,6 +1148,18 @@ impl Motif { // --- Scanner ----------------------------------------------------------------- +/// A fast scanner for identifying high scoring positions in a sequence. +/// +/// This class internally uses a discretized version of the matrix to +/// identify candidate positions, and then rescores blocks with the full +/// algorithm only if needed. Using a `Scanner` is likely faster than +/// using the `~ScoringMatrix.calculate` method for rare sites or high +/// thresholds. +/// +/// Note: +/// This algorithm is only available for DNA motifs because of +/// implementation requirements. +/// #[pyclass(module = "lightmotif.lib")] #[derive(Debug)] pub struct Scanner { @@ -1200,12 +1225,15 @@ impl Scanner { } } +/// A hit found by a `~lightmotif.Scanner`. #[pyclass(module = "lightmotif.lib")] #[derive(Debug)] pub struct Hit { #[pyo3(get)] + /// `int`: The index of the hit, zero-based. position: usize, #[pyo3(get)] + /// `float`: The score of the scoring matrix at the hit position. score: f32, } @@ -1281,6 +1309,19 @@ pub fn create(sequences: Bound<PyAny>, protein: bool, name: Option<String>) -> P } /// Encode and stripe a text sequence. +/// +/// Arguments: +/// sequence (`str`): The sequence to encode and stripe. +/// protein (`bool`): Pass `True` to treat the sequence as a protein +/// sequence. +/// +/// Returns: +/// `~lightmotif.StripedSequence`: A striped sequence containing the +/// sequence data. +/// +/// Raises: +/// `ValueError`: When the sequences contains an invalid character. +/// #[pyfunction] #[pyo3(signature = (sequence, protein=false))] pub fn stripe(sequence: Bound<PyString>, protein: bool) -> PyResult<StripedSequence> { @@ -1291,6 +1332,32 @@ pub fn stripe(sequence: Bound<PyString>, protein: bool) -> PyResult<StripedSeque } /// Scan a sequence using a fast scanner implementation to identify hits. +/// +/// See `~lightmotif.Scanner` for more information. +/// +/// Arguments: +/// pssm (`~lightmotif.ScoringMatrix`): The scoring matrix to use to +/// score the sequence with. +/// sequence (`~lightmotif.StripedSequence`): The striped sequence to +/// process with the scanner. Longer sequences benifit more from the +/// scanning implementation. +/// threshold (`float`): The score threshold to use for filtering hits. +/// Use `ScoringMatrix.score` to compute a score threshold from a +/// target *p-value*. A higher threshold will result in less +/// candidate hits and total runtime. +/// +/// Returns: +/// `~lightmotif.Scanner`: The scanner for finding candidate hits in +/// the target sequence. +/// +/// Raises: +/// `ValueError`: When either ``pssm`` or ``sequence`` are not using +/// the DNA alphabet. +/// +/// Note: +/// This algorithm is only available for DNA motifs because of +/// implementation requirements. +/// #[pyfunction] #[pyo3(signature = (pssm, sequence, threshold = 0.0, block_size = 256))] pub fn scan<'py>( -- GitLab