From 89c20190f1bdd4b03c9cae397c8ad938f9ecc201 Mon Sep 17 00:00:00 2001
From: Martin Larralde <martin.larralde@embl.de>
Date: Mon, 8 May 2023 20:39:00 +0200
Subject: [PATCH] Rewrite TRANSFAC parser to support out-of-order line entries

---
 lightmotif-transfac/src/lib.rs | 176 ++++++++++++++++++++-------------
 1 file changed, 107 insertions(+), 69 deletions(-)

diff --git a/lightmotif-transfac/src/lib.rs b/lightmotif-transfac/src/lib.rs
index 4d71b7b..ec36191 100644
--- a/lightmotif-transfac/src/lib.rs
+++ b/lightmotif-transfac/src/lib.rs
@@ -1,19 +1,29 @@
 #![doc = include_str!("../README.md")]
+#![allow(unused)]
 
-use std::str::FromStr;
-
+use nom::branch::alt;
 use nom::bytes::complete::is_a;
 use nom::bytes::complete::tag;
 use nom::bytes::complete::take_till;
+use nom::bytes::complete::take_until;
 use nom::bytes::complete::take_while;
 use nom::bytes::complete::take_while1;
+use nom::character::complete::anychar;
+use nom::character::complete::line_ending;
+use nom::character::complete::not_line_ending;
+use nom::character::complete::space0;
+use nom::character::streaming::space1;
 use nom::combinator::eof;
 use nom::combinator::map_res;
 use nom::error::Error;
 use nom::error::ErrorKind;
 use nom::multi::count;
+use nom::multi::many1;
 use nom::multi::many_till;
+use nom::multi::separated_list1;
 use nom::sequence::delimited;
+use nom::sequence::preceded;
+use nom::sequence::terminated;
 use nom::IResult;
 
 use lightmotif::Alphabet;
@@ -21,94 +31,171 @@ use lightmotif::CountMatrix;
 use lightmotif::DenseMatrix;
 use lightmotif::Symbol;
 
-fn is_newline(c: char) -> bool {
-    c == '\r' || c == '\n'
+/// A count matrix read from a TRANSFAC record.
+///
+/// NOTE(review): nothing in this hunk constructs or returns this type yet.
+pub struct TransfacMatrix<A: Alphabet, const K: usize> {
+    counts: CountMatrix<A, K>,
 }
 
-fn is_space(c: char) -> bool {
-    c == '\t' || c == ' '
-}
+/// Character-level helpers for the line parsers.
+mod utils {
+    use nom::branch::alt;
+    use nom::character::complete::line_ending;
+    use nom::character::complete::not_line_ending;
+    use nom::combinator::eof;
+    use nom::combinator::map_res;
+    use nom::sequence::delimited;
+    use nom::sequence::terminated;
+    use nom::IResult;
+
+    use std::str::FromStr;
 
-fn is_digit(c: char) -> bool {
-    c.is_digit(10)
+    /// Returns `true` if `c` is a carriage return or a line feed.
+    pub fn is_newline(c: char) -> bool {
+        c == '\r' || c == '\n'
+    }
 }
 
-fn parse_integer<N: FromStr>(input: &str) -> IResult<&str, N> {
-    map_res(take_while1(is_digit), N::from_str)(input)
+/// Consumes the rest of the current line together with its terminator.
+///
+/// Returns the text found before the terminator. The terminator is a line
+/// ending or, for a final line without one, the end of the input.
+fn parse_line(input: &str) -> IResult<&str, &str> {
+    terminated(take_till(utils::is_newline), alt((line_ending, eof)))(input)
 }
 
-fn parse_id(input: &str) -> IResult<&str, &str> {
-    let (input, _) = tag("ID")(input)?;
-    let (input, _) = take_while1(is_space)(input)?;
-    let (input, id) = take_till(is_newline)(input)?;
-    let (input, _) = take_while1(is_newline)(input)?;
-    Ok((input, id))
+/// Parses an `AC` line and returns its value, trimmed of surrounding whitespace.
+fn parse_ac(input: &str) -> IResult<&str, &str> {
+    let (input, line) = preceded(tag("AC"), parse_line)(input)?;
+    Ok((input, line.trim()))
 }
 
-fn parse_species(input: &str) -> IResult<&str, &str> {
-    let (input, _) = tag("BF")(input)?;
-    let (input, _) = take_while1(is_space)(input)?;
-    let (input, species) = take_till(is_newline)(input)?;
-    let (input, _) = take_while1(is_newline)(input)?;
-    Ok((input, species))
+/// Parses an `ID` line and returns its value, trimmed of surrounding whitespace.
+fn parse_id(input: &str) -> IResult<&str, &str> {
+    let (input, line) = preceded(tag("ID"), parse_line)(input)?;
+    Ok((input, line.trim()))
 }
 
-fn parse_symbol<S: Symbol>(input: &str) -> IResult<&str, S> {
-    if let Some(c) = input.chars().nth(0) {
-        match S::from_char(c) {
-            Ok(s) => Ok((&input[1..], s)),
-            Err(_) => Err(nom::Err::Failure(Error::new(input, ErrorKind::MapRes))),
-        }
-    } else {
-        Err(nom::Err::Error(Error::new(input, ErrorKind::Eof)))
-    }
+/// Parses a `BF` line and returns its value, trimmed of surrounding whitespace.
+fn parse_bf(input: &str) -> IResult<&str, &str> {
+    let (input, line) = preceded(tag("BF"), parse_line)(input)?;
+    Ok((input, line.trim()))
 }
 
+/// Parses the `P0`/`PO` header line that lists the matrix column symbols.
+///
+/// Returns the symbols in the order they are written. Blanks between the last
+/// symbol and the line ending are accepted.
 fn parse_alphabet<S: Symbol>(input: &str) -> IResult<&str, Vec<S>> {
-    let (input, _) = tag("P0")(input)?;
-    let (input, _) = take_while1(is_space)(input)?;
-    let (input, (symbols, _)) = many_till(
-        delimited(take_while(is_space), parse_symbol, take_while(is_space)),
-        is_a("\n\r"),
-    )(input)?;
-    let (input, _) = take_while(is_newline)(input)?;
-    Ok((input, symbols))
+    // Shadow the streaming `space1` imported at file level: every other parser
+    // here is a `complete` one, and a streaming parser can return `Incomplete`.
+    use nom::character::complete::space1;
+
+    delimited(
+        alt((tag("PO"), tag("P0"))),
+        preceded(
+            space1,
+            separated_list1(space1, map_res(anychar, S::from_char)),
+        ),
+        preceded(space0, line_ending),
+    )(input)
 }
 
+/// Parses one count row: a numeric row index followed by `k` counts.
+///
+/// Whatever follows the `k`-th count on the line is consumed and discarded.
 fn parse_row(input: &str, k: usize) -> IResult<&str, Vec<u32>> {
-    let (input, _) = take_while1(char::is_numeric)(input)?;
-    let (input, _) = take_while1(char::is_whitespace)(input)?;
-    let (input, counts) = count(
-        delimited(
-            take_while(is_space),
-            parse_integer::<u32>,
-            take_while(is_space),
-        ),
-        k,
-    )(input)?;
-    let (input, _) = take_till(is_newline)(input)?;
-    let (input, _) = take_while1(is_newline)(input)?;
-    Ok((input, counts))
+    delimited(
+        nom::character::complete::u32,
+        count(delimited(space0, nom::character::complete::u32, space0), k),
+        parse_line,
+    )(input)
 }
 
-pub fn parse_matrix<A: Alphabet, const K: usize>(input: &str) -> IResult<&str, CountMatrix<A, K>> {
-    let (input, _id) = parse_id(input)?;
-    let (input, _) = parse_species(input)?;
-    let (input, symbols) = parse_alphabet::<A::Symbol>(input)?;
-    let (input, (counts, _)) = many_till(|i| parse_row(i, symbols.len()), tag("XX"))(input)?;
-
-    let (input, _) = take_while1(char::is_whitespace)(input)?;
-    let (input, _) = tag("//")(input)?;
-    let (input, _) = take_while1(char::is_whitespace)(input)?;
+/// Recognizes the two-character tag at the start of a record line.
+///
+/// Fails on any tag other than the ones listed here.
+fn parse_tag(input: &str) -> IResult<&str, &str> {
+    alt((
+        tag("XX"),
+        tag("AC"),
+        tag("ID"),
+        tag("BF"),
+        tag("P0"),
+        tag("PO"),
+        tag("//"),
+    ))(input)
+}
 
-    let mut data = DenseMatrix::<u32, K>::new(counts.len());
-    for (i, count) in counts.iter().enumerate() {
-        for (s, &c) in symbols.iter().zip(count.iter()) {
-            data[i][s.as_index()] = c;
+/// Parses one TRANSFAC record into a count matrix.
+///
+/// Lines are dispatched on their two-character tag, so they may come in any
+/// order; parsing stops after the `//` terminator line. The `AC`, `ID` and
+/// `BF` values are parsed but not returned.
+///
+/// # Errors
+///
+/// Fails when a line starts with a tag `parse_tag` does not recognize, when a
+/// line is malformed, when no `P0`/`PO` block was seen before `//`, or when
+/// `CountMatrix::new` rejects the counts.
+pub fn parse_matrix<A: Alphabet, const K: usize>(
+    mut input: &str,
+) -> IResult<&str, CountMatrix<A, K>> {
+    let mut ac = None;
+    let mut id = None;
+    let mut bf = None;
+    let mut countmatrix = None;
+
+    loop {
+        match parse_tag(input)?.1 {
+            "XX" => {
+                let (rest, _) = parse_line(input)?;
+                input = rest;
+            }
+            "AC" => {
+                let (rest, value) = parse_ac(input)?;
+                ac = Some(value);
+                input = rest;
+            }
+            "ID" => {
+                let (rest, value) = parse_id(input)?;
+                id = Some(value);
+                input = rest;
+            }
+            "BF" => {
+                let (rest, value) = parse_bf(input)?;
+                bf = Some(value);
+                input = rest;
+            }
+            "P0" | "PO" => {
+                // the header gives the column order, the count rows follow it
+                let (rest, symbols) = parse_alphabet::<A::Symbol>(input)?;
+                let (rest, counts) = many1(|l| parse_row(l, symbols.len()))(rest)?;
+                input = rest;
+                // store each count in the column of its symbol
+                let mut data = DenseMatrix::<u32, K>::new(counts.len());
+                for (i, count) in counts.iter().enumerate() {
+                    for (s, &c) in symbols.iter().zip(count.iter()) {
+                        data[i][s.as_index()] = c;
+                    }
+                }
+                countmatrix = Some(data);
+            }
+            "//" => {
+                input = preceded(tag("//"), parse_line)(input)?.0;
+                break;
+            }
+            _ => unreachable!("parse_tag only yields the tags matched above"),
         }
     }
 
-    let matrix = CountMatrix::new(data).unwrap(); // FIXME
+    // A missing `P0`/`PO` block or rejected counts come from the input, so they
+    // are reported as parse failures rather than panics.
+    let data = countmatrix
+        .ok_or_else(|| nom::Err::Failure(Error::new(input, ErrorKind::Verify)))?;
+    let matrix = CountMatrix::new(data)
+        .map_err(|_| nom::Err::Failure(Error::new(input, ErrorKind::Verify)))?;
     Ok((input, matrix))
 }
 
@@ -136,9 +174,9 @@ mod test {
     }
 
     #[test]
-    fn test_parse_species() {
+    fn test_parse_bf() {
         let line = "BF Pseudomonas aeruginosa\n";
-        let res = super::parse_species(line).unwrap();
+        let res = super::parse_bf(line).unwrap(); // (remaining input, trimmed value)
         assert_eq!(res.0, "");
         assert_eq!(res.1, "Pseudomonas aeruginosa");
     }
-- 
GitLab