diff --git a/lightmotif-io/Cargo.toml b/lightmotif-io/Cargo.toml index 15ab27425a780491c254b342b304eb3536fedd27..4d6da17b8d85f17ac2c1ea648db03589593fc761 100644 --- a/lightmotif-io/Cargo.toml +++ b/lightmotif-io/Cargo.toml @@ -16,6 +16,7 @@ keywords = ["bioinformatics", "motif", "parser", "transfac", "meme"] [dependencies] nom = "7" memchr = "2" +generic-array = "1.0" [dependencies.lightmotif] path = "../lightmotif" diff --git a/lightmotif-io/src/jaspar/mod.rs b/lightmotif-io/src/jaspar/mod.rs new file mode 100644 index 0000000000000000000000000000000000000000..a41b9b9bd4163a2e567b341a9edbc57dc603096d --- /dev/null +++ b/lightmotif-io/src/jaspar/mod.rs @@ -0,0 +1,134 @@ +//! Parser implementation for matrices in JASPAR (2016) format. +//! +//! The [JASPAR database](https://jaspar.elixir.no/docs/) stores manually +//! curated DNA-binding sites as count matrices. +//! +//! The JASPAR files contains a FASTA-like header line for each record, +//! followed by one line per symbol storing tab-separated counts at each +//! position. The "raw" format simply stores 4 lines corresponding to the +//! scores for the A, C, G and T letters: +//! ```text +//! >MA1104.2 GATA6 +//! 22320 20858 35360 5912 4535 2560 5044 76686 1507 1096 13149 18911 22172 +//! 16229 14161 13347 11831 62936 1439 1393 815 852 75930 3228 19054 17969 +//! 13432 11894 10394 7066 6459 580 615 819 456 712 1810 18153 11605 +//! 27463 32531 20343 54635 5514 74865 72392 1124 76629 1706 61257 23326 27698 +//! ``` + +use std::io::BufRead; + +use lightmotif::abc::Dna; +use lightmotif::pwm::CountMatrix; + +use crate::error::Error; + +mod parse; + +// --- + +#[derive(Debug, Clone)] +pub struct Record { + id: String, + description: Option<String>, + matrix: CountMatrix<Dna>, +} + +impl Record { + pub fn id(&self) -> &str { + &self.id + } + + pub fn description(&self) -> Option<&str> { + self.description.as_deref() + } + + pub fn matrix(&self) -> &CountMatrix<Dna> { + &self.matrix + } +} + +impl AsRef<CountMatrix<Dna>> for Record { + fn as_ref(&self) -> &CountMatrix<Dna> { + &self.matrix + } +} + +// --- + +pub struct Reader<B: BufRead> { + buffer: Vec<u8>, + bufread: B, + start: usize, +} + +impl<B: BufRead> Reader<B> { + pub fn new(mut reader: B) -> Self { + let mut buffer = Vec::new(); + let start = reader.read_until(b'>', &mut buffer).unwrap_or(1) - 1; + + Self { + bufread: reader, + buffer, + start, + } + } +} + +impl<B: BufRead> Iterator for Reader<B> { + type Item = Result<Record, Error>; + fn next(&mut self) -> Option<Self::Item> { + match self.bufread.read_until(b'>', &mut self.buffer) { + Ok(n) => { + let bytes = if n == 0 { + &self.buffer[self.start..] + } else { + &self.buffer[self.start..=self.start + n] + }; + let text = match std::str::from_utf8(bytes) { + Ok(text) => text, + Err(_) => { + return Some(Err(Error::Io(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "decoding error", + )))); + } + }; + if n == 0 && text.trim().is_empty() { + return None; + } + let (rest, record) = match self::parse::record(text) { + Err(e) => return Some(Err(Error::from(e))), + Ok((rest, record)) => (rest, record), + }; + self.start += n + 1 - rest.len(); + if self.start > self.buffer.capacity() / 2 { + let n = self.buffer.len(); + self.buffer.copy_within(self.start.., 0); + self.buffer.truncate(n - self.start); + self.start = 0; + } + Some(Ok(record)) + } + Err(e) => Some(Err(Error::from(e))), + } + } +} + +#[cfg(test)] +mod test { + + #[test] + fn test_single() { + let text = concat!( + ">MA1104.2 GATA6\n", + "22320 20858 35360 5912 4535 2560 5044 76686 1507 1096 13149 18911 22172\n", + "16229 14161 13347 11831 62936 1439 1393 815 852 75930 3228 19054 17969\n", + "13432 11894 10394 7066 6459 580 615 819 456 712 1810 18153 11605\n", + "27463 32531 20343 54635 5514 74865 72392 1124 76629 1706 61257 23326 27698\n", + ); + let mut reader = super::Reader::new(std::io::Cursor::new(text)); + let record = reader.next().unwrap().unwrap(); + assert_eq!(&record.id, "MA1104.2"); + assert!(reader.next().is_none()); + } +} diff --git a/lightmotif-io/src/jaspar/parse.rs b/lightmotif-io/src/jaspar/parse.rs new file mode 100644 index 0000000000000000000000000000000000000000..73cc1ddc42dd1565e2419cdf2c68b7e8014e20ac --- /dev/null +++ b/lightmotif-io/src/jaspar/parse.rs @@ -0,0 +1,174 @@ +//! Parser implementation for matrices in JASPAR (raw) format. +//! The [JASPAR database](https://jaspar.elixir.no/docs/) stores manually +//! curated DNA-binding sites as count matrices. +//! +//! The JASPAR files contains a FASTA-like header line for each record, +//! followed by one line per symbol storing tab-separated counts at each +//! position. The raw version only allows 4 lines per motif, which represent +//! the counts for the A, C, G and T nucleotides: +//! ```text +//! >MA1104.2 GATA6 +//! 22320 20858 35360 5912 4535 2560 5044 76686 1507 1096 13149 18911 22172 +//! 16229 14161 13347 11831 62936 1439 1393 815 852 75930 3228 19054 17969 +//! 13432 11894 10394 7066 6459 580 615 819 456 712 1810 18153 11605 +//! 27463 32531 20343 54635 5514 74865 72392 1124 76629 1706 61257 23326 27698 +//! ``` +//! +#![allow(unused)] + +use std::str::FromStr; + +use generic_array::GenericArray; +use lightmotif::abc::Alphabet; +use lightmotif::abc::Dna; +use lightmotif::abc::Nucleotide; +use lightmotif::abc::Symbol; +use lightmotif::dense::DefaultAlignment; +use lightmotif::dense::DenseMatrix; +use lightmotif::err::InvalidData; +use lightmotif::num::PowerOfTwo; +use lightmotif::num::Unsigned; +use lightmotif::num::U4; +use lightmotif::pwm::CountMatrix; +use lightmotif::pwm::FrequencyMatrix; +use nom::bytes::complete::tag; +use nom::bytes::complete::take_while; +use nom::character::complete::anychar; +use nom::character::complete::line_ending; +use nom::character::complete::not_line_ending; +use nom::character::complete::space0; +use nom::character::complete::space1; +use nom::character::complete::tab; +use nom::combinator::map; +use nom::combinator::map_res; +use nom::combinator::opt; +use nom::multi::many1; +use nom::multi::separated_list0; +use nom::sequence::delimited; +use nom::sequence::pair; +use nom::sequence::preceded; +use nom::sequence::separated_pair; +use nom::sequence::terminated; +use nom::IResult; + +use super::Record; + +pub fn counts(input: &str) -> IResult<&str, Vec<u32>> { + preceded( + opt(space1), + separated_list0(space1, nom::character::complete::u32), + )(input) +} + +pub fn matrix_column(input: &str) -> IResult<&str, Vec<u32>> { + terminated(counts, line_ending)(input) +} + +pub fn build_matrix<B: Unsigned + PowerOfTwo>( + input: GenericArray<Vec<u32>, U4>, + symbols: &[<Dna as Alphabet>::Symbol], +) -> Result<DenseMatrix<u32, <Dna as Alphabet>::K, B>, InvalidData> { + let mut matrix = DenseMatrix::new(input[0].len()); + for (counts, s) in input.as_slice().into_iter().zip(symbols) { + // check that array length is consistent + if counts.len() != matrix.rows() { + return Err(InvalidData); + } + // copy frequencies into matrix + for (i, x) in counts.into_iter().enumerate() { + matrix[i][s.as_index()] = *x + } + } + Ok(matrix) +} + +pub fn matrix<B: Unsigned + PowerOfTwo>( + input: &str, +) -> IResult<&str, DenseMatrix<u32, <Dna as Alphabet>::K, B>> { + let (input, a) = matrix_column(input)?; + let (input, c) = matrix_column(input)?; + let (input, g) = matrix_column(input)?; + let (input, t) = matrix_column(input)?; + + let len = a.len(); + let g = GenericArray::from([a, c, g, t]); + let symbols = &[Nucleotide::A, Nucleotide::C, Nucleotide::G, Nucleotide::T]; + + match build_matrix(g, symbols) { + Ok(x) => Ok((input, x)), + Err(e) => unimplemented!(), + } +} + +pub fn header(input: &str) -> IResult<&str, (&str, Option<&str>)> { + let (input, id) = preceded( + tag(">"), + nom::bytes::complete::take_while(|c: char| !c.is_ascii_whitespace()), + )(input)?; + + let (input, accession) = nom::bytes::complete::take_until("\n")(input)?; + let (input, _) = line_ending(input)?; + + let accession = accession.trim(); + if accession.is_empty() { + Ok((input, (id, None))) + } else { + Ok((input, (id, Some(accession)))) + } +} + +pub fn record(input: &str) -> IResult<&str, Record> { + let (input, (id, description)) = header(input)?; + let (input, matrix) = map_res(matrix::<DefaultAlignment>, CountMatrix::new)(input)?; + + Ok(( + input, + Record { + id: id.to_string(), + description: description.map(String::from), + matrix, + }, + )) +} + +#[cfg(test)] +mod tests { + use super::*; + + use lightmotif::abc::Dna; + use lightmotif::abc::Nucleotide; + + #[test] + fn test_matrix_column() { + let (rest, counts) = super::matrix_column("10 12 4 1 2 2 0 0 0 8 13\n").unwrap(); + assert_eq!(rest, ""); + assert_eq!(counts, vec![10, 12, 4, 1, 2, 2, 0, 0, 0, 8, 13]); + } + + #[test] + fn test_matrix() { + let (_rest, matrix) = super::matrix::<DefaultAlignment>(concat!( + "10 12 4 1 2 2 0 0 0 8 13\n", + " 2 2 7 1 0 8 0 0 1 2 2\n", + " 3 1 1 0 23 0 26 26 0 0 4\n", + "11 11 14 24 1 16 0 0 25 16 7\n", + )) + .unwrap(); + assert_eq!(&matrix[0][..4], &[10, 2, 11, 3]); + } + + #[test] + fn test_record() { + let (_rest, record) = super::record(concat!( + ">MA0002.1 RUNX1\n", + "10 12 4 1 2 2 0 0 0 8 13\n", + " 2 2 7 1 0 8 0 0 1 2 2\n", + " 3 1 1 0 23 0 26 26 0 0 4\n", + "11 11 14 24 1 16 0 0 25 16 7\n", + )) + .unwrap(); + assert_eq!(&record.id, "MA0002.1"); + assert_eq!(record.description.as_deref(), Some("RUNX1")); + assert_eq!(&record.matrix[0][..4], &[10, 2, 11, 3]); + } +} diff --git a/lightmotif-io/src/jaspar16/mod.rs b/lightmotif-io/src/jaspar16/mod.rs index d9698b3ea061c08f6b0089796605f579c2bdd255..16c9fe9618234e553e46d6acc57c0c8d2921c6c2 100644 --- a/lightmotif-io/src/jaspar16/mod.rs +++ b/lightmotif-io/src/jaspar16/mod.rs @@ -116,6 +116,10 @@ impl<B: BufRead, A: Alphabet> Iterator for Reader<B, A> { } } +pub fn read<B: BufRead, A: Alphabet>(reader: B) -> self::Reader<B, A> { + self::Reader::new(reader) +} + #[cfg(test)] mod test { diff --git a/lightmotif-io/src/lib.rs b/lightmotif-io/src/lib.rs index c1c7d643ebfa75b0bdef794a40a9b657ad7b2681..1980607bd126e0889c8b3156d1994a78c3466d77 100644 --- a/lightmotif-io/src/lib.rs +++ b/lightmotif-io/src/lib.rs @@ -4,6 +4,7 @@ extern crate lightmotif; pub mod error; +pub mod jaspar; pub mod jaspar16; pub mod transfac; pub mod uniprobe;