diff --git a/src/abc.rs b/src/abc.rs
new file mode 100644
index 0000000000000000000000000000000000000000..889a4f624cbffa099175f7371f506504b1caceae
--- /dev/null
+++ b/src/abc.rs
@@ -0,0 +1,68 @@
+use std::convert::TryFrom;
+use std::fmt;
+use std::fmt::Debug;
+
+/// Error returned when a character cannot be converted into a symbol.
+///
+/// Wraps the rejected character.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct InvalidSymbol(char);
+
+impl fmt::Display for InvalidSymbol {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "invalid symbol: {:?}", self.0)
+    }
+}
+
+impl std::error::Error for InvalidSymbol {}
+
+/// Common traits for a biological alphabet.
+pub trait Alphabet: Debug + Copy + Default {
+    /// The type of a single symbol of this alphabet.
+    type Symbol: Default + Sized + Copy + TryFrom<char>;
+}
+
+/// A symbol of the DNA alphabet, stored as a single byte.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+#[repr(u8)]
+pub enum DnaSymbol {
+    A = 0,
+    C = 1,
+    T = 2,
+    G = 3,
+    N = 4,
+}
+
+impl TryFrom<char> for DnaSymbol {
+    type Error = InvalidSymbol;
+
+    /// Convert an uppercase letter (`A`, `C`, `T`, `G` or `N`) to a symbol.
+    ///
+    /// Any other character, including lowercase letters, is rejected with
+    /// an [`InvalidSymbol`] carrying that character.
+    fn try_from(c: char) -> Result<Self, Self::Error> {
+        match c {
+            'A' => Ok(DnaSymbol::A),
+            'C' => Ok(DnaSymbol::C),
+            'T' => Ok(DnaSymbol::T),
+            'G' => Ok(DnaSymbol::G),
+            'N' => Ok(DnaSymbol::N),
+            _ => Err(InvalidSymbol(c)),
+        }
+    }
+}
+
+impl Default for DnaSymbol {
+    /// Return `DnaSymbol::N`.
+    fn default() -> DnaSymbol {
+        DnaSymbol::N
+    }
+}
+
+/// The DNA alphabet, whose symbols are [`DnaSymbol`] values.
+#[derive(Default, Debug, Clone, Copy)]
+pub struct DnaAlphabet;
+
+impl Alphabet for DnaAlphabet {
+    type Symbol = DnaSymbol;
+}
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index f3094a0838a76605b889a3955224405c0c1ed0b6..a6f37be27945f70525b081461078d65a869e28dc 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,5 @@
 //! Fast position-weight matrices using sequence striping and SIMD.
 
+mod abc;
 mod matrix;
-
+mod seq;
diff --git a/src/seq.rs b/src/seq.rs
new file mode 100644
index 0000000000000000000000000000000000000000..946fe62151615d173083c57543bf64fc1602986f
--- /dev/null
+++ b/src/seq.rs
@@ -0,0 +1,77 @@
+use std::convert::TryFrom;
+
+use super::abc::Alphabet;
+use super::abc::InvalidSymbol;
+use super::matrix::DenseMatrix;
+
+/// A sequence stored as a flat vector of symbols of the alphabet `A`.
+#[derive(Clone, Debug)]
+pub struct EncodedSequence<A: Alphabet> {
+    pub alphabet: A,
+    pub data: Vec<A::Symbol>,
+}
+
+impl<A: Alphabet> EncodedSequence<A> {
+    /// Create a new encoded sequence from a textual representation.
+    ///
+    /// # Errors
+    ///
+    /// Fails on the first character of `sequence` that cannot be converted
+    /// into an `A::Symbol`, returning that conversion error as an
+    /// [`InvalidSymbol`].
+    pub fn from_text(sequence: &str) -> Result<Self, InvalidSymbol>
+    where
+        InvalidSymbol: From<<A::Symbol as TryFrom<char>>::Error>,
+    {
+        let data = sequence
+            .chars()
+            .map(A::Symbol::try_from)
+            .collect::<Result<Vec<_>, _>>()?;
+        Ok(Self {
+            data,
+            alphabet: A::default(),
+        })
+    }
+
+    /// Convert the encoded sequence to a striped matrix.
+    ///
+    /// Symbol `i` of the sequence is written to `data[i % n][i / n]`, where
+    /// `n` is the size passed to `DenseMatrix::new`; every other cell keeps
+    /// the value given to it by `DenseMatrix::new`.
+    ///
+    /// # Panics
+    ///
+    /// Panics with a division by zero when `C` is `0`.
+    pub fn to_striped<const C: usize>(&self) -> StripedSequence<A, C> {
+        let length = self.data.len();
+        // NOTE(review): this is `length / C + 1`, which exceeds
+        // `ceil(length / C)` by one whenever `length` is a multiple of `C`;
+        // confirm that the spare row is intended before tightening it.
+        let n = (length + C) / C;
+        let mut data = DenseMatrix::new(n);
+        for (i, &x) in self.data.iter().enumerate() {
+            data[i % n][i / n] = x;
+        }
+        StripedSequence {
+            alphabet: self.alphabet,
+            data,
+            length,
+        }
+    }
+}
+
+/// A sequence laid out in a [`DenseMatrix`] by [`EncodedSequence::to_striped`].
+#[derive(Clone, Debug)]
+pub struct StripedSequence<A: Alphabet, const C: usize = 32> {
+    pub alphabet: A,
+    /// Number of symbols in the sequence, as recorded by `to_striped`.
+    pub length: usize,
+    pub data: DenseMatrix<A::Symbol, C>,
+}
+
+impl<A: Alphabet, const C: usize> From<EncodedSequence<A>> for StripedSequence<A, C> {
+    /// Stripe `encoded` with [`EncodedSequence::to_striped`].
+    fn from(encoded: EncodedSequence<A>) -> Self {
+        encoded.to_striped()
+    }
+}
\ No newline at end of file