From 5c3f73c8a89a066b4c151186b7c00f0ce5141095 Mon Sep 17 00:00:00 2001 From: Martin Larralde <martin.larralde@embl.de> Date: Tue, 9 May 2023 14:16:13 +0200 Subject: [PATCH] Add support for parsing more metadata from TRANSFAC files --- lightmotif-transfac/src/lib.rs | 488 ++++++++++++++++++++++++++++----- 1 file changed, 417 insertions(+), 71 deletions(-) diff --git a/lightmotif-transfac/src/lib.rs b/lightmotif-transfac/src/lib.rs index 7ec4ea6..465b639 100644 --- a/lightmotif-transfac/src/lib.rs +++ b/lightmotif-transfac/src/lib.rs @@ -11,12 +11,14 @@ use nom::bytes::complete::take_until; use nom::bytes::complete::take_while; use nom::bytes::complete::take_while1; use nom::character::complete::anychar; +use nom::character::complete::char; use nom::character::complete::line_ending; use nom::character::complete::not_line_ending; use nom::character::complete::space0; use nom::character::streaming::space1; use nom::combinator::eof; use nom::combinator::map_res; +use nom::combinator::opt; use nom::error::Error; use nom::error::ErrorKind; use nom::multi::count; @@ -33,14 +35,67 @@ use lightmotif::CountMatrix; use lightmotif::DenseMatrix; use lightmotif::Symbol; -pub struct TransfacMatrix<A: Alphabet, const K: usize> { +#[derive(Clone, Debug)] +pub struct Matrix<A: Alphabet, const K: usize> { + id: Option<String>, + accession: Option<String>, + name: Option<String>, counts: CountMatrix<A, K>, + dates: Vec<Date>, + references: Vec<Reference>, + sites: Vec<String>, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DateKind { + Created, + Updated, +} + +#[derive(Debug, Clone)] +pub struct Date { + kind: DateKind, + author: String, + day: u8, + month: u8, + year: u16, +} + +#[derive(Clone, Debug)] +pub struct ReferenceNumber { + local: u32, + xref: Option<String>, +} + +impl ReferenceNumber { + pub fn new(local: u32) -> Self { + Self { local, xref: None } + } + + pub fn with_xref<X>(local: u32, xref: X) -> Self + where + X: Into<Option<String>>, + { + Self { + local, + xref: xref.into(), + } + } +} + +#[derive(Clone, Debug)] +pub struct Reference { + number: ReferenceNumber, + // authors: String, + title: Option<String>, + link: Option<String>, + pmid: Option<String>, } fn parse_line(input: &str) -> IResult<&str, &str> { match memchr::memchr(b'\n', input.as_bytes()) { - None => Err(nom::Err::Error(Error::new(input, ErrorKind::Verify))), - Some(i) if i == input.len() => Ok(("", input)), + None => Err(nom::Err::Error(Error::new(input, ErrorKind::Char))), + Some(i) if i == input.len() - 1 => Ok(("", input)), Some(i) => { let (line, rest) = input.split_at(i + 1); Ok((rest, line)) @@ -48,21 +103,6 @@ fn parse_line(input: &str) -> IResult<&str, &str> { } } -fn parse_ac(input: &str) -> IResult<&str, &str> { - let (input, line) = preceded(tag("AC"), parse_line)(input)?; - Ok((input, line.trim())) -} - -fn parse_id(input: &str) -> IResult<&str, &str> { - let (input, line) = preceded(tag("ID"), parse_line)(input)?; - Ok((input, line.trim())) -} - -fn parse_bf(input: &str) -> IResult<&str, &str> { - let (input, line) = preceded(tag("BF"), parse_line)(input)?; - Ok((input, line.trim())) -} - fn parse_alphabet<S: Symbol>(input: &str) -> IResult<&str, Vec<S>> { delimited( alt((tag("PO"), tag("P0"))), @@ -83,37 +123,180 @@ fn parse_row(input: &str, k: usize) -> IResult<&str, Vec<u32>> { } fn parse_tag(input: &str) -> IResult<&str, &str> { - nom::branch::alt(( - tag("BF"), - tag("ID"), - tag("XX"), - tag("P0"), - tag("PO"), - tag("//"), - ))(input) -} - -pub fn parse_matrix<A: Alphabet, const K: usize>( - mut input: &str, -) -> IResult<&str, CountMatrix<A, K>> { + let (rest, tag) = nom::bytes::complete::take(2usize)(input)?; + match tag { + "AC" | "BA" | "BS" | "BF" | "CC" | "CO" | "DT" | "ID" | "NA" | "P0" | "PO" | "RN" + | "XX" | "//" => Ok((rest, tag)), + _ => Err(nom::Err::Error(Error::new(input, ErrorKind::Alt))), + } +} + +fn parse_reference_number(input: &str) -> IResult<&str, ReferenceNumber> { + let (rest, number) = preceded( + terminated(tag("RN"), space0), + delimited(char('['), nom::character::complete::u32, char(']')), + )(input)?; + match opt(anychar)(rest)?.1 { + Some(';') => { + let (rest, xref) = delimited(char(';'), take_till(|c| c == '.'), char('.'))(rest)?; + let (rest, _) = parse_line(rest)?; + Ok(( + rest, + ReferenceNumber::with_xref(number, xref.trim().to_string()), + )) + } + _ => { + let (rest, _) = parse_line(input)?; + Ok((rest, ReferenceNumber::new(number))) + } + } +} + +fn parse_datekind(input: &str) -> IResult<&str, DateKind> { + match alt((tag("created"), tag("updated")))(input)? { + (rest, "created") => Ok((rest, DateKind::Created)), + (rest, "updated") => Ok((rest, DateKind::Updated)), + _ => unreachable!(), + } +} + +fn parse_date(input: &str) -> IResult<&str, Date> { + let (rest, _) = terminated(tag("DT"), space0)(input)?; + + let (rest, day) = terminated(nom::character::complete::u8, char('.'))(rest)?; + let (rest, month) = terminated(nom::character::complete::u8, char('.'))(rest)?; + let (rest, year) = nom::character::complete::u16(rest)?; + let (rest, _) = space0(rest)?; + + let (rest, kind) = delimited(char('('), parse_datekind, char(')'))(rest)?; + let (rest, author) = delimited( + char(';'), + preceded(space0, take_till(|c| c == '.')), + char('.'), + )(rest)?; + let (rest, _) = parse_line(rest)?; + + Ok(( + rest, + Date { + author: author.to_string(), + kind, + year, + month, + day, + }, + )) +} + +fn parse_reference(mut input: &str) -> IResult<&str, Reference> { + let mut pmid = None; + let mut link = None; + let mut title = None; + + let (rest, number) = parse_reference_number(input)?; + input = rest; + loop { + match nom::bytes::complete::take(2usize)(input)?.1 { + "RX" => { + let (rest, line) = preceded( + preceded( + terminated(tag("RX"), space0), + terminated(tag("PUBMED:"), space0), + ), + terminated(take_till(|c| c == '.'), char('.')), + )(input)?; + let (rest, _) = parse_line(rest)?; + pmid = Some(line.to_string()); + input = rest; + } + "RA" => { + let (rest, line) = preceded(tag("RA"), parse_line)(input)?; + // ra = Some(line.trim()); + input = rest; + } + "RL" => { + let (rest, line) = preceded(tag("RL"), parse_line)(input)?; + link = Some(line.trim().to_string()); + input = rest; + } + "RT" => { + let (rest, line) = preceded(tag("RT"), parse_line)(input)?; + title = Some(line.trim().to_string()); + input = rest; + } + _ => break, + } + } + + Ok(( + input, + Reference { + number, + pmid, + link, + title, + }, + )) +} + +pub fn parse_matrix<A: Alphabet, const K: usize>(mut input: &str) -> IResult<&str, Matrix<A, K>> { + let mut accession = None; + let mut ba = None; + let mut name = None; let mut id = None; - let mut bf = None; + let mut copyright = None; + let mut dates = Vec::new(); + let mut references = Vec::new(); + let mut comments = Vec::new(); + let mut sites = Vec::new(); + let mut factors = Vec::new(); let mut countmatrix = None; loop { match parse_tag(input)?.1 { - "XX" => { - let (rest, _) = parse_line(input)?; + "AC" => { + let (rest, line) = preceded(tag("AC"), parse_line)(input)?; + accession = Some(line.trim().to_string()); input = rest; } - "ID" => { - let (rest, line) = parse_id(input)?; - id = Some(line.trim()); + "BA" => { + let (rest, line) = preceded(tag("BA"), parse_line)(input)?; + ba = Some(line.trim().to_string()); + input = rest; + } + "BS" => { + let (rest, line) = preceded(tag("BS"), parse_line)(input)?; + sites.push(line.trim().to_string()); input = rest; } "BF" => { - let (rest, line) = parse_bf(input)?; - bf = Some(line.trim()); + let (rest, line) = preceded(tag("BF"), parse_line)(input)?; + factors.push(line.trim().to_string()); + input = rest; + } + "CC" => { + let (rest, lines) = many1(preceded(tag("CC"), parse_line))(input)?; + comments.push(lines.join(" ")); + input = rest; + } + "CO" => { + let (rest, line) = preceded(tag("CO"), parse_line)(input)?; + copyright = Some(line.trim().to_string()); + input = rest; + } + "DT" => { + let (rest, date) = parse_date(input)?; + dates.push(date); + input = rest; + } + "ID" => { + let (rest, line) = preceded(tag("ID"), parse_line)(input)?; + id = Some(line.trim().to_string()); + input = rest; + } + "NA" => { + let (rest, line) = preceded(tag("NA"), parse_line)(input)?; + name = Some(line.trim().to_string()); input = rest; } "P0" | "PO" => { @@ -121,7 +304,7 @@ pub fn parse_matrix<A: Alphabet, const K: usize>( let (rest, symbols) = parse_alphabet::<A::Symbol>(input)?; let (rest, counts) = many1(|l| parse_row(l, symbols.len()))(rest)?; input = rest; - // parse + // read counts into a dense matrix let mut data = DenseMatrix::<u32, K>::new(counts.len()); for (i, count) in counts.iter().enumerate() { for (s, &c) in symbols.iter().zip(count.iter()) { @@ -130,21 +313,36 @@ pub fn parse_matrix<A: Alphabet, const K: usize>( } countmatrix = Some(data); } + "RN" => { + let (rest, reference) = parse_reference(input)?; + references.push(reference); + input = rest; + } "//" => { input = preceded(tag("//"), parse_line)(input)?.0; break; } + "XX" => input = parse_line(input)?.0, _ => unreachable!(), } } - let matrix = CountMatrix::new(countmatrix.unwrap()).unwrap(); + let counts = CountMatrix::new(countmatrix.unwrap()).unwrap(); + let matrix = Matrix { + accession, + id, + name, + counts, + dates, + references, + sites, + }; Ok((input, matrix)) } pub fn parse_matrices<A: Alphabet, const K: usize>( input: &str, -) -> IResult<&str, Vec<CountMatrix<A, K>>> { +) -> IResult<&str, Vec<Matrix<A, K>>> { let (input, (matrices, _)) = many_till(parse_matrix, eof)(input)?; Ok((input, matrices)) } @@ -157,22 +355,6 @@ mod test { use lightmotif::Nucleotide; use lightmotif::Symbol; - #[test] - fn test_parse_id() { - let line = "ID prodoric_MX000001\n"; - let res = super::parse_id(line).unwrap(); - assert_eq!(res.0, ""); - assert_eq!(res.1, "prodoric_MX000001") - } - - #[test] - fn test_parse_bf() { - let line = "BF Pseudomonas aeruginosa\n"; - let res = super::parse_bf(line).unwrap(); - assert_eq!(res.0, ""); - assert_eq!(res.1, "Pseudomonas aeruginosa"); - } - #[test] fn test_parse_alphabet() { let line = "P0 A T G C\n"; @@ -202,7 +384,7 @@ mod test { } #[test] - fn test_parse_matrix() { + fn test_parse_prodoric() { let text = concat!( "ID prodoric_MX000001\n", "BF Pseudomonas aeruginosa\n", @@ -221,17 +403,181 @@ mod test { assert_eq!(res.0, ""); let matrix = res.1; - // assert_eq!(matrix.name, "prodoric_MX000001"); - assert_eq!(matrix.counts().rows(), 7); - assert_eq!(matrix.counts()[0][Nucleotide::A.as_index()], 0); - assert_eq!(matrix.counts()[0][Nucleotide::T.as_index()], 0); - assert_eq!(matrix.counts()[0][Nucleotide::G.as_index()], 2); - assert_eq!(matrix.counts()[0][Nucleotide::C.as_index()], 0); - assert_eq!(matrix.counts()[0][Nucleotide::N.as_index()], 0); - assert_eq!(matrix.counts()[5][Nucleotide::A.as_index()], 0); - assert_eq!(matrix.counts()[5][Nucleotide::T.as_index()], 1); - assert_eq!(matrix.counts()[5][Nucleotide::G.as_index()], 0); - assert_eq!(matrix.counts()[5][Nucleotide::C.as_index()], 1); - assert_eq!(matrix.counts()[5][Nucleotide::N.as_index()], 0); + assert_eq!(matrix.id, Some(String::from("prodoric_MX000001"))); + assert_eq!(matrix.counts.counts().rows(), 7); + assert_eq!(matrix.counts.counts()[0][Nucleotide::A.as_index()], 0); + assert_eq!(matrix.counts.counts()[0][Nucleotide::T.as_index()], 0); + assert_eq!(matrix.counts.counts()[0][Nucleotide::G.as_index()], 2); + assert_eq!(matrix.counts.counts()[0][Nucleotide::C.as_index()], 0); + assert_eq!(matrix.counts.counts()[0][Nucleotide::N.as_index()], 0); + assert_eq!(matrix.counts.counts()[5][Nucleotide::A.as_index()], 0); + assert_eq!(matrix.counts.counts()[5][Nucleotide::T.as_index()], 1); + assert_eq!(matrix.counts.counts()[5][Nucleotide::G.as_index()], 0); + assert_eq!(matrix.counts.counts()[5][Nucleotide::C.as_index()], 1); + assert_eq!(matrix.counts.counts()[5][Nucleotide::N.as_index()], 0); + } + + #[test] + fn test_parse_reference_number() { + let res = super::parse_reference_number("RN [1]\n").unwrap().1; + assert_eq!(res.local, 1); + assert!(res.xref.is_none()); + + let res = super::parse_reference_number("RN [2]; RE0000531.\n") + .unwrap() + .1; + assert_eq!(res.local, 2); + assert_eq!(res.xref, Some(String::from("RE0000531"))); + } + + #[test] + fn test_parse_reference() { + let text = concat!( + "RN [1]; RE0000231.\n", + "RX PUBMED: 1846322.\n", + "RA Sun X.-H., Baltimore D.\n", + "RT An inhibitory domain of E12 transcription factor prevents DNA binding in E12 homodimers but not in E12 heterodimers\n", + "RL Cell 64:459-470 (1991).\n", + "XX\n", + ); + let res = super::parse_reference(text).unwrap().1; + assert_eq!(res.number.local, 1); + assert_eq!(res.number.xref, Some(String::from("RE0000231"))); + assert_eq!(res.link, Some(String::from("Cell 64:459-470 (1991)."))); + assert_eq!(res.pmid, Some(String::from("1846322"))); + + let text = concat!( + "RN [1]\n", + "RA Biedenkapp H., Borgmeyer U., Sippel A., Klempnauer K.-H.;\n", + "RT Viral myb oncogene encodes a sequence-specific DNA-binding activity;\n", + "RL Nature 335:835-837 (1988).\n", + "XX\n", + ); + let res = super::parse_reference(text).unwrap().1; + assert_eq!(res.number.local, 1); + assert_eq!(res.number.xref, None); + assert_eq!(res.pmid, None); + assert_eq!( + res.title, + Some(String::from( + "Viral myb oncogene encodes a sequence-specific DNA-binding activity;" + )) + ); + assert_eq!(res.link, Some(String::from("Nature 335:835-837 (1988)."))); + } + + #[test] + fn test_parse_transfac_v2() { + let text = concat!( + "AC M00001\n", + "XX\n", + "DT 19.10.1992 (created); EWI.\n", + "XX\n", + "PO A C G T\n", + "01 1 2 2 0\n", + "02 2 1 2 0\n", + "03 3 0 1 1\n", + "04 0 5 0 0\n", + "05 5 0 0 0\n", + "06 0 0 4 1\n", + "07 0 1 4 0\n", + "08 0 0 0 5\n", + "09 0 0 5 0\n", + "10 0 1 2 2\n", + "11 0 2 0 3\n", + "12 1 0 3 1\n", + "XX\n", + "BF MyoD\n", + "XX\n", + "BA 5 functional elements in 3 genes\n", + "XX\n", + "CC Test comment.\n", + "XX\n", + "//\n", + ); + let res = super::parse_matrix::<Dna, { Dna::K }>(text).unwrap(); + assert_eq!(res.0, ""); + + let matrix = res.1; + assert_eq!(matrix.accession, Some(String::from("M00001"))); + assert_eq!(matrix.counts.counts().rows(), 12); + assert_eq!(matrix.counts.counts()[0][Nucleotide::A.as_index()], 1); + assert_eq!(matrix.counts.counts()[0][Nucleotide::T.as_index()], 0); + assert_eq!(matrix.counts.counts()[0][Nucleotide::G.as_index()], 2); + assert_eq!(matrix.counts.counts()[0][Nucleotide::C.as_index()], 2); + assert_eq!(matrix.counts.counts()[0][Nucleotide::N.as_index()], 0); + assert_eq!(matrix.counts.counts()[5][Nucleotide::A.as_index()], 0); + assert_eq!(matrix.counts.counts()[5][Nucleotide::T.as_index()], 1); + assert_eq!(matrix.counts.counts()[5][Nucleotide::G.as_index()], 4); + assert_eq!(matrix.counts.counts()[5][Nucleotide::C.as_index()], 0); + assert_eq!(matrix.counts.counts()[5][Nucleotide::N.as_index()], 0); + } + + #[test] + fn test_parse_transfac_v9() { + let text = concat!( + "AC M00030\n", + "XX\n", + "ID F$MATA1_01\n", + "XX\n", + "DT 18.10.1994 (created); ewi.\n", + "DT 16.10.1995 (updated); ewi.\n", + "CO Copyright (C), Biobase GmbH.\n", + "XX\n", + "NA MATa1\n", + "XX\n", + // "DE mating factor a1\n", + "XX\n", + "BF T00488; MATa1; Species: yeast, Saccharomyces cerevisiae.\n", + "XX\n", + "P0 A C G T\n", + "01 0 1 1 12 T\n", + "02 0 0 14 0 G\n", + "03 14 0 0 0 A\n", + "04 0 0 0 14 T\n", + "05 0 0 14 0 G\n", + "06 1 2 0 11 T\n", + "07 10 0 3 1 A\n", + "08 6 2 4 2 N\n", + "09 5 4 1 4 N\n", + "10 2 1 1 10 T\n", + "XX\n", + "BA a1 half-sites of 14 hsg operators of 4 genes\n", + "XX\n", + "BS TGATGTACTT; R05553; 1; 10;; p.\n", + "BS TGATGTAATC; R05554; 1; 10;; p.\n", + "BS TGATGTGTAA; R05555; 1; 10;; p.\n", + "BS TGATGCAGAA; R05556; 1; 10;; p.\n", + "BS TGATGAAGCG; R05557; 1; 10;; p.\n", + "BS TGATGTTAAT; R05558; 1; 10;; p.\n", + "BS TGATGTAAAT; R05559; 1; 10;; p.\n", + "BS TGATGTAACT; R05560; 1; 10;; p.\n", + "BS TGATGCAGTT; R05561; 1; 10;; p.\n", + "BS TGATGTGAAT; R05562; 1; 10;; p.\n", + "BS CGATGTGCTT; R05563; 1; 10;; p.\n", + "BS TGATGTATCT; R05564; 1; 10;; p.\n", + "BS GGATGTAACT; R05565; 1; 10;; p.\n", + "BS TGATGTAGGT; R05566; 1; 10;; p.\n", + "XX\n", + "CC compiled sequences\n", + "XX\n", + "RN [1]; RE0000546.\n", + "RX PUBMED: 7907979.\n", + "RA Goutte C., Johnson A. D.\n", + "RT Recognition of a DNA operator by a dimer composed of two different homeodomain proteins\n", + "RL EMBO J. 13:1434-1442 (1994).\n", + "XX\n", + "//\n", + ); + let res = super::parse_matrix::<Dna, { Dna::K }>(text).unwrap(); + let matrix = res.1; + assert_eq!(res.0, ""); + assert_eq!(matrix.name, Some(String::from("MATa1"))); + assert_eq!(matrix.dates.len(), 2); + assert_eq!(matrix.dates[0].author, "ewi"); + assert_eq!(matrix.dates[1].author, "ewi"); + assert_eq!(matrix.dates[1].day, 16); + assert_eq!(matrix.dates[1].month, 10); + assert_eq!(matrix.dates[1].year, 1995); } } -- GitLab