Skip to main content

gapsmith_core/
gpr.rs

1//! Gene-protein-reaction (GPR) Boolean expressions.
2//!
3//! Syntax accepted (matching cobrar / COBRApy conventions):
4//!
5//! - Leaves: bare identifiers `[A-Za-z_][A-Za-z0-9_\.\-]*` or quoted strings.
6//! - `and` (case-insensitive) or `&` — logical AND.
7//! - `or` (case-insensitive) or `|` — logical OR.
8//! - Parentheses for grouping.
9//!
10//! AND binds tighter than OR (textbook precedence). No NOT operator.
11//!
12//! A leaf is a [`GeneId`] reference; compound complexes become `And` nodes.
13//! Isoforms become `Or` nodes.
14
15use crate::GeneId;
16use serde::{Deserialize, Serialize};
17use std::fmt;
18
19#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
20#[serde(tag = "op", rename_all = "lowercase")]
21pub enum Gpr {
22    Gene { id: GeneId },
23    And { operands: Vec<Gpr> },
24    Or { operands: Vec<Gpr> },
25}
26
27impl Gpr {
28    pub fn gene(id: impl Into<GeneId>) -> Self {
29        Gpr::Gene { id: id.into() }
30    }
31
32    /// Flatten nested same-operator nodes and drop unary/empty collections.
33    pub fn normalize(self) -> Self {
34        match self {
35            g @ Gpr::Gene { .. } => g,
36            Gpr::And { operands } => {
37                let mut out = Vec::with_capacity(operands.len());
38                for op in operands {
39                    match op.normalize() {
40                        Gpr::And { operands: inner } => out.extend(inner),
41                        other => out.push(other),
42                    }
43                }
44                match out.len() {
45                    0 => Gpr::And { operands: vec![] },
46                    1 => out.into_iter().next().unwrap(),
47                    _ => Gpr::And { operands: out },
48                }
49            }
50            Gpr::Or { operands } => {
51                let mut out = Vec::with_capacity(operands.len());
52                for op in operands {
53                    match op.normalize() {
54                        Gpr::Or { operands: inner } => out.extend(inner),
55                        other => out.push(other),
56                    }
57                }
58                match out.len() {
59                    0 => Gpr::Or { operands: vec![] },
60                    1 => out.into_iter().next().unwrap(),
61                    _ => Gpr::Or { operands: out },
62                }
63            }
64        }
65    }
66
67    /// Collect all distinct gene ids referenced anywhere in the tree.
68    pub fn collect_genes(&self, out: &mut Vec<GeneId>) {
69        match self {
70            Gpr::Gene { id } => {
71                if !out.contains(id) {
72                    out.push(id.clone());
73                }
74            }
75            Gpr::And { operands } | Gpr::Or { operands } => {
76                for op in operands {
77                    op.collect_genes(out);
78                }
79            }
80        }
81    }
82}
83
84impl fmt::Display for Gpr {
85    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
86        match self {
87            Gpr::Gene { id } => f.write_str(id.as_str()),
88            Gpr::And { operands } => {
89                for (i, op) in operands.iter().enumerate() {
90                    if i > 0 {
91                        f.write_str(" and ")?;
92                    }
93                    write_grouped(f, op, matches!(op, Gpr::Or { .. }))?;
94                }
95                Ok(())
96            }
97            Gpr::Or { operands } => {
98                for (i, op) in operands.iter().enumerate() {
99                    if i > 0 {
100                        f.write_str(" or ")?;
101                    }
102                    write_grouped(f, op, false)?;
103                }
104                Ok(())
105            }
106        }
107    }
108}
109
110fn write_grouped(f: &mut fmt::Formatter<'_>, g: &Gpr, paren: bool) -> fmt::Result {
111    if paren {
112        write!(f, "({})", g)
113    } else {
114        write!(f, "{}", g)
115    }
116}
117
118#[derive(Debug, thiserror::Error)]
119pub enum GprParseError {
120    #[error("unexpected character '{ch}' at position {pos}")]
121    UnexpectedChar { ch: char, pos: usize },
122    #[error("unexpected end of input at position {pos}")]
123    UnexpectedEnd { pos: usize },
124    #[error("unclosed parenthesis opened at position {pos}")]
125    UnclosedParen { pos: usize },
126    #[error("empty expression")]
127    Empty,
128}
129
130// -- Parser --------------------------------------------------------------
131// Precedence: OR < AND. Right-associative AND/OR are fine because of flattening.
132// Grammar:
133//   expr   := and_expr ('or' and_expr)*
134//   and_expr := atom ('and' atom)*
135//   atom   := '(' expr ')' | IDENT | QUOTED
136//
137// Tokens: `(`, `)`, AND, OR, IDENT, QUOTED_IDENT.
138
139impl std::str::FromStr for Gpr {
140    type Err = GprParseError;
141    fn from_str(s: &str) -> Result<Self, Self::Err> {
142        let tokens = tokenize(s)?;
143        if tokens.is_empty() {
144            return Err(GprParseError::Empty);
145        }
146        let mut parser = Parser { tokens: &tokens, pos: 0 };
147        let expr = parser.parse_or()?;
148        if parser.pos != tokens.len() {
149            let (bad_pos, _) = tokens[parser.pos];
150            return Err(GprParseError::UnexpectedEnd { pos: bad_pos });
151        }
152        Ok(expr.normalize())
153    }
154}
155
156#[derive(Clone, Debug, Eq, PartialEq)]
157enum Tok {
158    LParen,
159    RParen,
160    And,
161    Or,
162    Ident(String),
163}
164
165fn tokenize(s: &str) -> Result<Vec<(usize, Tok)>, GprParseError> {
166    let mut out = Vec::new();
167    let bytes = s.as_bytes();
168    let mut i = 0;
169    while i < bytes.len() {
170        let c = bytes[i] as char;
171        match c {
172            ' ' | '\t' | '\n' | '\r' => {
173                i += 1;
174            }
175            '(' => {
176                out.push((i, Tok::LParen));
177                i += 1;
178            }
179            ')' => {
180                out.push((i, Tok::RParen));
181                i += 1;
182            }
183            '&' => {
184                out.push((i, Tok::And));
185                i += 1;
186            }
187            '|' => {
188                out.push((i, Tok::Or));
189                i += 1;
190            }
191            '\'' | '"' => {
192                let quote = c;
193                let start = i + 1;
194                i += 1;
195                while i < bytes.len() && (bytes[i] as char) != quote {
196                    i += 1;
197                }
198                if i >= bytes.len() {
199                    return Err(GprParseError::UnexpectedEnd { pos: start });
200                }
201                let id: String = s[start..i].to_string();
202                out.push((start, Tok::Ident(id)));
203                i += 1; // closing quote
204            }
205            ch if is_ident_start(ch) => {
206                let start = i;
207                while i < bytes.len() && is_ident_cont(bytes[i] as char) {
208                    i += 1;
209                }
210                let slice = &s[start..i];
211                let lower = slice.to_ascii_lowercase();
212                let tok = match lower.as_str() {
213                    "and" => Tok::And,
214                    "or" => Tok::Or,
215                    _ => Tok::Ident(slice.to_string()),
216                };
217                out.push((start, tok));
218            }
219            other => return Err(GprParseError::UnexpectedChar { ch: other, pos: i }),
220        }
221    }
222    Ok(out)
223}
224
225fn is_ident_start(c: char) -> bool {
226    c.is_ascii_alphabetic() || c == '_'
227}
228
229fn is_ident_cont(c: char) -> bool {
230    c.is_ascii_alphanumeric() || matches!(c, '_' | '.' | '-' | ':' | '+')
231}
232
233struct Parser<'a> {
234    tokens: &'a [(usize, Tok)],
235    pos: usize,
236}
237
238impl<'a> Parser<'a> {
239    fn peek(&self) -> Option<&'a Tok> {
240        self.tokens.get(self.pos).map(|(_, t)| t)
241    }
242    fn peek_pos(&self) -> usize {
243        self.tokens
244            .get(self.pos)
245            .map(|(p, _)| *p)
246            .unwrap_or_else(|| self.tokens.last().map(|(p, _)| *p + 1).unwrap_or(0))
247    }
248    fn bump(&mut self) -> &'a Tok {
249        let t = &self.tokens[self.pos].1;
250        self.pos += 1;
251        t
252    }
253
254    fn parse_or(&mut self) -> Result<Gpr, GprParseError> {
255        let mut left = self.parse_and()?;
256        while matches!(self.peek(), Some(Tok::Or)) {
257            self.bump();
258            let right = self.parse_and()?;
259            left = match left {
260                Gpr::Or { mut operands } => {
261                    operands.push(right);
262                    Gpr::Or { operands }
263                }
264                other => Gpr::Or { operands: vec![other, right] },
265            };
266        }
267        Ok(left)
268    }
269
270    fn parse_and(&mut self) -> Result<Gpr, GprParseError> {
271        let mut left = self.parse_atom()?;
272        while matches!(self.peek(), Some(Tok::And)) {
273            self.bump();
274            let right = self.parse_atom()?;
275            left = match left {
276                Gpr::And { mut operands } => {
277                    operands.push(right);
278                    Gpr::And { operands }
279                }
280                other => Gpr::And { operands: vec![other, right] },
281            };
282        }
283        Ok(left)
284    }
285
286    fn parse_atom(&mut self) -> Result<Gpr, GprParseError> {
287        match self.peek() {
288            Some(Tok::LParen) => {
289                let open_pos = self.peek_pos();
290                self.bump();
291                let expr = self.parse_or()?;
292                match self.peek() {
293                    Some(Tok::RParen) => {
294                        self.bump();
295                        Ok(expr)
296                    }
297                    _ => Err(GprParseError::UnclosedParen { pos: open_pos }),
298                }
299            }
300            Some(Tok::Ident(s)) => {
301                let id = s.clone();
302                self.bump();
303                Ok(Gpr::gene(id))
304            }
305            Some(Tok::And) | Some(Tok::Or) | Some(Tok::RParen) => {
306                Err(GprParseError::UnexpectedChar { ch: '?', pos: self.peek_pos() })
307            }
308            None => Err(GprParseError::UnexpectedEnd { pos: self.peek_pos() }),
309        }
310    }
311}
312
313#[cfg(test)]
314mod tests {
315    use super::*;
316    use std::str::FromStr;
317
318    #[test]
319    fn single_gene() {
320        let g: Gpr = "b0001".parse().unwrap();
321        assert_eq!(g, Gpr::gene("b0001"));
322    }
323
324    #[test]
325    fn precedence_and_binds_tighter() {
326        let g: Gpr = "a and b or c and d".parse().unwrap();
327        assert_eq!(
328            g,
329            Gpr::Or {
330                operands: vec![
331                    Gpr::And { operands: vec![Gpr::gene("a"), Gpr::gene("b")] },
332                    Gpr::And { operands: vec![Gpr::gene("c"), Gpr::gene("d")] },
333                ]
334            }
335        );
336    }
337
338    #[test]
339    fn parens_override_precedence() {
340        let g: Gpr = "(a or b) and (c or d)".parse().unwrap();
341        assert_eq!(
342            g,
343            Gpr::And {
344                operands: vec![
345                    Gpr::Or { operands: vec![Gpr::gene("a"), Gpr::gene("b")] },
346                    Gpr::Or { operands: vec![Gpr::gene("c"), Gpr::gene("d")] },
347                ]
348            }
349        );
350    }
351
352    #[test]
353    fn symbolic_operators() {
354        let g: Gpr = "a & (b | c)".parse().unwrap();
355        assert_eq!(
356            g,
357            Gpr::And {
358                operands: vec![
359                    Gpr::gene("a"),
360                    Gpr::Or { operands: vec![Gpr::gene("b"), Gpr::gene("c")] },
361                ]
362            }
363        );
364    }
365
366    #[test]
367    fn collect_genes_unique() {
368        let g: Gpr = "a and (b or a or c)".parse().unwrap();
369        let mut v = vec![];
370        g.collect_genes(&mut v);
371        assert_eq!(
372            v.iter().map(|g| g.as_str().to_string()).collect::<Vec<_>>(),
373            vec!["a", "b", "c"]
374        );
375    }
376
377    #[test]
378    fn display_roundtrip() {
379        let g: Gpr = "a and (b or c)".parse().unwrap();
380        let s = g.to_string();
381        let g2: Gpr = s.parse().unwrap();
382        assert_eq!(g, g2);
383    }
384
385    #[test]
386    fn empty_is_error() {
387        assert!(matches!(Gpr::from_str("   ").unwrap_err(), GprParseError::Empty));
388    }
389
390    #[test]
391    fn unclosed_paren_is_error() {
392        assert!(matches!(
393            Gpr::from_str("a and (b or c").unwrap_err(),
394            GprParseError::UnclosedParen { .. }
395        ));
396    }
397}