Skip to main content

ucum/
parser.rs

1//! Total, bounded recursive-descent parser for the UCUM `c/s` grammar.
2//!
3//! The parser consumes its input strictly monotonically (every step either
4//! advances the cursor or unwinds the recursion), so it terminates on every
5//! input. A redundant step counter is wired in as a hard backstop: even if a
6//! future edit introduced a non-advancing loop, the parser would return a
7//! [`UcumError::Parse`] rather than hang. This totality guarantee is the single
8//! most important property of the crate.
9
10use crate::error::UcumError;
11use core::fmt;
12
13/// A node in the parsed unit expression tree.
14#[derive(Clone, Debug, PartialEq)]
15pub(crate) enum Node {
16    /// A bare integer factor (e.g. `1`, `100`). Dimensionless.
17    Factor(f64),
18    /// A simple unit: a token (`m`, `kg`, `[ft_i]`, `10*`) with an exponent.
19    /// Prefix detection is deferred to analysis; `sym` is the raw token.
20    Symbol { sym: String, exp: i32 },
21    /// Multiplication: `a.b`.
22    Mul(Box<Node>, Box<Node>),
23    /// Division: `a/b`.
24    Div(Box<Node>, Box<Node>),
25    /// Leading-slash reciprocal: `/term`.
26    Recip(Box<Node>),
27    /// A parenthesized sub-term, preserved for faithful re-display.
28    Group(Box<Node>),
29}
30
31/// A parsed UCUM unit expression (abstract syntax tree).
32///
33/// Produced by [`crate::parse`]. Exposed for advanced users who want to inspect
34/// or re-display an expression; most callers will use the higher-level
35/// functions ([`crate::analyze`], [`crate::convert`], …) directly.
36///
37/// The [`fmt::Display`] implementation re-serializes the expression in
38/// normalized UCUM syntax, dropping redundant parentheses while preserving
39/// those required by UCUM's left-associative grouping.
40#[derive(Clone, Debug, PartialEq)]
41pub struct UnitExpr {
42    pub(crate) root: Node,
43}
44
45impl UnitExpr {
46    /// Borrow the root node (crate-internal).
47    pub(crate) fn root_ref(&self) -> &Node {
48        &self.root
49    }
50}
51
52impl fmt::Display for UnitExpr {
53    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
54        write_node(&self.root, f, false)
55    }
56}
57
58/// Strip redundant grouping: a `Group` only ever wraps a sub-term, and the tree
59/// structure already encodes the grouping, so the explicit node carries no
60/// information for display.
61fn peel(mut node: &Node) -> &Node {
62    while let Node::Group(inner) = node {
63        node = inner;
64    }
65    node
66}
67
68/// Re-serialize a node in normalized UCUM syntax.
69///
70/// `paren_if_compound` requests parentheses when the node is a compound term
71/// (`.`, `/`, or leading-slash). Because UCUM operators are equal-precedence
72/// and left-associative, parentheses are only required around the *right*
73/// operand of a `.`/`/` (e.g. `kg/(m.s)` ≠ `kg/m.s`); left operands and atoms
74/// never need them. This drops redundant parentheses (`((m))` → `m`) while
75/// preserving every semantically necessary one.
76fn write_node(node: &Node, f: &mut fmt::Formatter<'_>, paren_if_compound: bool) -> fmt::Result {
77    match peel(node) {
78        Node::Factor(v) => {
79            if v.fract() == 0.0 && v.abs() < 1e15 {
80                write!(f, "{}", *v as i64)
81            } else {
82                write!(f, "{v}")
83            }
84        }
85        Node::Symbol { sym, exp } => {
86            if *exp == 1 {
87                f.write_str(sym)
88            } else {
89                write!(f, "{sym}{exp}")
90            }
91        }
92        Node::Mul(a, b) => write_compound(f, paren_if_compound, |f| {
93            write_node(a, f, false)?;
94            f.write_str(".")?;
95            write_node(b, f, true)
96        }),
97        Node::Div(a, b) => write_compound(f, paren_if_compound, |f| {
98            write_node(a, f, false)?;
99            f.write_str("/")?;
100            write_node(b, f, true)
101        }),
102        Node::Recip(t) => write_compound(f, paren_if_compound, |f| {
103            f.write_str("/")?;
104            write_node(t, f, false)
105        }),
106        // Unreachable: `peel` removed all `Group` wrappers.
107        Node::Group(_) => Ok(()),
108    }
109}
110
111fn write_compound<F>(f: &mut fmt::Formatter<'_>, parenthesize: bool, body: F) -> fmt::Result
112where
113    F: FnOnce(&mut fmt::Formatter<'_>) -> fmt::Result,
114{
115    if parenthesize {
116        f.write_str("(")?;
117        body(f)?;
118        f.write_str(")")
119    } else {
120        body(f)
121    }
122}
123
124/// Parse a UCUM expression into an AST. Total: never panics, never hangs.
125pub(crate) fn parse(input: &str) -> Result<UnitExpr, UcumError> {
126    let mut p = Parser::new(input);
127    let root = p.parse_main_term()?;
128    p.skip_done()?;
129    Ok(UnitExpr { root })
130}
131
132/// Maximum parenthesis nesting depth. Bounds parser (and downstream evaluator)
133/// recursion so that pathological input such as `((((…))))` returns an error
134/// instead of overflowing the stack.
135const MAX_DEPTH: u32 = 256;
136
137struct Parser<'a> {
138    src: &'a str,
139    b: &'a [u8],
140    pos: usize,
141    steps: u64,
142    max_steps: u64,
143    depth: u32,
144}
145
146impl<'a> Parser<'a> {
147    fn new(src: &'a str) -> Self {
148        let b = src.as_bytes();
149        // Generous linear backstop: the parser is already O(n); this only fires
150        // on a hypothetical non-advancing bug, never on legitimate input.
151        let max_steps = (b.len() as u64).saturating_mul(16).saturating_add(1024);
152        Parser {
153            src,
154            b,
155            pos: 0,
156            steps: 0,
157            max_steps,
158            depth: 0,
159        }
160    }
161
162    #[inline]
163    fn step(&mut self) -> Result<(), UcumError> {
164        self.steps += 1;
165        if self.steps > self.max_steps {
166            return Err(self.err("expression too complex (step limit exceeded)"));
167        }
168        Ok(())
169    }
170
171    #[inline]
172    fn peek(&self) -> Option<u8> {
173        self.b.get(self.pos).copied()
174    }
175
176    #[inline]
177    fn err(&self, msg: &str) -> UcumError {
178        UcumError::Parse {
179            pos: self.pos,
180            msg: msg.to_string(),
181        }
182    }
183
184    fn err_at(&self, pos: usize, msg: &str) -> UcumError {
185        UcumError::Parse {
186            pos,
187            msg: msg.to_string(),
188        }
189    }
190
191    /// `main-term := "/" term | term`
192    fn parse_main_term(&mut self) -> Result<Node, UcumError> {
193        self.step()?;
194        if self.peek() == Some(b'/') {
195            self.pos += 1;
196            let t = self.parse_term()?;
197            Ok(Node::Recip(Box::new(t)))
198        } else {
199            self.parse_term()
200        }
201    }
202
203    /// `term := component (("." | "/") component)*`  (left-associative)
204    fn parse_term(&mut self) -> Result<Node, UcumError> {
205        let mut left = self.parse_component()?;
206        loop {
207            self.step()?;
208            match self.peek() {
209                Some(b'.') => {
210                    self.pos += 1;
211                    let right = self.parse_component()?;
212                    left = Node::Mul(Box::new(left), Box::new(right));
213                }
214                Some(b'/') => {
215                    self.pos += 1;
216                    let right = self.parse_component()?;
217                    left = Node::Div(Box::new(left), Box::new(right));
218                }
219                _ => break,
220            }
221        }
222        Ok(left)
223    }
224
225    /// `component := "(" main-term ")" | annotation | annotatable annotation?`
226    fn parse_component(&mut self) -> Result<Node, UcumError> {
227        self.step()?;
228        match self.peek() {
229            None => Err(self.err("unexpected end of input, expected a unit")),
230            Some(b'(') => {
231                self.depth += 1;
232                if self.depth > MAX_DEPTH {
233                    return Err(self.err("expression nested too deeply"));
234                }
235                self.pos += 1;
236                let inner = self.parse_main_term()?;
237                if self.peek() != Some(b')') {
238                    return Err(self.err("expected ')'"));
239                }
240                self.pos += 1;
241                self.depth -= 1;
242                // A '(' group may carry a trailing annotation but never an exponent.
243                self.skip_annotation()?;
244                Ok(Node::Group(Box::new(inner)))
245            }
246            Some(b')') => Err(self.err("unexpected ')'")),
247            Some(b'{') => {
248                // Standalone annotation is dimensionless unity.
249                self.skip_annotation()?;
250                Ok(Node::Factor(1.0))
251            }
252            Some(_) => {
253                let start = self.pos;
254                let chunk = self.read_chunk()?;
255                let node = component_from_chunk(self.src, chunk, start)?;
256                self.skip_annotation()?;
257                Ok(node)
258            }
259        }
260    }
261
262    /// Read a simple-unit token: everything up to a bracket-depth-0 operator,
263    /// grouping paren, annotation, or end of input. The contents of `[...]`
264    /// (which may contain `.`, `/`, `(`, `)`, etc.) are consumed verbatim.
265    /// Returns the byte range of the chunk.
266    fn read_chunk(&mut self) -> Result<(usize, usize), UcumError> {
267        let start = self.pos;
268        while let Some(c) = self.peek() {
269            self.step()?;
270            match c {
271                b'[' => {
272                    // Consume up to and including the matching ']'.
273                    let bracket_start = self.pos;
274                    self.pos += 1;
275                    loop {
276                        match self.peek() {
277                            Some(b']') => {
278                                self.pos += 1;
279                                break;
280                            }
281                            Some(_) => self.pos += 1,
282                            None => {
283                                return Err(self.err_at(bracket_start, "unterminated '[' in unit"));
284                            }
285                        }
286                    }
287                }
288                b'.' | b'/' | b'(' | b')' | b'{' => break,
289                _ => self.pos += 1,
290            }
291        }
292        if self.pos == start {
293            return Err(self.err("expected a unit"));
294        }
295        Ok((start, self.pos))
296    }
297
298    /// Skip an optional `{annotation}`; annotations are dimensionless comments.
299    fn skip_annotation(&mut self) -> Result<(), UcumError> {
300        if self.peek() != Some(b'{') {
301            return Ok(());
302        }
303        let open = self.pos;
304        self.pos += 1;
305        loop {
306            self.step()?;
307            match self.peek() {
308                Some(b'}') => {
309                    self.pos += 1;
310                    return Ok(());
311                }
312                Some(b'{') => return Err(self.err("nested '{' in annotation")),
313                // UCUM annotations may only contain printable ASCII (33..=126),
314                // i.e. graphic characters other than the curly braces.
315                Some(c) if (33..=126).contains(&c) => self.pos += 1,
316                Some(_) => {
317                    return Err(self.err("annotation contains a non-ASCII or control character"));
318                }
319                None => return Err(self.err_at(open, "unterminated annotation '{'")),
320            }
321        }
322    }
323
324    /// Ensure all input was consumed.
325    fn skip_done(&mut self) -> Result<(), UcumError> {
326        match self.peek() {
327            None => Ok(()),
328            Some(_) => Err(self.err("unexpected trailing input")),
329        }
330    }
331}
332
333/// Split a chunk into a unit token + exponent (bracket-aware) and build a node.
334fn component_from_chunk(src: &str, range: (usize, usize), start: usize) -> Result<Node, UcumError> {
335    let chunk = &src[range.0..range.1];
336    let cb = chunk.as_bytes();
337
338    // Exponent digits/sign may only appear after the last ']' (if any).
339    let scan_from = chunk.rfind(']').map(|i| i + 1).unwrap_or(0);
340
341    // Consume a trailing run of digits.
342    let mut i = cb.len();
343    while i > scan_from && cb[i - 1].is_ascii_digit() {
344        i -= 1;
345    }
346    let had_digits = i < cb.len();
347    // Include an optional sign immediately before the digit run.
348    let exp_start = if had_digits && i > scan_from && (cb[i - 1] == b'+' || cb[i - 1] == b'-') {
349        i - 1
350    } else if had_digits {
351        i
352    } else {
353        cb.len()
354    };
355
356    let unit_part = &chunk[..exp_start];
357    let exp_str = &chunk[exp_start..];
358
359    if unit_part.is_empty() {
360        // Pure number: a dimensionless factor. A leading sign is not allowed.
361        if exp_str.starts_with('+') || exp_str.starts_with('-') {
362            return Err(UcumError::Parse {
363                pos: start,
364                msg: "a numeric factor may not be signed".to_string(),
365            });
366        }
367        let v: f64 = exp_str.parse().map_err(|_| UcumError::Parse {
368            pos: start,
369            msg: format!("invalid numeric factor '{exp_str}'"),
370        })?;
371        return Ok(Node::Factor(v));
372    }
373
374    let exp: i32 = if exp_str.is_empty() {
375        1
376    } else {
377        exp_str.parse().map_err(|_| UcumError::Parse {
378            pos: start + exp_start,
379            msg: format!("invalid exponent '{exp_str}'"),
380        })?
381    };
382
383    Ok(Node::Symbol {
384        sym: unit_part.to_string(),
385        exp,
386    })
387}