//! Lexer (tokenizer) for the structprop format.
//!
//! The lexer converts a raw `&str` into a flat sequence of [`Token`]s paired
//! with their 1-indexed source line numbers.  Comments and insignificant
//! whitespace are stripped.  The resulting token stream is consumed by
//! [`crate::parse()`].
//!
//! # Token rules
//!
//! | Input | Token produced |
//! |---|---|
//! | `=` | `Token::Eq` |
//! | `{` | `Token::Open` |
//! | `}` | `Token::Close` |
//! | `# … \n` | *(discarded)* |
//! | `"…"` | `Token::Term` with the quoted content |
//! | any other non-whitespace run | `Token::Term` |
//! | end of input | `Token::Eof` |
/// A single token produced by the structprop lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A bare or double-quoted string term.
    ///
    /// Bare terms end at whitespace or at one of the special characters
    /// `=`, `{`, `}`, and `#`.  Quoted terms may contain any character other
    /// than `"`; the format has no escape sequences, so a `"` always closes
    /// the quoted string.
    Term(String),

    /// The assignment operator `=`.
    Eq,

    /// An opening brace `{` introducing an array or object body.
    Open,

    /// A closing brace `}` terminating an array or object body.
    Close,

    /// A sentinel appended at the end of the token stream.
    Eof,
}
42
/// Internal lexer state machine states.
//
// Derives added per Rust convention: `Copy` is free for a fieldless enum and
// lets the lexer compare (`state == State::Term`) and match the state by
// value without clones; `Debug` aids diagnostics in tests.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Between tokens; skipping whitespace.
    Whitespace,
    /// Inside a `# …` line comment.
    Comment,
    /// Accumulating a bare (unquoted) term.
    Term,
    /// Accumulating a double-quoted term.
    Quoted,
}
55
56/// Lex a structprop `input` string into a flat [`Vec`] of [`Token`]s, each
57/// paired with its 1-indexed source line number.
58///
59/// Comments (`# … \n`) and insignificant whitespace (spaces, tabs, carriage
60/// returns, and newlines) are discarded.  The returned vector always ends with
61/// [`Token::Eof`].
62///
63/// # Errors
64///
65/// Returns [`crate::Error::Parse`] if the input contains [`u32::MAX`] or
66/// more newlines (i.e. the file exceeds [`u32::MAX`] lines).
67///
68/// # Examples
69///
70/// ```
71/// use serde_structprop::lexer::{tokenize, Token};
72///
73/// let tokens = tokenize("key = value").unwrap();
74/// assert_eq!(tokens, vec![
75///     (Token::Term("key".into()), 1),
76///     (Token::Eq, 1),
77///     (Token::Term("value".into()), 1),
78///     (Token::Eof, 1),
79/// ]);
80/// ```
81pub fn tokenize(input: &str) -> crate::error::Result<Vec<(Token, u32)>> {
82    let mut tokens = Vec::new();
83    let mut state = State::Whitespace;
84    let mut buf = String::new();
85    let mut line = 1u32;
86    let mut token_line = 1u32;
87
88    for ch in input.chars() {
89        match state {
90            State::Whitespace => match ch {
91                '\n' => line = inc_line(line)?,
92                ' ' | '\t' | '\r' => {}
93                '#' => state = State::Comment,
94                '"' => {
95                    token_line = line;
96                    state = State::Quoted;
97                }
98                '=' => tokens.push((Token::Eq, line)),
99                '{' => tokens.push((Token::Open, line)),
100                '}' => tokens.push((Token::Close, line)),
101                _ => {
102                    token_line = line;
103                    buf.push(ch);
104                    state = State::Term;
105                }
106            },
107            State::Quoted => {
108                if ch == '"' {
109                    tokens.push((Token::Term(buf.clone()), token_line));
110                    buf.clear();
111                    state = State::Whitespace;
112                } else {
113                    if ch == '\n' {
114                        line = inc_line(line)?;
115                    }
116                    buf.push(ch);
117                }
118            }
119            State::Comment => {
120                if ch == '\n' {
121                    line = inc_line(line)?;
122                    state = State::Whitespace;
123                }
124            }
125            State::Term => {
126                flush_term_char(
127                    ch,
128                    &mut buf,
129                    &mut tokens,
130                    &mut line,
131                    &mut token_line,
132                    &mut state,
133                )?;
134            }
135        }
136    }
137
138    // Flush any term that extends to the very end of the input.
139    if state == State::Term {
140        let term = buf.trim().to_owned();
141        if !term.is_empty() {
142            tokens.push((Token::Term(term), token_line));
143        }
144    }
145
146    tokens.push((Token::Eof, line));
147    Ok(tokens)
148}
149
150/// Increment a line counter, returning an error if it would overflow.
151fn inc_line(line: u32) -> crate::error::Result<u32> {
152    line.checked_add(1).ok_or_else(|| {
153        crate::error::Error::Parse("file exceeds maximum line count (u32::MAX)".to_owned())
154    })
155}
156
157/// Handle one character while in the `Term` state, flushing the accumulated
158/// buffer and emitting punctuation tokens as needed.
159fn flush_term_char(
160    ch: char,
161    buf: &mut String,
162    tokens: &mut Vec<(Token, u32)>,
163    line: &mut u32,
164    token_line: &mut u32,
165    state: &mut State,
166) -> crate::error::Result<()> {
167    match ch {
168        '\n' => {
169            flush_buf(buf, tokens, *token_line);
170            *line = inc_line(*line)?;
171            *state = State::Whitespace;
172        }
173        '#' | ' ' | '\t' | '\r' => {
174            flush_buf(buf, tokens, *token_line);
175            *state = if ch == '#' {
176                State::Comment
177            } else {
178                State::Whitespace
179            };
180        }
181        '=' => {
182            flush_buf(buf, tokens, *token_line);
183            tokens.push((Token::Eq, *line));
184            *state = State::Whitespace;
185        }
186        '{' => {
187            flush_buf(buf, tokens, *token_line);
188            tokens.push((Token::Open, *line));
189            *state = State::Whitespace;
190        }
191        '}' => {
192            flush_buf(buf, tokens, *token_line);
193            tokens.push((Token::Close, *line));
194            *state = State::Whitespace;
195        }
196        _ => buf.push(ch),
197    }
198    Ok(())
199}
200
201/// Drain `buf` into a `Token::Term` if non-empty.
202fn flush_buf(buf: &mut String, tokens: &mut Vec<(Token, u32)>, token_line: u32) {
203    let term = buf.trim().to_owned();
204    if !term.is_empty() {
205        tokens.push((Token::Term(term), token_line));
206    }
207    buf.clear();
208}
209
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand: tokenize and unwrap (these inputs must always lex cleanly).
    fn lex(input: &str) -> Vec<(Token, u32)> {
        tokenize(input).unwrap()
    }

    /// Shorthand for building an expected `Term` entry.
    fn term(s: &str, line: u32) -> (Token, u32) {
        (Token::Term(s.to_owned()), line)
    }

    #[test]
    fn basic_kv() {
        assert_eq!(
            lex("key = value"),
            [term("key", 1), (Token::Eq, 1), term("value", 1), (Token::Eof, 1)],
        );
    }

    #[test]
    fn quoted_value() {
        assert_eq!(
            lex(r#"key = "hello world""#),
            [term("key", 1), (Token::Eq, 1), term("hello world", 1), (Token::Eof, 1)],
        );
    }

    #[test]
    fn comment_stripped() {
        assert_eq!(
            lex("# comment\nkey = val"),
            [term("key", 2), (Token::Eq, 2), term("val", 2), (Token::Eof, 2)],
        );
    }

    #[test]
    fn array() {
        assert_eq!(
            lex("k = { 1 2 3 }"),
            [
                term("k", 1),
                (Token::Eq, 1),
                (Token::Open, 1),
                term("1", 1),
                term("2", 1),
                term("3", 1),
                (Token::Close, 1),
                (Token::Eof, 1),
            ],
        );
    }

    #[test]
    fn multiline_line_numbers() {
        assert_eq!(
            lex("a = 1\nb = 2\nc = 3\n"),
            [
                term("a", 1),
                (Token::Eq, 1),
                term("1", 1),
                term("b", 2),
                (Token::Eq, 2),
                term("2", 2),
                term("c", 3),
                (Token::Eq, 3),
                term("3", 3),
                (Token::Eof, 4),
            ],
        );
    }

    #[test]
    fn line_overflow_returns_error() {
        // Build a string with u32::MAX newlines — too large to actually
        // allocate, so we test with a saturated counter by constructing a
        // minimal reproduction using inc_line directly.
        assert!(inc_line(u32::MAX).is_err());
    }
}