// serde_structprop/lexer.rs
//! Lexer (tokenizer) for the structprop format.
//!
//! The lexer converts a raw `&str` into a flat sequence of [`Token`]s paired
//! with their 1-indexed source line numbers.  Comments and insignificant
//! whitespace are stripped.  The resulting token stream is consumed by
//! [`crate::parse()`].
//!
//! # Token rules
//!
//! | Input | Token produced |
//! |---|---|
//! | `=` | `Token::Eq` |
//! | `{` | `Token::Open` |
//! | `}` | `Token::Close` |
//! | `# … \n` | *(discarded)* |
//! | `"…"` | `Token::Term` with the quoted content |
//! | any other non-whitespace run | `Token::Term` |
//! | end of input | `Token::Eof` |
/// One lexical unit of the structprop grammar.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A string term, either bare or double-quoted.
    ///
    /// A bare term ends at whitespace or at one of the special characters
    /// `=`, `{`, `}`, and `#`.  A quoted term runs until the next `"`; the
    /// format defines no escape sequences, so a quoted term can never
    /// contain a double quote itself.
    Term(String),

    /// The `=` assignment operator.
    Eq,

    /// A `{` that opens an array or object body.
    Open,

    /// A `}` that closes an array or object body.
    Close,

    /// End-of-input sentinel appended after the last real token.
    Eof,
}
42
/// The lexer's current mode while scanning characters.
#[derive(PartialEq)]
enum State {
    /// Not inside any token; whitespace is skipped here.
    Whitespace,
    /// Everything up to the next newline belongs to a `#` comment.
    Comment,
    /// Collecting characters of a bare (unquoted) term.
    Term,
    /// Collecting characters between double quotes.
    Quoted,
}
55
56/// Lex a structprop `input` string into a flat [`Vec`] of [`Token`]s, each
57/// paired with its 1-indexed source line number.
58///
59/// Comments (`# … \n`) and insignificant whitespace (spaces, tabs, carriage
60/// returns, and newlines) are discarded.  The returned vector always ends with
61/// [`Token::Eof`].
62///
63/// # Errors
64///
65/// Returns [`crate::Error::Parse`] if the input contains [`u32::MAX`] or
66/// more newlines (i.e. the file exceeds [`u32::MAX`] lines).
67///
68/// # Examples
69///
70/// ```
71/// use serde_structprop::lexer::{tokenize, Token};
72///
73/// let tokens = tokenize("key = value").unwrap();
74/// assert_eq!(tokens, vec![
75///     (Token::Term("key".into()), 1),
76///     (Token::Eq, 1),
77///     (Token::Term("value".into()), 1),
78///     (Token::Eof, 1),
79/// ]);
80/// ```
81pub fn tokenize(input: &str) -> crate::error::Result<Vec<(Token, u32)>> {
82    let mut tokens = Vec::new();
83    let mut state = State::Whitespace;
84    let mut buf = String::new();
85    let mut line = 1u32;
86    let mut token_line = 1u32;
87
88    for ch in input.chars() {
89        match state {
90            State::Whitespace => match ch {
91                '\n' => line = inc_line(line)?,
92                ' ' | '\t' | '\r' => {}
93                '#' => state = State::Comment,
94                '"' => {
95                    token_line = line;
96                    state = State::Quoted;
97                }
98                '=' => tokens.push((Token::Eq, line)),
99                '{' => tokens.push((Token::Open, line)),
100                '}' => tokens.push((Token::Close, line)),
101                _ => {
102                    token_line = line;
103                    buf.push(ch);
104                    state = State::Term;
105                }
106            },
107            State::Quoted => {
108                if ch == '"' {
109                    tokens.push((Token::Term(buf.clone()), token_line));
110                    buf.clear();
111                    state = State::Whitespace;
112                } else {
113                    if ch == '\n' {
114                        line = inc_line(line)?;
115                    }
116                    buf.push(ch);
117                }
118            }
119            State::Comment => {
120                if ch == '\n' {
121                    line = inc_line(line)?;
122                    state = State::Whitespace;
123                }
124            }
125            State::Term => {
126                flush_term_char(
127                    ch,
128                    &mut buf,
129                    &mut tokens,
130                    &mut line,
131                    &mut token_line,
132                    &mut state,
133                )?;
134            }
135        }
136    }
137
138    // Flush any term that extends to the very end of the input.
139    if state == State::Term {
140        let term = buf.trim().to_owned();
141        if !term.is_empty() {
142            tokens.push((Token::Term(term), token_line));
143        }
144    }
145
146    // A quoted string that was never closed is a syntax error.
147    if state == State::Quoted {
148        return Err(crate::error::Error::Parse(format!(
149            "line {token_line}: unterminated quoted string"
150        )));
151    }
152
153    tokens.push((Token::Eof, line));
154    Ok(tokens)
155}
156
157/// Increment a line counter, returning an error if it would overflow.
158fn inc_line(line: u32) -> crate::error::Result<u32> {
159    line.checked_add(1).ok_or_else(|| {
160        crate::error::Error::Parse("file exceeds maximum line count (u32::MAX)".to_owned())
161    })
162}
163
164/// Handle one character while in the `Term` state, flushing the accumulated
165/// buffer and emitting punctuation tokens as needed.
166fn flush_term_char(
167    ch: char,
168    buf: &mut String,
169    tokens: &mut Vec<(Token, u32)>,
170    line: &mut u32,
171    token_line: &mut u32,
172    state: &mut State,
173) -> crate::error::Result<()> {
174    match ch {
175        '\n' => {
176            flush_buf(buf, tokens, *token_line);
177            *line = inc_line(*line)?;
178            *state = State::Whitespace;
179        }
180        '#' | ' ' | '\t' | '\r' => {
181            flush_buf(buf, tokens, *token_line);
182            *state = if ch == '#' {
183                State::Comment
184            } else {
185                State::Whitespace
186            };
187        }
188        '=' => {
189            flush_buf(buf, tokens, *token_line);
190            tokens.push((Token::Eq, *line));
191            *state = State::Whitespace;
192        }
193        '{' => {
194            flush_buf(buf, tokens, *token_line);
195            tokens.push((Token::Open, *line));
196            *state = State::Whitespace;
197        }
198        '}' => {
199            flush_buf(buf, tokens, *token_line);
200            tokens.push((Token::Close, *line));
201            *state = State::Whitespace;
202        }
203        _ => buf.push(ch),
204    }
205    Ok(())
206}
207
208/// Drain `buf` into a `Token::Term` if non-empty.
209fn flush_buf(buf: &mut String, tokens: &mut Vec<(Token, u32)>, token_line: u32) {
210    let term = buf.trim().to_owned();
211    if !term.is_empty() {
212        tokens.push((Token::Term(term), token_line));
213    }
214    buf.clear();
215}
216
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a single `key = value` pair on one line.
    #[test]
    fn basic_kv() {
        let toks = tokenize("key = value").unwrap();
        assert_eq!(
            toks,
            vec![
                (Token::Term("key".into()), 1),
                (Token::Eq, 1),
                (Token::Term("value".into()), 1),
                (Token::Eof, 1),
            ]
        );
    }

    /// Quotes group whitespace-containing text into one `Term`; the quote
    /// characters themselves are stripped from the token content.
    #[test]
    fn quoted_value() {
        let toks = tokenize(r#"key = "hello world""#).unwrap();
        assert_eq!(
            toks,
            vec![
                (Token::Term("key".into()), 1),
                (Token::Eq, 1),
                (Token::Term("hello world".into()), 1),
                (Token::Eof, 1),
            ]
        );
    }

    /// Comments produce no tokens, but their trailing newline still
    /// advances the line counter (the `key` tokens land on line 2).
    #[test]
    fn comment_stripped() {
        let toks = tokenize("# comment\nkey = val").unwrap();
        assert_eq!(
            toks,
            vec![
                (Token::Term("key".into()), 2),
                (Token::Eq, 2),
                (Token::Term("val".into()), 2),
                (Token::Eof, 2),
            ]
        );
    }

    /// Braces delimit an array body; each element becomes its own `Term`.
    #[test]
    fn array() {
        let toks = tokenize("k = { 1 2 3 }").unwrap();
        assert_eq!(
            toks,
            vec![
                (Token::Term("k".into()), 1),
                (Token::Eq, 1),
                (Token::Open, 1),
                (Token::Term("1".into()), 1),
                (Token::Term("2".into()), 1),
                (Token::Term("3".into()), 1),
                (Token::Close, 1),
                (Token::Eof, 1),
            ]
        );
    }

    /// Every token carries the line it appeared on; `Eof` carries the final
    /// line (4, because the input ends with a newline).
    #[test]
    fn multiline_line_numbers() {
        let toks = tokenize("a = 1\nb = 2\nc = 3\n").unwrap();
        assert_eq!(
            toks,
            vec![
                (Token::Term("a".into()), 1),
                (Token::Eq, 1),
                (Token::Term("1".into()), 1),
                (Token::Term("b".into()), 2),
                (Token::Eq, 2),
                (Token::Term("2".into()), 2),
                (Token::Term("c".into()), 3),
                (Token::Eq, 3),
                (Token::Term("3".into()), 3),
                (Token::Eof, 4),
            ]
        );
    }

    #[test]
    fn line_overflow_returns_error() {
        // Build a string with u32::MAX newlines — too large to actually
        // allocate, so we test with a saturated counter by constructing a
        // minimal reproduction using inc_line directly.
        assert!(inc_line(u32::MAX).is_err());
    }

    #[test]
    fn unterminated_quoted_string_reports_line_number() {
        // A quoted string that never closes should produce a clear error
        // naming the line it started on, not a downstream parse error.
        let err = tokenize("key = \"unterminated").unwrap_err();
        let msg = err.to_string();
        assert!(
            msg.contains("unterminated quoted string"),
            "expected 'unterminated quoted string' in error: {msg}"
        );
        assert!(
            msg.contains("line 1"),
            "expected line number in error: {msg}"
        );
    }

    #[test]
    fn unterminated_quoted_string_multiline_reports_correct_line() {
        // The error should report the line on which the quote *opened*.
        let err = tokenize("a = 1\nb = \"unterminated").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("line 2"), "expected 'line 2' in error: {msg}");
    }
}