serde_structprop/lexer.rs
//! Lexer (tokenizer) for the structprop format.
//!
//! The lexer converts a raw `&str` into a flat sequence of [`Token`]s paired
//! with their 1-indexed source line numbers. Comments and insignificant
//! whitespace are stripped. The resulting token stream is consumed by
//! [`crate::parse()`].
//!
//! # Token rules
//!
//! | Input | Token produced |
//! |---|---|
//! | `=` | `Token::Eq` |
//! | `{` | `Token::Open` |
//! | `}` | `Token::Close` |
//! | `# … \n` | *(discarded)* |
//! | `"…"` | `Token::Term` with the quoted content |
//! | any other non-whitespace run | `Token::Term` |
//! | end of input | `Token::Eof` |
19
/// A single token produced by the structprop lexer.
///
/// [`tokenize`] pairs each token with the 1-indexed source line on which it
/// (or, for multi-line quoted terms, its opening `"`) begins.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A bare or double-quoted string term.
    ///
    /// Bare terms are delimited by whitespace or the special characters
    /// `=`, `{`, `}`, and `#`. Quoted terms may contain any character
    /// except `"` — the format has no escape sequences, so a `"` always
    /// terminates the quoted string.
    Term(String),

    /// The assignment operator `=`.
    Eq,

    /// An opening brace `{` that begins an array or object body.
    Open,

    /// A closing brace `}` that ends an array or object body.
    Close,

    /// A sentinel placed at the end of the token stream.
    ///
    /// Always the final element returned by [`tokenize`], tagged with the
    /// last line number reached.
    Eof,
}
42
/// Internal lexer state machine states.
///
/// Fieldless and private, so it derives the full set of cheap traits:
/// `Eq` accompanies `PartialEq` (clippy: `derive_partial_eq_without_eq`),
/// and `Debug`/`Clone`/`Copy` cost nothing on a unit-variant enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Between tokens; skipping whitespace.
    Whitespace,
    /// Inside a `# …` line comment.
    Comment,
    /// Accumulating a bare (unquoted) term.
    Term,
    /// Accumulating a double-quoted term.
    Quoted,
}
55
56/// Lex a structprop `input` string into a flat [`Vec`] of [`Token`]s, each
57/// paired with its 1-indexed source line number.
58///
59/// Comments (`# … \n`) and insignificant whitespace (spaces, tabs, carriage
60/// returns, and newlines) are discarded. The returned vector always ends with
61/// [`Token::Eof`].
62///
63/// # Errors
64///
65/// Returns [`crate::Error::Parse`] if the input contains [`u32::MAX`] or
66/// more newlines (i.e. the file exceeds [`u32::MAX`] lines).
67///
68/// # Examples
69///
70/// ```
71/// use serde_structprop::lexer::{tokenize, Token};
72///
73/// let tokens = tokenize("key = value").unwrap();
74/// assert_eq!(tokens, vec![
75/// (Token::Term("key".into()), 1),
76/// (Token::Eq, 1),
77/// (Token::Term("value".into()), 1),
78/// (Token::Eof, 1),
79/// ]);
80/// ```
81pub fn tokenize(input: &str) -> crate::error::Result<Vec<(Token, u32)>> {
82 let mut tokens = Vec::new();
83 let mut state = State::Whitespace;
84 let mut buf = String::new();
85 let mut line = 1u32;
86 let mut token_line = 1u32;
87
88 for ch in input.chars() {
89 match state {
90 State::Whitespace => match ch {
91 '\n' => line = inc_line(line)?,
92 ' ' | '\t' | '\r' => {}
93 '#' => state = State::Comment,
94 '"' => {
95 token_line = line;
96 state = State::Quoted;
97 }
98 '=' => tokens.push((Token::Eq, line)),
99 '{' => tokens.push((Token::Open, line)),
100 '}' => tokens.push((Token::Close, line)),
101 _ => {
102 token_line = line;
103 buf.push(ch);
104 state = State::Term;
105 }
106 },
107 State::Quoted => {
108 if ch == '"' {
109 tokens.push((Token::Term(buf.clone()), token_line));
110 buf.clear();
111 state = State::Whitespace;
112 } else {
113 if ch == '\n' {
114 line = inc_line(line)?;
115 }
116 buf.push(ch);
117 }
118 }
119 State::Comment => {
120 if ch == '\n' {
121 line = inc_line(line)?;
122 state = State::Whitespace;
123 }
124 }
125 State::Term => {
126 flush_term_char(
127 ch,
128 &mut buf,
129 &mut tokens,
130 &mut line,
131 &mut token_line,
132 &mut state,
133 )?;
134 }
135 }
136 }
137
138 // Flush any term that extends to the very end of the input.
139 if state == State::Term {
140 let term = buf.trim().to_owned();
141 if !term.is_empty() {
142 tokens.push((Token::Term(term), token_line));
143 }
144 }
145
146 tokens.push((Token::Eof, line));
147 Ok(tokens)
148}
149
150/// Increment a line counter, returning an error if it would overflow.
151fn inc_line(line: u32) -> crate::error::Result<u32> {
152 line.checked_add(1).ok_or_else(|| {
153 crate::error::Error::Parse("file exceeds maximum line count (u32::MAX)".to_owned())
154 })
155}
156
157/// Handle one character while in the `Term` state, flushing the accumulated
158/// buffer and emitting punctuation tokens as needed.
159fn flush_term_char(
160 ch: char,
161 buf: &mut String,
162 tokens: &mut Vec<(Token, u32)>,
163 line: &mut u32,
164 token_line: &mut u32,
165 state: &mut State,
166) -> crate::error::Result<()> {
167 match ch {
168 '\n' => {
169 flush_buf(buf, tokens, *token_line);
170 *line = inc_line(*line)?;
171 *state = State::Whitespace;
172 }
173 '#' | ' ' | '\t' | '\r' => {
174 flush_buf(buf, tokens, *token_line);
175 *state = if ch == '#' {
176 State::Comment
177 } else {
178 State::Whitespace
179 };
180 }
181 '=' => {
182 flush_buf(buf, tokens, *token_line);
183 tokens.push((Token::Eq, *line));
184 *state = State::Whitespace;
185 }
186 '{' => {
187 flush_buf(buf, tokens, *token_line);
188 tokens.push((Token::Open, *line));
189 *state = State::Whitespace;
190 }
191 '}' => {
192 flush_buf(buf, tokens, *token_line);
193 tokens.push((Token::Close, *line));
194 *state = State::Whitespace;
195 }
196 _ => buf.push(ch),
197 }
198 Ok(())
199}
200
201/// Drain `buf` into a `Token::Term` if non-empty.
202fn flush_buf(buf: &mut String, tokens: &mut Vec<(Token, u32)>, token_line: u32) {
203 let term = buf.trim().to_owned();
204 if !term.is_empty() {
205 tokens.push((Token::Term(term), token_line));
206 }
207 buf.clear();
208}
209
#[cfg(test)]
mod tests {
    use super::*;

    /// A single `key = value` pair tokenizes to Term, Eq, Term, Eof.
    #[test]
    fn basic_kv() {
        let toks = tokenize("key = value").unwrap();
        assert_eq!(
            toks,
            vec![
                (Token::Term("key".into()), 1),
                (Token::Eq, 1),
                (Token::Term("value".into()), 1),
                (Token::Eof, 1),
            ]
        );
    }

    /// A quoted value keeps its interior spaces and loses the quotes.
    #[test]
    fn quoted_value() {
        let toks = tokenize(r#"key = "hello world""#).unwrap();
        assert_eq!(
            toks,
            vec![
                (Token::Term("key".into()), 1),
                (Token::Eq, 1),
                (Token::Term("hello world".into()), 1),
                (Token::Eof, 1),
            ]
        );
    }

    /// A `#` comment line produces no tokens but still advances the line
    /// counter, so the following tokens report line 2.
    #[test]
    fn comment_stripped() {
        let toks = tokenize("# comment\nkey = val").unwrap();
        assert_eq!(
            toks,
            vec![
                (Token::Term("key".into()), 2),
                (Token::Eq, 2),
                (Token::Term("val".into()), 2),
                (Token::Eof, 2),
            ]
        );
    }

    /// Braces open/close a body and each element becomes its own Term.
    #[test]
    fn array() {
        let toks = tokenize("k = { 1 2 3 }").unwrap();
        assert_eq!(
            toks,
            vec![
                (Token::Term("k".into()), 1),
                (Token::Eq, 1),
                (Token::Open, 1),
                (Token::Term("1".into()), 1),
                (Token::Term("2".into()), 1),
                (Token::Term("3".into()), 1),
                (Token::Close, 1),
                (Token::Eof, 1),
            ]
        );
    }

    /// Each pair carries its own line number; the trailing newline means
    /// Eof reports line 4.
    #[test]
    fn multiline_line_numbers() {
        let toks = tokenize("a = 1\nb = 2\nc = 3\n").unwrap();
        assert_eq!(
            toks,
            vec![
                (Token::Term("a".into()), 1),
                (Token::Eq, 1),
                (Token::Term("1".into()), 1),
                (Token::Term("b".into()), 2),
                (Token::Eq, 2),
                (Token::Term("2".into()), 2),
                (Token::Term("c".into()), 3),
                (Token::Eq, 3),
                (Token::Term("3".into()), 3),
                (Token::Eof, 4),
            ]
        );
    }

    /// The line counter errors rather than wrapping at `u32::MAX`.
    ///
    /// An input with `u32::MAX` newlines is far too large to allocate in a
    /// unit test, so this exercises the overflow path directly through
    /// `inc_line` with an already-saturated counter.
    #[test]
    fn line_overflow_returns_error() {
        assert!(inc_line(u32::MAX).is_err());
    }
}
301}