serde_structprop/lexer.rs
1//! Lexer (tokenizer) for the structprop format.
2//!
3//! The lexer converts a raw `&str` into a flat sequence of `Token`s, stripping
4//! comments and collapsing insignificant whitespace. The resulting token stream
5//! is consumed by [`crate::parse()`].
6//!
7//! # Token rules
8//!
9//! | Input | Token produced |
10//! |---|---|
11//! | `=` | `Token::Eq` |
12//! | `{` | `Token::Open` |
13//! | `}` | `Token::Close` |
14//! | `# … \n` | *(discarded)* |
15//! | `"…"` | `Token::Term` with the quoted content |
16//! | any other non-whitespace run | `Token::Term` |
17//! | end of input | `Token::Eof` |
18
/// A single token produced by the structprop lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A bare or double-quoted string term.
    ///
    /// Bare terms are delimited by whitespace or the special characters
    /// `=`, `{`, `}`, and `#`. Quoted terms may contain any character
    /// except `"` (no escape sequences are recognized by this lexer).
    Term(String),

    /// The assignment operator `=`.
    Eq,

    /// An opening brace `{` that begins an array or object body.
    Open,

    /// A closing brace `}` that ends an array or object body.
    Close,

    /// A sentinel placed at the end of the token stream.
    Eof,
}

/// Lex a structprop `input` string into a flat [`Vec`] of [`Token`]s.
///
/// Comments (`# … \n`) and insignificant whitespace (spaces, tabs, carriage
/// returns, and newlines) are discarded. The returned vector always ends with
/// [`Token::Eof`].
///
/// An unterminated quoted term at end of input is silently discarded; the
/// resulting truncated token stream is surfaced as an error by the parser.
///
/// # Examples
///
/// ```
/// use serde_structprop::lexer::{tokenize, Token};
///
/// let tokens = tokenize("key = value");
/// assert_eq!(tokens, vec![
///     Token::Term("key".into()),
///     Token::Eq,
///     Token::Term("value".into()),
///     Token::Eof,
/// ]);
/// ```
#[must_use]
pub fn tokenize(input: &str) -> Vec<Token> {
    /// Internal lexer state machine states.
    enum State {
        /// Between tokens; skipping whitespace.
        Whitespace,
        /// Inside a `# …` line comment.
        Comment,
        /// Accumulating a bare (unquoted) term.
        Term,
        /// Accumulating a double-quoted term.
        Quoted,
    }

    /// Emit `buf` as a [`Token::Term`] if it is non-empty, leaving `buf`
    /// empty and its allocation reusable.
    ///
    /// A bare term can never contain whitespace — any whitespace character
    /// terminates it — so no trimming is required before emitting.
    fn flush_bare(buf: &mut String, tokens: &mut Vec<Token>) {
        if !buf.is_empty() {
            tokens.push(Token::Term(std::mem::take(buf)));
        }
    }

    let mut tokens = Vec::new();
    let mut state = State::Whitespace;
    let mut buf = String::new();

    for ch in input.chars() {
        match state {
            State::Whitespace => match ch {
                '#' => state = State::Comment,
                '"' => state = State::Quoted,
                ' ' | '\t' | '\r' | '\n' => {}
                '=' => tokens.push(Token::Eq),
                '{' => tokens.push(Token::Open),
                '}' => tokens.push(Token::Close),
                _ => {
                    buf.push(ch);
                    state = State::Term;
                }
            },
            State::Quoted => {
                if ch == '"' {
                    // Unlike bare terms, a quoted term is emitted even when
                    // empty: `""` is a legitimate empty-string value.
                    tokens.push(Token::Term(std::mem::take(&mut buf)));
                    state = State::Whitespace;
                } else {
                    buf.push(ch);
                }
            }
            State::Comment => {
                if ch == '\n' {
                    state = State::Whitespace;
                }
            }
            State::Term => match ch {
                // A delimiter ends the current bare term; the delimiter
                // itself then produces its own token or state change.
                '#' => {
                    flush_bare(&mut buf, &mut tokens);
                    state = State::Comment;
                }
                ' ' | '\t' | '\r' | '\n' => {
                    flush_bare(&mut buf, &mut tokens);
                    state = State::Whitespace;
                }
                '=' => {
                    flush_bare(&mut buf, &mut tokens);
                    tokens.push(Token::Eq);
                    state = State::Whitespace;
                }
                '{' => {
                    flush_bare(&mut buf, &mut tokens);
                    tokens.push(Token::Open);
                    state = State::Whitespace;
                }
                '}' => {
                    flush_bare(&mut buf, &mut tokens);
                    tokens.push(Token::Close);
                    state = State::Whitespace;
                }
                _ => buf.push(ch),
            },
        }
    }

    // Flush a bare term that extends to the very end of the input.
    // NOTE: an unterminated quoted term (state == Quoted here) is discarded,
    // matching the documented behavior above.
    if matches!(state, State::Term) {
        flush_bare(&mut buf, &mut tokens);
    }

    tokens.push(Token::Eof);
    tokens
}
164
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building the expected [`Token::Term`] value.
    fn term(s: &str) -> Token {
        Token::Term(s.to_owned())
    }

    #[test]
    fn basic_kv() {
        let expected = [term("key"), Token::Eq, term("value"), Token::Eof];
        assert_eq!(tokenize("key = value"), expected);
    }

    #[test]
    fn quoted_value() {
        let expected = [term("key"), Token::Eq, term("hello world"), Token::Eof];
        assert_eq!(tokenize(r#"key = "hello world""#), expected);
    }

    #[test]
    fn comment_stripped() {
        let expected = [term("key"), Token::Eq, term("val"), Token::Eof];
        assert_eq!(tokenize("# comment\nkey = val"), expected);
    }

    #[test]
    fn array() {
        let expected = [
            term("k"),
            Token::Eq,
            Token::Open,
            term("1"),
            term("2"),
            term("3"),
            Token::Close,
            Token::Eof,
        ];
        assert_eq!(tokenize("k = { 1 2 3 }"), expected);
    }
}