// serde_structprop/lexer.rs
//! Lexer (tokenizer) for the structprop format.
//!
//! The lexer converts a raw `&str` into a flat sequence of `Token`s, stripping
//! comments and collapsing insignificant whitespace.  The resulting token stream
//! is consumed by [`crate::parse()`].
//!
//! # Token rules
//!
//! | Input | Token produced |
//! |---|---|
//! | `=` | `Token::Eq` |
//! | `{` | `Token::Open` |
//! | `}` | `Token::Close` |
//! | `# … \n` | *(discarded)* |
//! | `"…"` | `Token::Term` with the quoted content |
//! | any other non-whitespace run | `Token::Term` |
//! | end of input | `Token::Eof` |

/// A single token produced by the structprop lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A bare or double-quoted string term.
    ///
    /// Bare terms are delimited by whitespace or the special characters
    /// `=`, `{`, `}`, and `#`.  Quoted terms may contain any character
    /// except `"` — the lexer has no escape mechanism, so the first `"`
    /// always ends the term.
    Term(String),

    /// The assignment operator `=`.
    Eq,

    /// An opening brace `{` that begins an array or object body.
    Open,

    /// A closing brace `}` that ends an array or object body.
    Close,

    /// A sentinel placed at the end of the token stream.
    Eof,
}

/// Lex a structprop `input` string into a flat [`Vec`] of [`Token`]s.
///
/// Comments (`# … \n`) and insignificant whitespace (spaces, tabs, carriage
/// returns, and newlines) are discarded.  The returned vector always ends with
/// [`Token::Eof`].
///
/// A quoted term left unterminated at end of input is flushed as a
/// [`Token::Term`] containing whatever was accumulated, rather than being
/// silently discarded.
///
/// # Examples
///
/// ```
/// use serde_structprop::lexer::{tokenize, Token};
///
/// let tokens = tokenize("key = value");
/// assert_eq!(tokens, vec![
///     Token::Term("key".into()),
///     Token::Eq,
///     Token::Term("value".into()),
///     Token::Eof,
/// ]);
/// ```
#[must_use]
pub fn tokenize(input: &str) -> Vec<Token> {
    /// Internal lexer state machine states.
    enum State {
        /// Between tokens; skipping whitespace.
        Whitespace,
        /// Inside a `# …` line comment.
        Comment,
        /// Accumulating a bare (unquoted) term.
        Term,
        /// Accumulating a double-quoted term.
        Quoted,
    }

    /// Flush a pending bare term out of `buf`, if any.
    ///
    /// `buf` never contains whitespace while in `State::Term` (whitespace is
    /// skipped between tokens and terminates a term immediately), so no
    /// trimming is required.
    fn flush_bare(buf: &mut String, tokens: &mut Vec<Token>) {
        if !buf.is_empty() {
            tokens.push(Token::Term(std::mem::take(buf)));
        }
    }

    let mut tokens = Vec::new();
    let mut state = State::Whitespace;
    let mut buf = String::new();

    for ch in input.chars() {
        state = match state {
            State::Whitespace => match ch {
                '#' => State::Comment,
                '"' => State::Quoted,
                ' ' | '\t' | '\r' | '\n' => State::Whitespace,
                '=' => {
                    tokens.push(Token::Eq);
                    State::Whitespace
                }
                '{' => {
                    tokens.push(Token::Open);
                    State::Whitespace
                }
                '}' => {
                    tokens.push(Token::Close);
                    State::Whitespace
                }
                _ => {
                    buf.push(ch);
                    State::Term
                }
            },
            State::Quoted => {
                if ch == '"' {
                    // Quoted terms may legitimately be empty (`""`), so this
                    // pushes unconditionally, unlike the bare-term flush.
                    tokens.push(Token::Term(std::mem::take(&mut buf)));
                    State::Whitespace
                } else {
                    buf.push(ch);
                    State::Quoted
                }
            }
            State::Comment => {
                if ch == '\n' {
                    State::Whitespace
                } else {
                    State::Comment
                }
            }
            State::Term => match ch {
                '#' => {
                    flush_bare(&mut buf, &mut tokens);
                    State::Comment
                }
                ' ' | '\t' | '\r' | '\n' => {
                    flush_bare(&mut buf, &mut tokens);
                    State::Whitespace
                }
                '=' => {
                    flush_bare(&mut buf, &mut tokens);
                    tokens.push(Token::Eq);
                    State::Whitespace
                }
                '{' => {
                    flush_bare(&mut buf, &mut tokens);
                    tokens.push(Token::Open);
                    State::Whitespace
                }
                '}' => {
                    flush_bare(&mut buf, &mut tokens);
                    tokens.push(Token::Close);
                    State::Whitespace
                }
                _ => {
                    buf.push(ch);
                    State::Term
                }
            },
        };
    }

    // Flush anything still buffered at end of input: a bare term, or the
    // contents of an unterminated quoted term (previously lost silently).
    match state {
        State::Term => flush_bare(&mut buf, &mut tokens),
        State::Quoted => {
            if !buf.is_empty() {
                tokens.push(Token::Term(std::mem::take(&mut buf)));
            }
        }
        State::Whitespace | State::Comment => {}
    }

    tokens.push(Token::Eof);
    tokens
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn basic_kv() {
        let toks = tokenize("key = value");
        assert_eq!(
            toks,
            vec![
                Token::Term("key".into()),
                Token::Eq,
                Token::Term("value".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn quoted_value() {
        let toks = tokenize(r#"key = "hello world""#);
        assert_eq!(
            toks,
            vec![
                Token::Term("key".into()),
                Token::Eq,
                Token::Term("hello world".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn comment_stripped() {
        let toks = tokenize("# comment\nkey = val");
        assert_eq!(
            toks,
            vec![
                Token::Term("key".into()),
                Token::Eq,
                Token::Term("val".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn array() {
        let toks = tokenize("k = { 1 2 3 }");
        assert_eq!(
            toks,
            vec![
                Token::Term("k".into()),
                Token::Eq,
                Token::Open,
                Token::Term("1".into()),
                Token::Term("2".into()),
                Token::Term("3".into()),
                Token::Close,
                Token::Eof,
            ]
        );
    }

    /// Empty input produces only the `Eof` sentinel.
    #[test]
    fn empty_input() {
        assert_eq!(tokenize(""), vec![Token::Eof]);
    }

    /// Special characters terminate a bare term even without whitespace.
    #[test]
    fn no_whitespace_between_tokens() {
        let toks = tokenize("a=b{c}");
        assert_eq!(
            toks,
            vec![
                Token::Term("a".into()),
                Token::Eq,
                Token::Term("b".into()),
                Token::Open,
                Token::Term("c".into()),
                Token::Close,
                Token::Eof,
            ]
        );
    }

    /// A `""` quoted term is preserved as an empty `Term`, not discarded.
    #[test]
    fn empty_quoted_term() {
        assert_eq!(
            tokenize(r#"k = """#),
            vec![
                Token::Term("k".into()),
                Token::Eq,
                Token::Term(String::new()),
                Token::Eof,
            ]
        );
    }
}