systemd_unit_edit/
lex.rs

1//! Lexer for systemd unit files
2
/// Every kind of token and syntax-tree node that can appear in a systemd
/// unit file.
///
/// The lexer-level token kinds are listed first, followed by the composite
/// node kinds. The enum is `#[repr(u16)]`, so each variant has a stable
/// numeric value given by its declaration order, starting at zero; do not
/// reorder variants without checking every user of those raw values.
// Variants are deliberately SCREAMING_SNAKE_CASE, hence the lint allowance.
#[allow(non_camel_case_types)]
#[repr(u16)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum SyntaxKind {
    // ---- Tokens ----
    /// `[` opening a section header
    LEFT_BRACKET = 0,
    /// `]` closing a section header
    RIGHT_BRACKET,
    /// `=` separating a key from its value
    EQUALS,
    /// Name of a setting, such as "Type" or "ExecStart"
    KEY,
    /// Name of a section, such as "Unit" or "Service"
    SECTION_NAME,
    /// The value half of a `key=value` pair
    VALUE,
    /// A comment introduced by `#` or `;`
    COMMENT,
    /// A line ending: `\n` or `\r\n`
    NEWLINE,
    /// A run of spaces and/or tabs
    WHITESPACE,
    /// A backslash placed directly before the end of a line
    LINE_CONTINUATION,
    /// A piece of input the lexer could not classify
    ERROR,

    // ---- Nodes ----
    /// The whole file
    ROOT,
    /// A section together with the entries it contains
    SECTION,
    /// A section header: `[Section Name]`
    SECTION_HEADER,
    /// A single `Key=Value` entry
    ENTRY,
    /// An empty line
    BLANK_LINE,
}
42
43/// Convert our `SyntaxKind` into the rowan `SyntaxKind`.
44impl From<SyntaxKind> for rowan::SyntaxKind {
45    fn from(kind: SyntaxKind) -> Self {
46        Self(kind as u16)
47    }
48}
49
/// Returns `true` when `c` may begin a key name, i.e. when it is an ASCII
/// letter (`A`-`Z` or `a`-`z`).
#[inline]
fn is_valid_initial_key_char(c: char) -> bool {
    matches!(c, 'A'..='Z' | 'a'..='z')
}
56
/// Returns `true` when `c` may appear inside a key name: an ASCII letter,
/// an ASCII digit, `-` or `_`.
#[inline]
fn is_valid_key_char(c: char) -> bool {
    matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_')
}
63
/// Returns `true` when `c` is a line-ending character: line feed (`\n`) or
/// carriage return (`\r`).
#[inline]
fn is_newline(c: char) -> bool {
    matches!(c, '\n' | '\r')
}
69
/// Returns `true` when `c` is horizontal whitespace: a space or a tab.
/// Line endings are not counted as whitespace here.
#[inline]
fn is_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
75
76/// Lexer implementation
77fn lex_impl(input: &str) -> impl Iterator<Item = (SyntaxKind, &str)> + '_ {
78    let mut remaining = input;
79    let mut at_line_start = true;
80    let mut in_section_header = false;
81
82    std::iter::from_fn(move || {
83        if remaining.is_empty() {
84            return None;
85        }
86
87        let c = remaining.chars().next()?;
88
89        match c {
90            // Newline
91            _ if is_newline(c) => {
92                let char_len = c.len_utf8();
93                // Handle \r\n as a single newline
94                if c == '\r' && remaining.get(1..2) == Some("\n") {
95                    let (token, rest) = remaining.split_at(2);
96                    remaining = rest;
97                    at_line_start = true;
98                    in_section_header = false;
99                    Some((SyntaxKind::NEWLINE, token))
100                } else {
101                    let (token, rest) = remaining.split_at(char_len);
102                    remaining = rest;
103                    at_line_start = true;
104                    in_section_header = false;
105                    Some((SyntaxKind::NEWLINE, token))
106                }
107            }
108
109            // Comment (# or ; at start of line or after whitespace)
110            '#' | ';' if at_line_start => {
111                let end = remaining.find(is_newline).unwrap_or(remaining.len());
112                let (token, rest) = remaining.split_at(end);
113                remaining = rest;
114                Some((SyntaxKind::COMMENT, token))
115            }
116
117            // Line continuation (backslash before newline)
118            '\\' if remaining.get(1..2) == Some("\n") || remaining.get(1..3) == Some("\r\n") => {
119                let len = if remaining.get(1..3) == Some("\r\n") {
120                    3
121                } else {
122                    2
123                };
124                let (token, rest) = remaining.split_at(len);
125                remaining = rest;
126                at_line_start = false; // Line continues, so we're not at the start of a new logical line
127                Some((SyntaxKind::LINE_CONTINUATION, token))
128            }
129
130            // Section header [Section Name]
131            '[' if at_line_start => {
132                remaining = &remaining[1..]; // consume '['
133                at_line_start = false;
134                in_section_header = true;
135                Some((SyntaxKind::LEFT_BRACKET, "["))
136            }
137
138            ']' if in_section_header => {
139                remaining = &remaining[1..]; // consume ']'
140                in_section_header = false;
141                Some((SyntaxKind::RIGHT_BRACKET, "]"))
142            }
143
144            // Whitespace at start of line - could be blank line or continuation line
145            _ if is_whitespace(c) && at_line_start => {
146                let end = remaining
147                    .find(|c| !is_whitespace(c))
148                    .unwrap_or(remaining.len());
149                let (token, rest) = remaining.split_at(end);
150                remaining = rest;
151                at_line_start = false; // After leading whitespace, we're not at line start anymore
152                Some((SyntaxKind::WHITESPACE, token))
153            }
154
155            // Whitespace (not at line start)
156            _ if is_whitespace(c) => {
157                let end = remaining
158                    .find(|c| !is_whitespace(c))
159                    .unwrap_or(remaining.len());
160                let (token, rest) = remaining.split_at(end);
161                remaining = rest;
162                Some((SyntaxKind::WHITESPACE, token))
163            }
164
165            // Equals sign
166            '=' => {
167                remaining = &remaining[1..];
168                Some((SyntaxKind::EQUALS, "="))
169            }
170
171            // Key name (starts with alphabetic)
172            _ if is_valid_initial_key_char(c) && at_line_start => {
173                let end = remaining
174                    .find(|c: char| !is_valid_key_char(c))
175                    .unwrap_or(remaining.len());
176                let (token, rest) = remaining.split_at(end);
177                remaining = rest;
178                at_line_start = false;
179                Some((SyntaxKind::KEY, token))
180            }
181
182            // Section name (between [ and ])
183            _ if in_section_header => {
184                // Inside brackets - read until ]
185                let end = remaining.find(']').unwrap_or(remaining.len());
186                let (token, rest) = remaining.split_at(end);
187                remaining = rest;
188                Some((SyntaxKind::SECTION_NAME, token))
189            }
190
191            // Value (everything else on a line, handling line continuations)
192            _ if !at_line_start => {
193                // Read until newline (but watch for line continuations)
194                let mut end = 0;
195                for ch in remaining.chars() {
196                    if ch == '\\' {
197                        // Check if it's a line continuation
198                        let remaining_from_here = &remaining[end..];
199                        if remaining_from_here.get(1..2) == Some("\n")
200                            || remaining_from_here.get(1..3) == Some("\r\n")
201                        {
202                            // It's a line continuation, stop here
203                            break;
204                        }
205                        end += ch.len_utf8();
206                    } else if is_newline(ch) {
207                        // Stop at newline
208                        break;
209                    } else {
210                        end += ch.len_utf8();
211                    }
212                }
213
214                if end == 0 {
215                    // No value content, this shouldn't happen
216                    None
217                } else {
218                    let (token, rest) = remaining.split_at(end);
219                    remaining = rest;
220                    Some((SyntaxKind::VALUE, token))
221                }
222            }
223
224            // Error: unexpected character at line start
225            _ => {
226                let char_len = c.len_utf8();
227                let (token, rest) = remaining.split_at(char_len);
228                remaining = rest;
229                at_line_start = false;
230                Some((SyntaxKind::ERROR, token))
231            }
232        }
233    })
234}
235
/// Lex a systemd unit file into tokens.
///
/// Crate-internal entry point to the lexer: forwards `input` unchanged to
/// `lex_impl` and returns its iterator of `(SyntaxKind, &str)` pairs, each
/// pairing a token kind with the text of that token.
pub(crate) fn lex(input: &str) -> impl Iterator<Item = (SyntaxKind, &str)> {
    lex_impl(input)
}
240
#[cfg(test)]
mod tests {
    use super::SyntaxKind::*;
    use super::*;

    /// Runs the lexer to completion and gathers every token it produces.
    fn tokenize(input: &str) -> Vec<(SyntaxKind, &str)> {
        lex(input).collect()
    }

    /// Empty input produces no tokens at all.
    #[test]
    fn test_empty() {
        assert!(tokenize("").is_empty());
    }

    /// A bare section header splits into bracket, name, bracket, newline.
    #[test]
    fn test_simple_section() {
        let expected = vec![
            (LEFT_BRACKET, "["),
            (SECTION_NAME, "Unit"),
            (RIGHT_BRACKET, "]"),
            (NEWLINE, "\n"),
        ];
        assert_eq!(tokenize("[Unit]\n"), expected);
    }

    /// `Key=Value` without any padding.
    #[test]
    fn test_key_value() {
        let expected = vec![
            (KEY, "Description"),
            (EQUALS, "="),
            (VALUE, "Test Service"),
            (NEWLINE, "\n"),
        ];
        assert_eq!(tokenize("Description=Test Service\n"), expected);
    }

    /// Whitespace around `=` is reported as separate tokens.
    #[test]
    fn test_key_value_with_spaces() {
        let expected = vec![
            (KEY, "Description"),
            (WHITESPACE, " "),
            (EQUALS, "="),
            (WHITESPACE, " "),
            (VALUE, "Test Service"),
            (NEWLINE, "\n"),
        ];
        assert_eq!(tokenize("Description = Test Service\n"), expected);
    }

    /// `#` starts a comment that runs to the end of the line.
    #[test]
    fn test_comment_hash() {
        let expected = vec![(COMMENT, "# This is a comment"), (NEWLINE, "\n")];
        assert_eq!(tokenize("# This is a comment\n"), expected);
    }

    /// `;` starts a comment that runs to the end of the line.
    #[test]
    fn test_comment_semicolon() {
        let expected = vec![(COMMENT, "; This is a comment"), (NEWLINE, "\n")];
        assert_eq!(tokenize("; This is a comment\n"), expected);
    }

    /// A trailing backslash splits the value around a continuation token.
    #[test]
    fn test_line_continuation() {
        let tokens = tokenize("ExecStart=/bin/echo \\\n  hello\n");
        let expected = [
            (KEY, "ExecStart"),
            (EQUALS, "="),
            (VALUE, "/bin/echo "),
            (LINE_CONTINUATION, "\\\n"),
            (WHITESPACE, "  "),
            (VALUE, "hello"),
            (NEWLINE, "\n"),
        ];
        // Only the first seven tokens are pinned; slicing panics if fewer exist.
        assert_eq!(&tokens[..7], &expected[..]);
    }

    /// A small but complete unit file with two sections.
    #[test]
    fn test_full_unit_file() {
        let input = concat!(
            "[Unit]\n",
            "Description=Test Service\n",
            "After=network.target\n",
            "\n",
            "[Service]\n",
            "Type=simple\n",
            "ExecStart=/usr/bin/test\n",
        );
        let tokens = tokenize(input);

        // The file opens with the `[Unit]` header line.
        let kinds: Vec<SyntaxKind> = tokens.iter().map(|&(kind, _)| kind).collect();
        assert_eq!(
            &kinds[..4],
            &[LEFT_BRACKET, SECTION_NAME, RIGHT_BRACKET, NEWLINE][..]
        );
        assert_eq!(tokens[1].1, "Unit");

        // Locate the "Description" key and check what follows it.
        let key_at = tokens
            .iter()
            .position(|&(kind, text)| kind == KEY && text == "Description")
            .unwrap();
        assert_eq!(tokens[key_at + 1].0, EQUALS);
        assert_eq!(tokens[key_at + 2], (VALUE, "Test Service"));
    }

    /// An empty line shows up as two NEWLINE tokens in a row.
    #[test]
    fn test_blank_lines() {
        let tokens = tokenize("Key=Value\n\nKey2=Value2\n");

        let first_newline = tokens
            .iter()
            .position(|&(kind, _)| kind == NEWLINE)
            .unwrap();
        assert_eq!(tokens[first_newline + 1].0, NEWLINE);
    }
}
systemd_unit_edit/lex.rs

systemd_unit_edit/
lex.rs