systemd_unit_edit/
lex.rs

1//! Lexer for systemd unit files
2
/// Every kind of token and tree node that can appear in a lexed/parsed systemd unit file.
///
/// The first group (`LEFT_BRACKET` ..= `ERROR`) are the token kinds; the second group
/// (`ROOT` ..= `BLANK_LINE`) are node kinds. The enum is `#[repr(u16)]` and the discriminants
/// are spelled out so the numeric value of each kind is visible at a glance.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
// Variants are written in SCREAMING_SNAKE_CASE, which this lint would otherwise flag.
#[allow(non_camel_case_types)]
#[repr(u16)]
pub enum SyntaxKind {
    // ---- Tokens ----
    /// `[` opening a section header
    LEFT_BRACKET = 0,
    /// `]` closing a section header
    RIGHT_BRACKET = 1,
    /// `=` separating a key from its value
    EQUALS = 2,
    /// Name of a setting (e.g., "Type", "ExecStart")
    KEY = 3,
    /// Text between the brackets of a section header (e.g., "Unit", "Service")
    SECTION_NAME = 4,
    /// Right-hand side of a `key=value` entry
    VALUE = 5,
    /// Comment introduced by `#` or `;`
    COMMENT = 6,
    /// Line terminator: `\n` or `\r\n`
    NEWLINE = 7,
    /// Run of spaces and/or tabs
    WHITESPACE = 8,
    /// Backslash at the end of a line, joining it with the next one
    LINE_CONTINUATION = 9,
    /// Input that fits no other token kind
    ERROR = 10,

    // ---- Nodes ----
    /// The whole file
    ROOT = 11,
    /// A section together with its entries
    SECTION = 12,
    /// The `[Section Name]` header of a section
    SECTION_HEADER = 13,
    /// A single `Key=Value` entry
    ENTRY = 14,
    /// A blank line
    BLANK_LINE = 15,
}
42
43/// Convert our `SyntaxKind` into the rowan `SyntaxKind`.
44impl From<SyntaxKind> for rowan::SyntaxKind {
45    fn from(kind: SyntaxKind) -> Self {
46        Self(kind as u16)
47    }
48}
49
/// Returns `true` when `c` may begin a key name, i.e. when it is an ASCII letter.
#[inline]
fn is_valid_initial_key_char(c: char) -> bool {
    // Only A-Z and a-z may open a key; digits, `-` and `_` are allowed later on only.
    matches!(c, 'A'..='Z' | 'a'..='z')
}
56
/// Returns `true` when `c` may appear inside a key name.
#[inline]
fn is_valid_key_char(c: char) -> bool {
    // Allowed alphabet: ASCII letters, ASCII digits, `-` and `_`.
    matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_')
}
63
/// Returns `true` for the two line-terminator characters: line feed and carriage return.
#[inline]
fn is_newline(c: char) -> bool {
    matches!(c, '\n' | '\r')
}
69
/// Returns `true` for horizontal whitespace only: a space or a tab.
#[inline]
fn is_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
75
76/// Lexer implementation
77fn lex_impl(input: &str) -> impl Iterator<Item = (SyntaxKind, &str)> + '_ {
78    let mut remaining = input;
79    let mut at_line_start = true;
80    let mut in_section_header = false;
81
82    std::iter::from_fn(move || {
83        if remaining.is_empty() {
84            return None;
85        }
86
87        let c = remaining.chars().next()?;
88
89        match c {
90            // Newline
91            _ if is_newline(c) => {
92                let char_len = c.len_utf8();
93                // Handle \r\n as a single newline
94                if c == '\r' && remaining.get(1..2) == Some("\n") {
95                    let (token, rest) = remaining.split_at(2);
96                    remaining = rest;
97                    at_line_start = true;
98                    in_section_header = false;
99                    Some((SyntaxKind::NEWLINE, token))
100                } else {
101                    let (token, rest) = remaining.split_at(char_len);
102                    remaining = rest;
103                    at_line_start = true;
104                    in_section_header = false;
105                    Some((SyntaxKind::NEWLINE, token))
106                }
107            }
108
109            // Comment (# or ; at start of line or after whitespace)
110            '#' | ';' if at_line_start => {
111                let end = remaining.find(is_newline).unwrap_or(remaining.len());
112                let (token, rest) = remaining.split_at(end);
113                remaining = rest;
114                Some((SyntaxKind::COMMENT, token))
115            }
116
117            // Line continuation (backslash before newline)
118            '\\' if remaining.get(1..2) == Some("\n") || remaining.get(1..3) == Some("\r\n") => {
119                let len = if remaining.get(1..3) == Some("\r\n") {
120                    3
121                } else {
122                    2
123                };
124                let (token, rest) = remaining.split_at(len);
125                remaining = rest;
126                at_line_start = false; // Line continues, so we're not at the start of a new logical line
127                Some((SyntaxKind::LINE_CONTINUATION, token))
128            }
129
130            // Section header [Section Name]
131            '[' if at_line_start => {
132                remaining = &remaining[1..]; // consume '['
133                at_line_start = false;
134                in_section_header = true;
135                Some((SyntaxKind::LEFT_BRACKET, "["))
136            }
137
138            ']' if in_section_header => {
139                remaining = &remaining[1..]; // consume ']'
140                in_section_header = false;
141                Some((SyntaxKind::RIGHT_BRACKET, "]"))
142            }
143
144            // Whitespace at start of line - could be blank line
145            _ if is_whitespace(c) && at_line_start => {
146                let end = remaining
147                    .find(|c| !is_whitespace(c))
148                    .unwrap_or(remaining.len());
149                let (token, rest) = remaining.split_at(end);
150                remaining = rest;
151                Some((SyntaxKind::WHITESPACE, token))
152            }
153
154            // Whitespace (not at line start)
155            _ if is_whitespace(c) => {
156                let end = remaining
157                    .find(|c| !is_whitespace(c))
158                    .unwrap_or(remaining.len());
159                let (token, rest) = remaining.split_at(end);
160                remaining = rest;
161                Some((SyntaxKind::WHITESPACE, token))
162            }
163
164            // Equals sign
165            '=' => {
166                remaining = &remaining[1..];
167                Some((SyntaxKind::EQUALS, "="))
168            }
169
170            // Key name (starts with alphabetic)
171            _ if is_valid_initial_key_char(c) && at_line_start => {
172                let end = remaining
173                    .find(|c: char| !is_valid_key_char(c))
174                    .unwrap_or(remaining.len());
175                let (token, rest) = remaining.split_at(end);
176                remaining = rest;
177                at_line_start = false;
178                Some((SyntaxKind::KEY, token))
179            }
180
181            // Section name (between [ and ])
182            _ if in_section_header => {
183                // Inside brackets - read until ]
184                let end = remaining.find(']').unwrap_or(remaining.len());
185                let (token, rest) = remaining.split_at(end);
186                remaining = rest;
187                Some((SyntaxKind::SECTION_NAME, token))
188            }
189
190            // Value (everything else on a line, handling line continuations)
191            _ if !at_line_start => {
192                // Read until newline (but watch for line continuations)
193                let mut end = 0;
194                for ch in remaining.chars() {
195                    if ch == '\\' {
196                        // Check if it's a line continuation
197                        let remaining_from_here = &remaining[end..];
198                        if remaining_from_here.get(1..2) == Some("\n")
199                            || remaining_from_here.get(1..3) == Some("\r\n")
200                        {
201                            // It's a line continuation, stop here
202                            break;
203                        }
204                        end += ch.len_utf8();
205                    } else if is_newline(ch) {
206                        // Stop at newline
207                        break;
208                    } else {
209                        end += ch.len_utf8();
210                    }
211                }
212
213                if end == 0 {
214                    // No value content, this shouldn't happen
215                    None
216                } else {
217                    let (token, rest) = remaining.split_at(end);
218                    remaining = rest;
219                    Some((SyntaxKind::VALUE, token))
220                }
221            }
222
223            // Error: unexpected character at line start
224            _ => {
225                let char_len = c.len_utf8();
226                let (token, rest) = remaining.split_at(char_len);
227                remaining = rest;
228                at_line_start = false;
229                Some((SyntaxKind::ERROR, token))
230            }
231        }
232    })
233}
234
/// Lex a systemd unit file into tokens.
///
/// Crate-internal entry point: forwards `input` unchanged to `lex_impl` and returns its
/// iterator of `(kind, text)` pairs as-is. Tokens are produced as the iterator is advanced.
pub(crate) fn lex(input: &str) -> impl Iterator<Item = (SyntaxKind, &str)> {
    lex_impl(input)
}
239
#[cfg(test)]
mod tests {
    use super::SyntaxKind::*;
    use super::*;

    /// Runs the lexer to completion and returns every token it produced.
    fn tokenize(input: &str) -> Vec<(SyntaxKind, &str)> {
        lex(input).collect()
    }

    #[test]
    fn test_empty() {
        // Empty input yields no tokens at all.
        assert!(tokenize("").is_empty());
    }

    #[test]
    fn test_simple_section() {
        let expected = vec![
            (LEFT_BRACKET, "["),
            (SECTION_NAME, "Unit"),
            (RIGHT_BRACKET, "]"),
            (NEWLINE, "\n"),
        ];
        assert_eq!(tokenize("[Unit]\n"), expected);
    }

    #[test]
    fn test_key_value() {
        let expected = vec![
            (KEY, "Description"),
            (EQUALS, "="),
            (VALUE, "Test Service"),
            (NEWLINE, "\n"),
        ];
        assert_eq!(tokenize("Description=Test Service\n"), expected);
    }

    #[test]
    fn test_key_value_with_spaces() {
        // Whitespace around `=` becomes its own tokens rather than part of key or value.
        let expected = vec![
            (KEY, "Description"),
            (WHITESPACE, " "),
            (EQUALS, "="),
            (WHITESPACE, " "),
            (VALUE, "Test Service"),
            (NEWLINE, "\n"),
        ];
        assert_eq!(tokenize("Description = Test Service\n"), expected);
    }

    #[test]
    fn test_comment_hash() {
        let expected = vec![(COMMENT, "# This is a comment"), (NEWLINE, "\n")];
        assert_eq!(tokenize("# This is a comment\n"), expected);
    }

    #[test]
    fn test_comment_semicolon() {
        let expected = vec![(COMMENT, "; This is a comment"), (NEWLINE, "\n")];
        assert_eq!(tokenize("; This is a comment\n"), expected);
    }

    #[test]
    fn test_line_continuation() {
        let tokens = tokenize("ExecStart=/bin/echo \\\n  hello\n");
        let expected = [
            (KEY, "ExecStart"),
            (EQUALS, "="),
            (VALUE, "/bin/echo "),
            (LINE_CONTINUATION, "\\\n"),
            (WHITESPACE, "  "),
            (VALUE, "hello"),
            (NEWLINE, "\n"),
        ];
        // Check the leading tokens one by one, in order.
        for (idx, want) in expected.iter().enumerate() {
            assert_eq!(tokens[idx], *want);
        }
    }

    #[test]
    fn test_full_unit_file() {
        let input = concat!(
            "[Unit]\n",
            "Description=Test Service\n",
            "After=network.target\n",
            "\n",
            "[Service]\n",
            "Type=simple\n",
            "ExecStart=/usr/bin/test\n",
        );
        let tokens = tokenize(input);

        // The file opens with the `[Unit]` header line.
        let header_kinds: Vec<_> = tokens.iter().take(4).map(|(kind, _)| *kind).collect();
        assert_eq!(
            header_kinds,
            [LEFT_BRACKET, SECTION_NAME, RIGHT_BRACKET, NEWLINE]
        );
        assert_eq!(tokens[1].1, "Unit");

        // Locate the `Description` key and check the `=` and value that follow it.
        let desc_idx = tokens
            .iter()
            .position(|&(kind, text)| kind == KEY && text == "Description")
            .expect("Description key should be lexed");
        assert_eq!(tokens[desc_idx + 1].0, EQUALS);
        assert_eq!(tokens[desc_idx + 2], (VALUE, "Test Service"));
    }

    #[test]
    fn test_blank_lines() {
        let tokens = tokenize("Key=Value\n\nKey2=Value2\n");

        // The newline ending the first entry is directly followed by the blank line's newline.
        let first_newline = tokens
            .iter()
            .position(|&(kind, _)| kind == NEWLINE)
            .expect("input contains a newline");
        assert_eq!(tokens[first_newline + 1].0, NEWLINE);
    }
}