desktop_edit/
lex.rs

1//! Lexer for INI/.desktop files
2
/// Kinds of tokens and tree nodes used for INI/.desktop files.
///
/// The enum is `#[repr(u16)]`; the numeric value of each variant is what the
/// `From<SyntaxKind> for rowan::SyntaxKind` conversion passes on, so the
/// values are written out explicitly here.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[allow(non_camel_case_types)]
#[repr(u16)]
pub enum SyntaxKind {
    // ---- tokens ----
    /// The `[` character
    LEFT_BRACKET = 0,
    /// The `]` character
    RIGHT_BRACKET = 1,
    /// The `=` separating a key from its value
    EQUALS = 2,
    /// Name of a key, such as "Name" or "Type"
    KEY = 3,
    /// Name of a section, such as "Desktop Entry"
    ///
    /// NOTE(review): the lexer in this file yields the text between brackets
    /// as `VALUE`, not as this kind -- confirm where (if anywhere) it is used.
    SECTION_NAME = 4,
    /// Locale suffix of a key, such as "[de_DE]" in "Name[de_DE]"
    ///
    /// NOTE(review): as with `SECTION_NAME`, the lexer in this file does not
    /// appear to produce this kind itself.
    LOCALE = 5,
    /// The value half of `key=value`
    VALUE = 6,
    /// A comment introduced by `#`
    COMMENT = 7,
    /// A line break, either `\n` or `\r\n`
    NEWLINE = 8,
    /// A run of spaces and/or tabs
    WHITESPACE = 9,
    /// Input that could not be classified
    ERROR = 10,

    // ---- nodes ----
    /// The whole file
    ROOT = 11,
    /// A section together with the entries belonging to it
    GROUP = 12,
    /// The `[Section Name]` line that opens a group
    GROUP_HEADER = 13,
    /// One `Key=Value` or `Key[locale]=Value` line
    ENTRY = 14,
    /// A blank line
    BLANK_LINE = 15,
}
42
43/// Convert our `SyntaxKind` into the rowan `SyntaxKind`.
44impl From<SyntaxKind> for rowan::SyntaxKind {
45    fn from(kind: SyntaxKind) -> Self {
46        Self(kind as u16)
47    }
48}
49
/// Report whether `c` may be the first character of a key name.
#[inline]
fn is_valid_initial_key_char(c: char) -> bool {
    // A key has to open with an ASCII letter or digit.
    matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9')
}
56
/// Report whether `c` may appear inside a key name.
#[inline]
fn is_valid_key_char(c: char) -> bool {
    // Allowed: ASCII letters, ASCII digits and the hyphen.
    matches!(c, '-' | 'A'..='Z' | 'a'..='z' | '0'..='9')
}
63
/// Report whether `c` is a line-break character (`\n` or `\r`).
#[inline]
fn is_newline(c: char) -> bool {
    matches!(c, '\n' | '\r')
}
69
/// Report whether `c` is horizontal whitespace, i.e. a space or a tab.
#[inline]
fn is_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
75
76/// Lexer implementation
77fn lex_impl(input: &str) -> impl Iterator<Item = (SyntaxKind, &str)> + '_ {
78    let mut remaining = input;
79    let mut at_line_start = true;
80    let mut in_section_header = false;
81    let mut in_locale = false;
82
83    std::iter::from_fn(move || {
84        if remaining.is_empty() {
85            return None;
86        }
87
88        let c = remaining.chars().next()?;
89
90        match c {
91            // Newline
92            _ if is_newline(c) => {
93                let char_len = c.len_utf8();
94                // Handle \r\n as a single newline
95                if c == '\r' && remaining.get(1..2) == Some("\n") {
96                    let (token, rest) = remaining.split_at(2);
97                    remaining = rest;
98                    at_line_start = true;
99                    in_section_header = false;
100                    in_locale = false;
101                    Some((SyntaxKind::NEWLINE, token))
102                } else {
103                    let (token, rest) = remaining.split_at(char_len);
104                    remaining = rest;
105                    at_line_start = true;
106                    in_section_header = false;
107                    in_locale = false;
108                    Some((SyntaxKind::NEWLINE, token))
109                }
110            }
111
112            // Comment (# at start of line or after whitespace)
113            '#' if at_line_start => {
114                let end = remaining.find(is_newline).unwrap_or(remaining.len());
115                let (token, rest) = remaining.split_at(end);
116                remaining = rest;
117                Some((SyntaxKind::COMMENT, token))
118            }
119
120            // Section header [Section Name]
121            '[' if at_line_start => {
122                remaining = &remaining[1..]; // consume '['
123                at_line_start = false;
124                in_section_header = true;
125                Some((SyntaxKind::LEFT_BRACKET, "["))
126            }
127
128            // Left bracket in key-value context (for locale like Name[de])
129            '[' => {
130                remaining = &remaining[1..]; // consume '['
131                in_locale = true;
132                Some((SyntaxKind::LEFT_BRACKET, "["))
133            }
134
135            ']' => {
136                remaining = &remaining[1..]; // consume ']'
137                in_section_header = false;
138                in_locale = false;
139                Some((SyntaxKind::RIGHT_BRACKET, "]"))
140            }
141
142            // Whitespace at start of line - could be blank line
143            _ if is_whitespace(c) && at_line_start => {
144                let end = remaining
145                    .find(|c| !is_whitespace(c))
146                    .unwrap_or(remaining.len());
147                let (token, rest) = remaining.split_at(end);
148                remaining = rest;
149                // Check if this is followed by newline or EOF (blank line)
150                // Otherwise it's just leading whitespace before a key
151                Some((SyntaxKind::WHITESPACE, token))
152            }
153
154            // Whitespace (not at line start)
155            _ if is_whitespace(c) => {
156                let end = remaining
157                    .find(|c| !is_whitespace(c))
158                    .unwrap_or(remaining.len());
159                let (token, rest) = remaining.split_at(end);
160                remaining = rest;
161                Some((SyntaxKind::WHITESPACE, token))
162            }
163
164            // Equals sign
165            '=' => {
166                remaining = &remaining[1..];
167                Some((SyntaxKind::EQUALS, "="))
168            }
169
170            // Key name (starts with alphanumeric)
171            _ if is_valid_initial_key_char(c) && at_line_start => {
172                let end = remaining
173                    .find(|c: char| !is_valid_key_char(c))
174                    .unwrap_or(remaining.len());
175                let (token, rest) = remaining.split_at(end);
176                remaining = rest;
177                at_line_start = false;
178                Some((SyntaxKind::KEY, token))
179            }
180
181            // Locale identifier or section name (between [ and ])
182            _ if in_section_header || in_locale => {
183                // Inside brackets - read until ]
184                let end = remaining.find(']').unwrap_or(remaining.len());
185                let (token, rest) = remaining.split_at(end);
186                remaining = rest;
187                Some((SyntaxKind::VALUE, token))
188            }
189
190            // Value (everything else on a line)
191            _ if !at_line_start => {
192                // Everything else on the line is a value
193                let end = remaining.find(is_newline).unwrap_or(remaining.len());
194                let (token, rest) = remaining.split_at(end);
195                remaining = rest;
196                Some((SyntaxKind::VALUE, token))
197            }
198
199            // Error: unexpected character at line start
200            _ => {
201                let char_len = c.len_utf8();
202                let (token, rest) = remaining.split_at(char_len);
203                remaining = rest;
204                at_line_start = false;
205                Some((SyntaxKind::ERROR, token))
206            }
207        }
208    })
209}
210
/// Lex an INI/.desktop file into tokens.
///
/// Crate-internal entry point of the lexer; it simply forwards to
/// `lex_impl`. The result is a lazy iterator of `(kind, text)` pairs, so no
/// work happens until the iterator is advanced.
pub(crate) fn lex(input: &str) -> impl Iterator<Item = (SyntaxKind, &str)> {
    lex_impl(input)
}
215
#[cfg(test)]
mod tests {
    use super::SyntaxKind::*;
    use super::*;

    /// Run the lexer over `input` and gather every token it yields.
    fn tokens(input: &str) -> Vec<(SyntaxKind, &str)> {
        lex(input).collect()
    }

    #[test]
    fn test_empty() {
        // Empty input produces no tokens at all.
        assert!(tokens("").is_empty());
    }

    #[test]
    fn test_simple_section() {
        // The section name between the brackets comes out as a VALUE token.
        assert_eq!(
            tokens("[Desktop Entry]\n"),
            [
                (LEFT_BRACKET, "["),
                (VALUE, "Desktop Entry"),
                (RIGHT_BRACKET, "]"),
                (NEWLINE, "\n"),
            ]
        );
    }

    #[test]
    fn test_key_value() {
        assert_eq!(
            tokens("Name=Example\n"),
            [
                (KEY, "Name"),
                (EQUALS, "="),
                (VALUE, "Example"),
                (NEWLINE, "\n"),
            ]
        );
    }

    #[test]
    fn test_key_value_with_spaces() {
        // Whitespace around `=` is tokenized separately; spaces inside the
        // value stay part of the VALUE token.
        assert_eq!(
            tokens("Name = Example Application\n"),
            [
                (KEY, "Name"),
                (WHITESPACE, " "),
                (EQUALS, "="),
                (WHITESPACE, " "),
                (VALUE, "Example Application"),
                (NEWLINE, "\n"),
            ]
        );
    }

    #[test]
    fn test_comment() {
        // The comment token excludes the trailing line break.
        assert_eq!(
            tokens("# This is a comment\n"),
            [(COMMENT, "# This is a comment"), (NEWLINE, "\n")]
        );
    }

    #[test]
    fn test_full_desktop_file() {
        let input = r#"[Desktop Entry]
Name=Example
Type=Application
Exec=example
# Comment
Icon=example.png

[Desktop Action Play]
Name=Play
Exec=example --play
"#;
        let toks = tokens(input);

        // The file opens with the `[Desktop Entry]` header line.
        let kinds: Vec<SyntaxKind> = toks.iter().map(|(kind, _)| *kind).collect();
        assert_eq!(kinds[..4], [LEFT_BRACKET, VALUE, RIGHT_BRACKET, NEWLINE]);

        // The first `Name` key is followed by `=` and the value "Example".
        let name_idx = toks
            .iter()
            .position(|&tok| tok == (KEY, "Name"))
            .unwrap();
        assert_eq!(toks[name_idx + 1].0, EQUALS);
        assert_eq!(toks[name_idx + 2], (VALUE, "Example"));
    }

    #[test]
    fn test_blank_lines() {
        let toks = tokens("Key=Value\n\nKey2=Value2\n");

        // The blank line shows up as a NEWLINE directly after the first one.
        let first_newline = toks
            .iter()
            .position(|(kind, _)| *kind == NEWLINE)
            .unwrap();
        assert_eq!(toks[first_newline + 1].0, NEWLINE);
    }
}