// xkbcommon_rs/lexer.rs

1// based loosely on scanner.c
2/*
3 * Copyright © 2012 Ran Benita <ran234@gmail.com>
4 * Copyright © 2024 wysiwys
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
24 */
25
26pub(crate) use crate::lexer_utils::*;
27use crate::parser_utils::XkbFileParseError;
28use logos::Logos;
29use std::iter::Peekable;
30
// TODO: return the span/location information
/// Streaming lexer over an xkb source string.
///
/// Wraps a `logos` token stream and tracks `{`/`}` nesting so that
/// iteration can stop at the end of a single XkbFile block
/// (a top-level `{ ... };`).
pub(crate) struct Lexer<'input> {
    // Current brace nesting depth.
    bracket_depth: usize,
    // Set once a `}` brings the depth (back) to 0.
    closed_last_bracket: bool,
    // Set when the `;` after the closing brace is seen; `next()` then
    // yields `None` until `reset()` is called.
    finished_block: bool,
    token_stream: Peekable<logos::SpannedIter<'input, RawToken<'input>>>,
}
38
39impl<'input> Lexer<'input> {
40    pub(crate) fn new(input: &'input str) -> Result<Self, XkbFileParseError> {
41        let input = check_supported_char_encoding(input)
42            .map_err(|_| XkbFileParseError::WrongInputFormat)?;
43        Ok(Self {
44            bracket_depth: 0,
45            closed_last_bracket: false,
46            finished_block: false,
47            token_stream: RawToken::lexer(input).spanned().peekable(),
48        })
49    }
50    pub(crate) fn is_empty(&mut self) -> bool {
51        self.token_stream.peek().is_none()
52    }
53    pub(crate) fn reset(&mut self) {
54        self.bracket_depth = 0;
55        self.closed_last_bracket = false;
56        self.finished_block = false;
57    }
58}
59
60impl<'input> Iterator for Lexer<'input> {
61    type Item = Token;
62
63    fn next(&mut self) -> Option<Self::Item> {
64        // A bit hacky: `next` does not return an element if an XkbFile block is detected to have
65        // ended. This is done here because lalrpop currently (does not seem to) support parsing
66        // only part of the lexer's tokens, and then leaving off. However, we need to do this in
67        // order to parse e.g. only the first of several XkbFiles in a file.
68
69        if self.finished_block {
70            return None;
71        }
72        self.token_stream
73            .next()
74            .map(|(raw_token, _span)| match raw_token {
75                Ok(raw_token) => {
76                    let token = Token::from(raw_token);
77
78                    // Detect whether a block has ended.
79                    // When the bracket_depth is lowered to 0 and followed by a semicolon,
80                    // the block has ended.
81                    // TODO: should the struct also track whether an Obrace preceded the last
82                    // Cbrace?
83                    if token == Token::Obrace {
84                        self.bracket_depth += 1;
85                    } else if token == Token::Cbrace {
86                        if self.bracket_depth > 0 {
87                            self.bracket_depth -= 1;
88                        }
89                        if self.bracket_depth == 0 {
90                            self.closed_last_bracket = true;
91                        }
92                    } else if self.closed_last_bracket && token == Token::Semi {
93                        self.finished_block = true;
94                    }
95
96                    Some(token)
97                }
98                Err(_) => None,
99            })?
100    }
101}
102
103#[allow(dead_code)]
104#[derive(Logos, Debug, PartialEq)]
105enum RawToken<'input> {
106    #[regex("\"[^\"]*\"", priority = 5)]
107    String(&'input str),
108
109    #[regex(r"[[//]#][^\n]*[\n\r]?", |_| logos::Skip, priority=5)]
110    Comment,
111
112    // <is_graph*> but not <>
113    #[regex(r"<[\x21-\x3B\x3D\x3F-\x7E]*>", priority = 4)]
114    Keyname(&'input str),
115
116    #[regex("[ \x00\t\n]+", |_| logos::Skip, priority=3)]
117    Whitespace,
118
119    #[token(";", priority = 3)]
120    Semi,
121
122    #[token(r"{", priority = 3)]
123    Obrace,
124
125    #[token(r"}", priority = 3)]
126    Cbrace,
127
128    #[token("=", priority = 3)]
129    Equals,
130
131    #[token(r"[", priority = 3)]
132    Obracket,
133
134    #[token(r"]", priority = 3)]
135    Cbracket,
136
137    #[token(r"(", priority = 3)]
138    Oparen,
139
140    #[token(r")", priority = 3)]
141    Cparen,
142
143    #[token(r".", priority = 3)]
144    Dot,
145
146    #[token(",", priority = 3)]
147    Comma,
148
149    #[token("+", priority = 3)]
150    Plus,
151
152    #[token(r"-", priority = 3)]
153    Minus,
154
155    #[token(r"*", priority = 3)]
156    Times,
157
158    #[token(r"/", priority = 3)]
159    Divide,
160
161    #[token(r"!", priority = 3)]
162    Exclam,
163
164    #[token(r"~", priority = 3)]
165    Invert,
166
167    #[regex("[A-Za-z_][A-Za-z0-9_]*", priority = 2)]
168    Ident(&'input str),
169    #[regex("0[xX][0-9a-fA-F]+", |lex| hex_convert(lex.slice()), priority=1)]
170    HexNumber(u32),
171
172    #[regex("[0-9]+", |lex| lex.slice().parse().ok(), priority=1)]
173    UInt(u32),
174
175    #[regex(r"[0-9]*\.[0-9]+", |lex| lex.slice().parse().ok(), priority=1)]
176    Float(f64),
177}
/// Tokens handed to the parser: literals, punctuation, and the xkb
/// keyword set.
#[derive(Clone, Debug, PartialEq)]
pub(crate) enum Token {
    // Produced for comments/whitespace; logos skips these before they
    // reach the parser.
    Skip,
    // `<name>` keyname with the angle brackets stripped.
    Keyname(String),
    // Quoted string with quotes stripped and escape codes processed.
    String(String),
    // Identifier that did not match any keyword below.
    Ident(String),
    // Decimal or hexadecimal integer literal.
    UInt(u32),
    Float(f64),
    // Punctuation.
    Semi,
    Obrace,
    Cbrace,
    Equals,
    Obracket,
    Cbracket,
    Oparen,
    Cparen,
    Dot,
    Comma,
    Plus,
    Minus,
    Times,
    Divide,
    Exclam,
    Invert,
    // Keywords, produced from identifiers via `Token::keyword_match`.
    ActionTok,
    Alias,
    AlphanumericKeys,
    AlternateGroup,
    Alternate,
    Augment,
    Default,
    FunctionKeys,
    Group,
    Hidden,
    Include,
    Indicator,
    Interpret,
    KeypadKeys,
    Key,
    Keys,
    Logo,
    ModifierKeys,
    ModifierMap,
    Outline,
    Overlay,
    Override,
    Partial,
    Replace,
    Row,
    Section,
    Shape,
    Solid,
    Text,
    Type,
    VirtualMods,
    Virtual,
    XkbCompatmap,
    XkbGeometry,
    XkbKeycodes,
    XkbKeymap,
    XkbLayout,
    XkbSemantics,
    XkbSymbols,
    XkbTypes,
}
243
244impl std::fmt::Display for Token {
245    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
246        write!(f, "{:?}", self)
247    }
248}
249
impl<'token> From<RawToken<'token>> for Token {
    // Convert a raw lexeme into the parser-facing token, normalizing
    // string/keyname payloads and folding hex numbers into `UInt`.
    fn from(raw_token: RawToken) -> Self {
        match raw_token {
            // Unreachable in practice: logos skips these via callbacks;
            // kept so the match stays exhaustive.
            RawToken::Comment => Token::Skip,
            RawToken::Whitespace => Token::Skip,
            // remove brackets and process escape codes
            RawToken::String(s) => Token::String(process_string(s[1..s.len() - 1].as_bytes())),
            // identifiers may actually be keywords; consult the table
            RawToken::Ident(s) => Token::keyword_match(s),
            RawToken::UInt(s) => Token::UInt(s),
            RawToken::Float(f) => Token::Float(f),
            RawToken::Semi => Token::Semi,
            RawToken::Obrace => Token::Obrace,
            RawToken::Cbrace => Token::Cbrace,
            RawToken::Equals => Token::Equals,
            RawToken::Obracket => Token::Obracket,
            RawToken::Cbracket => Token::Cbracket,
            RawToken::Oparen => Token::Oparen,
            RawToken::Cparen => Token::Cparen,
            RawToken::Dot => Token::Dot,
            RawToken::Comma => Token::Comma,
            RawToken::Plus => Token::Plus,
            RawToken::Minus => Token::Minus,
            RawToken::Times => Token::Times,
            RawToken::Divide => Token::Divide,
            RawToken::Exclam => Token::Exclam,
            RawToken::Invert => Token::Invert,
            // remove brackets
            RawToken::Keyname(s) => Token::Keyname(s[1..s.len() - 1].into()),
            // both numeric literal forms collapse to `UInt`
            RawToken::HexNumber(u) => Token::UInt(u),
        }
    }
}
282
// Parse the digits of a `0x`/`0X` literal (prefix guaranteed by the
// lexer regex) as base-16; `None` if the value overflows `u32`.
fn hex_convert(token: &str) -> Option<u32> {
    let (_prefix, digits) = token.split_at(2);
    u32::from_str_radix(digits, 16).ok()
}
286
// based on string processing part of _xkbcommon_lex
// assumes outer quotes have been removed
//
// Recognized escapes (mirroring upstream scanner.c): \n \t \r \b \f \v
// \e \\ and octal \o, \oo, \ooo. Octal escapes that are NUL or do not
// fit in one byte are dropped; an unknown escape (e.g. `\9`) is dropped
// together with its backslash. A lone trailing backslash is copied
// through verbatim.
fn process_string(bytes: &[u8]) -> String {
    let len = bytes.len();
    let mut new: Vec<u8> = Vec::with_capacity(len);
    let mut i = 0;

    while i < len {
        if let Some(esc) = bytes.get(i..i + 2) {
            // Default: a two-byte escape sequence was consumed.
            let mut increment = 2;

            match esc {
                s if s.starts_with(&[b'\\']) => {
                    match s[1] as char {
                        'n' => new.push(b'\n'),
                        't' => new.push(b'\t'),
                        'r' => new.push(b'\r'),
                        // backspace, as in scanner.c (previously this
                        // incorrectly emitted a backslash)
                        'b' => new.push(0x08),
                        'f' => new.push(0x0c), // form feed page break
                        'v' => new.push(0x0b), // vertical tab
                        'e' => new.push(0x1b), // escape, octal \033
                        // escaped backslash (previously fell through to
                        // the octal branch and was silently dropped)
                        '\\' => new.push(b'\\'),
                        _ => {
                            // Octal escape: look at the next 1..3 bytes
                            // and keep the leading run of octal digits.
                            let octal = bytes
                                .get(i + 1..i + 4)
                                .or_else(|| bytes.get(i + 1..i + 3))
                                .or_else(|| s.get(1..2))
                                .unwrap()
                                .iter()
                                .map(|byte| *byte as char)
                                .take_while(|c| ('0'..='7').contains(c))
                                .collect::<String>();

                            if !octal.is_empty() {
                                if let Ok(c) = u8::from_str_radix(&octal, 8) {
                                    // skip \0, \00, \000 (embedded NUL)
                                    if c != 0 {
                                        new.push(c);
                                    }
                                }
                                // consume backslash + all octal digits
                                increment += octal.len() - 1;
                            }
                        }
                    }
                }
                // non-escape: copy the byte through
                s => {
                    new.push(s[0]);
                    increment = 1;
                }
            };
            i += increment;
        } else {
            // Final byte of the input (no room for an escape pair).
            assert_eq!(i + 1, len);

            new.push(bytes[i]);
            i += 1;
        }
    }

    String::from_utf8(new).expect("escaped string is not valid utf8")
}
350
351impl Token {
352    fn keyword_match(token: &str) -> Self {
353        crate::text::lookup_key(&crate::keywords::KEYWORDS, token)
354            .cloned()
355            .unwrap_or_else(|| Token::Ident(token.into()))
356    }
357}
358
#[cfg(test)]
mod test {

    // Strip the surrounding quotes (as the lexer does for a raw string
    // token) before running escape processing.
    fn test_process_string(s: &str) -> String {
        process_string(&s[1..s.len() - 1].as_bytes())
    }
    use super::*;
    #[test]
    fn test_string_process() {
        // Octal escapes that evaluate to NUL (\0, \00, \000) or that do
        // not fit in a byte (e.g. \456, \401) are dropped entirely; a
        // lone trailing backslash is copied through verbatim.
        assert_eq!(test_process_string(r#""""#), "");
        assert_eq!(test_process_string(r#""Test\e""#), "Test\x1b");
        assert_eq!(test_process_string(r#""Test\e1""#), "Test\x1b1");
        assert_eq!(test_process_string(r#""Test\00f""#), "Testf");
        assert_eq!(test_process_string(r#""Test\00\00\0f""#), "Testf");
        assert_eq!(test_process_string(r#""\456Test\00\00\082""#), "Test82");
        assert_eq!(test_process_string(r#""\456\00\00\081""#), "81");
        assert_eq!(test_process_string(r#""\000\00\0000\00\""#), r"0\");
        assert_eq!(test_process_string(r#""\000\00\000\00\""#), r"\");
        assert_eq!(test_process_string(r#""\000\00\0\00""#), r"");
        assert_eq!(test_process_string(r#""\456Test\0000""#), "Test0");
        // unknown escape `\9`: backslash and digit are both dropped
        assert_eq!(test_process_string(r#""Test\9f""#), "Testf");
        assert_eq!(test_process_string(r#""Test\1f""#), "Test\u{1}f");
        assert_eq!(test_process_string(r#""Test\1\2""#), "Test\u{1}\u{2}");
        assert_eq!(test_process_string(r#""Test\401\2""#), "Test\u{2}");
    }
}
384}