fea_rs/parse/
lexer.rs

1//! Scan a FEA file, producing a sequence of tokens.
2//!
3//! This is the first step in our parsing process. The tokens produced here
4//! have no semantic information; for instance we do not try to distinguish a
5//! keyword from a glyph name. Instead we are just describing the most basic
6//! structure of the document.
7//!
8//! The `Lexer` type is driven by a [Parser].
9//!
10//! [Parser]: super::Parser
11
12mod lexeme;
13mod token_set;
14
15pub(crate) use lexeme::{Kind, Lexeme};
16pub use token_set::TokenSet;
17
/// Sentinel byte returned by `nth`/`bump` when reading past the end of input.
const EOF: u8 = 0x0;
19
/// Scans FEA text, producing one [`Lexeme`] per call to `next_token`.
pub(crate) struct Lexer<'a> {
    // the full text being tokenized
    input: &'a str,
    // byte offset of the next unconsumed byte in `input`
    pos: usize,
    // true when the previous token was a backslash; makes digits lex as
    // CIDs and suppresses keyword recognition for the following ident
    after_backslash: bool,
    // true when the previous token was a number or float; lets a lone
    // 'n'/'u'/'d' lex as a number suffix
    after_number_or_float: bool,
    // state machine for `include(..)` path handling
    in_path: ExpectingPath,
}
27
// simple state machine for tracking whether we should be parsing a path.
//
// paths are complicated because suddenly we stop tokenizing, and just
// glom everything together up to the closing parens.
#[derive(Clone, Copy, Default)]
enum ExpectingPath {
    // normal tokenizing; nothing include-related seen recently.
    #[default]
    Ready,
    // we have seen the 'include' keyword. This means if the next token is a paren,
    // we enter the 'InPath' state.
    SawInclude,
    // between the parens of an include statement: bytes are glommed into a
    // single Path token instead of being tokenized normally.
    InPath,
}
41
42impl ExpectingPath {
43    fn in_path(self) -> bool {
44        matches!(self, ExpectingPath::InPath)
45    }
46
47    fn transition(&mut self, kind: Kind) {
48        *self = match (*self, kind) {
49            (ExpectingPath::Ready, Kind::IncludeKw) => ExpectingPath::SawInclude,
50            (ExpectingPath::SawInclude, Kind::LParen) => ExpectingPath::InPath,
51            // don't transition if we see whitespace after include, e.g,
52            // include (hi.fea)
53            (ExpectingPath::SawInclude, Kind::Whitespace) => ExpectingPath::SawInclude,
54            _ => ExpectingPath::Ready,
55        }
56    }
57}
58
impl<'a> Lexer<'a> {
    /// Create a lexer over `input`, positioned at the first byte.
    pub(crate) fn new(input: &'a str) -> Self {
        Lexer {
            input,
            pos: 0,
            after_backslash: false,
            after_number_or_float: false,
            in_path: Default::default(),
        }
    }

    /// Peek at the byte `index` positions past the cursor without consuming
    /// it; returns `EOF` when that position is past the end of input.
    fn nth(&self, index: usize) -> u8 {
        self.input
            .as_bytes()
            .get(self.pos + index)
            .copied()
            .unwrap_or(EOF)
    }

    /// Consume and return the next byte, or `None` at end of input.
    fn bump(&mut self) -> Option<u8> {
        let pos = self.pos;
        let next = self.input.as_bytes().get(pos).copied();
        // only advance if there was actually a byte to consume, so repeated
        // calls at the end of input stay put
        self.pos += usize::from(next.is_some());
        next
    }

    /// Scan and return the next lexeme.
    ///
    /// At end of input this returns a zero-length `Kind::Eof` lexeme.
    pub(crate) fn next_token(&mut self) -> Lexeme {
        let start_pos = self.pos;
        // the first byte of the token is consumed here; the helper methods
        // below only eat the remainder.
        let first = self.bump().unwrap_or(EOF);
        let kind = match first {
            EOF => Kind::Eof,
            // inside `include( .. )` everything up to the closing paren is
            // glommed into a single path token, whatever it contains
            _ if self.in_path.in_path() => self.path(),
            byte if is_ascii_whitespace(byte) => self.whitespace(),
            b'#' => self.comment(),
            b'"' => self.string(),
            // digits right after a backslash form a CID, e.g. `\42`
            b'0'..=b'9' if self.after_backslash => self.cid(),
            // a leading zero may introduce a hex or octal literal
            b'0' => self.number(true),
            b'1'..=b'9' => self.number(false),
            b';' => Kind::Semi,
            b':' => Kind::Colon,
            b',' => Kind::Comma,
            b'@' => self.glyph_class_name(),
            b'\\' => Kind::Backslash,
            b'-' => self.hyphen_or_minus(),
            b'=' => Kind::Eq,
            b'{' => Kind::LBrace,
            b'}' => Kind::RBrace,
            b'[' => Kind::LSquare,
            b']' => Kind::RSquare,
            b'(' => Kind::LParen,
            b')' => Kind::RParen,
            b'<' => Kind::LAngle,
            b'>' => Kind::RAngle,
            b'\'' => Kind::SingleQuote,
            b'$' => Kind::Dollar,
            b'*' => Kind::Asterisk,
            b'+' => Kind::Plus,
            b'/' => Kind::Slash,
            // a single 'n'/'u'/'d' immediately after a number is a suffix
            b'n' | b'u' | b'd' if self.after_number_or_float => Kind::NumberSuffix,
            _ => self.ident(),
        };
        self.in_path.transition(kind);

        // record context for the *next* token, based on what we just lexed
        self.after_backslash = matches!(kind, Kind::Backslash);
        self.after_number_or_float = matches!(kind, Kind::Number | Kind::Float);

        let len = self.pos - start_pos;
        Lexeme { len, kind }
    }

    /// Eat a run of whitespace (first byte already consumed).
    fn whitespace(&mut self) -> Kind {
        while is_ascii_whitespace(self.nth(0)) {
            self.bump();
        }
        Kind::Whitespace
    }

    /// Eat a comment: everything up to (not including) the newline or EOF.
    fn comment(&mut self) -> Kind {
        while ![b'\n', EOF].contains(&self.nth(0)) {
            self.bump();
        }
        Kind::Comment
    }

    /// Eat a string literal; the opening quote is already consumed.
    /// Hitting EOF before the closing quote yields `StringUnterminated`.
    fn string(&mut self) -> Kind {
        loop {
            match self.nth(0) {
                b'"' => {
                    self.bump();
                    break Kind::String;
                }
                EOF => break Kind::StringUnterminated,
                _ => {
                    self.bump();
                }
            }
        }
    }

    /// Decide whether a consumed '-' starts a negative number or stands
    /// alone as a hyphen.
    fn hyphen_or_minus(&mut self) -> Kind {
        if self.nth(0) == b'0' {
            // octal, so this is a hyphen (and an error)
            if self.nth(1).is_ascii_digit() {
                return Kind::Hyphen;
            }
            // hex: ditto
            if [b'x', b'X'].contains(&self.nth(1)) {
                return Kind::Hyphen;
            }
        }
        if self.nth(0).is_ascii_digit() {
            return self.number(false);
        }

        Kind::Hyphen
    }

    /// Eat the rest of a numeric literal (first digit already consumed).
    ///
    /// `leading_zero` is true when that digit was '0', which may introduce
    /// hex (`0x..`) or octal (`0..`) forms. In the decimal branch a '.'
    /// produces a Float; a trailing bare '.' (as in `1.`) is accepted.
    fn number(&mut self, leading_zero: bool) -> Kind {
        if leading_zero && self.nth(0) != b'.' {
            if [b'x', b'X'].contains(&self.nth(0)) {
                self.bump();
                if self.nth(0).is_ascii_hexdigit() {
                    self.eat_hex_digits();
                    Kind::Hex
                } else {
                    // '0x' with no digits following
                    Kind::HexEmpty
                }
            } else if self.nth(0).is_ascii_digit() {
                self.eat_octal_digits();
                Kind::Octal
            } else {
                // just '0'
                Kind::Number
            }
        } else {
            self.eat_decimal_digits();
            if self.nth(0) == b'.' {
                self.bump();
                self.eat_decimal_digits();
                Kind::Float
            } else {
                Kind::Number
            }
        }
    }

    // NOTE: eats octal digits only (0-7); an '8' or '9' ends the token.
    fn eat_octal_digits(&mut self) {
        while matches!(self.nth(0), b'0'..=b'7') {
            self.bump();
        }
    }
    fn eat_hex_digits(&mut self) {
        while self.nth(0).is_ascii_hexdigit() {
            self.bump();
        }
    }

    fn eat_decimal_digits(&mut self) {
        while self.nth(0).is_ascii_digit() {
            self.bump();
        }
    }

    /// Eat the remaining digits of a CID (first digit already consumed).
    fn cid(&mut self) -> Kind {
        self.eat_decimal_digits();
        Kind::Cid
    }

    /// Eat a glyph class name; the leading '@' is already consumed.
    fn glyph_class_name(&mut self) -> Kind {
        self.eat_ident();
        Kind::NamedGlyphClass
    }

    /// Advance past identifier bytes: anything that is not whitespace or a
    /// special byte. '-' is special but explicitly allowed inside an ident.
    fn eat_ident(&mut self) {
        loop {
            match self.nth(0) {
                EOF => break,
                b if is_ascii_whitespace(b) => break,
                // hyphens are permitted within identifiers
                b'-' => (),
                b if is_special(b) => break,
                _ => (),
            }
            self.bump();
        }
    }

    /// super dumb for now; we eat anything that isn't whitespace or special char.
    fn ident(&mut self) -> Kind {
        // back up one byte: next_token already consumed the ident's first byte
        let start_pos = self.pos.saturating_sub(1);
        self.eat_ident();

        // after a backslash, keywords are escaped: always a plain ident
        if self.after_backslash {
            return Kind::Ident;
        }

        let raw_token = &self.input.as_bytes()[start_pos..self.pos];
        Kind::from_keyword(raw_token).unwrap_or(Kind::Ident)
    }

    /// Eat an include path: everything up to (not including) ')' or EOF.
    fn path(&mut self) -> Kind {
        while !matches!(self.nth(0), EOF | b')') {
            self.bump();
        }
        Kind::Path
    }
}
265
/// Collect every token in `text` (excluding the trailing Eof) into a vec.
#[cfg(test)]
pub(crate) fn tokenize(text: &str) -> Vec<Lexeme> {
    iter_tokens(text).collect::<Vec<_>>()
}
270
/// Iterate the tokens of `text`, stopping before the trailing Eof token.
#[cfg(test)]
pub(crate) fn iter_tokens(text: &str) -> impl Iterator<Item = Lexeme> + '_ {
    let mut lexer = Lexer::new(text);
    std::iter::from_fn(move || Some(lexer.next_token()))
        .take_while(|lexeme| !matches!(lexeme.kind, Kind::Eof))
}
282
// bytes that end an identifier / carry their own token kind:
// ' ( ) * + , - ; < = > ? @ [ \ ] { }
fn is_special(byte: u8) -> bool {
    matches!(
        byte,
        b'\''..=b'-' | b';'..=b'@' | b'['..=b']' | b'{' | b'}'
    )
}
291
// space plus the ASCII control whitespace bytes: \t \n VT FF \r (0x09..=0x0D)
fn is_ascii_whitespace(byte: u8) -> bool {
    matches!(byte, b' ' | b'\t'..=b'\r')
}
295
/// Render tokens as "start..end KIND" strings, tracking byte offsets.
#[cfg(test)]
pub(crate) fn debug_tokens(tokens: &[Lexeme]) -> Vec<String> {
    let mut out = Vec::with_capacity(tokens.len());
    let mut start = 0;
    for token in tokens {
        let end = start + token.len;
        out.push(format!("{}..{} {}", start, end, token.kind));
        start = end;
    }
    out
}
306
/// Render tokens as "KIND(text)" strings (or bare "KIND" when the kind
/// carries no interesting contents), slicing the text out of `src`.
#[cfg(test)]
pub(crate) fn debug_tokens2(tokens: &[Lexeme], src: &str) -> Vec<String> {
    let mut out = Vec::with_capacity(tokens.len());
    let mut start = 0;
    for token in tokens {
        let end = start + token.len;
        if token.kind.has_contents() {
            out.push(format!("{}({})", token.kind, &src[start..end]));
        } else {
            out.push(format!("{}", token.kind));
        }
        start = end;
    }
    out
}
322
#[cfg(test)]
mod tests {
    use super::*;

    // '0x' with no digits is HEX EMPTY; '0xzz' splits into HEX EMPTY + ident.
    #[test]
    fn empty_hex() {
        let fea = "0x 0x11 0xzz";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens(&tokens);
        assert_eq!(token_strs[0], "0..2 HEX EMPTY");
        assert_eq!(token_strs[1], "2..3 WS");
        assert_eq!(token_strs[2], "3..7 HEX");
        assert_eq!(token_strs[3], "7..8 WS");
        assert_eq!(token_strs[4], "8..10 HEX EMPTY");
        assert_eq!(token_strs[5], "10..12 ID");
    }

    // decimal, octal, and float forms, with and without a leading minus.
    #[test]
    fn numbers() {
        let fea = "0 001 10 1. 1.0 -1 -1. -1.5";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "NUM(0)");
        assert_eq!(token_strs[2], "OCT(001)");
        assert_eq!(token_strs[4], "NUM(10)");
        assert_eq!(token_strs[6], "FLOAT(1.)");
        assert_eq!(token_strs[8], "FLOAT(1.0)");
        assert_eq!(token_strs[10], "NUM(-1)");
        assert_eq!(token_strs[12], "FLOAT(-1.)");
    }

    // a '-' before an octal/hex literal stays a lone hyphen token.
    #[test]
    fn bad_numbers() {
        let fea = "-00 -0x1 -0x -ff";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "-");
        assert_eq!(token_strs[1], "OCT(00)");
        assert_eq!(token_strs[3], "-");
        assert_eq!(token_strs[4], "HEX(0x1)");
        assert_eq!(token_strs[6], "-");
        assert_eq!(token_strs[7], "HEX EMPTY(0x)");
        assert_eq!(token_strs[9], "-");
        assert_eq!(token_strs[10], "ID(ff)");
    }

    // keywords are recognized; everything else is a plain ident.
    #[test]
    fn languagesystem() {
        let fea = "languagesystem dflt cool;";
        let tokens = tokenize(fea);
        assert_eq!(tokens[0].len, 14);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "LanguagesystemKw");
        assert_eq!(token_strs[1], "WS( )");
        assert_eq!(token_strs[2], "ID(dflt)");
        assert_eq!(token_strs[3], "WS( )");
        assert_eq!(token_strs[4], "ID(cool)");
        assert_eq!(token_strs[5], ";");
    }

    // a backslash escapes a keyword into a plain ident.
    #[test]
    fn escaping_keywords() {
        let fea = "sub \\sub \\rsub";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "SubKw");
        assert_eq!(token_strs[1], "WS( )");
        assert_eq!(token_strs[2], "\\");
        assert_eq!(token_strs[3], "ID(sub)");
        assert_eq!(token_strs[4], "WS( )");
        assert_eq!(token_strs[5], "\\");
        assert_eq!(token_strs[6], "ID(rsub)");
    }

    // digits after a backslash are CIDs, not idents or numbers.
    #[test]
    fn cid_versus_ident() {
        let fea = "@hi =[\\1-\\2 a - b];";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "@GlyphClass(@hi)");
        assert_eq!(token_strs[1], "WS( )");
        assert_eq!(token_strs[2], "=");
        assert_eq!(token_strs[3], "[");
        assert_eq!(token_strs[4], "\\");
        assert_eq!(token_strs[5], "CID(1)");
        assert_eq!(token_strs[6], "-");
        assert_eq!(token_strs[7], "\\");
        assert_eq!(token_strs[8], "CID(2)");
        assert_eq!(token_strs[9], "WS( )");
        assert_eq!(token_strs[10], "ID(a)");
        assert_eq!(token_strs[11], "WS( )");
        assert_eq!(token_strs[12], "-");
        assert_eq!(token_strs[13], "WS( )");
        assert_eq!(token_strs[14], "ID(b)");
        assert_eq!(token_strs[15], "]");
        assert_eq!(token_strs[16], ";");
    }

    // comments run to end-of-line and swallow special characters.
    #[test]
    fn trivia() {
        let fea = "# OpenType 4.h\n# -@,\nlanguagesystem DFLT cool;";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "#(# OpenType 4.h)");
        assert_eq!(token_strs[1], "WS(\n)");
        assert_eq!(token_strs[2], "#(# -@,)");
        assert_eq!(token_strs[3], "WS(\n)");
        assert_eq!(token_strs[4], "LanguagesystemKw");
        assert_eq!(token_strs[5], "WS( )");
        assert_eq!(token_strs[6], "ID(DFLT)");
        assert_eq!(token_strs[7], "WS( )");
        assert_eq!(token_strs[8], "ID(cool)");
        assert_eq!(token_strs[9], ";");
    }

    // 'n'/'u'/'d' immediately after a number or float is a suffix token.
    #[test]
    fn suffixes_good() {
        let fea = "1n -5.3u 31.1d 0n";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "NUM(1)");
        assert_eq!(token_strs[1], "SUFFIX(n)");
        assert_eq!(token_strs[3], "FLOAT(-5.3)");
        assert_eq!(token_strs[4], "SUFFIX(u)");
        assert_eq!(token_strs[6], "FLOAT(31.1)");
        assert_eq!(token_strs[7], "SUFFIX(d)");
        assert_eq!(token_strs[9], "NUM(0)");
        assert_eq!(token_strs[10], "SUFFIX(n)");
    }

    // whitespace is allowed between 'include' and the opening paren;
    // the path token keeps its surrounding spaces verbatim.
    #[test]
    fn include_with_spaces() {
        let fea = "include ( path.fea );";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "IncludeKw");
        assert_eq!(token_strs[1], "WS( )");
        assert_eq!(token_strs[2], "(");
        assert_eq!(token_strs[3], "Path( path.fea )");
        assert_eq!(token_strs[4], ")");
        assert_eq!(token_strs[5], ";");
        assert!(token_strs.get(6).is_none());
    }
}
466}