1mod lexeme;
13mod token_set;
14
15pub(crate) use lexeme::{Kind, Lexeme};
16pub use token_set::TokenSet;
17
/// Sentinel byte returned by `Lexer::nth` when peeking past the end of input.
const EOF: u8 = 0x0;
19
/// A byte-oriented lexer that produces one `Lexeme` per call to `next_token`.
pub(crate) struct Lexer<'a> {
    // the source text being lexed
    input: &'a str,
    // byte offset of the cursor into `input`
    pos: usize,
    // true when the previous token was a backslash; a following digit run
    // then lexes as a CID instead of a number
    after_backslash: bool,
    // true when the previous token was a Number or Float; enables the
    // `n`/`u`/`d` NumberSuffix tokens
    after_number_or_float: bool,
    // state machine tracking progress through `include( ... )`
    in_path: ExpectingPath,
}
27
/// Tracks progress toward the path argument of an `include(...)` statement,
/// whose contents are lexed as a single `Path` token.
#[derive(Clone, Copy, Default)]
enum ExpectingPath {
    /// Not currently inside an include statement.
    #[default]
    Ready,
    /// The `include` keyword was just lexed.
    SawInclude,
    /// Past the opening paren: the next token is the path itself.
    InPath,
}
41
42impl ExpectingPath {
43 fn in_path(self) -> bool {
44 matches!(self, ExpectingPath::InPath)
45 }
46
47 fn transition(&mut self, kind: Kind) {
48 *self = match (*self, kind) {
49 (ExpectingPath::Ready, Kind::IncludeKw) => ExpectingPath::SawInclude,
50 (ExpectingPath::SawInclude, Kind::LParen) => ExpectingPath::InPath,
51 (ExpectingPath::SawInclude, Kind::Whitespace) => ExpectingPath::SawInclude,
54 _ => ExpectingPath::Ready,
55 }
56 }
57}
58
59impl<'a> Lexer<'a> {
60 pub(crate) fn new(input: &'a str) -> Self {
61 Lexer {
62 input,
63 pos: 0,
64 after_backslash: false,
65 after_number_or_float: false,
66 in_path: Default::default(),
67 }
68 }
69
70 fn nth(&self, index: usize) -> u8 {
71 self.input
72 .as_bytes()
73 .get(self.pos + index)
74 .copied()
75 .unwrap_or(EOF)
76 }
77
78 fn bump(&mut self) -> Option<u8> {
79 let pos = self.pos;
80 let next = self.input.as_bytes().get(pos).copied();
81 self.pos += usize::from(next.is_some());
82 next
83 }
84
85 pub(crate) fn next_token(&mut self) -> Lexeme {
86 let start_pos = self.pos;
87 let first = self.bump().unwrap_or(EOF);
88 let kind = match first {
89 EOF => Kind::Eof,
90 _ if self.in_path.in_path() => self.path(),
91 byte if is_ascii_whitespace(byte) => self.whitespace(),
92 b'#' => self.comment(),
93 b'"' => self.string(),
94 b'0'..=b'9' if self.after_backslash => self.cid(),
95 b'0' => self.number(true),
96 b'1'..=b'9' => self.number(false),
97 b';' => Kind::Semi,
98 b':' => Kind::Colon,
99 b',' => Kind::Comma,
100 b'@' => self.glyph_class_name(),
101 b'\\' => Kind::Backslash,
102 b'-' => self.hyphen_or_minus(),
103 b'=' => Kind::Eq,
104 b'{' => Kind::LBrace,
105 b'}' => Kind::RBrace,
106 b'[' => Kind::LSquare,
107 b']' => Kind::RSquare,
108 b'(' => Kind::LParen,
109 b')' => Kind::RParen,
110 b'<' => Kind::LAngle,
111 b'>' => Kind::RAngle,
112 b'\'' => Kind::SingleQuote,
113 b'$' => Kind::Dollar,
114 b'*' => Kind::Asterisk,
115 b'+' => Kind::Plus,
116 b'/' => Kind::Slash,
117 b'n' | b'u' | b'd' if self.after_number_or_float => Kind::NumberSuffix,
118 _ => self.ident(),
119 };
120 self.in_path.transition(kind);
121
122 self.after_backslash = matches!(kind, Kind::Backslash);
123 self.after_number_or_float = matches!(kind, Kind::Number | Kind::Float);
124
125 let len = self.pos - start_pos;
126 Lexeme { len, kind }
127 }
128
129 fn whitespace(&mut self) -> Kind {
130 while is_ascii_whitespace(self.nth(0)) {
131 self.bump();
132 }
133 Kind::Whitespace
134 }
135
136 fn comment(&mut self) -> Kind {
137 while ![b'\n', EOF].contains(&self.nth(0)) {
138 self.bump();
139 }
140 Kind::Comment
141 }
142
143 fn string(&mut self) -> Kind {
144 loop {
145 match self.nth(0) {
146 b'"' => {
147 self.bump();
148 break Kind::String;
149 }
150 EOF => break Kind::StringUnterminated,
151 _ => {
152 self.bump();
153 }
154 }
155 }
156 }
157
158 fn hyphen_or_minus(&mut self) -> Kind {
159 if self.nth(0) == b'0' {
160 if self.nth(1).is_ascii_digit() {
162 return Kind::Hyphen;
163 }
164 if [b'x', b'X'].contains(&self.nth(1)) {
166 return Kind::Hyphen;
167 }
168 }
169 if self.nth(0).is_ascii_digit() {
170 return self.number(false);
171 }
172
173 Kind::Hyphen
174 }
175
176 fn number(&mut self, leading_zero: bool) -> Kind {
177 if leading_zero && self.nth(0) != b'.' {
178 if [b'x', b'X'].contains(&self.nth(0)) {
179 self.bump();
180 if self.nth(0).is_ascii_hexdigit() {
181 self.eat_hex_digits();
182 Kind::Hex
183 } else {
184 Kind::HexEmpty
185 }
186 } else if self.nth(0).is_ascii_digit() {
187 self.eat_octal_digits();
188 Kind::Octal
189 } else {
190 Kind::Number
192 }
193 } else {
194 self.eat_decimal_digits();
195 if self.nth(0) == b'.' {
196 self.bump();
197 self.eat_decimal_digits();
198 Kind::Float
199 } else {
200 Kind::Number
201 }
202 }
203 }
204
205 fn eat_octal_digits(&mut self) {
206 while matches!(self.nth(0), b'0'..=b'7') {
207 self.bump();
208 }
209 }
210 fn eat_hex_digits(&mut self) {
211 while self.nth(0).is_ascii_hexdigit() {
212 self.bump();
213 }
214 }
215
216 fn eat_decimal_digits(&mut self) {
217 while self.nth(0).is_ascii_digit() {
218 self.bump();
219 }
220 }
221
222 fn cid(&mut self) -> Kind {
223 self.eat_decimal_digits();
224 Kind::Cid
225 }
226
227 fn glyph_class_name(&mut self) -> Kind {
228 self.eat_ident();
229 Kind::NamedGlyphClass
230 }
231
232 fn eat_ident(&mut self) {
233 loop {
234 match self.nth(0) {
235 EOF => break,
236 b if is_ascii_whitespace(b) => break,
237 b'-' => (),
238 b if is_special(b) => break,
239 _ => (),
240 }
241 self.bump();
242 }
243 }
244
245 fn ident(&mut self) -> Kind {
247 let start_pos = self.pos.saturating_sub(1);
248 self.eat_ident();
249
250 if self.after_backslash {
251 return Kind::Ident;
252 }
253
254 let raw_token = &self.input.as_bytes()[start_pos..self.pos];
255 Kind::from_keyword(raw_token).unwrap_or(Kind::Ident)
256 }
257
258 fn path(&mut self) -> Kind {
259 while !matches!(self.nth(0), EOF | b')') {
260 self.bump();
261 }
262 Kind::Path
263 }
264}
265
/// Collect every non-EOF token in `text` into a vec (test helper).
#[cfg(test)]
pub(crate) fn tokenize(text: &str) -> Vec<Lexeme> {
    let mut tokens = Vec::new();
    tokens.extend(iter_tokens(text));
    tokens
}
270
/// Iterate over the tokens of `text`, stopping at the EOF token (test helper).
#[cfg(test)]
pub(crate) fn iter_tokens(text: &str) -> impl Iterator<Item = Lexeme> + '_ {
    let mut lexer = Lexer::new(text);
    std::iter::from_fn(move || {
        let lexeme = lexer.next_token();
        if matches!(lexeme.kind, Kind::Eof) {
            None
        } else {
            Some(lexeme)
        }
    })
}
282
/// Punctuation bytes that terminate an identifier.
///
/// Spelled with byte-literal ranges instead of the raw ASCII codes
/// (39..=45, 59..=64, 91..=93, 123, 125) so the character classes are
/// readable. Note `-` falls inside the first range but is explicitly
/// allowed within identifiers by `eat_ident`.
fn is_special(byte: u8) -> bool {
    matches!(
        byte,
        // ' ( ) * + , -
        b'\''..=b'-'
        // ; < = > ? @
        | b';'..=b'@'
        // [ \ ]
        | b'['..=b']'
        | b'{'
        | b'}'
    )
}
291
/// ASCII whitespace as this lexer defines it: space plus the control range
/// TAB..=CR (0x09-0x0D). This deliberately includes vertical tab (0x0B),
/// which `u8::is_ascii_whitespace` in std does not, so we keep our own.
fn is_ascii_whitespace(byte: u8) -> bool {
    matches!(byte, b' ' | 0x9..=0xD)
}
295
/// Render each token as `"start..end KIND"` using running byte offsets
/// (test helper).
#[cfg(test)]
pub(crate) fn debug_tokens(tokens: &[Lexeme]) -> Vec<String> {
    let mut pos = 0;
    tokens
        .iter()
        .map(|token| {
            let start = pos;
            pos += token.len;
            format!("{}..{} {}", start, pos, token.kind)
        })
        .collect()
}
306
/// Render each token as `"KIND(text)"` (or bare `"KIND"` for tokens without
/// interesting contents), slicing the text out of `src` (test helper).
#[cfg(test)]
pub(crate) fn debug_tokens2(tokens: &[Lexeme], src: &str) -> Vec<String> {
    let mut pos = 0;
    let mut result = Vec::with_capacity(tokens.len());
    for token in tokens {
        let end = pos + token.len;
        if token.kind.has_contents() {
            result.push(format!("{}({})", token.kind, &src[pos..end]));
        } else {
            result.push(token.kind.to_string());
        }
        pos = end;
    }
    result
}
322
#[cfg(test)]
mod tests {
    use super::*;

    // `0x` with no digits lexes as HexEmpty; trailing non-hex chars become
    // a separate ident token.
    #[test]
    fn empty_hex() {
        let fea = "0x 0x11 0xzz";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens(&tokens);
        assert_eq!(token_strs[0], "0..2 HEX EMPTY");
        assert_eq!(token_strs[1], "2..3 WS");
        assert_eq!(token_strs[2], "3..7 HEX");
        assert_eq!(token_strs[3], "7..8 WS");
        assert_eq!(token_strs[4], "8..10 HEX EMPTY");
        assert_eq!(token_strs[5], "10..12 ID");
    }

    // Decimal vs octal vs float classification, including negative literals
    // (the `-` is folded into plain decimal numbers only).
    #[test]
    fn numbers() {
        let fea = "0 001 10 1. 1.0 -1 -1. -1.5";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "NUM(0)");
        assert_eq!(token_strs[2], "OCT(001)");
        assert_eq!(token_strs[4], "NUM(10)");
        assert_eq!(token_strs[6], "FLOAT(1.)");
        assert_eq!(token_strs[8], "FLOAT(1.0)");
        assert_eq!(token_strs[10], "NUM(-1)");
        assert_eq!(token_strs[12], "FLOAT(-1.)");
    }

    // A `-` before octal/hex-looking literals stays a standalone Hyphen
    // token rather than becoming part of the number.
    #[test]
    fn bad_numbers() {
        let fea = "-00 -0x1 -0x -ff";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "-");
        assert_eq!(token_strs[1], "OCT(00)");
        assert_eq!(token_strs[3], "-");
        assert_eq!(token_strs[4], "HEX(0x1)");
        assert_eq!(token_strs[6], "-");
        assert_eq!(token_strs[7], "HEX EMPTY(0x)");
        assert_eq!(token_strs[9], "-");
        assert_eq!(token_strs[10], "ID(ff)");
    }

    // Keywords are recognized; following words stay plain idents.
    #[test]
    fn languagesystem() {
        let fea = "languagesystem dflt cool;";
        let tokens = tokenize(fea);
        assert_eq!(tokens[0].len, 14);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "LanguagesystemKw");
        assert_eq!(token_strs[1], "WS( )");
        assert_eq!(token_strs[2], "ID(dflt)");
        assert_eq!(token_strs[3], "WS( )");
        assert_eq!(token_strs[4], "ID(cool)");
        assert_eq!(token_strs[5], ";");
    }

    // A `\` prefix escapes a keyword back to a plain ident.
    #[test]
    fn escaping_keywords() {
        let fea = "sub \\sub \\rsub";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "SubKw");
        assert_eq!(token_strs[1], "WS( )");
        assert_eq!(token_strs[2], "\\");
        assert_eq!(token_strs[3], "ID(sub)");
        assert_eq!(token_strs[4], "WS( )");
        assert_eq!(token_strs[5], "\\");
        assert_eq!(token_strs[6], "ID(rsub)");
    }

    // Digits after `\` lex as CIDs; bare digits/idents elsewhere do not.
    #[test]
    fn cid_versus_ident() {
        let fea = "@hi =[\\1-\\2 a - b];";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "@GlyphClass(@hi)");
        assert_eq!(token_strs[1], "WS( )");
        assert_eq!(token_strs[2], "=");
        assert_eq!(token_strs[3], "[");
        assert_eq!(token_strs[4], "\\");
        assert_eq!(token_strs[5], "CID(1)");
        assert_eq!(token_strs[6], "-");
        assert_eq!(token_strs[7], "\\");
        assert_eq!(token_strs[8], "CID(2)");
        assert_eq!(token_strs[9], "WS( )");
        assert_eq!(token_strs[10], "ID(a)");
        assert_eq!(token_strs[11], "WS( )");
        assert_eq!(token_strs[12], "-");
        assert_eq!(token_strs[13], "WS( )");
        assert_eq!(token_strs[14], "ID(b)");
        assert_eq!(token_strs[15], "]");
        assert_eq!(token_strs[16], ";");
    }

    // Comments run to end-of-line and swallow any punctuation inside them.
    #[test]
    fn trivia() {
        let fea = "# OpenType 4.h\n# -@,\nlanguagesystem DFLT cool;";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "#(# OpenType 4.h)");
        assert_eq!(token_strs[1], "WS(\n)");
        assert_eq!(token_strs[2], "#(# -@,)");
        assert_eq!(token_strs[3], "WS(\n)");
        assert_eq!(token_strs[4], "LanguagesystemKw");
        assert_eq!(token_strs[5], "WS( )");
        assert_eq!(token_strs[6], "ID(DFLT)");
        assert_eq!(token_strs[7], "WS( )");
        assert_eq!(token_strs[8], "ID(cool)");
        assert_eq!(token_strs[9], ";");
    }

    // `n`/`u`/`d` directly after a Number or Float lex as a suffix token.
    #[test]
    fn suffixes_good() {
        let fea = "1n -5.3u 31.1d 0n";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "NUM(1)");
        assert_eq!(token_strs[1], "SUFFIX(n)");
        assert_eq!(token_strs[3], "FLOAT(-5.3)");
        assert_eq!(token_strs[4], "SUFFIX(u)");
        assert_eq!(token_strs[6], "FLOAT(31.1)");
        assert_eq!(token_strs[7], "SUFFIX(d)");
        assert_eq!(token_strs[9], "NUM(0)");
        assert_eq!(token_strs[10], "SUFFIX(n)");
    }

    // The entire include argument, interior whitespace included, lexes as
    // one Path token.
    #[test]
    fn include_with_spaces() {
        let fea = "include ( path.fea );";
        let tokens = tokenize(fea);
        let token_strs = debug_tokens2(&tokens, fea);
        assert_eq!(token_strs[0], "IncludeKw");
        assert_eq!(token_strs[1], "WS( )");
        assert_eq!(token_strs[2], "(");
        assert_eq!(token_strs[3], "Path( path.fea )");
        assert_eq!(token_strs[4], ")");
        assert_eq!(token_strs[5], ";");
        assert!(token_strs.get(6).is_none());
    }
}
466}