1#[cfg(test)]
2use alloc::{vec, vec::Vec};
3use anyhow::{Result, bail};
4use core::char;
5use core::fmt;
6use core::str;
7use unicode_xid::UnicodeXID;
8
9use self::Token::*;
10
/// Lexer state for a single source text.
///
/// Cloning is used by the lexer itself for lookahead: a clone is advanced and
/// only written back when the lookahead succeeds.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete text being tokenized; spans are resolved by slicing it.
    input: &'a str,
    /// Value added to every byte offset into `input` when a `Span` is built,
    /// and subtracted again when a `Span` is resolved back to text.
    span_offset: u32,
    /// Cursor over the characters of `input` that have not been consumed yet.
    chars: CrlfFold<'a>,
}
17
/// Character cursor whose `Iterator` impl reports a `\r\n` pair as a single
/// `\n` item.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte offset, char)` iterator over the source text.
    chars: str::CharIndices<'a>,
}
22
/// A pair of `u32` offsets marking where an item starts and ends.
///
/// A span whose `start` or `end` equals `u32::MAX` is treated as "unknown";
/// that is also the state produced by `Span::default()`.
#[derive(Eq, PartialEq, Debug, Clone, Copy, Hash)]
pub struct Span {
    start: u32,
    end: u32,
}

impl Default for Span {
    /// Returns the unknown span (both offsets set to `u32::MAX`).
    fn default() -> Span {
        Span {
            start: Span::UNKNOWN,
            end: Span::UNKNOWN,
        }
    }
}

impl Span {
    /// Sentinel offset marking a span as unknown.
    const UNKNOWN: u32 = u32::MAX;

    /// Builds a known span from `start` and `end`.
    ///
    /// # Panics
    ///
    /// Panics if either offset is `u32::MAX`.
    pub fn new(start: u32, end: u32) -> Span {
        assert!(
            start != Self::UNKNOWN && end != Self::UNKNOWN,
            "cannot create a span with u32::MAX"
        );
        Span { start, end }
    }

    /// Shifts both offsets forward by `offset`; unknown spans are left alone.
    pub fn adjust(&mut self, offset: u32) {
        if !self.is_known() {
            return;
        }
        self.start += offset;
        self.end += offset;
    }

    /// Returns the start offset.
    ///
    /// # Panics
    ///
    /// Panics if the span is unknown.
    pub fn start(&self) -> u32 {
        assert!(self.is_known(), "cannot get start of unknown span");
        self.start
    }

    /// Returns the end offset.
    ///
    /// # Panics
    ///
    /// Panics if the span is unknown.
    pub fn end(&self) -> u32 {
        assert!(self.is_known(), "cannot get end of unknown span");
        self.end
    }

    /// Sets the end offset. An unknown span collapses to `new_end..new_end`.
    pub fn set_end(&mut self, new_end: u32) {
        if self.is_known() {
            self.end = new_end;
        } else {
            *self = Span {
                start: new_end,
                end: new_end,
            };
        }
    }

    /// Sets the start offset. An unknown span collapses to
    /// `new_start..new_start`.
    pub fn set_start(&mut self, new_start: u32) {
        if self.is_known() {
            self.start = new_start;
        } else {
            *self = Span {
                start: new_start,
                end: new_start,
            };
        }
    }

    /// Reports whether neither offset is the `u32::MAX` sentinel.
    pub fn is_known(&self) -> bool {
        !(self.start == Self::UNKNOWN || self.end == Self::UNKNOWN)
    }
}
90
/// The kinds of token the tokenizer can produce.
///
/// A token carries no text of its own; the text is recovered from the `Span`
/// that accompanies it.
#[derive(Eq, PartialEq, Debug, Copy, Clone)]
pub enum Token {
    // Trivia: returned by `Tokenizer::next_raw`, skipped by `Tokenizer::next`.
    /// A run of spaces, tabs and newlines.
    Whitespace,
    /// A `//` line comment or a (nestable) `/* ... */` block comment.
    Comment,

    // Punctuation and operators.
    Equals,
    Comma,
    Colon,
    Period,
    Semicolon,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LessThan,
    GreaterThan,
    /// The two-character sequence `->`.
    RArrow,
    Star,
    At,
    Slash,
    Plus,
    Minus,

    // Keywords: identifier-like words whose exact spelling is reserved by the
    // tokenizer. Variants with a trailing underscore avoid clashing with Rust
    // names (`String`, `Option`, `Result`, `From`).
    Use,
    Type,
    Func,
    U8,
    U16,
    U32,
    U64,
    S8,
    S16,
    S32,
    S64,
    F32,
    F64,
    Char,
    Record,
    Resource,
    Own,
    Borrow,
    Flags,
    Variant,
    Enum,
    Bool,
    String_,
    Option_,
    Result_,
    Future,
    Stream,
    /// The hyphenated keyword `error-context`.
    ErrorContext,
    List,
    Map,
    /// A lone `_`.
    Underscore,
    As,
    From_,
    Static,
    Interface,
    Tuple,
    Import,
    Export,
    World,
    Package,
    Constructor,
    Async,

    /// An identifier-like word that is not a keyword. The tokenizer does not
    /// validate it; see `validate_id`.
    Id,
    /// A `%`-prefixed identifier, which lets keyword spellings be used as
    /// names (e.g. `%bool`).
    ExplicitId,

    /// A run of ASCII digits.
    Integer,

    // Further keywords.
    Include,
    With,
}
165
/// Errors produced while tokenizing or validating identifiers.
///
/// The `u32` carried by each variant is a source offset in the same coordinate
/// space as `Span` (i.e. it already includes the tokenizer's `span_offset`).
#[derive(Eq, PartialEq, Debug)]
// NOTE(review): presumably kept because some variants/fields (e.g.
// `InvalidEscape`) are never constructed in this file — confirm before removing.
#[allow(dead_code)]
pub enum Error {
    /// An identifier contains a character (or a letter of the wrong case) that
    /// is not allowed; carries the identifier's start offset and the character.
    InvalidCharInId(u32, char),
    /// An identifier is empty or has an empty segment between `-` separators.
    IdPartEmpty(u32),
    /// An invalid escape character; not produced by the code in this file.
    InvalidEscape(u32, char),
    /// A character that cannot start any token.
    Unexpected(u32, char),
    /// A `/*` block comment that reaches end of input before being closed;
    /// carries the offset where the comment started.
    UnterminatedComment(u32),
    /// A specific token was required but something else (or end of input) was
    /// found.
    Wanted {
        /// Offset of the offending token, or of end of input.
        at: u32,
        /// Human-readable description of the required token.
        expected: &'static str,
        /// Human-readable description of what was found instead.
        found: &'static str,
    },
}
180
181impl<'a> Tokenizer<'a> {
182 pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>> {
183 detect_invalid_input(input)?;
184
185 let mut t = Tokenizer {
186 input,
187 span_offset,
188 chars: CrlfFold {
189 chars: input.char_indices(),
190 },
191 };
192 t.eatc('\u{feff}');
194 Ok(t)
195 }
196
197 pub fn expect_semicolon(&mut self) -> Result<()> {
198 self.expect(Token::Semicolon)?;
199 Ok(())
200 }
201
202 pub fn get_span(&self, span: Span) -> &'a str {
203 let start = usize::try_from(span.start() - self.span_offset).unwrap();
204 let end = usize::try_from(span.end() - self.span_offset).unwrap();
205 &self.input[start..end]
206 }
207
208 pub fn parse_id(&self, span: Span) -> Result<&'a str> {
209 let ret = self.get_span(span);
210 validate_id(span.start(), &ret)?;
211 Ok(ret)
212 }
213
214 pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> {
215 let token = self.get_span(span);
216 let id_part = token.strip_prefix('%').unwrap();
217 validate_id(span.start(), id_part)?;
218 Ok(id_part)
219 }
220
221 pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> {
222 loop {
223 match self.next_raw()? {
224 Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {}
225 other => break Ok(other),
226 }
227 }
228 }
229
230 pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> {
234 let (str_start, ch) = match self.chars.next() {
235 Some(pair) => pair,
236 None => return Ok(None),
237 };
238 let start = self.span_offset + u32::try_from(str_start).unwrap();
239 let token = match ch {
240 '\n' | '\t' | ' ' => {
241 while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {}
243 Whitespace
244 }
245 '/' => {
246 if self.eatc('/') {
248 for (_, ch) in &mut self.chars {
249 if ch == '\n' {
250 break;
251 }
252 }
253 Comment
254 } else if self.eatc('*') {
256 let mut depth = 1;
257 while depth > 0 {
258 let (_, ch) = match self.chars.next() {
259 Some(pair) => pair,
260 None => return Err(Error::UnterminatedComment(start)),
261 };
262 match ch {
263 '/' if self.eatc('*') => depth += 1,
264 '*' if self.eatc('/') => depth -= 1,
265 _ => {}
266 }
267 }
268 Comment
269 } else {
270 Slash
271 }
272 }
273 '=' => Equals,
274 ',' => Comma,
275 ':' => Colon,
276 '.' => Period,
277 ';' => Semicolon,
278 '(' => LeftParen,
279 ')' => RightParen,
280 '{' => LeftBrace,
281 '}' => RightBrace,
282 '<' => LessThan,
283 '>' => GreaterThan,
284 '*' => Star,
285 '@' => At,
286 '-' => {
287 if self.eatc('>') {
288 RArrow
289 } else {
290 Minus
291 }
292 }
293 '+' => Plus,
294 '%' => {
295 let mut iter = self.chars.clone();
296 if let Some((_, ch)) = iter.next() {
297 if is_keylike_start(ch) {
298 self.chars = iter.clone();
299 while let Some((_, ch)) = iter.next() {
300 if !is_keylike_continue(ch) {
301 break;
302 }
303 self.chars = iter.clone();
304 }
305 }
306 }
307 ExplicitId
308 }
309 ch if is_keylike_start(ch) => {
310 let remaining = self.chars.chars.as_str().len();
311 let mut iter = self.chars.clone();
312 while let Some((_, ch)) = iter.next() {
313 if !is_keylike_continue(ch) {
314 break;
315 }
316 self.chars = iter.clone();
317 }
318 let str_end =
319 str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len());
320 match &self.input[str_start..str_end] {
321 "use" => Use,
322 "type" => Type,
323 "func" => Func,
324 "u8" => U8,
325 "u16" => U16,
326 "u32" => U32,
327 "u64" => U64,
328 "s8" => S8,
329 "s16" => S16,
330 "s32" => S32,
331 "s64" => S64,
332 "f32" => F32,
333 "f64" => F64,
334 "char" => Char,
335 "resource" => Resource,
336 "own" => Own,
337 "borrow" => Borrow,
338 "record" => Record,
339 "flags" => Flags,
340 "variant" => Variant,
341 "enum" => Enum,
342 "bool" => Bool,
343 "string" => String_,
344 "option" => Option_,
345 "result" => Result_,
346 "future" => Future,
347 "stream" => Stream,
348 "error-context" => ErrorContext,
349 "list" => List,
350 "map" => Map,
351 "_" => Underscore,
352 "as" => As,
353 "from" => From_,
354 "static" => Static,
355 "interface" => Interface,
356 "tuple" => Tuple,
357 "world" => World,
358 "import" => Import,
359 "export" => Export,
360 "package" => Package,
361 "constructor" => Constructor,
362 "include" => Include,
363 "with" => With,
364 "async" => Async,
365 _ => Id,
366 }
367 }
368
369 ch if ch.is_ascii_digit() => {
370 let mut iter = self.chars.clone();
371 while let Some((_, ch)) = iter.next() {
372 if !ch.is_ascii_digit() {
373 break;
374 }
375 self.chars = iter.clone();
376 }
377
378 Integer
379 }
380
381 ch => return Err(Error::Unexpected(start, ch)),
382 };
383 let end = match self.chars.clone().next() {
384 Some((i, _)) => i,
385 None => self.input.len(),
386 };
387
388 let end = self.span_offset + u32::try_from(end).unwrap();
389 Ok(Some((Span::new(start, end), token)))
390 }
391
392 pub fn eat(&mut self, expected: Token) -> Result<bool, Error> {
393 let mut other = self.clone();
394 match other.next()? {
395 Some((_span, found)) if expected == found => {
396 *self = other;
397 Ok(true)
398 }
399 Some(_) => Ok(false),
400 None => Ok(false),
401 }
402 }
403
404 pub fn expect(&mut self, expected: Token) -> Result<Span, Error> {
405 match self.next()? {
406 Some((span, found)) => {
407 if expected == found {
408 Ok(span)
409 } else {
410 Err(Error::Wanted {
411 at: span.start(),
412 expected: expected.describe(),
413 found: found.describe(),
414 })
415 }
416 }
417 None => Err(Error::Wanted {
418 at: self.span_offset + u32::try_from(self.input.len()).unwrap(),
419 expected: expected.describe(),
420 found: "eof",
421 }),
422 }
423 }
424
425 fn eatc(&mut self, ch: char) -> bool {
426 let mut iter = self.chars.clone();
427 match iter.next() {
428 Some((_, ch2)) if ch == ch2 => {
429 self.chars = iter;
430 true
431 }
432 _ => false,
433 }
434 }
435
436 pub fn eof_span(&self) -> Span {
437 let end = self.span_offset + u32::try_from(self.input.len()).unwrap();
438 Span::new(end, end)
439 }
440}
441
442impl<'a> Iterator for CrlfFold<'a> {
443 type Item = (usize, char);
444
445 fn next(&mut self) -> Option<(usize, char)> {
446 self.chars.next().map(|(i, c)| {
447 if c == '\r' {
448 let mut attempt = self.chars.clone();
449 if let Some((_, '\n')) = attempt.next() {
450 self.chars = attempt;
451 return (i, '\n');
452 }
453 }
454 (i, c)
455 })
456 }
457}
458
459fn detect_invalid_input(input: &str) -> Result<()> {
460 let mut line = 1;
462 for ch in input.chars() {
463 match ch {
464 '\n' => line += 1,
465 '\r' | '\t' => {}
466
467 '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
473 | '\u{2067}' | '\u{2068}' | '\u{2069}' => {
474 bail!(
475 "Input contains bidirectional override codepoint {:?} at line {}",
476 ch.escape_unicode(),
477 line
478 );
479 }
480
481 '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
489 | '\u{17b4}' | '\u{17b5}' => {
490 bail!(
491 "Codepoint {:?} at line {} is discouraged by Unicode",
492 ch.escape_unicode(),
493 line
494 );
495 }
496
497 ch if ch.is_control() => {
501 bail!("Control code '{}' at line {}", ch.escape_unicode(), line);
502 }
503
504 _ => {}
505 }
506 }
507
508 Ok(())
509}
510
511fn is_keylike_start(ch: char) -> bool {
512 UnicodeXID::is_xid_start(ch) || ch == '_' || ch == '-'
515}
516
517fn is_keylike_continue(ch: char) -> bool {
518 UnicodeXID::is_xid_continue(ch) || ch == '-'
520}
521
522pub fn validate_id(start: u32, id: &str) -> Result<(), Error> {
523 if id.is_empty() {
525 return Err(Error::IdPartEmpty(start));
526 }
527
528 for (idx, part) in id.split('-').enumerate() {
530 let Some(first_char) = part.chars().next() else {
533 return Err(Error::IdPartEmpty(start));
534 };
535 if idx == 0 && !first_char.is_ascii_alphabetic() {
536 return Err(Error::InvalidCharInId(start, first_char));
537 }
538 let mut upper = None;
539 for ch in part.chars() {
540 if ch.is_ascii_digit() {
541 } else if ch.is_ascii_uppercase() {
543 if upper.is_none() {
544 upper = Some(true);
545 } else if let Some(false) = upper {
546 return Err(Error::InvalidCharInId(start, ch));
547 }
548 } else if ch.is_ascii_lowercase() {
549 if upper.is_none() {
550 upper = Some(false);
551 } else if let Some(true) = upper {
552 return Err(Error::InvalidCharInId(start, ch));
553 }
554 } else {
555 return Err(Error::InvalidCharInId(start, ch));
556 }
557 }
558 }
559
560 Ok(())
561}
562
563impl Token {
564 pub fn describe(&self) -> &'static str {
565 match self {
566 Whitespace => "whitespace",
567 Comment => "a comment",
568 Equals => "'='",
569 Comma => "','",
570 Colon => "':'",
571 Period => "'.'",
572 Semicolon => "';'",
573 LeftParen => "'('",
574 RightParen => "')'",
575 LeftBrace => "'{'",
576 RightBrace => "'}'",
577 LessThan => "'<'",
578 GreaterThan => "'>'",
579 Use => "keyword `use`",
580 Type => "keyword `type`",
581 Func => "keyword `func`",
582 U8 => "keyword `u8`",
583 U16 => "keyword `u16`",
584 U32 => "keyword `u32`",
585 U64 => "keyword `u64`",
586 S8 => "keyword `s8`",
587 S16 => "keyword `s16`",
588 S32 => "keyword `s32`",
589 S64 => "keyword `s64`",
590 F32 => "keyword `f32`",
591 F64 => "keyword `f64`",
592 Char => "keyword `char`",
593 Own => "keyword `own`",
594 Borrow => "keyword `borrow`",
595 Resource => "keyword `resource`",
596 Record => "keyword `record`",
597 Flags => "keyword `flags`",
598 Variant => "keyword `variant`",
599 Enum => "keyword `enum`",
600 Bool => "keyword `bool`",
601 String_ => "keyword `string`",
602 Option_ => "keyword `option`",
603 Result_ => "keyword `result`",
604 Future => "keyword `future`",
605 Stream => "keyword `stream`",
606 ErrorContext => "keyword `error-context`",
607 List => "keyword `list`",
608 Map => "keyword `map`",
609 Underscore => "keyword `_`",
610 Id => "an identifier",
611 ExplicitId => "an '%' identifier",
612 RArrow => "`->`",
613 Star => "`*`",
614 At => "`@`",
615 Slash => "`/`",
616 Plus => "`+`",
617 Minus => "`-`",
618 As => "keyword `as`",
619 From_ => "keyword `from`",
620 Static => "keyword `static`",
621 Interface => "keyword `interface`",
622 Tuple => "keyword `tuple`",
623 Import => "keyword `import`",
624 Export => "keyword `export`",
625 World => "keyword `world`",
626 Package => "keyword `package`",
627 Constructor => "keyword `constructor`",
628 Integer => "an integer",
629 Include => "keyword `include`",
630 With => "keyword `with`",
631 Async => "keyword `async`",
632 }
633 }
634}
635
// Marker impl with no overridden methods: it lets `Error` be used wherever a
// `core::error::Error` is required (e.g. conversion via `?` into the
// `anyhow::Result` returned by several `Tokenizer` methods).
impl core::error::Error for Error {}
637
638impl fmt::Display for Error {
639 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
640 match self {
641 Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"),
642 Error::UnterminatedComment(_) => write!(f, "unterminated block comment"),
643 Error::Wanted {
644 expected, found, ..
645 } => write!(f, "expected {expected}, found {found}"),
646 Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"),
647 Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"),
648 Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"),
649 }
650 }
651}
652
/// Exercises `validate_id` against tables of identifiers that must be accepted
/// and identifiers that must be rejected.
#[test]
fn test_validate_id() {
    let accepted = [
        "apple",
        "apple-pear",
        "apple-pear-grape",
        "a0",
        "a",
        "a-a",
        "bool",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
        "apple-0",
        "a0-000-3d4a-54FF",
    ];
    for &id in &accepted {
        validate_id(0, id).unwrap_or_else(|e| panic!("{id:?} should be accepted: {e}"));
    }

    let rejected = [
        "",
        "0",
        "%",
        "$",
        "0a",
        ".",
        "·",
        "a a",
        "_",
        "-",
        "a-",
        "-a",
        "Apple",
        "applE",
        "-apple-pear",
        "apple-pear-",
        "apple_pear",
        "apple.pear",
        "apple pear",
        "apple/pear",
        "apple|pear",
        "apple-Pear",
        "()()",
        "",
        "*",
        "apple\u{5f3}pear",
        "apple\u{200c}pear",
        "apple\u{200d}pear",
        "apple--pear",
        "_apple",
        "apple_",
        "_Znwj",
        "__i386",
        "__i386__",
        "Москва",
        "garçon-hühnervögel-Москва-東京",
        "a0-000-3d4A-54Ff",
    ];
    for &id in &rejected {
        assert!(validate_id(0, id).is_err(), "{id:?} should be rejected");
    }

    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
}
712
/// Exercises the tokenizer end to end: token kinds for identifiers, keywords
/// and punctuation, plus inputs that must be rejected before lexing starts.
#[test]
fn test_tokenizer() {
    /// Lexes `source` and returns just the token kinds, trivia excluded.
    fn lex(source: &str) -> Result<Vec<Token>> {
        let mut tokenizer = Tokenizer::new(source, 0)?;
        let mut kinds = Vec::new();
        while let Some((_span, kind)) = tokenizer.next()? {
            kinds.push(kind);
        }
        Ok(kinds)
    }

    /// Asserts that `source` lexes to exactly `expected`.
    #[track_caller]
    fn check(source: &str, expected: Vec<Token>) {
        assert_eq!(lex(source).unwrap(), expected);
    }

    /// Asserts that `source` fails to lex; `why` names the rule it violates.
    #[track_caller]
    fn rejects(source: &str, why: &str) {
        assert!(lex(source).is_err(), "{}", why);
    }

    check("", vec![]);
    check("_", vec![Token::Underscore]);

    // The tokenizer is deliberately lenient about identifiers; stricter rules
    // are applied separately by `validate_id`.
    for &source in &[
        "apple",
        "apple-pear",
        "apple--pear",
        "apple-Pear",
        "apple-pear-grape",
    ] {
        check(source, vec![Token::Id]);
    }
    check("apple pear", vec![Token::Id, Token::Id]);
    for &source in &[
        "_a_p_p_l_e_",
        "garçon",
        "hühnervögel",
        "москва",
        "東京",
        "garçon-hühnervögel-москва-東京",
        "a0",
        "a",
    ] {
        check(source, vec![Token::Id]);
    }

    for &source in &["%a", "%a-a", "%bool", "%"] {
        check(source, vec![Token::ExplicitId]);
    }

    for &source in &[
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
    ] {
        check(source, vec![Token::Id]);
    }

    check("func", vec![Token::Func]);
    check(
        "a: func()",
        vec![
            Token::Id,
            Token::Colon,
            Token::Func,
            Token::LeftParen,
            Token::RightParen,
        ],
    );

    check("resource", vec![Token::Resource]);

    check("own", vec![Token::Own]);
    check(
        "own<some-id>",
        vec![Token::Own, Token::LessThan, Token::Id, Token::GreaterThan],
    );

    check("borrow", vec![Token::Borrow]);
    check(
        "borrow<some-id>",
        vec![
            Token::Borrow,
            Token::LessThan,
            Token::Id,
            Token::GreaterThan,
        ],
    );

    rejects("\u{149}", "strongly discouraged");
    rejects("\u{673}", "strongly discouraged");
    rejects("\u{17a3}", "strongly discouraged");
    rejects("\u{17a4}", "strongly discouraged");
    rejects("\u{202a}", "bidirectional override");
    rejects("\u{2068}", "bidirectional override");
    rejects("\u{0}", "control code");
    rejects("\u{b}", "control code");
    rejects("\u{c}", "control code");
    rejects("\u{85}", "control code");
}