Skip to main content

wit_parser/ast/
lex.rs

1#[cfg(test)]
2use alloc::{vec, vec::Vec};
3use core::char;
4use core::fmt;
5use core::result::Result;
6use core::str;
7
8use self::Token::*;
9
10#[derive(Clone)]
11pub struct Tokenizer<'a> {
12    input: &'a str,
13    span_offset: u32,
14    chars: CrlfFold<'a>,
15}
16
/// Character cursor whose `Iterator` implementation reports a `\r\n` pair as
/// a single `\n`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// Underlying `(byte index, char)` iterator over the source text.
    chars: str::CharIndices<'a>,
}
21
/// A half-open byte range locating a token in the source.
///
/// `u32::MAX` is reserved as a sentinel: a span whose `start` or `end` holds
/// it is "unknown" (for example, one decoded from binary with no source
/// text).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct Span {
    start: u32,
    end: u32,
}

impl Default for Span {
    /// The default span is the unknown span: both ends hold the sentinel.
    fn default() -> Self {
        Self {
            start: u32::MAX,
            end: u32::MAX,
        }
    }
}

impl Span {
    /// Builds a known span covering `start..end`.
    ///
    /// # Panics
    ///
    /// Panics if either bound is `u32::MAX`, which is reserved for unknown
    /// spans.
    pub fn new(start: u32, end: u32) -> Self {
        let candidate = Self { start, end };
        assert!(candidate.is_known(), "cannot create a span with u32::MAX");
        candidate
    }

    /// Shifts both ends of a known span by `offset` bytes; unknown spans are
    /// left untouched.
    pub fn adjust(&mut self, offset: u32) {
        if !self.is_known() {
            return;
        }
        self.start += offset;
        self.end += offset;
    }

    /// Start offset of the span.
    ///
    /// # Panics
    ///
    /// Panics if the span is unknown.
    pub fn start(&self) -> u32 {
        assert!(self.is_known(), "cannot get start of unknown span");
        self.start
    }

    /// End offset of the span.
    ///
    /// # Panics
    ///
    /// Panics if the span is unknown.
    pub fn end(&self) -> u32 {
        assert!(self.is_known(), "cannot get end of unknown span");
        self.end
    }

    /// Replaces the end offset. An unknown span becomes the zero-width span
    /// located at `new_end`.
    pub fn set_end(&mut self, new_end: u32) {
        if self.is_known() {
            self.end = new_end;
        } else {
            *self = Self {
                start: new_end,
                end: new_end,
            };
        }
    }

    /// Replaces the start offset. An unknown span becomes the zero-width span
    /// located at `new_start`.
    pub fn set_start(&mut self, new_start: u32) {
        if self.is_known() {
            self.start = new_start;
        } else {
            *self = Self {
                start: new_start,
                end: new_start,
            };
        }
    }

    /// Whether this span refers to an actual source location, i.e. neither
    /// end holds the `u32::MAX` sentinel.
    pub fn is_known(&self) -> bool {
        !(self.start == u32::MAX || self.end == u32::MAX)
    }
}
89
/// The kinds of token the lexer can produce.
///
/// A token carries no text of its own; the accompanying span is used to
/// recover the source slice when it is needed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Token {
    // Trivia: produced by the raw lexer and skipped by the filtering one.
    /// A run of spaces, tabs and newlines.
    Whitespace,
    /// A `//` line comment or a (nestable) `/* ... */` block comment.
    Comment,

    // Punctuation.
    Equals,
    Comma,
    Colon,
    Period,
    Semicolon,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LessThan,
    GreaterThan,
    /// The two-character arrow `->`.
    RArrow,
    Star,
    At,
    Slash,
    Plus,
    Minus,

    // Keywords.
    Use,
    Type,
    Func,
    U8,
    U16,
    U32,
    U64,
    S8,
    S16,
    S32,
    S64,
    F32,
    F64,
    Char,
    Record,
    Resource,
    Own,
    Borrow,
    Flags,
    Variant,
    Enum,
    Bool,
    String_,
    Option_,
    Result_,
    Future,
    Stream,
    ErrorContext,
    List,
    Map,
    Underscore,
    As,
    From_,
    Static,
    Interface,
    Tuple,
    Import,
    Export,
    World,
    Package,
    Constructor,
    Async,

    // Identifiers.
    /// Identifier-like text that is not one of the keywords.
    Id,
    /// A `%` optionally followed by identifier-like text.
    ExplicitId,

    /// A run of ASCII digits.
    Integer,

    // More keywords.
    Include,
    With,
}
164
/// Failures reported while lexing or validating identifiers.
///
/// The `u32` in each variant (the `at` field for `Wanted`) is the position
/// value handed back by `Error::position`.
#[derive(Debug, PartialEq, Eq)]
#[allow(dead_code)]
pub enum Error {
    /// A control character other than `\n`, `\r` or `\t` appeared in the input.
    ControlCodepoint(u32, char),
    /// A codepoint that Unicode deprecates or discourages appeared in the input.
    DeprecatedCodepoint(u32, char),
    /// A bidirectional override/isolate codepoint appeared in the input.
    ForbiddenCodepoint(u32, char),
    /// An identifier contains a character, or a mix of letter cases, that
    /// identifier validation rejects.
    InvalidCharInId(u32, char),
    /// An identifier is empty or has an empty `-`-separated segment.
    IdPartEmpty(u32),
    /// An invalid escape character in a string. No code constructing this
    /// variant is visible alongside this definition.
    InvalidEscape(u32, char),
    /// A character that cannot begin any token.
    Unexpected(u32, char),
    /// A block comment was still open at end of input; the position is where
    /// the comment began.
    UnterminatedComment(u32),
    /// A specific token was required but something else (or end of input)
    /// was found.
    Wanted {
        /// Position of the offending token, or of end of input.
        at: u32,
        /// Description of the token that was required.
        expected: &'static str,
        /// Description of what was found instead.
        found: &'static str,
    },
}
182
183impl<'a> Tokenizer<'a> {
184    pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>, Error> {
185        detect_invalid_input(input)?;
186
187        let mut t = Tokenizer {
188            input,
189            span_offset,
190            chars: CrlfFold {
191                chars: input.char_indices(),
192            },
193        };
194        // Eat utf-8 BOM
195        t.eatc('\u{feff}');
196        Ok(t)
197    }
198
199    pub fn expect_semicolon(&mut self) -> Result<(), Error> {
200        self.expect(Token::Semicolon)?;
201        Ok(())
202    }
203
204    pub fn get_span(&self, span: Span) -> &'a str {
205        let start = usize::try_from(span.start() - self.span_offset).unwrap();
206        let end = usize::try_from(span.end() - self.span_offset).unwrap();
207        &self.input[start..end]
208    }
209
210    pub fn parse_id(&self, span: Span) -> Result<&'a str, Error> {
211        let ret = self.get_span(span);
212        validate_id(span.start(), &ret)?;
213        Ok(ret)
214    }
215
216    pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str, Error> {
217        let token = self.get_span(span);
218        let id_part = token.strip_prefix('%').unwrap();
219        validate_id(span.start(), id_part)?;
220        Ok(id_part)
221    }
222
223    pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> {
224        loop {
225            match self.next_raw()? {
226                Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {}
227                other => break Ok(other),
228            }
229        }
230    }
231
232    /// Three possibilities when calling this method: an `Err(...)` indicates that lexing failed, an
233    /// `Ok(Some(...))` produces the next token, and `Ok(None)` indicates that there are no more
234    /// tokens available.
235    pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> {
236        let (str_start, ch) = match self.chars.next() {
237            Some(pair) => pair,
238            None => return Ok(None),
239        };
240        let start = self.span_offset + u32::try_from(str_start).unwrap();
241        let token = match ch {
242            '\n' | '\t' | ' ' => {
243                // Eat all contiguous whitespace tokens
244                while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {}
245                Whitespace
246            }
247            '/' => {
248                // Eat a line comment if it's `//...`
249                if self.eatc('/') {
250                    for (_, ch) in &mut self.chars {
251                        if ch == '\n' {
252                            break;
253                        }
254                    }
255                    Comment
256                // eat a block comment if it's `/*...`
257                } else if self.eatc('*') {
258                    let mut depth = 1;
259                    while depth > 0 {
260                        let (_, ch) = match self.chars.next() {
261                            Some(pair) => pair,
262                            None => return Err(Error::UnterminatedComment(start)),
263                        };
264                        match ch {
265                            '/' if self.eatc('*') => depth += 1,
266                            '*' if self.eatc('/') => depth -= 1,
267                            _ => {}
268                        }
269                    }
270                    Comment
271                } else {
272                    Slash
273                }
274            }
275            '=' => Equals,
276            ',' => Comma,
277            ':' => Colon,
278            '.' => Period,
279            ';' => Semicolon,
280            '(' => LeftParen,
281            ')' => RightParen,
282            '{' => LeftBrace,
283            '}' => RightBrace,
284            '<' => LessThan,
285            '>' => GreaterThan,
286            '*' => Star,
287            '@' => At,
288            '-' => {
289                if self.eatc('>') {
290                    RArrow
291                } else {
292                    Minus
293                }
294            }
295            '+' => Plus,
296            '%' => {
297                let mut iter = self.chars.clone();
298                if let Some((_, ch)) = iter.next() {
299                    if is_keylike_start(ch) {
300                        self.chars = iter.clone();
301                        while let Some((_, ch)) = iter.next() {
302                            if !is_keylike_continue(ch) {
303                                break;
304                            }
305                            self.chars = iter.clone();
306                        }
307                    }
308                }
309                ExplicitId
310            }
311            ch if is_keylike_start(ch) => {
312                let remaining = self.chars.chars.as_str().len();
313                let mut iter = self.chars.clone();
314                while let Some((_, ch)) = iter.next() {
315                    if !is_keylike_continue(ch) {
316                        break;
317                    }
318                    self.chars = iter.clone();
319                }
320                let str_end =
321                    str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len());
322                match &self.input[str_start..str_end] {
323                    "use" => Use,
324                    "type" => Type,
325                    "func" => Func,
326                    "u8" => U8,
327                    "u16" => U16,
328                    "u32" => U32,
329                    "u64" => U64,
330                    "s8" => S8,
331                    "s16" => S16,
332                    "s32" => S32,
333                    "s64" => S64,
334                    "f32" => F32,
335                    "f64" => F64,
336                    "char" => Char,
337                    "resource" => Resource,
338                    "own" => Own,
339                    "borrow" => Borrow,
340                    "record" => Record,
341                    "flags" => Flags,
342                    "variant" => Variant,
343                    "enum" => Enum,
344                    "bool" => Bool,
345                    "string" => String_,
346                    "option" => Option_,
347                    "result" => Result_,
348                    "future" => Future,
349                    "stream" => Stream,
350                    "error-context" => ErrorContext,
351                    "list" => List,
352                    "map" => Map,
353                    "_" => Underscore,
354                    "as" => As,
355                    "from" => From_,
356                    "static" => Static,
357                    "interface" => Interface,
358                    "tuple" => Tuple,
359                    "world" => World,
360                    "import" => Import,
361                    "export" => Export,
362                    "package" => Package,
363                    "constructor" => Constructor,
364                    "include" => Include,
365                    "with" => With,
366                    "async" => Async,
367                    _ => Id,
368                }
369            }
370
371            ch if ch.is_ascii_digit() => {
372                let mut iter = self.chars.clone();
373                while let Some((_, ch)) = iter.next() {
374                    if !ch.is_ascii_digit() {
375                        break;
376                    }
377                    self.chars = iter.clone();
378                }
379
380                Integer
381            }
382
383            ch => return Err(Error::Unexpected(start, ch)),
384        };
385        let end = match self.chars.clone().next() {
386            Some((i, _)) => i,
387            None => self.input.len(),
388        };
389
390        let end = self.span_offset + u32::try_from(end).unwrap();
391        Ok(Some((Span::new(start, end), token)))
392    }
393
394    pub fn eat(&mut self, expected: Token) -> Result<bool, Error> {
395        let mut other = self.clone();
396        match other.next()? {
397            Some((_span, found)) if expected == found => {
398                *self = other;
399                Ok(true)
400            }
401            Some(_) => Ok(false),
402            None => Ok(false),
403        }
404    }
405
406    pub fn expect(&mut self, expected: Token) -> Result<Span, Error> {
407        match self.next()? {
408            Some((span, found)) => {
409                if expected == found {
410                    Ok(span)
411                } else {
412                    Err(Error::Wanted {
413                        at: span.start(),
414                        expected: expected.describe(),
415                        found: found.describe(),
416                    })
417                }
418            }
419            None => Err(Error::Wanted {
420                at: self.span_offset + u32::try_from(self.input.len()).unwrap(),
421                expected: expected.describe(),
422                found: "eof",
423            }),
424        }
425    }
426
427    fn eatc(&mut self, ch: char) -> bool {
428        let mut iter = self.chars.clone();
429        match iter.next() {
430            Some((_, ch2)) if ch == ch2 => {
431                self.chars = iter;
432                true
433            }
434            _ => false,
435        }
436    }
437
438    pub fn eof_span(&self) -> Span {
439        let end = self.span_offset + u32::try_from(self.input.len()).unwrap();
440        Span::new(end, end)
441    }
442}
443
444impl<'a> Iterator for CrlfFold<'a> {
445    type Item = (usize, char);
446
447    fn next(&mut self) -> Option<(usize, char)> {
448        self.chars.next().map(|(i, c)| {
449            if c == '\r' {
450                let mut attempt = self.chars.clone();
451                if let Some((_, '\n')) = attempt.next() {
452                    self.chars = attempt;
453                    return (i, '\n');
454                }
455            }
456            (i, c)
457        })
458    }
459}
460
461fn detect_invalid_input(input: &str) -> Result<(), Error> {
462    // Disallow specific codepoints.
463    for (pos, ch) in input.char_indices() {
464        match ch {
465            '\n' | '\r' | '\t' => {}
466
467            // Bidirectional override codepoints can be used to craft source code that
468            // appears to have a different meaning than its actual meaning. See
469            // [CVE-2021-42574] for background and motivation.
470            //
471            // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
472            '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
473            | '\u{2067}' | '\u{2068}' | '\u{2069}' => {
474                return Err(Error::ForbiddenCodepoint(u32::try_from(pos).unwrap(), ch));
475            }
476
477            // Disallow several characters which are deprecated or discouraged in Unicode.
478            //
479            // U+149 deprecated; see Unicode 13.0.0, sec. 7.1 Latin, Compatibility Digraphs.
480            // U+673 deprecated; see Unicode 13.0.0, sec. 9.2 Arabic, Additional Vowel Marks.
481            // U+F77 and U+F79 deprecated; see Unicode 13.0.0, sec. 13.4 Tibetan, Vowels.
482            // U+17A3 and U+17A4 deprecated, and U+17B4 and U+17B5 discouraged; see
483            // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged.
484            '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
485            | '\u{17b4}' | '\u{17b5}' => {
486                return Err(Error::DeprecatedCodepoint(u32::try_from(pos).unwrap(), ch));
487            }
488
489            // Disallow control codes other than the ones explicitly recognized above,
490            // so that viewing a wit file on a terminal doesn't have surprising side
491            // effects or appear to have a different meaning than its actual meaning.
492            ch if ch.is_control() => {
493                return Err(Error::ControlCodepoint(u32::try_from(pos).unwrap(), ch));
494            }
495
496            _ => {}
497        }
498    }
499
500    Ok(())
501}
502
503fn is_keylike_start(ch: char) -> bool {
504    // Lex any XID start, `_`, or '-'. These aren't all valid identifier chars,
505    // but we'll diagnose that after we've lexed the full string.
506    unicode_ident::is_xid_start(ch) || ch == '_' || ch == '-'
507}
508
509fn is_keylike_continue(ch: char) -> bool {
510    // Lex any XID continue (which includes `_`) or '-'.
511    unicode_ident::is_xid_continue(ch) || ch == '-'
512}
513
514pub fn validate_id(start: u32, id: &str) -> Result<(), Error> {
515    // IDs must have at least one part.
516    if id.is_empty() {
517        return Err(Error::IdPartEmpty(start));
518    }
519
520    // Ids consist of parts separated by '-'s.
521    for (idx, part) in id.split('-').enumerate() {
522        // Parts must be non-empty and contain either all ASCII lowercase or
523        // all ASCII uppercase. Non-first segment can also start with a digit.
524        let Some(first_char) = part.chars().next() else {
525            return Err(Error::IdPartEmpty(start));
526        };
527        if idx == 0 && !first_char.is_ascii_alphabetic() {
528            return Err(Error::InvalidCharInId(start, first_char));
529        }
530        let mut upper = None;
531        for ch in part.chars() {
532            if ch.is_ascii_digit() {
533                // Digits are accepted in both uppercase and lowercase segments.
534            } else if ch.is_ascii_uppercase() {
535                if upper.is_none() {
536                    upper = Some(true);
537                } else if let Some(false) = upper {
538                    return Err(Error::InvalidCharInId(start, ch));
539                }
540            } else if ch.is_ascii_lowercase() {
541                if upper.is_none() {
542                    upper = Some(false);
543                } else if let Some(true) = upper {
544                    return Err(Error::InvalidCharInId(start, ch));
545                }
546            } else {
547                return Err(Error::InvalidCharInId(start, ch));
548            }
549        }
550    }
551
552    Ok(())
553}
554
555impl Token {
556    pub fn describe(&self) -> &'static str {
557        match self {
558            Whitespace => "whitespace",
559            Comment => "a comment",
560            Equals => "'='",
561            Comma => "','",
562            Colon => "':'",
563            Period => "'.'",
564            Semicolon => "';'",
565            LeftParen => "'('",
566            RightParen => "')'",
567            LeftBrace => "'{'",
568            RightBrace => "'}'",
569            LessThan => "'<'",
570            GreaterThan => "'>'",
571            Use => "keyword `use`",
572            Type => "keyword `type`",
573            Func => "keyword `func`",
574            U8 => "keyword `u8`",
575            U16 => "keyword `u16`",
576            U32 => "keyword `u32`",
577            U64 => "keyword `u64`",
578            S8 => "keyword `s8`",
579            S16 => "keyword `s16`",
580            S32 => "keyword `s32`",
581            S64 => "keyword `s64`",
582            F32 => "keyword `f32`",
583            F64 => "keyword `f64`",
584            Char => "keyword `char`",
585            Own => "keyword `own`",
586            Borrow => "keyword `borrow`",
587            Resource => "keyword `resource`",
588            Record => "keyword `record`",
589            Flags => "keyword `flags`",
590            Variant => "keyword `variant`",
591            Enum => "keyword `enum`",
592            Bool => "keyword `bool`",
593            String_ => "keyword `string`",
594            Option_ => "keyword `option`",
595            Result_ => "keyword `result`",
596            Future => "keyword `future`",
597            Stream => "keyword `stream`",
598            ErrorContext => "keyword `error-context`",
599            List => "keyword `list`",
600            Map => "keyword `map`",
601            Underscore => "keyword `_`",
602            Id => "an identifier",
603            ExplicitId => "an '%' identifier",
604            RArrow => "`->`",
605            Star => "`*`",
606            At => "`@`",
607            Slash => "`/`",
608            Plus => "`+`",
609            Minus => "`-`",
610            As => "keyword `as`",
611            From_ => "keyword `from`",
612            Static => "keyword `static`",
613            Interface => "keyword `interface`",
614            Tuple => "keyword `tuple`",
615            Import => "keyword `import`",
616            Export => "keyword `export`",
617            World => "keyword `world`",
618            Package => "keyword `package`",
619            Constructor => "keyword `constructor`",
620            Integer => "an integer",
621            Include => "keyword `include`",
622            With => "keyword `with`",
623            Async => "keyword `async`",
624        }
625    }
626}
627
628impl core::error::Error for Error {}
629
630impl Error {
631    /// Returns the byte offset in the source map where this error occurred.
632    pub fn position(&self) -> u32 {
633        match self {
634            Error::ControlCodepoint(at, _)
635            | Error::DeprecatedCodepoint(at, _)
636            | Error::ForbiddenCodepoint(at, _)
637            | Error::InvalidCharInId(at, _)
638            | Error::IdPartEmpty(at)
639            | Error::InvalidEscape(at, _)
640            | Error::Unexpected(at, _)
641            | Error::UnterminatedComment(at) => *at,
642            Error::Wanted { at, .. } => *at,
643        }
644    }
645}
646
647impl fmt::Display for Error {
648    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
649        match self {
650            Error::ControlCodepoint(_, ch) => write!(f, "Control code '{}'", ch.escape_unicode()),
651            Error::DeprecatedCodepoint(_, ch) => {
652                write!(
653                    f,
654                    "Codepoint {:?} is discouraged by Unicode",
655                    ch.escape_unicode()
656                )
657            }
658            Error::ForbiddenCodepoint(_, ch) => {
659                write!(
660                    f,
661                    "Input contains bidirectional override codepoint {:?}",
662                    ch.escape_unicode()
663                )
664            }
665            Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"),
666            Error::UnterminatedComment(_) => write!(f, "unterminated block comment"),
667            Error::Wanted {
668                expected, found, ..
669            } => write!(f, "expected {expected}, found {found}"),
670            Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"),
671            Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"),
672            Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"),
673        }
674    }
675}
676
/// Exercises `validate_id` with identifiers that must be accepted and
/// identifiers that must be rejected.
#[test]
fn test_validate_id() {
    // `#[track_caller]` keeps a failure pointing at the offending line below
    // rather than at the helper.
    #[track_caller]
    fn accepts(id: &str) {
        validate_id(0, id).unwrap();
    }

    #[track_caller]
    fn rejects(id: &str) {
        assert!(validate_id(0, id).is_err());
    }

    accepts("apple");
    accepts("apple-pear");
    accepts("apple-pear-grape");
    accepts("a0");
    accepts("a");
    accepts("a-a");
    accepts("bool");
    accepts("APPLE");
    accepts("APPLE-PEAR");
    accepts("APPLE-PEAR-GRAPE");
    accepts("apple-PEAR-grape");
    accepts("APPLE-pear-GRAPE");
    accepts("ENOENT");
    accepts("is-XML");
    accepts("apple-0");
    accepts("a0-000-3d4a-54FF");

    rejects("");
    rejects("0");
    rejects("%");
    rejects("$");
    rejects("0a");
    rejects(".");
    rejects("·");
    rejects("a a");
    rejects("_");
    rejects("-");
    rejects("a-");
    rejects("-a");
    rejects("Apple");
    rejects("applE");
    rejects("-apple-pear");
    rejects("apple-pear-");
    rejects("apple_pear");
    rejects("apple.pear");
    rejects("apple pear");
    rejects("apple/pear");
    rejects("apple|pear");
    rejects("apple-Pear");
    rejects("()()");
    rejects("");
    rejects("*");
    rejects("apple\u{5f3}pear");
    rejects("apple\u{200c}pear");
    rejects("apple\u{200d}pear");
    rejects("apple--pear");
    rejects("_apple");
    rejects("apple_");
    rejects("_Znwj");
    rejects("__i386");
    rejects("__i386__");
    rejects("Москва");
    rejects("garçon-hühnervögel-Москва-東京");
    rejects("a0-000-3d4A-54Ff");
    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
}
736
/// Exercises the tokenizer end to end: token kinds for identifiers, keywords
/// and punctuation, plus rejection of disallowed codepoints.
#[test]
fn test_tokenizer() {
    /// Lexes `s` from offset 0 and returns the kinds of all significant tokens.
    fn collect(s: &str) -> Result<Vec<Token>, Error> {
        let mut lexer = Tokenizer::new(s, 0)?;
        let mut kinds = Vec::new();
        while let Some((_, kind)) = lexer.next()? {
            kinds.push(kind);
        }
        Ok(kinds)
    }

    // `#[track_caller]` keeps a failure pointing at the offending line below
    // rather than at the helper.
    #[track_caller]
    fn lexes_to(src: &str, expected: Vec<Token>) {
        assert_eq!(collect(src).unwrap(), expected);
    }

    #[track_caller]
    fn fails(src: &str, why: &str) {
        assert!(collect(src).is_err(), "{}", why);
    }

    lexes_to("", vec![]);
    lexes_to("_", vec![Token::Underscore]);
    lexes_to("apple", vec![Token::Id]);
    lexes_to("apple-pear", vec![Token::Id]);
    lexes_to("apple--pear", vec![Token::Id]);
    lexes_to("apple-Pear", vec![Token::Id]);
    lexes_to("apple-pear-grape", vec![Token::Id]);
    lexes_to("apple pear", vec![Token::Id, Token::Id]);
    lexes_to("_a_p_p_l_e_", vec![Token::Id]);
    lexes_to("garçon", vec![Token::Id]);
    lexes_to("hühnervögel", vec![Token::Id]);
    lexes_to("москва", vec![Token::Id]);
    lexes_to("東京", vec![Token::Id]);
    lexes_to("garçon-hühnervögel-москва-東京", vec![Token::Id]);
    lexes_to("a0", vec![Token::Id]);
    lexes_to("a", vec![Token::Id]);
    lexes_to("%a", vec![Token::ExplicitId]);
    lexes_to("%a-a", vec![Token::ExplicitId]);
    lexes_to("%bool", vec![Token::ExplicitId]);
    lexes_to("%", vec![Token::ExplicitId]);
    lexes_to("APPLE", vec![Token::Id]);
    lexes_to("APPLE-PEAR", vec![Token::Id]);
    lexes_to("APPLE-PEAR-GRAPE", vec![Token::Id]);
    lexes_to("apple-PEAR-grape", vec![Token::Id]);
    lexes_to("APPLE-pear-GRAPE", vec![Token::Id]);
    lexes_to("ENOENT", vec![Token::Id]);
    lexes_to("is-XML", vec![Token::Id]);

    lexes_to("func", vec![Token::Func]);
    lexes_to(
        "a: func()",
        vec![
            Token::Id,
            Token::Colon,
            Token::Func,
            Token::LeftParen,
            Token::RightParen,
        ],
    );

    lexes_to("resource", vec![Token::Resource]);

    lexes_to("own", vec![Token::Own]);
    lexes_to(
        "own<some-id>",
        vec![Token::Own, Token::LessThan, Token::Id, Token::GreaterThan],
    );

    lexes_to("borrow", vec![Token::Borrow]);
    lexes_to(
        "borrow<some-id>",
        vec![
            Token::Borrow,
            Token::LessThan,
            Token::Id,
            Token::GreaterThan,
        ],
    );

    fails("\u{149}", "strongly discouraged");
    fails("\u{673}", "strongly discouraged");
    fails("\u{17a3}", "strongly discouraged");
    fails("\u{17a4}", "strongly discouraged");
    fails("\u{202a}", "bidirectional override");
    fails("\u{2068}", "bidirectional override");
    fails("\u{0}", "control code");
    fails("\u{b}", "control code");
    fails("\u{c}", "control code");
    fails("\u{85}", "control code");
}