Skip to main content

wit_parser/ast/
lex.rs

1#[cfg(test)]
2use alloc::{vec, vec::Vec};
3use core::char;
4use core::fmt;
5use core::result::Result;
6use core::str;
7use unicode_xid::UnicodeXID;
8
9use self::Token::*;
10
/// A lexer over a single source string that yields [`Token`]s paired with
/// the [`Span`] each one occupies.
///
/// The type is `Clone` so that a copy can be advanced speculatively and then
/// either committed or discarded (see `eat`).
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete source text being tokenized.
    input: &'a str,
    /// Value added to byte indices within `input` when building spans.
    span_offset: u32,
    /// Cursor over the characters of `input` that have not been consumed yet.
    chars: CrlfFold<'a>,
}
17
/// Character iterator wrapper whose `Iterator` implementation reports a
/// `\r\n` pair as a single `\n`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte index, char)` iterator over the source text.
    chars: str::CharIndices<'a>,
}
22
/// A span, designating a range of bytes where a token is located.
///
/// Uses `u32::MAX` as a sentinel value to represent unknown spans (e.g.,
/// decoded from binary).
#[derive(Eq, PartialEq, Debug, Clone, Copy, Hash)]
pub struct Span {
    /// Byte offset of the first byte, or `u32::MAX` when unknown.
    start: u32,
    /// Byte offset one past the last byte, or `u32::MAX` when unknown.
    end: u32,
}

impl Default for Span {
    /// Returns the unknown span, i.e. one whose bounds are both `u32::MAX`.
    fn default() -> Span {
        Span {
            start: u32::MAX,
            end: u32::MAX,
        }
    }
}

impl Span {
    /// Creates a known span covering `start..end`.
    ///
    /// # Panics
    ///
    /// Panics if either bound is `u32::MAX`, which is reserved for unknown
    /// spans.
    pub fn new(start: u32, end: u32) -> Span {
        let span = Span { start, end };
        assert!(span.is_known(), "cannot create a span with u32::MAX");
        span
    }

    /// Adjusts this span by adding the given byte offset to both start and end.
    ///
    /// Unknown spans are left untouched.
    ///
    /// # Panics
    ///
    /// Panics if a shifted bound overflows `u32` or lands exactly on
    /// `u32::MAX`. Either outcome would otherwise silently corrupt the span
    /// or turn a known span into an unknown one.
    pub fn adjust(&mut self, offset: u32) {
        if !self.is_known() {
            return;
        }
        let shift = |pos: u32| -> u32 {
            pos.checked_add(offset)
                .filter(|&shifted| shifted != u32::MAX)
                .expect("adjusting a span must not overflow or reach u32::MAX")
        };
        let start = shift(self.start);
        let end = shift(self.end);
        self.start = start;
        self.end = end;
    }

    /// Returns the start offset, panicking if this is an unknown span.
    pub fn start(&self) -> u32 {
        assert!(self.is_known(), "cannot get start of unknown span");
        self.start
    }

    /// Returns the end offset, panicking if this is an unknown span.
    pub fn end(&self) -> u32 {
        assert!(self.is_known(), "cannot get end of unknown span");
        self.end
    }

    /// Sets the end offset. If this is unknown, converts to a zero-width span at that position.
    pub fn set_end(&mut self, new_end: u32) {
        if !self.is_known() {
            self.start = new_end;
        }
        self.end = new_end;
    }

    /// Sets the start offset. If this is unknown, converts to a zero-width span at that position.
    pub fn set_start(&mut self, new_start: u32) {
        if !self.is_known() {
            self.end = new_start;
        }
        self.start = new_start;
    }

    /// Returns true if this span has a known source location.
    pub fn is_known(&self) -> bool {
        self.start != u32::MAX && self.end != u32::MAX
    }
}
90
/// The kinds of token produced by [`Tokenizer`].
///
/// A token carries no text of its own; the text is recovered from the
/// accompanying [`Span`].
#[derive(Eq, PartialEq, Debug, Copy, Clone)]
pub enum Token {
    // Trivia: returned by `Tokenizer::next_raw`, skipped by `Tokenizer::next`.
    Whitespace,
    Comment,

    // Punctuation and operators.
    Equals,
    Comma,
    Colon,
    Period,
    Semicolon,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LessThan,
    GreaterThan,
    /// The two-character sequence `->`.
    RArrow,
    Star,
    At,
    Slash,
    Plus,
    Minus,

    // Keywords. Variants with a trailing underscore (`String_`, `Option_`,
    // `Result_`, `From_`) are the keywords `string`, `option`, `result` and
    // `from`.
    Use,
    Type,
    Func,
    U8,
    U16,
    U32,
    U64,
    S8,
    S16,
    S32,
    S64,
    F32,
    F64,
    Char,
    Record,
    Resource,
    Own,
    Borrow,
    Flags,
    Variant,
    Enum,
    Bool,
    String_,
    Option_,
    Result_,
    Future,
    Stream,
    /// The keyword `error-context`.
    ErrorContext,
    List,
    Map,
    /// A lone `_`.
    Underscore,
    As,
    From_,
    Static,
    Interface,
    Tuple,
    Import,
    Export,
    World,
    Package,
    Constructor,
    Async,

    /// Any identifier-like run of characters that is not a keyword.
    Id,
    /// A `%` followed by an optional identifier-like run of characters.
    ExplicitId,

    /// A run of ASCII digits.
    Integer,

    // Further keywords.
    Include,
    With,
}
165
/// Errors reported while lexing.
///
/// The leading `u32` of each tuple variant, and the `at` field of `Wanted`,
/// is the value returned by `Error::position`.
#[derive(Eq, PartialEq, Debug)]
// NOTE(review): presumably needed because some variants (e.g. `InvalidEscape`)
// are not constructed in this file -- confirm before removing.
#[allow(dead_code)]
pub enum Error {
    /// The input contains a control character other than `\n`, `\r` or `\t`.
    ControlCodepoint(u32, char),
    /// The input contains a codepoint that Unicode deprecates or discourages.
    DeprecatedCodepoint(u32, char),
    /// The input contains a bidirectional override codepoint.
    ForbiddenCodepoint(u32, char),
    /// An identifier contains a character that `validate_id` rejects.
    InvalidCharInId(u32, char),
    /// An identifier is empty or has an empty `-`-separated part.
    IdPartEmpty(u32),
    /// An invalid escape in a string (wording taken from the `Display` impl).
    InvalidEscape(u32, char),
    /// A character that cannot start any token.
    Unexpected(u32, char),
    /// A `/*` block comment that is never closed; the position is where the
    /// comment starts.
    UnterminatedComment(u32),
    /// A specific token was required but something else was found.
    Wanted {
        /// Position of the token that was found, or of the end of input.
        at: u32,
        /// Description of the required token.
        expected: &'static str,
        /// Description of what was found instead (`"eof"` at end of input).
        found: &'static str,
    },
}
183
184impl<'a> Tokenizer<'a> {
185    pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>, Error> {
186        detect_invalid_input(input)?;
187
188        let mut t = Tokenizer {
189            input,
190            span_offset,
191            chars: CrlfFold {
192                chars: input.char_indices(),
193            },
194        };
195        // Eat utf-8 BOM
196        t.eatc('\u{feff}');
197        Ok(t)
198    }
199
200    pub fn expect_semicolon(&mut self) -> Result<(), Error> {
201        self.expect(Token::Semicolon)?;
202        Ok(())
203    }
204
205    pub fn get_span(&self, span: Span) -> &'a str {
206        let start = usize::try_from(span.start() - self.span_offset).unwrap();
207        let end = usize::try_from(span.end() - self.span_offset).unwrap();
208        &self.input[start..end]
209    }
210
211    pub fn parse_id(&self, span: Span) -> Result<&'a str, Error> {
212        let ret = self.get_span(span);
213        validate_id(span.start(), &ret)?;
214        Ok(ret)
215    }
216
217    pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str, Error> {
218        let token = self.get_span(span);
219        let id_part = token.strip_prefix('%').unwrap();
220        validate_id(span.start(), id_part)?;
221        Ok(id_part)
222    }
223
224    pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> {
225        loop {
226            match self.next_raw()? {
227                Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {}
228                other => break Ok(other),
229            }
230        }
231    }
232
233    /// Three possibilities when calling this method: an `Err(...)` indicates that lexing failed, an
234    /// `Ok(Some(...))` produces the next token, and `Ok(None)` indicates that there are no more
235    /// tokens available.
236    pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> {
237        let (str_start, ch) = match self.chars.next() {
238            Some(pair) => pair,
239            None => return Ok(None),
240        };
241        let start = self.span_offset + u32::try_from(str_start).unwrap();
242        let token = match ch {
243            '\n' | '\t' | ' ' => {
244                // Eat all contiguous whitespace tokens
245                while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {}
246                Whitespace
247            }
248            '/' => {
249                // Eat a line comment if it's `//...`
250                if self.eatc('/') {
251                    for (_, ch) in &mut self.chars {
252                        if ch == '\n' {
253                            break;
254                        }
255                    }
256                    Comment
257                // eat a block comment if it's `/*...`
258                } else if self.eatc('*') {
259                    let mut depth = 1;
260                    while depth > 0 {
261                        let (_, ch) = match self.chars.next() {
262                            Some(pair) => pair,
263                            None => return Err(Error::UnterminatedComment(start)),
264                        };
265                        match ch {
266                            '/' if self.eatc('*') => depth += 1,
267                            '*' if self.eatc('/') => depth -= 1,
268                            _ => {}
269                        }
270                    }
271                    Comment
272                } else {
273                    Slash
274                }
275            }
276            '=' => Equals,
277            ',' => Comma,
278            ':' => Colon,
279            '.' => Period,
280            ';' => Semicolon,
281            '(' => LeftParen,
282            ')' => RightParen,
283            '{' => LeftBrace,
284            '}' => RightBrace,
285            '<' => LessThan,
286            '>' => GreaterThan,
287            '*' => Star,
288            '@' => At,
289            '-' => {
290                if self.eatc('>') {
291                    RArrow
292                } else {
293                    Minus
294                }
295            }
296            '+' => Plus,
297            '%' => {
298                let mut iter = self.chars.clone();
299                if let Some((_, ch)) = iter.next() {
300                    if is_keylike_start(ch) {
301                        self.chars = iter.clone();
302                        while let Some((_, ch)) = iter.next() {
303                            if !is_keylike_continue(ch) {
304                                break;
305                            }
306                            self.chars = iter.clone();
307                        }
308                    }
309                }
310                ExplicitId
311            }
312            ch if is_keylike_start(ch) => {
313                let remaining = self.chars.chars.as_str().len();
314                let mut iter = self.chars.clone();
315                while let Some((_, ch)) = iter.next() {
316                    if !is_keylike_continue(ch) {
317                        break;
318                    }
319                    self.chars = iter.clone();
320                }
321                let str_end =
322                    str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len());
323                match &self.input[str_start..str_end] {
324                    "use" => Use,
325                    "type" => Type,
326                    "func" => Func,
327                    "u8" => U8,
328                    "u16" => U16,
329                    "u32" => U32,
330                    "u64" => U64,
331                    "s8" => S8,
332                    "s16" => S16,
333                    "s32" => S32,
334                    "s64" => S64,
335                    "f32" => F32,
336                    "f64" => F64,
337                    "char" => Char,
338                    "resource" => Resource,
339                    "own" => Own,
340                    "borrow" => Borrow,
341                    "record" => Record,
342                    "flags" => Flags,
343                    "variant" => Variant,
344                    "enum" => Enum,
345                    "bool" => Bool,
346                    "string" => String_,
347                    "option" => Option_,
348                    "result" => Result_,
349                    "future" => Future,
350                    "stream" => Stream,
351                    "error-context" => ErrorContext,
352                    "list" => List,
353                    "map" => Map,
354                    "_" => Underscore,
355                    "as" => As,
356                    "from" => From_,
357                    "static" => Static,
358                    "interface" => Interface,
359                    "tuple" => Tuple,
360                    "world" => World,
361                    "import" => Import,
362                    "export" => Export,
363                    "package" => Package,
364                    "constructor" => Constructor,
365                    "include" => Include,
366                    "with" => With,
367                    "async" => Async,
368                    _ => Id,
369                }
370            }
371
372            ch if ch.is_ascii_digit() => {
373                let mut iter = self.chars.clone();
374                while let Some((_, ch)) = iter.next() {
375                    if !ch.is_ascii_digit() {
376                        break;
377                    }
378                    self.chars = iter.clone();
379                }
380
381                Integer
382            }
383
384            ch => return Err(Error::Unexpected(start, ch)),
385        };
386        let end = match self.chars.clone().next() {
387            Some((i, _)) => i,
388            None => self.input.len(),
389        };
390
391        let end = self.span_offset + u32::try_from(end).unwrap();
392        Ok(Some((Span::new(start, end), token)))
393    }
394
395    pub fn eat(&mut self, expected: Token) -> Result<bool, Error> {
396        let mut other = self.clone();
397        match other.next()? {
398            Some((_span, found)) if expected == found => {
399                *self = other;
400                Ok(true)
401            }
402            Some(_) => Ok(false),
403            None => Ok(false),
404        }
405    }
406
407    pub fn expect(&mut self, expected: Token) -> Result<Span, Error> {
408        match self.next()? {
409            Some((span, found)) => {
410                if expected == found {
411                    Ok(span)
412                } else {
413                    Err(Error::Wanted {
414                        at: span.start(),
415                        expected: expected.describe(),
416                        found: found.describe(),
417                    })
418                }
419            }
420            None => Err(Error::Wanted {
421                at: self.span_offset + u32::try_from(self.input.len()).unwrap(),
422                expected: expected.describe(),
423                found: "eof",
424            }),
425        }
426    }
427
428    fn eatc(&mut self, ch: char) -> bool {
429        let mut iter = self.chars.clone();
430        match iter.next() {
431            Some((_, ch2)) if ch == ch2 => {
432                self.chars = iter;
433                true
434            }
435            _ => false,
436        }
437    }
438
439    pub fn eof_span(&self) -> Span {
440        let end = self.span_offset + u32::try_from(self.input.len()).unwrap();
441        Span::new(end, end)
442    }
443}
444
445impl<'a> Iterator for CrlfFold<'a> {
446    type Item = (usize, char);
447
448    fn next(&mut self) -> Option<(usize, char)> {
449        self.chars.next().map(|(i, c)| {
450            if c == '\r' {
451                let mut attempt = self.chars.clone();
452                if let Some((_, '\n')) = attempt.next() {
453                    self.chars = attempt;
454                    return (i, '\n');
455                }
456            }
457            (i, c)
458        })
459    }
460}
461
462fn detect_invalid_input(input: &str) -> Result<(), Error> {
463    // Disallow specific codepoints.
464    for (pos, ch) in input.char_indices() {
465        match ch {
466            '\n' | '\r' | '\t' => {}
467
468            // Bidirectional override codepoints can be used to craft source code that
469            // appears to have a different meaning than its actual meaning. See
470            // [CVE-2021-42574] for background and motivation.
471            //
472            // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
473            '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
474            | '\u{2067}' | '\u{2068}' | '\u{2069}' => {
475                return Err(Error::ForbiddenCodepoint(u32::try_from(pos).unwrap(), ch));
476            }
477
478            // Disallow several characters which are deprecated or discouraged in Unicode.
479            //
480            // U+149 deprecated; see Unicode 13.0.0, sec. 7.1 Latin, Compatibility Digraphs.
481            // U+673 deprecated; see Unicode 13.0.0, sec. 9.2 Arabic, Additional Vowel Marks.
482            // U+F77 and U+F79 deprecated; see Unicode 13.0.0, sec. 13.4 Tibetan, Vowels.
483            // U+17A3 and U+17A4 deprecated, and U+17B4 and U+17B5 discouraged; see
484            // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged.
485            '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
486            | '\u{17b4}' | '\u{17b5}' => {
487                return Err(Error::DeprecatedCodepoint(u32::try_from(pos).unwrap(), ch));
488            }
489
490            // Disallow control codes other than the ones explicitly recognized above,
491            // so that viewing a wit file on a terminal doesn't have surprising side
492            // effects or appear to have a different meaning than its actual meaning.
493            ch if ch.is_control() => {
494                return Err(Error::ControlCodepoint(u32::try_from(pos).unwrap(), ch));
495            }
496
497            _ => {}
498        }
499    }
500
501    Ok(())
502}
503
504fn is_keylike_start(ch: char) -> bool {
505    // Lex any XID start, `_`, or '-'. These aren't all valid identifier chars,
506    // but we'll diagnose that after we've lexed the full string.
507    UnicodeXID::is_xid_start(ch) || ch == '_' || ch == '-'
508}
509
510fn is_keylike_continue(ch: char) -> bool {
511    // Lex any XID continue (which includes `_`) or '-'.
512    UnicodeXID::is_xid_continue(ch) || ch == '-'
513}
514
515pub fn validate_id(start: u32, id: &str) -> Result<(), Error> {
516    // IDs must have at least one part.
517    if id.is_empty() {
518        return Err(Error::IdPartEmpty(start));
519    }
520
521    // Ids consist of parts separated by '-'s.
522    for (idx, part) in id.split('-').enumerate() {
523        // Parts must be non-empty and contain either all ASCII lowercase or
524        // all ASCII uppercase. Non-first segment can also start with a digit.
525        let Some(first_char) = part.chars().next() else {
526            return Err(Error::IdPartEmpty(start));
527        };
528        if idx == 0 && !first_char.is_ascii_alphabetic() {
529            return Err(Error::InvalidCharInId(start, first_char));
530        }
531        let mut upper = None;
532        for ch in part.chars() {
533            if ch.is_ascii_digit() {
534                // Digits are accepted in both uppercase and lowercase segments.
535            } else if ch.is_ascii_uppercase() {
536                if upper.is_none() {
537                    upper = Some(true);
538                } else if let Some(false) = upper {
539                    return Err(Error::InvalidCharInId(start, ch));
540                }
541            } else if ch.is_ascii_lowercase() {
542                if upper.is_none() {
543                    upper = Some(false);
544                } else if let Some(true) = upper {
545                    return Err(Error::InvalidCharInId(start, ch));
546                }
547            } else {
548                return Err(Error::InvalidCharInId(start, ch));
549            }
550        }
551    }
552
553    Ok(())
554}
555
556impl Token {
557    pub fn describe(&self) -> &'static str {
558        match self {
559            Whitespace => "whitespace",
560            Comment => "a comment",
561            Equals => "'='",
562            Comma => "','",
563            Colon => "':'",
564            Period => "'.'",
565            Semicolon => "';'",
566            LeftParen => "'('",
567            RightParen => "')'",
568            LeftBrace => "'{'",
569            RightBrace => "'}'",
570            LessThan => "'<'",
571            GreaterThan => "'>'",
572            Use => "keyword `use`",
573            Type => "keyword `type`",
574            Func => "keyword `func`",
575            U8 => "keyword `u8`",
576            U16 => "keyword `u16`",
577            U32 => "keyword `u32`",
578            U64 => "keyword `u64`",
579            S8 => "keyword `s8`",
580            S16 => "keyword `s16`",
581            S32 => "keyword `s32`",
582            S64 => "keyword `s64`",
583            F32 => "keyword `f32`",
584            F64 => "keyword `f64`",
585            Char => "keyword `char`",
586            Own => "keyword `own`",
587            Borrow => "keyword `borrow`",
588            Resource => "keyword `resource`",
589            Record => "keyword `record`",
590            Flags => "keyword `flags`",
591            Variant => "keyword `variant`",
592            Enum => "keyword `enum`",
593            Bool => "keyword `bool`",
594            String_ => "keyword `string`",
595            Option_ => "keyword `option`",
596            Result_ => "keyword `result`",
597            Future => "keyword `future`",
598            Stream => "keyword `stream`",
599            ErrorContext => "keyword `error-context`",
600            List => "keyword `list`",
601            Map => "keyword `map`",
602            Underscore => "keyword `_`",
603            Id => "an identifier",
604            ExplicitId => "an '%' identifier",
605            RArrow => "`->`",
606            Star => "`*`",
607            At => "`@`",
608            Slash => "`/`",
609            Plus => "`+`",
610            Minus => "`-`",
611            As => "keyword `as`",
612            From_ => "keyword `from`",
613            Static => "keyword `static`",
614            Interface => "keyword `interface`",
615            Tuple => "keyword `tuple`",
616            Import => "keyword `import`",
617            Export => "keyword `export`",
618            World => "keyword `world`",
619            Package => "keyword `package`",
620            Constructor => "keyword `constructor`",
621            Integer => "an integer",
622            Include => "keyword `include`",
623            With => "keyword `with`",
624            Async => "keyword `async`",
625        }
626    }
627}
628
// Marker implementation with no overridden methods; it relies on the `Debug`
// derive and the `Display` implementation that `Error` has in this file.
impl core::error::Error for Error {}
630
631impl Error {
632    /// Returns the byte offset in the source map where this error occurred.
633    pub fn position(&self) -> u32 {
634        match self {
635            Error::ControlCodepoint(at, _)
636            | Error::DeprecatedCodepoint(at, _)
637            | Error::ForbiddenCodepoint(at, _)
638            | Error::InvalidCharInId(at, _)
639            | Error::IdPartEmpty(at)
640            | Error::InvalidEscape(at, _)
641            | Error::Unexpected(at, _)
642            | Error::UnterminatedComment(at) => *at,
643            Error::Wanted { at, .. } => *at,
644        }
645    }
646}
647
648impl fmt::Display for Error {
649    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
650        match self {
651            Error::ControlCodepoint(_, ch) => write!(f, "Control code '{}'", ch.escape_unicode()),
652            Error::DeprecatedCodepoint(_, ch) => {
653                write!(
654                    f,
655                    "Codepoint {:?} is discouraged by Unicode",
656                    ch.escape_unicode()
657                )
658            }
659            Error::ForbiddenCodepoint(_, ch) => {
660                write!(
661                    f,
662                    "Input contains bidirectional override codepoint {:?}",
663                    ch.escape_unicode()
664                )
665            }
666            Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"),
667            Error::UnterminatedComment(_) => write!(f, "unterminated block comment"),
668            Error::Wanted {
669                expected, found, ..
670            } => write!(f, "expected {expected}, found {found}"),
671            Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"),
672            Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"),
673            Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"),
674        }
675    }
676}
677
#[test]
fn test_validate_id() {
    // Identifiers that must be accepted.
    let accepted = [
        "apple",
        "apple-pear",
        "apple-pear-grape",
        "a0",
        "a",
        "a-a",
        "bool",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
        "apple-0",
        "a0-000-3d4a-54FF",
    ];
    for &id in &accepted {
        validate_id(0, id).unwrap();
    }

    // Identifiers that must be rejected.
    let rejected = [
        "",
        "0",
        "%",
        "$",
        "0a",
        ".",
        "·",
        "a a",
        "_",
        "-",
        "a-",
        "-a",
        "Apple",
        "applE",
        "-apple-pear",
        "apple-pear-",
        "apple_pear",
        "apple.pear",
        "apple pear",
        "apple/pear",
        "apple|pear",
        "apple-Pear",
        "()()",
        "",
        "*",
        "apple\u{5f3}pear",
        "apple\u{200c}pear",
        "apple\u{200d}pear",
        "apple--pear",
        "_apple",
        "apple_",
        "_Znwj",
        "__i386",
        "__i386__",
        "Москва",
        "garçon-hühnervögel-Москва-東京",
        "a0-000-3d4A-54Ff",
    ];
    for &id in &rejected {
        assert!(validate_id(0, id).is_err());
    }

    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
}
737
#[test]
fn test_tokenizer() {
    /// Lexes `s` completely, returning only the token kinds.
    fn collect(s: &str) -> Result<Vec<Token>, Error> {
        let mut lexer = Tokenizer::new(s, 0)?;
        let mut kinds = Vec::new();
        loop {
            match lexer.next()? {
                Some((_, kind)) => kinds.push(kind),
                None => return Ok(kinds),
            }
        }
    }

    // Inputs paired with the exact token sequence they must produce.
    let expectations: Vec<(&str, Vec<Token>)> = vec![
        ("", vec![]),
        ("_", vec![Token::Underscore]),
        ("apple", vec![Token::Id]),
        ("apple-pear", vec![Token::Id]),
        ("apple--pear", vec![Token::Id]),
        ("apple-Pear", vec![Token::Id]),
        ("apple-pear-grape", vec![Token::Id]),
        ("apple pear", vec![Token::Id, Token::Id]),
        ("_a_p_p_l_e_", vec![Token::Id]),
        ("garçon", vec![Token::Id]),
        ("hühnervögel", vec![Token::Id]),
        ("москва", vec![Token::Id]),
        ("東京", vec![Token::Id]),
        ("garçon-hühnervögel-москва-東京", vec![Token::Id]),
        ("a0", vec![Token::Id]),
        ("a", vec![Token::Id]),
        ("%a", vec![Token::ExplicitId]),
        ("%a-a", vec![Token::ExplicitId]),
        ("%bool", vec![Token::ExplicitId]),
        ("%", vec![Token::ExplicitId]),
        ("APPLE", vec![Token::Id]),
        ("APPLE-PEAR", vec![Token::Id]),
        ("APPLE-PEAR-GRAPE", vec![Token::Id]),
        ("apple-PEAR-grape", vec![Token::Id]),
        ("APPLE-pear-GRAPE", vec![Token::Id]),
        ("ENOENT", vec![Token::Id]),
        ("is-XML", vec![Token::Id]),
        ("func", vec![Token::Func]),
        (
            "a: func()",
            vec![
                Token::Id,
                Token::Colon,
                Token::Func,
                Token::LeftParen,
                Token::RightParen,
            ],
        ),
        ("resource", vec![Token::Resource]),
        ("own", vec![Token::Own]),
        (
            "own<some-id>",
            vec![Token::Own, Token::LessThan, Token::Id, Token::GreaterThan],
        ),
        ("borrow", vec![Token::Borrow]),
        (
            "borrow<some-id>",
            vec![
                Token::Borrow,
                Token::LessThan,
                Token::Id,
                Token::GreaterThan,
            ],
        ),
    ];
    for (source, wanted) in expectations {
        assert_eq!(collect(source).unwrap(), wanted);
    }

    // Inputs that must be rejected, with the reason shown on failure.
    let rejected = [
        ("\u{149}", "strongly discouraged"),
        ("\u{673}", "strongly discouraged"),
        ("\u{17a3}", "strongly discouraged"),
        ("\u{17a4}", "strongly discouraged"),
        ("\u{202a}", "bidirectional override"),
        ("\u{2068}", "bidirectional override"),
        ("\u{0}", "control code"),
        ("\u{b}", "control code"),
        ("\u{c}", "control code"),
        ("\u{85}", "control code"),
    ];
    for &(source, reason) in &rejected {
        assert!(collect(source).is_err(), "{}", reason);
    }
}
821}