Skip to main content

wit_parser/ast/
lex.rs

1#[cfg(test)]
2use alloc::{vec, vec::Vec};
3use anyhow::{Result, bail};
4use core::char;
5use core::fmt;
6use core::str;
7use unicode_xid::UnicodeXID;
8
9use self::Token::*;
10
11#[derive(Clone)]
12pub struct Tokenizer<'a> {
13    input: &'a str,
14    span_offset: u32,
15    chars: CrlfFold<'a>,
16}
17
/// Wrapper around `str::CharIndices` whose `Iterator` implementation reports
/// a `\r\n` pair as a single `\n`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying, unfolded character/byte-index iterator.
    chars: str::CharIndices<'a>,
}
22
/// A span, designating a range of bytes where a token is located.
///
/// `u32::MAX` is reserved as a sentinel: a span whose `start` or `end` holds
/// that value is treated as unknown (e.g., decoded from binary). The
/// `Default` implementation produces such an unknown span.
#[derive(Eq, PartialEq, Debug, Clone, Copy, Hash)]
pub struct Span {
    start: u32,
    end: u32,
}

impl Default for Span {
    /// Returns the unknown span, with both offsets set to `u32::MAX`.
    fn default() -> Span {
        let sentinel = u32::MAX;
        Span {
            start: sentinel,
            end: sentinel,
        }
    }
}

impl Span {
    /// Builds a span covering `start..end`.
    ///
    /// # Panics
    ///
    /// Panics if either offset is `u32::MAX`, since that value marks an
    /// unknown span.
    pub fn new(start: u32, end: u32) -> Span {
        let candidate = Span { start, end };
        assert!(candidate.is_known(), "cannot create a span with u32::MAX");
        candidate
    }

    /// Shifts both ends of this span forward by `offset` bytes.
    ///
    /// Unknown spans are left untouched.
    pub fn adjust(&mut self, offset: u32) {
        if !self.is_known() {
            return;
        }
        self.start += offset;
        self.end += offset;
    }

    /// Returns the start offset.
    ///
    /// # Panics
    ///
    /// Panics if this span is unknown.
    pub fn start(&self) -> u32 {
        assert!(self.is_known(), "cannot get start of unknown span");
        self.start
    }

    /// Returns the end offset.
    ///
    /// # Panics
    ///
    /// Panics if this span is unknown.
    pub fn end(&self) -> u32 {
        assert!(self.is_known(), "cannot get end of unknown span");
        self.end
    }

    /// Replaces the end offset with `new_end`.
    ///
    /// An unknown span becomes the zero-width span located at `new_end`.
    pub fn set_end(&mut self, new_end: u32) {
        if self.is_known() {
            self.end = new_end;
        } else {
            *self = Span {
                start: new_end,
                end: new_end,
            };
        }
    }

    /// Replaces the start offset with `new_start`.
    ///
    /// An unknown span becomes the zero-width span located at `new_start`.
    pub fn set_start(&mut self, new_start: u32) {
        if self.is_known() {
            self.start = new_start;
        } else {
            *self = Span {
                start: new_start,
                end: new_start,
            };
        }
    }

    /// Returns `true` when neither offset holds the `u32::MAX` sentinel.
    pub fn is_known(&self) -> bool {
        !(self.start == u32::MAX || self.end == u32::MAX)
    }
}
90
/// The kinds of token produced by the tokenizer.
///
/// A token carries no text of its own; the matching source text is recovered
/// from the span that accompanies it.
#[derive(Eq, PartialEq, Debug, Copy, Clone)]
pub enum Token {
    // Trivia.
    /// A run of spaces, tabs and newlines.
    Whitespace,
    /// A `//` line comment or a `/* ... */` block comment.
    Comment,

    // Punctuation.
    Equals,
    Comma,
    Colon,
    Period,
    Semicolon,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LessThan,
    GreaterThan,
    /// The two-character sequence `->`.
    RArrow,
    Star,
    At,
    Slash,
    Plus,
    Minus,

    // Keywords.
    Use,
    Type,
    Func,
    U8,
    U16,
    U32,
    U64,
    S8,
    S16,
    S32,
    S64,
    F32,
    F64,
    Char,
    Record,
    Resource,
    Own,
    Borrow,
    Flags,
    Variant,
    Enum,
    Bool,
    String_,
    Option_,
    Result_,
    Future,
    Stream,
    /// The hyphenated keyword `error-context`.
    ErrorContext,
    List,
    Map,
    /// A lone `_`.
    Underscore,
    As,
    From_,
    Static,
    Interface,
    Tuple,
    Import,
    Export,
    World,
    Package,
    Constructor,
    Async,

    // Identifiers.
    /// An identifier-like word that is not a keyword.
    Id,
    /// A `%`-prefixed identifier.
    ExplicitId,

    // Literals.
    /// A run of ASCII digits.
    Integer,

    // Further keywords.
    Include,
    With,
}
165
/// Errors reported while lexing or validating identifiers.
///
/// The `u32` carried by each variant (`at` for [`Error::Wanted`]) is the
/// offset the error is reported at.
#[derive(Eq, PartialEq, Debug)]
#[allow(dead_code)]
pub enum Error {
    /// An identifier contains a character that is not permitted there.
    InvalidCharInId(u32, char),
    /// An identifier is empty or has an empty `-`-separated part.
    IdPartEmpty(u32),
    /// An invalid escape character was found in a string.
    InvalidEscape(u32, char),
    /// A character that cannot begin any token was encountered.
    Unexpected(u32, char),
    /// A `/*` block comment was still open at the end of input.
    UnterminatedComment(u32),
    /// One token was required but something else (or end of input) was found.
    Wanted {
        at: u32,
        expected: &'static str,
        found: &'static str,
    },
}
180
181impl<'a> Tokenizer<'a> {
182    pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>> {
183        detect_invalid_input(input)?;
184
185        let mut t = Tokenizer {
186            input,
187            span_offset,
188            chars: CrlfFold {
189                chars: input.char_indices(),
190            },
191        };
192        // Eat utf-8 BOM
193        t.eatc('\u{feff}');
194        Ok(t)
195    }
196
197    pub fn expect_semicolon(&mut self) -> Result<()> {
198        self.expect(Token::Semicolon)?;
199        Ok(())
200    }
201
202    pub fn get_span(&self, span: Span) -> &'a str {
203        let start = usize::try_from(span.start() - self.span_offset).unwrap();
204        let end = usize::try_from(span.end() - self.span_offset).unwrap();
205        &self.input[start..end]
206    }
207
208    pub fn parse_id(&self, span: Span) -> Result<&'a str> {
209        let ret = self.get_span(span);
210        validate_id(span.start(), &ret)?;
211        Ok(ret)
212    }
213
214    pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> {
215        let token = self.get_span(span);
216        let id_part = token.strip_prefix('%').unwrap();
217        validate_id(span.start(), id_part)?;
218        Ok(id_part)
219    }
220
221    pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> {
222        loop {
223            match self.next_raw()? {
224                Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {}
225                other => break Ok(other),
226            }
227        }
228    }
229
230    /// Three possibilities when calling this method: an `Err(...)` indicates that lexing failed, an
231    /// `Ok(Some(...))` produces the next token, and `Ok(None)` indicates that there are no more
232    /// tokens available.
233    pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> {
234        let (str_start, ch) = match self.chars.next() {
235            Some(pair) => pair,
236            None => return Ok(None),
237        };
238        let start = self.span_offset + u32::try_from(str_start).unwrap();
239        let token = match ch {
240            '\n' | '\t' | ' ' => {
241                // Eat all contiguous whitespace tokens
242                while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {}
243                Whitespace
244            }
245            '/' => {
246                // Eat a line comment if it's `//...`
247                if self.eatc('/') {
248                    for (_, ch) in &mut self.chars {
249                        if ch == '\n' {
250                            break;
251                        }
252                    }
253                    Comment
254                // eat a block comment if it's `/*...`
255                } else if self.eatc('*') {
256                    let mut depth = 1;
257                    while depth > 0 {
258                        let (_, ch) = match self.chars.next() {
259                            Some(pair) => pair,
260                            None => return Err(Error::UnterminatedComment(start)),
261                        };
262                        match ch {
263                            '/' if self.eatc('*') => depth += 1,
264                            '*' if self.eatc('/') => depth -= 1,
265                            _ => {}
266                        }
267                    }
268                    Comment
269                } else {
270                    Slash
271                }
272            }
273            '=' => Equals,
274            ',' => Comma,
275            ':' => Colon,
276            '.' => Period,
277            ';' => Semicolon,
278            '(' => LeftParen,
279            ')' => RightParen,
280            '{' => LeftBrace,
281            '}' => RightBrace,
282            '<' => LessThan,
283            '>' => GreaterThan,
284            '*' => Star,
285            '@' => At,
286            '-' => {
287                if self.eatc('>') {
288                    RArrow
289                } else {
290                    Minus
291                }
292            }
293            '+' => Plus,
294            '%' => {
295                let mut iter = self.chars.clone();
296                if let Some((_, ch)) = iter.next() {
297                    if is_keylike_start(ch) {
298                        self.chars = iter.clone();
299                        while let Some((_, ch)) = iter.next() {
300                            if !is_keylike_continue(ch) {
301                                break;
302                            }
303                            self.chars = iter.clone();
304                        }
305                    }
306                }
307                ExplicitId
308            }
309            ch if is_keylike_start(ch) => {
310                let remaining = self.chars.chars.as_str().len();
311                let mut iter = self.chars.clone();
312                while let Some((_, ch)) = iter.next() {
313                    if !is_keylike_continue(ch) {
314                        break;
315                    }
316                    self.chars = iter.clone();
317                }
318                let str_end =
319                    str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len());
320                match &self.input[str_start..str_end] {
321                    "use" => Use,
322                    "type" => Type,
323                    "func" => Func,
324                    "u8" => U8,
325                    "u16" => U16,
326                    "u32" => U32,
327                    "u64" => U64,
328                    "s8" => S8,
329                    "s16" => S16,
330                    "s32" => S32,
331                    "s64" => S64,
332                    "f32" => F32,
333                    "f64" => F64,
334                    "char" => Char,
335                    "resource" => Resource,
336                    "own" => Own,
337                    "borrow" => Borrow,
338                    "record" => Record,
339                    "flags" => Flags,
340                    "variant" => Variant,
341                    "enum" => Enum,
342                    "bool" => Bool,
343                    "string" => String_,
344                    "option" => Option_,
345                    "result" => Result_,
346                    "future" => Future,
347                    "stream" => Stream,
348                    "error-context" => ErrorContext,
349                    "list" => List,
350                    "map" => Map,
351                    "_" => Underscore,
352                    "as" => As,
353                    "from" => From_,
354                    "static" => Static,
355                    "interface" => Interface,
356                    "tuple" => Tuple,
357                    "world" => World,
358                    "import" => Import,
359                    "export" => Export,
360                    "package" => Package,
361                    "constructor" => Constructor,
362                    "include" => Include,
363                    "with" => With,
364                    "async" => Async,
365                    _ => Id,
366                }
367            }
368
369            ch if ch.is_ascii_digit() => {
370                let mut iter = self.chars.clone();
371                while let Some((_, ch)) = iter.next() {
372                    if !ch.is_ascii_digit() {
373                        break;
374                    }
375                    self.chars = iter.clone();
376                }
377
378                Integer
379            }
380
381            ch => return Err(Error::Unexpected(start, ch)),
382        };
383        let end = match self.chars.clone().next() {
384            Some((i, _)) => i,
385            None => self.input.len(),
386        };
387
388        let end = self.span_offset + u32::try_from(end).unwrap();
389        Ok(Some((Span::new(start, end), token)))
390    }
391
392    pub fn eat(&mut self, expected: Token) -> Result<bool, Error> {
393        let mut other = self.clone();
394        match other.next()? {
395            Some((_span, found)) if expected == found => {
396                *self = other;
397                Ok(true)
398            }
399            Some(_) => Ok(false),
400            None => Ok(false),
401        }
402    }
403
404    pub fn expect(&mut self, expected: Token) -> Result<Span, Error> {
405        match self.next()? {
406            Some((span, found)) => {
407                if expected == found {
408                    Ok(span)
409                } else {
410                    Err(Error::Wanted {
411                        at: span.start(),
412                        expected: expected.describe(),
413                        found: found.describe(),
414                    })
415                }
416            }
417            None => Err(Error::Wanted {
418                at: self.span_offset + u32::try_from(self.input.len()).unwrap(),
419                expected: expected.describe(),
420                found: "eof",
421            }),
422        }
423    }
424
425    fn eatc(&mut self, ch: char) -> bool {
426        let mut iter = self.chars.clone();
427        match iter.next() {
428            Some((_, ch2)) if ch == ch2 => {
429                self.chars = iter;
430                true
431            }
432            _ => false,
433        }
434    }
435
436    pub fn eof_span(&self) -> Span {
437        let end = self.span_offset + u32::try_from(self.input.len()).unwrap();
438        Span::new(end, end)
439    }
440}
441
442impl<'a> Iterator for CrlfFold<'a> {
443    type Item = (usize, char);
444
445    fn next(&mut self) -> Option<(usize, char)> {
446        self.chars.next().map(|(i, c)| {
447            if c == '\r' {
448                let mut attempt = self.chars.clone();
449                if let Some((_, '\n')) = attempt.next() {
450                    self.chars = attempt;
451                    return (i, '\n');
452                }
453            }
454            (i, c)
455        })
456    }
457}
458
459fn detect_invalid_input(input: &str) -> Result<()> {
460    // Disallow specific codepoints.
461    let mut line = 1;
462    for ch in input.chars() {
463        match ch {
464            '\n' => line += 1,
465            '\r' | '\t' => {}
466
467            // Bidirectional override codepoints can be used to craft source code that
468            // appears to have a different meaning than its actual meaning. See
469            // [CVE-2021-42574] for background and motivation.
470            //
471            // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
472            '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
473            | '\u{2067}' | '\u{2068}' | '\u{2069}' => {
474                bail!(
475                    "Input contains bidirectional override codepoint {:?} at line {}",
476                    ch.escape_unicode(),
477                    line
478                );
479            }
480
481            // Disallow several characters which are deprecated or discouraged in Unicode.
482            //
483            // U+149 deprecated; see Unicode 13.0.0, sec. 7.1 Latin, Compatibility Digraphs.
484            // U+673 deprecated; see Unicode 13.0.0, sec. 9.2 Arabic, Additional Vowel Marks.
485            // U+F77 and U+F79 deprecated; see Unicode 13.0.0, sec. 13.4 Tibetan, Vowels.
486            // U+17A3 and U+17A4 deprecated, and U+17B4 and U+17B5 discouraged; see
487            // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged.
488            '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
489            | '\u{17b4}' | '\u{17b5}' => {
490                bail!(
491                    "Codepoint {:?} at line {} is discouraged by Unicode",
492                    ch.escape_unicode(),
493                    line
494                );
495            }
496
497            // Disallow control codes other than the ones explicitly recognized above,
498            // so that viewing a wit file on a terminal doesn't have surprising side
499            // effects or appear to have a different meaning than its actual meaning.
500            ch if ch.is_control() => {
501                bail!("Control code '{}' at line {}", ch.escape_unicode(), line);
502            }
503
504            _ => {}
505        }
506    }
507
508    Ok(())
509}
510
511fn is_keylike_start(ch: char) -> bool {
512    // Lex any XID start, `_`, or '-'. These aren't all valid identifier chars,
513    // but we'll diagnose that after we've lexed the full string.
514    UnicodeXID::is_xid_start(ch) || ch == '_' || ch == '-'
515}
516
517fn is_keylike_continue(ch: char) -> bool {
518    // Lex any XID continue (which includes `_`) or '-'.
519    UnicodeXID::is_xid_continue(ch) || ch == '-'
520}
521
522pub fn validate_id(start: u32, id: &str) -> Result<(), Error> {
523    // IDs must have at least one part.
524    if id.is_empty() {
525        return Err(Error::IdPartEmpty(start));
526    }
527
528    // Ids consist of parts separated by '-'s.
529    for (idx, part) in id.split('-').enumerate() {
530        // Parts must be non-empty and contain either all ASCII lowercase or
531        // all ASCII uppercase. Non-first segment can also start with a digit.
532        let Some(first_char) = part.chars().next() else {
533            return Err(Error::IdPartEmpty(start));
534        };
535        if idx == 0 && !first_char.is_ascii_alphabetic() {
536            return Err(Error::InvalidCharInId(start, first_char));
537        }
538        let mut upper = None;
539        for ch in part.chars() {
540            if ch.is_ascii_digit() {
541                // Digits are accepted in both uppercase and lowercase segments.
542            } else if ch.is_ascii_uppercase() {
543                if upper.is_none() {
544                    upper = Some(true);
545                } else if let Some(false) = upper {
546                    return Err(Error::InvalidCharInId(start, ch));
547                }
548            } else if ch.is_ascii_lowercase() {
549                if upper.is_none() {
550                    upper = Some(false);
551                } else if let Some(true) = upper {
552                    return Err(Error::InvalidCharInId(start, ch));
553                }
554            } else {
555                return Err(Error::InvalidCharInId(start, ch));
556            }
557        }
558    }
559
560    Ok(())
561}
562
563impl Token {
564    pub fn describe(&self) -> &'static str {
565        match self {
566            Whitespace => "whitespace",
567            Comment => "a comment",
568            Equals => "'='",
569            Comma => "','",
570            Colon => "':'",
571            Period => "'.'",
572            Semicolon => "';'",
573            LeftParen => "'('",
574            RightParen => "')'",
575            LeftBrace => "'{'",
576            RightBrace => "'}'",
577            LessThan => "'<'",
578            GreaterThan => "'>'",
579            Use => "keyword `use`",
580            Type => "keyword `type`",
581            Func => "keyword `func`",
582            U8 => "keyword `u8`",
583            U16 => "keyword `u16`",
584            U32 => "keyword `u32`",
585            U64 => "keyword `u64`",
586            S8 => "keyword `s8`",
587            S16 => "keyword `s16`",
588            S32 => "keyword `s32`",
589            S64 => "keyword `s64`",
590            F32 => "keyword `f32`",
591            F64 => "keyword `f64`",
592            Char => "keyword `char`",
593            Own => "keyword `own`",
594            Borrow => "keyword `borrow`",
595            Resource => "keyword `resource`",
596            Record => "keyword `record`",
597            Flags => "keyword `flags`",
598            Variant => "keyword `variant`",
599            Enum => "keyword `enum`",
600            Bool => "keyword `bool`",
601            String_ => "keyword `string`",
602            Option_ => "keyword `option`",
603            Result_ => "keyword `result`",
604            Future => "keyword `future`",
605            Stream => "keyword `stream`",
606            ErrorContext => "keyword `error-context`",
607            List => "keyword `list`",
608            Map => "keyword `map`",
609            Underscore => "keyword `_`",
610            Id => "an identifier",
611            ExplicitId => "an '%' identifier",
612            RArrow => "`->`",
613            Star => "`*`",
614            At => "`@`",
615            Slash => "`/`",
616            Plus => "`+`",
617            Minus => "`-`",
618            As => "keyword `as`",
619            From_ => "keyword `from`",
620            Static => "keyword `static`",
621            Interface => "keyword `interface`",
622            Tuple => "keyword `tuple`",
623            Import => "keyword `import`",
624            Export => "keyword `export`",
625            World => "keyword `world`",
626            Package => "keyword `package`",
627            Constructor => "keyword `constructor`",
628            Integer => "an integer",
629            Include => "keyword `include`",
630            With => "keyword `with`",
631            Async => "keyword `async`",
632        }
633    }
634}
635
636impl core::error::Error for Error {}
637
638impl fmt::Display for Error {
639    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
640        match self {
641            Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"),
642            Error::UnterminatedComment(_) => write!(f, "unterminated block comment"),
643            Error::Wanted {
644                expected, found, ..
645            } => write!(f, "expected {expected}, found {found}"),
646            Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"),
647            Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"),
648            Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"),
649        }
650    }
651}
652
/// Checks `validate_id` against identifiers it must accept and must reject.
#[test]
fn test_validate_id() {
    let accepted = [
        "apple",
        "apple-pear",
        "apple-pear-grape",
        "a0",
        "a",
        "a-a",
        "bool",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
        "apple-0",
        "a0-000-3d4a-54FF",
    ];
    for id in accepted {
        validate_id(0, id).unwrap();
    }

    let rejected = [
        "",
        "0",
        "%",
        "$",
        "0a",
        ".",
        "·",
        "a a",
        "_",
        "-",
        "a-",
        "-a",
        "Apple",
        "applE",
        "-apple-pear",
        "apple-pear-",
        "apple_pear",
        "apple.pear",
        "apple pear",
        "apple/pear",
        "apple|pear",
        "apple-Pear",
        "()()",
        "",
        "*",
        "apple\u{5f3}pear",
        "apple\u{200c}pear",
        "apple\u{200d}pear",
        "apple--pear",
        "_apple",
        "apple_",
        "_Znwj",
        "__i386",
        "__i386__",
        "Москва",
        "garçon-hühnervögel-Москва-東京",
        "a0-000-3d4A-54Ff",
    ];
    for id in rejected {
        assert!(validate_id(0, id).is_err());
    }

    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
}
712
/// Checks the token kinds the tokenizer produces for a range of inputs, and
/// that inputs containing forbidden codepoints are rejected.
#[test]
fn test_tokenizer() {
    /// Lexes `s` completely, returning the significant token kinds in order.
    fn collect(s: &str) -> Result<Vec<Token>> {
        let mut lexer = Tokenizer::new(s, 0)?;
        let mut kinds = Vec::new();
        loop {
            match lexer.next()? {
                Some((_, kind)) => kinds.push(kind),
                None => return Ok(kinds),
            }
        }
    }

    /// Asserts that `src` lexes to exactly `want`.
    fn lexes_to(src: &str, want: &[Token]) {
        assert_eq!(collect(src).unwrap(), want.to_vec());
    }

    /// Asserts that `src` fails to lex, labelling a failure with `why`.
    fn rejected(src: &str, why: &str) {
        assert!(collect(src).is_err(), "{}", why);
    }

    lexes_to("", &[]);
    lexes_to("_", &[Token::Underscore]);
    lexes_to("apple pear", &[Token::Id, Token::Id]);

    let single_ids = [
        "apple",
        "apple-pear",
        "apple--pear",
        "apple-Pear",
        "apple-pear-grape",
        "_a_p_p_l_e_",
        "garçon",
        "hühnervögel",
        "москва",
        "東京",
        "garçon-hühnervögel-москва-東京",
        "a0",
        "a",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
    ];
    for src in single_ids {
        lexes_to(src, &[Token::Id]);
    }

    for src in ["%a", "%a-a", "%bool", "%"] {
        lexes_to(src, &[Token::ExplicitId]);
    }

    lexes_to("func", &[Token::Func]);
    lexes_to(
        "a: func()",
        &[
            Token::Id,
            Token::Colon,
            Token::Func,
            Token::LeftParen,
            Token::RightParen,
        ],
    );

    lexes_to("resource", &[Token::Resource]);

    lexes_to("own", &[Token::Own]);
    lexes_to(
        "own<some-id>",
        &[Token::Own, Token::LessThan, Token::Id, Token::GreaterThan],
    );

    lexes_to("borrow", &[Token::Borrow]);
    lexes_to(
        "borrow<some-id>",
        &[
            Token::Borrow,
            Token::LessThan,
            Token::Id,
            Token::GreaterThan,
        ],
    );

    rejected("\u{149}", "strongly discouraged");
    rejected("\u{673}", "strongly discouraged");
    rejected("\u{17a3}", "strongly discouraged");
    rejected("\u{17a4}", "strongly discouraged");
    rejected("\u{202a}", "bidirectional override");
    rejected("\u{2068}", "bidirectional override");
    rejected("\u{0}", "control code");
    rejected("\u{b}", "control code");
    rejected("\u{c}", "control code");
    rejected("\u{85}", "control code");
}