inf_wast/
lexer.rs

1//! Definition of a lexer for the WebAssembly text format.
2//!
//! This module provides a [`Lexer`][] type which is an iterator over the raw
//! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single
//! byte in a WebAssembly text file, returning tokens even for comments and
//! whitespace. Typically you'll ignore comments and whitespace, however.
7//!
8//! If you'd like to iterate over the tokens in a file you can do so via:
9//!
10//! ```
11//! # fn foo() -> Result<(), inf_wast::Error> {
12//! use inf_wast::lexer::Lexer;
13//!
14//! let wat = "(module (func $foo))";
15//! for token in Lexer::new(wat).iter(0) {
16//!     println!("{:?}", token?);
17//! }
18//! # Ok(())
19//! # }
20//! ```
21//!
22//! Note that you'll typically not use this module but will rather use
23//! [`ParseBuffer`](crate::parser::ParseBuffer) instead.
24//!
25//! [`Lexer`]: crate::lexer::Lexer
26
27use crate::token::Span;
28use crate::Error;
29use std::borrow::Cow;
30use std::char;
31use std::fmt;
32use std::slice;
33use std::str;
34use std::str::Utf8Error;
35
/// A structure used to lex the s-expression syntax of WAT files.
///
/// This structure is used to generate [`Token`] items, which should account for
/// every single byte of the input as we iterate over it. A [`LexError`] is
/// returned for any non-lexable text.
#[derive(Clone)]
pub struct Lexer<'a> {
    // The complete source text being lexed; token offsets/lengths index into
    // this string.
    input: &'a str,
    // When `true`, "trojan source"-style unicode characters are permitted in
    // comments and string literals instead of producing a lex error. See
    // `Lexer::allow_confusing_unicode`.
    allow_confusing_unicode: bool,
}
46
/// A single token parsed from a `Lexer`.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct Token {
    /// The kind of token this represents, such as whether it's whitespace, a
    /// keyword, etc.
    pub kind: TokenKind,
    /// The byte offset within the original source for where this token came
    /// from.
    pub offset: usize,
    /// The byte length of this token as it resides in the original source.
    ///
    /// The token's text is `&input[offset..][..len as usize]`.
    //
    // NB: this is `u32` to enable packing `Token` into two pointers of size.
    // This does limit a single token to being at most 4G large, but that seems
    // probably ok.
    pub len: u32,
}
63
// `Token` is intended to pack into the size of two pointers (see the note on
// `Token::len`); this guards against accidental growth.
#[test]
fn token_is_not_too_big() {
    use std::mem::size_of;
    assert!(size_of::<Token>() <= 2 * size_of::<u64>());
}
68
/// Classification of what was parsed from the input stream.
///
/// This enumeration contains all kinds of fragments, including comments and
/// whitespace.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum TokenKind {
    /// A line comment, preceded with `;;`
    LineComment,

    /// A block comment, surrounded by `(;` and `;)`. Note that these can be
    /// nested.
    BlockComment,

    /// A fragment of source that represents whitespace.
    Whitespace,

    /// A left-parenthesis, including the source text for where it comes from.
    LParen,
    /// A right-parenthesis, including the source text for where it comes from.
    RParen,

    /// A string literal, which is actually a list of bytes.
    String,

    /// An identifier (like `$foo`).
    ///
    /// All identifiers start with `$` and the payload here is the original
    /// source text.
    Id,

    /// A keyword, or something that starts with an alphabetic character.
    ///
    /// The payload here is the original source text.
    Keyword,

    /// An annotation (like `@foo`).
    ///
    /// All annotations start with `@` and the payload will be the name of the
    /// annotation.
    Annotation,

    /// A reserved series of `idchar` symbols. Unknown what this is meant to be
    /// used for, you'll probably generate an error about an unexpected token.
    Reserved,

    /// An integer. The payload describes the shape of the literal (sign, hex,
    /// underscores); no numeric value is computed during lexing.
    Integer(IntegerKind),

    /// A float. Like `Integer`, the payload only classifies the literal's
    /// form.
    Float(FloatKind),
}
120
/// Description of the parsed integer from the source.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct IntegerKind {
    // The leading `+`/`-` sign, if one was present.
    sign: Option<SignToken>,
    // Whether any `_` digit separators appeared in the literal.
    has_underscores: bool,
    // Whether the literal used the `0x` hexadecimal prefix.
    hex: bool,
}
128
/// Description of a parsed float from the source.
#[allow(missing_docs)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum FloatKind {
    // The literal `inf`, optionally sign-prefixed.
    #[doc(hidden)]
    Inf { negative: bool },
    // The literal `nan`, optionally sign-prefixed.
    #[doc(hidden)]
    Nan { negative: bool },
    // `nan:0x...` with an explicit hexadecimal payload.
    #[doc(hidden)]
    NanVal {
        negative: bool,
        has_underscores: bool,
    },
    // An ordinary decimal or hexadecimal float literal.
    #[doc(hidden)]
    Normal { has_underscores: bool, hex: bool },
}
145
/// Classification of what `Lexer::parse_reserved` consumed, used to refine the
/// catch-all "reserved" grammar production into a more specific token kind.
enum ReservedKind {
    /// "..."
    String,
    /// anything that's just a sequence of `idchars!()`
    Idchars,
    /// $"..."
    IdString,
    /// @"..."
    AnnotationString,
    /// everything else (a conglomeration of strings, idchars, etc)
    Reserved,
}
158
/// Errors that can be generated while lexing.
///
/// All lexing errors have line/column/position information as well as a
/// `LexError` indicating what kind of error happened while lexing.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LexError {
    /// A dangling block comment was found with an unbalanced `(;` which was
    /// never terminated in the file.
    DanglingBlockComment,

    /// An unexpected character was encountered when generally parsing and
    /// looking for something else.
    Unexpected(char),

    /// An invalid `char` in a string literal was found.
    InvalidStringElement(char),

    /// An invalid string escape letter was found (the thing after the `\` in
    /// string literals)
    InvalidStringEscape(char),

    /// An invalid hexadecimal digit was found.
    InvalidHexDigit(char),

    /// An invalid base-10 digit was found.
    InvalidDigit(char),

    /// Parsing expected `wanted` but ended up finding `found` instead where the
    /// two characters aren't the same.
    Expected {
        /// The character that was expected to be found
        wanted: char,
        /// The character that was actually found
        found: char,
    },

    /// We needed to parse more but EOF (or end of the string) was encountered.
    UnexpectedEof,

    /// A number failed to parse because it was too big to fit within the target
    /// type.
    NumberTooBig,

    /// An invalid unicode value was found in a `\u{...}` escape in a string,
    /// only valid unicode scalars can be escaped that way.
    InvalidUnicodeValue(u32),

    /// A lone underscore was found when parsing a number, since underscores
    /// should always be preceded and succeeded with a digit of some form.
    LoneUnderscore,

    /// A "confusing" unicode character is present in a comment or a string
    /// literal, such as a character that changes the direction text is
    /// typically displayed in editors. This could cause the human-read
    /// version to behave differently than the compiler-visible version, so
    /// these are simply rejected for now.
    ConfusingUnicode(char),

    /// An invalid utf-8 sequence was found in a quoted identifier, such as
    /// `$"\ff"`.
    InvalidUtf8Id(Utf8Error),

    /// An empty identifier was found, or a lone `$`.
    EmptyId,

    /// An empty annotation was found, or a lone `@`.
    EmptyAnnotation,
}
228
/// A sign token for an integer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SignToken {
    /// Plus sign: `+`
    Plus,
    /// Minus sign: `-`
    Minus,
}
237
/// A fully parsed integer from a source string with a payload ready to parse
/// into an integral type.
#[derive(Debug, PartialEq)]
pub struct Integer<'a> {
    // The leading sign, if one was written in the source.
    sign: Option<SignToken>,
    // The digit payload of the literal. `Cow` presumably because `_`
    // separators, when present, require building an owned, stripped string —
    // confirm against the integer-parsing code.
    val: Cow<'a, str>,
    // Whether `val` is hexadecimal (literal had a `0x` prefix).
    hex: bool,
}
246
/// Possible parsed float values
#[derive(Debug, PartialEq, Eq)]
pub enum Float<'a> {
    /// A float `NaN` representation
    Nan {
        /// The specific bits to encode for this float, optionally
        val: Option<Cow<'a, str>>,
        /// Whether or not this is a negative `NaN`.
        negative: bool,
    },
    /// A float infinity representation
    Inf {
        #[allow(missing_docs)]
        negative: bool,
    },
    /// A parsed and separated floating point value
    Val {
        /// Whether or not the `integral` and `fractional` are specified in hex
        hex: bool,
        /// The float parts before the `.`
        integral: Cow<'a, str>,
        /// The float parts after the `.`
        fractional: Option<Cow<'a, str>>,
        /// The exponent to multiply this `integral.fractional` portion of the
        /// float by. If `hex` is true this is `2^exponent` and otherwise it's
        /// `10^exponent`
        exponent: Option<Cow<'a, str>>,
    },
}
276
// https://webassembly.github.io/spec/core/text/values.html#text-idchar
//
// Expands to a byte-pattern matching any single `idchar` — the characters
// allowed in keywords, identifiers, numbers, and reserved tokens. Written as
// a macro so it can be spliced directly into `match` arms over `u8`.
macro_rules! idchars {
    () => {
        b'0'..=b'9'
        | b'A'..=b'Z'
        | b'a'..=b'z'
        | b'!'
        | b'#'
        | b'$'
        | b'%'
        | b'&'
        | b'\''
        | b'*'
        | b'+'
        | b'-'
        | b'.'
        | b'/'
        | b':'
        | b'<'
        | b'='
        | b'>'
        | b'?'
        | b'@'
        | b'\\'
        | b'^'
        | b'_'
        | b'`'
        | b'|'
        | b'~'
    }
}
308
309impl<'a> Lexer<'a> {
310    /// Creates a new lexer which will lex the `input` source string.
311    pub fn new(input: &str) -> Lexer<'_> {
312        Lexer {
313            input,
314            allow_confusing_unicode: false,
315        }
316    }
317
    /// Returns the original source input that we're lexing.
    ///
    /// This is the exact string passed to [`Lexer::new`]; token offsets and
    /// lengths index into it.
    pub fn input(&self) -> &'a str {
        self.input
    }
322
323    /// Configures whether "confusing" unicode characters are allowed while
324    /// lexing.
325    ///
326    /// If allowed then no error will happen if these characters are found, but
327    /// otherwise if disallowed a lex error will be produced when these
328    /// characters are found. Confusing characters are denied by default.
329    ///
330    /// For now "confusing characters" are primarily related to the "trojan
331    /// source" problem where it refers to characters which cause humans to read
332    /// text differently than this lexer, such as characters that alter the
333    /// left-to-right display of the source code.
334    pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self {
335        self.allow_confusing_unicode = allow;
336        self
337    }
338
339    /// Lexes the next at the byte position `pos` in the input.
340    ///
341    /// Returns `Some` if a token is found or `None` if we're at EOF.
342    ///
343    /// The `pos` argument will be updated to point to the next token on a
344    /// successful parse.
345    ///
346    /// # Errors
347    ///
348    /// Returns an error if the input is malformed.
349    pub fn parse(&self, pos: &mut usize) -> Result<Option<Token>, Error> {
350        let offset = *pos;
351        Ok(match self.parse_kind(pos)? {
352            Some(kind) => Some(Token {
353                kind,
354                offset,
355                len: (*pos - offset).try_into().unwrap(),
356            }),
357            None => None,
358        })
359    }
360
    /// Lexes the single token starting at byte offset `*pos`, returning its
    /// kind and advancing `*pos` just past the consumed text.
    ///
    /// Returns `Ok(None)` when `*pos` is at the end of the input; the caller
    /// (`parse`) derives the token's length from how far `*pos` advanced.
    fn parse_kind(&self, pos: &mut usize) -> Result<Option<TokenKind>, Error> {
        let start = *pos;
        // This `match` generally parses the grammar specified at
        //
        // https://webassembly.github.io/spec/core/text/lexical.html#text-token
        let remaining = &self.input.as_bytes()[start..];
        let byte = match remaining.first() {
            Some(b) => b,
            None => return Ok(None),
        };

        match byte {
            // Open-parens check the next character to see if this is the start
            // of a block comment, otherwise it's just a bland left-paren
            // token.
            b'(' => match remaining.get(1) {
                Some(b';') => {
                    let mut level = 1;
                    // Note that we're doing a byte-level search here for the
                    // close-delimiter of `;)`. The actual source text is utf-8
                    // encode in `remaining` but due to how utf-8 works we
                    // can safely search for an ASCII byte since it'll never
                    // otherwise appear in the middle of a codepoint and if we
                    // find it then it's guaranteed to be the right byte.
                    //
                    // Mainly we're avoiding the overhead of decoding utf-8
                    // characters into a Rust `char` since it's otherwise
                    // unnecessary work.
                    let mut iter = remaining[2..].iter();
                    while let Some(ch) = iter.next() {
                        match ch {
                            // A nested `(;` bumps the nesting level.
                            b'(' => {
                                if let Some(b';') = iter.as_slice().first() {
                                    level += 1;
                                    iter.next();
                                }
                            }
                            // A `;)` pops one level; hitting level 0 closes
                            // the whole comment.
                            b';' => {
                                if let Some(b')') = iter.as_slice().first() {
                                    level -= 1;
                                    iter.next();
                                    if level == 0 {
                                        let len = remaining.len() - iter.as_slice().len();
                                        let comment = &self.input[start..][..len];
                                        *pos += len;
                                        self.check_confusing_comment(*pos, comment)?;
                                        return Ok(Some(TokenKind::BlockComment));
                                    }
                                }
                            }
                            _ => {}
                        }
                    }
                    // Input ended with `level` still positive.
                    Err(self.error(start, LexError::DanglingBlockComment))
                }
                _ => {
                    *pos += 1;

                    Ok(Some(TokenKind::LParen))
                }
            },

            b')' => {
                *pos += 1;
                Ok(Some(TokenKind::RParen))
            }

            // https://webassembly.github.io/spec/core/text/lexical.html#white-space
            b' ' | b'\n' | b'\r' | b'\t' => {
                self.skip_ws(pos);
                Ok(Some(TokenKind::Whitespace))
            }

            c @ (idchars!() | b'"') => {
                let (kind, src) = self.parse_reserved(pos)?;
                match kind {
                    // If the reserved token was simply a single string then
                    // that is converted to a standalone string token
                    ReservedKind::String => return Ok(Some(TokenKind::String)),

                    // If only idchars were consumed then this could be a
                    // specific kind of standalone token we're interested in.
                    ReservedKind::Idchars => {
                        // https://webassembly.github.io/spec/core/text/values.html#integers
                        if let Some(ret) = self.classify_number(src) {
                            return Ok(Some(ret));
                        // https://webassembly.github.io/spec/core/text/values.html#text-id
                        } else if *c == b'$' {
                            return Ok(Some(TokenKind::Id));
                        // part of the WebAssembly/annotations proposal
                        // (no online url yet)
                        } else if *c == b'@' {
                            return Ok(Some(TokenKind::Annotation));
                        // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword
                        } else if b'a' <= *c && *c <= b'z' {
                            return Ok(Some(TokenKind::Keyword));
                        }
                    }

                    ReservedKind::IdString => return Ok(Some(TokenKind::Id)),
                    ReservedKind::AnnotationString => return Ok(Some(TokenKind::Annotation)),

                    // ... otherwise this was a conglomeration of idchars,
                    // strings, or just idchars that don't match a prior rule,
                    // meaning this falls through to the fallback `Reserved`
                    // token.
                    ReservedKind::Reserved => {}
                }

                Ok(Some(TokenKind::Reserved))
            }

            // This could be a line comment, otherwise `;` is a reserved token.
            // The second byte is checked to see if it's a `;;` line comment
            //
            // Note that this character being considered as part of a
            // `reserved` token is part of the annotations proposal.
            b';' => match remaining.get(1) {
                Some(b';') => {
                    // A line comment runs up to (not including) the next `\n`
                    // or `\r`, or the end of input.
                    let remaining = &self.input[*pos..];
                    let byte_pos = memchr::memchr2(b'\n', b'\r', remaining.as_bytes())
                        .unwrap_or(remaining.len());
                    *pos += byte_pos;
                    let comment = &remaining[..byte_pos];
                    self.check_confusing_comment(*pos, comment)?;
                    Ok(Some(TokenKind::LineComment))
                }
                _ => {
                    *pos += 1;
                    Ok(Some(TokenKind::Reserved))
                }
            },

            // Other known reserved tokens other than `;`
            //
            // Note that these characters being considered as part of a
            // `reserved` token is part of the annotations proposal.
            b',' | b'[' | b']' | b'{' | b'}' => {
                *pos += 1;
                Ok(Some(TokenKind::Reserved))
            }

            _ => {
                // Anything else is a lex error; decode the full character so
                // the error can report it.
                let ch = self.input[start..].chars().next().unwrap();
                Err(self.error(*pos, LexError::Unexpected(ch)))
            }
        }
    }
509
510    fn skip_ws(&self, pos: &mut usize) {
511        // This table is a byte lookup table to determine whether a byte is a
512        // whitespace byte. There are only 4 whitespace bytes for the `*.wat`
513        // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes
514        // have a '1' in the table below.
515        //
516        // Due to how utf-8 works (our input is guaranteed to be utf-8) it is
517        // known that if these bytes are found they're guaranteed to be the
518        // whitespace byte, so they can be safely skipped and we don't have to
519        // do full utf-8 decoding. This means that the goal of this function is
520        // to find the first non-whitespace byte in `remaining`.
521        //
522        // For now this lookup table seems to be the fastest, but projects like
523        // https://github.com/lemire/despacer show other simd algorithms which
524        // can possibly accelerate this even more. Note that `*.wat` files often
525        // have a lot of whitespace so this function is typically quite hot when
526        // parsing inputs.
527        #[rustfmt::skip]
528        const WS: [u8; 256] = [
529            //                                   \t \n       \r
530            /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
531            /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
532            //        ' '
533            /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
534            /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
535            /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
536            /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
537            /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
538            /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
539            /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
540            /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
541            /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
542            /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
543            /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
544            /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
545            /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
546            /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
547        ];
548        let remaining = &self.input[*pos..];
549        let non_ws_pos = remaining
550            .as_bytes()
551            .iter()
552            .position(|b| WS[*b as usize] != 1)
553            .unwrap_or(remaining.len());
554        *pos += non_ws_pos;
555    }
556
    /// Splits off a "reserved" token which is then further processed later on
    /// to figure out which kind of token it is, depending on [`ReservedKind`].
    ///
    /// For more information on this method see the clarification at
    /// <https://github.com/WebAssembly/spec/pull/1499> but the general gist is
    /// that this is parsing the grammar:
    ///
    /// ```text
    /// reserved := (idchar | string)+
    /// ```
    ///
    /// which means that it is eating any number of adjacent string/idchar
    /// tokens (e.g. `a"b"c`) and returning the classification of what was
    /// eaten. The classification assists in determining what the actual token
    /// here eaten looks like.
    fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> {
        // Count each sub-production so the whole run can be classified below.
        let mut idchars = 0u32;
        let mut strings = 0u32;
        let start = *pos;
        while let Some(byte) = self.input.as_bytes().get(*pos) {
            match byte {
                // Normal `idchars` production which appends to the reserved
                // token that's being produced.
                idchars!() => {
                    idchars += 1;
                    *pos += 1;
                }

                // https://webassembly.github.io/spec/core/text/values.html#text-string
                b'"' => {
                    strings += 1;
                    *pos += 1;
                    let mut it = self.input[*pos..].chars();
                    let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode);
                    // `parse_str` advanced `it` past whatever it consumed;
                    // recompute `*pos` from what's left of the iterator.
                    *pos = self.input.len() - it.as_str().len();
                    match result {
                        Ok(_) => {}
                        Err(e) => {
                            let err_pos = match &e {
                                LexError::UnexpectedEof => self.input.len(),
                                // Point at the last character consumed rather
                                // than just past it.
                                _ => self.input[..*pos].char_indices().next_back().unwrap().0,
                            };
                            return Err(self.error(err_pos, e));
                        }
                    }
                }

                // Nothing else is considered part of a reserved token
                _ => break,
            }
        }
        let ret = &self.input[start..*pos];
        Ok(match (idchars, strings) {
            // The caller only invokes this after seeing an idchar or `"`.
            (0, 0) => unreachable!(),
            (0, 1) => (ReservedKind::String, ret),
            (_, 0) => (ReservedKind::Idchars, ret),
            // Pattern match `@"..."` and `$"..."` for string-based
            // identifiers and annotations.
            (1, 1) if ret.starts_with("$") => (ReservedKind::IdString, ret),
            (1, 1) if ret.starts_with("@") => (ReservedKind::AnnotationString, ret),
            _ => (ReservedKind::Reserved, ret),
        })
    }
620
    /// Attempts to classify the idchars sequence `src` as an integer or float
    /// literal, returning `None` when it doesn't match the numeric grammar
    /// (in which case the caller falls back to id/keyword/reserved).
    ///
    /// Only the *shape* of the literal is recorded here — sign, hex-ness,
    /// presence of `_` separators, and the special float forms — actual
    /// numeric conversion happens later.
    fn classify_number(&self, src: &str) -> Option<TokenKind> {
        // Split off an optional leading sign.
        let (sign, num) = if let Some(stripped) = src.strip_prefix('+') {
            (Some(SignToken::Plus), stripped)
        } else if let Some(stripped) = src.strip_prefix('-') {
            (Some(SignToken::Minus), stripped)
        } else {
            (None, src)
        };

        let negative = sign == Some(SignToken::Minus);

        // Handle `inf` and `nan` which are special numbers here
        if num == "inf" {
            return Some(TokenKind::Float(FloatKind::Inf { negative }));
        } else if num == "nan" {
            return Some(TokenKind::Float(FloatKind::Nan { negative }));
        } else if let Some(stripped) = num.strip_prefix("nan:0x") {
            // `nan:0x...` carries an explicit hex payload which must span the
            // entire rest of `src`.
            let mut it = stripped.as_bytes().iter();
            let has_underscores = skip_underscores(&mut it, |x| char::from(x).is_ascii_hexdigit())?;
            if it.next().is_some() {
                return None;
            }
            return Some(TokenKind::Float(FloatKind::NanVal {
                negative,
                has_underscores,
            }));
        }

        // Figure out if we're a hex number or not
        let test_valid: fn(u8) -> bool;
        let (mut it, hex) = if let Some(stripped) = num.strip_prefix("0x") {
            test_valid = |x: u8| char::from(x).is_ascii_hexdigit();
            (stripped.as_bytes().iter(), true)
        } else {
            test_valid = |x: u8| char::from(x).is_ascii_digit();
            (num.as_bytes().iter(), false)
        };

        // Evaluate the first part, moving out all underscores
        let mut has_underscores = skip_underscores(&mut it, test_valid)?;

        match it.clone().next() {
            // If we're followed by something this may be a float so keep going.
            Some(_) => {}

            // Otherwise this is a valid integer literal!
            None => {
                return Some(TokenKind::Integer(IntegerKind {
                    has_underscores,
                    sign,
                    hex,
                }))
            }
        }

        // A number can optionally be after the dot so only actually try to
        // parse one if it's there.
        if it.clone().next() == Some(&b'.') {
            it.next();
            match it.clone().next() {
                Some(c) if test_valid(*c) => {
                    if skip_underscores(&mut it, test_valid)? {
                        has_underscores = true;
                    }
                }
                Some(_) | None => {}
            }
        };

        // Figure out if there's an exponential part here to make a float, and
        // if so parse it but defer its actual calculation until later.
        match (hex, it.next()) {
            (true, Some(b'p')) | (true, Some(b'P')) | (false, Some(b'e')) | (false, Some(b'E')) => {
                // Optional exponent sign.
                match it.clone().next() {
                    Some(b'-') => {
                        it.next();
                    }
                    Some(b'+') => {
                        it.next();
                    }
                    _ => {}
                }
                // Exponent digits are always decimal, even for hex floats.
                if skip_underscores(&mut it, |x| char::from(x).is_ascii_digit())? {
                    has_underscores = true;
                }
            }
            (_, None) => {}
            _ => return None,
        }

        // We should have eaten everything by now, if not then this is surely
        // not a float or integer literal.
        if it.next().is_some() {
            return None;
        }

        return Some(TokenKind::Float(FloatKind::Normal {
            has_underscores,
            hex,
        }));

        // Consumes a run of digits (per `good`) with single `_` separators
        // allowed between digits, reporting whether any underscore was seen.
        //
        // Returns `None` when the run is empty, starts with an invalid digit,
        // or ends with a trailing underscore.
        fn skip_underscores<'a>(
            it: &mut slice::Iter<'_, u8>,
            good: fn(u8) -> bool,
        ) -> Option<bool> {
            let mut last_underscore = false;
            let mut has_underscores = false;
            let first = *it.next()?;
            if !good(first) {
                return None;
            }
            while let Some(c) = it.clone().next() {
                if *c == b'_' && !last_underscore {
                    has_underscores = true;
                    it.next();
                    last_underscore = true;
                    continue;
                }
                if !good(*c) {
                    break;
                }
                last_underscore = false;
                it.next();
            }
            if last_underscore {
                return None;
            }
            Some(has_underscores)
        }
    }
751
752    /// Verifies that `comment`, which is about to be returned, has a "confusing
753    /// unicode character" in it and should instead be transformed into an
754    /// error.
755    fn check_confusing_comment(&self, end: usize, comment: &str) -> Result<(), Error> {
756        if self.allow_confusing_unicode {
757            return Ok(());
758        }
759
760        // In an effort to avoid utf-8 decoding the entire `comment` the search
761        // here is a bit more optimized. This checks for the `0xe2` byte because
762        // in the utf-8 encoding that's the leading encoding byte for all
763        // "confusing characters". Each instance of 0xe2 is checked to see if it
764        // starts a confusing character, and if so that's returned.
765        //
766        // Also note that 0xe2 will never be found in the middle of a codepoint,
767        // it's always the start of a codepoint. This means that if our special
768        // characters show up they're guaranteed to start with 0xe2 bytes.
769        let bytes = comment.as_bytes();
770        for pos in memchr::Memchr::new(0xe2, bytes) {
771            if let Some(c) = comment[pos..].chars().next() {
772                if is_confusing_unicode(c) {
773                    // Note that `self.cur()` accounts for already having
774                    // parsed `comment`, so we move backwards to where
775                    // `comment` started and then add the index within
776                    // `comment`.
777                    let pos = end - comment.len() + pos;
778                    return Err(self.error(pos, LexError::ConfusingUnicode(c)));
779                }
780            }
781        }
782
783        Ok(())
784    }
785
    /// Parses the body of a WebAssembly string literal, assuming the opening
    /// `"` has already been consumed from `it`.
    ///
    /// Returns the decoded bytes of the string. If the literal contains no
    /// escape sequences the result borrows directly from the original input;
    /// otherwise an owned buffer is built up with all escapes resolved.
    ///
    /// # Errors
    ///
    /// Returns an error on EOF before the closing `"`, an invalid escape
    /// sequence, a disallowed control character, an out-of-range `\u{...}`
    /// value, or (unless `allow_confusing_unicode` is set) a likely-confusing
    /// "trojan source" style character.
    fn parse_str(
        it: &mut str::Chars<'a>,
        allow_confusing_unicode: bool,
    ) -> Result<Cow<'a, [u8]>, LexError> {
        // `Start` means no escape has been seen yet, so the result can still
        // borrow from the input. The first escape switches to `String` with
        // an owned copy of everything decoded so far.
        enum State {
            Start,
            String(Vec<u8>),
        }
        let orig = it.as_str();
        let mut state = State::Start;
        loop {
            match it.next().ok_or(LexError::UnexpectedEof)? {
                '"' => break,
                '\\' => {
                    match state {
                        State::String(_) => {}
                        State::Start => {
                            // Switch to owned mode: copy all bytes up to, but
                            // not including, this backslash.
                            let pos = orig.len() - it.as_str().len() - 1;
                            state = State::String(orig[..pos].as_bytes().to_vec());
                        }
                    }
                    let buf = match &mut state {
                        State::String(b) => b,
                        State::Start => unreachable!(),
                    };
                    match it.next().ok_or(LexError::UnexpectedEof)? {
                        '"' => buf.push(b'"'),
                        '\'' => buf.push(b'\''),
                        't' => buf.push(b'\t'),
                        'n' => buf.push(b'\n'),
                        'r' => buf.push(b'\r'),
                        '\\' => buf.push(b'\\'),
                        'u' => {
                            // `\u{...}` escape: a hex unicode scalar value,
                            // appended to the buffer as utf-8.
                            Lexer::must_eat_char(it, '{')?;
                            let n = Lexer::hexnum(it)?;
                            let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?;
                            buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                            Lexer::must_eat_char(it, '}')?;
                        }
                        c1 if c1.is_ascii_hexdigit() => {
                            // `\XY` escape: a raw byte from two hex digits,
                            // which need not form valid utf-8.
                            let c2 = Lexer::hexdigit(it)?;
                            buf.push(to_hex(c1) * 16 + c2);
                        }
                        c => return Err(LexError::InvalidStringEscape(c)),
                    }
                }
                // Control characters (and DEL) must be written with escapes
                // rather than appearing literally in the string.
                c if (c as u32) < 0x20 || c as u32 == 0x7f => {
                    return Err(LexError::InvalidStringElement(c))
                }
                c if !allow_confusing_unicode && is_confusing_unicode(c) => {
                    return Err(LexError::ConfusingUnicode(c))
                }
                c => match &mut state {
                    // Still borrowable; the character remains part of `orig`.
                    State::Start => {}
                    State::String(v) => {
                        v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                    }
                },
            }
        }
        match state {
            // No escapes seen: borrow everything up to the closing quote.
            State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()),
            State::String(s) => Ok(s.into()),
        }
    }
851
852    /// Parses an id-or-string-based name from `it`.
853    ///
854    /// Note that `it` should already have been lexed and this is just
855    /// extracting the value. If the token lexed was `@a` then this should point
856    /// to `a`.
857    ///
858    /// This will automatically detect quoted syntax such as `@"..."` and the
859    /// byte string will be parsed and validated as utf-8.
860    ///
861    /// # Errors
862    ///
863    /// Returns an error if a quoted byte string is found and contains invalid
864    /// utf-8.
865    fn parse_name(it: &mut str::Chars<'a>) -> Result<Cow<'a, str>, LexError> {
866        if it.clone().next() == Some('"') {
867            it.next();
868            match Lexer::parse_str(it, true)? {
869                Cow::Borrowed(bytes) => match std::str::from_utf8(bytes) {
870                    Ok(s) => Ok(Cow::Borrowed(s)),
871                    Err(e) => Err(LexError::InvalidUtf8Id(e)),
872                },
873                Cow::Owned(bytes) => match String::from_utf8(bytes) {
874                    Ok(s) => Ok(Cow::Owned(s)),
875                    Err(e) => Err(LexError::InvalidUtf8Id(e.utf8_error())),
876                },
877            }
878        } else {
879            Ok(Cow::Borrowed(it.as_str()))
880        }
881    }
882
883    fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> {
884        let n = Lexer::hexdigit(it)?;
885        let mut last_underscore = false;
886        let mut n = n as u32;
887        while let Some(c) = it.clone().next() {
888            if c == '_' {
889                it.next();
890                last_underscore = true;
891                continue;
892            }
893            if !c.is_ascii_hexdigit() {
894                break;
895            }
896            last_underscore = false;
897            it.next();
898            n = n
899                .checked_mul(16)
900                .and_then(|n| n.checked_add(to_hex(c) as u32))
901                .ok_or(LexError::NumberTooBig)?;
902        }
903        if last_underscore {
904            return Err(LexError::LoneUnderscore);
905        }
906        Ok(n)
907    }
908
909    /// Reads a hexidecimal digit from the input stream, returning where it's
910    /// defined and the hex value. Returns an error on EOF or an invalid hex
911    /// digit.
912    fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> {
913        let ch = Lexer::must_char(it)?;
914        if ch.is_ascii_hexdigit() {
915            Ok(to_hex(ch))
916        } else {
917            Err(LexError::InvalidHexDigit(ch))
918        }
919    }
920
921    /// Reads the next character from the input string and where it's located,
922    /// returning an error if the input stream is empty.
923    fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> {
924        it.next().ok_or(LexError::UnexpectedEof)
925    }
926
927    /// Expects that a specific character must be read next
928    fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> {
929        let found = Lexer::must_char(it)?;
930        if wanted == found {
931            Ok(())
932        } else {
933            Err(LexError::Expected { wanted, found })
934        }
935    }
936
    /// Creates an error at `pos` with the specified `kind`
    ///
    /// The lexer's full input is attached so the error can later be rendered
    /// with line/column context.
    fn error(&self, pos: usize, kind: LexError) -> Error {
        Error::lex(Span { offset: pos }, self.input, kind)
    }
941
    /// Returns an iterator over all tokens in the original source string
    /// starting at the `pos` specified.
    ///
    /// Each item is `Err` if the text at that position fails to lex.
    pub fn iter(&self, mut pos: usize) -> impl Iterator<Item = Result<Token, Error>> + '_ {
        // `parse` yields `Ok(None)` at end-of-input; `transpose` converts
        // that to `None`, terminating the iterator.
        std::iter::from_fn(move || self.parse(&mut pos).transpose())
    }
947
948    /// Returns whether an annotation is present at `pos`. If it is present then
949    /// `Ok(Some(token))` is returned corresponding to the token, otherwise
950    /// `Ok(None)` is returned. If the next token cannot be parsed then an error
951    /// is returned.
952    pub fn annotation(&self, mut pos: usize) -> Result<Option<Token>, Error> {
953        let bytes = self.input.as_bytes();
954        // Quickly reject anything that for sure isn't an annotation since this
955        // method is used every time an lparen is parsed.
956        if bytes.get(pos) != Some(&b'@') {
957            return Ok(None);
958        }
959        match self.parse(&mut pos)? {
960            Some(token) => match token.kind {
961                TokenKind::Annotation => Ok(Some(token)),
962                _ => Ok(None),
963            },
964            None => Ok(None),
965        }
966    }
967}
968
impl Token {
    /// Returns the original source text for this token.
    pub fn src<'a>(&self, s: &'a str) -> &'a str {
        // `len` is stored as a `u32` to keep `Token` small; convert it back
        // to `usize` for slicing.
        &s[self.offset..][..self.len.try_into().unwrap()]
    }

    /// Returns the identifier, without the leading `$` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Id`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
    /// which is invalid utf-8.
    pub fn id<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // Skip the `$` sigil; the lexer guarantees it's present for ids.
        let dollar = ch.next();
        debug_assert_eq!(dollar, Some('$'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyId));
        }
        Ok(id)
    }

    /// Returns the annotation, without the leading `@` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Annotation`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
    /// which is invalid utf-8.
    pub fn annotation<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // Skip the `@` sigil; the lexer guarantees it's present.
        let at = ch.next();
        debug_assert_eq!(at, Some('@'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyAnnotation));
        }
        Ok(id)
    }

    /// Returns the keyword this token represents.
    ///
    /// Should only be used with [`TokenKind::Keyword`].
    pub fn keyword<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the reserved string this token represents.
    ///
    /// Should only be used with [`TokenKind::Reserved`].
    pub fn reserved<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the parsed string that this token represents.
    ///
    /// This returns either a raw byte slice into the source if that's possible
    /// or an owned representation to handle escaped characters and such.
    ///
    /// Should only be used with [`TokenKind::String`].
    pub fn string<'a>(&self, s: &'a str) -> Cow<'a, [u8]> {
        let mut ch = self.src(s).chars();
        // Skip the opening quote; the token already lexed successfully so
        // re-parsing it here cannot fail.
        ch.next().unwrap();
        Lexer::parse_str(&mut ch, true).unwrap()
    }

    /// Returns the decomposed float token that this represents.
    ///
    /// This will slice up the float token into its component parts and return a
    /// description of the float token in the source.
    ///
    /// Should only be used with [`TokenKind::Float`].
    pub fn float<'a>(&self, s: &'a str, kind: FloatKind) -> Float<'a> {
        match kind {
            FloatKind::Inf { negative } => Float::Inf { negative },
            FloatKind::Nan { negative } => Float::Nan {
                val: None,
                negative,
            },
            FloatKind::NanVal {
                negative,
                has_underscores,
            } => {
                let src = self.src(s);
                // Drop a leading `+`/`-` sign if present; an unsigned token
                // starts directly with the `n` of `nan`.
                let src = if src.starts_with("n") { src } else { &src[1..] };
                let mut val = Cow::Borrowed(src.strip_prefix("nan:0x").unwrap());
                if has_underscores {
                    *val.to_mut() = val.replace("_", "");
                }
                Float::Nan {
                    val: Some(val),
                    negative,
                }
            }
            FloatKind::Normal {
                has_underscores,
                hex,
            } => {
                let src = self.src(s);
                // Split the token into integral/fractional/exponent parts
                // around the `.` and the exponent marker (`p`/`P` for hex
                // floats, `e`/`E` for decimal).
                let (integral, fractional, exponent) = match src.find('.') {
                    Some(i) => {
                        let integral = &src[..i];
                        let rest = &src[i + 1..];
                        let exponent = if hex {
                            rest.find('p').or_else(|| rest.find('P'))
                        } else {
                            rest.find('e').or_else(|| rest.find('E'))
                        };
                        match exponent {
                            Some(i) => (integral, Some(&rest[..i]), Some(&rest[i + 1..])),
                            None => (integral, Some(rest), None),
                        }
                    }
                    None => {
                        let exponent = if hex {
                            src.find('p').or_else(|| src.find('P'))
                        } else {
                            src.find('e').or_else(|| src.find('E'))
                        };
                        match exponent {
                            Some(i) => (&src[..i], None, Some(&src[i + 1..])),
                            None => (src, None, None),
                        }
                    }
                };
                // Normalize: drop leading `+` signs, and treat a trailing
                // `.` with no digits (e.g. `1.`) as no fractional part.
                let mut integral = Cow::Borrowed(integral.strip_prefix('+').unwrap_or(integral));
                let mut fractional = fractional.and_then(|s| {
                    if s.is_empty() {
                        None
                    } else {
                        Some(Cow::Borrowed(s))
                    }
                });
                let mut exponent =
                    exponent.map(|s| Cow::Borrowed(s.strip_prefix('+').unwrap_or(s)));
                if has_underscores {
                    *integral.to_mut() = integral.replace("_", "");
                    if let Some(fractional) = &mut fractional {
                        *fractional.to_mut() = fractional.replace("_", "");
                    }
                    if let Some(exponent) = &mut exponent {
                        *exponent.to_mut() = exponent.replace("_", "");
                    }
                }
                if hex {
                    // The `0x` prefix is not part of the digits themselves.
                    *integral.to_mut() = integral.replace("0x", "");
                }
                Float::Val {
                    hex,
                    integral,
                    fractional,
                    exponent,
                }
            }
        }
    }

    /// Returns the decomposed integer token that this represents.
    ///
    /// This will slice up the integer token into its component parts and
    /// return a description of the integer token in the source.
    ///
    /// Should only be used with [`TokenKind::Integer`].
    pub fn integer<'a>(&self, s: &'a str, kind: IntegerKind) -> Integer<'a> {
        let src = self.src(s);
        // A leading `+` is dropped, but a leading `-` is kept as part of the
        // value so it flows through to later parsing.
        let val = match kind.sign {
            Some(SignToken::Plus) => src.strip_prefix('+').unwrap(),
            Some(SignToken::Minus) => src,
            None => src,
        };
        let mut val = Cow::Borrowed(val);
        if kind.has_underscores {
            *val.to_mut() = val.replace("_", "");
        }
        if kind.hex {
            *val.to_mut() = val.replace("0x", "");
        }
        Integer {
            sign: kind.sign,
            hex: kind.hex,
            val,
        }
    }

    /// Creates a lexing error at this token's offset with the given `err`.
    fn error(&self, src: &str, err: LexError) -> Error {
        Error::lex(
            Span {
                offset: self.offset,
            },
            src,
            err,
        )
    }
}
1177
1178impl<'a> Integer<'a> {
1179    /// Returns the sign token for this integer.
1180    pub fn sign(&self) -> Option<SignToken> {
1181        self.sign
1182    }
1183
1184    /// Returns the value string that can be parsed for this integer, as well
1185    /// as the base that it should be parsed in
1186    pub fn val(&self) -> (&str, u32) {
1187        (&self.val, if self.hex { 16 } else { 10 })
1188    }
1189}
1190
/// Converts a single ASCII hex digit into its numeric value.
///
/// Callers are expected to have validated `c` with `is_ascii_hexdigit`
/// beforehand; any other character yields an unspecified value.
fn to_hex(c: char) -> u8 {
    if ('a'..='f').contains(&c) {
        c as u8 - b'a' + 10
    } else if ('A'..='F').contains(&c) {
        c as u8 - b'A' + 10
    } else {
        c as u8 - b'0'
    }
}
1198
1199impl fmt::Display for LexError {
1200    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1201        use LexError::*;
1202        match self {
1203            DanglingBlockComment => f.write_str("unterminated block comment")?,
1204            Unexpected(c) => write!(f, "unexpected character '{}'", escape_char(*c))?,
1205            InvalidStringElement(c) => {
1206                write!(f, "invalid character in string '{}'", escape_char(*c))?
1207            }
1208            InvalidStringEscape(c) => write!(f, "invalid string escape '{}'", escape_char(*c))?,
1209            InvalidHexDigit(c) => write!(f, "invalid hex digit '{}'", escape_char(*c))?,
1210            InvalidDigit(c) => write!(f, "invalid decimal digit '{}'", escape_char(*c))?,
1211            Expected { wanted, found } => write!(
1212                f,
1213                "expected '{}' but found '{}'",
1214                escape_char(*wanted),
1215                escape_char(*found)
1216            )?,
1217            UnexpectedEof => write!(f, "unexpected end-of-file")?,
1218            NumberTooBig => f.write_str("number is too big to parse")?,
1219            InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?,
1220            LoneUnderscore => write!(f, "bare underscore in numeric literal")?,
1221            ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}", c)?,
1222            InvalidUtf8Id(_) => write!(f, "malformed UTF-8 encoding of string-based id")?,
1223            EmptyId => write!(f, "empty identifier")?,
1224            EmptyAnnotation => write!(f, "empty annotation id")?,
1225        }
1226        Ok(())
1227    }
1228}
1229
/// Renders `c` for inclusion in an error message, escaping control
/// characters and non-ASCII codepoints.
fn escape_char(c: char) -> String {
    match c {
        '\t' => "\\t".to_string(),
        '\r' => "\\r".to_string(),
        '\n' => "\\n".to_string(),
        '\\' => "\\\\".to_string(),
        '\'' => "\\'".to_string(),
        '"' => "\"".to_string(),
        // Printable ASCII passes through unchanged.
        c if (' '..='~').contains(&c) => c.to_string(),
        // Everything else gets Rust's `\u{...}` escape syntax.
        c => c.escape_unicode().to_string(),
    }
}
1242
/// This is an attempt to protect against the "trojan source" [1] problem
/// where unicode characters can cause editors to render source code
/// differently for humans than the compiler itself sees.
///
/// To mitigate this issue, and because it's relatively rare in practice,
/// this simply rejects characters of that form.
///
/// [1]: https://www.trojansource.codes/
fn is_confusing_unicode(ch: char) -> bool {
    // Bidirectional-override and invisible formatting control characters.
    const CONFUSING: [char; 9] = [
        '\u{202a}',
        '\u{202b}',
        '\u{202d}',
        '\u{202e}',
        '\u{2066}',
        '\u{2067}',
        '\u{2068}',
        '\u{206c}',
        '\u{2069}',
    ];
    CONFUSING.contains(&ch)
}
1265
#[cfg(test)]
mod tests {
    use super::*;

    // Each `*_smoke` test lexes a single token and checks both its kind and
    // that the token's span covers exactly the expected source text.
    #[test]
    fn ws_smoke() {
        fn get_whitespace(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Whitespace => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace("  "), "  ");
        assert_eq!(get_whitespace("  \n "), "  \n ");
        assert_eq!(get_whitespace("  x"), "  ");
        assert_eq!(get_whitespace("  ;"), "  ");
    }

    #[test]
    fn line_comment_smoke() {
        fn get_line_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::LineComment => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        // Line comments stop before, and do not include, the line terminator.
        assert_eq!(get_line_comment(";;"), ";;");
        assert_eq!(get_line_comment(";; xyz"), ";; xyz");
        assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");
        assert_eq!(get_line_comment(";;\nabc"), ";;");
        assert_eq!(get_line_comment(";;   \nabc"), ";;   ");
        assert_eq!(get_line_comment(";;   \rabc"), ";;   ");
        assert_eq!(get_line_comment(";;   \r\nabc"), ";;   ");
    }

    #[test]
    fn block_comment_smoke() {
        fn get_block_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::BlockComment => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        // Block comments include their delimiters and may nest.
        assert_eq!(get_block_comment("(;;)"), "(;;)");
        assert_eq!(get_block_comment("(; ;)"), "(; ;)");
        assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");
    }

    // Helper: lexes the first token of `input`, panicking on lex errors or
    // empty input.
    fn get_token(input: &str) -> Token {
        Lexer::new(input)
            .parse(&mut 0)
            .expect("no first token")
            .expect("no token")
    }

    #[test]
    fn lparen() {
        assert_eq!(get_token("((").kind, TokenKind::LParen);
    }

    #[test]
    fn rparen() {
        assert_eq!(get_token(")(").kind, TokenKind::RParen);
    }

    #[test]
    fn strings() {
        fn get_string(input: &str) -> Vec<u8> {
            let token = get_token(input);
            match token.kind {
                TokenKind::String => token.string(input).to_vec(),
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(&*get_string("\"\""), b"");
        assert_eq!(&*get_string("\"a\""), b"a");
        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
        // Named escapes resolve to their byte values.
        assert_eq!(&*get_string("\"\\\"\""), b"\"");
        assert_eq!(&*get_string("\"\\'\""), b"'");
        assert_eq!(&*get_string("\"\\n\""), b"\n");
        assert_eq!(&*get_string("\"\\t\""), b"\t");
        assert_eq!(&*get_string("\"\\r\""), b"\r");
        assert_eq!(&*get_string("\"\\\\\""), b"\\");
        // Two-digit hex escapes produce raw bytes; `\u{...}` produces utf-8.
        assert_eq!(&*get_string("\"\\01\""), &[1]);
        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
        assert_eq!(
            &*get_string("\"\\u{0f3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );
        // Underscores are allowed between hex digits in `\u{...}`.
        assert_eq!(
            &*get_string("\"\\u{0_f_3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );

        // Every possible byte value is expressible via a `\XY` escape.
        for i in 0..=255i32 {
            let s = format!("\"\\{:02x}\"", i);
            assert_eq!(&*get_string(&s), &[i as u8]);
        }
    }

    #[test]
    fn id() {
        fn get_id(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Id => token.id(input).unwrap().to_string(),
            other => panic!("not id {:?}", other),
            }
        }
        assert_eq!(get_id("$x"), "x");
        assert_eq!(get_id("$xyz"), "xyz");
        assert_eq!(get_id("$x_z"), "x_z");
        assert_eq!(get_id("$0^"), "0^");
        assert_eq!(get_id("$0^;;"), "0^");
        assert_eq!(get_id("$0^ ;;"), "0^");
        // String-based identifiers (`$"..."`) are supported too.
        assert_eq!(get_id("$\"x\" ;;"), "x");
    }

    #[test]
    fn annotation() {
        fn get_annotation(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Annotation => token.annotation(input).unwrap().to_string(),
                other => panic!("not annotation {:?}", other),
            }
        }
        assert_eq!(get_annotation("@foo"), "foo");
        assert_eq!(get_annotation("@foo "), "foo");
        assert_eq!(get_annotation("@f "), "f");
        assert_eq!(get_annotation("@\"x\" "), "x");
        assert_eq!(get_annotation("@0 "), "0");
    }

    #[test]
    fn keyword() {
        fn get_keyword(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Keyword => token.keyword(input),
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(get_keyword("x"), "x");
        assert_eq!(get_keyword("xyz"), "xyz");
        assert_eq!(get_keyword("x_z"), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
    }

    #[test]
    fn reserved() {
        fn get_reserved(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Reserved => token.reserved(input),
                other => panic!("not reserved {:?}", other),
            }
        }
        assert_eq!(get_reserved("^_x "), "^_x");
    }

    #[test]
    fn integer() {
        fn get_integer(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Integer(i) => token.integer(input, i).val.to_string(),
                other => panic!("not integer {:?}", other),
            }
        }
        // `+` signs and `_` separators are stripped; `-` is preserved.
        assert_eq!(get_integer("1"), "1");
        assert_eq!(get_integer("0"), "0");
        assert_eq!(get_integer("-1"), "-1");
        assert_eq!(get_integer("+1"), "1");
        assert_eq!(get_integer("+1_000"), "1000");
        assert_eq!(get_integer("+1_0_0_0"), "1000");
        assert_eq!(get_integer("+0x10"), "10");
        assert_eq!(get_integer("-0x10"), "-10");
        assert_eq!(get_integer("0x10"), "10");
    }

    #[test]
    fn float() {
        fn get_float(input: &str) -> Float<'_> {
            let token = get_token(input);
            match token.kind {
                TokenKind::Float(f) => token.float(input, f),
                other => panic!("not float {:?}", other),
            }
        }
        assert_eq!(
            get_float("nan"),
            Float::Nan {
                val: None,
                negative: false
            },
        );
        assert_eq!(
            get_float("-nan"),
            Float::Nan {
                val: None,
                negative: true,
            },
        );
        assert_eq!(
            get_float("+nan"),
            Float::Nan {
                val: None,
                negative: false,
            },
        );
        assert_eq!(
            get_float("+nan:0x1"),
            Float::Nan {
                val: Some("1".into()),
                negative: false,
            },
        );
        assert_eq!(
            get_float("nan:0x7f_ffff"),
            Float::Nan {
                val: Some("7fffff".into()),
                negative: false,
            },
        );
        assert_eq!(get_float("inf"), Float::Inf { negative: false });
        assert_eq!(get_float("-inf"), Float::Inf { negative: true });
        assert_eq!(get_float("+inf"), Float::Inf { negative: false });

        assert_eq!(
            get_float("1.2"),
            Float::Val {
                integral: "1".into(),
                fractional: Some("2".into()),
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("1.2e3"),
            Float::Val {
                integral: "1".into(),
                fractional: Some("2".into()),
                exponent: Some("3".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("-1_2.1_1E+0_1"),
            Float::Val {
                integral: "-12".into(),
                fractional: Some("11".into()),
                exponent: Some("01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("+1_2.1_1E-0_1"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("11".into()),
                exponent: Some("-01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1_2.3_4p5_6"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("34".into()),
                exponent: Some("56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("+0x1_2.3_4P-5_6"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("34".into()),
                exponent: Some("-56".into()),
                hex: true,
            },
        );
        // A trailing `.` with no digits yields no fractional part.
        assert_eq!(
            get_float("1."),
            Float::Val {
                integral: "1".into(),
                fractional: None,
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1p-24"),
            Float::Val {
                integral: "1".into(),
                fractional: None,
                exponent: Some("-24".into()),
                hex: true,
            },
        );
    }
}