wain_syntax_text/
lexer.rs

1use crate::source::describe_position;
2use std::borrow::Cow;
3use std::char;
4use std::fmt;
5use std::iter;
6use std::ops;
7use std::str;
8
/// Reason why the lexer rejected its input.
#[derive(Clone)]
#[cfg_attr(test, derive(Debug))]
pub enum LexErrorKind<'source> {
    /// A `(;` block comment was opened but the input ended before its `;)`.
    UnterminatedBlockComment,
    /// A string literal (or an escape inside it) was still open when the input ended.
    UnterminatedString,
    /// A run of identifier characters that is neither a number, an identifier nor a keyword.
    /// Carries the offending slice of the source.
    ReservedName(&'source str),
    /// A character that cannot start any token.
    UnexpectedCharacter(char),
    /// An ASCII control character appeared unescaped inside a string literal.
    ControlCharInString,
    /// A backslash escape inside a string literal was malformed.
    InvalidStringFormat,
}
19
20// TODO: Support std::error::Error
21
22#[cfg_attr(test, derive(Debug))]
23#[derive(Clone)]
24pub struct LexError<'source> {
25    kind: LexErrorKind<'source>,
26    offset: usize,
27    source: &'source str,
28}
29
30impl<'s> LexError<'s> {
31    pub fn kind(&self) -> &LexErrorKind<'s> {
32        &self.kind
33    }
34
35    pub fn offset(&self) -> usize {
36        self.offset
37    }
38
39    pub fn source(&self) -> &'s str {
40        self.source
41    }
42}
43
44impl<'s> fmt::Display for LexError<'s> {
45    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
46        use LexErrorKind::*;
47        match &self.kind {
48            UnterminatedBlockComment => write!(f, "block comment is not terminated")?,
49            UnterminatedString => write!(f, "string literal is not terminated",)?,
50            ReservedName(name) => write!(f, "name '{}' is unavailable since it's reserved name", name)?,
51            UnexpectedCharacter(c) => write!(f, "unexpected character '{}'", c)?,
52            ControlCharInString => write!(f, "control char in string")?,
53            InvalidStringFormat => write!(
54                f,
55                r#"escape must be one of \t, \n, \r, \", \', \\, \u{{hexnum}}, \MN where M and N are hex number"#
56            )?,
57        }
58        describe_position(f, self.source, self.offset)
59    }
60}
61
62type Result<'s, T> = ::std::result::Result<T, Box<LexError<'s>>>;
63
/// Sign of a number literal or of a float exponent.
#[derive(Clone, Copy, PartialEq)]
#[cfg_attr(test, derive(Debug))]
pub enum Sign {
    Plus,
    Minus,
}

impl Sign {
    /// Applies the sign to `n`: returns `n` as is for `Plus` and `-n` for `Minus`.
    pub fn apply<N: ops::Neg<Output = N>>(self, n: N) -> N::Output {
        if self == Sign::Minus {
            -n
        } else {
            n
        }
    }
}

impl fmt::Display for Sign {
    /// Writes `+` or `-`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let symbol = match *self {
            Sign::Plus => "+",
            Sign::Minus => "-",
        };
        f.write_str(symbol)
    }
}
88
/// Base in which a number literal was written.
#[derive(PartialEq, Clone, Copy)]
#[cfg_attr(test, derive(Debug))]
pub enum NumBase {
    Hex,
    Dec,
}

impl NumBase {
    /// Prefix that introduces a literal of this base: `"0x"` for hex, nothing for decimal.
    pub fn prefix(self) -> &'static str {
        if self == NumBase::Hex {
            "0x"
        } else {
            ""
        }
    }

    /// Radix of this base: 16 for hex, 10 for decimal.
    pub fn radix(self) -> u32 {
        if self == NumBase::Hex {
            16
        } else {
            10
        }
    }
}
111
112// https://webassembly.github.io/spec/core/text/values.html#floating-point
113#[cfg_attr(test, derive(Debug, PartialEq))]
114#[derive(Clone)]
115pub enum Float<'source> {
116    Nan(Option<&'source str>),
117    Inf,
118    Val {
119        base: NumBase,
120        frac: &'source str,
121        exp: Option<(Sign, &'source str)>,
122    },
123}
124
125// https://webassembly.github.io/spec/core/text/lexical.html#tokens
126#[cfg_attr(test, derive(Debug, PartialEq))]
127#[derive(Clone)]
128pub enum Token<'source> {
129    LParen,
130    RParen,
131    Keyword(&'source str), // Too many keywords so it'source not pragmatic to define `Keyword` enum in terms of maintenance
132    Int(Sign, NumBase, &'source str),
133    Float(Sign, Float<'source>),
134    String(Cow<'source, [u8]>, &'source str),
135    Ident(&'source str),
136}
137
138impl<'s> fmt::Display for Token<'s> {
139    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
140        match self {
141            Token::LParen => f.write_str("paren '('"),
142            Token::RParen => f.write_str("paren ')'"),
143            Token::Keyword(kw) => write!(f, "keyword '{}'", kw),
144            Token::Int(sign, base, s) => write!(f, "integer '{}{}{}'", sign, base.prefix(), s),
145            Token::Float(sign, Float::Nan(Some(payload))) => {
146                write!(f, "float number '{}nan:0x{}'", sign, payload)
147            }
148            Token::Float(sign, Float::Nan(None)) => write!(f, "float number '{}nan'", sign),
149            Token::Float(sign, Float::Inf) => write!(f, "float number '{}inf'", sign),
150            Token::Float(
151                sign,
152                Float::Val {
153                    base,
154                    frac,
155                    exp: Some((exp_sign, exp)),
156                },
157            ) => {
158                let exp_leader = if *base == NumBase::Hex { 'P' } else { 'E' };
159                write!(
160                    f,
161                    "float number '{sign}{prefix}{frac}{exp_leader}{exp_sign}{exp}",
162                    sign = sign,
163                    prefix = base.prefix(),
164                    frac = frac,
165                    exp_leader = exp_leader,
166                    exp_sign = exp_sign,
167                    exp = exp
168                )
169            }
170            Token::Float(sign, Float::Val { base, frac, exp: None }) => {
171                write!(f, "float number '{}{}{}", sign, base.prefix(), frac,)
172            }
173            Token::String(_, s) => write!(f, "string literal {}", s),
174            Token::Ident(ident) => write!(f, "identifier '{}'", ident),
175        }
176    }
177}
178
179type Lexed<'s> = Option<(Token<'s>, usize)>;
180type LexResult<'s> = Result<'s, Lexed<'s>>;
181
/// Lexer which walks over the source text one character at a time and produces tokens.
#[derive(Clone)]
pub struct Lexer<'source> {
    // Whole text being lexed. Tokens and errors borrow slices of it.
    source: &'source str,
    // Characters of `source` with their byte offsets. One character of lookahead: LL(1)
    chars: iter::Peekable<str::CharIndices<'source>>,
}
187
188impl<'s> Lexer<'s> {
189    pub fn new(source: &str) -> Lexer<'_> {
190        Lexer {
191            source,
192            chars: source.char_indices().peekable(),
193        }
194    }
195
196    pub fn source(&self) -> &'s str {
197        self.source
198    }
199
200    pub fn lex(&mut self) -> LexResult<'s> {
201        while self.eat_whitespace()? {}
202
203        // https://webassembly.github.io/spec/core/text/lexical.html#tokens
204        if let Some(lexed) = self.lex_paren() {
205            return Ok(Some(lexed));
206        }
207        if let Some(lexed) = self.lex_string()? {
208            return Ok(Some(lexed));
209        }
210        // id, keyword, reserved, number
211        if let Some(lexed) = self.lex_idchars()? {
212            return Ok(Some(lexed));
213        }
214
215        if let Some(peeked) = self.chars.peek() {
216            let (offset, c) = *peeked; // Borrow checker complains about *c and *offset in below statement
217            self.fail(LexErrorKind::UnexpectedCharacter(c), offset)
218        } else {
219            Ok(None)
220        }
221    }
222
223    fn lex_paren(&mut self) -> Lexed<'s> {
224        if let Some(offset) = self.eat_char('(') {
225            Some((Token::LParen, offset))
226        } else {
227            self.eat_char(')').map(|offset| (Token::RParen, offset))
228        }
229    }
230
231    fn lex_string(&mut self) -> LexResult<'s> {
232        // https://webassembly.github.io/spec/core/text/values.html#strings
233        let start = match self.eat_char('"') {
234            Some(offset) => offset,
235            None => return Ok(None),
236        };
237
238        let mut buf = vec![];
239        while let Some((i, c)) = self.chars.next() {
240            match c {
241                '"' => {
242                    let content = if buf.is_empty() {
243                        // When no escape is included in string literal, keep slice without copy
244                        // omitting the first and last double quotes
245                        Cow::Borrowed(self.source[start + 1..i].as_bytes())
246                    } else {
247                        Cow::Owned(buf)
248                    };
249                    let token = Token::String(content, &self.source[start..i + 1]);
250                    return Ok(Some((token, start)));
251                }
252                '\\' => {
253                    if buf.is_empty() {
254                        // Encounter the first escaped character. It means the source of literal is
255                        // different from its content. Need to allocate another buffer to keep its
256                        // content. Note that `+ 1` omits the first '"'
257                        buf.extend_from_slice(self.source[start + 1..i].as_bytes());
258                    }
259
260                    match self.chars.next() {
261                        Some((_, 't')) => buf.push(b'\t'),
262                        Some((_, 'n')) => buf.push(b'\n'),
263                        Some((_, 'r')) => buf.push(b'\r'),
264                        Some((_, '"')) => buf.push(b'"'),
265                        Some((_, '\'')) => buf.push(b'\''),
266                        Some((_, '\\')) => buf.push(b'\\'),
267                        Some((_, 'u')) => {
268                            match self.chars.next() {
269                                Some((i, '{')) => {
270                                    let brace_start = i + 1; // next to '{'
271                                    let uend = loop {
272                                        match self.chars.next() {
273                                            Some((i, '}')) => break i,
274                                            Some(_) => continue,
275                                            None => return self.fail(LexErrorKind::UnterminatedString, start),
276                                        }
277                                    };
278                                    if let Some(c) = u32::from_str_radix(&self.source[brace_start..uend], 16)
279                                        .ok()
280                                        .and_then(char::from_u32)
281                                    {
282                                        let mut b = [0; 4];
283                                        buf.extend_from_slice(c.encode_utf8(&mut b).as_bytes());
284                                    } else {
285                                        return self.fail(LexErrorKind::InvalidStringFormat, start);
286                                    }
287                                }
288                                Some(_) => return self.fail(LexErrorKind::InvalidStringFormat, start),
289                                None => return self.fail(LexErrorKind::UnterminatedString, start),
290                            }
291                        }
292                        Some((_, c)) => {
293                            let hi = c.to_digit(16);
294                            let lo = self.chars.next().and_then(|(_, c)| c.to_digit(16));
295                            match (hi, lo) {
296                                (Some(hi), Some(lo)) => buf.push((hi * 16 + lo) as u8),
297                                _ => return self.fail(LexErrorKind::InvalidStringFormat, start),
298                            }
299                        }
300                        None => return self.fail(LexErrorKind::UnterminatedString, start),
301                    }
302                }
303                _ if c.is_ascii_control() => return self.fail(LexErrorKind::ControlCharInString, start),
304                _ if !buf.is_empty() => {
305                    let mut b = [0; 4];
306                    buf.extend_from_slice(c.encode_utf8(&mut b).as_bytes());
307                }
308                _ => { /* Have not seen any escape chars yet */ }
309            }
310        }
311
312        self.fail(LexErrorKind::UnterminatedString, start)
313    }
314
315    fn lex_idchars(&mut self) -> LexResult<'s> {
316        fn is_idchar(c: char) -> bool {
317            // https://webassembly.github.io/spec/core/text/values.html#text-idchar
318            matches!(c,
319                '0'..='9'
320                | 'a'..='z'
321                | 'A'..='Z'
322                | '!'
323                | '#'
324                | '$'
325                | '%'
326                | '&'
327                | '\''
328                | '*'
329                | '+'
330                | '-'
331                | '.'
332                | '/'
333                | ':'
334                | '<'
335                | '='
336                | '>'
337                | '?'
338                | '@'
339                | '\\'
340                | '^'
341                | '_'
342                | '`'
343                | '|'
344                | '~'
345            )
346        }
347
348        let start = self.offset();
349        let end = loop {
350            match self.chars.peek() {
351                Some((_, c)) if is_idchar(*c) => {
352                    self.chars.next();
353                    continue;
354                }
355                Some((offset, _)) => break *offset,
356                None => break self.source.len(),
357            }
358        };
359
360        if start == end {
361            return Ok(None);
362        }
363
364        // Note: Number must be lexed before keyword for 'inf' and 'nan'
365        let idchars = &self.source[start..end];
366        if let Some(lexed) = Self::lex_number_from_idchars(idchars, start) {
367            return Ok(Some(lexed));
368        }
369        if let Some(lexed) = Self::lex_ident_or_keyword_from_idchars(idchars, start) {
370            return Ok(Some(lexed));
371        }
372
373        // https://webassembly.github.io/spec/core/text/lexical.html#text-reserved
374        self.fail(LexErrorKind::ReservedName(idchars), start)
375    }
376
377    fn is_num<F: Fn(&char) -> bool>(s: &str, pred: F) -> bool {
378        if s.is_empty() {
379            return false;
380        }
381        let mut prev_underscore = true; // true because number cannot start with '_'
382        for c in s.chars() {
383            match c {
384                '_' if prev_underscore => return false,
385                '_' => prev_underscore = true,
386                _ if pred(&c) => prev_underscore = false,
387                _ => return false,
388            }
389        }
390        !prev_underscore
391    }
392
393    fn lex_unsigned_number(idchars: &'s str, sign: Sign, base: NumBase) -> Option<Token<'s>> {
394        // https://webassembly.github.io/spec/core/text/values.html#integers
395        // https://webassembly.github.io/spec/core/text/values.html#floating-point
396
397        fn is_hex_exp(c: char) -> bool {
398            c == 'p' || c == 'P'
399        }
400        fn is_dec_exp(c: char) -> bool {
401            c == 'e' || c == 'E'
402        }
403
404        #[allow(clippy::type_complexity)]
405        let (is_digit, is_exp): (fn(&char) -> bool, fn(char) -> bool) = match base {
406            NumBase::Hex => (char::is_ascii_hexdigit, is_hex_exp),
407            NumBase::Dec => (char::is_ascii_digit, is_dec_exp),
408        };
409        let mut chars = idchars.char_indices();
410        if chars.next().map(|(_, c)| !is_digit(&c)).unwrap_or(true) {
411            return None;
412        }
413
414        let mut exp_start = false;
415        let mut saw_dot = false;
416        {
417            #[derive(PartialEq, Eq)]
418            enum PrevChar {
419                Dot,
420                Underscore,
421                Digit,
422            }
423
424            let mut prev_char = PrevChar::Digit;
425            for (_, c) in &mut chars {
426                prev_char = match c {
427                    '.' if saw_dot || prev_char != PrevChar::Digit => return None,
428                    '.' => {
429                        saw_dot = true;
430                        PrevChar::Dot
431                    }
432                    '_' if prev_char != PrevChar::Digit => return None,
433                    '_' => PrevChar::Underscore,
434                    c if is_exp(c) => {
435                        exp_start = true;
436                        break;
437                    }
438                    c if is_digit(&c) => PrevChar::Digit,
439                    _ => return None,
440                };
441            }
442
443            // Number cannot end with '_'
444            if prev_char == PrevChar::Underscore {
445                return None;
446            }
447        }
448
449        match chars.next() {
450            Some((i, c)) if exp_start => {
451                let (exp_sign, start) = match c {
452                    '+' => (Sign::Plus, i + 1),
453                    '-' => (Sign::Minus, i + 1),
454                    _ => (Sign::Plus, i),
455                };
456                let frac = &idchars[..i - 1]; // - 1 for 'e', 'E', 'p', 'P'
457                let exp = &idchars[start..];
458                if Self::is_num(exp, char::is_ascii_digit) {
459                    let float = Float::Val {
460                        base,
461                        frac,
462                        exp: Some((exp_sign, exp)),
463                    };
464                    Some(Token::Float(sign, float))
465                } else {
466                    None
467                }
468            }
469            Some(_) => unreachable!(),
470            None if exp_start => None, // e.g. '123e', '0x1fp'
471            None if saw_dot => Some(Token::Float(
472                sign,
473                Float::Val {
474                    base,
475                    frac: idchars,
476                    exp: None,
477                },
478            )),
479            None => Some(Token::Int(sign, base, idchars)),
480        }
481    }
482
483    fn lex_number_from_idchars(idchars: &'s str, start: usize) -> Lexed<'s> {
484        let (sign, idchars) = match idchars.chars().next() {
485            Some('+') => (Sign::Plus, &idchars[1..]),
486            Some('-') => (Sign::Minus, &idchars[1..]),
487            _ => (Sign::Plus, idchars),
488        };
489
490        // https://webassembly.github.io/spec/core/text/values.html#text-float
491        let token = match idchars {
492            "inf" => Some(Token::Float(sign, Float::Inf)),
493            "nan" => Some(Token::Float(sign, Float::Nan(None))),
494            idchars if idchars.starts_with("nan:0x") => {
495                let payload = &idchars[6..];
496                if Self::is_num(payload, char::is_ascii_hexdigit) {
497                    Some(Token::Float(sign, Float::Nan(Some(payload))))
498                } else {
499                    None
500                }
501            }
502            idchars if idchars.starts_with("0x") => Self::lex_unsigned_number(&idchars[2..], sign, NumBase::Hex),
503            idchars => Self::lex_unsigned_number(idchars, sign, NumBase::Dec),
504        };
505        token.map(|t| (t, start))
506    }
507
508    fn lex_ident_or_keyword_from_idchars(idchars: &'s str, start: usize) -> Lexed<'s> {
509        // https://webassembly.github.io/spec/core/text/lexical.html#tokens
510        match idchars.chars().next() {
511            Some('$') if idchars.len() > 1 => Some((Token::Ident(idchars), start)), // https://webassembly.github.io/spec/core/text/values.html#text-id
512            Some('a'..='z') => Some((Token::Keyword(idchars), start)), // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword
513            _ => None,
514        }
515    }
516
517    fn eat_whitespace(&mut self) -> Result<'s, bool> {
518        // https://webassembly.github.io/spec/core/text/lexical.html#white-space
519        fn is_ws_char(c: char) -> bool {
520            matches!(c, ' ' | '\t' | '\n' | '\r')
521        }
522        Ok(self.eat_char_by(is_ws_char) || self.eat_line_comment() || self.eat_block_comment()?)
523    }
524
525    fn eat_line_comment(&mut self) -> bool {
526        // linecomment https://webassembly.github.io/spec/core/text/lexical.html#comments
527        if self.eat_str(";;").is_none() {
528            return false;
529        }
530
531        for (_, c) in &mut self.chars {
532            if c == '\n' {
533                break;
534            }
535        }
536
537        true
538    }
539
540    fn eat_block_comment(&mut self) -> Result<'s, bool> {
541        // blockcomment https://webassembly.github.io/spec/core/text/lexical.html#comments
542        let start = if let Some(offset) = self.eat_str("(;") {
543            offset
544        } else {
545            return Ok(false);
546        };
547
548        // blockchar
549        loop {
550            if self.eat_block_comment()? {
551                continue;
552            }
553            if self.eat_str(";)").is_some() {
554                return Ok(true);
555            }
556            if self.chars.next().is_none() {
557                return self.fail(LexErrorKind::UnterminatedBlockComment, start);
558            }
559        }
560    }
561
562    fn eat_char(&mut self, want: char) -> Option<usize> {
563        match self.chars.peek() {
564            Some((offset, c)) if *c == want => {
565                let offset = *offset;
566                self.chars.next();
567                Some(offset)
568            }
569            _ => None,
570        }
571    }
572
573    fn eat_char_by<F: Fn(char) -> bool>(&mut self, pred: F) -> bool {
574        match self.chars.peek() {
575            Some((_, c)) if pred(*c) => {
576                self.chars.next();
577                true
578            }
579            _ => false,
580        }
581    }
582
583    fn eat_str(&mut self, s: &str) -> Option<usize> {
584        assert!(!s.is_empty());
585        let offset = self.offset();
586        if self.source[offset..].starts_with(s) {
587            self.chars.nth(s.len() - 1);
588            Some(offset)
589        } else {
590            None
591        }
592    }
593
594    fn offset(&mut self) -> usize {
595        match self.chars.peek() {
596            Some((offset, _)) => *offset,
597            None => self.source.len(),
598        }
599    }
600
601    fn fail<T>(&self, kind: LexErrorKind<'s>, offset: usize) -> Result<'s, T> {
602        Err(Box::new(LexError {
603            kind,
604            offset,
605            source: self.source,
606        }))
607    }
608}
609
610impl<'s> Iterator for Lexer<'s> {
611    type Item = Result<'s, (Token<'s>, usize)>;
612
613    fn next(&mut self) -> Option<Self::Item> {
614        self.lex().transpose()
615    }
616}
617
618#[cfg(test)]
619mod tests {
620    use super::*;
621
622    fn lex_all(s: &str) -> Result<'_, Vec<(Token<'_>, usize)>> {
623        Lexer::new(s).collect()
624    }
625
626    macro_rules! assert_lex_one {
627        ($input:expr, $token:pat) => {
628            let tokens = lex_all($input).unwrap();
629            assert_eq!(tokens.len(), 1);
630            match &tokens[0].0 {
631                $token => {}
632                e => panic!(
633                    "assertion failed: {:?} did not match to token {}",
634                    e,
635                    stringify!($token)
636                ),
637            }
638        };
639    }
640
641    macro_rules! assert_lex_error {
642        ($input:expr, $errkind:pat) => {
643            match lex_all($input).unwrap_err().kind() {
644                $errkind => {}
645                e => panic!(
646                    "assertion failed: {:?} did not match to error kind {}",
647                    e,
648                    stringify!($token)
649                ),
650            }
651        };
652    }
653
654    #[test]
655    fn spaces() {
656        assert!(lex_all("").unwrap().is_empty());
657        assert!(lex_all(" ").unwrap().is_empty());
658        assert!(lex_all("\t").unwrap().is_empty());
659        assert!(lex_all("\n").unwrap().is_empty());
660        assert!(lex_all("\r").unwrap().is_empty());
661        assert!(lex_all(" \t\r\n   \t\n\n\n\n ").unwrap().is_empty());
662    }
663
664    #[test]
665    fn comments() {
666        assert!(lex_all(";;").unwrap().is_empty());
667        assert!(lex_all(";;foo").unwrap().is_empty());
668        assert!(lex_all(";;foo\n;;bar\n  ;; piyo").unwrap().is_empty());
669        assert!(lex_all("(;;)").unwrap().is_empty());
670        assert!(lex_all("(; hi! ;)").unwrap().is_empty());
671        assert!(lex_all("(; hi!\n  how are you?\n  bye!\n ;)").unwrap().is_empty());
672        assert!(lex_all("(;(;;);)").unwrap().is_empty());
673        assert!(lex_all("(;\nhi!\n (;how are you?\n;) bye!\n;)").unwrap().is_empty());
674        // Errors
675        assert_lex_error!("(;", LexErrorKind::UnterminatedBlockComment);
676        assert_lex_error!("(; hi! ", LexErrorKind::UnterminatedBlockComment);
677        assert_lex_error!("(;(;;)", LexErrorKind::UnterminatedBlockComment);
678    }
679
680    #[test]
681    fn parens() {
682        assert_lex_one!("(", Token::LParen);
683        assert_lex_one!(")", Token::RParen);
684    }
685
686    #[test]
687    fn strings() {
688        macro_rules! assert_lex_string {
689            ($input:expr, $bytes:expr) => {
690                let tokens = lex_all($input).unwrap();
691                assert_eq!(tokens.len(), 1);
692                match &tokens[0].0 {
693                    Token::String(v, src) if *v == $bytes.to_vec() && *src == $input => {}
694                    e => panic!(
695                        "assertion failed: {:?} did not match to token {}",
696                        e,
697                        stringify!(Token::String($bytes, $input))
698                    ),
699                }
700            };
701        }
702
703        assert_lex_string!(r#""""#, b"");
704        assert_lex_string!(r#""hello""#, b"hello");
705        let mut v = "\t\n\r\"\'\\\u{1234}\x00".as_bytes().to_vec();
706        v.push(b'\xa9');
707        assert_lex_string!(r#""\t\n\r\"\'\\\u{1234}\00\a9""#, v);
708        assert_lex_string!(r#""あいうえお""#, "あいうえお".as_bytes());
709        assert_lex_error!(r#"""#, LexErrorKind::UnterminatedString);
710        assert_lex_error!(r#""foo\""#, LexErrorKind::UnterminatedString);
711        assert_lex_error!(r#""\u{41""#, LexErrorKind::UnterminatedString);
712        assert_lex_error!(r#""\u"#, LexErrorKind::UnterminatedString);
713        assert_lex_error!(r#""\u{""#, LexErrorKind::UnterminatedString);
714
715        assert_lex_error!(r#""\x""#, LexErrorKind::InvalidStringFormat);
716        assert_lex_error!(r#""\0""#, LexErrorKind::InvalidStringFormat);
717        assert_lex_error!(r#""\0x""#, LexErrorKind::InvalidStringFormat);
718        assert_lex_error!(r#""\u""#, LexErrorKind::InvalidStringFormat);
719        assert_lex_error!(r#""\u{}""#, LexErrorKind::InvalidStringFormat);
720        assert_lex_error!(r#""\u{hello!}""#, LexErrorKind::InvalidStringFormat);
721        assert_lex_error!(r#""\u{d800}""#, LexErrorKind::InvalidStringFormat);
722        assert_lex_error!(r#""\u{dfff}""#, LexErrorKind::InvalidStringFormat);
723        assert_lex_error!(r#""\u{110000}""#, LexErrorKind::InvalidStringFormat);
724
725        assert_lex_error!("\"\x00\"", LexErrorKind::ControlCharInString);
726        assert_lex_error!("\"\x1f\"", LexErrorKind::ControlCharInString);
727        assert_lex_error!("\"\x7f\"", LexErrorKind::ControlCharInString);
728    }
729
730    #[test]
731    fn idents() {
732        assert_lex_one!("$x", Token::Ident("$x"));
733        assert_lex_one!("$foo0123FOO", Token::Ident("$foo0123FOO"));
734        assert_lex_one!(
735            "$0aB!#$%&'*+-./:<=>?@\\^_`|~",
736            Token::Ident("$0aB!#$%&'*+-./:<=>?@\\^_`|~")
737        );
738    }
739
740    #[test]
741    fn keywords() {
742        assert_lex_one!("module", Token::Keyword("module"));
743        assert_lex_one!("i32.const", Token::Keyword("i32.const"));
744        assert_lex_one!("nan:0x_1", Token::Keyword("nan:0x_1"));
745        assert_lex_one!("nan:0x1_", Token::Keyword("nan:0x1_"));
746        assert_lex_one!("nan:0x1__2", Token::Keyword("nan:0x1__2"));
747    }
748
749    #[test]
750    fn reserved() {
751        assert_lex_error!("0$foo", LexErrorKind::ReservedName("0$foo"));
752        assert_lex_error!("$", LexErrorKind::ReservedName("$"));
753        assert_lex_error!("$ ;;", LexErrorKind::ReservedName("$"));
754        assert_lex_error!("123p3", LexErrorKind::ReservedName("123p3"));
755        assert_lex_error!("0x123p1f", LexErrorKind::ReservedName("0x123p1f"));
756        assert_lex_error!("123e", LexErrorKind::ReservedName("123e"));
757        assert_lex_error!("123e+", LexErrorKind::ReservedName("123e+"));
758        assert_lex_error!("0x", LexErrorKind::ReservedName("0x"));
759        assert_lex_error!("1_", LexErrorKind::ReservedName("1_"));
760        assert_lex_error!("1__2", LexErrorKind::ReservedName("1__2"));
761        assert_lex_error!("1.2_", LexErrorKind::ReservedName("1.2_"));
762        assert_lex_error!("1._2", LexErrorKind::ReservedName("1._2"));
763        assert_lex_error!("1.2__3", LexErrorKind::ReservedName("1.2__3"));
764        assert_lex_error!("1.E2_", LexErrorKind::ReservedName("1.E2_"));
765        assert_lex_error!("1.E_2", LexErrorKind::ReservedName("1.E_2"));
766        assert_lex_error!("1.E2__3", LexErrorKind::ReservedName("1.E2__3"));
767    }
768
769    #[test]
770    fn integers() {
771        assert_lex_one!("1", Token::Int(Sign::Plus, NumBase::Dec, "1"));
772        assert_lex_one!("123", Token::Int(Sign::Plus, NumBase::Dec, "123"));
773        assert_lex_one!("1_2_3", Token::Int(Sign::Plus, NumBase::Dec, "1_2_3"));
774        assert_lex_one!("+1", Token::Int(Sign::Plus, NumBase::Dec, "1"));
775        assert_lex_one!("+123", Token::Int(Sign::Plus, NumBase::Dec, "123"));
776        assert_lex_one!("-1", Token::Int(Sign::Minus, NumBase::Dec, "1"));
777        assert_lex_one!("-123", Token::Int(Sign::Minus, NumBase::Dec, "123"));
778        assert_lex_one!("0xd", Token::Int(Sign::Plus, NumBase::Hex, "d"));
779        assert_lex_one!("0xc0ffee", Token::Int(Sign::Plus, NumBase::Hex, "c0ffee"));
780        assert_lex_one!("+0xd", Token::Int(Sign::Plus, NumBase::Hex, "d"));
781        assert_lex_one!("+0xc0ffee", Token::Int(Sign::Plus, NumBase::Hex, "c0ffee"));
782        assert_lex_one!("-0xd", Token::Int(Sign::Minus, NumBase::Hex, "d"));
783        assert_lex_one!("-0xc0ffee", Token::Int(Sign::Minus, NumBase::Hex, "c0ffee"));
784    }
785
786    #[test]
787    fn floats() {
788        assert_lex_one!(
789            "123.",
790            Token::Float(
791                Sign::Plus,
792                Float::Val {
793                    base: NumBase::Dec,
794                    frac: "123.",
795                    exp: None,
796                }
797            )
798        );
799        assert_lex_one!(
800            "123.456",
801            Token::Float(
802                Sign::Plus,
803                Float::Val {
804                    base: NumBase::Dec,
805                    frac: "123.456",
806                    exp: None,
807                }
808            )
809        );
810        assert_lex_one!(
811            "+123.",
812            Token::Float(
813                Sign::Plus,
814                Float::Val {
815                    base: NumBase::Dec,
816                    frac: "123.",
817                    exp: None,
818                }
819            )
820        );
821        assert_lex_one!(
822            "-123.",
823            Token::Float(
824                Sign::Minus,
825                Float::Val {
826                    base: NumBase::Dec,
827                    frac: "123.",
828                    exp: None,
829                }
830            )
831        );
832        assert_lex_one!(
833            "123.e10",
834            Token::Float(
835                Sign::Plus,
836                Float::Val {
837                    base: NumBase::Dec,
838                    frac: "123.",
839                    exp: Some((Sign::Plus, "10")),
840                }
841            )
842        );
843        assert_lex_one!(
844            "123.456e10",
845            Token::Float(
846                Sign::Plus,
847                Float::Val {
848                    base: NumBase::Dec,
849                    frac: "123.456",
850                    exp: Some((Sign::Plus, "10")),
851                }
852            )
853        );
854        assert_lex_one!(
855            "1_2_3.4_5_6e1_0",
856            Token::Float(
857                Sign::Plus,
858                Float::Val {
859                    base: NumBase::Dec,
860                    frac: "1_2_3.4_5_6",
861                    exp: Some((Sign::Plus, "1_0")),
862                }
863            )
864        );
865        assert_lex_one!(
866            "123.E10",
867            Token::Float(
868                Sign::Plus,
869                Float::Val {
870                    base: NumBase::Dec,
871                    frac: "123.",
872                    exp: Some((Sign::Plus, "10")),
873                }
874            )
875        );
876        assert_lex_one!(
877            "123.e+10",
878            Token::Float(
879                Sign::Plus,
880                Float::Val {
881                    base: NumBase::Dec,
882                    frac: "123.",
883                    exp: Some((Sign::Plus, "10")),
884                }
885            )
886        );
887        assert_lex_one!(
888            "123.e-10",
889            Token::Float(
890                Sign::Plus,
891                Float::Val {
892                    base: NumBase::Dec,
893                    frac: "123.",
894                    exp: Some((Sign::Minus, "10")),
895                }
896            )
897        );
898
899        assert_lex_one!(
900            "0xc0f.",
901            Token::Float(
902                Sign::Plus,
903                Float::Val {
904                    base: NumBase::Hex,
905                    frac: "c0f.",
906                    exp: None,
907                }
908            )
909        );
910        assert_lex_one!(
911            "0xc0f.fee",
912            Token::Float(
913                Sign::Plus,
914                Float::Val {
915                    base: NumBase::Hex,
916                    frac: "c0f.fee",
917                    exp: None,
918                }
919            )
920        );
921        assert_lex_one!(
922            "+0xc0f.",
923            Token::Float(
924                Sign::Plus,
925                Float::Val {
926                    base: NumBase::Hex,
927                    frac: "c0f.",
928                    exp: None,
929                }
930            )
931        );
932        assert_lex_one!(
933            "-0xc0f.",
934            Token::Float(
935                Sign::Minus,
936                Float::Val {
937                    base: NumBase::Hex,
938                    frac: "c0f.",
939                    exp: None,
940                }
941            )
942        );
943        assert_lex_one!(
944            "0xc0f.p10",
945            Token::Float(
946                Sign::Plus,
947                Float::Val {
948                    base: NumBase::Hex,
949                    frac: "c0f.",
950                    exp: Some((Sign::Plus, "10")),
951                }
952            )
953        );
954        assert_lex_one!(
955            "0xc0f.feep10",
956            Token::Float(
957                Sign::Plus,
958                Float::Val {
959                    base: NumBase::Hex,
960                    frac: "c0f.fee",
961                    exp: Some((Sign::Plus, "10")),
962                }
963            )
964        );
965        assert_lex_one!(
966            "0xc_0_f.f_e_ep1_0",
967            Token::Float(
968                Sign::Plus,
969                Float::Val {
970                    base: NumBase::Hex,
971                    frac: "c_0_f.f_e_e",
972                    exp: Some((Sign::Plus, "1_0")),
973                }
974            )
975        );
976        assert_lex_one!(
977            "0xc0f.feeP10",
978            Token::Float(
979                Sign::Plus,
980                Float::Val {
981                    base: NumBase::Hex,
982                    frac: "c0f.fee",
983                    exp: Some((Sign::Plus, "10")),
984                }
985            )
986        );
987        assert_lex_one!(
988            "0xc0f.p+10",
989            Token::Float(
990                Sign::Plus,
991                Float::Val {
992                    base: NumBase::Hex,
993                    frac: "c0f.",
994                    exp: Some((Sign::Plus, "10")),
995                }
996            )
997        );
998        assert_lex_one!(
999            "0xc0f.p-10",
1000            Token::Float(
1001                Sign::Plus,
1002                Float::Val {
1003                    base: NumBase::Hex,
1004                    frac: "c0f.",
1005                    exp: Some((Sign::Minus, "10")),
1006                }
1007            )
1008        );
1009
1010        assert_lex_one!("inf", Token::Float(Sign::Plus, Float::Inf));
1011        assert_lex_one!("+inf", Token::Float(Sign::Plus, Float::Inf));
1012        assert_lex_one!("-inf", Token::Float(Sign::Minus, Float::Inf));
1013        assert_lex_one!("nan", Token::Float(Sign::Plus, Float::Nan(None)));
1014        assert_lex_one!("+nan", Token::Float(Sign::Plus, Float::Nan(None)));
1015        assert_lex_one!("-nan", Token::Float(Sign::Minus, Float::Nan(None)));
1016        assert_lex_one!("nan:0x1f", Token::Float(Sign::Plus, Float::Nan(Some("1f"))));
1017        assert_lex_one!("nan:0x1_f", Token::Float(Sign::Plus, Float::Nan(Some("1_f"))));
1018        assert_lex_one!("+nan:0x1f", Token::Float(Sign::Plus, Float::Nan(Some("1f"))));
1019        assert_lex_one!("-nan:0x1f", Token::Float(Sign::Minus, Float::Nan(Some("1f"))));
1020    }
1021
1022    #[test]
1023    fn unexpected_characters() {
1024        // '[' is a reserved character and cannot appear in wat for now
1025        assert_lex_error!("[", LexErrorKind::UnexpectedCharacter('['));
1026        assert_lex_error!(" [", LexErrorKind::UnexpectedCharacter('['));
1027        assert_lex_error!("(;_;) [", LexErrorKind::UnexpectedCharacter('['));
1028        assert_lex_error!(";;\n[", LexErrorKind::UnexpectedCharacter('['));
1029    }
1030
1031    #[test]
1032    fn hello_world() {
1033        let input = r#"
1034(module
1035 (type $i32_=>_none (func (param i32)))
1036 (type $none_=>_i32 (func (result i32)))
1037 (import "env" "print" (func $print (param i32)))
1038 (memory $0 2)
1039 (data (i32.const 1024) "Hello, world\n\00")
1040 (table $0 1 1 funcref)
1041 (global $global$0 (mut i32) (i32.const 66576))
1042 (export "memory" (memory $0))
1043 (export "_start" (func $_start))
1044 (func $_start (; 1 ;) (result i32)
1045  (call $print
1046   (i32.const 1024)
1047  )
1048  (i32.const 0)
1049 )
1050 ;; custom section "producers", size 27
1051)
1052        "#;
1053        let tokens = lex_all(input).unwrap();
1054        let tokens: Vec<_> = tokens.into_iter().map(|(t, _)| t).collect();
1055        assert_eq!(
1056            tokens,
1057            vec![
1058                Token::LParen,
1059                Token::Keyword("module"),
1060                Token::LParen,
1061                Token::Keyword("type"),
1062                Token::Ident("$i32_=>_none"),
1063                Token::LParen,
1064                Token::Keyword("func"),
1065                Token::LParen,
1066                Token::Keyword("param"),
1067                Token::Keyword("i32"),
1068                Token::RParen,
1069                Token::RParen,
1070                Token::RParen,
1071                Token::LParen,
1072                Token::Keyword("type"),
1073                Token::Ident("$none_=>_i32"),
1074                Token::LParen,
1075                Token::Keyword("func"),
1076                Token::LParen,
1077                Token::Keyword("result"),
1078                Token::Keyword("i32"),
1079                Token::RParen,
1080                Token::RParen,
1081                Token::RParen,
1082                Token::LParen,
1083                Token::Keyword("import"),
1084                Token::String(Cow::Borrowed(b"env"), r#""env""#),
1085                Token::String(Cow::Borrowed(b"print"), r#""print""#),
1086                Token::LParen,
1087                Token::Keyword("func"),
1088                Token::Ident("$print"),
1089                Token::LParen,
1090                Token::Keyword("param"),
1091                Token::Keyword("i32"),
1092                Token::RParen,
1093                Token::RParen,
1094                Token::RParen,
1095                Token::LParen,
1096                Token::Keyword("memory"),
1097                Token::Ident("$0"),
1098                Token::Int(Sign::Plus, NumBase::Dec, "2"),
1099                Token::RParen,
1100                Token::LParen,
1101                Token::Keyword("data"),
1102                Token::LParen,
1103                Token::Keyword("i32.const"),
1104                Token::Int(Sign::Plus, NumBase::Dec, "1024"),
1105                Token::RParen,
1106                Token::String(Cow::Borrowed(b"Hello, world\n\x00"), r#""Hello, world\n\00""#),
1107                Token::RParen,
1108                Token::LParen,
1109                Token::Keyword("table"),
1110                Token::Ident("$0"),
1111                Token::Int(Sign::Plus, NumBase::Dec, "1"),
1112                Token::Int(Sign::Plus, NumBase::Dec, "1"),
1113                Token::Keyword("funcref"),
1114                Token::RParen,
1115                Token::LParen,
1116                Token::Keyword("global"),
1117                Token::Ident("$global$0"),
1118                Token::LParen,
1119                Token::Keyword("mut"),
1120                Token::Keyword("i32"),
1121                Token::RParen,
1122                Token::LParen,
1123                Token::Keyword("i32.const"),
1124                Token::Int(Sign::Plus, NumBase::Dec, "66576"),
1125                Token::RParen,
1126                Token::RParen,
1127                Token::LParen,
1128                Token::Keyword("export"),
1129                Token::String(Cow::Borrowed(b"memory"), r#""memory""#),
1130                Token::LParen,
1131                Token::Keyword("memory"),
1132                Token::Ident("$0"),
1133                Token::RParen,
1134                Token::RParen,
1135                Token::LParen,
1136                Token::Keyword("export"),
1137                Token::String(Cow::Borrowed(b"_start"), r#""_start""#),
1138                Token::LParen,
1139                Token::Keyword("func"),
1140                Token::Ident("$_start"),
1141                Token::RParen,
1142                Token::RParen,
1143                Token::LParen,
1144                Token::Keyword("func"),
1145                Token::Ident("$_start"),
1146                Token::LParen,
1147                Token::Keyword("result"),
1148                Token::Keyword("i32"),
1149                Token::RParen,
1150                Token::LParen,
1151                Token::Keyword("call"),
1152                Token::Ident("$print"),
1153                Token::LParen,
1154                Token::Keyword("i32.const"),
1155                Token::Int(Sign::Plus, NumBase::Dec, "1024"),
1156                Token::RParen,
1157                Token::RParen,
1158                Token::LParen,
1159                Token::Keyword("i32.const"),
1160                Token::Int(Sign::Plus, NumBase::Dec, "0"),
1161                Token::RParen,
1162                Token::RParen,
1163                Token::RParen,
1164            ]
1165        );
1166    }
1167
1168    #[test]
1169    fn apply_sign() {
1170        assert_eq!(Sign::Plus.apply(42), 42);
1171        assert_eq!(Sign::Plus.apply(-42), -42);
1172        assert_eq!(Sign::Plus.apply(1.0), 1.0);
1173        assert_eq!(Sign::Plus.apply(-1.0), -1.0);
1174        assert_eq!(Sign::Minus.apply(42), -42);
1175        assert_eq!(Sign::Minus.apply(-42), 42);
1176        assert_eq!(Sign::Minus.apply(1.0), -1.0);
1177        assert_eq!(Sign::Minus.apply(-1.0), 1.0);
1178    }
1179}