// huff_lexer/lib.rs

1#![doc = include_str!("../README.md")]
2#![allow(dead_code)]
3#![warn(missing_docs)]
4#![warn(unused_extern_crates)]
5#![forbid(unsafe_code)]
6#![forbid(where_clauses_object_safety)]
7
8use huff_utils::{bytes_util::*, error::*, evm::*, span::*, token::*, types::*};
9use regex::Regex;
10use std::{iter::Peekable, str::Chars};
11
/// Defines a context in which the lexing happens.
/// Allows to differentiate between EVM types and opcodes that can either
/// be identical or the latter being a substring of the former (example: bytes32 and byte)
#[derive(Debug, PartialEq, Eq)]
pub enum Context {
    /// Global context — outside of any definition.
    Global,
    /// Macro definition context (entered on the `macro` keyword, left when the
    /// body's opening brace is lexed).
    MacroDefinition,
    /// Macro's body context — the only context in which opcodes are lexed.
    MacroBody,
    /// ABI context (entered on the `function` or `event` keywords).
    Abi,
    /// Lexing args of functions inputs/outputs and events — the only context in
    /// which EVM types are lexed.
    AbiArgs,
    /// Constant context (entered on the `constant` keyword).
    Constant,
}
30
/// ## Lexer
///
/// The lexer encapsulated in a struct.
pub struct Lexer<'a> {
    /// The source code as peekable chars.
    /// Used only for lookahead (cloned by `nth_peek`/`dyn_peek`).
    /// SHOULD NOT BE MODIFIED EVER!
    pub reference_chars: Peekable<Chars<'a>>,
    /// The source code as peekable chars.
    /// Advanced by `consume` as tokens are lexed.
    pub chars: Peekable<Chars<'a>>,
    /// The raw source code.
    pub source: &'a str,
    /// The current lexing span (start/end offsets into `source`).
    pub span: Span,
    /// The previous lexed Token.
    /// Cannot be a whitespace.
    pub lookback: Option<Token>,
    /// If the lexer has reached the end of file.
    pub eof: bool,
    /// EOF Token has been returned.
    pub eof_returned: bool,
    /// Current context.
    pub context: Context,
}
54
55impl<'a> Lexer<'a> {
56    /// Public associated function that instantiates a new lexer.
57    pub fn new(source: &'a str) -> Self {
58        Self {
59            reference_chars: source.chars().peekable(),
60            chars: source.chars().peekable(),
61            source,
62            span: Span::default(),
63            lookback: None,
64            eof: false,
65            eof_returned: false,
66            context: Context::Global,
67        }
68    }
69
70    // TODO: This does not account for commented out imports for example:
71    // `// #include "./Utils.huff"`
72    /// Lex all imports
73    pub fn lex_imports(source: &str) -> Vec<String> {
74        let mut imports = vec![];
75        let mut peekable_source = source.chars().peekable();
76        let mut include_chars_iterator = "#include".chars().peekable();
77        while peekable_source.peek().is_some() {
78            while let Some(nc) = peekable_source.next() {
79                if include_chars_iterator.peek().is_none() {
80                    // Reset the include chars iterator
81                    include_chars_iterator = "#include".chars().peekable();
82
83                    // Skip over whitespace
84                    while peekable_source.peek().is_some() {
85                        if !peekable_source.peek().unwrap().is_whitespace() {
86                            break
87                        } else {
88                            peekable_source.next();
89                        }
90                    }
91
92                    // Then we should have an import path between quotes
93                    match peekable_source.peek() {
94                        Some(char) => match char {
95                            '"' | '\'' => {
96                                peekable_source.next();
97                                let mut import = String::new();
98                                while peekable_source.peek().is_some() {
99                                    match peekable_source.next().unwrap() {
100                                        '"' | '\'' => {
101                                            imports.push(import);
102                                            break
103                                        }
104                                        c => import.push(c),
105                                    }
106                                }
107                            }
108                            _ => { /* Ignore non-include tokens */ }
109                        },
110                        None => { /* EOF */ }
111                    }
112                } else if nc != include_chars_iterator.next().unwrap() {
113                    include_chars_iterator = "#include".chars().peekable();
114                    break
115                }
116            }
117        }
118        imports
119    }
120
121    /// Public associated function that returns the current lexing span.
122    pub fn current_span(&self) -> Span {
123        if self.eof {
124            Span::EOF
125        } else {
126            self.span
127        }
128    }
129
130    /// Get the length of the previous lexing span.
131    pub fn lookback_len(&self) -> usize {
132        if let Some(lookback) = &self.lookback {
133            return lookback.span.end - lookback.span.start
134        }
135        0
136    }
137
138    /// Checks the previous token kind against the input.
139    pub fn checked_lookback(&self, kind: TokenKind) -> bool {
140        self.lookback.clone().and_then(|t| if t.kind == kind { Some(true) } else { None }).is_some()
141    }
142
143    /// Try to peek at the next character from the source
144    pub fn peek(&mut self) -> Option<char> {
145        self.chars.peek().copied()
146    }
147
148    /// Dynamically peeks characters based on the filter
149    pub fn dyn_peek(&mut self, f: impl Fn(&char) -> bool + Copy) -> String {
150        let mut chars: Vec<char> = Vec::new();
151        let mut current_pos = self.span.start;
152        while self.nth_peek(current_pos).map(|x| f(&x)).unwrap_or(false) {
153            chars.push(self.nth_peek(current_pos).unwrap());
154            current_pos += 1;
155        }
156        chars.iter().collect()
157    }
158
159    /// Try to peek at the nth character from the source
160    pub fn nth_peek(&mut self, n: usize) -> Option<char> {
161        self.reference_chars.clone().nth(n)
162    }
163
164    /// Try to peek at next n characters from the source
165    pub fn peek_n_chars(&mut self, n: usize) -> String {
166        let mut newspan: Span = self.span;
167        newspan.end += n;
168        // Break with an empty string if the bounds are exceeded
169        if newspan.end > self.source.len() {
170            return String::default()
171        }
172        self.source[newspan.range().unwrap()].to_string()
173    }
174
175    /// Peek n chars from a given start point in the source
176    pub fn peek_n_chars_from(&mut self, n: usize, from: usize) -> String {
177        self.source[Span::new(from..(from + n)).range().unwrap()].to_string()
178    }
179
    /// Gets the current slice of the source code covered by span
    ///
    /// NOTE(review): span offsets are incremented once per consumed `char`
    /// but used here as byte indices into `source`; for non-ASCII input the
    /// two diverge and this would slice at the wrong position (or panic on a
    /// char boundary) — TODO confirm the lexer only ever sees ASCII sources.
    pub fn slice(&self) -> &'a str {
        &self.source[self.span.range().unwrap()]
    }
184
185    /// Consumes the characters
186    pub fn consume(&mut self) -> Option<char> {
187        self.chars.next().map(|x| {
188            self.span.end += 1;
189            x
190        })
191    }
192
193    /// Consumes n characters
194    pub fn nconsume(&mut self, count: usize) {
195        for _ in 0..count {
196            let _ = self.consume();
197        }
198    }
199
200    /// Consume characters until a sequence matches
201    pub fn seq_consume(&mut self, word: &str) {
202        let mut current_pos = self.span.start;
203        while self.peek() != None {
204            let peeked = self.peek_n_chars_from(word.len(), current_pos);
205            if word == peeked {
206                break
207            }
208            self.consume();
209            current_pos += 1;
210        }
211    }
212
213    /// Dynamically consumes characters based on filters
214    pub fn dyn_consume(&mut self, f: impl Fn(&char) -> bool + Copy) {
215        while self.peek().map(|x| f(&x)).unwrap_or(false) {
216            self.consume();
217        }
218    }
219
    /// Resets the Lexer's span
    ///
    /// Collapses the span to a zero-width range at the previous end
    /// position, ready for the next token to be lexed.
    pub fn reset(&mut self) {
        self.span.start = self.span.end;
    }
226
227    /// Check if a given keyword follows the keyword rules in the `source`. If not, it is a
228    /// `TokenKind::Ident`.
229    ///
230    /// Rules:
231    /// - The `macro`, `function`, `constant`, `event` keywords must be preceded by a `#define`
232    ///   keyword.
233    /// - The `takes` keyword must be preceded by an assignment operator: `=`.
234    /// - The `nonpayable`, `payable`, `view`, and `pure` keywords must be preceeded by one of these
235    ///   keywords or a close paren.
236    /// - The `returns` keyword must be succeeded by an open parenthesis and must *not* be succeeded
237    ///   by a colon or preceded by the keyword `function`
238    pub fn check_keyword_rules(&mut self, found_kind: &Option<TokenKind>) -> bool {
239        match found_kind {
240            Some(TokenKind::Macro) |
241            Some(TokenKind::Function) |
242            Some(TokenKind::Constant) |
243            Some(TokenKind::Event) => self.checked_lookback(TokenKind::Define),
244            Some(TokenKind::NonPayable) |
245            Some(TokenKind::Payable) |
246            Some(TokenKind::View) |
247            Some(TokenKind::Pure) => {
248                let keys = [
249                    TokenKind::NonPayable,
250                    TokenKind::Payable,
251                    TokenKind::View,
252                    TokenKind::Pure,
253                    TokenKind::CloseParen,
254                ];
255                for key in keys {
256                    if self.checked_lookback(key) {
257                        return true
258                    }
259                }
260                false
261            }
262            Some(TokenKind::Takes) => self.checked_lookback(TokenKind::Assign),
263            Some(TokenKind::Returns) => {
264                // Allow for loose and tight syntax (e.g. `returns (0)` & `returns(0)`)
265                self.peek_n_chars_from(2, self.span.end).trim().starts_with('(') &&
266                    !self.checked_lookback(TokenKind::Function) &&
267                    self.peek_n_chars_from(1, self.span.end) != ":"
268            }
269            _ => true,
270        }
271    }
272}
273
274impl<'a> Iterator for Lexer<'a> {
275    type Item = Result<Token, LexicalError<'a>>;
276
277    /// Iterates over the source code
278    fn next(&mut self) -> Option<Self::Item> {
279        self.reset();
280        if let Some(ch) = self.consume() {
281            let kind = match ch {
282                // Comments
283                '/' => {
284                    if let Some(ch2) = self.peek() {
285                        match ch2 {
286                            '/' => {
287                                self.consume();
288                                // Consume until newline
289                                self.dyn_consume(|c| *c != '\n');
290                                TokenKind::Comment(self.slice().to_string())
291                            }
292                            '*' => {
293                                self.consume();
294                                // Consume until next '*/' occurance
295                                self.seq_consume("*/");
296                                TokenKind::Comment(self.slice().to_string())
297                            }
298                            _ => TokenKind::Div,
299                        }
300                    } else {
301                        TokenKind::Div
302                    }
303                }
304                // # keywords
305                '#' => {
306                    let mut found_kind: Option<TokenKind> = None;
307
308                    let keys = [TokenKind::Define, TokenKind::Include];
309                    for kind in &keys {
310                        let key = kind.to_string();
311                        let token_length = key.len() - 1;
312                        let peeked = self.peek_n_chars(token_length);
313
314                        if *key == peeked {
315                            self.nconsume(token_length);
316                            found_kind = Some(kind.clone());
317                            break
318                        }
319                    }
320
321                    if let Some(kind) = found_kind {
322                        kind
323                    } else {
324                        // Otherwise we don't support # prefixed indentifiers
325                        return Some(Err(LexicalError::new(
326                            LexicalErrorKind::InvalidCharacter('#'),
327                            self.current_span(),
328                        )))
329                    }
330                }
331                // Alphabetical characters
332                ch if ch.is_alphabetic() => {
333                    let mut found_kind: Option<TokenKind> = None;
334
335                    let keys = [
336                        TokenKind::Macro,
337                        TokenKind::Function,
338                        TokenKind::Constant,
339                        TokenKind::Takes,
340                        TokenKind::Returns,
341                        TokenKind::Event,
342                        TokenKind::NonPayable,
343                        TokenKind::Payable,
344                        TokenKind::Indexed,
345                        TokenKind::View,
346                        TokenKind::Pure,
347                    ];
348                    for kind in &keys {
349                        if self.context == Context::MacroBody {
350                            break
351                        }
352                        let key = kind.to_string();
353                        let token_length = key.len() - 1;
354                        let peeked = self.peek_n_chars(token_length);
355
356                        if *key == peeked {
357                            self.nconsume(token_length);
358                            found_kind = Some(kind.clone());
359                            break
360                        }
361                    }
362
363                    // Check to see if the found kind is, in fact, a keyword and not the name of
364                    // a function. If it is, set `found_kind` to `None` so that it is set to a
365                    // `TokenKind::Ident` in the following control flow.
366                    if !self.check_keyword_rules(&found_kind) {
367                        found_kind = None;
368                    }
369
370                    if let Some(tokind) = &found_kind {
371                        match tokind {
372                            TokenKind::Macro => self.context = Context::MacroDefinition,
373                            TokenKind::Function | TokenKind::Event => self.context = Context::Abi,
374                            TokenKind::Constant => self.context = Context::Constant,
375                            _ => (),
376                        }
377                    }
378
379                    // Check for macro keyword
380                    let fsp = "FREE_STORAGE_POINTER";
381                    let token_length = fsp.len() - 1;
382                    let peeked = self.peek_n_chars(token_length);
383                    if fsp == peeked {
384                        self.nconsume(token_length);
385                        // Consume the parenthesis following the FREE_STORAGE_POINTER
386                        // Note: This will consume `FREE_STORAGE_POINTER)` or
387                        // `FREE_STORAGE_POINTER(` as well
388                        if let Some('(') = self.peek() {
389                            self.consume();
390                        }
391                        if let Some(')') = self.peek() {
392                            self.consume();
393                        }
394                        found_kind = Some(TokenKind::FreeStoragePointer);
395                    }
396
397                    let potential_label: String =
398                        self.dyn_peek(|c| c.is_alphanumeric() || c == &'_' || c == &':');
399                    if let true = potential_label.ends_with(':') {
400                        self.dyn_consume(|c| c.is_alphanumeric() || c == &'_' || c == &':');
401                        let label = self.slice();
402                        if let Some(l) = label.get(0..label.len() - 1) {
403                            found_kind = Some(TokenKind::Label(l.to_string()));
404                        } else {
405                            tracing::error!("[huff_lexer] Fatal Label Colon Truncation!");
406                        }
407                    }
408
409                    let pot_op = self.dyn_peek(|c| c.is_alphanumeric());
410                    // goes over all opcodes
411                    for opcode in OPCODES {
412                        if self.context != Context::MacroBody {
413                            break
414                        }
415                        if opcode == pot_op {
416                            self.dyn_consume(|c| c.is_alphanumeric());
417                            if let Some(o) = OPCODES_MAP.get(opcode) {
418                                found_kind = Some(TokenKind::Opcode(o.to_owned()));
419                            } else {
420                                tracing::error!("[huff_lexer] Fatal Opcode Mapping!");
421                            }
422                            break
423                        }
424                    }
425
426                    // Last case ; we are in ABI context and
427                    // we are parsing an EVM type
428                    if self.context == Context::AbiArgs {
429                        let curr_char = self.peek()?;
430                        if !['(', ')'].contains(&curr_char) {
431                            self.dyn_consume(|c| c.is_alphanumeric() || *c == '[' || *c == ']');
432                            // got a type at this point, we have to know which
433                            let raw_type: &str = self.slice();
434                            // check for arrays first
435                            if EVM_TYPE_ARRAY_REGEX.is_match(raw_type) {
436                                // split to get array size and type
437                                // TODO: support multi-dimensional arrays
438                                let mut words: Vec<String> = Regex::new(r"\[")
439                                    .unwrap()
440                                    .split(raw_type)
441                                    .map(|x| x.replace(']', ""))
442                                    .collect();
443                                // unbounded array == array with a size of 0
444                                if words[1].is_empty() {
445                                    words[1] = String::from("0");
446                                }
447                                let arr_size: usize = words[1]
448                                    .parse::<usize>()
449                                    .map_err(|_| {
450                                        let err = LexicalError {
451                                            kind: LexicalErrorKind::InvalidArraySize(&words[1]),
452                                            span: self.span,
453                                        };
454                                        tracing::error!("{}", format!("{:?}", err));
455                                        err
456                                    })
457                                    .unwrap();
458                                let primitive = PrimitiveEVMType::try_from(words[0].clone());
459                                if let Ok(primitive) = primitive {
460                                    found_kind = Some(TokenKind::ArrayType(primitive, arr_size));
461                                } else {
462                                    let err = LexicalError {
463                                        kind: LexicalErrorKind::InvalidPrimitiveType(&words[0]),
464                                        span: self.span,
465                                    };
466                                    tracing::error!("{}", format!("{:?}", err));
467                                }
468                            } else {
469                                // We don't want to consider any argument names or the "indexed"
470                                // keyword here.
471                                let primitive = PrimitiveEVMType::try_from(raw_type.to_string());
472                                if let Ok(primitive) = primitive {
473                                    found_kind = Some(TokenKind::PrimitiveType(primitive));
474                                }
475                            }
476                        }
477                    }
478
479                    if let Some(kind) = &found_kind {
480                        kind.clone()
481                    } else {
482                        self.dyn_consume(|c| c.is_alphanumeric() || c.eq(&'_'));
483                        TokenKind::Ident(self.slice().to_string())
484                    }
485                }
486                // If it's the start of a hex literal
487                ch if ch == '0' && self.peek().unwrap() == 'x' => {
488                    self.consume(); // Consume the 'x' after '0' (separated from the `dyn_consume` so we don't have
489                                    // to match `x` in the actual hex)
490                    self.dyn_consume(|c| {
491                        c.is_numeric() ||
492                            // Match a-f & A-F
493                            matches!(c, '\u{0041}'..='\u{0046}' | '\u{0061}'..='\u{0066}')
494                    });
495                    self.span.start += 2; // Ignore the "0x"
496                    TokenKind::Literal(str_to_bytes32(self.slice()))
497                }
498                '=' => TokenKind::Assign,
499                '(' => {
500                    if self.context == Context::Abi {
501                        self.context = Context::AbiArgs;
502                    }
503                    TokenKind::OpenParen
504                }
505                ')' => {
506                    if self.context == Context::AbiArgs {
507                        self.context = Context::Abi;
508                    }
509                    TokenKind::CloseParen
510                }
511                '[' => TokenKind::OpenBracket,
512                ']' => TokenKind::CloseBracket,
513                '{' => {
514                    if self.context == Context::MacroDefinition {
515                        self.context = Context::MacroBody;
516                    }
517                    TokenKind::OpenBrace
518                }
519                '}' => {
520                    if self.context == Context::MacroBody {
521                        self.context = Context::Global;
522                    }
523                    TokenKind::CloseBrace
524                }
525                '+' => TokenKind::Add,
526                '-' => TokenKind::Sub,
527                '*' => TokenKind::Mul,
528                '<' => TokenKind::LeftAngle,
529                '>' => TokenKind::RightAngle,
530                // NOTE: TokenKind::Div is lexed further up since it overlaps with comment
531                ':' => TokenKind::Colon,
532                // identifiers
533                ',' => TokenKind::Comma,
534                '0'..='9' => {
535                    self.dyn_consume(char::is_ascii_digit);
536                    TokenKind::Num(self.slice().parse().unwrap())
537                }
538                // Lexes Spaces and Newlines as Whitespace
539                ch if ch.is_ascii_whitespace() => {
540                    self.dyn_consume(char::is_ascii_whitespace);
541                    TokenKind::Whitespace
542                }
543                // String literals
544                '"' => loop {
545                    match self.peek() {
546                        Some('"') => {
547                            self.consume();
548                            let str = self.slice();
549                            break TokenKind::Str((&str[1..str.len() - 1]).to_string())
550                        }
551                        Some('\\') if matches!(self.nth_peek(1), Some('\\') | Some('"')) => {
552                            self.consume();
553                        }
554                        Some(_) => {}
555                        None => {
556                            self.eof = true;
557                            return Some(Err(LexicalError::new(
558                                LexicalErrorKind::UnexpectedEof,
559                                self.span,
560                            )))
561                        }
562                    }
563                    self.consume();
564                },
565                // Allow string literals to be wrapped by single quotes
566                '\'' => loop {
567                    match self.peek() {
568                        Some('\'') => {
569                            self.consume();
570                            let str = self.slice();
571                            break TokenKind::Str((&str[1..str.len() - 1]).to_string())
572                        }
573                        Some('\\') if matches!(self.nth_peek(1), Some('\\') | Some('\'')) => {
574                            self.consume();
575                        }
576                        Some(_) => {}
577                        None => {
578                            self.eof = true;
579                            return Some(Err(LexicalError::new(
580                                LexicalErrorKind::UnexpectedEof,
581                                self.span,
582                            )))
583                        }
584                    }
585                    self.consume();
586                },
587                // At this point, the source code has an invalid or unsupported token
588                ch => {
589                    return Some(Err(LexicalError::new(
590                        LexicalErrorKind::InvalidCharacter(ch),
591                        self.span,
592                    )))
593                }
594            };
595
596            if self.peek().is_none() {
597                self.eof = true;
598            }
599
600            let token = Token { kind, span: self.span };
601            if token.kind != TokenKind::Whitespace {
602                self.lookback = Some(token.clone());
603            }
604
605            return Some(Ok(token))
606        }
607
608        // Mark EOF
609        self.eof = true;
610
611        // If we haven't returned an eof token, return one
612        if !self.eof_returned {
613            self.eof_returned = true;
614            let token = Token { kind: TokenKind::Eof, span: self.span };
615            if token.kind != TokenKind::Whitespace {
616                self.lookback = Some(token.clone());
617            }
618            return Some(Ok(token))
619        }
620
621        None
622    }
623}