huff_lexer/lib.rs
#![doc = include_str!("../README.md")]
#![allow(dead_code)]
#![warn(missing_docs)]
#![warn(unused_extern_crates)]
#![forbid(unsafe_code)]
#![forbid(where_clauses_object_safety)]

use huff_utils::{bytes_util::*, error::*, evm::*, span::*, token::*, types::*};
use regex::Regex;
use std::{iter::Peekable, str::Chars};

/// Defines a context in which the lexing happens.
/// Allows us to differentiate between EVM types and opcodes that can either be
/// identical or have the opcode be a substring of the type (example: `bytes32` and `byte`).
#[derive(Debug, PartialEq, Eq)]
pub enum Context {
    /// Global context
    Global,
    /// Macro definition context
    MacroDefinition,
    /// Macro's body context
    MacroBody,
    /// ABI context
    Abi,
    /// Lexing arguments of function inputs/outputs and events
    AbiArgs,
    /// Constant context
    Constant,
}

/// ## Lexer
///
/// The lexer encapsulated in a struct.
pub struct Lexer<'a> {
    /// The source code as peekable chars.
    /// SHOULD NOT BE MODIFIED EVER!
    pub reference_chars: Peekable<Chars<'a>>,
    /// The source code as peekable chars.
    pub chars: Peekable<Chars<'a>>,
    /// The raw source code.
    pub source: &'a str,
    /// The current lexing span.
    pub span: Span,
    /// The previously lexed Token.
    /// Cannot be a whitespace.
    pub lookback: Option<Token>,
    /// If the lexer has reached the end of file.
    pub eof: bool,
    /// Whether an EOF Token has been returned.
    pub eof_returned: bool,
    /// Current context.
    pub context: Context,
}

impl<'a> Lexer<'a> {
    /// Public associated function that instantiates a new lexer.
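    ///
    /// A minimal usage sketch, assuming the crate is consumed as `huff_lexer`:
    /// ```rust
    /// use huff_lexer::Lexer;
    ///
    /// let lexer = Lexer::new("#define macro MAIN() = takes(0) returns(0) {}");
    /// assert_eq!(lexer.source, "#define macro MAIN() = takes(0) returns(0) {}");
    /// ```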
    pub fn new(source: &'a str) -> Self {
        Self {
            reference_chars: source.chars().peekable(),
            chars: source.chars().peekable(),
            source,
            span: Span::default(),
            lookback: None,
            eof: false,
            eof_returned: false,
            context: Context::Global,
        }
    }

    // TODO: This does not account for commented-out imports, for example:
    // `// #include "./Utils.huff"`
    /// Lex all imports
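    ///
    /// A minimal usage sketch:
    /// ```rust
    /// use huff_lexer::Lexer;
    ///
    /// let imports = Lexer::lex_imports("#include \"./Utils.huff\"");
    /// assert_eq!(imports, vec!["./Utils.huff".to_string()]);
    /// ```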
    pub fn lex_imports(source: &str) -> Vec<String> {
        let mut imports = vec![];
        let mut peekable_source = source.chars().peekable();
        let mut include_chars_iterator = "#include".chars().peekable();
        while peekable_source.peek().is_some() {
            while let Some(nc) = peekable_source.next() {
                if include_chars_iterator.peek().is_none() {
                    // Reset the include chars iterator
                    include_chars_iterator = "#include".chars().peekable();

                    // Skip over whitespace
                    while peekable_source.peek().is_some() {
                        if !peekable_source.peek().unwrap().is_whitespace() {
                            break
                        } else {
                            peekable_source.next();
                        }
                    }

                    // Then we should have an import path between quotes
                    match peekable_source.peek() {
                        Some(char) => match char {
                            '"' | '\'' => {
                                peekable_source.next();
                                let mut import = String::new();
                                while peekable_source.peek().is_some() {
                                    match peekable_source.next().unwrap() {
                                        '"' | '\'' => {
                                            imports.push(import);
                                            break
                                        }
                                        c => import.push(c),
                                    }
                                }
                            }
                            _ => { /* Ignore non-include tokens */ }
                        },
                        None => { /* EOF */ }
                    }
                } else if nc != include_chars_iterator.next().unwrap() {
                    include_chars_iterator = "#include".chars().peekable();
                    break
                }
            }
        }
        imports
    }

    /// Public associated function that returns the current lexing span.
    pub fn current_span(&self) -> Span {
        if self.eof {
            Span::EOF
        } else {
            self.span
        }
    }

    /// Get the length of the previous lexing span.
    pub fn lookback_len(&self) -> usize {
        if let Some(lookback) = &self.lookback {
            return lookback.span.end - lookback.span.start
        }
        0
    }

    /// Checks the previous token kind against the input.
    pub fn checked_lookback(&self, kind: TokenKind) -> bool {
        self.lookback.as_ref().map_or(false, |t| t.kind == kind)
    }

    /// Try to peek at the next character from the source
    pub fn peek(&mut self) -> Option<char> {
        self.chars.peek().copied()
    }

    /// Dynamically peeks characters based on the filter
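    ///
    /// A minimal usage sketch (assumes `Span::default()` starts at position 0, as
    /// `new` relies on):
    /// ```rust
    /// use huff_lexer::Lexer;
    ///
    /// let mut lexer = Lexer::new("hello world");
    /// // Peeks without consuming: stops at the first non-alphabetic character.
    /// assert_eq!(lexer.dyn_peek(|c| c.is_alphabetic()), "hello");
    /// ```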
    pub fn dyn_peek(&mut self, f: impl Fn(&char) -> bool + Copy) -> String {
        let mut chars: Vec<char> = Vec::new();
        let mut current_pos = self.span.start;
        while self.nth_peek(current_pos).map(|x| f(&x)).unwrap_or(false) {
            chars.push(self.nth_peek(current_pos).unwrap());
            current_pos += 1;
        }
        chars.iter().collect()
    }

    /// Try to peek at the nth character from the source
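    ///
    /// Note that `n` is an absolute index from the start of the source, not an
    /// offset from the current position. A minimal usage sketch:
    /// ```rust
    /// use huff_lexer::Lexer;
    ///
    /// let mut lexer = Lexer::new("abc");
    /// assert_eq!(lexer.nth_peek(1), Some('b'));
    /// ```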
    pub fn nth_peek(&mut self, n: usize) -> Option<char> {
        self.reference_chars.clone().nth(n)
    }

    /// Try to peek at the next n characters from the source
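    ///
    /// A minimal usage sketch:
    /// ```rust
    /// use huff_lexer::Lexer;
    ///
    /// let mut lexer = Lexer::new("hello");
    /// // Nothing has been consumed yet, so this peeks from the start.
    /// assert_eq!(lexer.peek_n_chars(3), "hel");
    /// ```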
    pub fn peek_n_chars(&mut self, n: usize) -> String {
        let mut newspan: Span = self.span;
        newspan.end += n;
        // Bail with an empty string if the bounds are exceeded
        if newspan.end > self.source.len() {
            return String::default()
        }
        self.source[newspan.range().unwrap()].to_string()
    }

    /// Peek n chars from a given start point in the source
    pub fn peek_n_chars_from(&mut self, n: usize, from: usize) -> String {
        self.source[Span::new(from..(from + n)).range().unwrap()].to_string()
    }

    /// Gets the current slice of the source code covered by the span
    pub fn slice(&self) -> &'a str {
        &self.source[self.span.range().unwrap()]
    }

    /// Consumes the characters
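    ///
    /// A minimal usage sketch:
    /// ```rust
    /// use huff_lexer::Lexer;
    ///
    /// let mut lexer = Lexer::new("hi");
    /// assert_eq!(lexer.consume(), Some('h'));
    /// assert_eq!(lexer.consume(), Some('i'));
    /// assert_eq!(lexer.consume(), None);
    /// ```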
    pub fn consume(&mut self) -> Option<char> {
        self.chars.next().map(|x| {
            self.span.end += 1;
            x
        })
    }

    /// Consumes n characters
    pub fn nconsume(&mut self, count: usize) {
        for _ in 0..count {
            let _ = self.consume();
        }
    }

    /// Consume characters until a sequence matches
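    ///
    /// A minimal usage sketch: consumption stops just before the matching sequence.
    /// ```rust
    /// use huff_lexer::Lexer;
    ///
    /// let mut lexer = Lexer::new("abc*/def");
    /// lexer.seq_consume("*/");
    /// assert_eq!(lexer.slice(), "abc");
    /// ```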
    pub fn seq_consume(&mut self, word: &str) {
        let mut current_pos = self.span.start;
        while self.peek().is_some() {
            let peeked = self.peek_n_chars_from(word.len(), current_pos);
            if word == peeked {
                break
            }
            self.consume();
            current_pos += 1;
        }
    }

    /// Dynamically consumes characters based on filters
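    ///
    /// A minimal usage sketch:
    /// ```rust
    /// use huff_lexer::Lexer;
    ///
    /// let mut lexer = Lexer::new("1234abc");
    /// lexer.dyn_consume(|c| c.is_numeric());
    /// assert_eq!(lexer.slice(), "1234");
    /// ```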
    pub fn dyn_consume(&mut self, f: impl Fn(&char) -> bool + Copy) {
        while self.peek().map(|x| f(&x)).unwrap_or(false) {
            self.consume();
        }
    }

    /// Resets the Lexer's span
    ///
    /// Sets the span's start to its current end, yielding an empty span at the
    /// current lexing position.
    pub fn reset(&mut self) {
        self.span.start = self.span.end;
    }

    /// Check if a given keyword follows the keyword rules in the `source`. If not, it is a
    /// `TokenKind::Ident`.
    ///
    /// Rules:
    /// - The `macro`, `function`, `constant`, `event` keywords must be preceded by a `#define`
    ///   keyword.
    /// - The `takes` keyword must be preceded by an assignment operator: `=`.
    /// - The `nonpayable`, `payable`, `view`, and `pure` keywords must be preceded by one of
    ///   these keywords or a close paren.
    /// - The `returns` keyword must be succeeded by an open parenthesis and must *not* be
    ///   succeeded by a colon or preceded by the keyword `function`.
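    ///
    /// A minimal sketch of the first rule (assumes `Token` and `Span` are constructible
    /// with public fields, as the lexer itself does below):
    /// ```rust
    /// use huff_lexer::Lexer;
    /// use huff_utils::{span::Span, token::{Token, TokenKind}};
    ///
    /// let mut lexer = Lexer::new("macro");
    /// // Without a `#define` lookback, `macro` is not accepted as a keyword.
    /// assert!(!lexer.check_keyword_rules(&Some(TokenKind::Macro)));
    ///
    /// // With a `#define` lookback, it is.
    /// lexer.lookback = Some(Token { kind: TokenKind::Define, span: Span::default() });
    /// assert!(lexer.check_keyword_rules(&Some(TokenKind::Macro)));
    /// ```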
    pub fn check_keyword_rules(&mut self, found_kind: &Option<TokenKind>) -> bool {
        match found_kind {
            Some(TokenKind::Macro) |
            Some(TokenKind::Function) |
            Some(TokenKind::Constant) |
            Some(TokenKind::Event) => self.checked_lookback(TokenKind::Define),
            Some(TokenKind::NonPayable) |
            Some(TokenKind::Payable) |
            Some(TokenKind::View) |
            Some(TokenKind::Pure) => {
                let keys = [
                    TokenKind::NonPayable,
                    TokenKind::Payable,
                    TokenKind::View,
                    TokenKind::Pure,
                    TokenKind::CloseParen,
                ];
                for key in keys {
                    if self.checked_lookback(key) {
                        return true
                    }
                }
                false
            }
            Some(TokenKind::Takes) => self.checked_lookback(TokenKind::Assign),
            Some(TokenKind::Returns) => {
                // Allow for loose and tight syntax (e.g. `returns (0)` & `returns(0)`)
                self.peek_n_chars_from(2, self.span.end).trim().starts_with('(') &&
                    !self.checked_lookback(TokenKind::Function) &&
                    self.peek_n_chars_from(1, self.span.end) != ":"
            }
            _ => true,
        }
    }
}

impl<'a> Iterator for Lexer<'a> {
    type Item = Result<Token, LexicalError<'a>>;

    /// Iterates over the source code
    fn next(&mut self) -> Option<Self::Item> {
        self.reset();
        if let Some(ch) = self.consume() {
            let kind = match ch {
                // Comments
                '/' => {
                    if let Some(ch2) = self.peek() {
                        match ch2 {
                            '/' => {
                                self.consume();
                                // Consume until newline
                                self.dyn_consume(|c| *c != '\n');
                                TokenKind::Comment(self.slice().to_string())
                            }
                            '*' => {
                                self.consume();
                                // Consume until the next "*/" occurrence
                                self.seq_consume("*/");
                                TokenKind::Comment(self.slice().to_string())
                            }
                            _ => TokenKind::Div,
                        }
                    } else {
                        TokenKind::Div
                    }
                }
                // # keywords
                '#' => {
                    let mut found_kind: Option<TokenKind> = None;

                    let keys = [TokenKind::Define, TokenKind::Include];
                    for kind in &keys {
                        let key = kind.to_string();
                        let token_length = key.len() - 1;
                        let peeked = self.peek_n_chars(token_length);

                        if *key == peeked {
                            self.nconsume(token_length);
                            found_kind = Some(kind.clone());
                            break
                        }
                    }

                    if let Some(kind) = found_kind {
                        kind
                    } else {
                        // Otherwise we don't support #-prefixed identifiers
                        return Some(Err(LexicalError::new(
                            LexicalErrorKind::InvalidCharacter('#'),
                            self.current_span(),
                        )))
                    }
                }
                // Alphabetical characters
                ch if ch.is_alphabetic() => {
                    let mut found_kind: Option<TokenKind> = None;

                    let keys = [
                        TokenKind::Macro,
                        TokenKind::Function,
                        TokenKind::Constant,
                        TokenKind::Takes,
                        TokenKind::Returns,
                        TokenKind::Event,
                        TokenKind::NonPayable,
                        TokenKind::Payable,
                        TokenKind::Indexed,
                        TokenKind::View,
                        TokenKind::Pure,
                    ];
                    for kind in &keys {
                        if self.context == Context::MacroBody {
                            break
                        }
                        let key = kind.to_string();
                        let token_length = key.len() - 1;
                        let peeked = self.peek_n_chars(token_length);

                        if *key == peeked {
                            self.nconsume(token_length);
                            found_kind = Some(kind.clone());
                            break
                        }
                    }

                    // Check to see if the found kind is, in fact, a keyword and not the name of
                    // a function. If it is, set `found_kind` to `None` so that it is set to a
                    // `TokenKind::Ident` in the following control flow.
                    if !self.check_keyword_rules(&found_kind) {
                        found_kind = None;
                    }

                    if let Some(tokind) = &found_kind {
                        match tokind {
                            TokenKind::Macro => self.context = Context::MacroDefinition,
                            TokenKind::Function | TokenKind::Event => self.context = Context::Abi,
                            TokenKind::Constant => self.context = Context::Constant,
                            _ => (),
                        }
                    }

                    // Check for the `FREE_STORAGE_POINTER` keyword
                    let fsp = "FREE_STORAGE_POINTER";
                    let token_length = fsp.len() - 1;
                    let peeked = self.peek_n_chars(token_length);
                    if fsp == peeked {
                        self.nconsume(token_length);
                        // Consume the parenthesis following the FREE_STORAGE_POINTER
                        // Note: This will consume `FREE_STORAGE_POINTER)` or
                        // `FREE_STORAGE_POINTER(` as well
                        if let Some('(') = self.peek() {
                            self.consume();
                        }
                        if let Some(')') = self.peek() {
                            self.consume();
                        }
                        found_kind = Some(TokenKind::FreeStoragePointer);
                    }

                    let potential_label: String =
                        self.dyn_peek(|c| c.is_alphanumeric() || c == &'_' || c == &':');
                    if potential_label.ends_with(':') {
                        self.dyn_consume(|c| c.is_alphanumeric() || c == &'_' || c == &':');
                        let label = self.slice();
                        if let Some(l) = label.get(0..label.len() - 1) {
                            found_kind = Some(TokenKind::Label(l.to_string()));
                        } else {
                            tracing::error!("[huff_lexer] Fatal Label Colon Truncation!");
                        }
                    }

                    let pot_op = self.dyn_peek(|c| c.is_alphanumeric());
                    // Go over all opcodes
                    for opcode in OPCODES {
                        if self.context != Context::MacroBody {
                            break
                        }
                        if opcode == pot_op {
                            self.dyn_consume(|c| c.is_alphanumeric());
                            if let Some(o) = OPCODES_MAP.get(opcode) {
                                found_kind = Some(TokenKind::Opcode(o.to_owned()));
                            } else {
                                tracing::error!("[huff_lexer] Fatal Opcode Mapping!");
                            }
                            break
                        }
                    }

                    // Last case: we are in the ABI context, parsing an EVM type
                    if self.context == Context::AbiArgs {
                        let curr_char = self.peek()?;
                        if !['(', ')'].contains(&curr_char) {
                            self.dyn_consume(|c| c.is_alphanumeric() || *c == '[' || *c == ']');
                            // We have a type at this point; we have to know which one
                            let raw_type: &str = self.slice();
                            // Check for arrays first
                            if EVM_TYPE_ARRAY_REGEX.is_match(raw_type) {
                                // Split to get the array size and type
                                // TODO: support multi-dimensional arrays
                                let mut words: Vec<String> = Regex::new(r"\[")
                                    .unwrap()
                                    .split(raw_type)
                                    .map(|x| x.replace(']', ""))
                                    .collect();
                                // An unbounded array is treated as an array with a size of 0
                                if words[1].is_empty() {
                                    words[1] = String::from("0");
                                }
                                let arr_size: usize = words[1]
                                    .parse::<usize>()
                                    .map_err(|_| {
                                        let err = LexicalError {
                                            kind: LexicalErrorKind::InvalidArraySize(&words[1]),
                                            span: self.span,
                                        };
                                        tracing::error!("{}", format!("{:?}", err));
                                        err
                                    })
                                    .unwrap();
                                let primitive = PrimitiveEVMType::try_from(words[0].clone());
                                if let Ok(primitive) = primitive {
                                    found_kind = Some(TokenKind::ArrayType(primitive, arr_size));
                                } else {
                                    let err = LexicalError {
                                        kind: LexicalErrorKind::InvalidPrimitiveType(&words[0]),
                                        span: self.span,
                                    };
                                    tracing::error!("{}", format!("{:?}", err));
                                }
                            } else {
                                // We don't want to consider any argument names or the "indexed"
                                // keyword here.
                                let primitive = PrimitiveEVMType::try_from(raw_type.to_string());
                                if let Ok(primitive) = primitive {
                                    found_kind = Some(TokenKind::PrimitiveType(primitive));
                                }
                            }
                        }
                    }

                    if let Some(kind) = &found_kind {
                        kind.clone()
                    } else {
                        self.dyn_consume(|c| c.is_alphanumeric() || c.eq(&'_'));
                        TokenKind::Ident(self.slice().to_string())
                    }
                }
                // If it's the start of a hex literal
                ch if ch == '0' && self.peek() == Some('x') => {
                    // Consume the 'x' after '0' (separate from the `dyn_consume` below so we
                    // don't have to match 'x' in the actual hex)
                    self.consume();
                    self.dyn_consume(|c| {
                        c.is_numeric() ||
                            // Match a-f & A-F
                            matches!(c, '\u{0041}'..='\u{0046}' | '\u{0061}'..='\u{0066}')
                    });
                    self.span.start += 2; // Ignore the "0x"
                    TokenKind::Literal(str_to_bytes32(self.slice()))
                }
                '=' => TokenKind::Assign,
                '(' => {
                    if self.context == Context::Abi {
                        self.context = Context::AbiArgs;
                    }
                    TokenKind::OpenParen
                }
                ')' => {
                    if self.context == Context::AbiArgs {
                        self.context = Context::Abi;
                    }
                    TokenKind::CloseParen
                }
                '[' => TokenKind::OpenBracket,
                ']' => TokenKind::CloseBracket,
                '{' => {
                    if self.context == Context::MacroDefinition {
                        self.context = Context::MacroBody;
                    }
                    TokenKind::OpenBrace
                }
                '}' => {
                    if self.context == Context::MacroBody {
                        self.context = Context::Global;
                    }
                    TokenKind::CloseBrace
                }
                '+' => TokenKind::Add,
                '-' => TokenKind::Sub,
                '*' => TokenKind::Mul,
                '<' => TokenKind::LeftAngle,
                '>' => TokenKind::RightAngle,
                // NOTE: TokenKind::Div is lexed further up since it overlaps with comments
                ':' => TokenKind::Colon,
                ',' => TokenKind::Comma,
                '0'..='9' => {
                    self.dyn_consume(char::is_ascii_digit);
                    TokenKind::Num(self.slice().parse().unwrap())
                }
                // Lexes spaces and newlines as whitespace
                ch if ch.is_ascii_whitespace() => {
                    self.dyn_consume(char::is_ascii_whitespace);
                    TokenKind::Whitespace
                }
                // String literals
                '"' => loop {
                    match self.peek() {
                        Some('"') => {
                            self.consume();
                            let str = self.slice();
                            break TokenKind::Str(str[1..str.len() - 1].to_string())
                        }
                        Some('\\') if matches!(self.nth_peek(1), Some('\\') | Some('"')) => {
                            self.consume();
                        }
                        Some(_) => {}
                        None => {
                            self.eof = true;
                            return Some(Err(LexicalError::new(
                                LexicalErrorKind::UnexpectedEof,
                                self.span,
                            )))
                        }
                    }
                    self.consume();
                },
                // Allow string literals to be wrapped by single quotes
                '\'' => loop {
                    match self.peek() {
                        Some('\'') => {
                            self.consume();
                            let str = self.slice();
                            break TokenKind::Str(str[1..str.len() - 1].to_string())
                        }
                        Some('\\') if matches!(self.nth_peek(1), Some('\\') | Some('\'')) => {
                            self.consume();
                        }
                        Some(_) => {}
                        None => {
                            self.eof = true;
                            return Some(Err(LexicalError::new(
                                LexicalErrorKind::UnexpectedEof,
                                self.span,
                            )))
                        }
                    }
                    self.consume();
                },
                // At this point, the source code has an invalid or unsupported token
                ch => {
                    return Some(Err(LexicalError::new(
                        LexicalErrorKind::InvalidCharacter(ch),
                        self.span,
                    )))
                }
            };

            if self.peek().is_none() {
                self.eof = true;
            }

            let token = Token { kind, span: self.span };
            if token.kind != TokenKind::Whitespace {
                self.lookback = Some(token.clone());
            }

            return Some(Ok(token))
        }

        // Mark EOF
        self.eof = true;

        // If we haven't returned an EOF token, return one
        if !self.eof_returned {
            self.eof_returned = true;
            let token = Token { kind: TokenKind::Eof, span: self.span };
            if token.kind != TokenKind::Whitespace {
                self.lookback = Some(token.clone());
            }
            return Some(Ok(token))
        }

        None
    }
}
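
// A minimal usage sketch of the `Iterator` implementation: drive `next()` over a
// tiny definition and check the first lexed token. Assumes `TokenKind` derives
// `Debug` (its `PartialEq` impl is already relied upon above).
#[cfg(test)]
mod iterator_usage_sketch {
    use super::*;

    #[test]
    fn lexes_the_define_keyword_first() {
        let mut lexer = Lexer::new("#define constant OWNER = 0x01");
        let first = lexer.next().unwrap().unwrap();
        assert_eq!(first.kind, TokenKind::Define);
        // The span covers the full `#define` keyword
        assert_eq!(first.span.end - first.span.start, "#define".len());
    }
}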