pattern_lexer/
lexer.rs

use crate::token::{Token, Tokenizer};
use anyhow::anyhow;

/// A plugin-based lexer.
/// It performs lexical analysis on strings.
pub struct Lexer<'a> {
    /// The [Tokenizer]s that describe the lexicon
    tokenizers: Vec<Tokenizer<'a>>,
}

impl<'a> Lexer<'a> {
    /// Creates a [LexerBuilder]
    ///
    /// # Example
    /// ```rust
    /// # use pattern_lexer::Lexer;
    /// # use pattern_lexer::token::{TokenKind, Tokenizer};
    /// #
    /// let int = Tokenizer::new(TokenKind::LITERAL("INT"), |s: &str| s.chars().all(|c| c.is_digit(10)));
    /// let lex = Lexer::builder().add(int).build();
    /// ```
    pub fn builder() -> LexerBuilder<'a> {
        LexerBuilder::new()
    }

    /// Returns a vector of [Token]s from the given `&str`
    ///
    /// # Errors
    /// Returns an error if part of the string doesn't match any tokenizer
    ///
    /// # Example
    /// ```rust
    /// # use regex::Regex;
    /// # use pattern_lexer::Lexer;
    /// # use pattern_lexer::token::{TokenKind, Tokenizer};
    /// #
    /// // Simple math lexer example
    /// let plus = Tokenizer::new(TokenKind::OPERATOR("PLUS"), '+');
    /// let minus = Tokenizer::new(TokenKind::OPERATOR("MINUS"), '-');
    /// let star = Tokenizer::new(TokenKind::OPERATOR("STAR"), '*');
    /// let slash = Tokenizer::new(TokenKind::OPERATOR("SLASH"), '/');
    /// let equal = Tokenizer::new(TokenKind::OPERATOR("EQUAL"), '=');
    /// let number = Tokenizer::new(TokenKind::LITERAL("NUMBER"), |s: &str| {
    ///   let mut dot_seen = false;
    ///
    ///   for ch in s.chars() {
    ///     if !ch.is_digit(10) && (ch != '.' || dot_seen) {
    ///       return false;
    ///     } else if ch == '.' {
    ///       dot_seen = true;
    ///     }
    ///   }
    ///
    ///   true
    /// });
    /// let id_regex = Regex::new(r"[a-zA-Z_$][a-zA-Z_$0-9]*").unwrap();
    /// let id = Tokenizer::new(TokenKind::IDENTIFIER, id_regex);
    /// let whitespace = Tokenizer::new(TokenKind::WHITESPACE("SPACE"), ' ');
    /// let lexer = Lexer::builder()
    ///   .extend(vec![plus, minus, star, slash, equal, number, id, whitespace])
    ///   .build();
    ///
    /// assert!(lexer.tokenize("x_4 = 1 + 3 = 8 * 0.25").is_ok());
    /// // Our lexer doesn't handle parentheses...
    /// assert!(lexer.tokenize("x_4 = (1 + 3)").is_err());
    /// ```
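    ///
    /// Note that nothing is skipped implicitly: every character, including
    /// whitespace, must be covered by some [Tokenizer] (hence the `whitespace`
    /// tokenizer above), and the matching tokens appear in the returned vector.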
    pub fn tokenize(&self, value: &'a str) -> anyhow::Result<Vec<Token<'a>>> {
        let mut start = 0;
        let mut result = Vec::new();
        // Try to find tokens until EOF or error
        while start < value.len() {
            let token = self.tokenize_once(value, start)?;
            start += token.value.len();
            result.push(token);
        }

        Ok(result)
    }

    /// Returns the next [Token] in `value`, starting at byte offset `start`
    ///
    /// # Errors
    /// Returns an error if the remainder of the string doesn't match any tokenizer
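    ///
    /// When several tokenizers match, the longest match wins (maximal munch);
    /// on a tie, the earlier (higher-priority) tokenizer wins. For example,
    /// given tokenizers for both `=` and `==`, the input `"=="` would yield a
    /// single `==` token rather than two `=` tokens.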
    fn tokenize_once(&self, value: &'a str, start: usize) -> anyhow::Result<Token<'a>> {
        let mut best_token = None;
        let mut best_size = 0;
        // Try every tokenizer and keep the longest match
        for tokenizer in &self.tokenizers {
            if let Some(token) = tokenizer.tokenize(&value[start..]) {
                if token.value.len() > best_size {
                    best_size = token.value.len();
                    best_token = Some(token);
                }
            }
        }

        best_token.ok_or_else(|| anyhow!("No match for '{}'", &value[start..]))
    }
}

/// Builder for [Lexer]
pub struct LexerBuilder<'a> {
    /// Tokenizers collected so far for the [Lexer]
    tokenizers: Vec<Tokenizer<'a>>,
}

impl<'a> LexerBuilder<'a> {
    /// Creates a `LexerBuilder`
    ///
    /// # Example
    /// ```rust
    /// # use pattern_lexer::LexerBuilder;
    /// #
    /// let builder = LexerBuilder::new();
    /// ```
    pub fn new() -> Self {
        Self {
            tokenizers: Vec::new(),
        }
    }

    /// Adds a [Tokenizer] to the builder and returns the builder
    ///
    /// # Example
    /// ```rust
    /// # use pattern_lexer::LexerBuilder;
    /// # use pattern_lexer::token::{TokenKind, Tokenizer};
    /// #
    /// let builder = LexerBuilder::new()
    ///   .add(Tokenizer::new(TokenKind::DELIMITER("OPAREN"), '('))
    ///   .add(Tokenizer::new(TokenKind::DELIMITER("CPAREN"), ')'));
    /// ```
    pub fn add(mut self, tokenizer: Tokenizer<'a>) -> Self {
        self.tokenizers.push(tokenizer);
        self
    }

137
138    /// Add a vector of [Tokenizer] to the builder and return it
139    ///
140    /// # Example
141    /// ```rust
142    /// # use pattern_lexer::LexerBuilder;
143    /// # use pattern_lexer::token::{TokenKind, Tokenizer};
144    /// #
145    /// let tok = vec![
146    ///     Tokenizer::new(TokenKind::COMMENT("OPEN"), "/*"),
147    ///     Tokenizer::new(TokenKind::DELIMITER("CLOSE"), "*/"),
148    /// ];
149    /// let builder = LexerBuilder::new().extend(tok);
150    /// ```
151    pub fn extend(mut self, tokenizers: Vec<Tokenizer<'a>>) -> Self {
152        self.tokenizers.extend(tokenizers);
153        self
154    }

    /// Builds a [Lexer]
    ///
    /// # Example
    /// ```rust
    /// # use pattern_lexer::LexerBuilder;
    /// # use pattern_lexer::token::{TokenKind, Tokenizer};
    /// #
    /// let lexer = LexerBuilder::new()
    ///   .add(Tokenizer::new(TokenKind::WHITESPACE("ALL"), [' ', '\n', '\t', '\r']))
    ///   .build();
    /// ```
    pub fn build(mut self) -> Lexer<'a> {
        // Sort tokenizers by priority; `tokenize_once` breaks ties on match
        // length in favor of earlier tokenizers
        self.tokenizers.sort_by(|a, b| a.partial_cmp(b).unwrap());
        Lexer {
            tokenizers: self.tokenizers,
        }
    }
}
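
// A minimal end-to-end smoke test, sketched after the doc examples above.
// The `TokenKind` variants and matcher forms (char and closure) mirror those
// examples and are assumptions about the `token` module's API; the expected
// token count also assumes the closure matcher takes the longest matching
// prefix, as the NUMBER example suggests.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::token::{TokenKind, Tokenizer};

    #[test]
    fn tokenizes_simple_arithmetic() {
        let lexer = Lexer::builder()
            .add(Tokenizer::new(TokenKind::OPERATOR("PLUS"), '+'))
            .add(Tokenizer::new(TokenKind::LITERAL("INT"), |s: &str| {
                s.chars().all(|c| c.is_digit(10))
            }))
            .add(Tokenizer::new(TokenKind::WHITESPACE("SPACE"), ' '))
            .build();

        // "1 + 23" should yield five tokens: INT, SPACE, PLUS, SPACE, INT.
        let tokens = lexer.tokenize("1 + 23").expect("input should tokenize");
        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].value, "1");
        assert_eq!(tokens[4].value, "23");

        // '?' is covered by no tokenizer, so tokenizing must fail.
        assert!(lexer.tokenize("1 ? 2").is_err());
    }
}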