//! pattern_lexer/lexer.rs
1use crate::token::{Token, Tokenizer};
2use anyhow::anyhow;
3
/// A plugin-based lexer.
///
/// It performs lexical analysis on strings using a user-supplied set of
/// [Tokenizer]s; construct one through [Lexer::builder].
pub struct Lexer<'a> {
    /// The [Tokenizer]s that describe the lexicon,
    /// sorted by priority when the lexer is built.
    tokenizers: Vec<Tokenizer<'a>>,
}
10
11impl<'a> Lexer<'a> {
12 /// Create a builder
13 ///
14 /// # Example
15 /// ```rust
16 /// # use pattern_lexer::Lexer;
17 /// # use pattern_lexer::token::{TokenKind, Tokenizer};
18 /// #
19 /// let int = Tokenizer::new(TokenKind::LITERAL("INT"), |s: &str| { s.chars().all(|c| c.is_digit(10)) });
20 /// let lex = Lexer::builder().add(int).build();
21 /// ```
22 pub fn builder() -> LexerBuilder<'a> {
23 LexerBuilder::new()
24 }
25
26 /// Return a vector of [Token] from the given `&str`
27 ///
28 /// # Errors
29 /// Return an Error if a part of the string doesn't match any token
30 ///
31 /// # Example
32 /// ```rust
33 /// # use regex::Regex;
34 /// # use pattern_lexer::Lexer;
35 /// # use pattern_lexer::token::{TokenKind, Tokenizer};
36 /// #
37 /// // Simple math lexer example
38 /// let plus = Tokenizer::new(TokenKind::OPERATOR("PLUS"), '+');
39 /// let minus = Tokenizer::new(TokenKind::OPERATOR("MINUS"), '-');
40 /// let star = Tokenizer::new(TokenKind::OPERATOR("STAR"), '*');
41 /// let slash = Tokenizer::new(TokenKind::OPERATOR("SLASH"), '/');
42 /// let equal = Tokenizer::new(TokenKind::OPERATOR("EQUAL"), '=');
43 /// let number = Tokenizer::new(TokenKind::LITERAL("NUMBER"), |s: &str| {
44 /// let mut dot_seen = false;
45 ///
46 /// for ch in s.chars() {
47 /// if !ch.is_digit(10) && (ch != '.' || dot_seen) {
48 /// return false;
49 /// } else if ch == '.' {
50 /// dot_seen = true;
51 /// }
52 /// }
53 ///
54 /// true
55 /// });
56 /// let id_regex = Regex::new(r"[a-zA-Z_$][a-zA-Z_$0-9]*").unwrap();
57 /// let id = Tokenizer::new(TokenKind::IDENTIFIER, id_regex);
58 /// let whitespace = Tokenizer::new(TokenKind::WHITESPACE("SPACE"), ' ');
59 /// let lexer = Lexer::builder()
60 /// .extend(vec![plus, minus, star, slash, equal, number, id, whitespace])
61 /// .build();
62 ///
63 /// assert!(lexer.tokenize("x_4 = 1 + 3 = 8 * 0.25").is_ok());
64 /// // Our lexer doesn't handle parenthesis...
65 /// assert!(lexer.tokenize("x_4 = (1 + 3)").is_err());
66 /// ```
67 pub fn tokenize(&self, value: &'a str) -> anyhow::Result<Vec<Token<'a>>> {
68 let mut start = 0;
69 let mut result = Vec::new();
70 // Try to find tokens until EOF or Error
71 while start < value.len() {
72 let token = self.tokenize_once(value, start)?;
73 start += token.value.len();
74 result.push(token);
75 }
76
77 Ok(result)
78 }
79
80 /// Return a [Token] for the given `&str`
81 ///
82 /// # Error
83 /// Return an Error if a part of the string doesn't match any token
84 fn tokenize_once(&self, value: &'a str, start: usize) -> anyhow::Result<Token<'a>> {
85 let mut best_token = None;
86 let mut best_size = 0;
87 // Try for each tokenizer in order
88 for tokenizer in &self.tokenizers {
89 if let Some(token) = tokenizer.tokenize(&value[start..]) {
90 if token.value.len() > best_size {
91 best_size = token.value.len();
92 best_token = Some(token);
93 }
94 }
95 }
96
97 best_token.ok_or(anyhow!("No match for '{}'", value))
98 }
99}
100
/// Builder for [Lexer]
///
/// Collects [Tokenizer]s via [LexerBuilder::add] / [LexerBuilder::extend]
/// and produces the final lexer with [LexerBuilder::build].
pub struct LexerBuilder<'a> {
    /// Temporary tokenizers for the Lexer, in insertion order
    /// (they are sorted by priority only in `build`).
    tokenizers: Vec<Tokenizer<'a>>,
}
106
107impl<'a> LexerBuilder<'a> {
108 /// Create a LexerBuilder
109 ///
110 /// # Example
111 /// ```rust
112 /// # use pattern_lexer::LexerBuilder;
113 /// #
114 /// let mut builder = LexerBuilder::new();
115 /// ```
116 pub fn new() -> Self {
117 Self {
118 tokenizers: Vec::new(),
119 }
120 }
121
122 /// Add a [Tokenizer] to the builder and return it
123 ///
124 /// # Example
125 /// ```rust
126 /// # use pattern_lexer::LexerBuilder;
127 /// # use pattern_lexer::token::{TokenKind, Tokenizer};
128 /// #
129 /// let builder = LexerBuilder::new()
130 /// .add(Tokenizer::new(TokenKind::DELIMITER("OPAREN"), '('))
131 /// .add(Tokenizer::new(TokenKind::DELIMITER("CPAREN"), ')'));
132 /// ```
133 pub fn add(mut self, tokenizer: Tokenizer<'a>) -> Self {
134 self.tokenizers.push(tokenizer);
135 self
136 }
137
138 /// Add a vector of [Tokenizer] to the builder and return it
139 ///
140 /// # Example
141 /// ```rust
142 /// # use pattern_lexer::LexerBuilder;
143 /// # use pattern_lexer::token::{TokenKind, Tokenizer};
144 /// #
145 /// let tok = vec![
146 /// Tokenizer::new(TokenKind::COMMENT("OPEN"), "/*"),
147 /// Tokenizer::new(TokenKind::DELIMITER("CLOSE"), "*/"),
148 /// ];
149 /// let builder = LexerBuilder::new().extend(tok);
150 /// ```
151 pub fn extend(mut self, tokenizers: Vec<Tokenizer<'a>>) -> Self {
152 self.tokenizers.extend(tokenizers);
153 self
154 }
155
156 /// Build a [Lexer]
157 ///
158 /// # Example
159 /// ```rust
160 /// # use pattern_lexer::LexerBuilder;
161 /// # use pattern_lexer::token::{TokenKind, Tokenizer};
162 /// #
163 /// let lexer = LexerBuilder::new()
164 /// .add(Tokenizer::new(TokenKind::WHITESPACE("ALL"), [' ', '\n', '\t', '\r']))
165 /// .build();
166 /// ```
167 pub fn build(mut self) -> Lexer<'a> {
168 // Sort tokenizers by priority
169 self.tokenizers.sort_by(|a, b| a.partial_cmp(b).unwrap());
170 Lexer {
171 tokenizers: self.tokenizers,
172 }
173 }
174}