use crate::token::{Token, Tokenizer};
use anyhow::anyhow;

/// A plugin-based lexer.
/// It performs lexical analysis on strings.
pub struct Lexer<'a> {
    /// The [Tokenizer]s that describe the lexicon
    tokenizers: Vec<Tokenizer<'a>>,
}

impl<'a> Lexer<'a> {
    /// Create a [LexerBuilder] to configure a [Lexer]
    ///
    /// # Example
    /// ```rust
    /// use lexer::Lexer;
    /// use lexer::token::{TokenKind, Tokenizer};
    ///
    /// let int = Tokenizer::new(TokenKind::LITERAL("INT"), |s: &str| s.chars().all(|c| c.is_ascii_digit()));
    /// let lex = Lexer::builder().add(int).build();
    /// ```
    pub fn builder() -> LexerBuilder<'a> {
        LexerBuilder::new()
    }

    /// Return a vector of [Token]s lexed from the given string
    ///
    /// # Errors
    /// Returns an error if some part of the string doesn't match any tokenizer
    ///
    /// # Example
    /// ```rust
    /// use regex::Regex;
    /// use lexer::Lexer;
    /// use lexer::token::{TokenKind, Tokenizer};
    ///
    /// // Simple math lexer example
    /// let plus = Tokenizer::new(TokenKind::OPERATOR("PLUS"), '+');
    /// let minus = Tokenizer::new(TokenKind::OPERATOR("MINUS"), '-');
    /// let star = Tokenizer::new(TokenKind::OPERATOR("STAR"), '*');
    /// let slash = Tokenizer::new(TokenKind::OPERATOR("SLASH"), '/');
    /// let equal = Tokenizer::new(TokenKind::OPERATOR("EQUAL"), '=');
    /// let number = Tokenizer::new(TokenKind::LITERAL("NUMBER"), |s: &str| {
    ///   let mut dot_seen = false;
    ///
    ///   for ch in s.chars() {
    ///     if !ch.is_ascii_digit() && (ch != '.' || dot_seen) {
    ///       return false;
    ///     } else if ch == '.' {
    ///       dot_seen = true;
    ///     }
    ///   }
    ///
    ///   true
    /// });
    /// let id_regex = Regex::new(r"[a-zA-Z_$][a-zA-Z_$0-9]*").unwrap();
    /// let id = Tokenizer::new(TokenKind::IDENTIFIER, id_regex);
    /// let whitespace = Tokenizer::new(TokenKind::WHITESPACE("SPACE"), ' ');
    /// let lexer = Lexer::builder()
    ///   .extend(vec![plus, minus, star, slash, equal, number, id, whitespace])
    ///   .build();
    ///
    /// assert!(lexer.tokenize("x_4 = 1 + 3 = 8 * 0.25").is_ok());
    /// // Our lexer doesn't handle parentheses...
    /// assert!(lexer.tokenize("x_4 = (1 + 3)").is_err());
    /// ```
    pub fn tokenize(&self, value: &'a str) -> anyhow::Result<Vec<Token<'a>>> {
        let mut start = 0;
        let mut result = Vec::new();
        // Consume tokens until end of input; fail on the first unmatched span
        while start < value.len() {
            let token = self.tokenize_once(value, start)?;
            // A zero-length match would loop forever; tokenizers are
            // expected to consume at least one character
            debug_assert!(!token.value.is_empty());
            start += token.value.len();
            result.push(token);
        }

        Ok(result)
    }

    /// Return the longest [Token] matching `value` at byte offset `start`
    ///
    /// # Errors
    /// Returns an error if no tokenizer matches the remaining input
    fn tokenize_once(&self, value: &'a str, start: usize) -> anyhow::Result<Token<'a>> {
        let mut best_token = None;
        let mut best_size = 0;
        // Try every tokenizer and keep the longest match (maximal munch)
        for tokenizer in &self.tokenizers {
            if let Some(token) = tokenizer.tokenize(&value[start..]) {
                if token.value.len() > best_size {
                    best_size = token.value.len();
                    best_token = Some(token);
                }
            }
        }

        best_token.ok_or_else(|| anyhow!("No match for '{}'", &value[start..]))
    }
}
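
// A minimal sanity check for the longest-match rule implemented in
// `tokenize_once`. It assumes the `TokenKind` variants and `Tokenizer`
// constructors shown in the doc examples above, and that a predicate-based
// tokenizer matches the longest prefix satisfying the predicate.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::token::TokenKind;

    #[test]
    fn longest_match_wins() {
        let int = Tokenizer::new(TokenKind::LITERAL("INT"), |s: &str| {
            !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
        });
        let plus = Tokenizer::new(TokenKind::OPERATOR("PLUS"), '+');
        let lexer = Lexer::builder().add(int).add(plus).build();

        // "12+34" should lex as INT("12"), PLUS, INT("34"):
        // "12" is taken as one token, not two single-digit tokens.
        let tokens = lexer.tokenize("12+34").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].value, "12");
        assert_eq!(tokens[1].value, "+");
        assert_eq!(tokens[2].value, "34");
    }
}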

/// Builder for [Lexer]
pub struct LexerBuilder<'a> {
    /// Tokenizers collected so far for the [Lexer] under construction
    tokenizers: Vec<Tokenizer<'a>>,
}

impl<'a> LexerBuilder<'a> {
    /// Create a LexerBuilder
    ///
    /// # Example
    /// ```rust
    /// use lexer::LexerBuilder;
    ///
    /// let builder = LexerBuilder::new();
    /// ```
    pub fn new() -> Self {
        Self {
            tokenizers: Vec::new(),
        }
    }

    /// Add a [Tokenizer] to the builder and return the builder for chaining
    ///
    /// # Example
    /// ```rust
    /// use lexer::LexerBuilder;
    /// use lexer::token::{TokenKind, Tokenizer};
    ///
    /// let builder = LexerBuilder::new()
    ///   .add(Tokenizer::new(TokenKind::DELIMITER("OPAREN"), '('))
    ///   .add(Tokenizer::new(TokenKind::DELIMITER("CPAREN"), ')'));
    /// ```
    pub fn add(mut self, tokenizer: Tokenizer<'a>) -> Self {
        self.tokenizers.push(tokenizer);
        self
    }

    /// Add a vector of [Tokenizer]s to the builder and return the builder for chaining
    ///
    /// # Example
    /// ```rust
    /// use lexer::LexerBuilder;
    /// use lexer::token::{TokenKind, Tokenizer};
    ///
    /// let tok = vec![
    ///     Tokenizer::new(TokenKind::COMMENT("OPEN"), "/*"),
    ///     Tokenizer::new(TokenKind::COMMENT("CLOSE"), "*/"),
    /// ];
    /// let builder = LexerBuilder::new().extend(tok);
    /// ```
    pub fn extend(mut self, tokenizers: Vec<Tokenizer<'a>>) -> Self {
        self.tokenizers.extend(tokenizers);
        self
    }

    /// Build a [Lexer]
    ///
    /// # Example
    /// ```rust
    /// use lexer::LexerBuilder;
    /// use lexer::token::{TokenKind, Tokenizer};
    ///
    /// let lexer = LexerBuilder::new()
    ///   .add(Tokenizer::new(TokenKind::WHITESPACE("ALL"), [' ', '\n', '\t', '\r']))
    ///   .build();
    /// ```
    pub fn build(mut self) -> Lexer<'a> {
        // Sort tokenizers by priority
        self.tokenizers
            .sort_by(|a, b| a.partial_cmp(b).expect("tokenizer priorities must be comparable"));
        Lexer {
            tokenizers: self.tokenizers,
        }
    }
}
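
// `LexerBuilder::new` takes no arguments, so the idiomatic companion is a
// `Default` impl (this also satisfies clippy's `new_without_default` lint).
impl<'a> Default for LexerBuilder<'a> {
    fn default() -> Self {
        Self::new()
    }
}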