//! Sana is a crate that allows you to create lexers easily.
//!
//! # Example
//!
//! ```rust
//! use sana::{Sana, Spanned};
//!
//! #[derive(Debug, Clone, Copy, PartialEq, Sana)]
//! enum Token {
//!     #[regex("[a-zA-Z_][a-zA-Z0-9_]*")]
//!     Ident,
//!     #[regex("[0-9]+")]
//!     Integer,
//!
//!     #[token("let", priority = 1)]
//!     Let,
//!     #[token("=")]
//!     Equals,
//!     #[regex(";")]
//!     Semicolon,
//!
//!     #[regex("[ \t\r\n]")]
//!     Whitespace,
//!     #[error]
//!     Error,
//! }
//!
//! let mut lexer = Token::lexer("let answer = 42;");
//!
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Let, start: 0, end: 3 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Whitespace, start: 3, end: 4 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Ident, start: 4, end: 10 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Whitespace, start: 10, end: 11 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Equals, start: 11, end: 12 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Whitespace, start: 12, end: 13 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Integer, start: 13, end: 15 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Semicolon, start: 15, end: 16 })
//! );
//!
//! // No tokens left
//! assert_eq!(lexer.next(), None);
//! ```
//!
//! # Lexers – how do they work?
//!
//! A lexer is defined by a set of rules. A rule is a struct
//! `{ regex, priority, action }`, where `regex` is a regular expression, `priority`
//! is the priority of the rule, and `action` is the token emitted when the rule
//! matches.
//!
//! We say that a string `s` *matches* a regular expression `r` if `s` belongs to the
//! language (the set of strings) defined by `r`. For example, if `r = [a-z]`, then the
//! string `"a"` matches `r`, because `"a" ∈ { "a", "b", ..., "z" }`, but the string
//! `"9"` does not match `r`.
//!
//! Note that, unlike many regular expression implementations such as
//! [regex](https://docs.rs/regex), we consider only full-string matches rather
//! than substring matches. So the string `"abc"` does *not* match `[a-z]`.
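//!
//! For instance, a lexer with the single rule `[a-z]` splits the input `"abc"` into
//! three one-letter tokens, because no longer prefix of the input fully matches the
//! rule. A minimal sketch (the `Letter` enum is made up for this example):
//!
//! ```rust
//! use sana::{Sana, Spanned};
//!
//! #[derive(Debug, Clone, Copy, PartialEq, Sana)]
//! enum Letter {
//!     #[regex("[a-z]")]
//!     One,
//!     #[error]
//!     Error,
//! }
//!
//! let mut lexer = Letter::lexer("abc");
//! assert_eq!(lexer.next(), Some(Spanned { value: Letter::One, start: 0, end: 1 }));
//! assert_eq!(lexer.next(), Some(Spanned { value: Letter::One, start: 1, end: 2 }));
//! assert_eq!(lexer.next(), Some(Spanned { value: Letter::One, start: 2, end: 3 }));
//! assert_eq!(lexer.next(), None);
//! ```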
//!
//! A string `s` *matches a rule set* if the rule set contains at least one rule `r`
//! such that `r.regex` matches `s`.
//!
//! The *prefix set* of a string is the set of all its non-empty prefixes. For example,
//! the prefix set of `"let"` is `{ "l", "le", "let" }`.
//!
//! The *longest match* of a string `s` by a rule set `R` is the longest string
//! `s_max` in the prefix set of `s` such that `s_max` matches `R`.
//!
//! The job of the lexer is to find the longest match of the input by the rule set
//! and emit the action of the rule that matches it. If more than one rule matches,
//! the rule with the highest priority is selected.
//!
//! Consider the example above. The longest match of `"let answer = 42;"` by the
//! rule set is `"let"`. There are two rules that match this string:
//!
//! - `{ regex: "[a-zA-Z_][a-zA-Z0-9_]*", priority: 0, action: Ident }`
//! - `{ regex: "let", priority: 1, action: Let }`.
//!
//! Of these, the `Let` rule is selected, because it has the higher priority.
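//!
//! Note that priority only breaks ties between rules that match the same longest
//! match; it does not override the longest-match rule itself. For the input
//! `"letter"`, the longest match is the whole string, which only `Ident` matches,
//! so `Ident` is emitted even though `Let` has a higher priority. A quick check,
//! reusing the `Ident` and `Let` rules from the example above:
//!
//! ```rust
//! # use sana::{Sana, Spanned};
//! # #[derive(Debug, Clone, Copy, PartialEq, Sana)]
//! # enum Token {
//! #     #[regex("[a-zA-Z_][a-zA-Z0-9_]*")]
//! #     Ident,
//! #     #[token("let", priority = 1)]
//! #     Let,
//! #     #[error]
//! #     Error,
//! # }
//! let mut lexer = Token::lexer("letter");
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Ident, start: 0, end: 6 })
//! );
//! ```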
//!
//! Implementation-wise, the lexer generator constructs a DFA that matches strings
//! by the rule set. The generated lexer consumes the input until it reaches a
//! terminal state. While walking the DFA, the lexer remembers the last action state
//! it has seen. When a terminal state is reached, the lexer returns the action of
//! the last seen action state, or an error if it visited no action states.
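//!
//! For illustration, the matching loop can be sketched as below. This is not the
//! actual generated code, and `State`, `transition`, and `action_of` are
//! hypothetical helpers, but the control flow is the same:
//!
//! ```rust,ignore
//! fn longest_match(input: &str) -> Option<(usize, Token)> {
//!     let mut state = State::START;
//!     // The last action state seen, as (end position, action)
//!     let mut last_action: Option<(usize, Token)> = None;
//!
//!     for (pos, ch) in input.char_indices() {
//!         match transition(state, ch) {
//!             // No outgoing transition: a terminal state is reached
//!             None => break,
//!             Some(next) => {
//!                 state = next;
//!                 if let Some(action) = action_of(state) {
//!                     last_action = Some((pos + ch.len_utf8(), action));
//!                 }
//!             }
//!         }
//!     }
//!
//!     // `None` means no action state was visited, i.e. a lexing error
//!     last_action
//! }
//! ```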

pub use sana_derive::Sana;
#[doc(hidden)]
pub use sana_core::ir;

use sana_core::ir::{Op, Vm};

/// Trait implemented for an enum representing all tokens.
///
/// This trait is implemented by `#[derive(Sana)]`; you should not implement it yourself.
pub trait Sana: Sized + Clone + Copy {
    const ERROR: Self;
    #[doc(hidden)]
    const USES_VM: bool = false;

    #[doc(hidden)]
    fn ir() -> &'static [Op<Self>];

    #[doc(hidden)]
    fn lex<'input>(_cursor: &mut ir::Cursor<'input>) -> ir::VmResult<Self> {
        ir::VmResult::Eoi
    }

    /// Create a new `Lexer` that will produce tokens of this type
    fn lexer(input: &str) -> Lexer<'_, Self> {
        Lexer::new(input)
    }
}

/// The `Lexer` is an `Iterator` of tokens
#[derive(Debug, Clone)]
pub struct Lexer<'input, Token: Sana + 'static> {
    vm: Vm<'static, 'input, Token>,
}

impl<'input, Token: Sana> Lexer<'input, Token> {
    /// Create a new `Lexer` on the given input
    ///
    /// **NOTE:** for better type inference, it is preferred
    /// to use `Sana::lexer` instead
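    ///
    /// A sketch of the two equivalent ways to construct a lexer
    /// (assuming a token type `Token` that derives `Sana`):
    ///
    /// ```rust,ignore
    /// // These are equivalent; the first form usually infers better
    /// let lexer = Token::lexer(input);
    /// let lexer: Lexer<'_, Token> = Lexer::new(input);
    /// ```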
    pub fn new(input: &'input str) -> Self {
        let ir = Token::ir();
        let vm = Vm::new(ir, input);

        Lexer { vm }
    }

    /// Morph the lexer into another lexer that scans a different token type
    ///
    /// The cursor position of the new lexer is the same as the cursor
    /// position of the old lexer before the metamorphosis.
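    ///
    /// # Example
    ///
    /// A sketch of switching token types mid-stream, assuming two token
    /// enums `Outer` and `Inner` that both derive `Sana`:
    ///
    /// ```rust,ignore
    /// let mut outer = Outer::lexer(source);
    /// let first = outer.next(); // scan some `Outer` tokens
    ///
    /// // Continue scanning the same input as `Inner` tokens,
    /// // starting where the `Outer` lexer left off
    /// let mut inner: Lexer<'_, Inner> = outer.morph();
    /// let second = inner.next();
    /// ```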
    pub fn morph<Lexeme: Sana + 'static>(self) -> Lexer<'input, Lexeme> {
        let mut lexer = Lexeme::lexer(self.source());
        lexer.rewind(self.position());

        lexer
    }

    /// Set the cursor to position `pos`
    pub fn rewind(&mut self, pos: usize) {
        self.vm.cursor.rewind(pos)
    }

    /// The current position of the cursor
    pub fn position(&self) -> usize {
        self.vm.cursor.position()
    }

    /// The source string of the lexer
    pub fn source(&self) -> &'input str {
        self.vm.cursor.input
    }
}

/// A value (for example, a token) together with its range
///
/// The range includes the start but excludes the end, similar to `start..end` ranges
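///
/// The span can be used to slice the matched text out of the source string.
/// A sketch, assuming `token` came from a lexer over `source`:
///
/// ```rust,ignore
/// let Spanned { start, end, value } = token;
/// let text = &source[start..end]; // the text the token was matched from
/// ```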
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Spanned<T> {
    pub start: usize,
    pub end: usize,
    pub value: T,
}

impl<'input, Token: Sana> Iterator for Lexer<'input, Token> {
    type Item = Spanned<Token>;

    fn next(&mut self) -> Option<Self::Item> {
        use sana_core::ir::VmResult::*;

        let res =
            if Token::USES_VM { self.vm.run() }
            else { Token::lex(&mut self.vm.cursor) };

        let token = match res {
            Action { start, end, action } =>
                Spanned { start, end, value: action },
            Error { start, end } =>
                Spanned { start, end, value: Token::ERROR },
            Eoi => return None,
        };

        Some(token)
    }
}