//! Sana is a crate that allows you to create lexers easily.
//!
//! # Example
//!
//! ```rust
//! use sana::{Sana, Spanned};
//!
//! #[derive(Debug, Clone, Copy, PartialEq, Sana)]
//! enum Token {
//!     #[regex("[a-zA-Z_][a-zA-Z0-9_]*")]
//!     Ident,
//!     #[regex("[0-9]+")]
//!     Integer,
//!
//!     #[token("let", priority = 1)]
//!     Let,
//!     #[token("=")]
//!     Equals,
//!     #[regex(";")]
//!     Semicolon,
//!
//!     #[regex("[ \t\r\n]")]
//!     Whitespace,
//!     #[error]
//!     Error,
//! }
//!
//! let mut lexer = Token::lexer("let answer = 42;");
//!
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Let, start: 0, end: 3 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Whitespace, start: 3, end: 4 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Ident, start: 4, end: 10 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Whitespace, start: 10, end: 11 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Equals, start: 11, end: 12 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Whitespace, start: 12, end: 13 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Integer, start: 13, end: 15 })
//! );
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Semicolon, start: 15, end: 16 })
//! );
//!
//! // No tokens left
//! assert_eq!(
//!     lexer.next(),
//!     None
//! );
//! ```
//!
//! # Lexers – how do they work?
//!
//! A lexer is defined by a set of rules. A rule is a struct
//! `{ regex, priority, action }` where `regex` is a regular expression, `priority`
//! is the priority of the rule, and `action` is the value (for example, a token)
//! emitted when the rule matches.
//!
//! We say that a string `s` *matches* a regular expression `r` if `s` belongs to the
//! language (a set of strings) defined by `r`. For example, if `r = [a-z]`, then the
//! string `"a"` matches `r`, because `"a" ∈ { "a", "b", ..., "z" }`, but the string
//! `"9"` does not match `r`.
//!
//! Note that, unlike many regular expression implementations such as
//! [regex](https://docs.rs/regex), we consider only full-string matches rather
//! than substring matches. So the string `"abc"` does *not* match `[a-z]`.
//!
//! A string `s` *matches a rule set* if the rule set contains at least one rule `r`
//! such that `r.regex` matches `s`.
//!
//! The *prefix set* of a string is the set of all prefixes of that string. For example,
//! the prefix set of `"let"` is `{ "l", "le", "let" }`.
//!
//! The *longest match* of the string `s` by a rule set `R` is the longest string
//! `s_max` in the prefix set of `s` such that `s_max` matches `R`.
//!
//! The job of the lexer is to find the longest match of the rule set and emit the
//! action of the rule that matches it. If more than one rule matches the longest
//! match, the rule with the highest priority is selected.
//!
//! Consider the example above. The longest match of `"let answer = 42;"` is `"let"`.
//! There are two rules that match this string:
//!
//! - `{ regex: "[a-zA-Z_][a-zA-Z0-9_]*", priority: 0, action: Ident }`
//! - `{ regex: "let", priority: 1, action: Let }`
//!
//! Of these, the `Let` rule is selected because it has the higher priority.
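//!
//! To make the longest-match rule concrete, here is a small sketch that reuses a
//! trimmed-down version of the `Token` enum above. For the input `"letters"`, the
//! prefix `"let"` matches both rules, but the longest matching prefix is the whole
//! word, which only the `Ident` rule matches, so `Ident` is emitted despite `Let`'s
//! higher priority:
//!
//! ```rust
//! # use sana::{Sana, Spanned};
//! # #[derive(Debug, Clone, Copy, PartialEq, Sana)]
//! # enum Token {
//! #     #[regex("[a-zA-Z_][a-zA-Z0-9_]*")]
//! #     Ident,
//! #     #[token("let", priority = 1)]
//! #     Let,
//! #     #[error]
//! #     Error,
//! # }
//! let mut lexer = Token::lexer("letters");
//!
//! // The longest match is the whole word `"letters"`, not the prefix `"let"`,
//! // so the lower-priority `Ident` rule is the one that applies
//! assert_eq!(
//!     lexer.next(),
//!     Some(Spanned { value: Token::Ident, start: 0, end: 7 })
//! );
//! ```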
//!
//! Implementation-wise, the lexer generator constructs a DFA to match a string by a
//! rule set. The generated lexer consumes the input until it reaches a terminal
//! state. While walking the DFA, the lexer remembers the last seen action state.
//! When the terminal state is reached, the lexer returns the last action state, or
//! an error if the lexer visited no action states.

pub use sana_derive::Sana;
#[doc(hidden)]
pub use sana_core::ir;

use sana_core::ir::{Op, Vm};

/// Trait implemented for an enum representing all tokens.
///
/// This trait is implemented by `#[derive(Sana)]`. You should not implement it
/// yourself.
pub trait Sana: Sized + Clone + Copy {
    const ERROR: Self;

    #[doc(hidden)]
    const USES_VM: bool = false;

    #[doc(hidden)]
    fn ir() -> &'static [Op<Self>];

    #[doc(hidden)]
    fn lex<'input>(_cursor: &mut ir::Cursor<'input>) -> ir::VmResult<Self> {
        ir::VmResult::Eoi
    }

    /// Create a new `Lexer` that will produce tokens of this type
    fn lexer(input: &str) -> Lexer<'_, Self> {
        Lexer::new(input)
    }
}

/// The `Lexer` is an `Iterator` of tokens
#[derive(Debug, Clone)]
pub struct Lexer<'input, Token: Sana + 'static> {
    vm: Vm<'static, 'input, Token>,
}

impl<'input, Token: Sana> Lexer<'input, Token> {
    /// Create a new `Lexer` on the given input
    ///
    /// **NOTE:** for better type inference it's preferred
    /// to use `Sana::lexer` instead
    pub fn new(input: &'input str) -> Self {
        let ir = Token::ir();
        let vm = Vm::new(ir, input);

        Lexer { vm }
    }

    /// Morph the lexer into another lexer, which scans a different token type
    ///
    /// The cursor position of the new lexer is the same as the cursor
    /// position of the old lexer before the metamorphosis
    pub fn morph<Lexeme: Sana + 'static>(self) -> Lexer<'input, Lexeme> {
        let mut lexer = Lexeme::lexer(self.source());
        lexer.rewind(self.position());

        lexer
    }

    /// Set the cursor at position `pos`
    pub fn rewind(&mut self, pos: usize) {
        self.vm.cursor.rewind(pos)
    }

    /// The current position of the cursor
    pub fn position(&self) -> usize {
        self.vm.cursor.position()
    }

    /// The source string of the lexer
    pub fn source(&self) -> &'input str {
        self.vm.cursor.input
    }
}

/// A value (for example, a token) together with its range
///
/// The range includes the start but excludes the end, similar to `start..end` ranges
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Spanned<T> {
    pub start: usize,
    pub end: usize,
    pub value: T,
}

impl<'input, Token: Sana> Iterator for Lexer<'input, Token> {
    type Item = Spanned<Token>;

    fn next(&mut self) -> Option<Self::Item> {
        use sana_core::ir::VmResult::*;

        // Run either the IR virtual machine or the generated `lex` function,
        // depending on how the token type was derived
        let res = if Token::USES_VM {
            self.vm.run()
        } else {
            Token::lex(&mut self.vm.cursor)
        };

        let token = match res {
            Action { start, end, action } => Spanned { start, end, value: action },
            Error { start, end } => Spanned { start, end, value: Token::ERROR },
            Eoi => return None,
        };

        Some(token)
    }
}
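
// A minimal sketch (not part of the original crate) illustrating the documented
// `Spanned` semantics: since the span is the half-open range `start..end`, it
// can be used directly to slice the matched text out of the source string.
#[cfg(test)]
mod span_tests {
    use super::Spanned;

    #[test]
    fn spanned_is_a_half_open_range() {
        let source = "let answer = 42;";
        // A hand-built span for the `Let` keyword from the module-level example
        let token = Spanned { start: 0, end: 3, value: () };

        assert_eq!(&source[token.start..token.end], "let");
    }
}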