highlighter_core/
lexer.rs

use regex::{CaptureLocations, Error, Regex};

use crate::language::{Language, Scope};

/// A single token produced by the lexer.
#[derive(Clone, Debug, PartialEq)]
pub struct Token {
    /// The scope of this token.
    pub scope: Scope,

    /// The raw value of this token.
    pub value: String,
}

/// A context for processing tokens.
pub struct TokenContext {
    /// The tokens in the context.
    tokens: Vec<Token>,
}

impl TokenContext {
    /// Inserts a token into the token context.
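    ///
    /// A sketch of use from inside a token handler, where `ctx` is the
    /// `&mut TokenContext` passed in (`Scope::Keyword` is an illustrative
    /// variant, not necessarily defined in `crate::language`):
    ///
    /// ```ignore
    /// ctx.token(Scope::Keyword, "fn");
    /// ```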
    pub fn token<Str: Into<String>>(&mut self, scope: Scope, value: Str) {
        self.tokens.push(Token { scope, value: value.into() });
    }
}

/// A function which handles a match and outputs the corresponding tokens.
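///
/// A sketch of a handler (illustrative: `Scope::Keyword` is an assumed scope
/// variant, and the registered pattern is assumed to define capture group 1):
///
/// ```ignore
/// fn keyword_handler(caps: CaptureLocations, input: &str, ctx: &mut TokenContext) {
///     if let Some((start, end)) = caps.get(1) {
///         ctx.token(Scope::Keyword, &input[start..end]);
///     }
/// }
/// ```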
pub type TokenHandler = fn(CaptureLocations, &str, &mut TokenContext);

/// A pattern paired with the handler that processes its matches.
struct HandledPattern {
    /// The pattern to match.
    regex: Regex,

    /// The handler for the pattern.
    handler: TokenHandler,
}

/// A pattern in the Highlighter.
enum Pattern {
    /// A plain pattern whose matches map directly to a scope.
    Plain(Scope, Regex),

    /// A pattern with a handler.
    Handled(HandledPattern),
}

/// A context used by the lexer to match tokens.
pub struct LexerContext {
    /// The patterns this lexer context can match, tried in registration order.
    patterns: Vec<Pattern>,
}

impl LexerContext {
    /// Creates an empty lexer context.
    #[inline]
    fn new() -> Result<Self, Error> {
        Ok(Self { patterns: Vec::new() })
    }

    /// Registers a plain pattern whose matches map directly to `scope`.
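    ///
    /// A sketch (the pattern and `Scope::Keyword` variant are illustrative):
    ///
    /// ```ignore
    /// ctx.token(Scope::Keyword, r"\b(?:fn|let|pub)\b")?;
    /// ```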
    pub fn token<Str: Into<String>>(&mut self, scope: Scope, pattern: Str) -> Result<(), Error> {
        self.patterns.push(Pattern::Plain(scope, Regex::new(&pattern.into())?));
        Ok(())
    }

    /// Registers a pattern whose matches are processed by `handler`.
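    ///
    /// A sketch pairing a capture group with a handler (the pattern and
    /// `Scope::Function` variant are illustrative; a non-capturing closure
    /// coerces to `TokenHandler`):
    ///
    /// ```ignore
    /// ctx.advanced_token(r"(\w+)\s*\(", |caps, input, tokens| {
    ///     if let Some((start, end)) = caps.get(1) {
    ///         tokens.token(Scope::Function, &input[start..end]);
    ///     }
    /// })?;
    /// ```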
    pub fn advanced_token<Str: Into<String>>(&mut self, pattern: Str, handler: TokenHandler) -> Result<(), Error> {
        self.patterns.push(Pattern::Handled(HandledPattern { regex: Regex::new(&pattern.into())?, handler }));
        Ok(())
    }
}

/// A lexer for the selected language.
pub struct Lexer {
    /// The lexer context populated by the language's patterns.
    ctx: LexerContext,
}

impl Lexer {
    /// Creates a lexer, initialized for the selected language.
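    ///
    /// A sketch, assuming some type `Rust` in this crate implements
    /// `Language` (the name is illustrative):
    ///
    /// ```ignore
    /// let lexer = Lexer::new(Rust)?;
    /// ```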
    pub fn new<L: Language>(language: L) -> Result<Self, Error> {
        let mut ctx = LexerContext::new()?;

        language.init(&mut ctx)?;

        Ok(Self { ctx })
    }

    /// Tokenizes a string.
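    ///
    /// Characters that no pattern matches are emitted as `Scope::None`
    /// tokens. A sketch, continuing the `Lexer::new` example above:
    ///
    /// ```ignore
    /// let tokens = lexer.lex("fn main() {}");
    /// for token in &tokens {
    ///     println!("{:?}: {:?}", token.scope, token.value);
    /// }
    /// ```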
    pub fn lex(&self, input: &str) -> Vec<Token> {
        let mut i = 0;
        let mut tokens = TokenContext { tokens: Vec::new() };

        'input_iter: while i < input.len() {
            for pattern in &self.ctx.patterns {
                match pattern {
                    Pattern::Plain(scope, regex) => {
                        let mut captures = regex.capture_locations();

                        if let Some(m) = regex.captures_read_at(&mut captures, input, i) {
                            // Only accept a match anchored at the current
                            // position, and skip zero-width matches, which
                            // would otherwise loop forever.
                            if m.start() != i || m.end() == i {
                                continue;
                            }

                            i = m.end();
                            tokens.token(*scope, m.as_str());
                            continue 'input_iter;
                        }
                    },
                    Pattern::Handled(pattern) => {
                        let regex = &pattern.regex;
                        let mut captures = regex.capture_locations();

                        if let Some(m) = regex.captures_read_at(&mut captures, input, i) {
                            if m.start() != i || m.end() == i {
                                continue;
                            }

                            i = m.end();
                            // Pass the full input; the capture locations are
                            // byte offsets into it.
                            (pattern.handler)(captures, input, &mut tokens);
                            continue 'input_iter;
                        }
                    },
                }
            }

            // No pattern matched at `i`: emit the next character unscoped.
            // `i` is a byte index, so take the char from a slice and advance
            // by its UTF-8 width rather than by one byte.
            let c = input[i..].chars().next().unwrap();
            tokens.token(Scope::None, c);
            i += c.len_utf8();
        }

        tokens.tokens
    }
}