// oak_regex/lexer/mod.rs

1use crate::language::RegexLanguage;
2use oak_core::{
3    Lexer, LexerCache, LexerState,
4    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7
8mod lex;
9
10type State<'a, S> = LexerState<'a, S, RegexLanguage>;
11
12/// Lexer for regular expressions.
13///
14/// `RegexLexer` is responsible for tokenizing regular expression source code into a series of tokens
15/// that can be used by the parser. It handles all regex syntax including character classes,
16/// quantifiers, groups, assertions, and special characters.
17///
18/// # Examples
19///
20/// Basic usage:
21///
22/// ```
23/// use oak_core::{Lexer, LexerCache, LexerState, ParseSession, SourceText};
24/// use oak_regex::{RegexLanguage, RegexLexer};
25///
26/// let language = RegexLanguage::default();
27/// let lexer = RegexLexer::new(&language);
28/// let source = SourceText::new(r"[a-z]+\d{1,3}");
29/// let mut cache = ParseSession::<RegexLanguage>::default();
30/// let output = lexer.lex(&source, &[], &mut cache);
31///
32/// // Output contains tokens for the entire source
33/// assert!(!output.result.unwrap().is_empty());
34/// ```
35///
36/// Tokenizing different regex constructs:
37///
38/// ```
39/// use oak_core::{Lexer, LexerCache, LexerState, ParseSession, SourceText};
40/// use oak_regex::{RegexLanguage, RegexLexer};
41///
42/// let language = RegexLanguage::default();
43/// let lexer = RegexLexer::new(&language);
44///
45/// // Tokenize a complex regular expression
46/// let source = SourceText::new(r"(?:(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,})");
47/// let mut cache = ParseSession::<RegexLanguage>::default();
48/// let output = lexer.lex(&source, &[], &mut cache);
49///
50/// // Verify tokens were generated
51/// assert!(output.result.unwrap().len() > 5);
52/// ```
53#[derive(Clone, Debug)]
54pub struct RegexLexer<'config> {
55    _config: &'config RegexLanguage,
56}
57
58impl<'config> Lexer<RegexLanguage> for RegexLexer<'config> {
59    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<RegexLanguage>) -> LexOutput<RegexLanguage> {
60        let mut state = State::new(source);
61        let result = self.run(&mut state);
62        if result.is_ok() {
63            state.add_eof();
64        }
65        state.finish_with_cache(result, cache)
66    }
67}
68
69impl<'config> RegexLexer<'config> {
70    /// Creates a new `RegexLexer` with the given language configuration.
71    ///
72    /// # Arguments
73    ///
74    /// * `config` - A `RegexLanguage` configuration that controls
75    ///   language-specific parsing behavior.
76    ///
77    /// # Examples
78    ///
79    /// ```
80    /// # use oak_regex::{RegexLexer, RegexLanguage};
81    ///
82    /// let language = RegexLanguage::default();
83    /// let lexer = RegexLexer::new(&language);
84    /// ```
85    pub fn new(config: &'config RegexLanguage) -> Self {
86        Self { _config: config }
87    }
88
89    /// Returns the whitespace configuration for the lexer.
90    ///
91    /// This method defines how the lexer should handle whitespace characters.
92    /// The configuration enables Unicode whitespace support, allowing the lexer
93    /// to recognize all Unicode whitespace characters, not just ASCII spaces.
94    pub fn whitespace_rules(&self) -> &WhitespaceConfig {
95        &WhitespaceConfig { unicode_whitespace: true }
96    }
97
98    /// Returns the comment configuration for the lexer.
99    ///
100    /// This method defines how the lexer should handle comments in regular expressions.
101    /// Regular expressions typically use `#` as a line comment marker, with comments
102    /// continuing to the end of the line.
103    pub fn comment_rules(&self) -> CommentConfig {
104        CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false }
105    }
106
107    /// Returns the string literal configuration for the lexer.
108    ///
109    /// This method defines how the lexer should handle string literals in regular expressions.
110    /// Regex strings are typically enclosed in double quotes (") and use backslash (\) as escape character.
111    pub fn string_rules(&self) -> StringConfig {
112        StringConfig { quotes: &['"'], escape: Some('\\') }
113    }
114
115    /// Returns the character literal configuration for the lexer.
116    ///
117    /// This method defines how the lexer should handle character literals in regular expressions.
118    /// Regex character literals are enclosed in single quotes (') and do not use escape characters
119    /// in the same way as strings.
120    pub fn char_rules(&self) -> StringConfig {
121        StringConfig { quotes: &['\''], escape: None }
122    }
123}