oak_regex/lexer/mod.rs
1use crate::language::RegexLanguage;
2use oak_core::{
3 Lexer, LexerCache, LexerState,
4 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7
8mod lex;
9
10type State<'a, S> = LexerState<'a, S, RegexLanguage>;
11
12/// Lexer for regular expressions.
13///
14/// `RegexLexer` is responsible for tokenizing regular expression source code into a series of tokens
15/// that can be used by the parser. It handles all regex syntax including character classes,
16/// quantifiers, groups, assertions, and special characters.
17///
18/// # Examples
19///
20/// Basic usage:
21///
22/// ```
23/// use oak_core::{Lexer, LexerCache, LexerState, ParseSession, SourceText};
24/// use oak_regex::{RegexLanguage, RegexLexer};
25///
26/// let language = RegexLanguage::default();
27/// let lexer = RegexLexer::new(&language);
28/// let source = SourceText::new(r"[a-z]+\d{1,3}");
29/// let mut cache = ParseSession::<RegexLanguage>::default();
30/// let output = lexer.lex(&source, &[], &mut cache);
31///
32/// // Output contains tokens for the entire source
33/// assert!(!output.result.unwrap().is_empty());
34/// ```
35///
36/// Tokenizing different regex constructs:
37///
38/// ```
39/// use oak_core::{Lexer, LexerCache, LexerState, ParseSession, SourceText};
40/// use oak_regex::{RegexLanguage, RegexLexer};
41///
42/// let language = RegexLanguage::default();
43/// let lexer = RegexLexer::new(&language);
44///
45/// // Tokenize a complex regular expression
46/// let source = SourceText::new(r"(?:(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,})");
47/// let mut cache = ParseSession::<RegexLanguage>::default();
48/// let output = lexer.lex(&source, &[], &mut cache);
49///
50/// // Verify tokens were generated
51/// assert!(output.result.unwrap().len() > 5);
52/// ```
53#[derive(Clone, Debug)]
54pub struct RegexLexer<'config> {
55 _config: &'config RegexLanguage,
56}
57
58impl<'config> Lexer<RegexLanguage> for RegexLexer<'config> {
59 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<RegexLanguage>) -> LexOutput<RegexLanguage> {
60 let mut state = State::new(source);
61 let result = self.run(&mut state);
62 if result.is_ok() {
63 state.add_eof();
64 }
65 state.finish_with_cache(result, cache)
66 }
67}
68
69impl<'config> RegexLexer<'config> {
70 /// Creates a new `RegexLexer` with the given language configuration.
71 ///
72 /// # Arguments
73 ///
74 /// * `config` - A `RegexLanguage` configuration that controls
75 /// language-specific parsing behavior.
76 ///
77 /// # Examples
78 ///
79 /// ```
80 /// # use oak_regex::{RegexLexer, RegexLanguage};
81 ///
82 /// let language = RegexLanguage::default();
83 /// let lexer = RegexLexer::new(&language);
84 /// ```
85 pub fn new(config: &'config RegexLanguage) -> Self {
86 Self { _config: config }
87 }
88
89 /// Returns the whitespace configuration for the lexer.
90 ///
91 /// This method defines how the lexer should handle whitespace characters.
92 /// The configuration enables Unicode whitespace support, allowing the lexer
93 /// to recognize all Unicode whitespace characters, not just ASCII spaces.
94 pub fn whitespace_rules(&self) -> &WhitespaceConfig {
95 &WhitespaceConfig { unicode_whitespace: true }
96 }
97
98 /// Returns the comment configuration for the lexer.
99 ///
100 /// This method defines how the lexer should handle comments in regular expressions.
101 /// Regular expressions typically use `#` as a line comment marker, with comments
102 /// continuing to the end of the line.
103 pub fn comment_rules(&self) -> CommentConfig {
104 CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false }
105 }
106
107 /// Returns the string literal configuration for the lexer.
108 ///
109 /// This method defines how the lexer should handle string literals in regular expressions.
110 /// Regex strings are typically enclosed in double quotes (") and use backslash (\) as escape character.
111 pub fn string_rules(&self) -> StringConfig {
112 StringConfig { quotes: &['"'], escape: Some('\\') }
113 }
114
115 /// Returns the character literal configuration for the lexer.
116 ///
117 /// This method defines how the lexer should handle character literals in regular expressions.
118 /// Regex character literals are enclosed in single quotes (') and do not use escape characters
119 /// in the same way as strings.
120 pub fn char_rules(&self) -> StringConfig {
121 StringConfig { quotes: &['\''], escape: None }
122 }
123}