
oak_tex/lexer/mod.rs

use crate::{kind::TexSyntaxKind, language::TexLanguage};
use oak_core::{
    Lexer, LexerCache, LexerState, OakError,
    lexer::{CommentConfig, LexOutput, WhitespaceConfig},
    source::Source,
};
use std::sync::LazyLock;

type State<'a, S> = LexerState<'a, S, TexLanguage>;

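// Shared scanner configuration: `%` starts a line comment in TeX and there is no
// block comment form, so the block delimiters are left empty.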
static TEX_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
static TEX_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "%", block_start: "", block_end: "", nested_blocks: false });

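/// Lexer for TeX source, built on the shared `oak_core` lexer infrastructure.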
#[derive(Clone, Debug)]
pub struct TexLexer<'config> {
    _config: &'config TexLanguage,
}

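// `Lexer` trait entry point: build a lexer state over the source, run the token loop,
// append an EOF token on success, and return the output through the cache.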
impl<'config> Lexer<TexLanguage> for TexLexer<'config> {
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<TexLanguage>) -> LexOutput<TexLanguage> {
        let mut state = State::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}

impl<'config> TexLexer<'config> {
    pub fn new(config: &'config TexLanguage) -> Self {
        Self { _config: config }
    }

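    // Main token loop: each branch tries to lex one token kind and reports whether it
    // consumed input. If no branch matches, `advance_if_dead_lock` is presumably the
    // escape hatch that forces progress past the stuck position.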
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let safe_point = state.get_position();

            if self.skip_whitespace(state) {
                continue;
            }

            if self.skip_comment(state) {
                continue;
            }

            if self.lex_command(state) {
                continue;
            }

            if self.lex_math_delimiters(state) {
                continue;
            }

            if self.lex_braces_and_brackets(state) {
                continue;
            }

            if self.lex_special_chars(state) {
                continue;
            }

            if self.lex_number(state) {
                continue;
            }

            if self.lex_text(state) {
                continue;
            }

            state.advance_if_dead_lock(safe_point);
        }

        Ok(())
    }

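    // Whitespace and comments are delegated to the shared `oak_core` scanners
    // configured at the top of this module.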
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        TEX_WHITESPACE.scan(state, TexSyntaxKind::Whitespace)
    }

    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        TEX_COMMENT.scan(state, TexSyntaxKind::Comment, TexSyntaxKind::Comment)
    }

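    // A command is a backslash followed by either a run of ASCII letters (`\section`)
    // or a single non-letter character (`\\`, `\&`, `\$`). Known names map to dedicated
    // keyword kinds; anything else becomes a generic `Command` token.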
    fn lex_command<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if state.peek() != Some('\\') {
            return false;
        }

        state.advance(1); // consume '\'

        // Read the command name
        let mut has_name = false;
        if let Some(ch) = state.peek() {
            if ch.is_ascii_alphabetic() {
                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphabetic() {
                        state.advance(ch.len_utf8());
                        has_name = true;
                    }
                    else {
                        break;
                    }
                }
            }
            else {
                // Single non-alphabetic character command (e.g., \\, \&, \$, \ )
                state.advance(ch.len_utf8());
                has_name = true;
            }
        }

        if has_name {
            let end = state.get_position();
            let text = state.get_text_in((start + 1..end).into()); // skip the backslash

            let kind = match text.as_ref() {
                "begin" => TexSyntaxKind::BeginKeyword,
                "end" => TexSyntaxKind::EndKeyword,
                "documentclass" => TexSyntaxKind::DocumentclassKeyword,
                "usepackage" => TexSyntaxKind::UsepackageKeyword,
                "section" => TexSyntaxKind::SectionKeyword,
                "subsection" => TexSyntaxKind::SubsectionKeyword,
                "subsubsection" => TexSyntaxKind::SubsubsectionKeyword,
                "chapter" => TexSyntaxKind::ChapterKeyword,
                "part" => TexSyntaxKind::PartKeyword,
                "title" => TexSyntaxKind::TitleKeyword,
                "author" => TexSyntaxKind::AuthorKeyword,
                "date" => TexSyntaxKind::DateKeyword,
                "maketitle" => TexSyntaxKind::MaketitleKeyword,
                "tableofcontents" => TexSyntaxKind::TableofcontentsKeyword,
                "item" => TexSyntaxKind::ItemKeyword,
                "label" => TexSyntaxKind::LabelKeyword,
                "ref" => TexSyntaxKind::RefKeyword,
                "cite" => TexSyntaxKind::CiteKeyword,
                "includegraphics" => TexSyntaxKind::IncludegraphicsKeyword,
                "textbf" => TexSyntaxKind::TextbfKeyword,
                "textit" => TexSyntaxKind::TextitKeyword,
                "emph" => TexSyntaxKind::EmphKeyword,
                "frac" => TexSyntaxKind::Frac,
                "sqrt" => TexSyntaxKind::Sqrt,
                "sum" => TexSyntaxKind::Sum,
                "int" => TexSyntaxKind::Int,
                "lim" => TexSyntaxKind::Lim,
                "alpha" => TexSyntaxKind::Alpha,
                "beta" => TexSyntaxKind::Beta,
                "gamma" => TexSyntaxKind::Gamma,
                "delta" => TexSyntaxKind::Delta,
                "epsilon" => TexSyntaxKind::Epsilon,
                "zeta" => TexSyntaxKind::Zeta,
                "eta" => TexSyntaxKind::Eta,
                "theta" => TexSyntaxKind::Theta,
                "iota" => TexSyntaxKind::Iota,
                "kappa" => TexSyntaxKind::Kappa,
                "lambda" => TexSyntaxKind::Lambda,
                "mu" => TexSyntaxKind::Mu,
                "nu" => TexSyntaxKind::Nu,
                "xi" => TexSyntaxKind::Xi,
                "omicron" => TexSyntaxKind::Omicron,
                "pi" => TexSyntaxKind::Pi,
                "rho" => TexSyntaxKind::Rho,
                "sigma" => TexSyntaxKind::Sigma,
                "tau" => TexSyntaxKind::Tau,
                "upsilon" => TexSyntaxKind::Upsilon,
                "phi" => TexSyntaxKind::Phi,
                "chi" => TexSyntaxKind::Chi,
                "psi" => TexSyntaxKind::Psi,
                "omega" => TexSyntaxKind::Omega,
                "varepsilon" => TexSyntaxKind::VarEpsilon,
                "vartheta" => TexSyntaxKind::VarTheta,
                "varkappa" => TexSyntaxKind::VarKappa,
                "varpi" => TexSyntaxKind::VarPi,
                "varrho" => TexSyntaxKind::VarRho,
                "varsigma" => TexSyntaxKind::VarSigma,
                "varphi" => TexSyntaxKind::VarPhi,
                "Gamma" => TexSyntaxKind::UpperGamma,
                "Delta" => TexSyntaxKind::UpperDelta,
                "Theta" => TexSyntaxKind::UpperTheta,
                "Lambda" => TexSyntaxKind::UpperLambda,
                "Xi" => TexSyntaxKind::UpperXi,
                "Pi" => TexSyntaxKind::UpperPi,
                "Sigma" => TexSyntaxKind::UpperSigma,
                "Upsilon" => TexSyntaxKind::UpperUpsilon,
                "Phi" => TexSyntaxKind::UpperPhi,
                "Psi" => TexSyntaxKind::UpperPsi,
                "Omega" => TexSyntaxKind::UpperOmega,
                _ => TexSyntaxKind::Command,
            };

            state.add_token(kind, start, state.get_position());
            return true;
        }

        // No command name followed the backslash: emit it as a bare backslash token
        state.add_token(TexSyntaxKind::Backslash, start, state.get_position());
        true
    }

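    // Math delimiters: `$$` is checked before `$` so a display-math delimiter is not
    // lexed as two inline-math tokens.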
    fn lex_math_delimiters<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if state.consume_if_starts_with("$$") {
            state.add_token(TexSyntaxKind::DoubleDollar, start, state.get_position());
            return true;
        }

        if state.consume_if_starts_with("$") {
            state.add_token(TexSyntaxKind::Dollar, start, state.get_position());
            return true;
        }

        false
    }

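    // Grouping delimiters: braces, brackets, and parentheses, each emitted as a
    // single-character token.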
    fn lex_braces_and_brackets<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            let kind = match ch {
                '{' => TexSyntaxKind::LeftBrace,
                '}' => TexSyntaxKind::RightBrace,
                '[' => TexSyntaxKind::LeftBracket,
                ']' => TexSyntaxKind::RightBracket,
                '(' => TexSyntaxKind::LeftParen,
                ')' => TexSyntaxKind::RightParen,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(kind, start, state.get_position());
            return true;
        }

        false
    }

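    // Remaining single-character punctuation and operator tokens.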
    fn lex_special_chars<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            let kind = match ch {
                '&' => TexSyntaxKind::Ampersand,
                '#' => TexSyntaxKind::Hash,
                '^' => TexSyntaxKind::Caret,
                '_' => TexSyntaxKind::Underscore,
                '~' => TexSyntaxKind::Tilde,
                '=' => TexSyntaxKind::Equals,
                '+' => TexSyntaxKind::Plus,
                '-' => TexSyntaxKind::Minus,
                '*' => TexSyntaxKind::Star,
                '/' => TexSyntaxKind::Slash,
                '|' => TexSyntaxKind::Pipe,
                '<' => TexSyntaxKind::Less,
                '>' => TexSyntaxKind::Greater,
                '!' => TexSyntaxKind::Exclamation,
                '?' => TexSyntaxKind::Question,
                '@' => TexSyntaxKind::At,
                ':' => TexSyntaxKind::Colon,
                ';' => TexSyntaxKind::Semicolon,
                ',' => TexSyntaxKind::Comma,
                '.' => TexSyntaxKind::Dot,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(kind, start, state.get_position());
            return true;
        }

        false
    }

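    // A number is an ASCII digit followed by any run of digits and dots; signs and
    // exponents are not handled here.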
    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let first = match state.peek() {
            Some(c) => c,
            None => return false,
        };

        if !first.is_ascii_digit() {
            return false;
        }

        state.advance(1);
        while let Some(c) = state.peek() {
            if c.is_ascii_digit() || c == '.' {
                state.advance(1);
            }
            else {
                break;
            }
        }

        state.add_token(TexSyntaxKind::Number, start, state.get_position());
        true
    }

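    // Plain text: an alphabetic start character followed by alphanumerics, emitted as
    // a single `Identifier` token.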
    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_alphabetic() {
                state.advance(ch.len_utf8());
                while let Some(c) = state.peek() {
                    if c.is_alphanumeric() {
                        state.advance(c.len_utf8());
                    }
                    else {
                        break;
                    }
                }
                state.add_token(TexSyntaxKind::Identifier, start, state.get_position());
                return true;
            }
        }

        false
    }
}