oak_tex/lexer/
mod.rs

1use crate::{kind::TexSyntaxKind, language::TexLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to the TeX language.
type State<S> = LexerState<S, TexLanguage>;

/// Shared whitespace scanner; Unicode whitespace is recognised, not just ASCII.
static TEX_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Line-comment description for TeX (`%` to end of line).
/// NOTE(review): currently unused — `skip_comment` hand-rolls the `%` scan;
/// consider wiring this up or removing it.
static TEX_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["%"] });
13
/// Incremental lexer for TeX source, borrowing its language configuration.
#[derive(Clone)]
pub struct TexLexer<'config> {
    // Language configuration supplied at construction.
    // NOTE(review): no method in this file reads this field; presumably it is
    // reserved for configuration-driven behaviour — confirm before removing.
    config: &'config TexLanguage,
}
18
19impl<'config> Lexer<TexLanguage> for TexLexer<'config> {
20    fn lex_incremental(
21        &self,
22        source: impl Source,
23        changed: usize,
24        cache: IncrementalCache<TexLanguage>,
25    ) -> LexOutput<TexLanguage> {
26        let mut state = LexerState::new_with_cache(source, changed, cache);
27        let result = self.run(&mut state);
28        state.finish(result)
29    }
30}
31
32impl<'config> TexLexer<'config> {
33    pub fn new(config: &'config TexLanguage) -> Self {
34        Self { config }
35    }
36
37    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
38        while state.not_at_end() {
39            let safe_point = state.get_position();
40
41            if self.skip_whitespace(state) {
42                continue;
43            }
44
45            if self.skip_comment(state) {
46                continue;
47            }
48
49            if self.lex_command(state) {
50                continue;
51            }
52
53            if self.lex_math_delimiters(state) {
54                continue;
55            }
56
57            if self.lex_braces_and_brackets(state) {
58                continue;
59            }
60
61            if self.lex_special_chars(state) {
62                continue;
63            }
64
65            if self.lex_number(state) {
66                continue;
67            }
68
69            if self.lex_text(state) {
70                continue;
71            }
72
73            state.safe_check(safe_point);
74        }
75
76        // 添加 EOF token
77        let eof_pos = state.get_position();
78        state.add_token(TexSyntaxKind::Eof, eof_pos, eof_pos);
79        Ok(())
80    }
81
82    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
83        match TEX_WHITESPACE.scan(state.rest(), state.get_position(), TexSyntaxKind::Whitespace) {
84            Some(token) => {
85                state.advance_with(token);
86                true
87            }
88            None => false,
89        }
90    }
91
92    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
93        let start = state.get_position();
94        let rest = state.rest();
95
96        // TeX 行注释: % ... 直到换行
97        if rest.starts_with("%") {
98            state.advance(1);
99            while let Some(ch) = state.peek() {
100                if ch == '\n' || ch == '\r' {
101                    break;
102                }
103                state.advance(ch.len_utf8());
104            }
105            state.add_token(TexSyntaxKind::Comment, start, state.get_position());
106            return true;
107        }
108        false
109    }
110
111    fn lex_command<S: Source>(&self, state: &mut State<S>) -> bool {
112        let start = state.get_position();
113
114        if state.current() != Some('\\') {
115            return false;
116        }
117
118        state.advance(1); // consume '\'
119
120        // 读取命令名
121        let mut has_name = false;
122        while let Some(ch) = state.current() {
123            if ch.is_ascii_alphabetic() {
124                state.advance(1);
125                has_name = true;
126            }
127            else {
128                break;
129            }
130        }
131
132        if has_name {
133            let end = state.get_position();
134            let text = state.get_text_in((start + 1..end).into()); // 跳过反斜杠
135
136            let kind = match text {
137                "begin" => TexSyntaxKind::BeginKeyword,
138                "end" => TexSyntaxKind::EndKeyword,
139                "documentclass" => TexSyntaxKind::DocumentclassKeyword,
140                "usepackage" => TexSyntaxKind::UsepackageKeyword,
141                "section" => TexSyntaxKind::SectionKeyword,
142                "subsection" => TexSyntaxKind::SubsectionKeyword,
143                "subsubsection" => TexSyntaxKind::SubsubsectionKeyword,
144                "chapter" => TexSyntaxKind::ChapterKeyword,
145                "part" => TexSyntaxKind::PartKeyword,
146                "title" => TexSyntaxKind::TitleKeyword,
147                "author" => TexSyntaxKind::AuthorKeyword,
148                "date" => TexSyntaxKind::DateKeyword,
149                "maketitle" => TexSyntaxKind::MaketitleKeyword,
150                "tableofcontents" => TexSyntaxKind::TableofcontentsKeyword,
151                "item" => TexSyntaxKind::ItemKeyword,
152                "label" => TexSyntaxKind::LabelKeyword,
153                "ref" => TexSyntaxKind::RefKeyword,
154                "cite" => TexSyntaxKind::CiteKeyword,
155                "includegraphics" => TexSyntaxKind::IncludegraphicsKeyword,
156                "textbf" => TexSyntaxKind::TextbfKeyword,
157                "textit" => TexSyntaxKind::TextitKeyword,
158                "emph" => TexSyntaxKind::EmphKeyword,
159                "frac" => TexSyntaxKind::Frac,
160                "sqrt" => TexSyntaxKind::Sqrt,
161                "sum" => TexSyntaxKind::Sum,
162                "int" => TexSyntaxKind::Int,
163                "lim" => TexSyntaxKind::Lim,
164                "alpha" => TexSyntaxKind::Alpha,
165                "beta" => TexSyntaxKind::Beta,
166                "gamma" => TexSyntaxKind::Gamma,
167                "delta" => TexSyntaxKind::Delta,
168                "epsilon" => TexSyntaxKind::Epsilon,
169                _ => TexSyntaxKind::Command,
170            };
171
172            state.add_token(kind, start, state.get_position());
173            return true;
174        }
175
176        // 如果没有命令名,只是一个反斜杠
177        state.add_token(TexSyntaxKind::Backslash, start, state.get_position());
178        true
179    }
180
181    fn lex_math_delimiters<S: Source>(&self, state: &mut State<S>) -> bool {
182        let start = state.get_position();
183        let rest = state.rest();
184
185        if rest.starts_with("$$") {
186            state.advance(2);
187            state.add_token(TexSyntaxKind::DoubleDollar, start, state.get_position());
188            return true;
189        }
190
191        if rest.starts_with("$") {
192            state.advance(1);
193            state.add_token(TexSyntaxKind::Dollar, start, state.get_position());
194            return true;
195        }
196
197        false
198    }
199
200    fn lex_braces_and_brackets<S: Source>(&self, state: &mut State<S>) -> bool {
201        let start = state.get_position();
202
203        if let Some(ch) = state.current() {
204            let kind = match ch {
205                '{' => TexSyntaxKind::LeftBrace,
206                '}' => TexSyntaxKind::RightBrace,
207                '[' => TexSyntaxKind::LeftBracket,
208                ']' => TexSyntaxKind::RightBracket,
209                '(' => TexSyntaxKind::LeftParen,
210                ')' => TexSyntaxKind::RightParen,
211                _ => return false,
212            };
213
214            state.advance(ch.len_utf8());
215            state.add_token(kind, start, state.get_position());
216            return true;
217        }
218
219        false
220    }
221
222    fn lex_special_chars<S: Source>(&self, state: &mut State<S>) -> bool {
223        let start = state.get_position();
224
225        if let Some(ch) = state.current() {
226            let kind = match ch {
227                '&' => TexSyntaxKind::Ampersand,
228                '#' => TexSyntaxKind::Hash,
229                '^' => TexSyntaxKind::Caret,
230                '_' => TexSyntaxKind::Underscore,
231                '~' => TexSyntaxKind::Tilde,
232                '=' => TexSyntaxKind::Equal,
233                '+' => TexSyntaxKind::Plus,
234                '-' => TexSyntaxKind::Minus,
235                '*' => TexSyntaxKind::Star,
236                '/' => TexSyntaxKind::Slash,
237                '|' => TexSyntaxKind::Pipe,
238                '<' => TexSyntaxKind::Less,
239                '>' => TexSyntaxKind::Greater,
240                '!' => TexSyntaxKind::Exclamation,
241                '?' => TexSyntaxKind::Question,
242                '@' => TexSyntaxKind::At,
243                ':' => TexSyntaxKind::Colon,
244                ';' => TexSyntaxKind::Semicolon,
245                ',' => TexSyntaxKind::Comma,
246                '.' => TexSyntaxKind::Dot,
247                _ => return false,
248            };
249
250            state.advance(ch.len_utf8());
251            state.add_token(kind, start, state.get_position());
252            return true;
253        }
254
255        false
256    }
257
258    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
259        let start = state.get_position();
260        let first = match state.current() {
261            Some(c) => c,
262            None => return false,
263        };
264
265        if !first.is_ascii_digit() {
266            return false;
267        }
268
269        state.advance(1);
270        while let Some(c) = state.current() {
271            if c.is_ascii_digit() || c == '.' {
272                state.advance(1);
273            }
274            else {
275                break;
276            }
277        }
278
279        state.add_token(TexSyntaxKind::Number, start, state.get_position());
280        true
281    }
282
283    fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
284        let start = state.get_position();
285
286        if let Some(ch) = state.current() {
287            if ch.is_ascii_alphabetic() {
288                state.advance(1);
289                while let Some(c) = state.current() {
290                    if c.is_ascii_alphanumeric() {
291                        state.advance(1);
292                    }
293                    else {
294                        break;
295                    }
296                }
297                state.add_token(TexSyntaxKind::Identifier, start, state.get_position());
298                return true;
299            }
300        }
301
302        false
303    }
304}