// oak_tex/lexer/mod.rs — TeX lexer module.
1#![doc = include_str!("readme.md")]
2use crate::{language::TexLanguage, lexer::token_type::TexTokenType};
3pub mod token_type;
4use oak_core::{
5    Lexer, LexerCache, LexerState, OakError, Source, TextEdit,
6    lexer::{CommentConfig, LexOutput, WhitespaceConfig},
7};
8use std::sync::LazyLock;
9
/// Shorthand for the lexer state specialised to the TeX language.
type State<'a, S> = LexerState<'a, S, TexLanguage>;

/// Whitespace scanner configuration: any Unicode whitespace is consumed.
static TEX_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner configuration: TeX line comments start with `%`.
/// NOTE(review): `block_start`/`block_end` are empty — TeX has no block
/// comments, but confirm `CommentConfig` treats "" as "disabled" rather than
/// as an always-matching delimiter.
static TEX_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "%", block_start: "", block_end: "", nested_blocks: false });
14
/// Tokenizer for TeX source text.
///
/// Holds a borrow of the language configuration. The configuration is not
/// consulted anywhere in this module yet (hence the `_config` name); the
/// field keeps the lexer's lifetime tied to it for future use.
#[derive(Clone, Debug)]
pub struct TexLexer<'config> {
    _config: &'config TexLanguage,
}
19
20impl<'config> Lexer<TexLanguage> for TexLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<TexLanguage>) -> LexOutput<TexLanguage> {
22        let mut state = State::new_with_cache(source, 0, cache);
23        let result = self.run(&mut state);
24        if result.is_ok() {
25            state.add_eof()
26        }
27        state.finish_with_cache(result, cache)
28    }
29}
30
impl<'config> TexLexer<'config> {
    /// Creates a lexer bound to the given language configuration.
    pub fn new(config: &'config TexLanguage) -> Self {
        Self { _config: config }
    }

    /// Main scan loop: tries each token class in priority order until the
    /// end of input. The order matters (e.g. commands before special chars,
    /// `$$` before `$`, numbers before text). If no rule matches,
    /// `advance_if_dead_lock` forces forward progress past the stalled
    /// position so the loop cannot spin forever on an unrecognized byte.
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            // Remember where this iteration started so a stall can be detected.
            let safe_point = state.get_position();

            if self.skip_whitespace(state) {
                continue;
            }

            if self.skip_comment(state) {
                continue;
            }

            if self.lex_command(state) {
                continue;
            }

            if self.lex_math_delimiters(state) {
                continue;
            }

            if self.lex_braces_and_brackets(state) {
                continue;
            }

            if self.lex_special_chars(state) {
                continue;
            }

            if self.lex_number(state) {
                continue;
            }

            if self.lex_text(state) {
                continue;
            }

            state.advance_if_dead_lock(safe_point)
        }

        Ok(())
    }

    /// Consumes a run of whitespace as a single `Whitespace` token.
    /// Returns whether the scanner matched anything.
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        TEX_WHITESPACE.scan(state, TexTokenType::Whitespace)
    }

    /// Consumes a `%` line comment as a `Comment` token.
    /// Returns whether the scanner matched anything.
    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        TEX_COMMENT.scan(state, TexTokenType::Comment, TexTokenType::Comment)
    }

    /// Lexes a backslash-introduced control sequence.
    ///
    /// A backslash followed by ASCII letters is a multi-letter command
    /// (`\section`); a backslash followed by any other single character is a
    /// one-character command (`\\`, `\&`, `\$`, `\ `). Known names map to
    /// dedicated keyword/symbol token kinds; everything else becomes a
    /// generic `Command`. A backslash at end of input is emitted as a bare
    /// `Backslash` token. Returns false only when the next char is not `\`.
    fn lex_command<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if state.peek() != Some('\\') {
            return false;
        }

        state.advance(1); // consume '\'

        // Read the command name.
        let mut has_name = false;
        if let Some(ch) = state.peek() {
            if ch.is_ascii_alphabetic() {
                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphabetic() {
                        state.advance(ch.len_utf8());
                        has_name = true
                    }
                    else {
                        break;
                    }
                }
            }
            else {
                // Single non-alphabetic character command (e.g., \\, \&, \$, \ )
                state.advance(ch.len_utf8());
                has_name = true
            }
        }

        if has_name {
            let end = state.get_position();
            let text = state.get_text_in((start + 1..end).into()); // skip the leading backslash

            // Map well-known command names to dedicated token kinds;
            // anything unrecognized falls through to the generic `Command`.
            let kind = match text.as_ref() {
                "begin" => TexTokenType::BeginKeyword,
                "end" => TexTokenType::EndKeyword,
                "documentclass" => TexTokenType::DocumentclassKeyword,
                "usepackage" => TexTokenType::UsepackageKeyword,
                "section" => TexTokenType::SectionKeyword,
                "subsection" => TexTokenType::SubsectionKeyword,
                "subsubsection" => TexTokenType::SubsubsectionKeyword,
                "chapter" => TexTokenType::ChapterKeyword,
                "part" => TexTokenType::PartKeyword,
                "title" => TexTokenType::TitleKeyword,
                "author" => TexTokenType::AuthorKeyword,
                "date" => TexTokenType::DateKeyword,
                "maketitle" => TexTokenType::MaketitleKeyword,
                "tableofcontents" => TexTokenType::TableofcontentsKeyword,
                "item" => TexTokenType::ItemKeyword,
                "label" => TexTokenType::LabelKeyword,
                "ref" => TexTokenType::RefKeyword,
                "cite" => TexTokenType::CiteKeyword,
                "includegraphics" => TexTokenType::IncludegraphicsKeyword,
                "textbf" => TexTokenType::TextbfKeyword,
                "textit" => TexTokenType::TextitKeyword,
                "emph" => TexTokenType::EmphKeyword,
                "frac" => TexTokenType::Frac,
                "sqrt" => TexTokenType::Sqrt,
                "sum" => TexTokenType::Sum,
                "int" => TexTokenType::Int,
                "lim" => TexTokenType::Lim,
                "alpha" => TexTokenType::Alpha,
                "beta" => TexTokenType::Beta,
                "gamma" => TexTokenType::Gamma,
                "delta" => TexTokenType::Delta,
                "epsilon" => TexTokenType::Epsilon,
                "zeta" => TexTokenType::Zeta,
                "eta" => TexTokenType::Eta,
                "theta" => TexTokenType::Theta,
                "iota" => TexTokenType::Iota,
                "kappa" => TexTokenType::Kappa,
                "lambda" => TexTokenType::Lambda,
                "mu" => TexTokenType::Mu,
                "nu" => TexTokenType::Nu,
                "xi" => TexTokenType::Xi,
                "omicron" => TexTokenType::Omicron,
                "pi" => TexTokenType::Pi,
                "rho" => TexTokenType::Rho,
                "sigma" => TexTokenType::Sigma,
                "tau" => TexTokenType::Tau,
                "upsilon" => TexTokenType::Upsilon,
                "phi" => TexTokenType::Phi,
                "chi" => TexTokenType::Chi,
                "psi" => TexTokenType::Psi,
                "omega" => TexTokenType::Omega,
                "varepsilon" => TexTokenType::VarEpsilon,
                "vartheta" => TexTokenType::VarTheta,
                "varkappa" => TexTokenType::VarKappa,
                "varpi" => TexTokenType::VarPi,
                "varrho" => TexTokenType::VarRho,
                "varsigma" => TexTokenType::VarSigma,
                "varphi" => TexTokenType::VarPhi,
                "Gamma" => TexTokenType::UpperGamma,
                "Delta" => TexTokenType::UpperDelta,
                "Theta" => TexTokenType::UpperTheta,
                "Lambda" => TexTokenType::UpperLambda,
                "Xi" => TexTokenType::UpperXi,
                "Pi" => TexTokenType::UpperPi,
                "Sigma" => TexTokenType::UpperSigma,
                "Upsilon" => TexTokenType::UpperUpsilon,
                "Phi" => TexTokenType::UpperPhi,
                "Psi" => TexTokenType::UpperPsi,
                "Omega" => TexTokenType::UpperOmega,
                _ => TexTokenType::Command,
            };

            state.add_token(kind, start, state.get_position());
            return true;
        }

        // No command name followed: emit the lone backslash as its own token.
        state.add_token(TexTokenType::Backslash, start, state.get_position());
        true
    }

    /// Lexes inline/display math delimiters. `$$` must be tried before `$`
    /// so a display delimiter is not split into two inline delimiters.
    fn lex_math_delimiters<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if state.consume_if_starts_with("$$") {
            state.add_token(TexTokenType::DoubleDollar, start, state.get_position());
            return true;
        }

        if state.consume_if_starts_with("$") {
            state.add_token(TexTokenType::Dollar, start, state.get_position());
            return true;
        }

        false
    }

    /// Lexes a single grouping character: `{` `}` `[` `]` `(` `)`.
    /// Returns false for any other character (or at end of input).
    fn lex_braces_and_brackets<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            let kind = match ch {
                '{' => TexTokenType::LeftBrace,
                '}' => TexTokenType::RightBrace,
                '[' => TexTokenType::LeftBracket,
                ']' => TexTokenType::RightBracket,
                '(' => TexTokenType::LeftParen,
                ')' => TexTokenType::RightParen,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(kind, start, state.get_position());
            return true;
        }

        false
    }

    /// Lexes a single punctuation/operator character into its dedicated
    /// token kind. Runs after `lex_command`, so characters here are not
    /// preceded by a backslash. Returns false for unlisted characters.
    fn lex_special_chars<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            let kind = match ch {
                '&' => TexTokenType::Ampersand,
                '#' => TexTokenType::Hash,
                '^' => TexTokenType::Caret,
                '_' => TexTokenType::Underscore,
                '~' => TexTokenType::Tilde,
                '=' => TexTokenType::Equals,
                '+' => TexTokenType::Plus,
                '-' => TexTokenType::Minus,
                '*' => TexTokenType::Star,
                '/' => TexTokenType::Slash,
                '|' => TexTokenType::Pipe,
                '<' => TexTokenType::Less,
                '>' => TexTokenType::Greater,
                '!' => TexTokenType::Exclamation,
                '?' => TexTokenType::Question,
                '@' => TexTokenType::At,
                ':' => TexTokenType::Colon,
                ';' => TexTokenType::Semicolon,
                ',' => TexTokenType::Comma,
                '.' => TexTokenType::Dot,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(kind, start, state.get_position());
            return true;
        }

        false
    }

    /// Lexes a number: an ASCII digit followed by any run of digits and
    /// dots. NOTE(review): multiple dots are accepted, so `1.2.3` lexes as
    /// one `Number` token — confirm this is intended.
    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let first = match state.peek() {
            Some(c) => c,
            None => return false,
        };

        if !first.is_ascii_digit() {
            return false;
        }

        state.advance(1);
        while let Some(c) = state.peek() {
            if c.is_ascii_digit() || c == '.' {
                state.advance(1);
            }
            else {
                break;
            }
        }

        state.add_token(TexTokenType::Number, start, state.get_position());
        true
    }

    /// Lexes a text run as an `Identifier`: a Unicode-alphabetic character
    /// followed by any alphanumeric characters. Acts as the fallback for
    /// ordinary prose once all the rules above have declined.
    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_alphabetic() {
                state.advance(ch.len_utf8());
                while let Some(c) = state.peek() {
                    if c.is_alphanumeric() {
                        state.advance(c.len_utf8());
                    }
                    else {
                        break;
                    }
                }
                state.add_token(TexTokenType::Identifier, start, state.get_position());
                return true;
            }
        }

        false
    }
}