1use crate::{kind::TexSyntaxKind, language::TexLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError,
4 lexer::{CommentConfig, LexOutput, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
/// Shorthand for the framework lexer state specialized to TeX.
type State<'a, S> = LexerState<'a, S, TexLanguage>;

/// Whitespace scanner config: `unicode_whitespace: true` means any Unicode
/// whitespace (not just ASCII) is collapsed into `Whitespace` tokens.
static TEX_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner config: `%` opens a line comment. TeX has no block
/// comments, so the block markers are left empty and nesting is disabled.
static TEX_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "%", block_start: "", block_end: "", nested_blocks: false });
13
/// Lexer for TeX source text.
///
/// Borrows the [`TexLanguage`] configuration for `'config`; the field is not
/// read anywhere in this file (hence the `_` prefix) — presumably retained so
/// future options can steer lexing without an API break (TODO confirm).
#[derive(Clone, Debug)]
pub struct TexLexer<'config> {
    // Unused by the current implementation; see struct docs.
    _config: &'config TexLanguage,
}
18
19impl<'config> Lexer<TexLanguage> for TexLexer<'config> {
20 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<TexLanguage>) -> LexOutput<TexLanguage> {
21 let mut state = State::new_with_cache(source, 0, cache);
22 let result = self.run(&mut state);
23 if result.is_ok() {
24 state.add_eof();
25 }
26 state.finish_with_cache(result, cache)
27 }
28}
29
30impl<'config> TexLexer<'config> {
31 pub fn new(config: &'config TexLanguage) -> Self {
32 Self { _config: config }
33 }
34
35 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
36 while state.not_at_end() {
37 let safe_point = state.get_position();
38
39 if self.skip_whitespace(state) {
40 continue;
41 }
42
43 if self.skip_comment(state) {
44 continue;
45 }
46
47 if self.lex_command(state) {
48 continue;
49 }
50
51 if self.lex_math_delimiters(state) {
52 continue;
53 }
54
55 if self.lex_braces_and_brackets(state) {
56 continue;
57 }
58
59 if self.lex_special_chars(state) {
60 continue;
61 }
62
63 if self.lex_number(state) {
64 continue;
65 }
66
67 if self.lex_text(state) {
68 continue;
69 }
70
71 state.advance_if_dead_lock(safe_point);
72 }
73
74 Ok(())
75 }
76
77 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
78 TEX_WHITESPACE.scan(state, TexSyntaxKind::Whitespace)
79 }
80
81 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
82 TEX_COMMENT.scan(state, TexSyntaxKind::Comment, TexSyntaxKind::Comment)
83 }
84
85 fn lex_command<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
86 let start = state.get_position();
87
88 if state.peek() != Some('\\') {
89 return false;
90 }
91
92 state.advance(1); let mut has_name = false;
96 if let Some(ch) = state.peek() {
97 if ch.is_ascii_alphabetic() {
98 while let Some(ch) = state.peek() {
99 if ch.is_ascii_alphabetic() {
100 state.advance(ch.len_utf8());
101 has_name = true;
102 }
103 else {
104 break;
105 }
106 }
107 }
108 else {
109 state.advance(ch.len_utf8());
111 has_name = true;
112 }
113 }
114
115 if has_name {
116 let end = state.get_position();
117 let text = state.get_text_in((start + 1..end).into()); let kind = match text.as_ref() {
120 "begin" => TexSyntaxKind::BeginKeyword,
121 "end" => TexSyntaxKind::EndKeyword,
122 "documentclass" => TexSyntaxKind::DocumentclassKeyword,
123 "usepackage" => TexSyntaxKind::UsepackageKeyword,
124 "section" => TexSyntaxKind::SectionKeyword,
125 "subsection" => TexSyntaxKind::SubsectionKeyword,
126 "subsubsection" => TexSyntaxKind::SubsubsectionKeyword,
127 "chapter" => TexSyntaxKind::ChapterKeyword,
128 "part" => TexSyntaxKind::PartKeyword,
129 "title" => TexSyntaxKind::TitleKeyword,
130 "author" => TexSyntaxKind::AuthorKeyword,
131 "date" => TexSyntaxKind::DateKeyword,
132 "maketitle" => TexSyntaxKind::MaketitleKeyword,
133 "tableofcontents" => TexSyntaxKind::TableofcontentsKeyword,
134 "item" => TexSyntaxKind::ItemKeyword,
135 "label" => TexSyntaxKind::LabelKeyword,
136 "ref" => TexSyntaxKind::RefKeyword,
137 "cite" => TexSyntaxKind::CiteKeyword,
138 "includegraphics" => TexSyntaxKind::IncludegraphicsKeyword,
139 "textbf" => TexSyntaxKind::TextbfKeyword,
140 "textit" => TexSyntaxKind::TextitKeyword,
141 "emph" => TexSyntaxKind::EmphKeyword,
142 "frac" => TexSyntaxKind::Frac,
143 "sqrt" => TexSyntaxKind::Sqrt,
144 "sum" => TexSyntaxKind::Sum,
145 "int" => TexSyntaxKind::Int,
146 "lim" => TexSyntaxKind::Lim,
147 "alpha" => TexSyntaxKind::Alpha,
148 "beta" => TexSyntaxKind::Beta,
149 "gamma" => TexSyntaxKind::Gamma,
150 "delta" => TexSyntaxKind::Delta,
151 "epsilon" => TexSyntaxKind::Epsilon,
152 "zeta" => TexSyntaxKind::Zeta,
153 "eta" => TexSyntaxKind::Eta,
154 "theta" => TexSyntaxKind::Theta,
155 "iota" => TexSyntaxKind::Iota,
156 "kappa" => TexSyntaxKind::Kappa,
157 "lambda" => TexSyntaxKind::Lambda,
158 "mu" => TexSyntaxKind::Mu,
159 "nu" => TexSyntaxKind::Nu,
160 "xi" => TexSyntaxKind::Xi,
161 "omicron" => TexSyntaxKind::Omicron,
162 "pi" => TexSyntaxKind::Pi,
163 "rho" => TexSyntaxKind::Rho,
164 "sigma" => TexSyntaxKind::Sigma,
165 "tau" => TexSyntaxKind::Tau,
166 "upsilon" => TexSyntaxKind::Upsilon,
167 "phi" => TexSyntaxKind::Phi,
168 "chi" => TexSyntaxKind::Chi,
169 "psi" => TexSyntaxKind::Psi,
170 "omega" => TexSyntaxKind::Omega,
171 "varepsilon" => TexSyntaxKind::VarEpsilon,
172 "vartheta" => TexSyntaxKind::VarTheta,
173 "varkappa" => TexSyntaxKind::VarKappa,
174 "varpi" => TexSyntaxKind::VarPi,
175 "varrho" => TexSyntaxKind::VarRho,
176 "varsigma" => TexSyntaxKind::VarSigma,
177 "varphi" => TexSyntaxKind::VarPhi,
178 "Gamma" => TexSyntaxKind::UpperGamma,
179 "Delta" => TexSyntaxKind::UpperDelta,
180 "Theta" => TexSyntaxKind::UpperTheta,
181 "Lambda" => TexSyntaxKind::UpperLambda,
182 "Xi" => TexSyntaxKind::UpperXi,
183 "Pi" => TexSyntaxKind::UpperPi,
184 "Sigma" => TexSyntaxKind::UpperSigma,
185 "Upsilon" => TexSyntaxKind::UpperUpsilon,
186 "Phi" => TexSyntaxKind::UpperPhi,
187 "Psi" => TexSyntaxKind::UpperPsi,
188 "Omega" => TexSyntaxKind::UpperOmega,
189 _ => TexSyntaxKind::Command,
190 };
191
192 state.add_token(kind, start, state.get_position());
193 return true;
194 }
195
196 state.add_token(TexSyntaxKind::Backslash, start, state.get_position());
198 true
199 }
200
201 fn lex_math_delimiters<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
202 let start = state.get_position();
203
204 if state.consume_if_starts_with("$$") {
205 state.add_token(TexSyntaxKind::DoubleDollar, start, state.get_position());
206 return true;
207 }
208
209 if state.consume_if_starts_with("$") {
210 state.add_token(TexSyntaxKind::Dollar, start, state.get_position());
211 return true;
212 }
213
214 false
215 }
216
217 fn lex_braces_and_brackets<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
218 let start = state.get_position();
219
220 if let Some(ch) = state.peek() {
221 let kind = match ch {
222 '{' => TexSyntaxKind::LeftBrace,
223 '}' => TexSyntaxKind::RightBrace,
224 '[' => TexSyntaxKind::LeftBracket,
225 ']' => TexSyntaxKind::RightBracket,
226 '(' => TexSyntaxKind::LeftParen,
227 ')' => TexSyntaxKind::RightParen,
228 _ => return false,
229 };
230
231 state.advance(ch.len_utf8());
232 state.add_token(kind, start, state.get_position());
233 return true;
234 }
235
236 false
237 }
238
239 fn lex_special_chars<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
240 let start = state.get_position();
241
242 if let Some(ch) = state.peek() {
243 let kind = match ch {
244 '&' => TexSyntaxKind::Ampersand,
245 '#' => TexSyntaxKind::Hash,
246 '^' => TexSyntaxKind::Caret,
247 '_' => TexSyntaxKind::Underscore,
248 '~' => TexSyntaxKind::Tilde,
249 '=' => TexSyntaxKind::Equals,
250 '+' => TexSyntaxKind::Plus,
251 '-' => TexSyntaxKind::Minus,
252 '*' => TexSyntaxKind::Star,
253 '/' => TexSyntaxKind::Slash,
254 '|' => TexSyntaxKind::Pipe,
255 '<' => TexSyntaxKind::Less,
256 '>' => TexSyntaxKind::Greater,
257 '!' => TexSyntaxKind::Exclamation,
258 '?' => TexSyntaxKind::Question,
259 '@' => TexSyntaxKind::At,
260 ':' => TexSyntaxKind::Colon,
261 ';' => TexSyntaxKind::Semicolon,
262 ',' => TexSyntaxKind::Comma,
263 '.' => TexSyntaxKind::Dot,
264 _ => return false,
265 };
266
267 state.advance(ch.len_utf8());
268 state.add_token(kind, start, state.get_position());
269 return true;
270 }
271
272 false
273 }
274
275 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
276 let start = state.get_position();
277 let first = match state.peek() {
278 Some(c) => c,
279 None => return false,
280 };
281
282 if !first.is_ascii_digit() {
283 return false;
284 }
285
286 state.advance(1);
287 while let Some(c) = state.peek() {
288 if c.is_ascii_digit() || c == '.' {
289 state.advance(1);
290 }
291 else {
292 break;
293 }
294 }
295
296 state.add_token(TexSyntaxKind::Number, start, state.get_position());
297 true
298 }
299
300 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
301 let start = state.get_position();
302
303 if let Some(ch) = state.peek() {
304 if ch.is_alphabetic() {
305 state.advance(ch.len_utf8());
306 while let Some(c) = state.peek() {
307 if c.is_alphanumeric() {
308 state.advance(c.len_utf8());
309 }
310 else {
311 break;
312 }
313 }
314 state.add_token(TexSyntaxKind::Identifier, start, state.get_position());
315 return true;
316 }
317 }
318
319 false
320 }
321}