use crate::{kind::TexSyntaxKind, language::TexLanguage};
use oak_core::{
    IncrementalCache, Lexer, LexerState, OakError,
    lexer::{CommentLine, LexOutput, WhitespaceConfig},
    source::Source,
};
use std::sync::LazyLock;

type State<S> = LexerState<S, TexLanguage>;

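// Shared scanner configuration: whitespace matching accepts Unicode whitespace,
// and `%` is declared as the line-comment marker.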
static TEX_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
static TEX_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["%"] });

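/// Hand-written lexer for TeX/LaTeX source, parameterised by a borrowed
/// [`TexLanguage`] configuration.
///
/// A minimal usage sketch (assuming `TexLanguage` implements `Default`):
///
/// ```ignore
/// let language = TexLanguage::default();
/// let lexer = TexLexer::new(&language);
/// ```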
#[derive(Clone)]
pub struct TexLexer<'config> {
    config: &'config TexLanguage,
}

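/// Incremental entry point: seeds a `LexerState` with the previous `cache` and
/// the changed offset, runs the main loop, and packages the result into a
/// `LexOutput`.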
impl<'config> Lexer<TexLanguage> for TexLexer<'config> {
    fn lex_incremental(
        &self,
        source: impl Source,
        changed: usize,
        cache: IncrementalCache<TexLanguage>,
    ) -> LexOutput<TexLanguage> {
        let mut state = LexerState::new_with_cache(source, changed, cache);
        let result = self.run(&mut state);
        state.finish(result)
    }
}

impl<'config> TexLexer<'config> {
    pub fn new(config: &'config TexLanguage) -> Self {
        Self { config }
    }

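    /// Main lexing loop. Each iteration tries the sub-lexers in order
    /// (whitespace, comment, command, math delimiters, braces/brackets,
    /// special characters, number, text) and restarts on the first match;
    /// `safe_check` runs when nothing matched so the loop cannot stall on
    /// unrecognised input. A final `Eof` token is emitted at the end.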
    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let safe_point = state.get_position();

            if self.skip_whitespace(state) {
                continue;
            }

            if self.skip_comment(state) {
                continue;
            }

            if self.lex_command(state) {
                continue;
            }

            if self.lex_math_delimiters(state) {
                continue;
            }

            if self.lex_braces_and_brackets(state) {
                continue;
            }

            if self.lex_special_chars(state) {
                continue;
            }

            if self.lex_number(state) {
                continue;
            }

            if self.lex_text(state) {
                continue;
            }

            state.safe_check(safe_point);
        }

        let eof_pos = state.get_position();
        state.add_token(TexSyntaxKind::Eof, eof_pos, eof_pos);
        Ok(())
    }

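    /// Consumes a run of whitespace (Unicode-aware, per `TEX_WHITESPACE`) and
    /// records it as a single `Whitespace` token.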
    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
        match TEX_WHITESPACE.scan(state.rest(), state.get_position(), TexSyntaxKind::Whitespace) {
            Some(token) => {
                state.advance_with(token);
                true
            }
            None => false,
        }
    }

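    /// Consumes a `%` comment up to, but not including, the line break and
    /// records it as a `Comment` token.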
    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
        let start = state.get_position();
        let rest = state.rest();

        if rest.starts_with("%") {
            state.advance(1);
            while let Some(ch) = state.peek() {
                if ch == '\n' || ch == '\r' {
                    break;
                }
                state.advance(ch.len_utf8());
            }
            state.add_token(TexSyntaxKind::Comment, start, state.get_position());
            return true;
        }
        false
    }

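    /// Lexes a control sequence introduced by `\`. A run of ASCII letters forms
    /// the command name; well-known names (`begin`, `section`, `frac`, ...) map
    /// to dedicated keyword kinds, anything else becomes a generic `Command`.
    /// A backslash not followed by a letter is emitted as a bare `Backslash`.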
    fn lex_command<S: Source>(&self, state: &mut State<S>) -> bool {
        let start = state.get_position();

        if state.current() != Some('\\') {
            return false;
        }

        state.advance(1);
        let mut has_name = false;
        while let Some(ch) = state.current() {
            if ch.is_ascii_alphabetic() {
                state.advance(1);
                has_name = true;
            }
            else {
                break;
            }
        }

        if has_name {
            let end = state.get_position();
            let text = state.get_text_in((start + 1..end).into());
            let kind = match text {
                "begin" => TexSyntaxKind::BeginKeyword,
                "end" => TexSyntaxKind::EndKeyword,
                "documentclass" => TexSyntaxKind::DocumentclassKeyword,
                "usepackage" => TexSyntaxKind::UsepackageKeyword,
                "section" => TexSyntaxKind::SectionKeyword,
                "subsection" => TexSyntaxKind::SubsectionKeyword,
                "subsubsection" => TexSyntaxKind::SubsubsectionKeyword,
                "chapter" => TexSyntaxKind::ChapterKeyword,
                "part" => TexSyntaxKind::PartKeyword,
                "title" => TexSyntaxKind::TitleKeyword,
                "author" => TexSyntaxKind::AuthorKeyword,
                "date" => TexSyntaxKind::DateKeyword,
                "maketitle" => TexSyntaxKind::MaketitleKeyword,
                "tableofcontents" => TexSyntaxKind::TableofcontentsKeyword,
                "item" => TexSyntaxKind::ItemKeyword,
                "label" => TexSyntaxKind::LabelKeyword,
                "ref" => TexSyntaxKind::RefKeyword,
                "cite" => TexSyntaxKind::CiteKeyword,
                "includegraphics" => TexSyntaxKind::IncludegraphicsKeyword,
                "textbf" => TexSyntaxKind::TextbfKeyword,
                "textit" => TexSyntaxKind::TextitKeyword,
                "emph" => TexSyntaxKind::EmphKeyword,
                "frac" => TexSyntaxKind::Frac,
                "sqrt" => TexSyntaxKind::Sqrt,
                "sum" => TexSyntaxKind::Sum,
                "int" => TexSyntaxKind::Int,
                "lim" => TexSyntaxKind::Lim,
                "alpha" => TexSyntaxKind::Alpha,
                "beta" => TexSyntaxKind::Beta,
                "gamma" => TexSyntaxKind::Gamma,
                "delta" => TexSyntaxKind::Delta,
                "epsilon" => TexSyntaxKind::Epsilon,
                _ => TexSyntaxKind::Command,
            };

            state.add_token(kind, start, state.get_position());
            return true;
        }

        state.add_token(TexSyntaxKind::Backslash, start, state.get_position());
        true
    }

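    /// Lexes math delimiters, checking `$$` (display math) before `$` (inline
    /// math) so the two-character form wins.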
    fn lex_math_delimiters<S: Source>(&self, state: &mut State<S>) -> bool {
        let start = state.get_position();
        let rest = state.rest();

        if rest.starts_with("$$") {
            state.advance(2);
            state.add_token(TexSyntaxKind::DoubleDollar, start, state.get_position());
            return true;
        }

        if rest.starts_with("$") {
            state.advance(1);
            state.add_token(TexSyntaxKind::Dollar, start, state.get_position());
            return true;
        }

        false
    }

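    /// Lexes a single grouping character: `{`, `}`, `[`, `]`, `(`, or `)`.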
    fn lex_braces_and_brackets<S: Source>(&self, state: &mut State<S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.current() {
            let kind = match ch {
                '{' => TexSyntaxKind::LeftBrace,
                '}' => TexSyntaxKind::RightBrace,
                '[' => TexSyntaxKind::LeftBracket,
                ']' => TexSyntaxKind::RightBracket,
                '(' => TexSyntaxKind::LeftParen,
                ')' => TexSyntaxKind::RightParen,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(kind, start, state.get_position());
            return true;
        }

        false
    }

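    /// Lexes single-character punctuation and operator symbols (`&`, `#`, `^`,
    /// `_`, `~`, arithmetic and comparison signs, and common punctuation).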
    fn lex_special_chars<S: Source>(&self, state: &mut State<S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.current() {
            let kind = match ch {
                '&' => TexSyntaxKind::Ampersand,
                '#' => TexSyntaxKind::Hash,
                '^' => TexSyntaxKind::Caret,
                '_' => TexSyntaxKind::Underscore,
                '~' => TexSyntaxKind::Tilde,
                '=' => TexSyntaxKind::Equal,
                '+' => TexSyntaxKind::Plus,
                '-' => TexSyntaxKind::Minus,
                '*' => TexSyntaxKind::Star,
                '/' => TexSyntaxKind::Slash,
                '|' => TexSyntaxKind::Pipe,
                '<' => TexSyntaxKind::Less,
                '>' => TexSyntaxKind::Greater,
                '!' => TexSyntaxKind::Exclamation,
                '?' => TexSyntaxKind::Question,
                '@' => TexSyntaxKind::At,
                ':' => TexSyntaxKind::Colon,
                ';' => TexSyntaxKind::Semicolon,
                ',' => TexSyntaxKind::Comma,
                '.' => TexSyntaxKind::Dot,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(kind, start, state.get_position());
            return true;
        }

        false
    }

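    /// Lexes a number starting with an ASCII digit. Following digits and `.`
    /// characters are folded into one `Number` token, so `3.14` lexes as a
    /// single token.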
    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
        let start = state.get_position();
        let first = match state.current() {
            Some(c) => c,
            None => return false,
        };

        if !first.is_ascii_digit() {
            return false;
        }

        state.advance(1);
        while let Some(c) = state.current() {
            if c.is_ascii_digit() || c == '.' {
                state.advance(1);
            }
            else {
                break;
            }
        }

        state.add_token(TexSyntaxKind::Number, start, state.get_position());
        true
    }

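    /// Lexes a word: an ASCII letter followed by ASCII alphanumerics, emitted
    /// as an `Identifier` token.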
    fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.current() {
            if ch.is_ascii_alphabetic() {
                state.advance(1);
                while let Some(c) = state.current() {
                    if c.is_ascii_alphanumeric() {
                        state.advance(1);
                    }
                    else {
                        break;
                    }
                }
                state.add_token(TexSyntaxKind::Identifier, start, state.get_position());
                return true;
            }
        }

        false
    }
}