1#![doc = include_str!("readme.md")]
2use crate::{language::TexLanguage, lexer::token_type::TexTokenType};
3pub mod token_type;
4use oak_core::{
5 Lexer, LexerCache, LexerState, OakError, Source, TextEdit,
6 lexer::{CommentConfig, LexOutput, WhitespaceConfig},
7};
8use std::sync::LazyLock;
9
/// Lexer state specialised to [`TexLanguage`], shared by every scanner below.
type State<'a, S> = LexerState<'a, S, TexLanguage>;

/// Whitespace scanner; `unicode_whitespace: true` also skips non-ASCII whitespace.
static TEX_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner: TeX line comments start with `%` and there are no block
/// comments, so both block markers are left empty.
/// NOTE(review): assumes `CommentConfig` treats empty `block_start`/`block_end`
/// as "no block comments" rather than matching a zero-length marker — confirm
/// against oak_core.
static TEX_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "%", block_start: "", block_end: "", nested_blocks: false });
14
/// Hand-written lexer for TeX/LaTeX source.
///
/// Holds a borrowed language configuration for the lifetime of the lexer;
/// construct with [`TexLexer::new`].
#[derive(Clone, Debug)]
pub struct TexLexer<'config> {
    /// Language configuration. The leading underscore signals it is not read
    /// anywhere in this file yet; kept so the lexer can become configurable
    /// without an interface change.
    _config: &'config TexLanguage,
}
19
20impl<'config> Lexer<TexLanguage> for TexLexer<'config> {
21 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<TexLanguage>) -> LexOutput<TexLanguage> {
22 let mut state = State::new_with_cache(source, 0, cache);
23 let result = self.run(&mut state);
24 if result.is_ok() {
25 state.add_eof()
26 }
27 state.finish_with_cache(result, cache)
28 }
29}
30
impl<'config> TexLexer<'config> {
    /// Creates a lexer over the given language configuration.
    pub fn new(config: &'config TexLanguage) -> Self {
        Self { _config: config }
    }

    /// Main lexing loop.
    ///
    /// Each iteration tries the recognisers in priority order — whitespace,
    /// `%` comments, `\` commands, `$`/`$$` math delimiters, grouping
    /// characters, single-character specials, numbers, and finally alphabetic
    /// text. The first recogniser that consumes input restarts the loop; if
    /// none do, `advance_if_dead_lock` forces progress past `safe_point` so
    /// the loop cannot spin forever on an unrecognised byte.
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            // Position before any recogniser runs; used as the dead-lock anchor.
            let safe_point = state.get_position();

            if self.skip_whitespace(state) {
                continue;
            }

            if self.skip_comment(state) {
                continue;
            }

            if self.lex_command(state) {
                continue;
            }

            if self.lex_math_delimiters(state) {
                continue;
            }

            if self.lex_braces_and_brackets(state) {
                continue;
            }

            if self.lex_special_chars(state) {
                continue;
            }

            if self.lex_number(state) {
                continue;
            }

            if self.lex_text(state) {
                continue;
            }

            // Nothing matched at this position: skip forward so we make progress.
            state.advance_if_dead_lock(safe_point)
        }

        Ok(())
    }

    /// Consumes a run of whitespace into a `Whitespace` token via the shared
    /// config; returns whether anything was consumed.
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        TEX_WHITESPACE.scan(state, TexTokenType::Whitespace)
    }

    /// Consumes a `%` line comment into a `Comment` token; the same token type
    /// is passed for both the line and (unused) block comment slots.
    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        TEX_COMMENT.scan(state, TexTokenType::Comment, TexTokenType::Comment)
    }

    /// Lexes a TeX control sequence starting with `\`.
    ///
    /// Two shapes are recognised, mirroring TeX's rules:
    /// - a control word: `\` followed by one or more ASCII letters;
    /// - a control symbol: `\` followed by a single non-letter character.
    ///
    /// Known control-word names are mapped to dedicated token kinds by the
    /// match below; anything else becomes a generic `Command`. A lone `\` at
    /// end of input (peek returns `None`, so `has_name` stays false) is
    /// emitted as a `Backslash` token.
    fn lex_command<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if state.peek() != Some('\\') {
            return false;
        }

        // `\` is a single byte, so advancing by 1 is safe here.
        state.advance(1); let mut has_name = false;
        if let Some(ch) = state.peek() {
            if ch.is_ascii_alphabetic() {
                // Control word: consume the maximal run of ASCII letters.
                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphabetic() {
                        state.advance(ch.len_utf8());
                        has_name = true
                    }
                    else {
                        break;
                    }
                }
            }
            else {
                // Control symbol: exactly one (possibly multi-byte) character.
                state.advance(ch.len_utf8());
                has_name = true
            }
        }

        if has_name {
            let end = state.get_position();
            // `start + 1` skips the one-byte backslash so `text` is the bare name.
            let text = state.get_text_in((start + 1..end).into()); let kind = match text.as_ref() {
                "begin" => TexTokenType::BeginKeyword,
                "end" => TexTokenType::EndKeyword,
                "documentclass" => TexTokenType::DocumentclassKeyword,
                "usepackage" => TexTokenType::UsepackageKeyword,
                "section" => TexTokenType::SectionKeyword,
                "subsection" => TexTokenType::SubsectionKeyword,
                "subsubsection" => TexTokenType::SubsubsectionKeyword,
                "chapter" => TexTokenType::ChapterKeyword,
                "part" => TexTokenType::PartKeyword,
                "title" => TexTokenType::TitleKeyword,
                "author" => TexTokenType::AuthorKeyword,
                "date" => TexTokenType::DateKeyword,
                "maketitle" => TexTokenType::MaketitleKeyword,
                "tableofcontents" => TexTokenType::TableofcontentsKeyword,
                "item" => TexTokenType::ItemKeyword,
                "label" => TexTokenType::LabelKeyword,
                "ref" => TexTokenType::RefKeyword,
                "cite" => TexTokenType::CiteKeyword,
                "includegraphics" => TexTokenType::IncludegraphicsKeyword,
                "textbf" => TexTokenType::TextbfKeyword,
                "textit" => TexTokenType::TextitKeyword,
                "emph" => TexTokenType::EmphKeyword,
                "frac" => TexTokenType::Frac,
                "sqrt" => TexTokenType::Sqrt,
                "sum" => TexTokenType::Sum,
                "int" => TexTokenType::Int,
                "lim" => TexTokenType::Lim,
                "alpha" => TexTokenType::Alpha,
                "beta" => TexTokenType::Beta,
                "gamma" => TexTokenType::Gamma,
                "delta" => TexTokenType::Delta,
                "epsilon" => TexTokenType::Epsilon,
                "zeta" => TexTokenType::Zeta,
                "eta" => TexTokenType::Eta,
                "theta" => TexTokenType::Theta,
                "iota" => TexTokenType::Iota,
                "kappa" => TexTokenType::Kappa,
                "lambda" => TexTokenType::Lambda,
                "mu" => TexTokenType::Mu,
                "nu" => TexTokenType::Nu,
                "xi" => TexTokenType::Xi,
                "omicron" => TexTokenType::Omicron,
                "pi" => TexTokenType::Pi,
                "rho" => TexTokenType::Rho,
                "sigma" => TexTokenType::Sigma,
                "tau" => TexTokenType::Tau,
                "upsilon" => TexTokenType::Upsilon,
                "phi" => TexTokenType::Phi,
                "chi" => TexTokenType::Chi,
                "psi" => TexTokenType::Psi,
                "omega" => TexTokenType::Omega,
                "varepsilon" => TexTokenType::VarEpsilon,
                "vartheta" => TexTokenType::VarTheta,
                "varkappa" => TexTokenType::VarKappa,
                "varpi" => TexTokenType::VarPi,
                "varrho" => TexTokenType::VarRho,
                "varsigma" => TexTokenType::VarSigma,
                "varphi" => TexTokenType::VarPhi,
                // Capitalised names map to the uppercase Greek letters.
                "Gamma" => TexTokenType::UpperGamma,
                "Delta" => TexTokenType::UpperDelta,
                "Theta" => TexTokenType::UpperTheta,
                "Lambda" => TexTokenType::UpperLambda,
                "Xi" => TexTokenType::UpperXi,
                "Pi" => TexTokenType::UpperPi,
                "Sigma" => TexTokenType::UpperSigma,
                "Upsilon" => TexTokenType::UpperUpsilon,
                "Phi" => TexTokenType::UpperPhi,
                "Psi" => TexTokenType::UpperPsi,
                "Omega" => TexTokenType::UpperOmega,
                _ => TexTokenType::Command,
            };

            // The token span includes the leading backslash.
            state.add_token(kind, start, state.get_position());
            return true;
        }

        // Reached only for `\` at end of input: emit the bare backslash.
        state.add_token(TexTokenType::Backslash, start, state.get_position());
        true
    }

    /// Lexes `$$` (display math) or `$` (inline math). The two-character form
    /// is tried first so `$$` is never split into two `Dollar` tokens.
    fn lex_math_delimiters<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if state.consume_if_starts_with("$$") {
            state.add_token(TexTokenType::DoubleDollar, start, state.get_position());
            return true;
        }

        if state.consume_if_starts_with("$") {
            state.add_token(TexTokenType::Dollar, start, state.get_position());
            return true;
        }

        false
    }

    /// Lexes a single grouping character: `{ } [ ] ( )`.
    fn lex_braces_and_brackets<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            let kind = match ch {
                '{' => TexTokenType::LeftBrace,
                '}' => TexTokenType::RightBrace,
                '[' => TexTokenType::LeftBracket,
                ']' => TexTokenType::RightBracket,
                '(' => TexTokenType::LeftParen,
                ')' => TexTokenType::RightParen,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(kind, start, state.get_position());
            return true;
        }

        false
    }

    /// Lexes one of the single-character special tokens (alignment `&`,
    /// parameter `#`, superscript `^`, subscript `_`, punctuation, operators).
    fn lex_special_chars<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            let kind = match ch {
                '&' => TexTokenType::Ampersand,
                '#' => TexTokenType::Hash,
                '^' => TexTokenType::Caret,
                '_' => TexTokenType::Underscore,
                '~' => TexTokenType::Tilde,
                '=' => TexTokenType::Equals,
                '+' => TexTokenType::Plus,
                '-' => TexTokenType::Minus,
                '*' => TexTokenType::Star,
                '/' => TexTokenType::Slash,
                '|' => TexTokenType::Pipe,
                '<' => TexTokenType::Less,
                '>' => TexTokenType::Greater,
                '!' => TexTokenType::Exclamation,
                '?' => TexTokenType::Question,
                '@' => TexTokenType::At,
                ':' => TexTokenType::Colon,
                ';' => TexTokenType::Semicolon,
                ',' => TexTokenType::Comma,
                '.' => TexTokenType::Dot,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(kind, start, state.get_position());
            return true;
        }

        false
    }

    /// Lexes a number: an ASCII digit followed by any mix of digits and dots.
    /// Note this deliberately allows multiple dots (`1.2.3` is one `Number`
    /// token); no validation is performed at the lexing stage. Digits and `.`
    /// are single bytes, so `advance(1)` is safe.
    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let first = match state.peek() {
            Some(c) => c,
            None => return false,
        };

        if !first.is_ascii_digit() {
            return false;
        }

        state.advance(1);
        while let Some(c) = state.peek() {
            if c.is_ascii_digit() || c == '.' {
                state.advance(1);
            }
            else {
                break;
            }
        }

        state.add_token(TexTokenType::Number, start, state.get_position());
        true
    }

    /// Lexes a run of text: an alphabetic character (Unicode-aware, per
    /// `char::is_alphabetic`) followed by alphanumeric characters, emitted as
    /// an `Identifier` token. This is the last recogniser tried in `run`, so
    /// it only sees input no earlier scanner claimed.
    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_alphabetic() {
                state.advance(ch.len_utf8());
                while let Some(c) = state.peek() {
                    if c.is_alphanumeric() {
                        state.advance(c.len_utf8());
                    }
                    else {
                        break;
                    }
                }
                state.add_token(TexTokenType::Identifier, start, state.get_position());
                return true;
            }
        }

        false
    }
}