1#![doc = include_str!("readme.md")]
2use crate::{language::TexLanguage, lexer::token_type::TexTokenType};
3pub mod token_type;
5use oak_core::{
6 Lexer, LexerCache, LexerState, OakError, Source, TextEdit,
7 lexer::{CommentConfig, LexOutput, WhitespaceConfig},
8};
9use std::sync::LazyLock;
10
/// Shorthand for the framework lexer state specialised to the TeX language.
pub(crate) type State<'a, S> = LexerState<'a, S, TexLanguage>;

/// Whitespace scanner configuration; `unicode_whitespace: true` lets the
/// framework treat non-ASCII whitespace as skippable too.
static TEX_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner configuration: TeX line comments start at `%`.
/// NOTE(review): empty `block_start`/`block_end` presumably disables block
/// comments — confirm against `CommentConfig`'s handling of empty markers.
static TEX_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "%", block_start: "", block_end: "", nested_blocks: false });
15
/// Lexer for TeX source, borrowing a language configuration for its lifetime.
#[derive(Clone, Debug)]
pub struct TexLexer<'config> {
    // Language configuration this lexer was constructed with. None of the
    // scanning routines below read it yet; it is stored for future use.
    config: &'config TexLanguage,
}
22
impl<'config> Lexer<TexLanguage> for TexLexer<'config> {
    /// Lexes `source` from offset 0 and returns the token stream.
    ///
    /// `_edits` is ignored: this lexer does not relex incrementally and always
    /// rescans the whole source, touching `cache` only through the
    /// `new_with_cache`/`finish_with_cache` pair.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<TexLanguage>) -> LexOutput<TexLanguage> {
        let mut state = State::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        // Only append the EOF token when scanning completed without error.
        if result.is_ok() {
            state.add_eof()
        }
        state.finish_with_cache(result, cache)
    }
}
34
35impl<'config> TexLexer<'config> {
36 pub fn new(config: &'config TexLanguage) -> Self {
38 Self { config }
39 }
40
41 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
43 while state.not_at_end() {
44 let safe_point = state.get_position();
45
46 if self.skip_whitespace(state) {
47 continue;
48 }
49
50 if self.skip_comment(state) {
51 continue;
52 }
53
54 if self.lex_command(state) {
55 continue;
56 }
57
58 if self.lex_math_delimiters(state) {
59 continue;
60 }
61
62 if self.lex_braces_and_brackets(state) {
63 continue;
64 }
65
66 if self.lex_special_chars(state) {
67 continue;
68 }
69
70 if self.lex_number(state) {
71 continue;
72 }
73
74 if self.lex_text(state) {
75 continue;
76 }
77
78 state.advance_if_dead_lock(safe_point)
79 }
80
81 Ok(())
82 }
83
    /// Delegates to the shared whitespace scanner; emits a `Whitespace` token
    /// and returns `true` when any whitespace was consumed.
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        TEX_WHITESPACE.scan(state, TexTokenType::Whitespace)
    }
88
    /// Delegates to the shared comment scanner (`%` to end of line); both the
    /// line and block token kinds map to `Comment`. Returns `true` on a match.
    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        TEX_COMMENT.scan(state, TexTokenType::Comment, TexTokenType::Comment)
    }
93
94 fn lex_command<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
96 let start = state.get_position();
97
98 if state.peek() != Some('\\') {
99 return false;
100 }
101
102 state.advance(1); let mut has_name = false;
106 if let Some(ch) = state.peek() {
107 if ch.is_ascii_alphabetic() {
108 while let Some(ch) = state.peek() {
109 if ch.is_ascii_alphabetic() {
110 state.advance(ch.len_utf8());
111 has_name = true
112 }
113 else {
114 break;
115 }
116 }
117 }
118 else {
119 state.advance(ch.len_utf8());
121 has_name = true
122 }
123 }
124
125 if has_name {
126 let end = state.get_position();
127 let text = state.get_text_in((start + 1..end).into()); let kind = match text.as_ref() {
130 "begin" => TexTokenType::BeginKeyword,
131 "end" => TexTokenType::EndKeyword,
132 "(" => TexTokenType::Dollar, ")" => TexTokenType::Dollar,
134 "[" => TexTokenType::DoubleDollar, "]" => TexTokenType::DoubleDollar,
136 "documentclass" => TexTokenType::DocumentclassKeyword,
137 "usepackage" => TexTokenType::UsepackageKeyword,
138 "section" => TexTokenType::SectionKeyword,
139 "subsection" => TexTokenType::SubsectionKeyword,
140 "subsubsection" => TexTokenType::SubsubsectionKeyword,
141 "chapter" => TexTokenType::ChapterKeyword,
142 "part" => TexTokenType::PartKeyword,
143 "title" => TexTokenType::TitleKeyword,
144 "author" => TexTokenType::AuthorKeyword,
145 "date" => TexTokenType::DateKeyword,
146 "maketitle" => TexTokenType::MaketitleKeyword,
147 "tableofcontents" => TexTokenType::TableofcontentsKeyword,
148 "item" => TexTokenType::ItemKeyword,
149 "label" => TexTokenType::LabelKeyword,
150 "ref" => TexTokenType::RefKeyword,
151 "cite" => TexTokenType::CiteKeyword,
152 "includegraphics" => TexTokenType::IncludegraphicsKeyword,
153 "textbf" => TexTokenType::TextbfKeyword,
154 "textit" => TexTokenType::TextitKeyword,
155 "texttt" => TexTokenType::TextTt,
156 "textsc" => TexTokenType::TextSc,
157 "emph" => TexTokenType::EmphKeyword,
158 "underline" => TexTokenType::Underline,
159 "frac" => TexTokenType::Frac,
160 "sqrt" => TexTokenType::Sqrt,
161 "sum" => TexTokenType::Sum,
162 "int" => TexTokenType::Int,
163 "lim" => TexTokenType::Lim,
164 "alpha" => TexTokenType::Alpha,
165 "beta" => TexTokenType::Beta,
166 "gamma" => TexTokenType::Gamma,
167 "delta" => TexTokenType::Delta,
168 "epsilon" => TexTokenType::Epsilon,
169 "zeta" => TexTokenType::Zeta,
170 "eta" => TexTokenType::Eta,
171 "theta" => TexTokenType::Theta,
172 "iota" => TexTokenType::Iota,
173 "kappa" => TexTokenType::Kappa,
174 "lambda" => TexTokenType::Lambda,
175 "mu" => TexTokenType::Mu,
176 "nu" => TexTokenType::Nu,
177 "xi" => TexTokenType::Xi,
178 "omicron" => TexTokenType::Omicron,
179 "pi" => TexTokenType::Pi,
180 "rho" => TexTokenType::Rho,
181 "sigma" => TexTokenType::Sigma,
182 "tau" => TexTokenType::Tau,
183 "upsilon" => TexTokenType::Upsilon,
184 "phi" => TexTokenType::Phi,
185 "chi" => TexTokenType::Chi,
186 "psi" => TexTokenType::Psi,
187 "omega" => TexTokenType::Omega,
188 "Gamma" => TexTokenType::UpperGamma,
189 "Delta" => TexTokenType::UpperDelta,
190 "Theta" => TexTokenType::UpperTheta,
191 "Lambda" => TexTokenType::UpperLambda,
192 "Xi" => TexTokenType::UpperXi,
193 "Pi" => TexTokenType::UpperPi,
194 "Sigma" => TexTokenType::UpperSigma,
195 "Upsilon" => TexTokenType::UpperUpsilon,
196 "Phi" => TexTokenType::UpperPhi,
197 "Psi" => TexTokenType::UpperPsi,
198 "Omega" => TexTokenType::UpperOmega,
199 _ => TexTokenType::Command,
200 };
201
202 state.add_token(kind, start, end);
203 true
204 }
205 else {
206 false
207 }
208 }
209
210 fn lex_math_delimiters<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
211 let start = state.get_position();
212 if let Some('$') = state.peek() {
213 state.advance(1);
214 if let Some('$') = state.peek() {
215 state.advance(1);
216 state.add_token(TexTokenType::DoubleDollar, start, state.get_position());
217 }
218 else {
219 state.add_token(TexTokenType::Dollar, start, state.get_position());
220 }
221 true
222 }
223 else {
224 false
225 }
226 }
227
228 fn lex_braces_and_brackets<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
229 let start = state.get_position();
230 if let Some(ch) = state.peek() {
231 let kind = match ch {
232 '{' => Some(TexTokenType::LeftBrace),
233 '}' => Some(TexTokenType::RightBrace),
234 '[' => Some(TexTokenType::LeftBracket),
235 ']' => Some(TexTokenType::RightBracket),
236 _ => None,
237 };
238
239 if let Some(kind) = kind {
240 state.advance(ch.len_utf8());
241 state.add_token(kind, start, state.get_position());
242 return true;
243 }
244 }
245 false
246 }
247
248 fn lex_special_chars<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
249 let start = state.get_position();
250 if let Some(ch) = state.peek() {
251 let kind = match ch {
252 '&' => Some(TexTokenType::Ampersand),
253 '_' => Some(TexTokenType::Underscore),
254 '^' => Some(TexTokenType::Caret),
255 '~' => Some(TexTokenType::Tilde),
256 '#' => Some(TexTokenType::Hash),
257 '%' => Some(TexTokenType::Percent),
258 _ => None,
259 };
260
261 if let Some(kind) = kind {
262 state.advance(ch.len_utf8());
263 state.add_token(kind, start, state.get_position());
264 return true;
265 }
266 }
267 false
268 }
269
270 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
271 let start = state.get_position();
272 let mut has_digits = false;
273
274 while let Some(ch) = state.peek() {
275 if ch.is_ascii_digit() {
276 state.advance(1);
277 has_digits = true;
278 }
279 else {
280 break;
281 }
282 }
283
284 if has_digits {
285 if let Some('.') = state.peek() {
286 state.advance(1);
287 while let Some(ch) = state.peek() {
288 if ch.is_ascii_digit() {
289 state.advance(1);
290 }
291 else {
292 break;
293 }
294 }
295 }
296 state.add_token(TexTokenType::Number, start, state.get_position());
297 true
298 }
299 else {
300 false
301 }
302 }
303
304 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
305 let start = state.get_position();
306 let mut has_text = false;
307
308 while let Some(ch) = state.peek() {
309 if ch.is_whitespace() || ch == '\\' || ch == '%' || ch == '$' || ch == '{' || ch == '}' || ch == '[' || ch == ']' || ch == '&' || ch == '_' || ch == '^' || ch == '~' || ch == '#' || ch.is_ascii_digit() {
310 break;
311 }
312 state.advance(ch.len_utf8());
313 has_text = true;
314 }
315
316 if has_text {
317 state.add_token(TexTokenType::Text, start, state.get_position());
318 true
319 }
320 else {
321 false
322 }
323 }
324}