// oak_tex/lexer/mod.rs
#![doc = include_str!("readme.md")]
use crate::{language::TexLanguage, lexer::token_type::TexTokenType};
/// Token types for the TeX lexer.
pub mod token_type;
use oak_core::{
    Lexer, LexerCache, LexerState, OakError, Source, TextEdit,
    lexer::{CommentConfig, LexOutput, WhitespaceConfig},
};
use std::sync::LazyLock;
10
11pub(crate) type State<'a, S> = LexerState<'a, S, TexLanguage>;
12
/// Whitespace configuration shared by all [`TexLexer`] instances; Unicode
/// whitespace is treated as skippable, not just ASCII.
static TEX_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment configuration: `%` starts a line comment. The block-comment
/// markers are empty because TeX defines no block-comment syntax.
static TEX_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "%", block_start: "", block_end: "", nested_blocks: false });
15
/// A lexer for TeX source files.
///
/// Borrows the language configuration for the lifetime `'config`, so one
/// [`TexLanguage`] can back many lexer instances.
#[derive(Clone, Debug)]
pub struct TexLexer<'config> {
    /// The language configuration.
    config: &'config TexLanguage,
}
22
23impl<'config> Lexer<TexLanguage> for TexLexer<'config> {
24    /// Lexes the source text into tokens.
25    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<TexLanguage>) -> LexOutput<TexLanguage> {
26        let mut state = State::new_with_cache(source, 0, cache);
27        let result = self.run(&mut state);
28        if result.is_ok() {
29            state.add_eof()
30        }
31        state.finish_with_cache(result, cache)
32    }
33}
34
35impl<'config> TexLexer<'config> {
36    /// Creates a new TeX lexer with the given language configuration.
37    pub fn new(config: &'config TexLanguage) -> Self {
38        Self { config }
39    }
40
41    /// Runs the lexer on the current state.
42    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
43        while state.not_at_end() {
44            let safe_point = state.get_position();
45
46            if self.skip_whitespace(state) {
47                continue;
48            }
49
50            if self.skip_comment(state) {
51                continue;
52            }
53
54            if self.lex_command(state) {
55                continue;
56            }
57
58            if self.lex_math_delimiters(state) {
59                continue;
60            }
61
62            if self.lex_braces_and_brackets(state) {
63                continue;
64            }
65
66            if self.lex_special_chars(state) {
67                continue;
68            }
69
70            if self.lex_number(state) {
71                continue;
72            }
73
74            if self.lex_text(state) {
75                continue;
76            }
77
78            state.advance_if_dead_lock(safe_point)
79        }
80
81        Ok(())
82    }
83
84    /// Skips whitespace characters based on the TeX whitespace configuration.
85    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
86        TEX_WHITESPACE.scan(state, TexTokenType::Whitespace)
87    }
88
89    /// Skips comments based on the TeX comment configuration.
90    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
91        TEX_COMMENT.scan(state, TexTokenType::Comment, TexTokenType::Comment)
92    }
93
94    /// Lexes a TeX command (e.g., `\section`).
95    fn lex_command<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
96        let start = state.get_position();
97
98        if state.peek() != Some('\\') {
99            return false;
100        }
101
102        state.advance(1); // consume '\'
103
104        // Read the command name
105        let mut has_name = false;
106        if let Some(ch) = state.peek() {
107            if ch.is_ascii_alphabetic() {
108                while let Some(ch) = state.peek() {
109                    if ch.is_ascii_alphabetic() {
110                        state.advance(ch.len_utf8());
111                        has_name = true
112                    }
113                    else {
114                        break;
115                    }
116                }
117            }
118            else {
119                // Single non-alphabetic character command (e.g., \\, \&, \$, \ )
120                state.advance(ch.len_utf8());
121                has_name = true
122            }
123        }
124
125        if has_name {
126            let end = state.get_position();
127            let text = state.get_text_in((start + 1..end).into()); // Skip the backslash
128
129            let kind = match text.as_ref() {
130                "begin" => TexTokenType::BeginKeyword,
131                "end" => TexTokenType::EndKeyword,
132                "(" => TexTokenType::Dollar, // Shorthand for inline math
133                ")" => TexTokenType::Dollar,
134                "[" => TexTokenType::DoubleDollar, // Shorthand for display math
135                "]" => TexTokenType::DoubleDollar,
136                "documentclass" => TexTokenType::DocumentclassKeyword,
137                "usepackage" => TexTokenType::UsepackageKeyword,
138                "section" => TexTokenType::SectionKeyword,
139                "subsection" => TexTokenType::SubsectionKeyword,
140                "subsubsection" => TexTokenType::SubsubsectionKeyword,
141                "chapter" => TexTokenType::ChapterKeyword,
142                "part" => TexTokenType::PartKeyword,
143                "title" => TexTokenType::TitleKeyword,
144                "author" => TexTokenType::AuthorKeyword,
145                "date" => TexTokenType::DateKeyword,
146                "maketitle" => TexTokenType::MaketitleKeyword,
147                "tableofcontents" => TexTokenType::TableofcontentsKeyword,
148                "item" => TexTokenType::ItemKeyword,
149                "label" => TexTokenType::LabelKeyword,
150                "ref" => TexTokenType::RefKeyword,
151                "cite" => TexTokenType::CiteKeyword,
152                "includegraphics" => TexTokenType::IncludegraphicsKeyword,
153                "textbf" => TexTokenType::TextbfKeyword,
154                "textit" => TexTokenType::TextitKeyword,
155                "texttt" => TexTokenType::TextTt,
156                "textsc" => TexTokenType::TextSc,
157                "emph" => TexTokenType::EmphKeyword,
158                "underline" => TexTokenType::Underline,
159                "frac" => TexTokenType::Frac,
160                "sqrt" => TexTokenType::Sqrt,
161                "sum" => TexTokenType::Sum,
162                "int" => TexTokenType::Int,
163                "lim" => TexTokenType::Lim,
164                "alpha" => TexTokenType::Alpha,
165                "beta" => TexTokenType::Beta,
166                "gamma" => TexTokenType::Gamma,
167                "delta" => TexTokenType::Delta,
168                "epsilon" => TexTokenType::Epsilon,
169                "zeta" => TexTokenType::Zeta,
170                "eta" => TexTokenType::Eta,
171                "theta" => TexTokenType::Theta,
172                "iota" => TexTokenType::Iota,
173                "kappa" => TexTokenType::Kappa,
174                "lambda" => TexTokenType::Lambda,
175                "mu" => TexTokenType::Mu,
176                "nu" => TexTokenType::Nu,
177                "xi" => TexTokenType::Xi,
178                "omicron" => TexTokenType::Omicron,
179                "pi" => TexTokenType::Pi,
180                "rho" => TexTokenType::Rho,
181                "sigma" => TexTokenType::Sigma,
182                "tau" => TexTokenType::Tau,
183                "upsilon" => TexTokenType::Upsilon,
184                "phi" => TexTokenType::Phi,
185                "chi" => TexTokenType::Chi,
186                "psi" => TexTokenType::Psi,
187                "omega" => TexTokenType::Omega,
188                "Gamma" => TexTokenType::UpperGamma,
189                "Delta" => TexTokenType::UpperDelta,
190                "Theta" => TexTokenType::UpperTheta,
191                "Lambda" => TexTokenType::UpperLambda,
192                "Xi" => TexTokenType::UpperXi,
193                "Pi" => TexTokenType::UpperPi,
194                "Sigma" => TexTokenType::UpperSigma,
195                "Upsilon" => TexTokenType::UpperUpsilon,
196                "Phi" => TexTokenType::UpperPhi,
197                "Psi" => TexTokenType::UpperPsi,
198                "Omega" => TexTokenType::UpperOmega,
199                _ => TexTokenType::Command,
200            };
201
202            state.add_token(kind, start, end);
203            true
204        }
205        else {
206            false
207        }
208    }
209
210    fn lex_math_delimiters<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
211        let start = state.get_position();
212        if let Some('$') = state.peek() {
213            state.advance(1);
214            if let Some('$') = state.peek() {
215                state.advance(1);
216                state.add_token(TexTokenType::DoubleDollar, start, state.get_position());
217            }
218            else {
219                state.add_token(TexTokenType::Dollar, start, state.get_position());
220            }
221            true
222        }
223        else {
224            false
225        }
226    }
227
228    fn lex_braces_and_brackets<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
229        let start = state.get_position();
230        if let Some(ch) = state.peek() {
231            let kind = match ch {
232                '{' => Some(TexTokenType::LeftBrace),
233                '}' => Some(TexTokenType::RightBrace),
234                '[' => Some(TexTokenType::LeftBracket),
235                ']' => Some(TexTokenType::RightBracket),
236                _ => None,
237            };
238
239            if let Some(kind) = kind {
240                state.advance(ch.len_utf8());
241                state.add_token(kind, start, state.get_position());
242                return true;
243            }
244        }
245        false
246    }
247
248    fn lex_special_chars<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
249        let start = state.get_position();
250        if let Some(ch) = state.peek() {
251            let kind = match ch {
252                '&' => Some(TexTokenType::Ampersand),
253                '_' => Some(TexTokenType::Underscore),
254                '^' => Some(TexTokenType::Caret),
255                '~' => Some(TexTokenType::Tilde),
256                '#' => Some(TexTokenType::Hash),
257                '%' => Some(TexTokenType::Percent),
258                _ => None,
259            };
260
261            if let Some(kind) = kind {
262                state.advance(ch.len_utf8());
263                state.add_token(kind, start, state.get_position());
264                return true;
265            }
266        }
267        false
268    }
269
270    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
271        let start = state.get_position();
272        let mut has_digits = false;
273
274        while let Some(ch) = state.peek() {
275            if ch.is_ascii_digit() {
276                state.advance(1);
277                has_digits = true;
278            }
279            else {
280                break;
281            }
282        }
283
284        if has_digits {
285            if let Some('.') = state.peek() {
286                state.advance(1);
287                while let Some(ch) = state.peek() {
288                    if ch.is_ascii_digit() {
289                        state.advance(1);
290                    }
291                    else {
292                        break;
293                    }
294                }
295            }
296            state.add_token(TexTokenType::Number, start, state.get_position());
297            true
298        }
299        else {
300            false
301        }
302    }
303
304    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
305        let start = state.get_position();
306        let mut has_text = false;
307
308        while let Some(ch) = state.peek() {
309            if ch.is_whitespace() || ch == '\\' || ch == '%' || ch == '$' || ch == '{' || ch == '}' || ch == '[' || ch == ']' || ch == '&' || ch == '_' || ch == '^' || ch == '~' || ch == '#' || ch.is_ascii_digit() {
310                break;
311            }
312            state.advance(ch.len_utf8());
313            has_text = true;
314        }
315
316        if has_text {
317            state.add_token(TexTokenType::Text, start, state.get_position());
318            true
319        }
320        else {
321            false
322        }
323    }
324}