// oak_markdown/lexer/mod.rs
#![doc = include_str!("readme.md")]
2pub mod token_type;
4
5mod block;
6mod inline;
7mod list;
8
9use crate::{language::MarkdownLanguage, lexer::token_type::MarkdownTokenType};
10use oak_core::{Lexer, LexerCache, LexerState, TextEdit, errors::OakError, lexer::LexOutput, source::Source};
11
12pub(crate) type State<'a, S> = LexerState<'a, S, MarkdownLanguage>;
13
14#[derive(Clone, Debug)]
16pub struct MarkdownLexer<'config> {
17 config: &'config MarkdownLanguage,
18}
19
20impl<'config> MarkdownLexer<'config> {
21 pub fn new(config: &'config MarkdownLanguage) -> Self {
23 Self { config }
24 }
25
26 fn run<S: Source + ?Sized>(&self, state: &mut State<S>) -> Result<(), OakError> {
27 while state.not_at_end() {
28 let safe_point = state.get_position();
29
30 if let Some(ch) = state.peek() {
31 match ch {
32 ' ' | '\t' => {
33 if self.config.allow_indented_code_blocks && self.lex_indented_code_block(state) {
34 continue;
35 }
36 self.skip_whitespace(state);
37 }
38 '\n' | '\r' => {
39 self.lex_newline(state);
40 }
41 '$' if self.config.allow_math => {
42 if self.lex_math(state) {
43 continue;
44 }
45 self.lex_special_char(state);
46 }
47 '^' if self.config.allow_subscript || self.config.allow_footnotes => {
48 if self.config.allow_footnotes && self.lex_footnote(state) {
49 continue;
50 }
51 if self.config.allow_subscript && self.lex_sub_superscript(state) {
52 continue;
53 }
54 self.lex_special_char(state);
55 }
56 '#' => {
57 if self.config.allow_headings && self.lex_heading(state) {
58 continue;
59 }
60 self.lex_special_char(state);
61 }
62 '`' => {
63 if self.config.allow_fenced_code_blocks && self.lex_code_block(state) {
64 continue;
65 }
66 if self.lex_inline_code(state) {
67 continue;
68 }
69 self.lex_special_char(state);
70 }
71 '~' => {
72 if self.lex_code_block(state) {
73 continue;
74 }
75 if self.config.allow_strikethrough && self.lex_strikethrough(state) {
76 continue;
77 }
78 if self.config.allow_subscript && self.lex_sub_superscript(state) {
79 continue;
80 }
81 self.lex_special_char(state);
82 }
83 '*' | '_' => {
84 if self.config.allow_horizontal_rules && self.lex_horizontal_rule(state) {
85 continue;
86 }
87 if self.config.allow_lists && self.lex_list_marker(state) {
88 continue;
89 }
90 if self.lex_emphasis(state) {
91 continue;
92 }
93 if self.config.allow_abbreviations && self.lex_abbreviation(state) {
94 continue;
95 }
96 self.lex_special_char(state);
97 }
98 '-' => {
99 if self.config.allow_front_matter && self.lex_front_matter(state) {
100 continue;
101 }
102 if self.config.allow_horizontal_rules && self.lex_horizontal_rule(state) {
103 continue;
104 }
105 if self.config.allow_lists && self.lex_list_marker(state) {
106 continue;
107 }
108 self.lex_special_char(state);
109 }
110 '+' => {
111 if self.config.allow_lists && self.lex_list_marker(state) {
112 continue;
113 }
114 self.lex_special_char(state);
115 }
116 '!' => {
117 if self.lex_link_or_image(state) {
118 continue;
119 }
120 self.lex_special_char(state);
121 }
122 '[' => {
123 if self.config.allow_task_lists && self.lex_task_marker(state) {
124 continue;
125 }
126 if self.lex_link_or_image(state) {
127 continue;
128 }
129 self.lex_special_char(state);
130 }
131 '>' => {
132 if self.config.allow_blockquotes && self.lex_blockquote(state) {
133 continue;
134 }
135 self.lex_special_char(state);
136 }
137 ':' => {
138 if self.config.allow_definition_lists && self.lex_definition_description(state) {
139 continue;
140 }
141 self.lex_special_char(state);
142 }
143 '|' if self.config.allow_tables => {
144 self.lex_special_char(state);
145 }
146 '0'..='9' => {
147 if self.lex_list_marker(state) {
148 continue;
149 }
150 self.lex_text(state);
151 }
152 '<' => {
153 if self.config.allow_html && self.lex_html_tag(state) {
154 continue;
155 }
156 if self.config.allow_xml && self.lex_xml_tag(state) {
157 continue;
158 }
159 self.lex_special_char(state);
160 }
161 ']' | '(' | ')' | '|' | '.' | '\\' => {
162 self.lex_special_char(state);
163 }
164 _ => {
165 if self.lex_text(state) {
166 continue;
167 }
168 let start_pos = state.get_position();
169 state.advance(ch.len_utf8());
170 state.add_token(MarkdownTokenType::Error, start_pos, state.get_position());
171 }
172 }
173 }
174
175 state.advance_if_dead_lock(safe_point)
176 }
177 Ok(())
178 }
179
180 fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
182 let start_pos = state.get_position();
183
184 while let Some(ch) = state.peek() {
185 if ch == ' ' || ch == '\t' {
186 state.advance(ch.len_utf8());
187 }
188 else {
189 break;
190 }
191 }
192
193 if state.get_position() > start_pos {
194 state.add_token(MarkdownTokenType::Whitespace, start_pos, state.get_position());
195 true
196 }
197 else {
198 false
199 }
200 }
201
202 fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
204 let start_pos = state.get_position();
205
206 if let Some('\n') = state.peek() {
207 state.advance(1);
208 state.add_token(MarkdownTokenType::Newline, start_pos, state.get_position());
209 true
210 }
211 else if let Some('\r') = state.peek() {
212 state.advance(1);
213 if let Some('\n') = state.peek() {
214 state.advance(1);
215 }
216 state.add_token(MarkdownTokenType::Newline, start_pos, state.get_position());
217 true
218 }
219 else {
220 false
221 }
222 }
223
224 fn lex_html_tag<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
226 self.lex_any_tag(state, MarkdownTokenType::HtmlTag, MarkdownTokenType::HtmlComment)
227 }
228
229 fn lex_xml_tag<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
231 self.lex_any_tag(state, MarkdownTokenType::XmlTag, MarkdownTokenType::XmlComment)
232 }
233
234 fn lex_any_tag<S: Source + ?Sized>(&self, state: &mut State<S>, tag_kind: MarkdownTokenType, comment_kind: MarkdownTokenType) -> bool {
236 let start_pos = state.get_position();
237
238 if let Some('<') = state.peek() {
239 state.advance(1);
240
241 if let Some('!') = state.peek() {
242 if state.source().get_char_at(state.get_position() + 1) == Some('-') && state.source().get_char_at(state.get_position() + 2) == Some('-') {
243 state.advance(3);
244 let mut found_end = false;
245 while let Some(ch) = state.peek() {
246 if ch == '-' && state.source().get_char_at(state.get_position() + 1) == Some('-') && state.source().get_char_at(state.get_position() + 2) == Some('>') {
247 state.advance(3);
248 found_end = true;
249 break;
250 }
251 state.advance(ch.len_utf8());
252 }
253 if found_end {
254 state.add_token(comment_kind, start_pos, state.get_position());
255 return true;
256 }
257 }
258 }
259
260 let mut found_end = false;
261 let mut in_string = None;
262
263 while let Some(ch) = state.peek() {
264 if let Some(quote) = in_string {
265 if ch == quote {
266 in_string = None;
267 }
268 }
269 else {
270 if ch == '>' {
271 state.advance(1);
272 found_end = true;
273 break;
274 }
275 else if ch == '"' || ch == '\'' {
276 in_string = Some(ch);
277 }
278 }
279 state.advance(ch.len_utf8());
280 }
281
282 if found_end {
283 state.add_token(tag_kind, start_pos, state.get_position());
284 true
285 }
286 else {
287 state.set_position(start_pos);
288 false
289 }
290 }
291 else {
292 false
293 }
294 }
295
296 fn lex_special_char<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
298 let start_pos = state.get_position();
299
300 if let Some(ch) = state.peek() {
301 let token_kind = match ch {
302 '[' => MarkdownTokenType::LBracket,
303 ']' => MarkdownTokenType::RBracket,
304 '(' => MarkdownTokenType::LParen,
305 ')' => MarkdownTokenType::RParen,
306 '<' => MarkdownTokenType::Less,
307 '>' => MarkdownTokenType::Greater,
308 '*' => MarkdownTokenType::Asterisk,
309 '_' => MarkdownTokenType::Underscore,
310 '`' => MarkdownTokenType::Backtick,
311 '~' => MarkdownTokenType::Tilde,
312 '#' => MarkdownTokenType::Hash,
313 '|' => MarkdownTokenType::Pipe,
314 '-' => MarkdownTokenType::Dash,
315 '+' => MarkdownTokenType::Plus,
316 '.' => MarkdownTokenType::Dot,
317 ':' => MarkdownTokenType::Colon,
318 '!' => MarkdownTokenType::Exclamation,
319 '\\' => MarkdownTokenType::Escape,
320 '$' => MarkdownTokenType::Dollar,
321 '^' => MarkdownTokenType::Caret,
322 _ => return false,
323 };
324
325 state.advance(ch.len_utf8());
326 state.add_token(token_kind, start_pos, state.get_position());
327 true
328 }
329 else {
330 false
331 }
332 }
333
334 fn lex_auto_link<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
336 let start_pos = state.get_position();
337
338 if state.source().get_char_at(start_pos) == Some('h') && state.source().get_char_at(start_pos + 1) == Some('t') && state.source().get_char_at(start_pos + 2) == Some('t') && state.source().get_char_at(start_pos + 3) == Some('p') {
339 let mut pos = start_pos + 4;
340 if state.source().get_char_at(pos) == Some('s') {
341 pos += 1;
342 }
343
344 if state.source().get_char_at(pos) == Some(':') && state.source().get_char_at(pos + 1) == Some('/') && state.source().get_char_at(pos + 2) == Some('/') {
345 pos += 3;
346
347 while pos < state.source().length() {
348 if let Some(ch) = state.source().get_char_at(pos) {
349 if ch.is_alphanumeric() || ch == '-' || ch == '_' || ch == '.' || ch == '/' || ch == '?' || ch == '=' || ch == '&' || ch == '#' || ch == '%' {
350 pos += 1;
351 }
352 else {
353 break;
354 }
355 }
356 else {
357 break;
358 }
359 }
360
361 if pos > start_pos + 7 {
362 state.set_position(pos);
363 state.add_token(MarkdownTokenType::AutoLink, start_pos, pos);
364 return true;
365 }
366 }
367 }
368
369 false
370 }
371
372 fn lex_text<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
374 if self.lex_auto_link(state) {
375 return true;
376 }
377
378 let start_pos = state.get_position();
379
380 while let Some(ch) = state.peek() {
381 match ch {
382 ' ' | '\t' | '\n' | '\r' | '#' | '*' | '_' | '`' | '~' | '[' | ']' | '(' | ')' | '<' | '>' | '|' | '-' | '+' | '.' | ':' | '!' | '\\' | '$' | '^' => break,
383 _ => {
384 state.advance(ch.len_utf8());
385 }
386 }
387 }
388
389 if state.get_position() > start_pos {
390 state.add_token(MarkdownTokenType::Text, start_pos, state.get_position());
391 true
392 }
393 else {
394 false
395 }
396 }
397}
398
399impl<'config> Lexer<MarkdownLanguage> for MarkdownLexer<'config> {
400 fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<MarkdownLanguage>) -> LexOutput<MarkdownLanguage> {
401 let mut state = State::new(text);
402 let result = self.run(&mut state);
403 if result.is_ok() {
404 state.add_eof();
405 }
406 state.finish_with_cache(result, cache)
407 }
408}
409
410impl<'config> MarkdownLexer<'config> {
411 pub fn lex_internal<'a, S: Source + ?Sized>(&self, source: &'a S) -> LexOutput<MarkdownLanguage> {
413 let mut state = State::new(source);
414 let result = self.run(&mut state);
415 state.finish(result)
416 }
417}