1use crate::event::{Content, Event, Kind, Link, Name, Point, VOID_EVENTS};
12use crate::message;
13use crate::parser::ParseState;
14use crate::resolve::{call as call_resolve, Name as ResolveName};
15use crate::state::{call, State};
16use crate::subtokenize::Subresult;
17
18#[cfg(feature = "log")]
19use crate::util::char::format_byte_opt;
20
21use crate::util::{constant::TAB_SIZE, edit_map::EditMap};
22use alloc::{boxed::Box, string::String, vec, vec::Vec};
23
/// Kinds of containers.
///
/// NOTE(review): the code that opens and closes containers is not in this
/// part of the file; the variant descriptions are inferred from their names.
#[derive(Debug, Eq, PartialEq)]
pub enum Container {
    /// Presumably a block quote.
    BlockQuote,
    /// Presumably a list item.
    ListItem,
    /// Presumably a GFM footnote definition.
    GfmFootnoteDefinition,
}
40
/// Information about one container.
///
/// Values of this type are stored in
/// `TokenizeState::document_container_stack`.
#[derive(Debug)]
pub struct ContainerState {
    /// Which kind of container this is.
    pub kind: Container,
    /// NOTE(review): presumably whether the container started with a blank
    /// line; not read in this part of the file, confirm at the use site.
    pub blank_initial: bool,
    /// NOTE(review): presumably the size of the container prefix; the unit is
    /// not visible in this part of the file.
    pub size: usize,
}
53
/// How to handle the byte at a given point, as decided by `byte_action`.
#[derive(Debug, PartialEq)]
enum ByteAction {
    /// A byte that is fed as-is and that moves the index forward by one.
    ///
    /// Produced for regular bytes, for a lone `\r` (reported as `\n`), and
    /// for the step of a tab that lands on a tab stop.
    Normal(u8),
    /// A byte that is skipped without being fed: the `\r` of a `\r\n` pair.
    Ignore,
    /// A virtual byte inside a tab: the column and virtual-space counters
    /// move forward, but the index stays on the tab.
    Insert(u8),
}
66
/// Kinds of labels.
///
/// NOTE(review): labels are not created in this part of the file; the
/// variant descriptions are inferred from their names.
#[derive(Debug, PartialEq, Eq)]
pub enum LabelKind {
    /// Presumably the label of an image.
    Image,
    /// Presumably the label of a link.
    Link,
    /// Presumably the label of a GFM footnote call.
    GfmFootnote,
    /// Presumably a GFM footnote label for which no definition exists.
    GfmUndefinedFootnote,
}
108
/// The start of a label.
///
/// Values of this type are stored in `TokenizeState::label_starts` and
/// `TokenizeState::label_starts_loose`.
#[derive(Debug)]
pub struct LabelStart {
    /// Kind of label that is being started.
    pub kind: LabelKind,
    /// NOTE(review): a pair of positions marking the label start; what the
    /// two numbers index into is not visible in this part of the file.
    pub start: (usize, usize),
    /// NOTE(review): presumably marks a start that can no longer form a
    /// label; confirm at the use site.
    pub inactive: bool,
}
122
/// A matched label; values of this type are stored in
/// `TokenizeState::labels`.
#[derive(Debug)]
pub struct Label {
    /// Kind of label.
    pub kind: LabelKind,
    /// NOTE(review): a pair of positions for the label start; what the two
    /// numbers index into is not visible in this part of the file.
    pub start: (usize, usize),
    /// NOTE(review): a pair of positions for the label end; same caveat as
    /// `start`.
    pub end: (usize, usize),
}
132
/// The two ways in which a construct can be tried.
#[derive(Debug, PartialEq)]
enum AttemptKind {
    /// Try a construct and keep what it produced when it succeeds; captured
    /// progress is restored only when the result is `State::Nok`.
    Attempt,
    /// Try a construct only to see whether it would match; captured progress
    /// is restored whether the result is `State::Ok` or `State::Nok`.
    Check,
}
141
/// One entry on the tokenizer's stack of attempts.
#[derive(Debug)]
struct Attempt {
    /// State to continue with when the attempt ends in `State::Ok`.
    ok: State,
    /// State to continue with when the attempt does not end in `State::Ok`.
    nok: State,
    /// Whether this is an attempt or a check.
    kind: AttemptKind,
    /// Progress captured when the attempt was registered, if any.
    ///
    /// `Tokenizer::check` always captures; `Tokenizer::attempt` skips the
    /// capture when its `nok` is `State::Nok`.
    progress: Option<Progress>,
}
158
/// A snapshot of the tokenizer, taken by `Tokenizer::capture` and restored by
/// `Tokenizer::free`.
#[derive(Clone, Debug)]
struct Progress {
    /// Number of events at capture time; `free` truncates the events to it.
    events_len: usize,
    /// Number of stack entries at capture time; `free` truncates the stack
    /// to it.
    stack_len: usize,
    /// Value of `Tokenizer::previous` at capture time.
    previous: Option<u8>,
    /// Value of `Tokenizer::current` at capture time.
    current: Option<u8>,
    /// Value of `Tokenizer::point` at capture time.
    point: Point,
}
180
/// Scratch state that lives on a [`Tokenizer`] as its `tokenize_state` field.
///
/// NOTE(review): apart from `definitions` and `gfm_footnote_definitions`,
/// which `Tokenizer::flush` drains into its result, none of these fields are
/// read in this part of the file. The grouping comments below are inferred
/// from field names and types; confirm them against the code that uses them.
// The struct holds many independent flags, so the lint is silenced.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug)]
pub struct TokenizeState<'a> {
    // Fields prefixed with `document_`: presumably state for tokenizing the
    // document (container) level, including a nested child tokenizer.
    pub document_child: Option<Box<Tokenizer<'a>>>,
    pub document_child_state: Option<State>,
    pub document_container_stack: Vec<ContainerState>,
    pub document_continued: usize,
    pub document_data_index: Option<usize>,
    pub document_exits: Vec<Option<Vec<Event>>>,
    pub document_lazy_accepting_before: bool,
    pub document_at_first_paragraph_of_list_item: bool,

    // Fields prefixed with `space_or_tab_`: presumably parameters and
    // counters for whitespace handling. `space_or_tab_token` starts out as
    // `Name::SpaceOrTab`.
    pub space_or_tab_eol_content: Option<Content>,
    pub space_or_tab_eol_connect: bool,
    pub space_or_tab_eol_ok: bool,
    pub space_or_tab_connect: bool,
    pub space_or_tab_content: Option<Content>,
    pub space_or_tab_min: usize,
    pub space_or_tab_max: usize,
    pub space_or_tab_size: usize,
    pub space_or_tab_token: Name,

    // Label bookkeeping: label starts seen so far, and matched labels.
    pub label_starts: Vec<LabelStart>,
    pub label_starts_loose: Vec<LabelStart>,
    pub labels: Vec<Label>,

    /// Collected definitions; moved into the result by `Tokenizer::flush`.
    pub definitions: Vec<String>,
    /// Collected GFM footnote definitions; moved into the result by
    /// `Tokenizer::flush`.
    pub gfm_footnote_definitions: Vec<String>,

    /// NOTE(review): presumably the last error seen while parsing MDX; the
    /// meaning of the three strings is not visible in this part of the file.
    pub mdx_last_parse_error: Option<(String, String, String)>,

    // General-purpose slots, presumably shared between constructs: flags,
    // markers, counters, positions, and event names. The `token_*` fields
    // start out as `Name::Data`.
    pub connect: bool,
    pub marker: u8,
    pub marker_b: u8,
    pub markers: &'static [u8],
    pub seen: bool,
    pub size: usize,
    pub size_b: usize,
    pub size_c: usize,
    pub start: usize,
    pub end: usize,
    pub token_1: Name,
    pub token_2: Name,
    pub token_3: Name,
    pub token_4: Name,
    pub token_5: Name,
    pub token_6: Name,
}
271
/// A tokenizer: walks over bytes, feeds them to states, and records events.
// The struct holds several independent flags, so the lint is silenced.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug)]
pub struct Tokenizer<'a> {
    /// Per-line `(index, vs)` pairs, indexed by `line - first_line`.
    ///
    /// An entry is pushed when a line ending is moved past, and set or
    /// overwritten by `define_skip`. When the tokenizer is at column 1 of a
    /// line that has an entry, it moves forward to that entry.
    column_start: Vec<(usize, usize)>,
    /// Line of the point this tokenizer was created with.
    first_line: usize,
    /// Copy of `point` taken right after the most recent line ending was
    /// moved past (initially the point the tokenizer was created with).
    line_start: Point,
    /// Whether the current byte was consumed: set to `false` when a byte is
    /// fed (`expect`) and back to `true` by `consume`.
    consumed: bool,
    /// Stack of attempts registered with `check` and `attempt`.
    attempts: Vec<Attempt>,
    /// The byte currently being fed, or `None` (also used at the end).
    pub current: Option<u8>,
    /// The byte that was most recently moved past or consumed.
    pub previous: Option<u8>,
    /// Current position.
    pub point: Point,
    /// Events generated so far.
    pub events: Vec<Event>,
    /// Names of events that were entered but not yet exited.
    pub stack: Vec<Name>,
    /// Edit map; applied to `events` by `flush` when resolving.
    pub map: EditMap,
    /// Registered resolvers; run by `flush` when resolving.
    pub resolvers: Vec<ResolveName>,
    /// Shared parse state; provides the bytes that are tokenized.
    pub parse_state: &'a ParseState<'a>,
    /// Scratch state shared between constructs.
    pub tokenize_state: TokenizeState<'a>,
    /// NOTE(review): not read in this part of the file; presumably whether
    /// the current construct interrupts another one.
    pub interrupt: bool,
    /// NOTE(review): not read in this part of the file; meaning to be
    /// confirmed at the use site.
    pub concrete: bool,
    /// NOTE(review): not read in this part of the file; meaning to be
    /// confirmed at the use site.
    pub pierce: bool,
    /// NOTE(review): not read in this part of the file; presumably whether
    /// the current line is a lazy continuation.
    pub lazy: bool,
}
325
326impl<'a> Tokenizer<'a> {
327 pub fn new(point: Point, parse_state: &'a ParseState) -> Tokenizer<'a> {
329 Tokenizer {
330 previous: None,
331 current: None,
332 column_start: vec![],
334 first_line: point.line,
335 line_start: point.clone(),
336 consumed: true,
337 attempts: vec![],
338 point,
339 stack: vec![],
340 events: vec![],
341 parse_state,
342 tokenize_state: TokenizeState {
343 connect: false,
344 document_container_stack: vec![],
345 document_exits: vec![],
346 document_continued: 0,
347 document_lazy_accepting_before: false,
348 document_data_index: None,
349 document_child_state: None,
350 document_child: None,
351 document_at_first_paragraph_of_list_item: false,
352 definitions: vec![],
353 gfm_footnote_definitions: vec![],
354 mdx_last_parse_error: None,
355 end: 0,
356 label_starts: vec![],
357 label_starts_loose: vec![],
358 marker: 0,
359 marker_b: 0,
360 markers: &[],
361 labels: vec![],
362 seen: false,
363 size: 0,
364 size_b: 0,
365 size_c: 0,
366 space_or_tab_eol_content: None,
367 space_or_tab_eol_connect: false,
368 space_or_tab_eol_ok: false,
369 space_or_tab_connect: false,
370 space_or_tab_content: None,
371 space_or_tab_min: 0,
372 space_or_tab_max: 0,
373 space_or_tab_size: 0,
374 space_or_tab_token: Name::SpaceOrTab,
375 start: 0,
376 token_1: Name::Data,
377 token_2: Name::Data,
378 token_3: Name::Data,
379 token_4: Name::Data,
380 token_5: Name::Data,
381 token_6: Name::Data,
382 },
383 map: EditMap::new(),
384 interrupt: false,
385 pierce: false,
386 concrete: false,
387 lazy: false,
388 resolvers: vec![],
389 }
390 }
391
392 pub fn register_resolver(&mut self, name: ResolveName) {
394 if !self.resolvers.contains(&name) {
395 self.resolvers.push(name);
396 }
397 }
398
399 pub fn register_resolver_before(&mut self, name: ResolveName) {
401 if !self.resolvers.contains(&name) {
402 self.resolvers.insert(0, name);
403 }
404 }
405
406 pub fn define_skip(&mut self, mut point: Point) {
410 move_point_back(self, &mut point);
411
412 let info = (point.index, point.vs);
413
414 #[cfg(feature = "log")]
415 log::trace!("position: define skip: {:?} -> ({:?})", point.line, info);
416
417 let at = point.line - self.first_line;
418
419 if at >= self.column_start.len() {
420 self.column_start.push(info);
421 } else {
422 self.column_start[at] = info;
423 }
424
425 self.account_for_potential_skip();
426 }
427
428 fn account_for_potential_skip(&mut self) {
431 let at = self.point.line - self.first_line;
432
433 if self.point.column == 1 && at != self.column_start.len() {
434 self.move_to(self.column_start[at]);
435 }
436 }
437
438 fn expect(&mut self, byte: Option<u8>) {
440 debug_assert!(self.consumed, "expected previous byte to be consumed");
441 self.consumed = false;
442 self.current = byte;
443 }
444
445 pub fn consume(&mut self) {
449 debug_assert!(!self.consumed, "expected code to *not* have been consumed: this might be because `State::Retry(x)` instead of `State::Next(x)` was returned");
450 self.move_one();
451
452 self.previous = self.current;
453 self.current = None;
456 self.consumed = true;
458 }
459
460 fn move_one(&mut self) {
462 match byte_action(self.parse_state.bytes, &self.point) {
463 ByteAction::Ignore => {
464 self.point.index += 1;
465 }
466 ByteAction::Insert(byte) => {
467 self.previous = Some(byte);
468 self.point.column += 1;
469 self.point.vs += 1;
470 }
471 ByteAction::Normal(byte) => {
472 self.previous = Some(byte);
473 self.point.vs = 0;
474 self.point.index += 1;
475
476 if byte == b'\n' {
477 self.point.line += 1;
478 self.point.column = 1;
479
480 if self.point.line - self.first_line + 1 > self.column_start.len() {
481 self.column_start.push((self.point.index, self.point.vs));
482 }
483
484 self.line_start = self.point.clone();
485
486 self.account_for_potential_skip();
487
488 #[cfg(feature = "log")]
489 log::trace!("position: after eol: `{:?}`", self.point);
490 } else {
491 self.point.column += 1;
492 }
493 }
494 }
495 }
496
497 fn move_to(&mut self, to: (usize, usize)) {
499 let (to_index, to_vs) = to;
500 while self.point.index < to_index || self.point.index == to_index && self.point.vs < to_vs {
501 self.move_one();
502 }
503 }
504
505 pub fn enter(&mut self, name: Name) {
507 enter_impl(self, name, None);
508 }
509
510 pub fn enter_link(&mut self, name: Name, link: Link) {
512 enter_impl(self, name, Some(link));
513 }
514
515 pub fn exit(&mut self, name: Name) {
517 let current = self.stack.pop().expect("cannot close w/o open tokens");
518
519 debug_assert_eq!(current, name, "expected exit event to match current event");
520
521 let previous = self.events.last().expect("cannot close w/o open event");
522 let mut point = self.point.clone();
523
524 debug_assert!(
525 current != previous.name
526 || previous.point.index != point.index
527 || previous.point.vs != point.vs,
528 "expected non-empty event"
529 );
530
531 if VOID_EVENTS.iter().any(|d| d == &name) {
532 debug_assert!(
533 current == previous.name,
534 "expected event to be void, instead of including something"
535 );
536 }
537
538 if matches!(self.previous, Some(b'\n')) {
541 point = self.line_start.clone();
542 } else {
543 move_point_back(self, &mut point);
544 }
545
546 #[cfg(feature = "log")]
547 log::debug!("exit: `{:?}`", name);
548
549 let event = Event {
550 kind: Kind::Exit,
551 name,
552 point,
553 link: None,
554 };
555 self.events.push(event);
556 }
557
558 fn capture(&mut self) -> Progress {
560 Progress {
561 previous: self.previous,
562 current: self.current,
563 point: self.point.clone(),
564 events_len: self.events.len(),
565 stack_len: self.stack.len(),
566 }
567 }
568
569 fn free(&mut self, previous: Progress) {
571 self.previous = previous.previous;
572 self.current = previous.current;
573 self.point = previous.point;
574 debug_assert!(
575 self.events.len() >= previous.events_len,
576 "expected to restore less events than before"
577 );
578 self.events.truncate(previous.events_len);
579 debug_assert!(
580 self.stack.len() >= previous.stack_len,
581 "expected to restore less stack items than before"
582 );
583 self.stack.truncate(previous.stack_len);
584 }
585
586 pub fn check(&mut self, ok: State, nok: State) {
589 let progress = Some(self.capture());
593 let attempt = Attempt {
594 kind: AttemptKind::Check,
595 progress,
596 ok,
597 nok,
598 };
599 self.attempts.push(attempt);
600 }
601
602 pub fn attempt(&mut self, ok: State, nok: State) {
605 let progress = if nok == State::Nok {
609 None
610 } else {
611 Some(self.capture())
612 };
613
614 let attempt = Attempt {
615 kind: AttemptKind::Attempt,
616 progress,
617 ok,
618 nok,
619 };
620 self.attempts.push(attempt);
621 }
622
623 pub fn push(&mut self, from: (usize, usize), to: (usize, usize), state: State) -> State {
625 push_impl(self, from, to, state, false)
626 }
627
628 pub fn flush(&mut self, state: State, resolve: bool) -> Result<Subresult, message::Message> {
630 let to = (self.point.index, self.point.vs);
631 let state = push_impl(self, to, to, state, true);
632
633 state.to_result()?;
634
635 let mut value = Subresult {
636 done: false,
637 gfm_footnote_definitions: self.tokenize_state.gfm_footnote_definitions.split_off(0),
638 definitions: self.tokenize_state.definitions.split_off(0),
639 };
640
641 if resolve {
642 let resolvers = self.resolvers.split_off(0);
643 let mut index = 0;
644 let defs = &mut value.definitions;
645 let fn_defs = &mut value.gfm_footnote_definitions;
646 while index < resolvers.len() {
647 if let Some(mut result) = call_resolve(self, resolvers[index])? {
648 fn_defs.append(&mut result.gfm_footnote_definitions);
649 defs.append(&mut result.definitions);
650 }
651 index += 1;
652 }
653
654 self.map.consume(&mut self.events);
655 }
656
657 Ok(value)
658 }
659}
660
661fn move_point_back(tokenizer: &mut Tokenizer, point: &mut Point) {
663 while point.index > 0 {
664 point.index -= 1;
665 let action = byte_action(tokenizer.parse_state.bytes, point);
666 if !matches!(action, ByteAction::Ignore) {
667 point.index += 1;
668 break;
669 }
670 }
671}
672
673fn enter_impl(tokenizer: &mut Tokenizer, name: Name, link: Option<Link>) {
675 let mut point = tokenizer.point.clone();
676 move_point_back(tokenizer, &mut point);
677
678 #[cfg(feature = "log")]
679 log::debug!("enter: `{:?}`", name);
680
681 tokenizer.stack.push(name.clone());
682 tokenizer.events.push(Event {
683 kind: Kind::Enter,
684 name,
685 point,
686 link,
687 });
688}
689
690fn push_impl(
692 tokenizer: &mut Tokenizer,
693 from: (usize, usize),
694 to: (usize, usize),
695 mut state: State,
696 flush: bool,
697) -> State {
698 debug_assert!(
699 from.0 > tokenizer.point.index
700 || (from.0 == tokenizer.point.index && from.1 >= tokenizer.point.vs),
701 "cannot move backwards"
702 );
703
704 tokenizer.move_to(from);
705
706 loop {
707 match state {
708 State::Error(_) => break,
709 State::Ok | State::Nok => {
710 if let Some(attempt) = tokenizer.attempts.pop() {
711 if attempt.kind == AttemptKind::Check || state == State::Nok {
712 if let Some(progress) = attempt.progress {
713 tokenizer.free(progress);
714 }
715 }
716
717 tokenizer.consumed = true;
718
719 let next = if state == State::Ok {
720 attempt.ok
721 } else {
722 attempt.nok
723 };
724
725 #[cfg(feature = "log")]
726 log::trace!("attempt: `{:?}` -> `{:?}`", state, next);
727
728 state = next;
729 } else {
730 break;
731 }
732 }
733 State::Next(name) => {
734 let action = if tokenizer.point.index < to.0
735 || (tokenizer.point.index == to.0 && tokenizer.point.vs < to.1)
736 {
737 Some(byte_action(tokenizer.parse_state.bytes, &tokenizer.point))
738 } else if flush {
739 None
740 } else {
741 break;
742 };
743
744 if let Some(ByteAction::Ignore) = action {
745 tokenizer.move_one();
746 } else {
747 let byte =
748 if let Some(ByteAction::Insert(byte) | ByteAction::Normal(byte)) = action {
749 Some(byte)
750 } else {
751 None
752 };
753
754 #[cfg(feature = "log")]
755 log::trace!("feed: {} to {:?}", format_byte_opt(byte), name);
756
757 tokenizer.expect(byte);
758 state = call(tokenizer, name);
759 }
760 }
761 State::Retry(name) => {
762 #[cfg(feature = "log")]
763 log::trace!("retry: `{:?}`", name);
764
765 state = call(tokenizer, name);
766 }
767 }
768 }
769
770 tokenizer.consumed = true;
771
772 if flush {
773 debug_assert!(matches!(state, State::Ok | State::Error(_)), "must be ok");
774 } else {
775 debug_assert!(
776 matches!(state, State::Next(_) | State::Error(_)),
777 "must have a next state"
778 );
779 }
780
781 state
782}
783
784fn byte_action(bytes: &[u8], point: &Point) -> ByteAction {
786 if point.index < bytes.len() {
787 let byte = bytes[point.index];
788
789 if byte == b'\r' {
790 if point.index < bytes.len() - 1 && bytes[point.index + 1] == b'\n' {
792 ByteAction::Ignore
793 }
794 else {
796 ByteAction::Normal(b'\n')
797 }
798 } else if byte == b'\t' {
799 let remainder = point.column % TAB_SIZE;
800 let vs = if remainder == 0 {
801 0
802 } else {
803 TAB_SIZE - remainder
804 };
805
806 if point.vs == 0 {
808 if vs == 0 {
809 ByteAction::Normal(byte)
810 } else {
811 ByteAction::Insert(byte)
812 }
813 } else if vs == 0 {
814 ByteAction::Normal(b' ')
815 } else {
816 ByteAction::Insert(b' ')
817 }
818 } else {
819 ByteAction::Normal(byte)
820 }
821 } else {
822 unreachable!("out of bounds")
823 }
824}