Skip to main content

streamdown_parser/
lib.rs

1//! Streamdown Parser
2//!
3//! A streaming markdown parser designed for real-time rendering of markdown
4//! content as it arrives. This is the core parsing engine for streamdown.
5//!
6//! # Overview
7//!
8//! The parser is designed to handle byte-by-byte input for streaming scenarios
9//! (like LLM output) while also working efficiently with complete documents.
10//!
11//! # Example
12//!
13//! ```
14//! use streamdown_parser::{Parser, ParseEvent};
15//!
16//! let mut parser = Parser::new();
17//!
18//! // Feed lines and get events
19//! for event in parser.parse_line("# Hello World") {
20//!     match event {
21//!         ParseEvent::Heading { level, content } => {
22//!             println!("H{}: {}", level, content);
23//!         }
24//!         _ => {}
25//!     }
26//! }
27//! ```
28
29pub mod entities;
30pub mod inline;
31pub mod tokenizer;
32
33pub use entities::decode_html_entities;
34pub use inline::{format_line, InlineElement, InlineParser};
35pub use tokenizer::{cjk_count, is_cjk, not_text, Token, Tokenizer};
36
37use regex::Regex;
38use std::sync::LazyLock;
39use streamdown_core::{BlockType, Code, ListType, ParseState};
40
41// =============================================================================
42// Regex patterns
43// =============================================================================
44
/// Opening code fence: a run of three or more backticks or tildes, or a literal
/// `<pre>`, with optional leading whitespace. Capture 1 is the fence itself,
/// capture 2 an optional whitespace-free info string (the language).
static CODE_FENCE_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*(```+|~~~+|<pre>)\s*([^\s]*)\s*$").unwrap());

/// Closing code fence: a backtick/tilde run or `</pre>` with nothing but
/// whitespace around it. Capture 1 is the closing marker.
static CODE_FENCE_END_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*(```+|~~~+|</pre>)\s*$").unwrap());

/// Space-indented code: four literal spaces, optional further whitespace, then
/// a first visible character that is not `*` (so indented `*` list items are
/// not mistaken for code).
static SPACE_CODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^    \s*[^\s*]").unwrap());

/// ATX heading: one to six `#` at the very start of the line, at least one
/// whitespace character, then the heading text (capture 2, may be empty).
static HEADING_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(#{1,6})\s+(.*)$").unwrap());

/// List item: capture 1 is the leading whitespace, capture 2 the bullet
/// (`-`, `*`, `+`, `+` followed by dashes, or digits followed by `.`),
/// capture 3 the item text after the mandatory whitespace.
static LIST_ITEM_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)([+*-]|\+-+|\d+\.)\s+(.*)$").unwrap());

/// Blockquote markers (one or more `>`) and think-block tags, including the
/// `◁think▷` unicode variants. Capture 1 is the marker, capture 3 the rest of
/// the line.
static BLOCK_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*((>\s*)+|[◁<].?think[>▷]|</?.?think[>▷]?)(.*)$").unwrap());

/// Horizontal rule: three or more `-`, `*` or `_` (not mixed) at the start of
/// the string, followed only by whitespace. The pattern itself allows no
/// leading whitespace; the caller trims the line first.
static HR_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(---+|\*\*\*+|___+)\s*$").unwrap());

/// Table row: a line that starts and ends with `|` (surrounding whitespace
/// allowed). Capture 1 is everything between the outer pipes.
static TABLE_ROW_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\|(.+)\|\s*$").unwrap());

/// Table separator content: only `|`, `-`, `:` and whitespace. Applied to the
/// inner part of a row (capture 1 of `TABLE_ROW_RE`), not to the whole line.
static TABLE_SEP_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[\s|:-]+$").unwrap());
75
76// =============================================================================
77// Types
78// =============================================================================
79
/// List bullet type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ListBullet {
    /// Dash bullet: -
    Dash,
    /// Asterisk bullet: *
    Asterisk,
    /// Plus bullet: +
    Plus,
    /// Expandable plus: +---
    PlusExpand,
    /// Ordered number
    Ordered(usize),
}

impl ListBullet {
    /// Parse a bullet string.
    ///
    /// Surrounding whitespace is ignored. Recognized forms are `-`, `*`, `+`,
    /// a `+` followed by one or more dashes (`+-`, `+---`, ...), and an
    /// ordered marker made of decimal digits followed by exactly one `.`.
    ///
    /// Returns `None` for anything else, including malformed ordered markers
    /// such as `1..` or `+1.` and numbers that do not fit in a `usize`.
    pub fn parse(s: &str) -> Option<Self> {
        let s = s.trim();
        match s {
            "-" => return Some(ListBullet::Dash),
            "*" => return Some(ListBullet::Asterisk),
            "+" => return Some(ListBullet::Plus),
            _ => {}
        }

        // A lone "+" was handled above, so a "+" prefix can only be the
        // expandable form, which needs at least one dash and nothing else.
        if let Some(tail) = s.strip_prefix('+') {
            let all_dashes = !tail.is_empty() && tail.bytes().all(|b| b == b'-');
            return all_dashes.then_some(ListBullet::PlusExpand);
        }

        // Ordered marker: strip exactly one trailing dot and insist on ASCII
        // digits, because integer parsing on its own would accept a sign.
        let digits = s.strip_suffix('.')?;
        if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
            return None;
        }
        digits.parse().ok().map(ListBullet::Ordered)
    }

    /// Check if this is an ordered bullet.
    pub fn is_ordered(&self) -> bool {
        matches!(self, ListBullet::Ordered(_))
    }
}
119
/// Position of the parser inside a table.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TableState {
    /// At least one row has been seen, but no separator row yet.
    Header,
    /// The separator row has been seen; following rows are body rows.
    Body,
}
128
/// Events emitted by the parser.
///
/// Inline variants are produced one per inline element of a plain text line;
/// block-level variants describe the structure of the document.
#[derive(Debug, Clone, PartialEq)]
pub enum ParseEvent {
    // === Inline elements ===
    /// Plain text run.
    Text(String),
    /// Inline code span.
    InlineCode(String),
    /// Bold text.
    Bold(String),
    /// Italic text.
    Italic(String),
    /// Underlined text.
    Underline(String),
    /// Struck-out text.
    Strikeout(String),
    /// Text that is both bold and italic.
    BoldItalic(String),
    /// Hyperlink with its display text and target.
    Link {
        text: String,
        url: String,
    },
    /// Image with its alt text and source.
    Image {
        alt: String,
        url: String,
    },
    /// Footnote reference.
    Footnote(String),

    // === Block-level elements ===
    /// Heading; `level` is the number of leading `#` characters (1-6).
    Heading {
        level: u8,
        content: String,
    },
    /// Start of a code block.
    ///
    /// `language` is the fence's info string (`None` when the fence has none)
    /// and `Some("text")` for space-indented code. `indent` is the number of
    /// leading whitespace characters of the fence line, or 4 for
    /// space-indented code.
    CodeBlockStart {
        language: Option<String>,
        indent: usize,
    },
    /// One line of code block content.
    CodeBlockLine(String),
    /// End of the current code block.
    CodeBlockEnd,
    /// List item; `indent` is the number of leading whitespace characters.
    /// For ordered items the number in `bullet` comes from the parser's own
    /// counter rather than from the number written in the source.
    ListItem {
        indent: usize,
        bullet: ListBullet,
        content: String,
    },
    /// End of the current list (all nesting levels).
    ListEnd,
    /// Table row seen before the separator row; cells are trimmed.
    TableHeader(Vec<String>),
    /// Table row seen after the separator row; cells are trimmed.
    TableRow(Vec<String>),
    /// Separator row between table header and body.
    TableSeparator,
    /// End of the current table.
    TableEnd,
    /// Blockquote nesting increased; `depth` is the number of `>` markers.
    BlockquoteStart {
        depth: usize,
    },
    /// One blockquote line, with the `>` markers removed.
    BlockquoteLine(String),
    /// End of the current blockquote (all nesting levels).
    BlockquoteEnd,
    /// Opening think tag.
    ThinkBlockStart,
    /// One line inside a think block.
    ThinkBlockLine(String),
    /// Closing think tag, or end of input while a think block is open.
    ThinkBlockEnd,
    /// Horizontal rule.
    HorizontalRule,
    /// Blank line; consecutive blank lines are collapsed into one event.
    EmptyLine,
    /// End of a plain text line, emitted after its inline events.
    Newline,
    /// NOTE(review): not emitted by the parser code visible in this file;
    /// presumably produced by another component - confirm.
    Prompt(String),
    /// NOTE(review): not emitted by the parser code visible in this file;
    /// presumably a batched form of the inline events - confirm.
    InlineElements(Vec<InlineElement>),
}
185
186impl ParseEvent {
187    pub fn is_block(&self) -> bool {
188        !self.is_inline()
189    }
190
191    pub fn is_inline(&self) -> bool {
192        matches!(
193            self,
194            ParseEvent::Text(_)
195                | ParseEvent::InlineCode(_)
196                | ParseEvent::Bold(_)
197                | ParseEvent::Italic(_)
198                | ParseEvent::Underline(_)
199                | ParseEvent::Strikeout(_)
200                | ParseEvent::BoldItalic(_)
201                | ParseEvent::Link { .. }
202                | ParseEvent::Image { .. }
203                | ParseEvent::Footnote(_)
204        )
205    }
206}
207
208// =============================================================================
209// Parser
210// =============================================================================
211
/// Streaming markdown parser.
///
/// Feed it one line at a time with `parse_line` (or a whole document with
/// `parse_document`) and call `finalize` at the end of input to close any
/// block that is still open.
#[derive(Debug)]
pub struct Parser {
    /// Shared parse state (open blocks, list stack, feature switches).
    state: ParseState,
    /// Parser for inline markup inside plain text lines.
    inline_parser: InlineParser,
    /// Opening marker of the fenced/`<pre>` code block currently open, if any.
    code_fence: Option<String>,
    /// Where we are inside a table; `None` when no table is open.
    table_state: Option<TableState>,
    /// Scratch buffer collecting the events of the line being parsed.
    events: Vec<ParseEvent>,
    /// Track previous empty line for collapsing
    prev_was_empty: bool,
}
223
224impl Default for Parser {
225    fn default() -> Self {
226        Self::new()
227    }
228}
229
230impl Parser {
231    /// Create a new parser with default settings.
232    pub fn new() -> Self {
233        Self {
234            state: ParseState::new(),
235            inline_parser: InlineParser::new(),
236            code_fence: None,
237            table_state: None,
238            events: Vec::new(),
239            prev_was_empty: false,
240        }
241    }
242
243    /// Create a parser with a custom ParseState.
244    pub fn with_state(state: ParseState) -> Self {
245        let inline_parser = InlineParser::with_settings(state.links, state.images);
246        Self {
247            state,
248            inline_parser,
249            code_fence: None,
250            table_state: None,
251            events: Vec::new(),
252            prev_was_empty: false,
253        }
254    }
255
    /// Borrow the current parse state.
    pub fn state(&self) -> &ParseState {
        &self.state
    }
    /// Mutably borrow the current parse state.
    ///
    /// Changes to `links`/`images` made through this reference are not
    /// forwarded to the inline parser; use `set_process_links` and
    /// `set_process_images` for those.
    pub fn state_mut(&mut self) -> &mut ParseState {
        &mut self.state
    }
262
263    pub fn set_process_links(&mut self, enabled: bool) {
264        self.state.links = enabled;
265        self.inline_parser.process_links = enabled;
266    }
267
268    pub fn set_process_images(&mut self, enabled: bool) {
269        self.state.images = enabled;
270        self.inline_parser.process_images = enabled;
271    }
272
    /// Enable space-indented code blocks (4 spaces = code).
    ///
    /// When disabled (the state's default is set elsewhere), indented lines
    /// are never turned into code blocks by `parse_line`.
    pub fn set_code_spaces(&mut self, enabled: bool) {
        self.state.code_spaces = enabled;
    }
277
278    /// Parse a single line and return events.
279    pub fn parse_line(&mut self, line: &str) -> Vec<ParseEvent> {
280        self.events.clear();
281
282        // Handle code blocks first (they consume everything)
283        if self.state.is_in_code() {
284            self.parse_in_code_block(line);
285            return std::mem::take(&mut self.events);
286        }
287
288        // Handle think blocks
289        if self.state.block_type == Some(BlockType::Think) {
290            self.parse_in_think_block(line);
291            return std::mem::take(&mut self.events);
292        }
293
294        // Check for empty line (with collapsing) - BEFORE indent stripping
295        if line.trim().is_empty() {
296            return self.handle_empty_line();
297        }
298
299        // Track that previous line wasn't empty
300        let was_prev_empty = self.prev_was_empty;
301        self.prev_was_empty = false;
302        self.state.last_line_empty = false;
303
304        // Check for space-indented code BEFORE first-indent stripping
305        // (so we don't accidentally strip the 4-space indent)
306        if self.try_parse_space_code(line, was_prev_empty) {
307            return self.take_events();
308        }
309
310        // Now apply first-indent stripping for other constructs
311        let line = self.strip_first_indent(line);
312
313        // Try block-level constructs in order
314        if self.try_parse_code_fence(&line) {
315            return self.take_events();
316        }
317        if self.try_parse_block(&line) {
318            return self.take_events();
319        }
320        if self.try_parse_heading(&line) {
321            return self.take_events();
322        }
323        if self.try_parse_hr(&line) {
324            return self.take_events();
325        }
326        if self.try_parse_list_item(&line) {
327            return self.take_events();
328        }
329        if self.try_parse_table(&line) {
330            return self.take_events();
331        }
332
333        // Exit special contexts for plain text
334        self.exit_block_contexts();
335
336        // Parse as inline content
337        self.parse_inline_content(&line);
338        self.take_events()
339    }
340
    /// Hand the buffered events to the caller, leaving an empty buffer behind.
    fn take_events(&mut self) -> Vec<ParseEvent> {
        std::mem::take(&mut self.events)
    }
344
345    /// Strip first-indent from line if configured.
346    /// This handles markdown that's indented in the input stream.
347    fn strip_first_indent(&mut self, line: &str) -> String {
348        // Set first_indent from the very first non-empty line
349        // Use character count, not byte count, to handle multi-byte whitespace
350        if self.state.first_indent.is_none() && !line.trim().is_empty() {
351            let indent = line.chars().take_while(|c| c.is_whitespace()).count();
352            self.state.first_indent = Some(indent);
353        }
354
355        // Only strip if first_indent is > 0
356        if let Some(first_indent) = self.state.first_indent {
357            if first_indent > 0 {
358                let current_indent = line.chars().take_while(|c| c.is_whitespace()).count();
359                if current_indent >= first_indent {
360                    // Skip first_indent characters (not bytes) to avoid UTF-8 boundary issues
361                    return line.chars().skip(first_indent).collect();
362                }
363            }
364        }
365
366        line.to_string()
367    }
368
369    /// Handle empty line with collapsing.
370    fn handle_empty_line(&mut self) -> Vec<ParseEvent> {
371        // Collapse consecutive empty lines
372        if self.prev_was_empty {
373            return vec![]; // Skip this empty line
374        }
375
376        self.prev_was_empty = true;
377        self.state.last_line_empty = true;
378
379        // End blockquote if in one
380        if self.state.block_depth > 0 && self.state.block_type == Some(BlockType::Quote) {
381            while self.state.block_depth > 0 {
382                self.state.exit_block();
383            }
384            self.events.push(ParseEvent::BlockquoteEnd);
385        }
386
387        // End list if in one
388        if self.state.in_list {
389            self.exit_list_context();
390        }
391
392        // End table if in one
393        if self.table_state.is_some() {
394            self.table_state = None;
395            self.state.in_table = None;
396            self.events.push(ParseEvent::TableEnd);
397        }
398
399        self.events.push(ParseEvent::EmptyLine);
400        self.take_events()
401    }
402
403    /// Exit block contexts when encountering plain text.
404    fn exit_block_contexts(&mut self) {
405        if self.state.in_list {
406            self.exit_list_context();
407        }
408        if self.table_state.is_some() {
409            self.table_state = None;
410            self.state.in_table = None;
411            self.events.push(ParseEvent::TableEnd);
412        }
413    }
414
415    // =========================================================================
416    // Code block parsing
417    // =========================================================================
418
419    fn parse_in_code_block(&mut self, line: &str) {
420        // Check for closing fence
421        if let Some(ref fence) = self.code_fence.clone() {
422            if let Some(caps) = CODE_FENCE_END_RE.captures(line) {
423                let end_fence = caps.get(1).map(|m| m.as_str()).unwrap_or("");
424                // Match fence type: ``` with ```, </pre> with <pre>
425                let matches = (fence.starts_with('`') && end_fence.starts_with('`'))
426                    || (fence.starts_with('~') && end_fence.starts_with('~'))
427                    || (fence == "<pre>" && end_fence == "</pre>");
428
429                if matches {
430                    self.events.push(ParseEvent::CodeBlockEnd);
431                    self.state.exit_code_block();
432                    self.code_fence = None;
433                    return;
434                }
435            }
436        }
437
438        // For space-indented code, check if we've dedented
439        if self.state.in_code == Some(Code::Spaces) {
440            let indent = line.chars().take_while(|c| c.is_whitespace()).count();
441            if indent < 4 && !line.trim().is_empty() {
442                self.events.push(ParseEvent::CodeBlockEnd);
443                self.state.exit_code_block();
444                // Re-parse this line - need to do it after we return
445                // For now, just parse inline content
446                self.parse_inline_content(line);
447                return;
448            }
449        }
450
451        // Emit code line (strip indent for space-indented code)
452        let code_line = if self.state.in_code == Some(Code::Spaces) {
453            line.chars().skip(4).collect()
454        } else {
455            line.to_string()
456        };
457
458        self.events.push(ParseEvent::CodeBlockLine(code_line));
459    }
460
461    fn try_parse_code_fence(&mut self, line: &str) -> bool {
462        if let Some(caps) = CODE_FENCE_RE.captures(line) {
463            let fence = caps.get(1).map(|m| m.as_str()).unwrap_or("```");
464            let lang = caps.get(2).map(|m| m.as_str()).filter(|s| !s.is_empty());
465            let indent = line.chars().take_while(|c| c.is_whitespace()).count();
466
467            self.code_fence = Some(fence.to_string());
468            self.state.code_indent = indent;
469            self.state.enter_code_block(
470                Code::Backtick,
471                lang.map(|s| s.to_string())
472                    .or_else(|| Some("text".to_string())),
473            );
474
475            self.events.push(ParseEvent::CodeBlockStart {
476                language: lang.map(|s| s.to_string()),
477                indent,
478            });
479            true
480        } else {
481            false
482        }
483    }
484
485    fn try_parse_space_code(&mut self, line: &str, was_prev_empty: bool) -> bool {
486        // Space-indented code only when CodeSpaces is enabled
487        if !self.state.code_spaces {
488            return false;
489        }
490
491        // Only after empty line, and not in a list
492        if !was_prev_empty || self.state.in_list {
493            return false;
494        }
495
496        if SPACE_CODE_RE.is_match(line) {
497            self.state
498                .enter_code_block(Code::Spaces, Some("text".to_string()));
499            self.events.push(ParseEvent::CodeBlockStart {
500                language: Some("text".to_string()),
501                indent: 4,
502            });
503            // Also emit the first line (skip 4 chars, not bytes)
504            let code_line: String = line.chars().skip(4).collect();
505            self.events.push(ParseEvent::CodeBlockLine(code_line));
506            true
507        } else {
508            false
509        }
510    }
511
512    // =========================================================================
513    // Think/blockquote parsing
514    // =========================================================================
515
516    fn parse_in_think_block(&mut self, line: &str) {
517        // Check for end of think block (various formats)
518        if line.trim() == "</think>" || line.trim() == "</think▷" || line.trim() == "◁/think▷"
519        {
520            self.events.push(ParseEvent::ThinkBlockEnd);
521            self.state.exit_block();
522        } else {
523            self.events
524                .push(ParseEvent::ThinkBlockLine(line.to_string()));
525        }
526    }
527
528    fn try_parse_block(&mut self, line: &str) -> bool {
529        if let Some(caps) = BLOCK_RE.captures(line) {
530            let marker = caps.get(1).map(|m| m.as_str()).unwrap_or("");
531            let content = caps.get(3).map(|m| m.as_str()).unwrap_or("");
532
533            // Check for think block variants
534            if marker.contains("think") {
535                if marker.contains('/') {
536                    // End of think block
537                    if self.state.block_type == Some(BlockType::Think) {
538                        self.events.push(ParseEvent::ThinkBlockEnd);
539                        self.state.exit_block();
540                    }
541                    return true;
542                } else {
543                    // Start of think block
544                    self.state.enter_block(BlockType::Think);
545                    self.events.push(ParseEvent::ThinkBlockStart);
546                    if !content.trim().is_empty() {
547                        self.events
548                            .push(ParseEvent::ThinkBlockLine(content.to_string()));
549                    }
550                    return true;
551                }
552            }
553
554            // Regular blockquote
555            let depth = marker.matches('>').count();
556            if depth > 0 {
557                if self.state.block_depth != depth {
558                    if depth > self.state.block_depth {
559                        for _ in self.state.block_depth..depth {
560                            self.state.enter_block(BlockType::Quote);
561                        }
562                        self.events.push(ParseEvent::BlockquoteStart { depth });
563                    } else {
564                        for _ in depth..self.state.block_depth {
565                            self.state.exit_block();
566                        }
567                    }
568                }
569                self.events
570                    .push(ParseEvent::BlockquoteLine(content.to_string()));
571                return true;
572            }
573        }
574
575        // End blockquote if we were in one and this line doesn't continue it
576        if self.state.block_depth > 0 && self.state.block_type == Some(BlockType::Quote) {
577            while self.state.block_depth > 0 {
578                self.state.exit_block();
579            }
580            self.events.push(ParseEvent::BlockquoteEnd);
581        }
582
583        false
584    }
585
586    // =========================================================================
587    // Other block parsing
588    // =========================================================================
589
590    fn try_parse_heading(&mut self, line: &str) -> bool {
591        if let Some(caps) = HEADING_RE.captures(line) {
592            let hashes = caps.get(1).map(|m| m.as_str()).unwrap_or("");
593            let content = caps.get(2).map(|m| m.as_str()).unwrap_or("");
594            let level = hashes.len().min(6) as u8;
595
596            self.events.push(ParseEvent::Heading {
597                level,
598                content: content.to_string(),
599            });
600            true
601        } else {
602            false
603        }
604    }
605
606    fn try_parse_hr(&mut self, line: &str) -> bool {
607        if HR_RE.is_match(line.trim()) {
608            self.events.push(ParseEvent::HorizontalRule);
609            true
610        } else {
611            false
612        }
613    }
614
615    fn try_parse_list_item(&mut self, line: &str) -> bool {
616        if let Some(caps) = LIST_ITEM_RE.captures(line) {
617            let indent_str = caps.get(1).map(|m| m.as_str()).unwrap_or("");
618            let bullet_str = caps.get(2).map(|m| m.as_str()).unwrap_or("");
619            let content = caps.get(3).map(|m| m.as_str()).unwrap_or("");
620
621            // Use character count, not byte length, for proper multi-byte whitespace handling
622            let indent = indent_str.chars().count();
623            let bullet = ListBullet::parse(bullet_str).unwrap_or(ListBullet::Dash);
624
625            // Update list_indent_text (width of bullet + space) - use char count
626            self.state.list_indent_text = bullet_str.chars().count();
627
628            let list_type = if bullet.is_ordered() {
629                ListType::Ordered
630            } else {
631                ListType::Bullet
632            };
633
634            // Pop items with greater or equal indent (for same-level items)
635            while let Some((stack_indent, _)) = self.state.list_item_stack.last() {
636                if *stack_indent > indent {
637                    self.state.pop_list();
638                } else {
639                    break;
640                }
641            }
642
643            // Push new level if indented further than current, or if stack is empty
644            let need_push = self
645                .state
646                .list_item_stack
647                .last()
648                .map(|(i, _)| indent > *i)
649                .unwrap_or(true);
650
651            if need_push {
652                self.state.push_list(indent, list_type);
653            }
654
655            // For ordered lists, get the next number
656            let final_bullet = if let ListBullet::Ordered(_) = bullet {
657                ListBullet::Ordered(self.state.next_list_number().unwrap_or(1))
658            } else {
659                bullet
660            };
661
662            self.events.push(ParseEvent::ListItem {
663                indent,
664                bullet: final_bullet,
665                content: content.to_string(),
666            });
667            true
668        } else {
669            false
670        }
671    }
672
    /// Leave every open list level and emit a single `ListEnd`.
    ///
    /// NOTE(review): the loop relies on `pop_list` clearing `in_list` once
    /// the last level is gone - confirm against `ParseState`.
    fn exit_list_context(&mut self) {
        while self.state.in_list {
            self.state.pop_list();
        }
        self.events.push(ParseEvent::ListEnd);
    }
679
680    fn try_parse_table(&mut self, line: &str) -> bool {
681        if let Some(caps) = TABLE_ROW_RE.captures(line) {
682            let inner = caps.get(1).map(|m| m.as_str()).unwrap_or("");
683
684            // Check if this is a separator row
685            if TABLE_SEP_RE.is_match(inner) && self.table_state == Some(TableState::Header) {
686                self.table_state = Some(TableState::Body);
687                self.state.in_table = Some(Code::Body);
688                self.events.push(ParseEvent::TableSeparator);
689                return true;
690            }
691
692            let cells: Vec<String> = inner.split('|').map(|s| s.trim().to_string()).collect();
693
694            match self.table_state {
695                None => {
696                    // First row is header
697                    self.table_state = Some(TableState::Header);
698                    self.state.in_table = Some(Code::Header);
699                    self.events.push(ParseEvent::TableHeader(cells));
700                }
701                Some(TableState::Header) => {
702                    // If we see another row before separator, it's still header
703                    // (some tables have multi-line headers)
704                    self.events.push(ParseEvent::TableHeader(cells));
705                }
706                Some(TableState::Body) => {
707                    self.events.push(ParseEvent::TableRow(cells));
708                }
709            }
710            return true;
711        }
712
713        // End table if we were in one
714        if self.table_state.is_some() {
715            self.table_state = None;
716            self.state.in_table = None;
717            self.events.push(ParseEvent::TableEnd);
718        }
719
720        false
721    }
722
723    fn parse_inline_content(&mut self, line: &str) {
724        let elements = self.inline_parser.parse(line);
725
726        for element in elements {
727            let event = match element {
728                InlineElement::Text(s) => ParseEvent::Text(s),
729                InlineElement::Bold(s) => ParseEvent::Bold(s),
730                InlineElement::Italic(s) => ParseEvent::Italic(s),
731                InlineElement::BoldItalic(s) => ParseEvent::BoldItalic(s),
732                InlineElement::Underline(s) => ParseEvent::Underline(s),
733                InlineElement::Strikeout(s) => ParseEvent::Strikeout(s),
734                InlineElement::Code(s) => ParseEvent::InlineCode(s),
735                InlineElement::Link { text, url } => ParseEvent::Link { text, url },
736                InlineElement::Image { alt, url } => ParseEvent::Image { alt, url },
737                InlineElement::Footnote(s) => ParseEvent::Footnote(s),
738            };
739            self.events.push(event);
740        }
741
742        self.events.push(ParseEvent::Newline);
743    }
744
745    /// Parse a complete document.
746    pub fn parse_document(&mut self, content: &str) -> Vec<ParseEvent> {
747        let mut all_events = Vec::new();
748        for line in content.lines() {
749            all_events.extend(self.parse_line(line));
750        }
751        all_events.extend(self.finalize());
752        all_events
753    }
754
755    /// Finalize parsing, closing any open blocks.
756    pub fn finalize(&mut self) -> Vec<ParseEvent> {
757        self.events.clear();
758
759        if self.state.is_in_code() {
760            self.events.push(ParseEvent::CodeBlockEnd);
761            self.state.exit_code_block();
762            self.code_fence = None;
763        }
764
765        if self.state.block_type == Some(BlockType::Think) {
766            self.events.push(ParseEvent::ThinkBlockEnd);
767            self.state.exit_block();
768        }
769
770        if self.state.block_depth > 0 {
771            self.events.push(ParseEvent::BlockquoteEnd);
772            while self.state.block_depth > 0 {
773                self.state.exit_block();
774            }
775        }
776
777        if self.state.in_list {
778            self.exit_list_context();
779        }
780
781        if self.table_state.is_some() {
782            self.table_state = None;
783            self.state.in_table = None;
784            self.events.push(ParseEvent::TableEnd);
785        }
786
787        self.take_events()
788    }
789
790    /// Reset the parser to initial state.
791    pub fn reset(&mut self) {
792        self.state = ParseState::new();
793        self.inline_parser.reset();
794        self.code_fence = None;
795        self.table_state = None;
796        self.events.clear();
797        self.prev_was_empty = false;
798    }
799}
800
801// =============================================================================
802// Tests
803// =============================================================================
804
805#[cfg(test)]
806mod tests {
807    use super::*;
808
809    #[test]
810    fn test_parse_heading() {
811        let mut parser = Parser::new();
812        let events = parser.parse_line("# Hello World");
813        assert!(events.iter().any(|e| matches!(
814            e, ParseEvent::Heading { level: 1, content } if content == "Hello World"
815        )));
816    }
817
818    #[test]
819    fn test_parse_code_block() {
820        let mut parser = Parser::new();
821        let e1 = parser.parse_line("```rust");
822        assert!(e1.iter().any(
823            |e| matches!(e, ParseEvent::CodeBlockStart { language: Some(l), .. } if l == "rust")
824        ));
825        let e2 = parser.parse_line("let x = 1;");
826        assert!(e2
827            .iter()
828            .any(|e| matches!(e, ParseEvent::CodeBlockLine(s) if s == "let x = 1;")));
829        let e3 = parser.parse_line("```");
830        assert!(e3.iter().any(|e| matches!(e, ParseEvent::CodeBlockEnd)));
831    }
832
833    #[test]
834    fn test_parse_pre_tag() {
835        let mut parser = Parser::new();
836        let e1 = parser.parse_line("<pre>");
837        assert!(e1
838            .iter()
839            .any(|e| matches!(e, ParseEvent::CodeBlockStart { .. })));
840        let e2 = parser.parse_line("code");
841        assert!(e2.iter().any(|e| matches!(e, ParseEvent::CodeBlockLine(_))));
842        let e3 = parser.parse_line("</pre>");
843        assert!(e3.iter().any(|e| matches!(e, ParseEvent::CodeBlockEnd)));
844    }
845
846    #[test]
847    fn test_space_indented_code() {
848        let mut parser = Parser::new();
849        parser.set_code_spaces(true);
850        parser.parse_line(""); // Empty line first
851        let events = parser.parse_line("    let x = 1;");
852        assert!(events
853            .iter()
854            .any(|e| matches!(e, ParseEvent::CodeBlockStart { .. })));
855        assert!(events
856            .iter()
857            .any(|e| matches!(e, ParseEvent::CodeBlockLine(s) if s == "let x = 1;")));
858    }
859
860    #[test]
861    fn test_empty_line_collapsing() {
862        let mut parser = Parser::new();
863        let e1 = parser.parse_line("");
864        assert!(e1.iter().any(|e| matches!(e, ParseEvent::EmptyLine)));
865        let e2 = parser.parse_line("");
866        assert!(e2.is_empty()); // Collapsed
867        let e3 = parser.parse_line("text");
868        assert!(!e3.is_empty());
869        let e4 = parser.parse_line("");
870        assert!(e4.iter().any(|e| matches!(e, ParseEvent::EmptyLine)));
871    }
872
873    #[test]
874    fn test_parse_think_block_unicode() {
875        let mut parser = Parser::new();
876        let e1 = parser.parse_line("◁think▷");
877        assert!(e1.iter().any(|e| matches!(e, ParseEvent::ThinkBlockStart)));
878    }
879
#[test]
fn test_parse_list() {
    // A dash-prefixed line is expected to yield a Dash list item carrying
    // the text after the bullet.
    let mut parser = Parser::new();
    let parsed = parser.parse_line("- Item one");
    let found = parsed.iter().any(|ev| match ev {
        ParseEvent::ListItem {
            bullet: ListBullet::Dash,
            content,
            ..
        } => content == "Item one",
        _ => false,
    });
    assert!(found);
}
888
#[test]
fn test_parse_nested_list() {
    let mut parser = Parser::new();
    let _ = parser.parse_line("- Item 1");

    // Two leading spaces on the second item: expected indent is 2.
    let nested = parser.parse_line("  - Nested");
    let has_indent_two = nested
        .iter()
        .any(|ev| matches!(ev, ParseEvent::ListItem { indent: 2, .. }));
    assert!(has_indent_two);
}
899
#[test]
fn test_parse_ordered_list_numbering() {
    let mut parser = Parser::new();
    let _ = parser.parse_line("1. First");

    // The second item of the ordered list is expected to carry number 2.
    let second = parser.parse_line("2. Second");
    let numbered_two = second.iter().any(|ev| {
        matches!(
            ev,
            ParseEvent::ListItem {
                bullet: ListBullet::Ordered(2),
                ..
            }
        )
    });
    assert!(numbered_two);
}
914
#[test]
fn test_parse_blockquote() {
    // The `> ` marker is expected to be stripped from the quoted text.
    let mut parser = Parser::new();
    let quoted = parser.parse_line("> Quote text");
    let has_quote_line = quoted
        .iter()
        .any(|ev| matches!(ev, ParseEvent::BlockquoteLine(text) if text == "Quote text"));
    assert!(has_quote_line);
}
923
#[test]
fn test_parse_nested_blockquote() {
    // Two `>` markers are expected to open a blockquote at depth 2.
    let mut parser = Parser::new();
    let quoted = parser.parse_line(">> Nested quote");
    let opened_at_depth_two = quoted
        .iter()
        .any(|ev| matches!(ev, ParseEvent::BlockquoteStart { depth: 2 }));
    assert!(opened_at_depth_two);
}
932
#[test]
fn test_parse_hr() {
    // Each of the three rule spellings is fed to the same parser in turn and
    // must yield a HorizontalRule event.
    let mut parser = Parser::new();
    for rule in ["---", "***", "___"] {
        let events = parser.parse_line(rule);
        let is_rule = events
            .iter()
            .any(|ev| matches!(ev, ParseEvent::HorizontalRule));
        assert!(is_rule);
    }
}
949
#[test]
fn test_parse_table() {
    // Header, separator and data row are expected to map to their own events.
    let mut parser = Parser::new();

    let header = parser.parse_line("| A | B | C |");
    let saw_header = header
        .iter()
        .any(|ev| matches!(ev, ParseEvent::TableHeader(_)));
    assert!(saw_header);

    let separator = parser.parse_line("|---|---|---|");
    let saw_separator = separator
        .iter()
        .any(|ev| matches!(ev, ParseEvent::TableSeparator));
    assert!(saw_separator);

    let row = parser.parse_line("| 1 | 2 | 3 |");
    let saw_row = row.iter().any(|ev| matches!(ev, ParseEvent::TableRow(_)));
    assert!(saw_row);
}
960
#[test]
fn test_parse_think_block() {
    // `<think>` opens the block, inner lines are passed through verbatim,
    // and `</think>` closes it.
    let mut parser = Parser::new();

    let opening = parser.parse_line("<think>");
    let saw_start = opening
        .iter()
        .any(|ev| matches!(ev, ParseEvent::ThinkBlockStart));
    assert!(saw_start);

    let inner = parser.parse_line("Thinking...");
    let saw_line = inner
        .iter()
        .any(|ev| matches!(ev, ParseEvent::ThinkBlockLine(text) if text == "Thinking..."));
    assert!(saw_line);

    let closing = parser.parse_line("</think>");
    let saw_end = closing
        .iter()
        .any(|ev| matches!(ev, ParseEvent::ThinkBlockEnd));
    assert!(saw_end);
}
973
#[test]
fn test_first_indent_stripping() {
    // The very first line carries a 4-space indent; it is expected to be
    // stripped so the remainder still parses as a level-1 heading.
    let mut parser = Parser::new();
    let first = parser.parse_line("    # Hello");
    let is_heading = first
        .iter()
        .any(|ev| matches!(ev, ParseEvent::Heading { level: 1, content } if content == "Hello"));
    assert!(is_heading);
}
984
#[test]
fn test_parse_document() {
    // A whole document goes through `parse_document` in one call; the
    // heading and the complete fenced code block must both be reported.
    let mut parser = Parser::new();
    let source = "# Title\n\nSome text.\n\n```\ncode\n```";
    let all_events = parser.parse_document(source);

    let has_heading = all_events
        .iter()
        .any(|ev| matches!(ev, ParseEvent::Heading { level: 1, .. }));
    let has_code_start = all_events
        .iter()
        .any(|ev| matches!(ev, ParseEvent::CodeBlockStart { .. }));
    let has_code_end = all_events
        .iter()
        .any(|ev| matches!(ev, ParseEvent::CodeBlockEnd));

    assert!(has_heading);
    assert!(has_code_start);
    assert!(has_code_end);
}
998
#[test]
fn test_finalize_closes_blocks() {
    // The fence is opened but never closed; `finalize` is expected to emit
    // the missing CodeBlockEnd.
    let mut parser = Parser::new();
    let _ = parser.parse_line("```");
    let _ = parser.parse_line("code");

    let trailing = parser.finalize();
    let closed = trailing
        .iter()
        .any(|ev| matches!(ev, ParseEvent::CodeBlockEnd));
    assert!(closed);
}
1007
#[test]
fn test_is_block_is_inline() {
    // Block-level events.
    let heading = ParseEvent::Heading {
        level: 1,
        content: String::from("x"),
    };
    let code_start = ParseEvent::CodeBlockStart {
        language: None,
        indent: 0,
    };
    assert!(heading.is_block());
    assert!(code_start.is_block());

    // Inline events.
    let text = ParseEvent::Text(String::from("x"));
    let bold = ParseEvent::Bold(String::from("x"));
    assert!(text.is_inline());
    assert!(bold.is_inline());
}
1023
#[test]
fn test_first_indent_stripping_multibyte_whitespace() {
    // Regression test for a char-boundary panic when stripping the
    // first-line indent from later lines.
    //
    // Line 1: "  # Hello" = 2 ASCII spaces (2 bytes) + "# Hello", so a
    // byte-based implementation records first_indent = 2.
    //
    // Line 2: one fullwidth space U+3000 (3 bytes in UTF-8) + "World".
    // A byte-based check sees indent 3 >= 2 and slices `line[2..]`, which
    // lands inside U+3000 and panics with
    // "byte index 2 is not a char boundary".
    //
    // The fullwidth space is spelled `\u{3000}` rather than typed literally:
    // the raw character is indistinguishable from an ASCII space and is
    // easily lost to whitespace normalization, which would silently turn
    // this test into something that no longer exercises the bug.
    let mut parser = Parser::new();

    // First line: 2 ASCII spaces = 2 bytes of indent.
    let line1 = "  # Hello";
    assert_eq!(line1.len() - line1.trim_start().len(), 2);
    let _ = parser.parse_line(line1);

    // Second line: byte 2 falls inside the 3-byte fullwidth space.
    let line2 = "\u{3000}World";
    assert!(!line2.is_char_boundary(2)); // fixture sanity check

    // Must not panic, and must still produce output.
    let events = parser.parse_line(line2);
    assert!(!events.is_empty());
}
1052
#[test]
fn test_space_indented_code_strip_with_fullwidth() {
    // Regression test for a char-boundary panic when stripping the indent
    // from a continuation line of a space-indented code block.
    //
    // Scenario:
    // 1. Enter a space-indented code block with 4 ASCII spaces.
    // 2. Continue with 2 fullwidth spaces (U+3000, 3 bytes each = 6 bytes).
    // 3. A byte-based check sees `line.len() >= 4` and slices `line[4..]`.
    // 4. Byte 4 is inside the second fullwidth space (bytes 3..6) -> panic.
    //
    // The fullwidth spaces are spelled `\u{3000}` so they stay visible in
    // the source and cannot be normalized into ASCII spaces by tooling.
    let mut parser = Parser::new();
    parser.set_code_spaces(true);

    // An empty line is required before space-indented code.
    let _ = parser.parse_line("");

    // First line: 4 ASCII spaces trigger the space-indented code block.
    let line1 = "    first line of code";
    let events1 = parser.parse_line(line1);
    assert!(events1
        .iter()
        .any(|e| matches!(e, ParseEvent::CodeBlockStart { .. })));

    // Second line: byte 4 is NOT a char boundary.
    let line2 = "\u{3000}\u{3000}second line";
    assert!(!line2.is_char_boundary(4)); // fixture sanity check

    // Must not panic, and must still produce output.
    let events2 = parser.parse_line(line2);
    assert!(!events2.is_empty());
}
1083
#[test]
fn test_list_item_indent_with_fullwidth_spaces() {
    // Regression test: list indent must be measured in characters, not
    // bytes. A list item preceded by one fullwidth space (U+3000, 3 bytes)
    // would get indent 3 from a byte-based calculation, which could
    // incorrectly affect the nesting level.
    //
    // The fullwidth space is spelled `\u{3000}` so it stays visible in the
    // source and cannot be normalized into an ASCII space by tooling.
    let mut parser = Parser::new();

    // Top-level list item.
    let events1 = parser.parse_line("- top level");
    assert!(events1
        .iter()
        .any(|e| matches!(e, ParseEvent::ListItem { indent: 0, .. })));

    // List item indented by 1 fullwidth space (3 bytes, 1 char).
    let line2 = "\u{3000}- nested item";
    let events2 = parser.parse_line(line2);

    // Fail loudly if no list item was produced, instead of silently
    // skipping the indent check.
    let Some(ParseEvent::ListItem { indent, .. }) = events2
        .iter()
        .find(|e| matches!(e, ParseEvent::ListItem { .. }))
    else {
        panic!("Should have parsed list item");
    };

    // Byte-based would give 3; char-based gives 1.
    assert_eq!(
        *indent, 1,
        "Indent should be 1 (char-based), not 3 (byte-based)"
    );
}
1113
#[test]
fn test_space_indented_code_dedent_with_fullwidth() {
    // Regression test: dedent detection for space-indented code must count
    // characters, not bytes. A line starting with 2 fullwidth spaces
    // (U+3000, 6 bytes total) would NOT trigger a dedent under a byte-based
    // check (6 >= 4), but it should, because 2 chars < 4 chars.
    //
    // The fullwidth spaces are spelled `\u{3000}` so they stay visible in
    // the source and cannot be normalized into ASCII spaces by tooling.
    let mut parser = Parser::new();
    parser.set_code_spaces(true);

    // An empty line is required before space-indented code.
    let _ = parser.parse_line("");

    // Enter the code block with 4 ASCII spaces.
    let events1 = parser.parse_line("    code line");
    assert!(events1
        .iter()
        .any(|e| matches!(e, ParseEvent::CodeBlockStart { .. })));

    // 2 fullwidth spaces: 6 bytes but only 2 characters of indent.
    let line2 = "\u{3000}\u{3000}not code anymore";
    let byte_indent = line2.len() - line2.trim_start().len();
    let char_indent = line2.chars().take_while(|c| c.is_whitespace()).count();
    assert_eq!(byte_indent, 6); // fixture sanity check: 2 * 3 bytes
    assert_eq!(char_indent, 2); // fixture sanity check: 2 characters

    let events2 = parser.parse_line(line2);

    // The 2-char indent is below the required 4, so the block must close.
    assert!(
        events2.iter().any(|e| matches!(e, ParseEvent::CodeBlockEnd)),
        "Should have exited code block with only 2-char indent"
    );
}
1145}