kimun_notes/components/text_editor/markdown/
parsed_buffer.rs

1//! `ParsedBuffer`: the full-buffer parsed representation produced by
2//! `parse()`. Owns the per-row classification (`kinds`), the
3//! lazy-depth tracking that gates reset-boundary detection, and the
4//! splice/parse-range machinery used by the incremental editor view.
5//!
6//! See openspec changes `parse-reset-boundaries` and
7//! `parse-reset-boundaries-v2` for the design.
8
9use super::super::parse_incremental::LineConstructKind;
10use super::super::text_coords::byte_col_to_char_col;
11use super::detect::{detect_image_placeholders, detect_wikilinks};
12use super::{
13    Element, ElementKind, PARSER_OPTIONS, ParsedLine, leading_ws_byte_len, list_marker_len,
14    tag_to_kind,
15};
16use pulldown_cmark::{CodeBlockKind, Event, Parser, Tag, TagEnd};
17use std::ops::Range;
18
19#[derive(Clone)]
20pub struct ParsedBuffer {
21    pub lines: Vec<ParsedLine>,
22    pub kinds: Vec<LineConstructKind>,
23    /// Sorted, deduped row indices `b` where pulldown-cmark's parser
24    /// state is provably reset — i.e. parsing `&lines[b..j]` in
25    /// isolation produces the same `ParsedLine` and
26    /// `LineConstructKind` for row `b` as parsing the full buffer
27    /// would, for every later boundary `j`. Used by
28    /// `parse_incremental::expand_to_reset_boundary` so the
29    /// incremental-parse widening is provably-equivalent to a fresh
30    /// parse over the spliced range — no post-slice verification
31    /// needed in release.
32    ///
33    /// Always contains `0` and `lines.len()` as sentinel boundaries.
34    /// Conservative starting set: only Blank-prefixed rows after an
35    /// `Event::End` of a top-level block. Long buffers without blank
36    /// separators degrade to full-rebuild on every edit (acceptable
37    /// — same behaviour as today's `widen_to_safe` + cap-trip path
38    /// in that regime).
39    pub reset_boundaries: Vec<usize>,
40    /// Per-row lazy-construct depth AFTER processing the row's open
41    /// events (i.e. the prefix sum is applied row-by-row INCLUDING
42    /// the current row's deltas before being stored). Counts ONLY
43    /// constructs that CommonMark allows to extend across blank rows
44    /// (List, BlockQuote, IndentedCode, HtmlBlock — see
45    /// [`is_lazy_continuable_tag`]). Fenced code, Paragraph, Heading
46    /// are NOT counted because their parse state is reset at blank
47    /// rows or single-row terminators.
48    ///
49    /// `lazy_depth[r] == 0` is a necessary precondition for treating
50    /// row `r` as a reset boundary: at depth 0 there is no open
51    /// lazy-extendable construct that a fresh parser starting at `r`
52    /// could miss.
53    pub lazy_depth: Vec<u32>,
54}
55
56impl ParsedBuffer {
57    /// Parse the entire editor buffer in a single pulldown-cmark pass.
58    ///
59    /// Returns a [`ParsedBuffer`] whose `lines` contains one `ParsedLine`
60    /// per input row (multi-row markdown elements split per row) and whose
61    /// `kinds` contains the per-row [`LineConstructKind`] classification
62    /// that drives safe-boundary widening in `parse_incremental`.
63    ///
64    /// The pulldown-cmark event walk classifies the major constructs
65    /// inline; three short O(n) post-passes (list-continuation,
66    /// blockquote-depth, setext-underline) refine the result. No second
67    /// invocation of the pulldown parser.
68    pub fn parse(lines: &[String]) -> ParsedBuffer {
69        // Build joined buffer and per-line byte-offset table.
70        let total_bytes: usize =
71            lines.iter().map(|l| l.len()).sum::<usize>() + lines.len().saturating_sub(1);
72        let mut joined = String::with_capacity(total_bytes);
73        let mut line_starts: Vec<usize> = Vec::with_capacity(lines.len() + 1);
74        for (i, line) in lines.iter().enumerate() {
75            line_starts.push(joined.len());
76            joined.push_str(line);
77            if i + 1 < lines.len() {
78                joined.push('\n');
79            }
80        }
81        // Sentinel past-end entry so binary_search on a byte offset that falls
82        // exactly on the last line's content still returns a valid `Err(row)`
83        // without landing on an `Ok` match at the real end. The `+ 1` ensures
84        // the sentinel is strictly greater than any real byte offset, including
85        // the trailing '\n' bytes between lines.
86        line_starts.push(joined.len() + 1);
87
88        // Pre-allocate per-line state.
89        let mut content_vis: Vec<Vec<bool>> = lines
90            .iter()
91            .map(|l| vec![false; l.chars().count()])
92            .collect();
93        let mut elements: Vec<Vec<Element>> = vec![Vec::new(); lines.len()];
94        let mut list_sigil_end: Vec<Option<usize>> = vec![None; lines.len()];
95
96        // Element stack: (start_row, start_col_char, kind).
97        // Spans are emitted on End events, split across rows they cross.
98        let mut stack: Vec<(usize, usize, ElementKind)> = Vec::new();
99
100        // `list_sigil_end[row]` is filled directly when we see `Start(Item)` on
101        // that row — we walk the line from the Item's start column past the
102        // marker (`- `, `* `, `+ `, or `N. `). This handles empty items (`- `)
103        // that have no Text event inside.
104
105        // Helper closure for pushing a multi-row span to `elements`.
106        let emit_span = |row_s: usize,
107                         col_s: usize,
108                         row_e: usize,
109                         col_e: usize,
110                         kind: ElementKind,
111                         elements: &mut Vec<Vec<Element>>,
112                         lines: &[String]| {
113            if row_s == row_e {
114                if col_e > col_s && row_s < elements.len() {
115                    elements[row_s].push(Element {
116                        start_char: col_s,
117                        end_char: col_e,
118                        kind,
119                    });
120                }
121                return;
122            }
123            // Multi-row: first row extends to end-of-line, middle rows cover whole line,
124            // last row covers 0..col_e.
125            if row_s < elements.len() {
126                let end_first = lines[row_s].chars().count();
127                if end_first > col_s {
128                    elements[row_s].push(Element {
129                        start_char: col_s,
130                        end_char: end_first,
131                        kind,
132                    });
133                }
134            }
135            for r in (row_s + 1)..row_e {
136                if r < elements.len() {
137                    let line_len = lines[r].chars().count();
138                    if line_len > 0 {
139                        elements[r].push(Element {
140                            start_char: 0,
141                            end_char: line_len,
142                            kind,
143                        });
144                    }
145                }
146            }
147            if row_e < elements.len() && col_e > 0 {
148                elements[row_e].push(Element {
149                    start_char: 0,
150                    end_char: col_e,
151                    kind,
152                });
153            }
154        };
155
156        // Per-line construct classification: initially Blank vs Plain based on
157        // whitespace, then updated during the event loop and post-passes below.
158        let mut kinds: Vec<LineConstructKind> = lines
159            .iter()
160            .map(|l| {
161                if l.trim().is_empty() {
162                    LineConstructKind::Blank
163                } else {
164                    LineConstructKind::Plain
165                }
166            })
167            .collect();
168
169        // Reset-boundary detection via depth prefix-sum. During the
170        // event walk we record per-row depth deltas (+1 at the start
171        // row of every top-level block, -1 at the row AFTER its end).
172        // After the walk, a prefix sum gives the depth at the start
173        // of each row — depth==0 means pulldown's parser is in the
174        // "between blocks" state at that row, with no open
175        // construct that could lazy-continue. A row at depth 0 whose
176        // own `kinds` is Blank (or EOF) is a true reset point.
177        //
178        // Depth deltas (not an inline counter) handle pulldown's
179        // overlapping nesting: a Paragraph inside an Item inside a
180        // List emits Start events at overlapping rows; an inline
181        // depth counter that decrements on the innermost End would
182        // see depth==0 prematurely while the outer List is still
183        // open. The delta+prefix-sum approach correctly sums all
184        // open constructs.
185        let mut reset_boundaries: Vec<usize> = Vec::new();
186        // +1 at the row a lazy-continuable construct opens, -1 at the
187        // row past where it closes. Length is `lines.len() + 1` so
188        // end-of-buffer drops have a sink (read by the prefix sum
189        // only up to `lines.len() - 1`). See [`is_lazy_continuable_tag`].
190        let mut lazy_delta: Vec<i32> = vec![0; lines.len() + 1];
191        // CodeBlock kind tracker. `Some(true)` = an Indented (lazy)
192        // CodeBlock is open; `Some(false)` = a Fenced (non-lazy)
193        // CodeBlock is open; `None` = no CodeBlock open. CommonMark
194        // does not nest code blocks, so a single Option captures the
195        // kind across the matching Start/End pair — and reading None
196        // on End signals an unmatched-End invariant violation.
197        let mut indented_codeblock_open: Option<bool> = None;
198
199        // Track fenced/indented code block byte ranges for F4 (label suppression).
200        // Populated during the main parser pass below and converted to per-line
201        // flags before the per-line label scan.
202        let mut code_block_byte_ranges: Vec<(usize, usize)> = Vec::new();
203        let mut code_block_depth = 0u32;
204        let mut code_block_start: Option<usize> = None;
205
206        let parser = Parser::new_ext(&joined, PARSER_OPTIONS);
207        for (event, range) in parser.into_offset_iter() {
208            let (sr, sc) = byte_to_row_col(range.start, lines, &line_starts);
209            let (er, ec) = byte_to_row_col(range.end, lines, &line_starts);
210
211            // V2 lazy_delta tracking — only constructs that
212            // lazy-extend across blank rows per CommonMark §4.4 / §4.6
213            // / §5.1 / §5.2. See [`is_lazy_continuable_tag`].
214            //
215            // End events use an unclamped row for the drop position:
216            // pulldown's `range.end` for an end-of-buffer block lands
217            // past the last content byte, which `byte_to_row_col`
218            // would clamp to `lines.len() - 1`. Dropping AT the last
219            // content row would zero `lazy_depth` there even though
220            // the construct still covers it.
221            //
222            // The CodeBlock arm is placed first so it absorbs both
223            // variants before the generic `is_lazy_continuable_tag`
224            // arm sees them — Indented would otherwise double-count.
225            match &event {
226                Event::Start(Tag::CodeBlock(kind)) => {
227                    let is_indented = matches!(kind, CodeBlockKind::Indented);
228                    indented_codeblock_open = Some(is_indented);
229                    if is_indented && sr < lazy_delta.len() {
230                        lazy_delta[sr] += 1;
231                    }
232                }
233                Event::Start(tag) if is_lazy_continuable_tag(tag) && sr < lazy_delta.len() => {
234                    lazy_delta[sr] += 1;
235                }
236                Event::End(TagEnd::CodeBlock) => {
237                    let was_indented = indented_codeblock_open.take();
238                    debug_assert!(
239                        was_indented.is_some(),
240                        "Event::End(CodeBlock) without matching Start at byte {}",
241                        range.start,
242                    );
243                    if was_indented == Some(true) {
244                        let (er_lazy, _) =
245                            byte_to_row_col_unclamped(range.end, lines, &line_starts);
246                        let drop_at = er_lazy.min(lines.len());
247                        if drop_at < lazy_delta.len() {
248                            lazy_delta[drop_at] -= 1;
249                        }
250                    }
251                }
252                Event::End(tag_end) if is_lazy_continuable_tag_end(tag_end) => {
253                    let (er_lazy, _) = byte_to_row_col_unclamped(range.end, lines, &line_starts);
254                    let drop_at = er_lazy.min(lines.len());
255                    if drop_at < lazy_delta.len() {
256                        lazy_delta[drop_at] -= 1;
257                    }
258                }
259                _ => {}
260            }
261
262            match event {
263                Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(_))) => {
264                    if code_block_depth == 0 {
265                        code_block_start = Some(range.start);
266                    }
267                    code_block_depth += 1;
268                    // Opening fence marker row.
269                    if sr < kinds.len() {
270                        kinds[sr] = LineConstructKind::FenceMarker;
271                    }
272                    // Rows between opening and closing fences are content.
273                    // Pulldown can emit `er == sr` for a single-row degenerate
274                    // fence (no body, no closing fence on a separate row) or
275                    // `er == lines.len()` for an unclosed fence at EOF; both
276                    // make the `[(sr+1)..er]` slice invalid.
277                    let content_end = er.min(kinds.len());
278                    if content_end > sr + 1 {
279                        kinds[(sr + 1)..content_end].fill(LineConstructKind::FenceContent);
280                    }
281                    // Closing fence marker row (er is the row of the closing
282                    // ```). When `er == sr` (degenerate single-row fence) or
283                    // `er >= lines.len()` (unclosed fence at EOF) there is
284                    // no separate closing row to mark.
285                    if er > sr && er < kinds.len() {
286                        kinds[er] = LineConstructKind::FenceMarker;
287                    }
288                }
289                Event::Start(Tag::CodeBlock(CodeBlockKind::Indented)) => {
290                    if code_block_depth == 0 {
291                        code_block_start = Some(range.start);
292                    }
293                    code_block_depth += 1;
294                    // Use the UNCLAMPED end row as the exclusive upper bound:
295                    // pulldown's exclusive byte end lands at the start of the
296                    // row *after* the block's content (or `lines.len()` at EOF),
297                    // so the content rows are `sr..end_row` — this already
298                    // excludes both a following less-indented paragraph
299                    // (CommonMark §4.4: such a line ends the block) and any
300                    // trailing blank line. The previous `er + 1` over-included
301                    // that following row.
302                    let (end_row, _) = byte_to_row_col_unclamped(range.end, lines, &line_starts);
303                    let hi = end_row.min(kinds.len());
304                    if hi > sr {
305                        kinds[sr..hi].fill(LineConstructKind::IndentedCode);
306                        // Invariant the bound above guarantees: the block never
307                        // ends on a trailing blank, so no trim is needed.
308                        debug_assert!(
309                            hi <= sr + 1 || !lines[hi - 1].trim().is_empty(),
310                            "indented block ended on trailing blank row {}",
311                            hi - 1
312                        );
313                    }
314                }
315                Event::End(TagEnd::CodeBlock) => {
316                    code_block_depth = code_block_depth.saturating_sub(1);
317                    if code_block_depth == 0
318                        && let Some(start) = code_block_start.take()
319                    {
320                        code_block_byte_ranges.push((start, range.end));
321                    }
322                }
323                Event::Start(ref tag) if let Some(kind) = tag_to_kind(tag) => {
324                    if matches!(
325                        kind,
326                        ElementKind::HeadingH1 | ElementKind::HeadingH2 | ElementKind::HeadingH3
327                    ) {
328                        kinds[sr] = LineConstructKind::Heading;
329                    }
330                    stack.push((sr, sc, kind));
331                }
332                Event::End(
333                    TagEnd::Strong
334                    | TagEnd::Emphasis
335                    | TagEnd::Strikethrough
336                    | TagEnd::Link
337                    | TagEnd::Heading(_)
338                    | TagEnd::BlockQuote(_),
339                ) => {
340                    if let Some((s_r, s_c, k)) = stack.pop() {
341                        emit_span(s_r, s_c, er, ec, k, &mut elements, lines);
342                    }
343                }
344                Event::Start(Tag::Item)
345                    // Pulldown-cmark's Item range does not always start at the
346                    // marker character — for nested items it starts at the
347                    // indentation-beyond-the-parent boundary, which can be
348                    // several chars before the marker. Scan the line from col
349                    // 0 to find leading whitespace + marker instead of relying
350                    // on `sc`.
351                    if sr < lines.len() && list_sigil_end[sr].is_none() =>
352                {
353                    kinds[sr] = LineConstructKind::ListMarker;
354                    let line = lines[sr].as_str();
355                    let ws_end = leading_ws_byte_len(line);
356                    if let Some(len) = list_marker_len(&line[ws_end..]) {
357                        // ws_end is byte length but ASCII whitespace makes it
358                        // equal to the char count.
359                        list_sigil_end[sr] = Some(ws_end + len);
360                    }
361                }
362                Event::Start(Tag::HtmlBlock) => {
363                    let hi = (er + 1).min(kinds.len());
364                    if hi > sr {
365                        kinds[sr..hi].fill(LineConstructKind::HtmlBlock);
366                    }
367                }
368                Event::Html(_) => {
369                    // Block-level HTML body. Already classified by the
370                    // enclosing `Tag::HtmlBlock` arm; this branch is a
371                    // safety-net for any row pulldown emits between the
372                    // tag boundaries but defers to fence/code kinds when
373                    // the rows happen to overlap (rare, but possible
374                    // with malformed input).
375                    let hi = (er + 1).min(kinds.len());
376                    if hi > sr {
377                        for kind in &mut kinds[sr..hi] {
378                            if !matches!(
379                                *kind,
380                                LineConstructKind::FenceContent
381                                    | LineConstructKind::IndentedCode
382                                    | LineConstructKind::FenceMarker
383                            ) {
384                                *kind = LineConstructKind::HtmlBlock;
385                            }
386                        }
387                    }
388                }
389                // Inline HTML (`<span>`, `<br/>`, etc.) lives inside a
390                // paragraph; it must NOT promote the row to HtmlBlock,
391                // because `HtmlBlock` is a non-safe widening boundary
392                // (see `parse_incremental::is_safe_boundary`). Painting
393                // the paragraph row as HtmlBlock would force widening
394                // to walk past it on every nearby edit. Leave kind as-is.
395                Event::InlineHtml(_) => {}
396                Event::End(TagEnd::Item) => {}
397                Event::Code(ref code_text) if sr == er && sr < lines.len() => {
398                    // Inline code — always single-line in practice.
399                    let code_len = code_text.chars().count();
400                    let range_char_len = ec.saturating_sub(sc);
401                    let sigil_each = range_char_len.saturating_sub(code_len) / 2;
402                    let cs = sc + sigil_each;
403                    for vis in content_vis[sr].iter_mut().skip(cs).take(code_len) {
404                        *vis = true;
405                    }
406                    elements[sr].push(Element {
407                        start_char: sc,
408                        end_char: ec,
409                        kind: ElementKind::InlineCode,
410                    });
411                }
412                Event::Text(_) | Event::SoftBreak | Event::HardBreak => {
413                    // Mark content_vis for each row the event touches.
414                    if sr == er {
415                        if sr < content_vis.len() {
416                            for vis in content_vis[sr]
417                                .iter_mut()
418                                .skip(sc)
419                                .take(ec.saturating_sub(sc))
420                            {
421                                *vis = true;
422                            }
423                        }
424                    } else {
425                        // First row: from sc to end-of-line.
426                        if sr < content_vis.len() {
427                            let line_chars = content_vis[sr].len();
428                            for vis in content_vis[sr]
429                                .iter_mut()
430                                .skip(sc)
431                                .take(line_chars.saturating_sub(sc))
432                            {
433                                *vis = true;
434                            }
435                        }
436                        // Middle rows: whole line.
437                        for r in (sr + 1)..er {
438                            if r < content_vis.len() {
439                                for vis in content_vis[r].iter_mut() {
440                                    *vis = true;
441                                }
442                            }
443                        }
444                        // Last row: 0..ec.
445                        if er < content_vis.len() {
446                            for vis in content_vis[er].iter_mut().take(ec) {
447                                *vis = true;
448                            }
449                        }
450                    }
451                }
452                _ => {}
453            }
454        }
455
456        // Build a per-line flag: `true` = this line is inside a fenced/indented code block.
457        // Lines whose byte range overlaps any code-block range are suppressed from label scan.
458        let line_in_code_block: Vec<bool> = {
459            let mut flags = vec![false; lines.len()];
460            for (cb_start, cb_end) in &code_block_byte_ranges {
461                // Find all lines that overlap [cb_start, cb_end).
462                for (row, &ls) in line_starts[..lines.len()].iter().enumerate() {
463                    let le = ls + lines[row].len();
464                    // Overlap when line_start < cb_end and line_end > cb_start.
465                    // We skip the fence delimiter lines (which contain the ``` markers)
466                    // by checking if the line's content byte range overlaps the code
467                    // block's content range. The CodeBlock event range in pulldown-cmark
468                    // covers the opening fence through the closing fence, so this
469                    // conservative check marks all lines within the span as in-block.
470                    if ls < *cb_end && le > *cb_start {
471                        flags[row] = true;
472                    }
473                }
474            }
475            flags
476        };
477
478        // Per-line post-processing: heading trailing whitespace, wikilinks, bitmasks.
479        let mut out: Vec<ParsedLine> = Vec::with_capacity(lines.len());
480        for (row, line) in lines.iter().enumerate() {
481            let mut cv = std::mem::take(&mut content_vis[row]);
482            let mut els = std::mem::take(&mut elements[row]);
483
484            // Heading trailing-whitespace fix.
485            for e in &els {
486                if matches!(
487                    e.kind,
488                    ElementKind::HeadingH1 | ElementKind::HeadingH2 | ElementKind::HeadingH3
489                ) {
490                    for i in (e.start_char..e.end_char).rev() {
491                        match line.chars().nth(i) {
492                            Some(' ' | '\t') => {
493                                if i < cv.len() {
494                                    cv[i] = true;
495                                }
496                            }
497                            _ => break,
498                        }
499                    }
500                }
501            }
502
503            detect_wikilinks(line, &mut cv, &mut els);
504            let image_placeholders = detect_image_placeholders(line, &mut cv, &mut els);
505
506            // Scan for #hashtag spans and emit Label elements.
507            // Guards (all must pass):
508            //   F4: skip if this line is inside a fenced code block
509            //   F2: word-boundary guard (handled inside label_matches)
510            //   F3: skip if the span overlaps InlineCode, Link, WikiLink, or Image
511            if !line_in_code_block[row] {
512                let line_str = line.as_str();
513                for lm in kimun_core::note::scan::label_matches(line_str) {
514                    // Convert byte offsets to char offsets for Element storage.
515                    let start_char = line_str[..lm.byte_start].chars().count();
516                    let end_char =
517                        start_char + line_str[lm.byte_start..lm.byte_end].chars().count();
518                    // F3: overlap guard (InlineCode + Link + WikiLink + Image)
519                    let overlaps_existing = els.iter().any(|e| {
520                        matches!(
521                            e.kind,
522                            ElementKind::InlineCode
523                                | ElementKind::Link
524                                | ElementKind::WikiLink
525                                | ElementKind::Image
526                        ) && !(end_char <= e.start_char || start_char >= e.end_char)
527                    });
528                    if !overlaps_existing {
529                        els.push(Element {
530                            start_char,
531                            end_char,
532                            kind: ElementKind::Label,
533                        });
534                    }
535                }
536            }
537            // Re-sort so elem_vis / elem_index precomputation sees elements in line order.
538            els.sort_by_key(|e| e.start_char);
539
540            // F6: use u16 for elem_index (supports up to 65535 elements per line)
541            debug_assert!(
542                els.len() < u16::MAX as usize,
543                "Too many elements on a single line ({})",
544                els.len()
545            );
546            let total = line.chars().count();
547            let mut elem_vis = vec![false; total];
548            let mut elem_index = vec![0u16; total];
549            // OR-ed emphasis mask: unlike elem_index (innermost wins), every
550            // covering Bold/Italic/Strikethrough contributes, so a Link or
551            // WikiLink nested in `**…**` / `*…*` keeps the outer emphasis.
552            let mut modifier_mask = vec![0u8; total];
553            for (i, e) in els.iter().enumerate() {
554                let tag = (i + 1) as u16;
555                let bit = super::modifier_bit(e.kind);
556                for pos in e.start_char..e.end_char {
557                    if pos < total {
558                        elem_vis[pos] = true;
559                        elem_index[pos] = tag;
560                        modifier_mask[pos] |= bit;
561                    }
562                }
563            }
564
565            // Blockquote depth = number of Blockquote elements covering this
566            // line. Derived from pulldown's structure (via `emit_span`) rather
567            // than counting leading `>`, so lazy-continuation lines (CommonMark
568            // §5.1 — quote text with no `>` prefix) and nested quotes report the
569            // correct depth and get the bar gutter, not just the quote color.
570            let blockquote_depth = {
571                let n = els
572                    .iter()
573                    .filter(|e| e.kind == ElementKind::Blockquote)
574                    .count();
575                (n > 0).then_some(n.min(u8::MAX as usize) as u8)
576            };
577
578            out.push(ParsedLine {
579                elements: els,
580                content_vis: cv,
581                elem_vis,
582                elem_index,
583                modifier_mask,
584                list_sigil_end: list_sigil_end[row],
585                image_placeholders,
586                blockquote_depth,
587            });
588        }
589
590        // Post-pass: blockquote depth (N = number of leading `>` characters).
591        // Done before the setext post-pass so that a blockquoted heading line
592        // is not mis-treated as setext text.
593        for row in 0..kinds.len() {
594            if !matches!(
595                kinds[row],
596                LineConstructKind::Plain | LineConstructKind::Blank
597            ) {
598                continue;
599            }
600            let line = &lines[row];
601            let mut depth: u8 = 0;
602            for ch in line.chars() {
603                match ch {
604                    '>' => depth = depth.saturating_add(1),
605                    ' ' | '\t' => continue,
606                    _ => break,
607                }
608            }
609            if depth > 0 {
610                kinds[row] = LineConstructKind::Blockquote(depth);
611            }
612        }
613
614        // Post-pass: setext underline classification.
615        // Pulldown reports setext headings as Heading events spanning the text
616        // line AND the underline line. We want to: (a) classify the underline
617        // row as SetextUnderline, (b) reset the heading text row to Plain so
618        // widening treats it as a safe boundary above.
619        for row in 0..lines.len().saturating_sub(1) {
620            if kinds[row] == LineConstructKind::Heading {
621                let next = &lines[row + 1];
622                let trimmed = next.trim();
623                if !trimmed.is_empty()
624                    && (trimmed.chars().all(|c| c == '=') || trimmed.chars().all(|c| c == '-'))
625                {
626                    kinds[row + 1] = LineConstructKind::SetextUnderline;
627                    kinds[row] = LineConstructKind::Plain;
628                }
629            }
630        }
631
632        // Post-pass: list-item continuation rows.
633        // Any Plain row immediately following a ListMarker or ListContinuation
634        // row is itself a continuation (lazy continuation, indented body, etc.).
635        for row in 1..kinds.len() {
636            if matches!(
637                kinds[row],
638                LineConstructKind::Plain | LineConstructKind::IndentedCode
639            ) && matches!(
640                kinds[row - 1],
641                LineConstructKind::ListMarker | LineConstructKind::ListContinuation
642            ) {
643                kinds[row] = LineConstructKind::ListContinuation;
644            }
645        }
646
647        // Prefix-sum lazy_delta → per-row lazy_depth, record reset
648        // boundaries at rows where lazy_depth == 0 AND kinds[r] is
649        // Blank. The lazy_depth==0 condition rules out blank rows
650        // inside a lazy-continuable construct (IndentedCode multi-
651        // chunk, etc.) where splicing across the blank would diverge
652        // from a fresh parse.
653        let mut lazy_depth_acc: i32 = 0;
654        let mut lazy_depth: Vec<u32> = Vec::with_capacity(lines.len());
655        for r in 0..lines.len() {
656            lazy_depth_acc += lazy_delta[r];
657            debug_assert!(
658                lazy_depth_acc >= 0,
659                "lazy_depth went negative at row {r}: delta history is unbalanced"
660            );
661            // `as u32` is correct under the assert (always non-negative
662            // here); in a hypothetical release with imbalanced deltas
663            // it would wrap to a very large value and surface as a
664            // panic in the boundary check below — preferable to a
665            // silent `.max(0)` clamp that would mask the bug.
666            lazy_depth.push(lazy_depth_acc as u32);
667            if lazy_depth_acc == 0 && kinds[r] == LineConstructKind::Blank {
668                reset_boundaries.push(r);
669            }
670        }
671        // Sentinels: 0 (start of buffer), lines.len() (past-end).
672        // Both make `expand_to_reset_boundary`'s unwrap_or fallbacks
673        // unreachable in a well-formed set.
674        reset_boundaries.push(0);
675        reset_boundaries.push(lines.len());
676        reset_boundaries.sort_unstable();
677        reset_boundaries.dedup();
678
679        ParsedBuffer {
680            lines: out,
681            kinds,
682            reset_boundaries,
683            lazy_depth,
684        }
685    }
686
687    /// O(N) placeholder buffer matching `lines`'s row count, every row
688    /// classified `Plain`, every char marked content-visible, no
689    /// elements. Used by `view.update`'s async-fallback path (perf #9
690    /// in the holistic review) to install a structurally-correct
691    /// `ParsedBuffer` cheaply while the real `ParsedBuffer::parse`
692    /// runs on a background tokio task. Render produces unstyled
693    /// markdown for one frame until the async result is installed
694    /// via `install_full_parse`.
695    pub fn placeholder(lines: &[String]) -> ParsedBuffer {
696        let mut out = Vec::with_capacity(lines.len());
697        for line in lines {
698            let total = line.chars().count();
699            out.push(ParsedLine {
700                elements: Vec::new(),
701                content_vis: vec![true; total],
702                elem_vis: vec![false; total],
703                elem_index: vec![0; total],
704                modifier_mask: vec![0; total],
705                list_sigil_end: None,
706                image_placeholders: Vec::new(),
707                blockquote_depth: None,
708            });
709        }
710        let kinds = vec![LineConstructKind::Plain; lines.len()];
711        let reset_boundaries = if lines.is_empty() {
712            vec![0]
713        } else {
714            vec![0, lines.len()]
715        };
716        let lazy_depth = vec![0u32; lines.len()];
717        ParsedBuffer {
718            lines: out,
719            kinds,
720            reset_boundaries,
721            lazy_depth,
722        }
723    }
724
725    /// Parse a contiguous slice of `lines` as if it were a standalone document.
726    ///
727    /// **Boundary contract:** the caller must pass a `range` whose `start` and
728    /// `end` land on safe construct boundaries (verified by
729    /// `parse_incremental::widen_to_safe` or `expand_to_reset_boundary`).
730    /// This function does not validate the contract — passing a mid-fence
731    /// range will produce a `ParsedBuffer` that mis-classifies the
732    /// boundary lines.
733    ///
734    /// Returns a `ParsedBuffer` whose `lines.len() == kinds.len() == range.len()`.
735    /// The returned `reset_boundaries` are in slice-local index space
736    /// (`0..range.len()`); `splice` shifts them by `range.start` when
737    /// merging into the parent buffer's boundary set.
738    pub fn parse_range(lines: &[String], range: Range<usize>) -> ParsedBuffer {
739        Self::parse(&lines[range])
740    }
741
742    /// Replace `self.lines[range]` and `self.kinds[range]` with the contents
743    /// of `other`. Both `other` vectors must have `range.len()` entries.
744    pub fn splice(&mut self, range: Range<usize>, other: ParsedBuffer) {
745        debug_assert!(
746            other.lines.len() == other.kinds.len(),
747            "splice: other has mismatched internal lengths (lines={} kinds={})",
748            other.lines.len(),
749            other.kinds.len(),
750        );
751        debug_assert!(
752            other.lines.len() == range.len(),
753            "splice: other.lines.len() ({}) != range.len() ({})",
754            other.lines.len(),
755            range.len(),
756        );
757        debug_assert!(
758            other.kinds.len() == range.len(),
759            "splice: other.kinds.len() ({}) != range.len() ({})",
760            other.kinds.len(),
761            range.len(),
762        );
763        debug_assert!(
764            other.lazy_depth.len() == range.len(),
765            "splice: other.lazy_depth.len() ({}) != range.len() ({})",
766            other.lazy_depth.len(),
767            range.len(),
768        );
769        self.lines.splice(range.clone(), other.lines);
770        self.kinds.splice(range.clone(), other.kinds);
771        self.lazy_depth.splice(range.clone(), other.lazy_depth);
772
773        // Rebuild `reset_boundaries`. The incremental splice path never
774        // changes line count (gated upstream in try_incremental_parse).
775        // Three runs, already sorted and positionally ordered:
776        //   - low:  self boundaries STRICTLY before the replaced region
777        //           (`b < range.start`). These rows are untouched.
778        //   - mid:  the replaced region's boundaries, RECOMPUTED from the
779        //           now-merged `kinds`/`lazy_depth` using the same rule as
780        //           `parse` (interior row is a boundary iff Blank with
781        //           lazy_depth 0). O(range.len()) ≤ the widen cap — no
782        //           pulldown reparse.
783        //   - high: self boundaries at/after `range.end` (untouched rows).
784        //
785        // Recomputing `mid` is the correctness fix: the edited rows'
786        // boundary status cannot be inherited. Inheriting `self`'s
787        // boundary at `range.start` (old `b <= range.start`) kept stale
788        // boundaries when the edit removed a Blank row; promoting the
789        // slice's sentinels invented boundaries the heuristic widener's
790        // non-reset edges never had. The post-splice `kinds`/`lazy_depth`
791        // are authoritative (the reset-boundary widening contract
792        // guarantees lazy_depth 0 at `range.start`, so the slice's
793        // isolated parse agrees with the parent context there).
794        let lines_len = self.lines.len();
795        let mut merged: Vec<usize> = Vec::with_capacity(self.reset_boundaries.len() + 1);
796        merged.extend(
797            self.reset_boundaries
798                .iter()
799                .copied()
800                .filter(|&b| b < range.start),
801        );
802        for r in range.clone() {
803            if r != 0
804                && r != lines_len
805                && self.lazy_depth[r] == 0
806                && self.kinds[r] == LineConstructKind::Blank
807            {
808                merged.push(r);
809            }
810        }
811        merged.extend(
812            self.reset_boundaries
813                .iter()
814                .copied()
815                .filter(|&b| b >= range.end),
816        );
817        // Sentinel `0` survives in `low` whenever `range.start > 0`; when
818        // the edit starts at row 0 it is not interior, so add it back.
819        // `lines_len` always survives in `high` (self held it, and
820        // `range.end <= lines_len`).
821        if merged.first() != Some(&0) {
822            merged.insert(0, 0);
823        }
824        merged.dedup();
825        debug_assert!(
826            merged.windows(2).all(|w| w[0] < w[1]),
827            "splice: merged boundaries must be strictly ascending: {merged:?}"
828        );
829        debug_assert!(
830            merged.first() == Some(&0) && merged.last() == Some(&lines_len),
831            "splice: merged boundaries must start with 0 and end with lines.len() ({lines_len})"
832        );
833        // Structural invariant: every interior reset boundary sits on a
834        // Blank row — `parse` only records non-sentinel boundaries at
835        // `kinds[r] == Blank` rows (0 and lines.len() are unconditional
836        // sentinels). A non-Blank interior boundary means the merge
837        // promoted a spurious one — the failure mode where the heuristic
838        // widener's range edges leak in as boundaries. Cheap (no full
839        // reparse), runs in every debug/test build so CI catches a merge
840        // regression that would otherwise only surface under
841        // `KIMUN_VIEW_VERIFY_INCREMENTAL`.
842        debug_assert!(
843            merged
844                .iter()
845                .all(|&b| b == 0 || b == lines_len || self.kinds[b] == LineConstructKind::Blank),
846            "splice: interior reset boundary on a non-Blank row — merge \
847             promoted a spurious boundary: {merged:?}"
848        );
849        self.reset_boundaries = merged;
850    }
851}
852
853/// Whether `tag` opens a lazy-continuable construct — one whose
854/// parse state can extend across blank rows per CommonMark §4.4
855/// (IndentedCode), §4.6 (HtmlBlock types 1/2/6/7), §5.1
856/// (BlockQuote paragraph continuation), §5.2 (loose list
857/// continuation). Used to populate `ParsedBuffer::lazy_depth`,
858/// which gates reset-boundary detection.
859///
860/// Excluded: `Paragraph` (blank terminates it), `Heading` (single
861/// row), `CodeBlock(Fenced)` (explicit closing fence required, not
862/// lazy across blanks), `Item` (counted via parent `List`).
863///
864/// Conservative on HtmlBlock: pulldown's `Tag::HtmlBlock` does not
865/// distinguish CommonMark's 7 HTML-block types in its public API.
866/// Treating all HtmlBlocks as lazy-continuable over-triggers full
867/// rebuilds on types 3/4/5 edits but never silently miscompiles.
868fn is_lazy_continuable_tag(tag: &Tag) -> bool {
869    matches!(
870        tag,
871        Tag::List(_)
872            | Tag::BlockQuote(_)
873            | Tag::CodeBlock(CodeBlockKind::Indented)
874            | Tag::HtmlBlock
875    )
876}
877
878/// `is_lazy_continuable_tag`'s `TagEnd` counterpart for the tags
879/// that can be unambiguously identified from `TagEnd` alone.
880///
881/// `TagEnd::CodeBlock` is INTENTIONALLY excluded: pulldown's
882/// `TagEnd` variant for CodeBlock does not carry the
883/// `CodeBlockKind::{Indented, Fenced(_)}` discriminant. Callers
884/// disambiguate via a parallel stack populated on
885/// `Tag::CodeBlock(_)` Start events.
886fn is_lazy_continuable_tag_end(tag_end: &TagEnd) -> bool {
887    matches!(
888        tag_end,
889        TagEnd::List(_) | TagEnd::BlockQuote(_) | TagEnd::HtmlBlock
890    )
891}
892
893/// Convert a byte offset in the joined buffer to `(row, char_col)` within
894/// `lines`. Assumes the joined buffer uses `'\n'` separators (one byte each)
895/// between consecutive lines.
896fn byte_to_row_col(byte_offset: usize, lines: &[String], line_starts: &[usize]) -> (usize, usize) {
897    // Binary-search the row whose start byte is <= byte_offset.
898    let row = match line_starts.binary_search(&byte_offset) {
899        Ok(r) => r,
900        Err(r) => r.saturating_sub(1),
901    };
902    let row = row.min(lines.len().saturating_sub(1));
903    let within = byte_offset - line_starts[row];
904    // Shared per-line kernel: clamps a trailing-'\n' offset to end-of-line and
905    // snaps any mid-codepoint offset to a boundary.
906    let char_col = byte_col_to_char_col(&lines[row], within);
907    (row, char_col)
908}
909
910/// Like [`byte_to_row_col`] but returns `(lines.len(), 0)` when
911/// `byte_offset` is at or past the joined buffer's last content
912/// byte. Used to compute the drop-row for `lazy_delta` decrements
913/// on `Event::End` of end-of-buffer blocks: a block that ends
914/// past-EOF must drop lazy_depth at `lines.len()` (past-array),
915/// not at `lines.len() - 1` — otherwise the decrement lands ON
916/// the last content row and lazy_depth there becomes 0 even
917/// though the construct still semantically covers that row.
918///
919/// The clamped variant is correct for Start events (which always
920/// land on a real content row) but wrong for End events when the
921/// block reaches EOF.
922fn byte_to_row_col_unclamped(
923    byte_offset: usize,
924    lines: &[String],
925    line_starts: &[usize],
926) -> (usize, usize) {
927    // Discriminate Ok vs Err from binary_search:
928    //
929    // - `Ok(r)`: byte_offset matches `line_starts[r]` exactly. If
930    //   `r < lines.len()`, it lands at the START of row r — return
931    //   (r, 0). If `r == lines.len()`, it matches the past-EOF
932    //   sentinel; return `(lines.len(), 0)`.
933    // - `Err(r)`: byte_offset is strictly between
934    //   `line_starts[r - 1]` and `line_starts[r]`, i.e. inside row
935    //   `r - 1`. Compute within-row offset; if the offset is past
936    //   the row's last content byte AND it's the last row, treat
937    //   as past-EOF (the `'\n'` separator slot for non-last rows is
938    //   handled by `Ok` exact match on the next row's start).
939    //
940    // The previous single check `row + 1 == lines.len() && within >=
941    // line.len()` also fired for `Ok(lines.len() - 1)` with within=0
942    // and a 0-length last row — incorrectly bumping a block End that
943    // landed at the start of a trailing blank row out to the
944    // past-EOF slot, leaving `lazy_depth` elevated on the blank that
945    // actually closes the construct. Seed: `["> a", ""]` →
946    // lazy_depth was `[1, 1]`; correct is `[1, 0]`.
947    match line_starts.binary_search(&byte_offset) {
948        Ok(r) => {
949            if r < lines.len() {
950                (r, 0)
951            } else {
952                (lines.len(), 0)
953            }
954        }
955        Err(r) => {
956            let row = r.saturating_sub(1);
957            if row >= lines.len() {
958                return (lines.len(), 0);
959            }
960            let within = byte_offset - line_starts[row];
961            let line = &lines[row];
962            if row + 1 == lines.len() && within >= line.len() {
963                return (lines.len(), 0);
964            }
965            let byte_in_line = within.min(line.len());
966            let char_col = line[..byte_in_line].chars().count();
967            (row, char_col)
968        }
969    }
970}
kimun_notes/components/text_editor/markdown/parsed_buffer.rs

kimun_notes/components/text_editor/markdown/
parsed_buffer.rs