kimun_notes/components/text_editor/markdown/parsed_buffer.rs
1//! `ParsedBuffer`: the full-buffer parsed representation produced by
2//! `parse()`. Owns the per-row classification (`kinds`), the
3//! lazy-depth tracking that gates reset-boundary detection, and the
4//! splice/parse-range machinery used by the incremental editor view.
5//!
6//! See openspec changes `parse-reset-boundaries` and
7//! `parse-reset-boundaries-v2` for the design.
8
9use super::super::parse_incremental::LineConstructKind;
10use super::super::text_coords::byte_col_to_char_col;
11use super::detect::{detect_image_placeholders, detect_wikilinks};
12use super::{
13 Element, ElementKind, PARSER_OPTIONS, ParsedLine, leading_ws_byte_len, list_marker_len,
14 tag_to_kind,
15};
16use pulldown_cmark::{CodeBlockKind, Event, Parser, Tag, TagEnd};
17use std::ops::Range;
18
19#[derive(Clone)]
20pub struct ParsedBuffer {
21 pub lines: Vec<ParsedLine>,
22 pub kinds: Vec<LineConstructKind>,
23 /// Sorted, deduped row indices `b` where pulldown-cmark's parser
24 /// state is provably reset — i.e. parsing `&lines[b..j]` in
25 /// isolation produces the same `ParsedLine` and
26 /// `LineConstructKind` for row `b` as parsing the full buffer
27 /// would, for every later boundary `j`. Used by
28 /// `parse_incremental::expand_to_reset_boundary` so the
29 /// incremental-parse widening is provably-equivalent to a fresh
30 /// parse over the spliced range — no post-slice verification
31 /// needed in release.
32 ///
33 /// Always contains `0` and `lines.len()` as sentinel boundaries.
34 /// Conservative starting set: only Blank-prefixed rows after an
35 /// `Event::End` of a top-level block. Long buffers without blank
36 /// separators degrade to full-rebuild on every edit (acceptable
37 /// — same behaviour as today's `widen_to_safe` + cap-trip path
38 /// in that regime).
39 pub reset_boundaries: Vec<usize>,
40 /// Per-row lazy-construct depth AFTER processing the row's open
41 /// events (i.e. the prefix sum is applied row-by-row INCLUDING
42 /// the current row's deltas before being stored). Counts ONLY
43 /// constructs that CommonMark allows to extend across blank rows
44 /// (List, BlockQuote, IndentedCode, HtmlBlock — see
45 /// [`is_lazy_continuable_tag`]). Fenced code, Paragraph, Heading
46 /// are NOT counted because their parse state is reset at blank
47 /// rows or single-row terminators.
48 ///
49 /// `lazy_depth[r] == 0` is a necessary precondition for treating
50 /// row `r` as a reset boundary: at depth 0 there is no open
51 /// lazy-extendable construct that a fresh parser starting at `r`
52 /// could miss.
53 pub lazy_depth: Vec<u32>,
54}
55
56impl ParsedBuffer {
57 /// Parse the entire editor buffer in a single pulldown-cmark pass.
58 ///
59 /// Returns a [`ParsedBuffer`] whose `lines` contains one `ParsedLine`
60 /// per input row (multi-row markdown elements split per row) and whose
61 /// `kinds` contains the per-row [`LineConstructKind`] classification
62 /// that drives safe-boundary widening in `parse_incremental`.
63 ///
64 /// The pulldown-cmark event walk classifies the major constructs
65 /// inline; three short O(n) post-passes (list-continuation,
66 /// blockquote-depth, setext-underline) refine the result. No second
67 /// invocation of the pulldown parser.
68 pub fn parse(lines: &[String]) -> ParsedBuffer {
69 // Build joined buffer and per-line byte-offset table.
70 let total_bytes: usize =
71 lines.iter().map(|l| l.len()).sum::<usize>() + lines.len().saturating_sub(1);
72 let mut joined = String::with_capacity(total_bytes);
73 let mut line_starts: Vec<usize> = Vec::with_capacity(lines.len() + 1);
74 for (i, line) in lines.iter().enumerate() {
75 line_starts.push(joined.len());
76 joined.push_str(line);
77 if i + 1 < lines.len() {
78 joined.push('\n');
79 }
80 }
81 // Sentinel past-end entry so binary_search on a byte offset that falls
82 // exactly on the last line's content still returns a valid `Err(row)`
83 // without landing on an `Ok` match at the real end. The `+ 1` ensures
84 // the sentinel is strictly greater than any real byte offset, including
85 // the trailing '\n' bytes between lines.
86 line_starts.push(joined.len() + 1);
87
88 // Pre-allocate per-line state.
89 let mut content_vis: Vec<Vec<bool>> = lines
90 .iter()
91 .map(|l| vec![false; l.chars().count()])
92 .collect();
93 let mut elements: Vec<Vec<Element>> = vec![Vec::new(); lines.len()];
94 let mut list_sigil_end: Vec<Option<usize>> = vec![None; lines.len()];
95
96 // Element stack: (start_row, start_col_char, kind).
97 // Spans are emitted on End events, split across rows they cross.
98 let mut stack: Vec<(usize, usize, ElementKind)> = Vec::new();
99
100 // `list_sigil_end[row]` is filled directly when we see `Start(Item)` on
101 // that row — we walk the line from the Item's start column past the
102 // marker (`- `, `* `, `+ `, or `N. `). This handles empty items (`- `)
103 // that have no Text event inside.
104
105 // Helper closure for pushing a multi-row span to `elements`.
106 let emit_span = |row_s: usize,
107 col_s: usize,
108 row_e: usize,
109 col_e: usize,
110 kind: ElementKind,
111 elements: &mut Vec<Vec<Element>>,
112 lines: &[String]| {
113 if row_s == row_e {
114 if col_e > col_s && row_s < elements.len() {
115 elements[row_s].push(Element {
116 start_char: col_s,
117 end_char: col_e,
118 kind,
119 });
120 }
121 return;
122 }
123 // Multi-row: first row extends to end-of-line, middle rows cover whole line,
124 // last row covers 0..col_e.
125 if row_s < elements.len() {
126 let end_first = lines[row_s].chars().count();
127 if end_first > col_s {
128 elements[row_s].push(Element {
129 start_char: col_s,
130 end_char: end_first,
131 kind,
132 });
133 }
134 }
135 for r in (row_s + 1)..row_e {
136 if r < elements.len() {
137 let line_len = lines[r].chars().count();
138 if line_len > 0 {
139 elements[r].push(Element {
140 start_char: 0,
141 end_char: line_len,
142 kind,
143 });
144 }
145 }
146 }
147 if row_e < elements.len() && col_e > 0 {
148 elements[row_e].push(Element {
149 start_char: 0,
150 end_char: col_e,
151 kind,
152 });
153 }
154 };
155
156 // Per-line construct classification: initially Blank vs Plain based on
157 // whitespace, then updated during the event loop and post-passes below.
158 let mut kinds: Vec<LineConstructKind> = lines
159 .iter()
160 .map(|l| {
161 if l.trim().is_empty() {
162 LineConstructKind::Blank
163 } else {
164 LineConstructKind::Plain
165 }
166 })
167 .collect();
168
169 // Reset-boundary detection via depth prefix-sum. During the
170 // event walk we record per-row depth deltas (+1 at the start
171 // row of every top-level block, -1 at the row AFTER its end).
172 // After the walk, a prefix sum gives the depth at the start
173 // of each row — depth==0 means pulldown's parser is in the
174 // "between blocks" state at that row, with no open
175 // construct that could lazy-continue. A row at depth 0 whose
176 // own `kinds` is Blank (or EOF) is a true reset point.
177 //
178 // Depth deltas (not an inline counter) handle pulldown's
179 // overlapping nesting: a Paragraph inside an Item inside a
180 // List emits Start events at overlapping rows; an inline
181 // depth counter that decrements on the innermost End would
182 // see depth==0 prematurely while the outer List is still
183 // open. The delta+prefix-sum approach correctly sums all
184 // open constructs.
185 let mut reset_boundaries: Vec<usize> = Vec::new();
186 // +1 at the row a lazy-continuable construct opens, -1 at the
187 // row past where it closes. Length is `lines.len() + 1` so
188 // end-of-buffer drops have a sink (read by the prefix sum
189 // only up to `lines.len() - 1`). See [`is_lazy_continuable_tag`].
190 let mut lazy_delta: Vec<i32> = vec![0; lines.len() + 1];
191 // CodeBlock kind tracker. `Some(true)` = an Indented (lazy)
192 // CodeBlock is open; `Some(false)` = a Fenced (non-lazy)
193 // CodeBlock is open; `None` = no CodeBlock open. CommonMark
194 // does not nest code blocks, so a single Option captures the
195 // kind across the matching Start/End pair — and reading None
196 // on End signals an unmatched-End invariant violation.
197 let mut indented_codeblock_open: Option<bool> = None;
198
199 // Track fenced/indented code block byte ranges for F4 (label suppression).
200 // Populated during the main parser pass below and converted to per-line
201 // flags before the per-line label scan.
202 let mut code_block_byte_ranges: Vec<(usize, usize)> = Vec::new();
203 let mut code_block_depth = 0u32;
204 let mut code_block_start: Option<usize> = None;
205
206 let parser = Parser::new_ext(&joined, PARSER_OPTIONS);
207 for (event, range) in parser.into_offset_iter() {
208 let (sr, sc) = byte_to_row_col(range.start, lines, &line_starts);
209 let (er, ec) = byte_to_row_col(range.end, lines, &line_starts);
210
211 // V2 lazy_delta tracking — only constructs that
212 // lazy-extend across blank rows per CommonMark §4.4 / §4.6
213 // / §5.1 / §5.2. See [`is_lazy_continuable_tag`].
214 //
215 // End events use an unclamped row for the drop position:
216 // pulldown's `range.end` for an end-of-buffer block lands
217 // past the last content byte, which `byte_to_row_col`
218 // would clamp to `lines.len() - 1`. Dropping AT the last
219 // content row would zero `lazy_depth` there even though
220 // the construct still covers it.
221 //
222 // The CodeBlock arm is placed first so it absorbs both
223 // variants before the generic `is_lazy_continuable_tag`
224 // arm sees them — Indented would otherwise double-count.
225 match &event {
226 Event::Start(Tag::CodeBlock(kind)) => {
227 let is_indented = matches!(kind, CodeBlockKind::Indented);
228 indented_codeblock_open = Some(is_indented);
229 if is_indented && sr < lazy_delta.len() {
230 lazy_delta[sr] += 1;
231 }
232 }
233 Event::Start(tag) if is_lazy_continuable_tag(tag) && sr < lazy_delta.len() => {
234 lazy_delta[sr] += 1;
235 }
236 Event::End(TagEnd::CodeBlock) => {
237 let was_indented = indented_codeblock_open.take();
238 debug_assert!(
239 was_indented.is_some(),
240 "Event::End(CodeBlock) without matching Start at byte {}",
241 range.start,
242 );
243 if was_indented == Some(true) {
244 let (er_lazy, _) =
245 byte_to_row_col_unclamped(range.end, lines, &line_starts);
246 let drop_at = er_lazy.min(lines.len());
247 if drop_at < lazy_delta.len() {
248 lazy_delta[drop_at] -= 1;
249 }
250 }
251 }
252 Event::End(tag_end) if is_lazy_continuable_tag_end(tag_end) => {
253 let (er_lazy, _) = byte_to_row_col_unclamped(range.end, lines, &line_starts);
254 let drop_at = er_lazy.min(lines.len());
255 if drop_at < lazy_delta.len() {
256 lazy_delta[drop_at] -= 1;
257 }
258 }
259 _ => {}
260 }
261
262 match event {
263 Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(_))) => {
264 if code_block_depth == 0 {
265 code_block_start = Some(range.start);
266 }
267 code_block_depth += 1;
268 // Opening fence marker row.
269 if sr < kinds.len() {
270 kinds[sr] = LineConstructKind::FenceMarker;
271 }
272 // Rows between opening and closing fences are content.
273 // Pulldown can emit `er == sr` for a single-row degenerate
274 // fence (no body, no closing fence on a separate row) or
275 // `er == lines.len()` for an unclosed fence at EOF; both
276 // make the `[(sr+1)..er]` slice invalid.
277 let content_end = er.min(kinds.len());
278 if content_end > sr + 1 {
279 kinds[(sr + 1)..content_end].fill(LineConstructKind::FenceContent);
280 }
281 // Closing fence marker row (er is the row of the closing
282 // ```). When `er == sr` (degenerate single-row fence) or
283 // `er >= lines.len()` (unclosed fence at EOF) there is
284 // no separate closing row to mark.
285 if er > sr && er < kinds.len() {
286 kinds[er] = LineConstructKind::FenceMarker;
287 }
288 }
289 Event::Start(Tag::CodeBlock(CodeBlockKind::Indented)) => {
290 if code_block_depth == 0 {
291 code_block_start = Some(range.start);
292 }
293 code_block_depth += 1;
294 // Use the UNCLAMPED end row as the exclusive upper bound:
295 // pulldown's exclusive byte end lands at the start of the
296 // row *after* the block's content (or `lines.len()` at EOF),
297 // so the content rows are `sr..end_row` — this already
298 // excludes both a following less-indented paragraph
299 // (CommonMark §4.4: such a line ends the block) and any
300 // trailing blank line. The previous `er + 1` over-included
301 // that following row.
302 let (end_row, _) = byte_to_row_col_unclamped(range.end, lines, &line_starts);
303 let hi = end_row.min(kinds.len());
304 if hi > sr {
305 kinds[sr..hi].fill(LineConstructKind::IndentedCode);
306 // Invariant the bound above guarantees: the block never
307 // ends on a trailing blank, so no trim is needed.
308 debug_assert!(
309 hi <= sr + 1 || !lines[hi - 1].trim().is_empty(),
310 "indented block ended on trailing blank row {}",
311 hi - 1
312 );
313 }
314 }
315 Event::End(TagEnd::CodeBlock) => {
316 code_block_depth = code_block_depth.saturating_sub(1);
317 if code_block_depth == 0
318 && let Some(start) = code_block_start.take()
319 {
320 code_block_byte_ranges.push((start, range.end));
321 }
322 }
323 Event::Start(ref tag) if let Some(kind) = tag_to_kind(tag) => {
324 if matches!(
325 kind,
326 ElementKind::HeadingH1 | ElementKind::HeadingH2 | ElementKind::HeadingH3
327 ) {
328 kinds[sr] = LineConstructKind::Heading;
329 }
330 stack.push((sr, sc, kind));
331 }
332 Event::End(
333 TagEnd::Strong
334 | TagEnd::Emphasis
335 | TagEnd::Strikethrough
336 | TagEnd::Link
337 | TagEnd::Heading(_)
338 | TagEnd::BlockQuote(_),
339 ) => {
340 if let Some((s_r, s_c, k)) = stack.pop() {
341 emit_span(s_r, s_c, er, ec, k, &mut elements, lines);
342 }
343 }
344 Event::Start(Tag::Item)
345 // Pulldown-cmark's Item range does not always start at the
346 // marker character — for nested items it starts at the
347 // indentation-beyond-the-parent boundary, which can be
348 // several chars before the marker. Scan the line from col
349 // 0 to find leading whitespace + marker instead of relying
350 // on `sc`.
351 if sr < lines.len() && list_sigil_end[sr].is_none() =>
352 {
353 kinds[sr] = LineConstructKind::ListMarker;
354 let line = lines[sr].as_str();
355 let ws_end = leading_ws_byte_len(line);
356 if let Some(len) = list_marker_len(&line[ws_end..]) {
357 // ws_end is byte length but ASCII whitespace makes it
358 // equal to the char count.
359 list_sigil_end[sr] = Some(ws_end + len);
360 }
361 }
362 Event::Start(Tag::HtmlBlock) => {
363 let hi = (er + 1).min(kinds.len());
364 if hi > sr {
365 kinds[sr..hi].fill(LineConstructKind::HtmlBlock);
366 }
367 }
368 Event::Html(_) => {
369 // Block-level HTML body. Already classified by the
370 // enclosing `Tag::HtmlBlock` arm; this branch is a
371 // safety-net for any row pulldown emits between the
372 // tag boundaries but defers to fence/code kinds when
373 // the rows happen to overlap (rare, but possible
374 // with malformed input).
375 let hi = (er + 1).min(kinds.len());
376 if hi > sr {
377 for kind in &mut kinds[sr..hi] {
378 if !matches!(
379 *kind,
380 LineConstructKind::FenceContent
381 | LineConstructKind::IndentedCode
382 | LineConstructKind::FenceMarker
383 ) {
384 *kind = LineConstructKind::HtmlBlock;
385 }
386 }
387 }
388 }
389 // Inline HTML (`<span>`, `<br/>`, etc.) lives inside a
390 // paragraph; it must NOT promote the row to HtmlBlock,
391 // because `HtmlBlock` is a non-safe widening boundary
392 // (see `parse_incremental::is_safe_boundary`). Painting
393 // the paragraph row as HtmlBlock would force widening
394 // to walk past it on every nearby edit. Leave kind as-is.
395 Event::InlineHtml(_) => {}
396 Event::End(TagEnd::Item) => {}
397 Event::Code(ref code_text) if sr == er && sr < lines.len() => {
398 // Inline code — always single-line in practice.
399 let code_len = code_text.chars().count();
400 let range_char_len = ec.saturating_sub(sc);
401 let sigil_each = range_char_len.saturating_sub(code_len) / 2;
402 let cs = sc + sigil_each;
403 for vis in content_vis[sr].iter_mut().skip(cs).take(code_len) {
404 *vis = true;
405 }
406 elements[sr].push(Element {
407 start_char: sc,
408 end_char: ec,
409 kind: ElementKind::InlineCode,
410 });
411 }
412 Event::Text(_) | Event::SoftBreak | Event::HardBreak => {
413 // Mark content_vis for each row the event touches.
414 if sr == er {
415 if sr < content_vis.len() {
416 for vis in content_vis[sr]
417 .iter_mut()
418 .skip(sc)
419 .take(ec.saturating_sub(sc))
420 {
421 *vis = true;
422 }
423 }
424 } else {
425 // First row: from sc to end-of-line.
426 if sr < content_vis.len() {
427 let line_chars = content_vis[sr].len();
428 for vis in content_vis[sr]
429 .iter_mut()
430 .skip(sc)
431 .take(line_chars.saturating_sub(sc))
432 {
433 *vis = true;
434 }
435 }
436 // Middle rows: whole line.
437 for r in (sr + 1)..er {
438 if r < content_vis.len() {
439 for vis in content_vis[r].iter_mut() {
440 *vis = true;
441 }
442 }
443 }
444 // Last row: 0..ec.
445 if er < content_vis.len() {
446 for vis in content_vis[er].iter_mut().take(ec) {
447 *vis = true;
448 }
449 }
450 }
451 }
452 _ => {}
453 }
454 }
455
456 // Build a per-line flag: `true` = this line is inside a fenced/indented code block.
457 // Lines whose byte range overlaps any code-block range are suppressed from label scan.
458 let line_in_code_block: Vec<bool> = {
459 let mut flags = vec![false; lines.len()];
460 for (cb_start, cb_end) in &code_block_byte_ranges {
461 // Find all lines that overlap [cb_start, cb_end).
462 for (row, &ls) in line_starts[..lines.len()].iter().enumerate() {
463 let le = ls + lines[row].len();
464 // Overlap when line_start < cb_end and line_end > cb_start.
465 // We skip the fence delimiter lines (which contain the ``` markers)
466 // by checking if the line's content byte range overlaps the code
467 // block's content range. The CodeBlock event range in pulldown-cmark
468 // covers the opening fence through the closing fence, so this
469 // conservative check marks all lines within the span as in-block.
470 if ls < *cb_end && le > *cb_start {
471 flags[row] = true;
472 }
473 }
474 }
475 flags
476 };
477
478 // Per-line post-processing: heading trailing whitespace, wikilinks, bitmasks.
479 let mut out: Vec<ParsedLine> = Vec::with_capacity(lines.len());
480 for (row, line) in lines.iter().enumerate() {
481 let mut cv = std::mem::take(&mut content_vis[row]);
482 let mut els = std::mem::take(&mut elements[row]);
483
484 // Heading trailing-whitespace fix.
485 for e in &els {
486 if matches!(
487 e.kind,
488 ElementKind::HeadingH1 | ElementKind::HeadingH2 | ElementKind::HeadingH3
489 ) {
490 for i in (e.start_char..e.end_char).rev() {
491 match line.chars().nth(i) {
492 Some(' ' | '\t') => {
493 if i < cv.len() {
494 cv[i] = true;
495 }
496 }
497 _ => break,
498 }
499 }
500 }
501 }
502
503 detect_wikilinks(line, &mut cv, &mut els);
504 let image_placeholders = detect_image_placeholders(line, &mut cv, &mut els);
505
506 // Scan for #hashtag spans and emit Label elements.
507 // Guards (all must pass):
508 // F4: skip if this line is inside a fenced code block
509 // F2: word-boundary guard (handled inside label_matches)
510 // F3: skip if the span overlaps InlineCode, Link, WikiLink, or Image
511 if !line_in_code_block[row] {
512 let line_str = line.as_str();
513 for lm in kimun_core::note::scan::label_matches(line_str) {
514 // Convert byte offsets to char offsets for Element storage.
515 let start_char = line_str[..lm.byte_start].chars().count();
516 let end_char =
517 start_char + line_str[lm.byte_start..lm.byte_end].chars().count();
518 // F3: overlap guard (InlineCode + Link + WikiLink + Image)
519 let overlaps_existing = els.iter().any(|e| {
520 matches!(
521 e.kind,
522 ElementKind::InlineCode
523 | ElementKind::Link
524 | ElementKind::WikiLink
525 | ElementKind::Image
526 ) && !(end_char <= e.start_char || start_char >= e.end_char)
527 });
528 if !overlaps_existing {
529 els.push(Element {
530 start_char,
531 end_char,
532 kind: ElementKind::Label,
533 });
534 }
535 }
536 }
537 // Re-sort so elem_vis / elem_index precomputation sees elements in line order.
538 els.sort_by_key(|e| e.start_char);
539
540 // F6: use u16 for elem_index (supports up to 65535 elements per line)
541 debug_assert!(
542 els.len() < u16::MAX as usize,
543 "Too many elements on a single line ({})",
544 els.len()
545 );
546 let total = line.chars().count();
547 let mut elem_vis = vec![false; total];
548 let mut elem_index = vec![0u16; total];
549 // OR-ed emphasis mask: unlike elem_index (innermost wins), every
550 // covering Bold/Italic/Strikethrough contributes, so a Link or
551 // WikiLink nested in `**…**` / `*…*` keeps the outer emphasis.
552 let mut modifier_mask = vec![0u8; total];
553 for (i, e) in els.iter().enumerate() {
554 let tag = (i + 1) as u16;
555 let bit = super::modifier_bit(e.kind);
556 for pos in e.start_char..e.end_char {
557 if pos < total {
558 elem_vis[pos] = true;
559 elem_index[pos] = tag;
560 modifier_mask[pos] |= bit;
561 }
562 }
563 }
564
565 // Blockquote depth = number of Blockquote elements covering this
566 // line. Derived from pulldown's structure (via `emit_span`) rather
567 // than counting leading `>`, so lazy-continuation lines (CommonMark
568 // §5.1 — quote text with no `>` prefix) and nested quotes report the
569 // correct depth and get the bar gutter, not just the quote color.
570 let blockquote_depth = {
571 let n = els
572 .iter()
573 .filter(|e| e.kind == ElementKind::Blockquote)
574 .count();
575 (n > 0).then_some(n.min(u8::MAX as usize) as u8)
576 };
577
578 out.push(ParsedLine {
579 elements: els,
580 content_vis: cv,
581 elem_vis,
582 elem_index,
583 modifier_mask,
584 list_sigil_end: list_sigil_end[row],
585 image_placeholders,
586 blockquote_depth,
587 });
588 }
589
590 // Post-pass: blockquote depth (N = number of leading `>` characters).
591 // Done before the setext post-pass so that a blockquoted heading line
592 // is not mis-treated as setext text.
593 for row in 0..kinds.len() {
594 if !matches!(
595 kinds[row],
596 LineConstructKind::Plain | LineConstructKind::Blank
597 ) {
598 continue;
599 }
600 let line = &lines[row];
601 let mut depth: u8 = 0;
602 for ch in line.chars() {
603 match ch {
604 '>' => depth = depth.saturating_add(1),
605 ' ' | '\t' => continue,
606 _ => break,
607 }
608 }
609 if depth > 0 {
610 kinds[row] = LineConstructKind::Blockquote(depth);
611 }
612 }
613
614 // Post-pass: setext underline classification.
615 // Pulldown reports setext headings as Heading events spanning the text
616 // line AND the underline line. We want to: (a) classify the underline
617 // row as SetextUnderline, (b) reset the heading text row to Plain so
618 // widening treats it as a safe boundary above.
619 for row in 0..lines.len().saturating_sub(1) {
620 if kinds[row] == LineConstructKind::Heading {
621 let next = &lines[row + 1];
622 let trimmed = next.trim();
623 if !trimmed.is_empty()
624 && (trimmed.chars().all(|c| c == '=') || trimmed.chars().all(|c| c == '-'))
625 {
626 kinds[row + 1] = LineConstructKind::SetextUnderline;
627 kinds[row] = LineConstructKind::Plain;
628 }
629 }
630 }
631
632 // Post-pass: list-item continuation rows.
633 // Any Plain row immediately following a ListMarker or ListContinuation
634 // row is itself a continuation (lazy continuation, indented body, etc.).
635 for row in 1..kinds.len() {
636 if matches!(
637 kinds[row],
638 LineConstructKind::Plain | LineConstructKind::IndentedCode
639 ) && matches!(
640 kinds[row - 1],
641 LineConstructKind::ListMarker | LineConstructKind::ListContinuation
642 ) {
643 kinds[row] = LineConstructKind::ListContinuation;
644 }
645 }
646
647 // Prefix-sum lazy_delta → per-row lazy_depth, record reset
648 // boundaries at rows where lazy_depth == 0 AND kinds[r] is
649 // Blank. The lazy_depth==0 condition rules out blank rows
650 // inside a lazy-continuable construct (IndentedCode multi-
651 // chunk, etc.) where splicing across the blank would diverge
652 // from a fresh parse.
653 let mut lazy_depth_acc: i32 = 0;
654 let mut lazy_depth: Vec<u32> = Vec::with_capacity(lines.len());
655 for r in 0..lines.len() {
656 lazy_depth_acc += lazy_delta[r];
657 debug_assert!(
658 lazy_depth_acc >= 0,
659 "lazy_depth went negative at row {r}: delta history is unbalanced"
660 );
661 // `as u32` is correct under the assert (always non-negative
662 // here); in a hypothetical release with imbalanced deltas
663 // it would wrap to a very large value and surface as a
664 // panic in the boundary check below — preferable to a
665 // silent `.max(0)` clamp that would mask the bug.
666 lazy_depth.push(lazy_depth_acc as u32);
667 if lazy_depth_acc == 0 && kinds[r] == LineConstructKind::Blank {
668 reset_boundaries.push(r);
669 }
670 }
671 // Sentinels: 0 (start of buffer), lines.len() (past-end).
672 // Both make `expand_to_reset_boundary`'s unwrap_or fallbacks
673 // unreachable in a well-formed set.
674 reset_boundaries.push(0);
675 reset_boundaries.push(lines.len());
676 reset_boundaries.sort_unstable();
677 reset_boundaries.dedup();
678
679 ParsedBuffer {
680 lines: out,
681 kinds,
682 reset_boundaries,
683 lazy_depth,
684 }
685 }
686
687 /// O(N) placeholder buffer matching `lines`'s row count, every row
688 /// classified `Plain`, every char marked content-visible, no
689 /// elements. Used by `view.update`'s async-fallback path (perf #9
690 /// in the holistic review) to install a structurally-correct
691 /// `ParsedBuffer` cheaply while the real `ParsedBuffer::parse`
692 /// runs on a background tokio task. Render produces unstyled
693 /// markdown for one frame until the async result is installed
694 /// via `install_full_parse`.
695 pub fn placeholder(lines: &[String]) -> ParsedBuffer {
696 let mut out = Vec::with_capacity(lines.len());
697 for line in lines {
698 let total = line.chars().count();
699 out.push(ParsedLine {
700 elements: Vec::new(),
701 content_vis: vec![true; total],
702 elem_vis: vec![false; total],
703 elem_index: vec![0; total],
704 modifier_mask: vec![0; total],
705 list_sigil_end: None,
706 image_placeholders: Vec::new(),
707 blockquote_depth: None,
708 });
709 }
710 let kinds = vec![LineConstructKind::Plain; lines.len()];
711 let reset_boundaries = if lines.is_empty() {
712 vec![0]
713 } else {
714 vec![0, lines.len()]
715 };
716 let lazy_depth = vec![0u32; lines.len()];
717 ParsedBuffer {
718 lines: out,
719 kinds,
720 reset_boundaries,
721 lazy_depth,
722 }
723 }
724
725 /// Parse a contiguous slice of `lines` as if it were a standalone document.
726 ///
727 /// **Boundary contract:** the caller must pass a `range` whose `start` and
728 /// `end` land on safe construct boundaries (verified by
729 /// `parse_incremental::widen_to_safe` or `expand_to_reset_boundary`).
730 /// This function does not validate the contract — passing a mid-fence
731 /// range will produce a `ParsedBuffer` that mis-classifies the
732 /// boundary lines.
733 ///
734 /// Returns a `ParsedBuffer` whose `lines.len() == kinds.len() == range.len()`.
735 /// The returned `reset_boundaries` are in slice-local index space
736 /// (`0..range.len()`); `splice` shifts them by `range.start` when
737 /// merging into the parent buffer's boundary set.
738 pub fn parse_range(lines: &[String], range: Range<usize>) -> ParsedBuffer {
739 Self::parse(&lines[range])
740 }
741
742 /// Replace `self.lines[range]` and `self.kinds[range]` with the contents
743 /// of `other`. Both `other` vectors must have `range.len()` entries.
744 pub fn splice(&mut self, range: Range<usize>, other: ParsedBuffer) {
745 debug_assert!(
746 other.lines.len() == other.kinds.len(),
747 "splice: other has mismatched internal lengths (lines={} kinds={})",
748 other.lines.len(),
749 other.kinds.len(),
750 );
751 debug_assert!(
752 other.lines.len() == range.len(),
753 "splice: other.lines.len() ({}) != range.len() ({})",
754 other.lines.len(),
755 range.len(),
756 );
757 debug_assert!(
758 other.kinds.len() == range.len(),
759 "splice: other.kinds.len() ({}) != range.len() ({})",
760 other.kinds.len(),
761 range.len(),
762 );
763 debug_assert!(
764 other.lazy_depth.len() == range.len(),
765 "splice: other.lazy_depth.len() ({}) != range.len() ({})",
766 other.lazy_depth.len(),
767 range.len(),
768 );
769 self.lines.splice(range.clone(), other.lines);
770 self.kinds.splice(range.clone(), other.kinds);
771 self.lazy_depth.splice(range.clone(), other.lazy_depth);
772
773 // Rebuild `reset_boundaries`. The incremental splice path never
774 // changes line count (gated upstream in try_incremental_parse).
775 // Three runs, already sorted and positionally ordered:
776 // - low: self boundaries STRICTLY before the replaced region
777 // (`b < range.start`). These rows are untouched.
778 // - mid: the replaced region's boundaries, RECOMPUTED from the
779 // now-merged `kinds`/`lazy_depth` using the same rule as
780 // `parse` (interior row is a boundary iff Blank with
781 // lazy_depth 0). O(range.len()) ≤ the widen cap — no
782 // pulldown reparse.
783 // - high: self boundaries at/after `range.end` (untouched rows).
784 //
785 // Recomputing `mid` is the correctness fix: the edited rows'
786 // boundary status cannot be inherited. Inheriting `self`'s
787 // boundary at `range.start` (old `b <= range.start`) kept stale
788 // boundaries when the edit removed a Blank row; promoting the
789 // slice's sentinels invented boundaries the heuristic widener's
790 // non-reset edges never had. The post-splice `kinds`/`lazy_depth`
791 // are authoritative (the reset-boundary widening contract
792 // guarantees lazy_depth 0 at `range.start`, so the slice's
793 // isolated parse agrees with the parent context there).
794 let lines_len = self.lines.len();
795 let mut merged: Vec<usize> = Vec::with_capacity(self.reset_boundaries.len() + 1);
796 merged.extend(
797 self.reset_boundaries
798 .iter()
799 .copied()
800 .filter(|&b| b < range.start),
801 );
802 for r in range.clone() {
803 if r != 0
804 && r != lines_len
805 && self.lazy_depth[r] == 0
806 && self.kinds[r] == LineConstructKind::Blank
807 {
808 merged.push(r);
809 }
810 }
811 merged.extend(
812 self.reset_boundaries
813 .iter()
814 .copied()
815 .filter(|&b| b >= range.end),
816 );
817 // Sentinel `0` survives in `low` whenever `range.start > 0`; when
818 // the edit starts at row 0 it is not interior, so add it back.
819 // `lines_len` always survives in `high` (self held it, and
820 // `range.end <= lines_len`).
821 if merged.first() != Some(&0) {
822 merged.insert(0, 0);
823 }
824 merged.dedup();
825 debug_assert!(
826 merged.windows(2).all(|w| w[0] < w[1]),
827 "splice: merged boundaries must be strictly ascending: {merged:?}"
828 );
829 debug_assert!(
830 merged.first() == Some(&0) && merged.last() == Some(&lines_len),
831 "splice: merged boundaries must start with 0 and end with lines.len() ({lines_len})"
832 );
833 // Structural invariant: every interior reset boundary sits on a
834 // Blank row — `parse` only records non-sentinel boundaries at
835 // `kinds[r] == Blank` rows (0 and lines.len() are unconditional
836 // sentinels). A non-Blank interior boundary means the merge
837 // promoted a spurious one — the failure mode where the heuristic
838 // widener's range edges leak in as boundaries. Cheap (no full
839 // reparse), runs in every debug/test build so CI catches a merge
840 // regression that would otherwise only surface under
841 // `KIMUN_VIEW_VERIFY_INCREMENTAL`.
842 debug_assert!(
843 merged
844 .iter()
845 .all(|&b| b == 0 || b == lines_len || self.kinds[b] == LineConstructKind::Blank),
846 "splice: interior reset boundary on a non-Blank row — merge \
847 promoted a spurious boundary: {merged:?}"
848 );
849 self.reset_boundaries = merged;
850 }
851}
852
853/// Whether `tag` opens a lazy-continuable construct — one whose
854/// parse state can extend across blank rows per CommonMark §4.4
855/// (IndentedCode), §4.6 (HtmlBlock types 1/2/6/7), §5.1
856/// (BlockQuote paragraph continuation), §5.2 (loose list
857/// continuation). Used to populate `ParsedBuffer::lazy_depth`,
858/// which gates reset-boundary detection.
859///
860/// Excluded: `Paragraph` (blank terminates it), `Heading` (single
861/// row), `CodeBlock(Fenced)` (explicit closing fence required, not
862/// lazy across blanks), `Item` (counted via parent `List`).
863///
864/// Conservative on HtmlBlock: pulldown's `Tag::HtmlBlock` does not
865/// distinguish CommonMark's 7 HTML-block types in its public API.
866/// Treating all HtmlBlocks as lazy-continuable over-triggers full
867/// rebuilds on types 3/4/5 edits but never silently miscompiles.
868fn is_lazy_continuable_tag(tag: &Tag) -> bool {
869 matches!(
870 tag,
871 Tag::List(_)
872 | Tag::BlockQuote(_)
873 | Tag::CodeBlock(CodeBlockKind::Indented)
874 | Tag::HtmlBlock
875 )
876}
877
878/// `is_lazy_continuable_tag`'s `TagEnd` counterpart for the tags
879/// that can be unambiguously identified from `TagEnd` alone.
880///
881/// `TagEnd::CodeBlock` is INTENTIONALLY excluded: pulldown's
882/// `TagEnd` variant for CodeBlock does not carry the
883/// `CodeBlockKind::{Indented, Fenced(_)}` discriminant. Callers
884/// disambiguate via a parallel stack populated on
885/// `Tag::CodeBlock(_)` Start events.
886fn is_lazy_continuable_tag_end(tag_end: &TagEnd) -> bool {
887 matches!(
888 tag_end,
889 TagEnd::List(_) | TagEnd::BlockQuote(_) | TagEnd::HtmlBlock
890 )
891}
892
893/// Convert a byte offset in the joined buffer to `(row, char_col)` within
894/// `lines`. Assumes the joined buffer uses `'\n'` separators (one byte each)
895/// between consecutive lines.
896fn byte_to_row_col(byte_offset: usize, lines: &[String], line_starts: &[usize]) -> (usize, usize) {
897 // Binary-search the row whose start byte is <= byte_offset.
898 let row = match line_starts.binary_search(&byte_offset) {
899 Ok(r) => r,
900 Err(r) => r.saturating_sub(1),
901 };
902 let row = row.min(lines.len().saturating_sub(1));
903 let within = byte_offset - line_starts[row];
904 // Shared per-line kernel: clamps a trailing-'\n' offset to end-of-line and
905 // snaps any mid-codepoint offset to a boundary.
906 let char_col = byte_col_to_char_col(&lines[row], within);
907 (row, char_col)
908}
909
910/// Like [`byte_to_row_col`] but returns `(lines.len(), 0)` when
911/// `byte_offset` is at or past the joined buffer's last content
912/// byte. Used to compute the drop-row for `lazy_delta` decrements
913/// on `Event::End` of end-of-buffer blocks: a block that ends
914/// past-EOF must drop lazy_depth at `lines.len()` (past-array),
915/// not at `lines.len() - 1` — otherwise the decrement lands ON
916/// the last content row and lazy_depth there becomes 0 even
917/// though the construct still semantically covers that row.
918///
919/// The clamped variant is correct for Start events (which always
920/// land on a real content row) but wrong for End events when the
921/// block reaches EOF.
922fn byte_to_row_col_unclamped(
923 byte_offset: usize,
924 lines: &[String],
925 line_starts: &[usize],
926) -> (usize, usize) {
927 // Discriminate Ok vs Err from binary_search:
928 //
929 // - `Ok(r)`: byte_offset matches `line_starts[r]` exactly. If
930 // `r < lines.len()`, it lands at the START of row r — return
931 // (r, 0). If `r == lines.len()`, it matches the past-EOF
932 // sentinel; return `(lines.len(), 0)`.
933 // - `Err(r)`: byte_offset is strictly between
934 // `line_starts[r - 1]` and `line_starts[r]`, i.e. inside row
935 // `r - 1`. Compute within-row offset; if the offset is past
936 // the row's last content byte AND it's the last row, treat
937 // as past-EOF (the `'\n'` separator slot for non-last rows is
938 // handled by `Ok` exact match on the next row's start).
939 //
940 // The previous single check `row + 1 == lines.len() && within >=
941 // line.len()` also fired for `Ok(lines.len() - 1)` with within=0
942 // and a 0-length last row — incorrectly bumping a block End that
943 // landed at the start of a trailing blank row out to the
944 // past-EOF slot, leaving `lazy_depth` elevated on the blank that
945 // actually closes the construct. Seed: `["> a", ""]` →
946 // lazy_depth was `[1, 1]`; correct is `[1, 0]`.
947 match line_starts.binary_search(&byte_offset) {
948 Ok(r) => {
949 if r < lines.len() {
950 (r, 0)
951 } else {
952 (lines.len(), 0)
953 }
954 }
955 Err(r) => {
956 let row = r.saturating_sub(1);
957 if row >= lines.len() {
958 return (lines.len(), 0);
959 }
960 let within = byte_offset - line_starts[row];
961 let line = &lines[row];
962 if row + 1 == lines.len() && within >= line.len() {
963 return (lines.len(), 0);
964 }
965 let byte_in_line = within.min(line.len());
966 let char_col = line[..byte_in_line].chars().count();
967 (row, char_col)
968 }
969 }
970}