mdwright_format/incremental/mod.rs
1//! Block-boundary checkpoint table for range-formatting.
2//!
3//! Editor latency requires that a one-paragraph edit format in time
4//! proportional to the paragraph, not the document. The mechanism is a
5//! *checkpoint table*: a sorted list of byte offsets, one per
6//! top-level Markdown block, that lets a caller-supplied byte range
7//! be snapped to the smallest covering whole-block slice. The slice is
8//! then parsed and formatted independently through the document and
9//! formatter entry points.
10//!
11//! A checkpoint sits at column 0 of a line that opens a **top-level**
12//! block (container depth zero, not inside a blockquote, list,
13//! footnote, or table). Both conditions are load-bearing: slicing two
14//! checkpoints must yield a syntactically self-contained Markdown
15//! sub-document, otherwise the substring contract in
16//! [`crate::format_range`] breaks. The depth clause specifically
17//! defends against slicing one item out of an ordered list, where the
18//! sliced item would re-parse as item 1 and `OrderedList::Renumber`
19//! would diverge from the whole-document output.
20//!
21//! Offsets are recorded in the **caller's source coordinates**
22//! (original bytes, before CM §2.1 / §2.3 canonicalisation). Pulldown
23//! reports canonical offsets; [`CheckpointTable::build`] translates
24//! them through `Source::to_original` once at build
25//! time. For the common case where input contains no `\r` or `\0` the
26//! translation is the identity and costs nothing.
27//!
28//! Frontmatter (YAML `---…---` or TOML `+++…+++`) is treated as a
29//! single prelude region: the first body checkpoint sits at the byte
30//! after the closing delimiter. A caller-supplied range that falls
31//! entirely inside frontmatter snaps forward to the first body block.
32
33use std::ops::Range;
34
35use mdwright_document::{Document, ParseError, ParseOptions};
36
37/// One block boundary in the caller's source.
38#[derive(Copy, Clone, Debug)]
39pub(crate) struct BlockCheckpoint {
40 /// Byte offset in the caller's source where the block starts.
41 /// Always column 0 of its line, always at container depth 0.
42 pub(crate) byte: u32,
43 /// Cheap snapshot of the parser walk state at this point. Reserved
44 /// for the incremental-rebuild work in the LSP session. The
45 /// current resumption logic doesn't read it, but recording it now
46 /// lets that session diff two tables for "did anything before this
47 /// boundary actually change?".
48 #[expect(dead_code, reason = "reserved for LSP incremental-rebuild")]
49 pub(crate) parser_state: u64,
50}
51
52/// Per-document table of top-level block boundaries.
53///
54/// Built once per source version via [`CheckpointTable::build`]; the
55/// LSP rebuilds on every `didChange` notification. Internally a sorted
56/// `Vec<BlockCheckpoint>` with `byte = 0` and a sentinel at
57/// `byte = source.len()` as bookends, so [`Self::snap_to_block_boundaries`]
58/// is branch-free at the bounds.
59#[derive(Debug)]
60pub struct CheckpointTable {
61 source_len: u32,
62 /// Sorted by `byte` ascending. First entry is always `byte = 0`;
63 /// last entry is always `byte = source_len`.
64 points: Vec<BlockCheckpoint>,
65}
66
67impl CheckpointTable {
68 /// Walk `source` once, recording one checkpoint per top-level
69 /// block. Cost: one pulldown event stream, one offset translation
70 /// per checkpoint, one `Vec` allocation.
71 ///
72 /// # Errors
73 ///
74 /// Returns [`ParseError`] if parser execution cannot safely
75 /// recognise the source.
76 pub fn build(source: &str) -> Result<Self, ParseError> {
77 Self::build_with_options(source, ParseOptions::default())
78 }
79
80 /// Build a checkpoint table under explicit recognition policy.
81 ///
82 /// # Errors
83 ///
84 /// Returns [`ParseError`] if parser execution cannot safely
85 /// recognise the source under `parse_options`.
86 pub fn build_with_options(source: &str, parse_options: ParseOptions) -> Result<Self, ParseError> {
87 let doc = Document::parse_with_options(source, parse_options)?;
88 Ok(Self::from_document(&doc))
89 }
90
91 /// Build from an already parsed document.
92 #[must_use]
93 pub fn from_document(doc: &Document) -> Self {
94 let facts = doc
95 .block_checkpoints()
96 .iter()
97 .map(|point| {
98 let byte = usize::try_from(point.byte).unwrap_or(usize::MAX);
99 let original = doc.canonical_to_original_range(byte..byte).start;
100 mdwright_document::BlockCheckpointFact {
101 byte: u32::try_from(original).unwrap_or(u32::MAX),
102 parser_state: point.parser_state,
103 }
104 })
105 .collect();
106 Self::from_facts(doc.original_source().len(), facts)
107 }
108
109 fn from_facts(source_len: usize, facts: Vec<mdwright_document::BlockCheckpointFact>) -> Self {
110 let source_len = u32::try_from(source_len).unwrap_or(u32::MAX);
111 let mut points: Vec<BlockCheckpoint> = facts
112 .into_iter()
113 .map(|point| BlockCheckpoint {
114 byte: point.byte,
115 parser_state: point.parser_state,
116 })
117 .collect();
118 if points.last().is_none_or(|last| last.byte < source_len) {
119 points.push(BlockCheckpoint {
120 byte: source_len,
121 parser_state: 0,
122 });
123 }
124
125 Self { source_len, points }
126 }
127
128 /// Smallest `[lo, hi)` byte range that covers `range` and starts
129 /// and ends on a checkpoint. Always succeeds: empty / out-of-bounds
130 /// / partial-block ranges all resolve to a well-defined slice (the
131 /// smallest superset; empty when the request is wholly past the
132 /// source end).
133 pub fn snap_to_block_boundaries(&self, range: Range<u32>) -> Range<u32> {
134 let req_start = range.start.min(self.source_len);
135 let req_end = range.end.min(self.source_len).max(req_start);
136
137 // Largest checkpoint byte ≤ req_start.
138 let lo_idx = match self.points.binary_search_by_key(&req_start, |p| p.byte) {
139 Ok(i) => i,
140 Err(i) => i.saturating_sub(1),
141 };
142 let lo = self.points.get(lo_idx).map_or(0, |p| p.byte);
143
144 // Smallest checkpoint byte ≥ req_end.
145 let hi_idx = match self.points.binary_search_by_key(&req_end, |p| p.byte) {
146 Ok(i) => i,
147 Err(i) => i,
148 };
149 let hi = self.points.get(hi_idx).map_or(self.source_len, |p| p.byte);
150
151 lo..hi
152 }
153
154 /// Number of recorded checkpoints, including the implicit `byte = 0`
155 /// and the end-of-source sentinel. Exposed for tests and the
156 /// allocation-discipline bench.
157 #[must_use]
158 pub fn len(&self) -> usize {
159 self.points.len()
160 }
161
162 /// `true` iff the table has only its two bookends; the document
163 /// contains no top-level block.
164 #[must_use]
165 pub fn is_empty(&self) -> bool {
166 self.points.len() <= 2
167 }
168}
169
170#[cfg(test)]
171#[allow(clippy::expect_used)]
172mod tests {
173 use super::CheckpointTable;
174
175 #[test]
176 fn empty_source() {
177 let t = CheckpointTable::build("").expect("checkpoint source parses");
178 // Both bookends collapse to byte 0 (source_len == 0).
179 assert_eq!(t.len(), 1);
180 assert!(t.is_empty());
181 assert_eq!(t.snap_to_block_boundaries(0..0), 0..0);
182 }
183
184 #[test]
185 fn three_paragraphs() {
186 let src = "a\n\nb\n\nc\n";
187 let t = CheckpointTable::build(src).expect("checkpoint source parses");
188 // 0, start-of-a, start-of-b, start-of-c, sentinel; but
189 // start-of-a coincides with 0 so it's deduplicated.
190 assert!(t.len() >= 4);
191 // Range hitting `b` snaps to start-of-b..start-of-c.
192 let snapped = t.snap_to_block_boundaries(3..4);
193 assert_eq!(&src[snapped.start as usize..snapped.end as usize], "b\n\n");
194 }
195
196 #[test]
197 fn range_inside_list_snaps_to_list_boundaries() {
198 let src = "para\n\n1. one\n2. two\n3. three\n\ntail\n";
199 let t = CheckpointTable::build(src).expect("checkpoint source parses");
200 // Pick a byte inside the list (offset of "two").
201 let two_at = src.find("two").unwrap_or(0);
202 let snapped = t.snap_to_block_boundaries(two_at as u32..two_at as u32 + 1);
203 let slice = &src[snapped.start as usize..snapped.end as usize];
204 // The slice must contain the full list, not just one item;
205 // otherwise renumber would diverge from whole-document output.
206 assert!(slice.contains("1. one"), "slice should include list start: {slice:?}");
207 assert!(slice.contains("3. three"), "slice should include list end: {slice:?}");
208 }
209
210 #[test]
211 fn range_past_end_snaps_to_empty() {
212 let src = "a\n";
213 let t = CheckpointTable::build(src).expect("checkpoint source parses");
214 let snapped = t.snap_to_block_boundaries(99..100);
215 assert_eq!(snapped, src.len() as u32..src.len() as u32);
216 }
217
218 #[test]
219 fn frontmatter_is_one_prelude_region() {
220 let src = "---\ntitle: x\n---\n# heading\n\npara\n";
221 let t = CheckpointTable::build(src).expect("checkpoint source parses");
222 // A range starting inside frontmatter should snap forward to
223 // the first body block ("# heading").
224 let heading_at = src.find("# heading").unwrap_or(0);
225 let snapped = t.snap_to_block_boundaries(2..3);
226 assert!(snapped.start as usize <= heading_at);
227 assert!(snapped.end as usize >= heading_at);
228 }
229
230 #[test]
231 fn crlf_source_preserves_original_offsets() {
232 let src = "a\r\n\r\nb\r\n";
233 let t = CheckpointTable::build(src).expect("checkpoint source parses");
234 // The "b" paragraph's checkpoint should be at the byte
235 // position of "b" in the ORIGINAL (CRLF) source.
236 let b_at = src.find('b').unwrap_or(0) as u32;
237 let snapped = t.snap_to_block_boundaries(b_at..b_at + 1);
238 let slice = &src[snapped.start as usize..snapped.end as usize];
239 assert!(slice.contains('b'));
240 }
241}