Skip to main content

mdwright_format/incremental/
mod.rs

1//! Block-boundary checkpoint table for range-formatting.
2//!
3//! Editor latency requires that a one-paragraph edit format in time
4//! proportional to the paragraph, not the document. The mechanism is a
5//! *checkpoint table*: a sorted list of byte offsets, one per
6//! top-level Markdown block, that lets a caller-supplied byte range
7//! be snapped to the smallest covering whole-block slice. The slice is
8//! then parsed and formatted independently through the document and
9//! formatter entry points.
10//!
11//! A checkpoint sits at column 0 of a line that opens a **top-level**
12//! block (container depth zero, not inside a blockquote, list,
13//! footnote, or table). Both conditions are load-bearing: slicing two
14//! checkpoints must yield a syntactically self-contained Markdown
15//! sub-document, otherwise the substring contract in
16//! [`crate::format_range`] breaks. The depth clause specifically
17//! defends against slicing one item out of an ordered list, where the
18//! sliced item would re-parse as item 1 and `OrderedList::Renumber`
19//! would diverge from the whole-document output.
20//!
21//! Offsets are recorded in the **caller's source coordinates**
22//! (original bytes, before CM §2.1 / §2.3 canonicalisation). Pulldown
23//! reports canonical offsets; [`CheckpointTable::build`] translates
24//! them through `Source::to_original` once at build
25//! time. For the common case where input contains no `\r` or `\0` the
26//! translation is the identity and costs nothing.
27//!
28//! Frontmatter (YAML `---…---` or TOML `+++…+++`) is treated as a
29//! single prelude region: the first body checkpoint sits at the byte
30//! after the closing delimiter. A caller-supplied range that falls
31//! entirely inside frontmatter snaps forward to the first body block.
32
33use std::ops::Range;
34
35use mdwright_document::{Document, ParseError, ParseOptions};
36
37/// One block boundary in the caller's source.
38#[derive(Copy, Clone, Debug)]
39pub(crate) struct BlockCheckpoint {
40    /// Byte offset in the caller's source where the block starts.
41    /// Always column 0 of its line, always at container depth 0.
42    pub(crate) byte: u32,
43    /// Cheap snapshot of the parser walk state at this point. Reserved
44    /// for the incremental-rebuild work in the LSP session. The
45    /// current resumption logic doesn't read it, but recording it now
46    /// lets that session diff two tables for "did anything before this
47    /// boundary actually change?".
48    #[expect(dead_code, reason = "reserved for LSP incremental-rebuild")]
49    pub(crate) parser_state: u64,
50}
51
52/// Per-document table of top-level block boundaries.
53///
54/// Built once per source version via [`CheckpointTable::build`]; the
55/// LSP rebuilds on every `didChange` notification. Internally a sorted
56/// `Vec<BlockCheckpoint>` with `byte = 0` and a sentinel at
57/// `byte = source.len()` as bookends, so [`Self::snap_to_block_boundaries`]
58/// is branch-free at the bounds.
59#[derive(Debug)]
60pub struct CheckpointTable {
61    source_len: u32,
62    /// Sorted by `byte` ascending. First entry is always `byte = 0`;
63    /// last entry is always `byte = source_len`.
64    points: Vec<BlockCheckpoint>,
65}
66
67impl CheckpointTable {
68    /// Walk `source` once, recording one checkpoint per top-level
69    /// block. Cost: one pulldown event stream, one offset translation
70    /// per checkpoint, one `Vec` allocation.
71    ///
72    /// # Errors
73    ///
74    /// Returns [`ParseError`] if parser execution cannot safely
75    /// recognise the source.
76    pub fn build(source: &str) -> Result<Self, ParseError> {
77        Self::build_with_options(source, ParseOptions::default())
78    }
79
80    /// Build a checkpoint table under explicit recognition policy.
81    ///
82    /// # Errors
83    ///
84    /// Returns [`ParseError`] if parser execution cannot safely
85    /// recognise the source under `parse_options`.
86    pub fn build_with_options(source: &str, parse_options: ParseOptions) -> Result<Self, ParseError> {
87        let doc = Document::parse_with_options(source, parse_options)?;
88        Ok(Self::from_document(&doc))
89    }
90
91    /// Build from an already parsed document.
92    #[must_use]
93    pub fn from_document(doc: &Document) -> Self {
94        let facts = doc
95            .block_checkpoints()
96            .iter()
97            .map(|point| {
98                let byte = usize::try_from(point.byte).unwrap_or(usize::MAX);
99                let original = doc.canonical_to_original_range(byte..byte).start;
100                mdwright_document::BlockCheckpointFact {
101                    byte: u32::try_from(original).unwrap_or(u32::MAX),
102                    parser_state: point.parser_state,
103                }
104            })
105            .collect();
106        Self::from_facts(doc.original_source().len(), facts)
107    }
108
109    fn from_facts(source_len: usize, facts: Vec<mdwright_document::BlockCheckpointFact>) -> Self {
110        let source_len = u32::try_from(source_len).unwrap_or(u32::MAX);
111        let mut points: Vec<BlockCheckpoint> = facts
112            .into_iter()
113            .map(|point| BlockCheckpoint {
114                byte: point.byte,
115                parser_state: point.parser_state,
116            })
117            .collect();
118        if points.last().is_none_or(|last| last.byte < source_len) {
119            points.push(BlockCheckpoint {
120                byte: source_len,
121                parser_state: 0,
122            });
123        }
124
125        Self { source_len, points }
126    }
127
128    /// Smallest `[lo, hi)` byte range that covers `range` and starts
129    /// and ends on a checkpoint. Always succeeds: empty / out-of-bounds
130    /// / partial-block ranges all resolve to a well-defined slice (the
131    /// smallest superset; empty when the request is wholly past the
132    /// source end).
133    pub fn snap_to_block_boundaries(&self, range: Range<u32>) -> Range<u32> {
134        let req_start = range.start.min(self.source_len);
135        let req_end = range.end.min(self.source_len).max(req_start);
136
137        // Largest checkpoint byte ≤ req_start.
138        let lo_idx = match self.points.binary_search_by_key(&req_start, |p| p.byte) {
139            Ok(i) => i,
140            Err(i) => i.saturating_sub(1),
141        };
142        let lo = self.points.get(lo_idx).map_or(0, |p| p.byte);
143
144        // Smallest checkpoint byte ≥ req_end.
145        let hi_idx = match self.points.binary_search_by_key(&req_end, |p| p.byte) {
146            Ok(i) => i,
147            Err(i) => i,
148        };
149        let hi = self.points.get(hi_idx).map_or(self.source_len, |p| p.byte);
150
151        lo..hi
152    }
153
154    /// Number of recorded checkpoints, including the implicit `byte = 0`
155    /// and the end-of-source sentinel. Exposed for tests and the
156    /// allocation-discipline bench.
157    #[must_use]
158    pub fn len(&self) -> usize {
159        self.points.len()
160    }
161
162    /// `true` iff the table has only its two bookends; the document
163    /// contains no top-level block.
164    #[must_use]
165    pub fn is_empty(&self) -> bool {
166        self.points.len() <= 2
167    }
168}
169
170#[cfg(test)]
171#[allow(clippy::expect_used)]
172mod tests {
173    use super::CheckpointTable;
174
175    #[test]
176    fn empty_source() {
177        let t = CheckpointTable::build("").expect("checkpoint source parses");
178        // Both bookends collapse to byte 0 (source_len == 0).
179        assert_eq!(t.len(), 1);
180        assert!(t.is_empty());
181        assert_eq!(t.snap_to_block_boundaries(0..0), 0..0);
182    }
183
184    #[test]
185    fn three_paragraphs() {
186        let src = "a\n\nb\n\nc\n";
187        let t = CheckpointTable::build(src).expect("checkpoint source parses");
188        // 0, start-of-a, start-of-b, start-of-c, sentinel; but
189        // start-of-a coincides with 0 so it's deduplicated.
190        assert!(t.len() >= 4);
191        // Range hitting `b` snaps to start-of-b..start-of-c.
192        let snapped = t.snap_to_block_boundaries(3..4);
193        assert_eq!(&src[snapped.start as usize..snapped.end as usize], "b\n\n");
194    }
195
196    #[test]
197    fn range_inside_list_snaps_to_list_boundaries() {
198        let src = "para\n\n1. one\n2. two\n3. three\n\ntail\n";
199        let t = CheckpointTable::build(src).expect("checkpoint source parses");
200        // Pick a byte inside the list (offset of "two").
201        let two_at = src.find("two").unwrap_or(0);
202        let snapped = t.snap_to_block_boundaries(two_at as u32..two_at as u32 + 1);
203        let slice = &src[snapped.start as usize..snapped.end as usize];
204        // The slice must contain the full list, not just one item;
205        // otherwise renumber would diverge from whole-document output.
206        assert!(slice.contains("1. one"), "slice should include list start: {slice:?}");
207        assert!(slice.contains("3. three"), "slice should include list end: {slice:?}");
208    }
209
210    #[test]
211    fn range_past_end_snaps_to_empty() {
212        let src = "a\n";
213        let t = CheckpointTable::build(src).expect("checkpoint source parses");
214        let snapped = t.snap_to_block_boundaries(99..100);
215        assert_eq!(snapped, src.len() as u32..src.len() as u32);
216    }
217
218    #[test]
219    fn frontmatter_is_one_prelude_region() {
220        let src = "---\ntitle: x\n---\n# heading\n\npara\n";
221        let t = CheckpointTable::build(src).expect("checkpoint source parses");
222        // A range starting inside frontmatter should snap forward to
223        // the first body block ("# heading").
224        let heading_at = src.find("# heading").unwrap_or(0);
225        let snapped = t.snap_to_block_boundaries(2..3);
226        assert!(snapped.start as usize <= heading_at);
227        assert!(snapped.end as usize >= heading_at);
228    }
229
230    #[test]
231    fn crlf_source_preserves_original_offsets() {
232        let src = "a\r\n\r\nb\r\n";
233        let t = CheckpointTable::build(src).expect("checkpoint source parses");
234        // The "b" paragraph's checkpoint should be at the byte
235        // position of "b" in the ORIGINAL (CRLF) source.
236        let b_at = src.find('b').unwrap_or(0) as u32;
237        let snapped = t.snap_to_block_boundaries(b_at..b_at + 1);
238        let slice = &src[snapped.start as usize..snapped.end as usize];
239        assert!(slice.contains('b'));
240    }
241}