Skip to main content

mdwright_document/
line_index.rs

1//! Byte-offset → (line, column) mapping over a Markdown source string.
2//!
3//! pulldown-cmark hands us byte ranges; lint diagnostics report
4//! `file:line:col` in the editor convention (both 1-indexed, columns
5//! counted in Unicode codepoints, not bytes). This helper records the
6//! line-start table once and answers every offset lookup in O(log n).
7//!
8//! The index owns no borrow on the source; callers pass the source
9//! `&str` at query time. This lets [`crate::source::Source`] keep
10//! ownership of the bytes without forcing a lifetime parameter on
11//! every type that holds a `LineIndex`. The intended pattern is one
12//! `LineIndex` built from `Source::original` at parse time, shared
13//! by reference for the document's lifetime.
14
15use std::fmt;
16
17/// Maps byte offsets in a source string to 1-indexed (line, column)
18/// pairs. Columns count Unicode codepoints, matching what `grep -n`
19/// and most editors display.
20#[derive(Debug, Clone)]
21pub struct LineIndex {
22    /// `line_starts[i]` is the byte offset of the first byte of line
23    /// `i + 1`. `line_starts[0]` is always `0`. The table has one
24    /// entry per newline plus one for the document start, so a final
25    /// non-terminated line still has a start entry.
26    line_starts: Vec<u32>,
27}
28
29#[derive(Clone, Debug, PartialEq, Eq)]
30pub enum LineIndexError {
31    OffsetOutOfBounds { byte: usize, source_len: usize },
32    OffsetTooLarge { byte: usize },
33    NotUtf8Boundary { byte: usize },
34}
35
36impl fmt::Display for LineIndexError {
37    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
38        match self {
39            Self::OffsetOutOfBounds { byte, source_len } => {
40                write!(f, "byte offset {byte} past source length {source_len}")
41            }
42            Self::OffsetTooLarge { byte } => write!(f, "byte offset {byte} exceeds u32 range"),
43            Self::NotUtf8Boundary { byte } => write!(f, "byte {byte} not on a UTF-8 boundary"),
44        }
45    }
46}
47
48impl std::error::Error for LineIndexError {}
49
50impl LineIndex {
51    /// Build from `source` bytes. The index does not hold the
52    /// reference; it captures only the newline offsets.
53    #[must_use]
54    pub fn new(source: &str) -> Self {
55        let mut line_starts: Vec<u32> = Vec::with_capacity(source.len() / 40);
56        line_starts.push(0);
57        for (i, b) in source.bytes().enumerate() {
58            if b == b'\n' {
59                let next = u32::try_from(i.saturating_add(1)).unwrap_or(u32::MAX);
60                line_starts.push(next);
61            }
62        }
63        Self { line_starts }
64    }
65
66    /// 1-indexed (line, column) for the codepoint starting at `byte`.
67    ///
68    /// `source` must be the same buffer the index was built from
69    /// (otherwise codepoint counting will use the wrong bytes).
70    ///
71    /// # Errors
72    ///
73    /// Returns `Err` if `byte` lies past the end of `source` or not
74    /// on a UTF-8 boundary. Callers should pass offsets produced by
75    /// pulldown-cmark, which always satisfy both conditions.
76    pub fn locate(&self, source: &str, byte: usize) -> Result<(usize, usize), LineIndexError> {
77        if byte > source.len() {
78            return Err(LineIndexError::OffsetOutOfBounds {
79                byte,
80                source_len: source.len(),
81            });
82        }
83        let byte_u32 = u32::try_from(byte).map_err(|_| LineIndexError::OffsetTooLarge { byte })?;
84        // Binary search for the largest line_start ≤ byte.
85        let idx = match self.line_starts.binary_search(&byte_u32) {
86            Ok(i) => i,
87            Err(i) => i.saturating_sub(1),
88        };
89        let line_start = self.line_starts.get(idx).copied().unwrap_or(0) as usize;
90        let prefix = source
91            .get(line_start..byte)
92            .ok_or(LineIndexError::NotUtf8Boundary { byte })?;
93        let column = prefix.chars().count().saturating_add(1);
94        Ok((idx.saturating_add(1), column))
95    }
96
97    /// Byte offset of the codepoint at LSP-convention `(line, column)`
98    /// (both 0-indexed; column counts Unicode codepoints).
99    ///
100    /// Returns `None` if `line` is past the end of the source, if
101    /// `column` lands past the end of its line, or if the position
102    /// isn't on a UTF-8 boundary.
103    ///
104    /// The 0-based input convention matches the Language Server
105    /// Protocol (`Position { line, character }`); the existing
106    /// [`Self::locate`] reverses the mapping and returns 1-based
107    /// coordinates suitable for `file:line:col` diagnostic display.
108    /// Don't conflate the two.
109    #[must_use]
110    pub fn byte_of_position_0based(&self, source: &str, line: usize, column: usize) -> Option<usize> {
111        let line_start = self.line_starts.get(line).copied()? as usize;
112        let line_end = self
113            .line_starts
114            .get(line.saturating_add(1))
115            .copied()
116            .map_or(source.len(), |n| n as usize);
117        // Strip the trailing newline (and an optional preceding `\r`)
118        // so a request for column == line length resolves at the byte
119        // before the line terminator rather than past it.
120        let mut visible_end = line_end;
121        if visible_end > line_start && source.as_bytes().get(visible_end.saturating_sub(1)) == Some(&b'\n') {
122            visible_end = visible_end.saturating_sub(1);
123            if visible_end > line_start && source.as_bytes().get(visible_end.saturating_sub(1)) == Some(&b'\r') {
124                visible_end = visible_end.saturating_sub(1);
125            }
126        }
127        let line_text = source.get(line_start..visible_end)?;
128        let mut cursor = line_start;
129        let mut remaining = column;
130        for ch in line_text.chars() {
131            if remaining == 0 {
132                return Some(cursor);
133            }
134            cursor = cursor.saturating_add(ch.len_utf8());
135            remaining = remaining.saturating_sub(1);
136        }
137        // Column lands at, or past, the end of the visible line text.
138        // LSP convention permits "one past last codepoint" (end-of-line
139        // cursor), so accept column == codepoint_count by returning the
140        // visible-end byte. Larger columns are out of range.
141        (remaining == 0).then_some(visible_end)
142    }
143
144    /// Byte range of the line containing `byte`, with the trailing
145    /// `\n` trimmed. Returns `None` if `byte` is past the source end.
146    ///
147    /// The slice `source[range]` is exactly the line text the
148    /// rustc-style pretty renderer and the JSON Lines `snippet` field
149    /// quote back to the user.
150    #[must_use]
151    pub fn line_bounds(&self, source: &str, byte: usize) -> Option<std::ops::Range<usize>> {
152        if byte > source.len() {
153            return None;
154        }
155        let byte_u32 = u32::try_from(byte).ok()?;
156        let idx = match self.line_starts.binary_search(&byte_u32) {
157            Ok(i) => i,
158            Err(i) => i.saturating_sub(1),
159        };
160        let start = self.line_starts.get(idx).copied()? as usize;
161        let raw_end = self
162            .line_starts
163            .get(idx.saturating_add(1))
164            .copied()
165            .map_or(source.len(), |n| n as usize);
166        // Trim the trailing `\n` (and a preceding `\r`, if present)
167        // so callers get just the visible line text.
168        let mut end = raw_end;
169        if end > start && source.as_bytes().get(end.saturating_sub(1)) == Some(&b'\n') {
170            end = end.saturating_sub(1);
171            if end > start && source.as_bytes().get(end.saturating_sub(1)) == Some(&b'\r') {
172                end = end.saturating_sub(1);
173            }
174        }
175        Some(start..end)
176    }
177}
178
179#[cfg(test)]
180mod tests {
181    use super::LineIndex;
182
183    fn locate(idx: &LineIndex, source: &str, byte: usize) -> Result<(usize, usize), String> {
184        idx.locate(source, byte).map_err(|err| err.to_string())
185    }
186
187    #[test]
188    fn locate_start_of_first_line() -> Result<(), String> {
189        let src = "hello\nworld\n";
190        let idx = LineIndex::new(src);
191        assert_eq!(locate(&idx, src, 0)?, (1, 1));
192        Ok(())
193    }
194
195    #[test]
196    fn locate_after_newline() -> Result<(), String> {
197        let src = "hello\nworld\n";
198        let idx = LineIndex::new(src);
199        assert_eq!(locate(&idx, src, 6)?, (2, 1));
200        Ok(())
201    }
202
203    #[test]
204    fn locate_codepoint_column() -> Result<(), String> {
205        let src = "αβγ\n";
206        let idx = LineIndex::new(src);
207        assert_eq!(locate(&idx, src, 4)?, (1, 3));
208        Ok(())
209    }
210
211    #[test]
212    fn rejects_out_of_range() {
213        let src = "hi\n";
214        let idx = LineIndex::new(src);
215        assert!(idx.locate(src, 99).is_err());
216    }
217
218    #[test]
219    fn byte_of_position_0based_basic() {
220        let src = "hello\nworld\n";
221        let idx = LineIndex::new(src);
222        assert_eq!(idx.byte_of_position_0based(src, 0, 0), Some(0));
223        assert_eq!(idx.byte_of_position_0based(src, 0, 5), Some(5));
224        assert_eq!(idx.byte_of_position_0based(src, 1, 0), Some(6));
225        assert_eq!(idx.byte_of_position_0based(src, 1, 5), Some(11));
226    }
227
228    #[test]
229    fn byte_of_position_0based_codepoints() {
230        let src = "αβγ\n";
231        let idx = LineIndex::new(src);
232        assert_eq!(idx.byte_of_position_0based(src, 0, 0), Some(0));
233        assert_eq!(idx.byte_of_position_0based(src, 0, 1), Some(2));
234        assert_eq!(idx.byte_of_position_0based(src, 0, 2), Some(4));
235        assert_eq!(idx.byte_of_position_0based(src, 0, 3), Some(6));
236    }
237
238    #[test]
239    fn byte_of_position_0based_out_of_range() {
240        let src = "hi\n";
241        let idx = LineIndex::new(src);
242        assert!(idx.byte_of_position_0based(src, 5, 0).is_none());
243        assert!(idx.byte_of_position_0based(src, 0, 99).is_none());
244    }
245}