Skip to main content

quarto_source_map/
file_info.rs

1//! Efficient file information for location lookups
2
3use crate::types::Location;
4use serde::{Deserialize, Serialize};
5
6/// Efficient file content analysis for location lookups
7///
8/// This struct stores metadata about a file that enables fast conversion
9/// from byte offsets to (row, column) positions without storing the full
10/// file content.
11#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
12pub struct FileInformation {
13    /// Byte offsets of each newline character in the file
14    line_breaks: Vec<usize>,
15
16    /// Total length of the file in bytes
17    total_length: usize,
18}
19
20impl FileInformation {
21    /// Create file information by analyzing content
22    ///
23    /// Scans the content once to build an index of line break positions.
24    /// This enables O(log n) offset-to-location lookups via binary search.
25    ///
26    /// # Example
27    ///
28    /// ```
29    /// use quarto_source_map::FileInformation;
30    ///
31    /// let info = FileInformation::new("line 1\nline 2\nline 3");
32    /// ```
33    pub fn new(content: &str) -> Self {
34        let line_breaks: Vec<usize> = content
35            .char_indices()
36            .filter_map(|(idx, ch)| if ch == '\n' { Some(idx) } else { None })
37            .collect();
38
39        FileInformation {
40            line_breaks,
41            total_length: content.len(),
42        }
43    }
44
45    /// Create file information from pre-computed parts
46    ///
47    /// This is useful when deserializing from formats that store
48    /// line break information directly (like JSON).
49    ///
50    /// # Example
51    ///
52    /// ```
53    /// use quarto_source_map::FileInformation;
54    ///
55    /// let info = FileInformation::from_parts(vec![6, 13], 20);
56    /// ```
57    pub fn from_parts(line_breaks: Vec<usize>, total_length: usize) -> Self {
58        FileInformation {
59            line_breaks,
60            total_length,
61        }
62    }
63
64    /// Convert a byte offset to a Location with row and column
65    ///
66    /// Uses binary search to find which line contains the offset.
67    /// Runs in O(log n) time where n is the number of lines.
68    ///
69    /// The column is computed as character count (not byte count) from the start
70    /// of the line to the offset, which requires the content parameter.
71    ///
72    /// Returns None if the offset is out of bounds.
73    ///
74    /// # Example
75    ///
76    /// ```
77    /// use quarto_source_map::FileInformation;
78    ///
79    /// let content = "hello\nworld";
80    /// let info = FileInformation::new(content);
81    /// let loc = info.offset_to_location(6, content).unwrap();
82    /// assert_eq!(loc.row, 1);
83    /// assert_eq!(loc.column, 0);
84    /// ```
85    pub fn offset_to_location(&self, offset: usize, content: &str) -> Option<Location> {
86        if offset > self.total_length {
87            return None;
88        }
89
90        // Binary search to find which line the offset is on
91        // line_breaks[i] is the position of the i-th newline (0-indexed)
92        // So line 0 contains [0, line_breaks[0])
93        // Line 1 contains [line_breaks[0]+1, line_breaks[1])
94        // etc.
95
96        let row = match self.line_breaks.binary_search(&offset) {
97            // Offset is exactly at a newline character
98            // That newline belongs to the line it terminates, not the next line
99            Ok(idx) => idx,
100            // Offset is between line breaks (or before the first, or after the last)
101            Err(idx) => idx,
102        };
103
104        // Column is distance from the start of this line
105        let line_start = if row == 0 {
106            0
107        } else {
108            self.line_breaks[row - 1] + 1 // +1 to skip past the '\n'
109        };
110
111        // Count characters (not bytes) from line_start to offset.
112        // Tree-sitter and Pandoc-source byte ranges occasionally produce
113        // offsets that land inside a multi-byte UTF-8 sequence; floor such
114        // offsets to the start of the enclosing char so the slice stays on
115        // a valid char boundary instead of panicking.
116        let mut safe_offset = offset;
117        while safe_offset > line_start && !content.is_char_boundary(safe_offset) {
118            safe_offset -= 1;
119        }
120        let column = content[line_start..safe_offset].chars().count();
121
122        Some(Location {
123            offset,
124            row,
125            column,
126        })
127    }
128
129    /// Get the total length of the file in bytes
130    pub fn total_length(&self) -> usize {
131        self.total_length
132    }
133
134    /// Get the line breaks array (byte offsets of newline characters)
135    pub fn line_breaks(&self) -> &[usize] {
136        &self.line_breaks
137    }
138
139    /// Get the number of lines in the file
140    pub fn line_count(&self) -> usize {
141        // If there are no newlines, there's 1 line
142        // If there are n newlines, there are n+1 lines
143        self.line_breaks.len() + 1
144    }
145}
146
#[cfg(test)]
mod tests {
    use super::*;

    /// Resolves every `(byte offset, expected row, expected column)` entry of
    /// `expected` against `info` and `content`, asserting row and column.
    fn assert_positions(
        info: &FileInformation,
        content: &str,
        expected: &[(usize, usize, usize)],
    ) {
        for &(offset, row, column) in expected {
            let loc = info.offset_to_location(offset, content).unwrap();
            assert_eq!(loc.row, row);
            assert_eq!(loc.column, column);
        }
    }

    #[test]
    fn test_empty_file() {
        let content = "";
        let info = FileInformation::new(content);
        assert_eq!(info.total_length(), 0);
        assert_eq!(info.line_count(), 1);

        // Offset 0 is valid even though the file has no bytes.
        let loc = info.offset_to_location(0, content).unwrap();
        assert_eq!(loc.offset, 0);
        assert_eq!(loc.row, 0);
        assert_eq!(loc.column, 0);
    }

    #[test]
    fn test_single_line() {
        let content = "hello world";
        let info = FileInformation::new(content);
        assert_eq!(info.total_length(), 11);
        assert_eq!(info.line_count(), 1);

        assert_positions(
            &info,
            content,
            &[
                (0, 0, 0),   // start of line
                (6, 0, 6),   // middle of line
                (11, 0, 11), // end of line
            ],
        );
    }

    #[test]
    fn test_multiple_lines() {
        let content = "line 1\nline 2\nline 3";
        let info = FileInformation::new(content);
        assert_eq!(info.line_count(), 3);

        assert_positions(
            &info,
            content,
            &[
                (0, 0, 0),  // first line
                (6, 0, 6),  // the first '\n' stays on the line it ends
                (7, 1, 0),  // 'l' of "line 2"
                (13, 1, 6), // the second '\n'
                (14, 2, 0), // 'l' of "line 3"
                (20, 2, 6), // end of file
            ],
        );
    }

    #[test]
    fn test_out_of_bounds() {
        let content = "hello";
        let info = FileInformation::new(content);

        let loc = info.offset_to_location(100, content);
        assert!(loc.is_none());
    }

    #[test]
    fn test_unicode_content() {
        // Both lines contain a 2-byte character ('é' and 'ö'), so each line
        // is one byte longer than its character count.
        let content = "café\nwörld";
        let info = FileInformation::new(content);

        // Offsets are bytes, columns are characters.
        // "café" occupies 5 bytes (c, a, f = 1 each; é = 2) and the newline
        // is byte 5, so the second line begins at byte offset 6.
        assert_positions(&info, content, &[(6, 1, 0)]);
    }

    #[test]
    fn test_file_ending_with_newline() {
        let content = "line 1\nline 2\n";
        let info = FileInformation::new(content);
        // The trailing newline opens an empty third line.
        assert_eq!(info.line_count(), 3);

        assert_positions(
            &info,
            content,
            &[
                (13, 1, 6), // the final '\n'
                (14, 2, 0), // just past it, on the empty last line
            ],
        );
    }

    #[test]
    fn test_consecutive_newlines() {
        let content = "a\n\n\nb";
        let info = FileInformation::new(content);
        assert_eq!(info.line_count(), 4);

        assert_positions(
            &info,
            content,
            &[
                (0, 0, 0), // 'a'
                (1, 0, 1), // first '\n'
                (2, 1, 0), // empty second line
                (3, 2, 0), // empty third line
                (4, 3, 0), // 'b' on the fourth line
            ],
        );
    }

    #[test]
    fn test_multibyte_utf8_column_should_be_character_count() {
        // Columns must be character counts, not byte offsets.
        // Swedish text: å, ä and ö each take 2 bytes in UTF-8.
        //
        // Characters: G=0, ä=1, l=2, l=3, a=4, n=5, d=6, e=7, ' '=8, f=9, r=10, å=11, g=12, ...
        // Bytes:      G=0, ä=1-2, l=3, l=4, a=5, n=6, d=7, e=8, ' '=9, f=10, r=11, å=12-13, g=14, ...
        let content = "Gällande frågorna om något";
        let info = FileInformation::new(content);

        // "å" in "frågorna": character 11, starting at byte 12.
        let at_a_ring = info.offset_to_location(12, content).unwrap();
        assert_eq!(at_a_ring.row, 0);
        assert_eq!(
            at_a_ring.column, 11,
            "Column should be character count (11), not byte offset (12)"
        );

        // "g" right after it: character 12, byte 14.
        let at_g = info.offset_to_location(14, content).unwrap();
        assert_eq!(at_g.row, 0);
        assert_eq!(
            at_g.column, 12,
            "Column should be character count (12), not byte offset (14)"
        );
    }

    #[test]
    fn test_offset_inside_multibyte_char_does_not_panic() {
        // Regression: an offset inside a multi-byte UTF-8 sequence used to
        // panic with "byte index N is not a char boundary". Tree-sitter and
        // Pandoc-source byte ranges can produce such offsets, so the lookup
        // must yield a Location floored to the previous char boundary.
        let content = "x ❤️ y"; // ❤ = 3 bytes (E2 9D A4), ️ = 3 bytes (EF B8 8F)
        let info = FileInformation::new(content);

        // "❤" spans bytes 2..5, so bytes 3 and 4 are both inside it. Each is
        // floored to byte 2, which has two characters ("x ") before it.
        assert_positions(&info, content, &[(3, 0, 2), (4, 0, 2)]);
    }
}