quarto_source_map/file_info.rs
1//! Efficient file information for location lookups
2
3use crate::types::Location;
4use serde::{Deserialize, Serialize};
5
6/// Efficient file content analysis for location lookups
7///
8/// This struct stores metadata about a file that enables fast conversion
9/// from byte offsets to (row, column) positions without storing the full
10/// file content.
11#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
12pub struct FileInformation {
13 /// Byte offsets of each newline character in the file
14 line_breaks: Vec<usize>,
15
16 /// Total length of the file in bytes
17 total_length: usize,
18}
19
20impl FileInformation {
21 /// Create file information by analyzing content
22 ///
23 /// Scans the content once to build an index of line break positions.
24 /// This enables O(log n) offset-to-location lookups via binary search.
25 ///
26 /// # Example
27 ///
28 /// ```
29 /// use quarto_source_map::FileInformation;
30 ///
31 /// let info = FileInformation::new("line 1\nline 2\nline 3");
32 /// ```
33 pub fn new(content: &str) -> Self {
34 let line_breaks: Vec<usize> = content
35 .char_indices()
36 .filter_map(|(idx, ch)| if ch == '\n' { Some(idx) } else { None })
37 .collect();
38
39 FileInformation {
40 line_breaks,
41 total_length: content.len(),
42 }
43 }
44
45 /// Create file information from pre-computed parts
46 ///
47 /// This is useful when deserializing from formats that store
48 /// line break information directly (like JSON).
49 ///
50 /// # Example
51 ///
52 /// ```
53 /// use quarto_source_map::FileInformation;
54 ///
55 /// let info = FileInformation::from_parts(vec![6, 13], 20);
56 /// ```
57 pub fn from_parts(line_breaks: Vec<usize>, total_length: usize) -> Self {
58 FileInformation {
59 line_breaks,
60 total_length,
61 }
62 }
63
64 /// Convert a byte offset to a Location with row and column
65 ///
66 /// Uses binary search to find which line contains the offset.
67 /// Runs in O(log n) time where n is the number of lines.
68 ///
69 /// The column is computed as character count (not byte count) from the start
70 /// of the line to the offset, which requires the content parameter.
71 ///
72 /// Returns None if the offset is out of bounds.
73 ///
74 /// # Example
75 ///
76 /// ```
77 /// use quarto_source_map::FileInformation;
78 ///
79 /// let content = "hello\nworld";
80 /// let info = FileInformation::new(content);
81 /// let loc = info.offset_to_location(6, content).unwrap();
82 /// assert_eq!(loc.row, 1);
83 /// assert_eq!(loc.column, 0);
84 /// ```
85 pub fn offset_to_location(&self, offset: usize, content: &str) -> Option<Location> {
86 if offset > self.total_length {
87 return None;
88 }
89
90 // Binary search to find which line the offset is on
91 // line_breaks[i] is the position of the i-th newline (0-indexed)
92 // So line 0 contains [0, line_breaks[0])
93 // Line 1 contains [line_breaks[0]+1, line_breaks[1])
94 // etc.
95
96 let row = match self.line_breaks.binary_search(&offset) {
97 // Offset is exactly at a newline character
98 // That newline belongs to the line it terminates, not the next line
99 Ok(idx) => idx,
100 // Offset is between line breaks (or before the first, or after the last)
101 Err(idx) => idx,
102 };
103
104 // Column is distance from the start of this line
105 let line_start = if row == 0 {
106 0
107 } else {
108 self.line_breaks[row - 1] + 1 // +1 to skip past the '\n'
109 };
110
111 // Count characters (not bytes) from line_start to offset.
112 // Tree-sitter and Pandoc-source byte ranges occasionally produce
113 // offsets that land inside a multi-byte UTF-8 sequence; floor such
114 // offsets to the start of the enclosing char so the slice stays on
115 // a valid char boundary instead of panicking.
116 let mut safe_offset = offset;
117 while safe_offset > line_start && !content.is_char_boundary(safe_offset) {
118 safe_offset -= 1;
119 }
120 let column = content[line_start..safe_offset].chars().count();
121
122 Some(Location {
123 offset,
124 row,
125 column,
126 })
127 }
128
129 /// Get the total length of the file in bytes
130 pub fn total_length(&self) -> usize {
131 self.total_length
132 }
133
134 /// Get the line breaks array (byte offsets of newline characters)
135 pub fn line_breaks(&self) -> &[usize] {
136 &self.line_breaks
137 }
138
139 /// Get the number of lines in the file
140 pub fn line_count(&self) -> usize {
141 // If there are no newlines, there's 1 line
142 // If there are n newlines, there are n+1 lines
143 self.line_breaks.len() + 1
144 }
145}
146
#[cfg(test)]
mod tests {
    use super::*;

    /// Resolve `offset` against `content` and check the reported row and column.
    ///
    /// `#[track_caller]` makes a failure point at the calling line in the test
    /// rather than at this helper.
    #[track_caller]
    fn expect_position(
        info: &FileInformation,
        content: &str,
        offset: usize,
        row: usize,
        column: usize,
    ) {
        let loc = info.offset_to_location(offset, content).unwrap();
        assert_eq!(loc.row, row);
        assert_eq!(loc.column, column);
    }

    #[test]
    fn test_empty_file() {
        let content = "";
        let info = FileInformation::new(content);
        assert_eq!(info.total_length(), 0);
        assert_eq!(info.line_count(), 1);

        // Offset 0 is the only valid position in an empty file.
        let loc = info.offset_to_location(0, content).unwrap();
        assert_eq!(loc.offset, 0);
        assert_eq!(loc.row, 0);
        assert_eq!(loc.column, 0);
    }

    #[test]
    fn test_single_line() {
        let content = "hello world";
        let info = FileInformation::new(content);
        assert_eq!(info.total_length(), 11);
        assert_eq!(info.line_count(), 1);

        expect_position(&info, content, 0, 0, 0); // start of line
        expect_position(&info, content, 6, 0, 6); // middle of line
        expect_position(&info, content, 11, 0, 11); // end of line
    }

    #[test]
    fn test_multiple_lines() {
        let content = "line 1\nline 2\nline 3";
        let info = FileInformation::new(content);
        assert_eq!(info.line_count(), 3);

        expect_position(&info, content, 0, 0, 0); // first line
        expect_position(&info, content, 6, 0, 6); // first '\n' stays on row 0
        expect_position(&info, content, 7, 1, 0); // 'l' of "line 2"
        expect_position(&info, content, 13, 1, 6); // second '\n' stays on row 1
        expect_position(&info, content, 14, 2, 0); // 'l' of "line 3"
        expect_position(&info, content, 20, 2, 6); // end of file
    }

    #[test]
    fn test_out_of_bounds() {
        let content = "hello";
        let info = FileInformation::new(content);
        assert!(info.offset_to_location(100, content).is_none());
    }

    #[test]
    fn test_unicode_content() {
        // 'é' and 'ö' are each 2 bytes in UTF-8, so the content is
        // 4 chars + 1 newline + 5 chars, but 12 bytes in total.
        let content = "café\nwörld";
        let info = FileInformation::new(content);

        // Offsets are in bytes while columns are in characters.
        // "café" is 5 bytes: c(1) a(1) f(1) é(2); the newline is 1 byte,
        // so the second line starts at byte offset 6.
        expect_position(&info, content, 6, 1, 0);
    }

    #[test]
    fn test_file_ending_with_newline() {
        let content = "line 1\nline 2\n";
        let info = FileInformation::new(content);
        assert_eq!(info.line_count(), 3); // Empty third line

        expect_position(&info, content, 13, 1, 6); // the final newline
        expect_position(&info, content, 14, 2, 0); // empty line after it
    }

    #[test]
    fn test_consecutive_newlines() {
        let content = "a\n\n\nb";
        let info = FileInformation::new(content);
        assert_eq!(info.line_count(), 4);

        expect_position(&info, content, 0, 0, 0); // 'a'
        expect_position(&info, content, 1, 0, 1); // first newline
        expect_position(&info, content, 2, 1, 0); // empty second line
        expect_position(&info, content, 3, 2, 0); // empty third line
        expect_position(&info, content, 4, 3, 0); // 'b' on the fourth line
    }

    #[test]
    fn test_multibyte_utf8_column_should_be_character_count() {
        // Columns must be character counts, not byte offsets.
        // Swedish text with multi-byte UTF-8 characters (å, ä, ö = 2 bytes each).
        let content = "Gällande frågorna om något";
        // Character positions: G=0, ä=1, l=2, l=3, a=4, n=5, d=6, e=7, space=8, f=9, r=10, å=11, g=12, ...
        // Byte positions: G=0, ä=1-2, l=3, l=4, a=5, n=6, d=7, e=8, space=9, f=10, r=11, å=12-13, g=14, ...
        let info = FileInformation::new(content);

        // "å" in "frågorna": character 11, occupying bytes 12-13.
        let at_a_ring = info.offset_to_location(12, content).unwrap();
        assert_eq!(at_a_ring.row, 0);
        assert_eq!(
            at_a_ring.column, 11,
            "Column should be character count (11), not byte offset (12)"
        );

        // "g" right after "å": character 12, byte offset 14.
        let at_g = info.offset_to_location(14, content).unwrap();
        assert_eq!(at_g.row, 0);
        assert_eq!(
            at_g.column, 12,
            "Column should be character count (12), not byte offset (14)"
        );
    }

    #[test]
    fn test_offset_inside_multibyte_char_does_not_panic() {
        // Regression: a byte offset that lands inside a multi-byte UTF-8
        // sequence used to panic with "byte index N is not a char boundary".
        // Tree-sitter and Pandoc-source byte ranges can both produce such
        // offsets in the wild, so the lookup must return a valid Location
        // (rounded down to the previous char boundary) rather than panic.
        let content = "x ❤️ y"; // ❤ = 3 bytes (E2 9D A4), ️ = 3 bytes (EF B8 8F)
        let info = FileInformation::new(content);

        // "❤" spans bytes 2..5. Bytes 3 and 4 are its second and third bytes;
        // both floor to byte 2, which has 2 chars ("x ") before it.
        expect_position(&info, content, 3, 0, 2);
        expect_position(&info, content, 4, 0, 2);
    }
}