sipha_source/source_file.rs
1//! Source file management and position conversion.
2
3use std::io;
4use std::path::{Path, PathBuf};
5
6use sipha_core::span::Span;
7
8use crate::content::SourceContent;
9use crate::line_map::LineMap;
10use crate::position::Position;
11
/// Represents a source file with efficient line/column conversion.
///
/// `SourceFile` owns the source content (a UTF-8 string or raw bytes), an
/// optional file path, and a [`LineMap`] that is built once at construction
/// time and reused for every position conversion.
///
/// # Example
///
/// ```rust
/// use sipha_source::SourceFile;
/// use sipha_core::span::Span;
///
/// let source = SourceFile::new(
///     "fn main() {\n    println!(\"Hello\");\n}".to_string(),
///     None,
/// );
///
/// // Convert byte offset to line/column
/// let pos = source.byte_to_line_col(16).unwrap();
/// assert_eq!(pos.line(), 2);
/// assert_eq!(pos.column(), 5);
///
/// // Extract source text for a span
/// let span = Span::new(16, 23);
/// let text = source.extract_span(span).unwrap();
/// assert_eq!(text, "println");
/// ```
#[derive(Clone, Debug)]
pub struct SourceFile {
    /// The source content (UTF-8 or binary).
    content: SourceContent,
    /// Optional file path; `None` for in-memory sources.
    path: Option<PathBuf>,
    /// Cached line map for efficient lookups, built by the constructors.
    line_map: LineMap,
    /// Whether to use UTF-8 character-based column calculation.
    /// If false, columns are byte-based (for non-UTF-8 content).
    /// The constructors set this to `true` exactly when the content is
    /// stored as UTF-8.
    utf8_columns: bool,
}
50
/// A source code snippet with context lines.
///
/// Used for displaying error messages with surrounding context.
/// Instances are produced by `SourceFile::extract_snippet`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SourceSnippet {
    /// Lines in the snippet, as (line_number, line_content) pairs.
    /// Line numbers are 1-indexed.
    ///
    /// A line that is not valid UTF-8 is stored as a placeholder string
    /// reporting its byte length instead of its content.
    pub lines: Vec<(usize, String)>,
    /// The span to highlight in the snippet.
    ///
    /// This is the span that was passed to `extract_snippet`, unchanged, so
    /// its offsets are relative to the whole source rather than to the snippet.
    pub highlight_span: Span,
    /// First line number in the snippet (1-indexed).
    pub start_line: usize,
    /// Last line number in the snippet (1-indexed).
    pub end_line: usize,
}
66
67impl SourceFile {
68 /// Create a new `SourceFile` from UTF-8 content.
69 ///
70 /// # Arguments
71 ///
72 /// * `content` - The source code content (UTF-8)
73 /// * `path` - Optional file path
74 ///
75 /// # Example
76 ///
77 /// ```rust
78 /// use sipha_source::SourceFile;
79 ///
80 /// let source = SourceFile::new("hello world".to_string(), None);
81 /// ```
82 pub fn new(content: String, path: Option<PathBuf>) -> Self {
83 let line_map = LineMap::new(&content);
84 Self {
85 content: SourceContent::Utf8(content),
86 path,
87 line_map,
88 utf8_columns: true,
89 }
90 }
91
92 /// Create a new `SourceFile` from bytes.
93 ///
94 /// This method supports both UTF-8 and non-UTF-8 content.
95 /// For non-UTF-8 content, column calculations will be byte-based.
96 ///
97 /// # Arguments
98 ///
99 /// * `content` - The source code content as bytes
100 /// * `path` - Optional file path
101 ///
102 /// # Example
103 ///
104 /// ```rust
105 /// use sipha_source::SourceFile;
106 ///
107 /// // UTF-8 content
108 /// let source = SourceFile::from_bytes(b"hello world".to_vec(), None);
109 ///
110 /// // Non-UTF-8 content (e.g., Latin-1)
111 /// let latin1 = vec![0xE9, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64]; // "é world" in Latin-1
112 /// let source = SourceFile::from_bytes(latin1, None);
113 /// ```
114 pub fn from_bytes(content: Vec<u8>, path: Option<PathBuf>) -> Self {
115 let line_map = LineMap::from_bytes(&content);
116 // Check if content is valid UTF-8 and convert if so
117 if let Ok(utf8_str) = std::str::from_utf8(&content) {
118 Self {
119 content: SourceContent::Utf8(utf8_str.to_string()),
120 path,
121 line_map,
122 utf8_columns: true,
123 }
124 } else {
125 Self {
126 content: SourceContent::Bytes(content),
127 path,
128 line_map,
129 utf8_columns: false,
130 }
131 }
132 }
133
134 /// Load a source file from the filesystem as UTF-8.
135 ///
136 /// # Arguments
137 ///
138 /// * `path` - Path to the file to load
139 ///
140 /// # Errors
141 ///
142 /// Returns an `io::Error` if the file cannot be read or is not valid UTF-8.
143 ///
144 /// # Example
145 ///
146 /// ```rust,no_run
147 /// use sipha_source::SourceFile;
148 /// use std::path::Path;
149 ///
150 /// let source = SourceFile::from_path(Path::new("src/main.rs")).unwrap();
151 /// ```
152 pub fn from_path(path: &Path) -> Result<Self, io::Error> {
153 let content = std::fs::read_to_string(path)?;
154 Ok(Self::new(content, Some(path.to_path_buf())))
155 }
156
157 /// Load a source file from the filesystem as raw bytes.
158 ///
159 /// This method supports both UTF-8 and non-UTF-8 files.
160 ///
161 /// # Arguments
162 ///
163 /// * `path` - Path to the file to load
164 ///
165 /// # Errors
166 ///
167 /// Returns an `io::Error` if the file cannot be read.
168 ///
169 /// # Example
170 ///
171 /// ```rust,no_run
172 /// use sipha_source::SourceFile;
173 /// use std::path::Path;
174 ///
175 /// // Load any file, including non-UTF-8
176 /// let source = SourceFile::from_path_bytes(Path::new("data.bin")).unwrap();
177 /// ```
178 pub fn from_path_bytes(path: &Path) -> Result<Self, io::Error> {
179 let content = std::fs::read(path)?;
180 Ok(Self::from_bytes(content, Some(path.to_path_buf())))
181 }
182
183 /// Get the source content as a UTF-8 string, if available.
184 ///
185 /// Returns `None` if the content is not valid UTF-8.
186 pub fn content(&self) -> Option<&str> {
187 self.content.as_str()
188 }
189
190 /// Get the source content as bytes.
191 pub fn content_bytes(&self) -> &[u8] {
192 self.content.as_bytes()
193 }
194
195 /// Check if the content is UTF-8 encoded.
196 pub fn is_utf8(&self) -> bool {
197 self.content.is_utf8() && self.utf8_columns
198 }
199
200 /// Get the file path, if available.
201 pub fn path(&self) -> Option<&Path> {
202 self.path.as_deref()
203 }
204
205 /// Get the total byte length of the source.
206 pub fn byte_len(&self) -> usize {
207 self.content.len()
208 }
209
210 /// Get the total number of lines in the source.
211 pub fn line_count(&self) -> usize {
212 self.line_map.line_count()
213 }
214
215 /// Get a specific line by line number (1-indexed) as a UTF-8 string.
216 ///
217 /// Returns the line content without the trailing newline (if present).
218 /// Returns `None` if the line number is out of bounds or the line is not valid UTF-8.
219 ///
220 /// # Example
221 ///
222 /// ```rust
223 /// use sipha_source::SourceFile;
224 ///
225 /// let source = SourceFile::new("line 1\nline 2\nline 3".to_string(), None);
226 /// assert_eq!(source.line(1), Some("line 1"));
227 /// assert_eq!(source.line(2), Some("line 2"));
228 /// assert_eq!(source.line(4), None);
229 /// ```
230 pub fn line(&self, line_num: usize) -> Option<&str> {
231 if line_num == 0 || line_num > self.line_map.line_count() {
232 return None;
233 }
234
235 let line_idx = line_num - 1; // Convert to 0-indexed
236 let start = self.line_map.line_start(line_idx)?;
237 let end = self.line_map.line_end(line_idx)?;
238
239 self.content.try_str_slice(start, end)
240 }
241
242 /// Get a specific line by line number (1-indexed) as bytes.
243 ///
244 /// Returns the line content without the trailing newline (if present).
245 /// Returns `None` if the line number is out of bounds.
246 ///
247 /// # Example
248 ///
249 /// ```rust
250 /// use sipha_source::SourceFile;
251 ///
252 /// let source = SourceFile::from_bytes(b"line 1\nline 2".to_vec(), None);
253 /// let line = source.line_bytes(1).unwrap();
254 /// assert_eq!(line, b"line 1");
255 /// ```
256 pub fn line_bytes(&self, line_num: usize) -> Option<&[u8]> {
257 if line_num == 0 || line_num > self.line_map.line_count() {
258 return None;
259 }
260
261 let line_idx = line_num - 1; // Convert to 0-indexed
262 let start = self.line_map.line_start(line_idx)?;
263 let end = self.line_map.line_end(line_idx)?;
264
265 Some(&self.content.as_bytes()[start..end])
266 }
267
268 /// Convert a byte offset to a line/column position.
269 ///
270 /// Returns `None` if the byte offset is out of bounds.
271 /// For UTF-8 content, column numbers are 1-indexed character positions.
272 /// For non-UTF-8 content, column numbers are 1-indexed byte positions.
273 ///
274 /// # Example
275 ///
276 /// ```rust
277 /// use sipha_source::SourceFile;
278 ///
279 /// let source = SourceFile::new("hello\nworld".to_string(), None);
280 /// let pos = source.byte_to_line_col(6).unwrap();
281 /// assert_eq!(pos.line(), 2);
282 /// assert_eq!(pos.column(), 1);
283 /// ```
284 pub fn byte_to_line_col(&self, byte_offset: usize) -> Option<Position> {
285 if byte_offset > self.content.len() {
286 return None;
287 }
288
289 // Find the line containing this byte offset
290 let line_idx = self.line_map.byte_to_line(byte_offset)?;
291 let line_start = self.line_map.line_start(line_idx)?;
292
293 // Calculate column
294 let byte_offset_in_line = byte_offset - line_start;
295 let column = if self.utf8_columns {
296 // UTF-8: calculate as character position
297 if let Some(line_content) = self
298 .content
299 .try_str_slice(line_start, self.line_map.line_end(line_idx)?)
300 {
301 line_content
302 .char_indices()
303 .take_while(|(idx, _)| *idx < byte_offset_in_line)
304 .count()
305 + 1 // 1-indexed
306 } else {
307 // Fallback to byte-based if line is not valid UTF-8
308 byte_offset_in_line + 1
309 }
310 } else {
311 // Non-UTF-8: use byte position
312 byte_offset_in_line + 1 // 1-indexed
313 };
314
315 Some(Position::new(
316 line_idx + 1, // 1-indexed line
317 column,
318 byte_offset,
319 ))
320 }
321
322 /// Convert a line/column position to a byte offset.
323 ///
324 /// Returns `None` if the line or column is out of bounds.
325 /// For UTF-8 content, column numbers are 1-indexed character positions.
326 /// For non-UTF-8 content, column numbers are 1-indexed byte positions.
327 ///
328 /// # Example
329 ///
330 /// ```rust
331 /// use sipha_source::SourceFile;
332 ///
333 /// let source = SourceFile::new("hello\nworld".to_string(), None);
334 /// let byte_offset = source.line_col_to_byte(2, 1).unwrap();
335 /// assert_eq!(byte_offset, 6); // Start of "world"
336 /// ```
337 pub fn line_col_to_byte(&self, line: usize, col: usize) -> Option<usize> {
338 if line == 0 {
339 return None;
340 }
341
342 let line_idx = line - 1; // Convert to 0-indexed
343
344 // Check if line exists (accounting for trailing newline creating an empty line)
345 if line_idx >= self.line_map.line_count() {
346 return None;
347 }
348
349 let line_start = self.line_map.line_start(line_idx)?;
350 let line_end = self.line_map.line_end(line_idx)?;
351
352 // Convert column (1-indexed) to byte offset
353 if col == 0 {
354 return Some(line_start);
355 }
356
357 if self.utf8_columns {
358 // UTF-8: convert character position to byte offset
359 if let Some(line_content) = self.content.try_str_slice(line_start, line_end) {
360 let char_count = line_content.chars().count();
361
362 // If the line is empty and we're asking for column > 1, return None
363 if char_count == 0 && col > 1 {
364 return None;
365 }
366
367 if col > char_count + 1 {
368 return Some(line_end);
369 }
370
371 let target_char_idx = col - 1; // Convert to 0-indexed
372
373 if target_char_idx == 0 {
374 return Some(line_start);
375 }
376
377 if target_char_idx >= char_count {
378 return Some(line_end);
379 }
380
381 // Find the byte offset for the target character
382 for (char_idx, (byte_idx, _)) in line_content.char_indices().enumerate() {
383 if char_idx == target_char_idx {
384 return Some(line_start + byte_idx);
385 }
386 }
387
388 Some(line_end)
389 } else {
390 // Line is not valid UTF-8, fall back to byte-based
391 let byte_col = col - 1;
392 if byte_col > (line_end - line_start) {
393 Some(line_end)
394 } else {
395 Some(line_start + byte_col)
396 }
397 }
398 } else {
399 // Non-UTF-8: use byte position directly
400 let byte_col = col - 1; // Convert to 0-indexed
401 let line_len = line_end - line_start;
402
403 if byte_col > line_len {
404 Some(line_end)
405 } else {
406 Some(line_start + byte_col)
407 }
408 }
409 }
410
411 /// Extract the source text for a span as a UTF-8 string.
412 ///
413 /// Returns `None` if the span is out of bounds or the span is not valid UTF-8.
414 ///
415 /// # Example
416 ///
417 /// ```rust
418 /// use sipha_source::SourceFile;
419 /// use sipha_core::span::Span;
420 ///
421 /// let source = SourceFile::new("hello world".to_string(), None);
422 /// let span = Span::new(0, 5);
423 /// assert_eq!(source.extract_span(span), Some("hello"));
424 /// ```
425 pub fn extract_span(&self, span: Span) -> Option<&str> {
426 if span.end() > self.content.len() {
427 return None;
428 }
429 self.content.try_str_slice(span.start(), span.end())
430 }
431
432 /// Extract the source bytes for a span.
433 ///
434 /// Returns `None` if the span is out of bounds.
435 ///
436 /// # Example
437 ///
438 /// ```rust
439 /// use sipha_source::SourceFile;
440 /// use sipha_core::span::Span;
441 ///
442 /// let source = SourceFile::from_bytes(b"hello world".to_vec(), None);
443 /// let span = Span::new(0, 5);
444 /// assert_eq!(source.extract_span_bytes(span), Some(b"hello".as_slice()));
445 /// ```
446 pub fn extract_span_bytes(&self, span: Span) -> Option<&[u8]> {
447 if span.end() > self.content.len() {
448 return None;
449 }
450 Some(&self.content.as_bytes()[span.start()..span.end()])
451 }
452
453 /// Extract a source snippet with context lines around a span.
454 ///
455 /// Returns `None` if the span is out of bounds.
456 ///
457 /// # Arguments
458 ///
459 /// * `span` - The span to highlight
460 /// * `context_lines` - Number of context lines to include before and after
461 ///
462 /// # Example
463 ///
464 /// ```rust
465 /// use sipha_source::SourceFile;
466 /// use sipha_core::span::Span;
467 ///
468 /// let source = SourceFile::new(
469 /// "line 1\nline 2\nline 3\nline 4\nline 5".to_string(),
470 /// None,
471 /// );
472 /// let span = Span::new(14, 19); // "line 3"
473 /// let snippet = source.extract_snippet(span, 1).unwrap();
474 /// assert_eq!(snippet.start_line, 2);
475 /// assert_eq!(snippet.end_line, 4);
476 /// ```
477 pub fn extract_snippet(&self, span: Span, context_lines: usize) -> Option<SourceSnippet> {
478 if span.end() > self.content.len() {
479 return None;
480 }
481
482 // Find the lines containing the span
483 let start_pos = self.byte_to_line_col(span.start())?;
484 let end_pos = self.byte_to_line_col(span.end().saturating_sub(1))?;
485
486 let start_line = start_pos.line();
487 let end_line = end_pos.line();
488
489 // Expand with context
490 let snippet_start_line = start_line.saturating_sub(context_lines);
491 let snippet_end_line = (end_line + context_lines).min(self.line_count());
492
493 // Collect lines
494 let mut lines = Vec::new();
495 for line_num in snippet_start_line..=snippet_end_line {
496 if let Some(line_bytes) = self.line_bytes(line_num) {
497 // Try to convert to string, or use a placeholder for non-UTF-8
498 let line_str = std::str::from_utf8(line_bytes)
499 .map(|s| s.to_string())
500 .unwrap_or_else(|_| format!("<non-UTF-8: {} bytes>", line_bytes.len()));
501 lines.push((line_num, line_str));
502 }
503 }
504
505 Some(SourceSnippet {
506 lines,
507 highlight_span: span,
508 start_line: snippet_start_line,
509 end_line: snippet_end_line,
510 })
511 }
512}
513
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an in-memory UTF-8 source with no associated path.
    fn source_of(text: &str) -> SourceFile {
        SourceFile::new(text.to_string(), None)
    }

    #[test]
    fn test_new() {
        let source = source_of("hello");

        assert_eq!(source.content(), Some("hello"));
        assert_eq!(source.path(), None);
        assert_eq!(source.byte_len(), 5);
    }

    #[test]
    fn test_with_path() {
        let expected = PathBuf::from("test.rs");
        let source = SourceFile::new("hello".to_string(), Some(expected.clone()));

        assert_eq!(source.path(), Some(expected.as_path()));
    }

    #[test]
    fn test_line() {
        let source = source_of("line 1\nline 2\nline 3");

        // Every existing line is returned without its newline.
        let present = [(1, "line 1"), (2, "line 2"), (3, "line 3")];
        for (line_num, text) in present {
            assert_eq!(source.line(line_num), Some(text));
        }

        // Line numbers are 1-indexed, so 0 and anything past the end are rejected.
        for line_num in [0, 4] {
            assert_eq!(source.line(line_num), None);
        }
    }

    #[test]
    fn test_byte_to_line_col() {
        let source = source_of("hello\nworld");

        // (byte offset, expected line, expected column)
        let cases = [(0, 1, 1), (6, 2, 1), (7, 2, 2)];
        for (offset, line, column) in cases {
            let pos = source.byte_to_line_col(offset).unwrap();
            assert_eq!(pos.line(), line);
            assert_eq!(pos.column(), column);
        }
    }

    #[test]
    fn test_line_col_to_byte() {
        let source = source_of("hello\nworld");

        // (line, column, expected byte offset)
        let cases = [(1, 1, 0), (1, 5, 4), (2, 1, 6), (2, 5, 10)];
        for (line, col, offset) in cases {
            assert_eq!(source.line_col_to_byte(line, col), Some(offset));
        }
    }

    #[test]
    fn test_extract_span() {
        let source = source_of("hello world");

        assert_eq!(source.extract_span(Span::new(0, 5)), Some("hello"));
    }

    #[test]
    fn test_extract_snippet() {
        let source = source_of("line 1\nline 2\nline 3\nline 4\nline 5");

        // Bytes 14..19 lie on line 3; one context line on each side gives 2..=4.
        let snippet = source.extract_snippet(Span::new(14, 19), 1).unwrap();

        assert_eq!(snippet.start_line, 2);
        assert_eq!(snippet.end_line, 4);
        assert_eq!(snippet.lines.len(), 3);
    }

    #[test]
    fn test_utf8_handling() {
        // "hello " occupies bytes 0..6, so the first CJK character starts at
        // byte 6 while being the 7th character on the line.
        let source = source_of("hello 世界\nworld");

        let pos = source.byte_to_line_col(6).unwrap();
        assert_eq!(pos.line(), 1);
        assert_eq!(pos.column(), 7); // Character position, not byte

        // The reverse conversion maps the character column back to the byte offset.
        assert_eq!(source.line_col_to_byte(1, 7).unwrap(), 6);
    }
}
600}