Skip to main content

sipha_source/
source_file.rs

1//! Source file management and position conversion.
2
3use std::io;
4use std::path::{Path, PathBuf};
5
6use sipha_core::span::Span;
7
8use crate::content::SourceContent;
9use crate::line_map::LineMap;
10use crate::position::Position;
11
12/// Represents a source file with efficient line/column conversion.
13///
14/// `SourceFile` maintains the source content, optional file path, and a cached
15/// line map for efficient position conversions.
16///
17/// # Example
18///
19/// ```rust
20/// use sipha_source::SourceFile;
21/// use sipha_core::span::Span;
22///
23/// let source = SourceFile::new(
24///     "fn main() {\n    println!(\"Hello\");\n}".to_string(),
25///     None,
26/// );
27///
28/// // Convert byte offset to line/column
29/// let pos = source.byte_to_line_col(16).unwrap();
30/// assert_eq!(pos.line(), 2);
31/// assert_eq!(pos.column(), 5);
32///
33/// // Extract source text for a span
34/// let span = Span::new(16, 23);
35/// let text = source.extract_span(span).unwrap();
36/// assert_eq!(text, "println");
37/// ```
38#[derive(Clone, Debug)]
39pub struct SourceFile {
40    /// The source content (UTF-8 or binary).
41    content: SourceContent,
42    /// Optional file path.
43    path: Option<PathBuf>,
44    /// Cached line map for efficient lookups.
45    line_map: LineMap,
46    /// Whether to use UTF-8 character-based column calculation.
47    /// If false, columns are byte-based (for non-UTF-8 content).
48    utf8_columns: bool,
49}
50
51/// A source code snippet with context lines.
52///
53/// Used for displaying error messages with surrounding context.
54#[derive(Clone, Debug, PartialEq, Eq)]
55pub struct SourceSnippet {
56    /// Lines in the snippet, as (line_number, line_content) pairs.
57    /// Line numbers are 1-indexed.
58    pub lines: Vec<(usize, String)>,
59    /// The span to highlight in the snippet.
60    pub highlight_span: Span,
61    /// First line number in the snippet (1-indexed).
62    pub start_line: usize,
63    /// Last line number in the snippet (1-indexed).
64    pub end_line: usize,
65}
66
67impl SourceFile {
68    /// Create a new `SourceFile` from UTF-8 content.
69    ///
70    /// # Arguments
71    ///
72    /// * `content` - The source code content (UTF-8)
73    /// * `path` - Optional file path
74    ///
75    /// # Example
76    ///
77    /// ```rust
78    /// use sipha_source::SourceFile;
79    ///
80    /// let source = SourceFile::new("hello world".to_string(), None);
81    /// ```
82    pub fn new(content: String, path: Option<PathBuf>) -> Self {
83        let line_map = LineMap::new(&content);
84        Self {
85            content: SourceContent::Utf8(content),
86            path,
87            line_map,
88            utf8_columns: true,
89        }
90    }
91
92    /// Create a new `SourceFile` from bytes.
93    ///
94    /// This method supports both UTF-8 and non-UTF-8 content.
95    /// For non-UTF-8 content, column calculations will be byte-based.
96    ///
97    /// # Arguments
98    ///
99    /// * `content` - The source code content as bytes
100    /// * `path` - Optional file path
101    ///
102    /// # Example
103    ///
104    /// ```rust
105    /// use sipha_source::SourceFile;
106    ///
107    /// // UTF-8 content
108    /// let source = SourceFile::from_bytes(b"hello world".to_vec(), None);
109    ///
110    /// // Non-UTF-8 content (e.g., Latin-1)
111    /// let latin1 = vec![0xE9, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64]; // "é world" in Latin-1
112    /// let source = SourceFile::from_bytes(latin1, None);
113    /// ```
114    pub fn from_bytes(content: Vec<u8>, path: Option<PathBuf>) -> Self {
115        let line_map = LineMap::from_bytes(&content);
116        // Check if content is valid UTF-8 and convert if so
117        if let Ok(utf8_str) = std::str::from_utf8(&content) {
118            Self {
119                content: SourceContent::Utf8(utf8_str.to_string()),
120                path,
121                line_map,
122                utf8_columns: true,
123            }
124        } else {
125            Self {
126                content: SourceContent::Bytes(content),
127                path,
128                line_map,
129                utf8_columns: false,
130            }
131        }
132    }
133
134    /// Load a source file from the filesystem as UTF-8.
135    ///
136    /// # Arguments
137    ///
138    /// * `path` - Path to the file to load
139    ///
140    /// # Errors
141    ///
142    /// Returns an `io::Error` if the file cannot be read or is not valid UTF-8.
143    ///
144    /// # Example
145    ///
146    /// ```rust,no_run
147    /// use sipha_source::SourceFile;
148    /// use std::path::Path;
149    ///
150    /// let source = SourceFile::from_path(Path::new("src/main.rs")).unwrap();
151    /// ```
152    pub fn from_path(path: &Path) -> Result<Self, io::Error> {
153        let content = std::fs::read_to_string(path)?;
154        Ok(Self::new(content, Some(path.to_path_buf())))
155    }
156
157    /// Load a source file from the filesystem as raw bytes.
158    ///
159    /// This method supports both UTF-8 and non-UTF-8 files.
160    ///
161    /// # Arguments
162    ///
163    /// * `path` - Path to the file to load
164    ///
165    /// # Errors
166    ///
167    /// Returns an `io::Error` if the file cannot be read.
168    ///
169    /// # Example
170    ///
171    /// ```rust,no_run
172    /// use sipha_source::SourceFile;
173    /// use std::path::Path;
174    ///
175    /// // Load any file, including non-UTF-8
176    /// let source = SourceFile::from_path_bytes(Path::new("data.bin")).unwrap();
177    /// ```
178    pub fn from_path_bytes(path: &Path) -> Result<Self, io::Error> {
179        let content = std::fs::read(path)?;
180        Ok(Self::from_bytes(content, Some(path.to_path_buf())))
181    }
182
183    /// Get the source content as a UTF-8 string, if available.
184    ///
185    /// Returns `None` if the content is not valid UTF-8.
186    pub fn content(&self) -> Option<&str> {
187        self.content.as_str()
188    }
189
190    /// Get the source content as bytes.
191    pub fn content_bytes(&self) -> &[u8] {
192        self.content.as_bytes()
193    }
194
195    /// Check if the content is UTF-8 encoded.
196    pub fn is_utf8(&self) -> bool {
197        self.content.is_utf8() && self.utf8_columns
198    }
199
200    /// Get the file path, if available.
201    pub fn path(&self) -> Option<&Path> {
202        self.path.as_deref()
203    }
204
205    /// Get the total byte length of the source.
206    pub fn byte_len(&self) -> usize {
207        self.content.len()
208    }
209
210    /// Get the total number of lines in the source.
211    pub fn line_count(&self) -> usize {
212        self.line_map.line_count()
213    }
214
215    /// Get a specific line by line number (1-indexed) as a UTF-8 string.
216    ///
217    /// Returns the line content without the trailing newline (if present).
218    /// Returns `None` if the line number is out of bounds or the line is not valid UTF-8.
219    ///
220    /// # Example
221    ///
222    /// ```rust
223    /// use sipha_source::SourceFile;
224    ///
225    /// let source = SourceFile::new("line 1\nline 2\nline 3".to_string(), None);
226    /// assert_eq!(source.line(1), Some("line 1"));
227    /// assert_eq!(source.line(2), Some("line 2"));
228    /// assert_eq!(source.line(4), None);
229    /// ```
230    pub fn line(&self, line_num: usize) -> Option<&str> {
231        if line_num == 0 || line_num > self.line_map.line_count() {
232            return None;
233        }
234
235        let line_idx = line_num - 1; // Convert to 0-indexed
236        let start = self.line_map.line_start(line_idx)?;
237        let end = self.line_map.line_end(line_idx)?;
238
239        self.content.try_str_slice(start, end)
240    }
241
242    /// Get a specific line by line number (1-indexed) as bytes.
243    ///
244    /// Returns the line content without the trailing newline (if present).
245    /// Returns `None` if the line number is out of bounds.
246    ///
247    /// # Example
248    ///
249    /// ```rust
250    /// use sipha_source::SourceFile;
251    ///
252    /// let source = SourceFile::from_bytes(b"line 1\nline 2".to_vec(), None);
253    /// let line = source.line_bytes(1).unwrap();
254    /// assert_eq!(line, b"line 1");
255    /// ```
256    pub fn line_bytes(&self, line_num: usize) -> Option<&[u8]> {
257        if line_num == 0 || line_num > self.line_map.line_count() {
258            return None;
259        }
260
261        let line_idx = line_num - 1; // Convert to 0-indexed
262        let start = self.line_map.line_start(line_idx)?;
263        let end = self.line_map.line_end(line_idx)?;
264
265        Some(&self.content.as_bytes()[start..end])
266    }
267
268    /// Convert a byte offset to a line/column position.
269    ///
270    /// Returns `None` if the byte offset is out of bounds.
271    /// For UTF-8 content, column numbers are 1-indexed character positions.
272    /// For non-UTF-8 content, column numbers are 1-indexed byte positions.
273    ///
274    /// # Example
275    ///
276    /// ```rust
277    /// use sipha_source::SourceFile;
278    ///
279    /// let source = SourceFile::new("hello\nworld".to_string(), None);
280    /// let pos = source.byte_to_line_col(6).unwrap();
281    /// assert_eq!(pos.line(), 2);
282    /// assert_eq!(pos.column(), 1);
283    /// ```
284    pub fn byte_to_line_col(&self, byte_offset: usize) -> Option<Position> {
285        if byte_offset > self.content.len() {
286            return None;
287        }
288
289        // Find the line containing this byte offset
290        let line_idx = self.line_map.byte_to_line(byte_offset)?;
291        let line_start = self.line_map.line_start(line_idx)?;
292
293        // Calculate column
294        let byte_offset_in_line = byte_offset - line_start;
295        let column = if self.utf8_columns {
296            // UTF-8: calculate as character position
297            if let Some(line_content) = self
298                .content
299                .try_str_slice(line_start, self.line_map.line_end(line_idx)?)
300            {
301                line_content
302                    .char_indices()
303                    .take_while(|(idx, _)| *idx < byte_offset_in_line)
304                    .count()
305                    + 1 // 1-indexed
306            } else {
307                // Fallback to byte-based if line is not valid UTF-8
308                byte_offset_in_line + 1
309            }
310        } else {
311            // Non-UTF-8: use byte position
312            byte_offset_in_line + 1 // 1-indexed
313        };
314
315        Some(Position::new(
316            line_idx + 1, // 1-indexed line
317            column,
318            byte_offset,
319        ))
320    }
321
322    /// Convert a line/column position to a byte offset.
323    ///
324    /// Returns `None` if the line or column is out of bounds.
325    /// For UTF-8 content, column numbers are 1-indexed character positions.
326    /// For non-UTF-8 content, column numbers are 1-indexed byte positions.
327    ///
328    /// # Example
329    ///
330    /// ```rust
331    /// use sipha_source::SourceFile;
332    ///
333    /// let source = SourceFile::new("hello\nworld".to_string(), None);
334    /// let byte_offset = source.line_col_to_byte(2, 1).unwrap();
335    /// assert_eq!(byte_offset, 6); // Start of "world"
336    /// ```
337    pub fn line_col_to_byte(&self, line: usize, col: usize) -> Option<usize> {
338        if line == 0 {
339            return None;
340        }
341
342        let line_idx = line - 1; // Convert to 0-indexed
343
344        // Check if line exists (accounting for trailing newline creating an empty line)
345        if line_idx >= self.line_map.line_count() {
346            return None;
347        }
348
349        let line_start = self.line_map.line_start(line_idx)?;
350        let line_end = self.line_map.line_end(line_idx)?;
351
352        // Convert column (1-indexed) to byte offset
353        if col == 0 {
354            return Some(line_start);
355        }
356
357        if self.utf8_columns {
358            // UTF-8: convert character position to byte offset
359            if let Some(line_content) = self.content.try_str_slice(line_start, line_end) {
360                let char_count = line_content.chars().count();
361
362                // If the line is empty and we're asking for column > 1, return None
363                if char_count == 0 && col > 1 {
364                    return None;
365                }
366
367                if col > char_count + 1 {
368                    return Some(line_end);
369                }
370
371                let target_char_idx = col - 1; // Convert to 0-indexed
372
373                if target_char_idx == 0 {
374                    return Some(line_start);
375                }
376
377                if target_char_idx >= char_count {
378                    return Some(line_end);
379                }
380
381                // Find the byte offset for the target character
382                for (char_idx, (byte_idx, _)) in line_content.char_indices().enumerate() {
383                    if char_idx == target_char_idx {
384                        return Some(line_start + byte_idx);
385                    }
386                }
387
388                Some(line_end)
389            } else {
390                // Line is not valid UTF-8, fall back to byte-based
391                let byte_col = col - 1;
392                if byte_col > (line_end - line_start) {
393                    Some(line_end)
394                } else {
395                    Some(line_start + byte_col)
396                }
397            }
398        } else {
399            // Non-UTF-8: use byte position directly
400            let byte_col = col - 1; // Convert to 0-indexed
401            let line_len = line_end - line_start;
402
403            if byte_col > line_len {
404                Some(line_end)
405            } else {
406                Some(line_start + byte_col)
407            }
408        }
409    }
410
411    /// Extract the source text for a span as a UTF-8 string.
412    ///
413    /// Returns `None` if the span is out of bounds or the span is not valid UTF-8.
414    ///
415    /// # Example
416    ///
417    /// ```rust
418    /// use sipha_source::SourceFile;
419    /// use sipha_core::span::Span;
420    ///
421    /// let source = SourceFile::new("hello world".to_string(), None);
422    /// let span = Span::new(0, 5);
423    /// assert_eq!(source.extract_span(span), Some("hello"));
424    /// ```
425    pub fn extract_span(&self, span: Span) -> Option<&str> {
426        if span.end() > self.content.len() {
427            return None;
428        }
429        self.content.try_str_slice(span.start(), span.end())
430    }
431
432    /// Extract the source bytes for a span.
433    ///
434    /// Returns `None` if the span is out of bounds.
435    ///
436    /// # Example
437    ///
438    /// ```rust
439    /// use sipha_source::SourceFile;
440    /// use sipha_core::span::Span;
441    ///
442    /// let source = SourceFile::from_bytes(b"hello world".to_vec(), None);
443    /// let span = Span::new(0, 5);
444    /// assert_eq!(source.extract_span_bytes(span), Some(b"hello".as_slice()));
445    /// ```
446    pub fn extract_span_bytes(&self, span: Span) -> Option<&[u8]> {
447        if span.end() > self.content.len() {
448            return None;
449        }
450        Some(&self.content.as_bytes()[span.start()..span.end()])
451    }
452
453    /// Extract a source snippet with context lines around a span.
454    ///
455    /// Returns `None` if the span is out of bounds.
456    ///
457    /// # Arguments
458    ///
459    /// * `span` - The span to highlight
460    /// * `context_lines` - Number of context lines to include before and after
461    ///
462    /// # Example
463    ///
464    /// ```rust
465    /// use sipha_source::SourceFile;
466    /// use sipha_core::span::Span;
467    ///
468    /// let source = SourceFile::new(
469    ///     "line 1\nline 2\nline 3\nline 4\nline 5".to_string(),
470    ///     None,
471    /// );
472    /// let span = Span::new(14, 19); // "line 3"
473    /// let snippet = source.extract_snippet(span, 1).unwrap();
474    /// assert_eq!(snippet.start_line, 2);
475    /// assert_eq!(snippet.end_line, 4);
476    /// ```
477    pub fn extract_snippet(&self, span: Span, context_lines: usize) -> Option<SourceSnippet> {
478        if span.end() > self.content.len() {
479            return None;
480        }
481
482        // Find the lines containing the span
483        let start_pos = self.byte_to_line_col(span.start())?;
484        let end_pos = self.byte_to_line_col(span.end().saturating_sub(1))?;
485
486        let start_line = start_pos.line();
487        let end_line = end_pos.line();
488
489        // Expand with context
490        let snippet_start_line = start_line.saturating_sub(context_lines);
491        let snippet_end_line = (end_line + context_lines).min(self.line_count());
492
493        // Collect lines
494        let mut lines = Vec::new();
495        for line_num in snippet_start_line..=snippet_end_line {
496            if let Some(line_bytes) = self.line_bytes(line_num) {
497                // Try to convert to string, or use a placeholder for non-UTF-8
498                let line_str = std::str::from_utf8(line_bytes)
499                    .map(|s| s.to_string())
500                    .unwrap_or_else(|_| format!("<non-UTF-8: {} bytes>", line_bytes.len()));
501                lines.push((line_num, line_str));
502            }
503        }
504
505        Some(SourceSnippet {
506            lines,
507            highlight_span: span,
508            start_line: snippet_start_line,
509            end_line: snippet_end_line,
510        })
511    }
512}
513
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a path-less, in-memory source from a string literal.
    fn source_of(text: &str) -> SourceFile {
        SourceFile::new(text.to_string(), None)
    }

    #[test]
    fn test_new() {
        let source = source_of("hello");
        assert_eq!(source.content(), Some("hello"));
        assert_eq!(source.path(), None);
        assert_eq!(source.byte_len(), 5);
    }

    #[test]
    fn test_with_path() {
        let path = PathBuf::from("test.rs");
        let source = SourceFile::new("hello".to_string(), Some(path.clone()));
        assert_eq!(source.path(), Some(path.as_path()));
    }

    #[test]
    fn test_line() {
        let source = source_of("line 1\nline 2\nline 3");
        // Line numbers are 1-indexed; 0 and 4 are out of range.
        let expected = [
            (1, Some("line 1")),
            (2, Some("line 2")),
            (3, Some("line 3")),
            (0, None),
            (4, None),
        ];
        for &(line_num, text) in expected.iter() {
            assert_eq!(source.line(line_num), text);
        }
    }

    #[test]
    fn test_byte_to_line_col() {
        let source = source_of("hello\nworld");
        // (byte offset, expected line, expected column)
        for &(offset, line, column) in [(0, 1, 1), (6, 2, 1), (7, 2, 2)].iter() {
            let pos = source.byte_to_line_col(offset).unwrap();
            assert_eq!(pos.line(), line);
            assert_eq!(pos.column(), column);
        }
    }

    #[test]
    fn test_line_col_to_byte() {
        let source = source_of("hello\nworld");
        // ((line, column), expected byte offset)
        let cases = [((1, 1), 0), ((1, 5), 4), ((2, 1), 6), ((2, 5), 10)];
        for &((line, col), offset) in cases.iter() {
            assert_eq!(source.line_col_to_byte(line, col), Some(offset));
        }
    }

    #[test]
    fn test_extract_span() {
        let source = source_of("hello world");
        assert_eq!(source.extract_span(Span::new(0, 5)), Some("hello"));
    }

    #[test]
    fn test_extract_snippet() {
        let source = source_of("line 1\nline 2\nline 3\nline 4\nline 5");
        // Bytes 14..19 lie entirely on line 3.
        let snippet = source.extract_snippet(Span::new(14, 19), 1).unwrap();

        assert_eq!(snippet.start_line, 2);
        assert_eq!(snippet.end_line, 4);
        assert_eq!(snippet.lines.len(), 3);
    }

    #[test]
    fn test_utf8_handling() {
        // "hello " occupies bytes 0..6, so "世" starts at byte 6; each of the
        // two CJK characters is 3 bytes long, making the first line 12 bytes.
        let source = source_of("hello 世界\nworld");

        let pos = source.byte_to_line_col(6).unwrap();
        assert_eq!(pos.line(), 1);
        assert_eq!(pos.column(), 7); // Character position, not byte

        // And back again: column 7 on line 1 is byte 6.
        assert_eq!(source.line_col_to_byte(1, 7).unwrap(), 6);
    }
}