Skip to main content

tldr_core/
encoding.rs

//! File encoding handling (Phase 10)
//!
//! This module provides robust handling of file encodings during analysis.
//!
//! # Mitigations
//!
//! - A34: Silent data corruption on non-UTF8 files
//!   - Detects and handles UTF-8 BOM
//!   - Falls back to lossy decoding with warning
//!   - Detects and skips binary files
//!
//! # Example
//!
//! ```rust,ignore
//! use tldr_core::encoding::{read_source_file, FileReadResult};
//! use std::path::Path;
//!
//! match read_source_file(Path::new("example.py")) {
//!     Ok(FileReadResult::Ok(content)) => {
//!         // Process UTF-8 content
//!     }
//!     Ok(FileReadResult::Lossy { content, warning }) => {
//!         // Process with warning
//!         eprintln!("Warning: {}", warning);
//!     }
//!     Ok(FileReadResult::Binary) => {
//!         // Skip binary file
//!     }
//!     Err(e) => {
//!         // Handle IO error
//!     }
//! }
//! ```
34
35use std::path::Path;
36
37use serde::{Deserialize, Serialize};
38
39use crate::TldrError;
40
// =============================================================================
// File Read Result
// =============================================================================
44
/// Outcome of reading a source file with encoding detection.
#[derive(Debug, Clone)]
pub enum FileReadResult {
    /// The bytes decoded as valid UTF-8 (a leading BOM, if any, was stripped).
    Ok(String),
    /// The bytes were decoded lossily instead of strictly.
    Lossy {
        /// Decoded text (with replacement characters).
        content: String,
        /// Human-readable description of the encoding problem.
        warning: String,
    },
    /// The file looks binary (it contains null bytes).
    Binary,
}

impl FileReadResult {
    /// Borrow the decoded text, or `None` when the file was binary.
    pub fn content(&self) -> Option<&str> {
        match self {
            Self::Ok(text) | Self::Lossy { content: text, .. } => Some(text.as_str()),
            Self::Binary => None,
        }
    }

    /// Report whether this result carries a warning (only `Lossy` does).
    pub fn has_warning(&self) -> bool {
        self.warning().is_some()
    }

    /// Borrow the warning message, if this result has one.
    pub fn warning(&self) -> Option<&str> {
        if let Self::Lossy { warning, .. } = self {
            Some(warning.as_str())
        } else {
            None
        }
    }

    /// Report whether the file was classified as binary.
    pub fn is_binary(&self) -> bool {
        match self {
            Self::Binary => true,
            Self::Ok(_) | Self::Lossy { .. } => false,
        }
    }
}
89
// =============================================================================
// Encoding Issues Tracking
// =============================================================================
93
94/// Record of encoding issues encountered during analysis.
95#[derive(Debug, Clone, Default, Serialize, Deserialize)]
96pub struct EncodingIssues {
97    /// Files that required lossy UTF-8 decoding
98    pub lossy_files: Vec<EncodingIssue>,
99    /// Files that were skipped as binary
100    pub binary_files: Vec<String>,
101    /// Files with UTF-8 BOM (stripped)
102    pub bom_files: Vec<String>,
103}
104
105impl EncodingIssues {
106    /// Create a new issues tracker.
107    pub fn new() -> Self {
108        Self::default()
109    }
110
111    /// Record a lossy decode.
112    pub fn add_lossy(&mut self, file: impl Into<String>, issue: impl Into<String>) {
113        self.lossy_files.push(EncodingIssue {
114            file: file.into(),
115            issue: issue.into(),
116        });
117    }
118
119    /// Record a binary file skip.
120    pub fn add_binary(&mut self, file: impl Into<String>) {
121        self.binary_files.push(file.into());
122    }
123
124    /// Record a BOM file.
125    pub fn add_bom(&mut self, file: impl Into<String>) {
126        self.bom_files.push(file.into());
127    }
128
129    /// Check if any issues were recorded.
130    pub fn has_issues(&self) -> bool {
131        !self.lossy_files.is_empty() || !self.binary_files.is_empty()
132    }
133
134    /// Get total number of issues.
135    pub fn total(&self) -> usize {
136        self.lossy_files.len() + self.binary_files.len()
137    }
138}
139
140/// A single encoding issue record.
141#[derive(Debug, Clone, Serialize, Deserialize)]
142pub struct EncodingIssue {
143    /// File path
144    pub file: String,
145    /// Issue description
146    pub issue: String,
147}
148
// =============================================================================
// File Reading Functions
// =============================================================================
152
/// Byte-order mark that may prefix a UTF-8 file.
const UTF8_BOM: &[u8] = b"\xEF\xBB\xBF";

/// Byte-order mark of a little-endian UTF-16 file.
const UTF16_LE_BOM: &[u8] = b"\xFF\xFE";

/// Byte-order mark of a big-endian UTF-16 file.
const UTF16_BE_BOM: &[u8] = b"\xFE\xFF";
161
162/// Read a source file with encoding detection.
163///
164/// This function:
165/// 1. Reads the file as bytes
166/// 2. Checks for and strips UTF-8 BOM
167/// 3. Detects UTF-16 BOM and reports unsupported
168/// 4. Attempts UTF-8 decoding
169/// 5. Falls back to lossy decoding if needed
170/// 6. Detects binary files (contains null bytes)
171///
172/// # Arguments
173///
174/// * `path` - Path to the file to read
175///
176/// # Returns
177///
178/// * `Ok(FileReadResult::Ok(content))` - Valid UTF-8 content
179/// * `Ok(FileReadResult::Lossy { content, warning })` - Lossy decoded content with warning
180/// * `Ok(FileReadResult::Binary)` - File is binary
181/// * `Err(TldrError)` - IO error
182pub fn read_source_file(path: &Path) -> Result<FileReadResult, TldrError> {
183    let bytes = std::fs::read(path)?;
184
185    // Check for UTF-16 BOM (unsupported, would need conversion)
186    if bytes.starts_with(UTF16_LE_BOM) || bytes.starts_with(UTF16_BE_BOM) {
187        return Ok(FileReadResult::Lossy {
188            content: String::new(),
189            warning: format!(
190                "File {} appears to be UTF-16 encoded (unsupported), skipping",
191                path.display()
192            ),
193        });
194    }
195
196    // Check for and strip UTF-8 BOM
197    let (bytes, had_bom) = if bytes.starts_with(UTF8_BOM) {
198        (&bytes[3..], true)
199    } else {
200        (&bytes[..], false)
201    };
202
203    // Check for binary file (null bytes in first 8KB)
204    let check_len = bytes.len().min(8192);
205    if bytes[..check_len].contains(&0) {
206        return Ok(FileReadResult::Binary);
207    }
208
209    // Try UTF-8 decoding
210    match String::from_utf8(bytes.to_vec()) {
211        Ok(content) => {
212            if had_bom {
213                // Valid UTF-8 with BOM stripped (not a warning, just note)
214                Ok(FileReadResult::Ok(content))
215            } else {
216                Ok(FileReadResult::Ok(content))
217            }
218        }
219        Err(_) => {
220            // Fall back to lossy decoding
221            let content = String::from_utf8_lossy(bytes).into_owned();
222            let replacement_count = content.matches('\u{FFFD}').count();
223            Ok(FileReadResult::Lossy {
224                content,
225                warning: format!(
226                    "File {} is not valid UTF-8, used lossy decoding ({} replacement characters)",
227                    path.display(),
228                    replacement_count
229                ),
230            })
231        }
232    }
233}
234
235/// Read a source file, returning the content or skipping on error.
236///
237/// This is a convenience function that:
238/// - Returns `Some(content)` for valid files
239/// - Returns `None` for binary files or errors
240/// - Optionally records issues in an EncodingIssues tracker
241///
242/// # Arguments
243///
244/// * `path` - Path to the file
245/// * `issues` - Optional issues tracker
246///
247/// # Returns
248///
249/// * `Some(String)` - File content (may be lossy decoded)
250/// * `None` - File was skipped (binary, error, etc.)
251pub fn read_source_file_or_skip(
252    path: &Path,
253    issues: Option<&mut EncodingIssues>,
254) -> Option<String> {
255    match read_source_file(path) {
256        Ok(FileReadResult::Ok(content)) => Some(content),
257        Ok(FileReadResult::Lossy { content, warning }) => {
258            if let Some(issues) = issues {
259                issues.add_lossy(path.display().to_string(), &warning);
260            }
261            Some(content)
262        }
263        Ok(FileReadResult::Binary) => {
264            if let Some(issues) = issues {
265                issues.add_binary(path.display().to_string());
266            }
267            None
268        }
269        Err(_) => None,
270    }
271}
272
273/// Check if a file appears to be binary.
274///
275/// Reads the first 8KB of the file and checks for null bytes.
276pub fn is_binary_file(path: &Path) -> Result<bool, TldrError> {
277    let file = std::fs::File::open(path)?;
278    let mut reader = std::io::BufReader::new(file);
279
280    let mut buffer = [0u8; 8192];
281    use std::io::Read;
282    let bytes_read = reader.read(&mut buffer)?;
283
284    Ok(buffer[..bytes_read].contains(&0))
285}
286
// =============================================================================
// Tests
// =============================================================================
290
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Create a temporary file whose contents are exactly `bytes`.
    fn temp_file_with(bytes: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(bytes).unwrap();
        file
    }

    #[test]
    fn test_read_utf8_file() {
        let fixture = temp_file_with(b"Hello, world!");

        let outcome = read_source_file(fixture.path()).unwrap();
        assert!(matches!(outcome, FileReadResult::Ok(_)));
        assert_eq!(outcome.content(), Some("Hello, world!"));
    }

    #[test]
    fn test_read_utf8_bom_file() {
        // UTF-8 BOM followed by content
        let fixture = temp_file_with(b"\xEF\xBB\xBFHello, BOM!");

        let outcome = read_source_file(fixture.path()).unwrap();
        assert!(matches!(outcome, FileReadResult::Ok(_)));
        assert_eq!(outcome.content(), Some("Hello, BOM!"));
    }

    #[test]
    fn test_read_binary_file() {
        // Binary content with null bytes
        let fixture = temp_file_with(&[0x00, 0x01, 0x02, 0x00]);

        let outcome = read_source_file(fixture.path()).unwrap();
        assert!(matches!(outcome, FileReadResult::Binary));
        assert!(outcome.is_binary());
        assert!(outcome.content().is_none());
    }

    #[test]
    fn test_read_invalid_utf8() {
        // Invalid UTF-8 sequence (no null bytes)
        let fixture = temp_file_with(&[0x80, 0x81, 0x82, 0x61, 0x62, 0x63]);

        let outcome = read_source_file(fixture.path()).unwrap();
        assert!(matches!(outcome, FileReadResult::Lossy { .. }));
        assert!(outcome.has_warning());
    }

    #[test]
    fn test_encoding_issues_tracker() {
        let mut tracker = EncodingIssues::new();
        assert!(!tracker.has_issues());

        tracker.add_lossy("file1.py", "Invalid UTF-8");
        tracker.add_binary("file2.bin");
        tracker.add_bom("file3.py");

        assert!(tracker.has_issues());
        assert_eq!(tracker.total(), 2); // lossy + binary
        assert_eq!(tracker.lossy_files.len(), 1);
        assert_eq!(tracker.binary_files.len(), 1);
        assert_eq!(tracker.bom_files.len(), 1);
    }

    #[test]
    fn test_read_source_file_or_skip_valid() {
        let fixture = temp_file_with(b"def foo(): pass");

        let mut tracker = EncodingIssues::new();
        let content = read_source_file_or_skip(fixture.path(), Some(&mut tracker));

        assert!(content.is_some());
        assert!(!tracker.has_issues());
    }

    #[test]
    fn test_read_source_file_or_skip_binary() {
        let fixture = temp_file_with(&[0x00, 0x01, 0x02]);

        let mut tracker = EncodingIssues::new();
        let content = read_source_file_or_skip(fixture.path(), Some(&mut tracker));

        assert!(content.is_none());
        assert_eq!(tracker.binary_files.len(), 1);
    }

    #[test]
    fn test_is_binary_file() {
        let binary_fixture = temp_file_with(&[0x00, 0x01]);
        assert!(is_binary_file(binary_fixture.path()).unwrap());

        let text_fixture = temp_file_with(b"text content");
        assert!(!is_binary_file(text_fixture.path()).unwrap());
    }
}