1use std::path::Path;
36
37use serde::{Deserialize, Serialize};
38
39use crate::TldrError;
40
/// Outcome of reading a source file from disk.
///
/// Distinguishes text that decoded cleanly, text that came with an encoding
/// problem attached, and files that were classified as binary.
#[derive(Clone, Debug)]
pub enum FileReadResult {
    /// The file decoded as valid UTF-8; holds the decoded text.
    Ok(String),
    /// The file had an encoding problem; carries whatever text is available
    /// together with a description of what went wrong.
    Lossy {
        /// Text recovered from the file. May be empty when nothing was decoded.
        content: String,
        /// Human-readable description of the encoding problem.
        warning: String,
    },
    /// The file was classified as binary; no text is available.
    Binary,
}
60
61impl FileReadResult {
62 pub fn content(&self) -> Option<&str> {
64 match self {
65 FileReadResult::Ok(s) => Some(s),
66 FileReadResult::Lossy { content, .. } => Some(content),
67 FileReadResult::Binary => None,
68 }
69 }
70
71 pub fn has_warning(&self) -> bool {
73 matches!(self, FileReadResult::Lossy { .. })
74 }
75
76 pub fn warning(&self) -> Option<&str> {
78 match self {
79 FileReadResult::Lossy { warning, .. } => Some(warning),
80 _ => None,
81 }
82 }
83
84 pub fn is_binary(&self) -> bool {
86 matches!(self, FileReadResult::Binary)
87 }
88}
89
90#[derive(Debug, Clone, Default, Serialize, Deserialize)]
96pub struct EncodingIssues {
97 pub lossy_files: Vec<EncodingIssue>,
99 pub binary_files: Vec<String>,
101 pub bom_files: Vec<String>,
103}
104
105impl EncodingIssues {
106 pub fn new() -> Self {
108 Self::default()
109 }
110
111 pub fn add_lossy(&mut self, file: impl Into<String>, issue: impl Into<String>) {
113 self.lossy_files.push(EncodingIssue {
114 file: file.into(),
115 issue: issue.into(),
116 });
117 }
118
119 pub fn add_binary(&mut self, file: impl Into<String>) {
121 self.binary_files.push(file.into());
122 }
123
124 pub fn add_bom(&mut self, file: impl Into<String>) {
126 self.bom_files.push(file.into());
127 }
128
129 pub fn has_issues(&self) -> bool {
131 !self.lossy_files.is_empty() || !self.binary_files.is_empty()
132 }
133
134 pub fn total(&self) -> usize {
136 self.lossy_files.len() + self.binary_files.len()
137 }
138}
139
140#[derive(Debug, Clone, Serialize, Deserialize)]
142pub struct EncodingIssue {
143 pub file: String,
145 pub issue: String,
147}
148
/// Byte-order mark that can prefix a UTF-8 file (`EF BB BF`).
const UTF8_BOM: &[u8] = b"\xEF\xBB\xBF";

/// Byte-order mark of a little-endian UTF-16 file (`FF FE`).
const UTF16_LE_BOM: &[u8] = b"\xFF\xFE";

/// Byte-order mark of a big-endian UTF-16 file (`FE FF`).
const UTF16_BE_BOM: &[u8] = b"\xFE\xFF";
161
162pub fn read_source_file(path: &Path) -> Result<FileReadResult, TldrError> {
183 let bytes = std::fs::read(path)?;
184
185 if bytes.starts_with(UTF16_LE_BOM) || bytes.starts_with(UTF16_BE_BOM) {
187 return Ok(FileReadResult::Lossy {
188 content: String::new(),
189 warning: format!(
190 "File {} appears to be UTF-16 encoded (unsupported), skipping",
191 path.display()
192 ),
193 });
194 }
195
196 let (bytes, had_bom) = if bytes.starts_with(UTF8_BOM) {
198 (&bytes[3..], true)
199 } else {
200 (&bytes[..], false)
201 };
202
203 let check_len = bytes.len().min(8192);
205 if bytes[..check_len].contains(&0) {
206 return Ok(FileReadResult::Binary);
207 }
208
209 match String::from_utf8(bytes.to_vec()) {
211 Ok(content) => {
212 if had_bom {
213 Ok(FileReadResult::Ok(content))
215 } else {
216 Ok(FileReadResult::Ok(content))
217 }
218 }
219 Err(_) => {
220 let content = String::from_utf8_lossy(bytes).into_owned();
222 let replacement_count = content.matches('\u{FFFD}').count();
223 Ok(FileReadResult::Lossy {
224 content,
225 warning: format!(
226 "File {} is not valid UTF-8, used lossy decoding ({} replacement characters)",
227 path.display(),
228 replacement_count
229 ),
230 })
231 }
232 }
233}
234
235pub fn read_source_file_or_skip(
252 path: &Path,
253 issues: Option<&mut EncodingIssues>,
254) -> Option<String> {
255 match read_source_file(path) {
256 Ok(FileReadResult::Ok(content)) => Some(content),
257 Ok(FileReadResult::Lossy { content, warning }) => {
258 if let Some(issues) = issues {
259 issues.add_lossy(path.display().to_string(), &warning);
260 }
261 Some(content)
262 }
263 Ok(FileReadResult::Binary) => {
264 if let Some(issues) = issues {
265 issues.add_binary(path.display().to_string());
266 }
267 None
268 }
269 Err(_) => None,
270 }
271}
272
273pub fn is_binary_file(path: &Path) -> Result<bool, TldrError> {
277 let file = std::fs::File::open(path)?;
278 let mut reader = std::io::BufReader::new(file);
279
280 let mut buffer = [0u8; 8192];
281 use std::io::Read;
282 let bytes_read = reader.read(&mut buffer)?;
283
284 Ok(buffer[..bytes_read].contains(&0))
285}
286
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary file holding exactly `bytes`.
    ///
    /// The returned handle must stay alive for as long as the path is used.
    fn temp_file_with(bytes: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(bytes).unwrap();
        file
    }

    #[test]
    fn test_read_utf8_file() {
        let file = temp_file_with(b"Hello, world!");

        let result = read_source_file(file.path()).unwrap();
        assert!(matches!(result, FileReadResult::Ok(_)));
        assert_eq!(result.content(), Some("Hello, world!"));
    }

    #[test]
    fn test_read_utf8_bom_file() {
        // UTF-8 BOM followed by ordinary text; the BOM must not reach the content.
        let file = temp_file_with(b"\xEF\xBB\xBFHello, BOM!");

        let result = read_source_file(file.path()).unwrap();
        assert!(matches!(result, FileReadResult::Ok(_)));
        assert_eq!(result.content(), Some("Hello, BOM!"));
    }

    #[test]
    fn test_read_binary_file() {
        // NUL bytes mark the file as binary.
        let file = temp_file_with(&[0x00, 0x01, 0x02, 0x00]);

        let result = read_source_file(file.path()).unwrap();
        assert!(matches!(result, FileReadResult::Binary));
        assert!(result.is_binary());
        assert!(result.content().is_none());
    }

    #[test]
    fn test_read_invalid_utf8() {
        // Three stray continuation bytes followed by "abc".
        let file = temp_file_with(&[0x80, 0x81, 0x82, 0x61, 0x62, 0x63]);

        let result = read_source_file(file.path()).unwrap();
        assert!(matches!(result, FileReadResult::Lossy { .. }));
        assert!(result.has_warning());
    }

    #[test]
    fn test_encoding_issues_tracker() {
        let mut issues = EncodingIssues::new();
        assert!(!issues.has_issues());

        issues.add_lossy("file1.py", "Invalid UTF-8");
        issues.add_binary("file2.bin");
        issues.add_bom("file3.py");

        assert!(issues.has_issues());
        // BOM entries are tracked but not counted.
        assert_eq!(issues.total(), 2);
        assert_eq!(issues.lossy_files.len(), 1);
        assert_eq!(issues.binary_files.len(), 1);
        assert_eq!(issues.bom_files.len(), 1);
    }

    #[test]
    fn test_read_source_file_or_skip_valid() {
        let file = temp_file_with(b"def foo(): pass");

        let mut issues = EncodingIssues::new();
        let content = read_source_file_or_skip(file.path(), Some(&mut issues));

        assert!(content.is_some());
        assert!(!issues.has_issues());
    }

    #[test]
    fn test_read_source_file_or_skip_binary() {
        let file = temp_file_with(&[0x00, 0x01, 0x02]);

        let mut issues = EncodingIssues::new();
        let content = read_source_file_or_skip(file.path(), Some(&mut issues));

        assert!(content.is_none());
        assert_eq!(issues.binary_files.len(), 1);
    }

    #[test]
    fn test_is_binary_file() {
        let binary_file = temp_file_with(&[0x00, 0x01]);
        assert!(is_binary_file(binary_file.path()).unwrap());

        let text_file = temp_file_with(b"text content");
        assert!(!is_binary_file(text_file.path()).unwrap());
    }
}