use std::path::Path;
use serde::{Deserialize, Serialize};
use crate::TldrError;
/// Outcome of reading a source file, as produced by `read_source_file`.
#[derive(Debug, Clone)]
pub enum FileReadResult {
    /// The file decoded cleanly; holds the decoded text.
    Ok(String),
    /// The file was read, but with a caveat described by `warning`.
    Lossy {
        /// Text that accompanies the warning.
        content: String,
        /// Human-readable description of what went wrong.
        warning: String,
    },
    /// The file was classified as binary; no text is available.
    Binary,
}

impl FileReadResult {
    /// Returns the text carried by this result, or `None` for [`FileReadResult::Binary`].
    pub fn content(&self) -> Option<&str> {
        match self {
            Self::Ok(text) | Self::Lossy { content: text, .. } => Some(text.as_str()),
            Self::Binary => None,
        }
    }

    /// Returns `true` only for the [`FileReadResult::Lossy`] variant.
    pub fn has_warning(&self) -> bool {
        match self {
            Self::Lossy { .. } => true,
            Self::Ok(_) | Self::Binary => false,
        }
    }

    /// Returns the warning text of a [`FileReadResult::Lossy`] result, `None` otherwise.
    pub fn warning(&self) -> Option<&str> {
        if let Self::Lossy { warning, .. } = self {
            Some(warning.as_str())
        } else {
            None
        }
    }

    /// Returns `true` only for the [`FileReadResult::Binary`] variant.
    pub fn is_binary(&self) -> bool {
        match self {
            Self::Binary => true,
            Self::Ok(_) | Self::Lossy { .. } => false,
        }
    }
}
/// Collects encoding-related findings recorded while reading files.
///
/// Only `lossy_files` and `binary_files` are counted by `has_issues` and `total`;
/// `bom_files` is recorded but does not contribute to either.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct EncodingIssues {
/// Entries added through `add_lossy`: a file plus a description of its issue.
pub lossy_files: Vec<EncodingIssue>,
/// Files added through `add_binary`.
pub binary_files: Vec<String>,
/// Files added through `add_bom`.
pub bom_files: Vec<String>,
}
impl EncodingIssues {
    /// Creates a tracker with all lists empty.
    pub fn new() -> Self {
        Default::default()
    }

    /// Records `file` as lossily decoded, together with a description of the `issue`.
    pub fn add_lossy(&mut self, file: impl Into<String>, issue: impl Into<String>) {
        let entry = EncodingIssue {
            file: file.into(),
            issue: issue.into(),
        };
        self.lossy_files.push(entry);
    }

    /// Records `file` as binary.
    pub fn add_binary(&mut self, file: impl Into<String>) {
        let name: String = file.into();
        self.binary_files.push(name);
    }

    /// Records `file` as carrying a byte-order mark.
    pub fn add_bom(&mut self, file: impl Into<String>) {
        let name: String = file.into();
        self.bom_files.push(name);
    }

    /// Returns `true` if at least one lossy or binary file was recorded.
    ///
    /// BOM entries are not considered.
    pub fn has_issues(&self) -> bool {
        !(self.lossy_files.is_empty() && self.binary_files.is_empty())
    }

    /// Returns the number of lossy plus binary files recorded.
    ///
    /// BOM entries are not included in the count.
    pub fn total(&self) -> usize {
        let lossy = self.lossy_files.len();
        let binary = self.binary_files.len();
        lossy + binary
    }
}
/// One entry in `EncodingIssues::lossy_files`: a file and what was wrong with it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EncodingIssue {
/// The file the issue was recorded for, as supplied to `EncodingIssues::add_lossy`.
pub file: String,
/// Description of the issue, as supplied to `EncodingIssues::add_lossy`.
pub issue: String,
}
/// UTF-8 byte-order mark; `read_source_file` strips it from the start of a file.
const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
/// UTF-16 little-endian byte-order mark; files starting with it are not decoded.
const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
/// UTF-16 big-endian byte-order mark; files starting with it are not decoded.
const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
/// Reads `path` and classifies its contents as text, lossily decoded text, or binary.
///
/// The checks run in this order:
///
/// 1. A file starting with a UTF-16 byte-order mark is not decoded. It yields
///    [`FileReadResult::Lossy`] with empty content and a warning.
/// 2. A leading UTF-8 byte-order mark is removed.
/// 3. If any of the first 8192 remaining bytes is NUL, the result is
///    [`FileReadResult::Binary`].
/// 4. Valid UTF-8 yields [`FileReadResult::Ok`]. Anything else is decoded with
///    U+FFFD replacement characters and yields [`FileReadResult::Lossy`], with a
///    warning that reports how many U+FFFD characters the decoded text contains.
///
/// # Errors
///
/// Returns an error if the file cannot be read.
pub fn read_source_file(path: &Path) -> Result<FileReadResult, TldrError> {
    /// Number of leading bytes inspected for NUL when sniffing binary content.
    const BINARY_SNIFF_LEN: usize = 8192;

    let mut bytes = std::fs::read(path)?;

    if bytes.starts_with(UTF16_LE_BOM) || bytes.starts_with(UTF16_BE_BOM) {
        return Ok(FileReadResult::Lossy {
            content: String::new(),
            warning: format!(
                "File {} appears to be UTF-16 encoded (unsupported), skipping",
                path.display()
            ),
        });
    }

    // Strip the BOM in place so the owned buffer can be decoded without copying it.
    if bytes.starts_with(UTF8_BOM) {
        bytes.drain(..UTF8_BOM.len());
    }

    if bytes.iter().take(BINARY_SNIFF_LEN).any(|&byte| byte == 0) {
        return Ok(FileReadResult::Binary);
    }

    match String::from_utf8(bytes) {
        Ok(content) => Ok(FileReadResult::Ok(content)),
        Err(err) => {
            // The error hands back the (BOM-stripped) bytes, so no second read or copy is needed.
            let content = String::from_utf8_lossy(err.as_bytes()).into_owned();
            let replacement_count = content.matches('\u{FFFD}').count();
            Ok(FileReadResult::Lossy {
                content,
                warning: format!(
                    "File {} is not valid UTF-8, used lossy decoding ({} replacement characters)",
                    path.display(),
                    replacement_count
                ),
            })
        }
    }
}
/// Best-effort wrapper around `read_source_file` that returns text or `None`.
///
/// * Cleanly decoded files return their text.
/// * Lossy results return their content; the warning is recorded in `issues`
///   (when provided) under the displayed path.
/// * Binary files return `None`; the displayed path is recorded in `issues`
///   (when provided).
/// * Read errors are discarded and return `None` without recording anything.
pub fn read_source_file_or_skip(
    path: &Path,
    issues: Option<&mut EncodingIssues>,
) -> Option<String> {
    // Any read failure means "skip this file".
    let outcome = read_source_file(path).ok()?;

    match outcome {
        FileReadResult::Ok(text) => Some(text),
        FileReadResult::Lossy { content, warning } => {
            if let Some(tracker) = issues {
                tracker.add_lossy(path.display().to_string(), warning);
            }
            Some(content)
        }
        FileReadResult::Binary => {
            if let Some(tracker) = issues {
                tracker.add_binary(path.display().to_string());
            }
            None
        }
    }
}
/// Reports whether the file at `path` looks binary.
///
/// A file is considered binary when a NUL byte occurs within its first 8192
/// bytes. Only that prefix is read, so large files are cheap to check.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or read.
pub fn is_binary_file(path: &Path) -> Result<bool, TldrError> {
    use std::io::Read;

    /// Number of leading bytes inspected for NUL.
    const SNIFF_LEN: usize = 8192;

    let file = std::fs::File::open(path)?;
    let mut head = Vec::with_capacity(SNIFF_LEN);
    // A single `read` call may return fewer bytes than are available; `read_to_end`
    // on a length-limited reader keeps reading (and retries on interruption) until
    // the whole window or the end of the file is reached.
    // Widening usize -> u64 cannot lose information.
    file.take(SNIFF_LEN as u64).read_to_end(&mut head)?;
    Ok(head.contains(&0))
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary file whose contents are exactly `bytes`.
    fn temp_file_with(bytes: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(bytes).unwrap();
        file
    }

    /// Plain UTF-8 text is returned unchanged as `Ok`.
    #[test]
    fn test_read_utf8_file() {
        let file = temp_file_with(b"Hello, world!");

        let result = read_source_file(file.path()).unwrap();

        assert!(matches!(result, FileReadResult::Ok(_)));
        assert_eq!(result.content(), Some("Hello, world!"));
    }

    /// A leading UTF-8 BOM is removed from the returned text.
    #[test]
    fn test_read_utf8_bom_file() {
        let bytes = [&[0xEF, 0xBB, 0xBF][..], &b"Hello, BOM!"[..]].concat();
        let file = temp_file_with(&bytes);

        let result = read_source_file(file.path()).unwrap();

        assert!(matches!(result, FileReadResult::Ok(_)));
        assert_eq!(result.content(), Some("Hello, BOM!"));
    }

    /// Content containing NUL bytes is classified as binary.
    #[test]
    fn test_read_binary_file() {
        let file = temp_file_with(&[0x00, 0x01, 0x02, 0x00]);

        let result = read_source_file(file.path()).unwrap();

        assert!(matches!(result, FileReadResult::Binary));
        assert!(result.is_binary());
        assert!(result.content().is_none());
    }

    /// Invalid UTF-8 falls back to lossy decoding with a warning.
    #[test]
    fn test_read_invalid_utf8() {
        let file = temp_file_with(&[0x80, 0x81, 0x82, 0x61, 0x62, 0x63]);

        let result = read_source_file(file.path()).unwrap();

        assert!(matches!(result, FileReadResult::Lossy { .. }));
        assert!(result.has_warning());
    }

    /// BOM entries are stored but excluded from `has_issues` and `total`.
    #[test]
    fn test_encoding_issues_tracker() {
        let mut issues = EncodingIssues::new();
        assert!(!issues.has_issues());

        issues.add_lossy("file1.py", "Invalid UTF-8");
        issues.add_binary("file2.bin");
        issues.add_bom("file3.py");

        assert!(issues.has_issues());
        assert_eq!(issues.total(), 2);
        assert_eq!(issues.lossy_files.len(), 1);
        assert_eq!(issues.binary_files.len(), 1);
        assert_eq!(issues.bom_files.len(), 1);
    }

    /// A valid file yields content and records no issues.
    #[test]
    fn test_read_source_file_or_skip_valid() {
        let file = temp_file_with(b"def foo(): pass");
        let mut issues = EncodingIssues::new();

        let content = read_source_file_or_skip(file.path(), Some(&mut issues));

        assert!(content.is_some());
        assert!(!issues.has_issues());
    }

    /// A binary file is skipped and recorded in the tracker.
    #[test]
    fn test_read_source_file_or_skip_binary() {
        let file = temp_file_with(&[0x00, 0x01, 0x02]);
        let mut issues = EncodingIssues::new();

        let content = read_source_file_or_skip(file.path(), Some(&mut issues));

        assert!(content.is_none());
        assert_eq!(issues.binary_files.len(), 1);
    }

    /// `is_binary_file` distinguishes NUL-containing content from text.
    #[test]
    fn test_is_binary_file() {
        let binary_file = temp_file_with(&[0x00, 0x01]);
        assert!(is_binary_file(binary_file.path()).unwrap());

        let text_file = temp_file_with(b"text content");
        assert!(!is_binary_file(text_file.path()).unwrap());
    }
}