1use crate::error::{Error, Result};
2use once_cell::sync::Lazy;
3use std::collections::HashSet;
4use std::fs::File;
5use std::io::{BufReader, Read};
6use std::path::{Path, PathBuf};
7
8static BINARY_EXTENSIONS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
9 [
10 "exe", "dll", "so", "dylib", "a", "o", "obj", "png", "jpg", "jpeg", "gif", "bmp", "ico",
11 "webp", "mp3", "mp4", "avi", "mkv", "mov", "wav", "flac", "pdf", "doc", "docx", "xls",
12 "xlsx", "ppt", "pptx", "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "wasm", "pyc",
13 "class",
14 ]
15 .into_iter()
16 .collect()
17});
18
19#[derive(Debug, Clone)]
21pub struct FileData {
22 pub absolute_path: PathBuf,
24
25 pub relative_path: String,
27
28 pub content: FileContent,
30
31 pub token_count: usize,
33}
34
/// Content of a file: either its full text or, for binary files, only a size.
///
/// `PartialEq` and `Eq` are derived so that values can be compared directly,
/// for example with `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FileContent {
    /// The text of the file.
    Text(String),

    /// Placeholder for a binary file; none of its bytes are stored.
    Binary {
        /// Size of the file, as returned by `FileData::size_bytes`.
        size: u64,
    },
}
47
48impl FileData {
49 #[must_use]
51 pub fn new_text(
52 absolute_path: PathBuf,
53 relative_path: String,
54 content: String,
55 token_count: usize,
56 ) -> Self {
57 Self {
58 absolute_path,
59 relative_path,
60 content: FileContent::Text(content),
61 token_count,
62 }
63 }
64
65 #[must_use]
67 pub fn new_binary(absolute_path: PathBuf, relative_path: String, size: u64) -> Self {
68 Self {
69 absolute_path,
70 relative_path,
71 content: FileContent::Binary { size },
72 token_count: 0,
73 }
74 }
75
76 #[must_use]
78 pub const fn is_text(&self) -> bool {
79 matches!(self.content, FileContent::Text(_))
80 }
81
82 #[must_use]
84 pub const fn is_binary(&self) -> bool {
85 matches!(self.content, FileContent::Binary { .. })
86 }
87
88 #[must_use]
90 pub fn content_str(&self) -> Option<&str> {
91 match &self.content {
92 FileContent::Text(s) => Some(s),
93 FileContent::Binary { .. } => None,
94 }
95 }
96
97 #[must_use]
99 pub fn size_bytes(&self) -> u64 {
100 match &self.content {
101 FileContent::Text(s) => s.len() as u64,
102 FileContent::Binary { size } => *size,
103 }
104 }
105
106 #[must_use]
108 pub fn line_count(&self) -> Option<usize> {
109 self.content_str().map(|s| s.lines().count())
110 }
111}
112
113pub(crate) fn is_likely_binary(path: &Path) -> Result<bool> {
126 const BUFFER_SIZE: usize = 8192;
127 const ASCII_THRESHOLD: f64 = 0.85;
128
129 let file = File::open(path).map_err(|e| Error::io(path, e))?;
130 let mut reader = BufReader::with_capacity(BUFFER_SIZE, file);
131 let mut buffer = [0u8; BUFFER_SIZE];
132
133 let bytes_read = reader.read(&mut buffer).map_err(|e| Error::io(path, e))?;
134
135 if bytes_read == 0 {
136 return Ok(false);
137 }
138
139 let sample = &buffer[..bytes_read];
140
141 if memchr::memchr(0, sample).is_some() {
143 return Ok(true);
144 }
145
146 let ascii_count = sample.iter().filter(|&&b| b < 128).count();
148 let ascii_ratio = ascii_count as f64 / bytes_read as f64;
149
150 Ok(ascii_ratio < ASCII_THRESHOLD)
151}
152
/// Reports whether `path` ends in one of the built-in text file extensions.
///
/// The comparison ignores ASCII case, so `README.MD` and `MAIN.RS` match just
/// like their lowercase spellings. Paths without an extension, or whose
/// extension is not valid UTF-8, yield `false`.
#[must_use]
pub(crate) fn has_text_extension(path: &Path) -> bool {
    // Stored in lowercase; the list is short enough that a linear scan with a
    // case-insensitive comparison needs neither hashing nor allocation.
    const TEXT_EXTENSIONS: &[&str] = &[
        "rs", "toml", "md", "txt", "json", "yaml", "yml", "js", "ts", "jsx", "tsx", "py", "go",
        "java", "c", "cpp", "h", "hpp", "cs", "rb", "php", "html", "css", "scss", "sass",
        "xml", "svg", "sh", "bash", "zsh", "fish", "vim", "lua",
    ];

    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => TEXT_EXTENSIONS
            .iter()
            .any(|known| known.eq_ignore_ascii_case(ext)),
        None => false,
    }
}
170
171#[must_use]
173pub(crate) fn has_binary_extension(path: &Path) -> bool {
174 path.extension()
175 .and_then(|ext| ext.to_str())
176 .map(|ext| BINARY_EXTENSIONS.contains(ext))
177 .unwrap_or(false)
178}
179
#[cfg(test)]
mod tests {
    use super::*;
    use assert_fs::prelude::*;
    use std::fs::File;
    use std::io::Write;

    /// Builds a text entry named `test.rs` with the given body and token count.
    fn rust_file(body: &str, tokens: usize) -> FileData {
        FileData::new_text(
            PathBuf::from("test.rs"),
            "test.rs".to_string(),
            body.to_string(),
            tokens,
        )
    }

    /// Builds a 1024-byte binary entry named `test.exe`.
    fn exe_file() -> FileData {
        FileData::new_binary(PathBuf::from("test.exe"), "test.exe".to_string(), 1024)
    }

    #[test]
    fn test_file_data_text() {
        let entry = rust_file("fn main() {}", 3);

        assert!(entry.is_text());
        assert!(!entry.is_binary());
        assert_eq!(entry.content_str(), Some("fn main() {}"));
        assert_eq!(entry.token_count, 3);
    }

    #[test]
    fn test_file_data_binary() {
        let entry = exe_file();

        assert!(entry.is_binary());
        assert!(!entry.is_text());
        assert_eq!(entry.content_str(), None);
        assert_eq!(entry.size_bytes(), 1024);
    }

    #[test]
    fn test_is_likely_binary_text_file() {
        let dir = assert_fs::TempDir::new().unwrap();
        let text_file = dir.child("test.txt");
        text_file.write_str("Hello, world!").unwrap();

        let verdict = is_likely_binary(text_file.path()).unwrap();
        assert!(!verdict);
    }

    #[test]
    fn test_is_likely_binary_binary_file() {
        let dir = assert_fs::TempDir::new().unwrap();
        let bin_file = dir.child("test.bin");

        // One hundred NUL bytes: the NUL check alone must flag this as binary.
        let mut handle = File::create(bin_file.path()).unwrap();
        handle.write_all(&[0u8; 100]).unwrap();

        let verdict = is_likely_binary(bin_file.path()).unwrap();
        assert!(verdict);
    }

    #[test]
    fn test_is_likely_binary_empty_file() {
        let dir = assert_fs::TempDir::new().unwrap();
        let empty_file = dir.child("empty.txt");
        empty_file.touch().unwrap();

        let verdict = is_likely_binary(empty_file.path()).unwrap();
        assert!(!verdict);
    }

    #[test]
    fn test_has_text_extension() {
        for name in ["test.rs", "config.toml", "README.md"] {
            assert!(has_text_extension(Path::new(name)), "{} should be text", name);
        }
        for name in ["binary.exe", "no_extension"] {
            assert!(!has_text_extension(Path::new(name)), "{} should not be text", name);
        }
    }

    #[test]
    fn test_has_binary_extension() {
        for name in ["app.exe", "image.png", "archive.zip"] {
            assert!(has_binary_extension(Path::new(name)), "{} should be binary", name);
        }
        assert!(!has_binary_extension(Path::new("code.rs")));
    }

    #[test]
    fn test_line_count() {
        let entry = rust_file("line1\nline2\nline3", 5);

        assert_eq!(entry.line_count(), Some(3));
    }

    #[test]
    fn test_line_count_binary() {
        let entry = exe_file();

        assert_eq!(entry.line_count(), None);
    }
}