llm_utl/
file.rs

1use crate::error::{Error, Result};
2use once_cell::sync::Lazy;
3use std::collections::HashSet;
4use std::fs::File;
5use std::io::{BufReader, Read};
6use std::path::{Path, PathBuf};
7
8static BINARY_EXTENSIONS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
9    [
10        "exe", "dll", "so", "dylib", "a", "o", "obj", "png", "jpg", "jpeg", "gif", "bmp", "ico",
11        "webp", "mp3", "mp4", "avi", "mkv", "mov", "wav", "flac", "pdf", "doc", "docx", "xls",
12        "xlsx", "ppt", "pptx", "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "wasm", "pyc",
13        "class",
14    ]
15    .into_iter()
16    .collect()
17});
18
/// A single file together with its payload and bookkeeping data.
#[derive(Debug, Clone)]
pub struct FileData {
    /// Absolute path to the file.
    pub absolute_path: PathBuf,

    /// Path of the file relative to the root directory.
    pub relative_path: String,

    /// Payload of the file: its text, or a size-only record for binary data.
    pub content: FileContent,

    /// Estimated token count (`0` when built through [`FileData::new_binary`]).
    pub token_count: usize,
}

/// Payload of a [`FileData`]: either the full text or only the size of a binary file.
#[derive(Debug, Clone)]
pub enum FileContent {
    /// Text content held as a UTF-8 string.
    Text(String),

    /// Binary content; only its size is recorded.
    Binary {
        /// Size of the binary file in bytes.
        size: u64,
    },
}

impl FileData {
    /// Builds a `FileData` for a text file from its paths, text and token estimate.
    #[must_use]
    pub fn new_text(
        absolute_path: PathBuf,
        relative_path: String,
        content: String,
        token_count: usize,
    ) -> Self {
        let content = FileContent::Text(content);
        Self {
            absolute_path,
            relative_path,
            content,
            token_count,
        }
    }

    /// Builds a `FileData` for a binary file; the token count is always `0`.
    #[must_use]
    pub fn new_binary(absolute_path: PathBuf, relative_path: String, size: u64) -> Self {
        let content = FileContent::Binary { size };
        Self {
            absolute_path,
            relative_path,
            content,
            token_count: 0,
        }
    }

    /// Returns `true` when the payload is text.
    #[must_use]
    pub const fn is_text(&self) -> bool {
        // `FileContent` has exactly two variants, so "not binary" means "text".
        !self.is_binary()
    }

    /// Returns `true` when the payload is binary.
    #[must_use]
    pub const fn is_binary(&self) -> bool {
        matches!(&self.content, FileContent::Binary { .. })
    }

    /// Returns the text of the file, or `None` for a binary file.
    #[must_use]
    pub fn content_str(&self) -> Option<&str> {
        if let FileContent::Text(text) = &self.content {
            Some(text.as_str())
        } else {
            None
        }
    }

    /// Returns the size in bytes: the UTF-8 length of the text, or the recorded binary size.
    #[must_use]
    pub fn size_bytes(&self) -> u64 {
        match self.content {
            FileContent::Binary { size } => size,
            FileContent::Text(ref text) => text.len() as u64,
        }
    }

    /// Returns the number of lines in the text, or `None` for a binary file.
    #[must_use]
    pub fn line_count(&self) -> Option<usize> {
        let text = self.content_str()?;
        Some(text.lines().count())
    }
}
112
113/// Determines if a file is likely binary by analyzing its content.
114///
115/// # Algorithm
116///
117/// 1. Reads the first 8KB of the file
118/// 2. Checks for null bytes (binary indicator)
119/// 3. Calculates the ratio of ASCII characters
120/// 4. Files with null bytes or low ASCII ratio are considered binary
121///
122/// # Errors
123///
124/// Returns an error if the file cannot be opened or read.
125pub(crate) fn is_likely_binary(path: &Path) -> Result<bool> {
126    const BUFFER_SIZE: usize = 8192;
127    const ASCII_THRESHOLD: f64 = 0.85;
128
129    let file = File::open(path).map_err(|e| Error::io(path, e))?;
130    let mut reader = BufReader::with_capacity(BUFFER_SIZE, file);
131    let mut buffer = [0u8; BUFFER_SIZE];
132
133    let bytes_read = reader.read(&mut buffer).map_err(|e| Error::io(path, e))?;
134
135    if bytes_read == 0 {
136        return Ok(false);
137    }
138
139    let sample = &buffer[..bytes_read];
140
141    // Быстрая проверка на null bytes с помощью memchr
142    if memchr::memchr(0, sample).is_some() {
143        return Ok(true);
144    }
145
146    // Подсчет ASCII символов
147    let ascii_count = sample.iter().filter(|&&b| b < 128).count();
148    let ascii_ratio = ascii_count as f64 / bytes_read as f64;
149
150    Ok(ascii_ratio < ASCII_THRESHOLD)
151}
152
/// Checks if a file extension suggests a text file.
///
/// The comparison ignores ASCII case, so `README.MD` and `readme.md` are treated alike.
/// Returns `false` when the path has no extension or the extension is not valid UTF-8.
#[must_use]
pub(crate) fn has_text_extension(path: &Path) -> bool {
    // Known text extensions, lowercase and without the leading dot.
    const TEXT_EXTENSIONS: &[&str] = &[
        "rs", "toml", "md", "txt", "json", "yaml", "yml", "js", "ts", "jsx", "tsx", "py", "go",
        "java", "c", "cpp", "h", "hpp", "cs", "rb", "php", "html", "css", "scss", "sass",
        "xml", "svg", "sh", "bash", "zsh", "fish", "vim", "lua",
    ];

    match path.extension().and_then(|ext| ext.to_str()) {
        // The list is short, so a case-insensitive scan needs no allocation or hashing.
        Some(ext) => TEXT_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
170
171/// Checks if a file extension suggests a binary file.
172#[must_use]
173pub(crate) fn has_binary_extension(path: &Path) -> bool {
174    path.extension()
175        .and_then(|ext| ext.to_str())
176        .map(|ext| BINARY_EXTENSIONS.contains(ext))
177        .unwrap_or(false)
178}
179
#[cfg(test)]
mod tests {
    use super::*;
    use assert_fs::prelude::*;
    use std::fs::File;
    use std::io::Write;

    /// Shared fixture: a 1024-byte binary file named `test.exe`.
    fn binary_fixture() -> FileData {
        let name = "test.exe";
        FileData::new_binary(PathBuf::from(name), name.to_string(), 1024)
    }

    /// Shared fixture: a text file named `test.rs` with the given body and token count.
    fn text_fixture(body: &str, tokens: usize) -> FileData {
        let name = "test.rs";
        FileData::new_text(
            PathBuf::from(name),
            name.to_string(),
            body.to_string(),
            tokens,
        )
    }

    // Text constructor: variant predicates, content access and token count.
    #[test]
    fn test_file_data_text() {
        let text_file = text_fixture("fn main() {}", 3);

        assert!(text_file.is_text());
        assert!(!text_file.is_binary());
        assert_eq!(text_file.content_str(), Some("fn main() {}"));
        assert_eq!(text_file.token_count, 3);
    }

    // Binary constructor: variant predicates, no text content, recorded size.
    #[test]
    fn test_file_data_binary() {
        let binary_file = binary_fixture();

        assert!(binary_file.is_binary());
        assert!(!binary_file.is_text());
        assert_eq!(binary_file.content_str(), None);
        assert_eq!(binary_file.size_bytes(), 1024);
    }

    // Plain ASCII text on disk is not reported as binary.
    #[test]
    fn test_is_likely_binary_text_file() {
        let dir = assert_fs::TempDir::new().unwrap();
        let text_path = dir.child("test.txt");
        text_path.write_str("Hello, world!").unwrap();

        let verdict = is_likely_binary(text_path.path()).unwrap();
        assert!(!verdict);
    }

    // A file made of null bytes is reported as binary.
    #[test]
    fn test_is_likely_binary_binary_file() {
        let dir = assert_fs::TempDir::new().unwrap();
        let bin_path = dir.child("test.bin");

        let zeros = [0u8; 100]; // Null bytes
        let mut handle = File::create(bin_path.path()).unwrap();
        handle.write_all(&zeros).unwrap();

        let verdict = is_likely_binary(bin_path.path()).unwrap();
        assert!(verdict);
    }

    // An empty file is not reported as binary.
    #[test]
    fn test_is_likely_binary_empty_file() {
        let dir = assert_fs::TempDir::new().unwrap();
        let empty_path = dir.child("empty.txt");
        empty_path.touch().unwrap();

        let verdict = is_likely_binary(empty_path.path()).unwrap();
        assert!(!verdict);
    }

    // Known text extensions match; binary or missing extensions do not.
    #[test]
    fn test_has_text_extension() {
        for accepted in ["test.rs", "config.toml", "README.md"] {
            assert!(has_text_extension(Path::new(accepted)));
        }
        for rejected in ["binary.exe", "no_extension"] {
            assert!(!has_text_extension(Path::new(rejected)));
        }
    }

    // Known binary extensions match; a source-code extension does not.
    #[test]
    fn test_has_binary_extension() {
        for accepted in ["app.exe", "image.png", "archive.zip"] {
            assert!(has_binary_extension(Path::new(accepted)));
        }
        assert!(!has_binary_extension(Path::new("code.rs")));
    }

    // Three newline-separated lines without a trailing newline count as three.
    #[test]
    fn test_line_count() {
        let text_file = text_fixture("line1\nline2\nline3", 5);

        assert_eq!(text_file.line_count(), Some(3));
    }

    // Binary files have no line count.
    #[test]
    fn test_line_count_binary() {
        let binary_file = binary_fixture();

        assert_eq!(binary_file.line_count(), None);
    }
}
276}