infiniloom_engine/scanner/
common.rs

1//! Common scanner utilities
2//!
3//! This module provides shared utilities for detecting binary files,
4//! used by both CLI and bindings scanners.
5
6use std::path::Path;
7
/// Lowercase file extensions (without the leading dot) treated as binary
///
/// Entries are grouped by category: executables and libraries, compiled
/// bytecode, archives, images, audio/video, documents, fonts, databases,
/// and miscellaneous binary blobs.
// Formatting is pinned so each category comment stays directly above the
// group it describes instead of being re-wrapped onto the previous line.
#[rustfmt::skip]
pub const BINARY_EXTENSIONS: &[&str] = &[
    // Executables and libraries
    "exe", "dll", "so", "dylib", "a", "o", "obj", "lib",
    // Compiled bytecode
    "pyc", "pyo", "class", "jar", "war", "ear",
    // Archives
    "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "tgz",
    // Images
    "png", "jpg", "jpeg", "gif", "bmp", "ico", "webp", "svg", "tiff", "psd",
    // Audio/Video
    "mp3", "mp4", "avi", "mov", "wav", "flac", "ogg", "webm", "mkv",
    // Documents
    "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "odt",
    // Fonts
    "woff", "woff2", "ttf", "eot", "otf",
    // Database
    "db", "sqlite", "sqlite3",
    // Misc binary
    "bin", "dat", "cache",
];
25
26/// Check if a file path has a known binary extension
27///
28/// # Arguments
29/// * `path` - Path to check
30///
31/// # Returns
32/// `true` if the file extension is in the known binary list
33///
34/// # Example
35/// ```
36/// use infiniloom_engine::scanner::is_binary_extension;
37/// use std::path::Path;
38///
39/// assert!(is_binary_extension(Path::new("image.png")));
40/// assert!(is_binary_extension(Path::new("archive.zip")));
41/// assert!(!is_binary_extension(Path::new("code.rs")));
42/// ```
43pub fn is_binary_extension(path: &Path) -> bool {
44    let ext = match path.extension().and_then(|e| e.to_str()) {
45        Some(e) => e.to_lowercase(),
46        None => return false,
47    };
48
49    BINARY_EXTENSIONS.contains(&ext.as_str())
50}
51
/// Heuristically decide whether a byte buffer holds binary data
///
/// Only the first 8KB of `content` is sampled. Within that sample, null
/// bytes and other control characters below 0x20 count as binary
/// indicators; tab, line feed, and carriage return do not. The content is
/// considered binary when the indicators make up more than 10% of the
/// sample. Empty input is treated as text.
///
/// # Arguments
/// * `content` - Byte slice to check (typically first 8KB of file)
///
/// # Returns
/// `true` if the content appears to be binary
///
/// # Example
/// ```
/// use infiniloom_engine::scanner::is_binary_content;
///
/// // Text content
/// assert!(!is_binary_content(b"fn main() { println!(\"hello\"); }"));
///
/// // Binary content (has null bytes)
/// assert!(is_binary_content(&[0x00, 0x01, 0x02, 0x00, 0x00]));
/// ```
pub fn is_binary_content(content: &[u8]) -> bool {
    const SAMPLE_LIMIT: usize = 8192;

    // Inspect at most the leading SAMPLE_LIMIT bytes; shorter inputs are
    // examined in full.
    let window = content.get(..SAMPLE_LIMIT).unwrap_or(content);
    if window.is_empty() {
        return false;
    }

    // Any byte below 0x20 (NUL included) is suspicious unless it is one of
    // the whitespace characters that routinely appear in text files.
    let suspicious = window
        .iter()
        .filter(|&&byte| byte < 0x20 && !matches!(byte, b'\t' | b'\n' | b'\r'))
        .count();

    // Integer form of "suspicious bytes are more than 10% of the sample".
    suspicious * 10 > window.len()
}
95
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every file name in `names` has a known binary extension.
    fn assert_binary_ext(names: &[&str]) {
        for name in names {
            assert!(
                is_binary_extension(Path::new(name)),
                "expected {:?} to have a binary extension",
                name
            );
        }
    }

    /// Asserts that no file name in `names` has a known binary extension.
    fn assert_not_binary_ext(names: &[&str]) {
        for name in names {
            assert!(
                !is_binary_extension(Path::new(name)),
                "expected {:?} not to have a binary extension",
                name
            );
        }
    }

    #[test]
    fn test_binary_extension_executables() {
        assert_binary_ext(&["program.exe", "lib.dll", "library.so", "framework.dylib"]);
    }

    #[test]
    fn test_binary_extension_archives() {
        assert_binary_ext(&["archive.zip", "backup.tar", "compressed.gz", "package.7z"]);
    }

    #[test]
    fn test_binary_extension_images() {
        assert_binary_ext(&["photo.jpg", "logo.png", "icon.gif", "image.webp"]);
    }

    #[test]
    fn test_binary_extension_media() {
        assert_binary_ext(&["song.mp3", "video.mp4", "movie.mkv"]);
    }

    #[test]
    fn test_binary_extension_documents() {
        assert_binary_ext(&["doc.pdf", "spreadsheet.xlsx", "presentation.pptx"]);
    }

    #[test]
    fn test_binary_extension_fonts() {
        assert_binary_ext(&["font.woff", "font.woff2", "font.ttf"]);
    }

    #[test]
    fn test_binary_extension_database() {
        assert_binary_ext(&["data.db", "store.sqlite", "cache.sqlite3"]);
    }

    #[test]
    fn test_non_binary_extensions() {
        assert_not_binary_ext(&[
            "code.rs",
            "script.py",
            "module.ts",
            "style.css",
            "data.json",
            "config.yaml",
            "readme.md",
        ]);
    }

    #[test]
    fn test_no_extension() {
        // Extension-less names and dotfiles have no extension to match.
        assert_not_binary_ext(&["Makefile", "Dockerfile", ".gitignore"]);
    }

    #[test]
    fn test_case_insensitive() {
        assert_binary_ext(&["FILE.PNG", "Archive.ZIP", "Video.MP4"]);
    }

    #[test]
    fn test_binary_content_text() {
        // Ordinary source/text snippets must not be flagged.
        let samples: [&[u8]; 3] = [
            b"fn main() {\n    println!(\"hello\");\n}",
            b"Hello, World!\n",
            b"def foo():\n    return 42\n",
        ];
        for sample in samples.iter() {
            assert!(!is_binary_content(sample));
        }
    }

    #[test]
    fn test_binary_content_with_nulls() {
        // A buffer made up entirely of null bytes is clearly binary.
        let all_nulls = [0u8; 100];
        assert!(is_binary_content(&all_nulls));

        // A little text followed by many nulls is still binary.
        let mut mixed = Vec::from(&b"some text"[..]);
        mixed.resize(mixed.len() + 100, 0);
        assert!(is_binary_content(&mixed));
    }

    #[test]
    fn test_binary_content_control_chars() {
        // Every control byte below 32 except tab, line feed, carriage return.
        let control: Vec<u8> = (0u8..32)
            .filter(|b| !matches!(*b, b'\n' | b'\r' | b'\t'))
            .collect();
        let mut content = control.repeat(10);
        content.extend_from_slice(b"some text");
        // Control characters dominate the sample, so it must be binary.
        assert!(is_binary_content(&content));
    }

    #[test]
    fn test_binary_content_empty() {
        assert!(!is_binary_content(b""));
    }

    #[test]
    fn test_binary_content_whitespace_ok() {
        // Tabs, newlines, and carriage returns are normal text whitespace.
        assert!(!is_binary_content(b"line1\nline2\r\nline3\ttabbed"));
    }
}