pith/
filter.rs

1//! File filtering using layered blocklist/allowlist/heuristics.
2//!
3//! Determines which files should be processed for codemap extraction.
4
5use std::path::{Path, PathBuf};
6use thiserror::Error;
7
8/// Errors that can occur during file filtering.
9#[derive(Debug, Error)]
10pub enum FilterError {
11    #[error("failed to read file for heuristics: {path}")]
12    ReadFailed { path: PathBuf },
13}
14
/// Supported programming languages for codemap extraction.
///
/// A language can be rendered with [`std::fmt::Display`] (lowercase canonical
/// name) and parsed back with [`std::str::FromStr`], which accepts the
/// canonical name or any of the language's file extensions, ignoring case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    Rust,
    TypeScript,
    Tsx,
    JavaScript,
    Jsx,
    Python,
    Go,
}

impl std::fmt::Display for Language {
    /// Writes the lowercase canonical name, e.g. `rust` or `typescript`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.canonical_name())
    }
}

impl std::str::FromStr for Language {
    type Err = String;

    /// Parses a language from its canonical name or one of its file
    /// extensions, case-insensitively.
    ///
    /// Every extension listed by [`Language::extensions`] is accepted, so
    /// `"mjs"`, `"cjs"` and `"pyi"` parse just like `"js"` and `"py"`.
    ///
    /// # Errors
    ///
    /// Returns `unknown language: <input>` (with the input as given) when the
    /// text matches neither a name nor an extension.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let key = s.to_lowercase();
        Language::all()
            .iter()
            .copied()
            .find(|lang| {
                lang.canonical_name() == key || lang.extensions().contains(&key.as_str())
            })
            .ok_or_else(|| format!("unknown language: {}", s))
    }
}

impl Language {
    /// Get all supported languages.
    pub fn all() -> &'static [Language] {
        &[
            Language::Rust,
            Language::TypeScript,
            Language::Tsx,
            Language::JavaScript,
            Language::Jsx,
            Language::Python,
            Language::Go,
        ]
    }

    /// Get file extensions (without the leading dot) for this language.
    pub fn extensions(&self) -> &'static [&'static str] {
        match self {
            Language::Rust => &["rs"],
            Language::TypeScript => &["ts"],
            Language::Tsx => &["tsx"],
            Language::JavaScript => &["js", "mjs", "cjs"],
            Language::Jsx => &["jsx"],
            Language::Python => &["py", "pyi"],
            Language::Go => &["go"],
        }
    }

    /// Lowercase canonical name; the single source of truth shared by the
    /// `Display` and `FromStr` implementations so the two cannot drift apart.
    fn canonical_name(self) -> &'static str {
        match self {
            Language::Rust => "rust",
            Language::TypeScript => "typescript",
            Language::Tsx => "tsx",
            Language::JavaScript => "javascript",
            Language::Jsx => "jsx",
            Language::Python => "python",
            Language::Go => "go",
        }
    }
}
85
86/// Result of filtering a file.
87#[derive(Debug, Clone, PartialEq, Eq)]
88pub enum FilterResult {
89    /// File accepted for processing with detected language.
90    Accept(Language),
91    /// File rejected with reason.
92    Reject(RejectReason),
93}
94
/// Why the filter decided to skip a file.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RejectReason {
    /// The extension is on the blocklist (binary, lock file, etc.).
    BlocklistedExtension,
    /// The extension does not belong to any supported language.
    UnknownExtension,
    /// The file has no extension.
    NoExtension,
    /// The content contains null bytes and is treated as binary.
    BinaryContent,
    /// The file looks minified (very long lines).
    MinifiedContent,
    /// The file looks generated (it contains a known marker).
    GeneratedFile,
}

impl std::fmt::Display for RejectReason {
    /// Writes a short lowercase description of the reason.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            RejectReason::BlocklistedExtension => "blocklisted extension",
            RejectReason::UnknownExtension => "unknown extension",
            RejectReason::NoExtension => "no extension",
            RejectReason::BinaryContent => "binary content",
            RejectReason::MinifiedContent => "minified content",
            RejectReason::GeneratedFile => "generated file",
        };
        f.write_str(label)
    }
}
124
/// Lowercase extensions (no leading dot) that are rejected outright,
/// before any language detection or content inspection.
const BLOCKLISTED_EXTENSIONS: &[&str] = &[
    // --- images ---
    "png", "jpg", "jpeg", "gif", "webp", "ico", "svg", "bmp", "tiff",
    // --- compiled / binary artifacts ---
    "wasm", "so", "dll", "dylib", "exe", "bin", "o", "a", "lib",
    // --- archives ---
    "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "tgz",
    // --- office documents ---
    "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
    // --- audio / video ---
    "mp3", "mp4", "wav", "avi", "mov", "mkv", "flac", "ogg",
    // --- fonts ---
    "ttf", "otf", "woff", "woff2", "eot",
    // --- lock files ---
    "lock",
    // --- source maps ---
    "map",
    // --- databases ---
    "db", "sqlite", "sqlite3",
];
146
/// Exact file names (compared case-sensitively) that are rejected outright.
/// All current entries are package-manager lock files.
const BLOCKLISTED_FILENAMES: &[&str] = &[
    // JavaScript package managers
    "package-lock.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    // Other ecosystems
    "Cargo.lock",
    "poetry.lock",
    "Gemfile.lock",
    "composer.lock",
];
157
/// Substrings whose presence near the top of a file marks it as generated.
/// Matching is exact and case-sensitive.
const GENERATED_MARKERS: &[&str] = &[
    "// Code generated",
    "// DO NOT EDIT",
    "# Generated by",
    "/* Auto-generated */",
    "// This file is auto-generated",
    // Bare tag with no comment prefix, so it matches in any comment syntax.
    "@generated",
    "// generated from",
    "// Autogenerated",
    "# Autogenerated",
    "# DO NOT EDIT",
    "<!-- Generated -->",
    "// THIS FILE IS GENERATED",
];
173
/// Longest line, in bytes and excluding the newline, that a file may contain
/// without being classified as minified.
const MAX_LINE_LENGTH: usize = 500;
176
177/// Check if an extension is blocklisted.
178pub fn is_blocklisted(extension: &str) -> bool {
179    let ext_lower = extension.to_lowercase();
180    BLOCKLISTED_EXTENSIONS.contains(&ext_lower.as_str())
181}
182
183/// Check if a filename is blocklisted.
184pub fn is_blocklisted_filename(filename: &str) -> bool {
185    BLOCKLISTED_FILENAMES.contains(&filename)
186}
187
188/// Detect language from file path based on extension.
189pub fn detect_language(path: &Path) -> Option<Language> {
190    let ext = path.extension()?.to_str()?.to_lowercase();
191
192    for lang in Language::all() {
193        if lang.extensions().contains(&ext.as_str()) {
194            return Some(*lang);
195        }
196    }
197    None
198}
199
/// Returns `true` when `content` contains at least one null byte, which is
/// taken as the sign of binary data.
pub fn is_binary(content: &[u8]) -> bool {
    const NUL: u8 = 0;
    content.iter().any(|&byte| byte == NUL)
}
204
205/// Check if content appears to be minified (very long lines).
206/// Uses early exit - returns as soon as a long line is found.
207pub fn is_minified(content: &[u8]) -> bool {
208    let mut line_len = 0;
209    for &b in content {
210        if b == b'\n' {
211            line_len = 0;
212        } else {
213            line_len += 1;
214            if line_len > MAX_LINE_LENGTH {
215                return true; // Early exit
216            }
217        }
218    }
219    false
220}
221
222/// Check if content appears to be generated.
223pub fn is_generated(content: &[u8]) -> bool {
224    // Only check first 2KB for efficiency
225    let check_len = content.len().min(2048);
226    let Ok(text) = std::str::from_utf8(&content[..check_len]) else {
227        return false; // Can't check non-UTF8
228    };
229
230    GENERATED_MARKERS
231        .iter()
232        .any(|marker| text.contains(marker))
233}
234
235/// Determine if a file should be processed for codemap extraction.
236///
237/// Uses a layered filtering approach:
238/// 1. Extension blocklist (instant skip)
239/// 2. Extension allowlist (supported languages)
240/// 3. Content heuristics (binary, minified, generated)
241///
242/// # Arguments
243///
244/// * `path` - File path to check
245/// * `content` - Optional first 1-2KB of file content for heuristics.
246///   Pass `None` to skip heuristic checks.
247///
248/// # Examples
249///
250/// ```
251/// use std::path::Path;
252/// use pith::filter::{should_process, FilterResult, Language};
253///
254/// // Check by extension only
255/// let result = should_process(Path::new("src/main.rs"), None);
256/// assert!(matches!(result, FilterResult::Accept(Language::Rust)));
257///
258/// // Check with content heuristics
259/// let content = b"fn main() {}";
260/// let result = should_process(Path::new("src/main.rs"), Some(content));
261/// assert!(matches!(result, FilterResult::Accept(Language::Rust)));
262/// ```
263pub fn should_process(path: &Path, content: Option<&[u8]>) -> FilterResult {
264    // Check filename blocklist first
265    if let Some(filename) = path.file_name().and_then(|f| f.to_str()) {
266        if is_blocklisted_filename(filename) {
267            return FilterResult::Reject(RejectReason::BlocklistedExtension);
268        }
269    }
270
271    // Get extension
272    let ext = match path.extension().and_then(|e| e.to_str()) {
273        Some(e) => e.to_lowercase(),
274        None => return FilterResult::Reject(RejectReason::NoExtension),
275    };
276
277    // Layer 1: Blocklist
278    if is_blocklisted(&ext) {
279        return FilterResult::Reject(RejectReason::BlocklistedExtension);
280    }
281
282    // Check for minified JS/CSS by filename pattern
283    if let Some(stem) = path.file_stem().and_then(|s| s.to_str()) {
284        if stem.to_lowercase().ends_with(".min") {
285            return FilterResult::Reject(RejectReason::MinifiedContent);
286        }
287    }
288
289    // Layer 2: Allowlist (language detection)
290    let Some(language) = detect_language(path) else {
291        return FilterResult::Reject(RejectReason::UnknownExtension);
292    };
293
294    // Layer 3: Content heuristics (if content provided)
295    if let Some(content) = content {
296        if is_binary(content) {
297            return FilterResult::Reject(RejectReason::BinaryContent);
298        }
299
300        if is_minified(content) {
301            return FilterResult::Reject(RejectReason::MinifiedContent);
302        }
303
304        if is_generated(content) {
305            return FilterResult::Reject(RejectReason::GeneratedFile);
306        }
307    }
308
309    FilterResult::Accept(language)
310}
311
312/// Check if a path passes basic extension filtering (no content check).
313///
314/// Useful for quick filtering before reading file content.
315pub fn passes_extension_filter(path: &Path) -> Option<Language> {
316    match should_process(path, None) {
317        FilterResult::Accept(lang) => Some(lang),
318        FilterResult::Reject(_) => None,
319    }
320}
321
#[cfg(test)]
mod tests {
    use super::*;

    /// Filters `path` using only its name and extension.
    fn by_path(path: &str) -> FilterResult {
        should_process(Path::new(path), None)
    }

    /// Filters `path` with the content heuristics enabled.
    fn with_content(path: &str, content: &[u8]) -> FilterResult {
        should_process(Path::new(path), Some(content))
    }

    /// Shorthand for an accepted result.
    fn accept(language: Language) -> FilterResult {
        FilterResult::Accept(language)
    }

    /// Shorthand for a rejected result.
    fn reject(reason: RejectReason) -> FilterResult {
        FilterResult::Reject(reason)
    }

    #[test]
    fn test_rust_file() {
        assert_eq!(by_path("src/main.rs"), accept(Language::Rust));
    }

    #[test]
    fn test_typescript_file() {
        assert_eq!(by_path("src/index.ts"), accept(Language::TypeScript));
    }

    #[test]
    fn test_tsx_file() {
        assert_eq!(by_path("components/App.tsx"), accept(Language::Tsx));
    }

    #[test]
    fn test_javascript_file() {
        assert_eq!(by_path("lib/utils.js"), accept(Language::JavaScript));
        assert_eq!(by_path("lib/utils.mjs"), accept(Language::JavaScript));
    }

    #[test]
    fn test_python_file() {
        assert_eq!(by_path("script.py"), accept(Language::Python));
        assert_eq!(by_path("types.pyi"), accept(Language::Python));
    }

    #[test]
    fn test_go_file() {
        assert_eq!(by_path("main.go"), accept(Language::Go));
    }

    #[test]
    fn test_blocklisted_extension() {
        assert_eq!(
            by_path("image.png"),
            reject(RejectReason::BlocklistedExtension)
        );
        assert_eq!(
            by_path("archive.zip"),
            reject(RejectReason::BlocklistedExtension)
        );
    }

    #[test]
    fn test_blocklisted_filename() {
        assert_eq!(
            by_path("package-lock.json"),
            reject(RejectReason::BlocklistedExtension)
        );
        assert_eq!(
            by_path("Cargo.lock"),
            reject(RejectReason::BlocklistedExtension)
        );
    }

    #[test]
    fn test_unknown_extension() {
        assert_eq!(by_path("README.md"), reject(RejectReason::UnknownExtension));
        assert_eq!(by_path("config.yaml"), reject(RejectReason::UnknownExtension));
    }

    #[test]
    fn test_no_extension() {
        assert_eq!(by_path("Makefile"), reject(RejectReason::NoExtension));
    }

    #[test]
    fn test_minified_filename() {
        assert_eq!(by_path("bundle.min.js"), reject(RejectReason::MinifiedContent));
    }

    #[test]
    fn test_binary_content() {
        assert_eq!(
            with_content("test.rs", b"fn main() {\x00}"),
            reject(RejectReason::BinaryContent)
        );
    }

    #[test]
    fn test_minified_content() {
        // The middle line is longer than MAX_LINE_LENGTH.
        let long_line = "x".repeat(MAX_LINE_LENGTH + 100);
        let content = format!("var x = {{\n{}\n}}", long_line);
        assert_eq!(
            with_content("bundle.js", content.as_bytes()),
            reject(RejectReason::MinifiedContent)
        );
    }

    #[test]
    fn test_generated_content() {
        assert_eq!(
            with_content(
                "proto.go",
                b"// Code generated by protoc. DO NOT EDIT.\npackage main"
            ),
            reject(RejectReason::GeneratedFile)
        );
    }

    #[test]
    fn test_valid_content() {
        assert_eq!(
            with_content("main.rs", b"fn main() {\n    println!(\"Hello\");\n}"),
            accept(Language::Rust)
        );
    }

    #[test]
    fn test_language_extensions() {
        assert_eq!(Language::Rust.extensions(), &["rs"]);
        assert_eq!(Language::JavaScript.extensions(), &["js", "mjs", "cjs"]);
        assert_eq!(Language::Python.extensions(), &["py", "pyi"]);
    }

    #[test]
    fn test_language_from_str() {
        let parse = |text: &str| text.parse::<Language>();

        assert_eq!(parse("rust").unwrap(), Language::Rust);
        assert_eq!(parse("rs").unwrap(), Language::Rust);
        assert_eq!(parse("typescript").unwrap(), Language::TypeScript);
        assert_eq!(parse("ts").unwrap(), Language::TypeScript);
        assert!(parse("invalid").is_err());
    }

    #[test]
    fn test_passes_extension_filter() {
        assert_eq!(
            passes_extension_filter(Path::new("main.rs")),
            Some(Language::Rust)
        );
        assert_eq!(passes_extension_filter(Path::new("image.png")), None);
    }
}
473}