Skip to main content

st/
interest_calculator.rs

1//
2// -----------------------------------------------------------------------------
3//  INTEREST CALCULATOR: The Scoring Engine
4//
5//  This is where the magic happens. We take raw file metadata and turn it into
6//  actionable intelligence: "Is this file interesting right now?"
7//
8//  The calculator weighs multiple factors:
9//  - Recency: Modified in the last 24h? That's hot.
10//  - Security: Suspicious patterns? Critical.
11//  - Key files: README, Cargo.toml, package.json? Important.
12//  - Changes: Different from last scan? Notable.
13//  - Context: Inside node_modules? Probably boring (unless suspicious).
14//
15//  "Interest is contextual. A config file is boring until it changes." - Omni
16// -----------------------------------------------------------------------------
17//
18
19use crate::scanner::{FileCategory, FileNode, FilesystemType};
20use crate::scanner_interest::{
21    ChangeType, DependencyManager, InterestFactor, InterestLevel, InterestScore,
22    InterestWeights, KeyFileType, RiskLevel, TraversalContext, TraversalPath,
23};
24use crate::scanner_state::{FileSignature, ScanState};
25use crate::security_scan::{SecurityFinding, SecurityScanner};
26use std::collections::HashSet;
27use std::path::Path;
28use std::time::SystemTime;
29
30/// The Interest Calculator - determines what's worth showing
31pub struct InterestCalculator {
32    /// Weights for different interest factors
33    weights: InterestWeights,
34
35    /// Previous scan state for change detection
36    previous_state: Option<ScanState>,
37
38    /// Directories marked as "hot" (frequent changes)
39    hot_dirs: HashSet<std::path::PathBuf>,
40
41    /// Security scanner for detecting suspicious patterns
42    security_scanner: Option<SecurityScanner>,
43
44    /// Current time (cached for consistency during scan)
45    now: SystemTime,
46}
47
48impl InterestCalculator {
49    /// Create a new interest calculator with default weights
50    pub fn new() -> Self {
51        Self {
52            weights: InterestWeights::default(),
53            previous_state: None,
54            hot_dirs: HashSet::new(),
55            security_scanner: Some(SecurityScanner::new()),
56            now: SystemTime::now(),
57        }
58    }
59
60    /// Create with custom weights
61    pub fn with_weights(weights: InterestWeights) -> Self {
62        Self {
63            weights,
64            previous_state: None,
65            hot_dirs: HashSet::new(),
66            security_scanner: Some(SecurityScanner::new()),
67            now: SystemTime::now(),
68        }
69    }
70
71    /// Set the previous state for change detection
72    pub fn with_previous_state(mut self, state: ScanState) -> Self {
73        self.previous_state = Some(state);
74        self
75    }
76
77    /// Set hot directories to watch
78    pub fn with_hot_dirs(mut self, dirs: HashSet<std::path::PathBuf>) -> Self {
79        self.hot_dirs = dirs;
80        self
81    }
82
83    /// Disable security scanning (for performance)
84    pub fn without_security(mut self) -> Self {
85        self.security_scanner = None;
86        self
87    }
88
89    /// Calculate the interest score for a file node
90    pub fn calculate(&self, node: &FileNode) -> InterestScore {
91        let mut factors = Vec::new();
92
93        // Factor 1: Recently modified
94        if let Some(factor) = self.check_recency(node) {
95            factors.push(factor);
96        }
97
98        // Factor 2: Key project file
99        if let Some(factor) = self.check_key_file(node) {
100            factors.push(factor);
101        }
102
103        // Factor 3: Changed since last scan
104        if let Some(factor) = self.check_changed(node) {
105            factors.push(factor);
106        }
107
108        // Factor 4: In hot directory
109        if let Some(factor) = self.check_hot_dir(node) {
110            factors.push(factor);
111        }
112
113        // Factor 5: Inside dependency tree (negative weight)
114        if let Some(factor) = self.check_dependency_context(node) {
115            factors.push(factor);
116        }
117
118        // Factor 6: Virtual filesystem (usually boring)
119        if let Some(factor) = self.check_filesystem_type(node) {
120            factors.push(factor);
121        }
122
123        // Factor 7: File category boost
124        if let Some(factor) = self.check_category_boost(node) {
125            factors.push(factor);
126        }
127
128        InterestScore::from_factors(factors)
129    }
130
131    /// Calculate interest and include security findings
132    pub fn calculate_with_security(
133        &self,
134        node: &FileNode,
135        content: Option<&str>,
136    ) -> (InterestScore, Vec<SecurityFinding>) {
137        let mut factors = Vec::new();
138        let mut findings = Vec::new();
139
140        // Run security scan if enabled and we have content
141        if let (Some(scanner), Some(content)) = (&self.security_scanner, content) {
142            let file_findings = scanner.scan_file_content(&node.path, content);
143            for finding in &file_findings {
144                let risk_level = match finding.risk_level {
145                    crate::security_scan::RiskLevel::Critical => RiskLevel::Critical,
146                    crate::security_scan::RiskLevel::High => RiskLevel::High,
147                    crate::security_scan::RiskLevel::Medium => RiskLevel::Medium,
148                    crate::security_scan::RiskLevel::Low => RiskLevel::Low,
149                };
150
151                factors.push(InterestFactor::SecurityPattern {
152                    risk: risk_level,
153                    description: finding.description.clone(),
154                    weight: match finding.risk_level {
155                        crate::security_scan::RiskLevel::Critical => 1.0,
156                        crate::security_scan::RiskLevel::High => 0.8,
157                        crate::security_scan::RiskLevel::Medium => 0.5,
158                        crate::security_scan::RiskLevel::Low => 0.2,
159                    },
160                });
161            }
162            findings = file_findings;
163        }
164
165        // Add all other factors
166        if let Some(factor) = self.check_recency(node) {
167            factors.push(factor);
168        }
169        if let Some(factor) = self.check_key_file(node) {
170            factors.push(factor);
171        }
172        if let Some(factor) = self.check_changed(node) {
173            factors.push(factor);
174        }
175        if let Some(factor) = self.check_hot_dir(node) {
176            factors.push(factor);
177        }
178        if let Some(factor) = self.check_dependency_context(node) {
179            factors.push(factor);
180        }
181        if let Some(factor) = self.check_filesystem_type(node) {
182            factors.push(factor);
183        }
184        if let Some(factor) = self.check_category_boost(node) {
185            factors.push(factor);
186        }
187
188        (InterestScore::from_factors(factors), findings)
189    }
190
191    /// Check if file was recently modified
192    fn check_recency(&self, node: &FileNode) -> Option<InterestFactor> {
193        let duration = self.now.duration_since(node.modified).ok()?;
194        let hours = duration.as_secs_f32() / 3600.0;
195
196        // Interest decays over time
197        let weight = if hours < 1.0 {
198            self.weights.recent_modification * 1.5 // Very recent boost
199        } else if hours < 24.0 {
200            self.weights.recent_modification * (1.0 - hours / 48.0)
201        } else if hours < 168.0 {
202            // Within a week
203            self.weights.recent_modification * 0.3 * (1.0 - hours / 336.0)
204        } else {
205            return None; // Too old to matter
206        };
207
208        if weight > 0.05 {
209            Some(InterestFactor::RecentlyModified {
210                hours_ago: hours,
211                weight,
212            })
213        } else {
214            None
215        }
216    }
217
218    /// Check if this is a key project file
219    fn check_key_file(&self, node: &FileNode) -> Option<InterestFactor> {
220        if node.is_dir {
221            return None;
222        }
223
224        let file_name = node.path.file_name()?.to_str()?;
225        let file_name_lower = file_name.to_lowercase();
226
227        let key_type = match file_name_lower.as_str() {
228            // Documentation
229            "readme.md" | "readme" | "readme.txt" | "changelog.md" | "changelog" | "history.md" => {
230                Some(KeyFileType::Documentation)
231            }
232
233            // Build configs
234            "cargo.toml" | "package.json" | "pyproject.toml" | "go.mod" | "gemfile"
235            | "build.gradle" | "pom.xml" | "makefile" | "cmakelists.txt" => {
236                Some(KeyFileType::BuildConfig)
237            }
238
239            // Configuration
240            ".env" | ".env.local" | ".env.example" | "config.toml" | "config.yaml"
241            | "config.json" | "settings.toml" | "settings.yaml" => Some(KeyFileType::Configuration),
242
243            // Entry points
244            "main.rs" | "lib.rs" | "mod.rs" | "index.js" | "index.ts" | "main.py" | "__init__.py"
245            | "app.py" | "main.go" | "main.java" => Some(KeyFileType::EntryPoint),
246
247            // License
248            "license" | "license.md" | "license.txt" | "copying" => Some(KeyFileType::License),
249
250            // CI/CD
251            ".gitlab-ci.yml" | "jenkinsfile" | ".travis.yml" | "azure-pipelines.yml" => {
252                Some(KeyFileType::CiConfig)
253            }
254
255            // Container
256            "dockerfile" | "docker-compose.yml" | "docker-compose.yaml" | "containerfile" => {
257                Some(KeyFileType::Container)
258            }
259
260            // AI config
261            "claude.md" | ".cursorrules" | ".aider" | "copilot.md" => Some(KeyFileType::AiConfig),
262
263            _ => None,
264        };
265
266        // Also check for GitHub workflows
267        let key_type = key_type.or_else(|| {
268            if node.path.to_string_lossy().contains(".github/workflows") {
269                Some(KeyFileType::CiConfig)
270            } else {
271                None
272            }
273        });
274
275        key_type.map(|file_type| InterestFactor::KeyProjectFile {
276            file_type,
277            weight: self.weights.key_file,
278        })
279    }
280
281    /// Check if file changed since last scan
282    fn check_changed(&self, node: &FileNode) -> Option<InterestFactor> {
283        let prev_state = self.previous_state.as_ref()?;
284        let prev_sig = prev_state.signatures.get(&node.path);
285
286        match prev_sig {
287            None => {
288                // File is new
289                Some(InterestFactor::ChangedSinceLastScan {
290                    change: ChangeType::Added,
291                    weight: self.weights.changed_since_scan,
292                })
293            }
294            Some(old_sig) => {
295                // Check if changed
296                let new_sig = FileSignature::from_path(&node.path).ok()?;
297
298                if new_sig.changed(old_sig) {
299                    let change_type = if old_sig.permissions != new_sig.permissions {
300                        ChangeType::PermissionChanged
301                    } else {
302                        ChangeType::Modified
303                    };
304
305                    Some(InterestFactor::ChangedSinceLastScan {
306                        change: change_type,
307                        weight: self.weights.changed_since_scan,
308                    })
309                } else {
310                    None
311                }
312            }
313        }
314    }
315
316    /// Check if file is in a hot directory
317    fn check_hot_dir(&self, node: &FileNode) -> Option<InterestFactor> {
318        // Check if any ancestor is a hot directory
319        for ancestor in node.path.ancestors() {
320            if self.hot_dirs.contains(ancestor) {
321                return Some(InterestFactor::HotDirectory {
322                    change_count: 0, // We don't track exact count here
323                    weight: self.weights.hot_directory,
324                });
325            }
326        }
327        None
328    }
329
330    /// Check if inside a dependency tree (reduces interest)
331    fn check_dependency_context(&self, node: &FileNode) -> Option<InterestFactor> {
332        let path_str = node.path.to_string_lossy();
333
334        // Check for common dependency directories
335        let dep_indicators = [
336            ("node_modules", DependencyManager::Npm),
337            ("target/debug", DependencyManager::Cargo),
338            ("target/release", DependencyManager::Cargo),
339            (".venv", DependencyManager::Python),
340            ("venv", DependencyManager::Python),
341            ("__pycache__", DependencyManager::Python),
342            ("vendor", DependencyManager::Go), // Could also be Ruby/PHP
343            (".m2", DependencyManager::Java),
344            ("build/classes", DependencyManager::Java),
345        ];
346
347        for (indicator, _manager) in &dep_indicators {
348            if path_str.contains(indicator) {
349                // Calculate depth inside dependency tree
350                let depth = path_str
351                    .split(indicator)
352                    .nth(1)
353                    .map(|s| s.matches('/').count())
354                    .unwrap_or(0);
355
356                return Some(InterestFactor::InDependencyTree {
357                    depth,
358                    weight: self.weights.dependency_depth_penalty * (depth as f32 + 1.0),
359                });
360            }
361        }
362
363        None
364    }
365
366    /// Check filesystem type (virtual filesystems are less interesting)
367    fn check_filesystem_type(&self, node: &FileNode) -> Option<InterestFactor> {
368        match node.filesystem_type {
369            FilesystemType::Procfs | FilesystemType::Sysfs | FilesystemType::Devfs => {
370                Some(InterestFactor::InDependencyTree {
371                    depth: 0,
372                    weight: -0.5, // Strong negative for virtual filesystems
373                })
374            }
375            FilesystemType::Tmpfs => Some(InterestFactor::InDependencyTree {
376                depth: 0,
377                weight: -0.2, // Mild negative for temp filesystems
378            }),
379            _ => None,
380        }
381    }
382
383    /// Boost interest based on file category
384    fn check_category_boost(&self, node: &FileNode) -> Option<InterestFactor> {
385        if node.is_dir {
386            return None;
387        }
388
389        // Source code files are generally more interesting
390        let boost: f32 = match node.category {
391            FileCategory::Rust
392            | FileCategory::Python
393            | FileCategory::JavaScript
394            | FileCategory::TypeScript
395            | FileCategory::Go
396            | FileCategory::Java
397            | FileCategory::Cpp
398            | FileCategory::C => 0.1,
399
400            // Config and build files
401            FileCategory::Toml
402            | FileCategory::Yaml
403            | FileCategory::Json
404            | FileCategory::Makefile
405            | FileCategory::Dockerfile => 0.15,
406
407            // Documentation
408            FileCategory::Markdown | FileCategory::Readme => 0.1,
409
410            // Tests are interesting
411            FileCategory::Test => 0.1,
412
413            // Archives and binaries less interesting
414            FileCategory::Archive | FileCategory::Binary | FileCategory::DiskImage => -0.1,
415
416            // Temp and backup files not interesting
417            FileCategory::Temp | FileCategory::Backup => -0.2,
418
419            _ => 0.0,
420        };
421
422        if boost.abs() > 0.01 {
423            Some(InterestFactor::Custom {
424                name: format!("Category: {:?}", node.category),
425                weight: boost,
426            })
427        } else {
428            None
429        }
430    }
431
432    /// Build traversal context for a node
433    pub fn build_traversal_context(
434        &self,
435        node: &FileNode,
436        parent_interest: Option<InterestLevel>,
437    ) -> TraversalContext {
438        let path_str = node.path.to_string_lossy();
439
440        // Determine traversal path type
441        let traversal_path = if node.is_symlink {
442            TraversalPath::Symlink {
443                target: std::fs::read_link(&node.path).unwrap_or_default(),
444                target_exists: node.path.exists(),
445            }
446        } else if let Some((indicator, manager)) = self.find_dependency_indicator(&path_str) {
447            TraversalPath::Dependency {
448                manager,
449                dep_root: node
450                    .path
451                    .to_string_lossy()
452                    .split(indicator)
453                    .next()
454                    .map(|s| std::path::PathBuf::from(format!("{}{}", s, indicator)))
455                    .unwrap_or_default(),
456            }
457        } else {
458            TraversalPath::Direct
459        };
460
461        // Check for git worktree
462        let in_git_worktree = node.path.join(".git").exists()
463            || node
464                .path
465                .ancestors()
466                .any(|p| p.join(".git").exists());
467
468        // Check for submodule
469        let in_submodule = node
470            .path
471            .ancestors()
472            .any(|p| p.join(".git").is_file()); // Submodules have .git as file
473
474        TraversalContext {
475            path: traversal_path,
476            depth_from_root: node.depth,
477            in_git_worktree,
478            in_submodule,
479            parent_interest,
480        }
481    }
482
483    /// Find dependency indicator in path
484    fn find_dependency_indicator(&self, path: &str) -> Option<(&'static str, DependencyManager)> {
485        let indicators = [
486            ("node_modules", DependencyManager::Npm),
487            ("target/debug", DependencyManager::Cargo),
488            ("target/release", DependencyManager::Cargo),
489            (".venv", DependencyManager::Python),
490            ("venv", DependencyManager::Python),
491            ("vendor", DependencyManager::Go),
492            (".m2", DependencyManager::Java),
493        ];
494
495        for (indicator, manager) in indicators {
496            if path.contains(indicator) {
497                return Some((indicator, manager));
498            }
499        }
500        None
501    }
502}
503
504impl Default for InterestCalculator {
505    fn default() -> Self {
506        Self::new()
507    }
508}
509
510/// Quick helper to determine if a path is likely interesting
511pub fn quick_interest_check(path: &Path) -> InterestLevel {
512    let path_str = path.to_string_lossy();
513
514    // Critical paths
515    if path_str.contains(".env") && !path_str.contains(".env.example") {
516        return InterestLevel::Critical;
517    }
518
519    // Boring paths
520    let boring_patterns = [
521        "node_modules",
522        "target/debug",
523        "target/release",
524        "__pycache__",
525        ".git/objects",
526        ".venv",
527        "venv/lib",
528    ];
529
530    for pattern in boring_patterns {
531        if path_str.contains(pattern) {
532            return InterestLevel::Boring;
533        }
534    }
535
536    // Key files
537    if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
538        let name_lower = name.to_lowercase();
539        if matches!(
540            name_lower.as_str(),
541            "readme.md"
542                | "cargo.toml"
543                | "package.json"
544                | "main.rs"
545                | "lib.rs"
546                | "index.js"
547                | "index.ts"
548        ) {
549            return InterestLevel::Important;
550        }
551    }
552
553    InterestLevel::Background
554}
555
#[cfg(test)]
mod tests {
    use super::*;
    use crate::scanner::{FileCategory, FileType, FilesystemType};
    use std::path::PathBuf;
    use std::time::Duration;

    /// Build a minimal `FileNode` for `path` that was modified `hours_old` hours ago.
    fn make_test_node(path: &str, is_dir: bool, hours_old: f32) -> FileNode {
        let age = Duration::from_secs_f32(hours_old * 3600.0);
        let file_type = if is_dir {
            FileType::Directory
        } else {
            FileType::RegularFile
        };

        FileNode {
            path: PathBuf::from(path),
            is_dir,
            size: 1000,
            permissions: 0o644,
            uid: 1000,
            gid: 1000,
            modified: SystemTime::now() - age,
            is_symlink: false,
            is_hidden: false,
            permission_denied: false,
            is_ignored: false,
            depth: path.matches('/').count(),
            file_type,
            category: FileCategory::Unknown,
            search_matches: None,
            filesystem_type: FilesystemType::Unknown,
            git_branch: None,
            traversal_context: None,
            interest: None,
            security_findings: Vec::new(),
            change_status: None,
            content_hash: None,
        }
    }

    #[test]
    fn test_recency_scoring() {
        let calc = InterestCalculator::new();

        // Modified half an hour ago
        let fresh_score = calc.calculate(&make_test_node("src/main.rs", false, 0.5));
        assert!(fresh_score.score > 0.3, "Recent file should have high score");

        // Modified more than a week ago
        let stale_score = calc.calculate(&make_test_node("src/old.rs", false, 200.0));
        assert!(stale_score.score < 0.2, "Old file should have low score");
    }

    #[test]
    fn test_key_file_detection() {
        let calc = InterestCalculator::new();

        let readme_score = calc.calculate(&make_test_node("README.md", false, 100.0));
        assert!(
            readme_score.score >= 0.4,
            "README should be important: {}",
            readme_score.score
        );

        let cargo_score = calc.calculate(&make_test_node("Cargo.toml", false, 100.0));
        assert!(
            cargo_score.score >= 0.4,
            "Cargo.toml should be important: {}",
            cargo_score.score
        );
    }

    #[test]
    fn test_dependency_penalty() {
        let calc = InterestCalculator::new();

        // Both files are old so that no recency boost interferes.
        let node_mod_score = calc
            .calculate(&make_test_node("node_modules/lodash/index.js", false, 200.0))
            .score;
        // The penalty need not make the score negative; it only has to stay low...
        assert!(
            node_mod_score < 0.3,
            "node_modules file should have reduced interest: {}",
            node_mod_score
        );

        // ...and below the score of the same file name outside node_modules.
        let normal_score = calc
            .calculate(&make_test_node("src/utils/index.js", false, 200.0))
            .score;
        assert!(
            normal_score > node_mod_score,
            "Normal source file ({}) should have higher interest than node_modules ({})",
            normal_score,
            node_mod_score
        );
    }

    #[test]
    fn test_quick_interest_check() {
        let cases = [
            (".env", InterestLevel::Critical),
            ("node_modules/foo/bar.js", InterestLevel::Boring),
            ("README.md", InterestLevel::Important),
            ("src/utils.rs", InterestLevel::Background),
        ];

        for (path, expected) in cases {
            assert_eq!(quick_interest_check(Path::new(path)), expected);
        }
    }
}