scribe_analysis/heuristics/
mod.rs

1//! # Heuristic Scoring System for File Prioritization
2//!
3//! Implements a sophisticated multi-dimensional scoring system for ranking file importance
4//! within a codebase. This module ports and enhances the legacy Python heuristics that
5//! originally powered Scribe, while adding more robust Rust-based features:
6//!
7//! ## Core Scoring Formula
8//! ```text
9//! score = w_doc*doc + w_readme*readme + w_imp*imp_deg + w_path*path_depth^-1 +
10//!         w_test*test_link + w_churn*churn + w_centrality*centrality +
11//!         w_entrypoint*entrypoint + w_examples*examples + priority_boost
12//! ```
13//!
14//! ## Key Features
15//! - **Template Detection**: Advanced template engine detection using multiple methods
16//! - **Import Graph Analysis**: Dependency centrality and PageRank calculations
17//! - **Documentation Intelligence**: README prioritization and document structure analysis
18//! - **Test-Code Relationships**: Heuristic test file linkage detection
19//! - **Git Churn Integration**: Change recency and frequency signals
20//! - **Configurable Weighting**: V1/V2 feature flags with dynamic weight adjustment
21//! - **Performance Optimized**: Caching and lazy evaluation for large codebases
22
23pub mod import_analysis;
24pub mod scoring;
25pub mod template_detection;
26
27pub use scoring::{
28    HeuristicScorer, HeuristicWeights, ScoreComponents, ScoringFeatures, WeightPreset,
29};
30
31pub use template_detection::{
32    get_template_score_boost, is_template_file, TemplateDetectionMethod, TemplateDetector,
33    TemplateEngine,
34};
35
36pub use import_analysis::{
37    import_matches_file, CentralityCalculator, ImportGraph, ImportGraphBuilder,
38};
39
40use scribe_core::Result;
41use std::collections::HashMap;
42
43/// Main entry point for the heuristic scoring system
44#[derive(Debug)]
45pub struct HeuristicSystem {
46    /// Core file scorer
47    scorer: HeuristicScorer,
48    /// Template detection engine
49    template_detector: TemplateDetector,
50}
51
52impl HeuristicSystem {
53    /// Create a new heuristic system with default configuration
54    pub fn new() -> Result<Self> {
55        Ok(Self {
56            scorer: HeuristicScorer::new(HeuristicWeights::default()),
57            template_detector: TemplateDetector::new()?,
58        })
59    }
60
61    /// Create with custom weights
62    pub fn with_weights(weights: HeuristicWeights) -> Result<Self> {
63        Ok(Self {
64            scorer: HeuristicScorer::new(weights),
65            template_detector: TemplateDetector::new()?,
66        })
67    }
68
69    /// Create with V2 features enabled
70    pub fn with_v2_features() -> Result<Self> {
71        Ok(Self {
72            scorer: HeuristicScorer::new(HeuristicWeights::with_v2_features()),
73            template_detector: TemplateDetector::new()?,
74        })
75    }
76
77    /// Score a single file within the context of all scanned files
78    pub fn score_file<T>(&mut self, file: &T, all_files: &[T]) -> Result<ScoreComponents>
79    where
80        T: ScanResult,
81    {
82        self.scorer.score_file(file, all_files)
83    }
84
85    /// Score all files and return ranked results
86    pub fn score_all_files<T>(&mut self, files: &[T]) -> Result<Vec<(usize, ScoreComponents)>>
87    where
88        T: ScanResult,
89    {
90        self.scorer.score_all_files(files)
91    }
92
93    /// Get top-K files by heuristic score
94    pub fn get_top_files<T>(
95        &mut self,
96        files: &[T],
97        top_k: usize,
98    ) -> Result<Vec<(usize, ScoreComponents)>>
99    where
100        T: ScanResult,
101    {
102        Ok(self
103            .score_all_files(files)?
104            .into_iter()
105            .take(top_k)
106            .collect())
107    }
108
109    /// Get template score boost for a file path
110    pub fn get_template_boost(&self, file_path: &str) -> Result<f64> {
111        self.template_detector.get_score_boost(file_path)
112    }
113
114    /// Check if two imports match (for import graph construction)
115    pub fn import_matches(&self, import_name: &str, file_path: &str) -> bool {
116        import_analysis::import_matches_file(import_name, file_path)
117    }
118}
119
120impl Default for HeuristicSystem {
121    fn default() -> Self {
122        Self::new().expect("Failed to create HeuristicSystem")
123    }
124}
125
126/// Trait that scan results must implement for heuristic scoring
127pub trait ScanResult {
128    /// Get the file path
129    fn path(&self) -> &str;
130
131    /// Get the relative path from repository root
132    fn relative_path(&self) -> &str;
133
134    /// Get file depth (directory nesting level)
135    fn depth(&self) -> usize;
136
137    /// Check if this is a documentation file
138    fn is_docs(&self) -> bool;
139
140    /// Check if this is a README file
141    fn is_readme(&self) -> bool;
142
143    /// Check if this is a test file
144    fn is_test(&self) -> bool;
145
146    /// Check if this is an entrypoint file (main, index, etc.)
147    fn is_entrypoint(&self) -> bool;
148
149    /// Check if this file contains examples
150    fn has_examples(&self) -> bool;
151
152    /// Get the priority boost value (from scanner)
153    fn priority_boost(&self) -> f64;
154
155    /// Get the churn score (git activity)
156    fn churn_score(&self) -> f64;
157
158    /// Get centrality in score (PageRank)
159    fn centrality_in(&self) -> f64;
160
161    /// Get list of import statements
162    fn imports(&self) -> Option<&[String]>;
163
164    /// Get document analysis results (if available)
165    fn doc_analysis(&self) -> Option<&DocumentAnalysis>;
166}
167
/// Structural statistics about a document, used as scoring input.
///
/// The default value (also returned by [`DocumentAnalysis::new`]) has every counter at
/// zero and `is_well_structured` set to `false`.
#[derive(Debug, Clone, Default)]
pub struct DocumentAnalysis {
    /// How many headings the document contains.
    pub heading_count: usize,
    /// How many table-of-contents indicators were found.
    pub toc_indicators: usize,
    /// How many links the document contains.
    pub link_count: usize,
    /// How many code blocks the document contains.
    pub code_block_count: usize,
    /// Whether the document appears well-structured.
    ///
    /// This flag is carried alongside the counters; `structure_score` does not read it.
    pub is_well_structured: bool,
}

impl DocumentAnalysis {
    /// Create an analysis with all counters at zero and `is_well_structured == false`.
    pub fn new() -> Self {
        Self::default()
    }

    /// Compute a structure score from the counters.
    ///
    /// The score is the sum of four independent contributions:
    ///
    /// | signal      | contribution                  | maximum |
    /// |-------------|-------------------------------|---------|
    /// | headings    | `heading_count / 10`          | 0.5     |
    /// | TOC         | flat bonus when any indicator | 0.3     |
    /// | links       | `link_count / 20`             | 0.3     |
    /// | code blocks | `code_block_count / 10`       | 0.2     |
    ///
    /// The result therefore lies in `0.0..=1.3` (up to floating-point rounding).
    pub fn structure_score(&self) -> f64 {
        // A ratio-based signal grows linearly with its count until it reaches `cap`.
        // A zero count naturally contributes 0.0, so no separate guard is required.
        let capped = |count: usize, divisor: f64, cap: f64| (count as f64 / divisor).min(cap);

        let headings = capped(self.heading_count, 10.0, 0.5);
        // The presence of a table of contents is worth a fixed bonus, however many
        // indicators were seen.
        let toc = if self.toc_indicators > 0 { 0.3 } else { 0.0 };
        let links = capped(self.link_count, 20.0, 0.3);
        let code_blocks = capped(self.code_block_count, 10.0, 0.2);

        headings + toc + links + code_blocks
    }
}
227
/// Performance metrics for the heuristic system.
///
/// The default value (also returned by [`HeuristicMetrics::new`]) has every counter at
/// zero and an empty `cache_hit_rates` map. Callers fill in the raw counters and then
/// call [`HeuristicMetrics::finalize`] to derive `avg_time_per_file_ms`.
#[derive(Debug, Clone, Default)]
pub struct HeuristicMetrics {
    /// Number of files processed.
    pub files_processed: usize,
    /// Total processing time in milliseconds.
    pub processing_time_ms: u64,
    /// Import graph construction time in milliseconds.
    pub import_graph_time_ms: u64,
    /// Template detection time in milliseconds.
    pub template_detection_time_ms: u64,
    /// Average processing time per file in milliseconds; derived by `finalize`.
    pub avg_time_per_file_ms: f64,
    /// Cache hit rates, keyed by a caller-chosen name.
    pub cache_hit_rates: HashMap<String, f64>,
}

impl HeuristicMetrics {
    /// Create metrics with all counters at zero and no cache hit rates recorded.
    pub fn new() -> Self {
        Self::default()
    }

    /// Recompute `avg_time_per_file_ms` from `processing_time_ms` and `files_processed`.
    ///
    /// When no files were processed the average is set to `0.0`. This avoids a division
    /// by zero and also clears any average left over from an earlier `finalize` call on
    /// a reused (or cloned) metrics value, so the derived field never disagrees with the
    /// counters it is derived from.
    pub fn finalize(&mut self) {
        self.avg_time_per_file_ms = if self.files_processed == 0 {
            0.0
        } else {
            self.processing_time_ms as f64 / self.files_processed as f64
        };
    }
}
270
#[cfg(test)]
mod tests {
    use super::*;

    /// Both the default constructor and the V2 constructor must succeed.
    #[test]
    fn test_heuristic_system_creation() {
        assert!(HeuristicSystem::new().is_ok());
        assert!(HeuristicSystem::with_v2_features().is_ok());
    }

    /// A document with headings, links and code blocks gets a positive, bounded score.
    #[test]
    fn test_document_analysis() {
        let doc = DocumentAnalysis {
            heading_count: 5,
            link_count: 10,
            code_block_count: 3,
            ..DocumentAnalysis::new()
        };

        let score = doc.structure_score();
        assert!(score > 0.0);
        assert!(score < 2.0); // Should be reasonable
    }

    /// `finalize` derives the per-file average from the raw counters.
    #[test]
    fn test_metrics() {
        let mut metrics = HeuristicMetrics {
            files_processed: 100,
            processing_time_ms: 500,
            ..HeuristicMetrics::new()
        };
        metrics.finalize();

        assert_eq!(metrics.avg_time_per_file_ms, 5.0);
    }
}