scribe_analysis/heuristics/
mod.rs

1//! # Heuristic Scoring System for File Prioritization
2//!
3//! Implements a sophisticated multi-dimensional scoring system for ranking file importance
4//! within a codebase. This module ports and enhances the Python FastPath heuristic system
5//! with advanced features:
6//!
7//! ## Core Scoring Formula
8//! ```text
9//! score = w_doc*doc + w_readme*readme + w_imp*imp_deg + w_path*path_depth^-1 + 
10//!         w_test*test_link + w_churn*churn + w_centrality*centrality + 
11//!         w_entrypoint*entrypoint + w_examples*examples + priority_boost
12//! ```
13//!
14//! ## Key Features
15//! - **Template Detection**: Advanced template engine detection using multiple methods
16//! - **Import Graph Analysis**: Dependency centrality and PageRank calculations
17//! - **Documentation Intelligence**: README prioritization and document structure analysis
18//! - **Test-Code Relationships**: Heuristic test file linkage detection
19//! - **Git Churn Integration**: Change recency and frequency signals
20//! - **Configurable Weighting**: V1/V2 feature flags with dynamic weight adjustment
21//! - **Performance Optimized**: Caching and lazy evaluation for large codebases
22
23pub mod template_detection;
24pub mod scoring;
25pub mod import_analysis;
26
27pub use scoring::{
28    HeuristicScorer, 
29    ScoreComponents, 
30    HeuristicWeights,
31    ScoringFeatures,
32};
33
34pub use template_detection::{
35    TemplateDetector,
36    TemplateEngine,
37    TemplateDetectionMethod,
38    is_template_file,
39    get_template_score_boost,
40};
41
42pub use import_analysis::{
43    ImportGraphBuilder,
44    ImportGraph,
45    CentralityCalculator,
46    import_matches_file,
47};
48
49use scribe_core::Result;
50use std::collections::HashMap;
51
52/// Main entry point for the heuristic scoring system
53#[derive(Debug)]
54pub struct HeuristicSystem {
55    /// Core file scorer
56    scorer: HeuristicScorer,
57    /// Template detection engine
58    template_detector: TemplateDetector,
59    /// Import graph builder
60    import_builder: ImportGraphBuilder,
61}
62
63impl HeuristicSystem {
64    /// Create a new heuristic system with default configuration
65    pub fn new() -> Result<Self> {
66        Ok(Self {
67            scorer: HeuristicScorer::new(HeuristicWeights::default()),
68            template_detector: TemplateDetector::new()?,
69            import_builder: ImportGraphBuilder::new()?,
70        })
71    }
72    
73    /// Create with custom weights
74    pub fn with_weights(weights: HeuristicWeights) -> Result<Self> {
75        Ok(Self {
76            scorer: HeuristicScorer::new(weights),
77            template_detector: TemplateDetector::new()?,
78            import_builder: ImportGraphBuilder::new()?,
79        })
80    }
81    
82    /// Create with V2 features enabled
83    pub fn with_v2_features() -> Result<Self> {
84        Ok(Self {
85            scorer: HeuristicScorer::new(HeuristicWeights::with_v2_features()),
86            template_detector: TemplateDetector::new()?,
87            import_builder: ImportGraphBuilder::new()?,
88        })
89    }
90    
91    /// Score a single file within the context of all scanned files
92    pub fn score_file<T>(&mut self, file: &T, all_files: &[T]) -> Result<ScoreComponents> 
93    where 
94        T: ScanResult,
95    {
96        self.scorer.score_file(file, all_files)
97    }
98    
99    /// Score all files and return ranked results
100    pub fn score_all_files<T>(&mut self, files: &[T]) -> Result<Vec<(usize, ScoreComponents)>>
101    where 
102        T: ScanResult,
103    {
104        self.scorer.score_all_files(files)
105    }
106    
107    /// Get top-K files by heuristic score
108    pub fn get_top_files<T>(&mut self, files: &[T], top_k: usize) -> Result<Vec<(usize, ScoreComponents)>>
109    where 
110        T: ScanResult,
111    {
112        Ok(self.score_all_files(files)?.into_iter().take(top_k).collect())
113    }
114    
115    /// Get template score boost for a file path
116    pub fn get_template_boost(&self, file_path: &str) -> Result<f64> {
117        self.template_detector.get_score_boost(file_path)
118    }
119    
120    /// Check if two imports match (for import graph construction)
121    pub fn import_matches(&self, import_name: &str, file_path: &str) -> bool {
122        import_analysis::import_matches_file(import_name, file_path)
123    }
124}
125
126impl Default for HeuristicSystem {
127    fn default() -> Self {
128        Self::new().expect("Failed to create HeuristicSystem")
129    }
130}
131
132/// Trait that scan results must implement for heuristic scoring
133pub trait ScanResult {
134    /// Get the file path
135    fn path(&self) -> &str;
136    
137    /// Get the relative path from repository root
138    fn relative_path(&self) -> &str;
139    
140    /// Get file depth (directory nesting level)
141    fn depth(&self) -> usize;
142    
143    /// Check if this is a documentation file
144    fn is_docs(&self) -> bool;
145    
146    /// Check if this is a README file
147    fn is_readme(&self) -> bool;
148    
149    /// Check if this is a test file
150    fn is_test(&self) -> bool;
151    
152    /// Check if this is an entrypoint file (main, index, etc.)
153    fn is_entrypoint(&self) -> bool;
154    
155    /// Check if this file contains examples
156    fn has_examples(&self) -> bool;
157    
158    /// Get the priority boost value (from scanner)
159    fn priority_boost(&self) -> f64;
160    
161    /// Get the churn score (git activity)
162    fn churn_score(&self) -> f64;
163    
164    /// Get centrality in score (PageRank)
165    fn centrality_in(&self) -> f64;
166    
167    /// Get list of import statements
168    fn imports(&self) -> Option<&[String]>;
169    
170    /// Get document analysis results (if available)
171    fn doc_analysis(&self) -> Option<&DocumentAnalysis>;
172}
173
/// Structural facts about a document, used as input to scoring.
///
/// The derived `Default` yields zero for every count and `false` for
/// `is_well_structured`.
#[derive(Debug, Clone, Default)]
pub struct DocumentAnalysis {
    /// How many headings the document has
    pub heading_count: usize,
    /// How many table-of-contents indicators were found
    pub toc_indicators: usize,
    /// How many links the document has
    pub link_count: usize,
    /// How many code blocks the document has
    pub code_block_count: usize,
    /// Whether the document appears well-structured
    pub is_well_structured: bool,
}

impl DocumentAnalysis {
    /// Create an analysis with all counts at zero and `is_well_structured` unset.
    pub fn new() -> Self {
        Self::default()
    }

    /// Compute a structure score from the recorded counts.
    ///
    /// The score is the sum of four capped contributions:
    /// headings (`count / 10`, at most 0.5), a flat 0.3 when any TOC indicator
    /// is present, links (`count / 20`, at most 0.3) and code blocks
    /// (`count / 10`, at most 0.2). The result therefore lies in `0.0..=1.3`.
    /// `is_well_structured` does not take part in the calculation.
    pub fn structure_score(&self) -> f64 {
        // A zero count contributes exactly 0.0, so no explicit guard is needed
        // for the ratio-based terms.
        let heading_part = (self.heading_count as f64 / 10.0).min(0.5);

        // Presence of a table of contents is a flat bonus.
        let toc_part = if self.toc_indicators > 0 { 0.3 } else { 0.0 };

        let link_part = (self.link_count as f64 / 20.0).min(0.3);
        let code_part = (self.code_block_count as f64 / 10.0).min(0.2);

        // Summed left to right: headings, TOC, links, code blocks.
        heading_part + toc_part + link_part + code_part
    }
}
233
/// Performance metrics for the heuristic system.
///
/// The derived `Default` yields zero for every counter and an empty
/// `cache_hit_rates` map.
#[derive(Debug, Clone, Default)]
pub struct HeuristicMetrics {
    /// Number of files processed
    pub files_processed: usize,
    /// Total processing time in milliseconds
    pub processing_time_ms: u64,
    /// Import graph construction time
    pub import_graph_time_ms: u64,
    /// Template detection time
    pub template_detection_time_ms: u64,
    /// Average time per file; recomputed by [`HeuristicMetrics::finalize`]
    pub avg_time_per_file_ms: f64,
    /// Cache hit rates
    pub cache_hit_rates: HashMap<String, f64>,
}

impl HeuristicMetrics {
    /// Create a metrics record with every counter at zero and no cache entries.
    pub fn new() -> Self {
        Self::default()
    }

    /// Recompute `avg_time_per_file_ms` from the current counters.
    ///
    /// The average is `processing_time_ms / files_processed`. When no files
    /// have been processed the average is set to `0.0`, so a reused or reset
    /// record never keeps a stale value from an earlier run (and the division
    /// by zero is avoided).
    pub fn finalize(&mut self) {
        self.avg_time_per_file_ms = if self.files_processed == 0 {
            0.0
        } else {
            self.processing_time_ms as f64 / self.files_processed as f64
        };
    }
}
275
#[cfg(test)]
mod tests {
    use super::*;

    /// Both the default and the V2 constructors must succeed.
    #[test]
    fn test_heuristic_system_creation() {
        assert!(HeuristicSystem::new().is_ok());
        assert!(HeuristicSystem::with_v2_features().is_ok());
    }

    /// A document with some headings, links and code blocks gets a positive,
    /// bounded structure score.
    #[test]
    fn test_document_analysis() {
        let doc = DocumentAnalysis {
            heading_count: 5,
            link_count: 10,
            code_block_count: 3,
            ..DocumentAnalysis::new()
        };

        let score = doc.structure_score();
        assert!(score > 0.0);
        assert!(score < 2.0); // Should be reasonable
    }

    /// `finalize` derives the per-file average from the totals.
    #[test]
    fn test_metrics() {
        let mut metrics = HeuristicMetrics {
            files_processed: 100,
            processing_time_ms: 500,
            ..HeuristicMetrics::new()
        };
        metrics.finalize();

        assert_eq!(metrics.avg_time_per_file_ms, 5.0);
    }
}