scribe_analysis/heuristics/
mod.rs

1//! # Heuristic Scoring System for File Prioritization
2//!
3//! Implements a sophisticated multi-dimensional scoring system for ranking file importance
4//! within a codebase. This module ports and enhances the Python FastPath heuristic system
5//! with advanced features:
6//!
7//! ## Core Scoring Formula
8//! ```text
9//! score = w_doc*doc + w_readme*readme + w_imp*imp_deg + w_path*path_depth^-1 + 
10//!         w_test*test_link + w_churn*churn + w_centrality*centrality + 
11//!         w_entrypoint*entrypoint + w_examples*examples + priority_boost
12//! ```
13//!
14//! ## Key Features
15//! - **Template Detection**: Advanced template engine detection using multiple methods
16//! - **Import Graph Analysis**: Dependency centrality and PageRank calculations
17//! - **Documentation Intelligence**: README prioritization and document structure analysis
18//! - **Test-Code Relationships**: Heuristic test file linkage detection
19//! - **Git Churn Integration**: Change recency and frequency signals
20//! - **Configurable Weighting**: V1/V2 feature flags with dynamic weight adjustment
21//! - **Performance Optimized**: Caching and lazy evaluation for large codebases
22
23pub mod template_detection;
24pub mod scoring;
25pub mod import_analysis;
26pub mod enhanced_scoring;
27
28pub use scoring::{
29    HeuristicScorer, 
30    ScoreComponents,
31    RawScoreComponents,
32    HeuristicWeights,
33    ScoringFeatures,
34    WeightPreset,
35};
36
37pub use template_detection::{
38    TemplateDetector,
39    TemplateEngine,
40    TemplateDetectionMethod,
41    is_template_file,
42    get_template_score_boost,
43};
44
45pub use import_analysis::{
46    ImportGraphBuilder,
47    ImportGraph,
48    CentralityCalculator,
49    import_matches_file,
50};
51
52pub use enhanced_scoring::{
53    EnhancedHeuristicScorer,
54    EnhancedScoreComponents,
55    EnhancedWeights,
56    AdaptiveFactors,
57    RepositoryCharacteristics,
58    ProjectType,
59};
60
61use scribe_core::Result;
62use std::collections::HashMap;
63
64/// Main entry point for the heuristic scoring system
65#[derive(Debug)]
66pub struct HeuristicSystem {
67    /// Core file scorer
68    scorer: HeuristicScorer,
69    /// Template detection engine
70    template_detector: TemplateDetector,
71    /// Import graph builder
72    import_builder: ImportGraphBuilder,
73}
74
75impl HeuristicSystem {
76    /// Create a new heuristic system with default configuration
77    pub fn new() -> Result<Self> {
78        Ok(Self {
79            scorer: HeuristicScorer::new(HeuristicWeights::default()),
80            template_detector: TemplateDetector::new()?,
81            import_builder: ImportGraphBuilder::new()?,
82        })
83    }
84    
85    /// Create with custom weights
86    pub fn with_weights(weights: HeuristicWeights) -> Result<Self> {
87        Ok(Self {
88            scorer: HeuristicScorer::new(weights),
89            template_detector: TemplateDetector::new()?,
90            import_builder: ImportGraphBuilder::new()?,
91        })
92    }
93    
94    /// Create with V2 features enabled
95    pub fn with_v2_features() -> Result<Self> {
96        Ok(Self {
97            scorer: HeuristicScorer::new(HeuristicWeights::with_v2_features()),
98            template_detector: TemplateDetector::new()?,
99            import_builder: ImportGraphBuilder::new()?,
100        })
101    }
102    
103    /// Score a single file within the context of all scanned files
104    pub fn score_file<T>(&mut self, file: &T, all_files: &[T]) -> Result<ScoreComponents> 
105    where 
106        T: ScanResult,
107    {
108        self.scorer.score_file(file, all_files)
109    }
110    
111    /// Score all files and return ranked results
112    pub fn score_all_files<T>(&mut self, files: &[T]) -> Result<Vec<(usize, ScoreComponents)>>
113    where 
114        T: ScanResult,
115    {
116        self.scorer.score_all_files(files)
117    }
118    
119    /// Get top-K files by heuristic score
120    pub fn get_top_files<T>(&mut self, files: &[T], top_k: usize) -> Result<Vec<(usize, ScoreComponents)>>
121    where 
122        T: ScanResult,
123    {
124        Ok(self.score_all_files(files)?.into_iter().take(top_k).collect())
125    }
126    
127    /// Get template score boost for a file path
128    pub fn get_template_boost(&self, file_path: &str) -> Result<f64> {
129        self.template_detector.get_score_boost(file_path)
130    }
131    
132    /// Check if two imports match (for import graph construction)
133    pub fn import_matches(&self, import_name: &str, file_path: &str) -> bool {
134        import_analysis::import_matches_file(import_name, file_path)
135    }
136}
137
138impl Default for HeuristicSystem {
139    fn default() -> Self {
140        Self::new().expect("Failed to create HeuristicSystem")
141    }
142}
143
144/// Trait that scan results must implement for heuristic scoring
145pub trait ScanResult {
146    /// Get the file path
147    fn path(&self) -> &str;
148    
149    /// Get the relative path from repository root
150    fn relative_path(&self) -> &str;
151    
152    /// Get file depth (directory nesting level)
153    fn depth(&self) -> usize;
154    
155    /// Check if this is a documentation file
156    fn is_docs(&self) -> bool;
157    
158    /// Check if this is a README file
159    fn is_readme(&self) -> bool;
160    
161    /// Check if this is a test file
162    fn is_test(&self) -> bool;
163    
164    /// Check if this is an entrypoint file (main, index, etc.)
165    fn is_entrypoint(&self) -> bool;
166    
167    /// Check if this file contains examples
168    fn has_examples(&self) -> bool;
169    
170    /// Get the priority boost value (from scanner)
171    fn priority_boost(&self) -> f64;
172    
173    /// Get the churn score (git activity)
174    fn churn_score(&self) -> f64;
175    
176    /// Get centrality in score (PageRank)
177    fn centrality_in(&self) -> f64;
178    
179    /// Get list of import statements
180    fn imports(&self) -> Option<&[String]>;
181    
182    /// Get document analysis results (if available)
183    fn doc_analysis(&self) -> Option<&DocumentAnalysis>;
184}
185
/// Document analysis results for scoring.
///
/// All counters start at zero and `is_well_structured` starts as `false`
/// (see [`DocumentAnalysis::new`] / `Default`).
#[derive(Debug, Clone, Default)]
pub struct DocumentAnalysis {
    /// Number of headings in the document
    pub heading_count: usize,
    /// Number of table-of-contents indicators
    pub toc_indicators: usize,
    /// Number of links in the document
    pub link_count: usize,
    /// Number of code blocks
    pub code_block_count: usize,
    /// Whether the document appears well-structured.
    ///
    /// This flag is carried for callers; [`DocumentAnalysis::structure_score`]
    /// does not read it.
    pub is_well_structured: bool,
}

impl DocumentAnalysis {
    /// Create an empty analysis: every counter is `0` and
    /// `is_well_structured` is `false`. Identical to `Default::default()`.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Calculate a structure score from the collected counters.
    ///
    /// The score is the sum of four independent, capped contributions:
    ///
    /// | signal            | contribution                 | cap |
    /// |-------------------|------------------------------|-----|
    /// | headings          | `heading_count / 10`         | 0.5 |
    /// | TOC indicators    | flat `0.3` if any is present | 0.3 |
    /// | links             | `link_count / 20`            | 0.3 |
    /// | code blocks       | `code_block_count / 10`      | 0.2 |
    ///
    /// The result therefore lies in `0.0..=1.3`.
    #[must_use]
    pub fn structure_score(&self) -> f64 {
        // Heading structure bonus (a zero count contributes exactly 0.0).
        let headings = (self.heading_count as f64 / 10.0).min(0.5);

        // A table of contents indicates a well-organized document.
        let toc = if self.toc_indicators > 0 { 0.3 } else { 0.0 };

        // Links indicate a reference document.
        let links = (self.link_count as f64 / 20.0).min(0.3);

        // Code blocks in docs indicate technical documentation.
        let code_blocks = (self.code_block_count as f64 / 10.0).min(0.2);

        headings + toc + links + code_blocks
    }
}
245
/// Performance metrics for the heuristic system.
///
/// All numeric fields start at zero and `cache_hit_rates` starts empty
/// (see [`HeuristicMetrics::new`] / `Default`).
#[derive(Debug, Clone, Default)]
pub struct HeuristicMetrics {
    /// Number of files processed
    pub files_processed: usize,
    /// Total processing time in milliseconds
    pub processing_time_ms: u64,
    /// Import graph construction time
    pub import_graph_time_ms: u64,
    /// Template detection time
    pub template_detection_time_ms: u64,
    /// Average time per file; derived by [`HeuristicMetrics::finalize`]
    pub avg_time_per_file_ms: f64,
    /// Cache hit rates
    pub cache_hit_rates: HashMap<String, f64>,
}

impl HeuristicMetrics {
    /// Create an empty metrics record. Identical to `Default::default()`.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Recompute `avg_time_per_file_ms` from the current counters.
    ///
    /// Sets the average to `processing_time_ms / files_processed`. When no
    /// files were processed the average is reset to `0.0`, so a record whose
    /// counters were cleared never keeps reporting a stale average.
    pub fn finalize(&mut self) {
        self.avg_time_per_file_ms = if self.files_processed > 0 {
            self.processing_time_ms as f64 / self.files_processed as f64
        } else {
            // Division by zero is undefined here; fall back to the initial value.
            0.0
        };
    }
}
287
#[cfg(test)]
mod tests {
    use super::*;

    /// Both the default and the V2 constructors must succeed.
    #[test]
    fn test_heuristic_system_creation() {
        let default_system = HeuristicSystem::new();
        assert!(default_system.is_ok());

        let v2_enabled_system = HeuristicSystem::with_v2_features();
        assert!(v2_enabled_system.is_ok());
    }

    /// A document with headings, links and code blocks gets a positive,
    /// bounded structure score.
    #[test]
    fn test_document_analysis() {
        let analysis = DocumentAnalysis {
            heading_count: 5,
            link_count: 10,
            code_block_count: 3,
            ..DocumentAnalysis::new()
        };

        let structure = analysis.structure_score();
        assert!(structure > 0.0);
        assert!(structure < 2.0); // Should be reasonable
    }

    /// `finalize` derives the per-file average from the totals.
    #[test]
    fn test_metrics() {
        let mut recorded = HeuristicMetrics {
            files_processed: 100,
            processing_time_ms: 500,
            ..HeuristicMetrics::new()
        };
        recorded.finalize();

        assert_eq!(recorded.avg_time_per_file_ms, 5.0);
    }
}