scribe_analysis/heuristics/
mod.rs

1//! # Heuristic Scoring System for File Prioritization
2//!
3//! Implements a sophisticated multi-dimensional scoring system for ranking file importance
4//! within a codebase. This module ports and enhances the Python FastPath heuristic system
5//! with advanced features:
6//!
7//! ## Core Scoring Formula
8//! ```text
9//! score = w_doc*doc + w_readme*readme + w_imp*imp_deg + w_path*path_depth^-1 +
10//!         w_test*test_link + w_churn*churn + w_centrality*centrality +
11//!         w_entrypoint*entrypoint + w_examples*examples + priority_boost
12//! ```
13//!
14//! ## Key Features
15//! - **Template Detection**: Advanced template engine detection using multiple methods
16//! - **Import Graph Analysis**: Dependency centrality and PageRank calculations
17//! - **Documentation Intelligence**: README prioritization and document structure analysis
18//! - **Test-Code Relationships**: Heuristic test file linkage detection
19//! - **Git Churn Integration**: Change recency and frequency signals
20//! - **Configurable Weighting**: V1/V2 feature flags with dynamic weight adjustment
21//! - **Performance Optimized**: Caching and lazy evaluation for large codebases
22
23pub mod enhanced_scoring;
24pub mod import_analysis;
25pub mod scoring;
26pub mod template_detection;
27
28pub use scoring::{
29    HeuristicScorer, HeuristicWeights, RawScoreComponents, ScoreComponents, ScoringFeatures,
30    WeightPreset,
31};
32
33pub use template_detection::{
34    get_template_score_boost, is_template_file, TemplateDetectionMethod, TemplateDetector,
35    TemplateEngine,
36};
37
38pub use import_analysis::{
39    import_matches_file, CentralityCalculator, ImportGraph, ImportGraphBuilder,
40};
41
42pub use enhanced_scoring::{
43    AdaptiveFactors, EnhancedHeuristicScorer, EnhancedScoreComponents, EnhancedWeights,
44    ProjectType, RepositoryCharacteristics,
45};
46
47use scribe_core::Result;
48use std::collections::HashMap;
49
50/// Main entry point for the heuristic scoring system
51#[derive(Debug)]
52pub struct HeuristicSystem {
53    /// Core file scorer
54    scorer: HeuristicScorer,
55    /// Template detection engine
56    template_detector: TemplateDetector,
57    /// Import graph builder
58    import_builder: ImportGraphBuilder,
59}
60
61impl HeuristicSystem {
62    /// Create a new heuristic system with default configuration
63    pub fn new() -> Result<Self> {
64        Ok(Self {
65            scorer: HeuristicScorer::new(HeuristicWeights::default()),
66            template_detector: TemplateDetector::new()?,
67            import_builder: ImportGraphBuilder::new()?,
68        })
69    }
70
71    /// Create with custom weights
72    pub fn with_weights(weights: HeuristicWeights) -> Result<Self> {
73        Ok(Self {
74            scorer: HeuristicScorer::new(weights),
75            template_detector: TemplateDetector::new()?,
76            import_builder: ImportGraphBuilder::new()?,
77        })
78    }
79
80    /// Create with V2 features enabled
81    pub fn with_v2_features() -> Result<Self> {
82        Ok(Self {
83            scorer: HeuristicScorer::new(HeuristicWeights::with_v2_features()),
84            template_detector: TemplateDetector::new()?,
85            import_builder: ImportGraphBuilder::new()?,
86        })
87    }
88
89    /// Score a single file within the context of all scanned files
90    pub fn score_file<T>(&mut self, file: &T, all_files: &[T]) -> Result<ScoreComponents>
91    where
92        T: ScanResult,
93    {
94        self.scorer.score_file(file, all_files)
95    }
96
97    /// Score all files and return ranked results
98    pub fn score_all_files<T>(&mut self, files: &[T]) -> Result<Vec<(usize, ScoreComponents)>>
99    where
100        T: ScanResult,
101    {
102        self.scorer.score_all_files(files)
103    }
104
105    /// Get top-K files by heuristic score
106    pub fn get_top_files<T>(
107        &mut self,
108        files: &[T],
109        top_k: usize,
110    ) -> Result<Vec<(usize, ScoreComponents)>>
111    where
112        T: ScanResult,
113    {
114        Ok(self
115            .score_all_files(files)?
116            .into_iter()
117            .take(top_k)
118            .collect())
119    }
120
121    /// Get template score boost for a file path
122    pub fn get_template_boost(&self, file_path: &str) -> Result<f64> {
123        self.template_detector.get_score_boost(file_path)
124    }
125
126    /// Check if two imports match (for import graph construction)
127    pub fn import_matches(&self, import_name: &str, file_path: &str) -> bool {
128        import_analysis::import_matches_file(import_name, file_path)
129    }
130}
131
132impl Default for HeuristicSystem {
133    fn default() -> Self {
134        Self::new().expect("Failed to create HeuristicSystem")
135    }
136}
137
138/// Trait that scan results must implement for heuristic scoring
139pub trait ScanResult {
140    /// Get the file path
141    fn path(&self) -> &str;
142
143    /// Get the relative path from repository root
144    fn relative_path(&self) -> &str;
145
146    /// Get file depth (directory nesting level)
147    fn depth(&self) -> usize;
148
149    /// Check if this is a documentation file
150    fn is_docs(&self) -> bool;
151
152    /// Check if this is a README file
153    fn is_readme(&self) -> bool;
154
155    /// Check if this is a test file
156    fn is_test(&self) -> bool;
157
158    /// Check if this is an entrypoint file (main, index, etc.)
159    fn is_entrypoint(&self) -> bool;
160
161    /// Check if this file contains examples
162    fn has_examples(&self) -> bool;
163
164    /// Get the priority boost value (from scanner)
165    fn priority_boost(&self) -> f64;
166
167    /// Get the churn score (git activity)
168    fn churn_score(&self) -> f64;
169
170    /// Get centrality in score (PageRank)
171    fn centrality_in(&self) -> f64;
172
173    /// Get list of import statements
174    fn imports(&self) -> Option<&[String]>;
175
176    /// Get document analysis results (if available)
177    fn doc_analysis(&self) -> Option<&DocumentAnalysis>;
178}
179
/// Structural statistics of a document, used as input to scoring.
///
/// `Default` (and [`DocumentAnalysis::new`]) yield an all-zero analysis with
/// `is_well_structured` set to `false`.
#[derive(Debug, Clone, Default)]
pub struct DocumentAnalysis {
    /// Number of headings in the document
    pub heading_count: usize,
    /// Number of table-of-contents indicators
    pub toc_indicators: usize,
    /// Number of links in the document
    pub link_count: usize,
    /// Number of code blocks
    pub code_block_count: usize,
    /// Whether the document appears well-structured
    pub is_well_structured: bool,
}

impl DocumentAnalysis {
    /// Create an empty analysis: every counter is zero and
    /// `is_well_structured` is `false`.
    pub fn new() -> Self {
        Self::default()
    }

    /// Calculate a structure score from the counters.
    ///
    /// The score is the sum of four independently capped contributions:
    ///
    /// | signal        | contribution            | cap |
    /// |---------------|-------------------------|-----|
    /// | headings      | `heading_count / 10`    | 0.5 |
    /// | TOC indicator | flat 0.3 if any present | 0.3 |
    /// | links         | `link_count / 20`       | 0.3 |
    /// | code blocks   | `code_block_count / 10` | 0.2 |
    ///
    /// The result therefore lies in `0.0..=1.3`. The `is_well_structured`
    /// flag does not take part in the calculation.
    pub fn structure_score(&self) -> f64 {
        // A zero counter yields a 0.0 contribution on its own, so no explicit
        // `> 0` guard is needed for the proportional terms.
        let headings = (self.heading_count as f64 / 10.0).min(0.5);

        // Any TOC indicator earns the flat bonus; the count itself is ignored.
        let toc = if self.toc_indicators > 0 { 0.3 } else { 0.0 };

        let links = (self.link_count as f64 / 20.0).min(0.3);
        let code_blocks = (self.code_block_count as f64 / 10.0).min(0.2);

        // Summed in this fixed order so the floating-point result is stable.
        headings + toc + links + code_blocks
    }
}
239
/// Performance metrics for the heuristic system.
///
/// `Default` (and [`HeuristicMetrics::new`]) yield zeroed counters and an
/// empty cache-hit-rate map. Fill in the counters, then call
/// [`HeuristicMetrics::finalize`] to derive the per-file average.
#[derive(Debug, Clone, Default)]
pub struct HeuristicMetrics {
    /// Number of files processed
    pub files_processed: usize,
    /// Total processing time in milliseconds
    pub processing_time_ms: u64,
    /// Import graph construction time in milliseconds
    pub import_graph_time_ms: u64,
    /// Template detection time in milliseconds
    pub template_detection_time_ms: u64,
    /// Average time per file in milliseconds; set by [`HeuristicMetrics::finalize`]
    pub avg_time_per_file_ms: f64,
    /// Cache hit rates, keyed by a string label
    // NOTE(review): the key is presumably the cache name — confirm with callers.
    pub cache_hit_rates: HashMap<String, f64>,
}

impl HeuristicMetrics {
    /// Create metrics with all counters at zero and no cache statistics.
    pub fn new() -> Self {
        Self::default()
    }

    /// Recompute `avg_time_per_file_ms` from the current counters.
    ///
    /// The average is `processing_time_ms / files_processed`. When no files
    /// were processed it is reset to `0.0`, so a reused or hand-edited value
    /// never outlives the counters it was derived from and no division by
    /// zero takes place.
    pub fn finalize(&mut self) {
        self.avg_time_per_file_ms = if self.files_processed == 0 {
            0.0
        } else {
            self.processing_time_ms as f64 / self.files_processed as f64
        };
    }
}
282
#[cfg(test)]
mod tests {
    use super::*;

    /// Every public constructor succeeds with its built-in configuration.
    #[test]
    fn test_heuristic_system_creation() {
        assert!(HeuristicSystem::new().is_ok());
        assert!(HeuristicSystem::with_v2_features().is_ok());
        assert!(HeuristicSystem::with_weights(HeuristicWeights::default()).is_ok());
    }

    /// The structure score is the sum of the capped per-signal contributions.
    #[test]
    fn test_document_analysis() {
        let doc = DocumentAnalysis {
            heading_count: 5,
            link_count: 10,
            code_block_count: 3,
            ..DocumentAnalysis::new()
        };

        // 5 headings      -> 0.5
        // 10 links        -> 0.5, capped at 0.3
        // 3 code blocks   -> 0.3, capped at 0.2
        let score = doc.structure_score();
        assert!(
            (score - 1.0).abs() < 1e-9,
            "expected a structure score of 1.0, got {score}"
        );
    }

    /// A document with no structural signals scores zero.
    #[test]
    fn test_document_analysis_empty() {
        assert_eq!(DocumentAnalysis::new().structure_score(), 0.0);
    }

    /// `finalize` derives the per-file average from the two counters.
    #[test]
    fn test_metrics() {
        let mut metrics = HeuristicMetrics::new();
        metrics.files_processed = 100;
        metrics.processing_time_ms = 500;
        metrics.finalize();

        assert!((metrics.avg_time_per_file_ms - 5.0).abs() < f64::EPSILON);
    }
}