scribe_analysis/heuristics/scoring/
normalization.rs

1//! Score normalization logic for consistent heuristic scoring
2
3use super::types::{RawScoreComponents, HeuristicWeights};
4use super::super::ScanResult;
5
/// Normalized score components after statistical normalization.
///
/// Each field is the raw component divided by the corresponding corpus-wide
/// maximum from `NormalizationStats`, so values are typically in `0.0..=1.0`.
#[derive(Debug, Clone)]
pub struct NormalizedScores {
    /// Documentation score (raw doc score / max observed doc score).
    pub doc_score: f64,
    /// README score; root-level READMEs are boosted upstream (1.5 vs 1.0).
    pub readme_score: f64,
    /// Combined import degree score (in-degree weighted 0.7, out-degree 0.3).
    pub import_score: f64,
    /// Inverted path-depth score: shallower paths score higher.
    pub path_score: f64,
    /// Test-linkage score.
    pub test_link_score: f64,
    /// Git churn (commit activity) score.
    pub churn_score: f64,
    /// Import-graph centrality score.
    pub centrality_score: f64,
    /// Exactly 1.0 if the file is an entrypoint, otherwise 0.0.
    pub entrypoint_score: f64,
    /// Example-content score.
    pub examples_score: f64,
}
19
/// Corpus-wide maxima used as denominators when normalizing raw scores.
///
/// `build_normalization_stats` floors every field (to 1, 1.0, or 0.1) so
/// the divisions in `normalize_scores` can never hit zero.
#[derive(Debug, Clone)]
pub struct NormalizationStats {
    /// Largest raw documentation score observed (min 1.0).
    pub max_doc_raw: f64,
    /// Largest raw README score observed (min 1.0).
    pub max_readme_raw: f64,
    /// Largest import in-degree observed (min 1).
    pub max_import_degree_in: usize,
    /// Largest import out-degree observed (min 1).
    pub max_import_degree_out: usize,
    /// Deepest path depth observed (min 1).
    pub max_path_depth: usize,
    /// Most test links observed for a single file (min 1).
    pub max_test_links: usize,
    /// Highest churn commit count observed (min 1).
    pub max_churn_commits: usize,
    /// Largest raw centrality value observed (min 0.1).
    pub max_centrality_raw: f64,
    /// Most examples observed in a single file (min 1).
    pub max_examples_count: usize,
}
33
34/// Build normalization statistics from all files
35pub fn build_normalization_stats<T>(files: &[T]) -> NormalizationStats 
36where 
37    T: ScanResult,
38{
39    let mut stats = NormalizationStats {
40        max_doc_raw: 0.0,
41        max_readme_raw: 0.0,
42        max_import_degree_in: 0,
43        max_import_degree_out: 0,
44        max_path_depth: 0,
45        max_test_links: 0,
46        max_churn_commits: 0,
47        max_centrality_raw: 0.0,
48        max_examples_count: 0,
49    };
50    
51    for file in files {
52        // Documentation stats
53        if file.is_docs() {
54            stats.max_doc_raw = stats.max_doc_raw.max(1.0);
55            if let Some(doc_analysis) = file.doc_analysis() {
56                stats.max_doc_raw = stats.max_doc_raw.max(doc_analysis.structure_score());
57            }
58        }
59        
60        // README stats
61        if file.is_readme() {
62            let readme_score = if file.depth() <= 1 { 1.5 } else { 1.0 };
63            stats.max_readme_raw = stats.max_readme_raw.max(readme_score);
64        }
65        
66        // Path depth
67        stats.max_path_depth = stats.max_path_depth.max(file.depth());
68        
69        // Test links (use is_test as proxy)
70        if file.is_test() {
71            stats.max_test_links = stats.max_test_links.max(1);
72        }
73        
74        // Git churn (use churn_score from trait)
75        let churn_score = file.churn_score() as usize;
76        stats.max_churn_commits = stats.max_churn_commits.max(churn_score);
77        
78        // Count examples
79        let examples_count = count_examples_in_file(file);
80        stats.max_examples_count = stats.max_examples_count.max(examples_count);
81    }
82    
83    // Ensure minimums to avoid division by zero
84    stats.max_doc_raw = stats.max_doc_raw.max(1.0);
85    stats.max_readme_raw = stats.max_readme_raw.max(1.0);
86    stats.max_import_degree_in = stats.max_import_degree_in.max(1);
87    stats.max_import_degree_out = stats.max_import_degree_out.max(1);
88    stats.max_path_depth = stats.max_path_depth.max(1);
89    stats.max_test_links = stats.max_test_links.max(1);
90    stats.max_churn_commits = stats.max_churn_commits.max(1);
91    stats.max_centrality_raw = stats.max_centrality_raw.max(0.1);
92    stats.max_examples_count = stats.max_examples_count.max(1);
93    
94    stats
95}
96
97/// Normalize raw scores using statistics
98pub fn normalize_scores(
99    raw_scores: &RawScoreComponents, 
100    stats: &NormalizationStats
101) -> NormalizedScores {
102    NormalizedScores {
103        doc_score: raw_scores.doc_raw / stats.max_doc_raw,
104        readme_score: raw_scores.readme_raw / stats.max_readme_raw,
105        import_score: calculate_import_score(raw_scores, stats),
106        path_score: calculate_path_score(raw_scores, stats),
107        test_link_score: raw_scores.test_links_found as f64 / stats.max_test_links as f64,
108        churn_score: raw_scores.churn_commits as f64 / stats.max_churn_commits as f64,
109        centrality_score: raw_scores.centrality_raw / stats.max_centrality_raw,
110        entrypoint_score: if raw_scores.is_entrypoint { 1.0 } else { 0.0 },
111        examples_score: raw_scores.examples_count as f64 / stats.max_examples_count as f64,
112    }
113}
114
115/// Calculate normalized import score combining in and out degree
116fn calculate_import_score(raw_scores: &RawScoreComponents, stats: &NormalizationStats) -> f64 {
117    let in_score = raw_scores.import_degree_in as f64 / stats.max_import_degree_in as f64;
118    let out_score = raw_scores.import_degree_out as f64 / stats.max_import_degree_out as f64;
119    
120    // Weight incoming imports higher (more important files are imported more)
121    0.7 * in_score + 0.3 * out_score
122}
123
124/// Calculate normalized path score (inverted - deeper paths get lower scores)
125fn calculate_path_score(raw_scores: &RawScoreComponents, stats: &NormalizationStats) -> f64 {
126    // Invert path depth (deeper = lower score)
127    1.0 - (raw_scores.path_depth as f64 / stats.max_path_depth as f64)
128}
129
130/// Count examples or example-like content in a file
131fn count_examples_in_file<T: ScanResult>(file: &T) -> usize {
132    // Use the built-in method from ScanResult trait
133    if file.has_examples() { 1 } else { 0 }
134}
135
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal `ScanResult` implementation for exercising the normalization
    /// helpers without the real scanner. (Removed the unused `PathBuf`
    /// import and the never-read `content` field, which only produced
    /// dead-code warnings.)
    struct MockFile {
        path: String,
        is_docs: bool,
        is_readme: bool,
        depth: usize,
    }

    impl ScanResult for MockFile {
        fn path(&self) -> &str { &self.path }
        fn relative_path(&self) -> &str { &self.path }
        fn depth(&self) -> usize { self.depth }
        fn is_docs(&self) -> bool { self.is_docs }
        fn is_readme(&self) -> bool { self.is_readme }
        fn is_test(&self) -> bool { false }
        fn is_entrypoint(&self) -> bool { false }
        fn has_examples(&self) -> bool { false }
        fn priority_boost(&self) -> f64 { 0.0 }
        fn churn_score(&self) -> f64 { 0.0 }
        fn centrality_in(&self) -> f64 { 0.0 }
        fn imports(&self) -> Option<&[String]> { None }
        fn doc_analysis(&self) -> Option<&crate::heuristics::DocumentAnalysis> { None }
    }

    /// Stats built from a root README and a deeper doc file should have
    /// non-zero doc/README maxima and track the deepest path seen.
    #[test]
    fn test_normalization_stats() {
        let files = vec![
            MockFile {
                path: "README.md".to_string(),
                is_docs: false,
                is_readme: true,
                depth: 1,
            },
            MockFile {
                path: "docs/guide.md".to_string(),
                is_docs: true,
                is_readme: false,
                depth: 2,
            },
        ];

        let stats = build_normalization_stats(&files);
        assert!(stats.max_readme_raw > 0.0);
        assert!(stats.max_doc_raw > 0.0);
        assert_eq!(stats.max_path_depth, 2);
    }

    /// Path depth 3 out of a maximum of 5 gives a ratio of 0.6, so the
    /// inverted score should be 1.0 - 0.6 = 0.4.
    #[test]
    fn test_path_score_inversion() {
        let raw_scores = RawScoreComponents {
            doc_raw: 0.0,
            readme_raw: 0.0,
            import_degree_in: 0,
            import_degree_out: 0,
            path_depth: 3, // Deep path
            test_links_found: 0,
            churn_commits: 0,
            centrality_raw: 0.0,
            is_entrypoint: false,
            examples_count: 0,
        };

        let stats = NormalizationStats {
            max_doc_raw: 1.0,
            max_readme_raw: 1.0,
            max_import_degree_in: 1,
            max_import_degree_out: 1,
            max_path_depth: 5, // Max depth is 5
            max_test_links: 1,
            max_churn_commits: 1,
            max_centrality_raw: 1.0,
            max_examples_count: 1,
        };

        let path_score = calculate_path_score(&raw_scores, &stats);
        assert!((path_score - 0.4).abs() < 0.01);
    }
}