scribe_analysis/heuristics/scoring/
normalization.rs

1//! Score normalization logic for consistent heuristic scoring
2
3use super::super::ScanResult;
4use super::types::{HeuristicWeights, RawScoreComponents};
5
/// Per-file score components after normalization.
///
/// Built by [`normalize_scores`], which divides each raw component by the
/// matching maximum in [`NormalizationStats`]. `Default` yields an all-zero
/// score set, and `PartialEq` allows results to be compared directly.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct NormalizedScores {
    /// Raw documentation score relative to the largest documentation score.
    pub doc_score: f64,
    /// Raw README score relative to the largest README score.
    pub readme_score: f64,
    /// Weighted blend of relative import in-degree and out-degree.
    pub import_score: f64,
    /// Inverted relative path depth: shallower files score higher.
    pub path_score: f64,
    /// Test links found relative to the largest test-link count.
    pub test_link_score: f64,
    /// Churn commits relative to the largest churn count.
    pub churn_score: f64,
    /// Raw centrality relative to the largest centrality value.
    pub centrality_score: f64,
    /// `1.0` when the file is an entrypoint, `0.0` otherwise.
    pub entrypoint_score: f64,
    /// Example count relative to the largest example count.
    pub examples_score: f64,
}
19
/// Per-signal maxima used as divisors when normalizing raw scores.
///
/// Values returned by [`build_normalization_stats`] are floored to a positive
/// minimum, so they can be divided by without a zero check. All fields are
/// public, so a hand-built value carries no such guarantee.
#[derive(Debug, Clone, PartialEq)]
pub struct NormalizationStats {
    /// Divisor for the raw documentation score.
    pub max_doc_raw: f64,
    /// Divisor for the raw README score.
    pub max_readme_raw: f64,
    /// Divisor for the import in-degree.
    pub max_import_degree_in: usize,
    /// Divisor for the import out-degree.
    pub max_import_degree_out: usize,
    /// Divisor for the path depth.
    pub max_path_depth: usize,
    /// Divisor for the number of test links found.
    pub max_test_links: usize,
    /// Divisor for the churn commit count.
    pub max_churn_commits: usize,
    /// Divisor for the raw centrality value.
    pub max_centrality_raw: f64,
    /// Divisor for the example count.
    pub max_examples_count: usize,
}
33
34/// Build normalization statistics from all files
35pub fn build_normalization_stats<T>(files: &[T]) -> NormalizationStats
36where
37    T: ScanResult,
38{
39    let mut stats = NormalizationStats {
40        max_doc_raw: 0.0,
41        max_readme_raw: 0.0,
42        max_import_degree_in: 0,
43        max_import_degree_out: 0,
44        max_path_depth: 0,
45        max_test_links: 0,
46        max_churn_commits: 0,
47        max_centrality_raw: 0.0,
48        max_examples_count: 0,
49    };
50
51    for file in files {
52        // Documentation stats
53        if file.is_docs() {
54            stats.max_doc_raw = stats.max_doc_raw.max(1.0);
55            if let Some(doc_analysis) = file.doc_analysis() {
56                stats.max_doc_raw = stats.max_doc_raw.max(doc_analysis.structure_score());
57            }
58        }
59
60        // README stats
61        if file.is_readme() {
62            let readme_score = if file.depth() <= 1 { 1.5 } else { 1.0 };
63            stats.max_readme_raw = stats.max_readme_raw.max(readme_score);
64        }
65
66        // Path depth
67        stats.max_path_depth = stats.max_path_depth.max(file.depth());
68
69        // Test links (use is_test as proxy)
70        if file.is_test() {
71            stats.max_test_links = stats.max_test_links.max(1);
72        }
73
74        // Git churn (use churn_score from trait)
75        let churn_score = file.churn_score() as usize;
76        stats.max_churn_commits = stats.max_churn_commits.max(churn_score);
77
78        // Count examples
79        let examples_count = count_examples_in_file(file);
80        stats.max_examples_count = stats.max_examples_count.max(examples_count);
81    }
82
83    // Ensure minimums to avoid division by zero
84    stats.max_doc_raw = stats.max_doc_raw.max(1.0);
85    stats.max_readme_raw = stats.max_readme_raw.max(1.0);
86    stats.max_import_degree_in = stats.max_import_degree_in.max(1);
87    stats.max_import_degree_out = stats.max_import_degree_out.max(1);
88    stats.max_path_depth = stats.max_path_depth.max(1);
89    stats.max_test_links = stats.max_test_links.max(1);
90    stats.max_churn_commits = stats.max_churn_commits.max(1);
91    stats.max_centrality_raw = stats.max_centrality_raw.max(0.1);
92    stats.max_examples_count = stats.max_examples_count.max(1);
93
94    stats
95}
96
97/// Normalize raw scores using statistics
98pub fn normalize_scores(
99    raw_scores: &RawScoreComponents,
100    stats: &NormalizationStats,
101) -> NormalizedScores {
102    NormalizedScores {
103        doc_score: raw_scores.doc_raw / stats.max_doc_raw,
104        readme_score: raw_scores.readme_raw / stats.max_readme_raw,
105        import_score: calculate_import_score(raw_scores, stats),
106        path_score: calculate_path_score(raw_scores, stats),
107        test_link_score: raw_scores.test_links_found as f64 / stats.max_test_links as f64,
108        churn_score: raw_scores.churn_commits as f64 / stats.max_churn_commits as f64,
109        centrality_score: raw_scores.centrality_raw / stats.max_centrality_raw,
110        entrypoint_score: if raw_scores.is_entrypoint { 1.0 } else { 0.0 },
111        examples_score: raw_scores.examples_count as f64 / stats.max_examples_count as f64,
112    }
113}
114
115/// Calculate normalized import score combining in and out degree
116fn calculate_import_score(raw_scores: &RawScoreComponents, stats: &NormalizationStats) -> f64 {
117    let in_score = raw_scores.import_degree_in as f64 / stats.max_import_degree_in as f64;
118    let out_score = raw_scores.import_degree_out as f64 / stats.max_import_degree_out as f64;
119
120    // Weight incoming imports higher (more important files are imported more)
121    0.7 * in_score + 0.3 * out_score
122}
123
124/// Calculate normalized path score (inverted - deeper paths get lower scores)
125fn calculate_path_score(raw_scores: &RawScoreComponents, stats: &NormalizationStats) -> f64 {
126    // Invert path depth (deeper = lower score)
127    1.0 - (raw_scores.path_depth as f64 / stats.max_path_depth as f64)
128}
129
130/// Count examples or example-like content in a file
131fn count_examples_in_file<T: ScanResult>(file: &T) -> usize {
132    // Use the built-in method from ScanResult trait
133    if file.has_examples() {
134        1
135    } else {
136        0
137    }
138}
139
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance for comparing floating-point scores.
    const TOLERANCE: f64 = 1e-9;

    /// Minimal `ScanResult` implementation: only path, docs/readme flags and
    /// depth vary; every other signal reports a neutral value.
    struct MockFile {
        path: String,
        is_docs: bool,
        is_readme: bool,
        depth: usize,
    }

    impl ScanResult for MockFile {
        fn path(&self) -> &str {
            &self.path
        }
        fn relative_path(&self) -> &str {
            &self.path
        }
        fn depth(&self) -> usize {
            self.depth
        }
        fn is_docs(&self) -> bool {
            self.is_docs
        }
        fn is_readme(&self) -> bool {
            self.is_readme
        }
        fn is_test(&self) -> bool {
            false
        }
        fn is_entrypoint(&self) -> bool {
            false
        }
        fn has_examples(&self) -> bool {
            false
        }
        fn priority_boost(&self) -> f64 {
            0.0
        }
        fn churn_score(&self) -> f64 {
            0.0
        }
        fn centrality_in(&self) -> f64 {
            0.0
        }
        fn imports(&self) -> Option<&[String]> {
            None
        }
        fn doc_analysis(&self) -> Option<&crate::heuristics::DocumentAnalysis> {
            None
        }
    }

    #[test]
    fn test_normalization_stats() {
        let files = vec![
            MockFile {
                path: "README.md".to_string(),
                is_docs: false,
                is_readme: true,
                depth: 1,
            },
            MockFile {
                path: "docs/guide.md".to_string(),
                is_docs: true,
                is_readme: false,
                depth: 2,
            },
        ];

        let stats = build_normalization_stats(&files);
        // Exact values are pinned: a `> 0.0` check could never fail because
        // every field is floored to a positive minimum.
        // A README at depth 1 weighs 1.5.
        assert!((stats.max_readme_raw - 1.5).abs() < TOLERANCE);
        // A docs file without document analysis counts as 1.0.
        assert!((stats.max_doc_raw - 1.0).abs() < TOLERANCE);
        assert_eq!(stats.max_path_depth, 2);
        // Signals the mock never reports fall back to their floors.
        assert_eq!(stats.max_test_links, 1);
        assert_eq!(stats.max_churn_commits, 1);
        assert_eq!(stats.max_examples_count, 1);
    }

    #[test]
    fn test_normalization_stats_empty_input() {
        let files: Vec<MockFile> = Vec::new();

        let stats = build_normalization_stats(&files);
        // With no files every field must sit at its division-safe floor.
        assert!((stats.max_doc_raw - 1.0).abs() < TOLERANCE);
        assert!((stats.max_readme_raw - 1.0).abs() < TOLERANCE);
        assert_eq!(stats.max_import_degree_in, 1);
        assert_eq!(stats.max_import_degree_out, 1);
        assert_eq!(stats.max_path_depth, 1);
        assert_eq!(stats.max_test_links, 1);
        assert_eq!(stats.max_churn_commits, 1);
        assert!((stats.max_centrality_raw - 0.1).abs() < TOLERANCE);
        assert_eq!(stats.max_examples_count, 1);
    }

    #[test]
    fn test_path_score_inversion() {
        let raw_scores = RawScoreComponents {
            doc_raw: 0.0,
            readme_raw: 0.0,
            import_degree_in: 0,
            import_degree_out: 0,
            path_depth: 3, // Deep path
            test_links_found: 0,
            churn_commits: 0,
            centrality_raw: 0.0,
            is_entrypoint: false,
            examples_count: 0,
        };

        let stats = NormalizationStats {
            max_doc_raw: 1.0,
            max_readme_raw: 1.0,
            max_import_degree_in: 1,
            max_import_degree_out: 1,
            max_path_depth: 5, // Max depth is 5
            max_test_links: 1,
            max_churn_commits: 1,
            max_centrality_raw: 1.0,
            max_examples_count: 1,
        };

        let path_score = calculate_path_score(&raw_scores, &stats);
        // Path depth 3/5 = 0.6, so inverted score should be 1.0 - 0.6 = 0.4
        assert!((path_score - 0.4).abs() < TOLERANCE);
    }
}
252}