scribe_analysis/heuristics/scoring/normalization.rs

//! Score normalization logic for consistent heuristic scoring

use super::super::ScanResult;
use super::types::ScoringFeatures;

/// Normalized score components after statistical normalization
#[derive(Debug, Clone)]
pub struct NormalizedScores {
    pub doc_score: f64,
    pub readme_score: f64,
    pub import_score: f64,
    pub path_score: f64,
    pub test_link_score: f64,
    pub churn_score: f64,
    pub centrality_score: f64,
    pub entrypoint_score: f64,
    pub examples_score: f64,
}

/// Statistics for normalization across all files
#[derive(Debug, Clone)]
pub struct NormalizationStats {
    pub max_doc_raw: f64,
    pub max_readme_raw: f64,
    pub max_import_degree_in: usize,
    pub max_import_degree_out: usize,
    pub max_path_depth: usize,
    pub max_test_links: usize,
    pub max_churn_commits: usize,
    pub max_centrality_raw: f64,
    pub max_examples_count: usize,
}

/// Build normalization statistics from all files
pub fn build_normalization_stats<T>(files: &[T]) -> NormalizationStats
where
    T: ScanResult,
{
    let mut stats = NormalizationStats {
        max_doc_raw: 0.0,
        max_readme_raw: 0.0,
        max_import_degree_in: 0,
        max_import_degree_out: 0,
        max_path_depth: 0,
        max_test_links: 0,
        max_churn_commits: 0,
        max_centrality_raw: 0.0,
        max_examples_count: 0,
    };

    for file in files {
        // Documentation stats: track the same raw value that
        // `normalize_scores` computes (base 1.0 for docs plus any structure
        // score), so the normalized doc_score cannot exceed 1.0
        let mut doc_raw = if file.is_docs() { 1.0 } else { 0.0 };
        if let Some(doc_analysis) = file.doc_analysis() {
            doc_raw += doc_analysis.structure_score();
        }
        stats.max_doc_raw = stats.max_doc_raw.max(doc_raw);

        // README stats
        if file.is_readme() {
            let readme_score = if file.depth() <= 1 { 1.5 } else { 1.0 };
            stats.max_readme_raw = stats.max_readme_raw.max(readme_score);
        }

        // Path depth
        stats.max_path_depth = stats.max_path_depth.max(file.depth());

        // Test links (use is_test as a proxy)
        if file.is_test() {
            stats.max_test_links = stats.max_test_links.max(1);
        }

        // Git churn (use churn_score from the trait); round the same way as
        // `normalize_scores` so the ceiling matches the per-file values
        let churn_commits = file.churn_score().round().max(0.0) as usize;
        stats.max_churn_commits = stats.max_churn_commits.max(churn_commits);

        if let Some(imports) = file.imports() {
            stats.max_import_degree_out = stats.max_import_degree_out.max(imports.len());
        }

        let centrality_raw = file.centrality_in();
        stats.max_import_degree_in = stats
            .max_import_degree_in
            .max(centrality_raw.round().max(0.0) as usize);
        stats.max_centrality_raw = stats.max_centrality_raw.max(centrality_raw);

        // Count examples
        let examples_count = if file.has_examples() { 1 } else { 0 };
        stats.max_examples_count = stats.max_examples_count.max(examples_count);
    }

    // Ensure minimums to avoid division by zero
    stats.max_doc_raw = stats.max_doc_raw.max(1.0);
    stats.max_readme_raw = stats.max_readme_raw.max(1.0);
    stats.max_import_degree_in = stats.max_import_degree_in.max(1);
    stats.max_import_degree_out = stats.max_import_degree_out.max(1);
    stats.max_path_depth = stats.max_path_depth.max(1);
    stats.max_test_links = stats.max_test_links.max(1);
    stats.max_churn_commits = stats.max_churn_commits.max(1);
    stats.max_centrality_raw = stats.max_centrality_raw.max(0.1);
    stats.max_examples_count = stats.max_examples_count.max(1);

    stats
}
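
// A minimal usage sketch (hypothetical call site; `files` and `features` are
// assumptions, standing in for any slice whose element type implements
// `ScanResult` and the caller's `ScoringFeatures`). The stats are built once
// per scan and reused so every file's components land on the same
// 0.0..=1.0 scale:
//
//     let stats = build_normalization_stats(&files);
//     let scores: Vec<NormalizedScores> = files
//         .iter()
//         .map(|file| normalize_scores(file, &stats, &features))
//         .collect();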

/// Normalize raw scores using statistics
pub fn normalize_scores<T: ScanResult>(
    file: &T,
    stats: &NormalizationStats,
    features: &ScoringFeatures,
) -> NormalizedScores {
    let mut doc_raw = if file.is_docs() { 1.0 } else { 0.0 };
    if features.enable_doc_analysis {
        if let Some(doc_analysis) = file.doc_analysis() {
            doc_raw += doc_analysis.structure_score();
        }
    }

    let readme_raw = if file.is_readme() {
        if file.depth() <= 1 {
            1.5
        } else {
            1.0
        }
    } else {
        0.0
    };

    let import_out = file.imports().map_or(0, |imports| imports.len());
    let import_in = file.centrality_in().round().max(0.0) as usize;

    let path_depth = file.depth();

    let test_links_found = if features.enable_test_linking && file.is_test() {
        1
    } else {
        0
    };

    let churn_commits = if features.enable_churn_analysis {
        file.churn_score().round().max(0.0) as usize
    } else {
        0
    };

    let centrality_raw = if features.enable_centrality {
        file.centrality_in()
    } else {
        0.0
    };

    let examples_count = if features.enable_examples_detection && file.has_examples() {
        1
    } else {
        0
    };

    NormalizedScores {
        doc_score: doc_raw / stats.max_doc_raw,
        readme_score: readme_raw / stats.max_readme_raw,
        import_score: calculate_import_score(import_in, import_out, stats),
        path_score: calculate_path_score(path_depth, stats),
        test_link_score: test_links_found as f64 / stats.max_test_links as f64,
        churn_score: churn_commits as f64 / stats.max_churn_commits as f64,
        centrality_score: centrality_raw / stats.max_centrality_raw,
        entrypoint_score: if file.is_entrypoint() { 1.0 } else { 0.0 },
        examples_score: examples_count as f64 / stats.max_examples_count as f64,
    }
}
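
// Note on feature gating: test linking, churn, centrality, and examples
// detection fall back to a raw value of zero when their flags are off, so
// disabling one of them zeroes that component. `enable_doc_analysis` only
// gates the structure-score bonus; docs files still receive the base 1.0.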

/// Calculate normalized import score combining in- and out-degree
fn calculate_import_score(import_in: usize, import_out: usize, stats: &NormalizationStats) -> f64 {
    let in_score = import_in as f64 / stats.max_import_degree_in as f64;
    let out_score = import_out as f64 / stats.max_import_degree_out as f64;

    // Weight incoming imports higher (widely imported files tend to matter more)
    0.7 * in_score + 0.3 * out_score
}

/// Calculate normalized path score (inverted: deeper paths get lower scores)
fn calculate_path_score(path_depth: usize, stats: &NormalizationStats) -> f64 {
    // Invert path depth (deeper = lower score)
    1.0 - (path_depth as f64 / stats.max_path_depth as f64)
}

#[cfg(test)]
mod tests {
    use super::*;

    // Mock implementation for testing
    struct MockFile {
        path: String,
        is_docs: bool,
        is_readme: bool,
        depth: usize,
    }

    impl ScanResult for MockFile {
        fn path(&self) -> &str {
            &self.path
        }
        fn relative_path(&self) -> &str {
            &self.path
        }
        fn depth(&self) -> usize {
            self.depth
        }
        fn is_docs(&self) -> bool {
            self.is_docs
        }
        fn is_readme(&self) -> bool {
            self.is_readme
        }
        fn is_test(&self) -> bool {
            false
        }
        fn is_entrypoint(&self) -> bool {
            false
        }
        fn has_examples(&self) -> bool {
            false
        }
        fn priority_boost(&self) -> f64 {
            0.0
        }
        fn churn_score(&self) -> f64 {
            0.0
        }
        fn centrality_in(&self) -> f64 {
            0.0
        }
        fn imports(&self) -> Option<&[String]> {
            None
        }
        fn doc_analysis(&self) -> Option<&crate::heuristics::DocumentAnalysis> {
            None
        }
    }

    #[test]
    fn test_normalization_stats() {
        let files = vec![
            MockFile {
                path: "README.md".to_string(),
                is_docs: false,
                is_readme: true,
                depth: 1,
            },
            MockFile {
                path: "docs/guide.md".to_string(),
                is_docs: true,
                is_readme: false,
                depth: 2,
            },
        ];

        let stats = build_normalization_stats(&files);
        assert!(stats.max_readme_raw > 0.0);
        assert!(stats.max_doc_raw > 0.0);
        assert_eq!(stats.max_path_depth, 2);
    }
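
    // Added check (a sketch on top of the existing mock): a README at the
    // repository root (depth <= 1) should raise the raw README ceiling to
    // 1.5, matching the boost applied in `build_normalization_stats` and
    // `normalize_scores`.
    #[test]
    fn test_readme_depth_boost() {
        let files = vec![MockFile {
            path: "README.md".to_string(),
            is_docs: false,
            is_readme: true,
            depth: 1,
        }];

        let stats = build_normalization_stats(&files);
        assert!((stats.max_readme_raw - 1.5).abs() < f64::EPSILON);
    }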

    #[test]
    fn test_path_score_inversion() {
        let stats = NormalizationStats {
            max_doc_raw: 1.0,
            max_readme_raw: 1.0,
            max_import_degree_in: 1,
            max_import_degree_out: 1,
            max_path_depth: 5, // Max depth is 5
            max_test_links: 1,
            max_churn_commits: 1,
            max_centrality_raw: 1.0,
            max_examples_count: 1,
        };

        let path_score = calculate_path_score(3, &stats);
        // Path depth 3/5 = 0.6, so the inverted score should be 1.0 - 0.6 = 0.4
        assert!((path_score - 0.4).abs() < 0.01);
    }
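
    // Added check (sketch): with both degree ceilings at 10, a file imported
    // by many others but importing nothing should score 0.7, and the mirror
    // case 0.3, reflecting the 0.7/0.3 in/out weighting above.
    #[test]
    fn test_import_score_weighting() {
        let stats = NormalizationStats {
            max_doc_raw: 1.0,
            max_readme_raw: 1.0,
            max_import_degree_in: 10,
            max_import_degree_out: 10,
            max_path_depth: 1,
            max_test_links: 1,
            max_churn_commits: 1,
            max_centrality_raw: 1.0,
            max_examples_count: 1,
        };

        assert!((calculate_import_score(10, 0, &stats) - 0.7).abs() < 1e-9);
        assert!((calculate_import_score(0, 10, &stats) - 0.3).abs() < 1e-9);
    }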
}