// scribe_analysis/heuristics/scoring/normalization.rs
use super::super::ScanResult;
use super::types::{HeuristicWeights, RawScoreComponents};

/// Per-file heuristic scores after normalization.
///
/// Built by [`normalize_scores`] from a file's raw score components and the
/// maxima held in [`NormalizationStats`].
#[derive(Debug, Clone)]
pub struct NormalizedScores {
    /// Raw documentation score divided by the maximum documentation score.
    pub doc_score: f64,
    /// Raw README score divided by the maximum README score.
    pub readme_score: f64,
    /// Weighted blend of the normalized in-degree and out-degree of imports.
    pub import_score: f64,
    /// One minus the depth ratio, so shallower paths score higher.
    pub path_score: f64,
    /// Test links found divided by the maximum test-link count.
    pub test_link_score: f64,
    /// Churn commits divided by the maximum churn commit count.
    pub churn_score: f64,
    /// Raw centrality divided by the maximum raw centrality.
    pub centrality_score: f64,
    /// `1.0` when the file is an entrypoint, `0.0` otherwise.
    pub entrypoint_score: f64,
    /// Examples count divided by the maximum examples count.
    pub examples_score: f64,
}

/// Maxima used as denominators when normalizing raw score components.
///
/// [`build_normalization_stats`] guarantees every field is strictly positive,
/// so values it returns can be divided by directly.
#[derive(Debug, Clone)]
pub struct NormalizationStats {
    /// Largest raw documentation score (floored at `1.0` by the builder).
    pub max_doc_raw: f64,
    /// Largest raw README score (floored at `1.0` by the builder).
    pub max_readme_raw: f64,
    /// Largest import in-degree (floored at `1` by the builder).
    pub max_import_degree_in: usize,
    /// Largest import out-degree (floored at `1` by the builder).
    pub max_import_degree_out: usize,
    /// Deepest path depth (floored at `1` by the builder).
    pub max_path_depth: usize,
    /// Largest test-link count (floored at `1` by the builder).
    pub max_test_links: usize,
    /// Largest churn commit count (floored at `1` by the builder).
    pub max_churn_commits: usize,
    /// Largest raw centrality (floored at `0.1` by the builder).
    pub max_centrality_raw: f64,
    /// Largest per-file examples count (floored at `1` by the builder).
    pub max_examples_count: usize,
}

34pub fn build_normalization_stats<T>(files: &[T]) -> NormalizationStats
36where
37 T: ScanResult,
38{
39 let mut stats = NormalizationStats {
40 max_doc_raw: 0.0,
41 max_readme_raw: 0.0,
42 max_import_degree_in: 0,
43 max_import_degree_out: 0,
44 max_path_depth: 0,
45 max_test_links: 0,
46 max_churn_commits: 0,
47 max_centrality_raw: 0.0,
48 max_examples_count: 0,
49 };
50
51 for file in files {
52 if file.is_docs() {
54 stats.max_doc_raw = stats.max_doc_raw.max(1.0);
55 if let Some(doc_analysis) = file.doc_analysis() {
56 stats.max_doc_raw = stats.max_doc_raw.max(doc_analysis.structure_score());
57 }
58 }
59
60 if file.is_readme() {
62 let readme_score = if file.depth() <= 1 { 1.5 } else { 1.0 };
63 stats.max_readme_raw = stats.max_readme_raw.max(readme_score);
64 }
65
66 stats.max_path_depth = stats.max_path_depth.max(file.depth());
68
69 if file.is_test() {
71 stats.max_test_links = stats.max_test_links.max(1);
72 }
73
74 let churn_score = file.churn_score() as usize;
76 stats.max_churn_commits = stats.max_churn_commits.max(churn_score);
77
78 let examples_count = count_examples_in_file(file);
80 stats.max_examples_count = stats.max_examples_count.max(examples_count);
81 }
82
83 stats.max_doc_raw = stats.max_doc_raw.max(1.0);
85 stats.max_readme_raw = stats.max_readme_raw.max(1.0);
86 stats.max_import_degree_in = stats.max_import_degree_in.max(1);
87 stats.max_import_degree_out = stats.max_import_degree_out.max(1);
88 stats.max_path_depth = stats.max_path_depth.max(1);
89 stats.max_test_links = stats.max_test_links.max(1);
90 stats.max_churn_commits = stats.max_churn_commits.max(1);
91 stats.max_centrality_raw = stats.max_centrality_raw.max(0.1);
92 stats.max_examples_count = stats.max_examples_count.max(1);
93
94 stats
95}
96
97pub fn normalize_scores(
99 raw_scores: &RawScoreComponents,
100 stats: &NormalizationStats,
101) -> NormalizedScores {
102 NormalizedScores {
103 doc_score: raw_scores.doc_raw / stats.max_doc_raw,
104 readme_score: raw_scores.readme_raw / stats.max_readme_raw,
105 import_score: calculate_import_score(raw_scores, stats),
106 path_score: calculate_path_score(raw_scores, stats),
107 test_link_score: raw_scores.test_links_found as f64 / stats.max_test_links as f64,
108 churn_score: raw_scores.churn_commits as f64 / stats.max_churn_commits as f64,
109 centrality_score: raw_scores.centrality_raw / stats.max_centrality_raw,
110 entrypoint_score: if raw_scores.is_entrypoint { 1.0 } else { 0.0 },
111 examples_score: raw_scores.examples_count as f64 / stats.max_examples_count as f64,
112 }
113}
114
115fn calculate_import_score(raw_scores: &RawScoreComponents, stats: &NormalizationStats) -> f64 {
117 let in_score = raw_scores.import_degree_in as f64 / stats.max_import_degree_in as f64;
118 let out_score = raw_scores.import_degree_out as f64 / stats.max_import_degree_out as f64;
119
120 0.7 * in_score + 0.3 * out_score
122}
123
124fn calculate_path_score(raw_scores: &RawScoreComponents, stats: &NormalizationStats) -> f64 {
126 1.0 - (raw_scores.path_depth as f64 / stats.max_path_depth as f64)
128}
129
130fn count_examples_in_file<T: ScanResult>(file: &T) -> usize {
132 if file.has_examples() {
134 1
135 } else {
136 0
137 }
138}
139
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal `ScanResult` implementation: only the path, depth and the
    /// docs/readme flags vary; every other accessor returns a fixed neutral
    /// value.
    struct MockFile {
        path: String,
        is_docs: bool,
        is_readme: bool,
        depth: usize,
    }

    impl ScanResult for MockFile {
        fn path(&self) -> &str {
            &self.path
        }
        fn relative_path(&self) -> &str {
            &self.path
        }
        fn depth(&self) -> usize {
            self.depth
        }
        fn is_docs(&self) -> bool {
            self.is_docs
        }
        fn is_readme(&self) -> bool {
            self.is_readme
        }
        fn is_test(&self) -> bool {
            false
        }
        fn is_entrypoint(&self) -> bool {
            false
        }
        fn has_examples(&self) -> bool {
            false
        }
        fn priority_boost(&self) -> f64 {
            0.0
        }
        fn churn_score(&self) -> f64 {
            0.0
        }
        fn centrality_in(&self) -> f64 {
            0.0
        }
        fn imports(&self) -> Option<&[String]> {
            None
        }
        fn doc_analysis(&self) -> Option<&crate::heuristics::DocumentAnalysis> {
            None
        }
    }

    /// Raw components with every value zeroed; tests override what they need.
    fn zeroed_raw_scores() -> RawScoreComponents {
        RawScoreComponents {
            doc_raw: 0.0,
            readme_raw: 0.0,
            import_degree_in: 0,
            import_degree_out: 0,
            path_depth: 0,
            test_links_found: 0,
            churn_commits: 0,
            centrality_raw: 0.0,
            is_entrypoint: false,
            examples_count: 0,
        }
    }

    /// Stats with every maximum set to one; tests override what they need.
    fn unit_stats() -> NormalizationStats {
        NormalizationStats {
            max_doc_raw: 1.0,
            max_readme_raw: 1.0,
            max_import_degree_in: 1,
            max_import_degree_out: 1,
            max_path_depth: 1,
            max_test_links: 1,
            max_churn_commits: 1,
            max_centrality_raw: 1.0,
            max_examples_count: 1,
        }
    }

    #[test]
    fn test_normalization_stats() {
        let files = vec![
            MockFile {
                path: "README.md".to_string(),
                is_docs: false,
                is_readme: true,
                depth: 1,
            },
            MockFile {
                path: "docs/guide.md".to_string(),
                is_docs: true,
                is_readme: false,
                depth: 2,
            },
        ];

        let stats = build_normalization_stats(&files);
        assert!(stats.max_readme_raw > 0.0);
        assert!(stats.max_doc_raw > 0.0);
        assert_eq!(stats.max_path_depth, 2);
    }

    #[test]
    fn test_empty_input_uses_floor_values() {
        let stats = build_normalization_stats::<MockFile>(&[]);
        assert_eq!(stats.max_doc_raw, 1.0);
        assert_eq!(stats.max_readme_raw, 1.0);
        assert_eq!(stats.max_import_degree_in, 1);
        assert_eq!(stats.max_import_degree_out, 1);
        assert_eq!(stats.max_path_depth, 1);
        assert_eq!(stats.max_test_links, 1);
        assert_eq!(stats.max_churn_commits, 1);
        assert_eq!(stats.max_centrality_raw, 0.1);
        assert_eq!(stats.max_examples_count, 1);
    }

    #[test]
    fn test_path_score_inversion() {
        let raw_scores = RawScoreComponents {
            path_depth: 3,
            ..zeroed_raw_scores()
        };
        let stats = NormalizationStats {
            max_path_depth: 5,
            ..unit_stats()
        };

        // 1.0 - 3/5 = 0.4
        let path_score = calculate_path_score(&raw_scores, &stats);
        assert!((path_score - 0.4).abs() < 0.01);
    }

    #[test]
    fn test_import_score_weighting() {
        let raw_scores = RawScoreComponents {
            import_degree_in: 1,
            ..zeroed_raw_scores()
        };

        // Full in-degree and no out-degree contributes only the 0.7 weight.
        let import_score = calculate_import_score(&raw_scores, &unit_stats());
        assert!((import_score - 0.7).abs() < 1e-9);
    }
}