// scribe_analysis/heuristics/scoring/normalization.rs
use super::super::ScanResult;
4use super::types::ScoringFeatures;
5
/// Per-file score components after normalization.
///
/// Built by [`normalize_scores`], which relates each raw signal of a single
/// file to the matching maximum stored in [`NormalizationStats`].
#[derive(Debug, Clone, PartialEq)]
pub struct NormalizedScores {
    /// Raw documentation signal divided by `NormalizationStats::max_doc_raw`.
    pub doc_score: f64,
    /// README signal divided by `NormalizationStats::max_readme_raw`.
    pub readme_score: f64,
    /// Weighted blend of the inbound and outbound import-degree ratios.
    pub import_score: f64,
    /// Inverted relative path depth: shallower files score higher.
    pub path_score: f64,
    /// Test-link signal divided by `NormalizationStats::max_test_links`.
    pub test_link_score: f64,
    /// Churn signal divided by `NormalizationStats::max_churn_commits`.
    pub churn_score: f64,
    /// Inbound centrality divided by `NormalizationStats::max_centrality_raw`.
    pub centrality_score: f64,
    /// `1.0` when the file is an entrypoint, `0.0` otherwise.
    pub entrypoint_score: f64,
    /// Examples signal divided by `NormalizationStats::max_examples_count`.
    pub examples_score: f64,
}
19
/// Maxima of the raw scoring signals across a set of scanned files.
///
/// These values are the denominators used by [`normalize_scores`]. When the
/// struct is produced by [`build_normalization_stats`] every field is at least
/// `1` / `1.0` (`0.1` for `max_centrality_raw`), so dividing by it is safe.
/// The fields are public, so hand-built instances carry no such guarantee.
#[derive(Debug, Clone, PartialEq)]
pub struct NormalizationStats {
    /// Largest raw documentation signal.
    pub max_doc_raw: f64,
    /// Largest raw README signal (`1.5` for a README at depth <= 1, else `1.0`).
    pub max_readme_raw: f64,
    /// Largest inbound import degree, taken from the rounded `centrality_in` value.
    pub max_import_degree_in: usize,
    /// Largest number of imports declared by a single file.
    pub max_import_degree_out: usize,
    /// Deepest path depth among the files.
    pub max_path_depth: usize,
    /// Largest test-link count.
    pub max_test_links: usize,
    /// Largest churn value, converted to an integer count.
    pub max_churn_commits: usize,
    /// Largest raw inbound centrality value.
    pub max_centrality_raw: f64,
    /// Largest examples count.
    pub max_examples_count: usize,
}
33
34pub fn build_normalization_stats<T>(files: &[T]) -> NormalizationStats
36where
37 T: ScanResult,
38{
39 let mut stats = NormalizationStats {
40 max_doc_raw: 0.0,
41 max_readme_raw: 0.0,
42 max_import_degree_in: 0,
43 max_import_degree_out: 0,
44 max_path_depth: 0,
45 max_test_links: 0,
46 max_churn_commits: 0,
47 max_centrality_raw: 0.0,
48 max_examples_count: 0,
49 };
50
51 for file in files {
52 if file.is_docs() {
54 stats.max_doc_raw = stats.max_doc_raw.max(1.0);
55 if let Some(doc_analysis) = file.doc_analysis() {
56 stats.max_doc_raw = stats.max_doc_raw.max(doc_analysis.structure_score());
57 }
58 }
59
60 if file.is_readme() {
62 let readme_score = if file.depth() <= 1 { 1.5 } else { 1.0 };
63 stats.max_readme_raw = stats.max_readme_raw.max(readme_score);
64 }
65
66 stats.max_path_depth = stats.max_path_depth.max(file.depth());
68
69 if file.is_test() {
71 stats.max_test_links = stats.max_test_links.max(1);
72 }
73
74 let churn_score = file.churn_score() as usize;
76 stats.max_churn_commits = stats.max_churn_commits.max(churn_score);
77
78 if let Some(imports) = file.imports() {
79 stats.max_import_degree_out = stats.max_import_degree_out.max(imports.len());
80 }
81
82 let centrality_raw = file.centrality_in();
83 stats.max_import_degree_in = stats
84 .max_import_degree_in
85 .max(centrality_raw.round() as usize);
86 stats.max_centrality_raw = stats.max_centrality_raw.max(centrality_raw);
87
88 let examples_count = if file.has_examples() { 1 } else { 0 };
90 stats.max_examples_count = stats.max_examples_count.max(examples_count);
91 }
92
93 stats.max_doc_raw = stats.max_doc_raw.max(1.0);
95 stats.max_readme_raw = stats.max_readme_raw.max(1.0);
96 stats.max_import_degree_in = stats.max_import_degree_in.max(1);
97 stats.max_import_degree_out = stats.max_import_degree_out.max(1);
98 stats.max_path_depth = stats.max_path_depth.max(1);
99 stats.max_test_links = stats.max_test_links.max(1);
100 stats.max_churn_commits = stats.max_churn_commits.max(1);
101 stats.max_centrality_raw = stats.max_centrality_raw.max(0.1);
102 stats.max_examples_count = stats.max_examples_count.max(1);
103
104 stats
105}
106
107pub fn normalize_scores<T: ScanResult>(
109 file: &T,
110 stats: &NormalizationStats,
111 features: &ScoringFeatures,
112) -> NormalizedScores {
113 let mut doc_raw = if file.is_docs() { 1.0 } else { 0.0 };
114 if features.enable_doc_analysis {
115 if let Some(doc_analysis) = file.doc_analysis() {
116 doc_raw += doc_analysis.structure_score();
117 }
118 }
119
120 let readme_raw = if file.is_readme() {
121 if file.depth() <= 1 {
122 1.5
123 } else {
124 1.0
125 }
126 } else {
127 0.0
128 };
129
130 let import_out = file.imports().map(|imports| imports.len()).unwrap_or(0);
131 let import_in = file.centrality_in().round().max(0.0) as usize;
132
133 let path_depth = file.depth();
134
135 let test_links_found = if features.enable_test_linking && file.is_test() {
136 1
137 } else {
138 0
139 };
140
141 let churn_commits = if features.enable_churn_analysis {
142 file.churn_score().round().max(0.0) as usize
143 } else {
144 0
145 };
146
147 let centrality_raw = if features.enable_centrality {
148 file.centrality_in()
149 } else {
150 0.0
151 };
152
153 let examples_count = if features.enable_examples_detection && file.has_examples() {
154 1
155 } else {
156 0
157 };
158
159 NormalizedScores {
160 doc_score: doc_raw / stats.max_doc_raw,
161 readme_score: readme_raw / stats.max_readme_raw,
162 import_score: calculate_import_score(import_in, import_out, stats),
163 path_score: calculate_path_score(path_depth, stats),
164 test_link_score: test_links_found as f64 / stats.max_test_links as f64,
165 churn_score: churn_commits as f64 / stats.max_churn_commits as f64,
166 centrality_score: centrality_raw / stats.max_centrality_raw,
167 entrypoint_score: if file.is_entrypoint() { 1.0 } else { 0.0 },
168 examples_score: examples_count as f64 / stats.max_examples_count as f64,
169 }
170}
171
172fn calculate_import_score(import_in: usize, import_out: usize, stats: &NormalizationStats) -> f64 {
174 let in_score = import_in as f64 / stats.max_import_degree_in as f64;
175 let out_score = import_out as f64 / stats.max_import_degree_out as f64;
176
177 0.7 * in_score + 0.3 * out_score
179}
180
181fn calculate_path_score(path_depth: usize, stats: &NormalizationStats) -> f64 {
183 1.0 - (path_depth as f64 / stats.max_path_depth as f64)
185}
186
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal `ScanResult` stand-in: only the docs/README flags and the path
    /// depth vary; every other signal reports "absent".
    struct MockFile {
        path: String,
        is_docs: bool,
        is_readme: bool,
        depth: usize,
    }

    impl ScanResult for MockFile {
        fn path(&self) -> &str {
            &self.path
        }
        fn relative_path(&self) -> &str {
            &self.path
        }
        fn depth(&self) -> usize {
            self.depth
        }
        fn is_docs(&self) -> bool {
            self.is_docs
        }
        fn is_readme(&self) -> bool {
            self.is_readme
        }
        fn is_test(&self) -> bool {
            false
        }
        fn is_entrypoint(&self) -> bool {
            false
        }
        fn has_examples(&self) -> bool {
            false
        }
        fn priority_boost(&self) -> f64 {
            0.0
        }
        fn churn_score(&self) -> f64 {
            0.0
        }
        fn centrality_in(&self) -> f64 {
            0.0
        }
        fn imports(&self) -> Option<&[String]> {
            None
        }
        fn doc_analysis(&self) -> Option<&crate::heuristics::DocumentAnalysis> {
            None
        }
    }

    #[test]
    fn test_normalization_stats() {
        let files = vec![
            MockFile {
                path: "README.md".to_string(),
                is_docs: false,
                is_readme: true,
                depth: 1,
            },
            MockFile {
                path: "docs/guide.md".to_string(),
                is_docs: true,
                is_readme: false,
                depth: 2,
            },
        ];

        let stats = build_normalization_stats(&files);

        // A README at depth 1 carries the boosted raw value of 1.5.
        assert!((stats.max_readme_raw - 1.5).abs() < f64::EPSILON);
        // A docs file without a document analysis contributes the base 1.0.
        assert!((stats.max_doc_raw - 1.0).abs() < f64::EPSILON);
        assert_eq!(stats.max_path_depth, 2);
        // Signals that are absent from every file stay at their floor.
        assert_eq!(stats.max_import_degree_out, 1);
        assert_eq!(stats.max_churn_commits, 1);
    }

    #[test]
    fn test_path_score_inversion() {
        let stats = NormalizationStats {
            max_doc_raw: 1.0,
            max_readme_raw: 1.0,
            max_import_degree_in: 1,
            max_import_degree_out: 1,
            max_path_depth: 5,
            max_test_links: 1,
            max_churn_commits: 1,
            max_centrality_raw: 1.0,
            max_examples_count: 1,
        };

        // Depth 3 of 5 -> 1.0 - 0.6 = 0.4.
        let path_score = calculate_path_score(3, &stats);
        assert!((path_score - 0.4).abs() < 0.01);

        // The root scores highest and the deepest file lowest.
        assert!((calculate_path_score(0, &stats) - 1.0).abs() < 0.01);
        assert!(calculate_path_score(5, &stats).abs() < 0.01);
    }
}