1use std::collections::HashMap;
43use std::sync::Mutex;
44
45use crate::navigator::curvature_analysis;
46use crate::pipeline::SphereQLPipeline;
47use crate::quality_metric::{BridgeCoherence, QualityMetric};
48
/// Relative weights of the four sub-scores combined into the corpus-quality composite.
///
/// The weights do not have to sum to one: [`CorpusQualityWeights::validate`] returns their
/// sum and the scorer divides by it. Because of `#[serde(default)]`, fields missing from a
/// serialized form are filled in from the [`Default`] impl.
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
#[serde(default)]
pub struct CorpusQualityWeights {
    /// Weight of the explained-variance-ratio sub-score.
    pub w_evr: f64,
    /// Weight of the bridge-coherence sub-score.
    pub w_bridge: f64,
    /// Weight of the curvature-health sub-score.
    pub w_curvature: f64,
    /// Weight of the category-balance sub-score.
    pub w_balance: f64,
}
60
61impl Default for CorpusQualityWeights {
62 fn default() -> Self {
63 Self {
64 w_evr: 0.30,
65 w_bridge: 0.30,
66 w_curvature: 0.20,
67 w_balance: 0.20,
68 }
69 }
70}
71
72impl CorpusQualityWeights {
73 pub fn validate(&self) -> Result<f64, String> {
77 let w = [self.w_evr, self.w_bridge, self.w_curvature, self.w_balance];
78 for v in w {
79 if !v.is_finite() {
80 return Err(format!("non-finite weight: {v}"));
81 }
82 if v < 0.0 {
83 return Err(format!("negative weight: {v}"));
84 }
85 }
86 let total: f64 = w.iter().sum();
87 if total <= 0.0 {
88 return Err("all weights are zero".into());
89 }
90 Ok(total)
91 }
92}
93
/// Snapshot of the four sub-scores and the composite produced by one scoring call.
#[derive(Debug, Clone, Copy)]
pub struct CorpusQualityBreakdown {
    /// Pipeline explained-variance ratio, clamped to `[0, 1]`.
    pub evr: f64,
    /// Bridge-coherence sub-score as returned by `BridgeCoherence`.
    pub bridge_coherence: f64,
    /// Curvature-health sub-score in `[0, 1]`.
    pub curvature_health: f64,
    /// Category-balance sub-score (normalized label entropy) in `[0, 1]`.
    pub category_balance: f64,
    /// Weighted average of the sub-scores, clamped to `[0, 1]`.
    pub composite: f64,
}
105
/// Composite corpus-quality metric built from four weighted sub-scores.
///
/// Besides the weights, it caches the breakdown of the most recent scoring call. The cache
/// sits behind a `Mutex` so it can be written from the `&self` scoring methods.
#[derive(Debug)]
pub struct CorpusQuality {
    /// Weights used to combine the sub-scores; validated in `new`.
    weights: CorpusQualityWeights,
    /// Breakdown recorded by the latest scoring call; `None` until one has been recorded.
    last_breakdown: Mutex<Option<CorpusQualityBreakdown>>,
}
118
119impl Default for CorpusQuality {
120 fn default() -> Self {
121 Self::new(CorpusQualityWeights::default())
122 }
123}
124
125impl Clone for CorpusQuality {
126 fn clone(&self) -> Self {
127 let snap = self.last_breakdown.lock().ok().and_then(|g| *g);
128 Self {
129 weights: self.weights,
130 last_breakdown: Mutex::new(snap),
131 }
132 }
133}
134
135impl CorpusQuality {
136 pub fn new(weights: CorpusQualityWeights) -> Self {
141 weights
142 .validate()
143 .expect("CorpusQualityWeights::validate failed");
144 Self {
145 weights,
146 last_breakdown: Mutex::new(None),
147 }
148 }
149
150 pub fn weights(&self) -> CorpusQualityWeights {
151 self.weights
152 }
153
154 pub fn last_breakdown(&self) -> Option<CorpusQualityBreakdown> {
157 self.last_breakdown.lock().ok().and_then(|g| *g)
158 }
159}
160
161impl QualityMetric for CorpusQuality {
162 fn name(&self) -> &str {
163 "corpus_quality"
164 }
165
166 fn score(&self, pipeline: &SphereQLPipeline) -> f64 {
167 self.score_with_components(pipeline).0
168 }
169
170 fn score_with_components(&self, pipeline: &SphereQLPipeline) -> (f64, Vec<(String, f64, f64)>) {
171 let evr = pipeline.explained_variance_ratio().clamp(0.0, 1.0);
172 let bridge_coherence = compute_bridge_coherence(pipeline);
173 let curvature_health = compute_curvature_health(pipeline);
174 let category_balance = compute_category_balance(pipeline.categories());
175
176 let total = self
177 .weights
178 .validate()
179 .expect("weights re-validated at score time");
180 let composite = (self.weights.w_evr * evr
181 + self.weights.w_bridge * bridge_coherence
182 + self.weights.w_curvature * curvature_health
183 + self.weights.w_balance * category_balance)
184 / total;
185 let composite = composite.clamp(0.0, 1.0);
186
187 if let Ok(mut guard) = self.last_breakdown.lock() {
188 *guard = Some(CorpusQualityBreakdown {
189 evr,
190 bridge_coherence,
191 curvature_health,
192 category_balance,
193 composite,
194 });
195 }
196
197 let components = vec![
200 ("evr".to_string(), self.weights.w_evr / total, evr),
201 (
202 "bridge_coherence".to_string(),
203 self.weights.w_bridge / total,
204 bridge_coherence,
205 ),
206 (
207 "curvature_health".to_string(),
208 self.weights.w_curvature / total,
209 curvature_health,
210 ),
211 (
212 "category_balance".to_string(),
213 self.weights.w_balance / total,
214 category_balance,
215 ),
216 ];
217 (composite, components)
218 }
219}
220
221fn compute_bridge_coherence(pipeline: &SphereQLPipeline) -> f64 {
229 BridgeCoherence.score(pipeline)
230}
231
232fn compute_curvature_health(pipeline: &SphereQLPipeline) -> f64 {
233 let layer = pipeline.category_layer();
234 if layer.num_categories() < 3 {
235 return 1.0;
239 }
240 let report = curvature_analysis(layer, 0);
241 if report.signatures.is_empty() {
242 return 1.0;
243 }
244 let mean_abs_z: f64 = report
245 .signatures
246 .iter()
247 .map(|s| s.mean_excess_z.abs().min(1.0))
248 .sum::<f64>()
249 / report.signatures.len() as f64;
250 (1.0 - mean_abs_z).clamp(0.0, 1.0)
251}
252
/// Category-balance sub-score: Shannon entropy of the label distribution divided by the
/// maximum entropy for that many distinct labels, clamped to `[0, 1]`.
///
/// Returns `0.0` when `categories` is empty or contains a single distinct label. A perfectly
/// uniform distribution scores `1.0`.
///
/// The result is bit-for-bit reproducible: it depends only on the multiset of label counts,
/// not on label order or on `HashMap` iteration order.
fn compute_category_balance(categories: &[String]) -> f64 {
    let mut counts: HashMap<&str, usize> = HashMap::new();
    for label in categories {
        *counts.entry(label.as_str()).or_default() += 1;
    }
    // Covers both the empty corpus and the single-category corpus.
    if counts.len() < 2 {
        return 0.0;
    }

    // `HashMap` iteration order varies between processes and floating-point addition is not
    // associative, so the tallies are sorted to fix the summation order.
    let mut tallies: Vec<usize> = counts.into_values().collect();
    tallies.sort_unstable();

    let total = categories.len() as f64;
    // Every tally is at least 1, so `p > 0` and `log2(p)` is finite.
    let entropy = tallies.iter().fold(0.0_f64, |acc, &n| {
        let p = n as f64 / total;
        acc - p * p.log2()
    });
    // At least two distinct labels, so the maximum entropy is at least 1 bit.
    let max_entropy = (tallies.len() as f64).log2();
    (entropy / max_entropy).clamp(0.0, 1.0)
}
279
// Unit tests for weight validation, the category-balance helper, and end-to-end scoring on
// small synthetic pipelines.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pipeline::PipelineInput;
    use crate::quality_metric::QualityMetric;

    // A single negative weight must be rejected even when the others are valid.
    #[test]
    fn weights_validate_rejects_negative() {
        let w = CorpusQualityWeights {
            w_evr: -0.1,
            w_bridge: 1.0,
            w_curvature: 1.0,
            w_balance: 1.0,
        };
        assert!(w.validate().is_err());
    }

    // All-zero weights leave nothing to normalize by and must be rejected.
    #[test]
    fn weights_validate_rejects_all_zero() {
        let w = CorpusQualityWeights {
            w_evr: 0.0,
            w_bridge: 0.0,
            w_curvature: 0.0,
            w_balance: 0.0,
        };
        assert!(w.validate().is_err());
    }

    // 30 categories with 10 items each: a uniform distribution has maximal entropy.
    #[test]
    fn category_balance_uniform_is_one() {
        let cats: Vec<String> = (0..30)
            .flat_map(|i| std::iter::repeat_n(format!("cat_{i}"), 10))
            .collect();
        let s = compute_category_balance(&cats);
        assert!((s - 1.0).abs() < 1e-9);
    }

    // A 95/5 split between two categories should score well below a balanced corpus.
    #[test]
    fn category_balance_collapses_when_one_category_dominates() {
        let mut cats: Vec<String> = std::iter::repeat_n("a".to_string(), 95).collect();
        cats.extend(std::iter::repeat_n("b".to_string(), 5));
        let s = compute_category_balance(&cats);
        assert!(s < 0.4, "expected balance < 0.4 for skewed corpus, got {s}");
    }

    #[test]
    fn default_metric_has_expected_name() {
        let m = CorpusQuality::default();
        assert_eq!(m.name(), "corpus_quality");
    }

    // Builds a deterministic fixture: 8 categories x 12 items of 16-dimensional embeddings.
    // Each embedding is one-hot at index `c % dim`, with every coordinate perturbed by
    // noise in [-0.01, 0.01) drawn from a fixed-seed linear congruential generator.
    fn synthetic_pipeline() -> SphereQLPipeline {
        let n_per = 12usize;
        let n_cats = 8usize;
        let dim = 16usize;
        let mut categories = Vec::with_capacity(n_per * n_cats);
        let mut embeddings = Vec::with_capacity(n_per * n_cats);
        // Fixed seed so the fixture is identical on every run.
        let mut rng_state: u64 = 0xDEADBEEF;
        for c in 0..n_cats {
            for _ in 0..n_per {
                categories.push(format!("cat_{c}"));
                let mut v = vec![0.0_f64; dim];
                v[c % dim] = 1.0;
                for x in v.iter_mut() {
                    // One LCG step (wrapping multiply-add) per coordinate.
                    rng_state = rng_state
                        .wrapping_mul(6364136223846793005)
                        .wrapping_add(1442695040888963407);
                    // Top 31 bits scaled by 2^31 give `u` in [0, 1).
                    let u = (rng_state >> 33) as f64 / (1u64 << 31) as f64;
                    *x += (u - 0.5) * 0.02;
                }
                embeddings.push(v);
            }
        }
        SphereQLPipeline::new(PipelineInput {
            categories,
            embeddings,
        })
        .expect("build pipeline")
    }

    // Composite and every sub-score stay in [0, 1], and the cached breakdown agrees with the
    // returned score.
    #[test]
    fn smoke_score_on_synthetic_input() {
        let pipeline = synthetic_pipeline();
        let m = CorpusQuality::default();
        let s = m.score(&pipeline);
        assert!((0.0..=1.0).contains(&s), "composite out of range: {s}");
        let bd = m.last_breakdown().expect("breakdown populated");
        assert!((0.0..=1.0).contains(&bd.evr));
        assert!((0.0..=1.0).contains(&bd.bridge_coherence));
        assert!((0.0..=1.0).contains(&bd.curvature_health));
        assert!((0.0..=1.0).contains(&bd.category_balance));
        assert!((bd.composite - s).abs() < 1e-12);
    }

    // Components come back in a fixed order with weights normalized to sum to one, and the
    // composite can be rebuilt from them.
    #[test]
    fn score_with_components_reports_four_subscores() {
        let pipeline = synthetic_pipeline();
        let m = CorpusQuality::default();
        let (total, components) = m.score_with_components(&pipeline);
        assert_eq!(components.len(), 4);
        let names: Vec<&str> = components.iter().map(|(n, _, _)| n.as_str()).collect();
        assert_eq!(
            names,
            [
                "evr",
                "bridge_coherence",
                "curvature_health",
                "category_balance"
            ]
        );
        let weight_sum: f64 = components.iter().map(|(_, w, _)| w).sum();
        assert!((weight_sum - 1.0).abs() < 1e-12);
        let recomposed: f64 = components.iter().map(|(_, w, s)| w * s).sum();
        assert!((total - recomposed.clamp(0.0, 1.0)).abs() < 1e-12);
        assert!((total - m.score(&pipeline)).abs() < 1e-12);
    }

    // The bridge sub-score must be exactly what the standalone `BridgeCoherence` metric
    // reports for the same pipeline (exact equality, not a tolerance).
    #[test]
    fn bridge_subscore_matches_canonical_bridge_coherence() {
        let pipeline = synthetic_pipeline();
        let m = CorpusQuality::default();
        let _ = m.score(&pipeline);
        let bd = m.last_breakdown().unwrap();
        let standalone = BridgeCoherence.score(&pipeline);
        assert_eq!(bd.bridge_coherence, standalone);
    }

    // With all weight on EVR the composite must equal the EVR sub-score. The default-weight
    // score is only range-checked; the two composites are not compared against each other.
    #[test]
    fn custom_weights_change_composite() {
        // Noise-free fixture: 6 categories x 10 items of 12-dimensional embeddings, each a
        // scaled one-hot vector whose scale grows slightly with the row index.
        let n_per = 10usize;
        let n_cats = 6usize;
        let dim = 12usize;
        let mut categories = Vec::with_capacity(n_per * n_cats);
        let mut embeddings = Vec::with_capacity(n_per * n_cats);
        for c in 0..n_cats {
            for r in 0..n_per {
                categories.push(format!("cat_{c}"));
                let mut v = vec![0.0_f64; dim];
                v[c % dim] = 1.0 + (r as f64) * 0.001;
                embeddings.push(v);
            }
        }
        let input = PipelineInput {
            categories,
            embeddings,
        };
        let pipeline = SphereQLPipeline::new(input).expect("build pipeline");

        let balanced = CorpusQuality::default();
        let evr_only = CorpusQuality::new(CorpusQualityWeights {
            w_evr: 1.0,
            w_bridge: 0.0,
            w_curvature: 0.0,
            w_balance: 0.0,
        });
        let s_default = balanced.score(&pipeline);
        let s_evr = evr_only.score(&pipeline);
        assert!((0.0..=1.0).contains(&s_default));
        assert!((0.0..=1.0).contains(&s_evr));
        let bd = evr_only.last_breakdown().unwrap();
        assert!((s_evr - bd.evr).abs() < 1e-12);
    }
}