1#![warn(missing_docs)]
18
19use mimirs_core::{Memory, MemoryClass, MemoryScope, QuantumMeasurementResult, VerifiabilityStage};
20
21use serde::{Deserialize, Serialize};
22
23#[derive(Debug, thiserror::Error)]
27pub enum EvalError {
28 #[error("Need at least {minimum} memories for evaluation, got {actual}")]
30 InsufficientData {
31 minimum: usize,
33 actual: usize,
35 },
36 #[error("Dimension evaluation failed: {0}")]
38 DimensionFailed(String),
39 #[error("Internal error: {0}")]
41 Internal(String),
42}
43
44#[derive(Debug, Clone, Deserialize, Serialize)]
48pub struct EvalConfig {
49 pub retrieval_threshold: f32,
51 pub isolation_healthy_threshold: f32,
53 pub min_fact_verifiability: VerifiabilityStage,
55 pub min_sample_size: usize,
57 pub dimension_weights: EvalWeights,
59}
60
61impl Default for EvalConfig {
62 fn default() -> Self {
63 Self {
64 retrieval_threshold: 0.7,
65 isolation_healthy_threshold: 0.3,
66 min_fact_verifiability: VerifiabilityStage::Corroborated,
67 min_sample_size: 5,
68 dimension_weights: EvalWeights::default(),
69 }
70 }
71}
72
73#[derive(Debug, Clone, Deserialize, Serialize)]
75pub struct EvalWeights {
76 pub retrieval: f32,
78 pub summarization: f32,
80 pub isolation: f32,
82 pub inference: f32,
84 pub reproduction: f32,
86 pub learning: f32,
88 pub habituation: f32,
90}
91
92impl Default for EvalWeights {
93 fn default() -> Self {
94 Self {
95 retrieval: 0.2,
96 summarization: 0.15,
97 isolation: 0.15,
98 inference: 0.1,
99 reproduction: 0.1,
100 learning: 0.15,
101 habituation: 0.15,
102 }
103 }
104}
105
106#[derive(Debug, Clone, Deserialize, Serialize)]
110pub struct DimensionScore {
111 pub name: String,
113 pub score: f32,
115 pub sample_count: usize,
117 pub explanation: String,
119}
120
121#[derive(Debug, Clone, Deserialize, Serialize)]
123pub struct EvalReport {
124 pub dimensions: Vec<DimensionScore>,
126 pub overall_score: f32,
128 pub total_memories: usize,
130 pub source_sessions: usize,
132 pub timestamp: String,
134 pub recommendations: Vec<String>,
136}
137
138impl EvalReport {
139 pub fn dimension(&self, name: &str) -> Option<&DimensionScore> {
141 self.dimensions.iter().find(|d| d.name == name)
142 }
143
144 pub fn all_dimensions_pass(&self, threshold: f32) -> bool {
146 self.dimensions.iter().all(|d| d.score >= threshold)
147 }
148}
149
150pub struct EvalEngine {
154 pub config: EvalConfig,
156}
157
158impl EvalEngine {
159 pub fn new() -> Self {
161 Self {
162 config: EvalConfig::default(),
163 }
164 }
165
166 pub fn with_config(config: EvalConfig) -> Self {
168 Self { config }
169 }
170
171 pub fn evaluate(
178 &self,
179 memories: &[Memory],
180 query_results: &[Vec<QuantumMeasurementResult>],
181 identity_patterns: Option<&[mimirs_identity::HabituatedPattern]>,
182 ) -> Result<EvalReport, EvalError> {
183 if memories.len() < self.config.min_sample_size {
184 return Err(EvalError::InsufficientData {
185 minimum: self.config.min_sample_size,
186 actual: memories.len(),
187 });
188 }
189
190 let dimensions = vec![
191 self.evaluate_retrieval(query_results)?,
192 self.evaluate_summarization(memories)?,
193 self.evaluate_isolation(memories)?,
194 self.evaluate_inference(memories)?,
195 self.evaluate_reproduction(memories)?,
196 self.evaluate_learning(memories)?,
197 self.evaluate_habituation(identity_patterns)?,
198 ];
199
200 let weights = &self.config.dimension_weights;
201 let overall_score = self::weighted_score(&dimensions, weights);
202
203 let source_sessions: std::collections::HashSet<String> = memories
204 .iter()
205 .filter_map(|m| m.source_session.clone())
206 .collect();
207
208 let recommendations = self.generate_recommendations(&dimensions);
209
210 Ok(EvalReport {
211 dimensions,
212 overall_score,
213 total_memories: memories.len(),
214 source_sessions: source_sessions.len(),
215 timestamp: chrono::Utc::now().to_rfc3339(),
216 recommendations,
217 })
218 }
219
220 pub fn evaluate_retrieval(
227 &self,
228 query_results: &[Vec<QuantumMeasurementResult>],
229 ) -> Result<DimensionScore, EvalError> {
230 if query_results.is_empty() {
231 return Ok(DimensionScore {
232 name: "retrieval".into(),
233 score: 0.5, sample_count: 0,
235 explanation: "No query results to evaluate".into(),
236 });
237 }
238
239 let mut successful_queries = 0;
240 let mut total_results = 0;
241 let mut high_quality_results = 0;
242
243 for results in query_results {
244 total_results += results.len();
245 let has_match = results
246 .iter()
247 .any(|r| r.expected >= self.config.retrieval_threshold && r.isolation_score >= 0.5);
248 if has_match {
249 successful_queries += 1;
250 }
251 high_quality_results += results
252 .iter()
253 .filter(|r| r.expected >= self.config.retrieval_threshold)
254 .count();
255 }
256
257 let success_rate = successful_queries as f32 / query_results.len() as f32;
258 let avg_quality = if total_results > 0 {
259 high_quality_results as f32 / total_results as f32
260 } else {
261 0.0
262 };
263 let score = 0.6 * success_rate + 0.4 * avg_quality;
264
265 Ok(DimensionScore {
266 name: "retrieval".into(),
267 score: score.clamp(0.0, 1.0),
268 sample_count: query_results.len(),
269 explanation: format!(
270 "{:.1}% successful queries, avg quality {:.2}",
271 success_rate * 100.0,
272 avg_quality
273 ),
274 })
275 }
276
277 pub fn evaluate_summarization(&self, memories: &[Memory]) -> Result<DimensionScore, EvalError> {
283 if memories.is_empty() {
284 return Ok(DimensionScore {
285 name: "summarization".into(),
286 score: 0.0,
287 sample_count: 0,
288 explanation: "No memories to evaluate".into(),
289 });
290 }
291
292 let semantic_count = memories
293 .iter()
294 .filter(|m| m.memory_class == MemoryClass::Semantic)
295 .count();
296 let episodic_count = memories
297 .iter()
298 .filter(|m| m.memory_class == MemoryClass::Episodic)
299 .count();
300 let total = memories.len();
301
302 let semantic_ratio = semantic_count as f32 / total as f32;
304 let episodic_ratio = episodic_count as f32 / total as f32;
305
306 let score = if semantic_ratio < 0.1 {
308 0.3 + 7.0 * semantic_ratio } else if episodic_ratio < 0.2 {
311 0.5 + 2.5 * episodic_ratio } else {
314 0.8 + 0.2 * (1.0 - (semantic_ratio - 0.25).abs() * 4.0).max(0.0)
316 };
317
318 Ok(DimensionScore {
319 name: "summarization".into(),
320 score: score.clamp(0.0, 1.0),
321 sample_count: total,
322 explanation: format!(
323 "{:.1}% semantic, {:.1}% episodic — {}",
324 semantic_ratio * 100.0,
325 episodic_ratio * 100.0,
326 if score > 0.7 {
327 "healthy balance"
328 } else {
329 "needs attention"
330 }
331 ),
332 })
333 }
334
335 pub fn evaluate_isolation(&self, memories: &[Memory]) -> Result<DimensionScore, EvalError> {
341 let memories_with_source: Vec<_> = memories
342 .iter()
343 .filter(|m| m.source_session.is_some())
344 .collect();
345
346 if memories_with_source.len() < 2 {
347 return Ok(DimensionScore {
348 name: "isolation".into(),
349 score: 0.5, sample_count: memories_with_source.len(),
351 explanation: "Need memories from 2+ sources for isolation evaluation".into(),
352 });
353 }
354
355 let sources: std::collections::HashSet<String> = memories_with_source
357 .iter()
358 .filter_map(|m| m.source_session.clone())
359 .collect();
360
361 let source_vec: Vec<_> = sources.iter().collect();
362 let mut total_overlap = 0.0f32;
363 let mut cross_source_pairs = 0usize;
364
365 for i in 0..source_vec.len() {
366 for j in (i + 1)..source_vec.len() {
367 let mems_i: Vec<_> = memories_with_source
368 .iter()
369 .filter(|m| m.source_session.as_ref() == Some(source_vec[i]))
370 .filter_map(|m| m.rho.as_ref())
371 .collect();
372 let mems_j: Vec<_> = memories_with_source
373 .iter()
374 .filter(|m| m.source_session.as_ref() == Some(source_vec[j]))
375 .filter_map(|m| m.rho.as_ref())
376 .collect();
377
378 for rho_i in &mems_i {
379 for rho_j in &mems_j {
380 total_overlap += rho_i.overlap(rho_j);
381 cross_source_pairs += 1;
382 }
383 }
384 }
385 }
386
387 let avg_overlap = if cross_source_pairs > 0 {
388 total_overlap / cross_source_pairs as f32
389 } else {
390 0.0
391 };
392
393 let score = (1.0 - avg_overlap / self.config.isolation_healthy_threshold).clamp(0.0, 1.0);
396
397 Ok(DimensionScore {
398 name: "isolation".into(),
399 score,
400 sample_count: memories_with_source.len(),
401 explanation: format!(
402 "{} sources, {} cross-source pairs, avg overlap {:.3} — {}",
403 sources.len(),
404 cross_source_pairs,
405 avg_overlap,
406 if score > 0.7 {
407 "good isolation"
408 } else if score > 0.4 {
409 "moderate interference"
410 } else {
411 "poor isolation — source contamination risk"
412 }
413 ),
414 })
415 }
416
417 pub fn evaluate_inference(&self, memories: &[Memory]) -> Result<DimensionScore, EvalError> {
423 if memories.is_empty() {
424 return Ok(DimensionScore {
425 name: "inference".into(),
426 score: 0.0,
427 sample_count: 0,
428 explanation: "No memories to evaluate".into(),
429 });
430 }
431
432 let verified_or_durable = memories
433 .iter()
434 .filter(|m| {
435 matches!(
436 m.verifiability,
437 VerifiabilityStage::Verified | VerifiabilityStage::Durable
438 )
439 })
440 .count();
441
442 let speculative = memories
443 .iter()
444 .filter(|m| m.verifiability == VerifiabilityStage::Speculative)
445 .count();
446
447 let high_confidence_ratio = verified_or_durable as f32 / memories.len() as f32;
448 let speculation_penalty = (speculative as f32 / memories.len() as f32).min(0.3);
449
450 let score = (high_confidence_ratio * 1.2 - speculation_penalty).clamp(0.0, 1.0);
451
452 Ok(DimensionScore {
453 name: "inference".into(),
454 score,
455 sample_count: memories.len(),
456 explanation: format!(
457 "{:.1}% verified/durable, {:.1}% speculative",
458 high_confidence_ratio * 100.0,
459 speculation_penalty * 100.0 / 0.3
460 ),
461 })
462 }
463
464 pub fn evaluate_reproduction(&self, memories: &[Memory]) -> Result<DimensionScore, EvalError> {
470 if memories.is_empty() {
471 return Ok(DimensionScore {
472 name: "reproduction".into(),
473 score: 0.0,
474 sample_count: 0,
475 explanation: "No memories to evaluate".into(),
476 });
477 }
478
479 let procedural = memories
480 .iter()
481 .filter(|m| m.memory_class == MemoryClass::Procedural)
482 .count();
483
484 let ratio = procedural as f32 / memories.len() as f32;
485 let score = if ratio < 0.05 {
487 ratio / 0.05 } else if ratio <= 0.25 {
489 1.0 } else {
491 1.0 - (ratio - 0.25) * 2.0 };
493
494 Ok(DimensionScore {
495 name: "reproduction".into(),
496 score: score.clamp(0.0, 1.0),
497 sample_count: memories.len(),
498 explanation: format!("{:.1}% procedural memories", ratio * 100.0),
499 })
500 }
501
502 pub fn evaluate_learning(&self, memories: &[Memory]) -> Result<DimensionScore, EvalError> {
507 if memories.len() < self.config.min_sample_size {
508 return Ok(DimensionScore {
509 name: "learning".into(),
510 score: 0.5,
511 sample_count: memories.len(),
512 explanation: "Insufficient data for learning evaluation".into(),
513 });
514 }
515
516 let mut class_counts: Vec<(MemoryClass, usize)> = Vec::new();
519 for m in memories {
520 let class = m.memory_class;
521 if let Some(entry) = class_counts.iter_mut().find(|(c, _)| *c == class) {
522 entry.1 += 1;
523 } else {
524 class_counts.push((class, 1));
525 }
526 }
527
528 let total = memories.len() as f32;
529 let mut entropy = 0.0f32;
530 for &(_class, count) in &class_counts {
531 let p = count as f32 / total;
532 if p > 0.0 {
533 entropy -= p * p.ln();
534 }
535 }
536 let max_entropy = (4.0_f32).ln(); let normalized_entropy = entropy / max_entropy;
538
539 let agent_scoped = memories
541 .iter()
542 .filter(|m| m.scope == MemoryScope::Agent)
543 .count();
544 let session_scoped = memories
545 .iter()
546 .filter(|m| m.scope == MemoryScope::Session)
547 .count();
548 let user_scoped = memories
549 .iter()
550 .filter(|m| m.scope == MemoryScope::User)
551 .count();
552
553 let unique_scopes = [agent_scoped, session_scoped, user_scoped]
554 .iter()
555 .filter(|&&c| c > 0)
556 .count();
557 let scope_score = unique_scopes as f32 / 3.0;
558
559 let score = 0.6 * normalized_entropy + 0.4 * scope_score;
560
561 Ok(DimensionScore {
562 name: "learning".into(),
563 score: score.clamp(0.0, 1.0),
564 sample_count: memories.len(),
565 explanation: format!(
566 "class entropy {:.2}, {} scopes active",
567 entropy, unique_scopes
568 ),
569 })
570 }
571
572 pub fn evaluate_habituation(
577 &self,
578 patterns: Option<&[mimirs_identity::HabituatedPattern]>,
579 ) -> Result<DimensionScore, EvalError> {
580 let patterns = match patterns {
581 Some(p) if !p.is_empty() => p,
582 _ => {
583 return Ok(DimensionScore {
584 name: "habituation".into(),
585 score: 0.3, sample_count: 0,
587 explanation: "No habituated patterns yet — reinforce patterns through repeated activation".into(),
588 });
589 }
590 };
591
592 let habituated_count = patterns
593 .iter()
594 .filter(|p| p.activation_count >= 3 && p.stability >= 0.7)
595 .count();
596
597 let avg_stability =
598 patterns.iter().map(|p| p.stability).sum::<f32>() / patterns.len() as f32;
599 let habituation_ratio = habituated_count as f32 / patterns.len() as f32;
600
601 let score = 0.5 * habituation_ratio + 0.5 * avg_stability;
602
603 Ok(DimensionScore {
604 name: "habituation".into(),
605 score: score.clamp(0.0, 1.0),
606 sample_count: patterns.len(),
607 explanation: format!(
608 "{}/{} habituated, avg stability {:.2}",
609 habituated_count,
610 patterns.len(),
611 avg_stability
612 ),
613 })
614 }
615
616 fn generate_recommendations(&self, dimensions: &[DimensionScore]) -> Vec<String> {
619 let mut recommendations = Vec::new();
620
621 for dim in dimensions {
622 if dim.score < 0.4 {
623 recommendations.push(format!(
624 "⚠ {} is critically low ({:.1}%) — {}",
625 dim.name,
626 dim.score * 100.0,
627 dim.explanation
628 ));
629 } else if dim.score < 0.6 {
630 recommendations.push(format!(
631 "→ {} could be improved ({:.1}%) — {}",
632 dim.name,
633 dim.score * 100.0,
634 dim.explanation
635 ));
636 }
637 }
638
639 if recommendations.is_empty() {
640 recommendations.push("All memory dimensions are healthy!".into());
641 }
642
643 recommendations
644 }
645}
646
647fn weighted_score(dimensions: &[DimensionScore], weights: &EvalWeights) -> f32 {
648 let weight_sum = weights.retrieval
649 + weights.summarization
650 + weights.isolation
651 + weights.inference
652 + weights.reproduction
653 + weights.learning
654 + weights.habituation;
655
656 let weighted_sum = weights.retrieval * dimensions[0].score
657 + weights.summarization * dimensions[1].score
658 + weights.isolation * dimensions[2].score
659 + weights.inference * dimensions[3].score
660 + weights.reproduction * dimensions[4].score
661 + weights.learning * dimensions[5].score
662 + weights.habituation * dimensions[6].score;
663
664 if weight_sum > 0.0 {
665 (weighted_sum / weight_sum).clamp(0.0, 1.0)
666 } else {
667 0.0
668 }
669}
670
671impl Default for EvalEngine {
672 fn default() -> Self {
673 Self::new()
674 }
675}
676
#[cfg(test)]
mod tests {
    use super::*;
    use mimirs_core::{DensityMemory, MemoryId};

    /// Builds a memory with fixed content whose density matrix comes from
    /// `DensityMemory::from_pure` applied to a 128-element vector that is 1.0
    /// in its first slot and 0.0 elsewhere.
    fn test_memory(
        class: MemoryClass,
        scope: MemoryScope,
        verifiability: VerifiabilityStage,
        source: Option<&str>,
    ) -> Memory {
        let mut amplitudes = vec![0.0f32; 128];
        amplitudes[0] = 1.0;

        Memory {
            id: MemoryId::new(),
            content: "Test content".into(),
            metadata: Default::default(),
            scope,
            verifiability,
            memory_class: class,
            rho: Some(DensityMemory::from_pure(&amplitudes).unwrap()),
            qrc_state: None,
            scramble_score: None,
            source_session: source.map(String::from),
        }
    }

    /// A full evaluation yields seven dimensions and an overall score in range.
    #[test]
    fn test_eval_report_generation() {
        let memories = vec![
            test_memory(MemoryClass::Semantic, MemoryScope::Agent, VerifiabilityStage::Verified, None),
            test_memory(MemoryClass::Semantic, MemoryScope::Agent, VerifiabilityStage::Durable, None),
            test_memory(MemoryClass::Episodic, MemoryScope::Session, VerifiabilityStage::Speculative, Some("s1")),
            test_memory(MemoryClass::Episodic, MemoryScope::Session, VerifiabilityStage::Corroborated, Some("s1")),
            test_memory(MemoryClass::Procedural, MemoryScope::Agent, VerifiabilityStage::Verified, None),
        ];
        let single_hit = QuantumMeasurementResult {
            id: MemoryId::new(),
            expected: 0.8,
            variance: 0.1,
            memory: memories[0].rho.clone().unwrap(),
            isolation_score: 1.0,
        };
        let query_results = vec![vec![single_hit]];

        let engine = EvalEngine::new();
        let report = engine.evaluate(&memories, &query_results, None).unwrap();

        assert_eq!(report.dimensions.len(), 7);
        assert!((0.0..=1.0).contains(&report.overall_score));
    }

    /// Without query results the retrieval dimension reports zero samples.
    #[test]
    fn test_eval_retrieval_empty() {
        let memories = vec![
            test_memory(MemoryClass::Semantic, MemoryScope::Agent, VerifiabilityStage::Durable, None),
            test_memory(MemoryClass::Semantic, MemoryScope::Agent, VerifiabilityStage::Verified, None),
            test_memory(MemoryClass::Episodic, MemoryScope::Session, VerifiabilityStage::Speculative, None),
            test_memory(MemoryClass::Procedural, MemoryScope::Agent, VerifiabilityStage::Corroborated, None),
            test_memory(MemoryClass::Episodic, MemoryScope::User, VerifiabilityStage::Durable, None),
        ];

        let engine = EvalEngine::with_config(EvalConfig::default());
        let report = engine.evaluate(&memories, &[], None).unwrap();

        let retrieval = report.dimension("retrieval").unwrap();
        assert_eq!(retrieval.sample_count, 0);
    }

    /// The summarization score always stays within `0.0..=1.0`.
    #[test]
    fn test_eval_summarization() {
        let memories = vec![
            test_memory(MemoryClass::Semantic, MemoryScope::Agent, VerifiabilityStage::Verified, None),
            test_memory(MemoryClass::Semantic, MemoryScope::Agent, VerifiabilityStage::Durable, None),
            test_memory(MemoryClass::Episodic, MemoryScope::Session, VerifiabilityStage::Speculative, None),
            test_memory(MemoryClass::Procedural, MemoryScope::Agent, VerifiabilityStage::Corroborated, None),
            test_memory(MemoryClass::Semantic, MemoryScope::User, VerifiabilityStage::Verified, None),
        ];

        let engine = EvalEngine::new();
        let report = engine.evaluate(&memories, &[], None).unwrap();

        let summarization = report.dimension("summarization").unwrap();
        assert!((0.0..=1.0).contains(&summarization.score));
    }
}