#![warn(missing_docs)]
use mimirs_core::{Memory, MemoryClass, MemoryScope, QuantumMeasurementResult, VerifiabilityStage};
use serde::{Deserialize, Serialize};
/// Errors that an evaluation run can report.
#[derive(Debug, thiserror::Error)]
pub enum EvalError {
/// Fewer memories were supplied than the configured minimum sample size.
#[error("Need at least {minimum} memories for evaluation, got {actual}")]
InsufficientData {
/// Number of memories the evaluation requires.
minimum: usize,
/// Number of memories that were actually supplied.
actual: usize,
},
/// A single dimension could not be evaluated; the payload is the message
/// shown after "Dimension evaluation failed: ".
#[error("Dimension evaluation failed: {0}")]
DimensionFailed(String),
/// An internal failure; the payload is the message shown after
/// "Internal error: ".
#[error("Internal error: {0}")]
Internal(String),
}
/// Tunable thresholds and weights that drive an evaluation run.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct EvalConfig {
/// Minimum `expected` value a query result must reach to count as a
/// high-quality result in the retrieval dimension.
pub retrieval_threshold: f32,
/// Average cross-source overlap at (or above) which the isolation score
/// bottoms out at 0; the score is `1 - avg_overlap / threshold`, clamped.
pub isolation_healthy_threshold: f32,
/// Minimum verifiability stage for facts.
/// NOTE(review): no evaluator visible here reads this field — confirm
/// where it is consumed.
pub min_fact_verifiability: VerifiabilityStage,
/// Minimum number of memories required before a full evaluation (and the
/// learning dimension) is attempted.
pub min_sample_size: usize,
/// Per-dimension weights used to fold the scores into the overall score.
pub dimension_weights: EvalWeights,
}
impl Default for EvalConfig {
fn default() -> Self {
Self {
retrieval_threshold: 0.7,
isolation_healthy_threshold: 0.3,
min_fact_verifiability: VerifiabilityStage::Corroborated,
min_sample_size: 5,
dimension_weights: EvalWeights::default(),
}
}
}
/// Relative weight of each evaluation dimension in the overall score.
///
/// The overall score divides by the sum of all weights, so the values do not
/// have to add up to 1.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct EvalWeights {
/// Weight of the "retrieval" dimension.
pub retrieval: f32,
/// Weight of the "summarization" dimension.
pub summarization: f32,
/// Weight of the "isolation" dimension.
pub isolation: f32,
/// Weight of the "inference" dimension.
pub inference: f32,
/// Weight of the "reproduction" dimension.
pub reproduction: f32,
/// Weight of the "learning" dimension.
pub learning: f32,
/// Weight of the "habituation" dimension.
pub habituation: f32,
}
impl Default for EvalWeights {
fn default() -> Self {
Self {
retrieval: 0.2,
summarization: 0.15,
isolation: 0.15,
inference: 0.1,
reproduction: 0.1,
learning: 0.15,
habituation: 0.15,
}
}
}
/// Result of evaluating a single dimension.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct DimensionScore {
/// Dimension identifier, e.g. "retrieval" or "isolation".
pub name: String,
/// Score for the dimension; the evaluators clamp it to the 0.0..=1.0 range.
pub score: f32,
/// Number of items (queries, memories or patterns) the score is based on.
pub sample_count: usize,
/// Human-readable summary of how the score came about.
pub explanation: String,
}
/// Complete outcome of one evaluation run.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct EvalReport {
/// One entry per evaluated dimension.
pub dimensions: Vec<DimensionScore>,
/// Weighted combination of the dimension scores, in 0.0..=1.0.
pub overall_score: f32,
/// Number of memories that were evaluated.
pub total_memories: usize,
/// Number of distinct source sessions found among the memories.
pub source_sessions: usize,
/// UTC time at which the report was produced, formatted as RFC 3339.
pub timestamp: String,
/// Suggestions derived from low-scoring dimensions (or a single
/// all-healthy message).
pub recommendations: Vec<String>,
}
impl EvalReport {
    /// Returns the first dimension whose name equals `name`, or `None` when
    /// the report has no such dimension.
    pub fn dimension(&self, name: &str) -> Option<&DimensionScore> {
        for candidate in &self.dimensions {
            if candidate.name == name {
                return Some(candidate);
            }
        }
        None
    }

    /// Returns `true` when every dimension scores at least `threshold`.
    ///
    /// A report without dimensions passes trivially; a NaN score never passes.
    pub fn all_dimensions_pass(&self, threshold: f32) -> bool {
        for dim in &self.dimensions {
            if dim.score >= threshold {
                continue;
            }
            return false;
        }
        true
    }
}
/// Scores a set of memories along seven dimensions and produces an
/// [`EvalReport`].
pub struct EvalEngine {
/// Thresholds and weights applied by every evaluation this engine runs.
pub config: EvalConfig,
}
impl EvalEngine {
/// Creates an engine that uses the default configuration.
pub fn new() -> Self {
    let config = EvalConfig::default();
    Self { config }
}
pub fn with_config(config: EvalConfig) -> Self {
Self { config }
}
pub fn evaluate(
&self,
memories: &[Memory],
query_results: &[Vec<QuantumMeasurementResult>],
identity_patterns: Option<&[mimirs_identity::HabituatedPattern]>,
) -> Result<EvalReport, EvalError> {
if memories.len() < self.config.min_sample_size {
return Err(EvalError::InsufficientData {
minimum: self.config.min_sample_size,
actual: memories.len(),
});
}
let dimensions = vec![
self.evaluate_retrieval(query_results)?,
self.evaluate_summarization(memories)?,
self.evaluate_isolation(memories)?,
self.evaluate_inference(memories)?,
self.evaluate_reproduction(memories)?,
self.evaluate_learning(memories)?,
self.evaluate_habituation(identity_patterns)?,
];
let weights = &self.config.dimension_weights;
let overall_score = self::weighted_score(&dimensions, weights);
let source_sessions: std::collections::HashSet<String> = memories
.iter()
.filter_map(|m| m.source_session.clone())
.collect();
let recommendations = self.generate_recommendations(&dimensions);
Ok(EvalReport {
dimensions,
overall_score,
total_memories: memories.len(),
source_sessions: source_sessions.len(),
timestamp: chrono::Utc::now().to_rfc3339(),
recommendations,
})
}
pub fn evaluate_retrieval(
&self,
query_results: &[Vec<QuantumMeasurementResult>],
) -> Result<DimensionScore, EvalError> {
if query_results.is_empty() {
return Ok(DimensionScore {
name: "retrieval".into(),
score: 0.5, sample_count: 0,
explanation: "No query results to evaluate".into(),
});
}
let mut successful_queries = 0;
let mut total_results = 0;
let mut high_quality_results = 0;
for results in query_results {
total_results += results.len();
let has_match = results
.iter()
.any(|r| r.expected >= self.config.retrieval_threshold && r.isolation_score >= 0.5);
if has_match {
successful_queries += 1;
}
high_quality_results += results
.iter()
.filter(|r| r.expected >= self.config.retrieval_threshold)
.count();
}
let success_rate = successful_queries as f32 / query_results.len() as f32;
let avg_quality = if total_results > 0 {
high_quality_results as f32 / total_results as f32
} else {
0.0
};
let score = 0.6 * success_rate + 0.4 * avg_quality;
Ok(DimensionScore {
name: "retrieval".into(),
score: score.clamp(0.0, 1.0),
sample_count: query_results.len(),
explanation: format!(
"{:.1}% successful queries, avg quality {:.2}",
success_rate * 100.0,
avg_quality
),
})
}
/// Scores the balance between semantic and episodic memories.
///
/// Three regimes are used: fewer than 10% semantic memories, otherwise fewer
/// than 20% episodic memories, otherwise a "balanced" regime that peaks when
/// 25% of the memories are semantic. An empty input scores 0.
pub fn evaluate_summarization(&self, memories: &[Memory]) -> Result<DimensionScore, EvalError> {
    let total = memories.len();
    if total == 0 {
        return Ok(DimensionScore {
            name: "summarization".into(),
            score: 0.0,
            sample_count: 0,
            explanation: "No memories to evaluate".into(),
        });
    }

    // One pass over the memories tallies both classes of interest.
    let mut semantic_count = 0usize;
    let mut episodic_count = 0usize;
    for memory in memories {
        if memory.memory_class == MemoryClass::Semantic {
            semantic_count += 1;
        } else if memory.memory_class == MemoryClass::Episodic {
            episodic_count += 1;
        }
    }
    let semantic_ratio = semantic_count as f32 / total as f32;
    let episodic_ratio = episodic_count as f32 / total as f32;

    let raw_score = if semantic_ratio < 0.1 {
        0.3 + 7.0 * semantic_ratio
    } else if episodic_ratio < 0.2 {
        0.5 + 2.5 * episodic_ratio
    } else {
        let closeness = (1.0 - (semantic_ratio - 0.25).abs() * 4.0).max(0.0);
        0.8 + 0.2 * closeness
    };

    let verdict = if raw_score > 0.7 {
        "healthy balance"
    } else {
        "needs attention"
    };
    Ok(DimensionScore {
        name: "summarization".into(),
        score: raw_score.clamp(0.0, 1.0),
        sample_count: total,
        explanation: format!(
            "{:.1}% semantic, {:.1}% episodic — {}",
            semantic_ratio * 100.0,
            episodic_ratio * 100.0,
            verdict
        ),
    })
}
/// Scores how well memories from different source sessions stay separated.
///
/// Every memory that carries a `source_session` is grouped by that session.
/// For each pair of distinct sessions, the `overlap` of every cross-session
/// pair of `rho` states is averaged; the score is
/// `1 - avg_overlap / config.isolation_healthy_threshold`, clamped to
/// 0.0..=1.0. Memories without `rho` count towards their session but
/// contribute no pairs.
///
/// A neutral 0.5 is returned when fewer than two *distinct* sessions are
/// present, because there is nothing to compare across sources.
/// `sample_count` is always the number of memories that carry a source.
pub fn evaluate_isolation(&self, memories: &[Memory]) -> Result<DimensionScore, EvalError> {
    // Group once by session. A BTreeMap keeps the session order (and with it
    // the floating-point summation order) deterministic between runs.
    let mut by_source: std::collections::BTreeMap<&str, Vec<&Memory>> =
        std::collections::BTreeMap::new();
    let mut sourced_count = 0usize;
    for memory in memories {
        if let Some(source) = memory.source_session.as_deref() {
            by_source.entry(source).or_default().push(memory);
            sourced_count += 1;
        }
    }

    // Isolation is a cross-source property: the guard must look at the number
    // of distinct sources, not at the number of sourced memories.
    if by_source.len() < 2 {
        return Ok(DimensionScore {
            name: "isolation".into(),
            score: 0.5,
            sample_count: sourced_count,
            explanation: "Need memories from 2+ sources for isolation evaluation".into(),
        });
    }

    // Density states per session, extracted once instead of per session pair.
    let states_by_source: Vec<Vec<_>> = by_source
        .values()
        .map(|group| group.iter().filter_map(|m| m.rho.as_ref()).collect())
        .collect();

    let mut total_overlap = 0.0f32;
    let mut cross_source_pairs = 0usize;
    for (i, states_i) in states_by_source.iter().enumerate() {
        for states_j in &states_by_source[i + 1..] {
            for rho_i in states_i {
                for rho_j in states_j {
                    total_overlap += rho_i.overlap(rho_j);
                    cross_source_pairs += 1;
                }
            }
        }
    }

    let avg_overlap = if cross_source_pairs > 0 {
        total_overlap / cross_source_pairs as f32
    } else {
        0.0
    };
    let score = (1.0 - avg_overlap / self.config.isolation_healthy_threshold).clamp(0.0, 1.0);

    let verdict = if score > 0.7 {
        "good isolation"
    } else if score > 0.4 {
        "moderate interference"
    } else {
        "poor isolation — source contamination risk"
    };
    Ok(DimensionScore {
        name: "isolation".into(),
        score,
        sample_count: sourced_count,
        explanation: format!(
            "{} sources, {} cross-source pairs, avg overlap {:.3} — {}",
            by_source.len(),
            cross_source_pairs,
            avg_overlap,
            verdict
        ),
    })
}
/// Scores how much of the memory set rests on well-verified facts.
///
/// The score is `1.2 * verified_or_durable_ratio - speculation_penalty`,
/// clamped to 0.0..=1.0, where the penalty is the share of speculative
/// memories capped at 0.3. An empty input scores 0.
///
/// The explanation reports the actual share of speculative memories, not the
/// capped penalty.
pub fn evaluate_inference(&self, memories: &[Memory]) -> Result<DimensionScore, EvalError> {
    if memories.is_empty() {
        return Ok(DimensionScore {
            name: "inference".into(),
            score: 0.0,
            sample_count: 0,
            explanation: "No memories to evaluate".into(),
        });
    }

    let total = memories.len() as f32;
    let verified_or_durable = memories
        .iter()
        .filter(|m| {
            matches!(
                m.verifiability,
                VerifiabilityStage::Verified | VerifiabilityStage::Durable
            )
        })
        .count();
    let speculative = memories
        .iter()
        .filter(|m| m.verifiability == VerifiabilityStage::Speculative)
        .count();

    let high_confidence_ratio = verified_or_durable as f32 / total;
    let speculative_ratio = speculative as f32 / total;
    // The penalty is capped so speculation alone cannot subtract more than 0.3.
    let speculation_penalty = speculative_ratio.min(0.3);
    let score = (high_confidence_ratio * 1.2 - speculation_penalty).clamp(0.0, 1.0);

    Ok(DimensionScore {
        name: "inference".into(),
        score,
        sample_count: memories.len(),
        explanation: format!(
            "{:.1}% verified/durable, {:.1}% speculative",
            high_confidence_ratio * 100.0,
            speculative_ratio * 100.0
        ),
    })
}
/// Scores the share of procedural memories.
///
/// A share between 5% and 25% scores 1.0. Below 5% the score ramps up
/// linearly from 0; above 25% it falls by 2 points per point of extra share.
/// An empty input scores 0.
pub fn evaluate_reproduction(&self, memories: &[Memory]) -> Result<DimensionScore, EvalError> {
    let total = memories.len();
    if total == 0 {
        return Ok(DimensionScore {
            name: "reproduction".into(),
            score: 0.0,
            sample_count: 0,
            explanation: "No memories to evaluate".into(),
        });
    }

    let mut procedural = 0usize;
    for memory in memories {
        if memory.memory_class == MemoryClass::Procedural {
            procedural += 1;
        }
    }
    let ratio = procedural as f32 / total as f32;

    let raw_score = if ratio > 0.25 {
        // Too procedural-heavy: penalise the excess.
        1.0 - (ratio - 0.25) * 2.0
    } else if ratio >= 0.05 {
        1.0
    } else {
        // Too few procedures: ramp up towards the 5% mark.
        ratio / 0.05
    };

    Ok(DimensionScore {
        name: "reproduction".into(),
        score: raw_score.clamp(0.0, 1.0),
        sample_count: total,
        explanation: format!("{:.1}% procedural memories", ratio * 100.0),
    })
}
pub fn evaluate_learning(&self, memories: &[Memory]) -> Result<DimensionScore, EvalError> {
if memories.len() < self.config.min_sample_size {
return Ok(DimensionScore {
name: "learning".into(),
score: 0.5,
sample_count: memories.len(),
explanation: "Insufficient data for learning evaluation".into(),
});
}
let mut class_counts: Vec<(MemoryClass, usize)> = Vec::new();
for m in memories {
let class = m.memory_class;
if let Some(entry) = class_counts.iter_mut().find(|(c, _)| *c == class) {
entry.1 += 1;
} else {
class_counts.push((class, 1));
}
}
let total = memories.len() as f32;
let mut entropy = 0.0f32;
for &(_class, count) in &class_counts {
let p = count as f32 / total;
if p > 0.0 {
entropy -= p * p.ln();
}
}
let max_entropy = (4.0_f32).ln(); let normalized_entropy = entropy / max_entropy;
let agent_scoped = memories
.iter()
.filter(|m| m.scope == MemoryScope::Agent)
.count();
let session_scoped = memories
.iter()
.filter(|m| m.scope == MemoryScope::Session)
.count();
let user_scoped = memories
.iter()
.filter(|m| m.scope == MemoryScope::User)
.count();
let unique_scopes = [agent_scoped, session_scoped, user_scoped]
.iter()
.filter(|&&c| c > 0)
.count();
let scope_score = unique_scopes as f32 / 3.0;
let score = 0.6 * normalized_entropy + 0.4 * scope_score;
Ok(DimensionScore {
name: "learning".into(),
score: score.clamp(0.0, 1.0),
sample_count: memories.len(),
explanation: format!(
"class entropy {:.2}, {} scopes active",
entropy, unique_scopes
),
})
}
/// Scores how many identity patterns have become habitual.
///
/// A pattern counts as habituated once it has at least 3 activations and a
/// stability of at least 0.7. The score is the mean of the habituated share
/// and the average stability, clamped to 0.0..=1.0. Without any patterns
/// (`None` or an empty slice) a fixed 0.3 is returned.
pub fn evaluate_habituation(
    &self,
    patterns: Option<&[mimirs_identity::HabituatedPattern]>,
) -> Result<DimensionScore, EvalError> {
    let patterns = match patterns.filter(|candidates| !candidates.is_empty()) {
        Some(non_empty) => non_empty,
        None => {
            return Ok(DimensionScore {
                name: "habituation".into(),
                score: 0.3,
                sample_count: 0,
                explanation: "No habituated patterns yet — reinforce patterns through repeated activation".into(),
            });
        }
    };

    let pattern_count = patterns.len();
    let mut habituated_count = 0usize;
    for pattern in patterns {
        if pattern.activation_count >= 3 && pattern.stability >= 0.7 {
            habituated_count += 1;
        }
    }
    let stability_total: f32 = patterns.iter().map(|pattern| pattern.stability).sum();

    let avg_stability = stability_total / pattern_count as f32;
    let habituation_ratio = habituated_count as f32 / pattern_count as f32;
    let score = 0.5 * habituation_ratio + 0.5 * avg_stability;

    Ok(DimensionScore {
        name: "habituation".into(),
        score: score.clamp(0.0, 1.0),
        sample_count: pattern_count,
        explanation: format!(
            "{}/{} habituated, avg stability {:.2}",
            habituated_count, pattern_count, avg_stability
        ),
    })
}
/// Turns low dimension scores into human-readable recommendations.
///
/// Scores below 0.4 are flagged as critical, scores below 0.6 as improvable.
/// When nothing is flagged, a single all-healthy message is returned.
fn generate_recommendations(&self, dimensions: &[DimensionScore]) -> Vec<String> {
    let mut notes: Vec<String> = dimensions
        .iter()
        .filter_map(|dim| {
            let percent = dim.score * 100.0;
            if dim.score < 0.4 {
                Some(format!(
                    "⚠ {} is critically low ({:.1}%) — {}",
                    dim.name, percent, dim.explanation
                ))
            } else if dim.score < 0.6 {
                Some(format!(
                    "→ {} could be improved ({:.1}%) — {}",
                    dim.name, percent, dim.explanation
                ))
            } else {
                None
            }
        })
        .collect();

    if notes.is_empty() {
        notes.push("All memory dimensions are healthy!".into());
    }
    notes
}
}
/// Combines dimension scores into one overall score in 0.0..=1.0.
///
/// Each dimension is matched to its weight by name, so the result does not
/// depend on the order of `dimensions`, and a short or empty slice cannot
/// cause an out-of-bounds panic. Dimensions with an unknown name are ignored.
/// The weighted sum is divided by the sum of the weights that were actually
/// applied; when that sum is not positive the result is 0.
fn weighted_score(dimensions: &[DimensionScore], weights: &EvalWeights) -> f32 {
    let mut weight_sum = 0.0f32;
    let mut weighted_sum = 0.0f32;

    for dim in dimensions {
        let weight = match dim.name.as_str() {
            "retrieval" => weights.retrieval,
            "summarization" => weights.summarization,
            "isolation" => weights.isolation,
            "inference" => weights.inference,
            "reproduction" => weights.reproduction,
            "learning" => weights.learning,
            "habituation" => weights.habituation,
            // No weight is configured for this name, so it cannot contribute.
            _ => continue,
        };
        weight_sum += weight;
        weighted_sum += weight * dim.score;
    }

    if weight_sum > 0.0 {
        (weighted_sum / weight_sum).clamp(0.0, 1.0)
    } else {
        0.0
    }
}
impl Default for EvalEngine {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use mimirs_core::{DensityMemory, MemoryId};

    /// Builds a memory with fixed content whose density state comes from a
    /// 128-element vector that is 1.0 in slot 0 and 0.0 elsewhere.
    fn test_memory(
        class: MemoryClass,
        scope: MemoryScope,
        verifiability: VerifiabilityStage,
        source: Option<&str>,
    ) -> Memory {
        let mut amplitudes = vec![0.0f32; 128];
        amplitudes[0] = 1.0;
        Memory {
            id: MemoryId::new(),
            content: "Test content".into(),
            metadata: Default::default(),
            scope,
            verifiability,
            memory_class: class,
            rho: Some(DensityMemory::from_pure(&amplitudes).unwrap()),
            qrc_state: None,
            scramble_score: None,
            source_session: source.map(String::from),
        }
    }

    /// Expands a table of (class, scope, verifiability, source) rows into memories.
    fn memories_from(
        rows: Vec<(MemoryClass, MemoryScope, VerifiabilityStage, Option<&str>)>,
    ) -> Vec<Memory> {
        rows.into_iter()
            .map(|(class, scope, stage, source)| test_memory(class, scope, stage, source))
            .collect()
    }

    #[test]
    fn test_eval_report_generation() {
        let engine = EvalEngine::new();
        let memories = memories_from(vec![
            (MemoryClass::Semantic, MemoryScope::Agent, VerifiabilityStage::Verified, None),
            (MemoryClass::Semantic, MemoryScope::Agent, VerifiabilityStage::Durable, None),
            (MemoryClass::Episodic, MemoryScope::Session, VerifiabilityStage::Speculative, Some("s1")),
            (MemoryClass::Episodic, MemoryScope::Session, VerifiabilityStage::Corroborated, Some("s1")),
            (MemoryClass::Procedural, MemoryScope::Agent, VerifiabilityStage::Verified, None),
        ]);
        // A single query with one strong, well-isolated hit.
        let hit = QuantumMeasurementResult {
            id: MemoryId::new(),
            expected: 0.8,
            variance: 0.1,
            memory: memories[0].rho.clone().unwrap(),
            isolation_score: 1.0,
        };
        let query_results = vec![vec![hit]];

        let report = engine.evaluate(&memories, &query_results, None).unwrap();

        assert_eq!(report.dimensions.len(), 7);
        assert!((0.0..=1.0).contains(&report.overall_score));
    }

    #[test]
    fn test_eval_retrieval_empty() {
        let engine = EvalEngine::with_config(EvalConfig::default());
        let memories = memories_from(vec![
            (MemoryClass::Semantic, MemoryScope::Agent, VerifiabilityStage::Durable, None),
            (MemoryClass::Semantic, MemoryScope::Agent, VerifiabilityStage::Verified, None),
            (MemoryClass::Episodic, MemoryScope::Session, VerifiabilityStage::Speculative, None),
            (MemoryClass::Procedural, MemoryScope::Agent, VerifiabilityStage::Corroborated, None),
            (MemoryClass::Episodic, MemoryScope::User, VerifiabilityStage::Durable, None),
        ]);

        let report = engine.evaluate(&memories, &[], None).unwrap();

        let retrieval = report.dimension("retrieval").unwrap();
        assert_eq!(retrieval.sample_count, 0);
    }

    #[test]
    fn test_eval_summarization() {
        let engine = EvalEngine::new();
        let memories = memories_from(vec![
            (MemoryClass::Semantic, MemoryScope::Agent, VerifiabilityStage::Verified, None),
            (MemoryClass::Semantic, MemoryScope::Agent, VerifiabilityStage::Durable, None),
            (MemoryClass::Episodic, MemoryScope::Session, VerifiabilityStage::Speculative, None),
            (MemoryClass::Procedural, MemoryScope::Agent, VerifiabilityStage::Corroborated, None),
            (MemoryClass::Semantic, MemoryScope::User, VerifiabilityStage::Verified, None),
        ]);

        let report = engine.evaluate(&memories, &[], None).unwrap();

        let summarization = report.dimension("summarization").unwrap();
        assert!((0.0..=1.0).contains(&summarization.score));
    }
}