use std::collections::HashMap;
use std::time::Instant;
use super::{Benchmark, BenchmarkResult};
use crate::storage::queries::{create_memory, update_memory};
use crate::storage::Storage;
use crate::types::{CreateMemoryInput, MemoryType, StorageConfig, StorageMode, UpdateMemoryInput};
/// Five-dimension memory evaluation benchmark.
///
/// Each dimension produces a score in `[0, 1]`; the scores are then combined
/// into a single weighted score using [`dimension_weights`](Self::dimension_weights).
#[derive(Debug, Clone)]
pub struct LongMemEvalBenchmark {
    /// Relative weight of each dimension, keyed by dimension name
    /// (e.g. `"information_retention"`). The weighted score is divided by the
    /// sum of all weights, so the values do not have to add up to 1.
    pub dimension_weights: HashMap<String, f64>,
}
impl Default for LongMemEvalBenchmark {
fn default() -> Self {
let mut weights = HashMap::new();
weights.insert("information_retention".to_string(), 0.25);
weights.insert("temporal_reasoning".to_string(), 0.20);
weights.insert("knowledge_update".to_string(), 0.20);
weights.insert("multi_hop".to_string(), 0.20);
weights.insert("contradiction_detection".to_string(), 0.15);
Self {
dimension_weights: weights,
}
}
}
/// One store-then-look-up scenario consumed by `run_cases`.
struct TestCase {
/// Memory contents that are stored before the lookup is performed.
setup_memories: Vec<String>,
/// Keyword used to find stored memories (matched as `%keyword%` with SQL `LIKE`).
query_keyword: String,
/// Text that a memory found for the keyword is expected to contain.
expected_content_substring: String,
}
impl LongMemEvalBenchmark {
/// Scores plain fact recall: stores a few facts about a person, a landmark
/// and a project, then looks each one up by keyword through `run_cases`.
///
/// Returns the fraction of scenarios that passed.
fn eval_information_retention(&self, storage: &Storage) -> f64 {
    // Small builder so each scenario reads as (memories, keyword, expected text).
    let scenario = |memories: &[&str], keyword: &str, expected: &str| TestCase {
        setup_memories: memories.iter().map(|m| m.to_string()).collect(),
        query_keyword: keyword.to_string(),
        expected_content_substring: expected.to_string(),
    };

    let scenarios = [
        scenario(
            &[
                "Alice is a software engineer at TechCorp",
                "Alice has 5 years of experience in Rust",
            ],
            "Alice",
            "engineer",
        ),
        scenario(
            &[
                "The Eiffel Tower is located in Paris, France",
                "The Eiffel Tower was built in 1889",
            ],
            "Eiffel",
            "Paris",
        ),
        scenario(&["Project Alpha deadline is Q3 2026"], "Alpha", "Q3 2026"),
    ];

    self.run_cases(storage, &scenarios)
}
/// Scores recall of date-bearing memories: meeting notes looked up by a
/// month prefix and sprint notes looked up by sprint name, via `run_cases`.
///
/// Returns the fraction of scenarios that passed.
fn eval_temporal_reasoning(&self, storage: &Storage) -> f64 {
    // Small builder so each scenario reads as (memories, keyword, expected text).
    let scenario = |memories: &[&str], keyword: &str, expected: &str| TestCase {
        setup_memories: memories.iter().map(|m| m.to_string()).collect(),
        query_keyword: keyword.to_string(),
        expected_content_substring: expected.to_string(),
    };

    let scenarios = [
        scenario(
            &[
                "Meeting on 2026-01-10: discussed Q1 roadmap",
                "Meeting on 2026-02-15: reviewed Q2 budget",
            ],
            "2026-01",
            "Q1 roadmap",
        ),
        scenario(
            &[
                "Sprint 42 started on 2026-03-01",
                "Sprint 42 ended on 2026-03-14 with 12 story points",
            ],
            "Sprint 42",
            "story points",
        ),
    ];

    self.run_cases(storage, &scenarios)
}
/// Scores whether revising a stored memory actually replaces its content.
///
/// Three checks, each worth one third of the score:
/// 1. a budget note revised from `$50,000` reads back containing `$75,000`;
/// 2. a config note revised from `timeout=30s` reads back containing `timeout=60s`;
/// 3. that same config row no longer matches `timeout=30s`.
///
/// A storage failure costs the affected checks their points instead of
/// panicking, so one broken dimension cannot abort the whole benchmark run.
fn eval_knowledge_update(&self, storage: &Storage) -> f64 {
    let total = 3usize;
    let mut correct = 0usize;

    // Stores `initial`, overwrites it with `revised`, then reads the row back.
    // Yields the memory id plus the content read back (`None` if the read
    // failed), or `None` altogether when the memory could not be created.
    let revise_and_read = |initial: &str, revised: &str| -> Option<_> {
        let created = storage
            .with_connection(|conn| {
                create_memory(
                    conn,
                    &CreateMemoryInput {
                        content: initial.to_string(),
                        memory_type: MemoryType::Note,
                        workspace: Some("longmemeval-bench".to_string()),
                        ..Default::default()
                    },
                )
            })
            .ok()?;

        // Only the content changes; every other field is left untouched.
        let update = UpdateMemoryInput {
            content: Some(revised.to_string()),
            memory_type: None,
            tags: None,
            metadata: None,
            importance: None,
            scope: None,
            ttl_seconds: None,
            event_time: None,
            trigger_pattern: None,
            media_url: None,
        };
        // A failed update is deliberately not fatal: the read-back below is
        // what gets scored, and it will simply show the stale content.
        let _ = storage.with_connection(|conn| update_memory(conn, created.id, &update));

        let stored: Option<String> = storage
            .with_connection(|conn| {
                conn.query_row(
                    "SELECT content FROM memories WHERE id = ?1",
                    [created.id],
                    |row| row.get(0),
                )
                .map_err(crate::error::EngramError::Database)
            })
            .ok();

        Some((created.id, stored))
    };

    // Check 1: the revised budget figure is what is stored.
    if let Some((_, Some(content))) = revise_and_read(
        "Budget for Q1 is $50,000",
        "Budget for Q1 is $75,000 (revised)",
    ) {
        if content.contains("$75,000") {
            correct += 1;
        }
    }

    if let Some((config_id, stored)) = revise_and_read(
        "Service config: timeout=30s",
        "Service config: timeout=60s (doubled for reliability)",
    ) {
        // Check 2: the revised timeout is what is stored.
        if stored.map_or(false, |content| content.contains("timeout=60s")) {
            correct += 1;
        }

        // Check 3: the old timeout is gone from that row. A failed query
        // counts as "still present" so it cannot earn the point.
        let stale_rows: i64 = storage
            .with_connection(|conn| {
                conn.query_row(
                    "SELECT COUNT(*) FROM memories WHERE content LIKE '%timeout=30s%' AND id = ?1",
                    [config_id],
                    |row| row.get(0),
                )
                .map_err(crate::error::EngramError::Database)
            })
            .unwrap_or(1);
        if stale_rows == 0 {
            correct += 1;
        }
    }

    correct as f64 / total as f64
}
/// Scores recall over chained facts (node links, an acquisition chain):
/// stores each chain and looks up one of its entities via `run_cases`.
///
/// Returns the fraction of scenarios that passed.
fn eval_multi_hop(&self, storage: &Storage) -> f64 {
    // Small builder so each scenario reads as (memories, keyword, expected text).
    let scenario = |memories: &[&str], keyword: &str, expected: &str| TestCase {
        setup_memories: memories.iter().map(|m| m.to_string()).collect(),
        query_keyword: keyword.to_string(),
        expected_content_substring: expected.to_string(),
    };

    let scenarios = [
        scenario(
            &[
                "Node A connects to Node B via link L1",
                "Node B connects to Node C via link L2",
            ],
            "Node B",
            "connects",
        ),
        scenario(
            &[
                "Company Acme acquired Startup X in 2024",
                "Startup X built the Zephyr product",
                "Zephyr product has 50,000 active users",
            ],
            "Zephyr",
            "users",
        ),
    ];

    self.run_cases(storage, &scenarios)
}
/// Scores whether both sides of a conflicting pair of facts stay retrievable.
///
/// For each pair, both statements are stored and the memories matching the
/// shared keyword are counted; the pair passes when at least two match.
/// Returns the fraction of the two pairs that passed.
fn eval_contradiction_detection(&self, storage: &Storage) -> f64 {
    // (older statement, newer statement, keyword common to both)
    let fact_pairs = [
        (
            "Server capacity is 100 concurrent users (from 2025-01 report)",
            "Server capacity is 500 concurrent users (from 2026-01 report)",
            "concurrent users",
        ),
        (
            "API rate limit is 100 req/min per client",
            "API rate limit is 1000 req/min per client (updated)",
            "rate limit",
        ),
    ];
    let pair_count = 2usize;

    // Best-effort insert: a failed write just makes the pair miss its count.
    let remember = |fact: &str| {
        let _ = storage.with_connection(|conn| {
            create_memory(
                conn,
                &CreateMemoryInput {
                    content: fact.to_string(),
                    memory_type: MemoryType::Note,
                    workspace: Some("longmemeval-bench".to_string()),
                    ..Default::default()
                },
            )
        });
    };

    let mut passed = 0usize;
    for &(older, newer, keyword) in fact_pairs.iter() {
        remember(older);
        remember(newer);

        let matching: i64 = storage
            .with_connection(|conn| {
                conn.query_row(
                    "SELECT COUNT(*) FROM memories WHERE content LIKE ?1",
                    [format!("%{}%", keyword)],
                    |row| row.get(0),
                )
                .map_err(crate::error::EngramError::Database)
            })
            .unwrap_or(0);

        if matching > 1 {
            passed += 1;
        }
    }

    passed as f64 / pair_count as f64
}
/// Runs store-then-look-up scenarios and returns the fraction that passed.
///
/// For every case the setup memories are stored (as notes in the
/// `longmemeval-bench` workspace), then the case passes when at least one
/// stored memory matches `%query_keyword%` and also contains
/// `expected_content_substring`.
///
/// Checking *any* matching memory matters: several memories usually share
/// the keyword, and an unordered `LIMIT 1` lookup would let an arbitrary
/// one of them decide the outcome.
///
/// An empty `cases` slice scores `1.0`. Storage errors are not propagated;
/// they make the affected case fail.
fn run_cases(&self, storage: &Storage, cases: &[TestCase]) -> f64 {
    if cases.is_empty() {
        return 1.0;
    }

    let mut correct = 0usize;
    for case in cases {
        for content in &case.setup_memories {
            // Best-effort insert: a failed write shows up as a failed lookup below.
            let _ = storage.with_connection(|conn| {
                create_memory(
                    conn,
                    &CreateMemoryInput {
                        content: content.clone(),
                        memory_type: MemoryType::Note,
                        workspace: Some("longmemeval-bench".to_string()),
                        ..Default::default()
                    },
                )
            });
        }

        let keyword_pattern = format!("%{}%", case.query_keyword);
        // `instr` does a case-sensitive substring search, which keeps the
        // expected-text check equivalent to `str::contains`.
        let hits: i64 = storage
            .with_connection(|conn| {
                conn.query_row(
                    "SELECT COUNT(*) FROM memories \
                     WHERE content LIKE ?1 AND instr(content, ?2) > 0",
                    [
                        keyword_pattern.as_str(),
                        case.expected_content_substring.as_str(),
                    ],
                    |row| row.get(0),
                )
                .map_err(crate::error::EngramError::Database)
            })
            .unwrap_or(0);

        if hits > 0 {
            correct += 1;
        }
    }

    correct as f64 / cases.len() as f64
}
/// Combines per-dimension scores into one number using `dimension_weights`.
///
/// The result is the sum of `score * weight` over every weighted dimension
/// that has an entry in `scores`, divided by the sum of *all* weights.
/// Dimensions without a score therefore contribute nothing to the numerator.
/// Returns `0.0` when the weights sum to zero.
fn weighted_score(&self, scores: &HashMap<String, f64>) -> f64 {
    let weight_sum = self.dimension_weights.values().sum::<f64>();
    if weight_sum == 0.0 {
        return 0.0;
    }

    let weighted_sum = self
        .dimension_weights
        .iter()
        .filter_map(|(dimension, weight)| match scores.get(dimension) {
            Some(score) => Some(score * weight),
            None => None,
        })
        .sum::<f64>();

    weighted_sum / weight_sum
}
}
impl Benchmark for LongMemEvalBenchmark {
    /// Identifier of this benchmark; also used as the `name` of its result.
    fn name(&self) -> &str {
        "longmemeval"
    }

    /// One-sentence summary of the five evaluated dimensions.
    fn description(&self) -> &str {
        "5-dimension memory evaluation benchmark: information retention, temporal reasoning, \
         knowledge update, multi-hop reasoning, and contradiction detection."
    }

    /// Runs all five dimensions against a scratch store and reports the scores.
    ///
    /// With `db_path == ":memory:"` an in-memory store is used. Otherwise the
    /// benchmark works on a separate file, `<db_path>.longmemeval_bench.db`,
    /// which is deleted again once the scores are computed.
    ///
    /// The returned metrics hold one entry per dimension plus `weighted_score`.
    ///
    /// # Errors
    ///
    /// Returns an error when the store cannot be opened.
    fn run(&self, db_path: &str) -> Result<BenchmarkResult, Box<dyn std::error::Error>> {
        // Deletes the scratch database and its `-wal` / `-shm` companion
        // files. Errors are ignored: a file that is already absent is fine.
        fn remove_scratch_files(path: &str) {
            let _ = std::fs::remove_file(path);
            let _ = std::fs::remove_file(format!("{}-wal", path));
            let _ = std::fs::remove_file(format!("{}-shm", path));
        }

        let start = Instant::now();

        // `None` means in-memory; otherwise the path of the scratch database.
        let scratch_path = if db_path == ":memory:" {
            None
        } else {
            Some(format!("{}.longmemeval_bench.db", db_path))
        };

        let storage = match scratch_path.as_ref() {
            None => Storage::open_in_memory()?,
            Some(path) => {
                // A scratch file left behind by an earlier, interrupted run
                // would add old rows to the keyword counts; start clean.
                remove_scratch_files(path);
                Storage::open(StorageConfig {
                    db_path: path.clone(),
                    storage_mode: StorageMode::Local,
                    cloud_uri: None,
                    encrypt_cloud: false,
                    confidence_half_life_days: 30.0,
                    auto_sync: false,
                    sync_debounce_ms: 5000,
                })?
            }
        };

        // The dimensions share one store, so they run in a fixed order.
        let retention = self.eval_information_retention(&storage);
        let temporal = self.eval_temporal_reasoning(&storage);
        let knowledge_update = self.eval_knowledge_update(&storage);
        let multi_hop = self.eval_multi_hop(&storage);
        let contradiction = self.eval_contradiction_detection(&storage);

        let mut metrics = HashMap::new();
        metrics.insert("information_retention".to_string(), retention);
        metrics.insert("temporal_reasoning".to_string(), temporal);
        metrics.insert("knowledge_update".to_string(), knowledge_update);
        metrics.insert("multi_hop".to_string(), multi_hop);
        metrics.insert("contradiction_detection".to_string(), contradiction);

        // Computed before `weighted_score` itself is added to the map.
        let weighted = self.weighted_score(&metrics);
        let duration_ms = start.elapsed().as_millis() as u64;
        metrics.insert("weighted_score".to_string(), weighted);

        if let Some(path) = scratch_path.as_ref() {
            // Close the store first so the files are no longer in use.
            drop(storage);
            remove_scratch_files(path);
        }

        Ok(BenchmarkResult {
            name: self.name().to_string(),
            metrics,
            duration_ms,
            timestamp: chrono::Utc::now().to_rfc3339(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `bench` against an in-memory store, failing the test on error.
    fn run_in_memory(bench: &LongMemEvalBenchmark) -> BenchmarkResult {
        bench.run(":memory:").expect("benchmark should succeed")
    }

    /// The benchmark completes and reports its own name.
    #[test]
    fn test_longmemeval_runs() {
        let outcome = run_in_memory(&LongMemEvalBenchmark::default());
        assert_eq!(outcome.name, "longmemeval");
    }

    /// Every dimension and the combined score appear in the metrics.
    #[test]
    fn test_all_dimensions_present() {
        let outcome = run_in_memory(&LongMemEvalBenchmark::default());
        let required = [
            "information_retention",
            "temporal_reasoning",
            "knowledge_update",
            "multi_hop",
            "contradiction_detection",
            "weighted_score",
        ];
        for dim in required.iter() {
            assert!(
                outcome.metrics.contains_key(*dim),
                "missing dimension: {}",
                dim
            );
        }
    }

    /// All reported metrics are fractions between 0 and 1 inclusive.
    #[test]
    fn test_scores_in_range() {
        let outcome = run_in_memory(&LongMemEvalBenchmark::default());
        for (key, value) in outcome.metrics.iter() {
            let in_range = *value >= 0.0 && *value <= 1.0;
            assert!(
                in_range,
                "metric '{}' = {} out of range [0,1]",
                key,
                value
            );
        }
    }

    /// With all weight on one dimension, the weighted score equals that dimension.
    #[test]
    fn test_weighted_score_with_custom_weights() {
        let dimension_weights: HashMap<String, f64> = [
            ("information_retention", 1.0),
            ("temporal_reasoning", 0.0),
            ("knowledge_update", 0.0),
            ("multi_hop", 0.0),
            ("contradiction_detection", 0.0),
        ]
        .iter()
        .map(|&(dimension, weight)| (dimension.to_string(), weight))
        .collect();

        let outcome = run_in_memory(&LongMemEvalBenchmark { dimension_weights });
        let retention = outcome.metrics["information_retention"];
        let weighted = outcome.metrics["weighted_score"];
        let gap = (weighted - retention).abs();
        assert!(
            gap < 1e-9,
            "weighted={} retention={}",
            weighted,
            retention
        );
    }
}