1use std::collections::HashMap;
11use std::time::Instant;
12
13use super::{Benchmark, BenchmarkResult};
14use crate::storage::queries::{create_memory, update_memory};
15use crate::storage::Storage;
16use crate::types::{CreateMemoryInput, MemoryType, StorageConfig, StorageMode, UpdateMemoryInput};
17
/// Benchmark that scores a memory store along several named dimensions and
/// folds the per-dimension scores into one weighted value.
///
/// `Debug` and `Clone` are derived so a configured benchmark can be logged
/// and duplicated (for example to run the same weights against several stores).
#[derive(Debug, Clone)]
pub struct LongMemEvalBenchmark {
    /// Weight applied to each dimension's score, keyed by dimension name
    /// (for example `"information_retention"`).
    ///
    /// The weights do not have to sum to 1: the weighted score is divided by
    /// the sum of all weights in this map.
    pub dimension_weights: HashMap<String, f64>,
}
23
24impl Default for LongMemEvalBenchmark {
25 fn default() -> Self {
26 let mut weights = HashMap::new();
27 weights.insert("information_retention".to_string(), 0.25);
28 weights.insert("temporal_reasoning".to_string(), 0.20);
29 weights.insert("knowledge_update".to_string(), 0.20);
30 weights.insert("multi_hop".to_string(), 0.20);
31 weights.insert("contradiction_detection".to_string(), 0.15);
32 Self {
33 dimension_weights: weights,
34 }
35 }
36}
37
/// One keyword-retrieval scenario: the memories to store, the keyword to
/// search for, and the text a successful retrieval must contain.
struct TestCase {
    /// Memory contents written to storage before the lookup is attempted.
    setup_memories: Vec<String>,
    /// Keyword used to find a memory; it is wrapped in `%…%` and matched with
    /// SQL `LIKE` against the stored content.
    query_keyword: String,
    /// Text that must appear in the retrieved content for the case to count
    /// as correct.
    expected_content_substring: String,
}
44
45impl LongMemEvalBenchmark {
46 fn eval_information_retention(&self, storage: &Storage) -> f64 {
48 let cases = vec![
49 TestCase {
50 setup_memories: vec![
51 "Alice is a software engineer at TechCorp".to_string(),
52 "Alice has 5 years of experience in Rust".to_string(),
53 ],
54 query_keyword: "Alice".to_string(),
55 expected_content_substring: "engineer".to_string(),
56 },
57 TestCase {
58 setup_memories: vec![
59 "The Eiffel Tower is located in Paris, France".to_string(),
60 "The Eiffel Tower was built in 1889".to_string(),
61 ],
62 query_keyword: "Eiffel".to_string(),
63 expected_content_substring: "Paris".to_string(),
64 },
65 TestCase {
66 setup_memories: vec!["Project Alpha deadline is Q3 2026".to_string()],
67 query_keyword: "Alpha".to_string(),
68 expected_content_substring: "Q3 2026".to_string(),
69 },
70 ];
71
72 self.run_cases(storage, &cases)
73 }
74
75 fn eval_temporal_reasoning(&self, storage: &Storage) -> f64 {
77 let cases = vec![
78 TestCase {
79 setup_memories: vec![
80 "Meeting on 2026-01-10: discussed Q1 roadmap".to_string(),
81 "Meeting on 2026-02-15: reviewed Q2 budget".to_string(),
82 ],
83 query_keyword: "2026-01".to_string(),
84 expected_content_substring: "Q1 roadmap".to_string(),
85 },
86 TestCase {
87 setup_memories: vec![
88 "Sprint 42 started on 2026-03-01".to_string(),
89 "Sprint 42 ended on 2026-03-14 with 12 story points".to_string(),
90 ],
91 query_keyword: "Sprint 42".to_string(),
92 expected_content_substring: "story points".to_string(),
93 },
94 ];
95
96 self.run_cases(storage, &cases)
97 }
98
99 fn eval_knowledge_update(&self, storage: &Storage) -> f64 {
101 let mut correct = 0usize;
102 let total = 3usize;
103
104 let mem = storage
106 .with_connection(|conn| {
107 create_memory(
108 conn,
109 &CreateMemoryInput {
110 content: "Budget for Q1 is $50,000".to_string(),
111 memory_type: MemoryType::Note,
112 workspace: Some("longmemeval-bench".to_string()),
113 ..Default::default()
114 },
115 )
116 })
117 .unwrap();
118
119 let update = UpdateMemoryInput {
120 content: Some("Budget for Q1 is $75,000 (revised)".to_string()),
121 memory_type: None,
122 tags: None,
123 metadata: None,
124 importance: None,
125 scope: None,
126 ttl_seconds: None,
127 event_time: None,
128 trigger_pattern: None,
129 media_url: None,
130 };
131 let _ = storage.with_connection(|conn| update_memory(conn, mem.id, &update));
132
133 let updated_content: Option<String> = storage
134 .with_connection(|conn| {
135 conn.query_row(
136 "SELECT content FROM memories WHERE id = ?1",
137 [mem.id],
138 |row| row.get(0),
139 )
140 .map_err(crate::error::EngramError::Database)
141 })
142 .ok();
143
144 if let Some(c) = updated_content {
145 if c.contains("$75,000") {
146 correct += 1;
147 }
148 }
149
150 let tag_mem = storage
152 .with_connection(|conn| {
153 create_memory(
154 conn,
155 &CreateMemoryInput {
156 content: "Service config: timeout=30s".to_string(),
157 memory_type: MemoryType::Note,
158 workspace: Some("longmemeval-bench".to_string()),
159 ..Default::default()
160 },
161 )
162 })
163 .unwrap();
164
165 let update2 = UpdateMemoryInput {
166 content: Some("Service config: timeout=60s (doubled for reliability)".to_string()),
167 memory_type: None,
168 tags: None,
169 metadata: None,
170 importance: None,
171 scope: None,
172 ttl_seconds: None,
173 event_time: None,
174 trigger_pattern: None,
175 media_url: None,
176 };
177 let _ = storage.with_connection(|conn| update_memory(conn, tag_mem.id, &update2));
178
179 let updated2: Option<String> = storage
180 .with_connection(|conn| {
181 conn.query_row(
182 "SELECT content FROM memories WHERE id = ?1",
183 [tag_mem.id],
184 |row| row.get(0),
185 )
186 .map_err(crate::error::EngramError::Database)
187 })
188 .ok();
189
190 if let Some(c) = updated2 {
191 if c.contains("timeout=60s") {
192 correct += 1;
193 }
194 }
195
196 let count: i64 = storage
198 .with_connection(|conn| {
199 conn.query_row(
200 "SELECT COUNT(*) FROM memories WHERE content LIKE '%timeout=30s%' AND id = ?1",
201 [tag_mem.id],
202 |row| row.get(0),
203 )
204 .map_err(crate::error::EngramError::Database)
205 })
206 .unwrap_or(1);
207
208 if count == 0 {
209 correct += 1;
210 }
211
212 correct as f64 / total as f64
213 }
214
215 fn eval_multi_hop(&self, storage: &Storage) -> f64 {
217 let cases = vec![
218 TestCase {
219 setup_memories: vec![
220 "Node A connects to Node B via link L1".to_string(),
221 "Node B connects to Node C via link L2".to_string(),
222 ],
223 query_keyword: "Node B".to_string(),
224 expected_content_substring: "connects".to_string(),
225 },
226 TestCase {
227 setup_memories: vec![
228 "Company Acme acquired Startup X in 2024".to_string(),
229 "Startup X built the Zephyr product".to_string(),
230 "Zephyr product has 50,000 active users".to_string(),
231 ],
232 query_keyword: "Zephyr".to_string(),
233 expected_content_substring: "users".to_string(),
234 },
235 ];
236
237 self.run_cases(storage, &cases)
238 }
239
240 fn eval_contradiction_detection(&self, storage: &Storage) -> f64 {
242 let mut correct = 0usize;
243 let total = 2usize;
244
245 let pairs = [
246 (
247 "Server capacity is 100 concurrent users (from 2025-01 report)",
248 "Server capacity is 500 concurrent users (from 2026-01 report)",
249 "concurrent users",
250 ),
251 (
252 "API rate limit is 100 req/min per client",
253 "API rate limit is 1000 req/min per client (updated)",
254 "rate limit",
255 ),
256 ];
257
258 for (fact_a, fact_b, keyword) in &pairs {
259 let _ = storage.with_connection(|conn| {
260 create_memory(
261 conn,
262 &CreateMemoryInput {
263 content: fact_a.to_string(),
264 memory_type: MemoryType::Note,
265 workspace: Some("longmemeval-bench".to_string()),
266 ..Default::default()
267 },
268 )
269 });
270 let _ = storage.with_connection(|conn| {
271 create_memory(
272 conn,
273 &CreateMemoryInput {
274 content: fact_b.to_string(),
275 memory_type: MemoryType::Note,
276 workspace: Some("longmemeval-bench".to_string()),
277 ..Default::default()
278 },
279 )
280 });
281
282 let count: i64 = storage
283 .with_connection(|conn| {
284 conn.query_row(
285 "SELECT COUNT(*) FROM memories WHERE content LIKE ?1",
286 [format!("%{}%", keyword)],
287 |row| row.get(0),
288 )
289 .map_err(crate::error::EngramError::Database)
290 })
291 .unwrap_or(0);
292
293 if count >= 2 {
294 correct += 1;
295 }
296 }
297
298 correct as f64 / total as f64
299 }
300
301 fn run_cases(&self, storage: &Storage, cases: &[TestCase]) -> f64 {
303 if cases.is_empty() {
304 return 1.0;
305 }
306
307 let mut correct = 0usize;
308
309 for case in cases {
310 for content in &case.setup_memories {
311 let _ = storage.with_connection(|conn| {
312 create_memory(
313 conn,
314 &CreateMemoryInput {
315 content: content.clone(),
316 memory_type: MemoryType::Note,
317 workspace: Some("longmemeval-bench".to_string()),
318 ..Default::default()
319 },
320 )
321 });
322 }
323
324 let retrieved: Option<String> = storage
325 .with_connection(|conn| {
326 conn.query_row(
327 "SELECT content FROM memories WHERE content LIKE ?1 LIMIT 1",
328 [format!("%{}%", case.query_keyword)],
329 |row| row.get(0),
330 )
331 .map_err(crate::error::EngramError::Database)
332 })
333 .ok();
334
335 if let Some(content) = retrieved {
336 if content.contains(&case.expected_content_substring) {
337 correct += 1;
338 }
339 }
340 }
341
342 correct as f64 / cases.len() as f64
343 }
344
345 fn weighted_score(&self, scores: &HashMap<String, f64>) -> f64 {
347 let total_weight: f64 = self.dimension_weights.values().sum();
348 if total_weight == 0.0 {
349 return 0.0;
350 }
351
352 self.dimension_weights
353 .iter()
354 .filter_map(|(dim, &weight)| scores.get(dim).map(|&score| score * weight))
355 .sum::<f64>()
356 / total_weight
357 }
358}
359
360impl Benchmark for LongMemEvalBenchmark {
361 fn name(&self) -> &str {
362 "longmemeval"
363 }
364
365 fn description(&self) -> &str {
366 "5-dimension memory evaluation benchmark: information retention, temporal reasoning, \
367 knowledge update, multi-hop reasoning, and contradiction detection."
368 }
369
370 fn run(&self, db_path: &str) -> Result<BenchmarkResult, Box<dyn std::error::Error>> {
371 let start = Instant::now();
372
373 let storage = if db_path == ":memory:" {
374 Storage::open_in_memory()?
375 } else {
376 let bench_path = format!("{}.longmemeval_bench.db", db_path);
377 Storage::open(StorageConfig {
378 db_path: bench_path,
379 storage_mode: StorageMode::Local,
380 cloud_uri: None,
381 encrypt_cloud: false,
382 confidence_half_life_days: 30.0,
383 auto_sync: false,
384 sync_debounce_ms: 5000,
385 })?
386 };
387
388 let retention = self.eval_information_retention(&storage);
389 let temporal = self.eval_temporal_reasoning(&storage);
390 let knowledge_update = self.eval_knowledge_update(&storage);
391 let multi_hop = self.eval_multi_hop(&storage);
392 let contradiction = self.eval_contradiction_detection(&storage);
393
394 let mut dimension_scores = HashMap::new();
395 dimension_scores.insert("information_retention".to_string(), retention);
396 dimension_scores.insert("temporal_reasoning".to_string(), temporal);
397 dimension_scores.insert("knowledge_update".to_string(), knowledge_update);
398 dimension_scores.insert("multi_hop".to_string(), multi_hop);
399 dimension_scores.insert("contradiction_detection".to_string(), contradiction);
400
401 let weighted = self.weighted_score(&dimension_scores);
402
403 let duration_ms = start.elapsed().as_millis() as u64;
404
405 let mut metrics = dimension_scores;
406 metrics.insert("weighted_score".to_string(), weighted);
407
408 if db_path != ":memory:" {
410 let bench_path = format!("{}.longmemeval_bench.db", db_path);
411 drop(storage);
412 let _ = std::fs::remove_file(&bench_path);
413 let _ = std::fs::remove_file(format!("{}-wal", bench_path));
414 let _ = std::fs::remove_file(format!("{}-shm", bench_path));
415 }
416
417 Ok(BenchmarkResult {
418 name: self.name().to_string(),
419 metrics,
420 duration_ms,
421 timestamp: chrono::Utc::now().to_rfc3339(),
422 })
423 }
424}
425
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `bench` against an in-memory store and unwraps the result.
    fn run_in_memory(bench: &LongMemEvalBenchmark) -> BenchmarkResult {
        bench.run(":memory:").expect("benchmark should succeed")
    }

    /// The benchmark completes and reports under its own name.
    #[test]
    fn test_longmemeval_runs() {
        let outcome = run_in_memory(&LongMemEvalBenchmark::default());
        assert_eq!(outcome.name, "longmemeval");
    }

    /// Every dimension, plus the combined score, appears in the metrics.
    #[test]
    fn test_all_dimensions_present() {
        let outcome = run_in_memory(&LongMemEvalBenchmark::default());

        let required_keys = [
            "information_retention",
            "temporal_reasoning",
            "knowledge_update",
            "multi_hop",
            "contradiction_detection",
            "weighted_score",
        ];
        for dim in required_keys.iter() {
            assert!(
                outcome.metrics.contains_key(*dim),
                "missing dimension: {}",
                dim
            );
        }
    }

    /// Every reported metric is a fraction between 0 and 1 inclusive.
    #[test]
    fn test_scores_in_range() {
        let outcome = run_in_memory(&LongMemEvalBenchmark::default());

        for (key, value) in outcome.metrics.iter() {
            assert!(
                (0.0..=1.0).contains(value),
                "metric '{}' = {} out of range [0,1]",
                key,
                value
            );
        }
    }

    /// With all weight on one dimension, the combined score equals that
    /// dimension's score.
    #[test]
    fn test_weighted_score_with_custom_weights() {
        let dimension_weights: HashMap<String, f64> = [
            ("information_retention", 1.0),
            ("temporal_reasoning", 0.0),
            ("knowledge_update", 0.0),
            ("multi_hop", 0.0),
            ("contradiction_detection", 0.0),
        ]
        .iter()
        .map(|&(dimension, weight)| (dimension.to_string(), weight))
        .collect();

        let outcome = run_in_memory(&LongMemEvalBenchmark { dimension_weights });

        let retention = outcome.metrics["information_retention"];
        let weighted = outcome.metrics["weighted_score"];
        assert!(
            (weighted - retention).abs() < 1e-9,
            "weighted={} retention={}",
            weighted,
            retention
        );
    }
}
496}