1use std::collections::HashMap;
11use std::time::Instant;
12
13use super::{Benchmark, BenchmarkResult};
14use crate::storage::queries::{create_memory, update_memory};
15use crate::storage::Storage;
16use crate::types::{CreateMemoryInput, MemoryType, StorageConfig, StorageMode, UpdateMemoryInput};
17
/// LongMemEval-style benchmark that scores a storage backend along several
/// memory dimensions and folds the per-dimension scores into one weighted score.
#[derive(Debug, Clone)]
pub struct LongMemEvalBenchmark {
    /// Weight applied to each dimension's score, keyed by dimension name
    /// (for example `"information_retention"`).
    ///
    /// The weighted score divides by the sum of all weights, so the values
    /// do not have to add up to `1.0`.
    pub dimension_weights: HashMap<String, f64>,
}
23
24impl Default for LongMemEvalBenchmark {
25 fn default() -> Self {
26 let mut weights = HashMap::new();
27 weights.insert("information_retention".to_string(), 0.25);
28 weights.insert("temporal_reasoning".to_string(), 0.20);
29 weights.insert("knowledge_update".to_string(), 0.20);
30 weights.insert("multi_hop".to_string(), 0.20);
31 weights.insert("contradiction_detection".to_string(), 0.15);
32 Self {
33 dimension_weights: weights,
34 }
35 }
36}
37
/// One store-then-look-up scenario used by the keyword-driven dimensions.
struct TestCase {
    /// Memory contents inserted before the lookup is performed.
    setup_memories: Vec<String>,
    /// Keyword matched against stored content with a SQL `LIKE '%keyword%'` pattern.
    query_keyword: String,
    /// Text a retrieved memory must contain for the case to count as correct.
    expected_content_substring: String,
}
44
45impl LongMemEvalBenchmark {
46 fn eval_information_retention(&self, storage: &Storage) -> f64 {
48 let cases = vec![
49 TestCase {
50 setup_memories: vec![
51 "Alice is a software engineer at TechCorp".to_string(),
52 "Alice has 5 years of experience in Rust".to_string(),
53 ],
54 query_keyword: "Alice".to_string(),
55 expected_content_substring: "engineer".to_string(),
56 },
57 TestCase {
58 setup_memories: vec![
59 "The Eiffel Tower is located in Paris, France".to_string(),
60 "The Eiffel Tower was built in 1889".to_string(),
61 ],
62 query_keyword: "Eiffel".to_string(),
63 expected_content_substring: "Paris".to_string(),
64 },
65 TestCase {
66 setup_memories: vec!["Project Alpha deadline is Q3 2026".to_string()],
67 query_keyword: "Alpha".to_string(),
68 expected_content_substring: "Q3 2026".to_string(),
69 },
70 ];
71
72 self.run_cases(storage, &cases)
73 }
74
75 fn eval_temporal_reasoning(&self, storage: &Storage) -> f64 {
77 let cases = vec![
78 TestCase {
79 setup_memories: vec![
80 "Meeting on 2026-01-10: discussed Q1 roadmap".to_string(),
81 "Meeting on 2026-02-15: reviewed Q2 budget".to_string(),
82 ],
83 query_keyword: "2026-01".to_string(),
84 expected_content_substring: "Q1 roadmap".to_string(),
85 },
86 TestCase {
87 setup_memories: vec![
88 "Sprint 42 started on 2026-03-01".to_string(),
89 "Sprint 42 ended on 2026-03-14 with 12 story points".to_string(),
90 ],
91 query_keyword: "Sprint 42".to_string(),
92 expected_content_substring: "story points".to_string(),
93 },
94 ];
95
96 self.run_cases(storage, &cases)
97 }
98
99 fn eval_knowledge_update(&self, storage: &Storage) -> f64 {
101 let mut correct = 0usize;
102 let total = 3usize;
103
104 let mem = storage
106 .with_connection(|conn| {
107 create_memory(
108 conn,
109 &CreateMemoryInput {
110 content: "Budget for Q1 is $50,000".to_string(),
111 memory_type: MemoryType::Note,
112 workspace: Some("longmemeval-bench".to_string()),
113 ..Default::default()
114 },
115 )
116 })
117 .unwrap();
118
119 let update = UpdateMemoryInput {
120 content: Some("Budget for Q1 is $75,000 (revised)".to_string()),
121 memory_type: None,
122 tags: None,
123 metadata: None,
124 importance: None,
125 scope: None,
126 ttl_seconds: None,
127 event_time: None,
128 trigger_pattern: None,
129 };
130 let _ = storage.with_connection(|conn| update_memory(conn, mem.id, &update));
131
132 let updated_content: Option<String> = storage
133 .with_connection(|conn| {
134 conn.query_row(
135 "SELECT content FROM memories WHERE id = ?1",
136 [mem.id],
137 |row| row.get(0),
138 )
139 .map_err(crate::error::EngramError::Database)
140 })
141 .ok();
142
143 if let Some(c) = updated_content {
144 if c.contains("$75,000") {
145 correct += 1;
146 }
147 }
148
149 let tag_mem = storage
151 .with_connection(|conn| {
152 create_memory(
153 conn,
154 &CreateMemoryInput {
155 content: "Service config: timeout=30s".to_string(),
156 memory_type: MemoryType::Note,
157 workspace: Some("longmemeval-bench".to_string()),
158 ..Default::default()
159 },
160 )
161 })
162 .unwrap();
163
164 let update2 = UpdateMemoryInput {
165 content: Some("Service config: timeout=60s (doubled for reliability)".to_string()),
166 memory_type: None,
167 tags: None,
168 metadata: None,
169 importance: None,
170 scope: None,
171 ttl_seconds: None,
172 event_time: None,
173 trigger_pattern: None,
174 };
175 let _ = storage.with_connection(|conn| update_memory(conn, tag_mem.id, &update2));
176
177 let updated2: Option<String> = storage
178 .with_connection(|conn| {
179 conn.query_row(
180 "SELECT content FROM memories WHERE id = ?1",
181 [tag_mem.id],
182 |row| row.get(0),
183 )
184 .map_err(crate::error::EngramError::Database)
185 })
186 .ok();
187
188 if let Some(c) = updated2 {
189 if c.contains("timeout=60s") {
190 correct += 1;
191 }
192 }
193
194 let count: i64 = storage
196 .with_connection(|conn| {
197 conn.query_row(
198 "SELECT COUNT(*) FROM memories WHERE content LIKE '%timeout=30s%' AND id = ?1",
199 [tag_mem.id],
200 |row| row.get(0),
201 )
202 .map_err(crate::error::EngramError::Database)
203 })
204 .unwrap_or(1);
205
206 if count == 0 {
207 correct += 1;
208 }
209
210 correct as f64 / total as f64
211 }
212
213 fn eval_multi_hop(&self, storage: &Storage) -> f64 {
215 let cases = vec![
216 TestCase {
217 setup_memories: vec![
218 "Node A connects to Node B via link L1".to_string(),
219 "Node B connects to Node C via link L2".to_string(),
220 ],
221 query_keyword: "Node B".to_string(),
222 expected_content_substring: "connects".to_string(),
223 },
224 TestCase {
225 setup_memories: vec![
226 "Company Acme acquired Startup X in 2024".to_string(),
227 "Startup X built the Zephyr product".to_string(),
228 "Zephyr product has 50,000 active users".to_string(),
229 ],
230 query_keyword: "Zephyr".to_string(),
231 expected_content_substring: "users".to_string(),
232 },
233 ];
234
235 self.run_cases(storage, &cases)
236 }
237
238 fn eval_contradiction_detection(&self, storage: &Storage) -> f64 {
240 let mut correct = 0usize;
241 let total = 2usize;
242
243 let pairs = [
244 (
245 "Server capacity is 100 concurrent users (from 2025-01 report)",
246 "Server capacity is 500 concurrent users (from 2026-01 report)",
247 "concurrent users",
248 ),
249 (
250 "API rate limit is 100 req/min per client",
251 "API rate limit is 1000 req/min per client (updated)",
252 "rate limit",
253 ),
254 ];
255
256 for (fact_a, fact_b, keyword) in &pairs {
257 let _ = storage.with_connection(|conn| {
258 create_memory(
259 conn,
260 &CreateMemoryInput {
261 content: fact_a.to_string(),
262 memory_type: MemoryType::Note,
263 workspace: Some("longmemeval-bench".to_string()),
264 ..Default::default()
265 },
266 )
267 });
268 let _ = storage.with_connection(|conn| {
269 create_memory(
270 conn,
271 &CreateMemoryInput {
272 content: fact_b.to_string(),
273 memory_type: MemoryType::Note,
274 workspace: Some("longmemeval-bench".to_string()),
275 ..Default::default()
276 },
277 )
278 });
279
280 let count: i64 = storage
281 .with_connection(|conn| {
282 conn.query_row(
283 "SELECT COUNT(*) FROM memories WHERE content LIKE ?1",
284 [format!("%{}%", keyword)],
285 |row| row.get(0),
286 )
287 .map_err(crate::error::EngramError::Database)
288 })
289 .unwrap_or(0);
290
291 if count >= 2 {
292 correct += 1;
293 }
294 }
295
296 correct as f64 / total as f64
297 }
298
299 fn run_cases(&self, storage: &Storage, cases: &[TestCase]) -> f64 {
301 if cases.is_empty() {
302 return 1.0;
303 }
304
305 let mut correct = 0usize;
306
307 for case in cases {
308 for content in &case.setup_memories {
309 let _ = storage.with_connection(|conn| {
310 create_memory(
311 conn,
312 &CreateMemoryInput {
313 content: content.clone(),
314 memory_type: MemoryType::Note,
315 workspace: Some("longmemeval-bench".to_string()),
316 ..Default::default()
317 },
318 )
319 });
320 }
321
322 let retrieved: Option<String> = storage
323 .with_connection(|conn| {
324 conn.query_row(
325 "SELECT content FROM memories WHERE content LIKE ?1 LIMIT 1",
326 [format!("%{}%", case.query_keyword)],
327 |row| row.get(0),
328 )
329 .map_err(crate::error::EngramError::Database)
330 })
331 .ok();
332
333 if let Some(content) = retrieved {
334 if content.contains(&case.expected_content_substring) {
335 correct += 1;
336 }
337 }
338 }
339
340 correct as f64 / cases.len() as f64
341 }
342
343 fn weighted_score(&self, scores: &HashMap<String, f64>) -> f64 {
345 let total_weight: f64 = self.dimension_weights.values().sum();
346 if total_weight == 0.0 {
347 return 0.0;
348 }
349
350 self.dimension_weights
351 .iter()
352 .filter_map(|(dim, &weight)| scores.get(dim).map(|&score| score * weight))
353 .sum::<f64>()
354 / total_weight
355 }
356}
357
358impl Benchmark for LongMemEvalBenchmark {
359 fn name(&self) -> &str {
360 "longmemeval"
361 }
362
363 fn description(&self) -> &str {
364 "5-dimension memory evaluation benchmark: information retention, temporal reasoning, \
365 knowledge update, multi-hop reasoning, and contradiction detection."
366 }
367
368 fn run(&self, db_path: &str) -> Result<BenchmarkResult, Box<dyn std::error::Error>> {
369 let start = Instant::now();
370
371 let storage = if db_path == ":memory:" {
372 Storage::open_in_memory()?
373 } else {
374 let bench_path = format!("{}.longmemeval_bench.db", db_path);
375 Storage::open(StorageConfig {
376 db_path: bench_path,
377 storage_mode: StorageMode::Local,
378 cloud_uri: None,
379 encrypt_cloud: false,
380 confidence_half_life_days: 30.0,
381 auto_sync: false,
382 sync_debounce_ms: 5000,
383 })?
384 };
385
386 let retention = self.eval_information_retention(&storage);
387 let temporal = self.eval_temporal_reasoning(&storage);
388 let knowledge_update = self.eval_knowledge_update(&storage);
389 let multi_hop = self.eval_multi_hop(&storage);
390 let contradiction = self.eval_contradiction_detection(&storage);
391
392 let mut dimension_scores = HashMap::new();
393 dimension_scores.insert("information_retention".to_string(), retention);
394 dimension_scores.insert("temporal_reasoning".to_string(), temporal);
395 dimension_scores.insert("knowledge_update".to_string(), knowledge_update);
396 dimension_scores.insert("multi_hop".to_string(), multi_hop);
397 dimension_scores.insert("contradiction_detection".to_string(), contradiction);
398
399 let weighted = self.weighted_score(&dimension_scores);
400
401 let duration_ms = start.elapsed().as_millis() as u64;
402
403 let mut metrics = dimension_scores;
404 metrics.insert("weighted_score".to_string(), weighted);
405
406 if db_path != ":memory:" {
408 let bench_path = format!("{}.longmemeval_bench.db", db_path);
409 drop(storage);
410 let _ = std::fs::remove_file(&bench_path);
411 let _ = std::fs::remove_file(format!("{}-wal", bench_path));
412 let _ = std::fs::remove_file(format!("{}-shm", bench_path));
413 }
414
415 Ok(BenchmarkResult {
416 name: self.name().to_string(),
417 metrics,
418 duration_ms,
419 timestamp: chrono::Utc::now().to_rfc3339(),
420 })
421 }
422}
423
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `bench` against a throwaway in-memory database.
    fn run_in_memory(bench: &LongMemEvalBenchmark) -> BenchmarkResult {
        bench.run(":memory:").expect("benchmark should succeed")
    }

    /// The benchmark completes and reports its own name.
    #[test]
    fn test_longmemeval_runs() {
        let result = run_in_memory(&LongMemEvalBenchmark::default());
        assert_eq!(result.name, "longmemeval");
    }

    /// Every dimension and the weighted total appear in the metrics map.
    #[test]
    fn test_all_dimensions_present() {
        let result = run_in_memory(&LongMemEvalBenchmark::default());

        let required_metrics = [
            "information_retention",
            "temporal_reasoning",
            "knowledge_update",
            "multi_hop",
            "contradiction_detection",
            "weighted_score",
        ];
        for dim in required_metrics.iter() {
            assert!(
                result.metrics.contains_key(*dim),
                "missing dimension: {}",
                dim
            );
        }
    }

    /// Every reported metric lies in the closed interval from 0 to 1.
    #[test]
    fn test_scores_in_range() {
        let result = run_in_memory(&LongMemEvalBenchmark::default());

        for (key, value) in result.metrics.iter() {
            assert!(
                (0.0..=1.0).contains(value),
                "metric '{}' = {} out of range [0,1]",
                key,
                value
            );
        }
    }

    /// With all weight on one dimension, the weighted score equals that dimension.
    #[test]
    fn test_weighted_score_with_custom_weights() {
        let dimension_weights: HashMap<String, f64> = [
            ("information_retention", 1.0),
            ("temporal_reasoning", 0.0),
            ("knowledge_update", 0.0),
            ("multi_hop", 0.0),
            ("contradiction_detection", 0.0),
        ]
        .iter()
        .map(|&(dimension, weight)| (dimension.to_string(), weight))
        .collect();

        let result = run_in_memory(&LongMemEvalBenchmark { dimension_weights });

        let retention = result.metrics["information_retention"];
        let weighted = result.metrics["weighted_score"];
        assert!(
            (weighted - retention).abs() < 1e-9,
            "weighted={} retention={}",
            weighted,
            retention
        );
    }
}