Skip to main content

engram/bench/
longmemeval.rs

1//! LongMemEval benchmark — 5-dimension memory evaluation
2//!
3//! Evaluates memory quality across five dimensions:
4//! 1. Information Retention — can we retrieve stored facts?
5//! 2. Temporal Reasoning — can we answer time-based questions?
6//! 3. Knowledge Update — does updating a fact reflect correctly?
7//! 4. Multi-Hop Reasoning — can we chain facts across memories?
8//! 5. Contradiction Detection — do we surface conflicting facts?
9
10use std::collections::HashMap;
11use std::time::Instant;
12
13use super::{Benchmark, BenchmarkResult};
14use crate::storage::queries::{create_memory, update_memory};
15use crate::storage::Storage;
16use crate::types::{CreateMemoryInput, MemoryType, StorageConfig, StorageMode, UpdateMemoryInput};
17
/// LongMemEval benchmark with configurable dimension weights.
///
/// The overall score is a weighted average of the per-dimension scores, so
/// only the relative size of the weights matters.
#[derive(Debug, Clone)]
pub struct LongMemEvalBenchmark {
    /// Weight per dimension name. The weighted score divides by the sum of
    /// these values, so they do not have to add up to 1.0.
    pub dimension_weights: HashMap<String, f64>,
}

impl Default for LongMemEvalBenchmark {
    /// Builds the benchmark with the stock weighting of the five dimensions
    /// (the default weights add up to 1.0).
    fn default() -> Self {
        let dimension_weights = [
            ("information_retention", 0.25),
            ("temporal_reasoning", 0.20),
            ("knowledge_update", 0.20),
            ("multi_hop", 0.20),
            ("contradiction_detection", 0.15),
        ]
        .iter()
        .map(|&(dimension, weight)| (dimension.to_string(), weight))
        .collect();

        Self { dimension_weights }
    }
}
37
/// One fixture for the dimensions that are scored by keyword retrieval.
struct TestCase {
    /// Memory contents that are stored before the lookup is attempted.
    setup_memories: Vec<String>,
    /// Keyword used for the lookup; it is wrapped in `%` wildcards and
    /// matched against memory content with SQL `LIKE`.
    query_keyword: String,
    /// Text that retrieved content has to contain for the case to pass.
    expected_content_substring: String,
}
44
45impl LongMemEvalBenchmark {
46    /// Evaluate information retention: store facts and retrieve them by keyword
47    fn eval_information_retention(&self, storage: &Storage) -> f64 {
48        let cases = vec![
49            TestCase {
50                setup_memories: vec![
51                    "Alice is a software engineer at TechCorp".to_string(),
52                    "Alice has 5 years of experience in Rust".to_string(),
53                ],
54                query_keyword: "Alice".to_string(),
55                expected_content_substring: "engineer".to_string(),
56            },
57            TestCase {
58                setup_memories: vec![
59                    "The Eiffel Tower is located in Paris, France".to_string(),
60                    "The Eiffel Tower was built in 1889".to_string(),
61                ],
62                query_keyword: "Eiffel".to_string(),
63                expected_content_substring: "Paris".to_string(),
64            },
65            TestCase {
66                setup_memories: vec!["Project Alpha deadline is Q3 2026".to_string()],
67                query_keyword: "Alpha".to_string(),
68                expected_content_substring: "Q3 2026".to_string(),
69            },
70        ];
71
72        self.run_cases(storage, &cases)
73    }
74
75    /// Evaluate temporal reasoning: tag memories with dates and retrieve by time context
76    fn eval_temporal_reasoning(&self, storage: &Storage) -> f64 {
77        let cases = vec![
78            TestCase {
79                setup_memories: vec![
80                    "Meeting on 2026-01-10: discussed Q1 roadmap".to_string(),
81                    "Meeting on 2026-02-15: reviewed Q2 budget".to_string(),
82                ],
83                query_keyword: "2026-01".to_string(),
84                expected_content_substring: "Q1 roadmap".to_string(),
85            },
86            TestCase {
87                setup_memories: vec![
88                    "Sprint 42 started on 2026-03-01".to_string(),
89                    "Sprint 42 ended on 2026-03-14 with 12 story points".to_string(),
90                ],
91                query_keyword: "Sprint 42".to_string(),
92                expected_content_substring: "story points".to_string(),
93            },
94        ];
95
96        self.run_cases(storage, &cases)
97    }
98
99    /// Evaluate knowledge update: create a fact, update it, verify new value is retrievable
100    fn eval_knowledge_update(&self, storage: &Storage) -> f64 {
101        let mut correct = 0usize;
102        let total = 3usize;
103
104        // Test 1: Update memory content and verify retrieval
105        let mem = storage
106            .with_connection(|conn| {
107                create_memory(
108                    conn,
109                    &CreateMemoryInput {
110                        content: "Budget for Q1 is $50,000".to_string(),
111                        memory_type: MemoryType::Note,
112                        workspace: Some("longmemeval-bench".to_string()),
113                        ..Default::default()
114                    },
115                )
116            })
117            .unwrap();
118
119        let update = UpdateMemoryInput {
120            content: Some("Budget for Q1 is $75,000 (revised)".to_string()),
121            memory_type: None,
122            tags: None,
123            metadata: None,
124            importance: None,
125            scope: None,
126            ttl_seconds: None,
127            event_time: None,
128            trigger_pattern: None,
129            media_url: None,
130        };
131        let _ = storage.with_connection(|conn| update_memory(conn, mem.id, &update));
132
133        let updated_content: Option<String> = storage
134            .with_connection(|conn| {
135                conn.query_row(
136                    "SELECT content FROM memories WHERE id = ?1",
137                    [mem.id],
138                    |row| row.get(0),
139                )
140                .map_err(crate::error::EngramError::Database)
141            })
142            .ok();
143
144        if let Some(c) = updated_content {
145            if c.contains("$75,000") {
146                correct += 1;
147            }
148        }
149
150        // Test 2: Update service config timeout
151        let tag_mem = storage
152            .with_connection(|conn| {
153                create_memory(
154                    conn,
155                    &CreateMemoryInput {
156                        content: "Service config: timeout=30s".to_string(),
157                        memory_type: MemoryType::Note,
158                        workspace: Some("longmemeval-bench".to_string()),
159                        ..Default::default()
160                    },
161                )
162            })
163            .unwrap();
164
165        let update2 = UpdateMemoryInput {
166            content: Some("Service config: timeout=60s (doubled for reliability)".to_string()),
167            memory_type: None,
168            tags: None,
169            metadata: None,
170            importance: None,
171            scope: None,
172            ttl_seconds: None,
173            event_time: None,
174            trigger_pattern: None,
175            media_url: None,
176        };
177        let _ = storage.with_connection(|conn| update_memory(conn, tag_mem.id, &update2));
178
179        let updated2: Option<String> = storage
180            .with_connection(|conn| {
181                conn.query_row(
182                    "SELECT content FROM memories WHERE id = ?1",
183                    [tag_mem.id],
184                    |row| row.get(0),
185                )
186                .map_err(crate::error::EngramError::Database)
187            })
188            .ok();
189
190        if let Some(c) = updated2 {
191            if c.contains("timeout=60s") {
192                correct += 1;
193            }
194        }
195
196        // Test 3: Verify original value is replaced (not duplicated)
197        let count: i64 = storage
198            .with_connection(|conn| {
199                conn.query_row(
200                    "SELECT COUNT(*) FROM memories WHERE content LIKE '%timeout=30s%' AND id = ?1",
201                    [tag_mem.id],
202                    |row| row.get(0),
203                )
204                .map_err(crate::error::EngramError::Database)
205            })
206            .unwrap_or(1);
207
208        if count == 0 {
209            correct += 1;
210        }
211
212        correct as f64 / total as f64
213    }
214
215    /// Evaluate multi-hop: store chained facts, verify both are retrievable via linking keyword
216    fn eval_multi_hop(&self, storage: &Storage) -> f64 {
217        let cases = vec![
218            TestCase {
219                setup_memories: vec![
220                    "Node A connects to Node B via link L1".to_string(),
221                    "Node B connects to Node C via link L2".to_string(),
222                ],
223                query_keyword: "Node B".to_string(),
224                expected_content_substring: "connects".to_string(),
225            },
226            TestCase {
227                setup_memories: vec![
228                    "Company Acme acquired Startup X in 2024".to_string(),
229                    "Startup X built the Zephyr product".to_string(),
230                    "Zephyr product has 50,000 active users".to_string(),
231                ],
232                query_keyword: "Zephyr".to_string(),
233                expected_content_substring: "users".to_string(),
234            },
235        ];
236
237        self.run_cases(storage, &cases)
238    }
239
240    /// Evaluate contradiction detection: store conflicting facts, verify both appear
241    fn eval_contradiction_detection(&self, storage: &Storage) -> f64 {
242        let mut correct = 0usize;
243        let total = 2usize;
244
245        let pairs = [
246            (
247                "Server capacity is 100 concurrent users (from 2025-01 report)",
248                "Server capacity is 500 concurrent users (from 2026-01 report)",
249                "concurrent users",
250            ),
251            (
252                "API rate limit is 100 req/min per client",
253                "API rate limit is 1000 req/min per client (updated)",
254                "rate limit",
255            ),
256        ];
257
258        for (fact_a, fact_b, keyword) in &pairs {
259            let _ = storage.with_connection(|conn| {
260                create_memory(
261                    conn,
262                    &CreateMemoryInput {
263                        content: fact_a.to_string(),
264                        memory_type: MemoryType::Note,
265                        workspace: Some("longmemeval-bench".to_string()),
266                        ..Default::default()
267                    },
268                )
269            });
270            let _ = storage.with_connection(|conn| {
271                create_memory(
272                    conn,
273                    &CreateMemoryInput {
274                        content: fact_b.to_string(),
275                        memory_type: MemoryType::Note,
276                        workspace: Some("longmemeval-bench".to_string()),
277                        ..Default::default()
278                    },
279                )
280            });
281
282            let count: i64 = storage
283                .with_connection(|conn| {
284                    conn.query_row(
285                        "SELECT COUNT(*) FROM memories WHERE content LIKE ?1",
286                        [format!("%{}%", keyword)],
287                        |row| row.get(0),
288                    )
289                    .map_err(crate::error::EngramError::Database)
290                })
291                .unwrap_or(0);
292
293            if count >= 2 {
294                correct += 1;
295            }
296        }
297
298        correct as f64 / total as f64
299    }
300
301    /// Helper: store memories for each case and check if expected content is retrievable
302    fn run_cases(&self, storage: &Storage, cases: &[TestCase]) -> f64 {
303        if cases.is_empty() {
304            return 1.0;
305        }
306
307        let mut correct = 0usize;
308
309        for case in cases {
310            for content in &case.setup_memories {
311                let _ = storage.with_connection(|conn| {
312                    create_memory(
313                        conn,
314                        &CreateMemoryInput {
315                            content: content.clone(),
316                            memory_type: MemoryType::Note,
317                            workspace: Some("longmemeval-bench".to_string()),
318                            ..Default::default()
319                        },
320                    )
321                });
322            }
323
324            let retrieved: Option<String> = storage
325                .with_connection(|conn| {
326                    conn.query_row(
327                        "SELECT content FROM memories WHERE content LIKE ?1 LIMIT 1",
328                        [format!("%{}%", case.query_keyword)],
329                        |row| row.get(0),
330                    )
331                    .map_err(crate::error::EngramError::Database)
332                })
333                .ok();
334
335            if let Some(content) = retrieved {
336                if content.contains(&case.expected_content_substring) {
337                    correct += 1;
338                }
339            }
340        }
341
342        correct as f64 / cases.len() as f64
343    }
344
345    /// Compute weighted score across dimensions
346    fn weighted_score(&self, scores: &HashMap<String, f64>) -> f64 {
347        let total_weight: f64 = self.dimension_weights.values().sum();
348        if total_weight == 0.0 {
349            return 0.0;
350        }
351
352        self.dimension_weights
353            .iter()
354            .filter_map(|(dim, &weight)| scores.get(dim).map(|&score| score * weight))
355            .sum::<f64>()
356            / total_weight
357    }
358}
359
360impl Benchmark for LongMemEvalBenchmark {
361    fn name(&self) -> &str {
362        "longmemeval"
363    }
364
365    fn description(&self) -> &str {
366        "5-dimension memory evaluation benchmark: information retention, temporal reasoning, \
367         knowledge update, multi-hop reasoning, and contradiction detection."
368    }
369
370    fn run(&self, db_path: &str) -> Result<BenchmarkResult, Box<dyn std::error::Error>> {
371        let start = Instant::now();
372
373        let storage = if db_path == ":memory:" {
374            Storage::open_in_memory()?
375        } else {
376            let bench_path = format!("{}.longmemeval_bench.db", db_path);
377            Storage::open(StorageConfig {
378                db_path: bench_path,
379                storage_mode: StorageMode::Local,
380                cloud_uri: None,
381                encrypt_cloud: false,
382                confidence_half_life_days: 30.0,
383                auto_sync: false,
384                sync_debounce_ms: 5000,
385            })?
386        };
387
388        let retention = self.eval_information_retention(&storage);
389        let temporal = self.eval_temporal_reasoning(&storage);
390        let knowledge_update = self.eval_knowledge_update(&storage);
391        let multi_hop = self.eval_multi_hop(&storage);
392        let contradiction = self.eval_contradiction_detection(&storage);
393
394        let mut dimension_scores = HashMap::new();
395        dimension_scores.insert("information_retention".to_string(), retention);
396        dimension_scores.insert("temporal_reasoning".to_string(), temporal);
397        dimension_scores.insert("knowledge_update".to_string(), knowledge_update);
398        dimension_scores.insert("multi_hop".to_string(), multi_hop);
399        dimension_scores.insert("contradiction_detection".to_string(), contradiction);
400
401        let weighted = self.weighted_score(&dimension_scores);
402
403        let duration_ms = start.elapsed().as_millis() as u64;
404
405        let mut metrics = dimension_scores;
406        metrics.insert("weighted_score".to_string(), weighted);
407
408        // Clean up temporary file
409        if db_path != ":memory:" {
410            let bench_path = format!("{}.longmemeval_bench.db", db_path);
411            drop(storage);
412            let _ = std::fs::remove_file(&bench_path);
413            let _ = std::fs::remove_file(format!("{}-wal", bench_path));
414            let _ = std::fs::remove_file(format!("{}-shm", bench_path));
415        }
416
417        Ok(BenchmarkResult {
418            name: self.name().to_string(),
419            metrics,
420            duration_ms,
421            timestamp: chrono::Utc::now().to_rfc3339(),
422        })
423    }
424}
425
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the benchmark with default weights against an in-memory store.
    fn run_default() -> BenchmarkResult {
        LongMemEvalBenchmark::default()
            .run(":memory:")
            .expect("benchmark should succeed")
    }

    /// The benchmark completes and reports under its own name.
    #[test]
    fn test_longmemeval_runs() {
        let outcome = run_default();
        assert_eq!(outcome.name, "longmemeval");
    }

    /// Every dimension and the weighted total appear in the metrics.
    #[test]
    fn test_all_dimensions_present() {
        let outcome = run_default();

        let expected_dims = [
            "information_retention",
            "temporal_reasoning",
            "knowledge_update",
            "multi_hop",
            "contradiction_detection",
            "weighted_score",
        ];
        expected_dims.iter().for_each(|dim| {
            assert!(
                outcome.metrics.contains_key(*dim),
                "missing dimension: {}",
                dim
            );
        });
    }

    /// All reported metrics are fractions between 0 and 1 inclusive.
    #[test]
    fn test_scores_in_range() {
        let outcome = run_default();

        for (key, value) in outcome.metrics.iter() {
            assert!(
                *value >= 0.0 && *value <= 1.0,
                "metric '{}' = {} out of range [0,1]",
                key,
                value
            );
        }
    }

    /// With all weight on one dimension, the weighted score equals that
    /// dimension's score.
    #[test]
    fn test_weighted_score_with_custom_weights() {
        let weights: HashMap<String, f64> = [
            ("information_retention", 1.0),
            ("temporal_reasoning", 0.0),
            ("knowledge_update", 0.0),
            ("multi_hop", 0.0),
            ("contradiction_detection", 0.0),
        ]
        .iter()
        .map(|&(dimension, weight)| (dimension.to_string(), weight))
        .collect();

        let bench = LongMemEvalBenchmark {
            dimension_weights: weights,
        };
        let outcome = bench.run(":memory:").expect("benchmark should succeed");

        let retention = outcome.metrics["information_retention"];
        let weighted = outcome.metrics["weighted_score"];
        let gap = (weighted - retention).abs();
        assert!(
            gap < 1e-9,
            "weighted={} retention={}",
            weighted,
            retention
        );
    }
}