Skip to main content

engram/bench/
longmemeval.rs

1//! LongMemEval benchmark — 5-dimension memory evaluation
2//!
3//! Evaluates memory quality across five dimensions:
4//! 1. Information Retention — can we retrieve stored facts?
5//! 2. Temporal Reasoning — can we answer time-based questions?
6//! 3. Knowledge Update — does updating a fact reflect correctly?
7//! 4. Multi-Hop Reasoning — can we chain facts across memories?
8//! 5. Contradiction Detection — do we surface conflicting facts?
9
10use std::collections::HashMap;
11use std::time::Instant;
12
13use super::{Benchmark, BenchmarkResult};
14use crate::storage::queries::{create_memory, update_memory};
15use crate::storage::Storage;
16use crate::types::{CreateMemoryInput, MemoryType, StorageConfig, StorageMode, UpdateMemoryInput};
17
/// LongMemEval benchmark with configurable dimension weights.
///
/// The benchmark scores five dimensions (`information_retention`,
/// `temporal_reasoning`, `knowledge_update`, `multi_hop`,
/// `contradiction_detection`) and combines them into a single weighted score.
#[derive(Debug, Clone)]
pub struct LongMemEvalBenchmark {
    /// Weight per dimension, keyed by dimension name.
    ///
    /// Weights do not have to sum to 1.0: the weighted score is divided by the
    /// sum of all weights, so only their relative sizes matter.
    pub dimension_weights: HashMap<String, f64>,
}
23
24impl Default for LongMemEvalBenchmark {
25    fn default() -> Self {
26        let mut weights = HashMap::new();
27        weights.insert("information_retention".to_string(), 0.25);
28        weights.insert("temporal_reasoning".to_string(), 0.20);
29        weights.insert("knowledge_update".to_string(), 0.20);
30        weights.insert("multi_hop".to_string(), 0.20);
31        weights.insert("contradiction_detection".to_string(), 0.15);
32        Self {
33            dimension_weights: weights,
34        }
35    }
36}
37
/// One scenario for the keyword-retrieval dimensions: a set of memories to
/// store, a keyword to search for, and the text the search must surface.
struct TestCase {
    /// Memory contents stored before the lookup is attempted.
    setup_memories: Vec<String>,
    /// Keyword used to search stored memory content.
    query_keyword: String,
    /// Text that must appear in retrieved content for the case to pass.
    expected_content_substring: String,
}
44
45impl LongMemEvalBenchmark {
46    /// Evaluate information retention: store facts and retrieve them by keyword
47    fn eval_information_retention(&self, storage: &Storage) -> f64 {
48        let cases = vec![
49            TestCase {
50                setup_memories: vec![
51                    "Alice is a software engineer at TechCorp".to_string(),
52                    "Alice has 5 years of experience in Rust".to_string(),
53                ],
54                query_keyword: "Alice".to_string(),
55                expected_content_substring: "engineer".to_string(),
56            },
57            TestCase {
58                setup_memories: vec![
59                    "The Eiffel Tower is located in Paris, France".to_string(),
60                    "The Eiffel Tower was built in 1889".to_string(),
61                ],
62                query_keyword: "Eiffel".to_string(),
63                expected_content_substring: "Paris".to_string(),
64            },
65            TestCase {
66                setup_memories: vec!["Project Alpha deadline is Q3 2026".to_string()],
67                query_keyword: "Alpha".to_string(),
68                expected_content_substring: "Q3 2026".to_string(),
69            },
70        ];
71
72        self.run_cases(storage, &cases)
73    }
74
75    /// Evaluate temporal reasoning: tag memories with dates and retrieve by time context
76    fn eval_temporal_reasoning(&self, storage: &Storage) -> f64 {
77        let cases = vec![
78            TestCase {
79                setup_memories: vec![
80                    "Meeting on 2026-01-10: discussed Q1 roadmap".to_string(),
81                    "Meeting on 2026-02-15: reviewed Q2 budget".to_string(),
82                ],
83                query_keyword: "2026-01".to_string(),
84                expected_content_substring: "Q1 roadmap".to_string(),
85            },
86            TestCase {
87                setup_memories: vec![
88                    "Sprint 42 started on 2026-03-01".to_string(),
89                    "Sprint 42 ended on 2026-03-14 with 12 story points".to_string(),
90                ],
91                query_keyword: "Sprint 42".to_string(),
92                expected_content_substring: "story points".to_string(),
93            },
94        ];
95
96        self.run_cases(storage, &cases)
97    }
98
99    /// Evaluate knowledge update: create a fact, update it, verify new value is retrievable
100    fn eval_knowledge_update(&self, storage: &Storage) -> f64 {
101        let mut correct = 0usize;
102        let total = 3usize;
103
104        // Test 1: Update memory content and verify retrieval
105        let mem = storage
106            .with_connection(|conn| {
107                create_memory(
108                    conn,
109                    &CreateMemoryInput {
110                        content: "Budget for Q1 is $50,000".to_string(),
111                        memory_type: MemoryType::Note,
112                        workspace: Some("longmemeval-bench".to_string()),
113                        ..Default::default()
114                    },
115                )
116            })
117            .unwrap();
118
119        let update = UpdateMemoryInput {
120            content: Some("Budget for Q1 is $75,000 (revised)".to_string()),
121            memory_type: None,
122            tags: None,
123            metadata: None,
124            importance: None,
125            scope: None,
126            ttl_seconds: None,
127            event_time: None,
128            trigger_pattern: None,
129        };
130        let _ = storage.with_connection(|conn| update_memory(conn, mem.id, &update));
131
132        let updated_content: Option<String> = storage
133            .with_connection(|conn| {
134                conn.query_row(
135                    "SELECT content FROM memories WHERE id = ?1",
136                    [mem.id],
137                    |row| row.get(0),
138                )
139                .map_err(crate::error::EngramError::Database)
140            })
141            .ok();
142
143        if let Some(c) = updated_content {
144            if c.contains("$75,000") {
145                correct += 1;
146            }
147        }
148
149        // Test 2: Update service config timeout
150        let tag_mem = storage
151            .with_connection(|conn| {
152                create_memory(
153                    conn,
154                    &CreateMemoryInput {
155                        content: "Service config: timeout=30s".to_string(),
156                        memory_type: MemoryType::Note,
157                        workspace: Some("longmemeval-bench".to_string()),
158                        ..Default::default()
159                    },
160                )
161            })
162            .unwrap();
163
164        let update2 = UpdateMemoryInput {
165            content: Some("Service config: timeout=60s (doubled for reliability)".to_string()),
166            memory_type: None,
167            tags: None,
168            metadata: None,
169            importance: None,
170            scope: None,
171            ttl_seconds: None,
172            event_time: None,
173            trigger_pattern: None,
174        };
175        let _ = storage.with_connection(|conn| update_memory(conn, tag_mem.id, &update2));
176
177        let updated2: Option<String> = storage
178            .with_connection(|conn| {
179                conn.query_row(
180                    "SELECT content FROM memories WHERE id = ?1",
181                    [tag_mem.id],
182                    |row| row.get(0),
183                )
184                .map_err(crate::error::EngramError::Database)
185            })
186            .ok();
187
188        if let Some(c) = updated2 {
189            if c.contains("timeout=60s") {
190                correct += 1;
191            }
192        }
193
194        // Test 3: Verify original value is replaced (not duplicated)
195        let count: i64 = storage
196            .with_connection(|conn| {
197                conn.query_row(
198                    "SELECT COUNT(*) FROM memories WHERE content LIKE '%timeout=30s%' AND id = ?1",
199                    [tag_mem.id],
200                    |row| row.get(0),
201                )
202                .map_err(crate::error::EngramError::Database)
203            })
204            .unwrap_or(1);
205
206        if count == 0 {
207            correct += 1;
208        }
209
210        correct as f64 / total as f64
211    }
212
213    /// Evaluate multi-hop: store chained facts, verify both are retrievable via linking keyword
214    fn eval_multi_hop(&self, storage: &Storage) -> f64 {
215        let cases = vec![
216            TestCase {
217                setup_memories: vec![
218                    "Node A connects to Node B via link L1".to_string(),
219                    "Node B connects to Node C via link L2".to_string(),
220                ],
221                query_keyword: "Node B".to_string(),
222                expected_content_substring: "connects".to_string(),
223            },
224            TestCase {
225                setup_memories: vec![
226                    "Company Acme acquired Startup X in 2024".to_string(),
227                    "Startup X built the Zephyr product".to_string(),
228                    "Zephyr product has 50,000 active users".to_string(),
229                ],
230                query_keyword: "Zephyr".to_string(),
231                expected_content_substring: "users".to_string(),
232            },
233        ];
234
235        self.run_cases(storage, &cases)
236    }
237
238    /// Evaluate contradiction detection: store conflicting facts, verify both appear
239    fn eval_contradiction_detection(&self, storage: &Storage) -> f64 {
240        let mut correct = 0usize;
241        let total = 2usize;
242
243        let pairs = [
244            (
245                "Server capacity is 100 concurrent users (from 2025-01 report)",
246                "Server capacity is 500 concurrent users (from 2026-01 report)",
247                "concurrent users",
248            ),
249            (
250                "API rate limit is 100 req/min per client",
251                "API rate limit is 1000 req/min per client (updated)",
252                "rate limit",
253            ),
254        ];
255
256        for (fact_a, fact_b, keyword) in &pairs {
257            let _ = storage.with_connection(|conn| {
258                create_memory(
259                    conn,
260                    &CreateMemoryInput {
261                        content: fact_a.to_string(),
262                        memory_type: MemoryType::Note,
263                        workspace: Some("longmemeval-bench".to_string()),
264                        ..Default::default()
265                    },
266                )
267            });
268            let _ = storage.with_connection(|conn| {
269                create_memory(
270                    conn,
271                    &CreateMemoryInput {
272                        content: fact_b.to_string(),
273                        memory_type: MemoryType::Note,
274                        workspace: Some("longmemeval-bench".to_string()),
275                        ..Default::default()
276                    },
277                )
278            });
279
280            let count: i64 = storage
281                .with_connection(|conn| {
282                    conn.query_row(
283                        "SELECT COUNT(*) FROM memories WHERE content LIKE ?1",
284                        [format!("%{}%", keyword)],
285                        |row| row.get(0),
286                    )
287                    .map_err(crate::error::EngramError::Database)
288                })
289                .unwrap_or(0);
290
291            if count >= 2 {
292                correct += 1;
293            }
294        }
295
296        correct as f64 / total as f64
297    }
298
299    /// Helper: store memories for each case and check if expected content is retrievable
300    fn run_cases(&self, storage: &Storage, cases: &[TestCase]) -> f64 {
301        if cases.is_empty() {
302            return 1.0;
303        }
304
305        let mut correct = 0usize;
306
307        for case in cases {
308            for content in &case.setup_memories {
309                let _ = storage.with_connection(|conn| {
310                    create_memory(
311                        conn,
312                        &CreateMemoryInput {
313                            content: content.clone(),
314                            memory_type: MemoryType::Note,
315                            workspace: Some("longmemeval-bench".to_string()),
316                            ..Default::default()
317                        },
318                    )
319                });
320            }
321
322            let retrieved: Option<String> = storage
323                .with_connection(|conn| {
324                    conn.query_row(
325                        "SELECT content FROM memories WHERE content LIKE ?1 LIMIT 1",
326                        [format!("%{}%", case.query_keyword)],
327                        |row| row.get(0),
328                    )
329                    .map_err(crate::error::EngramError::Database)
330                })
331                .ok();
332
333            if let Some(content) = retrieved {
334                if content.contains(&case.expected_content_substring) {
335                    correct += 1;
336                }
337            }
338        }
339
340        correct as f64 / cases.len() as f64
341    }
342
343    /// Compute weighted score across dimensions
344    fn weighted_score(&self, scores: &HashMap<String, f64>) -> f64 {
345        let total_weight: f64 = self.dimension_weights.values().sum();
346        if total_weight == 0.0 {
347            return 0.0;
348        }
349
350        self.dimension_weights
351            .iter()
352            .filter_map(|(dim, &weight)| scores.get(dim).map(|&score| score * weight))
353            .sum::<f64>()
354            / total_weight
355    }
356}
357
358impl Benchmark for LongMemEvalBenchmark {
359    fn name(&self) -> &str {
360        "longmemeval"
361    }
362
363    fn description(&self) -> &str {
364        "5-dimension memory evaluation benchmark: information retention, temporal reasoning, \
365         knowledge update, multi-hop reasoning, and contradiction detection."
366    }
367
368    fn run(&self, db_path: &str) -> Result<BenchmarkResult, Box<dyn std::error::Error>> {
369        let start = Instant::now();
370
371        let storage = if db_path == ":memory:" {
372            Storage::open_in_memory()?
373        } else {
374            let bench_path = format!("{}.longmemeval_bench.db", db_path);
375            Storage::open(StorageConfig {
376                db_path: bench_path,
377                storage_mode: StorageMode::Local,
378                cloud_uri: None,
379                encrypt_cloud: false,
380                confidence_half_life_days: 30.0,
381                auto_sync: false,
382                sync_debounce_ms: 5000,
383            })?
384        };
385
386        let retention = self.eval_information_retention(&storage);
387        let temporal = self.eval_temporal_reasoning(&storage);
388        let knowledge_update = self.eval_knowledge_update(&storage);
389        let multi_hop = self.eval_multi_hop(&storage);
390        let contradiction = self.eval_contradiction_detection(&storage);
391
392        let mut dimension_scores = HashMap::new();
393        dimension_scores.insert("information_retention".to_string(), retention);
394        dimension_scores.insert("temporal_reasoning".to_string(), temporal);
395        dimension_scores.insert("knowledge_update".to_string(), knowledge_update);
396        dimension_scores.insert("multi_hop".to_string(), multi_hop);
397        dimension_scores.insert("contradiction_detection".to_string(), contradiction);
398
399        let weighted = self.weighted_score(&dimension_scores);
400
401        let duration_ms = start.elapsed().as_millis() as u64;
402
403        let mut metrics = dimension_scores;
404        metrics.insert("weighted_score".to_string(), weighted);
405
406        // Clean up temporary file
407        if db_path != ":memory:" {
408            let bench_path = format!("{}.longmemeval_bench.db", db_path);
409            drop(storage);
410            let _ = std::fs::remove_file(&bench_path);
411            let _ = std::fs::remove_file(format!("{}-wal", bench_path));
412            let _ = std::fs::remove_file(format!("{}-shm", bench_path));
413        }
414
415        Ok(BenchmarkResult {
416            name: self.name().to_string(),
417            metrics,
418            duration_ms,
419            timestamp: chrono::Utc::now().to_rfc3339(),
420        })
421    }
422}
423
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `bench` against a fresh in-memory database, panicking on failure.
    fn run_in_memory(bench: &LongMemEvalBenchmark) -> BenchmarkResult {
        bench.run(":memory:").expect("benchmark should succeed")
    }

    /// The benchmark completes and reports under its own name.
    #[test]
    fn test_longmemeval_runs() {
        let outcome = run_in_memory(&LongMemEvalBenchmark::default());
        assert_eq!(outcome.name, "longmemeval");
    }

    /// Every dimension, plus the combined score, appears in the metrics.
    #[test]
    fn test_all_dimensions_present() {
        let outcome = run_in_memory(&LongMemEvalBenchmark::default());

        let required = [
            "information_retention",
            "temporal_reasoning",
            "knowledge_update",
            "multi_hop",
            "contradiction_detection",
            "weighted_score",
        ];
        for dim in required.iter() {
            assert!(
                outcome.metrics.contains_key(*dim),
                "missing dimension: {}",
                dim
            );
        }
    }

    /// Every reported metric is a fraction between 0 and 1 inclusive.
    #[test]
    fn test_scores_in_range() {
        let outcome = run_in_memory(&LongMemEvalBenchmark::default());

        for (key, value) in outcome.metrics.iter() {
            let in_range = (0.0..=1.0).contains(value);
            assert!(
                in_range,
                "metric '{}' = {} out of range [0,1]",
                key,
                value
            );
        }
    }

    /// With all weight on retention, the weighted score equals the retention score.
    #[test]
    fn test_weighted_score_with_custom_weights() {
        let dimension_weights: HashMap<String, f64> = [
            ("information_retention", 1.0),
            ("temporal_reasoning", 0.0),
            ("knowledge_update", 0.0),
            ("multi_hop", 0.0),
            ("contradiction_detection", 0.0),
        ]
        .iter()
        .map(|&(dim, weight)| (dim.to_string(), weight))
        .collect();

        let outcome = run_in_memory(&LongMemEvalBenchmark { dimension_weights });

        let retention = outcome.metrics["information_retention"];
        let weighted = outcome.metrics["weighted_score"];
        let gap = (weighted - retention).abs();
        assert!(
            gap < 1e-9,
            "weighted={} retention={}",
            weighted,
            retention
        );
    }
}