Skip to main content

mnem_bench/datasets/
longmemeval.rs

1//! LongMemEval-S (per-session-cleaned) dataset spec + loader.
2//!
3//! Source: HuggingFace `xiaowu0162/longmemeval-cleaned` repo, file
4//! `longmemeval_s_cleaned.json` (single JSON, not JSONL).
5//! ~264 MB. The cached copy lives at
6//! `~/.mnem/bench-data/longmemeval/longmemeval_s_cleaned.json`.
7
8use std::fs;
9use std::path::Path;
10
11use anyhow::{Context, Result};
12use serde::Deserialize;
13
14use super::DatasetSpec;
15use crate::bench::Bench;
16
/// Static download/cache spec for the LongMemEval-S (cleaned) dataset.
///
/// `sha256` is left empty: the 264MB upstream blob is not
/// digest-pinned. Anyone bypassing via `MNEM_BENCH_DATA` hits the
/// same accept path.
pub const SPEC: DatasetSpec = DatasetSpec {
    bench: Bench::LongMemEval,
    // File name of the upstream JSON; identical to the last path segment of `url`.
    filename: "longmemeval_s_cleaned.json",
    url: "https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/resolve/main/longmemeval_s_cleaned.json",
    // Intentionally empty: no digest is pinned for this blob.
    sha256: "",
    // 264 MiB, in line with the "~264 MB" quoted in the module docs.
    // NOTE(review): presumably an approximate/expected size -- confirm how
    // `DatasetSpec::bytes` is consumed before relying on it as exact.
    bytes: 264 * 1024 * 1024,
};
27
/// One benchmark question together with its haystack of sessions.
///
/// Mirrors the LongMemEval-S record shape closely enough that the
/// same JSON loads against either the per-turn or per-session
/// adapter. Only `question_id` and `question` are required when
/// deserializing; every other field falls back to its default
/// (`None` / empty list) when the key is absent.
#[derive(Clone, Debug, Deserialize)]
pub struct Question {
    /// Stable question id.
    pub question_id: String,
    /// Question category. Optional in some splits; `None` when absent.
    #[serde(default)]
    pub question_type: Option<String>,
    /// The question itself.
    pub question: String,
    /// Set of session ids that the gold answer lives in.
    /// Empty when the key is absent.
    #[serde(default)]
    pub answer_session_ids: Vec<String>,
    /// Haystack session ids, parallel to `haystack_sessions`.
    /// Deserialization does not check that the two lists have the
    /// same length.
    #[serde(default)]
    pub haystack_session_ids: Vec<String>,
    /// Per-session conversation turns: the outer list is the
    /// sessions, the inner list is the turns of one session.
    /// Each session is a list of `{role, content}` turns. We only
    /// concatenate user-role content (matches the upstream
    /// per-session adapter).
    #[serde(default)]
    pub haystack_sessions: Vec<Vec<Turn>>,
}
53
/// One conversational turn inside a session.
///
/// Both fields are optional in the JSON: a missing key deserializes
/// to the empty string rather than failing the whole record.
#[derive(Clone, Debug, Deserialize)]
pub struct Turn {
    /// `"user"` / `"assistant"` etc. The session-rendering helper
    /// keeps only `"user"` turns, so a turn with a missing (empty)
    /// role is dropped by it.
    #[serde(default)]
    pub role: String,
    /// Free-form turn content. Empty when the key is absent.
    #[serde(default)]
    pub content: String,
}
65
66/// Load + parse the LongMemEval JSON at `path`. Accepts either a
67/// JSON array of questions or a JSON-Lines file (one question per
68/// line); both are observed in the wild.
69pub fn load(path: &Path) -> Result<Vec<Question>> {
70    let bytes = fs::read(path).with_context(|| format!("reading {}", path.display()))?;
71    // Try array first.
72    if let Ok(v) = serde_json::from_slice::<Vec<Question>>(&bytes) {
73        return Ok(v);
74    }
75    // Fallback to JSON-Lines.
76    let text = std::str::from_utf8(&bytes).context("longmemeval file is not utf-8")?;
77    let mut out = Vec::new();
78    for (lineno, line) in text.lines().enumerate() {
79        let trim = line.trim();
80        if trim.is_empty() {
81            continue;
82        }
83        let q: Question = serde_json::from_str(trim)
84            .with_context(|| format!("parsing line {} of {}", lineno + 1, path.display()))?;
85        out.push(q);
86    }
87    Ok(out)
88}
89
90/// Concatenate user-role turns into the per-session string the
91/// embedder consumes. Mirrors the upstream Python adapter:
92///
93/// > only `role == "user"` turns; non-empty strips; joined on `\n`;
94/// > truncated to `cap` characters if `cap > 0`.
95#[must_use]
96pub fn render_session(turns: &[Turn], cap: usize) -> String {
97    let mut lines: Vec<&str> = Vec::with_capacity(turns.len());
98    for t in turns {
99        if t.role != "user" {
100            continue;
101        }
102        let s = t.content.trim();
103        if !s.is_empty() {
104            lines.push(s);
105        }
106    }
107    let s = lines.join("\n");
108    if cap > 0 && s.len() > cap {
109        let mut end = cap;
110        while end > 0 && !s.is_char_boundary(end) {
111            end -= 1;
112        }
113        s[..end].to_string()
114    } else {
115        s
116    }
117}