// mnem_bench/datasets/longmemeval.rs
use std::fs;
9use std::path::Path;
10
11use anyhow::{Context, Result};
12use serde::Deserialize;
13
14use super::DatasetSpec;
15use crate::bench::Bench;
16
17pub const SPEC: DatasetSpec = DatasetSpec {
21 bench: Bench::LongMemEval,
22 filename: "longmemeval_s_cleaned.json",
23 url: "https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/resolve/main/longmemeval_s_cleaned.json",
24 sha256: "",
25 bytes: 264 * 1024 * 1024,
26};
27
28#[derive(Clone, Debug, Deserialize)]
32pub struct Question {
33 pub question_id: String,
35 #[serde(default)]
37 pub question_type: Option<String>,
38 pub question: String,
40 #[serde(default)]
42 pub answer_session_ids: Vec<String>,
43 #[serde(default)]
45 pub haystack_session_ids: Vec<String>,
46 #[serde(default)]
51 pub haystack_sessions: Vec<Vec<Turn>>,
52}
53
54#[derive(Clone, Debug, Deserialize)]
56pub struct Turn {
57 #[serde(default)]
60 pub role: String,
61 #[serde(default)]
63 pub content: String,
64}
65
66pub fn load(path: &Path) -> Result<Vec<Question>> {
70 let bytes = fs::read(path).with_context(|| format!("reading {}", path.display()))?;
71 if let Ok(v) = serde_json::from_slice::<Vec<Question>>(&bytes) {
73 return Ok(v);
74 }
75 let text = std::str::from_utf8(&bytes).context("longmemeval file is not utf-8")?;
77 let mut out = Vec::new();
78 for (lineno, line) in text.lines().enumerate() {
79 let trim = line.trim();
80 if trim.is_empty() {
81 continue;
82 }
83 let q: Question = serde_json::from_str(trim)
84 .with_context(|| format!("parsing line {} of {}", lineno + 1, path.display()))?;
85 out.push(q);
86 }
87 Ok(out)
88}
89
90#[must_use]
96pub fn render_session(turns: &[Turn], cap: usize) -> String {
97 let mut lines: Vec<&str> = Vec::with_capacity(turns.len());
98 for t in turns {
99 if t.role != "user" {
100 continue;
101 }
102 let s = t.content.trim();
103 if !s.is_empty() {
104 lines.push(s);
105 }
106 }
107 let s = lines.join("\n");
108 if cap > 0 && s.len() > cap {
109 let mut end = cap;
110 while end > 0 && !s.is_char_boundary(end) {
111 end -= 1;
112 }
113 s[..end].to_string()
114 } else {
115 s
116 }
117}