use std::fs;
use std::path::Path;
use anyhow::{Context, Result};
use serde::Deserialize;
use super::DatasetSpec;
use crate::bench::Bench;
/// Download/identity descriptor for the LongMemEval dataset file
/// (`longmemeval_s_cleaned.json`, fetched from the Hugging Face URL below).
pub const SPEC: DatasetSpec = DatasetSpec {
bench: Bench::LongMemEval,
// File name of the dataset; it matches the last path segment of `url`.
filename: "longmemeval_s_cleaned.json",
url: "https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/resolve/main/longmemeval_s_cleaned.json",
// NOTE(review): the checksum is left empty here; presumably consumers of
// `DatasetSpec` treat "" as "no checksum to verify" — confirm.
sha256: "",
// 264 MiB. NOTE(review): presumably the expected/approximate download size
// rather than an exact byte count — confirm against how `bytes` is used.
bytes: 264 * 1024 * 1024,
};
/// One record of the LongMemEval dataset file as deserialized by [`load`].
///
/// `question_id` and `question` are required; every field marked
/// `#[serde(default)]` may be absent from the input and then takes its
/// `Default` value (`None` / empty vector).
#[derive(Clone, Debug, Deserialize)]
pub struct Question {
/// Identifier of the question (required).
pub question_id: String,
/// Optional category label of the question; `None` when absent.
#[serde(default)]
pub question_type: Option<String>,
/// The question text (required).
pub question: String,
/// Session ids associated with the answer.
/// NOTE(review): presumably the sessions that contain the evidence for the
/// answer — confirm against the dataset documentation.
#[serde(default)]
pub answer_session_ids: Vec<String>,
/// Ids of the sessions in the haystack.
/// NOTE(review): presumably index-aligned with `haystack_sessions` — confirm.
#[serde(default)]
pub haystack_session_ids: Vec<String>,
/// The haystack sessions themselves; each inner vector is a list of turns.
#[serde(default)]
pub haystack_sessions: Vec<Vec<Turn>>,
}
/// A single turn inside a session.
///
/// Both fields default to the empty string when missing from the input.
#[derive(Clone, Debug, Deserialize)]
pub struct Turn {
/// Speaker role. `render_session` keeps only turns whose role is exactly
/// `"user"`.
#[serde(default)]
pub role: String,
/// Text of the turn.
#[serde(default)]
pub content: String,
}
pub fn load(path: &Path) -> Result<Vec<Question>> {
let bytes = fs::read(path).with_context(|| format!("reading {}", path.display()))?;
if let Ok(v) = serde_json::from_slice::<Vec<Question>>(&bytes) {
return Ok(v);
}
let text = std::str::from_utf8(&bytes).context("longmemeval file is not utf-8")?;
let mut out = Vec::new();
for (lineno, line) in text.lines().enumerate() {
let trim = line.trim();
if trim.is_empty() {
continue;
}
let q: Question = serde_json::from_str(trim)
.with_context(|| format!("parsing line {} of {}", lineno + 1, path.display()))?;
out.push(q);
}
Ok(out)
}
/// Renders the user side of a session as newline-separated text.
///
/// Only turns whose `role` is exactly `"user"` contribute. Each contribution
/// is the turn's `content` with surrounding whitespace trimmed; turns that are
/// empty after trimming are dropped. The pieces are joined with `'\n'`.
///
/// `cap` is a byte limit on the result. `0` disables the limit. When the
/// joined text is longer than `cap` bytes it is cut at the largest UTF-8
/// character boundary that is `<= cap`, so the output is always valid UTF-8
/// and never exceeds `cap` bytes.
#[must_use]
pub fn render_session(turns: &[Turn], cap: usize) -> String {
    let mut rendered = turns
        .iter()
        .filter(|turn| turn.role == "user")
        .map(|turn| turn.content.trim())
        .filter(|text| !text.is_empty())
        .collect::<Vec<&str>>()
        .join("\n");

    // No limit requested, or already within it: return unchanged.
    if cap == 0 || rendered.len() <= cap {
        return rendered;
    }

    // Walk back from `cap` to the nearest char boundary; index 0 is always a
    // boundary, so the search cannot come up empty.
    let cut = (0..=cap)
        .rev()
        .find(|&idx| rendered.is_char_boundary(idx))
        .unwrap_or(0);
    rendered.truncate(cut);
    rendered
}