use std::fs;
use std::io::Write;
use std::path::Path;
use rig_retrieval_evals::dataset::{GoldQuery, Qrels};
use crate::error::{DciError, Result};
/// Knobs controlling the size and randomness of a synthetic log corpus.
#[derive(Debug, Clone)]
pub struct SyntheticLogConfig {
    /// Number of log files to create.
    pub files: usize,
    /// Number of regular (non-needle) lines written to each file.
    pub lines_per_file: usize,
    /// Number of needle tokens to plant; each one is assigned to a single file.
    pub needles: usize,
    /// Seed for the pseudo-random generator; it is also embedded in every
    /// needle token.
    pub seed: u64,
}

impl Default for SyntheticLogConfig {
    /// A small corpus: 8 files of 500 lines with 12 needles, seeded with 1.
    fn default() -> Self {
        SyntheticLogConfig {
            seed: 1,
            needles: 12,
            lines_per_file: 500,
            files: 8,
        }
    }
}
/// Small self-contained pseudo-random generator so output depends only on the
/// seed and not on any external crate.
struct SplitMix64 {
    state: u64,
}

impl SplitMix64 {
    /// Creates a generator whose internal state starts at `seed`.
    fn new(seed: u64) -> Self {
        SplitMix64 { state: seed }
    }

    /// Advances the internal state by a fixed increment and returns the mixed
    /// 64-bit result. All arithmetic wraps on overflow.
    fn next_u64(&mut self) -> u64 {
        const INCREMENT: u64 = 0x9E37_79B9_7F4A_7C15;
        const MULTIPLIER_A: u64 = 0xBF58_476D_1CE4_E5B9;
        const MULTIPLIER_B: u64 = 0x94D0_49BB_1331_11EB;

        self.state = self.state.wrapping_add(INCREMENT);

        let mut mixed = self.state;
        mixed ^= mixed >> 30;
        mixed = mixed.wrapping_mul(MULTIPLIER_A);
        mixed ^= mixed >> 27;
        mixed = mixed.wrapping_mul(MULTIPLIER_B);
        mixed ^ (mixed >> 31)
    }

    /// Returns a value in `0..bound`, obtained by reducing the next output
    /// modulo `bound`.
    ///
    /// A `bound` of zero yields `0` and leaves the generator state untouched.
    fn below(&mut self, bound: usize) -> usize {
        match bound {
            0 => 0,
            limit => (self.next_u64() % limit as u64) as usize,
        }
    }
}
/// Pool of user names; one is picked at random for the `user=` field of each line.
const USERS: &[&str] = &["alice", "bob", "carol", "dave", "erin", "frank"];
/// Pool of benign event descriptions; one is picked at random for the
/// `event="..."` field of each regular line.
const ACTIONS: &[&str] = &[
"login succeeded",
"session opened",
"heartbeat ok",
"config reloaded",
"cache flushed",
"job completed",
];
/// Pool of host names; one is picked at random for the `host=` field of each line.
const HOSTS: &[&str] = &["web01", "web02", "db01", "cache01", "edge01"];
pub fn generate(dir: &Path, cfg: &SyntheticLogConfig) -> Result<Qrels> {
if cfg.files == 0 {
return Err(DciError::Worker(
"synthetic generator requires at least one file".to_string(),
));
}
if !dir.is_dir() {
return Err(DciError::InvalidRoot {
path: dir.to_path_buf(),
reason: "output directory does not exist".to_string(),
});
}
let mut rng = SplitMix64::new(cfg.seed);
struct Needle {
token: String,
file_idx: usize,
line_idx: usize,
}
let mut needles = Vec::with_capacity(cfg.needles);
for i in 0..cfg.needles {
let file_idx = rng.below(cfg.files);
let line_idx = if cfg.lines_per_file == 0 {
0
} else {
rng.below(cfg.lines_per_file)
};
let token = format!("IOC-{:08x}-{i:04}", cfg.seed);
needles.push(Needle {
token,
file_idx,
line_idx,
});
}
let file_name = |idx: usize| format!("log-{idx:03}.log");
for file_idx in 0..cfg.files {
let path = dir.join(file_name(file_idx));
let mut file = fs::File::create(&path).map_err(|e| DciError::Io {
path: path.clone(),
source: e,
})?;
let needles_here: Vec<&Needle> =
needles.iter().filter(|n| n.file_idx == file_idx).collect();
for line_idx in 0..cfg.lines_per_file {
let ts = 1_700_000_000u64 + (file_idx as u64 * 100_000) + line_idx as u64;
let user = *USERS.get(rng.below(USERS.len())).unwrap_or(&"");
let host = *HOSTS.get(rng.below(HOSTS.len())).unwrap_or(&"");
let action = *ACTIONS.get(rng.below(ACTIONS.len())).unwrap_or(&"");
writeln!(file, "{ts} host={host} user={user} event=\"{action}\"").map_err(|e| {
DciError::Io {
path: path.clone(),
source: e,
}
})?;
for needle in needles_here.iter().filter(|n| n.line_idx == line_idx) {
writeln!(
file,
"{ts} host={host} user={user} event=\"alert\" indicator={}",
needle.token
)
.map_err(|e| DciError::Io {
path: path.clone(),
source: e,
})?;
}
}
}
let queries = needles
.iter()
.enumerate()
.map(|(i, needle)| {
let mut relevant = std::collections::HashMap::new();
relevant.insert(file_name(needle.file_idx), 1u8);
GoldQuery {
query_id: format!("needle-{i:04}"),
query: needle.token.clone(),
relevant_docs: relevant,
reference_answer: None,
}
})
.collect();
Ok(Qrels { queries })
}
#[cfg(test)]
mod tests {
    #![allow(
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::indexing_slicing,
        clippy::panic
    )]
    use super::*;

    /// Two runs with one configuration must agree on the query tokens and on
    /// the bytes of the first log file.
    #[test]
    fn generation_is_deterministic_for_a_seed() {
        let first_dir = tempfile::tempdir().unwrap();
        let second_dir = tempfile::tempdir().unwrap();
        let cfg = SyntheticLogConfig {
            files: 3,
            lines_per_file: 50,
            needles: 5,
            seed: 7,
        };

        let first = generate(first_dir.path(), &cfg).unwrap();
        let second = generate(second_dir.path(), &cfg).unwrap();
        assert_eq!(first.queries.len(), 5);

        let first_tokens: Vec<_> = first.queries.iter().map(|q| &q.query).collect();
        let second_tokens: Vec<_> = second.queries.iter().map(|q| &q.query).collect();
        assert_eq!(first_tokens, second_tokens);

        let [first_bytes, second_bytes] = [&first_dir, &second_dir]
            .map(|root| std::fs::read(root.path().join("log-000.log")).unwrap());
        assert_eq!(first_bytes, second_bytes);
    }

    /// Every query's token must occur in the file listed as relevant for it.
    #[test]
    fn each_needle_token_appears_in_its_file() {
        let dir = tempfile::tempdir().unwrap();
        let cfg = SyntheticLogConfig {
            files: 4,
            lines_per_file: 40,
            needles: 6,
            seed: 3,
        };
        let qrels = generate(dir.path(), &cfg).unwrap();

        qrels.queries.iter().for_each(|query| {
            let file = query.relevant_docs.keys().next().unwrap();
            let content = std::fs::read_to_string(dir.path().join(file)).unwrap();
            assert!(
                content.contains(&query.query),
                "token {} should be in {file}",
                query.query
            );
        });
    }
}