1use std::fs;
2use std::path::Path;
3
4use anyhow::Result;
5use chrono::{Duration, Utc};
6use fake::faker::lorem::en::Sentence;
7use fake::rand::SeedableRng as FakeSeedableRng;
8use fake::rand::rngs::StdRng as FakeStdRng;
9use fake::Fake;
10use rand::distributions::{Distribution, WeightedIndex};
11use rand::seq::SliceRandom;
12use rand::{Rng, SeedableRng};
13use rand_chacha::ChaCha8Rng;
14use serde::{Deserialize, Serialize};
15
16#[derive(Debug, Clone, Serialize, Deserialize)]
17#[serde(rename_all = "camelCase")]
18pub struct WeightedTerm {
19 pub term: String,
20 pub weight: u32,
21}
22
23#[derive(Debug, Clone, Serialize, Deserialize)]
24#[serde(rename_all = "camelCase")]
25pub struct TierWeights {
26 pub raw: u32,
27 pub daily: u32,
28 pub weekly: u32,
29 pub monthly: u32,
30}
31
32impl Default for TierWeights {
33 fn default() -> Self {
34 Self {
35 raw: 70,
36 daily: 15,
37 weekly: 10,
38 monthly: 5,
39 }
40 }
41}
42
43#[derive(Debug, Clone, Serialize, Deserialize)]
44#[serde(rename_all = "camelCase")]
45pub struct FakerConfig {
46 pub seed: u64,
47 pub sessions: usize,
48 pub min_nodes_per_session: usize,
49 pub max_nodes_per_session: usize,
50 pub tier_distribution: TierWeights,
51 pub filler_ratio: f32,
52 pub topic_drift: f32,
53 pub timestamp_span_days: usize,
54 pub domain_lexicon: Vec<WeightedTerm>,
55}
56
57impl Default for FakerConfig {
58 fn default() -> Self {
59 Self {
60 seed: 42,
61 sessions: 5,
62 min_nodes_per_session: 5,
63 max_nodes_per_session: 15,
64 tier_distribution: TierWeights::default(),
65 filler_ratio: 0.18,
66 topic_drift: 0.22,
67 timestamp_span_days: 30,
68 domain_lexicon: vec![
69 WeightedTerm {
70 term: "retrieval".to_string(),
71 weight: 10,
72 },
73 WeightedTerm {
74 term: "session".to_string(),
75 weight: 10,
76 },
77 WeightedTerm {
78 term: "embedding".to_string(),
79 weight: 9,
80 },
81 WeightedTerm {
82 term: "fallback".to_string(),
83 weight: 8,
84 },
85 WeightedTerm {
86 term: "aggregate".to_string(),
87 weight: 7,
88 },
89 WeightedTerm {
90 term: "transform".to_string(),
91 weight: 7,
92 },
93 WeightedTerm {
94 term: "schema".to_string(),
95 weight: 6,
96 },
97 WeightedTerm {
98 term: "parser".to_string(),
99 weight: 5,
100 },
101 ],
102 }
103 }
104}
105
106#[derive(Debug, Clone, Serialize, Deserialize)]
107#[serde(rename_all = "camelCase")]
108pub struct NoiseProfile {
109 pub filler_ratio_actual: f32,
110 pub distractor_ratio: f32,
111}
112
113#[derive(Debug, Clone, Serialize, Deserialize)]
114#[serde(rename_all = "camelCase")]
115pub struct FakerOutputRecord {
116 pub synthetic_id: String,
117 pub session_id: String,
118 pub tier: String,
119 pub timestamp: chrono::DateTime<Utc>,
120 pub raw_text: String,
121 pub expected_anchor_terms: Vec<String>,
122 pub noise_profile: NoiseProfile,
123}
124
125pub struct SttpFakerBuilder {
126 config: FakerConfig,
127}
128
129impl SttpFakerBuilder {
130 pub fn new(config: FakerConfig) -> Self {
131 Self { config }
132 }
133
134 pub fn generate(&self) -> Vec<FakerOutputRecord> {
135 let mut rng = ChaCha8Rng::seed_from_u64(self.config.seed);
136 let mut fake_rng = FakeStdRng::seed_from_u64(self.config.seed);
137
138 let min_nodes = self.config.min_nodes_per_session.max(1);
139 let max_nodes = self.config.max_nodes_per_session.max(min_nodes);
140 let span_days = self.config.timestamp_span_days.max(1) as i64;
141
142 let mut records = Vec::new();
143
144 for session_index in 0..self.config.sessions.max(1) {
145 let session_id = format!("session-{:03}", session_index + 1);
146 let node_count = rng.gen_range(min_nodes..=max_nodes);
147
148 for node_index in 0..node_count {
149 let tier = sample_tier(&self.config.tier_distribution, &mut rng);
150 let day_offset = rng.gen_range(0..span_days);
151 let minute_offset = rng.gen_range(0..(24 * 60));
152 let timestamp = Utc::now()
153 - Duration::days(day_offset)
154 - Duration::minutes(minute_offset);
155
156 let anchor_terms = sample_anchor_terms(&self.config.domain_lexicon, 5, &mut rng);
157 let (raw_text, noise_profile) = compose_text(
158 &anchor_terms,
159 self.config.filler_ratio,
160 self.config.topic_drift,
161 &mut rng,
162 &mut fake_rng,
163 );
164
165 records.push(FakerOutputRecord {
166 synthetic_id: format!("{}-{:04}", session_id, node_index + 1),
167 session_id: session_id.clone(),
168 tier,
169 timestamp,
170 raw_text,
171 expected_anchor_terms: anchor_terms,
172 noise_profile,
173 });
174 }
175 }
176
177 records
178 }
179}
180
181pub fn records_to_jsonl(records: &[FakerOutputRecord]) -> Result<String> {
182 let lines = records
183 .iter()
184 .map(serde_json::to_string)
185 .collect::<std::result::Result<Vec<_>, _>>()?;
186 Ok(lines.join("\n"))
187}
188
189pub fn write_jsonl_fixture(path: &Path, records: &[FakerOutputRecord]) -> Result<()> {
190 let jsonl = records_to_jsonl(records)?;
191 fs::write(path, jsonl)?;
192 Ok(())
193}
194
195fn sample_tier(weights: &TierWeights, rng: &mut ChaCha8Rng) -> String {
196 let labels = ["raw", "daily", "weekly", "monthly"];
197 let weight_values = [weights.raw, weights.daily, weights.weekly, weights.monthly];
198
199 let index = if let Ok(dist) = WeightedIndex::new(weight_values) {
200 dist.sample(rng)
201 } else {
202 0
203 };
204
205 labels[index].to_string()
206}
207
208fn sample_anchor_terms(lexicon: &[WeightedTerm], count: usize, rng: &mut ChaCha8Rng) -> Vec<String> {
209 if lexicon.is_empty() {
210 return vec!["memory".to_string(), "session".to_string()];
211 }
212
213 let mut picked = Vec::new();
214 let weights = lexicon
215 .iter()
216 .map(|item| item.weight.max(1))
217 .collect::<Vec<_>>();
218
219 for _ in 0..count.max(1) {
220 let index = if let Ok(dist) = WeightedIndex::new(&weights) {
221 dist.sample(rng)
222 } else {
223 rng.gen_range(0..lexicon.len())
224 };
225 let term = lexicon[index].term.clone();
226 if !picked.iter().any(|existing| existing == &term) {
227 picked.push(term);
228 }
229 }
230
231 if picked.is_empty() {
232 picked.push(lexicon[0].term.clone());
233 }
234
235 picked
236}
237
238fn compose_text(
239 anchors: &[String],
240 filler_ratio: f32,
241 topic_drift: f32,
242 rng: &mut ChaCha8Rng,
243 fake_rng: &mut FakeStdRng,
244) -> (String, NoiseProfile) {
245 let filler_words = [
246 "basically",
247 "actually",
248 "really",
249 "just",
250 "kind of",
251 "sort of",
252 "you know",
253 "i mean",
254 ];
255
256 let mut fragments = Vec::new();
257 for anchor in anchors {
258 let phrase = format!("{} pipeline stability and retrieval behavior", anchor);
259 fragments.push(phrase);
260 }
261
262 let extra_sentence: String = Sentence(6..12).fake_with_rng(fake_rng);
263 fragments.push(extra_sentence.to_ascii_lowercase());
264
265 let mut filler_count = 0usize;
266 let mut distractor_count = 0usize;
267 let sample_size = fragments.len().max(1);
268
269 for _ in 0..sample_size {
270 if rng.gen_bool(filler_ratio.clamp(0.0, 1.0) as f64) {
271 if let Some(filler) = filler_words.choose(rng) {
272 fragments.push((*filler).to_string());
273 filler_count += 1;
274 }
275 }
276
277 if rng.gen_bool(topic_drift.clamp(0.0, 1.0) as f64) {
278 let distractor: String = Sentence(3..8).fake_with_rng(fake_rng);
279 fragments.push(distractor.to_ascii_lowercase());
280 distractor_count += 1;
281 }
282 }
283
284 fragments.shuffle(rng);
285
286 let total = fragments.len().max(1) as f32;
287 let noise_profile = NoiseProfile {
288 filler_ratio_actual: filler_count as f32 / total,
289 distractor_ratio: distractor_count as f32 / total,
290 };
291
292 (fragments.join(". "), noise_profile)
293}
294
#[cfg(test)]
mod tests {
    use super::{records_to_jsonl, FakerConfig, SttpFakerBuilder};

    /// Builds a config that yields exactly `nodes` records in each of
    /// `sessions` sessions.
    fn fixed_size_config(seed: u64, sessions: usize, nodes: usize) -> FakerConfig {
        FakerConfig {
            seed,
            sessions,
            min_nodes_per_session: nodes,
            max_nodes_per_session: nodes,
            ..Default::default()
        }
    }

    #[test]
    fn seeded_generation_is_deterministic() {
        let config = fixed_size_config(7, 2, 2);

        let left = SttpFakerBuilder::new(config.clone()).generate();
        let right = SttpFakerBuilder::new(config).generate();

        assert_eq!(left.len(), 4);
        assert_eq!(left.len(), right.len());

        // Compare every record, not just the first. Timestamps are skipped on
        // purpose: they are anchored to the wall clock, not the seed.
        for (a, b) in left.iter().zip(&right) {
            assert_eq!(a.synthetic_id, b.synthetic_id);
            assert_eq!(a.session_id, b.session_id);
            assert_eq!(a.tier, b.tier);
            assert_eq!(a.raw_text, b.raw_text);
            assert_eq!(a.expected_anchor_terms, b.expected_anchor_terms);
        }
    }

    #[test]
    fn jsonl_export_produces_one_line_per_record() {
        let records = SttpFakerBuilder::new(fixed_size_config(9, 1, 3)).generate();
        let jsonl = records_to_jsonl(&records).expect("jsonl export should succeed");

        assert_eq!(jsonl.lines().count(), 3);

        // Every line must be a standalone JSON object using camelCase keys.
        for line in jsonl.lines() {
            let value: serde_json::Value =
                serde_json::from_str(line).expect("each line should be valid JSON");
            assert!(value.get("syntheticId").is_some());
        }
    }
}