Skip to main content

locus_sdk/testing/
faker.rs

1use std::fs;
2use std::path::Path;
3
4use anyhow::Result;
5use chrono::{Duration, Utc};
6use fake::faker::lorem::en::Sentence;
7use fake::rand::SeedableRng as FakeSeedableRng;
8use fake::rand::rngs::StdRng as FakeStdRng;
9use fake::Fake;
10use rand::distributions::{Distribution, WeightedIndex};
11use rand::seq::SliceRandom;
12use rand::{Rng, SeedableRng};
13use rand_chacha::ChaCha8Rng;
14use serde::{Deserialize, Serialize};
15
16#[derive(Debug, Clone, Serialize, Deserialize)]
17#[serde(rename_all = "camelCase")]
18pub struct WeightedTerm {
19    pub term: String,
20    pub weight: u32,
21}
22
23#[derive(Debug, Clone, Serialize, Deserialize)]
24#[serde(rename_all = "camelCase")]
25pub struct TierWeights {
26    pub raw: u32,
27    pub daily: u32,
28    pub weekly: u32,
29    pub monthly: u32,
30}
31
32impl Default for TierWeights {
33    fn default() -> Self {
34        Self {
35            raw: 70,
36            daily: 15,
37            weekly: 10,
38            monthly: 5,
39        }
40    }
41}
42
43#[derive(Debug, Clone, Serialize, Deserialize)]
44#[serde(rename_all = "camelCase")]
45pub struct FakerConfig {
46    pub seed: u64,
47    pub sessions: usize,
48    pub min_nodes_per_session: usize,
49    pub max_nodes_per_session: usize,
50    pub tier_distribution: TierWeights,
51    pub filler_ratio: f32,
52    pub topic_drift: f32,
53    pub timestamp_span_days: usize,
54    pub domain_lexicon: Vec<WeightedTerm>,
55}
56
57impl Default for FakerConfig {
58    fn default() -> Self {
59        Self {
60            seed: 42,
61            sessions: 5,
62            min_nodes_per_session: 5,
63            max_nodes_per_session: 15,
64            tier_distribution: TierWeights::default(),
65            filler_ratio: 0.18,
66            topic_drift: 0.22,
67            timestamp_span_days: 30,
68            domain_lexicon: vec![
69                WeightedTerm {
70                    term: "retrieval".to_string(),
71                    weight: 10,
72                },
73                WeightedTerm {
74                    term: "session".to_string(),
75                    weight: 10,
76                },
77                WeightedTerm {
78                    term: "embedding".to_string(),
79                    weight: 9,
80                },
81                WeightedTerm {
82                    term: "fallback".to_string(),
83                    weight: 8,
84                },
85                WeightedTerm {
86                    term: "aggregate".to_string(),
87                    weight: 7,
88                },
89                WeightedTerm {
90                    term: "transform".to_string(),
91                    weight: 7,
92                },
93                WeightedTerm {
94                    term: "schema".to_string(),
95                    weight: 6,
96                },
97                WeightedTerm {
98                    term: "parser".to_string(),
99                    weight: 5,
100                },
101            ],
102        }
103    }
104}
105
106#[derive(Debug, Clone, Serialize, Deserialize)]
107#[serde(rename_all = "camelCase")]
108pub struct NoiseProfile {
109    pub filler_ratio_actual: f32,
110    pub distractor_ratio: f32,
111}
112
113#[derive(Debug, Clone, Serialize, Deserialize)]
114#[serde(rename_all = "camelCase")]
115pub struct FakerOutputRecord {
116    pub synthetic_id: String,
117    pub session_id: String,
118    pub tier: String,
119    pub timestamp: chrono::DateTime<Utc>,
120    pub raw_text: String,
121    pub expected_anchor_terms: Vec<String>,
122    pub noise_profile: NoiseProfile,
123}
124
125pub struct SttpFakerBuilder {
126    config: FakerConfig,
127}
128
129impl SttpFakerBuilder {
130    pub fn new(config: FakerConfig) -> Self {
131        Self { config }
132    }
133
134    pub fn generate(&self) -> Vec<FakerOutputRecord> {
135        let mut rng = ChaCha8Rng::seed_from_u64(self.config.seed);
136        let mut fake_rng = FakeStdRng::seed_from_u64(self.config.seed);
137
138        let min_nodes = self.config.min_nodes_per_session.max(1);
139        let max_nodes = self.config.max_nodes_per_session.max(min_nodes);
140        let span_days = self.config.timestamp_span_days.max(1) as i64;
141
142        let mut records = Vec::new();
143
144        for session_index in 0..self.config.sessions.max(1) {
145            let session_id = format!("session-{:03}", session_index + 1);
146            let node_count = rng.gen_range(min_nodes..=max_nodes);
147
148            for node_index in 0..node_count {
149                let tier = sample_tier(&self.config.tier_distribution, &mut rng);
150                let day_offset = rng.gen_range(0..span_days);
151                let minute_offset = rng.gen_range(0..(24 * 60));
152                let timestamp = Utc::now()
153                    - Duration::days(day_offset)
154                    - Duration::minutes(minute_offset);
155
156                let anchor_terms = sample_anchor_terms(&self.config.domain_lexicon, 5, &mut rng);
157                let (raw_text, noise_profile) = compose_text(
158                    &anchor_terms,
159                    self.config.filler_ratio,
160                    self.config.topic_drift,
161                    &mut rng,
162                    &mut fake_rng,
163                );
164
165                records.push(FakerOutputRecord {
166                    synthetic_id: format!("{}-{:04}", session_id, node_index + 1),
167                    session_id: session_id.clone(),
168                    tier,
169                    timestamp,
170                    raw_text,
171                    expected_anchor_terms: anchor_terms,
172                    noise_profile,
173                });
174            }
175        }
176
177        records
178    }
179}
180
181pub fn records_to_jsonl(records: &[FakerOutputRecord]) -> Result<String> {
182    let lines = records
183        .iter()
184        .map(serde_json::to_string)
185        .collect::<std::result::Result<Vec<_>, _>>()?;
186    Ok(lines.join("\n"))
187}
188
189pub fn write_jsonl_fixture(path: &Path, records: &[FakerOutputRecord]) -> Result<()> {
190    let jsonl = records_to_jsonl(records)?;
191    fs::write(path, jsonl)?;
192    Ok(())
193}
194
195fn sample_tier(weights: &TierWeights, rng: &mut ChaCha8Rng) -> String {
196    let labels = ["raw", "daily", "weekly", "monthly"];
197    let weight_values = [weights.raw, weights.daily, weights.weekly, weights.monthly];
198
199    let index = if let Ok(dist) = WeightedIndex::new(weight_values) {
200        dist.sample(rng)
201    } else {
202        0
203    };
204
205    labels[index].to_string()
206}
207
208fn sample_anchor_terms(lexicon: &[WeightedTerm], count: usize, rng: &mut ChaCha8Rng) -> Vec<String> {
209    if lexicon.is_empty() {
210        return vec!["memory".to_string(), "session".to_string()];
211    }
212
213    let mut picked = Vec::new();
214    let weights = lexicon
215        .iter()
216        .map(|item| item.weight.max(1))
217        .collect::<Vec<_>>();
218
219    for _ in 0..count.max(1) {
220        let index = if let Ok(dist) = WeightedIndex::new(&weights) {
221            dist.sample(rng)
222        } else {
223            rng.gen_range(0..lexicon.len())
224        };
225        let term = lexicon[index].term.clone();
226        if !picked.iter().any(|existing| existing == &term) {
227            picked.push(term);
228        }
229    }
230
231    if picked.is_empty() {
232        picked.push(lexicon[0].term.clone());
233    }
234
235    picked
236}
237
238fn compose_text(
239    anchors: &[String],
240    filler_ratio: f32,
241    topic_drift: f32,
242    rng: &mut ChaCha8Rng,
243    fake_rng: &mut FakeStdRng,
244) -> (String, NoiseProfile) {
245    let filler_words = [
246        "basically",
247        "actually",
248        "really",
249        "just",
250        "kind of",
251        "sort of",
252        "you know",
253        "i mean",
254    ];
255
256    let mut fragments = Vec::new();
257    for anchor in anchors {
258        let phrase = format!("{} pipeline stability and retrieval behavior", anchor);
259        fragments.push(phrase);
260    }
261
262    let extra_sentence: String = Sentence(6..12).fake_with_rng(fake_rng);
263    fragments.push(extra_sentence.to_ascii_lowercase());
264
265    let mut filler_count = 0usize;
266    let mut distractor_count = 0usize;
267    let sample_size = fragments.len().max(1);
268
269    for _ in 0..sample_size {
270        if rng.gen_bool(filler_ratio.clamp(0.0, 1.0) as f64) {
271            if let Some(filler) = filler_words.choose(rng) {
272                fragments.push((*filler).to_string());
273                filler_count += 1;
274            }
275        }
276
277        if rng.gen_bool(topic_drift.clamp(0.0, 1.0) as f64) {
278            let distractor: String = Sentence(3..8).fake_with_rng(fake_rng);
279            fragments.push(distractor.to_ascii_lowercase());
280            distractor_count += 1;
281        }
282    }
283
284    fragments.shuffle(rng);
285
286    let total = fragments.len().max(1) as f32;
287    let noise_profile = NoiseProfile {
288        filler_ratio_actual: filler_count as f32 / total,
289        distractor_ratio: distractor_count as f32 / total,
290    };
291
292    (fragments.join(". "), noise_profile)
293}
294
#[cfg(test)]
mod tests {
    use super::{records_to_jsonl, FakerConfig, SttpFakerBuilder};

    /// Builds a config with a fixed seed and an exact record count per session.
    fn fixed_config(seed: u64, sessions: usize, nodes_per_session: usize) -> FakerConfig {
        FakerConfig {
            seed,
            sessions,
            min_nodes_per_session: nodes_per_session,
            max_nodes_per_session: nodes_per_session,
            ..Default::default()
        }
    }

    #[test]
    fn seeded_generation_is_deterministic() {
        let config = fixed_config(7, 2, 2);

        let left = SttpFakerBuilder::new(config.clone()).generate();
        let right = SttpFakerBuilder::new(config).generate();

        assert_eq!(left.len(), 4);
        assert_eq!(left.len(), right.len());

        // Compare every record, not just the first. Timestamps are skipped
        // because they are derived from the wall clock.
        for (a, b) in left.iter().zip(right.iter()) {
            assert_eq!(a.synthetic_id, b.synthetic_id);
            assert_eq!(a.session_id, b.session_id);
            assert_eq!(a.tier, b.tier);
            assert_eq!(a.raw_text, b.raw_text);
            assert_eq!(a.expected_anchor_terms, b.expected_anchor_terms);
        }
    }

    #[test]
    fn jsonl_export_produces_one_line_per_record() {
        let records = SttpFakerBuilder::new(fixed_config(9, 1, 3)).generate();
        let jsonl = records_to_jsonl(&records).expect("jsonl export should succeed");

        assert_eq!(jsonl.lines().count(), 3);
    }
}