// locus-sdk 0.1.2
//
// SDK-first STTP memory primitives and AI provider abstraction
// Documentation
use std::fs;
use std::path::Path;

use anyhow::Result;
use chrono::{Duration, Utc};
use fake::faker::lorem::en::Sentence;
use fake::rand::SeedableRng as FakeSeedableRng;
use fake::rand::rngs::StdRng as FakeStdRng;
use fake::Fake;
use rand::distributions::{Distribution, WeightedIndex};
use rand::seq::SliceRandom;
use rand::{Rng, SeedableRng};
use rand_chacha::ChaCha8Rng;
use serde::{Deserialize, Serialize};

/// A lexicon entry: a term together with its relative sampling weight.
///
/// Serialized with camelCase field names.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct WeightedTerm {
    /// The term text used as an anchor in generated records.
    pub term: String,
    /// Relative likelihood of drawing this term; a weight of 0 is treated as 1 when sampling.
    pub weight: u32,
}

/// Relative weights used to pick the tier label of each generated record.
///
/// The values are relative, not percentages. If the weights cannot form a
/// valid distribution (for example, all are zero), the tier falls back to `"raw"`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct TierWeights {
    /// Weight of the `"raw"` tier.
    pub raw: u32,
    /// Weight of the `"daily"` tier.
    pub daily: u32,
    /// Weight of the `"weekly"` tier.
    pub weekly: u32,
    /// Weight of the `"monthly"` tier.
    pub monthly: u32,
}

impl Default for TierWeights {
    /// Returns the default tier mix: 70 raw, 15 daily, 10 weekly, 5 monthly.
    fn default() -> Self {
        let (raw, daily, weekly, monthly) = (70, 15, 10, 5);
        Self {
            raw,
            daily,
            weekly,
            monthly,
        }
    }
}

/// Settings that control synthetic record generation.
///
/// Serialized with camelCase field names.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct FakerConfig {
    /// Seed for both random number generators used during generation.
    pub seed: u64,
    /// Number of sessions to generate; values below 1 are treated as 1.
    pub sessions: usize,
    /// Lower bound (inclusive) on records per session; values below 1 are treated as 1.
    pub min_nodes_per_session: usize,
    /// Upper bound (inclusive) on records per session; raised to the lower bound if smaller.
    pub max_nodes_per_session: usize,
    /// Relative weights for choosing each record's tier label.
    pub tier_distribution: TierWeights,
    /// Probability (clamped to `[0, 1]`) of appending a filler word on each noise pass.
    pub filler_ratio: f32,
    /// Probability (clamped to `[0, 1]`) of appending a distractor sentence on each noise pass.
    pub topic_drift: f32,
    /// How many days back from the current time timestamps may fall; values below 1 are treated as 1.
    pub timestamp_span_days: usize,
    /// Weighted terms from which anchor terms are drawn; an empty list falls back to
    /// `"memory"` and `"session"`.
    pub domain_lexicon: Vec<WeightedTerm>,
}

impl Default for FakerConfig {
    /// Returns a small, fixed-seed configuration with an eight-term default lexicon.
    fn default() -> Self {
        // (term, weight) pairs for the built-in lexicon, in their original order.
        let lexicon_table: [(&str, u32); 8] = [
            ("retrieval", 10),
            ("session", 10),
            ("embedding", 9),
            ("fallback", 8),
            ("aggregate", 7),
            ("transform", 7),
            ("schema", 6),
            ("parser", 5),
        ];

        let domain_lexicon = lexicon_table
            .iter()
            .map(|&(term, weight)| WeightedTerm {
                term: term.to_string(),
                weight,
            })
            .collect();

        Self {
            seed: 42,
            sessions: 5,
            min_nodes_per_session: 5,
            max_nodes_per_session: 15,
            tier_distribution: TierWeights::default(),
            filler_ratio: 0.18,
            topic_drift: 0.22,
            timestamp_span_days: 30,
            domain_lexicon,
        }
    }
}

/// Measured noise content of one generated text.
///
/// Both ratios are counts divided by the total number of text fragments in the record.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct NoiseProfile {
    /// Fraction of fragments that are filler words.
    pub filler_ratio_actual: f32,
    /// Fraction of fragments that are distractor sentences.
    pub distractor_ratio: f32,
}

/// One synthetic record produced by [`SttpFakerBuilder::generate`].
///
/// Serialized with camelCase field names.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct FakerOutputRecord {
    /// Record identifier: the session id followed by a 1-based, zero-padded node number.
    pub synthetic_id: String,
    /// Identifier of the owning session, e.g. `session-001`.
    pub session_id: String,
    /// Tier label; the generator emits `"raw"`, `"daily"`, `"weekly"` or `"monthly"`.
    pub tier: String,
    /// Timestamp of the record, placed in the past relative to generation time.
    pub timestamp: chrono::DateTime<Utc>,
    /// Generated text: anchor phrases mixed with filler and distractor fragments.
    pub raw_text: String,
    /// Anchor terms that were embedded in `raw_text`.
    pub expected_anchor_terms: Vec<String>,
    /// Measured filler and distractor ratios for `raw_text`.
    pub noise_profile: NoiseProfile,
}

/// Generates synthetic records from a [`FakerConfig`].
pub struct SttpFakerBuilder {
    // Generation settings supplied at construction time.
    config: FakerConfig,
}

impl SttpFakerBuilder {
    /// Creates a builder that generates records according to `config`.
    pub fn new(config: FakerConfig) -> Self {
        Self { config }
    }

    /// Generates the synthetic records described by the configuration.
    ///
    /// Sessions are numbered from 1 and each receives a random number of
    /// records within the configured (sanitized) bounds. Tiers, anchor terms
    /// and text are driven by RNGs seeded from `config.seed`, so they are
    /// reproducible for a given configuration. Timestamps are offsets into
    /// the past from the wall-clock time at which this method is called, so
    /// they differ between runs.
    pub fn generate(&self) -> Vec<FakerOutputRecord> {
        let mut rng = ChaCha8Rng::seed_from_u64(self.config.seed);
        let mut fake_rng = FakeStdRng::seed_from_u64(self.config.seed);

        // Sanitize the bounds so the ranges below are never empty.
        let min_nodes = self.config.min_nodes_per_session.max(1);
        let max_nodes = self.config.max_nodes_per_session.max(min_nodes);
        // NOTE(review): the `as` cast wraps for spans larger than `i64::MAX`.
        let span_days = self.config.timestamp_span_days.max(1) as i64;

        // Read the clock once so every record in this run is offset from the
        // same reference instant instead of a slightly later one per record.
        let now = Utc::now();

        let mut records = Vec::new();

        for session_index in 0..self.config.sessions.max(1) {
            let session_id = format!("session-{:03}", session_index + 1);
            let node_count = rng.gen_range(min_nodes..=max_nodes);

            for node_index in 0..node_count {
                // The order of RNG draws below determines the seeded output;
                // keep it stable.
                let tier = sample_tier(&self.config.tier_distribution, &mut rng);
                let day_offset = rng.gen_range(0..span_days);
                let minute_offset = rng.gen_range(0..(24 * 60));
                let timestamp =
                    now - Duration::days(day_offset) - Duration::minutes(minute_offset);

                let anchor_terms = sample_anchor_terms(&self.config.domain_lexicon, 5, &mut rng);
                let (raw_text, noise_profile) = compose_text(
                    &anchor_terms,
                    self.config.filler_ratio,
                    self.config.topic_drift,
                    &mut rng,
                    &mut fake_rng,
                );

                records.push(FakerOutputRecord {
                    synthetic_id: format!("{}-{:04}", session_id, node_index + 1),
                    session_id: session_id.clone(),
                    tier,
                    timestamp,
                    raw_text,
                    expected_anchor_terms: anchor_terms,
                    noise_profile,
                });
            }
        }

        records
    }
}

/// Serializes `records` as JSON Lines: one JSON object per line, separated by
/// `\n`, with no trailing newline.
///
/// # Errors
///
/// Returns the first serialization error encountered.
pub fn records_to_jsonl(records: &[FakerOutputRecord]) -> Result<String> {
    let mut output = String::new();
    for (position, record) in records.iter().enumerate() {
        // Separator goes between lines only, never after the last one.
        if position > 0 {
            output.push('\n');
        }
        output.push_str(&serde_json::to_string(record)?);
    }
    Ok(output)
}

/// Serializes `records` as JSON Lines and writes the result to `path`.
///
/// # Errors
///
/// Returns an error if serialization fails or the file cannot be written.
pub fn write_jsonl_fixture(path: &Path, records: &[FakerOutputRecord]) -> Result<()> {
    let payload = records_to_jsonl(records)?;
    fs::write(path, payload).map_err(Into::into)
}

/// Picks a tier label at random according to `weights`.
///
/// Falls back to `"raw"` when the weights do not form a valid distribution.
fn sample_tier(weights: &TierWeights, rng: &mut ChaCha8Rng) -> String {
    const LABELS: [&str; 4] = ["raw", "daily", "weekly", "monthly"];

    // Weight order must match `LABELS`.
    let chosen = WeightedIndex::new([weights.raw, weights.daily, weights.weekly, weights.monthly])
        .map(|dist| dist.sample(rng))
        .unwrap_or(0);

    LABELS[chosen].to_string()
}

/// Draws anchor terms from `lexicon`, weighted by each entry's `weight`.
///
/// Performs `count` draws (at least one) with replacement and keeps each
/// distinct term once, in first-seen order. The result therefore holds
/// between 1 and `count` terms. Weights of 0 are treated as 1. An empty
/// lexicon yields the fixed fallback `["memory", "session"]`.
fn sample_anchor_terms(lexicon: &[WeightedTerm], count: usize, rng: &mut ChaCha8Rng) -> Vec<String> {
    if lexicon.is_empty() {
        return vec!["memory".to_string(), "session".to_string()];
    }

    let draws = count.max(1);
    let weights = lexicon
        .iter()
        .map(|item| item.weight.max(1))
        .collect::<Vec<_>>();

    // The distribution depends only on the lexicon, so build it once rather
    // than on every draw. `None` selects the uniform fallback below.
    let distribution = WeightedIndex::new(&weights).ok();

    // The number of distinct terms can never exceed the lexicon size.
    let mut picked: Vec<String> = Vec::with_capacity(draws.min(lexicon.len()));

    for _ in 0..draws {
        let index = match &distribution {
            Some(dist) => dist.sample(rng),
            None => rng.gen_range(0..lexicon.len()),
        };
        let term = &lexicon[index].term;
        // Only clone terms that are actually kept.
        if !picked.contains(term) {
            picked.push(term.clone());
        }
    }

    // At least one draw happens and the first draw is always kept, so
    // `picked` is never empty here.
    picked
}

/// Builds the text of one record and measures how noisy it is.
///
/// The text consists of one phrase per anchor term plus one generated
/// sentence. A noise pass then runs once per initial fragment: each iteration
/// may append a filler word (probability `filler_ratio`) and a distractor
/// sentence (probability `topic_drift`). The fragments are shuffled and joined
/// with `". "`.
///
/// Both probabilities are clamped to `[0, 1]`; NaN is treated as 0.
///
/// Returns the text and a [`NoiseProfile`] whose ratios are the filler and
/// distractor counts divided by the final fragment count.
fn compose_text(
    anchors: &[String],
    filler_ratio: f32,
    topic_drift: f32,
    rng: &mut ChaCha8Rng,
    fake_rng: &mut FakeStdRng,
) -> (String, NoiseProfile) {
    let filler_words = [
        "basically",
        "actually",
        "really",
        "just",
        "kind of",
        "sort of",
        "you know",
        "i mean",
    ];

    // `f32::clamp` passes NaN through and `gen_bool` panics on NaN, so map
    // NaN to 0 before clamping.
    let to_probability = |ratio: f32| -> f64 {
        if ratio.is_nan() {
            0.0
        } else {
            f64::from(ratio.clamp(0.0, 1.0))
        }
    };
    let filler_probability = to_probability(filler_ratio);
    let drift_probability = to_probability(topic_drift);

    let mut fragments: Vec<String> = anchors
        .iter()
        .map(|anchor| format!("{} pipeline stability and retrieval behavior", anchor))
        .collect();

    let extra_sentence: String = Sentence(6..12).fake_with_rng(fake_rng);
    fragments.push(extra_sentence.to_ascii_lowercase());

    let mut filler_count = 0usize;
    let mut distractor_count = 0usize;
    // Fixed before the loop: fragments appended below do not extend the pass.
    let sample_size = fragments.len().max(1);

    for _ in 0..sample_size {
        if rng.gen_bool(filler_probability) {
            if let Some(filler) = filler_words.choose(rng) {
                fragments.push((*filler).to_string());
                filler_count += 1;
            }
        }

        if rng.gen_bool(drift_probability) {
            let distractor: String = Sentence(3..8).fake_with_rng(fake_rng);
            fragments.push(distractor.to_ascii_lowercase());
            distractor_count += 1;
        }
    }

    fragments.shuffle(rng);

    let total = fragments.len().max(1) as f32;
    let noise_profile = NoiseProfile {
        filler_ratio_actual: filler_count as f32 / total,
        distractor_ratio: distractor_count as f32 / total,
    };

    (fragments.join(". "), noise_profile)
}

#[cfg(test)]
mod tests {
    use super::{FakerConfig, FakerOutputRecord, SttpFakerBuilder, records_to_jsonl};

    /// Two runs with the same seed must agree on every seeded field of every record.
    #[test]
    fn seeded_generation_is_deterministic() {
        let config = FakerConfig {
            seed: 7,
            sessions: 2,
            min_nodes_per_session: 2,
            max_nodes_per_session: 2,
            ..Default::default()
        };

        let left = SttpFakerBuilder::new(config.clone()).generate();
        let right = SttpFakerBuilder::new(config).generate();

        // 2 sessions x exactly 2 nodes each.
        assert_eq!(left.len(), 4);
        assert_eq!(left.len(), right.len());

        // Timestamps depend on the wall clock, so only seeded fields are compared.
        for (a, b) in left.iter().zip(right.iter()) {
            assert_eq!(a.synthetic_id, b.synthetic_id);
            assert_eq!(a.session_id, b.session_id);
            assert_eq!(a.tier, b.tier);
            assert_eq!(a.raw_text, b.raw_text);
            assert_eq!(a.expected_anchor_terms, b.expected_anchor_terms);
        }
    }

    /// The JSONL export has one parseable line per record, in record order.
    #[test]
    fn jsonl_export_produces_one_line_per_record() {
        let config = FakerConfig {
            seed: 9,
            sessions: 1,
            min_nodes_per_session: 3,
            max_nodes_per_session: 3,
            ..Default::default()
        };

        let records = SttpFakerBuilder::new(config).generate();
        let jsonl = records_to_jsonl(&records).expect("jsonl export should succeed");

        assert_eq!(jsonl.lines().count(), 3);

        for (line, record) in jsonl.lines().zip(records.iter()) {
            let parsed: FakerOutputRecord =
                serde_json::from_str(line).expect("each line should be a valid record");
            assert_eq!(parsed.synthetic_id, record.synthetic_id);
            assert_eq!(parsed.raw_text, record.raw_text);
        }
    }
}