Skip to main content

embeddenator_testkit/
generators.rs

1//! Test data generators for VSA vectors and test datasets
2//!
3//! Provides utilities to generate:
4//! - Random sparse vectors with controlled sparsity
5//! - Deterministic vectors for reproducible testing
6//! - Noise patterns and synthetic data
7//! - Test helper functions for VSA operations
8
9use embeddenator_vsa::SparseVec;
10use rand::Rng;
11use std::collections::HashSet;
12
13/// Generate a random sparse vector with specified dimensions and sparsity
14///
15/// # Arguments
16/// * `rng` - Random number generator
17/// * `dims` - Total dimensions of the vector
18/// * `sparsity` - Number of non-zero elements (split roughly evenly between pos/neg)
19///
20/// # Example
21/// ```rust,ignore
22/// use rand::thread_rng;
23/// let mut rng = thread_rng();
24/// let vec = random_sparse_vec(&mut rng, 10000, 200);
25/// assert_eq!(vec.pos.len() + vec.neg.len(), 200);
26/// ```
27pub fn random_sparse_vec(rng: &mut impl Rng, dims: usize, sparsity: usize) -> SparseVec {
28    let mut used: HashSet<usize> = HashSet::with_capacity(sparsity.saturating_mul(2));
29    let mut pos = Vec::with_capacity(sparsity / 2);
30    let mut neg = Vec::with_capacity(sparsity / 2);
31
32    // Roughly half pos/half neg.
33    let target_each = sparsity / 2;
34    while pos.len() < target_each {
35        let idx = rng.random_range(0..dims);
36        if used.insert(idx) {
37            pos.push(idx);
38        }
39    }
40    while neg.len() < target_each {
41        let idx = rng.random_range(0..dims);
42        if used.insert(idx) {
43            neg.push(idx);
44        }
45    }
46
47    pos.sort_unstable();
48    neg.sort_unstable();
49    SparseVec { pos, neg }
50}
51
52/// Alias for `random_sparse_vec` for backwards compatibility
53pub fn mk_random_sparsevec(rng: &mut impl Rng, dims: usize, sparsity: usize) -> SparseVec {
54    random_sparse_vec(rng, dims, sparsity)
55}
56
57/// Generate a deterministic sparse vector using LCG for reproducibility
58///
59/// # Arguments
60/// * `dim` - Total dimensions of the vector
61/// * `nnz` - Number of non-zero elements
62/// * `seed` - Random seed for reproducibility
63///
64/// # Example
65/// ```rust,ignore
66/// let vec1 = deterministic_sparse_vec(10000, 200, 42);
67/// let vec2 = deterministic_sparse_vec(10000, 200, 42);
68/// assert_eq!(vec1.pos, vec2.pos);
69/// assert_eq!(vec1.neg, vec2.neg);
70/// ```
71pub fn deterministic_sparse_vec(dim: usize, nnz: usize, seed: u64) -> SparseVec {
72    // Split nnz roughly evenly between pos and neg
73    let pos_count = nnz / 2;
74    let neg_count = nnz - pos_count;
75
76    let mut state = seed;
77    let lcg = |s: &mut u64| -> u64 {
78        *s = s.wrapping_mul(6364136223846793005).wrapping_add(1);
79        *s
80    };
81
82    let mut pos = Vec::with_capacity(pos_count);
83    let mut neg = Vec::with_capacity(neg_count);
84    let mut used = HashSet::new();
85
86    for _ in 0..pos_count {
87        loop {
88            let idx = (lcg(&mut state) as usize) % dim;
89            if used.insert(idx) {
90                pos.push(idx);
91                break;
92            }
93        }
94    }
95
96    for _ in 0..neg_count {
97        loop {
98            let idx = (lcg(&mut state) as usize) % dim;
99            if used.insert(idx) {
100                neg.push(idx);
101                break;
102            }
103        }
104    }
105
106    pos.sort_unstable();
107    neg.sort_unstable();
108
109    SparseVec { pos, neg }
110}
111
/// Count the elements common to two ascending-sorted slices.
///
/// Walks both slices in lockstep (a merge-style scan): the side holding the
/// smaller head element is advanced, and equal heads are counted once and
/// consumed from both sides. Used as the building block for the sparse dot product.
fn intersection_count_sorted(a: &[usize], b: &[usize]) -> usize {
    use std::cmp::Ordering;

    let (mut lhs, mut rhs) = (a, b);
    let mut shared = 0;

    // Stop as soon as either side is exhausted; nothing further can match.
    while let ([x, lhs_tail @ ..], [y, rhs_tail @ ..]) = (lhs, rhs) {
        match x.cmp(y) {
            Ordering::Less => lhs = lhs_tail,
            Ordering::Greater => rhs = rhs_tail,
            Ordering::Equal => {
                shared += 1;
                lhs = lhs_tail;
                rhs = rhs_tail;
            }
        }
    }
    shared
}
130
131/// Compute sparse ternary dot product: (pp + nn) - (pn + np)
132///
133/// This is a reference implementation useful for testing optimized dot product implementations.
134///
135/// # Arguments
136/// * `a` - First sparse vector
137/// * `b` - Second sparse vector
138///
139/// # Returns
140/// Dot product as i32
141pub fn sparse_dot(a: &SparseVec, b: &SparseVec) -> i32 {
142    let pp = intersection_count_sorted(&a.pos, &b.pos) as i32;
143    let nn = intersection_count_sorted(&a.neg, &b.neg) as i32;
144    let pn = intersection_count_sorted(&a.pos, &b.neg) as i32;
145    let np = intersection_count_sorted(&a.neg, &b.pos) as i32;
146    (pp + nn) - (pn + np)
147}
148
/// Produce `size` bytes of reproducible pseudo-random noise.
///
/// A 64-bit LCG seeded with `seed` is stepped once per output byte
/// (`state = state * 6364136223846793005 + 1`, wrapping) and the top eight bits
/// of each new state become the next byte. Identical `(size, seed)` inputs
/// always give identical output.
pub fn generate_noise_pattern(size: usize, seed: u64) -> Vec<u8> {
    let mut lcg_state = seed;
    (0..size)
        .map(|_| {
            // Step first, then sample: the seed itself never appears in the output.
            lcg_state = lcg_state.wrapping_mul(6364136223846793005).wrapping_add(1);
            (lcg_state >> 56) as u8
        })
        .collect()
}
162
/// Build a `width * height` byte image containing a diagonal gradient.
///
/// Pixels are emitted row by row. The value at column `x`, row `y` is
/// `(x + y) * 255 / (width + height)` using integer division, so brightness
/// increases from the top-left corner towards the bottom-right corner.
/// Returns an empty vector when either dimension is zero.
pub fn generate_gradient_pattern(width: usize, height: usize) -> Vec<u8> {
    let mut pixels = Vec::with_capacity(width * height);
    for row in 0..height {
        // One scanline at a time; the divisor is only evaluated when a pixel exists,
        // so zero-sized images never divide by zero.
        pixels.extend((0..width).map(|col| (((col + row) * 255) / (width + height)) as u8));
    }
    pixels
}
175
/// Build a `size`-byte synthetic blob that loosely resembles an executable.
///
/// When `size >= 16` the blob starts with a 16-byte ELF-like header. Every
/// remaining byte is chosen by which 256-byte window its offset falls into,
/// cycling through four fill styles: `0x90` (NOP), the low byte of the offset
/// (sequential), `0x00` (zero fill) and `0xCC` (INT3).
pub fn generate_binary_blob(size: usize) -> Vec<u8> {
    // Magic, then class/endianness/version/ABI (64-bit, little endian, v1, SYSV),
    // then eight bytes of padding.
    const HEADER: [u8; 16] = [
        0x7f, b'E', b'L', b'F', 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ];

    let mut blob = Vec::with_capacity(size);
    if size >= HEADER.len() {
        blob.extend_from_slice(&HEADER);
    }

    // Fill from the end of the header (or from 0 when there is none) up to `size`.
    // The blob therefore ends at exactly `size` bytes and needs no trimming.
    let fill_start = blob.len();
    blob.extend((fill_start..size).map(|offset| match (offset / 256) % 4 {
        0 => 0x90,                  // NOP slide
        1 => (offset & 0xFF) as u8, // Sequential
        2 => 0x00,                  // Zero fill
        _ => 0xCC,                  // INT3
    }));
    blob
}
203
// Unit tests for the generators and reference helpers in this module.
#[cfg(test)]
mod tests {
    use super::*;
    use rand::SeedableRng;

    #[test]
    fn test_random_sparse_vec() {
        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
        let vec = random_sparse_vec(&mut rng, 10000, 200);
        let nnz = vec.pos.len() + vec.neg.len();
        assert_eq!(nnz, 200);

        // Check sorted
        assert!(vec.pos.windows(2).all(|w| w[0] < w[1]));
        assert!(vec.neg.windows(2).all(|w| w[0] < w[1]));

        // Check no overlap
        let pos_set: HashSet<_> = vec.pos.iter().collect();
        let neg_set: HashSet<_> = vec.neg.iter().collect();
        assert_eq!(pos_set.intersection(&neg_set).count(), 0);

        // Every index must fall inside the requested dimensionality
        assert!(vec.pos.iter().chain(vec.neg.iter()).all(|&i| i < 10000));
    }

    #[test]
    fn test_deterministic_sparse_vec() {
        let vec1 = deterministic_sparse_vec(10000, 200, 42);
        let vec2 = deterministic_sparse_vec(10000, 200, 42);
        assert_eq!(vec1.pos, vec2.pos);
        assert_eq!(vec1.neg, vec2.neg);

        // Different seed should give different result
        let vec3 = deterministic_sparse_vec(10000, 200, 43);
        assert_ne!(vec1.pos, vec3.pos);
    }

    #[test]
    fn test_deterministic_sparse_vec_odd_nnz() {
        // Odd nnz: the negative side receives the extra element
        let vec = deterministic_sparse_vec(1000, 7, 1);
        assert_eq!(vec.pos.len(), 3);
        assert_eq!(vec.neg.len(), 4);

        // Sorted, in range, and disjoint
        assert!(vec.pos.windows(2).all(|w| w[0] < w[1]));
        assert!(vec.neg.windows(2).all(|w| w[0] < w[1]));
        assert!(vec.pos.iter().chain(vec.neg.iter()).all(|&i| i < 1000));
        let pos_set: HashSet<_> = vec.pos.iter().collect();
        let neg_set: HashSet<_> = vec.neg.iter().collect();
        assert_eq!(pos_set.intersection(&neg_set).count(), 0);

        // Requesting nothing yields an empty vector
        let empty = deterministic_sparse_vec(1000, 0, 1);
        assert!(empty.pos.is_empty());
        assert!(empty.neg.is_empty());
    }

    #[test]
    fn test_intersection_count_sorted() {
        assert_eq!(intersection_count_sorted(&[], &[]), 0);
        assert_eq!(intersection_count_sorted(&[1, 2, 3], &[]), 0);
        assert_eq!(intersection_count_sorted(&[1, 3, 5, 7], &[3, 4, 5, 8]), 2);
        assert_eq!(intersection_count_sorted(&[1, 2, 3], &[4, 5, 6]), 0);
    }

    #[test]
    fn test_sparse_dot() {
        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
        let a = random_sparse_vec(&mut rng, 10000, 200);
        let b = random_sparse_vec(&mut rng, 10000, 200);

        let dot = sparse_dot(&a, &b);

        // Dot product should be symmetric
        let dot_rev = sparse_dot(&b, &a);
        assert_eq!(dot, dot_rev);

        // A vector dotted with itself counts every non-zero entry once
        assert_eq!(sparse_dot(&a, &a), 200);

        // Swapping pos/neg negates the vector, so the dot product flips sign
        let neg_a = SparseVec {
            pos: a.neg.clone(),
            neg: a.pos.clone(),
        };
        assert_eq!(sparse_dot(&a, &neg_a), -200);
    }

    #[test]
    fn test_generate_noise_pattern() {
        let data1 = generate_noise_pattern(1000, 42);
        let data2 = generate_noise_pattern(1000, 42);
        assert_eq!(data1, data2);
        assert_eq!(data1.len(), 1000);

        let data3 = generate_noise_pattern(1000, 43);
        assert_ne!(data1, data3);

        // Seed 0: first state is 1 (top byte 0x00), second is the multiplier + 1 (top byte 0x58)
        assert_eq!(generate_noise_pattern(2, 0), vec![0x00, 0x58]);
    }

    #[test]
    fn test_generate_gradient_pattern() {
        assert!(generate_gradient_pattern(0, 0).is_empty());
        assert!(generate_gradient_pattern(0, 4).is_empty());
        assert_eq!(generate_gradient_pattern(2, 2), vec![0, 63, 63, 127]);
        assert_eq!(generate_gradient_pattern(16, 8).len(), 128);
    }

    #[test]
    fn test_generate_binary_blob() {
        // Too small for a header: plain NOP fill
        assert!(generate_binary_blob(0).is_empty());
        assert_eq!(generate_binary_blob(15), vec![0x90; 15]);

        let blob = generate_binary_blob(1100);
        assert_eq!(blob.len(), 1100);
        assert_eq!(blob[..4].to_vec(), vec![0x7f, b'E', b'L', b'F']);
        assert!(blob[16..256].iter().all(|&b| b == 0x90));
        assert!(blob[256..512].iter().enumerate().all(|(i, &b)| b == i as u8));
        assert!(blob[512..768].iter().all(|&b| b == 0x00));
        assert!(blob[768..1024].iter().all(|&b| b == 0xCC));
    }
}