embeddenator_testkit/lib.rs
1//! # Embeddenator TestKit
2//!
3//! Comprehensive testing utilities for embeddenator VSA operations, performance benchmarking,
4//! and large-scale data validation.
5//!
6//! ## Performance Optimization Insights (v0.20.0-alpha.1)
7//!
8//! Based on extensive benchmarking across scales from 250MB to 20GB+:
9//!
10//! ### Current Optimizations (bt-phase-2 + SIMD)
11//! - **Packed ternary operations**: 10-20x speedup for dense vectors
12//! - **SIMD cosine similarity**: Platform-specific acceleration (AVX2/NEON)
13//! - **Thread-local scratch buffers**: Eliminates allocation overhead
14//! - **Hybrid bundling**: Adaptive selection between pairwise/sum-many modes
15//!
16//! ### Performance Baselines (Intel i7-14700K, 46GB RAM)
17//! - **Bundle (pairwise)**: ~43ns (sparse), ~32µs (dense packed)
18//! - **Bind**: ~11ns (sparse), ~20µs (dense packed)
19//! - **Cosine**: ~7ns (sparse), ~14µs (dense packed)
20//! - **Ingestion**: ~15 MB/s (2GB dataset), scales linearly
21//! - **Extraction**: ~41 MB/s (2GB dataset), bit-perfect reconstruction
22//!
23//! ### Memory Scaling
24//! - **Storage overhead**: 2.8x (engram size vs input)
25//! - **Peak memory**: Bounded by hierarchical chunking
26//! - **Large datasets**: 20GB+ supported with linear scaling
27//!
28//! ## Future Optimizations (Planned)
29//! - **GPU acceleration**: CUDA/OpenCL backends for VSA operations
30//! - **CPU-GPU coprocessing**: Hybrid execution models
31//! - **Memory-mapped I/O**: For datasets > RAM capacity
32//! - **Distributed processing**: Multi-node VSA operations
33//!
34//! ## Testing Infrastructure
35//!
36//! ### Benchmark Categories
37//! - **Micro-benchmarks**: Individual VSA operations (ns scale)
38//! - **Macro-benchmarks**: End-to-end workflows (ms-seconds scale)
39//! - **Scale benchmarks**: 20GB-40GB dataset validation
40//! - **Stress tests**: Memory pressure, concurrent operations
41//!
42//! ### Dataset Generation
43//! - **Synthetic data**: Controlled patterns for reproducible testing
44//! - **Realistic data**: Varied file types, sizes, and content patterns
45//! - **Scale patterns**: Linear growth from KB to TB scales
46//!
47//! ## Usage Examples
48//!
49//! ```rust,ignore
50//! use embeddenator_testkit::*;
51//!
52//! // Generate random sparse vectors for testing
53//! let mut rng = rand::thread_rng();
54//! let vec = generators::random_sparse_vec(&mut rng, 10000, 200);
55//!
56//! // Create test datasets
57//! let harness = TestHarness::new();
58//! let dataset = harness.create_dataset(100); // 100MB
59//!
60//! // Run performance validation
61//! let mut metrics = TestMetrics::new("bind_operation");
62//! metrics.start_timing();
63//! let result = vec.bind(&vec);
64//! metrics.stop_timing();
65//! println!("{}", metrics.summary());
66//! ```
67
68pub mod chaos;
69pub mod fixtures;
70pub mod generators;
71pub mod harness;
72pub mod integrity;
73pub mod metrics;
74
75// Re-export commonly used items
76pub use chaos::ChaosInjector;
77pub use fixtures::{create_test_data, create_test_dataset, TestDataPattern};
78pub use generators::{
79 deterministic_sparse_vec, mk_random_sparsevec, random_sparse_vec, sparse_dot,
80};
81pub use harness::TestHarness;
82pub use integrity::{IntegrityReport, IntegrityValidator};
83pub use metrics::{AccuracyMetrics, TestMetrics, TimingStats, VsaEvaluationMetrics};
84
85// Re-export VSA types for integration tests
86pub use embeddenator_vsa::{SparseVec, SparsityScaling, VsaConfig, VsaConfigSchema, DIM};
87
88// Real-world dataset management
89#[cfg(feature = "realworld-datasets")]
90pub mod datasets;
91
92#[cfg(feature = "realworld-datasets")]
93pub use datasets::{DatasetCatalog, DatasetCategory, DatasetManager, DatasetTier};
94
/// Minimal smoke check for the testkit crate.
///
/// This function performs no work and inspects no state: it returns `true`
/// unconditionally. Calling it only demonstrates that the crate is linked
/// and its public API is reachable.
pub fn testkit_smoke() -> bool {
    true
}
99
#[cfg(test)]
mod tests {
    // Only the smoke helper is exercised here, so import it explicitly.
    use super::testkit_smoke;

    /// The smoke helper must report success.
    #[test]
    fn test_smoke() {
        let passed = testkit_smoke();
        assert!(passed);
    }
}