nt_features/
embeddings.rs

// Deterministic hash-based embeddings
//
// Performance target: <100μs per embedding generation
// Uses SeaHash for fast, deterministic hashing
5
6use seahash::hash;
7
8/// Generate deterministic hash-based embedding
9pub fn hash_embed(data: &[u8], dimension: usize) -> Vec<f32> {
10    let mut embedding = Vec::with_capacity(dimension);
11
12    for i in 0..dimension {
13        // Combine dimension index with data for unique hash per dimension
14        let mut combined = Vec::with_capacity(8 + data.len());
15        combined.extend_from_slice(&(i as u64).to_le_bytes());
16        combined.extend_from_slice(data);
17
18        let hash_value = hash(&combined);
19
20        // Convert to [-1.0, 1.0] range
21        let normalized = (hash_value as f64 / u64::MAX as f64) * 2.0 - 1.0;
22        embedding.push(normalized as f32);
23    }
24
25    embedding
26}
27
28pub struct EmbeddingGenerator {
29    dimension: usize,
30}
31
32impl EmbeddingGenerator {
33    pub fn new(dimension: usize) -> Self {
34        Self { dimension }
35    }
36
37    /// Generate embedding from structured data
38    pub fn embed_observation(
39        &self,
40        symbol: &str,
41        timestamp_us: i64,
42        price: f64,
43        volume: f64,
44        spread: f64,
45    ) -> Vec<f32> {
46        let mut data = Vec::new();
47        data.extend_from_slice(&timestamp_us.to_le_bytes());
48        data.extend_from_slice(symbol.as_bytes());
49        data.extend_from_slice(&price.to_le_bytes());
50        data.extend_from_slice(&volume.to_le_bytes());
51        data.extend_from_slice(&spread.to_le_bytes());
52
53        hash_embed(&data, self.dimension)
54    }
55
56    /// Generate embedding for trading signal
57    pub fn embed_signal(
58        &self,
59        strategy_id: &str,
60        symbol: &str,
61        timestamp_us: i64,
62        direction: u8,
63        confidence: f64,
64        features: &[f64],
65    ) -> Vec<f32> {
66        let mut data = Vec::new();
67        data.extend_from_slice(&timestamp_us.to_le_bytes());
68        data.extend_from_slice(strategy_id.as_bytes());
69        data.extend_from_slice(symbol.as_bytes());
70        data.push(direction);
71        data.extend_from_slice(&confidence.to_le_bytes());
72
73        // Include feature vector
74        for &feature in features {
75            data.extend_from_slice(&feature.to_le_bytes());
76        }
77
78        hash_embed(&data, self.dimension)
79    }
80
81    /// Generate embedding for order
82    pub fn embed_order(
83        &self,
84        signal_id: &[u8],
85        symbol: &str,
86        side: u8,
87        order_type: u8,
88        quantity: u32,
89        limit_price: Option<f64>,
90    ) -> Vec<f32> {
91        let mut data = Vec::new();
92        data.extend_from_slice(signal_id);
93        data.extend_from_slice(symbol.as_bytes());
94        data.push(side);
95        data.push(order_type);
96        data.extend_from_slice(&quantity.to_le_bytes());
97
98        if let Some(price) = limit_price {
99            data.extend_from_slice(&price.to_le_bytes());
100        }
101
102        hash_embed(&data, self.dimension)
103    }
104
105    /// Calculate cosine similarity between embeddings
106    pub fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
107        if a.len() != b.len() {
108            return 0.0;
109        }
110
111        let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
112        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
113        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
114
115        if norm_a == 0.0 || norm_b == 0.0 {
116            return 0.0;
117        }
118
119        dot / (norm_a * norm_b)
120    }
121
122    /// Calculate Euclidean distance
123    pub fn euclidean_distance(&self, a: &[f32], b: &[f32]) -> f32 {
124        if a.len() != b.len() {
125            return f32::MAX;
126        }
127
128        a.iter()
129            .zip(b.iter())
130            .map(|(x, y)| (x - y).powi(2))
131            .sum::<f32>()
132            .sqrt()
133    }
134}
135
#[cfg(test)]
mod tests {
    use super::*;

    /// Hashing the same bytes twice yields the same vector of the requested length.
    #[test]
    fn test_deterministic_embedding() {
        let payload = b"AAPL,150.0,1000";

        let first = hash_embed(payload, 512);
        let second = hash_embed(payload, 512);

        // Should be identical
        assert_eq!(first, second);
        assert_eq!(first.len(), 512);
    }

    /// Inputs that differ in a single character produce different vectors.
    #[test]
    fn test_embedding_uniqueness() {
        let lhs = hash_embed(b"AAPL,150.0,1000", 512);
        let rhs = hash_embed(b"AAPL,150.1,1000", 512);

        // Should be different
        assert_ne!(lhs, rhs);
    }

    /// Every component lies in the closed interval [-1.0, 1.0].
    #[test]
    fn test_embedding_range() {
        let components = hash_embed(b"test_data", 256);

        for component in &components {
            assert!((-1.0_f32..=1.0).contains(component));
        }
    }

    /// Cosine similarity is ~1.0 for equal observations and lower for different ones.
    #[test]
    fn test_cosine_similarity() {
        let generator = EmbeddingGenerator::new(128);

        let aapl = generator.embed_observation("AAPL", 1000000, 150.0, 1000.0, 0.01);
        let aapl_again = generator.embed_observation("AAPL", 1000000, 150.0, 1000.0, 0.01);
        let msft = generator.embed_observation("MSFT", 1000000, 300.0, 2000.0, 0.02);

        // Identical embeddings should have similarity 1.0
        let same = generator.cosine_similarity(&aapl, &aapl_again);
        assert!((same - 1.0).abs() < 0.0001);

        // Different embeddings should have similarity < 1.0
        let different = generator.cosine_similarity(&aapl, &msft);
        assert!(different < 0.99);
    }

    /// Euclidean distance is ~0 for equal observations and positive for different ones.
    #[test]
    fn test_euclidean_distance() {
        let generator = EmbeddingGenerator::new(128);

        let aapl = generator.embed_observation("AAPL", 1000000, 150.0, 1000.0, 0.01);
        let aapl_again = generator.embed_observation("AAPL", 1000000, 150.0, 1000.0, 0.01);
        let msft = generator.embed_observation("MSFT", 1000000, 300.0, 2000.0, 0.02);

        // Identical embeddings should have distance 0
        let same = generator.euclidean_distance(&aapl, &aapl_again);
        assert!(same < 0.0001);

        // Different embeddings should have distance > 0
        let different = generator.euclidean_distance(&aapl, &msft);
        assert!(different > 0.1);
    }

    /// A signal embedding has exactly the generator's configured dimension.
    #[test]
    fn test_embed_signal() {
        let generator = EmbeddingGenerator::new(768);
        let features = [0.5, 0.3, 0.8, 0.2];

        let direction_long = 1;
        let embedding = generator.embed_signal(
            "momentum_v1",
            "AAPL",
            1000000,
            direction_long,
            0.85,
            &features,
        );

        assert_eq!(embedding.len(), 768);
    }
}