// nt_features/embeddings.rs

use seahash::hash;
8pub fn hash_embed(data: &[u8], dimension: usize) -> Vec<f32> {
10 let mut embedding = Vec::with_capacity(dimension);
11
12 for i in 0..dimension {
13 let mut combined = Vec::with_capacity(8 + data.len());
15 combined.extend_from_slice(&(i as u64).to_le_bytes());
16 combined.extend_from_slice(data);
17
18 let hash_value = hash(&combined);
19
20 let normalized = (hash_value as f64 / u64::MAX as f64) * 2.0 - 1.0;
22 embedding.push(normalized as f32);
23 }
24
25 embedding
26}
27
/// Builds fixed-length embeddings for trading records by hashing their fields.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmbeddingGenerator {
    /// Number of components in every embedding this generator produces.
    dimension: usize,
}
31
32impl EmbeddingGenerator {
33 pub fn new(dimension: usize) -> Self {
34 Self { dimension }
35 }
36
37 pub fn embed_observation(
39 &self,
40 symbol: &str,
41 timestamp_us: i64,
42 price: f64,
43 volume: f64,
44 spread: f64,
45 ) -> Vec<f32> {
46 let mut data = Vec::new();
47 data.extend_from_slice(×tamp_us.to_le_bytes());
48 data.extend_from_slice(symbol.as_bytes());
49 data.extend_from_slice(&price.to_le_bytes());
50 data.extend_from_slice(&volume.to_le_bytes());
51 data.extend_from_slice(&spread.to_le_bytes());
52
53 hash_embed(&data, self.dimension)
54 }
55
56 pub fn embed_signal(
58 &self,
59 strategy_id: &str,
60 symbol: &str,
61 timestamp_us: i64,
62 direction: u8,
63 confidence: f64,
64 features: &[f64],
65 ) -> Vec<f32> {
66 let mut data = Vec::new();
67 data.extend_from_slice(×tamp_us.to_le_bytes());
68 data.extend_from_slice(strategy_id.as_bytes());
69 data.extend_from_slice(symbol.as_bytes());
70 data.push(direction);
71 data.extend_from_slice(&confidence.to_le_bytes());
72
73 for &feature in features {
75 data.extend_from_slice(&feature.to_le_bytes());
76 }
77
78 hash_embed(&data, self.dimension)
79 }
80
81 pub fn embed_order(
83 &self,
84 signal_id: &[u8],
85 symbol: &str,
86 side: u8,
87 order_type: u8,
88 quantity: u32,
89 limit_price: Option<f64>,
90 ) -> Vec<f32> {
91 let mut data = Vec::new();
92 data.extend_from_slice(signal_id);
93 data.extend_from_slice(symbol.as_bytes());
94 data.push(side);
95 data.push(order_type);
96 data.extend_from_slice(&quantity.to_le_bytes());
97
98 if let Some(price) = limit_price {
99 data.extend_from_slice(&price.to_le_bytes());
100 }
101
102 hash_embed(&data, self.dimension)
103 }
104
105 pub fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
107 if a.len() != b.len() {
108 return 0.0;
109 }
110
111 let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
112 let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
113 let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
114
115 if norm_a == 0.0 || norm_b == 0.0 {
116 return 0.0;
117 }
118
119 dot / (norm_a * norm_b)
120 }
121
122 pub fn euclidean_distance(&self, a: &[f32], b: &[f32]) -> f32 {
124 if a.len() != b.len() {
125 return f32::MAX;
126 }
127
128 a.iter()
129 .zip(b.iter())
130 .map(|(x, y)| (x - y).powi(2))
131 .sum::<f32>()
132 .sqrt()
133 }
134}
135
#[cfg(test)]
mod tests {
    use super::*;

    /// Hashing the same bytes twice yields identical vectors of the requested length.
    #[test]
    fn test_deterministic_embedding() {
        let data = b"AAPL,150.0,1000";

        let embed1 = hash_embed(data, 512);
        let embed2 = hash_embed(data, 512);

        assert_eq!(embed1, embed2);
        assert_eq!(embed1.len(), 512);
    }

    /// Inputs that differ in a single character produce different vectors.
    #[test]
    fn test_embedding_uniqueness() {
        let data1 = b"AAPL,150.0,1000";
        let data2 = b"AAPL,150.1,1000";

        let embed1 = hash_embed(data1, 512);
        let embed2 = hash_embed(data2, 512);

        assert_ne!(embed1, embed2);
    }

    /// Every component lies within `[-1.0, 1.0]`.
    #[test]
    fn test_embedding_range() {
        let data = b"test_data";
        let embedding = hash_embed(data, 256);

        for value in &embedding {
            assert!((-1.0..=1.0).contains(value));
        }
    }

    /// Identical observations score ~1.0; a different observation scores below 0.99.
    #[test]
    fn test_cosine_similarity() {
        let generator = EmbeddingGenerator::new(128);

        let embed1 = generator.embed_observation("AAPL", 1000000, 150.0, 1000.0, 0.01);
        let embed2 = generator.embed_observation("AAPL", 1000000, 150.0, 1000.0, 0.01);
        let embed3 = generator.embed_observation("MSFT", 1000000, 300.0, 2000.0, 0.02);

        let sim1 = generator.cosine_similarity(&embed1, &embed2);
        assert!((sim1 - 1.0).abs() < 0.0001);

        let sim2 = generator.cosine_similarity(&embed1, &embed3);
        assert!(sim2 < 0.99);
    }

    /// Identical observations are ~0 apart; a different observation is farther than 0.1.
    #[test]
    fn test_euclidean_distance() {
        let generator = EmbeddingGenerator::new(128);

        let embed1 = generator.embed_observation("AAPL", 1000000, 150.0, 1000.0, 0.01);
        let embed2 = generator.embed_observation("AAPL", 1000000, 150.0, 1000.0, 0.01);
        let embed3 = generator.embed_observation("MSFT", 1000000, 300.0, 2000.0, 0.02);

        let dist1 = generator.euclidean_distance(&embed1, &embed2);
        assert!(dist1 < 0.0001);

        let dist2 = generator.euclidean_distance(&embed1, &embed3);
        assert!(dist2 > 0.1);
    }

    /// A signal embedding has exactly the generator's dimension.
    #[test]
    fn test_embed_signal() {
        let generator = EmbeddingGenerator::new(768);

        let features = vec![0.5, 0.3, 0.8, 0.2];
        let embedding = generator.embed_signal(
            "momentum_v1",
            "AAPL",
            1000000,
            1,
            0.85,
            &features,
        );

        assert_eq!(embedding.len(), 768);
    }
}