// strange_loop/nano_agent/optimization.rs

1//! SIMD optimizations and cache-aligned data structures for nano-agents
2
3use std::arch::x86_64::*;
4use std::mem;
5
/// Vector wrapper intended for SIMD operations.
///
/// NOTE(review): `#[repr(align(64))]` aligns the *struct* (the `Vec` header
/// plus the `capacity` field) to a cache line — it does NOT align the heap
/// buffer that `Vec<f32>` allocates, which is only guaranteed
/// `align_of::<f32>()` (4 bytes). SIMD code operating on `as_ptr()` must
/// therefore use unaligned loads/stores (`_mm256_loadu_ps` / `_mm256_storeu_ps`).
#[repr(align(64))] // Align to cache line (64 bytes)
pub struct AlignedVector {
    // Element storage; length is rounded up to a multiple of 16 at construction.
    data: Vec<f32>,
    // Mirror of `data.len()` after rounding; kept for clarity.
    capacity: usize,
}
12
13impl AlignedVector {
14    /// Create new cache-aligned vector with specified capacity
15    pub fn new(capacity: usize) -> Self {
16        let aligned_capacity = (capacity + 15) & !15; // Round up to multiple of 16
17        let mut data = Vec::with_capacity(aligned_capacity);
18        data.resize(aligned_capacity, 0.0);
19
20        Self {
21            data,
22            capacity: aligned_capacity,
23        }
24    }
25
26    /// Get raw pointer for SIMD operations
27    pub fn as_ptr(&self) -> *const f32 {
28        self.data.as_ptr()
29    }
30
31    /// Get mutable raw pointer for SIMD operations
32    pub fn as_mut_ptr(&mut self) -> *mut f32 {
33        self.data.as_mut_ptr()
34    }
35
36    /// Get length of vector
37    pub fn len(&self) -> usize {
38        self.data.len()
39    }
40
41    /// Check if vector is empty
42    pub fn is_empty(&self) -> bool {
43        self.data.is_empty()
44    }
45
46    /// SIMD-accelerated vector addition
47    #[target_feature(enable = "avx2")]
48    pub unsafe fn simd_add(&mut self, other: &AlignedVector) -> Result<(), &'static str> {
49        if self.len() != other.len() {
50            return Err("Vector lengths must match");
51        }
52
53        let len = self.len();
54        let chunks = len / 8; // Process 8 f32s at a time with AVX2
55
56        let self_ptr = self.as_mut_ptr();
57        let other_ptr = other.as_ptr();
58
59        // Process chunks of 8 elements
60        for i in 0..chunks {
61            let offset = i * 8;
62
63            // Load 8 f32 values from each vector
64            let a = _mm256_load_ps(self_ptr.add(offset));
65            let b = _mm256_load_ps(other_ptr.add(offset));
66
67            // Perform SIMD addition
68            let result = _mm256_add_ps(a, b);
69
70            // Store result back
71            _mm256_store_ps(self_ptr.add(offset), result);
72        }
73
74        // Handle remaining elements
75        for i in (chunks * 8)..len {
76            *self_ptr.add(i) += *other_ptr.add(i);
77        }
78
79        Ok(())
80    }
81
82    /// SIMD-accelerated dot product
83    #[target_feature(enable = "avx2")]
84    pub unsafe fn simd_dot(&self, other: &AlignedVector) -> Result<f32, &'static str> {
85        if self.len() != other.len() {
86            return Err("Vector lengths must match");
87        }
88
89        let len = self.len();
90        let chunks = len / 8;
91
92        let self_ptr = self.as_ptr();
93        let other_ptr = other.as_ptr();
94
95        // Accumulator for sum
96        let mut sum_vec = _mm256_setzero_ps();
97
98        // Process chunks of 8 elements
99        for i in 0..chunks {
100            let offset = i * 8;
101
102            let a = _mm256_load_ps(self_ptr.add(offset));
103            let b = _mm256_load_ps(other_ptr.add(offset));
104
105            // Multiply and accumulate
106            let product = _mm256_mul_ps(a, b);
107            sum_vec = _mm256_add_ps(sum_vec, product);
108        }
109
110        // Horizontal sum of the accumulated vector
111        let mut result_array = [0.0f32; 8];
112        _mm256_store_ps(result_array.as_mut_ptr(), sum_vec);
113        let mut dot_product: f32 = result_array.iter().sum();
114
115        // Handle remaining elements
116        for i in (chunks * 8)..len {
117            dot_product += *self_ptr.add(i) * *other_ptr.add(i);
118        }
119
120        Ok(dot_product)
121    }
122
123    /// SIMD-accelerated vector scaling
124    #[target_feature(enable = "avx2")]
125    pub unsafe fn simd_scale(&mut self, scalar: f32) {
126        let len = self.len();
127        let chunks = len / 8;
128
129        let self_ptr = self.as_mut_ptr();
130        let scalar_vec = _mm256_set1_ps(scalar); // Broadcast scalar to all elements
131
132        // Process chunks of 8 elements
133        for i in 0..chunks {
134            let offset = i * 8;
135
136            let a = _mm256_load_ps(self_ptr.add(offset));
137            let result = _mm256_mul_ps(a, scalar_vec);
138            _mm256_store_ps(self_ptr.add(offset), result);
139        }
140
141        // Handle remaining elements
142        for i in (chunks * 8)..len {
143            *self_ptr.add(i) *= scalar;
144        }
145    }
146}
147
148/// Cache-optimized agent state structure
149#[repr(align(64))]
150pub struct AgentState {
151    // Hot data (frequently accessed) - first cache line
152    pub position: [f32; 3],
153    pub velocity: [f32; 3],
154    pub acceleration: [f32; 3],
155    pub energy: f32,
156    pub active: bool,
157    _padding1: [u8; 31], // Pad to cache line boundary
158
159    // Warm data - second cache line
160    pub parameters: AlignedVector,
161    pub last_update_ns: u128,
162    pub performance_score: f32,
163    _padding2: [u8; 36],
164
165    // Cold data - third cache line
166    pub debug_info: String,
167    pub creation_time: std::time::Instant,
168}
169
170impl AgentState {
171    pub fn new(param_count: usize) -> Self {
172        Self {
173            position: [0.0; 3],
174            velocity: [0.0; 3],
175            acceleration: [0.0; 3],
176            energy: 1.0,
177            active: true,
178            _padding1: [0; 31],
179            parameters: AlignedVector::new(param_count),
180            last_update_ns: 0,
181            performance_score: 0.0,
182            _padding2: [0; 36],
183            debug_info: String::new(),
184            creation_time: std::time::Instant::now(),
185        }
186    }
187
188    /// SIMD-optimized state update
189    pub fn simd_update(&mut self, dt: f32) {
190        unsafe {
191            // Update position using SIMD for 3D vector operations
192            let pos_ptr = self.position.as_mut_ptr();
193            let vel_ptr = self.velocity.as_ptr();
194
195            // Load position and velocity vectors (pad to 4 elements for SIMD)
196            let mut pos_padded = [0.0f32; 4];
197            let mut vel_padded = [0.0f32; 4];
198
199            pos_padded[..3].copy_from_slice(&self.position);
200            vel_padded[..3].copy_from_slice(&self.velocity);
201
202            let pos_vec = _mm_load_ps(pos_padded.as_ptr());
203            let vel_vec = _mm_load_ps(vel_padded.as_ptr());
204            let dt_vec = _mm_set1_ps(dt);
205
206            // position += velocity * dt
207            let vel_scaled = _mm_mul_ps(vel_vec, dt_vec);
208            let new_pos = _mm_add_ps(pos_vec, vel_scaled);
209
210            // Store result back (only first 3 elements)
211            _mm_store_ps(pos_padded.as_mut_ptr(), new_pos);
212            self.position.copy_from_slice(&pos_padded[..3]);
213        }
214    }
215}
216
/// SIMD-optimized batch operations for multiple agents.
///
/// Structure-of-arrays layout: each buffer stores 3 consecutive f32
/// components (x, y, z) per agent, which keeps the SIMD loops contiguous.
pub struct BatchProcessor {
    // x/y/z triples for every agent, packed contiguously.
    positions: AlignedVector,
    velocities: AlignedVector,
    accelerations: AlignedVector,
    // Number of agents participating in batch updates.
    agent_count: usize,
}
224
225impl BatchProcessor {
226    pub fn new(max_agents: usize) -> Self {
227        Self {
228            positions: AlignedVector::new(max_agents * 3),
229            velocities: AlignedVector::new(max_agents * 3),
230            accelerations: AlignedVector::new(max_agents * 3),
231            agent_count: 0,
232        }
233    }
234
235    /// Batch update all agent positions using SIMD
236    #[target_feature(enable = "avx2")]
237    pub unsafe fn batch_update_positions(&mut self, dt: f32) {
238        // positions += velocities * dt + 0.5 * accelerations * dt^2
239
240        let len = self.agent_count * 3;
241        let chunks = len / 8;
242
243        let pos_ptr = self.positions.as_mut_ptr();
244        let vel_ptr = self.velocities.as_ptr();
245        let acc_ptr = self.accelerations.as_ptr();
246
247        let dt_vec = _mm256_set1_ps(dt);
248        let dt2_vec = _mm256_set1_ps(dt * dt * 0.5);
249
250        for i in 0..chunks {
251            let offset = i * 8;
252
253            let pos = _mm256_load_ps(pos_ptr.add(offset));
254            let vel = _mm256_load_ps(vel_ptr.add(offset));
255            let acc = _mm256_load_ps(acc_ptr.add(offset));
256
257            // velocity * dt
258            let vel_term = _mm256_mul_ps(vel, dt_vec);
259
260            // 0.5 * acceleration * dt^2
261            let acc_term = _mm256_mul_ps(acc, dt2_vec);
262
263            // position + vel_term + acc_term
264            let result = _mm256_add_ps(pos, _mm256_add_ps(vel_term, acc_term));
265
266            _mm256_store_ps(pos_ptr.add(offset), result);
267        }
268
269        // Handle remaining elements
270        for i in (chunks * 8)..len {
271            *pos_ptr.add(i) += *vel_ptr.add(i) * dt + 0.5 * *acc_ptr.add(i) * dt * dt;
272        }
273    }
274
275    /// Calculate forces between agents using SIMD
276    #[target_feature(enable = "avx2")]
277    pub unsafe fn calculate_forces(&mut self) -> AlignedVector {
278        let mut forces = AlignedVector::new(self.agent_count * 3);
279
280        // Simplified force calculation (normally would be N^2 complexity)
281        // This is a placeholder for actual force computation
282
283        forces
284    }
285}
286
/// Memory pool for zero-allocation agent operations.
///
/// All `AgentState`s are constructed up front; allocation hands out slot
/// indices from a free list instead of touching the heap.
pub struct AgentMemoryPool {
    // Pre-constructed slots; index identity is stable for the pool's lifetime.
    states: Vec<AgentState>,
    // Indices of slots currently available for allocation.
    free_indices: Vec<usize>,
    // Total number of slots (fixed at construction).
    capacity: usize,
}
293
294impl AgentMemoryPool {
295    pub fn new(capacity: usize) -> Self {
296        let mut states = Vec::with_capacity(capacity);
297        let mut free_indices = Vec::with_capacity(capacity);
298
299        for i in 0..capacity {
300            states.push(AgentState::new(16)); // 16 parameters per agent
301            free_indices.push(i);
302        }
303
304        Self {
305            states,
306            free_indices,
307            capacity,
308        }
309    }
310
311    pub fn allocate_agent(&mut self) -> Option<usize> {
312        self.free_indices.pop()
313    }
314
315    pub fn deallocate_agent(&mut self, index: usize) {
316        if index < self.capacity {
317            self.free_indices.push(index);
318        }
319    }
320
321    pub fn get_state(&self, index: usize) -> Option<&AgentState> {
322        self.states.get(index)
323    }
324
325    pub fn get_state_mut(&mut self, index: usize) -> Option<&mut AgentState> {
326        self.states.get_mut(index)
327    }
328}
329
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_aligned_vector_creation() {
        let vec = AlignedVector::new(100);
        assert_eq!(vec.len(), 112); // Rounded up to multiple of 16
        // The struct is cache-line aligned; the heap buffer is NOT guaranteed
        // to be (Vec<f32> only guarantees 4-byte alignment), so we deliberately
        // do not assert 64-byte alignment of as_ptr() here — that assertion
        // failed spuriously depending on the allocator.
        assert_eq!(mem::align_of::<AlignedVector>(), 64);
        assert_eq!(vec.as_ptr() as usize % mem::align_of::<f32>(), 0);
    }

    #[test]
    fn test_simd_operations() {
        // The SIMD methods are compiled with #[target_feature(enable = "avx2")];
        // calling them on a CPU without AVX2 is undefined behavior, so skip.
        if !is_x86_feature_detected!("avx2") {
            return;
        }

        let mut a = AlignedVector::new(16);
        let mut b = AlignedVector::new(16);

        // a[i] = i, b[i] = 2i
        for i in 0..16 {
            unsafe {
                *a.as_mut_ptr().add(i) = i as f32;
                *b.as_mut_ptr().add(i) = (i * 2) as f32;
            }
        }

        unsafe {
            // After addition: a[i] = 3i
            a.simd_add(&b).unwrap();
            assert_eq!(*a.as_ptr().add(5), 15.0);

            // dot = sum(3i * 2i) = 6 * sum(i^2), i in 0..16 = 6 * 1240 = 7440
            // (all integers well below 2^24, so f32 arithmetic is exact).
            let dot = a.simd_dot(&b).unwrap();
            assert_eq!(dot, 7440.0);

            // After scaling: a[i] = 6i
            a.simd_scale(2.0);
            assert_eq!(*a.as_ptr().add(3), 18.0);
        }
    }

    #[test]
    fn test_agent_state_alignment() {
        let state = AgentState::new(16);
        let ptr = &state as *const AgentState as usize;
        assert_eq!(ptr % 64, 0); // Guaranteed by #[repr(align(64))] on a stack value
    }

    #[test]
    fn test_memory_pool() {
        let mut pool = AgentMemoryPool::new(10);

        let agent1 = pool.allocate_agent().unwrap();
        let agent2 = pool.allocate_agent().unwrap();

        assert_ne!(agent1, agent2);

        pool.deallocate_agent(agent1);
        let agent3 = pool.allocate_agent().unwrap();
        assert_eq!(agent1, agent3); // Freed index is reused
    }
}