scirs2_neural/performance/memory.rs

use crate::error::{NeuralError, Result};
use ndarray::{Array, ArrayD, ArrayView, IxDyn};
use std::fmt::Debug;

/// Processes large tensors in chunks along the batch axis (axis 0) so that
/// peak memory stays within a configurable budget.
#[cfg(feature = "memory_efficient")]
pub struct MemoryEfficientProcessor {
    chunk_size: usize,
    max_memory_mb: usize,
}

#[cfg(feature = "memory_efficient")]
impl MemoryEfficientProcessor {
    /// Creates a processor; defaults are 1024 samples per chunk and a
    /// 512 MB memory budget.
    pub fn new(chunk_size: Option<usize>, max_memory_mb: Option<usize>) -> Self {
        Self {
            chunk_size: chunk_size.unwrap_or(1024),
            max_memory_mb: max_memory_mb.unwrap_or(512),
        }
    }

    /// Applies `processor` to the input in chunks of at most `chunk_size`
    /// samples along axis 0, then concatenates the per-chunk results.
    pub fn process_in_chunks<F, T>(
        &self,
        input: &ArrayD<f32>,
        mut processor: F,
    ) -> Result<ArrayD<T>>
    where
        F: FnMut(&ArrayView<f32, IxDyn>) -> Result<ArrayD<T>>,
        T: Clone + Debug + Default,
    {
        let batch_size = input.shape()[0];

        // Small batches are processed in a single call.
        if batch_size <= self.chunk_size {
            return processor(&input.view());
        }

        let mut results = Vec::new();
        let mut start_idx = 0;

        while start_idx < batch_size {
            let end_idx = (start_idx + self.chunk_size).min(batch_size);
            // Slice along axis 0 only, so inputs of any rank are supported
            // (`s![a..b, ..]` would fix the rank at 2).
            let chunk =
                input.slice_axis(ndarray::Axis(0), ndarray::Slice::from(start_idx..end_idx));

            let result = processor(&chunk)?;
            results.push(result);

            start_idx = end_idx;
        }

        if results.is_empty() {
            return Err(NeuralError::ComputationError(
                "No chunks were processed".to_string(),
            ));
        }

        self.concatenate_results(results)
    }

    /// Runs a forward pass under the processor's memory settings.
    ///
    /// Currently this delegates to `forward_fn` on the full input; callers
    /// that need the batch split should use [`Self::process_in_chunks`].
    pub fn memory_efficient_forward<F>(
        &self,
        input: &ArrayD<f32>,
        forward_fn: F,
    ) -> Result<ArrayD<f32>>
    where
        F: Fn(&ArrayView<f32, IxDyn>) -> Result<ArrayD<f32>>,
    {
        forward_fn(&input.view())
    }

    /// Computes gradients chunk-by-chunk along the batch axis and
    /// concatenates the results. Input and target must have the same shape.
    pub fn memory_efficient_gradient<F>(
        &self,
        input: &ArrayD<f32>,
        target: &ArrayD<f32>,
        gradient_fn: F,
    ) -> Result<ArrayD<f32>>
    where
        F: Fn(&ArrayView<f32, IxDyn>, &ArrayView<f32, IxDyn>) -> Result<ArrayD<f32>>,
    {
        if input.shape() != target.shape() {
            return Err(NeuralError::ComputationError(
                "Input and target must have the same shape for gradient computation".to_string(),
            ));
        }

        let batch_size = input.shape()[0];
        if batch_size <= self.chunk_size {
            return gradient_fn(&input.view(), &target.view());
        }

        let mut gradients = Vec::new();
        let mut start_idx = 0;

        while start_idx < batch_size {
            let end_idx = (start_idx + self.chunk_size).min(batch_size);
            // Slice both tensors along axis 0 only, preserving their rank.
            let input_chunk =
                input.slice_axis(ndarray::Axis(0), ndarray::Slice::from(start_idx..end_idx));
            let target_chunk =
                target.slice_axis(ndarray::Axis(0), ndarray::Slice::from(start_idx..end_idx));

            let gradient = gradient_fn(&input_chunk, &target_chunk)?;
            gradients.push(gradient);

            start_idx = end_idx;
        }

        self.concatenate_results(gradients)
    }

    /// Estimates a chunk size (in samples) that fits the memory budget,
    /// clamped to `[1, chunk_size]`.
    pub fn calculate_optimal_chunk_size(
        &self,
        tensor_shape: &[usize],
        element_size: usize,
    ) -> usize {
        // Elements per sample: everything except the batch axis.
        let elements_per_sample: usize = tensor_shape.iter().skip(1).product();
        // Avoid a zero divisor for degenerate shapes or element sizes.
        let bytes_per_sample = (elements_per_sample * element_size).max(1);

        // Budget roughly a third of the limit per chunk, leaving headroom
        // for intermediates and outputs.
        let available_bytes = (self.max_memory_mb * 1024 * 1024) / 3;

        let optimal_chunk = available_bytes / bytes_per_sample;
        optimal_chunk.max(1).min(self.chunk_size)
    }

    /// Returns the size in bytes of a tensor with the given shape.
    pub fn estimate_memory_usage(&self, shape: &[usize], element_size: usize) -> usize {
        let total_elements: usize = shape.iter().product();
        total_elements * element_size
    }

    /// Returns true if a tensor of the given shape fits within the budget.
    pub fn fits_in_memory(&self, shape: &[usize], element_size: usize) -> bool {
        let memory_usage = self.estimate_memory_usage(shape, element_size);
        let max_bytes = self.max_memory_mb * 1024 * 1024;
        memory_usage <= max_bytes
    }

    /// Concatenates per-chunk results back into a single tensor along the
    /// batch axis (axis 0).
    fn concatenate_results<T>(&self, results: Vec<ArrayD<T>>) -> Result<ArrayD<T>>
    where
        T: Clone + Debug + Default,
    {
        if results.is_empty() {
            return Err(NeuralError::ComputationError(
                "Cannot concatenate empty results".to_string(),
            ));
        }

        if results.len() == 1 {
            return Ok(results.into_iter().next().unwrap());
        }

        // Stitch all chunks together along axis 0; returning only the first
        // chunk would silently drop the rest of the batch.
        let views: Vec<_> = results.iter().map(|r| r.view()).collect();
        ndarray::concatenate(ndarray::Axis(0), &views).map_err(|e| {
            NeuralError::ComputationError(format!("Failed to concatenate chunk results: {}", e))
        })
    }

    /// Returns a snapshot of the current configuration.
    pub fn get_settings(&self) -> MemorySettings {
        MemorySettings {
            chunk_size: self.chunk_size,
            max_memory_mb: self.max_memory_mb,
        }
    }

    /// Updates the chunk size and/or memory budget in place.
    pub fn update_settings(&mut self, chunk_size: Option<usize>, max_memory_mb: Option<usize>) {
        if let Some(size) = chunk_size {
            self.chunk_size = size;
        }
        if let Some(memory) = max_memory_mb {
            self.max_memory_mb = memory;
        }
    }
}
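
// A minimal usage sketch for the chunked API, assuming a processor that is
// independent across samples; the module and test names are illustrative,
// not part of the original file.
#[cfg(all(test, feature = "memory_efficient"))]
mod chunking_example {
    use super::*;

    #[test]
    fn process_in_chunks_preserves_batch_size() {
        // A chunk size of 2 splits a 5-sample batch into chunks of 2, 2, 1.
        let processor = MemoryEfficientProcessor::new(Some(2), Some(64));
        let input = ArrayD::<f32>::ones(vec![5, 3]);
        let output = processor
            .process_in_chunks(&input, |chunk| Ok(chunk.mapv(|x| x * 2.0)))
            .unwrap();
        // Concatenation restores the full batch, with every element doubled.
        assert_eq!(output.shape(), &[5, 3]);
        assert!(output.iter().all(|&x| (x - 2.0).abs() < 1e-6));
    }
}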

/// Snapshot of a [`MemoryEfficientProcessor`]'s configuration.
#[derive(Debug, Clone)]
pub struct MemorySettings {
    pub chunk_size: usize,
    pub max_memory_mb: usize,
}

/// A simple shape-keyed pool that recycles tensors to cut allocation churn.
pub struct MemoryPool<T> {
    available_tensors: Vec<ArrayD<T>>,
    in_use: usize,
    max_pool_size: usize,
}

impl<T> MemoryPool<T>
where
    T: Clone + Default,
{
    /// Creates a pool that retains at most `max_pool_size` idle tensors.
    pub fn new(max_pool_size: usize) -> Self {
        Self {
            available_tensors: Vec::new(),
            in_use: 0,
            max_pool_size,
        }
    }

    /// Returns a tensor of the requested shape, reusing a pooled tensor with
    /// a matching shape when one is available.
    pub fn get_tensor(&mut self, shape: &[usize]) -> ArrayD<T> {
        for (i, tensor) in self.available_tensors.iter().enumerate() {
            if tensor.shape() == shape {
                self.in_use += 1;
                return self.available_tensors.swap_remove(i);
            }
        }

        // No pooled tensor matched; allocate a fresh one.
        self.in_use += 1;
        Array::default(shape.to_vec())
    }

    /// Returns a tensor to the pool, dropping it if the pool is full.
    pub fn return_tensor(&mut self, tensor: ArrayD<T>) {
        if self.available_tensors.len() < self.max_pool_size {
            self.available_tensors.push(tensor);
        }
        self.in_use = self.in_use.saturating_sub(1);
    }

    /// Reports how many tensors are pooled, in use, and the pool capacity.
    pub fn get_stats(&self) -> MemoryPoolStats {
        MemoryPoolStats {
            available: self.available_tensors.len(),
            in_use: self.in_use,
            max_size: self.max_pool_size,
        }
    }

    /// Drops all pooled tensors and resets the in-use counter.
    pub fn clear(&mut self) {
        self.available_tensors.clear();
        self.in_use = 0;
    }
}
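
// A minimal sketch of the pool's reuse cycle; the shapes and names below are
// illustrative only.
#[cfg(test)]
mod memory_pool_example {
    use super::*;

    #[test]
    fn tensors_are_recycled_by_shape() {
        let mut pool: MemoryPool<f32> = MemoryPool::new(4);

        // The first request allocates; returning the tensor parks it.
        let tensor = pool.get_tensor(&[2, 3]);
        pool.return_tensor(tensor);
        assert_eq!(pool.get_stats().available, 1);

        // A request with a matching shape is served from the pool.
        let reused = pool.get_tensor(&[2, 3]);
        assert_eq!(reused.shape(), &[2, 3]);
        assert_eq!(pool.get_stats().available, 0);
    }
}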

/// Occupancy statistics reported by [`MemoryPool::get_stats`].
#[derive(Debug, Clone)]
pub struct MemoryPoolStats {
    pub available: usize,
    pub in_use: usize,
    pub max_size: usize,
}

/// Which optimization paths were compiled in and detected at runtime.
#[derive(Debug, Clone)]
pub struct OptimizationCapabilities {
    pub simd_available: bool,
    pub memory_efficient_available: bool,
    pub thread_pool_available: bool,
    pub num_threads: usize,
}

impl OptimizationCapabilities {
    /// Detects which optimizations are compiled in and how many threads the
    /// host exposes.
    pub fn detect() -> Self {
        Self {
            simd_available: cfg!(feature = "simd"),
            memory_efficient_available: cfg!(feature = "memory_efficient"),
            thread_pool_available: true,
            num_threads: std::thread::available_parallelism()
                .map(|n| n.get())
                .unwrap_or(1),
        }
    }

    /// Returns true if every optimization path is available.
    pub fn all_available(&self) -> bool {
        self.simd_available && self.memory_efficient_available && self.thread_pool_available
    }

    /// Scores the configuration in `[0, 1]`: SIMD contributes up to 0.4,
    /// memory-efficient processing up to 0.3, and threading up to 0.3
    /// (scaled linearly up to 8 threads).
    pub fn optimization_score(&self) -> f32 {
        let mut score = 0.0;
        let mut max_score = 0.0;

        max_score += 0.4;
        if self.simd_available {
            score += 0.4;
        }

        max_score += 0.3;
        if self.memory_efficient_available {
            score += 0.3;
        }

        max_score += 0.3;
        if self.thread_pool_available {
            score += 0.3 * (self.num_threads as f32 / 8.0).min(1.0);
        }

        score / max_score
    }
}
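
// A small illustrative check (hypothetical test name): the score is a
// normalized fraction regardless of which features and how many threads are
// detected on the build host.
#[cfg(test)]
mod capabilities_example {
    use super::*;

    #[test]
    fn score_is_a_normalized_fraction() {
        let caps = OptimizationCapabilities::detect();
        let score = caps.optimization_score();
        assert!((0.0..=1.0).contains(&score));
        assert!(caps.num_threads >= 1);
    }
}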

impl std::fmt::Display for OptimizationCapabilities {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "Optimization Capabilities:")?;
        writeln!(f, "  SIMD: {}", if self.simd_available { "✓" } else { "✗" })?;
        writeln!(
            f,
            "  Memory Efficient: {}",
            if self.memory_efficient_available {
                "✓"
            } else {
                "✗"
            }
        )?;
        writeln!(
            f,
            "  Thread Pool: {}",
            if self.thread_pool_available {
                "✓"
            } else {
                "✗"
            }
        )?;
        writeln!(f, "  Threads: {}", self.num_threads)?;
        writeln!(
            f,
            "  Optimization Score: {:.1}%",
            self.optimization_score() * 100.0
        )?;
        Ok(())
    }
}

/// SIMD availability and the operations with vectorized implementations.
#[derive(Debug, Clone)]
pub struct SIMDStats {
    pub simd_available: bool,
    pub vector_width_f32: usize,
    pub vector_width_f64: usize,
    pub supported_operations: Vec<String>,
}

impl SIMDStats {
    /// Reports SIMD availability; the vector widths correspond to 256-bit
    /// lanes (8 x f32, 4 x f64), falling back to scalar width 1.
    pub fn detect() -> Self {
        Self {
            simd_available: cfg!(feature = "simd"),
            vector_width_f32: if cfg!(feature = "simd") { 8 } else { 1 },
            vector_width_f64: if cfg!(feature = "simd") { 4 } else { 1 },
            supported_operations: if cfg!(feature = "simd") {
                vec![
                    "relu".to_string(),
                    "sigmoid".to_string(),
                    "tanh".to_string(),
                    "gelu".to_string(),
                    "swish".to_string(),
                    "softmax".to_string(),
                    "cross_entropy".to_string(),
                    "matmul".to_string(),
                    "add".to_string(),
                    "conv2d".to_string(),
                    "batch_norm".to_string(),
                ]
            } else {
                vec![]
            },
        }
    }

    /// Upper-bound speedup from vectorizing f32 operations.
    pub fn theoretical_speedup(&self) -> f32 {
        if self.simd_available {
            self.vector_width_f32 as f32
        } else {
            1.0
        }
    }
}

impl std::fmt::Display for SIMDStats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "SIMD Operation Statistics:")?;
        writeln!(
            f,
            "  Available: {}",
            if self.simd_available { "✓" } else { "✗" }
        )?;
        writeln!(f, "  F32 Vector Width: {}", self.vector_width_f32)?;
        writeln!(f, "  F64 Vector Width: {}", self.vector_width_f64)?;
        writeln!(
            f,
            "  Theoretical Speedup: {:.1}x",
            self.theoretical_speedup()
        )?;
        writeln!(f, "  Supported Operations:")?;
        for op in &self.supported_operations {
            writeln!(f, "    - {}", op)?;
        }
        Ok(())
    }
}
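
// A sketch tying the reported width to the speedup estimate; the equality
// holds by construction of `theoretical_speedup`, with or without the `simd`
// feature (the scalar fallback reports width 1 and speedup 1.0x).
#[cfg(test)]
mod simd_stats_example {
    use super::*;

    #[test]
    fn speedup_tracks_f32_vector_width() {
        let stats = SIMDStats::detect();
        assert_eq!(stats.theoretical_speedup(), stats.vector_width_f32 as f32);
    }
}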

/// Tracks current, peak, and cumulative allocation statistics in bytes.
pub struct MemoryMonitor {
    peak_usage: usize,
    current_usage: usize,
    allocation_count: usize,
}

impl MemoryMonitor {
    /// Creates a monitor with all counters at zero.
    pub fn new() -> Self {
        Self {
            peak_usage: 0,
            current_usage: 0,
            allocation_count: 0,
        }
    }

    /// Records an allocation of `size` bytes and updates the peak.
    pub fn record_allocation(&mut self, size: usize) {
        self.current_usage += size;
        self.peak_usage = self.peak_usage.max(self.current_usage);
        self.allocation_count += 1;
    }

    /// Records a deallocation of `size` bytes, saturating at zero.
    pub fn record_deallocation(&mut self, size: usize) {
        self.current_usage = self.current_usage.saturating_sub(size);
    }

    /// Returns current and peak usage in megabytes plus the allocation count.
    pub fn get_stats(&self) -> MemoryStats {
        MemoryStats {
            current_usage_mb: self.current_usage as f32 / (1024.0 * 1024.0),
            peak_usage_mb: self.peak_usage as f32 / (1024.0 * 1024.0),
            allocation_count: self.allocation_count,
        }
    }

    /// Resets the statistics; the peak is re-seeded from current usage so
    /// still-live allocations remain accounted for.
    pub fn reset(&mut self) {
        self.peak_usage = self.current_usage;
        self.allocation_count = 0;
    }
}

impl Default for MemoryMonitor {
    fn default() -> Self {
        Self::new()
    }
}
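
// A minimal sketch of the monitor's bookkeeping (hypothetical sizes): the
// peak is a high-water mark that survives deallocations.
#[cfg(test)]
mod memory_monitor_example {
    use super::*;

    #[test]
    fn peak_usage_is_a_high_water_mark() {
        let mut monitor = MemoryMonitor::new();
        monitor.record_allocation(4 * 1024 * 1024); // 4 MB live
        monitor.record_allocation(2 * 1024 * 1024); // 6 MB live (new peak)
        monitor.record_deallocation(3 * 1024 * 1024); // 3 MB live

        let stats = monitor.get_stats();
        assert!((stats.current_usage_mb - 3.0).abs() < 1e-3);
        assert!((stats.peak_usage_mb - 6.0).abs() < 1e-3);
        assert_eq!(stats.allocation_count, 2);
    }
}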

/// Memory usage snapshot reported by [`MemoryMonitor::get_stats`].
#[derive(Debug, Clone)]
pub struct MemoryStats {
    pub current_usage_mb: f32,
    pub peak_usage_mb: f32,
    pub allocation_count: usize,
}
554
555impl std::fmt::Display for MemoryStats {
556 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
557 writeln!(f, "Memory Statistics:")?;
558 writeln!(f, " Current Usage: {:.1} MB", self.current_usage_mb)?;
559 writeln!(f, " Peak Usage: {:.1} MB", self.peak_usage_mb)?;
560 writeln!(f, " Allocations: {}", self.allocation_count)?;
561 Ok(())
562 }
563}

/// Fallback processor used when the `memory_efficient` feature is disabled;
/// every operation returns an error.
#[cfg(not(feature = "memory_efficient"))]
pub struct MemoryEfficientProcessor;

#[cfg(not(feature = "memory_efficient"))]
impl MemoryEfficientProcessor {
    pub fn new(_chunk_size: Option<usize>, _max_memory_mb: Option<usize>) -> Self {
        Self
    }

    pub fn process_in_chunks<F, T>(&self, _input: &ArrayD<f32>, _processor: F) -> Result<ArrayD<T>>
    where
        F: FnMut(&ArrayView<f32, IxDyn>) -> Result<ArrayD<T>>,
        T: Clone + Debug + Default,
    {
        Err(NeuralError::ComputationError(
            "Memory efficient processing requires the 'memory_efficient' feature".to_string(),
        ))
    }

    pub fn memory_efficient_forward<F>(
        &self,
        _input: &ArrayD<f32>,
        _forward_fn: F,
    ) -> Result<ArrayD<f32>>
    where
        F: Fn(&ArrayView<f32, IxDyn>) -> Result<ArrayD<f32>>,
    {
        Err(NeuralError::ComputationError(
            "Memory efficient forward requires the 'memory_efficient' feature".to_string(),
        ))
    }
}