use crate::error::{NeuralError, Result};
use scirs2_core::ndarray::{Array, ArrayD, ArrayView, IxDyn};
use std::fmt::Debug;
#[cfg(feature = "memory_efficient")]
use scirs2_core::memory_efficient::{chunk_wise_op, ChunkingStrategy};

/// Memory efficient processor for handling large models by chunking work along the batch axis.
#[cfg(feature = "memory_efficient")]
pub struct MemoryEfficientProcessor {
    /// Maximum number of samples processed per chunk
    chunk_size: usize,
    /// Soft memory budget in megabytes
    max_memory_mb: usize,
}
#[cfg(feature = "memory_efficient")]
impl MemoryEfficientProcessor {
    /// Create a new processor with the given chunk size and memory budget (in MB).
    pub fn new(chunk_size: Option<usize>, max_memory_mb: Option<usize>) -> Self {
        Self {
            chunk_size: chunk_size.unwrap_or(1024),
            max_memory_mb: max_memory_mb.unwrap_or(512),
        }
    }
    /// Process input data in batch-dimension chunks to reduce peak memory usage.
    pub fn process_in_chunks<F, T>(
        &self,
        input: &ArrayD<f32>,
        mut processor: F,
    ) -> Result<ArrayD<T>>
    where
        F: FnMut(&ArrayView<f32, IxDyn>) -> Result<ArrayD<T>>,
        T: Clone + Debug + Default,
    {
        let batch_size = input.shape()[0];
        // Small inputs can be processed in a single pass.
        if batch_size <= self.chunk_size {
            return processor(&input.view());
        }
        let mut results = Vec::new();
        let mut start_idx = 0;
        while start_idx < batch_size {
            let end_idx = (start_idx + self.chunk_size).min(batch_size);
            let chunk = input.slice(scirs2_core::ndarray::s![start_idx..end_idx, ..]);
            let result = processor(&chunk.into_dyn())?;
            results.push(result);
            start_idx = end_idx;
        }
        if results.is_empty() {
            return Err(NeuralError::ComputationError(
                "No chunks were processed".to_string(),
            ));
        }
        self.concatenate_results(results)
    }
    /// Perform a memory-efficient forward pass using scirs2-core's chunk-wise operations.
    pub fn memory_efficient_forward<F>(&self, input: &ArrayD<f32>, forward_fn: F) -> Result<ArrayD<f32>>
    where
        F: Fn(&ArrayView<f32, IxDyn>) -> Result<ArrayD<f32>>,
    {
        let strategy = ChunkingStrategy::Fixed(self.chunk_size);
        chunk_wise_op(input, forward_fn, strategy).map_err(|e| {
            NeuralError::ComputationError(format!("Memory-efficient forward failed: {:?}", e))
        })
    }

    /// Compute gradients chunk-by-chunk over the batch axis to bound peak memory usage.
    pub fn memory_efficient_gradient<F>(&self, input: &ArrayD<f32>, target: &ArrayD<f32>, gradient_fn: F) -> Result<ArrayD<f32>>
    where
        F: Fn(&ArrayView<f32, IxDyn>, &ArrayView<f32, IxDyn>) -> Result<ArrayD<f32>>,
    {
        if input.shape() != target.shape() {
            return Err(NeuralError::ComputationError(
                "Input and target must have same shape for gradient computation".to_string(),
            ));
        }
        let batch_size = input.shape()[0];
        if batch_size <= self.chunk_size {
            return gradient_fn(&input.view(), &target.view());
        }
        let mut gradients = Vec::new();
        let mut start_idx = 0;
        while start_idx < batch_size {
            let end_idx = (start_idx + self.chunk_size).min(batch_size);
            let input_chunk = input.slice(scirs2_core::ndarray::s![start_idx..end_idx, ..]);
            let target_chunk = target.slice(scirs2_core::ndarray::s![start_idx..end_idx, ..]);
            let gradient = gradient_fn(&input_chunk.into_dyn(), &target_chunk.into_dyn())?;
            gradients.push(gradient);
            start_idx = end_idx;
        }
        self.concatenate_results(gradients)
    }
    /// Calculate an optimal chunk size for the given tensor shape (batch axis first).
    pub fn calculate_optimal_chunk_size(&self, tensor_shape: &[usize], element_size: usize) -> usize {
        let elements_per_sample = tensor_shape[1..].iter().product::<usize>();
        let bytes_per_sample = elements_per_sample * element_size;
        // Reserve roughly a third of the budget for one chunk, leaving room for outputs and temporaries.
        let available_bytes = (self.max_memory_mb * 1024 * 1024) / 3;
        let optimal_chunk = available_bytes / bytes_per_sample;
        optimal_chunk.max(1).min(self.chunk_size)
    }
    /// Estimate memory usage in bytes for a tensor of the given shape and element size.
    pub fn estimate_memory_usage(&self, shape: &[usize], element_size: usize) -> usize {
        let total_elements: usize = shape.iter().product();
        total_elements * element_size
    }

    /// Check whether a tensor of the given shape fits within the configured memory budget.
    pub fn fits_in_memory(&self, shape: &[usize], element_size: usize) -> bool {
        let memory_usage = self.estimate_memory_usage(shape, element_size);
        let max_bytes = self.max_memory_mb * 1024 * 1024;
        memory_usage <= max_bytes
    }
    /// Concatenate per-chunk results back into a single tensor along the batch axis.
    fn concatenate_results<T>(&self, results: Vec<ArrayD<T>>) -> Result<ArrayD<T>>
    where
        T: Clone + Debug + Default,
    {
        if results.is_empty() {
            return Err(NeuralError::ComputationError(
                "Cannot concatenate empty results".to_string(),
            ));
        }
        if results.len() == 1 {
            return Ok(results.into_iter().next().expect("Operation failed"));
        }
        let views: Vec<_> = results.iter().map(|r| r.view()).collect();
        scirs2_core::ndarray::concatenate(scirs2_core::ndarray::Axis(0), &views).map_err(|e| {
            NeuralError::ComputationError(format!("Failed to concatenate chunk results: {:?}", e))
        })
    }
    /// Get the current memory settings.
    pub fn get_settings(&self) -> MemorySettings {
        MemorySettings {
            chunk_size: self.chunk_size,
            max_memory_mb: self.max_memory_mb,
        }
    }

    /// Update the chunk size and/or memory budget.
    pub fn update_settings(&mut self, chunk_size: Option<usize>, max_memory_mb: Option<usize>) {
        if let Some(size) = chunk_size {
            self.chunk_size = size;
        }
        if let Some(memory) = max_memory_mb {
            self.max_memory_mb = memory;
        }
    }
}
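
// Illustrative usage sketch (not part of the original module): chunk a large f32 batch
// through a per-chunk closure. The ReLU body is only a placeholder computation, and the
// chunk size / budget values are arbitrary.
#[cfg(feature = "memory_efficient")]
#[allow(dead_code)]
fn example_chunked_processing(input: &ArrayD<f32>) -> Result<ArrayD<f32>> {
    let processor = MemoryEfficientProcessor::new(Some(256), Some(256));
    // Fall back to a single pass when the whole tensor fits within the configured budget.
    if processor.fits_in_memory(input.shape(), std::mem::size_of::<f32>()) {
        return Ok(input.mapv(|x| x.max(0.0)));
    }
    processor.process_in_chunks(input, |chunk| Ok(chunk.mapv(|x| x.max(0.0))))
}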
/// Memory-related settings for a `MemoryEfficientProcessor`.
#[derive(Debug, Clone)]
pub struct MemorySettings {
    /// Number of samples processed per chunk
    pub chunk_size: usize,
    /// Soft memory budget in megabytes
    pub max_memory_mb: usize,
}
/// Simple tensor pool that recycles buffers of matching shapes to reduce allocations.
pub struct MemoryPool<T> {
    available_tensors: Vec<ArrayD<T>>,
    in_use: usize,
    max_pool_size: usize,
}
impl<T> MemoryPool<T>
where
    T: Clone + Default,
{
    /// Create a new pool that retains at most `max_pool_size` idle tensors.
    pub fn new(max_pool_size: usize) -> Self {
        Self {
            available_tensors: Vec::new(),
            in_use: 0,
            max_pool_size,
        }
    }

    /// Get a tensor of the requested shape, reusing a pooled buffer when one matches.
    pub fn get_tensor(&mut self, shape: &[usize]) -> ArrayD<T> {
        if let Some(i) = self
            .available_tensors
            .iter()
            .position(|tensor| tensor.shape() == shape)
        {
            self.in_use += 1;
            return self.available_tensors.swap_remove(i);
        }
        // No pooled tensor matched; allocate a fresh default-initialized buffer.
        self.in_use += 1;
        Array::default(shape.to_vec())
    }

    /// Return a tensor to the pool (it is dropped if the pool is already full).
    pub fn return_tensor(&mut self, tensor: ArrayD<T>) {
        if self.available_tensors.len() < self.max_pool_size {
            self.available_tensors.push(tensor);
        }
        self.in_use = self.in_use.saturating_sub(1);
    }

    /// Get statistics about pool occupancy.
    pub fn get_stats(&self) -> MemoryPoolStats {
        MemoryPoolStats {
            available: self.available_tensors.len(),
            in_use: self.in_use,
            max_size: self.max_pool_size,
        }
    }

    /// Clear all pooled tensors and reset the in-use counter.
    pub fn clear(&mut self) {
        self.available_tensors.clear();
        self.in_use = 0;
    }
}
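
// Illustrative sketch (not part of the original module): reuse scratch buffers through the
// pool so repeated passes avoid reallocating identically shaped tensors. The shape is arbitrary.
#[allow(dead_code)]
fn example_pooled_buffer(pool: &mut MemoryPool<f32>) {
    // Borrow a default-initialized buffer, or a recycled one with a matching shape.
    let scratch = pool.get_tensor(&[32, 128]);
    // ... use `scratch` as temporary workspace ...
    pool.return_tensor(scratch);
}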
/// Statistics describing the state of a `MemoryPool`.
#[derive(Debug, Clone)]
pub struct MemoryPoolStats {
    pub available: usize,
    pub in_use: usize,
    pub max_size: usize,
}
/// Summary of which optimization features are available in this build.
#[derive(Debug, Clone)]
pub struct OptimizationCapabilities {
    pub simd_available: bool,
    pub memory_efficient_available: bool,
    pub thread_pool_available: bool,
    pub num_threads: usize,
}
impl OptimizationCapabilities {
    /// Detect the capabilities compiled into this build and available on this system.
    pub fn detect() -> Self {
        Self {
            simd_available: cfg!(feature = "simd"),
            memory_efficient_available: cfg!(feature = "memory_efficient"),
            thread_pool_available: true,
            num_threads: std::thread::available_parallelism()
                .map(|n| n.get())
                .unwrap_or(1),
        }
    }

    /// Check whether all optimizations are available.
    pub fn all_available(&self) -> bool {
        self.simd_available && self.memory_efficient_available && self.thread_pool_available
    }

    /// Score the available optimizations in [0, 1]: SIMD weighs 0.4, memory efficiency 0.3,
    /// and threading 0.3 (scaled by thread count, saturating at 8 threads).
    pub fn optimization_score(&self) -> f32 {
        let mut score = 0.0;
        let mut max_score = 0.0;

        max_score += 0.4;
        if self.simd_available {
            score += 0.4;
        }

        max_score += 0.3;
        if self.memory_efficient_available {
            score += 0.3;
        }

        max_score += 0.3;
        if self.thread_pool_available {
            score += 0.3 * (self.num_threads as f32 / 8.0).min(1.0);
        }

        score / max_score
    }
}
impl std::fmt::Display for OptimizationCapabilities {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "Optimization Capabilities:")?;
        writeln!(f, "  SIMD: {}", if self.simd_available { "✓" } else { "✗" })?;
        writeln!(
            f,
            "  Memory Efficient: {}",
            if self.memory_efficient_available {
                "✓"
            } else {
                "✗"
            }
        )?;
        writeln!(
            f,
            "  Thread Pool: {}",
            if self.thread_pool_available { "✓" } else { "✗" }
        )?;
        writeln!(f, "  Threads: {}", self.num_threads)?;
        writeln!(
            f,
            "  Optimization Score: {:.1}%",
            self.optimization_score() * 100.0
        )?;
        Ok(())
    }
}
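
// Illustrative sketch (not part of the original module): query the detected capabilities at
// startup, log them, and use the score to decide whether optimized paths are worth enabling.
// The 0.5 threshold is chosen purely for illustration.
#[allow(dead_code)]
fn example_report_capabilities() -> bool {
    let caps = OptimizationCapabilities::detect();
    println!("{}", caps);
    caps.optimization_score() > 0.5
}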
/// SIMD operation statistics and capabilities
#[derive(Debug, Clone)]
pub struct SIMDStats {
    /// Whether SIMD is available
    pub simd_available: bool,
    /// Vector width for f32 operations
    pub vector_width_f32: usize,
    /// Vector width for f64 operations
    pub vector_width_f64: usize,
    /// List of supported SIMD operations
    pub supported_operations: Vec<String>,
}
impl SIMDStats {
    /// Create SIMD stats with detection
    pub fn new() -> Self {
        Self {
            simd_available: cfg!(feature = "simd"),
            vector_width_f32: if cfg!(feature = "simd") { 8 } else { 1 },
            vector_width_f64: if cfg!(feature = "simd") { 4 } else { 1 },
            supported_operations: if cfg!(feature = "simd") {
                vec![
                    "relu".to_string(),
                    "sigmoid".to_string(),
                    "tanh".to_string(),
                    "gelu".to_string(),
                    "swish".to_string(),
                    "softmax".to_string(),
                    "cross_entropy".to_string(),
                    "matmul".to_string(),
                    "add".to_string(),
                    "conv2d".to_string(),
                    "batch_norm".to_string(),
                ]
            } else {
                vec![]
            },
        }
    }

    /// Get theoretical speedup for SIMD operations
    pub fn theoretical_speedup(&self) -> f32 {
        if self.simd_available {
            self.vector_width_f32 as f32
        } else {
            1.0
        }
    }
}
impl std::fmt::Display for SIMDStats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "SIMD Operation Statistics:")?;
        writeln!(
            f,
            "  Available: {}",
            if self.simd_available { "✓" } else { "✗" }
        )?;
        writeln!(f, "  F32 Vector Width: {}", self.vector_width_f32)?;
        writeln!(f, "  F64 Vector Width: {}", self.vector_width_f64)?;
        writeln!(
            f,
            "  Theoretical Speedup: {:.1}x",
            self.theoretical_speedup()
        )?;
        writeln!(f, "  Supported Operations:")?;
        for op in &self.supported_operations {
            writeln!(f, "    - {}", op)?;
        }
        Ok(())
    }
}
/// Memory usage monitor for tracking neural network memory consumption
#[derive(Debug, Clone)]
pub struct MemoryMonitor {
    peak_usage: usize,
    current_usage: usize,
    allocation_count: usize,
}
impl MemoryMonitor {
    /// Create a new memory monitor
    pub fn new() -> Self {
        Self {
            peak_usage: 0,
            current_usage: 0,
            allocation_count: 0,
        }
    }

    /// Record a memory allocation of `size` bytes.
    pub fn record_allocation(&mut self, size: usize) {
        self.current_usage += size;
        self.peak_usage = self.peak_usage.max(self.current_usage);
        self.allocation_count += 1;
    }

    /// Record a memory deallocation of `size` bytes.
    pub fn record_deallocation(&mut self, size: usize) {
        self.current_usage = self.current_usage.saturating_sub(size);
    }

    /// Get current memory usage statistics
    pub fn get_stats(&self) -> MemoryStats {
        MemoryStats {
            current_usage_mb: self.current_usage as f32 / (1024.0 * 1024.0),
            peak_usage_mb: self.peak_usage as f32 / (1024.0 * 1024.0),
            allocation_count: self.allocation_count,
        }
    }

    /// Reset peak tracking and the allocation counter (current usage is retained).
    pub fn reset(&mut self) {
        self.peak_usage = self.current_usage;
        self.allocation_count = 0;
    }
}
impl Default for MemoryMonitor {
    fn default() -> Self {
        Self::new()
    }
}
/// Memory usage statistics
#[derive(Debug, Clone)]
pub struct MemoryStats {
    /// Current memory usage in MB
    pub current_usage_mb: f32,
    /// Peak memory usage in MB
    pub peak_usage_mb: f32,
    /// Number of allocations recorded
    pub allocation_count: usize,
}
impl std::fmt::Display for MemoryStats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "Memory Statistics:")?;
        writeln!(f, "  Current Usage: {:.1} MB", self.current_usage_mb)?;
        writeln!(f, "  Peak Usage: {:.1} MB", self.peak_usage_mb)?;
        writeln!(f, "  Allocations: {}", self.allocation_count)?;
        Ok(())
    }
}
// Provide no-op implementations when the `memory_efficient` feature is not available.

/// Memory efficient processor for handling large models (no-op implementation when feature disabled)
#[cfg(not(feature = "memory_efficient"))]
pub struct MemoryEfficientProcessor;

#[cfg(not(feature = "memory_efficient"))]
impl MemoryEfficientProcessor {
    /// Create a new memory efficient processor
    pub fn new(_chunk_size: Option<usize>, _max_memory_mb: Option<usize>) -> Self {
        Self
    }

    /// Process input data in chunks to reduce memory usage
    pub fn process_in_chunks<F, T>(&mut self, _input: &ArrayD<f32>, _processor: F) -> Result<ArrayD<T>>
    where
        F: FnMut(&ArrayView<f32, IxDyn>) -> Result<ArrayD<T>>,
        T: Clone + Debug + Default,
    {
        Err(NeuralError::ComputationError(
            "Memory efficient processing requires 'memory_efficient' feature".to_string(),
        ))
    }

    /// Perform memory-efficient forward pass
    pub fn memory_efficient_forward<F>(&self, _input: &ArrayD<f32>, _forward_fn: F) -> Result<ArrayD<f32>>
    where
        F: Fn(&ArrayView<f32, IxDyn>) -> Result<ArrayD<f32>>,
    {
        Err(NeuralError::ComputationError(
            "Memory efficient forward requires 'memory_efficient' feature".to_string(),
        ))
    }
}