torsh-core 0.1.2

Core types and traits for ToRSh deep learning framework
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
//! Storage system for tensor data management
//!
//! This module provides a comprehensive storage system for tensor data, including
//! memory allocation, NUMA awareness, memory mapping, pooling, and cross-device
//! operations. The system is designed to be flexible, efficient, and backend-agnostic.
//!
//! # Architecture
//!
//! The storage system is organized into several specialized modules:
//!
//! - [`core`] - Core storage interfaces and shared storage wrapper
//! - [`allocation`] - Backend allocation framework with typed memory handles
//! - [`memory_info`] - Memory information and allocation strategies
//! - [`operations`] - Memory copy operations and async support
//! - [`memory_format`] - Memory layout management for different tensor formats
//! - [`views`] - Storage view system for zero-copy tensor operations
//! - [`pooling`] - Memory pooling system with thread-local optimization
//! - [`numa`] - NUMA-aware memory allocation and topology detection
//! - [`mapped`] - Memory-mapped storage for large tensors with lazy loading
//! - [`registry`] - Registry for backend allocators and storage systems
//! - [`aligned`] - Aligned storage for SIMD optimization
//!
//! # Basic Usage
//!
//! ```ignore
//! use torsh_core::storage::{Storage, SharedStorage, BackendAllocator};
//! use torsh_core::device::CpuDevice;
//!
//! // Create a storage instance
//! let device = CpuDevice::new();
//! let storage = MyStorage::allocate(&device, 1000)?;
//! let shared = SharedStorage::new(storage);
//!
//! // Use with allocators
//! let allocator = MyAllocator::new();
//! let handle = allocator.allocate_raw(&device, 1024, 8)?;
//! ```
//!
//! # Memory Management
//!
//! The storage system provides several memory management strategies:
//!
//! - **Direct allocation** via [`BackendAllocator`] for simple cases
//! - **Pooled allocation** via [`pooling`] for frequent small allocations
//! - **NUMA-aware allocation** via [`numa`] for multi-socket systems
//! - **Memory-mapped storage** via [`mapped`] for large datasets
//! - **Cross-device operations** via [`operations`] for GPU/CPU transfers
//!
//! # Thread Safety
//!
//! All storage components are designed to be thread-safe and can be shared
//! across threads using [`SharedStorage`] and similar wrapper types.

// Module declarations
pub mod aligned;
pub mod allocation;
pub mod core;
pub mod mapped;
pub mod memory_format;
pub mod memory_info;
pub mod numa;
pub mod operations;
pub mod pooling;
pub mod registry;
pub mod views;

// Re-export all public items for backward compatibility
// This ensures that existing code using `use torsh_core::storage::SomeType` continues to work

// Core storage types
pub use self::core::{SharedStorage, Storage};

// Allocation system
pub use self::allocation::{
    AllocationRequest, BackendAllocator, RawMemoryHandle, TypedMemoryHandle, TypedMemoryStats,
};

// Memory information and strategies
pub use self::memory_info::{AllocationStrategy, MemoryInfo};

// Memory operations
pub use self::operations::{
    BackendAsyncMemory, BackendMemoryCopy, CopyOperation, MemoryOperationStats,
};

// Memory formats
pub use self::memory_format::{
    ConversionCost, FormatPreference, HardwareType, MemoryFormat, OperationType,
};

// Storage views
pub use self::views::{StorageView, ViewBuilder, ViewStatistics};

// Memory pooling
pub use self::pooling::{
    // Re-export pooled allocation functions
    allocate_pooled,
    allocate_pooled_with_value,
    clear_pooled_memory,
    configure_pools,
    deallocate_pooled,
    pooled_memory_stats,
    warmup_pools,
    MemoryPool,
    PoolConfig,
    PoolSizeStats,
    PoolStats,
};

// NUMA support
pub use self::numa::{
    MemoryAccessPattern, NumaAllocator, NumaMemoryHandle, NumaMetadata, NumaPolicy, NumaTopology,
    NumaTopologyStats, WorkloadType,
};

// Memory-mapped storage
pub use self::mapped::{
    AccessPatternStats, LazyLoadConfig, MappedSlice, MappedStorage, MappedStorageStats,
};

// Allocator registry
pub use self::registry::{
    AllocatorCapability, AllocatorMetadata, AllocatorRegistry, AllocatorRequirements,
    RegistryStatistics,
};

// Aligned storage for SIMD optimization
pub use self::aligned::{
    alignment, AlignedVec, AlignmentChecker, SimdLayoutAnalysis, SimdLayoutAnalyzer,
};

// Utility modules - re-export selected utilities
pub use self::allocation::utils as allocation_utils;
pub use self::memory_format::utils as memory_format_utils;
pub use self::numa::utils as numa_utils;
pub use self::operations::utils as operations_utils;
pub use self::registry::utils as registry_utils;
pub use self::views::utils as view_utils;

// Global registry functions
pub use self::registry::{global_registry, initialize_global_registry};

// Additional convenience re-exports for commonly used combinations
pub use self::allocation::TypedMemoryHandle as TensorMemoryHandle;
pub use self::core::SharedStorage as SharedTensorStorage;

/// Prelude module for common storage imports
///
/// This module provides a convenient way to import the most commonly used
/// storage types and traits. It re-exports only a subset of what the parent
/// module exposes; everything else must still be imported by name.
///
/// ```ignore
/// use torsh_core::storage::prelude::*;
/// ```
pub mod prelude {
    // Allocation: allocator interface plus raw and typed memory handles.
    pub use super::allocation::{BackendAllocator, RawMemoryHandle, TypedMemoryHandle};
    // Core storage interface and its shared wrapper.
    pub use super::core::{SharedStorage, Storage};
    // Memory layout descriptor.
    pub use super::memory_format::MemoryFormat;
    // Allocation strategy selector and memory information.
    pub use super::memory_info::{AllocationStrategy, MemoryInfo};
    // NUMA policy and topology types.
    pub use super::numa::{NumaPolicy, NumaTopology};
    // Pooled allocation entry points (allocate / deallocate pair only).
    pub use super::pooling::{allocate_pooled, deallocate_pooled};
    // Registry of backend allocators.
    pub use super::registry::AllocatorRegistry;
    // View type over a storage.
    pub use super::views::StorageView;
}

/// Utility functions that operate across multiple storage modules
///
/// These helpers combine types from several storage submodules (memory
/// formats, allocation strategies, NUMA topology, views, pooling and the
/// allocator registry) into higher-level convenience entry points.
pub mod utils {
    use super::*;

    /// Largest request (64 KiB) for which frequently accessed data is pooled.
    const POOLED_MAX_BYTES: usize = 64 * 1024;
    /// Smallest request (1 GiB) for which large-block data is pre-allocated.
    const PREALLOCATED_MIN_BYTES: usize = 1024 * 1024 * 1024;
    /// Combined allocation size (1 MiB) that must be exceeded before NUMA pays off.
    const NUMA_MIN_TOTAL_BYTES: usize = 1024 * 1024;
    /// Upper bound on the length of a view created for random access.
    const RANDOM_ACCESS_VIEW_CAP: usize = 64 * 1024;
    /// Upper bound on the length of the initial view created for lazy access.
    const LAZY_INITIAL_VIEW_CAP: usize = 4096;

    /// Create the default storage configuration.
    ///
    /// The function takes no arguments, so the result does not depend on any
    /// device: contiguous memory format, immediate allocation, local-preferred
    /// NUMA policy, pooling enabled, memory mapping disabled and the default
    /// lazy-load settings.
    pub fn default_storage_config() -> StorageConfig {
        StorageConfig {
            memory_format: MemoryFormat::Contiguous,
            allocation_strategy: AllocationStrategy::Immediate,
            numa_policy: NumaPolicy::LocalPreferred,
            enable_pooling: true,
            enable_memory_mapping: false,
            lazy_load_config: mapped::LazyLoadConfig::default(),
        }
    }

    /// Get recommended memory format for a tensor shape and operation
    ///
    /// Thin wrapper that forwards all three arguments unchanged to
    /// `memory_format_utils::optimal_format_for_tensor` and returns its result.
    pub fn recommend_memory_format(
        shape: &[usize],
        operation: OperationType,
        hardware: HardwareType,
    ) -> MemoryFormat {
        memory_format_utils::optimal_format_for_tensor(shape, operation, hardware)
    }

    /// Calculate optimal allocation strategy based on size and access pattern
    ///
    /// Decision table:
    ///
    /// - `Frequent` and `size_bytes <= 64 KiB` -> [`AllocationStrategy::Pooled`]
    /// - `Large` and `size_bytes >= 1 GiB` -> [`AllocationStrategy::PreAllocated`]
    /// - `Lazy` (any size) -> [`AllocationStrategy::Lazy`]
    /// - everything else -> [`AllocationStrategy::Immediate`]
    pub fn recommend_allocation_strategy(
        size_bytes: usize,
        access_pattern: AccessPattern,
    ) -> AllocationStrategy {
        match access_pattern {
            AccessPattern::Frequent if size_bytes <= POOLED_MAX_BYTES => AllocationStrategy::Pooled,
            AccessPattern::Large if size_bytes >= PREALLOCATED_MIN_BYTES => {
                AllocationStrategy::PreAllocated
            }
            AccessPattern::Lazy => AllocationStrategy::Lazy,
            // Spelled out instead of `_` so that adding a variant to
            // `AccessPattern` forces this table to be revisited. `Frequent`
            // and `Large` land here when their size guard above did not match.
            AccessPattern::Sequential
            | AccessPattern::Random
            | AccessPattern::Frequent
            | AccessPattern::Large => AllocationStrategy::Immediate,
        }
    }

    /// Check if NUMA optimization would be beneficial
    ///
    /// Returns `true` only when all of the following hold, checked in order:
    ///
    /// 1. the topology reports more than one node,
    /// 2. the combined size of `allocation_sizes` exceeds 1 MiB,
    /// 3. `numa_utils::has_numa_topology` accepts the topology.
    ///
    /// The sizes are added with saturating arithmetic: a set of very large
    /// requests clamps at `usize::MAX` (still above the threshold) instead of
    /// panicking in debug builds or wrapping to a small value in release builds.
    pub fn should_use_numa(allocation_sizes: &[usize], numa_topology: &NumaTopology) -> bool {
        // Only beneficial for multi-node systems with significant allocations
        numa_topology.node_count > 1
            && allocation_sizes
                .iter()
                .fold(0usize, |total, &size| total.saturating_add(size))
                > NUMA_MIN_TOTAL_BYTES
            && numa_utils::has_numa_topology(numa_topology)
    }

    /// Create a storage view that automatically optimizes for the access pattern
    ///
    /// The view always starts at offset 0; only its length depends on
    /// `access_pattern`:
    ///
    /// - `Sequential`, `Frequent`, `Large`: the full storage length
    /// - `Random`: at most `64 * 1024`
    /// - `Lazy`: at most `4096`
    ///
    /// NOTE(review): the caps are compared against `Storage::len()`, so they are
    /// expressed in whatever unit that method reports. The original comments
    /// called them "64KB" / "4KB", which is accurate only if `len()` counts
    /// bytes - confirm against the `Storage` trait.
    ///
    /// # Errors
    ///
    /// Propagates whatever error `StorageView::new` returns.
    pub fn create_optimized_view<S: Storage>(
        storage: SharedStorage<S>,
        access_pattern: AccessPattern,
    ) -> Result<StorageView<S>, crate::error::TorshError> {
        let full_len = storage.get().len();
        let view_len = match access_pattern {
            // Whole-storage views: nothing to gain from truncating.
            AccessPattern::Sequential | AccessPattern::Frequent | AccessPattern::Large => full_len,
            // For random access, create a smaller view to enable better caching
            AccessPattern::Random => std::cmp::min(full_len, RANDOM_ACCESS_VIEW_CAP),
            // Start with a small view for lazy access
            AccessPattern::Lazy => std::cmp::min(full_len, LAZY_INITIAL_VIEW_CAP),
        };
        StorageView::new(storage, 0, view_len)
    }

    /// Get storage statistics across all components
    ///
    /// Combines the per-type pool statistics from `pooled_memory_stats` with
    /// the statistics reported by the global allocator registry.
    ///
    /// # Panics
    ///
    /// Panics if the read guard on the global registry cannot be acquired (the
    /// `expect` message indicates this means a poisoned lock).
    pub fn storage_system_stats() -> StorageSystemStats {
        let pool_stats = pooled_memory_stats();
        let registry = global_registry();
        let registry_stats = registry
            .read()
            .expect("lock should not be poisoned")
            .statistics();

        StorageSystemStats {
            pooled_memory_types: pool_stats.len(),
            total_pooled_allocations: pool_stats
                .values()
                .map(|s| s.total_cached_allocations as u64)
                .sum(),
            registered_allocators: registry_stats.total_allocators,
            backend_types: registry_stats.backend_counts.len(),
        }
    }
}

/// Configuration for storage system behavior
///
/// The [`Default`] implementation delegates to
/// `utils::default_storage_config`, which selects a contiguous memory format,
/// immediate allocation, a local-preferred NUMA policy, pooling enabled,
/// memory mapping disabled and the default lazy-load configuration.
#[derive(Debug, Clone)]
pub struct StorageConfig {
    /// Default memory format to use
    pub memory_format: MemoryFormat,
    /// Default allocation strategy
    pub allocation_strategy: AllocationStrategy,
    /// NUMA policy for multi-node systems
    pub numa_policy: NumaPolicy,
    /// Whether to enable memory pooling for small allocations
    pub enable_pooling: bool,
    /// Whether to enable memory mapping for large data
    pub enable_memory_mapping: bool,
    /// Configuration for lazy loading when memory mapping is enabled
    pub lazy_load_config: mapped::LazyLoadConfig,
}

/// `StorageConfig::default()` and `utils::default_storage_config()` return the
/// same value; the free function is the single place where defaults are defined.
impl Default for StorageConfig {
    /// Returns the configuration produced by `utils::default_storage_config`.
    fn default() -> Self {
        utils::default_storage_config()
    }
}

/// Access pattern hints for optimization
///
/// Consumed by `utils::recommend_allocation_strategy` and
/// `utils::create_optimized_view` to pick an allocation strategy and an
/// initial view length respectively.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AccessPattern {
    /// Sequential access pattern
    ///
    /// `utils::create_optimized_view` returns a view over the whole storage.
    Sequential,
    /// Random access pattern
    ///
    /// `utils::create_optimized_view` caps the view length at `64 * 1024`.
    Random,
    /// Frequent small accesses
    ///
    /// `utils::recommend_allocation_strategy` selects pooling for requests of
    /// at most `64 * 1024` bytes.
    Frequent,
    /// Large block access
    ///
    /// `utils::recommend_allocation_strategy` selects pre-allocation for
    /// requests of at least `1024 * 1024 * 1024` bytes.
    Large,
    /// Lazy/deferred access
    ///
    /// `utils::recommend_allocation_strategy` always selects the lazy strategy
    /// and `utils::create_optimized_view` caps the initial view length at `4096`.
    Lazy,
}

/// System-wide storage statistics
///
/// Produced by `utils::storage_system_stats`, which combines the pooled-memory
/// statistics with the statistics of the global allocator registry.
#[derive(Debug, Clone)]
pub struct StorageSystemStats {
    /// Number of different types using pooled memory
    /// (the number of entries returned by `pooled_memory_stats`)
    pub pooled_memory_types: usize,
    /// Total number of pooled allocations across all types
    /// (the sum of `total_cached_allocations` over every pool entry)
    pub total_pooled_allocations: u64,
    /// Number of registered allocators
    /// (the registry statistics' `total_allocators`)
    pub registered_allocators: usize,
    /// Number of different backend types
    /// (the number of entries in the registry statistics' `backend_counts`)
    pub backend_types: usize,
}

/// Renders the statistics as a compact, single-line summary.
impl std::fmt::Display for StorageSystemStats {
    /// Writes `StorageSystem(pooled_types=.., pooled_allocs=.., allocators=.., backends=..)`
    /// with each placeholder filled from the field of the corresponding meaning.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Destructure exhaustively so that a newly added field makes this
        // impl fail to compile until the summary is updated.
        let Self {
            pooled_memory_types,
            total_pooled_allocations,
            registered_allocators,
            backend_types,
        } = self;

        f.write_fmt(format_args!(
            "StorageSystem(pooled_types={}, pooled_allocs={}, allocators={}, backends={})",
            pooled_memory_types, total_pooled_allocations, registered_allocators, backend_types
        ))
    }
}

// Integration-style tests: they exercise the re-exported storage types and the
// `utils` helpers together, using a minimal in-memory `Storage` implementation.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::device::CpuDevice;

    // Simple test storage implementation
    // Backed by a plain `Vec<f32>`; `len()` reports the number of elements.
    #[derive(Debug)]
    struct TestStorage {
        data: Vec<f32>,
        device: CpuDevice,
    }

    impl Storage for TestStorage {
        type Elem = f32;
        type Device = CpuDevice;

        // Allocates `size` zero-initialised elements; never fails.
        fn allocate(device: &Self::Device, size: usize) -> Result<Self, crate::error::TorshError> {
            Ok(TestStorage {
                data: vec![0.0; size],
                device: device.clone(),
            })
        }

        // Element count of the backing vector.
        fn len(&self) -> usize {
            self.data.len()
        }

        fn device(&self) -> &Self::Device {
            &self.device
        }

        // Deep copy: clones both the data and the device handle.
        fn clone_storage(&self) -> Result<Self, crate::error::TorshError> {
            Ok(TestStorage {
                data: self.data.clone(),
                device: self.device.clone(),
            })
        }
    }

    // `SharedStorage` exposes the wrapped storage and counts its clones.
    #[test]
    fn test_storage_integration() {
        let device = CpuDevice::new();
        let storage = TestStorage::allocate(&device, 100).expect("allocate should succeed");
        let shared = SharedStorage::new(storage);

        assert_eq!(shared.get().len(), 100);
        assert_eq!(shared.strong_count(), 1);

        // Test cloning
        // The clone must stay alive until after the assertion, hence the binding.
        let _cloned_shared = shared.clone();
        assert_eq!(shared.strong_count(), 2);
    }

    // Views report their offset/length, and slicing a view composes offsets.
    #[test]
    fn test_storage_view_integration() {
        let device = CpuDevice::new();
        let storage = TestStorage::allocate(&device, 100).expect("allocate should succeed");
        let shared = SharedStorage::new(storage);

        let view =
            StorageView::new(shared.clone(), 10, 20).expect("StorageView::new should succeed");
        assert_eq!(view.offset(), 10);
        assert_eq!(view.view_len(), 20);

        let sub_view = view.slice(5, 10).expect("slice should succeed");
        assert_eq!(sub_view.offset(), 15); // 10 + 5
        assert_eq!(sub_view.view_len(), 10);
    }

    // Predicates on `MemoryFormat` are reachable through the re-export.
    #[test]
    fn test_memory_format_integration() {
        let format = MemoryFormat::Contiguous;
        assert!(format.is_contiguous());
        assert!(!format.is_channels_last());

        let channels_last = MemoryFormat::ChannelsLast;
        assert!(channels_last.is_channels_last());
        assert_eq!(channels_last.expected_dims(), Some(4));
    }

    // `StorageConfig::default()` yields the values set in `utils::default_storage_config`.
    #[test]
    fn test_storage_config() {
        let config = StorageConfig::default();
        assert_eq!(config.memory_format, MemoryFormat::Contiguous);
        assert_eq!(config.allocation_strategy, AllocationStrategy::Immediate);
        assert!(config.enable_pooling);
    }

    // Covers the `Pooled` and `PreAllocated` branches of the strategy recommendation.
    // NOTE(review): the `Lazy` branch and the `Immediate` fallback are not exercised here.
    #[test]
    fn test_utils_recommendations() {
        // Test memory format recommendation
        let shape = [1, 3, 224, 224]; // NCHW
        let format =
            utils::recommend_memory_format(&shape, OperationType::Convolution, HardwareType::GPU);
        assert_eq!(format, MemoryFormat::ChannelsLast); // Should recommend NHWC for GPU convolution

        // Test allocation strategy recommendation
        let strategy = utils::recommend_allocation_strategy(1024, AccessPattern::Frequent);
        assert_eq!(strategy, AllocationStrategy::Pooled);

        let strategy =
            utils::recommend_allocation_strategy(2 * 1024 * 1024 * 1024, AccessPattern::Large);
        assert_eq!(strategy, AllocationStrategy::PreAllocated);
    }

    // Checks the view length chosen for sequential and random access.
    #[test]
    fn test_optimized_view_creation() {
        let device = CpuDevice::new();
        let storage = TestStorage::allocate(&device, 1000).expect("allocate should succeed");
        let shared = SharedStorage::new(storage);

        // Test sequential access view
        let view = utils::create_optimized_view(shared.clone(), AccessPattern::Sequential)
            .expect("create_optimized_view should succeed");
        assert_eq!(view.view_len(), 1000); // Should use full view for sequential access

        // Test random access view
        // NOTE(review): with only 1000 elements the cap is never reached, so this
        // bound holds for any view no longer than the storage; a storage larger
        // than 64 * 1024 would be needed to observe the truncation.
        let view = utils::create_optimized_view(shared.clone(), AccessPattern::Random)
            .expect("create_optimized_view should succeed");
        assert!(view.view_len() <= 64 * 1024); // Should use smaller chunk for random access
    }

    // Compile-time check that the prelude brings the expected names into scope.
    #[test]
    fn test_prelude_imports() {
        // Test that prelude imports work correctly
        use super::prelude::*;

        let device = CpuDevice::new();
        let storage = TestStorage::allocate(&device, 10).expect("allocate should succeed");
        let _shared = SharedStorage::new(storage);

        let _format = MemoryFormat::Contiguous;
        let _strategy = AllocationStrategy::Immediate;
    }

    // Compile-time check that the module-level re-exports are still available.
    #[test]
    fn test_backward_compatibility() {
        // Test that all re-exported types are available at the module level
        let _: MemoryFormat = MemoryFormat::Contiguous;
        let _: AllocationStrategy = AllocationStrategy::Immediate;
        let _: NumaPolicy = NumaPolicy::LocalPreferred;

        // Test that utility functions are accessible
        let _config = utils::default_storage_config();
        let _stats = utils::storage_system_stats();
    }

    // Smoke test: only verifies that gathering statistics does not panic.
    #[test]
    fn test_storage_system_stats() {
        let stats = utils::storage_system_stats();

        // Basic sanity checks (all fields are unsigned, so no need for >= 0 checks)
        // Just check that stats are available
        let _check = stats.registered_allocators;
        let _check = stats.backend_types;
        let _check = stats.pooled_memory_types;
    }
}