tenflowers-core 0.1.1

Core tensor operations and execution engine for TenfloweRS
//! # TenfloweRS Core
//!
//! The core tensor operations and device management library for the TenfloweRS machine learning framework.
//! This crate provides the foundational building blocks for constructing, training, and deploying deep learning
//! models in pure Rust, with an emphasis on safety, performance, and cross-platform GPU acceleration.
//!
//! ## Features
//!
//! - **Tensor Operations**: Comprehensive n-dimensional array operations with automatic broadcasting
//! - **Device Management**: Unified CPU/GPU abstraction with automatic memory management
//! - **Performance**: SIMD vectorization, parallel execution, and GPU compute kernels
//! - **Cross-Platform GPU**: WGPU-based GPU support (Metal, Vulkan, DirectX, WebGPU)
//! - **Advanced Optimizations**: Mixed precision, quantization, kernel fusion, memory pooling
//! - **Production Features**: Checkpointing, serialization, deterministic execution, profiling
//! - **SciRS2 Integration**: Built on the robust SciRS2 scientific computing ecosystem
//!
//! ## Quick Start
//!
//! ### Basic Tensor Creation and Operations
//!
//! ```rust,no_run
//! use tenflowers_core::{Tensor, Device};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Create tensors
//! let a = Tensor::<f32>::zeros(&[2, 3]);
//! let b = Tensor::<f32>::ones(&[2, 3]);
//!
//! // Arithmetic operations
//! let c = tenflowers_core::ops::add(&a, &b)?;
//! let d = tenflowers_core::ops::mul(&a, &b)?;
//!
//! // Matrix multiplication
//! let x = Tensor::<f32>::ones(&[2, 3]);
//! let y = Tensor::<f32>::ones(&[3, 4]);
//! let z = tenflowers_core::ops::matmul(&x, &y)?;
//! # Ok(())
//! # }
//! ```
//!
//! ### GPU Acceleration
//!
//! ```rust,ignore
//! use tenflowers_core::{Tensor, Device};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! # #[cfg(feature = "gpu")]
//! # {
//! // Create tensor on GPU
//! let device = Device::gpu(0)?;
//! let gpu_tensor = Tensor::<f32>::zeros(&[1000, 1000]).to_device(&device)?;
//!
//! // Operations automatically run on GPU
//! let result = tenflowers_core::ops::matmul(&gpu_tensor, &gpu_tensor)?;
//! # }
//! # Ok(())
//! # }
//! ```
//!
//! ### Advanced Features
//!
//! #### Mixed Precision Training
//!
//! ```rust,no_run
//! use tenflowers_core::{f16, Tensor};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Use f16 for faster training with less memory
//! let fp16_tensor = Tensor::<f16>::ones(&[1024, 1024]);
//! let result = tenflowers_core::ops::matmul(&fp16_tensor, &fp16_tensor)?;
//! # Ok(())
//! # }
//! ```
//!
//! #### Quantization
//!
//! ```rust,ignore
//! use tenflowers_core::{Tensor, quantize, QuantizationParams};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let tensor = Tensor::<f32>::ones(&[100, 100]);
//!
//! // Quantize to 8-bit for inference
//! let quantized = quantize(&tensor, 8)?;
//! # Ok(())
//! # }
//! ```
//!
//! #### Deterministic Execution
//!
//! ```rust,no_run
//! use tenflowers_core::{set_deterministic_mode, set_global_seed};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Enable deterministic mode for reproducible results
//! set_deterministic_mode(true);
//! set_global_seed(42);
//! # Ok(())
//! # }
//! ```
//!
//! ## Architecture Overview
//!
//! The crate is organized into the following modules (a short usage sketch follows the list):
//!
//! - [`tensor`]: Core tensor type with device placement and memory management
//! - [`ops`]: Tensor operations (arithmetic, linear algebra, neural network primitives)
//! - [`device`]: Device abstraction (CPU, GPU, custom accelerators)
//! - [`dtype`]: Data type system (f32, f64, f16, bf16, i32, etc.)
//! - [`shape`]: Shape inference and validation
//! - [`memory`]: Memory management, pooling, and optimization
//! - [`graph`]: Computation graph construction and optimization
//! - [`session`]: Graph execution engine
//! - [`quantization`]: Model quantization for deployment
//! - [`mixed_precision`]: Mixed precision training utilities
//! - [`checkpointing`]: Model checkpointing and restoration
//! - [`deterministic`]: Deterministic execution controls
//! - [`monitoring`]: Performance monitoring and profiling
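//!
//! As a quick illustration of how the [`tensor`] and [`shape`] modules fit together, the sketch
//! below creates a tensor and inspects its shape; it only uses APIs shown elsewhere in this
//! documentation:
//!
//! ```rust,no_run
//! use tenflowers_core::{Shape, Tensor};
//!
//! // Allocate a 2x3 tensor of zeros and confirm its shape.
//! let t = Tensor::<f32>::zeros(&[2, 3]);
//! assert_eq!(t.shape(), &Shape::from_slice(&[2, 3]));
//! ```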
//!
//! ## Performance Features
//!
//! ### SIMD Optimization
//!
//! The crate automatically uses SIMD instructions when the target CPU supports them:
//!
//! ```rust,ignore
//! use tenflowers_core::{Tensor, SimdCapabilities};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Check available SIMD features
//! let capabilities = SimdCapabilities::detect();
//! println!("SIMD support: {:?}", capabilities);
//!
//! // Operations automatically use SIMD when beneficial
//! let a = Tensor::<f32>::ones(&[10000]);
//! let b = Tensor::<f32>::ones(&[10000]);
//! let c = tenflowers_core::ops::add(&a, &b)?;
//! # Ok(())
//! # }
//! ```
//!
//! ### Memory Optimization
//!
//! ```rust,ignore
//! use tenflowers_core::{Tensor, Device};
//! use tenflowers_core::memory::{BufferPool, GlobalBufferPool};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Use buffer pooling for efficient memory reuse
//! let pool = GlobalBufferPool::get();
//! pool.set_max_pool_size(1024 * 1024 * 1024); // 1GB
//!
//! // Tensors automatically use the pool
//! let tensor = Tensor::<f32>::zeros(&[1000, 1000]);
//! # Ok(())
//! # }
//! ```
//!
//! ## Integration with TenfloweRS Ecosystem
//!
//! This crate integrates seamlessly with:
//! - `tenflowers-autograd`: Automatic differentiation engine
//! - `tenflowers-neural`: High-level neural network layers
//! - `tenflowers-dataset`: Data loading and preprocessing
//! - `scirs2-core`: Scientific computing primitives
//! - `scirs2-autograd`: Static graph optimization
//!
//! ## GPU Support
//!
//! TenfloweRS Core uses WGPU for cross-platform GPU acceleration, supporting:
//! - **Metal** (macOS, iOS)
//! - **Vulkan** (Windows, Linux, Android)
//! - **DirectX 12** (Windows)
//! - **WebGPU** (browsers)
//!
//! Enable GPU support with the `gpu` feature flag:
//!
//! ```toml
//! [dependencies]
//! tenflowers-core = { version = "0.1.1", features = ["gpu"] }
//! ```
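//!
//! With the feature flag in place, user code can gate device selection at compile time. The
//! sketch below is illustrative only: `Device::gpu(0)` appears earlier in this documentation,
//! while `Device::cpu()` is assumed here for the fallback path.
//!
//! ```rust,ignore
//! use tenflowers_core::{Device, Tensor};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Prefer the GPU when the `gpu` feature is enabled, otherwise stay on the CPU.
//! #[cfg(feature = "gpu")]
//! let device = Device::gpu(0)?;
//! #[cfg(not(feature = "gpu"))]
//! let device = Device::cpu(); // assumed constructor; adjust to the actual API
//!
//! let tensor = Tensor::<f32>::zeros(&[128, 128]).to_device(&device)?;
//! # Ok(())
//! # }
//! ```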
//!
//! ## Safety and Correctness
//!
//! TenfloweRS Core is designed with safety as a primary concern:
//! - Memory-safe by default (no unsafe code in core tensor operations)
//! - Extensive shape validation and error handling (see the sketch below)
//! - Gradient checking utilities for numerical correctness
//! - Deterministic execution modes for reproducibility
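//!
//! For example, operations return a [`Result`] rather than panicking when shapes are
//! incompatible. The sketch below shows the intended error-handling pattern; the exact error
//! message is an assumption:
//!
//! ```rust,ignore
//! use tenflowers_core::Tensor;
//!
//! let a = Tensor::<f32>::ones(&[2, 3]);
//! let b = Tensor::<f32>::ones(&[4, 5]);
//!
//! // These shapes do not broadcast, so the operation reports an error instead of panicking.
//! match tenflowers_core::ops::add(&a, &b) {
//!     Ok(_) => println!("unexpected success"),
//!     Err(e) => println!("shape error: {e}"),
//! }
//! ```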
//!
//! ## Performance Benchmarking
//!
//! Use the built-in benchmarking utilities to measure performance:
//!
//! ```rust,ignore
//! use tenflowers_core::{Tensor, Device};
//! use tenflowers_core::profiling::Profiler;
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let profiler = Profiler::new();
//! profiler.start("matmul");
//!
//! let a = Tensor::<f32>::ones(&[1000, 1000]);
//! let b = Tensor::<f32>::ones(&[1000, 1000]);
//! let c = tenflowers_core::ops::matmul(&a, &b)?;
//!
//! profiler.stop("matmul");
//! profiler.print_summary();
//! # Ok(())
//! # }
//! ```

#![cfg_attr(not(feature = "std"), no_std)]
#![allow(clippy::result_large_err)]
// Allow common patterns in GPU code that clippy flags
#![allow(clippy::needless_borrow)]
#![allow(clippy::redundant_closure)]
#![allow(clippy::collapsible_if)]
#![allow(clippy::manual_range_contains)]
#![allow(clippy::match_like_matches_macro)]
#![allow(clippy::upper_case_acronyms)]

pub mod adaptive_tuning;
#[cfg(feature = "gpu")]
pub mod async_gpu_optimizations;
pub mod buffer;
pub mod checkpointing;
pub mod collective;
pub mod complex;
pub mod context;
pub mod cross_platform_optimization;
pub mod deployment;
pub mod deterministic;
pub mod device;
pub mod dispatch_init;
pub mod dispatch_registry;
pub mod dispatch_registry_examples;
pub mod dispatch_registry_extended;
pub mod dtype;
pub mod eager_execution;
pub mod error;
pub mod fallback;
pub mod gpu_memory_metrics;
pub mod gpu_stub;
pub mod gradient_clipping;
pub mod gradient_coverage_audit;
pub mod gradient_validation_framework;
pub mod graph;
pub mod half_precision;
pub mod integration;
pub mod large_model_optimization;
pub mod layout;
pub mod memory;
pub mod memory_tensorflow_comparison;
pub mod mixed_precision;
pub mod monitoring;
pub mod neural_optimization;
pub mod numerical_gradient;
pub mod onnx_interop;
pub mod ops;
pub mod performance_benchmarks;
pub mod performance_gates;
pub mod production_benchmarks;
pub mod production_performance_monitoring;
pub mod quantization;
#[cfg(feature = "serialize")]
pub mod serialization;
#[cfg(feature = "serialize")]
pub mod serialization_onnx;
pub mod session;
pub mod shape;
pub mod shape_error_taxonomy;
pub mod simd;
pub mod simplified_benchmarks;
pub mod strided;
pub mod structured_arrays;
pub mod system_health;
pub mod tensor;
pub mod tensor_view;
pub mod ultra_performance_profiler;
pub mod wasm;
pub mod wasm_optimization;
// pub mod benchmarks;  // Temporarily disabled due to compilation issues

pub use complex::{Complex32, Complex64};
pub use device::Device;
pub use dtype::{dtype_from_type, DType};
pub use error::{Result, TensorError};
pub use fallback::{
    cleanup_memory_and_retry, execute_binary_op_with_fallback, execute_unary_op_with_fallback,
    get_fallback_config, is_auto_fallback_enabled, set_auto_fallback_enabled, set_fallback_config,
    FallbackConfig, FallbackWrapper,
};
pub use half_precision::{
    bf16, f16, HalfPrecision, MixedPrecisionConfig as HalfMixedPrecisionConfig,
};
pub use integration::{
    BaselinePerformance, OptimizationBreakdown, PerformanceTargets, UltraPerformanceValidator,
    ValidationReport, ValidationResult, ValidationTestSuite,
};
pub use layout::{convert_layout, infer_layout, DataLayout, LayoutOptimizer, OperationType};
pub use quantization::{
    dequantize, dynamic_quantize, fake_quantize, per_channel_quantize, quantize, QuantizationParams,
};
pub use shape::Shape;
pub use shape_error_taxonomy::{
    validate_broadcast_shapes, validate_elementwise_shapes, validate_matmul_shapes,
    validate_reduction_axis, validate_reshape, ShapeErrorBuilder, ShapeErrorCategory,
    ShapeErrorUtils,
};
#[cfg(feature = "simd")]
pub use simd::{benchmarks::Benchmarks as simd_benchmarks, SimdCapabilities, SimdOptimizer};
pub use simd::{
    global_simd_engine, AdvancedKernelRegistry, CacheFriendlyMatMul, CacheOptimizedTensorOps,
    ConvolutionParams, CpuFeatures, ElementWiseOp, KernelOptimizationStrategy, MemoryAccessPattern,
    ReductionOp as SimdReductionOp, SimdEngineConfig, SpecializedKernel, UltraSimdEngine,
};
pub use tensor::Tensor;
// pub use deployment::{GraphFreezer, GraphFreezingConfig, GraphFreezingStats, freeze_graph_for_inference, freeze_graph_with_config};
pub use adaptive_tuning::{
    execute_with_adaptive_tuning, AdaptiveTuner, ExecutionStrategy, OperationMetrics,
    PerformancePredictor, GLOBAL_TUNER,
};
#[cfg(feature = "gpu")]
pub use async_gpu_optimizations::{
    utils as async_gpu_utils, AccessPattern, AsyncGpuOperation, AsyncGpuScheduler,
    AsyncMatMulOperation, ComputeIntensity, OperationPriority,
    PerformanceMetrics as AsyncPerformanceMetrics,
};
pub use collective::{
    all_gather, all_reduce, broadcast, create_process_group, init_collective, CollectiveManager,
    CollectiveOp, CommunicationGroup, ReductionOp,
};
pub use context::{get_context, set_context, Context};
pub use cross_platform_optimization::{
    get_global_optimizer, get_optimal_configuration, initialize_cross_platform_optimizer,
    CrossPlatformOptimizer, OptimalConfiguration, TargetArchitecture, TargetPlatform,
};
pub use deterministic::{
    clear_operation_log, get_global_seed, get_operation_log, get_operation_seed,
    get_state_snapshot, is_deterministic_mode, is_strict_mode, mark_non_deterministic,
    reset_operation_counter, restore_state_snapshot, set_deterministic_mode, set_global_seed,
    set_strict_mode, should_use_deterministic_gpu_ops, DeterministicConfig, DeterministicScope,
    DeterministicSnapshot, DeterministicState,
};
pub use dispatch_init::ensure_initialized as ensure_dispatch_initialized;
pub use dispatch_registry::{
    get_registry, BackendType, BinaryKernelFn, DispatchBenchmarkResult, DispatchRegistry,
    KernelImplementation, OperationDescriptor, UnaryKernelFn, F32_REGISTRY, F64_REGISTRY,
    I32_REGISTRY,
};
pub use eager_execution::{
    CacheStatistics, EagerExecutionConfig, EagerExecutionEngine, EagerPerformanceReport,
    ExecutionMetrics, EAGER_ENGINE,
};
pub use gpu_memory_metrics::{
    generate_memory_report, get_gpu_memory_snapshot, get_gpu_memory_usage, get_gpu_peak_memory,
    print_memory_report, reset_gpu_memory_metrics, GpuMemoryMetrics, GpuMemoryReport,
    GpuMemorySnapshot, GPU_MEMORY_METRICS,
};
pub use gradient_clipping::{
    GradientClipper, GradientClippingConfig, GradientStatistics, NormType,
};
pub use graph::{
    AttributeValue, AttributeValueDef, EdgeId, Graph, GraphDef, GraphEdge, GraphNode, NodeDef,
    NodeId, NodeType,
};
pub use large_model_optimization::{
    LargeModelConfig, LargeModelOptimizationReport, LargeModelOptimizer, MemoryOptimizationStats,
    ModelExecutionPlan, LARGE_MODEL_OPTIMIZER,
};
#[cfg(feature = "gpu")]
pub use memory::DiagnosticMemoryPool;
pub use memory::{
    global_monitor, global_monitor_arc, IntegratedDiagnosticReport, KernelOccupancyStats,
    MemoryAliasDetector, MemoryPool, MemoryPoolStats, MultiStreamMemoryManager, OperationTimer,
    OptimizationResult, PerformanceMonitor, PoolHealthMetrics, PoolHealthStatus,
    PoolOptimizationConfig, StridedView,
};
pub use memory_tensorflow_comparison::{
    MemoryComparisonReport, MemoryOptimizationSuggestion, MemoryProfilingConfig, MemorySnapshot,
    TensorFlowMemoryProfiler, MEMORY_PROFILER,
};
pub use mixed_precision::{
    disable_autocast, enable_autocast, enable_autocast_bfloat16, from_bfloat16_f32,
    from_bfloat16_f64, from_half, from_half_f32, from_half_f64, to_bfloat16_f32, to_bfloat16_f64,
    to_half, to_half_f32, to_half_f64, AutocastContext, GradientScaler, MixedPrecisionConfig,
    MixedPrecisionState,
};
pub use monitoring::{
    AlertSeverity,
    // Analytics and trends
    BottleneckType,
    MonitoringConfig as UltraMonitoringConfig,
    MonitoringReport,
    // Metrics
    OperationMetrics as MonitoringOperationMetrics,
    OptimizationOpportunity,
    PerformanceAlert,
    PerformanceDashboard,
    PerformancePrediction,
    // Renamed to avoid a conflict with adaptive_tuning::PerformancePredictor
    PerformancePredictor as MonitoringPerformancePredictor,
    PerformanceSnapshot,
    SystemBottleneck,
    SystemMetrics,
    TrendDirection,
    TrendType,
    // Core monitoring components
    UltraPerformanceMonitor,
};
pub use neural_optimization::{
    LayerPerformanceMetrics, NetworkPerformanceReport,
    OptimizationBreakdown as NeuralOptimizationBreakdown, UltraOptimizedActivations,
    UltraOptimizedDenseLayer, UltraOptimizedNeuralNetwork,
};
pub use onnx_interop::{
    OnnxConfig,
    OnnxExporter,
    OnnxImporter,
    OnnxModel,
    // NOTE(v0.2): Add back when implemented: utils as onnx_utils, BenchmarkStats, CompatibilityReport, TenfloweRSModel
};
pub use ops::{
    execute_fused_graph, get_fusion_stats, infer_binary_elementwise,
    infer_binary_elementwise_validated, infer_concat, infer_conv2d, infer_matmul, infer_reduction,
    infer_reshape, print_framework_comparison_results, print_fusion_report,
    record_fusion_opportunity, reset_fusion_stats, run_framework_comparison_benchmark,
    BroadcastableConstraint, ElementwiseOpType, ExactShapeConstraint, FrameworkBenchmarkConfig,
    FrameworkComparisonResult, FusionGraph, FusionNode, FusionPassBuilder, FusionStats,
    MatMulCompatibleConstraint, MinRankConstraint, RankConstraint, ShapeConstraint, ShapeContext,
    ShapeValidator,
};
pub use performance_gates::{
    get_baseline, list_baselines, register_baseline, OperationBaseline, PerformanceGate,
    PerformanceGateSuite, PerformanceMeasurement,
};
pub use production_benchmarks::{
    run_comprehensive_production_benchmarks, BenchmarkConfig, BenchmarkResult,
    BenchmarkSummary as ProductionBenchmarkSummary,
    OptimizationBreakdown as ProductionOptimizationBreakdown, ProblemSize,
    ProductionBenchmarkReport, ProductionBenchmarkSuite, QualityMetrics,
};
pub use production_performance_monitoring::{
    get_global_monitor, initialize_performance_monitoring, record_performance_event,
    AlertThresholds, MonitoringConfig, PerformanceEvent, PerformanceMetrics,
    ProductionPerformanceMonitor,
};
pub use session::{create_session, DefaultSession, FeedDict, FetchSpec, Session, SessionConfig};
pub use simplified_benchmarks::{
    run_simple_benchmarks, validate_optimizations, BenchmarkReport, BenchmarkSummary,
    SimpleBenchmarkConfig, SimpleBenchmarkResult, SimpleBenchmarkSuite,
};
pub use strided::{SliceParams, StridedLayout};
pub use structured_arrays::{FieldDescriptor, FieldValue, StructuredArray};
pub use system_health::{
    run_quick_health_check, run_system_health_check, FeaturesInfo, GpuMemoryInfo,
    HealthCheckConfig, HealthStatus, MemoryInfo, PerformanceBenchmarks, SystemHealthChecker,
    SystemInfo,
};
pub use tensor_view::{MemoryStats, TensorView, TensorViewOps};
pub use wasm::{utils as wasm_utils, WasmContext};
#[cfg(target_arch = "wasm32")]
pub use wasm::{WasmContextWithGpu, WasmWebGpuContext, WebGpuBackend, WebGpuLimits};
#[cfg(feature = "wasm")]
pub use wasm_optimization::{
    WasmBundleOptimizer, WasmEdgeInference, WasmMemoryManager, WasmOptimizationConfig,
    WasmOptimizedTensor, WasmTensorOperations,
};

#[cfg(feature = "gpu")]
pub use gpu_profiler::{
    disable_gpu_profiling, enable_gpu_profiling, generate_gpu_profiling_report,
    get_gpu_profiling_stats, global_profiler, GpuProfiler, OperationProfile, ProfileStats,
};

#[cfg(feature = "gpu")]
pub use gpu::memory_diagnostics::{
    check_gpu_memory_leaks, print_gpu_diagnostics, run_gpu_diagnostics, DiagnosticReport,
    DiagnosticsConfig, FragmentationAnalysis, GpuMemoryDiagnostics, LeakDetectionResult,
    OperationProfile as MemoryOperationProfile, GLOBAL_GPU_DIAGNOSTICS,
};

#[cfg(feature = "gpu")]
pub use gpu::memory_tracing::{
    current_gpu_memory_usage, generate_gpu_memory_report, peak_gpu_memory_usage,
    print_gpu_memory_report, record_gpu_allocation, record_gpu_deallocation, AllocationInfo,
    GpuMemoryTracker, MemoryReport, MemoryTracingConfig, GLOBAL_GPU_MEMORY_TRACKER,
};

#[cfg(feature = "gpu")]
pub mod gpu;

#[cfg(feature = "gpu")]
pub mod gpu_profiler;

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_tensor_creation() {
        let tensor = Tensor::<f32>::zeros(&[2, 3]);
        assert_eq!(tensor.shape(), &Shape::from_slice(&[2, 3]));
    }
}
pub mod shape_inference_helpers;