// tenflowers_core/lib.rs
1//! # TenfloweRS Core
2//!
3//! The core tensor operations and device management library for the TenfloweRS machine learning framework.
4//! This crate provides the foundational building blocks for building, training, and deploying deep learning
5//! models in pure Rust with safety, performance, and cross-platform GPU acceleration.
6//!
7//! ## Features
8//!
9//! - **Tensor Operations**: Comprehensive n-dimensional array operations with automatic broadcasting
10//! - **Device Management**: Unified CPU/GPU abstraction with automatic memory management
11//! - **Performance**: SIMD vectorization, parallel execution, and GPU compute kernels
12//! - **Cross-Platform GPU**: WGPU-based GPU support (Metal, Vulkan, DirectX, WebGPU)
13//! - **Advanced Optimizations**: Mixed precision, quantization, kernel fusion, memory pooling
14//! - **Production Features**: Checkpointing, serialization, deterministic execution, profiling
15//! - **SciRS2 Integration**: Built on the robust SciRS2 scientific computing ecosystem
16//!
17//! ## Quick Start
18//!
19//! ### Basic Tensor Creation and Operations
20//!
21//! ```rust,no_run
22//! use tenflowers_core::{Tensor, Device};
23//!
24//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
25//! // Create tensors
26//! let a = Tensor::<f32>::zeros(&[2, 3]);
27//! let b = Tensor::<f32>::ones(&[2, 3]);
28//!
29//! // Arithmetic operations
30//! let c = tenflowers_core::ops::add(&a, &b)?;
31//! let d = tenflowers_core::ops::mul(&a, &b)?;
32//!
33//! // Matrix multiplication
34//! let x = Tensor::<f32>::ones(&[2, 3]);
35//! let y = Tensor::<f32>::ones(&[3, 4]);
36//! let z = tenflowers_core::ops::matmul(&x, &y)?;
37//! # Ok(())
38//! # }
39//! ```
40//!
41//! ### GPU Acceleration
42//!
43//! ```rust,ignore
44//! use tenflowers_core::{Tensor, Device};
45//!
46//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
47//! # #[cfg(feature = "gpu")]
48//! # {
49//! // Create tensor on GPU
50//! let device = Device::gpu(0)?;
51//! let gpu_tensor = Tensor::<f32>::zeros(&[1000, 1000]).to_device(&device)?;
52//!
53//! // Operations automatically run on GPU
54//! let result = tenflowers_core::ops::matmul(&gpu_tensor, &gpu_tensor)?;
55//! # }
56//! # Ok(())
57//! # }
58//! ```
59//!
60//! ### Advanced Features
61//!
62//! #### Mixed Precision Training
63//!
64//! ```rust,no_run
65//! use tenflowers_core::{Tensor, f16, MixedPrecisionConfig};
66//!
67//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
68//! // Use f16 for faster training with less memory
69//! let fp16_tensor = Tensor::<f16>::ones(&[1024, 1024]);
70//! let result = tenflowers_core::ops::matmul(&fp16_tensor, &fp16_tensor)?;
71//! # Ok(())
72//! # }
73//! ```
74//!
75//! #### Quantization
76//!
77//! ```rust,ignore
78//! use tenflowers_core::{Tensor, quantize, QuantizationParams};
79//!
80//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
81//! let tensor = Tensor::<f32>::ones(&[100, 100]);
82//!
83//! // Quantize to 8-bit for inference
84//! let quantized = quantize(&tensor, 8)?;
85//! # Ok(())
86//! # }
87//! ```
88//!
89//! #### Deterministic Execution
90//!
91//! ```rust,no_run
92//! use tenflowers_core::{set_deterministic_mode, set_global_seed};
93//!
94//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
95//! // Enable deterministic mode for reproducible results
96//! set_deterministic_mode(true);
97//! set_global_seed(42);
98//! # Ok(())
99//! # }
100//! ```
101//!
102//! ## Architecture Overview
103//!
104//! The crate is organized into the following modules:
105//!
106//! - [`tensor`]: Core tensor type with device placement and memory management
107//! - [`ops`]: Tensor operations (arithmetic, linear algebra, neural network primitives)
108//! - [`device`]: Device abstraction (CPU, GPU, custom accelerators)
109//! - [`dtype`]: Data type system (f32, f64, f16, bf16, i32, etc.)
110//! - [`shape`]: Shape inference and validation
111//! - [`memory`]: Memory management, pooling, and optimization
112//! - [`graph`]: Computation graph construction and optimization
113//! - [`session`]: Graph execution engine
114//! - [`quantization`]: Model quantization for deployment
115//! - [`mixed_precision`]: Mixed precision training utilities
116//! - [`checkpointing`]: Model checkpointing and restoration
117//! - [`deterministic`]: Deterministic execution controls
118//! - [`monitoring`]: Performance monitoring and profiling
119//!
120//! ## Performance Features
121//!
122//! ### SIMD Optimization
123//!
124//! The crate automatically uses SIMD instructions when available for maximum performance:
125//!
126//! ```rust,ignore
127//! use tenflowers_core::{Tensor, SimdCapabilities};
128//!
129//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
130//! // Check available SIMD features
131//! let capabilities = SimdCapabilities::detect();
132//! println!("SIMD support: {:?}", capabilities);
133//!
134//! // Operations automatically use SIMD when beneficial
135//! let a = Tensor::<f32>::ones(&[10000]);
136//! let b = Tensor::<f32>::ones(&[10000]);
137//! let c = tenflowers_core::ops::add(&a, &b)?;
138//! # Ok(())
139//! # }
140//! ```
141//!
142//! ### Memory Optimization
143//!
144//! ```rust,ignore
145//! use tenflowers_core::{Tensor, Device};
146//! use tenflowers_core::memory::{BufferPool, GlobalBufferPool};
147//!
148//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
149//! // Use buffer pooling for efficient memory reuse
150//! let pool = GlobalBufferPool::get();
151//! pool.set_max_pool_size(1024 * 1024 * 1024); // 1GB
152//!
153//! // Tensors automatically use the pool
154//! let tensor = Tensor::<f32>::zeros(&[1000, 1000]);
155//! # Ok(())
156//! # }
157//! ```
158//!
159//! ## Integration with TenfloweRS Ecosystem
160//!
161//! This crate integrates seamlessly with:
162//! - `tenflowers-autograd`: Automatic differentiation engine
163//! - `tenflowers-neural`: High-level neural network layers
164//! - `tenflowers-dataset`: Data loading and preprocessing
165//! - `scirs2-core`: Scientific computing primitives
166//! - `scirs2-autograd`: Static graph optimization
167//!
168//! ## GPU Support
169//!
170//! TenfloweRS Core uses WGPU for cross-platform GPU acceleration, supporting:
171//! - **Metal** (macOS, iOS)
172//! - **Vulkan** (Windows, Linux, Android)
173//! - **DirectX 12** (Windows)
174//! - **WebGPU** (browsers)
175//!
176//! Enable GPU support with the `gpu` feature flag:
177//!
178//! ```toml
179//! [dependencies]
180//! tenflowers-core = { version = "0.1.0", features = ["gpu"] }
181//! ```
182//!
183//! ## Safety and Correctness
184//!
185//! TenfloweRS Core is designed with safety as a primary concern:
186//! - Memory-safe by default (no unsafe code in core tensor operations)
187//! - Extensive shape validation and error handling
188//! - Gradient checking utilities for numerical correctness
189//! - Deterministic execution modes for reproducibility
190//!
191//! ## Performance Benchmarking
192//!
193//! Use the built-in benchmarking utilities to measure performance:
194//!
195//! ```rust,ignore
196//! use tenflowers_core::{Tensor, Device};
197//! use tenflowers_core::profiling::Profiler;
198//!
199//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
200//! let profiler = Profiler::new();
201//! profiler.start("matmul");
202//!
203//! let a = Tensor::<f32>::ones(&[1000, 1000]);
204//! let b = Tensor::<f32>::ones(&[1000, 1000]);
205//! let c = tenflowers_core::ops::matmul(&a, &b)?;
206//!
207//! profiler.stop("matmul");
208//! profiler.print_summary();
209//! # Ok(())
210//! # }
211//! ```
212
// Build without the standard library when the `std` feature is disabled
// (e.g. embedded or WASM targets).
#![cfg_attr(not(feature = "std"), no_std)]
// NOTE(review): presumably allowed because `Result<_, TensorError>` carries a
// large error enum and boxing it would churn every fallible API — confirm.
#![allow(clippy::result_large_err)]
// Allow common patterns in GPU code that clippy flags
#![allow(clippy::needless_borrow)]
#![allow(clippy::redundant_closure)]
#![allow(clippy::collapsible_if)]
#![allow(clippy::manual_range_contains)]
#![allow(clippy::match_like_matches_macro)]
#![allow(clippy::upper_case_acronyms)]
222
223pub mod adaptive_tuning;
224#[cfg(feature = "gpu")]
225pub mod async_gpu_optimizations;
226pub mod buffer;
227pub mod checkpointing;
228pub mod collective;
229pub mod complex;
230pub mod context;
231pub mod cross_platform_optimization;
232pub mod deployment;
233pub mod deterministic;
234pub mod device;
235pub mod dispatch_init;
236pub mod dispatch_registry;
237pub mod dispatch_registry_examples;
238pub mod dispatch_registry_extended;
239pub mod dtype;
240pub mod eager_execution;
241pub mod error;
242pub mod fallback;
243pub mod gpu_memory_metrics;
244pub mod gpu_stub;
245pub mod gradient_clipping;
246pub mod gradient_coverage_audit;
247pub mod gradient_validation_framework;
248pub mod graph;
249pub mod half_precision;
250pub mod integration;
251pub mod large_model_optimization;
252pub mod layout;
253pub mod memory;
254pub mod memory_tensorflow_comparison;
255pub mod mixed_precision;
256pub mod monitoring;
257pub mod neural_optimization;
258pub mod numerical_gradient;
259pub mod onnx_interop;
260pub mod ops;
261pub mod performance_benchmarks;
262pub mod performance_gates;
263pub mod production_benchmarks;
264pub mod production_performance_monitoring;
265pub mod quantization;
266#[cfg(feature = "serialize")]
267pub mod serialization;
268#[cfg(feature = "serialize")]
269pub mod serialization_onnx;
270pub mod session;
271pub mod shape;
272pub mod shape_error_taxonomy;
273pub mod simd;
274pub mod simplified_benchmarks;
275pub mod strided;
276pub mod structured_arrays;
277pub mod system_health;
278pub mod tensor;
279pub mod tensor_view;
280pub mod ultra_performance_profiler;
281pub mod wasm;
282pub mod wasm_optimization;
283// pub mod benchmarks;  // Temporarily disabled due to compilation issues
284
285pub use complex::{Complex32, Complex64};
286pub use device::Device;
287pub use dtype::{dtype_from_type, DType};
288pub use error::{Result, TensorError};
289pub use fallback::{
290    cleanup_memory_and_retry, execute_binary_op_with_fallback, execute_unary_op_with_fallback,
291    get_fallback_config, is_auto_fallback_enabled, set_auto_fallback_enabled, set_fallback_config,
292    FallbackConfig, FallbackWrapper,
293};
294pub use half_precision::{
295    bf16, f16, HalfPrecision, MixedPrecisionConfig as HalfMixedPrecisionConfig,
296};
297pub use integration::{
298    BaselinePerformance, OptimizationBreakdown, PerformanceTargets, UltraPerformanceValidator,
299    ValidationReport, ValidationResult, ValidationTestSuite,
300};
301pub use layout::{convert_layout, infer_layout, DataLayout, LayoutOptimizer, OperationType};
302pub use quantization::{
303    dequantize, dynamic_quantize, fake_quantize, per_channel_quantize, quantize, QuantizationParams,
304};
305pub use shape::Shape;
306pub use shape_error_taxonomy::{
307    validate_broadcast_shapes, validate_elementwise_shapes, validate_matmul_shapes,
308    validate_reduction_axis, validate_reshape, ShapeErrorBuilder, ShapeErrorCategory,
309    ShapeErrorUtils,
310};
311#[cfg(feature = "simd")]
312pub use simd::{benchmarks::Benchmarks as simd_benchmarks, SimdCapabilities, SimdOptimizer};
313pub use simd::{
314    global_simd_engine, AdvancedKernelRegistry, CacheFriendlyMatMul, CacheOptimizedTensorOps,
315    ConvolutionParams, CpuFeatures, ElementWiseOp, KernelOptimizationStrategy, MemoryAccessPattern,
316    ReductionOp as SimdReductionOp, SimdEngineConfig, SpecializedKernel, UltraSimdEngine,
317};
318pub use tensor::Tensor;
319// pub use deployment::{GraphFreezer, GraphFreezingConfig, GraphFreezingStats, freeze_graph_for_inference, freeze_graph_with_config};
320pub use adaptive_tuning::{
321    execute_with_adaptive_tuning, AdaptiveTuner, ExecutionStrategy, OperationMetrics,
322    PerformancePredictor, GLOBAL_TUNER,
323};
324#[cfg(feature = "gpu")]
325pub use async_gpu_optimizations::{
326    utils as async_gpu_utils, AccessPattern, AsyncGpuOperation, AsyncGpuScheduler,
327    AsyncMatMulOperation, ComputeIntensity, OperationPriority,
328    PerformanceMetrics as AsyncPerformanceMetrics,
329};
330pub use collective::{
331    all_gather, all_reduce, broadcast, create_process_group, init_collective, CollectiveManager,
332    CollectiveOp, CommunicationGroup, ReductionOp,
333};
334pub use context::{get_context, set_context, Context};
335pub use cross_platform_optimization::{
336    get_global_optimizer, get_optimal_configuration, initialize_cross_platform_optimizer,
337    CrossPlatformOptimizer, OptimalConfiguration, TargetArchitecture, TargetPlatform,
338};
339pub use deterministic::{
340    clear_operation_log, get_global_seed, get_operation_log, get_operation_seed,
341    get_state_snapshot, is_deterministic_mode, is_strict_mode, mark_non_deterministic,
342    reset_operation_counter, restore_state_snapshot, set_deterministic_mode, set_global_seed,
343    set_strict_mode, should_use_deterministic_gpu_ops, DeterministicConfig, DeterministicScope,
344    DeterministicSnapshot, DeterministicState,
345};
346pub use dispatch_init::ensure_initialized as ensure_dispatch_initialized;
347pub use dispatch_registry::{
348    get_registry, BackendType, BinaryKernelFn, DispatchBenchmarkResult, DispatchRegistry,
349    KernelImplementation, OperationDescriptor, UnaryKernelFn, F32_REGISTRY, F64_REGISTRY,
350    I32_REGISTRY,
351};
352pub use eager_execution::{
353    CacheStatistics, EagerExecutionConfig, EagerExecutionEngine, EagerPerformanceReport,
354    ExecutionMetrics, EAGER_ENGINE,
355};
356pub use gpu_memory_metrics::{
357    generate_memory_report, get_gpu_memory_snapshot, get_gpu_memory_usage, get_gpu_peak_memory,
358    print_memory_report, reset_gpu_memory_metrics, GpuMemoryMetrics, GpuMemoryReport,
359    GpuMemorySnapshot, GPU_MEMORY_METRICS,
360};
361pub use gradient_clipping::{
362    GradientClipper, GradientClippingConfig, GradientStatistics, NormType,
363};
364pub use graph::{
365    AttributeValue, AttributeValueDef, EdgeId, Graph, GraphDef, GraphEdge, GraphNode, NodeDef,
366    NodeId, NodeType,
367};
368pub use large_model_optimization::{
369    LargeModelConfig, LargeModelOptimizationReport, LargeModelOptimizer, MemoryOptimizationStats,
370    ModelExecutionPlan, LARGE_MODEL_OPTIMIZER,
371};
372#[cfg(feature = "gpu")]
373pub use memory::DiagnosticMemoryPool;
374pub use memory::{
375    global_monitor, global_monitor_arc, IntegratedDiagnosticReport, KernelOccupancyStats,
376    MemoryAliasDetector, MemoryPool, MemoryPoolStats, MultiStreamMemoryManager, OperationTimer,
377    OptimizationResult, PerformanceMonitor, PoolHealthMetrics, PoolHealthStatus,
378    PoolOptimizationConfig, StridedView,
379};
380pub use memory_tensorflow_comparison::{
381    MemoryComparisonReport, MemoryOptimizationSuggestion, MemoryProfilingConfig, MemorySnapshot,
382    TensorFlowMemoryProfiler, MEMORY_PROFILER,
383};
384pub use mixed_precision::{
385    disable_autocast, enable_autocast, enable_autocast_bfloat16, from_bfloat16_f32,
386    from_bfloat16_f64, from_half, from_half_f32, from_half_f64, to_bfloat16_f32, to_bfloat16_f64,
387    to_half, to_half_f32, to_half_f64, AutocastContext, GradientScaler, MixedPrecisionConfig,
388    MixedPrecisionState,
389};
390pub use monitoring::{
391    AlertSeverity,
392    // Analytics and trends
393    BottleneckType,
394    MonitoringConfig as UltraMonitoringConfig,
395    MonitoringReport,
396    // Metrics
397    OperationMetrics as MonitoringOperationMetrics,
398    OptimizationOpportunity,
399    PerformanceAlert,
400    PerformanceDashboard,
401
402    PerformancePrediction,
403
404    // Use different name to avoid conflict with adaptive_tuning::PerformancePredictor
405    PerformancePredictor as MonitoringPerformancePredictor,
406
407    PerformanceSnapshot,
408    SystemBottleneck,
409    SystemMetrics,
410    TrendDirection,
411    TrendType,
412    // Core monitoring components
413    UltraPerformanceMonitor,
414};
415pub use neural_optimization::{
416    LayerPerformanceMetrics, NetworkPerformanceReport,
417    OptimizationBreakdown as NeuralOptimizationBreakdown, UltraOptimizedActivations,
418    UltraOptimizedDenseLayer, UltraOptimizedNeuralNetwork,
419};
420pub use onnx_interop::{
421    OnnxConfig,
422    OnnxExporter,
423    OnnxImporter,
424    OnnxModel,
425    // NOTE(v0.2): Add back when implemented: utils as onnx_utils, BenchmarkStats, CompatibilityReport, TenfloweRSModel
426};
427pub use ops::{
428    execute_fused_graph, get_fusion_stats, infer_binary_elementwise,
429    infer_binary_elementwise_validated, infer_concat, infer_conv2d, infer_matmul, infer_reduction,
430    infer_reshape, print_framework_comparison_results, print_fusion_report,
431    record_fusion_opportunity, reset_fusion_stats, run_framework_comparison_benchmark,
432    BroadcastableConstraint, ElementwiseOpType, ExactShapeConstraint, FrameworkBenchmarkConfig,
433    FrameworkComparisonResult, FusionGraph, FusionNode, FusionPassBuilder, FusionStats,
434    MatMulCompatibleConstraint, MinRankConstraint, RankConstraint, ShapeConstraint, ShapeContext,
435    ShapeValidator,
436};
437pub use performance_gates::{
438    get_baseline, list_baselines, register_baseline, OperationBaseline, PerformanceGate,
439    PerformanceGateSuite, PerformanceMeasurement,
440};
441pub use production_benchmarks::{
442    run_comprehensive_production_benchmarks, BenchmarkConfig, BenchmarkResult,
443    BenchmarkSummary as ProductionBenchmarkSummary,
444    OptimizationBreakdown as ProductionOptimizationBreakdown, ProblemSize,
445    ProductionBenchmarkReport, ProductionBenchmarkSuite, QualityMetrics,
446};
447pub use production_performance_monitoring::{
448    get_global_monitor, initialize_performance_monitoring, record_performance_event,
449    AlertThresholds, MonitoringConfig, PerformanceEvent, PerformanceMetrics,
450    ProductionPerformanceMonitor,
451};
452pub use session::{create_session, DefaultSession, FeedDict, FetchSpec, Session, SessionConfig};
453pub use simplified_benchmarks::{
454    run_simple_benchmarks, validate_optimizations, BenchmarkReport, BenchmarkSummary,
455    SimpleBenchmarkConfig, SimpleBenchmarkResult, SimpleBenchmarkSuite,
456};
457pub use strided::{SliceParams, StridedLayout};
458pub use structured_arrays::{FieldDescriptor, FieldValue, StructuredArray};
459pub use system_health::{
460    run_quick_health_check, run_system_health_check, FeaturesInfo, GpuMemoryInfo,
461    HealthCheckConfig, HealthStatus, MemoryInfo, PerformanceBenchmarks, SystemHealthChecker,
462    SystemInfo,
463};
464pub use tensor_view::{MemoryStats, TensorView, TensorViewOps};
465pub use wasm::{utils as wasm_utils, WasmContext};
466#[cfg(target_arch = "wasm32")]
467pub use wasm::{WasmContextWithGpu, WasmWebGpuContext, WebGpuBackend, WebGpuLimits};
468#[cfg(feature = "wasm")]
469pub use wasm_optimization::{
470    WasmBundleOptimizer, WasmEdgeInference, WasmMemoryManager, WasmOptimizationConfig,
471    WasmOptimizedTensor, WasmTensorOperations,
472};
473
474#[cfg(feature = "gpu")]
475pub use gpu_profiler::{
476    disable_gpu_profiling, enable_gpu_profiling, generate_gpu_profiling_report,
477    get_gpu_profiling_stats, global_profiler, GpuProfiler, OperationProfile, ProfileStats,
478};
479
480#[cfg(feature = "gpu")]
481pub use gpu::memory_diagnostics::{
482    check_gpu_memory_leaks, print_gpu_diagnostics, run_gpu_diagnostics, DiagnosticReport,
483    DiagnosticsConfig, FragmentationAnalysis, GpuMemoryDiagnostics, LeakDetectionResult,
484    OperationProfile as MemoryOperationProfile, GLOBAL_GPU_DIAGNOSTICS,
485};
486
487#[cfg(feature = "gpu")]
488pub use gpu::memory_tracing::{
489    current_gpu_memory_usage, generate_gpu_memory_report, peak_gpu_memory_usage,
490    print_gpu_memory_report, record_gpu_allocation, record_gpu_deallocation, AllocationInfo,
491    GpuMemoryTracker, MemoryReport, MemoryTracingConfig, GLOBAL_GPU_MEMORY_TRACKER,
492};
493
494#[cfg(feature = "gpu")]
495pub mod gpu;
496
497#[cfg(feature = "gpu")]
498pub mod gpu_profiler;
499
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a zero-initialized tensor reports exactly the shape it was
    /// constructed with.
    #[test]
    fn test_basic_tensor_creation() {
        let zeros = Tensor::<f32>::zeros(&[2, 3]);
        let expected = Shape::from_slice(&[2, 3]);
        assert_eq!(zeros.shape(), &expected);
    }
}
510pub mod shape_inference_helpers;