// tenflowers_autograd/lib.rs
1//! # TenfloweRS Automatic Differentiation
2//!
3//! `tenflowers-autograd` provides a comprehensive automatic differentiation engine for the TenfloweRS
4//! machine learning framework. This crate implements both forward-mode and reverse-mode automatic
5//! differentiation with support for higher-order derivatives, custom gradients, and advanced
6//! optimization techniques.
7//!
8//! ## Features
9//!
10//! - **Complete Gradient Operations**: All fundamental tensor operations with mathematically correct gradients
11//! - **Higher-Order Derivatives**: Efficient computation of Hessians, third-order derivatives, and beyond
12//! - **Performance Optimization**: Kernel fusion, memory optimization, and distributed gradient computation
13//! - **Advanced Differentiation**: Mixed-mode AD, implicit differentiation, and custom gradient functions
14//! - **Neural Network Integration**: Seamless integration with tenflowers-neural for deep learning
15//! - **Distributed Training**: Parameter servers, gradient compression, and cross-datacenter replication
16//!
17//! ## Quick Start
18//!
19//! ```rust,no_run
20//! use tenflowers_autograd::{GradientTape, TrackedTensor};
21//! use tenflowers_core::{Tensor, Device};
22//!
23//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
24//! let device = Device::Cpu;
25//! let mut tape = GradientTape::new();
26//!
27//! // Create tracked tensors
28//! let x = tape.watch(Tensor::<f32>::ones(&[2, 2]));
29//! let y = tape.watch(Tensor::<f32>::ones(&[2, 2]));
30//!
31//! // Compute gradients using GradientTape::gradient
32//! let z = tape.watch(Tensor::<f32>::ones(&[2, 2])); // Placeholder for x+y result
33//! let gradients = tape.gradient(&[z], &[x, y])?;
34//! println!("Gradient of x: {:?}", gradients[0]);
35//! # Ok(())
36//! # }
37//! ```
38//!
39//! ## Advanced Usage
40//!
41//! ### Custom Gradients
42//!
43//! ```rust,no_run
44//! use tenflowers_autograd::{CustomGradientFunction, GradientTape};
45//! use tenflowers_core::{Tensor, Result};
46//!
47//! struct MyCustomOp;
48//!
49//! impl CustomGradientFunction<f32> for MyCustomOp {
50//!     fn forward(&self, inputs: &[&Tensor<f32>]) -> Result<Tensor<f32>> {
51//!         // Custom forward implementation: y = x^2 + sin(x)
52//!         let x = inputs[0];
53//!         let x_squared = tenflowers_core::ops::mul(x, x)?;
54//!         let sin_x = tenflowers_core::ops::sin(x)?;
55//!         tenflowers_core::ops::add(&x_squared, &sin_x)
56//!     }
57//!
58//!     fn backward(&self, grad_output: &Tensor<f32>, inputs: &[&Tensor<f32>], output: &Tensor<f32>) -> Result<Vec<Tensor<f32>>> {
59//!         // Custom backward implementation: dy/dx = 2x + cos(x)
60//!         let x = inputs[0];
61//!         let two = tenflowers_core::Tensor::from_array(scirs2_core::ndarray::arr0(2.0f32).into_dyn());
62//!         let two_x = tenflowers_core::ops::mul(&two, x)?;
63//!         let cos_x = tenflowers_core::ops::cos(x)?;
64//!         let grad_x = tenflowers_core::ops::add(&two_x, &cos_x)?;
65//!         let final_grad = tenflowers_core::ops::mul(grad_output, &grad_x)?;
66//!         Ok(vec![final_grad])
67//!     }
68//!
69//!     fn name(&self) -> &str {
70//!         "MyCustomOp"
71//!     }
72//! }
73//! ```
74//!
75//! ### Higher-Order Derivatives
76//!
77//! ```rust,no_run
78//! use tenflowers_autograd::{GradientTape, TrackedTensor};
79//! use tenflowers_core::Tensor;
80//!
81//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
82//! let mut tape = GradientTape::new();
83//! let x = TrackedTensor::new(Tensor::<f32>::ones(&[1]));
84//! let target = TrackedTensor::new(Tensor::<f32>::ones(&[1]));
85//!
86//! // Compute third-order derivatives
87//! let third_order = tape.third_derivative(&target, &x)?;
88//!
89//! // Compute nth-order derivatives
90//! let nth_order = tape.nth_derivative(&target, &x, 3)?;
91//! # Ok(())
92//! # }
93//! ```
94//!
95//! ## Performance Features
96//!
97//! - **Kernel Fusion**: Automatically fuses operations to reduce memory bandwidth
98//! - **Gradient Compression**: Quantization and sparsification for distributed training
99//! - **Memory Optimization**: Checkpointing and in-place operations for large models
100//! - **JIT Compilation**: Runtime kernel optimization for specific tensor shapes
101//!
102//! ## Integration with TenfloweRS Ecosystem
103//!
104//! This crate integrates seamlessly with:
105//! - `tenflowers-core`: Core tensor operations and device management
106//! - `tenflowers-neural`: Neural network layers and training loops
107//! - `tenflowers-dataset`: Data loading and preprocessing
108//! - `scirs2-autograd`: Static graph optimization and analysis
109
110#![deny(unsafe_code)]
111#![allow(clippy::result_large_err)]
112
113pub mod advanced_grad_ops;
114// NOTE(v0.2): advanced_linalg module planned but not yet implemented
115pub mod amp_policy;
116pub mod boolean_indexing;
117pub mod checkpointing;
118pub mod context;
119pub mod coverage_matrix;
120pub mod custom_gradients;
121pub mod debug;
122pub mod deterministic;
123pub mod device_placement;
124pub mod efficient_memory;
125pub mod ellipsis_newaxis;
126pub mod error_taxonomy;
127pub mod forward_ad;
128pub mod forward_reverse;
129pub mod global_pooling;
130pub mod gpu_gradient_expansion;
131pub mod grad_ops;
132pub mod gradient_accumulation;
133pub mod gradient_analyzer;
134pub mod gradient_buffer_manager_simple;
135pub mod gradient_compression;
136pub mod gradient_compression_advanced;
137pub mod gradient_ops;
138pub mod gradient_utils;
139// NOTE(v0.2): gradient_validation module planned but not yet implemented
140pub mod gradient_visualization;
141pub mod graph_optimization;
142pub mod higher_order;
143pub mod hybrid_scheduler;
144pub mod implicit_differentiation;
145pub mod inplace_ops;
146pub mod jit_compiler;
147pub mod jit_integration;
148pub mod kernel_fusion;
149pub mod memory_diff_reporter;
150pub mod memory_profiler;
151pub mod neural_integration;
152pub mod no_grad;
153pub mod numerical_checker;
154pub mod ops;
155#[cfg(feature = "parallel")]
156pub mod parallel_gradients;
157pub mod parameter_server;
158pub mod performance_benchmark;
159pub mod second_order;
160pub mod second_order_utils;
161pub mod simd_grad_ops_simple;
162pub mod special_functions;
163pub mod subgraph_extraction;
164pub mod tape;
165pub mod tape_optimization;
166pub mod tensor_ext;
167pub mod tensor_networks;
168pub mod ultra_gradient;
169pub mod ultra_gradient_engine_simple;
170
171pub use boolean_indexing::{
172    boolean_mask_backward, integer_array_indexing_backward, where_backward,
173};
174pub use checkpointing::{
175    checkpoint_sequence, ActivationCheckpointPolicy, ActivationCheckpointing,
176    ActivationRecomputeManager, CheckpointManager, CheckpointStrategy, CheckpointedFunction,
177    CheckpointedGradientTape, CheckpointingStats, LayerMetadata, RecomputationContext,
178};
179pub use context::{AutogradContext, ShapeInferenceRule, StaticShapeInference};
180pub use coverage_matrix::{
181    CategoryCoverage, CoverageMatrix, CoverageReport, OperationCategory, OperationMetadata,
182};
183pub use custom_gradients::{
184    CustomGradientFunction, CustomGradientOp, GradientClipFunction, GradientScaleFunction,
185    StopGradientFunction,
186};
187pub use debug::{GradientDebugInfo, GradientDebugger};
188pub use deterministic::{
189    clear_operation_seeds, get_global_seed, get_operation_seed, get_seeded_operation_count,
190    hash_tensor_data, is_deterministic, reset_deterministic_state, set_deterministic,
191    set_global_seed, set_operation_seed, DeterministicConfig, DeterministicContext,
192    DeterministicOperation, ReproducibilityChecker, ReproducibilityStats, SeedManager,
193};
194pub use device_placement::{
195    DevicePlacementConfig, DevicePlacementOptimizer, GraphOperation, PlacementDecision,
196    PlacementResult, PlacementStrategy,
197};
198pub use ellipsis_newaxis::{ellipsis_newaxis_backward, AdvancedIndexer, IndexSpec};
199pub use error_taxonomy::{
200    utils as error_utils, AutogradErrorBuilder, ErrorPatternValidator, GradientContext,
201    ValidationResult,
202};
203pub use forward_ad::{forward_ops, DualTensor, ForwardADContext, ForwardMode};
204pub use forward_reverse::{
205    ComplexityEstimate, DifferentiationMode, ForwardReverseConfig, ForwardReverseDifferentiator,
206};
207pub use global_pooling::{
208    adaptive_avg_pool2d_backward, adaptive_max_pool2d_backward,
209    fractional_adaptive_avg_pool2d_backward, global_avg_pool2d_backward,
210    global_max_pool2d_backward,
211};
212pub use gpu_gradient_expansion::{
213    GpuCategoryCoverage, GpuCoverageAnalysis, GpuGradInfo, GpuGradStatus, GpuGradientPlanner,
214    ImplementationPlan, ImplementationTask, Priority,
215};
216pub use grad_ops::{
217    batch_fused_activations_forward_backward, fused_gelu_forward_backward,
218    fused_log_softmax_forward_backward, fused_tanh_forward_backward,
219};
220#[cfg(feature = "parallel")]
221pub use gradient_accumulation::{
222    accumulate_gradients_distributed, DistributedGradientAccumulator, DistributedStats,
223};
224pub use gradient_accumulation::{accumulate_gradients_over_batch, GradientAccumulator};
225pub use gradient_buffer_manager_simple::{
226    global_gradient_buffer_manager, AllocationMetrics, EfficiencyMetrics, GradientBufferAllocation,
227    GradientBufferConfig, GradientBufferManager, GradientMemoryStatistics,
228    MemoryPressureStatistics,
229};
230pub use gradient_compression::{
231    CompressedGradient, CompressionConfig, CompressionMethod, CompressionStats, GradientCompressor,
232};
233pub use gradient_ops::{
234    accumulate_gradients, add_gradient_noise, average_gradients, clip_by_global_norm,
235    clip_by_value, compute_gradient_statistics, has_invalid_gradients, scale_gradients,
236    zero_gradients, GradientPipeline, GradientStatistics, NamedGradientAccumulator,
237};
238pub use gradient_visualization::{
239    ColorScheme, EdgeType, GradientFlowAnalysis, GradientFlowEdge, GradientFlowIssue,
240    GradientFlowNode, GradientFlowVisualizer, GradientStats, IssueType, LayoutAlgorithm, NodeType,
241    OutputFormat, Severity, ValueStats, VisualizationSettings,
242};
243pub use graph_optimization::{
244    CommunicationPlan, EnhancedGraphOptimizer, GradientFusion, GraphOptimizationConfig,
245    GraphOptimizationResult, MemoryOptimization,
246};
247pub use hybrid_scheduler::{
248    ExecutionStats, ExecutionSummary, GraphAnalysis, HybridScheduler, SchedulerConfig, StrategyCost,
249};
250pub use implicit_differentiation::{
251    FixedPointFunction, GradientInfo, ImplicitDiffConfig, ImplicitDifferentiator, ImplicitFunction,
252    OptimizationLayer,
253};
254pub use inplace_ops::{InPlaceOptimizer, InPlaceSequenceOptimizer};
255pub use jit_compiler::{
256    CompiledKernel, DeviceFeatures, GradientKernelTemplate, JitCompiler, KernelPerformance,
257    KernelSignature, OptimizationLevel,
258};
259pub use jit_integration::{utils as jit_utils, JitConfig, JitGradientContext, JitGradientTapeExt};
260pub use kernel_fusion::{FusableOp, FusedKernel, FusionStats, KernelFusionOptimizer, OpSequence};
261pub use memory_diff_reporter::{MemoryDiff, MemoryDiffReporter, MemorySnapshot};
262pub use memory_profiler::{get_global_profiler, GradientMemoryProfiler, MemoryReport, MemoryStats};
263pub use neural_integration::{
264    AutogradLayer, AutogradOptimizer, AutogradTrainer, OptimizerType, TrainingMetrics,
265};
266pub use no_grad::{
267    enable_grad, is_grad_enabled, no_grad, set_grad_enabled, EnableGradGuard, NoGradGuard,
268};
269pub use numerical_checker::{
270    CheckerConfig, ErrorAnalysis, FiniteDifferenceMethod, GradientCheckResult, NumericalChecker,
271};
272#[cfg(feature = "parallel")]
273pub use parallel_gradients::{
274    AsyncGradientHandle, CommunicationBackend, GradientTask, ParallelGradientConfig,
275    ParallelGradientConfigBuilder, ParallelGradientEngine, ParallelGradientResult, PipelineConfig,
276};
277pub use parameter_server::{
278    FaultToleranceMode, LoadBalancingStrategy, ParameterServer, ParameterServerClient,
279    ParameterServerConfig, ParameterServerStats,
280};
281pub use performance_benchmark::{
282    BenchmarkConfig, BenchmarkReport, BenchmarkResult, BenchmarkStatistics, BenchmarkSummary,
283    ComparisonResult, PerformanceBenchmark, RegressionReport, RegressionSeverity,
284    ThroughputMetrics,
285};
286pub use simd_grad_ops_simple::{
287    global_simd_grad_ops, SimdGradConfig, SimdGradOps, SimdPerformanceMetrics,
288};
289pub use special_functions::{
290    bessel_j0_backward, bessel_j1_backward, beta_backward, digamma_backward, erf_backward,
291    erfc_backward, gamma_backward, lgamma_backward,
292};
293pub use subgraph_extraction::{
294    ExtractionStrategy, Subgraph, SubgraphConfig, SubgraphExtractionResult, SubgraphExtractor,
295    SubgraphOperation,
296};
297pub use tape::{GradientTape, Operation, TapeNode, TrackedTensor};
298pub use tape_optimization::{TapeOptimizationConfig, TapeOptimizationStats, TapeOptimizer};
299pub use tensor_ext::TensorAutograd;
300pub use tensor_networks::{
301    ContractionEdge, ContractionPath, ContractionStep, ContractionStrategy, TensorNetwork,
302    TensorNetworkGradient, TensorNetworkNode, TensorNetworkOptimizer,
303};
304pub use ultra_gradient_engine_simple::{
305    global_ultra_gradient_engine, GradientMemoryStats, GradientPerformanceMetrics,
306    OptimizationInsights, UltraGradientConfig, UltraGradientEngine, UltraGradientResult,
307    UltraGradientTapeExt,
308};
309
310use tenflowers_core::{Result, Tensor};
311
312pub use advanced_grad_ops::{
313    gradient_clipping, higher_order as advanced_higher_order, jacobian, optimization,
314    AdaptiveGradientAccumulator,
315};
316pub use amp_policy::{
317    AMPConfig, AMPPolicy, AMPStabilityMetrics, ScaleAdjustment, ScaleAdjustmentReason,
318};
319pub use efficient_memory::{
320    AggregationStats, CheckpointStats, GradientCheckpointer, GradientMemoryManager,
321    GradientMemoryPool, LazyGradient, MemoryManagerStats, MemoryPoolStats,
322    StreamingGradientAggregator,
323};
324pub use gradient_analyzer::{
325    AnalysisConfig, GradientAnalysisReport, GradientAnalyzer,
326    GradientFlowAnalysis as AdvancedGradientFlowAnalysis, GradientIssue,
327    GradientStatistics as AnalyzerGradientStatistics, PerformanceMetrics,
328};
329pub use second_order_utils::{
330    compute_hessian, compute_hessian_diagonal, compute_jacobian, compute_laplacian,
331    directional_second_derivative, hessian_vector_product,
332};
333
/// Core abstraction for values that participate in reverse-mode automatic
/// differentiation.
///
/// Implementors can propagate an upstream gradient backwards through the
/// computation that produced them, and expose any gradient accumulated so
/// far. Mirrors the shape of [`CustomGradientFunction::backward`] shown in
/// the crate docs above, which also returns one gradient tensor per input.
pub trait Differentiable<T> {
    /// Propagates `grad_output` — the gradient flowing in from downstream
    /// operations — backwards through this value's producing computation.
    ///
    /// Returns a gradient tensor per input; presumably ordered to match the
    /// inputs of the producing operation (as in the custom-gradient example
    /// in the crate docs) — TODO(review): confirm ordering contract.
    fn backward(&self, grad_output: &Tensor<T>) -> Result<Vec<Tensor<T>>>;

    /// Returns the gradient currently associated with this value, or `None`
    /// if no gradient has been computed / recorded yet.
    fn grad(&self) -> Option<&Tensor<T>>;
}