sklears_utils/
lib.rs

1#![allow(dead_code)]
2#![allow(non_snake_case)]
3#![allow(missing_docs)]
4#![allow(deprecated)]
5#![allow(clippy::needless_range_loop)]
6#![allow(clippy::needless_borrow)]
7//! Utility functions and helpers for sklears
8//!
9//! This crate provides common utilities used across the sklears ecosystem,
10//! including data validation, array manipulation, random number generation,
11//! and dataset creation utilities.
12//!
13//! # Examples
14//!
15//! ```rust
16//! use sklears_utils::validation::check_consistent_length;
17//! use sklears_utils::random::set_random_state;
18//! use scirs2_core::ndarray::array;
19//!
20//! // Validation
21//! let x = array![1, 2, 3];
22//! let y = array![4, 5, 6];
23//! assert!(check_consistent_length(&[&x, &y]).is_ok());
24//!
25//! // Random state
26//! set_random_state(42);
27//! ```
28
29pub mod api_integration;
30// Temporarily commented out due to missing submodules
31// pub mod architecture;
32pub mod array_utils;
33pub mod cloud_storage;
34pub mod config;
35pub mod cross_validation;
36pub mod data_generation;
37pub mod data_pipeline;
38pub mod data_structures;
39pub mod database;
40pub mod debug;
41pub mod distributed_computing;
42pub mod ensemble;
43pub mod environment;
44pub mod error_handling;
45pub mod external_integration;
46pub mod feature_engineering;
47pub mod file_io;
48pub mod gpu_computing;
49pub mod linear_algebra;
50pub mod logging;
51pub mod math_utils;
52pub mod memory;
53pub mod metrics;
54pub mod multiclass;
55pub mod optimization;
56pub mod parallel;
57pub mod performance;
58pub mod performance_regression;
59pub mod preprocessing;
60pub mod probabilistic;
61pub mod profile_guided_optimization;
62pub mod r_integration;
63pub mod random;
64pub mod simd;
65pub mod spatial;
66pub mod statistical;
67pub mod stats;
68pub mod text_processing;
69pub mod time_series;
70pub mod type_safety;
71pub mod validation;
72pub mod visualization;
73
74#[allow(non_snake_case)]
75#[cfg(test)]
76mod property_tests;
77
78// Re-export specific functions to avoid conflicts
79pub use api_integration::{
80    ApiClient, ApiConfig, ApiError, ApiMetrics, ApiRequest, ApiResponse, ApiService,
81    Authentication, HttpMethod, MLApiPatterns, MethodStats, MockApiClient, RequestBuilder,
82};
83// Temporarily commented out due to missing submodules
84/*
85pub use architecture::{
86    AspectContext, AspectManager, BackoffStrategy, ChainValidationRule, ChainValidationType,
87    ComparisonOperator, ComponentError, ComponentFactory, ComponentRegistry, ConfigurationBuilder,
88    ConfigurationPreset, ErrorHandleResult, ErrorHandler, Event, EventBus, EventError,
89    EventHandler, EventRecord, ExecutionStats, FeatureModule, FluentApiBuilder, FluentChainBuilder,
90    FluentCondition, FluentConditionType, FluentError, FluentErrorHandling, FluentExecutionResult,
91    FluentExecutionStats, FluentOperation, FluentOperationType, FluentRetryPolicy,
92    FluentUtilityChain, Hook, HookConfig, HookContext, HookError, HookErrorHandling,
93    HookExecutionStats, HookRegistry, HookResult, HookType, MiddlewareContext, MiddlewareError,
94    MiddlewarePipeline, ModuleConfig, ModuleError, ModuleRegistry, PipelineHookManager, Plugin,
95    PluginContext, PluginError, PluginExecution, PluginManager, PluginResult,
96    PresetApplicationResult, PresetBuilder, PresetError, PresetRegistry, RetryCondition,
97    ServiceLifecycle, ServiceLocator, ServiceMetadata, UtilityContext, UtilityError,
98    UtilityFunction, UtilityHookManager, UtilityRegistry, UtilityResult, UtilityValue,
99    ValidationError, ValidationRule, ValidationRuleType,
100};
101*/
102pub use array_utils::{
103    argmax,
104    argmin,
105    argsort,
106    array_add_constant_inplace,
107    array_apply_inplace,
108    array_concatenate,
109    array_cumsum,
110    array_describe,
111    array_max,
112    array_mean,
113    array_mean_f64,
114    array_median,
115    array_min,
116    array_min_max,
117    array_min_max_normalize,
118    array_min_max_normalize_inplace,
119    array_percentile,
120    array_quantiles,
121    array_resize,
122    array_reverse,
123
124    array_scale_inplace,
125    array_split,
126    array_standardize,
127    // In-place operations
128    array_standardize_inplace,
129    array_std,
130    // Statistical functions
131    array_sum,
132    array_unique_counts,
133    array_var,
134    array_variance_f64,
135    boolean_indexing_1d,
136    boolean_indexing_2d,
137    broadcast_shape,
138    // Core utilities
139    check_array_1d,
140    check_array_2d,
141    column_or_1d,
142    compatible_layout,
143
144    compress_1d,
145
146    concatenate_2d,
147    create_mask,
148    densify_threshold,
149
150    efficient_copy,
151    // Advanced indexing
152    fancy_indexing_1d,
153    fancy_indexing_2d,
154    fast_dot_product_f32,
155    fast_dot_product_f64,
156    fast_sum_f32,
157
158    fast_sum_f64,
159    filter_array,
160    flatten_2d,
161    get_strides,
162    is_broadcastable,
163    // Memory operations
164    is_contiguous,
165    label_counts,
166    make_contiguous,
167    normalize_array,
168    pad_2d,
169
170    put_1d,
171    // Shape operations
172    reshape_1d_to_2d,
173    safe_indexing,
174    safe_indexing_2d,
175    // Sparse operations
176    safe_sparse_dot,
177    safe_sparse_dot_f32,
178    safe_sparse_dot_f64,
179    simd_add_arrays_f32,
180    // SIMD operations
181    simd_add_arrays_f64,
182    simd_multiply_arrays_f32,
183    simd_multiply_arrays_f64,
184    simd_scale_array_f32,
185    simd_scale_array_f64,
186    slice_with_step,
187    sparse_add,
188    sparse_diag,
189    sparse_transpose,
190    split_2d,
191    stack_1d,
192    take_1d,
193    tile_2d,
194    transpose,
195    unique_labels,
196    where_condition,
197    ArrayStatistics,
198};
199pub use cloud_storage::{
200    CloudProvider, CloudStorageClient, CloudStorageConfig, CloudStorageFactory, CloudStorageUtils,
201    MockCloudStorageClient, ObjectMetadata, StorageMetrics, SyncMode, SyncResult,
202};
203pub use config::{
204    ArgParser, Config, ConfigBuilder, ConfigSource, ConfigValidator, ConfigValue, HotReloadConfig,
205};
206pub use cross_validation::{
207    CVSplit, GroupKFold, LeaveOneGroupOut, StratifiedKFold, TimeSeriesSplit,
208};
209pub use data_generation::*;
210pub use data_pipeline::{
211    DataPipeline, MLPipelineBuilder, PipelineContext, PipelineMetrics, PipelineMonitor,
212    PipelineResult, PipelineStep, StepMetrics, TransformStep,
213};
214pub use data_structures::{
215    AtomicCounter, BinarySearchTree, BlockMatrix, ConcurrentHashMap, ConcurrentQueue,
216    ConcurrentRingBuffer, Graph, RingBuffer, TreeNode, TreeStatistics, Trie, TrieStatistics,
217    WeightedGraph, WorkQueue,
218};
219pub use database::{
220    Connection, DatabaseConfig, DatabaseError, DatabasePool, Query, QueryBuilder, QueryResult,
221    ResultSet, Transaction,
222};
223pub use debug::{
224    ArrayDebugger, DebugContext, DiagnosticTools, MemoryDebugger, PerformanceDebugger,
225    TestDataGenerator, TimingStats,
226};
227pub use distributed_computing::{
228    ClusterConfig, ClusterNode, ClusterStats, DistributedCluster, DistributedError, DistributedJob,
229    FaultDetector, JobExecution, JobPriority, JobScheduler, JobStatus, JobType, LoadBalancer,
230    LoadMetrics, NodeCapabilities, NodeStatus, ResourceRequirements, ResourceUsage,
231    SchedulingStrategy,
232};
233pub use ensemble::{
234    AggregationStrategy, BaggingPredictor, Bootstrap, OOBScoreEstimator, StackingHelper,
235};
236pub use environment::{
237    CacheInfo, CpuInfo, EnvironmentInfo, FeatureChecker, HardwareDetector, MemoryInfo, OSInfo,
238    PerformanceCharacteristics, RuntimeInfo,
239};
240pub use error_handling::{
241    create_error, create_error_at, EnhancedError, ErrorAggregator, ErrorContext, ErrorRecovery,
242    ErrorReporter, ErrorStatistics, ErrorSummary, RecoveryStrategy,
243};
244pub use external_integration::{
245    ArrayTransfer, CFunctionSignature, CParameter, CType, FFIUtils, PyArrayBuffer, PythonInterop,
246    PythonParameter, PythonValue, WasmBuildConfig, WasmOptimization, WasmParameter, WasmType,
247    WasmUtils,
248};
249pub use feature_engineering::{
250    BinningStrategy, FeatureBinner, InteractionFeatures, PolynomialFeatures,
251};
252pub use file_io::{
253    CompressionUtils, EfficientFileReader, EfficientFileWriter, FormatConverter,
254    SerializationUtils, StreamProcessor,
255};
256pub use gpu_computing::{
257    ActivationFunction, GpuArrayOps, GpuDevice, GpuError, GpuKernelExecution, GpuKernelInfo,
258    GpuMemoryAllocation, GpuProfiler, GpuUtils, KernelStats, MemoryStats, MemoryTransferStats,
259};
260pub use linear_algebra::{
261    ConditionNumber, EigenDecomposition, MatrixDecomposition, MatrixNorms, MatrixRank, MatrixUtils,
262    Pseudoinverse,
263};
264pub use logging::{
265    flush_global_logger, get_global_logger, set_global_level, ConsoleOutput, DistributedLogger,
266    FileOutput, JsonFormatter, LogAnalysis, LogAnalyzer, LogEntry, LogLevel, LogStats, Logger,
267    LoggerConfig, OperationStats, PerformanceLogger, TextFormatter,
268};
269pub use math_utils::{
270    constants, NumericalPrecision, OverflowDetection, RobustArrayOps, SpecialFunctions,
271};
272pub use memory::{
273    AllocationStats, GcHelper, LeakDetector, MemoryAlignment, MemoryMappedFile, MemoryMonitor,
274    MemoryPool, MemoryValidator, SafeBuffer, SafePtr, SafeVec, StackGuard, TrackingAllocator,
275};
276pub use metrics::{
277    bhattacharyya_distance, braycurtis_distance, canberra_distance, chebyshev_distance,
278    cosine_distance, cosine_distance_f32, cosine_similarity, cosine_similarity_f32,
279    euclidean_distance, euclidean_distance_f32, hamming_distance, hamming_distance_normalized,
280    hellinger_distance, jaccard_distance, jaccard_similarity, jensen_shannon_divergence,
281    kl_divergence, mahalanobis_distance, manhattan_distance, manhattan_distance_f32,
282    minkowski_distance, wasserstein_1d,
283};
284pub use multiclass::*;
285pub use optimization::{
286    ConstraintHandler, ConstraintViolation, ConvergenceCriteria, ConvergenceStatus,
287    GradientComputer, GradientMethod, LineSearch, LineSearchMethod, OptimizationHistory,
288};
289pub use parallel::{ParallelIterator, ParallelReducer, ThreadPool, WorkStealingQueue};
290pub use performance::{
291    BaselineMetrics, Benchmark, BenchmarkResult, MemoryTracker, ProfileReport, ProfileResult,
292    Profiler, RegressionDetector, RegressionResult, Timer, TimerSummary,
293};
294pub use performance_regression::{
295    PerformanceRegressionTester, RegressionTestResult, RegressionThresholds,
296};
297pub use preprocessing::{DataCleaner, DataQualityAssessor, FeatureScaler, OutlierDetector};
298pub use probabilistic::{
299    BloomFilter, BloomFilterStats, CountMinSketch, CountMinSketchStats, HyperLogLog,
300    HyperLogLogStats, LSHash, LSHashStats, MinHash, MinHashStats,
301};
302pub use profile_guided_optimization::{
303    BranchProfile, BranchType, CacheStatistics, DependencyChain, FunctionProfile,
304    ImplementationEffort, InstructionMix, LoopProfile, MemoryAccessPattern, MemoryAccessType,
305    OptimizationApplication, OptimizationOpportunity, OptimizationRecommendation,
306    OptimizationReport, OptimizationRule, OptimizationType, PerformanceProfile, PerformanceTargets,
307    ProfileError, ProfileGuidedOptimizer, ProfileSummary, ProfilerConfig, RiskLevel, StridePattern,
308    TriggerCondition,
309};
310pub use r_integration::{
311    RDataFrame, RError, RIntegration, RMatrix, RPackageManager, RScriptBuilder,
312    RStatisticalFunctions, RValue,
313};
314pub use random::{
315    bootstrap_indices, get_rng, importance_sampling, k_fold_indices, random_indices,
316    random_permutation, random_weights, reservoir_sampling, set_random_state, shuffle_indices,
317    stratified_split_indices, train_test_split_indices, weighted_sampling_without_replacement,
318    DistributionSampler, ThreadSafeRng,
319};
320pub use simd::{
321    SimdCapabilities, SimdDistanceOps, SimdF32Ops, SimdF64Ops, SimdMatrixOps, SimdStatsOps,
322};
323pub use spatial::{
324    geographic::{CoordinateSystem, GeoBounds, GeoPoint, GeoUtils, Hemisphere},
325    KdTree, OctTree, Point, QuadTree, RTree, Rectangle, SpatialHash, SpatialHashStats,
326};
327pub use statistical::{
328    ConfidenceInterval, ConfidenceIntervals, CorrelationAnalysis, DistributionFitting,
329    StatisticalTests, TestResult,
330};
331pub use text_processing::{
332    RegexUtils, StringSimilarity, TextAnalysis, TextNormalizer, TextParser, UnicodeUtils,
333};
334pub use time_series::{
335    AggregationMethod, LagFeatureGenerator, SlidingWindow, TemporalAggregator, TemporalIndex,
336    TimeSeries, TimeSeriesPoint, TimeZoneUtils, Timestamp, TrendDirection, WindowStats,
337};
338pub use type_safety::{
339    DataState, ExactSize, Kilograms, MatrixMul, Measurement, Meters, MinSize, ModelState,
340    NonNegative, Normalized, One, Pixels, Positive, Seconds, Three, Trained, Two, TypedArray,
341    Untrained, Unvalidated, Validated, ValidatedArray, Zero, D1, D2, D3,
342};
343pub use validation::*;
344pub use visualization::{
345    AxisConfig, BoxPlotData, ChartData, Color, HeatmapData, HistogramData, LinePlotData,
346    MLVisualizationUtils, PlotData, PlotLayout, PlotMargin, PlotSummary, PlotUtils, Point2D,
347    ScatterPlotData,
348};
349
350/// Common error type for utils
351#[derive(thiserror::Error, Debug, Clone)]
352pub enum UtilsError {
353    #[error("Shape mismatch: expected {expected:?}, got {actual:?}")]
354    ShapeMismatch {
355        expected: Vec<usize>,
356        actual: Vec<usize>,
357    },
358    #[error("Invalid parameter: {0}")]
359    InvalidParameter(String),
360    #[error("Empty input")]
361    EmptyInput,
362    #[error("Invalid random state: {0}")]
363    InvalidRandomState(String),
364    #[error("Insufficient data: need at least {min} samples, got {actual}")]
365    InsufficientData { min: usize, actual: usize },
366}
367
368impl From<UtilsError> for sklears_core::error::SklearsError {
369    fn from(err: UtilsError) -> Self {
370        sklears_core::error::SklearsError::InvalidInput(err.to_string())
371    }
372}
373
374impl From<serde_json::Error> for UtilsError {
375    fn from(err: serde_json::Error) -> Self {
376        UtilsError::InvalidParameter(format!("JSON serialization error: {err}"))
377    }
378}
379
380/// Type alias for utils results
381pub type UtilsResult<T> = std::result::Result<T, UtilsError>;