// oxigdal_cluster/lib.rs — crate root
1//! OxiGDAL Cluster - Distributed geospatial processing at scale.
2//!
3//! This crate provides a comprehensive distributed computing framework for geospatial
4//! data processing, including:
5//!
6//! - **Distributed Scheduler**: Work-stealing scheduler with priority queues and dynamic load balancing
7//! - **Task Graph Engine**: DAG-based task dependencies with parallel execution planning
8//! - **Worker Pool Management**: Worker registration, health monitoring, and automatic failover
9//! - **Data Locality Optimization**: Minimize data transfer through intelligent task placement
10//! - **Fault Tolerance**: Task retry, speculative execution, and checkpointing
11//! - **Distributed Cache**: Cache coherency with compression and distributed LRU
12//! - **Data Replication**: Quorum-based reads/writes with automatic re-replication
13//! - **Cluster Coordinator**: Raft-based consensus and leader election
14//! - **Advanced Scheduling**: Gang scheduling, fair-share, deadline-based, and multi-queue scheduling
15//! - **Resource Management**: Quota management, resource reservation, and usage accounting
16//! - **Network Optimization**: Topology-aware scheduling, bandwidth tracking, and congestion control
17//! - **Workflow Engine**: Complex workflow orchestration with conditional execution and loops
18//! - **Autoscaling**: Dynamic cluster sizing based on load metrics and predictive scaling
19//! - **Monitoring & Alerting**: Real-time metrics, custom alerts, and anomaly detection
20//! - **Security & Access Control**: Authentication, RBAC, audit logging, and secret management
21//!
22//! # Examples
23//!
24//! ## Basic Cluster Setup
25//!
26//! ```rust,no_run
27//! use oxigdal_cluster::{Cluster, ClusterBuilder, SchedulerConfig};
28//!
29//! #[tokio::main]
30//! async fn main() -> oxigdal_cluster::Result<()> {
31//!     // Create a cluster with default configuration
32//!     let cluster = Cluster::new();
33//!
34//!     // Start the cluster
35//!     cluster.start().await?;
36//!
37//!     // Get cluster statistics
38//!     let stats = cluster.get_statistics();
39//!     println!("Active workers: {}", stats.worker_pool.total_workers);
40//!
41//!     // Stop the cluster
42//!     cluster.stop().await?;
43//!     Ok(())
44//! }
45//! ```
46//!
47//! ## Custom Cluster Configuration
48//!
49//! ```rust,no_run
50//! use oxigdal_cluster::{ClusterBuilder, SchedulerConfig, LoadBalanceStrategy};
51//! use oxigdal_cluster::worker_pool::WorkerPoolConfig;
52//! use std::time::Duration;
53//!
54//! #[tokio::main]
55//! async fn main() -> oxigdal_cluster::Result<()> {
56//!     // Configure the scheduler
57//!     let scheduler_config = SchedulerConfig {
58//!         max_queue_size: 10000,
59//!         work_steal_threshold: 10,
60//!         scheduling_interval: Duration::from_millis(100),
61//!         load_balance_strategy: LoadBalanceStrategy::LeastLoaded,
62//!         task_timeout: Duration::from_secs(300),
63//!         enable_work_stealing: true,
64//!         enable_backpressure: true,
65//!         max_concurrent_tasks_per_worker: 1000,
66//!     };
67//!
68//!     // Configure the worker pool
69//!     let worker_config = WorkerPoolConfig {
70//!         heartbeat_timeout: Duration::from_secs(30),
71//!         health_check_interval: Duration::from_secs(10),
72//!         max_unhealthy_duration: Duration::from_secs(120),
73//!         min_workers: 1,
74//!         max_workers: 100,
75//!     };
76//!
77//!     // Build the cluster with custom configuration
78//!     let cluster = ClusterBuilder::new()
79//!         .with_scheduler_config(scheduler_config)
80//!         .with_worker_pool_config(worker_config)
81//!         .build();
82//!
83//!     cluster.start().await?;
84//!
85//!     // Your cluster operations here
86//!
87//!     cluster.stop().await?;
88//!     Ok(())
89//! }
90//! ```
91
92#![deny(missing_docs)]
93#![warn(clippy::all)]
94
95pub mod autoscale;
96pub mod cache_coherency;
97pub mod coordinator;
98pub mod data_locality;
99pub mod error;
100pub mod fault_tolerance;
101pub mod metrics;
102pub mod monitoring;
103pub mod network;
104pub mod replication;
105pub mod resources;
106pub mod scheduler;
107pub mod security;
108pub mod task_graph;
109pub mod worker_pool;
110pub mod workflow;
111
112// Re-export common types
113pub use autoscale::{
114    AutoscaleConfig, AutoscaleStats, Autoscaler, MetricsSnapshot as AutoscaleMetrics, ScaleDecision,
115};
116pub use cache_coherency::{CacheConfig, CacheKey, DistributedCache};
117pub use coordinator::{ClusterCoordinator, CoordinatorConfig, NodeId, NodeRole};
118pub use data_locality::{DataLocalityOptimizer, LocalityConfig, PlacementRecommendation};
119pub use error::{ClusterError, Result};
120pub use fault_tolerance::{
121    // Bulkhead
122    Bulkhead,
123    BulkheadConfig,
124    BulkheadRegistry,
125    BulkheadStats,
126    // Circuit breaker
127    CircuitBreaker,
128    CircuitBreakerConfig,
129    CircuitBreakerRegistry,
130    CircuitBreakerStats,
131    CircuitState,
132    // Timeout management
133    Deadline,
134    // Graceful degradation
135    DegradationConfig,
136    DegradationLevel,
137    DegradationManager,
138    DegradationStats,
139    // Core fault tolerance
140    FaultToleranceConfig,
141    FaultToleranceManager,
142    FaultToleranceStatistics,
143    // Health checks
144    HealthCheck,
145    HealthCheckConfig,
146    HealthCheckManager,
147    HealthCheckResult,
148    HealthCheckStats,
149    HealthStatus,
150    RequestPriority,
151    RetryDecision,
152    TimeoutBudget,
153    TimeoutConfig,
154    TimeoutManager,
155    TimeoutRegistry,
156    TimeoutStats,
157};
158pub use metrics::{ClusterMetrics, MetricsSnapshot, WorkerMetrics};
159pub use monitoring::{Alert, AlertRule, AlertSeverity, MonitoringManager, MonitoringStats};
160pub use network::{BandwidthTracker, CompressionManager, CongestionController, TopologyManager};
161pub use replication::{ReplicaSet, ReplicationConfig, ReplicationManager};
162pub use resources::{AccountingManager, QuotaManager, ReservationManager};
163pub use scheduler::{LoadBalanceStrategy, Scheduler, SchedulerConfig, SchedulerStats};
164pub use security::{Role, SecurityManager, SecurityStats, User};
165pub use task_graph::{ExecutionPlan, ResourceRequirements, Task, TaskGraph, TaskId, TaskStatus};
166pub use worker_pool::{
167    SelectionStrategy, Worker, WorkerCapabilities, WorkerCapacity, WorkerId, WorkerPool,
168    WorkerStatus, WorkerUsage,
169};
170pub use workflow::{Workflow, WorkflowEngine, WorkflowExecution, WorkflowStatus};
171
172/// Cluster builder for easy setup.
173///
174/// The builder pattern allows you to customize various aspects of the cluster
175/// before initialization. All configuration is optional; if not specified,
176/// sensible defaults will be used.
177///
178/// # Examples
179///
180/// ```rust
181/// use oxigdal_cluster::{ClusterBuilder, SchedulerConfig, LoadBalanceStrategy};
182/// use std::time::Duration;
183///
184/// let scheduler_config = SchedulerConfig {
185///     max_queue_size: 10000,
186///     work_steal_threshold: 10,
187///     scheduling_interval: Duration::from_millis(100),
188///     load_balance_strategy: LoadBalanceStrategy::RoundRobin,
189///     task_timeout: Duration::from_secs(600),
190///     enable_work_stealing: true,
191///     enable_backpressure: true,
192///     max_concurrent_tasks_per_worker: 500,
193/// };
194///
195/// let cluster = ClusterBuilder::new()
196///     .with_scheduler_config(scheduler_config)
197///     .build();
198/// ```
199pub struct ClusterBuilder {
200    scheduler_config: Option<SchedulerConfig>,
201    worker_pool_config: Option<worker_pool::WorkerPoolConfig>,
202    coordinator_config: Option<CoordinatorConfig>,
203    fault_tolerance_config: Option<FaultToleranceConfig>,
204    locality_config: Option<LocalityConfig>,
205    cache_config: Option<CacheConfig>,
206    replication_config: Option<ReplicationConfig>,
207}
208
209impl ClusterBuilder {
210    /// Create a new cluster builder.
211    pub fn new() -> Self {
212        Self {
213            scheduler_config: None,
214            worker_pool_config: None,
215            coordinator_config: None,
216            fault_tolerance_config: None,
217            locality_config: None,
218            cache_config: None,
219            replication_config: None,
220        }
221    }
222
223    /// Set scheduler configuration.
224    pub fn with_scheduler_config(mut self, config: SchedulerConfig) -> Self {
225        self.scheduler_config = Some(config);
226        self
227    }
228
229    /// Set worker pool configuration.
230    pub fn with_worker_pool_config(mut self, config: worker_pool::WorkerPoolConfig) -> Self {
231        self.worker_pool_config = Some(config);
232        self
233    }
234
235    /// Set coordinator configuration.
236    pub fn with_coordinator_config(mut self, config: CoordinatorConfig) -> Self {
237        self.coordinator_config = Some(config);
238        self
239    }
240
241    /// Set fault tolerance configuration.
242    pub fn with_fault_tolerance_config(mut self, config: FaultToleranceConfig) -> Self {
243        self.fault_tolerance_config = Some(config);
244        self
245    }
246
247    /// Set locality optimizer configuration.
248    pub fn with_locality_config(mut self, config: LocalityConfig) -> Self {
249        self.locality_config = Some(config);
250        self
251    }
252
253    /// Set cache configuration.
254    pub fn with_cache_config(mut self, config: CacheConfig) -> Self {
255        self.cache_config = Some(config);
256        self
257    }
258
259    /// Set replication configuration.
260    pub fn with_replication_config(mut self, config: ReplicationConfig) -> Self {
261        self.replication_config = Some(config);
262        self
263    }
264
265    /// Build the cluster.
266    pub fn build(self) -> Cluster {
267        use std::sync::Arc;
268
269        let task_graph = Arc::new(TaskGraph::new());
270        let metrics = Arc::new(ClusterMetrics::new());
271
272        let worker_pool = Arc::new(if let Some(config) = self.worker_pool_config {
273            WorkerPool::new(config)
274        } else {
275            WorkerPool::with_defaults()
276        });
277
278        let scheduler = Arc::new(if let Some(config) = self.scheduler_config {
279            Scheduler::new(
280                Arc::clone(&task_graph),
281                Arc::clone(&worker_pool),
282                Arc::clone(&metrics),
283                config,
284            )
285        } else {
286            Scheduler::with_defaults(
287                Arc::clone(&task_graph),
288                Arc::clone(&worker_pool),
289                Arc::clone(&metrics),
290            )
291        });
292
293        let coordinator = Arc::new(if let Some(config) = self.coordinator_config {
294            ClusterCoordinator::new(config)
295        } else {
296            ClusterCoordinator::with_defaults()
297        });
298
299        let fault_tolerance = Arc::new(if let Some(config) = self.fault_tolerance_config {
300            FaultToleranceManager::new(config)
301        } else {
302            FaultToleranceManager::with_defaults()
303        });
304
305        let locality_optimizer = Arc::new(if let Some(config) = self.locality_config {
306            DataLocalityOptimizer::new(config)
307        } else {
308            DataLocalityOptimizer::with_defaults()
309        });
310
311        let cache = Arc::new(if let Some(config) = self.cache_config {
312            DistributedCache::new(config)
313        } else {
314            DistributedCache::with_defaults()
315        });
316
317        let replication = Arc::new(if let Some(config) = self.replication_config {
318            ReplicationManager::new(config)
319        } else {
320            ReplicationManager::with_defaults()
321        });
322
323        Cluster {
324            task_graph,
325            worker_pool,
326            scheduler,
327            coordinator,
328            fault_tolerance,
329            locality_optimizer,
330            cache,
331            replication,
332            metrics,
333        }
334    }
335}
336
337impl Default for ClusterBuilder {
338    fn default() -> Self {
339        Self::new()
340    }
341}
342
343/// Complete cluster instance.
344///
345/// The `Cluster` struct provides access to all cluster components including
346/// the task graph, worker pool, scheduler, coordinator, fault tolerance,
347/// data locality optimizer, distributed cache, and replication manager.
348///
349/// # Examples
350///
351/// ```rust,no_run
352/// use oxigdal_cluster::Cluster;
353///
354/// #[tokio::main]
355/// async fn main() -> oxigdal_cluster::Result<()> {
356///     // Create and start a cluster
357///     let cluster = Cluster::new();
358///     cluster.start().await?;
359///
360///     // Access cluster components
361///     let metrics = cluster.metrics.snapshot();
362///     println!("Tasks completed: {}", metrics.tasks_completed);
363///
364///     // Get cluster statistics
365///     let stats = cluster.get_statistics();
366///     println!("Total workers: {}", stats.worker_pool.total_workers);
367///
368///     // Stop the cluster
369///     cluster.stop().await?;
370///     Ok(())
371/// }
372/// ```
373pub struct Cluster {
374    /// Task graph
375    pub task_graph: std::sync::Arc<TaskGraph>,
376
377    /// Worker pool
378    pub worker_pool: std::sync::Arc<WorkerPool>,
379
380    /// Scheduler
381    pub scheduler: std::sync::Arc<Scheduler>,
382
383    /// Coordinator
384    pub coordinator: std::sync::Arc<ClusterCoordinator>,
385
386    /// Fault tolerance manager
387    pub fault_tolerance: std::sync::Arc<FaultToleranceManager>,
388
389    /// Data locality optimizer
390    pub locality_optimizer: std::sync::Arc<DataLocalityOptimizer>,
391
392    /// Distributed cache
393    pub cache: std::sync::Arc<DistributedCache>,
394
395    /// Replication manager
396    pub replication: std::sync::Arc<ReplicationManager>,
397
398    /// Cluster metrics
399    pub metrics: std::sync::Arc<ClusterMetrics>,
400}
401
402impl Cluster {
403    /// Create a new cluster with default configuration.
404    pub fn new() -> Self {
405        ClusterBuilder::new().build()
406    }
407
408    /// Start all cluster components.
409    pub async fn start(&self) -> Result<()> {
410        self.coordinator.start().await?;
411        self.scheduler.start().await?;
412        Ok(())
413    }
414
415    /// Stop all cluster components.
416    pub async fn stop(&self) -> Result<()> {
417        self.scheduler.stop().await?;
418        self.coordinator.stop().await?;
419        Ok(())
420    }
421
422    /// Get cluster-wide statistics.
423    pub fn get_statistics(&self) -> ClusterStatistics {
424        ClusterStatistics {
425            metrics: self.metrics.snapshot(),
426            scheduler: self.scheduler.get_statistics(),
427            worker_pool: self.worker_pool.get_statistics(),
428            coordinator: self.coordinator.get_statistics(),
429            fault_tolerance: self.fault_tolerance.get_statistics(),
430            locality: self.locality_optimizer.get_statistics(),
431            cache: self.cache.get_statistics(),
432            replication: self.replication.get_statistics(),
433        }
434    }
435}
436
437impl Default for Cluster {
438    fn default() -> Self {
439        Self::new()
440    }
441}
442
443/// Cluster-wide statistics.
444#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
445pub struct ClusterStatistics {
446    /// Metrics snapshot
447    pub metrics: MetricsSnapshot,
448
449    /// Scheduler statistics
450    pub scheduler: SchedulerStats,
451
452    /// Worker pool statistics
453    pub worker_pool: worker_pool::WorkerPoolStatistics,
454
455    /// Coordinator statistics
456    pub coordinator: coordinator::CoordinatorStatistics,
457
458    /// Fault tolerance statistics
459    pub fault_tolerance: fault_tolerance::FaultToleranceStatistics,
460
461    /// Locality statistics
462    pub locality: data_locality::LocalityStats,
463
464    /// Cache statistics
465    pub cache: cache_coherency::CacheStats,
466
467    /// Replication statistics
468    pub replication: replication::ReplicationStatistics,
469}