oxigdal_cluster/lib.rs
//! OxiGDAL Cluster - Distributed geospatial processing at scale.
//!
//! This crate provides a comprehensive distributed computing framework for geospatial
//! data processing, including:
//!
//! - **Distributed Scheduler**: Work-stealing scheduler with priority queues and dynamic load balancing
//! - **Task Graph Engine**: DAG-based task dependencies with parallel execution planning
//! - **Worker Pool Management**: Worker registration, health monitoring, and automatic failover
//! - **Data Locality Optimization**: Minimize data transfer through intelligent task placement
//! - **Fault Tolerance**: Task retry, speculative execution, and checkpointing
//! - **Distributed Cache**: Cache coherency with compression and distributed LRU
//! - **Data Replication**: Quorum-based reads/writes with automatic re-replication
//! - **Cluster Coordinator**: Raft-based consensus and leader election
//! - **Advanced Scheduling**: Gang scheduling, fair-share, deadline-based, and multi-queue scheduling
//! - **Resource Management**: Quota management, resource reservation, and usage accounting
//! - **Network Optimization**: Topology-aware scheduling, bandwidth tracking, and congestion control
//! - **Workflow Engine**: Complex workflow orchestration with conditional execution and loops
//! - **Autoscaling**: Dynamic cluster sizing based on load metrics and predictive scaling
//! - **Monitoring & Alerting**: Real-time metrics, custom alerts, and anomaly detection
//! - **Security & Access Control**: Authentication, RBAC, audit logging, and secret management
//!
//! # Examples
//!
//! ## Basic Cluster Setup
//!
//! ```rust,no_run
//! use oxigdal_cluster::Cluster;
//!
//! #[tokio::main]
//! async fn main() -> oxigdal_cluster::Result<()> {
//!     // Create a cluster with default configuration
//!     let cluster = Cluster::new();
//!
//!     // Start the cluster
//!     cluster.start().await?;
//!
//!     // Get cluster statistics
//!     let stats = cluster.get_statistics();
//!     println!("Active workers: {}", stats.worker_pool.total_workers);
//!
//!     // Stop the cluster
//!     cluster.stop().await?;
//!     Ok(())
//! }
//! ```
//!
//! ## Custom Cluster Configuration
//!
//! ```rust,no_run
//! use oxigdal_cluster::{ClusterBuilder, SchedulerConfig, LoadBalanceStrategy};
//! use oxigdal_cluster::worker_pool::WorkerPoolConfig;
//! use std::time::Duration;
//!
//! #[tokio::main]
//! async fn main() -> oxigdal_cluster::Result<()> {
//!     // Configure the scheduler
//!     let scheduler_config = SchedulerConfig {
//!         max_queue_size: 10000,
//!         work_steal_threshold: 10,
//!         scheduling_interval: Duration::from_millis(100),
//!         load_balance_strategy: LoadBalanceStrategy::LeastLoaded,
//!         task_timeout: Duration::from_secs(300),
//!         enable_work_stealing: true,
//!         enable_backpressure: true,
//!         max_concurrent_tasks_per_worker: 1000,
//!     };
//!
//!     // Configure the worker pool
//!     let worker_config = WorkerPoolConfig {
//!         heartbeat_timeout: Duration::from_secs(30),
//!         health_check_interval: Duration::from_secs(10),
//!         max_unhealthy_duration: Duration::from_secs(120),
//!         min_workers: 1,
//!         max_workers: 100,
//!     };
//!
//!     // Build the cluster with custom configuration
//!     let cluster = ClusterBuilder::new()
//!         .with_scheduler_config(scheduler_config)
//!         .with_worker_pool_config(worker_config)
//!         .build();
//!
//!     cluster.start().await?;
//!
//!     // Your cluster operations here
//!
//!     cluster.stop().await?;
//!     Ok(())
//! }
//! ```
91
// Every public item must be documented; clippy's full default lint set is on.
#![deny(missing_docs)]
#![warn(clippy::all)]

// One module per subsystem; see the crate-level docs for an overview of each.
pub mod autoscale;
pub mod cache_coherency;
pub mod coordinator;
pub mod data_locality;
pub mod error;
pub mod fault_tolerance;
pub mod metrics;
pub mod monitoring;
pub mod network;
pub mod replication;
pub mod resources;
pub mod scheduler;
pub mod security;
pub mod task_graph;
pub mod worker_pool;
pub mod workflow;
111
// Re-export common types so typical users only need `use oxigdal_cluster::…`
// at the crate root instead of reaching into individual modules.
pub use autoscale::{
    AutoscaleConfig, AutoscaleStats, Autoscaler, MetricsSnapshot as AutoscaleMetrics, ScaleDecision,
};
pub use cache_coherency::{CacheConfig, CacheKey, DistributedCache};
pub use coordinator::{ClusterCoordinator, CoordinatorConfig, NodeId, NodeRole};
pub use data_locality::{DataLocalityOptimizer, LocalityConfig, PlacementRecommendation};
pub use error::{ClusterError, Result};
pub use fault_tolerance::{
    // Bulkhead
    Bulkhead,
    BulkheadConfig,
    BulkheadRegistry,
    BulkheadStats,
    // Circuit breaker
    CircuitBreaker,
    CircuitBreakerConfig,
    CircuitBreakerRegistry,
    CircuitBreakerStats,
    CircuitState,
    // Timeout management
    Deadline,
    // Graceful degradation
    DegradationConfig,
    DegradationLevel,
    DegradationManager,
    DegradationStats,
    // Core fault tolerance
    FaultToleranceConfig,
    FaultToleranceManager,
    FaultToleranceStatistics,
    // Health checks
    HealthCheck,
    HealthCheckConfig,
    HealthCheckManager,
    HealthCheckResult,
    HealthCheckStats,
    HealthStatus,
    RequestPriority,
    RetryDecision,
    TimeoutBudget,
    TimeoutConfig,
    TimeoutManager,
    TimeoutRegistry,
    TimeoutStats,
};
pub use metrics::{ClusterMetrics, MetricsSnapshot, WorkerMetrics};
pub use monitoring::{Alert, AlertRule, AlertSeverity, MonitoringManager, MonitoringStats};
pub use network::{BandwidthTracker, CompressionManager, CongestionController, TopologyManager};
pub use replication::{ReplicaSet, ReplicationConfig, ReplicationManager};
pub use resources::{AccountingManager, QuotaManager, ReservationManager};
pub use scheduler::{LoadBalanceStrategy, Scheduler, SchedulerConfig, SchedulerStats};
pub use security::{Role, SecurityManager, SecurityStats, User};
pub use task_graph::{ExecutionPlan, ResourceRequirements, Task, TaskGraph, TaskId, TaskStatus};
pub use worker_pool::{
    SelectionStrategy, Worker, WorkerCapabilities, WorkerCapacity, WorkerId, WorkerPool,
    WorkerStatus, WorkerUsage,
};
pub use workflow::{Workflow, WorkflowEngine, WorkflowExecution, WorkflowStatus};
171
/// Cluster builder for easy setup.
///
/// The builder pattern allows you to customize various aspects of the cluster
/// before initialization. All configuration is optional; if not specified,
/// sensible defaults will be used.
///
/// # Examples
///
/// ```rust
/// use oxigdal_cluster::{ClusterBuilder, SchedulerConfig, LoadBalanceStrategy};
/// use std::time::Duration;
///
/// let scheduler_config = SchedulerConfig {
///     max_queue_size: 10000,
///     work_steal_threshold: 10,
///     scheduling_interval: Duration::from_millis(100),
///     load_balance_strategy: LoadBalanceStrategy::RoundRobin,
///     task_timeout: Duration::from_secs(600),
///     enable_work_stealing: true,
///     enable_backpressure: true,
///     max_concurrent_tasks_per_worker: 500,
/// };
///
/// let cluster = ClusterBuilder::new()
///     .with_scheduler_config(scheduler_config)
///     .build();
/// ```
pub struct ClusterBuilder {
    // Each field stays `None` until the matching `with_*` setter is called;
    // `build` substitutes the component's own defaults for any unset field.
    scheduler_config: Option<SchedulerConfig>,
    worker_pool_config: Option<worker_pool::WorkerPoolConfig>,
    coordinator_config: Option<CoordinatorConfig>,
    fault_tolerance_config: Option<FaultToleranceConfig>,
    locality_config: Option<LocalityConfig>,
    cache_config: Option<CacheConfig>,
    replication_config: Option<ReplicationConfig>,
}
208
209impl ClusterBuilder {
210 /// Create a new cluster builder.
211 pub fn new() -> Self {
212 Self {
213 scheduler_config: None,
214 worker_pool_config: None,
215 coordinator_config: None,
216 fault_tolerance_config: None,
217 locality_config: None,
218 cache_config: None,
219 replication_config: None,
220 }
221 }
222
223 /// Set scheduler configuration.
224 pub fn with_scheduler_config(mut self, config: SchedulerConfig) -> Self {
225 self.scheduler_config = Some(config);
226 self
227 }
228
229 /// Set worker pool configuration.
230 pub fn with_worker_pool_config(mut self, config: worker_pool::WorkerPoolConfig) -> Self {
231 self.worker_pool_config = Some(config);
232 self
233 }
234
235 /// Set coordinator configuration.
236 pub fn with_coordinator_config(mut self, config: CoordinatorConfig) -> Self {
237 self.coordinator_config = Some(config);
238 self
239 }
240
241 /// Set fault tolerance configuration.
242 pub fn with_fault_tolerance_config(mut self, config: FaultToleranceConfig) -> Self {
243 self.fault_tolerance_config = Some(config);
244 self
245 }
246
247 /// Set locality optimizer configuration.
248 pub fn with_locality_config(mut self, config: LocalityConfig) -> Self {
249 self.locality_config = Some(config);
250 self
251 }
252
253 /// Set cache configuration.
254 pub fn with_cache_config(mut self, config: CacheConfig) -> Self {
255 self.cache_config = Some(config);
256 self
257 }
258
259 /// Set replication configuration.
260 pub fn with_replication_config(mut self, config: ReplicationConfig) -> Self {
261 self.replication_config = Some(config);
262 self
263 }
264
265 /// Build the cluster.
266 pub fn build(self) -> Cluster {
267 use std::sync::Arc;
268
269 let task_graph = Arc::new(TaskGraph::new());
270 let metrics = Arc::new(ClusterMetrics::new());
271
272 let worker_pool = Arc::new(if let Some(config) = self.worker_pool_config {
273 WorkerPool::new(config)
274 } else {
275 WorkerPool::with_defaults()
276 });
277
278 let scheduler = Arc::new(if let Some(config) = self.scheduler_config {
279 Scheduler::new(
280 Arc::clone(&task_graph),
281 Arc::clone(&worker_pool),
282 Arc::clone(&metrics),
283 config,
284 )
285 } else {
286 Scheduler::with_defaults(
287 Arc::clone(&task_graph),
288 Arc::clone(&worker_pool),
289 Arc::clone(&metrics),
290 )
291 });
292
293 let coordinator = Arc::new(if let Some(config) = self.coordinator_config {
294 ClusterCoordinator::new(config)
295 } else {
296 ClusterCoordinator::with_defaults()
297 });
298
299 let fault_tolerance = Arc::new(if let Some(config) = self.fault_tolerance_config {
300 FaultToleranceManager::new(config)
301 } else {
302 FaultToleranceManager::with_defaults()
303 });
304
305 let locality_optimizer = Arc::new(if let Some(config) = self.locality_config {
306 DataLocalityOptimizer::new(config)
307 } else {
308 DataLocalityOptimizer::with_defaults()
309 });
310
311 let cache = Arc::new(if let Some(config) = self.cache_config {
312 DistributedCache::new(config)
313 } else {
314 DistributedCache::with_defaults()
315 });
316
317 let replication = Arc::new(if let Some(config) = self.replication_config {
318 ReplicationManager::new(config)
319 } else {
320 ReplicationManager::with_defaults()
321 });
322
323 Cluster {
324 task_graph,
325 worker_pool,
326 scheduler,
327 coordinator,
328 fault_tolerance,
329 locality_optimizer,
330 cache,
331 replication,
332 metrics,
333 }
334 }
335}
336
// `Default` simply delegates to `new` so the builder works with
// `ClusterBuilder::default()` and `..Default::default()` call sites.
impl Default for ClusterBuilder {
    fn default() -> Self {
        Self::new()
    }
}
342
/// Complete cluster instance.
///
/// The `Cluster` struct provides access to all cluster components including
/// the task graph, worker pool, scheduler, coordinator, fault tolerance,
/// data locality optimizer, distributed cache, and replication manager.
///
/// All components are `Arc`-wrapped public fields, so callers can clone a
/// handle to any individual component and use it independently.
///
/// # Examples
///
/// ```rust,no_run
/// use oxigdal_cluster::Cluster;
///
/// #[tokio::main]
/// async fn main() -> oxigdal_cluster::Result<()> {
///     // Create and start a cluster
///     let cluster = Cluster::new();
///     cluster.start().await?;
///
///     // Access cluster components
///     let metrics = cluster.metrics.snapshot();
///     println!("Tasks completed: {}", metrics.tasks_completed);
///
///     // Get cluster statistics
///     let stats = cluster.get_statistics();
///     println!("Total workers: {}", stats.worker_pool.total_workers);
///
///     // Stop the cluster
///     cluster.stop().await?;
///     Ok(())
/// }
/// ```
pub struct Cluster {
    /// Task graph
    pub task_graph: std::sync::Arc<TaskGraph>,

    /// Worker pool
    pub worker_pool: std::sync::Arc<WorkerPool>,

    /// Scheduler
    pub scheduler: std::sync::Arc<Scheduler>,

    /// Coordinator
    pub coordinator: std::sync::Arc<ClusterCoordinator>,

    /// Fault tolerance manager
    pub fault_tolerance: std::sync::Arc<FaultToleranceManager>,

    /// Data locality optimizer
    pub locality_optimizer: std::sync::Arc<DataLocalityOptimizer>,

    /// Distributed cache
    pub cache: std::sync::Arc<DistributedCache>,

    /// Replication manager
    pub replication: std::sync::Arc<ReplicationManager>,

    /// Cluster metrics
    pub metrics: std::sync::Arc<ClusterMetrics>,
}
401
402impl Cluster {
403 /// Create a new cluster with default configuration.
404 pub fn new() -> Self {
405 ClusterBuilder::new().build()
406 }
407
408 /// Start all cluster components.
409 pub async fn start(&self) -> Result<()> {
410 self.coordinator.start().await?;
411 self.scheduler.start().await?;
412 Ok(())
413 }
414
415 /// Stop all cluster components.
416 pub async fn stop(&self) -> Result<()> {
417 self.scheduler.stop().await?;
418 self.coordinator.stop().await?;
419 Ok(())
420 }
421
422 /// Get cluster-wide statistics.
423 pub fn get_statistics(&self) -> ClusterStatistics {
424 ClusterStatistics {
425 metrics: self.metrics.snapshot(),
426 scheduler: self.scheduler.get_statistics(),
427 worker_pool: self.worker_pool.get_statistics(),
428 coordinator: self.coordinator.get_statistics(),
429 fault_tolerance: self.fault_tolerance.get_statistics(),
430 locality: self.locality_optimizer.get_statistics(),
431 cache: self.cache.get_statistics(),
432 replication: self.replication.get_statistics(),
433 }
434 }
435}
436
// `Default` mirrors `Cluster::new`: a cluster built entirely from defaults.
impl Default for Cluster {
    fn default() -> Self {
        Self::new()
    }
}
442
/// Cluster-wide statistics.
///
/// A point-in-time aggregation of per-component statistics, as returned by
/// [`Cluster::get_statistics`]. Serializable so it can be exported to
/// monitoring endpoints or logs.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ClusterStatistics {
    /// Metrics snapshot
    pub metrics: MetricsSnapshot,

    /// Scheduler statistics
    pub scheduler: SchedulerStats,

    /// Worker pool statistics
    pub worker_pool: worker_pool::WorkerPoolStatistics,

    /// Coordinator statistics
    pub coordinator: coordinator::CoordinatorStatistics,

    /// Fault tolerance statistics
    pub fault_tolerance: fault_tolerance::FaultToleranceStatistics,

    /// Locality statistics
    pub locality: data_locality::LocalityStats,

    /// Cache statistics
    pub cache: cache_coherency::CacheStats,

    /// Replication statistics
    pub replication: replication::ReplicationStatistics,
}