oxigdal-distributed 0.1.4

Distributed processing capabilities for OxiGDAL using Apache Arrow Flight
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
//! Task definitions and management for distributed processing.
//!
//! This module defines the task types and execution logic for distributed
//! geospatial processing operations.

use crate::error::{DistributedError, Result};
use arrow::record_batch::RecordBatch;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::sync::Arc;

/// Unique identifier for a task.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TaskId(pub u64);

impl fmt::Display for TaskId {
    // Renders as `Task(<id>)`, e.g. `Task(42)`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self(id) = self;
        write!(f, "Task({})", id)
    }
}

/// Unique identifier for a partition.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct PartitionId(pub u64);

impl fmt::Display for PartitionId {
    // Renders as `Partition(<id>)`, e.g. `Partition(7)`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self(id) = self;
        write!(f, "Partition({})", id)
    }
}

/// Status of a task execution.
///
/// Lifecycle: `Pending` -> `Running` -> (`Completed` | `Failed` | `Cancelled`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TaskStatus {
    /// Task is pending execution.
    Pending,
    /// Task is currently being executed.
    Running,
    /// Task completed successfully.
    Completed,
    /// Task failed with an error.
    Failed,
    /// Task was cancelled.
    Cancelled,
}

impl fmt::Display for TaskStatus {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Map each variant to its human-readable label, then emit it once.
        let label = match self {
            Self::Pending => "Pending",
            Self::Running => "Running",
            Self::Completed => "Completed",
            Self::Failed => "Failed",
            Self::Cancelled => "Cancelled",
        };
        f.write_str(label)
    }
}

/// Type of geospatial operation to perform.
///
/// Each variant carries the parameters needed to run that operation on a
/// single partition of data; the enum derives serde traits so tasks can be
/// serialized when shipped to workers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TaskOperation {
    /// Apply a filter to data.
    Filter {
        /// Filter expression.
        expression: String,
    },
    /// Calculate a raster index (NDVI, NDWI, etc.).
    CalculateIndex {
        /// Index type.
        index_type: String,
        /// Band indices for calculation.
        // NOTE(review): presumably zero-based band indices — confirm against
        // the worker-side executor.
        bands: Vec<usize>,
    },
    /// Reproject data to a different CRS.
    Reproject {
        /// Target EPSG code.
        target_epsg: i32,
    },
    /// Resample raster data.
    Resample {
        /// Target width.
        width: usize,
        /// Target height.
        height: usize,
        /// Resampling method.
        // NOTE(review): stringly-typed; the accepted method names are defined
        // by the worker-side executor, not validated here.
        method: String,
    },
    /// Clip data to a bounding box.
    Clip {
        /// Minimum X coordinate.
        min_x: f64,
        /// Minimum Y coordinate.
        min_y: f64,
        /// Maximum X coordinate.
        max_x: f64,
        /// Maximum Y coordinate.
        max_y: f64,
    },
    /// Apply a convolution kernel.
    Convolve {
        /// Kernel values.
        // NOTE(review): presumably kernel_width * kernel_height elements in
        // row-major order — TODO confirm with the executor.
        kernel: Vec<f64>,
        /// Kernel width.
        kernel_width: usize,
        /// Kernel height.
        kernel_height: usize,
    },
    /// Custom user-defined operation.
    Custom {
        /// Operation name.
        name: String,
        /// JSON-serialized parameters.
        params: String,
    },
}

/// A task to be executed by a worker.
///
/// Pairs one [`TaskOperation`] with the partition it should run on, and
/// tracks scheduling state: current [`TaskStatus`], assigned worker, and
/// the retry budget.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Task {
    /// Unique task identifier.
    pub id: TaskId,
    /// Partition to process.
    pub partition_id: PartitionId,
    /// Operation to perform.
    pub operation: TaskOperation,
    /// Current status.
    pub status: TaskStatus,
    /// Worker ID assigned to this task (if any).
    pub worker_id: Option<String>,
    /// Number of retry attempts. Incremented by `mark_failed`.
    pub retry_count: u32,
    /// Maximum number of retries allowed (defaults to 3 in [`Task::new`]).
    pub max_retries: u32,
}

impl Task {
    /// Create a new task in the `Pending` state.
    ///
    /// The task starts unassigned (no worker), with a retry counter of zero
    /// and a default budget of 3 retries.
    pub fn new(id: TaskId, partition_id: PartitionId, operation: TaskOperation) -> Self {
        Self {
            status: TaskStatus::Pending,
            worker_id: None,
            retry_count: 0,
            max_retries: 3,
            id,
            partition_id,
            operation,
        }
    }

    /// Whether the retry budget has not yet been exhausted.
    pub fn can_retry(&self) -> bool {
        self.max_retries > self.retry_count
    }

    /// Transition to `Running`, recording which worker picked the task up.
    pub fn mark_running(&mut self, worker_id: String) {
        self.worker_id = Some(worker_id);
        self.status = TaskStatus::Running;
    }

    /// Transition to `Completed`.
    pub fn mark_completed(&mut self) {
        self.status = TaskStatus::Completed;
    }

    /// Transition to `Failed`, consuming one unit of retry budget.
    pub fn mark_failed(&mut self) {
        self.retry_count += 1;
        self.status = TaskStatus::Failed;
    }

    /// Transition to `Cancelled`.
    pub fn mark_cancelled(&mut self) {
        self.status = TaskStatus::Cancelled;
    }

    /// Put the task back into `Pending` and clear its worker assignment so
    /// it can be scheduled again. The retry counter is left untouched.
    pub fn reset_for_retry(&mut self) {
        self.worker_id = None;
        self.status = TaskStatus::Pending;
    }
}

/// Result of a task execution.
///
/// Exactly one of `data` / `error` is populated: `data` on success, `error`
/// on failure (see [`TaskResult::success`] and [`TaskResult::failure`]).
#[derive(Debug, Clone)]
pub struct TaskResult {
    /// Task identifier.
    pub task_id: TaskId,
    /// Resulting data as Arrow RecordBatch. `None` when the task failed.
    pub data: Option<Arc<RecordBatch>>,
    /// Execution time in milliseconds (recorded for failures too).
    pub execution_time_ms: u64,
    /// Error message if task failed. `None` on success.
    pub error: Option<String>,
}

impl TaskResult {
    /// Build a successful result carrying the produced batch.
    pub fn success(task_id: TaskId, data: Arc<RecordBatch>, execution_time_ms: u64) -> Self {
        Self {
            task_id,
            execution_time_ms,
            data: Some(data),
            error: None,
        }
    }

    /// Build a failed result carrying the error message.
    pub fn failure(task_id: TaskId, error: String, execution_time_ms: u64) -> Self {
        Self {
            task_id,
            execution_time_ms,
            data: None,
            error: Some(error),
        }
    }

    /// True when no error was recorded.
    pub fn is_success(&self) -> bool {
        self.error.is_none()
    }

    /// True when an error was recorded; the inverse of [`Self::is_success`].
    pub fn is_failure(&self) -> bool {
        !self.is_success()
    }
}

/// Task execution context with metadata.
///
/// Carries per-task resource limits handed to a worker. Defaults (set in
/// [`TaskContext::new`]) are 1 GiB of memory and all available CPU cores.
#[derive(Debug, Clone)]
pub struct TaskContext {
    /// Task identifier.
    pub task_id: TaskId,
    /// Worker identifier executing this task.
    pub worker_id: String,
    /// Total memory available (bytes).
    pub memory_limit: u64,
    /// Number of CPU cores available.
    pub num_cores: usize,
}

impl TaskContext {
    /// Create a context with default resource limits: 1 GiB of memory and
    /// every CPU core the host reports.
    pub fn new(task_id: TaskId, worker_id: String) -> Self {
        Self {
            task_id,
            worker_id,
            // 1 GiB — same value as 1024 * 1024 * 1024.
            memory_limit: 1 << 30,
            num_cores: num_cpus(),
        }
    }

    /// Builder-style setter for the memory limit (bytes).
    pub fn with_memory_limit(mut self, bytes: u64) -> Self {
        self.memory_limit = bytes;
        self
    }

    /// Builder-style setter for the available core count.
    pub fn with_num_cores(mut self, count: usize) -> Self {
        self.num_cores = count;
        self
    }
}

/// Number of CPU cores the host reports, falling back to 1 when the query
/// fails (e.g. unsupported platform or restricted environment).
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(n) => n.get(),
        Err(_) => 1,
    }
}

/// Task scheduler for managing task execution order.
///
/// Tasks move between four state lists: `pending` -> `running` ->
/// (`completed` | `failed`), with failed tasks that still have retry budget
/// re-entering `pending`.
///
/// NOTE(review): `pending` is drained with `Vec::pop` in `next_task`, so
/// dispatch order is LIFO (most recently added first), not FIFO as the word
/// "queue" suggests — confirm whether FIFO dispatch is intended.
#[derive(Debug)]
pub struct TaskScheduler {
    /// Pending tasks, popped LIFO from the back by `next_task`.
    pending: Vec<Task>,
    /// Currently running tasks.
    running: Vec<Task>,
    /// Successfully completed tasks.
    completed: Vec<Task>,
    /// Tasks that failed and exhausted their retry budget.
    failed: Vec<Task>,
}

impl TaskScheduler {
    /// Create an empty scheduler with no tasks in any state.
    pub fn new() -> Self {
        Self {
            pending: vec![],
            running: vec![],
            completed: vec![],
            failed: vec![],
        }
    }

    /// Add a task to the pending set.
    pub fn add_task(&mut self, task: Task) {
        self.pending.push(task);
    }

    /// Take the next pending task, or `None` when nothing is pending.
    ///
    /// Tasks come out LIFO (most recently added first) since this pops from
    /// the back of the pending list.
    pub fn next_task(&mut self) -> Option<Task> {
        self.pending.pop()
    }

    /// Record that `task` has been dispatched to `worker_id`.
    pub fn mark_running(&mut self, mut task: Task, worker_id: String) {
        task.mark_running(worker_id);
        self.running.push(task);
    }

    // Remove and return the running task with `task_id`, or report a
    // coordinator error when it is not currently running.
    fn take_running(&mut self, task_id: TaskId) -> Result<Task> {
        let pos = self
            .running
            .iter()
            .position(|t| t.id == task_id)
            .ok_or_else(|| {
                DistributedError::coordinator(format!(
                    "Task {} not found in running tasks",
                    task_id
                ))
            })?;
        Ok(self.running.remove(pos))
    }

    /// Move a running task to the completed set.
    ///
    /// # Errors
    ///
    /// Returns a coordinator error if `task_id` is not in the running set.
    pub fn mark_completed(&mut self, task_id: TaskId) -> Result<()> {
        let mut task = self.take_running(task_id)?;
        task.mark_completed();
        self.completed.push(task);
        Ok(())
    }

    /// Record a failure for a running task.
    ///
    /// If the task still has retry budget it is reset and re-queued as
    /// pending; otherwise it moves to the failed set.
    ///
    /// # Errors
    ///
    /// Returns a coordinator error if `task_id` is not in the running set.
    pub fn mark_failed(&mut self, task_id: TaskId) -> Result<()> {
        let mut task = self.take_running(task_id)?;
        task.mark_failed();
        if task.can_retry() {
            task.reset_for_retry();
            self.pending.push(task);
        } else {
            self.failed.push(task);
        }
        Ok(())
    }

    /// Number of tasks waiting to be dispatched.
    pub fn pending_count(&self) -> usize {
        self.pending.len()
    }

    /// Number of tasks currently dispatched to workers.
    pub fn running_count(&self) -> usize {
        self.running.len()
    }

    /// Number of tasks that finished successfully.
    pub fn completed_count(&self) -> usize {
        self.completed.len()
    }

    /// Number of tasks that failed permanently (retries exhausted).
    pub fn failed_count(&self) -> usize {
        self.failed.len()
    }

    /// True when nothing is pending or running. Note this is also true when
    /// some tasks ended up in the failed set.
    pub fn is_complete(&self) -> bool {
        self.pending.is_empty() && self.running.is_empty()
    }
}

impl Default for TaskScheduler {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Build a `Filter` task with the given task id, partition id and
    // filter expression — shared fixture for the tests below.
    fn filter_task(id: u64, partition: u64, expr: &str) -> Task {
        Task::new(
            TaskId(id),
            PartitionId(partition),
            TaskOperation::Filter {
                expression: expr.to_string(),
            },
        )
    }

    #[test]
    fn test_task_creation() {
        let task = filter_task(1, 0, "value > 10");

        assert_eq!(task.id, TaskId(1));
        assert_eq!(task.partition_id, PartitionId(0));
        assert_eq!(task.status, TaskStatus::Pending);
        assert!(task.worker_id.is_none());
    }

    #[test]
    fn test_task_lifecycle() {
        let mut task = filter_task(1, 0, "value > 10");

        // Pending -> Running picks up the worker assignment.
        task.mark_running("worker-1".to_string());
        assert_eq!(task.status, TaskStatus::Running);
        assert_eq!(task.worker_id, Some("worker-1".to_string()));

        // Running -> Completed.
        task.mark_completed();
        assert_eq!(task.status, TaskStatus::Completed);
    }

    #[test]
    fn test_task_retry() {
        let mut task = filter_task(1, 0, "value > 10");
        task.max_retries = 2;

        // Each failure consumes one unit of the retry budget.
        assert!(task.can_retry());
        task.mark_failed();
        assert_eq!(task.retry_count, 1);
        assert!(task.can_retry());

        task.mark_failed();
        assert_eq!(task.retry_count, 2);
        assert!(!task.can_retry());
    }

    #[test]
    fn test_task_scheduler() -> std::result::Result<(), Box<dyn std::error::Error>> {
        let mut scheduler = TaskScheduler::new();
        scheduler.add_task(filter_task(1, 0, "value > 10"));
        scheduler.add_task(filter_task(2, 1, "value < 100"));

        assert_eq!(scheduler.pending_count(), 2);
        assert_eq!(scheduler.running_count(), 0);

        // `next_task` pops from the back of the pending list (LIFO), so
        // task 2 — the most recently added — comes out first.
        let dispatched = scheduler
            .next_task()
            .ok_or_else(|| Box::<dyn std::error::Error>::from("should have task"))?;
        scheduler.mark_running(dispatched, "worker-1".to_string());

        assert_eq!(scheduler.pending_count(), 1);
        assert_eq!(scheduler.running_count(), 1);

        scheduler.mark_completed(TaskId(2))?;

        assert_eq!(scheduler.running_count(), 0);
        assert_eq!(scheduler.completed_count(), 1);
        Ok(())
    }

    #[test]
    fn test_task_context() {
        let two_gib: u64 = 2 * 1024 * 1024 * 1024;
        let ctx = TaskContext::new(TaskId(1), "worker-1".to_string())
            .with_memory_limit(two_gib)
            .with_num_cores(4);

        assert_eq!(ctx.task_id, TaskId(1));
        assert_eq!(ctx.worker_id, "worker-1");
        assert_eq!(ctx.memory_limit, two_gib);
        assert_eq!(ctx.num_cores, 4);
    }
}