use std::sync::Arc;
use std::time::{Duration, Instant};
use crate::error::{Result, Error};
use crate::distributed::execution::{ExecutionPlan, ExecutionResult, ExecutionMetrics};
use crate::distributed::fault_tolerance::{FaultToleranceHandler, FaultTolerantContext, RecoveryAction, FailureType, FailureInfo};
use crate::distributed::partition::PartitionSet;
use super::context::DataFusionContext;
impl FaultTolerantContext for DataFusionContext {
fn execute_with_fault_tolerance(
&self,
plan: &ExecutionPlan,
fault_handler: &FaultToleranceHandler,
) -> Result<ExecutionResult> {
let start_time = Instant::now();
let execution_id = format!("exec-{}", uuid::Uuid::new_v4());
let result = fault_handler.execute_with_retry(|| {
self.execute(plan)
});
match result {
Ok(execution_result) => {
log::info!(
"Successfully executed plan with ID {} in {:?}",
execution_id,
start_time.elapsed()
);
Ok(execution_result)
},
Err(error) => {
log::error!(
"Execution failed for plan with ID {}: {}",
execution_id,
error
);
match fault_handler.handle_failure(plan, &error)? {
Some(recovery_action) => self.recover_from_failure(recovery_action),
None => Err(error),
}
}
}
}
fn recover_from_failure(
&self,
action: RecoveryAction,
) -> Result<ExecutionResult> {
match action {
RecoveryAction::RetryQuery { plan, delay } => {
if !delay.is_zero() {
std::thread::sleep(delay);
}
log::info!("Retrying entire query after failure");
self.execute(&plan)
},
RecoveryAction::RetryFailedPartitions { plan, partition_ids, delay } => {
if !delay.is_zero() {
std::thread::sleep(delay);
}
log::info!(
"Retrying {} failed partitions: {:?}",
partition_ids.len(),
partition_ids
);
self.execute(&plan)
},
RecoveryAction::Reroute { plan, excluded_nodes } => {
log::info!(
"Rerouting execution to avoid {} nodes: {:?}",
excluded_nodes.len(),
excluded_nodes
);
self.execute(&plan)
},
RecoveryAction::LocalFallback { plan } => {
log::info!("Falling back to local execution after distributed failure");
self.execute(&plan)
},
}
}
}
pub fn create_fault_tolerant_context(
context: DataFusionContext,
fault_handler: FaultToleranceHandler,
) -> FaultTolerantDataFusionContext {
FaultTolerantDataFusionContext {
inner: context,
fault_handler: Arc::new(fault_handler),
}
}
/// A `DataFusionContext` paired with a fault-tolerance handler, so that
/// `execute` calls are automatically retried/recovered.
pub struct FaultTolerantDataFusionContext {
    // The underlying execution context all calls are delegated to.
    inner: DataFusionContext,
    // Shared handler that drives retry and recovery policy; `Arc` so callers
    // can hold their own handle via `fault_handler()`.
    fault_handler: Arc<FaultToleranceHandler>,
}
impl FaultTolerantDataFusionContext {
    /// Wraps `context` together with `fault_handler` (taking ownership of both).
    pub fn new(context: DataFusionContext, fault_handler: FaultToleranceHandler) -> Self {
        let fault_handler = Arc::new(fault_handler);
        Self { inner: context, fault_handler }
    }

    /// Executes `plan` with automatic retry/recovery via the stored handler.
    pub fn execute(&self, plan: &ExecutionPlan) -> Result<ExecutionResult> {
        self.inner.execute_with_fault_tolerance(plan, &self.fault_handler)
    }

    /// Borrows the wrapped context for operations not covered by this wrapper.
    pub fn inner(&self) -> &DataFusionContext {
        &self.inner
    }

    /// Returns a shared handle to the fault-tolerance handler.
    pub fn fault_handler(&self) -> Arc<FaultToleranceHandler> {
        // Explicit Arc::clone: this is a refcount bump, not a deep copy.
        Arc::clone(&self.fault_handler)
    }

    /// Registers an in-memory dataset on the inner context.
    pub fn register_dataset(&mut self, name: &str, partitions: PartitionSet) -> Result<()> {
        self.inner.register_dataset(name, partitions)
    }

    /// Registers a CSV file on the inner context.
    pub fn register_csv(&mut self, name: &str, path: &str) -> Result<()> {
        self.inner.register_csv(name, path)
    }

    /// Registers a Parquet file on the inner context.
    pub fn register_parquet(&mut self, name: &str, path: &str) -> Result<()> {
        self.inner.register_parquet(name, path)
    }

    /// Renders an explanation of `plan`, optionally with statistics.
    pub fn explain_plan(&self, plan: &ExecutionPlan, with_statistics: bool) -> Result<String> {
        self.inner.explain_plan(plan, with_statistics)
    }
}