oxirs-fuseki 0.2.4

SPARQL 1.1/1.2 HTTP protocol server with Fuseki-compatible configuration
//! Disaster Recovery
//!
//! Provides comprehensive disaster recovery capabilities including:
//! - Point-in-time recovery
//! - Automated failover
//! - Replication and synchronization
//! - Recovery testing and validation
//!
//! **v0.1.0 Final Enhancement**: Deep integration with StoreHealthMonitor
//! for intelligent failover decisions based on comprehensive health metrics.
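//!
//! # Example
//!
//! A minimal sketch of wiring up the manager; construction of the [`Store`]
//! and [`BackupManager`] is elided and assumed to happen elsewhere in server
//! setup:
//!
//! ```rust,ignore
//! use std::sync::Arc;
//!
//! let config = DisasterRecoveryConfig {
//!     enabled: true,
//!     auto_failover: true,
//!     ..Default::default()
//! };
//!
//! // `store: Arc<Store>` and `backup_manager: Arc<BackupManager>` are assumed
//! // to be created during server initialization.
//! let manager = Arc::new(DisasterRecoveryManager::with_health_monitoring(
//!     store,
//!     backup_manager,
//!     config,
//! ));
//!
//! // `start()` runs the monitoring loop, so spawn it as a background task.
//! let dr = Arc::clone(&manager);
//! tokio::spawn(async move { dr.start().await });
//! ```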

use crate::backup::{BackupManager, BackupMetadata};
use crate::error::{FusekiError, FusekiResult};
use crate::store::Store;
use crate::store_health::{HealthMonitorConfig, HealthStatus, StoreHealthMonitor};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use tokio::time;
use tracing::{debug, error, info, warn};

/// Disaster recovery manager
pub struct DisasterRecoveryManager {
    /// Store to protect
    store: Arc<Store>,
    /// Backup manager
    backup_manager: Arc<BackupManager>,
    /// Health monitor for comprehensive health checks
    health_monitor: Option<Arc<StoreHealthMonitor>>,
    /// DR configuration
    config: DisasterRecoveryConfig,
    /// Recovery state
    state: Arc<tokio::sync::RwLock<RecoveryState>>,
}

/// Disaster recovery configuration
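///
/// # Example (illustrative)
///
/// ```rust,ignore
/// let config = DisasterRecoveryConfig {
///     enabled: true,
///     rpo_minutes: 15,
///     rto_minutes: 10,
///     auto_failover: true,
///     ..Default::default()
/// };
/// ```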
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DisasterRecoveryConfig {
    /// Enable disaster recovery
    pub enabled: bool,
    /// Recovery point objective (RPO) in minutes
    pub rpo_minutes: u64,
    /// Recovery time objective (RTO) in minutes
    pub rto_minutes: u64,
    /// Enable automated failover
    pub auto_failover: bool,
    /// Replication targets
    pub replication_targets: Vec<ReplicationTarget>,
    /// Health check interval
    pub health_check_interval_secs: u64,
    /// Enable recovery testing
    pub enable_recovery_testing: bool,
    /// Recovery test interval (days)
    pub recovery_test_interval_days: u64,
}

impl Default for DisasterRecoveryConfig {
    fn default() -> Self {
        Self {
            enabled: false,
            rpo_minutes: 60, // 1 hour
            rto_minutes: 30, // 30 minutes
            auto_failover: false,
            replication_targets: Vec::new(),
            health_check_interval_secs: 30,
            enable_recovery_testing: true,
            recovery_test_interval_days: 7,
        }
    }
}

/// Replication target configuration
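///
/// Targets are tried in ascending `priority` order during failover.
///
/// # Example (illustrative)
///
/// ```rust,ignore
/// let target = ReplicationTarget {
///     name: "secondary".to_string(),
///     endpoint: "secondary.example.com".to_string(),
///     region: "eu-west-1".to_string(),
///     priority: 1, // lower number = higher priority
///     enabled: true,
/// };
/// ```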
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReplicationTarget {
    pub name: String,
    pub endpoint: String,
    pub region: String,
    pub priority: u32, // Lower number = higher priority
    pub enabled: bool,
}

/// Recovery state
#[derive(Debug, Clone)]
struct RecoveryState {
    healthy: bool,
    last_health_check: Option<DateTime<Utc>>,
    last_backup: Option<DateTime<Utc>>,
    last_recovery_test: Option<DateTime<Utc>>,
    failover_count: u64,
}

impl Default for RecoveryState {
    fn default() -> Self {
        Self {
            healthy: true,
            last_health_check: None,
            last_backup: None,
            last_recovery_test: None,
            failover_count: 0,
        }
    }
}

/// Recovery point information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryPoint {
    pub id: String,
    pub timestamp: DateTime<Utc>,
    pub backup_id: String,
    pub description: String,
    pub size_bytes: u64,
    pub verified: bool,
}

/// Failover result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FailoverResult {
    pub success: bool,
    pub target: String,
    pub duration_secs: u64,
    pub data_loss_minutes: u64,
    pub timestamp: DateTime<Utc>,
}

impl DisasterRecoveryManager {
    /// Create a new disaster recovery manager
    pub fn new(
        store: Arc<Store>,
        backup_manager: Arc<BackupManager>,
        config: DisasterRecoveryConfig,
    ) -> Self {
        Self {
            store,
            backup_manager,
            health_monitor: None,
            config,
            state: Arc::new(tokio::sync::RwLock::new(RecoveryState::default())),
        }
    }

    /// Create a new disaster recovery manager with health monitoring
    pub fn with_health_monitoring(
        store: Arc<Store>,
        backup_manager: Arc<BackupManager>,
        config: DisasterRecoveryConfig,
    ) -> Self {
        // Create health monitor with disaster recovery focused configuration
        let health_config = HealthMonitorConfig {
            check_interval: Duration::from_secs(config.health_check_interval_secs),
            max_history: 100,
            performance_window: Duration::from_secs(600),
            error_window: Duration::from_secs(3600),
            memory_warning_threshold: 3 * 1024 * 1024 * 1024, // 3GB
            max_connections: 1000,                            // Default for disaster recovery
            memory_critical_threshold: 7 * 1024 * 1024 * 1024, // 7GB (conservative for DR)
        };

        let health_monitor = Arc::new(StoreHealthMonitor::with_config(
            Arc::clone(&store),
            health_config,
        ));

        // Start background monitoring
        Arc::clone(&health_monitor).start_monitoring();

        Self {
            store,
            backup_manager,
            health_monitor: Some(health_monitor),
            config,
            state: Arc::new(tokio::sync::RwLock::new(RecoveryState::default())),
        }
    }

    /// Get the health monitor reference
    pub fn health_monitor(&self) -> Option<&Arc<StoreHealthMonitor>> {
        self.health_monitor.as_ref()
    }

    /// Start disaster recovery monitoring
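    ///
    /// If disaster recovery is enabled, this runs the monitoring loop
    /// indefinitely, so callers typically spawn it on a background task.
    ///
    /// # Example (illustrative sketch, assuming the manager is wrapped in an `Arc`)
    ///
    /// ```rust,ignore
    /// let dr = Arc::clone(&manager);
    /// tokio::spawn(async move { dr.start().await });
    /// ```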
    pub async fn start(&self) -> FusekiResult<()> {
        if !self.config.enabled {
            info!("Disaster recovery disabled");
            return Ok(());
        }

        info!(
            "Starting disaster recovery (RPO: {}min, RTO: {}min)",
            self.config.rpo_minutes, self.config.rto_minutes
        );

        loop {
            if let Err(e) = self.health_check().await {
                error!("Health check failed: {}", e);

                if self.config.auto_failover {
                    warn!("Initiating automated failover");
                    if let Err(e) = self.perform_failover().await {
                        error!("Automated failover failed: {}", e);
                    }
                }
            }

            // Check if recovery test is due
            if self.config.enable_recovery_testing {
                if let Err(e) = self.check_recovery_test_schedule().await {
                    error!("Recovery test check failed: {}", e);
                }
            }

            time::sleep(Duration::from_secs(self.config.health_check_interval_secs)).await;
        }
    }

    /// Perform comprehensive health check using health monitor
    async fn health_check(&self) -> FusekiResult<()> {
        debug!("Performing disaster recovery health check");

        let mut state = self.state.write().await;
        state.last_health_check = Some(Utc::now());

        // Use health monitor if available for comprehensive checks
        if let Some(health_monitor) = &self.health_monitor {
            let health = health_monitor.check_health().await?;

            // Evaluate health status for DR purposes
            match health.status {
                HealthStatus::Healthy => {
                    debug!("Store health: HEALTHY (score: {})", health.health_score);
                    state.healthy = true;
                }
                HealthStatus::Degraded => {
                    warn!(
                        "Store health: DEGRADED (score: {}). Monitoring closely.",
                        health.health_score
                    );

                    // Degraded but not critical - no failover yet
                    if health.health_score < 50 {
                        warn!(
                            "Health score critically low ({}). Preparing for failover.",
                            health.health_score
                        );
                        state.healthy = false;
                        return Err(FusekiError::internal(format!(
                            "Store health degraded below threshold (score: {})",
                            health.health_score
                        )));
                    }

                    state.healthy = true; // Still operational
                }
                HealthStatus::Unhealthy | HealthStatus::Down => {
                    error!(
                        "Store health: {:?} (score: {}). Failover required!",
                        health.status, health.health_score
                    );

                    // Log component failures
                    for component in &health.components {
                        if component.status == HealthStatus::Unhealthy
                            || component.status == HealthStatus::Down
                        {
                            error!(
                                "Component {} is {:?}: {}",
                                component.name,
                                component.status,
                                component.message.as_deref().unwrap_or("Unknown issue")
                            );
                        }
                    }

                    state.healthy = false;
                    return Err(FusekiError::internal(format!(
                        "Store is {:?} (score: {})",
                        health.status, health.health_score
                    )));
                }
            }

            // Check performance metrics
            if health.performance.avg_query_latency_ms > 5000.0 {
                warn!(
                    "Average query latency is very high: {:.2}ms",
                    health.performance.avg_query_latency_ms
                );
            }

            // Check resource utilization
            if health.resources.memory_usage_percent > 90.0 {
                warn!(
                    "Memory usage critical: {:.1}%",
                    health.resources.memory_usage_percent
                );
            }

            // Check error rates
            if health.errors.errors_last_hour > 100 {
                warn!(
                    "High error rate: {} errors in last hour",
                    health.errors.errors_last_hour
                );
            }
        } else {
            // Fallback to basic health check
            debug!("Using basic health check (no health monitor available)");

            // Check if store is ready
            if !self.store.is_ready() {
                state.healthy = false;
                return Err(FusekiError::internal("Store is not ready".to_string()));
            }
        }

        // Check if backup is within RPO
        if let Some(last_backup) = state.last_backup {
            let age_minutes = (Utc::now() - last_backup).num_minutes() as u64;
            if age_minutes > self.config.rpo_minutes {
                warn!(
                    "Backup age ({} min) exceeds RPO ({} min)",
                    age_minutes, self.config.rpo_minutes
                );

                // RPO violation is serious but not immediate failover
                if age_minutes > self.config.rpo_minutes * 2 {
                    state.healthy = false;
                    return Err(FusekiError::internal(format!(
                        "Critical RPO violation: {} minutes since last backup",
                        age_minutes
                    )));
                }
            }
        } else {
            debug!("No backup history available yet");
        }

        state.healthy = true;
        Ok(())
    }

    /// Perform failover to replica
    async fn perform_failover(&self) -> FusekiResult<FailoverResult> {
        info!("Starting failover procedure");

        let start_time = Utc::now();

        // Sort replication targets by priority
        let mut targets = self.config.replication_targets.clone();
        targets.sort_by_key(|t| t.priority);

        for target in targets.iter().filter(|t| t.enabled) {
            info!("Attempting failover to: {}", target.name);

            match self.failover_to_target(target).await {
                Ok(_) => {
                    let duration = (Utc::now() - start_time).num_seconds() as u64;

                    let mut state = self.state.write().await;
                    state.failover_count += 1;

                    let result = FailoverResult {
                        success: true,
                        target: target.name.clone(),
                        duration_secs: duration,
                        data_loss_minutes: 0, // Placeholder: actual data loss would be derived from replication lag
                        timestamp: Utc::now(),
                    };

                    info!("Failover successful to {} in {}s", target.name, duration);

                    return Ok(result);
                }
                Err(e) => {
                    warn!("Failover to {} failed: {}", target.name, e);
                    continue;
                }
            }
        }

        Err(FusekiError::internal(
            "All failover targets unavailable".to_string(),
        ))
    }

    /// Failover to specific target
    async fn failover_to_target(&self, _target: &ReplicationTarget) -> FusekiResult<()> {
        // In real implementation:
        // 1. Verify target is healthy
        // 2. Promote replica to primary
        // 3. Update DNS/load balancer
        // 4. Verify write operations work

        info!("Failover target verification would happen here");
        Ok(())
    }

    /// Create recovery point
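    ///
    /// Triggers a backup via the [`BackupManager`] and records the backup
    /// time in the DR state.
    ///
    /// # Example (illustrative)
    ///
    /// ```rust,ignore
    /// let point = manager
    ///     .create_recovery_point("before schema migration".to_string())
    ///     .await?;
    /// println!("created recovery point {}", point.id);
    /// ```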
    pub async fn create_recovery_point(&self, description: String) -> FusekiResult<RecoveryPoint> {
        info!("Creating recovery point: {}", description);

        // Trigger backup
        let backup_meta = self.backup_manager.perform_backup().await?;

        let recovery_point = RecoveryPoint {
            id: format!("rp-{}", Utc::now().format("%Y%m%d-%H%M%S")),
            timestamp: Utc::now(),
            backup_id: backup_meta.id.clone(),
            description,
            size_bytes: backup_meta.size_bytes,
            verified: false,
        };

        let mut state = self.state.write().await;
        state.last_backup = Some(Utc::now());

        info!("Recovery point created: {}", recovery_point.id);
        Ok(recovery_point)
    }

    /// Restore to recovery point
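    ///
    /// Finds the backup associated with `recovery_point_id` and restores it.
    ///
    /// # Example (illustrative)
    ///
    /// ```rust,ignore
    /// manager.restore_to_point("rp-20250101-120000").await?;
    /// ```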
    pub async fn restore_to_point(&self, recovery_point_id: &str) -> FusekiResult<()> {
        info!("Restoring to recovery point: {}", recovery_point_id);

        // Find corresponding backup
        let backups = self.backup_manager.list_backups().await?;

        let backup = backups
            .iter()
            .find(|b| b.id.contains(recovery_point_id))
            .ok_or_else(|| {
                FusekiError::internal(format!("Recovery point not found: {}", recovery_point_id))
            })?;

        // Restore from backup
        self.backup_manager.restore_backup(&backup.id).await?;

        info!("Restore completed");
        Ok(())
    }

    /// Test recovery procedure
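    ///
    /// # Example (illustrative)
    ///
    /// ```rust,ignore
    /// let report = manager.test_recovery().await?;
    /// assert!(report.rto_met, "recovery exceeded the RTO target");
    /// ```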
    pub async fn test_recovery(&self) -> FusekiResult<RecoveryTestReport> {
        info!("Starting recovery test");

        let start_time = Utc::now();

        // Create test backup
        let backup = self.backup_manager.perform_backup().await?;

        // Attempt restore in isolated environment
        // In real implementation, this would:
        // 1. Spin up temporary instance
        // 2. Restore backup to temp instance
        // 3. Verify data integrity
        // 4. Measure RTO
        // 5. Clean up temp instance

        let duration = (Utc::now() - start_time).num_seconds() as u64;
        let rto_met = duration < (self.config.rto_minutes * 60);

        let mut state = self.state.write().await;
        state.last_recovery_test = Some(Utc::now());

        let report = RecoveryTestReport {
            test_time: Utc::now(),
            backup_id: backup.id,
            success: rto_met,
            duration_secs: duration,
            rto_target_secs: self.config.rto_minutes * 60,
            rto_met,
            data_integrity_verified: true,
            notes: if rto_met {
                "Recovery test passed".to_string()
            } else {
                format!(
                    "RTO not met: {}s actual vs {}s target",
                    duration,
                    self.config.rto_minutes * 60
                )
            },
        };

        info!(
            "Recovery test completed: {} ({}s)",
            if report.success { "PASS" } else { "FAIL" },
            duration
        );

        Ok(report)
    }

    /// Check if recovery test is due
    async fn check_recovery_test_schedule(&self) -> FusekiResult<()> {
        let state = self.state.read().await;

        if let Some(last_test) = state.last_recovery_test {
            let days_since = (Utc::now() - last_test).num_days() as u64;
            if days_since >= self.config.recovery_test_interval_days {
                drop(state); // Release lock before test
                info!("Recovery test is due");
                self.test_recovery().await?;
            }
        } else {
            drop(state);
            info!("Running initial recovery test");
            self.test_recovery().await?;
        }

        Ok(())
    }

    /// Get DR status
    pub async fn get_status(&self) -> DisasterRecoveryStatus {
        let state = self.state.read().await;

        DisasterRecoveryStatus {
            enabled: self.config.enabled,
            healthy: state.healthy,
            rpo_minutes: self.config.rpo_minutes,
            rto_minutes: self.config.rto_minutes,
            last_health_check: state.last_health_check,
            last_backup: state.last_backup,
            last_recovery_test: state.last_recovery_test,
            failover_count: state.failover_count,
            replication_targets: self.config.replication_targets.len(),
        }
    }
}

/// Recovery test report
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryTestReport {
    pub test_time: DateTime<Utc>,
    pub backup_id: String,
    pub success: bool,
    pub duration_secs: u64,
    pub rto_target_secs: u64,
    pub rto_met: bool,
    pub data_integrity_verified: bool,
    pub notes: String,
}

/// Disaster recovery status
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DisasterRecoveryStatus {
    pub enabled: bool,
    pub healthy: bool,
    pub rpo_minutes: u64,
    pub rto_minutes: u64,
    pub last_health_check: Option<DateTime<Utc>>,
    pub last_backup: Option<DateTime<Utc>>,
    pub last_recovery_test: Option<DateTime<Utc>>,
    pub failover_count: u64,
    pub replication_targets: usize,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dr_config_default() {
        let config = DisasterRecoveryConfig::default();
        assert!(!config.enabled);
        assert_eq!(config.rpo_minutes, 60);
        assert_eq!(config.rto_minutes, 30);
    }

    #[test]
    fn test_replication_target_priority() {
        #[allow(clippy::useless_vec)] // Need vec for sorting in place
        let mut targets = vec![
            ReplicationTarget {
                name: "backup".to_string(),
                endpoint: "backup.example.com".to_string(),
                region: "us-west-2".to_string(),
                priority: 2,
                enabled: true,
            },
            ReplicationTarget {
                name: "primary".to_string(),
                endpoint: "primary.example.com".to_string(),
                region: "us-east-1".to_string(),
                priority: 1,
                enabled: true,
            },
        ];

        targets.sort_by_key(|t| t.priority);
        assert_eq!(targets[0].name, "primary");
        assert_eq!(targets[1].name, "backup");
    }
}