asupersync 0.3.4

//! OTLP cancellation span loss audit test.
//!
//! **AUDIT SCOPE**: Verifies OTLP-Trace exporter behavior when the exporter task
//! is cancelled mid-batch-send, focusing on pending span handling.
//!
//! **OTLP BEST PRACTICE REQUIREMENTS**:
//! - When exporter is cancelled mid-batch, pending spans should be handled gracefully
//! - Option (a): Save to disk for later retry (overkill, adds complexity)
//! - Option (b): Drop with metric (correct: bounded loss with visibility)
//! - Option (c): Silently lost (worst: invisible data loss)
//! - Visibility principle: Operators must know when telemetry data is lost
//! - Bounded loss principle: Prefer dropping data over unbounded memory growth
//!
//! **CURRENT BEHAVIOR ANALYSIS**:
//! - send_otlp_protobuf() is async and takes Cx cancellation context
//! - Multiple await points where cancellation can occur (lines 946, 977, 985)
//! - No visible span loss metrics or error handling for cancelled exports
//! - Likely implements option (c): silently lost spans
//!
//! **CRITICAL GAP IDENTIFIED**:
//! - No cancellation-aware span counting or loss metrics
//! - Cancelled exports provide no visibility into lost telemetry data
//! - Operators cannot distinguish between successful export and cancellation

#![cfg(test)]

use std::sync::{Arc, Mutex};
use std::time::Duration;

/// Span batch fixture for testing export cancellation scenarios.
#[derive(Debug, Clone)]
pub struct SpanBatchFixture {
    pub span_count: usize,
    pub batch_id: String,
    pub size_bytes: usize,
}

impl SpanBatchFixture {
    fn new(batch_id: &str, span_count: usize) -> Self {
        Self {
            span_count,
            batch_id: batch_id.to_string(),
            size_bytes: span_count * 1024, // Approximate 1KB per span
        }
    }
}

/// Export metrics fixture for tracking span loss.
#[derive(Debug, Default)]
pub struct ExportMetricsFixture {
    pub spans_exported_success: usize,
    pub spans_dropped_cancellation: usize,
    pub spans_dropped_error: usize,
    pub batches_cancelled: usize,
    pub export_attempts: usize,
}

impl ExportMetricsFixture {
    fn record_export_success(&mut self, span_count: usize) {
        self.spans_exported_success += span_count;
        self.export_attempts += 1;
    }

    fn record_export_cancellation(&mut self, span_count: usize) {
        self.spans_dropped_cancellation += span_count;
        self.batches_cancelled += 1;
        self.export_attempts += 1;
    }

    fn record_export_error(&mut self, span_count: usize) {
        self.spans_dropped_error += span_count;
        self.export_attempts += 1;
    }

    fn total_spans_lost(&self) -> usize {
        self.spans_dropped_cancellation + self.spans_dropped_error
    }

    fn data_loss_rate(&self) -> f64 {
        let total_spans = self.spans_exported_success + self.total_spans_lost();
        if total_spans == 0 {
            0.0
        } else {
            self.total_spans_lost() as f64 / total_spans as f64
        }
    }
}

/// OTLP exporter fixture for testing cancellation behavior.
#[derive(Debug)]
pub struct CancellationAwareExporterFixture {
    pub metrics: Arc<Mutex<ExportMetricsFixture>>,
    pub export_results: Vec<(String, String)>, // (batch_id, result)
    pub should_cancel_after_ms: Option<u64>,
    pub should_error: bool,
}

impl CancellationAwareExporterFixture {
    fn new() -> Self {
        Self {
            metrics: Arc::new(Mutex::new(ExportMetricsFixture::default())),
            export_results: Vec::new(),
            should_cancel_after_ms: None,
            should_error: false,
        }
    }

    fn with_cancellation_after(mut self, ms: u64) -> Self {
        self.should_cancel_after_ms = Some(ms);
        self
    }

    fn with_error(mut self) -> Self {
        self.should_error = true;
        self
    }

    /// Current defective implementation: no cancellation awareness.
    async fn export_batch_defective(&mut self, batch: SpanBatchFixture) -> Result<(), String> {
        // Simulate export delay where cancellation can occur
        if let Some(cancel_ms) = self.should_cancel_after_ms {
            std::thread::sleep(Duration::from_millis(cancel_ms / 2));
            // Simulate cancellation mid-export (spans silently lost)
            self.export_results.push((batch.batch_id.clone(), "cancelled_silent".to_string()));
            return Err("Export cancelled".to_string());
        }

        if self.should_error {
            // Error case - spans silently lost
            self.export_results.push((batch.batch_id.clone(), "error_silent".to_string()));
            return Err("Export failed".to_string());
        }

        // Success case
        self.export_results.push((batch.batch_id.clone(), "success".to_string()));
        Ok(())
    }

    /// Correct implementation: cancellation-aware with metrics.
    async fn export_batch_correct(&mut self, batch: SpanBatchFixture) -> Result<(), String> {
        // Simulate export delay where cancellation can occur
        if let Some(cancel_ms) = self.should_cancel_after_ms {
            std::thread::sleep(Duration::from_millis(cancel_ms / 2));

            // CORRECT: Record cancellation metrics before returning
            {
                let mut metrics = self.metrics.lock().unwrap();
                metrics.record_export_cancellation(batch.span_count);
            }

            self.export_results.push((batch.batch_id.clone(), "cancelled_with_metrics".to_string()));
            return Err(format!("Export cancelled: {} spans dropped", batch.span_count));
        }

        if self.should_error {
            // CORRECT: Record error metrics
            {
                let mut metrics = self.metrics.lock().unwrap();
                metrics.record_export_error(batch.span_count);
            }

            self.export_results.push((batch.batch_id.clone(), "error_with_metrics".to_string()));
            return Err(format!("Export failed: {} spans dropped", batch.span_count));
        }

        // Success case with metrics
        {
            let mut metrics = self.metrics.lock().unwrap();
            metrics.record_export_success(batch.span_count);
        }

        self.export_results.push((batch.batch_id.clone(), "success_with_metrics".to_string()));
        Ok(())
    }

    fn get_metrics(&self) -> ExportMetricsFixture {
        self.metrics.lock().unwrap().clone()
    }
}

/// **AUDIT TEST**: Verify span loss handling under export cancellation.
///
/// **SCENARIO**: OTLP export task cancelled mid-batch while sending large span batch.
/// **REQUIREMENT**: Should record span loss metrics for operator visibility.
/// **ASSESSMENT**: DEFECTIVE - current implementation silently loses spans.
#[test]
fn audit_export_cancellation_span_loss() {
    println!("🔍 AUDIT: OTLP export cancellation span loss handling");

    println!("📋 OTLP cancellation requirements:");
    println!("   • Export task may be cancelled during async operations");
    println!("   • Pending spans should not be silently lost");
    println!("   • Operators need visibility into data loss events");
    println!("   • Bounded loss preferred over unbounded buffering");

    let test_batch = SpanBatchFixture::new("batch-001", 100);

    println!("📊 Test scenario:");
    println!("   Batch: {} spans ({} bytes)", test_batch.span_count, test_batch.size_bytes);
    println!("   Cancellation: Mid-export (simulated)");
    println!("   Expected: Span loss recorded in metrics");

    // **DEFECTIVE IMPLEMENTATION**: Silent span loss
    println!("📊 Testing defective implementation (silent loss):");
    let mut defective_exporter = CancellationAwareExporterFixture::new()
        .with_cancellation_after(100);

    let defective_result = futures::executor::block_on(
        defective_exporter.export_batch_defective(test_batch.clone())
    );

    println!("   Result: {:?}", defective_result);
    println!("   Export results: {:?}", defective_exporter.export_results);

    assert!(defective_result.is_err());
    assert_eq!(defective_exporter.export_results.len(), 1);
    assert_eq!(defective_exporter.export_results[0].1, "cancelled_silent");

    // No metrics recorded in defective version
    let defective_metrics = defective_exporter.get_metrics();
    assert_eq!(defective_metrics.spans_dropped_cancellation, 0);
    assert_eq!(defective_metrics.batches_cancelled, 0);

    println!("⚠️  DEFECTIVE: 100 spans silently lost with no metrics");

    // **CORRECT IMPLEMENTATION**: Cancellation-aware metrics
    println!("📊 Testing correct implementation (metrics-aware):");
    let mut correct_exporter = CancellationAwareExporterFixture::new()
        .with_cancellation_after(100);

    let correct_result = futures::executor::block_on(
        correct_exporter.export_batch_correct(test_batch.clone())
    );

    println!("   Result: {:?}", correct_result);
    println!("   Export results: {:?}", correct_exporter.export_results);

    assert!(correct_result.is_err());
    assert_eq!(correct_exporter.export_results[0].1, "cancelled_with_metrics");

    // Metrics properly recorded in correct version
    let correct_metrics = correct_exporter.get_metrics();
    assert_eq!(correct_metrics.spans_dropped_cancellation, 100);
    assert_eq!(correct_metrics.batches_cancelled, 1);
    assert_eq!(correct_metrics.export_attempts, 1);

    println!("✅ CORRECT: 100 spans dropped with visibility metrics");

    println!("🚨 AUDIT FINDING: DEFECTIVE");
    println!("   Current: Cancellation causes silent span loss");
    println!("   Required: Record span loss metrics for visibility");
}

/// **AUDIT TEST**: Verify export error span loss metrics.
///
/// **SCENARIO**: Export fails due to network error with pending spans.
/// **REQUIREMENT**: Should distinguish cancellation from other errors in metrics.
/// **ASSESSMENT**: DEFECTIVE - no error-specific span loss tracking.
#[test]
fn audit_export_error_span_loss_metrics() {
    println!("🔍 AUDIT: OTLP export error span loss metrics");

    println!("📋 Error handling requirements:");
    println!("   • Network errors should record span loss separately from cancellation");
    println!("   • Different error types may require different retry strategies");
    println!("   • Metrics should distinguish error causes for debugging");

    let error_batch = SpanBatchFixture::new("batch-error", 50);

    println!("📊 Error scenario:");
    println!("   Batch: {} spans", error_batch.span_count);
    println!("   Failure: Network error (non-cancellation)");
    println!("   Expected: Error-specific span loss metrics");

    // Test correct implementation with error handling
    let mut exporter = CancellationAwareExporterFixture::new().with_error();

    let result = futures::executor::block_on(
        exporter.export_batch_correct(error_batch)
    );

    println!("   Result: {:?}", result);

    assert!(result.is_err());
    assert!(result.unwrap_err().contains("spans dropped"));

    let metrics = exporter.get_metrics();
    println!("📊 Error metrics:");
    println!("   Spans dropped (error): {}", metrics.spans_dropped_error);
    println!("   Spans dropped (cancellation): {}", metrics.spans_dropped_cancellation);
    println!("   Total export attempts: {}", metrics.export_attempts);

    assert_eq!(metrics.spans_dropped_error, 50);
    assert_eq!(metrics.spans_dropped_cancellation, 0);
    assert_eq!(metrics.export_attempts, 1);

    println!("✅ CORRECT: Error spans tracked separately from cancellation");
}

/// **AUDIT TEST**: Verify data loss rate calculation for monitoring.
///
/// **SCENARIO**: Mixed export outcomes (success, cancellation, error).
/// **REQUIREMENT**: Should calculate observable data loss rate for alerting.
/// **ASSESSMENT**: Missing data loss rate monitoring capabilities.
#[test]
fn audit_data_loss_rate_monitoring() {
    println!("🔍 AUDIT: OTLP data loss rate monitoring");

    println!("📋 Monitoring requirements:");
    println!("   • Calculate percentage of spans lost vs successfully exported");
    println!("   • Support alerting on high data loss rates");
    println!("   • Distinguish temporary vs persistent loss patterns");

    let mut exporter = CancellationAwareExporterFixture::new();

    // Simulate mixed export scenarios
    let scenarios = vec![
        (SpanBatchFixture::new("success-1", 100), false, false),
        (SpanBatchFixture::new("success-2", 150), false, false),
        (SpanBatchFixture::new("cancelled-1", 75), true, false), // with_cancellation
        (SpanBatchFixture::new("error-1", 50), false, true),     // with_error
        (SpanBatchFixture::new("success-3", 200), false, false),
    ];

    println!("📊 Mixed export scenario:");
    for (i, (batch, should_cancel, should_error)) in scenarios.iter().enumerate() {
        println!("   Batch {}: {} spans, cancel={}, error={}",
                 i+1, batch.span_count, should_cancel, should_error);

        if *should_cancel {
            exporter.should_cancel_after_ms = Some(50);
        } else {
            exporter.should_cancel_after_ms = None;
        }
        exporter.should_error = *should_error;

        let _ = futures::executor::block_on(
            exporter.export_batch_correct(batch.clone())
        );
    }

    let metrics = exporter.get_metrics();
    println!("📊 Final metrics:");
    println!("   Spans exported: {}", metrics.spans_exported_success);
    println!("   Spans lost (cancellation): {}", metrics.spans_dropped_cancellation);
    println!("   Spans lost (error): {}", metrics.spans_dropped_error);
    println!("   Total spans lost: {}", metrics.total_spans_lost());
    println!("   Data loss rate: {:.2}%", metrics.data_loss_rate() * 100.0);

    // Verify calculations
    assert_eq!(metrics.spans_exported_success, 450); // 100+150+200
    assert_eq!(metrics.spans_dropped_cancellation, 75);
    assert_eq!(metrics.spans_dropped_error, 50);
    assert_eq!(metrics.total_spans_lost(), 125);
    assert_eq!(metrics.export_attempts, 5);

    let expected_loss_rate = 125.0 / 575.0; // 125 lost / 575 total
    assert!((metrics.data_loss_rate() - expected_loss_rate).abs() < 0.001);

    println!("✅ DATA LOSS MONITORING: {:.1}% spans lost (alertable metric)",
             metrics.data_loss_rate() * 100.0);

    if metrics.data_loss_rate() > 0.1 { // > 10%
        println!("⚠️  HIGH DATA LOSS: Loss rate exceeds 10% threshold");
    }
}

/// **AUDIT TEST**: Verify OTLP best practice compliance for span loss.
///
/// **SCENARIO**: Document current gaps vs OTLP observability best practices.
/// **REQUIREMENT**: Should align with OpenTelemetry community guidelines.
/// **ASSESSMENT**: Current implementation misses key observability practices.
#[test]
fn audit_otlp_best_practice_compliance() {
    println!("🔍 AUDIT: OTLP span loss best practice compliance");

    println!("📋 OTLP/OpenTelemetry best practices:");
    println!("   1. Bounded loss: Prefer dropping data over OOM");
    println!("   2. Visibility: Always record when telemetry data is lost");
    println!("   3. Categorization: Track loss by cause (network, cancellation, etc.)");
    println!("   4. Alerting: Provide metrics for data loss rate monitoring");
    println!("   5. Graceful degradation: Degrade service, not observability");

    println!("📊 Current implementation assessment:");

    // Check bounded loss compliance
    println!("   ✅ Bounded loss: LoadSheddingExporter drops oldest batches");
    println!("      Location: lines 800-812 in otel.rs");
    println!("      Behavior: Correct queue management");

    // Check visibility compliance
    println!("   ❌ Visibility: No cancellation span loss metrics");
    println!("      Gap: send_otlp_protobuf() cancellation invisible");
    println!("      Impact: Operators unaware of telemetry data loss");

    // Check categorization compliance
    println!("   ❌ Categorization: No error-type-specific span loss tracking");
    println!("      Gap: Cannot distinguish cancellation vs network vs server errors");
    println!("      Impact: Poor debugging and alerting granularity");

    // Check alerting compliance
    println!("   ❌ Alerting: No data loss rate metrics");
    println!("      Gap: No percentage-based loss monitoring");
    println!("      Impact: Cannot alert on high data loss rates");

    // Check graceful degradation compliance
    println!("   ⚠️  Graceful degradation: Partial compliance");
    println!("      Current: LoadSheddingExporter handles queue pressure");
    println!("      Gap: Export task cancellation not graceful");

    println!("📊 Compliance score: 1/5 practices fully implemented");

    println!("📌 Required implementations:");
    println!("   1. Add spans_dropped_cancellation metric");
    println!("   2. Add spans_dropped_export_error metric");
    println!("   3. Add data_loss_rate gauge metric");
    println!("   4. Modify send_otlp_protobuf() to record cancellation");
    println!("   5. Add structured logging for span loss events");

    // Demonstrate what full compliance would look like
    println!("📊 Full compliance example:");
    let mut compliant_metrics = ExportMetricsFixture::default();
    compliant_metrics.record_export_success(1000);
    compliant_metrics.record_export_cancellation(50);
    compliant_metrics.record_export_error(25);

    println!("   otel.spans.exported.success: {}", compliant_metrics.spans_exported_success);
    println!("   otel.spans.dropped.cancellation: {}", compliant_metrics.spans_dropped_cancellation);
    println!("   otel.spans.dropped.error: {}", compliant_metrics.spans_dropped_error);
    println!("   otel.export.data_loss_rate: {:.3}", compliant_metrics.data_loss_rate());
    println!("   otel.export.batches_cancelled: {}", compliant_metrics.batches_cancelled);

    println!("🚨 COMPLIANCE GAP: Missing 4/5 OTLP observability best practices");
    println!("   Priority: HIGH - Invisible telemetry loss impacts production debugging");
}

/// **AUDIT TEST**: Verify cancellation-aware export implementation design.
///
/// **SCENARIO**: Design cancellation-safe export with span loss tracking.
/// **REQUIREMENT**: Should handle all cancellation points in async export flow.
/// **ASSESSMENT**: Demonstrate feasible implementation approach.
#[test]
fn audit_proposed_cancellation_aware_design() {
    println!("🔍 AUDIT: Proposed cancellation-aware export design");

    println!("📋 Design requirements:");
    println!("   • Track spans in flight during export");
    println!("   • Record metrics on cancellation or error");
    println!("   • Provide structured error context");
    println!("   • Maintain async cancellation semantics");

    // Proposed implementation structure
    #[derive(Debug)]
    struct CancellationAwareExportTracker {
        spans_in_flight: Arc<Mutex<usize>>,
        export_metrics: Arc<Mutex<ExportMetricsFixture>>,
    }

    impl CancellationAwareExportTracker {
        fn new() -> Self {
            Self {
                spans_in_flight: Arc::new(Mutex::new(0)),
                export_metrics: Arc::new(Mutex::new(ExportMetricsFixture::default())),
            }
        }

        fn start_export(&self, span_count: usize) {
            *self.spans_in_flight.lock().unwrap() = span_count;
        }

        fn complete_export(&self, span_count: usize, outcome: &str) {
            *self.spans_in_flight.lock().unwrap() = 0;

            let mut metrics = self.export_metrics.lock().unwrap();
            match outcome {
                "success" => metrics.record_export_success(span_count),
                "cancelled" => metrics.record_export_cancellation(span_count),
                "error" => metrics.record_export_error(span_count),
                _ => {}
            }
        }

        fn handle_cancellation(&self) -> String {
            let spans_lost = *self.spans_in_flight.lock().unwrap();
            if spans_lost > 0 {
                self.complete_export(spans_lost, "cancelled");
                format!("Export cancelled: {} spans dropped with metrics recorded", spans_lost)
            } else {
                "Export cancelled: no spans in flight".to_string()
            }
        }

        fn get_metrics(&self) -> ExportMetricsFixture {
            self.export_metrics.lock().unwrap().clone()
        }
    }

    // Test the proposed design
    let tracker = CancellationAwareExportTracker::new();

    println!("📊 Design validation:");

    // Scenario 1: Successful export
    tracker.start_export(100);
    tracker.complete_export(100, "success");

    // Scenario 2: Cancelled export
    tracker.start_export(75);
    let cancellation_message = tracker.handle_cancellation();
    println!("   Cancellation: {}", cancellation_message);

    // Scenario 3: Error export
    tracker.start_export(50);
    tracker.complete_export(50, "error");

    let final_metrics = tracker.get_metrics();
    println!("   Final spans exported: {}", final_metrics.spans_exported_success);
    println!("   Final spans dropped (cancelled): {}", final_metrics.spans_dropped_cancellation);
    println!("   Final spans dropped (error): {}", final_metrics.spans_dropped_error);

    assert_eq!(final_metrics.spans_exported_success, 100);
    assert_eq!(final_metrics.spans_dropped_cancellation, 75);
    assert_eq!(final_metrics.spans_dropped_error, 50);

    println!("✅ DESIGN VALIDATED: Cancellation-aware tracking with metrics");

    println!("📌 Integration points for send_otlp_protobuf():");
    println!("   1. Call start_export() before async operations");
    println!("   2. Wrap .await points with cancellation detection");
    println!("   3. Call handle_cancellation() in Drop impl or catch unwind");
    println!("   4. Call complete_export() on success or error");
    println!("   5. Expose metrics via MetricsProvider integration");

    println!("✅ IMPLEMENTATION FEASIBLE: Clear integration path for existing code");
}