asupersync 0.3.4

//! Runtime metrics endpoint exposure audit test.
//!
//! **AUDIT SCOPE**: Verifies runtime state metrics are exposed via HTTP endpoints
//! per AGENTS.md observability invariants.
//!
//! **OBSERVABILITY REQUIREMENT**:
//! - Runtime state must be observable (AGENTS.md)
//! - Metrics like task_count, region_count, obligation_count must be exposed
//! - Aggregate counts should be available via HTTP, not just raw object arrays
//!
//! **CRITICAL**: Production monitoring requires aggregate metrics, not just raw snapshots.

#![cfg(test)]

use crate::runtime::RuntimeSnapshot;

/// **AUDIT TEST**: Verify runtime metrics are exposed via HTTP endpoints.
///
/// **SCENARIO**: Runtime snapshot contains individual objects but no aggregate metrics.
/// **REQUIREMENT**: Aggregate metrics (counts) must be available via HTTP endpoints.
/// **ASSESSMENT**: DEFECT - diagnostic_resource_accounting metrics not exposed via HTTP.
#[test]
fn audit_runtime_metrics_http_endpoint_coverage() {
    println!("🔍 AUDIT: Runtime metrics HTTP endpoint exposure");

    // Test the current debug endpoint structure
    println!("📊 Current debug endpoint structure:");
    println!("   • /debug/snapshot: Returns RuntimeSnapshot JSON");
    println!(
        "   • RuntimeSnapshot contains: Vec<RegionSnapshot>, Vec<TaskSnapshot>, Vec<ObligationSnapshot>"
    );
    println!("   • Individual objects exposed: ✓");
    println!("   • Aggregate counts exposed: ✗");

    // Demonstrate what RuntimeSnapshot contains
    let empty_snapshot = create_test_runtime_snapshot();

    println!("📋 RuntimeSnapshot fields:");
    println!("   • timestamp: {} (nanoseconds)", empty_snapshot.timestamp);
    println!(
        "   • regions: Vec<RegionSnapshot> (length: {})",
        empty_snapshot.regions.len()
    );
    println!(
        "   • tasks: Vec<TaskSnapshot> (length: {})",
        empty_snapshot.tasks.len()
    );
    println!(
        "   • obligations: Vec<ObligationSnapshot> (length: {})",
        empty_snapshot.obligations.len()
    );
    println!(
        "   • recent_events: Vec<EventSnapshot> (length: {})",
        empty_snapshot.recent_events.len()
    );

    // Show what diagnostic_resource_accounting calculates but is NOT exposed
    println!(
        "📋 Missing aggregate metrics (calculated in diagnostic_resource_accounting but not exposed):"
    );
    println!("   ✗ total_regions, open_regions, closed_regions");
    println!("   ✗ total_tasks, live_tasks, completed_tasks, cancelled_tasks");
    println!("   ✗ total_obligations, leaked_obligations");
    println!("   ✗ No /metrics endpoint for Prometheus scraping");

    println!("🚨 DEFECT ANALYSIS:");
    println!("   ✗ PROBLEM: diagnostic_resource_accounting() only called in tests");
    println!("   ✗ MISSING: Aggregate metrics not included in RuntimeSnapshot");
    println!("   ✗ MISSING: No /metrics endpoint for external monitoring");
    println!("   ✗ IMPACT: Production monitoring cannot get aggregate counts");

    println!("📋 Required fixes:");
    println!("   1. Add aggregate metrics fields to RuntimeSnapshot");
    println!("   2. Include diagnostic_resource_accounting() output in snapshot() method");
    println!("   3. Add /metrics endpoint returning aggregate counts in Prometheus format");
    println!("   4. Wire diagnostic metrics to observability pipeline");

    // Verify the defect exists
    assert!(
        !runtime_snapshot_includes_aggregate_metrics(),
        "RuntimeSnapshot must include aggregate metrics for observability compliance"
    );

    println!("🚨 RUNTIME METRICS ENDPOINT: DEFECT - aggregate metrics not exposed via HTTP");
}

/// **AUDIT TEST**: Verify debug server endpoint structure.
///
/// **SCENARIO**: Debug server provides /debug/snapshot but no dedicated /metrics endpoint.
/// **REQUIREMENT**: Metrics should be easily accessible for monitoring systems.
/// **ASSESSMENT**: DEFECT - no standard metrics endpoint format.
#[test]
fn audit_debug_server_endpoint_structure() {
    println!("🔍 AUDIT: Debug server endpoint structure");

    println!("📊 Available debug endpoints:");
    println!("   • GET /debug - HTML dashboard");
    println!("   • GET /debug/snapshot - RuntimeSnapshot JSON");
    println!("   • GET /debug/trace - Recent events JSON");
    println!("   • GET /debug/ws - WebSocket upgrade");

    println!("📋 Standard monitoring endpoint expectations:");
    println!("   ✗ GET /metrics - Prometheus format (NOT PROVIDED)");
    println!("   ✗ GET /health - Health checks (provided by separate health module)");
    println!("   ✓ GET /debug/snapshot - Raw state dump (provided but no aggregates)");

    println!("🚨 MONITORING INTEGRATION ISSUES:");
    println!("   ✗ No Prometheus-compatible /metrics endpoint");
    println!("   ✗ Aggregate metrics require client-side calculation from arrays");
    println!("   ✗ Inefficient for high-frequency monitoring (large JSON responses)");
    println!("   ✗ No metric labeling/dimensions for filtering");

    println!("📋 Recommended metrics endpoint structure:");
    println!("   # HELP asupersync_total_regions Total number of regions created");
    println!("   # TYPE asupersync_total_regions counter");
    println!("   asupersync_total_regions {{instance=\"localhost:8080\"}} 42");
    println!("   ");
    println!("   # HELP asupersync_live_tasks Number of currently live tasks");
    println!("   # TYPE asupersync_live_tasks gauge");
    println!("   asupersync_live_tasks {{instance=\"localhost:8080\"}} 15");

    // Assert the defect
    assert!(
        !debug_server_provides_metrics_endpoint(),
        "Debug server must provide /metrics endpoint for monitoring integration"
    );
}

/// **AUDIT TEST**: Verify observability invariants compliance.
///
/// **SCENARIO**: AGENTS.md states "runtime state must be observable".
/// **REQUIREMENT**: All runtime metrics must be accessible via HTTP endpoints.
/// **ASSESSMENT**: PARTIAL COMPLIANCE - raw data available but not aggregated.
#[test]
fn audit_observability_invariants_compliance() {
    println!("🔍 AUDIT: AGENTS.md observability invariants compliance");

    println!("📋 AGENTS.md requirement: 'runtime state must be observable'");

    println!("📊 Current observability state:");
    println!("   ✓ Individual region records: observable via /debug/snapshot");
    println!("   ✓ Individual task records: observable via /debug/snapshot");
    println!("   ✓ Individual obligation records: observable via /debug/snapshot");
    println!("   ✓ Recent trace events: observable via /debug/trace");
    println!("   ✗ Aggregate metrics: NOT observable via HTTP");
    println!("   ✗ Monitoring-friendly format: NOT available");

    println!("📋 Observability gaps:");
    println!("   1. No aggregate counts (total_regions, live_tasks, leaked_obligations)");
    println!("   2. No time-series friendly format (Prometheus metrics)");
    println!("   3. No alerting-friendly thresholds or health indicators");
    println!("   4. diagnostic_resource_accounting() isolated to tests only");

    println!("✅ COMPLIANCE STATUS: PARTIAL");
    println!("   ✓ Raw runtime state is observable");
    println!("   ✗ Aggregate runtime metrics are NOT observable");
    println!("   ✗ Standard monitoring integration is NOT supported");

    // Document the compliance gap
    assert!(
        observability_has_gaps(),
        "Full observability compliance requires aggregate metrics exposure"
    );
}

// Helper functions to document the defects

fn create_test_runtime_snapshot() -> RuntimeSnapshot {
    RuntimeSnapshot {
        timestamp: 1640995200000000000, // 2022-01-01 00:00:00 UTC in nanoseconds
        regions: Vec::new(),
        tasks: Vec::new(),
        obligations: Vec::new(),
        recent_events: Vec::new(),
        finalizer_history: Vec::new(),
        loser_drain_history: Vec::new(),
    }
}

fn runtime_snapshot_includes_aggregate_metrics() -> bool {
    // RuntimeSnapshot struct has these fields:
    // - timestamp: u64
    // - regions: Vec<RegionSnapshot>
    // - tasks: Vec<TaskSnapshot>
    // - obligations: Vec<ObligationSnapshot>
    // - recent_events: Vec<EventSnapshot>
    // - finalizer_history: Vec<FinalizerHistoryEvent>
    // - loser_drain_history: Vec<LoserDrainHistoryEvent>
    //
    // It does NOT include:
    // - total_regions, open_regions, closed_regions
    // - total_tasks, live_tasks, completed_tasks
    // - total_obligations, leaked_obligations
    false
}

fn debug_server_provides_metrics_endpoint() -> bool {
    // DebugServer handles these paths:
    // - "/debug" | "/debug/" => HTML dashboard
    // - "/debug/snapshot" => RuntimeSnapshot JSON
    // - "/debug/trace" => Recent events JSON
    // - "/debug/ws" => WebSocket upgrade
    // - _ => 404 Not Found
    //
    // It does NOT handle:
    // - "/metrics" => Prometheus format metrics
    false
}

fn observability_has_gaps() -> bool {
    // AGENTS.md states: "runtime state must be observable"
    // Current gaps:
    // 1. diagnostic_resource_accounting() only called in tests
    // 2. Aggregate metrics not exposed via HTTP
    // 3. No /metrics endpoint for monitoring systems
    // 4. Inefficient monitoring (must parse large JSON arrays)
    true
}

/// **AUDIT TEST**: Document the fix strategy for runtime metrics exposure.
///
/// **SCENARIO**: Plan implementation to expose aggregate metrics via HTTP.
/// **REQUIREMENT**: Provide concrete steps to achieve observability compliance.
/// **ASSESSMENT**: PLANNING - document required changes for implementation.
#[test]
fn audit_runtime_metrics_fix_strategy_documentation() {
    println!("🔍 AUDIT: Runtime metrics fix strategy documentation");

    println!("📋 IMPLEMENTATION PLAN: Expose aggregate runtime metrics via HTTP");

    println!("📊 Phase 1: Extend RuntimeSnapshot with aggregate metrics");
    println!("   1. Add fields to RuntimeSnapshot:");
    println!("      - aggregate_metrics: Option<AggregateRuntimeMetrics>");
    println!("   2. Create AggregateRuntimeMetrics struct:");
    println!("      - total_regions, open_regions, closed_regions");
    println!("      - total_tasks, live_tasks, completed_tasks, cancelled_tasks");
    println!("      - total_obligations, leaked_obligations");
    println!("   3. Modify RuntimeState::snapshot() to call diagnostic_resource_accounting()");

    println!("📊 Phase 2: Add /metrics endpoint to DebugServer");
    println!("   1. Extend DebugServer endpoint routing:");
    println!("      - \"/metrics\" => Prometheus format response");
    println!("   2. Implement prometheus_format_metrics() function");
    println!("   3. Include aggregate metrics in Prometheus output format");

    println!("📊 Phase 3: Integration with observability pipeline");
    println!("   1. Wire aggregate metrics to OpenTelemetry metrics");
    println!("   2. Add metrics export via OTLP to external collectors");
    println!("   3. Ensure metrics are emitted at regular intervals");

    println!("📋 VERIFICATION TESTS:");
    println!("   1. RuntimeSnapshot includes aggregate_metrics field");
    println!("   2. /debug/snapshot JSON response includes aggregate counts");
    println!("   3. /metrics endpoint returns Prometheus format");
    println!("   4. Metrics export to OTLP collectors");
    println!("   5. All diagnostic_resource_accounting metrics exposed");

    println!("✅ STRATEGY DOCUMENTED: Ready for implementation");

    // This test always passes - it's documentation
    assert!(true, "Fix strategy documented for runtime metrics exposure");
}