// nd300 3.3.0 - Docs.rs

//! Diagnostic-driven fix loop driver.
//!
//! The flow:
//!
//! 1. Run baseline diagnostics.
//! 2. If everything passes, exit cleanly.
//! 3. Otherwise, in a bounded loop:
//!    a. Detect hard blocks (captive portal / ISP outage / no link / enterprise VPN) — exit cleanly with guidance.
//!    b. Compute the actionable failure set, group by root cause, and build a plan.
//!    c. Apply the plan's actions one by one, prompting Y/N for any High-risk action
//!       (medium-risk and DNS-changing actions also prompt unless `--yes` is given).
//!    d. After each action, sleep its `stabilization` window.
//!    e. Re-run diagnostics; if all pass, exit; else continue.
//! 4. Bounded by iteration count, wall clock, and per-action attempt caps.

use std::time::{Duration, Instant};

use crate::config::{Config, OutputFormat};
use crate::diagnostics;

use super::action::{self, DiagnosticKey};
use super::session::{FinalOutcome, Reporter, RestoreRegistry, Session, DEFAULT_ITERATION_DELAY};
use super::triage::{
    actionable_failures, build_plan, hard_block_detected, requires_confirmation,
    requires_high_risk_consent, HardBlock, MAX_ITERATIONS,
};

/// Outer timeout on the whole restore drain.
///
/// The drain runs on every terminal path (normal end, Ctrl-C, panic,
/// wall-clock cap); each op is individually bounded too, but this caps the
/// aggregate so cleanup itself can never hang. When the cap is hit,
/// `run_and_finalize` stops waiting and reports a manual-recovery message
/// instead.
const DRAIN_CAP: Duration = Duration::from_secs(90);

/// Runs the full triage loop, populating the caller-owned `session` so the
/// report stays rich even if the run is interrupted or panics.
///
/// Flow: run baseline diagnostics, then for up to `MAX_ITERATIONS` rounds
/// build a plan for the current actionable failures, apply its actions
/// (prompting where confirmation is required), and re-probe.
///
/// Terminal states, in the order they are checked at the top of each round:
///
/// 1. No actionable failures left -> `FinalOutcome::Fixed`. This is checked
///    *before* the wall-clock cap so that a run whose last re-probe is clean
///    is never misreported as a timeout with an empty failure list.
/// 2. Wall-clock budget exhausted -> `FinalOutcome::Timeout`.
/// 3. A hard block is detected -> `FinalOutcome::HardBlock`.
/// 4. The plan is empty -> `FinalOutcome::Exhausted`.
/// 5. The user declined a prompt, or every action was skipped because no
///    prompt could be shown -> `FinalOutcome::UserDeclined`.
///
/// If the iteration cap is reached the result is `Fixed`, `Partial` (at least
/// one baseline failure is gone) or `Exhausted`.
///
/// The chosen outcome is stored in `session.final_outcome`, printed through
/// the reporter when the run is interactive, and returned. Destructive
/// actions register inverse ops on `restore`; the caller drains it on every
/// terminal path.
pub async fn run(
    config: &Config,
    session: &mut Session,
    restore: &RestoreRegistry,
) -> FinalOutcome {
    let interactive = is_interactive(config);
    let reporter = Reporter::new(config);

    if interactive {
        reporter.header();
    }

    // Iteration 1: baseline diagnostics.
    let baseline = diagnostics::run_all(config).await;
    session.record_baseline(baseline.clone());

    let mut current = baseline;

    if interactive {
        reporter.baseline_summary(actionable_failures(&current).len());
    }

    // Every terminal path leaves this block with its outcome, so the
    // "record on the session + print the verdict" epilogue exists only once.
    let outcome = 'triage: {
        for iteration in 1..=MAX_ITERATIONS {
            // Computed once per round; `current` does not change again until
            // the re-probe at the bottom of the loop.
            let failures = actionable_failures(&current);

            // Success takes precedence over the wall-clock cap: if the last
            // re-probe came back clean, the run is fixed even when the budget
            // ran out while getting there.
            if failures.is_empty() {
                break 'triage FinalOutcome::Fixed;
            }

            // Wall-clock cap.
            if session.wall_clock_exhausted() {
                let remaining: Vec<DiagnosticKey> = failures.into_iter().collect();
                break 'triage FinalOutcome::Timeout(remaining);
            }

            // Hard-block check — short-circuits before any action runs.
            if let Some(block) = hard_block_detected(&current) {
                break 'triage FinalOutcome::HardBlock(block);
            }

            if interactive {
                reporter.iteration_header(iteration);
            }

            let registry = action::all_actions();
            let plan = build_plan(
                &failures,
                &session.attempts,
                &session.effectiveness,
                &registry,
            );

            if plan.is_empty() {
                let remaining: Vec<DiagnosticKey> = failures.into_iter().collect();
                break 'triage FinalOutcome::Exhausted(remaining);
            }

            // Apply actions in cost-order. Fatal env changes break early so we
            // re-probe before applying further actions in the same iteration.
            let mut user_declined_confirmation = false;
            let mut skipped_for_confirmation = false;
            let mut ran_action = false;
            for action in &plan {
                if session.wall_clock_exhausted() {
                    break;
                }

                // Confirmation gates. High-risk always requires explicit Y/N;
                // medium-risk and DNS-changing actions honor --yes.
                if requires_confirmation(action, config.auto_confirm_medium_risk) {
                    if !interactive {
                        // No way to ask: log the skip and try the next action.
                        session.record_action(
                            iteration,
                            action,
                            super::action::ActionOutcome::fail(
                                "Skipped: requires confirmation. Re-run `nd300 fix` in a terminal or use `--yes` for medium-risk actions.",
                            ),
                            Duration::ZERO,
                            false,
                            true,
                        );
                        skipped_for_confirmation = true;
                        continue;
                    }

                    let approved = if requires_high_risk_consent(action) {
                        reporter.high_risk_prompt(action)
                    } else {
                        reporter.confirmation_prompt(action)
                    };

                    if !approved {
                        reporter.confirmation_declined(action);
                        session.record_action(
                            iteration,
                            action,
                            super::action::ActionOutcome::fail("User declined the prompt."),
                            Duration::ZERO,
                            true,
                            false,
                        );
                        // A refusal ends the whole run, not just this action.
                        user_declined_confirmation = true;
                        break;
                    }
                }

                if interactive {
                    reporter.announce_action(action);
                }
                let started = Instant::now();
                let outcome = action.apply(config, restore).await;
                let duration = started.elapsed();
                if interactive {
                    reporter.finish_action(&outcome, duration);
                }

                // Read the flag before `outcome` is moved into the action log.
                let fatal_env_change = outcome.fatal_environment_change;
                session.record_action(iteration, action, outcome, duration, false, false);
                ran_action = true;

                // Stabilize before either re-probing or applying the next action.
                if !action.stabilization.is_zero() {
                    tokio::time::sleep(action.stabilization).await;
                }

                if fatal_env_change {
                    // Break out of the plan-loop and re-probe immediately.
                    break;
                }
            }

            // Stop when the user said no, or when nothing ran because every
            // candidate needed a prompt that could not be shown.
            if user_declined_confirmation || (skipped_for_confirmation && !ran_action) {
                let remaining: Vec<DiagnosticKey> = failures.into_iter().collect();
                break 'triage FinalOutcome::UserDeclined(remaining);
            }

            // Light delay between iterations to let the OS settle.
            tokio::time::sleep(DEFAULT_ITERATION_DELAY).await;

            // Re-probe. `failures` is still the pre-action failure set.
            current = diagnostics::run_all(config).await;
            let now_failures = actionable_failures(&current);
            session.record_iteration(iteration, current.clone());
            session.update_effectiveness(iteration, &failures, &now_failures);
        }

        // Hit MAX_ITERATIONS without converging inside the loop.
        let remaining_failures = actionable_failures(&current);
        if remaining_failures.is_empty() {
            FinalOutcome::Fixed
        } else {
            let remaining: Vec<DiagnosticKey> = remaining_failures.iter().copied().collect();
            let baseline_failures = session
                .baseline
                .as_ref()
                .map(actionable_failures)
                .unwrap_or_default();
            // Progress = at least one baseline failure is no longer failing.
            let any_progress = baseline_failures
                .difference(&remaining_failures)
                .next()
                .is_some();
            if any_progress {
                FinalOutcome::Partial(remaining)
            } else {
                FinalOutcome::Exhausted(remaining)
            }
        }
    };

    session.final_outcome = Some(outcome.clone());
    if interactive {
        reporter.final_verdict(&outcome, None);
    }
    outcome
}

/// Reports whether the loop may render interactive prompts.
///
/// JSON output is never interactive; for any other format the answer is
/// whether stdin is attached to a terminal.
fn is_interactive(config: &Config) -> bool {
    use std::io::IsTerminal;

    // Checked first so JSON runs never query stdin at all.
    if config.format == OutputFormat::Json {
        return false;
    }
    std::io::stdin().is_terminal()
}

/// Convenience wrapper used by `actions::fix::run`. Persists the Markdown
/// report and returns the exit code derived from the `FinalOutcome`.
///
/// This is the interrupt-safe boundary: the triage loop runs inside a
/// `tokio::select!` that races it against `Ctrl-C`, and the loop future is
/// wrapped in `catch_unwind` so a panic is caught rather than aborting the
/// process. On EVERY terminal path after the pre-flight check — normal end,
/// user-declined, wall-clock cap, Ctrl-C, or panic — the restore registry is
/// drained so any half-applied network change (a disabled adapter, a
/// disconnected VPN, a removed macOS service) is rolled back before the
/// process exits.
///
/// Returns `outcome.exit_code()`. Two paths differ from the normal flow:
///
/// * Pre-flight failure (not elevated): the verdict is printed and the exit
///   code returned immediately; no loop runs, so nothing is drained.
/// * Caught panic: after cleanup and reporting, the process exits with status
///   101 via `std::process::exit` and this function does not return.
pub async fn run_and_finalize(config: &Config) -> i32 {
    use futures_util::FutureExt;

    // Pre-flight: elevation
    if !crate::platform::is_elevated() {
        let outcome = FinalOutcome::PreflightFailed(
            "The fix flow requires elevated privileges. Run with sudo (Unix) or as Administrator (Windows).".to_string(),
        );
        if config.format == OutputFormat::Json {
            print_json_outcome(&Session::new(), &outcome, None, &[]);
        } else {
            let reporter = Reporter::new(config);
            reporter.final_verdict(&outcome, None);
        }
        return outcome.exit_code();
    }

    let is_json = config.format == OutputFormat::Json;
    let mut session = Session::new();
    let restore = RestoreRegistry::new();

    // Race the loop against Ctrl-C, and catch any panic from the loop so we can
    // still drain restores instead of leaving the network half-broken.
    //
    // `AssertUnwindSafe` is sound here: the registry uses a non-poisoning
    // `tokio::sync::Mutex`, and after a caught panic we only READ the partially
    // populated `Session` to build a best-effort report — we never rely on it
    // being in a logically-consistent state.
    let loop_result = {
        let fut = std::panic::AssertUnwindSafe(run(config, &mut session, &restore)).catch_unwind();
        tokio::select! {
            biased;
            // `ctrl_c()` resolves to an `io::Result`. Match `Ok(())` only: if
            // the signal handler could not be installed the future completes
            // immediately with `Err`, and a catch-all pattern would treat that
            // as an interrupt and abort the fix loop before it even started.
            // A non-matching result just disables this branch, so the loop
            // runs to completion without Ctrl-C handling.
            Ok(()) = tokio::signal::ctrl_c() => None,
            r = fut => Some(r),
        }
    };

    // Classify the terminal path.
    //   None                -> Ctrl-C interrupted the loop.
    //   Some(Ok(outcome))   -> loop finished normally (verdict already printed
    //                          by `run` when the session is interactive).
    //   Some(Err(_panic))   -> loop panicked (caught); re-raise after cleanup.
    //
    // NOTE(review): `run` prints the verdict only when interactive, so a
    // non-TTY, non-JSON run that finishes normally reaches this point with no
    // verdict on stdout — confirm that is intended.
    let (outcome, panicked) = match loop_result {
        Some(Ok(outcome)) => (outcome, false),
        Some(Err(_panic)) => (
            FinalOutcome::Interrupted(remaining_after_interrupt(&session)),
            true,
        ),
        None => {
            // Ctrl-C: print a clear interrupted line now (the loop never
            // returned, so it never printed a verdict).
            if !is_json {
                println!();
                println!("  Interrupted — cleaning up and restoring network state...");
            }
            (
                FinalOutcome::Interrupted(remaining_after_interrupt(&session)),
                false,
            )
        }
    };

    if panicked && !is_json {
        println!();
        println!(
            "  A fatal internal error occurred mid-fix — restoring network state before exiting..."
        );
    }

    // ALWAYS drain restores, regardless of how we got here. Bound the whole
    // drain so cleanup itself can never hang.
    let drain_failures = match tokio::time::timeout(DRAIN_CAP, restore.drain()).await {
        Ok(failures) => failures,
        Err(_) => vec![format!(
            "Network-state cleanup did not finish within {}s; some changes may not have been restored.",
            DRAIN_CAP.as_secs()
        )],
    };

    // For the Interrupted path, print the verdict now (after the drain attempt)
    // so the manual-recovery guidance reads in order.
    if matches!(outcome, FinalOutcome::Interrupted(_)) && !is_json {
        let reporter = Reporter::new(config);
        reporter.final_verdict(&outcome, None);
    }

    // Surface anything that couldn't be restored as explicit manual-recovery
    // guidance (non-JSON; JSON carries it in the structured object).
    if !drain_failures.is_empty() && !is_json {
        println!();
        println!(
            "  {}",
            crate::render::color::yellow("Manual recovery needed:", config)
        );
        for f in &drain_failures {
            println!("    • {}", crate::render::color::yellow(f, config));
        }
    }

    // Record the final outcome on the session so the report reflects it even on
    // the interrupted / panic path.
    session.final_outcome = Some(outcome.clone());

    let report_path =
        super::report::save_session_report_with_recovery(&session, &outcome, &drain_failures);

    if is_json {
        print_json_outcome(&session, &outcome, report_path.as_deref(), &drain_failures);
    } else if let Some(path) = &report_path {
        // Re-print the path under the verdict so users see where to find it.
        println!(
            "  {} {}",
            crate::render::color::dim("Saved report:", config),
            crate::render::color::dim(&path.display().to_string(), config),
        );
    }

    let code = outcome.exit_code();

    // If the loop panicked, re-raise the failure as exit 101 AFTER cleanup so
    // the operator sees the standard panic exit code, having had the network
    // restored first.
    //
    // NOTE(review): in JSON mode the `exit_code` field emitted above is the
    // `Interrupted` code, while the process actually exits 101 here — confirm
    // consumers expect that mismatch.
    if panicked {
        std::process::exit(101);
    }

    code
}

/// Best-effort list of what is still failing when a run is cut short.
///
/// Uses the newest entry in `session.snapshots` and returns its actionable
/// failures; when no snapshot has been recorded yet the list is empty.
fn remaining_after_interrupt(session: &Session) -> Vec<DiagnosticKey> {
    match session.snapshots.last() {
        Some(latest) => actionable_failures(&latest.results).into_iter().collect(),
        // Interrupted before any diagnostics were recorded.
        None => Vec::new(),
    }
}

fn print_json_outcome(
    session: &Session,
    outcome: &FinalOutcome,
    report_path: Option<&std::path::Path>,
    recovery_needed: &[String],
) {
    use serde_json::json;

    let outcome_label = match outcome {
        FinalOutcome::Fixed => "fixed",
        FinalOutcome::Partial(_) => "partial",
        FinalOutcome::Exhausted(_) => "exhausted",
        FinalOutcome::HardBlock(_) => "hard_block",
        FinalOutcome::Timeout(_) => "timeout",
        FinalOutcome::UserDeclined(_) => "user_declined",
        FinalOutcome::PreflightFailed(_) => "preflight_failed",
        FinalOutcome::Interrupted(_) => "interrupted",
    };

    let remaining: Vec<&str> = match outcome {
        FinalOutcome::Partial(rs)
        | FinalOutcome::Exhausted(rs)
        | FinalOutcome::Timeout(rs)
        | FinalOutcome::UserDeclined(rs)
        | FinalOutcome::Interrupted(rs) => rs.iter().map(|k| diagnostic_key_str(*k)).collect(),
        _ => Vec::new(),
    };

    let actions_json: Vec<_> = session
        .action_log
        .iter()
        .map(|r| {
            json!({
                "iteration": r.iteration,
                "action": format!("{:?}", r.action_id),
                "label": r.label,
                "ok": r.outcome.ok,
                "message": r.outcome.message,
                "duration_ms": r.duration.as_millis() as u64,
                "user_declined": r.user_declined,
                "skipped_no_interaction": r.skipped_no_interaction,
            })
        })
        .collect();

    let value = json!({
        "action": "fix",
        "outcome": outcome_label,
        "exit_code": outcome.exit_code(),
        "iterations": session.snapshots.len().saturating_sub(1),
        "remaining_failures": remaining,
        "applied_actions": actions_json,
        "elapsed_seconds": session.elapsed().as_secs(),
        "report_path": report_path.map(|p| p.display().to_string()),
        "interrupted": matches!(outcome, FinalOutcome::Interrupted(_)),
        "manual_recovery_needed": recovery_needed,
        "preflight_error": match outcome {
            FinalOutcome::PreflightFailed(s) => Some(s.clone()),
            _ => None,
        },
        "hard_block": match outcome {
            FinalOutcome::HardBlock(b) => Some(hard_block_str(b).to_string()),
            _ => None,
        },
    });

    println!(
        "{}",
        serde_json::to_string_pretty(&value).unwrap_or_else(|_| "{}".to_string())
    );
}

fn diagnostic_key_str(k: DiagnosticKey) -> &'static str {
    match k {
        DiagnosticKey::Adapters => "adapters",
        DiagnosticKey::Interfaces => "interfaces",
        DiagnosticKey::Gateway => "gateway",
        DiagnosticKey::Dns => "dns",
        DiagnosticKey::PublicIp => "public_ip",
        DiagnosticKey::Latency => "latency",
        DiagnosticKey::Ports => "ports",
        DiagnosticKey::Speed => "speed",
    }
}

fn hard_block_str(b: &HardBlock) -> &'static str {
    match b {
        HardBlock::CaptivePortal => "captive_portal",
        HardBlock::NoPhysicalLink => "no_physical_link",
        HardBlock::IspOutage => "isp_outage",
        HardBlock::EnterpriseVpnActive(_) => "enterprise_vpn_active",
    }
}