// nd_300/actions/fix/loop_runner.rs

//! Diagnostic-driven fix loop driver.
//!
//! The flow:
//!
//! 1. Run baseline diagnostics.
//! 2. If everything passes, exit cleanly.
//! 3. Otherwise, in a bounded loop:
//!    a. Detect hard blocks (captive portal / ISP outage / no link / enterprise VPN) — exit cleanly with guidance.
//!    b. Compute the actionable failure set, group by root cause, and build a plan.
//!    c. Apply the plan's actions one by one, prompting for confirmation where
//!       required (High-risk actions always get an explicit Y/N prompt).
//!    d. After each action, sleep its `stabilization` window.
//!    e. Re-run diagnostics; if all pass, exit; else continue.
//!
//! The loop is bounded by iteration count, wall clock, and per-action attempt caps.
14
15use std::time::{Duration, Instant};
16
17use crate::config::{Config, OutputFormat};
18use crate::diagnostics;
19
20use super::action::{self, DiagnosticKey};
21use super::session::{FinalOutcome, Reporter, RestoreRegistry, Session, DEFAULT_ITERATION_DELAY};
22use super::triage::{
23    actionable_failures, build_plan, hard_block_detected, requires_confirmation,
24    requires_high_risk_consent, HardBlock, MAX_ITERATIONS,
25};
26
/// Upper bound on the whole restore drain.
///
/// The drain runs on every terminal path (normal end, Ctrl-C, panic,
/// wall-clock cap). Each restore op is individually bounded as well; this cap
/// limits the aggregate so cleanup itself can never hang.
const DRAIN_CAP: Duration = Duration::from_secs(90);
31
32/// Runs the full triage loop, populating the caller-owned `session` so the
33/// report stays rich even if the run is interrupted or panics. Returns the
34/// `FinalOutcome`. Destructive actions register inverse ops on `restore`; the
35/// caller drains it on every terminal path.
36pub async fn run(
37    config: &Config,
38    session: &mut Session,
39    restore: &RestoreRegistry,
40) -> FinalOutcome {
41    let interactive = is_interactive(config);
42    let reporter = Reporter::new(config);
43
44    if interactive {
45        reporter.header();
46    }
47
48    // Iteration 1: baseline diagnostics.
49    let baseline = diagnostics::run_all(config).await;
50    session.record_baseline(baseline.clone());
51
52    let mut current = baseline;
53
54    if interactive {
55        let initial_failures = actionable_failures(&current);
56        reporter.baseline_summary(initial_failures.len());
57    }
58
59    for iteration in 1..=MAX_ITERATIONS {
60        // Wall-clock cap.
61        if session.wall_clock_exhausted() {
62            let remaining: Vec<DiagnosticKey> = actionable_failures(&current).into_iter().collect();
63            let outcome = FinalOutcome::Timeout(remaining);
64            session.final_outcome = Some(outcome.clone());
65            if interactive {
66                reporter.final_verdict(&outcome, None);
67            }
68            return outcome;
69        }
70
71        let failures = actionable_failures(&current);
72        if failures.is_empty() {
73            let outcome = FinalOutcome::Fixed;
74            session.final_outcome = Some(outcome.clone());
75            if interactive {
76                reporter.final_verdict(&outcome, None);
77            }
78            return outcome;
79        }
80
81        // Hard-block check — short-circuits before any action runs.
82        if let Some(block) = hard_block_detected(&current) {
83            let outcome = FinalOutcome::HardBlock(block);
84            session.final_outcome = Some(outcome.clone());
85            if interactive {
86                reporter.final_verdict(&outcome, None);
87            }
88            return outcome;
89        }
90
91        if interactive {
92            reporter.iteration_header(iteration);
93        }
94
95        let registry = action::all_actions();
96        let plan = build_plan(
97            &failures,
98            &session.attempts,
99            &session.effectiveness,
100            &registry,
101        );
102
103        if plan.is_empty() {
104            let remaining: Vec<DiagnosticKey> = failures.into_iter().collect();
105            let outcome = FinalOutcome::Exhausted(remaining);
106            session.final_outcome = Some(outcome.clone());
107            if interactive {
108                reporter.final_verdict(&outcome, None);
109            }
110            return outcome;
111        }
112
113        // Apply actions in cost-order. Fatal env changes break early so we
114        // re-probe before applying further actions in the same iteration.
115        let mut user_declined_confirmation = false;
116        let mut skipped_for_confirmation = false;
117        let mut ran_action = false;
118        for action in &plan {
119            if session.wall_clock_exhausted() {
120                break;
121            }
122
123            // Confirmation gates. High-risk always requires explicit Y/N;
124            // medium-risk and DNS-changing actions honor --yes.
125            if requires_confirmation(action, config.auto_confirm_medium_risk) {
126                if !interactive {
127                    session.record_action(
128                        iteration,
129                        action,
130                        super::action::ActionOutcome::fail(
131                            "Skipped: requires confirmation. Re-run `nd300 fix` in a terminal or use `--yes` for medium-risk actions.",
132                        ),
133                        Duration::from_millis(0),
134                        false,
135                        true,
136                    );
137                    skipped_for_confirmation = true;
138                    continue;
139                }
140
141                let approved = if requires_high_risk_consent(action) {
142                    reporter.high_risk_prompt(action)
143                } else {
144                    reporter.confirmation_prompt(action)
145                };
146
147                if !approved {
148                    reporter.confirmation_declined(action);
149                    session.record_action(
150                        iteration,
151                        action,
152                        super::action::ActionOutcome::fail("User declined the prompt."),
153                        Duration::from_millis(0),
154                        true,
155                        false,
156                    );
157                    user_declined_confirmation = true;
158                    break;
159                }
160            }
161
162            if interactive {
163                reporter.announce_action(action);
164            }
165            let started = Instant::now();
166            let outcome = action.apply(config, restore).await;
167            let duration = started.elapsed();
168            if interactive {
169                reporter.finish_action(&outcome, duration);
170            }
171
172            let fatal_env_change = outcome.fatal_environment_change;
173            session.record_action(iteration, action, outcome, duration, false, false);
174            ran_action = true;
175
176            // Stabilize before either re-probing or applying the next action.
177            if action.stabilization > Duration::from_millis(0) {
178                tokio::time::sleep(action.stabilization).await;
179            }
180
181            if fatal_env_change {
182                // Break out of the plan-loop and re-probe immediately.
183                break;
184            }
185        }
186
187        if user_declined_confirmation || (skipped_for_confirmation && !ran_action) {
188            let remaining: Vec<DiagnosticKey> = actionable_failures(&current).into_iter().collect();
189            let outcome = FinalOutcome::UserDeclined(remaining);
190            session.final_outcome = Some(outcome.clone());
191            if interactive {
192                reporter.final_verdict(&outcome, None);
193            }
194            return outcome;
195        }
196
197        // Light delay between iterations to let the OS settle.
198        tokio::time::sleep(DEFAULT_ITERATION_DELAY).await;
199
200        // Re-probe.
201        let prior_failures = actionable_failures(&current);
202        current = diagnostics::run_all(config).await;
203        let now_failures = actionable_failures(&current);
204        session.record_iteration(iteration, current.clone());
205        session.update_effectiveness(iteration, &prior_failures, &now_failures);
206    }
207
208    // Hit MAX_ITERATIONS without converging.
209    let remaining_failures = actionable_failures(&current);
210    let remaining: Vec<DiagnosticKey> = remaining_failures.iter().copied().collect();
211    let outcome = if remaining_failures.is_empty() {
212        FinalOutcome::Fixed
213    } else {
214        let baseline_failures = session
215            .baseline
216            .as_ref()
217            .map(actionable_failures)
218            .unwrap_or_default();
219        let any_progress = baseline_failures
220            .difference(&remaining_failures)
221            .next()
222            .is_some();
223        if any_progress {
224            FinalOutcome::Partial(remaining)
225        } else {
226            FinalOutcome::Exhausted(remaining)
227        }
228    };
229    session.final_outcome = Some(outcome.clone());
230    if interactive {
231        reporter.final_verdict(&outcome, None);
232    }
233    outcome
234}
235
236/// True when the loop can render interactive prompts (TTY + non-JSON output).
237fn is_interactive(config: &Config) -> bool {
238    use std::io::IsTerminal;
239    config.format != OutputFormat::Json && std::io::stdin().is_terminal()
240}
241
242/// Convenience wrapper used by `actions::fix::run`. Persists the Markdown
243/// report and returns the exit code derived from the `FinalOutcome`.
244///
245/// This is the interrupt-safe boundary: the triage loop runs inside a
246/// `tokio::select!` that races it against `Ctrl-C`, and the loop future is
247/// wrapped in `catch_unwind` so a panic is caught rather than aborting the
248/// process. On EVERY terminal path — normal end, user-declined, wall-clock cap,
249/// Ctrl-C, or panic — the restore registry is drained so any half-applied
250/// network change (a disabled adapter, a disconnected VPN, a removed macOS
251/// service) is rolled back before the process exits.
252pub async fn run_and_finalize(config: &Config) -> i32 {
253    use futures_util::FutureExt;
254
255    // Pre-flight: elevation
256    if !crate::platform::is_elevated() {
257        let outcome = FinalOutcome::PreflightFailed(
258            "The fix flow requires elevated privileges. Run with sudo (Unix) or as Administrator (Windows).".to_string(),
259        );
260        if config.format == OutputFormat::Json {
261            print_json_outcome(&Session::new(), &outcome, None, &[]);
262        } else {
263            let reporter = Reporter::new(config);
264            reporter.final_verdict(&outcome, None);
265        }
266        return outcome.exit_code();
267    }
268
269    let is_json = config.format == OutputFormat::Json;
270    let mut session = Session::new();
271    let restore = RestoreRegistry::new();
272
273    // Race the loop against Ctrl-C, and catch any panic from the loop so we can
274    // still drain restores instead of leaving the network half-broken.
275    //
276    // `AssertUnwindSafe` is sound here: the registry uses a non-poisoning
277    // `tokio::sync::Mutex`, and after a caught panic we only READ the partially
278    // populated `Session` to build a best-effort report — we never rely on it
279    // being in a logically-consistent state.
280    let loop_result = {
281        let fut = std::panic::AssertUnwindSafe(run(config, &mut session, &restore)).catch_unwind();
282        tokio::select! {
283            biased;
284            _ = tokio::signal::ctrl_c() => None,
285            r = fut => Some(r),
286        }
287    };
288
289    // Classify the terminal path.
290    //   None                -> Ctrl-C interrupted the loop.
291    //   Some(Ok(outcome))   -> loop finished normally (verdict already printed).
292    //   Some(Err(_panic))   -> loop panicked (caught); re-raise after cleanup.
293    let (outcome, panicked) = match loop_result {
294        Some(Ok(outcome)) => (outcome, false),
295        Some(Err(_panic)) => (
296            FinalOutcome::Interrupted(remaining_after_interrupt(&session)),
297            true,
298        ),
299        None => {
300            // Ctrl-C: print a clear interrupted line now (the loop never
301            // returned, so it never printed a verdict).
302            if !is_json {
303                println!();
304                println!("  Interrupted — cleaning up and restoring network state...");
305            }
306            (
307                FinalOutcome::Interrupted(remaining_after_interrupt(&session)),
308                false,
309            )
310        }
311    };
312
313    if panicked && !is_json {
314        println!();
315        println!(
316            "  A fatal internal error occurred mid-fix — restoring network state before exiting..."
317        );
318    }
319
320    // ALWAYS drain restores, regardless of how we got here. Bound the whole
321    // drain so cleanup itself can never hang.
322    let drain_failures = match tokio::time::timeout(DRAIN_CAP, restore.drain()).await {
323        Ok(failures) => failures,
324        Err(_) => vec![format!(
325            "Network-state cleanup did not finish within {}s; some changes may not have been restored.",
326            DRAIN_CAP.as_secs()
327        )],
328    };
329
330    // For the Interrupted path, print the verdict now (after the drain attempt)
331    // so the manual-recovery guidance reads in order.
332    if matches!(outcome, FinalOutcome::Interrupted(_)) && !is_json {
333        let reporter = Reporter::new(config);
334        reporter.final_verdict(&outcome, None);
335    }
336
337    // Surface anything that couldn't be restored as explicit manual-recovery
338    // guidance (non-JSON; JSON carries it in the structured object).
339    if !drain_failures.is_empty() && !is_json {
340        println!();
341        println!(
342            "  {}",
343            crate::render::color::yellow("Manual recovery needed:", config)
344        );
345        for f in &drain_failures {
346            println!("    • {}", crate::render::color::yellow(f, config));
347        }
348    }
349
350    // Record the final outcome on the session so the report reflects it even on
351    // the interrupted / panic path.
352    session.final_outcome = Some(outcome.clone());
353
354    let report_path =
355        super::report::save_session_report_with_recovery(&session, &outcome, &drain_failures);
356
357    if is_json {
358        print_json_outcome(&session, &outcome, report_path.as_deref(), &drain_failures);
359    } else if let Some(path) = &report_path {
360        // Re-print the path under the verdict so users see where to find it.
361        println!(
362            "  {} {}",
363            crate::render::color::dim("Saved report:", config),
364            crate::render::color::dim(&path.display().to_string(), config),
365        );
366    }
367
368    let code = outcome.exit_code();
369
370    // If the loop panicked, re-raise the failure as exit 101 AFTER cleanup so
371    // the operator sees the standard panic exit code, having had the network
372    // restored first.
373    if panicked {
374        std::process::exit(101);
375    }
376
377    code
378}
379
380/// Best-effort remaining-failure set for an interrupted run: the actionable
381/// failures from the most recent diagnostics snapshot, or empty if none ran.
382fn remaining_after_interrupt(session: &Session) -> Vec<DiagnosticKey> {
383    session
384        .snapshots
385        .last()
386        .map(|s| actionable_failures(&s.results).into_iter().collect())
387        .unwrap_or_default()
388}
389
390fn print_json_outcome(
391    session: &Session,
392    outcome: &FinalOutcome,
393    report_path: Option<&std::path::Path>,
394    recovery_needed: &[String],
395) {
396    use serde_json::json;
397
398    let outcome_label = match outcome {
399        FinalOutcome::Fixed => "fixed",
400        FinalOutcome::Partial(_) => "partial",
401        FinalOutcome::Exhausted(_) => "exhausted",
402        FinalOutcome::HardBlock(_) => "hard_block",
403        FinalOutcome::Timeout(_) => "timeout",
404        FinalOutcome::UserDeclined(_) => "user_declined",
405        FinalOutcome::PreflightFailed(_) => "preflight_failed",
406        FinalOutcome::Interrupted(_) => "interrupted",
407    };
408
409    let remaining: Vec<&str> = match outcome {
410        FinalOutcome::Partial(rs)
411        | FinalOutcome::Exhausted(rs)
412        | FinalOutcome::Timeout(rs)
413        | FinalOutcome::UserDeclined(rs)
414        | FinalOutcome::Interrupted(rs) => rs.iter().map(|k| diagnostic_key_str(*k)).collect(),
415        _ => Vec::new(),
416    };
417
418    let actions_json: Vec<_> = session
419        .action_log
420        .iter()
421        .map(|r| {
422            json!({
423                "iteration": r.iteration,
424                "action": format!("{:?}", r.action_id),
425                "label": r.label,
426                "ok": r.outcome.ok,
427                "message": r.outcome.message,
428                "duration_ms": r.duration.as_millis() as u64,
429                "user_declined": r.user_declined,
430                "skipped_no_interaction": r.skipped_no_interaction,
431            })
432        })
433        .collect();
434
435    let value = json!({
436        "action": "fix",
437        "outcome": outcome_label,
438        "exit_code": outcome.exit_code(),
439        "iterations": session.snapshots.len().saturating_sub(1),
440        "remaining_failures": remaining,
441        "applied_actions": actions_json,
442        "elapsed_seconds": session.elapsed().as_secs(),
443        "report_path": report_path.map(|p| p.display().to_string()),
444        "interrupted": matches!(outcome, FinalOutcome::Interrupted(_)),
445        "manual_recovery_needed": recovery_needed,
446        "preflight_error": match outcome {
447            FinalOutcome::PreflightFailed(s) => Some(s.clone()),
448            _ => None,
449        },
450        "hard_block": match outcome {
451            FinalOutcome::HardBlock(b) => Some(hard_block_str(b).to_string()),
452            _ => None,
453        },
454    });
455
456    println!(
457        "{}",
458        serde_json::to_string_pretty(&value).unwrap_or_else(|_| "{}".to_string())
459    );
460}
461
462fn diagnostic_key_str(k: DiagnosticKey) -> &'static str {
463    match k {
464        DiagnosticKey::Adapters => "adapters",
465        DiagnosticKey::Interfaces => "interfaces",
466        DiagnosticKey::Gateway => "gateway",
467        DiagnosticKey::Dns => "dns",
468        DiagnosticKey::PublicIp => "public_ip",
469        DiagnosticKey::Latency => "latency",
470        DiagnosticKey::Ports => "ports",
471        DiagnosticKey::Speed => "speed",
472    }
473}
474
475fn hard_block_str(b: &HardBlock) -> &'static str {
476    match b {
477        HardBlock::CaptivePortal => "captive_portal",
478        HardBlock::NoPhysicalLink => "no_physical_link",
479        HardBlock::IspOutage => "isp_outage",
480        HardBlock::EnterpriseVpnActive(_) => "enterprise_vpn_active",
481    }
482}