nd300 3.3.0

Cross-platform network diagnostic tool
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
//! Diagnostic-driven fix loop driver.
//!
//! The flow:
//!
//! 1. Run baseline diagnostics.
//! 2. If everything passes, exit cleanly.
//! 3. Otherwise, in a bounded loop:
//!    a. Detect hard blocks (captive portal / ISP outage / no link / enterprise VPN) — exit cleanly with guidance.
//!    b. Compute the actionable failure set, group by root cause, and build a plan.
//!    c. Apply the plan's actions one by one, prompting Y/N for any High-risk action.
//!    d. After each action, sleep its `stabilization` window.
//!    e. Re-run diagnostics; if all pass, exit; else continue.
//! 4. Bounded by iteration count, wall clock, and per-action attempt caps.

use std::time::{Duration, Instant};

use crate::config::{Config, OutputFormat};
use crate::diagnostics;

use super::action::{self, DiagnosticKey};
use super::session::{FinalOutcome, Reporter, RestoreRegistry, Session, DEFAULT_ITERATION_DELAY};
use super::triage::{
    actionable_failures, build_plan, hard_block_detected, requires_confirmation,
    requires_high_risk_consent, HardBlock, MAX_ITERATIONS,
};

/// Outer timeout on the whole restore drain.
///
/// `run_and_finalize` wraps `restore.drain()` in a `tokio::time::timeout` of
/// this length, so the aggregate cleanup can never hang the process no matter
/// which terminal path (normal end, Ctrl-C, caught panic, wall-clock cap) led
/// to it. When the cap is hit, a manual-recovery message is reported instead of
/// the drain's own failure list.
///
/// NOTE(review): individual restore ops are presumably bounded on their own as
/// well — that is implemented outside this file; confirm in `RestoreRegistry`.
const DRAIN_CAP: Duration = Duration::from_secs(90);

/// Runs the full triage loop, populating the caller-owned `session` so the
/// report stays rich even if the run is interrupted or panics. Returns the
/// `FinalOutcome`. Destructive actions register inverse ops on `restore`; the
/// caller drains it on every terminal path.
pub async fn run(
    config: &Config,
    session: &mut Session,
    restore: &RestoreRegistry,
) -> FinalOutcome {
    let interactive = is_interactive(config);
    let reporter = Reporter::new(config);

    if interactive {
        reporter.header();
    }

    // Iteration 1: baseline diagnostics.
    let baseline = diagnostics::run_all(config).await;
    session.record_baseline(baseline.clone());

    let mut current = baseline;

    if interactive {
        let initial_failures = actionable_failures(&current);
        reporter.baseline_summary(initial_failures.len());
    }

    for iteration in 1..=MAX_ITERATIONS {
        // Wall-clock cap.
        if session.wall_clock_exhausted() {
            let remaining: Vec<DiagnosticKey> = actionable_failures(&current).into_iter().collect();
            let outcome = FinalOutcome::Timeout(remaining);
            session.final_outcome = Some(outcome.clone());
            if interactive {
                reporter.final_verdict(&outcome, None);
            }
            return outcome;
        }

        let failures = actionable_failures(&current);
        if failures.is_empty() {
            let outcome = FinalOutcome::Fixed;
            session.final_outcome = Some(outcome.clone());
            if interactive {
                reporter.final_verdict(&outcome, None);
            }
            return outcome;
        }

        // Hard-block check — short-circuits before any action runs.
        if let Some(block) = hard_block_detected(&current) {
            let outcome = FinalOutcome::HardBlock(block);
            session.final_outcome = Some(outcome.clone());
            if interactive {
                reporter.final_verdict(&outcome, None);
            }
            return outcome;
        }

        if interactive {
            reporter.iteration_header(iteration);
        }

        let registry = action::all_actions();
        let plan = build_plan(
            &failures,
            &session.attempts,
            &session.effectiveness,
            &registry,
        );

        if plan.is_empty() {
            let remaining: Vec<DiagnosticKey> = failures.into_iter().collect();
            let outcome = FinalOutcome::Exhausted(remaining);
            session.final_outcome = Some(outcome.clone());
            if interactive {
                reporter.final_verdict(&outcome, None);
            }
            return outcome;
        }

        // Apply actions in cost-order. Fatal env changes break early so we
        // re-probe before applying further actions in the same iteration.
        let mut user_declined_confirmation = false;
        let mut skipped_for_confirmation = false;
        let mut ran_action = false;
        for action in &plan {
            if session.wall_clock_exhausted() {
                break;
            }

            // Confirmation gates. High-risk always requires explicit Y/N;
            // medium-risk and DNS-changing actions honor --yes.
            if requires_confirmation(action, config.auto_confirm_medium_risk) {
                if !interactive {
                    session.record_action(
                        iteration,
                        action,
                        super::action::ActionOutcome::fail(
                            "Skipped: requires confirmation. Re-run `nd300 fix` in a terminal or use `--yes` for medium-risk actions.",
                        ),
                        Duration::from_millis(0),
                        false,
                        true,
                    );
                    skipped_for_confirmation = true;
                    continue;
                }

                let approved = if requires_high_risk_consent(action) {
                    reporter.high_risk_prompt(action)
                } else {
                    reporter.confirmation_prompt(action)
                };

                if !approved {
                    reporter.confirmation_declined(action);
                    session.record_action(
                        iteration,
                        action,
                        super::action::ActionOutcome::fail("User declined the prompt."),
                        Duration::from_millis(0),
                        true,
                        false,
                    );
                    user_declined_confirmation = true;
                    break;
                }
            }

            if interactive {
                reporter.announce_action(action);
            }
            let started = Instant::now();
            let outcome = action.apply(config, restore).await;
            let duration = started.elapsed();
            if interactive {
                reporter.finish_action(&outcome, duration);
            }

            let fatal_env_change = outcome.fatal_environment_change;
            session.record_action(iteration, action, outcome, duration, false, false);
            ran_action = true;

            // Stabilize before either re-probing or applying the next action.
            if action.stabilization > Duration::from_millis(0) {
                tokio::time::sleep(action.stabilization).await;
            }

            if fatal_env_change {
                // Break out of the plan-loop and re-probe immediately.
                break;
            }
        }

        if user_declined_confirmation || (skipped_for_confirmation && !ran_action) {
            let remaining: Vec<DiagnosticKey> = actionable_failures(&current).into_iter().collect();
            let outcome = FinalOutcome::UserDeclined(remaining);
            session.final_outcome = Some(outcome.clone());
            if interactive {
                reporter.final_verdict(&outcome, None);
            }
            return outcome;
        }

        // Light delay between iterations to let the OS settle.
        tokio::time::sleep(DEFAULT_ITERATION_DELAY).await;

        // Re-probe.
        let prior_failures = actionable_failures(&current);
        current = diagnostics::run_all(config).await;
        let now_failures = actionable_failures(&current);
        session.record_iteration(iteration, current.clone());
        session.update_effectiveness(iteration, &prior_failures, &now_failures);
    }

    // Hit MAX_ITERATIONS without converging.
    let remaining_failures = actionable_failures(&current);
    let remaining: Vec<DiagnosticKey> = remaining_failures.iter().copied().collect();
    let outcome = if remaining_failures.is_empty() {
        FinalOutcome::Fixed
    } else {
        let baseline_failures = session
            .baseline
            .as_ref()
            .map(actionable_failures)
            .unwrap_or_default();
        let any_progress = baseline_failures
            .difference(&remaining_failures)
            .next()
            .is_some();
        if any_progress {
            FinalOutcome::Partial(remaining)
        } else {
            FinalOutcome::Exhausted(remaining)
        }
    };
    session.final_outcome = Some(outcome.clone());
    if interactive {
        reporter.final_verdict(&outcome, None);
    }
    outcome
}

/// Reports whether the loop may render prompts and progress output: the
/// configured format must not be JSON and stdin must be a terminal.
fn is_interactive(config: &Config) -> bool {
    use std::io::IsTerminal;

    let json_output = config.format == OutputFormat::Json;
    // `&&` short-circuits, so stdin is not probed at all for JSON output.
    !json_output && std::io::stdin().is_terminal()
}

/// Runs the fix flow end to end and returns the process exit code derived from
/// the `FinalOutcome`.
///
/// Steps, in order:
///
/// 1. Pre-flight: without elevated privileges the flow reports
///    `PreflightFailed` (JSON object or text verdict) and returns at once.
/// 2. The triage loop (`run`) is raced against Ctrl-C in a biased
///    `tokio::select!`, with the loop future wrapped in `catch_unwind` so a
///    panic is caught instead of unwinding out of this function.
/// 3. On EVERY path past the pre-flight — normal end, Ctrl-C, or caught panic —
///    the restore registry is drained under `DRAIN_CAP`, so any half-applied
///    network change gets a rollback attempt before the process exits.
/// 4. Anything the drain could not restore is printed as manual-recovery
///    guidance (text mode) or carried in the JSON object.
/// 5. The session report is saved and its path (or the JSON object) printed.
/// 6. After a caught panic the process exits with status 101 once cleanup is
///    done; otherwise the outcome's exit code is returned.
pub async fn run_and_finalize(config: &Config) -> i32 {
    use futures_util::FutureExt;

    let json_mode = config.format == OutputFormat::Json;

    // Pre-flight: elevation
    if !crate::platform::is_elevated() {
        let failed = FinalOutcome::PreflightFailed(String::from(
            "The fix flow requires elevated privileges. Run with sudo (Unix) or as Administrator (Windows).",
        ));
        if json_mode {
            print_json_outcome(&Session::new(), &failed, None, &[]);
        } else {
            Reporter::new(config).final_verdict(&failed, None);
        }
        return failed.exit_code();
    }

    let mut session = Session::new();
    let restore = RestoreRegistry::new();

    // Race the loop against Ctrl-C and trap any panic it raises, so restores
    // can still be drained instead of leaving the network half-broken.
    //
    // NOTE(review): `AssertUnwindSafe` relies on the registry tolerating a
    // panic mid-operation (per the original author, it uses a non-poisoning
    // `tokio::sync::Mutex` — defined outside this file). After a caught panic
    // the partially populated `Session` is only read to build a best-effort
    // report; it is never assumed to be logically consistent.
    let raced = {
        let guarded =
            std::panic::AssertUnwindSafe(run(config, &mut session, &restore)).catch_unwind();
        tokio::select! {
            biased;
            _ = tokio::signal::ctrl_c() => None,
            finished = guarded => Some(finished),
        }
    };

    // Classify the terminal path.
    //   Some(Ok(..))  -> the loop finished on its own and produced the outcome.
    //   Some(Err(..)) -> the loop panicked (caught); exit 101 after cleanup.
    //   None          -> Ctrl-C won the race.
    let mut panicked = false;
    let outcome = match raced {
        Some(Ok(finished)) => finished,
        Some(Err(_payload)) => {
            panicked = true;
            FinalOutcome::Interrupted(remaining_after_interrupt(&session))
        }
        None => {
            // The loop never returned, so it never printed a verdict; say
            // clearly what is happening right away.
            if !json_mode {
                println!();
                println!("  Interrupted — cleaning up and restoring network state...");
            }
            FinalOutcome::Interrupted(remaining_after_interrupt(&session))
        }
    };

    if !json_mode && panicked {
        println!();
        println!(
            "  A fatal internal error occurred mid-fix — restoring network state before exiting..."
        );
    }

    // ALWAYS drain restores, however we got here. The outer timeout turns a
    // stuck drain into a single manual-recovery message.
    let drain_failures = tokio::time::timeout(DRAIN_CAP, restore.drain())
        .await
        .unwrap_or_else(|_elapsed| {
            vec![format!(
                "Network-state cleanup did not finish within {}s; some changes may not have been restored.",
                DRAIN_CAP.as_secs()
            )]
        });

    // The Interrupted verdict is printed only now, after the drain attempt, so
    // the manual-recovery guidance that follows reads in order.
    if !json_mode {
        if let FinalOutcome::Interrupted(_) = &outcome {
            Reporter::new(config).final_verdict(&outcome, None);
        }
    }

    // Text mode lists whatever could not be restored; JSON mode carries the
    // same list inside the structured object instead.
    if !json_mode && !drain_failures.is_empty() {
        println!();
        println!(
            "  {}",
            crate::render::color::yellow("Manual recovery needed:", config)
        );
        for failure in &drain_failures {
            println!("{}", crate::render::color::yellow(failure, config));
        }
    }

    // Make sure the session carries the outcome on the interrupted and panic
    // paths too, where the loop never got to record it.
    session.final_outcome = Some(outcome.clone());

    let report_path =
        super::report::save_session_report_with_recovery(&session, &outcome, &drain_failures);

    if json_mode {
        print_json_outcome(&session, &outcome, report_path.as_deref(), &drain_failures);
    } else if let Some(saved) = report_path.as_ref() {
        // Show where the report went, underneath the verdict.
        println!(
            "  {} {}",
            crate::render::color::dim("Saved report:", config),
            crate::render::color::dim(&saved.display().to_string(), config),
        );
    }

    let code = outcome.exit_code();

    // A caught panic is surfaced as exit status 101, but only AFTER cleanup, so
    // the network has had its restore attempt first.
    if panicked {
        std::process::exit(101);
    }

    code
}

/// Best-effort remaining-failure set for a run that did not finish normally:
/// the actionable failures of the last snapshot recorded on `session`, or an
/// empty list when no snapshot exists yet.
fn remaining_after_interrupt(session: &Session) -> Vec<DiagnosticKey> {
    match session.snapshots.last() {
        Some(latest) => actionable_failures(&latest.results).into_iter().collect(),
        None => Vec::new(),
    }
}

fn print_json_outcome(
    session: &Session,
    outcome: &FinalOutcome,
    report_path: Option<&std::path::Path>,
    recovery_needed: &[String],
) {
    use serde_json::json;

    let outcome_label = match outcome {
        FinalOutcome::Fixed => "fixed",
        FinalOutcome::Partial(_) => "partial",
        FinalOutcome::Exhausted(_) => "exhausted",
        FinalOutcome::HardBlock(_) => "hard_block",
        FinalOutcome::Timeout(_) => "timeout",
        FinalOutcome::UserDeclined(_) => "user_declined",
        FinalOutcome::PreflightFailed(_) => "preflight_failed",
        FinalOutcome::Interrupted(_) => "interrupted",
    };

    let remaining: Vec<&str> = match outcome {
        FinalOutcome::Partial(rs)
        | FinalOutcome::Exhausted(rs)
        | FinalOutcome::Timeout(rs)
        | FinalOutcome::UserDeclined(rs)
        | FinalOutcome::Interrupted(rs) => rs.iter().map(|k| diagnostic_key_str(*k)).collect(),
        _ => Vec::new(),
    };

    let actions_json: Vec<_> = session
        .action_log
        .iter()
        .map(|r| {
            json!({
                "iteration": r.iteration,
                "action": format!("{:?}", r.action_id),
                "label": r.label,
                "ok": r.outcome.ok,
                "message": r.outcome.message,
                "duration_ms": r.duration.as_millis() as u64,
                "user_declined": r.user_declined,
                "skipped_no_interaction": r.skipped_no_interaction,
            })
        })
        .collect();

    let value = json!({
        "action": "fix",
        "outcome": outcome_label,
        "exit_code": outcome.exit_code(),
        "iterations": session.snapshots.len().saturating_sub(1),
        "remaining_failures": remaining,
        "applied_actions": actions_json,
        "elapsed_seconds": session.elapsed().as_secs(),
        "report_path": report_path.map(|p| p.display().to_string()),
        "interrupted": matches!(outcome, FinalOutcome::Interrupted(_)),
        "manual_recovery_needed": recovery_needed,
        "preflight_error": match outcome {
            FinalOutcome::PreflightFailed(s) => Some(s.clone()),
            _ => None,
        },
        "hard_block": match outcome {
            FinalOutcome::HardBlock(b) => Some(hard_block_str(b).to_string()),
            _ => None,
        },
    });

    println!(
        "{}",
        serde_json::to_string_pretty(&value).unwrap_or_else(|_| "{}".to_string())
    );
}

fn diagnostic_key_str(k: DiagnosticKey) -> &'static str {
    match k {
        DiagnosticKey::Adapters => "adapters",
        DiagnosticKey::Interfaces => "interfaces",
        DiagnosticKey::Gateway => "gateway",
        DiagnosticKey::Dns => "dns",
        DiagnosticKey::PublicIp => "public_ip",
        DiagnosticKey::Latency => "latency",
        DiagnosticKey::Ports => "ports",
        DiagnosticKey::Speed => "speed",
    }
}

fn hard_block_str(b: &HardBlock) -> &'static str {
    match b {
        HardBlock::CaptivePortal => "captive_portal",
        HardBlock::NoPhysicalLink => "no_physical_link",
        HardBlock::IspOutage => "isp_outage",
        HardBlock::EnterpriseVpnActive(_) => "enterprise_vpn_active",
    }
}