Skip to main content

ryra_core/system/
doctor.rs

1//! Unified environment + install-state checks for `ryra doctor` and the
2//! preflight gate that runs before every `ryra add`.
3//!
4//! Each [`Issue`] variant carries the data needed to render an actionable
5//! fix message via its `Display` impl, plus a [`Severity`] that decides
6//! whether `ryra add` should bail (`Blocker`) or just warn (`Warning` /
7//! `Info`). One source of truth for "what can go wrong with a ryra
8//! setup" — adding a new check means adding one variant + one detection
9//! function and both `ryra doctor` and the install gate pick it up.
10//!
11//! `--tailscale`-specific checks are kept separate ([`check_tailscale_runtime`])
12//! because they're only relevant when the user explicitly opts into the
13//! Tailscale path; surfacing them in the always-on `ryra doctor` view
14//! would be noise for users who never touch tailscale.
15
16use std::fmt;
17use std::fs;
18use std::path::PathBuf;
19
20use crate::config::schema::Config;
21use crate::system::tailscale;
22
/// Minimum subuid/subgid range required for rootless podman to map common
/// container UIDs/GIDs (e.g. nginx user 101, postgres 999, shadow group 42).
/// 65536 is the standard allocation size shipped by adduser/usermod.
///
/// `check_subid_range` compares the *smaller* of the user's subuid and
/// subgid range sizes against this value.
const MIN_SUBID_RANGE: u32 = 65536;

/// Minimum podman version, as `(major, minor)`. Quadlet must pass `${...}`
/// port/path values through to the generated ExecStart (validation removed
/// in 5.3.0) — older quadlet rejects `PublishPort=${SERVICE_PORT_HTTP}:...`
/// outright, so every registry service fails to generate. Ubuntu 24.04 LTS
/// ships 4.9; this check turns that into one clear message instead of a
/// confusing unit failure.
///
/// Compared as a tuple against the parsed `(major, minor)`, so the patch
/// level never affects the verdict.
const MIN_PODMAN: (u32, u32) = (5, 3);
35
/// How serious an [`Issue`] is. Drives both UI grouping in `ryra doctor`
/// output and the gate behaviour of `ryra add` (which bails on any
/// `Blocker` but otherwise prints warnings without stopping).
///
/// Each [`Issue`] variant maps to exactly one severity via
/// [`Issue::severity`]; [`blockers`] filters on `Severity::Blocker`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Severity {
    /// Will cause installs to fail outright. `ryra add` refuses to proceed.
    Blocker,
    /// Service runs but is in a state the user probably wants to fix —
    /// stale symlinks, linger off, etc.
    Warning,
    /// Informational, doesn't affect anything currently. Old-format
    /// installs missing `metadata.toml` etc.
    Info,
}
50
/// A typed, renderable problem detected by [`check_all`] or one of the
/// doctor-only checks ([`check_memory`], [`check_auth_wiring`],
/// [`check_tailscale_services`], [`check_tailscale_runtime`]).
///
/// Every variant carries just the data its `Display` message needs.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Issue {
    /// `podman` is missing or older than [`MIN_PODMAN`]. Registry
    /// quadlets rely on runtime env expansion that older quadlet
    /// versions reject at generation time.
    PodmanUnsupported {
        /// What was detected: a normalised `major.minor.patch` string when
        /// the version parsed but is too old, the trimmed raw
        /// `podman --version` output when it could not be parsed, and
        /// `None` when the binary could not be run at all.
        found: Option<String>,
    },
    /// User has no entry in /etc/subuid or /etc/subgid.
    SubidNotConfigured {
        /// The user the lookup was done for (taken from `$USER`).
        user: String,
        /// The files lacking an entry; joined with ` / ` in the message.
        missing_files: Vec<&'static str>,
    },
    /// Range is too small — common on Debian where adduser doesn't auto-allocate.
    SubidRangeTooSmall {
        /// The user the lookup was done for (taken from `$USER`).
        user: String,
        /// The smaller of the user's subuid and subgid range sizes.
        current: u32,
        /// The required size ([`MIN_SUBID_RANGE`]).
        minimum: u32,
    },
    /// `--tailscale` was used but the `tailscale` CLI isn't on PATH.
    TailscaleCliMissing,
    /// CLI is present but `tailscale status --json` doesn't return a
    /// `*.ts.net` DNSName for this node.
    TailscaleNotLoggedIn,
    /// A service's metadata says OIDC SSO is on, but the managed auth
    /// provider's config has no client registered for it: ryra's
    /// bookkeeping and the provider's actual state disagree, so SSO is
    /// silently broken. Usually a `ryra backup restore` of the provider
    /// from a snapshot predating this service's registration.
    AuthSsoDesync { service: String },
    /// A service is exposed via Tailscale (`svc:<svc_name>`) but the
    /// control plane hasn't approved this host to serve it, so the
    /// `*.ts.net` URL routes nowhere even though the container is healthy.
    /// `ryra add` verifies approval once at install time; this catches it
    /// drifting out of approval afterwards (ACL change, host de-approved,
    /// tailscaled losing the advertisement across a reboot).
    TailscaleServiceUnapproved { service: String, svc_name: String },
    /// A symlink in `~/.config/containers/systemd/` points at a target
    /// that no longer exists. Usually means the user `rm -rf`d the
    /// service's home dir under `~/.local/share/services/<svc>/`.
    DanglingSymlink { link: PathBuf, target: PathBuf },
    /// A real quadlet file lives in `~/.local/share/services/<svc>/`
    /// but no matching symlink exists in the systemd quadlet path,
    /// so systemd doesn't know about it. Usually means the user
    /// deleted the symlink by hand.
    OrphanQuadletFile { path: PathBuf },
    /// A service is installed (has a marker'd quadlet) but lacks a
    /// `metadata.toml`. Pre-metadata.toml install — reinstall to migrate.
    MissingMetadata { service: String },
    /// A `runtime = "native"` service was installed from a local project dir
    /// that no longer exists (the user deleted or moved their repo). The unit
    /// runs from that dir, so it can't start or rebuild (a zombie install).
    NativeSourceMissing { service: String, source: PathBuf },
    /// A quadlet references an `EnvironmentFile=` that doesn't exist on
    /// disk. The unit fails to start, or starts with every `${SERVICE_*}`
    /// var expanding to an empty string. Usually means the service's data
    /// dir was moved or renamed (e.g. `mv grafana grafana-test`) or the
    /// `.env` was deleted by hand.
    BrokenEnvFileRef {
        /// The service the quadlet belongs to.
        service: String,
        /// The quadlet file containing the broken reference.
        quadlet: PathBuf,
        /// The referenced env file that is missing; its parent directory is
        /// what the fix message suggests restoring.
        env_file: PathBuf,
    },
    /// `loginctl enable-linger` hasn't been run, so user-level
    /// services don't survive logout / reboot.
    LingerNotEnabled,
    /// Rootless podman fell back to the cgroupfs cgroup manager because there's
    /// no usable systemd user session (no user D-Bus). `systemctl --user`
    /// quadlets still run (systemd owns their cgroup), but direct `podman build`
    /// and `podman exec` fail to create containers ("sd-bus call: Interactive
    /// authentication required"). Caused by lingering off and/or a missing user
    /// D-Bus session (`dbus-user-session` on Debian/Ubuntu) and/or an unset
    /// XDG_RUNTIME_DIR. This is the wart that turns a clean `ryra add` of a
    /// container-built service into a cryptic crun failure.
    PodmanCgroupfsFallback,
    /// Couldn't read the quadlet symlink farm or service data root to
    /// detect drift — usually a permissions problem on
    /// `~/.config/containers/systemd/` or `~/.local/share/services/`.
    /// Surfaced rather than swallowed so the user knows their install
    /// state isn't being checked.
    IntegrityScanFailed { error: String },
    /// The recommended RAM of all installed services summed exceeds the
    /// machine's total RAM. Services may fail to start or get OOM-killed under
    /// load (and with no swap the box can hard-lock). Remove a service or move
    /// to a larger machine. The install path warns before crossing this line,
    /// but it's overridable and other paths add services, so a box can drift
    /// into it.
    RamOvercommitted { recommended_mb: u64, total_mb: u64 },
    /// No swap is configured, so memory pressure goes straight to the OOM
    /// killer with no cushion -- a heavy enough service set can thrash the box
    /// past the point where even sshd can fork. Add zram (compressed RAM-backed
    /// swap).
    NoSwap,
}
148
149impl Issue {
150    /// How `ryra add` and `ryra doctor` should treat this issue.
151    pub fn severity(&self) -> Severity {
152        match self {
153            Issue::PodmanUnsupported { .. } => Severity::Blocker,
154            Issue::SubidNotConfigured { .. } | Issue::SubidRangeTooSmall { .. } => {
155                Severity::Blocker
156            }
157            Issue::TailscaleCliMissing | Issue::TailscaleNotLoggedIn => Severity::Warning,
158            Issue::AuthSsoDesync { .. } => Severity::Warning,
159            Issue::TailscaleServiceUnapproved { .. } => Severity::Warning,
160            Issue::DanglingSymlink { .. } | Issue::OrphanQuadletFile { .. } => Severity::Warning,
161            Issue::BrokenEnvFileRef { .. } => Severity::Warning,
162            Issue::LingerNotEnabled => Severity::Warning,
163            Issue::RamOvercommitted { .. } | Issue::NoSwap => Severity::Warning,
164            Issue::PodmanCgroupfsFallback => Severity::Warning,
165            Issue::MissingMetadata { .. } => Severity::Info,
166            Issue::NativeSourceMissing { .. } => Severity::Warning,
167            Issue::IntegrityScanFailed { .. } => Severity::Warning,
168        }
169    }
170
171    /// Stable machine-readable identifier for the issue variant, so a UI or
172    /// rpc client can switch on it without parsing the message. Kept in one
173    /// match so adding a variant surfaces here as a compile error rather than
174    /// silently degrading to a blank code.
175    pub fn code(&self) -> &'static str {
176        match self {
177            Issue::PodmanUnsupported { .. } => "podman_unsupported",
178            Issue::SubidNotConfigured { .. } => "subid_not_configured",
179            Issue::SubidRangeTooSmall { .. } => "subid_range_too_small",
180            Issue::TailscaleCliMissing => "tailscale_cli_missing",
181            Issue::TailscaleNotLoggedIn => "tailscale_not_logged_in",
182            Issue::AuthSsoDesync { .. } => "auth_sso_desync",
183            Issue::TailscaleServiceUnapproved { .. } => "tailscale_service_unapproved",
184            Issue::DanglingSymlink { .. } => "dangling_symlink",
185            Issue::OrphanQuadletFile { .. } => "orphan_quadlet_file",
186            Issue::MissingMetadata { .. } => "missing_metadata",
187            Issue::NativeSourceMissing { .. } => "native_source_missing",
188            Issue::BrokenEnvFileRef { .. } => "broken_env_file_ref",
189            Issue::LingerNotEnabled => "linger_not_enabled",
190            Issue::RamOvercommitted { .. } => "ram_overcommitted",
191            Issue::NoSwap => "no_swap",
192            Issue::PodmanCgroupfsFallback => "podman_cgroupfs_fallback",
193            Issue::IntegrityScanFailed { .. } => "integrity_scan_failed",
194        }
195    }
196
197    /// The installed service this issue is scoped to, when it's service-specific.
198    pub fn service(&self) -> Option<String> {
199        match self {
200            Issue::AuthSsoDesync { service }
201            | Issue::TailscaleServiceUnapproved { service, .. }
202            | Issue::MissingMetadata { service }
203            | Issue::NativeSourceMissing { service, .. }
204            | Issue::BrokenEnvFileRef { service, .. } => Some(service.clone()),
205            _ => None,
206        }
207    }
208}
209
/// User-facing rendering of an [`Issue`]: a description of what is wrong,
/// followed by a suggested fix (usually as copy-pasteable commands indented
/// by two spaces).
impl fmt::Display for Issue {
    /// Writes the multi-line, actionable message for this issue.
    ///
    /// The format strings rely on `\` line continuations, which drop the
    /// newline and the next line's leading whitespace; a `  \` right after a
    /// `\n` is what produces the two-space indent on command lines.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            // Two distinct messages: a version was detected (too old or
            // unparseable) vs. podman could not be run at all.
            Issue::PodmanUnsupported { found } => match found {
                Some(version) => write!(
                    f,
                    "podman {version} is too old — ryra needs podman >= {}.{} \
                     (quadlet env expansion in PublishPort/Volume).\n\
                     \n\
                     Fix: upgrade podman — current Debian-based, Fedora, and Arch \
                     releases all ship a supported version.",
                    MIN_PODMAN.0, MIN_PODMAN.1,
                ),
                None => write!(
                    f,
                    "podman isn't on PATH — ryra runs every service as a rootless \
                     podman container.\n\
                     \n\
                     Fix:\n  \
                       sudo apt install podman      # Debian-based\n  \
                       sudo dnf install podman      # Fedora\n  \
                       sudo pacman -S podman        # Arch",
                ),
            },
            Issue::SubidNotConfigured {
                user,
                missing_files,
            } => {
                // Positional args: user, the missing files joined with " / ",
                // then user again for the usermod command.
                write!(
                    f,
                    "rootless podman needs subuid/subgid mappings, but {} has no entry in {}.\n\
                     \n\
                     Fix:\n  \
                       sudo usermod --add-subuids 100000-165535 --add-subgids 100000-165535 {}\n  \
                       podman system migrate",
                    user,
                    missing_files.join(" / "),
                    user,
                )
            }
            Issue::SubidRangeTooSmall {
                user,
                current,
                minimum,
            } => {
                write!(
                    f,
                    "rootless podman needs at least {minimum} subuids/subgids, but {user} has only {current}.\n\
                     Containers with non-zero UIDs (postgres, nginx, etc.) will fail to extract.\n\
                     \n\
                     Fix:\n  \
                       sudo usermod --add-subuids 100000-165535 --add-subgids 100000-165535 {user}\n  \
                       podman system migrate",
                )
            }
            Issue::TailscaleCliMissing => {
                write!(
                    f,
                    "the `tailscale` CLI isn't on PATH.\n\
                     \n\
                     Fix (Debian/Ubuntu):\n  \
                       curl -fsSL https://tailscale.com/install.sh | sh\n\
                     Or drop --tailscale and reach the service via Caddy \
                     (run `ryra add caddy` first) or your own URL (--url).",
                )
            }
            Issue::TailscaleNotLoggedIn => {
                write!(
                    f,
                    "this node isn't logged into a tailnet.\n\
                     `tailscale status` doesn't return a *.ts.net hostname.\n\
                     \n\
                     Fix:\n  \
                       sudo tailscale up",
                )
            }
            Issue::AuthSsoDesync { service } => {
                write!(
                    f,
                    "{service} is configured for OIDC SSO, but the auth provider has no client \
                     registered for it, so SSO is broken even though ryra's metadata says it's \
                     wired. Often follows a `ryra backup restore` of the provider from a \
                     snapshot taken before {service} was added with --auth.\n\
                     \n\
                     Fix (re-registers using the existing client credentials in {service}'s \
                     .env, no secret rotation):\n  \
                       ryra config {service} --reassert-auth -y",
                )
            }
            Issue::TailscaleServiceUnapproved { service, svc_name } => {
                write!(
                    f,
                    "{service} is exposed on your tailnet (svc:{svc_name}) but the control \
                     plane hasn't approved this host to serve it, so its *.ts.net URL routes \
                     nowhere even though the container is healthy.\n\
                     \n\
                     Fix (most common: tailscaled didn't push the advertisement):\n  \
                       sudo systemctl restart tailscaled\n\
                     If it stays unapproved, your tailnet ACL isn't auto-approving the service. \
                     Confirm with:\n  \
                       sudo tailscale status --json | jq '.Self.CapMap[\"service-host\"]'\n\
                     and add the service to autoApprovers.services in the ACL (or approve the \
                     host in the admin console).",
                )
            }
            Issue::DanglingSymlink { link, target } => {
                // Positional args: link, target (description), then target
                // and link again for the two suggested remedies.
                write!(
                    f,
                    "{} is a dangling symlink → {} (target missing).\n\
                     The service's data dir was moved, renamed, or deleted, but the \
                     systemd unit pointer wasn't updated.\n\
                     \n\
                     Fix (restore the dir if it was moved, or drop the unit):\n  \
                       # put the data dir back so {} exists again\n  \
                       # or: rm {}",
                    link.display(),
                    target.display(),
                    target.display(),
                    link.display(),
                )
            }
            Issue::OrphanQuadletFile { path } => {
                // The symlink name suggested in the `ln -sf` command is the
                // quadlet's file name; "?" is shown when it has none or it
                // isn't valid UTF-8.
                write!(
                    f,
                    "{} exists but no matching symlink in ~/.config/containers/systemd/, so systemd doesn't see it.\n\
                     \n\
                     Fix (re-link):\n  \
                       ln -sf {} ~/.config/containers/systemd/{}\n  \
                       systemctl --user daemon-reload\n\
                     Or delete the orphan: ryra remove --purge <service>",
                    path.display(),
                    path.display(),
                    path.file_name().and_then(|n| n.to_str()).unwrap_or("?"),
                )
            }
            Issue::MissingMetadata { service } => {
                write!(
                    f,
                    "{service} is installed but has no metadata.toml — install record from a pre-metadata ryra version.\n\
                     `ryra list` and `ryra remove` will work but URL/exposure won't be reported.\n\
                     \n\
                     Fix (reinstall to migrate):\n  \
                       ryra remove --purge {service} && ryra add {service}",
                )
            }
            Issue::NativeSourceMissing { service, source } => {
                write!(
                    f,
                    "{service} (native) runs from {} but that directory is gone \
                     (deleted or moved). It can't start or rebuild.\n\
                     \n\
                     Fix (restore the source, then re-render):\n  \
                       # put the project back at {}, then: ryra upgrade {service}\n  \
                       # or drop the install: ryra remove --purge {service}",
                    source.display(),
                    source.display(),
                )
            }
            Issue::BrokenEnvFileRef {
                service,
                quadlet,
                env_file,
            } => {
                // `${{...}}` is brace-escaping: the rendered text contains a
                // literal `${SERVICE_HOME}` / `${SERVICE_PORT_*}`. The third
                // positional arg is the env file's parent directory, or "?"
                // when the path has no parent.
                write!(
                    f,
                    "{} references EnvironmentFile={} but that file doesn't exist.\n\
                     The unit can't start — and ${{SERVICE_HOME}}/${{SERVICE_PORT_*}} in it \
                     would expand to empty strings.\n\
                     Usually the service's data dir was moved or renamed, or the .env was deleted.\n\
                     \n\
                     Fix (restore the path, or reinstall):\n  \
                       # put the data back at {}, then: systemctl --user restart {service}\n  \
                       # or: ryra remove --purge {service} && ryra add {service}",
                    quadlet.display(),
                    env_file.display(),
                    env_file
                        .parent()
                        .unwrap_or_else(|| std::path::Path::new("?"))
                        .display(),
                )
            }
            Issue::LingerNotEnabled => {
                write!(
                    f,
                    "loginctl linger isn't enabled, so your user services stop when you log out.\n\
                     \n\
                     Fix:\n  \
                       loginctl enable-linger",
                )
            }
            Issue::RamOvercommitted {
                recommended_mb,
                total_mb,
            } => {
                write!(
                    f,
                    "installed services recommend {recommended_mb} MB of RAM but this machine \
                     has {total_mb} MB.\n\
                     Under load they may fail to start or get OOM-killed, and with no swap the \
                     box can hard-lock.\n\
                     \n\
                     Fix (remove a service, or move to a larger machine):\n  \
                       ryra remove <service>",
                )
            }
            Issue::NoSwap => {
                // `\\n` renders as a literal backslash-n so that the shell's
                // `printf` (not Rust) turns it into a newline.
                write!(
                    f,
                    "this box has no swap, so memory pressure goes straight to the OOM killer \
                     with no cushion, and a heavy enough service set can thrash it past the \
                     point where even sshd can fork.\n\
                     \n\
                     Fix (add compressed RAM-backed swap):\n  \
                       sudo apt-get install -y systemd-zram-generator\n  \
                       printf '[zram0]\\nzram-size = min(ram / 2, 4096)\\ncompression-algorithm = zstd\\n' \
                       | sudo tee /etc/systemd/zram-generator.conf\n  \
                       sudo systemctl daemon-reload\n  \
                       sudo systemctl start systemd-zram-setup@zram0.service",
                )
            }
            Issue::PodmanCgroupfsFallback => {
                // The quadrupled braces render as `{{.Host.CgroupManager}}`,
                // the Go-template syntax `podman info --format` expects.
                write!(
                    f,
                    "rootless podman has no usable systemd user session and fell back to the\n\
                     cgroupfs cgroup manager. Quadlet services started via `systemctl --user`\n\
                     still run, but direct `podman build` / `podman exec` fail to create\n\
                     containers (\"sd-bus call: Interactive authentication required\"). This box\n\
                     can pull + run images, but it can't build one locally until the user\n\
                     session works.\n\
                     \n\
                     Fix (run all three, then log out and back in so the session starts):\n  \
                       sudo loginctl enable-linger $USER\n  \
                       sudo apt-get install -y dbus-user-session   # Debian/Ubuntu: provides the user D-Bus session\n  \
                       # confirm XDG_RUNTIME_DIR=/run/user/$(id -u) is set in your shell\n\
                     \n\
                     Verify afterwards:  podman info --format '{{{{.Host.CgroupManager}}}}'  (want: systemd)\n\
                     Or sidestep it entirely: build the image in CI and let the box pull it.",
                )
            }
            Issue::IntegrityScanFailed { error } => {
                write!(
                    f,
                    "couldn't scan installed services to check for drift: {error}\n\
                     Fix the underlying error (commonly a permissions problem on \
                     ~/.config/containers/systemd/ or ~/.local/share/services/) so \
                     `ryra doctor` can verify install state.",
                )
            }
        }
    }
}
461
462/// Run every always-applicable check and return all detected issues
463/// (any severity). Tailscale-only checks are conditional; see
464/// [`check_tailscale_runtime`].
465pub fn check_all(_config: &Config) -> Vec<Issue> {
466    let mut issues = Vec::new();
467    if let Err(e) = check_podman_version() {
468        issues.push(e);
469    }
470    if let Err(e) = check_subid_range() {
471        issues.push(e);
472    }
473    if !check_linger_enabled() {
474        issues.push(Issue::LingerNotEnabled);
475    }
476    if !check_podman_user_session() {
477        issues.push(Issue::PodmanCgroupfsFallback);
478    }
479    issues.extend(check_install_integrity());
480    issues
481}
482
483/// Memory-pressure checks (doctor-only, like [`check_tailscale_services`]): the
484/// box has no swap cushion, or the recommended RAM of installed services already
485/// exceeds total RAM. Deliberately not in [`check_all`] -- no point re-running
486/// it on every `ryra add` blocker gate, where the install path's own RAM check
487/// already warns. Reads only the *cached* registry, never fetching, so it stays
488/// fast and offline. `cache_dir` is `ConfigPaths::cache_dir`.
489pub fn check_memory(cache_dir: &std::path::Path) -> Vec<Issue> {
490    let mut issues = Vec::new();
491
492    // No swap: pressure goes straight to OOM with no cushion.
493    if crate::system::memory::swap_total_mb() == Some(0) {
494        issues.push(Issue::NoSwap);
495    }
496
497    // Cumulative overcommit: sum the recommended RAM of installed services (from
498    // the cached registry manifests) and compare to total RAM. Any step we
499    // can't complete -> skip silently rather than warn on bad data.
500    if let Some(total_mb) = crate::system::memory::total_ram_mb()
501        && let Some(dir) = crate::registry::resolve::cached_default_registry_dir(cache_dir)
502        && let Ok(catalog) = crate::search_services(&dir, None)
503    {
504        let recommended_mb: u64 = catalog
505            .iter()
506            .filter(|s| s.installed)
507            .filter_map(|s| s.recommended_ram_mb)
508            .sum();
509        if recommended_mb > total_mb {
510            issues.push(Issue::RamOvercommitted {
511                recommended_mb,
512                total_mb,
513            });
514        }
515    }
516
517    issues
518}
519
520/// Filtered view: only `Blocker`-severity issues. `ryra add` calls this
521/// to decide whether to bail; warnings/info get printed but don't gate.
522pub fn blockers(config: &Config) -> Vec<Issue> {
523    check_all(config)
524        .into_iter()
525        .filter(|i| i.severity() == Severity::Blocker)
526        .collect()
527}
528
529/// Verify the host can do `tailscale serve`: the CLI is on PATH and the
530/// node is logged into a tailnet. Called from the `--tailscale` flag
531/// handler and the "Tailscale" branch of the exposure prompt; failure is
532/// fatal (we can't auto-derive a tailnet URL without a logged-in node).
533pub fn check_tailscale_runtime() -> Result<(), Issue> {
534    if !tailscale::cli_available() {
535        return Err(Issue::TailscaleCliMissing);
536    }
537    if tailscale::self_dns_name().is_none() {
538        return Err(Issue::TailscaleNotLoggedIn);
539    }
540    Ok(())
541}
542
543/// Verify ryra's auth bookkeeping matches the provider's actual state: for
544/// every installed service whose metadata says SSO is on, the managed auth
545/// provider should still have a client registered for it. Catches the
546/// provider/consumer desync (e.g. a provider restore that rolled back past
547/// a registration) that local install-state checks miss entirely.
548///
549/// Doctor-only (not in the `ryra add` gate) and silent unless the managed
550/// provider is installed and a service claims auth; only a definite
551/// mismatch is reported; undeterminable (provider config unreadable) stays
552/// quiet.
553pub fn check_auth_wiring() -> Vec<Issue> {
554    // Only the managed provider exposes a config we can introspect; an
555    // external OIDC provider is the user's to verify.
556    if !crate::is_service_installed(crate::WellKnownService::Authelia.as_str()) {
557        return Vec::new();
558    }
559    let Ok(installed) = crate::list_installed() else {
560        return Vec::new();
561    };
562    let mut issues = Vec::new();
563    for svc in &installed {
564        if svc.auth_kind.is_none() {
565            continue;
566        }
567        if crate::authelia::oidc_client_registered(&svc.name) == Some(false) {
568            issues.push(Issue::AuthSsoDesync {
569                service: svc.name.clone(),
570            });
571        }
572    }
573    issues
574}
575
576/// Verify every Tailscale-exposed installed service is still approved by
577/// the tailnet to serve its `svc:<name>`. Deliberately *not* part of
578/// [`check_all`]: it's a `ryra doctor`-only check (no point probing the
579/// tailnet on the `ryra add` blocker gate), and it stays silent unless the
580/// user actually has Tailscale-exposed services; no tailnet calls happen
581/// otherwise. Only a definite "not approved" is reported; when approval
582/// can't be determined (CLI missing, status unreadable) we say nothing
583/// rather than nag.
584pub fn check_tailscale_services() -> Vec<Issue> {
585    let Ok(installed) = crate::list_installed() else {
586        // Install-state errors are already surfaced by check_install_integrity;
587        // don't double-report here.
588        return Vec::new();
589    };
590    let mut issues = Vec::new();
591    for svc in &installed {
592        if !svc.exposure.is_tailscale() {
593            continue;
594        }
595        let Some(svc_name) = svc.exposure.tailscale_svc_name() else {
596            continue;
597        };
598        if tailscale::is_service_approved(&svc_name) == Some(false) {
599            issues.push(Issue::TailscaleServiceUnapproved {
600                service: svc.name.clone(),
601                svc_name,
602            });
603        }
604    }
605    issues
606}
607
608/// Blocker when podman is missing or below [`MIN_PODMAN`].
609fn check_podman_version() -> Result<(), Issue> {
610    let Ok(output) = std::process::Command::new("podman")
611        .arg("--version")
612        .output()
613    else {
614        return Err(Issue::PodmanUnsupported { found: None });
615    };
616    let text = String::from_utf8_lossy(&output.stdout);
617    let Some((major, minor, patch)) = parse_podman_version(&text) else {
618        // Ran but printed something unparseable — report what we saw
619        // rather than guessing it's fine.
620        return Err(Issue::PodmanUnsupported {
621            found: Some(text.trim().to_string()),
622        });
623    };
624    if (major, minor) < MIN_PODMAN {
625        return Err(Issue::PodmanUnsupported {
626            found: Some(format!("{major}.{minor}.{patch}")),
627        });
628    }
629    Ok(())
630}
631
/// Extracts `(major, minor, patch)` from `podman --version` output such as
/// "podman version 5.8.2". Only the last whitespace-separated word is
/// examined, and each dot-separated component contributes its leading run of
/// ASCII digits, so suffixes like "5.9.0-dev" are tolerated.
///
/// Returns `None` when the input is blank or when the major or minor
/// component has no leading digits. A missing or non-numeric patch
/// component is treated as `0`.
fn parse_podman_version(s: &str) -> Option<(u32, u32, u32)> {
    /// Parses the leading ASCII-digit prefix of `component`, if any.
    fn leading_number(component: &str) -> Option<u32> {
        let end = component
            .find(|c: char| !c.is_ascii_digit())
            .unwrap_or(component.len());
        component[..end].parse().ok()
    }

    let version = s.split_whitespace().next_back()?;
    let mut components = version.split('.');

    let major = leading_number(components.next()?)?;
    let minor = leading_number(components.next()?)?;
    let patch = match components.next() {
        Some(raw) => leading_number(raw).unwrap_or(0),
        None => 0,
    };

    Some((major, minor, patch))
}
646
647fn check_subid_range() -> Result<(), Issue> {
648    let user = std::env::var("USER").unwrap_or_default();
649    if user.is_empty() {
650        // No $USER means we can't check — skip rather than false-positive.
651        return Ok(());
652    }
653
654    let mut missing = Vec::new();
655    let subuid_size = parse_subid_range("/etc/subuid", &user, &mut missing);
656    let subgid_size = parse_subid_range("/etc/subgid", &user, &mut missing);
657
658    if !missing.is_empty() {
659        return Err(Issue::SubidNotConfigured {
660            user,
661            missing_files: missing,
662        });
663    }
664    let min = subuid_size.min(subgid_size);
665    if min < MIN_SUBID_RANGE {
666        return Err(Issue::SubidRangeTooSmall {
667            user,
668            current: min,
669            minimum: MIN_SUBID_RANGE,
670        });
671    }
672    Ok(())
673}
674
/// Is lingering enabled for the current user, i.e. may user services keep
/// running after logout?
///
/// Queries `loginctl show-user $USER --property=Linger` and returns `false`
/// only when the command succeeds and prints `Linger=no` (case-insensitive).
/// Every "cannot tell" situation — `$USER` unset or empty, `loginctl` not
/// runnable, non-zero exit — is treated as enabled so hosts without loginctl
/// (e.g. some CI containers) don't produce a false positive.
fn check_linger_enabled() -> bool {
    let Some(user) = std::env::var("USER").ok().filter(|name| !name.is_empty()) else {
        return true;
    };
    let Ok(result) = std::process::Command::new("loginctl")
        .arg("show-user")
        .arg(&user)
        .arg("--property=Linger")
        .output()
    else {
        return true;
    };
    if !result.status.success() {
        return true;
    }
    let reported = String::from_utf8_lossy(&result.stdout);
    !reported.trim().eq_ignore_ascii_case("Linger=no")
}
695
/// Does rootless podman have a usable systemd user session?
///
/// The observable symptom is read directly from `podman info`: the reported
/// cgroup manager. `cgroupfs` (case-insensitive) means podman has fallen back
/// because crun can't register systemd scopes, so `podman build`/`exec` will
/// hit "Interactive authentication required". Any other value counts as
/// healthy.
///
/// Conservative by design: when podman can't be run or exits non-zero the
/// result is `true` (healthy), so hosts where the check can't execute don't
/// false-positive — a missing podman is flagged by the version check instead.
fn check_podman_user_session() -> bool {
    let Ok(info) = std::process::Command::new("podman")
        .arg("info")
        .arg("--format")
        .arg("{{.Host.CgroupManager}}")
        .output()
    else {
        return true;
    };
    if !info.status.success() {
        return true;
    }
    let manager = String::from_utf8_lossy(&info.stdout);
    !manager.trim().eq_ignore_ascii_case("cgroupfs")
}
716
// Note on `check_install_integrity` (defined further down): it detects drift
// between the quadlet symlink farm and the per-service home dirs — dangling
// symlinks, orphan quadlet files, and installed services missing
// `metadata.toml` — and returns the issues in the order they're discovered.
721/// Scan a generated `.container` file for `EnvironmentFile=` lines whose
722/// target doesn't exist. `%h` is resolved like systemd would; a leading `-`
723/// (systemd's ignore-missing marker) means absence is by design — skipped.
724fn broken_env_file_refs(service: &str, quadlet_path: &std::path::Path) -> Vec<Issue> {
725    let Ok(content) = std::fs::read_to_string(quadlet_path) else {
726        return Vec::new();
727    };
728    let Ok(home) = crate::home_dir() else {
729        return Vec::new();
730    };
731    let mut out = Vec::new();
732    for line in content.lines() {
733        let Some(value) = line.trim().strip_prefix("EnvironmentFile=") else {
734            continue;
735        };
736        let value = value.trim();
737        if value.is_empty() || value.starts_with('-') {
738            continue;
739        }
740        let resolved = PathBuf::from(value.replace("%h", &home.to_string_lossy()));
741        if !resolved.exists()
742            && !out.iter().any(
743                |i| matches!(i, Issue::BrokenEnvFileRef { env_file, .. } if *env_file == resolved),
744            )
745        {
746            out.push(Issue::BrokenEnvFileRef {
747                service: service.to_string(),
748                quadlet: quadlet_path.to_path_buf(),
749                env_file: resolved,
750            });
751        }
752    }
753    out
754}
755
756fn check_install_integrity() -> Vec<Issue> {
757    let mut out = Vec::new();
758    let Ok(quadlet) = crate::quadlet_dir() else {
759        return out;
760    };
761    let Ok(data_root) = crate::service_data_root() else {
762        return out;
763    };
764
765    // Dangling symlinks in quadlet dir whose target sits under our data root.
766    if let Ok(entries) = std::fs::read_dir(&quadlet) {
767        for entry in entries.flatten() {
768            let path = entry.path();
769            let Ok(meta) = std::fs::symlink_metadata(&path) else {
770                continue;
771            };
772            if !meta.file_type().is_symlink() {
773                continue;
774            }
775            let Ok(target) = std::fs::read_link(&path) else {
776                continue;
777            };
778            let resolved = if target.is_absolute() {
779                target.clone()
780            } else {
781                // `path` came from read_dir on `quadlet`, so it always has a
782                // parent. The else-arm only fires if a future caller hands us
783                // a rootless path — skip rather than join against an empty
784                // base and report a phantom dangling symlink.
785                let Some(parent) = path.parent() else {
786                    continue;
787                };
788                parent.join(&target)
789            };
790            if !resolved.starts_with(&data_root) {
791                continue;
792            }
793            if !resolved.exists() {
794                out.push(Issue::DanglingSymlink {
795                    link: path,
796                    target: resolved,
797                });
798            }
799        }
800    }
801
802    // Orphan quadlet files: real .container/.network/.volume in service home
803    // with no matching symlink in quadlet dir, plus missing metadata.toml
804    // for marker'd installs.
805    let managed = match crate::scan_managed_services() {
806        Ok(m) => m,
807        Err(e) => {
808            out.push(Issue::IntegrityScanFailed {
809                error: e.to_string(),
810            });
811            return out;
812        }
813    };
814    for svc in &managed {
815        let Ok(home) = crate::service_home(svc) else {
816            continue;
817        };
818        if !home.is_dir() {
819            continue;
820        }
821        if let Ok(meta_path) = crate::metadata_path(svc)
822            && !meta_path.exists()
823        {
824            out.push(Issue::MissingMetadata {
825                service: svc.clone(),
826            });
827        }
828        if let Ok(entries) = std::fs::read_dir(&home) {
829            for entry in entries.flatten() {
830                let path = entry.path();
831                let name = entry.file_name();
832                let n = name.to_string_lossy();
833                if !(n.ends_with(".container") || n.ends_with(".network") || n.ends_with(".volume"))
834                {
835                    continue;
836                }
837                let symlink = quadlet.join(&name);
838                let symlink_ok = std::fs::read_link(&symlink)
839                    .ok()
840                    .and_then(|t| {
841                        if t.is_absolute() {
842                            Some(t)
843                        } else {
844                            // symlink = quadlet.join(name) — always has a parent.
845                            symlink.parent().map(|p| p.join(&t))
846                        }
847                    })
848                    .is_some_and(|resolved| resolved == path);
849                if !symlink_ok {
850                    out.push(Issue::OrphanQuadletFile { path: path.clone() });
851                }
852                if n.ends_with(".container") {
853                    out.extend(broken_env_file_refs(svc, &path));
854                }
855            }
856        }
857    }
858
859    // Native services run from their source dir (recorded in metadata as the
860    // install's `registry`, which for a local-path install holds the project
861    // path). Quadlet scans above never see them, so check separately that the
862    // source still exists: a deleted/moved repo leaves a zombie install.
863    if let Ok(root) = crate::paths::service_data_root()
864        && let Ok(entries) = std::fs::read_dir(&root)
865    {
866        for entry in entries.flatten() {
867            let Some(svc) = entry.file_name().to_str().map(str::to_string) else {
868                continue;
869            };
870            let Ok(Some(meta)) = crate::metadata::load_metadata(&svc) else {
871                continue;
872            };
873            if meta.runtime != crate::registry::service_def::Runtime::Native {
874                continue;
875            }
876            // Only local-path installs record a filesystem path here; registry
877            // installs record a registry name (ryra-managed, not user-deletable).
878            if crate::registry::resolve::is_path_like(&meta.registry) {
879                let source = PathBuf::from(&meta.registry);
880                if !source.is_dir() {
881                    out.push(Issue::NativeSourceMissing {
882                        service: svc,
883                        source,
884                    });
885                }
886            }
887        }
888    }
889
890    out
891}
892
/// Parse `username:start:count` lines and return how many subordinate IDs
/// are allocated to `user` in the file at `path`.
///
/// A user may own several ranges — `usermod --add-subuids` appends a new line
/// rather than widening an existing one — so the counts of *all* matching
/// lines are summed (saturating at `u32::MAX`). Looking only at the first
/// line would keep reporting a too-small range after the user has added an
/// adequate one.
///
/// Returns 0 and records `path` in `missing` when the file is unreadable or
/// has no entry for the user. A malformed count contributes 0; if it is the
/// user's only entry the caller's minimum-size check then fires, which leads
/// to the same actionable fix as a missing range.
fn parse_subid_range(path: &'static str, user: &str, missing: &mut Vec<&'static str>) -> u32 {
    let Ok(contents) = fs::read_to_string(path) else {
        missing.push(path);
        return 0;
    };

    let mut found = false;
    let mut total: u32 = 0;
    for line in contents.lines() {
        let mut fields = line.splitn(3, ':');
        if fields.next() != Some(user) {
            continue;
        }
        found = true;
        let _start = fields.next();
        // Trim so stray trailing whitespace doesn't turn a valid count into 0.
        let count = fields
            .next()
            .and_then(|raw| raw.trim().parse::<u32>().ok())
            .unwrap_or(0);
        total = total.saturating_add(count);
    }

    if !found {
        missing.push(path);
    }
    total
}
922
#[cfg(test)]
mod tests {
    use super::*;

    /// Render an issue through its `Display` impl, the way the CLI prints it.
    fn rendered(issue: &Issue) -> String {
        issue.to_string()
    }

    /// Version strings parse to `(major, minor, patch)` and the floor
    /// comparison admits 5.3 while rejecting anything older.
    #[test]
    fn podman_version_parsing() {
        let cases: [(&str, Option<(u32, u32, u32)>); 5] = [
            ("podman version 5.8.2", Some((5, 8, 2))),
            ("podman version 4.9.3", Some((4, 9, 3))),
            ("podman version 5.9.0-dev", Some((5, 9, 0))),
            ("podman version 6.0", Some((6, 0, 0))),
            ("garbage", None),
        ];
        for (input, expected) in cases {
            assert_eq!(parse_podman_version(input), expected);
        }
        // The floor itself: 5.3 passes, 5.2 / 4.x don't.
        assert!((5, 3) >= MIN_PODMAN);
        assert!((5, 2) < MIN_PODMAN);
        assert!((4, 9) < MIN_PODMAN);
    }

    /// A too-small range names the user and both repair commands.
    #[test]
    fn display_too_small_includes_fix_command() {
        let text = rendered(&Issue::SubidRangeTooSmall {
            user: "alice".into(),
            current: 1000,
            minimum: 65536,
        });
        for needle in ["usermod --add-subuids", "alice", "podman system migrate"] {
            assert!(text.contains(needle));
        }
    }

    /// An unconfigured user sees every affected file listed.
    #[test]
    fn display_not_configured_lists_files() {
        let text = rendered(&Issue::SubidNotConfigured {
            user: "bob".into(),
            missing_files: vec!["/etc/subuid", "/etc/subgid"],
        });
        assert!(text.contains("/etc/subuid"));
        assert!(text.contains("/etc/subgid"));
    }

    /// A missing tailscale CLI points at the installer and the caddy alternative.
    #[test]
    fn tailscale_cli_missing_display_has_install_hint() {
        let text = rendered(&Issue::TailscaleCliMissing);
        assert!(text.contains("tailscale.com/install"));
        assert!(text.contains("ryra add caddy"));
        assert!(text.contains("--url"));
    }

    /// A logged-out tailscale tells the user to run `tailscale up`.
    #[test]
    fn tailscale_not_logged_in_display_has_up_command() {
        let text = rendered(&Issue::TailscaleNotLoggedIn);
        assert!(text.contains("tailscale up"));
    }

    #[test]
    fn podman_cgroupfs_fallback_display_has_the_session_fix() {
        let issue = Issue::PodmanCgroupfsFallback;
        let s = rendered(&issue);
        // The three commands a user needs, and the verify hint (with the braces
        // un-escaped from the format string).
        assert!(s.contains("enable-linger"), "{s}");
        assert!(s.contains("dbus-user-session"), "{s}");
        assert!(s.contains("XDG_RUNTIME_DIR"), "{s}");
        assert!(s.contains("{{.Host.CgroupManager}}"), "{s}");
        assert_eq!(issue.severity(), Severity::Warning);
    }

    #[test]
    fn auth_sso_desync_display_names_service_and_nonrotating_fix() {
        let desync = Issue::AuthSsoDesync {
            service: "seafile".into(),
        };
        assert_eq!(desync.severity(), Severity::Warning);
        let text = rendered(&desync);
        assert!(text.contains("seafile"));
        // Points at the non-rotating repair command.
        assert!(text.contains("ryra config seafile --reassert-auth"));
    }

    #[test]
    fn tailscale_unapproved_display_names_service_and_fix() {
        let unapproved = Issue::TailscaleServiceUnapproved {
            service: "vikunja".into(),
            svc_name: "vikunja-debian".into(),
        };
        assert_eq!(unapproved.severity(), Severity::Warning);
        let text = rendered(&unapproved);
        // Names the service and its svc:, and carries the one-line fix.
        assert!(text.contains("vikunja"));
        assert!(text.contains("svc:vikunja-debian"));
        assert!(text.contains("systemctl restart tailscaled"));
        assert!(text.contains("autoApprovers.services"));
    }

    /// One representative issue per severity tier.
    #[test]
    fn severity_split() {
        let blocker = Issue::SubidRangeTooSmall {
            user: "x".into(),
            current: 0,
            minimum: 1,
        };
        assert_eq!(blocker.severity(), Severity::Blocker);

        let warning = Issue::DanglingSymlink {
            link: "/a".into(),
            target: "/b".into(),
        };
        assert_eq!(warning.severity(), Severity::Warning);

        let info = Issue::MissingMetadata {
            service: "x".into(),
        };
        assert_eq!(info.severity(), Severity::Info);
    }

    /// A dangling symlink's message carries the exact `rm` command.
    #[test]
    fn dangling_symlink_display_has_rm_fix() {
        let text = rendered(&Issue::DanglingSymlink {
            link: "/x/foo.container".into(),
            target: "/y/foo.container".into(),
        });
        assert!(text.contains("rm /x/foo.container"));
    }

    /// Missing metadata suggests a purge followed by a fresh add.
    #[test]
    fn missing_metadata_display_suggests_reinstall() {
        let text = rendered(&Issue::MissingMetadata {
            service: "forgejo".into(),
        });
        assert!(text.contains("ryra remove --purge forgejo"));
        assert!(text.contains("ryra add forgejo"));
    }
}