Skip to main content

ryra_core/
backup.rs

1//! Backup planning. Pure functions that take service install state +
2//! the user's backup config and produce typed plans the CLI executes.
3//!
4//! What lives here:
5//! - [`BackupRunPlan`]: everything the CLI needs to push one service's
6//!   data to the configured restic repository.
7//! - [`BackupRestorePlan`]: same shape for the reverse operation.
8//! - [`plan_backup_run`] / [`plan_backup_restore`]: the planners.
9//!
10//! What does *not* live here: spawning the `restic` subprocess, running
11//! hook scripts, or any other side effect. The CLI layer owns those.
12//! Keeping the planner pure means it round-trips cleanly in tests
13//! against a tempdir without needing restic on the test runner.
14
15use std::collections::BTreeMap;
16use std::path::{Path, PathBuf};
17
18use sha2::{Digest, Sha256};
19
20use crate::config::ConfigPaths;
21use crate::config::schema::{BackupBackend, Config};
22use crate::error::{Error, Result};
23use crate::metadata::{Metadata, load_metadata};
24use crate::paths::service_home;
25use crate::registry;
26use crate::registry::service_def::ServiceDef;
27
28const SERVICE_TOML_FILENAME: &str = "service.toml";
29
/// Concrete instructions for backing up one installed service.
///
/// The CLI consumes this by:
/// 1. Running the `pre_backup_hook` script, if one is set.
/// 2. Spawning `restic backup` with `repo`, `password` (via
///    `RESTIC_PASSWORD` env), `env` set on the child, `--tag` for each
///    string in `tags`, and `--exclude` for each string in `excludes`,
///    with `paths` as the positional arguments.
/// 3. Running the `post_backup_hook`, if one is set (even if step 2 failed —
///    failure-cleanup matters; see [`PlanHook::Cleanup`]).
///
/// `Debug` is implemented by hand: the repository password is never printed,
/// and only the *names* of the `env` variables are shown, since their values
/// may carry backend credentials.
#[derive(Clone)]
pub struct BackupRunPlan {
    pub service_name: String,
    pub service_home: PathBuf,
    pub repo: String,
    pub password: String,
    pub env: BTreeMap<String, String>,
    pub tags: Vec<String>,
    pub paths: Vec<PathBuf>,
    pub excludes: Vec<String>,
    /// False (the default) means a cold snapshot: ryra stops `units`, makes
    /// `data_paths` readable, snapshots, then restarts. True means ryra leaves
    /// the service running and only drives the hooks (see [`BackupConfig`]).
    pub online: bool,
    /// The service's systemd units (one per container quadlet), derived so ryra
    /// can stop the whole stack for a cold snapshot. Empty for an online service.
    pub units: Vec<String>,
    /// The service's data directories (absolute), derived from `[backup].paths`.
    /// Cold snapshots `podman unshare chown` these so restic can read them.
    pub data_paths: Vec<PathBuf>,
    pub pre_backup_hook: Option<PathBuf>,
    pub post_backup_hook: Option<PathBuf>,
}

impl std::fmt::Debug for BackupRunPlan {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("BackupRunPlan")
            .field("service_name", &self.service_name)
            .field("service_home", &self.service_home)
            .field("repo", &self.repo)
            // Secrets stay out of logs and panic messages.
            .field("password", &"<redacted>")
            // Variable names only; the values are withheld.
            .field("env", &self.env.keys().collect::<Vec<_>>())
            .field("tags", &self.tags)
            .field("paths", &self.paths)
            .field("excludes", &self.excludes)
            .field("online", &self.online)
            .field("units", &self.units)
            .field("data_paths", &self.data_paths)
            .field("pre_backup_hook", &self.pre_backup_hook)
            .field("post_backup_hook", &self.post_backup_hook)
            .finish()
    }
}
63
/// Instructions for restoring one installed service from a specific
/// restic snapshot.
///
/// `Debug` is implemented by hand: the repository password is never printed,
/// and only the *names* of the `env` variables are shown, since their values
/// may carry backend credentials.
#[derive(Clone)]
pub struct BackupRestorePlan {
    pub service_name: String,
    pub service_home: PathBuf,
    pub repo: String,
    pub password: String,
    pub env: BTreeMap<String, String>,
    /// `latest` to grab the newest snapshot, or a specific restic
    /// snapshot id (hex prefix) when the user passed `--at <id>`.
    pub snapshot: String,
    /// Mirror of [`BackupRunPlan::online`]. A cold restore stops `units`, wipes
    /// `data_paths` to a clean tree, restores, then restarts. An online restore
    /// runs only the hooks around restic.
    pub online: bool,
    /// The service's systemd units, derived so ryra can stop the stack before a
    /// cold restore. Empty for an online service.
    pub units: Vec<String>,
    /// The service's data directories (absolute), derived from `[backup].paths`.
    /// A cold restore wipes these to a clean tree before `restic restore`.
    pub data_paths: Vec<PathBuf>,
    /// Also restore the global `preferences.toml` bundled in the snapshot.
    /// Default `false`: a per-service restore must NOT clobber global config
    /// (SMTP/auth/backup creds/other services) with a stale copy. `true` is the
    /// disaster-recovery opt-in (`ryra backup restore --config`).
    pub include_config: bool,
    pub pre_restore_hook: Option<PathBuf>,
    pub post_restore_hook: Option<PathBuf>,
}

impl std::fmt::Debug for BackupRestorePlan {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("BackupRestorePlan")
            .field("service_name", &self.service_name)
            .field("service_home", &self.service_home)
            .field("repo", &self.repo)
            // Secrets stay out of logs and panic messages.
            .field("password", &"<redacted>")
            // Variable names only; the values are withheld.
            .field("env", &self.env.keys().collect::<Vec<_>>())
            .field("snapshot", &self.snapshot)
            .field("online", &self.online)
            .field("units", &self.units)
            .field("data_paths", &self.data_paths)
            .field("include_config", &self.include_config)
            .field("pre_restore_hook", &self.pre_restore_hook)
            .field("post_restore_hook", &self.post_restore_hook)
            .finish()
    }
}
94
/// Instructions for pruning one service's snapshots to the retention ladder.
/// Built from the configured retention policy; the CLI spawns `restic forget`
/// (then `--prune` to reclaim space) scoped to this service's `service:<name>`
/// tag, so one service's policy can't evict another's snapshots.
///
/// `Debug` is implemented by hand: the repository password is never printed,
/// and only the *names* of the `env` variables are shown, since their values
/// may carry backend credentials.
#[derive(Clone)]
pub struct BackupForgetPlan {
    pub service_name: String,
    pub repo: String,
    pub password: String,
    pub env: BTreeMap<String, String>,
    /// restic `--tag` filter (e.g. `service:<name>,mode:daily`) — forget only
    /// considers snapshots matching all of these comma-joined tags.
    pub tag: String,
    /// `--keep-*` flags from the policy. Never empty (the planner returns
    /// `None` for an absent/all-zero policy rather than an empty plan).
    pub keep_args: Vec<String>,
    /// Reclaim space after forgetting. Skipped in a dry run.
    pub prune: bool,
    /// Show what would be removed without removing it.
    pub dry_run: bool,
}

impl std::fmt::Debug for BackupForgetPlan {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("BackupForgetPlan")
            .field("service_name", &self.service_name)
            .field("repo", &self.repo)
            // Secrets stay out of logs and panic messages.
            .field("password", &"<redacted>")
            // Variable names only; the values are withheld.
            .field("env", &self.env.keys().collect::<Vec<_>>())
            .field("tag", &self.tag)
            .field("keep_args", &self.keep_args)
            .field("prune", &self.prune)
            .field("dry_run", &self.dry_run)
            .finish()
    }
}
116
117/// Plan a `ryra backup manual <service>` invocation. Errors loudly when:
118/// - the service isn't installed,
119/// - the user hasn't run `ryra backup connect` yet,
120/// - the service author hasn't declared backup support (defensive —
121///   the install-time check should have caught this earlier, but a
122///   manifest change between install and backup is possible).
123///
124/// Note: a snapshot does NOT require the service to be enrolled
125/// (`backup_enabled`). Enrollment only governs the daily/weekly schedule; a
126/// manual one-off backup of any backup-capable install is allowed.
127pub fn plan_backup_run(
128    service_name: &str,
129    config: &Config,
130    repo_dir: &Path,
131    mode: &str,
132) -> Result<BackupRunPlan> {
133    // Ensure it's installed (errors otherwise); enrollment is not required.
134    load_install_metadata(service_name)?;
135    let settings = config
136        .backup
137        .as_ref()
138        .ok_or(Error::BackupRepoNotConfigured)?;
139
140    let svc = registry::find_service(repo_dir, service_name)?;
141    if !svc.def.integrations.backup {
142        return Err(Error::BackupNotSupported(service_name.to_string()));
143    }
144
145    let home = service_home(service_name)?;
146    let (mut paths, excludes) = resolve_paths(&svc.def, &home)?;
147
148    // Every snapshot also carries the global `preferences.toml` (repo
149    // creds, SMTP, auth, generated secrets). It's tiny and restic dedups
150    // it across services, so the cost is ~nothing — and it means any
151    // single service snapshot is enough to restore the global config.
152    let prefs = ConfigPaths::resolve()?.config_file;
153    if prefs.exists() {
154        paths.push(prefs);
155    }
156
157    let manifest_sha = manifest_sha256(&svc.service_dir);
158    let mut tags = vec![format!("service:{service_name}")];
159    tags.push(format!("manifest_sha:{}", &manifest_sha[..16]));
160    // Stamp the stable machine id so a snapshot self-identifies which machine it
161    // came from (the label is the hostname, carried in restic's own `host`
162    // field). Lets the bucket be read back machine-by-machine even with only the
163    // backups in hand.
164    if let Some(machine) = config.machine.as_ref() {
165        tags.push(format!("machine_id:{}", machine.id));
166    }
167    // The cadence this snapshot belongs to (daily | weekly | manual). Drives
168    // per-mode retention (keep the last N of a mode) and the grouped listing.
169    tags.push(format!("mode:{mode}"));
170
171    let backup = svc.def.backup.as_ref();
172    let pre = resolve_hook(
173        backup.and_then(|b| b.pre_backup.as_deref()),
174        &home,
175        "backup-pre.sh",
176    );
177    let post = resolve_hook(
178        backup.and_then(|b| b.post_backup.as_deref()),
179        &home,
180        "backup-post.sh",
181    );
182    let online = backup.is_some_and(|b| b.online);
183    // Cold snapshots stop the stack; online ones don't, so they need no units.
184    let units = if online {
185        Vec::new()
186    } else {
187        service_units(&home)
188    };
189    let data = data_paths(&svc.def, &home);
190
191    Ok(BackupRunPlan {
192        service_name: service_name.to_string(),
193        service_home: home,
194        repo: settings.backend.restic_repo(),
195        password: settings.password.clone(),
196        env: backend_env_map(&settings.backend),
197        tags,
198        paths,
199        excludes,
200        online,
201        units,
202        data_paths: data,
203        pre_backup_hook: pre,
204        post_backup_hook: post,
205    })
206}
207
208/// Plan a `ryra backup restore <service>` invocation.
209///
210/// `snapshot` is either `latest` (newest snapshot tagged with this
211/// service) or an explicit restic snapshot id. The CLI resolves the
212/// actual id by querying restic; this planner stays pure and just
213/// passes the user's choice through.
214pub fn plan_backup_restore(
215    service_name: &str,
216    snapshot: &str,
217    config: &Config,
218    repo_dir: &Path,
219) -> Result<BackupRestorePlan> {
220    // Ensure it's installed (errors otherwise); a snapshot can be restored
221    // whether or not the service is enrolled in the schedule.
222    load_install_metadata(service_name)?;
223    let settings = config
224        .backup
225        .as_ref()
226        .ok_or(Error::BackupRepoNotConfigured)?;
227
228    let svc = registry::find_service(repo_dir, service_name)?;
229    let home = service_home(service_name)?;
230
231    let backup = svc.def.backup.as_ref();
232    let pre = resolve_hook(
233        backup.and_then(|b| b.pre_restore.as_deref()),
234        &home,
235        "restore-pre.sh",
236    );
237    let post = resolve_hook(
238        backup.and_then(|b| b.post_restore.as_deref()),
239        &home,
240        "restore-post.sh",
241    );
242    let online = backup.is_some_and(|b| b.online);
243    let units = if online {
244        Vec::new()
245    } else {
246        service_units(&home)
247    };
248    let data = data_paths(&svc.def, &home);
249
250    Ok(BackupRestorePlan {
251        service_name: service_name.to_string(),
252        service_home: home,
253        repo: settings.backend.restic_repo(),
254        password: settings.password.clone(),
255        env: backend_env_map(&settings.backend),
256        snapshot: snapshot.to_string(),
257        online,
258        units,
259        data_paths: data,
260        // Per-service restore never touches the global config by default; the
261        // CLI's `--config` flag flips this for disaster recovery.
262        include_config: false,
263        pre_restore_hook: pre,
264        post_restore_hook: post,
265    })
266}
267
268/// Plan a per-mode prune for one service: keep at most `keep` snapshots tagged
269/// `mode:<mode>` (the daily or weekly cap), dropping the oldest beyond that.
270/// Manual snapshots are never pruned, so callers only pass `daily`/`weekly`.
271/// Returns `Ok(None)` when `keep == 0` (unlimited) rather than running a
272/// keep-nothing forget.
273pub fn plan_mode_prune(
274    service_name: &str,
275    config: &Config,
276    mode: &str,
277    keep: u32,
278    dry_run: bool,
279) -> Result<Option<BackupForgetPlan>> {
280    if keep == 0 {
281        return Ok(None);
282    }
283    let metadata = load_install_metadata(service_name)?;
284    if !metadata.backup_enabled {
285        return Err(Error::BackupNotEnabled(service_name.to_string()));
286    }
287    let settings = config
288        .backup
289        .as_ref()
290        .ok_or(Error::BackupRepoNotConfigured)?;
291    Ok(Some(BackupForgetPlan {
292        service_name: service_name.to_string(),
293        repo: settings.backend.restic_repo(),
294        password: settings.password.clone(),
295        env: backend_env_map(&settings.backend),
296        // AND of both tags ("a,b"): only THIS service's snapshots in THIS mode.
297        tag: format!("service:{service_name},mode:{mode}"),
298        keep_args: vec!["--keep-last".to_string(), keep.to_string()],
299        prune: true,
300        dry_run,
301    }))
302}
303
304/// List installed services that have `backup_enabled = true` in their
305/// metadata. The CLI's `ryra backup manual` (no service argument) uses
306/// this to iterate every enabled install.
307pub fn list_backup_enabled() -> Result<Vec<String>> {
308    let root = crate::paths::service_data_root()?;
309    if !root.is_dir() {
310        return Ok(Vec::new());
311    }
312    let mut out = Vec::new();
313    for entry in std::fs::read_dir(&root).map_err(|source| Error::FileRead {
314        path: root.clone(),
315        source,
316    })? {
317        let entry = entry.map_err(|source| Error::FileRead {
318            path: root.clone(),
319            source,
320        })?;
321        let name = match entry.file_name().to_str() {
322            Some(s) => s.to_string(),
323            None => continue,
324        };
325        if let Some(meta) = load_metadata(&name)?
326            && meta.backup_enabled
327        {
328            out.push(name);
329        }
330    }
331    out.sort();
332    Ok(out)
333}
334
335/// Enroll or unenroll a service in backups by flipping `backup_enabled` in its
336/// `metadata.toml`. Returns whether the flag actually changed (`false` if the
337/// service isn't installed, or was already in that state). This on-disk flag is
338/// what [`list_backup_enabled`] and a no-argument `ryra backup manual` read, so it
339/// is the single source of truth both the CLI picker and the rpc layer set.
340pub fn set_backup_enabled(service: &str, enabled: bool) -> Result<bool> {
341    let Some(mut meta) = load_metadata(service)? else {
342        return Ok(false);
343    };
344    if meta.backup_enabled == enabled {
345        return Ok(false);
346    }
347    meta.backup_enabled = enabled;
348    let path = service_home(service)?.join("metadata.toml");
349    let toml = toml::to_string_pretty(&meta)?;
350    std::fs::write(&path, toml).map_err(|source| Error::FileWrite { path, source })?;
351    Ok(true)
352}
353
354fn load_install_metadata(service_name: &str) -> Result<Metadata> {
355    load_metadata(service_name)?.ok_or_else(|| Error::ServiceNotInstalled(service_name.to_string()))
356}
357
358/// Resolve the set of absolute paths to feed restic, plus the list of
359/// `--exclude` patterns.
360///
361/// Two routes:
362/// - Explicit `[backup].paths`: trust the manifest, resolve each
363///   entry against the service home.
364/// - No explicit paths: ask the classifier "what's data here?" — that
365///   covers every top-level child not in the install manifest (the
366///   `data/` directory, the `db-data/` directory, anything the user
367///   has dropped in). Also include `.backup/` if the manifest declared
368///   any pre_backup hook, since that's the convention for dumping.
369fn resolve_paths(def: &ServiceDef, home: &Path) -> Result<(Vec<PathBuf>, Vec<String>)> {
370    let backup = def.backup.as_ref();
371    let excludes: Vec<String> = backup.map(|b| b.exclude.clone()).unwrap_or_default();
372
373    // Whole-folder backup: capture the entire service home in one path.
374    // This carries config (`.env`, `metadata.toml`, quadlets, rendered
375    // configs) alongside data, so a restore reconstructs the install
376    // without re-running `ryra add` — that's the difference between
377    // "restore and go" and a hand rebuild.
378    //
379    // Database consistency is the hooks' job, not the path list's:
380    //  - dump services (`backup-pre.sh` → mariadb-dump/pg_dump into
381    //    `.backup/`) list their *live* DB dir in `[backup].exclude` so
382    //    the consistent dump is authoritative, not the changing files;
383    //  - cold-stop services stop the DB before the snapshot, so its dir
384    //    is already consistent and is captured as part of the folder.
385    //
386    // `exclude` also drops regenerable caches. An explicit
387    // `[backup].paths` still narrows the capture for the rare service
388    // that needs it, but the default — and the recommendation — is the
389    // whole folder.
390    if let Some(b) = backup
391        && !b.paths.is_empty()
392    {
393        // A curated `paths` list keeps regenerable junk (thumbnails,
394        // transcodes) out of the snapshot — honour it for *data*, but
395        // always add the config artifacts so a restore can still
396        // reconstruct the install without `ryra add`.
397        let mut abs: Vec<PathBuf> = b.paths.iter().map(|p| home.join(p)).collect();
398        abs.extend(config_artifacts(home));
399        abs.sort();
400        abs.dedup();
401        return Ok((abs, excludes));
402    }
403
404    Ok((vec![home.to_path_buf()], excludes))
405}
406
/// Config artifacts that travel with a curated-path backup so a restore can
/// rebuild the install without re-running `ryra add`.
///
/// Returned in this order, and only when present on disk:
/// 1. the files `.env`, `metadata.toml`, `service.manifest`;
/// 2. the `configs/` directory;
/// 3. every direct child of `home` whose name ends in `.container`,
///    `.network` or `.volume` (in directory-listing order).
///
/// An unreadable `home` simply contributes no quadlet files. (Services with
/// no explicit `paths` capture the whole folder, which already covers these.)
fn config_artifacts(home: &Path) -> Vec<PathBuf> {
    const FIXED_FILES: [&str; 3] = [".env", "metadata.toml", "service.manifest"];
    const QUADLET_SUFFIXES: [&str; 3] = [".container", ".network", ".volume"];

    let mut found: Vec<PathBuf> = FIXED_FILES
        .iter()
        .map(|name| home.join(name))
        .filter(|candidate| candidate.exists())
        .collect();

    let configs_dir = home.join("configs");
    if configs_dir.is_dir() {
        found.push(configs_dir);
    }

    // `Result::into_iter` yields the listing only on success; the second
    // `flatten` drops entries that failed to read.
    let quadlets = std::fs::read_dir(home)
        .into_iter()
        .flatten()
        .flatten()
        .filter(|entry| {
            let file_name = entry.file_name();
            let text = file_name.to_string_lossy();
            QUADLET_SUFFIXES.iter().any(|suffix| text.ends_with(*suffix))
        })
        .map(|entry| entry.path());
    found.extend(quadlets);
    found
}
436
/// The service's systemd units: `<stem>.service` for every `<stem>.container`
/// entry directly inside the service home, sorted for determinism.
///
/// This is the set ryra stops for a cold snapshot or restore (`Requires=`
/// governs startup, not shutdown, so the whole stack is stopped explicitly).
/// An unreadable home yields an empty list.
fn service_units(home: &Path) -> Vec<String> {
    let Ok(entries) = std::fs::read_dir(home) else {
        return Vec::new();
    };
    let mut units: Vec<String> = entries
        .flatten()
        .filter_map(|entry| {
            let file_name = entry.file_name();
            let text = file_name.to_string_lossy();
            let unit = text
                .strip_suffix(".container")
                .map(|stem| format!("{stem}.service"));
            unit
        })
        .collect();
    units.sort();
    units
}
454
455/// The service's data directories (absolute) — the `[backup].paths` entries
456/// resolved against the home. These are the trees a cold snapshot chowns so
457/// restic can read them, and a cold restore wipes to a clean tree first. Config
458/// artifacts and `preferences.toml` are never in this set (never wiped). Empty
459/// when the service declares no explicit paths.
460fn data_paths(def: &ServiceDef, home: &Path) -> Vec<PathBuf> {
461    def.backup
462        .as_ref()
463        .map(|b| b.paths.iter().map(|p| home.join(p)).collect())
464        .unwrap_or_default()
465}
466
467/// Whether backing up this service stops it. Cold services (the default) take a
468/// stop-the-stack snapshot; `online` services snapshot live. Surfaced to the UI
469/// so "Back up now" can warn about the brief downtime.
470pub fn backup_stops_service(def: &ServiceDef) -> bool {
471    def.backup.as_ref().is_some_and(|b| !b.online)
472}
473
474/// Whether restoring this service stops it. A cold restore always stops the
475/// stack to wipe + replace its data; an `online` service stops only if it ships
476/// restore hooks (e.g. it pauses the app while re-importing a dump). Surfaced to
477/// the UI so the restore confirm can warn about downtime.
478pub fn restore_stops_service(def: &ServiceDef, home: &Path) -> bool {
479    match def.backup.as_ref() {
480        None => false,
481        Some(b) if !b.online => true,
482        Some(b) => {
483            resolve_hook(b.pre_restore.as_deref(), home, "restore-pre.sh").is_some()
484                || resolve_hook(b.post_restore.as_deref(), home, "restore-post.sh").is_some()
485        }
486    }
487}
488
/// Location of a hook script: `<home>/configs/scripts/<filename>`.
fn hook_path(home: &Path, filename: &str) -> PathBuf {
    let mut script = home.to_path_buf();
    script.extend(["configs", "scripts", filename]);
    script
}
492
493/// Decide which hook script (if any) to invoke for a given lifecycle
494/// phase. Priority:
495/// 1. Explicit `[backup].pre_backup` (or sibling) in service.toml.
496/// 2. Convention: `configs/scripts/<phase>.sh` on disk.
497/// 3. None — phase is a no-op.
498///
499/// The convention path means a typical service.toml's `[backup]`
500/// section is a single `paths = [...]` line; the four hook scripts
501/// are auto-discovered when their conventional names are present in
502/// `configs/scripts/`, and authors never have to repeat the
503/// filenames in the manifest.
504fn resolve_hook(explicit: Option<&str>, home: &Path, conventional: &str) -> Option<PathBuf> {
505    if let Some(name) = explicit {
506        return Some(hook_path(home, name));
507    }
508    let conv = hook_path(home, conventional);
509    if conv.exists() { Some(conv) } else { None }
510}
511
512fn backend_env_map(backend: &BackupBackend) -> BTreeMap<String, String> {
513    backend
514        .env()
515        .into_iter()
516        .map(|(k, v)| (k.to_string(), v))
517        .collect()
518}
519
520/// Hex SHA256 of the service's `service.toml`. Used as the
521/// `manifest_sha:` tag on each snapshot so a future restore can detect
522/// version skew between the snapshot and the currently-installed
523/// service definition.
524///
525/// Falls back to an all-zero hash if the file can't be read — the
526/// caller's higher-level error handling will already have failed for
527/// other reasons, and a sentinel hash is more useful than panicking.
528pub fn manifest_sha256(service_dir: &Path) -> String {
529    let path = service_dir.join(SERVICE_TOML_FILENAME);
530    let bytes = match std::fs::read(&path) {
531        Ok(b) => b,
532        Err(_) => return "0".repeat(64),
533    };
534    let mut hasher = Sha256::new();
535    hasher.update(&bytes);
536    let digest = hasher.finalize();
537    hex_encode(&digest)
538}
539
/// Render bytes as lowercase hexadecimal, two characters per byte.
fn hex_encode(bytes: &[u8]) -> String {
    use std::fmt::Write;

    let mut hex = String::with_capacity(bytes.len() * 2);
    for byte in bytes {
        // Writing into a `String` cannot fail.
        let _ = write!(hex, "{:02x}", byte);
    }
    hex
}
549
550// ---------------------------------------------------------------------------
551// Execution: shared by every frontend (CLI, ryra-api). restic runs as
552// the invoking user; ownership round-trips via the hooks + quadlet `:U`
553// (see the hook scripts in the registry).
554// ---------------------------------------------------------------------------
555
556/// Run a pre/post backup or restore hook with the service's `.env`
557/// loaded, mirroring how quadlet ExecStartPre/Post scripts see it.
558pub fn run_hook(
559    kind: &str,
560    service: &str,
561    script: &std::path::Path,
562    service_home: &std::path::Path,
563) -> anyhow::Result<()> {
564    use anyhow::Context;
565    if !script.exists() {
566        return Err(crate::error::Error::BackupHookFailed {
567            service: service.to_string(),
568            hook: kind.to_string(),
569            message: format!("hook script not found: {}", script.display()),
570        }
571        .into());
572    }
573    let env_file = service_home.join(".env");
574    let envs = if env_file.exists() {
575        parse_env_file(&env_file)
576    } else {
577        Vec::new()
578    };
579    let mut cmd = std::process::Command::new("/bin/bash");
580    cmd.arg(script)
581        .env("SERVICE_HOME", service_home)
582        .current_dir(service_home);
583    for (k, v) in envs {
584        cmd.env(k, v);
585    }
586    let status = cmd
587        .status()
588        .with_context(|| format!("running hook {kind} for {service}"))?;
589    if !status.success() {
590        return Err(crate::error::Error::BackupHookFailed {
591            service: service.to_string(),
592            hook: kind.to_string(),
593            message: format!("hook script exited with {}", status.code().unwrap_or(-1)),
594        }
595        .into());
596    }
597    Ok(())
598}
599
600/// Execute a planned backup with restic. Ownership of container-owned
601/// bind mounts is the pre-hook's job (`podman unshare chown`); by this
602/// point every file is readable by the invoking user.
603pub fn restic_backup(plan: &BackupRunPlan) -> anyhow::Result<()> {
604    use anyhow::{Context, bail};
605    let mut cmd = std::process::Command::new("restic");
606    cmd.arg("backup")
607        .arg("--repo")
608        .arg(&plan.repo)
609        .env("RESTIC_PASSWORD", &plan.password);
610    for (k, v) in &plan.env {
611        cmd.env(k, v);
612    }
613    for tag in &plan.tags {
614        cmd.arg("--tag").arg(tag);
615    }
616    for excl in &plan.excludes {
617        // Excludes from service.toml are relative to the service home,
618        // hence cwd below.
619        cmd.arg("--exclude").arg(excl);
620    }
621    cmd.current_dir(&plan.service_home);
622    for path in &plan.paths {
623        cmd.arg(path);
624    }
625    let status = cmd
626        .status()
627        .with_context(|| format!("spawning `restic backup` for {}", plan.service_name))?;
628    if !status.success() {
629        bail!("restic backup exited with {}", status.code().unwrap_or(-1));
630    }
631    Ok(())
632}
633
634/// Execute a planned restore. Files come back owned by the invoking
635/// user; the next container start's `:U` re-chowns to the container's
636/// USER. (Running inside `podman unshare` would preserve snapshot UIDs
637/// but fails chowning `/home` outside the namespace mapping.)
638pub fn restic_restore(plan: &BackupRestorePlan) -> anyhow::Result<()> {
639    use anyhow::{Context, bail};
640    let mut cmd = std::process::Command::new("restic");
641    cmd.arg("restore")
642        .arg(&plan.snapshot)
643        .arg("--repo")
644        .arg(&plan.repo)
645        .arg("--target")
646        .arg("/")
647        .arg("--tag")
648        .arg(format!("service:{}", plan.service_name))
649        .env("RESTIC_PASSWORD", &plan.password);
650    for (k, v) in &plan.env {
651        cmd.env(k, v);
652    }
653    // Every snapshot bundles the global preferences.toml (for disaster
654    // recovery), but a normal per-service restore must NOT overwrite the live
655    // global config (SMTP/auth/backup creds/other services) with this snapshot's
656    // possibly-stale copy. Exclude it unless the caller opted in.
657    if !plan.include_config
658        && let Ok(paths) = ConfigPaths::resolve()
659    {
660        cmd.arg("--exclude").arg(&paths.config_file);
661    }
662    let status = cmd.status().context("spawning `restic restore`")?;
663    if !status.success() {
664        bail!("restic restore exited with {}", status.code().unwrap_or(-1));
665    }
666    Ok(())
667}
668
669/// Execute a planned retention sweep, returning `(kept, removed)` snapshot
670/// counts. Runs `restic forget --json` filtered to the service's
671/// `service:<name>` tag (so keep rules apply only to that service), parses the
672/// keep/remove decision, then runs `restic prune` SEPARATELY to reclaim space
673/// (real runs only, when something was actually removed). Splitting prune out
674/// keeps the `--json` output clean to parse. In a dry run nothing is deleted
675/// and `removed` is the count that WOULD be removed.
676pub fn restic_forget(plan: &BackupForgetPlan) -> anyhow::Result<(u32, u32)> {
677    use anyhow::{Context, bail};
678    let mut cmd = std::process::Command::new("restic");
679    cmd.arg("forget")
680        .arg("--repo")
681        .arg(&plan.repo)
682        .arg("--tag")
683        .arg(&plan.tag)
684        .arg("--json")
685        .env("RESTIC_PASSWORD", &plan.password);
686    for (k, v) in &plan.env {
687        cmd.env(k, v);
688    }
689    for arg in &plan.keep_args {
690        cmd.arg(arg);
691    }
692    if plan.dry_run {
693        cmd.arg("--dry-run");
694    }
695    let output = cmd
696        .output()
697        .with_context(|| format!("spawning `restic forget` for {}", plan.service_name))?;
698    if !output.status.success() {
699        bail!(
700            "restic forget exited with {}: {}",
701            output.status.code().unwrap_or(-1),
702            String::from_utf8_lossy(&output.stderr).trim()
703        );
704    }
705    // `restic forget --json` is an array of groups, each with a `keep` list and
706    // a `remove` list (the latter null/absent when nothing is dropped).
707    #[derive(serde::Deserialize)]
708    struct ForgetGroup {
709        #[serde(default)]
710        keep: Vec<serde_json::Value>,
711        #[serde(default)]
712        remove: Option<Vec<serde_json::Value>>,
713    }
714    let groups: Vec<ForgetGroup> = serde_json::from_slice(&output.stdout).unwrap_or_default();
715    let kept: u32 = groups.iter().map(|g| g.keep.len() as u32).sum();
716    let removed: u32 = groups
717        .iter()
718        .map(|g| g.remove.as_ref().map_or(0, Vec::len) as u32)
719        .sum();
720    // Reclaim space, but only for a real run that actually dropped snapshots.
721    if !plan.dry_run && plan.prune && removed > 0 {
722        let mut prune = std::process::Command::new("restic");
723        prune
724            .arg("prune")
725            .arg("--repo")
726            .arg(&plan.repo)
727            .env("RESTIC_PASSWORD", &plan.password);
728        for (k, v) in &plan.env {
729            prune.env(k, v);
730        }
731        let status = prune
732            .status()
733            .with_context(|| format!("spawning `restic prune` for {}", plan.service_name))?;
734        if !status.success() {
735            bail!("restic prune exited with {}", status.code().unwrap_or(-1));
736        }
737    }
738    Ok((kept, removed))
739}
740
/// Parse `KEY=VALUE` lines from a `.env` file, in file order.
///
/// Each line is trimmed; blank lines and lines starting with `#` are ignored.
/// The line is split at the first `=`, and key and value are trimmed again.
/// Malformed lines are skipped: those without any `=`, and those with an
/// empty key (e.g. `=value`), since an empty name is not a usable environment
/// variable. Values are taken literally — quotes are not stripped.
///
/// Returns an empty list when the file can't be read.
pub fn parse_env_file(path: &std::path::Path) -> Vec<(String, String)> {
    let Ok(content) = std::fs::read_to_string(path) else {
        return Vec::new();
    };
    content
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty() && !line.starts_with('#'))
        .filter_map(|line| line.split_once('='))
        .map(|(key, value)| (key.trim(), value.trim()))
        // `=value` carries no variable name; drop it with the other
        // malformed lines instead of exporting an empty key.
        .filter(|(key, _)| !key.is_empty())
        .map(|(key, value)| (key.to_string(), value.to_string()))
        .collect()
}
759
760/// How long to let units settle after a stop before touching their data, so
761/// the database has flushed and file handles are closed. Matches the `sleep`
762/// the per-service hook scripts used to do.
763const SETTLE: std::time::Duration = std::time::Duration::from_secs(3);
764
/// Stop a service's units for a cold snapshot/restore with a single
/// `systemctl --user stop <units…>` call. Does nothing for an empty list.
///
/// Best-effort, like the hook scripts' `systemctl stop ... || true`: the
/// outcome is ignored, so a unit that isn't running is not an error.
/// `Requires=` governs startup not shutdown, so every unit is listed.
fn stop_units(units: &[String]) {
    if !units.is_empty() {
        let _ = std::process::Command::new("systemctl")
            .args(["--user", "stop"])
            .args(units)
            .status();
    }
}
779
780/// Bring a service back up after a cold snapshot/restore: clear any
781/// start-limit/failed state left by the stop+start churn, then start the
782/// primary unit (`<service>.service`), whose `Requires=` cascades its sidecars.
783/// Services that need more (extra units, or a DB-readiness wait) ship a
784/// post_backup/post_restore hook instead.
785fn start_service(service: &str) -> anyhow::Result<()> {
786    use anyhow::{Context, bail};
787    let _ = std::process::Command::new("systemctl")
788        .args(["--user", "reset-failed"])
789        .status();
790    let unit = format!("{service}.service");
791    let status = std::process::Command::new("systemctl")
792        .args(["--user", "start", &unit])
793        .status()
794        .with_context(|| format!("spawning `systemctl --user start {unit}`"))?;
795    if !status.success() {
796        bail!(
797            "`systemctl --user start {unit}` exited with {}",
798            status.code().unwrap_or(-1)
799        );
800    }
801    Ok(())
802}
803
804/// Make container-owned bind mounts readable by the invoking user so restic can
805/// snapshot them. `podman unshare chown -R 0:0` maps namespace-root (= this
806/// user); the next container start re-applies `:U`. A no-op for data that is
807/// already user-owned (no `:U` mount).
808fn chown_for_read(paths: &[PathBuf]) -> anyhow::Result<()> {
809    use anyhow::{Context, bail};
810    for p in paths {
811        if !p.exists() {
812            continue;
813        }
814        let status = std::process::Command::new("podman")
815            .args(["unshare", "chown", "-R", "0:0"])
816            .arg(p)
817            .status()
818            .with_context(|| format!("spawning `podman unshare chown` on {}", p.display()))?;
819        if !status.success() {
820            bail!("`podman unshare chown` on {} failed", p.display());
821        }
822    }
823    Ok(())
824}
825
826/// Wipe a cold service's data dirs to a clean tree before `restic restore`, so
827/// files created after the snapshot don't linger and desync the restored state.
828/// `podman unshare` so `:U`-chowned (container-uid) trees are removable; the
829/// dirs are recreated empty since podman refuses to start a container whose
830/// bind-mount source is missing.
831fn wipe_for_restore(paths: &[PathBuf]) -> anyhow::Result<()> {
832    use anyhow::Context;
833    for p in paths {
834        let _ = std::process::Command::new("podman")
835            .args(["unshare", "rm", "-rf"])
836            .arg(p)
837            .status();
838        std::fs::create_dir_all(p).with_context(|| format!("recreating {}", p.display()))?;
839    }
840    Ok(())
841}
842
843/// Run a planned backup end-to-end.
844///
845/// `online` services (a live dump, or safe-to-copy flat data) run only their
846/// own hooks around restic: `pre_backup` -> restic -> `post_backup`. The post
847/// hook runs even when restic fails (it usually cleans up a dump), but its own
848/// failure never masks restic's error.
849///
850/// Cold services (the default) get the full lifecycle ryra derives from their
851/// units + paths: stop the stack, make the data readable, optional `pre_backup`
852/// prep, restic, then bring the service back (a `post_backup` hook if present,
853/// else start the primary unit). The service is always brought back up, even
854/// when restic fails, so a failed backup never leaves it down.
855pub fn execute_backup_run(plan: &BackupRunPlan) -> anyhow::Result<()> {
856    if plan.online {
857        if let Some(hook) = &plan.pre_backup_hook {
858            run_hook("pre_backup", &plan.service_name, hook, &plan.service_home)?;
859        }
860        let restic_result = restic_backup(plan);
861        if let Some(hook) = &plan.post_backup_hook
862            && let Err(e) = run_hook("post_backup", &plan.service_name, hook, &plan.service_home)
863            && restic_result.is_ok()
864        {
865            return Err(e);
866        }
867        return restic_result;
868    }
869
870    // Cold snapshot: ryra owns the stop/chown/restart the hook scripts used to.
871    stop_units(&plan.units);
872    std::thread::sleep(SETTLE);
873    chown_for_read(&plan.data_paths)?;
874    if let Some(hook) = &plan.pre_backup_hook {
875        run_hook("pre_backup", &plan.service_name, hook, &plan.service_home)?;
876    }
877    let restic_result = restic_backup(plan);
878    // Always bring the service back, even if restic failed.
879    let bring_up = match &plan.post_backup_hook {
880        Some(hook) => run_hook("post_backup", &plan.service_name, hook, &plan.service_home),
881        None => start_service(&plan.service_name),
882    };
883    match (restic_result, bring_up) {
884        (Ok(()), bring) => bring,
885        (Err(e), _) => Err(e),
886    }
887}
888
889/// Run a planned restore end-to-end.
890///
891/// `online` services run only their own hooks around restic: `pre_restore` ->
892/// restic -> `post_restore` (e.g. seafile pauses the app, restores the tree,
893/// re-imports a live dump). Cold services (the default) get ryra's derived
894/// lifecycle: stop the stack, wipe `data_paths` to a clean tree, optional
895/// `pre_restore` (extra wipes), restic restore, then bring the service back (a
896/// `post_restore` hook if present — typically a DB-readiness wait — else start
897/// the primary unit).
898pub fn execute_backup_restore(plan: &BackupRestorePlan) -> anyhow::Result<()> {
899    if plan.online {
900        if let Some(hook) = &plan.pre_restore_hook {
901            run_hook("pre_restore", &plan.service_name, hook, &plan.service_home)?;
902        }
903        restic_restore(plan)?;
904        if let Some(hook) = &plan.post_restore_hook {
905            run_hook("post_restore", &plan.service_name, hook, &plan.service_home)?;
906        }
907        return Ok(());
908    }
909
910    // Cold restore: stop, wipe to a clean tree, restore, bring back up.
911    stop_units(&plan.units);
912    std::thread::sleep(SETTLE);
913    wipe_for_restore(&plan.data_paths)?;
914    if let Some(hook) = &plan.pre_restore_hook {
915        run_hook("pre_restore", &plan.service_name, hook, &plan.service_home)?;
916    }
917    restic_restore(plan)?;
918    match &plan.post_restore_hook {
919        Some(hook) => run_hook("post_restore", &plan.service_name, hook, &plan.service_home)?,
920        None => start_service(&plan.service_name)?,
921    }
922    Ok(())
923}
924
/// Unit tests for the planner helpers. They work against tempdirs and
/// in-memory service definitions only; nothing here spawns restic, podman or
/// systemctl.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::schema::{BackupBackend, BackupSettings};
    use crate::registry::service_def::{
        Arch, BackupConfig, HttpsRequirement, IntegrationFlags, PortDef, ServiceDef, ServiceMeta,
    };

    /// Minimal `demo` service definition whose `[backup]` section is
    /// `backup_section`; the `integrations.backup` flag mirrors whether a
    /// section was supplied.
    fn def_with_backup(backup_section: Option<BackupConfig>) -> ServiceDef {
        ServiceDef {
            service: ServiceMeta {
                name: "demo".into(),
                description: "demo".into(),
                url: None,
                kind: Default::default(),
                architecture: vec![Arch::Amd64, Arch::Arm64],
                https: HttpsRequirement::default(),
                runtime: Default::default(),
                run: None,
                build: None,
                post_install: None,
                deploy: Default::default(),
                health_check: None,
                health_timeout: None,
            },
            requirements: None,
            ports: vec![PortDef {
                name: "http".into(),
                container_port: 80,
                host_port: None,
                protocol: Default::default(),
                tailscale_https: None,
            }],
            env: vec![],
            env_groups: vec![],
            choices: vec![],
            requires: vec![],
            mappings: Default::default(),
            integrations: IntegrationFlags {
                backup: backup_section.is_some(),
                ..Default::default()
            },
            capabilities: Default::default(),
            backup: backup_section,
            metrics: None,
        }
    }

    /// An empty `paths` list means "back up the whole service folder".
    #[test]
    fn resolve_paths_whole_folder_when_paths_empty() {
        let dir = tempfile::tempdir().unwrap();
        let home = dir.path();
        // No explicit `paths` → capture the whole service folder.
        let def = def_with_backup(Some(BackupConfig::default()));
        let (paths, excludes) = resolve_paths(&def, home).unwrap();
        assert_eq!(paths, vec![home.to_path_buf()]);
        assert!(excludes.is_empty());
    }

    /// Explicit `paths` are honoured, and config artifacts found in the home
    /// are added on top of them.
    #[test]
    fn resolve_paths_explicit_list_plus_config_artifacts() {
        let dir = tempfile::tempdir().unwrap();
        let home = dir.path();
        // Config artifacts present in the home travel with the data.
        std::fs::write(home.join(".env"), "x").unwrap();
        std::fs::write(home.join("metadata.toml"), "x").unwrap();
        let def = def_with_backup(Some(BackupConfig {
            paths: vec!["data/uploads".into(), ".backup/db.sql".into()],
            exclude: vec!["data/uploads/cache".into()],
            ..Default::default()
        }));
        let (paths, excludes) = resolve_paths(&def, home).unwrap();
        // Curated data paths honoured...
        assert!(paths.contains(&home.join("data/uploads")), "got {paths:?}");
        assert!(
            paths.contains(&home.join(".backup/db.sql")),
            "got {paths:?}"
        );
        // ...and config artifacts added so a restore can rebuild the install.
        assert!(paths.contains(&home.join(".env")), "got {paths:?}");
        assert!(paths.contains(&home.join("metadata.toml")), "got {paths:?}");
        assert_eq!(excludes, vec!["data/uploads/cache"]);
    }

    /// `.env`, `metadata.toml`, `service.manifest`, quadlet files and the
    /// `configs` dir are all reported as config artifacts.
    #[test]
    fn config_artifacts_collects_env_metadata_quadlets_configs() {
        let dir = tempfile::tempdir().unwrap();
        let home = dir.path();
        std::fs::write(home.join(".env"), "x").unwrap();
        std::fs::write(home.join("metadata.toml"), "x").unwrap();
        std::fs::write(home.join("service.manifest"), "x").unwrap();
        std::fs::write(home.join("demo.container"), "x").unwrap();
        std::fs::write(home.join("demo.network"), "x").unwrap();
        std::fs::create_dir(home.join("configs")).unwrap();
        let names: Vec<String> = config_artifacts(home)
            .iter()
            .map(|p| p.file_name().unwrap().to_string_lossy().into_owned())
            .collect();
        for want in [
            ".env",
            "metadata.toml",
            "service.manifest",
            "demo.container",
            "demo.network",
            "configs",
        ] {
            assert!(
                names.contains(&want.to_string()),
                "{want} missing: {names:?}"
            );
        }
    }

    /// Hook scripts live under `<home>/configs/scripts/`.
    #[test]
    fn hook_path_resolves_under_configs_scripts() {
        let home = PathBuf::from("/x/y");
        assert_eq!(
            hook_path(&home, "backup-pre.sh"),
            PathBuf::from("/x/y/configs/scripts/backup-pre.sh")
        );
    }

    /// An explicitly named hook wins over the conventional filename.
    #[test]
    fn resolve_hook_prefers_explicit_over_convention() {
        let dir = tempfile::tempdir().unwrap();
        let home = dir.path();
        // Both the conventional and a custom-named file exist; the
        // explicit field wins.
        let scripts = home.join("configs").join("scripts");
        std::fs::create_dir_all(&scripts).unwrap();
        std::fs::write(scripts.join("backup-pre.sh"), "#!/bin/sh\n").unwrap();
        std::fs::write(scripts.join("custom.sh"), "#!/bin/sh\n").unwrap();
        let resolved = resolve_hook(Some("custom.sh"), home, "backup-pre.sh");
        assert_eq!(resolved.unwrap().file_name().unwrap(), "custom.sh");
    }

    /// Without an explicit hook, the conventional script is used if it exists.
    #[test]
    fn resolve_hook_falls_back_to_convention_when_present() {
        let dir = tempfile::tempdir().unwrap();
        let home = dir.path();
        let scripts = home.join("configs").join("scripts");
        std::fs::create_dir_all(&scripts).unwrap();
        std::fs::write(scripts.join("backup-pre.sh"), "#!/bin/sh\n").unwrap();
        let resolved = resolve_hook(None, home, "backup-pre.sh");
        assert_eq!(resolved.unwrap().file_name().unwrap(), "backup-pre.sh");
    }

    /// No explicit hook and no conventional script → no hook at all.
    #[test]
    fn resolve_hook_returns_none_when_no_script_exists() {
        let dir = tempfile::tempdir().unwrap();
        // No configs/scripts/ at all → no hook to run.
        assert!(resolve_hook(None, dir.path(), "backup-pre.sh").is_none());
    }

    /// Different `service.toml` contents hash differently.
    #[test]
    fn manifest_sha256_changes_with_content() {
        let a = tempfile::tempdir().unwrap();
        let b = tempfile::tempdir().unwrap();
        std::fs::write(a.path().join("service.toml"), "v1").unwrap();
        std::fs::write(b.path().join("service.toml"), "v2").unwrap();
        assert_ne!(manifest_sha256(a.path()), manifest_sha256(b.path()));
    }

    /// The hash depends on content only, not on where the file lives.
    #[test]
    fn manifest_sha256_stable_for_identical_content() {
        let a = tempfile::tempdir().unwrap();
        let b = tempfile::tempdir().unwrap();
        std::fs::write(a.path().join("service.toml"), "same").unwrap();
        std::fs::write(b.path().join("service.toml"), "same").unwrap();
        assert_eq!(manifest_sha256(a.path()), manifest_sha256(b.path()));
    }

    /// A missing `service.toml` hashes to 64 zeros rather than failing.
    #[test]
    fn manifest_sha256_returns_zero_hash_on_missing_file() {
        let dir = tempfile::tempdir().unwrap();
        assert_eq!(manifest_sha256(dir.path()), "0".repeat(64));
    }

    /// Each `.container` quadlet maps to one `.service` unit; the expected
    /// list is in name order and omits the `.network` quadlet.
    #[test]
    fn service_units_one_per_container_quadlet() {
        let dir = tempfile::tempdir().unwrap();
        let home = dir.path();
        // Container quadlets -> .service units; network/volume quadlets don't.
        std::fs::write(home.join("forgejo.container"), "").unwrap();
        std::fs::write(home.join("forgejo-postgres.container"), "").unwrap();
        std::fs::write(home.join("forgejo.network"), "").unwrap();
        assert_eq!(
            service_units(home),
            vec![
                "forgejo-postgres.service".to_string(),
                "forgejo.service".to_string()
            ]
        );
    }

    /// `data_paths` is exactly the configured backup `paths` joined onto the
    /// home, in declaration order, and empty for a whole-folder backup.
    #[test]
    fn data_paths_are_backup_paths_only() {
        let dir = tempfile::tempdir().unwrap();
        let home = dir.path();
        let def = def_with_backup(Some(BackupConfig {
            paths: vec!["db-data".into(), "data".into()],
            ..Default::default()
        }));
        assert_eq!(
            data_paths(&def, home),
            vec![home.join("db-data"), home.join("data")]
        );
        // No explicit paths -> nothing to chown/wipe (whole-folder backup).
        let whole = def_with_backup(Some(BackupConfig::default()));
        assert!(data_paths(&whole, home).is_empty());
    }

    /// Whether backup/restore stop the service follows the `online` flag and
    /// the presence of a restore hook script.
    #[test]
    fn stop_flags_track_online_and_restore_hooks() {
        let dir = tempfile::tempdir().unwrap();
        let home = dir.path();

        // Cold (default): both backup and restore stop the service.
        let cold = def_with_backup(Some(BackupConfig::default()));
        assert!(backup_stops_service(&cold));
        assert!(restore_stops_service(&cold, home));

        // Online with no restore hooks: neither stops (e.g. flat/append data).
        let online = def_with_backup(Some(BackupConfig {
            online: true,
            ..Default::default()
        }));
        assert!(!backup_stops_service(&online));
        assert!(!restore_stops_service(&online, home));

        // Online but ships a restore hook (e.g. seafile re-imports a dump):
        // backup runs live, restore still stops.
        let scripts = home.join("configs").join("scripts");
        std::fs::create_dir_all(&scripts).unwrap();
        std::fs::write(scripts.join("restore-post.sh"), "#!/bin/sh\n").unwrap();
        assert!(!backup_stops_service(&online));
        assert!(restore_stops_service(&online, home));

        // No backup support at all: never stops.
        let none = def_with_backup(None);
        assert!(!backup_stops_service(&none));
        assert!(!restore_stops_service(&none, home));
    }

    /// S3 credentials from the backend config land in the AWS env vars that
    /// are handed to restic.
    #[test]
    fn backend_env_map_round_trips_aws_creds() {
        let settings = BackupSettings {
            password: "p".into(),
            backend: BackupBackend::S3 {
                endpoint: "http://h:9000".into(),
                bucket: "b".into(),
                access_key_id: "id".into(),
                secret_access_key: "secret".into(),
                session_token: None,
                prefix: None,
            },
            daily: None,
            weekly: None,
        };
        let env = backend_env_map(&settings.backend);
        assert_eq!(env.get("AWS_ACCESS_KEY_ID"), Some(&"id".to_string()));
        assert_eq!(
            env.get("AWS_SECRET_ACCESS_KEY"),
            Some(&"secret".to_string())
        );
    }
}