Skip to main content

canic_backup/snapshot/
mod.rs

1use crate::{
2    artifacts::{ArtifactChecksum, ArtifactChecksumError},
3    discovery::{DiscoveryError, SnapshotTarget, parse_registry_entries, targets_from_registry},
4    journal::{
5        ArtifactJournalEntry, ArtifactState, DownloadJournal, DownloadOperationMetrics,
6        JournalValidationError,
7    },
8    manifest::{
9        BackupUnit, BackupUnitKind, ConsistencySection, FleetBackupManifest, FleetMember,
10        FleetSection, IdentityMode, ManifestValidationError, SourceMetadata, SourceSnapshot,
11        ToolMetadata, VerificationCheck, VerificationPlan,
12    },
13    persistence::{BackupLayout, PersistenceError},
14    timestamp::current_timestamp_marker,
15    topology::{TopologyHash, TopologyHasher, TopologyRecord},
16};
17use candid::Principal;
18use std::{
19    collections::BTreeSet,
20    error::Error as StdError,
21    fs,
22    path::{Path, PathBuf},
23};
24use thiserror::Error as ThisError;
25
/// Boxed, thread-safe error type returned by every fallible `SnapshotDriver` call.
///
/// A boxed trait object defaults to the `'static` object lifetime, so the bound
/// does not need to be spelled out.
pub type SnapshotDriverError = Box<dyn StdError + Send + Sync>;
27
///
/// SnapshotArtifact
///
/// One captured snapshot for a single canister, or its placeholder when the
/// run is a dry run.
///

#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SnapshotArtifact {
    /// Canister the snapshot belongs to.
    pub canister_id: String,
    /// Snapshot id returned by the driver (`<snapshot-id>` in dry-run output).
    pub snapshot_id: String,
    /// Final artifact location below the configured output directory.
    pub path: PathBuf,
    /// Checksum hash of the downloaded artifact (`<sha256>` in dry-run output).
    pub checksum: String,
}
39
///
/// SnapshotLifecycleMode
///
/// How a canister is handled around snapshot capture. Both modes stop the
/// canister first; they differ only in whether it is started again afterward.
///

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SnapshotLifecycleMode {
    /// Stop the canister before the snapshot and leave it stopped.
    StopBeforeSnapshot,
    /// Stop the canister before the snapshot and start it again afterward.
    StopAndResume,
}

impl SnapshotLifecycleMode {
    /// Map the optional post-snapshot resume flag onto a lifecycle mode.
    #[must_use]
    pub const fn from_resume_flag(resume_after_snapshot: bool) -> Self {
        if !resume_after_snapshot {
            return Self::StopBeforeSnapshot;
        }
        Self::StopAndResume
    }

    /// Report whether the canister is stopped before snapshot capture.
    ///
    /// Every current mode stops first, so this is `true` for all variants.
    #[must_use]
    pub const fn stop_before_snapshot(self) -> bool {
        // Exhaustive on purpose: a new variant must decide this explicitly.
        match self {
            Self::StopBeforeSnapshot | Self::StopAndResume => true,
        }
    }

    /// Report whether the canister is started again once capture has finished.
    #[must_use]
    pub const fn resume_after_snapshot(self) -> bool {
        match self {
            Self::StopAndResume => true,
            Self::StopBeforeSnapshot => false,
        }
    }
}
73
74///
75/// SnapshotDownloadConfig
76///
77
78#[derive(Clone, Debug, Eq, PartialEq)]
79pub struct SnapshotDownloadConfig {
80    pub canister: String,
81    pub out: PathBuf,
82    pub root: Option<String>,
83    pub include_children: bool,
84    pub recursive: bool,
85    pub dry_run: bool,
86    pub lifecycle: SnapshotLifecycleMode,
87    pub backup_id: String,
88    pub created_at: String,
89    pub tool_name: String,
90    pub tool_version: String,
91    pub environment: String,
92}
93
94///
95/// SnapshotDownloadResult
96///
97
98#[derive(Clone, Debug, Eq, PartialEq)]
99pub struct SnapshotDownloadResult {
100    pub artifacts: Vec<SnapshotArtifact>,
101    pub planned_commands: Vec<String>,
102}
103
104///
105/// SnapshotDownloadError
106///
107
108#[derive(Debug, ThisError)]
109pub enum SnapshotDownloadError {
110    #[error("missing --root when using --include-children")]
111    MissingRegistrySource,
112
113    #[error("snapshot capture requires stopping each canister before snapshot create")]
114    SnapshotRequiresStoppedCanister,
115
116    #[error("snapshot driver failed: {0}")]
117    Driver(#[source] SnapshotDriverError),
118
119    #[error(transparent)]
120    Io(#[from] std::io::Error),
121
122    #[error(transparent)]
123    Checksum(#[from] ArtifactChecksumError),
124
125    #[error(transparent)]
126    Persistence(#[from] PersistenceError),
127
128    #[error(transparent)]
129    Journal(#[from] JournalValidationError),
130
131    #[error(transparent)]
132    Discovery(#[from] DiscoveryError),
133
134    #[error(transparent)]
135    Manifest(#[from] SnapshotManifestError),
136}
137
138///
139/// SnapshotDriver
140///
141
142pub trait SnapshotDriver {
143    /// Load the root registry JSON used to resolve child snapshot targets.
144    fn registry_json(&mut self, root: &str) -> Result<String, SnapshotDriverError>;
145
146    /// Create one canister snapshot and return its snapshot id.
147    fn create_snapshot(&mut self, canister_id: &str) -> Result<String, SnapshotDriverError>;
148
149    /// Stop one canister before snapshot creation.
150    fn stop_canister(&mut self, canister_id: &str) -> Result<(), SnapshotDriverError>;
151
152    /// Start one canister after snapshot capture.
153    fn start_canister(&mut self, canister_id: &str) -> Result<(), SnapshotDriverError>;
154
155    /// Download one snapshot into the supplied artifact directory.
156    fn download_snapshot(
157        &mut self,
158        canister_id: &str,
159        snapshot_id: &str,
160        artifact_path: &Path,
161    ) -> Result<(), SnapshotDriverError>;
162
163    /// Render the planned create command for dry-run output.
164    fn create_snapshot_command(&self, canister_id: &str) -> String;
165
166    /// Render the planned stop command for dry-run output.
167    fn stop_canister_command(&self, canister_id: &str) -> String;
168
169    /// Render the planned start command for dry-run output.
170    fn start_canister_command(&self, canister_id: &str) -> String;
171
172    /// Render the planned download command for dry-run output.
173    fn download_snapshot_command(
174        &self,
175        canister_id: &str,
176        snapshot_id: &str,
177        artifact_path: &Path,
178    ) -> String;
179}
180
///
/// SnapshotArtifactPaths
///
/// Filesystem locations used while capturing one snapshot target.
///

struct SnapshotArtifactPaths {
    /// Artifact directory relative to the output root, as recorded in the journal.
    relative_path: PathBuf,
    /// Final artifact directory below the output root.
    artifact_path: PathBuf,
    /// Staging directory the download lands in before it is renamed into place.
    temp_path: PathBuf,
}
190
191impl SnapshotArtifactPaths {
192    // Build the durable and temporary filesystem paths for one snapshot target.
193    fn new(root: &Path, canister_id: &str) -> Self {
194        let relative_path = PathBuf::from(safe_path_segment(canister_id));
195        let artifact_path = root.join(&relative_path);
196        let temp_path = root.join(format!("{}.tmp", safe_path_segment(canister_id)));
197
198        Self {
199            relative_path,
200            artifact_path,
201            temp_path,
202        }
203    }
204}
205
206///
207/// SnapshotManifestInput
208///
209
210pub struct SnapshotManifestInput<'a> {
211    pub backup_id: String,
212    pub created_at: String,
213    pub tool_name: String,
214    pub tool_version: String,
215    pub environment: String,
216    pub root_canister: String,
217    pub selected_canister: String,
218    pub include_children: bool,
219    pub targets: &'a [SnapshotTarget],
220    pub artifacts: &'a [SnapshotArtifact],
221    pub discovery_topology_hash: TopologyHash,
222    pub pre_snapshot_topology_hash: TopologyHash,
223}
224
225///
226/// SnapshotManifestError
227///
228
229#[derive(Debug, ThisError)]
230pub enum SnapshotManifestError {
231    #[error("field {field} must be a valid principal: {value}")]
232    InvalidPrincipal { field: &'static str, value: String },
233
234    #[error(
235        "topology changed before snapshot start: discovery={discovery}, pre_snapshot={pre_snapshot}"
236    )]
237    TopologyChanged {
238        discovery: String,
239        pre_snapshot: String,
240    },
241
242    #[error("missing snapshot artifact for canister {0}")]
243    MissingArtifact(String),
244
245    #[error(transparent)]
246    InvalidManifest(#[from] ManifestValidationError),
247}
248
249/// Create and download snapshots for the selected canister set.
250pub fn download_snapshots(
251    config: &SnapshotDownloadConfig,
252    driver: &mut impl SnapshotDriver,
253) -> Result<SnapshotDownloadResult, SnapshotDownloadError> {
254    validate_snapshot_lifecycle(config.lifecycle)?;
255    let targets = resolve_snapshot_targets(config, driver)?;
256    let discovery_topology_hash = topology_hash_for_targets(&config.canister, &targets)?;
257    let pre_snapshot_topology_hash =
258        accepted_pre_snapshot_topology_hash(config, driver, &discovery_topology_hash)?;
259    let layout = BackupLayout::new(config.out.clone());
260    let mut artifacts = Vec::with_capacity(targets.len());
261    let mut planned_commands = Vec::new();
262    let mut journal = DownloadJournal {
263        journal_version: 1,
264        backup_id: config.backup_id.clone(),
265        discovery_topology_hash: Some(discovery_topology_hash.hash.clone()),
266        pre_snapshot_topology_hash: Some(pre_snapshot_topology_hash.hash.clone()),
267        operation_metrics: DownloadOperationMetrics {
268            target_count: targets.len(),
269            ..DownloadOperationMetrics::default()
270        },
271        artifacts: Vec::new(),
272    };
273
274    for target in &targets {
275        let paths = SnapshotArtifactPaths::new(&config.out, &target.canister_id);
276
277        if config.dry_run {
278            let (artifact, commands) =
279                dry_run_artifact(config, driver, target, paths.artifact_path);
280            artifacts.push(artifact);
281            planned_commands.extend(commands);
282            continue;
283        }
284
285        artifacts.push(capture_snapshot_artifact(
286            config,
287            driver,
288            &layout,
289            &mut journal,
290            target,
291            paths,
292        )?);
293    }
294
295    if !config.dry_run {
296        let manifest = build_snapshot_manifest(SnapshotManifestInput {
297            backup_id: config.backup_id.clone(),
298            created_at: config.created_at.clone(),
299            tool_name: config.tool_name.clone(),
300            tool_version: config.tool_version.clone(),
301            environment: config.environment.clone(),
302            root_canister: config
303                .root
304                .clone()
305                .unwrap_or_else(|| config.canister.clone()),
306            selected_canister: config.canister.clone(),
307            include_children: config.include_children,
308            targets: &targets,
309            artifacts: &artifacts,
310            discovery_topology_hash,
311            pre_snapshot_topology_hash,
312        })?;
313        layout.write_manifest(&manifest)?;
314    }
315
316    Ok(SnapshotDownloadResult {
317        artifacts,
318        planned_commands,
319    })
320}
321
322// Enforce the IC snapshot precondition before any capture work is planned.
323const fn validate_snapshot_lifecycle(
324    lifecycle: SnapshotLifecycleMode,
325) -> Result<(), SnapshotDownloadError> {
326    if lifecycle.stop_before_snapshot() {
327        return Ok(());
328    }
329
330    Err(SnapshotDownloadError::SnapshotRequiresStoppedCanister)
331}
332
333/// Resolve the selected canister plus optional direct/recursive children.
334pub fn resolve_snapshot_targets(
335    config: &SnapshotDownloadConfig,
336    driver: &mut impl SnapshotDriver,
337) -> Result<Vec<SnapshotTarget>, SnapshotDownloadError> {
338    if !config.include_children {
339        return Ok(vec![SnapshotTarget {
340            canister_id: config.canister.clone(),
341            role: None,
342            parent_canister_id: None,
343        }]);
344    }
345
346    let registry_json = if let Some(root) = &config.root {
347        driver
348            .registry_json(root)
349            .map_err(SnapshotDownloadError::Driver)?
350    } else {
351        return Err(SnapshotDownloadError::MissingRegistrySource);
352    };
353    let registry = parse_registry_entries(&registry_json)?;
354    targets_from_registry(&registry, &config.canister, config.recursive)
355        .map_err(SnapshotDownloadError::from)
356}
357
358/// Build a validated fleet backup manifest for one successful snapshot run.
359pub fn build_snapshot_manifest(
360    input: SnapshotManifestInput<'_>,
361) -> Result<FleetBackupManifest, SnapshotManifestError> {
362    let roles = input
363        .targets
364        .iter()
365        .enumerate()
366        .map(|(index, target)| target_role(&input.selected_canister, index, target))
367        .collect::<BTreeSet<_>>()
368        .into_iter()
369        .collect::<Vec<_>>();
370
371    let manifest = FleetBackupManifest {
372        manifest_version: 1,
373        backup_id: input.backup_id,
374        created_at: input.created_at,
375        tool: ToolMetadata {
376            name: input.tool_name,
377            version: input.tool_version,
378        },
379        source: SourceMetadata {
380            environment: input.environment,
381            root_canister: input.root_canister.clone(),
382        },
383        consistency: ConsistencySection {
384            backup_units: vec![BackupUnit {
385                unit_id: "snapshot-selection".to_string(),
386                kind: if input.include_children {
387                    BackupUnitKind::Subtree
388                } else {
389                    BackupUnitKind::Single
390                },
391                roles,
392            }],
393        },
394        fleet: FleetSection {
395            topology_hash_algorithm: input.discovery_topology_hash.algorithm,
396            topology_hash_input: input.discovery_topology_hash.input,
397            discovery_topology_hash: input.discovery_topology_hash.hash.clone(),
398            pre_snapshot_topology_hash: input.pre_snapshot_topology_hash.hash,
399            topology_hash: input.discovery_topology_hash.hash,
400            members: input
401                .targets
402                .iter()
403                .enumerate()
404                .map(|(index, target)| {
405                    fleet_member(
406                        &input.selected_canister,
407                        Some(input.root_canister.as_str()).filter(|_| input.include_children),
408                        index,
409                        target,
410                        input.artifacts,
411                    )
412                })
413                .collect::<Result<Vec<_>, _>>()?,
414        },
415        verification: VerificationPlan::default(),
416    };
417
418    manifest.validate()?;
419    Ok(manifest)
420}
421
422/// Compute the canonical topology hash for one resolved target set.
423pub fn topology_hash_for_targets(
424    selected_canister: &str,
425    targets: &[SnapshotTarget],
426) -> Result<TopologyHash, SnapshotManifestError> {
427    let topology_records = targets
428        .iter()
429        .enumerate()
430        .map(|(index, target)| topology_record(selected_canister, index, target))
431        .collect::<Result<Vec<_>, _>>()?;
432    Ok(TopologyHasher::hash(&topology_records))
433}
434
435/// Fail closed if topology changes after discovery but before snapshot creation.
436pub fn ensure_topology_stable(
437    discovery: &TopologyHash,
438    pre_snapshot: &TopologyHash,
439) -> Result<(), SnapshotManifestError> {
440    if discovery.hash == pre_snapshot.hash {
441        return Ok(());
442    }
443
444    Err(SnapshotManifestError::TopologyChanged {
445        discovery: discovery.hash.clone(),
446        pre_snapshot: pre_snapshot.hash.clone(),
447    })
448}
449
450// Resolve and verify the pre-snapshot topology hash before any mutation.
451fn accepted_pre_snapshot_topology_hash(
452    config: &SnapshotDownloadConfig,
453    driver: &mut impl SnapshotDriver,
454    discovery_topology_hash: &TopologyHash,
455) -> Result<TopologyHash, SnapshotDownloadError> {
456    if config.dry_run {
457        return Ok(discovery_topology_hash.clone());
458    }
459
460    let pre_snapshot_targets = resolve_snapshot_targets(config, driver)?;
461    let pre_snapshot_topology_hash =
462        topology_hash_for_targets(&config.canister, &pre_snapshot_targets)?;
463    ensure_topology_stable(discovery_topology_hash, &pre_snapshot_topology_hash)?;
464    Ok(pre_snapshot_topology_hash)
465}
466
467// Return dry-run commands and a placeholder artifact without mutating state.
468fn dry_run_artifact(
469    config: &SnapshotDownloadConfig,
470    driver: &impl SnapshotDriver,
471    target: &SnapshotTarget,
472    artifact_path: PathBuf,
473) -> (SnapshotArtifact, Vec<String>) {
474    let mut commands = Vec::new();
475    if config.lifecycle.stop_before_snapshot() {
476        commands.push(driver.stop_canister_command(&target.canister_id));
477    }
478    commands.push(driver.create_snapshot_command(&target.canister_id));
479    commands.push(driver.download_snapshot_command(
480        &target.canister_id,
481        "<snapshot-id>",
482        &artifact_path,
483    ));
484    if config.lifecycle.resume_after_snapshot() {
485        commands.push(driver.start_canister_command(&target.canister_id));
486    }
487
488    (
489        SnapshotArtifact {
490            canister_id: target.canister_id.clone(),
491            snapshot_id: "<snapshot-id>".to_string(),
492            path: artifact_path,
493            checksum: "<sha256>".to_string(),
494        },
495        commands,
496    )
497}
498
499// Create, download, checksum, and finalize one durable snapshot artifact.
500fn capture_snapshot_artifact(
501    config: &SnapshotDownloadConfig,
502    driver: &mut impl SnapshotDriver,
503    layout: &BackupLayout,
504    journal: &mut DownloadJournal,
505    target: &SnapshotTarget,
506    paths: SnapshotArtifactPaths,
507) -> Result<SnapshotArtifact, SnapshotDownloadError> {
508    if config.lifecycle.stop_before_snapshot() {
509        driver
510            .stop_canister(&target.canister_id)
511            .map_err(SnapshotDownloadError::Driver)?;
512    }
513
514    let result = capture_snapshot_artifact_body(
515        driver,
516        layout,
517        journal,
518        target,
519        &paths.relative_path,
520        paths.artifact_path,
521        paths.temp_path,
522    );
523
524    if config.lifecycle.resume_after_snapshot() {
525        match result {
526            Ok(artifact) => {
527                driver
528                    .start_canister(&target.canister_id)
529                    .map_err(SnapshotDownloadError::Driver)?;
530                Ok(artifact)
531            }
532            Err(error) => {
533                let _ = driver.start_canister(&target.canister_id);
534                Err(error)
535            }
536        }
537    } else {
538        result
539    }
540}
541
542// Run the mutation-heavy capture path after lifecycle handling is settled.
543fn capture_snapshot_artifact_body(
544    driver: &mut impl SnapshotDriver,
545    layout: &BackupLayout,
546    journal: &mut DownloadJournal,
547    target: &SnapshotTarget,
548    artifact_relative_path: &Path,
549    artifact_path: PathBuf,
550    temp_path: PathBuf,
551) -> Result<SnapshotArtifact, SnapshotDownloadError> {
552    journal.operation_metrics.snapshot_create_started += 1;
553    let snapshot_id = driver
554        .create_snapshot(&target.canister_id)
555        .map_err(SnapshotDownloadError::Driver)?;
556    journal.operation_metrics.snapshot_create_completed += 1;
557    let mut entry = ArtifactJournalEntry {
558        canister_id: target.canister_id.clone(),
559        snapshot_id: snapshot_id.clone(),
560        state: ArtifactState::Created,
561        temp_path: None,
562        artifact_path: artifact_relative_path.display().to_string(),
563        checksum_algorithm: "sha256".to_string(),
564        checksum: None,
565        updated_at: current_timestamp_marker(),
566    };
567    journal.artifacts.push(entry.clone());
568    layout.write_journal(journal)?;
569
570    if temp_path.exists() {
571        fs::remove_dir_all(&temp_path)?;
572    }
573    fs::create_dir_all(&temp_path)?;
574    journal.operation_metrics.snapshot_download_started += 1;
575    layout.write_journal(journal)?;
576    driver
577        .download_snapshot(&target.canister_id, &snapshot_id, &temp_path)
578        .map_err(SnapshotDownloadError::Driver)?;
579    journal.operation_metrics.snapshot_download_completed += 1;
580    entry.advance_to(ArtifactState::Downloaded, current_timestamp_marker())?;
581    entry.temp_path = Some(temp_path.display().to_string());
582    update_journal_entry(journal, &entry);
583    layout.write_journal(journal)?;
584
585    journal.operation_metrics.checksum_verify_started += 1;
586    layout.write_journal(journal)?;
587    let checksum = ArtifactChecksum::from_path(&temp_path)?;
588    journal.operation_metrics.checksum_verify_completed += 1;
589    entry.checksum = Some(checksum.hash.clone());
590    entry.advance_to(ArtifactState::ChecksumVerified, current_timestamp_marker())?;
591    update_journal_entry(journal, &entry);
592    layout.write_journal(journal)?;
593
594    journal.operation_metrics.artifact_finalize_started += 1;
595    layout.write_journal(journal)?;
596    if artifact_path.exists() {
597        return Err(std::io::Error::new(
598            std::io::ErrorKind::AlreadyExists,
599            format!("artifact path already exists: {}", artifact_path.display()),
600        )
601        .into());
602    }
603    fs::rename(&temp_path, &artifact_path)?;
604    journal.operation_metrics.artifact_finalize_completed += 1;
605    entry.temp_path = None;
606    entry.advance_to(ArtifactState::Durable, current_timestamp_marker())?;
607    update_journal_entry(journal, &entry);
608    layout.write_journal(journal)?;
609
610    Ok(SnapshotArtifact {
611        canister_id: target.canister_id.clone(),
612        snapshot_id,
613        path: artifact_path,
614        checksum: checksum.hash,
615    })
616}
617
618// Replace one artifact row in the mutable journal.
619fn update_journal_entry(journal: &mut DownloadJournal, entry: &ArtifactJournalEntry) {
620    if let Some(existing) = journal.artifacts.iter_mut().find(|existing| {
621        existing.canister_id == entry.canister_id && existing.snapshot_id == entry.snapshot_id
622    }) {
623        *existing = entry.clone();
624    }
625}
626
627// Build one manifest member from a captured durable artifact.
628fn fleet_member(
629    selected_canister: &str,
630    subnet_canister_id: Option<&str>,
631    index: usize,
632    target: &SnapshotTarget,
633    artifacts: &[SnapshotArtifact],
634) -> Result<FleetMember, SnapshotManifestError> {
635    let Some(artifact) = artifacts
636        .iter()
637        .find(|artifact| artifact.canister_id == target.canister_id)
638    else {
639        return Err(SnapshotManifestError::MissingArtifact(
640            target.canister_id.clone(),
641        ));
642    };
643    let role = target_role(selected_canister, index, target);
644
645    Ok(FleetMember {
646        role: role.clone(),
647        canister_id: target.canister_id.clone(),
648        parent_canister_id: target.parent_canister_id.clone(),
649        subnet_canister_id: subnet_canister_id.map(str::to_string),
650        controller_hint: None,
651        identity_mode: if target.canister_id == selected_canister {
652            IdentityMode::Fixed
653        } else {
654            IdentityMode::Relocatable
655        },
656        verification_checks: vec![VerificationCheck {
657            kind: "status".to_string(),
658            roles: vec![role],
659        }],
660        source_snapshot: SourceSnapshot {
661            snapshot_id: artifact.snapshot_id.clone(),
662            module_hash: None,
663            wasm_hash: None,
664            code_version: None,
665            artifact_path: safe_path_segment(&target.canister_id),
666            checksum_algorithm: "sha256".to_string(),
667            checksum: Some(artifact.checksum.clone()),
668        },
669    })
670}
671
672// Build one canonical topology record for manifest hashing.
673fn topology_record(
674    selected_canister: &str,
675    index: usize,
676    target: &SnapshotTarget,
677) -> Result<TopologyRecord, SnapshotManifestError> {
678    Ok(TopologyRecord {
679        pid: parse_principal("fleet.members[].canister_id", &target.canister_id)?,
680        parent_pid: target
681            .parent_canister_id
682            .as_deref()
683            .map(|parent| parse_principal("fleet.members[].parent_canister_id", parent))
684            .transpose()?,
685        role: target_role(selected_canister, index, target),
686        module_hash: None,
687    })
688}
689
690// Return the manifest role for one selected snapshot target.
691fn target_role(selected_canister: &str, index: usize, target: &SnapshotTarget) -> String {
692    target.role.clone().unwrap_or_else(|| {
693        if target.canister_id == selected_canister {
694            "root".to_string()
695        } else {
696            format!("member-{index}")
697        }
698    })
699}
700
701// Parse one principal used by generated topology manifest metadata.
702fn parse_principal(field: &'static str, value: &str) -> Result<Principal, SnapshotManifestError> {
703    Principal::from_text(value).map_err(|_| SnapshotManifestError::InvalidPrincipal {
704        field,
705        value: value.to_string(),
706    })
707}
708
// Turn a canister id into a single path segment: ASCII letters, digits, `-`
// and `_` are kept, every other character becomes `_`.
fn safe_path_segment(value: &str) -> String {
    let mut segment = String::with_capacity(value.len());
    for ch in value.chars() {
        let keep = ch.is_ascii_alphanumeric() || ch == '-' || ch == '_';
        segment.push(if keep { ch } else { '_' });
    }
    segment
}
719
720#[cfg(test)]
721mod tests;