iceberg_rust_spec/spec/
snapshot.rs

1//! Snapshot management and versioning for Iceberg tables.
2//!
3//! This module provides the core types and implementations for managing table snapshots, which
4//! represent the state of a table at specific points in time. Key components include:
5//!
6//! - [`Snapshot`] - Represents a point-in-time state of the table
7//! - [`Operation`] - Types of operations that can create new snapshots
8//! - [`Summary`] - Metadata about changes made in a snapshot
9//! - [`SnapshotReference`] - Named references to snapshots (branches and tags)
10//! - [`SnapshotRetention`] - Policies for snapshot retention and cleanup
11//!
12//! Snapshots are fundamental to Iceberg's time travel and version control capabilities,
13//! allowing tables to maintain their history and enabling features like rollbacks
14//! and incremental processing.
15
16use std::{
17    collections::HashMap,
18    fmt, str,
19    time::{SystemTime, UNIX_EPOCH},
20};
21
22use derive_builder::Builder;
23use derive_getters::Getters;
24use serde::{Deserialize, Serialize};
25
26use crate::error::Error;
27
28use _serde::SnapshotEnum;
29
30#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Builder, Getters)]
31#[serde(from = "SnapshotEnum", into = "SnapshotEnum")]
32#[builder(build_fn(error = "Error"), setter(prefix = "with"))]
33/// A snapshot represents the state of a table at some time and is used to access the complete set of data files in the table.
34pub struct Snapshot {
35    /// A unique long ID
36    #[builder(default = "generate_snapshot_id()")]
37    snapshot_id: i64,
38    /// The snapshot ID of the snapshot’s parent.
39    /// Omitted for any snapshot with no parent
40    #[builder(setter(strip_option), default)]
41    parent_snapshot_id: Option<i64>,
42    /// A monotonically increasing long that tracks the order of
43    /// changes to a table.
44    sequence_number: i64,
45    /// A timestamp when the snapshot was created, used for garbage
46    /// collection and table inspection
47    #[builder(
48        default = "SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as i64"
49    )]
50    timestamp_ms: i64,
51    /// The location of a manifest list for this snapshot that
52    /// tracks manifest files with additional metadata.
53    manifest_list: String,
54    /// A string map that summarizes the snapshot changes, including operation.
55    #[builder(default)]
56    summary: Summary,
57    /// ID of the table’s current schema when the snapshot was created.
58    #[builder(setter(strip_option), default)]
59    schema_id: Option<i32>,
60}
61
62/// Generates a random snapshot ID using a cryptographically secure random number generator.
63///
64/// The function generates 8 random bytes and converts them to a positive i64 value.
65/// This ensures unique snapshot IDs across the table's history.
66pub fn generate_snapshot_id() -> i64 {
67    let mut bytes: [u8; 8] = [0u8; 8];
68    getrandom::fill(&mut bytes).unwrap();
69    i64::from_le_bytes(bytes).abs()
70}
71
72impl fmt::Display for Snapshot {
73    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
74        write!(
75            f,
76            "{}",
77            &serde_json::to_string(self).map_err(|_| fmt::Error)?,
78        )
79    }
80}
81
82impl str::FromStr for Snapshot {
83    type Err = Error;
84    fn from_str(s: &str) -> Result<Self, Self::Err> {
85        serde_json::from_str(s).map_err(Error::from)
86    }
87}
88
89pub(crate) mod _serde {
90    use std::collections::HashMap;
91
92    use serde::{Deserialize, Serialize};
93
94    use super::{Operation, Snapshot, Summary};
95
96    #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
97    #[serde(untagged)]
98    pub(super) enum SnapshotEnum {
99        V2(SnapshotV2),
100        V1(SnapshotV1),
101    }
102
103    #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
104    #[serde(rename_all = "kebab-case")]
105    /// A snapshot represents the state of a table at some time and is used to access the complete set of data files in the table.
106    pub struct SnapshotV2 {
107        /// A unique long ID
108        pub snapshot_id: i64,
109        /// The snapshot ID of the snapshot’s parent.
110        /// Omitted for any snapshot with no parent
111        #[serde(skip_serializing_if = "Option::is_none")]
112        pub parent_snapshot_id: Option<i64>,
113        /// A monotonically increasing long that tracks the order of
114        /// changes to a table.
115        pub sequence_number: i64,
116        /// A timestamp when the snapshot was created, used for garbage
117        /// collection and table inspection
118        pub timestamp_ms: i64,
119        /// The location of a manifest list for this snapshot that
120        /// tracks manifest files with additional metadata.
121        pub manifest_list: String,
122        /// A string map that summarizes the snapshot changes, including operation.
123        pub summary: Summary,
124        /// ID of the table’s current schema when the snapshot was created.
125        #[serde(skip_serializing_if = "Option::is_none")]
126        pub schema_id: Option<i32>,
127    }
128
129    #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
130    #[serde(rename_all = "kebab-case")]
131    /// A snapshot represents the state of a table at some time and is used to access the complete set of data files in the table.
132    pub struct SnapshotV1 {
133        /// A unique long ID
134        pub snapshot_id: i64,
135        /// The snapshot ID of the snapshot’s parent.
136        /// Omitted for any snapshot with no parent
137        #[serde(skip_serializing_if = "Option::is_none")]
138        pub parent_snapshot_id: Option<i64>,
139        /// A timestamp when the snapshot was created, used for garbage
140        /// collection and table inspection
141        pub timestamp_ms: i64,
142        /// The location of a manifest list for this snapshot that
143        /// tracks manifest files with additional metadata.
144        #[serde(skip_serializing_if = "Option::is_none")]
145        pub manifest_list: Option<String>,
146        /// A list of manifest file locations. Must be omitted if manifest-list is present
147        #[serde(skip_serializing_if = "Option::is_none")]
148        pub manifests: Option<Vec<String>>,
149        /// A string map that summarizes the snapshot changes, including operation.
150        #[serde(skip_serializing_if = "Option::is_none")]
151        pub summary: Option<Summary>,
152        /// ID of the table’s current schema when the snapshot was created.
153        #[serde(skip_serializing_if = "Option::is_none")]
154        pub schema_id: Option<i32>,
155    }
156    impl From<SnapshotEnum> for Snapshot {
157        fn from(value: SnapshotEnum) -> Self {
158            match value {
159                SnapshotEnum::V2(value) => value.into(),
160                SnapshotEnum::V1(value) => value.into(),
161            }
162        }
163    }
164
165    impl From<Snapshot> for SnapshotEnum {
166        fn from(value: Snapshot) -> Self {
167            SnapshotEnum::V2(value.into())
168        }
169    }
170
171    impl From<SnapshotV1> for Snapshot {
172        fn from(v1: SnapshotV1) -> Self {
173            Snapshot {
174                snapshot_id: v1.snapshot_id,
175                parent_snapshot_id: v1.parent_snapshot_id,
176                sequence_number: 0,
177                timestamp_ms: v1.timestamp_ms,
178                manifest_list: v1.manifest_list.unwrap_or_default(),
179                summary: v1.summary.unwrap_or(Summary {
180                    operation: Operation::default(),
181                    other: HashMap::new(),
182                }),
183                schema_id: v1.schema_id,
184            }
185        }
186    }
187
188    impl From<Snapshot> for SnapshotV1 {
189        fn from(v1: Snapshot) -> Self {
190            SnapshotV1 {
191                snapshot_id: v1.snapshot_id,
192                parent_snapshot_id: v1.parent_snapshot_id,
193                timestamp_ms: v1.timestamp_ms,
194                manifest_list: Some(v1.manifest_list),
195                summary: Some(v1.summary),
196                schema_id: v1.schema_id,
197                manifests: None,
198            }
199        }
200    }
201
202    impl From<SnapshotV2> for Snapshot {
203        fn from(value: SnapshotV2) -> Self {
204            Snapshot {
205                snapshot_id: value.snapshot_id,
206                parent_snapshot_id: value.parent_snapshot_id,
207                sequence_number: value.sequence_number,
208                timestamp_ms: value.timestamp_ms,
209                manifest_list: value.manifest_list,
210                summary: value.summary,
211                schema_id: value.schema_id,
212            }
213        }
214    }
215
216    impl From<Snapshot> for SnapshotV2 {
217        fn from(value: Snapshot) -> Self {
218            SnapshotV2 {
219                snapshot_id: value.snapshot_id,
220                parent_snapshot_id: value.parent_snapshot_id,
221                sequence_number: value.sequence_number,
222                timestamp_ms: value.timestamp_ms,
223                manifest_list: value.manifest_list,
224                summary: value.summary,
225                schema_id: value.schema_id,
226            }
227        }
228    }
229}
230
231#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
232#[serde(rename_all = "lowercase")]
233/// The operation field is used by some operations, like snapshot expiration, to skip processing certain snapshots.
234#[derive(Default)]
235pub enum Operation {
236    /// Only data files were added and no files were removed.
237    #[default]
238    Append,
239    /// Data and delete files were added and removed without changing table data;
240    /// i.e., compaction, changing the data file format, or relocating data files.
241    Replace,
242    /// Data and delete files were added and removed in a logical overwrite operation.
243    Overwrite,
244    /// Data files were removed and their contents logically deleted and/or delete files were added to delete rows.
245    Delete,
246}
247
248#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone, Default)]
249/// Summarises the changes in the snapshot.
250pub struct Summary {
251    /// The type of operation in the snapshot
252    pub operation: Operation,
253    /// Other summary data.
254    #[serde(flatten)]
255    pub other: HashMap<String, String>,
256}
257
258#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
259#[serde(rename_all = "kebab-case")]
260/// Iceberg tables keep track of branches and tags using snapshot references.
261pub struct SnapshotReference {
262    /// A reference’s snapshot ID. The tagged snapshot or latest snapshot of a branch.
263    pub snapshot_id: i64,
264    #[serde(flatten)]
265    /// Snapshot retention policy
266    pub retention: SnapshotRetention,
267}
268
269#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
270#[serde(rename_all = "lowercase", tag = "type")]
271/// The snapshot expiration procedure removes snapshots from table metadata and applies the table’s retention policy.
272pub enum SnapshotRetention {
273    #[serde(rename_all = "kebab-case")]
274    /// Branches are mutable named references that can be updated by committing a new snapshot as
275    /// the branch’s referenced snapshot using the Commit Conflict Resolution and Retry procedures.
276    Branch {
277        /// A positive number for the minimum number of snapshots to keep in a branch while expiring snapshots.
278        /// Defaults to table property history.expire.min-snapshots-to-keep.
279        #[serde(skip_serializing_if = "Option::is_none")]
280        min_snapshots_to_keep: Option<i32>,
281        /// A positive number for the max age of snapshots to keep when expiring, including the latest snapshot.
282        /// Defaults to table property history.expire.max-snapshot-age-ms.
283        #[serde(skip_serializing_if = "Option::is_none")]
284        max_snapshot_age_ms: Option<i64>,
285        /// For snapshot references except the main branch, a positive number for the max age of the snapshot reference to keep while expiring snapshots.
286        /// Defaults to table property history.expire.max-ref-age-ms. The main branch never expires.
287        #[serde(skip_serializing_if = "Option::is_none")]
288        max_ref_age_ms: Option<i64>,
289    },
290    #[serde(rename_all = "kebab-case")]
291    /// Tags are labels for individual snapshots.
292    Tag {
293        /// For snapshot references except the main branch, a positive number for the max age of the snapshot reference to keep while expiring snapshots.
294        /// Defaults to table property history.expire.max-ref-age-ms. The main branch never expires.
295        max_ref_age_ms: i64,
296    },
297}
298
299impl Default for SnapshotRetention {
300    fn default() -> Self {
301        SnapshotRetention::Branch {
302            max_ref_age_ms: None,
303            max_snapshot_age_ms: None,
304            min_snapshots_to_keep: None,
305        }
306    }
307}