iceberg_rust_spec/spec/snapshot.rs
1//! Snapshot management and versioning for Iceberg tables.
2//!
3//! This module provides the core types and implementations for managing table snapshots, which
4//! represent the state of a table at specific points in time. Key components include:
5//!
6//! - [`Snapshot`] - Represents a point-in-time state of the table
7//! - [`Operation`] - Types of operations that can create new snapshots
8//! - [`Summary`] - Metadata about changes made in a snapshot
9//! - [`SnapshotReference`] - Named references to snapshots (branches and tags)
10//! - [`SnapshotRetention`] - Policies for snapshot retention and cleanup
11//!
12//! Snapshots are fundamental to Iceberg's time travel and version control capabilities,
13//! allowing tables to maintain their history and enabling features like rollbacks
14//! and incremental processing.
15
16use std::{
17 collections::HashMap,
18 fmt, str,
19 time::{SystemTime, UNIX_EPOCH},
20};
21
22use derive_builder::Builder;
23use derive_getters::Getters;
24use serde::{Deserialize, Serialize};
25
26use crate::error::Error;
27
28use _serde::SnapshotEnum;
29
30#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Builder, Getters)]
31#[serde(from = "SnapshotEnum", into = "SnapshotEnum")]
32#[builder(build_fn(error = "Error"), setter(prefix = "with"))]
33/// A snapshot represents the state of a table at some time and is used to access the complete set of data files in the table.
34pub struct Snapshot {
35 /// A unique long ID
36 #[builder(default = "generate_snapshot_id()")]
37 snapshot_id: i64,
38 /// The snapshot ID of the snapshot’s parent.
39 /// Omitted for any snapshot with no parent
40 #[builder(setter(strip_option), default)]
41 parent_snapshot_id: Option<i64>,
42 /// A monotonically increasing long that tracks the order of
43 /// changes to a table.
44 sequence_number: i64,
45 /// A timestamp when the snapshot was created, used for garbage
46 /// collection and table inspection
47 #[builder(
48 default = "SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as i64"
49 )]
50 timestamp_ms: i64,
51 /// The location of a manifest list for this snapshot that
52 /// tracks manifest files with additional metadata.
53 manifest_list: String,
54 /// A string map that summarizes the snapshot changes, including operation.
55 #[builder(default)]
56 summary: Summary,
57 /// ID of the table’s current schema when the snapshot was created.
58 #[builder(setter(strip_option), default)]
59 schema_id: Option<i32>,
60}
61
62/// Generates a random snapshot ID using a cryptographically secure random number generator.
63///
64/// The function generates 8 random bytes and converts them to a positive i64 value.
65/// This ensures unique snapshot IDs across the table's history.
66pub fn generate_snapshot_id() -> i64 {
67 let mut bytes: [u8; 8] = [0u8; 8];
68 getrandom::fill(&mut bytes).unwrap();
69 i64::from_le_bytes(bytes).abs()
70}
71
72impl fmt::Display for Snapshot {
73 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
74 write!(
75 f,
76 "{}",
77 &serde_json::to_string(self).map_err(|_| fmt::Error)?,
78 )
79 }
80}
81
82impl str::FromStr for Snapshot {
83 type Err = Error;
84 fn from_str(s: &str) -> Result<Self, Self::Err> {
85 serde_json::from_str(s).map_err(Error::from)
86 }
87}
88
89pub(crate) mod _serde {
90 use std::collections::HashMap;
91
92 use serde::{Deserialize, Serialize};
93
94 use super::{Operation, Snapshot, Summary};
95
96 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
97 #[serde(untagged)]
98 pub(super) enum SnapshotEnum {
99 V2(SnapshotV2),
100 V1(SnapshotV1),
101 }
102
103 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
104 #[serde(rename_all = "kebab-case")]
105 /// A snapshot represents the state of a table at some time and is used to access the complete set of data files in the table.
106 pub struct SnapshotV2 {
107 /// A unique long ID
108 pub snapshot_id: i64,
109 /// The snapshot ID of the snapshot’s parent.
110 /// Omitted for any snapshot with no parent
111 #[serde(skip_serializing_if = "Option::is_none")]
112 pub parent_snapshot_id: Option<i64>,
113 /// A monotonically increasing long that tracks the order of
114 /// changes to a table.
115 pub sequence_number: i64,
116 /// A timestamp when the snapshot was created, used for garbage
117 /// collection and table inspection
118 pub timestamp_ms: i64,
119 /// The location of a manifest list for this snapshot that
120 /// tracks manifest files with additional metadata.
121 pub manifest_list: String,
122 /// A string map that summarizes the snapshot changes, including operation.
123 pub summary: Summary,
124 /// ID of the table’s current schema when the snapshot was created.
125 #[serde(skip_serializing_if = "Option::is_none")]
126 pub schema_id: Option<i32>,
127 }
128
129 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
130 #[serde(rename_all = "kebab-case")]
131 /// A snapshot represents the state of a table at some time and is used to access the complete set of data files in the table.
132 pub struct SnapshotV1 {
133 /// A unique long ID
134 pub snapshot_id: i64,
135 /// The snapshot ID of the snapshot’s parent.
136 /// Omitted for any snapshot with no parent
137 #[serde(skip_serializing_if = "Option::is_none")]
138 pub parent_snapshot_id: Option<i64>,
139 /// A timestamp when the snapshot was created, used for garbage
140 /// collection and table inspection
141 pub timestamp_ms: i64,
142 /// The location of a manifest list for this snapshot that
143 /// tracks manifest files with additional metadata.
144 #[serde(skip_serializing_if = "Option::is_none")]
145 pub manifest_list: Option<String>,
146 /// A list of manifest file locations. Must be omitted if manifest-list is present
147 #[serde(skip_serializing_if = "Option::is_none")]
148 pub manifests: Option<Vec<String>>,
149 /// A string map that summarizes the snapshot changes, including operation.
150 #[serde(skip_serializing_if = "Option::is_none")]
151 pub summary: Option<Summary>,
152 /// ID of the table’s current schema when the snapshot was created.
153 #[serde(skip_serializing_if = "Option::is_none")]
154 pub schema_id: Option<i32>,
155 }
156 impl From<SnapshotEnum> for Snapshot {
157 fn from(value: SnapshotEnum) -> Self {
158 match value {
159 SnapshotEnum::V2(value) => value.into(),
160 SnapshotEnum::V1(value) => value.into(),
161 }
162 }
163 }
164
165 impl From<Snapshot> for SnapshotEnum {
166 fn from(value: Snapshot) -> Self {
167 SnapshotEnum::V2(value.into())
168 }
169 }
170
171 impl From<SnapshotV1> for Snapshot {
172 fn from(v1: SnapshotV1) -> Self {
173 Snapshot {
174 snapshot_id: v1.snapshot_id,
175 parent_snapshot_id: v1.parent_snapshot_id,
176 sequence_number: 0,
177 timestamp_ms: v1.timestamp_ms,
178 manifest_list: v1.manifest_list.unwrap_or_default(),
179 summary: v1.summary.unwrap_or(Summary {
180 operation: Operation::default(),
181 other: HashMap::new(),
182 }),
183 schema_id: v1.schema_id,
184 }
185 }
186 }
187
188 impl From<Snapshot> for SnapshotV1 {
189 fn from(v1: Snapshot) -> Self {
190 SnapshotV1 {
191 snapshot_id: v1.snapshot_id,
192 parent_snapshot_id: v1.parent_snapshot_id,
193 timestamp_ms: v1.timestamp_ms,
194 manifest_list: Some(v1.manifest_list),
195 summary: Some(v1.summary),
196 schema_id: v1.schema_id,
197 manifests: None,
198 }
199 }
200 }
201
202 impl From<SnapshotV2> for Snapshot {
203 fn from(value: SnapshotV2) -> Self {
204 Snapshot {
205 snapshot_id: value.snapshot_id,
206 parent_snapshot_id: value.parent_snapshot_id,
207 sequence_number: value.sequence_number,
208 timestamp_ms: value.timestamp_ms,
209 manifest_list: value.manifest_list,
210 summary: value.summary,
211 schema_id: value.schema_id,
212 }
213 }
214 }
215
216 impl From<Snapshot> for SnapshotV2 {
217 fn from(value: Snapshot) -> Self {
218 SnapshotV2 {
219 snapshot_id: value.snapshot_id,
220 parent_snapshot_id: value.parent_snapshot_id,
221 sequence_number: value.sequence_number,
222 timestamp_ms: value.timestamp_ms,
223 manifest_list: value.manifest_list,
224 summary: value.summary,
225 schema_id: value.schema_id,
226 }
227 }
228 }
229}
230
231#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
232#[serde(rename_all = "lowercase")]
233/// The operation field is used by some operations, like snapshot expiration, to skip processing certain snapshots.
234#[derive(Default)]
235pub enum Operation {
236 /// Only data files were added and no files were removed.
237 #[default]
238 Append,
239 /// Data and delete files were added and removed without changing table data;
240 /// i.e., compaction, changing the data file format, or relocating data files.
241 Replace,
242 /// Data and delete files were added and removed in a logical overwrite operation.
243 Overwrite,
244 /// Data files were removed and their contents logically deleted and/or delete files were added to delete rows.
245 Delete,
246}
247
248#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone, Default)]
249/// Summarises the changes in the snapshot.
250pub struct Summary {
251 /// The type of operation in the snapshot
252 pub operation: Operation,
253 /// Other summary data.
254 #[serde(flatten)]
255 pub other: HashMap<String, String>,
256}
257
258#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
259#[serde(rename_all = "kebab-case")]
260/// Iceberg tables keep track of branches and tags using snapshot references.
261pub struct SnapshotReference {
262 /// A reference’s snapshot ID. The tagged snapshot or latest snapshot of a branch.
263 pub snapshot_id: i64,
264 #[serde(flatten)]
265 /// Snapshot retention policy
266 pub retention: SnapshotRetention,
267}
268
269#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
270#[serde(rename_all = "lowercase", tag = "type")]
271/// The snapshot expiration procedure removes snapshots from table metadata and applies the table’s retention policy.
272pub enum SnapshotRetention {
273 #[serde(rename_all = "kebab-case")]
274 /// Branches are mutable named references that can be updated by committing a new snapshot as
275 /// the branch’s referenced snapshot using the Commit Conflict Resolution and Retry procedures.
276 Branch {
277 /// A positive number for the minimum number of snapshots to keep in a branch while expiring snapshots.
278 /// Defaults to table property history.expire.min-snapshots-to-keep.
279 #[serde(skip_serializing_if = "Option::is_none")]
280 min_snapshots_to_keep: Option<i32>,
281 /// A positive number for the max age of snapshots to keep when expiring, including the latest snapshot.
282 /// Defaults to table property history.expire.max-snapshot-age-ms.
283 #[serde(skip_serializing_if = "Option::is_none")]
284 max_snapshot_age_ms: Option<i64>,
285 /// For snapshot references except the main branch, a positive number for the max age of the snapshot reference to keep while expiring snapshots.
286 /// Defaults to table property history.expire.max-ref-age-ms. The main branch never expires.
287 #[serde(skip_serializing_if = "Option::is_none")]
288 max_ref_age_ms: Option<i64>,
289 },
290 #[serde(rename_all = "kebab-case")]
291 /// Tags are labels for individual snapshots.
292 Tag {
293 /// For snapshot references except the main branch, a positive number for the max age of the snapshot reference to keep while expiring snapshots.
294 /// Defaults to table property history.expire.max-ref-age-ms. The main branch never expires.
295 max_ref_age_ms: i64,
296 },
297}
298
299impl Default for SnapshotRetention {
300 fn default() -> Self {
301 SnapshotRetention::Branch {
302 max_ref_age_ms: None,
303 max_snapshot_age_ms: None,
304 min_snapshots_to_keep: None,
305 }
306 }
307}