iceberg_rs/model/table.rs
1/*!
2Defines the [table metadata](https://iceberg.apache.org/spec/#table-metadata).
3The main struct here is [TableMetadataV2] which defines the data for a table.
4*/
5use std::collections::HashMap;
6
7use crate::model::{
8    partition::PartitionSpec,
9    schema,
10    snapshot::{Reference, SnapshotV2},
11    sort,
12};
13use serde::{Deserialize, Serialize};
14use uuid::Uuid;
15
/// Fields for version 2 of the table metadata.
///
/// Field names are mapped to kebab-case keys (`table_uuid` <-> `table-uuid`).
/// There is no `format_version` field on this struct: the `format-version`
/// key is handled by the serde `tag` container attribute below.
//
// NOTE(review): per serde's container-attribute documentation, `tag` on a
// struct appears to serialize the struct *name* as the tag value (i.e.
// `"format-version": "TableMetadataV2"` rather than `2`), and it does not
// appear to validate the value on deserialization. Confirm against the serde
// docs and the Iceberg spec before relying on round-tripping this type.
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case", tag = "format-version")]
pub struct TableMetadataV2 {
    /// A UUID that identifies the table.
    pub table_uuid: Uuid,
    /// The table's base location.
    pub location: String,
    /// The table's highest sequence number.
    pub last_sequence_number: i64,
    /// Timestamp in milliseconds from the unix epoch when the table was last updated.
    pub last_updated_ms: i64,
    /// An integer; the highest assigned column ID for the table.
    pub last_column_id: i32,
    /// A list of schemas, stored as objects with schema-id.
    pub schemas: Vec<schema::SchemaV2>,
    /// ID of the table's current schema.
    pub current_schema_id: i32,
    /// A list of partition specs, stored as full partition spec objects.
    pub partition_specs: Vec<PartitionSpec>,
    /// ID of the "current" spec that writers should use by default.
    pub default_spec_id: i32,
    /// An integer; the highest assigned partition field ID across all partition specs for the table.
    pub last_partition_id: i32,
    /// A string to string map of table properties. This is used to control settings that
    /// affect reading and writing and is not intended to be used for arbitrary metadata.
    /// For example, `commit.retry.num-retries` is used to control the number of commit retries.
    /// Optional: may be absent from the metadata document.
    pub properties: Option<HashMap<String, String>>,
    /// Long ID of the current table snapshot; must be the same as the current
    /// ID of the main branch in refs. Optional.
    pub current_snapshot_id: Option<i64>,
    /// A list of valid snapshots. Valid snapshots are snapshots for which all
    /// data files exist in the file system. A data file must not be deleted
    /// from the file system until the last snapshot in which it was listed is
    /// garbage collected. Optional.
    pub snapshots: Option<Vec<SnapshotV2>>,
    /// A list (optional) of timestamp and snapshot ID pairs that encodes changes
    /// to the current snapshot for the table. Each time the current-snapshot-id
    /// is changed, a new entry should be added with the last-updated-ms
    /// and the new current-snapshot-id. When snapshots are expired from
    /// the list of valid snapshots, all entries before a snapshot that has
    /// expired should be removed.
    pub snapshot_log: Option<Vec<SnapshotLog>>,

    /// A list (optional) of timestamp and metadata file location pairs
    /// that encodes changes to the previous metadata files for the table.
    /// Each time a new metadata file is created, a new entry of the
    /// previous metadata file location should be added to the list.
    /// Tables can be configured to remove the oldest metadata log entries and
    /// keep a fixed-size log of the most recent entries after a commit.
    pub metadata_log: Option<Vec<MetadataLog>>,

    /// A list of sort orders, stored as full sort order objects.
    pub sort_orders: Vec<sort::SortOrder>,
    /// Default sort order id of the table. Note that this could be used by
    /// writers, but is not used when reading because reads use the specs
    /// stored in manifest files.
    pub default_sort_order_id: i64,
    /// A map of snapshot references. The map keys are the unique snapshot reference
    /// names in the table, and the map values are snapshot reference objects.
    /// There is always a main branch reference pointing to the current-snapshot-id
    /// even if the refs map is null.
    pub refs: Option<HashMap<String, Reference>>,
}
81
/// One entry of the metadata log: encodes a change to the previous metadata
/// files for the table.
///
/// Serialized with kebab-case keys (`metadata-file`, `timestamp-ms`).
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "kebab-case")]
pub struct MetadataLog {
    /// Location of the metadata file this log entry refers to.
    pub metadata_file: String,
    /// Time the new metadata was created.
    // NOTE(review): presumably milliseconds since the unix epoch, going by the
    // `_ms` suffix and the sibling `last_updated_ms` field -- confirm.
    pub timestamp_ms: i64,
}
91
/// One entry of the snapshot log: records when a snapshot was made.
///
/// Serialized with kebab-case keys (`snapshot-id`, `timestamp-ms`).
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "kebab-case")]
pub struct SnapshotLog {
    /// Id of the snapshot.
    pub snapshot_id: i64,
    /// Last updated timestamp.
    // NOTE(review): presumably milliseconds since the unix epoch, going by the
    // `_ms` suffix -- confirm.
    pub timestamp_ms: i64,
}
101
#[cfg(test)]
mod tests {
    use super::TableMetadataV2;
    use anyhow::Result;

    /// A metadata document carrying every required key (plus the optional
    /// `properties` and `metadata-log`) is expected to deserialize without error.
    #[test]
    fn test_deserialize_table_data_v2() -> Result<()> {
        let json = r#"
            {
                "format-version" : 2,
                "table-uuid": "fb072c92-a02b-11e9-ae9c-1bb7bc9eca94",
                "location": "s3://b/wh/data.db/table",
                "last-sequence-number" : 1,
                "last-updated-ms": 1515100955770,
                "last-column-id": 1,
                "schemas": [
                    {
                        "schema-id" : 1,
                        "type" : "struct",
                        "fields" :[
                            {
                                "id": 1,
                                "name": "struct_name",
                                "required": true,
                                "field_type": "fixed[1]"
                            }
                        ]
                    }
                ],
                "current-schema-id" : 1,
                "partition-specs": [
                    {
                        "spec-id": 1,
                        "fields": [
                            {  
                                "source-id": 4,  
                                "field-id": 1000,  
                                "name": "ts_day",  
                                "transform": "day"
                            } 
                        ]
                    }
                ],
                "default-spec-id": 1,
                "last-partition-id": 1,
                "properties": {
                    "commit.retry.num-retries": "1"
                },
                "metadata-log": [
                    {  
                        "metadata-file": "s3://bucket/.../v1.json",  
                        "timestamp-ms": 1515100
                    }
                ],
                "sort-orders": [],
                "default-sort-order-id": 0
            }
        "#;
        // Only success matters here; the parsed value itself is not inspected.
        let _parsed: TableMetadataV2 = serde_json::from_str(json)?;
        Ok(())
    }

    /// A `table-uuid` that is not a valid UUID string must be rejected.
    #[test]
    fn test_invalid_table_uuid() -> Result<()> {
        let json = r#"
            {
                "format-version" : 2,
                "table-uuid": "xxxx"
            }
        "#;
        let outcome = serde_json::from_str::<TableMetadataV2>(json);
        assert!(outcome.is_err());
        Ok(())
    }

    /// A document declaring `format-version` 1 is expected to fail to deserialize.
    ///
    /// Note that this payload also omits every required field, so the failure
    /// asserted here does not by itself prove that the version value is checked.
    #[test]
    fn test_deserialize_table_data_v2_invalid_format_version() -> Result<()> {
        let json = r#"
            {
                "format-version" : 1
            }
        "#;
        let outcome = serde_json::from_str::<TableMetadataV2>(json);
        assert!(outcome.is_err());
        Ok(())
    }
}
186}