iceberg_rust_spec/spec/
schema.rs

1/*!
2 * Schema definition and management for Iceberg tables
3 *
4 * This module provides the core schema functionality for Iceberg tables, including:
5 * - Schema versioning and evolution
6 * - Field definitions with unique IDs
7 * - Required vs optional field specifications
8 * - Schema builder patterns for constructing complex schemas
9 * - Schema projection for selecting subsets of fields
10 *
11 * The schema system is fundamental to Iceberg's data model, providing:
12 * - Type safety and validation
13 * - Schema evolution capabilities
14 * - Efficient field access via ID-based lookups
15 * - Support for nested data structures
16 */
17
18use std::{fmt, ops::Deref, str};
19
20use super::types::{StructField, StructType, StructTypeBuilder};
21use derive_getters::Getters;
22use serde::{Deserialize, Serialize};
23
24use crate::error::Error;
25
/// Schema ID that [`SchemaBuilder::build`] falls back to when no schema ID
/// was set on the builder.
pub static DEFAULT_SCHEMA_ID: i32 = 0;
27
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone, Getters)]
#[serde(rename_all = "kebab-case")]
/// Names and types of fields in a table.
///
/// Serialized with kebab-case keys (`schema-id`, `identifier-field-ids`). The
/// contents of `fields` are flattened into the same object instead of being
/// nested under a `fields` key of their own.
pub struct Schema {
    /// Identifier of the schema
    schema_id: i32,
    /// Set of primitive fields that identify rows in a table.
    /// Omitted from the serialized form when it is `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    identifier_field_ids: Option<Vec<i32>>,

    #[serde(flatten)]
    /// The struct fields
    fields: StructType,
}
42
43impl Deref for Schema {
44    type Target = StructType;
45    fn deref(&self) -> &Self::Target {
46        &self.fields
47    }
48}
49
50impl Schema {
51    /// Creates a new SchemaBuilder to construct a Schema using the builder pattern
52    ///
53    /// # Returns
54    /// * A SchemaBuilder instance configured with default values
55    ///
56    /// This is the recommended way to construct Schema instances when you need
57    /// to add fields incrementally or set optional parameters.
58    pub fn builder() -> SchemaBuilder {
59        SchemaBuilder::default()
60    }
61
62    /// Creates a new Schema from a StructType and associated metadata
63    ///
64    /// # Arguments
65    /// * `fields` - The StructType containing the schema's fields
66    /// * `schema_id` - Unique identifier for this schema
67    /// * `identifier_field_ids` - Optional list of field IDs that identify rows in the table
68    ///
69    /// # Returns
70    /// * A new Schema instance with the provided fields and metadata
71    pub fn from_struct_type(
72        fields: StructType,
73        schema_id: i32,
74        identifier_field_ids: Option<Vec<i32>>,
75    ) -> Self {
76        Schema {
77            schema_id,
78            identifier_field_ids,
79            fields,
80        }
81    }
82
83    /// Creates a new Schema containing only the specified field IDs
84    ///
85    /// # Arguments
86    /// * `ids` - Array of field IDs to include in the projected schema
87    ///
88    /// # Returns
89    /// * A new Schema containing only the specified fields, maintaining the original
90    ///   schema ID and any identifier fields that were included in the projection
91    pub fn project(&self, ids: &[i32]) -> Schema {
92        Schema {
93            schema_id: self.schema_id,
94            identifier_field_ids: self.identifier_field_ids.as_ref().map(|x| {
95                x.iter()
96                    .filter(|x| ids.contains(x))
97                    .map(ToOwned::to_owned)
98                    .collect()
99            }),
100            fields: StructType::new(
101                self.fields()
102                    .iter()
103                    .filter(|x| ids.contains(&x.id))
104                    .map(ToOwned::to_owned)
105                    .collect(),
106            ),
107        }
108    }
109}
110
111impl fmt::Display for Schema {
112    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
113        write!(
114            f,
115            "{}",
116            &serde_json::to_string(self).map_err(|_| fmt::Error)?,
117        )
118    }
119}
120
121impl str::FromStr for Schema {
122    type Err = Error;
123    fn from_str(s: &str) -> Result<Self, Self::Err> {
124        serde_json::from_str(s).map_err(Error::from)
125    }
126}
127
/// Incrementally assembles a [`Schema`].
///
/// Usually obtained through [`Schema::builder`]. In the default state both
/// optional settings are `None`.
#[derive(Default)]
pub struct SchemaBuilder {
    /// Schema ID to use; `build` falls back to `DEFAULT_SCHEMA_ID` when this is `None`.
    schema_id: Option<i32>,
    /// Field IDs that identify rows in the table, if any were provided.
    identifier_field_ids: Option<Vec<i32>>,
    /// Builder that collects the schema's struct fields.
    fields: StructTypeBuilder,
}
134
135impl SchemaBuilder {
136    /// Sets the schema ID for this schema
137    ///
138    /// # Arguments
139    /// * `schema_id` - The unique identifier for this schema
140    ///
141    /// # Returns
142    /// * A mutable reference to self for method chaining
143    pub fn with_schema_id(&mut self, schema_id: i32) -> &mut Self {
144        self.schema_id = Some(schema_id);
145        self
146    }
147
148    /// Sets the identifier field IDs for this schema
149    ///
150    /// # Arguments
151    /// * `ids` - Collection of field IDs that identify rows in the table
152    ///
153    /// # Returns
154    /// * A mutable reference to self for method chaining
155    pub fn with_identifier_field_ids(&mut self, ids: impl Into<Vec<i32>>) -> &mut Self {
156        self.identifier_field_ids = Some(ids.into());
157        self
158    }
159
160    /// Adds a struct field to this schema
161    ///
162    /// # Arguments
163    /// * `field` - The StructField to add to the schema
164    ///
165    /// # Returns
166    /// * A mutable reference to self for method chaining
167    pub fn with_struct_field(&mut self, field: StructField) -> &mut Self {
168        self.fields.with_struct_field(field);
169        self
170    }
171
172    /// Builds and returns a new Schema from this builder's configuration
173    ///
174    /// # Returns
175    /// * `Ok(Schema)` - A new Schema instance with the configured fields and metadata
176    /// * `Err(Error)` - If there was an error building the schema
177    pub fn build(&mut self) -> Result<Schema, Error> {
178        let fields = self.fields.build()?;
179
180        Ok(Schema {
181            schema_id: self.schema_id.unwrap_or(DEFAULT_SCHEMA_ID),
182            identifier_field_ids: self.identifier_field_ids.take(),
183            fields,
184        })
185    }
186}
187
188impl TryFrom<SchemaV2> for Schema {
189    type Error = Error;
190    fn try_from(value: SchemaV2) -> Result<Self, Self::Error> {
191        Ok(Schema {
192            schema_id: value.schema_id,
193            identifier_field_ids: value.identifier_field_ids,
194            fields: value.fields,
195        })
196    }
197}
198
199impl TryFrom<SchemaV1> for Schema {
200    type Error = Error;
201    fn try_from(value: SchemaV1) -> Result<Self, Self::Error> {
202        Ok(Schema {
203            schema_id: value.schema_id.unwrap_or(0),
204            identifier_field_ids: value.identifier_field_ids,
205            fields: value.fields,
206        })
207    }
208}
209
210impl From<Schema> for SchemaV2 {
211    fn from(value: Schema) -> Self {
212        SchemaV2 {
213            schema_id: value.schema_id,
214            identifier_field_ids: value.identifier_field_ids,
215            fields: value.fields,
216        }
217    }
218}
219
220impl From<Schema> for SchemaV1 {
221    fn from(value: Schema) -> Self {
222        SchemaV1 {
223            schema_id: Some(value.schema_id),
224            identifier_field_ids: value.identifier_field_ids,
225            fields: value.fields,
226        }
227    }
228}
229
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(rename_all = "kebab-case")]
/// Names and types of fields in a table.
///
/// Serialization shape in which the schema ID is a plain `i32` rather than an
/// `Option`. Keys are kebab-case and the contents of `fields` are flattened
/// into the same object.
pub struct SchemaV2 {
    /// Identifier of the schema
    pub schema_id: i32,
    /// Set of primitive fields that identify rows in a table.
    /// Omitted from the serialized form when it is `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub identifier_field_ids: Option<Vec<i32>>,

    #[serde(flatten)]
    /// The struct fields
    pub fields: StructType,
}
244
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(rename_all = "kebab-case")]
/// Names and types of fields in a table.
///
/// Serialization shape in which the schema ID is optional. Keys are
/// kebab-case and the contents of `fields` are flattened into the same object.
pub struct SchemaV1 {
    /// Identifier of the schema.
    /// Omitted from the serialized form when it is `None`; the conversions in
    /// this file to `Schema` and `SchemaV2` substitute 0 for a missing ID.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub schema_id: Option<i32>,
    /// Set of primitive fields that identify rows in a table.
    /// Omitted from the serialized form when it is `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub identifier_field_ids: Option<Vec<i32>>,

    #[serde(flatten)]
    /// The struct fields
    pub fields: StructType,
}
260
261impl From<SchemaV1> for SchemaV2 {
262    fn from(v1: SchemaV1) -> Self {
263        SchemaV2 {
264            schema_id: v1.schema_id.unwrap_or(0),
265            identifier_field_ids: v1.identifier_field_ids,
266            fields: v1.fields,
267        }
268    }
269}
270
#[cfg(test)]
mod tests {
    use crate::spec::types::{PrimitiveType, Type};

    use super::*;

    /// Deserializes a two-field schema document into `SchemaV2` and checks the
    /// schema ID plus each field's ID, type and `required` flag.
    #[test]
    fn schema() {
        let record = r#"
        {
            "type": "struct",
            "schema-id": 1,
            "fields": [ {
            "id": 1,
            "name": "id",
            "required": true,
            "type": "uuid"
            }, {
            "id": 2,
            "name": "data",
            "required": false,
            "type": "int"
            } ]
            }
        "#;

        let parsed = serde_json::from_str::<SchemaV2>(record).unwrap();
        assert_eq!(1, parsed.schema_id);

        // First field: required UUID column with ID 1.
        let uuid_field = &parsed.fields[0];
        assert_eq!(Type::Primitive(PrimitiveType::Uuid), uuid_field.field_type);
        assert_eq!(1, uuid_field.id);
        assert!(uuid_field.required);

        // Second field: optional int column with ID 2.
        let int_field = &parsed.fields[1];
        assert_eq!(Type::Primitive(PrimitiveType::Int), int_field.field_type);
        assert_eq!(2, int_field.id);
        assert!(!int_field.required);
    }
}
313}