iceberg_datafusion/table/
table_provider_factory.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
18use std::borrow::Cow;
19use std::collections::HashMap;
20use std::sync::Arc;
21
22use async_trait::async_trait;
23use datafusion::catalog::{Session, TableProvider, TableProviderFactory};
24use datafusion::error::Result as DFResult;
25use datafusion::logical_expr::CreateExternalTable;
26use datafusion::sql::TableReference;
27use iceberg::arrow::schema_to_arrow_schema;
28use iceberg::io::FileIO;
29use iceberg::table::StaticTable;
30use iceberg::{Error, ErrorKind, Result, TableIdent};
31
32use super::IcebergTableProvider;
33use crate::to_datafusion_error;
34
/// DataFusion [`TableProviderFactory`] that produces [`IcebergTableProvider`] instances for
/// `CREATE EXTERNAL TABLE ... STORED AS ICEBERG` commands.
///
/// The `LOCATION` of the command is used as the path of the Iceberg table metadata file, and
/// the command's options are handed to the file IO builder as properties. A bare table name is
/// placed in the `default` namespace.
///
/// # Example
///
/// The following example registers the factory in a DataFusion session and then creates an
/// Iceberg external table with SQL:
///
/// ```
/// use std::sync::Arc;
///
/// use datafusion::execution::session_state::SessionStateBuilder;
/// use datafusion::prelude::*;
/// use datafusion::sql::TableReference;
/// use iceberg_datafusion::IcebergTableProviderFactory;
///
/// #[tokio::main]
/// async fn main() {
///     // Create a new session context
///     let mut state = SessionStateBuilder::new().with_default_features().build();
///
///     // Register the IcebergTableProviderFactory in the session
///     state.table_factories_mut().insert(
///         "ICEBERG".to_string(),
///         Arc::new(IcebergTableProviderFactory::new()),
///     );
///
///     let ctx = SessionContext::new_with_state(state);
///
///     // Define the table reference and the location of the Iceberg metadata file
///     let table_ref = TableReference::bare("my_iceberg_table");
///     // /path/to/iceberg/metadata
///     let metadata_file_path = format!(
///         "{}/testdata/table_metadata/{}",
///         env!("CARGO_MANIFEST_DIR"),
///         "TableMetadataV2.json"
///     );
///
///     // SQL command to create the Iceberg external table
///     let sql = format!(
///         "CREATE EXTERNAL TABLE {} STORED AS ICEBERG LOCATION '{}'",
///         table_ref, metadata_file_path
///     );
///
///     // Execute the SQL to create the external table
///     ctx.sql(&sql).await.expect("Failed to create table");
///
///     // Verify the table was created by retrieving the table provider
///     let table_provider = ctx
///         .table_provider(table_ref)
///         .await
///         .expect("Table not found");
///
///     println!("Iceberg external table created successfully.");
/// }
/// ```
///
/// # Note
/// This factory only exposes Iceberg tables that already exist: it reads the table from its
/// metadata file. Creating new tables through the external table command is not supported.
///
/// # Errors
/// Creating a table fails if the command uses an unsupported feature: explicit column
/// definitions, partition columns, order expressions, constraints, or column defaults.
#[derive(Debug, Default)]
pub struct IcebergTableProviderFactory {}
102
103impl IcebergTableProviderFactory {
104    pub fn new() -> Self {
105        Self {}
106    }
107}
108
109#[async_trait]
110impl TableProviderFactory for IcebergTableProviderFactory {
111    async fn create(
112        &self,
113        _state: &dyn Session,
114        cmd: &CreateExternalTable,
115    ) -> DFResult<Arc<dyn TableProvider>> {
116        check_cmd(cmd).map_err(to_datafusion_error)?;
117
118        let table_name = &cmd.name;
119        let metadata_file_path = &cmd.location;
120        let options = &cmd.options;
121
122        let table_name_with_ns = complement_namespace_if_necessary(table_name);
123
124        let table = create_static_table(table_name_with_ns, metadata_file_path, options)
125            .await
126            .map_err(to_datafusion_error)?
127            .into_table();
128
129        let schema = schema_to_arrow_schema(table.metadata().current_schema())
130            .map_err(to_datafusion_error)?;
131
132        Ok(Arc::new(IcebergTableProvider::new(table, Arc::new(schema))))
133    }
134}
135
136fn check_cmd(cmd: &CreateExternalTable) -> Result<()> {
137    let CreateExternalTable {
138        schema,
139        table_partition_cols,
140        order_exprs,
141        constraints,
142        column_defaults,
143        ..
144    } = cmd;
145
146    // Check if any of the fields violate the constraints in a single condition
147    let is_invalid = !schema.fields().is_empty()
148        || !table_partition_cols.is_empty()
149        || !order_exprs.is_empty()
150        || !constraints.is_empty()
151        || !column_defaults.is_empty();
152
153    if is_invalid {
154        return Err(Error::new(
155            ErrorKind::FeatureUnsupported,
156            "Currently we only support reading existing icebergs tables in external table command. To create new table, please use catalog provider.",
157        ));
158    }
159
160    Ok(())
161}
162
163/// Complements the namespace of a table name if necessary.
164///
165/// # Note
166/// If the table name is a bare name, it will be complemented with the 'default' namespace.
167/// Otherwise, it will be returned as is. Because Iceberg tables are always namespaced, but DataFusion
168/// external table commands maybe not include the namespace, this function ensures that the namespace is always present.
169///
170/// # See also
171/// - [`iceberg::NamespaceIdent`]
172/// - [`datafusion::sql::planner::SqlToRel::external_table_to_plan`]
173fn complement_namespace_if_necessary(table_name: &TableReference) -> Cow<'_, TableReference> {
174    match table_name {
175        TableReference::Bare { table } => {
176            Cow::Owned(TableReference::partial("default", table.as_ref()))
177        }
178        other => Cow::Borrowed(other),
179    }
180}
181
182async fn create_static_table(
183    table_name: Cow<'_, TableReference>,
184    metadata_file_path: &str,
185    props: &HashMap<String, String>,
186) -> Result<StaticTable> {
187    let table_ident = TableIdent::from_strs(table_name.to_vec())?;
188    let file_io = FileIO::from_path(metadata_file_path)?
189        .with_props(props)
190        .build()?;
191    StaticTable::from_metadata_file(metadata_file_path, table_ident, file_io).await
192}
193
#[cfg(test)]
mod tests {

    use datafusion::arrow::datatypes::{DataType, Field, Schema};
    use datafusion::catalog::TableProviderFactory;
    use datafusion::common::{Constraints, DFSchema};
    use datafusion::execution::session_state::SessionStateBuilder;
    use datafusion::logical_expr::CreateExternalTable;
    use datafusion::parquet::arrow::PARQUET_FIELD_ID_META_KEY;
    use datafusion::prelude::SessionContext;
    use datafusion::sql::TableReference;

    use super::*;

    /// Builds a non-nullable `Int64` field whose metadata holds the given Parquet field id.
    fn required_int64(name: &str, field_id: &str) -> Field {
        let metadata = HashMap::from([(
            PARQUET_FIELD_ID_META_KEY.to_string(),
            field_id.to_string(),
        )]);
        Field::new(name, DataType::Int64, false).with_metadata(metadata)
    }

    /// Arrow schema the tests expect for the `TableMetadataV2.json` fixture.
    fn table_metadata_v2_schema() -> Schema {
        let fields: Vec<Field> = [("x", "1"), ("y", "2"), ("z", "3")]
            .into_iter()
            .map(|(name, field_id)| required_int64(name, field_id))
            .collect();
        Schema::new(fields)
    }

    /// Path of the `TableMetadataV2.json` fixture inside this crate.
    fn table_metadata_location() -> String {
        concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/testdata/table_metadata/TableMetadataV2.json"
        )
        .to_string()
    }

    /// A minimal command that points at the metadata fixture and uses no optional clauses.
    fn create_external_table_cmd() -> CreateExternalTable {
        CreateExternalTable {
            name: TableReference::partial("static_ns", "static_table"),
            location: table_metadata_location(),
            schema: Arc::new(DFSchema::empty()),
            file_type: "iceberg".to_string(),
            options: Default::default(),
            table_partition_cols: Default::default(),
            order_exprs: Default::default(),
            constraints: Constraints::empty(),
            column_defaults: Default::default(),
            if_not_exists: Default::default(),
            temporary: false,
            definition: Default::default(),
            unbounded: Default::default(),
        }
    }

    #[tokio::test]
    async fn test_schema_of_created_table() {
        let session_state = SessionStateBuilder::new().build();
        let command = create_external_table_cmd();

        let provider = IcebergTableProviderFactory::new()
            .create(&session_state, &command)
            .await
            .expect("create table failed");

        let expected = table_metadata_v2_schema();
        let actual = provider.schema();

        assert_eq!(actual.as_ref(), &expected);
    }

    #[tokio::test]
    async fn test_schema_of_created_external_table_sql() {
        // Register the factory under the `ICEBERG` file type.
        let mut session_state = SessionStateBuilder::new().with_default_features().build();
        session_state.table_factories_mut().insert(
            "ICEBERG".to_string(),
            Arc::new(IcebergTableProviderFactory::new()),
        );
        let ctx = SessionContext::new_with_state(session_state);

        // All external tables in DataFusion use bare names.
        // See https://github.com/apache/datafusion/blob/main/datafusion/sql/src/statement.rs#L1038-#L1039
        let table_ref = TableReference::bare("static_table");

        // Create the external table
        let location = table_metadata_location();
        let sql =
            format!("CREATE EXTERNAL TABLE {table_ref} STORED AS ICEBERG LOCATION '{location}'");
        let _df = ctx.sql(&sql).await.expect("create table failed");

        // Get the created external table
        let provider = ctx
            .table_provider(table_ref)
            .await
            .expect("table not found");

        // Check the schema of the created table
        let expected = table_metadata_v2_schema();
        let actual = provider.schema();

        assert_eq!(actual.as_ref(), &expected);
    }
}