datafusion_common/nested_struct.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::error::{DataFusionError, Result, _plan_err};
19use arrow::{
20 array::{new_null_array, Array, ArrayRef, StructArray},
21 compute::cast,
22 datatypes::{DataType::Struct, Field, FieldRef},
23};
24use std::sync::Arc;
25
26/// Cast a struct column to match target struct fields, handling nested structs recursively.
27///
28/// This function implements struct-to-struct casting with the assumption that **structs should
29/// always be allowed to cast to other structs**. However, the source column must already be
30/// a struct type - non-struct sources will result in an error.
31///
32/// ## Field Matching Strategy
33/// - **By Name**: Source struct fields are matched to target fields by name (case-sensitive)
34/// - **Type Adaptation**: When a matching field is found, it is recursively cast to the target field's type
35/// - **Missing Fields**: Target fields not present in the source are filled with null values
36/// - **Extra Fields**: Source fields not present in the target are ignored
37///
38/// ## Nested Struct Handling
39/// - Nested structs are handled recursively using the same casting rules
40/// - Each level of nesting follows the same field matching and null-filling strategy
41/// - This allows for complex struct transformations while maintaining data integrity
42///
43/// # Arguments
44/// * `source_col` - The source array to cast (must be a struct array)
45/// * `target_fields` - The target struct field definitions to cast to
46///
47/// # Returns
48/// A `Result<ArrayRef>` containing the cast struct array
49///
50/// # Errors
51/// Returns a `DataFusionError::Plan` if the source column is not a struct type
52fn cast_struct_column(
53 source_col: &ArrayRef,
54 target_fields: &[Arc<Field>],
55) -> Result<ArrayRef> {
56 if let Some(struct_array) = source_col.as_any().downcast_ref::<StructArray>() {
57 let mut children: Vec<(Arc<Field>, Arc<dyn Array>)> = Vec::new();
58 let num_rows = source_col.len();
59
60 for target_child_field in target_fields {
61 let field_arc = Arc::clone(target_child_field);
62 match struct_array.column_by_name(target_child_field.name()) {
63 Some(source_child_col) => {
64 let adapted_child =
65 cast_column(source_child_col, target_child_field)?;
66 children.push((field_arc, adapted_child));
67 }
68 None => {
69 children.push((
70 field_arc,
71 new_null_array(target_child_field.data_type(), num_rows),
72 ));
73 }
74 }
75 }
76
77 let struct_array = StructArray::from(children);
78 Ok(Arc::new(struct_array))
79 } else {
80 // Return error if source is not a struct type
81 Err(DataFusionError::Plan(format!(
82 "Cannot cast column of type {:?} to struct type. Source must be a struct to cast to struct.",
83 source_col.data_type()
84 )))
85 }
86}
87
88/// Cast a column to match the target field type, with special handling for nested structs.
89///
90/// This function serves as the main entry point for column casting operations. For struct
91/// types, it enforces that **only struct columns can be cast to struct types**.
92///
93/// ## Casting Behavior
94/// - **Struct Types**: Delegates to `cast_struct_column` for struct-to-struct casting only
95/// - **Non-Struct Types**: Uses Arrow's standard `cast` function for primitive type conversions
96///
97/// ## Struct Casting Requirements
98/// The struct casting logic requires that the source column must already be a struct type.
99/// This makes the function useful for:
100/// - Schema evolution scenarios where struct layouts change over time
101/// - Data migration between different struct schemas
102/// - Type-safe data processing pipelines that maintain struct type integrity
103///
104/// # Arguments
105/// * `source_col` - The source array to cast
106/// * `target_field` - The target field definition (including type and metadata)
107///
108/// # Returns
109/// A `Result<ArrayRef>` containing the cast array
110///
111/// # Errors
112/// Returns an error if:
113/// - Attempting to cast a non-struct column to a struct type
114/// - Arrow's cast function fails for non-struct types
115/// - Memory allocation fails during struct construction
116/// - Invalid data type combinations are encountered
117pub fn cast_column(source_col: &ArrayRef, target_field: &Field) -> Result<ArrayRef> {
118 match target_field.data_type() {
119 Struct(target_fields) => cast_struct_column(source_col, target_fields),
120 _ => Ok(cast(source_col, target_field.data_type())?),
121 }
122}
123
124/// Validates compatibility between source and target struct fields for casting operations.
125///
126/// This function implements comprehensive struct compatibility checking by examining:
127/// - Field name matching between source and target structs
128/// - Type castability for each matching field (including recursive struct validation)
129/// - Proper handling of missing fields (target fields not in source are allowed - filled with nulls)
130/// - Proper handling of extra fields (source fields not in target are allowed - ignored)
131///
132/// # Compatibility Rules
133/// - **Field Matching**: Fields are matched by name (case-sensitive)
134/// - **Missing Target Fields**: Allowed - will be filled with null values during casting
135/// - **Extra Source Fields**: Allowed - will be ignored during casting
136/// - **Type Compatibility**: Each matching field must be castable using Arrow's type system
137/// - **Nested Structs**: Recursively validates nested struct compatibility
138///
139/// # Arguments
140/// * `source_fields` - Fields from the source struct type
141/// * `target_fields` - Fields from the target struct type
142///
143/// # Returns
144/// * `Ok(true)` if the structs are compatible for casting
145/// * `Err(DataFusionError)` with detailed error message if incompatible
146///
147/// # Examples
148/// ```text
149/// // Compatible: source has extra field, target has missing field
150/// // Source: {a: i32, b: string, c: f64}
151/// // Target: {a: i64, d: bool}
152/// // Result: Ok(true) - 'a' can cast i32->i64, 'b','c' ignored, 'd' filled with nulls
153///
154/// // Incompatible: matching field has incompatible types
155/// // Source: {a: string}
156/// // Target: {a: binary}
157/// // Result: Err(...) - string cannot cast to binary
158/// ```
159pub fn validate_struct_compatibility(
160 source_fields: &[FieldRef],
161 target_fields: &[FieldRef],
162) -> Result<bool> {
163 // Check compatibility for each target field
164 for target_field in target_fields {
165 // Look for matching field in source by name
166 if let Some(source_field) = source_fields
167 .iter()
168 .find(|f| f.name() == target_field.name())
169 {
170 // Check if the matching field types are compatible
171 match (source_field.data_type(), target_field.data_type()) {
172 // Recursively validate nested structs
173 (Struct(source_nested), Struct(target_nested)) => {
174 validate_struct_compatibility(source_nested, target_nested)?;
175 }
176 // For non-struct types, use the existing castability check
177 _ => {
178 if !arrow::compute::can_cast_types(
179 source_field.data_type(),
180 target_field.data_type(),
181 ) {
182 return _plan_err!(
183 "Cannot cast struct field '{}' from type {:?} to type {:?}",
184 target_field.name(),
185 source_field.data_type(),
186 target_field.data_type()
187 );
188 }
189 }
190 }
191 }
192 // Missing fields in source are OK - they'll be filled with nulls
193 }
194
195 // Extra fields in source are OK - they'll be ignored
196 Ok(true)
197}
198
#[cfg(test)]
mod tests {
    use super::*;
    use arrow::{
        array::{Int32Array, Int64Array, StringArray},
        datatypes::{DataType, Field},
    };

    /// Fetches the child column `name` from `parent` and downcasts it to `T`.
    ///
    /// Panics if the column does not exist or has a different concrete array type.
    fn column_as<'a, T: 'static>(parent: &'a StructArray, name: &str) -> &'a T {
        parent
            .column_by_name(name)
            .unwrap()
            .as_any()
            .downcast_ref::<T>()
            .unwrap()
    }

    /// Builds a shared, nullable field with the given name and type.
    fn nullable(name: &str, data_type: DataType) -> Arc<Field> {
        Arc::new(Field::new(name, data_type, true))
    }

    /// A primitive column is cast with Arrow's regular cast (Int32 -> Int64).
    #[test]
    fn test_cast_simple_column() {
        let source: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
        let target_field = Field::new("ints", DataType::Int64, true);

        let cast_result = cast_column(&source, &target_field).unwrap();
        let ints = cast_result.as_any().downcast_ref::<Int64Array>().unwrap();

        assert_eq!(ints.len(), 3);
        let values: Vec<i64> = (0..3).map(|i| ints.value(i)).collect();
        assert_eq!(values, vec![1, 2, 3]);
    }

    /// A target field that the source struct lacks comes back as an all-null column.
    #[test]
    fn test_cast_struct_with_missing_field() {
        let a_values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
        let source_col: ArrayRef = Arc::new(StructArray::from(vec![(
            nullable("a", DataType::Int32),
            a_values,
        )]));

        let target_children = vec![
            nullable("a", DataType::Int32),
            nullable("b", DataType::Utf8),
        ];
        let target_field = Field::new("s", Struct(target_children.into()), true);

        let cast_result = cast_column(&source_col, &target_field).unwrap();
        let struct_array = cast_result.as_any().downcast_ref::<StructArray>().unwrap();
        assert_eq!(struct_array.fields().len(), 2);

        // The existing child keeps its values.
        let a_result = column_as::<Int32Array>(struct_array, "a");
        assert_eq!(a_result.value(0), 1);
        assert_eq!(a_result.value(1), 2);

        // The missing child is null in every row.
        let b_result = column_as::<StringArray>(struct_array, "b");
        assert_eq!(b_result.len(), 2);
        for row in 0..2 {
            assert!(b_result.is_null(row));
        }
    }

    /// Casting a non-struct column to a struct type is rejected with a descriptive error.
    #[test]
    fn test_cast_struct_source_not_struct() {
        let source: ArrayRef = Arc::new(Int32Array::from(vec![10, 20]));
        let target_field = Field::new(
            "s",
            Struct(vec![nullable("a", DataType::Int32)].into()),
            true,
        );

        let error_msg = cast_column(&source, &target_field)
            .unwrap_err()
            .to_string();

        for fragment in [
            "Cannot cast column of type",
            "to struct type",
            "Source must be a struct",
        ] {
            assert!(error_msg.contains(fragment));
        }
    }

    /// A matching field whose types cannot be cast makes validation fail.
    #[test]
    fn test_validate_struct_compatibility_incompatible_types() {
        // Source struct: {field1: Binary, field2: String}
        let source_fields = vec![
            nullable("field1", DataType::Binary),
            nullable("field2", DataType::Utf8),
        ];
        // Target struct: {field1: Int32}
        let target_fields = vec![nullable("field1", DataType::Int32)];

        let error_msg = validate_struct_compatibility(&source_fields, &target_fields)
            .unwrap_err()
            .to_string();

        for fragment in ["Cannot cast struct field 'field1'", "Binary", "Int32"] {
            assert!(error_msg.contains(fragment));
        }
    }

    /// A matching field with castable types passes; the extra source field is ignored.
    #[test]
    fn test_validate_struct_compatibility_compatible_types() {
        // Source struct: {field1: Int32, field2: String}
        let source_fields = vec![
            nullable("field1", DataType::Int32),
            nullable("field2", DataType::Utf8),
        ];
        // Target struct: {field1: Int64} (Int32 can cast to Int64)
        let target_fields = vec![nullable("field1", DataType::Int64)];

        let compatible =
            validate_struct_compatibility(&source_fields, &target_fields).unwrap();
        assert!(compatible);
    }

    /// A target field absent from the source is accepted, since it will be null-filled.
    #[test]
    fn test_validate_struct_compatibility_missing_field_in_source() {
        // Source struct: {field2: String} (missing field1)
        let source_fields = vec![nullable("field2", DataType::Utf8)];
        // Target struct: {field1: Int32}
        let target_fields = vec![nullable("field1", DataType::Int32)];

        let compatible =
            validate_struct_compatibility(&source_fields, &target_fields).unwrap();
        assert!(compatible);
    }
}