mod column_filter;
use std::borrow::Cow;
use std::sync::Arc;
use column_filter::StatsColumnFilter;
pub(crate) use column_filter::StatsConfig;
use crate::schema::{
ArrayType, ColumnName, DataType, MapType, PrimitiveType, Schema, SchemaRef, StructField,
StructType,
};
use crate::transforms::SchemaTransform;
use crate::{DeltaResult, Error};
/// Builds the schema of the per-file statistics struct expected for `data_schema`.
///
/// The result always starts with a nullable `numRecords` (LONG) field and always ends with a
/// nullable `tightBounds` (BOOLEAN) field. Between them, up to three per-column sections are
/// added, each only when its transform yields a schema:
///
/// * `nullCount`: the filtered data schema run through `NullCountStatsTransform`.
/// * `minValues` / `maxValues`: one shared schema, the filtered data schema run through
///   `MinMaxStatsTransform`.
///
/// `config`, `required_columns` and `requested_columns` are forwarded unchanged to
/// `BaseStatsTransform::new`, which performs the column filtering.
///
/// # Errors
///
/// Propagates any error returned by `StructType::try_new` for the assembled field list.
#[allow(unused)]
pub(crate) fn expected_stats_schema(
    data_schema: &Schema,
    config: &StatsConfig<'_>,
    required_columns: Option<&[ColumnName]>,
    requested_columns: Option<&[ColumnName]>,
) -> DeltaResult<Schema> {
    // Upper bound on top-level entries: numRecords, nullCount, minValues, maxValues, tightBounds.
    let mut stats_fields = Vec::with_capacity(5);
    stats_fields.push(StructField::nullable("numRecords", DataType::LONG));

    let mut column_selector = BaseStatsTransform::new(config, required_columns, requested_columns);
    let selected = column_selector
        .transform_struct(data_schema)
        .map(|schema| schema.into_owned());

    // When the filtering transform yields nothing, every per-column section is omitted.
    if let Some(selected) = selected {
        let null_counts = NullCountStatsTransform
            .transform_struct(&selected)
            .map(|schema| schema.into_owned());
        if let Some(null_counts) = null_counts {
            stats_fields.push(StructField::nullable("nullCount", null_counts));
        }

        let bounds = MinMaxStatsTransform
            .transform_struct(&selected)
            .map(|schema| schema.into_owned());
        if let Some(bounds) = bounds {
            // Min and max share the same shape; one clone serves both sections.
            stats_fields.extend([
                StructField::nullable("minValues", bounds.clone()),
                StructField::nullable("maxValues", bounds),
            ]);
        }
    }

    stats_fields.push(StructField::nullable("tightBounds", DataType::BOOLEAN));
    StructType::try_new(stats_fields)
}
/// Returns the column names that `StatsColumnFilter::collect_columns` reports for
/// `data_schema`.
///
/// The filter is built from `config` and `required_columns`; no requested-column restriction is
/// applied (the third constructor argument is always `None`).
#[allow(unused)]
pub(crate) fn stats_column_names(
    data_schema: &Schema,
    config: &StatsConfig<'_>,
    required_columns: Option<&[ColumnName]>,
) -> Vec<ColumnName> {
    let mut collected = Vec::new();
    let mut column_filter = StatsColumnFilter::new(config, required_columns, None);
    column_filter.collect_columns(data_schema, &mut collected);
    collected
}
/// Builds a shared stats schema for the columns in `referenced_schema`.
///
/// The returned struct has four nullable fields: `numRecords` (LONG), `nullCount`, `minValues`
/// and `maxValues`. `minValues` and `maxValues` use `referenced_schema` with every field made
/// nullable; `nullCount` is that same schema run through `NullCountStatsTransform`. Field
/// metadata is removed from the final schema by `StripFieldMetadataTransform`.
///
/// Returns `None` when the nullable conversion fails or when the null-count transform yields
/// nothing.
pub(crate) fn build_stats_schema(referenced_schema: &StructType) -> Option<SchemaRef> {
    let min_max_schema = schema_with_all_fields_nullable(referenced_schema).ok()?;
    let null_count_schema = NullCountStatsTransform
        .transform_struct(&min_max_schema)?
        .into_owned();

    let with_metadata = StructType::new_unchecked([
        StructField::nullable("numRecords", DataType::LONG),
        StructField::nullable("nullCount", null_count_schema),
        StructField::nullable("minValues", min_max_schema.clone()),
        StructField::nullable("maxValues", min_max_schema),
    ]);

    // Materialize the stripped copy first so the borrow of `with_metadata` has ended before it
    // is moved into the fallback position.
    let stripped = StripFieldMetadataTransform
        .transform_struct(&with_metadata)
        .map(|schema| schema.into_owned());
    Some(Arc::new(stripped.unwrap_or(with_metadata)))
}
/// Schema transform that clears the metadata of every struct field it visits.
pub(crate) struct StripFieldMetadataTransform;

impl<'a> SchemaTransform<'a> for StripFieldMetadataTransform {
    /// Returns `field` with empty metadata and its (recursively transformed) data type.
    ///
    /// The original field is borrowed when nothing changed, i.e. its data type came back
    /// borrowed and it carries no metadata; otherwise a new owned field is built.
    fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
        let data_type = self.transform(&field.data_type)?;

        let untouched = matches!(data_type, Cow::Borrowed(_)) && field.metadata.is_empty();
        if untouched {
            return Some(Cow::Borrowed(field));
        }

        Some(Cow::Owned(StructField {
            name: field.name.clone(),
            data_type: data_type.into_owned(),
            nullable: field.is_nullable(),
            metadata: Default::default(),
        }))
    }
}
pub(crate) fn schema_with_all_fields_nullable(schema: &Schema) -> DeltaResult<Schema> {
match NullableStatsTransform.transform_struct(schema) {
Some(schema) => Ok(schema.into_owned()),
None => Err(Error::internal_error("NullableStatsTransform failed")),
}
}
/// Schema transform that makes every visited struct field nullable.
pub(crate) struct NullableStatsTransform;

impl<'a> SchemaTransform<'a> for NullableStatsTransform {
    /// Transforms the field's data type, then rebuilds the field as nullable via
    /// `make_nullable_field`. Yields `None` when the data type transform does.
    fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
        self.transform(&field.data_type)
            .map(|data_type| make_nullable_field(field, data_type))
    }
}
/// Produces a nullable version of `field` carrying `data_type`.
///
/// The original field is borrowed when no change is needed (the data type is borrowed and the
/// field is already nullable). Otherwise a new field is built with the same name and metadata,
/// the given data type, and `nullable` set to `true`.
fn make_nullable_field<'a>(
    field: &'a StructField,
    data_type: Cow<'a, DataType>,
) -> Cow<'a, StructField> {
    let unchanged = matches!(data_type, Cow::Borrowed(_)) && field.is_nullable();
    if unchanged {
        return Cow::Borrowed(field);
    }

    Cow::Owned(StructField {
        name: field.name.clone(),
        data_type: data_type.into_owned(),
        nullable: true,
        metadata: field.metadata.clone(),
    })
}
/// Schema transform that turns a schema into its null-count shape: every non-struct field
/// becomes a nullable LONG, while struct fields are recursed into.
pub(crate) struct NullCountStatsTransform;

impl<'a> SchemaTransform<'a> for NullCountStatsTransform {
    /// Recurses into struct-typed fields; replaces any other field with a nullable LONG field of
    /// the same name and metadata.
    fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
        if matches!(&field.data_type, DataType::Struct(_)) {
            return self.recurse_into_struct_field(field);
        }

        // Any non-struct field is treated as a leaf, whatever its original type.
        let counter = StructField {
            name: field.name.clone(),
            data_type: DataType::LONG,
            nullable: true,
            metadata: field.metadata.clone(),
        };
        Some(Cow::Owned(counter))
    }
}
/// Schema transform that produces the base stats schema: it keeps only the columns accepted by
/// its `StatsColumnFilter` and makes the surviving fields nullable.
#[allow(unused)]
struct BaseStatsTransform<'col> {
    /// Decides, per column, whether it is included.
    filter: StatsColumnFilter<'col>,
}

impl<'col> BaseStatsTransform<'col> {
    /// Creates a transform whose filter is built from `config`, `required_columns` and
    /// `requested_columns`; all three are forwarded unchanged to `StatsColumnFilter::new`.
    #[allow(unused)]
    fn new(
        config: &StatsConfig<'col>,
        required_columns: Option<&'col [ColumnName]>,
        requested_columns: Option<&'col [ColumnName]>,
    ) -> Self {
        let filter = StatsColumnFilter::new(config, required_columns, requested_columns);
        Self { filter }
    }
}
impl<'a> SchemaTransform<'a> for BaseStatsTransform<'_> {
    /// Transforms the field's data type between `enter_field` / `exit_field` calls on the
    /// filter, then makes the surviving field nullable.
    fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
        self.filter.enter_field(field.name());
        let transformed = self.transform(&field.data_type);
        // Leave the field before bailing out so enter/exit calls stay balanced.
        self.filter.exit_field();

        transformed.map(|data_type| make_nullable_field(field, data_type))
    }

    /// Keeps a primitive only when both the table-level and the requested-column checks pass.
    ///
    /// A primitive that passes the table-level check is recorded as included even if the
    /// requested-column check then drops it.
    fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option<Cow<'a, PrimitiveType>> {
        if self.filter.should_include_for_table() {
            self.filter.record_included();
            if self.filter.should_include_for_requested() {
                return Some(Cow::Borrowed(ptype));
            }
        }
        None
    }

    /// Array columns are dropped from the base stats schema.
    fn transform_array(&mut self, _: &'a ArrayType) -> Option<Cow<'a, ArrayType>> {
        None
    }

    /// Map columns are dropped from the base stats schema.
    fn transform_map(&mut self, _: &'a MapType) -> Option<Cow<'a, MapType>> {
        None
    }

    /// Variant columns are dropped from the base stats schema.
    fn transform_variant(&mut self, _: &'a StructType) -> Option<Cow<'a, StructType>> {
        None
    }
}
/// Schema transform that keeps only the fields for which min/max statistics apply: primitives
/// accepted by `is_skipping_eligible_datatype`. Arrays, maps and variants are dropped.
#[allow(unused)]
struct MinMaxStatsTransform;

impl<'a> SchemaTransform<'a> for MinMaxStatsTransform {
    /// Keeps the primitive only if its type is eligible for data skipping.
    fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option<Cow<'a, PrimitiveType>> {
        if is_skipping_eligible_datatype(ptype) {
            Some(Cow::Borrowed(ptype))
        } else {
            None
        }
    }

    /// Arrays never carry min/max statistics.
    fn transform_array(&mut self, _: &'a ArrayType) -> Option<Cow<'a, ArrayType>> {
        None
    }

    /// Maps never carry min/max statistics.
    fn transform_map(&mut self, _: &'a MapType) -> Option<Cow<'a, MapType>> {
        None
    }

    /// Variants never carry min/max statistics.
    fn transform_variant(&mut self, _: &'a StructType) -> Option<Cow<'a, StructType>> {
        None
    }
}
/// Returns `true` when `data_type` is one of the primitive types accepted for min/max
/// statistics: the integral types, floating-point types, date and timestamp types, strings and
/// decimals. Every other primitive yields `false`.
pub(crate) fn is_skipping_eligible_datatype(data_type: &PrimitiveType) -> bool {
    match data_type {
        // Integral types.
        PrimitiveType::Byte
        | PrimitiveType::Short
        | PrimitiveType::Integer
        | PrimitiveType::Long => true,
        // Floating-point and decimal types.
        PrimitiveType::Float | PrimitiveType::Double | PrimitiveType::Decimal(_) => true,
        // Temporal types.
        PrimitiveType::Date | PrimitiveType::Timestamp | PrimitiveType::TimestampNtz => true,
        PrimitiveType::String => true,
        _ => false,
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::schema::ArrayType;
use crate::table_properties::TableProperties;
/// Test helper: builds a `StatsConfig` from the two data-skipping settings held by `properties`.
fn stats_config_from_table_properties(properties: &TableProperties) -> StatsConfig<'_> {
    let data_skipping_stats_columns = properties.data_skipping_stats_columns.as_deref();
    let data_skipping_num_indexed_cols = properties.data_skipping_num_indexed_cols;
    StatsConfig {
        data_skipping_stats_columns,
        data_skipping_num_indexed_cols,
    }
}
/// Test helper: assembles the full five-field stats schema from a null-count schema and a
/// min/max schema (used for both `minValues` and `maxValues`).
fn expected_stats(null_count: StructType, min_max: StructType) -> StructType {
    let max_values = min_max.clone();
    let min_values = min_max;
    StructType::new_unchecked([
        StructField::nullable("numRecords", DataType::LONG),
        StructField::nullable("nullCount", null_count),
        StructField::nullable("minValues", min_values),
        StructField::nullable("maxValues", max_values),
        StructField::nullable("tightBounds", DataType::BOOLEAN),
    ])
}
// A single nullable LONG column: null-count and min/max schemas both equal the input schema.
#[test]
fn test_stats_schema_simple() {
    let table_props: TableProperties = [("key", "value")].into();
    let config = stats_config_from_table_properties(&table_props);
    let schema = StructType::new_unchecked([StructField::nullable("id", DataType::LONG)]);

    let actual = expected_stats_schema(&schema, &config, None, None).unwrap();

    let expected = expected_stats(schema.clone(), schema);
    assert_eq!(&expected, &actual);
}
// Nested, non-nullable input: the stats schema mirrors the nesting with every field nullable.
#[test]
fn test_stats_schema_nested() {
    let properties: TableProperties = [("key", "value")].into();
    let config = stats_config_from_table_properties(&properties);

    let user_struct = StructType::new_unchecked([
        StructField::not_null("name", DataType::STRING),
        StructField::nullable("age", DataType::INTEGER),
    ]);
    // `user_struct` is not needed afterwards, so it is moved rather than cloned.
    let file_schema = StructType::new_unchecked([
        StructField::not_null("id", DataType::LONG),
        StructField::not_null("user", DataType::Struct(Box::new(user_struct))),
    ]);

    let stats_schema = expected_stats_schema(&file_schema, &config, None, None).unwrap();

    // Expected min/max shape: the input schema with every field made nullable.
    let expected_min_max = NullableStatsTransform
        .transform_struct(&file_schema)
        .unwrap()
        .into_owned();
    let null_count = NullCountStatsTransform
        .transform_struct(&expected_min_max)
        .unwrap()
        .into_owned();

    let expected = expected_stats(null_count, expected_min_max);
    assert_eq!(&expected, &stats_schema);
}
// An array column nested inside a struct is dropped from both null-count and min/max schemas.
#[test]
fn test_stats_schema_with_non_eligible_field() {
    let properties: TableProperties = [("key", "value")].into();
    let config = stats_config_from_table_properties(&properties);

    let array_type = DataType::Array(Box::new(ArrayType::new(DataType::STRING, false)));
    let metadata_struct = StructType::new_unchecked([
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("tags", array_type),
        StructField::nullable("score", DataType::DOUBLE),
    ]);
    // `metadata_struct` is not needed afterwards, so it is moved rather than cloned.
    let file_schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("metadata", DataType::Struct(Box::new(metadata_struct))),
    ]);

    let stats_schema = expected_stats_schema(&file_schema, &config, None, None).unwrap();

    let expected_null_nested = StructType::new_unchecked([
        StructField::nullable("name", DataType::LONG),
        StructField::nullable("score", DataType::LONG),
    ]);
    let expected_null = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("metadata", DataType::Struct(Box::new(expected_null_nested))),
    ]);

    let expected_nested = StructType::new_unchecked([
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("score", DataType::DOUBLE),
    ]);
    let expected_fields = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("metadata", DataType::Struct(Box::new(expected_nested))),
    ]);

    let expected = expected_stats(expected_null, expected_fields);
    assert_eq!(&expected, &stats_schema);
}
// `delta.dataSkippingStatsColumns` with a back-quoted name containing a dot selects only the
// nested `name` leaf under the `user.info` struct.
#[test]
fn test_stats_schema_col_names() {
    let properties: TableProperties = [(
        "delta.dataSkippingStatsColumns".to_string(),
        "`user.info`.name".to_string(),
    )]
    .into();
    let config = stats_config_from_table_properties(&properties);

    let user_struct = StructType::new_unchecked([
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("age", DataType::INTEGER),
    ]);
    // `user_struct` is not needed afterwards, so it is moved rather than cloned.
    let file_schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("user.info", DataType::Struct(Box::new(user_struct))),
    ]);

    let stats_schema = expected_stats_schema(&file_schema, &config, None, None).unwrap();

    let expected_nested =
        StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]);
    let expected_fields = StructType::new_unchecked([StructField::nullable(
        "user.info",
        DataType::Struct(Box::new(expected_nested)),
    )]);
    let null_count = NullCountStatsTransform
        .transform_struct(&expected_fields)
        .unwrap()
        .into_owned();

    let expected = expected_stats(null_count, expected_fields);
    assert_eq!(&expected, &stats_schema);
}
// `delta.dataSkippingNumIndexedCols = 1` keeps only the first column.
#[test]
fn test_stats_schema_n_cols() {
    let table_props: TableProperties = [(
        "delta.dataSkippingNumIndexedCols".to_string(),
        "1".to_string(),
    )]
    .into();
    let config = stats_config_from_table_properties(&table_props);

    let schema = StructType::new_unchecked([
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("age", DataType::INTEGER),
    ]);

    let actual = expected_stats_schema(&schema, &config, None, None).unwrap();

    let min_max =
        StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]);
    let null_count = NullCountStatsTransform
        .transform_struct(&min_max)
        .unwrap()
        .into_owned();

    let expected = expected_stats(null_count, min_max);
    assert_eq!(&expected, &actual);
}
// BOOLEAN and BINARY columns get null counts but no min/max entries.
#[test]
fn test_stats_schema_different_fields_in_null_vs_minmax() {
    let table_props: TableProperties = [("key", "value")].into();
    let config = stats_config_from_table_properties(&table_props);

    let schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("is_active", DataType::BOOLEAN),
        StructField::nullable("metadata", DataType::BINARY),
    ]);

    let actual = expected_stats_schema(&schema, &config, None, None).unwrap();

    let null_count = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("is_active", DataType::LONG),
        StructField::nullable("metadata", DataType::LONG),
    ]);
    let min_max = StructType::new_unchecked([StructField::nullable("id", DataType::LONG)]);

    let expected = expected_stats(null_count, min_max);
    assert_eq!(&expected, &actual);
}
// Same as the flat case, but with non-eligible leaves both inside a nested struct and at the
// top level: they keep their null counts and are absent from min/max.
#[test]
fn test_stats_schema_nested_different_fields_in_null_vs_minmax() {
    let properties: TableProperties = [("key", "value")].into();
    let config = stats_config_from_table_properties(&properties);

    let user_struct = StructType::new_unchecked([
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("is_admin", DataType::BOOLEAN),
        StructField::nullable("age", DataType::INTEGER),
        StructField::nullable("profile_pic", DataType::BINARY),
    ]);
    // `user_struct` is not needed afterwards, so it is moved rather than cloned.
    let file_schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("user", DataType::Struct(Box::new(user_struct))),
        StructField::nullable("is_deleted", DataType::BOOLEAN),
    ]);

    let stats_schema = expected_stats_schema(&file_schema, &config, None, None).unwrap();

    let expected_null_user = StructType::new_unchecked([
        StructField::nullable("name", DataType::LONG),
        StructField::nullable("is_admin", DataType::LONG),
        StructField::nullable("age", DataType::LONG),
        StructField::nullable("profile_pic", DataType::LONG),
    ]);
    let expected_null_count = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("user", DataType::Struct(Box::new(expected_null_user))),
        StructField::nullable("is_deleted", DataType::LONG),
    ]);

    let expected_minmax_user = StructType::new_unchecked([
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("age", DataType::INTEGER),
    ]);
    let expected_min_max = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("user", DataType::Struct(Box::new(expected_minmax_user))),
    ]);

    let expected = expected_stats(expected_null_count, expected_min_max);
    assert_eq!(&expected, &stats_schema);
}
// With no min/max-eligible column, `minValues` and `maxValues` are omitted entirely.
#[test]
fn test_stats_schema_only_non_eligible_fields() {
    let table_props: TableProperties = [("key", "value")].into();
    let config = stats_config_from_table_properties(&table_props);

    let tags_type = DataType::Array(Box::new(ArrayType::new(DataType::STRING, false)));
    let schema = StructType::new_unchecked([
        StructField::nullable("is_active", DataType::BOOLEAN),
        StructField::nullable("metadata", DataType::BINARY),
        StructField::nullable("tags", tags_type),
    ]);

    let actual = expected_stats_schema(&schema, &config, None, None).unwrap();

    let null_count = StructType::new_unchecked([
        StructField::nullable("is_active", DataType::LONG),
        StructField::nullable("metadata", DataType::LONG),
    ]);
    let expected = StructType::new_unchecked([
        StructField::nullable("numRecords", DataType::LONG),
        StructField::nullable("nullCount", null_count),
        StructField::nullable("tightBounds", DataType::BOOLEAN),
    ]);
    assert_eq!(&expected, &actual);
}
// Leading array and map columns do not consume the `dataSkippingNumIndexedCols = 2` budget:
// `col1` and `col2` are still included, `col3` is not.
#[test]
fn test_stats_schema_map_array_dont_count_against_limit() {
    let table_props: TableProperties = [(
        "delta.dataSkippingNumIndexedCols".to_string(),
        "2".to_string(),
    )]
    .into();
    let config = stats_config_from_table_properties(&table_props);

    let tags_type = DataType::Array(Box::new(ArrayType::new(DataType::STRING, false)));
    let metadata_type = DataType::Map(Box::new(MapType::new(
        DataType::STRING,
        DataType::STRING,
        true,
    )));
    let schema = StructType::new_unchecked([
        StructField::nullable("tags", tags_type),
        StructField::nullable("metadata", metadata_type),
        StructField::nullable("col1", DataType::LONG),
        StructField::nullable("col2", DataType::STRING),
        StructField::nullable("col3", DataType::INTEGER),
    ]);

    let actual = expected_stats_schema(&schema, &config, None, None).unwrap();

    let null_count = StructType::new_unchecked([
        StructField::nullable("col1", DataType::LONG),
        StructField::nullable("col2", DataType::LONG),
    ]);
    let min_max = StructType::new_unchecked([
        StructField::nullable("col1", DataType::LONG),
        StructField::nullable("col2", DataType::STRING),
    ]);

    let expected = expected_stats(null_count, min_max);
    assert_eq!(&expected, &actual);
}
// With default settings every leaf column is reported, nested leaves as full paths.
#[test]
fn test_stats_column_names_default() {
    let properties: TableProperties = [("key", "value")].into();

    let user_struct = StructType::new_unchecked([
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("age", DataType::INTEGER),
    ]);
    let file_schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("user", DataType::Struct(Box::new(user_struct))),
    ]);

    // Built through the shared helper so all tests derive their config the same way.
    let config = stats_config_from_table_properties(&properties);
    let columns = stats_column_names(&file_schema, &config, None);

    assert_eq!(
        columns,
        vec![
            ColumnName::new(["id"]),
            ColumnName::new(["user", "name"]),
            ColumnName::new(["user", "age"]),
        ]
    );
}
// `dataSkippingNumIndexedCols = 2` limits the reported columns to the first two.
#[test]
fn test_stats_column_names_with_num_indexed_cols() {
    let properties: TableProperties = [(
        "delta.dataSkippingNumIndexedCols".to_string(),
        "2".to_string(),
    )]
    .into();

    let file_schema = StructType::new_unchecked([
        StructField::nullable("a", DataType::LONG),
        StructField::nullable("b", DataType::STRING),
        StructField::nullable("c", DataType::INTEGER),
        StructField::nullable("d", DataType::DOUBLE),
    ]);

    // Built through the shared helper so all tests derive their config the same way.
    let config = stats_config_from_table_properties(&properties);
    let columns = stats_column_names(&file_schema, &config, None);

    assert_eq!(
        columns,
        vec![ColumnName::new(["a"]), ColumnName::new(["b"])]
    );
}
// `dataSkippingStatsColumns = "id,user.age"` reports exactly those two columns.
#[test]
fn test_stats_column_names_with_stats_columns() {
    let properties: TableProperties = [(
        "delta.dataSkippingStatsColumns".to_string(),
        "id,user.age".to_string(),
    )]
    .into();

    let user_struct = StructType::new_unchecked([
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("age", DataType::INTEGER),
    ]);
    let file_schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("user", DataType::Struct(Box::new(user_struct))),
        StructField::nullable("extra", DataType::STRING),
    ]);

    // Built through the shared helper so all tests derive their config the same way.
    let config = stats_config_from_table_properties(&properties);
    let columns = stats_column_names(&file_schema, &config, None);

    assert_eq!(
        columns,
        vec![ColumnName::new(["id"]), ColumnName::new(["user", "age"])]
    );
}
// Array and map columns are never reported as stats columns.
#[test]
fn test_stats_column_names_skips_non_eligible_types() {
    let properties: TableProperties = [("key", "value")].into();

    let file_schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable(
            "tags",
            DataType::Array(Box::new(ArrayType::new(DataType::STRING, false))),
        ),
        StructField::nullable(
            "metadata",
            DataType::Map(Box::new(MapType::new(
                DataType::STRING,
                DataType::STRING,
                true,
            ))),
        ),
        StructField::nullable("name", DataType::STRING),
    ]);

    // Built through the shared helper so all tests derive their config the same way.
    let config = stats_config_from_table_properties(&properties);
    let columns = stats_column_names(&file_schema, &config, None);

    assert_eq!(
        columns,
        vec![ColumnName::new(["id"]), ColumnName::new(["name"])]
    );
}
// A required (clustering) column is included even though it lies beyond the
// `dataSkippingNumIndexedCols = 1` limit.
#[test]
fn test_stats_schema_with_clustering_past_limit() {
    let table_props: TableProperties = [(
        "delta.dataSkippingNumIndexedCols".to_string(),
        "1".to_string(),
    )]
    .into();
    let config = stats_config_from_table_properties(&table_props);

    let schema = StructType::new_unchecked([
        StructField::nullable("a", DataType::LONG),
        StructField::nullable("b", DataType::STRING),
        StructField::nullable("c", DataType::INTEGER),
    ]);
    let clustering_columns = vec![ColumnName::new(["c"])];

    let actual =
        expected_stats_schema(&schema, &config, Some(&clustering_columns), None).unwrap();

    let null_count = StructType::new_unchecked([
        StructField::nullable("a", DataType::LONG),
        StructField::nullable("c", DataType::LONG),
    ]);
    let min_max = StructType::new_unchecked([
        StructField::nullable("a", DataType::LONG),
        StructField::nullable("c", DataType::INTEGER),
    ]);

    let expected = expected_stats(null_count, min_max);
    assert_eq!(&expected, &actual);
}
// Requesting only `id` narrows the stats schema to that single column.
#[test]
fn test_requested_filters_to_single_column() {
    let table_props: TableProperties = [("key", "value")].into();
    let config = stats_config_from_table_properties(&table_props);

    let schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("value", DataType::INTEGER),
    ]);
    let requested = [ColumnName::new(["id"])];

    let actual = expected_stats_schema(&schema, &config, None, Some(&requested)).unwrap();

    // `id` is LONG, so its null-count and min/max schemas coincide.
    let id_only = StructType::new_unchecked([StructField::nullable("id", DataType::LONG)]);
    let expected = expected_stats(id_only.clone(), id_only);
    assert_eq!(&expected, &actual);
}
// Passing `None` for the requested columns keeps every column in `minValues`.
#[test]
fn test_none_requested_returns_full_schema() {
    let table_props: TableProperties = [("key", "value")].into();
    let config = stats_config_from_table_properties(&table_props);

    let schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("name", DataType::STRING),
    ]);

    let with_none = expected_stats_schema(&schema, &config, None, None).unwrap();

    let min_values = with_none.field("minValues").expect("should have minValues");
    match min_values.data_type() {
        DataType::Struct(inner) => {
            assert!(inner.field("id").is_some());
            assert!(inner.field("name").is_some());
        }
        _ => panic!("minValues should be a struct"),
    }
}
// A requested column that falls outside the indexed-column limit yields no per-column stats.
#[test]
fn test_requested_column_outside_limit_excluded() {
    let table_props: TableProperties = [("delta.dataSkippingNumIndexedCols", "1")].into();
    let config = stats_config_from_table_properties(&table_props);

    let schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("name", DataType::STRING),
    ]);
    let requested = [ColumnName::new(["name"])];

    let actual = expected_stats_schema(&schema, &config, None, Some(&requested)).unwrap();

    // Only the two unconditional fields remain.
    let expected = StructType::new_unchecked([
        StructField::nullable("numRecords", DataType::LONG),
        StructField::nullable("tightBounds", DataType::BOOLEAN),
    ]);
    assert_eq!(&expected, &actual);
}
// A column that is both required and requested is included despite lying beyond the limit.
#[test]
fn test_required_bypasses_limit_with_requested_filter() {
    let table_props: TableProperties = [("delta.dataSkippingNumIndexedCols", "1")].into();
    let config = stats_config_from_table_properties(&table_props);

    let schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("name", DataType::STRING),
    ]);
    // The same list serves as both the required and the requested columns.
    let name_only = [ColumnName::new(["name"])];

    let actual =
        expected_stats_schema(&schema, &config, Some(&name_only), Some(&name_only)).unwrap();

    let min_max =
        StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]);
    let null_count =
        StructType::new_unchecked([StructField::nullable("name", DataType::LONG)]);

    let expected = expected_stats(null_count, min_max);
    assert_eq!(&expected, &actual);
}
// With a limit of 2, `name` (second column) is within the limit even though `id` is not
// requested: the requested filter does not change how columns are counted.
#[test]
fn test_requested_does_not_affect_column_counting() {
    let table_props: TableProperties = [("delta.dataSkippingNumIndexedCols", "2")].into();
    let config = stats_config_from_table_properties(&table_props);

    let schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("value", DataType::INTEGER),
    ]);
    let requested = [ColumnName::new(["name"])];

    let actual = expected_stats_schema(&schema, &config, None, Some(&requested)).unwrap();

    let min_max =
        StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]);
    let null_count =
        StructType::new_unchecked([StructField::nullable("name", DataType::LONG)]);

    let expected = expected_stats(null_count, min_max);
    assert_eq!(&expected, &actual);
}
// Requesting two of three columns keeps exactly those two.
#[test]
fn test_multiple_requested_columns() {
    let table_props: TableProperties = [("key", "value")].into();
    let config = stats_config_from_table_properties(&table_props);

    let schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("value", DataType::INTEGER),
    ]);
    let requested = [ColumnName::new(["id"]), ColumnName::new(["name"])];

    let actual = expected_stats_schema(&schema, &config, None, Some(&requested)).unwrap();

    let min_max = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("name", DataType::STRING),
    ]);
    let null_count = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("name", DataType::LONG),
    ]);

    let expected = expected_stats(null_count, min_max);
    assert_eq!(&expected, &actual);
}
// Requesting a nested leaf (`user.name`) keeps the enclosing struct with only that leaf.
#[test]
fn test_nested_requested_column() {
    let table_props: TableProperties = [("key", "value")].into();
    let config = stats_config_from_table_properties(&table_props);

    let user = StructType::new_unchecked([
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("age", DataType::INTEGER),
    ]);
    let schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("user", DataType::Struct(Box::new(user))),
    ]);
    let requested = [ColumnName::new(["user", "name"])];

    let actual = expected_stats_schema(&schema, &config, None, Some(&requested)).unwrap();

    let user_min_max =
        StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]);
    let min_max = StructType::new_unchecked([StructField::nullable(
        "user",
        DataType::Struct(Box::new(user_min_max)),
    )]);

    let user_null_count =
        StructType::new_unchecked([StructField::nullable("name", DataType::LONG)]);
    let null_count = StructType::new_unchecked([StructField::nullable(
        "user",
        DataType::Struct(Box::new(user_null_count)),
    )]);

    let expected = expected_stats(null_count, min_max);
    assert_eq!(&expected, &actual);
}
// An empty requested-column list behaves the same as passing `None`.
#[test]
fn test_empty_requested_columns() {
    let table_props: TableProperties = [("key", "value")].into();
    let config = stats_config_from_table_properties(&table_props);

    let schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("name", DataType::STRING),
    ]);
    let no_columns: [ColumnName; 0] = [];

    let with_empty = expected_stats_schema(&schema, &config, None, Some(&no_columns)).unwrap();
    let with_none = expected_stats_schema(&schema, &config, None, None).unwrap();

    assert_eq!(&with_none, &with_empty);
}
// Requesting a top-level column together with a nested leaf keeps both and nothing else.
#[test]
fn test_mixed_nested_and_top_requested() {
    let properties: TableProperties = [("key", "value")].into();
    let config = stats_config_from_table_properties(&properties);

    let user_struct = StructType::new_unchecked([
        StructField::nullable("name", DataType::STRING),
        StructField::nullable("age", DataType::INTEGER),
    ]);
    let file_schema = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("user", DataType::Struct(Box::new(user_struct))),
        StructField::nullable("value", DataType::DOUBLE),
    ]);
    let columns = [ColumnName::new(["id"]), ColumnName::new(["user", "age"])];

    let stats_schema =
        expected_stats_schema(&file_schema, &config, None, Some(&columns)).unwrap();

    let expected_user_nested =
        StructType::new_unchecked([StructField::nullable("age", DataType::INTEGER)]);
    // `expected_user_nested` is not needed afterwards, so it is moved rather than cloned.
    let expected_nested = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("user", DataType::Struct(Box::new(expected_user_nested))),
    ]);

    let expected_user_null =
        StructType::new_unchecked([StructField::nullable("age", DataType::LONG)]);
    let expected_null = StructType::new_unchecked([
        StructField::nullable("id", DataType::LONG),
        StructField::nullable("user", DataType::Struct(Box::new(expected_user_null))),
    ]);

    let expected = expected_stats(expected_null, expected_nested);
    assert_eq!(&expected, &stats_schema);
}
}