// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

use std::{collections::BTreeMap, sync::Arc};

use arrow::datatypes::{DataType, Field};
use hashbrown::HashMap;

use crate::{error::_plan_err, DataFusionError, ScalarValue};

/// A [`ScalarValue`] with optional [`FieldMetadata`]
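///
/// # Example: attach metadata to a scalar
///
/// A minimal sketch; it assumes `ScalarAndMetadata` is exported alongside
/// [`FieldMetadata`] from `datafusion_common::metadata`.
/// ```
/// # use std::collections::BTreeMap;
/// # use datafusion_common::ScalarValue;
/// # use datafusion_common::metadata::{FieldMetadata, ScalarAndMetadata};
/// // Metadata that might describe an extension type for this value
/// let metadata = FieldMetadata::from(BTreeMap::from([(
///     "ARROW:extension:name".to_string(),
///     "example.ext".to_string(),
/// )]));
/// let value = ScalarAndMetadata::new(ScalarValue::Int32(Some(1)), Some(metadata));
/// assert!(value.metadata().is_some());
/// ```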
#[derive(Debug, Clone)]
pub struct ScalarAndMetadata {
    pub value: ScalarValue,
    pub metadata: Option<FieldMetadata>,
}

impl ScalarAndMetadata {
    /// Create a new [`ScalarAndMetadata`] from a [`ScalarValue`] with optional
    /// [`FieldMetadata`]
    pub fn new(value: ScalarValue, metadata: Option<FieldMetadata>) -> Self {
        Self { value, metadata }
    }

    /// Access the underlying [`ScalarValue`] storage
    pub fn value(&self) -> &ScalarValue {
        &self.value
    }

    /// Access the [`FieldMetadata`] attached to this value, if any
    pub fn metadata(&self) -> Option<&FieldMetadata> {
        self.metadata.as_ref()
    }

    /// Consume self and return components
    pub fn into_inner(self) -> (ScalarValue, Option<FieldMetadata>) {
        (self.value, self.metadata)
    }

    /// Cast this value's storage type
    ///
    /// This operation assumes that if the underlying [`ScalarValue`] can be cast
    /// to a given type, any extension type represented by the metadata remains
    /// valid for the new storage type.
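    ///
    /// # Example
    ///
    /// A minimal sketch; it assumes `ScalarAndMetadata` is exported from
    /// `datafusion_common::metadata` like [`FieldMetadata`] is.
    /// ```
    /// # use arrow::datatypes::DataType;
    /// # use datafusion_common::ScalarValue;
    /// # use datafusion_common::metadata::ScalarAndMetadata;
    /// let value = ScalarAndMetadata::from(ScalarValue::Int32(Some(1)));
    /// // Cast the storage type; any metadata is carried over unchanged
    /// let cast = value.cast_storage_to(&DataType::Int64).unwrap();
    /// assert_eq!(cast.value(), &ScalarValue::Int64(Some(1)));
    /// ```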
    pub fn cast_storage_to(
        &self,
        target_type: &DataType,
    ) -> Result<Self, DataFusionError> {
        let new_value = self.value().cast_to(target_type)?;
        Ok(Self::new(new_value, self.metadata.clone()))
    }
}

/// Create a new [`ScalarAndMetadata`] from a [`ScalarValue`] without
/// any metadata
impl From<ScalarValue> for ScalarAndMetadata {
    fn from(value: ScalarValue) -> Self {
        Self::new(value, None)
    }
}

/// Assert equality of data types where one or both sides may have field metadata
///
/// This currently treats absent metadata (e.g., one side was a [`DataType`]) and
/// empty metadata (e.g., one side was a [`Field`] that had no metadata) as equal,
/// and uses byte-for-byte comparison for the keys and values of the metadata,
/// even though this is potentially too strict for some cases (e.g., extension
/// types whose extension metadata is represented as JSON, or cases where field
/// metadata is orthogonal to the interpretation of the data type).
///
/// Returns a planning error with suitably formatted type representations if
/// `actual` and `expected` are not equal.
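///
/// # Example
///
/// A minimal sketch; it assumes this function is exported alongside
/// [`FieldMetadata`] from `datafusion_common::metadata`.
/// ```
/// # use std::collections::HashMap;
/// # use arrow::datatypes::DataType;
/// # use datafusion_common::metadata::check_metadata_with_storage_equal;
/// let metadata = HashMap::from([("k".to_string(), "v".to_string())]);
/// // Same storage type and same metadata compare equal
/// assert!(check_metadata_with_storage_equal(
///     (&DataType::Int32, Some(&metadata)),
///     (&DataType::Int32, Some(&metadata)),
///     "argument",
///     "",
/// )
/// .is_ok());
/// // Mismatched metadata results in a planning error
/// assert!(check_metadata_with_storage_equal(
///     (&DataType::Int32, None),
///     (&DataType::Int32, Some(&metadata)),
///     "argument",
///     "",
/// )
/// .is_err());
/// ```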
pub fn check_metadata_with_storage_equal(
    actual: (
        &DataType,
        Option<&std::collections::HashMap<String, String>>,
    ),
    expected: (
        &DataType,
        Option<&std::collections::HashMap<String, String>>,
    ),
    what: &str,
    context: &str,
) -> Result<(), DataFusionError> {
    if actual.0 != expected.0 {
        return _plan_err!(
            "Expected {what} of type {}, got {}{context}",
            format_type_and_metadata(expected.0, expected.1),
            format_type_and_metadata(actual.0, actual.1)
        );
    }

    let metadata_equal = match (actual.1, expected.1) {
        (None, None) => true,
        (None, Some(expected_metadata)) => expected_metadata.is_empty(),
        (Some(actual_metadata), None) => actual_metadata.is_empty(),
        (Some(actual_metadata), Some(expected_metadata)) => {
            actual_metadata == expected_metadata
        }
    };

    if !metadata_equal {
        return _plan_err!(
            "Expected {what} of type {}, got {}{context}",
            format_type_and_metadata(expected.0, expected.1),
            format_type_and_metadata(actual.0, actual.1)
        );
    }

    Ok(())
}

/// Given a data type represented by storage and optional metadata, generate
/// a user-facing string
///
/// This function exists to reduce the number of Field debug strings that are
/// used to communicate type information in error messages and plan explain
/// renderings.
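///
/// # Example
///
/// A minimal sketch; it assumes this function is exported alongside
/// [`FieldMetadata`] from `datafusion_common::metadata`.
/// ```
/// # use std::collections::HashMap;
/// # use arrow::datatypes::DataType;
/// # use datafusion_common::metadata::format_type_and_metadata;
/// // Without metadata, only the storage type is shown
/// assert_eq!(format_type_and_metadata(&DataType::Int32, None), "Int32");
/// // Non-empty metadata is appended to the storage type
/// let metadata = HashMap::from([("k".to_string(), "v".to_string())]);
/// let formatted = format_type_and_metadata(&DataType::Int32, Some(&metadata));
/// assert!(formatted.starts_with("Int32<"));
/// ```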
pub fn format_type_and_metadata(
    data_type: &DataType,
    metadata: Option<&std::collections::HashMap<String, String>>,
) -> String {
    match metadata {
        Some(metadata) if !metadata.is_empty() => {
            format!("{data_type}<{metadata:?}>")
        }
        _ => data_type.to_string(),
    }
}

/// Literal metadata
///
/// Stores metadata associated with a literal expression and is designed to be
/// cheap to `clone`.
///
/// The contents correspond to the `metadata` field on [`Field`].
///
/// # Example: Create [`FieldMetadata`] from a [`Field`]
/// ```
/// # use std::collections::HashMap;
/// # use datafusion_common::metadata::FieldMetadata;
/// # use arrow::datatypes::{Field, DataType};
/// # let field = Field::new("c1", DataType::Int32, true)
/// #  .with_metadata(HashMap::from([("foo".to_string(), "bar".to_string())]));
/// // Create a new `FieldMetadata` instance from a `Field`
/// let metadata = FieldMetadata::new_from_field(&field);
/// // There is also a `From` impl:
/// let metadata = FieldMetadata::from(&field);
/// ```
///
/// # Example: Update a [`Field`] with [`FieldMetadata`]
/// ```
/// # use datafusion_common::metadata::FieldMetadata;
/// # use arrow::datatypes::{Field, DataType};
/// # let field = Field::new("c1", DataType::Int32, true);
/// # let metadata = FieldMetadata::new_from_field(&field);
/// // Add any metadata from `FieldMetadata` to `Field`
/// let updated_field = metadata.add_to_field(field);
/// ```
#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
pub struct FieldMetadata {
    /// The inner metadata of a literal expression, which is a map of string
    /// keys to string values.
    ///
    /// Note this is not a `HashMap` because `HashMap` does not implement
    /// traits like `Hash` and `PartialOrd` that are derived for this type.
    inner: Arc<BTreeMap<String, String>>,
}

impl Default for FieldMetadata {
    fn default() -> Self {
        Self::new_empty()
    }
}

impl FieldMetadata {
    /// Create a new empty metadata instance.
    pub fn new_empty() -> Self {
        Self {
            inner: Arc::new(BTreeMap::new()),
        }
    }

    /// Merges two optional `FieldMetadata` instances, overwriting any existing
    /// keys in `m` with keys from `n` if present.
    ///
    /// This function is commonly used in alias operations, particularly for literals
    /// with metadata. When creating an alias expression, the metadata from the original
    /// expression (such as a literal) is combined with any metadata specified on the alias.
    ///
    /// # Arguments
    ///
    /// * `m` - The first metadata (typically from the original expression like a literal)
    /// * `n` - The second metadata (typically from the alias definition)
    ///
    /// # Merge Strategy
    ///
    /// - If both metadata instances exist, they are merged with `n` taking precedence
    /// - Keys from `n` will overwrite keys from `m` if they have the same name
    /// - If only one metadata instance exists, it is returned unchanged
    /// - If neither exists, `None` is returned
    ///
    /// # Example usage
    /// ```rust
    /// use datafusion_common::metadata::FieldMetadata;
    /// use std::collections::BTreeMap;
    ///
    /// // Create metadata for a literal expression
    /// let literal_metadata = Some(FieldMetadata::from(BTreeMap::from([
    ///     ("source".to_string(), "constant".to_string()),
    ///     ("type".to_string(), "int".to_string()),
    /// ])));
    ///
    /// // Create metadata for an alias
    /// let alias_metadata = Some(FieldMetadata::from(BTreeMap::from([
    ///     ("description".to_string(), "answer".to_string()),
    ///     ("source".to_string(), "user".to_string()), // This will override literal's "source"
    /// ])));
    ///
    /// // Merge the metadata
    /// let merged = FieldMetadata::merge_options(
    ///     literal_metadata.as_ref(),
    ///     alias_metadata.as_ref(),
    /// );
    ///
    /// // Result contains: {"source": "user", "type": "int", "description": "answer"}
    /// assert!(merged.is_some());
    /// ```
    pub fn merge_options(
        m: Option<&FieldMetadata>,
        n: Option<&FieldMetadata>,
    ) -> Option<FieldMetadata> {
        match (m, n) {
            (Some(m), Some(n)) => {
                let mut merged = m.clone();
                merged.extend(n.clone());
                Some(merged)
            }
            (Some(m), None) => Some(m.clone()),
            (None, Some(n)) => Some(n.clone()),
            (None, None) => None,
        }
    }

    /// Create a new metadata instance from a `Field`'s metadata.
    pub fn new_from_field(field: &Field) -> Self {
        let inner = field
            .metadata()
            .iter()
            .map(|(k, v)| (k.to_string(), v.to_string()))
            .collect();
        Self {
            inner: Arc::new(inner),
        }
    }

    /// Create a new metadata instance from a map of string keys to string values.
    pub fn new(inner: BTreeMap<String, String>) -> Self {
        Self {
            inner: Arc::new(inner),
        }
    }

    /// Get the inner metadata as a reference to a `BTreeMap`.
    pub fn inner(&self) -> &BTreeMap<String, String> {
        &self.inner
    }

    /// Return the inner metadata
    pub fn into_inner(self) -> Arc<BTreeMap<String, String>> {
        self.inner
    }

    /// Adds metadata from `other` into `self`, overwriting any existing keys.
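    ///
    /// # Example
    ///
    /// A minimal sketch of the overwrite behavior:
    /// ```
    /// # use std::collections::BTreeMap;
    /// # use datafusion_common::metadata::FieldMetadata;
    /// let mut a = FieldMetadata::from(BTreeMap::from([("k".to_string(), "a".to_string())]));
    /// let b = FieldMetadata::from(BTreeMap::from([("k".to_string(), "b".to_string())]));
    /// a.extend(b);
    /// // Keys from `other` win on conflict
    /// assert_eq!(a.inner().get("k"), Some(&"b".to_string()));
    /// ```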
    pub fn extend(&mut self, other: Self) {
        if other.is_empty() {
            return;
        }
        let other = Arc::unwrap_or_clone(other.into_inner());
        Arc::make_mut(&mut self.inner).extend(other);
    }

    /// Returns true if the metadata is empty.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Returns the number of key-value pairs in the metadata.
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Convert this `FieldMetadata` into a `HashMap<String, String>`
    pub fn to_hashmap(&self) -> std::collections::HashMap<String, String> {
        self.inner
            .iter()
            .map(|(k, v)| (k.to_string(), v.to_string()))
            .collect()
    }

    /// Updates the metadata on the Field with this metadata, if it is not empty.
    pub fn add_to_field(&self, field: Field) -> Field {
        if self.inner.is_empty() {
            return field;
        }

        field.with_metadata(self.to_hashmap())
    }
}

impl From<&Field> for FieldMetadata {
    fn from(field: &Field) -> Self {
        Self::new_from_field(field)
    }
}

impl From<BTreeMap<String, String>> for FieldMetadata {
    fn from(inner: BTreeMap<String, String>) -> Self {
        Self::new(inner)
    }
}

impl From<std::collections::HashMap<String, String>> for FieldMetadata {
    fn from(map: std::collections::HashMap<String, String>) -> Self {
        Self::new(map.into_iter().collect())
    }
}

/// Create [`FieldMetadata`] from a reference to a `HashMap`
impl From<&std::collections::HashMap<String, String>> for FieldMetadata {
    fn from(map: &std::collections::HashMap<String, String>) -> Self {
        let inner = map
            .iter()
            .map(|(k, v)| (k.to_string(), v.to_string()))
            .collect();
        Self::new(inner)
    }
}

/// Create [`FieldMetadata`] from a hashbrown `HashMap`
impl From<HashMap<String, String>> for FieldMetadata {
    fn from(map: HashMap<String, String>) -> Self {
        let inner = map.into_iter().collect();
        Self::new(inner)
    }
}

/// Create [`FieldMetadata`] from a reference to a hashbrown `HashMap`
impl From<&HashMap<String, String>> for FieldMetadata {
    fn from(map: &HashMap<String, String>) -> Self {
        let inner = map
            .into_iter()
            .map(|(k, v)| (k.to_string(), v.to_string()))
            .collect();
        Self::new(inner)
    }
}