arrow-schema 58.1.0

Defines the logical types for arrow arrays
Documentation
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Extension types.

#[cfg(feature = "canonical_extension_types")]
mod canonical;
#[cfg(feature = "canonical_extension_types")]
pub use canonical::*;

use crate::{ArrowError, DataType};
use std::collections::HashMap;

/// The metadata key for the string name identifying an [`ExtensionType`].
///
/// The value stored under this key is compared against
/// [`ExtensionType::NAME`] by
/// [`ExtensionType::try_new_from_field_metadata`]; a missing key or a
/// non-matching value makes that method return an error.
pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name";

/// The metadata key for a serialized representation of the [`ExtensionType`]
/// necessary to reconstruct the custom type.
///
/// The value is the string returned by
/// [`ExtensionType::serialize_metadata`]. When an extension type is rebuilt
/// through [`ExtensionType::try_new_from_field_metadata`], the value found
/// under this key (or `None` if the key is absent) is passed to
/// [`ExtensionType::deserialize_metadata`].
pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata";

/// Extension types.
///
/// User-defined “extension” types can be defined by setting certain key-value
/// pairs in the [`Field`] metadata structure. These extension keys are:
/// - [`EXTENSION_TYPE_NAME_KEY`]
/// - [`EXTENSION_TYPE_METADATA_KEY`]
///
/// Support for canonical extension types in this crate requires the
/// `canonical_extension_types` feature.
///
/// Extension types may or may not use the [`EXTENSION_TYPE_METADATA_KEY`]
/// field.
///
/// # Example
///
/// The example below demonstrates how to implement this trait for a `Uuid`
/// type. Note this is not the canonical extension type for `Uuid`, which does
/// not include information about the `Uuid` version.
///
/// ```
/// # use arrow_schema::ArrowError;
/// # fn main() -> Result<(), ArrowError> {
/// use arrow_schema::{DataType, extension::ExtensionType, Field};
/// use std::{fmt, str::FromStr};
///
/// /// The different Uuid versions.
/// #[derive(Clone, Copy, Debug, PartialEq)]
/// enum UuidVersion {
///     V1,
///     V2,
///     V3,
///     V4,
///     V5,
///     V6,
///     V7,
///     V8,
/// }
///
/// // We'll use `Display` to serialize.
/// impl fmt::Display for UuidVersion {
///     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
///         write!(
///             f,
///             "{}",
///             match self {
///                 Self::V1 => "V1",
///                 Self::V2 => "V2",
///                 Self::V3 => "V3",
///                 Self::V4 => "V4",
///                 Self::V5 => "V5",
///                 Self::V6 => "V6",
///                 Self::V7 => "V7",
///                 Self::V8 => "V8",
///             }
///         )
///     }
/// }
///
/// // And `FromStr` to deserialize.
/// impl FromStr for UuidVersion {
///     type Err = ArrowError;
///
///     fn from_str(s: &str) -> Result<Self, Self::Err> {
///         match s {
///             "V1" => Ok(Self::V1),
///             "V2" => Ok(Self::V2),
///             "V3" => Ok(Self::V3),
///             "V4" => Ok(Self::V4),
///             "V5" => Ok(Self::V5),
///             "V6" => Ok(Self::V6),
///             "V7" => Ok(Self::V7),
///             "V8" => Ok(Self::V8),
///             _ => Err(ArrowError::ParseError("Invalid UuidVersion".to_owned())),
///         }
///     }
/// }
///
/// /// This is the extension type, not the container for Uuid values. It
/// /// stores the Uuid version (this is the metadata of this extension type).
/// #[derive(Clone, Copy, Debug, PartialEq)]
/// struct Uuid(UuidVersion);
///
/// impl ExtensionType for Uuid {
///     // We use a namespace as suggested by the specification.
///     const NAME: &'static str = "myorg.example.uuid";
///
///     // The metadata type is the Uuid version.
///     type Metadata = UuidVersion;
///
///     // We just return a reference to the Uuid version.
///     fn metadata(&self) -> &Self::Metadata {
///         &self.0
///     }
///
///     // We use the `Display` implementation to serialize the Uuid
///     // version.
///     fn serialize_metadata(&self) -> Option<String> {
///         Some(self.0.to_string())
///     }
///
///     // We use the `FromStr` implementation to deserialize the Uuid
///     // version.
///     fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
///         metadata.map_or_else(
///             || {
///                 Err(ArrowError::InvalidArgumentError(
///                     "Uuid extension type metadata missing".to_owned(),
///                 ))
///             },
///             str::parse,
///         )
///     }
///
///     // The only supported data type is `FixedSizeBinary(16)`.
///     fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
///         match data_type {
///             DataType::FixedSizeBinary(16) => Ok(()),
///             data_type => Err(ArrowError::InvalidArgumentError(format!(
///                 "Uuid data type mismatch, expected FixedSizeBinary(16), found {data_type}"
///             ))),
///         }
///     }
///
///     // We should always check if the data type is supported before
///     // constructing the extension type.
///     fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> {
///         let uuid = Self(metadata);
///         uuid.supports_data_type(data_type)?;
///         Ok(uuid)
///     }
/// }
///
/// // We can now construct the extension type.
/// let uuid_v1 = Uuid(UuidVersion::V1);
///
/// // And add it to a field.
/// let mut field =
///     Field::new("", DataType::FixedSizeBinary(16), false).with_extension_type(uuid_v1);
///
/// // And extract it from this field.
/// assert_eq!(field.try_extension_type::<Uuid>()?, uuid_v1);
///
/// // When we try to add this to a field with an unsupported data type we
/// // get an error.
/// let result = Field::new("", DataType::Null, false).try_with_extension_type(uuid_v1);
/// assert!(result.is_err());
/// # Ok(()) }
/// ```
///
/// <https://arrow.apache.org/docs/format/Columnar.html#extension-types>
///
/// [`Field`]: crate::Field
pub trait ExtensionType: Sized {
    /// The name that identifies this extension type.
    ///
    /// This string is the value stored under [`EXTENSION_TYPE_NAME_KEY`] in
    /// the [`Field::metadata`] of a [`Field`]; it is what marks the field as
    /// carrying this extension type.
    ///
    /// Prefer a “namespace”-style prefix, e.g. `myorg.name_of_type` rather
    /// than plain `name_of_type`, to reduce the chance of conflicts between
    /// multiple Arrow readers and writers in the same application.
    ///
    /// Names beginning with `arrow.` are reserved for canonical extension
    /// types and should not be used for third-party extension types.
    ///
    /// Extension names are case-sensitive.
    ///
    /// [`Field`]: crate::Field
    /// [`Field::metadata`]: crate::Field::metadata
    const NAME: &'static str;

    /// The type used to represent the metadata of this extension type.
    ///
    /// Depending on how complex the metadata is, implementations may pick a
    /// strongly or a loosely typed data structure.
    ///
    /// `Self` is a valid choice when the extension type can be constructed
    /// directly from its metadata.
    ///
    /// Extension types that define no metadata should use `()`.
    type Metadata;

    /// Returns a reference to the metadata of this extension type.
    ///
    /// For extension types without metadata (`Self::Metadata=()`) this
    /// returns `&()`.
    fn metadata(&self) -> &Self::Metadata;

    /// Returns the metadata of this extension type in serialized form.
    ///
    /// Returns `None` when this extension type defines no metadata
    /// (`Self::Metadata=()`).
    ///
    /// The returned string is the value stored under
    /// [`EXTENSION_TYPE_METADATA_KEY`] in the [`Field::metadata`] of a
    /// [`Field`].
    ///
    /// [`Field`]: crate::Field
    /// [`Field::metadata`]: crate::Field::metadata
    fn serialize_metadata(&self) -> Option<String>;

    /// Parses the serialized metadata back into [`Self::Metadata`].
    ///
    /// An extension type that defines no metadata should expect `None` as
    /// input and return `Ok(())`.
    ///
    /// # Errors
    ///
    /// Implementations should return an error when
    /// - expected metadata is missing (for extension types with non-optional
    ///   metadata)
    /// - unexpected metadata is set (for extension types without metadata)
    /// - deserialization of the metadata fails
    fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError>;

    /// Checks whether this extension type can be used with `data_type`.
    ///
    /// Returns `Ok(())` iff the given data type is supported by this
    /// extension type.
    fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError>;

    /// Builds this extension type for a field with the given data type and
    /// already deserialized metadata.
    ///
    /// # Errors
    ///
    /// Implementations should return an error if `data_type` is not
    /// supported by this extension type.
    fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError>;

    /// Builds this extension type from a data type and raw field metadata.
    ///
    /// This provided method reads the extension type name (stored under
    /// [`EXTENSION_TYPE_NAME_KEY`]) and the optional serialized metadata
    /// (stored under [`EXTENSION_TYPE_METADATA_KEY`]) from `metadata`,
    /// deserializes the latter with [`Self::deserialize_metadata`] and
    /// delegates construction to [`Self::try_new`].
    ///
    /// Because it only needs the metadata map and the data type, it allows
    /// checking for an extension type without a full [`Field`] instance.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - the extension type name is missing or differs from [`Self::NAME`]
    /// - deserializing the metadata fails
    /// - the data type is not supported
    ///
    /// [`Field`]: crate::Field
    fn try_new_from_field_metadata(
        data_type: &DataType,
        metadata: &HashMap<String, String>,
    ) -> Result<Self, ArrowError> {
        // Without a name entry the metadata does not describe any extension type.
        let name = metadata
            .get(EXTENSION_TYPE_NAME_KEY)
            .map(String::as_str)
            .ok_or_else(|| {
                ArrowError::InvalidArgumentError("Extension type name missing".to_string())
            })?;

        // The stored name has to be exactly the name of this extension type.
        if name != Self::NAME {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Extension type name mismatch: expected {}, got {name}",
                Self::NAME
            )));
        }

        // The serialized metadata is optional; pass `None` through so the
        // implementation decides whether its absence is an error.
        let serialized = metadata
            .get(EXTENSION_TYPE_METADATA_KEY)
            .map(String::as_str);
        Self::deserialize_metadata(serialized).and_then(|parsed| Self::try_new(data_type, parsed))
    }
}