Trait ScalarUDFImpl

Source

pub trait ScalarUDFImpl:
    Debug
    + DynEq
    + DynHash
    + Send
    + Sync {
Show 22 methods    // Required methods
    fn as_any(&self) -> &dyn Any;
    fn name(&self) -> &str;
    fn signature(&self) -> &Signature;
    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType>;
    fn invoke_with_args(
        &self,
        args: ScalarFunctionArgs,
    ) -> Result<ColumnarValue>;

    // Provided methods
    fn aliases(&self) -> &[String] { ... }
    fn display_name(&self, args: &[Expr]) -> Result<String> { ... }
    fn schema_name(&self, args: &[Expr]) -> Result<String> { ... }
    fn with_updated_config(&self, _config: &ConfigOptions) -> Option<ScalarUDF> { ... }
    fn return_field_from_args(
        &self,
        args: ReturnFieldArgs<'_>,
    ) -> Result<FieldRef> { ... }
    fn is_nullable(&self, _args: &[Expr], _schema: &dyn ExprSchema) -> bool { ... }
    fn simplify(
        &self,
        args: Vec<Expr>,
        _info: &SimplifyContext,
    ) -> Result<ExprSimplifyResult> { ... }
    fn preimage(
        &self,
        _args: &[Expr],
        _lit_expr: &Expr,
        _info: &SimplifyContext,
    ) -> Result<PreimageResult> { ... }
    fn short_circuits(&self) -> bool { ... }
    fn conditional_arguments<'a>(
        &self,
        args: &'a [Expr],
    ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> { ... }
    fn evaluate_bounds(&self, _input: &[&Interval]) -> Result<Interval> { ... }
    fn propagate_constraints(
        &self,
        _interval: &Interval,
        _inputs: &[&Interval],
    ) -> Result<Option<Vec<Interval>>> { ... }
    fn output_ordering(
        &self,
        inputs: &[ExprProperties],
    ) -> Result<SortProperties> { ... }
    fn preserves_lex_ordering(&self, _inputs: &[ExprProperties]) -> Result<bool> { ... }
    fn coerce_types(&self, _arg_types: &[DataType]) -> Result<Vec<DataType>> { ... }
    fn documentation(&self) -> Option<&Documentation> { ... }
    fn placement(&self, _args: &[ExpressionPlacement]) -> ExpressionPlacement { ... }
}

Expand description

Trait for implementing user defined scalar functions.

This trait exposes the full API for implementing user defined functions and can be used to implement any function.

See advanced_udf.rs for a full example with complete implementation and ScalarUDF for other available options.

§Basic Example

/// This struct is for a simple UDF that adds one to an int32
#[derive(Debug, PartialEq, Eq, Hash)]
struct AddOne {
  signature: Signature,
}

impl AddOne {
  fn new() -> Self {
    Self {
      signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable),
     }
  }
}

static DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
        Documentation::builder(DOC_SECTION_MATH, "Add one to an int32", "add_one(2)")
            .with_argument("arg1", "The int32 number to add one to")
            .build()
    });

fn get_doc() -> &'static Documentation {
    &DOCUMENTATION
}

/// Implement the ScalarUDFImpl trait for AddOne
impl ScalarUDFImpl for AddOne {
   fn as_any(&self) -> &dyn Any { self }
   fn name(&self) -> &str { "add_one" }
   fn signature(&self) -> &Signature { &self.signature }
   fn return_type(&self, args: &[DataType]) -> Result<DataType> {
     if !matches!(args.get(0), Some(&DataType::Int32)) {
       return plan_err!("add_one only accepts Int32 arguments");
     }
     Ok(DataType::Int32)
   }
   // The actual implementation would add one to the argument
   fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        unimplemented!()
   }
   fn documentation(&self) -> Option<&Documentation> {
        Some(get_doc())
    }
}

// Create a new ScalarUDF from the implementation
let add_one = ScalarUDF::from(AddOne::new());

// Call the function `add_one(col)`
let expr = add_one.call(vec![col("a")]);

Required Methods§

Source

fn as_any(&self) -> &dyn Any

Returns this object as an Any trait object

Source

fn name(&self) -> &str

Returns this function’s name

Source

fn signature(&self) -> &Signature

Returns a Signature describing the argument types for which this function has an implementation, and the function’s Volatility.

See Signature for more details on argument type handling and Self::return_type for computing the return type.

Source

fn return_type(&self, arg_types: &[DataType]) -> Result<DataType>

DataType returned by this function, given the types of the arguments.

§Arguments

arg_types Data types of the arguments. The implementation of return_type can assume that some other part of the code has coerced the actual argument types to match Self::signature.

§Notes

If you provide an implementation for Self::return_field_from_args, DataFusion will not call return_type (this function). While it is valid to put unimplemented!() or unreachable!(), it is recommended to return DataFusionError::Internal instead, which reduces the severity of symptoms if bugs occur (an error rather than a panic).

Source

fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue>

Invoke the function returning the appropriate result.

§Performance

For the best performance, the implementations should handle the common case when one or more of their arguments are constant values (aka ColumnarValue::Scalar).

ColumnarValue::values_to_arrays can be used to convert the arguments to arrays, which will likely be simpler code, but be slower.

Provided Methods§

Source

fn aliases(&self) -> &[String]

Returns any aliases (alternate names) for this function.

Aliases can be used to invoke the same function using different names. For example in some databases now() and current_timestamp() are aliases for the same function. This behavior can be obtained by returning current_timestamp as an alias for the now function.

Note: aliases should only include names other than Self::name. Defaults to [] (no aliases)

Source

fn display_name(&self, args: &[Expr]) -> Result<String>

👎Deprecated since 50.0.0:

This method is unused and will be removed in a future release

Returns the user-defined display name of function, given the arguments

This can be used to customize the output column name generated by this function.

Defaults to name(args[0], args[1], ...)

Source

fn schema_name(&self, args: &[Expr]) -> Result<String>

Returns the name of the column this expression would create

See Expr::schema_name for details

Source

fn with_updated_config(&self, _config: &ConfigOptions) -> Option<ScalarUDF>

Create a new instance of this function with updated configuration.

This method is called when configuration options change at runtime (e.g., via SET statements) to allow functions that depend on configuration to update themselves accordingly.

Note the current ConfigOptions are also passed to Self::invoke_with_args so this API is not needed for functions where the values may depend on the current options.

This API is useful for functions where the return type depends on the configuration options, such as the now() function which depends on the current timezone.

§Arguments

config - The updated configuration options

§Returns

Some(ScalarUDF) - A new instance of this function configured with the new settings
None - If this function does not change with new configuration settings (the default)

Source

fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) -> Result<FieldRef>

What type will be returned by this function, given the arguments?

By default, this function calls Self::return_type with the types of each argument.

§Notes

For the majority of UDFs, implementing Self::return_type is sufficient, as the result type is typically a deterministic function of the input types (e.g., sqrt(f32) consistently yields f32). Implementing this method directly is generally unnecessary unless the return type depends on runtime values.

This function can be used for more advanced cases such as:

specifying nullability
return types based on the values of the arguments (rather than their types).

§Example creating `Field`

Note the name of the Field is ignored, except for structured types such as DataType::Struct.

fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
    // report output is only nullable if any one of the arguments are nullable
    let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
    let field = Arc::new(Field::new("ignored_name", DataType::Int32, nullable));
    Ok(field)
}

§Output Type based on Values

For example, the following two function calls get the same argument types (something and a Utf8 string) but return different types based on the value of the second argument:

arrow_cast(x, 'Int16') –> Int16
arrow_cast(x, 'Float32') –> Float32

§Requirements

This function must consistently return the same type for the same logical input even if the input is simplified (e.g. it must return the same value for ('foo' | 'bar') as it does for ('foobar')).

Source

fn is_nullable(&self, _args: &[Expr], _schema: &dyn ExprSchema) -> bool

👎Deprecated since 45.0.0:

Use return_field_from_args instead. If you rely on is_nullable returning non-nullable together with return_type, you need to switch to return_field_from_args; otherwise you might get an error.

Source

fn simplify( &self, args: Vec<Expr>, _info: &SimplifyContext, ) -> Result<ExprSimplifyResult>

Optionally apply per-UDF simplification / rewrite rules.

This can be used to apply function specific simplification rules during optimization (e.g. arrow_cast –> Expr::Cast). The default implementation does nothing.

Note that DataFusion handles simplifying arguments and “constant folding” (replacing a function call with constant arguments such as my_add(1,2) --> 3 ). Thus, there is no need to implement such optimizations manually for specific UDFs.

§Arguments

args: The arguments of the function
info: The necessary information for simplification

§Returns

ExprSimplifyResult indicating the result of the simplification. NOTE: if the function cannot be simplified, the arguments MUST be returned unmodified.

§Notes

The returned expression must have the same schema as the original expression, including both the data type and nullability. For example, if the original expression is nullable, the returned expression must also be nullable, otherwise it may lead to schema verification errors later in query planning.

Source

fn preimage( &self, _args: &[Expr], _lit_expr: &Expr, _info: &SimplifyContext, ) -> Result<PreimageResult>

Returns a single contiguous preimage for this function and the specified scalar expression, if any.

Currently only applies to =, !=, >, >=, <, <=, is distinct from, is not distinct from predicates

§Return Value

Implementations should return a half-open interval: inclusive lower bound and exclusive upper bound. This is slightly different from normal Interval semantics where the upper bound is closed (inclusive). Typically this means the upper endpoint must be adjusted to the next value not included in the preimage. See the Half-Open Intervals section below for more details.

§Background

Inspired by the ClickHouse Paper, a “preimage rewrite” transforms a predicate containing a function call into a predicate containing an equivalent set of input literal (constant) values. The resulting predicate can often be further optimized by other rewrites (see Examples).

From the paper:

some functions can compute the preimage of a given function result. This is used to replace comparisons of constants with function calls on the key columns by comparing the key column value with the preimage. For example, toYear(k) = 2024 can be replaced by k >= 2024-01-01 && k < 2025-01-01

For example, given an expression like

date_part('YEAR', k) = 2024

The interval [2024-01-01, 2024-12-31] contains all possible input values (preimage values) for which the function date_part(YEAR, k) produces the output value 2024 (image value). Returning the half-open interval (note upper bound adjusted up) [2024-01-01, 2025-01-01), the expression can be rewritten to

k >= '2024-01-01' AND k < '2025-01-01'

which is a simpler and a more canonical form, making it easier for other optimizer passes to recognize and apply further transformations.

§Examples

Case 1:

Original:

date_part('YEAR', k) = 2024 AND k >= '2024-06-01'

After preimage rewrite:

k >= '2024-01-01' AND k < '2025-01-01' AND k >= '2024-06-01'

Since this form is much simpler, the optimizer can combine and simplify sub-expressions further into:

k >= '2024-06-01' AND k < '2025-01-01'

Case 2:

For min/max pruning, simpler predicates such as:

k >= '2024-01-01' AND k < '2025-01-01'

are much easier for the pruner to reason about. See PruningPredicate for the background of predicate pruning.

The trade-off with the preimage rewrite is that evaluating the rewritten form might be slightly more expensive than evaluating the original expression. In practice, this cost is usually outweighed by the more aggressive optimization opportunities it enables.

§Half-Open Intervals

The preimage API uses half-open intervals, which makes the rewrite easier to implement by avoiding calculations to adjust the upper bound. For example, if a function returns its input unchanged and the desired output is the single value 5, a closed interval could be represented as [5, 5], but then the rewrite would require adjusting the upper bound to 6 to create a proper range predicate. With a half-open interval, the same range is represented as [5, 6), which already forms a valid predicate.

Source

fn short_circuits(&self) -> bool

Returns true if some of this expr's subexpressions may not be evaluated and thus any side effects (like divide by zero) may not be encountered.

Setting this to true prevents certain optimizations such as common subexpression elimination

When overriding this function to return true, ScalarUDFImpl::conditional_arguments can also be overridden to report more accurately which arguments are eagerly evaluated and which ones lazily.

Source

fn conditional_arguments<'a>( &self, args: &'a [Expr], ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)>

Determines which of the arguments passed to this function are evaluated eagerly and which may be evaluated lazily.

If this function returns None, all arguments are eagerly evaluated. Returning None is a micro optimization that saves a needless Vec allocation.

If the function returns Some, returns (eager, lazy) where eager are the arguments that are always evaluated, and lazy are the arguments that may be evaluated lazily (i.e. may not be evaluated at all in some cases).

Implementations must ensure that the two returned Vecs are disjoint, and that each argument from args is present in one of the two Vecs.

When overriding this function, ScalarUDFImpl::short_circuits must be overridden to return true.

Source

fn evaluate_bounds(&self, _input: &[&Interval]) -> Result<Interval>

Computes the output Interval for a ScalarUDFImpl, given the input intervals.

§Parameters

input contains the intervals for the children (inputs) of this function.

§Example

If the function is ABS(a), and the input interval is a: [-3, 2], then the output interval would be [0, 3].

Source

fn propagate_constraints( &self, _interval: &Interval, _inputs: &[&Interval], ) -> Result<Option<Vec<Interval>>>

Updates bounds for child expressions, given a known Interval for this function.

This function is used to propagate constraints down through an expression tree.

§Parameters

interval is the currently known interval for this function.
inputs are the current intervals for the inputs (children) of this function.

§Returns

A Vec of new intervals for the children, in order.

If constraint propagation reveals an infeasibility for any child, returns None. If none of the children intervals change as a result of propagation, may return an empty vector instead of cloning children. This is the default (and conservative) return value.

§Example

If the function is ABS(a), the current interval is [4, 5] and the input a is given as [-7, 3], then propagation would return [-5, 3].

Source

fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties>

Calculates the SortProperties of this function based on its children’s properties.

Source

fn preserves_lex_ordering(&self, _inputs: &[ExprProperties]) -> Result<bool>

Returns true if the function preserves lexicographical ordering based on the input ordering.

For example, concat(a || b) preserves lexicographical ordering, but abs(a) does not.

Source

fn coerce_types(&self, _arg_types: &[DataType]) -> Result<Vec<DataType>>

Coerce arguments of a function call to types that the function can evaluate.

This function is only called if ScalarUDFImpl::signature returns crate::TypeSignature::UserDefined. Most UDFs should return one of the other variants of TypeSignature which handle common cases.

See the type coercion module documentation for more details on type coercion

For example, if your function requires a floating point argument, but the user calls it like my_func(1::int) (i.e. with 1 as an integer), coerce_types can return [DataType::Float64] to ensure the argument is converted to 1::double

§Parameters

arg_types: The types of the arguments this function is called with

§Return value

A Vec the same length as arg_types. DataFusion will CAST the function call arguments to these specific types.

Source

fn documentation(&self) -> Option<&Documentation>

Returns the documentation for this Scalar UDF.

Documentation can be accessed programmatically as well as used to generate public-facing documentation.

Source

fn placement(&self, _args: &[ExpressionPlacement]) -> ExpressionPlacement

Returns placement information for this function.

This is used by optimizers to make decisions about expression placement, such as whether to push expressions down through projections.

The default implementation returns ExpressionPlacement::KeepInPlace, meaning the expression should be kept where it is in the plan.

Override this method to indicate that the function can be pushed down closer to the data source.

ScalarUDFImpl

Trait ScalarUDFImpl Copy item path

§Basic Example

Required Methods§

fn as_any(&self) -> &dyn Any

fn name(&self) -> &str

fn signature(&self) -> &Signature

fn return_type(&self, arg_types: &[DataType]) -> Result<DataType>

§Arguments

§Notes

fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue>

§Performance

Provided Methods§

fn aliases(&self) -> &[String]

fn display_name(&self, args: &[Expr]) -> Result<String>

fn schema_name(&self, args: &[Expr]) -> Result<String>

fn with_updated_config(&self, _config: &ConfigOptions) -> Option<ScalarUDF>

§Arguments

§Returns

fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) -> Result<FieldRef>

§Notes

§Example creating Field

§Output Type based on Values

§Requirements

fn is_nullable(&self, _args: &[Expr], _schema: &dyn ExprSchema) -> bool

fn simplify( &self, args: Vec<Expr>, _info: &SimplifyContext, ) -> Result<ExprSimplifyResult>

§Arguments

§Returns

§Notes

fn preimage( &self, _args: &[Expr], _lit_expr: &Expr, _info: &SimplifyContext, ) -> Result<PreimageResult>

§Return Value

§Background

§Examples

§Half-Open Intervals

fn short_circuits(&self) -> bool

fn conditional_arguments<'a>( &self, args: &'a [Expr], ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)>

fn evaluate_bounds(&self, _input: &[&Interval]) -> Result<Interval>

§Parameters

§Example

fn propagate_constraints( &self, _interval: &Interval, _inputs: &[&Interval], ) -> Result<Option<Vec<Interval>>>

§Parameters

§Returns

§Example

fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties>

fn preserves_lex_ordering(&self, _inputs: &[ExprProperties]) -> Result<bool>

fn coerce_types(&self, _arg_types: &[DataType]) -> Result<Vec<DataType>>

§Parameters

§Return value

fn documentation(&self) -> Option<&Documentation>

fn placement(&self, _args: &[ExpressionPlacement]) -> ExpressionPlacement

Implementors§

impl ScalarUDFImpl for AsyncScalarUDF

impl ScalarUDFImpl for SimpleScalarUDF

Trait ScalarUDFImpl

§Example creating `Field`