datafusion_expr/
udf.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`ScalarUDF`]: Scalar User Defined Functions
19
20use crate::async_udf::AsyncScalarUDF;
21use crate::expr::schema_name_from_exprs_comma_separated_without_space;
22use crate::simplify::{ExprSimplifyResult, SimplifyInfo};
23use crate::sort_properties::{ExprProperties, SortProperties};
24use crate::udf_eq::UdfEq;
25use crate::{ColumnarValue, Documentation, Expr, Signature};
26use arrow::datatypes::{DataType, Field, FieldRef};
27#[cfg(debug_assertions)]
28use datafusion_common::assert_or_internal_err;
29use datafusion_common::config::ConfigOptions;
30use datafusion_common::{ExprSchema, Result, ScalarValue, not_impl_err};
31use datafusion_expr_common::dyn_eq::{DynEq, DynHash};
32use datafusion_expr_common::interval_arithmetic::Interval;
33use std::any::Any;
34use std::cmp::Ordering;
35use std::fmt::Debug;
36use std::hash::{Hash, Hasher};
37use std::sync::Arc;
38
39/// Logical representation of a Scalar User Defined Function.
40///
41/// A scalar function produces a single row output for each row of input. This
42/// struct contains the information DataFusion needs to plan and invoke
43/// functions you supply such as name, type signature, return type, and actual
44/// implementation.
45///
46/// 1. For simple use cases, use [`create_udf`] (examples in [`simple_udf.rs`]).
47///
48/// 2. For advanced use cases, use [`ScalarUDFImpl`] which provides full API
49///    access (examples in  [`advanced_udf.rs`]).
50///
51/// See [`Self::call`] to create an `Expr` which invokes a `ScalarUDF` with arguments.
52///
53/// # API Note
54///
55/// This is a separate struct from [`ScalarUDFImpl`] to maintain backwards
56/// compatibility with the older API.
57///
58/// [`create_udf`]: crate::expr_fn::create_udf
59/// [`simple_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/simple_udf.rs
60/// [`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udf.rs
#[derive(Debug, Clone)]
pub struct ScalarUDF {
    /// The shared [`ScalarUDFImpl`] that provides this function's behavior.
    /// Held in an [`Arc`], so cloning a `ScalarUDF` does not copy the
    /// implementation itself.
    inner: Arc<dyn ScalarUDFImpl>,
}
65
66impl PartialEq for ScalarUDF {
67    fn eq(&self, other: &Self) -> bool {
68        self.inner.dyn_eq(other.inner.as_any())
69    }
70}
71
72impl PartialOrd for ScalarUDF {
73    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
74        let mut cmp = self.name().cmp(other.name());
75        if cmp == Ordering::Equal {
76            cmp = self.signature().partial_cmp(other.signature())?;
77        }
78        if cmp == Ordering::Equal {
79            cmp = self.aliases().partial_cmp(other.aliases())?;
80        }
81        // Contract for PartialOrd and PartialEq consistency requires that
82        // a == b if and only if partial_cmp(a, b) == Some(Equal).
83        if cmp == Ordering::Equal && self != other {
84            // Functions may have other properties besides name and signature
85            // that differentiate two instances (e.g. type, or arbitrary parameters).
86            // We cannot return Some(Equal) in such case.
87            return None;
88        }
89        debug_assert!(
90            cmp == Ordering::Equal || self != other,
91            "Detected incorrect implementation of PartialEq when comparing functions: '{}' and '{}'. \
92            The functions compare as equal, but they are not equal based on general properties that \
93            the PartialOrd implementation observes,",
94            self.name(),
95            other.name()
96        );
97        Some(cmp)
98    }
99}
100
// Marker impl: asserts that the `PartialEq` implementation for `ScalarUDF`
// (which defers to the inner implementation's `dyn_eq`) is a full equivalence
// relation.
impl Eq for ScalarUDF {}
102
103impl Hash for ScalarUDF {
104    fn hash<H: Hasher>(&self, state: &mut H) {
105        self.inner.dyn_hash(state)
106    }
107}
108
109impl ScalarUDF {
110    /// Create a new `ScalarUDF` from a `[ScalarUDFImpl]` trait object
111    ///
112    /// Note this is the same as using the `From` impl (`ScalarUDF::from`)
113    pub fn new_from_impl<F>(fun: F) -> ScalarUDF
114    where
115        F: ScalarUDFImpl + 'static,
116    {
117        Self::new_from_shared_impl(Arc::new(fun))
118    }
119
120    /// Create a new `ScalarUDF` from a `[ScalarUDFImpl]` trait object
121    pub fn new_from_shared_impl(fun: Arc<dyn ScalarUDFImpl>) -> ScalarUDF {
122        Self { inner: fun }
123    }
124
125    /// Return the underlying [`ScalarUDFImpl`] trait object for this function
126    pub fn inner(&self) -> &Arc<dyn ScalarUDFImpl> {
127        &self.inner
128    }
129
130    /// Adds additional names that can be used to invoke this function, in
131    /// addition to `name`
132    ///
133    /// If you implement [`ScalarUDFImpl`] directly you should return aliases directly.
134    pub fn with_aliases(self, aliases: impl IntoIterator<Item = &'static str>) -> Self {
135        Self::new_from_impl(AliasedScalarUDFImpl::new(Arc::clone(&self.inner), aliases))
136    }
137
138    /// Returns a [`Expr`] logical expression to call this UDF with specified
139    /// arguments.
140    ///
141    /// This utility allows easily calling UDFs
142    ///
143    /// # Example
144    /// ```no_run
145    /// use datafusion_expr::{col, lit, ScalarUDF};
146    /// # fn my_udf() -> ScalarUDF { unimplemented!() }
147    /// let my_func: ScalarUDF = my_udf();
148    /// // Create an expr for `my_func(a, 12.3)`
149    /// let expr = my_func.call(vec![col("a"), lit(12.3)]);
150    /// ```
151    pub fn call(&self, args: Vec<Expr>) -> Expr {
152        Expr::ScalarFunction(crate::expr::ScalarFunction::new_udf(
153            Arc::new(self.clone()),
154            args,
155        ))
156    }
157
158    /// Returns this function's name.
159    ///
160    /// See [`ScalarUDFImpl::name`] for more details.
161    pub fn name(&self) -> &str {
162        self.inner.name()
163    }
164
165    /// Returns this function's display_name.
166    ///
167    /// See [`ScalarUDFImpl::display_name`] for more details
168    #[deprecated(
169        since = "50.0.0",
170        note = "This method is unused and will be removed in a future release"
171    )]
172    pub fn display_name(&self, args: &[Expr]) -> Result<String> {
173        #[expect(deprecated)]
174        self.inner.display_name(args)
175    }
176
177    /// Returns this function's schema_name.
178    ///
179    /// See [`ScalarUDFImpl::schema_name`] for more details
180    pub fn schema_name(&self, args: &[Expr]) -> Result<String> {
181        self.inner.schema_name(args)
182    }
183
184    /// Returns the aliases for this function.
185    ///
186    /// See [`ScalarUDF::with_aliases`] for more details
187    pub fn aliases(&self) -> &[String] {
188        self.inner.aliases()
189    }
190
191    /// Returns this function's [`Signature`] (what input types are accepted).
192    ///
193    /// See [`ScalarUDFImpl::signature`] for more details.
194    pub fn signature(&self) -> &Signature {
195        self.inner.signature()
196    }
197
198    /// The datatype this function returns given the input argument types.
199    /// This function is used when the input arguments are [`DataType`]s.
200    ///
201    ///  # Notes
202    ///
203    /// If a function implement [`ScalarUDFImpl::return_field_from_args`],
204    /// its [`ScalarUDFImpl::return_type`] should raise an error.
205    ///
206    /// See [`ScalarUDFImpl::return_type`] for more details.
207    pub fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
208        self.inner.return_type(arg_types)
209    }
210
211    /// Return the datatype this function returns given the input argument types.
212    ///
213    /// See [`ScalarUDFImpl::return_field_from_args`] for more details.
214    pub fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
215        self.inner.return_field_from_args(args)
216    }
217
218    /// Do the function rewrite
219    ///
220    /// See [`ScalarUDFImpl::simplify`] for more details.
221    pub fn simplify(
222        &self,
223        args: Vec<Expr>,
224        info: &dyn SimplifyInfo,
225    ) -> Result<ExprSimplifyResult> {
226        self.inner.simplify(args, info)
227    }
228
229    #[deprecated(since = "50.0.0", note = "Use `return_field_from_args` instead.")]
230    pub fn is_nullable(&self, args: &[Expr], schema: &dyn ExprSchema) -> bool {
231        #[expect(deprecated)]
232        self.inner.is_nullable(args, schema)
233    }
234
235    /// Invoke the function on `args`, returning the appropriate result.
236    ///
237    /// See [`ScalarUDFImpl::invoke_with_args`] for details.
238    pub fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
239        #[cfg(debug_assertions)]
240        let return_field = Arc::clone(&args.return_field);
241        let result = self.inner.invoke_with_args(args)?;
242        // Maybe this could be enabled always?
243        // This doesn't use debug_assert!, but it's meant to run anywhere except on production. It's same in spirit, thus conditioning on debug_assertions.
244        #[cfg(debug_assertions)]
245        {
246            let result_data_type = result.data_type();
247            let expected_type = return_field.data_type();
248            assert_or_internal_err!(
249                result_data_type == *expected_type,
250                "Function '{}' returned value of type '{:?}' while the following type was promised at planning time and expected: '{:?}'",
251                self.name(),
252                result_data_type,
253                expected_type
254            );
255            // TODO verify return data is non-null when it was promised to be?
256        }
257        Ok(result)
258    }
259
260    /// Determines which of the arguments passed to this function are evaluated eagerly
261    /// and which may be evaluated lazily.
262    ///
263    /// See [ScalarUDFImpl::conditional_arguments] for more information.
264    pub fn conditional_arguments<'a>(
265        &self,
266        args: &'a [Expr],
267    ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> {
268        self.inner.conditional_arguments(args)
269    }
270
271    /// Returns true if some of this `exprs` subexpressions may not be evaluated
272    /// and thus any side effects (like divide by zero) may not be encountered.
273    ///
274    /// See [ScalarUDFImpl::short_circuits] for more information.
275    pub fn short_circuits(&self) -> bool {
276        self.inner.short_circuits()
277    }
278
279    /// Computes the output interval for a [`ScalarUDF`], given the input
280    /// intervals.
281    ///
282    /// # Parameters
283    ///
284    /// * `inputs` are the intervals for the inputs (children) of this function.
285    ///
286    /// # Example
287    ///
288    /// If the function is `ABS(a)`, and the input interval is `a: [-3, 2]`,
289    /// then the output interval would be `[0, 3]`.
290    pub fn evaluate_bounds(&self, inputs: &[&Interval]) -> Result<Interval> {
291        self.inner.evaluate_bounds(inputs)
292    }
293
294    /// Updates bounds for child expressions, given a known interval for this
295    /// function. This is used to propagate constraints down through an expression
296    /// tree.
297    ///
298    /// # Parameters
299    ///
300    /// * `interval` is the currently known interval for this function.
301    /// * `inputs` are the current intervals for the inputs (children) of this function.
302    ///
303    /// # Returns
304    ///
305    /// A `Vec` of new intervals for the children, in order.
306    ///
307    /// If constraint propagation reveals an infeasibility for any child, returns
308    /// [`None`]. If none of the children intervals change as a result of
309    /// propagation, may return an empty vector instead of cloning `children`.
310    /// This is the default (and conservative) return value.
311    ///
312    /// # Example
313    ///
314    /// If the function is `ABS(a)`, the current `interval` is `[4, 5]` and the
315    /// input `a` is given as `[-7, 3]`, then propagation would return `[-5, 3]`.
316    pub fn propagate_constraints(
317        &self,
318        interval: &Interval,
319        inputs: &[&Interval],
320    ) -> Result<Option<Vec<Interval>>> {
321        self.inner.propagate_constraints(interval, inputs)
322    }
323
324    /// Calculates the [`SortProperties`] of this function based on its
325    /// children's properties.
326    pub fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties> {
327        self.inner.output_ordering(inputs)
328    }
329
330    pub fn preserves_lex_ordering(&self, inputs: &[ExprProperties]) -> Result<bool> {
331        self.inner.preserves_lex_ordering(inputs)
332    }
333
334    /// See [`ScalarUDFImpl::coerce_types`] for more details.
335    pub fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
336        self.inner.coerce_types(arg_types)
337    }
338
339    /// Returns the documentation for this Scalar UDF.
340    ///
341    /// Documentation can be accessed programmatically as well as
342    /// generating publicly facing documentation.
343    pub fn documentation(&self) -> Option<&Documentation> {
344        self.inner.documentation()
345    }
346
347    /// Return true if this function is an async function
348    pub fn as_async(&self) -> Option<&AsyncScalarUDF> {
349        self.inner().as_any().downcast_ref::<AsyncScalarUDF>()
350    }
351}
352
353impl<F> From<F> for ScalarUDF
354where
355    F: ScalarUDFImpl + 'static,
356{
357    fn from(fun: F) -> Self {
358        Self::new_from_impl(fun)
359    }
360}
361
/// Arguments passed to [`ScalarUDFImpl::invoke_with_args`] when invoking a
/// scalar function.
///
/// The struct is passed by value, so an implementation may take ownership of
/// any of its fields.
#[derive(Debug, Clone)]
pub struct ScalarFunctionArgs {
    /// The evaluated arguments to the function
    pub args: Vec<ColumnarValue>,
    /// Field associated with each arg
    // NOTE(review): presumably one entry per element of `args`, in the same
    // order — confirm against the code that constructs this struct.
    pub arg_fields: Vec<FieldRef>,
    /// The number of rows in record batch being evaluated
    pub number_rows: usize,
    /// The return field of the scalar function returned (from `return_type`
    /// or `return_field_from_args`) when creating the physical expression
    /// from the logical expression
    pub return_field: FieldRef,
    /// The config options at execution time
    pub config_options: Arc<ConfigOptions>,
}
379
380impl ScalarFunctionArgs {
381    /// The return type of the function. See [`Self::return_field`] for more
382    /// details.
383    pub fn return_type(&self) -> &DataType {
384        self.return_field.data_type()
385    }
386}
387
/// Information about arguments passed to the function
///
/// This structure contains metadata about how the function was called
/// such as the type of the arguments, any scalar arguments and if the
/// arguments can (ever) be null
///
/// See [`ScalarUDFImpl::return_field_from_args`] for more information
#[derive(Debug)]
pub struct ReturnFieldArgs<'a> {
    /// The fields of the arguments to the function. Each field carries the
    /// argument's data type and nullability.
    pub arg_fields: &'a [FieldRef],
    /// Is argument `i` to the function a scalar (constant)?
    ///
    /// If the argument `i` is not a scalar, it will be None
    ///
    /// For example, if a function is called like `my_function(column_a, 5)`
    /// this field will be `[None, Some(ScalarValue::Int32(Some(5)))]`
    pub scalar_arguments: &'a [Option<&'a ScalarValue>],
}
407
408/// Trait for implementing user defined scalar functions.
409///
410/// This trait exposes the full API for implementing user defined functions and
411/// can be used to implement any function.
412///
413/// See [`advanced_udf.rs`] for a full example with complete implementation and
414/// [`ScalarUDF`] for other available options.
415///
416/// [`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udf.rs
417///
418/// # Basic Example
419/// ```
420/// # use std::any::Any;
421/// # use std::sync::LazyLock;
422/// # use arrow::datatypes::DataType;
423/// # use datafusion_common::{DataFusionError, plan_err, Result};
424/// # use datafusion_expr::{col, ColumnarValue, Documentation, ScalarFunctionArgs, Signature, Volatility};
425/// # use datafusion_expr::{ScalarUDFImpl, ScalarUDF};
426/// # use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH;
427/// /// This struct for a simple UDF that adds one to an int32
428/// #[derive(Debug, PartialEq, Eq, Hash)]
429/// struct AddOne {
430///   signature: Signature,
431/// }
432///
433/// impl AddOne {
434///   fn new() -> Self {
435///     Self {
436///       signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable),
437///      }
438///   }
439/// }
440///
441/// static DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
442///         Documentation::builder(DOC_SECTION_MATH, "Add one to an int32", "add_one(2)")
443///             .with_argument("arg1", "The int32 number to add one to")
444///             .build()
445///     });
446///
447/// fn get_doc() -> &'static Documentation {
448///     &DOCUMENTATION
449/// }
450///
451/// /// Implement the ScalarUDFImpl trait for AddOne
452/// impl ScalarUDFImpl for AddOne {
453///    fn as_any(&self) -> &dyn Any { self }
454///    fn name(&self) -> &str { "add_one" }
455///    fn signature(&self) -> &Signature { &self.signature }
456///    fn return_type(&self, args: &[DataType]) -> Result<DataType> {
457///      if !matches!(args.get(0), Some(&DataType::Int32)) {
458///        return plan_err!("add_one only accepts Int32 arguments");
459///      }
460///      Ok(DataType::Int32)
461///    }
462///    // The actual implementation would add one to the argument
463///    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
464///         unimplemented!()
465///    }
466///    fn documentation(&self) -> Option<&Documentation> {
467///         Some(get_doc())
468///     }
469/// }
470///
471/// // Create a new ScalarUDF from the implementation
472/// let add_one = ScalarUDF::from(AddOne::new());
473///
474/// // Call the function `add_one(col)`
475/// let expr = add_one.call(vec![col("a")]);
476/// ```
477pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync {
    /// Returns this object as an [`Any`] trait object.
    ///
    /// This is what allows downcasting to a concrete implementation type,
    /// as [`ScalarUDF::as_async`] does.
    fn as_any(&self) -> &dyn Any;
480
    /// Returns this function's name.
    ///
    /// The default [`Self::schema_name`] implementation uses this name as
    /// the prefix of the generated column name.
    fn name(&self) -> &str;
483
    /// Returns any aliases (alternate names) for this function.
    ///
    /// Aliases can be used to invoke the same function using different names.
    /// For example in some databases `now()` and `current_timestamp()` are
    /// aliases for the same function. This behavior can be obtained by
    /// returning `current_timestamp` as an alias for the `now` function.
    ///
    /// Note: `aliases` should only include names other than [`Self::name`].
    ///
    /// The default implementation returns an empty slice (no aliases).
    fn aliases(&self) -> &[String] {
        &[]
    }
496
497    /// Returns the user-defined display name of function, given the arguments
498    ///
499    /// This can be used to customize the output column name generated by this
500    /// function.
501    ///
502    /// Defaults to `name(args[0], args[1], ...)`
503    #[deprecated(
504        since = "50.0.0",
505        note = "This method is unused and will be removed in a future release"
506    )]
507    fn display_name(&self, args: &[Expr]) -> Result<String> {
508        let names: Vec<String> = args.iter().map(ToString::to_string).collect();
509        // TODO: join with ", " to standardize the formatting of Vec<Expr>, <https://github.com/apache/datafusion/issues/10364>
510        Ok(format!("{}({})", self.name(), names.join(",")))
511    }
512
513    /// Returns the name of the column this expression would create
514    ///
515    /// See [`Expr::schema_name`] for details
516    fn schema_name(&self, args: &[Expr]) -> Result<String> {
517        Ok(format!(
518            "{}({})",
519            self.name(),
520            schema_name_from_exprs_comma_separated_without_space(args)?
521        ))
522    }
523
    /// Returns a [`Signature`] describing the argument types for which this
    /// function has an implementation, and the function's [`Volatility`].
    ///
    /// The signature is returned by reference, so implementations need a
    /// `Signature` that lives as long as `self` (for example one stored in a
    /// field).
    ///
    /// See [`Signature`] for more details on argument type handling
    /// and [`Self::return_type`] for computing the return type.
    ///
    /// [`Volatility`]: datafusion_expr_common::signature::Volatility
    fn signature(&self) -> &Signature;
532
    /// [`DataType`] returned by this function, given the types of the
    /// arguments.
    ///
    /// # Arguments
    ///
    /// `arg_types` Data types of the arguments. The implementation of
    /// `return_type` can assume that some other part of the code has coerced
    /// the actual argument types to match [`Self::signature`].
    ///
    /// # Notes
    ///
    /// If you provide an implementation for [`Self::return_field_from_args`],
    /// DataFusion will not call `return_type` (this function). While it is
    /// valid to put [`unimplemented!()`] or [`unreachable!()`], it is
    /// recommended to return [`DataFusionError::Internal`] instead, which
    /// reduces the severity of symptoms if bugs occur (an error rather than a
    /// panic).
    ///
    /// [`DataFusionError::Internal`]: datafusion_common::DataFusionError::Internal
    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType>;
553
    /// Create a new instance of this function with updated configuration.
    ///
    /// This method is called when configuration options change at runtime
    /// (e.g., via `SET` statements) to allow functions that depend on
    /// configuration to update themselves accordingly.
    ///
    /// Note the current [`ConfigOptions`] are also passed to
    /// [`Self::invoke_with_args`], so this API is not needed for functions
    /// where only the returned **values** depend on the current options.
    ///
    /// This API is useful for functions where the return
    /// **type** depends on the configuration options, such as the `now()` function
    /// which depends on the current timezone.
    ///
    /// # Arguments
    ///
    /// * `config` - The updated configuration options
    ///
    /// # Returns
    ///
    /// * `Some(ScalarUDF)` - A new instance of this function configured with the new settings
    /// * `None` - If this function does not change with new configuration settings (the default)
    fn with_updated_config(&self, _config: &ConfigOptions) -> Option<ScalarUDF> {
        // Default: the function does not depend on configuration.
        None
    }
579
580    /// What type will be returned by this function, given the arguments?
581    ///
582    /// By default, this function calls [`Self::return_type`] with the
583    /// types of each argument.
584    ///
585    /// # Notes
586    ///
587    /// For the majority of UDFs, implementing [`Self::return_type`] is sufficient,
588    /// as the result type is typically a deterministic function of the input types
589    /// (e.g., `sqrt(f32)` consistently yields `f32`). Implementing this method directly
590    /// is generally unnecessary unless the return type depends on runtime values.
591    ///
592    /// This function can be used for more advanced cases such as:
593    ///
594    /// 1. specifying nullability
595    /// 2. return types based on the **values** of the arguments (rather than
596    ///    their **types**.
597    ///
598    /// # Example creating `Field`
599    ///
600    /// Note the name of the [`Field`] is ignored, except for structured types such as
601    /// `DataType::Struct`.
602    ///
603    /// ```rust
604    /// # use std::sync::Arc;
605    /// # use arrow::datatypes::{DataType, Field, FieldRef};
606    /// # use datafusion_common::Result;
607    /// # use datafusion_expr::ReturnFieldArgs;
608    /// # struct Example{}
609    /// # impl Example {
610    /// fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
611    ///     // report output is only nullable if any one of the arguments are nullable
612    ///     let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
613    ///     let field = Arc::new(Field::new("ignored_name", DataType::Int32, nullable));
614    ///     Ok(field)
615    /// }
616    /// # }
617    /// ```
618    ///
619    /// # Output Type based on Values
620    ///
621    /// For example, the following two function calls get the same argument
622    /// types (something and a `Utf8` string) but return different types based
623    /// on the value of the second argument:
624    ///
625    /// * `arrow_cast(x, 'Int16')` --> `Int16`
626    /// * `arrow_cast(x, 'Float32')` --> `Float32`
627    ///
628    /// # Requirements
629    ///
630    /// This function **must** consistently return the same type for the same
631    /// logical input even if the input is simplified (e.g. it must return the same
632    /// value for `('foo' | 'bar')` as it does for ('foobar').
633    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
634        let data_types = args
635            .arg_fields
636            .iter()
637            .map(|f| f.data_type())
638            .cloned()
639            .collect::<Vec<_>>();
640        let return_type = self.return_type(&data_types)?;
641        Ok(Arc::new(Field::new(self.name(), return_type, true)))
642    }
643
    /// Deprecated: reports whether the function's result may be null.
    ///
    /// The default implementation ignores its arguments and returns `true`.
    /// Implement [`Self::return_field_from_args`] to describe nullability
    /// instead.
    #[deprecated(
        since = "45.0.0",
        note = "Use `return_field_from_args` instead. if you use `is_nullable` that returns non-nullable with `return_type`, you would need to switch to `return_field_from_args`, you might have error"
    )]
    fn is_nullable(&self, _args: &[Expr], _schema: &dyn ExprSchema) -> bool {
        true
    }
651
    /// Invoke the function returning the appropriate result.
    ///
    /// When called through [`ScalarUDF::invoke_with_args`] in a build with
    /// `debug_assertions`, the data type of the returned value is checked
    /// against the data type of `args.return_field`; a mismatch is reported
    /// as an internal error.
    ///
    /// # Performance
    ///
    /// For the best performance, the implementations should handle the common case
    /// when one or more of their arguments are constant values (aka
    /// [`ColumnarValue::Scalar`]).
    ///
    /// [`ColumnarValue::values_to_arrays`] can be used to convert the arguments
    /// to arrays, which will likely be simpler code, but be slower.
    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue>;
663
    /// Optionally apply per-UDF simplification / rewrite rules.
    ///
    /// This can be used to apply function specific simplification rules during
    /// optimization (e.g. `arrow_cast` --> `Expr::Cast`). The default
    /// implementation does nothing.
    ///
    /// Note that DataFusion handles simplifying arguments and "constant
    /// folding" (replacing a function call with constant arguments such as
    /// `my_add(1,2) --> 3`). Thus, there is no need to implement such
    /// optimizations manually for specific UDFs.
    ///
    /// # Arguments
    /// * `args`: The arguments of the function
    /// * `info`: The necessary information for simplification
    ///
    /// # Returns
    /// [`ExprSimplifyResult`] indicating the result of the simplification.
    ///
    /// NOTE: if the function cannot be simplified, the arguments *MUST* be
    /// returned unmodified.
    ///
    /// # Notes
    ///
    /// The returned expression must have the same schema as the original
    /// expression, including both the data type and nullability. For example,
    /// if the original expression is nullable, the returned expression must
    /// also be nullable, otherwise it may lead to schema verification errors
    /// later in query planning.
    fn simplify(
        &self,
        args: Vec<Expr>,
        _info: &dyn SimplifyInfo,
    ) -> Result<ExprSimplifyResult> {
        // Default: no rewrite, hand the arguments back untouched.
        Ok(ExprSimplifyResult::Original(args))
    }
698
    /// Returns true if some of this function's argument subexpressions may
    /// not be evaluated and thus any side effects (like divide by zero) may
    /// not be encountered.
    ///
    /// Setting this to true prevents certain optimizations such as common
    /// subexpression elimination
    ///
    /// When overriding this function to return `true`, [ScalarUDFImpl::conditional_arguments] can also be
    /// overridden to report more accurately which arguments are eagerly evaluated and which ones
    /// lazily.
    ///
    /// Defaults to `false`.
    fn short_circuits(&self) -> bool {
        false
    }
711
712    /// Determines which of the arguments passed to this function are evaluated eagerly
713    /// and which may be evaluated lazily.
714    ///
715    /// If this function returns `None`, all arguments are eagerly evaluated.
716    /// Returning `None` is a micro optimization that saves a needless `Vec`
717    /// allocation.
718    ///
719    /// If the function returns `Some`, returns (`eager`, `lazy`) where `eager`
720    /// are the arguments that are always evaluated, and `lazy` are the
721    /// arguments that may be evaluated lazily (i.e. may not be evaluated at all
722    /// in some cases).
723    ///
724    /// Implementations must ensure that the two returned `Vec`s are disjunct,
725    /// and that each argument from `args` is present in one the two `Vec`s.
726    ///
727    /// When overriding this function, [ScalarUDFImpl::short_circuits] must
728    /// be overridden to return `true`.
729    fn conditional_arguments<'a>(
730        &self,
731        args: &'a [Expr],
732    ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> {
733        if self.short_circuits() {
734            Some((vec![], args.iter().collect()))
735        } else {
736            None
737        }
738    }
739
    /// Computes the output [`Interval`] for a [`ScalarUDFImpl`], given the input
    /// intervals.
    ///
    /// # Parameters
    ///
    /// * `_input` are the intervals for the children (inputs) of this function.
    ///
    /// # Example
    ///
    /// If the function is `ABS(a)`, and the input interval is `a: [-3, 2]`,
    /// then the output interval would be `[0, 3]`.
    ///
    /// # Default
    ///
    /// The default implementation ignores the inputs and returns
    /// `Interval::make_unbounded(&DataType::Null)`.
    fn evaluate_bounds(&self, _input: &[&Interval]) -> Result<Interval> {
        // We cannot assume the input datatype is the same of output type.
        Interval::make_unbounded(&DataType::Null)
    }
755
    /// Updates bounds for child expressions, given a known [`Interval`] for this
    /// function.
    ///
    /// This function is used to propagate constraints down through an
    /// expression tree.
    ///
    /// # Parameters
    ///
    /// * `_interval` is the currently known interval for this function.
    /// * `_inputs` are the current intervals for the inputs (children) of this function.
    ///
    /// # Returns
    ///
    /// A `Vec` of new intervals for the children, in order.
    ///
    /// If constraint propagation reveals an infeasibility for any child, returns
    /// [`None`]. If none of the children intervals change as a result of
    /// propagation, may return an empty vector instead of cloning `_inputs`.
    /// This is the default (and conservative) return value.
    ///
    /// # Example
    ///
    /// If the function is `ABS(a)`, the current `interval` is `[4, 5]` and the
    /// input `a` is given as `[-7, 3]`, then propagation would return `[-5, 3]`.
    fn propagate_constraints(
        &self,
        _interval: &Interval,
        _inputs: &[&Interval],
    ) -> Result<Option<Vec<Interval>>> {
        // Empty vector: no child interval was changed.
        Ok(Some(vec![]))
    }
787
788    /// Calculates the [`SortProperties`] of this function based on its children's properties.
789    fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties> {
790        if !self.preserves_lex_ordering(inputs)? {
791            return Ok(SortProperties::Unordered);
792        }
793
794        let Some(first_order) = inputs.first().map(|p| &p.sort_properties) else {
795            return Ok(SortProperties::Singleton);
796        };
797
798        if inputs
799            .iter()
800            .skip(1)
801            .all(|input| &input.sort_properties == first_order)
802        {
803            Ok(*first_order)
804        } else {
805            Ok(SortProperties::Unordered)
806        }
807    }
808
809    /// Returns true if the function preserves lexicographical ordering based on
810    /// the input ordering.
811    ///
812    /// For example, `concat(a || b)` preserves lexicographical ordering, but `abs(a)` does not.
813    fn preserves_lex_ordering(&self, _inputs: &[ExprProperties]) -> Result<bool> {
814        Ok(false)
815    }
816
817    /// Coerce arguments of a function call to types that the function can evaluate.
818    ///
819    /// This function is only called if [`ScalarUDFImpl::signature`] returns
820    /// [`crate::TypeSignature::UserDefined`]. Most UDFs should return one of
821    /// the other variants of [`TypeSignature`] which handle common cases.
822    ///
823    /// See the [type coercion module](crate::type_coercion)
824    /// documentation for more details on type coercion
825    ///
826    /// [`TypeSignature`]: crate::TypeSignature
827    ///
828    /// For example, if your function requires a floating point arguments, but the user calls
829    /// it like `my_func(1::int)` (i.e. with `1` as an integer), coerce_types can return `[DataType::Float64]`
830    /// to ensure the argument is converted to `1::double`
831    ///
832    /// # Parameters
833    /// * `arg_types`: The argument types of the arguments  this function with
834    ///
835    /// # Return value
836    /// A Vec the same length as `arg_types`. DataFusion will `CAST` the function call
837    /// arguments to these specific types.
838    fn coerce_types(&self, _arg_types: &[DataType]) -> Result<Vec<DataType>> {
839        not_impl_err!("Function {} does not implement coerce_types", self.name())
840    }
841
842    /// Returns the documentation for this Scalar UDF.
843    ///
844    /// Documentation can be accessed programmatically as well as generating
845    /// publicly facing documentation.
846    fn documentation(&self) -> Option<&Documentation> {
847        None
848    }
849}
850
851/// ScalarUDF that adds an alias to the underlying function. It is better to
852/// implement [`ScalarUDFImpl`], which supports aliases, directly if possible.
853#[derive(Debug, PartialEq, Eq, Hash)]
854struct AliasedScalarUDFImpl {
855    inner: UdfEq<Arc<dyn ScalarUDFImpl>>,
856    aliases: Vec<String>,
857}
858
859impl AliasedScalarUDFImpl {
860    pub fn new(
861        inner: Arc<dyn ScalarUDFImpl>,
862        new_aliases: impl IntoIterator<Item = &'static str>,
863    ) -> Self {
864        let mut aliases = inner.aliases().to_vec();
865        aliases.extend(new_aliases.into_iter().map(|s| s.to_string()));
866        Self {
867            inner: inner.into(),
868            aliases,
869        }
870    }
871}
872
873#[warn(clippy::missing_trait_methods)] // Delegates, so it should implement every single trait method
874impl ScalarUDFImpl for AliasedScalarUDFImpl {
875    fn as_any(&self) -> &dyn Any {
876        self
877    }
878
879    fn name(&self) -> &str {
880        self.inner.name()
881    }
882
883    fn display_name(&self, args: &[Expr]) -> Result<String> {
884        #[expect(deprecated)]
885        self.inner.display_name(args)
886    }
887
888    fn schema_name(&self, args: &[Expr]) -> Result<String> {
889        self.inner.schema_name(args)
890    }
891
892    fn signature(&self) -> &Signature {
893        self.inner.signature()
894    }
895
896    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
897        self.inner.return_type(arg_types)
898    }
899
900    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
901        self.inner.return_field_from_args(args)
902    }
903
904    fn is_nullable(&self, args: &[Expr], schema: &dyn ExprSchema) -> bool {
905        #[expect(deprecated)]
906        self.inner.is_nullable(args, schema)
907    }
908
909    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
910        self.inner.invoke_with_args(args)
911    }
912
913    fn with_updated_config(&self, _config: &ConfigOptions) -> Option<ScalarUDF> {
914        None
915    }
916
917    fn aliases(&self) -> &[String] {
918        &self.aliases
919    }
920
921    fn simplify(
922        &self,
923        args: Vec<Expr>,
924        info: &dyn SimplifyInfo,
925    ) -> Result<ExprSimplifyResult> {
926        self.inner.simplify(args, info)
927    }
928
929    fn conditional_arguments<'a>(
930        &self,
931        args: &'a [Expr],
932    ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> {
933        self.inner.conditional_arguments(args)
934    }
935
936    fn short_circuits(&self) -> bool {
937        self.inner.short_circuits()
938    }
939
940    fn evaluate_bounds(&self, input: &[&Interval]) -> Result<Interval> {
941        self.inner.evaluate_bounds(input)
942    }
943
944    fn propagate_constraints(
945        &self,
946        interval: &Interval,
947        inputs: &[&Interval],
948    ) -> Result<Option<Vec<Interval>>> {
949        self.inner.propagate_constraints(interval, inputs)
950    }
951
952    fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties> {
953        self.inner.output_ordering(inputs)
954    }
955
956    fn preserves_lex_ordering(&self, inputs: &[ExprProperties]) -> Result<bool> {
957        self.inner.preserves_lex_ordering(inputs)
958    }
959
960    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
961        self.inner.coerce_types(arg_types)
962    }
963
964    fn documentation(&self) -> Option<&Documentation> {
965        self.inner.documentation()
966    }
967}
968
#[cfg(test)]
mod tests {
    use super::*;
    use datafusion_expr_common::signature::Volatility;
    use std::hash::DefaultHasher;

    /// Minimal UDF implementation whose derived equality and hash depend on
    /// `name`, `field` and `signature`.
    #[derive(Debug, PartialEq, Eq, Hash)]
    struct TestScalarUDFImpl {
        name: &'static str,
        field: &'static str,
        signature: Signature,
    }

    impl ScalarUDFImpl for TestScalarUDFImpl {
        fn as_any(&self) -> &dyn Any {
            self
        }

        fn name(&self) -> &str {
            self.name
        }

        fn signature(&self) -> &Signature {
            &self.signature
        }

        fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
            unimplemented!()
        }

        fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
            unimplemented!()
        }
    }

    /// Builds a [`ScalarUDF`] around a [`TestScalarUDFImpl`] with the given
    /// name and parameter.
    fn test_func(name: &'static str, parameter: &'static str) -> ScalarUDF {
        let implementation = TestScalarUDFImpl {
            name,
            field: parameter,
            signature: Signature::any(1, Volatility::Immutable),
        };
        ScalarUDF::from(implementation)
    }

    /// Hashes `value` with a fresh [`DefaultHasher`].
    fn hash<T: Hash>(value: &T) -> u64 {
        let mut state = DefaultHasher::new();
        value.hash(&mut state);
        state.finish()
    }

    // PartialEq and Hash must be consistent, and also PartialEq and PartialOrd
    // must be consistent, so they are tested together.
    #[test]
    fn test_partial_eq_hash_and_partial_ord() {
        // A parameterized function
        let foo_a = test_func("foo", "a");

        // Same as `foo_a`, but a different instance
        let foo_a_again = test_func("foo", "a");
        assert_eq!(foo_a, foo_a_again);
        assert_eq!(hash(&foo_a), hash(&foo_a_again));
        assert_eq!(foo_a.partial_cmp(&foo_a_again), Some(Ordering::Equal));

        // Different parameter
        let foo_b = test_func("foo", "b");
        assert_ne!(foo_a, foo_b);
        // hash can collide for different values but does not collide in this test
        assert_ne!(hash(&foo_a), hash(&foo_b));
        assert_eq!(foo_a.partial_cmp(&foo_b), None);

        // Different name
        let other_a = test_func("other", "a");
        assert_ne!(foo_a, other_a);
        // hash can collide for different values but does not collide in this test
        assert_ne!(hash(&foo_a), hash(&other_a));
        assert_eq!(foo_a.partial_cmp(&other_a), Some(Ordering::Less));

        // Different name and parameter
        assert_ne!(foo_b, other_a);
        // hash can collide for different values but does not collide in this test
        assert_ne!(hash(&foo_b), hash(&other_a));
        assert_eq!(foo_b.partial_cmp(&other_a), Some(Ordering::Less));
    }
}
datafusion_expr/udf.rs

datafusion_expr/
udf.rs