use crate::string::common::to_upper;
use arrow::datatypes::DataType;
use datafusion_common::Result;
use datafusion_common::types::logical_string;
use datafusion_expr::{
Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
TypeSignatureClass, Volatility,
};
use datafusion_macros::user_doc;
/// Scalar function `upper`, which converts a string to upper-case.
///
/// The user-facing documentation (section, description, syntax, SQL example
/// and related functions) is declared in the `user_doc` attribute below.
#[user_doc(
doc_section(label = "String Functions"),
description = "Converts a string to upper-case.",
syntax_example = "upper(str)",
sql_example = r#"```sql
> select upper('dataFusion');
+---------------------------+
| upper(Utf8("dataFusion")) |
+---------------------------+
| DATAFUSION |
+---------------------------+
```"#,
standard_argument(name = "str", prefix = "String"),
related_udf(name = "initcap"),
related_udf(name = "lower")
)]
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct UpperFunc {
/// Argument signature of the function; populated by `UpperFunc::new`.
signature: Signature,
}
impl Default for UpperFunc {
fn default() -> Self {
Self::new()
}
}
impl UpperFunc {
    /// Creates the `upper` function together with its argument signature.
    ///
    /// The signature is a coercible one with a single entry: an exact
    /// coercion to the native logical string type class. The function is
    /// declared with [`Volatility::Immutable`].
    pub fn new() -> Self {
        let string_class = TypeSignatureClass::Native(logical_string());
        let signature = Signature::coercible(
            vec![Coercion::new_exact(string_class)],
            Volatility::Immutable,
        );
        Self { signature }
    }
}
impl ScalarUDFImpl for UpperFunc {
fn name(&self) -> &str {
"upper"
}
fn signature(&self) -> &Signature {
&self.signature
}
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
Ok(arg_types[0].clone())
}
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
to_upper(&args.args, "upper")
}
fn documentation(&self) -> Option<&Documentation> {
self.doc()
}
}
#[cfg(test)]
mod tests {
use super::*;
use arrow::array::{Array, ArrayRef, StringArray, StringViewArray};
use arrow::datatypes::Field;
use datafusion_common::config::ConfigOptions;
use std::sync::Arc;
/// Invokes `upper` on a single array argument and returns the output array.
///
/// Both the argument field and the return field are declared nullable and
/// carry the input array's data type. A non-array result is treated as
/// unreachable.
fn invoke_upper(input: ArrayRef) -> Result<ArrayRef> {
    let row_count = input.len();
    let input_type = input.data_type().clone();
    let arg_field = Field::new("a", input_type.clone(), true);
    let return_field = Field::new("f", input_type, true);

    let call = ScalarFunctionArgs {
        args: vec![ColumnarValue::Array(input)],
        arg_fields: vec![arg_field.into()],
        number_rows: row_count,
        return_field: return_field.into(),
        config_options: Arc::new(ConfigOptions::default()),
    };

    let output = UpperFunc::new().invoke_with_args(call)?;
    if let ColumnarValue::Array(array) = output {
        Ok(array)
    } else {
        unreachable!("upper")
    }
}
/// Asserts that running `upper` over `input` produces exactly `expected`.
///
/// This test helper is defined locally, so inside this module the name
/// `to_upper` refers to it rather than to anything pulled in by `use super::*`.
fn to_upper(input: ArrayRef, expected: ArrayRef) -> Result<()> {
    invoke_upper(input).map(|actual| {
        assert_eq!(&expected, &actual);
    })
}
/// Utf8 input whose first value is non-ASCII (and expected to come back
/// unchanged), followed by a null, ASCII text, digits and an empty string.
#[test]
fn upper_maybe_optimization() -> Result<()> {
    // (input, expected) pairs, one per row.
    let cases = [
        (Some("农历新年"), Some("农历新年")),
        (None, None),
        (Some("datafusion"), Some("DATAFUSION")),
        (Some("0123456789"), Some("0123456789")),
        (Some(""), Some("")),
    ];
    let (given, want): (Vec<Option<&str>>, Vec<Option<&str>>) =
        cases.iter().copied().unzip();
    let given: ArrayRef = Arc::new(StringArray::from(given));
    let want: ArrayRef = Arc::new(StringArray::from(want));
    to_upper(given, want)
}
/// Utf8 input in which every non-null value is pure ASCII.
#[test]
fn upper_full_optimization() -> Result<()> {
    // (input, expected) pairs, one per row.
    let cases = [
        (Some("arrow"), Some("ARROW")),
        (None, None),
        (Some("datafusion"), Some("DATAFUSION")),
        (Some("0123456789"), Some("0123456789")),
        (Some(""), Some("")),
    ];
    let (given, want): (Vec<Option<&str>>, Vec<Option<&str>>) =
        cases.iter().copied().unzip();
    let given: ArrayRef = Arc::new(StringArray::from(given));
    let want: ArrayRef = Arc::new(StringArray::from(want));
    to_upper(given, want)
}
/// Utf8 input mixing ASCII rows with non-ASCII rows (Greek, German `ß`,
/// `ⱦ`, CJK), plus a null, punctuation, whitespace and an empty string.
#[test]
fn upper_partial_optimization() -> Result<()> {
    // (input, expected) pairs, one per row.
    let cases = [
        (Some("arrow"), Some("ARROW")),
        (None, None),
        (Some("datafusion"), Some("DATAFUSION")),
        (Some("@_"), Some("@_")),
        (Some("0123456789"), Some("0123456789")),
        (Some(""), Some("")),
        (Some("\t\n"), Some("\t\n")),
        (Some("ὀδυσσεύς"), Some("ὈΔΥΣΣΕΎΣ")),
        (Some("tschüß"), Some("TSCHÜSS")),
        (Some("ⱦ"), Some("Ⱦ")),
        (Some("农历新年"), Some("农历新年")),
    ];
    let (given, want): (Vec<Option<&str>>, Vec<Option<&str>>) =
        cases.iter().copied().unzip();
    let given: ArrayRef = Arc::new(StringArray::from(given));
    let want: ArrayRef = Arc::new(StringArray::from(want));
    to_upper(given, want)
}
/// Utf8View input containing an ASCII value, a null and a non-ASCII value
/// whose upper-case form is longer (`ß` becomes `SS`).
#[test]
fn upper_utf8view() -> Result<()> {
    // (input, expected) pairs, one per row.
    let cases = [
        (Some("arrow"), Some("ARROW")),
        (None, None),
        (Some("tschüß"), Some("TSCHÜSS")),
    ];
    let (given, want): (Vec<Option<&str>>, Vec<Option<&str>>) =
        cases.iter().copied().unzip();
    let given: ArrayRef = Arc::new(StringViewArray::from(given));
    let want: ArrayRef = Arc::new(StringViewArray::from(want));
    to_upper(given, want)
}
/// Utf8View input in which every non-null value is pure ASCII, with both
/// short values and values longer than 12 bytes.
#[test]
fn upper_ascii_utf8view() -> Result<()> {
    // (input, expected) pairs, one per row.
    let cases = [
        (Some("arrow"), Some("ARROW")),
        (None, None),
        (Some("hello world 123"), Some("HELLO WORLD 123")),
        (Some(""), Some("")),
        (Some("0123456789"), Some("0123456789")),
        (Some("datafusion is cool"), Some("DATAFUSION IS COOL")),
    ];
    let (given, want): (Vec<Option<&str>>, Vec<Option<&str>>) =
        cases.iter().copied().unzip();
    let given: ArrayRef = Arc::new(StringViewArray::from(given));
    let want: ArrayRef = Arc::new(StringViewArray::from(want));
    to_upper(given, want)
}
/// Upper-cases a two-row slice of a four-row Utf8View array.
///
/// The slice skips the parent's non-ASCII first row and its last row. The
/// output is expected to carry a single data buffer holding only the two
/// sliced values (15 + 17 = 32 bytes).
#[test]
fn upper_sliced_ascii_utf8view() -> Result<()> {
    let parent: ArrayRef = Arc::new(StringViewArray::from(vec![
        Some("农历新年long enough for buffer"),
        Some("hello world 123"),
        Some("datafusion rocks!"),
        Some("zzzzzzzzzzzzzzzz"),
    ]));

    let output = invoke_upper(parent.slice(1, 2))?;
    let views = output.as_any().downcast_ref::<StringViewArray>().unwrap();

    let want = StringViewArray::from(vec![
        Some("HELLO WORLD 123"),
        Some("DATAFUSION ROCKS!"),
    ]);
    assert_eq!(views, &want);

    let buffers = views.data_buffers();
    assert_eq!(buffers.len(), 1);
    assert_eq!(buffers[0].len(), 32);
    Ok(())
}
/// Utf8View input whose values are all at most 12 bytes long; the output is
/// expected to contain no data buffers at all.
#[test]
fn upper_utf8view_inline_only_no_buffers() -> Result<()> {
    let short_values: ArrayRef = Arc::new(StringViewArray::from(vec![
        Some("hello"),
        None,
        Some(""),
        Some("0123456789AB"),
    ]));

    let output = invoke_upper(short_values)?;
    let views = output.as_any().downcast_ref::<StringViewArray>().unwrap();

    let want = StringViewArray::from(vec![
        Some("HELLO"),
        None,
        Some(""),
        Some("0123456789AB"),
    ]);
    assert_eq!(views, &want);

    let buffer_count = views.data_buffers().len();
    assert_eq!(
        buffer_count, 0,
        "inline-only Utf8View should produce no data buffers"
    );
    Ok(())
}
/// Utf8View input mixing three values longer than 12 bytes with a short
/// value and a null.
///
/// The output is expected to use one data buffer whose length equals the
/// combined length of the three long values (15 + 17 + 19); the short value
/// and the null contribute nothing to it.
#[test]
fn upper_utf8view_long_packs_tight() -> Result<()> {
    let mixed: ArrayRef = Arc::new(StringViewArray::from(vec![
        Some("hello world 123"),
        Some("abc"),
        None,
        Some("datafusion rocks!"),
        Some("another long string"),
    ]));

    let output = invoke_upper(mixed)?;
    let views = output.as_any().downcast_ref::<StringViewArray>().unwrap();

    let want = StringViewArray::from(vec![
        Some("HELLO WORLD 123"),
        Some("ABC"),
        None,
        Some("DATAFUSION ROCKS!"),
        Some("ANOTHER LONG STRING"),
    ]);
    assert_eq!(views, &want);

    let buffers = views.data_buffers();
    assert_eq!(buffers.len(), 1);
    assert_eq!(buffers[0].len(), 15 + 17 + 19);
    Ok(())
}
/// Upper-cases 40 Utf8View values of 500 bytes each (20,000 bytes in total).
///
/// The output is expected to span at least two data buffers, and the buffer
/// lengths must add up to exactly the total number of value bytes.
#[test]
fn upper_utf8view_splits_into_multiple_buffers() -> Result<()> {
    const STR_LEN: usize = 500;
    const N: usize = 40;

    let value = "x".repeat(STR_LEN);
    let inputs: Vec<Option<String>> = (0..N).map(|_| Some(value.clone())).collect();
    // `inputs` is not used again, so it is moved into the array instead of cloned.
    let input = Arc::new(StringViewArray::from(inputs)) as ArrayRef;

    let result = invoke_upper(input)?;
    let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();

    let expected_value = "X".repeat(STR_LEN);
    let expected: Vec<Option<&str>> =
        (0..N).map(|_| Some(expected_value.as_str())).collect();
    assert_eq!(result_sv, &StringViewArray::from(expected));

    assert!(
        result_sv.data_buffers().len() >= 2,
        "expected the output to span more than one data buffer, got {}",
        result_sv.data_buffers().len()
    );

    // No padding or duplicated bytes: the buffers hold exactly the value bytes.
    let total: usize = result_sv.data_buffers().iter().map(|b| b.len()).sum();
    assert_eq!(total, N * STR_LEN);
    Ok(())
}
/// Upper-cases a three-row slice taken from the middle of a five-row Utf8
/// array.
///
/// The output's value data is expected to hold only the sliced rows'
/// bytes (5 + 5 + 0 = 10), not those of the parent's first and last rows.
#[test]
fn upper_sliced_utf8() -> Result<()> {
    let parent: ArrayRef = Arc::new(StringArray::from(vec![
        Some("aaaaaaaa"),
        Some("hello"),
        Some("world"),
        Some(""),
        Some("zzzzzzzz"),
    ]));

    let output = invoke_upper(parent.slice(1, 3))?;
    let strings = output.as_any().downcast_ref::<StringArray>().unwrap();

    let want = StringArray::from(vec![Some("HELLO"), Some("WORLD"), Some("")]);
    assert_eq!(strings, &want);

    let value_bytes = strings.value_data().len();
    assert_eq!(value_bytes, 10);
    Ok(())
}
}