datafusion_functions/unicode/
character_length.rs1use crate::utils::{make_scalar_function, utf8_to_int_type};
19use arrow::array::{
20 Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveArray,
21 StringArrayType,
22};
23use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
24use datafusion_common::Result;
25use datafusion_expr::{
26 ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
27 Volatility,
28};
29use datafusion_macros::user_doc;
30use std::sync::Arc;
31
32#[user_doc(
33 doc_section(label = "String Functions"),
34 description = "Returns the number of characters in a string.",
35 syntax_example = "character_length(str)",
36 sql_example = r#"```sql
37> select character_length('Ångström');
38+------------------------------------+
39| character_length(Utf8("Ångström")) |
40+------------------------------------+
41| 8 |
42+------------------------------------+
43```"#,
44 standard_argument(name = "str", prefix = "String"),
45 related_udf(name = "bit_length"),
46 related_udf(name = "octet_length")
47)]
48#[derive(Debug, PartialEq, Eq, Hash)]
49pub struct CharacterLengthFunc {
50 signature: Signature,
51 aliases: Vec<String>,
52}
53
54impl Default for CharacterLengthFunc {
55 fn default() -> Self {
56 Self::new()
57 }
58}
59
60impl CharacterLengthFunc {
61 pub fn new() -> Self {
62 use DataType::*;
63 Self {
64 signature: Signature::uniform(
65 1,
66 vec![Utf8, LargeUtf8, Utf8View],
67 Volatility::Immutable,
68 ),
69 aliases: vec![String::from("length"), String::from("char_length")],
70 }
71 }
72}
73
74impl ScalarUDFImpl for CharacterLengthFunc {
75 fn name(&self) -> &str {
76 "character_length"
77 }
78
79 fn signature(&self) -> &Signature {
80 &self.signature
81 }
82
83 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
84 utf8_to_int_type(&arg_types[0], "character_length")
85 }
86
87 fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
88 make_scalar_function(character_length, vec![])(&args.args)
89 }
90
91 fn aliases(&self) -> &[String] {
92 &self.aliases
93 }
94
95 fn documentation(&self) -> Option<&Documentation> {
96 self.doc()
97 }
98}
99
100fn character_length(args: &[ArrayRef]) -> Result<ArrayRef> {
104 match args[0].data_type() {
105 DataType::Utf8 => {
106 let string_array = args[0].as_string::<i32>();
107 character_length_general::<Int32Type, _>(&string_array)
108 }
109 DataType::LargeUtf8 => {
110 let string_array = args[0].as_string::<i64>();
111 character_length_general::<Int64Type, _>(&string_array)
112 }
113 DataType::Utf8View => {
114 let string_array = args[0].as_string_view();
115 character_length_general::<Int32Type, _>(&string_array)
116 }
117 _ => unreachable!("CharacterLengthFunc"),
118 }
119}
120
121fn character_length_general<'a, T, V>(array: &V) -> Result<ArrayRef>
122where
123 T: ArrowPrimitiveType,
124 T::Native: OffsetSizeTrait,
125 V: StringArrayType<'a>,
126{
127 let is_array_ascii_only = array.is_ascii();
132 let nulls = array.nulls().cloned();
133 let array = {
134 if is_array_ascii_only {
135 let values: Vec<_> = (0..array.len())
136 .map(|i| {
137 let value = unsafe { array.value_unchecked(i) };
139 T::Native::usize_as(value.len())
140 })
141 .collect();
142 PrimitiveArray::<T>::new(values.into(), nulls)
143 } else {
144 let values: Vec<_> = (0..array.len())
145 .map(|i| {
146 if array.is_null(i) {
148 T::default_value()
149 } else {
150 let value = unsafe { array.value_unchecked(i) };
151 if value.is_empty() {
152 T::default_value()
153 } else if value.is_ascii() {
154 T::Native::usize_as(value.len())
155 } else {
156 T::Native::usize_as(value.chars().count())
157 }
158 }
159 })
160 .collect();
161 PrimitiveArray::<T>::new(values.into(), nulls)
162 }
163 };
164
165 Ok(Arc::new(array))
166}
167
#[cfg(test)]
mod tests {
    use crate::unicode::character_length::CharacterLengthFunc;
    use crate::utils::test::test_function;
    use arrow::array::{Array, Int32Array, Int64Array};
    use arrow::datatypes::DataType::{Int32, Int64};
    use datafusion_common::{Result, ScalarValue};
    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

    /// Runs one scalar input through `CharacterLengthFunc` three times: as
    /// `Utf8` (expects `Int32`), `LargeUtf8` (expects `Int64`) and `Utf8View`
    /// (expects `Int32`).
    macro_rules! test_character_length {
        ($input:expr, $expected:expr) => {
            test_function!(
                CharacterLengthFunc::new(),
                vec![ColumnarValue::Scalar(ScalarValue::Utf8($input))],
                $expected,
                i32,
                Int32,
                Int32Array
            );

            test_function!(
                CharacterLengthFunc::new(),
                vec![ColumnarValue::Scalar(ScalarValue::LargeUtf8($input))],
                $expected,
                i64,
                Int64,
                Int64Array
            );

            test_function!(
                CharacterLengthFunc::new(),
                vec![ColumnarValue::Scalar(ScalarValue::Utf8View($input))],
                $expected,
                i32,
                Int32,
                Int32Array
            );
        };
    }

    #[test]
    fn test_functions() -> Result<()> {
        #[cfg(feature = "unicode_expressions")]
        {
            // Pure ASCII input.
            test_character_length!(Some("chars".to_string()), Ok(Some(5)));
            // Inputs with a multi-byte character: characters are counted, not bytes.
            test_character_length!(Some("josé".to_string()), Ok(Some(4)));
            test_character_length!(Some("joséjoséjoséjosé".to_string()), Ok(Some(16)));
            // Empty and null inputs.
            test_character_length!(Some("".to_string()), Ok(Some(0)));
            test_character_length!(None, Ok(None));
        }

        // NOTE(review): this fallback is only compiled without the
        // `unicode_expressions` feature. `internal_err!` is not imported in
        // this module and the arguments are passed as `&[...]` rather than
        // `vec![...]` like the calls above — confirm this branch still builds.
        #[cfg(not(feature = "unicode_expressions"))]
        test_function!(
            CharacterLengthFunc::new(),
            &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
                String::from("josé")
            )))],
            internal_err!(
                "function character_length requires compilation with feature flag: unicode_expressions."
            ),
            i32,
            Int32,
            Int32Array
        );

        Ok(())
    }
}