datafusion_functions/unicode/
initcap.rs1use std::any::Any;
19use std::sync::Arc;
20
21use arrow::array::{
22 Array, ArrayRef, GenericStringBuilder, OffsetSizeTrait, StringViewBuilder,
23};
24use arrow::datatypes::DataType;
25
26use crate::utils::{make_scalar_function, utf8_to_str_type};
27use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
28use datafusion_common::types::logical_string;
29use datafusion_common::{exec_err, Result};
30use datafusion_expr::{
31 Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignatureClass,
32 Volatility,
33};
34use datafusion_macros::user_doc;
35
36#[user_doc(
37 doc_section(label = "String Functions"),
38 description = "Capitalizes the first character in each word in the input string. \
39 Words are delimited by non-alphanumeric characters.",
40 syntax_example = "initcap(str)",
41 sql_example = r#"```sql
42> select initcap('apache datafusion');
43+------------------------------------+
44| initcap(Utf8("apache datafusion")) |
45+------------------------------------+
46| Apache Datafusion |
47+------------------------------------+
48```"#,
49 standard_argument(name = "str", prefix = "String"),
50 related_udf(name = "lower"),
51 related_udf(name = "upper")
52)]
53#[derive(Debug, PartialEq, Eq, Hash)]
54pub struct InitcapFunc {
55 signature: Signature,
56}
57
58impl Default for InitcapFunc {
59 fn default() -> Self {
60 InitcapFunc::new()
61 }
62}
63
64impl InitcapFunc {
65 pub fn new() -> Self {
66 Self {
67 signature: Signature::coercible(
68 vec![Coercion::new_exact(TypeSignatureClass::Native(
69 logical_string(),
70 ))],
71 Volatility::Immutable,
72 ),
73 }
74 }
75}
76
77impl ScalarUDFImpl for InitcapFunc {
78 fn as_any(&self) -> &dyn Any {
79 self
80 }
81
82 fn name(&self) -> &str {
83 "initcap"
84 }
85
86 fn signature(&self) -> &Signature {
87 &self.signature
88 }
89
90 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
91 if let DataType::Utf8View = arg_types[0] {
92 Ok(DataType::Utf8View)
93 } else {
94 utf8_to_str_type(&arg_types[0], "initcap")
95 }
96 }
97
98 fn invoke_with_args(
99 &self,
100 args: datafusion_expr::ScalarFunctionArgs,
101 ) -> Result<ColumnarValue> {
102 let args = &args.args;
103 match args[0].data_type() {
104 DataType::Utf8 => make_scalar_function(initcap::<i32>, vec![])(args),
105 DataType::LargeUtf8 => make_scalar_function(initcap::<i64>, vec![])(args),
106 DataType::Utf8View => make_scalar_function(initcap_utf8view, vec![])(args),
107 other => {
108 exec_err!("Unsupported data type {other:?} for function `initcap`")
109 }
110 }
111 }
112
113 fn documentation(&self) -> Option<&Documentation> {
114 self.doc()
115 }
116}
117
118fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
127 let string_array = as_generic_string_array::<T>(&args[0])?;
128
129 let mut builder = GenericStringBuilder::<T>::with_capacity(
130 string_array.len(),
131 string_array.value_data().len(),
132 );
133
134 let mut container = String::new();
135 string_array.iter().for_each(|str| match str {
136 Some(s) => {
137 initcap_string(s, &mut container);
138 builder.append_value(&container);
139 }
140 None => builder.append_null(),
141 });
142
143 Ok(Arc::new(builder.finish()) as ArrayRef)
144}
145
146fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
147 let string_view_array = as_string_view_array(&args[0])?;
148
149 let mut builder = StringViewBuilder::with_capacity(string_view_array.len());
150
151 let mut container = String::new();
152 string_view_array.iter().for_each(|str| match str {
153 Some(s) => {
154 initcap_string(s, &mut container);
155 builder.append_value(&container);
156 }
157 None => builder.append_null(),
158 });
159
160 Ok(Arc::new(builder.finish()) as ArrayRef)
161}
162
/// Writes the "initial capitals" form of `input` into `container`.
///
/// `container` is cleared first, so any previous content is discarded. A
/// character is upper-cased when the character before it is not alphanumeric
/// (or when it is the first character); otherwise it is lower-cased.
///
/// Pure-ASCII input uses the ASCII-only case conversions; any other input
/// uses the Unicode conversions, which may expand one character into several.
fn initcap_string(input: &str, container: &mut String) {
    container.clear();

    // True while the previously visited character was alphanumeric, i.e. we
    // are in the middle of a word.
    let mut inside_word = false;

    if !input.is_ascii() {
        for ch in input.chars() {
            match inside_word {
                true => container.extend(ch.to_lowercase()),
                false => container.extend(ch.to_uppercase()),
            }
            inside_word = ch.is_alphanumeric();
        }
        return;
    }

    for ch in input.chars() {
        let cased = if inside_word {
            ch.to_ascii_lowercase()
        } else {
            ch.to_ascii_uppercase()
        };
        container.push(cased);
        inside_word = ch.is_ascii_alphanumeric();
    }
}
187
#[cfg(test)]
mod tests {
    use crate::unicode::initcap::InitcapFunc;
    use crate::utils::test::test_function;
    use arrow::array::{Array, LargeStringArray, StringArray, StringViewArray};
    use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
    use datafusion_common::{Result, ScalarValue};
    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

    /// Exercises `initcap` through the UDF entry point for each supported
    /// string representation (`Utf8`, `LargeUtf8`, `Utf8View`).
    #[test]
    fn test_functions() -> Result<()> {
        // --- Utf8 ---
        test_function!(
            InitcapFunc::new(),
            vec![ColumnarValue::Scalar(ScalarValue::from("hi THOMAS"))],
            Ok(Some("Hi Thomas")),
            &str,
            Utf8,
            StringArray
        );
        test_function!(
            InitcapFunc::new(),
            vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some(
                "êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
                    .to_string()
            )))],
            Ok(Some(
                "Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
            )),
            &str,
            Utf8,
            StringArray
        );
        test_function!(
            InitcapFunc::new(),
            vec![ColumnarValue::Scalar(ScalarValue::from(""))],
            Ok(Some("")),
            &str,
            Utf8,
            StringArray
        );
        // Punctuation, underscores and spaces all start a new word; a digit
        // counts as part of a word, so the letters following it are lowered.
        test_function!(
            InitcapFunc::new(),
            vec![ColumnarValue::Scalar(ScalarValue::from(
                "hello-wORLD_foo 1abc,x"
            ))],
            Ok(Some("Hello-World_Foo 1abc,X")),
            &str,
            Utf8,
            StringArray
        );
        test_function!(
            InitcapFunc::new(),
            vec![ColumnarValue::Scalar(ScalarValue::Utf8(None))],
            Ok(None),
            &str,
            Utf8,
            StringArray
        );

        // --- LargeUtf8 ---
        test_function!(
            InitcapFunc::new(),
            vec![ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(
                "hi THOMAS".to_string()
            )))],
            Ok(Some("Hi Thomas")),
            &str,
            LargeUtf8,
            LargeStringArray
        );
        test_function!(
            InitcapFunc::new(),
            vec![ColumnarValue::Scalar(ScalarValue::LargeUtf8(None))],
            Ok(None),
            &str,
            LargeUtf8,
            LargeStringArray
        );

        // --- Utf8View ---
        test_function!(
            InitcapFunc::new(),
            vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
                "hi THOMAS".to_string()
            )))],
            Ok(Some("Hi Thomas")),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            InitcapFunc::new(),
            vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
                "hi THOMAS wIth M0re ThAN 12 ChaRs".to_string()
            )))],
            Ok(Some("Hi Thomas With M0re Than 12 Chars")),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            InitcapFunc::new(),
            vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
                "đẸp đẼ êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
                    .to_string()
            )))],
            Ok(Some(
                "Đẹp Đẽ Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
            )),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            InitcapFunc::new(),
            vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
                "".to_string()
            )))],
            Ok(Some("")),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            InitcapFunc::new(),
            vec![ColumnarValue::Scalar(ScalarValue::Utf8View(None))],
            Ok(None),
            &str,
            Utf8View,
            StringViewArray
        );

        Ok(())
    }
}