Skip to main content

datafusion_functions/string/
upper.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::string::common::to_upper;
19use arrow::datatypes::DataType;
20use datafusion_common::Result;
21use datafusion_common::types::logical_string;
22use datafusion_expr::{
23    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
24    TypeSignatureClass, Volatility,
25};
26use datafusion_macros::user_doc;
27
28#[user_doc(
29    doc_section(label = "String Functions"),
30    description = "Converts a string to upper-case.",
31    syntax_example = "upper(str)",
32    sql_example = r#"```sql
33> select upper('dataFusion');
34+---------------------------+
35| upper(Utf8("dataFusion")) |
36+---------------------------+
37| DATAFUSION                |
38+---------------------------+
39```"#,
40    standard_argument(name = "str", prefix = "String"),
41    related_udf(name = "initcap"),
42    related_udf(name = "lower")
43)]
44#[derive(Debug, PartialEq, Eq, Hash)]
45pub struct UpperFunc {
46    signature: Signature,
47}
48
49impl Default for UpperFunc {
50    fn default() -> Self {
51        Self::new()
52    }
53}
54
55impl UpperFunc {
56    pub fn new() -> Self {
57        Self {
58            signature: Signature::coercible(
59                vec![Coercion::new_exact(TypeSignatureClass::Native(
60                    logical_string(),
61                ))],
62                Volatility::Immutable,
63            ),
64        }
65    }
66}
67
68impl ScalarUDFImpl for UpperFunc {
69    fn name(&self) -> &str {
70        "upper"
71    }
72
73    fn signature(&self) -> &Signature {
74        &self.signature
75    }
76
77    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
78        Ok(arg_types[0].clone())
79    }
80
81    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
82        to_upper(&args.args, "upper")
83    }
84
85    fn documentation(&self) -> Option<&Documentation> {
86        self.doc()
87    }
88}
89
90#[cfg(test)]
91mod tests {
92    use super::*;
93    use arrow::array::{Array, ArrayRef, StringArray, StringViewArray};
94    use arrow::datatypes::Field;
95    use datafusion_common::config::ConfigOptions;
96    use std::sync::Arc;
97
98    fn invoke_upper(input: ArrayRef) -> Result<ArrayRef> {
99        let func = UpperFunc::new();
100        let data_type = input.data_type().clone();
101        let args = ScalarFunctionArgs {
102            number_rows: input.len(),
103            args: vec![ColumnarValue::Array(input)],
104            arg_fields: vec![Field::new("a", data_type.clone(), true).into()],
105            return_field: Field::new("f", data_type, true).into(),
106            config_options: Arc::new(ConfigOptions::default()),
107        };
108        match func.invoke_with_args(args)? {
109            ColumnarValue::Array(r) => Ok(r),
110            _ => unreachable!("upper"),
111        }
112    }
113
114    fn to_upper(input: ArrayRef, expected: ArrayRef) -> Result<()> {
115        let result = invoke_upper(input)?;
116        assert_eq!(&expected, &result);
117        Ok(())
118    }
119
120    #[test]
121    fn upper_maybe_optimization() -> Result<()> {
122        let input = Arc::new(StringArray::from(vec![
123            Some("农历新年"),
124            None,
125            Some("datafusion"),
126            Some("0123456789"),
127            Some(""),
128        ])) as ArrayRef;
129
130        let expected = Arc::new(StringArray::from(vec![
131            Some("农历新年"),
132            None,
133            Some("DATAFUSION"),
134            Some("0123456789"),
135            Some(""),
136        ])) as ArrayRef;
137
138        to_upper(input, expected)
139    }
140
141    #[test]
142    fn upper_full_optimization() -> Result<()> {
143        let input = Arc::new(StringArray::from(vec![
144            Some("arrow"),
145            None,
146            Some("datafusion"),
147            Some("0123456789"),
148            Some(""),
149        ])) as ArrayRef;
150
151        let expected = Arc::new(StringArray::from(vec![
152            Some("ARROW"),
153            None,
154            Some("DATAFUSION"),
155            Some("0123456789"),
156            Some(""),
157        ])) as ArrayRef;
158
159        to_upper(input, expected)
160    }
161
162    #[test]
163    fn upper_partial_optimization() -> Result<()> {
164        let input = Arc::new(StringArray::from(vec![
165            Some("arrow"),
166            None,
167            Some("datafusion"),
168            Some("@_"),
169            Some("0123456789"),
170            Some(""),
171            Some("\t\n"),
172            Some("ὀδυσσεύς"),
173            Some("tschüß"),
174            Some("ⱦ"), // Ⱦ: length change
175            Some("农历新年"),
176        ])) as ArrayRef;
177
178        let expected = Arc::new(StringArray::from(vec![
179            Some("ARROW"),
180            None,
181            Some("DATAFUSION"),
182            Some("@_"),
183            Some("0123456789"),
184            Some(""),
185            Some("\t\n"),
186            Some("ὈΔΥΣΣΕΎΣ"),
187            Some("TSCHÜSS"),
188            Some("Ⱦ"),
189            Some("农历新年"),
190        ])) as ArrayRef;
191
192        to_upper(input, expected)
193    }
194
195    #[test]
196    fn upper_utf8view() -> Result<()> {
197        let input = Arc::new(StringViewArray::from(vec![
198            Some("arrow"),
199            None,
200            Some("tschüß"),
201        ])) as ArrayRef;
202
203        let expected = Arc::new(StringViewArray::from(vec![
204            Some("ARROW"),
205            None,
206            Some("TSCHÜSS"),
207        ])) as ArrayRef;
208
209        to_upper(input, expected)
210    }
211
212    #[test]
213    fn upper_ascii_utf8view() -> Result<()> {
214        // Mix of inlined (≤12 bytes) and referenced (>12 bytes) strings, plus
215        // a null and an empty, to exercise the all-ASCII Utf8View fast path.
216        let input = Arc::new(StringViewArray::from(vec![
217            Some("arrow"), // inlined short
218            None,
219            Some("hello world 123"), // referenced (15 bytes)
220            Some(""),
221            Some("0123456789"),         // inlined, no case change
222            Some("datafusion is cool"), // referenced
223        ])) as ArrayRef;
224
225        let expected = Arc::new(StringViewArray::from(vec![
226            Some("ARROW"),
227            None,
228            Some("HELLO WORLD 123"),
229            Some(""),
230            Some("0123456789"),
231            Some("DATAFUSION IS COOL"),
232        ])) as ArrayRef;
233
234        to_upper(input, expected)
235    }
236
237    #[test]
238    fn upper_sliced_ascii_utf8view() -> Result<()> {
239        // Slice of a parent that contains a non-ASCII string outside the
240        // slice. The slice is all-ASCII, so the fast path must run and produce
241        // correct output while the parent's unaddressed non-ASCII bytes are
242        // irrelevant to the result.
243        let parent = Arc::new(StringViewArray::from(vec![
244            Some("农历新年long enough for buffer"),
245            Some("hello world 123"),
246            Some("datafusion rocks!"),
247            Some("zzzzzzzzzzzzzzzz"),
248        ])) as ArrayRef;
249        let sliced = parent.slice(1, 2);
250        let result = invoke_upper(sliced)?;
251        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
252
253        let expected = StringViewArray::from(vec![
254            Some("HELLO WORLD 123"),
255            Some("DATAFUSION ROCKS!"),
256        ]);
257        assert_eq!(result_sv, &expected);
258        // The slice's two long views address 15 + 17 = 32 bytes; the ASCII
259        // fast path must produce a single packed buffer of exactly that
260        // size, not one scaled to the parent's data buffer.
261        assert_eq!(result_sv.data_buffers().len(), 1);
262        assert_eq!(result_sv.data_buffers()[0].len(), 32);
263        Ok(())
264    }
265
266    #[test]
267    fn upper_utf8view_inline_only_no_buffers() -> Result<()> {
268        // An array whose values are all ≤ 12 bytes is fully inline; the ASCII
269        // fast path should produce no data buffers at all.
270        let input = Arc::new(StringViewArray::from(vec![
271            Some("hello"),
272            None,
273            Some(""),
274            Some("0123456789AB"), // 12 bytes — inline boundary
275        ])) as ArrayRef;
276        let result = invoke_upper(input)?;
277        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
278
279        let expected = StringViewArray::from(vec![
280            Some("HELLO"),
281            None,
282            Some(""),
283            Some("0123456789AB"),
284        ]);
285        assert_eq!(result_sv, &expected);
286        assert_eq!(
287            result_sv.data_buffers().len(),
288            0,
289            "inline-only Utf8View should produce no data buffers"
290        );
291        Ok(())
292    }
293
294    #[test]
295    fn upper_utf8view_long_packs_tight() -> Result<()> {
296        // Mix of long and inline values; the long values should be packed into
297        // a single tight output buffer whose size is exactly the sum of their
298        // lengths (inline values do not contribute).
299        let input = Arc::new(StringViewArray::from(vec![
300            Some("hello world 123"), // 15 bytes (long)
301            Some("abc"),             // inline
302            None,
303            Some("datafusion rocks!"),   // 17 bytes (long)
304            Some("another long string"), // 19 bytes (long)
305        ])) as ArrayRef;
306        let result = invoke_upper(input)?;
307        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
308
309        let expected = StringViewArray::from(vec![
310            Some("HELLO WORLD 123"),
311            Some("ABC"),
312            None,
313            Some("DATAFUSION ROCKS!"),
314            Some("ANOTHER LONG STRING"),
315        ]);
316        assert_eq!(result_sv, &expected);
317        assert_eq!(result_sv.data_buffers().len(), 1);
318        assert_eq!(result_sv.data_buffers()[0].len(), 15 + 17 + 19);
319        Ok(())
320    }
321
322    #[test]
323    fn upper_utf8view_splits_into_multiple_buffers() -> Result<()> {
324        // Produce enough long-string output to overflow the first data block
325        // (≈16 KiB after the initial doubling) and confirm the fast path
326        // splits across buffers rather than packing everything into one and
327        // risking the i32::MAX offset limit.
328        const STR_LEN: usize = 500;
329        const N: usize = 40; // 40 × 500 B = 20 KiB total — crosses the first block.
330        let value = "x".repeat(STR_LEN);
331        let inputs: Vec<Option<String>> = (0..N).map(|_| Some(value.clone())).collect();
332        let input = Arc::new(StringViewArray::from(inputs.clone())) as ArrayRef;
333        let result = invoke_upper(input)?;
334        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
335
336        let expected_value = "X".repeat(STR_LEN);
337        let expected: Vec<Option<&str>> =
338            (0..N).map(|_| Some(expected_value.as_str())).collect();
339        assert_eq!(result_sv, &StringViewArray::from(expected));
340        assert!(
341            result_sv.data_buffers().len() >= 2,
342            "expected the output to span more than one data buffer, got {}",
343            result_sv.data_buffers().len()
344        );
345        // Total bytes across buffers must equal total long-value bytes
346        // (no row was inlined since each value is > 12 bytes).
347        let total: usize = result_sv.data_buffers().iter().map(|b| b.len()).sum();
348        assert_eq!(total, N * STR_LEN);
349        Ok(())
350    }
351
352    #[test]
353    fn upper_sliced_utf8() -> Result<()> {
354        let parent = Arc::new(StringArray::from(vec![
355            Some("aaaaaaaa"),
356            Some("hello"),
357            Some("world"),
358            Some(""),
359            Some("zzzzzzzz"),
360        ])) as ArrayRef;
361        let sliced = parent.slice(1, 3);
362        let result = invoke_upper(sliced)?;
363        let result_sa = result.as_any().downcast_ref::<StringArray>().unwrap();
364
365        let expected = StringArray::from(vec![Some("HELLO"), Some("WORLD"), Some("")]);
366        assert_eq!(result_sa, &expected);
367        // The slice's addressed bytes are "hello" + "world" = 10; the ASCII
368        // fast path must produce a tight output buffer (not the parent's).
369        assert_eq!(result_sa.value_data().len(), 10);
370        Ok(())
371    }
372}