Skip to main content

datafusion_functions/string/
lower.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use arrow::datatypes::DataType;
19
20use crate::string::common::to_lower;
21use datafusion_common::Result;
22use datafusion_common::types::logical_string;
23use datafusion_expr::{
24    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
25    TypeSignatureClass, Volatility,
26};
27use datafusion_macros::user_doc;
28
29#[user_doc(
30    doc_section(label = "String Functions"),
31    description = "Converts a string to lower-case.",
32    syntax_example = "lower(str)",
33    sql_example = r#"```sql
34> select lower('Ångström');
35+-------------------------+
36| lower(Utf8("Ångström")) |
37+-------------------------+
38| ångström                |
39+-------------------------+
40```"#,
41    standard_argument(name = "str", prefix = "String"),
42    related_udf(name = "initcap"),
43    related_udf(name = "upper")
44)]
45#[derive(Debug, PartialEq, Eq, Hash)]
46pub struct LowerFunc {
47    signature: Signature,
48}
49
50impl Default for LowerFunc {
51    fn default() -> Self {
52        Self::new()
53    }
54}
55
56impl LowerFunc {
57    pub fn new() -> Self {
58        Self {
59            signature: Signature::coercible(
60                vec![Coercion::new_exact(TypeSignatureClass::Native(
61                    logical_string(),
62                ))],
63                Volatility::Immutable,
64            ),
65        }
66    }
67}
68
69impl ScalarUDFImpl for LowerFunc {
70    fn name(&self) -> &str {
71        "lower"
72    }
73
74    fn signature(&self) -> &Signature {
75        &self.signature
76    }
77
78    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
79        Ok(arg_types[0].clone())
80    }
81
82    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
83        to_lower(&args.args, "lower")
84    }
85
86    fn documentation(&self) -> Option<&Documentation> {
87        self.doc()
88    }
89}
90
91#[cfg(test)]
92mod tests {
93    use super::*;
94    use arrow::array::{Array, ArrayRef, StringArray, StringViewArray};
95    use arrow::datatypes::Field;
96    use datafusion_common::config::ConfigOptions;
97    use std::sync::Arc;
98
99    fn invoke_lower(input: ArrayRef) -> Result<ArrayRef> {
100        let func = LowerFunc::new();
101        let data_type = input.data_type().clone();
102        let args = ScalarFunctionArgs {
103            number_rows: input.len(),
104            args: vec![ColumnarValue::Array(input)],
105            arg_fields: vec![Field::new("a", data_type.clone(), true).into()],
106            return_field: Field::new("f", data_type, true).into(),
107            config_options: Arc::new(ConfigOptions::default()),
108        };
109        match func.invoke_with_args(args)? {
110            ColumnarValue::Array(r) => Ok(r),
111            _ => unreachable!("lower"),
112        }
113    }
114
115    fn to_lower(input: ArrayRef, expected: ArrayRef) -> Result<()> {
116        let result = invoke_lower(input)?;
117        assert_eq!(&expected, &result);
118        Ok(())
119    }
120
121    #[test]
122    fn lower_maybe_optimization() -> Result<()> {
123        let input = Arc::new(StringArray::from(vec![
124            Some("农历新年"),
125            None,
126            Some("DATAFUSION"),
127            Some("0123456789"),
128            Some(""),
129        ])) as ArrayRef;
130
131        let expected = Arc::new(StringArray::from(vec![
132            Some("农历新年"),
133            None,
134            Some("datafusion"),
135            Some("0123456789"),
136            Some(""),
137        ])) as ArrayRef;
138
139        to_lower(input, expected)
140    }
141
142    #[test]
143    fn lower_full_optimization() -> Result<()> {
144        let input = Arc::new(StringArray::from(vec![
145            Some("ARROW"),
146            None,
147            Some("DATAFUSION"),
148            Some("0123456789"),
149            Some(""),
150        ])) as ArrayRef;
151
152        let expected = Arc::new(StringArray::from(vec![
153            Some("arrow"),
154            None,
155            Some("datafusion"),
156            Some("0123456789"),
157            Some(""),
158        ])) as ArrayRef;
159
160        to_lower(input, expected)
161    }
162
163    #[test]
164    fn lower_partial_optimization() -> Result<()> {
165        let input = Arc::new(StringArray::from(vec![
166            Some("ARROW"),
167            None,
168            Some("DATAFUSION"),
169            Some("@_"),
170            Some("0123456789"),
171            Some(""),
172            Some("\t\n"),
173            Some("ὈΔΥΣΣΕΎΣ"),
174            Some("TSCHÜSS"),
175            Some("Ⱦ"), // ⱦ: length change
176            Some("农历新年"),
177        ])) as ArrayRef;
178
179        let expected = Arc::new(StringArray::from(vec![
180            Some("arrow"),
181            None,
182            Some("datafusion"),
183            Some("@_"),
184            Some("0123456789"),
185            Some(""),
186            Some("\t\n"),
187            Some("ὀδυσσεύς"),
188            Some("tschüss"),
189            Some("ⱦ"),
190            Some("农历新年"),
191        ])) as ArrayRef;
192
193        to_lower(input, expected)
194    }
195
196    #[test]
197    fn lower_utf8view() -> Result<()> {
198        let input = Arc::new(StringViewArray::from(vec![
199            Some("ARROW"),
200            None,
201            Some("TSCHÜSS"),
202        ])) as ArrayRef;
203
204        let expected = Arc::new(StringViewArray::from(vec![
205            Some("arrow"),
206            None,
207            Some("tschüss"),
208        ])) as ArrayRef;
209
210        to_lower(input, expected)
211    }
212
213    #[test]
214    fn lower_ascii_utf8view() -> Result<()> {
215        // Mix of inlined (≤12 bytes) and referenced (>12 bytes) strings, plus
216        // a null and an empty, to exercise the all-ASCII Utf8View fast path.
217        let input = Arc::new(StringViewArray::from(vec![
218            Some("ARROW"), // inlined short
219            None,
220            Some("HELLO WORLD 123"), // referenced (15 bytes)
221            Some(""),
222            Some("0123456789"),         // inlined, no case change
223            Some("DATAFUSION IS COOL"), // referenced
224        ])) as ArrayRef;
225
226        let expected = Arc::new(StringViewArray::from(vec![
227            Some("arrow"),
228            None,
229            Some("hello world 123"),
230            Some(""),
231            Some("0123456789"),
232            Some("datafusion is cool"),
233        ])) as ArrayRef;
234
235        to_lower(input, expected)
236    }
237
238    #[test]
239    fn lower_sliced_ascii_utf8view() -> Result<()> {
240        // Slice of a parent that contains a non-ASCII string outside the
241        // slice. The slice is all-ASCII, so the fast path must run and produce
242        // correct output while the parent's unaddressed non-ASCII bytes are
243        // irrelevant to the result.
244        let parent = Arc::new(StringViewArray::from(vec![
245            Some("农历新年LONG ENOUGH FOR BUFFER"),
246            Some("HELLO WORLD 123"),
247            Some("DATAFUSION ROCKS!"),
248            Some("ZZZZZZZZZZZZZZZZ"),
249        ])) as ArrayRef;
250        let sliced = parent.slice(1, 2);
251        let result = invoke_lower(sliced)?;
252        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
253
254        let expected = StringViewArray::from(vec![
255            Some("hello world 123"),
256            Some("datafusion rocks!"),
257        ]);
258        assert_eq!(result_sv, &expected);
259        // The slice's two long views address 15 + 17 = 32 bytes; the ASCII
260        // fast path must produce a single packed buffer of exactly that
261        // size, not one scaled to the parent's data buffer.
262        assert_eq!(result_sv.data_buffers().len(), 1);
263        assert_eq!(result_sv.data_buffers()[0].len(), 32);
264        Ok(())
265    }
266
267    #[test]
268    fn lower_utf8view_inline_only_no_buffers() -> Result<()> {
269        // An array whose values are all ≤ 12 bytes is fully inline; the ASCII
270        // fast path should produce no data buffers at all.
271        let input = Arc::new(StringViewArray::from(vec![
272            Some("HELLO"),
273            None,
274            Some(""),
275            Some("0123456789ab"), // 12 bytes — inline boundary
276        ])) as ArrayRef;
277        let result = invoke_lower(input)?;
278        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
279
280        let expected = StringViewArray::from(vec![
281            Some("hello"),
282            None,
283            Some(""),
284            Some("0123456789ab"),
285        ]);
286        assert_eq!(result_sv, &expected);
287        assert_eq!(
288            result_sv.data_buffers().len(),
289            0,
290            "inline-only Utf8View should produce no data buffers"
291        );
292        Ok(())
293    }
294
295    #[test]
296    fn lower_utf8view_long_packs_tight() -> Result<()> {
297        // Mix of long and inline values; the long values should be packed into
298        // a single tight output buffer whose size is exactly the sum of their
299        // lengths (inline values do not contribute).
300        let input = Arc::new(StringViewArray::from(vec![
301            Some("HELLO WORLD 123"), // 15 bytes (long)
302            Some("ABC"),             // inline
303            None,
304            Some("DATAFUSION ROCKS!"),   // 17 bytes (long)
305            Some("ANOTHER LONG STRING"), // 19 bytes (long)
306        ])) as ArrayRef;
307        let result = invoke_lower(input)?;
308        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
309
310        let expected = StringViewArray::from(vec![
311            Some("hello world 123"),
312            Some("abc"),
313            None,
314            Some("datafusion rocks!"),
315            Some("another long string"),
316        ]);
317        assert_eq!(result_sv, &expected);
318        assert_eq!(result_sv.data_buffers().len(), 1);
319        assert_eq!(result_sv.data_buffers()[0].len(), 15 + 17 + 19);
320        Ok(())
321    }
322
323    #[test]
324    fn lower_utf8view_splits_into_multiple_buffers() -> Result<()> {
325        // Produce enough long-string output to overflow the first data block
326        // (≈16 KiB after the initial doubling) and confirm the fast path
327        // splits across buffers rather than packing everything into one and
328        // risking the i32::MAX offset limit.
329        const STR_LEN: usize = 500;
330        const N: usize = 40; // 40 × 500 B = 20 KiB total — crosses the first block.
331        let value = "X".repeat(STR_LEN);
332        let inputs: Vec<Option<String>> = (0..N).map(|_| Some(value.clone())).collect();
333        let input = Arc::new(StringViewArray::from(inputs.clone())) as ArrayRef;
334        let result = invoke_lower(input)?;
335        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
336
337        let expected_value = "x".repeat(STR_LEN);
338        let expected: Vec<Option<&str>> =
339            (0..N).map(|_| Some(expected_value.as_str())).collect();
340        assert_eq!(result_sv, &StringViewArray::from(expected));
341        assert!(
342            result_sv.data_buffers().len() >= 2,
343            "expected the output to span more than one data buffer, got {}",
344            result_sv.data_buffers().len()
345        );
346        // Total bytes across buffers must equal total long-value bytes
347        // (no row was inlined since each value is > 12 bytes).
348        let total: usize = result_sv.data_buffers().iter().map(|b| b.len()).sum();
349        assert_eq!(total, N * STR_LEN);
350        Ok(())
351    }
352
353    #[test]
354    fn lower_sliced_utf8() -> Result<()> {
355        let parent = Arc::new(StringArray::from(vec![
356            Some("AAAAAAAA"),
357            Some("HELLO"),
358            Some("WORLD"),
359            Some(""),
360            Some("ZZZZZZZZ"),
361        ])) as ArrayRef;
362        let sliced = parent.slice(1, 3);
363        let result = invoke_lower(sliced)?;
364        let result_sa = result.as_any().downcast_ref::<StringArray>().unwrap();
365
366        let expected = StringArray::from(vec![Some("hello"), Some("world"), Some("")]);
367        assert_eq!(result_sa, &expected);
368        // The slice's addressed bytes are "HELLO" + "WORLD" = 10; the ASCII
369        // fast path must produce a tight output buffer (not the parent's).
370        assert_eq!(result_sa.value_data().len(), 10);
371        Ok(())
372    }
373}