vortex_scalar/
utf8.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::{Display, Formatter};
5use std::sync::Arc;
6
7use vortex_buffer::BufferString;
8use vortex_dtype::Nullability::NonNullable;
9use vortex_dtype::{DType, Nullability};
10use vortex_error::{VortexError, VortexExpect as _, VortexResult, vortex_bail, vortex_err};
11
12use crate::{InnerScalarValue, Scalar, ScalarValue};
13
14/// A scalar value representing a UTF-8 encoded string.
15///
16/// This type provides a view into a UTF-8 string scalar value, which can be either
17/// a valid UTF-8 string or null.
18#[derive(Debug, Hash)]
19pub struct Utf8Scalar<'a> {
20    dtype: &'a DType,
21    value: Option<Arc<BufferString>>,
22}
23
24impl Display for Utf8Scalar<'_> {
25    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
26        match &self.value {
27            None => write!(f, "null"),
28            Some(v) => write!(f, "\"{}\"", v.as_str()),
29        }
30    }
31}
32
33impl PartialEq for Utf8Scalar<'_> {
34    fn eq(&self, other: &Self) -> bool {
35        self.dtype.eq_ignore_nullability(other.dtype) && self.value == other.value
36    }
37}
38
// Marker impl: opts `Utf8Scalar` into `Eq`, which the `Ord` impl for this type
// requires as a supertrait.
impl Eq for Utf8Scalar<'_> {}
40
41impl PartialOrd for Utf8Scalar<'_> {
42    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
43        Some(self.value.cmp(&other.value))
44    }
45}
46
47impl Ord for Utf8Scalar<'_> {
48    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
49        self.value.cmp(&other.value)
50    }
51}
52
53impl<'a> Utf8Scalar<'a> {
54    /// Creates a UTF-8 scalar from a data type and scalar value.
55    ///
56    /// # Errors
57    ///
58    /// Returns an error if the data type is not a UTF-8 type.
59    pub fn from_scalar_value(dtype: &'a DType, value: ScalarValue) -> VortexResult<Self> {
60        if !matches!(dtype, DType::Utf8(..)) {
61            vortex_bail!("Can only construct utf8 scalar from utf8 dtype, found {dtype}")
62        }
63        Ok(Self {
64            dtype,
65            value: value.as_buffer_string()?,
66        })
67    }
68
69    /// Returns the data type of this UTF-8 scalar.
70    #[inline]
71    pub fn dtype(&self) -> &'a DType {
72        self.dtype
73    }
74
75    /// Returns the string value, or None if null.
76    pub fn value(&self) -> Option<BufferString> {
77        self.value.as_ref().map(|v| v.as_ref().clone())
78    }
79
80    /// Constructs a value at most `max_length` in size that's greater than this value.
81    ///
82    /// Returns None if constructing a greater value would overflow.
83    pub fn upper_bound(self, max_length: usize) -> Option<Self> {
84        if let Some(value) = self.value {
85            if value.len() > max_length {
86                let utf8_split_pos = (max_length.saturating_sub(3)..=max_length)
87                    .rfind(|p| value.is_char_boundary(*p))
88                    .vortex_expect("Failed to find utf8 character boundary");
89
90                let utf8_mut = value
91                    .get(..utf8_split_pos)
92                    .vortex_expect("Slicing with existing index");
93
94                for (idx, original_char) in utf8_mut.char_indices().rev() {
95                    let original_len = original_char.len_utf8();
96                    if let Some(next_char) = char::from_u32(original_char as u32 + 1) {
97                        // do not allow increasing byte width of incremented char
98                        if next_char.len_utf8() == original_len {
99                            let sliced = value.inner().slice(0..idx + original_len);
100                            drop(value);
101                            let mut result = sliced.into_mut();
102                            next_char.encode_utf8(&mut result[idx..]);
103                            return Some(Self {
104                                dtype: self.dtype,
105                                value: Some(Arc::new(unsafe {
106                                    BufferString::new_unchecked(result.freeze())
107                                })),
108                            });
109                        }
110                    }
111                }
112                None
113            } else {
114                Some(Self {
115                    dtype: self.dtype,
116                    value: Some(value),
117                })
118            }
119        } else {
120            Some(self)
121        }
122    }
123
124    /// Construct a value at most `max_length` in size that's less than ourselves.
125    pub fn lower_bound(self, max_length: usize) -> Self {
126        if let Some(value) = self.value {
127            if value.len() > max_length {
128                // UTF8 characters are at most 4 bytes, since we know that BufferString is UTF8 we must have a valid character boundary
129                let utf8_split_pos = (max_length.saturating_sub(3)..=max_length)
130                    .rfind(|p| value.is_char_boundary(*p))
131                    .vortex_expect("Failed to find utf8 character boundary");
132
133                Self {
134                    dtype: self.dtype,
135                    value: Some(Arc::new(unsafe {
136                        BufferString::new_unchecked(value.inner().slice(0..utf8_split_pos))
137                    })),
138                }
139            } else {
140                Self {
141                    dtype: self.dtype,
142                    value: Some(value),
143                }
144            }
145        } else {
146            self
147        }
148    }
149
150    pub(crate) fn cast(&self, dtype: &DType) -> VortexResult<Scalar> {
151        if !matches!(dtype, DType::Utf8(..)) {
152            vortex_bail!("Can't cast utf8 to {}", dtype)
153        }
154        Ok(Scalar::new(
155            dtype.clone(),
156            ScalarValue(InnerScalarValue::BufferString(
157                self.value
158                    .as_ref()
159                    .vortex_expect("nullness handled in Scalar::cast")
160                    .clone(),
161            )),
162        ))
163    }
164
165    /// Length of the scalar value or None if value is null
166    pub fn len(&self) -> Option<usize> {
167        self.value.as_ref().map(|v| v.len())
168    }
169
170    /// Returns whether its value is non-null and empty, otherwise `None`.
171    pub fn is_empty(&self) -> Option<bool> {
172        self.value.as_ref().map(|v| v.is_empty())
173    }
174
175    /// Convert typed scalar into ScalarValue
176    pub fn into_value(self) -> ScalarValue {
177        ScalarValue(
178            self.value
179                .map(InnerScalarValue::BufferString)
180                .unwrap_or_else(|| InnerScalarValue::Null),
181        )
182    }
183}
184
185impl Scalar {
186    /// Creates a new UTF-8 scalar from a string-like value.
187    ///
188    /// # Panics
189    ///
190    /// Panics if the input cannot be converted to a valid UTF-8 string.
191    pub fn utf8<B>(str: B, nullability: Nullability) -> Self
192    where
193        B: Into<BufferString>,
194    {
195        Self::try_utf8(str, nullability).unwrap()
196    }
197
198    /// Tries to create a new UTF-8 scalar from a string-like value.
199    ///
200    /// # Errors
201    ///
202    /// Returns an error if the input cannot be converted to a valid UTF-8 string.
203    pub fn try_utf8<B>(
204        str: B,
205        nullability: Nullability,
206    ) -> Result<Self, <B as TryInto<BufferString>>::Error>
207    where
208        B: TryInto<BufferString>,
209    {
210        Ok(Self {
211            dtype: DType::Utf8(nullability),
212            value: ScalarValue(InnerScalarValue::BufferString(Arc::new(str.try_into()?))),
213        })
214    }
215}
216
217impl<'a> TryFrom<&'a Scalar> for Utf8Scalar<'a> {
218    type Error = VortexError;
219
220    fn try_from(value: &'a Scalar) -> Result<Self, Self::Error> {
221        if !matches!(value.dtype(), DType::Utf8(_)) {
222            vortex_bail!("Expected utf8 scalar, found {}", value.dtype())
223        }
224        Ok(Self {
225            dtype: value.dtype(),
226            value: value.value.as_buffer_string()?,
227        })
228    }
229}
230
231impl<'a> TryFrom<&'a Scalar> for String {
232    type Error = VortexError;
233
234    fn try_from(value: &'a Scalar) -> Result<Self, Self::Error> {
235        Ok(BufferString::try_from(value)?.to_string())
236    }
237}
238
239impl From<&str> for Scalar {
240    fn from(value: &str) -> Self {
241        Self {
242            dtype: DType::Utf8(NonNullable),
243            value: ScalarValue(InnerScalarValue::BufferString(Arc::new(
244                value.to_string().into(),
245            ))),
246        }
247    }
248}
249
250impl From<String> for Scalar {
251    fn from(value: String) -> Self {
252        Self {
253            dtype: DType::Utf8(NonNullable),
254            value: ScalarValue(InnerScalarValue::BufferString(Arc::new(value.into()))),
255        }
256    }
257}
258
259impl From<BufferString> for Scalar {
260    fn from(value: BufferString) -> Self {
261        Self {
262            dtype: DType::Utf8(NonNullable),
263            value: ScalarValue(InnerScalarValue::BufferString(Arc::new(value))),
264        }
265    }
266}
267
268impl From<Arc<BufferString>> for Scalar {
269    fn from(value: Arc<BufferString>) -> Self {
270        Self {
271            dtype: DType::Utf8(NonNullable),
272            value: ScalarValue(InnerScalarValue::BufferString(value)),
273        }
274    }
275}
276
277impl<'a> TryFrom<&'a Scalar> for BufferString {
278    type Error = VortexError;
279
280    fn try_from(scalar: &'a Scalar) -> VortexResult<Self> {
281        <Option<BufferString>>::try_from(scalar)?
282            .ok_or_else(|| vortex_err!("Can't extract present value from null scalar"))
283    }
284}
285
286impl TryFrom<Scalar> for BufferString {
287    type Error = VortexError;
288
289    fn try_from(scalar: Scalar) -> Result<Self, Self::Error> {
290        Self::try_from(&scalar)
291    }
292}
293
294impl<'a> TryFrom<&'a Scalar> for Option<BufferString> {
295    type Error = VortexError;
296
297    fn try_from(scalar: &'a Scalar) -> Result<Self, Self::Error> {
298        Ok(Utf8Scalar::try_from(scalar)?.value())
299    }
300}
301
302impl TryFrom<Scalar> for Option<BufferString> {
303    type Error = VortexError;
304
305    fn try_from(scalar: Scalar) -> Result<Self, Self::Error> {
306        Self::try_from(&scalar)
307    }
308}
309
#[cfg(test)]
mod tests {
    use vortex_dtype::Nullability;
    use vortex_error::{VortexExpect, VortexUnwrap};

    use crate::{Scalar, Utf8Scalar};

    /// Byte 9 falls inside the multi-byte snowman, so truncation must back off
    /// to the char boundary right after "snowman".
    #[test]
    fn lower_bound() {
        let input = Scalar::utf8("snowman⛄️snowman", Nullability::NonNullable);
        let want = Scalar::utf8("snowman", Nullability::NonNullable);

        let truncated = Utf8Scalar::try_from(&input).vortex_unwrap().lower_bound(9);

        assert_eq!(truncated, Utf8Scalar::try_from(&want).vortex_unwrap());
    }

    /// The emoji does not fit in 5 bytes, so the prefix "char" is kept and its
    /// last character is incremented.
    #[test]
    fn upper_bound() {
        let input = Scalar::utf8("char🪩", Nullability::NonNullable);
        let want = Scalar::utf8("chas", Nullability::NonNullable);

        let bumped = Utf8Scalar::try_from(&input)
            .vortex_unwrap()
            .upper_bound(5)
            .vortex_expect("must have upper bound");

        assert_eq!(bumped, Utf8Scalar::try_from(&want).vortex_unwrap());
    }

    /// No whole character fits in 2 bytes, so there is nothing to increment.
    #[test]
    fn upper_bound_overflow() {
        let input = Scalar::utf8("🂑🂒🂓", Nullability::NonNullable);

        let bound = Utf8Scalar::try_from(&input).vortex_unwrap().upper_bound(2);

        assert!(bound.is_none());
    }
}