vortex_scalar/
utf8.rs

1use std::fmt::{Display, Formatter};
2use std::sync::Arc;
3
4use vortex_buffer::BufferString;
5use vortex_dtype::Nullability::NonNullable;
6use vortex_dtype::{DType, Nullability};
7use vortex_error::{VortexError, VortexExpect as _, VortexResult, vortex_bail, vortex_err};
8
9use crate::{InnerScalarValue, Scalar, ScalarValue};
10
11#[derive(Debug, Hash)]
12pub struct Utf8Scalar<'a> {
13    dtype: &'a DType,
14    value: Option<Arc<BufferString>>,
15}
16
17impl Display for Utf8Scalar<'_> {
18    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
19        match &self.value {
20            None => write!(f, "null"),
21            Some(v) => write!(f, "\"{}\"", v.as_str()),
22        }
23    }
24}
25
26impl PartialEq for Utf8Scalar<'_> {
27    fn eq(&self, other: &Self) -> bool {
28        self.dtype.eq_ignore_nullability(other.dtype) && self.value == other.value
29    }
30}
31
32impl Eq for Utf8Scalar<'_> {}
33
34impl PartialOrd for Utf8Scalar<'_> {
35    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
36        Some(self.value.cmp(&other.value))
37    }
38}
39
40impl Ord for Utf8Scalar<'_> {
41    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
42        self.value.cmp(&other.value)
43    }
44}
45
46impl<'a> Utf8Scalar<'a> {
47    pub fn from_scalar_value(dtype: &'a DType, value: ScalarValue) -> VortexResult<Self> {
48        if !matches!(dtype, DType::Utf8(..)) {
49            vortex_bail!("Can only construct utf8 scalar from utf8 dtype, found {dtype}")
50        }
51        Ok(Self {
52            dtype,
53            value: value.as_buffer_string()?,
54        })
55    }
56
57    #[inline]
58    pub fn dtype(&self) -> &'a DType {
59        self.dtype
60    }
61
62    pub fn value(&self) -> Option<BufferString> {
63        self.value.as_ref().map(|v| v.as_ref().clone())
64    }
65
66    /// Construct a value at most `max_length` in size that's greater than ourselves.
67    ///
68    /// Will return None if constructing greater value overflows
69    pub fn upper_bound(self, max_length: usize) -> Option<Self> {
70        if let Some(value) = self.value {
71            if value.len() > max_length {
72                let utf8_split_pos = (max_length.saturating_sub(3)..=max_length)
73                    .rfind(|p| value.is_char_boundary(*p))
74                    .vortex_expect("Failed to find utf8 character boundary");
75
76                let utf8_mut = value
77                    .get(..utf8_split_pos)
78                    .vortex_expect("Slicing with existing index");
79
80                for (idx, original_char) in utf8_mut.char_indices().rev() {
81                    let original_len = original_char.len_utf8();
82                    if let Some(next_char) = char::from_u32(original_char as u32 + 1) {
83                        // do not allow increasing byte width of incremented char
84                        if next_char.len_utf8() == original_len {
85                            let sliced = value.inner().slice(0..idx + original_len);
86                            drop(value);
87                            let mut result = sliced.into_mut();
88                            next_char.encode_utf8(&mut result[idx..]);
89                            return Some(Self {
90                                dtype: self.dtype,
91                                value: Some(Arc::new(unsafe {
92                                    BufferString::new_unchecked(result.freeze())
93                                })),
94                            });
95                        }
96                    }
97                }
98                None
99            } else {
100                Some(Self {
101                    dtype: self.dtype,
102                    value: Some(value),
103                })
104            }
105        } else {
106            Some(self)
107        }
108    }
109
110    /// Construct a value at most `max_length` in size that's less than ourselves.
111    pub fn lower_bound(self, max_length: usize) -> Self {
112        if let Some(value) = self.value {
113            if value.len() > max_length {
114                // UTF8 characters are at most 4 bytes, since we know that BufferString is UTF8 we must have a valid character boundary
115                let utf8_split_pos = (max_length.saturating_sub(3)..=max_length)
116                    .rfind(|p| value.is_char_boundary(*p))
117                    .vortex_expect("Failed to find utf8 character boundary");
118
119                Self {
120                    dtype: self.dtype,
121                    value: Some(Arc::new(unsafe {
122                        BufferString::new_unchecked(value.inner().slice(0..utf8_split_pos))
123                    })),
124                }
125            } else {
126                Self {
127                    dtype: self.dtype,
128                    value: Some(value),
129                }
130            }
131        } else {
132            self
133        }
134    }
135
136    pub(crate) fn cast(&self, dtype: &DType) -> VortexResult<Scalar> {
137        if !matches!(dtype, DType::Utf8(..)) {
138            vortex_bail!("Can't cast utf8 to {}", dtype)
139        }
140        Ok(Scalar::new(
141            dtype.clone(),
142            ScalarValue(InnerScalarValue::BufferString(
143                self.value
144                    .as_ref()
145                    .vortex_expect("nullness handled in Scalar::cast")
146                    .clone(),
147            )),
148        ))
149    }
150
151    /// Length of the scalar value or None if value is null
152    pub fn len(&self) -> Option<usize> {
153        self.value.as_ref().map(|v| v.len())
154    }
155
156    /// Returns whether its value is non-null and empty, otherwise `None`.
157    pub fn is_empty(&self) -> Option<bool> {
158        self.value.as_ref().map(|v| v.is_empty())
159    }
160
161    /// Convert typed scalar into ScalarValue
162    pub fn into_value(self) -> ScalarValue {
163        ScalarValue(
164            self.value
165                .map(InnerScalarValue::BufferString)
166                .unwrap_or_else(|| InnerScalarValue::Null),
167        )
168    }
169}
170
171impl Scalar {
172    pub fn utf8<B>(str: B, nullability: Nullability) -> Self
173    where
174        B: Into<BufferString>,
175    {
176        Self::try_utf8(str, nullability).unwrap()
177    }
178
179    pub fn try_utf8<B>(
180        str: B,
181        nullability: Nullability,
182    ) -> Result<Self, <B as TryInto<BufferString>>::Error>
183    where
184        B: TryInto<BufferString>,
185    {
186        Ok(Self {
187            dtype: DType::Utf8(nullability),
188            value: ScalarValue(InnerScalarValue::BufferString(Arc::new(str.try_into()?))),
189        })
190    }
191}
192
193impl<'a> TryFrom<&'a Scalar> for Utf8Scalar<'a> {
194    type Error = VortexError;
195
196    fn try_from(value: &'a Scalar) -> Result<Self, Self::Error> {
197        if !matches!(value.dtype(), DType::Utf8(_)) {
198            vortex_bail!("Expected utf8 scalar, found {}", value.dtype())
199        }
200        Ok(Self {
201            dtype: value.dtype(),
202            value: value.value.as_buffer_string()?,
203        })
204    }
205}
206
207impl<'a> TryFrom<&'a Scalar> for String {
208    type Error = VortexError;
209
210    fn try_from(value: &'a Scalar) -> Result<Self, Self::Error> {
211        Ok(BufferString::try_from(value)?.to_string())
212    }
213}
214
215impl From<&str> for Scalar {
216    fn from(value: &str) -> Self {
217        Self {
218            dtype: DType::Utf8(NonNullable),
219            value: ScalarValue(InnerScalarValue::BufferString(Arc::new(
220                value.to_string().into(),
221            ))),
222        }
223    }
224}
225
226impl From<String> for Scalar {
227    fn from(value: String) -> Self {
228        Self {
229            dtype: DType::Utf8(NonNullable),
230            value: ScalarValue(InnerScalarValue::BufferString(Arc::new(value.into()))),
231        }
232    }
233}
234
235impl From<BufferString> for Scalar {
236    fn from(value: BufferString) -> Self {
237        Self {
238            dtype: DType::Utf8(NonNullable),
239            value: ScalarValue(InnerScalarValue::BufferString(Arc::new(value))),
240        }
241    }
242}
243
244impl From<Arc<BufferString>> for Scalar {
245    fn from(value: Arc<BufferString>) -> Self {
246        Self {
247            dtype: DType::Utf8(NonNullable),
248            value: ScalarValue(InnerScalarValue::BufferString(value)),
249        }
250    }
251}
252
253impl<'a> TryFrom<&'a Scalar> for BufferString {
254    type Error = VortexError;
255
256    fn try_from(scalar: &'a Scalar) -> VortexResult<Self> {
257        <Option<BufferString>>::try_from(scalar)?
258            .ok_or_else(|| vortex_err!("Can't extract present value from null scalar"))
259    }
260}
261
262impl TryFrom<Scalar> for BufferString {
263    type Error = VortexError;
264
265    fn try_from(scalar: Scalar) -> Result<Self, Self::Error> {
266        Self::try_from(&scalar)
267    }
268}
269
270impl<'a> TryFrom<&'a Scalar> for Option<BufferString> {
271    type Error = VortexError;
272
273    fn try_from(scalar: &'a Scalar) -> Result<Self, Self::Error> {
274        Ok(Utf8Scalar::try_from(scalar)?.value())
275    }
276}
277
278impl TryFrom<Scalar> for Option<BufferString> {
279    type Error = VortexError;
280
281    fn try_from(scalar: Scalar) -> Result<Self, Self::Error> {
282        Self::try_from(&scalar)
283    }
284}
285
#[cfg(test)]
mod tests {
    use vortex_dtype::Nullability;
    use vortex_error::{VortexExpect, VortexUnwrap};

    use crate::{Scalar, Utf8Scalar};

    /// A cut that lands inside a multi-byte character backs off to the previous boundary.
    #[test]
    fn lower_bound() {
        let input = Scalar::utf8("snowman⛄️snowman", Nullability::NonNullable);
        let wanted = Scalar::utf8("snowman", Nullability::NonNullable);

        let bound = Utf8Scalar::try_from(&input).vortex_unwrap().lower_bound(9);

        assert_eq!(bound, Utf8Scalar::try_from(&wanted).vortex_unwrap());
    }

    /// The truncated prefix has its last character bumped to the next code point.
    #[test]
    fn upper_bound() {
        let input = Scalar::utf8("char🪩", Nullability::NonNullable);
        let wanted = Scalar::utf8("chas", Nullability::NonNullable);

        let bound = Utf8Scalar::try_from(&input)
            .vortex_unwrap()
            .upper_bound(5)
            .vortex_expect("must have upper bound");

        assert_eq!(bound, Utf8Scalar::try_from(&wanted).vortex_unwrap());
    }

    /// When not even one character fits in the budget there is no upper bound.
    #[test]
    fn upper_bound_overflow() {
        let input = Scalar::utf8("🂑🂒🂓", Nullability::NonNullable);

        let bound = Utf8Scalar::try_from(&input).vortex_unwrap().upper_bound(2);

        assert!(bound.is_none());
    }
}
326}