1use std::fmt::{Display, Formatter};
2use std::sync::Arc;
3
4use vortex_buffer::BufferString;
5use vortex_dtype::Nullability::NonNullable;
6use vortex_dtype::{DType, Nullability};
7use vortex_error::{VortexError, VortexExpect as _, VortexResult, vortex_bail, vortex_err};
8
9use crate::{InnerScalarValue, Scalar, ScalarValue};
10
11#[derive(Debug, Hash)]
12pub struct Utf8Scalar<'a> {
13 dtype: &'a DType,
14 value: Option<Arc<BufferString>>,
15}
16
17impl Display for Utf8Scalar<'_> {
18 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
19 match &self.value {
20 None => write!(f, "null"),
21 Some(v) => write!(f, "\"{}\"", v.as_str()),
22 }
23 }
24}
25
26impl PartialEq for Utf8Scalar<'_> {
27 fn eq(&self, other: &Self) -> bool {
28 self.dtype.eq_ignore_nullability(other.dtype) && self.value == other.value
29 }
30}
31
// Marker impl: declares that the manual `PartialEq` impl for this type is a
// full equivalence relation.
impl Eq for Utf8Scalar<'_> {}
33
34impl PartialOrd for Utf8Scalar<'_> {
35 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
36 Some(self.value.cmp(&other.value))
37 }
38}
39
40impl Ord for Utf8Scalar<'_> {
41 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
42 self.value.cmp(&other.value)
43 }
44}
45
46impl<'a> Utf8Scalar<'a> {
47 pub fn from_scalar_value(dtype: &'a DType, value: ScalarValue) -> VortexResult<Self> {
48 if !matches!(dtype, DType::Utf8(..)) {
49 vortex_bail!("Can only construct utf8 scalar from utf8 dtype, found {dtype}")
50 }
51 Ok(Self {
52 dtype,
53 value: value.as_buffer_string()?,
54 })
55 }
56
/// Returns the dtype this scalar view was created with.
#[inline]
pub fn dtype(&self) -> &'a DType {
    self.dtype
}
61
62 pub fn value(&self) -> Option<BufferString> {
63 self.value.as_ref().map(|v| v.as_ref().clone())
64 }
65
66 pub fn upper_bound(self, max_length: usize) -> Option<Self> {
70 if let Some(value) = self.value {
71 if value.len() > max_length {
72 let utf8_split_pos = (max_length.saturating_sub(3)..=max_length)
73 .rfind(|p| value.is_char_boundary(*p))
74 .vortex_expect("Failed to find utf8 character boundary");
75
76 let utf8_mut = value
77 .get(..utf8_split_pos)
78 .vortex_expect("Slicing with existing index");
79
80 for (idx, original_char) in utf8_mut.char_indices().rev() {
81 let original_len = original_char.len_utf8();
82 if let Some(next_char) = char::from_u32(original_char as u32 + 1) {
83 if next_char.len_utf8() == original_len {
85 let sliced = value.inner().slice(0..idx + original_len);
86 drop(value);
87 let mut result = sliced.into_mut();
88 next_char.encode_utf8(&mut result[idx..]);
89 return Some(Self {
90 dtype: self.dtype,
91 value: Some(Arc::new(unsafe {
92 BufferString::new_unchecked(result.freeze())
93 })),
94 });
95 }
96 }
97 }
98 None
99 } else {
100 Some(Self {
101 dtype: self.dtype,
102 value: Some(value),
103 })
104 }
105 } else {
106 Some(self)
107 }
108 }
109
110 pub fn lower_bound(self, max_length: usize) -> Self {
112 if let Some(value) = self.value {
113 if value.len() > max_length {
114 let utf8_split_pos = (max_length.saturating_sub(3)..=max_length)
116 .rfind(|p| value.is_char_boundary(*p))
117 .vortex_expect("Failed to find utf8 character boundary");
118
119 Self {
120 dtype: self.dtype,
121 value: Some(Arc::new(unsafe {
122 BufferString::new_unchecked(value.inner().slice(0..utf8_split_pos))
123 })),
124 }
125 } else {
126 Self {
127 dtype: self.dtype,
128 value: Some(value),
129 }
130 }
131 } else {
132 self
133 }
134 }
135
136 pub(crate) fn cast(&self, dtype: &DType) -> VortexResult<Scalar> {
137 if !matches!(dtype, DType::Utf8(..)) {
138 vortex_bail!("Can't cast utf8 to {}", dtype)
139 }
140 Ok(Scalar::new(
141 dtype.clone(),
142 ScalarValue(InnerScalarValue::BufferString(
143 self.value
144 .as_ref()
145 .vortex_expect("nullness handled in Scalar::cast")
146 .clone(),
147 )),
148 ))
149 }
150
151 pub fn len(&self) -> Option<usize> {
153 self.value.as_ref().map(|v| v.len())
154 }
155
156 pub fn is_empty(&self) -> Option<bool> {
158 self.value.as_ref().map(|v| v.is_empty())
159 }
160
161 pub fn into_value(self) -> ScalarValue {
163 ScalarValue(
164 self.value
165 .map(InnerScalarValue::BufferString)
166 .unwrap_or_else(|| InnerScalarValue::Null),
167 )
168 }
169}
170
171impl Scalar {
172 pub fn utf8<B>(str: B, nullability: Nullability) -> Self
173 where
174 B: Into<BufferString>,
175 {
176 Self::try_utf8(str, nullability).unwrap()
177 }
178
179 pub fn try_utf8<B>(
180 str: B,
181 nullability: Nullability,
182 ) -> Result<Self, <B as TryInto<BufferString>>::Error>
183 where
184 B: TryInto<BufferString>,
185 {
186 Ok(Self {
187 dtype: DType::Utf8(nullability),
188 value: ScalarValue(InnerScalarValue::BufferString(Arc::new(str.try_into()?))),
189 })
190 }
191}
192
193impl<'a> TryFrom<&'a Scalar> for Utf8Scalar<'a> {
194 type Error = VortexError;
195
196 fn try_from(value: &'a Scalar) -> Result<Self, Self::Error> {
197 if !matches!(value.dtype(), DType::Utf8(_)) {
198 vortex_bail!("Expected utf8 scalar, found {}", value.dtype())
199 }
200 Ok(Self {
201 dtype: value.dtype(),
202 value: value.value.as_buffer_string()?,
203 })
204 }
205}
206
207impl<'a> TryFrom<&'a Scalar> for String {
208 type Error = VortexError;
209
210 fn try_from(value: &'a Scalar) -> Result<Self, Self::Error> {
211 Ok(BufferString::try_from(value)?.to_string())
212 }
213}
214
215impl From<&str> for Scalar {
216 fn from(value: &str) -> Self {
217 Self {
218 dtype: DType::Utf8(NonNullable),
219 value: ScalarValue(InnerScalarValue::BufferString(Arc::new(
220 value.to_string().into(),
221 ))),
222 }
223 }
224}
225
226impl From<String> for Scalar {
227 fn from(value: String) -> Self {
228 Self {
229 dtype: DType::Utf8(NonNullable),
230 value: ScalarValue(InnerScalarValue::BufferString(Arc::new(value.into()))),
231 }
232 }
233}
234
235impl From<BufferString> for Scalar {
236 fn from(value: BufferString) -> Self {
237 Self {
238 dtype: DType::Utf8(NonNullable),
239 value: ScalarValue(InnerScalarValue::BufferString(Arc::new(value))),
240 }
241 }
242}
243
244impl From<Arc<BufferString>> for Scalar {
245 fn from(value: Arc<BufferString>) -> Self {
246 Self {
247 dtype: DType::Utf8(NonNullable),
248 value: ScalarValue(InnerScalarValue::BufferString(value)),
249 }
250 }
251}
252
253impl<'a> TryFrom<&'a Scalar> for BufferString {
254 type Error = VortexError;
255
256 fn try_from(scalar: &'a Scalar) -> VortexResult<Self> {
257 <Option<BufferString>>::try_from(scalar)?
258 .ok_or_else(|| vortex_err!("Can't extract present value from null scalar"))
259 }
260}
261
262impl TryFrom<Scalar> for BufferString {
263 type Error = VortexError;
264
265 fn try_from(scalar: Scalar) -> Result<Self, Self::Error> {
266 Self::try_from(&scalar)
267 }
268}
269
270impl<'a> TryFrom<&'a Scalar> for Option<BufferString> {
271 type Error = VortexError;
272
273 fn try_from(scalar: &'a Scalar) -> Result<Self, Self::Error> {
274 Ok(Utf8Scalar::try_from(scalar)?.value())
275 }
276}
277
278impl TryFrom<Scalar> for Option<BufferString> {
279 type Error = VortexError;
280
281 fn try_from(scalar: Scalar) -> Result<Self, Self::Error> {
282 Self::try_from(&scalar)
283 }
284}
285
#[cfg(test)]
mod tests {
    use vortex_dtype::Nullability;
    use vortex_error::{VortexExpect, VortexUnwrap};

    use crate::{Scalar, Utf8Scalar};

    /// A cut that lands inside a multi-byte character backs off to the
    /// previous character boundary.
    #[test]
    fn lower_bound() {
        let input = Scalar::utf8("snowman⛄️snowman", Nullability::NonNullable);
        let expected = Scalar::utf8("snowman", Nullability::NonNullable);

        let actual = Utf8Scalar::try_from(&input).vortex_unwrap().lower_bound(9);

        assert_eq!(actual, Utf8Scalar::try_from(&expected).vortex_unwrap());
    }

    /// The truncated prefix has its last character bumped to the next code point.
    #[test]
    fn upper_bound() {
        let input = Scalar::utf8("char🪩", Nullability::NonNullable);
        let expected = Scalar::utf8("chas", Nullability::NonNullable);

        let actual = Utf8Scalar::try_from(&input)
            .vortex_unwrap()
            .upper_bound(5)
            .vortex_expect("must have upper bound");

        assert_eq!(actual, Utf8Scalar::try_from(&expected).vortex_unwrap());
    }

    /// When not even one whole character fits, there is no upper bound.
    #[test]
    fn upper_bound_overflow() {
        let input = Scalar::utf8("🂑🂒🂓", Nullability::NonNullable);

        let bound = Utf8Scalar::try_from(&input).vortex_unwrap().upper_bound(2);

        assert!(bound.is_none());
    }
}