1#![allow(clippy::cast_possible_truncation)]
5
6use vortex_array::ArrayRef;
7use vortex_array::ArrayView;
8use vortex_array::ExecutionCtx;
9use vortex_array::IntoArray;
10use vortex_array::ToCanonical;
11use vortex_array::arrays::BoolArray;
12use vortex_array::arrays::varbin::VarBinArrayExt;
13use vortex_array::match_each_integer_ptype;
14use vortex_array::scalar_fn::fns::like::LikeKernel;
15use vortex_array::scalar_fn::fns::like::LikeOptions;
16use vortex_error::VortexResult;
17
18use crate::FSST;
19use crate::FSSTArrayExt;
20use crate::dfa::FsstMatcher;
21use crate::dfa::dfa_scan_to_bitbuf;
22
23impl LikeKernel for FSST {
24 fn like(
25 array: ArrayView<'_, Self>,
26 pattern: &ArrayRef,
27 options: LikeOptions,
28 _ctx: &mut ExecutionCtx,
29 ) -> VortexResult<Option<ArrayRef>> {
30 let Some(pattern_scalar) = pattern.as_constant() else {
31 return Ok(None);
32 };
33
34 if options.case_insensitive {
35 return Ok(None);
36 }
37
38 let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() {
39 let Some(v) = s.value() else {
40 return Ok(None);
41 };
42 v.as_ref()
43 } else if let Some(b) = pattern_scalar.as_binary_opt() {
44 let Some(v) = b.value() else {
45 return Ok(None);
46 };
47 v
48 } else {
49 return Ok(None);
50 };
51
52 let symbols = array.symbols();
53 let symbol_lengths = array.symbol_lengths();
54
55 let Some(matcher) =
56 FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)?
57 else {
58 return Ok(None);
59 };
60
61 let negated = options.negated;
62 let codes = array.codes();
63 let offsets = codes.offsets().to_primitive();
64 let all_bytes = codes.bytes();
65 let all_bytes = all_bytes.as_slice();
66 let n = codes.len();
67
68 let result = match_each_integer_ptype!(offsets.ptype(), |T| {
69 let off = offsets.as_slice::<T>();
70 dfa_scan_to_bitbuf(n, off, all_bytes, negated, |codes| matcher.matches(codes))
71 });
72
73 let validity = array
76 .codes()
77 .validity()?
78 .union_nullability(pattern_scalar.dtype().nullability());
79
80 Ok(Some(BoolArray::new(result, validity).into_array()))
81 }
82}
83
84#[cfg(test)]
85mod tests {
86 use std::sync::LazyLock;
87
88 use vortex_array::Canonical;
89 use vortex_array::IntoArray;
90 use vortex_array::VortexSessionExecute;
91 use vortex_array::arrays::BoolArray;
92 use vortex_array::arrays::ConstantArray;
93 use vortex_array::arrays::VarBinArray;
94 use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt;
95 use vortex_array::assert_arrays_eq;
96 use vortex_array::dtype::DType;
97 use vortex_array::dtype::Nullability;
98 use vortex_array::scalar_fn::fns::like::Like;
99 use vortex_array::scalar_fn::fns::like::LikeKernel;
100 use vortex_array::scalar_fn::fns::like::LikeOptions;
101 use vortex_array::session::ArraySession;
102 use vortex_error::VortexResult;
103 use vortex_session::VortexSession;
104
105 use crate::FSST;
106 use crate::FSSTArray;
107 use crate::fsst_compress;
108 use crate::fsst_train_compressor;
109
110 static SESSION: LazyLock<VortexSession> =
111 LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
112
113 fn make_fsst(strings: &[Option<&str>], nullability: Nullability) -> FSSTArray {
114 let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability));
115 let compressor = fsst_train_compressor(&varbin);
116 let len = varbin.len();
117 let dtype = varbin.dtype().clone();
118 fsst_compress(varbin, len, &dtype, &compressor)
119 }
120
121 fn run_like(array: FSSTArray, pattern: &str, opts: LikeOptions) -> VortexResult<BoolArray> {
122 let len = array.len();
123 let arr = array.into_array();
124 let pattern = ConstantArray::new(pattern, len).into_array();
125 let result = Like
126 .try_new_array(len, opts, [arr, pattern])?
127 .into_array()
128 .execute::<Canonical>(&mut SESSION.create_execution_ctx())?;
129 Ok(result.into_bool())
130 }
131
132 fn like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> {
133 run_like(array, pattern, LikeOptions::default())
134 }
135
136 #[test]
137 fn test_like_prefix() -> VortexResult<()> {
138 let fsst = make_fsst(
139 &[
140 Some("http://example.com"),
141 Some("http://test.org"),
142 Some("ftp://files.net"),
143 Some("http://vortex.dev"),
144 Some("ssh://server.io"),
145 ],
146 Nullability::NonNullable,
147 );
148 let result = like(fsst, "http%")?;
149 assert_arrays_eq!(
150 &result,
151 &BoolArray::from_iter([true, true, false, true, false])
152 );
153 Ok(())
154 }
155
156 #[test]
157 fn test_like_prefix_with_nulls() -> VortexResult<()> {
158 let fsst = make_fsst(
159 &[Some("hello"), None, Some("help"), None, Some("goodbye")],
160 Nullability::Nullable,
161 );
162 let result = like(fsst, "hel%")?; assert_arrays_eq!(
164 &result,
165 &BoolArray::from_iter([Some(true), None, Some(true), None, Some(false)])
166 );
167 Ok(())
168 }
169
170 #[test]
171 fn test_like_contains() -> VortexResult<()> {
172 let fsst = make_fsst(
173 &[
174 Some("hello world"),
175 Some("say hello"),
176 Some("goodbye"),
177 Some("hellooo"),
178 ],
179 Nullability::NonNullable,
180 );
181 let result = like(fsst, "%hello%")?;
182 assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, false, true]));
183 Ok(())
184 }
185
186 #[test]
187 fn test_like_contains_cross_symbol() -> VortexResult<()> {
188 let fsst = make_fsst(
189 &[
190 Some("the quick brown fox jumps over the lazy dog"),
191 Some("a short string"),
192 Some("the lazy dog sleeps"),
193 Some("no match"),
194 ],
195 Nullability::NonNullable,
196 );
197 let result = like(fsst, "%lazy dog%")?;
198 assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true, false]));
199 Ok(())
200 }
201
202 #[test]
203 fn test_not_like_contains() -> VortexResult<()> {
204 let fsst = make_fsst(
205 &[Some("foobar_sdf"), Some("sdf_start"), Some("nothing")],
206 Nullability::NonNullable,
207 );
208 let opts = LikeOptions {
209 negated: true,
210 case_insensitive: false,
211 };
212 let result = run_like(fsst, "%sdf%", opts)?;
213 assert_arrays_eq!(&result, &BoolArray::from_iter([false, false, true]));
214 Ok(())
215 }
216
217 #[test]
218 fn test_like_match_all() -> VortexResult<()> {
219 let fsst = make_fsst(
220 &[Some("abc"), Some(""), Some("xyz")],
221 Nullability::NonNullable,
222 );
223 let result = like(fsst, "%")?;
224 assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, true]));
225 Ok(())
226 }
227
228 #[test]
232 fn test_like_prefix_kernel_handles() -> VortexResult<()> {
233 let fsst = make_fsst(
234 &[Some("http://a.com"), Some("ftp://b.com")],
235 Nullability::NonNullable,
236 );
237 let pattern = ConstantArray::new("http%", fsst.len()).into_array();
238 let mut ctx = SESSION.create_execution_ctx();
239
240 let fsst = fsst.as_view();
241 let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
242 assert!(result.is_some(), "FSST LikeKernel should handle prefix%");
243 assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
244 Ok(())
245 }
246
247 #[test]
249 fn test_like_contains_kernel_handles() -> VortexResult<()> {
250 let fsst = make_fsst(
251 &[Some("hello world"), Some("goodbye")],
252 Nullability::NonNullable,
253 );
254 let pattern = ConstantArray::new("%world%", fsst.len()).into_array();
255 let mut ctx = SESSION.create_execution_ctx();
256
257 let fsst = fsst.as_view();
258 let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
259 assert!(result.is_some(), "FSST LikeKernel should handle %needle%");
260 assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
261 Ok(())
262 }
263
264 #[test]
266 fn test_like_kernel_falls_back_for_complex_pattern() -> VortexResult<()> {
267 let fsst = make_fsst(&[Some("abc"), Some("def")], Nullability::NonNullable);
268 let mut ctx = SESSION.create_execution_ctx();
269
270 let pattern = ConstantArray::new("a_c", fsst.len()).into_array();
272 let fsst_v = fsst.as_view();
273 let result =
274 <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
275 assert!(result.is_none(), "underscore pattern should fall back");
276
277 let pattern = ConstantArray::new("abc%", fsst.len()).into_array();
279 let opts = LikeOptions {
280 negated: false,
281 case_insensitive: true,
282 };
283 let result = <FSST as LikeKernel>::like(fsst_v, &pattern, opts, &mut ctx)?;
284 assert!(result.is_none(), "ilike should fall back");
285
286 Ok(())
287 }
288
289 #[test]
290 fn test_like_long_prefix_handled_by_flat_dfa() -> VortexResult<()> {
291 let fsst = make_fsst(
292 &[
293 Some("abcdefghijklmn-tail"),
294 Some("abcdefghijklmx-tail"),
295 Some("abcdefghijklmn"),
296 ],
297 Nullability::NonNullable,
298 );
299 let pattern = "abcdefghijklmn%";
300
301 let fsst = fsst.as_view();
302 let direct = <FSST as LikeKernel>::like(
303 fsst,
304 &ConstantArray::new(pattern, fsst.len()).into_array(),
305 LikeOptions::default(),
306 &mut SESSION.create_execution_ctx(),
307 )?;
308 assert!(
309 direct.is_some(),
310 "14-byte prefixes are now handled by the flat prefix DFA"
311 );
312 assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
313 Ok(())
314 }
315
316 #[test]
317 fn test_like_long_contains_falls_back_but_still_matches() -> VortexResult<()> {
318 let needle = "a".repeat(255);
319 let matching = format!("xx{needle}yy");
320 let non_matching = format!("xx{}byy", "a".repeat(254));
321 let exact = needle.clone();
322 let pattern = format!("%{needle}%");
323
324 let fsst = make_fsst(
325 &[Some(&matching), Some(&non_matching), Some(&exact)],
326 Nullability::NonNullable,
327 );
328
329 let fsst_v = fsst.as_view();
330 let direct = <FSST as LikeKernel>::like(
331 fsst_v,
332 &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
333 LikeOptions::default(),
334 &mut SESSION.create_execution_ctx(),
335 )?;
336 assert!(
337 direct.is_none(),
338 "contains needles longer than 254 bytes exceed the DFA's u8 state space"
339 );
340
341 let result = like(fsst, &pattern)?;
342 assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true]));
343 Ok(())
344 }
345
346 #[test]
347 fn test_like_contains_len_254_kernel_handles() -> VortexResult<()> {
348 let needle = "a".repeat(254);
349 let matching = format!("xx{needle}yy");
350 let non_matching = format!("xx{}byy", "a".repeat(253));
351 let pattern = format!("%{needle}%");
352
353 let fsst = make_fsst(
354 &[Some(&matching), Some(&non_matching), Some(needle.as_str())],
355 Nullability::NonNullable,
356 );
357
358 let fsst = fsst.as_view();
359 let direct = <FSST as LikeKernel>::like(
360 fsst,
361 &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
362 LikeOptions::default(),
363 &mut SESSION.create_execution_ctx(),
364 )?;
365 assert!(
366 direct.is_some(),
367 "254-byte contains needle should stay on the DFA path"
368 );
369 assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
370 Ok(())
371 }
372}