1use vortex_array::ArrayRef;
5use vortex_array::ArrayView;
6use vortex_array::ExecutionCtx;
7use vortex_array::IntoArray;
8use vortex_array::arrays::BoolArray;
9use vortex_array::arrays::PrimitiveArray;
10use vortex_array::arrays::varbin::VarBinArrayExt;
11use vortex_array::match_each_integer_ptype;
12use vortex_array::scalar_fn::fns::like::LikeKernel;
13use vortex_array::scalar_fn::fns::like::LikeOptions;
14use vortex_error::VortexResult;
15
16use crate::FSST;
17use crate::FSSTArrayExt;
18use crate::dfa::FsstMatcher;
19use crate::dfa::dfa_scan_to_bitbuf;
20
21impl LikeKernel for FSST {
22 fn like(
23 array: ArrayView<'_, Self>,
24 pattern: &ArrayRef,
25 options: LikeOptions,
26 ctx: &mut ExecutionCtx,
27 ) -> VortexResult<Option<ArrayRef>> {
28 let Some(pattern_scalar) = pattern.as_constant() else {
29 return Ok(None);
30 };
31
32 if options.case_insensitive {
33 return Ok(None);
34 }
35
36 let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() {
37 let Some(v) = s.value() else {
38 return Ok(None);
39 };
40 v.as_ref()
41 } else if let Some(b) = pattern_scalar.as_binary_opt() {
42 let Some(v) = b.value() else {
43 return Ok(None);
44 };
45 v
46 } else {
47 return Ok(None);
48 };
49
50 let symbols = array.symbols();
51 let symbol_lengths = array.symbol_lengths();
52
53 let Some(matcher) =
54 FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)?
55 else {
56 return Ok(None);
57 };
58
59 let negated = options.negated;
60 let codes = array.codes();
61 let offsets = codes.offsets().clone().execute::<PrimitiveArray>(ctx)?;
62 let all_bytes = codes.bytes();
63 let all_bytes = all_bytes.as_slice();
64 let n = codes.len();
65
66 let result = match_each_integer_ptype!(offsets.ptype(), |T| {
67 let off = offsets.as_slice::<T>();
68 dfa_scan_to_bitbuf(n, off, all_bytes, negated, |codes| matcher.matches(codes))
69 });
70
71 let validity = array
74 .codes()
75 .validity()?
76 .union_nullability(pattern_scalar.dtype().nullability());
77
78 Ok(Some(BoolArray::new(result, validity).into_array()))
79 }
80}
81
82#[cfg(test)]
83mod tests {
84 use std::sync::LazyLock;
85
86 use vortex_array::Canonical;
87 use vortex_array::IntoArray;
88 use vortex_array::VortexSessionExecute;
89 use vortex_array::arrays::BoolArray;
90 use vortex_array::arrays::ConstantArray;
91 use vortex_array::arrays::VarBinArray;
92 use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt;
93 use vortex_array::assert_arrays_eq;
94 use vortex_array::dtype::DType;
95 use vortex_array::dtype::Nullability;
96 use vortex_array::scalar_fn::fns::like::Like;
97 use vortex_array::scalar_fn::fns::like::LikeKernel;
98 use vortex_array::scalar_fn::fns::like::LikeOptions;
99 use vortex_array::session::ArraySession;
100 use vortex_error::VortexResult;
101 use vortex_session::VortexSession;
102
103 use crate::FSST;
104 use crate::FSSTArray;
105 use crate::fsst_compress;
106 use crate::fsst_train_compressor;
107
108 static SESSION: LazyLock<VortexSession> =
109 LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
110
111 fn make_fsst(strings: &[Option<&str>], nullability: Nullability) -> FSSTArray {
112 let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability));
113 let compressor = fsst_train_compressor(&varbin);
114 let len = varbin.len();
115 let dtype = varbin.dtype().clone();
116 fsst_compress(
117 varbin,
118 len,
119 &dtype,
120 &compressor,
121 &mut SESSION.create_execution_ctx(),
122 )
123 }
124
125 fn run_like(array: FSSTArray, pattern: &str, opts: LikeOptions) -> VortexResult<BoolArray> {
126 let len = array.len();
127 let arr = array.into_array();
128 let pattern = ConstantArray::new(pattern, len).into_array();
129 let result = Like
130 .try_new_array(len, opts, [arr, pattern])?
131 .into_array()
132 .execute::<Canonical>(&mut SESSION.create_execution_ctx())?;
133 Ok(result.into_bool())
134 }
135
136 fn like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> {
137 run_like(array, pattern, LikeOptions::default())
138 }
139
140 #[test]
141 fn test_like_prefix() -> VortexResult<()> {
142 let fsst = make_fsst(
143 &[
144 Some("http://example.com"),
145 Some("http://test.org"),
146 Some("ftp://files.net"),
147 Some("http://vortex.dev"),
148 Some("ssh://server.io"),
149 ],
150 Nullability::NonNullable,
151 );
152 let result = like(fsst, "http%")?;
153 assert_arrays_eq!(
154 &result,
155 &BoolArray::from_iter([true, true, false, true, false])
156 );
157 Ok(())
158 }
159
160 #[test]
161 fn test_like_prefix_with_nulls() -> VortexResult<()> {
162 let fsst = make_fsst(
163 &[Some("hello"), None, Some("help"), None, Some("goodbye")],
164 Nullability::Nullable,
165 );
166 let result = like(fsst, "hel%")?; assert_arrays_eq!(
168 &result,
169 &BoolArray::from_iter([Some(true), None, Some(true), None, Some(false)])
170 );
171 Ok(())
172 }
173
174 #[test]
175 fn test_like_contains() -> VortexResult<()> {
176 let fsst = make_fsst(
177 &[
178 Some("hello world"),
179 Some("say hello"),
180 Some("goodbye"),
181 Some("hellooo"),
182 ],
183 Nullability::NonNullable,
184 );
185 let result = like(fsst, "%hello%")?;
186 assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, false, true]));
187 Ok(())
188 }
189
190 #[test]
191 fn test_like_contains_cross_symbol() -> VortexResult<()> {
192 let fsst = make_fsst(
193 &[
194 Some("the quick brown fox jumps over the lazy dog"),
195 Some("a short string"),
196 Some("the lazy dog sleeps"),
197 Some("no match"),
198 ],
199 Nullability::NonNullable,
200 );
201 let result = like(fsst, "%lazy dog%")?;
202 assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true, false]));
203 Ok(())
204 }
205
206 #[test]
207 fn test_not_like_contains() -> VortexResult<()> {
208 let fsst = make_fsst(
209 &[Some("foobar_sdf"), Some("sdf_start"), Some("nothing")],
210 Nullability::NonNullable,
211 );
212 let opts = LikeOptions {
213 negated: true,
214 case_insensitive: false,
215 };
216 let result = run_like(fsst, "%sdf%", opts)?;
217 assert_arrays_eq!(&result, &BoolArray::from_iter([false, false, true]));
218 Ok(())
219 }
220
221 #[test]
222 fn test_like_match_all() -> VortexResult<()> {
223 let fsst = make_fsst(
224 &[Some("abc"), Some(""), Some("xyz")],
225 Nullability::NonNullable,
226 );
227 let result = like(fsst, "%")?;
228 assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, true]));
229 Ok(())
230 }
231
232 #[test]
236 fn test_like_prefix_kernel_handles() -> VortexResult<()> {
237 let fsst = make_fsst(
238 &[Some("http://a.com"), Some("ftp://b.com")],
239 Nullability::NonNullable,
240 );
241 let pattern = ConstantArray::new("http%", fsst.len()).into_array();
242 let mut ctx = SESSION.create_execution_ctx();
243
244 let fsst = fsst.as_view();
245 let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
246 assert!(result.is_some(), "FSST LikeKernel should handle prefix%");
247 assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
248 Ok(())
249 }
250
251 #[test]
253 fn test_like_contains_kernel_handles() -> VortexResult<()> {
254 let fsst = make_fsst(
255 &[Some("hello world"), Some("goodbye")],
256 Nullability::NonNullable,
257 );
258 let pattern = ConstantArray::new("%world%", fsst.len()).into_array();
259 let mut ctx = SESSION.create_execution_ctx();
260
261 let fsst = fsst.as_view();
262 let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
263 assert!(result.is_some(), "FSST LikeKernel should handle %needle%");
264 assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
265 Ok(())
266 }
267
268 #[test]
270 fn test_like_kernel_falls_back_for_complex_pattern() -> VortexResult<()> {
271 let fsst = make_fsst(&[Some("abc"), Some("def")], Nullability::NonNullable);
272 let mut ctx = SESSION.create_execution_ctx();
273
274 let pattern = ConstantArray::new("a_c", fsst.len()).into_array();
276 let fsst_v = fsst.as_view();
277 let result =
278 <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
279 assert!(result.is_none(), "underscore pattern should fall back");
280
281 let pattern = ConstantArray::new("abc%", fsst.len()).into_array();
283 let opts = LikeOptions {
284 negated: false,
285 case_insensitive: true,
286 };
287 let result = <FSST as LikeKernel>::like(fsst_v, &pattern, opts, &mut ctx)?;
288 assert!(result.is_none(), "ilike should fall back");
289
290 let pattern = ConstantArray::new(r"%\%", fsst.len()).into_array();
292 let result =
293 <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
294 assert!(result.is_none(), "escaped suffix pattern should fall back");
295
296 Ok(())
297 }
298
299 #[test]
300 fn test_like_kernel_handles_escaped_prefix_and_contains() -> VortexResult<()> {
301 let fsst = make_fsst(
302 &[
303 Some("%front"),
304 Some("_front"),
305 Some("\\front"),
306 Some("middle%value"),
307 Some("middle_value"),
308 Some("middle\\value"),
309 Some("front"),
310 ],
311 Nullability::NonNullable,
312 );
313 let fsst_v = fsst.as_view();
314 let mut ctx = SESSION.create_execution_ctx();
315
316 let pattern = ConstantArray::new(r"\%%", fsst.len()).into_array();
317 let result =
318 <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
319 assert!(result.is_some(), "escaped percent prefix should use FSST");
320 assert_arrays_eq!(
321 result.unwrap(),
322 BoolArray::from_iter([true, false, false, false, false, false, false])
323 );
324
325 let pattern = ConstantArray::new(r"\_%", fsst.len()).into_array();
326 let result =
327 <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
328 assert!(
329 result.is_some(),
330 "escaped underscore prefix should use FSST"
331 );
332 assert_arrays_eq!(
333 result.unwrap(),
334 BoolArray::from_iter([false, true, false, false, false, false, false])
335 );
336
337 let pattern = ConstantArray::new(r"\\%", fsst.len()).into_array();
338 let result =
339 <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
340 assert!(result.is_some(), "escaped backslash prefix should use FSST");
341 assert_arrays_eq!(
342 result.unwrap(),
343 BoolArray::from_iter([false, false, true, false, false, false, false])
344 );
345
346 let pattern = ConstantArray::new(r"%\%%", fsst.len()).into_array();
347 let result =
348 <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
349 assert!(result.is_some(), "escaped percent contains should use FSST");
350 assert_arrays_eq!(
351 result.unwrap(),
352 BoolArray::from_iter([true, false, false, true, false, false, false])
353 );
354
355 let pattern = ConstantArray::new(r"%\_%", fsst.len()).into_array();
356 let result =
357 <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
358 assert!(
359 result.is_some(),
360 "escaped underscore contains should use FSST"
361 );
362 assert_arrays_eq!(
363 result.unwrap(),
364 BoolArray::from_iter([false, true, false, false, true, false, false])
365 );
366
367 let pattern = ConstantArray::new(r"%\\%", fsst.len()).into_array();
368 let result =
369 <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
370 assert!(
371 result.is_some(),
372 "escaped backslash contains should use FSST"
373 );
374 assert_arrays_eq!(
375 result.unwrap(),
376 BoolArray::from_iter([false, false, true, false, false, true, false])
377 );
378
379 Ok(())
380 }
381
382 #[test]
383 fn test_like_long_prefix_handled_by_flat_dfa() -> VortexResult<()> {
384 let fsst = make_fsst(
385 &[
386 Some("abcdefghijklmn-tail"),
387 Some("abcdefghijklmx-tail"),
388 Some("abcdefghijklmn"),
389 ],
390 Nullability::NonNullable,
391 );
392 let pattern = "abcdefghijklmn%";
393
394 let fsst = fsst.as_view();
395 let direct = <FSST as LikeKernel>::like(
396 fsst,
397 &ConstantArray::new(pattern, fsst.len()).into_array(),
398 LikeOptions::default(),
399 &mut SESSION.create_execution_ctx(),
400 )?;
401 assert!(
402 direct.is_some(),
403 "14-byte prefixes are now handled by the flat prefix DFA"
404 );
405 assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
406 Ok(())
407 }
408
409 #[test]
410 fn test_like_long_contains_falls_back_but_still_matches() -> VortexResult<()> {
411 let needle = "a".repeat(255);
412 let matching = format!("xx{needle}yy");
413 let non_matching = format!("xx{}byy", "a".repeat(254));
414 let exact = needle.clone();
415 let pattern = format!("%{needle}%");
416
417 let fsst = make_fsst(
418 &[Some(&matching), Some(&non_matching), Some(&exact)],
419 Nullability::NonNullable,
420 );
421
422 let fsst_v = fsst.as_view();
423 let direct = <FSST as LikeKernel>::like(
424 fsst_v,
425 &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
426 LikeOptions::default(),
427 &mut SESSION.create_execution_ctx(),
428 )?;
429 assert!(
430 direct.is_none(),
431 "contains needles longer than 254 bytes exceed the DFA's u8 state space"
432 );
433
434 let result = like(fsst, &pattern)?;
435 assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true]));
436 Ok(())
437 }
438
439 #[test]
440 fn test_like_contains_len_254_kernel_handles() -> VortexResult<()> {
441 let needle = "a".repeat(254);
442 let matching = format!("xx{needle}yy");
443 let non_matching = format!("xx{}byy", "a".repeat(253));
444 let pattern = format!("%{needle}%");
445
446 let fsst = make_fsst(
447 &[Some(&matching), Some(&non_matching), Some(needle.as_str())],
448 Nullability::NonNullable,
449 );
450
451 let fsst = fsst.as_view();
452 let direct = <FSST as LikeKernel>::like(
453 fsst,
454 &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
455 LikeOptions::default(),
456 &mut SESSION.create_execution_ctx(),
457 )?;
458 assert!(
459 direct.is_some(),
460 "254-byte contains needle should stay on the DFA path"
461 );
462 assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
463 Ok(())
464 }
465}