1use vortex_array::ArrayRef;
5use vortex_array::ArrayView;
6use vortex_array::ExecutionCtx;
7use vortex_array::IntoArray;
8use vortex_array::arrays::BoolArray;
9use vortex_array::arrays::PrimitiveArray;
10use vortex_array::arrays::varbin::VarBinArrayExt;
11use vortex_array::match_each_integer_ptype;
12use vortex_array::scalar_fn::fns::like::LikeKernel;
13use vortex_array::scalar_fn::fns::like::LikeOptions;
14use vortex_error::VortexResult;
15
16use crate::FSST;
17use crate::FSSTArrayExt;
18use crate::dfa::FsstMatcher;
19use crate::dfa::dfa_scan_to_bitbuf;
20
21impl LikeKernel for FSST {
22 fn like(
23 array: ArrayView<'_, Self>,
24 pattern: &ArrayRef,
25 options: LikeOptions,
26 ctx: &mut ExecutionCtx,
27 ) -> VortexResult<Option<ArrayRef>> {
28 let Some(pattern_scalar) = pattern.as_constant() else {
29 return Ok(None);
30 };
31
32 if options.case_insensitive {
33 return Ok(None);
34 }
35
36 let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() {
37 let Some(v) = s.value() else {
38 return Ok(None);
39 };
40 v.as_ref()
41 } else if let Some(b) = pattern_scalar.as_binary_opt() {
42 let Some(v) = b.value() else {
43 return Ok(None);
44 };
45 v
46 } else {
47 return Ok(None);
48 };
49
50 let symbols = array.symbols();
51 let symbol_lengths = array.symbol_lengths();
52
53 let Some(matcher) =
54 FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)?
55 else {
56 return Ok(None);
57 };
58
59 let negated = options.negated;
60 let codes = array.codes();
61 let offsets = codes.offsets().clone().execute::<PrimitiveArray>(ctx)?;
62 let all_bytes = codes.bytes();
63 let all_bytes = all_bytes.as_slice();
64 let n = codes.len();
65
66 let result = match_each_integer_ptype!(offsets.ptype(), |T| {
67 let off = offsets.as_slice::<T>();
68 dfa_scan_to_bitbuf(n, off, all_bytes, negated, |codes| matcher.matches(codes))
69 });
70
71 let validity = array
74 .codes()
75 .validity()?
76 .union_nullability(pattern_scalar.dtype().nullability());
77
78 Ok(Some(BoolArray::new(result, validity).into_array()))
79 }
80}
81
82#[cfg(test)]
83mod tests {
84 use std::sync::LazyLock;
85
86 use vortex_array::Canonical;
87 use vortex_array::IntoArray;
88 use vortex_array::VortexSessionExecute;
89 use vortex_array::arrays::BoolArray;
90 use vortex_array::arrays::ConstantArray;
91 use vortex_array::arrays::VarBinArray;
92 use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt;
93 use vortex_array::assert_arrays_eq;
94 use vortex_array::dtype::DType;
95 use vortex_array::dtype::Nullability;
96 use vortex_array::scalar_fn::fns::like::Like;
97 use vortex_array::scalar_fn::fns::like::LikeKernel;
98 use vortex_array::scalar_fn::fns::like::LikeOptions;
99 use vortex_array::session::ArraySession;
100 use vortex_error::VortexResult;
101 use vortex_session::VortexSession;
102
103 use crate::FSST;
104 use crate::FSSTArray;
105 use crate::fsst_compress;
106 use crate::fsst_train_compressor;
107
108 static SESSION: LazyLock<VortexSession> =
109 LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
110
111 fn make_fsst(strings: &[Option<&str>], nullability: Nullability) -> FSSTArray {
112 let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability));
113 let compressor = fsst_train_compressor(&varbin);
114 let len = varbin.len();
115 let dtype = varbin.dtype().clone();
116 fsst_compress(
117 varbin,
118 len,
119 &dtype,
120 &compressor,
121 &mut SESSION.create_execution_ctx(),
122 )
123 }
124
125 fn run_like(array: FSSTArray, pattern: &str, opts: LikeOptions) -> VortexResult<BoolArray> {
126 let len = array.len();
127 let arr = array.into_array();
128 let pattern = ConstantArray::new(pattern, len).into_array();
129 let result = Like
130 .try_new_array(len, opts, [arr, pattern])?
131 .into_array()
132 .execute::<Canonical>(&mut SESSION.create_execution_ctx())?;
133 Ok(result.into_bool())
134 }
135
136 fn like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> {
137 run_like(array, pattern, LikeOptions::default())
138 }
139
140 #[test]
141 fn test_like_prefix() -> VortexResult<()> {
142 let fsst = make_fsst(
143 &[
144 Some("http://example.com"),
145 Some("http://test.org"),
146 Some("ftp://files.net"),
147 Some("http://vortex.dev"),
148 Some("ssh://server.io"),
149 ],
150 Nullability::NonNullable,
151 );
152 let result = like(fsst, "http%")?;
153 assert_arrays_eq!(
154 &result,
155 &BoolArray::from_iter([true, true, false, true, false])
156 );
157 Ok(())
158 }
159
160 #[test]
161 fn test_like_prefix_with_nulls() -> VortexResult<()> {
162 let fsst = make_fsst(
163 &[Some("hello"), None, Some("help"), None, Some("goodbye")],
164 Nullability::Nullable,
165 );
166 let result = like(fsst, "hel%")?; assert_arrays_eq!(
168 &result,
169 &BoolArray::from_iter([Some(true), None, Some(true), None, Some(false)])
170 );
171 Ok(())
172 }
173
174 #[test]
175 fn test_like_contains() -> VortexResult<()> {
176 let fsst = make_fsst(
177 &[
178 Some("hello world"),
179 Some("say hello"),
180 Some("goodbye"),
181 Some("hellooo"),
182 ],
183 Nullability::NonNullable,
184 );
185 let result = like(fsst, "%hello%")?;
186 assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, false, true]));
187 Ok(())
188 }
189
190 #[test]
191 fn test_like_contains_cross_symbol() -> VortexResult<()> {
192 let fsst = make_fsst(
193 &[
194 Some("the quick brown fox jumps over the lazy dog"),
195 Some("a short string"),
196 Some("the lazy dog sleeps"),
197 Some("no match"),
198 ],
199 Nullability::NonNullable,
200 );
201 let result = like(fsst, "%lazy dog%")?;
202 assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true, false]));
203 Ok(())
204 }
205
206 #[test]
207 fn test_not_like_contains() -> VortexResult<()> {
208 let fsst = make_fsst(
209 &[Some("foobar_sdf"), Some("sdf_start"), Some("nothing")],
210 Nullability::NonNullable,
211 );
212 let opts = LikeOptions {
213 negated: true,
214 case_insensitive: false,
215 };
216 let result = run_like(fsst, "%sdf%", opts)?;
217 assert_arrays_eq!(&result, &BoolArray::from_iter([false, false, true]));
218 Ok(())
219 }
220
221 #[test]
222 fn test_like_match_all() -> VortexResult<()> {
223 let fsst = make_fsst(
224 &[Some("abc"), Some(""), Some("xyz")],
225 Nullability::NonNullable,
226 );
227 let result = like(fsst, "%")?;
228 assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, true]));
229 Ok(())
230 }
231
232 #[test]
236 fn test_like_prefix_kernel_handles() -> VortexResult<()> {
237 let fsst = make_fsst(
238 &[Some("http://a.com"), Some("ftp://b.com")],
239 Nullability::NonNullable,
240 );
241 let pattern = ConstantArray::new("http%", fsst.len()).into_array();
242 let mut ctx = SESSION.create_execution_ctx();
243
244 let fsst = fsst.as_view();
245 let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
246 assert!(result.is_some(), "FSST LikeKernel should handle prefix%");
247 assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
248 Ok(())
249 }
250
251 #[test]
253 fn test_like_contains_kernel_handles() -> VortexResult<()> {
254 let fsst = make_fsst(
255 &[Some("hello world"), Some("goodbye")],
256 Nullability::NonNullable,
257 );
258 let pattern = ConstantArray::new("%world%", fsst.len()).into_array();
259 let mut ctx = SESSION.create_execution_ctx();
260
261 let fsst = fsst.as_view();
262 let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
263 assert!(result.is_some(), "FSST LikeKernel should handle %needle%");
264 assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
265 Ok(())
266 }
267
268 #[test]
270 fn test_like_kernel_falls_back_for_complex_pattern() -> VortexResult<()> {
271 let fsst = make_fsst(&[Some("abc"), Some("def")], Nullability::NonNullable);
272 let mut ctx = SESSION.create_execution_ctx();
273
274 let pattern = ConstantArray::new("a_c", fsst.len()).into_array();
276 let fsst_v = fsst.as_view();
277 let result =
278 <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
279 assert!(result.is_none(), "underscore pattern should fall back");
280
281 let pattern = ConstantArray::new("abc%", fsst.len()).into_array();
283 let opts = LikeOptions {
284 negated: false,
285 case_insensitive: true,
286 };
287 let result = <FSST as LikeKernel>::like(fsst_v, &pattern, opts, &mut ctx)?;
288 assert!(result.is_none(), "ilike should fall back");
289
290 Ok(())
291 }
292
293 #[test]
294 fn test_like_long_prefix_handled_by_flat_dfa() -> VortexResult<()> {
295 let fsst = make_fsst(
296 &[
297 Some("abcdefghijklmn-tail"),
298 Some("abcdefghijklmx-tail"),
299 Some("abcdefghijklmn"),
300 ],
301 Nullability::NonNullable,
302 );
303 let pattern = "abcdefghijklmn%";
304
305 let fsst = fsst.as_view();
306 let direct = <FSST as LikeKernel>::like(
307 fsst,
308 &ConstantArray::new(pattern, fsst.len()).into_array(),
309 LikeOptions::default(),
310 &mut SESSION.create_execution_ctx(),
311 )?;
312 assert!(
313 direct.is_some(),
314 "14-byte prefixes are now handled by the flat prefix DFA"
315 );
316 assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
317 Ok(())
318 }
319
320 #[test]
321 fn test_like_long_contains_falls_back_but_still_matches() -> VortexResult<()> {
322 let needle = "a".repeat(255);
323 let matching = format!("xx{needle}yy");
324 let non_matching = format!("xx{}byy", "a".repeat(254));
325 let exact = needle.clone();
326 let pattern = format!("%{needle}%");
327
328 let fsst = make_fsst(
329 &[Some(&matching), Some(&non_matching), Some(&exact)],
330 Nullability::NonNullable,
331 );
332
333 let fsst_v = fsst.as_view();
334 let direct = <FSST as LikeKernel>::like(
335 fsst_v,
336 &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
337 LikeOptions::default(),
338 &mut SESSION.create_execution_ctx(),
339 )?;
340 assert!(
341 direct.is_none(),
342 "contains needles longer than 254 bytes exceed the DFA's u8 state space"
343 );
344
345 let result = like(fsst, &pattern)?;
346 assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true]));
347 Ok(())
348 }
349
350 #[test]
351 fn test_like_contains_len_254_kernel_handles() -> VortexResult<()> {
352 let needle = "a".repeat(254);
353 let matching = format!("xx{needle}yy");
354 let non_matching = format!("xx{}byy", "a".repeat(253));
355 let pattern = format!("%{needle}%");
356
357 let fsst = make_fsst(
358 &[Some(&matching), Some(&non_matching), Some(needle.as_str())],
359 Nullability::NonNullable,
360 );
361
362 let fsst = fsst.as_view();
363 let direct = <FSST as LikeKernel>::like(
364 fsst,
365 &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
366 LikeOptions::default(),
367 &mut SESSION.create_execution_ctx(),
368 )?;
369 assert!(
370 direct.is_some(),
371 "254-byte contains needle should stay on the DFA path"
372 );
373 assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
374 Ok(())
375 }
376}