1#![allow(clippy::cast_possible_truncation)]
5
6use vortex_array::ArrayRef;
7use vortex_array::ExecutionCtx;
8use vortex_array::IntoArray;
9use vortex_array::ToCanonical;
10use vortex_array::arrays::BoolArray;
11use vortex_array::match_each_integer_ptype;
12use vortex_array::scalar_fn::fns::like::LikeKernel;
13use vortex_array::scalar_fn::fns::like::LikeOptions;
14use vortex_error::VortexResult;
15
16use crate::FSST;
17use crate::FSSTArray;
18use crate::dfa::FsstMatcher;
19use crate::dfa::dfa_scan_to_bitbuf;
20
21impl LikeKernel for FSST {
22 fn like(
23 array: &FSSTArray,
24 pattern: &ArrayRef,
25 options: LikeOptions,
26 _ctx: &mut ExecutionCtx,
27 ) -> VortexResult<Option<ArrayRef>> {
28 let Some(pattern_scalar) = pattern.as_constant() else {
29 return Ok(None);
30 };
31
32 if options.case_insensitive {
33 return Ok(None);
34 }
35
36 let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() {
37 let Some(v) = s.value() else {
38 return Ok(None);
39 };
40 v.as_ref()
41 } else if let Some(b) = pattern_scalar.as_binary_opt() {
42 let Some(v) = b.value() else {
43 return Ok(None);
44 };
45 v
46 } else {
47 return Ok(None);
48 };
49
50 let symbols = array.symbols();
51 let symbol_lengths = array.symbol_lengths();
52
53 let Some(matcher) =
54 FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)?
55 else {
56 return Ok(None);
57 };
58
59 let negated = options.negated;
60 let codes = array.codes();
61 let offsets = codes.offsets().to_primitive();
62 let all_bytes = codes.bytes();
63 let all_bytes = all_bytes.as_slice();
64 let n = codes.len();
65
66 let result = match_each_integer_ptype!(offsets.ptype(), |T| {
67 let off = offsets.as_slice::<T>();
68 dfa_scan_to_bitbuf(n, off, all_bytes, negated, |codes| matcher.matches(codes))
69 });
70
71 let validity = array
74 .codes()
75 .validity()?
76 .union_nullability(pattern_scalar.dtype().nullability());
77
78 Ok(Some(BoolArray::new(result, validity).into_array()))
79 }
80}
81
82#[cfg(test)]
83mod tests {
84 use std::sync::LazyLock;
85
86 use vortex_array::Canonical;
87 use vortex_array::IntoArray;
88 use vortex_array::VortexSessionExecute;
89 use vortex_array::arrays::BoolArray;
90 use vortex_array::arrays::ConstantArray;
91 use vortex_array::arrays::VarBinArray;
92 use vortex_array::arrays::scalar_fn::ScalarFnArrayExt;
93 use vortex_array::assert_arrays_eq;
94 use vortex_array::dtype::DType;
95 use vortex_array::dtype::Nullability;
96 use vortex_array::scalar_fn::fns::like::Like;
97 use vortex_array::scalar_fn::fns::like::LikeKernel;
98 use vortex_array::scalar_fn::fns::like::LikeOptions;
99 use vortex_array::session::ArraySession;
100 use vortex_error::VortexResult;
101 use vortex_session::VortexSession;
102
103 use crate::FSST;
104 use crate::FSSTArray;
105 use crate::fsst_compress;
106 use crate::fsst_train_compressor;
107
108 static SESSION: LazyLock<VortexSession> =
109 LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
110
111 fn make_fsst(strings: &[Option<&str>], nullability: Nullability) -> FSSTArray {
112 let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability));
113 let compressor = fsst_train_compressor(&varbin);
114 fsst_compress(varbin, &compressor)
115 }
116
117 fn run_like(array: FSSTArray, pattern: &str, opts: LikeOptions) -> VortexResult<BoolArray> {
118 let len = array.len();
119 let arr = array.into_array();
120 let pattern = ConstantArray::new(pattern, len).into_array();
121 let result = Like
122 .try_new_array(len, opts, [arr, pattern])?
123 .into_array()
124 .execute::<Canonical>(&mut SESSION.create_execution_ctx())?;
125 Ok(result.into_bool())
126 }
127
128 fn like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> {
129 run_like(array, pattern, LikeOptions::default())
130 }
131
132 #[test]
133 fn test_like_prefix() -> VortexResult<()> {
134 let fsst = make_fsst(
135 &[
136 Some("http://example.com"),
137 Some("http://test.org"),
138 Some("ftp://files.net"),
139 Some("http://vortex.dev"),
140 Some("ssh://server.io"),
141 ],
142 Nullability::NonNullable,
143 );
144 let result = like(fsst, "http%")?;
145 assert_arrays_eq!(
146 &result,
147 &BoolArray::from_iter([true, true, false, true, false])
148 );
149 Ok(())
150 }
151
152 #[test]
153 fn test_like_prefix_with_nulls() -> VortexResult<()> {
154 let fsst = make_fsst(
155 &[Some("hello"), None, Some("help"), None, Some("goodbye")],
156 Nullability::Nullable,
157 );
158 let result = like(fsst, "hel%")?; assert_arrays_eq!(
160 &result,
161 &BoolArray::from_iter([Some(true), None, Some(true), None, Some(false)])
162 );
163 Ok(())
164 }
165
166 #[test]
167 fn test_like_contains() -> VortexResult<()> {
168 let fsst = make_fsst(
169 &[
170 Some("hello world"),
171 Some("say hello"),
172 Some("goodbye"),
173 Some("hellooo"),
174 ],
175 Nullability::NonNullable,
176 );
177 let result = like(fsst, "%hello%")?;
178 assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, false, true]));
179 Ok(())
180 }
181
182 #[test]
183 fn test_like_contains_cross_symbol() -> VortexResult<()> {
184 let fsst = make_fsst(
185 &[
186 Some("the quick brown fox jumps over the lazy dog"),
187 Some("a short string"),
188 Some("the lazy dog sleeps"),
189 Some("no match"),
190 ],
191 Nullability::NonNullable,
192 );
193 let result = like(fsst, "%lazy dog%")?;
194 assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true, false]));
195 Ok(())
196 }
197
198 #[test]
199 fn test_not_like_contains() -> VortexResult<()> {
200 let fsst = make_fsst(
201 &[Some("foobar_sdf"), Some("sdf_start"), Some("nothing")],
202 Nullability::NonNullable,
203 );
204 let opts = LikeOptions {
205 negated: true,
206 case_insensitive: false,
207 };
208 let result = run_like(fsst, "%sdf%", opts)?;
209 assert_arrays_eq!(&result, &BoolArray::from_iter([false, false, true]));
210 Ok(())
211 }
212
213 #[test]
214 fn test_like_match_all() -> VortexResult<()> {
215 let fsst = make_fsst(
216 &[Some("abc"), Some(""), Some("xyz")],
217 Nullability::NonNullable,
218 );
219 let result = like(fsst, "%")?;
220 assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, true]));
221 Ok(())
222 }
223
224 #[test]
228 fn test_like_prefix_kernel_handles() -> VortexResult<()> {
229 let fsst = make_fsst(
230 &[Some("http://a.com"), Some("ftp://b.com")],
231 Nullability::NonNullable,
232 );
233 let pattern = ConstantArray::new("http%", fsst.len()).into_array();
234 let mut ctx = SESSION.create_execution_ctx();
235
236 let result = <FSST as LikeKernel>::like(&fsst, &pattern, LikeOptions::default(), &mut ctx)?;
237 assert!(result.is_some(), "FSST LikeKernel should handle prefix%");
238 assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
239 Ok(())
240 }
241
242 #[test]
244 fn test_like_contains_kernel_handles() -> VortexResult<()> {
245 let fsst = make_fsst(
246 &[Some("hello world"), Some("goodbye")],
247 Nullability::NonNullable,
248 );
249 let pattern = ConstantArray::new("%world%", fsst.len()).into_array();
250 let mut ctx = SESSION.create_execution_ctx();
251
252 let result = <FSST as LikeKernel>::like(&fsst, &pattern, LikeOptions::default(), &mut ctx)?;
253 assert!(result.is_some(), "FSST LikeKernel should handle %needle%");
254 assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
255 Ok(())
256 }
257
258 #[test]
260 fn test_like_kernel_falls_back_for_complex_pattern() -> VortexResult<()> {
261 let fsst = make_fsst(&[Some("abc"), Some("def")], Nullability::NonNullable);
262 let mut ctx = SESSION.create_execution_ctx();
263
264 let pattern = ConstantArray::new("a_c", fsst.len()).into_array();
266 let result = <FSST as LikeKernel>::like(&fsst, &pattern, LikeOptions::default(), &mut ctx)?;
267 assert!(result.is_none(), "underscore pattern should fall back");
268
269 let pattern = ConstantArray::new("abc%", fsst.len()).into_array();
271 let opts = LikeOptions {
272 negated: false,
273 case_insensitive: true,
274 };
275 let result = <FSST as LikeKernel>::like(&fsst, &pattern, opts, &mut ctx)?;
276 assert!(result.is_none(), "ilike should fall back");
277
278 Ok(())
279 }
280
281 #[test]
282 fn test_like_long_prefix_handled_by_flat_dfa() -> VortexResult<()> {
283 let fsst = make_fsst(
284 &[
285 Some("abcdefghijklmn-tail"),
286 Some("abcdefghijklmx-tail"),
287 Some("abcdefghijklmn"),
288 ],
289 Nullability::NonNullable,
290 );
291 let pattern = "abcdefghijklmn%";
292
293 let direct = <FSST as LikeKernel>::like(
294 &fsst,
295 &ConstantArray::new(pattern, fsst.len()).into_array(),
296 LikeOptions::default(),
297 &mut SESSION.create_execution_ctx(),
298 )?;
299 assert!(
300 direct.is_some(),
301 "14-byte prefixes are now handled by the flat prefix DFA"
302 );
303 assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
304 Ok(())
305 }
306
307 #[test]
308 fn test_like_long_contains_falls_back_but_still_matches() -> VortexResult<()> {
309 let needle = "a".repeat(255);
310 let matching = format!("xx{needle}yy");
311 let non_matching = format!("xx{}byy", "a".repeat(254));
312 let exact = needle.clone();
313 let pattern = format!("%{needle}%");
314
315 let fsst = make_fsst(
316 &[Some(&matching), Some(&non_matching), Some(&exact)],
317 Nullability::NonNullable,
318 );
319
320 let direct = <FSST as LikeKernel>::like(
321 &fsst,
322 &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
323 LikeOptions::default(),
324 &mut SESSION.create_execution_ctx(),
325 )?;
326 assert!(
327 direct.is_none(),
328 "contains needles longer than 254 bytes exceed the DFA's u8 state space"
329 );
330
331 let result = like(fsst, &pattern)?;
332 assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true]));
333 Ok(())
334 }
335
336 #[test]
337 fn test_like_contains_len_254_kernel_handles() -> VortexResult<()> {
338 let needle = "a".repeat(254);
339 let matching = format!("xx{needle}yy");
340 let non_matching = format!("xx{}byy", "a".repeat(253));
341 let pattern = format!("%{needle}%");
342
343 let fsst = make_fsst(
344 &[Some(&matching), Some(&non_matching), Some(needle.as_str())],
345 Nullability::NonNullable,
346 );
347
348 let direct = <FSST as LikeKernel>::like(
349 &fsst,
350 &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
351 LikeOptions::default(),
352 &mut SESSION.create_execution_ctx(),
353 )?;
354 assert!(
355 direct.is_some(),
356 "254-byte contains needle should stay on the DFA path"
357 );
358 assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
359 Ok(())
360 }
361}