1use std::collections::{HashMap, HashSet};
21
22use crate::memtable::ColumnData;
23use crate::reader::DecodedColumn;
24
/// Borrowed view of a dictionary-encoded memtable column, as produced by
/// `unpack_memtable`.
///
/// Fields, in order:
/// 1. per-row dictionary ids,
/// 2. the dictionary entries (an id is a position in this slice),
/// 3. the reverse lookup from entry text to id,
/// 4. per-row validity flags — borrowed when the column stores them, otherwise
///    an owned all-`true` vector built on the fly.
type MemtableDict<'a> = (
    &'a [u32],
    &'a [String],
    &'a HashMap<String, u32>,
    std::borrow::Cow<'a, [bool]>,
);
35
36pub fn dict_eval_eq(col: &ColumnData, value: &str, row_count: usize) -> Option<Vec<u64>> {
45 let (ids, _, reverse, valid) = unpack_memtable(col)?;
46 match reverse.get(value) {
47 None => Some(zero_mask(row_count)),
48 Some(&target_id) => Some(build_eq_mask(ids, &valid, target_id, row_count)),
49 }
50}
51
52pub fn dict_eval_ne(col: &ColumnData, value: &str, row_count: usize) -> Option<Vec<u64>> {
58 let (ids, _, reverse, valid) = unpack_memtable(col)?;
59 match reverse.get(value) {
60 None => Some(all_valid_mask(&valid, row_count)),
61 Some(&target_id) => Some(build_ne_mask(ids, &valid, target_id, row_count)),
62 }
63}
64
65pub fn dict_eval_contains(col: &ColumnData, substr: &str, row_count: usize) -> Option<Vec<u64>> {
73 let (ids, dictionary, _, valid) = unpack_memtable(col)?;
74 let matching = matching_ids_contains(dictionary, substr);
75 if matching.is_empty() {
76 return Some(zero_mask(row_count));
77 }
78 Some(build_set_mask(ids, &valid, &matching, row_count))
79}
80
81pub fn dict_eval_like(col: &ColumnData, pattern: &str, row_count: usize) -> Option<Vec<u64>> {
89 let (ids, dictionary, _, valid) = unpack_memtable(col)?;
90 let matching = matching_ids_like(dictionary, pattern)?;
91 if matching.is_empty() {
92 return Some(zero_mask(row_count));
93 }
94 Some(build_set_mask(ids, &valid, &matching, row_count))
95}
96
97pub fn decoded_dict_eval_eq(
106 col: &DecodedColumn,
107 value: &str,
108 row_count: usize,
109) -> Option<Vec<u64>> {
110 let (ids, dictionary, valid) = unpack_decoded(col)?;
111 match find_dict_id(dictionary, value) {
112 None => Some(zero_mask(row_count)),
113 Some(target_id) => Some(build_eq_mask(ids, valid, target_id, row_count)),
114 }
115}
116
117pub fn decoded_dict_eval_ne(
119 col: &DecodedColumn,
120 value: &str,
121 row_count: usize,
122) -> Option<Vec<u64>> {
123 let (ids, dictionary, valid) = unpack_decoded(col)?;
124 match find_dict_id(dictionary, value) {
125 None => Some(all_valid_mask(valid, row_count)),
126 Some(target_id) => Some(build_ne_mask(ids, valid, target_id, row_count)),
127 }
128}
129
130pub fn decoded_dict_eval_contains(
132 col: &DecodedColumn,
133 substr: &str,
134 row_count: usize,
135) -> Option<Vec<u64>> {
136 let (ids, dictionary, valid) = unpack_decoded(col)?;
137 let matching = matching_ids_contains(dictionary, substr);
138 if matching.is_empty() {
139 return Some(zero_mask(row_count));
140 }
141 Some(build_set_mask(ids, valid, &matching, row_count))
142}
143
144pub fn decoded_dict_eval_like(
146 col: &DecodedColumn,
147 pattern: &str,
148 row_count: usize,
149) -> Option<Vec<u64>> {
150 let (ids, dictionary, valid) = unpack_decoded(col)?;
151 let matching = matching_ids_like(dictionary, pattern)?;
152 if matching.is_empty() {
153 return Some(zero_mask(row_count));
154 }
155 Some(build_set_mask(ids, valid, &matching, row_count))
156}
157
/// Number of 64-bit words needed to hold one bit per row, rounding up.
#[inline]
pub fn words_for(row_count: usize) -> usize {
    const BITS_PER_WORD: usize = 64;
    row_count.div_ceil(BITS_PER_WORD)
}
167
/// Word-wise AND of two bitmasks.
///
/// The result is as long as the shorter input; words past that length in the
/// longer input are dropped.
pub fn bitmask_and(a: &[u64], b: &[u64]) -> Vec<u64> {
    a.iter().zip(b).map(|(lhs, rhs)| lhs & rhs).collect()
}
177
178pub fn bitmask_all(row_count: usize) -> Vec<u64> {
180 let words = words_for(row_count);
181 let mut out = vec![u64::MAX; words];
182 let tail = row_count % 64;
183 if tail > 0 && !out.is_empty() {
184 *out.last_mut().expect("non-empty") = (1u64 << tail) - 1;
185 }
186 out
187}
188
189fn unpack_memtable(col: &ColumnData) -> Option<MemtableDict<'_>> {
195 if let ColumnData::DictEncoded {
196 ids,
197 dictionary,
198 reverse,
199 valid,
200 } = col
201 {
202 let validity = match valid {
203 Some(v) => std::borrow::Cow::Borrowed(v.as_slice()),
204 None => std::borrow::Cow::Owned(vec![true; ids.len()]),
205 };
206 Some((ids.as_slice(), dictionary.as_slice(), reverse, validity))
207 } else {
208 None
209 }
210}
211
212fn unpack_decoded(col: &DecodedColumn) -> Option<(&[u32], &[String], &[bool])> {
214 if let DecodedColumn::DictEncoded {
215 ids,
216 dictionary,
217 valid,
218 } = col
219 {
220 Some((ids.as_slice(), dictionary.as_slice(), valid.as_slice()))
221 } else {
222 None
223 }
224}
225
/// Returns the id (position) of the first dictionary entry equal to `value`,
/// or `None` when no entry matches.
///
/// Entries are paired with a counter bounded by `u32::MAX`, so a position that
/// cannot be represented as a `u32` id is never reported; the earlier
/// `as u32` cast would have wrapped it onto an unrelated low id.
fn find_dict_id(dictionary: &[String], value: &str) -> Option<u32> {
    dictionary
        .iter()
        .zip(0..=u32::MAX)
        .find(|(entry, _)| entry.as_str() == value)
        .map(|(_, id)| id)
}
230
/// Collects the ids (positions) of all dictionary entries containing `substr`.
///
/// An empty `substr` matches every entry. Entries are paired with a counter
/// bounded by `u32::MAX`, so a position that does not fit in a `u32` id is
/// skipped rather than wrapped onto a low id by a truncating cast.
fn matching_ids_contains(dictionary: &[String], substr: &str) -> HashSet<u32> {
    dictionary
        .iter()
        .zip(0..=u32::MAX)
        .filter(|(entry, _)| entry.contains(substr))
        .map(|(_, id)| id)
        .collect()
}
240
/// Resolves a LIKE-style `pattern` to the ids (positions) of the dictionary
/// entries that match it.
///
/// Only `%` acts as a wildcard, and only at the very start and/or end:
///
/// * `%text%` — entries containing `text` (runs of `%` at both ends collapse)
/// * `%text`  — entries ending with `text`
/// * `text%`  — entries starting with `text`
/// * `text`   — entries equal to `text`
///
/// Returns `None` when a `%` is left inside the literal part (for example
/// `a%c` or `%%a`), which signals that the pattern is not supported here.
/// Every other character, including `_`, is compared literally.
///
/// Entries are paired with a counter bounded by `u32::MAX`, so a position that
/// does not fit in a `u32` id is skipped rather than wrapped by a cast.
// NOTE(review): `_` is not treated as a single-character wildcard — confirm
// that callers do not expect full SQL LIKE semantics.
fn matching_ids_like(dictionary: &[String], pattern: &str) -> Option<HashSet<u32>> {
    let leading = pattern.starts_with('%');
    let trailing = pattern.ends_with('%');

    // '%' is a single byte, so the slices below always fall on char boundaries.
    let literal = match (leading, trailing) {
        (true, true) => pattern.trim_matches('%'),
        (true, false) => &pattern[1..],
        (false, true) => &pattern[..pattern.len() - 1],
        (false, false) => pattern,
    };
    if literal.contains('%') {
        return None;
    }

    let is_match = |entry: &str| match (leading, trailing) {
        (true, true) => entry.contains(literal),
        (true, false) => entry.ends_with(literal),
        (false, true) => entry.starts_with(literal),
        (false, false) => entry == literal,
    };

    let matching: HashSet<u32> = dictionary
        .iter()
        .zip(0..=u32::MAX)
        .filter(|(entry, _)| is_match(entry.as_str()))
        .map(|(_, id)| id)
        .collect();
    Some(matching)
}
300
301fn build_eq_mask(ids: &[u32], valid: &[bool], target_id: u32, row_count: usize) -> Vec<u64> {
303 let words = words_for(row_count);
304 let mut mask = vec![0u64; words];
305 let n = row_count.min(ids.len()).min(valid.len());
306 for i in 0..n {
307 if valid[i] && ids[i] == target_id {
308 mask[i / 64] |= 1u64 << (i % 64);
309 }
310 }
311 mask
312}
313
314fn build_ne_mask(ids: &[u32], valid: &[bool], target_id: u32, row_count: usize) -> Vec<u64> {
316 let words = words_for(row_count);
317 let mut mask = vec![0u64; words];
318 let n = row_count.min(ids.len()).min(valid.len());
319 for i in 0..n {
320 if valid[i] && ids[i] != target_id {
321 mask[i / 64] |= 1u64 << (i % 64);
322 }
323 }
324 mask
325}
326
327fn build_set_mask(
329 ids: &[u32],
330 valid: &[bool],
331 matching: &HashSet<u32>,
332 row_count: usize,
333) -> Vec<u64> {
334 let words = words_for(row_count);
335 let mut mask = vec![0u64; words];
336 let n = row_count.min(ids.len()).min(valid.len());
337 for i in 0..n {
338 if valid[i] && matching.contains(&ids[i]) {
339 mask[i / 64] |= 1u64 << (i % 64);
340 }
341 }
342 mask
343}
344
345#[inline]
347fn zero_mask(row_count: usize) -> Vec<u64> {
348 vec![0u64; words_for(row_count)]
349}
350
351fn all_valid_mask(valid: &[bool], row_count: usize) -> Vec<u64> {
353 let words = words_for(row_count);
354 let mut mask = vec![0u64; words];
355 let n = row_count.min(valid.len());
356 for i in 0..n {
357 if valid[i] {
358 mask[i / 64] |= 1u64 << (i % 64);
359 }
360 }
361 mask
362}
363
#[cfg(test)]
mod tests {
    use super::*;
    use crate::memtable::ColumnData;
    use crate::reader::DecodedColumn;
    use std::collections::HashMap;

    /// Parts of a dictionary-encoded column built from a list of optional strings.
    struct Encoded {
        ids: Vec<u32>,
        dictionary: Vec<String>,
        reverse: HashMap<String, u32>,
        valid: Vec<bool>,
    }

    /// Dictionary-encodes `values`.
    ///
    /// Distinct strings receive ids in first-seen order. A `None` row is stored
    /// with id 0 and a `false` validity flag.
    fn encode(values: &[Option<&str>]) -> Encoded {
        let mut dictionary: Vec<String> = Vec::new();
        let mut reverse: HashMap<String, u32> = HashMap::new();
        let mut ids: Vec<u32> = Vec::new();
        let mut valid: Vec<bool> = Vec::new();

        for value in values.iter().copied() {
            let id = match value {
                None => 0,
                Some(text) => match reverse.get(text).copied() {
                    Some(known) => known,
                    None => {
                        let fresh = dictionary.len() as u32;
                        dictionary.push(text.to_string());
                        reverse.insert(text.to_string(), fresh);
                        fresh
                    }
                },
            };
            ids.push(id);
            valid.push(value.is_some());
        }

        Encoded {
            ids,
            dictionary,
            reverse,
            valid,
        }
    }

    /// Memtable fixture: a dict-encoded column with an explicit validity vector.
    fn make_dict_col(values: &[Option<&str>]) -> ColumnData {
        let Encoded {
            ids,
            dictionary,
            reverse,
            valid,
        } = encode(values);
        ColumnData::DictEncoded {
            ids,
            dictionary,
            reverse,
            valid: Some(valid),
        }
    }

    /// Reader fixture: a decoded dict-encoded column (no reverse lookup).
    fn make_decoded_col(values: &[Option<&str>]) -> DecodedColumn {
        let Encoded {
            ids,
            dictionary,
            valid,
            ..
        } = encode(values);
        DecodedColumn::DictEncoded {
            ids,
            dictionary,
            valid,
        }
    }

    /// Expands the first `row_count` bits of `mask` into one `bool` per row.
    fn bits(mask: &[u64], row_count: usize) -> Vec<bool> {
        let mut rows = Vec::with_capacity(row_count);
        for row in 0..row_count {
            let word = mask[row / 64];
            rows.push((word & (1u64 << (row % 64))) != 0);
        }
        rows
    }

    // ---- memtable: equality ----

    #[test]
    fn dict_eq_match() {
        let col = make_dict_col(&[Some("web"), Some("db"), Some("web"), Some("cache")]);
        let mask = dict_eval_eq(&col, "web", 4).unwrap();
        assert_eq!(bits(&mask, 4), [true, false, true, false]);
    }

    #[test]
    fn dict_eq_value_not_in_dict_returns_zero_mask() {
        let col = make_dict_col(&[Some("web"), Some("db")]);
        let mask = dict_eval_eq(&col, "missing", 2).unwrap();
        assert_eq!(bits(&mask, 2), [false, false]);
    }

    #[test]
    fn dict_eq_null_rows_excluded() {
        let col = make_dict_col(&[Some("web"), None, Some("web")]);
        let mask = dict_eval_eq(&col, "web", 3).unwrap();
        assert_eq!(bits(&mask, 3), [true, false, true]);
    }

    // ---- memtable: inequality ----

    #[test]
    fn dict_ne_basic() {
        let col = make_dict_col(&[Some("web"), Some("db"), Some("web")]);
        let mask = dict_eval_ne(&col, "web", 3).unwrap();
        assert_eq!(bits(&mask, 3), [false, true, false]);
    }

    #[test]
    fn dict_ne_value_not_in_dict_all_valid_rows_pass() {
        let col = make_dict_col(&[Some("web"), None, Some("db")]);
        let mask = dict_eval_ne(&col, "missing", 3).unwrap();
        // The null row stays excluded even though nothing equals "missing".
        assert_eq!(bits(&mask, 3), [true, false, true]);
    }

    // ---- memtable: contains ----

    #[test]
    fn dict_contains_basic() {
        let col = make_dict_col(&[Some("web-1"), Some("db-1"), Some("web-2"), Some("cache")]);
        let mask = dict_eval_contains(&col, "web", 4).unwrap();
        assert_eq!(bits(&mask, 4), [true, false, true, false]);
    }

    #[test]
    fn dict_contains_no_match_zero_mask() {
        let col = make_dict_col(&[Some("alpha"), Some("beta")]);
        let mask = dict_eval_contains(&col, "gamma", 2).unwrap();
        assert_eq!(bits(&mask, 2), [false, false]);
    }

    // ---- memtable: like ----

    #[test]
    fn dict_like_prefix_wildcard() {
        let col = make_dict_col(&[Some("web-1"), Some("db-1"), Some("web-2")]);
        let mask = dict_eval_like(&col, "web%", 3).unwrap();
        assert_eq!(bits(&mask, 3), [true, false, true]);
    }

    #[test]
    fn dict_like_suffix_wildcard() {
        let col = make_dict_col(&[Some("alpha-web"), Some("beta-db"), Some("gamma-web")]);
        let mask = dict_eval_like(&col, "%web", 3).unwrap();
        assert_eq!(bits(&mask, 3), [true, false, true]);
    }

    #[test]
    fn dict_like_both_wildcards() {
        let col = make_dict_col(&[Some("alpha-web-1"), Some("beta-db"), Some("gamma-web-2")]);
        let mask = dict_eval_like(&col, "%web%", 3).unwrap();
        assert_eq!(bits(&mask, 3), [true, false, true]);
    }

    #[test]
    fn dict_like_exact_no_wildcards() {
        let col = make_dict_col(&[Some("exact"), Some("other")]);
        let mask = dict_eval_like(&col, "exact", 2).unwrap();
        assert_eq!(bits(&mask, 2), [true, false]);
    }

    #[test]
    fn dict_like_unsupported_mid_wildcard_returns_none() {
        let col = make_dict_col(&[Some("abc")]);
        assert!(dict_eval_like(&col, "a%c", 1).is_none());
    }

    // ---- decoded columns ----

    #[test]
    fn decoded_dict_eq_match() {
        let col = make_decoded_col(&[Some("web"), Some("db"), Some("web")]);
        let mask = decoded_dict_eval_eq(&col, "web", 3).unwrap();
        assert_eq!(bits(&mask, 3), [true, false, true]);
    }

    #[test]
    fn decoded_dict_eq_not_in_dict() {
        let col = make_decoded_col(&[Some("web"), Some("db")]);
        let mask = decoded_dict_eval_eq(&col, "missing", 2).unwrap();
        assert_eq!(bits(&mask, 2), [false, false]);
    }

    #[test]
    fn decoded_dict_ne_not_in_dict_all_valid_pass() {
        let col = make_decoded_col(&[Some("a"), None, Some("b")]);
        let mask = decoded_dict_eval_ne(&col, "missing", 3).unwrap();
        assert_eq!(bits(&mask, 3), [true, false, true]);
    }

    #[test]
    fn decoded_dict_contains() {
        let col = make_decoded_col(&[Some("web-1"), Some("db"), Some("web-2")]);
        let mask = decoded_dict_eval_contains(&col, "web", 3).unwrap();
        assert_eq!(bits(&mask, 3), [true, false, true]);
    }

    #[test]
    fn decoded_dict_like() {
        let col = make_decoded_col(&[Some("web-1"), Some("db"), Some("web-2")]);
        let mask = decoded_dict_eval_like(&col, "web%", 3).unwrap();
        assert_eq!(bits(&mask, 3), [true, false, true]);
    }

    // ---- bitmask helpers ----

    #[test]
    fn bitmask_all_correct_tail_bits() {
        let mask = bitmask_all(65);
        assert_eq!(mask.len(), 2);
        assert_eq!(mask[0], u64::MAX);
        assert_eq!(mask[1], 1u64);

        let mask66 = bitmask_all(66);
        assert_eq!(mask66[1], 0b11u64);
    }

    #[test]
    fn words_for_alignment() {
        assert_eq!(words_for(0), 0);
        assert_eq!(words_for(1), 1);
        assert_eq!(words_for(64), 1);
        assert_eq!(words_for(65), 2);
    }

    // ---- non-dictionary input ----

    #[test]
    fn non_dict_encoded_col_returns_none() {
        let col = ColumnData::Int64 {
            values: vec![1, 2, 3],
            valid: Some(vec![true, true, true]),
        };
        assert!(dict_eval_eq(&col, "x", 3).is_none());
        assert!(dict_eval_ne(&col, "x", 3).is_none());
        assert!(dict_eval_contains(&col, "x", 3).is_none());
        assert!(dict_eval_like(&col, "x%", 3).is_none());
    }
}