1use std::collections::{HashMap, HashSet};
23
24use crate::memtable::ColumnData;
25use crate::reader::DecodedColumn;
26
27type MemtableDict<'a> = (
32 &'a [u32],
33 &'a [String],
34 &'a HashMap<String, u32>,
35 std::borrow::Cow<'a, [bool]>,
36);
37
38pub fn dict_eval_eq(col: &ColumnData, value: &str, row_count: usize) -> Option<Vec<u64>> {
47 let (ids, _, reverse, valid) = unpack_memtable(col)?;
48 match reverse.get(value) {
49 None => Some(zero_mask(row_count)),
50 Some(&target_id) => Some(build_eq_mask(ids, &valid, target_id, row_count)),
51 }
52}
53
54pub fn dict_eval_ne(col: &ColumnData, value: &str, row_count: usize) -> Option<Vec<u64>> {
60 let (ids, _, reverse, valid) = unpack_memtable(col)?;
61 match reverse.get(value) {
62 None => Some(all_valid_mask(&valid, row_count)),
63 Some(&target_id) => Some(build_ne_mask(ids, &valid, target_id, row_count)),
64 }
65}
66
67pub fn dict_eval_contains(col: &ColumnData, substr: &str, row_count: usize) -> Option<Vec<u64>> {
75 let (ids, dictionary, _, valid) = unpack_memtable(col)?;
76 let matching = matching_ids_contains(dictionary, substr);
77 if matching.is_empty() {
78 return Some(zero_mask(row_count));
79 }
80 Some(build_set_mask(ids, &valid, &matching, row_count))
81}
82
83pub fn dict_eval_like(col: &ColumnData, pattern: &str, row_count: usize) -> Option<Vec<u64>> {
91 let (ids, dictionary, _, valid) = unpack_memtable(col)?;
92 let matching = matching_ids_like(dictionary, pattern)?;
93 if matching.is_empty() {
94 return Some(zero_mask(row_count));
95 }
96 Some(build_set_mask(ids, &valid, &matching, row_count))
97}
98
99pub fn decoded_dict_eval_eq(
108 col: &DecodedColumn,
109 value: &str,
110 row_count: usize,
111) -> Option<Vec<u64>> {
112 let (ids, dictionary, valid) = unpack_decoded(col)?;
113 match find_dict_id(dictionary, value) {
114 None => Some(zero_mask(row_count)),
115 Some(target_id) => Some(build_eq_mask(ids, valid, target_id, row_count)),
116 }
117}
118
119pub fn decoded_dict_eval_ne(
121 col: &DecodedColumn,
122 value: &str,
123 row_count: usize,
124) -> Option<Vec<u64>> {
125 let (ids, dictionary, valid) = unpack_decoded(col)?;
126 match find_dict_id(dictionary, value) {
127 None => Some(all_valid_mask(valid, row_count)),
128 Some(target_id) => Some(build_ne_mask(ids, valid, target_id, row_count)),
129 }
130}
131
132pub fn decoded_dict_eval_contains(
134 col: &DecodedColumn,
135 substr: &str,
136 row_count: usize,
137) -> Option<Vec<u64>> {
138 let (ids, dictionary, valid) = unpack_decoded(col)?;
139 let matching = matching_ids_contains(dictionary, substr);
140 if matching.is_empty() {
141 return Some(zero_mask(row_count));
142 }
143 Some(build_set_mask(ids, valid, &matching, row_count))
144}
145
146pub fn decoded_dict_eval_like(
148 col: &DecodedColumn,
149 pattern: &str,
150 row_count: usize,
151) -> Option<Vec<u64>> {
152 let (ids, dictionary, valid) = unpack_decoded(col)?;
153 let matching = matching_ids_like(dictionary, pattern)?;
154 if matching.is_empty() {
155 return Some(zero_mask(row_count));
156 }
157 Some(build_set_mask(ids, valid, &matching, row_count))
158}
159
160#[inline]
166pub fn words_for(row_count: usize) -> usize {
167 row_count.div_ceil(64)
168}
169
170pub fn bitmask_and(a: &[u64], b: &[u64]) -> Vec<u64> {
172 let len = a.len().min(b.len());
173 let mut out = vec![0u64; len];
174 for i in 0..len {
175 out[i] = a[i] & b[i];
176 }
177 out
178}
179
180pub fn bitmask_all(row_count: usize) -> Vec<u64> {
182 let words = words_for(row_count);
183 let mut out = vec![u64::MAX; words];
184 let tail = row_count % 64;
185 if tail > 0 && !out.is_empty() {
186 *out.last_mut().expect("non-empty") = (1u64 << tail) - 1;
187 }
188 out
189}
190
191fn unpack_memtable(col: &ColumnData) -> Option<MemtableDict<'_>> {
197 if let ColumnData::DictEncoded {
198 ids,
199 dictionary,
200 reverse,
201 valid,
202 } = col
203 {
204 let validity = match valid {
205 Some(v) => std::borrow::Cow::Borrowed(v.as_slice()),
206 None => std::borrow::Cow::Owned(vec![true; ids.len()]),
207 };
208 Some((ids.as_slice(), dictionary.as_slice(), reverse, validity))
209 } else {
210 None
211 }
212}
213
214fn unpack_decoded(col: &DecodedColumn) -> Option<(&[u32], &[String], &[bool])> {
216 if let DecodedColumn::DictEncoded {
217 ids,
218 dictionary,
219 valid,
220 } = col
221 {
222 Some((ids.as_slice(), dictionary.as_slice(), valid.as_slice()))
223 } else {
224 None
225 }
226}
227
228fn find_dict_id(dictionary: &[String], value: &str) -> Option<u32> {
230 dictionary.iter().position(|s| s == value).map(|i| i as u32)
231}
232
233fn matching_ids_contains(dictionary: &[String], substr: &str) -> HashSet<u32> {
235 dictionary
236 .iter()
237 .enumerate()
238 .filter(|(_, s)| s.contains(substr))
239 .map(|(i, _)| i as u32)
240 .collect()
241}
242
243fn matching_ids_like(dictionary: &[String], pattern: &str) -> Option<HashSet<u32>> {
247 let matching = match (pattern.starts_with('%'), pattern.ends_with('%')) {
248 (true, true) => {
249 let inner = pattern.trim_matches('%');
251 if inner.contains('%') {
252 return None; }
254 dictionary
255 .iter()
256 .enumerate()
257 .filter(|(_, s)| s.contains(inner))
258 .map(|(i, _)| i as u32)
259 .collect()
260 }
261 (true, false) => {
262 let suffix = &pattern[1..];
264 if suffix.contains('%') {
265 return None;
266 }
267 dictionary
268 .iter()
269 .enumerate()
270 .filter(|(_, s)| s.ends_with(suffix))
271 .map(|(i, _)| i as u32)
272 .collect()
273 }
274 (false, true) => {
275 let prefix = &pattern[..pattern.len() - 1];
277 if prefix.contains('%') {
278 return None;
279 }
280 dictionary
281 .iter()
282 .enumerate()
283 .filter(|(_, s)| s.starts_with(prefix))
284 .map(|(i, _)| i as u32)
285 .collect()
286 }
287 (false, false) => {
288 if pattern.contains('%') {
290 return None;
291 }
292 dictionary
293 .iter()
294 .enumerate()
295 .filter(|(_, s)| s.as_str() == pattern)
296 .map(|(i, _)| i as u32)
297 .collect()
298 }
299 };
300 Some(matching)
301}
302
303fn build_eq_mask(ids: &[u32], valid: &[bool], target_id: u32, row_count: usize) -> Vec<u64> {
305 let words = words_for(row_count);
306 let mut mask = vec![0u64; words];
307 let n = row_count.min(ids.len()).min(valid.len());
308 for i in 0..n {
309 if valid[i] && ids[i] == target_id {
310 mask[i / 64] |= 1u64 << (i % 64);
311 }
312 }
313 mask
314}
315
316fn build_ne_mask(ids: &[u32], valid: &[bool], target_id: u32, row_count: usize) -> Vec<u64> {
318 let words = words_for(row_count);
319 let mut mask = vec![0u64; words];
320 let n = row_count.min(ids.len()).min(valid.len());
321 for i in 0..n {
322 if valid[i] && ids[i] != target_id {
323 mask[i / 64] |= 1u64 << (i % 64);
324 }
325 }
326 mask
327}
328
329fn build_set_mask(
331 ids: &[u32],
332 valid: &[bool],
333 matching: &HashSet<u32>,
334 row_count: usize,
335) -> Vec<u64> {
336 let words = words_for(row_count);
337 let mut mask = vec![0u64; words];
338 let n = row_count.min(ids.len()).min(valid.len());
339 for i in 0..n {
340 if valid[i] && matching.contains(&ids[i]) {
341 mask[i / 64] |= 1u64 << (i % 64);
342 }
343 }
344 mask
345}
346
347#[inline]
349fn zero_mask(row_count: usize) -> Vec<u64> {
350 vec![0u64; words_for(row_count)]
351}
352
353fn all_valid_mask(valid: &[bool], row_count: usize) -> Vec<u64> {
355 let words = words_for(row_count);
356 let mut mask = vec![0u64; words];
357 let n = row_count.min(valid.len());
358 for i in 0..n {
359 if valid[i] {
360 mask[i / 64] |= 1u64 << (i % 64);
361 }
362 }
363 mask
364}
365
366#[cfg(test)]
371mod tests {
372 use super::*;
373 use crate::memtable::ColumnData;
374 use crate::reader::DecodedColumn;
375 use std::collections::HashMap;
376
377 fn make_dict_col(values: &[Option<&str>]) -> ColumnData {
378 let mut dictionary: Vec<String> = Vec::new();
379 let mut reverse: HashMap<String, u32> = HashMap::new();
380 let mut ids: Vec<u32> = Vec::new();
381 let mut valid: Vec<bool> = Vec::new();
382
383 for opt in values {
384 match opt {
385 None => {
386 ids.push(0);
387 valid.push(false);
388 }
389 Some(s) => {
390 let id = if let Some(&existing) = reverse.get(*s) {
391 existing
392 } else {
393 let new_id = dictionary.len() as u32;
394 dictionary.push(s.to_string());
395 reverse.insert(s.to_string(), new_id);
396 new_id
397 };
398 ids.push(id);
399 valid.push(true);
400 }
401 }
402 }
403
404 ColumnData::DictEncoded {
405 ids,
406 dictionary,
407 reverse,
408 valid: Some(valid),
409 }
410 }
411
412 fn make_decoded_col(values: &[Option<&str>]) -> DecodedColumn {
413 let mut dictionary: Vec<String> = Vec::new();
414 let mut id_map: HashMap<String, u32> = HashMap::new();
415 let mut ids: Vec<u32> = Vec::new();
416 let mut valid: Vec<bool> = Vec::new();
417
418 for opt in values {
419 match opt {
420 None => {
421 ids.push(0);
422 valid.push(false);
423 }
424 Some(s) => {
425 let id = if let Some(&existing) = id_map.get(*s) {
426 existing
427 } else {
428 let new_id = dictionary.len() as u32;
429 dictionary.push(s.to_string());
430 id_map.insert(s.to_string(), new_id);
431 new_id
432 };
433 ids.push(id);
434 valid.push(true);
435 }
436 }
437 }
438
439 DecodedColumn::DictEncoded {
440 ids,
441 dictionary,
442 valid,
443 }
444 }
445
446 fn bits(mask: &[u64], row_count: usize) -> Vec<bool> {
447 (0..row_count)
448 .map(|i| (mask[i / 64] >> (i % 64)) & 1 == 1)
449 .collect()
450 }
451
452 #[test]
455 fn dict_eq_match() {
456 let col = make_dict_col(&[Some("web"), Some("db"), Some("web"), Some("cache")]);
457 let mask = dict_eval_eq(&col, "web", 4).unwrap();
458 assert_eq!(bits(&mask, 4), vec![true, false, true, false]);
459 }
460
461 #[test]
462 fn dict_eq_value_not_in_dict_returns_zero_mask() {
463 let col = make_dict_col(&[Some("web"), Some("db")]);
464 let mask = dict_eval_eq(&col, "missing", 2).unwrap();
465 assert_eq!(bits(&mask, 2), vec![false, false]);
466 }
467
468 #[test]
469 fn dict_eq_null_rows_excluded() {
470 let col = make_dict_col(&[Some("web"), None, Some("web")]);
471 let mask = dict_eval_eq(&col, "web", 3).unwrap();
472 assert_eq!(bits(&mask, 3), vec![true, false, true]);
473 }
474
475 #[test]
478 fn dict_ne_basic() {
479 let col = make_dict_col(&[Some("web"), Some("db"), Some("web")]);
480 let mask = dict_eval_ne(&col, "web", 3).unwrap();
481 assert_eq!(bits(&mask, 3), vec![false, true, false]);
482 }
483
484 #[test]
485 fn dict_ne_value_not_in_dict_all_valid_rows_pass() {
486 let col = make_dict_col(&[Some("web"), None, Some("db")]);
487 let mask = dict_eval_ne(&col, "missing", 3).unwrap();
488 assert_eq!(bits(&mask, 3), vec![true, false, true]);
490 }
491
492 #[test]
495 fn dict_contains_basic() {
496 let col = make_dict_col(&[Some("web-1"), Some("db-1"), Some("web-2"), Some("cache")]);
497 let mask = dict_eval_contains(&col, "web", 4).unwrap();
498 assert_eq!(bits(&mask, 4), vec![true, false, true, false]);
499 }
500
501 #[test]
502 fn dict_contains_no_match_zero_mask() {
503 let col = make_dict_col(&[Some("alpha"), Some("beta")]);
504 let mask = dict_eval_contains(&col, "gamma", 2).unwrap();
505 assert_eq!(bits(&mask, 2), vec![false, false]);
506 }
507
508 #[test]
511 fn dict_like_prefix_wildcard() {
512 let col = make_dict_col(&[Some("web-1"), Some("db-1"), Some("web-2")]);
513 let mask = dict_eval_like(&col, "web%", 3).unwrap();
514 assert_eq!(bits(&mask, 3), vec![true, false, true]);
515 }
516
517 #[test]
518 fn dict_like_suffix_wildcard() {
519 let col = make_dict_col(&[Some("alpha-web"), Some("beta-db"), Some("gamma-web")]);
520 let mask = dict_eval_like(&col, "%web", 3).unwrap();
521 assert_eq!(bits(&mask, 3), vec![true, false, true]);
522 }
523
524 #[test]
525 fn dict_like_both_wildcards() {
526 let col = make_dict_col(&[Some("alpha-web-1"), Some("beta-db"), Some("gamma-web-2")]);
527 let mask = dict_eval_like(&col, "%web%", 3).unwrap();
528 assert_eq!(bits(&mask, 3), vec![true, false, true]);
529 }
530
531 #[test]
532 fn dict_like_exact_no_wildcards() {
533 let col = make_dict_col(&[Some("exact"), Some("other")]);
534 let mask = dict_eval_like(&col, "exact", 2).unwrap();
535 assert_eq!(bits(&mask, 2), vec![true, false]);
536 }
537
538 #[test]
539 fn dict_like_unsupported_mid_wildcard_returns_none() {
540 let col = make_dict_col(&[Some("abc")]);
541 assert!(dict_eval_like(&col, "a%c", 1).is_none());
542 }
543
544 #[test]
547 fn decoded_dict_eq_match() {
548 let col = make_decoded_col(&[Some("web"), Some("db"), Some("web")]);
549 let mask = decoded_dict_eval_eq(&col, "web", 3).unwrap();
550 assert_eq!(bits(&mask, 3), vec![true, false, true]);
551 }
552
553 #[test]
554 fn decoded_dict_eq_not_in_dict() {
555 let col = make_decoded_col(&[Some("web"), Some("db")]);
556 let mask = decoded_dict_eval_eq(&col, "missing", 2).unwrap();
557 assert_eq!(bits(&mask, 2), vec![false, false]);
558 }
559
560 #[test]
561 fn decoded_dict_ne_not_in_dict_all_valid_pass() {
562 let col = make_decoded_col(&[Some("a"), None, Some("b")]);
563 let mask = decoded_dict_eval_ne(&col, "missing", 3).unwrap();
564 assert_eq!(bits(&mask, 3), vec![true, false, true]);
565 }
566
567 #[test]
568 fn decoded_dict_contains() {
569 let col = make_decoded_col(&[Some("web-1"), Some("db"), Some("web-2")]);
570 let mask = decoded_dict_eval_contains(&col, "web", 3).unwrap();
571 assert_eq!(bits(&mask, 3), vec![true, false, true]);
572 }
573
574 #[test]
575 fn decoded_dict_like() {
576 let col = make_decoded_col(&[Some("web-1"), Some("db"), Some("web-2")]);
577 let mask = decoded_dict_eval_like(&col, "web%", 3).unwrap();
578 assert_eq!(bits(&mask, 3), vec![true, false, true]);
579 }
580
581 #[test]
584 fn bitmask_all_correct_tail_bits() {
585 let mask = bitmask_all(65);
587 assert_eq!(mask.len(), 2);
588 assert_eq!(mask[0], u64::MAX);
589 assert_eq!(mask[1], 1u64); let mask66 = bitmask_all(66);
593 assert_eq!(mask66[1], 0b11u64);
594 }
595
596 #[test]
597 fn words_for_alignment() {
598 assert_eq!(words_for(0), 0);
599 assert_eq!(words_for(1), 1);
600 assert_eq!(words_for(64), 1);
601 assert_eq!(words_for(65), 2);
602 }
603
604 #[test]
605 fn non_dict_encoded_col_returns_none() {
606 let col = ColumnData::Int64 {
607 values: vec![1, 2, 3],
608 valid: Some(vec![true, true, true]),
609 };
610 assert!(dict_eval_eq(&col, "x", 3).is_none());
611 assert!(dict_eval_ne(&col, "x", 3).is_none());
612 assert!(dict_eval_contains(&col, "x", 3).is_none());
613 assert!(dict_eval_like(&col, "x%", 3).is_none());
614 }
615}