use std::collections::{HashMap, HashSet};
use crate::memtable::ColumnData;
use crate::reader::DecodedColumn;
/// Borrowed view of a dictionary-encoded memtable column, as returned by
/// `unpack_memtable`.
///
/// Tuple fields, in order:
/// 1. the per-row dictionary ids,
/// 2. the dictionary entries (the substring/LIKE matchers use an entry's position in
///    this slice as its id),
/// 3. the reverse index from string value to dictionary id,
/// 4. the per-row validity flags: borrowed when the column stores them, otherwise an
///    owned all-`true` vector with one flag per id.
type MemtableDict<'a> = (
&'a [u32],
&'a [String],
&'a HashMap<String, u32>,
std::borrow::Cow<'a, [bool]>,
);
/// Evaluates `column == value` on a dictionary-encoded memtable column.
///
/// Returns `None` when `col` is not dictionary-encoded. Otherwise returns a bitmask of
/// `words_for(row_count)` words in which bit `i % 64` of word `i / 64` is set when row
/// `i` is valid and its id is the dictionary id of `value`. When `value` is not in the
/// reverse index, no row can match and the mask is all zeros.
pub fn dict_eval_eq(col: &ColumnData, value: &str, row_count: usize) -> Option<Vec<u64>> {
    let (row_ids, _dictionary, value_to_id, validity) = unpack_memtable(col)?;
    let mask = if let Some(&wanted) = value_to_id.get(value) {
        build_eq_mask(row_ids, &validity, wanted, row_count)
    } else {
        zero_mask(row_count)
    };
    Some(mask)
}
/// Evaluates `column != value` on a dictionary-encoded memtable column.
///
/// Returns `None` when `col` is not dictionary-encoded. Otherwise returns a bitmask
/// (bit `i % 64` of word `i / 64` per row) marking every valid row whose id differs
/// from the dictionary id of `value`. When `value` is not in the reverse index, every
/// valid row passes. Rows whose validity flag is `false` are never set.
pub fn dict_eval_ne(col: &ColumnData, value: &str, row_count: usize) -> Option<Vec<u64>> {
    let (row_ids, _dictionary, value_to_id, validity) = unpack_memtable(col)?;
    Some(value_to_id.get(value).map_or_else(
        || all_valid_mask(&validity, row_count),
        |&excluded| build_ne_mask(row_ids, &validity, excluded, row_count),
    ))
}
/// Evaluates "column contains `substr`" on a dictionary-encoded memtable column.
///
/// The substring test runs once per dictionary entry; rows are then selected by id.
/// Returns `None` when `col` is not dictionary-encoded, otherwise a per-row bitmask
/// (all zeros when no dictionary entry contains `substr`).
pub fn dict_eval_contains(col: &ColumnData, substr: &str, row_count: usize) -> Option<Vec<u64>> {
    let (row_ids, dictionary, _reverse, validity) = unpack_memtable(col)?;
    let hits = matching_ids_contains(dictionary, substr);
    let mask = if hits.is_empty() {
        // No entry matched, so skip the per-row scan entirely.
        zero_mask(row_count)
    } else {
        build_set_mask(row_ids, &validity, &hits, row_count)
    };
    Some(mask)
}
/// Evaluates a LIKE `pattern` on a dictionary-encoded memtable column.
///
/// Returns `None` when `col` is not dictionary-encoded or when `matching_ids_like`
/// rejects the pattern as unsupported. Otherwise returns a per-row bitmask of the valid
/// rows whose dictionary entry matches (all zeros when no entry matches).
pub fn dict_eval_like(col: &ColumnData, pattern: &str, row_count: usize) -> Option<Vec<u64>> {
    let (row_ids, dictionary, _reverse, validity) = unpack_memtable(col)?;
    let hits = matching_ids_like(dictionary, pattern)?;
    let mask = if hits.is_empty() {
        zero_mask(row_count)
    } else {
        build_set_mask(row_ids, &validity, &hits, row_count)
    };
    Some(mask)
}
/// Evaluates `column == value` on a dictionary-encoded decoded column.
///
/// The id of `value` is found by scanning the dictionary. Returns `None` when `col` is
/// not dictionary-encoded; otherwise a per-row bitmask (bit `i % 64` of word `i / 64`)
/// of the valid rows whose id equals that of `value`, or all zeros when `value` is not
/// in the dictionary.
pub fn decoded_dict_eval_eq(
    col: &DecodedColumn,
    value: &str,
    row_count: usize,
) -> Option<Vec<u64>> {
    let (row_ids, dictionary, validity) = unpack_decoded(col)?;
    let mask = find_dict_id(dictionary, value).map_or_else(
        || zero_mask(row_count),
        |wanted| build_eq_mask(row_ids, validity, wanted, row_count),
    );
    Some(mask)
}
/// Evaluates `column != value` on a dictionary-encoded decoded column.
///
/// Returns `None` when `col` is not dictionary-encoded; otherwise a per-row bitmask of
/// the valid rows whose id differs from that of `value`. When `value` is not in the
/// dictionary, every valid row passes.
pub fn decoded_dict_eval_ne(
    col: &DecodedColumn,
    value: &str,
    row_count: usize,
) -> Option<Vec<u64>> {
    let (row_ids, dictionary, validity) = unpack_decoded(col)?;
    let mask = if let Some(excluded) = find_dict_id(dictionary, value) {
        build_ne_mask(row_ids, validity, excluded, row_count)
    } else {
        all_valid_mask(validity, row_count)
    };
    Some(mask)
}
/// Evaluates "column contains `substr`" on a dictionary-encoded decoded column.
///
/// Returns `None` when `col` is not dictionary-encoded; otherwise a per-row bitmask of
/// the valid rows whose dictionary entry contains `substr` (all zeros when no entry
/// does).
pub fn decoded_dict_eval_contains(
    col: &DecodedColumn,
    substr: &str,
    row_count: usize,
) -> Option<Vec<u64>> {
    let (row_ids, dictionary, validity) = unpack_decoded(col)?;
    let hits = matching_ids_contains(dictionary, substr);
    let mask = if hits.is_empty() {
        // No entry matched, so skip the per-row scan entirely.
        zero_mask(row_count)
    } else {
        build_set_mask(row_ids, validity, &hits, row_count)
    };
    Some(mask)
}
/// Evaluates a LIKE `pattern` on a dictionary-encoded decoded column.
///
/// Returns `None` when `col` is not dictionary-encoded or when `matching_ids_like`
/// rejects the pattern as unsupported. Otherwise returns a per-row bitmask of the valid
/// rows whose dictionary entry matches (all zeros when no entry matches).
pub fn decoded_dict_eval_like(
    col: &DecodedColumn,
    pattern: &str,
    row_count: usize,
) -> Option<Vec<u64>> {
    let (row_ids, dictionary, validity) = unpack_decoded(col)?;
    let hits = matching_ids_like(dictionary, pattern)?;
    let mask = if hits.is_empty() {
        zero_mask(row_count)
    } else {
        build_set_mask(row_ids, validity, &hits, row_count)
    };
    Some(mask)
}
/// Number of `u64` words needed to hold one bit per row, i.e. `row_count / 64`
/// rounded up.
#[inline]
pub fn words_for(row_count: usize) -> usize {
    const BITS_PER_WORD: usize = 64;
    usize::div_ceil(row_count, BITS_PER_WORD)
}
/// Word-wise AND of two bitmasks.
///
/// The result has as many words as the shorter input; surplus words of the longer
/// input are dropped.
pub fn bitmask_and(a: &[u64], b: &[u64]) -> Vec<u64> {
    // `zip` ends with the shorter slice, which yields the `min(len)` truncation.
    a.iter().zip(b).map(|(lhs, rhs)| lhs & rhs).collect()
}
/// Builds a bitmask with exactly the first `row_count` bits set.
///
/// Full words are `u64::MAX`; when `row_count` is not a multiple of 64 the final word
/// keeps only its low `row_count % 64` bits, so no bit past the last row is set.
pub fn bitmask_all(row_count: usize) -> Vec<u64> {
    let mut mask = vec![u64::MAX; words_for(row_count)];
    // Only a partially used final word needs trimming.
    if let (tail_bits @ 1.., Some(last_word)) = (row_count % 64, mask.last_mut()) {
        *last_word = (1u64 << tail_bits) - 1;
    }
    mask
}
fn unpack_memtable(col: &ColumnData) -> Option<MemtableDict<'_>> {
if let ColumnData::DictEncoded {
ids,
dictionary,
reverse,
valid,
} = col
{
let validity = match valid {
Some(v) => std::borrow::Cow::Borrowed(v.as_slice()),
None => std::borrow::Cow::Owned(vec![true; ids.len()]),
};
Some((ids.as_slice(), dictionary.as_slice(), reverse, validity))
} else {
None
}
}
/// Splits a dictionary-encoded decoded column into `(ids, dictionary, valid)` slices.
///
/// Returns `None` for any other column variant.
fn unpack_decoded(col: &DecodedColumn) -> Option<(&[u32], &[String], &[bool])> {
    let DecodedColumn::DictEncoded {
        ids,
        dictionary,
        valid,
    } = col
    else {
        return None;
    };
    Some((ids.as_slice(), dictionary.as_slice(), valid.as_slice()))
}
/// Returns the id (position) of the first dictionary entry equal to `value`, or `None`
/// when no entry matches.
///
/// Ids are drawn from `0..=u32::MAX` rather than cast from `usize`, so a position that
/// does not fit in a `u32` is never reported (no row id could refer to it) instead of
/// being silently truncated onto a low id.
fn find_dict_id(dictionary: &[String], value: &str) -> Option<u32> {
    (0..=u32::MAX)
        .zip(dictionary)
        .find(|(_, entry)| entry.as_str() == value)
        .map(|(id, _)| id)
}
/// Collects the ids (positions) of all dictionary entries that contain `substr`.
///
/// An empty `substr` matches every entry. Ids are drawn from `0..=u32::MAX` rather
/// than cast from `usize`, so an entry whose position does not fit in a `u32` is
/// skipped instead of wrapping onto, and falsely selecting, a low id.
fn matching_ids_contains(dictionary: &[String], substr: &str) -> HashSet<u32> {
    (0..=u32::MAX)
        .zip(dictionary)
        .filter(|(_, entry)| entry.contains(substr))
        .map(|(id, _)| id)
        .collect()
}
/// Resolves a simple LIKE `pattern` to the set of dictionary ids (positions) whose
/// entry matches.
///
/// Only `%` is treated as a wildcard, and only at the edges of the pattern:
/// - `%text%` selects entries containing `text` (every leading/trailing `%` is stripped),
/// - `%text` selects entries ending with `text` (exactly one leading `%` is stripped),
/// - `text%` selects entries starting with `text` (exactly one trailing `%` is stripped),
/// - `text` selects entries equal to `text`.
///
/// Returns `None` when a `%` remains in the literal after stripping, i.e. the pattern
/// is not one of the shapes above.
///
/// Ids are drawn from `0..=u32::MAX` rather than cast from `usize`, so an entry whose
/// position does not fit in a `u32` is skipped instead of wrapping onto a low id.
///
/// NOTE(review): `_` is matched literally here, and `%%text` / `text%%` are rejected
/// while `%%text%%` is accepted — confirm both against the engine's general LIKE
/// evaluation.
fn matching_ids_like(dictionary: &[String], pattern: &str) -> Option<HashSet<u32>> {
    /// How the literal part of a supported pattern is compared with an entry.
    #[derive(Clone, Copy)]
    enum Shape {
        Contains,
        EndsWith,
        StartsWith,
        Exact,
    }

    // Classify the pattern by its edge wildcards and extract the literal part.
    let (literal, shape) = match (pattern.strip_prefix('%'), pattern.strip_suffix('%')) {
        (Some(_), Some(_)) => (pattern.trim_matches('%'), Shape::Contains),
        (Some(rest), None) => (rest, Shape::EndsWith),
        (None, Some(rest)) => (rest, Shape::StartsWith),
        (None, None) => (pattern, Shape::Exact),
    };

    // A wildcard left inside the literal cannot be served by a plain string test.
    if literal.contains('%') {
        return None;
    }

    let is_match = |entry: &str| match shape {
        Shape::Contains => entry.contains(literal),
        Shape::EndsWith => entry.ends_with(literal),
        Shape::StartsWith => entry.starts_with(literal),
        Shape::Exact => entry == literal,
    };

    Some(
        (0..=u32::MAX)
            .zip(dictionary)
            .filter(|(_, entry)| is_match(entry.as_str()))
            .map(|(id, _)| id)
            .collect(),
    )
}
/// Builds a `words_for(row_count)`-word bitmask marking every valid row whose id
/// equals `target_id`.
///
/// Row `i` maps to bit `i % 64` of word `i / 64`. Only the first
/// `min(row_count, ids.len(), valid.len())` rows are examined; later bits stay clear.
fn build_eq_mask(ids: &[u32], valid: &[bool], target_id: u32, row_count: usize) -> Vec<u64> {
    let mut mask = vec![0u64; words_for(row_count)];
    // `zip` stops at the shorter slice and `take` caps the scan at `row_count`.
    let rows = ids.iter().zip(valid).take(row_count).enumerate();
    for (row, (&id, &is_valid)) in rows {
        if is_valid && id == target_id {
            mask[row / 64] |= 1u64 << (row % 64);
        }
    }
    mask
}
/// Builds a `words_for(row_count)`-word bitmask marking every valid row whose id
/// differs from `target_id`.
///
/// Row `i` maps to bit `i % 64` of word `i / 64`. Only the first
/// `min(row_count, ids.len(), valid.len())` rows are examined; invalid rows and rows
/// past that bound stay clear.
fn build_ne_mask(ids: &[u32], valid: &[bool], target_id: u32, row_count: usize) -> Vec<u64> {
    let mut mask = vec![0u64; words_for(row_count)];
    ids.iter()
        .zip(valid)
        .take(row_count)
        .enumerate()
        .filter(|&(_, (&id, &is_valid))| is_valid && id != target_id)
        .for_each(|(row, _)| mask[row / 64] |= 1u64 << (row % 64));
    mask
}
/// Builds a `words_for(row_count)`-word bitmask marking every valid row whose id is a
/// member of `matching`.
///
/// Row `i` maps to bit `i % 64` of word `i / 64`. Only the first
/// `min(row_count, ids.len(), valid.len())` rows are examined; later bits stay clear.
fn build_set_mask(
    ids: &[u32],
    valid: &[bool],
    matching: &HashSet<u32>,
    row_count: usize,
) -> Vec<u64> {
    let mut mask = vec![0u64; words_for(row_count)];
    for (row, (id, &is_valid)) in ids.iter().zip(valid).enumerate().take(row_count) {
        if is_valid && matching.contains(id) {
            mask[row / 64] |= 1u64 << (row % 64);
        }
    }
    mask
}
/// Returns an all-zero bitmask sized for `row_count` rows (no row selected).
#[inline]
fn zero_mask(row_count: usize) -> Vec<u64> {
    let word_count = words_for(row_count);
    vec![0; word_count]
}
/// Builds a `words_for(row_count)`-word bitmask with a bit set for every row whose
/// validity flag is `true`.
///
/// Row `i` maps to bit `i % 64` of word `i / 64`. Only the first
/// `min(row_count, valid.len())` rows are examined; later bits stay clear.
fn all_valid_mask(valid: &[bool], row_count: usize) -> Vec<u64> {
    let mut mask = vec![0u64; words_for(row_count)];
    let valid_rows = valid
        .iter()
        .take(row_count)
        .enumerate()
        .filter(|&(_, &is_valid)| is_valid);
    for (row, _) in valid_rows {
        mask[row / 64] |= 1u64 << (row % 64);
    }
    mask
}
/// Unit tests for the dictionary predicate evaluators and the bitmask helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::memtable::ColumnData;
    use crate::reader::DecodedColumn;
    use std::collections::HashMap;

    /// Dictionary-encodes `values` and returns `(ids, dictionary, reverse, valid)`.
    ///
    /// Each distinct string gets the next free id in first-seen order. `None` rows get
    /// the placeholder id 0 with a `false` validity flag.
    fn encode(
        values: &[Option<&str>],
    ) -> (Vec<u32>, Vec<String>, HashMap<String, u32>, Vec<bool>) {
        let mut dictionary: Vec<String> = Vec::new();
        let mut reverse: HashMap<String, u32> = HashMap::new();
        let mut ids: Vec<u32> = Vec::with_capacity(values.len());
        let mut valid: Vec<bool> = Vec::with_capacity(values.len());
        for value in values {
            let id = match value {
                None => 0,
                Some(text) => *reverse.entry((*text).to_string()).or_insert_with(|| {
                    let fresh = dictionary.len() as u32;
                    dictionary.push((*text).to_string());
                    fresh
                }),
            };
            ids.push(id);
            valid.push(value.is_some());
        }
        (ids, dictionary, reverse, valid)
    }

    /// Builds a memtable dictionary column (with an explicit validity vector).
    fn make_dict_col(values: &[Option<&str>]) -> ColumnData {
        let (ids, dictionary, reverse, valid) = encode(values);
        ColumnData::DictEncoded {
            ids,
            dictionary,
            reverse,
            valid: Some(valid),
        }
    }

    /// Builds a decoded dictionary column from the same encoding.
    fn make_decoded_col(values: &[Option<&str>]) -> DecodedColumn {
        let (ids, dictionary, _reverse, valid) = encode(values);
        DecodedColumn::DictEncoded {
            ids,
            dictionary,
            valid,
        }
    }

    /// Expands the first `row_count` bits of `mask` into one `bool` per row.
    fn bits(mask: &[u64], row_count: usize) -> Vec<bool> {
        (0..row_count)
            .map(|row| (mask[row / 64] & (1u64 << (row % 64))) != 0)
            .collect()
    }

    /// Unwraps an evaluator result and compares its first `expected.len()` bits with
    /// `expected`.
    #[track_caller]
    fn assert_rows(mask: Option<Vec<u64>>, expected: &[bool]) {
        let mask = mask.unwrap();
        assert_eq!(bits(&mask, expected.len()), expected);
    }

    #[test]
    fn dict_eq_match() {
        let col = make_dict_col(&[Some("web"), Some("db"), Some("web"), Some("cache")]);
        assert_rows(dict_eval_eq(&col, "web", 4), &[true, false, true, false]);
    }

    #[test]
    fn dict_eq_value_not_in_dict_returns_zero_mask() {
        let col = make_dict_col(&[Some("web"), Some("db")]);
        assert_rows(dict_eval_eq(&col, "missing", 2), &[false, false]);
    }

    #[test]
    fn dict_eq_null_rows_excluded() {
        let col = make_dict_col(&[Some("web"), None, Some("web")]);
        assert_rows(dict_eval_eq(&col, "web", 3), &[true, false, true]);
    }

    #[test]
    fn dict_ne_basic() {
        let col = make_dict_col(&[Some("web"), Some("db"), Some("web")]);
        assert_rows(dict_eval_ne(&col, "web", 3), &[false, true, false]);
    }

    #[test]
    fn dict_ne_value_not_in_dict_all_valid_rows_pass() {
        let col = make_dict_col(&[Some("web"), None, Some("db")]);
        assert_rows(dict_eval_ne(&col, "missing", 3), &[true, false, true]);
    }

    #[test]
    fn dict_contains_basic() {
        let col = make_dict_col(&[Some("web-1"), Some("db-1"), Some("web-2"), Some("cache")]);
        assert_rows(
            dict_eval_contains(&col, "web", 4),
            &[true, false, true, false],
        );
    }

    #[test]
    fn dict_contains_no_match_zero_mask() {
        let col = make_dict_col(&[Some("alpha"), Some("beta")]);
        assert_rows(dict_eval_contains(&col, "gamma", 2), &[false, false]);
    }

    #[test]
    fn dict_like_prefix_wildcard() {
        let col = make_dict_col(&[Some("web-1"), Some("db-1"), Some("web-2")]);
        assert_rows(dict_eval_like(&col, "web%", 3), &[true, false, true]);
    }

    #[test]
    fn dict_like_suffix_wildcard() {
        let col = make_dict_col(&[Some("alpha-web"), Some("beta-db"), Some("gamma-web")]);
        assert_rows(dict_eval_like(&col, "%web", 3), &[true, false, true]);
    }

    #[test]
    fn dict_like_both_wildcards() {
        let col = make_dict_col(&[Some("alpha-web-1"), Some("beta-db"), Some("gamma-web-2")]);
        assert_rows(dict_eval_like(&col, "%web%", 3), &[true, false, true]);
    }

    #[test]
    fn dict_like_exact_no_wildcards() {
        let col = make_dict_col(&[Some("exact"), Some("other")]);
        assert_rows(dict_eval_like(&col, "exact", 2), &[true, false]);
    }

    #[test]
    fn dict_like_unsupported_mid_wildcard_returns_none() {
        let col = make_dict_col(&[Some("abc")]);
        assert!(dict_eval_like(&col, "a%c", 1).is_none());
    }

    #[test]
    fn decoded_dict_eq_match() {
        let col = make_decoded_col(&[Some("web"), Some("db"), Some("web")]);
        assert_rows(decoded_dict_eval_eq(&col, "web", 3), &[true, false, true]);
    }

    #[test]
    fn decoded_dict_eq_not_in_dict() {
        let col = make_decoded_col(&[Some("web"), Some("db")]);
        assert_rows(decoded_dict_eval_eq(&col, "missing", 2), &[false, false]);
    }

    #[test]
    fn decoded_dict_ne_not_in_dict_all_valid_pass() {
        let col = make_decoded_col(&[Some("a"), None, Some("b")]);
        assert_rows(
            decoded_dict_eval_ne(&col, "missing", 3),
            &[true, false, true],
        );
    }

    #[test]
    fn decoded_dict_contains() {
        let col = make_decoded_col(&[Some("web-1"), Some("db"), Some("web-2")]);
        assert_rows(
            decoded_dict_eval_contains(&col, "web", 3),
            &[true, false, true],
        );
    }

    #[test]
    fn decoded_dict_like() {
        let col = make_decoded_col(&[Some("web-1"), Some("db"), Some("web-2")]);
        assert_rows(
            decoded_dict_eval_like(&col, "web%", 3),
            &[true, false, true],
        );
    }

    #[test]
    fn bitmask_all_correct_tail_bits() {
        // 65 rows: one full word plus a single bit in the second word.
        assert_eq!(bitmask_all(65), vec![u64::MAX, 1u64]);
        // 66 rows: two bits in the second word.
        assert_eq!(bitmask_all(66)[1], 0b11u64);
    }

    #[test]
    fn words_for_alignment() {
        for (rows, words) in [(0usize, 0usize), (1, 1), (64, 1), (65, 2)] {
            assert_eq!(words_for(rows), words);
        }
    }

    #[test]
    fn non_dict_encoded_col_returns_none() {
        let col = ColumnData::Int64 {
            values: vec![1, 2, 3],
            valid: Some(vec![true, true, true]),
        };
        assert!(dict_eval_eq(&col, "x", 3).is_none());
        assert!(dict_eval_ne(&col, "x", 3).is_none());
        assert!(dict_eval_contains(&col, "x", 3).is_none());
        assert!(dict_eval_like(&col, "x%", 3).is_none());
    }
}