use {
crate::relation::{Relation, RelationHeader},
kermit_iters::{JoinIterable, TrieIterable},
std::fmt,
};
pub struct ColumnTrieLayer {
data: Vec<usize>,
interval: Vec<usize>,
}
impl ColumnTrieLayer {
pub(super) fn data_range(&self, interval_index: usize) -> std::ops::Range<usize> {
let start = self.interval[interval_index];
let end = if interval_index + 1 < self.interval.len() {
self.interval[interval_index + 1]
} else {
self.data.len()
};
start..end
}
pub(super) fn child_data(&self, interval_index: usize) -> &[usize] {
let range = self.data_range(interval_index);
&self.data[range]
}
pub(super) fn intervals(&self) -> &[usize] { &self.interval }
pub(super) fn is_empty(&self) -> bool { self.data.is_empty() }
fn insert_key_and_shift_intervals(&mut self, pos: usize, key: usize, interval_index: usize) {
self.data.insert(pos, key);
for j in (interval_index + 1)..self.interval.len() {
self.interval[j] += 1;
}
}
fn add_interval(&mut self, i: usize) {
if i == self.interval.len() {
self.interval.push(self.data.len());
} else {
self.interval.insert(i, self.interval[i]);
}
}
}
pub struct ColumnTrie {
header: RelationHeader,
layers: Vec<ColumnTrieLayer>,
}
impl ColumnTrie {
pub fn layer(&self, layer_i: usize) -> &ColumnTrieLayer { &self.layers[layer_i] }
fn internal_insert(&mut self, tuple: &[usize]) {
let arity = self.header().arity();
let mut interval_index = 0;
'layer_loop: for (layer_i, &k) in tuple.iter().enumerate() {
let is_last_layer = layer_i == arity - 1;
if self.layers[layer_i].data.is_empty() {
self.layers[layer_i].data.push(k);
self.layers[layer_i].interval.push(0);
interval_index = 0;
continue;
}
let range = self.layers[layer_i].data_range(interval_index);
for i in range.clone() {
if self.layers[layer_i].data[i] == k {
continue 'layer_loop;
}
if k < self.layers[layer_i].data[i] {
self.layers[layer_i].insert_key_and_shift_intervals(i, k, interval_index);
if is_last_layer {
return;
}
self.layers[layer_i + 1].add_interval(i);
interval_index = i;
continue 'layer_loop;
}
}
let insert_pos = range.end;
if insert_pos == self.layers[layer_i].data.len() {
self.layers[layer_i].data.push(k);
} else {
self.layers[layer_i].insert_key_and_shift_intervals(insert_pos, k, interval_index);
}
if is_last_layer {
return;
}
self.layers[layer_i + 1].add_interval(insert_pos);
interval_index = insert_pos;
}
}
}
impl fmt::Display for ColumnTrie {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for (layer_i, layer) in self.layers.iter().enumerate() {
writeln!(f, "LAYER {layer_i}")?;
write!(f, "Data: [")?;
for (i, data) in layer.data.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{data}")?;
}
writeln!(f, "]")?;
write!(f, "Interval: [")?;
for (i, interval) in layer.interval.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{interval}")?;
}
writeln!(f, "]")?;
}
Ok(())
}
}
impl JoinIterable for ColumnTrie {}
impl crate::relation::Projectable for ColumnTrie {
fn project(&self, columns: Vec<usize>) -> Self {
let current_header = self.header();
let projected_attrs: Vec<String> = columns
.iter()
.filter_map(|&col_idx| current_header.attrs().get(col_idx).cloned())
.collect();
let new_header = if projected_attrs.is_empty() {
crate::relation::RelationHeader::new_nameless_positional(columns.len())
} else {
crate::relation::RelationHeader::new_nameless(projected_attrs)
};
let all_tuples: Vec<Vec<usize>> = self.trie_iter().into_iter().collect();
let projected_tuples: Vec<Vec<usize>> = all_tuples
.into_iter()
.map(|tuple| columns.iter().map(|&col_idx| tuple[col_idx]).collect())
.collect();
Self::from_tuples(new_header, projected_tuples)
}
}
impl Relation for ColumnTrie {
fn header(&self) -> &RelationHeader { &self.header }
fn new(header: RelationHeader) -> Self {
ColumnTrie {
layers: (0..header.arity())
.map(|_| ColumnTrieLayer {
data: vec![],
interval: vec![],
})
.collect::<Vec<_>>(),
header,
}
}
fn from_tuples(header: RelationHeader, mut tuples: Vec<Vec<usize>>) -> Self {
if tuples.is_empty() {
Self::new(header)
} else {
tuples.sort_unstable_by(|a, b| {
for i in 0..a.len() {
match a[i].cmp(&b[i]) {
| std::cmp::Ordering::Less => return std::cmp::Ordering::Less,
| std::cmp::Ordering::Greater => return std::cmp::Ordering::Greater,
| std::cmp::Ordering::Equal => continue,
}
}
std::cmp::Ordering::Equal
});
let mut trie = Self::new(header);
for tuple in tuples {
trie.insert(tuple);
}
trie
}
}
fn insert(&mut self, tuple: Vec<usize>) {
assert_eq!(
tuple.len(),
self.header().arity(),
"tuple arity must match relation arity"
);
self.internal_insert(&tuple);
}
fn insert_all(&mut self, tuples: Vec<Vec<usize>>) {
for tuple in tuples {
self.insert(tuple);
}
}
}
impl crate::heap_size::HeapSize for ColumnTrie {
fn heap_size_bytes(&self) -> usize {
let layers_vec_bytes = self.layers.capacity() * std::mem::size_of::<ColumnTrieLayer>();
let layer_contents_bytes: usize = self
.layers
.iter()
.map(|layer| {
layer.data.capacity() * std::mem::size_of::<usize>()
+ layer.interval.capacity() * std::mem::size_of::<usize>()
})
.sum();
layers_vec_bytes + layer_contents_bytes
}
}
#[cfg(test)]
mod tests {
use {
super::ColumnTrie,
crate::relation::{Projectable, Relation as _},
kermit_iters::TrieIterable,
};
#[test]
fn test_insert() {
let mut trie = ColumnTrie::new(2.into());
trie.insert(vec![2, 3]);
println!("{trie}");
trie.insert(vec![3, 1]);
println!("{trie}");
trie.insert(vec![1, 2]);
println!("{trie}");
println!("potato")
}
#[test]
fn test_project() {
let mut trie = ColumnTrie::new(3.into());
trie.insert(vec![1, 2, 3]);
trie.insert(vec![4, 5, 6]);
trie.insert(vec![7, 8, 9]);
let projected = trie.project(vec![0, 2]);
assert_eq!(projected.header().arity(), 2);
let mut all_tuples: Vec<Vec<usize>> = projected.trie_iter().into_iter().collect();
all_tuples.sort();
assert_eq!(all_tuples, vec![vec![1, 3], vec![4, 6], vec![7, 9]]);
}
#[test]
fn test_project_with_named_attributes() {
let header = crate::relation::RelationHeader::new_nameless(vec![
"a".to_string(),
"b".to_string(),
"c".to_string(),
]);
let mut trie = ColumnTrie::new(header);
trie.insert(vec![1, 2, 3]);
trie.insert(vec![4, 5, 6]);
let projected = trie.project(vec![0, 2]);
assert_eq!(projected.header().arity(), 2);
assert_eq!(projected.header().attrs(), &[
"a".to_string(),
"c".to_string()
]);
let mut all_tuples: Vec<Vec<usize>> = projected.trie_iter().into_iter().collect();
all_tuples.sort();
assert_eq!(all_tuples, vec![vec![1, 3], vec![4, 6]]);
}
}
#[cfg(test)]
mod heap_size_tests {
use {
super::*,
crate::{HeapSize, Relation},
};
#[test]
fn empty_column_trie_heap_size() {
let trie = ColumnTrie::new(2.into());
let expected = trie.layers.capacity() * std::mem::size_of::<ColumnTrieLayer>();
assert_eq!(trie.heap_size_bytes(), expected);
}
#[test]
fn single_tuple_column_trie_heap_size() {
let trie = ColumnTrie::from_tuples(2.into(), vec![vec![1, 2]]);
assert!(trie.heap_size_bytes() > 0);
}
#[test]
fn more_tuples_means_more_heap() {
let small = ColumnTrie::from_tuples(2.into(), vec![vec![1, 2]]);
let large = ColumnTrie::from_tuples(2.into(), (0..100).map(|i| vec![i, i + 1]).collect());
assert!(large.heap_size_bytes() > small.heap_size_bytes());
}
#[test]
fn heap_size_is_deterministic_across_rebuilds() {
let tuples: Vec<Vec<usize>> = (0..50).map(|i| vec![i, i + 1]).collect();
let a = ColumnTrie::from_tuples(2.into(), tuples.clone());
let b = ColumnTrie::from_tuples(2.into(), tuples);
assert_eq!(a.heap_size_bytes(), b.heap_size_bytes());
}
}