use sochdb_core::learned_index::{LearnedSparseIndex, LookupResult};
use std::collections::{BTreeMap, HashSet};
use std::sync::Arc;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IndexType {
Learned,
BTree,
Hash,
None,
}
#[derive(Debug, Clone, Default)]
pub struct KeyStats {
pub count: usize,
pub min_key: u64,
pub max_key: u64,
pub is_monotonic: bool,
pub density: f64,
pub entropy: f64,
}
impl KeyStats {
pub fn analyze(keys: &[u64]) -> Self {
if keys.is_empty() {
return Self::default();
}
let min_key = *keys.iter().min().unwrap();
let max_key = *keys.iter().max().unwrap();
let count = keys.len();
let is_monotonic = keys.windows(2).all(|w| w[0] <= w[1]);
let range = (max_key - min_key + 1) as f64;
let density = count as f64 / range;
let entropy = Self::estimate_entropy(keys);
Self {
count,
min_key,
max_key,
is_monotonic,
density,
entropy,
}
}
fn estimate_entropy(keys: &[u64]) -> f64 {
if keys.len() < 2 {
return 0.0;
}
let gaps: Vec<u64> = keys.windows(2).map(|w| w[1] - w[0]).collect();
if gaps.is_empty() {
return 0.0;
}
let mean_gap = gaps.iter().sum::<u64>() as f64 / gaps.len() as f64;
if mean_gap == 0.0 {
return 0.0;
}
let variance = gaps
.iter()
.map(|&g| {
let diff = g as f64 - mean_gap;
diff * diff
})
.sum::<f64>()
/ gaps.len() as f64;
(variance.sqrt() / mean_gap).min(1.0)
}
pub fn recommend_index_type(&self) -> IndexType {
if self.count == 0 {
return IndexType::None;
}
if self.is_monotonic && self.density > 0.5 {
return IndexType::Learned;
}
if self.entropy < 0.3 {
return IndexType::Learned;
}
if self.entropy > 0.7 {
return IndexType::BTree;
}
IndexType::Learned
}
}
pub struct HybridIndex {
learned: LearnedSparseIndex,
keys: Vec<u64>,
key_map: BTreeMap<u64, usize>,
index_type: IndexType,
stats: KeyStats,
}
impl HybridIndex {
pub fn build(keys: &[u64]) -> Self {
let stats = KeyStats::analyze(keys);
let index_type = stats.recommend_index_type();
let learned = if index_type == IndexType::Learned {
LearnedSparseIndex::build(keys)
} else {
LearnedSparseIndex::empty()
};
let key_map: BTreeMap<u64, usize> = keys.iter().enumerate().map(|(i, &k)| (k, i)).collect();
Self {
learned,
keys: keys.to_vec(),
key_map,
index_type,
stats,
}
}
pub fn lookup(&self, key: u64) -> Option<usize> {
match self.index_type {
IndexType::Learned => self.lookup_learned(key),
IndexType::BTree | IndexType::Hash => self.key_map.get(&key).copied(),
IndexType::None => self.binary_search(key),
}
}
fn lookup_learned(&self, key: u64) -> Option<usize> {
match self.learned.lookup(key) {
LookupResult::Exact(pos) => Some(pos),
LookupResult::Range { low, high } => {
self.binary_search_range(key, low, high)
}
LookupResult::NotFound => None,
}
}
fn binary_search_range(&self, key: u64, low: usize, high: usize) -> Option<usize> {
if low > high || high >= self.keys.len() {
return None;
}
let slice = &self.keys[low..=high];
match slice.binary_search(&key) {
Ok(pos) => Some(low + pos),
Err(_) => None,
}
}
fn binary_search(&self, key: u64) -> Option<usize> {
self.keys.binary_search(&key).ok()
}
pub fn range_lookup(&self, start: u64, end: u64) -> Vec<usize> {
let start_pos = match self.keys.binary_search(&start) {
Ok(pos) => pos,
Err(pos) => pos,
};
let end_pos = match self.keys.binary_search(&end) {
Ok(pos) => pos + 1,
Err(pos) => pos,
};
(start_pos..end_pos.min(self.keys.len())).collect()
}
pub fn statistics(&self) -> &KeyStats {
&self.stats
}
pub fn index_type(&self) -> IndexType {
self.index_type
}
pub fn is_efficient(&self) -> bool {
self.learned.is_efficient()
}
pub fn memory_bytes(&self) -> usize {
self.learned.memory_bytes()
+ self.keys.len() * std::mem::size_of::<u64>()
+ self.key_map.len() * (std::mem::size_of::<u64>() + std::mem::size_of::<usize>())
}
}
pub struct IndexManager {
indexes: BTreeMap<(String, String), Arc<HybridIndex>>,
}
impl IndexManager {
pub fn new() -> Self {
Self {
indexes: BTreeMap::new(),
}
}
pub fn build_index(&mut self, table: &str, column: &str, keys: &[u64]) {
let index = HybridIndex::build(keys);
self.indexes
.insert((table.to_string(), column.to_string()), Arc::new(index));
}
pub fn get_index(&self, table: &str, column: &str) -> Option<Arc<HybridIndex>> {
self.indexes
.get(&(table.to_string(), column.to_string()))
.cloned()
}
pub fn drop_index(&mut self, table: &str, column: &str) -> bool {
self.indexes
.remove(&(table.to_string(), column.to_string()))
.is_some()
}
pub fn list_indexes(&self) -> Vec<(&str, &str, IndexType)> {
self.indexes
.iter()
.map(|((table, column), index)| (table.as_str(), column.as_str(), index.index_type()))
.collect()
}
}
impl Default for IndexManager {
fn default() -> Self {
Self::new()
}
}
pub struct PointLookupExecutor<'a, V> {
index: &'a HybridIndex,
data: &'a [V],
}
impl<'a, V> PointLookupExecutor<'a, V> {
pub fn new(index: &'a HybridIndex, data: &'a [V]) -> Self {
Self { index, data }
}
pub fn execute(&self, key: u64) -> Option<&V> {
self.index.lookup(key).and_then(|pos| self.data.get(pos))
}
pub fn execute_batch(&self, keys: &[u64]) -> Vec<Option<&V>> {
keys.iter().map(|&k| self.execute(k)).collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_sequential_keys() {
let keys: Vec<u64> = (1..=1000).collect();
let stats = KeyStats::analyze(&keys);
assert!(stats.is_monotonic);
assert!(stats.density > 0.99);
assert_eq!(stats.recommend_index_type(), IndexType::Learned);
}
#[test]
fn test_timestamp_keys() {
let base = 1700000000000000u64; let keys: Vec<u64> = (0..1000).map(|i| base + i * 1000).collect();
let stats = KeyStats::analyze(&keys);
assert!(stats.is_monotonic);
assert!(stats.entropy < 0.1); assert_eq!(stats.recommend_index_type(), IndexType::Learned);
}
#[test]
fn test_hybrid_index_lookup() {
let keys: Vec<u64> = (0..1000).map(|i| i * 10).collect();
let index = HybridIndex::build(&keys);
assert_eq!(index.lookup(500), Some(50));
assert_eq!(index.lookup(990), Some(99));
assert_eq!(index.lookup(5), None);
assert_eq!(index.lookup(10000), None);
}
#[test]
fn test_range_lookup() {
let keys: Vec<u64> = (0..100).map(|i| i * 10).collect();
let index = HybridIndex::build(&keys);
let positions = index.range_lookup(100, 300);
assert_eq!(positions.len(), 21); assert_eq!(positions[0], 10); }
#[test]
fn test_point_lookup_executor() {
let keys: Vec<u64> = vec![10, 20, 30, 40, 50];
let values = vec!["a", "b", "c", "d", "e"];
let index = HybridIndex::build(&keys);
let executor = PointLookupExecutor::new(&index, &values);
assert_eq!(executor.execute(20), Some(&"b"));
assert_eq!(executor.execute(50), Some(&"e"));
assert_eq!(executor.execute(25), None);
}
#[test]
fn test_index_manager() {
let mut manager = IndexManager::new();
let keys: Vec<u64> = (0..100).collect();
manager.build_index("users", "id", &keys);
let index = manager.get_index("users", "id").unwrap();
assert_eq!(index.lookup(50), Some(50));
let indexes = manager.list_indexes();
assert_eq!(indexes.len(), 1);
assert_eq!(indexes[0], ("users", "id", IndexType::Learned));
}
}
#[derive(Debug, Clone)]
pub struct LinearSegment {
pub start_key: u64,
pub end_key: u64,
pub slope: f64,
pub intercept: f64,
pub max_error: usize,
}
impl LinearSegment {
pub fn predict(&self, key: u64) -> usize {
if key < self.start_key {
return 0;
}
let delta = (key - self.start_key) as f64;
(self.intercept + delta * self.slope).round() as usize
}
pub fn bounds(&self, key: u64, data_len: usize) -> (usize, usize) {
let pred = self.predict(key);
let low = pred.saturating_sub(self.max_error);
let high = (pred + self.max_error).min(data_len.saturating_sub(1));
(low, high)
}
}
#[derive(Debug)]
#[allow(dead_code)]
pub struct PiecewiseLearnedIndex {
segments: Vec<LinearSegment>,
max_error_bound: usize,
total_keys: usize,
stats: PiecewiseStats,
}
#[derive(Debug, Clone, Default)]
pub struct PiecewiseStats {
pub segment_count: usize,
pub avg_error: f64,
pub max_error: usize,
pub compression_ratio: f64,
}
impl PiecewiseLearnedIndex {
pub fn build(keys: &[u64], max_error: usize) -> Self {
if keys.is_empty() {
return Self {
segments: Vec::new(),
max_error_bound: max_error,
total_keys: 0,
stats: PiecewiseStats::default(),
};
}
let segments = Self::build_greedy(keys, max_error);
let stats = Self::compute_stats(&segments, keys.len());
Self {
segments,
max_error_bound: max_error,
total_keys: keys.len(),
stats,
}
}
pub fn build_optimal(keys: &[u64], max_segments: usize) -> Self {
if keys.is_empty() {
return Self {
segments: Vec::new(),
max_error_bound: 0,
total_keys: 0,
stats: PiecewiseStats::default(),
};
}
let segments = Self::build_dp(keys, max_segments);
let max_error = segments.iter().map(|s| s.max_error).max().unwrap_or(0);
let stats = Self::compute_stats(&segments, keys.len());
Self {
segments,
max_error_bound: max_error,
total_keys: keys.len(),
stats,
}
}
fn build_greedy(keys: &[u64], max_error: usize) -> Vec<LinearSegment> {
let mut segments = Vec::new();
let mut start_idx = 0;
while start_idx < keys.len() {
let (end_idx, segment) = Self::find_longest_segment(keys, start_idx, max_error);
segments.push(segment);
start_idx = end_idx + 1;
}
segments
}
fn find_longest_segment(
keys: &[u64],
start_idx: usize,
max_error: usize,
) -> (usize, LinearSegment) {
let start_key = keys[start_idx];
let mut end_idx = start_idx;
let mut best_slope = 0.0;
let mut best_intercept = start_idx as f64;
let mut best_error = 0;
for i in (start_idx + 1)..keys.len() {
let (slope, intercept, error) = Self::fit_segment(keys, start_idx, i);
if error <= max_error {
end_idx = i;
best_slope = slope;
best_intercept = intercept;
best_error = error;
} else {
break;
}
}
let end_key = if end_idx + 1 < keys.len() {
keys[end_idx + 1]
} else {
keys[end_idx].saturating_add(1)
};
(
end_idx,
LinearSegment {
start_key,
end_key,
slope: best_slope,
intercept: best_intercept,
max_error: best_error,
},
)
}
fn fit_segment(keys: &[u64], start: usize, end: usize) -> (f64, f64, usize) {
if start == end {
return (0.0, start as f64, 0);
}
let n = (end - start + 1) as f64;
let _start_key = keys[start] as f64;
let mut sum_x = 0.0;
let mut sum_y = 0.0;
let mut sum_xy = 0.0;
let mut sum_xx = 0.0;
for i in start..=end {
let x = (keys[i] - keys[start]) as f64;
let y = i as f64;
sum_x += x;
sum_y += y;
sum_xy += x * y;
sum_xx += x * x;
}
let slope = if sum_xx * n - sum_x * sum_x != 0.0 {
(sum_xy * n - sum_x * sum_y) / (sum_xx * n - sum_x * sum_x)
} else {
0.0
};
let intercept = (sum_y - slope * sum_x) / n;
let mut max_error = 0usize;
for i in start..=end {
let x = (keys[i] - keys[start]) as f64;
let predicted = (intercept + slope * x).round() as isize;
let actual = i as isize;
let error = (predicted - actual).unsigned_abs();
max_error = max_error.max(error);
}
(slope, intercept, max_error)
}
fn build_dp(keys: &[u64], max_segments: usize) -> Vec<LinearSegment> {
let n = keys.len();
if n == 0 || max_segments == 0 {
return Vec::new();
}
let mut dp: Vec<Vec<(f64, usize)>> =
vec![vec![(f64::INFINITY, 0); max_segments + 1]; n + 1];
dp[0][0] = (0.0, 0);
let segment_cost = |start: usize, end: usize| -> f64 {
let (_, _, error) = Self::fit_segment(keys, start, end);
error as f64
};
for i in 1..=n {
for k in 1..=max_segments.min(i) {
for j in 0..i {
if dp[j][k - 1].0 < f64::INFINITY {
let cost = dp[j][k - 1].0 + segment_cost(j, i - 1);
if cost < dp[i][k].0 {
dp[i][k] = (cost, j);
}
}
}
}
}
let mut best_k = 1;
let mut best_cost = f64::INFINITY;
for (k, dp_entry) in dp[n].iter().enumerate().take(max_segments + 1).skip(1) {
let lambda = 10.0; let cost = dp_entry.0 + lambda * k as f64;
if cost < best_cost {
best_cost = cost;
best_k = k;
}
}
let mut segments = Vec::new();
let mut i = n;
let mut k = best_k;
while k > 0 && i > 0 {
let j = dp[i][k].1;
let (slope, intercept, max_error) = Self::fit_segment(keys, j, i - 1);
let end_key = if i < n {
keys[i]
} else {
keys[i - 1].saturating_add(1)
};
segments.push(LinearSegment {
start_key: keys[j],
end_key,
slope,
intercept,
max_error,
});
i = j;
k -= 1;
}
segments.reverse();
segments
}
fn compute_stats(segments: &[LinearSegment], total_keys: usize) -> PiecewiseStats {
if segments.is_empty() {
return PiecewiseStats::default();
}
let segment_count = segments.len();
let total_error: usize = segments.iter().map(|s| s.max_error).sum();
let max_error = segments.iter().map(|s| s.max_error).max().unwrap_or(0);
let avg_error = total_error as f64 / segment_count as f64;
let compression_ratio = total_keys as f64 / segment_count as f64;
PiecewiseStats {
segment_count,
avg_error,
max_error,
compression_ratio,
}
}
pub fn lookup(&self, key: u64, data_len: usize) -> Option<(usize, usize)> {
if self.segments.is_empty() {
return None;
}
let segment_idx = self.find_segment(key)?;
let segment = &self.segments[segment_idx];
Some(segment.bounds(key, data_len))
}
fn find_segment(&self, key: u64) -> Option<usize> {
if self.segments.is_empty() {
return None;
}
let idx = self.segments.partition_point(|s| s.end_key <= key);
if idx > 0 && idx <= self.segments.len() {
let seg = &self.segments[idx - 1];
if key >= seg.start_key && key < seg.end_key {
return Some(idx - 1);
}
}
if idx < self.segments.len() {
let seg = &self.segments[idx];
if key >= seg.start_key && key < seg.end_key {
return Some(idx);
}
}
None
}
pub fn statistics(&self) -> &PiecewiseStats {
&self.stats
}
pub fn memory_bytes(&self) -> usize {
self.segments.len() * std::mem::size_of::<LinearSegment>()
}
pub fn segment_count(&self) -> usize {
self.segments.len()
}
}
#[cfg(test)]
mod piecewise_tests {
use super::*;
#[test]
fn test_piecewise_sequential() {
let keys: Vec<u64> = (0..1000).collect();
let index = PiecewiseLearnedIndex::build(&keys, 2);
assert!(index.segment_count() <= 5);
assert!(index.stats.avg_error <= 2.0);
let (low, high) = index.lookup(500, 1000).unwrap();
assert!(low <= 500 && 500 <= high);
}
#[test]
fn test_piecewise_timestamp() {
let base = 1700000000000000u64;
let keys: Vec<u64> = (0..1000).map(|i| base + i * 1000 + (i % 10)).collect();
let index = PiecewiseLearnedIndex::build(&keys, 5);
assert!(index.stats.max_error <= 5);
let (low, high) = index.lookup(base + 500000, 1000).unwrap();
assert!(high - low <= 10); }
#[test]
fn test_piecewise_optimal() {
let keys: Vec<u64> = (0..100).map(|i| i * i).collect();
let index = PiecewiseLearnedIndex::build_optimal(&keys, 10);
assert!(index.segment_count() >= 1);
for i in 0..100 {
let (low, high) = index.lookup(i * i, 100).unwrap();
assert!(
low <= i as usize && i as usize <= high,
"Key {} (i={}): expected bounds to contain {}, got ({}, {})",
i * i,
i,
i,
low,
high
);
}
}
#[test]
fn test_piecewise_memory() {
let keys: Vec<u64> = (0..10000).collect();
let index = PiecewiseLearnedIndex::build(&keys, 10);
let key_memory = keys.len() * std::mem::size_of::<u64>();
assert!(index.memory_bytes() < key_memory / 10);
println!(
"Compression: {} keys -> {} segments ({:.1}x)",
keys.len(),
index.segment_count(),
index.stats.compression_ratio
);
}
}
#[derive(Debug)]
pub struct RecursiveModelIndex {
root_slope: f64,
root_intercept: f64,
leaves: Vec<PiecewiseLearnedIndex>,
min_key: u64,
max_key: u64,
key_range: f64,
num_keys: usize,
max_error: usize,
}
impl RecursiveModelIndex {
pub fn build(keys: &[u64], num_leaves: usize, leaf_max_error: usize) -> Self {
let n = keys.len();
if n == 0 {
return Self {
root_slope: 0.0,
root_intercept: 0.0,
leaves: Vec::new(),
min_key: 0,
max_key: 0,
key_range: 0.0,
num_keys: 0,
max_error: 0,
};
}
let min_key = keys[0];
let max_key = keys[n - 1];
let key_range = if max_key == min_key {
1.0
} else {
(max_key - min_key) as f64
};
let num_leaves = num_leaves.min(n).max(1);
let bucket_size = n.div_ceil(num_leaves);
let root_slope = num_leaves as f64; let root_intercept = 0.0;
let mut leaves = Vec::with_capacity(num_leaves);
let mut global_max_error = 0usize;
for bucket_idx in 0..num_leaves {
let start = bucket_idx * bucket_size;
let end = ((bucket_idx + 1) * bucket_size).min(n);
if start < n {
let bucket_keys: Vec<u64> = keys[start..end].to_vec();
let leaf = PiecewiseLearnedIndex::build(&bucket_keys, leaf_max_error);
global_max_error = global_max_error.max(leaf.stats.max_error);
leaves.push(leaf);
}
}
Self {
root_slope,
root_intercept,
leaves,
min_key,
max_key,
key_range,
num_keys: n,
max_error: global_max_error,
}
}
pub fn lookup(&self, key: u64, data_len: usize) -> Option<(usize, usize)> {
if self.num_keys == 0 || key < self.min_key || key > self.max_key {
return None;
}
let normalized = (key - self.min_key) as f64 / self.key_range;
let leaf_idx_f = self.root_slope * normalized + self.root_intercept;
let leaf_idx = (leaf_idx_f as usize).min(self.leaves.len().saturating_sub(1));
if let Some(leaf) = self.leaves.get(leaf_idx) {
if let Some((rel_low, rel_high)) = leaf.lookup(key, data_len) {
let bucket_size = self.num_keys.div_ceil(self.leaves.len());
let bucket_start = leaf_idx * bucket_size;
let abs_low = bucket_start + rel_low;
let abs_high = (bucket_start + rel_high).min(data_len.saturating_sub(1));
return Some((abs_low, abs_high));
}
}
Some((0, data_len.saturating_sub(1)))
}
pub fn size_bytes(&self) -> usize {
let base = std::mem::size_of::<Self>();
let leaves: usize = self.leaves.iter().map(|l| l.memory_bytes()).sum();
base + leaves
}
pub fn num_leaves(&self) -> usize {
self.leaves.len()
}
pub fn total_segments(&self) -> usize {
self.leaves.iter().map(|l| l.segment_count()).sum()
}
}
#[derive(Debug)]
pub struct DeltaIndex {
entries: BTreeMap<u64, bool>,
insert_count: usize,
delete_count: usize,
rebuild_threshold: f64,
}
impl DeltaIndex {
pub fn new(rebuild_threshold: f64) -> Self {
Self {
entries: BTreeMap::new(),
insert_count: 0,
delete_count: 0,
rebuild_threshold,
}
}
pub fn insert(&mut self, key: u64) {
if let Some(deleted) = self.entries.get_mut(&key) {
if *deleted {
*deleted = false;
self.delete_count = self.delete_count.saturating_sub(1);
}
} else {
self.entries.insert(key, false);
self.insert_count += 1;
}
}
pub fn delete(&mut self, key: u64) {
self.entries.insert(key, true);
self.delete_count += 1;
}
pub fn contains(&self, key: u64) -> Option<bool> {
self.entries.get(&key).copied()
}
pub fn needs_rebuild(&self, static_size: usize) -> bool {
if static_size == 0 {
return self.entries.len() > 100;
}
let delta_size = self.insert_count + self.delete_count;
(delta_size as f64 / static_size as f64) > self.rebuild_threshold
}
pub fn live_keys(&self) -> impl Iterator<Item = u64> + '_ {
self.entries
.iter()
.filter(|(_, deleted)| !**deleted)
.map(|(k, _)| *k)
}
pub fn deleted_keys(&self) -> impl Iterator<Item = u64> + '_ {
self.entries
.iter()
.filter(|(_, deleted)| **deleted)
.map(|(k, _)| *k)
}
pub fn clear(&mut self) {
self.entries.clear();
self.insert_count = 0;
self.delete_count = 0;
}
pub fn len(&self) -> usize {
self.entries.len()
}
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
}
#[derive(Debug)]
pub struct HybridRMI {
rmi: RecursiveModelIndex,
delta: DeltaIndex,
keys: Vec<u64>,
btree_fallback: BTreeMap<u64, usize>,
stats: HybridRMIStats,
}
#[derive(Debug, Clone, Default)]
pub struct HybridRMIStats {
pub rmi_lookups: u64,
pub btree_lookups: u64,
pub delta_lookups: u64,
pub rebuilds: u64,
}
impl HybridRMI {
pub fn build(
keys: Vec<u64>,
num_leaves: usize,
leaf_max_error: usize,
rebuild_threshold: f64,
) -> Self {
let rmi = RecursiveModelIndex::build(&keys, num_leaves, leaf_max_error);
let _overflow_threshold = leaf_max_error * 3;
let mut btree_fallback = BTreeMap::new();
for (pos, &key) in keys.iter().enumerate() {
if let Some((low, high)) = rmi.lookup(key, keys.len())
&& (pos < low || pos > high)
{
btree_fallback.insert(key, pos);
}
}
Self {
rmi,
delta: DeltaIndex::new(rebuild_threshold),
keys,
btree_fallback,
stats: HybridRMIStats::default(),
}
}
pub fn lookup(&mut self, key: u64) -> Option<usize> {
if let Some(deleted) = self.delta.contains(key) {
self.stats.delta_lookups += 1;
if deleted {
return None; }
return None;
}
if let Some(&pos) = self.btree_fallback.get(&key) {
self.stats.btree_lookups += 1;
return Some(pos);
}
self.stats.rmi_lookups += 1;
if let Some((low, high)) = self.rmi.lookup(key, self.keys.len()) {
let range = &self.keys[low..=high.min(self.keys.len().saturating_sub(1))];
if let Ok(idx) = range.binary_search(&key) {
return Some(low + idx);
}
}
self.keys.binary_search(&key).ok()
}
pub fn insert(&mut self, key: u64) {
self.delta.insert(key);
if self.delta.needs_rebuild(self.keys.len()) {
self.rebuild();
}
}
pub fn delete(&mut self, key: u64) {
self.delta.delete(key);
}
pub fn rebuild(&mut self) {
let deleted: HashSet<u64> = self.delta.deleted_keys().collect();
let mut new_keys: Vec<u64> = self
.keys
.iter()
.filter(|&k| !deleted.contains(k))
.copied()
.collect();
new_keys.extend(self.delta.live_keys());
new_keys.sort_unstable();
new_keys.dedup();
let n = new_keys.len();
let num_leaves = ((n as f64).sqrt().ceil() as usize).max(1);
let leaf_max_error = self.rmi.max_error.max(10);
let new_rmi = RecursiveModelIndex::build(&new_keys, num_leaves, leaf_max_error);
let mut new_btree = BTreeMap::new();
for (pos, &key) in new_keys.iter().enumerate() {
if let Some((low, high)) = new_rmi.lookup(key, new_keys.len())
&& (pos < low || pos > high)
{
new_btree.insert(key, pos);
}
}
self.rmi = new_rmi;
self.keys = new_keys;
self.btree_fallback = new_btree;
self.delta.clear();
self.stats.rebuilds += 1;
}
pub fn stats(&self) -> &HybridRMIStats {
&self.stats
}
pub fn len(&self) -> usize {
self.keys.len()
}
pub fn is_empty(&self) -> bool {
self.keys.is_empty()
}
pub fn size_bytes(&self) -> usize {
self.rmi.size_bytes()
+ self.keys.len() * std::mem::size_of::<u64>()
+ self.delta.len() * std::mem::size_of::<(u64, bool)>()
+ self.btree_fallback.len()
* (std::mem::size_of::<u64>() + std::mem::size_of::<usize>())
}
}
#[cfg(test)]
mod rmi_tests {
use super::*;
#[test]
fn test_rmi_build() {
let keys: Vec<u64> = (0..10000).collect();
let rmi = RecursiveModelIndex::build(&keys, 100, 10);
assert_eq!(rmi.num_leaves(), 100);
assert!(rmi.max_error <= 10);
for i in (0..10000).step_by(100) {
if let Some((low, high)) = rmi.lookup(i, 10000) {
assert!(
low <= i as usize && high >= i as usize,
"Key {}: bounds ({}, {}) don't contain position",
i,
low,
high
);
}
}
}
#[test]
fn test_delta_index() {
let mut delta = DeltaIndex::new(0.1);
delta.insert(100);
delta.insert(200);
delta.delete(150);
assert_eq!(delta.contains(100), Some(false));
assert_eq!(delta.contains(150), Some(true)); assert_eq!(delta.contains(300), None);
delta.insert(150);
assert_eq!(delta.contains(150), Some(false));
}
#[test]
fn test_hybrid_rmi_lookup() {
let keys: Vec<u64> = (0..1000).step_by(2).collect();
let mut rmi = HybridRMI::build(keys, 10, 5, 0.1);
assert_eq!(rmi.lookup(100), Some(50));
assert_eq!(rmi.lookup(500), Some(250));
assert!(rmi.lookup(101).is_none());
}
#[test]
fn test_hybrid_rmi_updates() {
let keys: Vec<u64> = (0..100).collect();
let mut rmi = HybridRMI::build(keys, 5, 5, 0.5);
rmi.delete(50);
assert!(rmi.lookup(50).is_none());
rmi.insert(200);
assert!(!rmi.delta.is_empty());
}
#[test]
fn test_hybrid_rmi_rebuild() {
let keys: Vec<u64> = (0..100).collect();
let mut rmi = HybridRMI::build(keys, 5, 5, 0.05);
let initial_len = rmi.len();
for i in 100..115 {
rmi.insert(i);
}
assert!(rmi.stats().rebuilds > 0);
assert!(rmi.len() > initial_len);
}
#[test]
fn test_rmi_space_efficiency() {
let n = 100_000usize;
let keys: Vec<u64> = (0..n as u64).collect();
let rmi = RecursiveModelIndex::build(&keys, 100, 50);
let rmi_size = rmi.size_bytes();
let raw_size = n * std::mem::size_of::<u64>();
println!(
"RMI size: {} bytes, Raw keys: {} bytes, Ratio: {:.2}x",
rmi_size,
raw_size,
raw_size as f64 / rmi_size as f64
);
assert!(
rmi_size < raw_size / 10,
"RMI size {} should be < 10% of raw size {}",
rmi_size,
raw_size
);
}
}