use std::collections::{BTreeMap, HashMap};
use thiserror::Error;
use crate::registry::{DocId, FieldId};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum IndexType {
Hash,
Sorted,
Array,
Unique,
}
#[derive(Debug, Error, PartialEq, Eq)]
pub enum IndexError {
#[error("index already exists for field {0}")]
AlreadyExists(FieldId),
#[error("no index found for field {0}")]
NotFound(FieldId),
#[error("unique index violation: hash {hash} already maps to doc_id {existing_doc_id}")]
UniqueViolation {
hash: u32,
existing_doc_id: DocId,
},
}
#[derive(Debug, Default)]
pub struct IndexConfig {
entries: HashMap<FieldId, IndexType>,
}
impl IndexConfig {
#[must_use]
pub fn new() -> Self {
Self::default()
}
pub fn add(&mut self, field_id: FieldId, index_type: IndexType) -> Result<(), IndexError> {
if self.entries.contains_key(&field_id) {
return Err(IndexError::AlreadyExists(field_id));
}
self.entries.insert(field_id, index_type);
Ok(())
}
pub fn remove(&mut self, field_id: FieldId) -> Result<IndexType, IndexError> {
self.entries
.remove(&field_id)
.ok_or(IndexError::NotFound(field_id))
}
#[must_use]
pub fn lookup(&self, field_id: FieldId) -> Option<IndexType> {
self.entries.get(&field_id).copied()
}
#[must_use]
pub fn entries(&self) -> &HashMap<FieldId, IndexType> {
&self.entries
}
}
#[must_use]
pub fn hash32(data: &[u8]) -> u32 {
let mut hash: u32 = 0x811c_9dc5;
for &byte in data {
hash ^= u32::from(byte);
hash = hash.wrapping_mul(0x0100_0193);
}
hash
}
#[derive(Debug, Default)]
pub struct HashIndex {
buckets: HashMap<u32, Vec<DocId>>,
}
impl HashIndex {
#[must_use]
pub fn new() -> Self {
Self::default()
}
pub fn add(&mut self, hash: u32, doc_id: DocId) {
let bucket = self.buckets.entry(hash).or_default();
match bucket.binary_search(&doc_id) {
Ok(_) => {}
Err(pos) => bucket.insert(pos, doc_id),
}
}
pub fn remove(&mut self, hash: u32, doc_id: DocId) {
if let Some(bucket) = self.buckets.get_mut(&hash) {
if let Ok(pos) = bucket.binary_search(&doc_id) {
bucket.remove(pos);
}
if bucket.is_empty() {
self.buckets.remove(&hash);
}
}
}
#[must_use]
pub fn lookup(&self, hash: u32) -> &[DocId] {
self.buckets.get(&hash).map_or(&[], Vec::as_slice)
}
pub fn clear(&mut self) {
self.buckets.clear();
}
}
#[derive(Debug, Clone, Copy)]
pub struct OrderedF64(f64);
impl OrderedF64 {
#[must_use]
pub fn new(value: f64) -> Self {
Self(value)
}
#[must_use]
pub fn value(self) -> f64 {
self.0
}
fn to_sort_key(self) -> u64 {
let bits = self.0.to_bits();
if self.0.is_nan() {
return u64::MAX;
}
if bits >> 63 == 1 {
!bits
} else {
bits | (1 << 63)
}
}
}
impl PartialEq for OrderedF64 {
fn eq(&self, other: &Self) -> bool {
self.to_sort_key() == other.to_sort_key()
}
}
impl Eq for OrderedF64 {}
impl PartialOrd for OrderedF64 {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for OrderedF64 {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.to_sort_key().cmp(&other.to_sort_key())
}
}
#[derive(Debug, Default)]
pub struct SortedIndex {
tree: BTreeMap<OrderedF64, Vec<DocId>>,
}
impl SortedIndex {
#[must_use]
pub fn new() -> Self {
Self::default()
}
pub fn add(&mut self, key: f64, doc_id: DocId) {
let bucket = self.tree.entry(OrderedF64::new(key)).or_default();
match bucket.binary_search(&doc_id) {
Ok(_) => {}
Err(pos) => bucket.insert(pos, doc_id),
}
}
pub fn remove(&mut self, key: f64, doc_id: DocId) {
let ordered = OrderedF64::new(key);
if let Some(bucket) = self.tree.get_mut(&ordered) {
if let Ok(pos) = bucket.binary_search(&doc_id) {
bucket.remove(pos);
}
if bucket.is_empty() {
self.tree.remove(&ordered);
}
}
}
#[must_use]
pub fn range_query(&self, min: f64, max: f64) -> Vec<DocId> {
let lo = OrderedF64::new(min);
let hi = OrderedF64::new(max);
let mut result = Vec::new();
for (_key, bucket) in self.tree.range(lo..=hi) {
result.extend_from_slice(bucket);
}
result.sort_unstable();
result.dedup();
result
}
pub fn clear(&mut self) {
self.tree.clear();
}
}
#[derive(Debug, Default)]
pub struct UniqueIndex {
entries: HashMap<u32, Vec<DocId>>,
}
impl UniqueIndex {
#[must_use]
pub fn new() -> Self {
Self::default()
}
pub fn add(&mut self, hash: u32, doc_id: DocId) {
let bucket = self.entries.entry(hash).or_default();
match bucket.binary_search(&doc_id) {
Ok(_) => {}
Err(pos) => bucket.insert(pos, doc_id),
}
}
pub fn remove(&mut self, hash: u32, doc_id: DocId) {
if let Some(bucket) = self.entries.get_mut(&hash) {
if let Ok(pos) = bucket.binary_search(&doc_id) {
bucket.remove(pos);
}
if bucket.is_empty() {
self.entries.remove(&hash);
}
}
}
#[must_use]
pub fn lookup(&self, hash: u32) -> &[DocId] {
self.entries.get(&hash).map_or(&[], Vec::as_slice)
}
pub fn clear(&mut self) {
self.entries.clear();
}
}
#[derive(Debug, Default)]
pub struct CollectionIndexes {
hash_indexes: HashMap<FieldId, HashIndex>,
sorted_indexes: HashMap<FieldId, SortedIndex>,
array_indexes: HashMap<FieldId, HashIndex>,
unique_indexes: HashMap<FieldId, UniqueIndex>,
}
impl CollectionIndexes {
#[must_use]
pub fn new() -> Self {
Self::default()
}
pub fn get_or_create_hash(&mut self, field_id: FieldId) -> &mut HashIndex {
self.hash_indexes.entry(field_id).or_default()
}
pub fn get_or_create_sorted(&mut self, field_id: FieldId) -> &mut SortedIndex {
self.sorted_indexes.entry(field_id).or_default()
}
pub fn get_or_create_array(&mut self, field_id: FieldId) -> &mut HashIndex {
self.array_indexes.entry(field_id).or_default()
}
pub fn get_or_create_unique(&mut self, field_id: FieldId) -> &mut UniqueIndex {
self.unique_indexes.entry(field_id).or_default()
}
#[must_use]
pub fn hash(&self, field_id: FieldId) -> Option<&HashIndex> {
self.hash_indexes.get(&field_id)
}
#[must_use]
pub fn sorted(&self, field_id: FieldId) -> Option<&SortedIndex> {
self.sorted_indexes.get(&field_id)
}
#[must_use]
pub fn array(&self, field_id: FieldId) -> Option<&HashIndex> {
self.array_indexes.get(&field_id)
}
#[must_use]
pub fn unique(&self, field_id: FieldId) -> Option<&UniqueIndex> {
self.unique_indexes.get(&field_id)
}
pub fn remove_field(&mut self, field_id: FieldId) {
self.hash_indexes.remove(&field_id);
self.sorted_indexes.remove(&field_id);
self.array_indexes.remove(&field_id);
self.unique_indexes.remove(&field_id);
}
}
#[must_use]
pub fn intersect_sorted(a: &[DocId], b: &[DocId]) -> Vec<DocId> {
let mut result = Vec::new();
let (mut i, mut j) = (0, 0);
while i < a.len() && j < b.len() {
match a[i].cmp(&b[j]) {
std::cmp::Ordering::Less => i += 1,
std::cmp::Ordering::Greater => j += 1,
std::cmp::Ordering::Equal => {
result.push(a[i]);
i += 1;
j += 1;
}
}
}
result
}
#[must_use]
pub fn union_sorted(a: &[DocId], b: &[DocId]) -> Vec<DocId> {
let mut result = Vec::with_capacity(a.len() + b.len());
let (mut i, mut j) = (0, 0);
while i < a.len() && j < b.len() {
match a[i].cmp(&b[j]) {
std::cmp::Ordering::Less => {
result.push(a[i]);
i += 1;
}
std::cmp::Ordering::Greater => {
result.push(b[j]);
j += 1;
}
std::cmp::Ordering::Equal => {
result.push(a[i]);
i += 1;
j += 1;
}
}
}
result.extend_from_slice(&a[i..]);
result.extend_from_slice(&b[j..]);
result
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn hash_index_add_remove_lookup() {
let mut idx = HashIndex::new();
idx.add(100, 1);
idx.add(100, 3);
idx.add(100, 2);
assert_eq!(idx.lookup(100), &[1, 2, 3]);
idx.remove(100, 2);
assert_eq!(idx.lookup(100), &[1, 3]);
idx.remove(100, 1);
idx.remove(100, 3);
assert_eq!(idx.lookup(100), &[] as &[DocId]);
}
#[test]
fn hash_index_duplicate_add_is_idempotent() {
let mut idx = HashIndex::new();
idx.add(42, 7);
idx.add(42, 7);
idx.add(42, 7);
assert_eq!(idx.lookup(42), &[7]);
}
#[test]
fn hash_index_remove_nonexistent_is_noop() {
let mut idx = HashIndex::new();
idx.remove(999, 1);
idx.add(10, 5);
idx.remove(10, 99);
assert_eq!(idx.lookup(10), &[5]);
}
#[test]
fn sorted_index_add_remove_range_query() {
let mut idx = SortedIndex::new();
idx.add(1.0, 10);
idx.add(3.0, 30);
idx.add(2.0, 20);
idx.add(2.0, 25);
let result = idx.range_query(1.5, 3.0);
assert_eq!(result, vec![20, 25, 30]);
idx.remove(2.0, 20);
let result = idx.range_query(1.5, 3.0);
assert_eq!(result, vec![25, 30]);
}
#[test]
fn ordered_f64_ordering_negative_zero_positive() {
let neg = OrderedF64::new(-5.0);
let zero = OrderedF64::new(0.0);
let pos = OrderedF64::new(10.0);
let nan = OrderedF64::new(f64::NAN);
assert!(neg < zero);
assert!(zero < pos);
assert!(pos < nan);
assert!(neg < pos);
assert!(neg < nan);
}
#[test]
fn unique_index_add_lookup() {
let mut idx = UniqueIndex::new();
idx.add(42, 1);
assert_eq!(idx.lookup(42), &[1]);
}
#[test]
fn unique_index_allows_hash_collisions() {
let mut idx = UniqueIndex::new();
idx.add(42, 1);
idx.add(42, 2);
assert_eq!(idx.lookup(42), &[1, 2]);
}
#[test]
fn unique_index_same_doc_id_same_hash_is_ok() {
let mut idx = UniqueIndex::new();
idx.add(42, 1);
idx.add(42, 1);
assert_eq!(idx.lookup(42), &[1]);
}
#[test]
fn unique_index_remove_doc_id_only() {
let mut idx = UniqueIndex::new();
idx.add(42, 1);
idx.add(42, 2);
idx.remove(42, 1);
assert_eq!(idx.lookup(42), &[2]);
idx.remove(42, 2);
assert_eq!(idx.lookup(42), &[] as &[DocId]);
}
#[test]
fn index_config_add_remove_entries() {
let mut config = IndexConfig::new();
config.add(0, IndexType::Hash).expect("add should succeed");
config
.add(1, IndexType::Sorted)
.expect("add should succeed");
assert_eq!(config.lookup(0), Some(IndexType::Hash));
assert_eq!(config.lookup(1), Some(IndexType::Sorted));
assert_eq!(config.entries().len(), 2);
let removed = config.remove(0).expect("remove should succeed");
assert_eq!(removed, IndexType::Hash);
assert_eq!(config.lookup(0), None);
}
#[test]
fn index_config_duplicate_add_rejected() {
let mut config = IndexConfig::new();
config.add(0, IndexType::Hash).expect("add should succeed");
let err = config
.add(0, IndexType::Sorted)
.expect_err("duplicate add must fail");
assert!(matches!(err, IndexError::AlreadyExists(0)));
}
#[test]
fn index_config_remove_nonexistent_rejected() {
let mut config = IndexConfig::new();
let err = config
.remove(99)
.expect_err("remove non-existent must fail");
assert!(matches!(err, IndexError::NotFound(99)));
}
#[test]
fn intersect_sorted_disjoint() {
let result = intersect_sorted(&[1, 3, 5], &[2, 4, 6]);
assert!(result.is_empty());
}
#[test]
fn intersect_sorted_overlapping() {
let result = intersect_sorted(&[1, 2, 3, 5], &[2, 3, 4, 5]);
assert_eq!(result, vec![2, 3, 5]);
}
#[test]
fn intersect_sorted_empty_input() {
assert!(intersect_sorted(&[], &[1, 2, 3]).is_empty());
assert!(intersect_sorted(&[1, 2, 3], &[]).is_empty());
assert!(intersect_sorted(&[], &[]).is_empty());
}
#[test]
fn union_sorted_disjoint() {
let result = union_sorted(&[1, 3, 5], &[2, 4, 6]);
assert_eq!(result, vec![1, 2, 3, 4, 5, 6]);
}
#[test]
fn union_sorted_overlapping() {
let result = union_sorted(&[1, 2, 3], &[2, 3, 4]);
assert_eq!(result, vec![1, 2, 3, 4]);
}
#[test]
fn union_sorted_empty_input() {
assert_eq!(union_sorted(&[], &[1, 2]), vec![1, 2]);
assert_eq!(union_sorted(&[1, 2], &[]), vec![1, 2]);
assert!(union_sorted(&[], &[]).is_empty());
}
}