obj-db 1.1.2

Embedded document database. Stable file format, full ACID, single-file portability.
Documentation
//! Index maintenance — wire `Collection<T>` writes through every
//! `Active` `IndexDescriptor`.
//!
//! See [`docs/format.md`](https://github.com/uname-n/obj/blob/master/docs/format.md)
//! § Indexes for the on-disk shape these
//! routines persist into. M7 #58 covers `Standard` + `Unique`; #59
//! extends `Each` + `Composite`.
//!
//! # Power-of-ten posture
//!
//! - **Rule 1.** No recursion; each maintenance pass iterates a
//!   bounded `&[IndexDescriptor]`.
//! - **Rule 2.** Loops over `descriptor.indexes` are bounded by the
//!   index count, which the catalog already validates against the
//!   per-collection ceiling.
//! - **Rule 4.** `apply_doc_change` is the single entry point;
//!   per-kind work delegates to small helpers.
//! - **Rule 5.** Pre-write `Unique` check runs **inside** the WAL
//!   transaction so a tripped constraint rolls back atomically with
//!   the primary write.
//! - **Rule 7.** Every `BTree::insert` / `BTree::delete` propagates
//!   errors via `?`; no `unwrap` / `expect` on the maintenance path.

#![forbid(unsafe_code)]

use std::collections::BTreeSet;

use obj_core::btree::BTree;
use obj_core::index::{extract_index_keys, EncodedIndexKey};
use obj_core::pager::page::PageId;
use obj_core::pager::Pager;
use obj_core::platform::FileHandle;
use obj_core::{
    CollectionDescriptor, Document, Error, Id, IndexDescriptor, IndexKind, IndexSpec, IndexStatus,
    Result,
};

/// Apply the per-doc index churn implied by a write.
///
/// - `old`: the document's pre-write state (None on insert).
/// - `new`: the document's post-write state (None on delete).
/// - `id`: the document's `Id`.
///
/// Calls `extract_index_keys` for every `Active` `IndexDescriptor`
/// on the collection, diffs old-vs-new key sets, and emits the
/// minimal B-tree mutation set. For `Unique` indexes the diff
/// includes a pre-insert existence check against the new key — a
/// hit returns [`Error::UniqueConstraintViolation`] before any
/// mutation lands.
///
/// #90: every index `root_page_id` advance lands IN PLACE on the
/// caller-supplied `descriptor` (which is the per-txn cache entry) —
/// this routine NO LONGER calls `Catalog::update`. The single
/// catalog flush is deferred to [`crate::WriteTxn::commit`] via
/// [`crate::WriteTxn::flush_descriptors`], so a 64-doc batch pays one
/// `Catalog::update` per touched collection instead of two per doc.
/// The `descriptor` the caller passes is the SOLE mid-txn source of
/// truth: the `Unique` pre-check (`apply_unique_diff` → `tree.get`)
/// and the index-tree open in [`maintain_index_from_keys`] descend
/// the roots on THIS descriptor, so they observe every prior eager
/// index write inside the uncommitted txn.
pub(crate) fn apply_doc_change<T: Document>(
    pager: &mut Pager<FileHandle>,
    descriptor: &mut CollectionDescriptor,
    old: Option<&T>,
    new: Option<&T>,
    id: Id,
) -> Result<()> {
    let active_indexes: Vec<usize> = descriptor
        .indexes
        .iter()
        .enumerate()
        .filter_map(|(i, d)| (d.status == IndexStatus::Active).then_some(i))
        .collect();
    for idx in active_indexes {
        maintain_one_index::<T>(pager, descriptor, idx, old, new, id)?;
    }
    Ok(())
}

/// Maintain a single `IndexDescriptor`'s B-tree under the
/// old → new transition. Updates `descriptor.indexes[idx].root_page_id`
/// in place on COW root advance.
fn maintain_one_index<T: Document>(
    pager: &mut Pager<FileHandle>,
    descriptor: &mut CollectionDescriptor,
    idx: usize,
    old: Option<&T>,
    new: Option<&T>,
    id: Id,
) -> Result<()> {
    let collection_name = T::COLLECTION;
    let spec = descriptor_to_spec(&descriptor.indexes[idx])?;
    let old_keys = match old {
        Some(doc) => extract_index_keys(collection_name, &spec, doc)?,
        None => Vec::new(),
    };
    let new_keys = match new {
        Some(doc) => extract_index_keys(collection_name, &spec, doc)?,
        None => Vec::new(),
    };
    maintain_index_from_keys(
        pager,
        descriptor,
        idx,
        collection_name,
        &spec,
        &old_keys,
        &new_keys,
        id,
    )
}

/// Maintain a single index B-tree from already-extracted (or
/// caller-supplied) field-encoded key sets — the kind-specific
/// **storage-key composition** shared by the typed write path
/// ([`maintain_one_index`]) and the raw-bytes write path
/// ([`crate::txn::WriteTxn::insert_raw_indexed`] et al.).
///
/// This is the NON-generic seam: it takes the OLD and NEW
/// field-encoded keys (an [`EncodedIndexKey`] is the
/// order-preserving encoding of one index field — what the C side
/// produces via `obj_index_key_encode` and what `extract_index_keys`
/// produces on the typed path) and applies the exact same per-kind
/// storage layout the engine has always used:
///
/// - `Unique`: the encoded key is the B-tree key as-is; the `Id`
///   (8 bytes BE) is the VALUE; a pre-insert existence check turns a
///   duplicate into [`Error::UniqueConstraintViolation`].
/// - `Standard` / `Each` / `Composite`: the 8-byte BE `Id` is
///   appended to each encoded key (so the B-tree key stays globally
///   unique) and stored with the `Id` as value too.
///
/// Updates `descriptor.indexes[idx].root_page_id` in place on a COW
/// root advance, exactly as [`maintain_one_index`] did inline before
/// the refactor — the on-disk key composition is unchanged.
#[allow(clippy::too_many_arguments)] // the seam threads every input the
                                     // per-kind composition needs; bundling them into a struct would add
                                     // indirection without reducing the real argument count.
pub(crate) fn maintain_index_from_keys(
    pager: &mut Pager<FileHandle>,
    descriptor: &mut CollectionDescriptor,
    idx: usize,
    collection_name: &str,
    spec: &IndexSpec,
    old_keys: &[EncodedIndexKey],
    new_keys: &[EncodedIndexKey],
    id: Id,
) -> Result<()> {
    let id_suffix = id.get().to_be_bytes();
    let mut tree = open_index_tree(pager, &descriptor.indexes[idx])?;
    match spec.kind {
        IndexKind::Unique => {
            apply_unique_diff(
                pager,
                &mut tree,
                collection_name,
                spec,
                old_keys,
                new_keys,
                &id_suffix,
            )?;
        }
        IndexKind::Standard | IndexKind::Each | IndexKind::Composite => {
            apply_nonunique_diff(pager, &mut tree, old_keys, new_keys, &id_suffix)?;
        }
        // `IndexKind` is `#[non_exhaustive]`; an unknown kind here means
        // this build is older than the index it is maintaining. Refuse
        // rather than silently mis-maintain the index.
        _ => return Err(Error::InvalidArgument("unsupported index kind")),
    }
    let new_root = tree.root().get();
    if new_root != descriptor.indexes[idx].root_page_id {
        descriptor.indexes[idx].root_page_id = new_root;
    }
    Ok(())
}

/// Diff the OLD vs NEW key sets for a `Unique` index. Emits the
/// minimum set of `delete(old)` / `insert(new)` calls and runs the
/// pre-insert existence check that turns a duplicate into
/// [`Error::UniqueConstraintViolation`].
fn apply_unique_diff(
    pager: &mut Pager<FileHandle>,
    tree: &mut BTree<FileHandle>,
    collection: &str,
    spec: &IndexSpec,
    old_keys: &[EncodedIndexKey],
    new_keys: &[EncodedIndexKey],
    id_suffix: &[u8],
) -> Result<()> {
    // Unique indexes always have at most 1 entry per doc.
    debug_assert!(old_keys.len() <= 1, "unique old key set must be 0..=1");
    debug_assert!(new_keys.len() <= 1, "unique new key set must be 0..=1");
    let old = old_keys.first().map(EncodedIndexKey::as_bytes);
    let new = new_keys.first().map(EncodedIndexKey::as_bytes);
    if old == new {
        return Ok(());
    }
    // Unique key encoding: just the user key. The id is stored as
    // the *value* (8 bytes BE) — the key alone is the constraint.
    if let Some(new_bytes) = new {
        if let Some(existing_value) = tree.get(pager, new_bytes)? {
            // A unique-index collision is OK if the existing entry
            // is *this same document's* prior entry (e.g. an update
            // that didn't change the indexed field but somehow we
            // got here — defense in depth). Compare the stored
            // value (= id_be) against the current id_suffix.
            if existing_value.as_slice() != id_suffix {
                return Err(Error::UniqueConstraintViolation {
                    collection: collection.to_owned(),
                    index: spec.name.clone(),
                    key: new_bytes.to_vec(),
                });
            }
        }
    }
    if let Some(old_bytes) = old {
        let _ = tree.delete(pager, old_bytes)?;
    }
    if let Some(new_bytes) = new {
        tree.insert(pager, new_bytes, id_suffix)?;
    }
    Ok(())
}

/// Diff the OLD vs NEW key sets for a non-unique index (`Standard`
/// / `Each` / `Composite`). Each entry's B-tree key is the encoded
/// user key with the 8-byte big-endian `Id` suffix appended.
fn apply_nonunique_diff(
    pager: &mut Pager<FileHandle>,
    tree: &mut BTree<FileHandle>,
    old_keys: &[EncodedIndexKey],
    new_keys: &[EncodedIndexKey],
    id_suffix: &[u8],
) -> Result<()> {
    // Cheap O(N log N) set diff via BTreeSet — index entries per
    // doc are typically O(1) to O(low-tens) so we don't need a
    // hash-based set here.
    let old_set: BTreeSet<Vec<u8>> = old_keys
        .iter()
        .map(|k| append_id_suffix(k.as_bytes(), id_suffix))
        .collect();
    let new_set: BTreeSet<Vec<u8>> = new_keys
        .iter()
        .map(|k| append_id_suffix(k.as_bytes(), id_suffix))
        .collect();
    for to_delete in old_set.difference(&new_set) {
        let _ = tree.delete(pager, to_delete)?;
    }
    for to_insert in new_set.difference(&old_set) {
        tree.insert(pager, to_insert, id_suffix)?;
    }
    Ok(())
}

/// Concatenate `user_key || id_suffix` into a fresh `Vec<u8>`. The
/// suffix is the 8-byte big-endian `Id` so range scans can recover
/// the document id by trimming the trailing 8 bytes.
fn append_id_suffix(user_key: &[u8], id_suffix: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(user_key.len() + id_suffix.len());
    out.extend_from_slice(user_key);
    out.extend_from_slice(id_suffix);
    out
}

/// Open the B+tree handle for an `IndexDescriptor`'s root.
fn open_index_tree(
    pager: &Pager<FileHandle>,
    descriptor: &IndexDescriptor,
) -> Result<BTree<FileHandle>> {
    let root = PageId::new(descriptor.root_page_id)
        .ok_or(Error::InvalidArgument("index descriptor root is zero"))?;
    BTree::<FileHandle>::open(pager, root)
}

/// Reconstruct an [`IndexSpec`] from an on-disk
/// [`IndexDescriptor`]. Used by the maintenance routines because
/// extraction needs the spec shape (kind + `key_paths` + name).
///
/// `pub(crate)` so the raw-bytes write path in [`crate::txn`] can
/// rebuild the spec for the kind-specific composition seam
/// ([`maintain_index_from_keys`]) without duplicating the
/// from-parts reconstruction.
pub(crate) fn descriptor_to_spec(d: &IndexDescriptor) -> Result<IndexSpec> {
    IndexSpec::from_parts(d.name.clone(), d.kind, d.key_paths.clone())
}