Skip to main content

triblespace_search/
lib.rs

//! Content-addressed BM25 + HNSW indexes on top of triblespace
//! piles. See `docs/DESIGN.md` for the full design rationale.
//!
//! Two canonical blob types, loaded zero-copy via [`anybytes`]
//! with bit-packed bodies via [`jerky`]:
//! - [`succinct::SuccinctBM25Index`] (schema
//!   [`succinct::SuccinctBM25Blob`]) — term → doc retrieval
//!   where terms are 32-byte triblespace `Inline`s (text tokens,
//!   entity ids, tags, anything).
//! - [`succinct::SuccinctHNSWIndex`] (schema
//!   [`succinct::SuccinctHNSWBlob`]) — approximate
//!   k-nearest-neighbour over caller-supplied embeddings.
//!
//! [`bm25::BM25Builder::build`] goes direct-to-succinct
//! (sorts keys into a `CompressedUniverse` first, then
//! accumulates per-term postings in universe-code order — no
//! remap pass). [`hnsw::HNSWBuilder::build`] also returns the
//! succinct form directly (currently delegating through
//! `SuccinctHNSWIndex::from_naive` internally — the naive
//! intermediate is a necessary buffer because HNSW levels are
//! only revealed incrementally). Naive reference
//! implementations live under [`testing`] — see
//! [`testing::BM25Index`], [`testing::HNSWIndex`], and
//! [`testing::FlatIndex`] for oracles + benchmarks. Reach them
//! via `BM25Builder::build_naive()` / `HNSWBuilder::build_naive()`
//! / `FlatBuilder::build()`.
//!
//! Both indexes are rebuilt-and-replaced (no mutation); the
//! caller persists the resulting handle wherever appropriate
//! (branch metadata, commit metadata, a plain trible, or an
//! in-memory cache).
//!
//! # Query surface
//!
//! Two constraint shapes plug into `find!` / `and!` /
//! `pattern!`: a BM25 filter and a vector-similarity relation
//! (the third entry below is a unary convenience form of the
//! latter). Both follow the same rule: scoring is *not* a
//! bound variable. The constraint filters on a fixed
//! `score_floor` parameter; callers recompute the precise
//! score afterwards if they need it for ranking.
//!
//! - [`BM25Index::matches`][m] — multi-term BM25 filter.
//!   Binds `doc` to documents whose summed BM25 score across
//!   the query terms is `>= score_floor`. Pass `0.0` for
//!   "any matching doc". Same method on [`SuccinctBM25Index`][sbm25].
//!   Pair with [`BM25Index::score`][s] for ranking.
//! - [`AttachedHNSWIndex::similar`][sh] — symmetric binary
//!   similarity relation over two
//!   [`EmbHandle`][emb]-typed variables with a fixed cosine
//!   threshold. Same method on
//!   [`AttachedFlatIndex`][sf] and
//!   [`AttachedSuccinctHNSWIndex`][ssh].
//! - [`AttachedHNSWIndex::similar_to`][sth] — unary
//!   convenience for the common "search from a known handle"
//!   case; pins the probe on the call.
//!
//! [m]: bm25::BM25Index::matches
//! [s]: bm25::BM25Index::score
//! [sbm25]: succinct::SuccinctBM25Index
//! [sh]: hnsw::AttachedHNSWIndex::similar
//! [sth]: hnsw::AttachedHNSWIndex::similar_to
//! [sf]: hnsw::AttachedFlatIndex::similar
//! [ssh]: succinct::AttachedSuccinctHNSWIndex::similar
//! [emb]: schemas::EmbHandle
//!
//! # Quickstart
//!
//! This example names a type from the `succinct` module, which
//! is only compiled when the `succinct` cargo feature is enabled.
//!
//! ```
//! use triblespace_core::find;
//! use triblespace_core::id::Id;
//!
//! use triblespace_search::bm25::BM25Builder;
//! use triblespace_search::succinct::SuccinctBM25Index;
//! use triblespace_search::tokens::hash_tokens;
//!
//! // 1. Feed documents into an in-memory builder.
//! let mut b: BM25Builder = BM25Builder::new();
//! b.insert(Id::new([1; 16]).unwrap(), hash_tokens("the quick brown fox"));
//! b.insert(Id::new([2; 16]).unwrap(), hash_tokens("the lazy brown dog"));
//! b.insert(Id::new([3; 16]).unwrap(), hash_tokens("quick silver fox"));
//!
//! // 2. Build a succinct BM25 index in a single pass.
//! let idx: SuccinctBM25Index = b.build();
//!
//! // 3. Filter through the engine — constraint binds `doc`
//! //    only; `score_floor = 0.0` means "any matching doc".
//! let terms = hash_tokens("fox");
//! let docs: Vec<(Id,)> = find!(
//!     (doc: Id),
//!     idx.matches(doc, &terms, 0.0)
//! ).collect();
//! assert_eq!(docs.len(), 2);
//! ```
//!
//! See the `examples/` directory for runnable walkthroughs:
//! `compose_bm25_and_pattern` / `multi_term_bm25_search`
//! (BM25 + pattern joins), `compose_hnsw_and_pattern`
//! (vector similarity + pattern), `hybrid_search` (all
//! three composed in one `find!`), and `phrase_search` for
//! the typed-tokenizer pattern.
//!
//! [`jerky`]: https://docs.rs/jerky

103pub mod bm25;
104pub mod constraint;
105pub mod hnsw;
106#[cfg(feature = "succinct")]
107pub mod ring;
108pub mod schemas;
109#[cfg(feature = "succinct")]
110pub mod succinct;
111pub mod tokens;
112
113/// Reference implementations for tests and benchmarks.
114///
115/// The types re-exported here are naive (insertion-order,
116/// non-packed) forms that exist only to validate the succinct
117/// builds and to measure "how much does jerky packing actually
118/// save at this scale." They are not a production persistence
119/// path — persistence always goes through the succinct forms
120/// in [`succinct`].
121///
122/// - [`BM25Index`][testing::BM25Index] — reference BM25 scoring
123///   and query implementation. Produced by
124///   [`bm25::BM25Builder::build_naive`].
125/// - [`HNSWIndex`][testing::HNSWIndex] — node-major HNSW graph
126///   with inline neighbour lists. Produced by
127///   [`hnsw::HNSWBuilder::build_naive`]; also the input to
128///   [`succinct::SuccinctHNSWIndex::from_naive`] for callers
129///   who want to hold the naive form.
130/// - [`FlatIndex`][testing::FlatIndex] /
131///   [`FlatBuilder`][testing::FlatBuilder] — brute-force exact
132///   k-NN baseline, used as HNSW's recall oracle.
133pub mod testing {
134    // `#[doc(inline)]` makes rustdoc render the re-exported
135    // types' full docs at this path despite `#[doc(hidden)]` at
136    // their original location — the blessed path shows up in
137    // docs, the original doesn't.
138    #[doc(inline)]
139    pub use crate::bm25::BM25Index;
140    #[doc(inline)]
141    pub use crate::hnsw::{AttachedFlatIndex, AttachedHNSWIndex, FlatBuilder, FlatIndex, HNSWIndex};
142}
143
// Versioning policy: breaking byte-layout changes mint a new
// `BlobEncoding` id (see `SuccinctBM25Blob` / `SuccinctHNSWBlob`
// in `succinct.rs`). The type system then rules out
// mismatched-layout deserialization — there's no single
// global version number. `git log docs/DESIGN.md` has the
// progression of layout decisions; the blob encoding id in
// `succinct.rs` is authoritative for what any given binary
// can load.