mnem_core/retrieve/mod.rs

//! Agent-facing retrieval: compose structured filters, dense vector
//! similarity, and learned-sparse retrieval under a token budget.
//!
//! The indexes in [`crate::index`] each answer one question in
//! isolation: "which nodes carry label X", "which are semantically
//! close to this embedding", "which fire on this sparse query". Real
//! agents need all three at once and cannot afford to overflow their
//! LLM context window.
//!
//! [`Retriever`] is the composition layer. It:
//!
//! 1. Collects candidate node IDs from each ranker (vector, sparse).
//! 2. Fuses ranked lists with Reciprocal Rank Fusion (RRF, k=60).
//! 3. Gates fused candidates through label / property filters.
//! 4. Renders each surviving node to a compact text form.
//! 5. Greedily packs results in RRF-rank order until the caller's
//!    token budget is exhausted (rank-order skip: if a node does not
//!    fit, move on; never reorder to exploit slack).
//!
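//! Step 2 uses the standard RRF formula: a node's fused score is the
//! sum, over the rankers that returned it, of `1 / (k + rank)` with
//! 1-based ranks and k = 60, so a node ranked 10th by both rankers
//! (2/70) outranks one ranked 1st by a single ranker (1/61).
//!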
//! The return value ([`RetrievalResult`]) carries both the packed
//! items and cost metadata (`tokens_used`, `dropped`, `candidates_seen`)
//! so callers can detect "the budget was tight and we left good stuff
//! out" without a second round-trip.
//!
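//! Step 5's packing rule, as a minimal free-standing sketch (`pack`
//! here is hypothetical; the real loop lives in [`Retriever`]):
//!
//! ```
//! use mnem_core::retrieve::{HeuristicEstimator, TokenEstimator};
//!
//! fn pack(rendered_in_rank_order: &[String], budget: u32) -> (Vec<String>, u32, u32) {
//!     let est = HeuristicEstimator;
//!     let (mut packed, mut used, mut dropped) = (Vec::new(), 0u32, 0u32);
//!     for text in rendered_in_rank_order {
//!         let cost = est.estimate(text);
//!         if used + cost <= budget {
//!             used += cost;
//!             packed.push(text.clone());
//!         } else {
//!             // Rank-order skip: keep scanning lower-ranked items that
//!             // might still fit, but never reorder to exploit slack.
//!             dropped += 1;
//!         }
//!     }
//!     (packed, used, dropped)
//! }
//!
//! let (items, used, dropped) = pack(&["hello world".to_string()], 10);
//! assert_eq!((items.len(), used, dropped), (1, 3, 0));
//! ```
//!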
//! # Determinism
//!
//! All upstream rankers return hits in `(score desc, node_id asc)`
//! order, RRF is a pure function of ranks, and rendering is a pure
//! function of the node. Two independent processes with the same repo
//! head and the same [`Retriever`] configuration produce byte-
//! identical [`RetrievalResult`] instances. This is the property that
//! lets agent replay and regression tests work.
//!
//! # Example
//!
//! ```no_run
//! # use mnem_core::repo::ReadonlyRepo;
//! # fn demo(repo: &ReadonlyRepo, embedding: Vec<f32>) -> Result<(), Box<dyn std::error::Error>> {
//! let result = repo
//!     .retrieve()
//!     .label("Document")
//!     .vector("openai:text-embedding-3-small", embedding)
//!     .token_budget(2000)
//!     .execute()?;
//!
//! println!(
//!     "packed {} nodes in {}/{} tokens, {} dropped",
//!     result.items.len(),
//!     result.tokens_used,
//!     result.tokens_budget,
//!     result.dropped,
//! );
//! for item in &result.items {
//!     println!("{}", item.rendered);
//! }
//! # Ok(()) }
//! ```

pub mod community_filter;
pub mod fusion;
pub mod retriever;
pub mod session_reservoir;
pub mod types;
pub mod warnings;

pub use community_filter::{
    CommunityFilterCfg, CommunityId, CommunityLookup, apply_community_filter,
};
pub use fusion::{
    convex_min_max_fusion, reciprocal_rank_fusion, score_normalized_fusion,
    weighted_reciprocal_rank_fusion,
};
pub use retriever::Retriever;
pub use types::{
    FusionStrategy, GraphExpand, GraphExpandDirection, GraphExpandMode, Lane, RetrievalResult,
    RetrievedItem, TemporalFilter,
};
pub use warnings::{WARNINGS_CAP, Warning, WarningCode, cap_warnings};

use std::fmt::Write as _;

use ipld_core::ipld::Ipld;

use crate::objects::Node;

// ============================================================
// Token estimation
// ============================================================

/// A byte/char counter that approximates an LLM tokenizer.
///
/// Implementations must be deterministic pure functions of their input.
/// The default [`HeuristicEstimator`] covers the common case without
/// pulling in tokenizer dependencies; agents that need exact OpenAI /
/// Anthropic / Llama counts plug in their own impl.
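///
/// # Example
///
/// A minimal custom impl, as a sketch (`WordCountEstimator` is
/// hypothetical; a production impl would wrap a real tokenizer):
///
/// ```
/// use mnem_core::retrieve::TokenEstimator;
///
/// // One token per whitespace-separated word: coarser than the
/// // default heuristic, but deterministic, which is all the trait
/// // contract demands.
/// #[derive(Debug)]
/// struct WordCountEstimator;
///
/// impl TokenEstimator for WordCountEstimator {
///     fn estimate(&self, text: &str) -> u32 {
///         text.split_whitespace().count() as u32
///     }
/// }
///
/// assert_eq!(WordCountEstimator.estimate(""), 0);
/// assert_eq!(WordCountEstimator.estimate("two words"), 2);
/// ```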
pub trait TokenEstimator: Send + Sync + std::fmt::Debug {
    /// Estimate the number of tokens `text` consumes under the target
    /// tokenizer. Returning zero for the empty string is required;
    /// otherwise the return value MAY be conservative (overestimate)
    /// but MUST NOT vary between calls with the same input.
    fn estimate(&self, text: &str) -> u32;
}

/// Byte / character heuristic tuned for modern LLM tokenizers.
///
/// - ASCII bytes are counted as `ceil(bytes / 4)` - the OpenAI rule of
///   thumb, accurate within ~20% for English prose under `cl100k_base`
///   and `o200k_base` tokenizers.
/// - Non-ASCII characters (Unicode scalars outside `[0x00, 0x7F]`) are
///   counted as `ceil(chars / 1.5)` - roughly one token per CJK glyph
///   and two per emoji or Arabic/Cyrillic run, again within ~25% of
///   actual tokenizer output.
///
/// The two contributions are summed. Good enough for budget packing;
/// swap in a real tokenizer for exact accounting.
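///
/// # Example
///
/// Worked instances of the two rules above:
///
/// ```
/// use mnem_core::retrieve::{HeuristicEstimator, TokenEstimator};
///
/// let est = HeuristicEstimator;
/// // 13 ASCII bytes -> ceil(13 / 4) = 4 tokens.
/// assert_eq!(est.estimate("Hello, world!"), 4);
/// // 3 CJK chars -> ceil(3 / 1.5) = 2 tokens.
/// assert_eq!(est.estimate("日本語"), 2);
/// ```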
#[derive(Debug, Default, Clone, Copy)]
pub struct HeuristicEstimator;

impl TokenEstimator for HeuristicEstimator {
    fn estimate(&self, text: &str) -> u32 {
        if text.is_empty() {
            return 0;
        }
        let mut ascii_bytes: u32 = 0;
        let mut non_ascii_chars: u32 = 0;
        for ch in text.chars() {
            if ch.is_ascii() {
                ascii_bytes += 1;
            } else {
                non_ascii_chars += 1;
            }
        }
        // `f32` precision is plenty for < 2^24 bytes; we ceil at the end
        // so under-counts are impossible.
        let ascii_tokens = (ascii_bytes as f32 / 4.0).ceil() as u32;
        let non_ascii_tokens = (non_ascii_chars as f32 / 1.5).ceil() as u32;
        ascii_tokens + non_ascii_tokens
    }
}

// ============================================================
// Node rendering
// ============================================================

/// Character cap applied to `summary` + `context_sentence` in
/// `render_node`. An unbounded summary silently consumed the entire
/// token budget on a single oversized node, producing zero-recall
/// retrieves; capping the render-time string protects the budget
/// packer without losing the underlying data on the node itself.
///
/// Override with `MNEM_RENDER_SUMMARY_CAP_CHARS`. Measured in
/// `char` count (Unicode scalars) so multi-byte scripts don't hit
/// byte-boundary panics.
pub const DEFAULT_RENDER_SUMMARY_CAP_CHARS: usize = 8192;

fn render_summary_cap() -> usize {
    static CAP: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
    *CAP.get_or_init(|| {
        std::env::var("MNEM_RENDER_SUMMARY_CAP_CHARS")
            .ok()
            .and_then(|s| s.parse().ok())
            .unwrap_or(DEFAULT_RENDER_SUMMARY_CAP_CHARS)
    })
}

/// Truncate to at most `cap` chars. Does NOT split a Unicode scalar
/// mid-sequence because `.chars()` iterates by scalar. Appends a
/// trailing `" <...+N chars>"` marker when truncation happens so a
/// downstream LLM can see the chunk was clipped.
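///
/// A worked instance of the rule, with non-ASCII input to show the
/// scalar-safe cut:
///
/// ```text
/// clip_for_render("αβγδε", 3) == "αβγ <...+2 chars>"
/// ```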
fn clip_for_render(s: &str, cap: usize) -> String {
    let total = s.chars().count();
    if total <= cap {
        return s.to_string();
    }
    let kept: String = s.chars().take(cap).collect();
    let dropped = total - cap;
    format!("{kept} <...+{dropped} chars>")
}

/// Render a [`Node`] to a compact, deterministic text representation
/// suitable for LLM consumption.
///
/// The format is YAML-like and stable across versions:
///
/// ```text
/// ntype: <ntype>
/// id: <uuid>
/// context: <context_sentence>
/// summary: <summary>
/// <prop_key>: <prop_value>
/// ...
/// ```
///
/// - `ntype` and `id` are always present.
/// - `context` is emitted iff `node.context_sentence` is `Some`. Sits
///   BEFORE `summary` so an LLM reading the rendered node sees the
///   chunk's positional cue first (per the Anthropic 2024
///   contextual-retrieval recipe).
/// - `summary` is emitted iff `node.summary` is `Some`. Clipped at
///   [`DEFAULT_RENDER_SUMMARY_CAP_CHARS`] (8192) chars by default,
///   overridable via `MNEM_RENDER_SUMMARY_CAP_CHARS`. A 1 MiB
///   summary on a single node would otherwise consume the entire
///   token budget and starve every other item out of the result.
/// - Scalar props (`String`, `Integer`, `Float`, `Bool`) are emitted in
///   BTreeMap order (alphabetical). Non-scalar props (`Link`, `Map`,
///   `List`, `Bytes`, `Null`) are skipped - an agent chasing a link
///   should follow it with a separate `mnem_get_node` call.
/// - Opaque `content` bytes are never rendered.
///
/// Determinism: since `Node.props` is a `BTreeMap`, iteration order is
/// byte-stable and the rendered string is therefore also byte-stable.
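///
/// For instance, a hypothetical `Document` node with one scalar prop
/// would render as (all values here are illustrative):
///
/// ```text
/// ntype: Document
/// id: 0193a2b4-0000-7000-8000-000000000001
/// context: Chapter 2 of the deployment guide.
/// summary: Explains the rollout strategy.
/// author: alice
/// ```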
#[must_use]
pub fn render_node(node: &Node) -> String {
    let cap = render_summary_cap();
    let mut s = String::new();
    let _ = writeln!(s, "ntype: {}", node.ntype);
    let _ = writeln!(s, "id: {}", node.id);
    if let Some(context) = &node.context_sentence {
        let _ = writeln!(s, "context: {}", clip_for_render(context, cap));
    }
    if let Some(summary) = &node.summary {
        let _ = writeln!(s, "summary: {}", clip_for_render(summary, cap));
    }
    for (key, value) in &node.props {
        if let Some(rendered) = render_scalar(value) {
            let _ = writeln!(s, "{key}: {rendered}");
        }
    }
    s
}

/// Like [`render_node`] but augments the output with two graph
/// adjacency blocks derived from `repo`'s current commit:
///
/// ```text
/// ntype: Doc
/// id: ...
/// summary: ...
/// Outgoing:
///   tagged -> <topic_id>
/// Incoming:
///   authored <- <alice_id>
/// ```
///
/// Each block is capped at `per_direction_cap` entries; if the
/// bucket had more, a trailing `... (+N more)` line is emitted so
/// the reader knows the display was clipped. Blocks with no edges
/// are omitted entirely. Entry order is the adjacency bucket's
/// stored order (SPEC §4.9 `(label, src|dst, edge_cid)` sort),
/// so the rendered string is byte-stable.
///
/// Use this from CLI / MCP / agent-facing paths that benefit from
/// showing surrounding graph context. The hot `Retriever::execute`
/// render path keeps calling the cheaper [`render_node`] because
/// adjacency-aware rendering there would add a per-item O(log n)
/// bucket fetch for every ranked candidate, and callers that do
/// not need the blocks should not pay the cost.
#[must_use]
pub fn render_node_with_adjacency(
    node: &Node,
    repo: &crate::repo::ReadonlyRepo,
    per_direction_cap: usize,
) -> String {
    let mut s = render_node(node);
    if let Ok(edges) = repo.outgoing_edges(&node.id, None)
        && !edges.is_empty()
    {
        let total = edges.len();
        let shown = total.min(per_direction_cap);
        let _ = writeln!(s, "Outgoing:");
        for edge in edges.iter().take(shown) {
            let _ = writeln!(s, "  {} -> {}", edge.etype, edge.dst);
        }
        if total > shown {
            let _ = writeln!(s, "  ... (+{} more)", total - shown);
        }
    }
    if let Ok(edges) = repo.incoming_edges(&node.id, None)
        && !edges.is_empty()
    {
        let total = edges.len();
        let shown = total.min(per_direction_cap);
        let _ = writeln!(s, "Incoming:");
        for edge in edges.iter().take(shown) {
            let _ = writeln!(s, "  {} <- {}", edge.etype, edge.src);
        }
        if total > shown {
            let _ = writeln!(s, "  ... (+{} more)", total - shown);
        }
    }
    s
}

fn render_scalar(v: &Ipld) -> Option<String> {
    match v {
        Ipld::String(s) => Some(s.clone()),
        Ipld::Integer(n) => Some(n.to_string()),
        Ipld::Float(f) => Some(f.to_string()),
        Ipld::Bool(b) => Some(b.to_string()),
        _ => None,
    }
}

#[cfg(test)]
mod tests;