memoir_core/graph/resolve.rs
1//! Mapping extracted entity strings to canonical graph nodes.
2//!
3//! A [`Triple`](crate::graph::Triple)'s `subject` and `object` are bare strings
4//! the extractor produced; two memories that mention the same real-world entity
5//! under different surface forms ("Alice", "Alice Smith") must resolve to the
6//! same graph node or the graph fragments into duplicates. [`EntityResolver`] is
7//! the seam that maps a candidate string to either an existing node or a new
8//! one.
9//!
10//! Two implementations ship: [`ExactStringResolver`] (the benchmark floor —
11//! match on canonical name only) and [`EmbeddingEntityResolver`] (the
12//! production impl — cosine similarity over candidate embeddings). Both partition
13//! the resolution space by [`Scope`] so an entity in one tenant never merges with
14//! a like-named entity in another.
15//!
16//! The candidates an embedding resolver compares against come from an
17//! [`EntityCatalog`], a focused retrieval seam separate from
18//! [`GraphStore`](crate::graph::GraphStore): the resolver needs only
19//! "the entities I might merge with in this scope", not the full store surface.
20//! [`InMemoryEntityCatalog`] is the always-available backing for tests and
21//! benchmarks; a FalkorDB-backed catalog (reading the nodes the commit path
22//! writes) is layered on in the commit ticket.
23
24use std::collections::HashMap;
25use std::future::Future;
26use std::sync::Mutex;
27
28use crate::embedding::{EmbeddingError, EmbeddingModel};
29use crate::memory::Scope;
30
31use super::cosine::cosine_similarity;
32
33/// Minimum cosine similarity for two entity strings to be the same node.
34///
35/// Below this floor the surface forms are treated as distinct entities and a
36/// new node is created. Mirrors the `MIN_CATEGORY_SCORE` precedent
37/// (`client/categorize.rs`): a deliberately conservative default that favors a
38/// new node over a wrong merge, since an over-eager merge is harder to undo than
39/// a duplicate.
40///
41/// Chosen by measurement (`benches/resolver_accuracy.rs`, BGE-small-en-v1.5,
42/// 2026-06-10): should-merge name variants scored ≥ 0.763 ("Alice Smith" vs
43/// "Alice") while should-split distinct entities scored ≤ 0.718, so the floor
44/// sits inside that band, biased toward the merge bound to keep favoring
45/// splits. The original 0.85 guess false-split "Alice Smith" in the live 0014
46/// suite.
47pub const MIN_ENTITY_SIMILARITY: f32 = 0.75;
48
49/// The outcome of resolving an entity string against the existing graph.
50///
51/// Tells the commit path whether to match an existing node or create one, and
52/// carries the canonical name so surface-form variants collapse onto a single
53/// node. A resolver that finds a match returns [`Resolution::Existing`] with the
54/// matched node's stable key; otherwise [`Resolution::New`] with the name to
55/// create.
56#[derive(Debug, Clone, PartialEq)]
57pub enum Resolution {
58 /// The entity matched an existing node identified by `key`.
59 Existing {
60 /// The matched node's stable identity within its scope.
61 key: String,
62 /// The matched node's canonical name (may differ from the query string).
63 name: String,
64 },
65
66 /// No existing node matched; a node should be created under `name`.
67 New {
68 /// The canonical name to create the node under.
69 name: String,
70 },
71}
72
73/// A candidate entity node the resolver compares a query against.
74///
75/// `key` is the node's stable identity within its scope; `name` is its canonical
76/// surface form; `embedding` is the vector of `name`, used for cosine matching.
77/// Yielded by an [`EntityCatalog`] for one scope.
78#[derive(Debug, Clone, PartialEq)]
79pub struct EntityVector {
80 /// The node's stable identity within its scope.
81 pub key: String,
82 /// The node's canonical name.
83 pub name: String,
84 /// The embedding of `name`, for cosine comparison.
85 pub embedding: Vec<f32>,
86}
87
88/// Yields the existing entity nodes a resolver may merge a candidate into.
89///
90/// A focused retrieval seam, deliberately narrower than
91/// [`GraphStore`](crate::graph::GraphStore): the embedding resolver needs only
92/// the candidate entities in one [`Scope`], not the full store surface. This is
93/// also the seam where a FalkorDB-native vector index later replaces the linear
94/// scan, behind the same trait, without changing the resolver.
95pub trait EntityCatalog: Send + Sync + 'static {
96 /// Returns the entity nodes within `scope` that a candidate may match.
97 ///
98 /// Scope-confined: implementations never yield entities from another scope,
99 /// upholding the same tenant isolation the rest of memoir enforces.
100 ///
101 /// # Errors
102 ///
103 /// Returns [`ResolveError::Catalog`] when the backing store cannot be read.
104 fn candidates_in_scope(
105 &self,
106 scope: &Scope,
107 ) -> impl Future<Output = Result<Vec<EntityVector>, ResolveError>> + Send;
108}
109
110/// Maps an extracted entity string to a canonical graph node.
111///
112/// Implementations decide whether a candidate is an existing node or a new one,
113/// always confined to the candidate's [`Scope`]. Swapping one implementation for
114/// another (exact-string, embedding, or a future ANN-backed impl) requires no
115/// caller change, which is what lets the benchmark compare them.
116pub trait EntityResolver: Send + Sync + 'static {
117 /// Resolves `entity` within `scope` to an existing or new node.
118 ///
119 /// # Errors
120 ///
121 /// Returns [`ResolveError::Catalog`] when candidate retrieval fails and
122 /// [`ResolveError::Embed`] when the embedding model fails.
123 fn resolve(&self, scope: &Scope, entity: &str) -> impl Future<Output = Result<Resolution, ResolveError>> + Send;
124}
125
126/// Failure modes for [`EntityResolver`] implementations.
127#[derive(Debug, thiserror::Error)]
128pub enum ResolveError {
129 /// Reading candidate entities from the [`EntityCatalog`] failed.
130 #[error("entity catalog read failed: {0}")]
131 Catalog(String),
132
133 /// Embedding the candidate entity string failed.
134 #[error("entity embedding failed: {0}")]
135 Embed(#[from] EmbeddingError),
136}
137
138/// Resolves entities by exact canonical-name match within a scope.
139///
140/// The benchmark floor: two surface forms of the same entity ("Alice", "Alice
141/// Smith") resolve to *different* nodes because only an exact name match counts.
142/// Useful as the baseline [`EmbeddingEntityResolver`] is measured against, and as
143/// a zero-dependency resolver when embedding-based merging is not wanted.
144pub struct ExactStringResolver<C> {
145 catalog: C,
146}
147
148impl<C: EntityCatalog> ExactStringResolver<C> {
149 /// Builds an exact-match resolver over `catalog`.
150 pub fn new(catalog: C) -> Self {
151 Self { catalog }
152 }
153}
154
155impl<C: EntityCatalog> EntityResolver for ExactStringResolver<C> {
156 async fn resolve(&self, scope: &Scope, entity: &str) -> Result<Resolution, ResolveError> {
157 let candidates = self.catalog.candidates_in_scope(scope).await?;
158 match candidates.into_iter().find(|candidate| candidate.name == entity) {
159 Some(matched) => Ok(Resolution::Existing {
160 key: matched.key,
161 name: matched.name,
162 }),
163 None => Ok(Resolution::New {
164 name: entity.to_string(),
165 }),
166 }
167 }
168}
169
170/// Resolves entities by cosine similarity over candidate embeddings.
171///
172/// The production impl: embeds the candidate string and compares it against the
173/// embeddings of existing nodes in the same scope, merging into the closest node
174/// above [`MIN_ENTITY_SIMILARITY`]. This collapses surface-form variants onto one
175/// node where the exact-string resolver would fragment them.
176///
177/// Candidate embeddings are stored as a node property and scanned exactly here;
178/// a FalkorDB-native vector index can later replace the catalog's linear scan
179/// behind the [`EntityCatalog`] seam without changing this resolver.
180///
181/// Generic over the embedder and catalog so tests inject stubs.
182pub struct EmbeddingEntityResolver<E, C> {
183 embedder: E,
184 catalog: C,
185 min_similarity: f32,
186}
187
188impl<E: EmbeddingModel, C: EntityCatalog> EmbeddingEntityResolver<E, C> {
189 /// Builds a resolver over `embedder` and `catalog` with the default floor.
190 pub fn new(embedder: E, catalog: C) -> Self {
191 Self {
192 embedder,
193 catalog,
194 min_similarity: MIN_ENTITY_SIMILARITY,
195 }
196 }
197
198 /// Overrides the minimum cosine similarity for a match.
199 #[must_use]
200 pub fn with_min_similarity(mut self, min_similarity: f32) -> Self {
201 self.min_similarity = min_similarity;
202 self
203 }
204}
205
206impl<E: EmbeddingModel, C: EntityCatalog> EntityResolver for EmbeddingEntityResolver<E, C> {
207 async fn resolve(&self, scope: &Scope, entity: &str) -> Result<Resolution, ResolveError> {
208 let query = self.embedder.embed(entity).await?;
209 let candidates = self.catalog.candidates_in_scope(scope).await?;
210
211 let best = candidates
212 .into_iter()
213 .filter_map(|candidate| cosine_similarity(&query, &candidate.embedding).map(|score| (score, candidate)))
214 .filter(|(score, _)| *score >= self.min_similarity)
215 .max_by(|(a, _), (b, _)| a.total_cmp(b));
216
217 match best {
218 Some((_, matched)) => Ok(Resolution::Existing {
219 key: matched.key,
220 name: matched.name,
221 }),
222 None => Ok(Resolution::New {
223 name: entity.to_string(),
224 }),
225 }
226 }
227}
228
229/// In-memory [`EntityCatalog`] for tests and benchmarks, with no live backend.
230///
231/// Holds entity vectors per scope so a resolver's matching logic runs against
232/// real candidates rather than a faked query result. This is the catalog's
233/// test/benchmark boundary, mirroring
234/// [`InMemoryGraphStore`](crate::graph::InMemoryGraphStore).
235///
236/// # Examples
237///
238/// ```
239/// use memoir_core::graph::{EntityCatalog, EntityVector, InMemoryEntityCatalog};
240/// use memoir_core::memory::Scope;
241///
242/// # async fn run() -> Result<(), Box<dyn std::error::Error>> {
243/// let scope = Scope { agent_id: "a".into(), org_id: "o".into(), user_id: "u".into() };
244/// let catalog = InMemoryEntityCatalog::new();
245/// catalog.insert(&scope, EntityVector { key: "e1".into(), name: "Alice".into(), embedding: vec![1.0, 0.0] });
246///
247/// let found = catalog.candidates_in_scope(&scope).await?;
248/// assert_eq!(found.len(), 1);
249/// # Ok(())
250/// # }
251/// ```
252#[derive(Default)]
253pub struct InMemoryEntityCatalog {
254 by_scope: Mutex<HashMap<Scope, Vec<EntityVector>>>,
255}
256
257impl InMemoryEntityCatalog {
258 /// Creates an empty catalog.
259 pub fn new() -> Self {
260 Self::default()
261 }
262
263 /// Adds `entity` to `scope`'s candidate set.
264 pub fn insert(&self, scope: &Scope, entity: EntityVector) {
265 self.by_scope
266 .lock()
267 .expect("entity catalog mutex poisoned")
268 .entry(scope.clone())
269 .or_default()
270 .push(entity);
271 }
272}
273
274impl EntityCatalog for InMemoryEntityCatalog {
275 async fn candidates_in_scope(&self, scope: &Scope) -> Result<Vec<EntityVector>, ResolveError> {
276 Ok(self
277 .by_scope
278 .lock()
279 .expect("entity catalog mutex poisoned")
280 .get(scope)
281 .cloned()
282 .unwrap_or_default())
283 }
284}
285
286#[cfg(test)]
287mod tests {
288 use super::*;
289
290 fn scope(user: &str) -> Scope {
291 Scope {
292 agent_id: "agent".to_string(),
293 org_id: "org".to_string(),
294 user_id: user.to_string(),
295 }
296 }
297
298 /// Embeds a name to a fixed unit vector so cosine is deterministic in tests.
299 ///
300 /// "Alice" and "Alice Smith" map to near-identical vectors (the surface-form
301 /// pair the embedding resolver must merge and the exact resolver must not);
302 /// "Bob" maps to an orthogonal vector.
303 struct FakeEmbedding;
304
305 impl EmbeddingModel for FakeEmbedding {
306 async fn embed(&self, text: &str) -> Result<Vec<f32>, EmbeddingError> {
307 let vector = if text.starts_with("Alice") {
308 vec![1.0, 0.0, 0.0]
309 } else if text == "Bob" {
310 vec![0.0, 1.0, 0.0]
311 } else {
312 vec![0.0, 0.0, 1.0]
313 };
314 Ok(vector)
315 }
316
317 fn dimensions(&self) -> usize {
318 3
319 }
320 }
321
322 async fn catalog_with_alice() -> InMemoryEntityCatalog {
323 let catalog = InMemoryEntityCatalog::new();
324 catalog.insert(
325 &scope("u"),
326 EntityVector {
327 key: "alice-node".to_string(),
328 name: "Alice".to_string(),
329 embedding: vec![1.0, 0.0, 0.0],
330 },
331 );
332 catalog
333 }
334
335 #[tokio::test(flavor = "current_thread")]
336 async fn should_match_exact_name_with_exact_resolver() {
337 let resolver = ExactStringResolver::new(catalog_with_alice().await);
338
339 let resolution = resolver.resolve(&scope("u"), "Alice").await.unwrap();
340
341 assert_eq!(
342 resolution,
343 Resolution::Existing {
344 key: "alice-node".to_string(),
345 name: "Alice".to_string(),
346 }
347 );
348 }
349
350 #[tokio::test(flavor = "current_thread")]
351 async fn should_create_new_for_surface_variant_with_exact_resolver() {
352 let resolver = ExactStringResolver::new(catalog_with_alice().await);
353
354 let resolution = resolver.resolve(&scope("u"), "Alice Smith").await.unwrap();
355
356 assert_eq!(
357 resolution,
358 Resolution::New {
359 name: "Alice Smith".to_string(),
360 }
361 );
362 }
363
364 #[tokio::test(flavor = "current_thread")]
365 async fn should_merge_surface_variant_with_embedding_resolver() {
366 let resolver = EmbeddingEntityResolver::new(FakeEmbedding, catalog_with_alice().await);
367
368 let resolution = resolver.resolve(&scope("u"), "Alice Smith").await.unwrap();
369
370 assert_eq!(
371 resolution,
372 Resolution::Existing {
373 key: "alice-node".to_string(),
374 name: "Alice".to_string(),
375 }
376 );
377 }
378
379 #[tokio::test(flavor = "current_thread")]
380 async fn should_create_new_for_dissimilar_entity_with_embedding_resolver() {
381 let resolver = EmbeddingEntityResolver::new(FakeEmbedding, catalog_with_alice().await);
382
383 let resolution = resolver.resolve(&scope("u"), "Bob").await.unwrap();
384
385 assert_eq!(
386 resolution,
387 Resolution::New {
388 name: "Bob".to_string()
389 }
390 );
391 }
392
393 #[tokio::test(flavor = "current_thread")]
394 async fn should_not_merge_across_scopes() {
395 let resolver = EmbeddingEntityResolver::new(FakeEmbedding, catalog_with_alice().await);
396
397 let resolution = resolver.resolve(&scope("other"), "Alice").await.unwrap();
398
399 assert_eq!(
400 resolution,
401 Resolution::New {
402 name: "Alice".to_string()
403 }
404 );
405 }
406}