Skip to main content

memoir_core/graph/
resolve.rs

1//! Mapping extracted entity strings to canonical graph nodes.
2//!
3//! A [`Triple`](crate::graph::Triple)'s `subject` and `object` are bare strings
4//! the extractor produced; two memories that mention the same real-world entity
5//! under different surface forms ("Alice", "Alice Smith") must resolve to the
6//! same graph node or the graph fragments into duplicates. [`EntityResolver`] is
7//! the seam that maps a candidate string to either an existing node or a new
8//! one.
9//!
10//! Two implementations ship: [`ExactStringResolver`] (the benchmark floor —
11//! match on canonical name only) and [`EmbeddingEntityResolver`] (the
12//! production impl — cosine similarity over candidate embeddings). Both partition
13//! the resolution space by [`Scope`] so an entity in one tenant never merges with
14//! a like-named entity in another.
15//!
16//! The candidates an embedding resolver compares against come from an
17//! [`EntityCatalog`], a focused retrieval seam separate from
18//! [`GraphStore`](crate::graph::GraphStore): the resolver needs only
19//! "the entities I might merge with in this scope", not the full store surface.
20//! [`InMemoryEntityCatalog`] is the always-available backing for tests and
21//! benchmarks; a FalkorDB-backed catalog (reading the nodes the commit path
22//! writes) is layered on in the commit ticket.
23
24use std::collections::HashMap;
25use std::future::Future;
26use std::sync::Mutex;
27
28use crate::embedding::{EmbeddingError, EmbeddingModel};
29use crate::memory::Scope;
30
31use super::cosine::cosine_similarity;
32
33/// Minimum cosine similarity for two entity strings to be the same node.
34///
35/// Below this floor the surface forms are treated as distinct entities and a
36/// new node is created. Mirrors the `MIN_CATEGORY_SCORE` precedent
37/// (`client/categorize.rs`): a deliberately conservative default that favors a
38/// new node over a wrong merge, since an over-eager merge is harder to undo than
39/// a duplicate.
40///
41/// Chosen by measurement (`benches/resolver_accuracy.rs`, BGE-small-en-v1.5,
42/// 2026-06-10): should-merge name variants scored ≥ 0.763 ("Alice Smith" vs
43/// "Alice") while should-split distinct entities scored ≤ 0.718, so the floor
44/// sits inside that band, biased toward the merge bound to keep favoring
45/// splits. The original 0.85 guess false-split "Alice Smith" in the live 0014
46/// suite.
47pub const MIN_ENTITY_SIMILARITY: f32 = 0.75;
48
49/// The outcome of resolving an entity string against the existing graph.
50///
51/// Tells the commit path whether to match an existing node or create one, and
52/// carries the canonical name so surface-form variants collapse onto a single
53/// node. A resolver that finds a match returns [`Resolution::Existing`] with the
54/// matched node's stable key; otherwise [`Resolution::New`] with the name to
55/// create.
56#[derive(Debug, Clone, PartialEq)]
57pub enum Resolution {
58    /// The entity matched an existing node identified by `key`.
59    Existing {
60        /// The matched node's stable identity within its scope.
61        key: String,
62        /// The matched node's canonical name (may differ from the query string).
63        name: String,
64    },
65
66    /// No existing node matched; a node should be created under `name`.
67    New {
68        /// The canonical name to create the node under.
69        name: String,
70    },
71}
72
73/// A candidate entity node the resolver compares a query against.
74///
75/// `key` is the node's stable identity within its scope; `name` is its canonical
76/// surface form; `embedding` is the vector of `name`, used for cosine matching.
77/// Yielded by an [`EntityCatalog`] for one scope.
78#[derive(Debug, Clone, PartialEq)]
79pub struct EntityVector {
80    /// The node's stable identity within its scope.
81    pub key: String,
82    /// The node's canonical name.
83    pub name: String,
84    /// The embedding of `name`, for cosine comparison.
85    pub embedding: Vec<f32>,
86}
87
88/// Yields the existing entity nodes a resolver may merge a candidate into.
89///
90/// A focused retrieval seam, deliberately narrower than
91/// [`GraphStore`](crate::graph::GraphStore): the embedding resolver needs only
92/// the candidate entities in one [`Scope`], not the full store surface. This is
93/// also the seam where a FalkorDB-native vector index later replaces the linear
94/// scan, behind the same trait, without changing the resolver.
95pub trait EntityCatalog: Send + Sync + 'static {
96    /// Returns the entity nodes within `scope` that a candidate may match.
97    ///
98    /// Scope-confined: implementations never yield entities from another scope,
99    /// upholding the same tenant isolation the rest of memoir enforces.
100    ///
101    /// # Errors
102    ///
103    /// Returns [`ResolveError::Catalog`] when the backing store cannot be read.
104    fn candidates_in_scope(
105        &self,
106        scope: &Scope,
107    ) -> impl Future<Output = Result<Vec<EntityVector>, ResolveError>> + Send;
108}
109
110/// Maps an extracted entity string to a canonical graph node.
111///
112/// Implementations decide whether a candidate is an existing node or a new one,
113/// always confined to the candidate's [`Scope`]. Swapping one implementation for
114/// another (exact-string, embedding, or a future ANN-backed impl) requires no
115/// caller change, which is what lets the benchmark compare them.
116pub trait EntityResolver: Send + Sync + 'static {
117    /// Resolves `entity` within `scope` to an existing or new node.
118    ///
119    /// # Errors
120    ///
121    /// Returns [`ResolveError::Catalog`] when candidate retrieval fails and
122    /// [`ResolveError::Embed`] when the embedding model fails.
123    fn resolve(&self, scope: &Scope, entity: &str) -> impl Future<Output = Result<Resolution, ResolveError>> + Send;
124}
125
126/// Failure modes for [`EntityResolver`] implementations.
127#[derive(Debug, thiserror::Error)]
128pub enum ResolveError {
129    /// Reading candidate entities from the [`EntityCatalog`] failed.
130    #[error("entity catalog read failed: {0}")]
131    Catalog(String),
132
133    /// Embedding the candidate entity string failed.
134    #[error("entity embedding failed: {0}")]
135    Embed(#[from] EmbeddingError),
136}
137
138/// Resolves entities by exact canonical-name match within a scope.
139///
140/// The benchmark floor: two surface forms of the same entity ("Alice", "Alice
141/// Smith") resolve to *different* nodes because only an exact name match counts.
142/// Useful as the baseline [`EmbeddingEntityResolver`] is measured against, and as
143/// a zero-dependency resolver when embedding-based merging is not wanted.
144pub struct ExactStringResolver<C> {
145    catalog: C,
146}
147
148impl<C: EntityCatalog> ExactStringResolver<C> {
149    /// Builds an exact-match resolver over `catalog`.
150    pub fn new(catalog: C) -> Self {
151        Self { catalog }
152    }
153}
154
155impl<C: EntityCatalog> EntityResolver for ExactStringResolver<C> {
156    async fn resolve(&self, scope: &Scope, entity: &str) -> Result<Resolution, ResolveError> {
157        let candidates = self.catalog.candidates_in_scope(scope).await?;
158        match candidates.into_iter().find(|candidate| candidate.name == entity) {
159            Some(matched) => Ok(Resolution::Existing {
160                key: matched.key,
161                name: matched.name,
162            }),
163            None => Ok(Resolution::New {
164                name: entity.to_string(),
165            }),
166        }
167    }
168}
169
170/// Resolves entities by cosine similarity over candidate embeddings.
171///
172/// The production impl: embeds the candidate string and compares it against the
173/// embeddings of existing nodes in the same scope, merging into the closest node
174/// above [`MIN_ENTITY_SIMILARITY`]. This collapses surface-form variants onto one
175/// node where the exact-string resolver would fragment them.
176///
177/// Candidate embeddings are stored as a node property and scanned exactly here;
178/// a FalkorDB-native vector index can later replace the catalog's linear scan
179/// behind the [`EntityCatalog`] seam without changing this resolver.
180///
181/// Generic over the embedder and catalog so tests inject stubs.
182pub struct EmbeddingEntityResolver<E, C> {
183    embedder: E,
184    catalog: C,
185    min_similarity: f32,
186}
187
188impl<E: EmbeddingModel, C: EntityCatalog> EmbeddingEntityResolver<E, C> {
189    /// Builds a resolver over `embedder` and `catalog` with the default floor.
190    pub fn new(embedder: E, catalog: C) -> Self {
191        Self {
192            embedder,
193            catalog,
194            min_similarity: MIN_ENTITY_SIMILARITY,
195        }
196    }
197
198    /// Overrides the minimum cosine similarity for a match.
199    #[must_use]
200    pub fn with_min_similarity(mut self, min_similarity: f32) -> Self {
201        self.min_similarity = min_similarity;
202        self
203    }
204}
205
206impl<E: EmbeddingModel, C: EntityCatalog> EntityResolver for EmbeddingEntityResolver<E, C> {
207    async fn resolve(&self, scope: &Scope, entity: &str) -> Result<Resolution, ResolveError> {
208        let query = self.embedder.embed(entity).await?;
209        let candidates = self.catalog.candidates_in_scope(scope).await?;
210
211        let best = candidates
212            .into_iter()
213            .filter_map(|candidate| cosine_similarity(&query, &candidate.embedding).map(|score| (score, candidate)))
214            .filter(|(score, _)| *score >= self.min_similarity)
215            .max_by(|(a, _), (b, _)| a.total_cmp(b));
216
217        match best {
218            Some((_, matched)) => Ok(Resolution::Existing {
219                key: matched.key,
220                name: matched.name,
221            }),
222            None => Ok(Resolution::New {
223                name: entity.to_string(),
224            }),
225        }
226    }
227}
228
229/// In-memory [`EntityCatalog`] for tests and benchmarks, with no live backend.
230///
231/// Holds entity vectors per scope so a resolver's matching logic runs against
232/// real candidates rather than a faked query result. This is the catalog's
233/// test/benchmark boundary, mirroring
234/// [`InMemoryGraphStore`](crate::graph::InMemoryGraphStore).
235///
236/// # Examples
237///
238/// ```
239/// use memoir_core::graph::{EntityCatalog, EntityVector, InMemoryEntityCatalog};
240/// use memoir_core::memory::Scope;
241///
242/// # async fn run() -> Result<(), Box<dyn std::error::Error>> {
243/// let scope = Scope { agent_id: "a".into(), org_id: "o".into(), user_id: "u".into() };
244/// let catalog = InMemoryEntityCatalog::new();
245/// catalog.insert(&scope, EntityVector { key: "e1".into(), name: "Alice".into(), embedding: vec![1.0, 0.0] });
246///
247/// let found = catalog.candidates_in_scope(&scope).await?;
248/// assert_eq!(found.len(), 1);
249/// # Ok(())
250/// # }
251/// ```
252#[derive(Default)]
253pub struct InMemoryEntityCatalog {
254    by_scope: Mutex<HashMap<Scope, Vec<EntityVector>>>,
255}
256
257impl InMemoryEntityCatalog {
258    /// Creates an empty catalog.
259    pub fn new() -> Self {
260        Self::default()
261    }
262
263    /// Adds `entity` to `scope`'s candidate set.
264    pub fn insert(&self, scope: &Scope, entity: EntityVector) {
265        self.by_scope
266            .lock()
267            .expect("entity catalog mutex poisoned")
268            .entry(scope.clone())
269            .or_default()
270            .push(entity);
271    }
272}
273
274impl EntityCatalog for InMemoryEntityCatalog {
275    async fn candidates_in_scope(&self, scope: &Scope) -> Result<Vec<EntityVector>, ResolveError> {
276        Ok(self
277            .by_scope
278            .lock()
279            .expect("entity catalog mutex poisoned")
280            .get(scope)
281            .cloned()
282            .unwrap_or_default())
283    }
284}
285
286#[cfg(test)]
287mod tests {
288    use super::*;
289
290    fn scope(user: &str) -> Scope {
291        Scope {
292            agent_id: "agent".to_string(),
293            org_id: "org".to_string(),
294            user_id: user.to_string(),
295        }
296    }
297
298    /// Embeds a name to a fixed unit vector so cosine is deterministic in tests.
299    ///
300    /// "Alice" and "Alice Smith" map to near-identical vectors (the surface-form
301    /// pair the embedding resolver must merge and the exact resolver must not);
302    /// "Bob" maps to an orthogonal vector.
303    struct FakeEmbedding;
304
305    impl EmbeddingModel for FakeEmbedding {
306        async fn embed(&self, text: &str) -> Result<Vec<f32>, EmbeddingError> {
307            let vector = if text.starts_with("Alice") {
308                vec![1.0, 0.0, 0.0]
309            } else if text == "Bob" {
310                vec![0.0, 1.0, 0.0]
311            } else {
312                vec![0.0, 0.0, 1.0]
313            };
314            Ok(vector)
315        }
316
317        fn dimensions(&self) -> usize {
318            3
319        }
320    }
321
322    async fn catalog_with_alice() -> InMemoryEntityCatalog {
323        let catalog = InMemoryEntityCatalog::new();
324        catalog.insert(
325            &scope("u"),
326            EntityVector {
327                key: "alice-node".to_string(),
328                name: "Alice".to_string(),
329                embedding: vec![1.0, 0.0, 0.0],
330            },
331        );
332        catalog
333    }
334
335    #[tokio::test(flavor = "current_thread")]
336    async fn should_match_exact_name_with_exact_resolver() {
337        let resolver = ExactStringResolver::new(catalog_with_alice().await);
338
339        let resolution = resolver.resolve(&scope("u"), "Alice").await.unwrap();
340
341        assert_eq!(
342            resolution,
343            Resolution::Existing {
344                key: "alice-node".to_string(),
345                name: "Alice".to_string(),
346            }
347        );
348    }
349
350    #[tokio::test(flavor = "current_thread")]
351    async fn should_create_new_for_surface_variant_with_exact_resolver() {
352        let resolver = ExactStringResolver::new(catalog_with_alice().await);
353
354        let resolution = resolver.resolve(&scope("u"), "Alice Smith").await.unwrap();
355
356        assert_eq!(
357            resolution,
358            Resolution::New {
359                name: "Alice Smith".to_string(),
360            }
361        );
362    }
363
364    #[tokio::test(flavor = "current_thread")]
365    async fn should_merge_surface_variant_with_embedding_resolver() {
366        let resolver = EmbeddingEntityResolver::new(FakeEmbedding, catalog_with_alice().await);
367
368        let resolution = resolver.resolve(&scope("u"), "Alice Smith").await.unwrap();
369
370        assert_eq!(
371            resolution,
372            Resolution::Existing {
373                key: "alice-node".to_string(),
374                name: "Alice".to_string(),
375            }
376        );
377    }
378
379    #[tokio::test(flavor = "current_thread")]
380    async fn should_create_new_for_dissimilar_entity_with_embedding_resolver() {
381        let resolver = EmbeddingEntityResolver::new(FakeEmbedding, catalog_with_alice().await);
382
383        let resolution = resolver.resolve(&scope("u"), "Bob").await.unwrap();
384
385        assert_eq!(
386            resolution,
387            Resolution::New {
388                name: "Bob".to_string()
389            }
390        );
391    }
392
393    #[tokio::test(flavor = "current_thread")]
394    async fn should_not_merge_across_scopes() {
395        let resolver = EmbeddingEntityResolver::new(FakeEmbedding, catalog_with_alice().await);
396
397        let resolution = resolver.resolve(&scope("other"), "Alice").await.unwrap();
398
399        assert_eq!(
400            resolution,
401            Resolution::New {
402                name: "Alice".to_string()
403            }
404        );
405    }
406}