Skip to main content

memoir_core/graph/
synthesis.rs

1//! Reconciling extracted triples against semantic facts before commit.
2//!
3//! Synthesis is the fan-in of memoir's two LLM-derived siblings: the relational
4//! triples ([`TripleExtractor`](super::TripleExtractor)) and the flat semantic
5//! facts (the extraction worker). A [`Synthesizer`] reconciles them — vetoing
6//! triples the semantic facts do not corroborate (precision: kills hallucinated
7//! edges) and, where an implementation chooses, contributing relationships the
8//! triple pass missed (recall). Its output is the canonical triple set the
9//! commit path writes.
10//!
11//! Like every stage of how memoir's memory works, this is a swappable seam: the
12//! trait is model-agnostic, and implementations range from a cheap default
13//! ([`EmbeddingSynthesizer`], reuse the embedder already in hand) through a
14//! disable switch ([`PassthroughSynthesizer`]) to consumer-supplied or
15//! LLM-backed reconcilers. The default is cheap by deliberate choice for a
16//! library; an expensive reconciler is a valid alternative behind the same
17//! trait, not a thing ruled out.
18
19use std::future::Future;
20
21use crate::embedding::{EmbeddingError, EmbeddingModel};
22
23use super::cosine::cosine_similarity;
24use super::{Triple, TripleSet};
25
/// Corroboration floor: the lowest cosine similarity at which a semantic fact
/// is accepted as backing a triple.
///
/// [`EmbeddingSynthesizer`] starts from this value. A triple whose similarity
/// to every semantic fact stays below it is considered uncorroborated — most
/// likely a hallucinated edge — and is vetoed. The value is conservative on
/// purpose, following the same reasoning as `MIN_ENTITY_SIMILARITY` /
/// `MIN_CATEGORY_SCORE`: a spurious edge pollutes traversal more than a
/// missing one, so weakly-supported facts are dropped rather than admitted.
pub const MIN_CORROBORATION_SIMILARITY: f32 = 0.6;
34
/// One semantic fact offered to a [`Synthesizer`] as corroboration evidence.
///
/// Only the fact's text is carried — that text is the corroboration signal.
/// An implementation that works on vectors embeds the text itself (the
/// embedder is the implementation's concern, not the trait's), which keeps
/// this input plain and backend-agnostic.
///
/// `Eq` and `Hash` are derived alongside `PartialEq` so facts can be
/// deduplicated or used as set/map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SemanticFact {
    /// The fact's content sentence, as the extraction worker produced it.
    pub content: String,
}
45
46/// Reconciles extracted triples against semantic facts into a committable set.
47///
48/// The fifth trait seam of the knowledge-graph pipeline. Implementations decide
49/// which initial triples survive (corroborated) and may add triples the
50/// extractor missed; the result is what the commit path writes. Swapping one
51/// implementation for another (passthrough, embedding, a future LLM-backed or
52/// consumer impl) requires no caller change, which is what lets the pipeline be
53/// reconfigured and benchmarked.
54pub trait Synthesizer: Send + Sync + 'static {
55    /// Reconciles `triples` against `facts` into the set to commit.
56    ///
57    /// # Errors
58    ///
59    /// Returns [`SynthesisError`] when the implementation's own machinery fails
60    /// (e.g. embedding a triple or fact).
61    fn synthesize(
62        &self,
63        triples: TripleSet,
64        facts: &[SemanticFact],
65    ) -> impl Future<Output = Result<TripleSet, SynthesisError>> + Send;
66}
67
68/// Failure modes for [`Synthesizer`] implementations.
69#[derive(Debug, thiserror::Error)]
70pub enum SynthesisError {
71    /// Embedding a triple or fact during corroboration failed.
72    #[error("synthesis embedding failed: {0}")]
73    Embed(#[from] EmbeddingError),
74}
75
/// The "synthesis off" implementation: hands the extracted triples back
/// untouched.
///
/// No reconciliation takes place, so every initial triple is committed. Use it
/// as the disable switch, and as the benchmark floor an active reconciler is
/// measured against. It cannot fail and never calls a model.
#[derive(Debug, Default, Clone, Copy)]
pub struct PassthroughSynthesizer;
83
84impl PassthroughSynthesizer {
85    /// Creates a passthrough synthesizer.
86    pub fn new() -> Self {
87        Self
88    }
89}
90
91impl Synthesizer for PassthroughSynthesizer {
92    async fn synthesize(&self, triples: TripleSet, _facts: &[SemanticFact]) -> Result<TripleSet, SynthesisError> {
93        Ok(triples)
94    }
95}
96
/// Vetoes triples no semantic fact corroborates, by embedding similarity.
///
/// The cheap default: reuses the [`EmbeddingModel`] already configured on the
/// client (no extra model call class). Each triple is rendered to text and
/// embedded; it survives if its cosine similarity to some semantic fact's
/// embedding is at least the configured floor
/// ([`MIN_CORROBORATION_SIMILARITY`] unless overridden), and is vetoed
/// otherwise. This is the precision half of synthesis — the recall half
/// (deriving triples for facts no triple covered) is left to richer
/// implementations.
///
/// Generic over the embedder so tests inject a stub. `Debug` and `Clone` are
/// derived and therefore available whenever the embedder `E` provides them.
#[derive(Debug, Clone)]
pub struct EmbeddingSynthesizer<E> {
    /// Embedding backend used for both fact text and rendered triples.
    embedder: E,
    /// Cosine-similarity floor a triple must reach against at least one fact.
    min_similarity: f32,
}
112
113impl<E: EmbeddingModel> EmbeddingSynthesizer<E> {
114    /// Builds a synthesizer over `embedder` using the default corroboration floor.
115    pub fn new(embedder: E) -> Self {
116        Self {
117            embedder,
118            min_similarity: MIN_CORROBORATION_SIMILARITY,
119        }
120    }
121
122    /// Overrides the minimum corroboration similarity.
123    #[must_use]
124    pub fn with_min_similarity(mut self, min_similarity: f32) -> Self {
125        self.min_similarity = min_similarity;
126        self
127    }
128}
129
130impl<E: EmbeddingModel> Synthesizer for EmbeddingSynthesizer<E> {
131    async fn synthesize(&self, triples: TripleSet, facts: &[SemanticFact]) -> Result<TripleSet, SynthesisError> {
132        if facts.is_empty() {
133            return Ok(TripleSet::default());
134        }
135
136        let mut fact_embeddings = Vec::with_capacity(facts.len());
137        for fact in facts {
138            fact_embeddings.push(self.embedder.embed(&fact.content).await?);
139        }
140
141        let mut kept = Vec::new();
142        for triple in triples {
143            let rendered = render_triple(&triple);
144            let triple_embedding = self.embedder.embed(&rendered).await?;
145            let corroborated = fact_embeddings
146                .iter()
147                .filter_map(|fact| cosine_similarity(&triple_embedding, fact))
148                .any(|score| score >= self.min_similarity);
149            if corroborated {
150                kept.push(triple);
151            }
152        }
153
154        Ok(kept.into_iter().collect())
155    }
156}
157
158/// Renders a triple to the text embedded for corroboration.
159fn render_triple(triple: &Triple) -> String {
160    format!("{} {} {}", triple.subject, triple.relation, triple.object)
161}
162
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a triple with a fixed confidence of 0.9.
    fn triple(subject: &str, relation: &str, object: &str) -> Triple {
        Triple {
            subject: subject.to_string(),
            relation: relation.to_string(),
            object: object.to_string(),
            confidence: 0.9,
        }
    }

    /// Collects triples into a `TripleSet`.
    fn triples(items: Vec<Triple>) -> TripleSet {
        items.into_iter().collect()
    }

    /// Builds a semantic fact from its content sentence.
    fn fact(content: &str) -> SemanticFact {
        SemanticFact {
            content: content.to_string(),
        }
    }

    /// Keyword-driven stub embedder producing three mutually orthogonal unit
    /// vectors: text containing "Acme" maps to the first axis, otherwise text
    /// containing "Globex" maps to the second, and anything else to the third.
    /// A triple therefore "matches" a fact iff both land on the same axis.
    struct FakeEmbedding;

    impl EmbeddingModel for FakeEmbedding {
        async fn embed(&self, text: &str) -> Result<Vec<f32>, EmbeddingError> {
            let vector = if text.contains("Acme") {
                vec![1.0, 0.0, 0.0]
            } else if text.contains("Globex") {
                vec![0.0, 1.0, 0.0]
            } else {
                vec![0.0, 0.0, 1.0]
            };
            Ok(vector)
        }

        fn dimensions(&self) -> usize {
            3
        }
    }

    #[tokio::test(flavor = "current_thread")]
    async fn should_pass_all_triples_through_passthrough() {
        let synth = PassthroughSynthesizer::new();
        let input = triples(vec![triple("Alice", "works at", "Acme"), triple("Bob", "likes", "tea")]);

        // `input` is not used again, so it is moved rather than cloned.
        let out = synth.synthesize(input, &[]).await.unwrap();

        assert_eq!(out.len(), 2);
    }

    #[tokio::test(flavor = "current_thread")]
    async fn should_keep_corroborated_triple() {
        let synth = EmbeddingSynthesizer::new(FakeEmbedding);
        let input = triples(vec![triple("Alice", "works at", "Acme")]);

        let out = synth.synthesize(input, &[fact("Alice works at Acme Corp")]).await.unwrap();

        assert_eq!(out.len(), 1);
        assert_eq!(out[0].object, "Acme");
    }

    #[tokio::test(flavor = "current_thread")]
    async fn should_veto_uncorroborated_triple() {
        // The triple says Globex; the only fact is about Acme -> orthogonal -> vetoed.
        let synth = EmbeddingSynthesizer::new(FakeEmbedding);
        let input = triples(vec![triple("Alice", "works at", "Globex")]);

        let out = synth.synthesize(input, &[fact("Alice works at Acme Corp")]).await.unwrap();

        assert!(out.is_empty());
    }

    #[tokio::test(flavor = "current_thread")]
    async fn should_veto_everything_when_no_facts() {
        // No semantic facts means nothing corroborates -> all triples vetoed.
        let synth = EmbeddingSynthesizer::new(FakeEmbedding);
        let input = triples(vec![triple("Alice", "works at", "Acme")]);

        let out = synth.synthesize(input, &[]).await.unwrap();

        assert!(out.is_empty());
    }

    #[tokio::test(flavor = "current_thread")]
    async fn should_return_empty_when_no_triples() {
        // Nothing to reconcile -> nothing to commit, whatever the facts say.
        let synth = EmbeddingSynthesizer::new(FakeEmbedding);

        let out = synth.synthesize(triples(vec![]), &[fact("Alice works at Acme")]).await.unwrap();

        assert!(out.is_empty());
    }

    #[tokio::test(flavor = "current_thread")]
    async fn should_keep_only_corroborated_among_mixed() {
        let synth = EmbeddingSynthesizer::new(FakeEmbedding);
        let input = triples(vec![
            triple("Alice", "works at", "Acme"),
            triple("Alice", "works at", "Globex"),
        ]);

        let out = synth.synthesize(input, &[fact("Alice works at Acme")]).await.unwrap();

        assert_eq!(out.len(), 1);
        assert_eq!(out[0].object, "Acme");
    }

    #[tokio::test(flavor = "current_thread")]
    async fn should_veto_match_when_floor_is_raised_out_of_reach() {
        // A cosine similarity cannot exceed 1.0, so a floor of 1.5 vetoes even
        // a triple whose embedding is identical to the fact's.
        let synth = EmbeddingSynthesizer::new(FakeEmbedding).with_min_similarity(1.5);
        let input = triples(vec![triple("Alice", "works at", "Acme")]);

        let out = synth.synthesize(input, &[fact("Alice works at Acme")]).await.unwrap();

        assert!(out.is_empty());
    }

    #[test]
    fn should_render_triple_as_subject_relation_object() {
        assert_eq!(render_triple(&triple("Alice", "works at", "Acme")), "Alice works at Acme");
    }
}