Skip to main content

zeph_memory/semantic/
mod.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! High-level semantic memory orchestrator.
5//!
6//! [`SemanticMemory`] is the primary entry point used by `zeph-core`.  It wires
7//! together [`crate::store::SqliteStore`] (relational persistence) and
8//! [`crate::embedding_store::EmbeddingStore`] (Qdrant vector index) into a single
9//! object with `remember` / `recall` / `summarize` operations.
10//!
11//! # Construction
12//!
13//! Use [`SemanticMemory::new`] for the default 0.7/0.3 vector/keyword weights, or
14//! [`SemanticMemory::with_qdrant_ops`] inside `AppBuilder` to share a single gRPC
15//! channel across all subsystems.
16//!
17//! # Hybrid recall
18//!
19//! Recall uses reciprocal-rank fusion of BM25 (`SQLite` FTS5) and cosine-similarity
20//! (Qdrant) results, with optional temporal decay, MMR diversity reranking, and
21//! per-tier score boosts.
22
23mod algorithms;
24mod corrections;
25mod cross_session;
26mod graph;
27pub(crate) mod importance;
28pub mod persona;
29mod recall;
30mod summarization;
31pub mod trajectory;
32pub mod tree_consolidation;
33pub(crate) mod write_buffer;
34
35#[cfg(test)]
36mod tests;
37
38use std::sync::Arc;
39use std::sync::Mutex;
40use std::sync::atomic::AtomicU64;
41use std::time::Instant;
42
43use tokio::sync::RwLock;
44use zeph_llm::any::AnyProvider;
45use zeph_llm::provider::LlmProvider as _;
46
47use crate::admission::AdmissionControl;
48use crate::embedding_store::EmbeddingStore;
49use crate::error::MemoryError;
50use crate::store::SqliteStore;
51use crate::token_counter::TokenCounter;
52
/// Qdrant collection holding per-session summary embeddings.
pub(crate) const SESSION_SUMMARIES_COLLECTION: &str = "zeph_session_summaries";
/// Qdrant collection holding key-fact embeddings; inserts are deduplicated
/// against it via `key_facts_dedup_threshold`.
pub(crate) const KEY_FACTS_COLLECTION: &str = "zeph_key_facts";
/// Qdrant collection for the corrections subsystem (see the `corrections` module).
pub(crate) const CORRECTIONS_COLLECTION: &str = "zeph_corrections";
56
/// Progress state for embed backfill.
///
/// `Copy` snapshot reported while backfilling embeddings; `done` counts every
/// processed message, successful or not, so `done == total` marks completion.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BackfillProgress {
    /// Number of messages processed so far (including failures).
    pub done: usize,
    /// Total number of unembedded messages at backfill start.
    pub total: usize,
}
65
66pub use algorithms::{apply_mmr, apply_temporal_decay};
67pub use cross_session::SessionSummaryResult;
68pub use graph::{
69    ExtractionResult, ExtractionStats, GraphExtractionConfig, LinkingStats, NoteLinkingConfig,
70    PostExtractValidator, extract_and_store, link_memory_notes,
71};
72pub use persona::{
73    PersonaExtractionConfig, contains_self_referential_language, extract_persona_facts,
74};
75pub use recall::{EmbedContext, RecalledMessage};
76pub use summarization::{StructuredSummary, Summary, build_summarization_prompt};
77pub use trajectory::{TrajectoryEntry, TrajectoryExtractionConfig, extract_trajectory_entries};
78pub use tree_consolidation::{
79    TreeConsolidationConfig, TreeConsolidationResult, run_tree_consolidation_sweep,
80    start_tree_consolidation_loop,
81};
82pub use write_buffer::{BufferedWrite, WriteBuffer};
83
/// Cached profile centroid for query-bias correction (MM-F3, #3341).
///
/// Stored inside `SemanticMemory::profile_centroid` under an `RwLock`. Expires after
/// `profile_centroid_ttl_secs` seconds; a miss is non-sticky (next call retries).
#[derive(Debug, Clone)]
pub(crate) struct CachedCentroid {
    /// The centroid vector (unweighted mean of persona-fact embeddings).
    pub vector: Vec<f32>,
    /// Wall-clock instant when this centroid was computed; freshness is judged
    /// against `profile_centroid_ttl_secs` via `elapsed()`.
    pub computed_at: Instant,
}
95
/// Classification of a user query's self-referential intent (MM-F3, #3341).
///
/// Used to decide whether query-bias correction should shift the embedding
/// towards the user's profile centroid. Produced by
/// `SemanticMemory::classify_query_intent`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum QueryIntent {
    /// Query contains first-person language — likely about the user themselves.
    FirstPerson,
    /// Query is about an external topic; no bias shift applied.
    Other,
}
107
/// HL-F5 runtime wiring for spreading activation (mirror of `[memory.hebbian]` spread fields).
///
/// Built from config at bootstrap and attached via [`SemanticMemory::with_hebbian_spread`].
/// `Default` yields a disabled runtime (`enabled = false`).
#[derive(Debug, Clone, Default)]
pub struct HelaSpreadRuntime {
    /// `true` when `[memory.hebbian] enabled = true` AND `spreading_activation = true`.
    pub enabled: bool,
    /// BFS hops, already clamped to `[1, 6]` by the caller.
    pub depth: u32,
    /// Soft upper bound on the visited-node set.
    pub max_visited: usize,
    /// MAGMA edge-type filter for BFS traversal.
    pub edge_types: Vec<crate::graph::EdgeType>,
    /// Per-step circuit-breaker duration; `None` disables the budget.
    pub step_budget: Option<std::time::Duration>,
}
124
/// High-level semantic memory orchestrator combining `SQLite` and Qdrant.
///
/// Instantiate via [`SemanticMemory::new`] or the `AppBuilder` integration.
/// All fields are `pub(crate)` — callers interact through the inherent method API.
// TODO(review): Refactor the five bool flags into two-variant enums to satisfy
// clippy::struct_excessive_bools. Left for a follow-up to avoid scope creep.
#[allow(clippy::struct_excessive_bools)]
pub struct SemanticMemory {
    /// Relational persistence layer (`SQLite`); also the source of the persona
    /// facts used for the profile centroid.
    pub(crate) sqlite: SqliteStore,
    /// Vector index. `None` when Qdrant was unreachable at construction, in
    /// which case semantic (vector) search is disabled.
    pub(crate) qdrant: Option<Arc<EmbeddingStore>>,
    /// Primary LLM provider; also used for embeddings when `embed_provider` is `None`.
    pub(crate) provider: AnyProvider,
    /// Dedicated provider for batch embedding calls (backfill, write-path embedding).
    ///
    /// When `Some`, all embedding I/O is routed through this provider instead of `provider`.
    /// This prevents `embed_backfill` from saturating the main provider and causing guardrail
    /// timeouts. When `None`, falls back to `provider`.
    pub(crate) embed_provider: Option<AnyProvider>,
    /// Embedding model identifier supplied at construction.
    pub(crate) embedding_model: String,
    /// Hybrid-recall fusion weight for vector (cosine) results. Default: `0.7`.
    pub(crate) vector_weight: f64,
    /// Hybrid-recall fusion weight for keyword (BM25) results. Default: `0.3`.
    pub(crate) keyword_weight: f64,
    /// Enables temporal decay during recall ranking. Default: `false`.
    pub(crate) temporal_decay_enabled: bool,
    /// Half-life in days for temporal decay. Default: `30`.
    pub(crate) temporal_decay_half_life_days: u32,
    /// Enables MMR diversity re-ranking of recall results. Default: `false`.
    pub(crate) mmr_enabled: bool,
    /// MMR relevance/diversity trade-off parameter. Default: `0.7`.
    pub(crate) mmr_lambda: f32,
    /// Enables write-time importance scoring. Default: `false`.
    pub(crate) importance_enabled: bool,
    /// Weight of the importance score in recall ranking. Default: `0.15`.
    pub(crate) importance_weight: f64,
    /// Multiplicative score boost for semantic-tier messages in recall ranking.
    /// Default: `1.3`. Disabled when set to `1.0`.
    pub(crate) tier_boost_semantic: f64,
    /// Shared token counter (public so other subsystems can reuse it).
    pub token_counter: Arc<TokenCounter>,
    /// Optional graph store for graph-aware retrieval; see
    /// [`SemanticMemory::with_graph_store`].
    pub graph_store: Option<Arc<crate::graph::GraphStore>>,
    /// Experience store for tool-outcome telemetry and per-turn evolution sweeps.
    ///
    /// `Some` when `memory.graph.experience.enabled = true` at bootstrap.
    pub experience: Option<Arc<crate::graph::experience::ExperienceStore>>,
    /// `ReasoningBank` store for distilled reasoning strategies (#3342).
    ///
    /// `Some` when `memory.reasoning.enabled = true` at bootstrap.
    pub reasoning: Option<Arc<crate::reasoning::ReasoningMemory>>,
    /// Cumulative community-detection failure count; read via
    /// [`SemanticMemory::community_detection_failures`].
    pub(crate) community_detection_failures: Arc<AtomicU64>,
    /// Cumulative successful graph extractions; read via
    /// [`SemanticMemory::graph_extraction_count`].
    pub(crate) graph_extraction_count: Arc<AtomicU64>,
    /// Cumulative failed graph extractions; read via
    /// [`SemanticMemory::graph_extraction_failures`].
    pub(crate) graph_extraction_failures: Arc<AtomicU64>,
    // NOTE(review): presumably rate-limits the "Qdrant unavailable" warning —
    // confirm the stored unit (epoch seconds?) against the call sites.
    pub(crate) last_qdrant_warn: Arc<AtomicU64>,
    /// A-MAC admission control gate. When `Some`, each `remember()` call is evaluated.
    pub(crate) admission_control: Option<Arc<AdmissionControl>>,
    /// Write quality gate. When `Some`, evaluated in `remember()`/`remember_with_parts()`
    /// after A-MAC admission and before persistence.
    pub(crate) quality_gate: Option<Arc<crate::quality_gate::QualityGate>>,
    /// Cosine similarity threshold for skipping near-duplicate key facts (0.0–1.0).
    /// When a new fact's nearest neighbour in `zeph_key_facts` has score >= this value,
    /// the fact is considered a duplicate and not inserted.  Default: `0.95`.
    pub(crate) key_facts_dedup_threshold: f32,
    /// Bounded set of in-flight background embed tasks.
    ///
    /// Guarded by a `Mutex` because `SemanticMemory` is shared via `Arc` and
    /// `JoinSet` requires `&mut self` for `spawn`. Capacity is capped at
    /// `MAX_EMBED_BG_TASKS`; tasks that exceed the limit are dropped with a debug log.
    pub(crate) embed_tasks: Mutex<tokio::task::JoinSet<()>>,
    /// ANN candidate count fetched from the vector store before reranking (MM-F1, #3340).
    ///
    /// `0` = legacy behavior (`recall_limit * 2`). `≥ 1` = direct count.
    pub(crate) retrieval_depth: u32,
    /// Template applied to raw user queries before embedding (MM-F2, #3340).
    ///
    /// Empty string = identity (pass raw query through). Applied at query-side embed sites only;
    /// never applied to stored content (summaries, documents).
    pub(crate) search_prompt_template: String,
    /// Fires `tracing::warn!` once per instance when `retrieval_depth < recall_limit`.
    pub(crate) depth_below_limit_warned: Arc<std::sync::atomic::AtomicBool>,
    /// Fires `tracing::warn!` once per instance when `search_prompt_template` has no `{query}`.
    pub(crate) missing_placeholder_warned: Arc<std::sync::atomic::AtomicBool>,
    /// Enable query-bias correction towards the user profile centroid (MM-F3, #3341).
    pub(crate) query_bias_correction: bool,
    /// Blend weight for query-bias correction (MM-F3, #3341). Clamped to `[0.0, 1.0]`.
    pub(crate) query_bias_profile_weight: f32,
    /// Cached profile centroid computed from persona-fact embeddings (MM-F3, #3341).
    ///
    /// Protected by `RwLock` to allow concurrent reads. Never holds the lock across `.await`
    /// (await-discipline rule #4). TTL-bounded; miss is non-sticky.
    pub(crate) profile_centroid: RwLock<Option<CachedCentroid>>,
    /// Time-to-live for the profile centroid cache in seconds (MM-F3, #3341). Default: 300.
    pub(crate) profile_centroid_ttl_secs: u64,
    /// Opt-in master switch for Hebbian edge-weight reinforcement (HL-F2, #3344).
    pub(crate) hebbian_enabled: bool,
    /// Weight increment applied per recall traversal when `hebbian_enabled = true` (HL-F2, #3344).
    pub(crate) hebbian_lr: f32,
    /// HL-F5 spreading activation runtime config (#3346).
    pub(crate) hebbian_spread: HelaSpreadRuntime,
}
214
215impl SemanticMemory {
216    /// Create a new `SemanticMemory` instance with default hybrid search weights (0.7/0.3).
217    ///
218    /// Qdrant connection is best-effort: if unavailable, semantic search is disabled.
219    ///
220    /// For `AppBuilder` bootstrap, prefer [`SemanticMemory::with_qdrant_ops`] to share
221    /// a single gRPC channel across all subsystems.
222    ///
223    /// # Errors
224    ///
225    /// Returns an error if `SQLite` cannot be initialized.
226    pub async fn new(
227        sqlite_path: &str,
228        qdrant_url: &str,
229        provider: AnyProvider,
230        embedding_model: &str,
231    ) -> Result<Self, MemoryError> {
232        Self::with_weights(sqlite_path, qdrant_url, provider, embedding_model, 0.7, 0.3).await
233    }
234
235    /// Create a new `SemanticMemory` with custom vector/keyword weights for hybrid search.
236    ///
237    /// For `AppBuilder` bootstrap, prefer [`SemanticMemory::with_qdrant_ops`] to share
238    /// a single gRPC channel across all subsystems.
239    ///
240    /// # Errors
241    ///
242    /// Returns an error if `SQLite` cannot be initialized.
243    pub async fn with_weights(
244        sqlite_path: &str,
245        qdrant_url: &str,
246        provider: AnyProvider,
247        embedding_model: &str,
248        vector_weight: f64,
249        keyword_weight: f64,
250    ) -> Result<Self, MemoryError> {
251        Self::with_weights_and_pool_size(
252            sqlite_path,
253            qdrant_url,
254            provider,
255            embedding_model,
256            vector_weight,
257            keyword_weight,
258            5,
259        )
260        .await
261    }
262
263    /// Create a new `SemanticMemory` with custom weights and configurable pool size.
264    ///
265    /// For `AppBuilder` bootstrap, prefer [`SemanticMemory::with_qdrant_ops`] to share
266    /// a single gRPC channel across all subsystems.
267    ///
268    /// # Errors
269    ///
270    /// Returns an error if `SQLite` cannot be initialized.
271    pub async fn with_weights_and_pool_size(
272        sqlite_path: &str,
273        qdrant_url: &str,
274        provider: AnyProvider,
275        embedding_model: &str,
276        vector_weight: f64,
277        keyword_weight: f64,
278        pool_size: u32,
279    ) -> Result<Self, MemoryError> {
280        let sqlite = SqliteStore::with_pool_size(sqlite_path, pool_size).await?;
281        let pool = sqlite.pool().clone();
282
283        let qdrant = match EmbeddingStore::new(qdrant_url, pool) {
284            Ok(store) => Some(Arc::new(store)),
285            Err(e) => {
286                tracing::warn!("Qdrant unavailable, semantic search disabled: {e:#}");
287                None
288            }
289        };
290
291        Ok(Self {
292            sqlite,
293            qdrant,
294            provider,
295            embed_provider: None,
296            embedding_model: embedding_model.into(),
297            vector_weight,
298            keyword_weight,
299            temporal_decay_enabled: false,
300            temporal_decay_half_life_days: 30,
301            mmr_enabled: false,
302            mmr_lambda: 0.7,
303            importance_enabled: false,
304            importance_weight: 0.15,
305            tier_boost_semantic: 1.3,
306            token_counter: Arc::new(TokenCounter::new()),
307            graph_store: None,
308            experience: None,
309            reasoning: None,
310            community_detection_failures: Arc::new(AtomicU64::new(0)),
311            graph_extraction_count: Arc::new(AtomicU64::new(0)),
312            graph_extraction_failures: Arc::new(AtomicU64::new(0)),
313            last_qdrant_warn: Arc::new(AtomicU64::new(0)),
314            admission_control: None,
315            quality_gate: None,
316            key_facts_dedup_threshold: 0.95,
317            embed_tasks: std::sync::Mutex::new(tokio::task::JoinSet::new()),
318            retrieval_depth: 0,
319            search_prompt_template: String::new(),
320            depth_below_limit_warned: Arc::new(std::sync::atomic::AtomicBool::new(false)),
321            missing_placeholder_warned: Arc::new(std::sync::atomic::AtomicBool::new(false)),
322            query_bias_correction: true,
323            query_bias_profile_weight: 0.25,
324            profile_centroid: RwLock::new(None),
325            profile_centroid_ttl_secs: 300,
326            hebbian_enabled: false,
327            hebbian_lr: 0.1,
328            hebbian_spread: HelaSpreadRuntime::default(),
329        })
330    }
331
332    /// Create a `SemanticMemory` from a pre-built `QdrantOps` instance.
333    ///
334    /// Use this at bootstrap to share one `QdrantOps` (and thus one gRPC channel)
335    /// across all subsystems. The `ops` is consumed and wrapped inside `EmbeddingStore`.
336    ///
337    /// # Errors
338    ///
339    /// Returns an error if `SQLite` cannot be initialized.
340    pub async fn with_qdrant_ops(
341        sqlite_path: &str,
342        ops: crate::QdrantOps,
343        provider: AnyProvider,
344        embedding_model: &str,
345        vector_weight: f64,
346        keyword_weight: f64,
347        pool_size: u32,
348    ) -> Result<Self, MemoryError> {
349        let sqlite = SqliteStore::with_pool_size(sqlite_path, pool_size).await?;
350        let pool = sqlite.pool().clone();
351        let store = EmbeddingStore::with_store(Box::new(ops), pool);
352
353        Ok(Self {
354            sqlite,
355            qdrant: Some(Arc::new(store)),
356            provider,
357            embed_provider: None,
358            embedding_model: embedding_model.into(),
359            vector_weight,
360            keyword_weight,
361            temporal_decay_enabled: false,
362            temporal_decay_half_life_days: 30,
363            mmr_enabled: false,
364            mmr_lambda: 0.7,
365            importance_enabled: false,
366            importance_weight: 0.15,
367            tier_boost_semantic: 1.3,
368            token_counter: Arc::new(TokenCounter::new()),
369            graph_store: None,
370            experience: None,
371            reasoning: None,
372            community_detection_failures: Arc::new(AtomicU64::new(0)),
373            graph_extraction_count: Arc::new(AtomicU64::new(0)),
374            graph_extraction_failures: Arc::new(AtomicU64::new(0)),
375            last_qdrant_warn: Arc::new(AtomicU64::new(0)),
376            admission_control: None,
377            quality_gate: None,
378            key_facts_dedup_threshold: 0.95,
379            embed_tasks: std::sync::Mutex::new(tokio::task::JoinSet::new()),
380            retrieval_depth: 0,
381            search_prompt_template: String::new(),
382            depth_below_limit_warned: Arc::new(std::sync::atomic::AtomicBool::new(false)),
383            missing_placeholder_warned: Arc::new(std::sync::atomic::AtomicBool::new(false)),
384            query_bias_correction: true,
385            query_bias_profile_weight: 0.25,
386            profile_centroid: RwLock::new(None),
387            profile_centroid_ttl_secs: 300,
388            hebbian_enabled: false,
389            hebbian_lr: 0.1,
390            hebbian_spread: HelaSpreadRuntime::default(),
391        })
392    }
393
394    /// Attach a `GraphStore` for graph-aware retrieval.
395    ///
396    /// When set, `recall_graph` traverses the graph starting from entities
397    /// matched by the query.
398    #[must_use]
399    pub fn with_graph_store(mut self, store: Arc<crate::graph::GraphStore>) -> Self {
400        self.graph_store = Some(store);
401        self
402    }
403
404    /// Attach an [`ExperienceStore`](crate::graph::experience::ExperienceStore) for tool-outcome
405    /// telemetry and per-turn evolution sweeps.
406    ///
407    /// When set, the agent records one row per tool invocation in `experience_nodes` and
408    /// periodically runs `evolution_sweep` to prune low-confidence and self-loop edges.
409    #[must_use]
410    pub fn with_experience_store(
411        mut self,
412        store: Arc<crate::graph::experience::ExperienceStore>,
413    ) -> Self {
414        self.experience = Some(store);
415        self
416    }
417
418    /// Attach a [`ReasoningMemory`](crate::reasoning::ReasoningMemory) store for
419    /// distilled reasoning strategy storage and retrieval (#3342).
420    ///
421    /// When set, [`SemanticMemory::retrieve_reasoning_strategies`] uses this store for
422    /// embedding-similarity lookups. When `None`, retrieval returns an empty vec.
423    #[must_use]
424    pub fn with_reasoning(mut self, store: Arc<crate::reasoning::ReasoningMemory>) -> Self {
425        self.reasoning = Some(store);
426        self
427    }
428
429    /// Returns the cumulative count of community detection failures since startup.
430    #[must_use]
431    pub fn community_detection_failures(&self) -> u64 {
432        use std::sync::atomic::Ordering;
433        self.community_detection_failures.load(Ordering::Relaxed)
434    }
435
436    /// Returns the cumulative count of successful graph extractions since startup.
437    #[must_use]
438    pub fn graph_extraction_count(&self) -> u64 {
439        use std::sync::atomic::Ordering;
440        self.graph_extraction_count.load(Ordering::Relaxed)
441    }
442
443    /// Returns the cumulative count of failed graph extractions since startup.
444    #[must_use]
445    pub fn graph_extraction_failures(&self) -> u64 {
446        use std::sync::atomic::Ordering;
447        self.graph_extraction_failures.load(Ordering::Relaxed)
448    }
449
450    /// Configure temporal decay and MMR re-ranking options.
451    #[must_use]
452    pub fn with_ranking_options(
453        mut self,
454        temporal_decay_enabled: bool,
455        temporal_decay_half_life_days: u32,
456        mmr_enabled: bool,
457        mmr_lambda: f32,
458    ) -> Self {
459        self.temporal_decay_enabled = temporal_decay_enabled;
460        self.temporal_decay_half_life_days = temporal_decay_half_life_days;
461        self.mmr_enabled = mmr_enabled;
462        self.mmr_lambda = mmr_lambda;
463        self
464    }
465
466    /// Configure write-time importance scoring for memory retrieval.
467    #[must_use]
468    pub fn with_importance_options(mut self, enabled: bool, weight: f64) -> Self {
469        self.importance_enabled = enabled;
470        self.importance_weight = weight;
471        self
472    }
473
474    /// Configure the multiplicative score boost applied to semantic-tier messages during recall.
475    ///
476    /// Set to `1.0` to disable the boost. Default: `1.3`.
477    #[must_use]
478    pub fn with_tier_boost(mut self, boost: f64) -> Self {
479        self.tier_boost_semantic = boost;
480        self
481    }
482
483    /// Attach an A-MAC admission controller.
484    ///
485    /// When set, `remember()` and `remember_with_parts()` evaluate each message before persisting.
486    /// Messages below the admission threshold return `Ok(None)` without incrementing counts.
487    #[must_use]
488    pub fn with_admission_control(mut self, control: AdmissionControl) -> Self {
489        self.admission_control = Some(Arc::new(control));
490        self
491    }
492
493    /// Attach a write quality gate that scores each `remember()` call before persisting.
494    ///
495    /// When set, the gate is evaluated after A-MAC admission. A `Some(reason)` result from
496    /// [`crate::quality_gate::QualityGate::evaluate`] causes the write to be skipped
497    /// and `Ok(None)` / `Ok((None, false))` to be returned.
498    #[must_use]
499    pub fn with_quality_gate(mut self, gate: Arc<crate::quality_gate::QualityGate>) -> Self {
500        self.quality_gate = Some(gate);
501        self
502    }
503
504    /// Set the cosine similarity threshold used to skip near-duplicate key facts on insert.
505    ///
506    /// When a candidate fact's nearest neighbour in `zeph_key_facts` has a score ≥ this value,
507    /// the fact is not stored.  Default: `0.95`.
508    #[must_use]
509    pub fn with_key_facts_dedup_threshold(mut self, threshold: f32) -> Self {
510        self.key_facts_dedup_threshold = threshold;
511        self
512    }
513
514    /// Configure query-bias correction (MM-F3, #3341).
515    ///
516    /// When `enabled` is `true`, first-person queries are biased towards the user profile centroid.
517    /// `profile_weight` controls the blend strength and is clamped to `[0.0, 1.0]`.
518    /// `centroid_ttl_secs` controls how long the centroid cache stays valid.
519    #[must_use]
520    pub fn with_query_bias(
521        mut self,
522        enabled: bool,
523        profile_weight: f32,
524        centroid_ttl_secs: u64,
525    ) -> Self {
526        self.query_bias_correction = enabled;
527        self.query_bias_profile_weight = profile_weight.clamp(0.0, 1.0);
528        self.profile_centroid_ttl_secs = centroid_ttl_secs;
529        self
530    }
531
532    /// Configure HL-F5 spreading activation runtime parameters (HL-F5, #3346).
533    ///
534    /// Has no effect when `hebbian_spread.enabled = false` (the default).
535    /// Call this after `with_graph_store` and `with_hebbian` during bootstrap.
536    #[must_use]
537    pub fn with_hebbian_spread(mut self, runtime: HelaSpreadRuntime) -> Self {
538        self.hebbian_spread = runtime;
539        self
540    }
541
542    /// Configure Hebbian edge-weight reinforcement (HL-F2, #3344).
543    ///
544    /// When `enabled` is `true`, `lr` is added to the `weight` column of each traversed
545    /// edge after every recall. `lr = 0.0` with `enabled = true` logs a WARN.
546    #[must_use]
547    pub fn with_hebbian(mut self, enabled: bool, lr: f32) -> Self {
548        let lr = lr.max(0.0);
549        if enabled && lr == 0.0 {
550            tracing::warn!("hebbian enabled with lr=0.0 — no reinforcement will occur");
551        }
552        self.hebbian_enabled = enabled;
553        self.hebbian_lr = lr;
554        self
555    }
556
557    /// Classify a query's intent for query-bias correction (MM-F3, #3341).
558    ///
559    /// Returns [`QueryIntent::FirstPerson`] when the query contains self-referential language
560    /// (first-person pronouns). Otherwise returns [`QueryIntent::Other`].
561    pub(crate) fn classify_query_intent(query: &str) -> QueryIntent {
562        if persona::contains_self_referential_language(query) {
563            QueryIntent::FirstPerson
564        } else {
565            QueryIntent::Other
566        }
567    }
568
569    /// Apply query-bias correction to an embedding (MM-F3, #3341).
570    ///
571    /// Returns the embedding unchanged if `query_bias_correction` is `false`,
572    /// if the query is not first-person, or if the profile centroid is unavailable.
573    /// Logs a single WARN on dimension mismatch and returns the original embedding.
574    #[tracing::instrument(name = "memory.query_bias.apply", skip(self, embedding), fields(query_len = query.len()))]
575    pub(crate) async fn apply_query_bias(&self, query: &str, embedding: Vec<f32>) -> Vec<f32> {
576        if !self.query_bias_correction {
577            tracing::debug!(reason = "disabled", "query-bias: skipping");
578            return embedding;
579        }
580        if Self::classify_query_intent(query) != QueryIntent::FirstPerson {
581            tracing::debug!(reason = "not_first_person", "query-bias: skipping");
582            return embedding;
583        }
584        let Some(centroid) = self.profile_centroid_cached().await else {
585            tracing::debug!(reason = "no_centroid", "query-bias: skipping");
586            return embedding;
587        };
588        if centroid.len() != embedding.len() {
589            tracing::warn!(
590                centroid_dim = centroid.len(),
591                query_dim = embedding.len(),
592                reason = "dim_mismatch",
593                "query-bias: dimension mismatch between profile centroid and query embedding — skipping bias"
594            );
595            return embedding;
596        }
597        let w = self.query_bias_profile_weight;
598        tracing::debug!(
599            intent = "first_person",
600            centroid_dim = centroid.len(),
601            weight = w,
602            "query-bias: applying profile bias"
603        );
604        embedding
605            .iter()
606            .zip(centroid.iter())
607            .map(|(&q, &c)| (1.0 - w) * q + w * c)
608            .collect()
609    }
610
    /// Return the cached profile centroid, recomputing if stale or absent (MM-F3, #3341).
    ///
    /// Holds the read lock only to check freshness; releases it before any `.await`.
    /// On compute failure, preserves the previous cache value (non-sticky miss).
    ///
    /// NOTE(review): concurrent callers that all miss the TTL will each recompute
    /// (no single-flight guard); the last writer wins. Presumably acceptable at
    /// current call rates — confirm if centroid computation becomes expensive.
    #[tracing::instrument(name = "memory.query_bias.centroid", skip(self))]
    pub(crate) async fn profile_centroid_cached(&self) -> Option<Vec<f32>> {
        // Fast path: check freshness under read lock without holding it across await.
        {
            let guard = self.profile_centroid.read().await;
            if let Some(c) = &*guard
                && c.computed_at.elapsed().as_secs() < self.profile_centroid_ttl_secs
            {
                let ttl_remaining = self
                    .profile_centroid_ttl_secs
                    .saturating_sub(c.computed_at.elapsed().as_secs());
                tracing::debug!(
                    centroid_dim = c.vector.len(),
                    ttl_remaining_secs = ttl_remaining,
                    "query-bias: centroid cache hit"
                );
                // Clone so the lock is released before the caller uses the vector.
                return Some(c.vector.clone());
            }
        }
        // Slow path: recompute. Guard is dropped before this point.
        let computed = self.compute_profile_centroid().await;
        let mut guard = self.profile_centroid.write().await;
        match computed {
            Some(v) => {
                tracing::debug!(centroid_dim = v.len(), "query-bias: centroid computed");
                *guard = Some(CachedCentroid {
                    vector: v.clone(),
                    computed_at: Instant::now(),
                });
                Some(v)
            }
            None => {
                // Do not overwrite a valid (but stale) cache on failure — serve stale over nothing.
                guard.as_ref().map(|c| c.vector.clone())
            }
        }
    }
652
653    /// Compute the profile centroid from persona-fact embeddings (MM-F3, #3341).
654    ///
655    /// Returns `None` when the persona table is empty or embedding fails.
656    /// Uses `load_persona_facts(0.0)` (all non-superseded facts) for the centroid basis.
657    async fn compute_profile_centroid(&self) -> Option<Vec<f32>> {
658        let facts = match self.sqlite.load_persona_facts(0.0).await {
659            Ok(f) => f,
660            Err(e) => {
661                tracing::warn!(error = %e, "query-bias: failed to load persona facts");
662                return None;
663            }
664        };
665        if facts.is_empty() {
666            return None;
667        }
668        let provider = self.effective_embed_provider();
669        let texts: Vec<String> = facts.iter().map(|f| f.content.clone()).collect();
670        let mut embeddings: Vec<Vec<f32>> = Vec::with_capacity(texts.len());
671        for text in &texts {
672            match provider.embed(text).await {
673                Ok(v) => embeddings.push(v),
674                Err(e) => {
675                    tracing::warn!(error = %e, "query-bias: failed to embed persona fact — skipping");
676                }
677            }
678        }
679        if embeddings.is_empty() {
680            return None;
681        }
682        let dim = embeddings[0].len();
683        let mut centroid = vec![0.0f32; dim];
684        for emb in &embeddings {
685            if emb.len() != dim {
686                tracing::warn!(
687                    expected = dim,
688                    got = emb.len(),
689                    "query-bias: persona embedding dimension mismatch — skipping fact"
690                );
691                continue;
692            }
693            for (c, &v) in centroid.iter_mut().zip(emb.iter()) {
694                *c += v;
695            }
696        }
697        #[allow(clippy::cast_precision_loss)]
698        let n = embeddings.len() as f32;
699        for c in &mut centroid {
700            *c /= n;
701        }
702        Some(centroid)
703    }
704
705    /// Configure retrieval depth and search prompt template (MM-F1/F2, #3340).
706    ///
707    /// `depth` is the number of ANN candidates fetched from the vector store before keyword merge
708    /// and MMR re-ranking.  `0` = legacy behavior (`recall_limit * 2`).  `≥ 1` = exact count.
709    ///
710    /// `search_prompt_template` is applied to the raw user query before embedding.  Supports a
711    /// single `{query}` placeholder.  Empty string = identity.
712    #[must_use]
713    pub fn with_retrieval_options(
714        mut self,
715        depth: u32,
716        search_prompt_template: impl Into<String>,
717    ) -> Self {
718        self.retrieval_depth = depth;
719        self.search_prompt_template = search_prompt_template.into();
720        self
721    }
722
723    /// Effective ANN candidate count for a given requested final limit (MM-F1, #3340).
724    ///
725    /// - `retrieval_depth == 0`: legacy behavior, returns `limit * 2`.
726    /// - `retrieval_depth >= 1`: returns the configured depth directly.
727    ///
728    /// When `retrieval_depth < limit`, a one-shot WARN fires because the ANN pool cannot
729    /// saturate the requested top-k.  When `limit <= retrieval_depth < limit * 2`, an INFO
730    /// fires per call noting the smaller-than-legacy pool.
731    pub(crate) fn effective_depth(&self, limit: usize) -> usize {
732        use std::sync::atomic::Ordering;
733
734        let depth = self.retrieval_depth as usize;
735        if depth == 0 {
736            return limit.saturating_mul(2);
737        }
738        if depth < limit {
739            if !self.depth_below_limit_warned.swap(true, Ordering::Relaxed) {
740                tracing::warn!(
741                    retrieval_depth = depth,
742                    recall_limit = limit,
743                    "memory.retrieval.depth < recall_limit; ANN pool cannot saturate top-k — consider raising depth"
744                );
745            }
746        } else if depth < limit.saturating_mul(2) {
747            tracing::info!(
748                retrieval_depth = depth,
749                recall_limit = limit,
750                legacy_default = limit.saturating_mul(2),
751                "memory.retrieval.depth is below legacy limit*2; ANN pool will be smaller than pre-#3340"
752            );
753        } else {
754            tracing::debug!(
755                retrieval_depth = depth,
756                recall_limit = limit,
757                "recall: using configured ANN depth"
758            );
759        }
760        depth
761    }
762
763    /// Apply the configured search prompt template to a raw query (MM-F2, #3340).
764    ///
765    /// Returns `query` as-is when the template is empty or has no `{query}` placeholder.
766    /// A one-shot WARN fires when the template is non-empty but missing the placeholder.
767    pub(crate) fn apply_search_prompt(&self, query: &str) -> String {
768        use std::sync::atomic::Ordering;
769
770        let template = &self.search_prompt_template;
771        if template.is_empty() {
772            return query.to_owned();
773        }
774        if !template.contains("{query}") {
775            if !self
776                .missing_placeholder_warned
777                .swap(true, Ordering::Relaxed)
778            {
779                tracing::warn!(
780                    template = template.as_str(),
781                    "memory.retrieval.search_prompt_template has no {{query}} placeholder — \
782                     using raw query as-is"
783                );
784            }
785            return query.to_owned();
786        }
787        template.replace("{query}", query)
788    }
789
790    /// Attach a dedicated embedding provider for write-path and backfill operations.
791    ///
792    /// When set, all batch embedding calls (backfill, `remember`) route through this provider
793    /// instead of the main `provider`. This prevents `embed_backfill` from saturating the main
794    /// provider and causing guardrail timeouts due to rate-limit contention or Ollama model-lock.
795    #[must_use]
796    pub fn with_embed_provider(mut self, embed_provider: AnyProvider) -> Self {
797        self.embed_provider = Some(embed_provider);
798        self
799    }
800
801    /// Returns the provider to use for embedding calls.
802    ///
803    /// Returns the dedicated embed provider when configured, falling back to the main provider.
804    pub fn effective_embed_provider(&self) -> &AnyProvider {
805        self.embed_provider.as_ref().unwrap_or(&self.provider)
806    }
807
    /// Construct a `SemanticMemory` from pre-built parts.
    ///
    /// Intended for tests that need full control over the backing stores.
    ///
    /// All tunables not covered by the parameters get conservative defaults: temporal decay,
    /// MMR, importance weighting, and hebbian spreading are disabled; admission control,
    /// quality gate, and the optional subsystems are unset; `retrieval_depth` of `0` selects
    /// the legacy `recall_limit * 2` ANN pool.
    #[must_use]
    pub fn from_parts(
        sqlite: SqliteStore,
        qdrant: Option<Arc<EmbeddingStore>>,
        provider: AnyProvider,
        embedding_model: impl Into<String>,
        vector_weight: f64,
        keyword_weight: f64,
        token_counter: Arc<TokenCounter>,
    ) -> Self {
        Self {
            // Caller-supplied backing stores and LLM provider.
            sqlite,
            qdrant,
            provider,
            // No dedicated embed provider: embedding calls fall back to `provider`.
            embed_provider: None,
            embedding_model: embedding_model.into(),
            vector_weight,
            keyword_weight,
            // Recall-scoring features default to off, with neutral tuning values
            // preserved so enabling a flag alone yields sensible behavior.
            temporal_decay_enabled: false,
            temporal_decay_half_life_days: 30,
            mmr_enabled: false,
            mmr_lambda: 0.7,
            importance_enabled: false,
            importance_weight: 0.15,
            tier_boost_semantic: 1.3,
            token_counter,
            // Optional subsystems (graph, experience, reasoning) are not attached.
            graph_store: None,
            experience: None,
            reasoning: None,
            // Metrics counters start at zero.
            community_detection_failures: Arc::new(AtomicU64::new(0)),
            graph_extraction_count: Arc::new(AtomicU64::new(0)),
            graph_extraction_failures: Arc::new(AtomicU64::new(0)),
            last_qdrant_warn: Arc::new(AtomicU64::new(0)),
            admission_control: None,
            quality_gate: None,
            key_facts_dedup_threshold: 0.95,
            embed_tasks: std::sync::Mutex::new(tokio::task::JoinSet::new()),
            // MM-F1/F2 (#3340): legacy ANN depth, identity search prompt.
            retrieval_depth: 0,
            search_prompt_template: String::new(),
            // One-shot warning latches used by `effective_depth` / `apply_search_prompt`.
            depth_below_limit_warned: Arc::new(std::sync::atomic::AtomicBool::new(false)),
            missing_placeholder_warned: Arc::new(std::sync::atomic::AtomicBool::new(false)),
            query_bias_correction: true,
            query_bias_profile_weight: 0.25,
            // Persona-centroid cache starts empty with a 300-second TTL.
            profile_centroid: RwLock::new(None),
            profile_centroid_ttl_secs: 300,
            hebbian_enabled: false,
            hebbian_lr: 0.1,
            hebbian_spread: HelaSpreadRuntime::default(),
        }
    }
861
862    /// Create a `SemanticMemory` using the `SQLite`-embedded vector backend.
863    ///
864    /// # Errors
865    ///
866    /// Returns an error if `SQLite` cannot be initialized.
867    pub async fn with_sqlite_backend(
868        sqlite_path: &str,
869        provider: AnyProvider,
870        embedding_model: &str,
871        vector_weight: f64,
872        keyword_weight: f64,
873    ) -> Result<Self, MemoryError> {
874        Self::with_sqlite_backend_and_pool_size(
875            sqlite_path,
876            provider,
877            embedding_model,
878            vector_weight,
879            keyword_weight,
880            5,
881        )
882        .await
883    }
884
885    /// Create a `SemanticMemory` using the `SQLite`-embedded vector backend with configurable pool size.
886    ///
887    /// # Errors
888    ///
889    /// Returns an error if `SQLite` cannot be initialized.
890    pub async fn with_sqlite_backend_and_pool_size(
891        sqlite_path: &str,
892        provider: AnyProvider,
893        embedding_model: &str,
894        vector_weight: f64,
895        keyword_weight: f64,
896        pool_size: u32,
897    ) -> Result<Self, MemoryError> {
898        let sqlite = SqliteStore::with_pool_size(sqlite_path, pool_size).await?;
899        let pool = sqlite.pool().clone();
900        let store = EmbeddingStore::new_sqlite(pool);
901
902        Ok(Self {
903            sqlite,
904            qdrant: Some(Arc::new(store)),
905            provider,
906            embed_provider: None,
907            embedding_model: embedding_model.into(),
908            vector_weight,
909            keyword_weight,
910            temporal_decay_enabled: false,
911            temporal_decay_half_life_days: 30,
912            mmr_enabled: false,
913            mmr_lambda: 0.7,
914            importance_enabled: false,
915            importance_weight: 0.15,
916            tier_boost_semantic: 1.3,
917            token_counter: Arc::new(TokenCounter::new()),
918            graph_store: None,
919            experience: None,
920            reasoning: None,
921            community_detection_failures: Arc::new(AtomicU64::new(0)),
922            graph_extraction_count: Arc::new(AtomicU64::new(0)),
923            graph_extraction_failures: Arc::new(AtomicU64::new(0)),
924            last_qdrant_warn: Arc::new(AtomicU64::new(0)),
925            admission_control: None,
926            quality_gate: None,
927            key_facts_dedup_threshold: 0.95,
928            embed_tasks: std::sync::Mutex::new(tokio::task::JoinSet::new()),
929            retrieval_depth: 0,
930            search_prompt_template: String::new(),
931            depth_below_limit_warned: Arc::new(std::sync::atomic::AtomicBool::new(false)),
932            missing_placeholder_warned: Arc::new(std::sync::atomic::AtomicBool::new(false)),
933            query_bias_correction: true,
934            query_bias_profile_weight: 0.25,
935            profile_centroid: RwLock::new(None),
936            profile_centroid_ttl_secs: 300,
937            hebbian_enabled: false,
938            hebbian_lr: 0.1,
939            hebbian_spread: HelaSpreadRuntime::default(),
940        })
941    }
942
943    /// Access the underlying `SqliteStore` for operations that don't involve semantics.
944    #[must_use]
945    pub fn sqlite(&self) -> &SqliteStore {
946        &self.sqlite
947    }
948
949    /// Check if the vector store backend is reachable.
950    ///
951    /// Performs a real health check (Qdrant gRPC ping or `SQLite` query)
952    /// instead of just checking whether the client was created.
953    pub async fn is_vector_store_connected(&self) -> bool {
954        match self.qdrant.as_ref() {
955            Some(store) => store.health_check().await,
956            None => false,
957        }
958    }
959
960    /// Check if a vector store client is configured (may not be connected).
961    #[must_use]
962    pub fn has_vector_store(&self) -> bool {
963        self.qdrant.is_some()
964    }
965
966    /// Return a reference to the embedding store, if configured.
967    #[must_use]
968    pub fn embedding_store(&self) -> Option<&Arc<EmbeddingStore>> {
969        self.qdrant.as_ref()
970    }
971
972    /// Return a reference to the underlying LLM provider (used for RPE embedding).
973    pub fn provider(&self) -> &AnyProvider {
974        &self.provider
975    }
976
977    /// Count messages in a conversation.
978    ///
979    /// # Errors
980    ///
981    /// Returns an error if the query fails.
982    pub async fn message_count(
983        &self,
984        conversation_id: crate::types::ConversationId,
985    ) -> Result<i64, MemoryError> {
986        self.sqlite.count_messages(conversation_id).await
987    }
988
989    /// Count messages not yet covered by any summary.
990    ///
991    /// # Errors
992    ///
993    /// Returns an error if the query fails.
994    pub async fn unsummarized_message_count(
995        &self,
996        conversation_id: crate::types::ConversationId,
997    ) -> Result<i64, MemoryError> {
998        let after_id = self
999            .sqlite
1000            .latest_summary_last_message_id(conversation_id)
1001            .await?
1002            .unwrap_or(crate::types::MessageId(0));
1003        self.sqlite
1004            .count_messages_after(conversation_id, after_id)
1005            .await
1006    }
1007
1008    /// Load recent episodic messages for the promotion-scan window.
1009    ///
1010    /// Returns up to `max_items` of the most recent non-deleted messages across all
1011    /// conversations, with their `conversation_id` for session-count heuristics.
1012    ///
1013    /// # Embedding note
1014    ///
1015    /// `embedding` is returned as `None` in this MVP implementation. A future pass
1016    /// will join with the Qdrant payload to populate embeddings inline.
1017    ///
1018    /// # Errors
1019    ///
1020    /// Returns [`MemoryError`] if the underlying `SQLite` query fails.
1021    // TODO(review): populate embeddings by fetching from Qdrant when available.
1022    pub async fn load_promotion_window(
1023        &self,
1024        max_items: usize,
1025    ) -> Result<Vec<crate::compression::promotion::PromotionInput>, MemoryError> {
1026        use zeph_db::sql;
1027
1028        let limit = i64::try_from(max_items).unwrap_or(i64::MAX);
1029        let rows: Vec<(
1030            crate::types::MessageId,
1031            crate::types::ConversationId,
1032            String,
1033        )> = zeph_db::query_as(sql!(
1034            "SELECT id, conversation_id, content \
1035                 FROM messages \
1036                 WHERE deleted_at IS NULL \
1037                 ORDER BY id DESC \
1038                 LIMIT ?"
1039        ))
1040        .bind(limit)
1041        .fetch_all(self.sqlite.pool())
1042        .await?;
1043
1044        Ok(rows
1045            .into_iter()
1046            .map(|(message_id, conversation_id, content)| {
1047                crate::compression::promotion::PromotionInput {
1048                    message_id,
1049                    conversation_id,
1050                    content,
1051                    // Embeddings not wired yet — scan will skip rows with None.
1052                    embedding: None,
1053                }
1054            })
1055            .collect())
1056    }
1057
1058    /// Retrieve top-k reasoning strategies by embedding similarity to `query`.
1059    ///
1060    /// Returns an empty vec when reasoning memory is not attached, Qdrant is unavailable,
1061    /// or the provider does not support embeddings.
1062    ///
1063    /// This method is **pure** — it does not increment `use_count` or `last_used_at`.
1064    /// Call [`crate::reasoning::ReasoningMemory::mark_used`] with the ids of strategies
1065    /// actually injected into the prompt (after budget truncation).
1066    ///
1067    /// # Errors
1068    ///
1069    /// Returns an error if embedding generation or the vector search fails.
1070    pub async fn retrieve_reasoning_strategies(
1071        &self,
1072        query: &str,
1073        limit: usize,
1074    ) -> Result<Vec<crate::reasoning::ReasoningStrategy>, MemoryError> {
1075        let Some(reasoning) = &self.reasoning else {
1076            return Ok(Vec::new());
1077        };
1078        if !self.effective_embed_provider().supports_embeddings() {
1079            return Ok(Vec::new());
1080        }
1081        let embedding = self.effective_embed_provider().embed(query).await?;
1082        reasoning
1083            .retrieve_by_embedding(&embedding, limit as u64)
1084            .await
1085    }
1086}