innate_core/kb/
mod.rs

1//! KnowledgeBase — all 8 Public APIs.
2
3/// Return type for pack(): (selected_chunks, skipped_groups, skip_reasons)
4type PackResult = (
5    Vec<Value>,
6    Vec<(Vec<Value>, f64, usize)>,
7    std::collections::HashMap<String, String>,
8);
9
10use std::collections::{HashMap, HashSet};
11use std::path::Path;
12use std::sync::Arc;
13
14use serde_json::{json, Value};
15
16use crate::embedding::{DummyEmbeddingProvider, EmbeddingProvider};
17use crate::errors::{InnateError, Result};
18use crate::refine::{
19    DefaultSanitizer, DistilledChunk, Distiller, HeuristicDistiller, NoopReranker, NullRefiner,
20    Refiner, Reranker, Sanitizer,
21};
22use crate::storage::{ChunkRow, EpisodicLogRow, Storage};
23use crate::utils::{
24    content_hash, estimate_tokens, gen_uuid, pack_embedding, utc_now_iso, SanitizeAction,
25};
26
27mod appraise;
28mod curate;
29mod evolve;
30mod inspection;
31mod lifecycle;
32mod recall;
33mod record;
34mod repair;
35mod situation;
36
37pub use appraise::{
38    AbstainReason, AppraiseParams, Contributor, FlaggedPoint, Tier, Valence, Verdict,
39    APPRAISE_ADVISORY,
40};
41pub use recall::RecallParams;
42pub use record::RecordParams;
43pub use repair::TraceRepairReport;
44pub use situation::Situation;
45
46// ---------------------------------------------------------------------------
47// Tuning defaults
48// ---------------------------------------------------------------------------
49
50// Fused recall score weights. These intentionally sum to 1.05 (not 1.0): the
51// score is a relative ranking signal, not a calibrated probability, so the extra
52// 0.05 of headroom on content similarity is deliberate and the result is never
53// re-normalised. Keep this in mind before "fixing" the sum.
54const W_CONTENT: f64 = 0.55;
55const W_TRIGGER: f64 = 0.25;
56const W_CONFIDENCE: f64 = 0.10;
57const W_CONTEXT: f64 = 0.15;
58const W_ACTIVATION: f64 = 0.08;
59// Hybrid 检索:lexical/BM25 channel weight. Modest by default so exact-term
60// matches lift the right chunk without overpowering semantic similarity.
61const W_LEXICAL: f64 = 0.25;
62const TOP_K_CANDIDATES: usize = 20;
63const ANTI_TRIGGER_PENALTY: f64 = 0.6;
64const DENSITY_REFILL: bool = true;
65
66const LOW_CONF_THRESHOLD: f64 = 0.25;
67const LOW_CONF_IDLE_DAYS: i64 = 60;
68const REPEAT_SELECT_MIN: i64 = 10;
69const REPEAT_SELECT_CONF_MAX: f64 = 0.5;
70const NEVER_USED_AGE_DAYS: i64 = 30;
71const OPEN_TTL_DAYS: i64 = 14;
72const SCREENING_TIMEOUT_MINUTES: i64 = 30;
73const PROMOTE_USED_SUCCESS_MIN: i64 = 3;
74const PROMOTE_CONFIDENCE_MIN: f64 = 0.60;
75const DECAY_FLOOR: f64 = 0.20;
76const EVOLVE_THRESHOLD: i64 = 5;
77const DISTILL_BATCH_SIZE: usize = 20;
78const PENDING_RECALL_PENALTY: f64 = 0.60;
79
80// Intuition / appraise critic defaults (Spec §8). The appraise path reuses the
81// same fused score as recall; these only govern how that score is tiered/flagged.
82const APPRAISE_TIER_WEAK: f64 = 0.30;
83const APPRAISE_TIER_STRONG: f64 = 0.65;
84const APPRAISE_MIN_STRENGTH: f64 = 0.40;
85const APPRAISE_TOP: usize = 8;
86const APPRAISE_TRIGGER_HIT_MIN: f64 = 0.50;
87const APPRAISE_CANDIDATE_IN_EMBED: bool = true;
88// 弃权门(方案 A/F/G)。默认值保持现行行为(门2/门3/门4 关闭),由 meta 调参激活。
89//   门2 signature_floor=0.0   → 关闭(任何一致度都放行)
90//   门3 min_evidence=0        → 关闭(不要求观测历史)
91//   门4 conflict_ceiling=1.0  → 关闭(离散度上界恒不触发)
92// 门1 弱共振无需阈值:prune 后候选为空即弃权(WeakResonance),天然作动。
93const APPRAISE_SIGNATURE_FLOOR: f64 = 0.0;
94const APPRAISE_MIN_EVIDENCE: i64 = 0;
95const APPRAISE_CONFLICT_CEILING: f64 = 1.0;
96// 方案 D 基率锚定先验:prior = Beta(m·g0, m·(1-g0))。默认 m=2, g0=0.5 与旧 Laplace
97// (wins+1)/(evidence+2) 完全等价 → 零行为变化,调大 m / 设真实基率即激活。
98// **仅作用于 appraise(直觉/校准)路径**:实施文档明确范围不含 recall。
99const INTUITION_PRIOR_M: f64 = 2.0;
100const INTUITION_BASE_RATE: f64 = 0.5;
101// recall(图书管理员)路径恒用中性 Laplace 先验(m=2, g0=0.5),与历史
102// (wins+1)/(evidence+2) 逐位等价,绝不受 intuition.* 校准旋钮影响。方案 D 与 recall 解耦。
103const RECALL_PRIOR_M: f64 = 2.0;
104const RECALL_BASE_RATE: f64 = 0.5;
105// 方案 E 校准映射桶数。
106const CALIBRATION_BINS: i64 = 10;
107const SITUATION_COARSE_KEYS: &str = "stage,error_class,file_type";
108// Part (c) — query-embedding granularity. When true, recall folds the normalized
109// situation signature (stage/error_class/file_type) into the embedded query text so
110// the embedding anchors on the situation, not just raw words. Default OFF: opt-in
111// and reversible (chunks are embedded from content/trigger, so enabling it shifts
112// only the query side — measure with `innate recall-eval` before turning on).
113const EMBED_SITUATION_SIGNATURE: bool = false;
114const GOVERNANCE_ARCHIVE_THRESHOLD: i64 = 3;
115const NEGATIVE_FEEDBACK_ARCHIVE_THRESHOLD: i64 = 5;
116const GOVERNANCE_EVOLVE_THRESHOLD: i64 = 3;
117const FAILURE_MIN_USES: i64 = 5;
118const FAILURE_MAX_SUCCESS_RATE: f64 = 0.20;
119const FAILURE_CONFIDENCE_MAX: f64 = 0.35;
120const LOG_COMPACT_DAYS: i64 = 30;
121
122// ---------------------------------------------------------------------------
123// Public result types
124// ---------------------------------------------------------------------------
125
126#[derive(Debug, Default, Clone)]
127pub struct RecallResult {
128    pub knowledge: Vec<Value>,
129    pub sparks: Vec<Value>,
130    pub trace_id: String,
131    pub empty: bool,
132    pub depth_skipped: Vec<String>,
133    pub skipped_reasons: HashMap<String, String>,
134}
135
136#[derive(Debug, Default)]
137pub struct CurateReport {
138    pub archived: Vec<String>,
139    pub deduped: Vec<String>,
140    pub decayed: Vec<String>,
141    pub cycles: Vec<Vec<String>>,
142    pub orphans: Vec<String>,
143    pub recovered: Vec<String>,
144    pub warnings: Vec<String>,
145    pub stats: HashMap<String, Value>,
146}
147
148#[derive(Debug, Default)]
149struct DistillBatchReport {
150    distilled: usize,
151    failed: usize,
152}
153
154/// Scope for a single Curate run — allows limiting governance to a subset of chunks.
155#[derive(Debug, Default, Clone)]
156pub struct CurateScope {
157    /// If set, only process chunks with this origin (e.g. "distilled").
158    pub origin: Option<String>,
159    /// If set, only process chunks belonging to this skill.
160    pub skill_name: Option<String>,
161    /// When true, compute the report but do not write any changes.
162    pub dry_run: bool,
163}
164
165/// Replaceable governance interface (§二·六). Inject via `KnowledgeBase::open_with`.
166/// Default implementation: `BuiltinCurator`.
167pub trait Curator: Send + Sync {
168    fn run(&self, kb: &KnowledgeBase, scope: &CurateScope) -> Result<CurateReport>;
169}
170
171/// Built-in curator — implements the full §四 governance pipeline.
172pub struct BuiltinCurator;
173
174impl Curator for BuiltinCurator {
175    fn run(&self, kb: &KnowledgeBase, scope: &CurateScope) -> Result<CurateReport> {
176        kb.builtin_curate_impl(scope)
177    }
178}
179
180// ---------------------------------------------------------------------------
181// KnowledgeBase
182// ---------------------------------------------------------------------------
183
184pub struct KnowledgeBase {
185    pub storage: Storage,
186    embedding: Arc<dyn EmbeddingProvider>,
187    refiner: Arc<dyn Refiner>,
188    distiller: Arc<dyn Distiller>,
189    curator: Arc<dyn Curator>,
190    sanitizer: Arc<dyn Sanitizer>,
191    /// Opt-in offline reranker (part d). Defaults to `NoopReranker` (fused order
192    /// preserved); set via `with_reranker` when an LLM is configured.
193    reranker: Arc<dyn Reranker>,
194
195    // Tuning params (loaded from meta at init)
196    w_content: f64,
197    w_trigger: f64,
198    w_confidence: f64,
199    w_context: f64,
200    w_activation: f64,
201    w_lexical: f64,
202    top_k_candidates: usize,
203    anti_trigger_penalty: f64,
204    density_refill: bool,
205
206    low_conf_threshold: f64,
207    low_conf_idle_days: i64,
208    repeat_select_min: i64,
209    repeat_select_conf_max: f64,
210    never_used_age_days: i64,
211    open_ttl_days: i64,
212    screening_timeout_minutes: i64,
213    promote_used_success_min: i64,
214    promote_confidence_min: f64,
215    decay_floor: f64,
216    evolve_threshold: i64,
217    distill_batch_size: usize,
218    evolve_schedule_interval_hours: i64,
219    governance_archive_threshold: i64,
220    negative_feedback_archive_threshold: i64,
221    governance_evolve_threshold: i64,
222    governance_proposal_max_age_days: i64,
223    failure_min_uses: i64,
224    failure_max_success_rate: f64,
225    failure_confidence_max: f64,
226    log_compact_days: i64,
227
228    // Intuition / appraise critic params
229    appraise_tier_weak: f64,
230    appraise_tier_strong: f64,
231    appraise_min_strength: f64,
232    appraise_top: usize,
233    appraise_trigger_hit_min: f64,
234    appraise_candidate_in_embed: bool,
235    appraise_signature_floor: f64,
236    appraise_min_evidence: i64,
237    appraise_conflict_ceiling: f64,
238    intuition_prior_m: f64,
239    intuition_base_rate: f64,
240    calibration_bins: i64,
241    situation_coarse_keys: String,
242    embed_situation_signature: bool,
243}
244
245impl KnowledgeBase {
246    pub fn open(db_path: impl AsRef<Path>) -> Result<Self> {
247        Self::open_with(db_path, None, None, None, None, None)
248    }
249
250    /// Persist a content embedding, rejecting any vector whose dimension differs
251    /// from the configured provider. A mismatched vector is silently skipped at
252    /// search time (cosine search only scores equal-dimension vectors), so an
253    /// unchecked write becomes invisible recall loss with no error. Fail closed
254    /// at the write boundary instead. All vector writers route through here.
255    pub(crate) fn store_vec_content(&self, chunk_id: &str, cvec: &[f32]) -> Result<()> {
256        let want = self.embedding.content_dim();
257        if cvec.len() != want {
258            return Err(InnateError::InvalidState(format!(
259                "content embedding dim {} != configured {want} (chunk {chunk_id})",
260                cvec.len()
261            )));
262        }
263        self.storage
264            .insert_vec_content(chunk_id, &pack_embedding(cvec))
265    }
266
267    /// Trigger-vector counterpart of [`store_vec_content`]; same fail-closed
268    /// dimension guard against `trigger_dim()`.
269    pub(crate) fn store_vec_trigger(&self, chunk_id: &str, tvec: &[f32]) -> Result<()> {
270        let want = self.embedding.trigger_dim();
271        if tvec.len() != want {
272            return Err(InnateError::InvalidState(format!(
273                "trigger embedding dim {} != configured {want} (chunk {chunk_id})",
274                tvec.len()
275            )));
276        }
277        self.storage
278            .insert_vec_trigger(chunk_id, &pack_embedding(tvec))
279    }
280
281    pub fn open_with(
282        db_path: impl AsRef<Path>,
283        embedding: Option<Arc<dyn EmbeddingProvider>>,
284        refiner: Option<Arc<dyn Refiner>>,
285        distiller: Option<Arc<dyn Distiller>>,
286        curator: Option<Arc<dyn Curator>>,
287        sanitizer: Option<Arc<dyn Sanitizer>>,
288    ) -> Result<Self> {
289        let embedding = embedding.unwrap_or_else(|| Arc::new(DummyEmbeddingProvider::default()));
290        let refiner = refiner.unwrap_or_else(|| Arc::new(NullRefiner));
291        let distiller = distiller.unwrap_or_else(|| Arc::new(HeuristicDistiller));
292        let curator = curator.unwrap_or_else(|| Arc::new(BuiltinCurator));
293        let sanitizer = sanitizer.unwrap_or_else(|| Arc::new(DefaultSanitizer));
294        let reranker: Arc<dyn Reranker> = Arc::new(NoopReranker);
295
296        let storage = Storage::open(db_path, embedding.content_dim(), embedding.trigger_dim())?;
297
298        let mut kb = Self {
299            storage,
300            embedding,
301            refiner,
302            distiller,
303            curator,
304            sanitizer,
305            reranker,
306            w_lexical: W_LEXICAL,
307            embed_situation_signature: EMBED_SITUATION_SIGNATURE,
308            w_content: W_CONTENT,
309            w_trigger: W_TRIGGER,
310            w_confidence: W_CONFIDENCE,
311            w_context: W_CONTEXT,
312            w_activation: W_ACTIVATION,
313            top_k_candidates: TOP_K_CANDIDATES,
314            anti_trigger_penalty: ANTI_TRIGGER_PENALTY,
315            density_refill: DENSITY_REFILL,
316            low_conf_threshold: LOW_CONF_THRESHOLD,
317            low_conf_idle_days: LOW_CONF_IDLE_DAYS,
318            repeat_select_min: REPEAT_SELECT_MIN,
319            repeat_select_conf_max: REPEAT_SELECT_CONF_MAX,
320            never_used_age_days: NEVER_USED_AGE_DAYS,
321            open_ttl_days: OPEN_TTL_DAYS,
322            screening_timeout_minutes: SCREENING_TIMEOUT_MINUTES,
323            promote_used_success_min: PROMOTE_USED_SUCCESS_MIN,
324            promote_confidence_min: PROMOTE_CONFIDENCE_MIN,
325            decay_floor: DECAY_FLOOR,
326            evolve_threshold: EVOLVE_THRESHOLD,
327            distill_batch_size: DISTILL_BATCH_SIZE,
328            evolve_schedule_interval_hours: 6,
329            governance_archive_threshold: GOVERNANCE_ARCHIVE_THRESHOLD,
330            negative_feedback_archive_threshold: NEGATIVE_FEEDBACK_ARCHIVE_THRESHOLD,
331            governance_evolve_threshold: GOVERNANCE_EVOLVE_THRESHOLD,
332            governance_proposal_max_age_days: 30,
333            failure_min_uses: FAILURE_MIN_USES,
334            failure_max_success_rate: FAILURE_MAX_SUCCESS_RATE,
335            failure_confidence_max: FAILURE_CONFIDENCE_MAX,
336            log_compact_days: LOG_COMPACT_DAYS,
337            appraise_tier_weak: APPRAISE_TIER_WEAK,
338            appraise_tier_strong: APPRAISE_TIER_STRONG,
339            appraise_min_strength: APPRAISE_MIN_STRENGTH,
340            appraise_top: APPRAISE_TOP,
341            appraise_trigger_hit_min: APPRAISE_TRIGGER_HIT_MIN,
342            appraise_candidate_in_embed: APPRAISE_CANDIDATE_IN_EMBED,
343            appraise_signature_floor: APPRAISE_SIGNATURE_FLOOR,
344            appraise_min_evidence: APPRAISE_MIN_EVIDENCE,
345            appraise_conflict_ceiling: APPRAISE_CONFLICT_CEILING,
346            intuition_prior_m: INTUITION_PRIOR_M,
347            intuition_base_rate: INTUITION_BASE_RATE,
348            calibration_bins: CALIBRATION_BINS,
349            situation_coarse_keys: SITUATION_COARSE_KEYS.to_string(),
350        };
351        kb.init_meta()?;
352        kb.load_params()?;
353        Ok(kb)
354    }
355
356    /// Install an opt-in offline reranker (part d). Used by `open_kb` when an LLM is
357    /// configured; recall only invokes it when a caller passes `rerank=true`, so the
358    /// default hook path stays no-LLM regardless.
359    pub fn with_reranker(mut self, reranker: Arc<dyn Reranker>) -> Self {
360        self.reranker = reranker;
361        self
362    }
363
364    fn init_meta(&self) -> Result<()> {
365        let lib_id = gen_uuid();
366        let content_dim = self.embedding.content_dim().to_string();
367        let trigger_dim = self.embedding.trigger_dim().to_string();
368        let embed_model = self.embedding.model_name();
369
370        for (key, expected) in [
371            ("content_dim", self.embedding.content_dim()),
372            ("trigger_dim", self.embedding.trigger_dim()),
373        ] {
374            if let Some(stored) = self.storage.get_meta(key)? {
375                let actual = stored.parse::<usize>().map_err(|_| {
376                    InnateError::Other(format!("invalid {key} metadata value: {stored}"))
377                })?;
378                if actual != expected {
379                    return Err(InnateError::Other(format!(
380                        "{key} mismatch: database uses {actual}, embedding provider uses {expected}"
381                    )));
382                }
383            }
384        }
385
386        let defaults: &[(&str, &str)] = &[
387            ("lib_id", &lib_id),
388            ("lib_role", "personal"),
389            ("schema_version", "4.14"),
390            ("content_dim", &content_dim),
391            ("trigger_dim", &trigger_dim),
392            ("embed_model", embed_model),
393            ("embed_version", "1"),
394            ("vector_revision", "0"),
395            ("last_agg_ts", "1970-01-01T00:00:00.000Z"),
396            ("recall.w_content", "0.55"),
397            ("recall.w_trigger", "0.25"),
398            ("recall.w_confidence", "0.10"),
399            ("recall.w_context", "0.15"),
400            ("recall.w_activation", "0.08"),
401            ("recall.w_lexical", "0.25"),
402            ("recall.embed_situation_signature", "false"),
403            ("recall.top_k_candidates", "20"),
404            ("recall.anti_trigger_penalty", "0.6"),
405            ("recall.density_refill", "true"),
406            ("curate.low_conf_threshold", "0.25"),
407            ("curate.low_conf_idle_days", "60"),
408            ("curate.repeat_select_min", "10"),
409            ("curate.repeat_select_conf_max", "0.5"),
410            ("curate.never_used_age_days", "30"),
411            ("curate.open_ttl_days", "14"),
412            ("curate.screening_timeout_minutes", "30"),
413            ("curate.promote_used_success_min", "3"),
414            ("curate.promote_confidence_min", "0.60"),
415            ("curate.decay_floor", "0.20"),
416            ("evolve.threshold_new_count", "5"),
417            ("evolve.distill_batch_size", "20"),
418            ("evolve.schedule_interval_hours", "6"),
419            ("curate.soft_mature_threshold", "5"),
420            ("evolve.distill_token_window_hours", "24"),
421            ("curate.governance_archive_threshold", "3"),
422            ("curate.negative_feedback_archive_threshold", "5"),
423            ("evolve.governance_pending_threshold", "3"),
424            ("curate.governance_proposal_max_age_days", "30"),
425            ("curate.failure_min_uses", "5"),
426            ("curate.failure_max_success_rate", "0.20"),
427            ("curate.failure_confidence_max", "0.35"),
428            ("curate.log_compact_days", "30"),
429            ("appraise.tier_weak", "0.30"),
430            ("appraise.tier_strong", "0.65"),
431            ("appraise.min_strength", "0.40"),
432            ("appraise.top", "8"),
433            ("appraise.trigger_hit_min", "0.50"),
434            ("appraise.candidate_in_embed", "true"),
435            ("appraise.signature_floor", "0.0"),
436            ("appraise.min_evidence", "0"),
437            ("appraise.conflict_ceiling", "1.0"),
438            ("intuition.prior_m", "2.0"),
439            ("intuition.base_rate", "0.5"),
440            ("intuition.calibration_bins", "10"),
441            ("situation.coarse_keys", "stage,error_class,file_type"),
442        ];
443        self.storage.begin_immediate()?;
444        let result = (|| -> Result<()> {
445            for (k, v) in defaults {
446                if self.storage.get_meta(k)?.is_none() {
447                    self.storage.set_meta(k, v)?;
448                }
449            }
450            self.storage.commit()
451        })();
452        if result.is_err() {
453            let _ = self.storage.rollback();
454        }
455        result
456    }
457
458    fn load_params(&mut self) -> Result<()> {
459        let f = |k: &str, d: f64| -> f64 {
460            self.storage
461                .get_meta(k)
462                .ok()
463                .flatten()
464                .and_then(|v| v.parse().ok())
465                .unwrap_or(d)
466        };
467        let i = |k: &str, d: i64| -> i64 {
468            self.storage
469                .get_meta(k)
470                .ok()
471                .flatten()
472                .and_then(|v| v.parse().ok())
473                .unwrap_or(d)
474        };
475        let b = |k: &str, d: bool| -> bool {
476            self.storage
477                .get_meta(k)
478                .ok()
479                .flatten()
480                .map(|v| v.to_lowercase() == "true")
481                .unwrap_or(d)
482        };
483        self.w_content = f("recall.w_content", W_CONTENT);
484        self.w_trigger = f("recall.w_trigger", W_TRIGGER);
485        self.w_confidence = f("recall.w_confidence", W_CONFIDENCE);
486        self.w_context = f("recall.w_context", W_CONTEXT);
487        self.w_lexical = f("recall.w_lexical", W_LEXICAL);
488        self.embed_situation_signature =
489            b("recall.embed_situation_signature", EMBED_SITUATION_SIGNATURE);
490        self.w_activation = f("recall.w_activation", W_ACTIVATION);
491        self.top_k_candidates =
492            i("recall.top_k_candidates", TOP_K_CANDIDATES as i64).max(1) as usize;
493        self.anti_trigger_penalty = f("recall.anti_trigger_penalty", ANTI_TRIGGER_PENALTY);
494        self.density_refill = b("recall.density_refill", DENSITY_REFILL);
495        self.low_conf_threshold = f("curate.low_conf_threshold", LOW_CONF_THRESHOLD);
496        self.low_conf_idle_days = i("curate.low_conf_idle_days", LOW_CONF_IDLE_DAYS);
497        self.repeat_select_min = i("curate.repeat_select_min", REPEAT_SELECT_MIN);
498        self.repeat_select_conf_max = f("curate.repeat_select_conf_max", REPEAT_SELECT_CONF_MAX);
499        self.never_used_age_days = i("curate.never_used_age_days", NEVER_USED_AGE_DAYS);
500        self.open_ttl_days = i("curate.open_ttl_days", OPEN_TTL_DAYS);
501        self.screening_timeout_minutes = i(
502            "curate.screening_timeout_minutes",
503            SCREENING_TIMEOUT_MINUTES,
504        );
505        self.promote_used_success_min =
506            i("curate.promote_used_success_min", PROMOTE_USED_SUCCESS_MIN);
507        self.promote_confidence_min = f("curate.promote_confidence_min", PROMOTE_CONFIDENCE_MIN);
508        self.decay_floor = f("curate.decay_floor", DECAY_FLOOR).clamp(0.0, 0.4);
509        self.evolve_threshold = i("evolve.threshold_new_count", EVOLVE_THRESHOLD);
510        self.distill_batch_size =
511            i("evolve.distill_batch_size", DISTILL_BATCH_SIZE as i64) as usize;
512        self.evolve_schedule_interval_hours = i("evolve.schedule_interval_hours", 6).max(1);
513        self.governance_archive_threshold = i(
514            "curate.governance_archive_threshold",
515            GOVERNANCE_ARCHIVE_THRESHOLD,
516        )
517        .max(1);
518        self.negative_feedback_archive_threshold = i(
519            "curate.negative_feedback_archive_threshold",
520            NEGATIVE_FEEDBACK_ARCHIVE_THRESHOLD,
521        )
522        .max(1);
523        self.governance_evolve_threshold = i(
524            "evolve.governance_pending_threshold",
525            GOVERNANCE_EVOLVE_THRESHOLD,
526        )
527        .max(1);
528        self.governance_proposal_max_age_days =
529            i("curate.governance_proposal_max_age_days", 30).max(1);
530        self.failure_min_uses = i("curate.failure_min_uses", FAILURE_MIN_USES).max(1);
531        self.failure_max_success_rate =
532            f("curate.failure_max_success_rate", FAILURE_MAX_SUCCESS_RATE).clamp(0.0, 1.0);
533        self.failure_confidence_max =
534            f("curate.failure_confidence_max", FAILURE_CONFIDENCE_MAX).clamp(0.0, 1.0);
535        self.log_compact_days = i("curate.log_compact_days", LOG_COMPACT_DAYS).max(1);
536        let s = |k: &str, d: &str| -> String {
537            self.storage
538                .get_meta(k)
539                .ok()
540                .flatten()
541                .filter(|v| !v.trim().is_empty())
542                .unwrap_or_else(|| d.to_string())
543        };
544        self.appraise_tier_weak = f("appraise.tier_weak", APPRAISE_TIER_WEAK).clamp(0.0, 1.0);
545        self.appraise_tier_strong = f("appraise.tier_strong", APPRAISE_TIER_STRONG).clamp(0.0, 1.0);
546        self.appraise_min_strength =
547            f("appraise.min_strength", APPRAISE_MIN_STRENGTH).clamp(0.0, 1.0);
548        self.appraise_top = i("appraise.top", APPRAISE_TOP as i64).max(1) as usize;
549        self.appraise_trigger_hit_min =
550            f("appraise.trigger_hit_min", APPRAISE_TRIGGER_HIT_MIN).clamp(0.0, 1.0);
551        self.appraise_candidate_in_embed =
552            b("appraise.candidate_in_embed", APPRAISE_CANDIDATE_IN_EMBED);
553        self.appraise_signature_floor =
554            f("appraise.signature_floor", APPRAISE_SIGNATURE_FLOOR).clamp(0.0, 1.0);
555        self.appraise_min_evidence = i("appraise.min_evidence", APPRAISE_MIN_EVIDENCE).max(0);
556        self.appraise_conflict_ceiling =
557            f("appraise.conflict_ceiling", APPRAISE_CONFLICT_CEILING).clamp(0.0, 1.0);
558        self.intuition_prior_m = f("intuition.prior_m", INTUITION_PRIOR_M).max(0.0);
559        self.intuition_base_rate = f("intuition.base_rate", INTUITION_BASE_RATE).clamp(0.0, 1.0);
560        self.calibration_bins = i("intuition.calibration_bins", CALIBRATION_BINS).clamp(2, 100);
561        self.situation_coarse_keys = s("situation.coarse_keys", SITUATION_COARSE_KEYS);
562        Ok(())
563    }
564}
565
566// ---------------------------------------------------------------------------
567// Helpers
568// ---------------------------------------------------------------------------
569
570struct CandidateInfo {
571    chunk: Value,
572    sim_content: f32,
573    sim_trigger: f32,
574    /// Lexical/BM25 channel score ∈ [0,1] (hybrid 检索). Zero when the chunk was
575    /// found only by vector search; positive when an exact-term match recovered it.
576    sim_lexical: f32,
577}
578
579/// True when a coarse signature carries at least one real value (not empty /
580/// `none` / `unknown`) — used to decide whether folding it into the embed query
581/// adds signal or just noise.
582fn signature_has_signal(sig: &str) -> bool {
583    sig.split('|').any(|p| {
584        p.split_once('=')
585            .map(|(_, v)| !v.is_empty() && v != "none" && v != "unknown")
586            .unwrap_or(false)
587    })
588}
589
590/// Fresh candidate from a chunk with all channel sims zeroed (callers set the
591/// channel(s) that surfaced it). Centralised so adding a channel touches one place.
592fn new_candidate(chunk: &Value) -> CandidateInfo {
593    CandidateInfo {
594        chunk: chunk.clone(),
595        sim_content: 0.0,
596        sim_trigger: 0.0,
597        sim_lexical: 0.0,
598    }
599}
600
601fn chunk_is_valid_for_recall(chunk: &Value, embed_version: i64) -> bool {
602    chunk.get("state").and_then(Value::as_str) != Some("archived")
603        && chunk.get("origin").and_then(Value::as_str) != Some("spark")
604        && chunk
605            .get("embed_version")
606            .and_then(Value::as_i64)
607            .unwrap_or(1)
608            >= embed_version
609}
610
611/// Normalize a query string before hashing into a context_key.
612///
613/// Goals: collapse whitespace variations and case differences so that
614/// semantically equivalent queries (same words, different capitalisation or
615/// spacing) accumulate statistics in the same context_stat bucket.
616///
617/// Deliberately conservative: no stemming, no stop-word removal. The canonical
618/// query guidance in SKILL.md handles vocabulary consistency at the agent level.
619fn normalize_query(query: &str) -> String {
620    const STOP_WORDS: &[&str] = &[
621        "a", "an", "and", "for", "in", "of", "on", "the", "to", "with",
622    ];
623    let cleaned: String = query
624        .to_lowercase()
625        .chars()
626        .map(|ch| {
627            if ch.is_alphanumeric() || ch.is_whitespace() {
628                ch
629            } else {
630                ' '
631            }
632        })
633        .collect();
634    let mut tokens: Vec<&str> = cleaned
635        .split_whitespace()
636        .filter(|token| !STOP_WORDS.contains(token))
637        .collect();
638    tokens.sort_unstable();
639    tokens.dedup();
640    tokens.join(" ")
641}
642
643fn estimate_distill_prompt_tokens(log: &Value, related_logs: &[Value]) -> i64 {
644    let primary: i64 = [
645        "query",
646        "recall_snapshot",
647        "output",
648        "output_summary",
649        "nomination",
650    ]
651    .iter()
652    .filter_map(|key| log.get(*key).and_then(Value::as_str))
653    .map(|text| estimate_tokens(text) as i64)
654    .sum();
655    let log_id = log.get("id").and_then(Value::as_str).unwrap_or("");
656    let context_key = log.get("context_key").and_then(Value::as_str);
657    let related: i64 = related_logs
658        .iter()
659        .filter(|other| other.get("id").and_then(Value::as_str).unwrap_or("") != log_id)
660        .filter(|other| {
661            context_key.is_some() && other.get("context_key").and_then(Value::as_str) == context_key
662        })
663        .take(4)
664        .flat_map(|other| {
665            ["query", "output_summary", "outcome"]
666                .into_iter()
667                .filter_map(|key| other.get(key).and_then(Value::as_str))
668        })
669        .map(|text| estimate_tokens(text) as i64)
670        .sum();
671    primary + related
672}
673
674fn estimate_distilled_chunk_tokens(chunk: &DistilledChunk) -> i64 {
675    estimate_tokens(&chunk.content) as i64
676        + chunk
677            .trigger_desc
678            .as_deref()
679            .map(estimate_tokens)
680            .unwrap_or(0) as i64
681        + chunk
682            .anti_trigger_desc
683            .as_deref()
684            .map(estimate_tokens)
685            .unwrap_or(0) as i64
686}
687
688fn anti_trigger_hit(query: &str, anti: &str) -> bool {
689    let q_lower = query.to_lowercase();
690    anti.to_lowercase().split(',').any(|part| {
691        let p = part.trim();
692        !p.is_empty() && q_lower.contains(p)
693    })
694}
695
696fn block_cost(block: &[Value]) -> usize {
697    block
698        .iter()
699        .map(|b| {
700            b.get("token_count")
701                .and_then(Value::as_u64)
702                .map(|t| t as usize)
703                .unwrap_or_else(|| {
704                    estimate_tokens(b.get("content").and_then(Value::as_str).unwrap_or("")).max(100)
705                })
706        })
707        .sum()
708}
709
710fn limit_knowledge(knowledge: Vec<Value>, top: Option<usize>) -> Vec<Value> {
711    match top {
712        None => knowledge,
713        Some(0) => vec![],
714        Some(n) => knowledge.into_iter().take(n).collect(),
715    }
716}
717
718fn usage_state(used: Option<&[String]>) -> &'static str {
719    match used {
720        None => "unknown",
721        Some([]) => "known_none",
722        Some(_) => "known_some",
723    }
724}
725
726fn ratio(numerator: i64, denominator: i64) -> f64 {
727    if denominator <= 0 {
728        0.0
729    } else {
730        ((numerator as f64 / denominator as f64) * 1000.0).round() / 1000.0
731    }
732}
733
734fn validate_source(source: &str) -> Result<()> {
735    if !matches!(
736        source,
737        "mcp" | "sdk" | "cli" | "hook" | "daemon" | "augmented"
738    ) {
739        return Err(InnateError::InvalidState(format!(
740            "invalid event source: {source}"
741        )));
742    }
743    Ok(())
744}
745
746fn count_query(storage: &Storage, sql: &str) -> Result<i64> {
747    Ok(storage
748        .query_chunks(sql)?
749        .first()
750        .and_then(|r| r.as_object())
751        .and_then(|m| m.values().next())
752        .and_then(Value::as_i64)
753        .unwrap_or(0))
754}
755
756fn count_query_params<P: rusqlite::Params>(storage: &Storage, sql: &str, p: P) -> Result<i64> {
757    Ok(storage
758        .query_chunks_params(sql, p)?
759        .first()
760        .and_then(|r| r.as_object())
761        .and_then(|m| m.values().next())
762        .and_then(Value::as_i64)
763        .unwrap_or(0))
764}
765
766fn days_ago(now_iso: &str, days: i64) -> String {
767    use chrono::{DateTime, Duration, Utc};
768    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
769        let cutoff = t - Duration::days(days);
770        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
771    }
772    now_iso.to_string()
773}
774
775fn minutes_ago(now_iso: &str, minutes: i64) -> String {
776    use chrono::{DateTime, Duration, Utc};
777    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
778        let cutoff = t - Duration::minutes(minutes);
779        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
780    }
781    now_iso.to_string()
782}
783
784fn hours_ago(now_iso: &str, hours: i64) -> String {
785    use chrono::{DateTime, Duration, Utc};
786    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
787        let cutoff = t - Duration::hours(hours);
788        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
789    }
790    now_iso.to_string()
791}
792
793fn minutes_after(now_iso: &str, minutes: i64) -> String {
794    use chrono::{DateTime, Duration, Utc};
795    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
796        let cutoff = t + Duration::minutes(minutes);
797        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
798    }
799    now_iso.to_string()
800}
801
802fn hours_after(now_iso: &str, hours: i64) -> String {
803    use chrono::{DateTime, Duration, Utc};
804    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
805        let cutoff = t + Duration::hours(hours);
806        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
807    }
808    now_iso.to_string()
809}
810
811/// Return the number of whole days between two ISO timestamps (now - past; clamped ≥ 0).
812fn iso_days_diff(now_iso: &str, past_iso: &str) -> i64 {
813    use chrono::{DateTime, Utc};
814    let parse = |s: &str| s.parse::<DateTime<Utc>>().ok();
815    if let (Some(a), Some(b)) = (parse(now_iso), parse(past_iso)) {
816        let diff = a - b;
817        diff.num_days().max(0)
818    } else {
819        0
820    }
821}
822
823/// Fractional days between two ISO timestamps (≥ 0). Finer than `iso_days_diff`
824/// so the activation recency term keeps sub-day resolution.
825fn iso_fractional_days(now_iso: &str, past_iso: &str) -> f64 {
826    use chrono::{DateTime, Utc};
827    let parse = |s: &str| s.parse::<DateTime<Utc>>().ok();
828    if let (Some(a), Some(b)) = (parse(now_iso), parse(past_iso)) {
829        ((a - b).num_seconds().max(0)) as f64 / 86_400.0
830    } else {
831        0.0
832    }
833}
834
835/// ACT-R decay exponent for the base-level activation recency term.
836const ACTR_DECAY: f64 = 0.5;
837
838/// ACT-R-inspired base-level activation, bounded to `(0, 1)`.
839///
840/// Fuses **frequency** (how often a chunk has been used) and **recency** (time
841/// since last use) into one re-ranking signal, following the standard ACT-R
842/// approximation `B = ln(n) − d·ln(t)` (Petrov 2006), here using
843/// `B = ln(1 + used_count) − d·ln(1 + recency_days)` and squashed with a
844/// logistic so it stays on the same `[0, 1]` scale as the other fused-score
845/// terms (content/trigger sim, confidence, context).
846///
847/// Returns `0.0` for never-used chunks (no usage history → no boost), which
848/// keeps recall **zero-regression** for freshly-added knowledge: a chunk with
849/// `used_count == 0` contributes nothing to the fused score.
850pub(super) fn actr_activation(used_count: i64, last_used_at: Option<&str>, now_iso: &str) -> f64 {
851    if used_count <= 0 {
852        return 0.0;
853    }
854    let Some(last) = last_used_at else {
855        return 0.0;
856    };
857    let recency_days = iso_fractional_days(now_iso, last);
858    let b = (1.0 + used_count as f64).ln() - ACTR_DECAY * (1.0 + recency_days).ln();
859    1.0 / (1.0 + (-b).exp())
860}
861
862/// DFS-based cycle detection on the hard-dep graph. Returns list of cycles (each is a Vec of ids).
863fn detect_cycles(deps: &[Value]) -> Vec<Vec<String>> {
864    use std::collections::HashMap;
865    let mut adj: HashMap<String, Vec<String>> = HashMap::new();
866    for d in deps {
867        let src = d
868            .get("src")
869            .and_then(Value::as_str)
870            .unwrap_or("")
871            .to_string();
872        let dst = d
873            .get("dst")
874            .and_then(Value::as_str)
875            .unwrap_or("")
876            .to_string();
877        if !src.is_empty() && !dst.is_empty() {
878            adj.entry(src).or_default().push(dst);
879        }
880    }
881    let nodes: Vec<String> = adj.keys().cloned().collect();
882    let mut visited: HashSet<String> = HashSet::new();
883    let mut on_stack: HashSet<String> = HashSet::new();
884    let mut cycles: Vec<Vec<String>> = vec![];
885
886    fn dfs(
887        node: &str,
888        adj: &HashMap<String, Vec<String>>,
889        visited: &mut HashSet<String>,
890        on_stack: &mut HashSet<String>,
891        path: &mut Vec<String>,
892        cycles: &mut Vec<Vec<String>>,
893    ) {
894        if on_stack.contains(node) {
895            // Found cycle — extract loop segment.
896            let start = path.iter().position(|n| n == node).unwrap_or(0);
897            cycles.push(path[start..].to_vec());
898            return;
899        }
900        if visited.contains(node) {
901            return;
902        }
903        visited.insert(node.to_string());
904        on_stack.insert(node.to_string());
905        path.push(node.to_string());
906        if let Some(children) = adj.get(node) {
907            for child in children {
908                dfs(child, adj, visited, on_stack, path, cycles);
909            }
910        }
911        path.pop();
912        on_stack.remove(node);
913    }
914
915    for node in nodes {
916        let mut path = vec![];
917        dfs(
918            &node,
919            &adj,
920            &mut visited,
921            &mut on_stack,
922            &mut path,
923            &mut cycles,
924        );
925    }
926    cycles
927}
innate_core/kb/mod.rs

innate_core/kb/
mod.rs