innate_core/kb/
mod.rs

1//! KnowledgeBase — all 8 Public APIs.
2
3/// Return type for pack(): (selected_chunks, skipped_groups, skip_reasons)
4type PackResult = (
5    Vec<Value>,
6    Vec<(Vec<Value>, f64, usize)>,
7    std::collections::HashMap<String, String>,
8);
9
10use std::collections::{HashMap, HashSet};
11use std::path::Path;
12use std::sync::Arc;
13
14use serde_json::{json, Value};
15
16use crate::embedding::{DummyEmbeddingProvider, EmbeddingProvider};
17use crate::errors::{InnateError, Result};
18use crate::refine::{
19    DefaultSanitizer, DistilledChunk, Distiller, HeuristicDistiller, NoopReranker, NullRefiner,
20    Refiner, Reranker, Sanitizer,
21};
22use crate::storage::{ChunkRow, EpisodicLogRow, Storage};
23use crate::utils::{
24    agent_source, content_hash, estimate_tokens, gen_uuid, pack_embedding, utc_now_iso,
25    SanitizeAction,
26};
27
28mod appraise;
29mod curate;
30mod evolve;
31mod inspection;
32mod lifecycle;
33mod recall;
34mod record;
35mod repair;
36mod situation;
37
38pub use appraise::{
39    AbstainReason, AppraiseParams, Contributor, FlaggedPoint, Tier, Valence, Verdict,
40    APPRAISE_ADVISORY,
41};
42pub use recall::RecallParams;
43pub use record::RecordParams;
44pub use repair::TraceRepairReport;
45pub use situation::Situation;
46
47// ---------------------------------------------------------------------------
48// Tuning defaults
49// ---------------------------------------------------------------------------
50
51// Fused recall score weights. These intentionally sum to 1.05 (not 1.0): the
52// score is a relative ranking signal, not a calibrated probability, so the extra
53// 0.05 of headroom on content similarity is deliberate and the result is never
54// re-normalised. Keep this in mind before "fixing" the sum.
55const W_CONTENT: f64 = 0.55;
56const W_TRIGGER: f64 = 0.25;
57const W_CONFIDENCE: f64 = 0.10;
58const W_CONTEXT: f64 = 0.15;
59const W_ACTIVATION: f64 = 0.08;
60// Hybrid 检索:lexical/BM25 channel weight. Modest by default so exact-term
61// matches lift the right chunk without overpowering semantic similarity.
62const W_LEXICAL: f64 = 0.25;
63const TOP_K_CANDIDATES: usize = 20;
64const ANTI_TRIGGER_PENALTY: f64 = 0.6;
65const DENSITY_REFILL: bool = true;
66
67const LOW_CONF_THRESHOLD: f64 = 0.25;
68const LOW_CONF_IDLE_DAYS: i64 = 60;
69const REPEAT_SELECT_MIN: i64 = 10;
70const REPEAT_SELECT_CONF_MAX: f64 = 0.5;
71const NEVER_USED_AGE_DAYS: i64 = 30;
72const OPEN_TTL_DAYS: i64 = 14;
73const SCREENING_TIMEOUT_MINUTES: i64 = 30;
74const PROMOTE_USED_SUCCESS_MIN: i64 = 3;
75const PROMOTE_CONFIDENCE_MIN: f64 = 0.60;
76const DECAY_FLOOR: f64 = 0.20;
77const EVOLVE_THRESHOLD: i64 = 5;
78const DISTILL_BATCH_SIZE: usize = 20;
79const PENDING_RECALL_PENALTY: f64 = 0.60;
80
81// Intuition / appraise critic defaults (Spec §8). The appraise path reuses the
82// same fused score as recall; these only govern how that score is tiered/flagged.
83const APPRAISE_TIER_WEAK: f64 = 0.30;
84const APPRAISE_TIER_STRONG: f64 = 0.65;
85const APPRAISE_MIN_STRENGTH: f64 = 0.40;
86const APPRAISE_TOP: usize = 8;
87const APPRAISE_TRIGGER_HIT_MIN: f64 = 0.50;
88const APPRAISE_CANDIDATE_IN_EMBED: bool = true;
89// 弃权门(方案 A/F/G)。默认值保持现行行为(门2/门3/门4 关闭),由 meta 调参激活。
90//   门2 signature_floor=0.0   → 关闭(任何一致度都放行)
91//   门3 min_evidence=0        → 关闭(不要求观测历史)
92//   门4 conflict_ceiling=1.0  → 关闭(离散度上界恒不触发)
93// 门1 弱共振无需阈值:prune 后候选为空即弃权(WeakResonance),天然作动。
94const APPRAISE_SIGNATURE_FLOOR: f64 = 0.0;
95const APPRAISE_MIN_EVIDENCE: i64 = 0;
96const APPRAISE_CONFLICT_CEILING: f64 = 1.0;
97// 方案 D 基率锚定先验:prior = Beta(m·g0, m·(1-g0))。默认 m=2, g0=0.5 与旧 Laplace
98// (wins+1)/(evidence+2) 完全等价 → 零行为变化,调大 m / 设真实基率即激活。
99// **仅作用于 appraise(直觉/校准)路径**:实施文档明确范围不含 recall。
100const INTUITION_PRIOR_M: f64 = 2.0;
101const INTUITION_BASE_RATE: f64 = 0.5;
102// recall(图书管理员)路径恒用中性 Laplace 先验(m=2, g0=0.5),与历史
103// (wins+1)/(evidence+2) 逐位等价,绝不受 intuition.* 校准旋钮影响。方案 D 与 recall 解耦。
104const RECALL_PRIOR_M: f64 = 2.0;
105const RECALL_BASE_RATE: f64 = 0.5;
106// 方案 E 校准映射桶数。
107const CALIBRATION_BINS: i64 = 10;
108const SITUATION_COARSE_KEYS: &str = "stage,error_class,file_type";
109// Part (c) — query-embedding granularity. When true, recall folds the normalized
110// situation signature (stage/error_class/file_type) into the embedded query text so
111// the embedding anchors on the situation, not just raw words. Default OFF: opt-in
112// and reversible (chunks are embedded from content/trigger, so enabling it shifts
113// only the query side — measure with `innate recall-eval` before turning on).
114const EMBED_SITUATION_SIGNATURE: bool = false;
115const GOVERNANCE_ARCHIVE_THRESHOLD: i64 = 3;
116const NEGATIVE_FEEDBACK_ARCHIVE_THRESHOLD: i64 = 5;
117const GOVERNANCE_EVOLVE_THRESHOLD: i64 = 3;
118const FAILURE_MIN_USES: i64 = 5;
119const FAILURE_MAX_SUCCESS_RATE: f64 = 0.20;
120const FAILURE_CONFIDENCE_MAX: f64 = 0.35;
121const LOG_COMPACT_DAYS: i64 = 30;
122
123// ---------------------------------------------------------------------------
124// Public result types
125// ---------------------------------------------------------------------------
126
127#[derive(Debug, Default, Clone)]
128pub struct RecallResult {
129    pub knowledge: Vec<Value>,
130    pub sparks: Vec<Value>,
131    pub trace_id: String,
132    pub empty: bool,
133    pub depth_skipped: Vec<String>,
134    pub skipped_reasons: HashMap<String, String>,
135}
136
137#[derive(Debug, Default)]
138pub struct CurateReport {
139    pub archived: Vec<String>,
140    pub deduped: Vec<String>,
141    pub decayed: Vec<String>,
142    pub cycles: Vec<Vec<String>>,
143    pub orphans: Vec<String>,
144    pub recovered: Vec<String>,
145    pub warnings: Vec<String>,
146    pub stats: HashMap<String, Value>,
147}
148
149#[derive(Debug, Default)]
150struct DistillBatchReport {
151    distilled: usize,
152    failed: usize,
153}
154
155/// Scope for a single Curate run — allows limiting governance to a subset of chunks.
156#[derive(Debug, Default, Clone)]
157pub struct CurateScope {
158    /// If set, only process chunks with this origin (e.g. "distilled").
159    pub origin: Option<String>,
160    /// If set, only process chunks belonging to this skill.
161    pub skill_name: Option<String>,
162    /// When true, compute the report but do not write any changes.
163    pub dry_run: bool,
164}
165
166/// Replaceable governance interface (§二·六). Inject via `KnowledgeBase::open_with`.
167/// Default implementation: `BuiltinCurator`.
168pub trait Curator: Send + Sync {
169    fn run(&self, kb: &KnowledgeBase, scope: &CurateScope) -> Result<CurateReport>;
170}
171
172/// Built-in curator — implements the full §四 governance pipeline.
173pub struct BuiltinCurator;
174
175impl Curator for BuiltinCurator {
176    fn run(&self, kb: &KnowledgeBase, scope: &CurateScope) -> Result<CurateReport> {
177        kb.builtin_curate_impl(scope)
178    }
179}
180
181// ---------------------------------------------------------------------------
182// KnowledgeBase
183// ---------------------------------------------------------------------------
184
185pub struct KnowledgeBase {
186    pub storage: Storage,
187    embedding: Arc<dyn EmbeddingProvider>,
188    refiner: Arc<dyn Refiner>,
189    distiller: Arc<dyn Distiller>,
190    curator: Arc<dyn Curator>,
191    sanitizer: Arc<dyn Sanitizer>,
192    /// Opt-in offline reranker (part d). Defaults to `NoopReranker` (fused order
193    /// preserved); set via `with_reranker` when an LLM is configured.
194    reranker: Arc<dyn Reranker>,
195
196    // Tuning params (loaded from meta at init)
197    w_content: f64,
198    w_trigger: f64,
199    w_confidence: f64,
200    w_context: f64,
201    w_activation: f64,
202    w_lexical: f64,
203    top_k_candidates: usize,
204    anti_trigger_penalty: f64,
205    density_refill: bool,
206
207    low_conf_threshold: f64,
208    low_conf_idle_days: i64,
209    repeat_select_min: i64,
210    repeat_select_conf_max: f64,
211    never_used_age_days: i64,
212    open_ttl_days: i64,
213    screening_timeout_minutes: i64,
214    promote_used_success_min: i64,
215    promote_confidence_min: f64,
216    decay_floor: f64,
217    evolve_threshold: i64,
218    distill_batch_size: usize,
219    evolve_schedule_interval_hours: i64,
220    governance_archive_threshold: i64,
221    negative_feedback_archive_threshold: i64,
222    governance_evolve_threshold: i64,
223    governance_proposal_max_age_days: i64,
224    failure_min_uses: i64,
225    failure_max_success_rate: f64,
226    failure_confidence_max: f64,
227    log_compact_days: i64,
228
229    // Intuition / appraise critic params
230    appraise_tier_weak: f64,
231    appraise_tier_strong: f64,
232    appraise_min_strength: f64,
233    appraise_top: usize,
234    appraise_trigger_hit_min: f64,
235    appraise_candidate_in_embed: bool,
236    appraise_signature_floor: f64,
237    appraise_min_evidence: i64,
238    appraise_conflict_ceiling: f64,
239    intuition_prior_m: f64,
240    intuition_base_rate: f64,
241    calibration_bins: i64,
242    situation_coarse_keys: String,
243    embed_situation_signature: bool,
244}
245
246impl KnowledgeBase {
247    pub fn open(db_path: impl AsRef<Path>) -> Result<Self> {
248        Self::open_with(db_path, None, None, None, None, None)
249    }
250
251    /// Persist a content embedding, rejecting any vector whose dimension differs
252    /// from the configured provider. A mismatched vector is silently skipped at
253    /// search time (cosine search only scores equal-dimension vectors), so an
254    /// unchecked write becomes invisible recall loss with no error. Fail closed
255    /// at the write boundary instead. All vector writers route through here.
256    pub(crate) fn store_vec_content(&self, chunk_id: &str, cvec: &[f32]) -> Result<()> {
257        let want = self.embedding.content_dim();
258        if cvec.len() != want {
259            return Err(InnateError::InvalidState(format!(
260                "content embedding dim {} != configured {want} (chunk {chunk_id})",
261                cvec.len()
262            )));
263        }
264        self.storage
265            .insert_vec_content(chunk_id, &pack_embedding(cvec))
266    }
267
268    /// Trigger-vector counterpart of [`store_vec_content`]; same fail-closed
269    /// dimension guard against `trigger_dim()`.
270    pub(crate) fn store_vec_trigger(&self, chunk_id: &str, tvec: &[f32]) -> Result<()> {
271        let want = self.embedding.trigger_dim();
272        if tvec.len() != want {
273            return Err(InnateError::InvalidState(format!(
274                "trigger embedding dim {} != configured {want} (chunk {chunk_id})",
275                tvec.len()
276            )));
277        }
278        self.storage
279            .insert_vec_trigger(chunk_id, &pack_embedding(tvec))
280    }
281
282    pub fn open_with(
283        db_path: impl AsRef<Path>,
284        embedding: Option<Arc<dyn EmbeddingProvider>>,
285        refiner: Option<Arc<dyn Refiner>>,
286        distiller: Option<Arc<dyn Distiller>>,
287        curator: Option<Arc<dyn Curator>>,
288        sanitizer: Option<Arc<dyn Sanitizer>>,
289    ) -> Result<Self> {
290        let embedding = embedding.unwrap_or_else(|| Arc::new(DummyEmbeddingProvider::default()));
291        let refiner = refiner.unwrap_or_else(|| Arc::new(NullRefiner));
292        let distiller = distiller.unwrap_or_else(|| Arc::new(HeuristicDistiller));
293        let curator = curator.unwrap_or_else(|| Arc::new(BuiltinCurator));
294        let sanitizer = sanitizer.unwrap_or_else(|| Arc::new(DefaultSanitizer));
295        let reranker: Arc<dyn Reranker> = Arc::new(NoopReranker);
296
297        let storage = Storage::open(db_path, embedding.content_dim(), embedding.trigger_dim())?;
298
299        let mut kb = Self {
300            storage,
301            embedding,
302            refiner,
303            distiller,
304            curator,
305            sanitizer,
306            reranker,
307            w_lexical: W_LEXICAL,
308            embed_situation_signature: EMBED_SITUATION_SIGNATURE,
309            w_content: W_CONTENT,
310            w_trigger: W_TRIGGER,
311            w_confidence: W_CONFIDENCE,
312            w_context: W_CONTEXT,
313            w_activation: W_ACTIVATION,
314            top_k_candidates: TOP_K_CANDIDATES,
315            anti_trigger_penalty: ANTI_TRIGGER_PENALTY,
316            density_refill: DENSITY_REFILL,
317            low_conf_threshold: LOW_CONF_THRESHOLD,
318            low_conf_idle_days: LOW_CONF_IDLE_DAYS,
319            repeat_select_min: REPEAT_SELECT_MIN,
320            repeat_select_conf_max: REPEAT_SELECT_CONF_MAX,
321            never_used_age_days: NEVER_USED_AGE_DAYS,
322            open_ttl_days: OPEN_TTL_DAYS,
323            screening_timeout_minutes: SCREENING_TIMEOUT_MINUTES,
324            promote_used_success_min: PROMOTE_USED_SUCCESS_MIN,
325            promote_confidence_min: PROMOTE_CONFIDENCE_MIN,
326            decay_floor: DECAY_FLOOR,
327            evolve_threshold: EVOLVE_THRESHOLD,
328            distill_batch_size: DISTILL_BATCH_SIZE,
329            evolve_schedule_interval_hours: 6,
330            governance_archive_threshold: GOVERNANCE_ARCHIVE_THRESHOLD,
331            negative_feedback_archive_threshold: NEGATIVE_FEEDBACK_ARCHIVE_THRESHOLD,
332            governance_evolve_threshold: GOVERNANCE_EVOLVE_THRESHOLD,
333            governance_proposal_max_age_days: 30,
334            failure_min_uses: FAILURE_MIN_USES,
335            failure_max_success_rate: FAILURE_MAX_SUCCESS_RATE,
336            failure_confidence_max: FAILURE_CONFIDENCE_MAX,
337            log_compact_days: LOG_COMPACT_DAYS,
338            appraise_tier_weak: APPRAISE_TIER_WEAK,
339            appraise_tier_strong: APPRAISE_TIER_STRONG,
340            appraise_min_strength: APPRAISE_MIN_STRENGTH,
341            appraise_top: APPRAISE_TOP,
342            appraise_trigger_hit_min: APPRAISE_TRIGGER_HIT_MIN,
343            appraise_candidate_in_embed: APPRAISE_CANDIDATE_IN_EMBED,
344            appraise_signature_floor: APPRAISE_SIGNATURE_FLOOR,
345            appraise_min_evidence: APPRAISE_MIN_EVIDENCE,
346            appraise_conflict_ceiling: APPRAISE_CONFLICT_CEILING,
347            intuition_prior_m: INTUITION_PRIOR_M,
348            intuition_base_rate: INTUITION_BASE_RATE,
349            calibration_bins: CALIBRATION_BINS,
350            situation_coarse_keys: SITUATION_COARSE_KEYS.to_string(),
351        };
352        kb.init_meta()?;
353        kb.load_params()?;
354        Ok(kb)
355    }
356
357    /// Install an opt-in offline reranker (part d). Used by `open_kb` when an LLM is
358    /// configured; recall only invokes it when a caller passes `rerank=true`, so the
359    /// default hook path stays no-LLM regardless.
360    pub fn with_reranker(mut self, reranker: Arc<dyn Reranker>) -> Self {
361        self.reranker = reranker;
362        self
363    }
364
365    fn init_meta(&self) -> Result<()> {
366        let lib_id = gen_uuid();
367        let content_dim = self.embedding.content_dim().to_string();
368        let trigger_dim = self.embedding.trigger_dim().to_string();
369        let embed_model = self.embedding.model_name();
370
371        for (key, expected) in [
372            ("content_dim", self.embedding.content_dim()),
373            ("trigger_dim", self.embedding.trigger_dim()),
374        ] {
375            if let Some(stored) = self.storage.get_meta(key)? {
376                let actual = stored.parse::<usize>().map_err(|_| {
377                    InnateError::Other(format!("invalid {key} metadata value: {stored}"))
378                })?;
379                if actual != expected {
380                    return Err(InnateError::Other(format!(
381                        "{key} mismatch: database uses {actual}, embedding provider uses {expected}"
382                    )));
383                }
384            }
385        }
386
387        let defaults: &[(&str, &str)] = &[
388            ("lib_id", &lib_id),
389            ("lib_role", "personal"),
390            ("schema_version", "4.14"),
391            ("content_dim", &content_dim),
392            ("trigger_dim", &trigger_dim),
393            ("embed_model", embed_model),
394            ("embed_version", "1"),
395            ("vector_revision", "0"),
396            ("last_agg_ts", "1970-01-01T00:00:00.000Z"),
397            ("recall.w_content", "0.55"),
398            ("recall.w_trigger", "0.25"),
399            ("recall.w_confidence", "0.10"),
400            ("recall.w_context", "0.15"),
401            ("recall.w_activation", "0.08"),
402            ("recall.w_lexical", "0.25"),
403            ("recall.embed_situation_signature", "false"),
404            ("recall.top_k_candidates", "20"),
405            ("recall.anti_trigger_penalty", "0.6"),
406            ("recall.density_refill", "true"),
407            ("curate.low_conf_threshold", "0.25"),
408            ("curate.low_conf_idle_days", "60"),
409            ("curate.repeat_select_min", "10"),
410            ("curate.repeat_select_conf_max", "0.5"),
411            ("curate.never_used_age_days", "30"),
412            ("curate.open_ttl_days", "14"),
413            ("curate.screening_timeout_minutes", "30"),
414            ("curate.promote_used_success_min", "3"),
415            ("curate.promote_confidence_min", "0.60"),
416            ("curate.decay_floor", "0.20"),
417            ("evolve.threshold_new_count", "5"),
418            ("evolve.distill_batch_size", "20"),
419            ("evolve.schedule_interval_hours", "6"),
420            ("curate.soft_mature_threshold", "5"),
421            ("evolve.distill_token_window_hours", "24"),
422            ("curate.governance_archive_threshold", "3"),
423            ("curate.negative_feedback_archive_threshold", "5"),
424            ("evolve.governance_pending_threshold", "3"),
425            ("curate.governance_proposal_max_age_days", "30"),
426            ("curate.failure_min_uses", "5"),
427            ("curate.failure_max_success_rate", "0.20"),
428            ("curate.failure_confidence_max", "0.35"),
429            ("curate.log_compact_days", "30"),
430            ("appraise.tier_weak", "0.30"),
431            ("appraise.tier_strong", "0.65"),
432            ("appraise.min_strength", "0.40"),
433            ("appraise.top", "8"),
434            ("appraise.trigger_hit_min", "0.50"),
435            ("appraise.candidate_in_embed", "true"),
436            ("appraise.signature_floor", "0.0"),
437            ("appraise.min_evidence", "0"),
438            ("appraise.conflict_ceiling", "1.0"),
439            ("intuition.prior_m", "2.0"),
440            ("intuition.base_rate", "0.5"),
441            ("intuition.calibration_bins", "10"),
442            ("situation.coarse_keys", "stage,error_class,file_type"),
443        ];
444        self.storage.begin_immediate()?;
445        let result = (|| -> Result<()> {
446            for (k, v) in defaults {
447                if self.storage.get_meta(k)?.is_none() {
448                    self.storage.set_meta(k, v)?;
449                }
450            }
451            self.storage.commit()
452        })();
453        if result.is_err() {
454            let _ = self.storage.rollback();
455        }
456        result
457    }
458
459    fn load_params(&mut self) -> Result<()> {
460        let f = |k: &str, d: f64| -> f64 {
461            self.storage
462                .get_meta(k)
463                .ok()
464                .flatten()
465                .and_then(|v| v.parse().ok())
466                .unwrap_or(d)
467        };
468        let i = |k: &str, d: i64| -> i64 {
469            self.storage
470                .get_meta(k)
471                .ok()
472                .flatten()
473                .and_then(|v| v.parse().ok())
474                .unwrap_or(d)
475        };
476        let b = |k: &str, d: bool| -> bool {
477            self.storage
478                .get_meta(k)
479                .ok()
480                .flatten()
481                .map(|v| v.to_lowercase() == "true")
482                .unwrap_or(d)
483        };
484        self.w_content = f("recall.w_content", W_CONTENT);
485        self.w_trigger = f("recall.w_trigger", W_TRIGGER);
486        self.w_confidence = f("recall.w_confidence", W_CONFIDENCE);
487        self.w_context = f("recall.w_context", W_CONTEXT);
488        self.w_lexical = f("recall.w_lexical", W_LEXICAL);
489        self.embed_situation_signature =
490            b("recall.embed_situation_signature", EMBED_SITUATION_SIGNATURE);
491        self.w_activation = f("recall.w_activation", W_ACTIVATION);
492        self.top_k_candidates =
493            i("recall.top_k_candidates", TOP_K_CANDIDATES as i64).max(1) as usize;
494        self.anti_trigger_penalty = f("recall.anti_trigger_penalty", ANTI_TRIGGER_PENALTY);
495        self.density_refill = b("recall.density_refill", DENSITY_REFILL);
496        self.low_conf_threshold = f("curate.low_conf_threshold", LOW_CONF_THRESHOLD);
497        self.low_conf_idle_days = i("curate.low_conf_idle_days", LOW_CONF_IDLE_DAYS);
498        self.repeat_select_min = i("curate.repeat_select_min", REPEAT_SELECT_MIN);
499        self.repeat_select_conf_max = f("curate.repeat_select_conf_max", REPEAT_SELECT_CONF_MAX);
500        self.never_used_age_days = i("curate.never_used_age_days", NEVER_USED_AGE_DAYS);
501        self.open_ttl_days = i("curate.open_ttl_days", OPEN_TTL_DAYS);
502        self.screening_timeout_minutes = i(
503            "curate.screening_timeout_minutes",
504            SCREENING_TIMEOUT_MINUTES,
505        );
506        self.promote_used_success_min =
507            i("curate.promote_used_success_min", PROMOTE_USED_SUCCESS_MIN);
508        self.promote_confidence_min = f("curate.promote_confidence_min", PROMOTE_CONFIDENCE_MIN);
509        self.decay_floor = f("curate.decay_floor", DECAY_FLOOR).clamp(0.0, 0.4);
510        self.evolve_threshold = i("evolve.threshold_new_count", EVOLVE_THRESHOLD);
511        self.distill_batch_size =
512            i("evolve.distill_batch_size", DISTILL_BATCH_SIZE as i64) as usize;
513        self.evolve_schedule_interval_hours = i("evolve.schedule_interval_hours", 6).max(1);
514        self.governance_archive_threshold = i(
515            "curate.governance_archive_threshold",
516            GOVERNANCE_ARCHIVE_THRESHOLD,
517        )
518        .max(1);
519        self.negative_feedback_archive_threshold = i(
520            "curate.negative_feedback_archive_threshold",
521            NEGATIVE_FEEDBACK_ARCHIVE_THRESHOLD,
522        )
523        .max(1);
524        self.governance_evolve_threshold = i(
525            "evolve.governance_pending_threshold",
526            GOVERNANCE_EVOLVE_THRESHOLD,
527        )
528        .max(1);
529        self.governance_proposal_max_age_days =
530            i("curate.governance_proposal_max_age_days", 30).max(1);
531        self.failure_min_uses = i("curate.failure_min_uses", FAILURE_MIN_USES).max(1);
532        self.failure_max_success_rate =
533            f("curate.failure_max_success_rate", FAILURE_MAX_SUCCESS_RATE).clamp(0.0, 1.0);
534        self.failure_confidence_max =
535            f("curate.failure_confidence_max", FAILURE_CONFIDENCE_MAX).clamp(0.0, 1.0);
536        self.log_compact_days = i("curate.log_compact_days", LOG_COMPACT_DAYS).max(1);
537        let s = |k: &str, d: &str| -> String {
538            self.storage
539                .get_meta(k)
540                .ok()
541                .flatten()
542                .filter(|v| !v.trim().is_empty())
543                .unwrap_or_else(|| d.to_string())
544        };
545        self.appraise_tier_weak = f("appraise.tier_weak", APPRAISE_TIER_WEAK).clamp(0.0, 1.0);
546        self.appraise_tier_strong = f("appraise.tier_strong", APPRAISE_TIER_STRONG).clamp(0.0, 1.0);
547        self.appraise_min_strength =
548            f("appraise.min_strength", APPRAISE_MIN_STRENGTH).clamp(0.0, 1.0);
549        self.appraise_top = i("appraise.top", APPRAISE_TOP as i64).max(1) as usize;
550        self.appraise_trigger_hit_min =
551            f("appraise.trigger_hit_min", APPRAISE_TRIGGER_HIT_MIN).clamp(0.0, 1.0);
552        self.appraise_candidate_in_embed =
553            b("appraise.candidate_in_embed", APPRAISE_CANDIDATE_IN_EMBED);
554        self.appraise_signature_floor =
555            f("appraise.signature_floor", APPRAISE_SIGNATURE_FLOOR).clamp(0.0, 1.0);
556        self.appraise_min_evidence = i("appraise.min_evidence", APPRAISE_MIN_EVIDENCE).max(0);
557        self.appraise_conflict_ceiling =
558            f("appraise.conflict_ceiling", APPRAISE_CONFLICT_CEILING).clamp(0.0, 1.0);
559        self.intuition_prior_m = f("intuition.prior_m", INTUITION_PRIOR_M).max(0.0);
560        self.intuition_base_rate = f("intuition.base_rate", INTUITION_BASE_RATE).clamp(0.0, 1.0);
561        self.calibration_bins = i("intuition.calibration_bins", CALIBRATION_BINS).clamp(2, 100);
562        self.situation_coarse_keys = s("situation.coarse_keys", SITUATION_COARSE_KEYS);
563        Ok(())
564    }
565}
566
567// ---------------------------------------------------------------------------
568// Helpers
569// ---------------------------------------------------------------------------
570
571struct CandidateInfo {
572    chunk: Value,
573    sim_content: f32,
574    sim_trigger: f32,
575    /// Lexical/BM25 channel score ∈ [0,1] (hybrid 检索). Zero when the chunk was
576    /// found only by vector search; positive when an exact-term match recovered it.
577    sim_lexical: f32,
578}
579
580/// True when a coarse signature carries at least one real value (not empty /
581/// `none` / `unknown`) — used to decide whether folding it into the embed query
582/// adds signal or just noise.
583fn signature_has_signal(sig: &str) -> bool {
584    sig.split('|').any(|p| {
585        p.split_once('=')
586            .map(|(_, v)| !v.is_empty() && v != "none" && v != "unknown")
587            .unwrap_or(false)
588    })
589}
590
591/// Fresh candidate from a chunk with all channel sims zeroed (callers set the
592/// channel(s) that surfaced it). Centralised so adding a channel touches one place.
593fn new_candidate(chunk: &Value) -> CandidateInfo {
594    CandidateInfo {
595        chunk: chunk.clone(),
596        sim_content: 0.0,
597        sim_trigger: 0.0,
598        sim_lexical: 0.0,
599    }
600}
601
602fn chunk_is_valid_for_recall(chunk: &Value, embed_version: i64) -> bool {
603    chunk.get("state").and_then(Value::as_str) != Some("archived")
604        && chunk.get("origin").and_then(Value::as_str) != Some("spark")
605        && chunk
606            .get("embed_version")
607            .and_then(Value::as_i64)
608            .unwrap_or(1)
609            >= embed_version
610}
611
612/// Normalize a query string before hashing into a context_key.
613///
614/// Goals: collapse whitespace variations and case differences so that
615/// semantically equivalent queries (same words, different capitalisation or
616/// spacing) accumulate statistics in the same context_stat bucket.
617///
618/// Deliberately conservative: no stemming, no stop-word removal. The canonical
619/// query guidance in SKILL.md handles vocabulary consistency at the agent level.
620fn normalize_query(query: &str) -> String {
621    const STOP_WORDS: &[&str] = &[
622        "a", "an", "and", "for", "in", "of", "on", "the", "to", "with",
623    ];
624    let cleaned: String = query
625        .to_lowercase()
626        .chars()
627        .map(|ch| {
628            if ch.is_alphanumeric() || ch.is_whitespace() {
629                ch
630            } else {
631                ' '
632            }
633        })
634        .collect();
635    let mut tokens: Vec<&str> = cleaned
636        .split_whitespace()
637        .filter(|token| !STOP_WORDS.contains(token))
638        .collect();
639    tokens.sort_unstable();
640    tokens.dedup();
641    tokens.join(" ")
642}
643
644fn estimate_distill_prompt_tokens(log: &Value, related_logs: &[Value]) -> i64 {
645    let primary: i64 = [
646        "query",
647        "recall_snapshot",
648        "output",
649        "output_summary",
650        "nomination",
651    ]
652    .iter()
653    .filter_map(|key| log.get(*key).and_then(Value::as_str))
654    .map(|text| estimate_tokens(text) as i64)
655    .sum();
656    let log_id = log.get("id").and_then(Value::as_str).unwrap_or("");
657    let context_key = log.get("context_key").and_then(Value::as_str);
658    let related: i64 = related_logs
659        .iter()
660        .filter(|other| other.get("id").and_then(Value::as_str).unwrap_or("") != log_id)
661        .filter(|other| {
662            context_key.is_some() && other.get("context_key").and_then(Value::as_str) == context_key
663        })
664        .take(4)
665        .flat_map(|other| {
666            ["query", "output_summary", "outcome"]
667                .into_iter()
668                .filter_map(|key| other.get(key).and_then(Value::as_str))
669        })
670        .map(|text| estimate_tokens(text) as i64)
671        .sum();
672    primary + related
673}
674
675fn estimate_distilled_chunk_tokens(chunk: &DistilledChunk) -> i64 {
676    estimate_tokens(&chunk.content) as i64
677        + chunk
678            .trigger_desc
679            .as_deref()
680            .map(estimate_tokens)
681            .unwrap_or(0) as i64
682        + chunk
683            .anti_trigger_desc
684            .as_deref()
685            .map(estimate_tokens)
686            .unwrap_or(0) as i64
687}
688
689fn anti_trigger_hit(query: &str, anti: &str) -> bool {
690    let q_lower = query.to_lowercase();
691    anti.to_lowercase().split(',').any(|part| {
692        let p = part.trim();
693        !p.is_empty() && q_lower.contains(p)
694    })
695}
696
697fn block_cost(block: &[Value]) -> usize {
698    block
699        .iter()
700        .map(|b| {
701            b.get("token_count")
702                .and_then(Value::as_u64)
703                .map(|t| t as usize)
704                .unwrap_or_else(|| {
705                    estimate_tokens(b.get("content").and_then(Value::as_str).unwrap_or("")).max(100)
706                })
707        })
708        .sum()
709}
710
711fn limit_knowledge(knowledge: Vec<Value>, top: Option<usize>) -> Vec<Value> {
712    match top {
713        None => knowledge,
714        Some(0) => vec![],
715        Some(n) => knowledge.into_iter().take(n).collect(),
716    }
717}
718
719fn usage_state(used: Option<&[String]>) -> &'static str {
720    match used {
721        None => "unknown",
722        Some([]) => "known_none",
723        Some(_) => "known_some",
724    }
725}
726
727fn ratio(numerator: i64, denominator: i64) -> f64 {
728    if denominator <= 0 {
729        0.0
730    } else {
731        ((numerator as f64 / denominator as f64) * 1000.0).round() / 1000.0
732    }
733}
734
735fn validate_source(source: &str) -> Result<()> {
736    if !matches!(
737        source,
738        "mcp" | "sdk" | "cli" | "hook" | "daemon" | "augmented"
739    ) {
740        return Err(InnateError::InvalidState(format!(
741            "invalid event source: {source}"
742        )));
743    }
744    Ok(())
745}
746
747fn count_query(storage: &Storage, sql: &str) -> Result<i64> {
748    Ok(storage
749        .query_chunks(sql)?
750        .first()
751        .and_then(|r| r.as_object())
752        .and_then(|m| m.values().next())
753        .and_then(Value::as_i64)
754        .unwrap_or(0))
755}
756
757fn count_query_params<P: rusqlite::Params>(storage: &Storage, sql: &str, p: P) -> Result<i64> {
758    Ok(storage
759        .query_chunks_params(sql, p)?
760        .first()
761        .and_then(|r| r.as_object())
762        .and_then(|m| m.values().next())
763        .and_then(Value::as_i64)
764        .unwrap_or(0))
765}
766
767fn days_ago(now_iso: &str, days: i64) -> String {
768    use chrono::{DateTime, Duration, Utc};
769    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
770        let cutoff = t - Duration::days(days);
771        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
772    }
773    now_iso.to_string()
774}
775
776fn minutes_ago(now_iso: &str, minutes: i64) -> String {
777    use chrono::{DateTime, Duration, Utc};
778    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
779        let cutoff = t - Duration::minutes(minutes);
780        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
781    }
782    now_iso.to_string()
783}
784
785fn hours_ago(now_iso: &str, hours: i64) -> String {
786    use chrono::{DateTime, Duration, Utc};
787    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
788        let cutoff = t - Duration::hours(hours);
789        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
790    }
791    now_iso.to_string()
792}
793
794fn minutes_after(now_iso: &str, minutes: i64) -> String {
795    use chrono::{DateTime, Duration, Utc};
796    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
797        let cutoff = t + Duration::minutes(minutes);
798        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
799    }
800    now_iso.to_string()
801}
802
803fn hours_after(now_iso: &str, hours: i64) -> String {
804    use chrono::{DateTime, Duration, Utc};
805    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
806        let cutoff = t + Duration::hours(hours);
807        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
808    }
809    now_iso.to_string()
810}
811
812/// Return the number of whole days between two ISO timestamps (now - past; clamped ≥ 0).
813fn iso_days_diff(now_iso: &str, past_iso: &str) -> i64 {
814    use chrono::{DateTime, Utc};
815    let parse = |s: &str| s.parse::<DateTime<Utc>>().ok();
816    if let (Some(a), Some(b)) = (parse(now_iso), parse(past_iso)) {
817        let diff = a - b;
818        diff.num_days().max(0)
819    } else {
820        0
821    }
822}
823
824/// Fractional days between two ISO timestamps (≥ 0). Finer than `iso_days_diff`
825/// so the activation recency term keeps sub-day resolution.
826fn iso_fractional_days(now_iso: &str, past_iso: &str) -> f64 {
827    use chrono::{DateTime, Utc};
828    let parse = |s: &str| s.parse::<DateTime<Utc>>().ok();
829    if let (Some(a), Some(b)) = (parse(now_iso), parse(past_iso)) {
830        ((a - b).num_seconds().max(0)) as f64 / 86_400.0
831    } else {
832        0.0
833    }
834}
835
836/// ACT-R decay exponent for the base-level activation recency term.
837const ACTR_DECAY: f64 = 0.5;
838
839/// ACT-R-inspired base-level activation, bounded to `(0, 1)`.
840///
841/// Fuses **frequency** (how often a chunk has been used) and **recency** (time
842/// since last use) into one re-ranking signal, following the standard ACT-R
843/// approximation `B = ln(n) − d·ln(t)` (Petrov 2006), here using
844/// `B = ln(1 + used_count) − d·ln(1 + recency_days)` and squashed with a
845/// logistic so it stays on the same `[0, 1]` scale as the other fused-score
846/// terms (content/trigger sim, confidence, context).
847///
848/// Returns `0.0` for never-used chunks (no usage history → no boost), which
849/// keeps recall **zero-regression** for freshly-added knowledge: a chunk with
850/// `used_count == 0` contributes nothing to the fused score.
851pub(super) fn actr_activation(used_count: i64, last_used_at: Option<&str>, now_iso: &str) -> f64 {
852    if used_count <= 0 {
853        return 0.0;
854    }
855    let Some(last) = last_used_at else {
856        return 0.0;
857    };
858    let recency_days = iso_fractional_days(now_iso, last);
859    let b = (1.0 + used_count as f64).ln() - ACTR_DECAY * (1.0 + recency_days).ln();
860    1.0 / (1.0 + (-b).exp())
861}
862
863/// DFS-based cycle detection on the hard-dep graph. Returns list of cycles (each is a Vec of ids).
864fn detect_cycles(deps: &[Value]) -> Vec<Vec<String>> {
865    use std::collections::HashMap;
866    let mut adj: HashMap<String, Vec<String>> = HashMap::new();
867    for d in deps {
868        let src = d
869            .get("src")
870            .and_then(Value::as_str)
871            .unwrap_or("")
872            .to_string();
873        let dst = d
874            .get("dst")
875            .and_then(Value::as_str)
876            .unwrap_or("")
877            .to_string();
878        if !src.is_empty() && !dst.is_empty() {
879            adj.entry(src).or_default().push(dst);
880        }
881    }
882    let nodes: Vec<String> = adj.keys().cloned().collect();
883    let mut visited: HashSet<String> = HashSet::new();
884    let mut on_stack: HashSet<String> = HashSet::new();
885    let mut cycles: Vec<Vec<String>> = vec![];
886
887    fn dfs(
888        node: &str,
889        adj: &HashMap<String, Vec<String>>,
890        visited: &mut HashSet<String>,
891        on_stack: &mut HashSet<String>,
892        path: &mut Vec<String>,
893        cycles: &mut Vec<Vec<String>>,
894    ) {
895        if on_stack.contains(node) {
896            // Found cycle — extract loop segment.
897            let start = path.iter().position(|n| n == node).unwrap_or(0);
898            cycles.push(path[start..].to_vec());
899            return;
900        }
901        if visited.contains(node) {
902            return;
903        }
904        visited.insert(node.to_string());
905        on_stack.insert(node.to_string());
906        path.push(node.to_string());
907        if let Some(children) = adj.get(node) {
908            for child in children {
909                dfs(child, adj, visited, on_stack, path, cycles);
910            }
911        }
912        path.pop();
913        on_stack.remove(node);
914    }
915
916    for node in nodes {
917        let mut path = vec![];
918        dfs(
919            &node,
920            &adj,
921            &mut visited,
922            &mut on_stack,
923            &mut path,
924            &mut cycles,
925        );
926    }
927    cycles
928}
innate_core/kb/mod.rs

innate_core/kb/
mod.rs