innate_core/kb/
mod.rs

1//! KnowledgeBase — all 8 Public APIs.
2
3/// Return type for pack(): (selected_chunks, skipped_groups, skip_reasons)
4type PackResult = (
5    Vec<Value>,
6    Vec<(Vec<Value>, f64, usize)>,
7    std::collections::HashMap<String, String>,
8);
9
10use std::collections::{HashMap, HashSet};
11use std::path::Path;
12use std::sync::Arc;
13
14use serde_json::{json, Value};
15
16use crate::embedding::{DummyEmbeddingProvider, EmbeddingProvider};
17use crate::errors::{InnateError, Result};
18use crate::refine::{
19    DefaultSanitizer, DistilledChunk, Distiller, HeuristicDistiller, NullRefiner, Refiner,
20    Sanitizer,
21};
22use crate::storage::{ChunkRow, EpisodicLogRow, Storage};
23use crate::utils::{
24    content_hash, estimate_tokens, gen_uuid, pack_embedding, utc_now_iso, SanitizeAction,
25};
26
27mod appraise;
28mod curate;
29mod evolve;
30mod inspection;
31mod lifecycle;
32mod recall;
33mod record;
34mod repair;
35mod situation;
36
37pub use appraise::{
38    AbstainReason, AppraiseParams, Contributor, FlaggedPoint, Tier, Valence, Verdict,
39    APPRAISE_ADVISORY,
40};
41pub use recall::RecallParams;
42pub use record::RecordParams;
43pub use repair::TraceRepairReport;
44pub use situation::Situation;
45
46// ---------------------------------------------------------------------------
47// Tuning defaults
48// ---------------------------------------------------------------------------
49
50// Fused recall score weights. These intentionally sum to 1.05 (not 1.0): the
51// score is a relative ranking signal, not a calibrated probability, so the extra
52// 0.05 of headroom on content similarity is deliberate and the result is never
53// re-normalised. Keep this in mind before "fixing" the sum.
54const W_CONTENT: f64 = 0.55;
55const W_TRIGGER: f64 = 0.25;
56const W_CONFIDENCE: f64 = 0.10;
57const W_CONTEXT: f64 = 0.15;
58const W_ACTIVATION: f64 = 0.08;
59const TOP_K_CANDIDATES: usize = 20;
60const ANTI_TRIGGER_PENALTY: f64 = 0.6;
61const DENSITY_REFILL: bool = true;
62
63const LOW_CONF_THRESHOLD: f64 = 0.25;
64const LOW_CONF_IDLE_DAYS: i64 = 60;
65const REPEAT_SELECT_MIN: i64 = 10;
66const REPEAT_SELECT_CONF_MAX: f64 = 0.5;
67const NEVER_USED_AGE_DAYS: i64 = 30;
68const OPEN_TTL_DAYS: i64 = 14;
69const SCREENING_TIMEOUT_MINUTES: i64 = 30;
70const PROMOTE_USED_SUCCESS_MIN: i64 = 3;
71const PROMOTE_CONFIDENCE_MIN: f64 = 0.60;
72const DECAY_FLOOR: f64 = 0.20;
73const EVOLVE_THRESHOLD: i64 = 5;
74const DISTILL_BATCH_SIZE: usize = 20;
75const PENDING_RECALL_PENALTY: f64 = 0.60;
76
77// Intuition / appraise critic defaults (Spec §8). The appraise path reuses the
78// same fused score as recall; these only govern how that score is tiered/flagged.
79const APPRAISE_TIER_WEAK: f64 = 0.30;
80const APPRAISE_TIER_STRONG: f64 = 0.65;
81const APPRAISE_MIN_STRENGTH: f64 = 0.40;
82const APPRAISE_TOP: usize = 8;
83const APPRAISE_TRIGGER_HIT_MIN: f64 = 0.50;
84const APPRAISE_CANDIDATE_IN_EMBED: bool = true;
85// 弃权门(方案 A/F/G)。默认值保持现行行为(门2/门3/门4 关闭),由 meta 调参激活。
86//   门2 signature_floor=0.0   → 关闭(任何一致度都放行)
87//   门3 min_evidence=0        → 关闭(不要求观测历史)
88//   门4 conflict_ceiling=1.0  → 关闭(离散度上界恒不触发)
89// 门1 弱共振无需阈值:prune 后候选为空即弃权(WeakResonance),天然作动。
90const APPRAISE_SIGNATURE_FLOOR: f64 = 0.0;
91const APPRAISE_MIN_EVIDENCE: i64 = 0;
92const APPRAISE_CONFLICT_CEILING: f64 = 1.0;
93// 方案 D 基率锚定先验:prior = Beta(m·g0, m·(1-g0))。默认 m=2, g0=0.5 与旧 Laplace
94// (wins+1)/(evidence+2) 完全等价 → 零行为变化,调大 m / 设真实基率即激活。
95// **仅作用于 appraise(直觉/校准)路径**:实施文档明确范围不含 recall。
96const INTUITION_PRIOR_M: f64 = 2.0;
97const INTUITION_BASE_RATE: f64 = 0.5;
98// recall(图书管理员)路径恒用中性 Laplace 先验(m=2, g0=0.5),与历史
99// (wins+1)/(evidence+2) 逐位等价,绝不受 intuition.* 校准旋钮影响。方案 D 与 recall 解耦。
100const RECALL_PRIOR_M: f64 = 2.0;
101const RECALL_BASE_RATE: f64 = 0.5;
102// 方案 E 校准映射桶数。
103const CALIBRATION_BINS: i64 = 10;
104const SITUATION_COARSE_KEYS: &str = "stage,error_class,file_type";
105const GOVERNANCE_ARCHIVE_THRESHOLD: i64 = 3;
106const NEGATIVE_FEEDBACK_ARCHIVE_THRESHOLD: i64 = 5;
107const GOVERNANCE_EVOLVE_THRESHOLD: i64 = 3;
108const FAILURE_MIN_USES: i64 = 5;
109const FAILURE_MAX_SUCCESS_RATE: f64 = 0.20;
110const FAILURE_CONFIDENCE_MAX: f64 = 0.35;
111const LOG_COMPACT_DAYS: i64 = 30;
112
113// ---------------------------------------------------------------------------
114// Public result types
115// ---------------------------------------------------------------------------
116
117#[derive(Debug, Default, Clone)]
118pub struct RecallResult {
119    pub knowledge: Vec<Value>,
120    pub sparks: Vec<Value>,
121    pub trace_id: String,
122    pub empty: bool,
123    pub depth_skipped: Vec<String>,
124    pub skipped_reasons: HashMap<String, String>,
125}
126
127#[derive(Debug, Default)]
128pub struct CurateReport {
129    pub archived: Vec<String>,
130    pub deduped: Vec<String>,
131    pub decayed: Vec<String>,
132    pub cycles: Vec<Vec<String>>,
133    pub orphans: Vec<String>,
134    pub recovered: Vec<String>,
135    pub warnings: Vec<String>,
136    pub stats: HashMap<String, Value>,
137}
138
139#[derive(Debug, Default)]
140struct DistillBatchReport {
141    distilled: usize,
142    failed: usize,
143}
144
145/// Scope for a single Curate run — allows limiting governance to a subset of chunks.
146#[derive(Debug, Default, Clone)]
147pub struct CurateScope {
148    /// If set, only process chunks with this origin (e.g. "distilled").
149    pub origin: Option<String>,
150    /// If set, only process chunks belonging to this skill.
151    pub skill_name: Option<String>,
152    /// When true, compute the report but do not write any changes.
153    pub dry_run: bool,
154}
155
156/// Replaceable governance interface (§二·六). Inject via `KnowledgeBase::open_with`.
157/// Default implementation: `BuiltinCurator`.
158pub trait Curator: Send + Sync {
159    fn run(&self, kb: &KnowledgeBase, scope: &CurateScope) -> Result<CurateReport>;
160}
161
162/// Built-in curator — implements the full §四 governance pipeline.
163pub struct BuiltinCurator;
164
165impl Curator for BuiltinCurator {
166    fn run(&self, kb: &KnowledgeBase, scope: &CurateScope) -> Result<CurateReport> {
167        kb.builtin_curate_impl(scope)
168    }
169}
170
171// ---------------------------------------------------------------------------
172// KnowledgeBase
173// ---------------------------------------------------------------------------
174
175pub struct KnowledgeBase {
176    pub storage: Storage,
177    embedding: Arc<dyn EmbeddingProvider>,
178    refiner: Arc<dyn Refiner>,
179    distiller: Arc<dyn Distiller>,
180    curator: Arc<dyn Curator>,
181    sanitizer: Arc<dyn Sanitizer>,
182
183    // Tuning params (loaded from meta at init)
184    w_content: f64,
185    w_trigger: f64,
186    w_confidence: f64,
187    w_context: f64,
188    w_activation: f64,
189    top_k_candidates: usize,
190    anti_trigger_penalty: f64,
191    density_refill: bool,
192
193    low_conf_threshold: f64,
194    low_conf_idle_days: i64,
195    repeat_select_min: i64,
196    repeat_select_conf_max: f64,
197    never_used_age_days: i64,
198    open_ttl_days: i64,
199    screening_timeout_minutes: i64,
200    promote_used_success_min: i64,
201    promote_confidence_min: f64,
202    decay_floor: f64,
203    evolve_threshold: i64,
204    distill_batch_size: usize,
205    evolve_schedule_interval_hours: i64,
206    governance_archive_threshold: i64,
207    negative_feedback_archive_threshold: i64,
208    governance_evolve_threshold: i64,
209    governance_proposal_max_age_days: i64,
210    failure_min_uses: i64,
211    failure_max_success_rate: f64,
212    failure_confidence_max: f64,
213    log_compact_days: i64,
214
215    // Intuition / appraise critic params
216    appraise_tier_weak: f64,
217    appraise_tier_strong: f64,
218    appraise_min_strength: f64,
219    appraise_top: usize,
220    appraise_trigger_hit_min: f64,
221    appraise_candidate_in_embed: bool,
222    appraise_signature_floor: f64,
223    appraise_min_evidence: i64,
224    appraise_conflict_ceiling: f64,
225    intuition_prior_m: f64,
226    intuition_base_rate: f64,
227    calibration_bins: i64,
228    situation_coarse_keys: String,
229}
230
231impl KnowledgeBase {
232    pub fn open(db_path: impl AsRef<Path>) -> Result<Self> {
233        Self::open_with(db_path, None, None, None, None, None)
234    }
235
236    /// Persist a content embedding, rejecting any vector whose dimension differs
237    /// from the configured provider. A mismatched vector is silently skipped at
238    /// search time (cosine search only scores equal-dimension vectors), so an
239    /// unchecked write becomes invisible recall loss with no error. Fail closed
240    /// at the write boundary instead. All vector writers route through here.
241    pub(crate) fn store_vec_content(&self, chunk_id: &str, cvec: &[f32]) -> Result<()> {
242        let want = self.embedding.content_dim();
243        if cvec.len() != want {
244            return Err(InnateError::InvalidState(format!(
245                "content embedding dim {} != configured {want} (chunk {chunk_id})",
246                cvec.len()
247            )));
248        }
249        self.storage
250            .insert_vec_content(chunk_id, &pack_embedding(cvec))
251    }
252
253    /// Trigger-vector counterpart of [`store_vec_content`]; same fail-closed
254    /// dimension guard against `trigger_dim()`.
255    pub(crate) fn store_vec_trigger(&self, chunk_id: &str, tvec: &[f32]) -> Result<()> {
256        let want = self.embedding.trigger_dim();
257        if tvec.len() != want {
258            return Err(InnateError::InvalidState(format!(
259                "trigger embedding dim {} != configured {want} (chunk {chunk_id})",
260                tvec.len()
261            )));
262        }
263        self.storage
264            .insert_vec_trigger(chunk_id, &pack_embedding(tvec))
265    }
266
267    pub fn open_with(
268        db_path: impl AsRef<Path>,
269        embedding: Option<Arc<dyn EmbeddingProvider>>,
270        refiner: Option<Arc<dyn Refiner>>,
271        distiller: Option<Arc<dyn Distiller>>,
272        curator: Option<Arc<dyn Curator>>,
273        sanitizer: Option<Arc<dyn Sanitizer>>,
274    ) -> Result<Self> {
275        let embedding = embedding.unwrap_or_else(|| Arc::new(DummyEmbeddingProvider::default()));
276        let refiner = refiner.unwrap_or_else(|| Arc::new(NullRefiner));
277        let distiller = distiller.unwrap_or_else(|| Arc::new(HeuristicDistiller));
278        let curator = curator.unwrap_or_else(|| Arc::new(BuiltinCurator));
279        let sanitizer = sanitizer.unwrap_or_else(|| Arc::new(DefaultSanitizer));
280
281        let storage = Storage::open(db_path, embedding.content_dim(), embedding.trigger_dim())?;
282
283        let mut kb = Self {
284            storage,
285            embedding,
286            refiner,
287            distiller,
288            curator,
289            sanitizer,
290            w_content: W_CONTENT,
291            w_trigger: W_TRIGGER,
292            w_confidence: W_CONFIDENCE,
293            w_context: W_CONTEXT,
294            w_activation: W_ACTIVATION,
295            top_k_candidates: TOP_K_CANDIDATES,
296            anti_trigger_penalty: ANTI_TRIGGER_PENALTY,
297            density_refill: DENSITY_REFILL,
298            low_conf_threshold: LOW_CONF_THRESHOLD,
299            low_conf_idle_days: LOW_CONF_IDLE_DAYS,
300            repeat_select_min: REPEAT_SELECT_MIN,
301            repeat_select_conf_max: REPEAT_SELECT_CONF_MAX,
302            never_used_age_days: NEVER_USED_AGE_DAYS,
303            open_ttl_days: OPEN_TTL_DAYS,
304            screening_timeout_minutes: SCREENING_TIMEOUT_MINUTES,
305            promote_used_success_min: PROMOTE_USED_SUCCESS_MIN,
306            promote_confidence_min: PROMOTE_CONFIDENCE_MIN,
307            decay_floor: DECAY_FLOOR,
308            evolve_threshold: EVOLVE_THRESHOLD,
309            distill_batch_size: DISTILL_BATCH_SIZE,
310            evolve_schedule_interval_hours: 6,
311            governance_archive_threshold: GOVERNANCE_ARCHIVE_THRESHOLD,
312            negative_feedback_archive_threshold: NEGATIVE_FEEDBACK_ARCHIVE_THRESHOLD,
313            governance_evolve_threshold: GOVERNANCE_EVOLVE_THRESHOLD,
314            governance_proposal_max_age_days: 30,
315            failure_min_uses: FAILURE_MIN_USES,
316            failure_max_success_rate: FAILURE_MAX_SUCCESS_RATE,
317            failure_confidence_max: FAILURE_CONFIDENCE_MAX,
318            log_compact_days: LOG_COMPACT_DAYS,
319            appraise_tier_weak: APPRAISE_TIER_WEAK,
320            appraise_tier_strong: APPRAISE_TIER_STRONG,
321            appraise_min_strength: APPRAISE_MIN_STRENGTH,
322            appraise_top: APPRAISE_TOP,
323            appraise_trigger_hit_min: APPRAISE_TRIGGER_HIT_MIN,
324            appraise_candidate_in_embed: APPRAISE_CANDIDATE_IN_EMBED,
325            appraise_signature_floor: APPRAISE_SIGNATURE_FLOOR,
326            appraise_min_evidence: APPRAISE_MIN_EVIDENCE,
327            appraise_conflict_ceiling: APPRAISE_CONFLICT_CEILING,
328            intuition_prior_m: INTUITION_PRIOR_M,
329            intuition_base_rate: INTUITION_BASE_RATE,
330            calibration_bins: CALIBRATION_BINS,
331            situation_coarse_keys: SITUATION_COARSE_KEYS.to_string(),
332        };
333        kb.init_meta()?;
334        kb.load_params()?;
335        Ok(kb)
336    }
337
338    fn init_meta(&self) -> Result<()> {
339        let lib_id = gen_uuid();
340        let content_dim = self.embedding.content_dim().to_string();
341        let trigger_dim = self.embedding.trigger_dim().to_string();
342        let embed_model = self.embedding.model_name();
343
344        for (key, expected) in [
345            ("content_dim", self.embedding.content_dim()),
346            ("trigger_dim", self.embedding.trigger_dim()),
347        ] {
348            if let Some(stored) = self.storage.get_meta(key)? {
349                let actual = stored.parse::<usize>().map_err(|_| {
350                    InnateError::Other(format!("invalid {key} metadata value: {stored}"))
351                })?;
352                if actual != expected {
353                    return Err(InnateError::Other(format!(
354                        "{key} mismatch: database uses {actual}, embedding provider uses {expected}"
355                    )));
356                }
357            }
358        }
359
360        let defaults: &[(&str, &str)] = &[
361            ("lib_id", &lib_id),
362            ("lib_role", "personal"),
363            ("schema_version", "4.14"),
364            ("content_dim", &content_dim),
365            ("trigger_dim", &trigger_dim),
366            ("embed_model", embed_model),
367            ("embed_version", "1"),
368            ("vector_revision", "0"),
369            ("last_agg_ts", "1970-01-01T00:00:00.000Z"),
370            ("recall.w_content", "0.55"),
371            ("recall.w_trigger", "0.25"),
372            ("recall.w_confidence", "0.10"),
373            ("recall.w_context", "0.15"),
374            ("recall.w_activation", "0.08"),
375            ("recall.top_k_candidates", "20"),
376            ("recall.anti_trigger_penalty", "0.6"),
377            ("recall.density_refill", "true"),
378            ("curate.low_conf_threshold", "0.25"),
379            ("curate.low_conf_idle_days", "60"),
380            ("curate.repeat_select_min", "10"),
381            ("curate.repeat_select_conf_max", "0.5"),
382            ("curate.never_used_age_days", "30"),
383            ("curate.open_ttl_days", "14"),
384            ("curate.screening_timeout_minutes", "30"),
385            ("curate.promote_used_success_min", "3"),
386            ("curate.promote_confidence_min", "0.60"),
387            ("curate.decay_floor", "0.20"),
388            ("evolve.threshold_new_count", "5"),
389            ("evolve.distill_batch_size", "20"),
390            ("evolve.schedule_interval_hours", "6"),
391            ("curate.soft_mature_threshold", "5"),
392            ("evolve.distill_token_window_hours", "24"),
393            ("curate.governance_archive_threshold", "3"),
394            ("curate.negative_feedback_archive_threshold", "5"),
395            ("evolve.governance_pending_threshold", "3"),
396            ("curate.governance_proposal_max_age_days", "30"),
397            ("curate.failure_min_uses", "5"),
398            ("curate.failure_max_success_rate", "0.20"),
399            ("curate.failure_confidence_max", "0.35"),
400            ("curate.log_compact_days", "30"),
401            ("appraise.tier_weak", "0.30"),
402            ("appraise.tier_strong", "0.65"),
403            ("appraise.min_strength", "0.40"),
404            ("appraise.top", "8"),
405            ("appraise.trigger_hit_min", "0.50"),
406            ("appraise.candidate_in_embed", "true"),
407            ("appraise.signature_floor", "0.0"),
408            ("appraise.min_evidence", "0"),
409            ("appraise.conflict_ceiling", "1.0"),
410            ("intuition.prior_m", "2.0"),
411            ("intuition.base_rate", "0.5"),
412            ("intuition.calibration_bins", "10"),
413            ("situation.coarse_keys", "stage,error_class,file_type"),
414        ];
415        self.storage.begin_immediate()?;
416        let result = (|| -> Result<()> {
417            for (k, v) in defaults {
418                if self.storage.get_meta(k)?.is_none() {
419                    self.storage.set_meta(k, v)?;
420                }
421            }
422            self.storage.commit()
423        })();
424        if result.is_err() {
425            let _ = self.storage.rollback();
426        }
427        result
428    }
429
430    fn load_params(&mut self) -> Result<()> {
431        let f = |k: &str, d: f64| -> f64 {
432            self.storage
433                .get_meta(k)
434                .ok()
435                .flatten()
436                .and_then(|v| v.parse().ok())
437                .unwrap_or(d)
438        };
439        let i = |k: &str, d: i64| -> i64 {
440            self.storage
441                .get_meta(k)
442                .ok()
443                .flatten()
444                .and_then(|v| v.parse().ok())
445                .unwrap_or(d)
446        };
447        let b = |k: &str, d: bool| -> bool {
448            self.storage
449                .get_meta(k)
450                .ok()
451                .flatten()
452                .map(|v| v.to_lowercase() == "true")
453                .unwrap_or(d)
454        };
455        self.w_content = f("recall.w_content", W_CONTENT);
456        self.w_trigger = f("recall.w_trigger", W_TRIGGER);
457        self.w_confidence = f("recall.w_confidence", W_CONFIDENCE);
458        self.w_context = f("recall.w_context", W_CONTEXT);
459        self.w_activation = f("recall.w_activation", W_ACTIVATION);
460        self.top_k_candidates =
461            i("recall.top_k_candidates", TOP_K_CANDIDATES as i64).max(1) as usize;
462        self.anti_trigger_penalty = f("recall.anti_trigger_penalty", ANTI_TRIGGER_PENALTY);
463        self.density_refill = b("recall.density_refill", DENSITY_REFILL);
464        self.low_conf_threshold = f("curate.low_conf_threshold", LOW_CONF_THRESHOLD);
465        self.low_conf_idle_days = i("curate.low_conf_idle_days", LOW_CONF_IDLE_DAYS);
466        self.repeat_select_min = i("curate.repeat_select_min", REPEAT_SELECT_MIN);
467        self.repeat_select_conf_max = f("curate.repeat_select_conf_max", REPEAT_SELECT_CONF_MAX);
468        self.never_used_age_days = i("curate.never_used_age_days", NEVER_USED_AGE_DAYS);
469        self.open_ttl_days = i("curate.open_ttl_days", OPEN_TTL_DAYS);
470        self.screening_timeout_minutes = i(
471            "curate.screening_timeout_minutes",
472            SCREENING_TIMEOUT_MINUTES,
473        );
474        self.promote_used_success_min =
475            i("curate.promote_used_success_min", PROMOTE_USED_SUCCESS_MIN);
476        self.promote_confidence_min = f("curate.promote_confidence_min", PROMOTE_CONFIDENCE_MIN);
477        self.decay_floor = f("curate.decay_floor", DECAY_FLOOR).clamp(0.0, 0.4);
478        self.evolve_threshold = i("evolve.threshold_new_count", EVOLVE_THRESHOLD);
479        self.distill_batch_size =
480            i("evolve.distill_batch_size", DISTILL_BATCH_SIZE as i64) as usize;
481        self.evolve_schedule_interval_hours = i("evolve.schedule_interval_hours", 6).max(1);
482        self.governance_archive_threshold = i(
483            "curate.governance_archive_threshold",
484            GOVERNANCE_ARCHIVE_THRESHOLD,
485        )
486        .max(1);
487        self.negative_feedback_archive_threshold = i(
488            "curate.negative_feedback_archive_threshold",
489            NEGATIVE_FEEDBACK_ARCHIVE_THRESHOLD,
490        )
491        .max(1);
492        self.governance_evolve_threshold = i(
493            "evolve.governance_pending_threshold",
494            GOVERNANCE_EVOLVE_THRESHOLD,
495        )
496        .max(1);
497        self.governance_proposal_max_age_days =
498            i("curate.governance_proposal_max_age_days", 30).max(1);
499        self.failure_min_uses = i("curate.failure_min_uses", FAILURE_MIN_USES).max(1);
500        self.failure_max_success_rate =
501            f("curate.failure_max_success_rate", FAILURE_MAX_SUCCESS_RATE).clamp(0.0, 1.0);
502        self.failure_confidence_max =
503            f("curate.failure_confidence_max", FAILURE_CONFIDENCE_MAX).clamp(0.0, 1.0);
504        self.log_compact_days = i("curate.log_compact_days", LOG_COMPACT_DAYS).max(1);
505        let s = |k: &str, d: &str| -> String {
506            self.storage
507                .get_meta(k)
508                .ok()
509                .flatten()
510                .filter(|v| !v.trim().is_empty())
511                .unwrap_or_else(|| d.to_string())
512        };
513        self.appraise_tier_weak = f("appraise.tier_weak", APPRAISE_TIER_WEAK).clamp(0.0, 1.0);
514        self.appraise_tier_strong = f("appraise.tier_strong", APPRAISE_TIER_STRONG).clamp(0.0, 1.0);
515        self.appraise_min_strength =
516            f("appraise.min_strength", APPRAISE_MIN_STRENGTH).clamp(0.0, 1.0);
517        self.appraise_top = i("appraise.top", APPRAISE_TOP as i64).max(1) as usize;
518        self.appraise_trigger_hit_min =
519            f("appraise.trigger_hit_min", APPRAISE_TRIGGER_HIT_MIN).clamp(0.0, 1.0);
520        self.appraise_candidate_in_embed =
521            b("appraise.candidate_in_embed", APPRAISE_CANDIDATE_IN_EMBED);
522        self.appraise_signature_floor =
523            f("appraise.signature_floor", APPRAISE_SIGNATURE_FLOOR).clamp(0.0, 1.0);
524        self.appraise_min_evidence = i("appraise.min_evidence", APPRAISE_MIN_EVIDENCE).max(0);
525        self.appraise_conflict_ceiling =
526            f("appraise.conflict_ceiling", APPRAISE_CONFLICT_CEILING).clamp(0.0, 1.0);
527        self.intuition_prior_m = f("intuition.prior_m", INTUITION_PRIOR_M).max(0.0);
528        self.intuition_base_rate = f("intuition.base_rate", INTUITION_BASE_RATE).clamp(0.0, 1.0);
529        self.calibration_bins = i("intuition.calibration_bins", CALIBRATION_BINS).clamp(2, 100);
530        self.situation_coarse_keys = s("situation.coarse_keys", SITUATION_COARSE_KEYS);
531        Ok(())
532    }
533}
534
535// ---------------------------------------------------------------------------
536// Helpers
537// ---------------------------------------------------------------------------
538
539struct CandidateInfo {
540    chunk: Value,
541    sim_content: f32,
542    sim_trigger: f32,
543}
544
545fn chunk_is_valid_for_recall(chunk: &Value, embed_version: i64) -> bool {
546    chunk.get("state").and_then(Value::as_str) != Some("archived")
547        && chunk.get("origin").and_then(Value::as_str) != Some("spark")
548        && chunk
549            .get("embed_version")
550            .and_then(Value::as_i64)
551            .unwrap_or(1)
552            >= embed_version
553}
554
555/// Normalize a query string before hashing into a context_key.
556///
557/// Goals: collapse whitespace variations and case differences so that
558/// semantically equivalent queries (same words, different capitalisation or
559/// spacing) accumulate statistics in the same context_stat bucket.
560///
561/// Deliberately conservative: no stemming, no stop-word removal. The canonical
562/// query guidance in SKILL.md handles vocabulary consistency at the agent level.
563fn normalize_query(query: &str) -> String {
564    const STOP_WORDS: &[&str] = &[
565        "a", "an", "and", "for", "in", "of", "on", "the", "to", "with",
566    ];
567    let cleaned: String = query
568        .to_lowercase()
569        .chars()
570        .map(|ch| {
571            if ch.is_alphanumeric() || ch.is_whitespace() {
572                ch
573            } else {
574                ' '
575            }
576        })
577        .collect();
578    let mut tokens: Vec<&str> = cleaned
579        .split_whitespace()
580        .filter(|token| !STOP_WORDS.contains(token))
581        .collect();
582    tokens.sort_unstable();
583    tokens.dedup();
584    tokens.join(" ")
585}
586
587fn estimate_distill_prompt_tokens(log: &Value, related_logs: &[Value]) -> i64 {
588    let primary: i64 = [
589        "query",
590        "recall_snapshot",
591        "output",
592        "output_summary",
593        "nomination",
594    ]
595    .iter()
596    .filter_map(|key| log.get(*key).and_then(Value::as_str))
597    .map(|text| estimate_tokens(text) as i64)
598    .sum();
599    let log_id = log.get("id").and_then(Value::as_str).unwrap_or("");
600    let context_key = log.get("context_key").and_then(Value::as_str);
601    let related: i64 = related_logs
602        .iter()
603        .filter(|other| other.get("id").and_then(Value::as_str).unwrap_or("") != log_id)
604        .filter(|other| {
605            context_key.is_some() && other.get("context_key").and_then(Value::as_str) == context_key
606        })
607        .take(4)
608        .flat_map(|other| {
609            ["query", "output_summary", "outcome"]
610                .into_iter()
611                .filter_map(|key| other.get(key).and_then(Value::as_str))
612        })
613        .map(|text| estimate_tokens(text) as i64)
614        .sum();
615    primary + related
616}
617
618fn estimate_distilled_chunk_tokens(chunk: &DistilledChunk) -> i64 {
619    estimate_tokens(&chunk.content) as i64
620        + chunk
621            .trigger_desc
622            .as_deref()
623            .map(estimate_tokens)
624            .unwrap_or(0) as i64
625        + chunk
626            .anti_trigger_desc
627            .as_deref()
628            .map(estimate_tokens)
629            .unwrap_or(0) as i64
630}
631
632fn anti_trigger_hit(query: &str, anti: &str) -> bool {
633    let q_lower = query.to_lowercase();
634    anti.to_lowercase().split(',').any(|part| {
635        let p = part.trim();
636        !p.is_empty() && q_lower.contains(p)
637    })
638}
639
640fn block_cost(block: &[Value]) -> usize {
641    block
642        .iter()
643        .map(|b| {
644            b.get("token_count")
645                .and_then(Value::as_u64)
646                .map(|t| t as usize)
647                .unwrap_or_else(|| {
648                    estimate_tokens(b.get("content").and_then(Value::as_str).unwrap_or("")).max(100)
649                })
650        })
651        .sum()
652}
653
654fn limit_knowledge(knowledge: Vec<Value>, top: Option<usize>) -> Vec<Value> {
655    match top {
656        None => knowledge,
657        Some(0) => vec![],
658        Some(n) => knowledge.into_iter().take(n).collect(),
659    }
660}
661
662fn usage_state(used: Option<&[String]>) -> &'static str {
663    match used {
664        None => "unknown",
665        Some([]) => "known_none",
666        Some(_) => "known_some",
667    }
668}
669
670fn ratio(numerator: i64, denominator: i64) -> f64 {
671    if denominator <= 0 {
672        0.0
673    } else {
674        ((numerator as f64 / denominator as f64) * 1000.0).round() / 1000.0
675    }
676}
677
678fn validate_source(source: &str) -> Result<()> {
679    if !matches!(
680        source,
681        "mcp" | "sdk" | "cli" | "hook" | "daemon" | "augmented"
682    ) {
683        return Err(InnateError::InvalidState(format!(
684            "invalid event source: {source}"
685        )));
686    }
687    Ok(())
688}
689
690fn count_query(storage: &Storage, sql: &str) -> Result<i64> {
691    Ok(storage
692        .query_chunks(sql)?
693        .first()
694        .and_then(|r| r.as_object())
695        .and_then(|m| m.values().next())
696        .and_then(Value::as_i64)
697        .unwrap_or(0))
698}
699
700fn count_query_params<P: rusqlite::Params>(storage: &Storage, sql: &str, p: P) -> Result<i64> {
701    Ok(storage
702        .query_chunks_params(sql, p)?
703        .first()
704        .and_then(|r| r.as_object())
705        .and_then(|m| m.values().next())
706        .and_then(Value::as_i64)
707        .unwrap_or(0))
708}
709
710fn days_ago(now_iso: &str, days: i64) -> String {
711    use chrono::{DateTime, Duration, Utc};
712    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
713        let cutoff = t - Duration::days(days);
714        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
715    }
716    now_iso.to_string()
717}
718
719fn minutes_ago(now_iso: &str, minutes: i64) -> String {
720    use chrono::{DateTime, Duration, Utc};
721    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
722        let cutoff = t - Duration::minutes(minutes);
723        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
724    }
725    now_iso.to_string()
726}
727
728fn hours_ago(now_iso: &str, hours: i64) -> String {
729    use chrono::{DateTime, Duration, Utc};
730    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
731        let cutoff = t - Duration::hours(hours);
732        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
733    }
734    now_iso.to_string()
735}
736
737fn minutes_after(now_iso: &str, minutes: i64) -> String {
738    use chrono::{DateTime, Duration, Utc};
739    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
740        let cutoff = t + Duration::minutes(minutes);
741        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
742    }
743    now_iso.to_string()
744}
745
746fn hours_after(now_iso: &str, hours: i64) -> String {
747    use chrono::{DateTime, Duration, Utc};
748    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
749        let cutoff = t + Duration::hours(hours);
750        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
751    }
752    now_iso.to_string()
753}
754
755/// Return the number of whole days between two ISO timestamps (now - past; clamped ≥ 0).
756fn iso_days_diff(now_iso: &str, past_iso: &str) -> i64 {
757    use chrono::{DateTime, Utc};
758    let parse = |s: &str| s.parse::<DateTime<Utc>>().ok();
759    if let (Some(a), Some(b)) = (parse(now_iso), parse(past_iso)) {
760        let diff = a - b;
761        diff.num_days().max(0)
762    } else {
763        0
764    }
765}
766
767/// Fractional days between two ISO timestamps (≥ 0). Finer than `iso_days_diff`
768/// so the activation recency term keeps sub-day resolution.
769fn iso_fractional_days(now_iso: &str, past_iso: &str) -> f64 {
770    use chrono::{DateTime, Utc};
771    let parse = |s: &str| s.parse::<DateTime<Utc>>().ok();
772    if let (Some(a), Some(b)) = (parse(now_iso), parse(past_iso)) {
773        ((a - b).num_seconds().max(0)) as f64 / 86_400.0
774    } else {
775        0.0
776    }
777}
778
779/// ACT-R decay exponent for the base-level activation recency term.
780const ACTR_DECAY: f64 = 0.5;
781
782/// ACT-R-inspired base-level activation, bounded to `(0, 1)`.
783///
784/// Fuses **frequency** (how often a chunk has been used) and **recency** (time
785/// since last use) into one re-ranking signal, following the standard ACT-R
786/// approximation `B = ln(n) − d·ln(t)` (Petrov 2006), here using
787/// `B = ln(1 + used_count) − d·ln(1 + recency_days)` and squashed with a
788/// logistic so it stays on the same `[0, 1]` scale as the other fused-score
789/// terms (content/trigger sim, confidence, context).
790///
791/// Returns `0.0` for never-used chunks (no usage history → no boost), which
792/// keeps recall **zero-regression** for freshly-added knowledge: a chunk with
793/// `used_count == 0` contributes nothing to the fused score.
794pub(super) fn actr_activation(used_count: i64, last_used_at: Option<&str>, now_iso: &str) -> f64 {
795    if used_count <= 0 {
796        return 0.0;
797    }
798    let Some(last) = last_used_at else {
799        return 0.0;
800    };
801    let recency_days = iso_fractional_days(now_iso, last);
802    let b = (1.0 + used_count as f64).ln() - ACTR_DECAY * (1.0 + recency_days).ln();
803    1.0 / (1.0 + (-b).exp())
804}
805
806/// DFS-based cycle detection on the hard-dep graph. Returns list of cycles (each is a Vec of ids).
807fn detect_cycles(deps: &[Value]) -> Vec<Vec<String>> {
808    use std::collections::HashMap;
809    let mut adj: HashMap<String, Vec<String>> = HashMap::new();
810    for d in deps {
811        let src = d
812            .get("src")
813            .and_then(Value::as_str)
814            .unwrap_or("")
815            .to_string();
816        let dst = d
817            .get("dst")
818            .and_then(Value::as_str)
819            .unwrap_or("")
820            .to_string();
821        if !src.is_empty() && !dst.is_empty() {
822            adj.entry(src).or_default().push(dst);
823        }
824    }
825    let nodes: Vec<String> = adj.keys().cloned().collect();
826    let mut visited: HashSet<String> = HashSet::new();
827    let mut on_stack: HashSet<String> = HashSet::new();
828    let mut cycles: Vec<Vec<String>> = vec![];
829
830    fn dfs(
831        node: &str,
832        adj: &HashMap<String, Vec<String>>,
833        visited: &mut HashSet<String>,
834        on_stack: &mut HashSet<String>,
835        path: &mut Vec<String>,
836        cycles: &mut Vec<Vec<String>>,
837    ) {
838        if on_stack.contains(node) {
839            // Found cycle — extract loop segment.
840            let start = path.iter().position(|n| n == node).unwrap_or(0);
841            cycles.push(path[start..].to_vec());
842            return;
843        }
844        if visited.contains(node) {
845            return;
846        }
847        visited.insert(node.to_string());
848        on_stack.insert(node.to_string());
849        path.push(node.to_string());
850        if let Some(children) = adj.get(node) {
851            for child in children {
852                dfs(child, adj, visited, on_stack, path, cycles);
853            }
854        }
855        path.pop();
856        on_stack.remove(node);
857    }
858
859    for node in nodes {
860        let mut path = vec![];
861        dfs(
862            &node,
863            &adj,
864            &mut visited,
865            &mut on_stack,
866            &mut path,
867            &mut cycles,
868        );
869    }
870    cycles
871}
innate_core/kb/mod.rs

innate_core/kb/
mod.rs