Skip to main content

innate_core/kb/
mod.rs

1//! KnowledgeBase — all 8 Public APIs.
2
3/// Return type for pack(): (selected_chunks, skipped_groups, skip_reasons)
4type PackResult = (
5    Vec<Value>,
6    Vec<(Vec<Value>, f64, usize)>,
7    std::collections::HashMap<String, String>,
8);
9
10use std::collections::{HashMap, HashSet};
11use std::path::Path;
12use std::sync::Arc;
13
14use serde_json::{json, Value};
15
16use crate::embedding::{DummyEmbeddingProvider, EmbeddingProvider};
17use crate::errors::{InnateError, Result};
18use crate::refine::{
19    DefaultSanitizer, DistilledChunk, Distiller, HeuristicDistiller, NullRefiner, Refiner,
20    Sanitizer,
21};
22use crate::storage::{ChunkRow, EpisodicLogRow, Storage};
23use crate::utils::{
24    content_hash, estimate_tokens, gen_uuid, pack_embedding, utc_now_iso, SanitizeAction,
25};
26
27mod appraise;
28mod curate;
29mod evolve;
30mod inspection;
31mod lifecycle;
32mod recall;
33mod record;
34mod situation;
35
36pub use appraise::{
37    AppraiseParams, Contributor, FlaggedPoint, Tier, Valence, Verdict,
38};
39pub use recall::RecallParams;
40pub use record::RecordParams;
41pub use situation::Situation;
42
43// ---------------------------------------------------------------------------
44// Tuning defaults
45// ---------------------------------------------------------------------------
46
// Fused recall score weights. The four base weights (content, trigger,
// confidence, context) intentionally sum to 1.05 (not 1.0): the score is a
// relative ranking signal, not a calibrated probability, so the extra 0.05 of
// headroom on content similarity is deliberate and the result is never
// re-normalised. Including W_ACTIVATION, all five weights total 1.13.
// Keep this in mind before "fixing" the sum.
const W_CONTENT: f64 = 0.55;
const W_TRIGGER: f64 = 0.25;
const W_CONFIDENCE: f64 = 0.10;
const W_CONTEXT: f64 = 0.15;
const W_ACTIVATION: f64 = 0.08;
const TOP_K_CANDIDATES: usize = 20;
const ANTI_TRIGGER_PENALTY: f64 = 0.6;
const DENSITY_REFILL: bool = true;

// Curate / evolve defaults. Each is the compiled-in fallback for the matching
// `curate.*` / `evolve.*` meta key read in `load_params`; PENDING_RECALL_PENALTY
// has no meta override in this file.
const LOW_CONF_THRESHOLD: f64 = 0.25;
const LOW_CONF_IDLE_DAYS: i64 = 60;
const REPEAT_SELECT_MIN: i64 = 10;
const REPEAT_SELECT_CONF_MAX: f64 = 0.5;
const NEVER_USED_AGE_DAYS: i64 = 30;
const OPEN_TTL_DAYS: i64 = 14;
const SCREENING_TIMEOUT_MINUTES: i64 = 30;
const PROMOTE_USED_SUCCESS_MIN: i64 = 3;
const PROMOTE_CONFIDENCE_MIN: f64 = 0.60;
const DECAY_FLOOR: f64 = 0.20;
const EVOLVE_THRESHOLD: i64 = 5;
const DISTILL_BATCH_SIZE: usize = 20;
const PENDING_RECALL_PENALTY: f64 = 0.60;

// Intuition / appraise critic defaults (Spec §8). The appraise path reuses the
// same fused score as recall; these only govern how that score is tiered/flagged.
const APPRAISE_TIER_WEAK: f64 = 0.30;
const APPRAISE_TIER_STRONG: f64 = 0.65;
const APPRAISE_MIN_STRENGTH: f64 = 0.40;
const APPRAISE_TOP: usize = 8;
const APPRAISE_TRIGGER_HIT_MIN: f64 = 0.50;
const APPRAISE_CANDIDATE_IN_EMBED: bool = true;
// Comma-separated default for the `situation.coarse_keys` meta key.
const SITUATION_COARSE_KEYS: &str = "stage,error_class,file_type";
// Governance, failure-detection and log-compaction defaults. These back the
// `curate.governance_*`, `curate.negative_feedback_*`, `curate.failure_*`,
// `curate.log_compact_days` and `evolve.governance_pending_threshold` meta keys.
const GOVERNANCE_ARCHIVE_THRESHOLD: i64 = 3;
const NEGATIVE_FEEDBACK_ARCHIVE_THRESHOLD: i64 = 5;
const GOVERNANCE_EVOLVE_THRESHOLD: i64 = 3;
const FAILURE_MIN_USES: i64 = 5;
const FAILURE_MAX_SUCCESS_RATE: f64 = 0.20;
const FAILURE_CONFIDENCE_MAX: f64 = 0.35;
const LOG_COMPACT_DAYS: i64 = 30;
90
91// ---------------------------------------------------------------------------
92// Public result types
93// ---------------------------------------------------------------------------
94
/// Public result bundle for the recall API.
///
/// `Default` yields empty collections, an empty `trace_id` and `empty == false`.
/// NOTE(review): the exact meaning of each field is defined by the `recall`
/// submodule, which is not visible here — confirm there before relying on it.
#[derive(Debug, Default, Clone)]
pub struct RecallResult {
    /// Returned knowledge entries as JSON values.
    pub knowledge: Vec<Value>,
    /// Returned spark entries as JSON values.
    pub sparks: Vec<Value>,
    /// Identifier string for this result (presumably used to correlate follow-up events — verify).
    pub trace_id: String,
    /// Emptiness flag set by the producer.
    pub empty: bool,
    /// Identifiers skipped for depth reasons (presumably chunk ids — verify).
    pub depth_skipped: Vec<String>,
    /// String-to-string map of skip reasons (presumably keyed by chunk id — verify).
    pub skipped_reasons: HashMap<String, String>,
}
104
/// Report produced by a [`Curator::run`] pass.
///
/// Every field starts empty via `Default`; the curator implementation fills in
/// whichever categories apply. The id lists are plain strings (presumably
/// chunk ids — verify against the `curate` submodule).
#[derive(Debug, Default)]
pub struct CurateReport {
    pub archived: Vec<String>,
    pub deduped: Vec<String>,
    pub decayed: Vec<String>,
    /// Each inner vector is one cycle, expressed as a list of ids.
    pub cycles: Vec<Vec<String>>,
    pub orphans: Vec<String>,
    pub recovered: Vec<String>,
    /// Free-form warning messages.
    pub warnings: Vec<String>,
    /// Named statistics as JSON values.
    pub stats: HashMap<String, Value>,
}
116
/// Module-private pair of counters for one distillation batch; both start at
/// zero via `Default`.
#[derive(Debug, Default)]
struct DistillBatchReport {
    /// Count of distilled items.
    distilled: usize,
    /// Count of failed items.
    failed: usize,
}
122
/// Scope for a single Curate run — allows limiting governance to a subset of chunks.
///
/// `CurateScope::default()` applies no filter (`origin` and `skill_name` are
/// `None`) and has `dry_run == false`.
#[derive(Debug, Default, Clone)]
pub struct CurateScope {
    /// If set, only process chunks with this origin (e.g. "distilled").
    pub origin: Option<String>,
    /// If set, only process chunks belonging to this skill.
    pub skill_name: Option<String>,
    /// When true, compute the report but do not write any changes.
    pub dry_run: bool,
}
133
/// Replaceable governance interface (spec §2.6). Inject via `KnowledgeBase::open_with`.
/// Default implementation: `BuiltinCurator`.
///
/// Implementors must be `Send + Sync` because the knowledge base stores the
/// curator behind an `Arc<dyn Curator>`.
pub trait Curator: Send + Sync {
    /// Run one governance pass over `kb`, restricted by `scope`, and return
    /// the resulting report.
    fn run(&self, kb: &KnowledgeBase, scope: &CurateScope) -> Result<CurateReport>;
}
139
/// Built-in curator — implements the full governance pipeline (spec §4).
///
/// Used by `KnowledgeBase::open_with` when no curator is supplied.
pub struct BuiltinCurator;

impl Curator for BuiltinCurator {
    /// Delegates to `KnowledgeBase::builtin_curate_impl` with the same scope.
    fn run(&self, kb: &KnowledgeBase, scope: &CurateScope) -> Result<CurateReport> {
        kb.builtin_curate_impl(scope)
    }
}
148
149// ---------------------------------------------------------------------------
150// KnowledgeBase
151// ---------------------------------------------------------------------------
152
/// Knowledge base handle: owns the `Storage` backend, the pluggable
/// collaborators (embedding provider, refiner, distiller, curator, sanitizer)
/// and a snapshot of the tuning parameters.
///
/// `open_with` seeds every tuning field from the compiled-in defaults and then
/// `load_params` overwrites them from the storage meta table, so the values
/// reflect the metadata as it was when the handle was opened.
pub struct KnowledgeBase {
    /// Storage backend; the only public field.
    pub storage: Storage,
    embedding: Arc<dyn EmbeddingProvider>,
    refiner: Arc<dyn Refiner>,
    distiller: Arc<dyn Distiller>,
    curator: Arc<dyn Curator>,
    sanitizer: Arc<dyn Sanitizer>,

    // Tuning params (loaded from meta at init)
    // `recall.*` meta keys.
    w_content: f64,
    w_trigger: f64,
    w_confidence: f64,
    w_context: f64,
    w_activation: f64,
    top_k_candidates: usize,
    anti_trigger_penalty: f64,
    density_refill: bool,

    // `curate.*` and `evolve.*` meta keys.
    low_conf_threshold: f64,
    low_conf_idle_days: i64,
    repeat_select_min: i64,
    repeat_select_conf_max: f64,
    never_used_age_days: i64,
    open_ttl_days: i64,
    screening_timeout_minutes: i64,
    promote_used_success_min: i64,
    promote_confidence_min: f64,
    decay_floor: f64,
    evolve_threshold: i64,
    distill_batch_size: usize,
    evolve_schedule_interval_hours: i64,
    governance_archive_threshold: i64,
    negative_feedback_archive_threshold: i64,
    governance_evolve_threshold: i64,
    governance_proposal_max_age_days: i64,
    failure_min_uses: i64,
    failure_max_success_rate: f64,
    failure_confidence_max: f64,
    log_compact_days: i64,

    // Intuition / appraise critic params
    // `appraise.*` meta keys plus `situation.coarse_keys`.
    appraise_tier_weak: f64,
    appraise_tier_strong: f64,
    appraise_min_strength: f64,
    appraise_top: usize,
    appraise_trigger_hit_min: f64,
    appraise_candidate_in_embed: bool,
    situation_coarse_keys: String,
}
202
203impl KnowledgeBase {
204    pub fn open(db_path: impl AsRef<Path>) -> Result<Self> {
205        Self::open_with(db_path, None, None, None, None, None)
206    }
207
208    /// Persist a content embedding, rejecting any vector whose dimension differs
209    /// from the configured provider. A mismatched vector is silently skipped at
210    /// search time (cosine search only scores equal-dimension vectors), so an
211    /// unchecked write becomes invisible recall loss with no error. Fail closed
212    /// at the write boundary instead. All vector writers route through here.
213    pub(crate) fn store_vec_content(&self, chunk_id: &str, cvec: &[f32]) -> Result<()> {
214        let want = self.embedding.content_dim();
215        if cvec.len() != want {
216            return Err(InnateError::InvalidState(format!(
217                "content embedding dim {} != configured {want} (chunk {chunk_id})",
218                cvec.len()
219            )));
220        }
221        self.storage
222            .insert_vec_content(chunk_id, &pack_embedding(cvec))
223    }
224
225    /// Trigger-vector counterpart of [`store_vec_content`]; same fail-closed
226    /// dimension guard against `trigger_dim()`.
227    pub(crate) fn store_vec_trigger(&self, chunk_id: &str, tvec: &[f32]) -> Result<()> {
228        let want = self.embedding.trigger_dim();
229        if tvec.len() != want {
230            return Err(InnateError::InvalidState(format!(
231                "trigger embedding dim {} != configured {want} (chunk {chunk_id})",
232                tvec.len()
233            )));
234        }
235        self.storage
236            .insert_vec_trigger(chunk_id, &pack_embedding(tvec))
237    }
238
239    pub fn open_with(
240        db_path: impl AsRef<Path>,
241        embedding: Option<Arc<dyn EmbeddingProvider>>,
242        refiner: Option<Arc<dyn Refiner>>,
243        distiller: Option<Arc<dyn Distiller>>,
244        curator: Option<Arc<dyn Curator>>,
245        sanitizer: Option<Arc<dyn Sanitizer>>,
246    ) -> Result<Self> {
247        let embedding = embedding.unwrap_or_else(|| Arc::new(DummyEmbeddingProvider::default()));
248        let refiner = refiner.unwrap_or_else(|| Arc::new(NullRefiner));
249        let distiller = distiller.unwrap_or_else(|| Arc::new(HeuristicDistiller));
250        let curator = curator.unwrap_or_else(|| Arc::new(BuiltinCurator));
251        let sanitizer = sanitizer.unwrap_or_else(|| Arc::new(DefaultSanitizer));
252
253        let storage = Storage::open(db_path, embedding.content_dim(), embedding.trigger_dim())?;
254
255        let mut kb = Self {
256            storage,
257            embedding,
258            refiner,
259            distiller,
260            curator,
261            sanitizer,
262            w_content: W_CONTENT,
263            w_trigger: W_TRIGGER,
264            w_confidence: W_CONFIDENCE,
265            w_context: W_CONTEXT,
266            w_activation: W_ACTIVATION,
267            top_k_candidates: TOP_K_CANDIDATES,
268            anti_trigger_penalty: ANTI_TRIGGER_PENALTY,
269            density_refill: DENSITY_REFILL,
270            low_conf_threshold: LOW_CONF_THRESHOLD,
271            low_conf_idle_days: LOW_CONF_IDLE_DAYS,
272            repeat_select_min: REPEAT_SELECT_MIN,
273            repeat_select_conf_max: REPEAT_SELECT_CONF_MAX,
274            never_used_age_days: NEVER_USED_AGE_DAYS,
275            open_ttl_days: OPEN_TTL_DAYS,
276            screening_timeout_minutes: SCREENING_TIMEOUT_MINUTES,
277            promote_used_success_min: PROMOTE_USED_SUCCESS_MIN,
278            promote_confidence_min: PROMOTE_CONFIDENCE_MIN,
279            decay_floor: DECAY_FLOOR,
280            evolve_threshold: EVOLVE_THRESHOLD,
281            distill_batch_size: DISTILL_BATCH_SIZE,
282            evolve_schedule_interval_hours: 6,
283            governance_archive_threshold: GOVERNANCE_ARCHIVE_THRESHOLD,
284            negative_feedback_archive_threshold: NEGATIVE_FEEDBACK_ARCHIVE_THRESHOLD,
285            governance_evolve_threshold: GOVERNANCE_EVOLVE_THRESHOLD,
286            governance_proposal_max_age_days: 30,
287            failure_min_uses: FAILURE_MIN_USES,
288            failure_max_success_rate: FAILURE_MAX_SUCCESS_RATE,
289            failure_confidence_max: FAILURE_CONFIDENCE_MAX,
290            log_compact_days: LOG_COMPACT_DAYS,
291            appraise_tier_weak: APPRAISE_TIER_WEAK,
292            appraise_tier_strong: APPRAISE_TIER_STRONG,
293            appraise_min_strength: APPRAISE_MIN_STRENGTH,
294            appraise_top: APPRAISE_TOP,
295            appraise_trigger_hit_min: APPRAISE_TRIGGER_HIT_MIN,
296            appraise_candidate_in_embed: APPRAISE_CANDIDATE_IN_EMBED,
297            situation_coarse_keys: SITUATION_COARSE_KEYS.to_string(),
298        };
299        kb.init_meta()?;
300        kb.load_params()?;
301        Ok(kb)
302    }
303
304    fn init_meta(&self) -> Result<()> {
305        let lib_id = gen_uuid();
306        let content_dim = self.embedding.content_dim().to_string();
307        let trigger_dim = self.embedding.trigger_dim().to_string();
308        let embed_model = self.embedding.model_name();
309
310        for (key, expected) in [
311            ("content_dim", self.embedding.content_dim()),
312            ("trigger_dim", self.embedding.trigger_dim()),
313        ] {
314            if let Some(stored) = self.storage.get_meta(key)? {
315                let actual = stored.parse::<usize>().map_err(|_| {
316                    InnateError::Other(format!("invalid {key} metadata value: {stored}"))
317                })?;
318                if actual != expected {
319                    return Err(InnateError::Other(format!(
320                        "{key} mismatch: database uses {actual}, embedding provider uses {expected}"
321                    )));
322                }
323            }
324        }
325
326        let defaults: &[(&str, &str)] = &[
327            ("lib_id", &lib_id),
328            ("lib_role", "personal"),
329            ("schema_version", "4.14"),
330            ("content_dim", &content_dim),
331            ("trigger_dim", &trigger_dim),
332            ("embed_model", embed_model),
333            ("embed_version", "1"),
334            ("vector_revision", "0"),
335            ("last_agg_ts", "1970-01-01T00:00:00.000Z"),
336            ("recall.w_content", "0.55"),
337            ("recall.w_trigger", "0.25"),
338            ("recall.w_confidence", "0.10"),
339            ("recall.w_context", "0.15"),
340            ("recall.w_activation", "0.08"),
341            ("recall.top_k_candidates", "20"),
342            ("recall.anti_trigger_penalty", "0.6"),
343            ("recall.density_refill", "true"),
344            ("curate.low_conf_threshold", "0.25"),
345            ("curate.low_conf_idle_days", "60"),
346            ("curate.repeat_select_min", "10"),
347            ("curate.repeat_select_conf_max", "0.5"),
348            ("curate.never_used_age_days", "30"),
349            ("curate.open_ttl_days", "14"),
350            ("curate.screening_timeout_minutes", "30"),
351            ("curate.promote_used_success_min", "3"),
352            ("curate.promote_confidence_min", "0.60"),
353            ("curate.decay_floor", "0.20"),
354            ("evolve.threshold_new_count", "5"),
355            ("evolve.distill_batch_size", "20"),
356            ("evolve.schedule_interval_hours", "6"),
357            ("curate.soft_mature_threshold", "5"),
358            ("evolve.distill_token_window_hours", "24"),
359            ("curate.governance_archive_threshold", "3"),
360            ("curate.negative_feedback_archive_threshold", "5"),
361            ("evolve.governance_pending_threshold", "3"),
362            ("curate.governance_proposal_max_age_days", "30"),
363            ("curate.failure_min_uses", "5"),
364            ("curate.failure_max_success_rate", "0.20"),
365            ("curate.failure_confidence_max", "0.35"),
366            ("curate.log_compact_days", "30"),
367            ("appraise.tier_weak", "0.30"),
368            ("appraise.tier_strong", "0.65"),
369            ("appraise.min_strength", "0.40"),
370            ("appraise.top", "8"),
371            ("appraise.trigger_hit_min", "0.50"),
372            ("appraise.candidate_in_embed", "true"),
373            ("situation.coarse_keys", "stage,error_class,file_type"),
374        ];
375        self.storage.begin_immediate()?;
376        let result = (|| -> Result<()> {
377            for (k, v) in defaults {
378                if self.storage.get_meta(k)?.is_none() {
379                    self.storage.set_meta(k, v)?;
380                }
381            }
382            self.storage.commit()
383        })();
384        if result.is_err() {
385            let _ = self.storage.rollback();
386        }
387        result
388    }
389
390    fn load_params(&mut self) -> Result<()> {
391        let f = |k: &str, d: f64| -> f64 {
392            self.storage
393                .get_meta(k)
394                .ok()
395                .flatten()
396                .and_then(|v| v.parse().ok())
397                .unwrap_or(d)
398        };
399        let i = |k: &str, d: i64| -> i64 {
400            self.storage
401                .get_meta(k)
402                .ok()
403                .flatten()
404                .and_then(|v| v.parse().ok())
405                .unwrap_or(d)
406        };
407        let b = |k: &str, d: bool| -> bool {
408            self.storage
409                .get_meta(k)
410                .ok()
411                .flatten()
412                .map(|v| v.to_lowercase() == "true")
413                .unwrap_or(d)
414        };
415        self.w_content = f("recall.w_content", W_CONTENT);
416        self.w_trigger = f("recall.w_trigger", W_TRIGGER);
417        self.w_confidence = f("recall.w_confidence", W_CONFIDENCE);
418        self.w_context = f("recall.w_context", W_CONTEXT);
419        self.w_activation = f("recall.w_activation", W_ACTIVATION);
420        self.top_k_candidates =
421            i("recall.top_k_candidates", TOP_K_CANDIDATES as i64).max(1) as usize;
422        self.anti_trigger_penalty = f("recall.anti_trigger_penalty", ANTI_TRIGGER_PENALTY);
423        self.density_refill = b("recall.density_refill", DENSITY_REFILL);
424        self.low_conf_threshold = f("curate.low_conf_threshold", LOW_CONF_THRESHOLD);
425        self.low_conf_idle_days = i("curate.low_conf_idle_days", LOW_CONF_IDLE_DAYS);
426        self.repeat_select_min = i("curate.repeat_select_min", REPEAT_SELECT_MIN);
427        self.repeat_select_conf_max = f("curate.repeat_select_conf_max", REPEAT_SELECT_CONF_MAX);
428        self.never_used_age_days = i("curate.never_used_age_days", NEVER_USED_AGE_DAYS);
429        self.open_ttl_days = i("curate.open_ttl_days", OPEN_TTL_DAYS);
430        self.screening_timeout_minutes = i(
431            "curate.screening_timeout_minutes",
432            SCREENING_TIMEOUT_MINUTES,
433        );
434        self.promote_used_success_min =
435            i("curate.promote_used_success_min", PROMOTE_USED_SUCCESS_MIN);
436        self.promote_confidence_min = f("curate.promote_confidence_min", PROMOTE_CONFIDENCE_MIN);
437        self.decay_floor = f("curate.decay_floor", DECAY_FLOOR).clamp(0.0, 0.4);
438        self.evolve_threshold = i("evolve.threshold_new_count", EVOLVE_THRESHOLD);
439        self.distill_batch_size =
440            i("evolve.distill_batch_size", DISTILL_BATCH_SIZE as i64) as usize;
441        self.evolve_schedule_interval_hours = i("evolve.schedule_interval_hours", 6).max(1);
442        self.governance_archive_threshold = i(
443            "curate.governance_archive_threshold",
444            GOVERNANCE_ARCHIVE_THRESHOLD,
445        )
446        .max(1);
447        self.negative_feedback_archive_threshold = i(
448            "curate.negative_feedback_archive_threshold",
449            NEGATIVE_FEEDBACK_ARCHIVE_THRESHOLD,
450        )
451        .max(1);
452        self.governance_evolve_threshold = i(
453            "evolve.governance_pending_threshold",
454            GOVERNANCE_EVOLVE_THRESHOLD,
455        )
456        .max(1);
457        self.governance_proposal_max_age_days =
458            i("curate.governance_proposal_max_age_days", 30).max(1);
459        self.failure_min_uses = i("curate.failure_min_uses", FAILURE_MIN_USES).max(1);
460        self.failure_max_success_rate =
461            f("curate.failure_max_success_rate", FAILURE_MAX_SUCCESS_RATE).clamp(0.0, 1.0);
462        self.failure_confidence_max =
463            f("curate.failure_confidence_max", FAILURE_CONFIDENCE_MAX).clamp(0.0, 1.0);
464        self.log_compact_days = i("curate.log_compact_days", LOG_COMPACT_DAYS).max(1);
465        let s = |k: &str, d: &str| -> String {
466            self.storage
467                .get_meta(k)
468                .ok()
469                .flatten()
470                .filter(|v| !v.trim().is_empty())
471                .unwrap_or_else(|| d.to_string())
472        };
473        self.appraise_tier_weak = f("appraise.tier_weak", APPRAISE_TIER_WEAK).clamp(0.0, 1.0);
474        self.appraise_tier_strong =
475            f("appraise.tier_strong", APPRAISE_TIER_STRONG).clamp(0.0, 1.0);
476        self.appraise_min_strength =
477            f("appraise.min_strength", APPRAISE_MIN_STRENGTH).clamp(0.0, 1.0);
478        self.appraise_top = i("appraise.top", APPRAISE_TOP as i64).max(1) as usize;
479        self.appraise_trigger_hit_min =
480            f("appraise.trigger_hit_min", APPRAISE_TRIGGER_HIT_MIN).clamp(0.0, 1.0);
481        self.appraise_candidate_in_embed =
482            b("appraise.candidate_in_embed", APPRAISE_CANDIDATE_IN_EMBED);
483        self.situation_coarse_keys = s("situation.coarse_keys", SITUATION_COARSE_KEYS);
484        Ok(())
485    }
486}
487
488// ---------------------------------------------------------------------------
489// Helpers
490// ---------------------------------------------------------------------------
491
/// Module-private record pairing one chunk (as a JSON value) with two
/// similarity scores.
/// NOTE(review): presumably a recall candidate before score fusion — the
/// consumer lives in a submodule not shown here; confirm there.
struct CandidateInfo {
    /// The chunk row as JSON.
    chunk: Value,
    /// Similarity score for the content vector.
    sim_content: f32,
    /// Similarity score for the trigger vector.
    sim_trigger: f32,
}
497
498fn chunk_is_valid_for_recall(chunk: &Value, embed_version: i64) -> bool {
499    chunk.get("state").and_then(Value::as_str) != Some("archived")
500        && chunk.get("origin").and_then(Value::as_str) != Some("spark")
501        && chunk
502            .get("embed_version")
503            .and_then(Value::as_i64)
504            .unwrap_or(1)
505            >= embed_version
506}
507
/// Normalize a query string before hashing into a context_key.
///
/// Goal: make queries that use the same significant words land in the same
/// context_stat bucket regardless of case, spacing, punctuation, word order
/// or repetition.
///
/// Steps:
/// 1. lowercase the whole query;
/// 2. replace every character that is neither alphanumeric nor whitespace
///    with a space;
/// 3. split on whitespace and drop a small fixed list of English stop words;
/// 4. sort the remaining tokens, remove duplicates, and join with one space.
///
/// No stemming is performed. The result is an empty string when nothing but
/// stop words or punctuation remains.
fn normalize_query(query: &str) -> String {
    const STOP_WORDS: &[&str] = &[
        "a", "an", "and", "for", "in", "of", "on", "the", "to", "with",
    ];

    // Lowercase first, then blank out punctuation so it acts as a separator.
    let lowered = query.to_lowercase();
    let mut cleaned = String::with_capacity(lowered.len());
    for ch in lowered.chars() {
        let keep = ch.is_alphanumeric() || ch.is_whitespace();
        cleaned.push(if keep { ch } else { ' ' });
    }

    let mut words: Vec<&str> = Vec::new();
    for word in cleaned.split_whitespace() {
        if !STOP_WORDS.contains(&word) {
            words.push(word);
        }
    }

    // Sorting makes the key order-insensitive and puts duplicates side by side.
    words.sort_unstable();
    words.dedup();
    words.join(" ")
}
539
540fn estimate_distill_prompt_tokens(log: &Value, related_logs: &[Value]) -> i64 {
541    let primary: i64 = [
542        "query",
543        "recall_snapshot",
544        "output",
545        "output_summary",
546        "nomination",
547    ]
548    .iter()
549    .filter_map(|key| log.get(*key).and_then(Value::as_str))
550    .map(|text| estimate_tokens(text) as i64)
551    .sum();
552    let log_id = log.get("id").and_then(Value::as_str).unwrap_or("");
553    let context_key = log.get("context_key").and_then(Value::as_str);
554    let related: i64 = related_logs
555        .iter()
556        .filter(|other| other.get("id").and_then(Value::as_str).unwrap_or("") != log_id)
557        .filter(|other| {
558            context_key.is_some() && other.get("context_key").and_then(Value::as_str) == context_key
559        })
560        .take(4)
561        .flat_map(|other| {
562            ["query", "output_summary", "outcome"]
563                .into_iter()
564                .filter_map(|key| other.get(key).and_then(Value::as_str))
565        })
566        .map(|text| estimate_tokens(text) as i64)
567        .sum();
568    primary + related
569}
570
571fn estimate_distilled_chunk_tokens(chunk: &DistilledChunk) -> i64 {
572    estimate_tokens(&chunk.content) as i64
573        + chunk
574            .trigger_desc
575            .as_deref()
576            .map(estimate_tokens)
577            .unwrap_or(0) as i64
578        + chunk
579            .anti_trigger_desc
580            .as_deref()
581            .map(estimate_tokens)
582            .unwrap_or(0) as i64
583}
584
/// Report whether `query` contains any of the comma-separated phrases in
/// `anti`.
///
/// Matching is a case-insensitive substring test. Each phrase is trimmed
/// first, and empty phrases are ignored, so an empty or comma-only `anti`
/// never matches.
fn anti_trigger_hit(query: &str, anti: &str) -> bool {
    let haystack = query.to_lowercase();
    let phrases = anti.to_lowercase();
    for raw in phrases.split(',') {
        let needle = raw.trim();
        if needle.is_empty() {
            continue;
        }
        if haystack.contains(needle) {
            return true;
        }
    }
    false
}
592
593fn block_cost(block: &[Value]) -> usize {
594    block
595        .iter()
596        .map(|b| {
597            b.get("token_count")
598                .and_then(Value::as_u64)
599                .map(|t| t as usize)
600                .unwrap_or_else(|| {
601                    estimate_tokens(b.get("content").and_then(Value::as_str).unwrap_or("")).max(100)
602                })
603        })
604        .sum()
605}
606
607fn limit_knowledge(knowledge: Vec<Value>, top: Option<usize>) -> Vec<Value> {
608    match top {
609        None => knowledge,
610        Some(0) => vec![],
611        Some(n) => knowledge.into_iter().take(n).collect(),
612    }
613}
614
/// Classify an optional "used" list into a three-way label:
/// `"unknown"` when no list was given, `"known_none"` for an empty list and
/// `"known_some"` for a non-empty one.
fn usage_state(used: Option<&[String]>) -> &'static str {
    let Some(items) = used else {
        return "unknown";
    };
    if items.is_empty() {
        "known_none"
    } else {
        "known_some"
    }
}
622
/// `numerator / denominator` rounded to three decimal places.
///
/// Returns `0.0` when `denominator` is zero or negative.
fn ratio(numerator: i64, denominator: i64) -> f64 {
    if denominator > 0 {
        let quotient = numerator as f64 / denominator as f64;
        (quotient * 1000.0).round() / 1000.0
    } else {
        0.0
    }
}
630
631fn validate_source(source: &str) -> Result<()> {
632    if !matches!(
633        source,
634        "mcp" | "sdk" | "cli" | "hook" | "daemon" | "augmented"
635    ) {
636        return Err(InnateError::InvalidState(format!(
637            "invalid event source: {source}"
638        )));
639    }
640    Ok(())
641}
642
643fn count_query(storage: &Storage, sql: &str) -> Result<i64> {
644    Ok(storage
645        .query_chunks(sql)?
646        .first()
647        .and_then(|r| r.as_object())
648        .and_then(|m| m.values().next())
649        .and_then(Value::as_i64)
650        .unwrap_or(0))
651}
652
653fn count_query_params<P: rusqlite::Params>(storage: &Storage, sql: &str, p: P) -> Result<i64> {
654    Ok(storage
655        .query_chunks_params(sql, p)?
656        .first()
657        .and_then(|r| r.as_object())
658        .and_then(|m| m.values().next())
659        .and_then(Value::as_i64)
660        .unwrap_or(0))
661}
662
663fn days_ago(now_iso: &str, days: i64) -> String {
664    use chrono::{DateTime, Duration, Utc};
665    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
666        let cutoff = t - Duration::days(days);
667        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
668    }
669    now_iso.to_string()
670}
671
672fn minutes_ago(now_iso: &str, minutes: i64) -> String {
673    use chrono::{DateTime, Duration, Utc};
674    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
675        let cutoff = t - Duration::minutes(minutes);
676        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
677    }
678    now_iso.to_string()
679}
680
681fn hours_ago(now_iso: &str, hours: i64) -> String {
682    use chrono::{DateTime, Duration, Utc};
683    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
684        let cutoff = t - Duration::hours(hours);
685        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
686    }
687    now_iso.to_string()
688}
689
690fn minutes_after(now_iso: &str, minutes: i64) -> String {
691    use chrono::{DateTime, Duration, Utc};
692    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
693        let cutoff = t + Duration::minutes(minutes);
694        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
695    }
696    now_iso.to_string()
697}
698
699fn hours_after(now_iso: &str, hours: i64) -> String {
700    use chrono::{DateTime, Duration, Utc};
701    if let Ok(t) = now_iso.parse::<DateTime<Utc>>() {
702        let cutoff = t + Duration::hours(hours);
703        return cutoff.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string();
704    }
705    now_iso.to_string()
706}
707
708/// Return the number of whole days between two ISO timestamps (now - past; clamped ≥ 0).
709fn iso_days_diff(now_iso: &str, past_iso: &str) -> i64 {
710    use chrono::{DateTime, Utc};
711    let parse = |s: &str| s.parse::<DateTime<Utc>>().ok();
712    if let (Some(a), Some(b)) = (parse(now_iso), parse(past_iso)) {
713        let diff = a - b;
714        diff.num_days().max(0)
715    } else {
716        0
717    }
718}
719
720/// Fractional days between two ISO timestamps (≥ 0). Finer than `iso_days_diff`
721/// so the activation recency term keeps sub-day resolution.
722fn iso_fractional_days(now_iso: &str, past_iso: &str) -> f64 {
723    use chrono::{DateTime, Utc};
724    let parse = |s: &str| s.parse::<DateTime<Utc>>().ok();
725    if let (Some(a), Some(b)) = (parse(now_iso), parse(past_iso)) {
726        ((a - b).num_seconds().max(0)) as f64 / 86_400.0
727    } else {
728        0.0
729    }
730}
731
732/// ACT-R decay exponent for the base-level activation recency term.
733const ACTR_DECAY: f64 = 0.5;
734
735/// ACT-R-inspired base-level activation, bounded to `(0, 1)`.
736///
737/// Fuses **frequency** (how often a chunk has been used) and **recency** (time
738/// since last use) into one re-ranking signal, following the standard ACT-R
739/// approximation `B = ln(n) − d·ln(t)` (Petrov 2006), here using
740/// `B = ln(1 + used_count) − d·ln(1 + recency_days)` and squashed with a
741/// logistic so it stays on the same `[0, 1]` scale as the other fused-score
742/// terms (content/trigger sim, confidence, context).
743///
744/// Returns `0.0` for never-used chunks (no usage history → no boost), which
745/// keeps recall **zero-regression** for freshly-added knowledge: a chunk with
746/// `used_count == 0` contributes nothing to the fused score.
747pub(super) fn actr_activation(used_count: i64, last_used_at: Option<&str>, now_iso: &str) -> f64 {
748    if used_count <= 0 {
749        return 0.0;
750    }
751    let Some(last) = last_used_at else {
752        return 0.0;
753    };
754    let recency_days = iso_fractional_days(now_iso, last);
755    let b = (1.0 + used_count as f64).ln() - ACTR_DECAY * (1.0 + recency_days).ln();
756    1.0 / (1.0 + (-b).exp())
757}
758
759/// DFS-based cycle detection on the hard-dep graph. Returns list of cycles (each is a Vec of ids).
760fn detect_cycles(deps: &[Value]) -> Vec<Vec<String>> {
761    use std::collections::HashMap;
762    let mut adj: HashMap<String, Vec<String>> = HashMap::new();
763    for d in deps {
764        let src = d
765            .get("src")
766            .and_then(Value::as_str)
767            .unwrap_or("")
768            .to_string();
769        let dst = d
770            .get("dst")
771            .and_then(Value::as_str)
772            .unwrap_or("")
773            .to_string();
774        if !src.is_empty() && !dst.is_empty() {
775            adj.entry(src).or_default().push(dst);
776        }
777    }
778    let nodes: Vec<String> = adj.keys().cloned().collect();
779    let mut visited: HashSet<String> = HashSet::new();
780    let mut on_stack: HashSet<String> = HashSet::new();
781    let mut cycles: Vec<Vec<String>> = vec![];
782
783    fn dfs(
784        node: &str,
785        adj: &HashMap<String, Vec<String>>,
786        visited: &mut HashSet<String>,
787        on_stack: &mut HashSet<String>,
788        path: &mut Vec<String>,
789        cycles: &mut Vec<Vec<String>>,
790    ) {
791        if on_stack.contains(node) {
792            // Found cycle — extract loop segment.
793            let start = path.iter().position(|n| n == node).unwrap_or(0);
794            cycles.push(path[start..].to_vec());
795            return;
796        }
797        if visited.contains(node) {
798            return;
799        }
800        visited.insert(node.to_string());
801        on_stack.insert(node.to_string());
802        path.push(node.to_string());
803        if let Some(children) = adj.get(node) {
804            for child in children {
805                dfs(child, adj, visited, on_stack, path, cycles);
806            }
807        }
808        path.pop();
809        on_stack.remove(node);
810    }
811
812    for node in nodes {
813        let mut path = vec![];
814        dfs(
815            &node,
816            &adj,
817            &mut visited,
818            &mut on_stack,
819            &mut path,
820            &mut cycles,
821        );
822    }
823    cycles
824}