lean-ctx 3.1.5

Context Runtime for AI Agents with CCP (cross-session memory). 42 MCP tools, 10 read modes, 90+ compression patterns, persistent AI knowledge with temporal facts and contradiction detection, multi-agent context sharing and diaries, LITM-aware positioning, AAAK compact format, and adaptive compression with Thompson Sampling bandits. Supports 24 AI tools and reduces LLM token consumption by up to 99%.
//! Embedding-based Knowledge Retrieval for `ctx_knowledge`.
//!
//! Wraps `ProjectKnowledge` with a vector index for semantic recall.
//! Facts are automatically embedded on `remember` and searched via
//! cosine similarity on `recall`, with hybrid exact + semantic ranking.
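//!
//! A minimal usage sketch (hedged: `ProjectKnowledge::load` and
//! `EmbeddingEngine::new` are assumed constructors, not defined in this
//! module; `semantic_recall` requires the `embeddings` feature):
//!
//! ```ignore
//! let knowledge = ProjectKnowledge::load("abc123")?;   // assumed API
//! let engine = EmbeddingEngine::new()?;                // assumed API
//! let index = KnowledgeEmbeddingIndex::load("abc123")
//!     .unwrap_or_else(|| KnowledgeEmbeddingIndex::new("abc123"));
//!
//! // Blend semantic, confidence, and recency scores; print compactly.
//! let hits = semantic_recall(&knowledge, &index, &engine, "which database?", 5);
//! println!("{}", format_scored_facts(&hits));
//! ```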

use std::path::PathBuf;

use serde::{Deserialize, Serialize};

use super::knowledge::{KnowledgeFact, ProjectKnowledge};

#[cfg(feature = "embeddings")]
use super::embeddings::{cosine_similarity, EmbeddingEngine};

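// Hybrid ranking weights: score = ALPHA_SEMANTIC * similarity
// + BETA_CONFIDENCE * confidence + GAMMA_RECENCY * recency.
// They sum to 1.0, so the blended score stays in [0, 1] when each
// component is in [0, 1].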
const ALPHA_SEMANTIC: f32 = 0.6;
const BETA_CONFIDENCE: f32 = 0.25;
const GAMMA_RECENCY: f32 = 0.15;
const MAX_RECENCY_DAYS: f32 = 90.0;

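/// An embedding vector for a single fact, keyed by `(category, key)`.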
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FactEmbedding {
    pub category: String,
    pub key: String,
    pub embedding: Vec<f32>,
}

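/// Per-project vector index over fact embeddings, persisted as JSON under
/// `~/.lean-ctx/knowledge/`.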
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KnowledgeEmbeddingIndex {
    pub project_hash: String,
    pub entries: Vec<FactEmbedding>,
}

impl KnowledgeEmbeddingIndex {
    pub fn new(project_hash: &str) -> Self {
        Self {
            project_hash: project_hash.to_string(),
            entries: Vec::new(),
        }
    }

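    /// Inserts the embedding for `(category, key)`, replacing any existing entry.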
    pub fn upsert(&mut self, category: &str, key: &str, embedding: Vec<f32>) {
        if let Some(existing) = self
            .entries
            .iter_mut()
            .find(|e| e.category == category && e.key == key)
        {
            existing.embedding = embedding;
        } else {
            self.entries.push(FactEmbedding {
                category: category.to_string(),
                key: key.to_string(),
                embedding,
            });
        }
    }

    pub fn remove(&mut self, category: &str, key: &str) {
        self.entries
            .retain(|e| !(e.category == category && e.key == key));
    }

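    /// Brute-force semantic search: scores every stored embedding by cosine
    /// similarity against `query_embedding` and returns the `top_k` matches,
    /// best first. Linear in the number of entries.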
    #[cfg(feature = "embeddings")]
    pub fn semantic_search(
        &self,
        query_embedding: &[f32],
        top_k: usize,
    ) -> Vec<(&FactEmbedding, f32)> {
        let mut scored: Vec<(&FactEmbedding, f32)> = self
            .entries
            .iter()
            .map(|e| {
                let sim = cosine_similarity(query_embedding, &e.embedding);
                (e, sim)
            })
            .collect();

        scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
        scored.truncate(top_k);
        scored
    }

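    /// Index file location: `~/.lean-ctx/knowledge/{project_hash}.embeddings.json`.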
    fn index_path(project_hash: &str) -> Option<PathBuf> {
        let dir = dirs::home_dir()?.join(".lean-ctx").join("knowledge");
        Some(dir.join(format!("{project_hash}.embeddings.json")))
    }

    pub fn load(project_hash: &str) -> Option<Self> {
        let path = Self::index_path(project_hash)?;
        let data = std::fs::read_to_string(path).ok()?;
        serde_json::from_str(&data).ok()
    }

    pub fn save(&self) -> Result<(), String> {
        let path = Self::index_path(&self.project_hash)
            .ok_or_else(|| "Cannot determine home directory".to_string())?;
        if let Some(dir) = path.parent() {
            std::fs::create_dir_all(dir).map_err(|e| format!("{e}"))?;
        }
        let json = serde_json::to_string(self).map_err(|e| format!("{e}"))?;
        std::fs::write(path, json).map_err(|e| format!("{e}"))
    }
}

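/// A recalled fact with its blended score and the per-component
/// breakdown that produced it.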
#[derive(Debug)]
pub struct ScoredFact<'a> {
    pub fact: &'a KnowledgeFact,
    pub score: f32,
    pub semantic_score: f32,
    pub confidence_score: f32,
    pub recency_score: f32,
}

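/// Hybrid recall: embeds `query`, scores each current fact as
/// `ALPHA_SEMANTIC * similarity + BETA_CONFIDENCE * confidence +
/// GAMMA_RECENCY * recency`, then merges in exact matches from
/// `ProjectKnowledge::recall` at a pinned score of 1.0 before truncating
/// to `top_k`. Falls back to lexical recall if embedding the query fails.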
#[cfg(feature = "embeddings")]
pub fn semantic_recall<'a>(
    knowledge: &'a ProjectKnowledge,
    index: &KnowledgeEmbeddingIndex,
    engine: &EmbeddingEngine,
    query: &str,
    top_k: usize,
) -> Vec<ScoredFact<'a>> {
    let query_embedding = match engine.embed(query) {
        Ok(e) => e,
        Err(_) => return lexical_fallback(knowledge, query, top_k),
    };

    let semantic_hits = index.semantic_search(&query_embedding, top_k * 2);

    let mut results: Vec<ScoredFact<'a>> = Vec::new();

    for (entry, sim) in &semantic_hits {
        if let Some(fact) = knowledge
            .facts
            .iter()
            .find(|f| f.category == entry.category && f.key == entry.key && f.is_current())
        {
            let confidence_score = fact.confidence;
            let recency_score = recency_decay(fact);
            let score = ALPHA_SEMANTIC * sim
                + BETA_CONFIDENCE * confidence_score
                + GAMMA_RECENCY * recency_score;

            results.push(ScoredFact {
                fact,
                score,
                semantic_score: *sim,
                confidence_score,
                recency_score,
            });
        }
    }

    let exact_matches = knowledge.recall(query);
    for fact in exact_matches {
        let already_included = results
            .iter()
            .any(|r| r.fact.category == fact.category && r.fact.key == fact.key);
        if !already_included {
            results.push(ScoredFact {
                fact,
                score: 1.0,
                semantic_score: 1.0,
                confidence_score: fact.confidence,
                recency_score: recency_decay(fact),
            });
        }
    }

    results.sort_by(|a, b| {
        b.score
            .partial_cmp(&a.score)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    results.truncate(top_k);
    results
}

fn lexical_fallback<'a>(
    knowledge: &'a ProjectKnowledge,
    query: &str,
    top_k: usize,
) -> Vec<ScoredFact<'a>> {
    knowledge
        .recall(query)
        .into_iter()
        .take(top_k)
        .map(|fact| ScoredFact {
            fact,
            score: fact.confidence,
            semantic_score: 0.0,
            confidence_score: fact.confidence,
            recency_score: recency_decay(fact),
        })
        .collect()
}

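/// Linear recency decay based on `last_confirmed`:
/// `max(0, 1 - age_days / MAX_RECENCY_DAYS)`. A fact confirmed today
/// scores 1.0; anything 90 or more days old scores 0.0.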
fn recency_decay(fact: &KnowledgeFact) -> f32 {
    let days_old = chrono::Utc::now()
        .signed_duration_since(fact.last_confirmed)
        .num_days() as f32;
    (1.0 - days_old / MAX_RECENCY_DAYS).max(0.0)
}

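/// Embeds the fact as the single string `"{category} {key}: {value}"` and
/// upserts the vector into the index. Persisting the index (via `save`) is
/// left to the caller.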
#[cfg(feature = "embeddings")]
pub fn embed_and_store(
    index: &mut KnowledgeEmbeddingIndex,
    engine: &EmbeddingEngine,
    category: &str,
    key: &str,
    value: &str,
) -> Result<(), String> {
    let text = format!("{category} {key}: {value}");
    let embedding = engine.embed(&text).map_err(|e| format!("{e}"))?;
    index.upsert(category, key, embedding);
    Ok(())
}

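/// Renders results in a compact pipe-delimited form: `category:key=value`,
/// confidence stars, and the blended score as a percentage, e.g.
/// `arch:db=PostgreSQL★★★★ [s:85%]`.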
pub fn format_scored_facts(results: &[ScoredFact<'_>]) -> String {
    if results.is_empty() {
        return "No matching facts found.".to_string();
    }

    let mut output = String::new();
    for (i, scored) in results.iter().enumerate() {
        let f = scored.fact;
        let stars = if f.confidence >= 0.9 {
            "★★★★"
        } else if f.confidence >= 0.7 {
            "★★★"
        } else if f.confidence >= 0.5 {
            "★★"
        } else {
            ""
        };

        if i > 0 {
            output.push('|');
        }
        output.push_str(&format!(
            "{}:{}={}{} [s:{:.0}%]",
            f.category,
            f.key,
            f.value,
            stars,
            scored.score * 100.0
        ));
    }
    output
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn index_upsert_and_remove() {
        let mut idx = KnowledgeEmbeddingIndex::new("test");
        idx.upsert("arch", "db", vec![1.0, 0.0, 0.0]);
        assert_eq!(idx.entries.len(), 1);

        idx.upsert("arch", "db", vec![0.0, 1.0, 0.0]);
        assert_eq!(idx.entries.len(), 1);
        assert_eq!(idx.entries[0].embedding[1], 1.0);

        idx.upsert("arch", "cache", vec![0.0, 0.0, 1.0]);
        assert_eq!(idx.entries.len(), 2);

        idx.remove("arch", "db");
        assert_eq!(idx.entries.len(), 1);
        assert_eq!(idx.entries[0].key, "cache");
    }

    #[test]
    fn recency_decay_recent() {
        let fact = KnowledgeFact {
            category: "test".to_string(),
            key: "k".to_string(),
            value: "v".to_string(),
            source_session: "s".to_string(),
            confidence: 0.9,
            created_at: chrono::Utc::now(),
            last_confirmed: chrono::Utc::now(),
            valid_from: None,
            valid_until: None,
            supersedes: None,
            confirmation_count: 1,
        };
        let decay = recency_decay(&fact);
        assert!(
            decay > 0.95,
            "Recent fact should have high recency: {decay}"
        );
    }

    #[test]
    fn recency_decay_old() {
        let old_date = chrono::Utc::now() - chrono::Duration::days(100);
        let fact = KnowledgeFact {
            category: "test".to_string(),
            key: "k".to_string(),
            value: "v".to_string(),
            source_session: "s".to_string(),
            confidence: 0.5,
            created_at: old_date,
            last_confirmed: old_date,
            valid_from: None,
            valid_until: None,
            supersedes: None,
            confirmation_count: 1,
        };
        let decay = recency_decay(&fact);
        assert_eq!(decay, 0.0, "100-day-old fact should have 0 recency");
    }

    #[cfg(feature = "embeddings")]
    #[test]
    fn semantic_search_ranking() {
        let mut idx = KnowledgeEmbeddingIndex::new("test");
        idx.upsert("arch", "db", vec![1.0, 0.0, 0.0]);
        idx.upsert("arch", "cache", vec![0.0, 1.0, 0.0]);
        idx.upsert("ops", "deploy", vec![0.5, 0.5, 0.0]);

        let query = vec![1.0, 0.0, 0.0];
        let results = idx.semantic_search(&query, 2);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].0.key, "db");
    }

    #[test]
    fn format_scored_empty() {
        assert_eq!(format_scored_facts(&[]), "No matching facts found.");
    }

    #[test]
    fn format_scored_output() {
        let fact = KnowledgeFact {
            category: "arch".to_string(),
            key: "db".to_string(),
            value: "PostgreSQL".to_string(),
            source_session: "s1".to_string(),
            confidence: 0.95,
            created_at: chrono::Utc::now(),
            last_confirmed: chrono::Utc::now(),
            valid_from: None,
            valid_until: None,
            supersedes: None,
            confirmation_count: 3,
        };
        let scored = vec![ScoredFact {
            fact: &fact,
            score: 0.85,
            semantic_score: 0.9,
            confidence_score: 0.95,
            recency_score: 1.0,
        }];
        let output = format_scored_facts(&scored);
        assert!(output.contains("arch:db=PostgreSQL"));
        assert!(output.contains("★★★★"));
        assert!(output.contains("[s:85%]"));
    }
}