car-memgine 0.14.0

//! The memory graph — nodes are memory entries, edges are relationships.
//!
//! Replaces ImmutableStore + LayerState + 6 HashMaps with a single graph.

use chrono::{DateTime, Utc};
use petgraph::stable_graph::{NodeIndex, StableGraph};
use petgraph::visit::EdgeRef;
use petgraph::Direction;
use std::collections::{HashMap, HashSet, VecDeque};

/// Numeric memory layer a node belongs to.
///
/// `MemoryGraph` buckets nodes into four per-layer slots, mapping layer `n`
/// to slot `n - 1`; layer `0` shares slot 0 and anything above `4` shares
/// slot 3.
pub type Layer = u8;

/// Namespace for memory nodes. Keeps external/foreign code knowledge from
/// polluting the project's own memory graph during spreading activation.
///
/// Stored in a side-table keyed by `NodeIndex` so adding partitions does not
/// require rewriting every `MemNode { .. }` literal. Absence from the side
/// table means `Project`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Partition {
    /// The host project's own memory. Default for all legacy call sites.
    Project,
    /// Knowledge harvested from an external repository at a specific commit.
    /// The `(source_repo, commit)` pair identifies the snapshot; de-duplication
    /// on re-ingestion is performed by `MemoryGraph` using the node's `fact_id`.
    Foreign { source_repo: String, commit: String },
}

impl Partition {
    /// `true` when this is the host project's own partition.
    pub fn is_project(&self) -> bool {
        match self {
            Partition::Project => true,
            Partition::Foreign { .. } => false,
        }
    }

    /// `true` when this partition holds knowledge from an external repository.
    pub fn is_foreign(&self) -> bool {
        match self {
            Partition::Foreign { .. } => true,
            Partition::Project => false,
        }
    }
}

/// Programming language tag carried by `ContentType::Code`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CodeLanguage {
    Rust,
    Python,
    JavaScript,
    TypeScript,
    Go,
    Java,
    Cpp,
    Shell,
    Sql,
    /// Content is known to be code but its language is not identified
    /// (e.g. when parsed back from the bare `"code"` label).
    Unknown,
}

/// Coarse classification of a memory entry's text.
///
/// `NaturalLanguage` is the `Default` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum ContentType {
    /// Plain prose; also the fallback when nothing else matches.
    #[default]
    NaturalLanguage,
    /// Source code, tagged with its detected language.
    Code(CodeLanguage),
    /// Structured data such as JSON, YAML, or key/value configuration.
    StructuredData,
}

impl ContentType {
    /// `true` for every `Code(_)` variant, whatever the language.
    pub fn is_code(&self) -> bool {
        self.language().is_some()
    }

    /// The language tag of a `Code(_)` value, or `None` for non-code content.
    pub fn language(&self) -> Option<CodeLanguage> {
        if let ContentType::Code(lang) = *self {
            Some(lang)
        } else {
            None
        }
    }

    /// Parse the string label used in JSON serialization.
    ///
    /// `"code"` yields `Code(CodeLanguage::Unknown)` because the label carries
    /// no language. Any unrecognized label falls back to `NaturalLanguage`.
    pub fn from_label(s: &str) -> Self {
        if s == "code" {
            ContentType::Code(CodeLanguage::Unknown)
        } else if s == "structured_data" {
            ContentType::StructuredData
        } else {
            ContentType::NaturalLanguage
        }
    }

    /// The string label used in JSON serialization. The code language is not
    /// part of the label.
    pub fn as_label(&self) -> &'static str {
        match *self {
            ContentType::Code(_) => "code",
            ContentType::StructuredData => "structured_data",
            ContentType::NaturalLanguage => "natural_language",
        }
    }
}

/// Classify `text` as structured data, code, or natural language.
///
/// Checks run in order and the first match wins:
/// 1. JSON-like: trimmed text opens with `{`/`[` and closes with `}`/`]`.
/// 2. Trimmed text starts with `---`.
/// 3. Key/value config: more than 60% of all lines look like `key = value`
///    or `key: value` (needs at least two lines).
/// 4. Code, with the language supplied by `detect_code_language`.
///
/// Anything else is `NaturalLanguage`.
pub fn detect_content_type(text: &str) -> ContentType {
    /// A line counts as key/value when its first separator sits within the
    /// first 30 bytes and the key in front of it contains no spaces.
    /// Blank lines and `#` / `//` comment lines never count.
    fn is_key_value_line(raw: &str) -> bool {
        let line = raw.trim();
        if line.is_empty() || line.starts_with('#') || line.starts_with("//") {
            return false;
        }
        // " = " takes precedence: when present, ": " is never consulted.
        match line.find(" = ").or_else(|| line.find(": ")) {
            Some(sep) => sep < 30 && !line[..sep].contains(' '),
            None => false,
        }
    }

    let body = text.trim();

    let opens_like_json = body.starts_with('{') || body.starts_with('[');
    let closes_like_json = body.ends_with('}') || body.ends_with(']');
    if (opens_like_json && closes_like_json) || body.starts_with("---") {
        return ContentType::StructuredData;
    }

    // The ratio is taken over every line, including blank and comment lines.
    let line_count = body.lines().count();
    if line_count >= 2 {
        let kv_lines = body.lines().filter(|l| is_key_value_line(l)).count();
        if kv_lines as f32 / line_count as f32 > 0.6 {
            return ContentType::StructuredData;
        }
    }

    // Language detection sees the untrimmed input.
    match detect_code_language(text) {
        Some(lang) => ContentType::Code(lang),
        None => ContentType::NaturalLanguage,
    }
}

/// Decide whether `text` is code and, if so, guess its language.
///
/// Each language has a list of substring markers; a language's score is the
/// number of its markers found in `text`. A top score of two or more settles
/// the language immediately. Otherwise a language-agnostic check (generic
/// keywords plus brace / paren-semicolon structure) decides whether the text
/// is code at all; if it is, the top-scoring language is used when it matched
/// at least one marker, else `CodeLanguage::Unknown`.
///
/// Returns `None` when the text does not look like code.
fn detect_code_language(text: &str) -> Option<CodeLanguage> {
    // How many of the given markers occur anywhere in `text`.
    let hits = |markers: &[&str]| markers.iter().filter(|m| text.contains(**m)).count();

    // Order matters: on equal scores `max_by_key` keeps the LAST maximum,
    // so later rows win ties.
    let language_markers: [(CodeLanguage, &[&str]); 9] = [
        (
            CodeLanguage::Rust,
            &[
                "fn ", "pub fn", "impl ", "trait ", "mod ", "#[", "let mut ", "::",
            ],
        ),
        (
            CodeLanguage::Python,
            &[
                "def ",
                "async def",
                "elif ",
                "self.",
                "import ",
                "__init__",
                "None",
                "True",
                "False",
            ],
        ),
        (
            CodeLanguage::JavaScript,
            &[
                "function ",
                "const ",
                "let ",
                "var ",
                "=> ",
                "===",
                "!==",
                "require(",
                "console.",
            ],
        ),
        (
            CodeLanguage::TypeScript,
            &[
                "interface ",
                ": string",
                ": number",
                ": boolean",
                "readonly ",
                "as const",
            ],
        ),
        (
            CodeLanguage::Go,
            &["func ", "package ", "go func", ":= ", "fmt."],
        ),
        (
            CodeLanguage::Java,
            &[
                "public class",
                "private ",
                "protected ",
                "System.out",
                "@Override",
            ],
        ),
        (
            CodeLanguage::Cpp,
            &["#include", "std::", "cout", "nullptr", "template<"],
        ),
        (
            CodeLanguage::Shell,
            &["#!/bin", "echo ", "fi\n", "done\n", "esac", "$(", "export "],
        ),
        (
            CodeLanguage::Sql,
            &[
                "SELECT ",
                "INSERT ",
                "UPDATE ",
                "DELETE ",
                "CREATE TABLE",
                "ALTER TABLE",
                "JOIN ",
            ],
        ),
    ];

    let (best_score, best_lang) = language_markers
        .iter()
        .map(|&(lang, markers)| (hits(markers), lang))
        .max_by_key(|&(score, _)| score)
        .expect("language marker table is non-empty");

    // Two or more language-specific markers: confident enough to stop here.
    if best_score >= 2 {
        return Some(best_lang);
    }

    // Language-agnostic fallback: structural punctuation plus generic keywords.
    let brace_pair = text.contains('{') && text.contains('}');
    let call_and_semicolon = text.contains('(') && text.contains(';');
    let has_structure = brace_pair || call_and_semicolon;

    let strong_keywords: &[&str] = &["fn ", "def ", "impl ", "struct ", "enum ", "=>", "->"];
    let weak_keywords: &[&str] = &[
        "function ",
        "class ",
        "import ",
        "from ",
        "const ",
        "let ",
        "var ",
        "export ",
        "pub ",
        "use ",
    ];
    let strong = hits(strong_keywords);
    let weak = hits(weak_keywords);

    let looks_like_code = strong >= 2
        || (strong >= 1 && (has_structure || weak >= 1))
        || (has_structure && (weak >= 2 || strong + weak >= 3));

    if !looks_like_code {
        return None;
    }

    // It is code; name the language only if some marker actually matched.
    Some(if best_score >= 1 {
        best_lang
    } else {
        CodeLanguage::Unknown
    })
}

/// What a memory node represents.
///
/// `FactSuperseded`, `SkillDeprecated`, and `ConclusionInvalidated` mark
/// retired entries; `MemNode::is_valid` returns `false` for exactly those.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MemKind {
    Identity,
    /// A live fact.
    Fact,
    /// A fact that has been replaced; `MemoryGraph::supersede` assigns this
    /// kind to the old node.
    FactSuperseded,
    /// Derived knowledge citing premise Facts. Automatically invalidated
    /// when any cited premise is superseded (cascade invalidation).
    Conclusion,
    /// A Conclusion whose premises changed — needs recalculation.
    ConclusionInvalidated,
    /// A stored skill. Value is JSON-serialized `SkillMeta`.
    Skill,
    /// A retired skill. Still decodable via `SkillMeta::from_node`.
    SkillDeprecated,
    /// A conversation turn. `MemoryGraph::insert` chains these (and
    /// summaries) together with `TemporalNext` edges.
    Conversation,
    /// Compacted summary of older conversation turns.
    ConversationSummary,
    Environment,
    /// Model performance profile — tracks observed quality/latency per model.
    Model,
    /// Parsed code symbol — function, struct, class, etc. from tree-sitter AST.
    /// Value is JSON-serialized `CodeSymbolMeta`. Key is `file_path::symbol_name`.
    CodeSymbol,
}

/// Relationship type carried by a `MemEdge`.
///
/// Edges are directed; each variant's comment gives the direction as
/// `source → target`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EdgeKind {
    Supersedes,   // new → old ("I replace you")
    DependsOn,    // dependent → dependency ("I need you")
    RelatedTo,    // semantic similarity (bidirectional by convention)
    Triggers,     // skill → trigger context ("I fire when this matches")
    TemporalNext, // conversation ordering (earlier → later)
    Calls,        // CodeSymbol → CodeSymbol ("I call you")
    DefinedIn,    // CodeSymbol → CodeSymbol (method → impl/class)
    Imports,      // CodeSymbol → CodeSymbol (file → dependency)
    CitesPremise, // Conclusion → Fact ("I am derived from this premise")
}

/// Metadata for a CodeSymbol graph node.
///
/// Stored as JSON in the `value` of a `MemKind::CodeSymbol` node; see
/// `CodeSymbolMeta::from_node` and `CodeSymbolMeta::encode`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CodeSymbolMeta {
    /// Symbol name.
    pub name: String,
    pub kind: String, // "Function", "Method", "Struct", "Class", etc.
    /// Declaration signature text.
    pub signature: String,
    /// Path of the file the symbol was parsed from.
    pub file_path: String,
    // NOTE(review): whether line numbers are 0- or 1-based is not visible here — confirm.
    pub start_line: u32,
    pub end_line: u32,
    /// Doc comment attached to the symbol, if any.
    pub doc_comment: Option<String>,
    /// Enclosing symbol, if any (presumably the impl/class name — confirm).
    pub parent: Option<String>,
}

impl CodeSymbolMeta {
    /// Decode the metadata stored in a `CodeSymbol` node's `value`.
    ///
    /// Returns `None` when the node is of another kind or when its value is
    /// not valid `CodeSymbolMeta` JSON.
    pub fn from_node(node: &MemNode) -> Option<Self> {
        if node.kind == MemKind::CodeSymbol {
            serde_json::from_str(&node.value).ok()
        } else {
            None
        }
    }

    /// Serialize to the JSON string stored in a node's `value`.
    ///
    /// # Panics
    /// Panics if serialization fails.
    pub fn encode(&self) -> String {
        let json = serde_json::to_string(self);
        json.expect("CodeSymbolMeta serializable")
    }
}

// --- Skill types ---

/// Skill scope — general (applies everywhere) or domain-specific.
/// Follows SkillRL's SkillBank = S_g ∪ ∪S_k structure.
///
/// Serialized with snake_case variant names. The `Default` is `Global`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum SkillScope {
    /// Universal strategy applicable across all tasks.
    Global,
    /// Task-specific skill scoped to a domain (persona, task type, etc).
    Domain(String),
}

impl Default for SkillScope {
    fn default() -> Self {
        SkillScope::Global
    }
}

/// Trigger metadata for a stored skill (Parslee-ai/car#181).
///
/// The original shape (`persona`, `url_pattern`, `task_keywords`)
/// stays load-bearing for the web-task automation case — `find_skill`
/// keys on those fields today. The new optional `structured` field
/// opens the trigger to non-web cases (intent signatures, IDE
/// contexts, audio-classifier outputs, anything an agent wants to
/// match a stored skill on without keyword scoring).
///
/// Backward-compatible: existing serialized skills lack `structured`
/// and load with `structured = None`.
///
/// **Matcher dispatch is deferred.** Today `find_skill` keys on the
/// keyword-shaped fields only. A skill whose `structured` is `Some`
/// and whose web-task fields are empty WILL NOT be returned by
/// `find_skill` for a structured query — agents storing such
/// skills today should enumerate via `list_skills` and apply
/// their own matcher until the dispatch lands. Naming this
/// upfront so callers don't read `Some(structured)` as
/// "find_skill will match this."
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default, PartialEq)]
pub struct SkillTrigger {
    /// Web-task persona. Empty when absent from the serialized form.
    #[serde(default)]
    pub persona: String,
    /// Web-task URL pattern. Empty when absent from the serialized form.
    #[serde(default)]
    pub url_pattern: String,
    /// Web-task keywords. Empty when absent from the serialized form.
    #[serde(default)]
    pub task_keywords: Vec<String>,
    /// Structured trigger payload for non-web-task skills. The
    /// `kind` identifies the matcher (`"intent_signature"`,
    /// `"ide_context"`, etc.); `signature` is opaque JSON the
    /// matcher knows how to compare. The payload as a whole is optional;
    /// `None` means "use web-task keyword matching." Omitted from the
    /// serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub structured: Option<StructuredTrigger>,
}

/// Discriminated structured trigger for matcher plugins.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, PartialEq)]
pub struct StructuredTrigger {
    /// Identifies which matcher understands `signature`
    /// (e.g. `"intent_signature"`, `"ide_context"`).
    pub kind: String,
    /// Opaque JSON payload interpreted by the matcher named in `kind`.
    pub signature: serde_json::Value,
}

/// Usage counters and health flags recorded for a skill.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default, PartialEq)]
pub struct SkillStats {
    /// Count of successful applications; numerator of `success_ratio()`.
    pub success_count: u64,
    /// Count of failed applications; compared against `success_count`
    /// in `should_degrade()`.
    pub fail_count: u64,
    // NOTE(review): presumably set once `should_degrade` trips — confirm against callers.
    pub degraded: bool,
    // NOTE(review): looks like a "queued for repair" marker — confirm against callers.
    pub broken_for_repair: bool,
    /// Continuous success metric for telemetry feeds where success/
    /// failure isn't binary (Parslee-ai/car#181). Range [0.0, 1.0]
    /// or `None` if the consumer hasn't reported one. Distinct from
    /// `success_ratio()` which is a derived aggregate.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub completion_rate: Option<f64>,
    /// User-reported complaint count — the next layer below
    /// `fail_count`. A correct-looking but user-disliked outcome
    /// increments this without flipping `fail_count`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub complaint_count: Option<u64>,
    /// Last time this skill was applied (success or fail). Drives
    /// recency-aware decay in the matcher.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_used_at: Option<DateTime<Utc>>,
}

impl SkillStats {
    /// Whether failures exceed successes by more than `threshold`.
    ///
    /// Returns `true` when `fail_count > success_count + threshold`. The sum
    /// saturates at `u64::MAX`, so a very large `threshold` (or counter) can
    /// neither panic in debug builds nor wrap around to a spurious `true` in
    /// release builds.
    pub fn should_degrade(&self, threshold: u64) -> bool {
        self.fail_count > self.success_count.saturating_add(threshold)
    }

    /// Fraction of recorded applications that succeeded, in `[0.0, 1.0]`.
    ///
    /// Returns the neutral prior `0.5` when nothing has been recorded yet.
    pub fn success_ratio(&self) -> f64 {
        // Widen before adding so the total cannot overflow `u64`.
        let total = u128::from(self.success_count) + u128::from(self.fail_count);
        if total == 0 {
            return 0.5;
        }
        self.success_count as f64 / total as f64
    }
}

/// Binary result of applying a skill.
///
/// Serialized with snake_case variant names (`"success"` / `"fail"`).
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum SkillOutcome {
    Success,
    Fail,
}

/// Full description of a stored skill.
///
/// Stored as JSON in the `value` of a `MemKind::Skill` or
/// `MemKind::SkillDeprecated` node; see `SkillMeta::from_node` and
/// `SkillMeta::encode`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, PartialEq)]
pub struct SkillMeta {
    /// Skill name.
    pub name: String,
    pub code: String,     // opaque to Rust — Playwright body, shell script, etc.
    pub platform: String, // "playwright", "node", "shell", "python"
    /// Human-readable description of what the skill does.
    pub description: String,
    /// Conditions under which the skill is a match candidate.
    pub trigger: SkillTrigger,
    /// Scope: Global (general strategy) or Domain(name) (task-specific).
    #[serde(default)]
    pub scope: SkillScope,
    /// When this skill should be applied — human-readable condition.
    #[serde(default)]
    pub when_to_apply: String,
    /// Usage counters and health flags; all-default when absent.
    #[serde(default)]
    pub stats: SkillStats,
    /// Skill version; deserializes to `1` when the field is absent.
    #[serde(default = "default_skill_version")]
    pub version: u64,
    /// Multi-tenant isolation tag (Parslee-ai/car#187 phase 3-D).
    /// When set, the skill belongs to a specific tenant and only
    /// surfaces in retrievals scoped to that tenant. `None` means
    /// the skill is unscoped — the legacy / single-tenant default.
    ///
    /// Important: scoped retrievals do NOT see unscoped skills.
    /// Strict isolation by design — see the `ScopedMemgineView` doc
    /// for the rationale and the rebootstrap-per-tenant pattern.
    #[serde(default)]
    pub tenant_id: Option<String>,
}

/// Serde default for `SkillMeta::version`: skills serialized without a
/// version field load as version 1.
fn default_skill_version() -> u64 {
    const INITIAL_SKILL_VERSION: u64 = 1;
    INITIAL_SKILL_VERSION
}

impl SkillMeta {
    /// Decode the metadata stored in a skill node's `value`.
    ///
    /// Accepts both `Skill` and `SkillDeprecated` nodes. Returns `None` for
    /// any other kind, or when the value is not valid `SkillMeta` JSON.
    pub fn from_node(node: &MemNode) -> Option<Self> {
        let is_skill_node = node.kind == MemKind::Skill || node.kind == MemKind::SkillDeprecated;
        if is_skill_node {
            serde_json::from_str(&node.value).ok()
        } else {
            None
        }
    }

    /// Serialize to the JSON string stored in a node's `value`.
    ///
    /// # Panics
    /// Panics if serialization fails.
    pub fn encode(&self) -> String {
        let json = serde_json::to_string(self);
        json.expect("SkillMeta serializable")
    }
}

/// Provenance record — where a fact came from.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct Provenance {
    /// Origin category of the fact.
    pub source: String, // "user", "system", "consolidation", "reflection", "compaction"
    /// Pointer to the concrete origin.
    pub reference: String, // file path, URL, conversation turn, etc.
    /// Timestamp attached to the record, if any; `None` when absent from
    /// the serialized form.
    #[serde(default)]
    pub date: Option<DateTime<Utc>>,
}

/// Structured metadata for knowledge nodes (inspired by metaswarm's JSONL schema).
/// Tracks provenance, confidence, usage signals, and file affinity.
///
/// Every field is `#[serde(default)]`, so older serialized records that lack
/// a field deserialize with that field empty / zero / `None`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct FactMetadata {
    /// Confidence level: "high", "medium", "low", "derived".
    #[serde(default)]
    pub confidence: String,
    /// Where this fact came from (may have multiple sources).
    #[serde(default)]
    pub provenance: Vec<Provenance>,
    /// File paths this fact is relevant to (glob patterns allowed).
    #[serde(default)]
    pub affected_files: Vec<String>,
    /// Free-form tags for filtering and retrieval.
    #[serde(default)]
    pub tags: Vec<String>,
    /// Fact category: "fact", "gotcha", "anti_pattern", "decision", "pattern".
    #[serde(default)]
    pub category: String,
    /// How many times this fact was included in context.
    /// Denominator of `staleness_ratio()` and `helpfulness_ratio()`.
    #[serde(default)]
    pub usage_count: u64,
    /// How many times usage of this fact correlated with a successful outcome.
    #[serde(default)]
    pub helpful_count: u64,
    /// How many times this fact has been flagged as potentially outdated.
    #[serde(default)]
    pub outdated_reports: u64,
    /// Multi-tenant isolation tag (Parslee-ai/car#187 phase 3-D).
    /// When set, the fact belongs to a specific tenant and only
    /// surfaces in retrievals scoped to that tenant. `None` means
    /// the fact is unscoped — the legacy / single-tenant default.
    ///
    /// Important: scoped retrievals do NOT see unscoped facts.
    /// Strict isolation by design — operators bootstrap each tenant's
    /// memgine with whatever seeded facts they want; there's no
    /// "global facts visible to everyone" fallthrough because that
    /// would let any tenant pollute the shared namespace.
    #[serde(default)]
    pub tenant_id: Option<String>,
}

impl FactMetadata {
    /// `true` when none of the descriptive fields (`confidence`, `provenance`,
    /// `affected_files`, `tags`, `category`) carry a value.
    ///
    /// NOTE(review): the usage counters and `tenant_id` are not consulted, so
    /// metadata holding only a tenant tag still reports as empty — confirm
    /// that is intended wherever this gates persistence.
    pub fn is_empty(&self) -> bool {
        let text_fields_blank = self.confidence.is_empty() && self.category.is_empty();
        let list_fields_blank = self.provenance.is_empty()
            && self.affected_files.is_empty()
            && self.tags.is_empty();
        text_fields_blank && list_fields_blank
    }

    /// Staleness signal: `outdated_reports / usage_count`.
    /// A fact that was never used reports `0.0`.
    pub fn staleness_ratio(&self) -> f64 {
        match self.usage_count {
            0 => 0.0,
            uses => self.outdated_reports as f64 / uses as f64,
        }
    }

    /// Helpfulness signal: `helpful_count / usage_count`.
    /// A fact that was never used reports the neutral prior `0.5`.
    pub fn helpfulness_ratio(&self) -> f64 {
        match self.usage_count {
            0 => 0.5,
            uses => self.helpful_count as f64 / uses as f64,
        }
    }
}

/// A single entry in the memory graph.
#[derive(Debug, Clone)]
pub struct MemNode {
    /// What the entry represents; also encodes retired states (see `is_valid`).
    pub kind: MemKind,
    /// Memory layer; selects the per-layer index slot in `MemoryGraph`.
    pub layer: Layer,
    /// Lookup key. Several nodes may share a key.
    pub key: String,
    /// Payload text. For `Skill` and `CodeSymbol` nodes this is JSON
    /// (`SkillMeta` / `CodeSymbolMeta`).
    pub value: String,
    /// Optional identifier indexed by `MemoryGraph` for direct lookup and
    /// for de-duplicating foreign nodes.
    pub fact_id: Option<String>,
    pub scope: String,
    pub authority: String,
    /// Marks the node as a constraint; `MemoryGraph::constraints` filters on it.
    pub is_constraint: bool,
    pub created_at: DateTime<Utc>,
    // NOTE(review): expiry is not enforced anywhere in the visible code — confirm where it is checked.
    pub expires_at: Option<DateTime<Utc>>,
    /// Classification of `value` (prose, code, structured data).
    pub content_type: ContentType,
    /// Structured metadata for knowledge quality signals (optional, defaults empty).
    pub metadata: FactMetadata,
}

impl MemNode {
    /// Rough token count for `value`: one token per four bytes, never less
    /// than one.
    pub fn token_estimate(&self) -> usize {
        (self.value.len() / 4).max(1)
    }

    /// `false` for retired entries (superseded facts, deprecated skills,
    /// invalidated conclusions); `true` for everything else.
    pub fn is_valid(&self) -> bool {
        let retired = self.kind == MemKind::FactSuperseded
            || self.kind == MemKind::SkillDeprecated
            || self.kind == MemKind::ConclusionInvalidated;
        !retired
    }
}

/// A directed, typed relationship between two memory nodes.
#[derive(Debug, Clone)]
pub struct MemEdge {
    /// Relationship type.
    pub kind: EdgeKind,
    /// Edge strength. PPR retrieval multiplies it by a per-kind factor.
    pub weight: f32,
    /// Creation time; `MemoryGraph` stamps `Utc::now()` when it adds an edge.
    pub created_at: DateTime<Utc>,
}

/// One result returned by graph retrieval.
#[derive(Debug, Clone)]
pub struct RetrievalHit {
    /// The node that was reached.
    pub node_ix: NodeIndex,
    /// Relevance score assigned to the node by the retrieval pass.
    pub activation: f32,
    // NOTE(review): presumably the edge distance from the nearest seed — confirm in the retrieval code.
    pub hops: usize,
}

/// The memory graph. Nodes are entries, edges are relationships.
///
/// Alongside the graph itself it keeps lookup indexes that `insert`
/// populates as nodes are added.
pub struct MemoryGraph {
    /// Underlying petgraph storage. Public, so callers can bypass the
    /// indexes below; nodes added directly here are not indexed.
    pub inner: StableGraph<MemNode, MemEdge>,
    /// `fact_id` → node. A later insert with the same `fact_id` replaces
    /// the mapping.
    by_fact_id: HashMap<String, NodeIndex>,
    /// `key` → every node inserted with that key, in insertion order.
    by_key: HashMap<String, Vec<NodeIndex>>,
    /// Per-layer node lists; slot `i` holds layer `i + 1`.
    by_layer: [Vec<NodeIndex>; 4],
    /// Most recently inserted conversation (or summary) node; the next one
    /// is linked to it with a `TemporalNext` edge.
    last_conversation: Option<NodeIndex>,
    /// Partition assignments. Only non-`Project` entries are stored —
    /// absence means `Project`.
    partitions: HashMap<NodeIndex, Partition>,
    /// Idempotency index for foreign nodes, keyed by `fact_id`.
    /// `fact_id` is already globally unique (Indexer encodes
    /// `foreign::{repo}::{commit}::{path}::{symbol}` — distinguishes
    /// overloads and impl blocks that share a short `key`). A node without
    /// a `fact_id` is NOT idempotency-tracked and will duplicate on
    /// re-insert — callers that want dedupe must populate `fact_id`.
    foreign_by_fact_id: HashMap<String, NodeIndex>,
}

impl MemoryGraph {
    /// Create an empty graph: no nodes, no edges, and empty indexes.
    pub fn new() -> Self {
        Self {
            inner: StableGraph::new(),
            by_fact_id: HashMap::default(),
            by_key: HashMap::default(),
            // Four empty per-layer buckets.
            by_layer: Default::default(),
            last_conversation: None,
            partitions: HashMap::default(),
            foreign_by_fact_id: HashMap::default(),
        }
    }

    /// Partition a node belongs to. Returns `Partition::Project` for any node
    /// not explicitly assigned (the legacy default).
    pub fn partition_of(&self, nix: NodeIndex) -> Partition {
        // Only non-project nodes have an entry in the side table.
        match self.partitions.get(&nix) {
            Some(assigned) => assigned.clone(),
            None => Partition::Project,
        }
    }

    /// Insert a node into the given `Partition`. For `Project`, equivalent
    /// to `insert`. For `Foreign`, enforces idempotency by `fact_id` —
    /// re-inserting a node whose `fact_id` is already tracked returns the
    /// existing `NodeIndex` rather than creating a duplicate. Foreign nodes
    /// without a `fact_id` are not deduplicated.
    pub fn insert_partitioned(&mut self, node: MemNode, partition: Partition) -> NodeIndex {
        if let Partition::Foreign { .. } = &partition {
            if let Some(fid) = node.fact_id.as_deref() {
                if let Some(&existing) = self.foreign_by_fact_id.get(fid) {
                    return existing;
                }
            }
            let fid = node.fact_id.clone();
            let nix = self.insert(node);
            if let Some(fid) = fid {
                self.foreign_by_fact_id.insert(fid, nix);
            }
            self.partitions.insert(nix, partition);
            return nix;
        }
        self.insert(node)
    }

    /// Convenience: insert a node under `Partition::Foreign { source_repo, commit }`.
    pub fn insert_foreign(
        &mut self,
        source_repo: impl Into<String>,
        commit: impl Into<String>,
        node: MemNode,
    ) -> NodeIndex {
        // Build the owned partition value first, then defer to the general
        // partition-aware insert.
        let source_repo = source_repo.into();
        let commit = commit.into();
        let partition = Partition::Foreign {
            source_repo,
            commit,
        };
        self.insert_partitioned(node, partition)
    }

    /// Insert a node and index it.
    pub fn insert(&mut self, node: MemNode) -> NodeIndex {
        // Capture everything the indexes need before the node is moved into the graph.
        // Layers are 1-based; 0 lands in slot 0 and anything above 4 in slot 3.
        let slot = usize::from(node.layer).saturating_sub(1).min(3);
        let joins_conversation =
            node.kind == MemKind::Conversation || node.kind == MemKind::ConversationSummary;
        let fact_id = node.fact_id.clone();
        let key = node.key.clone();

        let nix = self.inner.add_node(node);

        if let Some(fid) = fact_id {
            // A repeated fact_id silently repoints the index at the newest node.
            self.by_fact_id.insert(fid, nix);
        }
        self.by_key.entry(key).or_default().push(nix);
        self.by_layer[slot].push(nix);

        if !joins_conversation {
            return nix;
        }

        // Conversation nodes form a chain: link the previous tail to this
        // node and make this node the new tail.
        if let Some(prev) = self.last_conversation.replace(nix) {
            let edge = MemEdge {
                kind: EdgeKind::TemporalNext,
                weight: 1.0,
                created_at: Utc::now(),
            };
            self.inner.add_edge(prev, nix, edge);
        }

        nix
    }

    /// Add a typed edge between two nodes.
    pub fn link(&mut self, from: NodeIndex, to: NodeIndex, kind: EdgeKind, weight: f32) {
        // Stamp the edge with the current time; the edge index is not returned.
        let edge = MemEdge {
            kind,
            weight,
            created_at: Utc::now(),
        };
        self.inner.add_edge(from, to, edge);
    }

    /// Supersede: mark old as FactSuperseded, add Supersedes edge,
    /// and cascade invalidation to any Conclusions citing this premise.
    pub fn supersede(&mut self, new_nix: NodeIndex, old_fact_id: &str) -> HashSet<NodeIndex> {
        let mut invalidated = HashSet::new();

        // Unknown fact_id: nothing to supersede, nothing invalidated.
        let old_nix = match self.by_fact_id.get(old_fact_id) {
            Some(&nix) => nix,
            None => return invalidated,
        };

        // Retire the old node and record the replacement relationship.
        if let Some(old_node) = self.inner.node_weight_mut(old_nix) {
            old_node.kind = MemKind::FactSuperseded;
        }
        self.link(new_nix, old_nix, EdgeKind::Supersedes, 1.0);
        invalidated.insert(old_nix);

        // One pass over the edges pointing at the old node, sorting their
        // sources into dependents and citing nodes.
        let mut dependents: Vec<NodeIndex> = Vec::new();
        let mut citing: Vec<NodeIndex> = Vec::new();
        for edge in self.inner.edges_directed(old_nix, Direction::Incoming) {
            match edge.weight().kind {
                EdgeKind::DependsOn => dependents.push(edge.source()),
                EdgeKind::CitesPremise => citing.push(edge.source()),
                _ => {}
            }
        }

        // Dependents are only reported; their kind is left untouched.
        invalidated.extend(dependents);

        // CASCADE: live Conclusions citing the old node are marked invalidated.
        // Only direct citers are handled; the cascade does not recurse.
        for conc_nix in citing {
            if let Some(conc_node) = self.inner.node_weight_mut(conc_nix) {
                if conc_node.kind == MemKind::Conclusion {
                    conc_node.kind = MemKind::ConclusionInvalidated;
                    invalidated.insert(conc_nix);
                }
            }
        }

        invalidated
    }

    /// Lookup by fact_id.
    pub fn get_by_fact_id(&self, fact_id: &str) -> Option<(NodeIndex, &MemNode)> {
        // `None` if the id was never indexed, or if the indexed node is no
        // longer present in the graph.
        let nix = *self.by_fact_id.get(fact_id)?;
        let node = self.inner.node_weight(nix)?;
        Some((nix, node))
    }

    /// Get all valid fact nodes.
    pub fn valid_facts(&self) -> Vec<(NodeIndex, &MemNode)> {
        // Facts live in layer 2, which is slot 1 of the per-layer index.
        let mut facts = Vec::new();
        for &nix in &self.by_layer[1] {
            if let Some(node) = self.inner.node_weight(nix) {
                // Only live facts qualify; superseded ones carry a different kind.
                if node.kind == MemKind::Fact {
                    facts.push((nix, node));
                }
            }
        }
        facts
    }

    /// Get all constraint fact nodes.
    pub fn constraints(&self) -> Vec<(NodeIndex, &MemNode)> {
        // Start from the live facts and keep only those flagged as constraints.
        let mut facts = self.valid_facts();
        facts.retain(|(_, node)| node.is_constraint);
        facts
    }

    /// Get nodes by layer.
    pub fn nodes_by_layer(&self, layer: Layer) -> Vec<(NodeIndex, &MemNode)> {
        // Layers are 1-based; 0 lands in slot 0 and anything above 4 in slot 3.
        let slot = usize::from(layer).saturating_sub(1).min(3);
        let mut found = Vec::new();
        for &nix in &self.by_layer[slot] {
            // Skip index entries whose node is no longer in the graph.
            if let Some(node) = self.inner.node_weight(nix) {
                found.push((nix, node));
            }
        }
        found
    }

    /// Personalized PageRank retrieval from seed nodes (HippoRAG-inspired).
    ///
    /// PPR propagates relevance from seed nodes through the graph with a damping
    /// factor that controls the balance between following edges and teleporting
    /// back to seeds. Converges to a stationary distribution in ~10-20 iterations.
    ///
    /// Results are filtered to seeds' partitions by default. Use
    /// `retrieve_ppr_cross_partition` to traverse across the Project/Foreign
    /// boundary.
    pub fn retrieve_ppr(
        &self,
        seeds: &[NodeIndex],
        seed_weights: Option<&[f32]>,
        damping: f32,
        max_results: usize,
    ) -> Vec<RetrievalHit> {
        // Default entry point: partition filtering stays on.
        let apply_partition_filter = true;
        self.retrieve_ppr_inner(
            seeds,
            seed_weights,
            damping,
            max_results,
            apply_partition_filter,
        )
    }

    /// Like `retrieve_ppr`, but does NOT filter by partition — foreign nodes
    /// can be reached from project seeds and vice versa. Caller opts in.
    pub fn retrieve_ppr_cross_partition(
        &self,
        seeds: &[NodeIndex],
        seed_weights: Option<&[f32]>,
        damping: f32,
        max_results: usize,
    ) -> Vec<RetrievalHit> {
        // Explicit opt-in: partition filtering is switched off.
        let apply_partition_filter = false;
        self.retrieve_ppr_inner(
            seeds,
            seed_weights,
            damping,
            max_results,
            apply_partition_filter,
        )
    }

    fn retrieve_ppr_inner(
        &self,
        seeds: &[NodeIndex],
        seed_weights: Option<&[f32]>,
        damping: f32,
        max_results: usize,
        apply_partition_filter: bool,
    ) -> Vec<RetrievalHit> {
        if seeds.is_empty() {
            return Vec::new();
        }

        let all_nodes: Vec<NodeIndex> = self.inner.node_indices().collect();
        let n = all_nodes.len();
        if n == 0 {
            return Vec::new();
        }

        // Build node index → position map for O(1) lookup
        let pos: HashMap<NodeIndex, usize> = all_nodes
            .iter()
            .enumerate()
            .map(|(i, &nix)| (nix, i))
            .collect();

        // Build reset vector (teleportation distribution)
        let mut reset = vec![0.0f32; n];
        let mut total_weight = 0.0f32;
        for (i, &seed) in seeds.iter().enumerate() {
            if let Some(&p) = pos.get(&seed) {
                let w = seed_weights
                    .and_then(|sw| sw.get(i).copied())
                    .unwrap_or(1.0);
                reset[p] = w;
                total_weight += w;
            }
        }
        if total_weight > 0.0 {
            for v in &mut reset {
                *v /= total_weight;
            }
        }

        // Cap any single entry to 0.4 and re-normalize
        let cap = 0.4f32;
        let mut needs_renorm = false;
        for v in &mut reset {
            if *v > cap {
                *v = cap;
                needs_renorm = true;
            }
        }
        if needs_renorm {
            let sum: f32 = reset.iter().sum();
            if sum > 0.0 {
                for v in &mut reset {
                    *v /= sum;
                }
            }
        }

        // Initialize scores to reset vector
        let mut scores = reset.clone();

        // Edge weight multipliers by kind
        let edge_mult = |kind: EdgeKind| -> f32 {
            match kind {
                EdgeKind::Supersedes => 0.3,
                EdgeKind::DependsOn => 0.8,
                EdgeKind::RelatedTo => 0.6,
                EdgeKind::Triggers => 0.9,
                EdgeKind::TemporalNext => 0.5,
                EdgeKind::Calls => 0.85,
                EdgeKind::DefinedIn => 0.9,
                EdgeKind::Imports => 0.4,
                EdgeKind::CitesPremise => 0.85,
            }
        };

        // Precompute outgoing edge weights per node for normalization
        let mut out_weights: Vec<f32> = vec![0.0; n];
        for (i, &nix) in all_nodes.iter().enumerate() {
            let mut total = 0.0f32;
            for e in self.inner.edges(nix) {
                total += e.weight().weight * edge_mult(e.weight().kind);
            }
            // Also count incoming edges we traverse (RelatedTo, DependsOn, Triggers)
            for e in self.inner.edges_directed(nix, Direction::Incoming) {
                let k = e.weight().kind;
                if matches!(
                    k,
                    EdgeKind::RelatedTo
                        | EdgeKind::DependsOn
                        | EdgeKind::Triggers
                        | EdgeKind::Calls
                        | EdgeKind::DefinedIn
                        | EdgeKind::Imports
                ) {
                    total += e.weight().weight * edge_mult(k);
                }
            }
            out_weights[i] = total;
        }

        // PPR iteration
        let max_iters = 30;
        let epsilon = 1e-6;

        let mut new_scores = vec![0.0f32; n];

        for _iter in 0..max_iters {
            new_scores.fill(0.0);

            // Teleportation component
            for i in 0..n {
                new_scores[i] += (1.0 - damping) * reset[i];
            }

            // Propagation component: each node distributes its score to neighbors
            for (i, &nix) in all_nodes.iter().enumerate() {
                if scores[i] < epsilon {
                    continue;
                }
                let out_w = out_weights[i];
                if out_w == 0.0 {
                    continue;
                }

                // Outgoing edges
                for e in self.inner.edges(nix) {
                    if let Some(&j) = pos.get(&e.target()) {
                        let w = e.weight().weight * edge_mult(e.weight().kind);
                        new_scores[j] += damping * scores[i] * w / out_w;
                    }
                }
                // Incoming edges (bidirectional traversal)
                for e in self.inner.edges_directed(nix, Direction::Incoming) {
                    let k = e.weight().kind;
                    if !matches!(
                        k,
                        EdgeKind::RelatedTo
                            | EdgeKind::DependsOn
                            | EdgeKind::Triggers
                            | EdgeKind::Calls
                            | EdgeKind::DefinedIn
                            | EdgeKind::Imports
                    ) {
                        continue;
                    }
                    if let Some(&j) = pos.get(&e.source()) {
                        let w = e.weight().weight * edge_mult(k);
                        new_scores[j] += damping * scores[i] * w / out_w;
                    }
                }
            }

            // Check convergence
            let delta: f32 = scores
                .iter()
                .zip(new_scores.iter())
                .map(|(a, b)| (a - b).abs())
                .sum();
            std::mem::swap(&mut scores, &mut new_scores);
            if delta < epsilon {
                break;
            }
        }

        // Collect results
        let mut hits: Vec<RetrievalHit> = all_nodes
            .iter()
            .enumerate()
            .filter_map(|(i, &nix)| {
                let node = self.inner.node_weight(nix)?;
                if !node.is_valid() {
                    return None;
                }
                if scores[i] < epsilon {
                    return None;
                }
                // Compute approximate hops from nearest seed
                // PPR doesn't have a clean concept of "hops" — set to 0
                let hops = 0;
                Some(RetrievalHit {
                    node_ix: nix,
                    activation: scores[i],
                    hops,
                })
            })
            .collect();

        hits.sort_by(|a, b| {
            b.activation
                .partial_cmp(&a.activation)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        if apply_partition_filter {
            self.filter_same_partition(&mut hits, seeds);
        }
        hits.truncate(max_results);
        hits
    }

    /// Keep only the hits that share a partition with at least one seed.
    ///
    /// Private helper for the partition-filtered retrieval paths; the
    /// `*_cross_partition` variants never call it. `hits` is filtered in
    /// place and the relative order of the surviving hits is unchanged.
    fn filter_same_partition(&self, hits: &mut Vec<RetrievalHit>, seeds: &[NodeIndex]) {
        // Without seeds there is nothing to compare against, and an empty
        // side table means no node has a recorded partition, so every hit
        // would be kept anyway.
        if seeds.is_empty() || self.partitions.is_empty() {
            return;
        }

        let mut allowed: HashSet<Partition> = HashSet::new();
        for &seed in seeds {
            allowed.insert(self.partition_of(seed));
        }

        hits.retain(|hit| {
            let hit_partition = self.partition_of(hit.node_ix);
            allowed.contains(&hit_partition)
        });
    }

    /// Legacy spreading activation retrieval from seed nodes.
    ///
    /// Thin wrapper over `retrieve_inner` with the partition filter enabled:
    /// hits whose partition matches none of the seeds' partitions are dropped
    /// before the result list is truncated. Use `retrieve_cross_partition`
    /// for opt-in cross-partition traversal.
    ///
    /// * `seeds` - start nodes; each begins with activation 1.0 at hop 0.
    /// * `max_hops` - nodes reached at this hop count are not expanded further.
    /// * `max_results` - the sorted hit list is truncated to this length.
    /// * `decay` - factor multiplied into the activation on every hop.
    /// * `min_activation` - propagated activations below this are discarded.
    ///
    /// Returns hits ordered by descending activation.
    pub fn retrieve(
        &self,
        seeds: &[NodeIndex],
        max_hops: usize,
        max_results: usize,
        decay: f32,
        min_activation: f32,
    ) -> Vec<RetrievalHit> {
        // `true` = restrict results to the seeds' partitions.
        self.retrieve_inner(seeds, max_hops, max_results, decay, min_activation, true)
    }

    /// Cross-partition variant of `retrieve`.
    ///
    /// Takes the same parameters and runs the same spreading activation via
    /// `retrieve_inner`, but with the partition filter disabled, so hits from
    /// any partition may be returned.
    pub fn retrieve_cross_partition(
        &self,
        seeds: &[NodeIndex],
        max_hops: usize,
        max_results: usize,
        decay: f32,
        min_activation: f32,
    ) -> Vec<RetrievalHit> {
        // `false` = do not filter hits by the seeds' partitions.
        self.retrieve_inner(seeds, max_hops, max_results, decay, min_activation, false)
    }

    fn retrieve_inner(
        &self,
        seeds: &[NodeIndex],
        max_hops: usize,
        max_results: usize,
        decay: f32,
        min_activation: f32,
        apply_partition_filter: bool,
    ) -> Vec<RetrievalHit> {
        let mut activations: HashMap<NodeIndex, f32> = HashMap::new();
        let mut hops_map: HashMap<NodeIndex, usize> = HashMap::new();
        let mut queue: VecDeque<(NodeIndex, f32, usize)> = VecDeque::new();

        for &seed in seeds {
            activations.insert(seed, 1.0);
            hops_map.insert(seed, 0);
            queue.push_back((seed, 1.0, 0));
        }

        while let Some((node, activation, hops)) = queue.pop_front() {
            if hops >= max_hops {
                continue;
            }

            // Traverse outgoing edges
            for edge_ref in self.inner.edges(node) {
                let neighbor = edge_ref.target();
                let edge = edge_ref.weight();

                let edge_mult = match edge.kind {
                    EdgeKind::Supersedes => 0.3,
                    EdgeKind::DependsOn => 0.8,
                    EdgeKind::RelatedTo => 0.6,
                    EdgeKind::Triggers => 0.9,
                    EdgeKind::TemporalNext => 0.5,
                    EdgeKind::Calls => 0.85,
                    EdgeKind::DefinedIn => 0.9,
                    EdgeKind::Imports => 0.4,
                    EdgeKind::CitesPremise => 0.85,
                };

                let new_activation = activation * decay * edge.weight * edge_mult;
                if new_activation < min_activation {
                    continue;
                }

                let existing = activations.get(&neighbor).copied().unwrap_or(0.0);
                if new_activation > existing {
                    activations.insert(neighbor, new_activation);
                    hops_map.insert(neighbor, hops + 1);
                    queue.push_back((neighbor, new_activation, hops + 1));
                }
            }

            // Also traverse incoming edges (for bidirectional traversal)
            for edge_ref in self.inner.edges_directed(node, Direction::Incoming) {
                let neighbor = edge_ref.source();
                let edge = edge_ref.weight();

                // Follow RelatedTo, DependsOn, and Triggers backwards
                let edge_mult = match edge.kind {
                    EdgeKind::RelatedTo => 0.6,
                    EdgeKind::DependsOn => 0.5,
                    EdgeKind::Triggers => 0.85, // "what skill fires for this trigger?"
                    EdgeKind::Calls => 0.7,     // "what calls me?"
                    EdgeKind::DefinedIn => 0.8, // "what methods does this type have?"
                    EdgeKind::Imports => 0.3,
                    _ => continue,
                };

                let new_activation = activation * decay * edge.weight * edge_mult;
                if new_activation < min_activation {
                    continue;
                }

                let existing = activations.get(&neighbor).copied().unwrap_or(0.0);
                if new_activation > existing {
                    activations.insert(neighbor, new_activation);
                    hops_map.insert(neighbor, hops + 1);
                    queue.push_back((neighbor, new_activation, hops + 1));
                }
            }
        }

        let mut hits: Vec<RetrievalHit> = activations
            .into_iter()
            .filter_map(|(nix, act)| {
                let node = self.inner.node_weight(nix)?;
                if !node.is_valid() {
                    return None;
                } // skip superseded
                Some(RetrievalHit {
                    node_ix: nix,
                    activation: act,
                    hops: *hops_map.get(&nix).unwrap_or(&0),
                })
            })
            .collect();

        hits.sort_by(|a, b| {
            b.activation
                .partial_cmp(&a.activation)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        if apply_partition_filter {
            self.filter_same_partition(&mut hits, seeds);
        }
        hits.truncate(max_results);
        hits
    }

    /// Find seed nodes by keyword matching with IDF-weighted scores.
    ///
    /// Returns `(seeds, weights)`, index-aligned and ordered by descending
    /// score, with at most `max_seeds` entries. Use the weights as
    /// `seed_weights` in `retrieve_ppr()`.
    ///
    /// Each query token contributes its best match tier against a node's
    /// `key` + `value` text, scaled by the token's corpus IDF:
    /// exact token (3x), stem (2x), synonym (1.5x), substring (1x). The node
    /// score is the sum of those contributions divided by the maximum
    /// achievable sum (`3 * idf` per token). Only valid nodes with a score
    /// above zero are returned; an empty query yields two empty vectors.
    ///
    /// For code nodes the text is tokenized twice: once lowercased (keeps the
    /// whole identifier as a token) and once in its original case, so that
    /// camelCase / PascalCase boundaries are still visible to the splitter.
    pub fn find_seeds_weighted(&self, query: &str, max_seeds: usize) -> (Vec<NodeIndex>, Vec<f32>) {
        /// Everything about one query token that does not depend on the node
        /// being scored; computed once instead of once per node.
        struct QueryTerm {
            token: String,
            idf: f32,
            stemmed: String,
            /// Synonyms of `token`, each paired with its own stem.
            synonyms: Vec<(String, String)>,
        }

        let q_tokens = tokenize(query);
        if q_tokens.is_empty() {
            return (Vec::new(), Vec::new());
        }

        // Compute corpus IDF: how rare each term is across all nodes
        let corpus_idf = self.compute_corpus_idf();

        let terms: Vec<QueryTerm> = q_tokens
            .into_iter()
            .map(|token| {
                // Terms unknown to the corpus get a neutral weight of 1.0.
                let idf = corpus_idf.get(&token).copied().unwrap_or(1.0);
                let stemmed = stem(&token);
                let synonyms = synonym_expand(&token)
                    .into_iter()
                    .map(|syn| {
                        let syn_stem = stem(&syn);
                        (syn, syn_stem)
                    })
                    .collect();
                QueryTerm {
                    token,
                    idf,
                    stemmed,
                    synonyms,
                }
            })
            .collect();

        // Max score per term = 3 * idf; identical for every node.
        let mut weighted_total = 0.0f32;
        for term in &terms {
            weighted_total += term.idf * 3.0;
        }

        let mut scored: Vec<(f32, NodeIndex)> = Vec::new();

        for nix in self.inner.node_indices() {
            let node = match self.inner.node_weight(nix) {
                Some(n) if n.is_valid() => n,
                _ => continue,
            };

            let raw = format!("{} {}", node.key, node.value);
            let text = raw.to_lowercase();
            let mut n_tokens = tokenize_for_content(&text, node.content_type);
            if node.content_type.is_code() {
                // Lowercasing erases the case boundaries the code tokenizer
                // splits on, so also feed it the original-case text.
                n_tokens.extend(tokenize_for_content(&raw, node.content_type));
            }
            let n_stems: HashSet<String> = n_tokens.iter().map(|t| stem(t)).collect();

            let mut weighted_hits = 0.0f32;
            for term in &terms {
                let tier = if n_tokens.contains(&term.token) {
                    // Exact token match
                    3.0
                } else if n_stems.contains(&term.stemmed) {
                    // Stem match
                    2.0
                } else if term
                    .synonyms
                    .iter()
                    .any(|(syn, syn_stem)| n_tokens.contains(syn) || n_stems.contains(syn_stem))
                {
                    // Synonym match
                    1.5
                } else if text.contains(term.token.as_str()) {
                    // Substring match
                    1.0
                } else {
                    continue;
                };
                weighted_hits += term.idf * tier;
            }

            if weighted_total > 0.0 {
                let match_score = weighted_hits / weighted_total;
                if match_score > 0.0 {
                    scored.push((match_score, nix));
                }
            }
        }

        // Stable sort: nodes with equal scores keep graph iteration order.
        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
        scored.truncate(max_seeds);
        let (weights, seeds): (Vec<f32>, Vec<NodeIndex>) = scored.into_iter().unzip();
        (seeds, weights)
    }

    /// Compute corpus IDF: ln(N / (1 + df)) for each term across all valid nodes.
    /// Called once per find_seeds_weighted invocation. O(N × tokens_per_node),
    /// sub-millisecond for expected graph sizes (<10K nodes).
    fn compute_corpus_idf(&self) -> HashMap<String, f32> {
        let total = self.inner.node_count().max(1) as f32;
        let mut doc_freq: HashMap<String, usize> = HashMap::new();

        for nix in self.inner.node_indices() {
            let node = match self.inner.node_weight(nix) {
                Some(n) if n.is_valid() => n,
                _ => continue,
            };
            let text = format!("{} {}", node.key, node.value).to_lowercase();
            let tokens = tokenize_for_content(&text, node.content_type);
            let mut seen = HashSet::new();
            for token in tokens {
                if seen.insert(token.clone()) {
                    *doc_freq.entry(token).or_insert(0) += 1;
                }
            }
        }

        doc_freq
            .into_iter()
            .map(|(term, df)| (term, (total / (1.0 + df as f32)).ln().max(0.1)))
            .collect()
    }

    /// Find seed nodes by keyword matching (unweighted, backward compat).
    ///
    /// Delegates to `find_seeds_weighted` and discards the weights, so the
    /// returned nodes are ordered by descending match score and capped at
    /// `max_seeds`.
    pub fn find_seeds(&self, query: &str, max_seeds: usize) -> Vec<NodeIndex> {
        // `.0` is the seed list; `.1` (the per-seed weights) is dropped.
        self.find_seeds_weighted(query, max_seeds).0
    }

    /// Sum of `token_estimate()` over every node that passes `is_valid()`.
    pub fn total_tokens(&self) -> usize {
        let mut total = 0;
        for nix in self.inner.node_indices() {
            if let Some(node) = self.inner.node_weight(nix) {
                if node.is_valid() {
                    total += node.token_estimate();
                }
            }
        }
        total
    }

    /// Count of valid fact nodes.
    ///
    /// Equal to the number of entries returned by `valid_facts()`.
    pub fn valid_fact_count(&self) -> usize {
        self.valid_facts().len()
    }

    /// Number of nodes currently stored in the underlying graph, whatever
    /// their kind or validity.
    pub fn node_count(&self) -> usize {
        self.inner.node_count()
    }

    /// Number of edges currently stored in the underlying graph, across all
    /// edge kinds.
    pub fn edge_count(&self) -> usize {
        self.inner.edge_count()
    }

    /// Garbage-collect superseded/deprecated/invalidated nodes beyond a
    /// retention depth.
    ///
    /// Candidates are nodes of kind `FactSuperseded`, `SkillDeprecated` or
    /// `ConclusionInvalidated`. From each candidate the chain of incoming
    /// `Supersedes` edges is followed ("who supersedes me?") until a valid
    /// node is found, the chain ends, or more than `max_depth` steps have
    /// been taken. The candidate is scheduled for removal when no valid
    /// ancestor was found or when it took more than `max_depth` steps to
    /// reach one.
    ///
    /// Returns the number of nodes scheduled for removal.
    pub fn gc_superseded(&mut self, max_depth: usize) -> usize {
        // First node (in incoming-neighbor order) that has a Supersedes edge
        // pointing at `target`, if any.
        let superseder_of = |target: NodeIndex| -> Option<NodeIndex> {
            self.inner
                .neighbors_directed(target, Direction::Incoming)
                .find(|&candidate| {
                    self.inner
                        .edges_connecting(candidate, target)
                        .any(|e| e.weight().kind == EdgeKind::Supersedes)
                })
        };

        let mut doomed: Vec<NodeIndex> = Vec::new();
        for nix in self.inner.node_indices() {
            let is_stale = match self.inner.node_weight(nix) {
                Some(n) => matches!(
                    n.kind,
                    MemKind::FactSuperseded
                        | MemKind::SkillDeprecated
                        | MemKind::ConclusionInvalidated
                ),
                None => false,
            };
            if !is_stale {
                continue;
            }

            let mut depth: usize = 0;
            let mut cursor = nix;
            let mut has_live_ancestor = false;

            // A missing parent ends the walk: the chain is orphaned.
            while let Some(parent) = superseder_of(cursor) {
                depth += 1;
                let parent_is_live = self
                    .inner
                    .node_weight(parent)
                    .map_or(false, |n| n.is_valid());
                if parent_is_live {
                    has_live_ancestor = true;
                    break;
                }
                if depth > max_depth {
                    break;
                }
                cursor = parent;
            }

            // Remove if too deep or orphaned (no active ancestor at all)
            if !has_live_ancestor || depth > max_depth {
                doomed.push(nix);
            }
        }

        let removed = doomed.len();
        for nix in doomed {
            // Copy out what the index cleanup needs before the node is gone.
            let snapshot = self
                .inner
                .node_weight(nix)
                .map(|node| (node.key.clone(), node.layer, node.fact_id.clone()));
            if let Some((key, layer, fact_id)) = snapshot {
                self.remove_from_indexes(nix, &key, layer, fact_id.as_deref());
                self.inner.remove_node(nix);
            }
        }
        removed
    }

    /// Remove every node whose `expires_at` is set and is at or before `now`.
    ///
    /// Nodes without an expiry are never touched. Secondary indexes are
    /// cleaned for each removed node before it is deleted from the graph.
    pub fn prune_expired(&mut self, now: DateTime<Utc>) {
        // Gather first: the graph cannot be mutated while it is being walked.
        let mut doomed: Vec<(NodeIndex, String, Layer, Option<String>)> = Vec::new();
        for nix in self.inner.node_indices() {
            if let Some(node) = self.inner.node_weight(nix) {
                let is_expired = node.expires_at.map_or(false, |deadline| deadline <= now);
                if is_expired {
                    doomed.push((nix, node.key.clone(), node.layer, node.fact_id.clone()));
                }
            }
        }

        for (nix, key, layer, fact_id) in doomed {
            self.remove_from_indexes(nix, &key, layer, fact_id.as_deref());
            self.inner.remove_node(nix);
        }
    }

    /// Remove the given nodes and rebuild the conversation timeline.
    ///
    /// Steps, in order:
    /// 1. Each node in `to_remove` that still exists is dropped from the
    ///    secondary indexes and from the graph.
    /// 2. Every `TemporalNext` edge in the graph is deleted.
    /// 3. The remaining `Conversation` / `ConversationSummary` nodes are
    ///    ordered by `created_at` and consecutive pairs are re-linked with
    ///    fresh `TemporalNext` edges of weight 1.0.
    /// 4. `last_conversation` is set to the newest survivor, or `None` when
    ///    no conversation nodes remain.
    pub fn remove_conversation_nodes(&mut self, to_remove: &[NodeIndex]) {
        for &nix in to_remove {
            let snapshot = self
                .inner
                .node_weight(nix)
                .map(|node| (node.key.clone(), node.layer, node.fact_id.clone()));
            if let Some((key, layer, fact_id)) = snapshot {
                self.remove_from_indexes(nix, &key, layer, fact_id.as_deref());
                self.inner.remove_node(nix);
            }
        }

        // Surviving conversation/summary nodes in chronological order.
        let mut timeline: Vec<(NodeIndex, chrono::DateTime<Utc>)> = Vec::new();
        for nix in self.inner.node_indices() {
            if let Some(node) = self.inner.node_weight(nix) {
                if matches!(node.kind, MemKind::Conversation | MemKind::ConversationSummary) {
                    timeline.push((nix, node.created_at));
                }
            }
        }
        timeline.sort_by_key(|entry| entry.1);

        // Drop the old chain entirely; it is rebuilt from `timeline` below.
        let old_links: Vec<petgraph::graph::EdgeIndex> = self
            .inner
            .edge_indices()
            .filter(|&eix| {
                matches!(
                    self.inner.edge_weight(eix),
                    Some(edge) if edge.kind == EdgeKind::TemporalNext
                )
            })
            .collect();
        for eix in old_links {
            self.inner.remove_edge(eix);
        }

        for (earlier, later) in timeline.iter().zip(timeline.iter().skip(1)) {
            let link = MemEdge {
                kind: EdgeKind::TemporalNext,
                weight: 1.0,
                created_at: Utc::now(),
            };
            self.inner.add_edge(earlier.0, later.0, link);
        }

        self.last_conversation = timeline.last().map(|entry| entry.0);
    }

    /// Remove a node's entries from all secondary indexes and side tables.
    ///
    /// Call this before removing the node from the graph, with values
    /// captured from the node itself:
    ///
    /// * `nix` - index of the node being removed.
    /// * `key` - the node's key; its `by_key` bucket loses `nix` and is
    ///   deleted once empty.
    /// * `layer` - the node's layer; mapped to a `by_layer` slot with
    ///   `layer - 1`, clamped to `0..=3`.
    /// * `fact_id` - the node's fact id, if any; its `by_fact_id` entry is
    ///   removed.
    ///
    /// The partition side table is cleaned as well, so an index that is later
    /// reused for a new node cannot inherit the removed node's partition.
    fn remove_from_indexes(
        &mut self,
        nix: NodeIndex,
        key: &str,
        layer: Layer,
        fact_id: Option<&str>,
    ) {
        if let Some(fid) = fact_id {
            self.by_fact_id.remove(fid);
        }
        if let Some(entries) = self.by_key.get_mut(key) {
            entries.retain(|&n| n != nix);
            if entries.is_empty() {
                self.by_key.remove(key);
            }
        }
        let layer_idx = (layer as usize).saturating_sub(1).min(3);
        self.by_layer[layer_idx].retain(|&n| n != nix);
        // Absence from the side table means `Project`, so dropping the entry
        // is always safe and prevents a stale `Foreign` label from sticking
        // to whatever node occupies this index next.
        self.partitions.remove(&nix);
    }

    /// Reset the graph to the empty state.
    ///
    /// Removes all nodes and edges, empties every secondary index
    /// (`by_fact_id`, `by_key`, each `by_layer` bucket) and the partition
    /// side table, and forgets `last_conversation`.
    pub fn clear(&mut self) {
        self.inner.clear();
        self.by_fact_id.clear();
        self.by_key.clear();
        for v in &mut self.by_layer {
            v.clear();
        }
        // Partition entries are keyed by node index; left in place they would
        // be applied to unrelated nodes inserted after the reset.
        self.partitions.clear();
        self.last_conversation = None;
    }
}

/// `MemoryGraph::default()` is equivalent to `MemoryGraph::new()`.
impl Default for MemoryGraph {
    fn default() -> Self {
        Self::new()
    }
}

/// Split `text` into a set of lowercase tokens for seed matching.
///
/// Token boundaries are any whitespace plus `/ : - _ . ,`. Empty pieces are
/// dropped and duplicates collapse because the result is a set.
fn tokenize(text: &str) -> HashSet<String> {
    const DELIMITERS: [char; 6] = ['/', ':', '-', '_', '.', ','];

    let mut tokens = HashSet::new();
    for piece in text.split(|ch: char| ch.is_whitespace() || DELIMITERS.contains(&ch)) {
        let piece = piece.trim();
        if !piece.is_empty() {
            tokens.insert(piece.to_lowercase());
        }
    }
    tokens
}

/// Syntax noise tokens common across all languages.
///
/// `tokenize_code` merges this list with the per-language list from
/// `noise_for_language` and drops any token found in either. Entries are
/// lowercase because tokens are lowercased before the lookup.
const CODE_NOISE_COMMON: &[&str] = &[
    "return", "if", "else", "for", "while", "true", "false", "new", "this", "static", "void",
    "null", "break", "continue", "try", "catch", "throw",
];

/// Language-specific keywords that are filtered out of code tokens because
/// they carry no semantic meaning for retrieval.
///
/// JavaScript and TypeScript share one list. `Unknown` gets a small mix of
/// keywords from several languages. All entries are lowercase.
fn noise_for_language(lang: CodeLanguage) -> &'static [&'static str] {
    const RUST_NOISE: &[&str] = &[
        "fn", "let", "const", "pub", "mut", "use", "mod", "crate", "super", "struct", "enum",
        "impl", "trait", "async", "await", "where", "type", "match", "ref", "move", "dyn",
        "unsafe",
    ];
    const PYTHON_NOISE: &[&str] = &[
        "def", "class", "import", "from", "pass", "none", "self", "elif", "except", "finally",
        "yield", "lambda", "nonlocal", "global", "assert", "with", "as", "in", "is", "not",
        "and", "or",
    ];
    const JS_TS_NOISE: &[&str] = &[
        "function", "const", "let", "var", "export", "default", "async", "await", "typeof",
        "instanceof", "undefined", "require",
    ];
    const GO_NOISE: &[&str] = &[
        "func", "package", "import", "defer", "go", "chan", "select", "range", "switch", "case",
        "fallthrough",
    ];
    const JAVA_NOISE: &[&str] = &[
        "public", "private", "protected", "class", "interface", "extends", "implements",
        "final", "abstract", "synchronized", "volatile", "import", "package", "throws",
    ];
    const CPP_NOISE: &[&str] = &[
        "include", "namespace", "using", "template", "typename", "virtual", "override", "const",
        "auto", "inline", "extern", "typedef", "class", "struct", "enum",
    ];
    const SHELL_NOISE: &[&str] = &[
        "echo", "fi", "do", "done", "then", "esac", "elif", "local", "export", "readonly",
        "set",
    ];
    const SQL_NOISE: &[&str] = &[
        "select", "from", "where", "and", "or", "not", "insert", "into", "update", "set",
        "delete", "create", "alter", "drop", "table", "join", "on", "as", "order", "by",
        "group", "having", "limit",
    ];
    const UNKNOWN_NOISE: &[&str] = &[
        "fn", "let", "const", "var", "pub", "mut", "return", "def", "class", "import", "from",
        "function", "export", "default",
    ];

    match lang {
        CodeLanguage::Rust => RUST_NOISE,
        CodeLanguage::Python => PYTHON_NOISE,
        CodeLanguage::JavaScript | CodeLanguage::TypeScript => JS_TS_NOISE,
        CodeLanguage::Go => GO_NOISE,
        CodeLanguage::Java => JAVA_NOISE,
        CodeLanguage::Cpp => CPP_NOISE,
        CodeLanguage::Shell => SHELL_NOISE,
        CodeLanguage::Sql => SQL_NOISE,
        CodeLanguage::Unknown => UNKNOWN_NOISE,
    }
}

/// Break an identifier into lowercase words at camelCase / PascalCase
/// boundaries and at any non-alphanumeric character.
///
/// e.g. "verifyJwtToken" → ["verify", "jwt", "token"]
///      "HTMLParser" → ["html", "parser"]
///
/// An uppercase letter starts a new word when the previous character is
/// lowercase, or when it closes an acronym (previous character uppercase,
/// next character lowercase). Digits never introduce a boundary.
fn split_camel_case(s: &str) -> Vec<String> {
    /// Move the pending word, if any, into `words` in lowercase.
    fn flush(pending: &mut String, words: &mut Vec<String>) {
        if !pending.is_empty() {
            words.push(std::mem::take(pending).to_lowercase());
        }
    }

    let glyphs: Vec<char> = s.chars().collect();
    let mut words: Vec<String> = Vec::new();
    let mut pending = String::new();

    for (idx, &ch) in glyphs.iter().enumerate() {
        if !ch.is_alphanumeric() {
            // Separators end the current word and are not kept.
            flush(&mut pending, &mut words);
            continue;
        }

        if ch.is_uppercase() {
            let before = if idx > 0 { Some(glyphs[idx - 1]) } else { None };
            let after = glyphs.get(idx + 1).copied();
            let starts_word = match before {
                // camelCase boundary: "fooBar" → foo | Bar
                Some(prev) if prev.is_lowercase() => true,
                // acronym end: "HTMLParser" → HTML | Parser
                Some(prev) if prev.is_uppercase() => after.map_or(false, |next| next.is_lowercase()),
                _ => false,
            };
            if starts_word {
                flush(&mut pending, &mut words);
            }
        }

        pending.push(ch);
    }

    flush(&mut pending, &mut words);
    words
}

/// Tokenizer for source code.
///
/// The text is cut at whitespace and common code punctuation, each piece is
/// further split at camelCase / PascalCase boundaries, and the resulting
/// lowercase words are kept unless they are shorter than two bytes or appear
/// in the common or language-specific noise lists.
fn tokenize_code(text: &str, lang: CodeLanguage) -> HashSet<String> {
    const PUNCTUATION: &[char] = &[
        '/', ':', '-', '_', '.', ',', '(', ')', '{', '}', '[', ']', ';', '"', '\'', '<', '>',
        '=', '&', '|', '!', '#', '*', '+',
    ];

    let stop_words: HashSet<&str> = CODE_NOISE_COMMON
        .iter()
        .chain(noise_for_language(lang).iter())
        .copied()
        .collect();

    text.split(|ch: char| ch.is_whitespace() || PUNCTUATION.contains(&ch))
        .map(|chunk| chunk.trim())
        .filter(|chunk| !chunk.is_empty())
        .flat_map(|chunk| split_camel_case(chunk))
        .filter(|word| word.len() >= 2 && !stop_words.contains(word.as_str()))
        .collect()
}

/// Tokenize `text` with the tokenizer that fits `content_type`.
///
/// Code content goes through the code-aware tokenizer for its language;
/// every other content type uses the plain delimiter-based tokenizer.
pub fn tokenize_for_content(text: &str, content_type: ContentType) -> HashSet<String> {
    if let ContentType::Code(lang) = content_type {
        return tokenize_code(text, lang);
    }
    tokenize(text)
}

/// Minimal suffix-stripping stemmer.
///
/// Lowercases `word` and returns a single string:
/// * words shorter than 4 bytes are returned as-is (lowercased);
/// * otherwise the first suffix in the list below that matches and leaves
///   at least 3 bytes is removed;
/// * failing that, a single trailing `s` is removed unless the word ends
///   in `ss`;
/// * failing that, the lowercased word itself is returned.
pub fn stem(word: &str) -> String {
    const MIN_STEM_LEN: usize = 3;
    // Ordered so that longer suffixes are tried before their shorter tails.
    const SUFFIXES: &[&str] = &[
        "ation", "tion", "ment", "ness", "able", "ible", "ence", "ance", "ing", "ful", "ous",
        "ive", "ize", "ise", "ify", "ate", "ed", "er", "ly", "al", "es",
    ];

    let lowered = word.to_lowercase();
    if lowered.len() <= MIN_STEM_LEN {
        return lowered;
    }

    let stripped = SUFFIXES
        .iter()
        .filter_map(|&suffix| lowered.strip_suffix(suffix))
        .find(|rest| rest.len() >= MIN_STEM_LEN);
    if let Some(rest) = stripped {
        return rest.to_string();
    }

    // Plural-style trailing 's' (but not 'ss').
    match lowered.strip_suffix('s') {
        Some(rest) if !rest.ends_with('s') => rest.to_string(),
        _ => lowered,
    }
}

/// Synonym groups for cross-modal matching.
/// Each group contains terms that should match each other.
///
/// `synonym_expand` returns every member of each group that contains the
/// looked-up token, compared either literally or by stem. Entries are
/// lowercase because the lookup lowercases its input first.
const SYNONYM_GROUPS: &[&[&str]] = &[
    &[
        "auth",
        "authenticate",
        "authorization",
        "credential",
        "login",
        "jwt",
        "token",
        "verify",
    ],
    &[
        "db", "database", "sql", "query", "postgres", "sqlite", "mysql",
    ],
    &["err", "error", "exception", "panic", "fail", "failure"],
    &["config", "configuration", "setting", "preference", "option"],
    &["msg", "message", "notification", "alert", "event"],
    &["req", "request", "http", "api", "endpoint", "route"],
    &["resp", "response", "reply", "result", "output"],
    &["mem", "memory", "cache", "buffer", "storage"],
    &["exec", "execute", "run", "invoke", "call", "dispatch"],
    &["parse", "deserialize", "decode", "unmarshal", "extract"],
    &["serial", "serialize", "encode", "marshal", "format"],
    &["nav", "navigate", "redirect", "goto"],
];

/// Expand `token` into the members of every synonym group it belongs to.
///
/// The token is lowercased first. A group is selected when any of its terms
/// equals the lowercased token or has the same `stem` as it; all terms of every
/// selected group are returned. The result is empty when no group matches.
pub fn synonym_expand(token: &str) -> HashSet<String> {
    let needle = token.to_lowercase();
    let needle_stem = stem(&needle);

    SYNONYM_GROUPS
        .iter()
        .filter(|group| {
            group
                .iter()
                .any(|&term| term == needle || stem(term) == needle_stem)
        })
        .flat_map(|group| group.iter().map(|&term| term.to_string()))
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a layer-2 `MemKind::Fact` node with global scope, peer authority,
    /// no expiry and default content type/metadata.
    fn make_fact(key: &str, value: &str, fact_id: &str) -> MemNode {
        MemNode {
            kind: MemKind::Fact,
            layer: 2,
            key: key.to_string(),
            value: value.to_string(),
            fact_id: Some(fact_id.to_string()),
            scope: "global".to_string(),
            authority: "peer".to_string(),
            is_constraint: false,
            created_at: Utc::now(),
            expires_at: None,
            content_type: ContentType::default(),
            metadata: FactMetadata::default(),
        }
    }

    /// Build a layer-3 `MemKind::Conversation` node without a fact id.
    /// `role` is stored as the key and `text` as the value.
    fn make_conversation(role: &str, text: &str) -> MemNode {
        MemNode {
            kind: MemKind::Conversation,
            layer: 3,
            key: role.to_string(),
            value: text.to_string(),
            fact_id: None,
            scope: "global".to_string(),
            authority: "peer".to_string(),
            is_constraint: false,
            created_at: Utc::now(),
            expires_at: None,
            content_type: ContentType::NaturalLanguage,
            metadata: FactMetadata::default(),
        }
    }

    #[test]
    fn insert_and_lookup() {
        let mut g = MemoryGraph::new();
        let nix = g.insert(make_fact("language", "Rust", "f1"));
        let (found_nix, node) = g.get_by_fact_id("f1").unwrap();
        assert_eq!(found_nix, nix);
        assert_eq!(node.value, "Rust");
    }

    #[test]
    fn supersession() {
        let mut g = MemoryGraph::new();
        g.insert(make_fact("db", "PostgreSQL", "f1"));
        let new_nix = g.insert(make_fact("db", "SQLite", "f2"));
        let invalidated = g.supersede(new_nix, "f1");

        assert!(!invalidated.is_empty());
        let (_, old) = g.get_by_fact_id("f1").unwrap();
        assert_eq!(old.kind, MemKind::FactSuperseded);
        assert_eq!(g.valid_fact_count(), 1);
    }

    #[test]
    fn dependency_cascade() {
        let mut g = MemoryGraph::new();
        let base = g.insert(make_fact("base", "100", "f1"));
        let derived = g.insert(make_fact("derived", "200", "f2"));
        g.link(derived, base, EdgeKind::DependsOn, 1.0);

        let new_base = g.insert(make_fact("base", "150", "f3"));
        let invalidated = g.supersede(new_base, "f1");

        // derived should be in invalidated set (it depends on f1)
        assert!(invalidated.contains(&derived));
    }

    #[test]
    fn seed_finding() {
        let mut g = MemoryGraph::new();
        g.insert(make_fact("navigate /ops/dashboard", "Page loads", "f1"));
        g.insert(make_fact("click button", "Button clicked", "f2"));

        let seeds = g.find_seeds("dashboard", 5);
        assert!(!seeds.is_empty()); // should find f1 via substring match
    }

    #[test]
    fn spreading_activation() {
        let mut g = MemoryGraph::new();
        let a = g.insert(make_fact("project", "CAR runtime", "f1"));
        let b = g.insert(make_fact("language", "Rust", "f2"));
        let c = g.insert(make_fact("testing", "uses proptest", "f3"));
        g.link(a, b, EdgeKind::RelatedTo, 0.8);
        g.link(b, c, EdgeKind::RelatedTo, 0.7);

        let hits = g.retrieve(&[a], 3, 10, 0.7, 0.05);
        // Should find b (1 hop) and c (2 hops) via RelatedTo edges
        assert!(hits.len() >= 2);
        assert!(hits[0].activation > hits[1].activation); // closer = higher
    }

    #[test]
    fn conversation_temporal_links() {
        let mut g = MemoryGraph::new();
        let c1 = g.insert(make_conversation("user", "How's the project?"));
        let c2 = g.insert(make_conversation("assistant", "Going well"));

        // Should have TemporalNext edge automatically
        assert_eq!(g.edge_count(), 1);
        let edge = g.inner.edges(c1).next().unwrap();
        assert_eq!(edge.weight().kind, EdgeKind::TemporalNext);
        assert_eq!(edge.target(), c2);
    }

    #[test]
    fn constraints_tracked() {
        let mut g = MemoryGraph::new();
        let mut node = make_fact("budget", "Max $500K", "c1");
        node.is_constraint = true;
        g.insert(node);
        g.insert(make_fact("language", "Rust", "f1"));

        assert_eq!(g.constraints().len(), 1);
        assert_eq!(g.valid_fact_count(), 2);
    }

    #[test]
    fn gc_superseded_removes_deep_chains() {
        // Build chain: f4 supersedes f3 supersedes f2 supersedes f1
        let mut g = MemoryGraph::new();
        let _n1 = g.insert(make_fact("db", "v1", "f1"));
        let n2 = g.insert(make_fact("db", "v2", "f2"));
        g.supersede(n2, "f1"); // f1 → FactSuperseded
        let n3 = g.insert(make_fact("db", "v3", "f3"));
        g.supersede(n3, "f2"); // f2 → FactSuperseded
        let n4 = g.insert(make_fact("db", "v4", "f4"));
        g.supersede(n4, "f3"); // f3 → FactSuperseded

        // Before GC: 4 nodes (1 active, 3 superseded)
        assert_eq!(g.node_count(), 4);

        // GC with max_depth=1: keep f3 (1 hop from active f4), remove f1 and f2
        let removed = g.gc_superseded(1);
        assert_eq!(removed, 2);
        assert_eq!(g.node_count(), 2);

        // f4 (active) and f3 (1 hop) should remain
        assert!(g.get_by_fact_id("f4").is_some());
        assert!(g.get_by_fact_id("f3").is_some());
        // f1 and f2 should be gone
        assert!(g.get_by_fact_id("f1").is_none());
        assert!(g.get_by_fact_id("f2").is_none());
    }

    #[test]
    fn gc_superseded_retains_shallow_chain() {
        // f2 supersedes f1 — depth 1, should be retained with max_depth=1
        let mut g = MemoryGraph::new();
        let _n1 = g.insert(make_fact("db", "old", "f1"));
        let n2 = g.insert(make_fact("db", "new", "f2"));
        g.supersede(n2, "f1");

        let removed = g.gc_superseded(1);
        assert_eq!(removed, 0);
        assert_eq!(g.node_count(), 2);
    }

    #[test]
    fn gc_superseded_removes_orphaned() {
        // A superseded node with no active ancestor (orphaned chain)
        let mut g = MemoryGraph::new();
        let mut orphan = make_fact("db", "orphan", "f1");
        orphan.kind = MemKind::FactSuperseded;
        g.insert(orphan);

        let removed = g.gc_superseded(1);
        assert_eq!(removed, 1);
        assert_eq!(g.node_count(), 0);
    }

    #[test]
    fn gc_superseded_noop_on_clean_graph() {
        let mut g = MemoryGraph::new();
        g.insert(make_fact("a", "1", "f1"));
        g.insert(make_fact("b", "2", "f2"));

        let removed = g.gc_superseded(1);
        assert_eq!(removed, 0);
        assert_eq!(g.node_count(), 2);
    }

    #[test]
    fn prune_expired_removes_ttl_nodes() {
        let mut g = MemoryGraph::new();
        let now = Utc::now();
        let mut expired = make_fact("temp", "gone", "e1");
        expired.kind = MemKind::Environment;
        expired.layer = 4;
        expired.expires_at = Some(now - chrono::Duration::hours(1));
        g.insert(expired);

        let mut fresh = make_fact("temp", "here", "e2");
        fresh.kind = MemKind::Environment;
        fresh.layer = 4;
        fresh.expires_at = Some(now + chrono::Duration::hours(1));
        g.insert(fresh);

        g.prune_expired(now);
        assert_eq!(g.node_count(), 1);
    }

    #[test]
    fn prune_expired_cleans_by_key_and_by_layer() {
        let mut g = MemoryGraph::new();
        let now = Utc::now();

        // Create an expired environment node
        let mut expired = make_fact("env_temp", "gone", "e1");
        expired.kind = MemKind::Environment;
        expired.layer = 4;
        expired.expires_at = Some(now - chrono::Duration::hours(1));
        g.insert(expired);

        // Verify indexes are populated before pruning
        assert!(g.by_key.contains_key("env_temp"));
        let layer_idx = (4usize).saturating_sub(1).min(3); // index 3
        assert_eq!(g.by_layer[layer_idx].len(), 1);

        g.prune_expired(now);

        // After pruning, by_key should not contain the key
        assert!(!g.by_key.contains_key("env_temp"));
        // by_layer should be clean
        assert_eq!(g.by_layer[layer_idx].len(), 0);
        // by_fact_id should be clean
        assert!(g.get_by_fact_id("e1").is_none());
    }

    #[test]
    fn gc_superseded_cleans_by_key_and_by_layer() {
        // Build chain: f3 supersedes f2 supersedes f1
        let mut g = MemoryGraph::new();
        let _n1 = g.insert(make_fact("db", "v1", "f1"));
        let n2 = g.insert(make_fact("db", "v2", "f2"));
        g.supersede(n2, "f1");
        let n3 = g.insert(make_fact("db", "v3", "f3"));
        g.supersede(n3, "f2");

        // Before GC: by_key["db"] has 3 entries, by_layer[1] has 3 entries
        assert_eq!(g.by_key.get("db").map(|v| v.len()).unwrap_or(0), 3);
        let layer_idx = (2usize).saturating_sub(1).min(3); // index 1
        let initial_layer_count = g.by_layer[layer_idx].len();
        assert_eq!(initial_layer_count, 3);

        // GC with max_depth=0: only f3 (active) kept, f1 and f2 removed
        let removed = g.gc_superseded(0);
        assert_eq!(removed, 2);

        // by_key["db"] should only have the active node (f3)
        assert_eq!(g.by_key.get("db").map(|v| v.len()).unwrap_or(0), 1);
        // by_layer should only have the active node
        assert_eq!(g.by_layer[layer_idx].len(), 1);
    }

    #[test]
    fn remove_conversation_nodes_cleans_by_key_and_by_layer() {
        let mut g = MemoryGraph::new();
        let c1 = g.insert(make_conversation("user", "Hello"));
        let c2 = g.insert(make_conversation("assistant", "Hi there"));

        // Verify indexes populated
        assert!(g.by_key.contains_key("user"));
        assert!(g.by_key.contains_key("assistant"));
        let layer_idx = (3usize).saturating_sub(1).min(3); // index 2
        assert_eq!(g.by_layer[layer_idx].len(), 2);

        // Remove both conversation nodes
        g.remove_conversation_nodes(&[c1, c2]);

        // by_key should be clean
        assert!(!g.by_key.contains_key("user"));
        assert!(!g.by_key.contains_key("assistant"));
        // by_layer should be clean
        assert_eq!(g.by_layer[layer_idx].len(), 0);
        // Graph should be empty
        assert_eq!(g.node_count(), 0);
    }
}