Skip to main content

codemem_engine/
scoring.rs

1//! Hybrid scoring for memory recall.
2
3use crate::bm25;
4use chrono::{DateTime, Utc};
5use codemem_core::{MemoryNode, ScoreBreakdown};
6use codemem_storage::graph::GraphEngine;
7
8// ── Code-graph sub-weights for `graph_strength_for_memory` ──────────────────
9
10/// Weight for PageRank in code-graph scoring.
11const CODE_GRAPH_W_PAGERANK: f64 = 0.4;
12/// Weight for betweenness centrality in code-graph scoring.
13const CODE_GRAPH_W_BETWEENNESS: f64 = 0.3;
14/// Weight for connectivity (degree / 5.0) in code-graph scoring.
15const CODE_GRAPH_W_CONNECTIVITY: f64 = 0.2;
16/// Weight for average edge weight in code-graph scoring.
17const CODE_GRAPH_W_EDGE_WEIGHT: f64 = 0.1;
18
19// ── Temporal decay constants ────────────────────────────────────────────────
20
21/// Half-life for temporal decay (hours). Updated-at age decays over 30 days.
22const TEMPORAL_DECAY_HOURS: f64 = 30.0 * 24.0;
23/// Half-life for recency decay (hours). Last-access age decays over 7 days.
24const RECENCY_DECAY_HOURS: f64 = 7.0 * 24.0;
25
26/// Compute graph strength for a memory node by combining raw graph metrics.
27///
28/// Blends two signal sources:
29/// - **Code neighbors** (`sym:`, `file:`, etc.): PageRank, betweenness, connectivity, edge weight
30/// - **Memory neighbors** (other memories linked via SHARES_THEME, PRECEDED_BY, etc.):
31///   connectivity count and average edge weight
32///
33/// A function with many linked memories ranks higher. A conversational memory
34/// connected to many session peers ranks higher than an isolated one.
35pub fn graph_strength_for_memory(graph: &GraphEngine, memory_id: &str) -> f64 {
36    let metrics = match graph.raw_graph_metrics_for_memory(memory_id) {
37        Some(m) => m,
38        None => return 0.0,
39    };
40
41    // Code-graph component (0.0-1.0): centrality of linked code nodes
42    let code_score = if metrics.code_neighbor_count > 0 {
43        let connectivity = (metrics.code_neighbor_count as f64 / 5.0).min(1.0);
44        let avg_edge_w = (metrics.total_edge_weight / metrics.code_neighbor_count as f64).min(1.0);
45        CODE_GRAPH_W_PAGERANK * metrics.max_pagerank
46            + CODE_GRAPH_W_BETWEENNESS * metrics.max_betweenness
47            + CODE_GRAPH_W_CONNECTIVITY * connectivity
48            + CODE_GRAPH_W_EDGE_WEIGHT * avg_edge_w
49    } else {
50        0.0
51    };
52
53    // Memory-graph component (0.0-1.0): connectivity to other memories
54    let memory_score = if metrics.memory_neighbor_count > 0 {
55        let connectivity = (metrics.memory_neighbor_count as f64 / 10.0).min(1.0);
56        let avg_edge_w =
57            (metrics.memory_edge_weight / metrics.memory_neighbor_count as f64).min(1.0);
58        0.6 * connectivity + 0.4 * avg_edge_w
59    } else {
60        0.0
61    };
62
63    // Blend: when both exist, code neighbors dominate (70/30).
64    // When only one exists, use it fully.
65    let score = match (
66        metrics.code_neighbor_count > 0,
67        metrics.memory_neighbor_count > 0,
68    ) {
69        (true, true) => 0.7 * code_score + 0.3 * memory_score,
70        (true, false) => code_score,
71        (false, true) => memory_score,
72        (false, false) => 0.0,
73    };
74
75    score.min(1.0)
76}
77
78/// Re-export the canonical truncate from codemem-core.
79pub use codemem_core::truncate as truncate_content;
80
81/// Compute 9-component hybrid score for a memory against a query.
82/// The `graph` parameter is used to look up edge counts for graph strength scoring.
83/// The `bm25` parameter provides BM25-based token overlap scoring; if the memory
84/// is in the index it uses the indexed score, otherwise falls back to `score_text`.
85/// The `now` parameter makes scoring deterministic and testable by avoiding internal clock reads.
86pub fn compute_score(
87    memory: &MemoryNode,
88    query_tokens: &[&str],
89    vector_similarity: f64,
90    graph: &GraphEngine,
91    bm25: &bm25::Bm25Index,
92    now: DateTime<Utc>,
93) -> ScoreBreakdown {
94    // BM25 token overlap (replaces naive split+intersect)
95    // Use pre-tokenized query tokens to avoid re-tokenizing per document.
96    let token_overlap = if query_tokens.is_empty() {
97        0.0
98    } else {
99        // Try indexed score first (memory already in the BM25 index),
100        // fall back to scoring against raw text for unindexed documents.
101        let indexed_score = bm25.score_with_tokens_str(query_tokens, &memory.id);
102        if indexed_score > 0.0 {
103            indexed_score
104        } else {
105            bm25.score_text_with_tokens_str(query_tokens, &memory.content)
106        }
107    };
108
109    // Temporal: how recently updated (exponential decay over 30 days)
110    let age_hours = (now - memory.updated_at).num_hours().max(0) as f64;
111    let temporal = (-age_hours / TEMPORAL_DECAY_HOURS).exp();
112
113    // Tag matching: fraction of query tokens found in tags.
114    // Per-memory `tags.join().to_lowercase()` is O(tags) which is typically <10 strings,
115    // so allocation is negligible.
116    let tag_matching = if !query_tokens.is_empty() {
117        let tag_str: String = memory.tags.join(" ").to_lowercase();
118        let matches = query_tokens
119            .iter()
120            .filter(|qt| tag_str.contains(**qt))
121            .count();
122        matches as f64 / query_tokens.len() as f64
123    } else {
124        0.0
125    };
126
127    // Recency: based on last access time (decay over 7 days)
128    let access_hours = (now - memory.last_accessed_at).num_hours().max(0) as f64;
129    let recency = (-access_hours / RECENCY_DECAY_HOURS).exp();
130
131    // Enhanced graph scoring: bridge memory UUIDs to code-graph centrality.
132    // Memory nodes live in a separate ID space from code nodes (sym:, file:),
133    // so we collect raw metrics from code-graph neighbors and apply the
134    // scoring formula here in the engine.
135    let graph_strength = graph_strength_for_memory(graph, &memory.id);
136
137    ScoreBreakdown {
138        vector_similarity,
139        graph_strength,
140        token_overlap,
141        temporal,
142        tag_matching,
143        importance: memory.importance,
144        confidence: memory.confidence,
145        recency,
146    }
147}
148
149#[cfg(test)]
150#[path = "tests/scoring_tests.rs"]
151mod tests;