codemem_engine/
scoring.rs

1//! Hybrid scoring for memory recall.
2
3use crate::bm25;
4use chrono::{DateTime, Utc};
5use codemem_core::{MemoryNode, ScoreBreakdown};
6use codemem_storage::graph::GraphEngine;
7
8/// Compute graph strength for a memory node by combining raw graph metrics.
9///
10/// Uses PageRank, betweenness centrality, connectivity, and edge weights
11/// from the memory's code-graph neighbors to produce a 0.0-1.0 score.
12/// Weights: PageRank 40%, betweenness 30%, connectivity 20%, edge weight 10%.
13pub fn graph_strength_for_memory(graph: &GraphEngine, memory_id: &str) -> f64 {
14    let metrics = match graph.raw_graph_metrics_for_memory(memory_id) {
15        Some(m) => m,
16        None => return 0.0,
17    };
18
19    if metrics.code_neighbor_count == 0 {
20        return 0.0;
21    }
22
23    let connectivity_bonus = (metrics.code_neighbor_count as f64 / 5.0).min(1.0);
24    let edge_weight_bonus =
25        (metrics.total_edge_weight / metrics.code_neighbor_count as f64).min(1.0);
26
27    (0.4 * metrics.max_pagerank
28        + 0.3 * metrics.max_betweenness
29        + 0.2 * connectivity_bonus
30        + 0.1 * edge_weight_bonus)
31        .min(1.0)
32}
33
/// Shorten `s` to at most `max` bytes of its original text, marking the cut with "...".
///
/// A string that already fits in `max` bytes is returned unchanged. Otherwise
/// the cut point is moved back from `max` to the closest UTF-8 character
/// boundary, so a multi-byte character is never split, and `"..."` is appended
/// after the kept prefix. The returned string can therefore be up to three
/// bytes longer than `max`.
pub fn truncate_content(s: &str, max: usize) -> String {
    if s.len() <= max {
        return s.to_string();
    }

    // Offset 0 is always a character boundary, so the search cannot come up
    // empty; the fallback only exists to unwrap the `Option`.
    let cut = (0..=max)
        .rev()
        .find(|&offset| s.is_char_boundary(offset))
        .unwrap_or(0);

    format!("{}...", &s[..cut])
}
47
48/// Compute 9-component hybrid score for a memory against a query.
49/// The `graph` parameter is used to look up edge counts for graph strength scoring.
50/// The `bm25` parameter provides BM25-based token overlap scoring; if the memory
51/// is in the index it uses the indexed score, otherwise falls back to `score_text`.
52/// The `now` parameter makes scoring deterministic and testable by avoiding internal clock reads.
53pub fn compute_score(
54    memory: &MemoryNode,
55    query_tokens: &[&str],
56    vector_similarity: f64,
57    graph: &GraphEngine,
58    bm25: &bm25::Bm25Index,
59    now: DateTime<Utc>,
60) -> ScoreBreakdown {
61    // BM25 token overlap (replaces naive split+intersect)
62    // Use pre-tokenized query tokens to avoid re-tokenizing per document.
63    let token_overlap = if query_tokens.is_empty() {
64        0.0
65    } else {
66        // Try indexed score first (memory already in the BM25 index),
67        // fall back to scoring against raw text for unindexed documents.
68        let indexed_score = bm25.score_with_tokens_str(query_tokens, &memory.id);
69        if indexed_score > 0.0 {
70            indexed_score
71        } else {
72            bm25.score_text_with_tokens_str(query_tokens, &memory.content)
73        }
74    };
75
76    // Temporal: how recently updated (exponential decay over 30 days)
77    let age_hours = (now - memory.updated_at).num_hours().max(0) as f64;
78    let temporal = (-age_hours / (30.0 * 24.0)).exp();
79
80    // Tag matching: fraction of query tokens found in tags.
81    // Per-memory `tags.join().to_lowercase()` is O(tags) which is typically <10 strings,
82    // so allocation is negligible.
83    let tag_matching = if !query_tokens.is_empty() {
84        let tag_str: String = memory.tags.join(" ").to_lowercase();
85        let matches = query_tokens
86            .iter()
87            .filter(|qt| tag_str.contains(**qt))
88            .count();
89        matches as f64 / query_tokens.len() as f64
90    } else {
91        0.0
92    };
93
94    // Recency: based on last access time (decay over 7 days)
95    let access_hours = (now - memory.last_accessed_at).num_hours().max(0) as f64;
96    let recency = (-access_hours / (7.0 * 24.0)).exp();
97
98    // Enhanced graph scoring: bridge memory UUIDs to code-graph centrality.
99    // Memory nodes live in a separate ID space from code nodes (sym:, file:),
100    // so we collect raw metrics from code-graph neighbors and apply the
101    // scoring formula here in the engine.
102    let graph_strength = graph_strength_for_memory(graph, &memory.id);
103
104    ScoreBreakdown {
105        vector_similarity,
106        graph_strength,
107        token_overlap,
108        temporal,
109        tag_matching,
110        importance: memory.importance,
111        confidence: memory.confidence,
112        recency,
113    }
114}
115
// Unit tests are kept in `tests/scoring_tests.rs` and are compiled only for test builds.
#[cfg(test)]
#[path = "tests/scoring_tests.rs"]
mod tests;
codemem_engine/scoring.rs

codemem_engine/
scoring.rs