// agentic_memory/index/doc_lengths.rs
1//! Document length table for BM25 normalization.
2
3use crate::engine::tokenizer::Tokenizer;
4use crate::graph::MemoryGraph;
5use crate::types::CognitiveEvent;
6
/// Document length table for BM25 normalization.
/// Stores the token count for each node's content indexed by node ID (dense array).
///
/// Node IDs are assumed small and dense: the backing `Vec` is sized to the
/// largest ID seen, so a sparse ID space wastes memory — TODO confirm IDs
/// are allocated densely by the graph.
pub struct DocLengths {
    /// node_id → token count (document length). Indexed by node ID.
    /// A value of 0 means "no document" (never added, or removed).
    lengths: Vec<u32>,
}
13
14impl DocLengths {
15    /// Create an empty doc lengths table.
16    pub fn new() -> Self {
17        Self {
18            lengths: Vec::new(),
19        }
20    }
21
22    /// Build from graph — tokenize all content and count tokens.
23    pub fn build(graph: &MemoryGraph, tokenizer: &Tokenizer) -> Self {
24        let mut lengths = Vec::new();
25
26        for node in graph.nodes() {
27            let id = node.id as usize;
28            if id >= lengths.len() {
29                lengths.resize(id + 1, 0);
30            }
31            lengths[id] = tokenizer.tokenize(&node.content).len() as u32;
32        }
33
34        Self { lengths }
35    }
36
37    /// Get token count for a node.
38    pub fn get(&self, node_id: u64) -> u32 {
39        let idx = node_id as usize;
40        if idx < self.lengths.len() {
41            self.lengths[idx]
42        } else {
43            0
44        }
45    }
46
47    /// Average document length.
48    pub fn average(&self) -> f32 {
49        let non_zero: Vec<u32> = self.lengths.iter().filter(|&&l| l > 0).copied().collect();
50        if non_zero.is_empty() {
51            0.0
52        } else {
53            non_zero.iter().sum::<u32>() as f32 / non_zero.len() as f32
54        }
55    }
56
57    /// Number of documents with non-zero length.
58    pub fn len(&self) -> usize {
59        self.lengths.iter().filter(|&&l| l > 0).count()
60    }
61
62    /// Whether the table is empty.
63    pub fn is_empty(&self) -> bool {
64        self.len() == 0
65    }
66
67    /// Add a single node's document length.
68    pub fn add_node(&mut self, event: &CognitiveEvent) {
69        let count = Tokenizer::new().tokenize(&event.content).len() as u32;
70        let id = event.id as usize;
71        if id >= self.lengths.len() {
72            self.lengths.resize(id + 1, 0);
73        }
74        self.lengths[id] = count;
75    }
76
77    /// Remove a node's document length.
78    pub fn remove_node(&mut self, id: u64) {
79        let idx = id as usize;
80        if idx < self.lengths.len() {
81            self.lengths[idx] = 0;
82        }
83    }
84
85    /// Clear all lengths.
86    pub fn clear(&mut self) {
87        self.lengths.clear();
88    }
89
90    /// Rebuild from a graph.
91    pub fn rebuild(&mut self, graph: &MemoryGraph) {
92        *self = Self::build(graph, &Tokenizer::new());
93    }
94
95    /// Serialize to bytes for file writing.
96    pub fn to_bytes(&self) -> Vec<u8> {
97        let mut buf: Vec<u8> = Vec::new();
98        buf.extend_from_slice(&(self.lengths.len() as u64).to_le_bytes());
99        for &len in &self.lengths {
100            buf.extend_from_slice(&len.to_le_bytes());
101        }
102        buf
103    }
104
105    /// Deserialize from bytes.
106    pub fn from_bytes(data: &[u8]) -> Option<Self> {
107        if data.len() < 8 {
108            return None;
109        }
110
111        let count = u64::from_le_bytes(data[0..8].try_into().ok()?) as usize;
112        let expected_size = 8 + count * 4;
113        if data.len() < expected_size {
114            return None;
115        }
116
117        let mut lengths = Vec::with_capacity(count);
118        for i in 0..count {
119            let offset = 8 + i * 4;
120            let len = u32::from_le_bytes(data[offset..offset + 4].try_into().ok()?);
121            lengths.push(len);
122        }
123
124        Some(Self { lengths })
125    }
126}
127
128impl Default for DocLengths {
129    fn default() -> Self {
130        Self::new()
131    }
132}