Skip to main content

reddb_file/
native_artifact_codec.rs

1//! On-disk codecs for the server-defined native artifact payloads:
2//! `RDGA` (graph adjacency), `RDFT` (full-text index), and `RDDP`
3//! (document path/value). These are derived index artifacts persisted into the
4//! native store; `reddb-file` owns their byte layout (ADR 0046) while the
5//! tokenisation / index-construction algorithms stay in the server engine.
6//!
7//! All three share a length-prefixed string framing: a `u32` little-endian
8//! byte length followed by the UTF-8 bytes.
9//!
10//! `RDDP` pins `entity_id` as a fixed little-endian `u64`, **not** a string.
11
12use std::collections::BTreeMap;
13
/// Four-byte magic prefixing a persisted graph-adjacency artifact.
pub const GRAPH_ADJACENCY_MAGIC: &[u8; 4] = b"RDGA";
/// Four-byte magic prefixing a persisted full-text index artifact.
pub const FULLTEXT_INDEX_MAGIC: &[u8; 4] = b"RDFT";
/// Four-byte magic prefixing a persisted document path/value artifact.
pub const DOC_PATHVALUE_MAGIC: &[u8; 4] = b"RDDP";
20
/// Errors raised while decoding a native artifact payload.
///
/// The `&'static str` carried by [`NativeArtifactError::InvalidHeader`] and
/// [`NativeArtifactError::Truncated`] is the human-readable message; the
/// `Display` implementation prints it verbatim.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NativeArtifactError {
    /// The payload was shorter than its fixed header or had the wrong magic.
    InvalidHeader(&'static str),
    /// A field ran past the end of the buffer.
    Truncated(&'static str),
    /// A length-prefixed string contained invalid UTF-8.
    InvalidUtf8,
    /// The `RDDP` header entry count disagreed with the decoded records.
    EntryCountMismatch,
}
33
34impl std::fmt::Display for NativeArtifactError {
35    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36        match self {
37            NativeArtifactError::InvalidHeader(m) => write!(f, "{}", m),
38            NativeArtifactError::Truncated(m) => write!(f, "{}", m),
39            NativeArtifactError::InvalidUtf8 => write!(f, "invalid utf-8 in native artifact"),
40            NativeArtifactError::EntryCountMismatch => {
41                write!(f, "document path/value artifact entry count mismatch")
42            }
43        }
44    }
45}
46
// Marker impl: relies entirely on the trait's default methods, so the error
// reports no underlying `source()`.
impl std::error::Error for NativeArtifactError {}
48
/// Appends `value` to `buf` using the shared string framing: a `u32`
/// little-endian byte length followed by the raw UTF-8 bytes.
///
/// The length is narrowed with `as u32`, so a string longer than `u32::MAX`
/// bytes would be written with a wrapped length prefix.
fn push_string(buf: &mut Vec<u8>, value: &str) {
    let payload = value.as_bytes();
    let prefix = (payload.len() as u32).to_le_bytes();
    buf.reserve(prefix.len() + payload.len());
    buf.extend_from_slice(&prefix);
    buf.extend_from_slice(payload);
}
54
55fn read_string(bytes: &[u8], pos: &mut usize) -> Result<String, NativeArtifactError> {
56    if *pos + 4 > bytes.len() {
57        return Err(NativeArtifactError::Truncated(
58            "truncated native artifact string length",
59        ));
60    }
61    let len = u32::from_le_bytes([
62        bytes[*pos],
63        bytes[*pos + 1],
64        bytes[*pos + 2],
65        bytes[*pos + 3],
66    ]) as usize;
67    *pos += 4;
68    if *pos + len > bytes.len() {
69        return Err(NativeArtifactError::Truncated(
70            "truncated native artifact string content",
71        ));
72    }
73    let value = std::str::from_utf8(&bytes[*pos..*pos + len])
74        .map_err(|_| NativeArtifactError::InvalidUtf8)?
75        .to_string();
76    *pos += len;
77    Ok(value)
78}
79
80fn read_u32(bytes: &[u8], pos: &mut usize, ctx: &'static str) -> Result<u32, NativeArtifactError> {
81    if *pos + 4 > bytes.len() {
82        return Err(NativeArtifactError::Truncated(ctx));
83    }
84    let value = u32::from_le_bytes([
85        bytes[*pos],
86        bytes[*pos + 1],
87        bytes[*pos + 2],
88        bytes[*pos + 3],
89    ]);
90    *pos += 4;
91    Ok(value)
92}
93
94fn read_u64(bytes: &[u8], pos: &mut usize, ctx: &'static str) -> Result<u64, NativeArtifactError> {
95    if *pos + 8 > bytes.len() {
96        return Err(NativeArtifactError::Truncated(ctx));
97    }
98    let value = u64::from_le_bytes([
99        bytes[*pos],
100        bytes[*pos + 1],
101        bytes[*pos + 2],
102        bytes[*pos + 3],
103        bytes[*pos + 4],
104        bytes[*pos + 5],
105        bytes[*pos + 6],
106        bytes[*pos + 7],
107    ]);
108    *pos += 8;
109    Ok(value)
110}
111
112// ============================================================================
113// RDGA — graph adjacency
114// ============================================================================
115
/// One persisted graph edge (`RDGA`).
///
/// Fields are listed in the order they are written on disk: `edge_id` as a
/// little-endian `u64`, the three strings with the shared length-prefixed
/// framing, then `weight` as a little-endian `f32`.
#[derive(Debug, Clone, PartialEq)]
pub struct GraphAdjacencyEdge {
    /// Edge entity id (`entity_id.raw()` on the server side).
    pub edge_id: u64,
    /// Source node key.
    pub from_node: String,
    /// Target node key.
    pub to_node: String,
    /// Edge label.
    pub label: String,
    /// Edge weight.
    pub weight: f32,
}
130
131/// Encode a graph-adjacency (`RDGA`) artifact, byte-faithful to the server.
132pub fn encode_graph_adjacency(edges: &[GraphAdjacencyEdge]) -> Vec<u8> {
133    let mut data = Vec::with_capacity(32 + edges.len() * 48);
134    data.extend_from_slice(GRAPH_ADJACENCY_MAGIC);
135    data.extend_from_slice(&(edges.len() as u32).to_le_bytes());
136    for edge in edges {
137        data.extend_from_slice(&edge.edge_id.to_le_bytes());
138        push_string(&mut data, &edge.from_node);
139        push_string(&mut data, &edge.to_node);
140        push_string(&mut data, &edge.label);
141        data.extend_from_slice(&edge.weight.to_le_bytes());
142    }
143    data
144}
145
146/// Decode a graph-adjacency (`RDGA`) artifact.
147pub fn decode_graph_adjacency(
148    bytes: &[u8],
149) -> Result<Vec<GraphAdjacencyEdge>, NativeArtifactError> {
150    if bytes.len() < 8 || &bytes[0..4] != GRAPH_ADJACENCY_MAGIC {
151        return Err(NativeArtifactError::InvalidHeader(
152            "invalid graph adjacency artifact",
153        ));
154    }
155    let mut pos = 4usize;
156    let edge_count = read_u32(bytes, &mut pos, "truncated graph adjacency artifact")? as usize;
157    let mut edges = Vec::with_capacity(edge_count);
158    for _ in 0..edge_count {
159        let edge_id = read_u64(bytes, &mut pos, "truncated graph adjacency artifact")?;
160        let from_node = read_string(bytes, &mut pos)?;
161        let to_node = read_string(bytes, &mut pos)?;
162        let label = read_string(bytes, &mut pos)?;
163        if pos + 4 > bytes.len() {
164            return Err(NativeArtifactError::Truncated(
165                "truncated graph adjacency artifact weight",
166            ));
167        }
168        let weight =
169            f32::from_le_bytes([bytes[pos], bytes[pos + 1], bytes[pos + 2], bytes[pos + 3]]);
170        pos += 4;
171        edges.push(GraphAdjacencyEdge {
172            edge_id,
173            from_node,
174            to_node,
175            label,
176            weight,
177        });
178    }
179    Ok(edges)
180}
181
182// ============================================================================
183// RDFT — full-text index
184// ============================================================================
185
/// A decoded full-text index artifact (`RDFT`).
#[derive(Debug, Clone, PartialEq)]
pub struct FulltextIndex {
    /// Collection the index was built over.
    pub collection: String,
    /// Number of source documents.
    pub doc_count: u32,
    /// term -> sorted postings of `(entity_id, term_frequency)`.
    ///
    /// NOTE(review): the sort order is the producer's responsibility; the
    /// decoder keeps postings in stored order and does not re-sort them.
    pub postings: BTreeMap<String, Vec<(u64, u32)>>,
}
196
197/// Encode a full-text index (`RDFT`) artifact, byte-faithful to the server.
198///
199/// `postings` is written in `BTreeMap` (lexicographic term) order. The legacy
200/// server built the same `BTreeMap` before serialising, so the byte stream is
201/// identical.
202pub fn encode_fulltext_index(
203    collection: &str,
204    doc_count: usize,
205    postings: &BTreeMap<String, Vec<(u64, u32)>>,
206) -> Vec<u8> {
207    let mut data = Vec::with_capacity(64 + postings.len() * 32);
208    data.extend_from_slice(FULLTEXT_INDEX_MAGIC);
209    push_string(&mut data, collection);
210    data.extend_from_slice(&(doc_count as u32).to_le_bytes());
211    data.extend_from_slice(&(postings.len() as u32).to_le_bytes());
212    for (term, entries) in postings {
213        push_string(&mut data, term);
214        data.extend_from_slice(&(entries.len() as u32).to_le_bytes());
215        for (entity_id, term_count) in entries {
216            data.extend_from_slice(&entity_id.to_le_bytes());
217            data.extend_from_slice(&term_count.to_le_bytes());
218        }
219    }
220    data
221}
222
223/// Decode a full-text index (`RDFT`) artifact.
224pub fn decode_fulltext_index(bytes: &[u8]) -> Result<FulltextIndex, NativeArtifactError> {
225    if bytes.len() < 12 || &bytes[0..4] != FULLTEXT_INDEX_MAGIC {
226        return Err(NativeArtifactError::InvalidHeader(
227            "invalid fulltext artifact",
228        ));
229    }
230    let mut pos = 4usize;
231    let collection = read_string(bytes, &mut pos)?;
232    let doc_count = read_u32(bytes, &mut pos, "truncated fulltext artifact")?;
233    let term_count = read_u32(bytes, &mut pos, "truncated fulltext artifact")? as usize;
234    let mut postings = BTreeMap::new();
235    for _ in 0..term_count {
236        let term = read_string(bytes, &mut pos)?;
237        let entry_count = read_u32(bytes, &mut pos, "truncated fulltext posting count")? as usize;
238        let mut entries = Vec::with_capacity(entry_count);
239        for _ in 0..entry_count {
240            let entity_id = read_u64(bytes, &mut pos, "truncated fulltext postings")?;
241            let term_freq = read_u32(bytes, &mut pos, "truncated fulltext postings")?;
242            entries.push((entity_id, term_freq));
243        }
244        postings.insert(term, entries);
245    }
246    Ok(FulltextIndex {
247        collection,
248        doc_count,
249        postings,
250    })
251}
252
253// ============================================================================
254// RDDP — document path/value
255// ============================================================================
256
/// One persisted document record (`RDDP`).
///
/// On disk a record is the `u64` entity id, a `u32` entry count, then each
/// `(path, value)` pair as two length-prefixed strings.
#[derive(Debug, Clone, PartialEq)]
pub struct DocPathValueRecord {
    /// Document entity id, persisted as a fixed little-endian `u64`.
    pub entity_id: u64,
    /// `(path, value)` entries in persistence order.
    pub entries: Vec<(String, String)>,
}
265
/// A decoded document path/value artifact (`RDDP`).
///
/// The header's total-entry count is not stored here; the decoder only uses
/// it to validate the records it read.
#[derive(Debug, Clone, PartialEq)]
pub struct DocPathValueIndex {
    /// Collection the artifact was built over.
    pub collection: String,
    /// Document records in persistence order.
    pub documents: Vec<DocPathValueRecord>,
}
274
275/// Encode a document path/value (`RDDP`) artifact, byte-faithful to the server.
276///
277/// `entity_id` is written as a fixed little-endian `u64` (the server passes
278/// `entity_id.raw()`), never a string.
279pub fn encode_document_pathvalue(collection: &str, documents: &[DocPathValueRecord]) -> Vec<u8> {
280    let total_entries: usize = documents.iter().map(|doc| doc.entries.len()).sum();
281    let mut data = Vec::with_capacity(64 + total_entries * 48);
282    data.extend_from_slice(DOC_PATHVALUE_MAGIC);
283    push_string(&mut data, collection);
284    data.extend_from_slice(&(documents.len() as u32).to_le_bytes());
285    data.extend_from_slice(&(total_entries as u32).to_le_bytes());
286    for doc in documents {
287        data.extend_from_slice(&doc.entity_id.to_le_bytes());
288        data.extend_from_slice(&(doc.entries.len() as u32).to_le_bytes());
289        for (path, value) in &doc.entries {
290            push_string(&mut data, path);
291            push_string(&mut data, value);
292        }
293    }
294    data
295}
296
297/// Decode a document path/value (`RDDP`) artifact, validating the header's
298/// total-entry count against the decoded records.
299pub fn decode_document_pathvalue(bytes: &[u8]) -> Result<DocPathValueIndex, NativeArtifactError> {
300    if bytes.len() < 12 || &bytes[0..4] != DOC_PATHVALUE_MAGIC {
301        return Err(NativeArtifactError::InvalidHeader(
302            "invalid document path/value artifact",
303        ));
304    }
305    let mut pos = 4usize;
306    let collection = read_string(bytes, &mut pos)?;
307    let doc_count = read_u32(bytes, &mut pos, "truncated document path/value artifact")? as usize;
308    let total_entries = read_u32(bytes, &mut pos, "truncated document path/value artifact")? as u64;
309    let mut documents = Vec::with_capacity(doc_count);
310    let mut seen_entries = 0u64;
311    for _ in 0..doc_count {
312        let entity_id = read_u64(bytes, &mut pos, "truncated document path/value record")?;
313        let entry_count =
314            read_u32(bytes, &mut pos, "truncated document path/value record")? as usize;
315        let mut entries = Vec::with_capacity(entry_count);
316        for _ in 0..entry_count {
317            let path = read_string(bytes, &mut pos)?;
318            let value = read_string(bytes, &mut pos)?;
319            entries.push((path, value));
320            seen_entries += 1;
321        }
322        documents.push(DocPathValueRecord { entity_id, entries });
323    }
324    if seen_entries != total_entries {
325        return Err(NativeArtifactError::EntryCountMismatch);
326    }
327    Ok(DocPathValueIndex {
328        collection,
329        documents,
330    })
331}
332
// Round-trip and rejection tests for the three artifact codecs.
#[cfg(test)]
mod tests {
    use super::*;

    // Encoding then decoding two edges must reproduce them exactly, and the
    // first edge id must sit at a fixed offset right after magic + count.
    #[test]
    fn graph_adjacency_round_trip() {
        let edges = vec![
            GraphAdjacencyEdge {
                edge_id: 42,
                from_node: "a".to_string(),
                to_node: "b".to_string(),
                label: "knows".to_string(),
                weight: 0.5,
            },
            GraphAdjacencyEdge {
                edge_id: 7,
                from_node: "b".to_string(),
                to_node: "c".to_string(),
                label: "likes".to_string(),
                weight: -1.25,
            },
        ];
        let bytes = encode_graph_adjacency(&edges);
        assert_eq!(&bytes[0..4], b"RDGA");
        // edge_id of the first edge is a u64 immediately after the count.
        assert_eq!(&bytes[8..16], &42u64.to_le_bytes());
        assert_eq!(decode_graph_adjacency(&bytes).unwrap(), edges);
    }

    // Encoding then decoding a two-term index must preserve the collection,
    // the document count and every posting list.
    #[test]
    fn fulltext_round_trip() {
        let mut postings: BTreeMap<String, Vec<(u64, u32)>> = BTreeMap::new();
        postings.insert("alpha".to_string(), vec![(1, 2), (3, 1)]);
        postings.insert("beta".to_string(), vec![(3, 5)]);
        let bytes = encode_fulltext_index("docs", 2, &postings);
        assert_eq!(&bytes[0..4], b"RDFT");
        let decoded = decode_fulltext_index(&bytes).unwrap();
        assert_eq!(decoded.collection, "docs");
        assert_eq!(decoded.doc_count, 2);
        assert_eq!(decoded.postings, postings);
    }

    // Pins the on-disk position and width of `entity_id` in addition to the
    // plain round trip.
    #[test]
    fn doc_pathvalue_round_trip_pins_u64_entity_id() {
        let documents = vec![
            DocPathValueRecord {
                entity_id: 0xDEAD_BEEF_0000_0001,
                entries: vec![
                    ("name".to_string(), "alice".to_string()),
                    ("age".to_string(), "30".to_string()),
                ],
            },
            DocPathValueRecord {
                entity_id: 2,
                entries: vec![("name".to_string(), "bob".to_string())],
            },
        ];
        let bytes = encode_document_pathvalue("people", &documents);
        assert_eq!(&bytes[0..4], b"RDDP");

        // The entity_id must be a fixed little-endian u64 right after the
        // collection string + doc_count u32 + total_entries u32 header.
        let mut pos = 4usize;
        let coll_len =
            u32::from_le_bytes([bytes[pos], bytes[pos + 1], bytes[pos + 2], bytes[pos + 3]])
                as usize;
        pos += 4 + coll_len; // collection string
        pos += 4; // doc_count
        pos += 4; // total_entries
        let entity_id = u64::from_le_bytes(bytes[pos..pos + 8].try_into().unwrap());
        assert_eq!(entity_id, 0xDEAD_BEEF_0000_0001);

        let decoded = decode_document_pathvalue(&bytes).unwrap();
        assert_eq!(decoded.collection, "people");
        assert_eq!(decoded.documents, documents);
    }

    // A header total that disagrees with the records must be rejected even
    // though every record decodes cleanly.
    #[test]
    fn doc_pathvalue_detects_entry_count_mismatch() {
        let documents = vec![DocPathValueRecord {
            entity_id: 1,
            entries: vec![("p".to_string(), "v".to_string())],
        }];
        let mut bytes = encode_document_pathvalue("c", &documents);
        // Corrupt the total_entries header (collection "c" => len 1).
        // layout: magic(4) + len(4) + "c"(1) + doc_count(4) + total_entries(4)
        let total_off = 4 + 4 + 1 + 4;
        bytes[total_off] = 9;
        assert_eq!(
            decode_document_pathvalue(&bytes),
            Err(NativeArtifactError::EntryCountMismatch)
        );
    }

    // Each decoder must refuse a payload that is long enough but carries the
    // wrong magic.
    #[test]
    fn rejects_bad_magic() {
        assert!(matches!(
            decode_graph_adjacency(b"XXXX\0\0\0\0"),
            Err(NativeArtifactError::InvalidHeader(_))
        ));
        assert!(matches!(
            decode_fulltext_index(&[0u8; 12]),
            Err(NativeArtifactError::InvalidHeader(_))
        ));
        assert!(matches!(
            decode_document_pathvalue(&[0u8; 12]),
            Err(NativeArtifactError::InvalidHeader(_))
        ));
    }
}