Skip to main content

reddb_server/runtime/ai/
urn_codec.rs

1//! URN codec for ASK source references (issue #394).
2//!
3//! Deep module: pure bidirectional codec between a typed [`Urn`]
4//! value and its wire form.
5//!
6//! Wire grammar (per ADR 0013):
7//!
8//! ```text
9//! urn        = "reddb:" collection "/" id [ "#" suffix ]
10//! collection = pct-encoded utf-8
11//! id         = pct-encoded utf-8
12//! suffix     = pct-encoded utf-8           ; kind-specific:
13//!                                          ;   VectorHit  → score literal
14//!                                          ;   GraphEdge  → edge id
15//!                                          ;   Document   → fragment label
16//! ```
17//!
18//! Percent-encoding covers `/`, `#`, `%`, control bytes (`< 0x20`,
19//! `0x7F`), space, and all bytes ≥ `0x80` so the wire form stays
20//! ASCII and decoding can reconstruct UTF-8 byte-for-byte.
21//!
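//! No I/O. Round-tripping `decode(encode(u))` is the property the
//! unit tests pin; vector-hit scores are written with at most six
//! fractional digits, so they round-trip only to that precision.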
24
25use std::fmt;
26
/// Scheme prefix of the wire form: [`encode`] writes it first and
/// [`decode`] rejects any input that does not start with it.
const SCHEME: &str = "reddb:";
28
/// What kind of source the URN points at. Suffix payload (when
/// present) lives inside the variant.
///
/// `Row`, `KvEntry` and `GraphNode` carry no payload and encode without
/// a `#suffix`; the remaining variants always encode one. `Eq` is not
/// derived because `VectorHit` holds an `f32`.
#[derive(Debug, Clone, PartialEq)]
pub enum UrnKind {
    /// Row source; encoded without a suffix.
    Row,
    /// Key-value entry source; encoded without a suffix.
    KvEntry,
    /// Graph node source; encoded without a suffix.
    GraphNode,
    /// Vector hit; the suffix is the score written as a decimal literal.
    VectorHit { score: f32 },
    /// Document; the suffix is the fragment label.
    Document { fragment: String },
    /// Graph edge; the suffix is the edge id.
    GraphEdge { edge_id: String },
}
40
41impl UrnKind {
42    fn suffix(&self) -> Option<String> {
43        match self {
44            UrnKind::Row | UrnKind::KvEntry | UrnKind::GraphNode => None,
45            UrnKind::VectorHit { score } => Some(format_score(*score)),
46            UrnKind::Document { fragment } => Some(fragment.clone()),
47            UrnKind::GraphEdge { edge_id } => Some(edge_id.clone()),
48        }
49    }
50
51    pub fn token(&self) -> &'static str {
52        match self {
53            UrnKind::Row => "row",
54            UrnKind::KvEntry => "kv",
55            UrnKind::GraphNode => "graph_node",
56            UrnKind::VectorHit { .. } => "vector_hit",
57            UrnKind::Document { .. } => "document",
58            UrnKind::GraphEdge { .. } => "graph_edge",
59        }
60    }
61}
62
/// A typed source reference: a collection, an id, and the kind of
/// source (which carries the suffix payload, if any).
///
/// The string fields hold unescaped text; percent-encoding is applied
/// by [`encode`] and undone by [`decode`].
#[derive(Debug, Clone, PartialEq)]
pub struct Urn {
    /// Collection segment, unescaped.
    pub collection: String,
    /// Id segment, unescaped. [`decode`] never produces an empty id.
    pub id: String,
    /// Source kind, including any suffix payload.
    pub kind: UrnKind,
}
69
70impl Urn {
71    pub fn row(collection: impl Into<String>, id: impl Into<String>) -> Self {
72        Self {
73            collection: collection.into(),
74            id: id.into(),
75            kind: UrnKind::Row,
76        }
77    }
78    pub fn vector_hit(collection: impl Into<String>, id: impl Into<String>, score: f32) -> Self {
79        Self {
80            collection: collection.into(),
81            id: id.into(),
82            kind: UrnKind::VectorHit { score },
83        }
84    }
85    pub fn document(
86        collection: impl Into<String>,
87        id: impl Into<String>,
88        fragment: impl Into<String>,
89    ) -> Self {
90        Self {
91            collection: collection.into(),
92            id: id.into(),
93            kind: UrnKind::Document {
94                fragment: fragment.into(),
95            },
96        }
97    }
98    pub fn graph_node(collection: impl Into<String>, id: impl Into<String>) -> Self {
99        Self {
100            collection: collection.into(),
101            id: id.into(),
102            kind: UrnKind::GraphNode,
103        }
104    }
105    pub fn graph_edge(
106        collection: impl Into<String>,
107        id: impl Into<String>,
108        edge_id: impl Into<String>,
109    ) -> Self {
110        Self {
111            collection: collection.into(),
112            id: id.into(),
113            kind: UrnKind::GraphEdge {
114                edge_id: edge_id.into(),
115            },
116        }
117    }
118    pub fn kv(collection: impl Into<String>, id: impl Into<String>) -> Self {
119        Self {
120            collection: collection.into(),
121            id: id.into(),
122            kind: UrnKind::KvEntry,
123        }
124    }
125}
126
/// Reasons [`decode`] can reject a wire-form URN.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UrnError {
    /// The input does not start with the `reddb:` scheme.
    MissingScheme,
    /// There is no `/` after the collection, or the id after it is empty.
    /// [`decode`] also returns this when the presence of a `#suffix` does
    /// not match what the kind hint expects.
    MissingId,
    /// A `%` escape is truncated or contains a non-hex digit, or the
    /// decoded bytes are not valid UTF-8.
    InvalidPercent,
    /// The suffix of a vector-hit URN does not parse as an `f32`.
    InvalidScore,
}
134
135impl fmt::Display for UrnError {
136    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
137        match self {
138            UrnError::MissingScheme => write!(f, "URN missing reddb: scheme"),
139            UrnError::MissingId => write!(f, "URN missing /id segment"),
140            UrnError::InvalidPercent => write!(f, "URN has invalid percent-encoding"),
141            UrnError::InvalidScore => write!(f, "URN vector_hit suffix is not a score"),
142        }
143    }
144}
145
146impl std::error::Error for UrnError {}
147
148pub fn encode(urn: &Urn) -> String {
149    let mut s = String::with_capacity(SCHEME.len() + urn.collection.len() + urn.id.len() + 8);
150    s.push_str(SCHEME);
151    pct_encode_into(&urn.collection, &mut s);
152    s.push('/');
153    pct_encode_into(&urn.id, &mut s);
154    if let Some(suffix) = urn.kind.suffix() {
155        s.push('#');
156        pct_encode_into(&suffix, &mut s);
157    }
158    s
159}
160
/// Hint passed to [`decode`] so the codec stays pure.
///
/// The hint selects which [`UrnKind`] variant is produced and whether a
/// `#suffix` must be present in the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KindHint {
    /// Produce [`UrnKind::Row`]; the input must have no suffix.
    Row,
    /// Produce [`UrnKind::KvEntry`]; the input must have no suffix.
    KvEntry,
    /// Produce [`UrnKind::GraphNode`]; the input must have no suffix.
    GraphNode,
    /// Produce [`UrnKind::VectorHit`]; the suffix is parsed as an `f32` score.
    VectorHit,
    /// Produce [`UrnKind::Document`]; the suffix becomes the fragment.
    Document,
    /// Produce [`UrnKind::GraphEdge`]; the suffix becomes the edge id.
    GraphEdge,
}
171
172pub fn decode(s: &str, hint: KindHint) -> Result<Urn, UrnError> {
173    let rest = s.strip_prefix(SCHEME).ok_or(UrnError::MissingScheme)?;
174    let (head, suffix) = match rest.split_once('#') {
175        Some((h, s)) => (h, Some(pct_decode(s)?)),
176        None => (rest, None),
177    };
178    let (collection, id) = head.split_once('/').ok_or(UrnError::MissingId)?;
179    if id.is_empty() {
180        return Err(UrnError::MissingId);
181    }
182    let collection = pct_decode(collection)?;
183    let id = pct_decode(id)?;
184    let kind = match (hint, suffix) {
185        (KindHint::Row, None) => UrnKind::Row,
186        (KindHint::KvEntry, None) => UrnKind::KvEntry,
187        (KindHint::GraphNode, None) => UrnKind::GraphNode,
188        (KindHint::VectorHit, Some(sx)) => {
189            let score: f32 = sx.parse().map_err(|_| UrnError::InvalidScore)?;
190            UrnKind::VectorHit { score }
191        }
192        (KindHint::Document, Some(sx)) => UrnKind::Document { fragment: sx },
193        (KindHint::GraphEdge, Some(sx)) => UrnKind::GraphEdge { edge_id: sx },
194        _ => return Err(UrnError::MissingId),
195    };
196    Ok(Urn {
197        collection,
198        id,
199        kind,
200    })
201}
202
203fn pct_encode_into(input: &str, out: &mut String) {
204    for &b in input.as_bytes() {
205        if needs_pct(b) {
206            out.push('%');
207            out.push(hex_high(b));
208            out.push(hex_low(b));
209        } else {
210            out.push(b as char);
211        }
212    }
213}
214
215fn pct_decode(input: &str) -> Result<String, UrnError> {
216    let bytes = input.as_bytes();
217    let mut out = Vec::with_capacity(bytes.len());
218    let mut i = 0;
219    while i < bytes.len() {
220        if bytes[i] == b'%' {
221            if i + 2 >= bytes.len() {
222                return Err(UrnError::InvalidPercent);
223            }
224            let hi = hex_value(bytes[i + 1]).ok_or(UrnError::InvalidPercent)?;
225            let lo = hex_value(bytes[i + 2]).ok_or(UrnError::InvalidPercent)?;
226            out.push((hi << 4) | lo);
227            i += 3;
228        } else {
229            out.push(bytes[i]);
230            i += 1;
231        }
232    }
233    String::from_utf8(out).map_err(|_| UrnError::InvalidPercent)
234}
235
/// Reports whether byte `b` must be written as a `%XX` escape: the
/// separators `%`, `/`, `#`, the space, and everything outside the
/// printable ASCII range `0x21..=0x7E`.
fn needs_pct(b: u8) -> bool {
    match b {
        b'%' | b'/' | b'#' | b' ' => true,
        0x21..=0x7E => false,
        _ => true,
    }
}
239
/// Returns the uppercase hex digit for the high nibble of `b`.
fn hex_high(b: u8) -> char {
    const DIGITS: &[u8; 16] = b"0123456789ABCDEF";
    char::from(DIGITS[usize::from(b >> 4)])
}
248
/// Returns the uppercase hex digit for the low nibble of `b`.
fn hex_low(b: u8) -> char {
    let nibble = b & 0x0F;
    let ascii = match nibble {
        0..=9 => b'0' + nibble,
        _ => b'A' + (nibble - 10),
    };
    char::from(ascii)
}
257
/// Converts one ASCII hex digit (either case) to its numeric value, or
/// returns `None` when `b` is not a hex digit.
fn hex_value(b: u8) -> Option<u8> {
    // Fold case first so one letter range covers both `a-f` and `A-F`.
    let folded = b.to_ascii_lowercase();
    if folded.is_ascii_digit() {
        Some(folded - b'0')
    } else if (b'a'..=b'f').contains(&folded) {
        Some(folded - b'a' + 10)
    } else {
        None
    }
}
266
/// Renders `score` with six fractional digits, then drops trailing
/// zeros and a decimal point left dangling by that trimming.
///
/// Output without a decimal point (e.g. non-finite values) is returned
/// untouched.
fn format_score(score: f32) -> String {
    let fixed = format!("{:.6}", score);
    if !fixed.contains('.') {
        return fixed;
    }
    fixed
        .trim_end_matches('0')
        .trim_end_matches('.')
        .to_string()
}
279
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `urn`, then decodes the resulting wire form under `hint`.
    fn reparse(urn: &Urn, hint: KindHint) -> Result<Urn, UrnError> {
        decode(&encode(urn), hint)
    }

    #[test]
    fn row_round_trip() {
        let wire = "reddb:incidents/42";
        let urn = Urn::row("incidents", "42");
        assert_eq!(encode(&urn), wire);
        assert_eq!(decode(wire, KindHint::Row), Ok(urn));
    }

    #[test]
    fn kv_round_trip() {
        let urn = Urn::kv("settings", "ask.cache.enabled");
        assert_eq!(reparse(&urn, KindHint::KvEntry), Ok(urn));
    }

    #[test]
    fn graph_node_round_trip() {
        let urn = Urn::graph_node("hosts", "n-7");
        assert_eq!(reparse(&urn, KindHint::GraphNode), Ok(urn));
    }

    #[test]
    fn vector_hit_round_trip() {
        // Scores survive only to six fractional digits, so compare approximately.
        let original = Urn::vector_hit("docs", "doc-9", 0.87125);
        let decoded = reparse(&original, KindHint::VectorHit).unwrap();
        assert_eq!(decoded.collection, "docs");
        assert_eq!(decoded.id, "doc-9");
        if let UrnKind::VectorHit { score } = decoded.kind {
            assert!((score - 0.87125).abs() < 1e-5);
        } else {
            panic!("wrong kind");
        }
    }

    #[test]
    fn vector_hit_score_format_stable() {
        let cases = [
            (0.5_f32, "0.5"),
            (1.0, "1"),
            (0.0, "0"),
            (0.123456, "0.123456"),
        ];
        for (score, text) in cases {
            assert_eq!(format_score(score), text);
        }
    }

    #[test]
    fn document_round_trip_with_fragment() {
        let urn = Urn::document("manuals", "m-1", "chunk-7");
        assert_eq!(encode(&urn), "reddb:manuals/m-1#chunk-7");
        assert_eq!(reparse(&urn, KindHint::Document), Ok(urn));
    }

    #[test]
    fn graph_edge_round_trip() {
        let urn = Urn::graph_edge("hosts", "n-1", "e-77");
        assert_eq!(encode(&urn), "reddb:hosts/n-1#e-77");
        assert_eq!(reparse(&urn, KindHint::GraphEdge), Ok(urn));
    }

    #[test]
    fn percent_encodes_separators_in_collection() {
        let urn = Urn::row("we/ird#name", "id");
        assert_eq!(encode(&urn), "reddb:we%2Fird%23name/id");
        assert_eq!(reparse(&urn, KindHint::Row), Ok(urn));
    }

    #[test]
    fn percent_encodes_separators_in_id() {
        let urn = Urn::row("col", "a/b#c");
        assert_eq!(encode(&urn), "reddb:col/a%2Fb%23c");
        assert_eq!(reparse(&urn, KindHint::Row), Ok(urn));
    }

    #[test]
    fn percent_encodes_space_and_percent() {
        let urn = Urn::row("col with space", "100%");
        assert_eq!(encode(&urn), "reddb:col%20with%20space/100%25");
        assert_eq!(reparse(&urn, KindHint::Row), Ok(urn));
    }

    #[test]
    fn percent_encodes_control_bytes() {
        let urn = Urn::row("col\nname", "id\t");
        let wire = encode(&urn);
        for escape in ["%0A", "%09"] {
            assert!(wire.contains(escape));
        }
        assert_eq!(decode(&wire, KindHint::Row), Ok(urn));
    }

    #[test]
    fn utf8_round_trips_via_pct_encoding() {
        let urn = Urn::row("日本語", "café");
        let s = encode(&urn);
        assert!(s.is_ascii(), "wire URN must be ASCII: {s}");
        assert_eq!(decode(&s, KindHint::Row), Ok(urn));
    }

    #[test]
    fn fragment_with_special_chars_round_trips() {
        let urn = Urn::document("docs", "d-1", "section/2#a b");
        assert_eq!(reparse(&urn, KindHint::Document), Ok(urn));
    }

    #[test]
    fn missing_scheme_rejected() {
        let outcome = decode("not-a-urn/x", KindHint::Row);
        assert_eq!(outcome, Err(UrnError::MissingScheme));
    }

    #[test]
    fn missing_id_rejected() {
        for wire in ["reddb:colonly", "reddb:col/"] {
            assert_eq!(decode(wire, KindHint::Row), Err(UrnError::MissingId));
        }
    }

    #[test]
    fn invalid_percent_rejected() {
        for wire in ["reddb:col%2/id", "reddb:col/id%ZZ"] {
            assert_eq!(decode(wire, KindHint::Row), Err(UrnError::InvalidPercent));
        }
    }

    #[test]
    fn vector_hit_invalid_score_rejected() {
        let outcome = decode("reddb:docs/d-1#nope", KindHint::VectorHit);
        assert_eq!(outcome, Err(UrnError::InvalidScore));
    }

    #[test]
    fn hint_mismatch_rejected() {
        let without_suffix = encode(&Urn::row("col", "id"));
        assert!(decode(&without_suffix, KindHint::VectorHit).is_err());

        let with_suffix = encode(&Urn::vector_hit("col", "id", 0.5));
        assert!(decode(&with_suffix, KindHint::Row).is_err());
    }

    #[test]
    fn token_is_stable() {
        let expected = [
            (UrnKind::Row, "row"),
            (UrnKind::KvEntry, "kv"),
            (UrnKind::GraphNode, "graph_node"),
            (UrnKind::VectorHit { score: 0.0 }, "vector_hit"),
            (
                UrnKind::Document {
                    fragment: "x".into(),
                },
                "document",
            ),
            (
                UrnKind::GraphEdge {
                    edge_id: "e".into(),
                },
                "graph_edge",
            ),
        ];
        for (kind, token) in expected {
            assert_eq!(kind.token(), token);
        }
    }

    /// Pseudo-property test: a fixed matrix of collections and ids that
    /// contain separators, `%`, spaces, control bytes and non-ASCII text,
    /// round-tripped through every kind.
    #[test]
    fn property_round_trip_byte_matrix() {
        let collections = [
            "simple",
            "with/slash",
            "with#hash",
            "with%pct",
            "with space",
            "with\ttab",
            "with\nnewline",
            "café",
            "日本語",
            "mixed/ # %",
        ];
        let ids = ["1", "abc", "uuid-1234", "with/slash", "deep/path#frag"];

        for c in collections {
            for i in ids {
                // Kinds without a suffix must round-trip exactly.
                let plain = [
                    (KindHint::Row, Urn::row(c, i)),
                    (KindHint::KvEntry, Urn::kv(c, i)),
                    (KindHint::GraphNode, Urn::graph_node(c, i)),
                ];
                for (hint, u) in plain {
                    let s = encode(&u);
                    assert_eq!(decode(&s, hint).unwrap(), u, "mismatch for {s}");
                }

                // Scores are lossy, so only the string segments are compared.
                let hit = Urn::vector_hit(c, i, 0.42);
                let decoded_hit = reparse(&hit, KindHint::VectorHit).unwrap();
                assert_eq!(decoded_hit.collection, hit.collection);
                assert_eq!(decoded_hit.id, hit.id);

                let doc = Urn::document(c, i, "frag/with#stuff");
                assert_eq!(reparse(&doc, KindHint::Document).unwrap(), doc);

                let edge = Urn::graph_edge(c, i, "edge%01");
                assert_eq!(reparse(&edge, KindHint::GraphEdge).unwrap(), edge);
            }
        }
    }
}
493}