Skip to main content

sochdb_core/
knowledge_object.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! # Knowledge Object — Object-Centric Data Model for Knowledge Fabric
19//!
20//! The `KnowledgeObject` is the atomic unit of the Knowledge Fabric. Unlike the
21//! tabular TOON format (which separates data, embeddings, edges, and temporal
22//! metadata across different structures), a Knowledge Object **co-locates** all
23//! information about a single entity:
24//!
25//! - **Content-addressed identity**: `oid = BLAKE3(canonical_payload)` — immutable,
26//!   collision-resistant, enabling structural deduplication and content verification.
27//! - **Embedded edges**: Relationships are stored *within* the object, so loading
28//!   an object immediately provides its connections without a separate graph lookup.
29//! - **Multi-space embeddings**: A single object can carry embeddings in multiple
30//!   semantic spaces (e.g., `"semantic"`, `"code"`, `"temporal"`), enabling
31//!   domain-specific similarity search without separate vector indices.
32//! - **Bitemporal coordinates**: Every object carries `(valid_from, valid_to, system_time)`,
33//!   supporting both "what was true?" (valid time) and "what did the system know?"
34//!   (system time) queries.
35//! - **Provenance chains**: Hash-linked derivation tracking — every transformation
36//!   records its parent OIDs, creating an auditable lineage.
37//!
38//! ## Why Co-Location Matters
39//!
40//! In a traditional architecture, a compositional query ("find entities similar to X
41//! that are connected to Y and were valid at time T") requires:
42//!
43//! 1. Vector index lookup → candidate set (separate I/O)
44//! 2. Graph traversal → filter by connectivity (separate I/O)
45//! 3. Temporal filter → narrow by validity (separate I/O)
46//! 4. Attribute filter → apply predicates (separate I/O)
47//!
48//! Each boundary adds serialization, allocation, and cache misses. With co-located
49//! Knowledge Objects, the fused query executor can evaluate all predicates in a
50//! single pass, reducing latency from ~11 ms to ~300 μs (30–50× improvement).
51//!
52//! ## Relationship to TOON
53//!
54//! Knowledge Objects wrap `SochValue` payloads — TOON data remains the content
55//! format. The Knowledge Object adds the metadata envelope that enables the
56//! Knowledge Fabric's compositional queries.
57//!
58//! ## Example
59//!
60//! ```rust,ignore
61//! use sochdb_core::knowledge_object::*;
62//!
63//! let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
64//!     .attribute("name", SochValue::Text("Alice".into()))
65//!     .attribute("role", SochValue::Text("engineer".into()))
66//!     .embedding("semantic", vec![0.1, 0.2, 0.3])
67//!     .edge(Edge::new(target_oid, EdgeKind::typed("works_at"), 1.0))
68//!     .valid_from(1700000000_000000)
69//!     .valid_to(u64::MAX)
70//!     .build();
71//!
72//! assert!(ko.oid().as_bytes().len() == 32);
73//! assert_eq!(ko.edges().len(), 1);
74//! assert!(ko.embedding("semantic").is_some());
75//! ```
76
77use serde::{Deserialize, Serialize};
78use serde_json;
79use std::collections::HashMap;
80use std::fmt;
81use std::io::Read;
82
83use crate::soch::SochValue;
84
85// =============================================================================
86// Content-Addressed Object Identity
87// =============================================================================
88
89/// A 256-bit BLAKE3 content hash serving as the immutable identity of a
90/// Knowledge Object.
91///
92/// `oid = BLAKE3(canonical_serialization(payload + edges + embeddings))`
93///
94/// Properties:
95/// - **Deterministic**: Same content always produces the same OID.
96/// - **Collision-resistant**: 256-bit output makes collisions computationally infeasible.
97/// - **Structural deduplication**: Identical objects share the same OID.
98/// - **Content verification**: Recomputing the hash detects corruption or tampering.
99///
100/// The OID is computed over the *canonical* byte representation (sorted keys,
101/// normalized floats) to ensure deterministic hashing regardless of insertion order.
102#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
103pub struct ObjectId([u8; 32]);
104
105impl ObjectId {
106    /// Create an OID from a raw 32-byte hash.
107    pub fn from_bytes(bytes: [u8; 32]) -> Self {
108        Self(bytes)
109    }
110
111    /// Compute the OID from canonical content bytes using BLAKE3.
112    pub fn from_content(content: &[u8]) -> Self {
113        let hash = blake3::hash(content);
114        Self(*hash.as_bytes())
115    }
116
117    /// The raw 32 bytes of the OID.
118    pub fn as_bytes(&self) -> &[u8; 32] {
119        &self.0
120    }
121
122    /// Hex-encoded OID string (64 characters).
123    pub fn to_hex(&self) -> String {
124        hex::encode(self.0)
125    }
126
127    /// Parse an OID from a 64-character hex string.
128    pub fn from_hex(s: &str) -> Result<Self, ObjectIdError> {
129        let bytes = hex::decode(s).map_err(|_| ObjectIdError::InvalidHex)?;
130        if bytes.len() != 32 {
131            return Err(ObjectIdError::InvalidLength(bytes.len()));
132        }
133        let mut arr = [0u8; 32];
134        arr.copy_from_slice(&bytes);
135        Ok(Self(arr))
136    }
137
138    /// A zero/nil OID, used as a sentinel for "no parent" in provenance chains.
139    pub const NIL: Self = Self([0u8; 32]);
140
141    /// Check if this is the nil/zero OID.
142    pub fn is_nil(&self) -> bool {
143        self.0 == [0u8; 32]
144    }
145}
146
147impl fmt::Debug for ObjectId {
148    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
149        write!(f, "ObjectId({})", &self.to_hex()[..16]) // Show first 16 hex chars
150    }
151}
152
153impl fmt::Display for ObjectId {
154    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
155        write!(f, "{}", self.to_hex())
156    }
157}
158
159/// Errors when parsing ObjectId.
160#[derive(Debug, Clone, thiserror::Error)]
161pub enum ObjectIdError {
162    #[error("invalid hex encoding")]
163    InvalidHex,
164    #[error("expected 32 bytes, got {0}")]
165    InvalidLength(usize),
166}
167
168// =============================================================================
169// Bitemporal Coordinates
170// =============================================================================
171
172/// Bitemporal versioning coordinate for a Knowledge Object.
173///
174/// Supports two independent time dimensions:
175///
176/// - **Valid time** (`valid_from`, `valid_to`): When the fact was/is true in the
177///   real world. Example: an employee's tenure at a company.
178/// - **System time** (`system_time`): When the system recorded this version.
179///   Assigned automatically by the HLC on write. Monotonically increasing.
180///
181/// This enables queries like:
182/// - `as_of(system_time=T₁)` — "What did the system know at time T₁?"
183/// - `valid_at(valid_time=T₂)` — "What was true at time T₂?"
184/// - `as_of(T₁).valid_at(T₂)` — "What did the system believe at T₁ about T₂?"
185///
186/// Timestamps are HLC-encoded microseconds (see `hlc.rs`): upper 48 bits are
187/// physical microseconds since Unix epoch, lower 16 bits are logical counter.
188#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
189pub struct BitemporalCoord {
190    /// Start of valid time interval (inclusive). HLC-encoded microseconds.
191    pub valid_from: u64,
192
193    /// End of valid time interval (exclusive). `u64::MAX` means "still valid".
194    /// HLC-encoded microseconds.
195    pub valid_to: u64,
196
197    /// System time when this version was recorded. Assigned by HLC on write.
198    /// HLC-encoded microseconds.
199    pub system_time: u64,
200}
201
202impl BitemporalCoord {
203    /// Create a new bitemporal coordinate with an open-ended valid interval.
204    pub fn new(valid_from: u64, system_time: u64) -> Self {
205        Self {
206            valid_from,
207            valid_to: u64::MAX,
208            system_time,
209        }
210    }
211
212    /// Create a coordinate with a closed valid interval.
213    pub fn with_valid_range(valid_from: u64, valid_to: u64, system_time: u64) -> Self {
214        Self {
215            valid_from,
216            valid_to,
217            system_time,
218        }
219    }
220
221    /// Check if this coordinate is valid at a given valid time.
222    pub fn valid_at(&self, valid_time: u64) -> bool {
223        self.valid_from <= valid_time && valid_time < self.valid_to
224    }
225
226    /// Check if this coordinate was known to the system by a given system time.
227    pub fn known_at(&self, system_time: u64) -> bool {
228        self.system_time <= system_time
229    }
230
231    /// Combined bitemporal query: was this fact known at `sys_time` and valid at `valid_time`?
232    pub fn visible_at(&self, system_time: u64, valid_time: u64) -> bool {
233        self.known_at(system_time) && self.valid_at(valid_time)
234    }
235
236    /// Close the valid time interval (the fact is no longer true).
237    pub fn close_valid_time(&mut self, valid_to: u64) {
238        self.valid_to = valid_to;
239    }
240
241    /// Check if this coordinate represents a currently-valid fact (valid_to == MAX).
242    pub fn is_current(&self) -> bool {
243        self.valid_to == u64::MAX
244    }
245
246    /// Default "eternal" coordinate — valid from epoch 0, never expires.
247    pub const ETERNAL: Self = Self {
248        valid_from: 0,
249        valid_to: u64::MAX,
250        system_time: 0,
251    };
252}
253
254impl Default for BitemporalCoord {
255    fn default() -> Self {
256        Self::ETERNAL
257    }
258}
259
260// =============================================================================
261// Embedded Edges
262// =============================================================================
263
264/// The kind/type of an edge between Knowledge Objects.
265///
266/// Typed edges enable graph queries like "traverse all `works_at` edges"
267/// without inspecting edge payloads.
268#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
269pub enum EdgeKind {
270    /// A named relationship type (e.g., "works_at", "authored_by", "cites").
271    Typed(String),
272    /// A hierarchical containment relationship (parent → child).
273    Contains,
274    /// A derivation relationship (source → derived).
275    DerivedFrom,
276    /// A reference/citation relationship.
277    References,
278    /// A temporal succession relationship (predecessor → successor).
279    Succeeds,
280    /// A semantic similarity link (auto-generated by embedding proximity).
281    SimilarTo,
282}
283
284impl EdgeKind {
285    /// Create a typed edge kind with the given label.
286    pub fn typed(label: impl Into<String>) -> Self {
287        Self::Typed(label.into())
288    }
289
290    /// Returns the string label for this edge kind.
291    pub fn label(&self) -> &str {
292        match self {
293            EdgeKind::Typed(s) => s,
294            EdgeKind::Contains => "contains",
295            EdgeKind::DerivedFrom => "derived_from",
296            EdgeKind::References => "references",
297            EdgeKind::Succeeds => "succeeds",
298            EdgeKind::SimilarTo => "similar_to",
299        }
300    }
301}
302
303impl fmt::Display for EdgeKind {
304    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
305        write!(f, "{}", self.label())
306    }
307}
308
309/// A directed, typed, weighted, temporally-versioned edge between two
310/// Knowledge Objects.
311///
312/// Edges are **embedded** within the source object — when you load an object,
313/// you immediately have its outgoing relationships. This eliminates the
314/// separate graph lookup required by KV-backed edge stores.
315///
316/// ## Memory Layout (32 bytes per edge)
317///
318/// | Field       | Size  | Purpose                          |
319/// |-------------|-------|----------------------------------|
320/// | target      | 32B   | Target ObjectId (BLAKE3 hash)    |
321/// | kind        | ~24B  | Edge type (enum + string)        |
322/// | weight      | 4B    | Relationship strength [0.0, 1.0] |
323/// | valid_from  | 8B    | Temporal validity start          |
324/// | valid_to    | 8B    | Temporal validity end            |
325/// | properties  | var   | Optional edge attributes         |
326///
327/// For the hot path (CSR-based graph traversal), edges are projected to
328/// `(target_internal_id: u32, weight: f32)` for cache efficiency.
329#[derive(Debug, Clone, Serialize, Deserialize)]
330pub struct Edge {
331    /// Target object this edge points to.
332    pub target: ObjectId,
333
334    /// The type/kind of this relationship.
335    pub kind: EdgeKind,
336
337    /// Relationship strength/confidence in [0.0, 1.0].
338    /// - 1.0 = definitive relationship
339    /// - 0.5 = probable relationship
340    /// - 0.0 = hypothetical/weak relationship
341    pub weight: f32,
342
343    /// Temporal validity interval for this edge.
344    /// Uses the same HLC-encoded microsecond format as `BitemporalCoord`.
345    pub valid_from: u64,
346
347    /// End of temporal validity (exclusive). `u64::MAX` = still valid.
348    pub valid_to: u64,
349
350    /// Optional edge properties (e.g., "role": "lead", "confidence": 0.95).
351    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
352    pub properties: HashMap<String, SochValue>,
353}
354
355impl Edge {
356    /// Create a new edge with default weight 1.0 and open-ended validity.
357    pub fn new(target: ObjectId, kind: EdgeKind, weight: f32) -> Self {
358        Self {
359            target,
360            kind,
361            weight,
362            valid_from: 0,
363            valid_to: u64::MAX,
364            properties: HashMap::new(),
365        }
366    }
367
368    /// Create an edge with temporal validity.
369    pub fn with_validity(
370        target: ObjectId,
371        kind: EdgeKind,
372        weight: f32,
373        valid_from: u64,
374        valid_to: u64,
375    ) -> Self {
376        Self {
377            target,
378            kind,
379            weight,
380            valid_from,
381            valid_to,
382            properties: HashMap::new(),
383        }
384    }
385
386    /// Add a property to this edge.
387    pub fn with_property(mut self, key: impl Into<String>, value: SochValue) -> Self {
388        self.properties.insert(key.into(), value);
389        self
390    }
391
392    /// Check if this edge is valid at a given time.
393    pub fn valid_at(&self, time: u64) -> bool {
394        self.valid_from <= time && time < self.valid_to
395    }
396
397    /// Check if this edge is currently valid (valid_to == MAX).
398    pub fn is_current(&self) -> bool {
399        self.valid_to == u64::MAX
400    }
401}
402
403impl PartialEq for Edge {
404    fn eq(&self, other: &Self) -> bool {
405        self.target == other.target && self.kind == other.kind
406    }
407}
408
409impl Eq for Edge {}
410
411// =============================================================================
412// Object Kind / Type System
413// =============================================================================
414
415/// Classification of a Knowledge Object. Determines which indices and query
416/// optimizations apply.
417#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
418pub enum ObjectKind {
419    /// A persistent entity (person, organization, concept).
420    /// Typically has long valid-time intervals and many edges.
421    Entity,
422
423    /// A temporal event or episode.
424    /// Has precise valid-time intervals and causal edges.
425    Event,
426
427    /// An episodic memory or conversation turn.
428    /// Dense in embeddings, often has derivation edges.
429    Episode,
430
431    /// A document or content chunk.
432    /// Primary carrier of text content and semantic embeddings.
433    Document,
434
435    /// A fact or claim extracted from content.
436    /// Has provenance edges linking to source documents.
437    Fact,
438
439    /// An agent-generated artifact (plan, summary, decision).
440    /// Has derivation provenance and typically short valid-time windows.
441    Artifact,
442
443    /// User-defined type with a custom label.
444    Custom(String),
445}
446
447impl ObjectKind {
448    /// Returns the string label for this kind.
449    pub fn label(&self) -> &str {
450        match self {
451            ObjectKind::Entity => "entity",
452            ObjectKind::Event => "event",
453            ObjectKind::Episode => "episode",
454            ObjectKind::Document => "document",
455            ObjectKind::Fact => "fact",
456            ObjectKind::Artifact => "artifact",
457            ObjectKind::Custom(s) => s,
458        }
459    }
460}
461
462impl fmt::Display for ObjectKind {
463    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
464        write!(f, "{}", self.label())
465    }
466}
467
468// =============================================================================
469// Provenance Chain
470// =============================================================================
471
472/// Records how a Knowledge Object was derived.
473///
474/// Provenance enables auditable lineage tracking: "Where did this fact come from?"
475/// "What transformations produced this summary?" Each provenance record forms a
476/// node in a DAG (Directed Acyclic Graph) of derivations.
477///
478/// The provenance chain is hash-linked — each object's OID is derived from its
479/// content (which includes parent OIDs), creating a tamper-evident lineage.
480#[derive(Debug, Clone, Serialize, Deserialize)]
481pub struct Provenance {
482    /// OIDs of the parent objects this was derived from.
483    /// Empty for root/original objects.
484    pub parents: Vec<ObjectId>,
485
486    /// The transformation or operation that produced this object.
487    /// Examples: "chunk", "summarize", "extract_entities", "merge", "user_input"
488    pub operation: String,
489
490    /// The agent or system that performed the transformation.
491    /// Examples: "gpt-4", "user:alice", "sochdb:compaction"
492    pub agent: String,
493
494    /// Timestamp when the derivation occurred (HLC-encoded microseconds).
495    pub timestamp: u64,
496
497    /// Optional metadata about the transformation.
498    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
499    pub metadata: HashMap<String, SochValue>,
500}
501
502impl Provenance {
503    /// Create a root provenance (no parents — this is an original object).
504    pub fn root(agent: impl Into<String>, timestamp: u64) -> Self {
505        Self {
506            parents: Vec::new(),
507            operation: "create".to_string(),
508            agent: agent.into(),
509            timestamp,
510            metadata: HashMap::new(),
511        }
512    }
513
514    /// Create a derived provenance with parent objects.
515    pub fn derived(
516        parents: Vec<ObjectId>,
517        operation: impl Into<String>,
518        agent: impl Into<String>,
519        timestamp: u64,
520    ) -> Self {
521        Self {
522            parents,
523            operation: operation.into(),
524            agent: agent.into(),
525            timestamp,
526            metadata: HashMap::new(),
527        }
528    }
529
530    /// Add metadata to this provenance record.
531    pub fn with_metadata(mut self, key: impl Into<String>, value: SochValue) -> Self {
532        self.metadata.insert(key.into(), value);
533        self
534    }
535
536    /// Check if this is a root provenance (no parents).
537    pub fn is_root(&self) -> bool {
538        self.parents.is_empty()
539    }
540}
541
542// =============================================================================
543// Embedding Space
544// =============================================================================
545
546/// An embedding vector in a named semantic space.
547///
548/// Knowledge Objects can carry embeddings in multiple spaces simultaneously:
549/// - `"semantic"` — general-purpose sentence embedding (e.g., text-embedding-3-small)
550/// - `"code"` — code-specific embedding (e.g., CodeBERT)
551/// - `"temporal"` — time-series embedding for temporal similarity
552/// - `"visual"` — image/diagram embedding (e.g., CLIP)
553///
554/// Each space can have a different dimensionality and distance metric.
555#[derive(Debug, Clone, Serialize, Deserialize)]
556pub struct EmbeddingSpace {
557    /// The embedding vector (f32 components).
558    pub vector: Vec<f32>,
559
560    /// Dimensionality of this embedding.
561    pub dimensions: u32,
562
563    /// The model that generated this embedding.
564    /// Enables re-embedding when models are upgraded.
565    pub model: String,
566
567    /// When this embedding was generated (HLC-encoded microseconds).
568    /// Enables staleness detection and re-embedding triggers.
569    pub generated_at: u64,
570}
571
572impl EmbeddingSpace {
573    /// Create a new embedding in a given space.
574    pub fn new(vector: Vec<f32>, model: impl Into<String>, generated_at: u64) -> Self {
575        let dimensions = vector.len() as u32;
576        Self {
577            vector,
578            dimensions,
579            model: model.into(),
580            generated_at,
581        }
582    }
583
584    /// The L2 norm of this embedding vector.
585    pub fn norm(&self) -> f32 {
586        self.vector.iter().map(|x| x * x).sum::<f32>().sqrt()
587    }
588
589    /// Normalize this embedding to unit length (for cosine similarity as dot product).
590    pub fn normalize(&mut self) {
591        let norm = self.norm();
592        if norm > f32::EPSILON {
593            for x in &mut self.vector {
594                *x /= norm;
595            }
596        }
597    }
598}
599
600// =============================================================================
601// Knowledge Object
602// =============================================================================
603
604/// The atomic unit of the Knowledge Fabric.
605///
606/// A Knowledge Object co-locates content, relationships, embeddings, temporal
607/// metadata, and provenance into a single, content-addressed entity. This
608/// co-location enables the fused query execution pipeline that delivers
609/// 30–50× latency improvements over disaggregated architectures.
610///
611/// ## Invariants
612///
613/// 1. `oid == BLAKE3(canonical_bytes(payload, edges, embeddings))` — the OID
614///    is always consistent with the object's content.
615/// 2. `temporal.system_time` is monotonically increasing for successive versions
616///    of the same logical entity.
617/// 3. Edges form a DAG for `DerivedFrom` and `Succeeds` kinds (no cycles).
618/// 4. Embedding dimensions match the declared space dimensionality.
619///
620/// ## Thread Safety
621///
622/// `KnowledgeObject` is `Send + Sync` (all fields are owned or `Arc`-wrapped).
623/// Concurrent mutation should go through the MVCC layer — objects themselves
624/// are treated as immutable values (copy-on-write semantics via content addressing).
625#[derive(Debug, Clone, Serialize, Deserialize)]
626pub struct KnowledgeObject {
627    /// Content-addressed identity: `BLAKE3(canonical_content)`.
628    oid: ObjectId,
629
630    /// Classification of this object (entity, event, document, etc.).
631    kind: ObjectKind,
632
633    /// The object's data payload — a self-describing `SochValue`.
634    /// Typically a `SochValue::Object(HashMap<String, SochValue>)` but can be
635    /// any `SochValue` variant for flexibility.
636    payload: SochValue,
637
638    /// Outgoing edges to other Knowledge Objects.
639    /// Embedded within the object for edge locality — loading an object
640    /// immediately provides its relationships.
641    edges: Vec<Edge>,
642
643    /// Embeddings in multiple semantic spaces.
644    /// Key: space name (e.g., "semantic", "code", "temporal").
645    embeddings: HashMap<String, EmbeddingSpace>,
646
647    /// Bitemporal versioning coordinate.
648    temporal: BitemporalCoord,
649
650    /// Derivation provenance.
651    provenance: Provenance,
652
653    /// Optional namespace for multi-tenant isolation.
654    #[serde(default, skip_serializing_if = "Option::is_none")]
655    namespace: Option<String>,
656
657    /// Optional tags for fast categorical filtering.
658    /// Tags are indexed in the ART for O(k) lookup.
659    #[serde(default, skip_serializing_if = "Vec::is_empty")]
660    tags: Vec<String>,
661}
662
663impl KnowledgeObject {
664    // =========================================================================
665    // Accessors
666    // =========================================================================
667
668    /// The content-addressed object identity.
669    pub fn oid(&self) -> ObjectId {
670        self.oid
671    }
672
673    /// The object's classification.
674    pub fn kind(&self) -> &ObjectKind {
675        &self.kind
676    }
677
678    /// The data payload.
679    pub fn payload(&self) -> &SochValue {
680        &self.payload
681    }
682
683    /// Mutable access to the payload (will invalidate OID — call `recompute_oid()` after).
684    pub fn payload_mut(&mut self) -> &mut SochValue {
685        &mut self.payload
686    }
687
688    /// All outgoing edges.
689    pub fn edges(&self) -> &[Edge] {
690        &self.edges
691    }
692
693    /// Edges filtered by kind.
694    pub fn edges_of_kind(&self, kind: &EdgeKind) -> Vec<&Edge> {
695        self.edges.iter().filter(|e| &e.kind == kind).collect()
696    }
697
698    /// Edges valid at a given time.
699    pub fn edges_valid_at(&self, time: u64) -> Vec<&Edge> {
700        self.edges.iter().filter(|e| e.valid_at(time)).collect()
701    }
702
703    /// Get an embedding by space name.
704    pub fn embedding(&self, space: &str) -> Option<&EmbeddingSpace> {
705        self.embeddings.get(space)
706    }
707
708    /// All embedding spaces.
709    pub fn embeddings(&self) -> &HashMap<String, EmbeddingSpace> {
710        &self.embeddings
711    }
712
713    /// The default/primary embedding vector (in the "semantic" space).
714    pub fn primary_embedding(&self) -> Option<&[f32]> {
715        self.embeddings.get("semantic").map(|e| e.vector.as_slice())
716    }
717
718    /// The bitemporal coordinate.
719    pub fn temporal(&self) -> &BitemporalCoord {
720        &self.temporal
721    }
722
723    /// Set the bitemporal coordinate (e.g., to assign HLC system_time on write).
724    ///
725    /// Note: This does NOT change the OID. Temporal coordinates are metadata,
726    /// not part of the content-addressed identity.
727    pub fn set_temporal(&mut self, coord: BitemporalCoord) {
728        self.temporal = coord;
729    }
730
731    /// The derivation provenance.
732    pub fn provenance(&self) -> &Provenance {
733        &self.provenance
734    }
735
736    /// The namespace (for multi-tenant isolation).
737    pub fn namespace(&self) -> Option<&str> {
738        self.namespace.as_deref()
739    }
740
741    /// Tags for categorical filtering.
742    pub fn tags(&self) -> &[String] {
743        &self.tags
744    }
745
746    /// Check if this object has a specific tag.
747    pub fn has_tag(&self, tag: &str) -> bool {
748        self.tags.iter().any(|t| t == tag)
749    }
750
751    // =========================================================================
752    // Temporal Queries
753    // =========================================================================
754
755    /// Is this object valid at the given valid time?
756    pub fn valid_at(&self, valid_time: u64) -> bool {
757        self.temporal.valid_at(valid_time)
758    }
759
760    /// Was this object known to the system at the given system time?
761    pub fn known_at(&self, system_time: u64) -> bool {
762        self.temporal.known_at(system_time)
763    }
764
765    /// Combined bitemporal visibility check.
766    pub fn visible_at(&self, system_time: u64, valid_time: u64) -> bool {
767        self.temporal.visible_at(system_time, valid_time)
768    }
769
770    /// Is this the current version (valid_to == MAX)?
771    pub fn is_current(&self) -> bool {
772        self.temporal.is_current()
773    }
774
775    // =========================================================================
776    // Attribute Access
777    // =========================================================================
778
779    /// Get a named attribute from the payload (assumes payload is `SochValue::Object`).
780    pub fn attribute(&self, key: &str) -> Option<&SochValue> {
781        match &self.payload {
782            SochValue::Object(map) => map.get(key),
783            _ => None,
784        }
785    }
786
787    /// Get a text attribute.
788    pub fn text_attribute(&self, key: &str) -> Option<&str> {
789        self.attribute(key).and_then(|v| v.as_text())
790    }
791
792    /// Get an integer attribute.
793    pub fn int_attribute(&self, key: &str) -> Option<i64> {
794        self.attribute(key).and_then(|v| v.as_int())
795    }
796
797    // =========================================================================
798    // Content Addressing
799    // =========================================================================
800
801    /// Recompute the OID from the current content.
802    /// Must be called after any mutation to maintain the content-addressing invariant.
803    pub fn recompute_oid(&mut self) {
804        self.oid = Self::compute_oid(&self.kind, &self.payload, &self.edges, &self.embeddings);
805    }
806
807    /// Verify that the stored OID matches the current content.
808    pub fn verify_oid(&self) -> bool {
809        let computed = Self::compute_oid(&self.kind, &self.payload, &self.edges, &self.embeddings);
810        self.oid == computed
811    }
812
813    /// Compute the canonical OID for given content.
814    fn compute_oid(
815        kind: &ObjectKind,
816        payload: &SochValue,
817        edges: &[Edge],
818        embeddings: &HashMap<String, EmbeddingSpace>,
819    ) -> ObjectId {
820        let canonical = Self::canonical_bytes(kind, payload, edges, embeddings);
821        ObjectId::from_content(&canonical)
822    }
823
824    /// Produce canonical bytes for OID computation.
825    ///
826    /// Canonical serialization ensures deterministic hashing:
827    /// - HashMap keys are sorted lexicographically
828    /// - Floats are normalized (NaN → 0.0, -0.0 → 0.0)
829    /// - Using bincode for compact, deterministic binary encoding
830    fn canonical_bytes(
831        kind: &ObjectKind,
832        payload: &SochValue,
833        edges: &[Edge],
834        embeddings: &HashMap<String, EmbeddingSpace>,
835    ) -> Vec<u8> {
836        // We use a deterministic serialization approach:
837        // 1. Serialize kind label
838        // 2. Serialize payload via bincode
839        // 3. Serialize edges sorted by (target, kind)
840        // 4. Serialize embeddings sorted by space name
841        let mut hasher_input = Vec::with_capacity(1024);
842
843        // Kind label
844        let kind_bytes = kind.label().as_bytes();
845        hasher_input.extend_from_slice(&(kind_bytes.len() as u32).to_le_bytes());
846        hasher_input.extend_from_slice(kind_bytes);
847
848        // Payload — deterministic serialization of SochValue
849        // For Object(HashMap), sort keys before serializing
850        let payload_bytes = canonical_soch_value_bytes(payload);
851        hasher_input.extend_from_slice(&(payload_bytes.len() as u32).to_le_bytes());
852        hasher_input.extend_from_slice(&payload_bytes);
853
854        // Edges sorted by (target OID, kind label) for determinism
855        let mut sorted_edges: Vec<_> = edges.iter().collect();
856        sorted_edges.sort_by(|a, b| {
857            a.target
858                .as_bytes()
859                .cmp(b.target.as_bytes())
860                .then_with(|| a.kind.label().cmp(b.kind.label()))
861        });
862        hasher_input.extend_from_slice(&(sorted_edges.len() as u32).to_le_bytes());
863        for edge in &sorted_edges {
864            hasher_input.extend_from_slice(edge.target.as_bytes());
865            let kind_label = edge.kind.label().as_bytes();
866            hasher_input.extend_from_slice(&(kind_label.len() as u32).to_le_bytes());
867            hasher_input.extend_from_slice(kind_label);
868            hasher_input.extend_from_slice(&edge.weight.to_le_bytes());
869        }
870
871        // Embeddings sorted by space name for determinism
872        let mut sorted_spaces: Vec<_> = embeddings.iter().collect();
873        sorted_spaces.sort_by_key(|(name, _)| *name);
874        hasher_input.extend_from_slice(&(sorted_spaces.len() as u32).to_le_bytes());
875        for (name, embedding) in &sorted_spaces {
876            let name_bytes = name.as_bytes();
877            hasher_input.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
878            hasher_input.extend_from_slice(name_bytes);
879            hasher_input.extend_from_slice(&embedding.dimensions.to_le_bytes());
880            for &v in &embedding.vector {
881                hasher_input.extend_from_slice(&v.to_le_bytes());
882            }
883        }
884
885        hasher_input
886    }
887}
888
889/// Produce deterministic bytes for a SochValue.
890/// For `Object(HashMap)`, keys are sorted to ensure deterministic output.
891fn canonical_soch_value_bytes(value: &SochValue) -> Vec<u8> {
892    let mut buf = Vec::with_capacity(256);
893    write_canonical_soch_value(&mut buf, value);
894    buf
895}
896
897/// Recursively write a SochValue in canonical (deterministic) byte order.
898fn write_canonical_soch_value(buf: &mut Vec<u8>, value: &SochValue) {
899    match value {
900        SochValue::Null => buf.push(0),
901        SochValue::Bool(b) => {
902            buf.push(1);
903            buf.push(if *b { 1 } else { 0 });
904        }
905        SochValue::Int(i) => {
906            buf.push(2);
907            buf.extend_from_slice(&i.to_le_bytes());
908        }
909        SochValue::UInt(u) => {
910            buf.push(3);
911            buf.extend_from_slice(&u.to_le_bytes());
912        }
913        SochValue::Float(f) => {
914            buf.push(4);
915            // Normalize: NaN → 0.0, -0.0 → 0.0
916            let normalized = if f.is_nan() {
917                0.0
918            } else if *f == 0.0 {
919                0.0
920            } else {
921                *f
922            };
923            buf.extend_from_slice(&normalized.to_le_bytes());
924        }
925        SochValue::Text(s) => {
926            buf.push(5);
927            buf.extend_from_slice(&(s.len() as u32).to_le_bytes());
928            buf.extend_from_slice(s.as_bytes());
929        }
930        SochValue::Binary(b) => {
931            buf.push(6);
932            buf.extend_from_slice(&(b.len() as u32).to_le_bytes());
933            buf.extend_from_slice(b);
934        }
935        SochValue::Array(arr) => {
936            buf.push(7);
937            buf.extend_from_slice(&(arr.len() as u32).to_le_bytes());
938            for item in arr {
939                write_canonical_soch_value(buf, item);
940            }
941        }
942        SochValue::Object(map) => {
943            buf.push(8);
944            // Sort keys for deterministic ordering
945            let mut sorted_keys: Vec<&String> = map.keys().collect();
946            sorted_keys.sort();
947            buf.extend_from_slice(&(sorted_keys.len() as u32).to_le_bytes());
948            for key in sorted_keys {
949                buf.extend_from_slice(&(key.len() as u32).to_le_bytes());
950                buf.extend_from_slice(key.as_bytes());
951                write_canonical_soch_value(buf, &map[key]);
952            }
953        }
954        SochValue::Ref { table, id } => {
955            buf.push(9);
956            buf.extend_from_slice(&(table.len() as u32).to_le_bytes());
957            buf.extend_from_slice(table.as_bytes());
958            buf.extend_from_slice(&id.to_le_bytes());
959        }
960    }
961}
962
963impl KnowledgeObject {
964    // =========================================================================
965    // Serialization
966    // =========================================================================
967
968    /// Serialize this Knowledge Object to compact binary format.
969    /// Uses serde_json for reliable HashMap serialization.
970    pub fn to_bytes(&self) -> Result<Vec<u8>, KnowledgeObjectError> {
971        serde_json::to_vec(self)
972            .map_err(|e| KnowledgeObjectError::SerializationError(e.to_string()))
973    }
974
975    /// Deserialize a Knowledge Object from binary format.
976    pub fn from_bytes(bytes: &[u8]) -> Result<Self, KnowledgeObjectError> {
977        serde_json::from_slice(bytes)
978            .map_err(|e| KnowledgeObjectError::DeserializationError(e.to_string()))
979    }
980
981    /// Estimated memory footprint of this object (for memory budgeting).
982    pub fn estimated_size(&self) -> usize {
983        std::mem::size_of::<Self>()
984            + self.edges.len() * std::mem::size_of::<Edge>()
985            + self
986                .embeddings
987                .values()
988                .map(|e| e.vector.len() * 4)
989                .sum::<usize>()
990            + self.tags.iter().map(|t| t.len()).sum::<usize>()
991    }
992
993    // =========================================================================
994    // Compressed Serialization
995    // =========================================================================
996
997    /// Serialize with per-object compression.
998    ///
999    /// Wire format: `[tag: u8] [original_len: u32 LE] [payload...]`
1000    ///
1001    /// - Tag 0 (`None`): payload is raw JSON (original_len == payload.len()).
1002    /// - Tag 1 (`Lz4`): payload is LZ4-block-compressed JSON.
1003    /// - Tag 2 (`Zstd`): payload is ZSTD-compressed JSON.
1004    ///
1005    /// Falls back to uncompressed if compressed output >= original size.
1006    pub fn to_compressed_bytes(
1007        &self,
1008        mode: CompressionMode,
1009    ) -> Result<Vec<u8>, KnowledgeObjectError> {
1010        let raw = self.to_bytes()?;
1011        let original_len = raw.len() as u32;
1012
1013        match mode {
1014            CompressionMode::None => {
1015                let mut out = Vec::with_capacity(5 + raw.len());
1016                out.push(CompressionMode::None.tag());
1017                out.extend_from_slice(&original_len.to_le_bytes());
1018                out.extend_from_slice(&raw);
1019                Ok(out)
1020            }
1021            CompressionMode::Lz4 => {
1022                let compressed = lz4::block::compress(&raw, None, false)
1023                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?;
1024                // Fallback if compression doesn't save space
1025                if compressed.len() >= raw.len() {
1026                    let mut out = Vec::with_capacity(5 + raw.len());
1027                    out.push(CompressionMode::None.tag());
1028                    out.extend_from_slice(&original_len.to_le_bytes());
1029                    out.extend_from_slice(&raw);
1030                    return Ok(out);
1031                }
1032                let mut out = Vec::with_capacity(5 + compressed.len());
1033                out.push(CompressionMode::Lz4.tag());
1034                out.extend_from_slice(&original_len.to_le_bytes());
1035                out.extend_from_slice(&compressed);
1036                Ok(out)
1037            }
1038            CompressionMode::Zstd { level } => {
1039                let compressed = zstd::encode_all(raw.as_slice(), level)
1040                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?;
1041                if compressed.len() >= raw.len() {
1042                    let mut out = Vec::with_capacity(5 + raw.len());
1043                    out.push(CompressionMode::None.tag());
1044                    out.extend_from_slice(&original_len.to_le_bytes());
1045                    out.extend_from_slice(&raw);
1046                    return Ok(out);
1047                }
1048                let mut out = Vec::with_capacity(5 + compressed.len());
1049                out.push(CompressionMode::Zstd { level }.tag());
1050                out.extend_from_slice(&original_len.to_le_bytes());
1051                out.extend_from_slice(&compressed);
1052                Ok(out)
1053            }
1054        }
1055    }
1056
1057    /// Deserialize from compressed wire format (auto-detects compression).
1058    ///
1059    /// The 1-byte tag determines the decompression algorithm.
1060    pub fn from_compressed_bytes(bytes: &[u8]) -> Result<Self, KnowledgeObjectError> {
1061        if bytes.len() < 5 {
1062            return Err(KnowledgeObjectError::DeserializationError(
1063                "compressed payload too short (need >= 5 bytes)".into(),
1064            ));
1065        }
1066
1067        let tag = bytes[0];
1068        let original_len = u32::from_le_bytes([bytes[1], bytes[2], bytes[3], bytes[4]]) as usize;
1069        let payload = &bytes[5..];
1070
1071        let raw = match tag {
1072            0 => {
1073                // Uncompressed
1074                payload.to_vec()
1075            }
1076            1 => {
1077                // LZ4
1078                lz4::block::decompress(payload, Some(original_len as i32))
1079                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?
1080            }
1081            2 => {
1082                // ZSTD
1083                let mut decoder = zstd::Decoder::new(payload)
1084                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?;
1085                let mut raw = Vec::with_capacity(original_len);
1086                decoder
1087                    .read_to_end(&mut raw)
1088                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?;
1089                raw
1090            }
1091            _ => {
1092                return Err(KnowledgeObjectError::UnknownCompressionTag(tag));
1093            }
1094        };
1095
1096        Self::from_bytes(&raw)
1097    }
1098
1099    /// Returns the compression ratio for a given mode (compressed_size / original_size).
1100    /// Values < 1.0 indicate space savings.
1101    pub fn compression_ratio(&self, mode: CompressionMode) -> Result<f64, KnowledgeObjectError> {
1102        let raw_len = self.to_bytes()?.len() as f64;
1103        let compressed_len = self.to_compressed_bytes(mode)?.len() as f64;
1104        Ok(compressed_len / raw_len)
1105    }
1106}
1107
1108// =============================================================================
1109// Compression Mode
1110// =============================================================================
1111
/// Per-object compression strategy.
///
/// Each [`KnowledgeObject`] can be independently compressed with a different
/// algorithm and level. The choice depends on the object's characteristics:
///
/// - **LZ4**: ~3 GB/s decode, low CPU. Best for hot/frequently-accessed objects.
/// - **ZSTD**: ~1 GB/s decode, better ratios. Best for cold/archival objects.
/// - **None**: Zero overhead. Use for tiny objects where compression adds bytes.
///
/// The default mode is [`CompressionMode::None`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum CompressionMode {
    /// No compression — raw JSON bytes.
    #[default]
    None,
    /// LZ4 block compression — fast decode, moderate ratio.
    Lz4,
    /// ZSTD compression with configurable level (1–22, default 3).
    Zstd { level: i32 },
}

impl CompressionMode {
    /// 1-byte tag written to the wire format header.
    ///
    /// `None` is 0, `Lz4` is 1, and `Zstd` is 2 regardless of its level.
    pub fn tag(&self) -> u8 {
        match self {
            Self::None => 0,
            Self::Lz4 => 1,
            Self::Zstd { .. } => 2,
        }
    }

    /// Construct from wire tag (decompression side doesn't need level).
    ///
    /// Returns `None` (the `Option` variant) for any tag other than 0, 1, or 2.
    /// A `Zstd` result always carries `level: 0`, because the level is not
    /// recorded in the tag.
    pub fn from_tag(tag: u8) -> Option<Self> {
        match tag {
            0 => Some(Self::None),
            1 => Some(Self::Lz4),
            2 => Some(Self::Zstd { level: 0 }), // level unused for decompression
            _ => Option::None,
        }
    }

    /// Default ZSTD mode (level 3 — good balance of speed and ratio).
    pub fn zstd() -> Self {
        Self::Zstd { level: 3 }
    }

    /// High-compression ZSTD (level 9 — archival).
    pub fn zstd_high() -> Self {
        Self::Zstd { level: 9 }
    }
}
1166
1167impl PartialEq for KnowledgeObject {
1168    fn eq(&self, other: &Self) -> bool {
1169        // Content-addressed equality: same OID means same object.
1170        self.oid == other.oid
1171    }
1172}
1173
// Marker impl: the `PartialEq` impl above compares OIDs only, so equality here
// is exactly OID equality.
// NOTE(review): soundness relies on `ObjectId` equality being a full
// equivalence relation — confirm `ObjectId` is `Eq`.
impl Eq for KnowledgeObject {}
1175
1176impl std::hash::Hash for KnowledgeObject {
1177    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
1178        self.oid.hash(state);
1179    }
1180}
1181
1182impl fmt::Display for KnowledgeObject {
1183    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1184        write!(
1185            f,
1186            "KO({}, kind={}, edges={}, embeddings={}, tags={})",
1187            &self.oid.to_hex()[..12],
1188            self.kind,
1189            self.edges.len(),
1190            self.embeddings.len(),
1191            self.tags.len()
1192        )
1193    }
1194}
1195
1196// =============================================================================
1197// Builder
1198// =============================================================================
1199
/// Ergonomic builder for constructing Knowledge Objects.
///
/// The builder computes the content-addressed OID automatically on `.build()`.
/// Every setter consumes the builder and returns it, so calls can be chained.
///
/// # Example
///
/// ```rust,ignore
/// let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
///     .attribute("name", SochValue::Text("Alice".into()))
///     .embedding("semantic", vec![0.1, 0.2, 0.3])
///     .tag("person")
///     .valid_from(1700000000_000000)
///     .build();
/// ```
pub struct KnowledgeObjectBuilder {
    /// Object kind, fixed when the builder is created with `new`.
    kind: ObjectKind,
    /// Payload; starts as an empty `Object`, replaced by `payload()` or
    /// extended by `attribute()`.
    payload: SochValue,
    /// Outgoing edges in the order they were added.
    edges: Vec<Edge>,
    /// Embeddings keyed by space name; adding to an existing name overwrites it.
    embeddings: HashMap<String, EmbeddingSpace>,
    /// Bitemporal coordinate (`valid_from`, `valid_to`, `system_time`).
    temporal: BitemporalCoord,
    /// Provenance record; defaults to `Provenance::root("system", 0)`.
    provenance: Provenance,
    /// Optional namespace; `None` until `namespace()` is called.
    namespace: Option<String>,
    /// Tags in the order they were added (duplicates are kept).
    tags: Vec<String>,
}
1224
1225impl KnowledgeObjectBuilder {
1226    /// Create a new builder with the given object kind.
1227    pub fn new(kind: ObjectKind) -> Self {
1228        Self {
1229            kind,
1230            payload: SochValue::Object(HashMap::new()),
1231            edges: Vec::new(),
1232            embeddings: HashMap::new(),
1233            temporal: BitemporalCoord::default(),
1234            provenance: Provenance::root("system", 0),
1235            namespace: None,
1236            tags: Vec::new(),
1237        }
1238    }
1239
1240    /// Set the full payload.
1241    pub fn payload(mut self, payload: SochValue) -> Self {
1242        self.payload = payload;
1243        self
1244    }
1245
1246    /// Add a named attribute to the payload (creates/extends an Object payload).
1247    pub fn attribute(mut self, key: impl Into<String>, value: SochValue) -> Self {
1248        match &mut self.payload {
1249            SochValue::Object(map) => {
1250                map.insert(key.into(), value);
1251            }
1252            _ => {
1253                let mut map = HashMap::new();
1254                map.insert(key.into(), value);
1255                self.payload = SochValue::Object(map);
1256            }
1257        }
1258        self
1259    }
1260
1261    /// Add an outgoing edge.
1262    pub fn edge(mut self, edge: Edge) -> Self {
1263        self.edges.push(edge);
1264        self
1265    }
1266
1267    /// Add multiple edges at once.
1268    pub fn edges(mut self, edges: impl IntoIterator<Item = Edge>) -> Self {
1269        self.edges.extend(edges);
1270        self
1271    }
1272
1273    /// Add an embedding in a named space.
1274    pub fn embedding(mut self, space: impl Into<String>, vector: Vec<f32>) -> Self {
1275        let space_name = space.into();
1276        self.embeddings
1277            .insert(space_name, EmbeddingSpace::new(vector, "unknown", 0));
1278        self
1279    }
1280
1281    /// Add an embedding with full metadata.
1282    pub fn embedding_with_metadata(
1283        mut self,
1284        space: impl Into<String>,
1285        vector: Vec<f32>,
1286        model: impl Into<String>,
1287        generated_at: u64,
1288    ) -> Self {
1289        let space_name = space.into();
1290        self.embeddings
1291            .insert(space_name, EmbeddingSpace::new(vector, model, generated_at));
1292        self
1293    }
1294
1295    /// Set the valid_from time (HLC-encoded microseconds).
1296    pub fn valid_from(mut self, valid_from: u64) -> Self {
1297        self.temporal.valid_from = valid_from;
1298        self
1299    }
1300
1301    /// Set the valid_to time (HLC-encoded microseconds).
1302    pub fn valid_to(mut self, valid_to: u64) -> Self {
1303        self.temporal.valid_to = valid_to;
1304        self
1305    }
1306
1307    /// Set the system_time (typically assigned automatically by HLC on write).
1308    pub fn system_time(mut self, system_time: u64) -> Self {
1309        self.temporal.system_time = system_time;
1310        self
1311    }
1312
1313    /// Set the full bitemporal coordinate.
1314    pub fn temporal(mut self, temporal: BitemporalCoord) -> Self {
1315        self.temporal = temporal;
1316        self
1317    }
1318
1319    /// Set the provenance record.
1320    pub fn provenance(mut self, provenance: Provenance) -> Self {
1321        self.provenance = provenance;
1322        self
1323    }
1324
1325    /// Set the namespace.
1326    pub fn namespace(mut self, namespace: impl Into<String>) -> Self {
1327        self.namespace = Some(namespace.into());
1328        self
1329    }
1330
1331    /// Add a tag.
1332    pub fn tag(mut self, tag: impl Into<String>) -> Self {
1333        self.tags.push(tag.into());
1334        self
1335    }
1336
1337    /// Add multiple tags.
1338    pub fn tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
1339        self.tags.extend(tags.into_iter().map(|t| t.into()));
1340        self
1341    }
1342
1343    /// Build the Knowledge Object, computing the content-addressed OID.
1344    pub fn build(self) -> KnowledgeObject {
1345        let oid =
1346            KnowledgeObject::compute_oid(&self.kind, &self.payload, &self.edges, &self.embeddings);
1347
1348        KnowledgeObject {
1349            oid,
1350            kind: self.kind,
1351            payload: self.payload,
1352            edges: self.edges,
1353            embeddings: self.embeddings,
1354            temporal: self.temporal,
1355            provenance: self.provenance,
1356            namespace: self.namespace,
1357            tags: self.tags,
1358        }
1359    }
1360
1361    /// Build with a pre-computed OID (for deserialization or migration).
1362    pub fn build_with_oid(self, oid: ObjectId) -> KnowledgeObject {
1363        KnowledgeObject {
1364            oid,
1365            kind: self.kind,
1366            payload: self.payload,
1367            edges: self.edges,
1368            embeddings: self.embeddings,
1369            temporal: self.temporal,
1370            provenance: self.provenance,
1371            namespace: self.namespace,
1372            tags: self.tags,
1373        }
1374    }
1375}
1376
1377// =============================================================================
1378// Error Types
1379// =============================================================================
1380
/// Errors for Knowledge Object operations.
///
/// Variants that wrap a `String` carry the message of the underlying library
/// error rather than the error value itself, which keeps this type `Clone`.
#[derive(Debug, Clone, thiserror::Error)]
pub enum KnowledgeObjectError {
    /// JSON encoding failed (returned by `KnowledgeObject::to_bytes`).
    #[error("serialization error: {0}")]
    SerializationError(String),

    /// Decoding failed: either the JSON could not be parsed or the compressed
    /// input was shorter than its 5-byte header.
    #[error("deserialization error: {0}")]
    DeserializationError(String),

    /// A stored OID differed from the OID computed from content.
    /// NOTE(review): both fields are presumably hex strings — confirm at the raise site.
    #[error("OID verification failed: stored={stored}, computed={computed}")]
    OidMismatch { stored: String, computed: String },

    /// An operation needed an embedding space the object does not carry; the
    /// payload is the space name.
    #[error("missing required embedding space: {0}")]
    MissingEmbedding(String),

    /// An embedding in `space` had `got` dimensions where `expected` were required.
    #[error("dimension mismatch in space '{space}': expected {expected}, got {got}")]
    DimensionMismatch {
        space: String,
        expected: u32,
        got: u32,
    },

    /// The validity interval is inverted (`valid_from` is later than `valid_to`).
    #[error("invalid temporal coordinates: valid_from ({valid_from}) > valid_to ({valid_to})")]
    InvalidTemporalRange { valid_from: u64, valid_to: u64 },

    /// LZ4 or ZSTD compression or decompression failed.
    #[error("compression error: {0}")]
    CompressionError(String),

    /// The first byte of a compressed frame was not a known compression tag.
    #[error("unknown compression tag: {0}")]
    UnknownCompressionTag(u8),
}
1412
1413// =============================================================================
1414// Conversion from TOON/SochValue
1415// =============================================================================
1416
1417impl From<SochValue> for KnowledgeObjectBuilder {
1418    /// Convert a SochValue into a KnowledgeObject builder.
1419    /// The SochValue becomes the payload; kind defaults to `Document`.
1420    fn from(value: SochValue) -> Self {
1421        KnowledgeObjectBuilder::new(ObjectKind::Document).payload(value)
1422    }
1423}
1424
1425// =============================================================================
1426// Tests
1427// =============================================================================
1428
1429#[cfg(test)]
1430mod tests {
1431    use super::*;
1432
1433    #[test]
1434    fn test_content_addressing_determinism() {
1435        let ko1 = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1436            .attribute("name", SochValue::Text("Alice".into()))
1437            .attribute("age", SochValue::Int(30))
1438            .build();
1439
1440        let ko2 = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1441            .attribute("age", SochValue::Int(30))
1442            .attribute("name", SochValue::Text("Alice".into()))
1443            .build();
1444
1445        // Different insertion order, same content → same OID
1446        assert_eq!(ko1.oid(), ko2.oid());
1447    }
1448
1449    #[test]
1450    fn test_different_content_different_oid() {
1451        let ko1 = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1452            .attribute("name", SochValue::Text("Alice".into()))
1453            .build();
1454
1455        let ko2 = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1456            .attribute("name", SochValue::Text("Bob".into()))
1457            .build();
1458
1459        assert_ne!(ko1.oid(), ko2.oid());
1460    }
1461
1462    #[test]
1463    fn test_oid_verification() {
1464        let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
1465            .attribute("content", SochValue::Text("Hello, world!".into()))
1466            .build();
1467
1468        assert!(ko.verify_oid());
1469    }
1470
1471    #[test]
1472    fn test_bitemporal_queries() {
1473        let ko = KnowledgeObjectBuilder::new(ObjectKind::Event)
1474            .valid_from(100)
1475            .valid_to(200)
1476            .system_time(50)
1477            .build();
1478
1479        assert!(ko.valid_at(150));
1480        assert!(!ko.valid_at(250));
1481        assert!(ko.known_at(50));
1482        assert!(ko.known_at(100));
1483        assert!(!ko.known_at(40));
1484
1485        // Combined: visible at system_time=60, valid_time=150 → true
1486        assert!(ko.visible_at(60, 150));
1487        // Combined: visible at system_time=40, valid_time=150 → false (not yet recorded)
1488        assert!(!ko.visible_at(40, 150));
1489    }
1490
1491    #[test]
1492    fn test_embedded_edges() {
1493        let target_oid = ObjectId::from_content(b"target_object");
1494
1495        let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1496            .attribute("name", SochValue::Text("Alice".into()))
1497            .edge(Edge::new(target_oid, EdgeKind::typed("works_at"), 1.0))
1498            .edge(Edge::new(target_oid, EdgeKind::Contains, 0.5))
1499            .build();
1500
1501        assert_eq!(ko.edges().len(), 2);
1502        assert_eq!(ko.edges_of_kind(&EdgeKind::typed("works_at")).len(), 1);
1503        assert_eq!(ko.edges_of_kind(&EdgeKind::Contains).len(), 1);
1504    }
1505
1506    #[test]
1507    fn test_multi_space_embeddings() {
1508        let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
1509            .embedding("semantic", vec![0.1, 0.2, 0.3])
1510            .embedding("code", vec![0.4, 0.5, 0.6, 0.7])
1511            .build();
1512
1513        assert!(ko.embedding("semantic").is_some());
1514        assert!(ko.embedding("code").is_some());
1515        assert!(ko.embedding("nonexistent").is_none());
1516        assert_eq!(ko.embedding("semantic").unwrap().dimensions, 3);
1517        assert_eq!(ko.embedding("code").unwrap().dimensions, 4);
1518    }
1519
1520    #[test]
1521    fn test_provenance_chain() {
1522        let parent_oid = ObjectId::from_content(b"parent_document");
1523
1524        let ko = KnowledgeObjectBuilder::new(ObjectKind::Fact)
1525            .attribute("claim", SochValue::Text("X is true".into()))
1526            .provenance(Provenance::derived(
1527                vec![parent_oid],
1528                "extract_facts",
1529                "gpt-4",
1530                1700000000,
1531            ))
1532            .build();
1533
1534        assert!(!ko.provenance().is_root());
1535        assert_eq!(ko.provenance().parents.len(), 1);
1536        assert_eq!(ko.provenance().parents[0], parent_oid);
1537        assert_eq!(ko.provenance().operation, "extract_facts");
1538    }
1539
1540    #[test]
1541    fn test_serialization_roundtrip() {
1542        let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1543            .attribute("name", SochValue::Text("Alice".into()))
1544            .embedding("semantic", vec![0.1, 0.2, 0.3])
1545            .tag("person")
1546            .namespace("test")
1547            .build();
1548
1549        let bytes = ko.to_bytes().unwrap();
1550        let restored = KnowledgeObject::from_bytes(&bytes).unwrap();
1551
1552        assert_eq!(ko.oid(), restored.oid());
1553        assert_eq!(ko.kind(), restored.kind());
1554        assert_eq!(ko.tags(), restored.tags());
1555        assert_eq!(ko.namespace(), restored.namespace());
1556    }
1557
1558    #[test]
1559    fn test_object_id_hex_roundtrip() {
1560        let oid = ObjectId::from_content(b"test content");
1561        let hex = oid.to_hex();
1562        let parsed = ObjectId::from_hex(&hex).unwrap();
1563        assert_eq!(oid, parsed);
1564    }
1565
1566    #[test]
1567    fn test_nil_oid() {
1568        assert!(ObjectId::NIL.is_nil());
1569        let non_nil = ObjectId::from_content(b"something");
1570        assert!(!non_nil.is_nil());
1571    }
1572
1573    #[test]
1574    fn test_edge_temporal_filtering() {
1575        let target = ObjectId::from_content(b"target");
1576
1577        let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1578            .edge(Edge::with_validity(
1579                target,
1580                EdgeKind::typed("works_at"),
1581                1.0,
1582                100,
1583                200,
1584            ))
1585            .edge(Edge::with_validity(
1586                target,
1587                EdgeKind::typed("manages"),
1588                0.8,
1589                150,
1590                u64::MAX,
1591            ))
1592            .build();
1593
1594        // At time 120: only "works_at" is valid
1595        let active = ko.edges_valid_at(120);
1596        assert_eq!(active.len(), 1);
1597        assert_eq!(active[0].kind, EdgeKind::typed("works_at"));
1598
1599        // At time 160: both are valid
1600        assert_eq!(ko.edges_valid_at(160).len(), 2);
1601
1602        // At time 250: only "manages" (still current)
1603        let active = ko.edges_valid_at(250);
1604        assert_eq!(active.len(), 1);
1605        assert_eq!(active[0].kind, EdgeKind::typed("manages"));
1606    }
1607
1608    #[test]
1609    fn test_estimated_size() {
1610        let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
1611            .embedding("semantic", vec![0.0; 384])
1612            .tag("test")
1613            .build();
1614
1615        let size = ko.estimated_size();
1616        assert!(size > 384 * 4); // At least the embedding vector
1617    }
1618
1619    #[test]
1620    fn test_display() {
1621        let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1622            .attribute("name", SochValue::Text("Alice".into()))
1623            .build();
1624
1625        let display = format!("{}", ko);
1626        assert!(display.starts_with("KO("));
1627        assert!(display.contains("kind=entity"));
1628    }
1629
1630    // =====================================================================
1631    // Compression tests
1632    // =====================================================================
1633
1634    #[test]
1635    fn test_compression_none_roundtrip() {
1636        let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1637            .attribute("name", SochValue::Text("Alice".into()))
1638            .embedding("semantic", vec![0.1; 128])
1639            .tag("person")
1640            .build();
1641
1642        let compressed = ko.to_compressed_bytes(CompressionMode::None).unwrap();
1643        assert_eq!(compressed[0], 0); // tag = None
1644        let restored = KnowledgeObject::from_compressed_bytes(&compressed).unwrap();
1645        assert_eq!(ko.oid(), restored.oid());
1646    }
1647
1648    #[test]
1649    fn test_compression_lz4_roundtrip() {
1650        let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
1651            .attribute("content", SochValue::Text("hello world ".repeat(100)))
1652            .embedding("semantic", vec![0.5; 384])
1653            .build();
1654
1655        let compressed = ko.to_compressed_bytes(CompressionMode::Lz4).unwrap();
1656        let raw = ko.to_bytes().unwrap();
1657
1658        // LZ4 should compress repetitive content
1659        assert!(
1660            compressed.len() < raw.len(),
1661            "LZ4 should reduce size for repetitive data"
1662        );
1663        assert_eq!(compressed[0], 1); // tag = Lz4
1664
1665        let restored = KnowledgeObject::from_compressed_bytes(&compressed).unwrap();
1666        assert_eq!(ko.oid(), restored.oid());
1667        assert_eq!(ko.tags(), restored.tags());
1668    }
1669
1670    #[test]
1671    fn test_compression_zstd_roundtrip() {
1672        let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
1673            .attribute("content", SochValue::Text("hello world ".repeat(100)))
1674            .embedding("semantic", vec![0.5; 384])
1675            .tag("document")
1676            .namespace("test-ns")
1677            .build();
1678
1679        let compressed = ko.to_compressed_bytes(CompressionMode::zstd()).unwrap();
1680        let raw = ko.to_bytes().unwrap();
1681
1682        assert!(compressed.len() < raw.len(), "ZSTD should reduce size");
1683        assert_eq!(compressed[0], 2); // tag = Zstd
1684
1685        let restored = KnowledgeObject::from_compressed_bytes(&compressed).unwrap();
1686        assert_eq!(ko.oid(), restored.oid());
1687        assert_eq!(ko.namespace(), restored.namespace());
1688    }
1689
1690    #[test]
1691    fn test_compression_fallback_on_tiny_object() {
1692        // A tiny object where compression might increase size
1693        let ko = KnowledgeObjectBuilder::new(ObjectKind::Fact)
1694            .attribute("x", SochValue::Int(1))
1695            .build();
1696
1697        let compressed_lz4 = ko.to_compressed_bytes(CompressionMode::Lz4).unwrap();
1698        let compressed_zstd = ko.to_compressed_bytes(CompressionMode::zstd()).unwrap();
1699
1700        // Should still roundtrip regardless (falls back to None if compressed >= raw)
1701        let r1 = KnowledgeObject::from_compressed_bytes(&compressed_lz4).unwrap();
1702        let r2 = KnowledgeObject::from_compressed_bytes(&compressed_zstd).unwrap();
1703        assert_eq!(ko.oid(), r1.oid());
1704        assert_eq!(ko.oid(), r2.oid());
1705    }
1706
1707    #[test]
1708    fn test_compression_ratio() {
1709        let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
1710            .attribute("data", SochValue::Text("abcdefgh".repeat(500)))
1711            .build();
1712
1713        let ratio = ko.compression_ratio(CompressionMode::Lz4).unwrap();
1714        assert!(
1715            ratio < 1.0,
1716            "LZ4 should achieve < 1.0 ratio on repetitive data"
1717        );
1718
1719        let ratio_zstd = ko.compression_ratio(CompressionMode::zstd()).unwrap();
1720        assert!(
1721            ratio_zstd < ratio,
1722            "ZSTD should beat LZ4 ratio at default level"
1723        );
1724    }
1725
1726    #[test]
1727    fn test_compression_mode_tag_roundtrip() {
1728        for mode in [
1729            CompressionMode::None,
1730            CompressionMode::Lz4,
1731            CompressionMode::zstd(),
1732        ] {
1733            let tag = mode.tag();
1734            let recovered = CompressionMode::from_tag(tag).unwrap();
1735            assert_eq!(mode.tag(), recovered.tag());
1736        }
1737        assert!(CompressionMode::from_tag(255).is_none());
1738    }
1739
1740    #[test]
1741    fn test_compressed_bytes_too_short() {
1742        let result = KnowledgeObject::from_compressed_bytes(&[0, 1, 2]);
1743        assert!(result.is_err());
1744    }
1745
1746    #[test]
1747    fn test_unknown_compression_tag() {
1748        let bad_bytes = vec![99, 0, 0, 0, 0]; // tag=99, len=0
1749        let result = KnowledgeObject::from_compressed_bytes(&bad_bytes);
1750        assert!(result.is_err());
1751    }
1752}