Skip to main content

sochdb_core/
knowledge_object.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! # Knowledge Object — Object-Centric Data Model for Knowledge Fabric
19//!
20//! The `KnowledgeObject` is the atomic unit of the Knowledge Fabric. Unlike the
21//! tabular TOON format (which separates data, embeddings, edges, and temporal
22//! metadata across different structures), a Knowledge Object **co-locates** all
23//! information about a single entity:
24//!
25//! - **Content-addressed identity**: `oid = BLAKE3(canonical_payload)` — immutable,
26//!   collision-resistant, enabling structural deduplication and content verification.
27//! - **Embedded edges**: Relationships are stored *within* the object, so loading
28//!   an object immediately provides its connections without a separate graph lookup.
29//! - **Multi-space embeddings**: A single object can carry embeddings in multiple
30//!   semantic spaces (e.g., `"semantic"`, `"code"`, `"temporal"`), enabling
31//!   domain-specific similarity search without separate vector indices.
32//! - **Bitemporal coordinates**: Every object carries `(valid_from, valid_to, system_time)`,
33//!   supporting both "what was true?" (valid time) and "what did the system know?"
34//!   (system time) queries.
35//! - **Provenance chains**: Hash-linked derivation tracking — every transformation
36//!   records its parent OIDs, creating an auditable lineage.
37//!
38//! ## Why Co-Location Matters
39//!
40//! In a traditional architecture, a compositional query ("find entities similar to X
41//! that are connected to Y and were valid at time T") requires:
42//!
43//! 1. Vector index lookup → candidate set (separate I/O)
44//! 2. Graph traversal → filter by connectivity (separate I/O)
45//! 3. Temporal filter → narrow by validity (separate I/O)
46//! 4. Attribute filter → apply predicates (separate I/O)
47//!
48//! Each boundary adds serialization, allocation, and cache misses. With co-located
49//! Knowledge Objects, the fused query executor can evaluate all predicates in a
50//! single pass, reducing latency from ~11 ms to ~300 μs (30–50× improvement).
51//!
52//! ## Relationship to TOON
53//!
54//! Knowledge Objects wrap `SochValue` payloads — TOON data remains the content
55//! format. The Knowledge Object adds the metadata envelope that enables the
56//! Knowledge Fabric's compositional queries.
57//!
58//! ## Example
59//!
60//! ```rust,ignore
61//! use sochdb_core::knowledge_object::*;
62//!
63//! let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
64//!     .attribute("name", SochValue::Text("Alice".into()))
65//!     .attribute("role", SochValue::Text("engineer".into()))
66//!     .embedding("semantic", vec![0.1, 0.2, 0.3])
67//!     .edge(Edge::new(target_oid, EdgeKind::typed("works_at"), 1.0))
68//!     .valid_from(1700000000_000000)
69//!     .valid_to(u64::MAX)
70//!     .build();
71//!
72//! assert!(ko.oid().as_bytes().len() == 32);
73//! assert_eq!(ko.edges().len(), 1);
74//! assert!(ko.embedding("semantic").is_some());
75//! ```
76
77use serde::{Deserialize, Serialize};
78use serde_json;
79use std::collections::HashMap;
80use std::fmt;
81use std::io::Read;
82
83use crate::soch::SochValue;
84
85// =============================================================================
86// Content-Addressed Object Identity
87// =============================================================================
88
89/// A 256-bit BLAKE3 content hash serving as the immutable identity of a
90/// Knowledge Object.
91///
92/// `oid = BLAKE3(canonical_serialization(payload + edges + embeddings))`
93///
94/// Properties:
95/// - **Deterministic**: Same content always produces the same OID.
96/// - **Collision-resistant**: 256-bit output makes collisions computationally infeasible.
97/// - **Structural deduplication**: Identical objects share the same OID.
98/// - **Content verification**: Recomputing the hash detects corruption or tampering.
99///
100/// The OID is computed over the *canonical* byte representation (sorted keys,
101/// normalized floats) to ensure deterministic hashing regardless of insertion order.
102#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
103pub struct ObjectId([u8; 32]);
104
105impl ObjectId {
106    /// Create an OID from a raw 32-byte hash.
107    pub fn from_bytes(bytes: [u8; 32]) -> Self {
108        Self(bytes)
109    }
110
111    /// Compute the OID from canonical content bytes using BLAKE3.
112    pub fn from_content(content: &[u8]) -> Self {
113        let hash = blake3::hash(content);
114        Self(*hash.as_bytes())
115    }
116
117    /// The raw 32 bytes of the OID.
118    pub fn as_bytes(&self) -> &[u8; 32] {
119        &self.0
120    }
121
122    /// Hex-encoded OID string (64 characters).
123    pub fn to_hex(&self) -> String {
124        hex::encode(self.0)
125    }
126
127    /// Parse an OID from a 64-character hex string.
128    pub fn from_hex(s: &str) -> Result<Self, ObjectIdError> {
129        let bytes = hex::decode(s).map_err(|_| ObjectIdError::InvalidHex)?;
130        if bytes.len() != 32 {
131            return Err(ObjectIdError::InvalidLength(bytes.len()));
132        }
133        let mut arr = [0u8; 32];
134        arr.copy_from_slice(&bytes);
135        Ok(Self(arr))
136    }
137
138    /// A zero/nil OID, used as a sentinel for "no parent" in provenance chains.
139    pub const NIL: Self = Self([0u8; 32]);
140
141    /// Check if this is the nil/zero OID.
142    pub fn is_nil(&self) -> bool {
143        self.0 == [0u8; 32]
144    }
145}
146
147impl fmt::Debug for ObjectId {
148    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
149        write!(f, "ObjectId({})", &self.to_hex()[..16]) // Show first 16 hex chars
150    }
151}
152
153impl fmt::Display for ObjectId {
154    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
155        write!(f, "{}", self.to_hex())
156    }
157}
158
159/// Errors when parsing ObjectId.
160#[derive(Debug, Clone, thiserror::Error)]
161pub enum ObjectIdError {
162    #[error("invalid hex encoding")]
163    InvalidHex,
164    #[error("expected 32 bytes, got {0}")]
165    InvalidLength(usize),
166}
167
168// =============================================================================
169// Bitemporal Coordinates
170// =============================================================================
171
172/// Bitemporal versioning coordinate for a Knowledge Object.
173///
174/// Supports two independent time dimensions:
175///
176/// - **Valid time** (`valid_from`, `valid_to`): When the fact was/is true in the
177///   real world. Example: an employee's tenure at a company.
178/// - **System time** (`system_time`): When the system recorded this version.
179///   Assigned automatically by the HLC on write. Monotonically increasing.
180///
181/// This enables queries like:
182/// - `as_of(system_time=T₁)` — "What did the system know at time T₁?"
183/// - `valid_at(valid_time=T₂)` — "What was true at time T₂?"
184/// - `as_of(T₁).valid_at(T₂)` — "What did the system believe at T₁ about T₂?"
185///
186/// Timestamps are HLC-encoded microseconds (see `hlc.rs`): upper 48 bits are
187/// physical microseconds since Unix epoch, lower 16 bits are logical counter.
188#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
189pub struct BitemporalCoord {
190    /// Start of valid time interval (inclusive). HLC-encoded microseconds.
191    pub valid_from: u64,
192
193    /// End of valid time interval (exclusive). `u64::MAX` means "still valid".
194    /// HLC-encoded microseconds.
195    pub valid_to: u64,
196
197    /// System time when this version was recorded. Assigned by HLC on write.
198    /// HLC-encoded microseconds.
199    pub system_time: u64,
200}
201
202impl BitemporalCoord {
203    /// Create a new bitemporal coordinate with an open-ended valid interval.
204    pub fn new(valid_from: u64, system_time: u64) -> Self {
205        Self {
206            valid_from,
207            valid_to: u64::MAX,
208            system_time,
209        }
210    }
211
212    /// Create a coordinate with a closed valid interval.
213    pub fn with_valid_range(valid_from: u64, valid_to: u64, system_time: u64) -> Self {
214        Self {
215            valid_from,
216            valid_to,
217            system_time,
218        }
219    }
220
221    /// Check if this coordinate is valid at a given valid time.
222    pub fn valid_at(&self, valid_time: u64) -> bool {
223        self.valid_from <= valid_time && valid_time < self.valid_to
224    }
225
226    /// Check if this coordinate was known to the system by a given system time.
227    pub fn known_at(&self, system_time: u64) -> bool {
228        self.system_time <= system_time
229    }
230
231    /// Combined bitemporal query: was this fact known at `sys_time` and valid at `valid_time`?
232    pub fn visible_at(&self, system_time: u64, valid_time: u64) -> bool {
233        self.known_at(system_time) && self.valid_at(valid_time)
234    }
235
236    /// Close the valid time interval (the fact is no longer true).
237    pub fn close_valid_time(&mut self, valid_to: u64) {
238        self.valid_to = valid_to;
239    }
240
241    /// Check if this coordinate represents a currently-valid fact (valid_to == MAX).
242    pub fn is_current(&self) -> bool {
243        self.valid_to == u64::MAX
244    }
245
246    /// Default "eternal" coordinate — valid from epoch 0, never expires.
247    pub const ETERNAL: Self = Self {
248        valid_from: 0,
249        valid_to: u64::MAX,
250        system_time: 0,
251    };
252}
253
254impl Default for BitemporalCoord {
255    fn default() -> Self {
256        Self::ETERNAL
257    }
258}
259
260// =============================================================================
261// Embedded Edges
262// =============================================================================
263
264/// The kind/type of an edge between Knowledge Objects.
265///
266/// Typed edges enable graph queries like "traverse all `works_at` edges"
267/// without inspecting edge payloads.
268#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
269pub enum EdgeKind {
270    /// A named relationship type (e.g., "works_at", "authored_by", "cites").
271    Typed(String),
272    /// A hierarchical containment relationship (parent → child).
273    Contains,
274    /// A derivation relationship (source → derived).
275    DerivedFrom,
276    /// A reference/citation relationship.
277    References,
278    /// A temporal succession relationship (predecessor → successor).
279    Succeeds,
280    /// A semantic similarity link (auto-generated by embedding proximity).
281    SimilarTo,
282}
283
284impl EdgeKind {
285    /// Create a typed edge kind with the given label.
286    pub fn typed(label: impl Into<String>) -> Self {
287        Self::Typed(label.into())
288    }
289
290    /// Returns the string label for this edge kind.
291    pub fn label(&self) -> &str {
292        match self {
293            EdgeKind::Typed(s) => s,
294            EdgeKind::Contains => "contains",
295            EdgeKind::DerivedFrom => "derived_from",
296            EdgeKind::References => "references",
297            EdgeKind::Succeeds => "succeeds",
298            EdgeKind::SimilarTo => "similar_to",
299        }
300    }
301}
302
303impl fmt::Display for EdgeKind {
304    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
305        write!(f, "{}", self.label())
306    }
307}
308
309/// A directed, typed, weighted, temporally-versioned edge between two
310/// Knowledge Objects.
311///
312/// Edges are **embedded** within the source object — when you load an object,
313/// you immediately have its outgoing relationships. This eliminates the
314/// separate graph lookup required by KV-backed edge stores.
315///
316/// ## Memory Layout (32 bytes per edge)
317///
318/// | Field       | Size  | Purpose                          |
319/// |-------------|-------|----------------------------------|
320/// | target      | 32B   | Target ObjectId (BLAKE3 hash)    |
321/// | kind        | ~24B  | Edge type (enum + string)        |
322/// | weight      | 4B    | Relationship strength [0.0, 1.0] |
323/// | valid_from  | 8B    | Temporal validity start          |
324/// | valid_to    | 8B    | Temporal validity end            |
325/// | properties  | var   | Optional edge attributes         |
326///
327/// For the hot path (CSR-based graph traversal), edges are projected to
328/// `(target_internal_id: u32, weight: f32)` for cache efficiency.
329#[derive(Debug, Clone, Serialize, Deserialize)]
330pub struct Edge {
331    /// Target object this edge points to.
332    pub target: ObjectId,
333
334    /// The type/kind of this relationship.
335    pub kind: EdgeKind,
336
337    /// Relationship strength/confidence in [0.0, 1.0].
338    /// - 1.0 = definitive relationship
339    /// - 0.5 = probable relationship
340    /// - 0.0 = hypothetical/weak relationship
341    pub weight: f32,
342
343    /// Temporal validity interval for this edge.
344    /// Uses the same HLC-encoded microsecond format as `BitemporalCoord`.
345    pub valid_from: u64,
346
347    /// End of temporal validity (exclusive). `u64::MAX` = still valid.
348    pub valid_to: u64,
349
350    /// Optional edge properties (e.g., "role": "lead", "confidence": 0.95).
351    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
352    pub properties: HashMap<String, SochValue>,
353}
354
355impl Edge {
356    /// Create a new edge with default weight 1.0 and open-ended validity.
357    pub fn new(target: ObjectId, kind: EdgeKind, weight: f32) -> Self {
358        Self {
359            target,
360            kind,
361            weight,
362            valid_from: 0,
363            valid_to: u64::MAX,
364            properties: HashMap::new(),
365        }
366    }
367
368    /// Create an edge with temporal validity.
369    pub fn with_validity(
370        target: ObjectId,
371        kind: EdgeKind,
372        weight: f32,
373        valid_from: u64,
374        valid_to: u64,
375    ) -> Self {
376        Self {
377            target,
378            kind,
379            weight,
380            valid_from,
381            valid_to,
382            properties: HashMap::new(),
383        }
384    }
385
386    /// Add a property to this edge.
387    pub fn with_property(mut self, key: impl Into<String>, value: SochValue) -> Self {
388        self.properties.insert(key.into(), value);
389        self
390    }
391
392    /// Check if this edge is valid at a given time.
393    pub fn valid_at(&self, time: u64) -> bool {
394        self.valid_from <= time && time < self.valid_to
395    }
396
397    /// Check if this edge is currently valid (valid_to == MAX).
398    pub fn is_current(&self) -> bool {
399        self.valid_to == u64::MAX
400    }
401}
402
403impl PartialEq for Edge {
404    fn eq(&self, other: &Self) -> bool {
405        self.target == other.target && self.kind == other.kind
406    }
407}
408
409impl Eq for Edge {}
410
411// =============================================================================
412// Object Kind / Type System
413// =============================================================================
414
415/// Classification of a Knowledge Object. Determines which indices and query
416/// optimizations apply.
417#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
418pub enum ObjectKind {
419    /// A persistent entity (person, organization, concept).
420    /// Typically has long valid-time intervals and many edges.
421    Entity,
422
423    /// A temporal event or episode.
424    /// Has precise valid-time intervals and causal edges.
425    Event,
426
427    /// An episodic memory or conversation turn.
428    /// Dense in embeddings, often has derivation edges.
429    Episode,
430
431    /// A document or content chunk.
432    /// Primary carrier of text content and semantic embeddings.
433    Document,
434
435    /// A fact or claim extracted from content.
436    /// Has provenance edges linking to source documents.
437    Fact,
438
439    /// An agent-generated artifact (plan, summary, decision).
440    /// Has derivation provenance and typically short valid-time windows.
441    Artifact,
442
443    /// User-defined type with a custom label.
444    Custom(String),
445}
446
447impl ObjectKind {
448    /// Returns the string label for this kind.
449    pub fn label(&self) -> &str {
450        match self {
451            ObjectKind::Entity => "entity",
452            ObjectKind::Event => "event",
453            ObjectKind::Episode => "episode",
454            ObjectKind::Document => "document",
455            ObjectKind::Fact => "fact",
456            ObjectKind::Artifact => "artifact",
457            ObjectKind::Custom(s) => s,
458        }
459    }
460}
461
462impl fmt::Display for ObjectKind {
463    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
464        write!(f, "{}", self.label())
465    }
466}
467
468// =============================================================================
469// Provenance Chain
470// =============================================================================
471
472/// Records how a Knowledge Object was derived.
473///
474/// Provenance enables auditable lineage tracking: "Where did this fact come from?"
475/// "What transformations produced this summary?" Each provenance record forms a
476/// node in a DAG (Directed Acyclic Graph) of derivations.
477///
478/// The provenance chain is hash-linked — each object's OID is derived from its
479/// content (which includes parent OIDs), creating a tamper-evident lineage.
480#[derive(Debug, Clone, Serialize, Deserialize)]
481pub struct Provenance {
482    /// OIDs of the parent objects this was derived from.
483    /// Empty for root/original objects.
484    pub parents: Vec<ObjectId>,
485
486    /// The transformation or operation that produced this object.
487    /// Examples: "chunk", "summarize", "extract_entities", "merge", "user_input"
488    pub operation: String,
489
490    /// The agent or system that performed the transformation.
491    /// Examples: "gpt-4", "user:alice", "sochdb:compaction"
492    pub agent: String,
493
494    /// Timestamp when the derivation occurred (HLC-encoded microseconds).
495    pub timestamp: u64,
496
497    /// Optional metadata about the transformation.
498    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
499    pub metadata: HashMap<String, SochValue>,
500}
501
502impl Provenance {
503    /// Create a root provenance (no parents — this is an original object).
504    pub fn root(agent: impl Into<String>, timestamp: u64) -> Self {
505        Self {
506            parents: Vec::new(),
507            operation: "create".to_string(),
508            agent: agent.into(),
509            timestamp,
510            metadata: HashMap::new(),
511        }
512    }
513
514    /// Create a derived provenance with parent objects.
515    pub fn derived(
516        parents: Vec<ObjectId>,
517        operation: impl Into<String>,
518        agent: impl Into<String>,
519        timestamp: u64,
520    ) -> Self {
521        Self {
522            parents,
523            operation: operation.into(),
524            agent: agent.into(),
525            timestamp,
526            metadata: HashMap::new(),
527        }
528    }
529
530    /// Add metadata to this provenance record.
531    pub fn with_metadata(mut self, key: impl Into<String>, value: SochValue) -> Self {
532        self.metadata.insert(key.into(), value);
533        self
534    }
535
536    /// Check if this is a root provenance (no parents).
537    pub fn is_root(&self) -> bool {
538        self.parents.is_empty()
539    }
540}
541
542// =============================================================================
543// Embedding Space
544// =============================================================================
545
546/// An embedding vector in a named semantic space.
547///
548/// Knowledge Objects can carry embeddings in multiple spaces simultaneously:
549/// - `"semantic"` — general-purpose sentence embedding (e.g., text-embedding-3-small)
550/// - `"code"` — code-specific embedding (e.g., CodeBERT)
551/// - `"temporal"` — time-series embedding for temporal similarity
552/// - `"visual"` — image/diagram embedding (e.g., CLIP)
553///
554/// Each space can have a different dimensionality and distance metric.
555#[derive(Debug, Clone, Serialize, Deserialize)]
556pub struct EmbeddingSpace {
557    /// The embedding vector (f32 components).
558    pub vector: Vec<f32>,
559
560    /// Dimensionality of this embedding.
561    pub dimensions: u32,
562
563    /// The model that generated this embedding.
564    /// Enables re-embedding when models are upgraded.
565    pub model: String,
566
567    /// When this embedding was generated (HLC-encoded microseconds).
568    /// Enables staleness detection and re-embedding triggers.
569    pub generated_at: u64,
570}
571
572impl EmbeddingSpace {
573    /// Create a new embedding in a given space.
574    pub fn new(vector: Vec<f32>, model: impl Into<String>, generated_at: u64) -> Self {
575        let dimensions = vector.len() as u32;
576        Self {
577            vector,
578            dimensions,
579            model: model.into(),
580            generated_at,
581        }
582    }
583
584    /// The L2 norm of this embedding vector.
585    pub fn norm(&self) -> f32 {
586        self.vector.iter().map(|x| x * x).sum::<f32>().sqrt()
587    }
588
589    /// Normalize this embedding to unit length (for cosine similarity as dot product).
590    pub fn normalize(&mut self) {
591        let norm = self.norm();
592        if norm > f32::EPSILON {
593            for x in &mut self.vector {
594                *x /= norm;
595            }
596        }
597    }
598}
599
600// =============================================================================
601// Knowledge Object
602// =============================================================================
603
604/// The atomic unit of the Knowledge Fabric.
605///
606/// A Knowledge Object co-locates content, relationships, embeddings, temporal
607/// metadata, and provenance into a single, content-addressed entity. This
608/// co-location enables the fused query execution pipeline that delivers
609/// 30–50× latency improvements over disaggregated architectures.
610///
611/// ## Invariants
612///
613/// 1. `oid == BLAKE3(canonical_bytes(payload, edges, embeddings))` — the OID
614///    is always consistent with the object's content.
615/// 2. `temporal.system_time` is monotonically increasing for successive versions
616///    of the same logical entity.
617/// 3. Edges form a DAG for `DerivedFrom` and `Succeeds` kinds (no cycles).
618/// 4. Embedding dimensions match the declared space dimensionality.
619///
620/// ## Thread Safety
621///
622/// `KnowledgeObject` is `Send + Sync` (all fields are owned or `Arc`-wrapped).
623/// Concurrent mutation should go through the MVCC layer — objects themselves
624/// are treated as immutable values (copy-on-write semantics via content addressing).
625#[derive(Debug, Clone, Serialize, Deserialize)]
626pub struct KnowledgeObject {
627    /// Content-addressed identity: `BLAKE3(canonical_content)`.
628    oid: ObjectId,
629
630    /// Classification of this object (entity, event, document, etc.).
631    kind: ObjectKind,
632
633    /// The object's data payload — a self-describing `SochValue`.
634    /// Typically a `SochValue::Object(HashMap<String, SochValue>)` but can be
635    /// any `SochValue` variant for flexibility.
636    payload: SochValue,
637
638    /// Outgoing edges to other Knowledge Objects.
639    /// Embedded within the object for edge locality — loading an object
640    /// immediately provides its relationships.
641    edges: Vec<Edge>,
642
643    /// Embeddings in multiple semantic spaces.
644    /// Key: space name (e.g., "semantic", "code", "temporal").
645    embeddings: HashMap<String, EmbeddingSpace>,
646
647    /// Bitemporal versioning coordinate.
648    temporal: BitemporalCoord,
649
650    /// Derivation provenance.
651    provenance: Provenance,
652
653    /// Optional namespace for multi-tenant isolation.
654    #[serde(default, skip_serializing_if = "Option::is_none")]
655    namespace: Option<String>,
656
657    /// Optional tags for fast categorical filtering.
658    /// Tags are indexed in the ART for O(k) lookup.
659    #[serde(default, skip_serializing_if = "Vec::is_empty")]
660    tags: Vec<String>,
661}
662
663impl KnowledgeObject {
664    // =========================================================================
665    // Accessors
666    // =========================================================================
667
668    /// The content-addressed object identity.
669    pub fn oid(&self) -> ObjectId {
670        self.oid
671    }
672
673    /// The object's classification.
674    pub fn kind(&self) -> &ObjectKind {
675        &self.kind
676    }
677
678    /// The data payload.
679    pub fn payload(&self) -> &SochValue {
680        &self.payload
681    }
682
683    /// Mutable access to the payload (will invalidate OID — call `recompute_oid()` after).
684    pub fn payload_mut(&mut self) -> &mut SochValue {
685        &mut self.payload
686    }
687
688    /// All outgoing edges.
689    pub fn edges(&self) -> &[Edge] {
690        &self.edges
691    }
692
693    /// Edges filtered by kind.
694    pub fn edges_of_kind(&self, kind: &EdgeKind) -> Vec<&Edge> {
695        self.edges.iter().filter(|e| &e.kind == kind).collect()
696    }
697
698    /// Edges valid at a given time.
699    pub fn edges_valid_at(&self, time: u64) -> Vec<&Edge> {
700        self.edges.iter().filter(|e| e.valid_at(time)).collect()
701    }
702
703    /// Get an embedding by space name.
704    pub fn embedding(&self, space: &str) -> Option<&EmbeddingSpace> {
705        self.embeddings.get(space)
706    }
707
708    /// All embedding spaces.
709    pub fn embeddings(&self) -> &HashMap<String, EmbeddingSpace> {
710        &self.embeddings
711    }
712
713    /// The default/primary embedding vector (in the "semantic" space).
714    pub fn primary_embedding(&self) -> Option<&[f32]> {
715        self.embeddings.get("semantic").map(|e| e.vector.as_slice())
716    }
717
718    /// The bitemporal coordinate.
719    pub fn temporal(&self) -> &BitemporalCoord {
720        &self.temporal
721    }
722
723    /// Set the bitemporal coordinate (e.g., to assign HLC system_time on write).
724    ///
725    /// Note: This does NOT change the OID. Temporal coordinates are metadata,
726    /// not part of the content-addressed identity.
727    pub fn set_temporal(&mut self, coord: BitemporalCoord) {
728        self.temporal = coord;
729    }
730
731    /// The derivation provenance.
732    pub fn provenance(&self) -> &Provenance {
733        &self.provenance
734    }
735
736    /// The namespace (for multi-tenant isolation).
737    pub fn namespace(&self) -> Option<&str> {
738        self.namespace.as_deref()
739    }
740
741    /// Tags for categorical filtering.
742    pub fn tags(&self) -> &[String] {
743        &self.tags
744    }
745
746    /// Check if this object has a specific tag.
747    pub fn has_tag(&self, tag: &str) -> bool {
748        self.tags.iter().any(|t| t == tag)
749    }
750
751    // =========================================================================
752    // Temporal Queries
753    // =========================================================================
754
755    /// Is this object valid at the given valid time?
756    pub fn valid_at(&self, valid_time: u64) -> bool {
757        self.temporal.valid_at(valid_time)
758    }
759
760    /// Was this object known to the system at the given system time?
761    pub fn known_at(&self, system_time: u64) -> bool {
762        self.temporal.known_at(system_time)
763    }
764
765    /// Combined bitemporal visibility check.
766    pub fn visible_at(&self, system_time: u64, valid_time: u64) -> bool {
767        self.temporal.visible_at(system_time, valid_time)
768    }
769
770    /// Is this the current version (valid_to == MAX)?
771    pub fn is_current(&self) -> bool {
772        self.temporal.is_current()
773    }
774
775    // =========================================================================
776    // Attribute Access
777    // =========================================================================
778
779    /// Get a named attribute from the payload (assumes payload is `SochValue::Object`).
780    pub fn attribute(&self, key: &str) -> Option<&SochValue> {
781        match &self.payload {
782            SochValue::Object(map) => map.get(key),
783            _ => None,
784        }
785    }
786
787    /// Get a text attribute.
788    pub fn text_attribute(&self, key: &str) -> Option<&str> {
789        self.attribute(key).and_then(|v| v.as_text())
790    }
791
792    /// Get an integer attribute.
793    pub fn int_attribute(&self, key: &str) -> Option<i64> {
794        self.attribute(key).and_then(|v| v.as_int())
795    }
796
797    // =========================================================================
798    // Content Addressing
799    // =========================================================================
800
801    /// Recompute the OID from the current content.
802    /// Must be called after any mutation to maintain the content-addressing invariant.
803    pub fn recompute_oid(&mut self) {
804        self.oid = Self::compute_oid(&self.kind, &self.payload, &self.edges, &self.embeddings);
805    }
806
807    /// Verify that the stored OID matches the current content.
808    pub fn verify_oid(&self) -> bool {
809        let computed = Self::compute_oid(&self.kind, &self.payload, &self.edges, &self.embeddings);
810        self.oid == computed
811    }
812
813    /// Compute the canonical OID for given content.
814    fn compute_oid(
815        kind: &ObjectKind,
816        payload: &SochValue,
817        edges: &[Edge],
818        embeddings: &HashMap<String, EmbeddingSpace>,
819    ) -> ObjectId {
820        let canonical = Self::canonical_bytes(kind, payload, edges, embeddings);
821        ObjectId::from_content(&canonical)
822    }
823
824    /// Produce canonical bytes for OID computation.
825    ///
826    /// Canonical serialization ensures deterministic hashing:
827    /// - HashMap keys are sorted lexicographically
828    /// - Floats are normalized (NaN → 0.0, -0.0 → 0.0)
829    /// - Using bincode for compact, deterministic binary encoding
830    fn canonical_bytes(
831        kind: &ObjectKind,
832        payload: &SochValue,
833        edges: &[Edge],
834        embeddings: &HashMap<String, EmbeddingSpace>,
835    ) -> Vec<u8> {
836        // We use a deterministic serialization approach:
837        // 1. Serialize kind label
838        // 2. Serialize payload via bincode
839        // 3. Serialize edges sorted by (target, kind)
840        // 4. Serialize embeddings sorted by space name
841        let mut hasher_input = Vec::with_capacity(1024);
842
843        // Kind label
844        let kind_bytes = kind.label().as_bytes();
845        hasher_input.extend_from_slice(&(kind_bytes.len() as u32).to_le_bytes());
846        hasher_input.extend_from_slice(kind_bytes);
847
848        // Payload — deterministic serialization of SochValue
849        // For Object(HashMap), sort keys before serializing
850        let payload_bytes = canonical_soch_value_bytes(payload);
851        hasher_input.extend_from_slice(&(payload_bytes.len() as u32).to_le_bytes());
852        hasher_input.extend_from_slice(&payload_bytes);
853
854        // Edges sorted by (target OID, kind label) for determinism
855        let mut sorted_edges: Vec<_> = edges.iter().collect();
856        sorted_edges.sort_by(|a, b| {
857            a.target
858                .as_bytes()
859                .cmp(b.target.as_bytes())
860                .then_with(|| a.kind.label().cmp(b.kind.label()))
861        });
862        hasher_input.extend_from_slice(&(sorted_edges.len() as u32).to_le_bytes());
863        for edge in &sorted_edges {
864            hasher_input.extend_from_slice(edge.target.as_bytes());
865            let kind_label = edge.kind.label().as_bytes();
866            hasher_input.extend_from_slice(&(kind_label.len() as u32).to_le_bytes());
867            hasher_input.extend_from_slice(kind_label);
868            hasher_input.extend_from_slice(&edge.weight.to_le_bytes());
869        }
870
871        // Embeddings sorted by space name for determinism
872        let mut sorted_spaces: Vec<_> = embeddings.iter().collect();
873        sorted_spaces.sort_by_key(|(name, _)| *name);
874        hasher_input.extend_from_slice(&(sorted_spaces.len() as u32).to_le_bytes());
875        for (name, embedding) in &sorted_spaces {
876            let name_bytes = name.as_bytes();
877            hasher_input.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
878            hasher_input.extend_from_slice(name_bytes);
879            hasher_input.extend_from_slice(&embedding.dimensions.to_le_bytes());
880            for &v in &embedding.vector {
881                hasher_input.extend_from_slice(&v.to_le_bytes());
882            }
883        }
884
885        hasher_input
886    }
887}
888
889/// Produce deterministic bytes for a SochValue.
890/// For `Object(HashMap)`, keys are sorted to ensure deterministic output.
891fn canonical_soch_value_bytes(value: &SochValue) -> Vec<u8> {
892    let mut buf = Vec::with_capacity(256);
893    write_canonical_soch_value(&mut buf, value);
894    buf
895}
896
897/// Recursively write a SochValue in canonical (deterministic) byte order.
898fn write_canonical_soch_value(buf: &mut Vec<u8>, value: &SochValue) {
899    match value {
900        SochValue::Null => buf.push(0),
901        SochValue::Bool(b) => {
902            buf.push(1);
903            buf.push(if *b { 1 } else { 0 });
904        }
905        SochValue::Int(i) => {
906            buf.push(2);
907            buf.extend_from_slice(&i.to_le_bytes());
908        }
909        SochValue::UInt(u) => {
910            buf.push(3);
911            buf.extend_from_slice(&u.to_le_bytes());
912        }
913        SochValue::Float(f) => {
914            buf.push(4);
915            // Normalize: NaN → 0.0, -0.0 → 0.0
916            let normalized = if f.is_nan() { 0.0 } else if *f == 0.0 { 0.0 } else { *f };
917            buf.extend_from_slice(&normalized.to_le_bytes());
918        }
919        SochValue::Text(s) => {
920            buf.push(5);
921            buf.extend_from_slice(&(s.len() as u32).to_le_bytes());
922            buf.extend_from_slice(s.as_bytes());
923        }
924        SochValue::Binary(b) => {
925            buf.push(6);
926            buf.extend_from_slice(&(b.len() as u32).to_le_bytes());
927            buf.extend_from_slice(b);
928        }
929        SochValue::Array(arr) => {
930            buf.push(7);
931            buf.extend_from_slice(&(arr.len() as u32).to_le_bytes());
932            for item in arr {
933                write_canonical_soch_value(buf, item);
934            }
935        }
936        SochValue::Object(map) => {
937            buf.push(8);
938            // Sort keys for deterministic ordering
939            let mut sorted_keys: Vec<&String> = map.keys().collect();
940            sorted_keys.sort();
941            buf.extend_from_slice(&(sorted_keys.len() as u32).to_le_bytes());
942            for key in sorted_keys {
943                buf.extend_from_slice(&(key.len() as u32).to_le_bytes());
944                buf.extend_from_slice(key.as_bytes());
945                write_canonical_soch_value(buf, &map[key]);
946            }
947        }
948        SochValue::Ref { table, id } => {
949            buf.push(9);
950            buf.extend_from_slice(&(table.len() as u32).to_le_bytes());
951            buf.extend_from_slice(table.as_bytes());
952            buf.extend_from_slice(&id.to_le_bytes());
953        }
954    }
955}
956
957impl KnowledgeObject {
958    // =========================================================================
959    // Serialization
960    // =========================================================================
961
962    /// Serialize this Knowledge Object to compact binary format.
963    /// Uses serde_json for reliable HashMap serialization.
964    pub fn to_bytes(&self) -> Result<Vec<u8>, KnowledgeObjectError> {
965        serde_json::to_vec(self).map_err(|e| KnowledgeObjectError::SerializationError(e.to_string()))
966    }
967
968    /// Deserialize a Knowledge Object from binary format.
969    pub fn from_bytes(bytes: &[u8]) -> Result<Self, KnowledgeObjectError> {
970        serde_json::from_slice(bytes)
971            .map_err(|e| KnowledgeObjectError::DeserializationError(e.to_string()))
972    }
973
974    /// Estimated memory footprint of this object (for memory budgeting).
975    pub fn estimated_size(&self) -> usize {
976        std::mem::size_of::<Self>()
977            + self.edges.len() * std::mem::size_of::<Edge>()
978            + self
979                .embeddings
980                .values()
981                .map(|e| e.vector.len() * 4)
982                .sum::<usize>()
983            + self.tags.iter().map(|t| t.len()).sum::<usize>()
984    }
985
986    // =========================================================================
987    // Compressed Serialization
988    // =========================================================================
989
990    /// Serialize with per-object compression.
991    ///
992    /// Wire format: `[tag: u8] [original_len: u32 LE] [payload...]`
993    ///
994    /// - Tag 0 (`None`): payload is raw JSON (original_len == payload.len()).
995    /// - Tag 1 (`Lz4`): payload is LZ4-block-compressed JSON.
996    /// - Tag 2 (`Zstd`): payload is ZSTD-compressed JSON.
997    ///
998    /// Falls back to uncompressed if compressed output >= original size.
999    pub fn to_compressed_bytes(
1000        &self,
1001        mode: CompressionMode,
1002    ) -> Result<Vec<u8>, KnowledgeObjectError> {
1003        let raw = self.to_bytes()?;
1004        let original_len = raw.len() as u32;
1005
1006        match mode {
1007            CompressionMode::None => {
1008                let mut out = Vec::with_capacity(5 + raw.len());
1009                out.push(CompressionMode::None.tag());
1010                out.extend_from_slice(&original_len.to_le_bytes());
1011                out.extend_from_slice(&raw);
1012                Ok(out)
1013            }
1014            CompressionMode::Lz4 => {
1015                let compressed = lz4::block::compress(&raw, None, false)
1016                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?;
1017                // Fallback if compression doesn't save space
1018                if compressed.len() >= raw.len() {
1019                    let mut out = Vec::with_capacity(5 + raw.len());
1020                    out.push(CompressionMode::None.tag());
1021                    out.extend_from_slice(&original_len.to_le_bytes());
1022                    out.extend_from_slice(&raw);
1023                    return Ok(out);
1024                }
1025                let mut out = Vec::with_capacity(5 + compressed.len());
1026                out.push(CompressionMode::Lz4.tag());
1027                out.extend_from_slice(&original_len.to_le_bytes());
1028                out.extend_from_slice(&compressed);
1029                Ok(out)
1030            }
1031            CompressionMode::Zstd { level } => {
1032                let compressed = zstd::encode_all(raw.as_slice(), level)
1033                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?;
1034                if compressed.len() >= raw.len() {
1035                    let mut out = Vec::with_capacity(5 + raw.len());
1036                    out.push(CompressionMode::None.tag());
1037                    out.extend_from_slice(&original_len.to_le_bytes());
1038                    out.extend_from_slice(&raw);
1039                    return Ok(out);
1040                }
1041                let mut out = Vec::with_capacity(5 + compressed.len());
1042                out.push(CompressionMode::Zstd { level }.tag());
1043                out.extend_from_slice(&original_len.to_le_bytes());
1044                out.extend_from_slice(&compressed);
1045                Ok(out)
1046            }
1047        }
1048    }
1049
1050    /// Deserialize from compressed wire format (auto-detects compression).
1051    ///
1052    /// The 1-byte tag determines the decompression algorithm.
1053    pub fn from_compressed_bytes(bytes: &[u8]) -> Result<Self, KnowledgeObjectError> {
1054        if bytes.len() < 5 {
1055            return Err(KnowledgeObjectError::DeserializationError(
1056                "compressed payload too short (need >= 5 bytes)".into(),
1057            ));
1058        }
1059
1060        let tag = bytes[0];
1061        let original_len =
1062            u32::from_le_bytes([bytes[1], bytes[2], bytes[3], bytes[4]]) as usize;
1063        let payload = &bytes[5..];
1064
1065        let raw = match tag {
1066            0 => {
1067                // Uncompressed
1068                payload.to_vec()
1069            }
1070            1 => {
1071                // LZ4
1072                lz4::block::decompress(payload, Some(original_len as i32))
1073                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?
1074            }
1075            2 => {
1076                // ZSTD
1077                let mut decoder = zstd::Decoder::new(payload)
1078                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?;
1079                let mut raw = Vec::with_capacity(original_len);
1080                decoder
1081                    .read_to_end(&mut raw)
1082                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?;
1083                raw
1084            }
1085            _ => {
1086                return Err(KnowledgeObjectError::UnknownCompressionTag(tag));
1087            }
1088        };
1089
1090        Self::from_bytes(&raw)
1091    }
1092
1093    /// Returns the compression ratio for a given mode (compressed_size / original_size).
1094    /// Values < 1.0 indicate space savings.
1095    pub fn compression_ratio(
1096        &self,
1097        mode: CompressionMode,
1098    ) -> Result<f64, KnowledgeObjectError> {
1099        let raw_len = self.to_bytes()?.len() as f64;
1100        let compressed_len = self.to_compressed_bytes(mode)?.len() as f64;
1101        Ok(compressed_len / raw_len)
1102    }
1103}
1104
1105// =============================================================================
1106// Compression Mode
1107// =============================================================================
1108
/// Per-object compression strategy.
///
/// Each [`KnowledgeObject`] can be independently compressed with a different
/// algorithm and level. The choice depends on the object's characteristics:
///
/// - **LZ4**: ~3 GB/s decode, low CPU. Best for hot/frequently-accessed objects.
/// - **ZSTD**: ~1 GB/s decode, better ratios. Best for cold/archival objects.
/// - **None**: Zero overhead. Use for tiny objects where compression adds bytes.
///
/// Each variant maps to a 1-byte wire tag via [`CompressionMode::tag`]; the
/// ZSTD level is not encoded in that tag.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionMode {
    /// No compression — raw JSON bytes (wire tag 0).
    None,
    /// LZ4 block compression — fast decode, moderate ratio (wire tag 1).
    Lz4,
    /// ZSTD compression with configurable level (1–22, default 3) (wire tag 2).
    Zstd { level: i32 },
}
1126
1127impl CompressionMode {
1128    /// 1-byte tag written to the wire format header.
1129    pub fn tag(&self) -> u8 {
1130        match self {
1131            Self::None => 0,
1132            Self::Lz4 => 1,
1133            Self::Zstd { .. } => 2,
1134        }
1135    }
1136
1137    /// Construct from wire tag (decompression side doesn't need level).
1138    pub fn from_tag(tag: u8) -> Option<Self> {
1139        match tag {
1140            0 => Some(Self::None),
1141            1 => Some(Self::Lz4),
1142            2 => Some(Self::Zstd { level: 0 }), // level unused for decompression
1143            _ => Option::None,
1144        }
1145    }
1146
1147    /// Default ZSTD mode (level 3 — good balance of speed and ratio).
1148    pub fn zstd() -> Self {
1149        Self::Zstd { level: 3 }
1150    }
1151
1152    /// High-compression ZSTD (level 9 — archival).
1153    pub fn zstd_high() -> Self {
1154        Self::Zstd { level: 9 }
1155    }
1156}
1157
1158impl Default for CompressionMode {
1159    fn default() -> Self {
1160        Self::None
1161    }
1162}
1163
1164impl PartialEq for KnowledgeObject {
1165    fn eq(&self, other: &Self) -> bool {
1166        // Content-addressed equality: same OID means same object.
1167        self.oid == other.oid
1168    }
1169}
1170
// Marker impl: equality is the OID comparison from the `PartialEq` impl above,
// so this relies on `ObjectId` comparison being a total equivalence relation.
impl Eq for KnowledgeObject {}
1172
1173impl std::hash::Hash for KnowledgeObject {
1174    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
1175        self.oid.hash(state);
1176    }
1177}
1178
1179impl fmt::Display for KnowledgeObject {
1180    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1181        write!(
1182            f,
1183            "KO({}, kind={}, edges={}, embeddings={}, tags={})",
1184            &self.oid.to_hex()[..12],
1185            self.kind,
1186            self.edges.len(),
1187            self.embeddings.len(),
1188            self.tags.len()
1189        )
1190    }
1191}
1192
1193// =============================================================================
1194// Builder
1195// =============================================================================
1196
/// Ergonomic builder for constructing Knowledge Objects.
///
/// The builder computes the content-addressed OID automatically on `.build()`.
/// The OID covers `kind`, `payload`, `edges` and `embeddings`; the remaining
/// fields are carried onto the object without being passed to the OID
/// computation.
///
/// # Example
///
/// ```rust,ignore
/// let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
///     .attribute("name", SochValue::Text("Alice".into()))
///     .embedding("semantic", vec![0.1, 0.2, 0.3])
///     .tag("person")
///     .valid_from(1700000000_000000)
///     .build();
/// ```
pub struct KnowledgeObjectBuilder {
    // Object kind; fixed at `new()` and part of the OID.
    kind: ObjectKind,
    // Payload value; starts as an empty `Object` and is part of the OID.
    payload: SochValue,
    // Outgoing edges, in insertion order; part of the OID.
    edges: Vec<Edge>,
    // Embeddings keyed by space name; part of the OID.
    embeddings: HashMap<String, EmbeddingSpace>,
    // Bitemporal coordinate (valid_from, valid_to, system_time).
    temporal: BitemporalCoord,
    // Provenance record; defaults to `Provenance::root("system", 0)`.
    provenance: Provenance,
    // Optional namespace; `None` until `namespace()` is called.
    namespace: Option<String>,
    // Tags, in insertion order.
    tags: Vec<String>,
}
1221
1222impl KnowledgeObjectBuilder {
1223    /// Create a new builder with the given object kind.
1224    pub fn new(kind: ObjectKind) -> Self {
1225        Self {
1226            kind,
1227            payload: SochValue::Object(HashMap::new()),
1228            edges: Vec::new(),
1229            embeddings: HashMap::new(),
1230            temporal: BitemporalCoord::default(),
1231            provenance: Provenance::root("system", 0),
1232            namespace: None,
1233            tags: Vec::new(),
1234        }
1235    }
1236
1237    /// Set the full payload.
1238    pub fn payload(mut self, payload: SochValue) -> Self {
1239        self.payload = payload;
1240        self
1241    }
1242
1243    /// Add a named attribute to the payload (creates/extends an Object payload).
1244    pub fn attribute(mut self, key: impl Into<String>, value: SochValue) -> Self {
1245        match &mut self.payload {
1246            SochValue::Object(map) => {
1247                map.insert(key.into(), value);
1248            }
1249            _ => {
1250                let mut map = HashMap::new();
1251                map.insert(key.into(), value);
1252                self.payload = SochValue::Object(map);
1253            }
1254        }
1255        self
1256    }
1257
1258    /// Add an outgoing edge.
1259    pub fn edge(mut self, edge: Edge) -> Self {
1260        self.edges.push(edge);
1261        self
1262    }
1263
1264    /// Add multiple edges at once.
1265    pub fn edges(mut self, edges: impl IntoIterator<Item = Edge>) -> Self {
1266        self.edges.extend(edges);
1267        self
1268    }
1269
1270    /// Add an embedding in a named space.
1271    pub fn embedding(
1272        mut self,
1273        space: impl Into<String>,
1274        vector: Vec<f32>,
1275    ) -> Self {
1276        let space_name = space.into();
1277        self.embeddings.insert(
1278            space_name,
1279            EmbeddingSpace::new(vector, "unknown", 0),
1280        );
1281        self
1282    }
1283
1284    /// Add an embedding with full metadata.
1285    pub fn embedding_with_metadata(
1286        mut self,
1287        space: impl Into<String>,
1288        vector: Vec<f32>,
1289        model: impl Into<String>,
1290        generated_at: u64,
1291    ) -> Self {
1292        let space_name = space.into();
1293        self.embeddings.insert(
1294            space_name,
1295            EmbeddingSpace::new(vector, model, generated_at),
1296        );
1297        self
1298    }
1299
1300    /// Set the valid_from time (HLC-encoded microseconds).
1301    pub fn valid_from(mut self, valid_from: u64) -> Self {
1302        self.temporal.valid_from = valid_from;
1303        self
1304    }
1305
1306    /// Set the valid_to time (HLC-encoded microseconds).
1307    pub fn valid_to(mut self, valid_to: u64) -> Self {
1308        self.temporal.valid_to = valid_to;
1309        self
1310    }
1311
1312    /// Set the system_time (typically assigned automatically by HLC on write).
1313    pub fn system_time(mut self, system_time: u64) -> Self {
1314        self.temporal.system_time = system_time;
1315        self
1316    }
1317
1318    /// Set the full bitemporal coordinate.
1319    pub fn temporal(mut self, temporal: BitemporalCoord) -> Self {
1320        self.temporal = temporal;
1321        self
1322    }
1323
1324    /// Set the provenance record.
1325    pub fn provenance(mut self, provenance: Provenance) -> Self {
1326        self.provenance = provenance;
1327        self
1328    }
1329
1330    /// Set the namespace.
1331    pub fn namespace(mut self, namespace: impl Into<String>) -> Self {
1332        self.namespace = Some(namespace.into());
1333        self
1334    }
1335
1336    /// Add a tag.
1337    pub fn tag(mut self, tag: impl Into<String>) -> Self {
1338        self.tags.push(tag.into());
1339        self
1340    }
1341
1342    /// Add multiple tags.
1343    pub fn tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
1344        self.tags.extend(tags.into_iter().map(|t| t.into()));
1345        self
1346    }
1347
1348    /// Build the Knowledge Object, computing the content-addressed OID.
1349    pub fn build(self) -> KnowledgeObject {
1350        let oid = KnowledgeObject::compute_oid(
1351            &self.kind,
1352            &self.payload,
1353            &self.edges,
1354            &self.embeddings,
1355        );
1356
1357        KnowledgeObject {
1358            oid,
1359            kind: self.kind,
1360            payload: self.payload,
1361            edges: self.edges,
1362            embeddings: self.embeddings,
1363            temporal: self.temporal,
1364            provenance: self.provenance,
1365            namespace: self.namespace,
1366            tags: self.tags,
1367        }
1368    }
1369
1370    /// Build with a pre-computed OID (for deserialization or migration).
1371    pub fn build_with_oid(self, oid: ObjectId) -> KnowledgeObject {
1372        KnowledgeObject {
1373            oid,
1374            kind: self.kind,
1375            payload: self.payload,
1376            edges: self.edges,
1377            embeddings: self.embeddings,
1378            temporal: self.temporal,
1379            provenance: self.provenance,
1380            namespace: self.namespace,
1381            tags: self.tags,
1382        }
1383    }
1384}
1385
1386// =============================================================================
1387// Error Types
1388// =============================================================================
1389
/// Errors for Knowledge Object operations.
///
/// Variants that wrap a `String` carry the underlying error's message text.
#[derive(Debug, Clone, thiserror::Error)]
pub enum KnowledgeObjectError {
    /// Serializing an object failed (e.g. reported by `to_bytes`).
    #[error("serialization error: {0}")]
    SerializationError(String),

    /// Decoding bytes into an object failed (e.g. malformed JSON in
    /// `from_bytes`, or a compressed frame shorter than its header).
    #[error("deserialization error: {0}")]
    DeserializationError(String),

    /// The stored OID and the OID computed from the content differ.
    #[error("OID verification failed: stored={stored}, computed={computed}")]
    OidMismatch { stored: String, computed: String },

    /// A required embedding space (named by the payload string) is absent.
    #[error("missing required embedding space: {0}")]
    MissingEmbedding(String),

    /// An embedding in `space` has `got` dimensions where `expected` were required.
    #[error("dimension mismatch in space '{space}': expected {expected}, got {got}")]
    DimensionMismatch {
        space: String,
        expected: u32,
        got: u32,
    },

    /// The valid-time interval is inverted (`valid_from > valid_to`).
    #[error("invalid temporal coordinates: valid_from ({valid_from}) > valid_to ({valid_to})")]
    InvalidTemporalRange { valid_from: u64, valid_to: u64 },

    /// The LZ4 or ZSTD codec reported a failure while compressing or decompressing.
    #[error("compression error: {0}")]
    CompressionError(String),

    /// The first byte of a compressed frame is not a known compression tag.
    #[error("unknown compression tag: {0}")]
    UnknownCompressionTag(u8),
}
1421
1422// =============================================================================
1423// Conversion from TOON/SochValue
1424// =============================================================================
1425
1426impl From<SochValue> for KnowledgeObjectBuilder {
1427    /// Convert a SochValue into a KnowledgeObject builder.
1428    /// The SochValue becomes the payload; kind defaults to `Document`.
1429    fn from(value: SochValue) -> Self {
1430        KnowledgeObjectBuilder::new(ObjectKind::Document).payload(value)
1431    }
1432}
1433
1434// =============================================================================
1435// Tests
1436// =============================================================================
1437
1438#[cfg(test)]
1439mod tests {
1440    use super::*;
1441
1442    #[test]
1443    fn test_content_addressing_determinism() {
1444        let ko1 = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1445            .attribute("name", SochValue::Text("Alice".into()))
1446            .attribute("age", SochValue::Int(30))
1447            .build();
1448
1449        let ko2 = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1450            .attribute("age", SochValue::Int(30))
1451            .attribute("name", SochValue::Text("Alice".into()))
1452            .build();
1453
1454        // Different insertion order, same content → same OID
1455        assert_eq!(ko1.oid(), ko2.oid());
1456    }
1457
1458    #[test]
1459    fn test_different_content_different_oid() {
1460        let ko1 = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1461            .attribute("name", SochValue::Text("Alice".into()))
1462            .build();
1463
1464        let ko2 = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1465            .attribute("name", SochValue::Text("Bob".into()))
1466            .build();
1467
1468        assert_ne!(ko1.oid(), ko2.oid());
1469    }
1470
1471    #[test]
1472    fn test_oid_verification() {
1473        let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
1474            .attribute("content", SochValue::Text("Hello, world!".into()))
1475            .build();
1476
1477        assert!(ko.verify_oid());
1478    }
1479
1480    #[test]
1481    fn test_bitemporal_queries() {
1482        let ko = KnowledgeObjectBuilder::new(ObjectKind::Event)
1483            .valid_from(100)
1484            .valid_to(200)
1485            .system_time(50)
1486            .build();
1487
1488        assert!(ko.valid_at(150));
1489        assert!(!ko.valid_at(250));
1490        assert!(ko.known_at(50));
1491        assert!(ko.known_at(100));
1492        assert!(!ko.known_at(40));
1493
1494        // Combined: visible at system_time=60, valid_time=150 → true
1495        assert!(ko.visible_at(60, 150));
1496        // Combined: visible at system_time=40, valid_time=150 → false (not yet recorded)
1497        assert!(!ko.visible_at(40, 150));
1498    }
1499
1500    #[test]
1501    fn test_embedded_edges() {
1502        let target_oid = ObjectId::from_content(b"target_object");
1503
1504        let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1505            .attribute("name", SochValue::Text("Alice".into()))
1506            .edge(Edge::new(target_oid, EdgeKind::typed("works_at"), 1.0))
1507            .edge(Edge::new(target_oid, EdgeKind::Contains, 0.5))
1508            .build();
1509
1510        assert_eq!(ko.edges().len(), 2);
1511        assert_eq!(ko.edges_of_kind(&EdgeKind::typed("works_at")).len(), 1);
1512        assert_eq!(ko.edges_of_kind(&EdgeKind::Contains).len(), 1);
1513    }
1514
1515    #[test]
1516    fn test_multi_space_embeddings() {
1517        let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
1518            .embedding("semantic", vec![0.1, 0.2, 0.3])
1519            .embedding("code", vec![0.4, 0.5, 0.6, 0.7])
1520            .build();
1521
1522        assert!(ko.embedding("semantic").is_some());
1523        assert!(ko.embedding("code").is_some());
1524        assert!(ko.embedding("nonexistent").is_none());
1525        assert_eq!(ko.embedding("semantic").unwrap().dimensions, 3);
1526        assert_eq!(ko.embedding("code").unwrap().dimensions, 4);
1527    }
1528
1529    #[test]
1530    fn test_provenance_chain() {
1531        let parent_oid = ObjectId::from_content(b"parent_document");
1532
1533        let ko = KnowledgeObjectBuilder::new(ObjectKind::Fact)
1534            .attribute("claim", SochValue::Text("X is true".into()))
1535            .provenance(Provenance::derived(
1536                vec![parent_oid],
1537                "extract_facts",
1538                "gpt-4",
1539                1700000000,
1540            ))
1541            .build();
1542
1543        assert!(!ko.provenance().is_root());
1544        assert_eq!(ko.provenance().parents.len(), 1);
1545        assert_eq!(ko.provenance().parents[0], parent_oid);
1546        assert_eq!(ko.provenance().operation, "extract_facts");
1547    }
1548
1549    #[test]
1550    fn test_serialization_roundtrip() {
1551        let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1552            .attribute("name", SochValue::Text("Alice".into()))
1553            .embedding("semantic", vec![0.1, 0.2, 0.3])
1554            .tag("person")
1555            .namespace("test")
1556            .build();
1557
1558        let bytes = ko.to_bytes().unwrap();
1559        let restored = KnowledgeObject::from_bytes(&bytes).unwrap();
1560
1561        assert_eq!(ko.oid(), restored.oid());
1562        assert_eq!(ko.kind(), restored.kind());
1563        assert_eq!(ko.tags(), restored.tags());
1564        assert_eq!(ko.namespace(), restored.namespace());
1565    }
1566
1567    #[test]
1568    fn test_object_id_hex_roundtrip() {
1569        let oid = ObjectId::from_content(b"test content");
1570        let hex = oid.to_hex();
1571        let parsed = ObjectId::from_hex(&hex).unwrap();
1572        assert_eq!(oid, parsed);
1573    }
1574
1575    #[test]
1576    fn test_nil_oid() {
1577        assert!(ObjectId::NIL.is_nil());
1578        let non_nil = ObjectId::from_content(b"something");
1579        assert!(!non_nil.is_nil());
1580    }
1581
1582    #[test]
1583    fn test_edge_temporal_filtering() {
1584        let target = ObjectId::from_content(b"target");
1585
1586        let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1587            .edge(Edge::with_validity(target, EdgeKind::typed("works_at"), 1.0, 100, 200))
1588            .edge(Edge::with_validity(target, EdgeKind::typed("manages"), 0.8, 150, u64::MAX))
1589            .build();
1590
1591        // At time 120: only "works_at" is valid
1592        let active = ko.edges_valid_at(120);
1593        assert_eq!(active.len(), 1);
1594        assert_eq!(active[0].kind, EdgeKind::typed("works_at"));
1595
1596        // At time 160: both are valid
1597        assert_eq!(ko.edges_valid_at(160).len(), 2);
1598
1599        // At time 250: only "manages" (still current)
1600        let active = ko.edges_valid_at(250);
1601        assert_eq!(active.len(), 1);
1602        assert_eq!(active[0].kind, EdgeKind::typed("manages"));
1603    }
1604
1605    #[test]
1606    fn test_estimated_size() {
1607        let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
1608            .embedding("semantic", vec![0.0; 384])
1609            .tag("test")
1610            .build();
1611
1612        let size = ko.estimated_size();
1613        assert!(size > 384 * 4); // At least the embedding vector
1614    }
1615
1616    #[test]
1617    fn test_display() {
1618        let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1619            .attribute("name", SochValue::Text("Alice".into()))
1620            .build();
1621
1622        let display = format!("{}", ko);
1623        assert!(display.starts_with("KO("));
1624        assert!(display.contains("kind=entity"));
1625    }
1626
1627    // =====================================================================
1628    // Compression tests
1629    // =====================================================================
1630
1631    #[test]
1632    fn test_compression_none_roundtrip() {
1633        let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
1634            .attribute("name", SochValue::Text("Alice".into()))
1635            .embedding("semantic", vec![0.1; 128])
1636            .tag("person")
1637            .build();
1638
1639        let compressed = ko.to_compressed_bytes(CompressionMode::None).unwrap();
1640        assert_eq!(compressed[0], 0); // tag = None
1641        let restored = KnowledgeObject::from_compressed_bytes(&compressed).unwrap();
1642        assert_eq!(ko.oid(), restored.oid());
1643    }
1644
1645    #[test]
1646    fn test_compression_lz4_roundtrip() {
1647        let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
1648            .attribute("content", SochValue::Text("hello world ".repeat(100)))
1649            .embedding("semantic", vec![0.5; 384])
1650            .build();
1651
1652        let compressed = ko.to_compressed_bytes(CompressionMode::Lz4).unwrap();
1653        let raw = ko.to_bytes().unwrap();
1654
1655        // LZ4 should compress repetitive content
1656        assert!(compressed.len() < raw.len(), "LZ4 should reduce size for repetitive data");
1657        assert_eq!(compressed[0], 1); // tag = Lz4
1658
1659        let restored = KnowledgeObject::from_compressed_bytes(&compressed).unwrap();
1660        assert_eq!(ko.oid(), restored.oid());
1661        assert_eq!(ko.tags(), restored.tags());
1662    }
1663
#[test]
fn test_compression_zstd_roundtrip() {
    // Repetitive document that also carries a tag and a namespace, so the
    // round trip can verify the namespace survives compression.
    let doc = KnowledgeObjectBuilder::new(ObjectKind::Document)
        .attribute("content", SochValue::Text("hello world ".repeat(100)))
        .embedding("semantic", vec![0.5; 384])
        .tag("document")
        .namespace("test-ns")
        .build();

    let packed = doc.to_compressed_bytes(CompressionMode::zstd()).unwrap();
    let plain = doc.to_bytes().unwrap();

    let (packed_len, plain_len) = (packed.len(), plain.len());
    assert!(packed_len < plain_len, "ZSTD should reduce size");

    // Leading byte is the compression tag; 2 identifies ZSTD.
    let mode_tag = packed[0];
    assert_eq!(mode_tag, 2);

    let decoded = KnowledgeObject::from_compressed_bytes(&packed).unwrap();
    assert_eq!(doc.oid(), decoded.oid());
    assert_eq!(doc.namespace(), decoded.namespace());
}
1683
#[test]
fn test_compression_fallback_on_tiny_object() {
    // With a single small attribute, compressing may not shrink the payload.
    let tiny = KnowledgeObjectBuilder::new(ObjectKind::Fact)
        .attribute("x", SochValue::Int(1))
        .build();

    let lz4_bytes = tiny.to_compressed_bytes(CompressionMode::Lz4).unwrap();
    let zstd_bytes = tiny.to_compressed_bytes(CompressionMode::zstd()).unwrap();

    // Whichever encoding was actually emitted (the encoder is expected to
    // fall back to None when compressed >= raw), decoding must still
    // reproduce the same object identity.
    let from_lz4 = KnowledgeObject::from_compressed_bytes(&lz4_bytes).unwrap();
    let from_zstd = KnowledgeObject::from_compressed_bytes(&zstd_bytes).unwrap();
    assert_eq!(tiny.oid(), from_lz4.oid());
    assert_eq!(tiny.oid(), from_zstd.oid());
}
1700
#[test]
fn test_compression_ratio() {
    // 4000 characters of a repeating 8-character pattern.
    let doc = KnowledgeObjectBuilder::new(ObjectKind::Document)
        .attribute("data", SochValue::Text("abcdefgh".repeat(500)))
        .build();

    // A ratio below 1.0 is treated here as "compression helped".
    let lz4_ratio = doc.compression_ratio(CompressionMode::Lz4).unwrap();
    assert!(lz4_ratio < 1.0, "LZ4 should achieve < 1.0 ratio on repetitive data");

    // ZSTD is expected to produce a strictly lower ratio than LZ4 on this input.
    let zstd_ratio = doc.compression_ratio(CompressionMode::zstd()).unwrap();
    assert!(zstd_ratio < lz4_ratio, "ZSTD should beat LZ4 ratio at default level");
}
1713
#[test]
fn test_compression_mode_tag_roundtrip() {
    // Every supported mode must map to a tag that `from_tag` maps back to a
    // mode reporting that same tag.
    let modes = [CompressionMode::None, CompressionMode::Lz4, CompressionMode::zstd()];
    for mode in modes {
        let wire_tag = mode.tag();
        let decoded_mode = CompressionMode::from_tag(wire_tag).unwrap();
        assert_eq!(mode.tag(), decoded_mode.tag());
    }

    // A tag value that no mode uses must not decode to a mode.
    assert!(CompressionMode::from_tag(255).is_none());
}
1723
#[test]
fn test_compressed_bytes_too_short() {
    // A three-byte input must be rejected rather than decoded.
    let truncated = [0, 1, 2];
    assert!(KnowledgeObject::from_compressed_bytes(&truncated).is_err());
}
1729
#[test]
fn test_unknown_compression_tag() {
    // Layout: tag=99 (not a known compression mode), then four zero bytes
    // (len=0). A fixed-size array is enough here; the buffer is only ever
    // borrowed, so no heap allocation is needed.
    let bad_bytes: [u8; 5] = [99, 0, 0, 0, 0];
    let result = KnowledgeObject::from_compressed_bytes(&bad_bytes);
    assert!(result.is_err(), "an unknown compression tag must be rejected");
}
1736}