use serde::{Deserialize, Serialize};
use serde_json;
use std::collections::HashMap;
use std::fmt;
use std::io::Read;
use crate::soch::SochValue;
/// A 32-byte object identifier.
///
/// The wrapped array is private; use `ObjectId::from_bytes`,
/// `ObjectId::from_content` or `ObjectId::from_hex` to construct one and
/// `ObjectId::as_bytes` / `ObjectId::to_hex` to read it back.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ObjectId([u8; 32]);
impl ObjectId {
pub fn from_bytes(bytes: [u8; 32]) -> Self {
Self(bytes)
}
pub fn from_content(content: &[u8]) -> Self {
let hash = blake3::hash(content);
Self(*hash.as_bytes())
}
pub fn as_bytes(&self) -> &[u8; 32] {
&self.0
}
pub fn to_hex(&self) -> String {
hex::encode(self.0)
}
pub fn from_hex(s: &str) -> Result<Self, ObjectIdError> {
let bytes = hex::decode(s).map_err(|_| ObjectIdError::InvalidHex)?;
if bytes.len() != 32 {
return Err(ObjectIdError::InvalidLength(bytes.len()));
}
let mut arr = [0u8; 32];
arr.copy_from_slice(&bytes);
Ok(Self(arr))
}
pub const NIL: Self = Self([0u8; 32]);
pub fn is_nil(&self) -> bool {
self.0 == [0u8; 32]
}
}
impl fmt::Debug for ObjectId {
    /// Writes `ObjectId(<prefix>)` where `<prefix>` is the first 16 hex
    /// characters (8 bytes) of the identifier.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let full_hex = self.to_hex();
        let prefix = &full_hex[..16];
        f.write_str("ObjectId(")?;
        f.write_str(prefix)?;
        f.write_str(")")
    }
}
impl fmt::Display for ObjectId {
    /// Writes the full hex rendering of the identifier.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let full_hex = self.to_hex();
        f.write_str(&full_hex)
    }
}
/// Errors returned by `ObjectId::from_hex`.
#[derive(Debug, Clone, thiserror::Error)]
pub enum ObjectIdError {
/// The input string could not be hex-decoded.
#[error("invalid hex encoding")]
InvalidHex,
/// The input decoded successfully but not to exactly 32 bytes; the payload
/// is the number of bytes that were decoded.
#[error("expected 32 bytes, got {0}")]
InvalidLength(usize),
}
/// A pair of time coordinates: a valid-time interval plus a system time.
///
/// The time unit is not fixed by this type (NOTE(review): confirm the unit
/// callers use).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct BitemporalCoord {
/// Inclusive start of the valid-time interval.
pub valid_from: u64,
/// Exclusive end of the valid-time interval; `u64::MAX` is treated as
/// "still current" by `is_current`.
pub valid_to: u64,
/// System time compared against by `known_at`.
pub system_time: u64,
}
impl BitemporalCoord {
pub fn new(valid_from: u64, system_time: u64) -> Self {
Self {
valid_from,
valid_to: u64::MAX,
system_time,
}
}
pub fn with_valid_range(valid_from: u64, valid_to: u64, system_time: u64) -> Self {
Self {
valid_from,
valid_to,
system_time,
}
}
pub fn valid_at(&self, valid_time: u64) -> bool {
self.valid_from <= valid_time && valid_time < self.valid_to
}
pub fn known_at(&self, system_time: u64) -> bool {
self.system_time <= system_time
}
pub fn visible_at(&self, system_time: u64, valid_time: u64) -> bool {
self.known_at(system_time) && self.valid_at(valid_time)
}
pub fn close_valid_time(&mut self, valid_to: u64) {
self.valid_to = valid_to;
}
pub fn is_current(&self) -> bool {
self.valid_to == u64::MAX
}
pub const ETERNAL: Self = Self {
valid_from: 0,
valid_to: u64::MAX,
system_time: 0,
};
}
impl Default for BitemporalCoord {
fn default() -> Self {
Self::ETERNAL
}
}
/// The relationship type carried by an `Edge`.
///
/// Each variant has a textual label, returned by `EdgeKind::label`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum EdgeKind {
/// A caller-defined relationship; the string is used verbatim as the label.
Typed(String),
/// Built-in kind with label `"contains"`.
Contains,
/// Built-in kind with label `"derived_from"`.
DerivedFrom,
/// Built-in kind with label `"references"`.
References,
/// Built-in kind with label `"succeeds"`.
Succeeds,
/// Built-in kind with label `"similar_to"`.
SimilarTo,
}
impl EdgeKind {
    /// Builds a [`EdgeKind::Typed`] edge kind from any string-like label.
    pub fn typed(label: impl Into<String>) -> Self {
        let name: String = label.into();
        EdgeKind::Typed(name)
    }

    /// Returns the textual label of this kind.
    ///
    /// Built-in variants map to fixed snake_case strings; `Typed` returns the
    /// string it carries.
    pub fn label(&self) -> &str {
        match self {
            Self::Contains => "contains",
            Self::DerivedFrom => "derived_from",
            Self::References => "references",
            Self::Succeeds => "succeeds",
            Self::SimilarTo => "similar_to",
            Self::Typed(custom) => custom.as_str(),
        }
    }
}
impl fmt::Display for EdgeKind {
    /// Writes the same text that `label` returns.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text = self.label();
        f.write_str(text)
    }
}
/// A directed link to another object, identified by the target's `ObjectId`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Edge {
/// Identifier of the object this edge points at.
pub target: ObjectId,
/// Relationship type of the edge.
pub kind: EdgeKind,
/// Numeric weight attached to the edge (NOTE(review): range and meaning
/// are not defined in this file).
pub weight: f32,
/// Inclusive start of the interval checked by `Edge::valid_at`.
pub valid_from: u64,
/// Exclusive end of the interval checked by `Edge::valid_at`; `u64::MAX`
/// is treated as "still current" by `Edge::is_current`.
pub valid_to: u64,
/// Free-form key/value properties. Omitted from serialized output when
/// empty and defaulted to an empty map when missing on deserialization.
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub properties: HashMap<String, SochValue>,
}
impl Edge {
pub fn new(target: ObjectId, kind: EdgeKind, weight: f32) -> Self {
Self {
target,
kind,
weight,
valid_from: 0,
valid_to: u64::MAX,
properties: HashMap::new(),
}
}
pub fn with_validity(
target: ObjectId,
kind: EdgeKind,
weight: f32,
valid_from: u64,
valid_to: u64,
) -> Self {
Self {
target,
kind,
weight,
valid_from,
valid_to,
properties: HashMap::new(),
}
}
pub fn with_property(mut self, key: impl Into<String>, value: SochValue) -> Self {
self.properties.insert(key.into(), value);
self
}
pub fn valid_at(&self, time: u64) -> bool {
self.valid_from <= time && time < self.valid_to
}
pub fn is_current(&self) -> bool {
self.valid_to == u64::MAX
}
}
impl PartialEq for Edge {
    /// Two edges compare equal when they share the same target and kind.
    ///
    /// Weight, validity interval and properties are deliberately not part of
    /// the comparison.
    fn eq(&self, other: &Self) -> bool {
        if self.target != other.target {
            return false;
        }
        self.kind == other.kind
    }
}

impl Eq for Edge {}
/// The category of a `KnowledgeObject`.
///
/// Each variant has a textual label, returned by `ObjectKind::label`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ObjectKind {
/// Built-in kind with label `"entity"`.
Entity,
/// Built-in kind with label `"event"`.
Event,
/// Built-in kind with label `"episode"`.
Episode,
/// Built-in kind with label `"document"`.
Document,
/// Built-in kind with label `"fact"`.
Fact,
/// Built-in kind with label `"artifact"`.
Artifact,
/// A caller-defined kind; the string is used verbatim as the label.
Custom(String),
}
impl ObjectKind {
pub fn label(&self) -> &str {
match self {
ObjectKind::Entity => "entity",
ObjectKind::Event => "event",
ObjectKind::Episode => "episode",
ObjectKind::Document => "document",
ObjectKind::Fact => "fact",
ObjectKind::Artifact => "artifact",
ObjectKind::Custom(s) => s,
}
}
}
impl fmt::Display for ObjectKind {
    /// Writes the same text that `label` returns.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text = self.label();
        f.write_str(text)
    }
}
/// A record of where an object came from: its parents, the operation that
/// produced it, who ran that operation, and when.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Provenance {
/// Identifiers of the objects this one was derived from; empty for a root.
pub parents: Vec<ObjectId>,
/// Name of the operation that produced the object (`"create"` for roots
/// built with `Provenance::root`).
pub operation: String,
/// Name of the agent that performed the operation.
pub agent: String,
/// Timestamp supplied by the caller (NOTE(review): unit/epoch not defined
/// in this file).
pub timestamp: u64,
/// Free-form key/value metadata. Omitted from serialized output when empty
/// and defaulted to an empty map when missing on deserialization.
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub metadata: HashMap<String, SochValue>,
}
impl Provenance {
pub fn root(agent: impl Into<String>, timestamp: u64) -> Self {
Self {
parents: Vec::new(),
operation: "create".to_string(),
agent: agent.into(),
timestamp,
metadata: HashMap::new(),
}
}
pub fn derived(
parents: Vec<ObjectId>,
operation: impl Into<String>,
agent: impl Into<String>,
timestamp: u64,
) -> Self {
Self {
parents,
operation: operation.into(),
agent: agent.into(),
timestamp,
metadata: HashMap::new(),
}
}
pub fn with_metadata(mut self, key: impl Into<String>, value: SochValue) -> Self {
self.metadata.insert(key.into(), value);
self
}
pub fn is_root(&self) -> bool {
self.parents.is_empty()
}
}
/// An embedding vector together with metadata about how it was produced.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddingSpace {
/// The embedding components.
pub vector: Vec<f32>,
/// Number of components; `EmbeddingSpace::new` sets this to `vector.len()`.
/// The field is public, so it is not guaranteed to stay in sync with
/// `vector`.
pub dimensions: u32,
/// Name of the model that produced the vector.
pub model: String,
/// Timestamp supplied by the caller for when the vector was generated
/// (NOTE(review): unit/epoch not defined in this file).
pub generated_at: u64,
}
impl EmbeddingSpace {
pub fn new(vector: Vec<f32>, model: impl Into<String>, generated_at: u64) -> Self {
let dimensions = vector.len() as u32;
Self {
vector,
dimensions,
model: model.into(),
generated_at,
}
}
pub fn norm(&self) -> f32 {
self.vector.iter().map(|x| x * x).sum::<f32>().sqrt()
}
pub fn normalize(&mut self) {
let norm = self.norm();
if norm > f32::EPSILON {
for x in &mut self.vector {
*x /= norm;
}
}
}
}
/// An object identified by an `ObjectId`, carrying a payload, outgoing
/// edges, named embeddings, temporal coordinates and provenance.
///
/// All fields are private; values are built through
/// `KnowledgeObjectBuilder` and read through accessor methods.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KnowledgeObject {
// Identifier; the builder's `build` derives it from kind, payload, edges and embeddings.
oid: ObjectId,
// Category of the object.
kind: ObjectKind,
// The object's data.
payload: SochValue,
// Outgoing edges stored inline with the object.
edges: Vec<Edge>,
// Embeddings keyed by space name.
embeddings: HashMap<String, EmbeddingSpace>,
// Valid-time interval and system time.
temporal: BitemporalCoord,
// Origin record.
provenance: Provenance,
// Optional namespace; omitted from serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
namespace: Option<String>,
// Tags; omitted from serialized output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
tags: Vec<String>,
}
impl KnowledgeObject {
    /// Returns the stored object identifier.
    pub fn oid(&self) -> ObjectId {
        self.oid
    }

    /// Returns the object's kind.
    pub fn kind(&self) -> &ObjectKind {
        &self.kind
    }

    /// Borrows the payload.
    pub fn payload(&self) -> &SochValue {
        &self.payload
    }

    /// Mutably borrows the payload.
    ///
    /// Changing the payload does not update the stored identifier; call
    /// [`recompute_oid`](Self::recompute_oid) afterwards if the identifier
    /// must keep matching the content.
    pub fn payload_mut(&mut self) -> &mut SochValue {
        &mut self.payload
    }

    /// Returns all edges in insertion order.
    pub fn edges(&self) -> &[Edge] {
        &self.edges
    }

    /// Returns the edges whose kind equals `kind`.
    pub fn edges_of_kind(&self, kind: &EdgeKind) -> Vec<&Edge> {
        self.edges.iter().filter(|e| &e.kind == kind).collect()
    }

    /// Returns the edges whose validity interval contains `time`.
    pub fn edges_valid_at(&self, time: u64) -> Vec<&Edge> {
        self.edges.iter().filter(|e| e.valid_at(time)).collect()
    }

    /// Looks up the embedding stored under the space name `space`.
    pub fn embedding(&self, space: &str) -> Option<&EmbeddingSpace> {
        self.embeddings.get(space)
    }

    /// Returns every embedding keyed by space name.
    pub fn embeddings(&self) -> &HashMap<String, EmbeddingSpace> {
        &self.embeddings
    }

    /// Returns the vector stored under the `"semantic"` space, if any.
    pub fn primary_embedding(&self) -> Option<&[f32]> {
        self.embeddings.get("semantic").map(|e| e.vector.as_slice())
    }

    /// Returns the temporal coordinates.
    pub fn temporal(&self) -> &BitemporalCoord {
        &self.temporal
    }

    /// Replaces the temporal coordinates.
    pub fn set_temporal(&mut self, coord: BitemporalCoord) {
        self.temporal = coord;
    }

    /// Returns the provenance record.
    pub fn provenance(&self) -> &Provenance {
        &self.provenance
    }

    /// Returns the namespace, if one was set.
    pub fn namespace(&self) -> Option<&str> {
        self.namespace.as_deref()
    }

    /// Returns the tags in insertion order.
    pub fn tags(&self) -> &[String] {
        &self.tags
    }

    /// Returns `true` when `tag` is present (exact string match).
    pub fn has_tag(&self, tag: &str) -> bool {
        self.tags.iter().any(|t| t == tag)
    }

    /// Delegates to [`BitemporalCoord::valid_at`].
    pub fn valid_at(&self, valid_time: u64) -> bool {
        self.temporal.valid_at(valid_time)
    }

    /// Delegates to [`BitemporalCoord::known_at`].
    pub fn known_at(&self, system_time: u64) -> bool {
        self.temporal.known_at(system_time)
    }

    /// Delegates to [`BitemporalCoord::visible_at`].
    pub fn visible_at(&self, system_time: u64, valid_time: u64) -> bool {
        self.temporal.visible_at(system_time, valid_time)
    }

    /// Delegates to [`BitemporalCoord::is_current`].
    pub fn is_current(&self) -> bool {
        self.temporal.is_current()
    }

    /// Looks up `key` in the payload when the payload is an object;
    /// returns `None` for any other payload shape or a missing key.
    pub fn attribute(&self, key: &str) -> Option<&SochValue> {
        match &self.payload {
            SochValue::Object(map) => map.get(key),
            _ => None,
        }
    }

    /// Like [`attribute`](Self::attribute), additionally requiring the value
    /// to be readable via `SochValue::as_text`.
    pub fn text_attribute(&self, key: &str) -> Option<&str> {
        self.attribute(key).and_then(|v| v.as_text())
    }

    /// Like [`attribute`](Self::attribute), additionally requiring the value
    /// to be readable via `SochValue::as_int`.
    pub fn int_attribute(&self, key: &str) -> Option<i64> {
        self.attribute(key).and_then(|v| v.as_int())
    }

    /// Recomputes the identifier from the current kind, payload, edges and
    /// embeddings and stores it.
    pub fn recompute_oid(&mut self) {
        self.oid = Self::compute_oid(&self.kind, &self.payload, &self.edges, &self.embeddings);
    }

    /// Returns `true` when the stored identifier equals the one computed from
    /// the current kind, payload, edges and embeddings.
    pub fn verify_oid(&self) -> bool {
        let computed = Self::compute_oid(&self.kind, &self.payload, &self.edges, &self.embeddings);
        self.oid == computed
    }

    /// Hashes the canonical byte encoding of the given parts into an
    /// [`ObjectId`].
    fn compute_oid(
        kind: &ObjectKind,
        payload: &SochValue,
        edges: &[Edge],
        embeddings: &HashMap<String, EmbeddingSpace>,
    ) -> ObjectId {
        let canonical = Self::canonical_bytes(kind, payload, edges, embeddings);
        ObjectId::from_content(&canonical)
    }

    /// Builds the order-independent byte string that is hashed into the
    /// identifier.
    ///
    /// Layout (all integers little-endian, all lengths/counts as `u32`):
    ///
    /// 1. kind label, length-prefixed;
    /// 2. canonical payload encoding, length-prefixed;
    /// 3. edge count, then per edge: 32-byte target, length-prefixed kind
    ///    label, weight bits;
    /// 4. embedding count, then per embedding (sorted by space name):
    ///    length-prefixed name, `dimensions`, every vector component.
    ///
    /// Only the inputs listed above contribute. Temporal coordinates,
    /// provenance, namespace, tags, edge validity/properties and the
    /// embedding `model`/`generated_at` fields are not part of the hash.
    /// Kinds are hashed by label, so a custom kind whose label equals a
    /// built-in label encodes identically to the built-in.
    fn canonical_bytes(
        kind: &ObjectKind,
        payload: &SochValue,
        edges: &[Edge],
        embeddings: &HashMap<String, EmbeddingSpace>,
    ) -> Vec<u8> {
        let mut hasher_input = Vec::with_capacity(1024);

        let kind_bytes = kind.label().as_bytes();
        hasher_input.extend_from_slice(&(kind_bytes.len() as u32).to_le_bytes());
        hasher_input.extend_from_slice(kind_bytes);

        let payload_bytes = canonical_soch_value_bytes(payload);
        hasher_input.extend_from_slice(&(payload_bytes.len() as u32).to_le_bytes());
        hasher_input.extend_from_slice(&payload_bytes);

        // Sort on every edge field that is hashed below. Without the weight
        // tie-break, two edges sharing target and kind but differing in
        // weight would be emitted in insertion order, making the identifier
        // depend on the order in which edges were added.
        let mut sorted_edges: Vec<_> = edges.iter().collect();
        sorted_edges.sort_by(|a, b| {
            a.target
                .as_bytes()
                .cmp(b.target.as_bytes())
                .then_with(|| a.kind.label().cmp(b.kind.label()))
                .then_with(|| a.weight.to_bits().cmp(&b.weight.to_bits()))
        });
        hasher_input.extend_from_slice(&(sorted_edges.len() as u32).to_le_bytes());
        for edge in &sorted_edges {
            hasher_input.extend_from_slice(edge.target.as_bytes());
            let kind_label = edge.kind.label().as_bytes();
            hasher_input.extend_from_slice(&(kind_label.len() as u32).to_le_bytes());
            hasher_input.extend_from_slice(kind_label);
            hasher_input.extend_from_slice(&edge.weight.to_le_bytes());
        }

        // Map iteration order is unspecified, so embeddings are emitted in
        // space-name order.
        let mut sorted_spaces: Vec<_> = embeddings.iter().collect();
        sorted_spaces.sort_by_key(|(name, _)| *name);
        hasher_input.extend_from_slice(&(sorted_spaces.len() as u32).to_le_bytes());
        for (name, embedding) in &sorted_spaces {
            let name_bytes = name.as_bytes();
            hasher_input.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
            hasher_input.extend_from_slice(name_bytes);
            hasher_input.extend_from_slice(&embedding.dimensions.to_le_bytes());
            for &v in &embedding.vector {
                hasher_input.extend_from_slice(&v.to_le_bytes());
            }
        }

        hasher_input
    }
}
/// Encodes `value` into a fresh buffer using the canonical encoding
/// implemented by `write_canonical_soch_value`.
fn canonical_soch_value_bytes(value: &SochValue) -> Vec<u8> {
    let mut encoded: Vec<u8> = Vec::with_capacity(256);
    write_canonical_soch_value(&mut encoded, value);
    encoded
}
/// Appends the canonical encoding of `value` to `buf`.
///
/// Every value starts with a one-byte type tag (0 = null ... 9 = ref).
/// Integers are written little-endian; strings, binaries, arrays and objects
/// are prefixed with their length/count as a little-endian `u32`. Object
/// entries are written in ascending key order so the encoding does not depend
/// on map iteration order.
///
/// Floats are normalized before writing: NaN and both zeros are written as
/// positive zero, so NaN, `0.0` and `-0.0` all share one encoding.
fn write_canonical_soch_value(buf: &mut Vec<u8>, value: &SochValue) {
    /// Appends `len` as a little-endian `u32`.
    fn put_len(buf: &mut Vec<u8>, len: usize) {
        buf.extend_from_slice(&(len as u32).to_le_bytes());
    }

    /// Appends `bytes` preceded by their length.
    fn put_blob(buf: &mut Vec<u8>, bytes: &[u8]) {
        put_len(buf, bytes.len());
        buf.extend_from_slice(bytes);
    }

    match value {
        SochValue::Null => buf.push(0),
        SochValue::Bool(flag) => {
            buf.push(1);
            buf.push(u8::from(*flag));
        }
        SochValue::Int(signed) => {
            buf.push(2);
            buf.extend_from_slice(&signed.to_le_bytes());
        }
        SochValue::UInt(unsigned) => {
            buf.push(3);
            buf.extend_from_slice(&unsigned.to_le_bytes());
        }
        SochValue::Float(number) => {
            buf.push(4);
            let canonical = if number.is_nan() || *number == 0.0 {
                0.0
            } else {
                *number
            };
            buf.extend_from_slice(&canonical.to_le_bytes());
        }
        SochValue::Text(text) => {
            buf.push(5);
            put_blob(buf, text.as_bytes());
        }
        SochValue::Binary(blob) => {
            buf.push(6);
            put_blob(buf, blob);
        }
        SochValue::Array(items) => {
            buf.push(7);
            put_len(buf, items.len());
            items
                .iter()
                .for_each(|item| write_canonical_soch_value(buf, item));
        }
        SochValue::Object(map) => {
            buf.push(8);
            let mut entries: Vec<_> = map.iter().collect();
            entries.sort_by(|left, right| left.0.cmp(right.0));
            put_len(buf, entries.len());
            for (key, item) in entries {
                put_blob(buf, key.as_bytes());
                write_canonical_soch_value(buf, item);
            }
        }
        SochValue::Ref { table, id } => {
            buf.push(9);
            put_blob(buf, table.as_bytes());
            buf.extend_from_slice(&id.to_le_bytes());
        }
    }
}
impl KnowledgeObject {
    /// Largest buffer that is pre-allocated on the word of a frame header
    /// before any data has actually been decompressed. The header is
    /// untrusted input, so its length field must not drive an unbounded
    /// allocation.
    const DECODE_PREALLOC_LIMIT: usize = 1 << 20;

    /// Serializes the object to JSON bytes.
    ///
    /// # Errors
    ///
    /// Returns [`KnowledgeObjectError::SerializationError`] when JSON
    /// encoding fails.
    pub fn to_bytes(&self) -> Result<Vec<u8>, KnowledgeObjectError> {
        serde_json::to_vec(self).map_err(|e| KnowledgeObjectError::SerializationError(e.to_string()))
    }

    /// Deserializes an object from JSON bytes produced by
    /// [`to_bytes`](Self::to_bytes).
    ///
    /// The stored identifier is taken as-is; it is not re-verified here.
    ///
    /// # Errors
    ///
    /// Returns [`KnowledgeObjectError::DeserializationError`] when JSON
    /// decoding fails.
    pub fn from_bytes(bytes: &[u8]) -> Result<Self, KnowledgeObjectError> {
        serde_json::from_slice(bytes)
            .map_err(|e| KnowledgeObjectError::DeserializationError(e.to_string()))
    }

    /// Returns a rough in-memory size in bytes.
    ///
    /// Counts the struct itself, one `Edge` per edge, four bytes per
    /// embedding component and the byte length of every tag. Payload
    /// contents, map keys and other heap data are not included.
    pub fn estimated_size(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.edges.len() * std::mem::size_of::<Edge>()
            + self
                .embeddings
                .values()
                .map(|e| e.vector.len() * 4)
                .sum::<usize>()
            + self.tags.iter().map(|t| t.len()).sum::<usize>()
    }

    /// Serializes the object and wraps it in a compression frame.
    ///
    /// Frame layout: one tag byte (see [`CompressionMode::tag`]), the
    /// uncompressed length as a little-endian `u32`, then the body. When the
    /// requested codec does not make the data smaller, the body is stored
    /// uncompressed under the `None` tag.
    ///
    /// # Errors
    ///
    /// * [`KnowledgeObjectError::SerializationError`] when JSON encoding
    ///   fails.
    /// * [`KnowledgeObjectError::CompressionError`] when the codec fails or
    ///   the serialized object does not fit the 32-bit length field.
    pub fn to_compressed_bytes(
        &self,
        mode: CompressionMode,
    ) -> Result<Vec<u8>, KnowledgeObjectError> {
        let raw = self.to_bytes()?;
        Self::compress_serialized(&raw, mode)
    }

    /// Wraps already-serialized bytes in a compression frame.
    fn compress_serialized(
        raw: &[u8],
        mode: CompressionMode,
    ) -> Result<Vec<u8>, KnowledgeObjectError> {
        // The header stores the length in 32 bits; refuse to write a
        // truncated (and therefore wrong) length.
        if raw.len() > u32::MAX as usize {
            return Err(KnowledgeObjectError::CompressionError(format!(
                "serialized object is {} bytes; frame header supports at most {} bytes",
                raw.len(),
                u32::MAX
            )));
        }
        let original_len = raw.len() as u32;

        let compressed = match mode {
            CompressionMode::None => None,
            CompressionMode::Lz4 => Some(
                lz4::block::compress(raw, None, false)
                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?,
            ),
            CompressionMode::Zstd { level } => Some(
                zstd::encode_all(raw, level)
                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?,
            ),
        };

        // Keep the compressed form only when it is strictly smaller.
        let (tag, body): (u8, &[u8]) = match &compressed {
            Some(packed) if packed.len() < raw.len() => (mode.tag(), packed.as_slice()),
            _ => (CompressionMode::None.tag(), raw),
        };

        let mut out = Vec::with_capacity(5 + body.len());
        out.push(tag);
        out.extend_from_slice(&original_len.to_le_bytes());
        out.extend_from_slice(body);
        Ok(out)
    }

    /// Decodes a frame produced by
    /// [`to_compressed_bytes`](Self::to_compressed_bytes).
    ///
    /// The length recorded in the header is treated as untrusted: it bounds
    /// how much data is decompressed and the decoded size must match it
    /// exactly.
    ///
    /// # Errors
    ///
    /// * [`KnowledgeObjectError::DeserializationError`] when the frame is
    ///   shorter than its 5-byte header, the decoded size disagrees with the
    ///   header, or JSON decoding fails.
    /// * [`KnowledgeObjectError::UnknownCompressionTag`] for an unrecognized
    ///   tag byte.
    /// * [`KnowledgeObjectError::CompressionError`] when decompression fails.
    pub fn from_compressed_bytes(bytes: &[u8]) -> Result<Self, KnowledgeObjectError> {
        if bytes.len() < 5 {
            return Err(KnowledgeObjectError::DeserializationError(
                "compressed payload too short (need >= 5 bytes)".into(),
            ));
        }
        let tag = bytes[0];
        let original_len =
            u32::from_le_bytes([bytes[1], bytes[2], bytes[3], bytes[4]]) as usize;
        let payload = &bytes[5..];

        let mode = CompressionMode::from_tag(tag)
            .ok_or(KnowledgeObjectError::UnknownCompressionTag(tag))?;

        let length_mismatch = |actual: usize| {
            KnowledgeObjectError::DeserializationError(format!(
                "frame header declares {} bytes but body decodes to {} bytes",
                original_len, actual
            ))
        };

        let raw = match mode {
            CompressionMode::None => {
                if payload.len() != original_len {
                    return Err(length_mismatch(payload.len()));
                }
                // Stored bodies can be parsed in place; no copy needed.
                return Self::from_bytes(payload);
            }
            CompressionMode::Lz4 => {
                // The LZ4 block API takes the size as `i32`; a wrapping cast
                // would turn large values negative.
                if original_len > i32::MAX as usize {
                    return Err(KnowledgeObjectError::CompressionError(format!(
                        "declared length {} exceeds the LZ4 block limit",
                        original_len
                    )));
                }
                lz4::block::decompress(payload, Some(original_len as i32))
                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?
            }
            CompressionMode::Zstd { .. } => {
                let decoder = zstd::Decoder::new(payload)
                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?;
                let mut raw =
                    Vec::with_capacity(original_len.min(Self::DECODE_PREALLOC_LIMIT));
                // Read at most one byte beyond the declared length: enough to
                // detect an oversized stream without inflating all of it.
                let mut bounded = decoder.take(original_len as u64 + 1);
                bounded
                    .read_to_end(&mut raw)
                    .map_err(|e| KnowledgeObjectError::CompressionError(e.to_string()))?;
                raw
            }
        };

        if raw.len() != original_len {
            return Err(length_mismatch(raw.len()));
        }
        Self::from_bytes(&raw)
    }

    /// Returns `framed size / serialized size` for `mode`.
    ///
    /// The framed size includes the 5-byte header, so the ratio is slightly
    /// above 1.0 when the data is stored uncompressed.
    ///
    /// # Errors
    ///
    /// Same as [`to_compressed_bytes`](Self::to_compressed_bytes).
    pub fn compression_ratio(
        &self,
        mode: CompressionMode,
    ) -> Result<f64, KnowledgeObjectError> {
        // Serialize once and reuse the bytes for both sides of the ratio.
        let raw = self.to_bytes()?;
        let framed = Self::compress_serialized(&raw, mode)?;
        Ok(framed.len() as f64 / raw.len() as f64)
    }
}
/// Codec used when framing a serialized object.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionMode {
    /// Store the data uncompressed (tag 0).
    None,
    /// LZ4 compression (tag 1).
    Lz4,
    /// Zstandard compression at the given level (tag 2).
    Zstd { level: i32 },
}

impl CompressionMode {
    /// Returns the one-byte tag identifying this codec. The Zstd level is not
    /// part of the tag.
    pub fn tag(&self) -> u8 {
        match *self {
            CompressionMode::None => 0,
            CompressionMode::Lz4 => 1,
            CompressionMode::Zstd { level: _ } => 2,
        }
    }

    /// Maps a tag byte back to a codec, or `None` (the `Option`) for an
    /// unknown tag.
    ///
    /// Tags do not carry the Zstd level, so tag 2 yields
    /// `Zstd { level: 0 }`.
    pub fn from_tag(tag: u8) -> Option<Self> {
        let mode = match tag {
            0 => CompressionMode::None,
            1 => CompressionMode::Lz4,
            2 => CompressionMode::Zstd { level: 0 },
            _ => return Option::None,
        };
        Some(mode)
    }

    /// Zstd at level 3.
    pub fn zstd() -> Self {
        CompressionMode::Zstd { level: 3 }
    }

    /// Zstd at level 9.
    pub fn zstd_high() -> Self {
        CompressionMode::Zstd { level: 9 }
    }
}

impl Default for CompressionMode {
    /// The default is to store data uncompressed.
    fn default() -> Self {
        CompressionMode::None
    }
}
impl PartialEq for KnowledgeObject {
    /// Objects compare equal when their stored identifiers are equal; no
    /// other field is inspected.
    fn eq(&self, other: &Self) -> bool {
        let (mine, theirs) = (&self.oid, &other.oid);
        mine == theirs
    }
}

impl Eq for KnowledgeObject {}

impl std::hash::Hash for KnowledgeObject {
    /// Hashes only the stored identifier, consistent with `eq`.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        std::hash::Hash::hash(&self.oid, state);
    }
}
impl fmt::Display for KnowledgeObject {
    /// Writes a one-line summary: the first 12 hex characters of the
    /// identifier, the kind, and the number of edges, embeddings and tags.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let full_hex = self.oid.to_hex();
        let short_oid = &full_hex[..12];
        let edge_count = self.edges.len();
        let embedding_count = self.embeddings.len();
        let tag_count = self.tags.len();
        write!(
            f,
            "KO({}, kind={}, edges={}, embeddings={}, tags={})",
            short_oid, self.kind, edge_count, embedding_count, tag_count
        )
    }
}
/// Step-by-step constructor for `KnowledgeObject`.
///
/// Holds every `KnowledgeObject` field except the identifier, which is
/// either computed by `build` or supplied to `build_with_oid`.
pub struct KnowledgeObjectBuilder {
// Category of the object being built.
kind: ObjectKind,
// Payload; starts as an empty object in `new`.
payload: SochValue,
// Edges accumulated so far.
edges: Vec<Edge>,
// Embeddings keyed by space name.
embeddings: HashMap<String, EmbeddingSpace>,
// Temporal coordinates; start at `BitemporalCoord::default()`.
temporal: BitemporalCoord,
// Provenance; starts as a root record for agent "system" at timestamp 0.
provenance: Provenance,
// Optional namespace.
namespace: Option<String>,
// Tags accumulated so far.
tags: Vec<String>,
}
impl KnowledgeObjectBuilder {
pub fn new(kind: ObjectKind) -> Self {
Self {
kind,
payload: SochValue::Object(HashMap::new()),
edges: Vec::new(),
embeddings: HashMap::new(),
temporal: BitemporalCoord::default(),
provenance: Provenance::root("system", 0),
namespace: None,
tags: Vec::new(),
}
}
pub fn payload(mut self, payload: SochValue) -> Self {
self.payload = payload;
self
}
pub fn attribute(mut self, key: impl Into<String>, value: SochValue) -> Self {
match &mut self.payload {
SochValue::Object(map) => {
map.insert(key.into(), value);
}
_ => {
let mut map = HashMap::new();
map.insert(key.into(), value);
self.payload = SochValue::Object(map);
}
}
self
}
pub fn edge(mut self, edge: Edge) -> Self {
self.edges.push(edge);
self
}
pub fn edges(mut self, edges: impl IntoIterator<Item = Edge>) -> Self {
self.edges.extend(edges);
self
}
pub fn embedding(
mut self,
space: impl Into<String>,
vector: Vec<f32>,
) -> Self {
let space_name = space.into();
self.embeddings.insert(
space_name,
EmbeddingSpace::new(vector, "unknown", 0),
);
self
}
pub fn embedding_with_metadata(
mut self,
space: impl Into<String>,
vector: Vec<f32>,
model: impl Into<String>,
generated_at: u64,
) -> Self {
let space_name = space.into();
self.embeddings.insert(
space_name,
EmbeddingSpace::new(vector, model, generated_at),
);
self
}
pub fn valid_from(mut self, valid_from: u64) -> Self {
self.temporal.valid_from = valid_from;
self
}
pub fn valid_to(mut self, valid_to: u64) -> Self {
self.temporal.valid_to = valid_to;
self
}
pub fn system_time(mut self, system_time: u64) -> Self {
self.temporal.system_time = system_time;
self
}
pub fn temporal(mut self, temporal: BitemporalCoord) -> Self {
self.temporal = temporal;
self
}
pub fn provenance(mut self, provenance: Provenance) -> Self {
self.provenance = provenance;
self
}
pub fn namespace(mut self, namespace: impl Into<String>) -> Self {
self.namespace = Some(namespace.into());
self
}
pub fn tag(mut self, tag: impl Into<String>) -> Self {
self.tags.push(tag.into());
self
}
pub fn tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
self.tags.extend(tags.into_iter().map(|t| t.into()));
self
}
pub fn build(self) -> KnowledgeObject {
let oid = KnowledgeObject::compute_oid(
&self.kind,
&self.payload,
&self.edges,
&self.embeddings,
);
KnowledgeObject {
oid,
kind: self.kind,
payload: self.payload,
edges: self.edges,
embeddings: self.embeddings,
temporal: self.temporal,
provenance: self.provenance,
namespace: self.namespace,
tags: self.tags,
}
}
pub fn build_with_oid(self, oid: ObjectId) -> KnowledgeObject {
KnowledgeObject {
oid,
kind: self.kind,
payload: self.payload,
edges: self.edges,
embeddings: self.embeddings,
temporal: self.temporal,
provenance: self.provenance,
namespace: self.namespace,
tags: self.tags,
}
}
}
/// Errors for `KnowledgeObject` operations.
///
/// Only the serialization, deserialization, compression and unknown-tag
/// variants are constructed in the visible part of this file; the others are
/// presumably raised by callers elsewhere (NOTE(review): confirm).
#[derive(Debug, Clone, thiserror::Error)]
pub enum KnowledgeObjectError {
/// Encoding the object failed; carries the underlying error text.
#[error("serialization error: {0}")]
SerializationError(String),
/// Decoding the object failed; carries a description of the problem.
#[error("deserialization error: {0}")]
DeserializationError(String),
/// A stored identifier did not match the computed one; both are carried as
/// strings.
#[error("OID verification failed: stored={stored}, computed={computed}")]
OidMismatch { stored: String, computed: String },
/// A named embedding space that was required is absent.
#[error("missing required embedding space: {0}")]
MissingEmbedding(String),
/// An embedding in `space` has `got` dimensions where `expected` were
/// required.
#[error("dimension mismatch in space '{space}': expected {expected}, got {got}")]
DimensionMismatch {
space: String,
expected: u32,
got: u32,
},
/// A valid-time interval whose start lies after its end.
#[error("invalid temporal coordinates: valid_from ({valid_from}) > valid_to ({valid_to})")]
InvalidTemporalRange { valid_from: u64, valid_to: u64 },
/// Compressing or decompressing failed; carries the underlying error text.
#[error("compression error: {0}")]
CompressionError(String),
/// The first byte of a compressed frame is not a known codec tag.
#[error("unknown compression tag: {0}")]
UnknownCompressionTag(u8),
}
impl From<SochValue> for KnowledgeObjectBuilder {
    /// Starts a builder for a `Document` object whose payload is `value`.
    fn from(value: SochValue) -> Self {
        let builder = Self::new(ObjectKind::Document);
        builder.payload(value)
    }
}
// Unit tests covering identifier computation, temporal queries, edges,
// embeddings, provenance, serialization and the compression frame format.
#[cfg(test)]
mod tests {
use super::*;
// The identifier must not depend on the order in which attributes are added.
#[test]
fn test_content_addressing_determinism() {
let ko1 = KnowledgeObjectBuilder::new(ObjectKind::Entity)
.attribute("name", SochValue::Text("Alice".into()))
.attribute("age", SochValue::Int(30))
.build();
let ko2 = KnowledgeObjectBuilder::new(ObjectKind::Entity)
.attribute("age", SochValue::Int(30))
.attribute("name", SochValue::Text("Alice".into()))
.build();
assert_eq!(ko1.oid(), ko2.oid());
}
// Different payloads must produce different identifiers.
#[test]
fn test_different_content_different_oid() {
let ko1 = KnowledgeObjectBuilder::new(ObjectKind::Entity)
.attribute("name", SochValue::Text("Alice".into()))
.build();
let ko2 = KnowledgeObjectBuilder::new(ObjectKind::Entity)
.attribute("name", SochValue::Text("Bob".into()))
.build();
assert_ne!(ko1.oid(), ko2.oid());
}
// A freshly built object verifies against its own identifier.
#[test]
fn test_oid_verification() {
let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
.attribute("content", SochValue::Text("Hello, world!".into()))
.build();
assert!(ko.verify_oid());
}
// valid_at / known_at / visible_at on an object valid over [100, 200)
// with system time 50.
#[test]
fn test_bitemporal_queries() {
let ko = KnowledgeObjectBuilder::new(ObjectKind::Event)
.valid_from(100)
.valid_to(200)
.system_time(50)
.build();
assert!(ko.valid_at(150));
assert!(!ko.valid_at(250));
assert!(ko.known_at(50));
assert!(ko.known_at(100));
assert!(!ko.known_at(40));
assert!(ko.visible_at(60, 150));
assert!(!ko.visible_at(40, 150));
}
// Edges are stored on the object and can be filtered by kind.
#[test]
fn test_embedded_edges() {
let target_oid = ObjectId::from_content(b"target_object");
let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
.attribute("name", SochValue::Text("Alice".into()))
.edge(Edge::new(target_oid, EdgeKind::typed("works_at"), 1.0))
.edge(Edge::new(target_oid, EdgeKind::Contains, 0.5))
.build();
assert_eq!(ko.edges().len(), 2);
assert_eq!(ko.edges_of_kind(&EdgeKind::typed("works_at")).len(), 1);
assert_eq!(ko.edges_of_kind(&EdgeKind::Contains).len(), 1);
}
// Several embedding spaces can coexist, each with its own dimension count.
#[test]
fn test_multi_space_embeddings() {
let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
.embedding("semantic", vec![0.1, 0.2, 0.3])
.embedding("code", vec![0.4, 0.5, 0.6, 0.7])
.build();
assert!(ko.embedding("semantic").is_some());
assert!(ko.embedding("code").is_some());
assert!(ko.embedding("nonexistent").is_none());
assert_eq!(ko.embedding("semantic").unwrap().dimensions, 3);
assert_eq!(ko.embedding("code").unwrap().dimensions, 4);
}
// Derived provenance keeps its parents and operation name.
#[test]
fn test_provenance_chain() {
let parent_oid = ObjectId::from_content(b"parent_document");
let ko = KnowledgeObjectBuilder::new(ObjectKind::Fact)
.attribute("claim", SochValue::Text("X is true".into()))
.provenance(Provenance::derived(
vec![parent_oid],
"extract_facts",
"gpt-4",
1700000000,
))
.build();
assert!(!ko.provenance().is_root());
assert_eq!(ko.provenance().parents.len(), 1);
assert_eq!(ko.provenance().parents[0], parent_oid);
assert_eq!(ko.provenance().operation, "extract_facts");
}
// to_bytes / from_bytes preserve identifier, kind, tags and namespace.
#[test]
fn test_serialization_roundtrip() {
let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
.attribute("name", SochValue::Text("Alice".into()))
.embedding("semantic", vec![0.1, 0.2, 0.3])
.tag("person")
.namespace("test")
.build();
let bytes = ko.to_bytes().unwrap();
let restored = KnowledgeObject::from_bytes(&bytes).unwrap();
assert_eq!(ko.oid(), restored.oid());
assert_eq!(ko.kind(), restored.kind());
assert_eq!(ko.tags(), restored.tags());
assert_eq!(ko.namespace(), restored.namespace());
}
// to_hex followed by from_hex yields the original identifier.
#[test]
fn test_object_id_hex_roundtrip() {
let oid = ObjectId::from_content(b"test content");
let hex = oid.to_hex();
let parsed = ObjectId::from_hex(&hex).unwrap();
assert_eq!(oid, parsed);
}
// NIL is nil; a content-derived identifier is not.
#[test]
fn test_nil_oid() {
assert!(ObjectId::NIL.is_nil());
let non_nil = ObjectId::from_content(b"something");
assert!(!non_nil.is_nil());
}
// edges_valid_at honours each edge's half-open validity interval.
#[test]
fn test_edge_temporal_filtering() {
let target = ObjectId::from_content(b"target");
let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
.edge(Edge::with_validity(target, EdgeKind::typed("works_at"), 1.0, 100, 200))
.edge(Edge::with_validity(target, EdgeKind::typed("manages"), 0.8, 150, u64::MAX))
.build();
let active = ko.edges_valid_at(120);
assert_eq!(active.len(), 1);
assert_eq!(active[0].kind, EdgeKind::typed("works_at"));
assert_eq!(ko.edges_valid_at(160).len(), 2);
let active = ko.edges_valid_at(250);
assert_eq!(active.len(), 1);
assert_eq!(active[0].kind, EdgeKind::typed("manages"));
}
// The size estimate must at least cover the embedding data (384 f32 values).
#[test]
fn test_estimated_size() {
let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
.embedding("semantic", vec![0.0; 384])
.tag("test")
.build();
let size = ko.estimated_size();
assert!(size > 384 * 4); }
// Display output starts with "KO(" and contains the kind label.
#[test]
fn test_display() {
let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
.attribute("name", SochValue::Text("Alice".into()))
.build();
let display = format!("{}", ko);
assert!(display.starts_with("KO("));
assert!(display.contains("kind=entity"));
}
// CompressionMode::None writes tag 0 and round-trips.
#[test]
fn test_compression_none_roundtrip() {
let ko = KnowledgeObjectBuilder::new(ObjectKind::Entity)
.attribute("name", SochValue::Text("Alice".into()))
.embedding("semantic", vec![0.1; 128])
.tag("person")
.build();
let compressed = ko.to_compressed_bytes(CompressionMode::None).unwrap();
assert_eq!(compressed[0], 0); let restored = KnowledgeObject::from_compressed_bytes(&compressed).unwrap();
assert_eq!(ko.oid(), restored.oid());
}
// LZ4 shrinks repetitive data, writes tag 1 and round-trips.
#[test]
fn test_compression_lz4_roundtrip() {
let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
.attribute("content", SochValue::Text("hello world ".repeat(100)))
.embedding("semantic", vec![0.5; 384])
.build();
let compressed = ko.to_compressed_bytes(CompressionMode::Lz4).unwrap();
let raw = ko.to_bytes().unwrap();
assert!(compressed.len() < raw.len(), "LZ4 should reduce size for repetitive data");
assert_eq!(compressed[0], 1);
let restored = KnowledgeObject::from_compressed_bytes(&compressed).unwrap();
assert_eq!(ko.oid(), restored.oid());
assert_eq!(ko.tags(), restored.tags());
}
// Zstd shrinks repetitive data, writes tag 2 and round-trips.
#[test]
fn test_compression_zstd_roundtrip() {
let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
.attribute("content", SochValue::Text("hello world ".repeat(100)))
.embedding("semantic", vec![0.5; 384])
.tag("document")
.namespace("test-ns")
.build();
let compressed = ko.to_compressed_bytes(CompressionMode::zstd()).unwrap();
let raw = ko.to_bytes().unwrap();
assert!(compressed.len() < raw.len(), "ZSTD should reduce size");
assert_eq!(compressed[0], 2);
let restored = KnowledgeObject::from_compressed_bytes(&compressed).unwrap();
assert_eq!(ko.oid(), restored.oid());
assert_eq!(ko.namespace(), restored.namespace());
}
// A tiny object must still round-trip whichever codec was requested
// (the frame may fall back to uncompressed storage).
#[test]
fn test_compression_fallback_on_tiny_object() {
let ko = KnowledgeObjectBuilder::new(ObjectKind::Fact)
.attribute("x", SochValue::Int(1))
.build();
let compressed_lz4 = ko.to_compressed_bytes(CompressionMode::Lz4).unwrap();
let compressed_zstd = ko.to_compressed_bytes(CompressionMode::zstd()).unwrap();
let r1 = KnowledgeObject::from_compressed_bytes(&compressed_lz4).unwrap();
let r2 = KnowledgeObject::from_compressed_bytes(&compressed_zstd).unwrap();
assert_eq!(ko.oid(), r1.oid());
assert_eq!(ko.oid(), r2.oid());
}
// On highly repetitive data both codecs beat 1.0 and Zstd beats LZ4.
#[test]
fn test_compression_ratio() {
let ko = KnowledgeObjectBuilder::new(ObjectKind::Document)
.attribute("data", SochValue::Text("abcdefgh".repeat(500)))
.build();
let ratio = ko.compression_ratio(CompressionMode::Lz4).unwrap();
assert!(ratio < 1.0, "LZ4 should achieve < 1.0 ratio on repetitive data");
let ratio_zstd = ko.compression_ratio(CompressionMode::zstd()).unwrap();
assert!(ratio_zstd < ratio, "ZSTD should beat LZ4 ratio at default level");
}
// tag() and from_tag() agree for every codec; unknown tags map to None.
#[test]
fn test_compression_mode_tag_roundtrip() {
for mode in [CompressionMode::None, CompressionMode::Lz4, CompressionMode::zstd()] {
let tag = mode.tag();
let recovered = CompressionMode::from_tag(tag).unwrap();
assert_eq!(mode.tag(), recovered.tag());
}
assert!(CompressionMode::from_tag(255).is_none());
}
// Frames shorter than the 5-byte header are rejected.
#[test]
fn test_compressed_bytes_too_short() {
let result = KnowledgeObject::from_compressed_bytes(&[0, 1, 2]);
assert!(result.is_err());
}
// A frame with an unrecognized tag byte is rejected.
#[test]
fn test_unknown_compression_tag() {
let bad_bytes = vec![99, 0, 0, 0, 0]; let result = KnowledgeObject::from_compressed_bytes(&bad_bytes);
assert!(result.is_err());
}
}