Skip to main content

reddb_server/storage/unified/
entity.rs

1//! Unified Entity Model
2//!
3//! Provides a single entity type that can represent table rows, graph nodes,
4//! graph edges, or vectors with seamless interoperability.
5
6use std::collections::HashMap;
7use std::fmt;
8use std::sync::Arc;
9
10use crate::storage::schema::Value;
11
/// The first entity id handed to user-inserted data.
///
/// Ids `1..FIRST_USER_ENTITY_ID` are reserved for the internal
/// collection-descriptor and config-default entities the engine seeds at boot,
/// so the first user-inserted `rid` is a stable, documented value regardless
/// of how many config defaults a build ships.
///
/// Before this floor existed the offset drifted upward by one for every config
/// default added (101 → 114 over time), silently breaking the documented
/// file-format invariant (#1369). The boot sequence bumps the allocator up to
/// this floor after seeding internals; it only ever raises the counter, so a
/// database that already holds user data is untouched. Mirrors
/// `FIRST_USER_LABEL_ID` in the graph label registry.
pub const FIRST_USER_ENTITY_ID: u64 = 1024;
24
/// Identifier of an entity in the unified storage.
///
/// A thin newtype over `u64`; it is `Copy`, ordered and hashable so it can be
/// used directly as a map key or sort key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct EntityId(pub u64);

impl EntityId {
    /// Wrap a raw `u64` as an entity id.
    pub fn new(id: u64) -> Self {
        EntityId(id)
    }

    /// Return the wrapped `u64`.
    pub fn raw(&self) -> u64 {
        let EntityId(value) = *self;
        value
    }
}

impl fmt::Display for EntityId {
    /// Renders the id as the letter `e` followed by the decimal value,
    /// e.g. `e42`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let raw = self.raw();
        write!(f, "e{}", raw)
    }
}

impl From<u64> for EntityId {
    /// Equivalent to [`EntityId::new`].
    fn from(id: u64) -> Self {
        EntityId::new(id)
    }
}
52
/// Which storage model an entity belongs to, together with the addressing
/// information for that model.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum EntityKind {
    /// Row of a structured table. Kept inline (not boxed) because table rows
    /// are the hot path.
    TableRow { table: Arc<str>, row_id: u64 },
    /// Graph node. The payload is boxed so this variant stays pointer-sized.
    GraphNode(Box<GraphNodeKind>),
    /// Graph edge (boxed payload).
    GraphEdge(Box<GraphEdgeKind>),
    /// Vector stored in the named collection.
    Vector { collection: String },
    /// Time-series data point (boxed payload).
    TimeSeriesPoint(Box<TimeSeriesPointKind>),
    /// Message at `position` in the named queue.
    QueueMessage { queue: String, position: u64 },
}

/// Addressing payload of [`EntityKind::GraphNode`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct GraphNodeKind {
    pub label: String,
    pub node_type: String,
}

/// Addressing payload of [`EntityKind::GraphEdge`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct GraphEdgeKind {
    pub label: String,
    pub from_node: String,
    pub to_node: String,
    /// Integer weight; an integer keeps the type `Eq` + `Hash`.
    pub weight: u32,
}

/// Addressing payload of [`EntityKind::TimeSeriesPoint`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TimeSeriesPointKind {
    pub series: String,
    pub metric: String,
}

impl EntityKind {
    /// Short static tag naming the storage model of this kind
    /// (`"table"`, `"graph_node"`, `"graph_edge"`, `"vector"`,
    /// `"timeseries"` or `"queue"`).
    pub fn storage_type(&self) -> &'static str {
        let tag: &'static str = match self {
            Self::QueueMessage { .. } => "queue",
            Self::TimeSeriesPoint(_) => "timeseries",
            Self::Vector { .. } => "vector",
            Self::GraphEdge(_) => "graph_edge",
            Self::GraphNode(_) => "graph_node",
            Self::TableRow { .. } => "table",
        };
        tag
    }

    /// Name of the container this entity lives in: the table, the graph
    /// label, the vector collection, the series, or the queue.
    pub fn collection(&self) -> &str {
        let name: &str = match self {
            Self::TableRow { table, .. } => table,
            Self::GraphNode(node) => node.label.as_str(),
            Self::GraphEdge(edge) => edge.label.as_str(),
            Self::Vector { collection } => collection.as_str(),
            Self::TimeSeriesPoint(point) => point.series.as_str(),
            Self::QueueMessage { queue, .. } => queue.as_str(),
        };
        name
    }
}
115
116/// The actual data content of an entity
117#[derive(Debug, Clone)]
118pub enum EntityData {
119    /// Table row data
120    Row(RowData),
121    /// Graph node data
122    Node(NodeData),
123    /// Graph edge data
124    Edge(EdgeData),
125    /// Vector data
126    Vector(VectorData),
127    /// Time-series data point
128    TimeSeries(TimeSeriesData),
129    /// Queue message data
130    QueueMessage(QueueMessageData),
131}
132
133impl EntityData {
134    /// Check if this is row data
135    pub fn is_row(&self) -> bool {
136        matches!(self, Self::Row(_))
137    }
138
139    /// Check if this is node data
140    pub fn is_node(&self) -> bool {
141        matches!(self, Self::Node(_))
142    }
143
144    /// Check if this is edge data
145    pub fn is_edge(&self) -> bool {
146        matches!(self, Self::Edge(_))
147    }
148
149    /// Check if this is vector data
150    pub fn is_vector(&self) -> bool {
151        matches!(self, Self::Vector(_))
152    }
153
154    /// Get as row data
155    pub fn as_row(&self) -> Option<&RowData> {
156        match self {
157            Self::Row(r) => Some(r),
158            _ => None,
159        }
160    }
161
162    /// Get as node data
163    pub fn as_node(&self) -> Option<&NodeData> {
164        match self {
165            Self::Node(n) => Some(n),
166            _ => None,
167        }
168    }
169
170    /// Get as edge data
171    pub fn as_edge(&self) -> Option<&EdgeData> {
172        match self {
173            Self::Edge(e) => Some(e),
174            _ => None,
175        }
176    }
177
178    /// Get as vector data
179    pub fn as_vector(&self) -> Option<&VectorData> {
180        match self {
181            Self::Vector(v) => Some(v),
182            _ => None,
183        }
184    }
185}
186
187/// Data for a table row
188#[derive(Debug, Clone)]
189pub struct RowData {
190    /// Column values in schema order
191    pub columns: Vec<Value>,
192    /// Named column access (optional, for convenience)
193    pub named: Option<HashMap<String, Value>>,
194    /// Shared column schema: column names in order (maps index → name).
195    /// When set, `columns` holds the values and `named` is None.
196    /// This saves ~60% memory vs per-row HashMap.
197    pub schema: Option<std::sync::Arc<Vec<String>>>,
198}
199
200impl RowData {
201    /// Create new row data from column values
202    pub fn new(columns: Vec<Value>) -> Self {
203        Self {
204            columns,
205            named: None,
206            schema: None,
207        }
208    }
209
210    /// Create row data with named columns
211    pub fn with_names(columns: Vec<Value>, names: Vec<String>) -> Self {
212        let named: HashMap<String, Value> =
213            names.into_iter().zip(columns.iter().cloned()).collect();
214        Self {
215            columns,
216            named: Some(named),
217            schema: None,
218        }
219    }
220
221    /// Get a named field value — checks named HashMap first, then schema+columns.
222    pub fn get_field(&self, name: &str) -> Option<&Value> {
223        // Fast path: named HashMap
224        if let Some(ref named) = self.named {
225            return named.get(name);
226        }
227        // Columnar path: use schema to find index
228        if let Some(ref schema) = self.schema {
229            if let Some(idx) = schema.iter().position(|s| s == name) {
230                return self.columns.get(idx);
231            }
232        }
233        None
234    }
235
236    /// Iterate over all (name, value) pairs — works for both named and columnar.
237    pub fn iter_fields(&self) -> Box<dyn Iterator<Item = (&str, &Value)> + '_> {
238        if let Some(ref named) = self.named {
239            Box::new(named.iter().map(|(k, v)| (k.as_str(), v)))
240        } else if let Some(ref schema) = self.schema {
241            Box::new(
242                schema
243                    .iter()
244                    .zip(self.columns.iter())
245                    .map(|(k, v)| (k.as_str(), v)),
246            )
247        } else {
248            Box::new(std::iter::empty())
249        }
250    }
251
252    /// Get column by index
253    pub fn get(&self, index: usize) -> Option<&Value> {
254        self.columns.get(index)
255    }
256
257    /// Get column by name
258    pub fn get_by_name(&self, name: &str) -> Option<&Value> {
259        self.named.as_ref()?.get(name)
260    }
261
262    /// Number of columns
263    pub fn len(&self) -> usize {
264        self.columns.len()
265    }
266
267    /// Check if empty
268    pub fn is_empty(&self) -> bool {
269        self.columns.is_empty()
270    }
271}
272
273/// Data for a graph node
274#[derive(Debug, Clone)]
275pub struct NodeData {
276    /// Node properties
277    pub properties: HashMap<String, Value>,
278}
279
280impl NodeData {
281    /// Create new node data
282    pub fn new() -> Self {
283        Self {
284            properties: HashMap::new(),
285        }
286    }
287
288    /// Create with properties
289    pub fn with_properties(properties: HashMap<String, Value>) -> Self {
290        Self { properties }
291    }
292
293    /// Set a property
294    pub fn set(&mut self, key: impl Into<String>, value: Value) {
295        self.properties.insert(key.into(), value);
296    }
297
298    /// Get a property
299    pub fn get(&self, key: &str) -> Option<&Value> {
300        self.properties.get(key)
301    }
302
303    /// Check if property exists
304    pub fn has(&self, key: &str) -> bool {
305        self.properties.contains_key(key)
306    }
307}
308
309impl Default for NodeData {
310    fn default() -> Self {
311        Self::new()
312    }
313}
314
315/// Data for a graph edge
316#[derive(Debug, Clone)]
317pub struct EdgeData {
318    /// Edge properties
319    pub properties: HashMap<String, Value>,
320    /// Edge weight (for weighted graphs)
321    pub weight: f32,
322}
323
324impl EdgeData {
325    /// Create new edge data
326    pub fn new(weight: f32) -> Self {
327        Self {
328            properties: HashMap::new(),
329            weight,
330        }
331    }
332
333    /// Create with properties
334    pub fn with_properties(weight: f32, properties: HashMap<String, Value>) -> Self {
335        Self { properties, weight }
336    }
337
338    /// Set a property
339    pub fn set(&mut self, key: impl Into<String>, value: Value) {
340        self.properties.insert(key.into(), value);
341    }
342
343    /// Get a property
344    pub fn get(&self, key: &str) -> Option<&Value> {
345        self.properties.get(key)
346    }
347}
348
349impl Default for EdgeData {
350    fn default() -> Self {
351        Self::new(1.0)
352    }
353}
354
355/// Data for a vector
356#[derive(Debug, Clone)]
357pub struct VectorData {
358    /// Dense vector (primary embedding)
359    pub dense: Vec<f32>,
360    /// Optional sparse vector
361    pub sparse: Option<SparseVector>,
362    /// Original content (if applicable)
363    pub content: Option<String>,
364}
365
366impl VectorData {
367    /// Create new vector data from dense vector
368    pub fn new(dense: Vec<f32>) -> Self {
369        Self {
370            dense,
371            sparse: None,
372            content: None,
373        }
374    }
375
376    /// Create with sparse vector
377    pub fn with_sparse(dense: Vec<f32>, sparse: SparseVector) -> Self {
378        Self {
379            dense,
380            sparse: Some(sparse),
381            content: None,
382        }
383    }
384
385    /// Set content
386    pub fn with_content(mut self, content: impl Into<String>) -> Self {
387        self.content = Some(content.into());
388        self
389    }
390
391    /// Get dimension
392    pub fn dimension(&self) -> usize {
393        self.dense.len()
394    }
395
396    /// Check if has sparse component
397    pub fn is_hybrid(&self) -> bool {
398        self.sparse.is_some()
399    }
400}
401
/// Payload of a single time-series data point: one metric value at one
/// timestamp, together with its tags.
#[derive(Debug, Clone)]
pub struct TimeSeriesData {
    /// Metric name (e.g., "cpu.idle")
    pub metric: String,
    /// Timestamp in nanoseconds since epoch
    pub timestamp_ns: u64,
    /// Metric value
    pub value: f64,
    /// Dimensional tags as string key/value pairs (e.g., {"host": "srv1"})
    pub tags: std::collections::HashMap<String, String>,
}
414
/// Payload of a queue message: the message body plus its delivery
/// bookkeeping (priority, enqueue time, attempt counters, ack flag).
#[derive(Debug, Clone)]
pub struct QueueMessageData {
    /// Message payload
    pub payload: Value,
    /// Optional priority (higher = more urgent); `None` when unset
    pub priority: Option<i32>,
    /// Enqueue timestamp (nanoseconds)
    pub enqueued_at_ns: u64,
    /// Number of delivery attempts
    pub attempts: u32,
    /// Maximum delivery attempts before the message goes to the DLQ
    /// (dead-letter queue)
    pub max_attempts: u32,
    /// Whether the message has been acknowledged
    pub acked: bool,
}
431
/// Sparse vector stored as parallel `indices` / `values` lists.
///
/// `indices[k]` is the position of the k-th stored element and `values[k]`
/// is its value. The two lists are expected to have equal length; this is
/// only checked with a debug assertion in [`SparseVector::new`], and because
/// the fields are public it cannot be relied on in release builds.
#[derive(Debug, Clone)]
pub struct SparseVector {
    /// Indices of non-zero elements
    pub indices: Vec<u32>,
    /// Values at those indices
    pub values: Vec<f32>,
    /// Total dimension (may be larger than indices.len())
    pub dimension: usize,
}

impl SparseVector {
    /// Create a sparse vector from parallel index/value lists.
    ///
    /// # Panics
    ///
    /// In debug builds, panics when `indices` and `values` differ in length.
    pub fn new(indices: Vec<u32>, values: Vec<f32>, dimension: usize) -> Self {
        debug_assert_eq!(indices.len(), values.len());
        Self {
            indices,
            values,
            dimension,
        }
    }

    /// Number of stored (non-zero) elements, i.e. `indices.len()`.
    pub fn nnz(&self) -> usize {
        self.indices.len()
    }

    /// Fraction of the dimension that is *not* stored: `1 - nnz / dimension`.
    ///
    /// A zero-dimension vector is reported as fully sparse (`1.0`) to avoid
    /// dividing by zero.
    pub fn sparsity(&self) -> f32 {
        if self.dimension == 0 {
            return 1.0;
        }
        1.0 - (self.nnz() as f32 / self.dimension as f32)
    }

    /// Value stored at `index`, or `0.0` when the index is not present.
    ///
    /// Performs a linear scan and returns the value paired with the first
    /// matching entry of `indices`. Indices and values are walked in
    /// lock-step, so an index with no corresponding value (mismatched list
    /// lengths) reads as `0.0` instead of panicking on an out-of-bounds
    /// access.
    pub fn get(&self, index: u32) -> f32 {
        self.indices
            .iter()
            .zip(self.values.iter())
            .find(|pair| *pair.0 == index)
            .map_or(0.0, |(_, value)| *value)
    }
}
477
/// A named embedding attached to an entity, so one entity can carry several
/// vectors for different aspects (content, summary, title, ...).
#[derive(Debug, Clone)]
pub struct EmbeddingSlot {
    /// Slot name (e.g., "content", "summary", "title", "code")
    pub name: String,
    /// The embedding vector
    pub vector: Vec<f32>,
    /// Model used to generate embedding
    pub model: String,
    /// Vector dimension; `EmbeddingSlot::new` sets it to `vector.len()`
    pub dimension: usize,
    /// Generation timestamp; `EmbeddingSlot::new` fills it with the current
    /// Unix time in seconds
    pub generated_at: u64,
}
492
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Returns `0` if the system clock reports a time before the epoch.
fn current_unix_secs() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
499
500impl EmbeddingSlot {
501    /// Create a new embedding slot
502    pub fn new(name: impl Into<String>, vector: Vec<f32>, model: impl Into<String>) -> Self {
503        let dimension = vector.len();
504        Self {
505            name: name.into(),
506            vector,
507            model: model.into(),
508            dimension,
509            generated_at: current_unix_secs(),
510        }
511    }
512}
513
/// A unified entity that can represent any storage type.
///
/// `kind` says which storage model the entity belongs to and `data` carries
/// the matching payload. `logical_id` and `aux` are private and are reached
/// through accessor methods on the type.
#[derive(Debug, Clone)]
pub struct UnifiedEntity {
    /// Unique entity identifier
    pub id: EntityId,
    /// Stable user-visible identity shared by all physical versions.
    ///
    /// `None` is the legacy encoding and resolves to `id`.
    logical_id: Option<EntityId>,
    /// What kind of entity this is
    pub kind: EntityKind,
    /// Creation timestamp (Unix seconds when set by `UnifiedEntity::new`)
    pub created_at: u64,
    /// Last update timestamp (Unix seconds when set by `UnifiedEntity::new`)
    pub updated_at: u64,
    /// The actual data content
    pub data: EntityData,
    /// Sequence ID for ordering/versioning; `UnifiedEntity::new` starts it at 0
    pub sequence_id: u64,
    /// Field-name bloom filter (u64, zero-allocation).
    ///
    /// Each bit encodes one possible mid-character value: for field name `n`
    /// the bit position is `n.as_bytes()[n.len()/2] & 63`. OR of all user
    /// field names present in this entity. Cleared for schema-based bulk rows
    /// (all rows share the same schema so bloom is segment-level).
    ///
    /// The compiled filter computes `required_bloom` from predicate field names
    /// at compile time. If `entity.field_bloom & required_bloom != required_bloom`,
    /// the entity cannot match and is skipped before any HashMap probe.
    pub field_bloom: u64,
    /// MVCC creation transaction ID (Phase 2.3 PG parity).
    ///
    /// `0` means "pre-MVCC" / auto-commit — visible to every snapshot. When
    /// a BEGIN-wrapped INSERT runs, it stamps `xmin` with the transaction's
    /// snapshot id so other concurrent transactions only see the row after
    /// the writer commits (snapshot isolation semantics).
    ///
    /// Visibility rule: `xmin <= snapshot.xid && (xmax == 0 || xmax > snapshot.xid)`.
    pub xmin: u64,
    /// MVCC deletion transaction ID (Phase 2.3 PG parity).
    ///
    /// `0` means "live". Set to the deleting transaction's snapshot id on
    /// DELETE/UPDATE (row is kept until VACUUM reclaims it). Snapshots with
    /// `xid < xmax` still see the row; newer snapshots skip it.
    pub xmax: u64,
    /// Optional auxiliary data (embeddings, cross-refs).
    /// None for most table rows — saves 40 bytes/entity.
    aux: Option<Box<EntityAux>>,
}
563
/// Auxiliary entity data — only allocated when needed.
///
/// Held behind `Option<Box<_>>` on `UnifiedEntity`; the `Default` value has
/// both lists empty.
#[derive(Debug, Clone, Default)]
pub struct EntityAux {
    /// Embedding slots (for multi-vector support)
    pub embeddings: Vec<EmbeddingSlot>,
    /// Cross-references to other entities
    pub cross_refs: Vec<CrossRef>,
}
572
573impl UnifiedEntity {
574    /// Access embeddings (returns empty slice if no aux data).
575    pub fn embeddings(&self) -> &[EmbeddingSlot] {
576        self.aux
577            .as_ref()
578            .map(|a| a.embeddings.as_slice())
579            .unwrap_or(&[])
580    }
581
582    /// Access cross-references (returns empty slice if no aux data).
583    pub fn cross_refs(&self) -> &[CrossRef] {
584        self.aux
585            .as_ref()
586            .map(|a| a.cross_refs.as_slice())
587            .unwrap_or(&[])
588    }
589
590    /// Get mutable embeddings (allocates aux if needed).
591    pub fn embeddings_mut(&mut self) -> &mut Vec<EmbeddingSlot> {
592        &mut self.aux.get_or_insert_with(Default::default).embeddings
593    }
594
595    /// Get mutable cross-refs (allocates aux if needed).
596    pub fn cross_refs_mut(&mut self) -> &mut Vec<CrossRef> {
597        &mut self.aux.get_or_insert_with(Default::default).cross_refs
598    }
599
600    /// Check if entity has any auxiliary data.
601    pub fn has_aux(&self) -> bool {
602        self.aux.is_some()
603    }
604}
605
/// Map a field name to a single bit of a 64-bit field-name bloom filter.
///
/// The bit position is the byte at the middle of the name
/// (`name.as_bytes()[name.len() / 2]`) masked to its low six bits, giving a
/// position in `0..=63`. The empty name maps to `0` (no bit set). No
/// allocation is performed.
///
/// The original notes attribute this mid-character trick to MongoDB's
/// `FieldNameBloomFilter.h`.
#[inline]
pub fn field_name_bloom(name: &str) -> u64 {
    let bytes = name.as_bytes();
    match bytes.get(bytes.len() / 2) {
        Some(&mid) => 1u64 << (mid & 63),
        // Only reachable for the empty name.
        None => 0,
    }
}
619
620/// Compute the combined field-name bloom for all user-level fields in `data`.
621/// Returns 0 for schema-based rows (all rows share the same schema, so the
622/// per-entity bloom would be identical — caller uses a segment-level bloom).
623pub fn compute_entity_field_bloom(data: &EntityData) -> u64 {
624    match data {
625        EntityData::Row(row) => {
626            if row.schema.is_some() {
627                // Schema path: bloom is identical for every row in this table.
628                // Don't store per-entity — use segment-level bloom instead.
629                return 0;
630            }
631            if let Some(named) = &row.named {
632                named.keys().fold(0u64, |acc, k| acc | field_name_bloom(k))
633            } else {
634                0
635            }
636        }
637        EntityData::Node(node) => node
638            .properties
639            .keys()
640            .fold(0u64, |acc, k| acc | field_name_bloom(k)),
641        EntityData::Edge(edge) => edge
642            .properties
643            .keys()
644            .fold(0u64, |acc, k| acc | field_name_bloom(k)),
645        // Vectors, time-series, queue: no user-named fields worth blooming.
646        _ => 0,
647    }
648}
649
650impl UnifiedEntity {
651    /// Create a new unified entity
652    pub fn new(id: EntityId, kind: EntityKind, data: EntityData) -> Self {
653        let now = current_unix_secs();
654        let field_bloom = compute_entity_field_bloom(&data);
655
656        Self {
657            id,
658            logical_id: None,
659            kind,
660            created_at: now,
661            updated_at: now,
662            data,
663            sequence_id: 0,
664            field_bloom,
665            // Pre-MVCC default: xmin/xmax = 0 means visible to every snapshot.
666            // Transactional writers stamp real snapshot IDs after allocation.
667            xmin: 0,
668            xmax: 0,
669            aux: None,
670        }
671    }
672
673    /// MVCC visibility check (Phase 2.3 PG parity).
674    ///
675    /// Returns `true` when this tuple is visible under the provided
676    /// snapshot xid. Pre-MVCC rows (`xmin == 0`, `xmax == 0`) are visible
677    /// to every snapshot — preserves full compatibility with existing
678    /// data inserted before the MVCC headers existed.
679    ///
680    /// Snapshot isolation rule:
681    ///   - `xmin == 0 || xmin <= snapshot_xid`  (creator committed before snapshot)
682    ///   - `xmax == 0 || xmax > snapshot_xid`   (deleter committed after snapshot)
683    #[inline]
684    pub fn is_visible(&self, snapshot_xid: u64) -> bool {
685        if self.xmin != 0 && self.xmin > snapshot_xid {
686            return false;
687        }
688        if self.xmax != 0 && self.xmax <= snapshot_xid {
689            return false;
690        }
691        true
692    }
693
694    /// Stamp `xmin` (creation transaction ID). Called by the runtime on
695    /// INSERT inside an active transaction.
696    #[inline]
697    pub fn set_xmin(&mut self, xid: u64) {
698        self.xmin = xid;
699    }
700
701    /// Stamp `xmax` (deletion transaction ID). Called by the runtime on
702    /// DELETE/UPDATE inside an active transaction — the tuple survives
703    /// until VACUUM reclaims it.
704    #[inline]
705    pub fn set_xmax(&mut self, xid: u64) {
706        self.xmax = xid;
707    }
708
709    /// Stable user-visible identity. Legacy rows without an explicit
710    /// logical id map to their physical entity id.
711    #[inline]
712    pub fn logical_id(&self) -> EntityId {
713        self.logical_id.unwrap_or(self.id)
714    }
715
716    /// Returns true when the entity carries an explicit logical id on disk.
717    #[inline]
718    pub fn has_explicit_logical_id(&self) -> bool {
719        self.logical_id.is_some()
720    }
721
722    /// Set the stable user-visible identity for this physical version.
723    #[inline]
724    pub fn set_logical_id(&mut self, logical_id: EntityId) {
725        self.logical_id = Some(logical_id);
726    }
727
728    /// Ensure entities written by the current engine carry explicit
729    /// logical identity. Table rows + documents (Phase 1/2) and graph
730    /// nodes/edges (Phase 3) participate in the multi-model MVCC
731    /// versioning rollout and so need a stable logical id for
732    /// version-chain selection. Stamping the logical id is inert for
733    /// non-versioned collections: history only accrues through
734    /// `install_versioned_table_row_update`, which is gated on the
735    /// collection `versioned` flag, so a stamped-but-never-superseded
736    /// entity keeps `logical_id == id` and behaves exactly as before.
737    /// Vectors are intentionally excluded pending their read-path follow
738    /// up (no snapshot-honoring `VECTOR SEARCH`).
739    #[inline]
740    pub(crate) fn ensure_table_logical_id(&mut self) {
741        if self.logical_id.is_none()
742            && matches!(
743                self.kind,
744                EntityKind::TableRow { .. } | EntityKind::GraphNode(_) | EntityKind::GraphEdge(_)
745            )
746        {
747            self.logical_id = Some(self.id);
748        }
749    }
750
751    /// Create a table row entity
752    pub fn table_row(
753        id: EntityId,
754        table: impl Into<Arc<str>>,
755        row_id: u64,
756        columns: Vec<Value>,
757    ) -> Self {
758        Self::new(
759            id,
760            EntityKind::TableRow {
761                table: table.into(),
762                row_id,
763            },
764            EntityData::Row(RowData::new(columns)),
765        )
766    }
767
768    /// Create a graph node entity
769    pub fn graph_node(
770        id: EntityId,
771        label: impl Into<String>,
772        node_type: impl Into<String>,
773        properties: HashMap<String, Value>,
774    ) -> Self {
775        Self::new(
776            id,
777            EntityKind::GraphNode(Box::new(GraphNodeKind {
778                label: label.into(),
779                node_type: node_type.into(),
780            })),
781            EntityData::Node(NodeData::with_properties(properties)),
782        )
783    }
784
785    /// Create a graph edge entity
786    pub fn graph_edge(
787        id: EntityId,
788        label: impl Into<String>,
789        from: impl Into<String>,
790        to: impl Into<String>,
791        weight: f32,
792        properties: HashMap<String, Value>,
793    ) -> Self {
794        Self::new(
795            id,
796            EntityKind::GraphEdge(Box::new(GraphEdgeKind {
797                label: label.into(),
798                from_node: from.into(),
799                to_node: to.into(),
800                weight: (weight * 1000.0) as u32,
801            })),
802            EntityData::Edge(EdgeData::with_properties(weight, properties)),
803        )
804    }
805
806    /// Create a vector entity
807    pub fn vector(id: EntityId, collection: impl Into<String>, vector: Vec<f32>) -> Self {
808        Self::new(
809            id,
810            EntityKind::Vector {
811                collection: collection.into(),
812            },
813            EntityData::Vector(VectorData::new(vector)),
814        )
815    }
816
817    /// Add an embedding to this entity
818    pub fn add_embedding(&mut self, slot: EmbeddingSlot) {
819        self.embeddings_mut().push(slot);
820        self.touch();
821    }
822
823    /// Add a cross-reference
824    pub fn add_cross_ref(&mut self, cross_ref: CrossRef) {
825        self.cross_refs_mut().push(cross_ref);
826        self.touch();
827    }
828
829    /// Get embedding by slot name
830    pub fn get_embedding(&self, name: &str) -> Option<&EmbeddingSlot> {
831        self.embeddings().iter().find(|e| e.name == name)
832    }
833
834    /// Update timestamp
835    fn touch(&mut self) {
836        self.updated_at = current_unix_secs();
837    }
838
839    /// Check if entity is stale (not updated in given seconds)
840    pub fn is_stale(&self, max_age_secs: u64) -> bool {
841        let now = current_unix_secs();
842        now.saturating_sub(self.updated_at) > max_age_secs
843    }
844}
845
846/// A cross-reference between entities
847#[derive(Debug, Clone, PartialEq)]
848pub struct CrossRef {
849    /// Source entity ID (the entity that holds this reference)
850    pub source: EntityId,
851    /// Target entity ID
852    pub target: EntityId,
853    /// Target collection name
854    pub target_collection: String,
855    /// Type of reference
856    pub ref_type: RefType,
857    /// Reference weight/strength (0.0-1.0)
858    pub weight: f32,
859    /// When this reference was created
860    pub created_at: u64,
861}
862
863impl CrossRef {
864    /// Create a new cross-reference
865    pub fn new(
866        source: EntityId,
867        target: EntityId,
868        target_collection: impl Into<String>,
869        ref_type: RefType,
870    ) -> Self {
871        Self {
872            source,
873            target,
874            target_collection: target_collection.into(),
875            ref_type,
876            weight: 1.0,
877            created_at: current_unix_secs(),
878        }
879    }
880
881    /// Create with weight
882    pub fn with_weight(
883        source: EntityId,
884        target: EntityId,
885        target_collection: impl Into<String>,
886        ref_type: RefType,
887        weight: f32,
888    ) -> Self {
889        let mut cr = Self::new(source, target, target_collection, ref_type);
890        cr.weight = weight;
891        cr
892    }
893}
894
/// Types of cross-references between entities.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RefType {
    // Table ↔ Graph
    /// Table row represents a graph node
    RowToNode,
    /// Table row represents a graph edge
    RowToEdge,
    /// Node links back to source row
    NodeToRow,

    // Table ↔ Vector
    /// Table row has embeddings
    RowToVector,
    /// Vector search → source row
    VectorToRow,

    // Graph ↔ Vector
    /// Node has embeddings
    NodeToVector,
    /// Edge has embeddings
    EdgeToVector,
    /// Vector search → source node
    VectorToNode,

    // Semantic links (discovered)
    /// Discovered by vector similarity
    SimilarTo,
    /// Domain-specific relationship
    RelatedTo,
    /// Data lineage
    DerivesFrom,
    /// Text mentions another entity
    Mentions,
    /// Structural containment
    Contains,
    /// Dependency relationship
    DependsOn,
}

impl RefType {
    /// Variants indexed by their serialized byte; must stay in step with
    /// [`RefType::to_byte`].
    const BY_WIRE_BYTE: [RefType; 14] = [
        RefType::RowToNode,
        RefType::RowToEdge,
        RefType::NodeToRow,
        RefType::RowToVector,
        RefType::VectorToRow,
        RefType::NodeToVector,
        RefType::EdgeToVector,
        RefType::VectorToNode,
        RefType::SimilarTo,
        RefType::RelatedTo,
        RefType::DerivesFrom,
        RefType::Mentions,
        RefType::Contains,
        RefType::DependsOn,
    ];

    /// Reference type to record on the target for bidirectional tracking.
    ///
    /// Paired types map to each other, symmetric types map to themselves,
    /// and one-directional types return `None`.
    pub fn inverse(&self) -> Option<Self> {
        match *self {
            // Paired directions.
            Self::RowToNode => Some(Self::NodeToRow),
            Self::NodeToRow => Some(Self::RowToNode),
            Self::RowToVector => Some(Self::VectorToRow),
            Self::VectorToRow => Some(Self::RowToVector),
            Self::NodeToVector => Some(Self::VectorToNode),
            Self::VectorToNode => Some(Self::NodeToVector),
            // Symmetric: each is its own inverse.
            Self::SimilarTo => Some(Self::SimilarTo),
            Self::RelatedTo => Some(Self::RelatedTo),
            // One-directional references.
            Self::RowToEdge
            | Self::EdgeToVector
            | Self::DerivesFrom
            | Self::Mentions
            | Self::Contains
            | Self::DependsOn => None,
        }
    }

    /// `true` for the reference types that are their own inverse
    /// (`SimilarTo` and `RelatedTo`).
    pub fn is_symmetric(&self) -> bool {
        *self == Self::SimilarTo || *self == Self::RelatedTo
    }

    /// Byte used for this variant in binary serialization (`0..=13`).
    pub fn to_byte(&self) -> u8 {
        match *self {
            Self::RowToNode => 0,
            Self::RowToEdge => 1,
            Self::NodeToRow => 2,
            Self::RowToVector => 3,
            Self::VectorToRow => 4,
            Self::NodeToVector => 5,
            Self::EdgeToVector => 6,
            Self::VectorToNode => 7,
            Self::SimilarTo => 8,
            Self::RelatedTo => 9,
            Self::DerivesFrom => 10,
            Self::Mentions => 11,
            Self::Contains => 12,
            Self::DependsOn => 13,
        }
    }

    /// Decode a variant from its serialized byte.
    ///
    /// Bytes outside `0..=13` do not fail; they decode to
    /// [`RefType::RelatedTo`] as a fallback.
    pub fn from_byte(byte: u8) -> Self {
        Self::BY_WIRE_BYTE
            .get(usize::from(byte))
            .copied()
            .unwrap_or(Self::RelatedTo)
    }
}
983
984/// Convert Vec<Value> to RowData
985impl From<Vec<Value>> for RowData {
986    fn from(columns: Vec<Value>) -> Self {
987        RowData::new(columns)
988    }
989}
990
991/// Convert HashMap to NodeData
992impl From<HashMap<String, Value>> for NodeData {
993    fn from(properties: HashMap<String, Value>) -> Self {
994        NodeData::with_properties(properties)
995    }
996}
997
998/// Convert dense vector to VectorData
999impl From<Vec<f32>> for VectorData {
1000    fn from(dense: Vec<f32>) -> Self {
1001        VectorData::new(dense)
1002    }
1003}
1004
1005/// Convert tuple (dense, sparse) to VectorData
1006impl From<(Vec<f32>, SparseVector)> for VectorData {
1007    fn from((dense, sparse): (Vec<f32>, SparseVector)) -> Self {
1008        VectorData::with_sparse(dense, sparse)
1009    }
1010}
1011
1012// Helper trait for uniform entity creation
1013impl UnifiedEntity {
1014    /// Create a graph node entity from properties map
1015    pub fn from_properties(
1016        id: EntityId,
1017        label: impl Into<String>,
1018        node_type: impl Into<String>,
1019        properties: impl IntoIterator<Item = (impl Into<String>, Value)>,
1020    ) -> Self {
1021        let props: HashMap<String, Value> =
1022            properties.into_iter().map(|(k, v)| (k.into(), v)).collect();
1023        Self::graph_node(id, label, node_type, props)
1024    }
1025
1026    /// Convert entity to row data if applicable
1027    pub fn into_row(self) -> Option<RowData> {
1028        match self.data {
1029            EntityData::Row(r) => Some(r),
1030            _ => None,
1031        }
1032    }
1033
1034    /// Convert entity to node data if applicable
1035    pub fn into_node(self) -> Option<NodeData> {
1036        match self.data {
1037            EntityData::Node(n) => Some(n),
1038            _ => None,
1039        }
1040    }
1041
1042    /// Convert entity to edge data if applicable
1043    pub fn into_edge(self) -> Option<EdgeData> {
1044        match self.data {
1045            EntityData::Edge(e) => Some(e),
1046            _ => None,
1047        }
1048    }
1049
1050    /// Convert entity to vector data if applicable
1051    pub fn into_vector(self) -> Option<VectorData> {
1052        match self.data {
1053            EntityData::Vector(v) => Some(v),
1054            _ => None,
1055        }
1056    }
1057}
1058
#[cfg(test)]
mod tests {
    use super::*;

    /// A table-row entity holds row data and reports the "table" storage type
    /// and the collection it was created with.
    #[test]
    fn test_entity_creation() {
        let columns = vec![Value::text("alice".to_string()), Value::Integer(25)];
        let entity = UnifiedEntity::table_row(EntityId::new(1), "users", 100, columns);

        assert!(entity.data.is_row());
        assert_eq!(entity.kind.storage_type(), "table");
        assert_eq!(entity.kind.collection(), "users");
    }

    /// A cross-reference keeps its endpoints, and `RowToNode` inverts to
    /// `NodeToRow`.
    #[test]
    fn test_cross_refs() {
        let source = EntityId::new(1);
        let target = EntityId::new(2);

        let link = CrossRef::new(source, target, "nodes", RefType::RowToNode);

        assert_eq!(link.source, source);
        assert_eq!(link.target, target);
        assert_eq!(link.ref_type.inverse(), Some(RefType::NodeToRow));
    }

    /// Stored indices return their value, absent indices return 0.0, and a
    /// vector with 3 entries out of 100 is more than 90% sparse.
    #[test]
    fn test_sparse_vector() {
        let vector = SparseVector::new(vec![0, 5, 10], vec![1.0, 2.0, 3.0], 100);

        assert_eq!(vector.nnz(), 3);
        assert_eq!(vector.get(5), 2.0);
        assert_eq!(vector.get(3), 0.0);
        assert!(vector.sparsity() > 0.9);
    }

    /// An added embedding slot can be found by name; an unknown name yields
    /// nothing.
    #[test]
    fn test_embedding_slots() {
        let mut document = UnifiedEntity::table_row(
            EntityId::new(1),
            "documents",
            1,
            vec![Value::text("Hello world".to_string())],
        );

        let slot = EmbeddingSlot::new("content", vec![0.1, 0.2, 0.3], "text-embedding-3-small");
        document.add_embedding(slot);

        assert_eq!(document.embeddings().len(), 1);
        assert!(document.get_embedding("content").is_some());
        assert!(document.get_embedding("summary").is_none());
    }
}