Skip to main content

sochdb_core/
memory_schema.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! Canonical Memory Schema for LLM Agents
19//!
20//! This module defines the **core semantic schema** for SochDB as an LLM memory store:
21//!
22//! - `episodes` - Task/conversation runs with summaries and embeddings
23//! - `events` - Steps within episodes (tool calls, messages, etc.)
24//! - `entities` - Users, projects, documents, services, etc.
25//!
26//! ## Design Rationale
27//!
28//! Without a canonical core, LLMs must reason over O(T) unrelated table schemas.
29//! With this fixed `(episodes, events, entities)` core, effective complexity becomes
30//! O(1) + domain-specific tables.
31//!
32//! ## Retrieval Pattern
33//!
34//! 1. Vector search over episode summaries: O(log E) via HNSW/Vamana
35//! 2. Range scan over events by (episode_id, seq): O(V) where V = events per episode
36//!
37//! ## Example Queries
38//!
39//! ```text
40//! -- Find similar past tasks
41//! SEARCH episodes BY SIMILARITY($query) TOP 5
42//!
43//! -- Get timeline for an episode
44//! SELECT * FROM events WHERE episode_id = $id ORDER BY seq
45//!
46//! -- Find entities by kind
47//! SEARCH entities WHERE kind = 'user' BY SIMILARITY($query) TOP 10
48//! ```
49
50use serde::{Deserialize, Serialize};
51use std::collections::HashMap;
52
53use crate::soch::{SochSchema, SochType, SochValue};
54
55// ============================================================================
56// Episode - A Task/Conversation Run
57// ============================================================================
58
/// Episode types for categorization.
///
/// Serialized through serde with `snake_case` variant names
/// (for example `AgentInteraction` becomes `agent_interaction`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EpisodeType {
    /// Interactive conversation session
    Conversation,
    /// Autonomous task execution
    Task,
    /// Background workflow
    Workflow,
    /// Debug/testing session
    Debug,
    /// Agent-to-agent interaction
    AgentInteraction,
    /// Other/custom type
    Other,
}
76
77impl EpisodeType {
78    pub fn as_str(&self) -> &'static str {
79        match self {
80            EpisodeType::Conversation => "conversation",
81            EpisodeType::Task => "task",
82            EpisodeType::Workflow => "workflow",
83            EpisodeType::Debug => "debug",
84            EpisodeType::AgentInteraction => "agent_interaction",
85            EpisodeType::Other => "other",
86        }
87    }
88
89    #[allow(clippy::should_implement_trait)]
90    pub fn from_str(s: &str) -> Self {
91        match s.to_lowercase().as_str() {
92            "conversation" => EpisodeType::Conversation,
93            "task" => EpisodeType::Task,
94            "workflow" => EpisodeType::Workflow,
95            "debug" => EpisodeType::Debug,
96            "agent_interaction" => EpisodeType::AgentInteraction,
97            _ => EpisodeType::Other,
98        }
99    }
100}
101
/// An episode represents a bounded task or conversation run.
///
/// The field layout mirrors the `episodes` table described by `Episode::schema`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Episode {
    /// Unique episode identifier
    pub episode_id: String,
    /// Episode type
    pub episode_type: EpisodeType,
    /// Related entity IDs (users, projects, etc.)
    pub entity_ids: Vec<String>,
    /// Start timestamp (microseconds since epoch)
    pub ts_start: u64,
    /// End timestamp (0 if ongoing)
    pub ts_end: u64,
    /// Natural language summary of the episode
    pub summary: String,
    /// Tags for categorization
    pub tags: Vec<String>,
    /// Embedding vector for semantic search (optional)
    pub embedding: Option<Vec<f32>>,
    /// Custom metadata
    pub metadata: HashMap<String, SochValue>,
}
124
125impl Episode {
126    /// Create a new episode
127    pub fn new(episode_id: impl Into<String>, episode_type: EpisodeType) -> Self {
128        Self {
129            episode_id: episode_id.into(),
130            episode_type,
131            entity_ids: Vec::new(),
132            ts_start: Self::now_us(),
133            ts_end: 0,
134            summary: String::new(),
135            tags: Vec::new(),
136            embedding: None,
137            metadata: HashMap::new(),
138        }
139    }
140
141    /// Get schema for episodes table
142    pub fn schema() -> SochSchema {
143        SochSchema::new("episodes")
144            .field("episode_id", SochType::Text)
145            .field("episode_type", SochType::Text)
146            .field("entity_ids", SochType::Array(Box::new(SochType::Text)))
147            .field("ts_start", SochType::UInt)
148            .field("ts_end", SochType::UInt)
149            .field("summary", SochType::Text)
150            .field("tags", SochType::Array(Box::new(SochType::Text)))
151            .field(
152                "embedding",
153                SochType::Optional(Box::new(SochType::Array(Box::new(SochType::Float)))),
154            )
155            .field("metadata", SochType::Object(vec![]))
156            .primary_key("episode_id")
157            .index("idx_episodes_type", vec!["episode_type".into()], false)
158            .index("idx_episodes_ts", vec!["ts_start".into()], false)
159    }
160
161    fn now_us() -> u64 {
162        std::time::SystemTime::now()
163            .duration_since(std::time::UNIX_EPOCH)
164            .unwrap()
165            .as_micros() as u64
166    }
167}
168
169// ============================================================================
170// Event - A Step Within an Episode
171// ============================================================================
172
/// Event roles (who/what triggered the event).
///
/// Serialized through serde with `snake_case` variant names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EventRole {
    /// User message/action
    User,
    /// Assistant/LLM response
    Assistant,
    /// System event
    System,
    /// Tool invocation
    Tool,
    /// External service
    External,
}
188
189impl EventRole {
190    pub fn as_str(&self) -> &'static str {
191        match self {
192            EventRole::User => "user",
193            EventRole::Assistant => "assistant",
194            EventRole::System => "system",
195            EventRole::Tool => "tool",
196            EventRole::External => "external",
197        }
198    }
199
200    #[allow(clippy::should_implement_trait)]
201    pub fn from_str(s: &str) -> Self {
202        match s.to_lowercase().as_str() {
203            "user" => EventRole::User,
204            "assistant" => EventRole::Assistant,
205            "system" => EventRole::System,
206            "tool" => EventRole::Tool,
207            "external" => EventRole::External,
208            _ => EventRole::System,
209        }
210    }
211}
212
/// An event represents a single step within an episode.
///
/// Events are addressed by `(episode_id, seq)`; the nested [`EventMetrics`]
/// fields appear as flat columns in the table described by `Event::schema`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Event {
    /// Parent episode ID
    pub episode_id: String,
    /// Sequence number within episode (monotonically increasing)
    pub seq: u64,
    /// Timestamp (microseconds since epoch)
    pub ts: u64,
    /// Who/what triggered this event
    pub role: EventRole,
    /// Tool name (if role == Tool)
    pub tool_name: Option<String>,
    /// Input data in TOON format
    pub input_toon: String,
    /// Output data in TOON format
    pub output_toon: String,
    /// Error message if event failed
    pub error: Option<String>,
    /// Performance metrics
    pub metrics: EventMetrics,
}
235
/// Performance metrics for an event.
///
/// The derived `Default` yields a zero duration and `None` for every optional metric.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct EventMetrics {
    /// Duration in microseconds
    pub duration_us: u64,
    /// Input tokens (if applicable)
    pub input_tokens: Option<u32>,
    /// Output tokens (if applicable)
    pub output_tokens: Option<u32>,
    /// Cost in microdollars (if applicable)
    pub cost_micros: Option<u64>,
}
248
249impl Event {
250    /// Create a new event
251    pub fn new(episode_id: impl Into<String>, seq: u64, role: EventRole) -> Self {
252        Self {
253            episode_id: episode_id.into(),
254            seq,
255            ts: Self::now_us(),
256            role,
257            tool_name: None,
258            input_toon: String::new(),
259            output_toon: String::new(),
260            error: None,
261            metrics: EventMetrics::default(),
262        }
263    }
264
265    /// Get schema for events table
266    pub fn schema() -> SochSchema {
267        SochSchema::new("events")
268            .field("episode_id", SochType::Text)
269            .field("seq", SochType::UInt)
270            .field("ts", SochType::UInt)
271            .field("role", SochType::Text)
272            .field("tool_name", SochType::Optional(Box::new(SochType::Text)))
273            .field("input_toon", SochType::Text)
274            .field("output_toon", SochType::Text)
275            .field("error", SochType::Optional(Box::new(SochType::Text)))
276            .field("duration_us", SochType::UInt)
277            .field("input_tokens", SochType::Optional(Box::new(SochType::UInt)))
278            .field(
279                "output_tokens",
280                SochType::Optional(Box::new(SochType::UInt)),
281            )
282            .field("cost_micros", SochType::Optional(Box::new(SochType::UInt)))
283            // Composite primary key: (episode_id, seq)
284            .primary_key("episode_id")
285            .index(
286                "idx_events_episode_seq",
287                vec!["episode_id".into(), "seq".into()],
288                true,
289            )
290            .index("idx_events_ts", vec!["ts".into()], false)
291            .index("idx_events_role", vec!["role".into()], false)
292    }
293
294    fn now_us() -> u64 {
295        std::time::SystemTime::now()
296            .duration_since(std::time::UNIX_EPOCH)
297            .unwrap()
298            .as_micros() as u64
299    }
300}
301
302// ============================================================================
303// Entity - Users, Projects, Documents, Services
304// ============================================================================
305
/// Entity kinds for categorization.
///
/// Serialized through serde with `snake_case` variant names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EntityKind {
    /// Human user
    User,
    /// Project or repository
    Project,
    /// Document or file
    Document,
    /// External service
    Service,
    /// Agent/bot identity
    Agent,
    /// Organization
    Organization,
    /// Custom entity type
    Custom,
}
325
326impl EntityKind {
327    pub fn as_str(&self) -> &'static str {
328        match self {
329            EntityKind::User => "user",
330            EntityKind::Project => "project",
331            EntityKind::Document => "document",
332            EntityKind::Service => "service",
333            EntityKind::Agent => "agent",
334            EntityKind::Organization => "organization",
335            EntityKind::Custom => "custom",
336        }
337    }
338
339    #[allow(clippy::should_implement_trait)]
340    pub fn from_str(s: &str) -> Self {
341        match s.to_lowercase().as_str() {
342            "user" => EntityKind::User,
343            "project" => EntityKind::Project,
344            "document" => EntityKind::Document,
345            "service" => EntityKind::Service,
346            "agent" => EntityKind::Agent,
347            "organization" => EntityKind::Organization,
348            _ => EntityKind::Custom,
349        }
350    }
351}
352
/// An entity represents a user, project, document, service, etc.
///
/// The field layout mirrors the `entities` table described by `Entity::schema`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Entity {
    /// Unique entity identifier
    pub entity_id: String,
    /// Entity kind/type
    pub kind: EntityKind,
    /// Human-readable name
    pub name: String,
    /// Typed attributes
    pub attributes: HashMap<String, SochValue>,
    /// Embedding vector for semantic search
    pub embedding: Option<Vec<f32>>,
    /// Custom metadata
    pub metadata: HashMap<String, SochValue>,
    /// Created timestamp (`Entity::new` fills this with microseconds since epoch)
    pub created_at: u64,
    /// Last updated timestamp (`Entity::new` initializes it equal to `created_at`)
    pub updated_at: u64,
}
373
374impl Entity {
375    /// Create a new entity
376    pub fn new(entity_id: impl Into<String>, kind: EntityKind, name: impl Into<String>) -> Self {
377        let now = Self::now_us();
378        Self {
379            entity_id: entity_id.into(),
380            kind,
381            name: name.into(),
382            attributes: HashMap::new(),
383            embedding: None,
384            metadata: HashMap::new(),
385            created_at: now,
386            updated_at: now,
387        }
388    }
389
390    /// Get schema for entities table
391    pub fn schema() -> SochSchema {
392        SochSchema::new("entities")
393            .field("entity_id", SochType::Text)
394            .field("kind", SochType::Text)
395            .field("name", SochType::Text)
396            .field("attributes", SochType::Object(vec![]))
397            .field(
398                "embedding",
399                SochType::Optional(Box::new(SochType::Array(Box::new(SochType::Float)))),
400            )
401            .field("metadata", SochType::Object(vec![]))
402            .field("created_at", SochType::UInt)
403            .field("updated_at", SochType::UInt)
404            .primary_key("entity_id")
405            .index("idx_entities_kind", vec!["kind".into()], false)
406            .index("idx_entities_name", vec!["name".into()], false)
407    }
408
409    fn now_us() -> u64 {
410        std::time::SystemTime::now()
411            .duration_since(std::time::UNIX_EPOCH)
412            .unwrap()
413            .as_micros() as u64
414    }
415}
416
417// ============================================================================
418// Table Metadata for MCP Resources
419// ============================================================================
420
/// Table role for semantic metadata.
///
/// Serialized through serde with `snake_case` variant names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TableRole {
    /// Append-only log (events, traces)
    Log,
    /// Dimension table (entities, users)
    Dimension,
    /// Fact table (metrics, aggregates)
    Fact,
    /// Vector collection for embeddings
    VectorCollection,
    /// Lookup table (config, mappings)
    Lookup,
    /// Core memory schema table
    CoreMemory,
}
438
439impl TableRole {
440    pub fn as_str(&self) -> &'static str {
441        match self {
442            TableRole::Log => "log",
443            TableRole::Dimension => "dimension",
444            TableRole::Fact => "fact",
445            TableRole::VectorCollection => "vector_collection",
446            TableRole::Lookup => "lookup",
447            TableRole::CoreMemory => "core_memory",
448        }
449    }
450}
451
/// Semantic metadata for a table (exposed via MCP resources).
///
/// Describes how a table is meant to be used (role, keys, temporal column,
/// vector backing) rather than its column-level schema.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableSemanticMetadata {
    /// Table name
    pub name: String,
    /// Semantic role of this table
    pub role: TableRole,
    /// Primary key columns
    pub primary_key: Vec<String>,
    /// Clustering key (for ordered access)
    pub cluster_key: Option<Vec<String>>,
    /// Timestamp column (for temporal queries)
    pub ts_column: Option<String>,
    /// Whether backed by a vector index
    pub backed_by_vector_index: bool,
    /// Embedding dimension (if vector collection)
    pub embedding_dimension: Option<usize>,
    /// Human-readable description
    pub description: String,
}
472
473impl TableSemanticMetadata {
474    /// Get metadata for episodes table
475    pub fn episodes() -> Self {
476        Self {
477            name: "episodes".to_string(),
478            role: TableRole::CoreMemory,
479            primary_key: vec!["episode_id".to_string()],
480            cluster_key: Some(vec!["ts_start".to_string()]),
481            ts_column: Some("ts_start".to_string()),
482            backed_by_vector_index: true,
483            embedding_dimension: Some(1536), // OpenAI ada-002 default
484            description: "Task/conversation runs with summaries and embeddings. Search here to find similar past tasks.".to_string(),
485        }
486    }
487
488    /// Get metadata for events table
489    pub fn events() -> Self {
490        Self {
491            name: "events".to_string(),
492            role: TableRole::Log,
493            primary_key: vec!["episode_id".to_string(), "seq".to_string()],
494            cluster_key: Some(vec!["episode_id".to_string(), "seq".to_string()]),
495            ts_column: Some("ts".to_string()),
496            backed_by_vector_index: false,
497            embedding_dimension: None,
498            description: "Steps within episodes (tool calls, messages). Use LAST N FROM events WHERE episode_id = $id for timeline.".to_string(),
499        }
500    }
501
502    /// Get metadata for entities table
503    pub fn entities() -> Self {
504        Self {
505            name: "entities".to_string(),
506            role: TableRole::Dimension,
507            primary_key: vec!["entity_id".to_string()],
508            cluster_key: Some(vec!["kind".to_string()]),
509            ts_column: Some("updated_at".to_string()),
510            backed_by_vector_index: true,
511            embedding_dimension: Some(1536),
512            description: "Users, projects, documents, services. Search by kind and similarity."
513                .to_string(),
514        }
515    }
516
517    /// Get core memory table metadata
518    pub fn core_tables() -> Vec<Self> {
519        vec![Self::episodes(), Self::events(), Self::entities()]
520    }
521}
522
523// ============================================================================
524// Memory Store API
525// ============================================================================
526
/// Result of searching episodes: one matched episode together with its score.
#[derive(Debug, Clone)]
pub struct EpisodeSearchResult {
    /// The matched episode.
    pub episode: Episode,
    /// Score reported for this match.
    /// NOTE(review): scale and direction (higher vs. lower is better) are defined by
    /// the `MemoryStore` implementation — confirm before comparing across stores.
    pub score: f32,
}
533
/// Result of searching entities: one matched entity together with its score.
#[derive(Debug, Clone)]
pub struct EntitySearchResult {
    /// The matched entity.
    pub entity: Entity,
    /// Score reported for this match.
    /// NOTE(review): scale and direction (higher vs. lower is better) are defined by
    /// the `MemoryStore` implementation — confirm before comparing across stores.
    pub score: f32,
}
540
/// Core memory operations trait.
///
/// Groups the read/write operations over the three core tables (episodes,
/// events, entities). Every method takes `&self` and returns `crate::Result`,
/// so implementations that mutate state need interior mutability.
pub trait MemoryStore {
    /// Create a new episode
    // NOTE(review): behavior when `episode.episode_id` already exists is
    // implementation-defined — confirm.
    fn create_episode(&self, episode: &Episode) -> crate::Result<()>;

    /// Get an episode by ID
    ///
    /// The `Option` in the return type leaves room for "no such episode" to be
    /// reported as `Ok(None)` rather than as an error.
    fn get_episode(&self, episode_id: &str) -> crate::Result<Option<Episode>>;

    /// Search episodes by vector similarity
    ///
    /// `query` is passed as text and `k` bounds the number of results requested.
    // NOTE(review): how the query text is embedded, and the result ordering, are
    // left to the implementation — confirm.
    fn search_episodes(&self, query: &str, k: usize) -> crate::Result<Vec<EpisodeSearchResult>>;

    /// Append an event to an episode
    ///
    /// The target episode and position are carried by `event.episode_id` and `event.seq`.
    fn append_event(&self, event: &Event) -> crate::Result<()>;

    /// Get timeline for an episode
    // NOTE(review): presumably returns at most `max_events` events ordered by `seq`;
    // whether the first or the last N are kept is not specified here — confirm.
    fn get_timeline(&self, episode_id: &str, max_events: usize) -> crate::Result<Vec<Event>>;

    /// Create or update an entity
    fn upsert_entity(&self, entity: &Entity) -> crate::Result<()>;

    /// Get an entity by ID
    ///
    /// The `Option` in the return type leaves room for "no such entity" to be
    /// reported as `Ok(None)` rather than as an error.
    fn get_entity(&self, entity_id: &str) -> crate::Result<Option<Entity>>;

    /// Search entities by kind and vector similarity
    ///
    /// `kind` is an optional filter (presumably `None` searches all kinds — confirm),
    /// `query` is passed as text, and `k` bounds the number of results requested.
    fn search_entities(
        &self,
        kind: Option<EntityKind>,
        query: &str,
        k: usize,
    ) -> crate::Result<Vec<EntitySearchResult>>;

    /// Get facts for an entity (attributes + linked episodes)
    fn get_entity_facts(&self, entity_id: &str) -> crate::Result<EntityFacts>;
}
575
/// Facts about an entity, as returned by `MemoryStore::get_entity_facts`.
#[derive(Debug, Clone)]
pub struct EntityFacts {
    /// The entity the facts are about.
    pub entity: Entity,
    /// Episodes associated with the entity.
    /// NOTE(review): the recency window and ordering are implementation-defined — confirm.
    pub recent_episodes: Vec<Episode>,
    /// Other entities related to this one.
    /// NOTE(review): how relatedness is determined is not visible here — confirm.
    pub related_entities: Vec<Entity>,
}
583
#[cfg(test)]
mod tests {
    use super::*;

    /// The episodes schema has the expected table name and key columns.
    #[test]
    fn test_episode_schema() {
        let schema = Episode::schema();
        assert_eq!(schema.name, "episodes");
        for expected in ["episode_id", "embedding"] {
            assert!(schema.fields.iter().any(|f| f.name == expected));
        }
    }

    /// The events schema has the expected table name and key columns.
    #[test]
    fn test_event_schema() {
        let schema = Event::schema();
        assert_eq!(schema.name, "events");
        for expected in ["episode_id", "seq"] {
            assert!(schema.fields.iter().any(|f| f.name == expected));
        }
    }

    /// The entities schema has the expected table name and key columns.
    #[test]
    fn test_entity_schema() {
        let schema = Entity::schema();
        assert_eq!(schema.name, "entities");
        for expected in ["entity_id", "kind"] {
            assert!(schema.fields.iter().any(|f| f.name == expected));
        }
    }

    /// `core_tables` lists exactly the three core memory tables.
    #[test]
    fn test_table_metadata() {
        let tables = TableSemanticMetadata::core_tables();
        assert_eq!(tables.len(), 3);
        for expected in ["episodes", "events", "entities"] {
            assert!(tables.iter().any(|t| t.name == expected));
        }
    }
}