sochdb_core/
memory_schema.rs

1// Copyright 2025 Sushanth (https://github.com/sushanthpy)
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Canonical Memory Schema for LLM Agents
16//!
17//! This module defines the **core semantic schema** for SochDB as an LLM memory store:
18//!
19//! - `episodes` - Task/conversation runs with summaries and embeddings
20//! - `events` - Steps within episodes (tool calls, messages, etc.)
21//! - `entities` - Users, projects, documents, services, etc.
22//!
23//! ## Design Rationale
24//!
25//! Without a canonical core, LLMs must reason over O(T) unrelated table schemas.
26//! With this fixed `(episodes, events, entities)` core, effective complexity becomes
27//! O(1) + domain-specific tables.
28//!
29//! ## Retrieval Pattern
30//!
31//! 1. Vector search over episode summaries: O(log E) via HNSW/Vamana
32//! 2. Range scan over events by (episode_id, seq): O(V) where V = events per episode
33//!
34//! ## Example Queries
35//!
36//! ```text
37//! -- Find similar past tasks
38//! SEARCH episodes BY SIMILARITY($query) TOP 5
39//!
40//! -- Get timeline for an episode
41//! SELECT * FROM events WHERE episode_id = $id ORDER BY seq
42//!
43//! -- Find entities by kind
44//! SEARCH entities WHERE kind = 'user' BY SIMILARITY($query) TOP 10
45//! ```
46
47use serde::{Deserialize, Serialize};
48use std::collections::HashMap;
49
50use crate::soch::{SochSchema, SochType, SochValue};
51
52// ============================================================================
53// Episode - A Task/Conversation Run
54// ============================================================================
55
/// Episode types for categorization.
///
/// Serialized through serde in `snake_case` (for example `AgentInteraction`
/// becomes `"agent_interaction"`), which matches the names returned by
/// `EpisodeType::as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EpisodeType {
    /// Interactive conversation session
    Conversation,
    /// Autonomous task execution
    Task,
    /// Background workflow
    Workflow,
    /// Debug/testing session
    Debug,
    /// Agent-to-agent interaction
    AgentInteraction,
    /// Other/custom type; also the result of `EpisodeType::from_str` for any
    /// name it does not recognize
    Other,
}
73
74impl EpisodeType {
75    pub fn as_str(&self) -> &'static str {
76        match self {
77            EpisodeType::Conversation => "conversation",
78            EpisodeType::Task => "task",
79            EpisodeType::Workflow => "workflow",
80            EpisodeType::Debug => "debug",
81            EpisodeType::AgentInteraction => "agent_interaction",
82            EpisodeType::Other => "other",
83        }
84    }
85
86    #[allow(clippy::should_implement_trait)]
87    pub fn from_str(s: &str) -> Self {
88        match s.to_lowercase().as_str() {
89            "conversation" => EpisodeType::Conversation,
90            "task" => EpisodeType::Task,
91            "workflow" => EpisodeType::Workflow,
92            "debug" => EpisodeType::Debug,
93            "agent_interaction" => EpisodeType::AgentInteraction,
94            _ => EpisodeType::Other,
95        }
96    }
97}
98
/// An episode represents a bounded task or conversation run.
///
/// The fields here correspond one-to-one, in the same order, to the columns
/// declared by `Episode::schema`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Episode {
    /// Unique episode identifier
    pub episode_id: String,
    /// Episode type
    pub episode_type: EpisodeType,
    /// Related entity IDs (users, projects, etc.)
    pub entity_ids: Vec<String>,
    /// Start timestamp (microseconds since epoch)
    pub ts_start: u64,
    /// End timestamp (0 if ongoing)
    pub ts_end: u64,
    /// Natural language summary of the episode
    pub summary: String,
    /// Tags for categorization
    pub tags: Vec<String>,
    /// Embedding vector for semantic search (optional; `None` until one is attached)
    pub embedding: Option<Vec<f32>>,
    /// Custom metadata, keyed by name
    pub metadata: HashMap<String, SochValue>,
}
121
122impl Episode {
123    /// Create a new episode
124    pub fn new(episode_id: impl Into<String>, episode_type: EpisodeType) -> Self {
125        Self {
126            episode_id: episode_id.into(),
127            episode_type,
128            entity_ids: Vec::new(),
129            ts_start: Self::now_us(),
130            ts_end: 0,
131            summary: String::new(),
132            tags: Vec::new(),
133            embedding: None,
134            metadata: HashMap::new(),
135        }
136    }
137
138    /// Get schema for episodes table
139    pub fn schema() -> SochSchema {
140        SochSchema::new("episodes")
141            .field("episode_id", SochType::Text)
142            .field("episode_type", SochType::Text)
143            .field("entity_ids", SochType::Array(Box::new(SochType::Text)))
144            .field("ts_start", SochType::UInt)
145            .field("ts_end", SochType::UInt)
146            .field("summary", SochType::Text)
147            .field("tags", SochType::Array(Box::new(SochType::Text)))
148            .field(
149                "embedding",
150                SochType::Optional(Box::new(SochType::Array(Box::new(SochType::Float)))),
151            )
152            .field("metadata", SochType::Object(vec![]))
153            .primary_key("episode_id")
154            .index("idx_episodes_type", vec!["episode_type".into()], false)
155            .index("idx_episodes_ts", vec!["ts_start".into()], false)
156    }
157
158    fn now_us() -> u64 {
159        std::time::SystemTime::now()
160            .duration_since(std::time::UNIX_EPOCH)
161            .unwrap()
162            .as_micros() as u64
163    }
164}
165
166// ============================================================================
167// Event - A Step Within an Episode
168// ============================================================================
169
/// Event roles (who/what triggered the event).
///
/// Serialized through serde in `snake_case`, matching the names returned by
/// `EventRole::as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EventRole {
    /// User message/action
    User,
    /// Assistant/LLM response
    Assistant,
    /// System event; also the result of `EventRole::from_str` for any name it
    /// does not recognize
    System,
    /// Tool invocation
    Tool,
    /// External service
    External,
}
185
186impl EventRole {
187    pub fn as_str(&self) -> &'static str {
188        match self {
189            EventRole::User => "user",
190            EventRole::Assistant => "assistant",
191            EventRole::System => "system",
192            EventRole::Tool => "tool",
193            EventRole::External => "external",
194        }
195    }
196
197    #[allow(clippy::should_implement_trait)]
198    pub fn from_str(s: &str) -> Self {
199        match s.to_lowercase().as_str() {
200            "user" => EventRole::User,
201            "assistant" => EventRole::Assistant,
202            "system" => EventRole::System,
203            "tool" => EventRole::Tool,
204            "external" => EventRole::External,
205            _ => EventRole::System,
206        }
207    }
208}
209
/// An event represents a single step within an episode.
///
/// In `Event::schema` the nested `metrics` struct is flattened into the
/// `duration_us`, `input_tokens`, `output_tokens` and `cost_micros` columns.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Event {
    /// Parent episode ID
    pub episode_id: String,
    /// Sequence number within episode (monotonically increasing)
    pub seq: u64,
    /// Timestamp (microseconds since epoch)
    pub ts: u64,
    /// Who/what triggered this event
    pub role: EventRole,
    /// Tool name (if role == Tool)
    pub tool_name: Option<String>,
    /// Input data in TOON format
    pub input_toon: String,
    /// Output data in TOON format
    pub output_toon: String,
    /// Error message if event failed (`None` otherwise)
    pub error: Option<String>,
    /// Performance metrics
    pub metrics: EventMetrics,
}
232
/// Performance metrics for an event.
///
/// The derived `Default` gives a zero duration and `None` for every optional
/// measurement.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct EventMetrics {
    /// Duration in microseconds
    pub duration_us: u64,
    /// Input tokens (if applicable)
    pub input_tokens: Option<u32>,
    /// Output tokens (if applicable)
    pub output_tokens: Option<u32>,
    /// Cost in microdollars (if applicable)
    pub cost_micros: Option<u64>,
}
245
246impl Event {
247    /// Create a new event
248    pub fn new(episode_id: impl Into<String>, seq: u64, role: EventRole) -> Self {
249        Self {
250            episode_id: episode_id.into(),
251            seq,
252            ts: Self::now_us(),
253            role,
254            tool_name: None,
255            input_toon: String::new(),
256            output_toon: String::new(),
257            error: None,
258            metrics: EventMetrics::default(),
259        }
260    }
261
262    /// Get schema for events table
263    pub fn schema() -> SochSchema {
264        SochSchema::new("events")
265            .field("episode_id", SochType::Text)
266            .field("seq", SochType::UInt)
267            .field("ts", SochType::UInt)
268            .field("role", SochType::Text)
269            .field("tool_name", SochType::Optional(Box::new(SochType::Text)))
270            .field("input_toon", SochType::Text)
271            .field("output_toon", SochType::Text)
272            .field("error", SochType::Optional(Box::new(SochType::Text)))
273            .field("duration_us", SochType::UInt)
274            .field("input_tokens", SochType::Optional(Box::new(SochType::UInt)))
275            .field(
276                "output_tokens",
277                SochType::Optional(Box::new(SochType::UInt)),
278            )
279            .field("cost_micros", SochType::Optional(Box::new(SochType::UInt)))
280            // Composite primary key: (episode_id, seq)
281            .primary_key("episode_id")
282            .index(
283                "idx_events_episode_seq",
284                vec!["episode_id".into(), "seq".into()],
285                true,
286            )
287            .index("idx_events_ts", vec!["ts".into()], false)
288            .index("idx_events_role", vec!["role".into()], false)
289    }
290
291    fn now_us() -> u64 {
292        std::time::SystemTime::now()
293            .duration_since(std::time::UNIX_EPOCH)
294            .unwrap()
295            .as_micros() as u64
296    }
297}
298
299// ============================================================================
300// Entity - Users, Projects, Documents, Services
301// ============================================================================
302
/// Entity kinds for categorization.
///
/// Serialized through serde in `snake_case`, matching the names returned by
/// `EntityKind::as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EntityKind {
    /// Human user
    User,
    /// Project or repository
    Project,
    /// Document or file
    Document,
    /// External service
    Service,
    /// Agent/bot identity
    Agent,
    /// Organization
    Organization,
    /// Custom entity type; also the result of `EntityKind::from_str` for any
    /// name it does not recognize
    Custom,
}
322
323impl EntityKind {
324    pub fn as_str(&self) -> &'static str {
325        match self {
326            EntityKind::User => "user",
327            EntityKind::Project => "project",
328            EntityKind::Document => "document",
329            EntityKind::Service => "service",
330            EntityKind::Agent => "agent",
331            EntityKind::Organization => "organization",
332            EntityKind::Custom => "custom",
333        }
334    }
335
336    #[allow(clippy::should_implement_trait)]
337    pub fn from_str(s: &str) -> Self {
338        match s.to_lowercase().as_str() {
339            "user" => EntityKind::User,
340            "project" => EntityKind::Project,
341            "document" => EntityKind::Document,
342            "service" => EntityKind::Service,
343            "agent" => EntityKind::Agent,
344            "organization" => EntityKind::Organization,
345            _ => EntityKind::Custom,
346        }
347    }
348}
349
/// An entity represents a user, project, document, service, etc.
///
/// The fields here correspond one-to-one, in the same order, to the columns
/// declared by `Entity::schema`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Entity {
    /// Unique entity identifier
    pub entity_id: String,
    /// Entity kind/type
    pub kind: EntityKind,
    /// Human-readable name
    pub name: String,
    /// Typed attributes, keyed by attribute name
    pub attributes: HashMap<String, SochValue>,
    /// Embedding vector for semantic search (`None` until one is attached)
    pub embedding: Option<Vec<f32>>,
    /// Custom metadata, keyed by name
    pub metadata: HashMap<String, SochValue>,
    /// Created timestamp (`Entity::new` sets this in microseconds since the Unix epoch)
    pub created_at: u64,
    /// Last updated timestamp (`Entity::new` initializes it to the same value as `created_at`)
    pub updated_at: u64,
}
370
371impl Entity {
372    /// Create a new entity
373    pub fn new(entity_id: impl Into<String>, kind: EntityKind, name: impl Into<String>) -> Self {
374        let now = Self::now_us();
375        Self {
376            entity_id: entity_id.into(),
377            kind,
378            name: name.into(),
379            attributes: HashMap::new(),
380            embedding: None,
381            metadata: HashMap::new(),
382            created_at: now,
383            updated_at: now,
384        }
385    }
386
387    /// Get schema for entities table
388    pub fn schema() -> SochSchema {
389        SochSchema::new("entities")
390            .field("entity_id", SochType::Text)
391            .field("kind", SochType::Text)
392            .field("name", SochType::Text)
393            .field("attributes", SochType::Object(vec![]))
394            .field(
395                "embedding",
396                SochType::Optional(Box::new(SochType::Array(Box::new(SochType::Float)))),
397            )
398            .field("metadata", SochType::Object(vec![]))
399            .field("created_at", SochType::UInt)
400            .field("updated_at", SochType::UInt)
401            .primary_key("entity_id")
402            .index("idx_entities_kind", vec!["kind".into()], false)
403            .index("idx_entities_name", vec!["name".into()], false)
404    }
405
406    fn now_us() -> u64 {
407        std::time::SystemTime::now()
408            .duration_since(std::time::UNIX_EPOCH)
409            .unwrap()
410            .as_micros() as u64
411    }
412}
413
414// ============================================================================
415// Table Metadata for MCP Resources
416// ============================================================================
417
/// Table role for semantic metadata.
///
/// Serialized through serde in `snake_case`, matching the names returned by
/// `TableRole::as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TableRole {
    /// Append-only log (events, traces)
    Log,
    /// Dimension table (entities, users)
    Dimension,
    /// Fact table (metrics, aggregates)
    Fact,
    /// Vector collection for embeddings
    VectorCollection,
    /// Lookup table (config, mappings)
    Lookup,
    /// Core memory schema table
    CoreMemory,
}
435
436impl TableRole {
437    pub fn as_str(&self) -> &'static str {
438        match self {
439            TableRole::Log => "log",
440            TableRole::Dimension => "dimension",
441            TableRole::Fact => "fact",
442            TableRole::VectorCollection => "vector_collection",
443            TableRole::Lookup => "lookup",
444            TableRole::CoreMemory => "core_memory",
445        }
446    }
447}
448
/// Semantic metadata for a table (exposed via MCP resources).
///
/// Ready-made values for the three core tables are available from the
/// `episodes`, `events`, `entities` and `core_tables` constructors.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableSemanticMetadata {
    /// Table name
    pub name: String,
    /// Semantic role of this table
    pub role: TableRole,
    /// Primary key columns (more than one entry describes a composite key)
    pub primary_key: Vec<String>,
    /// Clustering key (for ordered access)
    pub cluster_key: Option<Vec<String>>,
    /// Timestamp column (for temporal queries)
    pub ts_column: Option<String>,
    /// Whether backed by a vector index
    pub backed_by_vector_index: bool,
    /// Embedding dimension (if vector collection)
    pub embedding_dimension: Option<usize>,
    /// Human-readable description
    pub description: String,
}
469
470impl TableSemanticMetadata {
471    /// Get metadata for episodes table
472    pub fn episodes() -> Self {
473        Self {
474            name: "episodes".to_string(),
475            role: TableRole::CoreMemory,
476            primary_key: vec!["episode_id".to_string()],
477            cluster_key: Some(vec!["ts_start".to_string()]),
478            ts_column: Some("ts_start".to_string()),
479            backed_by_vector_index: true,
480            embedding_dimension: Some(1536), // OpenAI ada-002 default
481            description: "Task/conversation runs with summaries and embeddings. Search here to find similar past tasks.".to_string(),
482        }
483    }
484
485    /// Get metadata for events table
486    pub fn events() -> Self {
487        Self {
488            name: "events".to_string(),
489            role: TableRole::Log,
490            primary_key: vec!["episode_id".to_string(), "seq".to_string()],
491            cluster_key: Some(vec!["episode_id".to_string(), "seq".to_string()]),
492            ts_column: Some("ts".to_string()),
493            backed_by_vector_index: false,
494            embedding_dimension: None,
495            description: "Steps within episodes (tool calls, messages). Use LAST N FROM events WHERE episode_id = $id for timeline.".to_string(),
496        }
497    }
498
499    /// Get metadata for entities table
500    pub fn entities() -> Self {
501        Self {
502            name: "entities".to_string(),
503            role: TableRole::Dimension,
504            primary_key: vec!["entity_id".to_string()],
505            cluster_key: Some(vec!["kind".to_string()]),
506            ts_column: Some("updated_at".to_string()),
507            backed_by_vector_index: true,
508            embedding_dimension: Some(1536),
509            description: "Users, projects, documents, services. Search by kind and similarity."
510                .to_string(),
511        }
512    }
513
514    /// Get core memory table metadata
515    pub fn core_tables() -> Vec<Self> {
516        vec![Self::episodes(), Self::events(), Self::entities()]
517    }
518}
519
520// ============================================================================
521// Memory Store API
522// ============================================================================
523
/// Result of searching episodes: one matched episode paired with its score.
#[derive(Debug, Clone)]
pub struct EpisodeSearchResult {
    /// The matched episode
    pub episode: Episode,
    /// Score assigned to the match by the store.
    /// NOTE(review): scale and direction (higher vs. lower is better) are not
    /// fixed by this module — confirm against the `MemoryStore` implementation.
    pub score: f32,
}
530
/// Result of searching entities: one matched entity paired with its score.
#[derive(Debug, Clone)]
pub struct EntitySearchResult {
    /// The matched entity
    pub entity: Entity,
    /// Score assigned to the match by the store.
    /// NOTE(review): scale and direction (higher vs. lower is better) are not
    /// fixed by this module — confirm against the `MemoryStore` implementation.
    pub score: f32,
}
537
/// Core memory operations trait.
///
/// Every method takes `&self` and returns `crate::Result`, so an implementation
/// that mutates state must do so through interior mutability. The trait only
/// fixes the signatures; storage, indexing and ranking are up to the
/// implementation.
pub trait MemoryStore {
    /// Create a new episode
    fn create_episode(&self, episode: &Episode) -> crate::Result<()>;

    /// Get an episode by ID (`Ok(None)` is available to signal "no such episode")
    fn get_episode(&self, episode_id: &str) -> crate::Result<Option<Episode>>;

    /// Search episodes by vector similarity.
    ///
    /// `query` is passed as text; `k` is presumably the maximum number of
    /// results to return — confirm against the implementation.
    fn search_episodes(&self, query: &str, k: usize) -> crate::Result<Vec<EpisodeSearchResult>>;

    /// Append an event to an episode (the target episode is named by `event.episode_id`)
    fn append_event(&self, event: &Event) -> crate::Result<()>;

    /// Get timeline for an episode.
    ///
    /// `max_events` presumably caps the number of events returned — confirm
    /// against the implementation.
    fn get_timeline(&self, episode_id: &str, max_events: usize) -> crate::Result<Vec<Event>>;

    /// Create or update an entity
    fn upsert_entity(&self, entity: &Entity) -> crate::Result<()>;

    /// Get an entity by ID (`Ok(None)` is available to signal "no such entity")
    fn get_entity(&self, entity_id: &str) -> crate::Result<Option<Entity>>;

    /// Search entities by kind and vector similarity.
    ///
    /// `kind` is optional; `None` presumably means "any kind" — confirm against
    /// the implementation.
    fn search_entities(
        &self,
        kind: Option<EntityKind>,
        query: &str,
        k: usize,
    ) -> crate::Result<Vec<EntitySearchResult>>;

    /// Get facts for an entity (attributes + linked episodes)
    fn get_entity_facts(&self, entity_id: &str) -> crate::Result<EntityFacts>;
}
572
/// Facts about an entity, as returned by `MemoryStore::get_entity_facts`.
#[derive(Debug, Clone)]
pub struct EntityFacts {
    /// The entity itself, including its attributes
    pub entity: Entity,
    /// Episodes linked to the entity (selection and ordering are left to the
    /// `MemoryStore` implementation)
    pub recent_episodes: Vec<Episode>,
    /// Other entities related to this one (how "related" is determined is left
    /// to the `MemoryStore` implementation)
    pub related_entities: Vec<Entity>,
}
580
#[cfg(test)]
mod tests {
    use super::*;

    /// The episodes schema is named correctly and exposes its key columns.
    #[test]
    fn test_episode_schema() {
        let schema = Episode::schema();
        let has_field = |wanted: &str| schema.fields.iter().any(|f| f.name == wanted);

        assert_eq!(schema.name, "episodes");
        assert!(has_field("episode_id"));
        assert!(has_field("embedding"));
    }

    /// The events schema is named correctly and exposes its key columns.
    #[test]
    fn test_event_schema() {
        let schema = Event::schema();
        let has_field = |wanted: &str| schema.fields.iter().any(|f| f.name == wanted);

        assert_eq!(schema.name, "events");
        assert!(has_field("episode_id"));
        assert!(has_field("seq"));
    }

    /// The entities schema is named correctly and exposes its key columns.
    #[test]
    fn test_entity_schema() {
        let schema = Entity::schema();
        let has_field = |wanted: &str| schema.fields.iter().any(|f| f.name == wanted);

        assert_eq!(schema.name, "entities");
        assert!(has_field("entity_id"));
        assert!(has_field("kind"));
    }

    /// `core_tables` lists exactly the three core tables.
    #[test]
    fn test_table_metadata() {
        let tables = TableSemanticMetadata::core_tables();
        let has_table = |wanted: &str| tables.iter().any(|t| t.name == wanted);

        assert_eq!(tables.len(), 3);
        assert!(has_table("episodes"));
        assert!(has_table("events"));
        assert!(has_table("entities"));
    }
}