Skip to main content

ailake_core/
schema.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2use crate::error::{AilakeError, AilakeResult};
3use crate::types::{EmbeddingModelInfo, VectorMetric, VectorModality, VectorPrecision};
4use serde::{Deserialize, Serialize};
5
/// Canonical column names for LLM-context tables.
///
/// `ContextAssembler` reads columns by these names, so each constant holds the
/// exact column-name string a table schema must use. Refer to the constants
/// instead of repeating the string literals.
pub mod llm_columns {
    // Chunk identity and position within its document.
    pub const CHUNK_ID: &str = "chunk_id";
    pub const DOCUMENT_ID: &str = "document_id";
    pub const CHUNK_INDEX: &str = "chunk_index";
    pub const TOTAL_CHUNKS: &str = "total_chunks";
    // Text payload, surrounding context and summaries.
    pub const CHUNK_TEXT: &str = "chunk_text";
    pub const DOCUMENT_TITLE: &str = "document_title";
    pub const SECTION_PATH: &str = "section_path";
    pub const PRECEDING_CONTEXT: &str = "preceding_context";
    pub const FOLLOWING_CONTEXT: &str = "following_context";
    pub const DOCUMENT_SUMMARY: &str = "document_summary";
    pub const CHUNK_SUMMARY: &str = "chunk_summary";
    // Provenance of the chunk.
    pub const SOURCE_URI: &str = "source_uri";
    pub const PAGE_NUMBER: &str = "page_number";
    // Timestamps.
    /// Arrow type: `Timestamp(Nanosecond, Some("UTC"))` — use `ailake_core::now_ns()` to populate.
    pub const CREATED_AT: &str = "created_at";
    // NOTE(review): the Arrow type of `document_date` is not stated in this file — confirm.
    pub const DOCUMENT_DATE: &str = "document_date";
    // Vector columns.
    /// Primary vector column. The schema examples elsewhere in this file show it
    /// as `FixedSizeBinary(N)` holding F16 components.
    pub const EMBEDDING: &str = "embedding";
    pub const CONTEXT_EMBEDDING: &str = "context_embedding";
}
28
/// Current UTC time as Unix epoch nanoseconds.
///
/// Use for `created_at` and `last_accessed_at` columns in `LlmContextSchema`
/// and `EpisodicMemorySchema` tables. Arrow type for these columns must be
/// `Timestamp(Nanosecond, Some("UTC"))` — Iceberg maps this to `timestamptz`.
///
/// # Edge cases
///
/// - If the system clock reports a time before the Unix epoch, returns `0`.
/// - If the nanosecond count does not fit in an `i64`, returns `i64::MAX`
///   instead of wrapping around to a negative timestamp.
pub fn now_ns() -> i64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        // `as_nanos` yields a u128; clamp first so the narrowing cast below is
        // lossless and can never wrap.
        Ok(elapsed) => elapsed.as_nanos().min(i64::MAX as u128) as i64,
        // Clock is set before 1970-01-01: best-effort fallback to the epoch itself.
        Err(_) => 0,
    }
}
40
/// Vector storage configuration applied at table creation time.
/// Stored in Iceberg metadata.json properties.
///
/// Fields marked `#[serde(default)]` may be absent from the serialized form and
/// then deserialize to their default value. `partition_value` is marked
/// `#[serde(skip)]` and is never serialized.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorStoragePolicy {
    /// Name of the vector column this policy describes.
    pub column_name: String,
    /// Vector dimensionality. When `pq` is set, it must be divisible by
    /// `PQConfig::num_subvectors`.
    pub dim: u32,
    /// Distance metric for this column.
    pub metric: VectorMetric,
    /// Precision of the stored vector components (`default_f16` selects
    /// `VectorPrecision::F16`).
    pub precision: VectorPrecision,
    /// Product-quantization settings; `None` means no PQ configuration is attached.
    pub pq: Option<PQConfig>,
    // NOTE(review): presumably keeps the original vectors next to any compressed
    // form so results can be re-ranked — confirm against the writer/search path.
    pub keep_raw_for_reranking: bool,
    /// Normalize each input vector to unit L2 length before indexing.
    /// Enables the NormalizedCosine fast path in HNSW: distance = 1 - dot(a, b),
    /// no sqrt, ~2× faster distance computation. Semantics unchanged — same top-k
    /// results as Cosine. Most embedding models (OpenAI, Cohere, etc.) produce
    /// nearly-unit vectors; enabling this adds negligible write overhead.
    #[serde(default)]
    pub pre_normalize: bool,
    /// HNSW M parameter — connections per node. `None` = default (16).
    /// Higher M → better recall, more memory, slower build.
    /// Recommended values: 8 (low-memory), 16 (default), 32 (high-recall), 64 (max).
    #[serde(default)]
    pub hnsw_m: Option<u32>,
    /// HNSW ef_construction — candidate pool size during build. `None` = default (150).
    /// Higher ef_construction → better graph quality, slower build.
    /// Recommended values: 100 (fast), 150 (default), 200 (quality), 400 (max quality).
    #[serde(default)]
    pub hnsw_ef_construction: Option<u32>,
    /// IVF-PQ residual encoding — train PQ on per-cluster residuals (vec - coarse_centroid).
    /// Same bytes/vector, ~2-4pp better recall@10. Only applies when IVF-PQ index is used.
    #[serde(default)]
    pub ivf_residual: bool,
    /// Optional embedding model metadata. When set:
    /// - Stored as `ailake.embedding-model` in Iceberg table properties.
    /// - Validated on every `write_batch`: dim mismatch → hard error; name mismatch → warning.
    /// - Required for `migrate_embeddings` to track the model transition.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub embedding_model: Option<EmbeddingModelInfo>,
    /// Modality tag for this vector column (text / image / audio / video).
    /// Stored as `ailake.modality-<col>` in Iceberg properties and Parquet KV metadata.
    /// Allows readers to select the correct HNSW by modality without reading data.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub modality: Option<VectorModality>,
    /// Column to partition by (e.g. "agent_id"). Stored in Iceberg metadata as an identity
    /// partition spec, enabling file-level pruning for per-agent search without post-scan filtering.
    /// Set this at table creation time; all files written to this table carry the partition column.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub partition_by: Option<String>,
    /// Runtime partition value for this writer instance (not stored in table metadata).
    /// When set, each file written by this TableWriter is tagged with this value, enabling
    /// the search path to prune files from other partitions (e.g., other agents).
    /// Typical usage: set to `agent_id` in Agent.__init__.
    #[serde(skip)]
    pub partition_value: Option<String>,
    /// Iceberg type of the partition column ("string", "uuid", "int", "long").
    /// Used when writing the Iceberg schema and partition spec at table creation.
    /// Defaults to "string" when `None`. Only relevant when `partition_by` is set.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub partition_column_type: Option<String>,
    /// Multi-column / non-identity partition spec (Phase K).
    ///
    /// When non-empty, takes precedence over `partition_by` + `partition_column_type`
    /// for table creation.  Supports `identity` and `truncate[W]` transforms.
    /// Values at write time are provided via `partition_value` encoded as
    /// `\x1f`-separated compound string ("val1\x1fval2") matching field order.
    ///
    /// Example (two-column identity):
    /// ```ignore
    /// partition_fields: vec![
    ///     PartitionDef::identity("agent_id", "string"),
    ///     PartitionDef::identity("session_id", "string"),
    /// ]
    /// ```
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub partition_fields: Vec<PartitionDef>,
}
116
/// One field in a multi-column partition spec (Phase K).
///
/// Supported transforms: `"identity"` and `"truncate[W]"` (string prefix / int rounding).
///
/// Build values with `PartitionDef::identity` or `PartitionDef::truncate`. The
/// fields are plain strings: `apply` passes a value through unchanged whenever
/// `transform` cannot be parsed as `truncate[W]`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PartitionDef {
    /// Source column name in the table schema.
    pub column: String,
    /// Iceberg transform string: `"identity"` or `"truncate[W]"`.
    pub transform: String,
    /// Iceberg type of the source column: `"string"`, `"int"`, `"long"`, `"uuid"`.
    /// `apply` uses integer rounding for `"int"`, `"long"` and `"integer"`, and
    /// character truncation for every other type.
    pub column_type: String,
}
129
130impl PartitionDef {
131    pub fn identity(column: impl Into<String>, column_type: impl Into<String>) -> Self {
132        Self {
133            column: column.into(),
134            transform: "identity".into(),
135            column_type: column_type.into(),
136        }
137    }
138
139    pub fn truncate(
140        column: impl Into<String>,
141        width: usize,
142        column_type: impl Into<String>,
143    ) -> Self {
144        Self {
145            column: column.into(),
146            transform: format!("truncate[{width}]"),
147            column_type: column_type.into(),
148        }
149    }
150
151    /// Apply this transform to a raw column value.
152    /// - `identity` → returns value unchanged.
153    /// - `truncate[W]` → returns first W characters (strings) or value rounded down to
154    ///   the nearest multiple of W (integers, parsed and re-formatted as string).
155    pub fn apply(&self, raw: &str) -> String {
156        if let Some(w) = self.truncate_width() {
157            if matches!(self.column_type.as_str(), "int" | "long" | "integer") {
158                // Integer truncation: round down to multiple of W
159                if let Ok(n) = raw.parse::<i64>() {
160                    return (n - n.rem_euclid(w as i64)).to_string();
161                }
162            }
163            // String truncation: first W chars
164            raw.chars().take(w).collect()
165        } else {
166            raw.to_string()
167        }
168    }
169
170    fn truncate_width(&self) -> Option<usize> {
171        self.transform
172            .strip_prefix("truncate[")
173            .and_then(|s| s.strip_suffix(']'))
174            .and_then(|s| s.parse().ok())
175    }
176}
177
178impl VectorStoragePolicy {
179    pub fn default_f16(column: &str, dim: u32, metric: VectorMetric) -> Self {
180        Self {
181            column_name: column.to_string(),
182            dim,
183            metric,
184            precision: VectorPrecision::F16,
185            pq: None,
186            keep_raw_for_reranking: true,
187            pre_normalize: false,
188            hnsw_m: None,
189            hnsw_ef_construction: None,
190            ivf_residual: false,
191            embedding_model: None,
192            modality: None,
193            partition_by: None,
194            partition_value: None,
195            partition_column_type: None,
196            partition_fields: vec![],
197        }
198    }
199}
200
/// Product Quantization configuration.
///
/// Attached to a vector column through `VectorStoragePolicy::pq`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PQConfig {
    /// Number of sub-vectors M (dim must be divisible by M).
    pub num_subvectors: usize,
    /// Bits per code (8 = 256 centroids per sub-vector).
    pub bits_per_code: u8,
    /// Number of training samples for codebook.
    pub train_sample_size: usize,
}
211
/// Marker struct for documentation purposes.
///
/// It has no fields and no methods in this file; the actual schema is enforced
/// by the column names in the `llm_columns` module.
pub struct LlmContextSchema;
215
/// Canonical column names for multimodal LLM-context tables.
/// Extends `LlmContextSchema` with media and cross-modal embedding columns.
///
/// Usage: write tables whose Parquet schema includes these column names alongside
/// `llm_columns::*`. The AI-Lake SDK reads them by name — no code-gen required.
///
/// Typical multimodal row:
/// - `chunk_text` + `embedding` (text)
/// - `image_embedding` (CLIP/SigLIP dim=512)
/// - `media_uri` pointing to the source image/audio/video in object storage
/// - `audio_transcript` when the source is audio/video
/// - `media_caption` from a captioning model
pub mod multimodal_columns {
    /// URI of the raw media asset in object storage (s3://, gs://, az://, https://).
    /// AI-Lake is NOT a blob store — store media externally; only the URI lives here.
    pub const MEDIA_URI: &str = "media_uri";
    /// MIME type of the media asset (e.g. "image/jpeg", "audio/mpeg", "video/mp4").
    pub const MEDIA_MIME: &str = "media_mime";
    /// Human-readable caption generated by a vision/audio model (e.g. BLIP-2, Whisper).
    pub const MEDIA_CAPTION: &str = "media_caption";
    /// Image embedding column (e.g. CLIP ViT-B/32, SigLIP dim=512).
    /// Physical type: FIXED_LEN_BYTE_ARRAY (F16) — same as text `embedding`.
    pub const IMAGE_EMBEDDING: &str = "image_embedding";
    /// Transcription of spoken content from audio or video assets (Whisper output).
    pub const AUDIO_TRANSCRIPT: &str = "audio_transcript";
    /// Base64-encoded thumbnail (JPEG, ≤ 64×64 px) for inline LLM context.
    /// Allows multimodal LLMs to receive a visual preview without fetching `media_uri`.
    pub const THUMBNAIL_B64: &str = "thumbnail_b64";
}
245
/// Marker struct for multimodal LLM-context tables.
/// Actual schema is enforced by column names in `multimodal_columns` module.
///
/// A multimodal table combines all `llm_columns::*` fields (text + embeddings)
/// with `multimodal_columns::*` (media URI, MIME, caption, image_embedding,
/// audio_transcript, thumbnail_b64).
///
/// Example Arrow schema (abridged). The `FixedSizeBinary` width is in bytes:
/// each F16 component takes two bytes, so dim=1536 → 3072 and dim=512 → 1024.
/// ```text
/// chunk_id:          Utf8
/// chunk_text:        Utf8
/// embedding:         FixedSizeBinary(3072)   -- text, F16, dim=1536
/// image_embedding:   FixedSizeBinary(1024)   -- image, F16, dim=512
/// media_uri:         Utf8
/// media_mime:        Utf8
/// media_caption:     Utf8
/// audio_transcript:  Utf8
/// thumbnail_b64:     Utf8
/// ```
pub struct MultimodalContextSchema;
266
267// ── Phase 9 — Agent / Episodic Memory ────────────────────────────────────────
268
/// Outcome of a tool call recorded in a `ToolCallSchema` table.
///
/// Serde uses the snake_case variant name, which is the same text `as_str`,
/// `Display` and `FromStr` use: `"success"`, `"failure"`, `"timeout"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ToolCallOutcome {
    /// Recorded as `"success"`.
    Success,
    /// Recorded as `"failure"`.
    Failure,
    /// Recorded as `"timeout"`.
    Timeout,
}
277
278impl ToolCallOutcome {
279    pub fn as_str(self) -> &'static str {
280        match self {
281            Self::Success => "success",
282            Self::Failure => "failure",
283            Self::Timeout => "timeout",
284        }
285    }
286}
287
288impl std::fmt::Display for ToolCallOutcome {
289    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
290        f.write_str(self.as_str())
291    }
292}
293
294impl std::str::FromStr for ToolCallOutcome {
295    type Err = AilakeError;
296    fn from_str(s: &str) -> AilakeResult<Self> {
297        match s {
298            "success" => Ok(Self::Success),
299            "failure" => Ok(Self::Failure),
300            "timeout" => Ok(Self::Timeout),
301            other => Err(AilakeError::InvalidArgument(format!(
302                "unknown ToolCallOutcome '{other}' (valid: success, failure, timeout)"
303            ))),
304        }
305    }
306}
307
/// Canonical column names for agent tool-call history tables.
///
/// Each row records one tool invocation: agent identity, session context,
/// inputs/outputs as JSON, outcome, and latency. The `embedding` column
/// (from `llm_columns::EMBEDDING`) holds a vector over the concatenated
/// `tool_name + tool_input_json` text, enabling semantic search over past
/// tool calls ("when did the agent call X in contexts similar to Y?").
///
/// Usage: include these columns alongside `llm_columns::*` in the Arrow
/// schema of a `ToolCallSchema` table.
pub mod tool_call_columns {
    /// UUID of the agent instance (identifies which agent performed the call).
    pub const AGENT_ID: &str = "agent_id";
    /// UUID of the conversation / task session.
    pub const SESSION_ID: &str = "session_id";
    /// Zero-based index of this tool call within the session.
    pub const STEP_INDEX: &str = "step_index";
    /// Name of the tool that was invoked (e.g. "web_search", "code_exec").
    pub const TOOL_NAME: &str = "tool_name";
    /// JSON-serialized input arguments passed to the tool.
    pub const TOOL_INPUT_JSON: &str = "tool_input_json";
    /// JSON-serialized output returned by the tool (or error message on failure).
    pub const TOOL_OUTPUT_JSON: &str = "tool_output_json";
    /// Outcome of the call: "success" | "failure" | "timeout".
    /// Use the `ToolCallOutcome` enum for typed access; its `as_str` and
    /// `FromStr` implementations convert to and from these strings.
    pub const OUTCOME: &str = "outcome";
    /// Wall-clock latency of the tool call in milliseconds.
    pub const LATENCY_MS: &str = "latency_ms";
}
337
/// Marker struct for agent tool-call history tables (Phase 9).
/// Actual schema is enforced by column names in `tool_call_columns` module.
///
/// A tool-call table extends `LlmContextSchema` with agent identity and
/// invocation metadata, enabling semantic search over an agent's history:
///
/// ```text
/// agent_id:         Utf8          -- UUID string
/// session_id:       Utf8          -- UUID string
/// step_index:       UInt32
/// tool_name:        Utf8
/// tool_input_json:  Utf8
/// tool_output_json: Utf8
/// outcome:          Utf8          -- "success" | "failure" | "timeout"
/// latency_ms:       UInt32
/// embedding:        FixedSizeBinary(N)  -- F16, over tool_name+tool_input_json
/// ```
///
/// The `outcome` strings are the ones produced by `ToolCallOutcome::as_str`.
///
/// Recommended index: one HNSW over `embedding` (text, cosine).
/// Partition by `agent_id` via `VectorStoragePolicy` (its `partition_by` field)
/// for isolated per-agent search.
pub struct ToolCallSchema;