ailake_core/schema.rs
1// SPDX-License-Identifier: MIT OR Apache-2.0
2use crate::error::{AilakeError, AilakeResult};
3use crate::types::{EmbeddingModelInfo, VectorMetric, VectorModality, VectorPrecision};
4use serde::{Deserialize, Serialize};
5
/// Canonical column names shared by LLM-context tables.
///
/// `ContextAssembler` looks columns up by these exact strings, so writers
/// should reference the constants rather than repeating the literals.
pub mod llm_columns {
    // ── Chunk / document identity and ordering ───────────────────────────
    pub const CHUNK_ID: &str = "chunk_id";
    pub const DOCUMENT_ID: &str = "document_id";
    pub const CHUNK_INDEX: &str = "chunk_index";
    pub const TOTAL_CHUNKS: &str = "total_chunks";

    // ── Text, titles and surrounding context ─────────────────────────────
    pub const CHUNK_TEXT: &str = "chunk_text";
    pub const DOCUMENT_TITLE: &str = "document_title";
    pub const SECTION_PATH: &str = "section_path";
    pub const PRECEDING_CONTEXT: &str = "preceding_context";
    pub const FOLLOWING_CONTEXT: &str = "following_context";
    pub const DOCUMENT_SUMMARY: &str = "document_summary";
    pub const CHUNK_SUMMARY: &str = "chunk_summary";

    // ── Provenance and timestamps ────────────────────────────────────────
    pub const SOURCE_URI: &str = "source_uri";
    pub const PAGE_NUMBER: &str = "page_number";
    /// Arrow type is `Timestamp(Nanosecond, Some("UTC"))`; populate it with
    /// `ailake_core::now_ns()`.
    pub const CREATED_AT: &str = "created_at";
    pub const DOCUMENT_DATE: &str = "document_date";

    // ── Vector columns ───────────────────────────────────────────────────
    pub const EMBEDDING: &str = "embedding";
    pub const CONTEXT_EMBEDDING: &str = "context_embedding";
}
28
/// Returns the current UTC time as nanoseconds since the Unix epoch.
///
/// Intended for the `created_at` and `last_accessed_at` columns of
/// `LlmContextSchema` and `EpisodicMemorySchema` tables. Those columns must
/// use the Arrow type `Timestamp(Nanosecond, Some("UTC"))`, which Iceberg maps
/// to `timestamptz`.
///
/// Edge cases:
/// - If the system clock reports a time before the Unix epoch, `0` is returned.
/// - If the nanosecond count no longer fits in an `i64`, the result saturates
///   at `i64::MAX` instead of wrapping around to an unrelated value.
pub fn now_ns() -> i64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        // `as_nanos` yields a `u128`; clamp before narrowing so the cast below
        // can never truncate.
        Ok(elapsed) => elapsed.as_nanos().min(i64::MAX as u128) as i64,
        // Clock is set before 1970-01-01: keep the established fallback of 0.
        Err(_) => 0,
    }
}
40
41/// Vector storage configuration applied at table creation time.
42/// Stored in Iceberg metadata.json properties.
43#[derive(Debug, Clone, Serialize, Deserialize)]
44pub struct VectorStoragePolicy {
45 pub column_name: String,
46 pub dim: u32,
47 pub metric: VectorMetric,
48 pub precision: VectorPrecision,
49 pub pq: Option<PQConfig>,
50 pub keep_raw_for_reranking: bool,
51 /// Normalize each input vector to unit L2 length before indexing.
52 /// Enables the NormalizedCosine fast path in HNSW: distance = 1 - dot(a, b),
53 /// no sqrt, ~2× faster distance computation. Semantics unchanged — same top-k
54 /// results as Cosine. Most embedding models (OpenAI, Cohere, etc.) produce
55 /// nearly-unit vectors; enabling this adds negligible write overhead.
56 #[serde(default)]
57 pub pre_normalize: bool,
58 /// HNSW M parameter — connections per node. `None` = default (16).
59 /// Higher M → better recall, more memory, slower build.
60 /// Recommended values: 8 (low-memory), 16 (default), 32 (high-recall), 64 (max).
61 #[serde(default)]
62 pub hnsw_m: Option<u32>,
63 /// HNSW ef_construction — candidate pool size during build. `None` = default (150).
64 /// Higher ef_construction → better graph quality, slower build.
65 /// Recommended values: 100 (fast), 150 (default), 200 (quality), 400 (max quality).
66 #[serde(default)]
67 pub hnsw_ef_construction: Option<u32>,
68 /// IVF-PQ residual encoding — train PQ on per-cluster residuals (vec - coarse_centroid).
69 /// Same bytes/vector, ~2-4pp better recall@10. Only applies when IVF-PQ index is used.
70 #[serde(default)]
71 pub ivf_residual: bool,
72 /// Optional embedding model metadata. When set:
73 /// - Stored as `ailake.embedding-model` in Iceberg table properties.
74 /// - Validated on every `write_batch`: dim mismatch → hard error; name mismatch → warning.
75 /// - Required for `migrate_embeddings` to track the model transition.
76 #[serde(default, skip_serializing_if = "Option::is_none")]
77 pub embedding_model: Option<EmbeddingModelInfo>,
78 /// Modality tag for this vector column (text / image / audio / video).
79 /// Stored as `ailake.modality-<col>` in Iceberg properties and Parquet KV metadata.
80 /// Allows readers to select the correct HNSW by modality without reading data.
81 #[serde(default, skip_serializing_if = "Option::is_none")]
82 pub modality: Option<VectorModality>,
83 /// Column to partition by (e.g. "agent_id"). Stored in Iceberg metadata as an identity
84 /// partition spec, enabling file-level pruning for per-agent search without post-scan filtering.
85 /// Set this at table creation time; all files written to this table carry the partition column.
86 #[serde(default, skip_serializing_if = "Option::is_none")]
87 pub partition_by: Option<String>,
88 /// Runtime partition value for this writer instance (not stored in table metadata).
89 /// When set, each file written by this TableWriter is tagged with this value, enabling
90 /// the search path to prune files from other partitions (e.g., other agents).
91 /// Typical usage: set to `agent_id` in Agent.__init__.
92 #[serde(skip)]
93 pub partition_value: Option<String>,
94 /// Iceberg type of the partition column ("string", "uuid", "int", "long").
95 /// Used when writing the Iceberg schema and partition spec at table creation.
96 /// Defaults to "string" when `None`. Only relevant when `partition_by` is set.
97 #[serde(default, skip_serializing_if = "Option::is_none")]
98 pub partition_column_type: Option<String>,
99 /// Multi-column / non-identity partition spec (Phase K).
100 ///
101 /// When non-empty, takes precedence over `partition_by` + `partition_column_type`
102 /// for table creation. Supports `identity` and `truncate[W]` transforms.
103 /// Values at write time are provided via `partition_value` encoded as
104 /// `\x1f`-separated compound string ("val1\x1fval2") matching field order.
105 ///
106 /// Example (two-column identity):
107 /// ```ignore
108 /// partition_fields: vec![
109 /// PartitionDef::identity("agent_id", "string"),
110 /// PartitionDef::identity("session_id", "string"),
111 /// ]
112 /// ```
113 #[serde(default, skip_serializing_if = "Vec::is_empty")]
114 pub partition_fields: Vec<PartitionDef>,
115}
116
117/// One field in a multi-column partition spec (Phase K).
118///
119/// Supported transforms: `"identity"` and `"truncate[W]"` (string prefix / int rounding).
120#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
121pub struct PartitionDef {
122 /// Source column name in the table schema.
123 pub column: String,
124 /// Iceberg transform string: `"identity"` or `"truncate[W]"`.
125 pub transform: String,
126 /// Iceberg type of the source column: `"string"`, `"int"`, `"long"`, `"uuid"`.
127 pub column_type: String,
128}
129
130impl PartitionDef {
131 pub fn identity(column: impl Into<String>, column_type: impl Into<String>) -> Self {
132 Self {
133 column: column.into(),
134 transform: "identity".into(),
135 column_type: column_type.into(),
136 }
137 }
138
139 pub fn truncate(
140 column: impl Into<String>,
141 width: usize,
142 column_type: impl Into<String>,
143 ) -> Self {
144 Self {
145 column: column.into(),
146 transform: format!("truncate[{width}]"),
147 column_type: column_type.into(),
148 }
149 }
150
151 /// Apply this transform to a raw column value.
152 /// - `identity` → returns value unchanged.
153 /// - `truncate[W]` → returns first W characters (strings) or value rounded down to
154 /// the nearest multiple of W (integers, parsed and re-formatted as string).
155 pub fn apply(&self, raw: &str) -> String {
156 if let Some(w) = self.truncate_width() {
157 if matches!(self.column_type.as_str(), "int" | "long" | "integer") {
158 // Integer truncation: round down to multiple of W
159 if let Ok(n) = raw.parse::<i64>() {
160 return (n - n.rem_euclid(w as i64)).to_string();
161 }
162 }
163 // String truncation: first W chars
164 raw.chars().take(w).collect()
165 } else {
166 raw.to_string()
167 }
168 }
169
170 fn truncate_width(&self) -> Option<usize> {
171 self.transform
172 .strip_prefix("truncate[")
173 .and_then(|s| s.strip_suffix(']'))
174 .and_then(|s| s.parse().ok())
175 }
176}
177
178impl VectorStoragePolicy {
179 pub fn default_f16(column: &str, dim: u32, metric: VectorMetric) -> Self {
180 Self {
181 column_name: column.to_string(),
182 dim,
183 metric,
184 precision: VectorPrecision::F16,
185 pq: None,
186 keep_raw_for_reranking: true,
187 pre_normalize: false,
188 hnsw_m: None,
189 hnsw_ef_construction: None,
190 ivf_residual: false,
191 embedding_model: None,
192 modality: None,
193 partition_by: None,
194 partition_value: None,
195 partition_column_type: None,
196 partition_fields: vec![],
197 }
198 }
199}
200
201/// Product Quantization configuration
202#[derive(Debug, Clone, Serialize, Deserialize)]
203pub struct PQConfig {
204 /// Number of sub-vectors M (dim must be divisible by M)
205 pub num_subvectors: usize,
206 /// Bits per code (8 = 256 centroids per sub-vector)
207 pub bits_per_code: u8,
208 /// Number of training samples for codebook
209 pub train_sample_size: usize,
210}
211
/// Documentation-only marker type for LLM-context tables.
///
/// It carries no data; the real schema is defined by the column names in the
/// `llm_columns` module.
pub struct LlmContextSchema;
215
/// Canonical column names for multimodal LLM-context tables.
///
/// These extend `LlmContextSchema` with media columns and a cross-modal
/// embedding column.
///
/// Usage: write tables whose Parquet schema contains these column names next
/// to `llm_columns::*`. The AI-Lake SDK reads them by name, so no code-gen is
/// required.
///
/// A typical multimodal row holds:
/// - chunk_text + embedding (text)
/// - image_embedding (CLIP/SigLIP dim=512)
/// - media_uri pointing at the source image/audio/video in object storage
/// - audio_transcript when the source is audio/video
/// - media_caption produced by a captioning model
pub mod multimodal_columns {
    /// URI of the raw media asset in object storage (s3://, gs://, az://,
    /// https://). AI-Lake is NOT a blob store: keep the media external and
    /// store only its URI in this column.
    pub const MEDIA_URI: &str = "media_uri";
    /// MIME type of the media asset (e.g. "image/jpeg", "audio/mpeg",
    /// "video/mp4").
    pub const MEDIA_MIME: &str = "media_mime";
    /// Human-readable caption produced by a vision/audio model (e.g. BLIP-2,
    /// Whisper).
    pub const MEDIA_CAPTION: &str = "media_caption";
    /// Image embedding column (e.g. CLIP ViT-B/32, SigLIP dim=512).
    /// Physical type is FIXED_LEN_BYTE_ARRAY (F16), the same as the text
    /// `embedding` column.
    pub const IMAGE_EMBEDDING: &str = "image_embedding";
    /// Transcript of the spoken content of an audio or video asset (Whisper
    /// output).
    pub const AUDIO_TRANSCRIPT: &str = "audio_transcript";
    /// Base64-encoded thumbnail (JPEG, ≤ 64×64 px) for inline LLM context.
    /// Lets multimodal LLMs receive a visual preview without fetching
    /// `media_uri`.
    pub const THUMBNAIL_B64: &str = "thumbnail_b64";
}
245
/// Documentation-only marker type for multimodal LLM-context tables.
///
/// It carries no data; the real schema is defined by the column names in the
/// `multimodal_columns` module.
///
/// A multimodal table is the union of the `llm_columns::*` fields (text +
/// embeddings) and the `multimodal_columns::*` fields (media URI, MIME,
/// caption, image_embedding, audio_transcript, thumbnail_b64).
///
/// Example Arrow schema (abridged):
/// ```text
/// chunk_id:          Utf8
/// chunk_text:        Utf8
/// embedding:         FixedSizeBinary(3072)   -- text,  F16, dim=1536
/// image_embedding:   FixedSizeBinary(1024)   -- image, F16, dim=512
/// media_uri:         Utf8
/// media_mime:        Utf8
/// media_caption:     Utf8
/// audio_transcript:  Utf8
/// thumbnail_b64:     Utf8
/// ```
pub struct MultimodalContextSchema;
266
267// ── Phase 9 — Agent / Episodic Memory ────────────────────────────────────────
268
269/// Outcome of a tool call recorded in a `ToolCallSchema` table.
270#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
271#[serde(rename_all = "snake_case")]
272pub enum ToolCallOutcome {
273 Success,
274 Failure,
275 Timeout,
276}
277
278impl ToolCallOutcome {
279 pub fn as_str(self) -> &'static str {
280 match self {
281 Self::Success => "success",
282 Self::Failure => "failure",
283 Self::Timeout => "timeout",
284 }
285 }
286}
287
288impl std::fmt::Display for ToolCallOutcome {
289 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
290 f.write_str(self.as_str())
291 }
292}
293
294impl std::str::FromStr for ToolCallOutcome {
295 type Err = AilakeError;
296 fn from_str(s: &str) -> AilakeResult<Self> {
297 match s {
298 "success" => Ok(Self::Success),
299 "failure" => Ok(Self::Failure),
300 "timeout" => Ok(Self::Timeout),
301 other => Err(AilakeError::InvalidArgument(format!(
302 "unknown ToolCallOutcome '{other}' (valid: success, failure, timeout)"
303 ))),
304 }
305 }
306}
307
/// Canonical column names for agent tool-call history tables.
///
/// One row describes one tool invocation: which agent made it, in which
/// session, the inputs and outputs as JSON, the outcome and the latency. The
/// `embedding` column (`llm_columns::EMBEDDING`) holds a vector over the
/// concatenated `tool_name + tool_input_json` text, which makes past tool
/// calls semantically searchable ("when did the agent call X in contexts
/// similar to Y?").
///
/// Usage: add these columns next to `llm_columns::*` in the Arrow schema of a
/// `ToolCallSchema` table.
pub mod tool_call_columns {
    /// UUID of the agent instance that performed the call.
    pub const AGENT_ID: &str = "agent_id";
    /// UUID of the conversation / task session.
    pub const SESSION_ID: &str = "session_id";
    /// Zero-based position of this tool call within its session.
    pub const STEP_INDEX: &str = "step_index";
    /// Name of the invoked tool (e.g. "web_search", "code_exec").
    pub const TOOL_NAME: &str = "tool_name";
    /// Input arguments passed to the tool, serialized as JSON.
    pub const TOOL_INPUT_JSON: &str = "tool_input_json";
    /// Output returned by the tool, serialized as JSON (or the error message
    /// on failure).
    pub const TOOL_OUTPUT_JSON: &str = "tool_output_json";
    /// Outcome of the call: "success" | "failure" | "timeout".
    /// The `ToolCallOutcome` enum gives typed access to this value.
    pub const OUTCOME: &str = "outcome";
    /// Wall-clock latency of the tool call, in milliseconds.
    pub const LATENCY_MS: &str = "latency_ms";
}
337
/// Documentation-only marker type for agent tool-call history tables
/// (Phase 9).
///
/// It carries no data; the real schema is defined by the column names in the
/// `tool_call_columns` module.
///
/// A tool-call table builds on `LlmContextSchema` by adding agent identity
/// and invocation metadata, so an agent's history can be searched
/// semantically:
///
/// ```text
/// agent_id:         Utf8       -- UUID string
/// session_id:       Utf8       -- UUID string
/// step_index:       UInt32
/// tool_name:        Utf8
/// tool_input_json:  Utf8
/// tool_output_json: Utf8
/// outcome:          Utf8       -- "success" | "failure" | "timeout"
/// latency_ms:       UInt32
/// embedding:        FixedSizeBinary(N)  -- F16, over tool_name+tool_input_json
/// ```
///
/// Recommended index: a single HNSW over `embedding` (text, cosine).
/// For isolated per-agent search, partition by `agent_id` through
/// `VectorStoragePolicy`.
pub struct ToolCallSchema;