Skip to main content

ailake_core/
schema.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2use crate::types::{EmbeddingModelInfo, VectorMetric, VectorModality, VectorPrecision};
3use serde::{Deserialize, Serialize};
4
/// Well-known column names shared by LLM-context tables.
///
/// `ContextAssembler` looks columns up by these names.
pub mod llm_columns {
    // Identifier and ordering columns.
    pub const CHUNK_ID: &str = "chunk_id";
    pub const DOCUMENT_ID: &str = "document_id";
    pub const CHUNK_INDEX: &str = "chunk_index";
    pub const TOTAL_CHUNKS: &str = "total_chunks";

    // Text and document-structure columns.
    pub const CHUNK_TEXT: &str = "chunk_text";
    pub const DOCUMENT_TITLE: &str = "document_title";
    pub const SECTION_PATH: &str = "section_path";

    // Surrounding-context and summary columns.
    pub const PRECEDING_CONTEXT: &str = "preceding_context";
    pub const FOLLOWING_CONTEXT: &str = "following_context";
    pub const DOCUMENT_SUMMARY: &str = "document_summary";
    pub const CHUNK_SUMMARY: &str = "chunk_summary";

    // Source and date columns.
    pub const SOURCE_URI: &str = "source_uri";
    pub const PAGE_NUMBER: &str = "page_number";
    pub const CREATED_AT: &str = "created_at";
    pub const DOCUMENT_DATE: &str = "document_date";

    // Embedding columns.
    pub const EMBEDDING: &str = "embedding";
    pub const CONTEXT_EMBEDDING: &str = "context_embedding";
}
26
27/// Vector storage configuration applied at table creation time.
28/// Stored in Iceberg metadata.json properties.
29#[derive(Debug, Clone, Serialize, Deserialize)]
30pub struct VectorStoragePolicy {
31    pub column_name: String,
32    pub dim: u32,
33    pub metric: VectorMetric,
34    pub precision: VectorPrecision,
35    pub pq: Option<PQConfig>,
36    pub keep_raw_for_reranking: bool,
37    /// Normalize each input vector to unit L2 length before indexing.
38    /// Enables the NormalizedCosine fast path in HNSW: distance = 1 - dot(a, b),
39    /// no sqrt, ~2× faster distance computation. Semantics unchanged — same top-k
40    /// results as Cosine. Most embedding models (OpenAI, Cohere, etc.) produce
41    /// nearly-unit vectors; enabling this adds negligible write overhead.
42    #[serde(default)]
43    pub pre_normalize: bool,
44    /// HNSW M parameter — connections per node. `None` = default (16).
45    /// Higher M → better recall, more memory, slower build.
46    /// Recommended values: 8 (low-memory), 16 (default), 32 (high-recall), 64 (max).
47    #[serde(default)]
48    pub hnsw_m: Option<u32>,
49    /// HNSW ef_construction — candidate pool size during build. `None` = default (150).
50    /// Higher ef_construction → better graph quality, slower build.
51    /// Recommended values: 100 (fast), 150 (default), 200 (quality), 400 (max quality).
52    #[serde(default)]
53    pub hnsw_ef_construction: Option<u32>,
54    /// IVF-PQ residual encoding — train PQ on per-cluster residuals (vec - coarse_centroid).
55    /// Same bytes/vector, ~2-4pp better recall@10. Only applies when IVF-PQ index is used.
56    #[serde(default)]
57    pub ivf_residual: bool,
58    /// Optional embedding model metadata. When set:
59    /// - Stored as `ailake.embedding-model` in Iceberg table properties.
60    /// - Validated on every `write_batch`: dim mismatch → hard error; name mismatch → warning.
61    /// - Required for `migrate_embeddings` to track the model transition.
62    #[serde(default, skip_serializing_if = "Option::is_none")]
63    pub embedding_model: Option<EmbeddingModelInfo>,
64    /// Modality tag for this vector column (text / image / audio / video).
65    /// Stored as `ailake.modality-<col>` in Iceberg properties and Parquet KV metadata.
66    /// Allows readers to select the correct HNSW by modality without reading data.
67    #[serde(default, skip_serializing_if = "Option::is_none")]
68    pub modality: Option<VectorModality>,
69}
70
71impl VectorStoragePolicy {
72    pub fn default_f16(column: &str, dim: u32, metric: VectorMetric) -> Self {
73        Self {
74            column_name: column.to_string(),
75            dim,
76            metric,
77            precision: VectorPrecision::F16,
78            pq: None,
79            keep_raw_for_reranking: true,
80            pre_normalize: false,
81            hnsw_m: None,
82            hnsw_ef_construction: None,
83            ivf_residual: false,
84            embedding_model: None,
85            modality: None,
86        }
87    }
88}
89
90/// Product Quantization configuration
91#[derive(Debug, Clone, Serialize, Deserialize)]
92pub struct PQConfig {
93    /// Number of sub-vectors M (dim must be divisible by M)
94    pub num_subvectors: usize,
95    /// Bits per code (8 = 256 centroids per sub-vector)
96    pub bits_per_code: u8,
97    /// Number of training samples for codebook
98    pub train_sample_size: usize,
99}
100
/// Marker type that exists for documentation purposes only.
///
/// It carries no data; the actual schema is enforced through the column
/// names declared in the `llm_columns` module.
///
/// The common traits are derived so the marker can be printed, copied,
/// compared and default-constructed like any other public value type.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct LlmContextSchema;
104
/// Well-known column names for multimodal LLM-context tables.
///
/// These add media and cross-modal embedding columns on top of what
/// `LlmContextSchema` describes.
///
/// To use them, write tables whose Parquet schema contains these column names
/// next to the ones in `llm_columns::*`. The AI-Lake SDK reads them by name, so
/// no code generation is needed.
///
/// A typical multimodal row holds:
/// - `chunk_text` plus `embedding` (text)
/// - `image_embedding` (CLIP/SigLIP, dim=512)
/// - `media_uri` pointing at the source image/audio/video in object storage
/// - `audio_transcript` when the source is audio or video
/// - `media_caption` produced by a captioning model
pub mod multimodal_columns {
    // Media asset columns.

    /// Location of the raw media asset in object storage
    /// (s3://, gs://, az://, https://).
    ///
    /// Only the URI is stored here: AI-Lake is NOT a blob store, so the media
    /// itself must live externally.
    pub const MEDIA_URI: &str = "media_uri";
    /// MIME type of the asset, e.g. "image/jpeg", "audio/mpeg", "video/mp4".
    pub const MEDIA_MIME: &str = "media_mime";
    /// Thumbnail encoded as Base64 (JPEG, ≤ 64×64 px) for inline LLM context.
    ///
    /// Gives multimodal LLMs a visual preview without having to fetch `media_uri`.
    pub const THUMBNAIL_B64: &str = "thumbnail_b64";

    // Text derived from the media.

    /// Human-readable caption produced by a vision/audio model
    /// (e.g. BLIP-2, Whisper).
    pub const MEDIA_CAPTION: &str = "media_caption";
    /// Transcript of the spoken content of an audio or video asset
    /// (Whisper output).
    pub const AUDIO_TRANSCRIPT: &str = "audio_transcript";

    // Embedding columns.

    /// Image embedding column (e.g. CLIP ViT-B/32, SigLIP dim=512).
    ///
    /// Physical type: FIXED_LEN_BYTE_ARRAY (F16), the same as the text
    /// `embedding` column.
    pub const IMAGE_EMBEDDING: &str = "image_embedding";
}
134
/// Marker type for multimodal LLM-context tables.
///
/// It carries no data; the actual schema is enforced through the column
/// names declared in the `multimodal_columns` module.
///
/// A multimodal table combines every `llm_columns::*` field (text + embeddings)
/// with the `multimodal_columns::*` fields (media URI, MIME, caption,
/// image_embedding, audio_transcript, thumbnail_b64).
///
/// Example Arrow schema (abridged):
/// ```text
/// chunk_id:          Utf8
/// chunk_text:        Utf8
/// embedding:         FixedSizeBinary(3072)   -- text, F16, dim=1536
/// image_embedding:   FixedSizeBinary(1024)   -- image, F16, dim=512
/// media_uri:         Utf8
/// media_mime:        Utf8
/// media_caption:     Utf8
/// audio_transcript:  Utf8
/// thumbnail_b64:     Utf8
/// ```
///
/// The common traits are derived so the marker can be printed, copied,
/// compared and default-constructed like any other public value type.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct MultimodalContextSchema;