Skip to main content

ailake_core/
schema.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2use crate::types::{VectorMetric, VectorPrecision};
3use serde::{Deserialize, Serialize};
4
/// Well-known column names used by LLM-context tables.
///
/// `ContextAssembler` reads columns by these names, so each constant is the
/// exact string to use when addressing the corresponding column.
pub mod llm_columns {
    // NOTE: the grouping below is purely by name, for readability; it has no
    // effect on behavior.

    // Chunk / document identifiers and chunk position.
    pub const CHUNK_ID: &str = "chunk_id";
    pub const DOCUMENT_ID: &str = "document_id";
    pub const CHUNK_INDEX: &str = "chunk_index";
    pub const TOTAL_CHUNKS: &str = "total_chunks";

    // Text payload and neighbouring context.
    pub const CHUNK_TEXT: &str = "chunk_text";
    pub const PRECEDING_CONTEXT: &str = "preceding_context";
    pub const FOLLOWING_CONTEXT: &str = "following_context";

    // Titles, section paths and summaries.
    pub const DOCUMENT_TITLE: &str = "document_title";
    pub const SECTION_PATH: &str = "section_path";
    pub const DOCUMENT_SUMMARY: &str = "document_summary";
    pub const CHUNK_SUMMARY: &str = "chunk_summary";

    // Source location and dates.
    pub const SOURCE_URI: &str = "source_uri";
    pub const PAGE_NUMBER: &str = "page_number";
    pub const CREATED_AT: &str = "created_at";
    pub const DOCUMENT_DATE: &str = "document_date";

    // Vector columns.
    pub const EMBEDDING: &str = "embedding";
    pub const CONTEXT_EMBEDDING: &str = "context_embedding";
}
26
27/// Vector storage configuration applied at table creation time.
28/// Stored in Iceberg metadata.json properties.
29#[derive(Debug, Clone, Serialize, Deserialize)]
30pub struct VectorStoragePolicy {
31    pub column_name: String,
32    pub dim: u32,
33    pub metric: VectorMetric,
34    pub precision: VectorPrecision,
35    pub pq: Option<PQConfig>,
36    pub keep_raw_for_reranking: bool,
37    /// Normalize each input vector to unit L2 length before indexing.
38    /// Enables the NormalizedCosine fast path in HNSW: distance = 1 - dot(a, b),
39    /// no sqrt, ~2× faster distance computation. Semantics unchanged — same top-k
40    /// results as Cosine. Most embedding models (OpenAI, Cohere, etc.) produce
41    /// nearly-unit vectors; enabling this adds negligible write overhead.
42    #[serde(default)]
43    pub pre_normalize: bool,
44    /// HNSW M parameter — connections per node. `None` = default (16).
45    /// Higher M → better recall, more memory, slower build.
46    /// Recommended values: 8 (low-memory), 16 (default), 32 (high-recall), 64 (max).
47    #[serde(default)]
48    pub hnsw_m: Option<u32>,
49    /// HNSW ef_construction — candidate pool size during build. `None` = default (150).
50    /// Higher ef_construction → better graph quality, slower build.
51    /// Recommended values: 100 (fast), 150 (default), 200 (quality), 400 (max quality).
52    #[serde(default)]
53    pub hnsw_ef_construction: Option<u32>,
54    /// RaBitQ configuration. When set, the file writer embeds a RaBitQ flat index
55    /// instead of HNSW. Best for workloads that require extreme storage compression
56    /// (1 bit/dim = 16× smaller than F16) with better recall than naive binary
57    /// quantization. Use `rerank_factor ≥ 3` at search time for full precision.
58    #[serde(default)]
59    pub rabitq: Option<RaBitQConfig>,
60}
61
62/// RaBitQ quantization configuration.
63#[derive(Debug, Clone, Serialize, Deserialize)]
64pub struct RaBitQConfig {
65    /// Seed for the random rotation matrix. Same seed → identical quantization
66    /// across shards, enabling consistent distance comparisons.
67    #[serde(default)]
68    pub seed: u64,
69    /// Keep raw F16 vectors alongside binary codes for exact reranking.
70    /// Disabling this halves the storage of the index section but prevents
71    /// reranking — only use when storage is the primary constraint.
72    #[serde(default = "default_keep_raw")]
73    pub keep_raw: bool,
74}
75
/// Serde default provider for `RaBitQConfig::keep_raw`.
///
/// Referenced by name from `#[serde(default = "default_keep_raw")]`, so the
/// function name and signature must not change.
fn default_keep_raw() -> bool {
    // Raw vectors are kept unless the serialized config says otherwise.
    true
}
79
80impl VectorStoragePolicy {
81    pub fn default_f16(column: &str, dim: u32, metric: VectorMetric) -> Self {
82        Self {
83            column_name: column.to_string(),
84            dim,
85            metric,
86            precision: VectorPrecision::F16,
87            pq: None,
88            keep_raw_for_reranking: true,
89            pre_normalize: false,
90            hnsw_m: None,
91            hnsw_ef_construction: None,
92            rabitq: None,
93        }
94    }
95}
96
97/// Product Quantization configuration
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct PQConfig {
100    /// Number of sub-vectors M (dim must be divisible by M)
101    pub num_subvectors: usize,
102    /// Bits per code (8 = 256 centroids per sub-vector)
103    pub bits_per_code: u8,
104    /// Number of training samples for codebook
105    pub train_sample_size: usize,
106}
107
/// Marker type that exists for documentation purposes only.
///
/// It carries no data: the actual LLM-context schema is enforced through the
/// column names declared in the `llm_columns` module.
///
/// The common traits are derived so this public type can be debug-printed,
/// copied, compared and default-constructed like any other zero-sized marker.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct LlmContextSchema;