ailake_core/schema.rs
1// SPDX-License-Identifier: MIT OR Apache-2.0
2use crate::types::{EmbeddingModelInfo, VectorMetric, VectorPrecision};
3use serde::{Deserialize, Serialize};
4
/// Well-known column names used by LLM-context tables.
///
/// `ContextAssembler` looks columns up by exactly these names, so the string
/// values are part of the on-disk contract and must not be altered.
pub mod llm_columns {
    // NOTE(review): the grouping below is by apparent purpose, inferred from
    // the names only — confirm against the table writer before relying on it.

    // --- Identity and position -------------------------------------------
    /// Name of the `chunk_id` column.
    pub const CHUNK_ID: &str = "chunk_id";
    /// Name of the `document_id` column.
    pub const DOCUMENT_ID: &str = "document_id";
    /// Name of the `chunk_index` column.
    pub const CHUNK_INDEX: &str = "chunk_index";
    /// Name of the `total_chunks` column.
    pub const TOTAL_CHUNKS: &str = "total_chunks";

    // --- Text and surrounding context ------------------------------------
    /// Name of the `chunk_text` column.
    pub const CHUNK_TEXT: &str = "chunk_text";
    /// Name of the `document_title` column.
    pub const DOCUMENT_TITLE: &str = "document_title";
    /// Name of the `section_path` column.
    pub const SECTION_PATH: &str = "section_path";
    /// Name of the `preceding_context` column.
    pub const PRECEDING_CONTEXT: &str = "preceding_context";
    /// Name of the `following_context` column.
    pub const FOLLOWING_CONTEXT: &str = "following_context";

    // --- Summaries -------------------------------------------------------
    /// Name of the `document_summary` column.
    pub const DOCUMENT_SUMMARY: &str = "document_summary";
    /// Name of the `chunk_summary` column.
    pub const CHUNK_SUMMARY: &str = "chunk_summary";

    // --- Provenance and dates --------------------------------------------
    /// Name of the `source_uri` column.
    pub const SOURCE_URI: &str = "source_uri";
    /// Name of the `page_number` column.
    pub const PAGE_NUMBER: &str = "page_number";
    /// Name of the `created_at` column.
    pub const CREATED_AT: &str = "created_at";
    /// Name of the `document_date` column.
    pub const DOCUMENT_DATE: &str = "document_date";

    // --- Vectors ---------------------------------------------------------
    /// Name of the `embedding` column.
    pub const EMBEDDING: &str = "embedding";
    /// Name of the `context_embedding` column.
    pub const CONTEXT_EMBEDDING: &str = "context_embedding";
}
26
27/// Vector storage configuration applied at table creation time.
28/// Stored in Iceberg metadata.json properties.
29#[derive(Debug, Clone, Serialize, Deserialize)]
30pub struct VectorStoragePolicy {
31 pub column_name: String,
32 pub dim: u32,
33 pub metric: VectorMetric,
34 pub precision: VectorPrecision,
35 pub pq: Option<PQConfig>,
36 pub keep_raw_for_reranking: bool,
37 /// Normalize each input vector to unit L2 length before indexing.
38 /// Enables the NormalizedCosine fast path in HNSW: distance = 1 - dot(a, b),
39 /// no sqrt, ~2× faster distance computation. Semantics unchanged — same top-k
40 /// results as Cosine. Most embedding models (OpenAI, Cohere, etc.) produce
41 /// nearly-unit vectors; enabling this adds negligible write overhead.
42 #[serde(default)]
43 pub pre_normalize: bool,
44 /// HNSW M parameter — connections per node. `None` = default (16).
45 /// Higher M → better recall, more memory, slower build.
46 /// Recommended values: 8 (low-memory), 16 (default), 32 (high-recall), 64 (max).
47 #[serde(default)]
48 pub hnsw_m: Option<u32>,
49 /// HNSW ef_construction — candidate pool size during build. `None` = default (150).
50 /// Higher ef_construction → better graph quality, slower build.
51 /// Recommended values: 100 (fast), 150 (default), 200 (quality), 400 (max quality).
52 #[serde(default)]
53 pub hnsw_ef_construction: Option<u32>,
54 /// IVF-PQ residual encoding — train PQ on per-cluster residuals (vec - coarse_centroid).
55 /// Same bytes/vector, ~2-4pp better recall@10. Only applies when IVF-PQ index is used.
56 #[serde(default)]
57 pub ivf_residual: bool,
58 /// Optional embedding model metadata. When set:
59 /// - Stored as `ailake.embedding-model` in Iceberg table properties.
60 /// - Validated on every `write_batch`: dim mismatch → hard error; name mismatch → warning.
61 /// - Required for `migrate_embeddings` to track the model transition.
62 #[serde(default, skip_serializing_if = "Option::is_none")]
63 pub embedding_model: Option<EmbeddingModelInfo>,
64}
65
66impl VectorStoragePolicy {
67 pub fn default_f16(column: &str, dim: u32, metric: VectorMetric) -> Self {
68 Self {
69 column_name: column.to_string(),
70 dim,
71 metric,
72 precision: VectorPrecision::F16,
73 pq: None,
74 keep_raw_for_reranking: true,
75 pre_normalize: false,
76 hnsw_m: None,
77 hnsw_ef_construction: None,
78 ivf_residual: false,
79 embedding_model: None,
80 }
81 }
82}
83
84/// Product Quantization configuration
85#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct PQConfig {
87 /// Number of sub-vectors M (dim must be divisible by M)
88 pub num_subvectors: usize,
89 /// Bits per code (8 = 256 centroids per sub-vector)
90 pub bits_per_code: u8,
91 /// Number of training samples for codebook
92 pub train_sample_size: usize,
93}
94
/// Documentation-only marker type for the LLM-context table schema.
///
/// It carries no data; the schema itself is enforced through the column
/// names declared in the `llm_columns` module.
pub struct LlmContextSchema;