// ailake_core/schema.rs

1use crate::types::{VectorMetric, VectorPrecision};
2use serde::{Deserialize, Serialize};
3
/// Canonical column names used by LLM-context tables.
///
/// `ContextAssembler` looks columns up by these exact names, so the string
/// values here are part of the table contract and must not be altered.
pub mod llm_columns {
    /// Name of the `chunk_id` column.
    pub const CHUNK_ID: &str = "chunk_id";
    /// Name of the `document_id` column.
    pub const DOCUMENT_ID: &str = "document_id";
    /// Name of the `chunk_index` column.
    pub const CHUNK_INDEX: &str = "chunk_index";
    /// Name of the `total_chunks` column.
    pub const TOTAL_CHUNKS: &str = "total_chunks";
    /// Name of the `chunk_text` column.
    pub const CHUNK_TEXT: &str = "chunk_text";
    /// Name of the `document_title` column.
    pub const DOCUMENT_TITLE: &str = "document_title";
    /// Name of the `section_path` column.
    pub const SECTION_PATH: &str = "section_path";
    /// Name of the `preceding_context` column.
    pub const PRECEDING_CONTEXT: &str = "preceding_context";
    /// Name of the `following_context` column.
    pub const FOLLOWING_CONTEXT: &str = "following_context";
    /// Name of the `document_summary` column.
    pub const DOCUMENT_SUMMARY: &str = "document_summary";
    /// Name of the `chunk_summary` column.
    pub const CHUNK_SUMMARY: &str = "chunk_summary";
    /// Name of the `source_uri` column.
    pub const SOURCE_URI: &str = "source_uri";
    /// Name of the `page_number` column.
    pub const PAGE_NUMBER: &str = "page_number";
    /// Name of the `created_at` column.
    pub const CREATED_AT: &str = "created_at";
    /// Name of the `document_date` column.
    pub const DOCUMENT_DATE: &str = "document_date";
    /// Name of the `embedding` column.
    pub const EMBEDDING: &str = "embedding";
    /// Name of the `context_embedding` column.
    pub const CONTEXT_EMBEDDING: &str = "context_embedding";
}
25
26/// Vector storage configuration applied at table creation time.
27/// Stored in Iceberg metadata.json properties.
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct VectorStoragePolicy {
30    pub column_name: String,
31    pub dim: u32,
32    pub metric: VectorMetric,
33    pub precision: VectorPrecision,
34    pub pq: Option<PQConfig>,
35    pub keep_raw_for_reranking: bool,
36}
37
38impl VectorStoragePolicy {
39    pub fn default_f16(column: &str, dim: u32, metric: VectorMetric) -> Self {
40        Self {
41            column_name: column.to_string(),
42            dim,
43            metric,
44            precision: VectorPrecision::F16,
45            pq: None,
46            keep_raw_for_reranking: true,
47        }
48    }
49}
50
51/// Product Quantization configuration
52#[derive(Debug, Clone, Serialize, Deserialize)]
53pub struct PQConfig {
54    /// Number of sub-vectors M (dim must be divisible by M)
55    pub num_subvectors: usize,
56    /// Bits per code (8 = 256 centroids per sub-vector)
57    pub bits_per_code: u8,
58    /// Number of training samples for codebook
59    pub train_sample_size: usize,
60}
61
/// Marker type that exists for documentation purposes only.
///
/// It carries no data: the actual LLM-context schema is enforced through the
/// column names declared in the `llm_columns` module. The common traits are
/// derived so the marker can be printed, copied, compared and
/// default-constructed like any other zero-sized public type.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct LlmContextSchema;