Skip to main content

fraiseql_core/compiler/fact_table/
mod.rs

1//! Fact Table Introspection Module
2//!
3//! This module provides functionality to detect and introspect fact tables following
4//! FraiseQL's analytics architecture:
5//!
6//! # Fact Table Pattern
7//!
8//! - **Table naming**: `tf_*` prefix (table fact)
9//! - **Measures**: SQL columns with numeric types (INT, BIGINT, DECIMAL, FLOAT) - for fast
10//!   aggregation
11//! - **Dimensions**: JSONB `data` column - for flexible GROUP BY
12//! - **Denormalized filters**: Indexed SQL columns (`customer_id`, `occurred_at`) - for fast WHERE
13//!
14//! # No Joins Principle
15//!
16//! FraiseQL does NOT support joins. All dimensional data must be denormalized into the
17//! `data` JSONB column at ETL time (managed by DBA/data team, not FraiseQL).
18//!
19//! # Example Fact Table
20//!
21//! ```sql
22//! CREATE TABLE tf_sales (
23//!     id BIGSERIAL PRIMARY KEY,
24//!     -- Measures (SQL columns for fast aggregation)
25//!     revenue DECIMAL(10,2) NOT NULL,
26//!     quantity INT NOT NULL,
27//!     cost DECIMAL(10,2) NOT NULL,
28//!     -- Dimensions (JSONB for flexible grouping)
29//!     data JSONB NOT NULL,
30//!     -- Denormalized filters (indexed for fast WHERE)
31//!     customer_id UUID NOT NULL,
32//!     product_id UUID NOT NULL,
33//!     occurred_at TIMESTAMPTZ NOT NULL,
34//!     created_at TIMESTAMPTZ DEFAULT NOW()
35//! );
36//! ```
37
38use serde::{Deserialize, Serialize};
39
40mod detector;
41// Re-export from fraiseql-db to avoid duplication
42pub use fraiseql_db::{introspector::DatabaseIntrospector, types::DatabaseType};
43
44pub use self::detector::FactTableDetector;
45
46#[cfg(test)]
47mod tests;
48
49/// Metadata about a fact table structure
50#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
51pub struct FactTableMetadata {
52    /// Table name (e.g., "`tf_sales`")
53    pub table_name:           String,
54    /// Measures (aggregatable numeric columns)
55    pub measures:             Vec<MeasureColumn>,
56    /// Dimension column (JSONB)
57    pub dimensions:           DimensionColumn,
58    /// Denormalized filter columns
59    pub denormalized_filters: Vec<FilterColumn>,
60    /// Calendar dimensions for optimized temporal aggregations
61    #[serde(default)]
62    pub calendar_dimensions:  Vec<CalendarDimension>,
63}
64
65/// A measure column (aggregatable numeric type)
66#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
67pub struct MeasureColumn {
68    /// Column name (e.g., "revenue")
69    pub name:     String,
70    /// SQL data type
71    pub sql_type: SqlType,
72    /// Is nullable
73    pub nullable: bool,
74}
75
76/// SQL data types
77#[non_exhaustive]
78#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
79pub enum SqlType {
80    /// SMALLINT, INT, INTEGER
81    Int,
82    /// BIGINT
83    BigInt,
84    /// DECIMAL, NUMERIC
85    Decimal,
86    /// REAL, FLOAT, DOUBLE PRECISION
87    Float,
88    /// JSONB (PostgreSQL)
89    Jsonb,
90    /// JSON (MySQL, SQL Server)
91    Json,
92    /// TEXT, VARCHAR
93    Text,
94    /// UUID
95    Uuid,
96    /// TIMESTAMP, TIMESTAMPTZ
97    Timestamp,
98    /// DATE
99    Date,
100    /// BOOLEAN
101    Boolean,
102    /// Other types
103    Other(String),
104}
105
106/// Dimension column (JSONB)
107#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
108pub struct DimensionColumn {
109    /// Column name (default: "dimensions" for fact tables)
110    pub name:  String,
111    /// Detected dimension paths (optional, extracted from sample data)
112    pub paths: Vec<DimensionPath>,
113}
114
115/// A dimension path within the JSONB column
116#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
117pub struct DimensionPath {
118    /// Path name (e.g., "category")
119    pub name:      String,
120    /// JSON path (e.g., "dimensions->>'category'" for PostgreSQL)
121    pub json_path: String,
122    /// Data type hint
123    pub data_type: String,
124}
125
126/// Calendar dimension metadata (pre-computed temporal fields)
127///
128/// Calendar dimensions provide 10-20x performance improvements for temporal aggregations
129/// by using pre-computed JSONB columns (`date_info`, `month_info`, etc.) instead of runtime
130/// `DATE_TRUNC` operations.
131///
132/// # Multi-Column Pattern
133///
134/// - 7 JSONB columns: `date_info`, `week_info`, `month_info`, `quarter_info`, `semester_info`,
135///   `year_info`, `decade_info`
136/// - Each contains hierarchical temporal buckets (e.g., `date_info` has: date, week, month,
137///   quarter, year)
138/// - Pre-populated by user's ETL (FraiseQL reads, doesn't populate)
139///
140/// # Example
141///
142/// ```json
143/// {
144///   "date": "2024-03-15",
145///   "week": 11,
146///   "month": 3,
147///   "quarter": 1,
148///   "semester": 1,
149///   "year": 2024
150/// }
151/// ```
152#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
153pub struct CalendarDimension {
154    /// Source timestamp column (e.g., "`occurred_at`")
155    pub source_column: String,
156
157    /// Available calendar granularity columns
158    pub granularities: Vec<CalendarGranularity>,
159}
160
161/// Calendar granularity column with pre-computed fields
162#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
163pub struct CalendarGranularity {
164    /// Column name (e.g., "`date_info`", "`month_info`")
165    pub column_name: String,
166
167    /// Temporal buckets available in this column
168    pub buckets: Vec<CalendarBucket>,
169}
170
171/// Pre-computed temporal bucket in calendar JSONB
172#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
173pub struct CalendarBucket {
174    /// JSON path key (e.g., "date", "month", "quarter")
175    pub json_key: String,
176
177    /// Corresponding `TemporalBucket` enum
178    pub bucket_type: crate::compiler::aggregate_types::TemporalBucket,
179
180    /// Data type (e.g., "date", "integer")
181    pub data_type: String,
182}
183
184/// A denormalized filter column
185#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
186pub struct FilterColumn {
187    /// Column name (e.g., "`customer_id`")
188    pub name:     String,
189    /// SQL data type
190    pub sql_type: SqlType,
191    /// Is indexed (for performance)
192    pub indexed:  bool,
193}
194
195/// Aggregation strategy for fact tables
196///
197/// Determines how fact table data is updated and structured.
198///
199/// # Strategies
200///
201/// - **Incremental**: New records added (e.g., transaction logs)
202/// - **`AccumulatingSnapshot`**: Records updated with new events (e.g., order milestones)
203/// - **`PeriodicSnapshot`**: Complete snapshot at regular intervals (e.g., daily inventory)
204#[non_exhaustive]
205#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
206pub enum AggregationStrategy {
207    /// New records are appended (e.g., transaction logs, event streams)
208    #[serde(rename = "incremental")]
209    #[default]
210    Incremental,
211
212    /// Records are updated with new events (e.g., order status changes)
213    #[serde(rename = "accumulating_snapshot")]
214    AccumulatingSnapshot,
215
216    /// Complete snapshots at regular intervals (e.g., daily inventory levels)
217    #[serde(rename = "periodic_snapshot")]
218    PeriodicSnapshot,
219}
220
221/// Explicit fact table schema declaration
222///
223/// Allows users to explicitly declare fact table metadata instead of relying on
224/// auto-detection. Explicit declarations take precedence over auto-detected metadata.
225///
226/// # Example
227///
228/// ```json
229/// {
230///   "name": "tf_sales",
231///   "measures": ["amount", "quantity", "discount"],
232///   "dimensions": ["product_id", "region_id", "date_id"],
233///   "primary_key": "id",
234///   "metadata": {
235///     "aggregation_strategy": "incremental",
236///     "grain": ["date", "product", "region"]
237///   }
238/// }
239/// ```
240#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
241pub struct FactTableDeclaration {
242    /// Fact table name (e.g., "`tf_sales`")
243    pub name: String,
244
245    /// Measure column names (aggregatable numeric fields)
246    pub measures: Vec<String>,
247
248    /// Dimension column names or paths within JSONB
249    pub dimensions: Vec<String>,
250
251    /// Primary key column name
252    pub primary_key: String,
253
254    /// Optional metadata about the fact table
255    pub metadata: Option<FactTableDeclarationMetadata>,
256}
257
258/// Metadata for explicitly declared fact tables
259#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
260pub struct FactTableDeclarationMetadata {
261    /// Aggregation strategy (how data is updated)
262    #[serde(default)]
263    pub aggregation_strategy: AggregationStrategy,
264
265    /// Grain of the fact table (combination of dimensions that makes a unique record)
266    pub grain: Vec<String>,
267
268    /// Column containing snapshot date (for periodic snapshots)
269    pub snapshot_date_column: Option<String>,
270
271    /// Whether this is a slowly changing dimension
272    #[serde(default)]
273    pub is_slowly_changing_dimension: bool,
274}
275
276impl SqlType {
277    /// Parse SQL type from string (database-specific)
278    pub fn from_str_postgres(type_name: &str) -> Self {
279        match type_name.to_lowercase().as_str() {
280            "smallint" | "int" | "integer" | "int2" | "int4" => Self::Int,
281            "bigint" | "int8" => Self::BigInt,
282            "decimal" | "numeric" => Self::Decimal,
283            "real" | "float" | "double precision" | "float4" | "float8" => Self::Float,
284            "jsonb" => Self::Jsonb,
285            "json" => Self::Json,
286            "text" | "varchar" | "character varying" | "char" | "character" => Self::Text,
287            "uuid" => Self::Uuid,
288            "timestamp"
289            | "timestamptz"
290            | "timestamp with time zone"
291            | "timestamp without time zone" => Self::Timestamp,
292            "date" => Self::Date,
293            "boolean" | "bool" => Self::Boolean,
294            other => Self::Other(other.to_string()),
295        }
296    }
297
298    /// Parse SQL type from string (MySQL)
299    pub fn from_str_mysql(type_name: &str) -> Self {
300        match type_name.to_lowercase().as_str() {
301            "tinyint" | "smallint" | "mediumint" | "int" | "integer" => Self::Int,
302            "bigint" => Self::BigInt,
303            "decimal" | "numeric" => Self::Decimal,
304            "float" | "double" | "real" => Self::Float,
305            "json" => Self::Json,
306            "text" | "varchar" | "char" | "tinytext" | "mediumtext" | "longtext" => Self::Text,
307            "timestamp" | "datetime" => Self::Timestamp,
308            "date" => Self::Date,
309            "boolean" | "bool" | "tinyint(1)" => Self::Boolean,
310            other => Self::Other(other.to_string()),
311        }
312    }
313
314    /// Parse SQL type from string (SQLite)
315    pub fn from_str_sqlite(type_name: &str) -> Self {
316        match type_name.to_lowercase().as_str() {
317            "integer" | "int" => Self::BigInt, // SQLite INTEGER is 64-bit
318            "real" | "double" | "float" => Self::Float,
319            "numeric" | "decimal" => Self::Decimal,
320            "text" | "varchar" | "char" => Self::Text,
321            "blob" => Self::Other("BLOB".to_string()),
322            other => Self::Other(other.to_string()),
323        }
324    }
325
326    /// Parse SQL type from string (SQL Server)
327    pub fn from_str_sqlserver(type_name: &str) -> Self {
328        match type_name.to_lowercase().as_str() {
329            "tinyint" | "smallint" | "int" => Self::Int,
330            "bigint" => Self::BigInt,
331            "decimal" | "numeric" | "money" | "smallmoney" => Self::Decimal,
332            "float" | "real" => Self::Float,
333            "nvarchar" | "varchar" | "char" | "nchar" | "text" | "ntext" => Self::Text,
334            "uniqueidentifier" => Self::Uuid,
335            "datetime" | "datetime2" | "smalldatetime" | "datetimeoffset" => Self::Timestamp,
336            "date" => Self::Date,
337            "bit" => Self::Boolean,
338            other => Self::Other(other.to_string()),
339        }
340    }
341}