fraiseql_core/compiler/fact_table/mod.rs
1//! Fact Table Introspection Module
2//!
3//! This module provides functionality to detect and introspect fact tables following
4//! FraiseQL's analytics architecture:
5//!
6//! # Fact Table Pattern
7//!
8//! - **Table naming**: `tf_*` prefix (table fact)
9//! - **Measures**: SQL columns with numeric types (INT, BIGINT, DECIMAL, FLOAT) - for fast
10//! aggregation
11//! - **Dimensions**: JSONB `data` column - for flexible GROUP BY
12//! - **Denormalized filters**: Indexed SQL columns (`customer_id`, `occurred_at`) - for fast WHERE
13//!
14//! # No Joins Principle
15//!
16//! FraiseQL does NOT support joins. All dimensional data must be denormalized into the
17//! `data` JSONB column at ETL time (managed by DBA/data team, not FraiseQL).
18//!
19//! # Example Fact Table
20//!
21//! ```sql
22//! CREATE TABLE tf_sales (
23//! id BIGSERIAL PRIMARY KEY,
24//! -- Measures (SQL columns for fast aggregation)
25//! revenue DECIMAL(10,2) NOT NULL,
26//! quantity INT NOT NULL,
27//! cost DECIMAL(10,2) NOT NULL,
28//! -- Dimensions (JSONB for flexible grouping)
29//! data JSONB NOT NULL,
30//! -- Denormalized filters (indexed for fast WHERE)
31//! customer_id UUID NOT NULL,
32//! product_id UUID NOT NULL,
33//! occurred_at TIMESTAMPTZ NOT NULL,
34//! created_at TIMESTAMPTZ DEFAULT NOW()
35//! );
36//! ```
37
38use serde::{Deserialize, Serialize};
39
40mod detector;
41// Re-export from fraiseql-db to avoid duplication
42pub use fraiseql_db::{introspector::DatabaseIntrospector, types::DatabaseType};
43
44pub use self::detector::FactTableDetector;
45
46#[cfg(test)]
47mod tests;
48
49/// Metadata about a fact table structure
50#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
51pub struct FactTableMetadata {
52 /// Table name (e.g., "`tf_sales`")
53 pub table_name: String,
54 /// Measures (aggregatable numeric columns)
55 pub measures: Vec<MeasureColumn>,
56 /// Dimension column (JSONB)
57 pub dimensions: DimensionColumn,
58 /// Denormalized filter columns
59 pub denormalized_filters: Vec<FilterColumn>,
60 /// Calendar dimensions for optimized temporal aggregations
61 #[serde(default)]
62 pub calendar_dimensions: Vec<CalendarDimension>,
63}
64
65/// A measure column (aggregatable numeric type)
66#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
67pub struct MeasureColumn {
68 /// Column name (e.g., "revenue")
69 pub name: String,
70 /// SQL data type
71 pub sql_type: SqlType,
72 /// Is nullable
73 pub nullable: bool,
74}
75
76/// SQL data types
77#[non_exhaustive]
78#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
79pub enum SqlType {
80 /// SMALLINT, INT, INTEGER
81 Int,
82 /// BIGINT
83 BigInt,
84 /// DECIMAL, NUMERIC
85 Decimal,
86 /// REAL, FLOAT, DOUBLE PRECISION
87 Float,
88 /// JSONB (PostgreSQL)
89 Jsonb,
90 /// JSON (MySQL, SQL Server)
91 Json,
92 /// TEXT, VARCHAR
93 Text,
94 /// UUID
95 Uuid,
96 /// TIMESTAMP, TIMESTAMPTZ
97 Timestamp,
98 /// DATE
99 Date,
100 /// BOOLEAN
101 Boolean,
102 /// Other types
103 Other(String),
104}
105
106/// Dimension column (JSONB)
107#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
108pub struct DimensionColumn {
109 /// Column name (default: "dimensions" for fact tables)
110 pub name: String,
111 /// Detected dimension paths (optional, extracted from sample data)
112 pub paths: Vec<DimensionPath>,
113}
114
115/// A dimension path within the JSONB column
116#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
117pub struct DimensionPath {
118 /// Path name (e.g., "category")
119 pub name: String,
120 /// JSON path (e.g., "dimensions->>'category'" for PostgreSQL)
121 pub json_path: String,
122 /// Data type hint
123 pub data_type: String,
124}
125
126/// Calendar dimension metadata (pre-computed temporal fields)
127///
128/// Calendar dimensions provide 10-20x performance improvements for temporal aggregations
129/// by using pre-computed JSONB columns (`date_info`, `month_info`, etc.) instead of runtime
130/// `DATE_TRUNC` operations.
131///
132/// # Multi-Column Pattern
133///
134/// - 7 JSONB columns: `date_info`, `week_info`, `month_info`, `quarter_info`, `semester_info`,
135/// `year_info`, `decade_info`
136/// - Each contains hierarchical temporal buckets (e.g., `date_info` has: date, week, month,
137/// quarter, year)
138/// - Pre-populated by user's ETL (FraiseQL reads, doesn't populate)
139///
140/// # Example
141///
142/// ```json
143/// {
144/// "date": "2024-03-15",
145/// "week": 11,
146/// "month": 3,
147/// "quarter": 1,
148/// "semester": 1,
149/// "year": 2024
150/// }
151/// ```
152#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
153pub struct CalendarDimension {
154 /// Source timestamp column (e.g., "`occurred_at`")
155 pub source_column: String,
156
157 /// Available calendar granularity columns
158 pub granularities: Vec<CalendarGranularity>,
159}
160
161/// Calendar granularity column with pre-computed fields
162#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
163pub struct CalendarGranularity {
164 /// Column name (e.g., "`date_info`", "`month_info`")
165 pub column_name: String,
166
167 /// Temporal buckets available in this column
168 pub buckets: Vec<CalendarBucket>,
169}
170
171/// Pre-computed temporal bucket in calendar JSONB
172#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
173pub struct CalendarBucket {
174 /// JSON path key (e.g., "date", "month", "quarter")
175 pub json_key: String,
176
177 /// Corresponding `TemporalBucket` enum
178 pub bucket_type: crate::compiler::aggregate_types::TemporalBucket,
179
180 /// Data type (e.g., "date", "integer")
181 pub data_type: String,
182}
183
184/// A denormalized filter column
185#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
186pub struct FilterColumn {
187 /// Column name (e.g., "`customer_id`")
188 pub name: String,
189 /// SQL data type
190 pub sql_type: SqlType,
191 /// Is indexed (for performance)
192 pub indexed: bool,
193}
194
195/// Aggregation strategy for fact tables
196///
197/// Determines how fact table data is updated and structured.
198///
199/// # Strategies
200///
201/// - **Incremental**: New records added (e.g., transaction logs)
202/// - **`AccumulatingSnapshot`**: Records updated with new events (e.g., order milestones)
203/// - **`PeriodicSnapshot`**: Complete snapshot at regular intervals (e.g., daily inventory)
204#[non_exhaustive]
205#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
206pub enum AggregationStrategy {
207 /// New records are appended (e.g., transaction logs, event streams)
208 #[serde(rename = "incremental")]
209 #[default]
210 Incremental,
211
212 /// Records are updated with new events (e.g., order status changes)
213 #[serde(rename = "accumulating_snapshot")]
214 AccumulatingSnapshot,
215
216 /// Complete snapshots at regular intervals (e.g., daily inventory levels)
217 #[serde(rename = "periodic_snapshot")]
218 PeriodicSnapshot,
219}
220
221/// Explicit fact table schema declaration
222///
223/// Allows users to explicitly declare fact table metadata instead of relying on
224/// auto-detection. Explicit declarations take precedence over auto-detected metadata.
225///
226/// # Example
227///
228/// ```json
229/// {
230/// "name": "tf_sales",
231/// "measures": ["amount", "quantity", "discount"],
232/// "dimensions": ["product_id", "region_id", "date_id"],
233/// "primary_key": "id",
234/// "metadata": {
235/// "aggregation_strategy": "incremental",
236/// "grain": ["date", "product", "region"]
237/// }
238/// }
239/// ```
240#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
241pub struct FactTableDeclaration {
242 /// Fact table name (e.g., "`tf_sales`")
243 pub name: String,
244
245 /// Measure column names (aggregatable numeric fields)
246 pub measures: Vec<String>,
247
248 /// Dimension column names or paths within JSONB
249 pub dimensions: Vec<String>,
250
251 /// Primary key column name
252 pub primary_key: String,
253
254 /// Optional metadata about the fact table
255 pub metadata: Option<FactTableDeclarationMetadata>,
256}
257
258/// Metadata for explicitly declared fact tables
259#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
260pub struct FactTableDeclarationMetadata {
261 /// Aggregation strategy (how data is updated)
262 #[serde(default)]
263 pub aggregation_strategy: AggregationStrategy,
264
265 /// Grain of the fact table (combination of dimensions that makes a unique record)
266 pub grain: Vec<String>,
267
268 /// Column containing snapshot date (for periodic snapshots)
269 pub snapshot_date_column: Option<String>,
270
271 /// Whether this is a slowly changing dimension
272 #[serde(default)]
273 pub is_slowly_changing_dimension: bool,
274}
275
276impl SqlType {
277 /// Parse SQL type from string (database-specific)
278 pub fn from_str_postgres(type_name: &str) -> Self {
279 match type_name.to_lowercase().as_str() {
280 "smallint" | "int" | "integer" | "int2" | "int4" => Self::Int,
281 "bigint" | "int8" => Self::BigInt,
282 "decimal" | "numeric" => Self::Decimal,
283 "real" | "float" | "double precision" | "float4" | "float8" => Self::Float,
284 "jsonb" => Self::Jsonb,
285 "json" => Self::Json,
286 "text" | "varchar" | "character varying" | "char" | "character" => Self::Text,
287 "uuid" => Self::Uuid,
288 "timestamp"
289 | "timestamptz"
290 | "timestamp with time zone"
291 | "timestamp without time zone" => Self::Timestamp,
292 "date" => Self::Date,
293 "boolean" | "bool" => Self::Boolean,
294 other => Self::Other(other.to_string()),
295 }
296 }
297
298 /// Parse SQL type from string (MySQL)
299 pub fn from_str_mysql(type_name: &str) -> Self {
300 match type_name.to_lowercase().as_str() {
301 "tinyint" | "smallint" | "mediumint" | "int" | "integer" => Self::Int,
302 "bigint" => Self::BigInt,
303 "decimal" | "numeric" => Self::Decimal,
304 "float" | "double" | "real" => Self::Float,
305 "json" => Self::Json,
306 "text" | "varchar" | "char" | "tinytext" | "mediumtext" | "longtext" => Self::Text,
307 "timestamp" | "datetime" => Self::Timestamp,
308 "date" => Self::Date,
309 "boolean" | "bool" | "tinyint(1)" => Self::Boolean,
310 other => Self::Other(other.to_string()),
311 }
312 }
313
314 /// Parse SQL type from string (SQLite)
315 pub fn from_str_sqlite(type_name: &str) -> Self {
316 match type_name.to_lowercase().as_str() {
317 "integer" | "int" => Self::BigInt, // SQLite INTEGER is 64-bit
318 "real" | "double" | "float" => Self::Float,
319 "numeric" | "decimal" => Self::Decimal,
320 "text" | "varchar" | "char" => Self::Text,
321 "blob" => Self::Other("BLOB".to_string()),
322 other => Self::Other(other.to_string()),
323 }
324 }
325
326 /// Parse SQL type from string (SQL Server)
327 pub fn from_str_sqlserver(type_name: &str) -> Self {
328 match type_name.to_lowercase().as_str() {
329 "tinyint" | "smallint" | "int" => Self::Int,
330 "bigint" => Self::BigInt,
331 "decimal" | "numeric" | "money" | "smallmoney" => Self::Decimal,
332 "float" | "real" => Self::Float,
333 "nvarchar" | "varchar" | "char" | "nchar" | "text" | "ntext" => Self::Text,
334 "uniqueidentifier" => Self::Uuid,
335 "datetime" | "datetime2" | "smalldatetime" | "datetimeoffset" => Self::Timestamp,
336 "date" => Self::Date,
337 "bit" => Self::Boolean,
338 other => Self::Other(other.to_string()),
339 }
340 }
341}