Skip to main content

fraiseql_core/compiler/fact_table/
detector.rs

1use super::{
2    CalendarBucket, CalendarDimension, CalendarGranularity, DatabaseIntrospector, DatabaseType,
3    DimensionColumn, DimensionPath, FactTableMetadata, FilterColumn, MeasureColumn, SqlType,
4};
5use crate::error::{FraiseQLError, Result};
6
/// Detects and introspects fact tables
///
/// Stateless namespace type: every capability is exposed as an associated
/// function (naming check, database introspection, validation, JSON path
/// generation), so no instance is ever constructed.
pub struct FactTableDetector;
10impl FactTableDetector {
11    /// Detect if a table name follows the fact table pattern.
12    ///
13    /// Fact tables must follow the naming convention: `tf_<table_name>`
14    /// where the table name contains only lowercase letters, numbers, and underscores.
15    ///
16    /// # Arguments
17    ///
18    /// * `table_name` - Table name to check
19    ///
20    /// # Returns
21    ///
22    /// `true` if the table name starts with `tf_` and follows naming conventions,
23    /// `false` otherwise.
24    ///
25    /// # Notes
26    ///
27    /// - The check is strict: `tf_` is required as a prefix
28    /// - Table names like `TF_sales` (uppercase prefix) are NOT recognized as fact tables
29    /// - Empty strings and tables named just `tf_` without additional suffix are not valid
30    ///
31    /// # Examples
32    ///
33    /// ```
34    /// use fraiseql_core::compiler::fact_table::FactTableDetector;
35    ///
36    /// assert!(FactTableDetector::is_fact_table("tf_sales"));
37    /// assert!(FactTableDetector::is_fact_table("tf_events"));
38    /// assert!(FactTableDetector::is_fact_table("tf_page_views_daily"));
39    /// assert!(!FactTableDetector::is_fact_table("ta_sales_by_day"));
40    /// assert!(!FactTableDetector::is_fact_table("v_user"));
41    /// assert!(!FactTableDetector::is_fact_table("TF_sales")); // uppercase prefix not recognized
42    /// assert!(!FactTableDetector::is_fact_table("tf_")); // incomplete name
43    /// ```
44    pub fn is_fact_table(table_name: &str) -> bool {
45        // Must start with "tf_" and have at least one more character
46        table_name.len() > 3 && table_name.starts_with("tf_")
47    }
48
    /// Introspect a fact table from the database
    ///
    /// Queries the database schema to extract:
    /// - Measures (numeric columns)
    /// - Dimensions (JSONB/JSON columns)
    /// - Denormalized filters (indexed columns)
    ///
    /// # Arguments
    ///
    /// * `introspector` - Database introspection implementation
    /// * `table_name` - Fact table name (must start with "tf_")
    ///
    /// # Errors
    ///
    /// Returns error if:
    /// - Table is not a fact table (doesn't start with "tf_")
    /// - Database query fails
    /// - Table structure is invalid
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// use fraiseql_core::compiler::fact_table::{FactTableDetector, DatabaseIntrospector};
    ///
    /// # async fn example(db: impl DatabaseIntrospector) -> Result<(), Box<dyn std::error::Error>> {
    /// let metadata = FactTableDetector::introspect(&db, "tf_sales").await?;
    /// println!("Found {} measures", metadata.measures.len());
    /// # Ok(())
    /// # }
    /// ```
    pub async fn introspect(
        introspector: &impl DatabaseIntrospector,
        table_name: &str,
    ) -> Result<FactTableMetadata> {
        // Validate table name follows fact table pattern
        if !Self::is_fact_table(table_name) {
            return Err(FraiseQLError::Validation {
                message: format!(
                    "Table '{}' is not a fact table (must start with 'tf_')",
                    table_name
                ),
                path:    None,
            });
        }

        // Query column information.
        // An empty column list is indistinguishable from a missing table here,
        // hence the combined error message.
        let columns = introspector.get_columns(table_name).await?;
        if columns.is_empty() {
            return Err(FraiseQLError::Validation {
                message: format!("Table '{}' not found or has no columns", table_name),
                path:    None,
            });
        }

        // Query indexed columns (used below to decide which columns qualify
        // as denormalized filters)
        let indexed_columns = introspector.get_indexed_columns(table_name).await?;
        let indexed_set: std::collections::HashSet<String> = indexed_columns.into_iter().collect();

        // Parse SQL types based on database
        let db_type = introspector.database_type();

        let mut measures = Vec::new();
        let mut dimension_column: Option<DimensionColumn> = None;
        let mut filters = Vec::new();

        // Classify every column into measure / dimension / filter.
        // Classification rules (see each branch):
        // - JSON/JSONB  -> dimension column (if several exist, the last one wins)
        // - numeric     -> measure, unless it is `id` or a `*_id` key;
        //                  an INDEXED `*_id` column becomes a filter, an
        //                  unindexed one is dropped entirely
        // - other types -> filter, except bookkeeping/timestamp columns;
        //                  `occurred_at`/`created_at` are kept only when
        //                  indexed, `updated_at` is never kept
        for (name, data_type, is_nullable) in &columns {
            let sql_type = Self::parse_sql_type(data_type, db_type);

            match sql_type {
                SqlType::Jsonb | SqlType::Json => {
                    // This is the dimension column - try to extract paths from sample data.
                    // A missing or failed sample just yields an empty path list.
                    let paths = if let Ok(Some(sample)) =
                        introspector.get_sample_jsonb(table_name, name).await
                    {
                        Self::extract_dimension_paths(&sample, name, db_type)
                    } else {
                        Vec::new()
                    };
                    dimension_column = Some(DimensionColumn {
                        name: name.clone(),
                        paths,
                    });
                },
                SqlType::Int | SqlType::BigInt | SqlType::Decimal | SqlType::Float => {
                    // Skip common non-measure columns (surrogate key and foreign keys)
                    if name != "id" && !name.ends_with("_id") {
                        measures.push(MeasureColumn {
                            name:     name.clone(),
                            sql_type: sql_type.clone(),
                            nullable: *is_nullable,
                        });
                    }

                    // Check if it's a denormalized filter (only indexed `*_id`
                    // columns qualify; unindexed ones are intentionally dropped)
                    if name.ends_with("_id") && indexed_set.contains(name.as_str()) {
                        filters.push(FilterColumn {
                            name:     name.clone(),
                            sql_type: sql_type.clone(),
                            indexed:  true,
                        });
                    }
                },
                _ => {
                    // Other types might be denormalized filters
                    if name != "id"
                        && name != "created_at"
                        && name != "updated_at"
                        && name != "occurred_at"
                    {
                        filters.push(FilterColumn {
                            name: name.clone(),
                            sql_type,
                            indexed: indexed_set.contains(name.as_str()),
                        });
                    } else if (name == "occurred_at" || name == "created_at")
                        && indexed_set.contains(name.as_str())
                    {
                        // Timestamp columns are important filters if indexed
                        // (note: `updated_at` is deliberately excluded even when indexed)
                        filters.push(FilterColumn {
                            name: name.clone(),
                            sql_type,
                            indexed: true,
                        });
                    }
                },
            }
        }

        // Detect calendar dimensions (`*_info` JSON columns)
        let calendar_dimensions = Self::detect_calendar_dimensions(&columns, &indexed_set)?;

        let metadata = FactTableMetadata {
            table_name: table_name.to_string(),
            measures,
            // Fall back to a placeholder "dimensions" column when no JSON
            // column was found; validate() still requires a non-empty name.
            dimensions: dimension_column.unwrap_or(DimensionColumn {
                name:  "dimensions".to_string(),
                paths: Vec::new(),
            }),
            denormalized_filters: filters,
            calendar_dimensions,
        };

        Self::validate(&metadata)?;
        Ok(metadata)
    }
194
195    /// Parse SQL type string to `SqlType` enum
196    fn parse_sql_type(type_name: &str, db_type: DatabaseType) -> SqlType {
197        match db_type {
198            DatabaseType::PostgreSQL => SqlType::from_str_postgres(type_name),
199            DatabaseType::MySQL => SqlType::from_str_mysql(type_name),
200            DatabaseType::SQLite => SqlType::from_str_sqlite(type_name),
201            DatabaseType::SQLServer => SqlType::from_str_sqlserver(type_name),
202        }
203    }
204
205    /// Validate fact table structure
206    ///
207    /// # Errors
208    ///
209    /// Returns error if:
210    /// - No measures found
211    /// - No dimension column found
212    /// - Measures are not numeric types
213    pub fn validate(metadata: &FactTableMetadata) -> Result<()> {
214        // Must have at least one measure
215        if metadata.measures.is_empty() {
216            return Err(FraiseQLError::Validation {
217                message: format!(
218                    "Fact table '{}' must have at least one measure column",
219                    metadata.table_name
220                ),
221                path:    None,
222            });
223        }
224
225        // Validate all measures are numeric
226        for measure in &metadata.measures {
227            if !Self::is_numeric_type(&measure.sql_type) {
228                return Err(FraiseQLError::Validation {
229                    message: format!(
230                        "Measure column '{}' must be numeric type, found {:?}",
231                        measure.name, measure.sql_type
232                    ),
233                    path:    None,
234                });
235            }
236        }
237
238        // Must have dimension column
239        if metadata.dimensions.name.is_empty() {
240            return Err(FraiseQLError::Validation {
241                message: format!(
242                    "Fact table '{}' must have a dimension column (JSONB)",
243                    metadata.table_name
244                ),
245                path:    None,
246            });
247        }
248
249        Ok(())
250    }
251
252    /// Check if SQL type is numeric (suitable for aggregation)
253    pub(super) const fn is_numeric_type(sql_type: &SqlType) -> bool {
254        matches!(sql_type, SqlType::Int | SqlType::BigInt | SqlType::Decimal | SqlType::Float)
255    }
256
257    /// Extract dimension paths from a sample JSON value
258    ///
259    /// Walks through the JSON structure and extracts top-level keys as dimension paths.
260    /// Nested objects are represented with dot notation (e.g., "customer.region").
261    ///
262    /// # Arguments
263    ///
264    /// * `sample` - Sample JSON value from the dimension column
265    /// * `column_name` - Name of the JSONB column (e.g., "dimensions")
266    /// * `db_type` - Database type for generating correct JSON path syntax
267    ///
268    /// # Returns
269    ///
270    /// Vec of `DimensionPath` extracted from the sample data
271    pub fn extract_dimension_paths(
272        sample: &serde_json::Value,
273        column_name: &str,
274        db_type: DatabaseType,
275    ) -> Vec<DimensionPath> {
276        let mut paths = Vec::new();
277        Self::extract_paths_recursive(sample, column_name, "", &mut paths, db_type, 0);
278        paths
279    }
280
281    /// Recursively extract paths from JSON structure
282    fn extract_paths_recursive(
283        value: &serde_json::Value,
284        column_name: &str,
285        prefix: &str,
286        paths: &mut Vec<DimensionPath>,
287        db_type: DatabaseType,
288        depth: usize,
289    ) {
290        // Limit depth to avoid infinite recursion on deeply nested structures
291        if depth > 3 {
292            return;
293        }
294
295        if let Some(obj) = value.as_object() {
296            for (key, val) in obj {
297                let full_path = if prefix.is_empty() {
298                    key.clone()
299                } else {
300                    format!("{}.{}", prefix, key)
301                };
302
303                // Determine data type from the value
304                let data_type = Self::infer_json_type(val);
305
306                // Generate database-specific JSON path syntax
307                let json_path = Self::generate_json_path(column_name, &full_path, db_type);
308
309                paths.push(DimensionPath {
310                    name: full_path.replace('.', "_"), /* Convert dots to underscores for field
311                                                        * names */
312                    json_path,
313                    data_type,
314                });
315
316                // Recurse into nested objects
317                if val.is_object() {
318                    Self::extract_paths_recursive(
319                        val,
320                        column_name,
321                        &full_path,
322                        paths,
323                        db_type,
324                        depth + 1,
325                    );
326                }
327            }
328        }
329    }
330
331    /// Infer JSON data type from a value
332    pub(super) fn infer_json_type(value: &serde_json::Value) -> String {
333        match value {
334            serde_json::Value::Bool(_) => "boolean".to_string(),
335            serde_json::Value::Number(n) => {
336                if n.is_i64() || n.is_u64() {
337                    "integer".to_string()
338                } else {
339                    "float".to_string()
340                }
341            },
342            serde_json::Value::Array(_) => "array".to_string(),
343            serde_json::Value::Object(_) => "object".to_string(),
344            // Null and String both infer as "string" type
345            serde_json::Value::Null | serde_json::Value::String(_) => "string".to_string(),
346        }
347    }
348
349    /// Generate database-specific JSON path syntax
350    pub(super) fn generate_json_path(
351        column_name: &str,
352        path: &str,
353        db_type: DatabaseType,
354    ) -> String {
355        let parts: Vec<&str> = path.split('.').collect();
356
357        match db_type {
358            DatabaseType::PostgreSQL => {
359                // PostgreSQL: column->>'key' for top-level, column->'nested'->>'key' for nested
360                if parts.is_empty() {
361                    // Safety: handle empty path by returning raw column name
362                    column_name.to_string()
363                } else if parts.len() == 1 {
364                    format!("{}->>'{}'", column_name, parts[0])
365                } else {
366                    // Safe: parts.len() >= 2 is guaranteed here
367                    if let Some(last) = parts.last() {
368                        let rest = &parts[..parts.len() - 1];
369                        let nav = rest.iter().fold(String::new(), |mut acc, p| {
370                            use std::fmt::Write;
371                            let _ = write!(acc, "->'{}'", p);
372                            acc
373                        });
374                        format!("{}{}->>'{}'", column_name, nav, last)
375                    } else {
376                        // This branch is unreachable due to length check, but safe fallback
377                        column_name.to_string()
378                    }
379                }
380            },
381            DatabaseType::MySQL => {
382                // MySQL: JSON_EXTRACT(column, '$.path.to.key')
383                format!("JSON_UNQUOTE(JSON_EXTRACT({}, '$.{}')", column_name, path)
384            },
385            DatabaseType::SQLite => {
386                // SQLite: json_extract(column, '$.path.to.key')
387                format!("json_extract({}, '$.{}')", column_name, path)
388            },
389            DatabaseType::SQLServer => {
390                // SQL Server: JSON_VALUE(column, '$.path.to.key')
391                format!("JSON_VALUE({}, '$.{}')", column_name, path)
392            },
393        }
394    }
395
396    /// Detect calendar dimension columns (`date_info`, `week_info`, etc.)
397    ///
398    /// Looks for `*_info` JSONB/JSON columns following the calendar dimension pattern.
399    /// Returns calendar dimension metadata if calendar columns are found.
400    ///
401    /// # Arguments
402    ///
403    /// * `columns` - List of (name, `data_type`, nullable) tuples
404    /// * `_indexed_set` - Set of indexed columns (unused, for future optimization detection)
405    ///
406    /// # Returns
407    ///
408    /// Vec of calendar dimensions (empty if none found)
409    pub(super) fn detect_calendar_dimensions(
410        columns: &[(String, String, bool)],
411        _indexed_set: &std::collections::HashSet<String>,
412    ) -> Result<Vec<CalendarDimension>> {
413        // Look for *_info columns with JSONB/JSON type
414        let calendar_columns: Vec<String> = columns
415            .iter()
416            .filter(|(name, data_type, _)| {
417                name.ends_with("_info")
418                    && (data_type.to_lowercase().contains("json")
419                        || data_type.to_lowercase().contains("jsonb"))
420            })
421            .map(|(name, _, _)| name.clone())
422            .collect();
423
424        if calendar_columns.is_empty() {
425            return Ok(Vec::new());
426        }
427
428        // Build granularities based on calendar dimension pattern
429        let mut granularities = Vec::new();
430        for col_name in calendar_columns {
431            let buckets = Self::infer_calendar_buckets(&col_name);
432            if !buckets.is_empty() {
433                granularities.push(CalendarGranularity {
434                    column_name: col_name,
435                    buckets,
436                });
437            }
438        }
439
440        if granularities.is_empty() {
441            return Ok(Vec::new());
442        }
443
444        // Assume single source column "occurred_at"
445        // (could be enhanced to detect from schema later)
446        Ok(vec![CalendarDimension {
447            source_column: "occurred_at".to_string(),
448            granularities,
449        }])
450    }
451
452    /// Map calendar column names to available buckets (standard pattern)
453    ///
454    /// # Arguments
455    ///
456    /// * `column_name` - Name of the calendar column (e.g., "`date_info`", "`month_info`")
457    ///
458    /// # Returns
459    ///
460    /// Vec of calendar buckets available in this column
461    pub(super) fn infer_calendar_buckets(column_name: &str) -> Vec<CalendarBucket> {
462        use crate::compiler::aggregate_types::TemporalBucket;
463
464        match column_name {
465            "date_info" => vec![
466                CalendarBucket {
467                    json_key:    "date".to_string(),
468                    bucket_type: TemporalBucket::Day,
469                    data_type:   "date".to_string(),
470                },
471                CalendarBucket {
472                    json_key:    "week".to_string(),
473                    bucket_type: TemporalBucket::Week,
474                    data_type:   "integer".to_string(),
475                },
476                CalendarBucket {
477                    json_key:    "month".to_string(),
478                    bucket_type: TemporalBucket::Month,
479                    data_type:   "integer".to_string(),
480                },
481                CalendarBucket {
482                    json_key:    "quarter".to_string(),
483                    bucket_type: TemporalBucket::Quarter,
484                    data_type:   "integer".to_string(),
485                },
486                CalendarBucket {
487                    json_key:    "year".to_string(),
488                    bucket_type: TemporalBucket::Year,
489                    data_type:   "integer".to_string(),
490                },
491            ],
492            "week_info" => vec![
493                CalendarBucket {
494                    json_key:    "week".to_string(),
495                    bucket_type: TemporalBucket::Week,
496                    data_type:   "integer".to_string(),
497                },
498                CalendarBucket {
499                    json_key:    "month".to_string(),
500                    bucket_type: TemporalBucket::Month,
501                    data_type:   "integer".to_string(),
502                },
503                CalendarBucket {
504                    json_key:    "quarter".to_string(),
505                    bucket_type: TemporalBucket::Quarter,
506                    data_type:   "integer".to_string(),
507                },
508                CalendarBucket {
509                    json_key:    "year".to_string(),
510                    bucket_type: TemporalBucket::Year,
511                    data_type:   "integer".to_string(),
512                },
513            ],
514            "month_info" => vec![
515                CalendarBucket {
516                    json_key:    "month".to_string(),
517                    bucket_type: TemporalBucket::Month,
518                    data_type:   "integer".to_string(),
519                },
520                CalendarBucket {
521                    json_key:    "quarter".to_string(),
522                    bucket_type: TemporalBucket::Quarter,
523                    data_type:   "integer".to_string(),
524                },
525                CalendarBucket {
526                    json_key:    "year".to_string(),
527                    bucket_type: TemporalBucket::Year,
528                    data_type:   "integer".to_string(),
529                },
530            ],
531            "quarter_info" => vec![
532                CalendarBucket {
533                    json_key:    "quarter".to_string(),
534                    bucket_type: TemporalBucket::Quarter,
535                    data_type:   "integer".to_string(),
536                },
537                CalendarBucket {
538                    json_key:    "year".to_string(),
539                    bucket_type: TemporalBucket::Year,
540                    data_type:   "integer".to_string(),
541                },
542            ],
543            "semester_info" => vec![
544                CalendarBucket {
545                    json_key:    "semester".to_string(),
546                    bucket_type: TemporalBucket::Quarter, // Map to Quarter for now
547                    data_type:   "integer".to_string(),
548                },
549                CalendarBucket {
550                    json_key:    "year".to_string(),
551                    bucket_type: TemporalBucket::Year,
552                    data_type:   "integer".to_string(),
553                },
554            ],
555            "year_info" => vec![CalendarBucket {
556                json_key:    "year".to_string(),
557                bucket_type: TemporalBucket::Year,
558                data_type:   "integer".to_string(),
559            }],
560            _ => Vec::new(),
561        }
562    }
563
564    /// Create metadata from column definitions (for testing)
565    ///
566    /// # Errors
567    ///
568    /// Returns `FraiseQLError::Validation` if no time dimension column is found.
569    pub fn from_columns(
570        table_name: String,
571        columns: Vec<(&str, SqlType, bool)>,
572    ) -> Result<FactTableMetadata> {
573        let mut measures = Vec::new();
574        let mut dimension_column: Option<DimensionColumn> = None;
575        let mut filters = Vec::new();
576
577        for (name, sql_type, nullable) in columns {
578            match sql_type {
579                SqlType::Jsonb | SqlType::Json => {
580                    // This is the dimension column
581                    dimension_column = Some(DimensionColumn {
582                        name:  name.to_string(),
583                        paths: Vec::new(),
584                    });
585                },
586                SqlType::Int | SqlType::BigInt | SqlType::Decimal | SqlType::Float => {
587                    // Skip id column
588                    if name != "id" && !name.ends_with("_id") {
589                        // This is a measure
590                        measures.push(MeasureColumn {
591                            name: name.to_string(),
592                            sql_type,
593                            nullable,
594                        });
595                    } else if name != "id" {
596                        // This is a filter (_id columns)
597                        filters.push(FilterColumn {
598                            name: name.to_string(),
599                            sql_type,
600                            indexed: false,
601                        });
602                    }
603                },
604                _ => {
605                    // This might be a filter column (if not id/created_at/updated_at)
606                    if name != "id" && name != "created_at" && name != "updated_at" {
607                        filters.push(FilterColumn {
608                            name: name.to_string(),
609                            sql_type,
610                            indexed: false, // Would need to query indexes to determine
611                        });
612                    }
613                },
614            }
615        }
616
617        let metadata = FactTableMetadata {
618            table_name,
619            measures,
620            dimensions: dimension_column.unwrap_or(DimensionColumn {
621                name:  "dimensions".to_string(),
622                paths: Vec::new(),
623            }),
624            denormalized_filters: filters,
625            calendar_dimensions: Vec::new(), // No calendar detection in test helper
626        };
627
628        Self::validate(&metadata)?;
629        Ok(metadata)
630    }
631}