// fraiseql_core/compiler/fact_table/detector.rs
1use super::{
2 CalendarBucket, CalendarDimension, CalendarGranularity, DatabaseIntrospector, DatabaseType,
3 DimensionColumn, DimensionPath, FactTableMetadata, FilterColumn, MeasureColumn, SqlType,
4};
5use crate::error::{FraiseQLError, Result};
6
/// Detects and introspects fact tables.
///
/// Zero-sized marker type: all behavior lives in associated functions
/// (name-pattern detection, database introspection, structural validation,
/// and JSON-path generation helpers).
pub struct FactTableDetector;
9
10impl FactTableDetector {
11 /// Detect if a table name follows the fact table pattern.
12 ///
13 /// Fact tables must follow the naming convention: `tf_<table_name>`
14 /// where the table name contains only lowercase letters, numbers, and underscores.
15 ///
16 /// # Arguments
17 ///
18 /// * `table_name` - Table name to check
19 ///
20 /// # Returns
21 ///
22 /// `true` if the table name starts with `tf_` and follows naming conventions,
23 /// `false` otherwise.
24 ///
25 /// # Notes
26 ///
27 /// - The check is strict: `tf_` is required as a prefix
28 /// - Table names like `TF_sales` (uppercase prefix) are NOT recognized as fact tables
29 /// - Empty strings and tables named just `tf_` without additional suffix are not valid
30 ///
31 /// # Examples
32 ///
33 /// ```
34 /// use fraiseql_core::compiler::fact_table::FactTableDetector;
35 ///
36 /// assert!(FactTableDetector::is_fact_table("tf_sales"));
37 /// assert!(FactTableDetector::is_fact_table("tf_events"));
38 /// assert!(FactTableDetector::is_fact_table("tf_page_views_daily"));
39 /// assert!(!FactTableDetector::is_fact_table("ta_sales_by_day"));
40 /// assert!(!FactTableDetector::is_fact_table("v_user"));
41 /// assert!(!FactTableDetector::is_fact_table("TF_sales")); // uppercase prefix not recognized
42 /// assert!(!FactTableDetector::is_fact_table("tf_")); // incomplete name
43 /// ```
44 pub fn is_fact_table(table_name: &str) -> bool {
45 // Must start with "tf_" and have at least one more character
46 table_name.len() > 3 && table_name.starts_with("tf_")
47 }
48
49 /// Introspect a fact table from the database
50 ///
51 /// Queries the database schema to extract:
52 /// - Measures (numeric columns)
53 /// - Dimensions (JSONB/JSON columns)
54 /// - Denormalized filters (indexed columns)
55 ///
56 /// # Arguments
57 ///
58 /// * `introspector` - Database introspection implementation
59 /// * `table_name` - Fact table name (must start with "tf_")
60 ///
61 /// # Errors
62 ///
63 /// Returns error if:
64 /// - Table is not a fact table (doesn't start with "tf_")
65 /// - Database query fails
66 /// - Table structure is invalid
67 ///
68 /// # Example
69 ///
70 /// ```rust,no_run
71 /// use fraiseql_core::compiler::fact_table::{FactTableDetector, DatabaseIntrospector};
72 ///
73 /// # async fn example(db: impl DatabaseIntrospector) -> Result<(), Box<dyn std::error::Error>> {
74 /// let metadata = FactTableDetector::introspect(&db, "tf_sales").await?;
75 /// println!("Found {} measures", metadata.measures.len());
76 /// # Ok(())
77 /// # }
78 /// ```
79 pub async fn introspect(
80 introspector: &impl DatabaseIntrospector,
81 table_name: &str,
82 ) -> Result<FactTableMetadata> {
83 // Validate table name follows fact table pattern
84 if !Self::is_fact_table(table_name) {
85 return Err(FraiseQLError::Validation {
86 message: format!(
87 "Table '{}' is not a fact table (must start with 'tf_')",
88 table_name
89 ),
90 path: None,
91 });
92 }
93
94 // Query column information
95 let columns = introspector.get_columns(table_name).await?;
96 if columns.is_empty() {
97 return Err(FraiseQLError::Validation {
98 message: format!("Table '{}' not found or has no columns", table_name),
99 path: None,
100 });
101 }
102
103 // Query indexed columns
104 let indexed_columns = introspector.get_indexed_columns(table_name).await?;
105 let indexed_set: std::collections::HashSet<String> = indexed_columns.into_iter().collect();
106
107 // Parse SQL types based on database
108 let db_type = introspector.database_type();
109
110 let mut measures = Vec::new();
111 let mut dimension_column: Option<DimensionColumn> = None;
112 let mut filters = Vec::new();
113
114 for (name, data_type, is_nullable) in &columns {
115 let sql_type = Self::parse_sql_type(data_type, db_type);
116
117 match sql_type {
118 SqlType::Jsonb | SqlType::Json => {
119 // This is the dimension column - try to extract paths from sample data
120 let paths = if let Ok(Some(sample)) =
121 introspector.get_sample_jsonb(table_name, name).await
122 {
123 Self::extract_dimension_paths(&sample, name, db_type)
124 } else {
125 Vec::new()
126 };
127 dimension_column = Some(DimensionColumn {
128 name: name.clone(),
129 paths,
130 });
131 },
132 SqlType::Int | SqlType::BigInt | SqlType::Decimal | SqlType::Float => {
133 // Skip common non-measure columns
134 if name != "id" && !name.ends_with("_id") {
135 measures.push(MeasureColumn {
136 name: name.clone(),
137 sql_type: sql_type.clone(),
138 nullable: *is_nullable,
139 });
140 }
141
142 // Check if it's a denormalized filter
143 if name.ends_with("_id") && indexed_set.contains(name.as_str()) {
144 filters.push(FilterColumn {
145 name: name.clone(),
146 sql_type: sql_type.clone(),
147 indexed: true,
148 });
149 }
150 },
151 _ => {
152 // Other types might be denormalized filters
153 if name != "id"
154 && name != "created_at"
155 && name != "updated_at"
156 && name != "occurred_at"
157 {
158 filters.push(FilterColumn {
159 name: name.clone(),
160 sql_type,
161 indexed: indexed_set.contains(name.as_str()),
162 });
163 } else if (name == "occurred_at" || name == "created_at")
164 && indexed_set.contains(name.as_str())
165 {
166 // Timestamp columns are important filters if indexed
167 filters.push(FilterColumn {
168 name: name.clone(),
169 sql_type,
170 indexed: true,
171 });
172 }
173 },
174 }
175 }
176
177 // Detect calendar dimensions
178 let calendar_dimensions = Self::detect_calendar_dimensions(&columns, &indexed_set)?;
179
180 let metadata = FactTableMetadata {
181 table_name: table_name.to_string(),
182 measures,
183 dimensions: dimension_column.unwrap_or(DimensionColumn {
184 name: "dimensions".to_string(),
185 paths: Vec::new(),
186 }),
187 denormalized_filters: filters,
188 calendar_dimensions,
189 };
190
191 Self::validate(&metadata)?;
192 Ok(metadata)
193 }
194
195 /// Parse SQL type string to `SqlType` enum
196 fn parse_sql_type(type_name: &str, db_type: DatabaseType) -> SqlType {
197 match db_type {
198 DatabaseType::PostgreSQL => SqlType::from_str_postgres(type_name),
199 DatabaseType::MySQL => SqlType::from_str_mysql(type_name),
200 DatabaseType::SQLite => SqlType::from_str_sqlite(type_name),
201 DatabaseType::SQLServer => SqlType::from_str_sqlserver(type_name),
202 }
203 }
204
205 /// Validate fact table structure
206 ///
207 /// # Errors
208 ///
209 /// Returns error if:
210 /// - No measures found
211 /// - No dimension column found
212 /// - Measures are not numeric types
213 pub fn validate(metadata: &FactTableMetadata) -> Result<()> {
214 // Must have at least one measure
215 if metadata.measures.is_empty() {
216 return Err(FraiseQLError::Validation {
217 message: format!(
218 "Fact table '{}' must have at least one measure column",
219 metadata.table_name
220 ),
221 path: None,
222 });
223 }
224
225 // Validate all measures are numeric
226 for measure in &metadata.measures {
227 if !Self::is_numeric_type(&measure.sql_type) {
228 return Err(FraiseQLError::Validation {
229 message: format!(
230 "Measure column '{}' must be numeric type, found {:?}",
231 measure.name, measure.sql_type
232 ),
233 path: None,
234 });
235 }
236 }
237
238 // Must have dimension column
239 if metadata.dimensions.name.is_empty() {
240 return Err(FraiseQLError::Validation {
241 message: format!(
242 "Fact table '{}' must have a dimension column (JSONB)",
243 metadata.table_name
244 ),
245 path: None,
246 });
247 }
248
249 Ok(())
250 }
251
    /// Check if SQL type is numeric (suitable for aggregation).
    ///
    /// Returns `true` only for `Int`, `BigInt`, `Decimal`, and `Float`;
    /// every other `SqlType` variant is not a valid measure type.
    pub(super) const fn is_numeric_type(sql_type: &SqlType) -> bool {
        matches!(sql_type, SqlType::Int | SqlType::BigInt | SqlType::Decimal | SqlType::Float)
    }
256
257 /// Extract dimension paths from a sample JSON value
258 ///
259 /// Walks through the JSON structure and extracts top-level keys as dimension paths.
260 /// Nested objects are represented with dot notation (e.g., "customer.region").
261 ///
262 /// # Arguments
263 ///
264 /// * `sample` - Sample JSON value from the dimension column
265 /// * `column_name` - Name of the JSONB column (e.g., "dimensions")
266 /// * `db_type` - Database type for generating correct JSON path syntax
267 ///
268 /// # Returns
269 ///
270 /// Vec of `DimensionPath` extracted from the sample data
271 pub fn extract_dimension_paths(
272 sample: &serde_json::Value,
273 column_name: &str,
274 db_type: DatabaseType,
275 ) -> Vec<DimensionPath> {
276 let mut paths = Vec::new();
277 Self::extract_paths_recursive(sample, column_name, "", &mut paths, db_type, 0);
278 paths
279 }
280
281 /// Recursively extract paths from JSON structure
282 fn extract_paths_recursive(
283 value: &serde_json::Value,
284 column_name: &str,
285 prefix: &str,
286 paths: &mut Vec<DimensionPath>,
287 db_type: DatabaseType,
288 depth: usize,
289 ) {
290 // Limit depth to avoid infinite recursion on deeply nested structures
291 if depth > 3 {
292 return;
293 }
294
295 if let Some(obj) = value.as_object() {
296 for (key, val) in obj {
297 let full_path = if prefix.is_empty() {
298 key.clone()
299 } else {
300 format!("{}.{}", prefix, key)
301 };
302
303 // Determine data type from the value
304 let data_type = Self::infer_json_type(val);
305
306 // Generate database-specific JSON path syntax
307 let json_path = Self::generate_json_path(column_name, &full_path, db_type);
308
309 paths.push(DimensionPath {
310 name: full_path.replace('.', "_"), /* Convert dots to underscores for field
311 * names */
312 json_path,
313 data_type,
314 });
315
316 // Recurse into nested objects
317 if val.is_object() {
318 Self::extract_paths_recursive(
319 val,
320 column_name,
321 &full_path,
322 paths,
323 db_type,
324 depth + 1,
325 );
326 }
327 }
328 }
329 }
330
331 /// Infer JSON data type from a value
332 pub(super) fn infer_json_type(value: &serde_json::Value) -> String {
333 match value {
334 serde_json::Value::Bool(_) => "boolean".to_string(),
335 serde_json::Value::Number(n) => {
336 if n.is_i64() || n.is_u64() {
337 "integer".to_string()
338 } else {
339 "float".to_string()
340 }
341 },
342 serde_json::Value::Array(_) => "array".to_string(),
343 serde_json::Value::Object(_) => "object".to_string(),
344 // Null and String both infer as "string" type
345 serde_json::Value::Null | serde_json::Value::String(_) => "string".to_string(),
346 }
347 }
348
349 /// Generate database-specific JSON path syntax
350 pub(super) fn generate_json_path(
351 column_name: &str,
352 path: &str,
353 db_type: DatabaseType,
354 ) -> String {
355 let parts: Vec<&str> = path.split('.').collect();
356
357 match db_type {
358 DatabaseType::PostgreSQL => {
359 // PostgreSQL: column->>'key' for top-level, column->'nested'->>'key' for nested
360 if parts.is_empty() {
361 // Safety: handle empty path by returning raw column name
362 column_name.to_string()
363 } else if parts.len() == 1 {
364 format!("{}->>'{}'", column_name, parts[0])
365 } else {
366 // Safe: parts.len() >= 2 is guaranteed here
367 if let Some(last) = parts.last() {
368 let rest = &parts[..parts.len() - 1];
369 let nav = rest.iter().fold(String::new(), |mut acc, p| {
370 use std::fmt::Write;
371 let _ = write!(acc, "->'{}'", p);
372 acc
373 });
374 format!("{}{}->>'{}'", column_name, nav, last)
375 } else {
376 // This branch is unreachable due to length check, but safe fallback
377 column_name.to_string()
378 }
379 }
380 },
381 DatabaseType::MySQL => {
382 // MySQL: JSON_EXTRACT(column, '$.path.to.key')
383 format!("JSON_UNQUOTE(JSON_EXTRACT({}, '$.{}')", column_name, path)
384 },
385 DatabaseType::SQLite => {
386 // SQLite: json_extract(column, '$.path.to.key')
387 format!("json_extract({}, '$.{}')", column_name, path)
388 },
389 DatabaseType::SQLServer => {
390 // SQL Server: JSON_VALUE(column, '$.path.to.key')
391 format!("JSON_VALUE({}, '$.{}')", column_name, path)
392 },
393 }
394 }
395
396 /// Detect calendar dimension columns (`date_info`, `week_info`, etc.)
397 ///
398 /// Looks for `*_info` JSONB/JSON columns following the calendar dimension pattern.
399 /// Returns calendar dimension metadata if calendar columns are found.
400 ///
401 /// # Arguments
402 ///
403 /// * `columns` - List of (name, `data_type`, nullable) tuples
404 /// * `_indexed_set` - Set of indexed columns (unused, for future optimization detection)
405 ///
406 /// # Returns
407 ///
408 /// Vec of calendar dimensions (empty if none found)
409 pub(super) fn detect_calendar_dimensions(
410 columns: &[(String, String, bool)],
411 _indexed_set: &std::collections::HashSet<String>,
412 ) -> Result<Vec<CalendarDimension>> {
413 // Look for *_info columns with JSONB/JSON type
414 let calendar_columns: Vec<String> = columns
415 .iter()
416 .filter(|(name, data_type, _)| {
417 name.ends_with("_info")
418 && (data_type.to_lowercase().contains("json")
419 || data_type.to_lowercase().contains("jsonb"))
420 })
421 .map(|(name, _, _)| name.clone())
422 .collect();
423
424 if calendar_columns.is_empty() {
425 return Ok(Vec::new());
426 }
427
428 // Build granularities based on calendar dimension pattern
429 let mut granularities = Vec::new();
430 for col_name in calendar_columns {
431 let buckets = Self::infer_calendar_buckets(&col_name);
432 if !buckets.is_empty() {
433 granularities.push(CalendarGranularity {
434 column_name: col_name,
435 buckets,
436 });
437 }
438 }
439
440 if granularities.is_empty() {
441 return Ok(Vec::new());
442 }
443
444 // Assume single source column "occurred_at"
445 // (could be enhanced to detect from schema later)
446 Ok(vec![CalendarDimension {
447 source_column: "occurred_at".to_string(),
448 granularities,
449 }])
450 }
451
452 /// Map calendar column names to available buckets (standard pattern)
453 ///
454 /// # Arguments
455 ///
456 /// * `column_name` - Name of the calendar column (e.g., "`date_info`", "`month_info`")
457 ///
458 /// # Returns
459 ///
460 /// Vec of calendar buckets available in this column
461 pub(super) fn infer_calendar_buckets(column_name: &str) -> Vec<CalendarBucket> {
462 use crate::compiler::aggregate_types::TemporalBucket;
463
464 match column_name {
465 "date_info" => vec![
466 CalendarBucket {
467 json_key: "date".to_string(),
468 bucket_type: TemporalBucket::Day,
469 data_type: "date".to_string(),
470 },
471 CalendarBucket {
472 json_key: "week".to_string(),
473 bucket_type: TemporalBucket::Week,
474 data_type: "integer".to_string(),
475 },
476 CalendarBucket {
477 json_key: "month".to_string(),
478 bucket_type: TemporalBucket::Month,
479 data_type: "integer".to_string(),
480 },
481 CalendarBucket {
482 json_key: "quarter".to_string(),
483 bucket_type: TemporalBucket::Quarter,
484 data_type: "integer".to_string(),
485 },
486 CalendarBucket {
487 json_key: "year".to_string(),
488 bucket_type: TemporalBucket::Year,
489 data_type: "integer".to_string(),
490 },
491 ],
492 "week_info" => vec![
493 CalendarBucket {
494 json_key: "week".to_string(),
495 bucket_type: TemporalBucket::Week,
496 data_type: "integer".to_string(),
497 },
498 CalendarBucket {
499 json_key: "month".to_string(),
500 bucket_type: TemporalBucket::Month,
501 data_type: "integer".to_string(),
502 },
503 CalendarBucket {
504 json_key: "quarter".to_string(),
505 bucket_type: TemporalBucket::Quarter,
506 data_type: "integer".to_string(),
507 },
508 CalendarBucket {
509 json_key: "year".to_string(),
510 bucket_type: TemporalBucket::Year,
511 data_type: "integer".to_string(),
512 },
513 ],
514 "month_info" => vec![
515 CalendarBucket {
516 json_key: "month".to_string(),
517 bucket_type: TemporalBucket::Month,
518 data_type: "integer".to_string(),
519 },
520 CalendarBucket {
521 json_key: "quarter".to_string(),
522 bucket_type: TemporalBucket::Quarter,
523 data_type: "integer".to_string(),
524 },
525 CalendarBucket {
526 json_key: "year".to_string(),
527 bucket_type: TemporalBucket::Year,
528 data_type: "integer".to_string(),
529 },
530 ],
531 "quarter_info" => vec![
532 CalendarBucket {
533 json_key: "quarter".to_string(),
534 bucket_type: TemporalBucket::Quarter,
535 data_type: "integer".to_string(),
536 },
537 CalendarBucket {
538 json_key: "year".to_string(),
539 bucket_type: TemporalBucket::Year,
540 data_type: "integer".to_string(),
541 },
542 ],
543 "semester_info" => vec![
544 CalendarBucket {
545 json_key: "semester".to_string(),
546 bucket_type: TemporalBucket::Quarter, // Map to Quarter for now
547 data_type: "integer".to_string(),
548 },
549 CalendarBucket {
550 json_key: "year".to_string(),
551 bucket_type: TemporalBucket::Year,
552 data_type: "integer".to_string(),
553 },
554 ],
555 "year_info" => vec![CalendarBucket {
556 json_key: "year".to_string(),
557 bucket_type: TemporalBucket::Year,
558 data_type: "integer".to_string(),
559 }],
560 _ => Vec::new(),
561 }
562 }
563
564 /// Create metadata from column definitions (for testing)
565 ///
566 /// # Errors
567 ///
568 /// Returns `FraiseQLError::Validation` if no time dimension column is found.
569 pub fn from_columns(
570 table_name: String,
571 columns: Vec<(&str, SqlType, bool)>,
572 ) -> Result<FactTableMetadata> {
573 let mut measures = Vec::new();
574 let mut dimension_column: Option<DimensionColumn> = None;
575 let mut filters = Vec::new();
576
577 for (name, sql_type, nullable) in columns {
578 match sql_type {
579 SqlType::Jsonb | SqlType::Json => {
580 // This is the dimension column
581 dimension_column = Some(DimensionColumn {
582 name: name.to_string(),
583 paths: Vec::new(),
584 });
585 },
586 SqlType::Int | SqlType::BigInt | SqlType::Decimal | SqlType::Float => {
587 // Skip id column
588 if name != "id" && !name.ends_with("_id") {
589 // This is a measure
590 measures.push(MeasureColumn {
591 name: name.to_string(),
592 sql_type,
593 nullable,
594 });
595 } else if name != "id" {
596 // This is a filter (_id columns)
597 filters.push(FilterColumn {
598 name: name.to_string(),
599 sql_type,
600 indexed: false,
601 });
602 }
603 },
604 _ => {
605 // This might be a filter column (if not id/created_at/updated_at)
606 if name != "id" && name != "created_at" && name != "updated_at" {
607 filters.push(FilterColumn {
608 name: name.to_string(),
609 sql_type,
610 indexed: false, // Would need to query indexes to determine
611 });
612 }
613 },
614 }
615 }
616
617 let metadata = FactTableMetadata {
618 table_name,
619 measures,
620 dimensions: dimension_column.unwrap_or(DimensionColumn {
621 name: "dimensions".to_string(),
622 paths: Vec::new(),
623 }),
624 denormalized_filters: filters,
625 calendar_dimensions: Vec::new(), // No calendar detection in test helper
626 };
627
628 Self::validate(&metadata)?;
629 Ok(metadata)
630 }
631}