flowscope-core 0.7.0

Core SQL lineage analysis engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
use super::context::{ColumnRef, StatementContext};
use super::expression::ExpressionAnalyzer;
use super::helpers::{
    alias_visibility_warning, infer_expr_type, is_simple_column_ref, lateral_alias_warning,
    normalize_schema_type,
};
use super::query::OutputColumnParams;
use super::Analyzer;
use crate::types::FilterClauseType;
use sqlparser::ast::{self, Select, SelectItem};
use std::collections::{HashMap, HashSet};

/// Analyzes SELECT statements to extract column lineage.
///
/// A `SelectAnalyzer` is a short-lived view that mutably borrows the shared
/// analyzer and the per-statement context for the duration of one SELECT body.
pub(crate) struct SelectAnalyzer<'a, 'b> {
    /// Shared analyzer. This type reads its request dialect and schema, and
    /// pushes warnings onto its `issues` list.
    analyzer: &'a mut Analyzer<'b>,
    /// Context of the statement being analyzed; receives grouping state,
    /// output columns and recorded source columns.
    ctx: &'a mut StatementContext,
    /// Optional target node handed through to every output column and to
    /// wildcard expansion.
    target_node: Option<String>,
}

impl<'a, 'b> SelectAnalyzer<'a, 'b> {
    pub(crate) fn new(
        analyzer: &'a mut Analyzer<'b>,
        ctx: &'a mut StatementContext,
        target_node: Option<String>,
    ) -> Self {
        Self {
            analyzer,
            ctx,
            target_node,
        }
    }

    /// Analyze a SELECT statement's projection, group by, and filter clauses.
    ///
    /// This method populates:
    /// - Output columns in the context
    /// - Filter predicates
    /// - Aggregation info
    pub(crate) fn analyze(&mut self, select: &Select) {
        self.ctx.clear_grouping();

        self.analyze_group_by(&select.group_by);
        self.analyze_projection(&select.projection);
        self.analyze_selection(&select.selection);
        self.analyze_having(&select.having);
    }

    /// Walks the GROUP BY clause and registers each distinct grouping expression.
    ///
    /// `GROUP BY ALL` only sets `has_group_by` on the context. For an explicit
    /// expression list, every expression is normalized to a string key; the
    /// first occurrence of each key is added as a grouping column and then
    /// analyzed, later duplicates are skipped.
    ///
    /// # Limitations
    ///
    /// TODO: alias visibility checking in GROUP BY is incomplete. GROUP BY is
    /// analyzed before the SELECT projection, so `output_columns` is typically
    /// empty when the alias lookup below runs. Detecting aliases properly would
    /// need a multi-pass approach:
    /// 1. First pass: collect SELECT aliases
    /// 2. Second pass: analyze GROUP BY with alias knowledge
    /// 3. Third pass: analyze projection with grouping context
    ///
    /// Until then the check only fires in edge cases where `output_columns` was
    /// already populated from a previous statement or context.
    fn analyze_group_by(&mut self, group_by: &ast::GroupByExpr) {
        let exprs = match group_by {
            ast::GroupByExpr::All(_) => {
                self.ctx.has_group_by = true;
                return;
            }
            ast::GroupByExpr::Expressions(exprs, _) => exprs,
        };

        let dialect = self.analyzer.request.dialect;
        let mut seen_keys = HashSet::new();

        for group_by_expr in exprs {
            // The normalizing analyzer lives only inside this block so its
            // borrows are released before `self` is used again below.
            let key = {
                let normalizer = ExpressionAnalyzer::new(self.analyzer, self.ctx);
                normalizer.normalize_group_by_expr(group_by_expr)
            };

            // Limited alias visibility check (see the doc comment). It runs
            // before de-duplication, so a repeated alias warns each time.
            let alias_hit = self
                .ctx
                .output_columns
                .iter()
                .find_map(|c| (c.name == key).then(|| c.name.clone()));
            match alias_hit {
                Some(alias_name) if !dialect.alias_in_group_by() => {
                    self.emit_alias_warning("GROUP BY", &alias_name);
                }
                _ => {}
            }

            if seen_keys.contains(&key) {
                continue;
            }
            seen_keys.insert(key.clone());

            // A fresh analyzer performs the actual traversal of the expression.
            let mut expr_analyzer = ExpressionAnalyzer::new(self.analyzer, self.ctx);
            expr_analyzer.ctx.add_grouping_column(key);
            expr_analyzer.analyze(group_by_expr);
        }
    }

    fn analyze_projection(&mut self, projection: &[SelectItem]) {
        // Track aliases defined in this SELECT list for lateral column alias checking.
        // For dialects that support lateral aliases, we also track the sources so we can
        // resolve references to them in subsequent SELECT items.
        let mut defined_aliases: HashSet<String> = HashSet::new();
        // Maps normalized alias name -> sources (column refs that compose the alias)
        let mut lateral_alias_sources: HashMap<String, Vec<ColumnRef>> = HashMap::new();

        let dialect = self.analyzer.request.dialect;
        let supports_lateral = dialect.lateral_column_alias();

        for (idx, item) in projection.iter().enumerate() {
            match item {
                SelectItem::UnnamedExpr(expr) => {
                    // Check for lateral column alias usage
                    self.check_lateral_column_alias(expr, &defined_aliases);

                    let (mut sources, name, aggregation) = {
                        let mut ea = ExpressionAnalyzer::new(self.analyzer, self.ctx);
                        let column_refs = ea.extract_column_refs_with_warning(expr);
                        (
                            column_refs,
                            ea.derive_column_name(expr, idx),
                            ea.detect_aggregation(expr),
                        )
                    };

                    // Resolve lateral alias references if dialect supports them
                    if supports_lateral {
                        sources = self.resolve_lateral_alias_sources(
                            expr,
                            sources,
                            &lateral_alias_sources,
                        );
                    }

                    let is_simple_ref = is_simple_column_ref(expr);
                    let expr_text = if is_simple_ref {
                        None
                    } else {
                        Some(expr.to_string())
                    };
                    // First try to infer type from expression structure (literals, functions, etc.)
                    // If that fails, look up the type from CTE/subquery output columns
                    let data_type = infer_expr_type(expr)
                        .map(|t| t.to_string())
                        .or_else(|| self.lookup_source_column_type(&sources));

                    // Record source columns for implied schema. For simple column references,
                    // we can safely propagate the type. For transformed expressions (CAST,
                    // functions, arithmetic), we only record the column existence without
                    // a type since the expression output type differs from source column type.
                    let source_type = if is_simple_ref { &data_type } else { &None };
                    self.record_source_columns_with_type(&sources, source_type);

                    self.analyzer.add_output_column_with_aggregation(
                        self.ctx,
                        OutputColumnParams {
                            name,
                            sources,
                            expression: expr_text,
                            data_type,
                            target_node: self.target_node.clone(),
                            approximate: false,
                            aggregation,
                        },
                    );
                }
                SelectItem::ExprWithAlias { expr, alias } => {
                    // Check for lateral column alias usage
                    self.check_lateral_column_alias(expr, &defined_aliases);

                    let (mut sources, aggregation) = {
                        let mut ea = ExpressionAnalyzer::new(self.analyzer, self.ctx);
                        let column_refs = ea.extract_column_refs_with_warning(expr);
                        (column_refs, ea.detect_aggregation(expr))
                    };

                    // Resolve lateral alias references if dialect supports them
                    if supports_lateral {
                        sources = self.resolve_lateral_alias_sources(
                            expr,
                            sources,
                            &lateral_alias_sources,
                        );
                    }

                    let name = alias.value.clone();
                    let is_simple_ref = is_simple_column_ref(expr);
                    let expr_text = if is_simple_ref {
                        None
                    } else {
                        Some(expr.to_string())
                    };
                    // First try to infer type from expression structure (literals, functions, etc.)
                    // If that fails, look up the type from CTE/subquery output columns
                    let data_type = infer_expr_type(expr)
                        .map(|t| t.to_string())
                        .or_else(|| self.lookup_source_column_type(&sources));

                    // Record source columns for implied schema. For simple column references,
                    // we can safely propagate the type. For transformed expressions (CAST,
                    // functions, arithmetic), we only record the column existence without
                    // a type since the expression output type differs from source column type.
                    let source_type = if is_simple_ref { &data_type } else { &None };
                    self.record_source_columns_with_type(&sources, source_type);

                    // Record this alias for subsequent lateral column alias checking
                    let normalized_alias = self.analyzer.normalize_identifier(&name);
                    defined_aliases.insert(normalized_alias.clone());

                    // Track sources for lateral alias resolution in subsequent items
                    if supports_lateral {
                        lateral_alias_sources.insert(normalized_alias, sources.clone());
                    }

                    self.analyzer.add_output_column_with_aggregation(
                        self.ctx,
                        OutputColumnParams {
                            name,
                            sources,
                            expression: expr_text,
                            data_type,
                            target_node: self.target_node.clone(),
                            approximate: false,
                            aggregation,
                        },
                    );
                }
                SelectItem::QualifiedWildcard(name, _) => {
                    let qualifier = name.to_string();
                    // SelectItemQualifiedWildcardKind::Display appends ".*"
                    let qualifier = qualifier.strip_suffix(".*").unwrap_or(&qualifier);
                    self.analyzer.expand_wildcard(
                        self.ctx,
                        Some(qualifier),
                        self.target_node.as_deref(),
                    );
                }
                SelectItem::Wildcard(_) => {
                    self.analyzer
                        .expand_wildcard(self.ctx, None, self.target_node.as_deref());
                }
            }
        }
    }

    /// Rewrites `sources` so that references to earlier SELECT-list aliases point
    /// at the columns those aliases were built from.
    ///
    /// Intended for dialects that support lateral column aliases (BigQuery,
    /// Snowflake, etc.). For every simple identifier in `expr` whose normalized
    /// form is a key of `lateral_alias_sources`:
    /// - the alias's own sources are queued for addition, and
    /// - any table-less entry in `sources` whose normalized column equals the
    ///   alias is dropped, since it names the alias rather than a real column.
    ///
    /// The queued sources are appended and the result is de-duplicated by
    /// `(table, column)`, keeping the first occurrence.
    ///
    /// Example: `SELECT a + 1 AS b, b + 1 AS c FROM t`
    /// While processing `c`, the identifier `b` matches the lateral alias, so
    /// `c` ends up sourced from `t.a` (via `b`) rather than from an unresolved
    /// column `b`.
    fn resolve_lateral_alias_sources(
        &self,
        expr: &sqlparser::ast::Expr,
        mut sources: Vec<ColumnRef>,
        lateral_alias_sources: &HashMap<String, Vec<ColumnRef>>,
    ) -> Vec<ColumnRef> {
        // Nothing to resolve against: hand the input back untouched.
        if lateral_alias_sources.is_empty() {
            return sources;
        }

        let identifiers = ExpressionAnalyzer::extract_simple_identifiers(expr);
        let mut inherited: Vec<ColumnRef> = Vec::new();

        for ident in &identifiers {
            let alias_key = self.analyzer.normalize_identifier(ident);
            let Some(alias_sources) = lateral_alias_sources.get(&alias_key) else {
                continue;
            };

            inherited.extend(alias_sources.iter().cloned());

            // Keep qualified refs and unqualified refs to other names; drop
            // only the unqualified ref that is the alias itself.
            sources.retain(|candidate| {
                candidate.table.is_some()
                    || self.analyzer.normalize_identifier(&candidate.column) != alias_key
            });
        }

        sources.append(&mut inherited);

        // Collapse duplicates so the lineage does not get duplicate edges.
        let mut seen: HashSet<(Option<String>, String)> = HashSet::new();
        sources.retain(|s| seen.insert((s.table.clone(), s.column.clone())));

        sources
    }

    /// Pushes an alias-visibility warning onto the analyzer's issue list.
    ///
    /// The warning is built from the request dialect, the clause name (e.g.
    /// `"HAVING"`), the offending alias and the current statement index.
    fn emit_alias_warning(&mut self, clause_name: &str, alias_name: &str) {
        let warning = alias_visibility_warning(
            self.analyzer.request.dialect,
            clause_name,
            alias_name,
            self.ctx.statement_index,
        );
        self.analyzer.issues.push(warning);
    }

    /// Warns when `expr` refers to an alias defined earlier in the same SELECT
    /// list and the dialect does not support lateral column aliases.
    ///
    /// Does nothing if the dialect supports lateral aliases or no alias has
    /// been defined yet. Otherwise one warning is pushed for each simple
    /// identifier in `expr` whose normalized form is in `defined_aliases`.
    fn check_lateral_column_alias(
        &mut self,
        expr: &sqlparser::ast::Expr,
        defined_aliases: &HashSet<String>,
    ) {
        let dialect = self.analyzer.request.dialect;
        if dialect.lateral_column_alias() || defined_aliases.is_empty() {
            return;
        }

        let statement_index = self.ctx.statement_index;
        let identifiers = ExpressionAnalyzer::extract_simple_identifiers(expr);
        for ident in &identifiers {
            let is_earlier_alias =
                defined_aliases.contains(&self.analyzer.normalize_identifier(ident));
            if is_earlier_alias {
                let warning = lateral_alias_warning(dialect, ident, statement_index);
                self.analyzer.issues.push(warning);
            }
        }
    }

    /// Analyzes the WHERE clause, if there is one, and captures its predicates
    /// as `FilterClauseType::Where` filters.
    fn analyze_selection(&mut self, selection: &Option<sqlparser::ast::Expr>) {
        let Some(predicate) = selection.as_ref() else {
            return;
        };

        let mut where_analyzer = ExpressionAnalyzer::new(self.analyzer, self.ctx);
        where_analyzer.analyze(predicate);
        where_analyzer.capture_filter_predicates(predicate, FilterClauseType::Where);
    }

    /// Analyzes the HAVING clause, if there is one, and captures its predicates
    /// as `FilterClauseType::Having` filters.
    ///
    /// When the dialect does not allow aliases in HAVING, the expression is
    /// first checked for references to output column aliases.
    fn analyze_having(&mut self, having: &Option<sqlparser::ast::Expr>) {
        let Some(having_expr) = having.as_ref() else {
            return;
        };

        let aliases_allowed = self.analyzer.request.dialect.alias_in_having();
        if !aliases_allowed {
            self.check_alias_in_clause(having_expr, "HAVING");
        }

        let mut having_analyzer = ExpressionAnalyzer::new(self.analyzer, self.ctx);
        having_analyzer.analyze(having_expr);
        having_analyzer.capture_filter_predicates(having_expr, FilterClauseType::Having);
    }

    /// Emits an alias warning for each simple identifier in `expr` that matches
    /// an output column name.
    ///
    /// Both sides are compared after identifier normalization. The warning
    /// carries the output column's stored name, not the spelling used in the
    /// clause. Currently called for HAVING; `clause_name` labels the warning so
    /// other clauses can reuse it.
    fn check_alias_in_clause(&mut self, expr: &sqlparser::ast::Expr, clause_name: &str) {
        let identifiers = ExpressionAnalyzer::extract_simple_identifiers(expr);
        for ident in &identifiers {
            let wanted = self.analyzer.normalize_identifier(ident);
            let matched_alias = self
                .ctx
                .output_columns
                .iter()
                .find(|c| self.analyzer.normalize_identifier(&c.name) == wanted)
                .map(|c| c.name.clone());

            let Some(alias_name) = matched_alias else {
                continue;
            };
            self.emit_alias_warning(clause_name, &alias_name);
        }
    }

    /// Records the given source columns on the context for implied schema
    /// tracking, optionally with a data type.
    ///
    /// A source is recorded only if it carries a table qualifier and that
    /// qualifier resolves to a canonical table; all other sources are skipped.
    /// `data_type` is cloned into every recorded column. The context's
    /// `record_source_column` is expected not to overwrite an already known
    /// type; that cannot be verified from this file.
    fn record_source_columns_with_type(
        &mut self,
        sources: &[ColumnRef],
        data_type: &Option<String>,
    ) {
        for source in sources {
            if let Some(table) = source.table.as_deref() {
                if let Some(canonical) = self.analyzer.resolve_table_alias(self.ctx, Some(table)) {
                    self.ctx
                        .record_source_column(&canonical, &source.column, data_type.clone());
                }
            }
        }
    }

    /// Finds the data type of the single column an expression is sourced from.
    ///
    /// Returns `None` unless `sources` holds exactly one column reference.
    ///
    /// Resolution priority, per candidate table:
    /// 1. CTE/derived table output columns, which lets types propagate through
    ///    CTE chains and subqueries. The first column whose name equals the
    ///    normalized source column decides; it only wins if it has a type.
    /// 2. Schema registry (base tables with user-provided or DDL-inferred
    ///    schema); the registry type is passed through `normalize_schema_type`.
    ///
    /// A qualified reference considers only its resolved table and yields
    /// `None` if the qualifier cannot be resolved. An unqualified reference
    /// tries every table in the current scope in order and returns the first
    /// type found.
    fn lookup_source_column_type(&self, sources: &[ColumnRef]) -> Option<String> {
        // Only a lone source column has a type worth inheriting.
        let [source] = sources else {
            return None;
        };
        let normalized_col = self.analyzer.normalize_identifier(&source.column);

        match source.table.as_deref() {
            Some(table) => {
                // Resolve the alias to the canonical name used for lookups.
                let canonical = self.analyzer.resolve_table_alias(self.ctx, Some(table))?;

                if let Some(cte_cols) = self.ctx.resolve_subquery_columns(&canonical) {
                    let inherited = cte_cols
                        .iter()
                        .find(|c| c.name == normalized_col)
                        .and_then(|c| c.data_type.clone());
                    if inherited.is_some() {
                        return inherited;
                    }
                }

                self.analyzer
                    .schema
                    .lookup_column_type(&canonical, &source.column)
                    .map(|schema_type| normalize_schema_type(&schema_type))
            }
            None => {
                for table_canonical in self.ctx.tables_in_current_scope() {
                    if let Some(cte_cols) = self.ctx.resolve_subquery_columns(&table_canonical) {
                        let inherited = cte_cols
                            .iter()
                            .find(|c| c.name == normalized_col)
                            .and_then(|c| c.data_type.clone());
                        if inherited.is_some() {
                            return inherited;
                        }
                    }

                    let registry_type = self
                        .analyzer
                        .schema
                        .lookup_column_type(&table_canonical, &source.column);
                    if let Some(schema_type) = registry_type {
                        return Some(normalize_schema_type(&schema_type));
                    }
                }

                None
            }
        }
    }
}