Skip to main content

plsql_ir/
sql_columns.rs

1//! Projection + column read/write extraction.
2//!
3//! Builds on the table/alias resolution from
4//! (`sql_resolve`). Given a `SqlStatementModel` whose `tables` +
5//! `alias_scope` are populated, this pass fills `projection`,
6//! `reads`, and `writes` by walking the SELECT list, the
7//! INSERT/UPDATE column targets, and the WHERE/SET/ON predicate
8//! columns — attaching a [`ColumnResolution`] verdict to each.
9//!
10//! Resolution rules:
11//!
12//! * `alias.col` → look the alias up in `AliasScope`; if found,
13//!   `Resolved`; if the alias isn't bound, `Unresolved`.
14//! * bare `col` with exactly one table in scope → `Resolved`
15//!   against that table.
16//! * bare `col` with multiple tables in scope → `Unresolved`
17//!   (ambiguous without catalog column lists; the catalog
18//!   cross-check pass disambiguates later).
19//! * `*` / `alias.*` → `StarExpansion`.
20//!
21//! ## /oracle evidence
22//!
23//! * `DATABASE-REFERENCE.md` PL/SQL Language Reference — the
24//!   SELECT-list / SET-clause / predicate grammar defers to the
25//!   SQL Language Reference.
26//! * `LOW-LEVEL-CATALOGS.md` Data Dictionary View Families —
27//!   `ALL_TAB_COLUMNS` is the authority that turns an
28//!   `Unresolved` bare column into a `Resolved` one once the
29//!   catalog is available (deferred to a later pass).
30
31use crate::sql_sem::{
32    ColumnResolution, ColumnUse, ProjectionItem, SqlSemanticModel, SqlSemanticVerb,
33    SqlStatementModel,
34};
35
36/// Populate `projection`, `reads`, `writes` on `model` from its
37/// raw text + already-resolved `tables` / `alias_scope`.
38/// `raw` is the original SQL text (the model doesn't keep it).
39pub fn extract_columns(model: &mut SqlStatementModel, raw: &str) {
40    let single_table = model.tables.len() == 1;
41    match model.verb {
42        SqlSemanticVerb::Select => {
43            let proj = parse_select_list(raw);
44            for item in &proj {
45                classify_projection_reads(item, single_table, model);
46            }
47            model.projection = proj;
48            for c in predicate_columns(raw) {
49                push_read(model, c, single_table);
50            }
51        }
52        SqlSemanticVerb::Insert => {
53            for c in insert_target_columns(raw) {
54                push_write(model, c, single_table);
55            }
56            // Sub-SELECT projection columns are reads.
57            for item in parse_select_list(raw) {
58                if !item.is_star {
59                    push_read_name(model, &item.expression_text, single_table);
60                }
61            }
62        }
63        SqlSemanticVerb::Update => {
64            for c in update_set_columns(raw) {
65                push_write(model, c, single_table);
66            }
67            for c in predicate_columns(raw) {
68                push_read(model, c, single_table);
69            }
70        }
71        SqlSemanticVerb::Delete => {
72            for c in predicate_columns(raw) {
73                push_read(model, c, single_table);
74            }
75        }
76        SqlSemanticVerb::MergeUpdate
77        | SqlSemanticVerb::MergeInsert
78        | SqlSemanticVerb::MergeDelete => {
79            for c in update_set_columns(raw) {
80                push_write(model, c, single_table);
81            }
82            for c in predicate_columns(raw) {
83                push_read(model, c, single_table);
84            }
85        }
86    }
87}
88
89/// Convenience: run `extract_columns` over every statement in a
90/// `SqlSemanticModel`. The caller supplies the raw text per
91/// statement (the model is text-free by design).
92pub fn extract_columns_for_model(model: &mut SqlSemanticModel, raws: &[String]) {
93    for (i, stmt) in model.statements.iter_mut().enumerate() {
94        if let Some(raw) = raws.get(i) {
95            extract_columns(stmt, raw);
96        }
97    }
98}
99
100fn parse_select_list(raw: &str) -> Vec<ProjectionItem> {
101    let upper = raw.to_ascii_uppercase();
102    let Some(sel) = upper.find("SELECT") else {
103        return Vec::new();
104    };
105    let after = sel + "SELECT".len();
106    // Stop the projection list at INTO or FROM (whichever first).
107    let into = upper[after..].find(" INTO ").map(|p| after + p);
108    let from = upper[after..].find(" FROM ").map(|p| after + p);
109    let end = [into, from]
110        .into_iter()
111        .flatten()
112        .min()
113        .unwrap_or(raw.len());
114    let list = raw[after..end].trim();
115    split_top_level_commas(list)
116        .into_iter()
117        .map(|piece| parse_projection_item(piece.trim()))
118        .filter(|p| !p.expression_text.is_empty())
119        .collect()
120}
121
122fn parse_projection_item(piece: &str) -> ProjectionItem {
123    let is_star = piece == "*" || piece.ends_with(".*");
124    // `expr AS alias` / `expr alias`.
125    let upper = piece.to_ascii_uppercase();
126    if let Some(as_pos) = upper.rfind(" AS ") {
127        let expr = piece[..as_pos].trim().to_string();
128        let alias = piece[as_pos + 4..].trim().to_string();
129        return ProjectionItem {
130            alias,
131            expression_text: expr,
132            is_star,
133        };
134    }
135    // Trailing-token alias only if there's whitespace and the
136    // last token is a bare identifier (avoid splitting
137    // `a.b` or `fn(x)`).
138    if let Some(ws) = piece.rfind(char::is_whitespace) {
139        let head = piece[..ws].trim();
140        let tail = piece[ws..].trim();
141        if !head.is_empty()
142            && tail.chars().all(|c| c.is_ascii_alphanumeric() || c == '_')
143            && !head.ends_with(['(', ','])
144            && !is_star
145        {
146            return ProjectionItem {
147                alias: tail.to_string(),
148                expression_text: head.to_string(),
149                is_star,
150            };
151        }
152    }
153    ProjectionItem {
154        alias: String::new(),
155        expression_text: piece.to_string(),
156        is_star,
157    }
158}
159
160fn classify_projection_reads(
161    item: &ProjectionItem,
162    single_table: bool,
163    model: &mut SqlStatementModel,
164) {
165    if item.is_star {
166        let (qual, _col) = split_qualifier(&item.expression_text);
167        model.reads.push(ColumnUse {
168            qualifier: qual,
169            column: "*".to_string(),
170            resolution: ColumnResolution::StarExpansion,
171        });
172        return;
173    }
174    // Pull bare column identifiers from the expression.
175    for ident in column_idents(&item.expression_text) {
176        push_read_name(model, &ident, single_table);
177    }
178}
179
180fn push_read(model: &mut SqlStatementModel, col: String, single_table: bool) {
181    push_read_name(model, &col, single_table);
182}
183
184fn push_read_name(model: &mut SqlStatementModel, name: &str, single_table: bool) {
185    if let Some(cu) = make_column_use(name, single_table, model) {
186        if !model.reads.contains(&cu) {
187            model.reads.push(cu);
188        }
189    }
190}
191
192fn push_write(model: &mut SqlStatementModel, col: String, single_table: bool) {
193    if let Some(cu) = make_column_use(&col, single_table, model) {
194        if !model.writes.contains(&cu) {
195            model.writes.push(cu);
196        }
197    }
198}
199
200fn make_column_use(name: &str, single_table: bool, model: &SqlStatementModel) -> Option<ColumnUse> {
201    let name = name.trim();
202    if name.is_empty() || is_sql_noise(name) {
203        return None;
204    }
205    let (qualifier, column) = split_qualifier(name);
206    if column.is_empty() || !column.chars().next()?.is_ascii_alphabetic() {
207        return None;
208    }
209    let resolution = if !qualifier.is_empty() {
210        if model.alias_scope.resolve(&qualifier).is_some() {
211            ColumnResolution::Resolved
212        } else {
213            ColumnResolution::Unresolved
214        }
215    } else if single_table {
216        ColumnResolution::Resolved
217    } else {
218        ColumnResolution::Unresolved
219    };
220    Some(ColumnUse {
221        qualifier,
222        column: column.to_ascii_uppercase(),
223        resolution,
224    })
225}
226
227fn split_qualifier(name: &str) -> (String, String) {
228    match name.rsplit_once('.') {
229        Some((q, c)) => (q.trim().to_string(), c.trim().to_string()),
230        None => (String::new(), name.trim().to_string()),
231    }
232}
233
234fn column_idents(expr: &str) -> Vec<String> {
235    let mut out = Vec::new();
236    let mut cur = String::new();
237    for ch in expr.chars() {
238        if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' || ch == '#' || ch == '.' {
239            cur.push(ch);
240        } else {
241            if !cur.is_empty() {
242                out.push(std::mem::take(&mut cur));
243            }
244        }
245    }
246    if !cur.is_empty() {
247        out.push(cur);
248    }
249    out.into_iter().filter(|w| !is_sql_noise(w)).collect()
250}
251
252fn insert_target_columns(raw: &str) -> Vec<String> {
253    // INSERT INTO t (c1, c2, …) VALUES …  — pull the paren list
254    // immediately after the table name.
255    let upper = raw.to_ascii_uppercase();
256    let Some(into) = upper.find("INTO") else {
257        return Vec::new();
258    };
259    let rest = &raw[into + 4..];
260    let Some(open) = rest.find('(') else {
261        return Vec::new();
262    };
263    let Some(close) = rest[open..].find(')') else {
264        return Vec::new();
265    };
266    split_top_level_commas(&rest[open + 1..open + close])
267        .into_iter()
268        .map(|s| s.trim().to_string())
269        .filter(|s| !s.is_empty())
270        .collect()
271}
272
273fn update_set_columns(raw: &str) -> Vec<String> {
274    let upper = raw.to_ascii_uppercase();
275    let Some(set) = upper.find(" SET ") else {
276        return Vec::new();
277    };
278    let after = set + 5;
279    let end = upper[after..]
280        .find(" WHERE ")
281        .map(|p| after + p)
282        .unwrap_or(raw.len());
283    split_top_level_commas(&raw[after..end])
284        .into_iter()
285        .filter_map(|assign| assign.split('=').next().map(|s| s.trim().to_string()))
286        .filter(|s| !s.is_empty())
287        .collect()
288}
289
290fn predicate_columns(raw: &str) -> Vec<String> {
291    let upper = raw.to_ascii_uppercase();
292    let Some(w) = upper.find(" WHERE ") else {
293        return Vec::new();
294    };
295    let pred = &raw[w + 7..];
296    // Stop at GROUP/ORDER/HAVING.
297    let pu = pred.to_ascii_uppercase();
298    let stop = ["GROUP ", "ORDER ", "HAVING ", "CONNECT "]
299        .iter()
300        .filter_map(|kw| pu.find(kw))
301        .min()
302        .unwrap_or(pred.len());
303    column_idents(&pred[..stop])
304}
305
306fn split_top_level_commas(s: &str) -> Vec<String> {
307    let mut out = Vec::new();
308    let mut depth = 0i32;
309    let mut buf = String::new();
310    for ch in s.chars() {
311        match ch {
312            '(' => {
313                depth += 1;
314                buf.push(ch);
315            }
316            ')' => {
317                depth -= 1;
318                buf.push(ch);
319            }
320            ',' if depth == 0 => out.push(std::mem::take(&mut buf)),
321            _ => buf.push(ch),
322        }
323    }
324    if !buf.trim().is_empty() {
325        out.push(buf);
326    }
327    out
328}
329
330fn is_sql_noise(w: &str) -> bool {
331    let u = w.to_ascii_uppercase();
332    matches!(
333        u.as_str(),
334        "AND"
335            | "OR"
336            | "NOT"
337            | "NULL"
338            | "IS"
339            | "IN"
340            | "LIKE"
341            | "BETWEEN"
342            | "EXISTS"
343            | "TRUE"
344            | "FALSE"
345            | "FROM"
346            | "WHERE"
347            | "SELECT"
348            | "INTO"
349            | "VALUES"
350            | "SET"
351            | "DUAL"
352            | "SYSDATE"
353            | "COUNT"
354            | "SUM"
355            | "AVG"
356            | "MIN"
357            | "MAX"
358            | "DISTINCT"
359            | "AS"
360            | "ON"
361            | "USING"
362            | "CASE"
363            | "WHEN"
364            | "THEN"
365            | "ELSE"
366            | "END"
367    ) || u.chars().all(|c| c.is_ascii_digit() || c == '.')
368}
369
370#[cfg(test)]
371mod tests {
372    use super::*;
373    use crate::sql_resolve::resolve_sql;
374
375    #[test]
376    fn select_list_columns_become_reads() {
377        let raw = "SELECT e.id, e.name INTO a, b FROM employees e";
378        let mut m = resolve_sql(raw);
379        extract_columns(&mut m, raw);
380        assert_eq!(m.projection.len(), 2);
381        let cols: Vec<&str> = m.reads.iter().map(|c| c.column.as_str()).collect();
382        assert!(cols.contains(&"ID"));
383        assert!(cols.contains(&"NAME"));
384        // Alias `e` is bound → Resolved.
385        assert!(
386            m.reads
387                .iter()
388                .all(|c| c.resolution == ColumnResolution::Resolved)
389        );
390    }
391
392    #[test]
393    fn star_projection_is_star_expansion() {
394        let raw = "SELECT * INTO r FROM employees";
395        let mut m = resolve_sql(raw);
396        extract_columns(&mut m, raw);
397        assert!(m.projection.iter().any(|p| p.is_star));
398        assert!(
399            m.reads
400                .iter()
401                .any(|c| c.resolution == ColumnResolution::StarExpansion)
402        );
403    }
404
405    #[test]
406    fn bare_column_single_table_resolved() {
407        let raw = "SELECT salary INTO v FROM employees";
408        let mut m = resolve_sql(raw);
409        extract_columns(&mut m, raw);
410        let c = m.reads.iter().find(|c| c.column == "SALARY").unwrap();
411        assert_eq!(c.resolution, ColumnResolution::Resolved);
412    }
413
414    #[test]
415    fn bare_column_multi_table_unresolved() {
416        let raw = "SELECT amount INTO v FROM orders o, payments p WHERE o.id = p.oid";
417        let mut m = resolve_sql(raw);
418        extract_columns(&mut m, raw);
419        let c = m.reads.iter().find(|c| c.column == "AMOUNT");
420        assert_eq!(c.map(|c| c.resolution), Some(ColumnResolution::Unresolved));
421    }
422
423    #[test]
424    fn qualified_unbound_alias_is_unresolved() {
425        let raw = "SELECT zzz.col INTO v FROM employees e";
426        let mut m = resolve_sql(raw);
427        extract_columns(&mut m, raw);
428        let c = m.reads.iter().find(|c| c.column == "COL").unwrap();
429        assert_eq!(c.resolution, ColumnResolution::Unresolved);
430    }
431
432    #[test]
433    fn insert_target_columns_become_writes() {
434        let raw = "INSERT INTO audit (event_id, ts) VALUES (1, SYSDATE)";
435        let mut m = resolve_sql(raw);
436        extract_columns(&mut m, raw);
437        let cols: Vec<&str> = m.writes.iter().map(|c| c.column.as_str()).collect();
438        assert!(cols.contains(&"EVENT_ID"));
439        assert!(cols.contains(&"TS"));
440    }
441
442    #[test]
443    fn update_set_columns_become_writes() {
444        let raw = "UPDATE employees e SET e.salary = e.salary * 1.1 WHERE e.id = 1";
445        let mut m = resolve_sql(raw);
446        extract_columns(&mut m, raw);
447        assert!(m.writes.iter().any(|c| c.column == "SALARY"));
448        // WHERE column is a read.
449        assert!(m.reads.iter().any(|c| c.column == "ID"));
450    }
451
452    #[test]
453    fn delete_predicate_columns_are_reads() {
454        let raw = "DELETE FROM stale WHERE created < SYSDATE - 30";
455        let mut m = resolve_sql(raw);
456        extract_columns(&mut m, raw);
457        assert!(m.reads.iter().any(|c| c.column == "CREATED"));
458        assert!(m.writes.is_empty());
459    }
460
461    #[test]
462    fn projection_alias_split() {
463        let raw = "SELECT e.salary AS pay INTO v FROM employees e";
464        let mut m = resolve_sql(raw);
465        extract_columns(&mut m, raw);
466        let p = &m.projection[0];
467        assert_eq!(p.alias, "pay");
468        assert_eq!(p.expression_text, "e.salary");
469    }
470
471    #[test]
472    fn sql_noise_not_recorded_as_columns() {
473        let raw = "SELECT id INTO v FROM employees WHERE id IS NOT NULL AND id > 0";
474        let mut m = resolve_sql(raw);
475        extract_columns(&mut m, raw);
476        let cols: Vec<&str> = m.reads.iter().map(|c| c.column.as_str()).collect();
477        assert!(
478            !cols
479                .iter()
480                .any(|c| *c == "NULL" || *c == "AND" || *c == "NOT")
481        );
482        assert!(cols.contains(&"ID"));
483    }
484
485    #[test]
486    fn extract_columns_for_model_walks_all_statements() {
487        let mut model = SqlSemanticModel::default();
488        model.push(resolve_sql("SELECT id INTO v FROM t1"));
489        model.push(resolve_sql("SELECT name INTO v FROM t2"));
490        extract_columns_for_model(
491            &mut model,
492            &[
493                "SELECT id INTO v FROM t1".to_string(),
494                "SELECT name INTO v FROM t2".to_string(),
495            ],
496        );
497        assert!(model.statements[0].reads.iter().any(|c| c.column == "ID"));
498        assert!(model.statements[1].reads.iter().any(|c| c.column == "NAME"));
499    }
500
501    #[test]
502    fn serde_round_trip_preserves_column_resolution() {
503        let raw = "SELECT salary INTO v FROM employees";
504        let mut m = resolve_sql(raw);
505        extract_columns(&mut m, raw);
506        let json = serde_json::to_string(&m).unwrap();
507        let back: SqlStatementModel = serde_json::from_str(&json).unwrap();
508        assert_eq!(back, m);
509    }
510}