Skip to main content

datafusion_sql/
resolve.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::collections::BTreeSet;
19use std::ops::ControlFlow;
20
21use datafusion_common::{DataFusionError, Result};
22
23use crate::TableReference;
24use crate::parser::{CopyToSource, CopyToStatement, Statement as DFStatement};
25use crate::planner::object_name_to_table_reference;
26use sqlparser::ast::*;
27
28// following constants are used in `resolve_table_references`
29// and should be same as `datafusion/catalog/src/information_schema.rs`
30const INFORMATION_SCHEMA: &str = "information_schema";
31const TABLES: &str = "tables";
32const VIEWS: &str = "views";
33const COLUMNS: &str = "columns";
34const DF_SETTINGS: &str = "df_settings";
35const SCHEMATA: &str = "schemata";
36const ROUTINES: &str = "routines";
37const PARAMETERS: &str = "parameters";
38
39/// All information schema tables
40const INFORMATION_SCHEMA_TABLES: &[&str] = &[
41    TABLES,
42    VIEWS,
43    COLUMNS,
44    DF_SETTINGS,
45    SCHEMATA,
46    ROUTINES,
47    PARAMETERS,
48];
49
50// Collect table/CTE references as `TableReference`s and normalize them during traversal.
51// This avoids a second normalization/conversion pass after visiting the AST.
52struct RelationVisitor {
53    relations: BTreeSet<TableReference>,
54    all_ctes: BTreeSet<TableReference>,
55    ctes_in_scope: Vec<TableReference>,
56    enable_ident_normalization: bool,
57}
58
59impl RelationVisitor {
60    /// Record the reference to `relation`, if it's not a CTE reference.
61    fn insert_relation(&mut self, relation: &ObjectName) -> ControlFlow<DataFusionError> {
62        match object_name_to_table_reference(
63            relation.clone(),
64            self.enable_ident_normalization,
65        ) {
66            Ok(relation) => {
67                if !self.relations.contains(&relation)
68                    && !self.ctes_in_scope.contains(&relation)
69                {
70                    self.relations.insert(relation);
71                }
72                ControlFlow::Continue(())
73            }
74            Err(e) => ControlFlow::Break(e),
75        }
76    }
77}
78
79impl Visitor for RelationVisitor {
80    type Break = DataFusionError;
81
82    fn pre_visit_relation(&mut self, relation: &ObjectName) -> ControlFlow<Self::Break> {
83        self.insert_relation(relation)
84    }
85
86    fn pre_visit_query(&mut self, q: &Query) -> ControlFlow<Self::Break> {
87        if let Some(with) = &q.with {
88            for cte in &with.cte_tables {
89                // The non-recursive CTE name is not in scope when evaluating the CTE itself, so this is valid:
90                // `WITH t AS (SELECT * FROM t) SELECT * FROM t`
91                // Where the first `t` refers to a predefined table. So we are careful here
92                // to visit the CTE first, before putting it in scope.
93                if !with.recursive {
94                    // This is a bit hackish as the CTE will be visited again as part of visiting `q`,
95                    // but thankfully `insert_relation` is idempotent.
96                    cte.visit(self)?;
97                }
98                let cte_name = ObjectName::from(vec![cte.alias.name.clone()]);
99                match object_name_to_table_reference(
100                    cte_name,
101                    self.enable_ident_normalization,
102                ) {
103                    Ok(cte_ref) => self.ctes_in_scope.push(cte_ref),
104                    Err(e) => return ControlFlow::Break(e),
105                }
106            }
107        }
108        ControlFlow::Continue(())
109    }
110
111    fn post_visit_query(&mut self, q: &Query) -> ControlFlow<Self::Break> {
112        if let Some(with) = &q.with {
113            for _ in &with.cte_tables {
114                // Unwrap: We just pushed these in `pre_visit_query`
115                self.all_ctes.insert(self.ctes_in_scope.pop().unwrap());
116            }
117        }
118        ControlFlow::Continue(())
119    }
120
121    fn pre_visit_statement(&mut self, statement: &Statement) -> ControlFlow<Self::Break> {
122        if let Statement::ShowCreate {
123            obj_type: ShowCreateObject::Table | ShowCreateObject::View,
124            obj_name,
125        } = statement
126        {
127            self.insert_relation(obj_name)?;
128        }
129
130        // SHOW statements will later be rewritten into a SELECT from the information_schema
131        let requires_information_schema = matches!(
132            statement,
133            Statement::ShowFunctions { .. }
134                | Statement::ShowVariable { .. }
135                | Statement::ShowStatus { .. }
136                | Statement::ShowVariables { .. }
137                | Statement::ShowCreate { .. }
138                | Statement::ShowColumns { .. }
139                | Statement::ShowTables { .. }
140                | Statement::ShowCollation { .. }
141        );
142        if requires_information_schema {
143            for s in INFORMATION_SCHEMA_TABLES {
144                // Information schema references are synthesized here, so convert directly.
145                let obj = ObjectName::from(vec![
146                    Ident::new(INFORMATION_SCHEMA),
147                    Ident::new(*s),
148                ]);
149                match object_name_to_table_reference(obj, self.enable_ident_normalization)
150                {
151                    Ok(tbl_ref) => {
152                        self.relations.insert(tbl_ref);
153                    }
154                    Err(e) => return ControlFlow::Break(e),
155                }
156            }
157        }
158        ControlFlow::Continue(())
159    }
160}
161
162fn control_flow_to_result(flow: ControlFlow<DataFusionError>) -> Result<()> {
163    match flow {
164        ControlFlow::Continue(()) => Ok(()),
165        ControlFlow::Break(err) => Err(err),
166    }
167}
168
169fn visit_statement(statement: &DFStatement, visitor: &mut RelationVisitor) -> Result<()> {
170    match statement {
171        DFStatement::Statement(s) => {
172            control_flow_to_result(s.as_ref().visit(visitor))?;
173        }
174        DFStatement::CreateExternalTable(table) => {
175            control_flow_to_result(visitor.insert_relation(&table.name))?;
176        }
177        DFStatement::CopyTo(CopyToStatement { source, .. }) => match source {
178            CopyToSource::Relation(table_name) => {
179                control_flow_to_result(visitor.insert_relation(table_name))?;
180            }
181            CopyToSource::Query(query) => {
182                control_flow_to_result(query.visit(visitor))?;
183            }
184        },
185        DFStatement::Explain(explain) => {
186            visit_statement(&explain.statement, visitor)?;
187        }
188        DFStatement::Reset(_) => {}
189    }
190    Ok(())
191}
192
193/// Collects all tables and views referenced in the SQL statement. CTEs are collected separately.
194/// This can be used to determine which tables need to be in the catalog for a query to be planned.
195///
196/// # Returns
197///
198/// A `(table_refs, ctes)` tuple, the first element contains table and view references and the second
199/// element contains any CTE aliases that were defined and possibly referenced.
200///
201/// ## Example
202///
203/// ```
204/// # use datafusion_sql::parser::DFParser;
205/// # use datafusion_sql::resolve::resolve_table_references;
206/// let query = "SELECT a FROM foo where x IN (SELECT y FROM bar)";
207/// let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
208/// let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
209/// assert_eq!(table_refs.len(), 2);
210/// assert_eq!(table_refs[0].to_string(), "bar");
211/// assert_eq!(table_refs[1].to_string(), "foo");
212/// assert_eq!(ctes.len(), 0);
213/// ```
214///
215/// ## Example with CTEs  
216///  
217/// ```  
218/// # use datafusion_sql::parser::DFParser;
219/// # use datafusion_sql::resolve::resolve_table_references;
220/// let query = "with my_cte as (values (1), (2)) SELECT * from my_cte;";
221/// let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
222/// let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
223/// assert_eq!(table_refs.len(), 0);
224/// assert_eq!(ctes.len(), 1);
225/// assert_eq!(ctes[0].to_string(), "my_cte");
226/// ```
227pub fn resolve_table_references(
228    statement: &crate::parser::Statement,
229    enable_ident_normalization: bool,
230) -> Result<(Vec<TableReference>, Vec<TableReference>)> {
231    let mut visitor = RelationVisitor {
232        relations: BTreeSet::new(),
233        all_ctes: BTreeSet::new(),
234        ctes_in_scope: vec![],
235        enable_ident_normalization,
236    };
237
238    visit_statement(statement, &mut visitor)?;
239
240    Ok((
241        visitor.relations.into_iter().collect(),
242        visitor.all_ctes.into_iter().collect(),
243    ))
244}
245
246#[cfg(test)]
247mod tests {
248    use super::*;
249
250    #[test]
251    fn resolve_table_references_shadowed_cte() {
252        use crate::parser::DFParser;
253
254        // An interesting edge case where the `t` name is used both as an ordinary table reference
255        // and as a CTE reference.
256        let query = "WITH t AS (SELECT * FROM t) SELECT * FROM t";
257        let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
258        let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
259        assert_eq!(table_refs.len(), 1);
260        assert_eq!(ctes.len(), 1);
261        assert_eq!(ctes[0].to_string(), "t");
262        assert_eq!(table_refs[0].to_string(), "t");
263
264        // UNION is a special case where the CTE is not in scope for the second branch.
265        let query = "(with t as (select 1) select * from t) union (select * from t)";
266        let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
267        let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
268        assert_eq!(table_refs.len(), 1);
269        assert_eq!(ctes.len(), 1);
270        assert_eq!(ctes[0].to_string(), "t");
271        assert_eq!(table_refs[0].to_string(), "t");
272
273        // Nested CTEs are also handled.
274        // Here the first `u` is a CTE, but the second `u` is a table reference.
275        // While `t` is always a CTE.
276        let query = "(with t as (with u as (select 1) select * from u) select * from u cross join t)";
277        let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
278        let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
279        assert_eq!(table_refs.len(), 1);
280        assert_eq!(ctes.len(), 2);
281        assert_eq!(ctes[0].to_string(), "t");
282        assert_eq!(ctes[1].to_string(), "u");
283        assert_eq!(table_refs[0].to_string(), "u");
284    }
285
286    #[test]
287    fn resolve_table_references_recursive_cte() {
288        use crate::parser::DFParser;
289
290        let query = "
291            WITH RECURSIVE nodes AS ( 
292                SELECT 1 as id
293                UNION ALL 
294                SELECT id + 1 as id 
295                FROM nodes
296                WHERE id < 10
297            )
298            SELECT * FROM nodes
299        ";
300        let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
301        let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
302        assert_eq!(table_refs.len(), 0);
303        assert_eq!(ctes.len(), 1);
304        assert_eq!(ctes[0].to_string(), "nodes");
305    }
306
307    #[test]
308    fn resolve_table_references_cte_with_quoted_reference() {
309        use crate::parser::DFParser;
310
311        let query = r#"with barbaz as (select 1) select * from "barbaz""#;
312        let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
313        let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
314        assert_eq!(ctes.len(), 1);
315        assert_eq!(ctes[0].to_string(), "barbaz");
316        // Quoted reference should still resolve to the CTE when normalization is on
317        assert_eq!(table_refs.len(), 0);
318    }
319
320    #[test]
321    fn resolve_table_references_cte_with_quoted_reference_normalization_off() {
322        use crate::parser::DFParser;
323
324        let query = r#"with barbaz as (select 1) select * from "barbaz""#;
325        let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
326        let (table_refs, ctes) = resolve_table_references(&statement, false).unwrap();
327        assert_eq!(ctes.len(), 1);
328        assert_eq!(ctes[0].to_string(), "barbaz");
329        // Even with normalization off, quoted reference matches same-case CTE name
330        assert_eq!(table_refs.len(), 0);
331    }
332
333    #[test]
334    fn resolve_table_references_cte_with_quoted_reference_uppercase_normalization_on() {
335        use crate::parser::DFParser;
336
337        let query = r#"with FOObar as (select 1) select * from "FOObar""#;
338        let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
339        let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
340        // CTE name is normalized to lowercase, quoted reference preserves case, so they differ
341        assert_eq!(ctes.len(), 1);
342        assert_eq!(ctes[0].to_string(), "foobar");
343        assert_eq!(table_refs.len(), 1);
344        assert_eq!(table_refs[0].to_string(), "FOObar");
345    }
346
347    #[test]
348    fn resolve_table_references_cte_with_quoted_reference_uppercase_normalization_off() {
349        use crate::parser::DFParser;
350
351        let query = r#"with FOObar as (select 1) select * from "FOObar""#;
352        let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
353        let (table_refs, ctes) = resolve_table_references(&statement, false).unwrap();
354        // Without normalization, cases match exactly, so quoted reference resolves to the CTE
355        assert_eq!(ctes.len(), 1);
356        assert_eq!(ctes[0].to_string(), "FOObar");
357        assert_eq!(table_refs.len(), 0);
358    }
359}