use crate::dialects::DialectType;
use crate::expressions::{Expression, Identifier, TableRef};
use crate::schema::{normalize_name, Schema};
use crate::scope::{Scope, SourceInfo};
use std::collections::{HashMap, HashSet};
use thiserror::Error;
#[derive(Debug, Error, Clone)]
pub enum ResolverError {
#[error("Unknown table: {0}")]
UnknownTable(String),
#[error("Ambiguous column: {column} appears in multiple sources: {sources}")]
AmbiguousColumn { column: String, sources: String },
#[error("Column not found: {0}")]
ColumnNotFound(String),
#[error("Unknown set operation: {0}")]
UnknownSetOperation(String),
}
pub type ResolverResult<T> = Result<T, ResolverError>;
pub struct Resolver<'a> {
pub scope: &'a Scope,
schema: &'a dyn Schema,
pub dialect: Option<DialectType>,
infer_schema: bool,
source_columns_cache: HashMap<String, Vec<String>>,
unambiguous_columns_cache: Option<HashMap<String, String>>,
all_columns_cache: Option<HashSet<String>>,
}
impl<'a> Resolver<'a> {
pub fn new(scope: &'a Scope, schema: &'a dyn Schema, infer_schema: bool) -> Self {
Self {
scope,
schema,
dialect: schema.dialect(),
infer_schema,
source_columns_cache: HashMap::new(),
unambiguous_columns_cache: None,
all_columns_cache: None,
}
}
pub fn get_table(&mut self, column_name: &str) -> Option<String> {
let table_name = self.get_table_name_from_sources(column_name, None);
if table_name.is_some() {
return table_name;
}
if self.infer_schema {
let sources_without_schema: Vec<_> = self
.get_all_source_columns()
.iter()
.filter(|(_, columns)| columns.is_empty() || columns.contains(&"*".to_string()))
.map(|(name, _)| name.clone())
.collect();
if sources_without_schema.len() == 1 {
return Some(sources_without_schema[0].clone());
}
}
None
}
pub fn get_table_identifier(&mut self, column_name: &str) -> Option<Identifier> {
self.get_table(column_name).map(Identifier::new)
}
pub fn table_exists_in_schema(&self, table_name: &str) -> bool {
self.schema.column_names(table_name).is_ok()
}
pub fn find_column_in_outer_schema_tables(&self, column_name: &str) -> Option<String> {
let tables = self.schema.find_tables_for_column(column_name);
let outer_tables: Vec<String> = tables
.into_iter()
.filter(|t| !self.scope.sources.contains_key(t))
.collect();
if outer_tables.len() == 1 {
Some(outer_tables.into_iter().next().unwrap())
} else {
None
}
}
pub fn all_columns(&mut self) -> &HashSet<String> {
if self.all_columns_cache.is_none() {
let mut all = HashSet::new();
for columns in self.get_all_source_columns().values() {
all.extend(columns.iter().cloned());
}
self.all_columns_cache = Some(all);
}
self.all_columns_cache
.as_ref()
.expect("cache populated above")
}
pub fn get_source_columns(&mut self, source_name: &str) -> ResolverResult<Vec<String>> {
if let Some(columns) = self.source_columns_cache.get(source_name) {
return Ok(columns.clone());
}
let source_info = self
.scope
.sources
.get(source_name)
.ok_or_else(|| ResolverError::UnknownTable(source_name.to_string()))?;
let columns = self.extract_columns_from_source(source_info)?;
self.source_columns_cache
.insert(source_name.to_string(), columns.clone());
Ok(columns)
}
fn extract_columns_from_source(&self, source_info: &SourceInfo) -> ResolverResult<Vec<String>> {
let columns = match &source_info.expression {
Expression::Table(table) => {
let table_name = qualified_table_name(table);
match self.schema.column_names(&table_name) {
Ok(cols) => cols,
Err(_) => Vec::new(), }
}
Expression::Subquery(subquery) => {
self.get_named_selects(&subquery.this)
}
Expression::Select(select) => {
self.get_select_column_names(select)
}
Expression::Union(union) => {
self.get_source_columns_from_set_op(&Expression::Union(union.clone()))?
}
Expression::Intersect(intersect) => {
self.get_source_columns_from_set_op(&Expression::Intersect(intersect.clone()))?
}
Expression::Except(except) => {
self.get_source_columns_from_set_op(&Expression::Except(except.clone()))?
}
Expression::Cte(cte) => {
if !cte.columns.is_empty() {
cte.columns.iter().map(|c| c.name.clone()).collect()
} else {
self.get_named_selects(&cte.this)
}
}
_ => Vec::new(),
};
Ok(columns)
}
fn get_named_selects(&self, expr: &Expression) -> Vec<String> {
match expr {
Expression::Select(select) => self.get_select_column_names(select),
Expression::Union(union) => {
self.get_named_selects(&union.left)
}
Expression::Intersect(intersect) => self.get_named_selects(&intersect.left),
Expression::Except(except) => self.get_named_selects(&except.left),
Expression::Subquery(subquery) => self.get_named_selects(&subquery.this),
_ => Vec::new(),
}
}
fn get_select_column_names(&self, select: &crate::expressions::Select) -> Vec<String> {
select
.expressions
.iter()
.filter_map(|expr| self.get_expression_alias(expr))
.collect()
}
fn get_expression_alias(&self, expr: &Expression) -> Option<String> {
match expr {
Expression::Alias(alias) => Some(alias.alias.name.clone()),
Expression::Column(col) => Some(col.name.name.clone()),
Expression::Star(_) => Some("*".to_string()),
Expression::Identifier(id) => Some(id.name.clone()),
_ => None,
}
}
pub fn get_source_columns_from_set_op(
&self,
expression: &Expression,
) -> ResolverResult<Vec<String>> {
match expression {
Expression::Select(select) => Ok(self.get_select_column_names(select)),
Expression::Subquery(subquery) => {
if matches!(
&subquery.this,
Expression::Union(_) | Expression::Intersect(_) | Expression::Except(_)
) {
self.get_source_columns_from_set_op(&subquery.this)
} else {
Ok(self.get_named_selects(&subquery.this))
}
}
Expression::Union(union) => {
self.get_source_columns_from_set_op(&union.left)
}
Expression::Intersect(intersect) => {
self.get_source_columns_from_set_op(&intersect.left)
}
Expression::Except(except) => self.get_source_columns_from_set_op(&except.left),
_ => Err(ResolverError::UnknownSetOperation(format!(
"{:?}",
expression
))),
}
}
fn get_all_source_columns(&mut self) -> HashMap<String, Vec<String>> {
let source_names: Vec<_> = self.scope.sources.keys().cloned().collect();
let mut result = HashMap::new();
for source_name in source_names {
if let Ok(columns) = self.get_source_columns(&source_name) {
result.insert(source_name, columns);
}
}
result
}
fn get_table_name_from_sources(
&mut self,
column_name: &str,
source_columns: Option<&HashMap<String, Vec<String>>>,
) -> Option<String> {
let normalized_column_name = normalize_column_name(column_name, self.dialect);
let unambiguous = match source_columns {
Some(cols) => self.compute_unambiguous_columns(cols),
None => {
if self.unambiguous_columns_cache.is_none() {
let all_source_columns = self.get_all_source_columns();
self.unambiguous_columns_cache =
Some(self.compute_unambiguous_columns(&all_source_columns));
}
self.unambiguous_columns_cache
.clone()
.expect("cache populated above")
}
};
unambiguous.get(&normalized_column_name).cloned()
}
fn compute_unambiguous_columns(
&self,
source_columns: &HashMap<String, Vec<String>>,
) -> HashMap<String, String> {
if source_columns.is_empty() {
return HashMap::new();
}
let mut column_to_sources: HashMap<String, Vec<String>> = HashMap::new();
for (source_name, columns) in source_columns {
for column in columns {
column_to_sources
.entry(normalize_column_name(column, self.dialect))
.or_default()
.push(source_name.clone());
}
}
column_to_sources
.into_iter()
.filter(|(_, sources)| sources.len() == 1)
.map(|(column, sources)| (column, sources.into_iter().next().unwrap()))
.collect()
}
pub fn is_ambiguous(&mut self, column_name: &str) -> bool {
let normalized_column_name = normalize_column_name(column_name, self.dialect);
let all_source_columns = self.get_all_source_columns();
let sources_with_column: Vec<_> = all_source_columns
.iter()
.filter(|(_, columns)| {
columns.iter().any(|column| {
normalize_column_name(column, self.dialect) == normalized_column_name
})
})
.map(|(name, _)| name.clone())
.collect();
sources_with_column.len() > 1
}
pub fn sources_for_column(&mut self, column_name: &str) -> Vec<String> {
let normalized_column_name = normalize_column_name(column_name, self.dialect);
let all_source_columns = self.get_all_source_columns();
all_source_columns
.iter()
.filter(|(_, columns)| {
columns.iter().any(|column| {
normalize_column_name(column, self.dialect) == normalized_column_name
})
})
.map(|(name, _)| name.clone())
.collect()
}
pub fn disambiguate_in_join_context(
&mut self,
column_name: &str,
available_sources: &[String],
) -> Option<String> {
let normalized_column_name = normalize_column_name(column_name, self.dialect);
let mut matching_sources = Vec::new();
for source_name in available_sources {
if let Ok(columns) = self.get_source_columns(source_name) {
if columns.iter().any(|column| {
normalize_column_name(column, self.dialect) == normalized_column_name
}) {
matching_sources.push(source_name.clone());
}
}
}
if matching_sources.len() == 1 {
Some(matching_sources.remove(0))
} else {
None
}
}
}
fn normalize_column_name(name: &str, dialect: Option<DialectType>) -> String {
normalize_name(name, dialect, false, true)
}
pub fn resolve_column(
scope: &Scope,
schema: &dyn Schema,
column_name: &str,
infer_schema: bool,
) -> Option<String> {
let mut resolver = Resolver::new(scope, schema, infer_schema);
resolver.get_table(column_name)
}
pub fn is_column_ambiguous(scope: &Scope, schema: &dyn Schema, column_name: &str) -> bool {
let mut resolver = Resolver::new(scope, schema, true);
resolver.is_ambiguous(column_name)
}
fn qualified_table_name(table: &TableRef) -> String {
let mut parts = Vec::new();
if let Some(catalog) = &table.catalog {
parts.push(catalog.name.clone());
}
if let Some(schema) = &table.schema {
parts.push(schema.name.clone());
}
parts.push(table.name.name.clone());
parts.join(".")
}
#[cfg(test)]
mod tests {
use super::*;
use crate::dialects::Dialect;
use crate::expressions::DataType;
use crate::parser::Parser;
use crate::schema::MappingSchema;
use crate::scope::build_scope;
fn create_test_schema() -> MappingSchema {
let mut schema = MappingSchema::new();
schema
.add_table(
"users",
&[
(
"id".to_string(),
DataType::Int {
length: None,
integer_spelling: false,
},
),
("name".to_string(), DataType::Text),
("email".to_string(), DataType::Text),
],
None,
)
.unwrap();
schema
.add_table(
"orders",
&[
(
"id".to_string(),
DataType::Int {
length: None,
integer_spelling: false,
},
),
(
"user_id".to_string(),
DataType::Int {
length: None,
integer_spelling: false,
},
),
(
"amount".to_string(),
DataType::Double {
precision: None,
scale: None,
},
),
],
None,
)
.unwrap();
schema
}
#[test]
fn test_resolver_basic() {
let ast = Parser::parse_sql("SELECT id, name FROM users").expect("Failed to parse");
let scope = build_scope(&ast[0]);
let schema = create_test_schema();
let mut resolver = Resolver::new(&scope, &schema, true);
let table = resolver.get_table("name");
assert_eq!(table, Some("users".to_string()));
}
#[test]
fn test_resolver_ambiguous_column() {
let ast =
Parser::parse_sql("SELECT id FROM users JOIN orders ON users.id = orders.user_id")
.expect("Failed to parse");
let scope = build_scope(&ast[0]);
let schema = create_test_schema();
let mut resolver = Resolver::new(&scope, &schema, true);
assert!(resolver.is_ambiguous("id"));
assert!(!resolver.is_ambiguous("name"));
assert!(!resolver.is_ambiguous("amount"));
}
#[test]
fn test_resolver_unambiguous_column() {
let ast = Parser::parse_sql(
"SELECT name, amount FROM users JOIN orders ON users.id = orders.user_id",
)
.expect("Failed to parse");
let scope = build_scope(&ast[0]);
let schema = create_test_schema();
let mut resolver = Resolver::new(&scope, &schema, true);
let table = resolver.get_table("name");
assert_eq!(table, Some("users".to_string()));
let table = resolver.get_table("amount");
assert_eq!(table, Some("orders".to_string()));
}
#[test]
fn test_resolver_with_alias() {
let ast = Parser::parse_sql("SELECT u.id FROM users AS u").expect("Failed to parse");
let scope = build_scope(&ast[0]);
let schema = create_test_schema();
let _resolver = Resolver::new(&scope, &schema, true);
assert!(scope.sources.contains_key("u"));
}
#[test]
fn test_sources_for_column() {
let ast = Parser::parse_sql("SELECT * FROM users JOIN orders ON users.id = orders.user_id")
.expect("Failed to parse");
let scope = build_scope(&ast[0]);
let schema = create_test_schema();
let mut resolver = Resolver::new(&scope, &schema, true);
let sources = resolver.sources_for_column("id");
assert!(sources.contains(&"users".to_string()));
assert!(sources.contains(&"orders".to_string()));
let sources = resolver.sources_for_column("email");
assert_eq!(sources, vec!["users".to_string()]);
}
#[test]
fn test_all_columns() {
let ast = Parser::parse_sql("SELECT * FROM users").expect("Failed to parse");
let scope = build_scope(&ast[0]);
let schema = create_test_schema();
let mut resolver = Resolver::new(&scope, &schema, true);
let all = resolver.all_columns();
assert!(all.contains("id"));
assert!(all.contains("name"));
assert!(all.contains("email"));
}
#[test]
fn test_resolver_cte_projected_alias_column() {
let ast = Parser::parse_sql(
"WITH my_cte AS (SELECT id AS emp_id FROM users) SELECT emp_id FROM my_cte",
)
.expect("Failed to parse");
let scope = build_scope(&ast[0]);
let schema = create_test_schema();
let mut resolver = Resolver::new(&scope, &schema, true);
let table = resolver.get_table("emp_id");
assert_eq!(table, Some("my_cte".to_string()));
}
#[test]
fn test_resolve_column_helper() {
let ast = Parser::parse_sql("SELECT name FROM users").expect("Failed to parse");
let scope = build_scope(&ast[0]);
let schema = create_test_schema();
let table = resolve_column(&scope, &schema, "name", true);
assert_eq!(table, Some("users".to_string()));
}
#[test]
fn test_resolver_bigquery_mixed_case_column_names() {
let dialect = Dialect::get(DialectType::BigQuery);
let expr = dialect
.parse("SELECT Name AS name FROM teams")
.unwrap()
.into_iter()
.next()
.expect("expected one expression");
let scope = build_scope(&expr);
let mut schema = MappingSchema::with_dialect(DialectType::BigQuery);
schema
.add_table(
"teams",
&[("Name".into(), DataType::String { length: None })],
None,
)
.expect("schema setup");
let mut resolver = Resolver::new(&scope, &schema, true);
let table = resolver.get_table("Name");
assert_eq!(table, Some("teams".to_string()));
let table = resolver.get_table("name");
assert_eq!(table, Some("teams".to_string()));
}
}