Skip to main content

polyglot_sql/dialects/
spark.rs

1//! Spark SQL Dialect
2//!
3//! Spark SQL-specific transformations based on sqlglot patterns.
4//! Key features (extends Hive with modern SQL):
5//! - TRY_CAST is supported (Spark 3+)
6//! - ILIKE is supported (Spark 3+)
7//! - Uses backticks for identifiers
8//! - ARRAY_AGG, COLLECT_LIST for array aggregation
9//! - STRING_AGG / LISTAGG supported (Spark 4+)
10//! - DATE_ADD with unit parameter (Spark 3+)
11//! - TIMESTAMPADD, TIMESTAMPDIFF (Spark 3+)
12//! - More PostgreSQL-like syntax than Hive
13
14use super::{DialectImpl, DialectType};
15use crate::error::Result;
16use crate::expressions::{
17    CeilFunc, CurrentTimestamp, DataType, DateTimeField, Expression, ExtractFunc, Function,
18    Literal, StructField, UnaryFunc, VarArgFunc,
19};
20#[cfg(feature = "generate")]
21use crate::generator::GeneratorConfig;
22use crate::tokens::TokenizerConfig;
23
/// Spark SQL dialect.
///
/// A field-less marker type: it carries no state of its own. All
/// Spark-specific behavior is provided by its [`DialectImpl`] implementation
/// (tokenizer settings, plus generator settings and expression rewrites when
/// the `generate` / `transpile` features are enabled).
pub struct SparkDialect;
26
27impl DialectImpl for SparkDialect {
28    fn dialect_type(&self) -> DialectType {
29        DialectType::Spark
30    }
31
32    fn tokenizer_config(&self) -> TokenizerConfig {
33        let mut config = TokenizerConfig::default();
34        // Spark uses backticks for identifiers (NOT double quotes)
35        config.identifiers.clear();
36        config.identifiers.insert('`', '`');
37        // Spark (like Hive) uses double quotes as string delimiters (QUOTES = ["'", '"'])
38        config.quotes.insert("\"".to_string(), "\"".to_string());
39        // Spark (like Hive) uses backslash escapes in strings (STRING_ESCAPES = ["\\"])
40        config.string_escapes.push('\\');
41        // Spark supports DIV keyword for integer division (inherited from Hive)
42        config
43            .keywords
44            .insert("DIV".to_string(), crate::tokens::TokenType::Div);
45        config
46            .keywords
47            .insert("REPAIR".to_string(), crate::tokens::TokenType::Command);
48        config
49            .keywords
50            .insert("MSCK".to_string(), crate::tokens::TokenType::Command);
51        // Spark numeric literal suffixes (same as Hive): 1L -> BIGINT, 1S -> SMALLINT, etc.
52        config
53            .numeric_literals
54            .insert("L".to_string(), "BIGINT".to_string());
55        config
56            .numeric_literals
57            .insert("S".to_string(), "SMALLINT".to_string());
58        config
59            .numeric_literals
60            .insert("Y".to_string(), "TINYINT".to_string());
61        config
62            .numeric_literals
63            .insert("D".to_string(), "DOUBLE".to_string());
64        config
65            .numeric_literals
66            .insert("F".to_string(), "FLOAT".to_string());
67        config
68            .numeric_literals
69            .insert("BD".to_string(), "DECIMAL".to_string());
70        // Spark allows identifiers to start with digits (e.g., 1a, 1_a)
71        config.identifiers_can_start_with_digit = true;
72        // Spark: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False
73        // Backslashes in raw strings are always literal (no escape processing)
74        config.string_escapes_allowed_in_raw_strings = false;
75        config
76    }
77
78    #[cfg(feature = "generate")]
79
80    fn generator_config(&self) -> GeneratorConfig {
81        use crate::generator::IdentifierQuoteStyle;
82        GeneratorConfig {
83            identifier_quote: '`',
84            identifier_quote_style: IdentifierQuoteStyle::BACKTICK,
85            dialect: Some(DialectType::Spark),
86            // Spark uses colon separator in STRUCT field definitions: STRUCT<field_name: TYPE>
87            struct_field_sep: ": ",
88            // Spark doesn't use AS before RETURN in function definitions
89            create_function_return_as: false,
90            // Spark places alias after the TABLESAMPLE clause
91            alias_post_tablesample: true,
92            tablesample_seed_keyword: "REPEATABLE",
93            join_hints: false,
94            identifiers_can_start_with_digit: true,
95            // Spark uses COMMENT 'value' without = sign
96            schema_comment_with_eq: false,
97            ..Default::default()
98        }
99    }
100
101    #[cfg(feature = "transpile")]
102
103    fn transform_expr(&self, expr: Expression) -> Result<Expression> {
104        match expr {
105            // IFNULL -> COALESCE in Spark
106            Expression::IfNull(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
107                original_name: None,
108                expressions: vec![f.this, f.expression],
109                inferred_type: None,
110            }))),
111
112            // NVL is supported in Spark (from Hive), but COALESCE is standard
113            Expression::Nvl(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
114                original_name: None,
115                expressions: vec![f.this, f.expression],
116                inferred_type: None,
117            }))),
118
119            // Cast: normalize VARCHAR(n) -> STRING, CHAR(n) -> STRING for Spark
120            Expression::Cast(mut c) => {
121                c.to = Self::normalize_spark_type(c.to);
122                Ok(Expression::Cast(c))
123            }
124
125            // TryCast stays as TryCast in Spark (Spark supports TRY_CAST natively)
126            Expression::TryCast(mut c) => {
127                c.to = Self::normalize_spark_type(c.to);
128                Ok(Expression::TryCast(c))
129            }
130
131            // SafeCast -> TRY_CAST
132            Expression::SafeCast(mut c) => {
133                c.to = Self::normalize_spark_type(c.to);
134                Ok(Expression::TryCast(c))
135            }
136
137            // TRIM: non-standard comma syntax -> standard FROM syntax
138            // TRIM('SL', 'SSparkSQLS') -> TRIM('SL' FROM 'SSparkSQLS')
139            Expression::Trim(mut t) => {
140                if !t.sql_standard_syntax && t.characters.is_some() {
141                    // Convert comma syntax to standard SQL syntax
142                    // Fields already have correct semantics: this=string, characters=chars
143                    t.sql_standard_syntax = true;
144                }
145                Ok(Expression::Trim(t))
146            }
147
148            // ILIKE is supported in Spark 3+
149            Expression::ILike(op) => Ok(Expression::ILike(op)),
150
151            // UNNEST -> EXPLODE in Spark (Hive compatibility)
152            Expression::Unnest(f) => Ok(Expression::Explode(Box::new(UnaryFunc::new(f.this)))),
153
154            // EXPLODE is native to Spark
155            Expression::Explode(f) => Ok(Expression::Explode(f)),
156
157            // ExplodeOuter is supported in Spark
158            Expression::ExplodeOuter(f) => Ok(Expression::ExplodeOuter(f)),
159
160            // RANDOM -> RAND in Spark
161            Expression::Random(_) => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
162                seed: None,
163                lower: None,
164                upper: None,
165            }))),
166
167            // Rand is native to Spark
168            Expression::Rand(r) => Ok(Expression::Rand(r)),
169
170            // || (Concat) -> CONCAT in Spark
171            Expression::Concat(op) => Ok(Expression::Function(Box::new(Function::new(
172                "CONCAT".to_string(),
173                vec![op.left, op.right],
174            )))),
175
176            // ParseJson: handled by generator (emits just the string literal for Spark)
177
178            // Generic function transformations
179            Expression::Function(f) => self.transform_function(*f),
180
181            // Generic aggregate function transformations
182            Expression::AggregateFunction(f) => self.transform_aggregate_function(f),
183
184            // $N parameters -> ${N} in Spark (DollarBrace style)
185            Expression::Parameter(mut p)
186                if p.style == crate::expressions::ParameterStyle::Dollar =>
187            {
188                p.style = crate::expressions::ParameterStyle::DollarBrace;
189                // Convert index to name for DollarBrace format
190                if let Some(idx) = p.index {
191                    p.name = Some(idx.to_string());
192                }
193                Ok(Expression::Parameter(p))
194            }
195
196            // JSONExtract with variant_extract (Databricks colon syntax) -> GET_JSON_OBJECT
197            Expression::JSONExtract(je) if je.variant_extract.is_some() => {
198                // Convert path: 'item[1].price' -> '$.item[1].price'
199                let path = match *je.expression {
200                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
201                        let Literal::String(s) = lit.as_ref() else {
202                            unreachable!()
203                        };
204                        Expression::Literal(Box::new(Literal::String(format!("$.{}", s))))
205                    }
206                    other => other,
207                };
208                Ok(Expression::Function(Box::new(Function::new(
209                    "GET_JSON_OBJECT".to_string(),
210                    vec![*je.this, path],
211                ))))
212            }
213
214            // Pass through everything else
215            _ => Ok(expr),
216        }
217    }
218}
219
220#[cfg(feature = "transpile")]
221impl SparkDialect {
222    /// Normalize a data type for Spark:
223    /// - VARCHAR/CHAR without length -> STRING
224    /// - VARCHAR(n)/CHAR(n) with length -> keep as-is
225    /// - TEXT -> STRING
226    fn normalize_spark_type(dt: DataType) -> DataType {
227        match dt {
228            DataType::VarChar { length: None, .. }
229            | DataType::Char { length: None }
230            | DataType::Text => DataType::Custom {
231                name: "STRING".to_string(),
232            },
233            // VARCHAR(n) and CHAR(n) with length are kept as-is
234            DataType::VarChar { .. } | DataType::Char { .. } => dt,
235            // Also normalize struct fields recursively
236            DataType::Struct { fields, nested } => {
237                let normalized_fields: Vec<StructField> = fields
238                    .into_iter()
239                    .map(|mut f| {
240                        f.data_type = Self::normalize_spark_type(f.data_type);
241                        f
242                    })
243                    .collect();
244                DataType::Struct {
245                    fields: normalized_fields,
246                    nested,
247                }
248            }
249            _ => dt,
250        }
251    }
252
253    fn transform_function(&self, f: Function) -> Result<Expression> {
254        let name_upper = f.name.to_uppercase();
255        match name_upper.as_str() {
256            // IFNULL -> COALESCE
257            "IFNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
258                original_name: None,
259                expressions: f.args,
260                inferred_type: None,
261            }))),
262
263            // NVL -> COALESCE
264            "NVL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
265                original_name: None,
266                expressions: f.args,
267                inferred_type: None,
268            }))),
269
270            // ISNULL -> COALESCE
271            "ISNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
272                original_name: None,
273                expressions: f.args,
274                inferred_type: None,
275            }))),
276
277            // GROUP_CONCAT -> CONCAT_WS + COLLECT_LIST in older Spark
278            // In Spark 4+, STRING_AGG is available
279            "GROUP_CONCAT" if !f.args.is_empty() => {
280                // For simplicity, use COLLECT_LIST (array aggregation)
281                Ok(Expression::Function(Box::new(Function::new(
282                    "COLLECT_LIST".to_string(),
283                    f.args,
284                ))))
285            }
286
287            // STRING_AGG is supported in Spark 4+
288            // For older versions, fall back to CONCAT_WS + COLLECT_LIST
289            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
290                Function::new("COLLECT_LIST".to_string(), f.args),
291            ))),
292
293            // LISTAGG -> STRING_AGG in Spark 4+ (or COLLECT_LIST for older)
294            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
295                "COLLECT_LIST".to_string(),
296                f.args,
297            )))),
298
299            // SUBSTRING is native to Spark
300            "SUBSTRING" | "SUBSTR" => Ok(Expression::Function(Box::new(f))),
301
302            // LENGTH is native to Spark
303            "LENGTH" => Ok(Expression::Function(Box::new(f))),
304
305            // LEN -> LENGTH
306            "LEN" if f.args.len() == 1 => Ok(Expression::Length(Box::new(UnaryFunc::new(
307                f.args.into_iter().next().unwrap(),
308            )))),
309
310            // RANDOM -> RAND
311            "RANDOM" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
312                seed: None,
313                lower: None,
314                upper: None,
315            }))),
316
317            // RAND is native to Spark
318            "RAND" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
319                seed: None,
320                lower: None,
321                upper: None,
322            }))),
323
324            // NOW -> CURRENT_TIMESTAMP
325            "NOW" => Ok(Expression::CurrentTimestamp(
326                crate::expressions::CurrentTimestamp {
327                    precision: None,
328                    sysdate: false,
329                },
330            )),
331
332            // GETDATE -> CURRENT_TIMESTAMP
333            "GETDATE" => Ok(Expression::CurrentTimestamp(
334                crate::expressions::CurrentTimestamp {
335                    precision: None,
336                    sysdate: false,
337                },
338            )),
339
340            // CURRENT_TIMESTAMP is native
341            "CURRENT_TIMESTAMP" => Ok(Expression::CurrentTimestamp(
342                crate::expressions::CurrentTimestamp {
343                    precision: None,
344                    sysdate: false,
345                },
346            )),
347
348            // CURRENT_DATE is native
349            "CURRENT_DATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
350
351            // TO_DATE is native to Spark; strip default format 'yyyy-MM-dd'
352            "TO_DATE" if f.args.len() == 2 => {
353                let is_default_format = matches!(&f.args[1], Expression::Literal(lit) if matches!(lit.as_ref(), crate::expressions::Literal::String(s) if s == "yyyy-MM-dd"));
354                if is_default_format {
355                    Ok(Expression::Function(Box::new(Function::new(
356                        "TO_DATE".to_string(),
357                        vec![f.args.into_iter().next().unwrap()],
358                    ))))
359                } else {
360                    Ok(Expression::Function(Box::new(f)))
361                }
362            }
363            "TO_DATE" => Ok(Expression::Function(Box::new(f))),
364
365            // TO_TIMESTAMP is native to Spark
366            "TO_TIMESTAMP" => Ok(Expression::Function(Box::new(f))),
367
368            // DATE_FORMAT is native to Spark
369            "DATE_FORMAT" => Ok(Expression::Function(Box::new(f))),
370
371            // strftime -> DATE_FORMAT
372            "STRFTIME" => Ok(Expression::Function(Box::new(Function::new(
373                "DATE_FORMAT".to_string(),
374                f.args,
375            )))),
376
377            // TO_CHAR -> DATE_FORMAT
378            "TO_CHAR" => Ok(Expression::Function(Box::new(Function::new(
379                "DATE_FORMAT".to_string(),
380                f.args,
381            )))),
382
383            // DATE_TRUNC is native to Spark
384            "DATE_TRUNC" => Ok(Expression::Function(Box::new(f))),
385
386            // TRUNC is native to Spark
387            "TRUNC" => Ok(Expression::Function(Box::new(f))),
388
389            // EXTRACT is native to Spark
390            "EXTRACT" => Ok(Expression::Function(Box::new(f))),
391
392            // DATEPART -> EXTRACT
393            "DATEPART" => Ok(Expression::Function(Box::new(Function::new(
394                "EXTRACT".to_string(),
395                f.args,
396            )))),
397
398            // UNIX_TIMESTAMP is native to Spark
399            // When called with no args, add CURRENT_TIMESTAMP() as default
400            "UNIX_TIMESTAMP" => {
401                if f.args.is_empty() {
402                    Ok(Expression::Function(Box::new(Function::new(
403                        "UNIX_TIMESTAMP".to_string(),
404                        vec![Expression::CurrentTimestamp(CurrentTimestamp {
405                            precision: None,
406                            sysdate: false,
407                        })],
408                    ))))
409                } else {
410                    Ok(Expression::Function(Box::new(f)))
411                }
412            }
413
414            // FROM_UNIXTIME is native to Spark
415            "FROM_UNIXTIME" => Ok(Expression::Function(Box::new(f))),
416
417            // STR_TO_MAP is native to Spark
418            // When called with only one arg, add default delimiters ',' and ':'
419            "STR_TO_MAP" => {
420                if f.args.len() == 1 {
421                    let mut args = f.args;
422                    args.push(Expression::Literal(Box::new(
423                        crate::expressions::Literal::String(",".to_string()),
424                    )));
425                    args.push(Expression::Literal(Box::new(
426                        crate::expressions::Literal::String(":".to_string()),
427                    )));
428                    Ok(Expression::Function(Box::new(Function::new(
429                        "STR_TO_MAP".to_string(),
430                        args,
431                    ))))
432                } else {
433                    Ok(Expression::Function(Box::new(f)))
434                }
435            }
436
437            // POSITION is native to Spark (POSITION(substr IN str))
438            "POSITION" => Ok(Expression::Function(Box::new(f))),
439
440            // LOCATE is native to Spark
441            "LOCATE" => Ok(Expression::Function(Box::new(f))),
442
443            // STRPOS -> Use expression form or LOCATE
444            "STRPOS" if f.args.len() == 2 => {
445                let mut args = f.args;
446                let first = args.remove(0);
447                let second = args.remove(0);
448                // LOCATE(substr, str) in Spark
449                Ok(Expression::Function(Box::new(Function::new(
450                    "LOCATE".to_string(),
451                    vec![second, first],
452                ))))
453            }
454
455            // CHARINDEX -> LOCATE
456            "CHARINDEX" if f.args.len() >= 2 => {
457                let mut args = f.args;
458                let substring = args.remove(0);
459                let string = args.remove(0);
460                let mut locate_args = vec![substring, string];
461                if !args.is_empty() {
462                    locate_args.push(args.remove(0));
463                }
464                Ok(Expression::Function(Box::new(Function::new(
465                    "LOCATE".to_string(),
466                    locate_args,
467                ))))
468            }
469
470            // INSTR is native to Spark
471            "INSTR" => Ok(Expression::Function(Box::new(f))),
472
473            // CEILING -> CEIL
474            "CEILING" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
475                this: f.args.into_iter().next().unwrap(),
476                decimals: None,
477                to: None,
478            }))),
479
480            // CEIL is native to Spark
481            "CEIL" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
482                this: f.args.into_iter().next().unwrap(),
483                decimals: None,
484                to: None,
485            }))),
486
487            // UNNEST -> EXPLODE
488            "UNNEST" => Ok(Expression::Function(Box::new(Function::new(
489                "EXPLODE".to_string(),
490                f.args,
491            )))),
492
493            // FLATTEN -> FLATTEN is native to Spark (for nested arrays)
494            "FLATTEN" => Ok(Expression::Function(Box::new(f))),
495
496            // ARRAY_AGG -> COLLECT_LIST
497            "ARRAY_AGG" => Ok(Expression::Function(Box::new(Function::new(
498                "COLLECT_LIST".to_string(),
499                f.args,
500            )))),
501
502            // COLLECT_LIST is native to Spark
503            "COLLECT_LIST" => Ok(Expression::Function(Box::new(f))),
504
505            // COLLECT_SET is native to Spark
506            "COLLECT_SET" => Ok(Expression::Function(Box::new(f))),
507
508            // ARRAY_LENGTH -> SIZE in Spark
509            "ARRAY_LENGTH" | "CARDINALITY" => Ok(Expression::Function(Box::new(Function::new(
510                "SIZE".to_string(),
511                f.args,
512            )))),
513
514            // SIZE is native to Spark
515            "SIZE" => Ok(Expression::Function(Box::new(f))),
516
517            // SPLIT is native to Spark
518            "SPLIT" => Ok(Expression::Function(Box::new(f))),
519
520            // REGEXP_REPLACE: Spark supports up to 4 args (subject, pattern, replacement, position)
521            // Strip extra Snowflake args (occurrence, params) if present
522            "REGEXP_REPLACE" if f.args.len() > 4 => {
523                let mut args = f.args;
524                args.truncate(4);
525                Ok(Expression::Function(Box::new(Function::new(
526                    "REGEXP_REPLACE".to_string(),
527                    args,
528                ))))
529            }
530            "REGEXP_REPLACE" => Ok(Expression::Function(Box::new(f))),
531
532            // REGEXP_EXTRACT is native to Spark
533            "REGEXP_EXTRACT" => Ok(Expression::Function(Box::new(f))),
534
535            // REGEXP_EXTRACT_ALL is native to Spark
536            "REGEXP_EXTRACT_ALL" => Ok(Expression::Function(Box::new(f))),
537
538            // RLIKE is native to Spark
539            "RLIKE" | "REGEXP_LIKE" => Ok(Expression::Function(Box::new(Function::new(
540                "RLIKE".to_string(),
541                f.args,
542            )))),
543
544            // JSON_EXTRACT -> GET_JSON_OBJECT (Hive style) or :: operator
545            "JSON_EXTRACT" => Ok(Expression::Function(Box::new(Function::new(
546                "GET_JSON_OBJECT".to_string(),
547                f.args,
548            )))),
549
550            // JSON_EXTRACT_SCALAR -> GET_JSON_OBJECT
551            "JSON_EXTRACT_SCALAR" => Ok(Expression::Function(Box::new(Function::new(
552                "GET_JSON_OBJECT".to_string(),
553                f.args,
554            )))),
555
556            // GET_JSON_OBJECT is native to Spark
557            "GET_JSON_OBJECT" => Ok(Expression::Function(Box::new(f))),
558
559            // FROM_JSON is native to Spark
560            "FROM_JSON" => Ok(Expression::Function(Box::new(f))),
561
562            // TO_JSON is native to Spark
563            "TO_JSON" => Ok(Expression::Function(Box::new(f))),
564
565            // PARSE_JSON -> strip for Spark (just keep the string argument)
566            "PARSE_JSON" if f.args.len() == 1 => Ok(f.args.into_iter().next().unwrap()),
567            "PARSE_JSON" => Ok(Expression::Function(Box::new(Function::new(
568                "FROM_JSON".to_string(),
569                f.args,
570            )))),
571
572            // DATEDIFF is native to Spark (supports unit in Spark 3+)
573            "DATEDIFF" | "DATE_DIFF" => Ok(Expression::Function(Box::new(Function::new(
574                "DATEDIFF".to_string(),
575                f.args,
576            )))),
577
578            // DATE_ADD is native to Spark
579            "DATE_ADD" | "DATEADD" => Ok(Expression::Function(Box::new(Function::new(
580                "DATE_ADD".to_string(),
581                f.args,
582            )))),
583
584            // DATE_SUB is native to Spark
585            "DATE_SUB" => Ok(Expression::Function(Box::new(f))),
586
587            // TIMESTAMPADD is native to Spark 3+
588            "TIMESTAMPADD" => Ok(Expression::Function(Box::new(f))),
589
590            // TIMESTAMPDIFF is native to Spark 3+
591            "TIMESTAMPDIFF" => Ok(Expression::Function(Box::new(f))),
592
593            // ADD_MONTHS is native to Spark
594            "ADD_MONTHS" => Ok(Expression::Function(Box::new(f))),
595
596            // MONTHS_BETWEEN is native to Spark
597            "MONTHS_BETWEEN" => Ok(Expression::Function(Box::new(f))),
598
599            // NVL is native to Spark
600            "NVL" => Ok(Expression::Function(Box::new(f))),
601
602            // NVL2 is native to Spark
603            "NVL2" => Ok(Expression::Function(Box::new(f))),
604
605            // MAP is native to Spark
606            "MAP" => Ok(Expression::Function(Box::new(f))),
607
608            // ARRAY is native to Spark
609            "ARRAY" => Ok(Expression::Function(Box::new(f))),
610
611            // ROW -> STRUCT for Spark (cross-dialect, no auto-naming)
612            "ROW" => Ok(Expression::Function(Box::new(Function::new(
613                "STRUCT".to_string(),
614                f.args,
615            )))),
616
617            // STRUCT is native to Spark - auto-name unnamed args as col1, col2, etc.
618            "STRUCT" => {
619                let mut col_idx = 1usize;
620                let named_args: Vec<Expression> = f
621                    .args
622                    .into_iter()
623                    .map(|arg| {
624                        let current_idx = col_idx;
625                        col_idx += 1;
626                        // Check if arg already has an alias (AS name) or is Star
627                        match &arg {
628                            Expression::Alias(_) => arg, // already named
629                            Expression::Star(_) => arg,  // STRUCT(*) - keep as-is
630                            Expression::Column(c) if c.table.is_none() => {
631                                // Column reference: use column name as the struct field name
632                                let name = c.name.name.clone();
633                                Expression::Alias(Box::new(crate::expressions::Alias {
634                                    this: arg,
635                                    alias: crate::expressions::Identifier::new(&name),
636                                    column_aliases: Vec::new(),
637                                    alias_explicit_as: false,
638                                    alias_keyword: None,
639                                    pre_alias_comments: Vec::new(),
640                                    trailing_comments: Vec::new(),
641                                    inferred_type: None,
642                                }))
643                            }
644                            _ => {
645                                // Unnamed literal/expression: auto-name as colN
646                                let name = format!("col{}", current_idx);
647                                Expression::Alias(Box::new(crate::expressions::Alias {
648                                    this: arg,
649                                    alias: crate::expressions::Identifier::new(&name),
650                                    column_aliases: Vec::new(),
651                                    alias_explicit_as: false,
652                                    alias_keyword: None,
653                                    pre_alias_comments: Vec::new(),
654                                    trailing_comments: Vec::new(),
655                                    inferred_type: None,
656                                }))
657                            }
658                        }
659                    })
660                    .collect();
661                Ok(Expression::Function(Box::new(Function {
662                    name: "STRUCT".to_string(),
663                    args: named_args,
664                    distinct: false,
665                    trailing_comments: Vec::new(),
666                    use_bracket_syntax: false,
667                    no_parens: false,
668                    quoted: false,
669                    span: None,
670                    inferred_type: None,
671                })))
672            }
673
674            // NAMED_STRUCT('a', 1) -> STRUCT(1 AS a) for SQLGlot Spark outputs
675            "NAMED_STRUCT" if f.args.len() % 2 == 0 => {
676                let original_args = f.args.clone();
677                let mut struct_args = Vec::new();
678                for pair in f.args.chunks(2) {
679                    if let Expression::Literal(lit) = &pair[0] {
680                        if let Literal::String(field_name) = lit.as_ref() {
681                            struct_args.push(Expression::Alias(Box::new(
682                                crate::expressions::Alias {
683                                    this: pair[1].clone(),
684                                    alias: crate::expressions::Identifier::new(field_name),
685                                    column_aliases: Vec::new(),
686                                    alias_explicit_as: false,
687                                    alias_keyword: None,
688                                    pre_alias_comments: Vec::new(),
689                                    trailing_comments: Vec::new(),
690                                    inferred_type: None,
691                                },
692                            )));
693                            continue;
694                        }
695                    }
696                    return Ok(Expression::Function(Box::new(Function::new(
697                        "NAMED_STRUCT".to_string(),
698                        original_args,
699                    ))));
700                }
701                Ok(Expression::Function(Box::new(Function::new(
702                    "STRUCT".to_string(),
703                    struct_args,
704                ))))
705            }
706
707            // NAMED_STRUCT is native to Spark
708            "NAMED_STRUCT" => Ok(Expression::Function(Box::new(f))),
709
710            // MAP_FROM_ARRAYS is native to Spark
711            "MAP_FROM_ARRAYS" => Ok(Expression::Function(Box::new(f))),
712
713            // ARRAY_SORT is native to Spark
714            "ARRAY_SORT" => Ok(Expression::Function(Box::new(f))),
715
716            // ARRAY_DISTINCT is native to Spark
717            "ARRAY_DISTINCT" => Ok(Expression::Function(Box::new(f))),
718
719            // ARRAY_UNION is native to Spark
720            "ARRAY_UNION" => Ok(Expression::Function(Box::new(f))),
721
722            // ARRAY_INTERSECT is native to Spark
723            "ARRAY_INTERSECT" => Ok(Expression::Function(Box::new(f))),
724
725            // ARRAY_EXCEPT is native to Spark
726            "ARRAY_EXCEPT" => Ok(Expression::Function(Box::new(f))),
727
728            // ARRAY_CONTAINS is native to Spark
729            "ARRAY_CONTAINS" => Ok(Expression::Function(Box::new(f))),
730
731            // ELEMENT_AT is native to Spark
732            "ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
733
734            // TRY_ELEMENT_AT is native to Spark 3+
735            "TRY_ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
736
737            // TRANSFORM is native to Spark (array transformation)
738            "TRANSFORM" => Ok(Expression::Function(Box::new(f))),
739
740            // FILTER is native to Spark (array filtering)
741            "FILTER" => Ok(Expression::Function(Box::new(f))),
742
743            // AGGREGATE is native to Spark (array reduction)
744            "AGGREGATE" => Ok(Expression::Function(Box::new(f))),
745
746            // SEQUENCE is native to Spark (generate array)
747            "SEQUENCE" => Ok(Expression::Function(Box::new(f))),
748
749            // GENERATE_SERIES -> SEQUENCE
750            "GENERATE_SERIES" => Ok(Expression::Function(Box::new(Function::new(
751                "SEQUENCE".to_string(),
752                f.args,
753            )))),
754
755            // STARTSWITH is native to Spark 3+
756            "STARTSWITH" | "STARTS_WITH" => Ok(Expression::Function(Box::new(Function::new(
757                "STARTSWITH".to_string(),
758                f.args,
759            )))),
760
761            // ENDSWITH is native to Spark 3+
762            "ENDSWITH" | "ENDS_WITH" => Ok(Expression::Function(Box::new(Function::new(
763                "ENDSWITH".to_string(),
764                f.args,
765            )))),
766
767            // ARRAY_CONSTRUCT_COMPACT(1, null, 2) -> ARRAY_COMPACT(ARRAY(1, NULL, 2))
768            "ARRAY_CONSTRUCT_COMPACT" => {
769                let inner =
770                    Expression::Function(Box::new(Function::new("ARRAY".to_string(), f.args)));
771                Ok(Expression::Function(Box::new(Function::new(
772                    "ARRAY_COMPACT".to_string(),
773                    vec![inner],
774                ))))
775            }
776
777            // ARRAY_TO_STRING -> ARRAY_JOIN
778            "ARRAY_TO_STRING" => Ok(Expression::Function(Box::new(Function::new(
779                "ARRAY_JOIN".to_string(),
780                f.args,
781            )))),
782
783            // TO_ARRAY(x) -> IF(x IS NULL, NULL, ARRAY(x))
784            "TO_ARRAY" if f.args.len() == 1 => {
785                let x = f.args[0].clone();
786                // Check if arg is already an array constructor (bracket notation)
787                // In that case: TO_ARRAY(['test']) -> ARRAY('test')
788                match &x {
789                    Expression::ArrayFunc(arr) => {
790                        // Just convert to ARRAY(...) function
791                        Ok(Expression::Function(Box::new(Function::new(
792                            "ARRAY".to_string(),
793                            arr.expressions.clone(),
794                        ))))
795                    }
796                    _ => Ok(Expression::IfFunc(Box::new(crate::expressions::IfFunc {
797                        condition: Expression::IsNull(Box::new(crate::expressions::IsNull {
798                            this: x.clone(),
799                            not: false,
800                            postfix_form: false,
801                        })),
802                        true_value: Expression::Null(crate::expressions::Null),
803                        false_value: Some(Expression::Function(Box::new(Function::new(
804                            "ARRAY".to_string(),
805                            vec![x],
806                        )))),
807                        original_name: Some("IF".to_string()),
808                        inferred_type: None,
809                    }))),
810                }
811            }
812
813            // REGEXP_SUBSTR -> REGEXP_EXTRACT (strip extra args)
814            "REGEXP_SUBSTR" if f.args.len() >= 2 => {
815                let subject = f.args[0].clone();
816                let pattern = f.args[1].clone();
817                // For Spark: REGEXP_EXTRACT(subject, pattern, group)
818                // group defaults to 0 for full match, but sqlglot uses last arg if present
819                let group = if f.args.len() >= 6 {
820                    let g = &f.args[5];
821                    // If group is literal 1 (default), omit it
822                    if matches!(g, Expression::Literal(lit) if matches!(lit.as_ref(), Literal::Number(n) if n == "1"))
823                    {
824                        None
825                    } else {
826                        Some(g.clone())
827                    }
828                } else {
829                    None
830                };
831                let mut args = vec![subject, pattern];
832                if let Some(g) = group {
833                    args.push(g);
834                }
835                Ok(Expression::Function(Box::new(Function::new(
836                    "REGEXP_EXTRACT".to_string(),
837                    args,
838                ))))
839            }
840
841            // UUID_STRING() -> UUID(); keep namespace/name args for target-specific generation.
842            "UUID_STRING" => {
843                if f.args.is_empty() {
844                    Ok(Expression::Function(Box::new(Function::new(
845                        "UUID".to_string(),
846                        vec![],
847                    ))))
848                } else {
849                    Ok(Expression::Function(Box::new(Function::new(
850                        "UUID_STRING".to_string(),
851                        f.args,
852                    ))))
853                }
854            }
855
856            // OBJECT_CONSTRUCT -> STRUCT in Spark
857            "OBJECT_CONSTRUCT" if f.args.len() >= 2 && f.args.len() % 2 == 0 => {
858                // Convert key-value pairs to named struct fields
859                // OBJECT_CONSTRUCT('Manitoba', 'Winnipeg', 'foo', 'bar')
860                // -> STRUCT('Winnipeg' AS Manitoba, 'bar' AS foo)
861                let mut struct_args = Vec::new();
862                for pair in f.args.chunks(2) {
863                    if let Expression::Literal(lit) = &pair[0] {
864                        if let Literal::String(key) = lit.as_ref() {
865                            struct_args.push(Expression::Alias(Box::new(
866                                crate::expressions::Alias {
867                                    this: pair[1].clone(),
868                                    alias: crate::expressions::Identifier::new(key.clone()),
869                                    column_aliases: vec![],
870                                    alias_explicit_as: false,
871                                    alias_keyword: None,
872                                    pre_alias_comments: vec![],
873                                    trailing_comments: vec![],
874                                    inferred_type: None,
875                                },
876                            )));
877                        }
878                    } else {
879                        struct_args.push(pair[1].clone());
880                    }
881                }
882                Ok(Expression::Function(Box::new(Function::new(
883                    "STRUCT".to_string(),
884                    struct_args,
885                ))))
886            }
887
888            // DATE_PART(part, expr) -> EXTRACT(part FROM expr)
889            "DATE_PART" if f.args.len() == 2 => {
890                let mut args = f.args;
891                let part = args.remove(0);
892                let expr = args.remove(0);
893                if let Some(field) = expr_to_datetime_field(&part) {
894                    Ok(Expression::Extract(Box::new(ExtractFunc {
895                        this: expr,
896                        field,
897                    })))
898                } else {
899                    // Can't parse the field, keep as function
900                    Ok(Expression::Function(Box::new(Function::new(
901                        "DATE_PART".to_string(),
902                        vec![part, expr],
903                    ))))
904                }
905            }
906
907            // GET_PATH(obj, path) -> GET_JSON_OBJECT(obj, json_path) in Spark
908            "GET_PATH" if f.args.len() == 2 => {
909                let mut args = f.args;
910                let this = args.remove(0);
911                let path = args.remove(0);
912                let json_path = match &path {
913                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
914                        let Literal::String(s) = lit.as_ref() else {
915                            unreachable!()
916                        };
917                        let normalized = if s.starts_with('$') {
918                            s.clone()
919                        } else if s.starts_with('[') {
920                            format!("${}", s)
921                        } else {
922                            format!("$.{}", s)
923                        };
924                        Expression::Literal(Box::new(Literal::String(normalized)))
925                    }
926                    _ => path,
927                };
928                Ok(Expression::Function(Box::new(Function::new(
929                    "GET_JSON_OBJECT".to_string(),
930                    vec![this, json_path],
931                ))))
932            }
933
934            // BITWISE_LEFT_SHIFT → SHIFTLEFT
935            "BITWISE_LEFT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
936                "SHIFTLEFT".to_string(),
937                f.args,
938            )))),
939
940            // BITWISE_RIGHT_SHIFT → SHIFTRIGHT
941            "BITWISE_RIGHT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
942                "SHIFTRIGHT".to_string(),
943                f.args,
944            )))),
945
946            // APPROX_DISTINCT → APPROX_COUNT_DISTINCT
947            "APPROX_DISTINCT" => Ok(Expression::Function(Box::new(Function::new(
948                "APPROX_COUNT_DISTINCT".to_string(),
949                f.args,
950            )))),
951
952            // ARRAY_SLICE → SLICE
953            "ARRAY_SLICE" => Ok(Expression::Function(Box::new(Function::new(
954                "SLICE".to_string(),
955                f.args,
956            )))),
957
958            // DATE_FROM_PARTS → MAKE_DATE
959            "DATE_FROM_PARTS" => Ok(Expression::Function(Box::new(Function::new(
960                "MAKE_DATE".to_string(),
961                f.args,
962            )))),
963
964            // DAYOFWEEK_ISO → DAYOFWEEK
965            "DAYOFWEEK_ISO" => Ok(Expression::Function(Box::new(Function::new(
966                "DAYOFWEEK".to_string(),
967                f.args,
968            )))),
969
970            // FORMAT → FORMAT_STRING
971            "FORMAT" => Ok(Expression::Function(Box::new(Function::new(
972                "FORMAT_STRING".to_string(),
973                f.args,
974            )))),
975
976            // LOGICAL_AND → BOOL_AND
977            "LOGICAL_AND" => Ok(Expression::Function(Box::new(Function::new(
978                "BOOL_AND".to_string(),
979                f.args,
980            )))),
981
982            // VARIANCE_POP → VAR_POP
983            "VARIANCE_POP" => Ok(Expression::Function(Box::new(Function::new(
984                "VAR_POP".to_string(),
985                f.args,
986            )))),
987
988            // WEEK_OF_YEAR → WEEKOFYEAR
989            "WEEK_OF_YEAR" => Ok(Expression::Function(Box::new(Function::new(
990                "WEEKOFYEAR".to_string(),
991                f.args,
992            )))),
993
994            // BIT_GET -> GETBIT
995            "BIT_GET" => Ok(Expression::Function(Box::new(Function::new(
996                "GETBIT".to_string(),
997                f.args,
998            )))),
999
1000            // CURDATE -> CURRENT_DATE
1001            "CURDATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
1002
1003            // Pass through everything else
1004            _ => Ok(Expression::Function(Box::new(f))),
1005        }
1006    }
1007
1008    fn transform_aggregate_function(
1009        &self,
1010        f: Box<crate::expressions::AggregateFunction>,
1011    ) -> Result<Expression> {
1012        let name_upper = f.name.to_uppercase();
1013        match name_upper.as_str() {
1014            // GROUP_CONCAT -> COLLECT_LIST (then CONCAT_WS for string)
1015            "GROUP_CONCAT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
1016                Function::new("COLLECT_LIST".to_string(), f.args),
1017            ))),
1018
1019            // STRING_AGG -> COLLECT_LIST (or STRING_AGG in Spark 4+)
1020            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
1021                Function::new("COLLECT_LIST".to_string(), f.args),
1022            ))),
1023
1024            // LISTAGG -> COLLECT_LIST
1025            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
1026                "COLLECT_LIST".to_string(),
1027                f.args,
1028            )))),
1029
1030            // ARRAY_AGG -> COLLECT_LIST (preserve distinct and filter)
1031            "ARRAY_AGG" if !f.args.is_empty() => {
1032                let mut af = f;
1033                af.name = "COLLECT_LIST".to_string();
1034                Ok(Expression::AggregateFunction(af))
1035            }
1036
1037            // LOGICAL_OR -> BOOL_OR in Spark
1038            "LOGICAL_OR" if !f.args.is_empty() => {
1039                let mut af = f;
1040                af.name = "BOOL_OR".to_string();
1041                Ok(Expression::AggregateFunction(af))
1042            }
1043
1044            // Pass through everything else
1045            _ => Ok(Expression::AggregateFunction(f)),
1046        }
1047    }
1048}
1049
1050/// Convert an expression (string literal or identifier) to a DateTimeField
1051fn expr_to_datetime_field(expr: &Expression) -> Option<DateTimeField> {
1052    let name = match expr {
1053        Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
1054            let Literal::String(s) = lit.as_ref() else {
1055                unreachable!()
1056            };
1057            s.to_uppercase()
1058        }
1059        Expression::Identifier(id) => id.name.to_uppercase(),
1060        Expression::Var(v) => v.this.to_uppercase(),
1061        Expression::Column(col) if col.table.is_none() => col.name.name.to_uppercase(),
1062        _ => return None,
1063    };
1064    match name.as_str() {
1065        "YEAR" | "Y" | "YY" | "YYY" | "YYYY" | "YR" | "YEARS" | "YRS" => Some(DateTimeField::Year),
1066        "MONTH" | "MM" | "MON" | "MONS" | "MONTHS" => Some(DateTimeField::Month),
1067        "DAY" | "D" | "DD" | "DAYS" | "DAYOFMONTH" => Some(DateTimeField::Day),
1068        "HOUR" | "H" | "HH" | "HR" | "HOURS" | "HRS" => Some(DateTimeField::Hour),
1069        "MINUTE" | "MI" | "MIN" | "MINUTES" | "MINS" => Some(DateTimeField::Minute),
1070        "SECOND" | "S" | "SEC" | "SECONDS" | "SECS" => Some(DateTimeField::Second),
1071        "MILLISECOND" | "MS" | "MSEC" | "MILLISECONDS" => Some(DateTimeField::Millisecond),
1072        "MICROSECOND" | "US" | "USEC" | "MICROSECONDS" => Some(DateTimeField::Microsecond),
1073        "DOW" | "DAYOFWEEK" | "DAYOFWEEK_ISO" | "DW" => Some(DateTimeField::DayOfWeek),
1074        "DOY" | "DAYOFYEAR" => Some(DateTimeField::DayOfYear),
1075        "WEEK" | "W" | "WK" | "WEEKOFYEAR" | "WOY" => Some(DateTimeField::Week),
1076        "QUARTER" | "Q" | "QTR" | "QTRS" | "QUARTERS" => Some(DateTimeField::Quarter),
1077        "EPOCH" | "EPOCH_SECOND" | "EPOCH_SECONDS" => Some(DateTimeField::Epoch),
1078        "TIMEZONE" | "TIMEZONE_HOUR" | "TZH" => Some(DateTimeField::TimezoneHour),
1079        "TIMEZONE_MINUTE" | "TZM" => Some(DateTimeField::TimezoneMinute),
1080        _ => Some(DateTimeField::Custom(name)),
1081    }
1082}