// polyglot_sql/dialects/spark.rs
//! Spark SQL Dialect
//!
//! Spark SQL-specific transformations based on sqlglot patterns.
//! Key features (extends Hive with modern SQL):
//! - TRY_CAST is supported (Spark 3+)
//! - ILIKE is supported (Spark 3+)
//! - Uses backticks for identifiers
//! - ARRAY_AGG, COLLECT_LIST for array aggregation
//! - STRING_AGG / LISTAGG supported (Spark 4+)
//! - DATE_ADD with unit parameter (Spark 3+)
//! - TIMESTAMPADD, TIMESTAMPDIFF (Spark 3+)
//! - More PostgreSQL-like syntax than Hive

use super::{DialectImpl, DialectType};
use crate::error::Result;
use crate::expressions::{
    CeilFunc, CurrentTimestamp, DataType, DateTimeField, Expression, ExtractFunc, Function,
    Literal, StructField, UnaryFunc, VarArgFunc,
};
use crate::generator::GeneratorConfig;
use crate::tokens::TokenizerConfig;

/// Spark SQL dialect.
///
/// Zero-sized marker type; all dialect behavior lives in its
/// [`DialectImpl`] implementation below.
pub struct SparkDialect;
25
impl DialectImpl for SparkDialect {
    /// Identifies this dialect as [`DialectType::Spark`].
    fn dialect_type(&self) -> DialectType {
        DialectType::Spark
    }

    /// Builds the tokenizer configuration for Spark SQL.
    ///
    /// Starts from [`TokenizerConfig::default`] and layers on Spark-specific
    /// lexing rules; each override is annotated inline below.
    fn tokenizer_config(&self) -> TokenizerConfig {
        let mut config = TokenizerConfig::default();
        // Spark uses backticks for identifiers (NOT double quotes)
        config.identifiers.clear();
        config.identifiers.insert('`', '`');
        // Spark (like Hive) uses double quotes as string delimiters (QUOTES = ["'", '"'])
        config.quotes.insert("\"".to_string(), "\"".to_string());
        // Spark (like Hive) uses backslash escapes in strings (STRING_ESCAPES = ["\\"])
        config.string_escapes.push('\\');
        // Spark supports DIV keyword for integer division (inherited from Hive)
        config
            .keywords
            .insert("DIV".to_string(), crate::tokens::TokenType::Div);
        // Spark numeric literal suffixes (same as Hive): 1L -> BIGINT, 1S -> SMALLINT, etc.
        config
            .numeric_literals
            .insert("L".to_string(), "BIGINT".to_string());
        config
            .numeric_literals
            .insert("S".to_string(), "SMALLINT".to_string());
        config
            .numeric_literals
            .insert("Y".to_string(), "TINYINT".to_string());
        config
            .numeric_literals
            .insert("D".to_string(), "DOUBLE".to_string());
        config
            .numeric_literals
            .insert("F".to_string(), "FLOAT".to_string());
        config
            .numeric_literals
            .insert("BD".to_string(), "DECIMAL".to_string());
        // Spark allows identifiers to start with digits (e.g., 1a, 1_a)
        config.identifiers_can_start_with_digit = true;
        // Spark: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False
        // Backslashes in raw strings are always literal (no escape processing)
        config.string_escapes_allowed_in_raw_strings = false;
        config
    }

    /// Builds the SQL generator configuration for Spark output.
    ///
    /// Fields not listed fall back to [`GeneratorConfig`]'s `Default`;
    /// the overrides below encode Spark's syntactic quirks (backtick
    /// quoting, colon-separated STRUCT fields, REPEATABLE sampling seed).
    fn generator_config(&self) -> GeneratorConfig {
        use crate::generator::IdentifierQuoteStyle;
        GeneratorConfig {
            identifier_quote: '`',
            identifier_quote_style: IdentifierQuoteStyle::BACKTICK,
            dialect: Some(DialectType::Spark),
            // Spark uses colon separator in STRUCT field definitions: STRUCT<field_name: TYPE>
            struct_field_sep: ": ",
            // Spark doesn't use AS before RETURN in function definitions
            create_function_return_as: false,
            // Spark places alias after the TABLESAMPLE clause
            alias_post_tablesample: true,
            tablesample_seed_keyword: "REPEATABLE",
            join_hints: false,
            identifiers_can_start_with_digit: true,
            // Spark uses COMMENT 'value' without = sign
            schema_comment_with_eq: false,
            ..Default::default()
        }
    }

    /// Rewrites a single expression node into its Spark-compatible form.
    ///
    /// Nodes foreign to Spark (e.g. `IFNULL`, `UNNEST`, `||` concatenation,
    /// `$N` parameters) are mapped onto Spark equivalents; nodes Spark
    /// supports natively are passed through, and anything unmatched falls
    /// into the catch-all arm at the bottom unchanged.
    fn transform_expr(&self, expr: Expression) -> Result<Expression> {
        match expr {
            // IFNULL -> COALESCE in Spark
            Expression::IfNull(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
                original_name: None,
                expressions: vec![f.this, f.expression],
                inferred_type: None,
            }))),

            // NVL is supported in Spark (from Hive), but COALESCE is standard
            Expression::Nvl(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
                original_name: None,
                expressions: vec![f.this, f.expression],
                inferred_type: None,
            }))),

            // Cast: normalize VARCHAR(n) -> STRING, CHAR(n) -> STRING for Spark
            Expression::Cast(mut c) => {
                c.to = Self::normalize_spark_type(c.to);
                Ok(Expression::Cast(c))
            }

            // TryCast stays as TryCast in Spark (Spark supports TRY_CAST natively)
            Expression::TryCast(mut c) => {
                c.to = Self::normalize_spark_type(c.to);
                Ok(Expression::TryCast(c))
            }

            // SafeCast -> TRY_CAST
            Expression::SafeCast(mut c) => {
                c.to = Self::normalize_spark_type(c.to);
                Ok(Expression::TryCast(c))
            }

            // TRIM: non-standard comma syntax -> standard FROM syntax
            // TRIM('SL', 'SSparkSQLS') -> TRIM('SL' FROM 'SSparkSQLS')
            Expression::Trim(mut t) => {
                if !t.sql_standard_syntax && t.characters.is_some() {
                    // Convert comma syntax to standard SQL syntax
                    // Fields already have correct semantics: this=string, characters=chars
                    t.sql_standard_syntax = true;
                }
                Ok(Expression::Trim(t))
            }

            // ILIKE is supported in Spark 3+ (explicit pass-through)
            Expression::ILike(op) => Ok(Expression::ILike(op)),

            // UNNEST -> EXPLODE in Spark (Hive compatibility)
            Expression::Unnest(f) => Ok(Expression::Explode(Box::new(UnaryFunc::new(f.this)))),

            // EXPLODE is native to Spark
            Expression::Explode(f) => Ok(Expression::Explode(f)),

            // ExplodeOuter is supported in Spark
            Expression::ExplodeOuter(f) => Ok(Expression::ExplodeOuter(f)),

            // RANDOM -> RAND in Spark (any seed/bounds on the source node are dropped)
            Expression::Random(_) => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
                seed: None,
                lower: None,
                upper: None,
            }))),

            // Rand is native to Spark
            Expression::Rand(r) => Ok(Expression::Rand(r)),

            // || (Concat) -> CONCAT in Spark
            Expression::Concat(op) => Ok(Expression::Function(Box::new(Function::new(
                "CONCAT".to_string(),
                vec![op.left, op.right],
            )))),

            // ParseJson: handled by generator (emits just the string literal for Spark)

            // Generic function transformations
            Expression::Function(f) => self.transform_function(*f),

            // Generic aggregate function transformations
            Expression::AggregateFunction(f) => self.transform_aggregate_function(f),

            // $N parameters -> ${N} in Spark (DollarBrace style)
            Expression::Parameter(mut p)
                if p.style == crate::expressions::ParameterStyle::Dollar =>
            {
                p.style = crate::expressions::ParameterStyle::DollarBrace;
                // Convert index to name for DollarBrace format
                if let Some(idx) = p.index {
                    p.name = Some(idx.to_string());
                }
                Ok(Expression::Parameter(p))
            }

            // JSONExtract with variant_extract (Databricks colon syntax) -> GET_JSON_OBJECT
            Expression::JSONExtract(je) if je.variant_extract.is_some() => {
                // Convert path: 'item[1].price' -> '$.item[1].price'
                let path = match *je.expression {
                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
                        let Literal::String(s) = lit.as_ref() else { unreachable!() };
                        Expression::Literal(Box::new(Literal::String(format!("$.{}", s))))
                    }
                    other => other,
                };
                Ok(Expression::Function(Box::new(Function::new(
                    "GET_JSON_OBJECT".to_string(),
                    vec![*je.this, path],
                ))))
            }

            // Pass through everything else
            _ => Ok(expr),
        }
    }
}
206
207impl SparkDialect {
208    /// Normalize a data type for Spark:
209    /// - VARCHAR/CHAR without length -> STRING
210    /// - VARCHAR(n)/CHAR(n) with length -> keep as-is
211    /// - TEXT -> STRING
212    fn normalize_spark_type(dt: DataType) -> DataType {
213        match dt {
214            DataType::VarChar { length: None, .. }
215            | DataType::Char { length: None }
216            | DataType::Text => DataType::Custom {
217                name: "STRING".to_string(),
218            },
219            // VARCHAR(n) and CHAR(n) with length are kept as-is
220            DataType::VarChar { .. } | DataType::Char { .. } => dt,
221            // Also normalize struct fields recursively
222            DataType::Struct { fields, nested } => {
223                let normalized_fields: Vec<StructField> = fields
224                    .into_iter()
225                    .map(|mut f| {
226                        f.data_type = Self::normalize_spark_type(f.data_type);
227                        f
228                    })
229                    .collect();
230                DataType::Struct {
231                    fields: normalized_fields,
232                    nested,
233                }
234            }
235            _ => dt,
236        }
237    }
238
239    fn transform_function(&self, f: Function) -> Result<Expression> {
240        let name_upper = f.name.to_uppercase();
241        match name_upper.as_str() {
242            // IFNULL -> COALESCE
243            "IFNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
244                original_name: None,
245                expressions: f.args,
246                inferred_type: None,
247            }))),
248
249            // NVL -> COALESCE
250            "NVL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
251                original_name: None,
252                expressions: f.args,
253                inferred_type: None,
254            }))),
255
256            // ISNULL -> COALESCE
257            "ISNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
258                original_name: None,
259                expressions: f.args,
260                inferred_type: None,
261            }))),
262
263            // GROUP_CONCAT -> CONCAT_WS + COLLECT_LIST in older Spark
264            // In Spark 4+, STRING_AGG is available
265            "GROUP_CONCAT" if !f.args.is_empty() => {
266                // For simplicity, use COLLECT_LIST (array aggregation)
267                Ok(Expression::Function(Box::new(Function::new(
268                    "COLLECT_LIST".to_string(),
269                    f.args,
270                ))))
271            }
272
273            // STRING_AGG is supported in Spark 4+
274            // For older versions, fall back to CONCAT_WS + COLLECT_LIST
275            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
276                Function::new("COLLECT_LIST".to_string(), f.args),
277            ))),
278
279            // LISTAGG -> STRING_AGG in Spark 4+ (or COLLECT_LIST for older)
280            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
281                "COLLECT_LIST".to_string(),
282                f.args,
283            )))),
284
285            // SUBSTRING is native to Spark
286            "SUBSTRING" | "SUBSTR" => Ok(Expression::Function(Box::new(f))),
287
288            // LENGTH is native to Spark
289            "LENGTH" => Ok(Expression::Function(Box::new(f))),
290
291            // LEN -> LENGTH
292            "LEN" if f.args.len() == 1 => Ok(Expression::Length(Box::new(UnaryFunc::new(
293                f.args.into_iter().next().unwrap(),
294            )))),
295
296            // RANDOM -> RAND
297            "RANDOM" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
298                seed: None,
299                lower: None,
300                upper: None,
301            }))),
302
303            // RAND is native to Spark
304            "RAND" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
305                seed: None,
306                lower: None,
307                upper: None,
308            }))),
309
310            // NOW -> CURRENT_TIMESTAMP
311            "NOW" => Ok(Expression::CurrentTimestamp(
312                crate::expressions::CurrentTimestamp {
313                    precision: None,
314                    sysdate: false,
315                },
316            )),
317
318            // GETDATE -> CURRENT_TIMESTAMP
319            "GETDATE" => Ok(Expression::CurrentTimestamp(
320                crate::expressions::CurrentTimestamp {
321                    precision: None,
322                    sysdate: false,
323                },
324            )),
325
326            // CURRENT_TIMESTAMP is native
327            "CURRENT_TIMESTAMP" => Ok(Expression::CurrentTimestamp(
328                crate::expressions::CurrentTimestamp {
329                    precision: None,
330                    sysdate: false,
331                },
332            )),
333
334            // CURRENT_DATE is native
335            "CURRENT_DATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
336
337            // TO_DATE is native to Spark; strip default format 'yyyy-MM-dd'
338            "TO_DATE" if f.args.len() == 2 => {
339                let is_default_format = matches!(&f.args[1], Expression::Literal(lit) if matches!(lit.as_ref(), crate::expressions::Literal::String(s) if s == "yyyy-MM-dd"));
340                if is_default_format {
341                    Ok(Expression::Function(Box::new(Function::new(
342                        "TO_DATE".to_string(),
343                        vec![f.args.into_iter().next().unwrap()],
344                    ))))
345                } else {
346                    Ok(Expression::Function(Box::new(f)))
347                }
348            }
349            "TO_DATE" => Ok(Expression::Function(Box::new(f))),
350
351            // TO_TIMESTAMP is native to Spark
352            "TO_TIMESTAMP" => Ok(Expression::Function(Box::new(f))),
353
354            // DATE_FORMAT is native to Spark
355            "DATE_FORMAT" => Ok(Expression::Function(Box::new(f))),
356
357            // strftime -> DATE_FORMAT
358            "STRFTIME" => Ok(Expression::Function(Box::new(Function::new(
359                "DATE_FORMAT".to_string(),
360                f.args,
361            )))),
362
363            // TO_CHAR -> DATE_FORMAT
364            "TO_CHAR" => Ok(Expression::Function(Box::new(Function::new(
365                "DATE_FORMAT".to_string(),
366                f.args,
367            )))),
368
369            // DATE_TRUNC is native to Spark
370            "DATE_TRUNC" => Ok(Expression::Function(Box::new(f))),
371
372            // TRUNC is native to Spark
373            "TRUNC" => Ok(Expression::Function(Box::new(f))),
374
375            // EXTRACT is native to Spark
376            "EXTRACT" => Ok(Expression::Function(Box::new(f))),
377
378            // DATEPART -> EXTRACT
379            "DATEPART" => Ok(Expression::Function(Box::new(Function::new(
380                "EXTRACT".to_string(),
381                f.args,
382            )))),
383
384            // UNIX_TIMESTAMP is native to Spark
385            // When called with no args, add CURRENT_TIMESTAMP() as default
386            "UNIX_TIMESTAMP" => {
387                if f.args.is_empty() {
388                    Ok(Expression::Function(Box::new(Function::new(
389                        "UNIX_TIMESTAMP".to_string(),
390                        vec![Expression::CurrentTimestamp(CurrentTimestamp {
391                            precision: None,
392                            sysdate: false,
393                        })],
394                    ))))
395                } else {
396                    Ok(Expression::Function(Box::new(f)))
397                }
398            }
399
400            // FROM_UNIXTIME is native to Spark
401            "FROM_UNIXTIME" => Ok(Expression::Function(Box::new(f))),
402
403            // STR_TO_MAP is native to Spark
404            // When called with only one arg, add default delimiters ',' and ':'
405            "STR_TO_MAP" => {
406                if f.args.len() == 1 {
407                    let mut args = f.args;
408                    args.push(Expression::Literal(Box::new(crate::expressions::Literal::String(
409                        ",".to_string(),
410                    ))));
411                    args.push(Expression::Literal(Box::new(crate::expressions::Literal::String(
412                        ":".to_string(),
413                    ))));
414                    Ok(Expression::Function(Box::new(Function::new(
415                        "STR_TO_MAP".to_string(),
416                        args,
417                    ))))
418                } else {
419                    Ok(Expression::Function(Box::new(f)))
420                }
421            }
422
423            // POSITION is native to Spark (POSITION(substr IN str))
424            "POSITION" => Ok(Expression::Function(Box::new(f))),
425
426            // LOCATE is native to Spark
427            "LOCATE" => Ok(Expression::Function(Box::new(f))),
428
429            // STRPOS -> Use expression form or LOCATE
430            "STRPOS" if f.args.len() == 2 => {
431                let mut args = f.args;
432                let first = args.remove(0);
433                let second = args.remove(0);
434                // LOCATE(substr, str) in Spark
435                Ok(Expression::Function(Box::new(Function::new(
436                    "LOCATE".to_string(),
437                    vec![second, first],
438                ))))
439            }
440
441            // CHARINDEX -> LOCATE
442            "CHARINDEX" if f.args.len() >= 2 => {
443                let mut args = f.args;
444                let substring = args.remove(0);
445                let string = args.remove(0);
446                let mut locate_args = vec![substring, string];
447                if !args.is_empty() {
448                    locate_args.push(args.remove(0));
449                }
450                Ok(Expression::Function(Box::new(Function::new(
451                    "LOCATE".to_string(),
452                    locate_args,
453                ))))
454            }
455
456            // INSTR is native to Spark
457            "INSTR" => Ok(Expression::Function(Box::new(f))),
458
459            // CEILING -> CEIL
460            "CEILING" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
461                this: f.args.into_iter().next().unwrap(),
462                decimals: None,
463                to: None,
464            }))),
465
466            // CEIL is native to Spark
467            "CEIL" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
468                this: f.args.into_iter().next().unwrap(),
469                decimals: None,
470                to: None,
471            }))),
472
473            // UNNEST -> EXPLODE
474            "UNNEST" => Ok(Expression::Function(Box::new(Function::new(
475                "EXPLODE".to_string(),
476                f.args,
477            )))),
478
479            // FLATTEN -> FLATTEN is native to Spark (for nested arrays)
480            "FLATTEN" => Ok(Expression::Function(Box::new(f))),
481
482            // ARRAY_AGG -> COLLECT_LIST
483            "ARRAY_AGG" => Ok(Expression::Function(Box::new(Function::new(
484                "COLLECT_LIST".to_string(),
485                f.args,
486            )))),
487
488            // COLLECT_LIST is native to Spark
489            "COLLECT_LIST" => Ok(Expression::Function(Box::new(f))),
490
491            // COLLECT_SET is native to Spark
492            "COLLECT_SET" => Ok(Expression::Function(Box::new(f))),
493
494            // ARRAY_LENGTH -> SIZE in Spark
495            "ARRAY_LENGTH" | "CARDINALITY" => Ok(Expression::Function(Box::new(Function::new(
496                "SIZE".to_string(),
497                f.args,
498            )))),
499
500            // SIZE is native to Spark
501            "SIZE" => Ok(Expression::Function(Box::new(f))),
502
503            // SPLIT is native to Spark
504            "SPLIT" => Ok(Expression::Function(Box::new(f))),
505
506            // REGEXP_REPLACE: Spark supports up to 4 args (subject, pattern, replacement, position)
507            // Strip extra Snowflake args (occurrence, params) if present
508            "REGEXP_REPLACE" if f.args.len() > 4 => {
509                let mut args = f.args;
510                args.truncate(4);
511                Ok(Expression::Function(Box::new(Function::new(
512                    "REGEXP_REPLACE".to_string(),
513                    args,
514                ))))
515            }
516            "REGEXP_REPLACE" => Ok(Expression::Function(Box::new(f))),
517
518            // REGEXP_EXTRACT is native to Spark
519            "REGEXP_EXTRACT" => Ok(Expression::Function(Box::new(f))),
520
521            // REGEXP_EXTRACT_ALL is native to Spark
522            "REGEXP_EXTRACT_ALL" => Ok(Expression::Function(Box::new(f))),
523
524            // RLIKE is native to Spark
525            "RLIKE" | "REGEXP_LIKE" => Ok(Expression::Function(Box::new(Function::new(
526                "RLIKE".to_string(),
527                f.args,
528            )))),
529
530            // JSON_EXTRACT -> GET_JSON_OBJECT (Hive style) or :: operator
531            "JSON_EXTRACT" => Ok(Expression::Function(Box::new(Function::new(
532                "GET_JSON_OBJECT".to_string(),
533                f.args,
534            )))),
535
536            // JSON_EXTRACT_SCALAR -> GET_JSON_OBJECT
537            "JSON_EXTRACT_SCALAR" => Ok(Expression::Function(Box::new(Function::new(
538                "GET_JSON_OBJECT".to_string(),
539                f.args,
540            )))),
541
542            // GET_JSON_OBJECT is native to Spark
543            "GET_JSON_OBJECT" => Ok(Expression::Function(Box::new(f))),
544
545            // FROM_JSON is native to Spark
546            "FROM_JSON" => Ok(Expression::Function(Box::new(f))),
547
548            // TO_JSON is native to Spark
549            "TO_JSON" => Ok(Expression::Function(Box::new(f))),
550
551            // PARSE_JSON -> strip for Spark (just keep the string argument)
552            "PARSE_JSON" if f.args.len() == 1 => Ok(f.args.into_iter().next().unwrap()),
553            "PARSE_JSON" => Ok(Expression::Function(Box::new(Function::new(
554                "FROM_JSON".to_string(),
555                f.args,
556            )))),
557
558            // DATEDIFF is native to Spark (supports unit in Spark 3+)
559            "DATEDIFF" | "DATE_DIFF" => Ok(Expression::Function(Box::new(Function::new(
560                "DATEDIFF".to_string(),
561                f.args,
562            )))),
563
564            // DATE_ADD is native to Spark
565            "DATE_ADD" | "DATEADD" => Ok(Expression::Function(Box::new(Function::new(
566                "DATE_ADD".to_string(),
567                f.args,
568            )))),
569
570            // DATE_SUB is native to Spark
571            "DATE_SUB" => Ok(Expression::Function(Box::new(f))),
572
573            // TIMESTAMPADD is native to Spark 3+
574            "TIMESTAMPADD" => Ok(Expression::Function(Box::new(f))),
575
576            // TIMESTAMPDIFF is native to Spark 3+
577            "TIMESTAMPDIFF" => Ok(Expression::Function(Box::new(f))),
578
579            // ADD_MONTHS is native to Spark
580            "ADD_MONTHS" => Ok(Expression::Function(Box::new(f))),
581
582            // MONTHS_BETWEEN is native to Spark
583            "MONTHS_BETWEEN" => Ok(Expression::Function(Box::new(f))),
584
585            // NVL is native to Spark
586            "NVL" => Ok(Expression::Function(Box::new(f))),
587
588            // NVL2 is native to Spark
589            "NVL2" => Ok(Expression::Function(Box::new(f))),
590
591            // MAP is native to Spark
592            "MAP" => Ok(Expression::Function(Box::new(f))),
593
594            // ARRAY is native to Spark
595            "ARRAY" => Ok(Expression::Function(Box::new(f))),
596
597            // ROW -> STRUCT for Spark (cross-dialect, no auto-naming)
598            "ROW" => Ok(Expression::Function(Box::new(Function::new(
599                "STRUCT".to_string(),
600                f.args,
601            )))),
602
603            // STRUCT is native to Spark - auto-name unnamed args as col1, col2, etc.
604            "STRUCT" => {
605                let mut col_idx = 1usize;
606                let named_args: Vec<Expression> = f
607                    .args
608                    .into_iter()
609                    .map(|arg| {
610                        let current_idx = col_idx;
611                        col_idx += 1;
612                        // Check if arg already has an alias (AS name) or is Star
613                        match &arg {
614                            Expression::Alias(_) => arg, // already named
615                            Expression::Star(_) => arg,  // STRUCT(*) - keep as-is
616                            Expression::Column(c) if c.table.is_none() => {
617                                // Column reference: use column name as the struct field name
618                                let name = c.name.name.clone();
619                                Expression::Alias(Box::new(crate::expressions::Alias {
620                                    this: arg,
621                                    alias: crate::expressions::Identifier::new(&name),
622                                    column_aliases: Vec::new(),
623                                    pre_alias_comments: Vec::new(),
624                                    trailing_comments: Vec::new(),
625                                    inferred_type: None,
626                                }))
627                            }
628                            _ => {
629                                // Unnamed literal/expression: auto-name as colN
630                                let name = format!("col{}", current_idx);
631                                Expression::Alias(Box::new(crate::expressions::Alias {
632                                    this: arg,
633                                    alias: crate::expressions::Identifier::new(&name),
634                                    column_aliases: Vec::new(),
635                                    pre_alias_comments: Vec::new(),
636                                    trailing_comments: Vec::new(),
637                                    inferred_type: None,
638                                }))
639                            }
640                        }
641                    })
642                    .collect();
643                Ok(Expression::Function(Box::new(Function {
644                    name: "STRUCT".to_string(),
645                    args: named_args,
646                    distinct: false,
647                    trailing_comments: Vec::new(),
648                    use_bracket_syntax: false,
649                    no_parens: false,
650                    quoted: false,
651                    span: None,
652                    inferred_type: None,
653                })))
654            }
655
656            // NAMED_STRUCT is native to Spark
657            "NAMED_STRUCT" => Ok(Expression::Function(Box::new(f))),
658
659            // MAP_FROM_ARRAYS is native to Spark
660            "MAP_FROM_ARRAYS" => Ok(Expression::Function(Box::new(f))),
661
662            // ARRAY_SORT is native to Spark
663            "ARRAY_SORT" => Ok(Expression::Function(Box::new(f))),
664
665            // ARRAY_DISTINCT is native to Spark
666            "ARRAY_DISTINCT" => Ok(Expression::Function(Box::new(f))),
667
668            // ARRAY_UNION is native to Spark
669            "ARRAY_UNION" => Ok(Expression::Function(Box::new(f))),
670
671            // ARRAY_INTERSECT is native to Spark
672            "ARRAY_INTERSECT" => Ok(Expression::Function(Box::new(f))),
673
674            // ARRAY_EXCEPT is native to Spark
675            "ARRAY_EXCEPT" => Ok(Expression::Function(Box::new(f))),
676
677            // ARRAY_CONTAINS is native to Spark
678            "ARRAY_CONTAINS" => Ok(Expression::Function(Box::new(f))),
679
680            // ELEMENT_AT is native to Spark
681            "ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
682
683            // TRY_ELEMENT_AT is native to Spark 3+
684            "TRY_ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
685
686            // TRANSFORM is native to Spark (array transformation)
687            "TRANSFORM" => Ok(Expression::Function(Box::new(f))),
688
689            // FILTER is native to Spark (array filtering)
690            "FILTER" => Ok(Expression::Function(Box::new(f))),
691
692            // AGGREGATE is native to Spark (array reduction)
693            "AGGREGATE" => Ok(Expression::Function(Box::new(f))),
694
695            // SEQUENCE is native to Spark (generate array)
696            "SEQUENCE" => Ok(Expression::Function(Box::new(f))),
697
698            // GENERATE_SERIES -> SEQUENCE
699            "GENERATE_SERIES" => Ok(Expression::Function(Box::new(Function::new(
700                "SEQUENCE".to_string(),
701                f.args,
702            )))),
703
704            // STARTSWITH is native to Spark 3+
705            "STARTSWITH" | "STARTS_WITH" => Ok(Expression::Function(Box::new(Function::new(
706                "STARTSWITH".to_string(),
707                f.args,
708            )))),
709
710            // ENDSWITH is native to Spark 3+
711            "ENDSWITH" | "ENDS_WITH" => Ok(Expression::Function(Box::new(Function::new(
712                "ENDSWITH".to_string(),
713                f.args,
714            )))),
715
716            // ARRAY_CONSTRUCT_COMPACT(1, null, 2) -> ARRAY_COMPACT(ARRAY(1, NULL, 2))
717            "ARRAY_CONSTRUCT_COMPACT" => {
718                let inner =
719                    Expression::Function(Box::new(Function::new("ARRAY".to_string(), f.args)));
720                Ok(Expression::Function(Box::new(Function::new(
721                    "ARRAY_COMPACT".to_string(),
722                    vec![inner],
723                ))))
724            }
725
726            // ARRAY_TO_STRING -> ARRAY_JOIN
727            "ARRAY_TO_STRING" => Ok(Expression::Function(Box::new(Function::new(
728                "ARRAY_JOIN".to_string(),
729                f.args,
730            )))),
731
732            // TO_ARRAY(x) -> IF(x IS NULL, NULL, ARRAY(x))
733            "TO_ARRAY" if f.args.len() == 1 => {
734                let x = f.args[0].clone();
735                // Check if arg is already an array constructor (bracket notation)
736                // In that case: TO_ARRAY(['test']) -> ARRAY('test')
737                match &x {
738                    Expression::ArrayFunc(arr) => {
739                        // Just convert to ARRAY(...) function
740                        Ok(Expression::Function(Box::new(Function::new(
741                            "ARRAY".to_string(),
742                            arr.expressions.clone(),
743                        ))))
744                    }
745                    _ => Ok(Expression::IfFunc(Box::new(crate::expressions::IfFunc {
746                        condition: Expression::IsNull(Box::new(crate::expressions::IsNull {
747                            this: x.clone(),
748                            not: false,
749                            postfix_form: false,
750                        })),
751                        true_value: Expression::Null(crate::expressions::Null),
752                        false_value: Some(Expression::Function(Box::new(Function::new(
753                            "ARRAY".to_string(),
754                            vec![x],
755                        )))),
756                        original_name: Some("IF".to_string()),
757                        inferred_type: None,
758                    }))),
759                }
760            }
761
762            // REGEXP_SUBSTR -> REGEXP_EXTRACT (strip extra args)
763            "REGEXP_SUBSTR" if f.args.len() >= 2 => {
764                let subject = f.args[0].clone();
765                let pattern = f.args[1].clone();
766                // For Spark: REGEXP_EXTRACT(subject, pattern, group)
767                // group defaults to 0 for full match, but sqlglot uses last arg if present
768                let group = if f.args.len() >= 6 {
769                    let g = &f.args[5];
770                    // If group is literal 1 (default), omit it
771                    if matches!(g, Expression::Literal(lit) if matches!(lit.as_ref(), Literal::Number(n) if n == "1")) {
772                        None
773                    } else {
774                        Some(g.clone())
775                    }
776                } else {
777                    None
778                };
779                let mut args = vec![subject, pattern];
780                if let Some(g) = group {
781                    args.push(g);
782                }
783                Ok(Expression::Function(Box::new(Function::new(
784                    "REGEXP_EXTRACT".to_string(),
785                    args,
786                ))))
787            }
788
789            // UUID_STRING -> UUID()
790            "UUID_STRING" => Ok(Expression::Function(Box::new(Function::new(
791                "UUID".to_string(),
792                vec![],
793            )))),
794
795            // OBJECT_CONSTRUCT -> STRUCT in Spark
796            "OBJECT_CONSTRUCT" if f.args.len() >= 2 && f.args.len() % 2 == 0 => {
797                // Convert key-value pairs to named struct fields
798                // OBJECT_CONSTRUCT('Manitoba', 'Winnipeg', 'foo', 'bar')
799                // -> STRUCT('Winnipeg' AS Manitoba, 'bar' AS foo)
800                let mut struct_args = Vec::new();
801                for pair in f.args.chunks(2) {
802                    if let Expression::Literal(lit) = &pair[0] {
803                        if let Literal::String(key) = lit.as_ref() {
804                        struct_args.push(Expression::Alias(Box::new(crate::expressions::Alias {
805                            this: pair[1].clone(),
806                            alias: crate::expressions::Identifier::new(key.clone()),
807                            column_aliases: vec![],
808                            pre_alias_comments: vec![],
809                            trailing_comments: vec![],
810                            inferred_type: None,
811                        })));
812                    }
813                    } else {
814                        struct_args.push(pair[1].clone());
815                    }
816                }
817                Ok(Expression::Function(Box::new(Function::new(
818                    "STRUCT".to_string(),
819                    struct_args,
820                ))))
821            }
822
823            // DATE_PART(part, expr) -> EXTRACT(part FROM expr)
824            "DATE_PART" if f.args.len() == 2 => {
825                let mut args = f.args;
826                let part = args.remove(0);
827                let expr = args.remove(0);
828                if let Some(field) = expr_to_datetime_field(&part) {
829                    Ok(Expression::Extract(Box::new(ExtractFunc {
830                        this: expr,
831                        field,
832                    })))
833                } else {
834                    // Can't parse the field, keep as function
835                    Ok(Expression::Function(Box::new(Function::new(
836                        "DATE_PART".to_string(),
837                        vec![part, expr],
838                    ))))
839                }
840            }
841
842            // GET_PATH(obj, path) -> GET_JSON_OBJECT(obj, json_path) in Spark
843            "GET_PATH" if f.args.len() == 2 => {
844                let mut args = f.args;
845                let this = args.remove(0);
846                let path = args.remove(0);
847                let json_path = match &path {
848                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
849                        let Literal::String(s) = lit.as_ref() else { unreachable!() };
850                        let normalized = if s.starts_with('$') {
851                            s.clone()
852                        } else if s.starts_with('[') {
853                            format!("${}", s)
854                        } else {
855                            format!("$.{}", s)
856                        };
857                        Expression::Literal(Box::new(Literal::String(normalized)))
858                    }
859                    _ => path,
860                };
861                Ok(Expression::Function(Box::new(Function::new(
862                    "GET_JSON_OBJECT".to_string(),
863                    vec![this, json_path],
864                ))))
865            }
866
867            // BITWISE_LEFT_SHIFT → SHIFTLEFT
868            "BITWISE_LEFT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
869                "SHIFTLEFT".to_string(),
870                f.args,
871            )))),
872
873            // BITWISE_RIGHT_SHIFT → SHIFTRIGHT
874            "BITWISE_RIGHT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
875                "SHIFTRIGHT".to_string(),
876                f.args,
877            )))),
878
879            // APPROX_DISTINCT → APPROX_COUNT_DISTINCT
880            "APPROX_DISTINCT" => Ok(Expression::Function(Box::new(Function::new(
881                "APPROX_COUNT_DISTINCT".to_string(),
882                f.args,
883            )))),
884
885            // ARRAY_SLICE → SLICE
886            "ARRAY_SLICE" => Ok(Expression::Function(Box::new(Function::new(
887                "SLICE".to_string(),
888                f.args,
889            )))),
890
891            // DATE_FROM_PARTS → MAKE_DATE
892            "DATE_FROM_PARTS" => Ok(Expression::Function(Box::new(Function::new(
893                "MAKE_DATE".to_string(),
894                f.args,
895            )))),
896
897            // DAYOFWEEK_ISO → DAYOFWEEK
898            "DAYOFWEEK_ISO" => Ok(Expression::Function(Box::new(Function::new(
899                "DAYOFWEEK".to_string(),
900                f.args,
901            )))),
902
903            // FORMAT → FORMAT_STRING
904            "FORMAT" => Ok(Expression::Function(Box::new(Function::new(
905                "FORMAT_STRING".to_string(),
906                f.args,
907            )))),
908
909            // LOGICAL_AND → BOOL_AND
910            "LOGICAL_AND" => Ok(Expression::Function(Box::new(Function::new(
911                "BOOL_AND".to_string(),
912                f.args,
913            )))),
914
915            // VARIANCE_POP → VAR_POP
916            "VARIANCE_POP" => Ok(Expression::Function(Box::new(Function::new(
917                "VAR_POP".to_string(),
918                f.args,
919            )))),
920
921            // WEEK_OF_YEAR → WEEKOFYEAR
922            "WEEK_OF_YEAR" => Ok(Expression::Function(Box::new(Function::new(
923                "WEEKOFYEAR".to_string(),
924                f.args,
925            )))),
926
927            // BIT_GET -> GETBIT
928            "BIT_GET" => Ok(Expression::Function(Box::new(Function::new(
929                "GETBIT".to_string(),
930                f.args,
931            )))),
932
933            // CURDATE -> CURRENT_DATE
934            "CURDATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
935
936            // Pass through everything else
937            _ => Ok(Expression::Function(Box::new(f))),
938        }
939    }
940
941    fn transform_aggregate_function(
942        &self,
943        f: Box<crate::expressions::AggregateFunction>,
944    ) -> Result<Expression> {
945        let name_upper = f.name.to_uppercase();
946        match name_upper.as_str() {
947            // GROUP_CONCAT -> COLLECT_LIST (then CONCAT_WS for string)
948            "GROUP_CONCAT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
949                Function::new("COLLECT_LIST".to_string(), f.args),
950            ))),
951
952            // STRING_AGG -> COLLECT_LIST (or STRING_AGG in Spark 4+)
953            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
954                Function::new("COLLECT_LIST".to_string(), f.args),
955            ))),
956
957            // LISTAGG -> COLLECT_LIST
958            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
959                "COLLECT_LIST".to_string(),
960                f.args,
961            )))),
962
963            // ARRAY_AGG -> COLLECT_LIST (preserve distinct and filter)
964            "ARRAY_AGG" if !f.args.is_empty() => {
965                let mut af = f;
966                af.name = "COLLECT_LIST".to_string();
967                Ok(Expression::AggregateFunction(af))
968            }
969
970            // LOGICAL_OR -> BOOL_OR in Spark
971            "LOGICAL_OR" if !f.args.is_empty() => {
972                let mut af = f;
973                af.name = "BOOL_OR".to_string();
974                Ok(Expression::AggregateFunction(af))
975            }
976
977            // Pass through everything else
978            _ => Ok(Expression::AggregateFunction(f)),
979        }
980    }
981}
982
983/// Convert an expression (string literal or identifier) to a DateTimeField
984fn expr_to_datetime_field(expr: &Expression) -> Option<DateTimeField> {
985    let name = match expr {
986        Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => { let Literal::String(s) = lit.as_ref() else { unreachable!() }; s.to_uppercase() },
987        Expression::Identifier(id) => id.name.to_uppercase(),
988        Expression::Column(col) if col.table.is_none() => col.name.name.to_uppercase(),
989        _ => return None,
990    };
991    match name.as_str() {
992        "YEAR" | "Y" | "YY" | "YYY" | "YYYY" | "YR" | "YEARS" | "YRS" => Some(DateTimeField::Year),
993        "MONTH" | "MM" | "MON" | "MONS" | "MONTHS" => Some(DateTimeField::Month),
994        "DAY" | "D" | "DD" | "DAYS" | "DAYOFMONTH" => Some(DateTimeField::Day),
995        "HOUR" | "H" | "HH" | "HR" | "HOURS" | "HRS" => Some(DateTimeField::Hour),
996        "MINUTE" | "MI" | "MIN" | "MINUTES" | "MINS" => Some(DateTimeField::Minute),
997        "SECOND" | "S" | "SEC" | "SECONDS" | "SECS" => Some(DateTimeField::Second),
998        "MILLISECOND" | "MS" | "MSEC" | "MILLISECONDS" => Some(DateTimeField::Millisecond),
999        "MICROSECOND" | "US" | "USEC" | "MICROSECONDS" => Some(DateTimeField::Microsecond),
1000        "DOW" | "DAYOFWEEK" | "DAYOFWEEK_ISO" | "DW" => Some(DateTimeField::DayOfWeek),
1001        "DOY" | "DAYOFYEAR" => Some(DateTimeField::DayOfYear),
1002        "WEEK" | "W" | "WK" | "WEEKOFYEAR" | "WOY" => Some(DateTimeField::Week),
1003        "QUARTER" | "Q" | "QTR" | "QTRS" | "QUARTERS" => Some(DateTimeField::Quarter),
1004        "EPOCH" | "EPOCH_SECOND" | "EPOCH_SECONDS" => Some(DateTimeField::Epoch),
1005        "TIMEZONE" | "TIMEZONE_HOUR" | "TZH" => Some(DateTimeField::TimezoneHour),
1006        "TIMEZONE_MINUTE" | "TZM" => Some(DateTimeField::TimezoneMinute),
1007        _ => Some(DateTimeField::Custom(name)),
1008    }
1009}