// polyglot_sql/dialects/spark.rs
//! Spark SQL Dialect
//!
//! Spark SQL-specific transformations based on sqlglot patterns.
//! Key features (extends Hive with modern SQL):
//! - TRY_CAST is supported (Spark 3+)
//! - ILIKE is supported (Spark 3+)
//! - Uses backticks for identifiers
//! - ARRAY_AGG, COLLECT_LIST for array aggregation
//! - STRING_AGG / LISTAGG supported (Spark 4+)
//! - DATE_ADD with unit parameter (Spark 3+)
//! - TIMESTAMPADD, TIMESTAMPDIFF (Spark 3+)
//! - More PostgreSQL-like syntax than Hive

use super::{DialectImpl, DialectType};
use crate::error::Result;
use crate::expressions::{
    CeilFunc, CurrentTimestamp, DataType, DateTimeField, Expression, ExtractFunc, Function,
    Literal, StructField, UnaryFunc, VarArgFunc,
};
use crate::generator::GeneratorConfig;
use crate::tokens::TokenizerConfig;
/// Spark SQL dialect.
///
/// Unit struct implementing [`DialectImpl`] with Spark-specific tokenizer
/// settings, generator settings, and expression transformations.
pub struct SparkDialect;
25
26impl DialectImpl for SparkDialect {
27    fn dialect_type(&self) -> DialectType {
28        DialectType::Spark
29    }
30
31    fn tokenizer_config(&self) -> TokenizerConfig {
32        let mut config = TokenizerConfig::default();
33        // Spark uses backticks for identifiers (NOT double quotes)
34        config.identifiers.clear();
35        config.identifiers.insert('`', '`');
36        // Spark (like Hive) uses double quotes as string delimiters (QUOTES = ["'", '"'])
37        config.quotes.insert("\"".to_string(), "\"".to_string());
38        // Spark (like Hive) uses backslash escapes in strings (STRING_ESCAPES = ["\\"])
39        config.string_escapes.push('\\');
40        // Spark supports DIV keyword for integer division (inherited from Hive)
41        config
42            .keywords
43            .insert("DIV".to_string(), crate::tokens::TokenType::Div);
44        // Spark numeric literal suffixes (same as Hive): 1L -> BIGINT, 1S -> SMALLINT, etc.
45        config
46            .numeric_literals
47            .insert("L".to_string(), "BIGINT".to_string());
48        config
49            .numeric_literals
50            .insert("S".to_string(), "SMALLINT".to_string());
51        config
52            .numeric_literals
53            .insert("Y".to_string(), "TINYINT".to_string());
54        config
55            .numeric_literals
56            .insert("D".to_string(), "DOUBLE".to_string());
57        config
58            .numeric_literals
59            .insert("F".to_string(), "FLOAT".to_string());
60        config
61            .numeric_literals
62            .insert("BD".to_string(), "DECIMAL".to_string());
63        // Spark allows identifiers to start with digits (e.g., 1a, 1_a)
64        config.identifiers_can_start_with_digit = true;
65        // Spark: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False
66        // Backslashes in raw strings are always literal (no escape processing)
67        config.string_escapes_allowed_in_raw_strings = false;
68        config
69    }
70
71    fn generator_config(&self) -> GeneratorConfig {
72        use crate::generator::IdentifierQuoteStyle;
73        GeneratorConfig {
74            identifier_quote: '`',
75            identifier_quote_style: IdentifierQuoteStyle::BACKTICK,
76            dialect: Some(DialectType::Spark),
77            // Spark uses colon separator in STRUCT field definitions: STRUCT<field_name: TYPE>
78            struct_field_sep: ": ",
79            // Spark doesn't use AS before RETURN in function definitions
80            create_function_return_as: false,
81            // Spark places alias after the TABLESAMPLE clause
82            alias_post_tablesample: true,
83            tablesample_seed_keyword: "REPEATABLE",
84            join_hints: false,
85            identifiers_can_start_with_digit: true,
86            // Spark uses COMMENT 'value' without = sign
87            schema_comment_with_eq: false,
88            ..Default::default()
89        }
90    }
91
92    fn transform_expr(&self, expr: Expression) -> Result<Expression> {
93        match expr {
94            // IFNULL -> COALESCE in Spark
95            Expression::IfNull(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
96                original_name: None,
97                expressions: vec![f.this, f.expression],
98                inferred_type: None,
99            }))),
100
101            // NVL is supported in Spark (from Hive), but COALESCE is standard
102            Expression::Nvl(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
103                original_name: None,
104                expressions: vec![f.this, f.expression],
105                inferred_type: None,
106            }))),
107
108            // Cast: normalize VARCHAR(n) -> STRING, CHAR(n) -> STRING for Spark
109            Expression::Cast(mut c) => {
110                c.to = Self::normalize_spark_type(c.to);
111                Ok(Expression::Cast(c))
112            }
113
114            // TryCast stays as TryCast in Spark (Spark supports TRY_CAST natively)
115            Expression::TryCast(mut c) => {
116                c.to = Self::normalize_spark_type(c.to);
117                Ok(Expression::TryCast(c))
118            }
119
120            // SafeCast -> TRY_CAST
121            Expression::SafeCast(mut c) => {
122                c.to = Self::normalize_spark_type(c.to);
123                Ok(Expression::TryCast(c))
124            }
125
126            // TRIM: non-standard comma syntax -> standard FROM syntax
127            // TRIM('SL', 'SSparkSQLS') -> TRIM('SL' FROM 'SSparkSQLS')
128            Expression::Trim(mut t) => {
129                if !t.sql_standard_syntax && t.characters.is_some() {
130                    // Convert comma syntax to standard SQL syntax
131                    // Fields already have correct semantics: this=string, characters=chars
132                    t.sql_standard_syntax = true;
133                }
134                Ok(Expression::Trim(t))
135            }
136
137            // ILIKE is supported in Spark 3+
138            Expression::ILike(op) => Ok(Expression::ILike(op)),
139
140            // UNNEST -> EXPLODE in Spark (Hive compatibility)
141            Expression::Unnest(f) => Ok(Expression::Explode(Box::new(UnaryFunc::new(f.this)))),
142
143            // EXPLODE is native to Spark
144            Expression::Explode(f) => Ok(Expression::Explode(f)),
145
146            // ExplodeOuter is supported in Spark
147            Expression::ExplodeOuter(f) => Ok(Expression::ExplodeOuter(f)),
148
149            // RANDOM -> RAND in Spark
150            Expression::Random(_) => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
151                seed: None,
152                lower: None,
153                upper: None,
154            }))),
155
156            // Rand is native to Spark
157            Expression::Rand(r) => Ok(Expression::Rand(r)),
158
159            // || (Concat) -> CONCAT in Spark
160            Expression::Concat(op) => Ok(Expression::Function(Box::new(Function::new(
161                "CONCAT".to_string(),
162                vec![op.left, op.right],
163            )))),
164
165            // ParseJson: handled by generator (emits just the string literal for Spark)
166
167            // Generic function transformations
168            Expression::Function(f) => self.transform_function(*f),
169
170            // Generic aggregate function transformations
171            Expression::AggregateFunction(f) => self.transform_aggregate_function(f),
172
173            // $N parameters -> ${N} in Spark (DollarBrace style)
174            Expression::Parameter(mut p)
175                if p.style == crate::expressions::ParameterStyle::Dollar =>
176            {
177                p.style = crate::expressions::ParameterStyle::DollarBrace;
178                // Convert index to name for DollarBrace format
179                if let Some(idx) = p.index {
180                    p.name = Some(idx.to_string());
181                }
182                Ok(Expression::Parameter(p))
183            }
184
185            // JSONExtract with variant_extract (Databricks colon syntax) -> GET_JSON_OBJECT
186            Expression::JSONExtract(je) if je.variant_extract.is_some() => {
187                // Convert path: 'item[1].price' -> '$.item[1].price'
188                let path = match *je.expression {
189                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
190                        let Literal::String(s) = lit.as_ref() else {
191                            unreachable!()
192                        };
193                        Expression::Literal(Box::new(Literal::String(format!("$.{}", s))))
194                    }
195                    other => other,
196                };
197                Ok(Expression::Function(Box::new(Function::new(
198                    "GET_JSON_OBJECT".to_string(),
199                    vec![*je.this, path],
200                ))))
201            }
202
203            // Pass through everything else
204            _ => Ok(expr),
205        }
206    }
207}
208
209impl SparkDialect {
210    /// Normalize a data type for Spark:
211    /// - VARCHAR/CHAR without length -> STRING
212    /// - VARCHAR(n)/CHAR(n) with length -> keep as-is
213    /// - TEXT -> STRING
214    fn normalize_spark_type(dt: DataType) -> DataType {
215        match dt {
216            DataType::VarChar { length: None, .. }
217            | DataType::Char { length: None }
218            | DataType::Text => DataType::Custom {
219                name: "STRING".to_string(),
220            },
221            // VARCHAR(n) and CHAR(n) with length are kept as-is
222            DataType::VarChar { .. } | DataType::Char { .. } => dt,
223            // Also normalize struct fields recursively
224            DataType::Struct { fields, nested } => {
225                let normalized_fields: Vec<StructField> = fields
226                    .into_iter()
227                    .map(|mut f| {
228                        f.data_type = Self::normalize_spark_type(f.data_type);
229                        f
230                    })
231                    .collect();
232                DataType::Struct {
233                    fields: normalized_fields,
234                    nested,
235                }
236            }
237            _ => dt,
238        }
239    }
240
241    fn transform_function(&self, f: Function) -> Result<Expression> {
242        let name_upper = f.name.to_uppercase();
243        match name_upper.as_str() {
244            // IFNULL -> COALESCE
245            "IFNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
246                original_name: None,
247                expressions: f.args,
248                inferred_type: None,
249            }))),
250
251            // NVL -> COALESCE
252            "NVL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
253                original_name: None,
254                expressions: f.args,
255                inferred_type: None,
256            }))),
257
258            // ISNULL -> COALESCE
259            "ISNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
260                original_name: None,
261                expressions: f.args,
262                inferred_type: None,
263            }))),
264
265            // GROUP_CONCAT -> CONCAT_WS + COLLECT_LIST in older Spark
266            // In Spark 4+, STRING_AGG is available
267            "GROUP_CONCAT" if !f.args.is_empty() => {
268                // For simplicity, use COLLECT_LIST (array aggregation)
269                Ok(Expression::Function(Box::new(Function::new(
270                    "COLLECT_LIST".to_string(),
271                    f.args,
272                ))))
273            }
274
275            // STRING_AGG is supported in Spark 4+
276            // For older versions, fall back to CONCAT_WS + COLLECT_LIST
277            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
278                Function::new("COLLECT_LIST".to_string(), f.args),
279            ))),
280
281            // LISTAGG -> STRING_AGG in Spark 4+ (or COLLECT_LIST for older)
282            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
283                "COLLECT_LIST".to_string(),
284                f.args,
285            )))),
286
287            // SUBSTRING is native to Spark
288            "SUBSTRING" | "SUBSTR" => Ok(Expression::Function(Box::new(f))),
289
290            // LENGTH is native to Spark
291            "LENGTH" => Ok(Expression::Function(Box::new(f))),
292
293            // LEN -> LENGTH
294            "LEN" if f.args.len() == 1 => Ok(Expression::Length(Box::new(UnaryFunc::new(
295                f.args.into_iter().next().unwrap(),
296            )))),
297
298            // RANDOM -> RAND
299            "RANDOM" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
300                seed: None,
301                lower: None,
302                upper: None,
303            }))),
304
305            // RAND is native to Spark
306            "RAND" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
307                seed: None,
308                lower: None,
309                upper: None,
310            }))),
311
312            // NOW -> CURRENT_TIMESTAMP
313            "NOW" => Ok(Expression::CurrentTimestamp(
314                crate::expressions::CurrentTimestamp {
315                    precision: None,
316                    sysdate: false,
317                },
318            )),
319
320            // GETDATE -> CURRENT_TIMESTAMP
321            "GETDATE" => Ok(Expression::CurrentTimestamp(
322                crate::expressions::CurrentTimestamp {
323                    precision: None,
324                    sysdate: false,
325                },
326            )),
327
328            // CURRENT_TIMESTAMP is native
329            "CURRENT_TIMESTAMP" => Ok(Expression::CurrentTimestamp(
330                crate::expressions::CurrentTimestamp {
331                    precision: None,
332                    sysdate: false,
333                },
334            )),
335
336            // CURRENT_DATE is native
337            "CURRENT_DATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
338
339            // TO_DATE is native to Spark; strip default format 'yyyy-MM-dd'
340            "TO_DATE" if f.args.len() == 2 => {
341                let is_default_format = matches!(&f.args[1], Expression::Literal(lit) if matches!(lit.as_ref(), crate::expressions::Literal::String(s) if s == "yyyy-MM-dd"));
342                if is_default_format {
343                    Ok(Expression::Function(Box::new(Function::new(
344                        "TO_DATE".to_string(),
345                        vec![f.args.into_iter().next().unwrap()],
346                    ))))
347                } else {
348                    Ok(Expression::Function(Box::new(f)))
349                }
350            }
351            "TO_DATE" => Ok(Expression::Function(Box::new(f))),
352
353            // TO_TIMESTAMP is native to Spark
354            "TO_TIMESTAMP" => Ok(Expression::Function(Box::new(f))),
355
356            // DATE_FORMAT is native to Spark
357            "DATE_FORMAT" => Ok(Expression::Function(Box::new(f))),
358
359            // strftime -> DATE_FORMAT
360            "STRFTIME" => Ok(Expression::Function(Box::new(Function::new(
361                "DATE_FORMAT".to_string(),
362                f.args,
363            )))),
364
365            // TO_CHAR -> DATE_FORMAT
366            "TO_CHAR" => Ok(Expression::Function(Box::new(Function::new(
367                "DATE_FORMAT".to_string(),
368                f.args,
369            )))),
370
371            // DATE_TRUNC is native to Spark
372            "DATE_TRUNC" => Ok(Expression::Function(Box::new(f))),
373
374            // TRUNC is native to Spark
375            "TRUNC" => Ok(Expression::Function(Box::new(f))),
376
377            // EXTRACT is native to Spark
378            "EXTRACT" => Ok(Expression::Function(Box::new(f))),
379
380            // DATEPART -> EXTRACT
381            "DATEPART" => Ok(Expression::Function(Box::new(Function::new(
382                "EXTRACT".to_string(),
383                f.args,
384            )))),
385
386            // UNIX_TIMESTAMP is native to Spark
387            // When called with no args, add CURRENT_TIMESTAMP() as default
388            "UNIX_TIMESTAMP" => {
389                if f.args.is_empty() {
390                    Ok(Expression::Function(Box::new(Function::new(
391                        "UNIX_TIMESTAMP".to_string(),
392                        vec![Expression::CurrentTimestamp(CurrentTimestamp {
393                            precision: None,
394                            sysdate: false,
395                        })],
396                    ))))
397                } else {
398                    Ok(Expression::Function(Box::new(f)))
399                }
400            }
401
402            // FROM_UNIXTIME is native to Spark
403            "FROM_UNIXTIME" => Ok(Expression::Function(Box::new(f))),
404
405            // STR_TO_MAP is native to Spark
406            // When called with only one arg, add default delimiters ',' and ':'
407            "STR_TO_MAP" => {
408                if f.args.len() == 1 {
409                    let mut args = f.args;
410                    args.push(Expression::Literal(Box::new(
411                        crate::expressions::Literal::String(",".to_string()),
412                    )));
413                    args.push(Expression::Literal(Box::new(
414                        crate::expressions::Literal::String(":".to_string()),
415                    )));
416                    Ok(Expression::Function(Box::new(Function::new(
417                        "STR_TO_MAP".to_string(),
418                        args,
419                    ))))
420                } else {
421                    Ok(Expression::Function(Box::new(f)))
422                }
423            }
424
425            // POSITION is native to Spark (POSITION(substr IN str))
426            "POSITION" => Ok(Expression::Function(Box::new(f))),
427
428            // LOCATE is native to Spark
429            "LOCATE" => Ok(Expression::Function(Box::new(f))),
430
431            // STRPOS -> Use expression form or LOCATE
432            "STRPOS" if f.args.len() == 2 => {
433                let mut args = f.args;
434                let first = args.remove(0);
435                let second = args.remove(0);
436                // LOCATE(substr, str) in Spark
437                Ok(Expression::Function(Box::new(Function::new(
438                    "LOCATE".to_string(),
439                    vec![second, first],
440                ))))
441            }
442
443            // CHARINDEX -> LOCATE
444            "CHARINDEX" if f.args.len() >= 2 => {
445                let mut args = f.args;
446                let substring = args.remove(0);
447                let string = args.remove(0);
448                let mut locate_args = vec![substring, string];
449                if !args.is_empty() {
450                    locate_args.push(args.remove(0));
451                }
452                Ok(Expression::Function(Box::new(Function::new(
453                    "LOCATE".to_string(),
454                    locate_args,
455                ))))
456            }
457
458            // INSTR is native to Spark
459            "INSTR" => Ok(Expression::Function(Box::new(f))),
460
461            // CEILING -> CEIL
462            "CEILING" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
463                this: f.args.into_iter().next().unwrap(),
464                decimals: None,
465                to: None,
466            }))),
467
468            // CEIL is native to Spark
469            "CEIL" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
470                this: f.args.into_iter().next().unwrap(),
471                decimals: None,
472                to: None,
473            }))),
474
475            // UNNEST -> EXPLODE
476            "UNNEST" => Ok(Expression::Function(Box::new(Function::new(
477                "EXPLODE".to_string(),
478                f.args,
479            )))),
480
481            // FLATTEN -> FLATTEN is native to Spark (for nested arrays)
482            "FLATTEN" => Ok(Expression::Function(Box::new(f))),
483
484            // ARRAY_AGG -> COLLECT_LIST
485            "ARRAY_AGG" => Ok(Expression::Function(Box::new(Function::new(
486                "COLLECT_LIST".to_string(),
487                f.args,
488            )))),
489
490            // COLLECT_LIST is native to Spark
491            "COLLECT_LIST" => Ok(Expression::Function(Box::new(f))),
492
493            // COLLECT_SET is native to Spark
494            "COLLECT_SET" => Ok(Expression::Function(Box::new(f))),
495
496            // ARRAY_LENGTH -> SIZE in Spark
497            "ARRAY_LENGTH" | "CARDINALITY" => Ok(Expression::Function(Box::new(Function::new(
498                "SIZE".to_string(),
499                f.args,
500            )))),
501
502            // SIZE is native to Spark
503            "SIZE" => Ok(Expression::Function(Box::new(f))),
504
505            // SPLIT is native to Spark
506            "SPLIT" => Ok(Expression::Function(Box::new(f))),
507
508            // REGEXP_REPLACE: Spark supports up to 4 args (subject, pattern, replacement, position)
509            // Strip extra Snowflake args (occurrence, params) if present
510            "REGEXP_REPLACE" if f.args.len() > 4 => {
511                let mut args = f.args;
512                args.truncate(4);
513                Ok(Expression::Function(Box::new(Function::new(
514                    "REGEXP_REPLACE".to_string(),
515                    args,
516                ))))
517            }
518            "REGEXP_REPLACE" => Ok(Expression::Function(Box::new(f))),
519
520            // REGEXP_EXTRACT is native to Spark
521            "REGEXP_EXTRACT" => Ok(Expression::Function(Box::new(f))),
522
523            // REGEXP_EXTRACT_ALL is native to Spark
524            "REGEXP_EXTRACT_ALL" => Ok(Expression::Function(Box::new(f))),
525
526            // RLIKE is native to Spark
527            "RLIKE" | "REGEXP_LIKE" => Ok(Expression::Function(Box::new(Function::new(
528                "RLIKE".to_string(),
529                f.args,
530            )))),
531
532            // JSON_EXTRACT -> GET_JSON_OBJECT (Hive style) or :: operator
533            "JSON_EXTRACT" => Ok(Expression::Function(Box::new(Function::new(
534                "GET_JSON_OBJECT".to_string(),
535                f.args,
536            )))),
537
538            // JSON_EXTRACT_SCALAR -> GET_JSON_OBJECT
539            "JSON_EXTRACT_SCALAR" => Ok(Expression::Function(Box::new(Function::new(
540                "GET_JSON_OBJECT".to_string(),
541                f.args,
542            )))),
543
544            // GET_JSON_OBJECT is native to Spark
545            "GET_JSON_OBJECT" => Ok(Expression::Function(Box::new(f))),
546
547            // FROM_JSON is native to Spark
548            "FROM_JSON" => Ok(Expression::Function(Box::new(f))),
549
550            // TO_JSON is native to Spark
551            "TO_JSON" => Ok(Expression::Function(Box::new(f))),
552
553            // PARSE_JSON -> strip for Spark (just keep the string argument)
554            "PARSE_JSON" if f.args.len() == 1 => Ok(f.args.into_iter().next().unwrap()),
555            "PARSE_JSON" => Ok(Expression::Function(Box::new(Function::new(
556                "FROM_JSON".to_string(),
557                f.args,
558            )))),
559
560            // DATEDIFF is native to Spark (supports unit in Spark 3+)
561            "DATEDIFF" | "DATE_DIFF" => Ok(Expression::Function(Box::new(Function::new(
562                "DATEDIFF".to_string(),
563                f.args,
564            )))),
565
566            // DATE_ADD is native to Spark
567            "DATE_ADD" | "DATEADD" => Ok(Expression::Function(Box::new(Function::new(
568                "DATE_ADD".to_string(),
569                f.args,
570            )))),
571
572            // DATE_SUB is native to Spark
573            "DATE_SUB" => Ok(Expression::Function(Box::new(f))),
574
575            // TIMESTAMPADD is native to Spark 3+
576            "TIMESTAMPADD" => Ok(Expression::Function(Box::new(f))),
577
578            // TIMESTAMPDIFF is native to Spark 3+
579            "TIMESTAMPDIFF" => Ok(Expression::Function(Box::new(f))),
580
581            // ADD_MONTHS is native to Spark
582            "ADD_MONTHS" => Ok(Expression::Function(Box::new(f))),
583
584            // MONTHS_BETWEEN is native to Spark
585            "MONTHS_BETWEEN" => Ok(Expression::Function(Box::new(f))),
586
587            // NVL is native to Spark
588            "NVL" => Ok(Expression::Function(Box::new(f))),
589
590            // NVL2 is native to Spark
591            "NVL2" => Ok(Expression::Function(Box::new(f))),
592
593            // MAP is native to Spark
594            "MAP" => Ok(Expression::Function(Box::new(f))),
595
596            // ARRAY is native to Spark
597            "ARRAY" => Ok(Expression::Function(Box::new(f))),
598
599            // ROW -> STRUCT for Spark (cross-dialect, no auto-naming)
600            "ROW" => Ok(Expression::Function(Box::new(Function::new(
601                "STRUCT".to_string(),
602                f.args,
603            )))),
604
605            // STRUCT is native to Spark - auto-name unnamed args as col1, col2, etc.
606            "STRUCT" => {
607                let mut col_idx = 1usize;
608                let named_args: Vec<Expression> = f
609                    .args
610                    .into_iter()
611                    .map(|arg| {
612                        let current_idx = col_idx;
613                        col_idx += 1;
614                        // Check if arg already has an alias (AS name) or is Star
615                        match &arg {
616                            Expression::Alias(_) => arg, // already named
617                            Expression::Star(_) => arg,  // STRUCT(*) - keep as-is
618                            Expression::Column(c) if c.table.is_none() => {
619                                // Column reference: use column name as the struct field name
620                                let name = c.name.name.clone();
621                                Expression::Alias(Box::new(crate::expressions::Alias {
622                                    this: arg,
623                                    alias: crate::expressions::Identifier::new(&name),
624                                    column_aliases: Vec::new(),
625                                    pre_alias_comments: Vec::new(),
626                                    trailing_comments: Vec::new(),
627                                    inferred_type: None,
628                                }))
629                            }
630                            _ => {
631                                // Unnamed literal/expression: auto-name as colN
632                                let name = format!("col{}", current_idx);
633                                Expression::Alias(Box::new(crate::expressions::Alias {
634                                    this: arg,
635                                    alias: crate::expressions::Identifier::new(&name),
636                                    column_aliases: Vec::new(),
637                                    pre_alias_comments: Vec::new(),
638                                    trailing_comments: Vec::new(),
639                                    inferred_type: None,
640                                }))
641                            }
642                        }
643                    })
644                    .collect();
645                Ok(Expression::Function(Box::new(Function {
646                    name: "STRUCT".to_string(),
647                    args: named_args,
648                    distinct: false,
649                    trailing_comments: Vec::new(),
650                    use_bracket_syntax: false,
651                    no_parens: false,
652                    quoted: false,
653                    span: None,
654                    inferred_type: None,
655                })))
656            }
657
658            // NAMED_STRUCT is native to Spark
659            "NAMED_STRUCT" => Ok(Expression::Function(Box::new(f))),
660
661            // MAP_FROM_ARRAYS is native to Spark
662            "MAP_FROM_ARRAYS" => Ok(Expression::Function(Box::new(f))),
663
664            // ARRAY_SORT is native to Spark
665            "ARRAY_SORT" => Ok(Expression::Function(Box::new(f))),
666
667            // ARRAY_DISTINCT is native to Spark
668            "ARRAY_DISTINCT" => Ok(Expression::Function(Box::new(f))),
669
670            // ARRAY_UNION is native to Spark
671            "ARRAY_UNION" => Ok(Expression::Function(Box::new(f))),
672
673            // ARRAY_INTERSECT is native to Spark
674            "ARRAY_INTERSECT" => Ok(Expression::Function(Box::new(f))),
675
676            // ARRAY_EXCEPT is native to Spark
677            "ARRAY_EXCEPT" => Ok(Expression::Function(Box::new(f))),
678
679            // ARRAY_CONTAINS is native to Spark
680            "ARRAY_CONTAINS" => Ok(Expression::Function(Box::new(f))),
681
682            // ELEMENT_AT is native to Spark
683            "ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
684
685            // TRY_ELEMENT_AT is native to Spark 3+
686            "TRY_ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
687
688            // TRANSFORM is native to Spark (array transformation)
689            "TRANSFORM" => Ok(Expression::Function(Box::new(f))),
690
691            // FILTER is native to Spark (array filtering)
692            "FILTER" => Ok(Expression::Function(Box::new(f))),
693
694            // AGGREGATE is native to Spark (array reduction)
695            "AGGREGATE" => Ok(Expression::Function(Box::new(f))),
696
697            // SEQUENCE is native to Spark (generate array)
698            "SEQUENCE" => Ok(Expression::Function(Box::new(f))),
699
700            // GENERATE_SERIES -> SEQUENCE
701            "GENERATE_SERIES" => Ok(Expression::Function(Box::new(Function::new(
702                "SEQUENCE".to_string(),
703                f.args,
704            )))),
705
706            // STARTSWITH is native to Spark 3+
707            "STARTSWITH" | "STARTS_WITH" => Ok(Expression::Function(Box::new(Function::new(
708                "STARTSWITH".to_string(),
709                f.args,
710            )))),
711
712            // ENDSWITH is native to Spark 3+
713            "ENDSWITH" | "ENDS_WITH" => Ok(Expression::Function(Box::new(Function::new(
714                "ENDSWITH".to_string(),
715                f.args,
716            )))),
717
718            // ARRAY_CONSTRUCT_COMPACT(1, null, 2) -> ARRAY_COMPACT(ARRAY(1, NULL, 2))
719            "ARRAY_CONSTRUCT_COMPACT" => {
720                let inner =
721                    Expression::Function(Box::new(Function::new("ARRAY".to_string(), f.args)));
722                Ok(Expression::Function(Box::new(Function::new(
723                    "ARRAY_COMPACT".to_string(),
724                    vec![inner],
725                ))))
726            }
727
728            // ARRAY_TO_STRING -> ARRAY_JOIN
729            "ARRAY_TO_STRING" => Ok(Expression::Function(Box::new(Function::new(
730                "ARRAY_JOIN".to_string(),
731                f.args,
732            )))),
733
734            // TO_ARRAY(x) -> IF(x IS NULL, NULL, ARRAY(x))
735            "TO_ARRAY" if f.args.len() == 1 => {
736                let x = f.args[0].clone();
737                // Check if arg is already an array constructor (bracket notation)
738                // In that case: TO_ARRAY(['test']) -> ARRAY('test')
739                match &x {
740                    Expression::ArrayFunc(arr) => {
741                        // Just convert to ARRAY(...) function
742                        Ok(Expression::Function(Box::new(Function::new(
743                            "ARRAY".to_string(),
744                            arr.expressions.clone(),
745                        ))))
746                    }
747                    _ => Ok(Expression::IfFunc(Box::new(crate::expressions::IfFunc {
748                        condition: Expression::IsNull(Box::new(crate::expressions::IsNull {
749                            this: x.clone(),
750                            not: false,
751                            postfix_form: false,
752                        })),
753                        true_value: Expression::Null(crate::expressions::Null),
754                        false_value: Some(Expression::Function(Box::new(Function::new(
755                            "ARRAY".to_string(),
756                            vec![x],
757                        )))),
758                        original_name: Some("IF".to_string()),
759                        inferred_type: None,
760                    }))),
761                }
762            }
763
764            // REGEXP_SUBSTR -> REGEXP_EXTRACT (strip extra args)
765            "REGEXP_SUBSTR" if f.args.len() >= 2 => {
766                let subject = f.args[0].clone();
767                let pattern = f.args[1].clone();
768                // For Spark: REGEXP_EXTRACT(subject, pattern, group)
769                // group defaults to 0 for full match, but sqlglot uses last arg if present
770                let group = if f.args.len() >= 6 {
771                    let g = &f.args[5];
772                    // If group is literal 1 (default), omit it
773                    if matches!(g, Expression::Literal(lit) if matches!(lit.as_ref(), Literal::Number(n) if n == "1"))
774                    {
775                        None
776                    } else {
777                        Some(g.clone())
778                    }
779                } else {
780                    None
781                };
782                let mut args = vec![subject, pattern];
783                if let Some(g) = group {
784                    args.push(g);
785                }
786                Ok(Expression::Function(Box::new(Function::new(
787                    "REGEXP_EXTRACT".to_string(),
788                    args,
789                ))))
790            }
791
792            // UUID_STRING -> UUID()
793            "UUID_STRING" => Ok(Expression::Function(Box::new(Function::new(
794                "UUID".to_string(),
795                vec![],
796            )))),
797
798            // OBJECT_CONSTRUCT -> STRUCT in Spark
799            "OBJECT_CONSTRUCT" if f.args.len() >= 2 && f.args.len() % 2 == 0 => {
800                // Convert key-value pairs to named struct fields
801                // OBJECT_CONSTRUCT('Manitoba', 'Winnipeg', 'foo', 'bar')
802                // -> STRUCT('Winnipeg' AS Manitoba, 'bar' AS foo)
803                let mut struct_args = Vec::new();
804                for pair in f.args.chunks(2) {
805                    if let Expression::Literal(lit) = &pair[0] {
806                        if let Literal::String(key) = lit.as_ref() {
807                            struct_args.push(Expression::Alias(Box::new(
808                                crate::expressions::Alias {
809                                    this: pair[1].clone(),
810                                    alias: crate::expressions::Identifier::new(key.clone()),
811                                    column_aliases: vec![],
812                                    pre_alias_comments: vec![],
813                                    trailing_comments: vec![],
814                                    inferred_type: None,
815                                },
816                            )));
817                        }
818                    } else {
819                        struct_args.push(pair[1].clone());
820                    }
821                }
822                Ok(Expression::Function(Box::new(Function::new(
823                    "STRUCT".to_string(),
824                    struct_args,
825                ))))
826            }
827
828            // DATE_PART(part, expr) -> EXTRACT(part FROM expr)
829            "DATE_PART" if f.args.len() == 2 => {
830                let mut args = f.args;
831                let part = args.remove(0);
832                let expr = args.remove(0);
833                if let Some(field) = expr_to_datetime_field(&part) {
834                    Ok(Expression::Extract(Box::new(ExtractFunc {
835                        this: expr,
836                        field,
837                    })))
838                } else {
839                    // Can't parse the field, keep as function
840                    Ok(Expression::Function(Box::new(Function::new(
841                        "DATE_PART".to_string(),
842                        vec![part, expr],
843                    ))))
844                }
845            }
846
847            // GET_PATH(obj, path) -> GET_JSON_OBJECT(obj, json_path) in Spark
848            "GET_PATH" if f.args.len() == 2 => {
849                let mut args = f.args;
850                let this = args.remove(0);
851                let path = args.remove(0);
852                let json_path = match &path {
853                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
854                        let Literal::String(s) = lit.as_ref() else {
855                            unreachable!()
856                        };
857                        let normalized = if s.starts_with('$') {
858                            s.clone()
859                        } else if s.starts_with('[') {
860                            format!("${}", s)
861                        } else {
862                            format!("$.{}", s)
863                        };
864                        Expression::Literal(Box::new(Literal::String(normalized)))
865                    }
866                    _ => path,
867                };
868                Ok(Expression::Function(Box::new(Function::new(
869                    "GET_JSON_OBJECT".to_string(),
870                    vec![this, json_path],
871                ))))
872            }
873
874            // BITWISE_LEFT_SHIFT → SHIFTLEFT
875            "BITWISE_LEFT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
876                "SHIFTLEFT".to_string(),
877                f.args,
878            )))),
879
880            // BITWISE_RIGHT_SHIFT → SHIFTRIGHT
881            "BITWISE_RIGHT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
882                "SHIFTRIGHT".to_string(),
883                f.args,
884            )))),
885
886            // APPROX_DISTINCT → APPROX_COUNT_DISTINCT
887            "APPROX_DISTINCT" => Ok(Expression::Function(Box::new(Function::new(
888                "APPROX_COUNT_DISTINCT".to_string(),
889                f.args,
890            )))),
891
892            // ARRAY_SLICE → SLICE
893            "ARRAY_SLICE" => Ok(Expression::Function(Box::new(Function::new(
894                "SLICE".to_string(),
895                f.args,
896            )))),
897
898            // DATE_FROM_PARTS → MAKE_DATE
899            "DATE_FROM_PARTS" => Ok(Expression::Function(Box::new(Function::new(
900                "MAKE_DATE".to_string(),
901                f.args,
902            )))),
903
904            // DAYOFWEEK_ISO → DAYOFWEEK
905            "DAYOFWEEK_ISO" => Ok(Expression::Function(Box::new(Function::new(
906                "DAYOFWEEK".to_string(),
907                f.args,
908            )))),
909
910            // FORMAT → FORMAT_STRING
911            "FORMAT" => Ok(Expression::Function(Box::new(Function::new(
912                "FORMAT_STRING".to_string(),
913                f.args,
914            )))),
915
916            // LOGICAL_AND → BOOL_AND
917            "LOGICAL_AND" => Ok(Expression::Function(Box::new(Function::new(
918                "BOOL_AND".to_string(),
919                f.args,
920            )))),
921
922            // VARIANCE_POP → VAR_POP
923            "VARIANCE_POP" => Ok(Expression::Function(Box::new(Function::new(
924                "VAR_POP".to_string(),
925                f.args,
926            )))),
927
928            // WEEK_OF_YEAR → WEEKOFYEAR
929            "WEEK_OF_YEAR" => Ok(Expression::Function(Box::new(Function::new(
930                "WEEKOFYEAR".to_string(),
931                f.args,
932            )))),
933
934            // BIT_GET -> GETBIT
935            "BIT_GET" => Ok(Expression::Function(Box::new(Function::new(
936                "GETBIT".to_string(),
937                f.args,
938            )))),
939
940            // CURDATE -> CURRENT_DATE
941            "CURDATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
942
943            // Pass through everything else
944            _ => Ok(Expression::Function(Box::new(f))),
945        }
946    }
947
948    fn transform_aggregate_function(
949        &self,
950        f: Box<crate::expressions::AggregateFunction>,
951    ) -> Result<Expression> {
952        let name_upper = f.name.to_uppercase();
953        match name_upper.as_str() {
954            // GROUP_CONCAT -> COLLECT_LIST (then CONCAT_WS for string)
955            "GROUP_CONCAT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
956                Function::new("COLLECT_LIST".to_string(), f.args),
957            ))),
958
959            // STRING_AGG -> COLLECT_LIST (or STRING_AGG in Spark 4+)
960            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
961                Function::new("COLLECT_LIST".to_string(), f.args),
962            ))),
963
964            // LISTAGG -> COLLECT_LIST
965            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
966                "COLLECT_LIST".to_string(),
967                f.args,
968            )))),
969
970            // ARRAY_AGG -> COLLECT_LIST (preserve distinct and filter)
971            "ARRAY_AGG" if !f.args.is_empty() => {
972                let mut af = f;
973                af.name = "COLLECT_LIST".to_string();
974                Ok(Expression::AggregateFunction(af))
975            }
976
977            // LOGICAL_OR -> BOOL_OR in Spark
978            "LOGICAL_OR" if !f.args.is_empty() => {
979                let mut af = f;
980                af.name = "BOOL_OR".to_string();
981                Ok(Expression::AggregateFunction(af))
982            }
983
984            // Pass through everything else
985            _ => Ok(Expression::AggregateFunction(f)),
986        }
987    }
988}
989
990/// Convert an expression (string literal or identifier) to a DateTimeField
991fn expr_to_datetime_field(expr: &Expression) -> Option<DateTimeField> {
992    let name = match expr {
993        Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
994            let Literal::String(s) = lit.as_ref() else {
995                unreachable!()
996            };
997            s.to_uppercase()
998        }
999        Expression::Identifier(id) => id.name.to_uppercase(),
1000        Expression::Var(v) => v.this.to_uppercase(),
1001        Expression::Column(col) if col.table.is_none() => col.name.name.to_uppercase(),
1002        _ => return None,
1003    };
1004    match name.as_str() {
1005        "YEAR" | "Y" | "YY" | "YYY" | "YYYY" | "YR" | "YEARS" | "YRS" => Some(DateTimeField::Year),
1006        "MONTH" | "MM" | "MON" | "MONS" | "MONTHS" => Some(DateTimeField::Month),
1007        "DAY" | "D" | "DD" | "DAYS" | "DAYOFMONTH" => Some(DateTimeField::Day),
1008        "HOUR" | "H" | "HH" | "HR" | "HOURS" | "HRS" => Some(DateTimeField::Hour),
1009        "MINUTE" | "MI" | "MIN" | "MINUTES" | "MINS" => Some(DateTimeField::Minute),
1010        "SECOND" | "S" | "SEC" | "SECONDS" | "SECS" => Some(DateTimeField::Second),
1011        "MILLISECOND" | "MS" | "MSEC" | "MILLISECONDS" => Some(DateTimeField::Millisecond),
1012        "MICROSECOND" | "US" | "USEC" | "MICROSECONDS" => Some(DateTimeField::Microsecond),
1013        "DOW" | "DAYOFWEEK" | "DAYOFWEEK_ISO" | "DW" => Some(DateTimeField::DayOfWeek),
1014        "DOY" | "DAYOFYEAR" => Some(DateTimeField::DayOfYear),
1015        "WEEK" | "W" | "WK" | "WEEKOFYEAR" | "WOY" => Some(DateTimeField::Week),
1016        "QUARTER" | "Q" | "QTR" | "QTRS" | "QUARTERS" => Some(DateTimeField::Quarter),
1017        "EPOCH" | "EPOCH_SECOND" | "EPOCH_SECONDS" => Some(DateTimeField::Epoch),
1018        "TIMEZONE" | "TIMEZONE_HOUR" | "TZH" => Some(DateTimeField::TimezoneHour),
1019        "TIMEZONE_MINUTE" | "TZM" => Some(DateTimeField::TimezoneMinute),
1020        _ => Some(DateTimeField::Custom(name)),
1021    }
1022}