// polyglot_sql/dialects/spark.rs

1//! Spark SQL Dialect
2//!
3//! Spark SQL-specific transformations based on sqlglot patterns.
4//! Key features (extends Hive with modern SQL):
5//! - TRY_CAST is supported (Spark 3+)
6//! - ILIKE is supported (Spark 3+)
7//! - Uses backticks for identifiers
8//! - ARRAY_AGG, COLLECT_LIST for array aggregation
9//! - STRING_AGG / LISTAGG supported (Spark 4+)
10//! - DATE_ADD with unit parameter (Spark 3+)
11//! - TIMESTAMPADD, TIMESTAMPDIFF (Spark 3+)
12//! - More PostgreSQL-like syntax than Hive
13
14use super::{DialectImpl, DialectType};
15use crate::error::Result;
16use crate::expressions::{
17    CeilFunc, CurrentTimestamp, DataType, DateTimeField, Expression, ExtractFunc, Function,
18    Literal, StructField, UnaryFunc, VarArgFunc,
19};
20use crate::generator::GeneratorConfig;
21use crate::tokens::TokenizerConfig;
22
/// Spark SQL dialect.
///
/// Stateless marker type: all Spark-specific behavior (tokenizer and
/// generator configuration plus expression rewrites) is supplied by its
/// `DialectImpl` implementation.
pub struct SparkDialect;
25
impl DialectImpl for SparkDialect {
    /// Identifies this implementation as the Spark dialect.
    fn dialect_type(&self) -> DialectType {
        DialectType::Spark
    }

    /// Tokenizer settings where Spark diverges from the default tokenizer.
    /// Spark inherits most of Hive's lexical rules (double-quoted strings,
    /// backslash escapes, numeric literal suffixes, digit-leading identifiers).
    fn tokenizer_config(&self) -> TokenizerConfig {
        let mut config = TokenizerConfig::default();
        // Spark uses backticks for identifiers (NOT double quotes)
        config.identifiers.clear();
        config.identifiers.insert('`', '`');
        // Spark (like Hive) uses double quotes as string delimiters (QUOTES = ["'", '"'])
        config.quotes.insert("\"".to_string(), "\"".to_string());
        // Spark (like Hive) uses backslash escapes in strings (STRING_ESCAPES = ["\\"])
        config.string_escapes.push('\\');
        // Spark supports DIV keyword for integer division (inherited from Hive)
        config
            .keywords
            .insert("DIV".to_string(), crate::tokens::TokenType::Div);
        // Spark numeric literal suffixes (same as Hive): 1L -> BIGINT, 1S -> SMALLINT, etc.
        // The map value is the SQL type name the literal is cast to.
        config
            .numeric_literals
            .insert("L".to_string(), "BIGINT".to_string());
        config
            .numeric_literals
            .insert("S".to_string(), "SMALLINT".to_string());
        config
            .numeric_literals
            .insert("Y".to_string(), "TINYINT".to_string());
        config
            .numeric_literals
            .insert("D".to_string(), "DOUBLE".to_string());
        config
            .numeric_literals
            .insert("F".to_string(), "FLOAT".to_string());
        config
            .numeric_literals
            .insert("BD".to_string(), "DECIMAL".to_string());
        // Spark allows identifiers to start with digits (e.g., 1a, 1_a)
        config.identifiers_can_start_with_digit = true;
        // Spark: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False
        // Backslashes in raw strings are always literal (no escape processing)
        config.string_escapes_allowed_in_raw_strings = false;
        config
    }

    /// SQL-generation settings where Spark diverges from the default
    /// generator; everything not listed falls back to `Default::default()`.
    fn generator_config(&self) -> GeneratorConfig {
        use crate::generator::IdentifierQuoteStyle;
        GeneratorConfig {
            identifier_quote: '`',
            identifier_quote_style: IdentifierQuoteStyle::BACKTICK,
            dialect: Some(DialectType::Spark),
            // Spark uses colon separator in STRUCT field definitions: STRUCT<field_name: TYPE>
            struct_field_sep: ": ",
            // Spark doesn't use AS before RETURN in function definitions
            create_function_return_as: false,
            // Spark places alias after the TABLESAMPLE clause
            alias_post_tablesample: true,
            tablesample_seed_keyword: "REPEATABLE",
            join_hints: false,
            identifiers_can_start_with_digit: true,
            // Spark uses COMMENT 'value' without = sign
            schema_comment_with_eq: false,
            ..Default::default()
        }
    }

    /// Rewrites a single expression node into its Spark-compatible form.
    ///
    /// Arm order matters: the guarded `Parameter` and `JSONExtract` arms and
    /// the trailing catch-all must stay last-resort relative to the specific
    /// variants above them. Arms that return their input unchanged (e.g.
    /// `ILike`, `Explode`) are redundant with the catch-all but kept as
    /// explicit documentation that the construct is Spark-native.
    fn transform_expr(&self, expr: Expression) -> Result<Expression> {
        match expr {
            // IFNULL -> COALESCE in Spark
            Expression::IfNull(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
                original_name: None,
                expressions: vec![f.this, f.expression],
                inferred_type: None,
            }))),

            // NVL is supported in Spark (from Hive), but COALESCE is standard
            Expression::Nvl(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
                original_name: None,
                expressions: vec![f.this, f.expression],
                inferred_type: None,
            }))),

            // Cast: normalize VARCHAR(n) -> STRING, CHAR(n) -> STRING for Spark
            // (see normalize_spark_type: only length-less types are rewritten)
            Expression::Cast(mut c) => {
                c.to = Self::normalize_spark_type(c.to);
                Ok(Expression::Cast(c))
            }

            // TryCast stays as TryCast in Spark (Spark supports TRY_CAST natively)
            Expression::TryCast(mut c) => {
                c.to = Self::normalize_spark_type(c.to);
                Ok(Expression::TryCast(c))
            }

            // SafeCast -> TRY_CAST
            Expression::SafeCast(mut c) => {
                c.to = Self::normalize_spark_type(c.to);
                Ok(Expression::TryCast(c))
            }

            // TRIM: non-standard comma syntax -> standard FROM syntax
            // TRIM('SL', 'SSparkSQLS') -> TRIM('SL' FROM 'SSparkSQLS')
            Expression::Trim(mut t) => {
                if !t.sql_standard_syntax && t.characters.is_some() {
                    // Convert comma syntax to standard SQL syntax
                    // Fields already have correct semantics: this=string, characters=chars
                    // so flipping the flag is enough; the generator does the rest.
                    t.sql_standard_syntax = true;
                }
                Ok(Expression::Trim(t))
            }

            // ILIKE is supported in Spark 3+
            Expression::ILike(op) => Ok(Expression::ILike(op)),

            // UNNEST -> EXPLODE in Spark (Hive compatibility)
            Expression::Unnest(f) => Ok(Expression::Explode(Box::new(UnaryFunc::new(f.this)))),

            // EXPLODE is native to Spark
            Expression::Explode(f) => Ok(Expression::Explode(f)),

            // ExplodeOuter is supported in Spark
            Expression::ExplodeOuter(f) => Ok(Expression::ExplodeOuter(f)),

            // RANDOM -> RAND in Spark (any source arguments are dropped)
            Expression::Random(_) => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
                seed: None,
                lower: None,
                upper: None,
            }))),

            // Rand is native to Spark
            Expression::Rand(r) => Ok(Expression::Rand(r)),

            // || (Concat) -> CONCAT in Spark
            Expression::Concat(op) => Ok(Expression::Function(Box::new(Function::new(
                "CONCAT".to_string(),
                vec![op.left, op.right],
            )))),

            // ParseJson: handled by generator (emits just the string literal for Spark)

            // Generic function transformations (name-based dispatch)
            Expression::Function(f) => self.transform_function(*f),

            // Generic aggregate function transformations
            // NOTE(review): transform_aggregate_function is defined elsewhere
            // in this impl — presumably the aggregate analogue of
            // transform_function; confirm there.
            Expression::AggregateFunction(f) => self.transform_aggregate_function(f),

            // $N parameters -> ${N} in Spark (DollarBrace style)
            Expression::Parameter(mut p)
                if p.style == crate::expressions::ParameterStyle::Dollar =>
            {
                p.style = crate::expressions::ParameterStyle::DollarBrace;
                // Convert index to name for DollarBrace format
                if let Some(idx) = p.index {
                    p.name = Some(idx.to_string());
                }
                Ok(Expression::Parameter(p))
            }

            // JSONExtract with variant_extract (Databricks colon syntax) -> GET_JSON_OBJECT
            Expression::JSONExtract(je) if je.variant_extract.is_some() => {
                // Convert path: 'item[1].price' -> '$.item[1].price'
                // (GET_JSON_OBJECT expects a JSONPath rooted at '$')
                let path = match *je.expression {
                    Expression::Literal(Literal::String(s)) => {
                        Expression::Literal(Literal::String(format!("$.{}", s)))
                    }
                    // Non-literal paths are passed through unprefixed.
                    other => other,
                };
                Ok(Expression::Function(Box::new(Function::new(
                    "GET_JSON_OBJECT".to_string(),
                    vec![*je.this, path],
                ))))
            }

            // Pass through everything else unchanged
            _ => Ok(expr),
        }
    }
}
205
206impl SparkDialect {
207    /// Normalize a data type for Spark:
208    /// - VARCHAR/CHAR without length -> STRING
209    /// - VARCHAR(n)/CHAR(n) with length -> keep as-is
210    /// - TEXT -> STRING
211    fn normalize_spark_type(dt: DataType) -> DataType {
212        match dt {
213            DataType::VarChar { length: None, .. }
214            | DataType::Char { length: None }
215            | DataType::Text => DataType::Custom {
216                name: "STRING".to_string(),
217            },
218            // VARCHAR(n) and CHAR(n) with length are kept as-is
219            DataType::VarChar { .. } | DataType::Char { .. } => dt,
220            // Also normalize struct fields recursively
221            DataType::Struct { fields, nested } => {
222                let normalized_fields: Vec<StructField> = fields
223                    .into_iter()
224                    .map(|mut f| {
225                        f.data_type = Self::normalize_spark_type(f.data_type);
226                        f
227                    })
228                    .collect();
229                DataType::Struct {
230                    fields: normalized_fields,
231                    nested,
232                }
233            }
234            _ => dt,
235        }
236    }
237
238    fn transform_function(&self, f: Function) -> Result<Expression> {
239        let name_upper = f.name.to_uppercase();
240        match name_upper.as_str() {
241            // IFNULL -> COALESCE
242            "IFNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
243                original_name: None,
244                expressions: f.args,
245                inferred_type: None,
246            }))),
247
248            // NVL -> COALESCE
249            "NVL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
250                original_name: None,
251                expressions: f.args,
252                inferred_type: None,
253            }))),
254
255            // ISNULL -> COALESCE
256            "ISNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
257                original_name: None,
258                expressions: f.args,
259                inferred_type: None,
260            }))),
261
262            // GROUP_CONCAT -> CONCAT_WS + COLLECT_LIST in older Spark
263            // In Spark 4+, STRING_AGG is available
264            "GROUP_CONCAT" if !f.args.is_empty() => {
265                // For simplicity, use COLLECT_LIST (array aggregation)
266                Ok(Expression::Function(Box::new(Function::new(
267                    "COLLECT_LIST".to_string(),
268                    f.args,
269                ))))
270            }
271
272            // STRING_AGG is supported in Spark 4+
273            // For older versions, fall back to CONCAT_WS + COLLECT_LIST
274            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
275                Function::new("COLLECT_LIST".to_string(), f.args),
276            ))),
277
278            // LISTAGG -> STRING_AGG in Spark 4+ (or COLLECT_LIST for older)
279            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
280                "COLLECT_LIST".to_string(),
281                f.args,
282            )))),
283
284            // SUBSTRING is native to Spark
285            "SUBSTRING" | "SUBSTR" => Ok(Expression::Function(Box::new(f))),
286
287            // LENGTH is native to Spark
288            "LENGTH" => Ok(Expression::Function(Box::new(f))),
289
290            // LEN -> LENGTH
291            "LEN" if f.args.len() == 1 => Ok(Expression::Length(Box::new(UnaryFunc::new(
292                f.args.into_iter().next().unwrap(),
293            )))),
294
295            // RANDOM -> RAND
296            "RANDOM" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
297                seed: None,
298                lower: None,
299                upper: None,
300            }))),
301
302            // RAND is native to Spark
303            "RAND" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
304                seed: None,
305                lower: None,
306                upper: None,
307            }))),
308
309            // NOW -> CURRENT_TIMESTAMP
310            "NOW" => Ok(Expression::CurrentTimestamp(
311                crate::expressions::CurrentTimestamp {
312                    precision: None,
313                    sysdate: false,
314                },
315            )),
316
317            // GETDATE -> CURRENT_TIMESTAMP
318            "GETDATE" => Ok(Expression::CurrentTimestamp(
319                crate::expressions::CurrentTimestamp {
320                    precision: None,
321                    sysdate: false,
322                },
323            )),
324
325            // CURRENT_TIMESTAMP is native
326            "CURRENT_TIMESTAMP" => Ok(Expression::CurrentTimestamp(
327                crate::expressions::CurrentTimestamp {
328                    precision: None,
329                    sysdate: false,
330                },
331            )),
332
333            // CURRENT_DATE is native
334            "CURRENT_DATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
335
336            // TO_DATE is native to Spark; strip default format 'yyyy-MM-dd'
337            "TO_DATE" if f.args.len() == 2 => {
338                let is_default_format = matches!(&f.args[1], Expression::Literal(crate::expressions::Literal::String(s)) if s == "yyyy-MM-dd");
339                if is_default_format {
340                    Ok(Expression::Function(Box::new(Function::new(
341                        "TO_DATE".to_string(),
342                        vec![f.args.into_iter().next().unwrap()],
343                    ))))
344                } else {
345                    Ok(Expression::Function(Box::new(f)))
346                }
347            }
348            "TO_DATE" => Ok(Expression::Function(Box::new(f))),
349
350            // TO_TIMESTAMP is native to Spark
351            "TO_TIMESTAMP" => Ok(Expression::Function(Box::new(f))),
352
353            // DATE_FORMAT is native to Spark
354            "DATE_FORMAT" => Ok(Expression::Function(Box::new(f))),
355
356            // strftime -> DATE_FORMAT
357            "STRFTIME" => Ok(Expression::Function(Box::new(Function::new(
358                "DATE_FORMAT".to_string(),
359                f.args,
360            )))),
361
362            // TO_CHAR -> DATE_FORMAT
363            "TO_CHAR" => Ok(Expression::Function(Box::new(Function::new(
364                "DATE_FORMAT".to_string(),
365                f.args,
366            )))),
367
368            // DATE_TRUNC is native to Spark
369            "DATE_TRUNC" => Ok(Expression::Function(Box::new(f))),
370
371            // TRUNC is native to Spark
372            "TRUNC" => Ok(Expression::Function(Box::new(f))),
373
374            // EXTRACT is native to Spark
375            "EXTRACT" => Ok(Expression::Function(Box::new(f))),
376
377            // DATEPART -> EXTRACT
378            "DATEPART" => Ok(Expression::Function(Box::new(Function::new(
379                "EXTRACT".to_string(),
380                f.args,
381            )))),
382
383            // UNIX_TIMESTAMP is native to Spark
384            // When called with no args, add CURRENT_TIMESTAMP() as default
385            "UNIX_TIMESTAMP" => {
386                if f.args.is_empty() {
387                    Ok(Expression::Function(Box::new(Function::new(
388                        "UNIX_TIMESTAMP".to_string(),
389                        vec![Expression::CurrentTimestamp(CurrentTimestamp {
390                            precision: None,
391                            sysdate: false,
392                        })],
393                    ))))
394                } else {
395                    Ok(Expression::Function(Box::new(f)))
396                }
397            }
398
399            // FROM_UNIXTIME is native to Spark
400            "FROM_UNIXTIME" => Ok(Expression::Function(Box::new(f))),
401
402            // STR_TO_MAP is native to Spark
403            // When called with only one arg, add default delimiters ',' and ':'
404            "STR_TO_MAP" => {
405                if f.args.len() == 1 {
406                    let mut args = f.args;
407                    args.push(Expression::Literal(crate::expressions::Literal::String(
408                        ",".to_string(),
409                    )));
410                    args.push(Expression::Literal(crate::expressions::Literal::String(
411                        ":".to_string(),
412                    )));
413                    Ok(Expression::Function(Box::new(Function::new(
414                        "STR_TO_MAP".to_string(),
415                        args,
416                    ))))
417                } else {
418                    Ok(Expression::Function(Box::new(f)))
419                }
420            }
421
422            // POSITION is native to Spark (POSITION(substr IN str))
423            "POSITION" => Ok(Expression::Function(Box::new(f))),
424
425            // LOCATE is native to Spark
426            "LOCATE" => Ok(Expression::Function(Box::new(f))),
427
428            // STRPOS -> Use expression form or LOCATE
429            "STRPOS" if f.args.len() == 2 => {
430                let mut args = f.args;
431                let first = args.remove(0);
432                let second = args.remove(0);
433                // LOCATE(substr, str) in Spark
434                Ok(Expression::Function(Box::new(Function::new(
435                    "LOCATE".to_string(),
436                    vec![second, first],
437                ))))
438            }
439
440            // CHARINDEX -> LOCATE
441            "CHARINDEX" if f.args.len() >= 2 => {
442                let mut args = f.args;
443                let substring = args.remove(0);
444                let string = args.remove(0);
445                let mut locate_args = vec![substring, string];
446                if !args.is_empty() {
447                    locate_args.push(args.remove(0));
448                }
449                Ok(Expression::Function(Box::new(Function::new(
450                    "LOCATE".to_string(),
451                    locate_args,
452                ))))
453            }
454
455            // INSTR is native to Spark
456            "INSTR" => Ok(Expression::Function(Box::new(f))),
457
458            // CEILING -> CEIL
459            "CEILING" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
460                this: f.args.into_iter().next().unwrap(),
461                decimals: None,
462                to: None,
463            }))),
464
465            // CEIL is native to Spark
466            "CEIL" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
467                this: f.args.into_iter().next().unwrap(),
468                decimals: None,
469                to: None,
470            }))),
471
472            // UNNEST -> EXPLODE
473            "UNNEST" => Ok(Expression::Function(Box::new(Function::new(
474                "EXPLODE".to_string(),
475                f.args,
476            )))),
477
478            // FLATTEN -> FLATTEN is native to Spark (for nested arrays)
479            "FLATTEN" => Ok(Expression::Function(Box::new(f))),
480
481            // ARRAY_AGG -> COLLECT_LIST
482            "ARRAY_AGG" => Ok(Expression::Function(Box::new(Function::new(
483                "COLLECT_LIST".to_string(),
484                f.args,
485            )))),
486
487            // COLLECT_LIST is native to Spark
488            "COLLECT_LIST" => Ok(Expression::Function(Box::new(f))),
489
490            // COLLECT_SET is native to Spark
491            "COLLECT_SET" => Ok(Expression::Function(Box::new(f))),
492
493            // ARRAY_LENGTH -> SIZE in Spark
494            "ARRAY_LENGTH" | "CARDINALITY" => Ok(Expression::Function(Box::new(Function::new(
495                "SIZE".to_string(),
496                f.args,
497            )))),
498
499            // SIZE is native to Spark
500            "SIZE" => Ok(Expression::Function(Box::new(f))),
501
502            // SPLIT is native to Spark
503            "SPLIT" => Ok(Expression::Function(Box::new(f))),
504
505            // REGEXP_REPLACE: Spark supports up to 4 args (subject, pattern, replacement, position)
506            // Strip extra Snowflake args (occurrence, params) if present
507            "REGEXP_REPLACE" if f.args.len() > 4 => {
508                let mut args = f.args;
509                args.truncate(4);
510                Ok(Expression::Function(Box::new(Function::new(
511                    "REGEXP_REPLACE".to_string(),
512                    args,
513                ))))
514            }
515            "REGEXP_REPLACE" => Ok(Expression::Function(Box::new(f))),
516
517            // REGEXP_EXTRACT is native to Spark
518            "REGEXP_EXTRACT" => Ok(Expression::Function(Box::new(f))),
519
520            // REGEXP_EXTRACT_ALL is native to Spark
521            "REGEXP_EXTRACT_ALL" => Ok(Expression::Function(Box::new(f))),
522
523            // RLIKE is native to Spark
524            "RLIKE" | "REGEXP_LIKE" => Ok(Expression::Function(Box::new(Function::new(
525                "RLIKE".to_string(),
526                f.args,
527            )))),
528
529            // JSON_EXTRACT -> GET_JSON_OBJECT (Hive style) or :: operator
530            "JSON_EXTRACT" => Ok(Expression::Function(Box::new(Function::new(
531                "GET_JSON_OBJECT".to_string(),
532                f.args,
533            )))),
534
535            // JSON_EXTRACT_SCALAR -> GET_JSON_OBJECT
536            "JSON_EXTRACT_SCALAR" => Ok(Expression::Function(Box::new(Function::new(
537                "GET_JSON_OBJECT".to_string(),
538                f.args,
539            )))),
540
541            // GET_JSON_OBJECT is native to Spark
542            "GET_JSON_OBJECT" => Ok(Expression::Function(Box::new(f))),
543
544            // FROM_JSON is native to Spark
545            "FROM_JSON" => Ok(Expression::Function(Box::new(f))),
546
547            // TO_JSON is native to Spark
548            "TO_JSON" => Ok(Expression::Function(Box::new(f))),
549
550            // PARSE_JSON -> strip for Spark (just keep the string argument)
551            "PARSE_JSON" if f.args.len() == 1 => Ok(f.args.into_iter().next().unwrap()),
552            "PARSE_JSON" => Ok(Expression::Function(Box::new(Function::new(
553                "FROM_JSON".to_string(),
554                f.args,
555            )))),
556
557            // DATEDIFF is native to Spark (supports unit in Spark 3+)
558            "DATEDIFF" | "DATE_DIFF" => Ok(Expression::Function(Box::new(Function::new(
559                "DATEDIFF".to_string(),
560                f.args,
561            )))),
562
563            // DATE_ADD is native to Spark
564            "DATE_ADD" | "DATEADD" => Ok(Expression::Function(Box::new(Function::new(
565                "DATE_ADD".to_string(),
566                f.args,
567            )))),
568
569            // DATE_SUB is native to Spark
570            "DATE_SUB" => Ok(Expression::Function(Box::new(f))),
571
572            // TIMESTAMPADD is native to Spark 3+
573            "TIMESTAMPADD" => Ok(Expression::Function(Box::new(f))),
574
575            // TIMESTAMPDIFF is native to Spark 3+
576            "TIMESTAMPDIFF" => Ok(Expression::Function(Box::new(f))),
577
578            // ADD_MONTHS is native to Spark
579            "ADD_MONTHS" => Ok(Expression::Function(Box::new(f))),
580
581            // MONTHS_BETWEEN is native to Spark
582            "MONTHS_BETWEEN" => Ok(Expression::Function(Box::new(f))),
583
584            // NVL is native to Spark
585            "NVL" => Ok(Expression::Function(Box::new(f))),
586
587            // NVL2 is native to Spark
588            "NVL2" => Ok(Expression::Function(Box::new(f))),
589
590            // MAP is native to Spark
591            "MAP" => Ok(Expression::Function(Box::new(f))),
592
593            // ARRAY is native to Spark
594            "ARRAY" => Ok(Expression::Function(Box::new(f))),
595
596            // ROW -> STRUCT for Spark (cross-dialect, no auto-naming)
597            "ROW" => Ok(Expression::Function(Box::new(Function::new(
598                "STRUCT".to_string(),
599                f.args,
600            )))),
601
602            // STRUCT is native to Spark - auto-name unnamed args as col1, col2, etc.
603            "STRUCT" => {
604                let mut col_idx = 1usize;
605                let named_args: Vec<Expression> = f
606                    .args
607                    .into_iter()
608                    .map(|arg| {
609                        let current_idx = col_idx;
610                        col_idx += 1;
611                        // Check if arg already has an alias (AS name) or is Star
612                        match &arg {
613                            Expression::Alias(_) => arg, // already named
614                            Expression::Star(_) => arg,  // STRUCT(*) - keep as-is
615                            Expression::Column(c) if c.table.is_none() => {
616                                // Column reference: use column name as the struct field name
617                                let name = c.name.name.clone();
618                                Expression::Alias(Box::new(crate::expressions::Alias {
619                                    this: arg,
620                                    alias: crate::expressions::Identifier::new(&name),
621                                    column_aliases: Vec::new(),
622                                    pre_alias_comments: Vec::new(),
623                                    trailing_comments: Vec::new(),
624                                    inferred_type: None,
625                                }))
626                            }
627                            _ => {
628                                // Unnamed literal/expression: auto-name as colN
629                                let name = format!("col{}", current_idx);
630                                Expression::Alias(Box::new(crate::expressions::Alias {
631                                    this: arg,
632                                    alias: crate::expressions::Identifier::new(&name),
633                                    column_aliases: Vec::new(),
634                                    pre_alias_comments: Vec::new(),
635                                    trailing_comments: Vec::new(),
636                                    inferred_type: None,
637                                }))
638                            }
639                        }
640                    })
641                    .collect();
642                Ok(Expression::Function(Box::new(Function {
643                    name: "STRUCT".to_string(),
644                    args: named_args,
645                    distinct: false,
646                    trailing_comments: Vec::new(),
647                    use_bracket_syntax: false,
648                    no_parens: false,
649                    quoted: false,
650                    span: None,
651                    inferred_type: None,
652                })))
653            }
654
655            // NAMED_STRUCT is native to Spark
656            "NAMED_STRUCT" => Ok(Expression::Function(Box::new(f))),
657
658            // MAP_FROM_ARRAYS is native to Spark
659            "MAP_FROM_ARRAYS" => Ok(Expression::Function(Box::new(f))),
660
661            // ARRAY_SORT is native to Spark
662            "ARRAY_SORT" => Ok(Expression::Function(Box::new(f))),
663
664            // ARRAY_DISTINCT is native to Spark
665            "ARRAY_DISTINCT" => Ok(Expression::Function(Box::new(f))),
666
667            // ARRAY_UNION is native to Spark
668            "ARRAY_UNION" => Ok(Expression::Function(Box::new(f))),
669
670            // ARRAY_INTERSECT is native to Spark
671            "ARRAY_INTERSECT" => Ok(Expression::Function(Box::new(f))),
672
673            // ARRAY_EXCEPT is native to Spark
674            "ARRAY_EXCEPT" => Ok(Expression::Function(Box::new(f))),
675
676            // ARRAY_CONTAINS is native to Spark
677            "ARRAY_CONTAINS" => Ok(Expression::Function(Box::new(f))),
678
679            // ELEMENT_AT is native to Spark
680            "ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
681
682            // TRY_ELEMENT_AT is native to Spark 3+
683            "TRY_ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
684
685            // TRANSFORM is native to Spark (array transformation)
686            "TRANSFORM" => Ok(Expression::Function(Box::new(f))),
687
688            // FILTER is native to Spark (array filtering)
689            "FILTER" => Ok(Expression::Function(Box::new(f))),
690
691            // AGGREGATE is native to Spark (array reduction)
692            "AGGREGATE" => Ok(Expression::Function(Box::new(f))),
693
694            // SEQUENCE is native to Spark (generate array)
695            "SEQUENCE" => Ok(Expression::Function(Box::new(f))),
696
697            // GENERATE_SERIES -> SEQUENCE
698            "GENERATE_SERIES" => Ok(Expression::Function(Box::new(Function::new(
699                "SEQUENCE".to_string(),
700                f.args,
701            )))),
702
703            // STARTSWITH is native to Spark 3+
704            "STARTSWITH" | "STARTS_WITH" => Ok(Expression::Function(Box::new(Function::new(
705                "STARTSWITH".to_string(),
706                f.args,
707            )))),
708
709            // ENDSWITH is native to Spark 3+
710            "ENDSWITH" | "ENDS_WITH" => Ok(Expression::Function(Box::new(Function::new(
711                "ENDSWITH".to_string(),
712                f.args,
713            )))),
714
715            // ARRAY_CONSTRUCT_COMPACT(1, null, 2) -> ARRAY_COMPACT(ARRAY(1, NULL, 2))
716            "ARRAY_CONSTRUCT_COMPACT" => {
717                let inner =
718                    Expression::Function(Box::new(Function::new("ARRAY".to_string(), f.args)));
719                Ok(Expression::Function(Box::new(Function::new(
720                    "ARRAY_COMPACT".to_string(),
721                    vec![inner],
722                ))))
723            }
724
725            // ARRAY_TO_STRING -> ARRAY_JOIN
726            "ARRAY_TO_STRING" => Ok(Expression::Function(Box::new(Function::new(
727                "ARRAY_JOIN".to_string(),
728                f.args,
729            )))),
730
731            // TO_ARRAY(x) -> IF(x IS NULL, NULL, ARRAY(x))
732            "TO_ARRAY" if f.args.len() == 1 => {
733                let x = f.args[0].clone();
734                // Check if arg is already an array constructor (bracket notation)
735                // In that case: TO_ARRAY(['test']) -> ARRAY('test')
736                match &x {
737                    Expression::ArrayFunc(arr) => {
738                        // Just convert to ARRAY(...) function
739                        Ok(Expression::Function(Box::new(Function::new(
740                            "ARRAY".to_string(),
741                            arr.expressions.clone(),
742                        ))))
743                    }
744                    _ => Ok(Expression::IfFunc(Box::new(crate::expressions::IfFunc {
745                        condition: Expression::IsNull(Box::new(crate::expressions::IsNull {
746                            this: x.clone(),
747                            not: false,
748                            postfix_form: false,
749                        })),
750                        true_value: Expression::Null(crate::expressions::Null),
751                        false_value: Some(Expression::Function(Box::new(Function::new(
752                            "ARRAY".to_string(),
753                            vec![x],
754                        )))),
755                        original_name: Some("IF".to_string()),
756                    }))),
757                }
758            }
759
760            // REGEXP_SUBSTR -> REGEXP_EXTRACT (strip extra args)
761            "REGEXP_SUBSTR" if f.args.len() >= 2 => {
762                let subject = f.args[0].clone();
763                let pattern = f.args[1].clone();
764                // For Spark: REGEXP_EXTRACT(subject, pattern, group)
765                // group defaults to 0 for full match, but sqlglot uses last arg if present
766                let group = if f.args.len() >= 6 {
767                    let g = &f.args[5];
768                    // If group is literal 1 (default), omit it
769                    if matches!(g, Expression::Literal(Literal::Number(n)) if n == "1") {
770                        None
771                    } else {
772                        Some(g.clone())
773                    }
774                } else {
775                    None
776                };
777                let mut args = vec![subject, pattern];
778                if let Some(g) = group {
779                    args.push(g);
780                }
781                Ok(Expression::Function(Box::new(Function::new(
782                    "REGEXP_EXTRACT".to_string(),
783                    args,
784                ))))
785            }
786
787            // UUID_STRING -> UUID()
788            "UUID_STRING" => Ok(Expression::Function(Box::new(Function::new(
789                "UUID".to_string(),
790                vec![],
791            )))),
792
793            // OBJECT_CONSTRUCT -> STRUCT in Spark
794            "OBJECT_CONSTRUCT" if f.args.len() >= 2 && f.args.len() % 2 == 0 => {
795                // Convert key-value pairs to named struct fields
796                // OBJECT_CONSTRUCT('Manitoba', 'Winnipeg', 'foo', 'bar')
797                // -> STRUCT('Winnipeg' AS Manitoba, 'bar' AS foo)
798                let mut struct_args = Vec::new();
799                for pair in f.args.chunks(2) {
800                    if let Expression::Literal(Literal::String(key)) = &pair[0] {
801                        struct_args.push(Expression::Alias(Box::new(crate::expressions::Alias {
802                            this: pair[1].clone(),
803                            alias: crate::expressions::Identifier::new(key.clone()),
804                            column_aliases: vec![],
805                            pre_alias_comments: vec![],
806                            trailing_comments: vec![],
807                            inferred_type: None,
808                        })));
809                    } else {
810                        struct_args.push(pair[1].clone());
811                    }
812                }
813                Ok(Expression::Function(Box::new(Function::new(
814                    "STRUCT".to_string(),
815                    struct_args,
816                ))))
817            }
818
819            // DATE_PART(part, expr) -> EXTRACT(part FROM expr)
820            "DATE_PART" if f.args.len() == 2 => {
821                let mut args = f.args;
822                let part = args.remove(0);
823                let expr = args.remove(0);
824                if let Some(field) = expr_to_datetime_field(&part) {
825                    Ok(Expression::Extract(Box::new(ExtractFunc {
826                        this: expr,
827                        field,
828                    })))
829                } else {
830                    // Can't parse the field, keep as function
831                    Ok(Expression::Function(Box::new(Function::new(
832                        "DATE_PART".to_string(),
833                        vec![part, expr],
834                    ))))
835                }
836            }
837
838            // GET_PATH(obj, path) -> GET_JSON_OBJECT(obj, json_path) in Spark
839            "GET_PATH" if f.args.len() == 2 => {
840                let mut args = f.args;
841                let this = args.remove(0);
842                let path = args.remove(0);
843                let json_path = match &path {
844                    Expression::Literal(Literal::String(s)) => {
845                        let normalized = if s.starts_with('$') {
846                            s.clone()
847                        } else if s.starts_with('[') {
848                            format!("${}", s)
849                        } else {
850                            format!("$.{}", s)
851                        };
852                        Expression::Literal(Literal::String(normalized))
853                    }
854                    _ => path,
855                };
856                Ok(Expression::Function(Box::new(Function::new(
857                    "GET_JSON_OBJECT".to_string(),
858                    vec![this, json_path],
859                ))))
860            }
861
862            // BITWISE_LEFT_SHIFT → SHIFTLEFT
863            "BITWISE_LEFT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
864                "SHIFTLEFT".to_string(),
865                f.args,
866            )))),
867
868            // BITWISE_RIGHT_SHIFT → SHIFTRIGHT
869            "BITWISE_RIGHT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
870                "SHIFTRIGHT".to_string(),
871                f.args,
872            )))),
873
874            // APPROX_DISTINCT → APPROX_COUNT_DISTINCT
875            "APPROX_DISTINCT" => Ok(Expression::Function(Box::new(Function::new(
876                "APPROX_COUNT_DISTINCT".to_string(),
877                f.args,
878            )))),
879
880            // ARRAY_SLICE → SLICE
881            "ARRAY_SLICE" => Ok(Expression::Function(Box::new(Function::new(
882                "SLICE".to_string(),
883                f.args,
884            )))),
885
886            // DATE_FROM_PARTS → MAKE_DATE
887            "DATE_FROM_PARTS" => Ok(Expression::Function(Box::new(Function::new(
888                "MAKE_DATE".to_string(),
889                f.args,
890            )))),
891
892            // DAYOFWEEK_ISO → DAYOFWEEK
893            "DAYOFWEEK_ISO" => Ok(Expression::Function(Box::new(Function::new(
894                "DAYOFWEEK".to_string(),
895                f.args,
896            )))),
897
898            // FORMAT → FORMAT_STRING
899            "FORMAT" => Ok(Expression::Function(Box::new(Function::new(
900                "FORMAT_STRING".to_string(),
901                f.args,
902            )))),
903
904            // LOGICAL_AND → BOOL_AND
905            "LOGICAL_AND" => Ok(Expression::Function(Box::new(Function::new(
906                "BOOL_AND".to_string(),
907                f.args,
908            )))),
909
910            // VARIANCE_POP → VAR_POP
911            "VARIANCE_POP" => Ok(Expression::Function(Box::new(Function::new(
912                "VAR_POP".to_string(),
913                f.args,
914            )))),
915
916            // WEEK_OF_YEAR → WEEKOFYEAR
917            "WEEK_OF_YEAR" => Ok(Expression::Function(Box::new(Function::new(
918                "WEEKOFYEAR".to_string(),
919                f.args,
920            )))),
921
922            // Pass through everything else
923            _ => Ok(Expression::Function(Box::new(f))),
924        }
925    }
926
927    fn transform_aggregate_function(
928        &self,
929        f: Box<crate::expressions::AggregateFunction>,
930    ) -> Result<Expression> {
931        let name_upper = f.name.to_uppercase();
932        match name_upper.as_str() {
933            // GROUP_CONCAT -> COLLECT_LIST (then CONCAT_WS for string)
934            "GROUP_CONCAT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
935                Function::new("COLLECT_LIST".to_string(), f.args),
936            ))),
937
938            // STRING_AGG -> COLLECT_LIST (or STRING_AGG in Spark 4+)
939            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
940                Function::new("COLLECT_LIST".to_string(), f.args),
941            ))),
942
943            // LISTAGG -> COLLECT_LIST
944            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
945                "COLLECT_LIST".to_string(),
946                f.args,
947            )))),
948
949            // ARRAY_AGG -> COLLECT_LIST (preserve distinct and filter)
950            "ARRAY_AGG" if !f.args.is_empty() => {
951                let mut af = f;
952                af.name = "COLLECT_LIST".to_string();
953                Ok(Expression::AggregateFunction(af))
954            }
955
956            // LOGICAL_OR -> BOOL_OR in Spark
957            "LOGICAL_OR" if !f.args.is_empty() => {
958                let mut af = f;
959                af.name = "BOOL_OR".to_string();
960                Ok(Expression::AggregateFunction(af))
961            }
962
963            // Pass through everything else
964            _ => Ok(Expression::AggregateFunction(f)),
965        }
966    }
967}
968
969/// Convert an expression (string literal or identifier) to a DateTimeField
970fn expr_to_datetime_field(expr: &Expression) -> Option<DateTimeField> {
971    let name = match expr {
972        Expression::Literal(Literal::String(s)) => s.to_uppercase(),
973        Expression::Identifier(id) => id.name.to_uppercase(),
974        Expression::Column(col) if col.table.is_none() => col.name.name.to_uppercase(),
975        _ => return None,
976    };
977    match name.as_str() {
978        "YEAR" | "Y" | "YY" | "YYY" | "YYYY" | "YR" | "YEARS" | "YRS" => Some(DateTimeField::Year),
979        "MONTH" | "MM" | "MON" | "MONS" | "MONTHS" => Some(DateTimeField::Month),
980        "DAY" | "D" | "DD" | "DAYS" | "DAYOFMONTH" => Some(DateTimeField::Day),
981        "HOUR" | "H" | "HH" | "HR" | "HOURS" | "HRS" => Some(DateTimeField::Hour),
982        "MINUTE" | "MI" | "MIN" | "MINUTES" | "MINS" => Some(DateTimeField::Minute),
983        "SECOND" | "S" | "SEC" | "SECONDS" | "SECS" => Some(DateTimeField::Second),
984        "MILLISECOND" | "MS" | "MSEC" | "MILLISECONDS" => Some(DateTimeField::Millisecond),
985        "MICROSECOND" | "US" | "USEC" | "MICROSECONDS" => Some(DateTimeField::Microsecond),
986        "DOW" | "DAYOFWEEK" | "DAYOFWEEK_ISO" | "DW" => Some(DateTimeField::DayOfWeek),
987        "DOY" | "DAYOFYEAR" => Some(DateTimeField::DayOfYear),
988        "WEEK" | "W" | "WK" | "WEEKOFYEAR" | "WOY" => Some(DateTimeField::Week),
989        "QUARTER" | "Q" | "QTR" | "QTRS" | "QUARTERS" => Some(DateTimeField::Quarter),
990        "EPOCH" | "EPOCH_SECOND" | "EPOCH_SECONDS" => Some(DateTimeField::Epoch),
991        "TIMEZONE" | "TIMEZONE_HOUR" | "TZH" => Some(DateTimeField::TimezoneHour),
992        "TIMEZONE_MINUTE" | "TZM" => Some(DateTimeField::TimezoneMinute),
993        _ => Some(DateTimeField::Custom(name)),
994    }
995}