polyglot_sql/dialects/
spark.rs
//! Spark SQL Dialect
//!
//! Spark SQL-specific transformations based on sqlglot patterns.
//! Key features (extends Hive with modern SQL):
//! - TRY_CAST is supported (Spark 3+)
//! - ILIKE is supported (Spark 3+)
//! - Uses backticks for identifiers
//! - ARRAY_AGG, COLLECT_LIST for array aggregation
//! - STRING_AGG / LISTAGG supported (Spark 4+)
//! - DATE_ADD with unit parameter (Spark 3+)
//! - TIMESTAMPADD, TIMESTAMPDIFF (Spark 3+)
//! - More PostgreSQL-like syntax than Hive

14use super::{DialectImpl, DialectType};
15use crate::error::Result;
16use crate::expressions::{
17    CeilFunc, CurrentTimestamp, DataType, DateTimeField, Expression, ExtractFunc, Function,
18    Literal, StructField, UnaryFunc, VarArgFunc,
19};
20use crate::generator::GeneratorConfig;
21use crate::tokens::TokenizerConfig;
22
/// The Spark SQL dialect (backtick identifiers, Hive-compatible strings,
/// modern SQL features such as TRY_CAST and ILIKE).
pub struct SparkDialect;
25
26impl DialectImpl for SparkDialect {
27    fn dialect_type(&self) -> DialectType {
28        DialectType::Spark
29    }
30
31    fn tokenizer_config(&self) -> TokenizerConfig {
32        let mut config = TokenizerConfig::default();
33        // Spark uses backticks for identifiers (NOT double quotes)
34        config.identifiers.clear();
35        config.identifiers.insert('`', '`');
36        // Spark (like Hive) uses double quotes as string delimiters (QUOTES = ["'", '"'])
37        config.quotes.insert("\"".to_string(), "\"".to_string());
38        // Spark (like Hive) uses backslash escapes in strings (STRING_ESCAPES = ["\\"])
39        config.string_escapes.push('\\');
40        // Spark supports DIV keyword for integer division (inherited from Hive)
41        config
42            .keywords
43            .insert("DIV".to_string(), crate::tokens::TokenType::Div);
44        config
45            .keywords
46            .insert("REPAIR".to_string(), crate::tokens::TokenType::Command);
47        config
48            .keywords
49            .insert("MSCK".to_string(), crate::tokens::TokenType::Command);
50        // Spark numeric literal suffixes (same as Hive): 1L -> BIGINT, 1S -> SMALLINT, etc.
51        config
52            .numeric_literals
53            .insert("L".to_string(), "BIGINT".to_string());
54        config
55            .numeric_literals
56            .insert("S".to_string(), "SMALLINT".to_string());
57        config
58            .numeric_literals
59            .insert("Y".to_string(), "TINYINT".to_string());
60        config
61            .numeric_literals
62            .insert("D".to_string(), "DOUBLE".to_string());
63        config
64            .numeric_literals
65            .insert("F".to_string(), "FLOAT".to_string());
66        config
67            .numeric_literals
68            .insert("BD".to_string(), "DECIMAL".to_string());
69        // Spark allows identifiers to start with digits (e.g., 1a, 1_a)
70        config.identifiers_can_start_with_digit = true;
71        // Spark: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False
72        // Backslashes in raw strings are always literal (no escape processing)
73        config.string_escapes_allowed_in_raw_strings = false;
74        config
75    }
76
77    fn generator_config(&self) -> GeneratorConfig {
78        use crate::generator::IdentifierQuoteStyle;
79        GeneratorConfig {
80            identifier_quote: '`',
81            identifier_quote_style: IdentifierQuoteStyle::BACKTICK,
82            dialect: Some(DialectType::Spark),
83            // Spark uses colon separator in STRUCT field definitions: STRUCT<field_name: TYPE>
84            struct_field_sep: ": ",
85            // Spark doesn't use AS before RETURN in function definitions
86            create_function_return_as: false,
87            // Spark places alias after the TABLESAMPLE clause
88            alias_post_tablesample: true,
89            tablesample_seed_keyword: "REPEATABLE",
90            join_hints: false,
91            identifiers_can_start_with_digit: true,
92            // Spark uses COMMENT 'value' without = sign
93            schema_comment_with_eq: false,
94            ..Default::default()
95        }
96    }
97
98    fn transform_expr(&self, expr: Expression) -> Result<Expression> {
99        match expr {
100            // IFNULL -> COALESCE in Spark
101            Expression::IfNull(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
102                original_name: None,
103                expressions: vec![f.this, f.expression],
104                inferred_type: None,
105            }))),
106
107            // NVL is supported in Spark (from Hive), but COALESCE is standard
108            Expression::Nvl(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
109                original_name: None,
110                expressions: vec![f.this, f.expression],
111                inferred_type: None,
112            }))),
113
114            // Cast: normalize VARCHAR(n) -> STRING, CHAR(n) -> STRING for Spark
115            Expression::Cast(mut c) => {
116                c.to = Self::normalize_spark_type(c.to);
117                Ok(Expression::Cast(c))
118            }
119
120            // TryCast stays as TryCast in Spark (Spark supports TRY_CAST natively)
121            Expression::TryCast(mut c) => {
122                c.to = Self::normalize_spark_type(c.to);
123                Ok(Expression::TryCast(c))
124            }
125
126            // SafeCast -> TRY_CAST
127            Expression::SafeCast(mut c) => {
128                c.to = Self::normalize_spark_type(c.to);
129                Ok(Expression::TryCast(c))
130            }
131
132            // TRIM: non-standard comma syntax -> standard FROM syntax
133            // TRIM('SL', 'SSparkSQLS') -> TRIM('SL' FROM 'SSparkSQLS')
134            Expression::Trim(mut t) => {
135                if !t.sql_standard_syntax && t.characters.is_some() {
136                    // Convert comma syntax to standard SQL syntax
137                    // Fields already have correct semantics: this=string, characters=chars
138                    t.sql_standard_syntax = true;
139                }
140                Ok(Expression::Trim(t))
141            }
142
143            // ILIKE is supported in Spark 3+
144            Expression::ILike(op) => Ok(Expression::ILike(op)),
145
146            // UNNEST -> EXPLODE in Spark (Hive compatibility)
147            Expression::Unnest(f) => Ok(Expression::Explode(Box::new(UnaryFunc::new(f.this)))),
148
149            // EXPLODE is native to Spark
150            Expression::Explode(f) => Ok(Expression::Explode(f)),
151
152            // ExplodeOuter is supported in Spark
153            Expression::ExplodeOuter(f) => Ok(Expression::ExplodeOuter(f)),
154
155            // RANDOM -> RAND in Spark
156            Expression::Random(_) => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
157                seed: None,
158                lower: None,
159                upper: None,
160            }))),
161
162            // Rand is native to Spark
163            Expression::Rand(r) => Ok(Expression::Rand(r)),
164
165            // || (Concat) -> CONCAT in Spark
166            Expression::Concat(op) => Ok(Expression::Function(Box::new(Function::new(
167                "CONCAT".to_string(),
168                vec![op.left, op.right],
169            )))),
170
171            // ParseJson: handled by generator (emits just the string literal for Spark)
172
173            // Generic function transformations
174            Expression::Function(f) => self.transform_function(*f),
175
176            // Generic aggregate function transformations
177            Expression::AggregateFunction(f) => self.transform_aggregate_function(f),
178
179            // $N parameters -> ${N} in Spark (DollarBrace style)
180            Expression::Parameter(mut p)
181                if p.style == crate::expressions::ParameterStyle::Dollar =>
182            {
183                p.style = crate::expressions::ParameterStyle::DollarBrace;
184                // Convert index to name for DollarBrace format
185                if let Some(idx) = p.index {
186                    p.name = Some(idx.to_string());
187                }
188                Ok(Expression::Parameter(p))
189            }
190
191            // JSONExtract with variant_extract (Databricks colon syntax) -> GET_JSON_OBJECT
192            Expression::JSONExtract(je) if je.variant_extract.is_some() => {
193                // Convert path: 'item[1].price' -> '$.item[1].price'
194                let path = match *je.expression {
195                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
196                        let Literal::String(s) = lit.as_ref() else {
197                            unreachable!()
198                        };
199                        Expression::Literal(Box::new(Literal::String(format!("$.{}", s))))
200                    }
201                    other => other,
202                };
203                Ok(Expression::Function(Box::new(Function::new(
204                    "GET_JSON_OBJECT".to_string(),
205                    vec![*je.this, path],
206                ))))
207            }
208
209            // Pass through everything else
210            _ => Ok(expr),
211        }
212    }
213}
214
215impl SparkDialect {
216    /// Normalize a data type for Spark:
217    /// - VARCHAR/CHAR without length -> STRING
218    /// - VARCHAR(n)/CHAR(n) with length -> keep as-is
219    /// - TEXT -> STRING
220    fn normalize_spark_type(dt: DataType) -> DataType {
221        match dt {
222            DataType::VarChar { length: None, .. }
223            | DataType::Char { length: None }
224            | DataType::Text => DataType::Custom {
225                name: "STRING".to_string(),
226            },
227            // VARCHAR(n) and CHAR(n) with length are kept as-is
228            DataType::VarChar { .. } | DataType::Char { .. } => dt,
229            // Also normalize struct fields recursively
230            DataType::Struct { fields, nested } => {
231                let normalized_fields: Vec<StructField> = fields
232                    .into_iter()
233                    .map(|mut f| {
234                        f.data_type = Self::normalize_spark_type(f.data_type);
235                        f
236                    })
237                    .collect();
238                DataType::Struct {
239                    fields: normalized_fields,
240                    nested,
241                }
242            }
243            _ => dt,
244        }
245    }
246
247    fn transform_function(&self, f: Function) -> Result<Expression> {
248        let name_upper = f.name.to_uppercase();
249        match name_upper.as_str() {
250            // IFNULL -> COALESCE
251            "IFNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
252                original_name: None,
253                expressions: f.args,
254                inferred_type: None,
255            }))),
256
257            // NVL -> COALESCE
258            "NVL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
259                original_name: None,
260                expressions: f.args,
261                inferred_type: None,
262            }))),
263
264            // ISNULL -> COALESCE
265            "ISNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
266                original_name: None,
267                expressions: f.args,
268                inferred_type: None,
269            }))),
270
271            // GROUP_CONCAT -> CONCAT_WS + COLLECT_LIST in older Spark
272            // In Spark 4+, STRING_AGG is available
273            "GROUP_CONCAT" if !f.args.is_empty() => {
274                // For simplicity, use COLLECT_LIST (array aggregation)
275                Ok(Expression::Function(Box::new(Function::new(
276                    "COLLECT_LIST".to_string(),
277                    f.args,
278                ))))
279            }
280
281            // STRING_AGG is supported in Spark 4+
282            // For older versions, fall back to CONCAT_WS + COLLECT_LIST
283            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
284                Function::new("COLLECT_LIST".to_string(), f.args),
285            ))),
286
287            // LISTAGG -> STRING_AGG in Spark 4+ (or COLLECT_LIST for older)
288            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
289                "COLLECT_LIST".to_string(),
290                f.args,
291            )))),
292
293            // SUBSTRING is native to Spark
294            "SUBSTRING" | "SUBSTR" => Ok(Expression::Function(Box::new(f))),
295
296            // LENGTH is native to Spark
297            "LENGTH" => Ok(Expression::Function(Box::new(f))),
298
299            // LEN -> LENGTH
300            "LEN" if f.args.len() == 1 => Ok(Expression::Length(Box::new(UnaryFunc::new(
301                f.args.into_iter().next().unwrap(),
302            )))),
303
304            // RANDOM -> RAND
305            "RANDOM" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
306                seed: None,
307                lower: None,
308                upper: None,
309            }))),
310
311            // RAND is native to Spark
312            "RAND" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
313                seed: None,
314                lower: None,
315                upper: None,
316            }))),
317
318            // NOW -> CURRENT_TIMESTAMP
319            "NOW" => Ok(Expression::CurrentTimestamp(
320                crate::expressions::CurrentTimestamp {
321                    precision: None,
322                    sysdate: false,
323                },
324            )),
325
326            // GETDATE -> CURRENT_TIMESTAMP
327            "GETDATE" => Ok(Expression::CurrentTimestamp(
328                crate::expressions::CurrentTimestamp {
329                    precision: None,
330                    sysdate: false,
331                },
332            )),
333
334            // CURRENT_TIMESTAMP is native
335            "CURRENT_TIMESTAMP" => Ok(Expression::CurrentTimestamp(
336                crate::expressions::CurrentTimestamp {
337                    precision: None,
338                    sysdate: false,
339                },
340            )),
341
342            // CURRENT_DATE is native
343            "CURRENT_DATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
344
345            // TO_DATE is native to Spark; strip default format 'yyyy-MM-dd'
346            "TO_DATE" if f.args.len() == 2 => {
347                let is_default_format = matches!(&f.args[1], Expression::Literal(lit) if matches!(lit.as_ref(), crate::expressions::Literal::String(s) if s == "yyyy-MM-dd"));
348                if is_default_format {
349                    Ok(Expression::Function(Box::new(Function::new(
350                        "TO_DATE".to_string(),
351                        vec![f.args.into_iter().next().unwrap()],
352                    ))))
353                } else {
354                    Ok(Expression::Function(Box::new(f)))
355                }
356            }
357            "TO_DATE" => Ok(Expression::Function(Box::new(f))),
358
359            // TO_TIMESTAMP is native to Spark
360            "TO_TIMESTAMP" => Ok(Expression::Function(Box::new(f))),
361
362            // DATE_FORMAT is native to Spark
363            "DATE_FORMAT" => Ok(Expression::Function(Box::new(f))),
364
365            // strftime -> DATE_FORMAT
366            "STRFTIME" => Ok(Expression::Function(Box::new(Function::new(
367                "DATE_FORMAT".to_string(),
368                f.args,
369            )))),
370
371            // TO_CHAR -> DATE_FORMAT
372            "TO_CHAR" => Ok(Expression::Function(Box::new(Function::new(
373                "DATE_FORMAT".to_string(),
374                f.args,
375            )))),
376
377            // DATE_TRUNC is native to Spark
378            "DATE_TRUNC" => Ok(Expression::Function(Box::new(f))),
379
380            // TRUNC is native to Spark
381            "TRUNC" => Ok(Expression::Function(Box::new(f))),
382
383            // EXTRACT is native to Spark
384            "EXTRACT" => Ok(Expression::Function(Box::new(f))),
385
386            // DATEPART -> EXTRACT
387            "DATEPART" => Ok(Expression::Function(Box::new(Function::new(
388                "EXTRACT".to_string(),
389                f.args,
390            )))),
391
392            // UNIX_TIMESTAMP is native to Spark
393            // When called with no args, add CURRENT_TIMESTAMP() as default
394            "UNIX_TIMESTAMP" => {
395                if f.args.is_empty() {
396                    Ok(Expression::Function(Box::new(Function::new(
397                        "UNIX_TIMESTAMP".to_string(),
398                        vec![Expression::CurrentTimestamp(CurrentTimestamp {
399                            precision: None,
400                            sysdate: false,
401                        })],
402                    ))))
403                } else {
404                    Ok(Expression::Function(Box::new(f)))
405                }
406            }
407
408            // FROM_UNIXTIME is native to Spark
409            "FROM_UNIXTIME" => Ok(Expression::Function(Box::new(f))),
410
411            // STR_TO_MAP is native to Spark
412            // When called with only one arg, add default delimiters ',' and ':'
413            "STR_TO_MAP" => {
414                if f.args.len() == 1 {
415                    let mut args = f.args;
416                    args.push(Expression::Literal(Box::new(
417                        crate::expressions::Literal::String(",".to_string()),
418                    )));
419                    args.push(Expression::Literal(Box::new(
420                        crate::expressions::Literal::String(":".to_string()),
421                    )));
422                    Ok(Expression::Function(Box::new(Function::new(
423                        "STR_TO_MAP".to_string(),
424                        args,
425                    ))))
426                } else {
427                    Ok(Expression::Function(Box::new(f)))
428                }
429            }
430
431            // POSITION is native to Spark (POSITION(substr IN str))
432            "POSITION" => Ok(Expression::Function(Box::new(f))),
433
434            // LOCATE is native to Spark
435            "LOCATE" => Ok(Expression::Function(Box::new(f))),
436
437            // STRPOS -> Use expression form or LOCATE
438            "STRPOS" if f.args.len() == 2 => {
439                let mut args = f.args;
440                let first = args.remove(0);
441                let second = args.remove(0);
442                // LOCATE(substr, str) in Spark
443                Ok(Expression::Function(Box::new(Function::new(
444                    "LOCATE".to_string(),
445                    vec![second, first],
446                ))))
447            }
448
449            // CHARINDEX -> LOCATE
450            "CHARINDEX" if f.args.len() >= 2 => {
451                let mut args = f.args;
452                let substring = args.remove(0);
453                let string = args.remove(0);
454                let mut locate_args = vec![substring, string];
455                if !args.is_empty() {
456                    locate_args.push(args.remove(0));
457                }
458                Ok(Expression::Function(Box::new(Function::new(
459                    "LOCATE".to_string(),
460                    locate_args,
461                ))))
462            }
463
464            // INSTR is native to Spark
465            "INSTR" => Ok(Expression::Function(Box::new(f))),
466
467            // CEILING -> CEIL
468            "CEILING" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
469                this: f.args.into_iter().next().unwrap(),
470                decimals: None,
471                to: None,
472            }))),
473
474            // CEIL is native to Spark
475            "CEIL" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
476                this: f.args.into_iter().next().unwrap(),
477                decimals: None,
478                to: None,
479            }))),
480
481            // UNNEST -> EXPLODE
482            "UNNEST" => Ok(Expression::Function(Box::new(Function::new(
483                "EXPLODE".to_string(),
484                f.args,
485            )))),
486
487            // FLATTEN -> FLATTEN is native to Spark (for nested arrays)
488            "FLATTEN" => Ok(Expression::Function(Box::new(f))),
489
490            // ARRAY_AGG -> COLLECT_LIST
491            "ARRAY_AGG" => Ok(Expression::Function(Box::new(Function::new(
492                "COLLECT_LIST".to_string(),
493                f.args,
494            )))),
495
496            // COLLECT_LIST is native to Spark
497            "COLLECT_LIST" => Ok(Expression::Function(Box::new(f))),
498
499            // COLLECT_SET is native to Spark
500            "COLLECT_SET" => Ok(Expression::Function(Box::new(f))),
501
502            // ARRAY_LENGTH -> SIZE in Spark
503            "ARRAY_LENGTH" | "CARDINALITY" => Ok(Expression::Function(Box::new(Function::new(
504                "SIZE".to_string(),
505                f.args,
506            )))),
507
508            // SIZE is native to Spark
509            "SIZE" => Ok(Expression::Function(Box::new(f))),
510
511            // SPLIT is native to Spark
512            "SPLIT" => Ok(Expression::Function(Box::new(f))),
513
514            // REGEXP_REPLACE: Spark supports up to 4 args (subject, pattern, replacement, position)
515            // Strip extra Snowflake args (occurrence, params) if present
516            "REGEXP_REPLACE" if f.args.len() > 4 => {
517                let mut args = f.args;
518                args.truncate(4);
519                Ok(Expression::Function(Box::new(Function::new(
520                    "REGEXP_REPLACE".to_string(),
521                    args,
522                ))))
523            }
524            "REGEXP_REPLACE" => Ok(Expression::Function(Box::new(f))),
525
526            // REGEXP_EXTRACT is native to Spark
527            "REGEXP_EXTRACT" => Ok(Expression::Function(Box::new(f))),
528
529            // REGEXP_EXTRACT_ALL is native to Spark
530            "REGEXP_EXTRACT_ALL" => Ok(Expression::Function(Box::new(f))),
531
532            // RLIKE is native to Spark
533            "RLIKE" | "REGEXP_LIKE" => Ok(Expression::Function(Box::new(Function::new(
534                "RLIKE".to_string(),
535                f.args,
536            )))),
537
538            // JSON_EXTRACT -> GET_JSON_OBJECT (Hive style) or :: operator
539            "JSON_EXTRACT" => Ok(Expression::Function(Box::new(Function::new(
540                "GET_JSON_OBJECT".to_string(),
541                f.args,
542            )))),
543
544            // JSON_EXTRACT_SCALAR -> GET_JSON_OBJECT
545            "JSON_EXTRACT_SCALAR" => Ok(Expression::Function(Box::new(Function::new(
546                "GET_JSON_OBJECT".to_string(),
547                f.args,
548            )))),
549
550            // GET_JSON_OBJECT is native to Spark
551            "GET_JSON_OBJECT" => Ok(Expression::Function(Box::new(f))),
552
553            // FROM_JSON is native to Spark
554            "FROM_JSON" => Ok(Expression::Function(Box::new(f))),
555
556            // TO_JSON is native to Spark
557            "TO_JSON" => Ok(Expression::Function(Box::new(f))),
558
559            // PARSE_JSON -> strip for Spark (just keep the string argument)
560            "PARSE_JSON" if f.args.len() == 1 => Ok(f.args.into_iter().next().unwrap()),
561            "PARSE_JSON" => Ok(Expression::Function(Box::new(Function::new(
562                "FROM_JSON".to_string(),
563                f.args,
564            )))),
565
566            // DATEDIFF is native to Spark (supports unit in Spark 3+)
567            "DATEDIFF" | "DATE_DIFF" => Ok(Expression::Function(Box::new(Function::new(
568                "DATEDIFF".to_string(),
569                f.args,
570            )))),
571
572            // DATE_ADD is native to Spark
573            "DATE_ADD" | "DATEADD" => Ok(Expression::Function(Box::new(Function::new(
574                "DATE_ADD".to_string(),
575                f.args,
576            )))),
577
578            // DATE_SUB is native to Spark
579            "DATE_SUB" => Ok(Expression::Function(Box::new(f))),
580
581            // TIMESTAMPADD is native to Spark 3+
582            "TIMESTAMPADD" => Ok(Expression::Function(Box::new(f))),
583
584            // TIMESTAMPDIFF is native to Spark 3+
585            "TIMESTAMPDIFF" => Ok(Expression::Function(Box::new(f))),
586
587            // ADD_MONTHS is native to Spark
588            "ADD_MONTHS" => Ok(Expression::Function(Box::new(f))),
589
590            // MONTHS_BETWEEN is native to Spark
591            "MONTHS_BETWEEN" => Ok(Expression::Function(Box::new(f))),
592
593            // NVL is native to Spark
594            "NVL" => Ok(Expression::Function(Box::new(f))),
595
596            // NVL2 is native to Spark
597            "NVL2" => Ok(Expression::Function(Box::new(f))),
598
599            // MAP is native to Spark
600            "MAP" => Ok(Expression::Function(Box::new(f))),
601
602            // ARRAY is native to Spark
603            "ARRAY" => Ok(Expression::Function(Box::new(f))),
604
605            // ROW -> STRUCT for Spark (cross-dialect, no auto-naming)
606            "ROW" => Ok(Expression::Function(Box::new(Function::new(
607                "STRUCT".to_string(),
608                f.args,
609            )))),
610
611            // STRUCT is native to Spark - auto-name unnamed args as col1, col2, etc.
612            "STRUCT" => {
613                let mut col_idx = 1usize;
614                let named_args: Vec<Expression> = f
615                    .args
616                    .into_iter()
617                    .map(|arg| {
618                        let current_idx = col_idx;
619                        col_idx += 1;
620                        // Check if arg already has an alias (AS name) or is Star
621                        match &arg {
622                            Expression::Alias(_) => arg, // already named
623                            Expression::Star(_) => arg,  // STRUCT(*) - keep as-is
624                            Expression::Column(c) if c.table.is_none() => {
625                                // Column reference: use column name as the struct field name
626                                let name = c.name.name.clone();
627                                Expression::Alias(Box::new(crate::expressions::Alias {
628                                    this: arg,
629                                    alias: crate::expressions::Identifier::new(&name),
630                                    column_aliases: Vec::new(),
631                                    pre_alias_comments: Vec::new(),
632                                    trailing_comments: Vec::new(),
633                                    inferred_type: None,
634                                }))
635                            }
636                            _ => {
637                                // Unnamed literal/expression: auto-name as colN
638                                let name = format!("col{}", current_idx);
639                                Expression::Alias(Box::new(crate::expressions::Alias {
640                                    this: arg,
641                                    alias: crate::expressions::Identifier::new(&name),
642                                    column_aliases: Vec::new(),
643                                    pre_alias_comments: Vec::new(),
644                                    trailing_comments: Vec::new(),
645                                    inferred_type: None,
646                                }))
647                            }
648                        }
649                    })
650                    .collect();
651                Ok(Expression::Function(Box::new(Function {
652                    name: "STRUCT".to_string(),
653                    args: named_args,
654                    distinct: false,
655                    trailing_comments: Vec::new(),
656                    use_bracket_syntax: false,
657                    no_parens: false,
658                    quoted: false,
659                    span: None,
660                    inferred_type: None,
661                })))
662            }
663
664            // NAMED_STRUCT is native to Spark
665            "NAMED_STRUCT" => Ok(Expression::Function(Box::new(f))),
666
667            // MAP_FROM_ARRAYS is native to Spark
668            "MAP_FROM_ARRAYS" => Ok(Expression::Function(Box::new(f))),
669
670            // ARRAY_SORT is native to Spark
671            "ARRAY_SORT" => Ok(Expression::Function(Box::new(f))),
672
673            // ARRAY_DISTINCT is native to Spark
674            "ARRAY_DISTINCT" => Ok(Expression::Function(Box::new(f))),
675
676            // ARRAY_UNION is native to Spark
677            "ARRAY_UNION" => Ok(Expression::Function(Box::new(f))),
678
679            // ARRAY_INTERSECT is native to Spark
680            "ARRAY_INTERSECT" => Ok(Expression::Function(Box::new(f))),
681
682            // ARRAY_EXCEPT is native to Spark
683            "ARRAY_EXCEPT" => Ok(Expression::Function(Box::new(f))),
684
685            // ARRAY_CONTAINS is native to Spark
686            "ARRAY_CONTAINS" => Ok(Expression::Function(Box::new(f))),
687
688            // ELEMENT_AT is native to Spark
689            "ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
690
691            // TRY_ELEMENT_AT is native to Spark 3+
692            "TRY_ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
693
694            // TRANSFORM is native to Spark (array transformation)
695            "TRANSFORM" => Ok(Expression::Function(Box::new(f))),
696
697            // FILTER is native to Spark (array filtering)
698            "FILTER" => Ok(Expression::Function(Box::new(f))),
699
700            // AGGREGATE is native to Spark (array reduction)
701            "AGGREGATE" => Ok(Expression::Function(Box::new(f))),
702
703            // SEQUENCE is native to Spark (generate array)
704            "SEQUENCE" => Ok(Expression::Function(Box::new(f))),
705
706            // GENERATE_SERIES -> SEQUENCE
707            "GENERATE_SERIES" => Ok(Expression::Function(Box::new(Function::new(
708                "SEQUENCE".to_string(),
709                f.args,
710            )))),
711
712            // STARTSWITH is native to Spark 3+
713            "STARTSWITH" | "STARTS_WITH" => Ok(Expression::Function(Box::new(Function::new(
714                "STARTSWITH".to_string(),
715                f.args,
716            )))),
717
718            // ENDSWITH is native to Spark 3+
719            "ENDSWITH" | "ENDS_WITH" => Ok(Expression::Function(Box::new(Function::new(
720                "ENDSWITH".to_string(),
721                f.args,
722            )))),
723
724            // ARRAY_CONSTRUCT_COMPACT(1, null, 2) -> ARRAY_COMPACT(ARRAY(1, NULL, 2))
725            "ARRAY_CONSTRUCT_COMPACT" => {
726                let inner =
727                    Expression::Function(Box::new(Function::new("ARRAY".to_string(), f.args)));
728                Ok(Expression::Function(Box::new(Function::new(
729                    "ARRAY_COMPACT".to_string(),
730                    vec![inner],
731                ))))
732            }
733
734            // ARRAY_TO_STRING -> ARRAY_JOIN
735            "ARRAY_TO_STRING" => Ok(Expression::Function(Box::new(Function::new(
736                "ARRAY_JOIN".to_string(),
737                f.args,
738            )))),
739
740            // TO_ARRAY(x) -> IF(x IS NULL, NULL, ARRAY(x))
741            "TO_ARRAY" if f.args.len() == 1 => {
742                let x = f.args[0].clone();
743                // Check if arg is already an array constructor (bracket notation)
744                // In that case: TO_ARRAY(['test']) -> ARRAY('test')
745                match &x {
746                    Expression::ArrayFunc(arr) => {
747                        // Just convert to ARRAY(...) function
748                        Ok(Expression::Function(Box::new(Function::new(
749                            "ARRAY".to_string(),
750                            arr.expressions.clone(),
751                        ))))
752                    }
753                    _ => Ok(Expression::IfFunc(Box::new(crate::expressions::IfFunc {
754                        condition: Expression::IsNull(Box::new(crate::expressions::IsNull {
755                            this: x.clone(),
756                            not: false,
757                            postfix_form: false,
758                        })),
759                        true_value: Expression::Null(crate::expressions::Null),
760                        false_value: Some(Expression::Function(Box::new(Function::new(
761                            "ARRAY".to_string(),
762                            vec![x],
763                        )))),
764                        original_name: Some("IF".to_string()),
765                        inferred_type: None,
766                    }))),
767                }
768            }
769
770            // REGEXP_SUBSTR -> REGEXP_EXTRACT (strip extra args)
771            "REGEXP_SUBSTR" if f.args.len() >= 2 => {
772                let subject = f.args[0].clone();
773                let pattern = f.args[1].clone();
774                // For Spark: REGEXP_EXTRACT(subject, pattern, group)
775                // group defaults to 0 for full match, but sqlglot uses last arg if present
776                let group = if f.args.len() >= 6 {
777                    let g = &f.args[5];
778                    // If group is literal 1 (default), omit it
779                    if matches!(g, Expression::Literal(lit) if matches!(lit.as_ref(), Literal::Number(n) if n == "1"))
780                    {
781                        None
782                    } else {
783                        Some(g.clone())
784                    }
785                } else {
786                    None
787                };
788                let mut args = vec![subject, pattern];
789                if let Some(g) = group {
790                    args.push(g);
791                }
792                Ok(Expression::Function(Box::new(Function::new(
793                    "REGEXP_EXTRACT".to_string(),
794                    args,
795                ))))
796            }
797
798            // UUID_STRING -> UUID()
799            "UUID_STRING" => Ok(Expression::Function(Box::new(Function::new(
800                "UUID".to_string(),
801                vec![],
802            )))),
803
804            // OBJECT_CONSTRUCT -> STRUCT in Spark
805            "OBJECT_CONSTRUCT" if f.args.len() >= 2 && f.args.len() % 2 == 0 => {
806                // Convert key-value pairs to named struct fields
807                // OBJECT_CONSTRUCT('Manitoba', 'Winnipeg', 'foo', 'bar')
808                // -> STRUCT('Winnipeg' AS Manitoba, 'bar' AS foo)
809                let mut struct_args = Vec::new();
810                for pair in f.args.chunks(2) {
811                    if let Expression::Literal(lit) = &pair[0] {
812                        if let Literal::String(key) = lit.as_ref() {
813                            struct_args.push(Expression::Alias(Box::new(
814                                crate::expressions::Alias {
815                                    this: pair[1].clone(),
816                                    alias: crate::expressions::Identifier::new(key.clone()),
817                                    column_aliases: vec![],
818                                    pre_alias_comments: vec![],
819                                    trailing_comments: vec![],
820                                    inferred_type: None,
821                                },
822                            )));
823                        }
824                    } else {
825                        struct_args.push(pair[1].clone());
826                    }
827                }
828                Ok(Expression::Function(Box::new(Function::new(
829                    "STRUCT".to_string(),
830                    struct_args,
831                ))))
832            }
833
834            // DATE_PART(part, expr) -> EXTRACT(part FROM expr)
835            "DATE_PART" if f.args.len() == 2 => {
836                let mut args = f.args;
837                let part = args.remove(0);
838                let expr = args.remove(0);
839                if let Some(field) = expr_to_datetime_field(&part) {
840                    Ok(Expression::Extract(Box::new(ExtractFunc {
841                        this: expr,
842                        field,
843                    })))
844                } else {
845                    // Can't parse the field, keep as function
846                    Ok(Expression::Function(Box::new(Function::new(
847                        "DATE_PART".to_string(),
848                        vec![part, expr],
849                    ))))
850                }
851            }
852
853            // GET_PATH(obj, path) -> GET_JSON_OBJECT(obj, json_path) in Spark
854            "GET_PATH" if f.args.len() == 2 => {
855                let mut args = f.args;
856                let this = args.remove(0);
857                let path = args.remove(0);
858                let json_path = match &path {
859                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
860                        let Literal::String(s) = lit.as_ref() else {
861                            unreachable!()
862                        };
863                        let normalized = if s.starts_with('$') {
864                            s.clone()
865                        } else if s.starts_with('[') {
866                            format!("${}", s)
867                        } else {
868                            format!("$.{}", s)
869                        };
870                        Expression::Literal(Box::new(Literal::String(normalized)))
871                    }
872                    _ => path,
873                };
874                Ok(Expression::Function(Box::new(Function::new(
875                    "GET_JSON_OBJECT".to_string(),
876                    vec![this, json_path],
877                ))))
878            }
879
880            // BITWISE_LEFT_SHIFT → SHIFTLEFT
881            "BITWISE_LEFT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
882                "SHIFTLEFT".to_string(),
883                f.args,
884            )))),
885
886            // BITWISE_RIGHT_SHIFT → SHIFTRIGHT
887            "BITWISE_RIGHT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
888                "SHIFTRIGHT".to_string(),
889                f.args,
890            )))),
891
892            // APPROX_DISTINCT → APPROX_COUNT_DISTINCT
893            "APPROX_DISTINCT" => Ok(Expression::Function(Box::new(Function::new(
894                "APPROX_COUNT_DISTINCT".to_string(),
895                f.args,
896            )))),
897
898            // ARRAY_SLICE → SLICE
899            "ARRAY_SLICE" => Ok(Expression::Function(Box::new(Function::new(
900                "SLICE".to_string(),
901                f.args,
902            )))),
903
904            // DATE_FROM_PARTS → MAKE_DATE
905            "DATE_FROM_PARTS" => Ok(Expression::Function(Box::new(Function::new(
906                "MAKE_DATE".to_string(),
907                f.args,
908            )))),
909
910            // DAYOFWEEK_ISO → DAYOFWEEK
911            "DAYOFWEEK_ISO" => Ok(Expression::Function(Box::new(Function::new(
912                "DAYOFWEEK".to_string(),
913                f.args,
914            )))),
915
916            // FORMAT → FORMAT_STRING
917            "FORMAT" => Ok(Expression::Function(Box::new(Function::new(
918                "FORMAT_STRING".to_string(),
919                f.args,
920            )))),
921
922            // LOGICAL_AND → BOOL_AND
923            "LOGICAL_AND" => Ok(Expression::Function(Box::new(Function::new(
924                "BOOL_AND".to_string(),
925                f.args,
926            )))),
927
928            // VARIANCE_POP → VAR_POP
929            "VARIANCE_POP" => Ok(Expression::Function(Box::new(Function::new(
930                "VAR_POP".to_string(),
931                f.args,
932            )))),
933
934            // WEEK_OF_YEAR → WEEKOFYEAR
935            "WEEK_OF_YEAR" => Ok(Expression::Function(Box::new(Function::new(
936                "WEEKOFYEAR".to_string(),
937                f.args,
938            )))),
939
940            // BIT_GET -> GETBIT
941            "BIT_GET" => Ok(Expression::Function(Box::new(Function::new(
942                "GETBIT".to_string(),
943                f.args,
944            )))),
945
946            // CURDATE -> CURRENT_DATE
947            "CURDATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
948
949            // Pass through everything else
950            _ => Ok(Expression::Function(Box::new(f))),
951        }
952    }
953
954    fn transform_aggregate_function(
955        &self,
956        f: Box<crate::expressions::AggregateFunction>,
957    ) -> Result<Expression> {
958        let name_upper = f.name.to_uppercase();
959        match name_upper.as_str() {
960            // GROUP_CONCAT -> COLLECT_LIST (then CONCAT_WS for string)
961            "GROUP_CONCAT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
962                Function::new("COLLECT_LIST".to_string(), f.args),
963            ))),
964
965            // STRING_AGG -> COLLECT_LIST (or STRING_AGG in Spark 4+)
966            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
967                Function::new("COLLECT_LIST".to_string(), f.args),
968            ))),
969
970            // LISTAGG -> COLLECT_LIST
971            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
972                "COLLECT_LIST".to_string(),
973                f.args,
974            )))),
975
976            // ARRAY_AGG -> COLLECT_LIST (preserve distinct and filter)
977            "ARRAY_AGG" if !f.args.is_empty() => {
978                let mut af = f;
979                af.name = "COLLECT_LIST".to_string();
980                Ok(Expression::AggregateFunction(af))
981            }
982
983            // LOGICAL_OR -> BOOL_OR in Spark
984            "LOGICAL_OR" if !f.args.is_empty() => {
985                let mut af = f;
986                af.name = "BOOL_OR".to_string();
987                Ok(Expression::AggregateFunction(af))
988            }
989
990            // Pass through everything else
991            _ => Ok(Expression::AggregateFunction(f)),
992        }
993    }
994}
995
996/// Convert an expression (string literal or identifier) to a DateTimeField
997fn expr_to_datetime_field(expr: &Expression) -> Option<DateTimeField> {
998    let name = match expr {
999        Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
1000            let Literal::String(s) = lit.as_ref() else {
1001                unreachable!()
1002            };
1003            s.to_uppercase()
1004        }
1005        Expression::Identifier(id) => id.name.to_uppercase(),
1006        Expression::Var(v) => v.this.to_uppercase(),
1007        Expression::Column(col) if col.table.is_none() => col.name.name.to_uppercase(),
1008        _ => return None,
1009    };
1010    match name.as_str() {
1011        "YEAR" | "Y" | "YY" | "YYY" | "YYYY" | "YR" | "YEARS" | "YRS" => Some(DateTimeField::Year),
1012        "MONTH" | "MM" | "MON" | "MONS" | "MONTHS" => Some(DateTimeField::Month),
1013        "DAY" | "D" | "DD" | "DAYS" | "DAYOFMONTH" => Some(DateTimeField::Day),
1014        "HOUR" | "H" | "HH" | "HR" | "HOURS" | "HRS" => Some(DateTimeField::Hour),
1015        "MINUTE" | "MI" | "MIN" | "MINUTES" | "MINS" => Some(DateTimeField::Minute),
1016        "SECOND" | "S" | "SEC" | "SECONDS" | "SECS" => Some(DateTimeField::Second),
1017        "MILLISECOND" | "MS" | "MSEC" | "MILLISECONDS" => Some(DateTimeField::Millisecond),
1018        "MICROSECOND" | "US" | "USEC" | "MICROSECONDS" => Some(DateTimeField::Microsecond),
1019        "DOW" | "DAYOFWEEK" | "DAYOFWEEK_ISO" | "DW" => Some(DateTimeField::DayOfWeek),
1020        "DOY" | "DAYOFYEAR" => Some(DateTimeField::DayOfYear),
1021        "WEEK" | "W" | "WK" | "WEEKOFYEAR" | "WOY" => Some(DateTimeField::Week),
1022        "QUARTER" | "Q" | "QTR" | "QTRS" | "QUARTERS" => Some(DateTimeField::Quarter),
1023        "EPOCH" | "EPOCH_SECOND" | "EPOCH_SECONDS" => Some(DateTimeField::Epoch),
1024        "TIMEZONE" | "TIMEZONE_HOUR" | "TZH" => Some(DateTimeField::TimezoneHour),
1025        "TIMEZONE_MINUTE" | "TZM" => Some(DateTimeField::TimezoneMinute),
1026        _ => Some(DateTimeField::Custom(name)),
1027    }
1028}