Skip to main content

polyglot_sql/dialects/
spark.rs

1//! Spark SQL Dialect
2//!
3//! Spark SQL-specific transformations based on sqlglot patterns.
4//! Key features (extends Hive with modern SQL):
5//! - TRY_CAST is supported (Spark 3+)
6//! - ILIKE is supported (Spark 3+)
7//! - Uses backticks for identifiers
8//! - ARRAY_AGG, COLLECT_LIST for array aggregation
9//! - STRING_AGG / LISTAGG supported (Spark 4+)
10//! - DATE_ADD with unit parameter (Spark 3+)
11//! - TIMESTAMPADD, TIMESTAMPDIFF (Spark 3+)
12//! - More PostgreSQL-like syntax than Hive
13
14use super::{DialectImpl, DialectType};
15use crate::error::Result;
16use crate::expressions::{
17    CeilFunc, CurrentTimestamp, DataType, DateTimeField, Expression, ExtractFunc, Function,
18    Literal, StructField, UnaryFunc, VarArgFunc,
19};
20use crate::generator::GeneratorConfig;
21use crate::tokens::TokenizerConfig;
22
/// Spark SQL dialect.
///
/// A field-less marker type: it carries no state of its own. The Spark-specific
/// tokenizer settings, generator settings and expression rewrites are provided
/// by its `DialectImpl` implementation.
pub struct SparkDialect;
25
26impl DialectImpl for SparkDialect {
27    fn dialect_type(&self) -> DialectType {
28        DialectType::Spark
29    }
30
31    fn tokenizer_config(&self) -> TokenizerConfig {
32        let mut config = TokenizerConfig::default();
33        // Spark uses backticks for identifiers (NOT double quotes)
34        config.identifiers.clear();
35        config.identifiers.insert('`', '`');
36        // Spark (like Hive) uses double quotes as string delimiters (QUOTES = ["'", '"'])
37        config.quotes.insert("\"".to_string(), "\"".to_string());
38        // Spark (like Hive) uses backslash escapes in strings (STRING_ESCAPES = ["\\"])
39        config.string_escapes.push('\\');
40        // Spark supports DIV keyword for integer division (inherited from Hive)
41        config
42            .keywords
43            .insert("DIV".to_string(), crate::tokens::TokenType::Div);
44        config
45            .keywords
46            .insert("REPAIR".to_string(), crate::tokens::TokenType::Command);
47        config
48            .keywords
49            .insert("MSCK".to_string(), crate::tokens::TokenType::Command);
50        // Spark numeric literal suffixes (same as Hive): 1L -> BIGINT, 1S -> SMALLINT, etc.
51        config
52            .numeric_literals
53            .insert("L".to_string(), "BIGINT".to_string());
54        config
55            .numeric_literals
56            .insert("S".to_string(), "SMALLINT".to_string());
57        config
58            .numeric_literals
59            .insert("Y".to_string(), "TINYINT".to_string());
60        config
61            .numeric_literals
62            .insert("D".to_string(), "DOUBLE".to_string());
63        config
64            .numeric_literals
65            .insert("F".to_string(), "FLOAT".to_string());
66        config
67            .numeric_literals
68            .insert("BD".to_string(), "DECIMAL".to_string());
69        // Spark allows identifiers to start with digits (e.g., 1a, 1_a)
70        config.identifiers_can_start_with_digit = true;
71        // Spark: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False
72        // Backslashes in raw strings are always literal (no escape processing)
73        config.string_escapes_allowed_in_raw_strings = false;
74        config
75    }
76
77    fn generator_config(&self) -> GeneratorConfig {
78        use crate::generator::IdentifierQuoteStyle;
79        GeneratorConfig {
80            identifier_quote: '`',
81            identifier_quote_style: IdentifierQuoteStyle::BACKTICK,
82            dialect: Some(DialectType::Spark),
83            // Spark uses colon separator in STRUCT field definitions: STRUCT<field_name: TYPE>
84            struct_field_sep: ": ",
85            // Spark doesn't use AS before RETURN in function definitions
86            create_function_return_as: false,
87            // Spark places alias after the TABLESAMPLE clause
88            alias_post_tablesample: true,
89            tablesample_seed_keyword: "REPEATABLE",
90            join_hints: false,
91            identifiers_can_start_with_digit: true,
92            // Spark uses COMMENT 'value' without = sign
93            schema_comment_with_eq: false,
94            ..Default::default()
95        }
96    }
97
98    fn transform_expr(&self, expr: Expression) -> Result<Expression> {
99        match expr {
100            // IFNULL -> COALESCE in Spark
101            Expression::IfNull(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
102                original_name: None,
103                expressions: vec![f.this, f.expression],
104                inferred_type: None,
105            }))),
106
107            // NVL is supported in Spark (from Hive), but COALESCE is standard
108            Expression::Nvl(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
109                original_name: None,
110                expressions: vec![f.this, f.expression],
111                inferred_type: None,
112            }))),
113
114            // Cast: normalize VARCHAR(n) -> STRING, CHAR(n) -> STRING for Spark
115            Expression::Cast(mut c) => {
116                c.to = Self::normalize_spark_type(c.to);
117                Ok(Expression::Cast(c))
118            }
119
120            // TryCast stays as TryCast in Spark (Spark supports TRY_CAST natively)
121            Expression::TryCast(mut c) => {
122                c.to = Self::normalize_spark_type(c.to);
123                Ok(Expression::TryCast(c))
124            }
125
126            // SafeCast -> TRY_CAST
127            Expression::SafeCast(mut c) => {
128                c.to = Self::normalize_spark_type(c.to);
129                Ok(Expression::TryCast(c))
130            }
131
132            // TRIM: non-standard comma syntax -> standard FROM syntax
133            // TRIM('SL', 'SSparkSQLS') -> TRIM('SL' FROM 'SSparkSQLS')
134            Expression::Trim(mut t) => {
135                if !t.sql_standard_syntax && t.characters.is_some() {
136                    // Convert comma syntax to standard SQL syntax
137                    // Fields already have correct semantics: this=string, characters=chars
138                    t.sql_standard_syntax = true;
139                }
140                Ok(Expression::Trim(t))
141            }
142
143            // ILIKE is supported in Spark 3+
144            Expression::ILike(op) => Ok(Expression::ILike(op)),
145
146            // UNNEST -> EXPLODE in Spark (Hive compatibility)
147            Expression::Unnest(f) => Ok(Expression::Explode(Box::new(UnaryFunc::new(f.this)))),
148
149            // EXPLODE is native to Spark
150            Expression::Explode(f) => Ok(Expression::Explode(f)),
151
152            // ExplodeOuter is supported in Spark
153            Expression::ExplodeOuter(f) => Ok(Expression::ExplodeOuter(f)),
154
155            // RANDOM -> RAND in Spark
156            Expression::Random(_) => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
157                seed: None,
158                lower: None,
159                upper: None,
160            }))),
161
162            // Rand is native to Spark
163            Expression::Rand(r) => Ok(Expression::Rand(r)),
164
165            // || (Concat) -> CONCAT in Spark
166            Expression::Concat(op) => Ok(Expression::Function(Box::new(Function::new(
167                "CONCAT".to_string(),
168                vec![op.left, op.right],
169            )))),
170
171            // ParseJson: handled by generator (emits just the string literal for Spark)
172
173            // Generic function transformations
174            Expression::Function(f) => self.transform_function(*f),
175
176            // Generic aggregate function transformations
177            Expression::AggregateFunction(f) => self.transform_aggregate_function(f),
178
179            // $N parameters -> ${N} in Spark (DollarBrace style)
180            Expression::Parameter(mut p)
181                if p.style == crate::expressions::ParameterStyle::Dollar =>
182            {
183                p.style = crate::expressions::ParameterStyle::DollarBrace;
184                // Convert index to name for DollarBrace format
185                if let Some(idx) = p.index {
186                    p.name = Some(idx.to_string());
187                }
188                Ok(Expression::Parameter(p))
189            }
190
191            // JSONExtract with variant_extract (Databricks colon syntax) -> GET_JSON_OBJECT
192            Expression::JSONExtract(je) if je.variant_extract.is_some() => {
193                // Convert path: 'item[1].price' -> '$.item[1].price'
194                let path = match *je.expression {
195                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
196                        let Literal::String(s) = lit.as_ref() else {
197                            unreachable!()
198                        };
199                        Expression::Literal(Box::new(Literal::String(format!("$.{}", s))))
200                    }
201                    other => other,
202                };
203                Ok(Expression::Function(Box::new(Function::new(
204                    "GET_JSON_OBJECT".to_string(),
205                    vec![*je.this, path],
206                ))))
207            }
208
209            // Pass through everything else
210            _ => Ok(expr),
211        }
212    }
213}
214
215impl SparkDialect {
216    /// Normalize a data type for Spark:
217    /// - VARCHAR/CHAR without length -> STRING
218    /// - VARCHAR(n)/CHAR(n) with length -> keep as-is
219    /// - TEXT -> STRING
220    fn normalize_spark_type(dt: DataType) -> DataType {
221        match dt {
222            DataType::VarChar { length: None, .. }
223            | DataType::Char { length: None }
224            | DataType::Text => DataType::Custom {
225                name: "STRING".to_string(),
226            },
227            // VARCHAR(n) and CHAR(n) with length are kept as-is
228            DataType::VarChar { .. } | DataType::Char { .. } => dt,
229            // Also normalize struct fields recursively
230            DataType::Struct { fields, nested } => {
231                let normalized_fields: Vec<StructField> = fields
232                    .into_iter()
233                    .map(|mut f| {
234                        f.data_type = Self::normalize_spark_type(f.data_type);
235                        f
236                    })
237                    .collect();
238                DataType::Struct {
239                    fields: normalized_fields,
240                    nested,
241                }
242            }
243            _ => dt,
244        }
245    }
246
247    fn transform_function(&self, f: Function) -> Result<Expression> {
248        let name_upper = f.name.to_uppercase();
249        match name_upper.as_str() {
250            // IFNULL -> COALESCE
251            "IFNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
252                original_name: None,
253                expressions: f.args,
254                inferred_type: None,
255            }))),
256
257            // NVL -> COALESCE
258            "NVL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
259                original_name: None,
260                expressions: f.args,
261                inferred_type: None,
262            }))),
263
264            // ISNULL -> COALESCE
265            "ISNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
266                original_name: None,
267                expressions: f.args,
268                inferred_type: None,
269            }))),
270
271            // GROUP_CONCAT -> CONCAT_WS + COLLECT_LIST in older Spark
272            // In Spark 4+, STRING_AGG is available
273            "GROUP_CONCAT" if !f.args.is_empty() => {
274                // For simplicity, use COLLECT_LIST (array aggregation)
275                Ok(Expression::Function(Box::new(Function::new(
276                    "COLLECT_LIST".to_string(),
277                    f.args,
278                ))))
279            }
280
281            // STRING_AGG is supported in Spark 4+
282            // For older versions, fall back to CONCAT_WS + COLLECT_LIST
283            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
284                Function::new("COLLECT_LIST".to_string(), f.args),
285            ))),
286
287            // LISTAGG -> STRING_AGG in Spark 4+ (or COLLECT_LIST for older)
288            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
289                "COLLECT_LIST".to_string(),
290                f.args,
291            )))),
292
293            // SUBSTRING is native to Spark
294            "SUBSTRING" | "SUBSTR" => Ok(Expression::Function(Box::new(f))),
295
296            // LENGTH is native to Spark
297            "LENGTH" => Ok(Expression::Function(Box::new(f))),
298
299            // LEN -> LENGTH
300            "LEN" if f.args.len() == 1 => Ok(Expression::Length(Box::new(UnaryFunc::new(
301                f.args.into_iter().next().unwrap(),
302            )))),
303
304            // RANDOM -> RAND
305            "RANDOM" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
306                seed: None,
307                lower: None,
308                upper: None,
309            }))),
310
311            // RAND is native to Spark
312            "RAND" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
313                seed: None,
314                lower: None,
315                upper: None,
316            }))),
317
318            // NOW -> CURRENT_TIMESTAMP
319            "NOW" => Ok(Expression::CurrentTimestamp(
320                crate::expressions::CurrentTimestamp {
321                    precision: None,
322                    sysdate: false,
323                },
324            )),
325
326            // GETDATE -> CURRENT_TIMESTAMP
327            "GETDATE" => Ok(Expression::CurrentTimestamp(
328                crate::expressions::CurrentTimestamp {
329                    precision: None,
330                    sysdate: false,
331                },
332            )),
333
334            // CURRENT_TIMESTAMP is native
335            "CURRENT_TIMESTAMP" => Ok(Expression::CurrentTimestamp(
336                crate::expressions::CurrentTimestamp {
337                    precision: None,
338                    sysdate: false,
339                },
340            )),
341
342            // CURRENT_DATE is native
343            "CURRENT_DATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
344
345            // TO_DATE is native to Spark; strip default format 'yyyy-MM-dd'
346            "TO_DATE" if f.args.len() == 2 => {
347                let is_default_format = matches!(&f.args[1], Expression::Literal(lit) if matches!(lit.as_ref(), crate::expressions::Literal::String(s) if s == "yyyy-MM-dd"));
348                if is_default_format {
349                    Ok(Expression::Function(Box::new(Function::new(
350                        "TO_DATE".to_string(),
351                        vec![f.args.into_iter().next().unwrap()],
352                    ))))
353                } else {
354                    Ok(Expression::Function(Box::new(f)))
355                }
356            }
357            "TO_DATE" => Ok(Expression::Function(Box::new(f))),
358
359            // TO_TIMESTAMP is native to Spark
360            "TO_TIMESTAMP" => Ok(Expression::Function(Box::new(f))),
361
362            // DATE_FORMAT is native to Spark
363            "DATE_FORMAT" => Ok(Expression::Function(Box::new(f))),
364
365            // strftime -> DATE_FORMAT
366            "STRFTIME" => Ok(Expression::Function(Box::new(Function::new(
367                "DATE_FORMAT".to_string(),
368                f.args,
369            )))),
370
371            // TO_CHAR -> DATE_FORMAT
372            "TO_CHAR" => Ok(Expression::Function(Box::new(Function::new(
373                "DATE_FORMAT".to_string(),
374                f.args,
375            )))),
376
377            // DATE_TRUNC is native to Spark
378            "DATE_TRUNC" => Ok(Expression::Function(Box::new(f))),
379
380            // TRUNC is native to Spark
381            "TRUNC" => Ok(Expression::Function(Box::new(f))),
382
383            // EXTRACT is native to Spark
384            "EXTRACT" => Ok(Expression::Function(Box::new(f))),
385
386            // DATEPART -> EXTRACT
387            "DATEPART" => Ok(Expression::Function(Box::new(Function::new(
388                "EXTRACT".to_string(),
389                f.args,
390            )))),
391
392            // UNIX_TIMESTAMP is native to Spark
393            // When called with no args, add CURRENT_TIMESTAMP() as default
394            "UNIX_TIMESTAMP" => {
395                if f.args.is_empty() {
396                    Ok(Expression::Function(Box::new(Function::new(
397                        "UNIX_TIMESTAMP".to_string(),
398                        vec![Expression::CurrentTimestamp(CurrentTimestamp {
399                            precision: None,
400                            sysdate: false,
401                        })],
402                    ))))
403                } else {
404                    Ok(Expression::Function(Box::new(f)))
405                }
406            }
407
408            // FROM_UNIXTIME is native to Spark
409            "FROM_UNIXTIME" => Ok(Expression::Function(Box::new(f))),
410
411            // STR_TO_MAP is native to Spark
412            // When called with only one arg, add default delimiters ',' and ':'
413            "STR_TO_MAP" => {
414                if f.args.len() == 1 {
415                    let mut args = f.args;
416                    args.push(Expression::Literal(Box::new(
417                        crate::expressions::Literal::String(",".to_string()),
418                    )));
419                    args.push(Expression::Literal(Box::new(
420                        crate::expressions::Literal::String(":".to_string()),
421                    )));
422                    Ok(Expression::Function(Box::new(Function::new(
423                        "STR_TO_MAP".to_string(),
424                        args,
425                    ))))
426                } else {
427                    Ok(Expression::Function(Box::new(f)))
428                }
429            }
430
431            // POSITION is native to Spark (POSITION(substr IN str))
432            "POSITION" => Ok(Expression::Function(Box::new(f))),
433
434            // LOCATE is native to Spark
435            "LOCATE" => Ok(Expression::Function(Box::new(f))),
436
437            // STRPOS -> Use expression form or LOCATE
438            "STRPOS" if f.args.len() == 2 => {
439                let mut args = f.args;
440                let first = args.remove(0);
441                let second = args.remove(0);
442                // LOCATE(substr, str) in Spark
443                Ok(Expression::Function(Box::new(Function::new(
444                    "LOCATE".to_string(),
445                    vec![second, first],
446                ))))
447            }
448
449            // CHARINDEX -> LOCATE
450            "CHARINDEX" if f.args.len() >= 2 => {
451                let mut args = f.args;
452                let substring = args.remove(0);
453                let string = args.remove(0);
454                let mut locate_args = vec![substring, string];
455                if !args.is_empty() {
456                    locate_args.push(args.remove(0));
457                }
458                Ok(Expression::Function(Box::new(Function::new(
459                    "LOCATE".to_string(),
460                    locate_args,
461                ))))
462            }
463
464            // INSTR is native to Spark
465            "INSTR" => Ok(Expression::Function(Box::new(f))),
466
467            // CEILING -> CEIL
468            "CEILING" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
469                this: f.args.into_iter().next().unwrap(),
470                decimals: None,
471                to: None,
472            }))),
473
474            // CEIL is native to Spark
475            "CEIL" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
476                this: f.args.into_iter().next().unwrap(),
477                decimals: None,
478                to: None,
479            }))),
480
481            // UNNEST -> EXPLODE
482            "UNNEST" => Ok(Expression::Function(Box::new(Function::new(
483                "EXPLODE".to_string(),
484                f.args,
485            )))),
486
487            // FLATTEN -> FLATTEN is native to Spark (for nested arrays)
488            "FLATTEN" => Ok(Expression::Function(Box::new(f))),
489
490            // ARRAY_AGG -> COLLECT_LIST
491            "ARRAY_AGG" => Ok(Expression::Function(Box::new(Function::new(
492                "COLLECT_LIST".to_string(),
493                f.args,
494            )))),
495
496            // COLLECT_LIST is native to Spark
497            "COLLECT_LIST" => Ok(Expression::Function(Box::new(f))),
498
499            // COLLECT_SET is native to Spark
500            "COLLECT_SET" => Ok(Expression::Function(Box::new(f))),
501
502            // ARRAY_LENGTH -> SIZE in Spark
503            "ARRAY_LENGTH" | "CARDINALITY" => Ok(Expression::Function(Box::new(Function::new(
504                "SIZE".to_string(),
505                f.args,
506            )))),
507
508            // SIZE is native to Spark
509            "SIZE" => Ok(Expression::Function(Box::new(f))),
510
511            // SPLIT is native to Spark
512            "SPLIT" => Ok(Expression::Function(Box::new(f))),
513
514            // REGEXP_REPLACE: Spark supports up to 4 args (subject, pattern, replacement, position)
515            // Strip extra Snowflake args (occurrence, params) if present
516            "REGEXP_REPLACE" if f.args.len() > 4 => {
517                let mut args = f.args;
518                args.truncate(4);
519                Ok(Expression::Function(Box::new(Function::new(
520                    "REGEXP_REPLACE".to_string(),
521                    args,
522                ))))
523            }
524            "REGEXP_REPLACE" => Ok(Expression::Function(Box::new(f))),
525
526            // REGEXP_EXTRACT is native to Spark
527            "REGEXP_EXTRACT" => Ok(Expression::Function(Box::new(f))),
528
529            // REGEXP_EXTRACT_ALL is native to Spark
530            "REGEXP_EXTRACT_ALL" => Ok(Expression::Function(Box::new(f))),
531
532            // RLIKE is native to Spark
533            "RLIKE" | "REGEXP_LIKE" => Ok(Expression::Function(Box::new(Function::new(
534                "RLIKE".to_string(),
535                f.args,
536            )))),
537
538            // JSON_EXTRACT -> GET_JSON_OBJECT (Hive style) or :: operator
539            "JSON_EXTRACT" => Ok(Expression::Function(Box::new(Function::new(
540                "GET_JSON_OBJECT".to_string(),
541                f.args,
542            )))),
543
544            // JSON_EXTRACT_SCALAR -> GET_JSON_OBJECT
545            "JSON_EXTRACT_SCALAR" => Ok(Expression::Function(Box::new(Function::new(
546                "GET_JSON_OBJECT".to_string(),
547                f.args,
548            )))),
549
550            // GET_JSON_OBJECT is native to Spark
551            "GET_JSON_OBJECT" => Ok(Expression::Function(Box::new(f))),
552
553            // FROM_JSON is native to Spark
554            "FROM_JSON" => Ok(Expression::Function(Box::new(f))),
555
556            // TO_JSON is native to Spark
557            "TO_JSON" => Ok(Expression::Function(Box::new(f))),
558
559            // PARSE_JSON -> strip for Spark (just keep the string argument)
560            "PARSE_JSON" if f.args.len() == 1 => Ok(f.args.into_iter().next().unwrap()),
561            "PARSE_JSON" => Ok(Expression::Function(Box::new(Function::new(
562                "FROM_JSON".to_string(),
563                f.args,
564            )))),
565
566            // DATEDIFF is native to Spark (supports unit in Spark 3+)
567            "DATEDIFF" | "DATE_DIFF" => Ok(Expression::Function(Box::new(Function::new(
568                "DATEDIFF".to_string(),
569                f.args,
570            )))),
571
572            // DATE_ADD is native to Spark
573            "DATE_ADD" | "DATEADD" => Ok(Expression::Function(Box::new(Function::new(
574                "DATE_ADD".to_string(),
575                f.args,
576            )))),
577
578            // DATE_SUB is native to Spark
579            "DATE_SUB" => Ok(Expression::Function(Box::new(f))),
580
581            // TIMESTAMPADD is native to Spark 3+
582            "TIMESTAMPADD" => Ok(Expression::Function(Box::new(f))),
583
584            // TIMESTAMPDIFF is native to Spark 3+
585            "TIMESTAMPDIFF" => Ok(Expression::Function(Box::new(f))),
586
587            // ADD_MONTHS is native to Spark
588            "ADD_MONTHS" => Ok(Expression::Function(Box::new(f))),
589
590            // MONTHS_BETWEEN is native to Spark
591            "MONTHS_BETWEEN" => Ok(Expression::Function(Box::new(f))),
592
593            // NVL is native to Spark
594            "NVL" => Ok(Expression::Function(Box::new(f))),
595
596            // NVL2 is native to Spark
597            "NVL2" => Ok(Expression::Function(Box::new(f))),
598
599            // MAP is native to Spark
600            "MAP" => Ok(Expression::Function(Box::new(f))),
601
602            // ARRAY is native to Spark
603            "ARRAY" => Ok(Expression::Function(Box::new(f))),
604
605            // ROW -> STRUCT for Spark (cross-dialect, no auto-naming)
606            "ROW" => Ok(Expression::Function(Box::new(Function::new(
607                "STRUCT".to_string(),
608                f.args,
609            )))),
610
611            // STRUCT is native to Spark - auto-name unnamed args as col1, col2, etc.
612            "STRUCT" => {
613                let mut col_idx = 1usize;
614                let named_args: Vec<Expression> = f
615                    .args
616                    .into_iter()
617                    .map(|arg| {
618                        let current_idx = col_idx;
619                        col_idx += 1;
620                        // Check if arg already has an alias (AS name) or is Star
621                        match &arg {
622                            Expression::Alias(_) => arg, // already named
623                            Expression::Star(_) => arg,  // STRUCT(*) - keep as-is
624                            Expression::Column(c) if c.table.is_none() => {
625                                // Column reference: use column name as the struct field name
626                                let name = c.name.name.clone();
627                                Expression::Alias(Box::new(crate::expressions::Alias {
628                                    this: arg,
629                                    alias: crate::expressions::Identifier::new(&name),
630                                    column_aliases: Vec::new(),
631                                    pre_alias_comments: Vec::new(),
632                                    trailing_comments: Vec::new(),
633                                    inferred_type: None,
634                                }))
635                            }
636                            _ => {
637                                // Unnamed literal/expression: auto-name as colN
638                                let name = format!("col{}", current_idx);
639                                Expression::Alias(Box::new(crate::expressions::Alias {
640                                    this: arg,
641                                    alias: crate::expressions::Identifier::new(&name),
642                                    column_aliases: Vec::new(),
643                                    pre_alias_comments: Vec::new(),
644                                    trailing_comments: Vec::new(),
645                                    inferred_type: None,
646                                }))
647                            }
648                        }
649                    })
650                    .collect();
651                Ok(Expression::Function(Box::new(Function {
652                    name: "STRUCT".to_string(),
653                    args: named_args,
654                    distinct: false,
655                    trailing_comments: Vec::new(),
656                    use_bracket_syntax: false,
657                    no_parens: false,
658                    quoted: false,
659                    span: None,
660                    inferred_type: None,
661                })))
662            }
663
664            // NAMED_STRUCT('a', 1) -> STRUCT(1 AS a) for SQLGlot Spark outputs
665            "NAMED_STRUCT" if f.args.len() % 2 == 0 => {
666                let original_args = f.args.clone();
667                let mut struct_args = Vec::new();
668                for pair in f.args.chunks(2) {
669                    if let Expression::Literal(lit) = &pair[0] {
670                        if let Literal::String(field_name) = lit.as_ref() {
671                            struct_args.push(Expression::Alias(Box::new(
672                                crate::expressions::Alias {
673                                    this: pair[1].clone(),
674                                    alias: crate::expressions::Identifier::new(field_name),
675                                    column_aliases: Vec::new(),
676                                    pre_alias_comments: Vec::new(),
677                                    trailing_comments: Vec::new(),
678                                    inferred_type: None,
679                                },
680                            )));
681                            continue;
682                        }
683                    }
684                    return Ok(Expression::Function(Box::new(Function::new(
685                        "NAMED_STRUCT".to_string(),
686                        original_args,
687                    ))));
688                }
689                Ok(Expression::Function(Box::new(Function::new(
690                    "STRUCT".to_string(),
691                    struct_args,
692                ))))
693            }
694
695            // NAMED_STRUCT is native to Spark
696            "NAMED_STRUCT" => Ok(Expression::Function(Box::new(f))),
697
698            // MAP_FROM_ARRAYS is native to Spark
699            "MAP_FROM_ARRAYS" => Ok(Expression::Function(Box::new(f))),
700
701            // ARRAY_SORT is native to Spark
702            "ARRAY_SORT" => Ok(Expression::Function(Box::new(f))),
703
704            // ARRAY_DISTINCT is native to Spark
705            "ARRAY_DISTINCT" => Ok(Expression::Function(Box::new(f))),
706
707            // ARRAY_UNION is native to Spark
708            "ARRAY_UNION" => Ok(Expression::Function(Box::new(f))),
709
710            // ARRAY_INTERSECT is native to Spark
711            "ARRAY_INTERSECT" => Ok(Expression::Function(Box::new(f))),
712
713            // ARRAY_EXCEPT is native to Spark
714            "ARRAY_EXCEPT" => Ok(Expression::Function(Box::new(f))),
715
716            // ARRAY_CONTAINS is native to Spark
717            "ARRAY_CONTAINS" => Ok(Expression::Function(Box::new(f))),
718
719            // ELEMENT_AT is native to Spark
720            "ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
721
722            // TRY_ELEMENT_AT is native to Spark 3+
723            "TRY_ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
724
725            // TRANSFORM is native to Spark (array transformation)
726            "TRANSFORM" => Ok(Expression::Function(Box::new(f))),
727
728            // FILTER is native to Spark (array filtering)
729            "FILTER" => Ok(Expression::Function(Box::new(f))),
730
731            // AGGREGATE is native to Spark (array reduction)
732            "AGGREGATE" => Ok(Expression::Function(Box::new(f))),
733
734            // SEQUENCE is native to Spark (generate array)
735            "SEQUENCE" => Ok(Expression::Function(Box::new(f))),
736
737            // GENERATE_SERIES -> SEQUENCE
738            "GENERATE_SERIES" => Ok(Expression::Function(Box::new(Function::new(
739                "SEQUENCE".to_string(),
740                f.args,
741            )))),
742
743            // STARTSWITH is native to Spark 3+
744            "STARTSWITH" | "STARTS_WITH" => Ok(Expression::Function(Box::new(Function::new(
745                "STARTSWITH".to_string(),
746                f.args,
747            )))),
748
749            // ENDSWITH is native to Spark 3+
750            "ENDSWITH" | "ENDS_WITH" => Ok(Expression::Function(Box::new(Function::new(
751                "ENDSWITH".to_string(),
752                f.args,
753            )))),
754
755            // ARRAY_CONSTRUCT_COMPACT(1, null, 2) -> ARRAY_COMPACT(ARRAY(1, NULL, 2))
756            "ARRAY_CONSTRUCT_COMPACT" => {
757                let inner =
758                    Expression::Function(Box::new(Function::new("ARRAY".to_string(), f.args)));
759                Ok(Expression::Function(Box::new(Function::new(
760                    "ARRAY_COMPACT".to_string(),
761                    vec![inner],
762                ))))
763            }
764
765            // ARRAY_TO_STRING -> ARRAY_JOIN
766            "ARRAY_TO_STRING" => Ok(Expression::Function(Box::new(Function::new(
767                "ARRAY_JOIN".to_string(),
768                f.args,
769            )))),
770
771            // TO_ARRAY(x) -> IF(x IS NULL, NULL, ARRAY(x))
772            "TO_ARRAY" if f.args.len() == 1 => {
773                let x = f.args[0].clone();
774                // Check if arg is already an array constructor (bracket notation)
775                // In that case: TO_ARRAY(['test']) -> ARRAY('test')
776                match &x {
777                    Expression::ArrayFunc(arr) => {
778                        // Just convert to ARRAY(...) function
779                        Ok(Expression::Function(Box::new(Function::new(
780                            "ARRAY".to_string(),
781                            arr.expressions.clone(),
782                        ))))
783                    }
784                    _ => Ok(Expression::IfFunc(Box::new(crate::expressions::IfFunc {
785                        condition: Expression::IsNull(Box::new(crate::expressions::IsNull {
786                            this: x.clone(),
787                            not: false,
788                            postfix_form: false,
789                        })),
790                        true_value: Expression::Null(crate::expressions::Null),
791                        false_value: Some(Expression::Function(Box::new(Function::new(
792                            "ARRAY".to_string(),
793                            vec![x],
794                        )))),
795                        original_name: Some("IF".to_string()),
796                        inferred_type: None,
797                    }))),
798                }
799            }
800
801            // REGEXP_SUBSTR -> REGEXP_EXTRACT (strip extra args)
802            "REGEXP_SUBSTR" if f.args.len() >= 2 => {
803                let subject = f.args[0].clone();
804                let pattern = f.args[1].clone();
805                // For Spark: REGEXP_EXTRACT(subject, pattern, group)
806                // group defaults to 0 for full match, but sqlglot uses last arg if present
807                let group = if f.args.len() >= 6 {
808                    let g = &f.args[5];
809                    // If group is literal 1 (default), omit it
810                    if matches!(g, Expression::Literal(lit) if matches!(lit.as_ref(), Literal::Number(n) if n == "1"))
811                    {
812                        None
813                    } else {
814                        Some(g.clone())
815                    }
816                } else {
817                    None
818                };
819                let mut args = vec![subject, pattern];
820                if let Some(g) = group {
821                    args.push(g);
822                }
823                Ok(Expression::Function(Box::new(Function::new(
824                    "REGEXP_EXTRACT".to_string(),
825                    args,
826                ))))
827            }
828
829            // UUID_STRING() -> UUID(); keep namespace/name args for target-specific generation.
830            "UUID_STRING" => {
831                if f.args.is_empty() {
832                    Ok(Expression::Function(Box::new(Function::new(
833                        "UUID".to_string(),
834                        vec![],
835                    ))))
836                } else {
837                    Ok(Expression::Function(Box::new(Function::new(
838                        "UUID_STRING".to_string(),
839                        f.args,
840                    ))))
841                }
842            }
843
844            // OBJECT_CONSTRUCT -> STRUCT in Spark
845            "OBJECT_CONSTRUCT" if f.args.len() >= 2 && f.args.len() % 2 == 0 => {
846                // Convert key-value pairs to named struct fields
847                // OBJECT_CONSTRUCT('Manitoba', 'Winnipeg', 'foo', 'bar')
848                // -> STRUCT('Winnipeg' AS Manitoba, 'bar' AS foo)
849                let mut struct_args = Vec::new();
850                for pair in f.args.chunks(2) {
851                    if let Expression::Literal(lit) = &pair[0] {
852                        if let Literal::String(key) = lit.as_ref() {
853                            struct_args.push(Expression::Alias(Box::new(
854                                crate::expressions::Alias {
855                                    this: pair[1].clone(),
856                                    alias: crate::expressions::Identifier::new(key.clone()),
857                                    column_aliases: vec![],
858                                    pre_alias_comments: vec![],
859                                    trailing_comments: vec![],
860                                    inferred_type: None,
861                                },
862                            )));
863                        }
864                    } else {
865                        struct_args.push(pair[1].clone());
866                    }
867                }
868                Ok(Expression::Function(Box::new(Function::new(
869                    "STRUCT".to_string(),
870                    struct_args,
871                ))))
872            }
873
874            // DATE_PART(part, expr) -> EXTRACT(part FROM expr)
875            "DATE_PART" if f.args.len() == 2 => {
876                let mut args = f.args;
877                let part = args.remove(0);
878                let expr = args.remove(0);
879                if let Some(field) = expr_to_datetime_field(&part) {
880                    Ok(Expression::Extract(Box::new(ExtractFunc {
881                        this: expr,
882                        field,
883                    })))
884                } else {
885                    // Can't parse the field, keep as function
886                    Ok(Expression::Function(Box::new(Function::new(
887                        "DATE_PART".to_string(),
888                        vec![part, expr],
889                    ))))
890                }
891            }
892
893            // GET_PATH(obj, path) -> GET_JSON_OBJECT(obj, json_path) in Spark
894            "GET_PATH" if f.args.len() == 2 => {
895                let mut args = f.args;
896                let this = args.remove(0);
897                let path = args.remove(0);
898                let json_path = match &path {
899                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
900                        let Literal::String(s) = lit.as_ref() else {
901                            unreachable!()
902                        };
903                        let normalized = if s.starts_with('$') {
904                            s.clone()
905                        } else if s.starts_with('[') {
906                            format!("${}", s)
907                        } else {
908                            format!("$.{}", s)
909                        };
910                        Expression::Literal(Box::new(Literal::String(normalized)))
911                    }
912                    _ => path,
913                };
914                Ok(Expression::Function(Box::new(Function::new(
915                    "GET_JSON_OBJECT".to_string(),
916                    vec![this, json_path],
917                ))))
918            }
919
920            // BITWISE_LEFT_SHIFT → SHIFTLEFT
921            "BITWISE_LEFT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
922                "SHIFTLEFT".to_string(),
923                f.args,
924            )))),
925
926            // BITWISE_RIGHT_SHIFT → SHIFTRIGHT
927            "BITWISE_RIGHT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
928                "SHIFTRIGHT".to_string(),
929                f.args,
930            )))),
931
932            // APPROX_DISTINCT → APPROX_COUNT_DISTINCT
933            "APPROX_DISTINCT" => Ok(Expression::Function(Box::new(Function::new(
934                "APPROX_COUNT_DISTINCT".to_string(),
935                f.args,
936            )))),
937
938            // ARRAY_SLICE → SLICE
939            "ARRAY_SLICE" => Ok(Expression::Function(Box::new(Function::new(
940                "SLICE".to_string(),
941                f.args,
942            )))),
943
944            // DATE_FROM_PARTS → MAKE_DATE
945            "DATE_FROM_PARTS" => Ok(Expression::Function(Box::new(Function::new(
946                "MAKE_DATE".to_string(),
947                f.args,
948            )))),
949
950            // DAYOFWEEK_ISO → DAYOFWEEK
951            "DAYOFWEEK_ISO" => Ok(Expression::Function(Box::new(Function::new(
952                "DAYOFWEEK".to_string(),
953                f.args,
954            )))),
955
956            // FORMAT → FORMAT_STRING
957            "FORMAT" => Ok(Expression::Function(Box::new(Function::new(
958                "FORMAT_STRING".to_string(),
959                f.args,
960            )))),
961
962            // LOGICAL_AND → BOOL_AND
963            "LOGICAL_AND" => Ok(Expression::Function(Box::new(Function::new(
964                "BOOL_AND".to_string(),
965                f.args,
966            )))),
967
968            // VARIANCE_POP → VAR_POP
969            "VARIANCE_POP" => Ok(Expression::Function(Box::new(Function::new(
970                "VAR_POP".to_string(),
971                f.args,
972            )))),
973
974            // WEEK_OF_YEAR → WEEKOFYEAR
975            "WEEK_OF_YEAR" => Ok(Expression::Function(Box::new(Function::new(
976                "WEEKOFYEAR".to_string(),
977                f.args,
978            )))),
979
980            // BIT_GET -> GETBIT
981            "BIT_GET" => Ok(Expression::Function(Box::new(Function::new(
982                "GETBIT".to_string(),
983                f.args,
984            )))),
985
986            // CURDATE -> CURRENT_DATE
987            "CURDATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
988
989            // Pass through everything else
990            _ => Ok(Expression::Function(Box::new(f))),
991        }
992    }
993
994    fn transform_aggregate_function(
995        &self,
996        f: Box<crate::expressions::AggregateFunction>,
997    ) -> Result<Expression> {
998        let name_upper = f.name.to_uppercase();
999        match name_upper.as_str() {
1000            // GROUP_CONCAT -> COLLECT_LIST (then CONCAT_WS for string)
1001            "GROUP_CONCAT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
1002                Function::new("COLLECT_LIST".to_string(), f.args),
1003            ))),
1004
1005            // STRING_AGG -> COLLECT_LIST (or STRING_AGG in Spark 4+)
1006            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
1007                Function::new("COLLECT_LIST".to_string(), f.args),
1008            ))),
1009
1010            // LISTAGG -> COLLECT_LIST
1011            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
1012                "COLLECT_LIST".to_string(),
1013                f.args,
1014            )))),
1015
1016            // ARRAY_AGG -> COLLECT_LIST (preserve distinct and filter)
1017            "ARRAY_AGG" if !f.args.is_empty() => {
1018                let mut af = f;
1019                af.name = "COLLECT_LIST".to_string();
1020                Ok(Expression::AggregateFunction(af))
1021            }
1022
1023            // LOGICAL_OR -> BOOL_OR in Spark
1024            "LOGICAL_OR" if !f.args.is_empty() => {
1025                let mut af = f;
1026                af.name = "BOOL_OR".to_string();
1027                Ok(Expression::AggregateFunction(af))
1028            }
1029
1030            // Pass through everything else
1031            _ => Ok(Expression::AggregateFunction(f)),
1032        }
1033    }
1034}
1035
1036/// Convert an expression (string literal or identifier) to a DateTimeField
1037fn expr_to_datetime_field(expr: &Expression) -> Option<DateTimeField> {
1038    let name = match expr {
1039        Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
1040            let Literal::String(s) = lit.as_ref() else {
1041                unreachable!()
1042            };
1043            s.to_uppercase()
1044        }
1045        Expression::Identifier(id) => id.name.to_uppercase(),
1046        Expression::Var(v) => v.this.to_uppercase(),
1047        Expression::Column(col) if col.table.is_none() => col.name.name.to_uppercase(),
1048        _ => return None,
1049    };
1050    match name.as_str() {
1051        "YEAR" | "Y" | "YY" | "YYY" | "YYYY" | "YR" | "YEARS" | "YRS" => Some(DateTimeField::Year),
1052        "MONTH" | "MM" | "MON" | "MONS" | "MONTHS" => Some(DateTimeField::Month),
1053        "DAY" | "D" | "DD" | "DAYS" | "DAYOFMONTH" => Some(DateTimeField::Day),
1054        "HOUR" | "H" | "HH" | "HR" | "HOURS" | "HRS" => Some(DateTimeField::Hour),
1055        "MINUTE" | "MI" | "MIN" | "MINUTES" | "MINS" => Some(DateTimeField::Minute),
1056        "SECOND" | "S" | "SEC" | "SECONDS" | "SECS" => Some(DateTimeField::Second),
1057        "MILLISECOND" | "MS" | "MSEC" | "MILLISECONDS" => Some(DateTimeField::Millisecond),
1058        "MICROSECOND" | "US" | "USEC" | "MICROSECONDS" => Some(DateTimeField::Microsecond),
1059        "DOW" | "DAYOFWEEK" | "DAYOFWEEK_ISO" | "DW" => Some(DateTimeField::DayOfWeek),
1060        "DOY" | "DAYOFYEAR" => Some(DateTimeField::DayOfYear),
1061        "WEEK" | "W" | "WK" | "WEEKOFYEAR" | "WOY" => Some(DateTimeField::Week),
1062        "QUARTER" | "Q" | "QTR" | "QTRS" | "QUARTERS" => Some(DateTimeField::Quarter),
1063        "EPOCH" | "EPOCH_SECOND" | "EPOCH_SECONDS" => Some(DateTimeField::Epoch),
1064        "TIMEZONE" | "TIMEZONE_HOUR" | "TZH" => Some(DateTimeField::TimezoneHour),
1065        "TIMEZONE_MINUTE" | "TZM" => Some(DateTimeField::TimezoneMinute),
1066        _ => Some(DateTimeField::Custom(name)),
1067    }
1068}