// polyglot_sql/dialects/spark.rs

//! Spark SQL Dialect
//!
//! Spark SQL-specific transformations based on sqlglot patterns.
//! Key features (extends Hive with modern SQL):
//! - TRY_CAST is supported (Spark 3+)
//! - ILIKE is supported (Spark 3+)
//! - Uses backticks for identifiers
//! - ARRAY_AGG, COLLECT_LIST for array aggregation
//! - STRING_AGG / LISTAGG supported (Spark 4+)
//! - DATE_ADD with unit parameter (Spark 3+)
//! - TIMESTAMPADD, TIMESTAMPDIFF (Spark 3+)
//! - More PostgreSQL-like syntax than Hive
13
14use super::{DialectImpl, DialectType};
15use crate::error::Result;
16use crate::expressions::{
17    CeilFunc, CurrentTimestamp, DataType, DateTimeField, Expression, ExtractFunc, Function,
18    Literal, StructField, UnaryFunc, VarArgFunc,
19};
20use crate::generator::GeneratorConfig;
21use crate::tokens::TokenizerConfig;
22
/// Spark SQL dialect.
///
/// Zero-sized marker type: it carries no state, and all Spark-specific
/// behavior lives in its trait and inherent `impl` blocks. The derives make
/// the marker printable, freely copyable, and constructible via `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct SparkDialect;
25
26impl DialectImpl for SparkDialect {
27    fn dialect_type(&self) -> DialectType {
28        DialectType::Spark
29    }
30
31    fn tokenizer_config(&self) -> TokenizerConfig {
32        let mut config = TokenizerConfig::default();
33        // Spark uses backticks for identifiers (NOT double quotes)
34        config.identifiers.clear();
35        config.identifiers.insert('`', '`');
36        // Spark (like Hive) uses double quotes as string delimiters (QUOTES = ["'", '"'])
37        config.quotes.insert("\"".to_string(), "\"".to_string());
38        // Spark (like Hive) uses backslash escapes in strings (STRING_ESCAPES = ["\\"])
39        config.string_escapes.push('\\');
40        // Spark supports DIV keyword for integer division (inherited from Hive)
41        config
42            .keywords
43            .insert("DIV".to_string(), crate::tokens::TokenType::Div);
44        config
45            .keywords
46            .insert("REPAIR".to_string(), crate::tokens::TokenType::Command);
47        config
48            .keywords
49            .insert("MSCK".to_string(), crate::tokens::TokenType::Command);
50        // Spark numeric literal suffixes (same as Hive): 1L -> BIGINT, 1S -> SMALLINT, etc.
51        config
52            .numeric_literals
53            .insert("L".to_string(), "BIGINT".to_string());
54        config
55            .numeric_literals
56            .insert("S".to_string(), "SMALLINT".to_string());
57        config
58            .numeric_literals
59            .insert("Y".to_string(), "TINYINT".to_string());
60        config
61            .numeric_literals
62            .insert("D".to_string(), "DOUBLE".to_string());
63        config
64            .numeric_literals
65            .insert("F".to_string(), "FLOAT".to_string());
66        config
67            .numeric_literals
68            .insert("BD".to_string(), "DECIMAL".to_string());
69        // Spark allows identifiers to start with digits (e.g., 1a, 1_a)
70        config.identifiers_can_start_with_digit = true;
71        // Spark: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False
72        // Backslashes in raw strings are always literal (no escape processing)
73        config.string_escapes_allowed_in_raw_strings = false;
74        config
75    }
76
77    fn generator_config(&self) -> GeneratorConfig {
78        use crate::generator::IdentifierQuoteStyle;
79        GeneratorConfig {
80            identifier_quote: '`',
81            identifier_quote_style: IdentifierQuoteStyle::BACKTICK,
82            dialect: Some(DialectType::Spark),
83            // Spark uses colon separator in STRUCT field definitions: STRUCT<field_name: TYPE>
84            struct_field_sep: ": ",
85            // Spark doesn't use AS before RETURN in function definitions
86            create_function_return_as: false,
87            // Spark places alias after the TABLESAMPLE clause
88            alias_post_tablesample: true,
89            tablesample_seed_keyword: "REPEATABLE",
90            join_hints: false,
91            identifiers_can_start_with_digit: true,
92            // Spark uses COMMENT 'value' without = sign
93            schema_comment_with_eq: false,
94            ..Default::default()
95        }
96    }
97
98    fn transform_expr(&self, expr: Expression) -> Result<Expression> {
99        match expr {
100            // IFNULL -> COALESCE in Spark
101            Expression::IfNull(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
102                original_name: None,
103                expressions: vec![f.this, f.expression],
104                inferred_type: None,
105            }))),
106
107            // NVL is supported in Spark (from Hive), but COALESCE is standard
108            Expression::Nvl(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
109                original_name: None,
110                expressions: vec![f.this, f.expression],
111                inferred_type: None,
112            }))),
113
114            // Cast: normalize VARCHAR(n) -> STRING, CHAR(n) -> STRING for Spark
115            Expression::Cast(mut c) => {
116                c.to = Self::normalize_spark_type(c.to);
117                Ok(Expression::Cast(c))
118            }
119
120            // TryCast stays as TryCast in Spark (Spark supports TRY_CAST natively)
121            Expression::TryCast(mut c) => {
122                c.to = Self::normalize_spark_type(c.to);
123                Ok(Expression::TryCast(c))
124            }
125
126            // SafeCast -> TRY_CAST
127            Expression::SafeCast(mut c) => {
128                c.to = Self::normalize_spark_type(c.to);
129                Ok(Expression::TryCast(c))
130            }
131
132            // TRIM: non-standard comma syntax -> standard FROM syntax
133            // TRIM('SL', 'SSparkSQLS') -> TRIM('SL' FROM 'SSparkSQLS')
134            Expression::Trim(mut t) => {
135                if !t.sql_standard_syntax && t.characters.is_some() {
136                    // Convert comma syntax to standard SQL syntax
137                    // Fields already have correct semantics: this=string, characters=chars
138                    t.sql_standard_syntax = true;
139                }
140                Ok(Expression::Trim(t))
141            }
142
143            // ILIKE is supported in Spark 3+
144            Expression::ILike(op) => Ok(Expression::ILike(op)),
145
146            // UNNEST -> EXPLODE in Spark (Hive compatibility)
147            Expression::Unnest(f) => Ok(Expression::Explode(Box::new(UnaryFunc::new(f.this)))),
148
149            // EXPLODE is native to Spark
150            Expression::Explode(f) => Ok(Expression::Explode(f)),
151
152            // ExplodeOuter is supported in Spark
153            Expression::ExplodeOuter(f) => Ok(Expression::ExplodeOuter(f)),
154
155            // RANDOM -> RAND in Spark
156            Expression::Random(_) => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
157                seed: None,
158                lower: None,
159                upper: None,
160            }))),
161
162            // Rand is native to Spark
163            Expression::Rand(r) => Ok(Expression::Rand(r)),
164
165            // || (Concat) -> CONCAT in Spark
166            Expression::Concat(op) => Ok(Expression::Function(Box::new(Function::new(
167                "CONCAT".to_string(),
168                vec![op.left, op.right],
169            )))),
170
171            // ParseJson: handled by generator (emits just the string literal for Spark)
172
173            // Generic function transformations
174            Expression::Function(f) => self.transform_function(*f),
175
176            // Generic aggregate function transformations
177            Expression::AggregateFunction(f) => self.transform_aggregate_function(f),
178
179            // $N parameters -> ${N} in Spark (DollarBrace style)
180            Expression::Parameter(mut p)
181                if p.style == crate::expressions::ParameterStyle::Dollar =>
182            {
183                p.style = crate::expressions::ParameterStyle::DollarBrace;
184                // Convert index to name for DollarBrace format
185                if let Some(idx) = p.index {
186                    p.name = Some(idx.to_string());
187                }
188                Ok(Expression::Parameter(p))
189            }
190
191            // JSONExtract with variant_extract (Databricks colon syntax) -> GET_JSON_OBJECT
192            Expression::JSONExtract(je) if je.variant_extract.is_some() => {
193                // Convert path: 'item[1].price' -> '$.item[1].price'
194                let path = match *je.expression {
195                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
196                        let Literal::String(s) = lit.as_ref() else {
197                            unreachable!()
198                        };
199                        Expression::Literal(Box::new(Literal::String(format!("$.{}", s))))
200                    }
201                    other => other,
202                };
203                Ok(Expression::Function(Box::new(Function::new(
204                    "GET_JSON_OBJECT".to_string(),
205                    vec![*je.this, path],
206                ))))
207            }
208
209            // Pass through everything else
210            _ => Ok(expr),
211        }
212    }
213}
214
215impl SparkDialect {
216    /// Normalize a data type for Spark:
217    /// - VARCHAR/CHAR without length -> STRING
218    /// - VARCHAR(n)/CHAR(n) with length -> keep as-is
219    /// - TEXT -> STRING
220    fn normalize_spark_type(dt: DataType) -> DataType {
221        match dt {
222            DataType::VarChar { length: None, .. }
223            | DataType::Char { length: None }
224            | DataType::Text => DataType::Custom {
225                name: "STRING".to_string(),
226            },
227            // VARCHAR(n) and CHAR(n) with length are kept as-is
228            DataType::VarChar { .. } | DataType::Char { .. } => dt,
229            // Also normalize struct fields recursively
230            DataType::Struct { fields, nested } => {
231                let normalized_fields: Vec<StructField> = fields
232                    .into_iter()
233                    .map(|mut f| {
234                        f.data_type = Self::normalize_spark_type(f.data_type);
235                        f
236                    })
237                    .collect();
238                DataType::Struct {
239                    fields: normalized_fields,
240                    nested,
241                }
242            }
243            _ => dt,
244        }
245    }
246
247    fn transform_function(&self, f: Function) -> Result<Expression> {
248        let name_upper = f.name.to_uppercase();
249        match name_upper.as_str() {
250            // IFNULL -> COALESCE
251            "IFNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
252                original_name: None,
253                expressions: f.args,
254                inferred_type: None,
255            }))),
256
257            // NVL -> COALESCE
258            "NVL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
259                original_name: None,
260                expressions: f.args,
261                inferred_type: None,
262            }))),
263
264            // ISNULL -> COALESCE
265            "ISNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
266                original_name: None,
267                expressions: f.args,
268                inferred_type: None,
269            }))),
270
271            // GROUP_CONCAT -> CONCAT_WS + COLLECT_LIST in older Spark
272            // In Spark 4+, STRING_AGG is available
273            "GROUP_CONCAT" if !f.args.is_empty() => {
274                // For simplicity, use COLLECT_LIST (array aggregation)
275                Ok(Expression::Function(Box::new(Function::new(
276                    "COLLECT_LIST".to_string(),
277                    f.args,
278                ))))
279            }
280
281            // STRING_AGG is supported in Spark 4+
282            // For older versions, fall back to CONCAT_WS + COLLECT_LIST
283            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
284                Function::new("COLLECT_LIST".to_string(), f.args),
285            ))),
286
287            // LISTAGG -> STRING_AGG in Spark 4+ (or COLLECT_LIST for older)
288            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
289                "COLLECT_LIST".to_string(),
290                f.args,
291            )))),
292
293            // SUBSTRING is native to Spark
294            "SUBSTRING" | "SUBSTR" => Ok(Expression::Function(Box::new(f))),
295
296            // LENGTH is native to Spark
297            "LENGTH" => Ok(Expression::Function(Box::new(f))),
298
299            // LEN -> LENGTH
300            "LEN" if f.args.len() == 1 => Ok(Expression::Length(Box::new(UnaryFunc::new(
301                f.args.into_iter().next().unwrap(),
302            )))),
303
304            // RANDOM -> RAND
305            "RANDOM" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
306                seed: None,
307                lower: None,
308                upper: None,
309            }))),
310
311            // RAND is native to Spark
312            "RAND" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
313                seed: None,
314                lower: None,
315                upper: None,
316            }))),
317
318            // NOW -> CURRENT_TIMESTAMP
319            "NOW" => Ok(Expression::CurrentTimestamp(
320                crate::expressions::CurrentTimestamp {
321                    precision: None,
322                    sysdate: false,
323                },
324            )),
325
326            // GETDATE -> CURRENT_TIMESTAMP
327            "GETDATE" => Ok(Expression::CurrentTimestamp(
328                crate::expressions::CurrentTimestamp {
329                    precision: None,
330                    sysdate: false,
331                },
332            )),
333
334            // CURRENT_TIMESTAMP is native
335            "CURRENT_TIMESTAMP" => Ok(Expression::CurrentTimestamp(
336                crate::expressions::CurrentTimestamp {
337                    precision: None,
338                    sysdate: false,
339                },
340            )),
341
342            // CURRENT_DATE is native
343            "CURRENT_DATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
344
345            // TO_DATE is native to Spark; strip default format 'yyyy-MM-dd'
346            "TO_DATE" if f.args.len() == 2 => {
347                let is_default_format = matches!(&f.args[1], Expression::Literal(lit) if matches!(lit.as_ref(), crate::expressions::Literal::String(s) if s == "yyyy-MM-dd"));
348                if is_default_format {
349                    Ok(Expression::Function(Box::new(Function::new(
350                        "TO_DATE".to_string(),
351                        vec![f.args.into_iter().next().unwrap()],
352                    ))))
353                } else {
354                    Ok(Expression::Function(Box::new(f)))
355                }
356            }
357            "TO_DATE" => Ok(Expression::Function(Box::new(f))),
358
359            // TO_TIMESTAMP is native to Spark
360            "TO_TIMESTAMP" => Ok(Expression::Function(Box::new(f))),
361
362            // DATE_FORMAT is native to Spark
363            "DATE_FORMAT" => Ok(Expression::Function(Box::new(f))),
364
365            // strftime -> DATE_FORMAT
366            "STRFTIME" => Ok(Expression::Function(Box::new(Function::new(
367                "DATE_FORMAT".to_string(),
368                f.args,
369            )))),
370
371            // TO_CHAR -> DATE_FORMAT
372            "TO_CHAR" => Ok(Expression::Function(Box::new(Function::new(
373                "DATE_FORMAT".to_string(),
374                f.args,
375            )))),
376
377            // DATE_TRUNC is native to Spark
378            "DATE_TRUNC" => Ok(Expression::Function(Box::new(f))),
379
380            // TRUNC is native to Spark
381            "TRUNC" => Ok(Expression::Function(Box::new(f))),
382
383            // EXTRACT is native to Spark
384            "EXTRACT" => Ok(Expression::Function(Box::new(f))),
385
386            // DATEPART -> EXTRACT
387            "DATEPART" => Ok(Expression::Function(Box::new(Function::new(
388                "EXTRACT".to_string(),
389                f.args,
390            )))),
391
392            // UNIX_TIMESTAMP is native to Spark
393            // When called with no args, add CURRENT_TIMESTAMP() as default
394            "UNIX_TIMESTAMP" => {
395                if f.args.is_empty() {
396                    Ok(Expression::Function(Box::new(Function::new(
397                        "UNIX_TIMESTAMP".to_string(),
398                        vec![Expression::CurrentTimestamp(CurrentTimestamp {
399                            precision: None,
400                            sysdate: false,
401                        })],
402                    ))))
403                } else {
404                    Ok(Expression::Function(Box::new(f)))
405                }
406            }
407
408            // FROM_UNIXTIME is native to Spark
409            "FROM_UNIXTIME" => Ok(Expression::Function(Box::new(f))),
410
411            // STR_TO_MAP is native to Spark
412            // When called with only one arg, add default delimiters ',' and ':'
413            "STR_TO_MAP" => {
414                if f.args.len() == 1 {
415                    let mut args = f.args;
416                    args.push(Expression::Literal(Box::new(
417                        crate::expressions::Literal::String(",".to_string()),
418                    )));
419                    args.push(Expression::Literal(Box::new(
420                        crate::expressions::Literal::String(":".to_string()),
421                    )));
422                    Ok(Expression::Function(Box::new(Function::new(
423                        "STR_TO_MAP".to_string(),
424                        args,
425                    ))))
426                } else {
427                    Ok(Expression::Function(Box::new(f)))
428                }
429            }
430
431            // POSITION is native to Spark (POSITION(substr IN str))
432            "POSITION" => Ok(Expression::Function(Box::new(f))),
433
434            // LOCATE is native to Spark
435            "LOCATE" => Ok(Expression::Function(Box::new(f))),
436
437            // STRPOS -> Use expression form or LOCATE
438            "STRPOS" if f.args.len() == 2 => {
439                let mut args = f.args;
440                let first = args.remove(0);
441                let second = args.remove(0);
442                // LOCATE(substr, str) in Spark
443                Ok(Expression::Function(Box::new(Function::new(
444                    "LOCATE".to_string(),
445                    vec![second, first],
446                ))))
447            }
448
449            // CHARINDEX -> LOCATE
450            "CHARINDEX" if f.args.len() >= 2 => {
451                let mut args = f.args;
452                let substring = args.remove(0);
453                let string = args.remove(0);
454                let mut locate_args = vec![substring, string];
455                if !args.is_empty() {
456                    locate_args.push(args.remove(0));
457                }
458                Ok(Expression::Function(Box::new(Function::new(
459                    "LOCATE".to_string(),
460                    locate_args,
461                ))))
462            }
463
464            // INSTR is native to Spark
465            "INSTR" => Ok(Expression::Function(Box::new(f))),
466
467            // CEILING -> CEIL
468            "CEILING" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
469                this: f.args.into_iter().next().unwrap(),
470                decimals: None,
471                to: None,
472            }))),
473
474            // CEIL is native to Spark
475            "CEIL" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
476                this: f.args.into_iter().next().unwrap(),
477                decimals: None,
478                to: None,
479            }))),
480
481            // UNNEST -> EXPLODE
482            "UNNEST" => Ok(Expression::Function(Box::new(Function::new(
483                "EXPLODE".to_string(),
484                f.args,
485            )))),
486
487            // FLATTEN -> FLATTEN is native to Spark (for nested arrays)
488            "FLATTEN" => Ok(Expression::Function(Box::new(f))),
489
490            // ARRAY_AGG -> COLLECT_LIST
491            "ARRAY_AGG" => Ok(Expression::Function(Box::new(Function::new(
492                "COLLECT_LIST".to_string(),
493                f.args,
494            )))),
495
496            // COLLECT_LIST is native to Spark
497            "COLLECT_LIST" => Ok(Expression::Function(Box::new(f))),
498
499            // COLLECT_SET is native to Spark
500            "COLLECT_SET" => Ok(Expression::Function(Box::new(f))),
501
502            // ARRAY_LENGTH -> SIZE in Spark
503            "ARRAY_LENGTH" | "CARDINALITY" => Ok(Expression::Function(Box::new(Function::new(
504                "SIZE".to_string(),
505                f.args,
506            )))),
507
508            // SIZE is native to Spark
509            "SIZE" => Ok(Expression::Function(Box::new(f))),
510
511            // SPLIT is native to Spark
512            "SPLIT" => Ok(Expression::Function(Box::new(f))),
513
514            // REGEXP_REPLACE: Spark supports up to 4 args (subject, pattern, replacement, position)
515            // Strip extra Snowflake args (occurrence, params) if present
516            "REGEXP_REPLACE" if f.args.len() > 4 => {
517                let mut args = f.args;
518                args.truncate(4);
519                Ok(Expression::Function(Box::new(Function::new(
520                    "REGEXP_REPLACE".to_string(),
521                    args,
522                ))))
523            }
524            "REGEXP_REPLACE" => Ok(Expression::Function(Box::new(f))),
525
526            // REGEXP_EXTRACT is native to Spark
527            "REGEXP_EXTRACT" => Ok(Expression::Function(Box::new(f))),
528
529            // REGEXP_EXTRACT_ALL is native to Spark
530            "REGEXP_EXTRACT_ALL" => Ok(Expression::Function(Box::new(f))),
531
532            // RLIKE is native to Spark
533            "RLIKE" | "REGEXP_LIKE" => Ok(Expression::Function(Box::new(Function::new(
534                "RLIKE".to_string(),
535                f.args,
536            )))),
537
538            // JSON_EXTRACT -> GET_JSON_OBJECT (Hive style) or :: operator
539            "JSON_EXTRACT" => Ok(Expression::Function(Box::new(Function::new(
540                "GET_JSON_OBJECT".to_string(),
541                f.args,
542            )))),
543
544            // JSON_EXTRACT_SCALAR -> GET_JSON_OBJECT
545            "JSON_EXTRACT_SCALAR" => Ok(Expression::Function(Box::new(Function::new(
546                "GET_JSON_OBJECT".to_string(),
547                f.args,
548            )))),
549
550            // GET_JSON_OBJECT is native to Spark
551            "GET_JSON_OBJECT" => Ok(Expression::Function(Box::new(f))),
552
553            // FROM_JSON is native to Spark
554            "FROM_JSON" => Ok(Expression::Function(Box::new(f))),
555
556            // TO_JSON is native to Spark
557            "TO_JSON" => Ok(Expression::Function(Box::new(f))),
558
559            // PARSE_JSON -> strip for Spark (just keep the string argument)
560            "PARSE_JSON" if f.args.len() == 1 => Ok(f.args.into_iter().next().unwrap()),
561            "PARSE_JSON" => Ok(Expression::Function(Box::new(Function::new(
562                "FROM_JSON".to_string(),
563                f.args,
564            )))),
565
566            // DATEDIFF is native to Spark (supports unit in Spark 3+)
567            "DATEDIFF" | "DATE_DIFF" => Ok(Expression::Function(Box::new(Function::new(
568                "DATEDIFF".to_string(),
569                f.args,
570            )))),
571
572            // DATE_ADD is native to Spark
573            "DATE_ADD" | "DATEADD" => Ok(Expression::Function(Box::new(Function::new(
574                "DATE_ADD".to_string(),
575                f.args,
576            )))),
577
578            // DATE_SUB is native to Spark
579            "DATE_SUB" => Ok(Expression::Function(Box::new(f))),
580
581            // TIMESTAMPADD is native to Spark 3+
582            "TIMESTAMPADD" => Ok(Expression::Function(Box::new(f))),
583
584            // TIMESTAMPDIFF is native to Spark 3+
585            "TIMESTAMPDIFF" => Ok(Expression::Function(Box::new(f))),
586
587            // ADD_MONTHS is native to Spark
588            "ADD_MONTHS" => Ok(Expression::Function(Box::new(f))),
589
590            // MONTHS_BETWEEN is native to Spark
591            "MONTHS_BETWEEN" => Ok(Expression::Function(Box::new(f))),
592
593            // NVL is native to Spark
594            "NVL" => Ok(Expression::Function(Box::new(f))),
595
596            // NVL2 is native to Spark
597            "NVL2" => Ok(Expression::Function(Box::new(f))),
598
599            // MAP is native to Spark
600            "MAP" => Ok(Expression::Function(Box::new(f))),
601
602            // ARRAY is native to Spark
603            "ARRAY" => Ok(Expression::Function(Box::new(f))),
604
605            // ROW -> STRUCT for Spark (cross-dialect, no auto-naming)
606            "ROW" => Ok(Expression::Function(Box::new(Function::new(
607                "STRUCT".to_string(),
608                f.args,
609            )))),
610
611            // STRUCT is native to Spark - auto-name unnamed args as col1, col2, etc.
612            "STRUCT" => {
613                let mut col_idx = 1usize;
614                let named_args: Vec<Expression> = f
615                    .args
616                    .into_iter()
617                    .map(|arg| {
618                        let current_idx = col_idx;
619                        col_idx += 1;
620                        // Check if arg already has an alias (AS name) or is Star
621                        match &arg {
622                            Expression::Alias(_) => arg, // already named
623                            Expression::Star(_) => arg,  // STRUCT(*) - keep as-is
624                            Expression::Column(c) if c.table.is_none() => {
625                                // Column reference: use column name as the struct field name
626                                let name = c.name.name.clone();
627                                Expression::Alias(Box::new(crate::expressions::Alias {
628                                    this: arg,
629                                    alias: crate::expressions::Identifier::new(&name),
630                                    column_aliases: Vec::new(),
631                                    alias_explicit_as: false,
632                                    alias_keyword: None,
633                                    pre_alias_comments: Vec::new(),
634                                    trailing_comments: Vec::new(),
635                                    inferred_type: None,
636                                }))
637                            }
638                            _ => {
639                                // Unnamed literal/expression: auto-name as colN
640                                let name = format!("col{}", current_idx);
641                                Expression::Alias(Box::new(crate::expressions::Alias {
642                                    this: arg,
643                                    alias: crate::expressions::Identifier::new(&name),
644                                    column_aliases: Vec::new(),
645                                    alias_explicit_as: false,
646                                    alias_keyword: None,
647                                    pre_alias_comments: Vec::new(),
648                                    trailing_comments: Vec::new(),
649                                    inferred_type: None,
650                                }))
651                            }
652                        }
653                    })
654                    .collect();
655                Ok(Expression::Function(Box::new(Function {
656                    name: "STRUCT".to_string(),
657                    args: named_args,
658                    distinct: false,
659                    trailing_comments: Vec::new(),
660                    use_bracket_syntax: false,
661                    no_parens: false,
662                    quoted: false,
663                    span: None,
664                    inferred_type: None,
665                })))
666            }
667
668            // NAMED_STRUCT('a', 1) -> STRUCT(1 AS a) for SQLGlot Spark outputs
669            "NAMED_STRUCT" if f.args.len() % 2 == 0 => {
670                let original_args = f.args.clone();
671                let mut struct_args = Vec::new();
672                for pair in f.args.chunks(2) {
673                    if let Expression::Literal(lit) = &pair[0] {
674                        if let Literal::String(field_name) = lit.as_ref() {
675                            struct_args.push(Expression::Alias(Box::new(
676                                crate::expressions::Alias {
677                                    this: pair[1].clone(),
678                                    alias: crate::expressions::Identifier::new(field_name),
679                                    column_aliases: Vec::new(),
680                                    alias_explicit_as: false,
681                                    alias_keyword: None,
682                                    pre_alias_comments: Vec::new(),
683                                    trailing_comments: Vec::new(),
684                                    inferred_type: None,
685                                },
686                            )));
687                            continue;
688                        }
689                    }
690                    return Ok(Expression::Function(Box::new(Function::new(
691                        "NAMED_STRUCT".to_string(),
692                        original_args,
693                    ))));
694                }
695                Ok(Expression::Function(Box::new(Function::new(
696                    "STRUCT".to_string(),
697                    struct_args,
698                ))))
699            }
700
701            // NAMED_STRUCT is native to Spark
702            "NAMED_STRUCT" => Ok(Expression::Function(Box::new(f))),
703
704            // MAP_FROM_ARRAYS is native to Spark
705            "MAP_FROM_ARRAYS" => Ok(Expression::Function(Box::new(f))),
706
707            // ARRAY_SORT is native to Spark
708            "ARRAY_SORT" => Ok(Expression::Function(Box::new(f))),
709
710            // ARRAY_DISTINCT is native to Spark
711            "ARRAY_DISTINCT" => Ok(Expression::Function(Box::new(f))),
712
713            // ARRAY_UNION is native to Spark
714            "ARRAY_UNION" => Ok(Expression::Function(Box::new(f))),
715
716            // ARRAY_INTERSECT is native to Spark
717            "ARRAY_INTERSECT" => Ok(Expression::Function(Box::new(f))),
718
719            // ARRAY_EXCEPT is native to Spark
720            "ARRAY_EXCEPT" => Ok(Expression::Function(Box::new(f))),
721
722            // ARRAY_CONTAINS is native to Spark
723            "ARRAY_CONTAINS" => Ok(Expression::Function(Box::new(f))),
724
725            // ELEMENT_AT is native to Spark
726            "ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
727
728            // TRY_ELEMENT_AT is native to Spark 3+
729            "TRY_ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),
730
731            // TRANSFORM is native to Spark (array transformation)
732            "TRANSFORM" => Ok(Expression::Function(Box::new(f))),
733
734            // FILTER is native to Spark (array filtering)
735            "FILTER" => Ok(Expression::Function(Box::new(f))),
736
737            // AGGREGATE is native to Spark (array reduction)
738            "AGGREGATE" => Ok(Expression::Function(Box::new(f))),
739
740            // SEQUENCE is native to Spark (generate array)
741            "SEQUENCE" => Ok(Expression::Function(Box::new(f))),
742
743            // GENERATE_SERIES -> SEQUENCE
744            "GENERATE_SERIES" => Ok(Expression::Function(Box::new(Function::new(
745                "SEQUENCE".to_string(),
746                f.args,
747            )))),
748
749            // STARTSWITH is native to Spark 3+
750            "STARTSWITH" | "STARTS_WITH" => Ok(Expression::Function(Box::new(Function::new(
751                "STARTSWITH".to_string(),
752                f.args,
753            )))),
754
755            // ENDSWITH is native to Spark 3+
756            "ENDSWITH" | "ENDS_WITH" => Ok(Expression::Function(Box::new(Function::new(
757                "ENDSWITH".to_string(),
758                f.args,
759            )))),
760
761            // ARRAY_CONSTRUCT_COMPACT(1, null, 2) -> ARRAY_COMPACT(ARRAY(1, NULL, 2))
762            "ARRAY_CONSTRUCT_COMPACT" => {
763                let inner =
764                    Expression::Function(Box::new(Function::new("ARRAY".to_string(), f.args)));
765                Ok(Expression::Function(Box::new(Function::new(
766                    "ARRAY_COMPACT".to_string(),
767                    vec![inner],
768                ))))
769            }
770
771            // ARRAY_TO_STRING -> ARRAY_JOIN
772            "ARRAY_TO_STRING" => Ok(Expression::Function(Box::new(Function::new(
773                "ARRAY_JOIN".to_string(),
774                f.args,
775            )))),
776
777            // TO_ARRAY(x) -> IF(x IS NULL, NULL, ARRAY(x))
778            "TO_ARRAY" if f.args.len() == 1 => {
779                let x = f.args[0].clone();
780                // Check if arg is already an array constructor (bracket notation)
781                // In that case: TO_ARRAY(['test']) -> ARRAY('test')
782                match &x {
783                    Expression::ArrayFunc(arr) => {
784                        // Just convert to ARRAY(...) function
785                        Ok(Expression::Function(Box::new(Function::new(
786                            "ARRAY".to_string(),
787                            arr.expressions.clone(),
788                        ))))
789                    }
790                    _ => Ok(Expression::IfFunc(Box::new(crate::expressions::IfFunc {
791                        condition: Expression::IsNull(Box::new(crate::expressions::IsNull {
792                            this: x.clone(),
793                            not: false,
794                            postfix_form: false,
795                        })),
796                        true_value: Expression::Null(crate::expressions::Null),
797                        false_value: Some(Expression::Function(Box::new(Function::new(
798                            "ARRAY".to_string(),
799                            vec![x],
800                        )))),
801                        original_name: Some("IF".to_string()),
802                        inferred_type: None,
803                    }))),
804                }
805            }
806
807            // REGEXP_SUBSTR -> REGEXP_EXTRACT (strip extra args)
808            "REGEXP_SUBSTR" if f.args.len() >= 2 => {
809                let subject = f.args[0].clone();
810                let pattern = f.args[1].clone();
811                // For Spark: REGEXP_EXTRACT(subject, pattern, group)
812                // group defaults to 0 for full match, but sqlglot uses last arg if present
813                let group = if f.args.len() >= 6 {
814                    let g = &f.args[5];
815                    // If group is literal 1 (default), omit it
816                    if matches!(g, Expression::Literal(lit) if matches!(lit.as_ref(), Literal::Number(n) if n == "1"))
817                    {
818                        None
819                    } else {
820                        Some(g.clone())
821                    }
822                } else {
823                    None
824                };
825                let mut args = vec![subject, pattern];
826                if let Some(g) = group {
827                    args.push(g);
828                }
829                Ok(Expression::Function(Box::new(Function::new(
830                    "REGEXP_EXTRACT".to_string(),
831                    args,
832                ))))
833            }
834
835            // UUID_STRING() -> UUID(); keep namespace/name args for target-specific generation.
836            "UUID_STRING" => {
837                if f.args.is_empty() {
838                    Ok(Expression::Function(Box::new(Function::new(
839                        "UUID".to_string(),
840                        vec![],
841                    ))))
842                } else {
843                    Ok(Expression::Function(Box::new(Function::new(
844                        "UUID_STRING".to_string(),
845                        f.args,
846                    ))))
847                }
848            }
849
850            // OBJECT_CONSTRUCT -> STRUCT in Spark
851            "OBJECT_CONSTRUCT" if f.args.len() >= 2 && f.args.len() % 2 == 0 => {
852                // Convert key-value pairs to named struct fields
853                // OBJECT_CONSTRUCT('Manitoba', 'Winnipeg', 'foo', 'bar')
854                // -> STRUCT('Winnipeg' AS Manitoba, 'bar' AS foo)
855                let mut struct_args = Vec::new();
856                for pair in f.args.chunks(2) {
857                    if let Expression::Literal(lit) = &pair[0] {
858                        if let Literal::String(key) = lit.as_ref() {
859                            struct_args.push(Expression::Alias(Box::new(
860                                crate::expressions::Alias {
861                                    this: pair[1].clone(),
862                                    alias: crate::expressions::Identifier::new(key.clone()),
863                                    column_aliases: vec![],
864                                    alias_explicit_as: false,
865                                    alias_keyword: None,
866                                    pre_alias_comments: vec![],
867                                    trailing_comments: vec![],
868                                    inferred_type: None,
869                                },
870                            )));
871                        }
872                    } else {
873                        struct_args.push(pair[1].clone());
874                    }
875                }
876                Ok(Expression::Function(Box::new(Function::new(
877                    "STRUCT".to_string(),
878                    struct_args,
879                ))))
880            }
881
882            // DATE_PART(part, expr) -> EXTRACT(part FROM expr)
883            "DATE_PART" if f.args.len() == 2 => {
884                let mut args = f.args;
885                let part = args.remove(0);
886                let expr = args.remove(0);
887                if let Some(field) = expr_to_datetime_field(&part) {
888                    Ok(Expression::Extract(Box::new(ExtractFunc {
889                        this: expr,
890                        field,
891                    })))
892                } else {
893                    // Can't parse the field, keep as function
894                    Ok(Expression::Function(Box::new(Function::new(
895                        "DATE_PART".to_string(),
896                        vec![part, expr],
897                    ))))
898                }
899            }
900
901            // GET_PATH(obj, path) -> GET_JSON_OBJECT(obj, json_path) in Spark
902            "GET_PATH" if f.args.len() == 2 => {
903                let mut args = f.args;
904                let this = args.remove(0);
905                let path = args.remove(0);
906                let json_path = match &path {
907                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
908                        let Literal::String(s) = lit.as_ref() else {
909                            unreachable!()
910                        };
911                        let normalized = if s.starts_with('$') {
912                            s.clone()
913                        } else if s.starts_with('[') {
914                            format!("${}", s)
915                        } else {
916                            format!("$.{}", s)
917                        };
918                        Expression::Literal(Box::new(Literal::String(normalized)))
919                    }
920                    _ => path,
921                };
922                Ok(Expression::Function(Box::new(Function::new(
923                    "GET_JSON_OBJECT".to_string(),
924                    vec![this, json_path],
925                ))))
926            }
927
928            // BITWISE_LEFT_SHIFT → SHIFTLEFT
929            "BITWISE_LEFT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
930                "SHIFTLEFT".to_string(),
931                f.args,
932            )))),
933
934            // BITWISE_RIGHT_SHIFT → SHIFTRIGHT
935            "BITWISE_RIGHT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
936                "SHIFTRIGHT".to_string(),
937                f.args,
938            )))),
939
940            // APPROX_DISTINCT → APPROX_COUNT_DISTINCT
941            "APPROX_DISTINCT" => Ok(Expression::Function(Box::new(Function::new(
942                "APPROX_COUNT_DISTINCT".to_string(),
943                f.args,
944            )))),
945
946            // ARRAY_SLICE → SLICE
947            "ARRAY_SLICE" => Ok(Expression::Function(Box::new(Function::new(
948                "SLICE".to_string(),
949                f.args,
950            )))),
951
952            // DATE_FROM_PARTS → MAKE_DATE
953            "DATE_FROM_PARTS" => Ok(Expression::Function(Box::new(Function::new(
954                "MAKE_DATE".to_string(),
955                f.args,
956            )))),
957
958            // DAYOFWEEK_ISO → DAYOFWEEK
959            "DAYOFWEEK_ISO" => Ok(Expression::Function(Box::new(Function::new(
960                "DAYOFWEEK".to_string(),
961                f.args,
962            )))),
963
964            // FORMAT → FORMAT_STRING
965            "FORMAT" => Ok(Expression::Function(Box::new(Function::new(
966                "FORMAT_STRING".to_string(),
967                f.args,
968            )))),
969
970            // LOGICAL_AND → BOOL_AND
971            "LOGICAL_AND" => Ok(Expression::Function(Box::new(Function::new(
972                "BOOL_AND".to_string(),
973                f.args,
974            )))),
975
976            // VARIANCE_POP → VAR_POP
977            "VARIANCE_POP" => Ok(Expression::Function(Box::new(Function::new(
978                "VAR_POP".to_string(),
979                f.args,
980            )))),
981
982            // WEEK_OF_YEAR → WEEKOFYEAR
983            "WEEK_OF_YEAR" => Ok(Expression::Function(Box::new(Function::new(
984                "WEEKOFYEAR".to_string(),
985                f.args,
986            )))),
987
988            // BIT_GET -> GETBIT
989            "BIT_GET" => Ok(Expression::Function(Box::new(Function::new(
990                "GETBIT".to_string(),
991                f.args,
992            )))),
993
994            // CURDATE -> CURRENT_DATE
995            "CURDATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),
996
997            // Pass through everything else
998            _ => Ok(Expression::Function(Box::new(f))),
999        }
1000    }
1001
1002    fn transform_aggregate_function(
1003        &self,
1004        f: Box<crate::expressions::AggregateFunction>,
1005    ) -> Result<Expression> {
1006        let name_upper = f.name.to_uppercase();
1007        match name_upper.as_str() {
1008            // GROUP_CONCAT -> COLLECT_LIST (then CONCAT_WS for string)
1009            "GROUP_CONCAT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
1010                Function::new("COLLECT_LIST".to_string(), f.args),
1011            ))),
1012
1013            // STRING_AGG -> COLLECT_LIST (or STRING_AGG in Spark 4+)
1014            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
1015                Function::new("COLLECT_LIST".to_string(), f.args),
1016            ))),
1017
1018            // LISTAGG -> COLLECT_LIST
1019            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
1020                "COLLECT_LIST".to_string(),
1021                f.args,
1022            )))),
1023
1024            // ARRAY_AGG -> COLLECT_LIST (preserve distinct and filter)
1025            "ARRAY_AGG" if !f.args.is_empty() => {
1026                let mut af = f;
1027                af.name = "COLLECT_LIST".to_string();
1028                Ok(Expression::AggregateFunction(af))
1029            }
1030
1031            // LOGICAL_OR -> BOOL_OR in Spark
1032            "LOGICAL_OR" if !f.args.is_empty() => {
1033                let mut af = f;
1034                af.name = "BOOL_OR".to_string();
1035                Ok(Expression::AggregateFunction(af))
1036            }
1037
1038            // Pass through everything else
1039            _ => Ok(Expression::AggregateFunction(f)),
1040        }
1041    }
1042}
1043
1044/// Convert an expression (string literal or identifier) to a DateTimeField
1045fn expr_to_datetime_field(expr: &Expression) -> Option<DateTimeField> {
1046    let name = match expr {
1047        Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
1048            let Literal::String(s) = lit.as_ref() else {
1049                unreachable!()
1050            };
1051            s.to_uppercase()
1052        }
1053        Expression::Identifier(id) => id.name.to_uppercase(),
1054        Expression::Var(v) => v.this.to_uppercase(),
1055        Expression::Column(col) if col.table.is_none() => col.name.name.to_uppercase(),
1056        _ => return None,
1057    };
1058    match name.as_str() {
1059        "YEAR" | "Y" | "YY" | "YYY" | "YYYY" | "YR" | "YEARS" | "YRS" => Some(DateTimeField::Year),
1060        "MONTH" | "MM" | "MON" | "MONS" | "MONTHS" => Some(DateTimeField::Month),
1061        "DAY" | "D" | "DD" | "DAYS" | "DAYOFMONTH" => Some(DateTimeField::Day),
1062        "HOUR" | "H" | "HH" | "HR" | "HOURS" | "HRS" => Some(DateTimeField::Hour),
1063        "MINUTE" | "MI" | "MIN" | "MINUTES" | "MINS" => Some(DateTimeField::Minute),
1064        "SECOND" | "S" | "SEC" | "SECONDS" | "SECS" => Some(DateTimeField::Second),
1065        "MILLISECOND" | "MS" | "MSEC" | "MILLISECONDS" => Some(DateTimeField::Millisecond),
1066        "MICROSECOND" | "US" | "USEC" | "MICROSECONDS" => Some(DateTimeField::Microsecond),
1067        "DOW" | "DAYOFWEEK" | "DAYOFWEEK_ISO" | "DW" => Some(DateTimeField::DayOfWeek),
1068        "DOY" | "DAYOFYEAR" => Some(DateTimeField::DayOfYear),
1069        "WEEK" | "W" | "WK" | "WEEKOFYEAR" | "WOY" => Some(DateTimeField::Week),
1070        "QUARTER" | "Q" | "QTR" | "QTRS" | "QUARTERS" => Some(DateTimeField::Quarter),
1071        "EPOCH" | "EPOCH_SECOND" | "EPOCH_SECONDS" => Some(DateTimeField::Epoch),
1072        "TIMEZONE" | "TIMEZONE_HOUR" | "TZH" => Some(DateTimeField::TimezoneHour),
1073        "TIMEZONE_MINUTE" | "TZM" => Some(DateTimeField::TimezoneMinute),
1074        _ => Some(DateTimeField::Custom(name)),
1075    }
1076}