// polyglot_sql/dialects/spark.rs

//! Spark SQL Dialect
//!
//! Spark SQL-specific transformations based on sqlglot patterns.
//! Key features (extends Hive with modern SQL):
//! - TRY_CAST is supported (Spark 3+)
//! - ILIKE is supported (Spark 3+)
//! - Uses backticks for identifiers
//! - ARRAY_AGG, COLLECT_LIST for array aggregation
//! - STRING_AGG / LISTAGG supported (Spark 4+)
//! - DATE_ADD with unit parameter (Spark 3+)
//! - TIMESTAMPADD, TIMESTAMPDIFF (Spark 3+)
//! - More PostgreSQL-like syntax than Hive
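//!
//! For example (illustrative; the exact output depends on the generator config):
//!
//!   SELECT IFNULL(a, b), ARRAY_AGG(x), UUID_STRING()
//!   -- becomes:
//!   SELECT COALESCE(a, b), COLLECT_LIST(x), UUID()
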
use super::{DialectImpl, DialectType};
use crate::error::Result;
use crate::expressions::{
    CeilFunc, CurrentTimestamp, DataType, DateTimeField, Expression, ExtractFunc, Function,
    Literal, StructField, UnaryFunc, VarArgFunc,
};
use crate::generator::GeneratorConfig;
use crate::tokens::TokenizerConfig;

/// Spark SQL dialect
pub struct SparkDialect;

impl DialectImpl for SparkDialect {
    fn dialect_type(&self) -> DialectType {
        DialectType::Spark
    }

    fn tokenizer_config(&self) -> TokenizerConfig {
        let mut config = TokenizerConfig::default();
        // Spark uses backticks for identifiers (NOT double quotes)
        config.identifiers.clear();
        config.identifiers.insert('`', '`');
        // Spark (like Hive) uses double quotes as string delimiters (QUOTES = ["'", '"'])
        config.quotes.insert("\"".to_string(), "\"".to_string());
        // Spark (like Hive) uses backslash escapes in strings (STRING_ESCAPES = ["\\"])
        config.string_escapes.push('\\');
        // Spark supports the DIV keyword for integer division (inherited from Hive)
        config.keywords.insert("DIV".to_string(), crate::tokens::TokenType::Div);
        // Spark numeric literal suffixes (same as Hive): 1L -> BIGINT, 1S -> SMALLINT, etc.
        config.numeric_literals.insert("L".to_string(), "BIGINT".to_string());
        config.numeric_literals.insert("S".to_string(), "SMALLINT".to_string());
        config.numeric_literals.insert("Y".to_string(), "TINYINT".to_string());
        config.numeric_literals.insert("D".to_string(), "DOUBLE".to_string());
        config.numeric_literals.insert("F".to_string(), "FLOAT".to_string());
        config.numeric_literals.insert("BD".to_string(), "DECIMAL".to_string());
        // Spark allows identifiers to start with digits (e.g., 1a, 1_a)
        config.identifiers_can_start_with_digit = true;
        // Spark: STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False
        // Backslashes in raw strings are always literal (no escape processing)
        config.string_escapes_allowed_in_raw_strings = false;
        config
    }

    fn generator_config(&self) -> GeneratorConfig {
        use crate::generator::IdentifierQuoteStyle;
        GeneratorConfig {
            identifier_quote: '`',
            identifier_quote_style: IdentifierQuoteStyle::BACKTICK,
            dialect: Some(DialectType::Spark),
            // Spark uses a colon separator in STRUCT field definitions: STRUCT<field_name: TYPE>
            struct_field_sep: ": ",
            // Spark doesn't use AS before RETURN in function definitions
            create_function_return_as: false,
            // Spark places the alias after the TABLESAMPLE clause
            alias_post_tablesample: true,
            tablesample_seed_keyword: "REPEATABLE",
            join_hints: false,
            identifiers_can_start_with_digit: true,
            // Spark uses COMMENT 'value' without an = sign
            schema_comment_with_eq: false,
            ..Default::default()
        }
    }

    fn transform_expr(&self, expr: Expression) -> Result<Expression> {
        match expr {
            // IFNULL -> COALESCE in Spark
            Expression::IfNull(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
                original_name: None,
                expressions: vec![f.this, f.expression],
            }))),

            // NVL is supported in Spark (from Hive), but COALESCE is standard
            Expression::Nvl(f) => Ok(Expression::Coalesce(Box::new(VarArgFunc {
                original_name: None,
                expressions: vec![f.this, f.expression],
            }))),

            // Cast: normalize length-less VARCHAR/CHAR (and TEXT) -> STRING for Spark
            Expression::Cast(mut c) => {
                c.to = Self::normalize_spark_type(c.to);
                Ok(Expression::Cast(c))
            }

            // TryCast stays as TryCast in Spark (Spark supports TRY_CAST natively)
            Expression::TryCast(mut c) => {
                c.to = Self::normalize_spark_type(c.to);
                Ok(Expression::TryCast(c))
            }

            // SafeCast -> TRY_CAST
            Expression::SafeCast(mut c) => {
                c.to = Self::normalize_spark_type(c.to);
                Ok(Expression::TryCast(c))
            }

            // TRIM: non-standard comma syntax -> standard FROM syntax
            // TRIM('SL', 'SSparkSQLS') -> TRIM('SL' FROM 'SSparkSQLS')
            Expression::Trim(mut t) => {
                if !t.sql_standard_syntax && t.characters.is_some() {
                    // Convert comma syntax to standard SQL syntax
                    // Fields already have correct semantics: this=string, characters=chars
                    t.sql_standard_syntax = true;
                }
                Ok(Expression::Trim(t))
            }

            // ILIKE is supported in Spark 3+
            Expression::ILike(op) => Ok(Expression::ILike(op)),

            // UNNEST -> EXPLODE in Spark (Hive compatibility)
            Expression::Unnest(f) => Ok(Expression::Explode(Box::new(UnaryFunc::new(f.this)))),

            // EXPLODE is native to Spark
            Expression::Explode(f) => Ok(Expression::Explode(f)),

            // ExplodeOuter is supported in Spark
            Expression::ExplodeOuter(f) => Ok(Expression::ExplodeOuter(f)),

            // RANDOM -> RAND in Spark
            Expression::Random(_) => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
                seed: None,
                lower: None,
                upper: None,
            }))),

            // Rand is native to Spark
            Expression::Rand(r) => Ok(Expression::Rand(r)),

            // || (Concat) -> CONCAT in Spark
            Expression::Concat(op) => Ok(Expression::Function(Box::new(Function::new(
                "CONCAT".to_string(),
                vec![op.left, op.right],
            )))),

            // ParseJson: handled by generator (emits just the string literal for Spark)

            // Generic function transformations
            Expression::Function(f) => self.transform_function(*f),

            // Generic aggregate function transformations
            Expression::AggregateFunction(f) => self.transform_aggregate_function(f),

            // $N parameters -> ${N} in Spark (DollarBrace style)
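            // e.g. $1 becomes ${1}: the numeric index is carried over as the
            // brace-style parameter name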
            Expression::Parameter(mut p) if p.style == crate::expressions::ParameterStyle::Dollar => {
                p.style = crate::expressions::ParameterStyle::DollarBrace;
                // Convert index to name for DollarBrace format
                if let Some(idx) = p.index {
                    p.name = Some(idx.to_string());
                }
                Ok(Expression::Parameter(p))
            }

            // JSONExtract with variant_extract (Databricks colon syntax) -> GET_JSON_OBJECT
            Expression::JSONExtract(je) if je.variant_extract.is_some() => {
                // Convert path: 'item[1].price' -> '$.item[1].price'
                let path = match *je.expression {
                    Expression::Literal(Literal::String(s)) => {
                        Expression::Literal(Literal::String(format!("$.{}", s)))
                    }
                    other => other,
                };
                Ok(Expression::Function(Box::new(Function::new(
                    "GET_JSON_OBJECT".to_string(),
                    vec![*je.this, path],
                ))))
            }

            // Pass through everything else
            _ => Ok(expr),
        }
    }
}

impl SparkDialect {
    /// Normalize a data type for Spark:
    /// - VARCHAR/CHAR without length -> STRING
    /// - VARCHAR(n)/CHAR(n) with length -> keep as-is
    /// - TEXT -> STRING
    fn normalize_spark_type(dt: DataType) -> DataType {
        match dt {
            DataType::VarChar { length: None, .. }
            | DataType::Char { length: None }
            | DataType::Text => DataType::Custom { name: "STRING".to_string() },
            // VARCHAR(n) and CHAR(n) with length are kept as-is
            DataType::VarChar { .. } | DataType::Char { .. } => dt,
            // Also normalize struct fields recursively
            DataType::Struct { fields, nested } => {
                let normalized_fields: Vec<StructField> = fields
                    .into_iter()
                    .map(|mut f| {
                        f.data_type = Self::normalize_spark_type(f.data_type);
                        f
                    })
                    .collect();
                DataType::Struct { fields: normalized_fields, nested }
            }
            _ => dt,
        }
    }

    fn transform_function(&self, f: Function) -> Result<Expression> {
        let name_upper = f.name.to_uppercase();
        match name_upper.as_str() {
            // IFNULL -> COALESCE
            "IFNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
                original_name: None,
                expressions: f.args,
            }))),

            // NVL -> COALESCE
            "NVL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
                original_name: None,
                expressions: f.args,
            }))),

            // ISNULL -> COALESCE
            "ISNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
                original_name: None,
                expressions: f.args,
            }))),

            // GROUP_CONCAT: a faithful rewrite is CONCAT_WS + COLLECT_LIST
            // (Spark 4+ also has STRING_AGG). For simplicity we emit only
            // COLLECT_LIST, which yields an array rather than a delimited string.
            "GROUP_CONCAT" if !f.args.is_empty() => {
                Ok(Expression::Function(Box::new(Function::new(
                    "COLLECT_LIST".to_string(),
                    f.args,
                ))))
            }

            // STRING_AGG is native only in Spark 4+; for broad compatibility,
            // fall back to COLLECT_LIST (array aggregation, separator dropped)
            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
                "COLLECT_LIST".to_string(),
                f.args,
            )))),

            // LISTAGG -> COLLECT_LIST for the same reason (STRING_AGG would work in Spark 4+)
            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
                "COLLECT_LIST".to_string(),
                f.args,
            )))),

            // SUBSTRING / SUBSTR are native to Spark
            "SUBSTRING" | "SUBSTR" => Ok(Expression::Function(Box::new(f))),

            // LENGTH is native to Spark
            "LENGTH" => Ok(Expression::Function(Box::new(f))),

            // LEN -> LENGTH
            "LEN" if f.args.len() == 1 => Ok(Expression::Length(Box::new(UnaryFunc::new(
                f.args.into_iter().next().unwrap(),
            )))),

            // RANDOM -> RAND (RAND is native to Spark)
            "RANDOM" | "RAND" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
                seed: None,
                lower: None,
                upper: None,
            }))),

            // NOW / GETDATE -> CURRENT_TIMESTAMP, which is native to Spark
            "NOW" | "GETDATE" | "CURRENT_TIMESTAMP" => Ok(Expression::CurrentTimestamp(
                CurrentTimestamp { precision: None, sysdate: false },
            )),

            // CURRENT_DATE is native
            "CURRENT_DATE" => Ok(Expression::CurrentDate(crate::expressions::CurrentDate)),

            // TO_DATE is native to Spark; strip the default format 'yyyy-MM-dd'
            "TO_DATE" if f.args.len() == 2 => {
                let is_default_format = matches!(
                    &f.args[1],
                    Expression::Literal(Literal::String(s)) if s == "yyyy-MM-dd"
                );
                if is_default_format {
                    Ok(Expression::Function(Box::new(Function::new(
                        "TO_DATE".to_string(),
                        vec![f.args.into_iter().next().unwrap()],
                    ))))
                } else {
                    Ok(Expression::Function(Box::new(f)))
                }
            }
            "TO_DATE" => Ok(Expression::Function(Box::new(f))),

            // TO_TIMESTAMP is native to Spark
            "TO_TIMESTAMP" => Ok(Expression::Function(Box::new(f))),

            // DATE_FORMAT is native to Spark
            "DATE_FORMAT" => Ok(Expression::Function(Box::new(f))),

            // strftime -> DATE_FORMAT
            "STRFTIME" => Ok(Expression::Function(Box::new(Function::new(
                "DATE_FORMAT".to_string(),
                f.args,
            )))),

            // TO_CHAR -> DATE_FORMAT
            "TO_CHAR" => Ok(Expression::Function(Box::new(Function::new(
                "DATE_FORMAT".to_string(),
                f.args,
            )))),

            // DATE_TRUNC is native to Spark
            "DATE_TRUNC" => Ok(Expression::Function(Box::new(f))),

            // TRUNC is native to Spark
            "TRUNC" => Ok(Expression::Function(Box::new(f))),

            // EXTRACT is native to Spark
            "EXTRACT" => Ok(Expression::Function(Box::new(f))),

            // DATEPART(part, expr): delegate to the DATE_PART handling below so
            // the generator can emit EXTRACT(part FROM expr) rather than a bare
            // EXTRACT(part, expr) call
            "DATEPART" => self.transform_function(Function::new("DATE_PART".to_string(), f.args)),

            // UNIX_TIMESTAMP is native to Spark
            // When called with no args, add CURRENT_TIMESTAMP() as default
            "UNIX_TIMESTAMP" => {
                if f.args.is_empty() {
                    Ok(Expression::Function(Box::new(Function::new(
                        "UNIX_TIMESTAMP".to_string(),
                        vec![Expression::CurrentTimestamp(CurrentTimestamp {
                            precision: None,
                            sysdate: false,
                        })],
                    ))))
                } else {
                    Ok(Expression::Function(Box::new(f)))
                }
            }

            // FROM_UNIXTIME is native to Spark
            "FROM_UNIXTIME" => Ok(Expression::Function(Box::new(f))),

            // STR_TO_MAP is native to Spark
            // When called with only one arg, add default delimiters ',' and ':'
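            // e.g. STR_TO_MAP('a:1,b:2') -> STR_TO_MAP('a:1,b:2', ',', ':')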
            "STR_TO_MAP" => {
                if f.args.len() == 1 {
                    let mut args = f.args;
                    args.push(Expression::Literal(Literal::String(",".to_string())));
                    args.push(Expression::Literal(Literal::String(":".to_string())));
                    Ok(Expression::Function(Box::new(Function::new(
                        "STR_TO_MAP".to_string(),
                        args,
                    ))))
                } else {
                    Ok(Expression::Function(Box::new(f)))
                }
            }

            // POSITION is native to Spark (POSITION(substr IN str))
            "POSITION" => Ok(Expression::Function(Box::new(f))),

            // LOCATE is native to Spark
            "LOCATE" => Ok(Expression::Function(Box::new(f))),

            // STRPOS(str, substr) -> LOCATE(substr, str): same semantics, swapped argument order
            "STRPOS" if f.args.len() == 2 => {
                let mut args = f.args;
                let first = args.remove(0);
                let second = args.remove(0);
                // LOCATE(substr, str) in Spark
                Ok(Expression::Function(Box::new(Function::new(
                    "LOCATE".to_string(),
                    vec![second, first],
                ))))
            }

            // CHARINDEX(substr, str[, start]) -> LOCATE (same argument order)
            "CHARINDEX" if f.args.len() >= 2 => {
                let mut args = f.args;
                let substring = args.remove(0);
                let string = args.remove(0);
                let mut locate_args = vec![substring, string];
                if !args.is_empty() {
                    locate_args.push(args.remove(0));
                }
                Ok(Expression::Function(Box::new(Function::new(
                    "LOCATE".to_string(),
                    locate_args,
                ))))
            }

            // INSTR is native to Spark
            "INSTR" => Ok(Expression::Function(Box::new(f))),

            // CEILING -> CEIL; CEIL is native to Spark
            "CEILING" | "CEIL" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
                this: f.args.into_iter().next().unwrap(),
                decimals: None,
                to: None,
            }))),

            // UNNEST -> EXPLODE
            "UNNEST" => Ok(Expression::Function(Box::new(Function::new(
                "EXPLODE".to_string(),
                f.args,
            )))),

            // FLATTEN is native to Spark (flattens nested arrays)
            "FLATTEN" => Ok(Expression::Function(Box::new(f))),

            // ARRAY_AGG -> COLLECT_LIST
            "ARRAY_AGG" => Ok(Expression::Function(Box::new(Function::new(
                "COLLECT_LIST".to_string(),
                f.args,
            )))),

            // COLLECT_LIST is native to Spark
            "COLLECT_LIST" => Ok(Expression::Function(Box::new(f))),

            // COLLECT_SET is native to Spark
            "COLLECT_SET" => Ok(Expression::Function(Box::new(f))),

            // ARRAY_LENGTH / CARDINALITY -> SIZE in Spark
            "ARRAY_LENGTH" | "CARDINALITY" => Ok(Expression::Function(Box::new(Function::new(
                "SIZE".to_string(),
                f.args,
            )))),

            // SIZE is native to Spark
            "SIZE" => Ok(Expression::Function(Box::new(f))),

            // SPLIT is native to Spark
            "SPLIT" => Ok(Expression::Function(Box::new(f))),

            // REGEXP_REPLACE: Spark supports up to 4 args (subject, pattern, replacement, position)
            // Strip extra Snowflake args (occurrence, params) if present
            "REGEXP_REPLACE" if f.args.len() > 4 => {
                let mut args = f.args;
                args.truncate(4);
                Ok(Expression::Function(Box::new(Function::new(
                    "REGEXP_REPLACE".to_string(),
                    args,
                ))))
            }
            "REGEXP_REPLACE" => Ok(Expression::Function(Box::new(f))),

            // REGEXP_EXTRACT is native to Spark
            "REGEXP_EXTRACT" => Ok(Expression::Function(Box::new(f))),

            // REGEXP_EXTRACT_ALL is native to Spark
            "REGEXP_EXTRACT_ALL" => Ok(Expression::Function(Box::new(f))),

            // RLIKE is native to Spark; REGEXP_LIKE is normalized to it
            "RLIKE" | "REGEXP_LIKE" => Ok(Expression::Function(Box::new(Function::new(
                "RLIKE".to_string(),
                f.args,
            )))),

            // JSON_EXTRACT -> GET_JSON_OBJECT (Hive style; Databricks also has the : operator)
            "JSON_EXTRACT" => Ok(Expression::Function(Box::new(Function::new(
                "GET_JSON_OBJECT".to_string(),
                f.args,
            )))),

            // JSON_EXTRACT_SCALAR -> GET_JSON_OBJECT
            "JSON_EXTRACT_SCALAR" => Ok(Expression::Function(Box::new(Function::new(
                "GET_JSON_OBJECT".to_string(),
                f.args,
            )))),

            // GET_JSON_OBJECT is native to Spark
            "GET_JSON_OBJECT" => Ok(Expression::Function(Box::new(f))),

            // FROM_JSON is native to Spark
            "FROM_JSON" => Ok(Expression::Function(Box::new(f))),

            // TO_JSON is native to Spark
            "TO_JSON" => Ok(Expression::Function(Box::new(f))),

            // PARSE_JSON: with one arg, keep just the string argument for Spark;
            // otherwise fall back to FROM_JSON
            "PARSE_JSON" if f.args.len() == 1 => Ok(f.args.into_iter().next().unwrap()),
            "PARSE_JSON" => Ok(Expression::Function(Box::new(Function::new(
                "FROM_JSON".to_string(),
                f.args,
            )))),

            // DATEDIFF is native to Spark (supports a unit in Spark 3+)
            "DATEDIFF" | "DATE_DIFF" => Ok(Expression::Function(Box::new(Function::new(
                "DATEDIFF".to_string(),
                f.args,
            )))),

            // DATE_ADD is native to Spark
            "DATE_ADD" | "DATEADD" => Ok(Expression::Function(Box::new(Function::new(
                "DATE_ADD".to_string(),
                f.args,
            )))),

            // DATE_SUB is native to Spark
            "DATE_SUB" => Ok(Expression::Function(Box::new(f))),

            // TIMESTAMPADD is native to Spark 3+
            "TIMESTAMPADD" => Ok(Expression::Function(Box::new(f))),

            // TIMESTAMPDIFF is native to Spark 3+
            "TIMESTAMPDIFF" => Ok(Expression::Function(Box::new(f))),

            // ADD_MONTHS is native to Spark
            "ADD_MONTHS" => Ok(Expression::Function(Box::new(f))),

            // MONTHS_BETWEEN is native to Spark
            "MONTHS_BETWEEN" => Ok(Expression::Function(Box::new(f))),

            // NVL is native to Spark
            "NVL" => Ok(Expression::Function(Box::new(f))),

            // NVL2 is native to Spark
            "NVL2" => Ok(Expression::Function(Box::new(f))),

            // MAP is native to Spark
            "MAP" => Ok(Expression::Function(Box::new(f))),

            // ARRAY is native to Spark
            "ARRAY" => Ok(Expression::Function(Box::new(f))),

            // ROW -> STRUCT for Spark (cross-dialect, no auto-naming)
            "ROW" => Ok(Expression::Function(Box::new(Function::new(
                "STRUCT".to_string(),
                f.args,
            )))),

            // STRUCT is native to Spark - auto-name unnamed args as col1, col2, etc.
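            // e.g. STRUCT(a, 1 + 1) -> STRUCT(a AS a, 1 + 1 AS col2): the index
            // tracks argument position, so already-named args still consume a slot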
            "STRUCT" => {
                let mut col_idx = 1usize;
                let named_args: Vec<Expression> = f.args.into_iter().map(|arg| {
                    let current_idx = col_idx;
                    col_idx += 1;
                    // Check if the arg already has an alias (AS name) or is Star
                    match &arg {
                        Expression::Alias(_) => arg, // already named
                        Expression::Star(_) => arg,  // STRUCT(*) - keep as-is
                        Expression::Column(c) if c.table.is_none() => {
                            // Column reference: use the column name as the struct field name
                            let name = c.name.name.clone();
                            Expression::Alias(Box::new(crate::expressions::Alias {
                                this: arg,
                                alias: crate::expressions::Identifier::new(&name),
                                column_aliases: Vec::new(),
                                pre_alias_comments: Vec::new(),
                                trailing_comments: Vec::new(),
                            }))
                        }
                        _ => {
                            // Unnamed literal/expression: auto-name as colN
                            let name = format!("col{}", current_idx);
                            Expression::Alias(Box::new(crate::expressions::Alias {
                                this: arg,
                                alias: crate::expressions::Identifier::new(&name),
                                column_aliases: Vec::new(),
                                pre_alias_comments: Vec::new(),
                                trailing_comments: Vec::new(),
                            }))
                        }
                    }
                }).collect();
                Ok(Expression::Function(Box::new(Function {
                    name: "STRUCT".to_string(),
                    args: named_args,
                    distinct: false,
                    trailing_comments: Vec::new(),
                    use_bracket_syntax: false,
                    no_parens: false,
                    quoted: false,
                })))
            }

            // NAMED_STRUCT is native to Spark
            "NAMED_STRUCT" => Ok(Expression::Function(Box::new(f))),

            // MAP_FROM_ARRAYS is native to Spark
            "MAP_FROM_ARRAYS" => Ok(Expression::Function(Box::new(f))),

            // ARRAY_SORT is native to Spark
            "ARRAY_SORT" => Ok(Expression::Function(Box::new(f))),

            // ARRAY_DISTINCT is native to Spark
            "ARRAY_DISTINCT" => Ok(Expression::Function(Box::new(f))),

            // ARRAY_UNION is native to Spark
            "ARRAY_UNION" => Ok(Expression::Function(Box::new(f))),

            // ARRAY_INTERSECT is native to Spark
            "ARRAY_INTERSECT" => Ok(Expression::Function(Box::new(f))),

            // ARRAY_EXCEPT is native to Spark
            "ARRAY_EXCEPT" => Ok(Expression::Function(Box::new(f))),

            // ARRAY_CONTAINS is native to Spark
            "ARRAY_CONTAINS" => Ok(Expression::Function(Box::new(f))),

            // ELEMENT_AT is native to Spark
            "ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),

            // TRY_ELEMENT_AT is native to Spark 3+
            "TRY_ELEMENT_AT" => Ok(Expression::Function(Box::new(f))),

            // TRANSFORM is native to Spark (array transformation)
            "TRANSFORM" => Ok(Expression::Function(Box::new(f))),

            // FILTER is native to Spark (array filtering)
            "FILTER" => Ok(Expression::Function(Box::new(f))),

            // AGGREGATE is native to Spark (array reduction)
            "AGGREGATE" => Ok(Expression::Function(Box::new(f))),

            // SEQUENCE is native to Spark (generates an array)
            "SEQUENCE" => Ok(Expression::Function(Box::new(f))),

            // GENERATE_SERIES -> SEQUENCE
            "GENERATE_SERIES" => Ok(Expression::Function(Box::new(Function::new(
                "SEQUENCE".to_string(),
                f.args,
            )))),

            // STARTSWITH is native to Spark 3+
            "STARTSWITH" | "STARTS_WITH" => Ok(Expression::Function(Box::new(Function::new(
                "STARTSWITH".to_string(),
                f.args,
            )))),

            // ENDSWITH is native to Spark 3+
            "ENDSWITH" | "ENDS_WITH" => Ok(Expression::Function(Box::new(Function::new(
                "ENDSWITH".to_string(),
                f.args,
            )))),

            // ARRAY_CONSTRUCT_COMPACT(1, null, 2) -> ARRAY_COMPACT(ARRAY(1, NULL, 2))
            "ARRAY_CONSTRUCT_COMPACT" => {
                let inner = Expression::Function(Box::new(Function::new("ARRAY".to_string(), f.args)));
                Ok(Expression::Function(Box::new(Function::new(
                    "ARRAY_COMPACT".to_string(),
                    vec![inner],
                ))))
            }

            // ARRAY_TO_STRING -> ARRAY_JOIN
            "ARRAY_TO_STRING" => Ok(Expression::Function(Box::new(Function::new(
                "ARRAY_JOIN".to_string(),
                f.args,
            )))),

            // TO_ARRAY(x) -> IF(x IS NULL, NULL, ARRAY(x))
            "TO_ARRAY" if f.args.len() == 1 => {
                let x = f.args[0].clone();
                // If the arg is already an array constructor (bracket notation),
                // simplify: TO_ARRAY(['test']) -> ARRAY('test')
                match &x {
                    Expression::ArrayFunc(arr) => {
                        // Just convert to an ARRAY(...) function call
                        Ok(Expression::Function(Box::new(Function::new(
                            "ARRAY".to_string(),
                            arr.expressions.clone(),
                        ))))
                    }
                    _ => {
                        Ok(Expression::IfFunc(Box::new(crate::expressions::IfFunc {
                            condition: Expression::IsNull(Box::new(crate::expressions::IsNull {
                                this: x.clone(),
                                not: false,
                                postfix_form: false,
                            })),
                            true_value: Expression::Null(crate::expressions::Null),
                            false_value: Some(Expression::Function(Box::new(Function::new(
                                "ARRAY".to_string(),
                                vec![x],
                            )))),
                            original_name: Some("IF".to_string()),
                        })))
                    }
                }
            }

            // REGEXP_SUBSTR -> REGEXP_EXTRACT, dropping args Spark doesn't take
            // Snowflake: REGEXP_SUBSTR(subject, pattern[, position, occurrence, params, group])
            "REGEXP_SUBSTR" if f.args.len() >= 2 => {
                let subject = f.args[0].clone();
                let pattern = f.args[1].clone();
                // Spark: REGEXP_EXTRACT(subject, pattern[, group]); the group index
                // defaults to 1, so an explicit group of 1 can be omitted
                let group = if f.args.len() >= 6 {
                    let g = &f.args[5];
                    // If group is literal 1 (Spark's default), omit it
                    if matches!(g, Expression::Literal(Literal::Number(n)) if n == "1") {
                        None
                    } else {
                        Some(g.clone())
                    }
                } else {
                    None
                };
                let mut args = vec![subject, pattern];
                if let Some(g) = group {
                    args.push(g);
                }
                Ok(Expression::Function(Box::new(Function::new(
                    "REGEXP_EXTRACT".to_string(),
                    args,
                ))))
            }

            // UUID_STRING -> UUID()
            "UUID_STRING" => Ok(Expression::Function(Box::new(Function::new(
                "UUID".to_string(),
                vec![],
            )))),

            // OBJECT_CONSTRUCT -> STRUCT in Spark
            "OBJECT_CONSTRUCT" if f.args.len() >= 2 && f.args.len() % 2 == 0 => {
                // Convert key-value pairs to named struct fields:
                // OBJECT_CONSTRUCT('Manitoba', 'Winnipeg', 'foo', 'bar')
                // -> STRUCT('Winnipeg' AS Manitoba, 'bar' AS foo)
                let mut struct_args = Vec::new();
                for pair in f.args.chunks(2) {
                    if let Expression::Literal(Literal::String(key)) = &pair[0] {
                        struct_args.push(Expression::Alias(Box::new(crate::expressions::Alias {
                            this: pair[1].clone(),
                            alias: crate::expressions::Identifier::new(key.clone()),
                            column_aliases: vec![],
                            pre_alias_comments: vec![],
                            trailing_comments: vec![],
                        })));
                    } else {
                        struct_args.push(pair[1].clone());
                    }
                }
                Ok(Expression::Function(Box::new(Function::new(
                    "STRUCT".to_string(),
                    struct_args,
                ))))
            }

            // DATE_PART(part, expr) -> EXTRACT(part FROM expr)
            "DATE_PART" if f.args.len() == 2 => {
                let mut args = f.args;
                let part = args.remove(0);
                let expr = args.remove(0);
                if let Some(field) = expr_to_datetime_field(&part) {
                    Ok(Expression::Extract(Box::new(ExtractFunc {
                        this: expr,
                        field,
                    })))
                } else {
                    // Can't parse the field, keep as a function call
                    Ok(Expression::Function(Box::new(Function::new(
                        "DATE_PART".to_string(),
                        vec![part, expr],
                    ))))
                }
            }

            // GET_PATH(obj, path) -> GET_JSON_OBJECT(obj, json_path) in Spark
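            // e.g. GET_PATH(obj, 'a.b') -> GET_JSON_OBJECT(obj, '$.a.b');
            //      GET_PATH(obj, '[0]') -> GET_JSON_OBJECT(obj, '$[0]')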
            "GET_PATH" if f.args.len() == 2 => {
                let mut args = f.args;
                let this = args.remove(0);
                let path = args.remove(0);
                let json_path = match &path {
                    Expression::Literal(Literal::String(s)) => {
                        let normalized = if s.starts_with('$') {
                            s.clone()
                        } else if s.starts_with('[') {
                            format!("${}", s)
                        } else {
                            format!("$.{}", s)
                        };
                        Expression::Literal(Literal::String(normalized))
                    }
                    _ => path,
                };
                Ok(Expression::Function(Box::new(Function::new(
                    "GET_JSON_OBJECT".to_string(),
                    vec![this, json_path],
                ))))
            }

            // BITWISE_LEFT_SHIFT -> SHIFTLEFT
            "BITWISE_LEFT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
                "SHIFTLEFT".to_string(),
                f.args,
            )))),

            // BITWISE_RIGHT_SHIFT -> SHIFTRIGHT
            "BITWISE_RIGHT_SHIFT" => Ok(Expression::Function(Box::new(Function::new(
                "SHIFTRIGHT".to_string(),
                f.args,
            )))),

            // APPROX_DISTINCT -> APPROX_COUNT_DISTINCT
            "APPROX_DISTINCT" => Ok(Expression::Function(Box::new(Function::new(
                "APPROX_COUNT_DISTINCT".to_string(),
                f.args,
            )))),

            // ARRAY_SLICE -> SLICE
            "ARRAY_SLICE" => Ok(Expression::Function(Box::new(Function::new(
                "SLICE".to_string(),
                f.args,
            )))),

            // DATE_FROM_PARTS -> MAKE_DATE
            "DATE_FROM_PARTS" => Ok(Expression::Function(Box::new(Function::new(
                "MAKE_DATE".to_string(),
                f.args,
            )))),

            // DAYOFWEEK_ISO -> DAYOFWEEK
            "DAYOFWEEK_ISO" => Ok(Expression::Function(Box::new(Function::new(
                "DAYOFWEEK".to_string(),
                f.args,
            )))),

            // FORMAT -> FORMAT_STRING
            "FORMAT" => Ok(Expression::Function(Box::new(Function::new(
                "FORMAT_STRING".to_string(),
                f.args,
            )))),

            // LOGICAL_AND -> BOOL_AND
            "LOGICAL_AND" => Ok(Expression::Function(Box::new(Function::new(
                "BOOL_AND".to_string(),
                f.args,
            )))),

            // VARIANCE_POP -> VAR_POP
            "VARIANCE_POP" => Ok(Expression::Function(Box::new(Function::new(
                "VAR_POP".to_string(),
                f.args,
            )))),

            // WEEK_OF_YEAR -> WEEKOFYEAR
            "WEEK_OF_YEAR" => Ok(Expression::Function(Box::new(Function::new(
                "WEEKOFYEAR".to_string(),
                f.args,
            )))),

            // Pass through everything else
            _ => Ok(Expression::Function(Box::new(f))),
        }
    }

    fn transform_aggregate_function(
        &self,
        f: Box<crate::expressions::AggregateFunction>,
    ) -> Result<Expression> {
        let name_upper = f.name.to_uppercase();
        match name_upper.as_str() {
            // GROUP_CONCAT -> COLLECT_LIST (then CONCAT_WS for a string result)
            "GROUP_CONCAT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
                Function::new("COLLECT_LIST".to_string(), f.args),
            ))),

            // STRING_AGG -> COLLECT_LIST (or STRING_AGG in Spark 4+)
            "STRING_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
                "COLLECT_LIST".to_string(),
                f.args,
            )))),

            // LISTAGG -> COLLECT_LIST
            "LISTAGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
                "COLLECT_LIST".to_string(),
                f.args,
            )))),

            // ARRAY_AGG -> COLLECT_LIST (preserve distinct and filter)
            "ARRAY_AGG" if !f.args.is_empty() => {
                let mut af = f;
                af.name = "COLLECT_LIST".to_string();
                Ok(Expression::AggregateFunction(af))
            }

            // LOGICAL_OR -> BOOL_OR in Spark
            "LOGICAL_OR" if !f.args.is_empty() => {
                let mut af = f;
                af.name = "BOOL_OR".to_string();
                Ok(Expression::AggregateFunction(af))
            }

            // Pass through everything else
            _ => Ok(Expression::AggregateFunction(f)),
        }
    }
}

/// Convert an expression (string literal, identifier, or bare column name) to a DateTimeField
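/// e.g. "mon" -> Month, "dow" -> DayOfWeek; unrecognized names fall back to
/// DateTimeField::Custom, so this returns None only for non-name expressions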
fn expr_to_datetime_field(expr: &Expression) -> Option<DateTimeField> {
    let name = match expr {
        Expression::Literal(Literal::String(s)) => s.to_uppercase(),
        Expression::Identifier(id) => id.name.to_uppercase(),
        Expression::Column(col) if col.table.is_none() => col.name.name.to_uppercase(),
        _ => return None,
    };
    match name.as_str() {
        "YEAR" | "Y" | "YY" | "YYY" | "YYYY" | "YR" | "YEARS" | "YRS" => Some(DateTimeField::Year),
        "MONTH" | "MM" | "MON" | "MONS" | "MONTHS" => Some(DateTimeField::Month),
        "DAY" | "D" | "DD" | "DAYS" | "DAYOFMONTH" => Some(DateTimeField::Day),
        "HOUR" | "H" | "HH" | "HR" | "HOURS" | "HRS" => Some(DateTimeField::Hour),
        "MINUTE" | "MI" | "MIN" | "MINUTES" | "MINS" => Some(DateTimeField::Minute),
        "SECOND" | "S" | "SEC" | "SECONDS" | "SECS" => Some(DateTimeField::Second),
        "MILLISECOND" | "MS" | "MSEC" | "MILLISECONDS" => Some(DateTimeField::Millisecond),
        "MICROSECOND" | "US" | "USEC" | "MICROSECONDS" => Some(DateTimeField::Microsecond),
        "DOW" | "DAYOFWEEK" | "DAYOFWEEK_ISO" | "DW" => Some(DateTimeField::DayOfWeek),
        "DOY" | "DAYOFYEAR" => Some(DateTimeField::DayOfYear),
        "WEEK" | "W" | "WK" | "WEEKOFYEAR" | "WOY" => Some(DateTimeField::Week),
        "QUARTER" | "Q" | "QTR" | "QTRS" | "QUARTERS" => Some(DateTimeField::Quarter),
        "EPOCH" | "EPOCH_SECOND" | "EPOCH_SECONDS" => Some(DateTimeField::Epoch),
        "TIMEZONE" | "TIMEZONE_HOUR" | "TZH" => Some(DateTimeField::TimezoneHour),
        "TIMEZONE_MINUTE" | "TZM" => Some(DateTimeField::TimezoneMinute),
        _ => Some(DateTimeField::Custom(name)),
    }
}
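
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of the rewrite path: IFNULL(a, b) should lower to
    // COALESCE. Literal payloads are assumed to be plain Strings, matching
    // the patterns used in the transforms above.
    #[test]
    fn ifnull_becomes_coalesce() {
        let func = Function::new(
            "IFNULL".to_string(),
            vec![
                Expression::Literal(Literal::String("a".to_string())),
                Expression::Literal(Literal::String("b".to_string())),
            ],
        );
        let out = SparkDialect
            .transform_expr(Expression::Function(Box::new(func)))
            .unwrap();
        assert!(matches!(out, Expression::Coalesce(_)));
    }
}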