Skip to main content

polyglot_sql/dialects/
bigquery.rs

1//! BigQuery Dialect
2//!
3//! BigQuery-specific transformations based on sqlglot patterns.
4//! Key differences:
5//! - Uses backticks for identifiers (especially for project.dataset.table)
6//! - SAFE_ prefix for safe operations
7//! - Different date/time function names (DATE_DIFF, FORMAT_DATE, PARSE_DATE)
8//! - STRUCT and ARRAY syntax differences
9//! - No ILIKE support
10//! - QUALIFY clause support
11
12use super::{DialectImpl, DialectType};
13use crate::error::Result;
14use crate::expressions::{
15    Alias, BinaryOp, CeilFunc, Column, Exists, Expression, From, Function, FunctionBody,
16    Identifier, JsonExtractFunc, LikeOp, Literal, Select, SplitFunc, StringAggFunc, UnaryFunc,
17    UnnestFunc, VarArgFunc, Where,
18};
19use crate::generator::GeneratorConfig;
20use crate::tokens::TokenizerConfig;
21
22/// BigQuery dialect
23pub struct BigQueryDialect;
24
25impl DialectImpl for BigQueryDialect {
26    fn dialect_type(&self) -> DialectType {
27        DialectType::BigQuery
28    }
29
30    fn tokenizer_config(&self) -> TokenizerConfig {
31        let mut config = TokenizerConfig::default();
32        // BigQuery uses backticks for identifiers, NOT double quotes
33        // Remove double quote from identifiers (it's in the default config)
34        config.identifiers.remove(&'"');
35        config.identifiers.insert('`', '`');
36        // BigQuery supports double quotes for strings (in addition to single quotes)
37        config.quotes.insert("\"".to_string(), "\"".to_string());
38        // BigQuery supports triple-quoted strings
39        config.quotes.insert("'''".to_string(), "'''".to_string());
40        config
41            .quotes
42            .insert("\"\"\"".to_string(), "\"\"\"".to_string());
43        // BigQuery supports backslash escaping in strings
44        config.string_escapes = vec!['\'', '\\'];
45        // In BigQuery, b'...' is a byte string (bytes), not a bit string (binary digits)
46        config.b_prefix_is_byte_string = true;
47        // BigQuery supports hex number strings like 0xA, 0xFF
48        config.hex_number_strings = true;
49        // BigQuery: 0xA represents integer 10 (not binary/blob)
50        config.hex_string_is_integer_type = true;
51        // BigQuery supports # as single-line comments.
52        config.hash_comments = true;
53        config
54    }
55
56    fn generator_config(&self) -> GeneratorConfig {
57        use crate::generator::{IdentifierQuoteStyle, NormalizeFunctions};
58        GeneratorConfig {
59            identifier_quote: '`',
60            identifier_quote_style: IdentifierQuoteStyle::BACKTICK,
61            dialect: Some(DialectType::BigQuery),
62            // BigQuery doesn't normalize function names (Python: NORMALIZE_FUNCTIONS = False)
63            normalize_functions: NormalizeFunctions::None,
64            // BigQuery-specific settings from Python sqlglot
65            interval_allows_plural_form: false,
66            join_hints: false,
67            query_hints: false,
68            table_hints: false,
69            limit_fetch_style: crate::generator::LimitFetchStyle::Limit,
70            rename_table_with_db: false,
71            nvl2_supported: false,
72            unnest_with_ordinality: false,
73            collate_is_func: true,
74            limit_only_literals: true,
75            supports_table_alias_columns: false,
76            unpivot_aliases_are_identifiers: false,
77            json_key_value_pair_sep: ",",
78            null_ordering_supported: false,
79            ignore_nulls_in_func: true,
80            json_path_single_quote_escape: true,
81            can_implement_array_any: true,
82            supports_to_number: false,
83            named_placeholder_token: "@",
84            hex_func: "TO_HEX",
85            with_properties_prefix: "OPTIONS",
86            supports_exploding_projections: false,
87            except_intersect_support_all_clause: false,
88            supports_unix_seconds: true,
89            // BigQuery uses SAFE_ prefix for safe operations
90            try_supported: true,
91            // BigQuery does not support SEMI/ANTI JOIN syntax
92            semi_anti_join_with_side: false,
93            ..Default::default()
94        }
95    }
96
97    fn transform_expr(&self, expr: Expression) -> Result<Expression> {
98        match expr {
99            // ===== Data Type Mappings =====
100            Expression::DataType(dt) => self.transform_data_type(dt),
101
102            // ===== Null handling =====
103            // IFNULL is native to BigQuery - keep as-is for identity
104            Expression::IfNull(f) => Ok(Expression::IfNull(f)),
105
106            // NVL -> IFNULL in BigQuery (BigQuery uses IFNULL, not NVL)
107            Expression::Nvl(f) => Ok(Expression::IfNull(f)),
108
109            // Coalesce stays as Coalesce
110            Expression::Coalesce(f) => Ok(Expression::Coalesce(f)),
111
112            // ===== String aggregation =====
113            // GROUP_CONCAT -> STRING_AGG in BigQuery
114            Expression::GroupConcat(f) => Ok(Expression::StringAgg(Box::new(StringAggFunc {
115                this: f.this,
116                separator: f.separator,
117                order_by: f.order_by,
118                distinct: f.distinct,
119                filter: f.filter,
120                limit: None,
121                inferred_type: None,
122            }))),
123
124            // ===== Cast operations =====
125            // Cast data types are transformed by transform_recursive in mod.rs
126            // which calls transform_data_type via transform_expr(Expression::DataType(...))
127
128            // TryCast -> SafeCast in BigQuery with type transformation
129            Expression::TryCast(c) => {
130                let transformed_type = match self.transform_data_type(c.to)? {
131                    Expression::DataType(dt) => dt,
132                    _ => return Err(crate::error::Error::parse("Expected DataType", 0, 0, 0, 0)),
133                };
134                Ok(Expression::SafeCast(Box::new(crate::expressions::Cast {
135                    this: c.this,
136                    to: transformed_type,
137                    trailing_comments: c.trailing_comments,
138                    double_colon_syntax: c.double_colon_syntax,
139                    format: c.format,
140                    default: c.default,
141                    inferred_type: None,
142                })))
143            }
144
145            // ===== Pattern matching =====
146            // ILIKE -> LOWER() LIKE LOWER() in BigQuery (no ILIKE support)
147            Expression::ILike(op) => {
148                let lower_left = Expression::Lower(Box::new(UnaryFunc::new(op.left)));
149                let lower_right = Expression::Lower(Box::new(UnaryFunc::new(op.right)));
150                Ok(Expression::Like(Box::new(LikeOp {
151                    left: lower_left,
152                    right: lower_right,
153                    escape: op.escape,
154                    quantifier: op.quantifier,
155                    inferred_type: None,
156                })))
157            }
158
159            // RegexpLike -> REGEXP_CONTAINS in BigQuery
160            Expression::RegexpLike(f) => Ok(Expression::Function(Box::new(Function::new(
161                "REGEXP_CONTAINS".to_string(),
162                vec![f.this, f.pattern],
163            )))),
164
165            // ===== Array operations =====
166            // EXPLODE -> UNNEST in BigQuery
167            Expression::Explode(f) => Ok(Expression::Unnest(Box::new(
168                crate::expressions::UnnestFunc {
169                    this: f.this,
170                    expressions: Vec::new(),
171                    with_ordinality: false,
172                    alias: None,
173                    offset_alias: None,
174                },
175            ))),
176
177            // ExplodeOuter -> UNNEST with LEFT JOIN semantics
178            Expression::ExplodeOuter(f) => Ok(Expression::Unnest(Box::new(
179                crate::expressions::UnnestFunc {
180                    this: f.this,
181                    expressions: Vec::new(),
182                    with_ordinality: false,
183                    alias: None,
184                    offset_alias: None,
185                },
186            ))),
187
188            // GenerateSeries -> GENERATE_ARRAY in BigQuery
189            Expression::GenerateSeries(f) => {
190                let mut args = Vec::new();
191                if let Some(start) = f.start {
192                    args.push(*start);
193                }
194                if let Some(end) = f.end {
195                    args.push(*end);
196                }
197                if let Some(step) = f.step {
198                    args.push(*step);
199                }
200                Ok(Expression::Function(Box::new(Function::new(
201                    "GENERATE_ARRAY".to_string(),
202                    args,
203                ))))
204            }
205
206            // ===== Bitwise operations =====
207            // BitwiseAndAgg -> BIT_AND
208            Expression::BitwiseAndAgg(f) => Ok(Expression::Function(Box::new(Function::new(
209                "BIT_AND".to_string(),
210                vec![f.this],
211            )))),
212
213            // BitwiseOrAgg -> BIT_OR
214            Expression::BitwiseOrAgg(f) => Ok(Expression::Function(Box::new(Function::new(
215                "BIT_OR".to_string(),
216                vec![f.this],
217            )))),
218
219            // BitwiseXorAgg -> BIT_XOR
220            Expression::BitwiseXorAgg(f) => Ok(Expression::Function(Box::new(Function::new(
221                "BIT_XOR".to_string(),
222                vec![f.this],
223            )))),
224
225            // BitwiseCount -> BIT_COUNT
226            Expression::BitwiseCount(f) => Ok(Expression::Function(Box::new(Function::new(
227                "BIT_COUNT".to_string(),
228                vec![f.this],
229            )))),
230
231            // ByteLength -> BYTE_LENGTH
232            Expression::ByteLength(f) => Ok(Expression::Function(Box::new(Function::new(
233                "BYTE_LENGTH".to_string(),
234                vec![f.this],
235            )))),
236
237            // IntDiv -> DIV
238            Expression::IntDiv(f) => Ok(Expression::Function(Box::new(Function::new(
239                "DIV".to_string(),
240                vec![f.this, f.expression],
241            )))),
242
243            // Int64 -> INT64
244            Expression::Int64(f) => Ok(Expression::Function(Box::new(Function::new(
245                "INT64".to_string(),
246                vec![f.this],
247            )))),
248
249            // ===== Random =====
250            // RANDOM -> RAND in BigQuery
251            Expression::Random(_) => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
252                seed: None,
253                lower: None,
254                upper: None,
255            }))),
256
257            // ===== UUID =====
258            // Uuid -> GENERATE_UUID in BigQuery
259            Expression::Uuid(_) => Ok(Expression::Function(Box::new(Function::new(
260                "GENERATE_UUID".to_string(),
261                vec![],
262            )))),
263
264            // ===== Approximate functions =====
265            // ApproxDistinct -> APPROX_COUNT_DISTINCT
266            Expression::ApproxDistinct(f) => Ok(Expression::Function(Box::new(Function::new(
267                "APPROX_COUNT_DISTINCT".to_string(),
268                vec![f.this],
269            )))),
270
271            // ArgMax -> MAX_BY in BigQuery
272            Expression::ArgMax(f) => Ok(Expression::Function(Box::new(Function::new(
273                "MAX_BY".to_string(),
274                vec![*f.this, *f.expression],
275            )))),
276
277            // ArgMin -> MIN_BY in BigQuery
278            Expression::ArgMin(f) => Ok(Expression::Function(Box::new(Function::new(
279                "MIN_BY".to_string(),
280                vec![*f.this, *f.expression],
281            )))),
282
283            // ===== Conditional =====
284            // CountIf -> COUNTIF in BigQuery
285            Expression::CountIf(f) => Ok(Expression::Function(Box::new(Function::new(
286                "COUNTIF".to_string(),
287                vec![f.this],
288            )))),
289
290            // ===== String functions =====
291            // StringAgg -> STRING_AGG in BigQuery - keep as-is to preserve ORDER BY
292            Expression::StringAgg(f) => Ok(Expression::StringAgg(f)),
293
294            // ===== Conversion =====
295            // Unhex -> FROM_HEX
296            Expression::Unhex(f) => Ok(Expression::Function(Box::new(Function::new(
297                "FROM_HEX".to_string(),
298                vec![*f.this],
299            )))),
300
301            // UnixToTime -> TIMESTAMP_SECONDS/MILLIS/MICROS based on scale
302            Expression::UnixToTime(f) => {
303                let scale = f.scale.unwrap_or(0);
304                match scale {
305                    0 => Ok(Expression::Function(Box::new(Function::new(
306                        "TIMESTAMP_SECONDS".to_string(),
307                        vec![*f.this],
308                    )))),
309                    3 => Ok(Expression::Function(Box::new(Function::new(
310                        "TIMESTAMP_MILLIS".to_string(),
311                        vec![*f.this],
312                    )))),
313                    6 => Ok(Expression::Function(Box::new(Function::new(
314                        "TIMESTAMP_MICROS".to_string(),
315                        vec![*f.this],
316                    )))),
317                    _ => {
318                        // TIMESTAMP_SECONDS(CAST(value / POWER(10, scale) AS INT64))
319                        let div_expr =
320                            Expression::Div(Box::new(crate::expressions::BinaryOp::new(
321                                *f.this,
322                                Expression::Function(Box::new(Function::new(
323                                    "POWER".to_string(),
324                                    vec![Expression::number(10), Expression::number(scale)],
325                                ))),
326                            )));
327                        let cast_expr = Expression::Cast(Box::new(crate::expressions::Cast {
328                            this: div_expr,
329                            to: crate::expressions::DataType::Custom {
330                                name: "INT64".to_string(),
331                            },
332                            double_colon_syntax: false,
333                            trailing_comments: vec![],
334                            format: None,
335                            default: None,
336                            inferred_type: None,
337                        }));
338                        Ok(Expression::Function(Box::new(Function::new(
339                            "TIMESTAMP_SECONDS".to_string(),
340                            vec![cast_expr],
341                        ))))
342                    }
343                }
344            }
345
346            // ===== Date/time =====
347            // DateDiff -> DATE_DIFF in BigQuery
348            Expression::DateDiff(f) => {
349                // BigQuery: DATE_DIFF(date1, date2, part)
350                let unit_str = match f.unit {
351                    Some(crate::expressions::IntervalUnit::Year) => "YEAR",
352                    Some(crate::expressions::IntervalUnit::Quarter) => "QUARTER",
353                    Some(crate::expressions::IntervalUnit::Month) => "MONTH",
354                    Some(crate::expressions::IntervalUnit::Week) => "WEEK",
355                    Some(crate::expressions::IntervalUnit::Day) => "DAY",
356                    Some(crate::expressions::IntervalUnit::Hour) => "HOUR",
357                    Some(crate::expressions::IntervalUnit::Minute) => "MINUTE",
358                    Some(crate::expressions::IntervalUnit::Second) => "SECOND",
359                    Some(crate::expressions::IntervalUnit::Millisecond) => "MILLISECOND",
360                    Some(crate::expressions::IntervalUnit::Microsecond) => "MICROSECOND",
361                    Some(crate::expressions::IntervalUnit::Nanosecond) => "NANOSECOND",
362                    None => "DAY",
363                };
364                let unit = Expression::Identifier(crate::expressions::Identifier {
365                    name: unit_str.to_string(),
366                    quoted: false,
367                    trailing_comments: Vec::new(),
368                    span: None,
369                });
370                Ok(Expression::Function(Box::new(Function::new(
371                    "DATE_DIFF".to_string(),
372                    vec![f.this, f.expression, unit],
373                ))))
374            }
375
376            // ===== Variance =====
377            // VarPop -> VAR_POP
378            Expression::VarPop(f) => Ok(Expression::Function(Box::new(Function::new(
379                "VAR_POP".to_string(),
380                vec![f.this],
381            )))),
382
383            // ===== Hash functions =====
384            // SHA -> SHA1
385            Expression::SHA(f) => Ok(Expression::Function(Box::new(Function::new(
386                "SHA1".to_string(),
387                vec![f.this],
388            )))),
389
390            // SHA1Digest -> SHA1
391            Expression::SHA1Digest(f) => Ok(Expression::Function(Box::new(Function::new(
392                "SHA1".to_string(),
393                vec![f.this],
394            )))),
395
396            // MD5Digest -> MD5
397            Expression::MD5Digest(f) => Ok(Expression::Function(Box::new(Function::new(
398                "MD5".to_string(),
399                vec![*f.this],
400            )))),
401
402            // ===== Type conversion =====
403            // JSONBool -> BOOL
404            Expression::JSONBool(f) => Ok(Expression::Function(Box::new(Function::new(
405                "BOOL".to_string(),
406                vec![f.this],
407            )))),
408
409            // StringFunc -> STRING
410            Expression::StringFunc(f) => Ok(Expression::Function(Box::new(Function::new(
411                "STRING".to_string(),
412                vec![*f.this],
413            )))),
414
415            // ===== Date/time from parts =====
416            // DateFromUnixDate -> DATE_FROM_UNIX_DATE
417            Expression::DateFromUnixDate(f) => Ok(Expression::Function(Box::new(Function::new(
418                "DATE_FROM_UNIX_DATE".to_string(),
419                vec![f.this],
420            )))),
421
422            // UnixDate -> UNIX_DATE
423            Expression::UnixDate(f) => Ok(Expression::Function(Box::new(Function::new(
424                "UNIX_DATE".to_string(),
425                vec![f.this],
426            )))),
427
428            // TimestampDiff -> TIMESTAMP_DIFF
429            Expression::TimestampDiff(f) => Ok(Expression::Function(Box::new(Function::new(
430                "TIMESTAMP_DIFF".to_string(),
431                vec![*f.this, *f.expression],
432            )))),
433
434            // FromTimeZone -> DATETIME
435            Expression::FromTimeZone(f) => Ok(Expression::Function(Box::new(Function::new(
436                "DATETIME".to_string(),
437                vec![*f.this],
438            )))),
439
440            // TsOrDsToDatetime -> DATETIME
441            Expression::TsOrDsToDatetime(f) => Ok(Expression::Function(Box::new(Function::new(
442                "DATETIME".to_string(),
443                vec![f.this],
444            )))),
445
446            // TsOrDsToTimestamp -> TIMESTAMP
447            Expression::TsOrDsToTimestamp(f) => Ok(Expression::Function(Box::new(Function::new(
448                "TIMESTAMP".to_string(),
449                vec![f.this],
450            )))),
451
452            // ===== IfFunc -> IF in BigQuery =====
453            Expression::IfFunc(f) => {
454                let mut args = vec![f.condition, f.true_value];
455                if let Some(false_val) = f.false_value {
456                    args.push(false_val);
457                } else {
458                    args.push(Expression::Null(crate::expressions::Null));
459                }
460                Ok(Expression::Function(Box::new(Function::new(
461                    "IF".to_string(),
462                    args,
463                ))))
464            }
465
466            // ===== HexString -> FROM_HEX =====
467            Expression::HexStringExpr(f) => Ok(Expression::Function(Box::new(Function::new(
468                "FROM_HEX".to_string(),
469                vec![*f.this],
470            )))),
471
472            // ===== Additional auto-generated transforms from Python sqlglot =====
473            // ApproxTopK -> APPROX_TOP_COUNT
474            Expression::ApproxTopK(f) => {
475                let mut args = vec![*f.this];
476                if let Some(expr) = f.expression {
477                    args.push(*expr);
478                }
479                Ok(Expression::Function(Box::new(Function::new(
480                    "APPROX_TOP_COUNT".to_string(),
481                    args,
482                ))))
483            }
484
485            // SafeDivide -> SAFE_DIVIDE
486            Expression::SafeDivide(f) => Ok(Expression::Function(Box::new(Function::new(
487                "SAFE_DIVIDE".to_string(),
488                vec![*f.this, *f.expression],
489            )))),
490
491            // JSONKeysAtDepth -> JSON_KEYS
492            Expression::JSONKeysAtDepth(f) => Ok(Expression::Function(Box::new(Function::new(
493                "JSON_KEYS".to_string(),
494                vec![*f.this],
495            )))),
496
497            // JSONValueArray -> JSON_VALUE_ARRAY
498            Expression::JSONValueArray(f) => Ok(Expression::Function(Box::new(Function::new(
499                "JSON_VALUE_ARRAY".to_string(),
500                vec![*f.this],
501            )))),
502
503            // DateFromParts -> DATE
504            Expression::DateFromParts(f) => {
505                let mut args = Vec::new();
506                if let Some(y) = f.year {
507                    args.push(*y);
508                }
509                if let Some(m) = f.month {
510                    args.push(*m);
511                }
512                if let Some(d) = f.day {
513                    args.push(*d);
514                }
515                Ok(Expression::Function(Box::new(Function::new(
516                    "DATE".to_string(),
517                    args,
518                ))))
519            }
520
521            // SPLIT: BigQuery defaults to comma separator when none provided
522            // SPLIT(foo) -> SPLIT(foo, ',')
523            Expression::Split(f) => {
524                // Check if delimiter is empty or a placeholder - add default comma
525                let delimiter = match &f.delimiter {
526                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(s) if s.is_empty()) =>
527                    {
528                        let Literal::String(_) = lit.as_ref() else {
529                            unreachable!()
530                        };
531                        Expression::Literal(Box::new(Literal::String(",".to_string())))
532                    }
533                    _ => f.delimiter,
534                };
535                Ok(Expression::Split(Box::new(SplitFunc {
536                    this: f.this,
537                    delimiter,
538                })))
539            }
540
541            // Cast: Transform the target type according to BigQuery TYPE_MAPPING
542            // Special case: CAST to JSON -> PARSE_JSON in BigQuery
543            // Special case: CAST(x AS temporal FORMAT 'fmt') -> PARSE_DATE/PARSE_TIMESTAMP
544            Expression::Cast(c) => {
545                use crate::expressions::DataType;
546                // Check if casting to JSON - use PARSE_JSON instead
547                let is_json = matches!(c.to, DataType::Json | DataType::JsonB)
548                    || matches!(&c.to, DataType::Custom { name } if name.eq_ignore_ascii_case("JSON") || name.eq_ignore_ascii_case("JSONB"));
549                if is_json {
550                    return Ok(Expression::ParseJson(Box::new(UnaryFunc::new(c.this))));
551                }
552                // CAST(x AS temporal_type FORMAT 'fmt') -> PARSE_DATE/PARSE_TIMESTAMP(strftime_fmt, x)
553                if c.format.is_some() {
554                    let is_temporal = matches!(
555                        c.to,
556                        DataType::Date | DataType::Timestamp { .. } | DataType::Time { .. }
557                    ) || matches!(&c.to, DataType::Custom { name } if
558                        name.eq_ignore_ascii_case("TIMESTAMP") ||
559                        name.eq_ignore_ascii_case("DATE") ||
560                        name.eq_ignore_ascii_case("DATETIME") ||
561                        name.eq_ignore_ascii_case("TIME")
562                    );
563                    if is_temporal {
564                        let format_expr = c.format.as_ref().unwrap().as_ref();
565                        // Extract the actual format expr and timezone (if AT TIME ZONE is present)
566                        let (actual_format, timezone) = match format_expr {
567                            Expression::AtTimeZone(ref atz) => {
568                                (atz.this.clone(), Some(atz.zone.clone()))
569                            }
570                            _ => (format_expr.clone(), None),
571                        };
572                        let strftime_fmt = Self::bq_cast_format_to_strftime(&actual_format);
573                        let func_name = match &c.to {
574                            DataType::Date => "PARSE_DATE",
575                            DataType::Custom { name } if name.eq_ignore_ascii_case("DATE") => {
576                                "PARSE_DATE"
577                            }
578                            DataType::Custom { name } if name.eq_ignore_ascii_case("DATETIME") => {
579                                "PARSE_DATETIME"
580                            }
581                            _ => "PARSE_TIMESTAMP",
582                        };
583                        let mut func_args = vec![strftime_fmt, c.this];
584                        if let Some(tz) = timezone {
585                            func_args.push(tz);
586                        }
587                        return Ok(Expression::Function(Box::new(Function::new(
588                            func_name.to_string(),
589                            func_args,
590                        ))));
591                    }
592                }
593                let transformed_type = match self.transform_data_type(c.to)? {
594                    Expression::DataType(dt) => dt,
595                    _ => return Err(crate::error::Error::parse("Expected DataType", 0, 0, 0, 0)),
596                };
597                Ok(Expression::Cast(Box::new(crate::expressions::Cast {
598                    this: c.this,
599                    to: transformed_type,
600                    trailing_comments: c.trailing_comments,
601                    double_colon_syntax: c.double_colon_syntax,
602                    format: c.format,
603                    default: c.default,
604                    inferred_type: None,
605                })))
606            }
607
608            // SafeCast: Transform the target type according to BigQuery TYPE_MAPPING
609            Expression::SafeCast(c) => {
610                let transformed_type = match self.transform_data_type(c.to)? {
611                    Expression::DataType(dt) => dt,
612                    _ => return Err(crate::error::Error::parse("Expected DataType", 0, 0, 0, 0)),
613                };
614                Ok(Expression::SafeCast(Box::new(crate::expressions::Cast {
615                    this: c.this,
616                    to: transformed_type,
617                    trailing_comments: c.trailing_comments,
618                    double_colon_syntax: c.double_colon_syntax,
619                    format: c.format,
620                    default: c.default,
621                    inferred_type: None,
622                })))
623            }
624
625            // ===== SELECT-level transforms =====
626            // BigQuery: GROUP BY expression → alias when both GROUP BY and ORDER BY exist
627            Expression::Select(mut select) => {
628                if select.group_by.is_some() && select.order_by.is_some() {
629                    // Build map: expression → alias name for aliased projections
630                    let aliases: Vec<(Expression, Identifier)> = select
631                        .expressions
632                        .iter()
633                        .filter_map(|e| {
634                            if let Expression::Alias(a) = e {
635                                Some((a.this.clone(), a.alias.clone()))
636                            } else {
637                                None
638                            }
639                        })
640                        .collect();
641
642                    if let Some(ref mut group_by) = select.group_by {
643                        for grouped in group_by.expressions.iter_mut() {
644                            // Skip numeric indices (already aliased)
645                            if matches!(grouped, Expression::Literal(lit) if matches!(lit.as_ref(), Literal::Number(_)))
646                            {
647                                continue;
648                            }
649                            // Check if this GROUP BY expression matches a SELECT alias
650                            for (expr, alias_ident) in &aliases {
651                                if grouped == expr {
652                                    *grouped = Expression::boxed_column(Column {
653                                        name: alias_ident.clone(),
654                                        table: None,
655                                        join_mark: false,
656                                        trailing_comments: Vec::new(),
657                                        span: None,
658                                        inferred_type: None,
659                                    });
660                                    break;
661                                }
662                            }
663                        }
664                    }
665                }
666                Ok(Expression::Select(select))
667            }
668
669            // ===== ArrayContains → EXISTS(SELECT 1 FROM UNNEST(arr) AS _col WHERE _col = val) =====
670            Expression::ArrayContains(f) => {
671                let array_expr = f.this;
672                let value_expr = f.expression;
673
674                // Build: SELECT 1 FROM UNNEST(array) AS _col WHERE _col = value
675                let unnest = Expression::Unnest(Box::new(UnnestFunc {
676                    this: array_expr,
677                    expressions: Vec::new(),
678                    with_ordinality: false,
679                    alias: None,
680                    offset_alias: None,
681                }));
682                let aliased_unnest = Expression::Alias(Box::new(Alias {
683                    this: unnest,
684                    alias: Identifier::new("_col"),
685                    column_aliases: Vec::new(),
686                    pre_alias_comments: Vec::new(),
687                    trailing_comments: Vec::new(),
688                    inferred_type: None,
689                }));
690                let col_ref = Expression::boxed_column(Column {
691                    name: Identifier::new("_col"),
692                    table: None,
693                    join_mark: false,
694                    trailing_comments: Vec::new(),
695                    span: None,
696                    inferred_type: None,
697                });
698                let where_clause = Where {
699                    this: Expression::Eq(Box::new(BinaryOp {
700                        left: col_ref,
701                        right: value_expr,
702                        left_comments: Vec::new(),
703                        operator_comments: Vec::new(),
704                        trailing_comments: Vec::new(),
705                        inferred_type: None,
706                    })),
707                };
708                let inner_select = Expression::Select(Box::new(Select {
709                    expressions: vec![Expression::Literal(Box::new(Literal::Number(
710                        "1".to_string(),
711                    )))],
712                    from: Some(From {
713                        expressions: vec![aliased_unnest],
714                    }),
715                    where_clause: Some(where_clause),
716                    ..Default::default()
717                }));
718                Ok(Expression::Exists(Box::new(Exists {
719                    this: inner_select,
720                    not: false,
721                })))
722            }
723
724            // ===== JSON_OBJECT array form → key-value pairs =====
725            // BigQuery "signature 2": JSON_OBJECT(['a', 'b'], [10, NULL]) → JSON_OBJECT('a', 10, 'b', NULL)
726            Expression::JsonObject(mut f) => {
727                if f.pairs.len() == 1 {
728                    // Extract expressions from both Array and ArrayFunc variants
729                    let keys_exprs = match &f.pairs[0].0 {
730                        Expression::Array(arr) => Some(&arr.expressions),
731                        Expression::ArrayFunc(arr) => Some(&arr.expressions),
732                        _ => None,
733                    };
734                    let vals_exprs = match &f.pairs[0].1 {
735                        Expression::Array(arr) => Some(&arr.expressions),
736                        Expression::ArrayFunc(arr) => Some(&arr.expressions),
737                        _ => None,
738                    };
739                    if let (Some(keys), Some(vals)) = (keys_exprs, vals_exprs) {
740                        if keys.len() == vals.len() {
741                            let new_pairs: Vec<(Expression, Expression)> = keys
742                                .iter()
743                                .zip(vals.iter())
744                                .map(|(k, v)| (k.clone(), v.clone()))
745                                .collect();
746                            f.pairs = new_pairs;
747                        }
748                    }
749                }
750                Ok(Expression::JsonObject(f))
751            }
752
753            // ===== MOD function: unwrap unnecessary Paren from first argument =====
754            // BigQuery normalizes MOD((a + 1), b) -> MOD(a + 1, b)
755            Expression::ModFunc(mut f) => {
756                // Unwrap Paren from first argument if present
757                if let Expression::Paren(paren) = f.this {
758                    f.this = paren.this;
759                }
760                Ok(Expression::ModFunc(f))
761            }
762
763            // JSONExtract with variant_extract (Snowflake colon syntax) -> JSON_EXTRACT
764            Expression::JSONExtract(e) if e.variant_extract.is_some() => {
765                let path = match *e.expression {
766                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
767                        let Literal::String(s) = lit.as_ref() else {
768                            unreachable!()
769                        };
770                        let normalized = if s.starts_with('$') {
771                            s.clone()
772                        } else if s.starts_with('[') {
773                            format!("${}", s)
774                        } else {
775                            format!("$.{}", s)
776                        };
777                        Expression::Literal(Box::new(Literal::String(normalized)))
778                    }
779                    other => other,
780                };
781                Ok(Expression::Function(Box::new(Function::new(
782                    "JSON_EXTRACT".to_string(),
783                    vec![*e.this, path],
784                ))))
785            }
786
787            // Generic function transformations
788            Expression::Function(f) => self.transform_function(*f),
789
790            // Generic aggregate function transformations
791            Expression::AggregateFunction(f) => self.transform_aggregate_function(f),
792
793            // MethodCall: Handle SAFE.PARSE_DATE, SAFE.PARSE_DATETIME, SAFE.PARSE_TIMESTAMP
794            // These are parsed as MethodCall(this=SAFE, method=PARSE_DATE, args=[...])
795            Expression::MethodCall(mc) => self.transform_method_call(*mc),
796
797            // CreateFunction: Convert RETURNS TABLE(...) to RETURNS TABLE <...> for BigQuery
798            // and set is_table_function = true for TABLE FUNCTION syntax
799            Expression::CreateFunction(mut cf) => {
800                if let Some(ref mut rtb) = cf.returns_table_body {
801                    if rtb.starts_with("TABLE (") || rtb.starts_with("TABLE(") {
802                        // Convert TABLE (...) to TABLE <...> with BigQuery types
803                        let inner = if rtb.starts_with("TABLE (") {
804                            &rtb["TABLE (".len()..rtb.len() - 1]
805                        } else {
806                            &rtb["TABLE(".len()..rtb.len() - 1]
807                        };
808                        // Convert common types to BigQuery equivalents
809                        let converted = inner
810                            .replace(" INT,", " INT64,")
811                            .replace(" INT)", " INT64)")
812                            .replace(" INTEGER,", " INT64,")
813                            .replace(" INTEGER)", " INT64)")
814                            .replace(" FLOAT,", " FLOAT64,")
815                            .replace(" FLOAT)", " FLOAT64)")
816                            .replace(" BOOLEAN,", " BOOL,")
817                            .replace(" BOOLEAN)", " BOOL)")
818                            .replace(" VARCHAR", " STRING")
819                            .replace(" TEXT", " STRING");
820                        // Handle trailing type (no comma, no paren)
821                        let converted = if converted.ends_with(" INT") {
822                            format!("{}{}", &converted[..converted.len() - 4], " INT64")
823                        } else {
824                            converted
825                        };
826                        *rtb = format!("TABLE <{}>", converted);
827                        cf.is_table_function = true;
828                    }
829                }
830                // Convert string literal body to expression body for BigQuery TABLE FUNCTIONs only
831                if cf.is_table_function {
832                    if let Some(ref body) = cf.body {
833                        if matches!(body, FunctionBody::StringLiteral(_)) {
834                            if let Some(FunctionBody::StringLiteral(sql)) = cf.body.take() {
835                                // Parse the SQL string into an expression
836                                if let Ok(parsed) = crate::parser::Parser::parse_sql(&sql) {
837                                    if let Some(stmt) = parsed.into_iter().next() {
838                                        cf.body = Some(FunctionBody::Expression(stmt));
839                                    } else {
840                                        cf.body = Some(FunctionBody::StringLiteral(sql));
841                                    }
842                                } else {
843                                    cf.body = Some(FunctionBody::StringLiteral(sql));
844                                }
845                            }
846                        }
847                    }
848                }
849                Ok(Expression::CreateFunction(cf))
850            }
851
852            // Pass through everything else
853            _ => Ok(expr),
854        }
855    }
856}
857
858impl BigQueryDialect {
859    /// Transform data types according to BigQuery TYPE_MAPPING
860    fn transform_data_type(&self, dt: crate::expressions::DataType) -> Result<Expression> {
861        use crate::expressions::DataType;
862        let transformed = match dt {
863            // BIGINT -> INT64
864            DataType::BigInt { .. } => DataType::Custom {
865                name: "INT64".to_string(),
866            },
867            // INT -> INT64
868            DataType::Int { .. } => DataType::Custom {
869                name: "INT64".to_string(),
870            },
871            // SMALLINT -> INT64
872            DataType::SmallInt { .. } => DataType::Custom {
873                name: "INT64".to_string(),
874            },
875            // TINYINT -> INT64
876            DataType::TinyInt { .. } => DataType::Custom {
877                name: "INT64".to_string(),
878            },
879            // FLOAT -> FLOAT64
880            DataType::Float { .. } => DataType::Custom {
881                name: "FLOAT64".to_string(),
882            },
883            // DOUBLE -> FLOAT64
884            DataType::Double { .. } => DataType::Custom {
885                name: "FLOAT64".to_string(),
886            },
887            // BOOLEAN -> BOOL
888            DataType::Boolean => DataType::Custom {
889                name: "BOOL".to_string(),
890            },
891            // CHAR -> STRING
892            DataType::Char { .. } => DataType::Custom {
893                name: "STRING".to_string(),
894            },
895            // VARCHAR -> STRING
896            DataType::VarChar { .. } => DataType::Custom {
897                name: "STRING".to_string(),
898            },
899            // TEXT -> STRING
900            DataType::Text => DataType::Custom {
901                name: "STRING".to_string(),
902            },
903            // STRING(n) -> STRING (BigQuery doesn't support length for STRING)
904            DataType::String { .. } => DataType::Custom {
905                name: "STRING".to_string(),
906            },
907            // BINARY -> BYTES
908            DataType::Binary { .. } => DataType::Custom {
909                name: "BYTES".to_string(),
910            },
911            // VARBINARY -> BYTES
912            DataType::VarBinary { .. } => DataType::Custom {
913                name: "BYTES".to_string(),
914            },
915            // BLOB -> BYTES
916            DataType::Blob => DataType::Custom {
917                name: "BYTES".to_string(),
918            },
919            // DECIMAL -> NUMERIC (BigQuery strips precision in CAST context)
920            DataType::Decimal { .. } => DataType::Custom {
921                name: "NUMERIC".to_string(),
922            },
923            // For BigQuery identity: preserve TIMESTAMP/DATETIME as Custom types
924            // This avoids the issue where parsed TIMESTAMP (timezone: false) would
925            // be converted to DATETIME by the generator
926            DataType::Timestamp {
927                timezone: false, ..
928            } => DataType::Custom {
929                name: "TIMESTAMP".to_string(),
930            },
931            DataType::Timestamp { timezone: true, .. } => DataType::Custom {
932                name: "TIMESTAMP".to_string(),
933            },
934            // UUID -> STRING (BigQuery doesn't have native UUID type)
935            DataType::Uuid => DataType::Custom {
936                name: "STRING".to_string(),
937            },
938            // RECORD -> STRUCT in BigQuery
939            DataType::Custom { ref name } if name.eq_ignore_ascii_case("RECORD") => {
940                DataType::Custom {
941                    name: "STRUCT".to_string(),
942                }
943            }
944            // TIMESTAMPTZ (custom) -> TIMESTAMP
945            DataType::Custom { ref name } if name.eq_ignore_ascii_case("TIMESTAMPTZ") => {
946                DataType::Custom {
947                    name: "TIMESTAMP".to_string(),
948                }
949            }
950            // BYTEINT (custom) -> INT64
951            DataType::Custom { ref name } if name.eq_ignore_ascii_case("BYTEINT") => {
952                DataType::Custom {
953                    name: "INT64".to_string(),
954                }
955            }
956            // Keep all other types as-is
957            other => other,
958        };
959        Ok(Expression::DataType(transformed))
960    }
961
962    fn transform_function(&self, f: Function) -> Result<Expression> {
963        let name_upper = f.name.to_uppercase();
964        match name_upper.as_str() {
965            // IFNULL -> COALESCE (both work in BigQuery)
966            "IFNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
967                original_name: None,
968                expressions: f.args,
969                inferred_type: None,
970            }))),
971
972            // NVL -> COALESCE
973            "NVL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
974                original_name: None,
975                expressions: f.args,
976                inferred_type: None,
977            }))),
978
979            // ISNULL -> COALESCE
980            "ISNULL" if f.args.len() == 2 => Ok(Expression::Coalesce(Box::new(VarArgFunc {
981                original_name: None,
982                expressions: f.args,
983                inferred_type: None,
984            }))),
985
986            // GROUP_CONCAT -> STRING_AGG in BigQuery
987            "GROUP_CONCAT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
988                Function::new("STRING_AGG".to_string(), f.args),
989            ))),
990
991            // SUBSTR -> SUBSTRING (both work)
992            "SUBSTR" => Ok(Expression::Function(Box::new(Function::new(
993                "SUBSTRING".to_string(),
994                f.args,
995            )))),
996
997            // RANDOM -> RAND
998            "RANDOM" => Ok(Expression::Rand(Box::new(crate::expressions::Rand {
999                seed: None,
1000                lower: None,
1001                upper: None,
1002            }))),
1003
1004            // CURRENT_DATE -> CURRENT_DATE() in BigQuery
1005            // Keep as Function when it has args (e.g., CURRENT_DATE('UTC'))
1006            "CURRENT_DATE" if f.args.is_empty() => {
1007                Ok(Expression::CurrentDate(crate::expressions::CurrentDate))
1008            }
1009            "CURRENT_DATE" => Ok(Expression::Function(Box::new(Function {
1010                name: "CURRENT_DATE".to_string(),
1011                args: f.args,
1012                distinct: false,
1013                trailing_comments: Vec::new(),
1014                use_bracket_syntax: false,
1015                no_parens: false,
1016                quoted: false,
1017                span: None,
1018                inferred_type: None,
1019            }))),
1020
1021            // NOW -> CURRENT_TIMESTAMP in BigQuery
1022            "NOW" => Ok(Expression::CurrentTimestamp(
1023                crate::expressions::CurrentTimestamp {
1024                    precision: None,
1025                    sysdate: false,
1026                },
1027            )),
1028
1029            // TO_DATE -> PARSE_DATE in BigQuery
1030            "TO_DATE" => Ok(Expression::Function(Box::new(Function::new(
1031                "PARSE_DATE".to_string(),
1032                f.args,
1033            )))),
1034
1035            // TO_TIMESTAMP -> PARSE_TIMESTAMP in BigQuery
1036            "TO_TIMESTAMP" => Ok(Expression::Function(Box::new(Function::new(
1037                "PARSE_TIMESTAMP".to_string(),
1038                f.args,
1039            )))),
1040
1041            // TO_TIME -> TIME in BigQuery
1042            "TO_TIME" if f.args.len() == 1 => Ok(Expression::Function(Box::new(Function::new(
1043                "TIME".to_string(),
1044                f.args,
1045            )))),
1046
1047            // DATE_FORMAT -> FORMAT_DATE in BigQuery (argument order may differ)
1048            "DATE_FORMAT" => Ok(Expression::Function(Box::new(Function::new(
1049                "FORMAT_DATE".to_string(),
1050                f.args,
1051            )))),
1052
1053            // POSITION -> STRPOS in BigQuery
1054            // BigQuery uses STRPOS(string, substring)
1055            "POSITION" if f.args.len() == 2 => {
1056                let mut args = f.args;
1057                // Swap arguments: POSITION(sub IN str) -> STRPOS(str, sub)
1058                let first = args.remove(0);
1059                let second = args.remove(0);
1060                Ok(Expression::Function(Box::new(Function::new(
1061                    "STRPOS".to_string(),
1062                    vec![second, first],
1063                ))))
1064            }
1065
1066            // LEN -> LENGTH
1067            "LEN" if f.args.len() == 1 => Ok(Expression::Length(Box::new(UnaryFunc::new(
1068                f.args.into_iter().next().unwrap(),
1069            )))),
1070
1071            // CEILING -> CEIL (both work)
1072            "CEILING" if f.args.len() == 1 => Ok(Expression::Ceil(Box::new(CeilFunc {
1073                this: f.args.into_iter().next().unwrap(),
1074                decimals: None,
1075                to: None,
1076            }))),
1077
1078            // GETDATE -> CURRENT_TIMESTAMP
1079            "GETDATE" => Ok(Expression::CurrentTimestamp(
1080                crate::expressions::CurrentTimestamp {
1081                    precision: None,
1082                    sysdate: false,
1083                },
1084            )),
1085
1086            // ARRAY_LENGTH -> ARRAY_LENGTH (native)
1087            // CARDINALITY -> ARRAY_LENGTH
1088            "CARDINALITY" if f.args.len() == 1 => Ok(Expression::ArrayLength(Box::new(
1089                UnaryFunc::new(f.args.into_iter().next().unwrap()),
1090            ))),
1091
1092            // UNNEST is native to BigQuery
1093
1094            // GENERATE_SERIES -> GENERATE_ARRAY in BigQuery
1095            "GENERATE_SERIES" => Ok(Expression::Function(Box::new(Function::new(
1096                "GENERATE_ARRAY".to_string(),
1097                f.args,
1098            )))),
1099
1100            // APPROX_COUNT_DISTINCT -> APPROX_COUNT_DISTINCT (native)
1101            // APPROX_DISTINCT -> APPROX_COUNT_DISTINCT
1102            "APPROX_DISTINCT" => Ok(Expression::Function(Box::new(Function::new(
1103                "APPROX_COUNT_DISTINCT".to_string(),
1104                f.args,
1105            )))),
1106
1107            // COUNT_IF -> COUNTIF in BigQuery
1108            "COUNT_IF" => Ok(Expression::Function(Box::new(Function::new(
1109                "COUNTIF".to_string(),
1110                f.args,
1111            )))),
1112
1113            // SHA1 -> SHA1 (native), SHA -> SHA1
1114            "SHA" => Ok(Expression::Function(Box::new(Function::new(
1115                "SHA1".to_string(),
1116                f.args,
1117            )))),
1118
1119            // SHA256/SHA2 -> SHA256
1120            "SHA2" => Ok(Expression::Function(Box::new(Function::new(
1121                "SHA256".to_string(),
1122                f.args,
1123            )))),
1124
1125            // MD5 in BigQuery returns bytes, often combined with TO_HEX
1126            // TO_HEX(MD5(x)) pattern
1127            "MD5" => Ok(Expression::Function(Box::new(Function::new(
1128                "MD5".to_string(),
1129                f.args,
1130            )))),
1131
1132            // VARIANCE/VAR_SAMP -> VAR_SAMP (native)
1133            // VAR_POP -> VAR_POP (native)
1134
1135            // DATEADD(unit, amount, date) → DATE_ADD(date, INTERVAL amount unit) for BigQuery
1136            "DATEADD" if f.args.len() == 3 => {
1137                let mut args = f.args;
1138                let unit_expr = args.remove(0);
1139                let amount = args.remove(0);
1140                let date = args.remove(0);
1141                // Convert unit identifier to IntervalUnit
1142                let unit_name = match &unit_expr {
1143                    Expression::Identifier(id) => id.name.to_uppercase(),
1144                    _ => "DAY".to_string(),
1145                };
1146                let unit = match unit_name.as_str() {
1147                    "YEAR" | "YEARS" | "YY" | "YYYY" => crate::expressions::IntervalUnit::Year,
1148                    "QUARTER" | "QUARTERS" | "QQ" | "Q" => {
1149                        crate::expressions::IntervalUnit::Quarter
1150                    }
1151                    "MONTH" | "MONTHS" | "MM" | "M" => crate::expressions::IntervalUnit::Month,
1152                    "WEEK" | "WEEKS" | "WK" | "WW" => crate::expressions::IntervalUnit::Week,
1153                    "DAY" | "DAYS" | "DD" | "D" | "DAYOFMONTH" => {
1154                        crate::expressions::IntervalUnit::Day
1155                    }
1156                    "HOUR" | "HOURS" | "HH" => crate::expressions::IntervalUnit::Hour,
1157                    "MINUTE" | "MINUTES" | "MI" | "N" => crate::expressions::IntervalUnit::Minute,
1158                    "SECOND" | "SECONDS" | "SS" | "S" => crate::expressions::IntervalUnit::Second,
1159                    "MILLISECOND" | "MILLISECONDS" | "MS" => {
1160                        crate::expressions::IntervalUnit::Millisecond
1161                    }
1162                    "MICROSECOND" | "MICROSECONDS" | "US" => {
1163                        crate::expressions::IntervalUnit::Microsecond
1164                    }
1165                    _ => crate::expressions::IntervalUnit::Day,
1166                };
1167                Ok(Expression::DateAdd(Box::new(
1168                    crate::expressions::DateAddFunc {
1169                        this: date,
1170                        interval: amount,
1171                        unit,
1172                    },
1173                )))
1174            }
1175            "DATE_ADD" => Ok(Expression::Function(Box::new(Function::new(
1176                "DATE_ADD".to_string(),
1177                f.args,
1178            )))),
1179
1180            // DATE_DIFF in BigQuery (native)
1181            "DATEDIFF" => Ok(Expression::Function(Box::new(Function::new(
1182                "DATE_DIFF".to_string(),
1183                f.args,
1184            )))),
1185
1186            // TIMESTAMP_DIFF in BigQuery
1187            "TIMESTAMPDIFF" => Ok(Expression::Function(Box::new(Function::new(
1188                "TIMESTAMP_DIFF".to_string(),
1189                f.args,
1190            )))),
1191
1192            // TIME -> TIME (native)
1193            // DATETIME -> DATETIME (native)
1194
1195            // SAFE_DIVIDE -> SAFE_DIVIDE (native)
1196
1197            // NEWID/UUID -> GENERATE_UUID
1198            "NEWID" | "UUID" => Ok(Expression::Function(Box::new(Function::new(
1199                "GENERATE_UUID".to_string(),
1200                vec![],
1201            )))),
1202
1203            // LEVENSHTEIN -> EDIT_DISTANCE (BigQuery naming)
1204            "LEVENSHTEIN" => Ok(Expression::Function(Box::new(Function::new(
1205                "EDIT_DISTANCE".to_string(),
1206                f.args,
1207            )))),
1208
1209            // UNIX_TIMESTAMP -> UNIX_SECONDS
1210            "UNIX_TIMESTAMP" => Ok(Expression::Function(Box::new(Function::new(
1211                "UNIX_SECONDS".to_string(),
1212                f.args,
1213            )))),
1214
1215            // FROM_UNIXTIME -> TIMESTAMP_SECONDS
1216            "FROM_UNIXTIME" => Ok(Expression::Function(Box::new(Function::new(
1217                "TIMESTAMP_SECONDS".to_string(),
1218                f.args,
1219            )))),
1220
1221            // CHAR_LENGTH / CHARACTER_LENGTH -> LENGTH
1222            "CHAR_LENGTH" | "CHARACTER_LENGTH" => Ok(Expression::Function(Box::new(
1223                Function::new("LENGTH".to_string(), f.args),
1224            ))),
1225
1226            // OCTET_LENGTH -> BYTE_LENGTH in BigQuery
1227            "OCTET_LENGTH" => Ok(Expression::Function(Box::new(Function::new(
1228                "BYTE_LENGTH".to_string(),
1229                f.args,
1230            )))),
1231
1232            // JSON_EXTRACT_STRING_ARRAY -> JSON_VALUE_ARRAY in BigQuery
1233            "JSON_EXTRACT_STRING_ARRAY" => Ok(Expression::Function(Box::new(Function::new(
1234                "JSON_VALUE_ARRAY".to_string(),
1235                f.args,
1236            )))),
1237
1238            // INSTR is native to BigQuery
1239
1240            // SPLIT: BigQuery defaults to comma separator when none provided
1241            // SPLIT(foo) -> SPLIT(foo, ',')
1242            "SPLIT" if f.args.len() == 1 => {
1243                let mut args = f.args;
1244                args.push(Expression::Literal(Box::new(Literal::String(
1245                    ",".to_string(),
1246                ))));
1247                Ok(Expression::Split(Box::new(SplitFunc {
1248                    this: args.remove(0),
1249                    delimiter: args.remove(0),
1250                })))
1251            }
1252
1253            // SPLIT with two args - convert to Split expression
1254            "SPLIT" if f.args.len() == 2 => {
1255                let mut args = f.args;
1256                Ok(Expression::Split(Box::new(SplitFunc {
1257                    this: args.remove(0),
1258                    delimiter: args.remove(0),
1259                })))
1260            }
1261
1262            // REGEXP_SUBSTR -> REGEXP_EXTRACT in BigQuery (strip extra Snowflake args)
1263            "REGEXP_SUBSTR" if f.args.len() >= 2 => {
1264                // BigQuery REGEXP_EXTRACT supports (subject, pattern, pos, occ) max 4 args
1265                let args = if f.args.len() > 4 {
1266                    f.args[..4].to_vec()
1267                } else {
1268                    f.args
1269                };
1270                Ok(Expression::Function(Box::new(Function::new(
1271                    "REGEXP_EXTRACT".to_string(),
1272                    args,
1273                ))))
1274            }
1275            "REGEXP_SUBSTR" => Ok(Expression::Function(Box::new(Function::new(
1276                "REGEXP_EXTRACT".to_string(),
1277                f.args,
1278            )))),
1279
1280            // REGEXP_REPLACE - strip extra Snowflake-specific args
1281            "REGEXP_REPLACE" if f.args.len() > 3 => {
1282                let args = f.args[..3].to_vec();
1283                Ok(Expression::Function(Box::new(Function::new(
1284                    "REGEXP_REPLACE".to_string(),
1285                    args,
1286                ))))
1287            }
1288
1289            // OBJECT_CONSTRUCT_KEEP_NULL -> JSON_OBJECT
1290            "OBJECT_CONSTRUCT_KEEP_NULL" => Ok(Expression::Function(Box::new(Function::new(
1291                "JSON_OBJECT".to_string(),
1292                f.args,
1293            )))),
1294
1295            // EDITDISTANCE -> EDIT_DISTANCE with named max_distance parameter
1296            "EDITDISTANCE" if f.args.len() == 3 => {
1297                let col1 = f.args[0].clone();
1298                let col2 = f.args[1].clone();
1299                let max_dist = f.args[2].clone();
1300                Ok(Expression::Function(Box::new(Function::new(
1301                    "EDIT_DISTANCE".to_string(),
1302                    vec![
1303                        col1,
1304                        col2,
1305                        Expression::NamedArgument(Box::new(crate::expressions::NamedArgument {
1306                            name: crate::expressions::Identifier::new("max_distance".to_string()),
1307                            value: max_dist,
1308                            separator: crate::expressions::NamedArgSeparator::DArrow,
1309                        })),
1310                    ],
1311                ))))
1312            }
1313            "EDITDISTANCE" if f.args.len() == 2 => Ok(Expression::Function(Box::new(
1314                Function::new("EDIT_DISTANCE".to_string(), f.args),
1315            ))),
1316
1317            // HEX_DECODE_BINARY -> FROM_HEX
1318            "HEX_DECODE_BINARY" => Ok(Expression::Function(Box::new(Function::new(
1319                "FROM_HEX".to_string(),
1320                f.args,
1321            )))),
1322
1323            // BigQuery format string normalization for PARSE_DATE/DATETIME/TIMESTAMP functions
1324            // %Y-%m-%d -> %F and %H:%M:%S -> %T
1325            "PARSE_DATE"
1326            | "PARSE_DATETIME"
1327            | "PARSE_TIMESTAMP"
1328            | "SAFE.PARSE_DATE"
1329            | "SAFE.PARSE_DATETIME"
1330            | "SAFE.PARSE_TIMESTAMP" => {
1331                let args = self.normalize_time_format_args(f.args);
1332                Ok(Expression::Function(Box::new(Function {
1333                    name: f.name,
1334                    args,
1335                    distinct: f.distinct,
1336                    no_parens: f.no_parens,
1337                    trailing_comments: f.trailing_comments,
1338                    quoted: f.quoted,
1339                    use_bracket_syntax: f.use_bracket_syntax,
1340                    span: None,
1341                    inferred_type: None,
1342                })))
1343            }
1344
1345            // GET_PATH(obj, path) -> JSON_EXTRACT(obj, json_path) in BigQuery
1346            "GET_PATH" if f.args.len() == 2 => {
1347                let mut args = f.args;
1348                let this = args.remove(0);
1349                let path = args.remove(0);
1350                let json_path = match &path {
1351                    Expression::Literal(lit) if matches!(lit.as_ref(), Literal::String(_)) => {
1352                        let Literal::String(s) = lit.as_ref() else {
1353                            unreachable!()
1354                        };
1355                        let normalized = if s.starts_with('$') {
1356                            s.clone()
1357                        } else if s.starts_with('[') {
1358                            format!("${}", s)
1359                        } else {
1360                            format!("$.{}", s)
1361                        };
1362                        Expression::Literal(Box::new(Literal::String(normalized)))
1363                    }
1364                    _ => path,
1365                };
1366                Ok(Expression::JsonExtract(Box::new(JsonExtractFunc {
1367                    this,
1368                    path: json_path,
1369                    returning: None,
1370                    arrow_syntax: false,
1371                    hash_arrow_syntax: false,
1372                    wrapper_option: None,
1373                    quotes_option: None,
1374                    on_scalar_string: false,
1375                    on_error: None,
1376                })))
1377            }
1378
1379            // Pass through everything else
1380            _ => Ok(Expression::Function(Box::new(f))),
1381        }
1382    }
1383
1384    fn transform_aggregate_function(
1385        &self,
1386        f: Box<crate::expressions::AggregateFunction>,
1387    ) -> Result<Expression> {
1388        let name_upper = f.name.to_uppercase();
1389        match name_upper.as_str() {
1390            // GROUP_CONCAT -> STRING_AGG
1391            "GROUP_CONCAT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
1392                Function::new("STRING_AGG".to_string(), f.args),
1393            ))),
1394
1395            // Pass through everything else
1396            _ => Ok(Expression::AggregateFunction(f)),
1397        }
1398    }
1399
1400    /// Transform MethodCall expressions
1401    /// Handles SAFE.PARSE_DATE, SAFE.PARSE_DATETIME, SAFE.PARSE_TIMESTAMP format normalization
1402    fn transform_method_call(&self, mc: crate::expressions::MethodCall) -> Result<Expression> {
1403        use crate::expressions::MethodCall;
1404
1405        // Check if this is SAFE.PARSE_DATE/DATETIME/TIMESTAMP
1406        if let Expression::Column(ref col) = mc.this {
1407            if col.name.name.eq_ignore_ascii_case("SAFE") {
1408                let method_upper = mc.method.name.to_uppercase();
1409                if method_upper == "PARSE_DATE"
1410                    || method_upper == "PARSE_DATETIME"
1411                    || method_upper == "PARSE_TIMESTAMP"
1412                {
1413                    // Normalize the format string in the first argument
1414                    let args = self.normalize_time_format_args(mc.args);
1415                    return Ok(Expression::MethodCall(Box::new(MethodCall {
1416                        this: mc.this,
1417                        method: mc.method,
1418                        args,
1419                    })));
1420                }
1421            }
1422        }
1423
1424        // Pass through all other method calls
1425        Ok(Expression::MethodCall(Box::new(mc)))
1426    }
1427
1428    /// Normalize time format strings in function arguments
1429    /// BigQuery normalizes: %Y-%m-%d -> %F, %H:%M:%S -> %T
1430    fn normalize_time_format_args(&self, args: Vec<Expression>) -> Vec<Expression> {
1431        args.into_iter()
1432            .enumerate()
1433            .map(|(i, arg)| {
1434                // Only transform the first argument (the format string)
1435                if i == 0 {
1436                    if let Expression::Literal(ref lit) = arg {
1437                        if let Literal::String(s) = lit.as_ref() {
1438                            let normalized = self.normalize_time_format(&s);
1439                            return Expression::Literal(Box::new(Literal::String(normalized)));
1440                        }
1441                    }
1442                }
1443                arg
1444            })
1445            .collect()
1446    }
1447
1448    /// Normalize a time format string according to BigQuery conventions
1449    /// %Y-%m-%d -> %F (ISO date)
1450    /// %H:%M:%S -> %T (time)
1451    fn normalize_time_format(&self, format: &str) -> String {
1452        format.replace("%Y-%m-%d", "%F").replace("%H:%M:%S", "%T")
1453    }
1454
1455    /// Convert BigQuery CAST FORMAT elements to strftime equivalents,
1456    /// then normalize BigQuery shorthand forms (%Y-%m-%d -> %F, %H:%M:%S -> %T)
1457    fn bq_cast_format_to_strftime(format_expr: &Expression) -> Expression {
1458        use crate::expressions::Literal;
1459        if let Expression::Literal(lit) = format_expr {
1460            if let Literal::String(s) = lit.as_ref() {
1461                let result = s
1462                    .replace("YYYYMMDD", "%Y%m%d")
1463                    .replace("YYYY", "%Y")
1464                    .replace("YY", "%y")
1465                    .replace("MONTH", "%B")
1466                    .replace("MON", "%b")
1467                    .replace("MM", "%m")
1468                    .replace("DD", "%d")
1469                    .replace("HH24", "%H")
1470                    .replace("HH12", "%I")
1471                    .replace("HH", "%I")
1472                    .replace("MI", "%M")
1473                    .replace("SSTZH", "%S%z")
1474                    .replace("SS", "%S")
1475                    .replace("TZH", "%z");
1476                // Normalize: %Y-%m-%d -> %F, %H:%M:%S -> %T
1477                let normalized = result.replace("%Y-%m-%d", "%F").replace("%H:%M:%S", "%T");
1478                return Expression::Literal(Box::new(Literal::String(normalized)));
1479            }
1480        }
1481        format_expr.clone()
1482    }
1483}
1484
1485#[cfg(test)]
1486mod tests {
1487    use super::*;
1488    use crate::dialects::Dialect;
1489    use crate::parse_one;
1490
1491    fn transpile_to_bigquery(sql: &str) -> String {
1492        let dialect = Dialect::get(DialectType::Generic);
1493        let result = dialect
1494            .transpile_to(sql, DialectType::BigQuery)
1495            .expect("Transpile failed");
1496        result[0].clone()
1497    }
1498
1499    #[test]
1500    fn test_ifnull_identity() {
1501        // Generic -> BigQuery: IFNULL is normalized to COALESCE (matching sqlglot behavior)
1502        let result = transpile_to_bigquery("SELECT IFNULL(a, b)");
1503        assert!(
1504            result.contains("COALESCE"),
1505            "Expected COALESCE, got: {}",
1506            result
1507        );
1508    }
1509
1510    #[test]
1511    fn test_nvl_to_ifnull() {
1512        // NVL is converted to IFNULL in BigQuery
1513        let result = transpile_to_bigquery("SELECT NVL(a, b)");
1514        assert!(
1515            result.contains("IFNULL"),
1516            "Expected IFNULL, got: {}",
1517            result
1518        );
1519    }
1520
1521    #[test]
1522    fn test_try_cast_to_safe_cast() {
1523        let result = transpile_to_bigquery("SELECT TRY_CAST(a AS INT)");
1524        assert!(
1525            result.contains("SAFE_CAST"),
1526            "Expected SAFE_CAST, got: {}",
1527            result
1528        );
1529    }
1530
1531    #[test]
1532    fn test_random_to_rand() {
1533        let result = transpile_to_bigquery("SELECT RANDOM()");
1534        assert!(result.contains("RAND"), "Expected RAND, got: {}", result);
1535    }
1536
1537    #[test]
1538    fn test_basic_select() {
1539        let result = transpile_to_bigquery("SELECT a, b FROM users WHERE id = 1");
1540        assert!(result.contains("SELECT"));
1541        assert!(result.contains("FROM users"));
1542    }
1543
1544    #[test]
1545    fn test_group_concat_to_string_agg() {
1546        let result = transpile_to_bigquery("SELECT GROUP_CONCAT(name)");
1547        assert!(
1548            result.contains("STRING_AGG"),
1549            "Expected STRING_AGG, got: {}",
1550            result
1551        );
1552    }
1553
1554    #[test]
1555    fn test_generate_series_to_generate_array() {
1556        let result = transpile_to_bigquery("SELECT GENERATE_SERIES(1, 10)");
1557        assert!(
1558            result.contains("GENERATE_ARRAY"),
1559            "Expected GENERATE_ARRAY, got: {}",
1560            result
1561        );
1562    }
1563
1564    #[test]
1565    fn test_backtick_identifiers() {
1566        // BigQuery uses backticks for identifiers
1567        let dialect = BigQueryDialect;
1568        let config = dialect.generator_config();
1569        assert_eq!(config.identifier_quote, '`');
1570    }
1571
1572    fn bigquery_identity(sql: &str, expected: &str) {
1573        let dialect = Dialect::get(DialectType::BigQuery);
1574        let ast = dialect.parse(sql).expect("Parse failed");
1575        let transformed = dialect.transform(ast[0].clone()).expect("Transform failed");
1576        let result = dialect.generate(&transformed).expect("Generate failed");
1577        assert_eq!(result, expected, "SQL: {}", sql);
1578    }
1579
1580    #[test]
1581    fn test_cast_char_to_string() {
1582        bigquery_identity("CAST(x AS CHAR)", "CAST(x AS STRING)");
1583    }
1584
1585    #[test]
1586    fn test_cast_varchar_to_string() {
1587        bigquery_identity("CAST(x AS VARCHAR)", "CAST(x AS STRING)");
1588    }
1589
1590    #[test]
1591    fn test_cast_nchar_to_string() {
1592        bigquery_identity("CAST(x AS NCHAR)", "CAST(x AS STRING)");
1593    }
1594
1595    #[test]
1596    fn test_cast_nvarchar_to_string() {
1597        bigquery_identity("CAST(x AS NVARCHAR)", "CAST(x AS STRING)");
1598    }
1599
1600    #[test]
1601    fn test_cast_timestamptz_to_timestamp() {
1602        bigquery_identity("CAST(x AS TIMESTAMPTZ)", "CAST(x AS TIMESTAMP)");
1603    }
1604
1605    #[test]
1606    fn test_cast_record_to_struct() {
1607        bigquery_identity("CAST(x AS RECORD)", "CAST(x AS STRUCT)");
1608    }
1609
1610    #[test]
1611    fn test_json_literal_to_parse_json() {
1612        // JSON 'string' literal syntax should be converted to PARSE_JSON()
1613        bigquery_identity(
1614            "SELECT JSON '\"foo\"' AS json_data",
1615            "SELECT PARSE_JSON('\"foo\"') AS json_data",
1616        );
1617    }
1618
1619    #[test]
1620    fn test_grant_as_alias_not_quoted() {
1621        // GRANT is not a reserved keyword in BigQuery, should not be backtick-quoted
1622        bigquery_identity(
1623            "SELECT GRANT FROM (SELECT 'input' AS GRANT)",
1624            "SELECT GRANT FROM (SELECT 'input' AS GRANT)",
1625        );
1626    }
1627
1628    #[test]
1629    fn test_timestamp_literal_to_cast() {
1630        // TIMESTAMP 'value' literal should be converted to CAST('value' AS TIMESTAMP)
1631        bigquery_identity(
1632            "CREATE VIEW `d.v` OPTIONS (expiration_timestamp=TIMESTAMP '2020-01-02T04:05:06.007Z') AS SELECT 1 AS c",
1633            "CREATE VIEW `d.v` OPTIONS (expiration_timestamp=CAST('2020-01-02T04:05:06.007Z' AS TIMESTAMP)) AS SELECT 1 AS c"
1634        );
1635    }
1636
1637    #[test]
1638    fn test_date_literal_to_cast_in_extract() {
1639        // Issue 1: DATE literal should become CAST syntax in BigQuery
1640        bigquery_identity(
1641            "EXTRACT(WEEK(THURSDAY) FROM DATE '2013-12-25')",
1642            "EXTRACT(WEEK(THURSDAY) FROM CAST('2013-12-25' AS DATE))",
1643        );
1644    }
1645
1646    #[test]
1647    fn test_json_object_with_json_literals() {
1648        // Issue 2: JSON literals in JSON_OBJECT should use PARSE_JSON, not CAST AS JSON
1649        bigquery_identity(
1650            "SELECT JSON_OBJECT('a', JSON '10') AS json_data",
1651            "SELECT JSON_OBJECT('a', PARSE_JSON('10')) AS json_data",
1652        );
1653    }
1654
1655    // NOTE: MOD paren unwrapping is tested in the conformance tests (sqlglot_dialect_identity).
1656    // The unit test version was removed due to stack overflow in debug builds (deep recursion).
1657    // Test case: MOD((a + 1), b) -> MOD(a + 1, b)
1658
1659    #[test]
1660    fn test_safe_parse_date_format_normalization() {
1661        // SAFE.PARSE_DATE format string normalization: %Y-%m-%d -> %F
1662        bigquery_identity(
1663            "SAFE.PARSE_DATE('%Y-%m-%d', '2024-01-15')",
1664            "SAFE.PARSE_DATE('%F', '2024-01-15')",
1665        );
1666    }
1667
1668    #[test]
1669    fn test_safe_parse_datetime_format_normalization() {
1670        // SAFE.PARSE_DATETIME format string normalization: %Y-%m-%d %H:%M:%S -> %F %T
1671        bigquery_identity(
1672            "SAFE.PARSE_DATETIME('%Y-%m-%d %H:%M:%S', '2024-01-15 10:30:00')",
1673            "SAFE.PARSE_DATETIME('%F %T', '2024-01-15 10:30:00')",
1674        );
1675    }
1676
1677    #[test]
1678    fn test_safe_parse_timestamp_format_normalization() {
1679        // SAFE.PARSE_TIMESTAMP format string normalization: %Y-%m-%d %H:%M:%S -> %F %T
1680        bigquery_identity(
1681            "SAFE.PARSE_TIMESTAMP('%Y-%m-%d %H:%M:%S', '2024-01-15 10:30:00')",
1682            "SAFE.PARSE_TIMESTAMP('%F %T', '2024-01-15 10:30:00')",
1683        );
1684    }
1685
1686    #[test]
1687    fn test_datetime_literal_to_cast() {
1688        // DATETIME 'value' literal should be converted to CAST('value' AS DATETIME)
1689        bigquery_identity(
1690            "LAST_DAY(DATETIME '2008-11-10 15:30:00', WEEK(SUNDAY))",
1691            "LAST_DAY(CAST('2008-11-10 15:30:00' AS DATETIME), WEEK)",
1692        );
1693    }
1694
1695    #[test]
1696    fn test_last_day_week_modifier_stripped() {
1697        // WEEK(SUNDAY) should become WEEK in BigQuery LAST_DAY function
1698        bigquery_identity("LAST_DAY(col, WEEK(MONDAY))", "LAST_DAY(col, WEEK)");
1699    }
1700
1701    #[test]
1702    fn test_hash_line_comment_parses() {
1703        // Regression test for issue #38:
1704        // BigQuery should accept # as a single-line comment.
1705        let result = parse_one("SELECT 1 as a #hello world", DialectType::BigQuery);
1706        assert!(result.is_ok(), "Expected parse to succeed, got: {result:?}");
1707    }
1708}