Skip to main content

polyglot_sql/dialects/
clickhouse.rs

1//! ClickHouse Dialect
2//!
3//! ClickHouse-specific transformations based on sqlglot patterns.
4//! ClickHouse is case-sensitive and has unique function naming conventions.
5
6use super::{DialectImpl, DialectType};
7use crate::error::Result;
8use crate::expressions::{
9    AggregateFunction, BinaryOp, Case, Cast, Expression, Function, In, IsNull, LikeOp,
10    MapConstructor, Paren, UnaryOp,
11};
12use crate::generator::GeneratorConfig;
13use crate::tokens::TokenizerConfig;
14
15/// ClickHouse dialect
16pub struct ClickHouseDialect;
17
18impl DialectImpl for ClickHouseDialect {
19    fn dialect_type(&self) -> DialectType {
20        DialectType::ClickHouse
21    }
22
23    fn tokenizer_config(&self) -> TokenizerConfig {
24        let mut config = TokenizerConfig::default();
25        // ClickHouse uses double quotes and backticks for identifiers
26        config.identifiers.insert('"', '"');
27        config.identifiers.insert('`', '`');
28        // ClickHouse supports nested comments
29        config.nested_comments = true;
30        // ClickHouse allows identifiers to start with digits
31        config.identifiers_can_start_with_digit = true;
32        // ClickHouse uses backslash escaping in strings
33        config.string_escapes.push('\\');
34        // ClickHouse supports # as single-line comment
35        config.hash_comments = true;
36        // ClickHouse allows $ in identifiers
37        config.dollar_sign_is_identifier = true;
38        // ClickHouse: INSERT ... FORMAT <name> is followed by raw data
39        config.insert_format_raw_data = true;
40        // ClickHouse supports 0xDEADBEEF hex integer literals
41        config.hex_number_strings = true;
42        config.hex_string_is_integer_type = true;
43        // ClickHouse allows underscores as digit separators in numeric literals
44        config.numbers_can_be_underscore_separated = true;
45        // SQLGlot tokenizes malformed SHOW LIKE probes such as `'a\' or 1=1`.
46        config.recover_terminal_backslash_quote = true;
47        // The ClickHouse corpus includes partial string probes extracted from shell tests.
48        config.recover_unterminated_string = true;
49        config
50    }
51
52    fn generator_config(&self) -> GeneratorConfig {
53        use crate::generator::{IdentifierQuoteStyle, NormalizeFunctions};
54        GeneratorConfig {
55            identifier_quote: '"',
56            identifier_quote_style: IdentifierQuoteStyle::DOUBLE_QUOTE,
57            dialect: Some(DialectType::ClickHouse),
58            // ClickHouse uses uppercase keywords (matching Python SQLGlot behavior)
59            uppercase_keywords: true,
60            // ClickHouse function names are case-sensitive and typically camelCase
61            normalize_functions: NormalizeFunctions::None,
62            // ClickHouse identifiers are case-sensitive
63            case_sensitive_identifiers: true,
64            tablesample_keywords: "SAMPLE",
65            tablesample_requires_parens: false,
66            identifiers_can_start_with_digit: true,
67            // ClickHouse uses bracket-only notation for arrays: [1, 2, 3]
68            array_bracket_only: true,
69            ..Default::default()
70        }
71    }
72
73    fn transform_expr(&self, expr: Expression) -> Result<Expression> {
74        let wrap_predicate_left = |expr: Expression| -> Expression {
75            let needs_parens = matches!(
76                expr,
77                Expression::Add(_)
78                    | Expression::Sub(_)
79                    | Expression::Mul(_)
80                    | Expression::Div(_)
81                    | Expression::Mod(_)
82                    | Expression::Concat(_)
83                    | Expression::And(_)
84                    | Expression::Or(_)
85                    | Expression::Not(_)
86                    | Expression::Case(_)
87            );
88
89            if needs_parens {
90                Expression::Paren(Box::new(Paren {
91                    this: expr,
92                    trailing_comments: Vec::new(),
93                }))
94            } else {
95                expr
96            }
97        };
98
99        let wrap_not_target = |expr: Expression| -> Expression {
100            match expr {
101                Expression::Paren(_) => expr,
102                Expression::In(_)
103                | Expression::Between(_)
104                | Expression::Is(_)
105                | Expression::IsNull(_)
106                | Expression::IsTrue(_)
107                | Expression::IsFalse(_)
108                | Expression::IsJson(_)
109                | Expression::Like(_)
110                | Expression::ILike(_)
111                | Expression::SimilarTo(_)
112                | Expression::Glob(_)
113                | Expression::RegexpLike(_)
114                | Expression::RegexpILike(_)
115                | Expression::MemberOf(_) => Expression::Paren(Box::new(Paren {
116                    this: expr,
117                    trailing_comments: Vec::new(),
118                })),
119                _ => expr,
120            }
121        };
122
123        let unwrap_in_array = |mut expressions: Vec<Expression>,
124                               query: &Option<Expression>,
125                               unnest: &Option<Box<Expression>>|
126         -> Vec<Expression> {
127            if query.is_none() && unnest.is_none() && expressions.len() == 1 {
128                if matches!(expressions[0], Expression::ArrayFunc(_)) {
129                    if let Expression::ArrayFunc(arr) = expressions.remove(0) {
130                        return arr.expressions;
131                    }
132                }
133            }
134            expressions
135        };
136
137        match expr {
138            // TryCast stays as TryCast (ClickHouse doesn't have TRY_CAST by default)
139            // But we can emulate with toXOrNull functions
140            Expression::TryCast(c) => {
141                // For simplicity, just use regular CAST
142                // ClickHouse has toXOrNull/toXOrZero functions for safe casts
143                Ok(Expression::Cast(c))
144            }
145
146            // SafeCast -> CAST in ClickHouse
147            Expression::SafeCast(c) => Ok(Expression::Cast(c)),
148
149            // CountIf is native in ClickHouse (lowercase)
150            Expression::CountIf(f) => Ok(Expression::Function(Box::new(Function::new(
151                "countIf".to_string(),
152                vec![f.this],
153            )))),
154
155            // UNNEST -> arrayJoin in ClickHouse
156            Expression::Unnest(f) => Ok(Expression::Function(Box::new(Function::new(
157                "arrayJoin".to_string(),
158                vec![f.this],
159            )))),
160
161            // EXPLODE -> arrayJoin in ClickHouse
162            Expression::Explode(f) => Ok(Expression::Function(Box::new(Function::new(
163                "arrayJoin".to_string(),
164                vec![f.this],
165            )))),
166
167            // ExplodeOuter -> arrayJoin in ClickHouse
168            Expression::ExplodeOuter(f) => Ok(Expression::Function(Box::new(Function::new(
169                "arrayJoin".to_string(),
170                vec![f.this],
171            )))),
172
173            // RAND -> randCanonical() in ClickHouse
174            Expression::Rand(_) => Ok(Expression::Function(Box::new(Function::new(
175                "randCanonical".to_string(),
176                vec![],
177            )))),
178
179            // Random -> randCanonical() in ClickHouse
180            Expression::Random(_) => Ok(Expression::Function(Box::new(Function::new(
181                "randCanonical".to_string(),
182                vec![],
183            )))),
184
185            // startsWith -> startsWith
186            Expression::StartsWith(f) => Ok(Expression::Function(Box::new(Function::new(
187                "startsWith".to_string(),
188                vec![f.this, f.expression],
189            )))),
190
191            // endsWith -> endsWith
192            Expression::EndsWith(f) => Ok(Expression::Function(Box::new(Function::new(
193                "endsWith".to_string(),
194                vec![f.this, f.expression],
195            )))),
196
197            // ClickHouse prefers NOT (x IN (...)) over x NOT IN (...)
198            Expression::In(in_expr) if in_expr.not => {
199                if in_expr.global {
200                    return Ok(Expression::In(in_expr));
201                }
202                let In {
203                    this,
204                    expressions,
205                    query,
206                    unnest,
207                    global,
208                    is_field,
209                    ..
210                } = *in_expr;
211                let expressions = unwrap_in_array(expressions, &query, &unnest);
212                let base = Expression::In(Box::new(In {
213                    this: wrap_predicate_left(this),
214                    expressions,
215                    query,
216                    not: false,
217                    global,
218                    unnest,
219                    is_field,
220                }));
221                Ok(Expression::Not(Box::new(UnaryOp {
222                    this: wrap_not_target(base),
223                    inferred_type: None,
224                })))
225            }
226
227            // ClickHouse prefers NOT (x IS NULL) over x IS NOT NULL
228            Expression::IsNull(is_null) if is_null.not => {
229                let IsNull { this, .. } = *is_null;
230                let base = Expression::IsNull(Box::new(IsNull {
231                    this: wrap_predicate_left(this),
232                    not: false,
233                    postfix_form: false,
234                }));
235                Ok(Expression::Not(Box::new(UnaryOp {
236                    this: wrap_not_target(base),
237                    inferred_type: None,
238                })))
239            }
240
241            Expression::In(mut in_expr) => {
242                in_expr.expressions =
243                    unwrap_in_array(in_expr.expressions, &in_expr.query, &in_expr.unnest);
244                in_expr.this = wrap_predicate_left(in_expr.this);
245                Ok(Expression::In(in_expr))
246            }
247
248            Expression::IsNull(mut is_null) => {
249                is_null.this = wrap_predicate_left(is_null.this);
250                Ok(Expression::IsNull(is_null))
251            }
252
253            // IF(cond, true, false) -> CASE WHEN cond THEN true ELSE false END
254            Expression::IfFunc(f) => {
255                let f = *f;
256                let has_aliased_arg = matches!(f.condition, Expression::Alias(_))
257                    || matches!(f.true_value, Expression::Alias(_))
258                    || matches!(f.false_value.as_ref(), Some(Expression::Alias(_)));
259                if has_aliased_arg {
260                    return Ok(Expression::IfFunc(Box::new(f)));
261                }
262                Ok(Expression::Case(Box::new(Case {
263                    operand: None,
264                    whens: vec![(f.condition, f.true_value)],
265                    else_: f.false_value,
266                    comments: Vec::new(),
267                    inferred_type: None,
268                })))
269            }
270
271            Expression::Is(mut is_expr) => {
272                is_expr.left = wrap_predicate_left(is_expr.left);
273                Ok(Expression::Is(is_expr))
274            }
275
276            Expression::Or(op) => {
277                let BinaryOp {
278                    left,
279                    right,
280                    left_comments,
281                    operator_comments,
282                    trailing_comments,
283                    ..
284                } = *op;
285                let left = if matches!(left, Expression::And(_)) {
286                    Expression::Paren(Box::new(Paren {
287                        this: left,
288                        trailing_comments: Vec::new(),
289                    }))
290                } else {
291                    left
292                };
293                let right = if matches!(right, Expression::And(_)) {
294                    Expression::Paren(Box::new(Paren {
295                        this: right,
296                        trailing_comments: Vec::new(),
297                    }))
298                } else {
299                    right
300                };
301                Ok(Expression::Or(Box::new(BinaryOp {
302                    left,
303                    right,
304                    left_comments,
305                    operator_comments,
306                    trailing_comments,
307                    inferred_type: None,
308                })))
309            }
310
311            Expression::Not(op) => {
312                let inner = wrap_not_target(op.this);
313                Ok(Expression::Not(Box::new(UnaryOp {
314                    this: inner,
315                    inferred_type: None,
316                })))
317            }
318
319            Expression::MapFunc(map) if map.curly_brace_syntax => {
320                let MapConstructor { keys, values, .. } = *map;
321                let mut args = Vec::with_capacity(keys.len() * 2);
322                for (key, value) in keys.into_iter().zip(values.into_iter()) {
323                    args.push(key);
324                    args.push(value);
325                }
326                Ok(Expression::Function(Box::new(Function::new(
327                    "map".to_string(),
328                    args,
329                ))))
330            }
331
332            Expression::Insert(mut insert) => {
333                for row in insert.values.iter_mut() {
334                    for value in row.iter_mut() {
335                        if !matches!(value, Expression::Paren(_)) {
336                            let wrapped = Expression::Paren(Box::new(Paren {
337                                this: value.clone(),
338                                trailing_comments: Vec::new(),
339                            }));
340                            *value = wrapped;
341                        }
342                    }
343                }
344                Ok(Expression::Insert(insert))
345            }
346
347            // Generic function transformations
348            Expression::Function(f) => self.transform_function(*f),
349
350            // Generic aggregate function transformations
351            Expression::AggregateFunction(f) => self.transform_aggregate_function(f),
352
353            // Cast transformations
354            Expression::Cast(c) => self.transform_cast(*c),
355
356            // TYPEOF -> toTypeName in ClickHouse
357            Expression::Typeof(f) => Ok(Expression::Function(Box::new(Function::new(
358                "toTypeName".to_string(),
359                vec![f.this],
360            )))),
361
362            // Pass through everything else
363            _ => Ok(expr),
364        }
365    }
366}
367
368impl ClickHouseDialect {}
369
370impl ClickHouseDialect {
371    fn transform_function(&self, f: Function) -> Result<Expression> {
372        let name_upper = f.name.to_uppercase();
373        match name_upper.as_str() {
374            // UTCTimestamp() -> CURRENT_TIMESTAMP('UTC')
375            "UTCTIMESTAMP" => Ok(Expression::UtcTimestamp(Box::new(
376                crate::expressions::UtcTimestamp { this: None },
377            ))),
378
379            "CURRENTDATABASE" | "CURRENT_DATABASE" => Ok(Expression::Function(Box::new(
380                Function::new("CURRENT_DATABASE".to_string(), f.args),
381            ))),
382            "CURRENTSCHEMAS" | "CURRENT_SCHEMAS" => Ok(Expression::Function(Box::new(
383                Function::new("CURRENT_SCHEMAS".to_string(), f.args),
384            ))),
385            "LEVENSHTEIN" | "LEVENSHTEINDISTANCE" | "EDITDISTANCE" => Ok(Expression::Function(
386                Box::new(Function::new("editDistance".to_string(), f.args)),
387            )),
388            "CHAR" | "CHR" => Ok(Expression::Function(Box::new(Function::new(
389                "CHAR".to_string(),
390                f.args,
391            )))),
392            "STR_TO_DATE" => Ok(Expression::Function(Box::new(Function::new(
393                "STR_TO_DATE".to_string(),
394                f.args,
395            )))),
396            "JSONEXTRACTSTRING" => Ok(Expression::Function(Box::new(Function::new(
397                "JSONExtractString".to_string(),
398                f.args,
399            )))),
400            "MATCH" => Ok(Expression::Function(Box::new(Function::new(
401                "match".to_string(),
402                f.args,
403            )))),
404            "LIKE" if f.args.len() == 2 => {
405                let left = f.args[0].clone();
406                let right = f.args[1].clone();
407                Ok(Expression::Like(Box::new(LikeOp::new(left, right))))
408            }
409            "NOTLIKE" if f.args.len() == 2 => {
410                let left = f.args[0].clone();
411                let right = f.args[1].clone();
412                let like = Expression::Like(Box::new(LikeOp::new(left, right)));
413                Ok(Expression::Not(Box::new(UnaryOp {
414                    this: like,
415                    inferred_type: None,
416                })))
417            }
418            "ILIKE" if f.args.len() == 2 => {
419                let left = f.args[0].clone();
420                let right = f.args[1].clone();
421                Ok(Expression::ILike(Box::new(LikeOp::new(left, right))))
422            }
423            "AND" if f.args.len() >= 2 => {
424                let mut iter = f.args.into_iter();
425                let mut expr = iter.next().unwrap();
426                for arg in iter {
427                    expr = Expression::And(Box::new(BinaryOp::new(expr, arg)));
428                }
429                Ok(expr)
430            }
431            "OR" if f.args.len() >= 2 => {
432                let mut iter = f.args.into_iter();
433                let mut expr = iter.next().unwrap();
434                for arg in iter {
435                    expr = Expression::Or(Box::new(BinaryOp::new(expr, arg)));
436                }
437                self.transform_expr(expr)
438            }
439            // TYPEOF -> toTypeName in ClickHouse
440            "TYPEOF" => Ok(Expression::Function(Box::new(Function::new(
441                "toTypeName".to_string(),
442                f.args,
443            )))),
444
445            // DATE_TRUNC: ClickHouse uses dateTrunc (camelCase)
446            "DATE_TRUNC" | "DATETRUNC" => Ok(Expression::Function(Box::new(Function::new(
447                "dateTrunc".to_string(),
448                f.args,
449            )))),
450            "TOSTARTOFDAY" if f.args.len() == 1 => {
451                Ok(Expression::Function(Box::new(Function::new(
452                    "dateTrunc".to_string(),
453                    vec![Expression::string("DAY"), f.args[0].clone()],
454                ))))
455            }
456
457            // SUBSTRING_INDEX: preserve original case (substringIndex in ClickHouse)
458            "SUBSTRING_INDEX" => Ok(Expression::Function(Box::new(Function::new(
459                f.name.clone(),
460                f.args,
461            )))),
462
463            // IS_NAN / ISNAN -> isNaN (ClickHouse camelCase)
464            "IS_NAN" | "ISNAN" => Ok(Expression::Function(Box::new(Function::new(
465                "isNaN".to_string(),
466                f.args,
467            )))),
468
469            _ => Ok(Expression::Function(Box::new(f))),
470        }
471    }
472
473    fn transform_aggregate_function(
474        &self,
475        f: Box<crate::expressions::AggregateFunction>,
476    ) -> Result<Expression> {
477        let name_upper = f.name.to_uppercase();
478        match name_upper.as_str() {
479            // COUNT_IF -> countIf
480            "COUNT_IF" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
481                "countIf".to_string(),
482                f.args,
483            )))),
484
485            // SUM_IF -> sumIf
486            "SUM_IF" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
487                "sumIf".to_string(),
488                f.args,
489            )))),
490
491            // AVG_IF -> avgIf
492            "AVG_IF" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
493                "avgIf".to_string(),
494                f.args,
495            )))),
496
497            // ANY_VALUE -> any in ClickHouse
498            "ANY_VALUE" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
499                "any".to_string(),
500                f.args,
501            )))),
502
503            // GROUP_CONCAT -> groupArray + arrayStringConcat
504            "GROUP_CONCAT" if !f.args.is_empty() => {
505                let mut args = f.args;
506                let first = args.remove(0);
507                let separator = args.pop();
508                let group_array = Expression::Function(Box::new(Function::new(
509                    "groupArray".to_string(),
510                    vec![first],
511                )));
512                if let Some(sep) = separator {
513                    Ok(Expression::Function(Box::new(Function::new(
514                        "arrayStringConcat".to_string(),
515                        vec![group_array, sep],
516                    ))))
517                } else {
518                    Ok(Expression::Function(Box::new(Function::new(
519                        "arrayStringConcat".to_string(),
520                        vec![group_array],
521                    ))))
522                }
523            }
524
525            // STRING_AGG -> groupArray + arrayStringConcat
526            "STRING_AGG" if !f.args.is_empty() => {
527                let mut args = f.args;
528                let first = args.remove(0);
529                let separator = args.pop();
530                let group_array = Expression::Function(Box::new(Function::new(
531                    "groupArray".to_string(),
532                    vec![first],
533                )));
534                if let Some(sep) = separator {
535                    Ok(Expression::Function(Box::new(Function::new(
536                        "arrayStringConcat".to_string(),
537                        vec![group_array, sep],
538                    ))))
539                } else {
540                    Ok(Expression::Function(Box::new(Function::new(
541                        "arrayStringConcat".to_string(),
542                        vec![group_array],
543                    ))))
544                }
545            }
546
547            // LISTAGG -> groupArray + arrayStringConcat
548            "LISTAGG" if !f.args.is_empty() => {
549                let mut args = f.args;
550                let first = args.remove(0);
551                let separator = args.pop();
552                let group_array = Expression::Function(Box::new(Function::new(
553                    "groupArray".to_string(),
554                    vec![first],
555                )));
556                if let Some(sep) = separator {
557                    Ok(Expression::Function(Box::new(Function::new(
558                        "arrayStringConcat".to_string(),
559                        vec![group_array, sep],
560                    ))))
561                } else {
562                    Ok(Expression::Function(Box::new(Function::new(
563                        "arrayStringConcat".to_string(),
564                        vec![group_array],
565                    ))))
566                }
567            }
568
569            // ARRAY_AGG -> groupArray
570            "ARRAY_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
571                "groupArray".to_string(),
572                f.args,
573            )))),
574
575            // STDDEV -> stddevSamp in ClickHouse (sample stddev)
576            "STDDEV" if !f.args.is_empty() => {
577                Ok(Expression::AggregateFunction(Box::new(AggregateFunction {
578                    name: "stddevSamp".to_string(),
579                    args: f.args,
580                    distinct: f.distinct,
581                    filter: f.filter,
582                    order_by: Vec::new(),
583                    limit: None,
584                    ignore_nulls: None,
585                    inferred_type: None,
586                })))
587            }
588
589            // STDDEV_POP -> stddevPop
590            "STDDEV_POP" if !f.args.is_empty() => {
591                Ok(Expression::AggregateFunction(Box::new(AggregateFunction {
592                    name: "stddevPop".to_string(),
593                    args: f.args,
594                    distinct: f.distinct,
595                    filter: f.filter,
596                    order_by: Vec::new(),
597                    limit: None,
598                    ignore_nulls: None,
599                    inferred_type: None,
600                })))
601            }
602
603            // VARIANCE -> varSamp in ClickHouse
604            "VARIANCE" if !f.args.is_empty() => {
605                Ok(Expression::AggregateFunction(Box::new(AggregateFunction {
606                    name: "varSamp".to_string(),
607                    args: f.args,
608                    distinct: f.distinct,
609                    filter: f.filter,
610                    order_by: Vec::new(),
611                    limit: None,
612                    ignore_nulls: None,
613                    inferred_type: None,
614                })))
615            }
616
617            // VAR_POP -> varPop
618            "VAR_POP" if !f.args.is_empty() => {
619                Ok(Expression::AggregateFunction(Box::new(AggregateFunction {
620                    name: "varPop".to_string(),
621                    args: f.args,
622                    distinct: f.distinct,
623                    filter: f.filter,
624                    order_by: Vec::new(),
625                    limit: None,
626                    ignore_nulls: None,
627                    inferred_type: None,
628                })))
629            }
630
631            // MEDIAN -> median
632            "MEDIAN" if !f.args.is_empty() => {
633                Ok(Expression::AggregateFunction(Box::new(AggregateFunction {
634                    name: "median".to_string(),
635                    args: f.args,
636                    distinct: f.distinct,
637                    filter: f.filter,
638                    order_by: Vec::new(),
639                    limit: None,
640                    ignore_nulls: None,
641                    inferred_type: None,
642                })))
643            }
644
645            // APPROX_COUNT_DISTINCT -> uniq in ClickHouse
646            "APPROX_COUNT_DISTINCT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
647                Function::new("uniq".to_string(), f.args),
648            ))),
649
650            // APPROX_DISTINCT -> uniq
651            "APPROX_DISTINCT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
652                Function::new("uniq".to_string(), f.args),
653            ))),
654
655            _ => Ok(Expression::AggregateFunction(f)),
656        }
657    }
658
659    fn transform_cast(&self, c: Cast) -> Result<Expression> {
660        Ok(Expression::Cast(Box::new(c)))
661    }
662}