Skip to main content

polyglot_sql/dialects/
clickhouse.rs

1//! ClickHouse Dialect
2//!
3//! ClickHouse-specific transformations based on sqlglot patterns.
4//! ClickHouse is case-sensitive and has unique function naming conventions.
5
6use super::{DialectImpl, DialectType};
7#[cfg(feature = "transpile")]
8use crate::error::Result;
9#[cfg(feature = "transpile")]
10use crate::expressions::{
11    AggregateFunction, BinaryOp, Case, Cast, Expression, Function, In, IsNull, LikeOp,
12    MapConstructor, Paren, UnaryOp,
13};
14#[cfg(feature = "generate")]
15use crate::generator::GeneratorConfig;
16use crate::tokens::TokenizerConfig;
17
18/// ClickHouse dialect
19pub struct ClickHouseDialect;
20
21impl DialectImpl for ClickHouseDialect {
22    fn dialect_type(&self) -> DialectType {
23        DialectType::ClickHouse
24    }
25
26    fn tokenizer_config(&self) -> TokenizerConfig {
27        let mut config = TokenizerConfig::default();
28        // ClickHouse uses double quotes and backticks for identifiers
29        config.identifiers.insert('"', '"');
30        config.identifiers.insert('`', '`');
31        // ClickHouse supports nested comments
32        config.nested_comments = true;
33        // ClickHouse allows identifiers to start with digits
34        config.identifiers_can_start_with_digit = true;
35        // ClickHouse uses backslash escaping in strings
36        config.string_escapes.push('\\');
37        // ClickHouse supports # as single-line comment
38        config.hash_comments = true;
39        // ClickHouse allows $ in identifiers
40        config.dollar_sign_is_identifier = true;
41        // ClickHouse: INSERT ... FORMAT <name> is followed by raw data
42        config.insert_format_raw_data = true;
43        // ClickHouse supports 0xDEADBEEF hex integer literals
44        config.hex_number_strings = true;
45        config.hex_string_is_integer_type = true;
46        // ClickHouse allows underscores as digit separators in numeric literals
47        config.numbers_can_be_underscore_separated = true;
48        // SQLGlot tokenizes malformed SHOW LIKE probes such as `'a\' or 1=1`.
49        config.recover_terminal_backslash_quote = true;
50        // The ClickHouse corpus includes partial string probes extracted from shell tests.
51        config.recover_unterminated_string = true;
52        config
53    }
54
55    #[cfg(feature = "generate")]
56
57    fn generator_config(&self) -> GeneratorConfig {
58        use crate::generator::{IdentifierQuoteStyle, NormalizeFunctions};
59        GeneratorConfig {
60            identifier_quote: '"',
61            identifier_quote_style: IdentifierQuoteStyle::DOUBLE_QUOTE,
62            dialect: Some(DialectType::ClickHouse),
63            // ClickHouse uses uppercase keywords (matching Python SQLGlot behavior)
64            uppercase_keywords: true,
65            // ClickHouse function names are case-sensitive and typically camelCase
66            normalize_functions: NormalizeFunctions::None,
67            // ClickHouse identifiers are case-sensitive
68            case_sensitive_identifiers: true,
69            tablesample_keywords: "SAMPLE",
70            tablesample_requires_parens: false,
71            identifiers_can_start_with_digit: true,
72            // ClickHouse uses bracket-only notation for arrays: [1, 2, 3]
73            array_bracket_only: true,
74            ..Default::default()
75        }
76    }
77
78    #[cfg(feature = "transpile")]
79
80    fn transform_expr(&self, expr: Expression) -> Result<Expression> {
81        let wrap_predicate_left = |expr: Expression| -> Expression {
82            let needs_parens = matches!(
83                expr,
84                Expression::Add(_)
85                    | Expression::Sub(_)
86                    | Expression::Mul(_)
87                    | Expression::Div(_)
88                    | Expression::Mod(_)
89                    | Expression::Concat(_)
90                    | Expression::And(_)
91                    | Expression::Or(_)
92                    | Expression::Not(_)
93                    | Expression::Case(_)
94            );
95
96            if needs_parens {
97                Expression::Paren(Box::new(Paren {
98                    this: expr,
99                    trailing_comments: Vec::new(),
100                }))
101            } else {
102                expr
103            }
104        };
105
106        let wrap_not_target = |expr: Expression| -> Expression {
107            match expr {
108                Expression::Paren(_) => expr,
109                Expression::In(_)
110                | Expression::Between(_)
111                | Expression::Is(_)
112                | Expression::IsNull(_)
113                | Expression::IsTrue(_)
114                | Expression::IsFalse(_)
115                | Expression::IsJson(_)
116                | Expression::Like(_)
117                | Expression::ILike(_)
118                | Expression::SimilarTo(_)
119                | Expression::Glob(_)
120                | Expression::RegexpLike(_)
121                | Expression::RegexpILike(_)
122                | Expression::MemberOf(_) => Expression::Paren(Box::new(Paren {
123                    this: expr,
124                    trailing_comments: Vec::new(),
125                })),
126                _ => expr,
127            }
128        };
129
130        let unwrap_in_array = |mut expressions: Vec<Expression>,
131                               query: &Option<Expression>,
132                               unnest: &Option<Box<Expression>>|
133         -> Vec<Expression> {
134            if query.is_none() && unnest.is_none() && expressions.len() == 1 {
135                if matches!(expressions[0], Expression::ArrayFunc(_)) {
136                    if let Expression::ArrayFunc(arr) = expressions.remove(0) {
137                        return arr.expressions;
138                    }
139                }
140            }
141            expressions
142        };
143
144        match expr {
145            // TryCast stays as TryCast (ClickHouse doesn't have TRY_CAST by default)
146            // But we can emulate with toXOrNull functions
147            Expression::TryCast(c) => {
148                // For simplicity, just use regular CAST
149                // ClickHouse has toXOrNull/toXOrZero functions for safe casts
150                Ok(Expression::Cast(c))
151            }
152
153            // SafeCast -> CAST in ClickHouse
154            Expression::SafeCast(c) => Ok(Expression::Cast(c)),
155
156            // CountIf is native in ClickHouse (lowercase)
157            Expression::CountIf(f) => Ok(Expression::Function(Box::new(Function::new(
158                "countIf".to_string(),
159                vec![f.this],
160            )))),
161
162            // UNNEST -> arrayJoin in ClickHouse
163            Expression::Unnest(f) => Ok(Expression::Function(Box::new(Function::new(
164                "arrayJoin".to_string(),
165                vec![f.this],
166            )))),
167
168            // EXPLODE -> arrayJoin in ClickHouse
169            Expression::Explode(f) => Ok(Expression::Function(Box::new(Function::new(
170                "arrayJoin".to_string(),
171                vec![f.this],
172            )))),
173
174            // ExplodeOuter -> arrayJoin in ClickHouse
175            Expression::ExplodeOuter(f) => Ok(Expression::Function(Box::new(Function::new(
176                "arrayJoin".to_string(),
177                vec![f.this],
178            )))),
179
180            // RAND -> randCanonical() in ClickHouse
181            Expression::Rand(_) => Ok(Expression::Function(Box::new(Function::new(
182                "randCanonical".to_string(),
183                vec![],
184            )))),
185
186            // Random -> randCanonical() in ClickHouse
187            Expression::Random(_) => Ok(Expression::Function(Box::new(Function::new(
188                "randCanonical".to_string(),
189                vec![],
190            )))),
191
192            // startsWith -> startsWith
193            Expression::StartsWith(f) => Ok(Expression::Function(Box::new(Function::new(
194                "startsWith".to_string(),
195                vec![f.this, f.expression],
196            )))),
197
198            // endsWith -> endsWith
199            Expression::EndsWith(f) => Ok(Expression::Function(Box::new(Function::new(
200                "endsWith".to_string(),
201                vec![f.this, f.expression],
202            )))),
203
204            // ClickHouse prefers NOT (x IN (...)) over x NOT IN (...)
205            Expression::In(in_expr) if in_expr.not => {
206                if in_expr.global {
207                    return Ok(Expression::In(in_expr));
208                }
209                let In {
210                    this,
211                    expressions,
212                    query,
213                    unnest,
214                    global,
215                    is_field,
216                    ..
217                } = *in_expr;
218                let expressions = unwrap_in_array(expressions, &query, &unnest);
219                let base = Expression::In(Box::new(In {
220                    this: wrap_predicate_left(this),
221                    expressions,
222                    query,
223                    not: false,
224                    global,
225                    unnest,
226                    is_field,
227                }));
228                Ok(Expression::Not(Box::new(UnaryOp {
229                    this: wrap_not_target(base),
230                    inferred_type: None,
231                })))
232            }
233
234            // ClickHouse prefers NOT (x IS NULL) over x IS NOT NULL
235            Expression::IsNull(is_null) if is_null.not => {
236                let IsNull { this, .. } = *is_null;
237                let base = Expression::IsNull(Box::new(IsNull {
238                    this: wrap_predicate_left(this),
239                    not: false,
240                    postfix_form: false,
241                }));
242                Ok(Expression::Not(Box::new(UnaryOp {
243                    this: wrap_not_target(base),
244                    inferred_type: None,
245                })))
246            }
247
248            Expression::In(mut in_expr) => {
249                in_expr.expressions =
250                    unwrap_in_array(in_expr.expressions, &in_expr.query, &in_expr.unnest);
251                in_expr.this = wrap_predicate_left(in_expr.this);
252                Ok(Expression::In(in_expr))
253            }
254
255            Expression::IsNull(mut is_null) => {
256                is_null.this = wrap_predicate_left(is_null.this);
257                Ok(Expression::IsNull(is_null))
258            }
259
260            // IF(cond, true, false) -> CASE WHEN cond THEN true ELSE false END
261            Expression::IfFunc(f) => {
262                let f = *f;
263                let has_aliased_arg = matches!(f.condition, Expression::Alias(_))
264                    || matches!(f.true_value, Expression::Alias(_))
265                    || matches!(f.false_value.as_ref(), Some(Expression::Alias(_)));
266                if has_aliased_arg {
267                    return Ok(Expression::IfFunc(Box::new(f)));
268                }
269                Ok(Expression::Case(Box::new(Case {
270                    operand: None,
271                    whens: vec![(f.condition, f.true_value)],
272                    else_: f.false_value,
273                    comments: Vec::new(),
274                    inferred_type: None,
275                })))
276            }
277
278            Expression::Is(mut is_expr) => {
279                is_expr.left = wrap_predicate_left(is_expr.left);
280                Ok(Expression::Is(is_expr))
281            }
282
283            Expression::Or(op) => {
284                let BinaryOp {
285                    left,
286                    right,
287                    left_comments,
288                    operator_comments,
289                    trailing_comments,
290                    ..
291                } = *op;
292                let left = if matches!(left, Expression::And(_)) {
293                    Expression::Paren(Box::new(Paren {
294                        this: left,
295                        trailing_comments: Vec::new(),
296                    }))
297                } else {
298                    left
299                };
300                let right = if matches!(right, Expression::And(_)) {
301                    Expression::Paren(Box::new(Paren {
302                        this: right,
303                        trailing_comments: Vec::new(),
304                    }))
305                } else {
306                    right
307                };
308                Ok(Expression::Or(Box::new(BinaryOp {
309                    left,
310                    right,
311                    left_comments,
312                    operator_comments,
313                    trailing_comments,
314                    inferred_type: None,
315                })))
316            }
317
318            Expression::Not(op) => {
319                let inner = wrap_not_target(op.this);
320                Ok(Expression::Not(Box::new(UnaryOp {
321                    this: inner,
322                    inferred_type: None,
323                })))
324            }
325
326            Expression::MapFunc(map) if map.curly_brace_syntax => {
327                let MapConstructor { keys, values, .. } = *map;
328                let mut args = Vec::with_capacity(keys.len() * 2);
329                for (key, value) in keys.into_iter().zip(values.into_iter()) {
330                    args.push(key);
331                    args.push(value);
332                }
333                Ok(Expression::Function(Box::new(Function::new(
334                    "map".to_string(),
335                    args,
336                ))))
337            }
338
339            Expression::Insert(mut insert) => {
340                for row in insert.values.iter_mut() {
341                    for value in row.iter_mut() {
342                        if !matches!(value, Expression::Paren(_)) {
343                            let wrapped = Expression::Paren(Box::new(Paren {
344                                this: value.clone(),
345                                trailing_comments: Vec::new(),
346                            }));
347                            *value = wrapped;
348                        }
349                    }
350                }
351                Ok(Expression::Insert(insert))
352            }
353
354            // Generic function transformations
355            Expression::Function(f) => self.transform_function(*f),
356
357            // Generic aggregate function transformations
358            Expression::AggregateFunction(f) => self.transform_aggregate_function(f),
359
360            // Cast transformations
361            Expression::Cast(c) => self.transform_cast(*c),
362
363            // TYPEOF -> toTypeName in ClickHouse
364            Expression::Typeof(f) => Ok(Expression::Function(Box::new(Function::new(
365                "toTypeName".to_string(),
366                vec![f.this],
367            )))),
368
369            // Pass through everything else
370            _ => Ok(expr),
371        }
372    }
373}
374
375#[cfg(feature = "transpile")]
376impl ClickHouseDialect {}
377
378#[cfg(feature = "transpile")]
379impl ClickHouseDialect {
380    fn transform_function(&self, f: Function) -> Result<Expression> {
381        let name_upper = f.name.to_uppercase();
382        match name_upper.as_str() {
383            // UTCTimestamp() -> CURRENT_TIMESTAMP('UTC')
384            "UTCTIMESTAMP" => Ok(Expression::UtcTimestamp(Box::new(
385                crate::expressions::UtcTimestamp { this: None },
386            ))),
387
388            "CURRENTDATABASE" | "CURRENT_DATABASE" => Ok(Expression::Function(Box::new(
389                Function::new("CURRENT_DATABASE".to_string(), f.args),
390            ))),
391            "CURRENTSCHEMAS" | "CURRENT_SCHEMAS" => Ok(Expression::Function(Box::new(
392                Function::new("CURRENT_SCHEMAS".to_string(), f.args),
393            ))),
394            "LEVENSHTEIN" | "LEVENSHTEINDISTANCE" | "EDITDISTANCE" => Ok(Expression::Function(
395                Box::new(Function::new("editDistance".to_string(), f.args)),
396            )),
397            "CHAR" | "CHR" => Ok(Expression::Function(Box::new(Function::new(
398                "CHAR".to_string(),
399                f.args,
400            )))),
401            "STR_TO_DATE" => Ok(Expression::Function(Box::new(Function::new(
402                "STR_TO_DATE".to_string(),
403                f.args,
404            )))),
405            "JSONEXTRACTSTRING" => Ok(Expression::Function(Box::new(Function::new(
406                "JSONExtractString".to_string(),
407                f.args,
408            )))),
409            "MATCH" => Ok(Expression::Function(Box::new(Function::new(
410                "match".to_string(),
411                f.args,
412            )))),
413            "LIKE" if f.args.len() == 2 => {
414                let left = f.args[0].clone();
415                let right = f.args[1].clone();
416                Ok(Expression::Like(Box::new(LikeOp::new(left, right))))
417            }
418            "NOTLIKE" if f.args.len() == 2 => {
419                let left = f.args[0].clone();
420                let right = f.args[1].clone();
421                let like = Expression::Like(Box::new(LikeOp::new(left, right)));
422                Ok(Expression::Not(Box::new(UnaryOp {
423                    this: like,
424                    inferred_type: None,
425                })))
426            }
427            "ILIKE" if f.args.len() == 2 => {
428                let left = f.args[0].clone();
429                let right = f.args[1].clone();
430                Ok(Expression::ILike(Box::new(LikeOp::new(left, right))))
431            }
432            "AND" if f.args.len() >= 2 => {
433                let mut iter = f.args.into_iter();
434                let mut expr = iter.next().unwrap();
435                for arg in iter {
436                    expr = Expression::And(Box::new(BinaryOp::new(expr, arg)));
437                }
438                Ok(expr)
439            }
440            "OR" if f.args.len() >= 2 => {
441                let mut iter = f.args.into_iter();
442                let mut expr = iter.next().unwrap();
443                for arg in iter {
444                    expr = Expression::Or(Box::new(BinaryOp::new(expr, arg)));
445                }
446                self.transform_expr(expr)
447            }
448            // TYPEOF -> toTypeName in ClickHouse
449            "TYPEOF" => Ok(Expression::Function(Box::new(Function::new(
450                "toTypeName".to_string(),
451                f.args,
452            )))),
453
454            // DATE_TRUNC: ClickHouse uses dateTrunc (camelCase)
455            "DATE_TRUNC" | "DATETRUNC" => Ok(Expression::Function(Box::new(Function::new(
456                "dateTrunc".to_string(),
457                f.args,
458            )))),
459            "TOSTARTOFDAY" if f.args.len() == 1 => {
460                Ok(Expression::Function(Box::new(Function::new(
461                    "dateTrunc".to_string(),
462                    vec![Expression::string("DAY"), f.args[0].clone()],
463                ))))
464            }
465
466            // SUBSTRING_INDEX: preserve original case (substringIndex in ClickHouse)
467            "SUBSTRING_INDEX" => Ok(Expression::Function(Box::new(Function::new(
468                f.name.clone(),
469                f.args,
470            )))),
471
472            // IS_NAN / ISNAN -> isNaN (ClickHouse camelCase)
473            "IS_NAN" | "ISNAN" => Ok(Expression::Function(Box::new(Function::new(
474                "isNaN".to_string(),
475                f.args,
476            )))),
477
478            _ => Ok(Expression::Function(Box::new(f))),
479        }
480    }
481
482    fn transform_aggregate_function(
483        &self,
484        f: Box<crate::expressions::AggregateFunction>,
485    ) -> Result<Expression> {
486        let name_upper = f.name.to_uppercase();
487        match name_upper.as_str() {
488            // COUNT_IF -> countIf
489            "COUNT_IF" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
490                "countIf".to_string(),
491                f.args,
492            )))),
493
494            // SUM_IF -> sumIf
495            "SUM_IF" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
496                "sumIf".to_string(),
497                f.args,
498            )))),
499
500            // AVG_IF -> avgIf
501            "AVG_IF" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
502                "avgIf".to_string(),
503                f.args,
504            )))),
505
506            // ANY_VALUE -> any in ClickHouse
507            "ANY_VALUE" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
508                "any".to_string(),
509                f.args,
510            )))),
511
512            // GROUP_CONCAT -> groupArray + arrayStringConcat
513            "GROUP_CONCAT" if !f.args.is_empty() => {
514                let mut args = f.args;
515                let first = args.remove(0);
516                let separator = args.pop();
517                let group_array = Expression::Function(Box::new(Function::new(
518                    "groupArray".to_string(),
519                    vec![first],
520                )));
521                if let Some(sep) = separator {
522                    Ok(Expression::Function(Box::new(Function::new(
523                        "arrayStringConcat".to_string(),
524                        vec![group_array, sep],
525                    ))))
526                } else {
527                    Ok(Expression::Function(Box::new(Function::new(
528                        "arrayStringConcat".to_string(),
529                        vec![group_array],
530                    ))))
531                }
532            }
533
534            // STRING_AGG -> groupArray + arrayStringConcat
535            "STRING_AGG" if !f.args.is_empty() => {
536                let mut args = f.args;
537                let first = args.remove(0);
538                let separator = args.pop();
539                let group_array = Expression::Function(Box::new(Function::new(
540                    "groupArray".to_string(),
541                    vec![first],
542                )));
543                if let Some(sep) = separator {
544                    Ok(Expression::Function(Box::new(Function::new(
545                        "arrayStringConcat".to_string(),
546                        vec![group_array, sep],
547                    ))))
548                } else {
549                    Ok(Expression::Function(Box::new(Function::new(
550                        "arrayStringConcat".to_string(),
551                        vec![group_array],
552                    ))))
553                }
554            }
555
556            // LISTAGG -> groupArray + arrayStringConcat
557            "LISTAGG" if !f.args.is_empty() => {
558                let mut args = f.args;
559                let first = args.remove(0);
560                let separator = args.pop();
561                let group_array = Expression::Function(Box::new(Function::new(
562                    "groupArray".to_string(),
563                    vec![first],
564                )));
565                if let Some(sep) = separator {
566                    Ok(Expression::Function(Box::new(Function::new(
567                        "arrayStringConcat".to_string(),
568                        vec![group_array, sep],
569                    ))))
570                } else {
571                    Ok(Expression::Function(Box::new(Function::new(
572                        "arrayStringConcat".to_string(),
573                        vec![group_array],
574                    ))))
575                }
576            }
577
578            // ARRAY_AGG -> groupArray
579            "ARRAY_AGG" if !f.args.is_empty() => Ok(Expression::Function(Box::new(Function::new(
580                "groupArray".to_string(),
581                f.args,
582            )))),
583
584            // STDDEV -> stddevSamp in ClickHouse (sample stddev)
585            "STDDEV" if !f.args.is_empty() => {
586                Ok(Expression::AggregateFunction(Box::new(AggregateFunction {
587                    name: "stddevSamp".to_string(),
588                    args: f.args,
589                    distinct: f.distinct,
590                    filter: f.filter,
591                    order_by: Vec::new(),
592                    limit: None,
593                    ignore_nulls: None,
594                    inferred_type: None,
595                })))
596            }
597
598            // STDDEV_POP -> stddevPop
599            "STDDEV_POP" if !f.args.is_empty() => {
600                Ok(Expression::AggregateFunction(Box::new(AggregateFunction {
601                    name: "stddevPop".to_string(),
602                    args: f.args,
603                    distinct: f.distinct,
604                    filter: f.filter,
605                    order_by: Vec::new(),
606                    limit: None,
607                    ignore_nulls: None,
608                    inferred_type: None,
609                })))
610            }
611
612            // VARIANCE -> varSamp in ClickHouse
613            "VARIANCE" if !f.args.is_empty() => {
614                Ok(Expression::AggregateFunction(Box::new(AggregateFunction {
615                    name: "varSamp".to_string(),
616                    args: f.args,
617                    distinct: f.distinct,
618                    filter: f.filter,
619                    order_by: Vec::new(),
620                    limit: None,
621                    ignore_nulls: None,
622                    inferred_type: None,
623                })))
624            }
625
626            // VAR_POP -> varPop
627            "VAR_POP" if !f.args.is_empty() => {
628                Ok(Expression::AggregateFunction(Box::new(AggregateFunction {
629                    name: "varPop".to_string(),
630                    args: f.args,
631                    distinct: f.distinct,
632                    filter: f.filter,
633                    order_by: Vec::new(),
634                    limit: None,
635                    ignore_nulls: None,
636                    inferred_type: None,
637                })))
638            }
639
640            // MEDIAN -> median
641            "MEDIAN" if !f.args.is_empty() => {
642                Ok(Expression::AggregateFunction(Box::new(AggregateFunction {
643                    name: "median".to_string(),
644                    args: f.args,
645                    distinct: f.distinct,
646                    filter: f.filter,
647                    order_by: Vec::new(),
648                    limit: None,
649                    ignore_nulls: None,
650                    inferred_type: None,
651                })))
652            }
653
654            // APPROX_COUNT_DISTINCT -> uniq in ClickHouse
655            "APPROX_COUNT_DISTINCT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
656                Function::new("uniq".to_string(), f.args),
657            ))),
658
659            // APPROX_DISTINCT -> uniq
660            "APPROX_DISTINCT" if !f.args.is_empty() => Ok(Expression::Function(Box::new(
661                Function::new("uniq".to_string(), f.args),
662            ))),
663
664            _ => Ok(Expression::AggregateFunction(f)),
665        }
666    }
667
668    fn transform_cast(&self, c: Cast) -> Result<Expression> {
669        Ok(Expression::Cast(Box::new(c)))
670    }
671}