Skip to main content

clickhouse_native_client/types/
parser.rs

1//! Type parser - 1:1 port of clickhouse-cpp type_parser.cpp
2//!
3//! This module implements token-based type parsing with AST caching,
4//! mirroring the C++ implementation exactly.
5//!
6//! **Reference:** `cpp/clickhouse-cpp/clickhouse/types/type_parser.{h,cpp}`
7
8use super::TypeCode;
9use crate::{
10    Error,
11    Result,
12};
13use std::{
14    cell::RefCell,
15    collections::HashMap,
16};
17
/// Lexical category of a single token produced by the type-string lexer.
///
/// Counterpart of the C++ `TypeParser::Token::Type` enumeration.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TokenType {
    /// Input that the lexer could not classify.
    Invalid = 0,
    /// The `=` sign.
    Assign,
    /// An identifier such as a type name.
    Name,
    /// An integer literal.
    Number,
    /// A plain string token; kept for parity with the C++ enum.
    #[allow(dead_code)]
    String,
    /// Opening parenthesis `(`.
    LPar,
    /// Closing parenthesis `)`.
    RPar,
    /// The `,` separator.
    Comma,
    /// A string literal whose token text still includes the quotation marks.
    QuotedString,
    /// End of the input string.
    Eos,
}
34
35/// Token with type and value
36/// Mirrors C++ `TypeParser::Token`
37#[derive(Debug, Clone)]
38struct Token<'a> {
39    token_type: TokenType,
40    value: &'a str,
41}
42
/// Broad category of a [`TypeAst`] node.
///
/// Counterpart of the C++ `TypeAst::Meta` enumeration.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TypeMeta {
    /// `Array(T)`: a variable-length array.
    Array,
    /// An assignment expression, as found inside Enum definitions.
    Assign,
    /// The null literal.
    Null,
    /// `Nullable(T)`: a nullable wrapper.
    Nullable,
    /// An integer literal (precision, scale, fixed string size, ...).
    Number,
    /// A string literal.
    String,
    /// A leaf type without a dedicated category (e.g. `Int32`, `UUID`).
    Terminal,
    /// `Tuple(T1, T2, ...)`.
    Tuple,
    /// `Enum8` or `Enum16`.
    Enum,
    /// `LowCardinality(T)`: a dictionary-encoded type.
    LowCardinality,
    /// A simple aggregate function type.
    SimpleAggregateFunction,
    /// `Map(K, V)`: a key-value map.
    Map,
}
72
73/// Abstract Syntax Tree for a type definition
74/// Mirrors C++ `struct TypeAst`
75#[derive(Debug, Clone, PartialEq)]
76pub struct TypeAst {
77    /// Type's category
78    pub meta: TypeMeta,
79    /// Type code
80    pub code: TypeCode,
81    /// Type's name
82    pub name: String,
83    /// Value associated with the node (for fixed-width types and enum values)
84    pub value: i64,
85    /// String value (for timezone, enum names, etc.)
86    pub value_string: String,
87    /// Sub-elements of the type (for composite types, enum items)
88    pub elements: Vec<TypeAst>,
89}
90
91impl Default for TypeAst {
92    fn default() -> Self {
93        Self {
94            meta: TypeMeta::Terminal,
95            code: TypeCode::Void,
96            name: String::new(),
97            value: 0,
98            value_string: String::new(),
99            elements: Vec::new(),
100        }
101    }
102}
103
104/// Type parser - mirrors C++ `class TypeParser`
105pub struct TypeParser<'a> {
106    /// Current position in input string
107    cur: usize,
108    /// Input string bytes
109    input: &'a str,
110    /// Stack of open elements during parsing
111    open_elements: Vec<*mut TypeAst>,
112    /// Current AST node being built
113    current_type: Option<*mut TypeAst>,
114}
115
116impl<'a> TypeParser<'a> {
117    /// Create a new parser for the given type name
118    /// Mirrors C++ `TypeParser::TypeParser(const StringView& name)`
119    pub fn new(name: &'a str) -> Self {
120        Self {
121            cur: 0,
122            input: name,
123            open_elements: Vec::new(),
124            current_type: None,
125        }
126    }
127
128    /// Parse the type string into a TypeAst
129    /// Mirrors C++ `bool TypeParser::Parse(TypeAst* type)`
130    pub fn parse(&mut self, type_ast: &mut TypeAst) -> bool {
131        // Safety: We use raw pointers to match C++ semantics, but we ensure:
132        // 1. Pointers are only used during parsing (within this function)
133        // 2. No pointers escape this function
134        // 3. TypeAst outlives all pointer operations
135
136        let type_ptr: *mut TypeAst = type_ast as *mut TypeAst;
137        self.current_type = Some(type_ptr);
138        self.open_elements.push(type_ptr);
139
140        let mut processed_tokens = 0;
141
142        loop {
143            let token = self.next_token();
144
145            match token.token_type {
146                TokenType::QuotedString => {
147                    unsafe {
148                        let current = self.current_type.unwrap();
149                        (*current).meta = TypeMeta::String; // Use String meta for quoted strings
150                                                            // Remove quotes from value
151                        if token.value.len() >= 2 {
152                            (*current).value_string = token.value
153                                [1..token.value.len() - 1]
154                                .to_string();
155                        } else {
156                            (*current).value_string = String::new();
157                        }
158                        (*current).code = TypeCode::String;
159                    }
160                }
161
162                TokenType::Name => unsafe {
163                    let current = self.current_type.unwrap();
164                    (*current).meta = get_type_meta(token.value);
165                    (*current).name = token.value.to_string();
166                    (*current).code = get_type_code(token.value);
167                },
168
169                TokenType::Number => unsafe {
170                    let current = self.current_type.unwrap();
171                    (*current).meta = TypeMeta::Number;
172                    (*current).value = token.value.parse::<i64>().unwrap_or(0);
173                },
174
175                TokenType::String => unsafe {
176                    let current = self.current_type.unwrap();
177                    (*current).meta = TypeMeta::String;
178                    (*current).value_string = token.value.to_string();
179                },
180
181                TokenType::LPar => {
182                    unsafe {
183                        let current = self.current_type.unwrap();
184                        (*current).elements.push(TypeAst::default());
185                        self.open_elements.push(current);
186                        // Get pointer to last element
187                        let last_idx = (*current).elements.len() - 1;
188                        let elements_ptr = (*current).elements.as_mut_ptr();
189                        let new_current = elements_ptr.add(last_idx);
190                        self.current_type = Some(new_current);
191                    }
192                }
193
194                TokenType::RPar => {
195                    self.open_elements.pop();
196                    if let Some(&parent) = self.open_elements.last() {
197                        self.current_type = Some(parent);
198                    }
199                }
200
201                TokenType::Assign | TokenType::Comma => {
202                    self.open_elements.pop();
203                    if let Some(&parent) = self.open_elements.last() {
204                        unsafe {
205                            (*parent).elements.push(TypeAst::default());
206                            self.open_elements.push(parent);
207                            let last_idx = (*parent).elements.len() - 1;
208                            let elements_ptr = (*parent).elements.as_mut_ptr();
209                            let new_current = elements_ptr.add(last_idx);
210                            self.current_type = Some(new_current);
211                        }
212                    }
213                }
214
215                TokenType::Eos => {
216                    // Unbalanced braces/brackets is an error
217                    if self.open_elements.len() != 1 {
218                        return false;
219                    }
220
221                    // Empty input string
222                    if processed_tokens == 0 {
223                        return false;
224                    }
225
226                    return validate_ast(type_ast);
227                }
228
229                TokenType::Invalid => {
230                    return false;
231                }
232            }
233
234            processed_tokens += 1;
235        }
236    }
237
238    /// Get next token from input
239    /// Mirrors C++ `TypeParser::Token TypeParser::NextToken()`
240    fn next_token(&mut self) -> Token<'a> {
241        let bytes = self.input.as_bytes();
242
243        // Skip whitespace
244        while self.cur < bytes.len() {
245            match bytes[self.cur] as char {
246                ' ' | '\n' | '\t' | '\0' => {
247                    self.cur += 1;
248                    continue;
249                }
250                '=' => {
251                    let start = self.cur;
252                    self.cur += 1;
253                    return Token {
254                        token_type: TokenType::Assign,
255                        value: &self.input[start..self.cur],
256                    };
257                }
258                '(' => {
259                    let start = self.cur;
260                    self.cur += 1;
261                    return Token {
262                        token_type: TokenType::LPar,
263                        value: &self.input[start..self.cur],
264                    };
265                }
266                ')' => {
267                    let start = self.cur;
268                    self.cur += 1;
269                    return Token {
270                        token_type: TokenType::RPar,
271                        value: &self.input[start..self.cur],
272                    };
273                }
274                ',' => {
275                    let start = self.cur;
276                    self.cur += 1;
277                    return Token {
278                        token_type: TokenType::Comma,
279                        value: &self.input[start..self.cur],
280                    };
281                }
282                '\'' => {
283                    // Quoted string
284                    let start = self.cur;
285                    self.cur += 1;
286
287                    // Fast forward to closing quote
288                    while self.cur < bytes.len() {
289                        if bytes[self.cur] as char == '\'' {
290                            self.cur += 1;
291                            return Token {
292                                token_type: TokenType::QuotedString,
293                                value: &self.input[start..self.cur],
294                            };
295                        }
296                        self.cur += 1;
297                    }
298
299                    return Token {
300                        token_type: TokenType::QuotedString,
301                        value: &self.input[start..self.cur],
302                    };
303                }
304                _ => {
305                    let start = self.cur;
306                    let ch = bytes[self.cur] as char;
307
308                    // Identifier (name)
309                    if ch.is_alphabetic() || ch == '_' {
310                        while self.cur < bytes.len() {
311                            let c = bytes[self.cur] as char;
312                            if !c.is_alphanumeric() && c != '_' {
313                                break;
314                            }
315                            self.cur += 1;
316                        }
317                        return Token {
318                            token_type: TokenType::Name,
319                            value: &self.input[start..self.cur],
320                        };
321                    }
322
323                    // Number
324                    if ch.is_numeric() || ch == '-' {
325                        self.cur += 1;
326                        while self.cur < bytes.len() {
327                            if !(bytes[self.cur] as char).is_numeric() {
328                                break;
329                            }
330                            self.cur += 1;
331                        }
332                        return Token {
333                            token_type: TokenType::Number,
334                            value: &self.input[start..self.cur],
335                        };
336                    }
337
338                    return Token {
339                        token_type: TokenType::Invalid,
340                        value: "",
341                    };
342                }
343            }
344        }
345
346        Token { token_type: TokenType::Eos, value: "" }
347    }
348}
349
350/// Get TypeMeta from type name
351/// Mirrors C++ `GetTypeMeta(const StringView& name)`
352fn get_type_meta(name: &str) -> TypeMeta {
353    match name {
354        "Array" => TypeMeta::Array,
355        "Null" => TypeMeta::Null,
356        "Nullable" => TypeMeta::Nullable,
357        "Tuple" => TypeMeta::Tuple,
358        "Enum8" | "Enum16" => TypeMeta::Enum,
359        "LowCardinality" => TypeMeta::LowCardinality,
360        "SimpleAggregateFunction" => TypeMeta::SimpleAggregateFunction,
361        "Map" => TypeMeta::Map,
362        _ => TypeMeta::Terminal,
363    }
364}
365
366/// Get TypeCode from type name
367/// Mirrors C++ `GetTypeCode(const std::string& name)`
368fn get_type_code(name: &str) -> TypeCode {
369    match name {
370        "Void" => TypeCode::Void,
371        "Int8" => TypeCode::Int8,
372        "Int16" => TypeCode::Int16,
373        "Int32" => TypeCode::Int32,
374        "Int64" => TypeCode::Int64,
375        "Bool" | "UInt8" => TypeCode::UInt8,
376        "UInt16" => TypeCode::UInt16,
377        "UInt32" => TypeCode::UInt32,
378        "UInt64" => TypeCode::UInt64,
379        "Float32" => TypeCode::Float32,
380        "Float64" => TypeCode::Float64,
381        "String" => TypeCode::String,
382        "FixedString" => TypeCode::FixedString,
383        "DateTime" => TypeCode::DateTime,
384        "DateTime64" => TypeCode::DateTime64,
385        "Date" => TypeCode::Date,
386        "Date32" => TypeCode::Date32,
387        "Array" => TypeCode::Array,
388        "Nullable" => TypeCode::Nullable,
389        "Tuple" => TypeCode::Tuple,
390        "Enum8" => TypeCode::Enum8,
391        "Enum16" => TypeCode::Enum16,
392        "UUID" => TypeCode::UUID,
393        "IPv4" => TypeCode::IPv4,
394        "IPv6" => TypeCode::IPv6,
395        "Int128" => TypeCode::Int128,
396        "UInt128" => TypeCode::UInt128,
397        "Decimal" => TypeCode::Decimal,
398        "Decimal32" => TypeCode::Decimal32,
399        "Decimal64" => TypeCode::Decimal64,
400        "Decimal128" => TypeCode::Decimal128,
401        "LowCardinality" => TypeCode::LowCardinality,
402        "Map" => TypeCode::Map,
403        "Point" => TypeCode::Point,
404        "Ring" => TypeCode::Ring,
405        "Polygon" => TypeCode::Polygon,
406        "MultiPolygon" => TypeCode::MultiPolygon,
407        _ => TypeCode::Void,
408    }
409}
410
411/// Validate the parsed AST
412/// Mirrors C++ `bool ValidateAST(const TypeAst& ast)`
413fn validate_ast(ast: &TypeAst) -> bool {
414    // Void terminal that is not actually "void" is an unknown type
415    if ast.meta == TypeMeta::Terminal
416        && ast.code == TypeCode::Void
417        && !ast.name.eq_ignore_ascii_case("void")
418        && !ast.name.is_empty()
419    {
420        return false;
421    }
422
423    true
424}
425
426// Thread-local cache for parsed type names
427// Each thread maintains its own cache for zero-overhead lookups.
428// Optimized for Rust: uses thread_local instead of global mutex (unlike C++
429// implementation).
430thread_local! {
431    static TYPE_CACHE: RefCell<HashMap<String, TypeAst>> =
432        RefCell::new(HashMap::new());
433}
434
435/// Parse a type name and return cached AST
436/// Mirrors C++ `const TypeAst* ParseTypeName(const std::string& type_name)`
437pub fn parse_type_name(type_name: &str) -> Result<TypeAst> {
438    TYPE_CACHE.with(|cache| {
439        // Try to get from thread-local cache
440        if let Some(ast) = cache.borrow().get(type_name) {
441            return Ok(ast.clone());
442        }
443
444        // Parse new AST
445        let mut ast = TypeAst::default();
446        let mut parser = TypeParser::new(type_name);
447
448        if !parser.parse(&mut ast) {
449            return Err(Error::Protocol(format!(
450                "Failed to parse type: {}",
451                type_name
452            )));
453        }
454
455        // Cache the result in thread-local storage
456        cache.borrow_mut().insert(type_name.to_string(), ast.clone());
457        Ok(ast)
458    })
459}
460
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    use super::*;

    /// A bare scalar name becomes a terminal node carrying that name.
    #[test]
    fn test_simple_types() {
        let parsed = parse_type_name("Int32").unwrap();
        assert_eq!(parsed.meta, TypeMeta::Terminal);
        assert_eq!(parsed.code, TypeCode::Int32);
        assert_eq!(parsed.name, "Int32");
    }

    /// `Array(T)` has exactly one child describing `T`.
    #[test]
    fn test_array_type() {
        let parsed = parse_type_name("Array(String)").unwrap();
        assert_eq!(parsed.meta, TypeMeta::Array);
        assert_eq!(parsed.code, TypeCode::Array);
        assert_eq!(parsed.elements.len(), 1);

        let item = &parsed.elements[0];
        assert_eq!(item.code, TypeCode::String);
    }

    /// `Nullable(T)` has exactly one child describing `T`.
    #[test]
    fn test_nullable_type() {
        let parsed = parse_type_name("Nullable(UInt64)").unwrap();
        assert_eq!(parsed.meta, TypeMeta::Nullable);
        assert_eq!(parsed.elements.len(), 1);

        let inner = &parsed.elements[0];
        assert_eq!(inner.code, TypeCode::UInt64);
    }

    /// Wrappers nest: each level holds the next one as its first child.
    #[test]
    fn test_nested_types() {
        let parsed = parse_type_name("Array(Nullable(String))").unwrap();
        assert_eq!(parsed.meta, TypeMeta::Array);

        let nullable = &parsed.elements[0];
        assert_eq!(nullable.meta, TypeMeta::Nullable);
        assert_eq!(nullable.elements[0].code, TypeCode::String);
    }

    /// The size argument of `FixedString(N)` is stored as a number child.
    #[test]
    fn test_fixed_string() {
        let parsed = parse_type_name("FixedString(10)").unwrap();
        assert_eq!(parsed.meta, TypeMeta::Terminal);
        assert_eq!(parsed.code, TypeCode::FixedString);
        assert_eq!(parsed.elements.len(), 1);

        let size = &parsed.elements[0];
        assert_eq!(size.meta, TypeMeta::Number);
        assert_eq!(size.value, 10);
    }

    /// Enum items are flattened into alternating name/value children.
    #[test]
    fn test_enum8() {
        let parsed = parse_type_name("Enum8('red' = 1, 'green' = 2)").unwrap();
        assert_eq!(parsed.meta, TypeMeta::Enum);
        assert_eq!(parsed.code, TypeCode::Enum8);
        assert_eq!(parsed.elements.len(), 4); // 'red', 1, 'green', 2
    }

    /// Parsing the same name twice yields equal ASTs and leaves an entry in
    /// the thread-local cache.
    #[test]
    fn test_caching() {
        let first = parse_type_name("String").unwrap();
        let second = parse_type_name("String").unwrap();
        assert_eq!(first, second);

        // Verify it's actually cached by checking the thread-local cache
        TYPE_CACHE.with(|cell| {
            assert!(cell.borrow().contains_key("String"));
        });
    }
}
528}