databend_driver_core/
schema.rs

1// Copyright 2021 Datafuse Labs
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use databend_client::SchemaField as APISchemaField;
18
19use crate::error::{Error, Result};
20
21use arrow_schema::{DataType as ArrowDataType, Field as ArrowField, SchemaRef as ArrowSchemaRef};
22
// Extension types defined by Databend.
//
// When an Arrow field is converted into a `Field`, its metadata is looked up
// under `EXTENSION_KEY`; if present, the value is compared against the
// `ARROW_EXT_TYPE_*` names below to pick the Databend data type.

/// Arrow field metadata key whose value names a Databend extension type.
pub(crate) const EXTENSION_KEY: &str = "Extension";
/// Extension name mapped to `DataType::EmptyArray`.
pub(crate) const ARROW_EXT_TYPE_EMPTY_ARRAY: &str = "EmptyArray";
/// Extension name mapped to `DataType::EmptyMap`.
pub(crate) const ARROW_EXT_TYPE_EMPTY_MAP: &str = "EmptyMap";
/// Extension name mapped to `DataType::Variant`.
pub(crate) const ARROW_EXT_TYPE_VARIANT: &str = "Variant";
/// Extension name mapped to `DataType::Bitmap`.
pub(crate) const ARROW_EXT_TYPE_BITMAP: &str = "Bitmap";
/// Extension name mapped to `DataType::Geometry`.
pub(crate) const ARROW_EXT_TYPE_GEOMETRY: &str = "Geometry";
/// Extension name mapped to `DataType::Geography`.
pub(crate) const ARROW_EXT_TYPE_GEOGRAPHY: &str = "Geography";
/// Extension name mapped to `DataType::Interval`.
pub(crate) const ARROW_EXT_TYPE_INTERVAL: &str = "Interval";
/// Extension name mapped to `DataType::Vector`; the dimension is taken from
/// the Arrow `FixedSizeList` length of the field.
pub(crate) const ARROW_EXT_TYPE_VECTOR: &str = "Vector";
33
/// The numeric kinds that can appear inside `DataType::Number`.
///
/// Each variant is produced from the type string of the same name (for
/// example `"UInt8"`) and from the Arrow primitive type of the same name.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NumberDataType {
    UInt8,
    UInt16,
    UInt32,
    UInt64,
    Int8,
    Int16,
    Int32,
    Int64,
    Float32,
    Float64,
}
47
/// The two parameters of a decimal type, i.e. `p` and `s` in `Decimal(p, s)`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DecimalSize {
    /// First parameter of `Decimal(p, s)`.
    pub precision: u8,
    /// Second parameter of `Decimal(p, s)`.
    pub scale: u8,
}
53
54#[derive(Debug, Clone, PartialEq, Eq)]
55pub enum DecimalDataType {
56    Decimal128(DecimalSize),
57    Decimal256(DecimalSize),
58}
59
60impl DecimalDataType {
61    pub fn decimal_size(&self) -> &DecimalSize {
62        match self {
63            DecimalDataType::Decimal128(size) => size,
64            DecimalDataType::Decimal256(size) => size,
65        }
66    }
67}
68
69#[derive(Debug, Clone)]
70pub enum DataType {
71    Null,
72    EmptyArray,
73    EmptyMap,
74    Boolean,
75    Binary,
76    String,
77    Number(NumberDataType),
78    Decimal(DecimalDataType),
79    Timestamp,
80    TimestampTz,
81    Date,
82    Nullable(Box<DataType>),
83    Array(Box<DataType>),
84    Map(Box<DataType>),
85    Tuple(Vec<DataType>),
86    Variant,
87    Bitmap,
88    Geometry,
89    Geography,
90    Interval,
91    Vector(u64),
92    // Generic(usize),
93}
94
95impl DataType {
96    pub fn is_numeric(&self) -> bool {
97        match self {
98            DataType::Number(_) | DataType::Decimal(_) => true,
99            DataType::Nullable(inner) => inner.is_numeric(),
100            _ => false,
101        }
102    }
103}
104
105impl std::fmt::Display for DataType {
106    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
107        match self {
108            DataType::Null => write!(f, "Null"),
109            DataType::EmptyArray => write!(f, "EmptyArray"),
110            DataType::EmptyMap => write!(f, "EmptyMap"),
111            DataType::Boolean => write!(f, "Boolean"),
112            DataType::Binary => write!(f, "Binary"),
113            DataType::String => write!(f, "String"),
114            DataType::Number(n) => match n {
115                NumberDataType::UInt8 => write!(f, "UInt8"),
116                NumberDataType::UInt16 => write!(f, "UInt16"),
117                NumberDataType::UInt32 => write!(f, "UInt32"),
118                NumberDataType::UInt64 => write!(f, "UInt64"),
119                NumberDataType::Int8 => write!(f, "Int8"),
120                NumberDataType::Int16 => write!(f, "Int16"),
121                NumberDataType::Int32 => write!(f, "Int32"),
122                NumberDataType::Int64 => write!(f, "Int64"),
123                NumberDataType::Float32 => write!(f, "Float32"),
124                NumberDataType::Float64 => write!(f, "Float64"),
125            },
126            DataType::Decimal(d) => {
127                let size = d.decimal_size();
128                write!(f, "Decimal({}, {})", size.precision, size.scale)
129            }
130            DataType::Timestamp => write!(f, "Timestamp"),
131            DataType::TimestampTz => write!(f, "Timestamp_Tz"),
132            DataType::Date => write!(f, "Date"),
133            DataType::Nullable(inner) => write!(f, "Nullable({inner})"),
134            DataType::Array(inner) => write!(f, "Array({inner})"),
135            DataType::Map(inner) => match inner.as_ref() {
136                DataType::Tuple(tys) => {
137                    write!(f, "Map({}, {})", tys[0], tys[1])
138                }
139                _ => unreachable!(),
140            },
141            DataType::Tuple(inner) => {
142                let inner = inner
143                    .iter()
144                    .map(|x| x.to_string())
145                    .collect::<Vec<_>>()
146                    .join(", ");
147                write!(f, "Tuple({inner})")
148            }
149            DataType::Variant => write!(f, "Variant"),
150            DataType::Bitmap => write!(f, "Bitmap"),
151            DataType::Geometry => write!(f, "Geometry"),
152            DataType::Geography => write!(f, "Geography"),
153            DataType::Interval => write!(f, "Interval"),
154            DataType::Vector(d) => write!(f, "Vector({d})"),
155        }
156    }
157}
158
/// A named, typed entry of a [`Schema`].
#[derive(Debug, Clone)]
pub struct Field {
    /// Name of the field as reported by the source schema.
    pub name: String,
    /// Data type of the field.
    pub data_type: DataType,
}
164
165#[derive(Debug, Clone, Default)]
166pub struct Schema(Vec<Field>);
167
168pub type SchemaRef = Arc<Schema>;
169
170impl Schema {
171    pub fn fields(&self) -> &[Field] {
172        &self.0
173    }
174
175    pub fn from_vec(fields: Vec<Field>) -> Self {
176        Self(fields)
177    }
178}
179
180impl TryFrom<&TypeDesc<'_>> for DataType {
181    type Error = Error;
182
183    fn try_from(desc: &TypeDesc) -> Result<Self> {
184        if desc.nullable {
185            let mut desc = desc.clone();
186            desc.nullable = false;
187            let inner = DataType::try_from(&desc)?;
188            return Ok(DataType::Nullable(Box::new(inner)));
189        }
190        let dt = match desc.name {
191            "NULL" | "Null" => DataType::Null,
192            "Boolean" => DataType::Boolean,
193            "Binary" => DataType::Binary,
194            "String" => DataType::String,
195            "Int8" => DataType::Number(NumberDataType::Int8),
196            "Int16" => DataType::Number(NumberDataType::Int16),
197            "Int32" => DataType::Number(NumberDataType::Int32),
198            "Int64" => DataType::Number(NumberDataType::Int64),
199            "UInt8" => DataType::Number(NumberDataType::UInt8),
200            "UInt16" => DataType::Number(NumberDataType::UInt16),
201            "UInt32" => DataType::Number(NumberDataType::UInt32),
202            "UInt64" => DataType::Number(NumberDataType::UInt64),
203            "Float32" => DataType::Number(NumberDataType::Float32),
204            "Float64" => DataType::Number(NumberDataType::Float64),
205            "Decimal" => {
206                let precision = desc.args[0].name.parse::<u8>()?;
207                let scale = desc.args[1].name.parse::<u8>()?;
208
209                if precision <= 38 {
210                    DataType::Decimal(DecimalDataType::Decimal128(DecimalSize {
211                        precision,
212                        scale,
213                    }))
214                } else {
215                    DataType::Decimal(DecimalDataType::Decimal256(DecimalSize {
216                        precision,
217                        scale,
218                    }))
219                }
220            }
221            "Timestamp" => DataType::Timestamp,
222            "Date" => DataType::Date,
223            "Nullable" => {
224                if desc.args.len() != 1 {
225                    return Err(Error::Parsing(
226                        "Nullable type must have one argument".to_string(),
227                    ));
228                }
229                let mut desc = desc.clone();
230                // ignore inner NULL indicator
231                desc.nullable = false;
232                let inner = Self::try_from(&desc.args[0])?;
233                DataType::Nullable(Box::new(inner))
234            }
235            "Array" => {
236                if desc.args.len() != 1 {
237                    return Err(Error::Parsing(
238                        "Array type must have one argument".to_string(),
239                    ));
240                }
241                if desc.args[0].name == "Nothing" {
242                    DataType::EmptyArray
243                } else {
244                    let inner = Self::try_from(&desc.args[0])?;
245                    DataType::Array(Box::new(inner))
246                }
247            }
248            "Map" => {
249                if desc.args.len() == 1 && desc.args[0].name == "Nothing" {
250                    DataType::EmptyMap
251                } else {
252                    if desc.args.len() != 2 {
253                        return Err(Error::Parsing(
254                            "Map type must have two arguments".to_string(),
255                        ));
256                    }
257                    let key_ty = Self::try_from(&desc.args[0])?;
258                    let val_ty = Self::try_from(&desc.args[1])?;
259                    DataType::Map(Box::new(DataType::Tuple(vec![key_ty, val_ty])))
260                }
261            }
262            "Tuple" => {
263                let mut inner = vec![];
264                for arg in &desc.args {
265                    inner.push(Self::try_from(arg)?);
266                }
267                DataType::Tuple(inner)
268            }
269            "Variant" => DataType::Variant,
270            "Bitmap" => DataType::Bitmap,
271            "Geometry" => DataType::Geometry,
272            "Geography" => DataType::Geography,
273            "Interval" => DataType::Interval,
274            "Vector" => {
275                let dimension = desc.args[0].name.parse::<u64>()?;
276                DataType::Vector(dimension)
277            }
278            "Timestamp_Tz" => DataType::TimestampTz,
279            _ => return Err(Error::Parsing(format!("Unknown type: {desc:?}"))),
280        };
281        Ok(dt)
282    }
283}
284
285impl TryFrom<APISchemaField> for Field {
286    type Error = Error;
287
288    fn try_from(f: APISchemaField) -> Result<Self> {
289        let type_desc = parse_type_desc(&f.data_type)?;
290        let dt = DataType::try_from(&type_desc)?;
291        let field = Self {
292            name: f.name,
293            data_type: dt,
294        };
295        Ok(field)
296    }
297}
298
299impl TryFrom<Vec<APISchemaField>> for Schema {
300    type Error = Error;
301
302    fn try_from(fields: Vec<APISchemaField>) -> Result<Self> {
303        let fields = fields
304            .into_iter()
305            .map(Field::try_from)
306            .collect::<Result<Vec<_>>>()?;
307        Ok(Self(fields))
308    }
309}
310
311impl TryFrom<&Arc<ArrowField>> for Field {
312    type Error = Error;
313
314    fn try_from(f: &Arc<ArrowField>) -> Result<Self> {
315        let mut dt = if let Some(extend_type) = f.metadata().get(EXTENSION_KEY) {
316            match extend_type.as_str() {
317                ARROW_EXT_TYPE_EMPTY_ARRAY => DataType::EmptyArray,
318                ARROW_EXT_TYPE_EMPTY_MAP => DataType::EmptyMap,
319                ARROW_EXT_TYPE_VARIANT => DataType::Variant,
320                ARROW_EXT_TYPE_BITMAP => DataType::Bitmap,
321                ARROW_EXT_TYPE_GEOMETRY => DataType::Geometry,
322                ARROW_EXT_TYPE_GEOGRAPHY => DataType::Geography,
323                ARROW_EXT_TYPE_INTERVAL => DataType::Interval,
324                ARROW_EXT_TYPE_VECTOR => match f.data_type() {
325                    ArrowDataType::FixedSizeList(field, dimension) => {
326                        let dimension = match field.data_type() {
327                            ArrowDataType::Float32 => *dimension as u64,
328                            _ => {
329                                return Err(Error::Parsing(format!(
330                                    "Unsupported FixedSizeList Arrow type: {:?}",
331                                    field.data_type()
332                                )));
333                            }
334                        };
335                        DataType::Vector(dimension)
336                    }
337                    arrow_type => {
338                        return Err(Error::Parsing(format!(
339                            "Unsupported Arrow type: {arrow_type:?}",
340                        )));
341                    }
342                },
343                _ => {
344                    return Err(Error::Parsing(format!(
345                        "Unsupported extension datatype for arrow field: {f:?}"
346                    )))
347                }
348            }
349        } else {
350            match f.data_type() {
351                ArrowDataType::Null => DataType::Null,
352                ArrowDataType::Boolean => DataType::Boolean,
353                ArrowDataType::Int8 => DataType::Number(NumberDataType::Int8),
354                ArrowDataType::Int16 => DataType::Number(NumberDataType::Int16),
355                ArrowDataType::Int32 => DataType::Number(NumberDataType::Int32),
356                ArrowDataType::Int64 => DataType::Number(NumberDataType::Int64),
357                ArrowDataType::UInt8 => DataType::Number(NumberDataType::UInt8),
358                ArrowDataType::UInt16 => DataType::Number(NumberDataType::UInt16),
359                ArrowDataType::UInt32 => DataType::Number(NumberDataType::UInt32),
360                ArrowDataType::UInt64 => DataType::Number(NumberDataType::UInt64),
361                ArrowDataType::Float32 => DataType::Number(NumberDataType::Float32),
362                ArrowDataType::Float64 => DataType::Number(NumberDataType::Float64),
363                ArrowDataType::Binary
364                | ArrowDataType::LargeBinary
365                | ArrowDataType::FixedSizeBinary(_) => DataType::Binary,
366                ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View => {
367                    DataType::String
368                }
369                ArrowDataType::Timestamp(_, _) => DataType::Timestamp,
370                ArrowDataType::Date32 => DataType::Date,
371                ArrowDataType::Decimal128(p, s) => {
372                    DataType::Decimal(DecimalDataType::Decimal128(DecimalSize {
373                        precision: *p,
374                        scale: *s as u8,
375                    }))
376                }
377                ArrowDataType::Decimal256(p, s) => {
378                    DataType::Decimal(DecimalDataType::Decimal256(DecimalSize {
379                        precision: *p,
380                        scale: *s as u8,
381                    }))
382                }
383                ArrowDataType::List(f) | ArrowDataType::LargeList(f) => {
384                    let inner_field = Field::try_from(f)?;
385                    let inner_ty = inner_field.data_type;
386                    DataType::Array(Box::new(inner_ty))
387                }
388                ArrowDataType::Map(f, _) => {
389                    let inner_field = Field::try_from(f)?;
390                    let inner_ty = inner_field.data_type;
391                    DataType::Map(Box::new(inner_ty))
392                }
393                ArrowDataType::Struct(fs) => {
394                    let mut inner_tys = Vec::with_capacity(fs.len());
395                    for f in fs {
396                        let inner_field = Field::try_from(f)?;
397                        let inner_ty = inner_field.data_type;
398                        inner_tys.push(inner_ty);
399                    }
400                    DataType::Tuple(inner_tys)
401                }
402                _ => {
403                    return Err(Error::Parsing(format!(
404                        "Unsupported datatype for arrow field: {f:?}"
405                    )))
406                }
407            }
408        };
409        if f.is_nullable() && !matches!(dt, DataType::Null) {
410            dt = DataType::Nullable(Box::new(dt));
411        }
412        Ok(Field {
413            name: f.name().to_string(),
414            data_type: dt,
415        })
416    }
417}
418
419impl TryFrom<ArrowSchemaRef> for Schema {
420    type Error = Error;
421
422    fn try_from(schema_ref: ArrowSchemaRef) -> Result<Self> {
423        let fields = schema_ref
424            .fields()
425            .iter()
426            .map(Field::try_from)
427            .collect::<Result<Vec<_>>>()?;
428        Ok(Self(fields))
429    }
430}
431
/// Parsed form of a textual type description, as produced by
/// `parse_type_desc`. All names borrow from the input string.
#[derive(Debug, Clone, PartialEq, Eq)]
struct TypeDesc<'t> {
    /// Type name, e.g. `Decimal` for the input `Decimal(42, 42)`.
    name: &'t str,
    /// `true` when the description ended with a trailing `NULL` marker,
    /// e.g. `String NULL`.
    nullable: bool,
    /// Arguments found between the parentheses, in order of appearance.
    args: Vec<TypeDesc<'t>>,
}
438
439fn parse_type_desc(s: &str) -> Result<TypeDesc<'_>> {
440    let mut name = "";
441    let mut args = vec![];
442    let mut depth = 0;
443    let mut start = 0;
444    let mut nullable = false;
445    for (i, c) in s.char_indices() {
446        match c {
447            '(' => {
448                if depth == 0 {
449                    name = &s[start..i];
450                    start = i + 1;
451                }
452                depth += 1;
453            }
454            ')' => {
455                depth -= 1;
456                if depth == 0 {
457                    let s = &s[start..i];
458                    if !s.is_empty() {
459                        args.push(parse_type_desc(s)?);
460                    }
461                    start = i + 1;
462                }
463            }
464            ',' => {
465                if depth == 1 {
466                    let s = &s[start..i];
467                    args.push(parse_type_desc(s)?);
468                    start = i + 1;
469                }
470            }
471            ' ' => {
472                if depth == 0 {
473                    let s = &s[start..i];
474                    if !s.is_empty() {
475                        name = s;
476                    }
477                    start = i + 1;
478                }
479            }
480            _ => {}
481        }
482    }
483    if depth != 0 {
484        return Err(Error::Parsing(format!("Invalid type desc: {s}")));
485    }
486    if start < s.len() {
487        let s = &s[start..];
488        if !s.is_empty() {
489            if name.is_empty() {
490                name = s;
491            } else if s == "NULL" {
492                nullable = true;
493            } else {
494                return Err(Error::Parsing(format!("Invalid type arg for {name}: {s}")));
495            }
496        }
497    }
498    Ok(TypeDesc {
499        name,
500        nullable,
501        args,
502    })
503}
504
#[cfg(test)]
mod test {
    use std::vec;

    use super::*;

    /// Expected descriptor without the trailing `NULL` marker.
    fn plain<'t>(name: &'t str, args: Vec<TypeDesc<'t>>) -> TypeDesc<'t> {
        TypeDesc {
            name,
            nullable: false,
            args,
        }
    }

    /// Expected descriptor carrying the trailing `NULL` marker.
    fn nullable<'t>(name: &'t str, args: Vec<TypeDesc<'t>>) -> TypeDesc<'t> {
        TypeDesc {
            name,
            nullable: true,
            args,
        }
    }

    /// Parses each `(description, input, expected)` case and compares the
    /// result, using the description as the failure message.
    fn check_cases<'t>(cases: Vec<(&'t str, &'t str, TypeDesc<'t>)>) {
        for (desc, input, expected) in cases {
            let output = parse_type_desc(input).unwrap();
            assert_eq!(output, expected, "{}", desc);
        }
    }

    #[test]
    fn test_parse_type_desc() {
        check_cases(vec![
            ("plain type", "String", plain("String", vec![])),
            (
                "decimal type",
                "Decimal(42, 42)",
                plain("Decimal", vec![plain("42", vec![]), plain("42", vec![])]),
            ),
            (
                "nullable type",
                "Nullable(Nothing)",
                plain("Nullable", vec![plain("Nothing", vec![])]),
            ),
            ("empty arg", "DateTime()", plain("DateTime", vec![])),
            (
                "numeric arg",
                "FixedString(42)",
                plain("FixedString", vec![plain("42", vec![])]),
            ),
            (
                "multiple args",
                "Array(Tuple(Tuple(String, String), Tuple(String, UInt64)))",
                plain(
                    "Array",
                    vec![plain(
                        "Tuple",
                        vec![
                            plain(
                                "Tuple",
                                vec![plain("String", vec![]), plain("String", vec![])],
                            ),
                            plain(
                                "Tuple",
                                vec![plain("String", vec![]), plain("UInt64", vec![])],
                            ),
                        ],
                    )],
                ),
            ),
            (
                "map args",
                "Map(String, Array(Int64))",
                plain(
                    "Map",
                    vec![
                        plain("String", vec![]),
                        plain("Array", vec![plain("Int64", vec![])]),
                    ],
                ),
            ),
            (
                "map nullable value args",
                "Nullable(Map(String, String NULL))",
                plain(
                    "Nullable",
                    vec![plain(
                        "Map",
                        vec![plain("String", vec![]), nullable("String", vec![])],
                    )],
                ),
            ),
        ]);
    }

    #[test]
    fn test_parse_complex_type_with_null() {
        check_cases(vec![(
            "complex nullable type",
            "Nullable(Tuple(String NULL, Array(Tuple(Array(Int32 NULL) NULL, Array(String NULL) NULL) NULL) NULL))",
            plain(
                "Nullable",
                vec![plain(
                    "Tuple",
                    vec![
                        nullable("String", vec![]),
                        nullable(
                            "Array",
                            vec![nullable(
                                "Tuple",
                                vec![
                                    nullable("Array", vec![nullable("Int32", vec![])]),
                                    nullable("Array", vec![nullable("String", vec![])]),
                                ],
                            )],
                        ),
                    ],
                )],
            ),
        )]);
    }
}
753}