1use std::sync::Arc;
16
17use databend_client::SchemaField as APISchemaField;
18
19use crate::error::{Error, Result};
20
21use arrow_schema::{DataType as ArrowDataType, Field as ArrowField, SchemaRef as ArrowSchemaRef};
22
/// Metadata key looked up on an Arrow field to find the name of its extension type.
pub(crate) const EXTENSION_KEY: &str = "Extension";
/// Extension type name that converts to `DataType::EmptyArray`.
pub(crate) const ARROW_EXT_TYPE_EMPTY_ARRAY: &str = "EmptyArray";
/// Extension type name that converts to `DataType::EmptyMap`.
pub(crate) const ARROW_EXT_TYPE_EMPTY_MAP: &str = "EmptyMap";
/// Extension type name that converts to `DataType::Variant`.
pub(crate) const ARROW_EXT_TYPE_VARIANT: &str = "Variant";
/// Extension type name that converts to `DataType::Bitmap`.
pub(crate) const ARROW_EXT_TYPE_BITMAP: &str = "Bitmap";
/// Extension type name that converts to `DataType::Geometry`.
pub(crate) const ARROW_EXT_TYPE_GEOMETRY: &str = "Geometry";
/// Extension type name that converts to `DataType::Geography`.
pub(crate) const ARROW_EXT_TYPE_GEOGRAPHY: &str = "Geography";
/// Extension type name that converts to `DataType::Interval`.
pub(crate) const ARROW_EXT_TYPE_INTERVAL: &str = "Interval";
/// Extension type name that converts to `DataType::Vector`; the Arrow field itself must be a
/// `FixedSizeList` of `Float32` for the conversion to succeed.
pub(crate) const ARROW_EXT_TYPE_VECTOR: &str = "Vector";
33
/// The numeric kinds that can appear inside `DataType::Number`.
///
/// Each variant is produced both from the type name of the same spelling (e.g. `"UInt8"`) and
/// from the Arrow primitive type of the same name.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NumberDataType {
    UInt8,
    UInt16,
    UInt32,
    UInt64,
    Int8,
    Int16,
    Int32,
    Int64,
    Float32,
    Float64,
}
47
/// The `(precision, scale)` pair carried by every decimal type.
///
/// Rendered by `DataType`'s `Display` impl as `Decimal(precision, scale)`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DecimalSize {
    /// First argument of the `Decimal(..)` type name.
    pub precision: u8,
    /// Second argument of the `Decimal(..)` type name.
    pub scale: u8,
}

/// A decimal type, tagged with the storage width it uses.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DecimalDataType {
    /// Chosen by the type-name parser when the precision is at most 38.
    Decimal128(DecimalSize),
    /// Chosen by the type-name parser when the precision exceeds 38.
    Decimal256(DecimalSize),
}

impl DecimalDataType {
    /// Borrows the precision/scale pair, whichever width variant this is.
    pub fn decimal_size(&self) -> &DecimalSize {
        match self {
            Self::Decimal128(size) | Self::Decimal256(size) => size,
        }
    }
}
68
/// Logical type of a column or value, built either from a textual type name or from an Arrow
/// field.
#[derive(Debug, Clone)]
pub enum DataType {
    /// Parsed from the names `"NULL"` / `"Null"` and from Arrow's `Null` type.
    Null,
    /// Parsed from `Array(Nothing)` or from the `EmptyArray` Arrow extension.
    EmptyArray,
    /// Parsed from `Map(Nothing)` or from the `EmptyMap` Arrow extension.
    EmptyMap,
    Boolean,
    /// Also used for Arrow `LargeBinary` and `FixedSizeBinary`.
    Binary,
    /// Also used for Arrow `LargeUtf8` and `Utf8View`.
    String,
    Number(NumberDataType),
    Decimal(DecimalDataType),
    Timestamp,
    /// Spelled `Timestamp_Tz` in textual type names.
    TimestampTz,
    Date,
    /// Wraps the type that may additionally hold NULL.
    Nullable(Box<DataType>),
    /// Wraps the element type.
    Array(Box<DataType>),
    /// Wraps the entry type; the type-name parser always stores `Tuple([key, value])` here.
    Map(Box<DataType>),
    /// Element types in positional order.
    Tuple(Vec<DataType>),
    Variant,
    Bitmap,
    Geometry,
    Geography,
    Interval,
    /// Carries the vector's dimension.
    Vector(u64),
}
94
95impl DataType {
96 pub fn is_numeric(&self) -> bool {
97 match self {
98 DataType::Number(_) | DataType::Decimal(_) => true,
99 DataType::Nullable(inner) => inner.is_numeric(),
100 _ => false,
101 }
102 }
103}
104
105impl std::fmt::Display for DataType {
106 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
107 match self {
108 DataType::Null => write!(f, "Null"),
109 DataType::EmptyArray => write!(f, "EmptyArray"),
110 DataType::EmptyMap => write!(f, "EmptyMap"),
111 DataType::Boolean => write!(f, "Boolean"),
112 DataType::Binary => write!(f, "Binary"),
113 DataType::String => write!(f, "String"),
114 DataType::Number(n) => match n {
115 NumberDataType::UInt8 => write!(f, "UInt8"),
116 NumberDataType::UInt16 => write!(f, "UInt16"),
117 NumberDataType::UInt32 => write!(f, "UInt32"),
118 NumberDataType::UInt64 => write!(f, "UInt64"),
119 NumberDataType::Int8 => write!(f, "Int8"),
120 NumberDataType::Int16 => write!(f, "Int16"),
121 NumberDataType::Int32 => write!(f, "Int32"),
122 NumberDataType::Int64 => write!(f, "Int64"),
123 NumberDataType::Float32 => write!(f, "Float32"),
124 NumberDataType::Float64 => write!(f, "Float64"),
125 },
126 DataType::Decimal(d) => {
127 let size = d.decimal_size();
128 write!(f, "Decimal({}, {})", size.precision, size.scale)
129 }
130 DataType::Timestamp => write!(f, "Timestamp"),
131 DataType::TimestampTz => write!(f, "Timestamp_Tz"),
132 DataType::Date => write!(f, "Date"),
133 DataType::Nullable(inner) => write!(f, "Nullable({inner})"),
134 DataType::Array(inner) => write!(f, "Array({inner})"),
135 DataType::Map(inner) => match inner.as_ref() {
136 DataType::Tuple(tys) => {
137 write!(f, "Map({}, {})", tys[0], tys[1])
138 }
139 _ => unreachable!(),
140 },
141 DataType::Tuple(inner) => {
142 let inner = inner
143 .iter()
144 .map(|x| x.to_string())
145 .collect::<Vec<_>>()
146 .join(", ");
147 write!(f, "Tuple({inner})")
148 }
149 DataType::Variant => write!(f, "Variant"),
150 DataType::Bitmap => write!(f, "Bitmap"),
151 DataType::Geometry => write!(f, "Geometry"),
152 DataType::Geography => write!(f, "Geography"),
153 DataType::Interval => write!(f, "Interval"),
154 DataType::Vector(d) => write!(f, "Vector({d})"),
155 }
156 }
157}
158
/// A named column: the field's name together with its logical type.
#[derive(Debug, Clone)]
pub struct Field {
    /// Field name, copied from the API schema field or the Arrow field it was built from.
    pub name: String,
    /// Logical type; wrapped in `DataType::Nullable` when the source marks the field nullable.
    pub data_type: DataType,
}
164
165#[derive(Debug, Clone, Default)]
166pub struct Schema(Vec<Field>);
167
168pub type SchemaRef = Arc<Schema>;
169
170impl Schema {
171 pub fn fields(&self) -> &[Field] {
172 &self.0
173 }
174
175 pub fn from_vec(fields: Vec<Field>) -> Self {
176 Self(fields)
177 }
178}
179
180impl TryFrom<&TypeDesc<'_>> for DataType {
181 type Error = Error;
182
183 fn try_from(desc: &TypeDesc) -> Result<Self> {
184 if desc.nullable {
185 let mut desc = desc.clone();
186 desc.nullable = false;
187 let inner = DataType::try_from(&desc)?;
188 return Ok(DataType::Nullable(Box::new(inner)));
189 }
190 let dt = match desc.name {
191 "NULL" | "Null" => DataType::Null,
192 "Boolean" => DataType::Boolean,
193 "Binary" => DataType::Binary,
194 "String" => DataType::String,
195 "Int8" => DataType::Number(NumberDataType::Int8),
196 "Int16" => DataType::Number(NumberDataType::Int16),
197 "Int32" => DataType::Number(NumberDataType::Int32),
198 "Int64" => DataType::Number(NumberDataType::Int64),
199 "UInt8" => DataType::Number(NumberDataType::UInt8),
200 "UInt16" => DataType::Number(NumberDataType::UInt16),
201 "UInt32" => DataType::Number(NumberDataType::UInt32),
202 "UInt64" => DataType::Number(NumberDataType::UInt64),
203 "Float32" => DataType::Number(NumberDataType::Float32),
204 "Float64" => DataType::Number(NumberDataType::Float64),
205 "Decimal" => {
206 let precision = desc.args[0].name.parse::<u8>()?;
207 let scale = desc.args[1].name.parse::<u8>()?;
208
209 if precision <= 38 {
210 DataType::Decimal(DecimalDataType::Decimal128(DecimalSize {
211 precision,
212 scale,
213 }))
214 } else {
215 DataType::Decimal(DecimalDataType::Decimal256(DecimalSize {
216 precision,
217 scale,
218 }))
219 }
220 }
221 "Timestamp" => DataType::Timestamp,
222 "Date" => DataType::Date,
223 "Nullable" => {
224 if desc.args.len() != 1 {
225 return Err(Error::Parsing(
226 "Nullable type must have one argument".to_string(),
227 ));
228 }
229 let mut desc = desc.clone();
230 desc.nullable = false;
232 let inner = Self::try_from(&desc.args[0])?;
233 DataType::Nullable(Box::new(inner))
234 }
235 "Array" => {
236 if desc.args.len() != 1 {
237 return Err(Error::Parsing(
238 "Array type must have one argument".to_string(),
239 ));
240 }
241 if desc.args[0].name == "Nothing" {
242 DataType::EmptyArray
243 } else {
244 let inner = Self::try_from(&desc.args[0])?;
245 DataType::Array(Box::new(inner))
246 }
247 }
248 "Map" => {
249 if desc.args.len() == 1 && desc.args[0].name == "Nothing" {
250 DataType::EmptyMap
251 } else {
252 if desc.args.len() != 2 {
253 return Err(Error::Parsing(
254 "Map type must have two arguments".to_string(),
255 ));
256 }
257 let key_ty = Self::try_from(&desc.args[0])?;
258 let val_ty = Self::try_from(&desc.args[1])?;
259 DataType::Map(Box::new(DataType::Tuple(vec![key_ty, val_ty])))
260 }
261 }
262 "Tuple" => {
263 let mut inner = vec![];
264 for arg in &desc.args {
265 inner.push(Self::try_from(arg)?);
266 }
267 DataType::Tuple(inner)
268 }
269 "Variant" => DataType::Variant,
270 "Bitmap" => DataType::Bitmap,
271 "Geometry" => DataType::Geometry,
272 "Geography" => DataType::Geography,
273 "Interval" => DataType::Interval,
274 "Vector" => {
275 let dimension = desc.args[0].name.parse::<u64>()?;
276 DataType::Vector(dimension)
277 }
278 "Timestamp_Tz" => DataType::TimestampTz,
279 _ => return Err(Error::Parsing(format!("Unknown type: {desc:?}"))),
280 };
281 Ok(dt)
282 }
283}
284
285impl TryFrom<APISchemaField> for Field {
286 type Error = Error;
287
288 fn try_from(f: APISchemaField) -> Result<Self> {
289 let type_desc = parse_type_desc(&f.data_type)?;
290 let dt = DataType::try_from(&type_desc)?;
291 let field = Self {
292 name: f.name,
293 data_type: dt,
294 };
295 Ok(field)
296 }
297}
298
299impl TryFrom<Vec<APISchemaField>> for Schema {
300 type Error = Error;
301
302 fn try_from(fields: Vec<APISchemaField>) -> Result<Self> {
303 let fields = fields
304 .into_iter()
305 .map(Field::try_from)
306 .collect::<Result<Vec<_>>>()?;
307 Ok(Self(fields))
308 }
309}
310
311impl TryFrom<&Arc<ArrowField>> for Field {
312 type Error = Error;
313
314 fn try_from(f: &Arc<ArrowField>) -> Result<Self> {
315 let mut dt = if let Some(extend_type) = f.metadata().get(EXTENSION_KEY) {
316 match extend_type.as_str() {
317 ARROW_EXT_TYPE_EMPTY_ARRAY => DataType::EmptyArray,
318 ARROW_EXT_TYPE_EMPTY_MAP => DataType::EmptyMap,
319 ARROW_EXT_TYPE_VARIANT => DataType::Variant,
320 ARROW_EXT_TYPE_BITMAP => DataType::Bitmap,
321 ARROW_EXT_TYPE_GEOMETRY => DataType::Geometry,
322 ARROW_EXT_TYPE_GEOGRAPHY => DataType::Geography,
323 ARROW_EXT_TYPE_INTERVAL => DataType::Interval,
324 ARROW_EXT_TYPE_VECTOR => match f.data_type() {
325 ArrowDataType::FixedSizeList(field, dimension) => {
326 let dimension = match field.data_type() {
327 ArrowDataType::Float32 => *dimension as u64,
328 _ => {
329 return Err(Error::Parsing(format!(
330 "Unsupported FixedSizeList Arrow type: {:?}",
331 field.data_type()
332 )));
333 }
334 };
335 DataType::Vector(dimension)
336 }
337 arrow_type => {
338 return Err(Error::Parsing(format!(
339 "Unsupported Arrow type: {arrow_type:?}",
340 )));
341 }
342 },
343 _ => {
344 return Err(Error::Parsing(format!(
345 "Unsupported extension datatype for arrow field: {f:?}"
346 )))
347 }
348 }
349 } else {
350 match f.data_type() {
351 ArrowDataType::Null => DataType::Null,
352 ArrowDataType::Boolean => DataType::Boolean,
353 ArrowDataType::Int8 => DataType::Number(NumberDataType::Int8),
354 ArrowDataType::Int16 => DataType::Number(NumberDataType::Int16),
355 ArrowDataType::Int32 => DataType::Number(NumberDataType::Int32),
356 ArrowDataType::Int64 => DataType::Number(NumberDataType::Int64),
357 ArrowDataType::UInt8 => DataType::Number(NumberDataType::UInt8),
358 ArrowDataType::UInt16 => DataType::Number(NumberDataType::UInt16),
359 ArrowDataType::UInt32 => DataType::Number(NumberDataType::UInt32),
360 ArrowDataType::UInt64 => DataType::Number(NumberDataType::UInt64),
361 ArrowDataType::Float32 => DataType::Number(NumberDataType::Float32),
362 ArrowDataType::Float64 => DataType::Number(NumberDataType::Float64),
363 ArrowDataType::Binary
364 | ArrowDataType::LargeBinary
365 | ArrowDataType::FixedSizeBinary(_) => DataType::Binary,
366 ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View => {
367 DataType::String
368 }
369 ArrowDataType::Timestamp(_, _) => DataType::Timestamp,
370 ArrowDataType::Date32 => DataType::Date,
371 ArrowDataType::Decimal128(p, s) => {
372 DataType::Decimal(DecimalDataType::Decimal128(DecimalSize {
373 precision: *p,
374 scale: *s as u8,
375 }))
376 }
377 ArrowDataType::Decimal256(p, s) => {
378 DataType::Decimal(DecimalDataType::Decimal256(DecimalSize {
379 precision: *p,
380 scale: *s as u8,
381 }))
382 }
383 ArrowDataType::List(f) | ArrowDataType::LargeList(f) => {
384 let inner_field = Field::try_from(f)?;
385 let inner_ty = inner_field.data_type;
386 DataType::Array(Box::new(inner_ty))
387 }
388 ArrowDataType::Map(f, _) => {
389 let inner_field = Field::try_from(f)?;
390 let inner_ty = inner_field.data_type;
391 DataType::Map(Box::new(inner_ty))
392 }
393 ArrowDataType::Struct(fs) => {
394 let mut inner_tys = Vec::with_capacity(fs.len());
395 for f in fs {
396 let inner_field = Field::try_from(f)?;
397 let inner_ty = inner_field.data_type;
398 inner_tys.push(inner_ty);
399 }
400 DataType::Tuple(inner_tys)
401 }
402 _ => {
403 return Err(Error::Parsing(format!(
404 "Unsupported datatype for arrow field: {f:?}"
405 )))
406 }
407 }
408 };
409 if f.is_nullable() && !matches!(dt, DataType::Null) {
410 dt = DataType::Nullable(Box::new(dt));
411 }
412 Ok(Field {
413 name: f.name().to_string(),
414 data_type: dt,
415 })
416 }
417}
418
419impl TryFrom<ArrowSchemaRef> for Schema {
420 type Error = Error;
421
422 fn try_from(schema_ref: ArrowSchemaRef) -> Result<Self> {
423 let fields = schema_ref
424 .fields()
425 .iter()
426 .map(Field::try_from)
427 .collect::<Result<Vec<_>>>()?;
428 Ok(Self(fields))
429 }
430}
431
/// Parsed form of a textual type description such as `Map(String, Int32 NULL)`.
///
/// All string data is borrowed from the input passed to `parse_type_desc`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct TypeDesc<'t> {
    /// The text before the opening parenthesis, or the whole token when there is none.
    /// Numeric arguments such as the `42` in `Decimal(42, 42)` are stored here as well.
    name: &'t str,
    /// `true` when the description ends with a trailing ` NULL` marker.
    nullable: bool,
    /// The comma-separated items inside the parentheses, each parsed recursively.
    args: Vec<TypeDesc<'t>>,
}
438
439fn parse_type_desc(s: &str) -> Result<TypeDesc<'_>> {
440 let mut name = "";
441 let mut args = vec![];
442 let mut depth = 0;
443 let mut start = 0;
444 let mut nullable = false;
445 for (i, c) in s.char_indices() {
446 match c {
447 '(' => {
448 if depth == 0 {
449 name = &s[start..i];
450 start = i + 1;
451 }
452 depth += 1;
453 }
454 ')' => {
455 depth -= 1;
456 if depth == 0 {
457 let s = &s[start..i];
458 if !s.is_empty() {
459 args.push(parse_type_desc(s)?);
460 }
461 start = i + 1;
462 }
463 }
464 ',' => {
465 if depth == 1 {
466 let s = &s[start..i];
467 args.push(parse_type_desc(s)?);
468 start = i + 1;
469 }
470 }
471 ' ' => {
472 if depth == 0 {
473 let s = &s[start..i];
474 if !s.is_empty() {
475 name = s;
476 }
477 start = i + 1;
478 }
479 }
480 _ => {}
481 }
482 }
483 if depth != 0 {
484 return Err(Error::Parsing(format!("Invalid type desc: {s}")));
485 }
486 if start < s.len() {
487 let s = &s[start..];
488 if !s.is_empty() {
489 if name.is_empty() {
490 name = s;
491 } else if s == "NULL" {
492 nullable = true;
493 } else {
494 return Err(Error::Parsing(format!("Invalid type arg for {name}: {s}")));
495 }
496 }
497 }
498 Ok(TypeDesc {
499 name,
500 nullable,
501 args,
502 })
503}
504
#[cfg(test)]
mod test {
    use std::vec;

    use super::*;

    /// Expected descriptor with arguments.
    fn node<'t>(name: &'t str, nullable: bool, args: Vec<TypeDesc<'t>>) -> TypeDesc<'t> {
        TypeDesc {
            name,
            nullable,
            args,
        }
    }

    /// Expected descriptor without arguments.
    fn leaf(name: &str, nullable: bool) -> TypeDesc<'_> {
        node(name, nullable, vec![])
    }

    /// Parses each `(description, input, expected)` case and compares against the expectation.
    fn assert_parses(cases: Vec<(&str, &str, TypeDesc<'_>)>) {
        for (desc, input, expected) in cases {
            let output = parse_type_desc(input).unwrap();
            assert_eq!(output, expected, "{}", desc);
        }
    }

    #[test]
    fn test_parse_type_desc() {
        assert_parses(vec![
            ("plain type", "String", leaf("String", false)),
            (
                "decimal type",
                "Decimal(42, 42)",
                node("Decimal", false, vec![leaf("42", false), leaf("42", false)]),
            ),
            (
                "nullable type",
                "Nullable(Nothing)",
                node("Nullable", false, vec![leaf("Nothing", false)]),
            ),
            ("empty arg", "DateTime()", leaf("DateTime", false)),
            (
                "numeric arg",
                "FixedString(42)",
                node("FixedString", false, vec![leaf("42", false)]),
            ),
            (
                "multiple args",
                "Array(Tuple(Tuple(String, String), Tuple(String, UInt64)))",
                node(
                    "Array",
                    false,
                    vec![node(
                        "Tuple",
                        false,
                        vec![
                            node(
                                "Tuple",
                                false,
                                vec![leaf("String", false), leaf("String", false)],
                            ),
                            node(
                                "Tuple",
                                false,
                                vec![leaf("String", false), leaf("UInt64", false)],
                            ),
                        ],
                    )],
                ),
            ),
            (
                "map args",
                "Map(String, Array(Int64))",
                node(
                    "Map",
                    false,
                    vec![
                        leaf("String", false),
                        node("Array", false, vec![leaf("Int64", false)]),
                    ],
                ),
            ),
            (
                "map nullable value args",
                "Nullable(Map(String, String NULL))",
                node(
                    "Nullable",
                    false,
                    vec![node(
                        "Map",
                        false,
                        vec![leaf("String", false), leaf("String", true)],
                    )],
                ),
            ),
        ]);
    }

    #[test]
    fn test_parse_complex_type_with_null() {
        assert_parses(vec![(
            "complex nullable type",
            "Nullable(Tuple(String NULL, Array(Tuple(Array(Int32 NULL) NULL, Array(String NULL) NULL) NULL) NULL))",
            node(
                "Nullable",
                false,
                vec![node(
                    "Tuple",
                    false,
                    vec![
                        leaf("String", true),
                        node(
                            "Array",
                            true,
                            vec![node(
                                "Tuple",
                                true,
                                vec![
                                    node("Array", true, vec![leaf("Int32", true)]),
                                    node("Array", true, vec![leaf("String", true)]),
                                ],
                            )],
                        ),
                    ],
                )],
            ),
        )]);
    }
}