lance_core/
datatypes.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Lance data types, [Schema] and [Field]
5
6use std::collections::HashMap;
7use std::fmt::{self, Debug, Formatter};
8use std::sync::{Arc, LazyLock};
9
10use arrow_array::ArrayRef;
11use arrow_schema::{DataType, Field as ArrowField, Fields, TimeUnit};
12use deepsize::DeepSizeOf;
13use lance_arrow::bfloat16::{is_bfloat16_field, BFLOAT16_EXT_NAME};
14use lance_arrow::{ARROW_EXT_META_KEY, ARROW_EXT_NAME_KEY};
15use snafu::location;
16
17mod field;
18mod schema;
19
20use crate::{Error, Result};
21pub use field::{
22    Encoding, Field, NullabilityComparison, OnTypeMismatch, SchemaCompareOptions, StorageClass,
23    LANCE_STORAGE_CLASS_SCHEMA_META_KEY,
24};
25pub use schema::{
26    escape_field_path_for_project, format_field_path, parse_field_path, FieldRef, OnMissing,
27    Projectable, Projection, Schema,
28};
29
30pub static BLOB_DESC_FIELDS: LazyLock<Fields> = LazyLock::new(|| {
31    Fields::from(vec![
32        ArrowField::new("position", DataType::UInt64, true),
33        ArrowField::new("size", DataType::UInt64, true),
34    ])
35});
36
37pub static BLOB_DESC_TYPE: LazyLock<DataType> =
38    LazyLock::new(|| DataType::Struct(BLOB_DESC_FIELDS.clone()));
39
40pub static BLOB_DESC_FIELD: LazyLock<ArrowField> = LazyLock::new(|| {
41    ArrowField::new("description", BLOB_DESC_TYPE.clone(), true).with_metadata(HashMap::from([(
42        lance_arrow::BLOB_META_KEY.to_string(),
43        "true".to_string(),
44    )]))
45});
46
47pub static BLOB_DESC_LANCE_FIELD: LazyLock<Field> =
48    LazyLock::new(|| Field::try_from(&*BLOB_DESC_FIELD).unwrap());
49
/// `LogicalType` is a string representation of an Arrow data type,
/// intended to be serialized into protobuf.
///
/// Simple types use a fixed name (e.g. `"int32"`); parameterized types append
/// their parameters after `:` separators (e.g. `"decimal:128:10:2"`).
#[derive(Debug, Clone, PartialEq, DeepSizeOf)]
pub struct LogicalType(String);
54
55impl fmt::Display for LogicalType {
56    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
57        write!(f, "{}", self.0)
58    }
59}
60
61impl LogicalType {
62    fn is_list(&self) -> bool {
63        self.0 == "list" || self.0 == "list.struct"
64    }
65
66    fn is_large_list(&self) -> bool {
67        self.0 == "large_list" || self.0 == "large_list.struct"
68    }
69
70    fn is_struct(&self) -> bool {
71        self.0 == "struct"
72    }
73}
74
75impl From<&str> for LogicalType {
76    fn from(s: &str) -> Self {
77        Self(s.to_string())
78    }
79}
80
81fn timeunit_to_str(unit: &TimeUnit) -> &'static str {
82    match unit {
83        TimeUnit::Second => "s",
84        TimeUnit::Millisecond => "ms",
85        TimeUnit::Microsecond => "us",
86        TimeUnit::Nanosecond => "ns",
87    }
88}
89
90fn parse_timeunit(unit: &str) -> Result<TimeUnit> {
91    match unit {
92        "s" => Ok(TimeUnit::Second),
93        "ms" => Ok(TimeUnit::Millisecond),
94        "us" => Ok(TimeUnit::Microsecond),
95        "ns" => Ok(TimeUnit::Nanosecond),
96        _ => Err(Error::Arrow {
97            message: format!("Unsupported TimeUnit: {unit}"),
98            location: location!(),
99        }),
100    }
101}
102
103impl TryFrom<&DataType> for LogicalType {
104    type Error = Error;
105
106    fn try_from(dt: &DataType) -> Result<Self> {
107        let type_str = match dt {
108            DataType::Null => "null".to_string(),
109            DataType::Boolean => "bool".to_string(),
110            DataType::Int8 => "int8".to_string(),
111            DataType::UInt8 => "uint8".to_string(),
112            DataType::Int16 => "int16".to_string(),
113            DataType::UInt16 => "uint16".to_string(),
114            DataType::Int32 => "int32".to_string(),
115            DataType::UInt32 => "uint32".to_string(),
116            DataType::Int64 => "int64".to_string(),
117            DataType::UInt64 => "uint64".to_string(),
118            DataType::Float16 => "halffloat".to_string(),
119            DataType::Float32 => "float".to_string(),
120            DataType::Float64 => "double".to_string(),
121            DataType::Decimal128(precision, scale) => format!("decimal:128:{precision}:{scale}"),
122            DataType::Decimal256(precision, scale) => format!("decimal:256:{precision}:{scale}"),
123            DataType::Utf8 => "string".to_string(),
124            DataType::Binary => "binary".to_string(),
125            DataType::LargeUtf8 => "large_string".to_string(),
126            DataType::LargeBinary => "large_binary".to_string(),
127            DataType::Date32 => "date32:day".to_string(),
128            DataType::Date64 => "date64:ms".to_string(),
129            DataType::Time32(tu) => format!("time32:{}", timeunit_to_str(tu)),
130            DataType::Time64(tu) => format!("time64:{}", timeunit_to_str(tu)),
131            DataType::Timestamp(tu, tz) => format!(
132                "timestamp:{}:{}",
133                timeunit_to_str(tu),
134                tz.as_ref()
135                    .map(|v| v.to_string())
136                    .unwrap_or("-".to_string())
137            ),
138            DataType::Duration(tu) => format!("duration:{}", timeunit_to_str(tu)),
139            DataType::Struct(_) => "struct".to_string(),
140            DataType::Dictionary(key_type, value_type) => {
141                format!(
142                    "dict:{}:{}:{}",
143                    Self::try_from(value_type.as_ref())?.0,
144                    Self::try_from(key_type.as_ref())?.0,
145                    // Arrow C++ Dictionary has "ordered:bool" field, but it does not exist in `arrow-rs`.
146                    false
147                )
148            }
149            DataType::List(elem) => match elem.data_type() {
150                DataType::Struct(_) => "list.struct".to_string(),
151                _ => "list".to_string(),
152            },
153            DataType::LargeList(elem) => match elem.data_type() {
154                DataType::Struct(_) => "large_list.struct".to_string(),
155                _ => "large_list".to_string(),
156            },
157            DataType::FixedSizeList(field, len) => {
158                if is_bfloat16_field(field) {
159                    // Don't want to directly use `bfloat16`, in case a built-in type is added
160                    // that isn't identical to our extension type.
161                    format!("fixed_size_list:lance.bfloat16:{}", *len)
162                } else {
163                    format!(
164                        "fixed_size_list:{}:{}",
165                        Self::try_from(field.data_type())?.0,
166                        *len
167                    )
168                }
169            }
170            DataType::FixedSizeBinary(len) => format!("fixed_size_binary:{}", *len),
171            _ => {
172                return Err(Error::Schema {
173                    message: format!("Unsupported data type: {:?}", dt),
174                    location: location!(),
175                })
176            }
177        };
178
179        Ok(Self(type_str))
180    }
181}
182
183impl TryFrom<&LogicalType> for DataType {
184    type Error = Error;
185
186    fn try_from(lt: &LogicalType) -> Result<Self> {
187        use DataType::*;
188        if let Some(t) = match lt.0.as_str() {
189            "null" => Some(Null),
190            "bool" => Some(Boolean),
191            "int8" => Some(Int8),
192            "uint8" => Some(UInt8),
193            "int16" => Some(Int16),
194            "uint16" => Some(UInt16),
195            "int32" => Some(Int32),
196            "uint32" => Some(UInt32),
197            "int64" => Some(Int64),
198            "uint64" => Some(UInt64),
199            "halffloat" => Some(Float16),
200            "float" => Some(Float32),
201            "double" => Some(Float64),
202            "string" => Some(Utf8),
203            "binary" => Some(Binary),
204            "large_string" => Some(LargeUtf8),
205            "large_binary" => Some(LargeBinary),
206            "json" => Some(LargeBinary),
207            "date32:day" => Some(Date32),
208            "date64:ms" => Some(Date64),
209            "time32:s" => Some(Time32(TimeUnit::Second)),
210            "time32:ms" => Some(Time32(TimeUnit::Millisecond)),
211            "time64:us" => Some(Time64(TimeUnit::Microsecond)),
212            "time64:ns" => Some(Time64(TimeUnit::Nanosecond)),
213            "duration:s" => Some(Duration(TimeUnit::Second)),
214            "duration:ms" => Some(Duration(TimeUnit::Millisecond)),
215            "duration:us" => Some(Duration(TimeUnit::Microsecond)),
216            "duration:ns" => Some(Duration(TimeUnit::Nanosecond)),
217            _ => None,
218        } {
219            Ok(t)
220        } else {
221            let splits = lt.0.split(':').collect::<Vec<_>>();
222            match splits[0] {
223                "fixed_size_list" => {
224                    if splits.len() < 3 {
225                        return Err(Error::Schema {
226                            message: format!("Unsupported logical type: {}", lt),
227                            location: location!(),
228                        });
229                    }
230
231                    let size: i32 =
232                        splits
233                            .last()
234                            .unwrap()
235                            .parse::<i32>()
236                            .map_err(|e: _| Error::Schema {
237                                message: e.to_string(),
238                                location: location!(),
239                            })?;
240
241                    let inner_type = splits[1..splits.len() - 1].join(":");
242
243                    match inner_type.as_str() {
244                        BFLOAT16_EXT_NAME => {
245                            let field = ArrowField::new("item", Self::FixedSizeBinary(2), true)
246                                .with_metadata(
247                                    [
248                                        (ARROW_EXT_NAME_KEY.into(), BFLOAT16_EXT_NAME.into()),
249                                        (ARROW_EXT_META_KEY.into(), "".into()),
250                                    ]
251                                    .into(),
252                                );
253                            Ok(FixedSizeList(Arc::new(field), size))
254                        }
255                        data_type => {
256                            let elem_type = (&LogicalType(data_type.to_string())).try_into()?;
257
258                            Ok(FixedSizeList(
259                                Arc::new(ArrowField::new("item", elem_type, true)),
260                                size,
261                            ))
262                        }
263                    }
264                }
265                "fixed_size_binary" => {
266                    if splits.len() != 2 {
267                        Err(Error::Schema {
268                            message: format!("Unsupported logical type: {}", lt),
269                            location: location!(),
270                        })
271                    } else {
272                        let size: i32 = splits[1].parse::<i32>().map_err(|e: _| Error::Schema {
273                            message: e.to_string(),
274                            location: location!(),
275                        })?;
276                        Ok(FixedSizeBinary(size))
277                    }
278                }
279                "dict" => {
280                    if splits.len() != 4 {
281                        Err(Error::Schema {
282                            message: format!("Unsupported dictionary type: {}", lt),
283                            location: location!(),
284                        })
285                    } else {
286                        let value_type: Self = (&LogicalType::from(splits[1])).try_into()?;
287                        let index_type: Self = (&LogicalType::from(splits[2])).try_into()?;
288                        Ok(Dictionary(Box::new(index_type), Box::new(value_type)))
289                    }
290                }
291                "decimal" => {
292                    if splits.len() != 4 {
293                        Err(Error::Schema {
294                            message: format!("Unsupported decimal type: {}", lt),
295                            location: location!(),
296                        })
297                    } else {
298                        let bits: i16 = splits[1].parse::<i16>().map_err(|err| Error::Schema {
299                            message: err.to_string(),
300                            location: location!(),
301                        })?;
302                        let precision: u8 =
303                            splits[2].parse::<u8>().map_err(|err| Error::Schema {
304                                message: err.to_string(),
305                                location: location!(),
306                            })?;
307                        let scale: i8 = splits[3].parse::<i8>().map_err(|err| Error::Schema {
308                            message: err.to_string(),
309                            location: location!(),
310                        })?;
311
312                        if bits == 128 {
313                            Ok(Decimal128(precision, scale))
314                        } else if bits == 256 {
315                            Ok(Decimal256(precision, scale))
316                        } else {
317                            Err(Error::Schema {
318                                message: format!(
319                                    "Only Decimal128 and Decimal256 is supported. Found {bits}"
320                                ),
321                                location: location!(),
322                            })
323                        }
324                    }
325                }
326                "timestamp" => {
327                    if splits.len() != 3 {
328                        Err(Error::Schema {
329                            message: format!("Unsupported timestamp type: {}", lt),
330                            location: location!(),
331                        })
332                    } else {
333                        let timeunit = parse_timeunit(splits[1])?;
334                        let tz: Option<Arc<str>> = if splits[2] == "-" {
335                            None
336                        } else {
337                            Some(splits[2].into())
338                        };
339                        Ok(Timestamp(timeunit, tz))
340                    }
341                }
342                _ => Err(Error::Schema {
343                    message: format!("Unsupported logical type: {}", lt),
344                    location: location!(),
345                }),
346            }
347        }
348    }
349}
350
/// Describes a dictionary as an `offset`/`length` pair plus an optional array
/// of values.
///
/// NOTE(review): `offset` and `length` presumably locate the serialized
/// dictionary values in a data file — confirm against the reader/writer.
#[derive(Debug, Clone, Default)]
pub struct Dictionary {
    /// Offset of the dictionary (unit and reference point not visible here — TODO confirm).
    pub offset: usize,

    /// Length of the dictionary (unit not visible here — TODO confirm).
    pub length: usize,

    /// The dictionary values, when present.
    pub values: Option<ArrayRef>,
}
359
360impl DeepSizeOf for Dictionary {
361    fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize {
362        self.values
363            .as_ref()
364            .map(|v| v.get_array_memory_size())
365            .unwrap_or(0)
366    }
367}
368
369impl PartialEq for Dictionary {
370    fn eq(&self, other: &Self) -> bool {
371        match (&self.values, &other.values) {
372            (Some(a), Some(b)) => a == b,
373            _ => false,
374        }
375    }
376}
377
378/// Returns true if Lance supports writing this datatype with nulls.
379pub fn lance_supports_nulls(datatype: &DataType) -> bool {
380    matches!(
381        datatype,
382        DataType::Utf8
383            | DataType::LargeUtf8
384            | DataType::Binary
385            | DataType::List(_)
386            | DataType::FixedSizeBinary(_)
387            | DataType::FixedSizeList(_, _)
388    )
389}