1use std::collections::HashMap;
7use std::fmt::{self, Debug, Formatter};
8use std::sync::{Arc, LazyLock};
9
10use arrow_array::ArrayRef;
11use arrow_schema::{DataType, Field as ArrowField, Fields, TimeUnit};
12use deepsize::DeepSizeOf;
13use lance_arrow::bfloat16::{is_bfloat16_field, BFLOAT16_EXT_NAME};
14use lance_arrow::{ARROW_EXT_META_KEY, ARROW_EXT_NAME_KEY};
15use snafu::location;
16
17mod field;
18mod schema;
19
20use crate::{Error, Result};
21pub use field::{
22 Encoding, Field, NullabilityComparison, OnTypeMismatch, SchemaCompareOptions, StorageClass,
23 LANCE_STORAGE_CLASS_SCHEMA_META_KEY,
24};
25pub use schema::{
26 escape_field_path_for_project, format_field_path, parse_field_path, FieldRef, OnMissing,
27 Projectable, Projection, Schema,
28};
29
30pub static BLOB_DESC_FIELDS: LazyLock<Fields> = LazyLock::new(|| {
31 Fields::from(vec![
32 ArrowField::new("position", DataType::UInt64, true),
33 ArrowField::new("size", DataType::UInt64, true),
34 ])
35});
36
37pub static BLOB_DESC_TYPE: LazyLock<DataType> =
38 LazyLock::new(|| DataType::Struct(BLOB_DESC_FIELDS.clone()));
39
40pub static BLOB_DESC_FIELD: LazyLock<ArrowField> = LazyLock::new(|| {
41 ArrowField::new("description", BLOB_DESC_TYPE.clone(), true).with_metadata(HashMap::from([(
42 lance_arrow::BLOB_META_KEY.to_string(),
43 "true".to_string(),
44 )]))
45});
46
47pub static BLOB_DESC_LANCE_FIELD: LazyLock<Field> =
48 LazyLock::new(|| Field::try_from(&*BLOB_DESC_FIELD).unwrap());
49
/// String representation of an Arrow data type.
///
/// The wrapped string is what the `TryFrom<&DataType>` conversion in this
/// module produces (for example `"int32"`, `"decimal:128:10:2"` or
/// `"fixed_size_list:float:8"`) and what `TryFrom<&LogicalType> for DataType`
/// parses back.
#[derive(Debug, Clone, PartialEq, DeepSizeOf)]
pub struct LogicalType(String);
54
55impl fmt::Display for LogicalType {
56 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
57 write!(f, "{}", self.0)
58 }
59}
60
61impl LogicalType {
62 fn is_list(&self) -> bool {
63 self.0 == "list" || self.0 == "list.struct"
64 }
65
66 fn is_large_list(&self) -> bool {
67 self.0 == "large_list" || self.0 == "large_list.struct"
68 }
69
70 fn is_struct(&self) -> bool {
71 self.0 == "struct"
72 }
73}
74
75impl From<&str> for LogicalType {
76 fn from(s: &str) -> Self {
77 Self(s.to_string())
78 }
79}
80
81fn timeunit_to_str(unit: &TimeUnit) -> &'static str {
82 match unit {
83 TimeUnit::Second => "s",
84 TimeUnit::Millisecond => "ms",
85 TimeUnit::Microsecond => "us",
86 TimeUnit::Nanosecond => "ns",
87 }
88}
89
90fn parse_timeunit(unit: &str) -> Result<TimeUnit> {
91 match unit {
92 "s" => Ok(TimeUnit::Second),
93 "ms" => Ok(TimeUnit::Millisecond),
94 "us" => Ok(TimeUnit::Microsecond),
95 "ns" => Ok(TimeUnit::Nanosecond),
96 _ => Err(Error::Arrow {
97 message: format!("Unsupported TimeUnit: {unit}"),
98 location: location!(),
99 }),
100 }
101}
102
103impl TryFrom<&DataType> for LogicalType {
104 type Error = Error;
105
106 fn try_from(dt: &DataType) -> Result<Self> {
107 let type_str = match dt {
108 DataType::Null => "null".to_string(),
109 DataType::Boolean => "bool".to_string(),
110 DataType::Int8 => "int8".to_string(),
111 DataType::UInt8 => "uint8".to_string(),
112 DataType::Int16 => "int16".to_string(),
113 DataType::UInt16 => "uint16".to_string(),
114 DataType::Int32 => "int32".to_string(),
115 DataType::UInt32 => "uint32".to_string(),
116 DataType::Int64 => "int64".to_string(),
117 DataType::UInt64 => "uint64".to_string(),
118 DataType::Float16 => "halffloat".to_string(),
119 DataType::Float32 => "float".to_string(),
120 DataType::Float64 => "double".to_string(),
121 DataType::Decimal128(precision, scale) => format!("decimal:128:{precision}:{scale}"),
122 DataType::Decimal256(precision, scale) => format!("decimal:256:{precision}:{scale}"),
123 DataType::Utf8 => "string".to_string(),
124 DataType::Binary => "binary".to_string(),
125 DataType::LargeUtf8 => "large_string".to_string(),
126 DataType::LargeBinary => "large_binary".to_string(),
127 DataType::Date32 => "date32:day".to_string(),
128 DataType::Date64 => "date64:ms".to_string(),
129 DataType::Time32(tu) => format!("time32:{}", timeunit_to_str(tu)),
130 DataType::Time64(tu) => format!("time64:{}", timeunit_to_str(tu)),
131 DataType::Timestamp(tu, tz) => format!(
132 "timestamp:{}:{}",
133 timeunit_to_str(tu),
134 tz.as_ref()
135 .map(|v| v.to_string())
136 .unwrap_or("-".to_string())
137 ),
138 DataType::Duration(tu) => format!("duration:{}", timeunit_to_str(tu)),
139 DataType::Struct(_) => "struct".to_string(),
140 DataType::Dictionary(key_type, value_type) => {
141 format!(
142 "dict:{}:{}:{}",
143 Self::try_from(value_type.as_ref())?.0,
144 Self::try_from(key_type.as_ref())?.0,
145 false
147 )
148 }
149 DataType::List(elem) => match elem.data_type() {
150 DataType::Struct(_) => "list.struct".to_string(),
151 _ => "list".to_string(),
152 },
153 DataType::LargeList(elem) => match elem.data_type() {
154 DataType::Struct(_) => "large_list.struct".to_string(),
155 _ => "large_list".to_string(),
156 },
157 DataType::FixedSizeList(field, len) => {
158 if is_bfloat16_field(field) {
159 format!("fixed_size_list:lance.bfloat16:{}", *len)
162 } else {
163 format!(
164 "fixed_size_list:{}:{}",
165 Self::try_from(field.data_type())?.0,
166 *len
167 )
168 }
169 }
170 DataType::FixedSizeBinary(len) => format!("fixed_size_binary:{}", *len),
171 _ => {
172 return Err(Error::Schema {
173 message: format!("Unsupported data type: {:?}", dt),
174 location: location!(),
175 })
176 }
177 };
178
179 Ok(Self(type_str))
180 }
181}
182
183impl TryFrom<&LogicalType> for DataType {
184 type Error = Error;
185
186 fn try_from(lt: &LogicalType) -> Result<Self> {
187 use DataType::*;
188 if let Some(t) = match lt.0.as_str() {
189 "null" => Some(Null),
190 "bool" => Some(Boolean),
191 "int8" => Some(Int8),
192 "uint8" => Some(UInt8),
193 "int16" => Some(Int16),
194 "uint16" => Some(UInt16),
195 "int32" => Some(Int32),
196 "uint32" => Some(UInt32),
197 "int64" => Some(Int64),
198 "uint64" => Some(UInt64),
199 "halffloat" => Some(Float16),
200 "float" => Some(Float32),
201 "double" => Some(Float64),
202 "string" => Some(Utf8),
203 "binary" => Some(Binary),
204 "large_string" => Some(LargeUtf8),
205 "large_binary" => Some(LargeBinary),
206 "json" => Some(LargeBinary),
207 "date32:day" => Some(Date32),
208 "date64:ms" => Some(Date64),
209 "time32:s" => Some(Time32(TimeUnit::Second)),
210 "time32:ms" => Some(Time32(TimeUnit::Millisecond)),
211 "time64:us" => Some(Time64(TimeUnit::Microsecond)),
212 "time64:ns" => Some(Time64(TimeUnit::Nanosecond)),
213 "duration:s" => Some(Duration(TimeUnit::Second)),
214 "duration:ms" => Some(Duration(TimeUnit::Millisecond)),
215 "duration:us" => Some(Duration(TimeUnit::Microsecond)),
216 "duration:ns" => Some(Duration(TimeUnit::Nanosecond)),
217 _ => None,
218 } {
219 Ok(t)
220 } else {
221 let splits = lt.0.split(':').collect::<Vec<_>>();
222 match splits[0] {
223 "fixed_size_list" => {
224 if splits.len() < 3 {
225 return Err(Error::Schema {
226 message: format!("Unsupported logical type: {}", lt),
227 location: location!(),
228 });
229 }
230
231 let size: i32 =
232 splits
233 .last()
234 .unwrap()
235 .parse::<i32>()
236 .map_err(|e: _| Error::Schema {
237 message: e.to_string(),
238 location: location!(),
239 })?;
240
241 let inner_type = splits[1..splits.len() - 1].join(":");
242
243 match inner_type.as_str() {
244 BFLOAT16_EXT_NAME => {
245 let field = ArrowField::new("item", Self::FixedSizeBinary(2), true)
246 .with_metadata(
247 [
248 (ARROW_EXT_NAME_KEY.into(), BFLOAT16_EXT_NAME.into()),
249 (ARROW_EXT_META_KEY.into(), "".into()),
250 ]
251 .into(),
252 );
253 Ok(FixedSizeList(Arc::new(field), size))
254 }
255 data_type => {
256 let elem_type = (&LogicalType(data_type.to_string())).try_into()?;
257
258 Ok(FixedSizeList(
259 Arc::new(ArrowField::new("item", elem_type, true)),
260 size,
261 ))
262 }
263 }
264 }
265 "fixed_size_binary" => {
266 if splits.len() != 2 {
267 Err(Error::Schema {
268 message: format!("Unsupported logical type: {}", lt),
269 location: location!(),
270 })
271 } else {
272 let size: i32 = splits[1].parse::<i32>().map_err(|e: _| Error::Schema {
273 message: e.to_string(),
274 location: location!(),
275 })?;
276 Ok(FixedSizeBinary(size))
277 }
278 }
279 "dict" => {
280 if splits.len() != 4 {
281 Err(Error::Schema {
282 message: format!("Unsupported dictionary type: {}", lt),
283 location: location!(),
284 })
285 } else {
286 let value_type: Self = (&LogicalType::from(splits[1])).try_into()?;
287 let index_type: Self = (&LogicalType::from(splits[2])).try_into()?;
288 Ok(Dictionary(Box::new(index_type), Box::new(value_type)))
289 }
290 }
291 "decimal" => {
292 if splits.len() != 4 {
293 Err(Error::Schema {
294 message: format!("Unsupported decimal type: {}", lt),
295 location: location!(),
296 })
297 } else {
298 let bits: i16 = splits[1].parse::<i16>().map_err(|err| Error::Schema {
299 message: err.to_string(),
300 location: location!(),
301 })?;
302 let precision: u8 =
303 splits[2].parse::<u8>().map_err(|err| Error::Schema {
304 message: err.to_string(),
305 location: location!(),
306 })?;
307 let scale: i8 = splits[3].parse::<i8>().map_err(|err| Error::Schema {
308 message: err.to_string(),
309 location: location!(),
310 })?;
311
312 if bits == 128 {
313 Ok(Decimal128(precision, scale))
314 } else if bits == 256 {
315 Ok(Decimal256(precision, scale))
316 } else {
317 Err(Error::Schema {
318 message: format!(
319 "Only Decimal128 and Decimal256 is supported. Found {bits}"
320 ),
321 location: location!(),
322 })
323 }
324 }
325 }
326 "timestamp" => {
327 if splits.len() != 3 {
328 Err(Error::Schema {
329 message: format!("Unsupported timestamp type: {}", lt),
330 location: location!(),
331 })
332 } else {
333 let timeunit = parse_timeunit(splits[1])?;
334 let tz: Option<Arc<str>> = if splits[2] == "-" {
335 None
336 } else {
337 Some(splits[2].into())
338 };
339 Ok(Timestamp(timeunit, tz))
340 }
341 }
342 _ => Err(Error::Schema {
343 message: format!("Unsupported logical type: {}", lt),
344 location: location!(),
345 }),
346 }
347 }
348 }
349}
350
/// Dictionary information: a location (`offset`, `length`) plus the
/// optionally loaded dictionary values.
///
/// `Default` yields `offset == 0`, `length == 0` and `values == None`.
#[derive(Debug, Clone, Default)]
pub struct Dictionary {
    /// NOTE(review): presumably the position of the serialized dictionary in
    /// its containing file — confirm against the reader/writer.
    pub offset: usize,

    /// NOTE(review): presumably the size of the serialized dictionary —
    /// confirm units (bytes vs. entries) against the reader/writer.
    pub length: usize,

    /// The dictionary values as an Arrow array, or `None` when not present.
    pub values: Option<ArrayRef>,
}
359
360impl DeepSizeOf for Dictionary {
361 fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize {
362 self.values
363 .as_ref()
364 .map(|v| v.get_array_memory_size())
365 .unwrap_or(0)
366 }
367}
368
369impl PartialEq for Dictionary {
370 fn eq(&self, other: &Self) -> bool {
371 match (&self.values, &other.values) {
372 (Some(a), Some(b)) => a == b,
373 _ => false,
374 }
375 }
376}
377
378pub fn lance_supports_nulls(datatype: &DataType) -> bool {
380 matches!(
381 datatype,
382 DataType::Utf8
383 | DataType::LargeUtf8
384 | DataType::Binary
385 | DataType::List(_)
386 | DataType::FixedSizeBinary(_)
387 | DataType::FixedSizeList(_, _)
388 )
389}