lance_arrow/
schema.rs

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Extension to arrow schema
5
6use arrow_schema::{ArrowError, DataType, Field, FieldRef, Schema};
7
8use crate::BLOB_META_KEY;
9
/// Padding policy used by the compact string renderers in this module.
///
/// The renderers only ever emit the padding returned for the current level;
/// they do not insert line breaks themselves.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Indentation {
    /// No padding at all: nested fields are written back to back.
    OneLine,
    /// Pad with the given number of spaces at the current nesting level.
    MultiLine(u8),
}
14
15impl Indentation {
16    fn value(&self) -> String {
17        match self {
18            Self::OneLine => "".to_string(),
19            Self::MultiLine(spaces) => " ".repeat(*spaces as usize),
20        }
21    }
22
23    fn deepen(&self) -> Self {
24        match self {
25            Self::OneLine => Self::OneLine,
26            Self::MultiLine(spaces) => Self::MultiLine(spaces + 2),
27        }
28    }
29}
30
/// Extends the functionality of [arrow_schema::Field].
pub trait FieldExt {
    /// Create a compact string representation of the field
    ///
    /// This is intended for display purposes and not for serialization.
    ///
    /// The [`Field`] implementation in this file renders `name: type`, appends
    /// `?` when the field is nullable, wraps struct children in `{}`, list
    /// items in `[]`, fixed-size lists as `[item; dimension]` and dictionaries
    /// as `value@key`. `indent` supplies the padding placed around struct
    /// children; nested fields receive a deepened indentation.
    fn to_compact_string(&self, indent: Indentation) -> String;

    /// Check if the field is marked as a packed struct
    ///
    /// The [`Field`] implementation in this file looks at the field metadata
    /// keys `packed` and `lance-encoding:packed` and returns `true` when
    /// either holds the value `true` (ASCII case-insensitive).
    fn is_packed_struct(&self) -> bool;

    /// Check if the field is marked as a blob
    ///
    /// The [`Field`] implementation in this file returns `true` whenever the
    /// field metadata contains the `BLOB_META_KEY` entry; the stored value is
    /// not inspected.
    fn is_blob(&self) -> bool;
}
44
45impl FieldExt for Field {
46    fn to_compact_string(&self, indent: Indentation) -> String {
47        let mut result = format!("{}: ", self.name().clone());
48        match self.data_type() {
49            DataType::Struct(fields) => {
50                result += "{";
51                result += &indent.value();
52                for (field_idx, field) in fields.iter().enumerate() {
53                    result += field.to_compact_string(indent.deepen()).as_str();
54                    if field_idx < fields.len() - 1 {
55                        result += ",";
56                    }
57                    result += indent.value().as_str();
58                }
59                result += "}";
60            }
61            DataType::List(field)
62            | DataType::LargeList(field)
63            | DataType::ListView(field)
64            | DataType::LargeListView(field) => {
65                result += "[";
66                result += field.to_compact_string(indent.deepen()).as_str();
67                result += "]";
68            }
69            DataType::FixedSizeList(child, dimension) => {
70                result += &format!(
71                    "[{}; {}]",
72                    child.to_compact_string(indent.deepen()),
73                    dimension
74                );
75            }
76            DataType::Dictionary(key_type, value_type) => {
77                result += &value_type.to_string();
78                result += "@";
79                result += &key_type.to_string();
80            }
81            _ => {
82                result += &self.data_type().to_string();
83            }
84        }
85        if self.is_nullable() {
86            result += "?";
87        }
88        result
89    }
90
91    // Check if field has metadata `packed` set to true, this check is case insensitive.
92    fn is_packed_struct(&self) -> bool {
93        let field_metadata = self.metadata();
94        const PACKED_KEYS: [&str; 2] = ["packed", "lance-encoding:packed"];
95        PACKED_KEYS.iter().any(|key| {
96            field_metadata
97                .get(*key)
98                .map(|value| value.eq_ignore_ascii_case("true"))
99                .unwrap_or(false)
100        })
101    }
102
103    fn is_blob(&self) -> bool {
104        let field_metadata = self.metadata();
105        field_metadata.get(BLOB_META_KEY).is_some()
106    }
107}
108
/// Extends the functionality of [arrow_schema::Schema].
pub trait SchemaExt {
    /// Create a new [`Schema`] with one extra field.
    ///
    /// The [`Schema`] implementation in this file appends the field after the
    /// existing ones, keeps the schema metadata, and returns an error when a
    /// column with the same name already exists.
    fn try_with_column(&self, field: Field) -> std::result::Result<Schema, ArrowError>;

    /// Create a new [`Schema`] with `field` inserted at position `index`.
    ///
    /// The [`Schema`] implementation in this file keeps the schema metadata
    /// and returns an error when a column with the same name already exists.
    fn try_with_column_at(
        &self,
        index: usize,
        field: Field,
    ) -> std::result::Result<Schema, ArrowError>;

    /// Return the names of the fields, in schema order.
    ///
    /// The [`Schema`] implementation in this file lists top-level fields only.
    fn field_names(&self) -> Vec<&String>;

    /// Create a new [`Schema`] without the column named `column_name`.
    ///
    /// The [`Schema`] implementation in this file only considers top-level
    /// fields and returns an equivalent schema when no field has that name.
    fn without_column(&self, column_name: &str) -> Schema;

    /// Create a compact string representation of the schema
    ///
    /// This is intended for display purposes and not for serialization
    fn to_compact_string(&self, indent: Indentation) -> String;
}
129
130impl SchemaExt for Schema {
131    fn try_with_column(&self, field: Field) -> std::result::Result<Schema, ArrowError> {
132        if self.column_with_name(field.name()).is_some() {
133            return Err(ArrowError::SchemaError(format!(
134                "Can not append column {} on schema: {:?}",
135                field.name(),
136                self
137            )));
138        };
139        let mut fields: Vec<FieldRef> = self.fields().iter().cloned().collect();
140        fields.push(FieldRef::new(field));
141        Ok(Self::new_with_metadata(fields, self.metadata.clone()))
142    }
143
144    fn try_with_column_at(
145        &self,
146        index: usize,
147        field: Field,
148    ) -> std::result::Result<Schema, ArrowError> {
149        if self.column_with_name(field.name()).is_some() {
150            return Err(ArrowError::SchemaError(format!(
151                "Failed to modify schema: Inserting column {} would create a duplicate column in schema: {:?}",
152                field.name(),
153                self
154            )));
155        };
156        let mut fields: Vec<FieldRef> = self.fields().iter().cloned().collect();
157        fields.insert(index, FieldRef::new(field));
158        Ok(Self::new_with_metadata(fields, self.metadata.clone()))
159    }
160
161    /// Project the schema to remove the given column.
162    ///
163    /// This only works on top-level fields right now. If a field does not exist,
164    /// the schema will be returned as is.
165    fn without_column(&self, column_name: &str) -> Schema {
166        let fields: Vec<FieldRef> = self
167            .fields()
168            .iter()
169            .filter(|f| f.name() != column_name)
170            .cloned()
171            .collect();
172        Self::new_with_metadata(fields, self.metadata.clone())
173    }
174
175    fn field_names(&self) -> Vec<&String> {
176        self.fields().iter().map(|f| f.name()).collect()
177    }
178
179    fn to_compact_string(&self, indent: Indentation) -> String {
180        let mut result = "{".to_string();
181        result += &indent.value();
182        for (field_idx, field) in self.fields.iter().enumerate() {
183            result += field.to_compact_string(indent.deepen()).as_str();
184            if field_idx < self.fields.len() - 1 {
185                result += ",";
186            }
187            result += indent.value().as_str();
188        }
189        result += "}";
190        result
191    }
192}