lance_arrow/
schema.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Extension to arrow schema
5
6use arrow_schema::{ArrowError, DataType, Field, FieldRef, Schema};
7
8use crate::{ARROW_EXT_NAME_KEY, BLOB_META_KEY, BLOB_V2_EXT_NAME};
9
/// Controls the padding inserted between items by the `to_compact_string` helpers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Indentation {
    /// No padding at all; items are written back to back.
    OneLine,
    /// Pad with the given number of spaces.
    MultiLine(u8),
}

impl Indentation {
    /// Number of spaces added for each level of nesting.
    const STEP: u8 = 2;

    /// Returns the padding string for this level.
    ///
    /// `OneLine` yields an empty string; `MultiLine(n)` yields `n` spaces.
    fn value(&self) -> String {
        match self {
            Self::OneLine => String::new(),
            Self::MultiLine(spaces) => " ".repeat(usize::from(*spaces)),
        }
    }

    /// Returns the indentation to use one nesting level deeper.
    ///
    /// `OneLine` stays `OneLine`. `MultiLine` grows by two spaces and saturates at
    /// `u8::MAX`, so very deep nesting cannot overflow the counter (plain `+` would
    /// panic in debug builds and wrap around in release builds).
    fn deepen(&self) -> Self {
        match self {
            Self::OneLine => Self::OneLine,
            Self::MultiLine(spaces) => Self::MultiLine(spaces.saturating_add(Self::STEP)),
        }
    }
}
30
31/// Extends the functionality of [arrow_schema::Field].
32pub trait FieldExt {
33    /// Create a compact string representation of the field
34    ///
35    /// This is intended for display purposes and not for serialization
36    fn to_compact_string(&self, indent: Indentation) -> String;
37
38    /// Check if the field is marked as a packed struct
39    fn is_packed_struct(&self) -> bool;
40
41    /// Check if the field is marked as a blob
42    fn is_blob(&self) -> bool;
43}
44
45impl FieldExt for Field {
46    fn to_compact_string(&self, indent: Indentation) -> String {
47        let mut result = format!("{}: ", self.name().clone());
48        match self.data_type() {
49            DataType::Struct(fields) => {
50                result += "{";
51                result += &indent.value();
52                for (field_idx, field) in fields.iter().enumerate() {
53                    result += field.to_compact_string(indent.deepen()).as_str();
54                    if field_idx < fields.len() - 1 {
55                        result += ",";
56                    }
57                    result += indent.value().as_str();
58                }
59                result += "}";
60            }
61            DataType::List(field)
62            | DataType::LargeList(field)
63            | DataType::ListView(field)
64            | DataType::LargeListView(field) => {
65                result += "[";
66                result += field.to_compact_string(indent.deepen()).as_str();
67                result += "]";
68            }
69            DataType::FixedSizeList(child, dimension) => {
70                result += &format!(
71                    "[{}; {}]",
72                    child.to_compact_string(indent.deepen()),
73                    dimension
74                );
75            }
76            DataType::Dictionary(key_type, value_type) => {
77                result += &value_type.to_string();
78                result += "@";
79                result += &key_type.to_string();
80            }
81            _ => {
82                result += &self.data_type().to_string();
83            }
84        }
85        if self.is_nullable() {
86            result += "?";
87        }
88        result
89    }
90
91    // Check if field has metadata `packed` set to true, this check is case insensitive.
92    fn is_packed_struct(&self) -> bool {
93        let field_metadata = self.metadata();
94        const PACKED_KEYS: [&str; 2] = ["packed", "lance-encoding:packed"];
95        PACKED_KEYS.iter().any(|key| {
96            field_metadata
97                .get(*key)
98                .map(|value| value.eq_ignore_ascii_case("true"))
99                .unwrap_or(false)
100        })
101    }
102
103    fn is_blob(&self) -> bool {
104        let field_metadata = self.metadata();
105        field_metadata.get(BLOB_META_KEY).is_some()
106            || field_metadata
107                .get(ARROW_EXT_NAME_KEY)
108                .map(|value| value == BLOB_V2_EXT_NAME)
109                .unwrap_or(false)
110    }
111}
112
113/// Extends the functionality of [arrow_schema::Schema].
114pub trait SchemaExt {
115    /// Create a new [`Schema`] with one extra field.
116    fn try_with_column(&self, field: Field) -> std::result::Result<Schema, ArrowError>;
117
118    fn try_with_column_at(
119        &self,
120        index: usize,
121        field: Field,
122    ) -> std::result::Result<Schema, ArrowError>;
123
124    fn field_names(&self) -> Vec<&String>;
125
126    fn without_column(&self, column_name: &str) -> Schema;
127
128    /// Create a compact string representation of the schema
129    ///
130    /// This is intended for display purposes and not for serialization
131    fn to_compact_string(&self, indent: Indentation) -> String;
132}
133
134impl SchemaExt for Schema {
135    fn try_with_column(&self, field: Field) -> std::result::Result<Schema, ArrowError> {
136        if self.column_with_name(field.name()).is_some() {
137            return Err(ArrowError::SchemaError(format!(
138                "Can not append column {} on schema: {:?}",
139                field.name(),
140                self
141            )));
142        };
143        let mut fields: Vec<FieldRef> = self.fields().iter().cloned().collect();
144        fields.push(FieldRef::new(field));
145        Ok(Self::new_with_metadata(fields, self.metadata.clone()))
146    }
147
148    fn try_with_column_at(
149        &self,
150        index: usize,
151        field: Field,
152    ) -> std::result::Result<Schema, ArrowError> {
153        if self.column_with_name(field.name()).is_some() {
154            return Err(ArrowError::SchemaError(format!(
155                "Failed to modify schema: Inserting column {} would create a duplicate column in schema: {:?}",
156                field.name(),
157                self
158            )));
159        };
160        let mut fields: Vec<FieldRef> = self.fields().iter().cloned().collect();
161        fields.insert(index, FieldRef::new(field));
162        Ok(Self::new_with_metadata(fields, self.metadata.clone()))
163    }
164
165    /// Project the schema to remove the given column.
166    ///
167    /// This only works on top-level fields right now. If a field does not exist,
168    /// the schema will be returned as is.
169    fn without_column(&self, column_name: &str) -> Schema {
170        let fields: Vec<FieldRef> = self
171            .fields()
172            .iter()
173            .filter(|f| f.name() != column_name)
174            .cloned()
175            .collect();
176        Self::new_with_metadata(fields, self.metadata.clone())
177    }
178
179    fn field_names(&self) -> Vec<&String> {
180        self.fields().iter().map(|f| f.name()).collect()
181    }
182
183    fn to_compact_string(&self, indent: Indentation) -> String {
184        let mut result = "{".to_string();
185        result += &indent.value();
186        for (field_idx, field) in self.fields.iter().enumerate() {
187            result += field.to_compact_string(indent.deepen()).as_str();
188            if field_idx < self.fields.len() - 1 {
189                result += ",";
190            }
191            result += indent.value().as_str();
192        }
193        result += "}";
194        result
195    }
196}