Skip to main content

lance_arrow/
schema.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Extension to arrow schema
5
6use arrow_schema::{ArrowError, DataType, Field, FieldRef, Schema};
7
8use crate::{ARROW_EXT_NAME_KEY, BLOB_META_KEY, BLOB_V2_EXT_NAME};
9
/// Controls the padding used when rendering compact schema / field strings.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Indentation {
    /// No padding is emitted between entries.
    OneLine,
    /// Padding of the given number of spaces is emitted between entries.
    MultiLine(u8),
}

impl Indentation {
    /// Returns the padding string for this indentation level.
    ///
    /// `OneLine` yields an empty string; `MultiLine(n)` yields `n` spaces.
    fn value(&self) -> String {
        match self {
            Self::OneLine => String::new(),
            Self::MultiLine(spaces) => " ".repeat(usize::from(*spaces)),
        }
    }

    /// Returns the indentation to use for entries nested one level deeper.
    ///
    /// `OneLine` stays `OneLine`; `MultiLine` grows by two spaces. The width
    /// saturates at `u8::MAX` so that very deep nesting cannot overflow the
    /// counter (which would panic in debug builds and wrap in release builds).
    fn deepen(&self) -> Self {
        match self {
            Self::OneLine => Self::OneLine,
            Self::MultiLine(spaces) => Self::MultiLine(spaces.saturating_add(2)),
        }
    }
}
30
31/// Extends the functionality of [arrow_schema::Field].
32pub trait FieldExt {
33    /// Create a compact string representation of the field
34    ///
35    /// This is intended for display purposes and not for serialization
36    fn to_compact_string(&self, indent: Indentation) -> String;
37
38    /// Check if the field is marked as a packed struct
39    fn is_packed_struct(&self) -> bool;
40
41    /// Check if the field is marked as a blob
42    fn is_blob(&self) -> bool;
43
44    /// Check if the field is marked as a blob
45    fn is_blob_v2(&self) -> bool;
46}
47
48impl FieldExt for Field {
49    fn to_compact_string(&self, indent: Indentation) -> String {
50        let mut result = format!("{}: ", self.name().clone());
51        match self.data_type() {
52            DataType::Struct(fields) => {
53                result += "{";
54                result += &indent.value();
55                for (field_idx, field) in fields.iter().enumerate() {
56                    result += field.to_compact_string(indent.deepen()).as_str();
57                    if field_idx < fields.len() - 1 {
58                        result += ",";
59                    }
60                    result += indent.value().as_str();
61                }
62                result += "}";
63            }
64            DataType::List(field)
65            | DataType::LargeList(field)
66            | DataType::ListView(field)
67            | DataType::LargeListView(field) => {
68                result += "[";
69                result += field.to_compact_string(indent.deepen()).as_str();
70                result += "]";
71            }
72            DataType::FixedSizeList(child, dimension) => {
73                result += &format!(
74                    "[{}; {}]",
75                    child.to_compact_string(indent.deepen()),
76                    dimension
77                );
78            }
79            DataType::Dictionary(key_type, value_type) => {
80                result += &value_type.to_string();
81                result += "@";
82                result += &key_type.to_string();
83            }
84            _ => {
85                result += &self.data_type().to_string();
86            }
87        }
88        if self.is_nullable() {
89            result += "?";
90        }
91        result
92    }
93
94    // Check if field has metadata `packed` set to true, this check is case insensitive.
95    fn is_packed_struct(&self) -> bool {
96        let field_metadata = self.metadata();
97        const PACKED_KEYS: [&str; 2] = ["packed", "lance-encoding:packed"];
98        PACKED_KEYS.iter().any(|key| {
99            field_metadata
100                .get(*key)
101                .map(|value| value.eq_ignore_ascii_case("true"))
102                .unwrap_or(false)
103        })
104    }
105
106    fn is_blob(&self) -> bool {
107        let field_metadata = self.metadata();
108        field_metadata.get(BLOB_META_KEY).is_some()
109            || field_metadata
110                .get(ARROW_EXT_NAME_KEY)
111                .map(|value| value == BLOB_V2_EXT_NAME)
112                .unwrap_or(false)
113    }
114
115    fn is_blob_v2(&self) -> bool {
116        let field_metadata = self.metadata();
117        field_metadata
118            .get(ARROW_EXT_NAME_KEY)
119            .map(|value| value == BLOB_V2_EXT_NAME)
120            .unwrap_or(false)
121    }
122}
123
124/// Extends the functionality of [arrow_schema::Schema].
125pub trait SchemaExt {
126    /// Create a new [`Schema`] with one extra field.
127    fn try_with_column(&self, field: Field) -> std::result::Result<Schema, ArrowError>;
128
129    fn try_with_column_at(
130        &self,
131        index: usize,
132        field: Field,
133    ) -> std::result::Result<Schema, ArrowError>;
134
135    fn field_names(&self) -> Vec<&String>;
136
137    fn without_column(&self, column_name: &str) -> Schema;
138
139    /// Create a compact string representation of the schema
140    ///
141    /// This is intended for display purposes and not for serialization
142    fn to_compact_string(&self, indent: Indentation) -> String;
143}
144
145impl SchemaExt for Schema {
146    fn try_with_column(&self, field: Field) -> std::result::Result<Schema, ArrowError> {
147        if self.column_with_name(field.name()).is_some() {
148            return Err(ArrowError::SchemaError(format!(
149                "Can not append column {} on schema: {:?}",
150                field.name(),
151                self
152            )));
153        };
154        let mut fields: Vec<FieldRef> = self.fields().iter().cloned().collect();
155        fields.push(FieldRef::new(field));
156        Ok(Self::new_with_metadata(fields, self.metadata.clone()))
157    }
158
159    fn try_with_column_at(
160        &self,
161        index: usize,
162        field: Field,
163    ) -> std::result::Result<Schema, ArrowError> {
164        if self.column_with_name(field.name()).is_some() {
165            return Err(ArrowError::SchemaError(format!(
166                "Failed to modify schema: Inserting column {} would create a duplicate column in schema: {:?}",
167                field.name(),
168                self
169            )));
170        };
171        let mut fields: Vec<FieldRef> = self.fields().iter().cloned().collect();
172        fields.insert(index, FieldRef::new(field));
173        Ok(Self::new_with_metadata(fields, self.metadata.clone()))
174    }
175
176    /// Project the schema to remove the given column.
177    ///
178    /// This only works on top-level fields right now. If a field does not exist,
179    /// the schema will be returned as is.
180    fn without_column(&self, column_name: &str) -> Schema {
181        let fields: Vec<FieldRef> = self
182            .fields()
183            .iter()
184            .filter(|f| f.name() != column_name)
185            .cloned()
186            .collect();
187        Self::new_with_metadata(fields, self.metadata.clone())
188    }
189
190    fn field_names(&self) -> Vec<&String> {
191        self.fields().iter().map(|f| f.name()).collect()
192    }
193
194    fn to_compact_string(&self, indent: Indentation) -> String {
195        let mut result = "{".to_string();
196        result += &indent.value();
197        for (field_idx, field) in self.fields.iter().enumerate() {
198            result += field.to_compact_string(indent.deepen()).as_str();
199            if field_idx < self.fields.len() - 1 {
200                result += ",";
201            }
202            result += indent.value().as_str();
203        }
204        result += "}";
205        result
206    }
207}