polars_schema/
schema.rs

1use core::fmt::{Debug, Formatter};
2use core::hash::{Hash, Hasher};
3
4use indexmap::map::MutableKeys;
5use polars_error::{PolarsError, PolarsResult, polars_bail, polars_ensure, polars_err};
6use polars_utils::aliases::{InitHashMaps, PlIndexMap};
7use polars_utils::pl_str::PlSmallStr;
8
9#[derive(Clone, Default)]
10#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
11pub struct Schema<D> {
12    fields: PlIndexMap<PlSmallStr, D>,
13}
14
15impl<D: Eq> Eq for Schema<D> {}
16
17impl<D> Schema<D> {
18    pub fn with_capacity(capacity: usize) -> Self {
19        let fields = PlIndexMap::with_capacity(capacity);
20        Self { fields }
21    }
22
23    /// Reserve `additional` memory spaces in the schema.
24    pub fn reserve(&mut self, additional: usize) {
25        self.fields.reserve(additional);
26    }
27
28    /// The number of fields in the schema.
29    #[inline]
30    pub fn len(&self) -> usize {
31        self.fields.len()
32    }
33
34    #[inline]
35    pub fn is_empty(&self) -> bool {
36        self.fields.is_empty()
37    }
38
39    /// Rename field `old` to `new`, and return the (owned) old name.
40    ///
41    /// If `old` is not present in the schema, the schema is not modified and `None` is returned. Otherwise the schema
42    /// is updated and `Some(old_name)` is returned.
43    pub fn rename(&mut self, old: &str, new: PlSmallStr) -> Option<PlSmallStr> {
44        // Remove `old`, get the corresponding index and dtype, and move the last item in the map to that position
45        let (old_index, old_name, dtype) = self.fields.swap_remove_full(old)?;
46        // Insert the same dtype under the new name at the end of the map and store that index
47        let (new_index, _) = self.fields.insert_full(new, dtype);
48        // Swap the two indices to move the originally last element back to the end and to move the new element back to
49        // its original position
50        self.fields.swap_indices(old_index, new_index);
51
52        Some(old_name)
53    }
54
55    pub fn insert(&mut self, key: PlSmallStr, value: D) -> Option<D> {
56        self.fields.insert(key, value)
57    }
58
59    /// Insert a field with `name` and `dtype` at the given `index` into this schema.
60    ///
61    /// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is
62    /// always moved to the given index. Valid indices range from `0` (front of the schema) to `self.len()` (after the
63    /// end of the schema).
64    ///
65    /// For a non-mutating version that clones the schema, see [`new_inserting_at_index`][Self::new_inserting_at_index].
66    ///
67    /// Runtime: **O(n)** where `n` is the number of fields in the schema.
68    ///
69    /// Returns:
70    /// - If index is out of bounds, `Err(PolarsError)`
71    /// - Else if `name` was already in the schema, `Ok(Some(old_dtype))`
72    /// - Else `Ok(None)`
73    pub fn insert_at_index(
74        &mut self,
75        mut index: usize,
76        name: PlSmallStr,
77        dtype: D,
78    ) -> PolarsResult<Option<D>> {
79        polars_ensure!(
80            index <= self.len(),
81            OutOfBounds:
82                "index {} is out of bounds for schema with length {} (the max index allowed is self.len())",
83                    index,
84                    self.len()
85        );
86
87        let (old_index, old_dtype) = self.fields.insert_full(name, dtype);
88
89        // If we're moving an existing field, one-past-the-end will actually be out of bounds. Also, self.len() won't
90        // have changed after inserting, so `index == self.len()` is the same as it was before inserting.
91        if old_dtype.is_some() && index == self.len() {
92            index -= 1;
93        }
94        self.fields.move_index(old_index, index);
95        Ok(old_dtype)
96    }
97
98    /// Get a reference to the dtype of the field named `name`, or `None` if the field doesn't exist.
99    pub fn get(&self, name: &str) -> Option<&D> {
100        self.fields.get(name)
101    }
102
103    /// Get a mutable reference to the dtype of the field named `name`, or `None` if the field doesn't exist.
104    pub fn get_mut(&mut self, name: &str) -> Option<&mut D> {
105        self.fields.get_mut(name)
106    }
107
108    /// Get a reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist.
109    pub fn try_get(&self, name: &str) -> PolarsResult<&D> {
110        self.get(name)
111            .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name))
112    }
113
114    /// Get a mutable reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist.
115    pub fn try_get_mut(&mut self, name: &str) -> PolarsResult<&mut D> {
116        self.fields
117            .get_mut(name)
118            .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name))
119    }
120
121    /// Return all data about the field named `name`: its index in the schema, its name, and its dtype.
122    ///
123    /// Returns `Some((index, &name, &dtype))` if the field exists, `None` if it doesn't.
124    pub fn get_full(&self, name: &str) -> Option<(usize, &PlSmallStr, &D)> {
125        self.fields.get_full(name)
126    }
127
128    /// Return all data about the field named `name`: its index in the schema, its name, and its dtype.
129    ///
130    /// Returns `Ok((index, &name, &dtype))` if the field exists, `Err(PolarsErr)` if it doesn't.
131    pub fn try_get_full(&self, name: &str) -> PolarsResult<(usize, &PlSmallStr, &D)> {
132        self.fields
133            .get_full(name)
134            .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name))
135    }
136
137    /// Get references to the name and dtype of the field at `index`.
138    ///
139    /// If `index` is inbounds, returns `Some((&name, &dtype))`, else `None`. See
140    /// [`get_at_index_mut`][Self::get_at_index_mut] for a mutable version.
141    pub fn get_at_index(&self, index: usize) -> Option<(&PlSmallStr, &D)> {
142        self.fields.get_index(index)
143    }
144
145    pub fn try_get_at_index(&self, index: usize) -> PolarsResult<(&PlSmallStr, &D)> {
146        self.fields.get_index(index).ok_or_else(|| polars_err!(ComputeError: "index {index} out of bounds with 'schema' of len: {}", self.len()))
147    }
148
149    /// Get mutable references to the name and dtype of the field at `index`.
150    ///
151    /// If `index` is inbounds, returns `Some((&mut name, &mut dtype))`, else `None`. See
152    /// [`get_at_index`][Self::get_at_index] for an immutable version.
153    pub fn get_at_index_mut(&mut self, index: usize) -> Option<(&mut PlSmallStr, &mut D)> {
154        self.fields.get_index_mut2(index)
155    }
156
157    /// Swap-remove a field by name and, if the field existed, return its dtype.
158    ///
159    /// If the field does not exist, the schema is not modified and `None` is returned.
160    ///
161    /// This method does a `swap_remove`, which is O(1) but **changes the order of the schema**: the field named `name`
162    /// is replaced by the last field, which takes its position. For a slower, but order-preserving, method, use
163    /// [`shift_remove`][Self::shift_remove].
164    pub fn remove(&mut self, name: &str) -> Option<D> {
165        self.fields.swap_remove(name)
166    }
167
168    /// Remove a field by name, preserving order, and, if the field existed, return its dtype.
169    ///
170    /// If the field does not exist, the schema is not modified and `None` is returned.
171    ///
172    /// This method does a `shift_remove`, which preserves the order of the fields in the schema but **is O(n)**. For a
173    /// faster, but not order-preserving, method, use [`remove`][Self::remove].
174    pub fn shift_remove(&mut self, name: &str) -> Option<D> {
175        self.fields.shift_remove(name)
176    }
177
178    /// Remove a field by name, preserving order, and, if the field existed, return its dtype.
179    ///
180    /// If the field does not exist, the schema is not modified and `None` is returned.
181    ///
182    /// This method does a `shift_remove`, which preserves the order of the fields in the schema but **is O(n)**. For a
183    /// faster, but not order-preserving, method, use [`remove`][Self::remove].
184    pub fn shift_remove_index(&mut self, index: usize) -> Option<(PlSmallStr, D)> {
185        self.fields.shift_remove_index(index)
186    }
187
188    /// Whether the schema contains a field named `name`.
189    pub fn contains(&self, name: &str) -> bool {
190        self.get(name).is_some()
191    }
192
193    /// Change the field named `name` to the given `dtype` and return the previous dtype.
194    ///
195    /// If `name` doesn't already exist in the schema, the schema is not modified and `None` is returned. Otherwise
196    /// returns `Some(old_dtype)`.
197    ///
198    /// This method only ever modifies an existing field and never adds a new field to the schema. To add a new field,
199    /// use [`with_column`][Self::with_column] or [`insert_at_index`][Self::insert_at_index].
200    pub fn set_dtype(&mut self, name: &str, dtype: D) -> Option<D> {
201        let old_dtype = self.fields.get_mut(name)?;
202        Some(std::mem::replace(old_dtype, dtype))
203    }
204
205    /// Change the field at the given index to the given `dtype` and return the previous dtype.
206    ///
207    /// If the index is out of bounds, the schema is not modified and `None` is returned. Otherwise returns
208    /// `Some(old_dtype)`.
209    ///
210    /// This method only ever modifies an existing index and never adds a new field to the schema. To add a new field,
211    /// use [`with_column`][Self::with_column] or [`insert_at_index`][Self::insert_at_index].
212    pub fn set_dtype_at_index(&mut self, index: usize, dtype: D) -> Option<D> {
213        let (_, old_dtype) = self.fields.get_index_mut(index)?;
214        Some(std::mem::replace(old_dtype, dtype))
215    }
216
217    /// Insert a column into the [`Schema`].
218    ///
219    /// If the schema already has this column, this instead updates it with the new value and
220    /// returns the old one. Otherwise, the column is inserted at the end.
221    ///
222    /// To enforce the index of the resulting field, use [`insert_at_index`][Self::insert_at_index].
223    pub fn with_column(&mut self, name: PlSmallStr, dtype: D) -> Option<D> {
224        self.fields.insert(name, dtype)
225    }
226
227    /// Raises DuplicateError if this column already exists in the schema.
228    pub fn try_insert(&mut self, name: PlSmallStr, value: D) -> PolarsResult<()> {
229        if self.fields.contains_key(&name) {
230            polars_bail!(Duplicate: "column '{}' is duplicate", name)
231        }
232
233        self.fields.insert(name, value);
234
235        Ok(())
236    }
237
238    /// Performs [`Schema::try_insert`] for every column.
239    ///
240    /// Raises DuplicateError if a column already exists in the schema.
241    pub fn hstack_mut(
242        &mut self,
243        columns: impl IntoIterator<Item = impl Into<(PlSmallStr, D)>>,
244    ) -> PolarsResult<()> {
245        for v in columns {
246            let (k, v) = v.into();
247            self.try_insert(k, v)?;
248        }
249
250        Ok(())
251    }
252
253    /// Performs [`Schema::try_insert`] for every column.
254    ///
255    /// Raises DuplicateError if a column already exists in the schema.
256    pub fn hstack(
257        mut self,
258        columns: impl IntoIterator<Item = impl Into<(PlSmallStr, D)>>,
259    ) -> PolarsResult<Self> {
260        self.hstack_mut(columns)?;
261        Ok(self)
262    }
263
264    /// Merge `other` into `self`.
265    ///
266    /// Merging logic:
267    /// - Fields that occur in `self` but not `other` are unmodified
268    /// - Fields that occur in `other` but not `self` are appended, in order, to the end of `self`
269    /// - Fields that occur in both `self` and `other` are updated with the dtype from `other`, but keep their original
270    ///   index
271    pub fn merge(&mut self, other: Self) {
272        self.fields.extend(other.fields)
273    }
274
275    /// Iterates over the `(&name, &dtype)` pairs in this schema.
276    ///
277    /// For an owned version, use [`iter_fields`][Self::iter_fields], which clones the data to iterate owned `Field`s
278    pub fn iter(&self) -> impl ExactSizeIterator<Item = (&PlSmallStr, &D)> + '_ {
279        self.fields.iter()
280    }
281
282    pub fn iter_mut(&mut self) -> impl ExactSizeIterator<Item = (&PlSmallStr, &mut D)> + '_ {
283        self.fields.iter_mut()
284    }
285
286    /// Iterates over references to the names in this schema.
287    pub fn iter_names(&self) -> impl '_ + ExactSizeIterator<Item = &PlSmallStr> {
288        self.fields.iter().map(|(name, _dtype)| name)
289    }
290
291    pub fn iter_names_cloned(&self) -> impl '_ + ExactSizeIterator<Item = PlSmallStr> {
292        self.iter_names().cloned()
293    }
294
295    /// Iterates over references to the dtypes in this schema.
296    pub fn iter_values(&self) -> impl '_ + ExactSizeIterator<Item = &D> {
297        self.fields.iter().map(|(_name, dtype)| dtype)
298    }
299
300    pub fn into_iter_values(self) -> impl ExactSizeIterator<Item = D> {
301        self.fields.into_values()
302    }
303
304    /// Iterates over mut references to the dtypes in this schema.
305    pub fn iter_values_mut(&mut self) -> impl '_ + ExactSizeIterator<Item = &mut D> {
306        self.fields.iter_mut().map(|(_name, dtype)| dtype)
307    }
308
309    pub fn index_of(&self, name: &str) -> Option<usize> {
310        self.fields.get_index_of(name)
311    }
312
313    pub fn try_index_of(&self, name: &str) -> PolarsResult<usize> {
314        let Some(i) = self.fields.get_index_of(name) else {
315            polars_bail!(
316                ColumnNotFound:
317                "unable to find column {:?}; valid columns: {:?}",
318                name, self.iter_names().collect::<Vec<_>>(),
319            )
320        };
321
322        Ok(i)
323    }
324
325    /// Compare the fields between two schema returning the additional columns that each schema has.
326    pub fn field_compare<'a, 'b>(
327        &'a self,
328        other: &'b Self,
329        self_extra: &mut Vec<(usize, (&'a PlSmallStr, &'a D))>,
330        other_extra: &mut Vec<(usize, (&'b PlSmallStr, &'b D))>,
331    ) {
332        self_extra.extend(
333            self.iter()
334                .enumerate()
335                .filter(|(_, (n, _))| !other.contains(n)),
336        );
337        other_extra.extend(
338            other
339                .iter()
340                .enumerate()
341                .filter(|(_, (n, _))| !self.contains(n)),
342        );
343    }
344}
345
346impl<D> Schema<D>
347where
348    D: Clone + Default,
349{
350    /// Create a new schema from this one, inserting a field with `name` and `dtype` at the given `index`.
351    ///
352    /// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is
353    /// always moved to the given index. Valid indices range from `0` (front of the schema) to `self.len()` (after the
354    /// end of the schema).
355    ///
356    /// For a mutating version that doesn't clone, see [`insert_at_index`][Self::insert_at_index].
357    ///
358    /// Runtime: **O(m * n)** where `m` is the (average) length of the field names and `n` is the number of fields in
359    /// the schema. This method clones every field in the schema.
360    ///
361    /// Returns: `Ok(new_schema)` if `index <= self.len()`, else `Err(PolarsError)`
362    pub fn new_inserting_at_index(
363        &self,
364        index: usize,
365        name: PlSmallStr,
366        field: D,
367    ) -> PolarsResult<Self> {
368        polars_ensure!(
369            index <= self.len(),
370            OutOfBounds:
371                "index {} is out of bounds for schema with length {} (the max index allowed is self.len())",
372                    index,
373                    self.len()
374        );
375
376        let mut new = Self::default();
377        let mut iter = self.fields.iter().filter_map(|(fld_name, dtype)| {
378            (fld_name != &name).then_some((fld_name.clone(), dtype.clone()))
379        });
380        new.fields.extend(iter.by_ref().take(index));
381        new.fields.insert(name.clone(), field);
382        new.fields.extend(iter);
383        Ok(new)
384    }
385
386    /// Merge borrowed `other` into `self`.
387    ///
388    /// Merging logic:
389    /// - Fields that occur in `self` but not `other` are unmodified
390    /// - Fields that occur in `other` but not `self` are appended, in order, to the end of `self`
391    /// - Fields that occur in both `self` and `other` are updated with the dtype from `other`, but keep their original
392    ///   index
393    pub fn merge_from_ref(&mut self, other: &Self) {
394        self.fields.extend(
395            other
396                .iter()
397                .map(|(column, field)| (column.clone(), field.clone())),
398        )
399    }
400
401    /// Generates another schema with just the specified columns selected from this one.
402    pub fn try_project<I>(&self, columns: I) -> PolarsResult<Self>
403    where
404        I: IntoIterator,
405        I::Item: AsRef<str>,
406    {
407        let schema = columns
408            .into_iter()
409            .map(|c| {
410                let name = c.as_ref();
411                let (_, name, dtype) = self
412                    .fields
413                    .get_full(name)
414                    .ok_or_else(|| polars_err!(col_not_found = name))?;
415                PolarsResult::Ok((name.clone(), dtype.clone()))
416            })
417            .collect::<PolarsResult<PlIndexMap<PlSmallStr, _>>>()?;
418        Ok(Self::from(schema))
419    }
420
421    pub fn try_project_indices(&self, indices: &[usize]) -> PolarsResult<Self> {
422        let fields = indices
423            .iter()
424            .map(|&i| {
425                let Some((k, v)) = self.fields.get_index(i) else {
426                    polars_bail!(
427                        SchemaFieldNotFound:
428                        "projection index {} is out of bounds for schema of length {}",
429                        i, self.fields.len()
430                    );
431                };
432
433                Ok((k.clone(), v.clone()))
434            })
435            .collect::<PolarsResult<PlIndexMap<_, _>>>()?;
436
437        Ok(Self { fields })
438    }
439
440    /// Returns a new [`Schema`] with a subset of all fields whose `predicate`
441    /// evaluates to true.
442    pub fn filter<F: Fn(usize, &D) -> bool>(self, predicate: F) -> Self {
443        let fields = self
444            .fields
445            .into_iter()
446            .enumerate()
447            .filter_map(|(index, (name, d))| {
448                if (predicate)(index, &d) {
449                    Some((name, d))
450                } else {
451                    None
452                }
453            })
454            .collect();
455
456        Self { fields }
457    }
458
459    pub fn from_iter_check_duplicates<I, F>(iter: I) -> PolarsResult<Self>
460    where
461        I: IntoIterator<Item = F>,
462        F: Into<(PlSmallStr, D)>,
463    {
464        let iter = iter.into_iter();
465        let mut slf = Self::with_capacity(iter.size_hint().1.unwrap_or(0));
466
467        for v in iter {
468            let (name, d) = v.into();
469
470            if slf.contains(&name) {
471                return Err(err_msg(&name));
472
473                fn err_msg(name: &str) -> PolarsError {
474                    polars_err!(Duplicate: "duplicate name when building schema '{}'", &name)
475                }
476            }
477
478            slf.fields.insert(name, d);
479        }
480
481        Ok(slf)
482    }
483}
484
485pub fn ensure_matching_schema_names<D>(lhs: &Schema<D>, rhs: &Schema<D>) -> PolarsResult<()> {
486    let lhs_names = lhs.iter_names();
487    let rhs_names = rhs.iter_names();
488
489    if !(lhs_names.len() == rhs_names.len() && lhs_names.zip(rhs_names).all(|(l, r)| l == r)) {
490        polars_bail!(
491            SchemaMismatch:
492            "lhs: {:?} rhs: {:?}",
493            lhs.iter_names().collect::<Vec<_>>(), rhs.iter_names().collect::<Vec<_>>()
494        )
495    }
496
497    Ok(())
498}
499
500impl<D: Debug> Debug for Schema<D> {
501    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
502        writeln!(f, "Schema:")?;
503        for (name, field) in self.fields.iter() {
504            writeln!(f, "name: {name}, field: {field:?}")?;
505        }
506        Ok(())
507    }
508}
509
510impl<D: Hash> Hash for Schema<D> {
511    fn hash<H: Hasher>(&self, state: &mut H) {
512        self.fields.iter().for_each(|v| v.hash(state))
513    }
514}
515
516// Schemas will only compare equal if they have the same fields in the same order. We can't use `self.inner ==
517// other.inner` because [`IndexMap`] ignores order when checking equality, but we don't want to ignore it.
518impl<D: PartialEq> PartialEq for Schema<D> {
519    fn eq(&self, other: &Self) -> bool {
520        self.fields.len() == other.fields.len()
521            && self
522                .fields
523                .iter()
524                .zip(other.fields.iter())
525                .all(|(a, b)| a == b)
526    }
527}
528
529impl<D> From<PlIndexMap<PlSmallStr, D>> for Schema<D> {
530    fn from(fields: PlIndexMap<PlSmallStr, D>) -> Self {
531        Self { fields }
532    }
533}
534
535impl<F, D> FromIterator<F> for Schema<D>
536where
537    F: Into<(PlSmallStr, D)>,
538{
539    fn from_iter<I: IntoIterator<Item = F>>(iter: I) -> Self {
540        let fields = PlIndexMap::from_iter(iter.into_iter().map(|x| x.into()));
541        Self { fields }
542    }
543}
544
545impl<F, D> Extend<F> for Schema<D>
546where
547    F: Into<(PlSmallStr, D)>,
548{
549    fn extend<T: IntoIterator<Item = F>>(&mut self, iter: T) {
550        self.fields.extend(iter.into_iter().map(|x| x.into()))
551    }
552}
553
554impl<D> IntoIterator for Schema<D> {
555    type IntoIter = <PlIndexMap<PlSmallStr, D> as IntoIterator>::IntoIter;
556    type Item = (PlSmallStr, D);
557
558    fn into_iter(self) -> Self::IntoIter {
559        self.fields.into_iter()
560    }
561}