// polars_schema/schema.rs

1use core::fmt::Debug;
2use core::hash::{Hash, Hasher};
3
4use indexmap::map::MutableKeys;
5use polars_error::{PolarsError, PolarsResult, polars_bail, polars_ensure, polars_err};
6use polars_utils::aliases::{InitHashMaps, PlIndexMap};
7use polars_utils::pl_str::PlSmallStr;
8
9#[derive(Debug, Clone, Default)]
10#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
11#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
12pub struct Schema<Field, Metadata> {
13    fields: PlIndexMap<PlSmallStr, Field>,
14    metadata: Metadata,
15}
16
17impl<Field: Eq, Metadata: Eq> Eq for Schema<Field, Metadata> {}
18
19impl<Field, Metadata: Default> Schema<Field, Metadata> {
20    pub fn with_capacity(capacity: usize) -> Self {
21        let fields = PlIndexMap::with_capacity(capacity);
22        Self {
23            fields,
24            metadata: Metadata::default(),
25        }
26    }
27
28    pub fn from_iter_check_duplicates<I, F>(iter: I) -> PolarsResult<Self>
29    where
30        I: IntoIterator<Item = F>,
31        F: Into<(PlSmallStr, Field)>,
32    {
33        Self::try_from_iter_check_duplicates(
34            iter.into_iter().map(PolarsResult::Ok),
35            |name: &str| polars_err!(Duplicate: "duplicate name when building schema '{}'", &name),
36        )
37    }
38
39    pub fn try_from_iter_check_duplicates<I, F, E>(iter: I, err_func: E) -> PolarsResult<Self>
40    where
41        I: IntoIterator<Item = PolarsResult<F>>,
42        F: Into<(PlSmallStr, Field)>,
43        E: Fn(&str) -> PolarsError,
44    {
45        let iter = iter.into_iter();
46        let mut slf = Self::with_capacity(iter.size_hint().1.unwrap_or(0));
47
48        for v in iter {
49            let (name, d) = v?.into();
50
51            if slf.contains(&name) {
52                return Err(err_func(&name));
53            }
54
55            slf.fields.insert(name, d);
56        }
57
58        Ok(slf)
59    }
60}
61
62impl<Field, Metadata> Schema<Field, Metadata> {
63    /// Reserve `additional` memory spaces in the schema.
64    pub fn reserve(&mut self, additional: usize) {
65        self.fields.reserve(additional);
66    }
67
68    /// The number of fields in the schema.
69    #[inline]
70    pub fn len(&self) -> usize {
71        self.fields.len()
72    }
73
74    #[inline]
75    pub fn is_empty(&self) -> bool {
76        self.fields.is_empty()
77    }
78
79    pub fn metadata(&self) -> &Metadata {
80        &self.metadata
81    }
82
83    pub fn metadata_mut(&mut self) -> &mut Metadata {
84        &mut self.metadata
85    }
86
87    /// Rename field `old` to `new`, and return the (owned) old name.
88    ///
89    /// If `old` is not present in the schema, the schema is not modified and `None` is returned. Otherwise the schema
90    /// is updated and `Some(old_name)` is returned.
91    pub fn rename(&mut self, old: &str, new: PlSmallStr) -> Option<PlSmallStr> {
92        // Remove `old`, get the corresponding index and dtype, and move the last item in the map to that position
93        let (old_index, old_name, dtype) = self.fields.swap_remove_full(old)?;
94        // Insert the same dtype under the new name at the end of the map and store that index
95        let (new_index, _) = self.fields.insert_full(new, dtype);
96        // Swap the two indices to move the originally last element back to the end and to move the new element back to
97        // its original position
98        self.fields.swap_indices(old_index, new_index);
99
100        Some(old_name)
101    }
102
103    pub fn insert(&mut self, key: PlSmallStr, value: Field) -> Option<Field> {
104        self.fields.insert(key, value)
105    }
106
107    /// Insert a field with `name` and `dtype` at the given `index` into this schema.
108    ///
109    /// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is
110    /// always moved to the given index. Valid indices range from `0` (front of the schema) to `self.len()` (after the
111    /// end of the schema).
112    ///
113    /// For a non-mutating version that clones the schema, see [`new_inserting_at_index`][Self::new_inserting_at_index].
114    ///
115    /// Runtime: **O(n)** where `n` is the number of fields in the schema.
116    ///
117    /// Returns:
118    /// - If index is out of bounds, `Err(PolarsError)`
119    /// - Else if `name` was already in the schema, `Ok(Some(old_dtype))`
120    /// - Else `Ok(None)`
121    pub fn insert_at_index(
122        &mut self,
123        mut index: usize,
124        name: PlSmallStr,
125        dtype: Field,
126    ) -> PolarsResult<Option<Field>> {
127        polars_ensure!(
128            index <= self.len(),
129            OutOfBounds:
130                "index {} is out of bounds for schema with length {} (the max index allowed is self.len())",
131                    index,
132                    self.len()
133        );
134
135        let (old_index, old_dtype) = self.fields.insert_full(name, dtype);
136
137        // If we're moving an existing field, one-past-the-end will actually be out of bounds. Also, self.len() won't
138        // have changed after inserting, so `index == self.len()` is the same as it was before inserting.
139        if old_dtype.is_some() && index == self.len() {
140            index -= 1;
141        }
142        self.fields.move_index(old_index, index);
143        Ok(old_dtype)
144    }
145
146    /// Get a reference to the dtype of the field named `name`, or `None` if the field doesn't exist.
147    pub fn get(&self, name: &str) -> Option<&Field> {
148        self.fields.get(name)
149    }
150
151    /// Get a mutable reference to the dtype of the field named `name`, or `None` if the field doesn't exist.
152    pub fn get_mut(&mut self, name: &str) -> Option<&mut Field> {
153        self.fields.get_mut(name)
154    }
155
156    /// Get a reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist.
157    pub fn try_get(&self, name: &str) -> PolarsResult<&Field> {
158        self.get(name)
159            .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name))
160    }
161
162    /// Get a mutable reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist.
163    pub fn try_get_mut(&mut self, name: &str) -> PolarsResult<&mut Field> {
164        self.fields
165            .get_mut(name)
166            .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name))
167    }
168
169    /// Return all data about the field named `name`: its index in the schema, its name, and its dtype.
170    ///
171    /// Returns `Some((index, &name, &dtype))` if the field exists, `None` if it doesn't.
172    pub fn get_full(&self, name: &str) -> Option<(usize, &PlSmallStr, &Field)> {
173        self.fields.get_full(name)
174    }
175
176    /// Return all data about the field named `name`: its index in the schema, its name, and its dtype.
177    ///
178    /// Returns `Ok((index, &name, &dtype))` if the field exists, `Err(PolarsErr)` if it doesn't.
179    pub fn try_get_full(&self, name: &str) -> PolarsResult<(usize, &PlSmallStr, &Field)> {
180        self.fields
181            .get_full(name)
182            .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name))
183    }
184
185    /// Get references to the name and dtype of the field at `index`.
186    ///
187    /// If `index` is inbounds, returns `Some((&name, &dtype))`, else `None`. See
188    /// [`get_at_index_mut`][Self::get_at_index_mut] for a mutable version.
189    pub fn get_at_index(&self, index: usize) -> Option<(&PlSmallStr, &Field)> {
190        self.fields.get_index(index)
191    }
192
193    pub fn try_get_at_index(&self, index: usize) -> PolarsResult<(&PlSmallStr, &Field)> {
194        self.fields.get_index(index).ok_or_else(|| polars_err!(ComputeError: "index {index} out of bounds with 'schema' of len: {}", self.len()))
195    }
196
197    /// Get mutable references to the name and dtype of the field at `index`.
198    ///
199    /// If `index` is inbounds, returns `Some((&mut name, &mut dtype))`, else `None`. See
200    /// [`get_at_index`][Self::get_at_index] for an immutable version.
201    pub fn get_at_index_mut(&mut self, index: usize) -> Option<(&mut PlSmallStr, &mut Field)> {
202        self.fields.get_index_mut2(index)
203    }
204
205    /// Swap-remove a field by name and, if the field existed, return its dtype.
206    ///
207    /// If the field does not exist, the schema is not modified and `None` is returned.
208    ///
209    /// This method does a `swap_remove`, which is O(1) but **changes the order of the schema**: the field named `name`
210    /// is replaced by the last field, which takes its position. For a slower, but order-preserving, method, use
211    /// [`shift_remove`][Self::shift_remove].
212    pub fn remove(&mut self, name: &str) -> Option<Field> {
213        self.fields.swap_remove(name)
214    }
215
216    /// Remove a field by name, preserving order, and, if the field existed, return its dtype.
217    ///
218    /// If the field does not exist, the schema is not modified and `None` is returned.
219    ///
220    /// This method does a `shift_remove`, which preserves the order of the fields in the schema but **is O(n)**. For a
221    /// faster, but not order-preserving, method, use [`remove`][Self::remove].
222    pub fn shift_remove(&mut self, name: &str) -> Option<Field> {
223        self.fields.shift_remove(name)
224    }
225
226    /// Remove a field by name, preserving order, and, if the field existed, return its dtype.
227    ///
228    /// If the field does not exist, the schema is not modified and `None` is returned.
229    ///
230    /// This method does a `shift_remove`, which preserves the order of the fields in the schema but **is O(n)**. For a
231    /// faster, but not order-preserving, method, use [`remove`][Self::remove].
232    pub fn shift_remove_index(&mut self, index: usize) -> Option<(PlSmallStr, Field)> {
233        self.fields.shift_remove_index(index)
234    }
235
236    /// Whether the schema contains a field named `name`.
237    pub fn contains(&self, name: &str) -> bool {
238        self.get(name).is_some()
239    }
240
241    /// Change the field named `name` to the given `dtype` and return the previous dtype.
242    ///
243    /// If `name` doesn't already exist in the schema, the schema is not modified and `None` is returned. Otherwise
244    /// returns `Some(old_dtype)`.
245    ///
246    /// This method only ever modifies an existing field and never adds a new field to the schema. To add a new field,
247    /// use [`with_column`][Self::with_column] or [`insert_at_index`][Self::insert_at_index].
248    pub fn set_dtype(&mut self, name: &str, dtype: Field) -> Option<Field> {
249        let old_dtype = self.fields.get_mut(name)?;
250        Some(std::mem::replace(old_dtype, dtype))
251    }
252
253    /// Change the field at the given index to the given `dtype` and return the previous dtype.
254    ///
255    /// If the index is out of bounds, the schema is not modified and `None` is returned. Otherwise returns
256    /// `Some(old_dtype)`.
257    ///
258    /// This method only ever modifies an existing index and never adds a new field to the schema. To add a new field,
259    /// use [`with_column`][Self::with_column] or [`insert_at_index`][Self::insert_at_index].
260    pub fn set_dtype_at_index(&mut self, index: usize, dtype: Field) -> Option<Field> {
261        let (_, old_dtype) = self.fields.get_index_mut(index)?;
262        Some(std::mem::replace(old_dtype, dtype))
263    }
264
265    /// Insert a column into the [`Schema`].
266    ///
267    /// If the schema already has this column, this instead updates it with the new value and
268    /// returns the old one. Otherwise, the column is inserted at the end.
269    ///
270    /// To enforce the index of the resulting field, use [`insert_at_index`][Self::insert_at_index].
271    pub fn with_column(&mut self, name: PlSmallStr, dtype: Field) -> Option<Field> {
272        self.fields.insert(name, dtype)
273    }
274
275    /// Raises DuplicateError if this column already exists in the schema.
276    pub fn try_insert(&mut self, name: PlSmallStr, value: Field) -> PolarsResult<()> {
277        if self.fields.contains_key(&name) {
278            polars_bail!(Duplicate: "column '{}' is duplicate", name)
279        }
280
281        self.fields.insert(name, value);
282
283        Ok(())
284    }
285
286    /// Performs [`Schema::try_insert`] for every column.
287    ///
288    /// Raises DuplicateError if a column already exists in the schema.
289    pub fn hstack_mut(
290        &mut self,
291        columns: impl IntoIterator<Item = impl Into<(PlSmallStr, Field)>>,
292    ) -> PolarsResult<()> {
293        for v in columns {
294            let (k, v) = v.into();
295            self.try_insert(k, v)?;
296        }
297
298        Ok(())
299    }
300
301    /// Performs [`Schema::try_insert`] for every column.
302    ///
303    /// Raises DuplicateError if a column already exists in the schema.
304    pub fn hstack(
305        mut self,
306        columns: impl IntoIterator<Item = impl Into<(PlSmallStr, Field)>>,
307    ) -> PolarsResult<Self> {
308        self.hstack_mut(columns)?;
309        Ok(self)
310    }
311
312    pub fn sort_by_key<T, F>(&mut self, sort_key: F)
313    where
314        T: Ord,
315        F: FnMut(&PlSmallStr, &Field) -> T,
316    {
317        self.fields.sort_by_key(sort_key);
318    }
319
320    /// Merge `other` into `self`.
321    ///
322    /// Merging logic:
323    /// - Fields that occur in `self` but not `other` are unmodified
324    /// - Fields that occur in `other` but not `self` are appended, in order, to the end of `self`
325    /// - Fields that occur in both `self` and `other` are updated with the dtype from `other`, but keep their original
326    ///   index
327    pub fn merge(&mut self, other: Self) {
328        self.fields.extend(other.fields)
329    }
330
331    /// Iterates over the `(&name, &dtype)` pairs in this schema.
332    ///
333    /// For an owned version, use [`iter_fields`][Self::iter_fields], which clones the data to iterate owned `Field`s
334    pub fn iter(&self) -> impl ExactSizeIterator<Item = (&PlSmallStr, &Field)> + '_ {
335        self.fields.iter()
336    }
337
338    pub fn iter_mut(&mut self) -> impl ExactSizeIterator<Item = (&PlSmallStr, &mut Field)> + '_ {
339        self.fields.iter_mut()
340    }
341
342    /// Iterates over references to the names in this schema.
343    pub fn iter_names(&self) -> impl '_ + ExactSizeIterator<Item = &PlSmallStr> {
344        self.fields.iter().map(|(name, _dtype)| name)
345    }
346
347    pub fn iter_names_cloned(&self) -> impl '_ + ExactSizeIterator<Item = PlSmallStr> {
348        self.iter_names().cloned()
349    }
350
351    /// Iterates over references to the dtypes in this schema.
352    pub fn iter_values(&self) -> impl '_ + ExactSizeIterator<Item = &Field> {
353        self.fields.iter().map(|(_name, dtype)| dtype)
354    }
355
356    pub fn into_iter_values(self) -> impl ExactSizeIterator<Item = Field> {
357        self.fields.into_values()
358    }
359
360    /// Iterates over mut references to the dtypes in this schema.
361    pub fn iter_values_mut(&mut self) -> impl '_ + ExactSizeIterator<Item = &mut Field> {
362        self.fields.iter_mut().map(|(_name, dtype)| dtype)
363    }
364
365    pub fn index_of(&self, name: &str) -> Option<usize> {
366        self.fields.get_index_of(name)
367    }
368
369    pub fn try_index_of(&self, name: &str) -> PolarsResult<usize> {
370        let Some(i) = self.fields.get_index_of(name) else {
371            polars_bail!(
372                ColumnNotFound:
373                "unable to find column {:?}; valid columns: {:?}",
374                name, self.iter_names().collect::<Vec<_>>(),
375            )
376        };
377
378        Ok(i)
379    }
380
381    /// Compare the fields between two schema returning the additional columns that each schema has.
382    pub fn field_compare<'a, 'b>(
383        &'a self,
384        other: &'b Self,
385        self_extra: &mut Vec<(usize, (&'a PlSmallStr, &'a Field))>,
386        other_extra: &mut Vec<(usize, (&'b PlSmallStr, &'b Field))>,
387    ) {
388        self_extra.extend(
389            self.iter()
390                .enumerate()
391                .filter(|(_, (n, _))| !other.contains(n)),
392        );
393        other_extra.extend(
394            other
395                .iter()
396                .enumerate()
397                .filter(|(_, (n, _))| !self.contains(n)),
398        );
399    }
400}
401
402impl<Field, Metadata> Schema<Field, Metadata>
403where
404    Field: Clone + Default,
405    Metadata: Clone,
406{
407    /// Create a new schema from this one, inserting a field with `name` and `dtype` at the given `index`.
408    ///
409    /// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is
410    /// always moved to the given index. Valid indices range from `0` (front of the schema) to `self.len()` (after the
411    /// end of the schema).
412    ///
413    /// For a mutating version that doesn't clone, see [`insert_at_index`][Self::insert_at_index].
414    ///
415    /// Runtime: **O(m * n)** where `m` is the (average) length of the field names and `n` is the number of fields in
416    /// the schema. This method clones every field in the schema.
417    ///
418    /// Returns: `Ok(new_schema)` if `index <= self.len()`, else `Err(PolarsError)`
419    pub fn new_inserting_at_index(
420        &self,
421        index: usize,
422        name: PlSmallStr,
423        field: Field,
424    ) -> PolarsResult<Self> {
425        polars_ensure!(
426            index <= self.len(),
427            OutOfBounds:
428                "index {} is out of bounds for schema with length {} (the max index allowed is self.len())",
429                    index,
430                    self.len()
431        );
432
433        let mut new = Self {
434            fields: Default::default(),
435            metadata: self.metadata().clone(),
436        };
437        let mut iter = self.fields.iter().filter_map(|(fld_name, dtype)| {
438            (fld_name != &name).then_some((fld_name.clone(), dtype.clone()))
439        });
440        new.fields.extend(iter.by_ref().take(index));
441        new.fields.insert(name.clone(), field);
442        new.fields.extend(iter);
443        Ok(new)
444    }
445
446    /// Merge borrowed `other` into `self`.
447    ///
448    /// Merging logic:
449    /// - Fields that occur in `self` but not `other` are unmodified
450    /// - Fields that occur in `other` but not `self` are appended, in order, to the end of `self`
451    /// - Fields that occur in both `self` and `other` are updated with the dtype from `other`, but keep their original
452    ///   index
453    pub fn merge_from_ref(&mut self, other: &Self) {
454        self.fields.extend(
455            other
456                .iter()
457                .map(|(column, field)| (column.clone(), field.clone())),
458        )
459    }
460
461    /// Generates another schema with just the specified columns selected from this one.
462    pub fn try_project<I>(&self, columns: I) -> PolarsResult<Self>
463    where
464        I: IntoIterator,
465        I::Item: AsRef<str>,
466    {
467        let fields = columns
468            .into_iter()
469            .map(|c| {
470                let name = c.as_ref();
471                let (_, name, dtype) = self
472                    .fields
473                    .get_full(name)
474                    .ok_or_else(|| polars_err!(col_not_found = name))?;
475                PolarsResult::Ok((name.clone(), dtype.clone()))
476            })
477            .collect::<PolarsResult<PlIndexMap<PlSmallStr, _>>>()?;
478        Ok(Self {
479            fields,
480            metadata: self.metadata().clone(),
481        })
482    }
483
484    pub fn try_project_indices(&self, indices: &[usize]) -> PolarsResult<Self> {
485        let fields = indices
486            .iter()
487            .map(|&i| {
488                let Some((k, v)) = self.fields.get_index(i) else {
489                    polars_bail!(
490                        SchemaFieldNotFound:
491                        "projection index {} is out of bounds for schema of length {}",
492                        i, self.fields.len()
493                    );
494                };
495
496                Ok((k.clone(), v.clone()))
497            })
498            .collect::<PolarsResult<PlIndexMap<_, _>>>()?;
499
500        Ok(Self {
501            fields,
502            metadata: self.metadata().clone(),
503        })
504    }
505
506    /// Returns a new [`Schema`] with a subset of all fields whose `predicate`
507    /// evaluates to true.
508    pub fn filter<F: Fn(usize, &Field) -> bool>(self, predicate: F) -> Self {
509        let metadata = self.metadata().clone();
510        let fields = self
511            .fields
512            .into_iter()
513            .enumerate()
514            .filter_map(|(index, (name, d))| {
515                if (predicate)(index, &d) {
516                    Some((name, d))
517                } else {
518                    None
519                }
520            })
521            .collect();
522
523        Self { fields, metadata }
524    }
525}
526
527impl<Field: Hash, Metadata: Hash> Hash for Schema<Field, Metadata> {
528    fn hash<H: Hasher>(&self, state: &mut H) {
529        Hash::hash(&SchemaHashEqWrap::from(self), state)
530    }
531}
532
533// Schemas will only compare equal if they have the same fields in the same order. We can't use `self.inner ==
534// other.inner` because [`IndexMap`] ignores order when checking equality, but we don't want to ignore it.
535impl<Field: PartialEq, Metadata: PartialEq> PartialEq for Schema<Field, Metadata> {
536    fn eq(&self, other: &Self) -> bool {
537        PartialEq::eq(
538            &SchemaHashEqWrap::from(self),
539            &SchemaHashEqWrap::from(other),
540        )
541    }
542}
543
544/// Specialization
545/// * `IndexMap` eq impl does not consider key ordering, but we want key ordering.
546/// * `IndexMap` does not impl Hash.
547#[derive(Hash, PartialEq)]
548struct SchemaHashEqWrap<'a, Field, Metadata> {
549    fields: &'a indexmap::map::Slice<PlSmallStr, Field>,
550    metadata: &'a Metadata,
551}
552
553impl<'a, Field, Metadata> From<&'a Schema<Field, Metadata>>
554    for SchemaHashEqWrap<'a, Field, Metadata>
555{
556    fn from(value: &'a Schema<Field, Metadata>) -> Self {
557        let Schema { fields, metadata } = value;
558
559        Self {
560            fields: fields.as_slice(),
561            metadata,
562        }
563    }
564}
565
566impl<Field, Metadata: Default> From<PlIndexMap<PlSmallStr, Field>> for Schema<Field, Metadata> {
567    fn from(fields: PlIndexMap<PlSmallStr, Field>) -> Self {
568        Self {
569            fields,
570            metadata: Metadata::default(),
571        }
572    }
573}
574
575impl<F, Field, Metadata: Default> FromIterator<F> for Schema<Field, Metadata>
576where
577    F: Into<(PlSmallStr, Field)>,
578{
579    fn from_iter<I: IntoIterator<Item = F>>(iter: I) -> Self {
580        let fields = PlIndexMap::from_iter(iter.into_iter().map(|x| x.into()));
581        Self {
582            fields,
583            metadata: Metadata::default(),
584        }
585    }
586}
587
588impl<F, Field, Metadata> Extend<F> for Schema<Field, Metadata>
589where
590    F: Into<(PlSmallStr, Field)>,
591{
592    fn extend<T: IntoIterator<Item = F>>(&mut self, iter: T) {
593        self.fields.extend(iter.into_iter().map(|x| x.into()))
594    }
595}
596
597impl<Field, Metadata> IntoIterator for Schema<Field, Metadata> {
598    type IntoIter = <PlIndexMap<PlSmallStr, Field> as IntoIterator>::IntoIter;
599    type Item = (PlSmallStr, Field);
600
601    fn into_iter(self) -> Self::IntoIter {
602        self.fields.into_iter()
603    }
604}
605
#[cfg(test)]
mod tests {
    use super::Schema;

    /// Two schemas holding the same names in a different order must not compare equal.
    #[test]
    fn test_schema_eq_checks_key_order() {
        let a_then_b: Schema<(), ()> = Schema::from_iter([("a".into(), ()), ("b".into(), ())]);
        let b_then_a: Schema<(), ()> = Schema::from_iter([("b".into(), ()), ("a".into(), ())]);

        assert_ne!(a_then_b, b_then_a);
    }
}