Skip to main content

vortex_session/
registry.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Many session types use a registry of objects that can be looked up by name to construct
5//! contexts. This module provides a generic registry type for that purpose.
6
7use std::cmp::Ordering;
8use std::fmt;
9use std::fmt::Debug;
10use std::fmt::Display;
11use std::fmt::Formatter;
12use std::hash::Hash;
13use std::ops::Deref;
14use std::sync::Arc;
15use std::sync::LazyLock;
16use std::sync::OnceLock;
17
18use lasso::Spur;
19use lasso::ThreadedRodeo;
20use parking_lot::RwLock;
21use vortex_error::VortexExpect;
22use vortex_utils::aliases::DefaultHashBuilder;
23use vortex_utils::aliases::dash_map::DashMap;
24
25/// Global string interner for [`Id`] values.
26static INTERNER: LazyLock<ThreadedRodeo<Spur, DefaultHashBuilder>> =
27    LazyLock::new(|| ThreadedRodeo::with_hasher(DefaultHashBuilder::default()));
28
29/// A lightweight, copyable identifier backed by a global string interner.
30///
31/// Used for array encoding IDs, scalar function IDs, layout IDs, and similar
32/// globally-unique string identifiers throughout Vortex. Equality and hashing
33/// are O(1) symbol comparisons.
34#[derive(Clone, Copy, PartialEq, Eq, Hash)]
35pub struct Id(Spur);
36
37impl Id {
38    /// Intern a string and return its `Id`.
39    pub fn new(s: &str) -> Self {
40        Self(INTERNER.get_or_intern(s))
41    }
42
43    /// Intern a string and return its `Id`.
44    pub fn new_static(s: &'static str) -> Self {
45        Self(INTERNER.get_or_intern_static(s))
46    }
47
48    /// Returns the interned string.
49    pub fn as_str(&self) -> &str {
50        let s = INTERNER.resolve(&self.0);
51        // SAFETY: INTERNER is 'static and its arena is append-only, so resolved string
52        // pointers are stable for the lifetime of the program.
53        unsafe { &*(s as *const str) }
54    }
55}
56
57impl From<&str> for Id {
58    #[expect(clippy::disallowed_methods, reason = "interning a dynamic id")]
59    fn from(s: &str) -> Self {
60        Self::new(s)
61    }
62}
63
64impl Display for Id {
65    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
66        f.write_str(self.as_str())
67    }
68}
69
70impl Debug for Id {
71    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
72        write!(f, "Id(\"{}\")", self.as_str())
73    }
74}
75
76impl PartialOrd for Id {
77    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
78        Some(self.cmp(other))
79    }
80}
81
82impl Ord for Id {
83    fn cmp(&self, other: &Self) -> Ordering {
84        self.as_str().cmp(other.as_str())
85    }
86}
87
88impl AsRef<str> for Id {
89    fn as_ref(&self) -> &str {
90        self.as_str()
91    }
92}
93
94impl PartialEq<&Id> for Id {
95    fn eq(&self, other: &&Id) -> bool {
96        self == *other
97    }
98}
99
100impl PartialEq<Id> for &Id {
101    fn eq(&self, other: &Id) -> bool {
102        *self == other
103    }
104}
105
106/// A lazily-initialized, cached [`Id`] for use as a `static`.
107///
108/// Avoids repeated interner write-lock acquisition by storing the interned [`Id`]
109/// on first access and returning the cached copy on all subsequent calls.
110///
111/// # Example
112///
113/// ```
114/// use vortex_session::registry::{CachedId, Id};
115///
116/// static MY_ID: CachedId = CachedId::new("my.encoding");
117///
118/// fn get_id() -> Id {
119///     *MY_ID
120/// }
121/// ```
122pub struct CachedId {
123    s: &'static str,
124    cached: OnceLock<Id>,
125}
126
127impl CachedId {
128    /// Create a new `CachedId` that will intern `s` on first access.
129    pub const fn new(s: &'static str) -> Self {
130        Self {
131            s,
132            cached: OnceLock::new(),
133        }
134    }
135}
136
137impl Deref for CachedId {
138    type Target = Id;
139
140    #[expect(
141        clippy::disallowed_methods,
142        reason = "CachedId interns its static id once here"
143    )]
144    fn deref(&self) -> &Id {
145        self.cached.get_or_init(|| Id::new_static(self.s))
146    }
147}
148
149/// A registry of items that are keyed by a string identifier.
150#[derive(Clone, Debug)]
151pub struct Registry<T>(Arc<DashMap<Id, T>>);
152
153impl<T> Default for Registry<T> {
154    fn default() -> Self {
155        Self(Default::default())
156    }
157}
158
159impl<T: Clone> Registry<T> {
160    pub fn empty() -> Self {
161        Self(Default::default())
162    }
163
164    /// List the IDs in the registry.
165    pub fn ids(&self) -> impl Iterator<Item = Id> + '_ {
166        self.0.iter().map(|i| *i.key())
167    }
168
169    /// List the items in the registry.
170    pub fn items(&self) -> impl Iterator<Item = T> + '_ {
171        self.0.iter().map(|i| i.value().clone())
172    }
173
174    /// Return the items with the given IDs.
175    pub fn find_many<'a>(
176        &self,
177        ids: impl IntoIterator<Item = &'a Id>,
178    ) -> impl Iterator<Item = Option<impl Deref<Target = T>>> {
179        ids.into_iter().map(|id| self.0.get(id))
180    }
181
182    /// Find the item with the given ID.
183    pub fn find(&self, id: &Id) -> Option<T> {
184        self.0.get(id).as_deref().cloned()
185    }
186
187    /// Register a new item, replacing any existing item with the same ID.
188    pub fn register(&self, id: impl Into<Id>, item: impl Into<T>) {
189        self.0.insert(id.into(), item.into());
190    }
191
192    /// Register a new item, replacing any existing item with the same ID, and return self for
193    pub fn with(self, id: impl Into<Id>, item: impl Into<T>) -> Self {
194        self.register(id, item.into());
195        self
196    }
197}
198
199/// A [`ReadContext`] holds a set of interned IDs for use during deserialization, mapping
200/// u16 indices to IDs.
201#[derive(Clone, Debug)]
202pub struct ReadContext {
203    ids: Arc<[Id]>,
204}
205
206impl ReadContext {
207    /// Create a context with the given initial IDs.
208    pub fn new(ids: impl Into<Arc<[Id]>>) -> Self {
209        Self { ids: ids.into() }
210    }
211
212    /// Resolve an interned ID by its index.
213    pub fn resolve(&self, idx: u16) -> Option<Id> {
214        self.ids.get(idx as usize).cloned()
215    }
216
217    pub fn ids(&self) -> &[Id] {
218        &self.ids
219    }
220}
221
222/// A [`Context`] holds a set of interned IDs for use during serialization/deserialization, mapping
223/// IDs to u16 indices.
224///
225/// ## Upcoming Changes
226///
227/// 1. This object holds an Arc of RwLock internally because we need concurrent access from the
228///    layout writer code path. We should update SegmentSink to take an Array rather than
229///    ByteBuffer such that serializing arrays is done sequentially.
230/// 2. The name is terrible. `Interner<T>` is better, but I want to minimize breakage for now.
231#[derive(Clone, Debug)]
232pub struct Context<T> {
233    // TODO(ngates): it's a long story, but if we make SegmentSink and SegmentSource take an
234    //  enum of Segment { Array, DType, Buffer } then we don't actually need a mutable context
235    //  in the LayoutWriter, therefore we don't need a RwLock here and everyone is happier.
236    ids: Arc<RwLock<Vec<Id>>>,
237    // Optional registry used to filter the permissible interned items.
238    registry: Option<Registry<T>>,
239}
240
241impl<T> Default for Context<T> {
242    fn default() -> Self {
243        Self {
244            ids: Arc::new(RwLock::new(Vec::new())),
245            registry: None,
246        }
247    }
248}
249
250impl<T: Clone> Context<T> {
251    /// Create a context with the given initial IDs.
252    pub fn new(ids: Vec<Id>) -> Self {
253        Self {
254            ids: Arc::new(RwLock::new(ids)),
255            registry: None,
256        }
257    }
258
259    /// Create an empty context.
260    pub fn empty() -> Self {
261        Self::default()
262    }
263
264    /// Configure a registry to restrict the permissible set of interned items.
265    pub fn with_registry(mut self, registry: Registry<T>) -> Self {
266        self.registry = Some(registry);
267        self
268    }
269
270    /// Intern an ID, returning its index.
271    pub fn intern(&self, id: &Id) -> Option<u16> {
272        if let Some(registry) = &self.registry
273            && registry.find(id).is_none()
274        {
275            // ID not in registry, cannot intern.
276            return None;
277        }
278
279        let mut ids = self.ids.write();
280        if let Some(idx) = ids.iter().position(|e| e == id) {
281            return Some(u16::try_from(idx).vortex_expect("Cannot have more than u16::MAX items"));
282        }
283
284        let idx = ids.len();
285        assert!(
286            idx < u16::MAX as usize,
287            "Cannot have more than u16::MAX items"
288        );
289        ids.push(*id);
290        Some(u16::try_from(idx).vortex_expect("checked already"))
291    }
292
293    /// Get the list of interned IDs.
294    pub fn to_ids(&self) -> Vec<Id> {
295        self.ids.read().clone()
296    }
297}