Skip to main content

vortex_session/
registry.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Many session types use a registry of objects that can be looked up by name to construct
5//! contexts. This module provides a generic registry type for that purpose.
6
7use std::cmp::Ordering;
8use std::fmt;
9use std::fmt::Debug;
10use std::fmt::Display;
11use std::fmt::Formatter;
12use std::ops::Deref;
13use std::sync::Arc;
14use std::sync::LazyLock;
15use std::sync::OnceLock;
16
17use lasso::Spur;
18use lasso::ThreadedRodeo;
19use parking_lot::RwLock;
20use vortex_error::VortexExpect;
21use vortex_utils::aliases::dash_map::DashMap;
22
23/// Global string interner for [`Id`] values.
24static INTERNER: LazyLock<ThreadedRodeo> = LazyLock::new(ThreadedRodeo::new);
25
26/// A lightweight, copyable identifier backed by a global string interner.
27///
28/// Used for array encoding IDs, scalar function IDs, layout IDs, and similar
29/// globally-unique string identifiers throughout Vortex. Equality and hashing
30/// are O(1) symbol comparisons.
31#[derive(Clone, Copy, PartialEq, Eq, Hash)]
32pub struct Id(Spur);
33
34impl Id {
35    /// Intern a string and return its `Id`.
36    pub fn new(s: &str) -> Self {
37        Self(INTERNER.get_or_intern(s))
38    }
39
40    /// Intern a string and return its `Id`.
41    pub fn new_static(s: &'static str) -> Self {
42        Self(INTERNER.get_or_intern_static(s))
43    }
44
45    /// Returns the interned string.
46    pub fn as_str(&self) -> &str {
47        let s = INTERNER.resolve(&self.0);
48        // SAFETY: INTERNER is 'static and its arena is append-only, so resolved string
49        // pointers are stable for the lifetime of the program.
50        unsafe { &*(s as *const str) }
51    }
52}
53
54impl From<&str> for Id {
55    fn from(s: &str) -> Self {
56        Self::new(s)
57    }
58}
59
60impl Display for Id {
61    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
62        f.write_str(self.as_str())
63    }
64}
65
66impl Debug for Id {
67    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
68        write!(f, "Id(\"{}\")", self.as_str())
69    }
70}
71
72impl PartialOrd for Id {
73    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
74        Some(self.cmp(other))
75    }
76}
77
78impl Ord for Id {
79    fn cmp(&self, other: &Self) -> Ordering {
80        self.as_str().cmp(other.as_str())
81    }
82}
83
84impl AsRef<str> for Id {
85    fn as_ref(&self) -> &str {
86        self.as_str()
87    }
88}
89
90impl PartialEq<&Id> for Id {
91    fn eq(&self, other: &&Id) -> bool {
92        self == *other
93    }
94}
95
96impl PartialEq<Id> for &Id {
97    fn eq(&self, other: &Id) -> bool {
98        *self == other
99    }
100}
101
102/// A lazily-initialized, cached [`Id`] for use as a `static`.
103///
104/// Avoids repeated interner write-lock acquisition by storing the interned [`Id`]
105/// on first access and returning the cached copy on all subsequent calls.
106///
107/// # Example
108///
109/// ```
110/// use vortex_session::registry::{CachedId, Id};
111///
112/// static MY_ID: CachedId = CachedId::new("my.encoding");
113///
114/// fn get_id() -> Id {
115///     *MY_ID
116/// }
117/// ```
118pub struct CachedId {
119    s: &'static str,
120    cached: OnceLock<Id>,
121}
122
123impl CachedId {
124    /// Create a new `CachedId` that will intern `s` on first access.
125    pub const fn new(s: &'static str) -> Self {
126        Self {
127            s,
128            cached: OnceLock::new(),
129        }
130    }
131}
132
133impl Deref for CachedId {
134    type Target = Id;
135
136    fn deref(&self) -> &Id {
137        self.cached.get_or_init(|| Id::new(self.s))
138    }
139}
140
141/// A registry of items that are keyed by a string identifier.
142#[derive(Clone, Debug)]
143pub struct Registry<T>(Arc<DashMap<Id, T>>);
144
145impl<T> Default for Registry<T> {
146    fn default() -> Self {
147        Self(Default::default())
148    }
149}
150
151impl<T: Clone> Registry<T> {
152    pub fn empty() -> Self {
153        Self(Default::default())
154    }
155
156    /// List the IDs in the registry.
157    pub fn ids(&self) -> impl Iterator<Item = Id> + '_ {
158        self.0.iter().map(|i| *i.key())
159    }
160
161    /// List the items in the registry.
162    pub fn items(&self) -> impl Iterator<Item = T> + '_ {
163        self.0.iter().map(|i| i.value().clone())
164    }
165
166    /// Return the items with the given IDs.
167    pub fn find_many<'a>(
168        &self,
169        ids: impl IntoIterator<Item = &'a Id>,
170    ) -> impl Iterator<Item = Option<impl Deref<Target = T>>> {
171        ids.into_iter().map(|id| self.0.get(id))
172    }
173
174    /// Find the item with the given ID.
175    pub fn find(&self, id: &Id) -> Option<T> {
176        self.0.get(id).as_deref().cloned()
177    }
178
179    /// Register a new item, replacing any existing item with the same ID.
180    pub fn register(&self, id: impl Into<Id>, item: impl Into<T>) {
181        self.0.insert(id.into(), item.into());
182    }
183
184    /// Register a new item, replacing any existing item with the same ID, and return self for
185    pub fn with(self, id: impl Into<Id>, item: impl Into<T>) -> Self {
186        self.register(id, item.into());
187        self
188    }
189}
190
191/// A [`ReadContext`] holds a set of interned IDs for use during deserialization, mapping
192/// u16 indices to IDs.
193#[derive(Clone, Debug)]
194pub struct ReadContext {
195    ids: Arc<[Id]>,
196}
197
198impl ReadContext {
199    /// Create a context with the given initial IDs.
200    pub fn new(ids: impl Into<Arc<[Id]>>) -> Self {
201        Self { ids: ids.into() }
202    }
203
204    /// Resolve an interned ID by its index.
205    pub fn resolve(&self, idx: u16) -> Option<Id> {
206        self.ids.get(idx as usize).cloned()
207    }
208
209    pub fn ids(&self) -> &[Id] {
210        &self.ids
211    }
212}
213
214/// A [`Context`] holds a set of interned IDs for use during serialization/deserialization, mapping
215/// IDs to u16 indices.
216///
217/// ## Upcoming Changes
218///
219/// 1. This object holds an Arc of RwLock internally because we need concurrent access from the
220///    layout writer code path. We should update SegmentSink to take an Array rather than
221///    ByteBuffer such that serializing arrays is done sequentially.
222/// 2. The name is terrible. `Interner<T>` is better, but I want to minimize breakage for now.
223#[derive(Clone, Debug)]
224pub struct Context<T> {
225    // TODO(ngates): it's a long story, but if we make SegmentSink and SegmentSource take an
226    //  enum of Segment { Array, DType, Buffer } then we don't actually need a mutable context
227    //  in the LayoutWriter, therefore we don't need a RwLock here and everyone is happier.
228    ids: Arc<RwLock<Vec<Id>>>,
229    // Optional registry used to filter the permissible interned items.
230    registry: Option<Registry<T>>,
231}
232
233impl<T> Default for Context<T> {
234    fn default() -> Self {
235        Self {
236            ids: Arc::new(RwLock::new(Vec::new())),
237            registry: None,
238        }
239    }
240}
241
242impl<T: Clone> Context<T> {
243    /// Create a context with the given initial IDs.
244    pub fn new(ids: Vec<Id>) -> Self {
245        Self {
246            ids: Arc::new(RwLock::new(ids)),
247            registry: None,
248        }
249    }
250
251    /// Create an empty context.
252    pub fn empty() -> Self {
253        Self::default()
254    }
255
256    /// Configure a registry to restrict the permissible set of interned items.
257    pub fn with_registry(mut self, registry: Registry<T>) -> Self {
258        self.registry = Some(registry);
259        self
260    }
261
262    /// Intern an ID, returning its index.
263    pub fn intern(&self, id: &Id) -> Option<u16> {
264        if let Some(registry) = &self.registry
265            && registry.find(id).is_none()
266        {
267            // ID not in registry, cannot intern.
268            return None;
269        }
270
271        let mut ids = self.ids.write();
272        if let Some(idx) = ids.iter().position(|e| e == id) {
273            return Some(u16::try_from(idx).vortex_expect("Cannot have more than u16::MAX items"));
274        }
275
276        let idx = ids.len();
277        assert!(
278            idx < u16::MAX as usize,
279            "Cannot have more than u16::MAX items"
280        );
281        ids.push(*id);
282        Some(u16::try_from(idx).vortex_expect("checked already"))
283    }
284
285    /// Get the list of interned IDs.
286    pub fn to_ids(&self) -> Vec<Id> {
287        self.ids.read().clone()
288    }
289}