Skip to main content

lite_strtab/
table.rs

1//! Immutable string storage backed by one byte buffer and one offset table.
2//!
3//! The layout is:
4//!
5//! - `bytes`: all UTF-8 string bytes concatenated
6//! - `offsets`: start offset for each string, plus one final sentinel
7//! - strings are not NUL-terminated by default; boundaries come from offsets
8//!
9//! For `n` strings, `offsets.len() == n + 1`.
10//! String `i` is `bytes[offsets[i]..offsets[i + 1]]`.
11
12use core::iter::FusedIterator;
13use core::marker::PhantomData;
14use core::ops::Range;
15use core::str;
16
17use crate::allocator::*;
18#[cfg(any(debug_assertions, test))]
19use crate::error::{ValidationError, ValidationResult};
20use crate::{Offset, StringId, StringIndex};
21
22/// Alias for [`StringTable`].
23pub type StringPool<O = u32, I = u16, const NULL_PADDED: bool = false, A = Global> =
24    StringTable<O, I, NULL_PADDED, A>;
25
26/// Alias for [`StringTableIter`].
27pub type StringPoolIter<'a, O = u32, const NULL_PADDED: bool = false> =
28    StringTableIter<'a, O, NULL_PADDED>;
29
30/// Immutable string storage.
31///
32/// All strings are stored in a single contiguous UTF-8 byte buffer.
33/// An offset table maps each [`StringId`] to a byte range.
34///
35/// The table is immutable once built. This keeps references returned by [`Self::get`]
36/// valid for the lifetime of `&self` and avoids mutation-related reallocation issues.
37///
38/// The offset table always contains one extra value at the end (a sentinel)
39/// equal to `bytes.len()`. This allows `get` to resolve a range with two
40/// offset reads.
41///
42/// Generic parameters control capacity and metadata size:
43/// - `O` is the byte-offset type (see [`Offset`]). It bounds total UTF-8 bytes and costs
44///   `size_of::<O>()` per string inside the [`crate::StringTable`].
45/// - `I` is the string-ID type (see [`StringIndex`]) used by [`crate::StringId`].
46///   It limits string count and costs `size_of::<I>()` per stored ID field
47///   (table index) in your own structs.
48///
49/// The common choice is `O = u32, I = u16`: meaning `4 GiB` of UTF-8 data
50/// and `64Ki` entries per table.
51///
52/// This is 4 bytes per string offset in the table and 2 bytes
53/// per StringID (index into table) inside your structs.
54/// For comparison: `Box<str>` == 16 bytes, `String` == 24 bytes.
55///
56/// By default, inserted strings are not NUL-terminated.
57/// Set `NULL_PADDED = true` to store strings with a trailing NUL byte.
58///
59/// # Example
60///
61/// ```rust
62/// use lite_strtab::StringTableBuilder;
63///
64/// let mut builder = StringTableBuilder::new();
65/// let a = builder.try_push("cat").unwrap();
66/// let b = builder.try_push("dog").unwrap();
67///
68/// let table = builder.build();
69/// assert_eq!(table.get(a), Some("cat"));
70/// assert_eq!(table.get(b), Some("dog"));
71/// ```
72pub struct StringTable<
73    O = u32,
74    I = u16,
75    const NULL_PADDED: bool = false,
76    A: Allocator + Clone = Global,
77> where
78    O: Offset,
79    I: StringIndex,
80{
81    bytes: Box<[u8], A>,
82    offsets: Box<[O], A>,
83    _id: PhantomData<I>,
84}
85
86impl StringTable<u32, u16, false, Global> {
87    /// Creates an empty table using the global allocator.
88    #[inline]
89    pub fn empty() -> Self {
90        Self::empty_in(Global)
91    }
92}
93
94impl<O: Offset, I: StringIndex, const NULL_PADDED: bool, A: Allocator + Clone>
95    StringTable<O, I, NULL_PADDED, A>
96{
97    /// Creates an empty table with a custom allocator.
98    pub fn empty_in(allocator: A) -> Self {
99        let bytes = Vec::new_in(allocator.clone()).into_boxed_slice();
100        let mut offsets = Vec::with_capacity_in(1, allocator);
101        offsets.push(zero_offset::<O>());
102
103        Self {
104            bytes,
105            offsets: offsets.into_boxed_slice(),
106            _id: PhantomData,
107        }
108    }
109
110    #[inline]
111    pub(crate) fn from_parts_unchecked(bytes: Box<[u8], A>, offsets: Box<[O], A>) -> Self {
112        Self {
113            bytes,
114            offsets,
115            _id: PhantomData,
116        }
117    }
118
119    /// Number of strings in the table.
120    #[inline]
121    pub fn len(&self) -> usize {
122        self.offsets.len().saturating_sub(1)
123    }
124
125    /// Returns `true` when the table has no strings.
126    #[inline]
127    pub fn is_empty(&self) -> bool {
128        self.len() == 0
129    }
130
131    /// Returns the string for a given ID.
132    #[inline]
133    pub fn get(&self, id: StringId<I>) -> Option<&str> {
134        let index = id.into_usize();
135        // Failure (None) is unlikely; users typically provide valid indices.
136        // Since likely/unlikely isn't stable, we structure this so the
137        // success path falls through without jumps, improving pipelining.
138        if index < self.len() {
139            // SAFETY: Bounds check above guarantees both offset entries exist.
140            let start = unsafe { self.offsets.get_unchecked(index) }.to_usize();
141            let end = unsafe { self.offsets.get_unchecked(index + 1) }.to_usize();
142            // Const generic: default (`false`) folds `saturating_sub(0)` to `end`.
143            let logical_end = end.saturating_sub(usize::from(NULL_PADDED));
144            debug_assert!(logical_end >= start);
145
146            // SAFETY: Table invariants guarantee this range is in bounds and valid UTF-8.
147            let bytes = unsafe { self.bytes.get_unchecked(start..logical_end) };
148            // SAFETY: Invariants guarantee all ranges are valid UTF-8.
149            Some(unsafe { str::from_utf8_unchecked(bytes) })
150        } else {
151            None
152        }
153    }
154
155    /// Returns the string for a given ID without bounds checks.
156    ///
157    /// # Safety
158    ///
159    /// `id` must be in bounds (`id < self.len()`).
160    #[inline]
161    pub unsafe fn get_unchecked(&self, id: StringId<I>) -> &str {
162        let index = id.into_usize();
163        let start = unsafe { self.offsets.get_unchecked(index) }.to_usize();
164        let end = unsafe { self.offsets.get_unchecked(index + 1) }.to_usize();
165        // Const generic: default (`false`) folds `saturating_sub(0)` to `end`.
166        let logical_end = end.saturating_sub(usize::from(NULL_PADDED));
167        debug_assert!(logical_end >= start);
168        let bytes = unsafe { self.bytes.get_unchecked(start..logical_end) };
169
170        // SAFETY: Invariants guarantee all ranges are valid UTF-8.
171        unsafe { str::from_utf8_unchecked(bytes) }
172    }
173
174    /// Returns an iterator over all strings.
175    #[inline]
176    pub fn iter(&self) -> StringTableIter<'_, O, NULL_PADDED> {
177        let offsets = &self.offsets;
178        let strings = offsets.len().saturating_sub(1);
179        let cur_offset = offsets.as_ptr();
180
181        StringTableIter {
182            bytes: &self.bytes,
183            cur_offset,
184            // SAFETY: `strings` is at most `offsets.len() - 1`, so this stays
185            // in-bounds and may equal `cur_offset` for an empty iterator.
186            max_offset: unsafe { cur_offset.add(strings) },
187            remaining: strings,
188            _offsets: PhantomData,
189        }
190    }
191
192    /// Returns the contiguous byte storage.
193    #[inline]
194    pub fn as_bytes(&self) -> &[u8] {
195        &self.bytes
196    }
197
198    /// Returns `true` if any stored string equals `value`.
199    #[inline]
200    pub fn contains(&self, value: &str) -> bool {
201        self.iter().any(|item| item == value)
202    }
203
204    /// Returns the offset table, including the final sentinel.
205    #[inline]
206    pub fn offsets(&self) -> &[O] {
207        &self.offsets
208    }
209
210    /// Returns the byte range for a given ID.
211    #[inline]
212    pub fn byte_range(&self, id: StringId<I>) -> Option<Range<usize>> {
213        let index = id.into_usize();
214        // Failure (None) is unlikely; users typically provide valid indices.
215        // Since likely/unlikely isn't stable, we structure this so the
216        // success path falls through without jumps, improving pipelining.
217        if index < self.len() {
218            // SAFETY: Bounds check above ensures `index` and `index + 1` are valid.
219            let start = unsafe { self.offsets.get_unchecked(index) }.to_usize();
220            let end = unsafe { self.offsets.get_unchecked(index + 1) }.to_usize();
221            // Const generic: default (`false`) folds `saturating_sub(0)` to `end`.
222            let logical_end = end.saturating_sub(usize::from(NULL_PADDED));
223            debug_assert!(logical_end >= start);
224
225            Some(start..logical_end)
226        } else {
227            None
228        }
229    }
230
231    #[cfg(any(debug_assertions, test))]
232    pub(crate) fn validate(&self) -> ValidationResult<()> {
233        let bytes_len = self.bytes.len();
234        if O::try_from_usize(bytes_len).is_none() {
235            return Err(ValidationError::TooManyBytesForOffsetType {
236                bytes: bytes_len,
237                offset_type: O::TYPE_NAME,
238            });
239        }
240
241        let strings = self.len();
242        if strings > 0 && I::try_from_usize(strings - 1).is_none() {
243            return Err(ValidationError::TooManyStrings {
244                strings,
245                id_type: I::TYPE_NAME,
246            });
247        }
248
249        let offsets = &self.offsets;
250        if offsets.is_empty() {
251            return Err(ValidationError::MissingSentinelOffset);
252        }
253
254        let last_index = offsets.len() - 1;
255        let found_last = offsets[last_index].to_usize();
256        if found_last != bytes_len {
257            return Err(ValidationError::LastOffsetMismatch {
258                found: found_last,
259                expected: bytes_len,
260            });
261        }
262
263        let mut previous = 0usize;
264        for (index, &offset) in offsets.iter().enumerate() {
265            let current = offset.to_usize();
266
267            if current > bytes_len {
268                return Err(ValidationError::OffsetOutOfBounds {
269                    index,
270                    offset: current,
271                    bytes_len,
272                });
273            }
274
275            if index == 0 {
276                previous = current;
277                continue;
278            }
279
280            if current < previous {
281                return Err(ValidationError::OffsetsNotMonotonic {
282                    index,
283                    previous,
284                    current,
285                });
286            }
287
288            if NULL_PADDED {
289                if current == previous {
290                    return Err(ValidationError::NullPaddedStringMissingTerminatorByte {
291                        index: index - 1,
292                    });
293                }
294
295                let terminator_index = current - 1;
296                if self.bytes[terminator_index] != 0 {
297                    return Err(ValidationError::NullPaddedStringMissingTrailingNul {
298                        index: index - 1,
299                    });
300                }
301
302                if str::from_utf8(&self.bytes[previous..terminator_index]).is_err() {
303                    return Err(ValidationError::InvalidUtf8 { index: index - 1 });
304                }
305            } else if str::from_utf8(&self.bytes[previous..current]).is_err() {
306                return Err(ValidationError::InvalidUtf8 { index: index - 1 });
307            }
308
309            previous = current;
310        }
311
312        Ok(())
313    }
314}
315
316/// Iterator returned by [`StringTable::iter`].
317pub struct StringTableIter<'a, O: Offset = u32, const NULL_PADDED: bool = false> {
318    bytes: &'a [u8],
319    cur_offset: *const O,
320    max_offset: *const O,
321    remaining: usize,
322    _offsets: PhantomData<&'a [O]>,
323}
324
325impl<'a, O: Offset, const NULL_PADDED: bool> Iterator for StringTableIter<'a, O, NULL_PADDED> {
326    type Item = &'a str;
327
328    #[inline]
329    fn next(&mut self) -> Option<Self::Item> {
330        if self.cur_offset != self.max_offset {
331            // SAFETY: `cur_offset != max_offset` guarantees at least one string
332            // remains, so both `cur_offset` and `cur_offset + 1` are valid.
333            let start = unsafe { (*self.cur_offset).to_usize() };
334            self.cur_offset = unsafe { self.cur_offset.add(1) };
335            let end = unsafe { (*self.cur_offset).to_usize() };
336            self.remaining -= 1;
337
338            // Const generic: default (`false`) folds `saturating_sub(0)` to `end`.
339            let logical_end = end.saturating_sub(usize::from(NULL_PADDED));
340            debug_assert!(logical_end >= start);
341
342            // SAFETY: Pool invariants guarantee this slice is in bounds and valid UTF-8.
343            let bytes = unsafe { self.bytes.get_unchecked(start..logical_end) };
344            Some(unsafe { str::from_utf8_unchecked(bytes) })
345        } else {
346            None
347        }
348    }
349
350    #[inline]
351    fn size_hint(&self) -> (usize, Option<usize>) {
352        let remaining = self.len();
353        (remaining, Some(remaining))
354    }
355}
356
357impl<O: Offset, const NULL_PADDED: bool> ExactSizeIterator for StringTableIter<'_, O, NULL_PADDED> {
358    #[inline]
359    fn len(&self) -> usize {
360        self.remaining
361    }
362}
363
364impl<O: Offset, const NULL_PADDED: bool> FusedIterator for StringTableIter<'_, O, NULL_PADDED> {}
365
366#[inline]
367fn zero_offset<O: Offset>() -> O {
368    // SAFETY: All built-in integer implementations accept zero.
369    unsafe { O::try_from_usize(0).unwrap_unchecked() }
370}
371
372#[cfg(test)]
373mod tests {
374    use crate::allocator::{Global, Vec};
375    use crate::error::{ValidationError, ValidationResult};
376    use crate::{Offset, StringId, StringIndex, StringTable};
377
378    fn validate_parts<O: Offset, I: StringIndex, const NULL_PADDED: bool>(
379        bytes: Vec<u8, Global>,
380        offsets: Vec<O, Global>,
381    ) -> ValidationResult<()> {
382        let table = StringTable::<O, I, NULL_PADDED, Global>::from_parts_unchecked(
383            bytes.into_boxed_slice(),
384            offsets.into_boxed_slice(),
385        );
386        table.validate()
387    }
388
389    #[test]
390    fn validate_rejects_missing_sentinel() {
391        let mut bytes = Vec::new_in(Global);
392        bytes.extend_from_slice(b"hello");
393
394        let mut offsets = Vec::new_in(Global);
395        offsets.push(0u32);
396
397        let result = validate_parts::<u32, u32, false>(bytes, offsets);
398        assert!(matches!(
399            result,
400            Err(ValidationError::LastOffsetMismatch { .. })
401        ));
402    }
403
404    #[test]
405    fn validate_rejects_non_monotonic_offsets() {
406        let mut bytes = Vec::new_in(Global);
407        bytes.extend_from_slice(b"abcd");
408
409        let mut offsets = Vec::new_in(Global);
410        offsets.push(0u32);
411        offsets.push(3u32);
412        offsets.push(2u32);
413        offsets.push(4u32);
414
415        let result = validate_parts::<u32, u32, false>(bytes, offsets);
416        assert!(matches!(
417            result,
418            Err(ValidationError::OffsetsNotMonotonic { .. })
419        ));
420    }
421
422    #[test]
423    fn validate_rejects_invalid_utf8() {
424        let mut bytes = Vec::new_in(Global);
425        bytes.push(0xFF);
426
427        let mut offsets = Vec::new_in(Global);
428        offsets.push(0u32);
429        offsets.push(1u32);
430
431        let result = validate_parts::<u32, u32, false>(bytes, offsets);
432        assert!(matches!(result, Err(ValidationError::InvalidUtf8 { .. })));
433    }
434
435    #[test]
436    fn validate_null_padded_accepts_trailing_nul() {
437        let mut bytes = Vec::new_in(Global);
438        bytes.extend_from_slice(b"hello\0");
439
440        let mut offsets = Vec::new_in(Global);
441        offsets.push(0u32);
442        offsets.push(6u32);
443
444        let result = validate_parts::<u32, u32, true>(bytes, offsets);
445        assert!(result.is_ok());
446    }
447
448    #[test]
449    fn validate_null_padded_rejects_missing_trailing_nul() {
450        let mut bytes = Vec::new_in(Global);
451        bytes.extend_from_slice(b"hello");
452
453        let mut offsets = Vec::new_in(Global);
454        offsets.push(0u32);
455        offsets.push(5u32);
456
457        let result = validate_parts::<u32, u32, true>(bytes, offsets);
458        assert!(matches!(
459            result,
460            Err(ValidationError::NullPaddedStringMissingTrailingNul { .. })
461        ));
462    }
463
464    #[test]
465    fn validate_rejects_offset_type_overflow() {
466        let mut bytes = Vec::new_in(Global);
467        bytes.extend_from_slice(b"abc");
468
469        let mut offsets = Vec::new_in(Global);
470        offsets.push(0u8);
471        offsets.push(3u8);
472
473        let result = validate_parts::<u8, u32, false>(bytes, offsets);
474        assert!(result.is_ok());
475
476        let mut too_big = Vec::new_in(Global);
477        too_big.extend_from_slice(&[0u8; 300]);
478        let mut offsets = Vec::new_in(Global);
479        offsets.push(0u8);
480        offsets.push(u8::MAX);
481
482        let result = validate_parts::<u8, u32, false>(too_big, offsets);
483        assert!(matches!(
484            result,
485            Err(ValidationError::TooManyBytesForOffsetType { .. })
486        ));
487    }
488
489    #[test]
490    fn validate_rejects_id_type_overflow() {
491        let bytes = Vec::new_in(Global).into_boxed_slice();
492        let mut offsets = Vec::new_in(Global);
493        for _ in 0..258 {
494            offsets.push(0u32);
495        }
496
497        let table = StringTable::<u32, u8>::from_parts_unchecked(bytes, offsets.into_boxed_slice());
498        let result = table.validate();
499        assert!(matches!(
500            result,
501            Err(ValidationError::TooManyStrings {
502                strings: 257,
503                id_type: "u8"
504            })
505        ));
506    }
507
508    #[test]
509    fn get_returns_none_for_invalid_id() {
510        let table = StringTable::empty();
511        assert_eq!(table.get(StringId::new(0)), None);
512    }
513}