Skip to main content

luna_core/runtime/
string.rs

1//! Lua strings: immutable byte sequences allocated in one block (header +
2//! inline bytes). Short strings (≤ 40 bytes, PUC LUAI_MAXSHORTLEN) are
3//! interned in the heap's string table: equality is pointer equality. Long
4//! strings hash lazily, seeded per-heap (hash-flooding defense for hostile
5//! script workloads (script host)).
6
7use std::alloc::{Layout, alloc, dealloc, handle_alloc_error};
8use std::cell::Cell;
9use std::ptr;
10use std::slice;
11
12use crate::runtime::heap::{GcHeader, ObjTag};
13
14/// Strings up to this byte length are interned in the heap's string table;
15/// longer strings are heap-individual and hashed lazily.
16pub const MAX_SHORT_LEN: usize = 40;
17
18/// Lua string object — header plus inline byte payload. Byte-clean (Lua
19/// strings are arbitrary byte sequences, not necessarily UTF-8). Access the
20/// bytes via `Gc<LuaStr>::as_bytes`.
21#[repr(C)]
22pub struct LuaStr {
23    pub(crate) hdr: GcHeader,
24    /// string-table bucket chain (short strings only)
25    hnext: *mut LuaStr,
26    /// for long strings this holds the heap seed until the hash is computed
27    hash: Cell<u32>,
28    hashed: Cell<bool>,
29    short: bool,
30    len: u32,
31    // `len` bytes follow the struct
32}
33
34impl LuaStr {
35    /// Byte length of the string (not character count).
36    pub fn len(&self) -> usize {
37        self.len as usize
38    }
39
40    /// True when the string is zero bytes long.
41    pub fn is_empty(&self) -> bool {
42        self.len == 0
43    }
44
45    pub(crate) fn is_short(&self) -> bool {
46        self.short
47    }
48}
49
50/// Inline-bytes access MUST go through a pointer carrying the provenance of
51/// the original allocation — a `&LuaStr` only covers the header, so deriving
52/// the tail from it is UB (caught by miri). `Gc` stores the allocation
53/// pointer, hence these live on `Gc<LuaStr>`.
54impl crate::runtime::heap::Gc<LuaStr> {
55    /// Borrow the underlying bytes of this Lua string.
56    pub fn as_bytes(&self) -> &[u8] {
57        // SAFETY: `self.as_ptr()` is the start of this `LuaStr`'s header which was allocated with the trailing bytes / hash fields in the same allocation by `StringTable::intern`.
58        unsafe { bytes_of(self.as_ptr()) }
59    }
60
61    /// Cached hash of the string (computed lazily for long strings).
62    pub fn hash(&self) -> u32 {
63        // SAFETY: `self.as_ptr()` is the start of this `LuaStr`'s header which was allocated with the trailing bytes / hash fields in the same allocation by `StringTable::intern`.
64        unsafe { hash_of(self.as_ptr()) }
65    }
66}
67
68/// SAFETY: `p` must point to a live string allocation (with its tail).
69pub(crate) unsafe fn bytes_of<'a>(p: *const LuaStr) -> &'a [u8] {
70    unsafe { slice::from_raw_parts(p.add(1) as *const u8, (*p).len as usize) }
71}
72
73/// SAFETY: as `bytes_of`.
74pub(crate) unsafe fn hash_of(p: *const LuaStr) -> u32 {
75    unsafe {
76        if !(*p).hashed.get() {
77            (*p).hash.set(lua_hash(bytes_of(p), (*p).hash.get()));
78            (*p).hashed.set(true);
79        }
80        (*p).hash.get()
81    }
82}
83
84/// PUC luaS_hash (all bytes, no step — post-5.3 flooding fix).
85pub(crate) fn lua_hash(bytes: &[u8], seed: u32) -> u32 {
86    let mut h = seed ^ bytes.len() as u32;
87    for &b in bytes {
88        h ^= h
89            .wrapping_shl(5)
90            .wrapping_add(h.wrapping_shr(2))
91            .wrapping_add(b as u32);
92    }
93    h
94}
95
96fn layout(len: usize) -> Layout {
97    Layout::new::<LuaStr>()
98        .extend(Layout::array::<u8>(len).expect("string size overflows layout"))
99        .expect("string size overflows layout")
100        .0
101        .pad_to_align()
102}
103
104fn alloc_str(bytes: &[u8], short: bool, hash: u32, hashed: bool) -> *mut LuaStr {
105    let layout = layout(bytes.len());
106    // SAFETY: layout is built from the header size + trailing bytes length we just computed; deallocation will use the same layout in `Heap::sweep_strings`.
107    unsafe {
108        let p = alloc(layout) as *mut LuaStr;
109        if p.is_null() {
110            handle_alloc_error(layout);
111        }
112        p.write(LuaStr {
113            hdr: GcHeader::new(ObjTag::Str),
114            hnext: ptr::null_mut(),
115            hash: Cell::new(hash),
116            hashed: Cell::new(hashed),
117            short,
118            len: bytes.len() as u32,
119        });
120        ptr::copy_nonoverlapping(bytes.as_ptr(), p.add(1) as *mut u8, bytes.len());
121        p
122    }
123}
124
125pub(crate) fn alloc_long(bytes: &[u8], seed: u32) -> *mut LuaStr {
126    debug_assert!(bytes.len() > MAX_SHORT_LEN);
127    alloc_str(bytes, false, seed, false)
128}
129
130/// SAFETY: `p` must come from `alloc_str` and not be freed twice.
131pub(crate) unsafe fn free(p: *mut LuaStr) {
132    unsafe {
133        let l = layout((*p).len as usize);
134        ptr::drop_in_place(p);
135        dealloc(p as *mut u8, l);
136    }
137}
138
139/// Open hashing with per-string chains (PUC stringtable shape).
140pub(crate) struct StringTable {
141    buckets: Vec<*mut LuaStr>,
142    count: usize,
143}
144
145impl StringTable {
146    pub(crate) fn new() -> StringTable {
147        StringTable {
148            buckets: vec![ptr::null_mut(); 64],
149            count: 0,
150        }
151    }
152
153    /// Find or create an interned short string. Returns `(ptr, newly_created)`.
154    pub(crate) fn intern(&mut self, bytes: &[u8], seed: u32) -> (*mut LuaStr, bool) {
155        debug_assert!(bytes.len() <= MAX_SHORT_LEN);
156        let h = lua_hash(bytes, seed);
157        let b = h as usize & (self.buckets.len() - 1);
158        let mut cur = self.buckets[b];
159        // SAFETY: `self.as_ptr()` is the start of this `LuaStr`'s header which was allocated with the trailing bytes / hash fields in the same allocation by `StringTable::intern`.
160        unsafe {
161            while !cur.is_null() {
162                if (*cur).len as usize == bytes.len() && bytes_of(cur) == bytes {
163                    return (cur, false);
164                }
165                cur = (*cur).hnext;
166            }
167        }
168        if self.count >= self.buckets.len() {
169            self.grow();
170        }
171        let b = h as usize & (self.buckets.len() - 1);
172        let p = alloc_str(bytes, true, h, true);
173        // SAFETY: Gc<T> is NonNull<T> over the GC heap; the heap is single-threaded and the pointer is live as long as it is reachable from active roots (see heap.rs:5-7).
174        unsafe {
175            (*p).hnext = self.buckets[b];
176        }
177        self.buckets[b] = p;
178        self.count += 1;
179        (p, true)
180    }
181
182    fn grow(&mut self) {
183        let mut nb = vec![ptr::null_mut(); self.buckets.len() * 2];
184        let mask = nb.len() - 1;
185        for &head in &self.buckets {
186            let mut cur = head;
187            while !cur.is_null() {
188                // SAFETY: Gc<T> is NonNull<T> over the GC heap; the heap is single-threaded and the pointer is live as long as it is reachable from active roots (see heap.rs:5-7).
189                unsafe {
190                    let next = (*cur).hnext;
191                    let b = (*cur).hash.get() as usize & mask;
192                    (*cur).hnext = nb[b];
193                    nb[b] = cur;
194                    cur = next;
195                }
196            }
197        }
198        self.buckets = nb;
199    }
200
201    /// Unlink a dying interned string (called from sweep).
202    pub(crate) fn remove(&mut self, p: *mut LuaStr) {
203        // SAFETY: Gc<T> is NonNull<T> over the GC heap; the heap is single-threaded and the pointer is live as long as it is reachable from active roots (see heap.rs:5-7).
204        unsafe {
205            let b = (*p).hash.get() as usize & (self.buckets.len() - 1);
206            let mut cur: *mut *mut LuaStr = &mut self.buckets[b];
207            while !(*cur).is_null() {
208                if *cur == p {
209                    *cur = (*p).hnext;
210                    self.count -= 1;
211                    return;
212                }
213                cur = &mut (**cur).hnext;
214            }
215            unreachable!("interned string missing from string table");
216        }
217    }
218}
219
220/// Allocation footprint of a string of `len` bytes (heap accounting).
221pub(crate) fn alloc_size(len: usize) -> usize {
222    layout(len).size()
223}
224
225#[cfg(test)]
226mod tests {
227    use crate::runtime::heap::Heap;
228
229    #[test]
230    fn short_strings_are_interned() {
231        let mut heap = Heap::new();
232        let a = heap.intern(b"hello");
233        let b = heap.intern(b"hello");
234        let c = heap.intern(b"world");
235        assert!(a.ptr_eq(b));
236        assert!(!a.ptr_eq(c));
237        assert_eq!(heap.live_objects(), 2);
238        assert_eq!(a.as_bytes(), b"hello");
239    }
240
241    #[test]
242    fn long_strings_are_not_interned() {
243        let mut heap = Heap::new();
244        let bytes = [0xAAu8; 64]; // non-UTF-8 long content
245        let a = heap.intern(&bytes);
246        let b = heap.intern(&bytes);
247        assert!(!a.ptr_eq(b));
248        assert_eq!(a.as_bytes(), b.as_bytes());
249        // lazy hash agrees for equal content
250        assert_eq!(a.hash(), b.hash());
251    }
252
253    #[test]
254    fn arbitrary_bytes_roundtrip() {
255        let mut heap = Heap::new();
256        let bytes: Vec<u8> = (0..=255).collect();
257        let s = heap.intern(&bytes);
258        assert_eq!(s.as_bytes(), &bytes[..]);
259        assert_eq!(s.len(), 256);
260    }
261
262    #[test]
263    fn interning_survives_table_growth() {
264        let mut heap = Heap::new();
265        let first = heap.intern(b"key000");
266        // push way past the initial 64 buckets to force grow()
267        for i in 0..2000 {
268            heap.intern(format!("key{i:03}").as_bytes());
269        }
270        let again = heap.intern(b"key000");
271        assert!(first.ptr_eq(again));
272    }
273}