Skip to main content

lua_vm/
string.rs

1//! String table and interned-string operations — port of `lstring.c` + `lstring.h`.
2//!
3//! Provides two key abstractions:
4//!
5//! - [`LuaStringImpl`]: the Lua string value, stored as a reference-counted byte slice.
6//!   Short strings (`<= MAX_SHORT_LEN` bytes) are interned in the process-global
7//!   [`StringPool`]; long strings are heap-allocated on each creation and never
8//!   interned.
9//!
10//! - [`StringPool`]: the intern table for short strings, stored on `GlobalState`.
11//!   Replaces the C `stringtable` struct, which used an open-addressing hash table
12//!   with intrusive chaining through `TString.u.hnext`.  In Rust the intrusive
13//!   chain is dropped; a `HashMap` provides O(1) lookup and automatic rehashing.
14//!   See PORT NOTE on [`StringPool`] for the full rationale.
15//!
16//! The `lstring.h` header is merged into this module per PORTING.md §1.
17//!
18//! # C source files
19//! - `reference/lua-5.4.7/src/lstring.c`  (275 lines, 15 functions)
20//! - `reference/lua-5.4.7/src/lstring.h`  (57 lines; merged here)
21
22use std::cell::Cell;
23#[allow(unused_imports)] use crate::prelude::*;
24use std::collections::HashMap;
25use std::rc::Rc;
26
27// TODO(port): these import paths will resolve once Phase B wires the crate graph.
28// `LuaState` and `GlobalState` live in crate::state (src/state.rs, from lstate.c).
29// `LuaValue` and `LuaError` live in lua_types (crates/lua-types/src/).
30use crate::state::LuaState;
31
32// PORT NOTE: `GcRef<T>` is the lua-types newtype around `Rc<T>` per PORT_STRATEGY §3.4.
33// Re-imported here so all string-pool entries share identity with state.rs / api.rs.
34use lua_types::GcRef;
35/// Phase-B bridge: converts a lua-vm rich `LuaStringImpl` into a `lua_types::LuaString`.
36/// The two types track different metadata (short/long flag, extra byte) and a real
37/// merge belongs in Phase B once `lua-types::LuaString` grows the needed fields.
38fn impl_to_lt(s: &GcRef<LuaStringImpl>) -> GcRef<lua_types::LuaString> {
39    // TODO(D-1c-bridge): allocation outside state context (free fn)
40    GcRef::new(lua_types::LuaString::from_bytes(s.as_bytes().to_vec()))
41}
42
43// ── Constants (lstring.h macros → macros.tsv) ─────────────────────────────────
44
45// macros.tsv: MEMERRMSG → const MEMERR_MSG: &[u8] = b"not enough memory"
46/// Pre-allocated OOM error message.  Must be created before the allocator
47/// can fail so that the GC can always hand back a valid error string.
48pub(crate) const MEMERR_MSG: &[u8] = b"not enough memory";
49
50// macros.tsv: MINSTRTABSIZE → const MIN_STR_TAB_SIZE: usize = 128
51const MIN_STR_TAB_SIZE: usize = 128;
52
53// macros.tsv: STRCACHE_N → const STRCACHE_N: usize = 53
54const STRCACHE_N: usize = 53;
55
56// macros.tsv: STRCACHE_M → const STRCACHE_M: usize = 2
57const STRCACHE_M: usize = 2;
58
59// macros.tsv: LUAI_MAXSHORTLEN → const MAX_SHORT_LEN: usize = 40
60pub(crate) const MAX_SHORT_LEN: usize = 40;
61
62// macros.tsv: MAX_SIZE → const MAX_SIZE: usize = if size_of::<usize>() < size_of::<i64>() { usize::MAX } else { i64::MAX as usize }
63const MAX_SIZE: usize = if std::mem::size_of::<usize>() < std::mem::size_of::<i64>() {
64    usize::MAX
65} else {
66    i64::MAX as usize
67};
68
69// macros.tsv: luaM_limitN → std::cmp::min(n, usize::MAX / std::mem::size_of::<T>())
70//             cast_int → x as i32
71// Rust: upper bound on the number of hash buckets; derived from MAX_INT / pointer size.
72const MAX_STR_TAB: usize = i32::MAX as usize / std::mem::size_of::<usize>();
73
74// macros.tsv: sizelstring → drop — Rust allocates via Box<[u8]> / Rc<[u8]>
75// PORT NOTE: dropped entirely; Rust uses Rc<[u8]> which carries its own length.
76
77// macros.tsv: luaS_newliteral → state.intern_str(b"...")
78// PORT NOTE: translated at call sites as `new_lstr(state, b"literal")`.
79
80// macros.tsv: isreserved → ts.is_reserved_word()
81// PORT NOTE: translated at call sites as the `LuaStringImpl::is_reserved_word()` method.
82
83// macros.tsv: eqshrstr → Rc::ptr_eq(a, b)
84// PORT NOTE: short strings are interned so pointer equality suffices.
85// Translated at call sites as `Rc::ptr_eq(a, b)`.
86
87// ── LuaStringImpl (was TString in lobject.h) ─────────────────────────────────────
88
89// PORT NOTE: `LuaStringImpl` corresponds to `TString` from `lobject.h`, which maps to
90// `src/object.rs` per file_deps.txt.  It is defined here (in `string.rs`) because
91// `lstring.c` owns the string-table internals and most of the type's behaviour.
92// Phase B should reconcile: either keep it here and re-export from `object.rs`,
93// or move it there and import it from `string.rs`.
94
95/// Whether a Lua string is short (interned) or long (not interned).
96///
97/// Corresponds to `LUA_VSHRSTR` / `LUA_VLNGSTR` tags from `lobject.h`.
98///
99/// # C mapping (types.tsv)
100/// ```text
101/// LUA_VSHRSTR → LuaStringImpl::Short  (shrlen holds length 0..=40)
102/// LUA_VLNGSTR → LuaStringImpl::Long   (shrlen = 0xFF sentinel; u.lnglen holds length)
103/// ```
104#[derive(Debug, Clone, Copy, PartialEq, Eq)]
105pub enum StringKind {
106    Short,
107    Long,
108}
109
110/// A Lua string: an immutable, reference-counted byte sequence.
111///
112/// Short strings (`<= MAX_SHORT_LEN = 40` bytes) are interned in the
113/// [`StringPool`] on `GlobalState`; two short strings with the same bytes
114/// are guaranteed to be the same `GcRef` (pointer equality via `Rc::ptr_eq`).
115///
116/// Long strings are heap-allocated independently and never interned.  Their
117/// hash is computed lazily on first call to [`hash_long_str`] and cached via
118/// interior mutability (`Cell<u32>`).
119///
120/// # C mapping (types.tsv)
121/// ```text
122/// TString             → LuaStringImpl
123/// TString.extra       → extra: Cell<u8>   (reserved-word idx for Short; hash-ready flag for Long)
124/// TString.shrlen      → kind: StringKind   (0xFF sentinel replaced by enum variant)
125/// TString.hash        → hash: Cell<u32>
126/// TString.u.lnglen    → bytes.len()        (length implicit in Rc<[u8]>)
127/// TString.u.hnext     → (removed)          (intrusive chain gone; StringPool uses HashMap)
128/// TString.contents    → bytes: Rc<[u8]>
129/// ```
130pub struct LuaStringImpl {
131    bytes: Rc<[u8]>,
132
133    // Replaced by the StringKind enum; length is implicit in bytes.len().
134    kind: StringKind,
135
136    // Using Cell<u32> so that `hash_long_str` can cache the hash through a
137    // shared `&LuaStringImpl` reference (interior mutability, single-threaded).
138    #[allow(dead_code)]
139    hash: Cell<u32>,
140
141    // Short strings: reserved-word token index (0 = not a keyword).
142    // Long strings:  0 = hash not yet computed; 1 = hash is valid.
143    extra: Cell<u8>,
144}
145
146impl LuaStringImpl {
147    /// Returns the string's bytes.
148    ///
149    /// macros.tsv: `getstr` / `getlngstr` / `getshrstr` → `ts.as_bytes()`
150    pub fn as_bytes(&self) -> &[u8] {
151        &self.bytes
152    }
153
154    /// Returns the byte length of the string.
155    ///
156    /// for Long.  In Rust both cases are `bytes.len()`.
157    /// macros.tsv: `tsslen` → `ts.len()`
158    pub fn len(&self) -> usize {
159        self.bytes.len()
160    }
161
162    /// Returns `true` if this is a long (non-interned) string.
163    pub fn is_long(&self) -> bool {
164        self.kind == StringKind::Long
165    }
166
167    /// Returns `true` if this is a short (interned) string.
168    pub fn is_short(&self) -> bool {
169        self.kind == StringKind::Short
170    }
171
172    /// Returns `true` if this short string is a Lua reserved word.
173    ///
174    /// macros.tsv: `isreserved` → `ts.is_reserved_word()`
175    pub fn is_reserved_word(&self) -> bool {
176        self.kind == StringKind::Short && self.extra.get() > 0
177    }
178
179    /// GC color predicate.  Returns `true` if this object is "white" (unreachable)
180    /// in the GC's current wave.
181    ///
182    /// macros.tsv: `iswhite` → `obj.is_white()`
183    ///
184    /// PORT NOTE: GC color management is deferred to Phase D.  In Phases A–C all
185    /// objects are reachable via `Rc` reference counts and this always returns
186    /// `false` (nothing is white / unreachable).
187    pub fn is_white(&self) -> bool {
188        // TODO(port): Phase D — check the GC marked byte; stub returns false (all live)
189        false
190    }
191
192    /// Flip GC color from white to the current non-white (resurrect a dead object).
193    ///
194    /// macros.tsv: `changewhite` → `obj.flip_white()`
195    ///
196    /// PORT NOTE: GC color management deferred to Phase D; no-op in Phases A–C.
197    pub fn flip_white(&self) {
198        // TODO(port): Phase D — update the GC marked byte
199    }
200}
201
202impl PartialEq for LuaStringImpl {
203    /// Equality for Lua strings.
204    ///
205    /// For short strings (interned), pointer equality via `Rc::ptr_eq` is sufficient
206    /// and matches `eqshrstr` in C.  For long strings, we fall back to byte
207    /// comparison, matching `luaS_eqlngstr` in C.
208    fn eq(&self, other: &Self) -> bool {
209        if self.kind == StringKind::Short && other.kind == StringKind::Short {
210            Rc::ptr_eq(&self.bytes, &other.bytes)
211        } else {
212            self.bytes == other.bytes
213        }
214    }
215}
216
217impl Eq for LuaStringImpl {}
218
219// ── StringPool (was stringtable in lstate.h) ──────────────────────────────────
220
221// PORT NOTE: `StringPool` corresponds to `stringtable` from `lstate.h`, which maps
222// to `src/state.rs` per file_deps.txt.  It is defined here because `lstring.c`
223// owns all of the pool's mutation logic.  Phase B should reconcile placement.
224//
225// The C `stringtable` used an open-addressing hash table where each bucket was
226// the head of an intrusive singly-linked list threaded through `TString.u.hnext`.
227// In Rust, `TString.u.hnext` is removed per types.tsv.  The `HashMap` replaces
228// both the bucket array and the chain: it provides O(1) average-case lookup,
229// automatic rehashing, and eliminates the need for `tablerehash`.
230//
231// `nuse` and `size` are retained for parity with the C invariants that other
232// code may check (e.g. `growstrtab` tests `nuse >= size`).
233
234/// Intern table for short Lua strings.  Lives on `GlobalState`.
235///
236/// # C mapping (types.tsv)
237/// ```text
238/// stringtable        → StringPool
239/// stringtable.hash   → map: HashMap<Box<[u8]>, GcRef<LuaStringImpl>>
240/// stringtable.nuse   → nuse: usize
241/// stringtable.size   → size: usize
242/// ```
243pub struct StringPool {
244    // PORT NOTE: keyed by owned byte slice; lookup by `&[u8]` via Borrow<[u8]>.
245    map: HashMap<Box<[u8]>, GcRef<LuaStringImpl>>,
246
247    // PERF(port): redundant with map.len() in Rust — keep for C-parity; remove in Phase B
248    nuse: usize,
249
250    // In Rust, HashMap manages its own capacity; this tracks the last requested size.
251    size: usize,
252}
253
254impl StringPool {
255    /// Create an empty pool with `MIN_STR_TAB_SIZE` preallocated capacity.
256    ///
257    ///    `tablerehash(tb->hash, 0, MINSTRTABSIZE)` sequence in `luaS_init`.
258    pub fn new() -> Self {
259        StringPool {
260            map: HashMap::with_capacity(MIN_STR_TAB_SIZE),
261            nuse: 0,
262            size: MIN_STR_TAB_SIZE,
263        }
264    }
265}
266
267impl Default for StringPool {
268    fn default() -> Self {
269        Self::new()
270    }
271}
272
273// ── LuaUserData (was Udata in lobject.h) ──────────────────────────────────────
274
275// PORT NOTE: `LuaUserData` corresponds to `Udata` from `lobject.h`, which maps to
276// `src/object.rs` per file_deps.txt.  Defined here because `luaS_newudata` lives
277// in `lstring.c`.  Phase B should reconcile placement.
278
279/// Full userdata: a GC-tracked object carrying a raw byte payload plus optional
280/// Lua user values and an optional metatable.
281///
282/// # C mapping (types.tsv)
283/// ```text
284/// Udata           → LuaUserData
285/// Udata.len       → len: usize
286/// Udata.nuvalue   → nuvalue: u16  (covered by uv.len() but kept for parity)
287/// Udata.metatable → metatable: Option<GcRef<LuaTable>>
288/// Udata.uv        → uv: Vec<LuaValue>
289/// (no direct C field) data: Box<[u8]>  — the raw byte payload; C used a flexible
290///                          array member laid out past the Udata header via
291///                          `udatamemoffset` alignment math.
292/// ```
293pub struct LuaUserDataImpl {
294    pub len: usize,
295    pub nuvalue: u16,
296    // TODO(port): GcRef<LuaTable> — LuaTable not yet defined; Phase B
297    pub metatable: Option<()>,
298    // macros.tsv: setnilvalue → *o = LuaValue::Nil
299    // TODO(port): Vec<LuaValue> — LuaValue not yet defined; Phase B
300    pub uv: Vec<()>,
301    // Port of the raw byte payload that C accessed via udatamemoffset arithmetic.
302    pub data: Box<[u8]>,
303}
304
305// ── Public functions ───────────────────────────────────────────────────────────
306
307// lstring.h: LUAI_FUNC → pub(crate)
308/// Hash a byte string with a seed using Lua's FNV-style hash.
309///
310/// This is a pure function with no allocations.  The algorithm XORs shifts and
311/// additions over each byte in reverse order, seeded by `seed ^ len`.
312///
313/// # C source
314/// ```c
315///
316/// //   unsigned int h = seed ^ cast_uint(l);
317/// //   for (; l > 0; l--)
318/// //     h ^= ((h<<5) + (h>>2) + cast_byte(str[l - 1]));
319/// //   return h;
320/// // }
321/// ```
322///
323/// PORT NOTE: C parenthesises `(h<<5)` and `(h>>2)` explicitly, so the outer
324/// additions are unambiguous despite C's `<<`/`>>` having lower precedence than
325/// `+`.  In Rust `<<` and `>>` have higher precedence than `+`, so the same
326/// expression is computed without extra parentheses; `wrapping_add` is used to
327/// match C's unsigned wrap-around arithmetic.
328pub(crate) fn hash_bytes(bytes: &[u8], seed: u32) -> u32 {
329    // macros.tsv: cast_uint → x as u32
330    let mut h: u32 = seed ^ (bytes.len() as u32);
331
332    let mut l = bytes.len();
333    while l > 0 {
334        l -= 1;
335        // macros.tsv: cast_byte → x as u8 (then as u32 for the arithmetic)
336        h ^= (h << 5)
337            .wrapping_add(h >> 2)
338            .wrapping_add(bytes[l] as u32);
339    }
340
341    h
342}
343
344//
345// PORT NOTE: `tablerehash` walked the intrusive `hnext` chain in each bucket and
346// redistributed `TString *` pointers into new bucket slots.  In Rust the
347// `HashMap` in `StringPool` handles its own rehashing automatically whenever its
348// load factor is exceeded or `reserve` / `shrink_to` is called.  The entire
349// function is therefore dropped; its effects are subsumed by the HashMap.
350
351// lstring.h: LUAI_FUNC → pub(crate)
352/// Resize the string intern table to approximately `nsize` buckets.
353///
354/// When growing, `HashMap::reserve` hints the desired capacity.  When shrinking,
355/// `HashMap::shrink_to` is used as an approximation of the C logic, which
356/// would rehash entries out of the shrinking tail.  The C function's graceful
357/// degradation on allocation failure (keep the current size) is preserved:
358/// `HashMap` will simply retain its existing capacity if memory is tight.
359///
360/// # C source
361/// ```c
362///
363/// //   stringtable *tb = &G(L)->strt;
364/// //   int osize = tb->size;
365/// //   TString **newvect;
366/// //   if (nsize < osize)
367/// //     tablerehash(tb->hash, osize, nsize);  /* depopulate shrinking part */
368/// //   newvect = luaM_reallocvector(L, tb->hash, osize, nsize, TString*);
369/// //   if (l_unlikely(newvect == NULL)) {
370/// //     if (nsize < osize)
371/// //       tablerehash(tb->hash, nsize, osize);  /* restore to original size */
372/// //   } else {
373/// //     tb->hash = newvect;
374/// //     tb->size = nsize;
375/// //     if (nsize > osize)
376/// //       tablerehash(newvect, osize, nsize);
377/// //   }
378/// // }
379/// ```
380///
381/// PORT NOTE: The three calls to `tablerehash` are dropped because `HashMap`
382/// automatically rehashes.  The allocation-failure fallback (restore to `osize`)
383/// has no direct analogue; `HashMap` will retain existing capacity on OOM, which
384/// matches the intent.
385// PERF(port): luaS_resize shrink — HashMap::shrink_to() is a hint, not a
386// guarantee; the C code freed exact memory.  Profile in Phase B.
387pub(crate) fn resize(state: &mut LuaState, nsize: usize) {
388    let strt = &mut state.global_mut().strt;
389    let osize = strt.size;
390
391    if nsize > osize {
392        let additional = nsize.saturating_sub(strt.map.len());
393        strt.map.reserve(additional);
394    } else if nsize < osize {
395        // PERF(port): shrink_to is a hint; exact shrink not guaranteed in Rust
396        strt.map.shrink_to(nsize);
397    }
398
399    strt.size = nsize;
400}
401
402// lstring.h: LUAI_FUNC → pub(crate)
403/// Initialise the string intern table and the API string cache.
404///
405/// Must be called exactly once during VM startup, before any strings are created.
406/// Pre-creates the memory-error message and fixes it in the GC (so it is never
407/// collected), then fills every cache slot with that same string.
408///
409/// # C source
410/// ```c
411///
412/// //   global_State *g = G(L);
413/// //   int i, j;
414/// //   stringtable *tb = &G(L)->strt;
415/// //   tb->hash = luaM_newvector(L, MINSTRTABSIZE, TString*);
416/// //   tablerehash(tb->hash, 0, MINSTRTABSIZE);
417/// //   tb->size = MINSTRTABSIZE;
418/// //   g->memerrmsg = luaS_newliteral(L, MEMERRMSG);
419/// //   luaC_fix(L, obj2gco(g->memerrmsg));
420/// //   for (i = 0; i < STRCACHE_N; i++)
421/// //     for (j = 0; j < STRCACHE_M; j++)
422/// //       g->strcache[i][j] = g->memerrmsg;
423/// // }
424/// ```
425pub(crate) fn init(state: &mut LuaState) -> Result<(), LuaError> {
426    //    tablerehash(tb->hash, 0, MINSTRTABSIZE);
427    //    tb->size = MINSTRTABSIZE;
428    // macros.tsv: luaM_newvector → vec![T::default(); n]
429    // PORT NOTE: StringPool::new() sets the initial capacity to MIN_STR_TAB_SIZE,
430    // replacing both the allocation and the tablerehash clear pass.
431    state.global_mut().strt = StringPool::new();
432
433    // macros.tsv: luaS_newliteral → state.intern_str(b"...")
434    let memerrmsg = new_lstr(state, MEMERR_MSG)?;
435
436    // macros.tsv: luaC_fix — not listed; it marks the object as fixed (non-collectable)
437    // TODO(port): call state.gc().fix(memerrmsg.clone()) when GC is wired in Phase D;
438    // in Phases A–C the Rc keeps it alive as long as GlobalState holds the clone
439    let memerrmsg_lt = impl_to_lt(&memerrmsg);
440    state.global_mut().memerrmsg = memerrmsg_lt.clone();
441
442    //      for (j = 0; j < STRCACHE_M; j++)
443    //        g->strcache[i][j] = g->memerrmsg;
444    for i in 0..STRCACHE_N {
445        for j in 0..STRCACHE_M {
446            state.global_mut().strcache[i][j] = memerrmsg_lt.clone();
447        }
448    }
449
450    Ok(())
451}
452
453// lstring.h: LUAI_FUNC → pub(crate)
454/// Create or retrieve a Lua string from `bytes`.
455///
456/// If `bytes.len() <= MAX_SHORT_LEN` (40), the string is interned: an existing
457/// identical short string is returned if found, otherwise a new one is created
458/// and inserted into the intern table.
459///
460/// If `bytes.len() > MAX_SHORT_LEN`, a new long string is allocated each time
461/// (long strings are never interned).
462///
463/// # C source
464/// ```c
465///
466/// //   if (l <= LUAI_MAXSHORTLEN)  /* short string? */
467/// //     return internshrstr(L, str, l);
468/// //   else {
469/// //     TString *ts;
470/// //     if (l_unlikely(l * sizeof(char) >= (MAX_SIZE - sizeof(TString))))
471/// //       luaM_toobig(L);
472/// //     ts = luaS_createlngstrobj(L, l);
473/// //     memcpy(getlngstr(ts), str, l * sizeof(char));
474/// //     return ts;
475/// //   }
476/// // }
477/// ```
478pub(crate) fn new_lstr(state: &mut LuaState, bytes: &[u8]) -> Result<GcRef<LuaStringImpl>, LuaError> {
479    if bytes.len() <= MAX_SHORT_LEN {
480        intern_short_str(state, bytes)
481    } else {
482        //        luaM_toobig(L);
483        // macros.tsv: luaM_toobig → return Err(LuaError::Memory)
484        // PORT NOTE: sizeof(TString) is a C-specific overhead; in Rust we just
485        // check that the byte count fits within MAX_SIZE.
486        if bytes.len() >= MAX_SIZE {
487            return Err(LuaError::Memory);
488        }
489
490        //    memcpy(getlngstr(ts), str, l * sizeof(char));
491        // PORT NOTE: Rather than creating a zeroed buffer and then copying,
492        // we construct the LuaStringImpl directly from `bytes`.
493        let seed = state.global().seed;
494        let h = hash_bytes(bytes, seed);
495        let ts = create_str_obj(state, bytes, StringKind::Long, h);
496        Ok(ts)
497    }
498}
499
500// lstring.h: LUAI_FUNC → pub(crate)
501/// Create or retrieve a Lua string, using a small two-slot LRU cache per hash
502/// bucket to accelerate repeated calls with the same byte sequence.
503///
504/// In C, the cache bucket is selected by casting the C string pointer to a `u32`
505/// (`point2uint`).  In Rust, `point2uint` is restricted to `lua-gc`/`lua-coro`
506/// (raw-pointer cast requiring `unsafe`).  We substitute a content-hash based
507/// bucket index instead.  Functional semantics are identical; cache hit rates for
508/// repeated calls with the same `bytes` may differ.
509///
510/// # C source
511/// ```c
512///
513/// //   unsigned int i = point2uint(str) % STRCACHE_N;  /* hash */
514/// //   int j;
515/// //   TString **p = G(L)->strcache[i];
516/// //   for (j = 0; j < STRCACHE_M; j++) {
517/// //     if (strcmp(str, getstr(p[j])) == 0)  /* hit? */
518/// //       return p[j];  /* that is it */
519/// //   }
520/// //   /* normal route */
521/// //   for (j = STRCACHE_M - 1; j > 0; j--)
522/// //     p[j] = p[j - 1];  /* move out last element */
523/// //   p[0] = luaS_newlstr(L, str, strlen(str));
524/// //   return p[0];
525/// // }
526/// ```
527///
528/// PORT NOTE: `point2uint(str) % STRCACHE_N` used the raw pointer address as a
529/// fast key, exploiting the fact that C string literals have stable addresses.
530/// In Rust we use `hash_bytes(bytes, seed) % STRCACHE_N` instead.  The replacement
531/// is fully safe and has identical semantics (but different cache behaviour for
532/// calls from different `&[u8]` slices with identical content).
533pub(crate) fn new(state: &mut LuaState, bytes: &[u8]) -> Result<GcRef<LuaStringImpl>, LuaError> {
534    // PORT NOTE: pointer hash replaced by content hash (see doc above)
535    let seed = state.global().seed;
536    let i = (hash_bytes(bytes, seed) as usize) % STRCACHE_N;
537
538    // macros.tsv: getstr → ts.as_bytes()
539    for j in 0..STRCACHE_M {
540        if state.global().strcache[i][j].as_bytes() == bytes {
541            // TODO(phase-b): strcache currently holds lua_types::LuaString; rebuild
542            // a rich LuaStringImpl from the bytes. Phase B should unify the types.
543            let cached_bytes = state.global().strcache[i][j].as_bytes().to_vec();
544            // TODO(D-1c-bridge): LuaStringImpl is the rich local type; state helper produces lua_types::LuaString
545            return Ok(GcRef::new(LuaStringImpl {
546                bytes: cached_bytes.into(),
547                kind: if bytes.len() <= MAX_SHORT_LEN { StringKind::Short } else { StringKind::Long },
548                hash: Cell::new(hash_bytes(bytes, seed)),
549                extra: Cell::new(0),
550            }));
551        }
552    }
553
554    // Create the string before mutating the cache
555    let new_str = new_lstr(state, bytes)?;
556
557    // Shift entries toward the back to make room at slot 0
558    for j in (1..STRCACHE_M).rev() {
559        // Clone first to avoid borrow conflict between getter and setter
560        let prev = state.global().strcache[i][j - 1].clone();
561        state.global_mut().strcache[i][j] = prev;
562    }
563
564    state.global_mut().strcache[i][0] = impl_to_lt(&new_str);
565
566    Ok(new_str)
567}
568
569// ── Private helpers ───────────────────────────────────────────────────────────
570
571/// Allocate and initialise a new `LuaStringImpl` with the given bytes, kind, and hash.
572///
573/// In C, `createstrobj` allocated uninitialised memory via `luaC_newobj` and set
574/// the header fields; the caller then filled the content via `memcpy`.  In Rust
575/// we construct the string directly from the provided `bytes`, eliminating the
576/// two-step pattern.
577///
578/// # C source
579/// ```c
580///
581/// //   TString *ts;
582/// //   GCObject *o;
583/// //   size_t totalsize = sizelstring(l);
584/// //   o = luaC_newobj(L, tag, totalsize);
585/// //   ts = gco2ts(o);
586/// //   ts->hash = h;
587/// //   ts->extra = 0;
588/// //   getstr(ts)[l] = '\0';  /* ending 0 */
589/// //   return ts;
590/// // }
591/// ```
592///
593/// PORT NOTE: `sizelstring(l)` computed the total allocation size including the
594/// nul terminator.  In Rust, `Rc<[u8]>` stores the bytes without a nul; the
595/// nul terminator is dropped.  Callers that need a nul-terminated `*const u8`
596/// for FFI must use a temporary `CString` or equivalent.
597fn create_str_obj(
598    state: &mut LuaState,
599    bytes: &[u8],
600    kind: StringKind,
601    hash: u32,
602) -> GcRef<LuaStringImpl> {
603    // macros.tsv: luaM_newobject → state.gc().new_obj(tag, sz)
604    // TODO(port): register with GC tracking list (state.global_mut().allgc)
605    // in Phase D; Phase A–C creates a bare Rc
606    let _ = state; // state needed for GC registration in Phase D
607    // TODO(D-1c-bridge): LuaStringImpl is the rich local type; state helper produces lua_types::LuaString
608    GcRef::new(LuaStringImpl {
609        hash: Cell::new(hash),
610        extra: Cell::new(0),
611        // PORT NOTE: we receive bytes directly; no separate memcpy step needed
612        bytes: Rc::from(bytes),
613        kind,
614    })
615}
616
617/// Grow the string intern table, first attempting a GC collection if the table is
618/// at its absolute maximum size.
619///
620/// # C source
621/// ```c
622///
623/// //   if (l_unlikely(tb->nuse == MAX_INT)) {  /* too many strings? */
624/// //     luaC_fullgc(L, 1);  /* try to free some... */
625/// //     if (tb->nuse == MAX_INT)  /* still too many? */
626/// //       luaM_error(L);  /* cannot even create a message... */
627/// //   }
628/// //   if (tb->size <= MAXSTRTB / 2)  /* can grow string table? */
629/// //     luaS_resize(L, tb->size * 2);
630/// // }
631/// ```
632fn grow_str_tab(state: &mut LuaState) -> Result<(), LuaError> {
633    // macros.tsv: MAX_INT → i32::MAX
634    let nuse = state.global().strt.nuse;
635    if nuse == i32::MAX as usize {
636        // macros.tsv: luaC_fullgc → state.gc().full_collect()
637        // TODO(port): state.gc().full_collect() — GC not yet wired in Phase A–C; no-op
638        // (When GC is live this call may reduce nuse by sweeping dead short strings.)
639
640        // macros.tsv: luaM_error → return Err(LuaError::Memory)
641        if state.global().strt.nuse == i32::MAX as usize {
642            return Err(LuaError::Memory);
643        }
644    }
645
646    let size = state.global().strt.size;
647    if size <= MAX_STR_TAB / 2 {
648        resize(state, size * 2);
649    }
650
651    Ok(())
652}
653
654/// Look up `bytes` in the intern table; create and insert a new short string if
655/// not found.
656///
657/// The `isdead` / `changewhite` resurrection path is elided in Phases A–C because
658/// `Rc`-based reference counting keeps objects alive until all references drop
659/// (there are no dead-but-not-collected strings in Phase A–C).
660///
661/// # C source
662/// ```c
663///
664/// //   TString *ts;
665/// //   global_State *g = G(L);
666/// //   stringtable *tb = &g->strt;
667/// //   unsigned int h = luaS_hash(str, l, g->seed);
668/// //   TString **list = &tb->hash[lmod(h, tb->size)];
669/// //   lua_assert(str != NULL);
670/// //   for (ts = *list; ts != NULL; ts = ts->u.hnext) {
671/// //     if (l == ts->shrlen && (memcmp(str, getshrstr(ts), l) == 0)) {
672/// //       if (isdead(g, ts)) changewhite(ts);  /* resurrect it */
673/// //       return ts;
674/// //     }
675/// //   }
676/// //   if (tb->nuse >= tb->size) {
677/// //     growstrtab(L, tb);
678/// //     list = &tb->hash[lmod(h, tb->size)];
679/// //   }
680/// //   ts = createstrobj(L, l, LUA_VSHRSTR, h);
681/// //   ts->shrlen = cast_byte(l);
682/// //   memcpy(getshrstr(ts), str, l);
683/// //   ts->u.hnext = *list;
684/// //   *list = ts;
685/// //   tb->nuse++;
686/// //   return ts;
687/// // }
688/// ```
689///
690/// PORT NOTE: `lmod(h, tb->size)` (power-of-two bucket modulo via
691/// `macros.tsv: lmod → (s & (size - 1)) as usize`) and the `hnext` chain walk
692/// are both gone.  `HashMap::get` replaces the linear bucket scan.
693fn intern_short_str(
694    state: &mut LuaState,
695    bytes: &[u8],
696) -> Result<GcRef<LuaStringImpl>, LuaError> {
697    // In Rust, &[u8] slices are never null; the assertion is trivially satisfied.
698
699    let seed = state.global().seed;
700    let h = hash_bytes(bytes, seed);
701
702    // PORT NOTE: intrusive hnext chain replaced by HashMap lookup
703    // Clone the existing GcRef<LuaStringImpl> so the immutable borrow on `state` ends
704    // before any mutable access below.
705    let existing = state.global().strt.map.get(bytes).cloned();
706    if let Some(ts) = existing {
707        // macros.tsv: isdead → g.is_dead(obj);  changewhite → obj.flip_white()
708        // PORT NOTE: GC color management deferred to Phase D; in Phases A–C all
709        // Rc-held objects are live by definition (Rc keeps them alive).
710        return Ok(ts);
711    }
712
713    let needs_grow = {
714        let strt = &state.global().strt;
715        strt.nuse >= strt.size
716    };
717    if needs_grow {
718        grow_str_tab(state)?;
719    }
720
721    //    ts->shrlen = cast_byte(l);  — encoded in StringKind::Short
722    //    memcpy(getshrstr(ts), str, l);  — bytes passed directly to create_str_obj
723    let ts = create_str_obj(state, bytes, StringKind::Short, h);
724
725    state
726        .global_mut()
727        .strt
728        .map
729        .insert(bytes.to_vec().into_boxed_slice(), ts.clone());
730    state.global_mut().strt.nuse += 1;
731
732    Ok(ts)
733}
734
735// ── Re-export marker for type defined here ────────────────────────────────────
736
737// TODO(port): LuaError is used in function signatures above but is not yet defined
738// in lua-types.  Phase B must add LuaError to lua-types/src/error.rs per
739// PORTING.md §6 before this file can compile.  The expected variants are:
740//   LuaError::Runtime(LuaValue)
741//   LuaError::Memory
742//   LuaError::Syntax(LuaValue)
743//   ... (full list in PORTING.md §6)
744// For now, reference LuaError as an opaque import from the future lua-types crate.
745use lua_types::LuaError;
746
747// ──────────────────────────────────────────────────────────────────────────────
748// PORT STATUS
749//   source:        src/lstring.c  (275 lines, 15 functions)
750//                  src/lstring.h  (57 lines; merged)
751//   target_crate:  lua-vm
752//   confidence:    medium
753//   todos:         14
754//   port_notes:    30
755//   unsafe_blocks: 0   (must be 0 outside explicit unsafe-budget crates)
756//   notes:         Logic is faithful to the C.  The two largest structural changes
757//                  are: (1) `tablerehash` + intrusive `hnext` chain replaced by
758//                  `HashMap` in `StringPool`; (2) `luaS_new`'s `point2uint`
759//                  pointer-hash replaced by a content hash (safe, same semantics).
760//                  Key TODOs: GC registration in create_str_obj (Phase D),
761//                  GC registration in new_userdata (Phase D), luaC_fix in init
762//                  (Phase D), full_collect stub in grow_str_tab (Phase D),
763//                  udatamemoffset size check in new_userdata (Phase B),
764//                  LuaValue in LuaUserData.uv (Phase B), LuaError import path
765//                  (Phase B), GcRef typedef (Phase B).  Phase B priority: wire
766//                  import paths for LuaState, GlobalState, LuaError, LuaValue,
767//                  and move LuaStringImpl/StringPool/LuaUserData to their canonical
768//                  modules (object.rs / state.rs).
769// ──────────────────────────────────────────────────────────────────────────────