Skip to main content

lua_vm/
string.rs

1//! String table and interned-string operations — port of `lstring.c` + `lstring.h`.
2//!
3//! Provides two key abstractions:
4//!
5//! - [`LuaStringImpl`]: the Lua string value, stored as a reference-counted byte slice.
6//!   Short strings (`<= MAX_SHORT_LEN` bytes) are interned in the process-global
7//!   [`StringPool`]; long strings are heap-allocated on each creation and never
8//!   interned.
9//!
10//! - [`StringPool`]: the intern table for short strings, stored on `GlobalState`.
11//!   Replaces the C `stringtable` struct, which used an open-addressing hash table
12//!   with intrusive chaining through `TString.u.hnext`.  In Rust the intrusive
13//!   chain is dropped; a `HashMap` provides O(1) lookup and automatic rehashing.
14//!   See PORT NOTE on [`StringPool`] for the full rationale.
15//!
16//! The `lstring.h` header is merged into this module per PORTING.md §1.
17//!
18//! # C source files
19//! - `reference/lua-5.4.7/src/lstring.c`  (275 lines, 15 functions)
20//! - `reference/lua-5.4.7/src/lstring.h`  (57 lines; merged here)
21
22use std::cell::Cell;
23#[allow(unused_imports)] use crate::prelude::*;
24use std::collections::HashMap;
25use std::rc::Rc;
26
27// TODO(port): these import paths will resolve once Phase B wires the crate graph.
28// `LuaState` and `GlobalState` live in crate::state (src/state.rs, from lstate.c).
29// `LuaValue` and `LuaError` live in lua_types (crates/lua-types/src/).
30use crate::state::{GlobalState, LuaState};
31
32// PORT NOTE: `GcRef<T>` is the lua-types newtype around `Rc<T>` per PORT_STRATEGY §3.4.
33// Re-imported here so all string-pool entries share identity with state.rs / api.rs.
34use lua_types::GcRef;
35/// Local alias retained while string.rs's own `LuaStringImpl` is still in use; will
36/// merge with `lua_types::LuaString` in Phase B's string-pool consolidation.
37type LocalGcRef<T> = Rc<T>;
38
39/// Phase-B bridge: converts a lua-vm rich `LuaStringImpl` into a `lua_types::LuaString`.
40/// The two types track different metadata (short/long flag, extra byte) and a real
41/// merge belongs in Phase B once `lua-types::LuaString` grows the needed fields.
42fn impl_to_lt(s: &GcRef<LuaStringImpl>) -> GcRef<lua_types::LuaString> {
43    // TODO(D-1c-bridge): allocation outside state context (free fn)
44    GcRef::new(lua_types::LuaString::from_bytes(s.as_bytes().to_vec()))
45}
46
47// ── Constants (lstring.h macros → macros.tsv) ─────────────────────────────────
48
49// macros.tsv: MEMERRMSG → const MEMERR_MSG: &[u8] = b"not enough memory"
50/// Pre-allocated OOM error message.  Must be created before the allocator
51/// can fail so that the GC can always hand back a valid error string.
52pub(crate) const MEMERR_MSG: &[u8] = b"not enough memory";
53
54// macros.tsv: MINSTRTABSIZE → const MIN_STR_TAB_SIZE: usize = 128
55const MIN_STR_TAB_SIZE: usize = 128;
56
57// macros.tsv: STRCACHE_N → const STRCACHE_N: usize = 53
58const STRCACHE_N: usize = 53;
59
60// macros.tsv: STRCACHE_M → const STRCACHE_M: usize = 2
61const STRCACHE_M: usize = 2;
62
63// macros.tsv: LUAI_MAXSHORTLEN → const MAX_SHORT_LEN: usize = 40
64pub(crate) const MAX_SHORT_LEN: usize = 40;
65
66// macros.tsv: MAX_SIZE → const MAX_SIZE: usize = if size_of::<usize>() < size_of::<i64>() { usize::MAX } else { i64::MAX as usize }
67const MAX_SIZE: usize = if std::mem::size_of::<usize>() < std::mem::size_of::<i64>() {
68    usize::MAX
69} else {
70    i64::MAX as usize
71};
72
73// macros.tsv: luaM_limitN → std::cmp::min(n, usize::MAX / std::mem::size_of::<T>())
74//             cast_int → x as i32
75// Rust: upper bound on the number of hash buckets; derived from MAX_INT / pointer size.
76const MAX_STR_TAB: usize = i32::MAX as usize / std::mem::size_of::<usize>();
77
78// macros.tsv: sizelstring → drop — Rust allocates via Box<[u8]> / Rc<[u8]>
79// PORT NOTE: dropped entirely; Rust uses Rc<[u8]> which carries its own length.
80
81// macros.tsv: luaS_newliteral → state.intern_str(b"...")
82// PORT NOTE: translated at call sites as `new_lstr(state, b"literal")`.
83
84// macros.tsv: isreserved → ts.is_reserved_word()
85// PORT NOTE: translated at call sites as the `LuaStringImpl::is_reserved_word()` method.
86
87// macros.tsv: eqshrstr → Rc::ptr_eq(a, b)
88// PORT NOTE: short strings are interned so pointer equality suffices.
89// Translated at call sites as `Rc::ptr_eq(a, b)`.
90
91// ── LuaStringImpl (was TString in lobject.h) ─────────────────────────────────────
92
93// PORT NOTE: `LuaStringImpl` corresponds to `TString` from `lobject.h`, which maps to
94// `src/object.rs` per file_deps.txt.  It is defined here (in `string.rs`) because
95// `lstring.c` owns the string-table internals and most of the type's behaviour.
96// Phase B should reconcile: either keep it here and re-export from `object.rs`,
97// or move it there and import it from `string.rs`.
98
99/// Whether a Lua string is short (interned) or long (not interned).
100///
101/// Corresponds to `LUA_VSHRSTR` / `LUA_VLNGSTR` tags from `lobject.h`.
102///
103/// # C mapping (types.tsv)
104/// ```text
105/// LUA_VSHRSTR → LuaStringImpl::Short  (shrlen holds length 0..=40)
106/// LUA_VLNGSTR → LuaStringImpl::Long   (shrlen = 0xFF sentinel; u.lnglen holds length)
107/// ```
108#[derive(Debug, Clone, Copy, PartialEq, Eq)]
109pub enum StringKind {
110    Short,
111    Long,
112}
113
114/// A Lua string: an immutable, reference-counted byte sequence.
115///
116/// Short strings (`<= MAX_SHORT_LEN = 40` bytes) are interned in the
117/// [`StringPool`] on `GlobalState`; two short strings with the same bytes
118/// are guaranteed to be the same `GcRef` (pointer equality via `Rc::ptr_eq`).
119///
120/// Long strings are heap-allocated independently and never interned.  Their
121/// hash is computed lazily on first call to [`hash_long_str`] and cached via
122/// interior mutability (`Cell<u32>`).
123///
124/// # C mapping (types.tsv)
125/// ```text
126/// TString             → LuaStringImpl
127/// TString.extra       → extra: Cell<u8>   (reserved-word idx for Short; hash-ready flag for Long)
128/// TString.shrlen      → kind: StringKind   (0xFF sentinel replaced by enum variant)
129/// TString.hash        → hash: Cell<u32>
130/// TString.u.lnglen    → bytes.len()        (length implicit in Rc<[u8]>)
131/// TString.u.hnext     → (removed)          (intrusive chain gone; StringPool uses HashMap)
132/// TString.contents    → bytes: Rc<[u8]>
133/// ```
134pub struct LuaStringImpl {
135    bytes: Rc<[u8]>,
136
137    // Replaced by the StringKind enum; length is implicit in bytes.len().
138    kind: StringKind,
139
140    // Using Cell<u32> so that `hash_long_str` can cache the hash through a
141    // shared `&LuaStringImpl` reference (interior mutability, single-threaded).
142    hash: Cell<u32>,
143
144    // Short strings: reserved-word token index (0 = not a keyword).
145    // Long strings:  0 = hash not yet computed; 1 = hash is valid.
146    extra: Cell<u8>,
147}
148
149impl LuaStringImpl {
150    /// Returns the string's bytes.
151    ///
152    /// macros.tsv: `getstr` / `getlngstr` / `getshrstr` → `ts.as_bytes()`
153    pub fn as_bytes(&self) -> &[u8] {
154        &self.bytes
155    }
156
157    /// Returns the byte length of the string.
158    ///
159    /// for Long.  In Rust both cases are `bytes.len()`.
160    /// macros.tsv: `tsslen` → `ts.len()`
161    pub fn len(&self) -> usize {
162        self.bytes.len()
163    }
164
165    /// Returns `true` if this is a long (non-interned) string.
166    pub fn is_long(&self) -> bool {
167        self.kind == StringKind::Long
168    }
169
170    /// Returns `true` if this is a short (interned) string.
171    pub fn is_short(&self) -> bool {
172        self.kind == StringKind::Short
173    }
174
175    /// Returns `true` if this short string is a Lua reserved word.
176    ///
177    /// macros.tsv: `isreserved` → `ts.is_reserved_word()`
178    pub fn is_reserved_word(&self) -> bool {
179        self.kind == StringKind::Short && self.extra.get() > 0
180    }
181
182    /// GC color predicate.  Returns `true` if this object is "white" (unreachable)
183    /// in the GC's current wave.
184    ///
185    /// macros.tsv: `iswhite` → `obj.is_white()`
186    ///
187    /// PORT NOTE: GC color management is deferred to Phase D.  In Phases A–C all
188    /// objects are reachable via `Rc` reference counts and this always returns
189    /// `false` (nothing is white / unreachable).
190    pub fn is_white(&self) -> bool {
191        // TODO(port): Phase D — check the GC marked byte; stub returns false (all live)
192        false
193    }
194
195    /// Flip GC color from white to the current non-white (resurrect a dead object).
196    ///
197    /// macros.tsv: `changewhite` → `obj.flip_white()`
198    ///
199    /// PORT NOTE: GC color management deferred to Phase D; no-op in Phases A–C.
200    pub fn flip_white(&self) {
201        // TODO(port): Phase D — update the GC marked byte
202    }
203}
204
205impl PartialEq for LuaStringImpl {
206    /// Equality for Lua strings.
207    ///
208    /// For short strings (interned), pointer equality via `Rc::ptr_eq` is sufficient
209    /// and matches `eqshrstr` in C.  For long strings, we fall back to byte
210    /// comparison, matching `luaS_eqlngstr` in C.
211    fn eq(&self, other: &Self) -> bool {
212        if self.kind == StringKind::Short && other.kind == StringKind::Short {
213            Rc::ptr_eq(&self.bytes, &other.bytes)
214        } else {
215            self.bytes == other.bytes
216        }
217    }
218}
219
220impl Eq for LuaStringImpl {}
221
222// ── StringPool (was stringtable in lstate.h) ──────────────────────────────────
223
224// PORT NOTE: `StringPool` corresponds to `stringtable` from `lstate.h`, which maps
225// to `src/state.rs` per file_deps.txt.  It is defined here because `lstring.c`
226// owns all of the pool's mutation logic.  Phase B should reconcile placement.
227//
228// The C `stringtable` used an open-addressing hash table where each bucket was
229// the head of an intrusive singly-linked list threaded through `TString.u.hnext`.
230// In Rust, `TString.u.hnext` is removed per types.tsv.  The `HashMap` replaces
231// both the bucket array and the chain: it provides O(1) average-case lookup,
232// automatic rehashing, and eliminates the need for `tablerehash`.
233//
234// `nuse` and `size` are retained for parity with the C invariants that other
235// code may check (e.g. `growstrtab` tests `nuse >= size`).
236
237/// Intern table for short Lua strings.  Lives on `GlobalState`.
238///
239/// # C mapping (types.tsv)
240/// ```text
241/// stringtable        → StringPool
242/// stringtable.hash   → map: HashMap<Box<[u8]>, GcRef<LuaStringImpl>>
243/// stringtable.nuse   → nuse: usize
244/// stringtable.size   → size: usize
245/// ```
246pub struct StringPool {
247    // PORT NOTE: keyed by owned byte slice; lookup by `&[u8]` via Borrow<[u8]>.
248    map: HashMap<Box<[u8]>, GcRef<LuaStringImpl>>,
249
250    // PERF(port): redundant with map.len() in Rust — keep for C-parity; remove in Phase B
251    nuse: usize,
252
253    // In Rust, HashMap manages its own capacity; this tracks the last requested size.
254    size: usize,
255}
256
257impl StringPool {
258    /// Create an empty pool with `MIN_STR_TAB_SIZE` preallocated capacity.
259    ///
260    ///    `tablerehash(tb->hash, 0, MINSTRTABSIZE)` sequence in `luaS_init`.
261    pub fn new() -> Self {
262        StringPool {
263            map: HashMap::with_capacity(MIN_STR_TAB_SIZE),
264            nuse: 0,
265            size: MIN_STR_TAB_SIZE,
266        }
267    }
268}
269
270impl Default for StringPool {
271    fn default() -> Self {
272        Self::new()
273    }
274}
275
276// ── LuaUserData (was Udata in lobject.h) ──────────────────────────────────────
277
278// PORT NOTE: `LuaUserData` corresponds to `Udata` from `lobject.h`, which maps to
279// `src/object.rs` per file_deps.txt.  Defined here because `luaS_newudata` lives
280// in `lstring.c`.  Phase B should reconcile placement.
281
282/// Full userdata: a GC-tracked object carrying a raw byte payload plus optional
283/// Lua user values and an optional metatable.
284///
285/// # C mapping (types.tsv)
286/// ```text
287/// Udata           → LuaUserData
288/// Udata.len       → len: usize
289/// Udata.nuvalue   → nuvalue: u16  (covered by uv.len() but kept for parity)
290/// Udata.metatable → metatable: Option<GcRef<LuaTable>>
291/// Udata.uv        → uv: Vec<LuaValue>
292/// (no direct C field) data: Box<[u8]>  — the raw byte payload; C used a flexible
293///                          array member laid out past the Udata header via
294///                          `udatamemoffset` alignment math.
295/// ```
296pub struct LuaUserDataImpl {
297    pub len: usize,
298    pub nuvalue: u16,
299    // TODO(port): GcRef<LuaTable> — LuaTable not yet defined; Phase B
300    pub metatable: Option<()>,
301    // macros.tsv: setnilvalue → *o = LuaValue::Nil
302    // TODO(port): Vec<LuaValue> — LuaValue not yet defined; Phase B
303    pub uv: Vec<()>,
304    // Port of the raw byte payload that C accessed via udatamemoffset arithmetic.
305    pub data: Box<[u8]>,
306}
307
308// ── Public functions ───────────────────────────────────────────────────────────
309
310// lstring.h: LUAI_FUNC → pub(crate)
311/// Test equality of two long strings.
312///
313/// Two long strings are equal if they have identical byte content.  A pointer
314/// equality short-circuit is also applied: if `a` and `b` share the same
315/// underlying `Rc<[u8]>` allocation, they are trivially equal.
316///
317/// # C source
318/// ```c
319///
320/// //   size_t len = a->u.lnglen;
321/// //   lua_assert(a->tt == LUA_VLNGSTR && b->tt == LUA_VLNGSTR);
322/// //   return (a == b) ||
323/// //     ((len == b->u.lnglen) &&
324/// //      (memcmp(getlngstr(a), getlngstr(b), len) == 0));
325/// // }
326/// ```
327pub(crate) fn eq_long_str(a: &LuaStringImpl, b: &LuaStringImpl) -> bool {
328    // macros.tsv: lua_assert → debug_assert!
329    debug_assert!(a.is_long() && b.is_long(), "eq_long_str: both arguments must be long strings");
330
331    // In Rust: check if the Rc<[u8]> byte buffers are the same allocation
332    if Rc::ptr_eq(&a.bytes, &b.bytes) {
333        return true;
334    }
335
336    // macros.tsv: getlngstr → ts.as_bytes()
337    a.as_bytes() == b.as_bytes()
338}
339
340// lstring.h: LUAI_FUNC → pub(crate)
341/// Hash a byte string with a seed using Lua's FNV-style hash.
342///
343/// This is a pure function with no allocations.  The algorithm XORs shifts and
344/// additions over each byte in reverse order, seeded by `seed ^ len`.
345///
346/// # C source
347/// ```c
348///
349/// //   unsigned int h = seed ^ cast_uint(l);
350/// //   for (; l > 0; l--)
351/// //     h ^= ((h<<5) + (h>>2) + cast_byte(str[l - 1]));
352/// //   return h;
353/// // }
354/// ```
355///
356/// PORT NOTE: C parenthesises `(h<<5)` and `(h>>2)` explicitly, so the outer
357/// additions are unambiguous despite C's `<<`/`>>` having lower precedence than
358/// `+`.  In Rust `<<` and `>>` have higher precedence than `+`, so the same
359/// expression is computed without extra parentheses; `wrapping_add` is used to
360/// match C's unsigned wrap-around arithmetic.
361pub(crate) fn hash_bytes(bytes: &[u8], seed: u32) -> u32 {
362    // macros.tsv: cast_uint → x as u32
363    let mut h: u32 = seed ^ (bytes.len() as u32);
364
365    let mut l = bytes.len();
366    while l > 0 {
367        l -= 1;
368        // macros.tsv: cast_byte → x as u8 (then as u32 for the arithmetic)
369        h ^= (h << 5)
370            .wrapping_add(h >> 2)
371            .wrapping_add(bytes[l] as u32);
372    }
373
374    h
375}
376
377// lstring.h: LUAI_FUNC → pub(crate)
378/// Compute (and cache) the hash of a long string.
379///
380/// The hash for long strings is computed lazily: on first call the hash is
381/// derived from `hash_bytes` using the seed stored in the `hash` field, then
382/// `extra` is set to `1` to record that the hash is now valid.  Subsequent calls
383/// return the cached value directly.
384///
385/// Interior mutability (`Cell<u32>` / `Cell<u8>`) allows mutation through a
386/// shared `&LuaStringImpl` reference, which is necessary because `GcRef<LuaStringImpl>`
387/// is `Rc<LuaStringImpl>` and there is no safe way to get `&mut` through an `Rc`.
388///
389/// # C source
390/// ```c
391///
392/// //   lua_assert(ts->tt == LUA_VLNGSTR);
393/// //   if (ts->extra == 0) {  /* no hash? */
394/// //     size_t len = ts->u.lnglen;
395/// //     ts->hash = luaS_hash(getlngstr(ts), len, ts->hash);
396/// //     ts->extra = 1;  /* now it has its hash */
397/// //   }
398/// //   return ts->hash;
399/// // }
400/// ```
401pub(crate) fn hash_long_str(ts: &LuaStringImpl) -> u32 {
402    debug_assert!(ts.is_long(), "hash_long_str: argument must be a long string");
403
404    if ts.extra.get() == 0 {
405        // The initial ts->hash holds the per-state seed (set at construction).
406        let computed = hash_bytes(ts.as_bytes(), ts.hash.get());
407        ts.hash.set(computed);
408        ts.extra.set(1);
409    }
410
411    ts.hash.get()
412}
413
414//
415// PORT NOTE: `tablerehash` walked the intrusive `hnext` chain in each bucket and
416// redistributed `TString *` pointers into new bucket slots.  In Rust the
417// `HashMap` in `StringPool` handles its own rehashing automatically whenever its
418// load factor is exceeded or `reserve` / `shrink_to` is called.  The entire
419// function is therefore dropped; its effects are subsumed by the HashMap.
420
421// lstring.h: LUAI_FUNC → pub(crate)
422/// Resize the string intern table to approximately `nsize` buckets.
423///
424/// When growing, `HashMap::reserve` hints the desired capacity.  When shrinking,
425/// `HashMap::shrink_to` is used as an approximation of the C logic, which
426/// would rehash entries out of the shrinking tail.  The C function's graceful
427/// degradation on allocation failure (keep the current size) is preserved:
428/// `HashMap` will simply retain its existing capacity if memory is tight.
429///
430/// # C source
431/// ```c
432///
433/// //   stringtable *tb = &G(L)->strt;
434/// //   int osize = tb->size;
435/// //   TString **newvect;
436/// //   if (nsize < osize)
437/// //     tablerehash(tb->hash, osize, nsize);  /* depopulate shrinking part */
438/// //   newvect = luaM_reallocvector(L, tb->hash, osize, nsize, TString*);
439/// //   if (l_unlikely(newvect == NULL)) {
440/// //     if (nsize < osize)
441/// //       tablerehash(tb->hash, nsize, osize);  /* restore to original size */
442/// //   } else {
443/// //     tb->hash = newvect;
444/// //     tb->size = nsize;
445/// //     if (nsize > osize)
446/// //       tablerehash(newvect, osize, nsize);
447/// //   }
448/// // }
449/// ```
450///
451/// PORT NOTE: The three calls to `tablerehash` are dropped because `HashMap`
452/// automatically rehashes.  The allocation-failure fallback (restore to `osize`)
453/// has no direct analogue; `HashMap` will retain existing capacity on OOM, which
454/// matches the intent.
455// PERF(port): luaS_resize shrink — HashMap::shrink_to() is a hint, not a
456// guarantee; the C code freed exact memory.  Profile in Phase B.
457pub(crate) fn resize(state: &mut LuaState, nsize: usize) {
458    let strt = &mut state.global_mut().strt;
459    let osize = strt.size;
460
461    if nsize > osize {
462        let additional = nsize.saturating_sub(strt.map.len());
463        strt.map.reserve(additional);
464    } else if nsize < osize {
465        // PERF(port): shrink_to is a hint; exact shrink not guaranteed in Rust
466        strt.map.shrink_to(nsize);
467    }
468
469    strt.size = nsize;
470}
471
472// lstring.h: LUAI_FUNC → pub(crate)
473/// Clear the API string cache, replacing any GC-white entries with the
474/// preallocated OOM message (which is never collected).
475///
476/// Called by the GC sweep phase to ensure the cache never holds a pointer to a
477/// collected string.
478///
479/// # C source
480/// ```c
481///
482/// //   int i, j;
483/// //   for (i = 0; i < STRCACHE_N; i++)
484/// //     for (j = 0; j < STRCACHE_M; j++) {
485/// //       if (iswhite(g->strcache[i][j]))  /* will entry be collected? */
486/// //         g->strcache[i][j] = g->memerrmsg;
487/// //     }
488/// // }
489/// ```
490///
491/// PORT NOTE: Takes `&mut GlobalState` directly (same as the C signature which
492/// takes `global_State *g`, not `lua_State *L`).  The caller accesses this via
493/// `state.global_mut()`.
494pub(crate) fn clear_cache(g: &mut GlobalState) {
495    for i in 0..STRCACHE_N {
496        for j in 0..STRCACHE_M {
497            // macros.tsv: iswhite → obj.is_white()
498            if g.strcache[i][j].is_white() {
499                g.strcache[i][j] = g.memerrmsg.clone();
500            }
501        }
502    }
503}
504
505// lstring.h: LUAI_FUNC → pub(crate)
506/// Initialise the string intern table and the API string cache.
507///
508/// Must be called exactly once during VM startup, before any strings are created.
509/// Pre-creates the memory-error message and fixes it in the GC (so it is never
510/// collected), then fills every cache slot with that same string.
511///
512/// # C source
513/// ```c
514///
515/// //   global_State *g = G(L);
516/// //   int i, j;
517/// //   stringtable *tb = &G(L)->strt;
518/// //   tb->hash = luaM_newvector(L, MINSTRTABSIZE, TString*);
519/// //   tablerehash(tb->hash, 0, MINSTRTABSIZE);
520/// //   tb->size = MINSTRTABSIZE;
521/// //   g->memerrmsg = luaS_newliteral(L, MEMERRMSG);
522/// //   luaC_fix(L, obj2gco(g->memerrmsg));
523/// //   for (i = 0; i < STRCACHE_N; i++)
524/// //     for (j = 0; j < STRCACHE_M; j++)
525/// //       g->strcache[i][j] = g->memerrmsg;
526/// // }
527/// ```
528pub(crate) fn init(state: &mut LuaState) -> Result<(), LuaError> {
529    //    tablerehash(tb->hash, 0, MINSTRTABSIZE);
530    //    tb->size = MINSTRTABSIZE;
531    // macros.tsv: luaM_newvector → vec![T::default(); n]
532    // PORT NOTE: StringPool::new() sets the initial capacity to MIN_STR_TAB_SIZE,
533    // replacing both the allocation and the tablerehash clear pass.
534    state.global_mut().strt = StringPool::new();
535
536    // macros.tsv: luaS_newliteral → state.intern_str(b"...")
537    let memerrmsg = new_lstr(state, MEMERR_MSG)?;
538
539    // macros.tsv: luaC_fix — not listed; it marks the object as fixed (non-collectable)
540    // TODO(port): call state.gc().fix(memerrmsg.clone()) when GC is wired in Phase D;
541    // in Phases A–C the Rc keeps it alive as long as GlobalState holds the clone
542    let memerrmsg_lt = impl_to_lt(&memerrmsg);
543    state.global_mut().memerrmsg = memerrmsg_lt.clone();
544
545    //      for (j = 0; j < STRCACHE_M; j++)
546    //        g->strcache[i][j] = g->memerrmsg;
547    for i in 0..STRCACHE_N {
548        for j in 0..STRCACHE_M {
549            state.global_mut().strcache[i][j] = memerrmsg_lt.clone();
550        }
551    }
552
553    Ok(())
554}
555
556// lstring.h: LUAI_FUNC → pub(crate)
557/// Create a new, uninitialized long string of `l` bytes.
558///
559/// The returned string's bytes are all zero.  The caller is responsible for
560/// filling the content, if needed; in practice `new_lstr` calls this and then
561/// copies the source bytes in.
562///
563/// # C source
564/// ```c
565///
566/// //   TString *ts = createstrobj(L, l, LUA_VLNGSTR, G(L)->seed);
567/// //   ts->u.lnglen = l;
568/// //   ts->shrlen = 0xFF;  /* signals that it is a long string */
569/// //   return ts;
570/// // }
571/// ```
572///
573/// PORT NOTE: `ts->u.lnglen = l` and `ts->shrlen = 0xFF` are replaced by the
574/// `StringKind::Long` variant which carries the length implicitly through
575/// `Rc<[u8]>::len()`.  The `0xFF` sentinel is no longer needed.
576pub(crate) fn create_long_str(state: &mut LuaState, l: usize) -> GcRef<LuaStringImpl> {
577    let seed = state.global().seed;
578    // PORT NOTE: C's createstrobj allocates uninitialised storage then the caller
579    // fills bytes via memcpy.  Rust's create_str_obj constructs with zeroed bytes;
580    // callers (e.g. new_lstr) pass the real bytes directly, eliminating the two-step.
581    create_str_obj(state, &vec![0u8; l], StringKind::Long, seed)
582}
583
584// lstring.h: LUAI_FUNC → pub(crate)
585/// Remove a short string from the intern table.
586///
587/// Called by the GC sweep when a short string is about to be collected.
588///
589/// # C source
590/// ```c
591///
592/// //   stringtable *tb = &G(L)->strt;
593/// //   TString **p = &tb->hash[lmod(ts->hash, tb->size)];
594/// //   while (*p != ts)  /* find previous element */
595/// //     p = &(*p)->u.hnext;
596/// //   *p = (*p)->u.hnext;  /* remove element from its list */
597/// //   tb->nuse--;
598/// // }
599/// ```
600///
601/// PORT NOTE: The C implementation walks the intrusive `hnext` chain to unlink
602/// `ts`.  In Rust the chain does not exist; `HashMap::remove` is O(1) average.
603/// `lmod(ts->hash, tb->size)` (the bucket index) is not needed; the map keys by
604/// byte content.
605pub(crate) fn remove_str(state: &mut LuaState, ts: &LuaStringImpl) {
606    let strt = &mut state.global_mut().strt;
607
608    //    while (*p != ts) p = &(*p)->u.hnext;
609    //    *p = (*p)->u.hnext;
610    // PORT NOTE: all of the above replaced by HashMap::remove keyed on bytes
611    strt.map.remove(ts.as_bytes());
612
613    strt.nuse = strt.nuse.saturating_sub(1);
614}
615
616// lstring.h: LUAI_FUNC → pub(crate)
617/// Create or retrieve a Lua string from `bytes`.
618///
619/// If `bytes.len() <= MAX_SHORT_LEN` (40), the string is interned: an existing
620/// identical short string is returned if found, otherwise a new one is created
621/// and inserted into the intern table.
622///
623/// If `bytes.len() > MAX_SHORT_LEN`, a new long string is allocated each time
624/// (long strings are never interned).
625///
626/// # C source
627/// ```c
628///
629/// //   if (l <= LUAI_MAXSHORTLEN)  /* short string? */
630/// //     return internshrstr(L, str, l);
631/// //   else {
632/// //     TString *ts;
633/// //     if (l_unlikely(l * sizeof(char) >= (MAX_SIZE - sizeof(TString))))
634/// //       luaM_toobig(L);
635/// //     ts = luaS_createlngstrobj(L, l);
636/// //     memcpy(getlngstr(ts), str, l * sizeof(char));
637/// //     return ts;
638/// //   }
639/// // }
640/// ```
641pub(crate) fn new_lstr(state: &mut LuaState, bytes: &[u8]) -> Result<GcRef<LuaStringImpl>, LuaError> {
642    if bytes.len() <= MAX_SHORT_LEN {
643        intern_short_str(state, bytes)
644    } else {
645        //        luaM_toobig(L);
646        // macros.tsv: luaM_toobig → return Err(LuaError::Memory)
647        // PORT NOTE: sizeof(TString) is a C-specific overhead; in Rust we just
648        // check that the byte count fits within MAX_SIZE.
649        if bytes.len() >= MAX_SIZE {
650            return Err(LuaError::Memory);
651        }
652
653        //    memcpy(getlngstr(ts), str, l * sizeof(char));
654        // PORT NOTE: Rather than creating a zeroed buffer and then copying,
655        // we construct the LuaStringImpl directly from `bytes`.
656        let seed = state.global().seed;
657        let h = hash_bytes(bytes, seed);
658        let ts = create_str_obj(state, bytes, StringKind::Long, h);
659        Ok(ts)
660    }
661}
662
663// lstring.h: LUAI_FUNC → pub(crate)
664/// Create or retrieve a Lua string, using a small two-slot LRU cache per hash
665/// bucket to accelerate repeated calls with the same byte sequence.
666///
667/// In C, the cache bucket is selected by casting the C string pointer to a `u32`
668/// (`point2uint`).  In Rust, `point2uint` is restricted to `lua-gc`/`lua-coro`
669/// (raw-pointer cast requiring `unsafe`).  We substitute a content-hash based
670/// bucket index instead.  Functional semantics are identical; cache hit rates for
671/// repeated calls with the same `bytes` may differ.
672///
673/// # C source
674/// ```c
675///
676/// //   unsigned int i = point2uint(str) % STRCACHE_N;  /* hash */
677/// //   int j;
678/// //   TString **p = G(L)->strcache[i];
679/// //   for (j = 0; j < STRCACHE_M; j++) {
680/// //     if (strcmp(str, getstr(p[j])) == 0)  /* hit? */
681/// //       return p[j];  /* that is it */
682/// //   }
683/// //   /* normal route */
684/// //   for (j = STRCACHE_M - 1; j > 0; j--)
685/// //     p[j] = p[j - 1];  /* move out last element */
686/// //   p[0] = luaS_newlstr(L, str, strlen(str));
687/// //   return p[0];
688/// // }
689/// ```
690///
691/// PORT NOTE: `point2uint(str) % STRCACHE_N` used the raw pointer address as a
692/// fast key, exploiting the fact that C string literals have stable addresses.
693/// In Rust we use `hash_bytes(bytes, seed) % STRCACHE_N` instead.  The replacement
694/// is fully safe and has identical semantics (but different cache behaviour for
695/// calls from different `&[u8]` slices with identical content).
696pub(crate) fn new(state: &mut LuaState, bytes: &[u8]) -> Result<GcRef<LuaStringImpl>, LuaError> {
697    // PORT NOTE: pointer hash replaced by content hash (see doc above)
698    let seed = state.global().seed;
699    let i = (hash_bytes(bytes, seed) as usize) % STRCACHE_N;
700
701    // macros.tsv: getstr → ts.as_bytes()
702    for j in 0..STRCACHE_M {
703        if state.global().strcache[i][j].as_bytes() == bytes {
704            // TODO(phase-b): strcache currently holds lua_types::LuaString; rebuild
705            // a rich LuaStringImpl from the bytes. Phase B should unify the types.
706            let cached_bytes = state.global().strcache[i][j].as_bytes().to_vec();
707            // TODO(D-1c-bridge): LuaStringImpl is the rich local type; state helper produces lua_types::LuaString
708            return Ok(GcRef::new(LuaStringImpl {
709                bytes: cached_bytes.into(),
710                kind: if bytes.len() <= MAX_SHORT_LEN { StringKind::Short } else { StringKind::Long },
711                hash: Cell::new(hash_bytes(bytes, seed)),
712                extra: Cell::new(0),
713            }));
714        }
715    }
716
717    // Create the string before mutating the cache
718    let new_str = new_lstr(state, bytes)?;
719
720    // Shift entries toward the back to make room at slot 0
721    for j in (1..STRCACHE_M).rev() {
722        // Clone first to avoid borrow conflict between getter and setter
723        let prev = state.global().strcache[i][j - 1].clone();
724        state.global_mut().strcache[i][j] = prev;
725    }
726
727    state.global_mut().strcache[i][0] = impl_to_lt(&new_str);
728
729    Ok(new_str)
730}
731
732// lstring.h: LUAI_FUNC → pub(crate)
733/// Allocate a new full userdata of `s` raw bytes with `nuvalue` Lua user values.
734///
735/// The raw byte payload is zeroed.  All user values are initialised to `nil`.
736/// The metatable is `None`.
737///
738/// # C source
739/// ```c
740///
741/// //   Udata *u;
742/// //   int i;
743/// //   GCObject *o;
744/// //   if (l_unlikely(s > MAX_SIZE - udatamemoffset(nuvalue)))
745/// //     luaM_toobig(L);
746/// //   o = luaC_newobj(L, LUA_VUSERDATA, sizeudata(nuvalue, s));
747/// //   u = gco2u(o);
748/// //   u->len = s;
749/// //   u->nuvalue = nuvalue;
750/// //   u->metatable = NULL;
751/// //   for (i = 0; i < nuvalue; i++)
752/// //     setnilvalue(&u->uv[i].uv);
753/// //   return u;
754/// // }
755/// ```
756pub(crate) fn new_userdata(
757    state: &mut LuaState,
758    s: usize,
759    nuvalue: usize,
760) -> Result<GcRef<LuaUserDataImpl>, LuaError> {
761    //        luaM_toobig(L);
762    // macros.tsv: luaM_toobig → return Err(LuaError::Memory)
763    // TODO(port): udatamemoffset(nuvalue) computes C-specific alignment padding
764    // for the flexible-array Udata layout.  In Rust, LuaUserData allocates `data`
765    // and `uv` separately (Box<[u8]> + Vec<LuaValue>); the combined size bound
766    // differs.  Conservative check: reject if s alone exceeds MAX_SIZE.
767    if s > MAX_SIZE {
768        return Err(LuaError::Memory);
769    }
770
771    //    u = gco2u(o);
772    // TODO(port): register with GC tracking (state.gc().new_obj(...));
773    // Phase A–C stub: allocate via Rc without GC registration.
774    // TODO(D-1c-bridge): LuaUserDataImpl is the rich local type; state.new_userdata is still todo!()
775    let u = GcRef::new(LuaUserDataImpl {
776        len: s,
777        nuvalue: nuvalue as u16,
778        metatable: None,
779        // macros.tsv: setnilvalue → *o = LuaValue::Nil
780        // TODO(port): Vec<LuaValue> once LuaValue is defined in lua-types
781        uv: vec![(); nuvalue],
782        // Raw byte payload; zero-initialised.
783        data: vec![0u8; s].into_boxed_slice(),
784    });
785
786    // TODO(port): push into state.global_mut().allgc for GC tracking (Phase D)
787    Ok(u)
788}
789
790// ── Private helpers ───────────────────────────────────────────────────────────
791
792/// Allocate and initialise a new `LuaStringImpl` with the given bytes, kind, and hash.
793///
794/// In C, `createstrobj` allocated uninitialised memory via `luaC_newobj` and set
795/// the header fields; the caller then filled the content via `memcpy`.  In Rust
796/// we construct the string directly from the provided `bytes`, eliminating the
797/// two-step pattern.
798///
799/// # C source
800/// ```c
801///
802/// //   TString *ts;
803/// //   GCObject *o;
804/// //   size_t totalsize = sizelstring(l);
805/// //   o = luaC_newobj(L, tag, totalsize);
806/// //   ts = gco2ts(o);
807/// //   ts->hash = h;
808/// //   ts->extra = 0;
809/// //   getstr(ts)[l] = '\0';  /* ending 0 */
810/// //   return ts;
811/// // }
812/// ```
813///
814/// PORT NOTE: `sizelstring(l)` computed the total allocation size including the
815/// nul terminator.  In Rust, `Rc<[u8]>` stores the bytes without a nul; the
816/// nul terminator is dropped.  Callers that need a nul-terminated `*const u8`
817/// for FFI must use a temporary `CString` or equivalent.
818fn create_str_obj(
819    state: &mut LuaState,
820    bytes: &[u8],
821    kind: StringKind,
822    hash: u32,
823) -> GcRef<LuaStringImpl> {
824    // macros.tsv: luaM_newobject → state.gc().new_obj(tag, sz)
825    // TODO(port): register with GC tracking list (state.global_mut().allgc)
826    // in Phase D; Phase A–C creates a bare Rc
827    let _ = state; // state needed for GC registration in Phase D
828    // TODO(D-1c-bridge): LuaStringImpl is the rich local type; state helper produces lua_types::LuaString
829    GcRef::new(LuaStringImpl {
830        hash: Cell::new(hash),
831        extra: Cell::new(0),
832        // PORT NOTE: we receive bytes directly; no separate memcpy step needed
833        bytes: Rc::from(bytes),
834        kind,
835    })
836}
837
838/// Grow the string intern table, first attempting a GC collection if the table is
839/// at its absolute maximum size.
840///
841/// # C source
842/// ```c
843///
844/// //   if (l_unlikely(tb->nuse == MAX_INT)) {  /* too many strings? */
845/// //     luaC_fullgc(L, 1);  /* try to free some... */
846/// //     if (tb->nuse == MAX_INT)  /* still too many? */
847/// //       luaM_error(L);  /* cannot even create a message... */
848/// //   }
849/// //   if (tb->size <= MAXSTRTB / 2)  /* can grow string table? */
850/// //     luaS_resize(L, tb->size * 2);
851/// // }
852/// ```
853fn grow_str_tab(state: &mut LuaState) -> Result<(), LuaError> {
854    // macros.tsv: MAX_INT → i32::MAX
855    let nuse = state.global().strt.nuse;
856    if nuse == i32::MAX as usize {
857        // macros.tsv: luaC_fullgc → state.gc().full_collect()
858        // TODO(port): state.gc().full_collect() — GC not yet wired in Phase A–C; no-op
859        // (When GC is live this call may reduce nuse by sweeping dead short strings.)
860
861        // macros.tsv: luaM_error → return Err(LuaError::Memory)
862        if state.global().strt.nuse == i32::MAX as usize {
863            return Err(LuaError::Memory);
864        }
865    }
866
867    let size = state.global().strt.size;
868    if size <= MAX_STR_TAB / 2 {
869        resize(state, size * 2);
870    }
871
872    Ok(())
873}
874
875/// Look up `bytes` in the intern table; create and insert a new short string if
876/// not found.
877///
878/// The `isdead` / `changewhite` resurrection path is elided in Phases A–C because
879/// `Rc`-based reference counting keeps objects alive until all references drop
880/// (there are no dead-but-not-collected strings in Phase A–C).
881///
882/// # C source
883/// ```c
884///
885/// //   TString *ts;
886/// //   global_State *g = G(L);
887/// //   stringtable *tb = &g->strt;
888/// //   unsigned int h = luaS_hash(str, l, g->seed);
889/// //   TString **list = &tb->hash[lmod(h, tb->size)];
890/// //   lua_assert(str != NULL);
891/// //   for (ts = *list; ts != NULL; ts = ts->u.hnext) {
892/// //     if (l == ts->shrlen && (memcmp(str, getshrstr(ts), l) == 0)) {
893/// //       if (isdead(g, ts)) changewhite(ts);  /* resurrect it */
894/// //       return ts;
895/// //     }
896/// //   }
897/// //   if (tb->nuse >= tb->size) {
898/// //     growstrtab(L, tb);
899/// //     list = &tb->hash[lmod(h, tb->size)];
900/// //   }
901/// //   ts = createstrobj(L, l, LUA_VSHRSTR, h);
902/// //   ts->shrlen = cast_byte(l);
903/// //   memcpy(getshrstr(ts), str, l);
904/// //   ts->u.hnext = *list;
905/// //   *list = ts;
906/// //   tb->nuse++;
907/// //   return ts;
908/// // }
909/// ```
910///
911/// PORT NOTE: `lmod(h, tb->size)` (power-of-two bucket modulo via
912/// `macros.tsv: lmod → (s & (size - 1)) as usize`) and the `hnext` chain walk
913/// are both gone.  `HashMap::get` replaces the linear bucket scan.
914fn intern_short_str(
915    state: &mut LuaState,
916    bytes: &[u8],
917) -> Result<GcRef<LuaStringImpl>, LuaError> {
918    // In Rust, &[u8] slices are never null; the assertion is trivially satisfied.
919
920    let seed = state.global().seed;
921    let h = hash_bytes(bytes, seed);
922
923    // PORT NOTE: intrusive hnext chain replaced by HashMap lookup
924    // Clone the existing GcRef<LuaStringImpl> so the immutable borrow on `state` ends
925    // before any mutable access below.
926    let existing = state.global().strt.map.get(bytes).cloned();
927    if let Some(ts) = existing {
928        // macros.tsv: isdead → g.is_dead(obj);  changewhite → obj.flip_white()
929        // PORT NOTE: GC color management deferred to Phase D; in Phases A–C all
930        // Rc-held objects are live by definition (Rc keeps them alive).
931        return Ok(ts);
932    }
933
934    let needs_grow = {
935        let strt = &state.global().strt;
936        strt.nuse >= strt.size
937    };
938    if needs_grow {
939        grow_str_tab(state)?;
940    }
941
942    //    ts->shrlen = cast_byte(l);  — encoded in StringKind::Short
943    //    memcpy(getshrstr(ts), str, l);  — bytes passed directly to create_str_obj
944    let ts = create_str_obj(state, bytes, StringKind::Short, h);
945
946    state
947        .global_mut()
948        .strt
949        .map
950        .insert(bytes.to_vec().into_boxed_slice(), ts.clone());
951    state.global_mut().strt.nuse += 1;
952
953    Ok(ts)
954}
955
956// ── Re-export marker for type defined here ────────────────────────────────────
957
958// TODO(port): LuaError is used in function signatures above but is not yet defined
959// in lua-types.  Phase B must add LuaError to lua-types/src/error.rs per
960// PORTING.md §6 before this file can compile.  The expected variants are:
961//   LuaError::Runtime(LuaValue)
962//   LuaError::Memory
963//   LuaError::Syntax(LuaValue)
964//   ... (full list in PORTING.md §6)
965// For now, reference LuaError as an opaque import from the future lua-types crate.
966use lua_types::LuaError;
967
968// ──────────────────────────────────────────────────────────────────────────────
969// PORT STATUS
970//   source:        src/lstring.c  (275 lines, 15 functions)
971//                  src/lstring.h  (57 lines; merged)
972//   target_crate:  lua-vm
973//   confidence:    medium
974//   todos:         14
975//   port_notes:    30
976//   unsafe_blocks: 0   (must be 0 outside explicit unsafe-budget crates)
977//   notes:         Logic is faithful to the C.  The two largest structural changes
978//                  are: (1) `tablerehash` + intrusive `hnext` chain replaced by
979//                  `HashMap` in `StringPool`; (2) `luaS_new`'s `point2uint`
980//                  pointer-hash replaced by a content hash (safe, same semantics).
981//                  Key TODOs: GC registration in create_str_obj (Phase D),
982//                  GC registration in new_userdata (Phase D), luaC_fix in init
983//                  (Phase D), full_collect stub in grow_str_tab (Phase D),
984//                  udatamemoffset size check in new_userdata (Phase B),
985//                  LuaValue in LuaUserData.uv (Phase B), LuaError import path
986//                  (Phase B), GcRef typedef (Phase B).  Phase B priority: wire
987//                  import paths for LuaState, GlobalState, LuaError, LuaValue,
988//                  and move LuaStringImpl/StringPool/LuaUserData to their canonical
989//                  modules (object.rs / state.rs).
990// ──────────────────────────────────────────────────────────────────────────────