lua_vm/string.rs
1//! String table and interned-string operations — port of `lstring.c` + `lstring.h`.
2//!
3//! Provides two key abstractions:
4//!
5//! - [`LuaStringImpl`]: the Lua string value, stored as a reference-counted byte slice.
6//! Short strings (`<= MAX_SHORT_LEN` bytes) are interned in the process-global
7//! [`StringPool`]; long strings are heap-allocated on each creation and never
8//! interned.
9//!
10//! - [`StringPool`]: the intern table for short strings, stored on `GlobalState`.
11//! Replaces the C `stringtable` struct, which used an open-addressing hash table
12//! with intrusive chaining through `TString.u.hnext`. In Rust the intrusive
13//! chain is dropped; a `HashMap` provides O(1) lookup and automatic rehashing.
14//! See PORT NOTE on [`StringPool`] for the full rationale.
15//!
16//! The `lstring.h` header is merged into this module per PORTING.md §1.
17//!
18//! # C source files
19//! - `reference/lua-5.4.7/src/lstring.c` (275 lines, 15 functions)
20//! - `reference/lua-5.4.7/src/lstring.h` (57 lines; merged here)
21
22use std::cell::Cell;
23#[allow(unused_imports)] use crate::prelude::*;
24use std::collections::HashMap;
25use std::rc::Rc;
26
27// TODO(port): these import paths will resolve once Phase B wires the crate graph.
28// `LuaState` and `GlobalState` live in crate::state (src/state.rs, from lstate.c).
29// `LuaValue` and `LuaError` live in lua_types (crates/lua-types/src/).
30use crate::state::{GlobalState, LuaState};
31
32// PORT NOTE: `GcRef<T>` is the lua-types newtype around `Rc<T>` per PORT_STRATEGY §3.4.
33// Re-imported here so all string-pool entries share identity with state.rs / api.rs.
34use lua_types::GcRef;
35/// Local alias retained while string.rs's own `LuaStringImpl` is still in use; will
36/// merge with `lua_types::LuaString` in Phase B's string-pool consolidation.
37type LocalGcRef<T> = Rc<T>;
38
39/// Phase-B bridge: converts a lua-vm rich `LuaStringImpl` into a `lua_types::LuaString`.
40/// The two types track different metadata (short/long flag, extra byte) and a real
41/// merge belongs in Phase B once `lua-types::LuaString` grows the needed fields.
42fn impl_to_lt(s: &GcRef<LuaStringImpl>) -> GcRef<lua_types::LuaString> {
43 // TODO(D-1c-bridge): allocation outside state context (free fn)
44 GcRef::new(lua_types::LuaString::from_bytes(s.as_bytes().to_vec()))
45}
46
47// ── Constants (lstring.h macros → macros.tsv) ─────────────────────────────────
48
49// macros.tsv: MEMERRMSG → const MEMERR_MSG: &[u8] = b"not enough memory"
50/// Pre-allocated OOM error message. Must be created before the allocator
51/// can fail so that the GC can always hand back a valid error string.
52pub(crate) const MEMERR_MSG: &[u8] = b"not enough memory";
53
54// macros.tsv: MINSTRTABSIZE → const MIN_STR_TAB_SIZE: usize = 128
55const MIN_STR_TAB_SIZE: usize = 128;
56
57// macros.tsv: STRCACHE_N → const STRCACHE_N: usize = 53
58const STRCACHE_N: usize = 53;
59
60// macros.tsv: STRCACHE_M → const STRCACHE_M: usize = 2
61const STRCACHE_M: usize = 2;
62
63// macros.tsv: LUAI_MAXSHORTLEN → const MAX_SHORT_LEN: usize = 40
64pub(crate) const MAX_SHORT_LEN: usize = 40;
65
66// macros.tsv: MAX_SIZE → const MAX_SIZE: usize = if size_of::<usize>() < size_of::<i64>() { usize::MAX } else { i64::MAX as usize }
67const MAX_SIZE: usize = if std::mem::size_of::<usize>() < std::mem::size_of::<i64>() {
68 usize::MAX
69} else {
70 i64::MAX as usize
71};
72
73// macros.tsv: luaM_limitN → std::cmp::min(n, usize::MAX / std::mem::size_of::<T>())
74// cast_int → x as i32
75// Rust: upper bound on the number of hash buckets; derived from MAX_INT / pointer size.
76const MAX_STR_TAB: usize = i32::MAX as usize / std::mem::size_of::<usize>();
77
78// macros.tsv: sizelstring → drop — Rust allocates via Box<[u8]> / Rc<[u8]>
79// PORT NOTE: dropped entirely; Rust uses Rc<[u8]> which carries its own length.
80
81// macros.tsv: luaS_newliteral → state.intern_str(b"...")
82// PORT NOTE: translated at call sites as `new_lstr(state, b"literal")`.
83
84// macros.tsv: isreserved → ts.is_reserved_word()
85// PORT NOTE: translated at call sites as the `LuaStringImpl::is_reserved_word()` method.
86
87// macros.tsv: eqshrstr → Rc::ptr_eq(a, b)
88// PORT NOTE: short strings are interned so pointer equality suffices.
89// Translated at call sites as `Rc::ptr_eq(a, b)`.
90
91// ── LuaStringImpl (was TString in lobject.h) ─────────────────────────────────────
92
93// PORT NOTE: `LuaStringImpl` corresponds to `TString` from `lobject.h`, which maps to
94// `src/object.rs` per file_deps.txt. It is defined here (in `string.rs`) because
95// `lstring.c` owns the string-table internals and most of the type's behaviour.
96// Phase B should reconcile: either keep it here and re-export from `object.rs`,
97// or move it there and import it from `string.rs`.
98
99/// Whether a Lua string is short (interned) or long (not interned).
100///
101/// Corresponds to `LUA_VSHRSTR` / `LUA_VLNGSTR` tags from `lobject.h`.
102///
103/// # C mapping (types.tsv)
104/// ```text
105/// LUA_VSHRSTR → LuaStringImpl::Short (shrlen holds length 0..=40)
106/// LUA_VLNGSTR → LuaStringImpl::Long (shrlen = 0xFF sentinel; u.lnglen holds length)
107/// ```
108#[derive(Debug, Clone, Copy, PartialEq, Eq)]
109pub enum StringKind {
110 Short,
111 Long,
112}
113
114/// A Lua string: an immutable, reference-counted byte sequence.
115///
116/// Short strings (`<= MAX_SHORT_LEN = 40` bytes) are interned in the
117/// [`StringPool`] on `GlobalState`; two short strings with the same bytes
118/// are guaranteed to be the same `GcRef` (pointer equality via `Rc::ptr_eq`).
119///
120/// Long strings are heap-allocated independently and never interned. Their
121/// hash is computed lazily on first call to [`hash_long_str`] and cached via
122/// interior mutability (`Cell<u32>`).
123///
124/// # C mapping (types.tsv)
125/// ```text
126/// TString → LuaStringImpl
127/// TString.extra → extra: Cell<u8> (reserved-word idx for Short; hash-ready flag for Long)
128/// TString.shrlen → kind: StringKind (0xFF sentinel replaced by enum variant)
129/// TString.hash → hash: Cell<u32>
130/// TString.u.lnglen → bytes.len() (length implicit in Rc<[u8]>)
131/// TString.u.hnext → (removed) (intrusive chain gone; StringPool uses HashMap)
132/// TString.contents → bytes: Rc<[u8]>
133/// ```
134pub struct LuaStringImpl {
135 bytes: Rc<[u8]>,
136
137 // Replaced by the StringKind enum; length is implicit in bytes.len().
138 kind: StringKind,
139
140 // Using Cell<u32> so that `hash_long_str` can cache the hash through a
141 // shared `&LuaStringImpl` reference (interior mutability, single-threaded).
142 hash: Cell<u32>,
143
144 // Short strings: reserved-word token index (0 = not a keyword).
145 // Long strings: 0 = hash not yet computed; 1 = hash is valid.
146 extra: Cell<u8>,
147}
148
149impl LuaStringImpl {
150 /// Returns the string's bytes.
151 ///
152 /// macros.tsv: `getstr` / `getlngstr` / `getshrstr` → `ts.as_bytes()`
153 pub fn as_bytes(&self) -> &[u8] {
154 &self.bytes
155 }
156
157 /// Returns the byte length of the string.
158 ///
159 /// for Long. In Rust both cases are `bytes.len()`.
160 /// macros.tsv: `tsslen` → `ts.len()`
161 pub fn len(&self) -> usize {
162 self.bytes.len()
163 }
164
165 /// Returns `true` if this is a long (non-interned) string.
166 pub fn is_long(&self) -> bool {
167 self.kind == StringKind::Long
168 }
169
170 /// Returns `true` if this is a short (interned) string.
171 pub fn is_short(&self) -> bool {
172 self.kind == StringKind::Short
173 }
174
175 /// Returns `true` if this short string is a Lua reserved word.
176 ///
177 /// macros.tsv: `isreserved` → `ts.is_reserved_word()`
178 pub fn is_reserved_word(&self) -> bool {
179 self.kind == StringKind::Short && self.extra.get() > 0
180 }
181
182 /// GC color predicate. Returns `true` if this object is "white" (unreachable)
183 /// in the GC's current wave.
184 ///
185 /// macros.tsv: `iswhite` → `obj.is_white()`
186 ///
187 /// PORT NOTE: GC color management is deferred to Phase D. In Phases A–C all
188 /// objects are reachable via `Rc` reference counts and this always returns
189 /// `false` (nothing is white / unreachable).
190 pub fn is_white(&self) -> bool {
191 // TODO(port): Phase D — check the GC marked byte; stub returns false (all live)
192 false
193 }
194
195 /// Flip GC color from white to the current non-white (resurrect a dead object).
196 ///
197 /// macros.tsv: `changewhite` → `obj.flip_white()`
198 ///
199 /// PORT NOTE: GC color management deferred to Phase D; no-op in Phases A–C.
200 pub fn flip_white(&self) {
201 // TODO(port): Phase D — update the GC marked byte
202 }
203}
204
205impl PartialEq for LuaStringImpl {
206 /// Equality for Lua strings.
207 ///
208 /// For short strings (interned), pointer equality via `Rc::ptr_eq` is sufficient
209 /// and matches `eqshrstr` in C. For long strings, we fall back to byte
210 /// comparison, matching `luaS_eqlngstr` in C.
211 fn eq(&self, other: &Self) -> bool {
212 if self.kind == StringKind::Short && other.kind == StringKind::Short {
213 Rc::ptr_eq(&self.bytes, &other.bytes)
214 } else {
215 self.bytes == other.bytes
216 }
217 }
218}
219
220impl Eq for LuaStringImpl {}
221
222// ── StringPool (was stringtable in lstate.h) ──────────────────────────────────
223
224// PORT NOTE: `StringPool` corresponds to `stringtable` from `lstate.h`, which maps
225// to `src/state.rs` per file_deps.txt. It is defined here because `lstring.c`
226// owns all of the pool's mutation logic. Phase B should reconcile placement.
227//
228// The C `stringtable` used an open-addressing hash table where each bucket was
229// the head of an intrusive singly-linked list threaded through `TString.u.hnext`.
230// In Rust, `TString.u.hnext` is removed per types.tsv. The `HashMap` replaces
231// both the bucket array and the chain: it provides O(1) average-case lookup,
232// automatic rehashing, and eliminates the need for `tablerehash`.
233//
234// `nuse` and `size` are retained for parity with the C invariants that other
235// code may check (e.g. `growstrtab` tests `nuse >= size`).
236
237/// Intern table for short Lua strings. Lives on `GlobalState`.
238///
239/// # C mapping (types.tsv)
240/// ```text
241/// stringtable → StringPool
242/// stringtable.hash → map: HashMap<Box<[u8]>, GcRef<LuaStringImpl>>
243/// stringtable.nuse → nuse: usize
244/// stringtable.size → size: usize
245/// ```
246pub struct StringPool {
247 // PORT NOTE: keyed by owned byte slice; lookup by `&[u8]` via Borrow<[u8]>.
248 map: HashMap<Box<[u8]>, GcRef<LuaStringImpl>>,
249
250 // PERF(port): redundant with map.len() in Rust — keep for C-parity; remove in Phase B
251 nuse: usize,
252
253 // In Rust, HashMap manages its own capacity; this tracks the last requested size.
254 size: usize,
255}
256
257impl StringPool {
258 /// Create an empty pool with `MIN_STR_TAB_SIZE` preallocated capacity.
259 ///
260 /// `tablerehash(tb->hash, 0, MINSTRTABSIZE)` sequence in `luaS_init`.
261 pub fn new() -> Self {
262 StringPool {
263 map: HashMap::with_capacity(MIN_STR_TAB_SIZE),
264 nuse: 0,
265 size: MIN_STR_TAB_SIZE,
266 }
267 }
268}
269
270impl Default for StringPool {
271 fn default() -> Self {
272 Self::new()
273 }
274}
275
276// ── LuaUserData (was Udata in lobject.h) ──────────────────────────────────────
277
278// PORT NOTE: `LuaUserData` corresponds to `Udata` from `lobject.h`, which maps to
279// `src/object.rs` per file_deps.txt. Defined here because `luaS_newudata` lives
280// in `lstring.c`. Phase B should reconcile placement.
281
282/// Full userdata: a GC-tracked object carrying a raw byte payload plus optional
283/// Lua user values and an optional metatable.
284///
285/// # C mapping (types.tsv)
286/// ```text
287/// Udata → LuaUserData
288/// Udata.len → len: usize
289/// Udata.nuvalue → nuvalue: u16 (covered by uv.len() but kept for parity)
290/// Udata.metatable → metatable: Option<GcRef<LuaTable>>
291/// Udata.uv → uv: Vec<LuaValue>
292/// (no direct C field) data: Box<[u8]> — the raw byte payload; C used a flexible
293/// array member laid out past the Udata header via
294/// `udatamemoffset` alignment math.
295/// ```
296pub struct LuaUserDataImpl {
297 pub len: usize,
298 pub nuvalue: u16,
299 // TODO(port): GcRef<LuaTable> — LuaTable not yet defined; Phase B
300 pub metatable: Option<()>,
301 // macros.tsv: setnilvalue → *o = LuaValue::Nil
302 // TODO(port): Vec<LuaValue> — LuaValue not yet defined; Phase B
303 pub uv: Vec<()>,
304 // Port of the raw byte payload that C accessed via udatamemoffset arithmetic.
305 pub data: Box<[u8]>,
306}
307
308// ── Public functions ───────────────────────────────────────────────────────────
309
310// lstring.h: LUAI_FUNC → pub(crate)
311/// Test equality of two long strings.
312///
313/// Two long strings are equal if they have identical byte content. A pointer
314/// equality short-circuit is also applied: if `a` and `b` share the same
315/// underlying `Rc<[u8]>` allocation, they are trivially equal.
316///
317/// # C source
318/// ```c
319///
320/// // size_t len = a->u.lnglen;
321/// // lua_assert(a->tt == LUA_VLNGSTR && b->tt == LUA_VLNGSTR);
322/// // return (a == b) ||
323/// // ((len == b->u.lnglen) &&
324/// // (memcmp(getlngstr(a), getlngstr(b), len) == 0));
325/// // }
326/// ```
327pub(crate) fn eq_long_str(a: &LuaStringImpl, b: &LuaStringImpl) -> bool {
328 // macros.tsv: lua_assert → debug_assert!
329 debug_assert!(a.is_long() && b.is_long(), "eq_long_str: both arguments must be long strings");
330
331 // In Rust: check if the Rc<[u8]> byte buffers are the same allocation
332 if Rc::ptr_eq(&a.bytes, &b.bytes) {
333 return true;
334 }
335
336 // macros.tsv: getlngstr → ts.as_bytes()
337 a.as_bytes() == b.as_bytes()
338}
339
340// lstring.h: LUAI_FUNC → pub(crate)
341/// Hash a byte string with a seed using Lua's FNV-style hash.
342///
343/// This is a pure function with no allocations. The algorithm XORs shifts and
344/// additions over each byte in reverse order, seeded by `seed ^ len`.
345///
346/// # C source
347/// ```c
348///
349/// // unsigned int h = seed ^ cast_uint(l);
350/// // for (; l > 0; l--)
351/// // h ^= ((h<<5) + (h>>2) + cast_byte(str[l - 1]));
352/// // return h;
353/// // }
354/// ```
355///
356/// PORT NOTE: C parenthesises `(h<<5)` and `(h>>2)` explicitly, so the outer
357/// additions are unambiguous despite C's `<<`/`>>` having lower precedence than
358/// `+`. In Rust `<<` and `>>` have higher precedence than `+`, so the same
359/// expression is computed without extra parentheses; `wrapping_add` is used to
360/// match C's unsigned wrap-around arithmetic.
361pub(crate) fn hash_bytes(bytes: &[u8], seed: u32) -> u32 {
362 // macros.tsv: cast_uint → x as u32
363 let mut h: u32 = seed ^ (bytes.len() as u32);
364
365 let mut l = bytes.len();
366 while l > 0 {
367 l -= 1;
368 // macros.tsv: cast_byte → x as u8 (then as u32 for the arithmetic)
369 h ^= (h << 5)
370 .wrapping_add(h >> 2)
371 .wrapping_add(bytes[l] as u32);
372 }
373
374 h
375}
376
377// lstring.h: LUAI_FUNC → pub(crate)
378/// Compute (and cache) the hash of a long string.
379///
380/// The hash for long strings is computed lazily: on first call the hash is
381/// derived from `hash_bytes` using the seed stored in the `hash` field, then
382/// `extra` is set to `1` to record that the hash is now valid. Subsequent calls
383/// return the cached value directly.
384///
385/// Interior mutability (`Cell<u32>` / `Cell<u8>`) allows mutation through a
386/// shared `&LuaStringImpl` reference, which is necessary because `GcRef<LuaStringImpl>`
387/// is `Rc<LuaStringImpl>` and there is no safe way to get `&mut` through an `Rc`.
388///
389/// # C source
390/// ```c
391///
392/// // lua_assert(ts->tt == LUA_VLNGSTR);
393/// // if (ts->extra == 0) { /* no hash? */
394/// // size_t len = ts->u.lnglen;
395/// // ts->hash = luaS_hash(getlngstr(ts), len, ts->hash);
396/// // ts->extra = 1; /* now it has its hash */
397/// // }
398/// // return ts->hash;
399/// // }
400/// ```
401pub(crate) fn hash_long_str(ts: &LuaStringImpl) -> u32 {
402 debug_assert!(ts.is_long(), "hash_long_str: argument must be a long string");
403
404 if ts.extra.get() == 0 {
405 // The initial ts->hash holds the per-state seed (set at construction).
406 let computed = hash_bytes(ts.as_bytes(), ts.hash.get());
407 ts.hash.set(computed);
408 ts.extra.set(1);
409 }
410
411 ts.hash.get()
412}
413
414//
415// PORT NOTE: `tablerehash` walked the intrusive `hnext` chain in each bucket and
416// redistributed `TString *` pointers into new bucket slots. In Rust the
417// `HashMap` in `StringPool` handles its own rehashing automatically whenever its
418// load factor is exceeded or `reserve` / `shrink_to` is called. The entire
419// function is therefore dropped; its effects are subsumed by the HashMap.
420
421// lstring.h: LUAI_FUNC → pub(crate)
422/// Resize the string intern table to approximately `nsize` buckets.
423///
424/// When growing, `HashMap::reserve` hints the desired capacity. When shrinking,
425/// `HashMap::shrink_to` is used as an approximation of the C logic, which
426/// would rehash entries out of the shrinking tail. The C function's graceful
427/// degradation on allocation failure (keep the current size) is preserved:
428/// `HashMap` will simply retain its existing capacity if memory is tight.
429///
430/// # C source
431/// ```c
432///
433/// // stringtable *tb = &G(L)->strt;
434/// // int osize = tb->size;
435/// // TString **newvect;
436/// // if (nsize < osize)
437/// // tablerehash(tb->hash, osize, nsize); /* depopulate shrinking part */
438/// // newvect = luaM_reallocvector(L, tb->hash, osize, nsize, TString*);
439/// // if (l_unlikely(newvect == NULL)) {
440/// // if (nsize < osize)
441/// // tablerehash(tb->hash, nsize, osize); /* restore to original size */
442/// // } else {
443/// // tb->hash = newvect;
444/// // tb->size = nsize;
445/// // if (nsize > osize)
446/// // tablerehash(newvect, osize, nsize);
447/// // }
448/// // }
449/// ```
450///
451/// PORT NOTE: The three calls to `tablerehash` are dropped because `HashMap`
452/// automatically rehashes. The allocation-failure fallback (restore to `osize`)
453/// has no direct analogue; `HashMap` will retain existing capacity on OOM, which
454/// matches the intent.
455// PERF(port): luaS_resize shrink — HashMap::shrink_to() is a hint, not a
456// guarantee; the C code freed exact memory. Profile in Phase B.
457pub(crate) fn resize(state: &mut LuaState, nsize: usize) {
458 let strt = &mut state.global_mut().strt;
459 let osize = strt.size;
460
461 if nsize > osize {
462 let additional = nsize.saturating_sub(strt.map.len());
463 strt.map.reserve(additional);
464 } else if nsize < osize {
465 // PERF(port): shrink_to is a hint; exact shrink not guaranteed in Rust
466 strt.map.shrink_to(nsize);
467 }
468
469 strt.size = nsize;
470}
471
472// lstring.h: LUAI_FUNC → pub(crate)
473/// Clear the API string cache, replacing any GC-white entries with the
474/// preallocated OOM message (which is never collected).
475///
476/// Called by the GC sweep phase to ensure the cache never holds a pointer to a
477/// collected string.
478///
479/// # C source
480/// ```c
481///
482/// // int i, j;
483/// // for (i = 0; i < STRCACHE_N; i++)
484/// // for (j = 0; j < STRCACHE_M; j++) {
485/// // if (iswhite(g->strcache[i][j])) /* will entry be collected? */
486/// // g->strcache[i][j] = g->memerrmsg;
487/// // }
488/// // }
489/// ```
490///
491/// PORT NOTE: Takes `&mut GlobalState` directly (same as the C signature which
492/// takes `global_State *g`, not `lua_State *L`). The caller accesses this via
493/// `state.global_mut()`.
494pub(crate) fn clear_cache(g: &mut GlobalState) {
495 for i in 0..STRCACHE_N {
496 for j in 0..STRCACHE_M {
497 // macros.tsv: iswhite → obj.is_white()
498 if g.strcache[i][j].is_white() {
499 g.strcache[i][j] = g.memerrmsg.clone();
500 }
501 }
502 }
503}
504
505// lstring.h: LUAI_FUNC → pub(crate)
506/// Initialise the string intern table and the API string cache.
507///
508/// Must be called exactly once during VM startup, before any strings are created.
509/// Pre-creates the memory-error message and fixes it in the GC (so it is never
510/// collected), then fills every cache slot with that same string.
511///
512/// # C source
513/// ```c
514///
515/// // global_State *g = G(L);
516/// // int i, j;
517/// // stringtable *tb = &G(L)->strt;
518/// // tb->hash = luaM_newvector(L, MINSTRTABSIZE, TString*);
519/// // tablerehash(tb->hash, 0, MINSTRTABSIZE);
520/// // tb->size = MINSTRTABSIZE;
521/// // g->memerrmsg = luaS_newliteral(L, MEMERRMSG);
522/// // luaC_fix(L, obj2gco(g->memerrmsg));
523/// // for (i = 0; i < STRCACHE_N; i++)
524/// // for (j = 0; j < STRCACHE_M; j++)
525/// // g->strcache[i][j] = g->memerrmsg;
526/// // }
527/// ```
528pub(crate) fn init(state: &mut LuaState) -> Result<(), LuaError> {
529 // tablerehash(tb->hash, 0, MINSTRTABSIZE);
530 // tb->size = MINSTRTABSIZE;
531 // macros.tsv: luaM_newvector → vec![T::default(); n]
532 // PORT NOTE: StringPool::new() sets the initial capacity to MIN_STR_TAB_SIZE,
533 // replacing both the allocation and the tablerehash clear pass.
534 state.global_mut().strt = StringPool::new();
535
536 // macros.tsv: luaS_newliteral → state.intern_str(b"...")
537 let memerrmsg = new_lstr(state, MEMERR_MSG)?;
538
539 // macros.tsv: luaC_fix — not listed; it marks the object as fixed (non-collectable)
540 // TODO(port): call state.gc().fix(memerrmsg.clone()) when GC is wired in Phase D;
541 // in Phases A–C the Rc keeps it alive as long as GlobalState holds the clone
542 let memerrmsg_lt = impl_to_lt(&memerrmsg);
543 state.global_mut().memerrmsg = memerrmsg_lt.clone();
544
545 // for (j = 0; j < STRCACHE_M; j++)
546 // g->strcache[i][j] = g->memerrmsg;
547 for i in 0..STRCACHE_N {
548 for j in 0..STRCACHE_M {
549 state.global_mut().strcache[i][j] = memerrmsg_lt.clone();
550 }
551 }
552
553 Ok(())
554}
555
556// lstring.h: LUAI_FUNC → pub(crate)
557/// Create a new, uninitialized long string of `l` bytes.
558///
559/// The returned string's bytes are all zero. The caller is responsible for
560/// filling the content, if needed; in practice `new_lstr` calls this and then
561/// copies the source bytes in.
562///
563/// # C source
564/// ```c
565///
566/// // TString *ts = createstrobj(L, l, LUA_VLNGSTR, G(L)->seed);
567/// // ts->u.lnglen = l;
568/// // ts->shrlen = 0xFF; /* signals that it is a long string */
569/// // return ts;
570/// // }
571/// ```
572///
573/// PORT NOTE: `ts->u.lnglen = l` and `ts->shrlen = 0xFF` are replaced by the
574/// `StringKind::Long` variant which carries the length implicitly through
575/// `Rc<[u8]>::len()`. The `0xFF` sentinel is no longer needed.
576pub(crate) fn create_long_str(state: &mut LuaState, l: usize) -> GcRef<LuaStringImpl> {
577 let seed = state.global().seed;
578 // PORT NOTE: C's createstrobj allocates uninitialised storage then the caller
579 // fills bytes via memcpy. Rust's create_str_obj constructs with zeroed bytes;
580 // callers (e.g. new_lstr) pass the real bytes directly, eliminating the two-step.
581 create_str_obj(state, &vec![0u8; l], StringKind::Long, seed)
582}
583
584// lstring.h: LUAI_FUNC → pub(crate)
585/// Remove a short string from the intern table.
586///
587/// Called by the GC sweep when a short string is about to be collected.
588///
589/// # C source
590/// ```c
591///
592/// // stringtable *tb = &G(L)->strt;
593/// // TString **p = &tb->hash[lmod(ts->hash, tb->size)];
594/// // while (*p != ts) /* find previous element */
595/// // p = &(*p)->u.hnext;
596/// // *p = (*p)->u.hnext; /* remove element from its list */
597/// // tb->nuse--;
598/// // }
599/// ```
600///
601/// PORT NOTE: The C implementation walks the intrusive `hnext` chain to unlink
602/// `ts`. In Rust the chain does not exist; `HashMap::remove` is O(1) average.
603/// `lmod(ts->hash, tb->size)` (the bucket index) is not needed; the map keys by
604/// byte content.
605pub(crate) fn remove_str(state: &mut LuaState, ts: &LuaStringImpl) {
606 let strt = &mut state.global_mut().strt;
607
608 // while (*p != ts) p = &(*p)->u.hnext;
609 // *p = (*p)->u.hnext;
610 // PORT NOTE: all of the above replaced by HashMap::remove keyed on bytes
611 strt.map.remove(ts.as_bytes());
612
613 strt.nuse = strt.nuse.saturating_sub(1);
614}
615
616// lstring.h: LUAI_FUNC → pub(crate)
617/// Create or retrieve a Lua string from `bytes`.
618///
619/// If `bytes.len() <= MAX_SHORT_LEN` (40), the string is interned: an existing
620/// identical short string is returned if found, otherwise a new one is created
621/// and inserted into the intern table.
622///
623/// If `bytes.len() > MAX_SHORT_LEN`, a new long string is allocated each time
624/// (long strings are never interned).
625///
626/// # C source
627/// ```c
628///
629/// // if (l <= LUAI_MAXSHORTLEN) /* short string? */
630/// // return internshrstr(L, str, l);
631/// // else {
632/// // TString *ts;
633/// // if (l_unlikely(l * sizeof(char) >= (MAX_SIZE - sizeof(TString))))
634/// // luaM_toobig(L);
635/// // ts = luaS_createlngstrobj(L, l);
636/// // memcpy(getlngstr(ts), str, l * sizeof(char));
637/// // return ts;
638/// // }
639/// // }
640/// ```
641pub(crate) fn new_lstr(state: &mut LuaState, bytes: &[u8]) -> Result<GcRef<LuaStringImpl>, LuaError> {
642 if bytes.len() <= MAX_SHORT_LEN {
643 intern_short_str(state, bytes)
644 } else {
645 // luaM_toobig(L);
646 // macros.tsv: luaM_toobig → return Err(LuaError::Memory)
647 // PORT NOTE: sizeof(TString) is a C-specific overhead; in Rust we just
648 // check that the byte count fits within MAX_SIZE.
649 if bytes.len() >= MAX_SIZE {
650 return Err(LuaError::Memory);
651 }
652
653 // memcpy(getlngstr(ts), str, l * sizeof(char));
654 // PORT NOTE: Rather than creating a zeroed buffer and then copying,
655 // we construct the LuaStringImpl directly from `bytes`.
656 let seed = state.global().seed;
657 let h = hash_bytes(bytes, seed);
658 let ts = create_str_obj(state, bytes, StringKind::Long, h);
659 Ok(ts)
660 }
661}
662
663// lstring.h: LUAI_FUNC → pub(crate)
664/// Create or retrieve a Lua string, using a small two-slot LRU cache per hash
665/// bucket to accelerate repeated calls with the same byte sequence.
666///
667/// In C, the cache bucket is selected by casting the C string pointer to a `u32`
668/// (`point2uint`). In Rust, `point2uint` is restricted to `lua-gc`/`lua-coro`
669/// (raw-pointer cast requiring `unsafe`). We substitute a content-hash based
670/// bucket index instead. Functional semantics are identical; cache hit rates for
671/// repeated calls with the same `bytes` may differ.
672///
673/// # C source
674/// ```c
675///
676/// // unsigned int i = point2uint(str) % STRCACHE_N; /* hash */
677/// // int j;
678/// // TString **p = G(L)->strcache[i];
679/// // for (j = 0; j < STRCACHE_M; j++) {
680/// // if (strcmp(str, getstr(p[j])) == 0) /* hit? */
681/// // return p[j]; /* that is it */
682/// // }
683/// // /* normal route */
684/// // for (j = STRCACHE_M - 1; j > 0; j--)
685/// // p[j] = p[j - 1]; /* move out last element */
686/// // p[0] = luaS_newlstr(L, str, strlen(str));
687/// // return p[0];
688/// // }
689/// ```
690///
691/// PORT NOTE: `point2uint(str) % STRCACHE_N` used the raw pointer address as a
692/// fast key, exploiting the fact that C string literals have stable addresses.
693/// In Rust we use `hash_bytes(bytes, seed) % STRCACHE_N` instead. The replacement
694/// is fully safe and has identical semantics (but different cache behaviour for
695/// calls from different `&[u8]` slices with identical content).
696pub(crate) fn new(state: &mut LuaState, bytes: &[u8]) -> Result<GcRef<LuaStringImpl>, LuaError> {
697 // PORT NOTE: pointer hash replaced by content hash (see doc above)
698 let seed = state.global().seed;
699 let i = (hash_bytes(bytes, seed) as usize) % STRCACHE_N;
700
701 // macros.tsv: getstr → ts.as_bytes()
702 for j in 0..STRCACHE_M {
703 if state.global().strcache[i][j].as_bytes() == bytes {
704 // TODO(phase-b): strcache currently holds lua_types::LuaString; rebuild
705 // a rich LuaStringImpl from the bytes. Phase B should unify the types.
706 let cached_bytes = state.global().strcache[i][j].as_bytes().to_vec();
707 // TODO(D-1c-bridge): LuaStringImpl is the rich local type; state helper produces lua_types::LuaString
708 return Ok(GcRef::new(LuaStringImpl {
709 bytes: cached_bytes.into(),
710 kind: if bytes.len() <= MAX_SHORT_LEN { StringKind::Short } else { StringKind::Long },
711 hash: Cell::new(hash_bytes(bytes, seed)),
712 extra: Cell::new(0),
713 }));
714 }
715 }
716
717 // Create the string before mutating the cache
718 let new_str = new_lstr(state, bytes)?;
719
720 // Shift entries toward the back to make room at slot 0
721 for j in (1..STRCACHE_M).rev() {
722 // Clone first to avoid borrow conflict between getter and setter
723 let prev = state.global().strcache[i][j - 1].clone();
724 state.global_mut().strcache[i][j] = prev;
725 }
726
727 state.global_mut().strcache[i][0] = impl_to_lt(&new_str);
728
729 Ok(new_str)
730}
731
732// lstring.h: LUAI_FUNC → pub(crate)
733/// Allocate a new full userdata of `s` raw bytes with `nuvalue` Lua user values.
734///
735/// The raw byte payload is zeroed. All user values are initialised to `nil`.
736/// The metatable is `None`.
737///
738/// # C source
739/// ```c
740///
741/// // Udata *u;
742/// // int i;
743/// // GCObject *o;
744/// // if (l_unlikely(s > MAX_SIZE - udatamemoffset(nuvalue)))
745/// // luaM_toobig(L);
746/// // o = luaC_newobj(L, LUA_VUSERDATA, sizeudata(nuvalue, s));
747/// // u = gco2u(o);
748/// // u->len = s;
749/// // u->nuvalue = nuvalue;
750/// // u->metatable = NULL;
751/// // for (i = 0; i < nuvalue; i++)
752/// // setnilvalue(&u->uv[i].uv);
753/// // return u;
754/// // }
755/// ```
756pub(crate) fn new_userdata(
757 state: &mut LuaState,
758 s: usize,
759 nuvalue: usize,
760) -> Result<GcRef<LuaUserDataImpl>, LuaError> {
761 // luaM_toobig(L);
762 // macros.tsv: luaM_toobig → return Err(LuaError::Memory)
763 // TODO(port): udatamemoffset(nuvalue) computes C-specific alignment padding
764 // for the flexible-array Udata layout. In Rust, LuaUserData allocates `data`
765 // and `uv` separately (Box<[u8]> + Vec<LuaValue>); the combined size bound
766 // differs. Conservative check: reject if s alone exceeds MAX_SIZE.
767 if s > MAX_SIZE {
768 return Err(LuaError::Memory);
769 }
770
771 // u = gco2u(o);
772 // TODO(port): register with GC tracking (state.gc().new_obj(...));
773 // Phase A–C stub: allocate via Rc without GC registration.
774 // TODO(D-1c-bridge): LuaUserDataImpl is the rich local type; state.new_userdata is still todo!()
775 let u = GcRef::new(LuaUserDataImpl {
776 len: s,
777 nuvalue: nuvalue as u16,
778 metatable: None,
779 // macros.tsv: setnilvalue → *o = LuaValue::Nil
780 // TODO(port): Vec<LuaValue> once LuaValue is defined in lua-types
781 uv: vec![(); nuvalue],
782 // Raw byte payload; zero-initialised.
783 data: vec![0u8; s].into_boxed_slice(),
784 });
785
786 // TODO(port): push into state.global_mut().allgc for GC tracking (Phase D)
787 Ok(u)
788}
789
790// ── Private helpers ───────────────────────────────────────────────────────────
791
792/// Allocate and initialise a new `LuaStringImpl` with the given bytes, kind, and hash.
793///
794/// In C, `createstrobj` allocated uninitialised memory via `luaC_newobj` and set
795/// the header fields; the caller then filled the content via `memcpy`. In Rust
796/// we construct the string directly from the provided `bytes`, eliminating the
797/// two-step pattern.
798///
799/// # C source
800/// ```c
801///
802/// // TString *ts;
803/// // GCObject *o;
804/// // size_t totalsize = sizelstring(l);
805/// // o = luaC_newobj(L, tag, totalsize);
806/// // ts = gco2ts(o);
807/// // ts->hash = h;
808/// // ts->extra = 0;
809/// // getstr(ts)[l] = '\0'; /* ending 0 */
810/// // return ts;
811/// // }
812/// ```
813///
814/// PORT NOTE: `sizelstring(l)` computed the total allocation size including the
815/// nul terminator. In Rust, `Rc<[u8]>` stores the bytes without a nul; the
816/// nul terminator is dropped. Callers that need a nul-terminated `*const u8`
817/// for FFI must use a temporary `CString` or equivalent.
818fn create_str_obj(
819 state: &mut LuaState,
820 bytes: &[u8],
821 kind: StringKind,
822 hash: u32,
823) -> GcRef<LuaStringImpl> {
824 // macros.tsv: luaM_newobject → state.gc().new_obj(tag, sz)
825 // TODO(port): register with GC tracking list (state.global_mut().allgc)
826 // in Phase D; Phase A–C creates a bare Rc
827 let _ = state; // state needed for GC registration in Phase D
828 // TODO(D-1c-bridge): LuaStringImpl is the rich local type; state helper produces lua_types::LuaString
829 GcRef::new(LuaStringImpl {
830 hash: Cell::new(hash),
831 extra: Cell::new(0),
832 // PORT NOTE: we receive bytes directly; no separate memcpy step needed
833 bytes: Rc::from(bytes),
834 kind,
835 })
836}
837
838/// Grow the string intern table, first attempting a GC collection if the table is
839/// at its absolute maximum size.
840///
841/// # C source
842/// ```c
843///
844/// // if (l_unlikely(tb->nuse == MAX_INT)) { /* too many strings? */
845/// // luaC_fullgc(L, 1); /* try to free some... */
846/// // if (tb->nuse == MAX_INT) /* still too many? */
847/// // luaM_error(L); /* cannot even create a message... */
848/// // }
849/// // if (tb->size <= MAXSTRTB / 2) /* can grow string table? */
850/// // luaS_resize(L, tb->size * 2);
851/// // }
852/// ```
853fn grow_str_tab(state: &mut LuaState) -> Result<(), LuaError> {
854 // macros.tsv: MAX_INT → i32::MAX
855 let nuse = state.global().strt.nuse;
856 if nuse == i32::MAX as usize {
857 // macros.tsv: luaC_fullgc → state.gc().full_collect()
858 // TODO(port): state.gc().full_collect() — GC not yet wired in Phase A–C; no-op
859 // (When GC is live this call may reduce nuse by sweeping dead short strings.)
860
861 // macros.tsv: luaM_error → return Err(LuaError::Memory)
862 if state.global().strt.nuse == i32::MAX as usize {
863 return Err(LuaError::Memory);
864 }
865 }
866
867 let size = state.global().strt.size;
868 if size <= MAX_STR_TAB / 2 {
869 resize(state, size * 2);
870 }
871
872 Ok(())
873}
874
875/// Look up `bytes` in the intern table; create and insert a new short string if
876/// not found.
877///
878/// The `isdead` / `changewhite` resurrection path is elided in Phases A–C because
879/// `Rc`-based reference counting keeps objects alive until all references drop
880/// (there are no dead-but-not-collected strings in Phase A–C).
881///
882/// # C source
883/// ```c
884///
885/// // TString *ts;
886/// // global_State *g = G(L);
887/// // stringtable *tb = &g->strt;
888/// // unsigned int h = luaS_hash(str, l, g->seed);
889/// // TString **list = &tb->hash[lmod(h, tb->size)];
890/// // lua_assert(str != NULL);
891/// // for (ts = *list; ts != NULL; ts = ts->u.hnext) {
892/// // if (l == ts->shrlen && (memcmp(str, getshrstr(ts), l) == 0)) {
893/// // if (isdead(g, ts)) changewhite(ts); /* resurrect it */
894/// // return ts;
895/// // }
896/// // }
897/// // if (tb->nuse >= tb->size) {
898/// // growstrtab(L, tb);
899/// // list = &tb->hash[lmod(h, tb->size)];
900/// // }
901/// // ts = createstrobj(L, l, LUA_VSHRSTR, h);
902/// // ts->shrlen = cast_byte(l);
903/// // memcpy(getshrstr(ts), str, l);
904/// // ts->u.hnext = *list;
905/// // *list = ts;
906/// // tb->nuse++;
907/// // return ts;
908/// // }
909/// ```
910///
911/// PORT NOTE: `lmod(h, tb->size)` (power-of-two bucket modulo via
912/// `macros.tsv: lmod → (s & (size - 1)) as usize`) and the `hnext` chain walk
913/// are both gone. `HashMap::get` replaces the linear bucket scan.
914fn intern_short_str(
915 state: &mut LuaState,
916 bytes: &[u8],
917) -> Result<GcRef<LuaStringImpl>, LuaError> {
918 // In Rust, &[u8] slices are never null; the assertion is trivially satisfied.
919
920 let seed = state.global().seed;
921 let h = hash_bytes(bytes, seed);
922
923 // PORT NOTE: intrusive hnext chain replaced by HashMap lookup
924 // Clone the existing GcRef<LuaStringImpl> so the immutable borrow on `state` ends
925 // before any mutable access below.
926 let existing = state.global().strt.map.get(bytes).cloned();
927 if let Some(ts) = existing {
928 // macros.tsv: isdead → g.is_dead(obj); changewhite → obj.flip_white()
929 // PORT NOTE: GC color management deferred to Phase D; in Phases A–C all
930 // Rc-held objects are live by definition (Rc keeps them alive).
931 return Ok(ts);
932 }
933
934 let needs_grow = {
935 let strt = &state.global().strt;
936 strt.nuse >= strt.size
937 };
938 if needs_grow {
939 grow_str_tab(state)?;
940 }
941
942 // ts->shrlen = cast_byte(l); — encoded in StringKind::Short
943 // memcpy(getshrstr(ts), str, l); — bytes passed directly to create_str_obj
944 let ts = create_str_obj(state, bytes, StringKind::Short, h);
945
946 state
947 .global_mut()
948 .strt
949 .map
950 .insert(bytes.to_vec().into_boxed_slice(), ts.clone());
951 state.global_mut().strt.nuse += 1;
952
953 Ok(ts)
954}
955
956// ── Re-export marker for type defined here ────────────────────────────────────
957
958// TODO(port): LuaError is used in function signatures above but is not yet defined
959// in lua-types. Phase B must add LuaError to lua-types/src/error.rs per
960// PORTING.md §6 before this file can compile. The expected variants are:
961// LuaError::Runtime(LuaValue)
962// LuaError::Memory
963// LuaError::Syntax(LuaValue)
964// ... (full list in PORTING.md §6)
965// For now, reference LuaError as an opaque import from the future lua-types crate.
966use lua_types::LuaError;
967
968// ──────────────────────────────────────────────────────────────────────────────
969// PORT STATUS
970// source: src/lstring.c (275 lines, 15 functions)
971// src/lstring.h (57 lines; merged)
972// target_crate: lua-vm
973// confidence: medium
974// todos: 14
975// port_notes: 30
976// unsafe_blocks: 0 (must be 0 outside explicit unsafe-budget crates)
977// notes: Logic is faithful to the C. The two largest structural changes
978// are: (1) `tablerehash` + intrusive `hnext` chain replaced by
979// `HashMap` in `StringPool`; (2) `luaS_new`'s `point2uint`
980// pointer-hash replaced by a content hash (safe, same semantics).
981// Key TODOs: GC registration in create_str_obj (Phase D),
982// GC registration in new_userdata (Phase D), luaC_fix in init
983// (Phase D), full_collect stub in grow_str_tab (Phase D),
984// udatamemoffset size check in new_userdata (Phase B),
985// LuaValue in LuaUserData.uv (Phase B), LuaError import path
986// (Phase B), GcRef typedef (Phase B). Phase B priority: wire
987// import paths for LuaState, GlobalState, LuaError, LuaValue,
988// and move LuaStringImpl/StringPool/LuaUserData to their canonical
989// modules (object.rs / state.rs).
990// ──────────────────────────────────────────────────────────────────────────────