1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
//! String table and interned-string operations — port of `lstring.c` + `lstring.h`.
//!
//! Provides two key abstractions:
//!
//! - [`LuaStringImpl`]: the Lua string value, stored as a reference-counted byte slice.
//! Short strings (`<= MAX_SHORT_LEN` bytes) are interned in the process-global
//! [`StringPool`]; long strings are heap-allocated on each creation and never
//! interned.
//!
//! - [`StringPool`]: the intern table for short strings, stored on `GlobalState`.
//! Replaces the C `stringtable` struct, which used an open-addressing hash table
//! with intrusive chaining through `TString.u.hnext`. In Rust the intrusive
//! chain is dropped; a `HashMap` provides O(1) lookup and automatic rehashing.
//! See PORT NOTE on [`StringPool`] for the full rationale.
//!
//! The `lstring.h` header is merged into this module per PORTING.md §1.
//!
//! # C source files
//! - `reference/lua-5.4.7/src/lstring.c` (275 lines, 15 functions)
//! - `reference/lua-5.4.7/src/lstring.h` (57 lines; merged here)
use Cell;
use crate*;
use HashMap;
use Rc;
// TODO(port): these import paths will resolve once Phase B wires the crate graph.
// `LuaState` and `GlobalState` live in crate::state (src/state.rs, from lstate.c).
// `LuaValue` and `LuaError` live in lua_types (crates/lua-types/src/).
use crateLuaState;
// PORT NOTE: `GcRef<T>` is the lua-types newtype around `Rc<T>` per PORT_STRATEGY §3.4.
// Re-imported here so all string-pool entries share identity with state.rs / api.rs.
use GcRef;
/// Phase-B bridge: converts a lua-vm rich `LuaStringImpl` into a `lua_types::LuaString`.
/// The two types track different metadata (short/long flag, extra byte) and a real
/// merge belongs in Phase B once `lua-types::LuaString` grows the needed fields.
// ── Constants (lstring.h macros → macros.tsv) ─────────────────────────────────
// macros.tsv: MEMERRMSG → const MEMERR_MSG: &[u8] = b"not enough memory"
/// Pre-allocated OOM error message. Must be created before the allocator
/// can fail so that the GC can always hand back a valid error string.
pub const MEMERR_MSG: & = b"not enough memory";
// macros.tsv: MINSTRTABSIZE → const MIN_STR_TAB_SIZE: usize = 128
const MIN_STR_TAB_SIZE: usize = 128;
// macros.tsv: STRCACHE_N → const STRCACHE_N: usize = 53
const STRCACHE_N: usize = 53;
// macros.tsv: STRCACHE_M → const STRCACHE_M: usize = 2
const STRCACHE_M: usize = 2;
// macros.tsv: LUAI_MAXSHORTLEN → const MAX_SHORT_LEN: usize = 40
pub const MAX_SHORT_LEN: usize = 40;
// macros.tsv: MAX_SIZE → const MAX_SIZE: usize = if size_of::<usize>() < size_of::<i64>() { usize::MAX } else { i64::MAX as usize }
const MAX_SIZE: usize = if < else ;
// macros.tsv: luaM_limitN → std::cmp::min(n, usize::MAX / std::mem::size_of::<T>())
// cast_int → x as i32
// Rust: upper bound on the number of hash buckets; derived from MAX_INT / pointer size.
const MAX_STR_TAB: usize = i32MAX as usize / ;
// macros.tsv: sizelstring → drop — Rust allocates via Box<[u8]> / Rc<[u8]>
// PORT NOTE: dropped entirely; Rust uses Rc<[u8]> which carries its own length.
// macros.tsv: luaS_newliteral → state.intern_str(b"...")
// PORT NOTE: translated at call sites as `new_lstr(state, b"literal")`.
// macros.tsv: isreserved → ts.is_reserved_word()
// PORT NOTE: translated at call sites as the `LuaStringImpl::is_reserved_word()` method.
// macros.tsv: eqshrstr → Rc::ptr_eq(a, b)
// PORT NOTE: short strings are interned so pointer equality suffices.
// Translated at call sites as `Rc::ptr_eq(a, b)`.
// ── LuaStringImpl (was TString in lobject.h) ─────────────────────────────────────
// PORT NOTE: `LuaStringImpl` corresponds to `TString` from `lobject.h`, which maps to
// `src/object.rs` per file_deps.txt. It is defined here (in `string.rs`) because
// `lstring.c` owns the string-table internals and most of the type's behaviour.
// Phase B should reconcile: either keep it here and re-export from `object.rs`,
// or move it there and import it from `string.rs`.
/// Whether a Lua string is short (interned) or long (not interned).
///
/// Corresponds to `LUA_VSHRSTR` / `LUA_VLNGSTR` tags from `lobject.h`.
///
/// # C mapping (types.tsv)
/// ```text
/// LUA_VSHRSTR → LuaStringImpl::Short (shrlen holds length 0..=40)
/// LUA_VLNGSTR → LuaStringImpl::Long (shrlen = 0xFF sentinel; u.lnglen holds length)
/// ```
/// A Lua string: an immutable, reference-counted byte sequence.
///
/// Short strings (`<= MAX_SHORT_LEN = 40` bytes) are interned in the
/// [`StringPool`] on `GlobalState`; two short strings with the same bytes
/// are guaranteed to be the same `GcRef` (pointer equality via `Rc::ptr_eq`).
///
/// Long strings are heap-allocated independently and never interned. Their
/// hash is computed lazily on first call to [`hash_long_str`] and cached via
/// interior mutability (`Cell<u32>`).
///
/// # C mapping (types.tsv)
/// ```text
/// TString → LuaStringImpl
/// TString.extra → extra: Cell<u8> (reserved-word idx for Short; hash-ready flag for Long)
/// TString.shrlen → kind: StringKind (0xFF sentinel replaced by enum variant)
/// TString.hash → hash: Cell<u32>
/// TString.u.lnglen → bytes.len() (length implicit in Rc<[u8]>)
/// TString.u.hnext → (removed) (intrusive chain gone; StringPool uses HashMap)
/// TString.contents → bytes: Rc<[u8]>
/// ```
// ── StringPool (was stringtable in lstate.h) ──────────────────────────────────
// PORT NOTE: `StringPool` corresponds to `stringtable` from `lstate.h`, which maps
// to `src/state.rs` per file_deps.txt. It is defined here because `lstring.c`
// owns all of the pool's mutation logic. Phase B should reconcile placement.
//
// The C `stringtable` used an open-addressing hash table where each bucket was
// the head of an intrusive singly-linked list threaded through `TString.u.hnext`.
// In Rust, `TString.u.hnext` is removed per types.tsv. The `HashMap` replaces
// both the bucket array and the chain: it provides O(1) average-case lookup,
// automatic rehashing, and eliminates the need for `tablerehash`.
//
// `nuse` and `size` are retained for parity with the C invariants that other
// code may check (e.g. `growstrtab` tests `nuse >= size`).
/// Intern table for short Lua strings. Lives on `GlobalState`.
///
/// # C mapping (types.tsv)
/// ```text
/// stringtable → StringPool
/// stringtable.hash → map: HashMap<Box<[u8]>, GcRef<LuaStringImpl>>
/// stringtable.nuse → nuse: usize
/// stringtable.size → size: usize
/// ```
// ── LuaUserData (was Udata in lobject.h) ──────────────────────────────────────
// PORT NOTE: `LuaUserData` corresponds to `Udata` from `lobject.h`, which maps to
// `src/object.rs` per file_deps.txt. Defined here because `luaS_newudata` lives
// in `lstring.c`. Phase B should reconcile placement.
/// Full userdata: a GC-tracked object carrying a raw byte payload plus optional
/// Lua user values and an optional metatable.
///
/// # C mapping (types.tsv)
/// ```text
/// Udata → LuaUserData
/// Udata.len → len: usize
/// Udata.nuvalue → nuvalue: u16 (covered by uv.len() but kept for parity)
/// Udata.metatable → metatable: Option<GcRef<LuaTable>>
/// Udata.uv → uv: Vec<LuaValue>
/// (no direct C field) data: Box<[u8]> — the raw byte payload; C used a flexible
/// array member laid out past the Udata header via
/// `udatamemoffset` alignment math.
/// ```
// ── Public functions ───────────────────────────────────────────────────────────
// lstring.h: LUAI_FUNC → pub(crate)
/// Hash a byte string with a seed using Lua's FNV-style hash.
///
/// This is a pure function with no allocations. The algorithm XORs shifts and
/// additions over each byte in reverse order, seeded by `seed ^ len`.
///
/// # C source
/// ```c
///
/// // unsigned int h = seed ^ cast_uint(l);
/// // for (; l > 0; l--)
/// // h ^= ((h<<5) + (h>>2) + cast_byte(str[l - 1]));
/// // return h;
/// // }
/// ```
///
/// PORT NOTE: C parenthesises `(h<<5)` and `(h>>2)` explicitly, so the outer
/// additions are unambiguous despite C's `<<`/`>>` having lower precedence than
/// `+`. In Rust `<<` and `>>` have higher precedence than `+`, so the same
/// expression is computed without extra parentheses; `wrapping_add` is used to
/// match C's unsigned wrap-around arithmetic.
pub
//
// PORT NOTE: `tablerehash` walked the intrusive `hnext` chain in each bucket and
// redistributed `TString *` pointers into new bucket slots. In Rust the
// `HashMap` in `StringPool` handles its own rehashing automatically whenever its
// load factor is exceeded or `reserve` / `shrink_to` is called. The entire
// function is therefore dropped; its effects are subsumed by the HashMap.
// lstring.h: LUAI_FUNC → pub(crate)
/// Resize the string intern table to approximately `nsize` buckets.
///
/// When growing, `HashMap::reserve` hints the desired capacity. When shrinking,
/// `HashMap::shrink_to` is used as an approximation of the C logic, which
/// would rehash entries out of the shrinking tail. The C function's graceful
/// degradation on allocation failure (keep the current size) is preserved:
/// `HashMap` will simply retain its existing capacity if memory is tight.
///
/// # C source
/// ```c
///
/// // stringtable *tb = &G(L)->strt;
/// // int osize = tb->size;
/// // TString **newvect;
/// // if (nsize < osize)
/// // tablerehash(tb->hash, osize, nsize); /* depopulate shrinking part */
/// // newvect = luaM_reallocvector(L, tb->hash, osize, nsize, TString*);
/// // if (l_unlikely(newvect == NULL)) {
/// // if (nsize < osize)
/// // tablerehash(tb->hash, nsize, osize); /* restore to original size */
/// // } else {
/// // tb->hash = newvect;
/// // tb->size = nsize;
/// // if (nsize > osize)
/// // tablerehash(newvect, osize, nsize);
/// // }
/// // }
/// ```
///
/// PORT NOTE: The three calls to `tablerehash` are dropped because `HashMap`
/// automatically rehashes. The allocation-failure fallback (restore to `osize`)
/// has no direct analogue; `HashMap` will retain existing capacity on OOM, which
/// matches the intent.
// PERF(port): luaS_resize shrink — HashMap::shrink_to() is a hint, not a
// guarantee; the C code freed exact memory. Profile in Phase B.
pub
// lstring.h: LUAI_FUNC → pub(crate)
/// Initialise the string intern table and the API string cache.
///
/// Must be called exactly once during VM startup, before any strings are created.
/// Pre-creates the memory-error message and fixes it in the GC (so it is never
/// collected), then fills every cache slot with that same string.
///
/// # C source
/// ```c
///
/// // global_State *g = G(L);
/// // int i, j;
/// // stringtable *tb = &G(L)->strt;
/// // tb->hash = luaM_newvector(L, MINSTRTABSIZE, TString*);
/// // tablerehash(tb->hash, 0, MINSTRTABSIZE);
/// // tb->size = MINSTRTABSIZE;
/// // g->memerrmsg = luaS_newliteral(L, MEMERRMSG);
/// // luaC_fix(L, obj2gco(g->memerrmsg));
/// // for (i = 0; i < STRCACHE_N; i++)
/// // for (j = 0; j < STRCACHE_M; j++)
/// // g->strcache[i][j] = g->memerrmsg;
/// // }
/// ```
pub
// lstring.h: LUAI_FUNC → pub(crate)
/// Create or retrieve a Lua string from `bytes`.
///
/// If `bytes.len() <= MAX_SHORT_LEN` (40), the string is interned: an existing
/// identical short string is returned if found, otherwise a new one is created
/// and inserted into the intern table.
///
/// If `bytes.len() > MAX_SHORT_LEN`, a new long string is allocated each time
/// (long strings are never interned).
///
/// # C source
/// ```c
///
/// // if (l <= LUAI_MAXSHORTLEN) /* short string? */
/// // return internshrstr(L, str, l);
/// // else {
/// // TString *ts;
/// // if (l_unlikely(l * sizeof(char) >= (MAX_SIZE - sizeof(TString))))
/// // luaM_toobig(L);
/// // ts = luaS_createlngstrobj(L, l);
/// // memcpy(getlngstr(ts), str, l * sizeof(char));
/// // return ts;
/// // }
/// // }
/// ```
pub
// lstring.h: LUAI_FUNC → pub(crate)
/// Create or retrieve a Lua string, using a small two-slot LRU cache per hash
/// bucket to accelerate repeated calls with the same byte sequence.
///
/// In C, the cache bucket is selected by casting the C string pointer to a `u32`
/// (`point2uint`). In Rust, `point2uint` is restricted to `lua-gc`/`lua-coro`
/// (raw-pointer cast requiring `unsafe`). We substitute a content-hash based
/// bucket index instead. Functional semantics are identical; cache hit rates for
/// repeated calls with the same `bytes` may differ.
///
/// # C source
/// ```c
///
/// // unsigned int i = point2uint(str) % STRCACHE_N; /* hash */
/// // int j;
/// // TString **p = G(L)->strcache[i];
/// // for (j = 0; j < STRCACHE_M; j++) {
/// // if (strcmp(str, getstr(p[j])) == 0) /* hit? */
/// // return p[j]; /* that is it */
/// // }
/// // /* normal route */
/// // for (j = STRCACHE_M - 1; j > 0; j--)
/// // p[j] = p[j - 1]; /* move out last element */
/// // p[0] = luaS_newlstr(L, str, strlen(str));
/// // return p[0];
/// // }
/// ```
///
/// PORT NOTE: `point2uint(str) % STRCACHE_N` used the raw pointer address as a
/// fast key, exploiting the fact that C string literals have stable addresses.
/// In Rust we use `hash_bytes(bytes, seed) % STRCACHE_N` instead. The replacement
/// is fully safe and has identical semantics (but different cache behaviour for
/// calls from different `&[u8]` slices with identical content).
pub
// ── Private helpers ───────────────────────────────────────────────────────────
/// Allocate and initialise a new `LuaStringImpl` with the given bytes, kind, and hash.
///
/// In C, `createstrobj` allocated uninitialised memory via `luaC_newobj` and set
/// the header fields; the caller then filled the content via `memcpy`. In Rust
/// we construct the string directly from the provided `bytes`, eliminating the
/// two-step pattern.
///
/// # C source
/// ```c
///
/// // TString *ts;
/// // GCObject *o;
/// // size_t totalsize = sizelstring(l);
/// // o = luaC_newobj(L, tag, totalsize);
/// // ts = gco2ts(o);
/// // ts->hash = h;
/// // ts->extra = 0;
/// // getstr(ts)[l] = '\0'; /* ending 0 */
/// // return ts;
/// // }
/// ```
///
/// PORT NOTE: `sizelstring(l)` computed the total allocation size including the
/// nul terminator. In Rust, `Rc<[u8]>` stores the bytes without a nul; the
/// nul terminator is dropped. Callers that need a nul-terminated `*const u8`
/// for FFI must use a temporary `CString` or equivalent.
/// Grow the string intern table, first attempting a GC collection if the table is
/// at its absolute maximum size.
///
/// # C source
/// ```c
///
/// // if (l_unlikely(tb->nuse == MAX_INT)) { /* too many strings? */
/// // luaC_fullgc(L, 1); /* try to free some... */
/// // if (tb->nuse == MAX_INT) /* still too many? */
/// // luaM_error(L); /* cannot even create a message... */
/// // }
/// // if (tb->size <= MAXSTRTB / 2) /* can grow string table? */
/// // luaS_resize(L, tb->size * 2);
/// // }
/// ```
/// Look up `bytes` in the intern table; create and insert a new short string if
/// not found.
///
/// The `isdead` / `changewhite` resurrection path is elided in Phases A–C because
/// `Rc`-based reference counting keeps objects alive until all references drop
/// (there are no dead-but-not-collected strings in Phase A–C).
///
/// # C source
/// ```c
///
/// // TString *ts;
/// // global_State *g = G(L);
/// // stringtable *tb = &g->strt;
/// // unsigned int h = luaS_hash(str, l, g->seed);
/// // TString **list = &tb->hash[lmod(h, tb->size)];
/// // lua_assert(str != NULL);
/// // for (ts = *list; ts != NULL; ts = ts->u.hnext) {
/// // if (l == ts->shrlen && (memcmp(str, getshrstr(ts), l) == 0)) {
/// // if (isdead(g, ts)) changewhite(ts); /* resurrect it */
/// // return ts;
/// // }
/// // }
/// // if (tb->nuse >= tb->size) {
/// // growstrtab(L, tb);
/// // list = &tb->hash[lmod(h, tb->size)];
/// // }
/// // ts = createstrobj(L, l, LUA_VSHRSTR, h);
/// // ts->shrlen = cast_byte(l);
/// // memcpy(getshrstr(ts), str, l);
/// // ts->u.hnext = *list;
/// // *list = ts;
/// // tb->nuse++;
/// // return ts;
/// // }
/// ```
///
/// PORT NOTE: `lmod(h, tb->size)` (power-of-two bucket modulo via
/// `macros.tsv: lmod → (s & (size - 1)) as usize`) and the `hnext` chain walk
/// are both gone. `HashMap::get` replaces the linear bucket scan.
// ── Re-export marker for type defined here ────────────────────────────────────
// TODO(port): LuaError is used in function signatures above but is not yet defined
// in lua-types. Phase B must add LuaError to lua-types/src/error.rs per
// PORTING.md §6 before this file can compile. The expected variants are:
// LuaError::Runtime(LuaValue)
// LuaError::Memory
// LuaError::Syntax(LuaValue)
// ... (full list in PORTING.md §6)
// For now, reference LuaError as an opaque import from the future lua-types crate.
use LuaError;
// ──────────────────────────────────────────────────────────────────────────────
// PORT STATUS
// source: src/lstring.c (275 lines, 15 functions)
// src/lstring.h (57 lines; merged)
// target_crate: lua-vm
// confidence: medium
// todos: 14
// port_notes: 30
// unsafe_blocks: 0 (must be 0 outside explicit unsafe-budget crates)
// notes: Logic is faithful to the C. The two largest structural changes
// are: (1) `tablerehash` + intrusive `hnext` chain replaced by
// `HashMap` in `StringPool`; (2) `luaS_new`'s `point2uint`
// pointer-hash replaced by a content hash (safe, same semantics).
// Key TODOs: GC registration in create_str_obj (Phase D),
// GC registration in new_userdata (Phase D), luaC_fix in init
// (Phase D), full_collect stub in grow_str_tab (Phase D),
// udatamemoffset size check in new_userdata (Phase B),
// LuaValue in LuaUserData.uv (Phase B), LuaError import path
// (Phase B), GcRef typedef (Phase B). Phase B priority: wire
// import paths for LuaState, GlobalState, LuaError, LuaValue,
// and move LuaStringImpl/StringPool/LuaUserData to their canonical
// modules (object.rs / state.rs).
// ──────────────────────────────────────────────────────────────────────────────