seq_core/seqstring.rs
1//! SeqString - Arena or Globally Allocated Byte String
2//!
3//! Strings in Seq are sequences of arbitrary bytes — there is **no
4//! UTF-8 invariant** on this type. Byte-clean operations (concat,
5//! length-in-bytes, channel send, network I/O, file I/O of binary
6//! content, crypto inputs) work on any input. Text-level operations
7//! (codepoint length, case folding, regex Unicode classes, JSON
8//! escaping) call [`SeqString::as_str`] which validates UTF-8 at the
9//! boundary and returns `Option<&str>`; on invalid bytes those ops
10//! fail loudly with the standard `(value Bool)` failure tuple.
11//!
//! Two allocation sources are supported:
13//! 1. Thread-local arena (fast, bulk-freed on strand exit)
14//! 2. Global allocator (persists across arena resets, used for
15//! cross-strand transfer)
16//!
17//! See `docs/design/STRING_BYTE_CLEANLINESS.md` for the full design.
18
19use crate::arena;
20use std::fmt;
21
/// Byte string that tracks its allocation source.
///
/// The value is a raw `(ptr, len)` view plus two bookkeeping fields
/// (`capacity`, `global`) that tell `Drop` whether the buffer must be
/// freed and, if so, with what allocation size.
///
/// # Safety Invariants
/// - If `global=true` and `capacity > 0`: `ptr` points to a
///   global-allocated byte buffer whose memory matches `len`/`capacity`;
///   the buffer is freed on `Drop`.
/// - If `global=true` and `capacity == 0`: the value is treated as
///   interned/static (this is what `is_interned` reports) and `Drop`
///   never frees it.
/// - If `global=false`: `ptr` points into the thread-local arena; the
///   arena owns the memory and frees it in bulk on strand exit.
/// - The byte content is *not* required to be valid UTF-8.
/// - For global strings: `capacity` must match the original `Vec<u8>`'s
///   capacity so deallocation is correctly sized.
pub struct SeqString {
    /// Start of the byte buffer described by `len`.
    ptr: *const u8,
    /// Number of bytes readable at `ptr`.
    len: usize,
    /// Allocation capacity recorded for deallocation.
    capacity: usize, // Only meaningful for global strings
    /// `false` for arena-backed values, `true` otherwise.
    global: bool,
}
39
40// Implement PartialEq manually to compare content (bytes), not pointers.
41impl PartialEq for SeqString {
42 fn eq(&self, other: &Self) -> bool {
43 self.as_bytes() == other.as_bytes()
44 }
45}
46
47impl Eq for SeqString {}
48
// Safety: SeqString is Send because:
// - Global strings are truly independent (owned heap allocation)
// - Arena strings are cloned to global on channel send (see Clone impl,
//   which always returns a global-allocated copy)
// - We never send arena pointers across threads unsafely
//
// NOTE(review): nothing in the type itself stops an arena-backed value
// from being moved to another thread; the last point is a discipline the
// surrounding runtime has to uphold — confirm against the channel code.
unsafe impl Send for SeqString {}
54
// Safety: SeqString is Sync because:
// - The string content is immutable after construction
// - ptr/len are only read, never modified after construction
// - Global strings own a heap buffer taken over from a `Vec<u8>` (see
//   `global_bytes`); it is released only in `Drop`, which requires
//   exclusive access, so shared readers never observe a freed buffer
// - Arena strings point to memory that won't be deallocated while in use
//
// NOTE(review): the arena point assumes a shared reference never outlives
// the owning strand's arena reset — confirm against the runtime.
unsafe impl Sync for SeqString {}
61
62impl SeqString {
63 /// Borrow the underlying bytes. Always succeeds; the type carries
64 /// no UTF-8 invariant. Byte-clean operations (concat, byte
65 /// length, equality, search, network I/O, crypto, etc.) should
66 /// use this.
67 pub fn as_bytes(&self) -> &[u8] {
68 // Safety: `ptr` and `len` describe a valid byte buffer per the
69 // `SeqString` invariants; the lifetime of the returned slice is
70 // tied to `&self` so the buffer cannot be freed while the
71 // slice is live.
72 unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
73 }
74
75 /// View as `&str` if the bytes happen to be valid UTF-8.
76 ///
77 /// Text-level operations (codepoint counting, case folding,
78 /// `string.json-escape`, `regex.*` with Unicode classes,
79 /// formatting for display) call this and treat `None` as a
80 /// fallible-text failure — the conventional `(value Bool)`
81 /// failure tuple, returning -1 / empty string / `false` per the
82 /// surrounding op's contract.
83 pub fn as_str(&self) -> Option<&str> {
84 std::str::from_utf8(self.as_bytes()).ok()
85 }
86
87 /// View as `&str`, replacing any invalid UTF-8 with U+FFFD. Use
88 /// only for human-facing display where lossiness is acceptable
89 /// (Debug, panic messages, REPL output). Operations that round-
90 /// trip user data must use [`as_bytes`] or [`as_str`] instead.
91 pub fn as_str_lossy(&self) -> std::borrow::Cow<'_, str> {
92 String::from_utf8_lossy(self.as_bytes())
93 }
94
95 /// View as `&str`, returning `""` if the bytes aren't valid UTF-8.
96 ///
97 /// The convenience for text-required ops (`string.length`,
98 /// `string.find`, file paths, integer parsing, …) that expect a
99 /// `&str` and have an existing degenerate-result-or-failure-tuple
100 /// path for empty input. A non-UTF-8 input lands in that same
101 /// failure path, with no extra branching at every call site.
102 pub fn as_str_or_empty(&self) -> &str {
103 self.as_str().unwrap_or("")
104 }
105
106 /// Check if this string is globally allocated
107 pub fn is_global(&self) -> bool {
108 self.global
109 }
110
111 /// Get length in bytes
112 pub fn len(&self) -> usize {
113 self.len
114 }
115
116 /// Check if empty
117 pub fn is_empty(&self) -> bool {
118 self.len == 0
119 }
120
121 /// Check if this is an interned/static string (Issue #166)
122 ///
123 /// Interned strings have capacity=0 and point to static data.
124 /// They are never freed and can be compared by pointer for O(1) equality.
125 pub fn is_interned(&self) -> bool {
126 self.capacity == 0 && self.global
127 }
128
129 /// Get raw pointer to string data
130 ///
131 /// Used for O(1) pointer comparison of interned symbols.
132 pub fn as_ptr(&self) -> *const u8 {
133 self.ptr
134 }
135
136 /// Reconstruct SeqString from raw parts
137 ///
138 /// # Safety
139 /// The parts must be a valid allocation matching the ptr/len/capacity/global
140 /// invariants documented on `SeqString`.
141 pub unsafe fn from_raw_parts(
142 ptr: *const u8,
143 len: usize,
144 capacity: usize,
145 global: bool,
146 ) -> Self {
147 SeqString {
148 ptr,
149 len,
150 capacity,
151 global,
152 }
153 }
154}
155
156impl Clone for SeqString {
157 /// Clone always allocates from the global allocator for Send safety.
158 ///
159 /// When a `SeqString` is sent through a channel, the receiving
160 /// strand gets an independent global-allocated copy that doesn't
161 /// depend on the sender's arena. Byte-clean: copies the underlying
162 /// `&[u8]`, no UTF-8 validation.
163 fn clone(&self) -> Self {
164 global_bytes(self.as_bytes().to_vec())
165 }
166}
167
168impl Drop for SeqString {
169 fn drop(&mut self) {
170 // Drop only if BOTH conditions are true:
171 // - global=true: Arena strings have global=false and are bulk-freed on strand exit.
172 // - capacity > 0: Interned symbols (Issue #166) have capacity=0 and point to
173 // static data that must NOT be deallocated.
174 if self.global && self.capacity > 0 {
175 // Reconstruct the owning `Vec<u8>` and drop it. Using
176 // `Vec<u8>::from_raw_parts` (rather than `String::from_raw_parts`)
177 // imposes no UTF-8 requirement on the buffer contents; deallocation
178 // size is identical because `String` is just `Vec<u8>` plus a
179 // UTF-8 invariant.
180 //
181 // Safety: We created this buffer in `global_bytes()` (via
182 // `Vec::into_raw_parts`-equivalent) and stored the original
183 // `ptr`, `len`, and `capacity`, so reconstruction is exact.
184 unsafe {
185 let _v = Vec::<u8>::from_raw_parts(self.ptr as *mut u8, self.len, self.capacity);
186 // _v is dropped here, freeing the memory with correct size.
187 }
188 }
189 // Arena strings don't need explicit drop — the arena's reset frees them.
190 // Static/interned strings (capacity=0) point to static data — no drop needed.
191 }
192}
193
194impl fmt::Debug for SeqString {
195 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
196 // Lossy display is fine here — Debug is for human consumption.
197 write!(
198 f,
199 "SeqString({:?}, global={})",
200 self.as_str_lossy(),
201 self.global
202 )
203 }
204}
205
206impl fmt::Display for SeqString {
207 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
208 // Display is human-facing; lossy-replace invalid UTF-8 with U+FFFD.
209 // Round-trip data should use `as_bytes()` directly.
210 write!(f, "{}", self.as_str_lossy())
211 }
212}
213
214/// Create arena-allocated bytes (fast path for temporaries).
215///
216/// Accepts arbitrary bytes; no UTF-8 validation.
217///
218/// # Performance
219/// ~5ns vs ~100ns for global allocator (20× faster).
220///
221/// # Lifetime
222/// Valid until `arena_reset()` is called (typically when the strand exits).
223pub fn arena_bytes(bytes: &[u8]) -> SeqString {
224 arena::with_arena(|arena| {
225 let arena_buf = arena.alloc_slice_copy(bytes);
226 SeqString {
227 ptr: arena_buf.as_ptr(),
228 len: arena_buf.len(),
229 capacity: 0, // Not used for arena strings
230 global: false,
231 }
232 })
233}
234
235/// Create arena-allocated string from a UTF-8 `&str`. Convenience
236/// wrapper over [`arena_bytes`] for callers that already have a Rust
237/// `&str` in hand.
238pub fn arena_string(s: &str) -> SeqString {
239 arena_bytes(s.as_bytes())
240}
241
242/// Create globally-allocated bytes (persists across arena resets).
243///
244/// Accepts arbitrary bytes; no UTF-8 validation. Used when a
245/// `SeqString` needs to outlive the current strand or cross a channel
246/// boundary.
247pub fn global_bytes(bytes: Vec<u8>) -> SeqString {
248 let len = bytes.len();
249 let capacity = bytes.capacity();
250 let ptr = bytes.as_ptr();
251 std::mem::forget(bytes); // Transfer ownership; Drop reconstructs and frees.
252
253 SeqString {
254 ptr,
255 len,
256 capacity,
257 global: true,
258 }
259}
260
261/// Create globally-allocated string from a UTF-8 `String`. Convenience
262/// wrapper over [`global_bytes`] for callers that already have a Rust
263/// `String` in hand.
264pub fn global_string(s: String) -> SeqString {
265 global_bytes(s.into_bytes())
266}
267
268/// Convert &str to SeqString using arena allocation
269impl From<&str> for SeqString {
270 fn from(s: &str) -> Self {
271 arena_string(s)
272 }
273}
274
275/// Convert String to SeqString using global allocation
276impl From<String> for SeqString {
277 fn from(s: String) -> Self {
278 global_string(s)
279 }
280}
281
#[cfg(test)]
mod tests {
    //! Unit tests for `SeqString`: construction from both allocation
    //! sources, clone/drop behavior, formatting, and byte-cleanliness.

    use super::*;

    #[test]
    fn test_arena_string() {
        let s = arena_string("Hello, arena!");
        assert_eq!(s.as_str(), Some("Hello, arena!"));
        assert_eq!(s.len(), 13);
        assert!(!s.is_global());
    }

    #[test]
    fn test_global_string() {
        let s = global_string("Hello, global!".to_string());
        assert_eq!(s.as_str(), Some("Hello, global!"));
        assert_eq!(s.len(), 14);
        assert!(s.is_global());
    }

    #[test]
    fn test_clone_creates_global() {
        // Cloning an arena string must yield a global copy.
        let s1 = arena_string("test");
        let s2 = s1.clone();

        assert_eq!(s1.as_bytes(), s2.as_bytes());
        assert!(!s1.is_global());
        assert!(s2.is_global()); // Clone is always global!
    }

    #[test]
    fn test_clone_global() {
        let s1 = global_string("test".to_string());
        let s2 = s1.clone();

        assert_eq!(s1.as_bytes(), s2.as_bytes());
        assert!(s1.is_global());
        assert!(s2.is_global());
    }

    #[test]
    fn test_drop_global() {
        // Create a global string and let it fall out of scope.
        {
            let s = global_string("Will be dropped".to_string());
            assert_eq!(s.as_str(), Some("Will be dropped"));
        }
        // Reaching this point without a crash means drop worked.
    }

    #[test]
    fn test_drop_arena() {
        // Create an arena string and let it fall out of scope.
        {
            let s = arena_string("Will be dropped (no-op)");
            assert_eq!(s.as_str(), Some("Will be dropped (no-op)"));
        }
        // Arena strings don't need explicit drop.
    }

    #[test]
    fn test_equality() {
        let s1 = arena_string("test");
        let s2 = arena_string("test");
        let s3 = global_string("test".to_string());
        let s4 = arena_string("different");

        assert_eq!(s1, s2); // Same content, both arena
        assert_eq!(s1, s3); // Same content, different allocation
        assert_ne!(s1, s4); // Different content
    }

    #[test]
    fn test_from_str() {
        let s: SeqString = "test".into();
        assert_eq!(s.as_str(), Some("test"));
        assert!(!s.is_global()); // from &str uses arena
    }

    #[test]
    fn test_from_string() {
        let s: SeqString = "test".to_string().into();
        assert_eq!(s.as_str(), Some("test"));
        assert!(s.is_global()); // from String uses global
    }

    #[test]
    fn test_debug_format() {
        let s = arena_string("debug");
        let debug_str = format!("{:?}", s);
        assert!(debug_str.contains("debug"));
        assert!(debug_str.contains("global=false"));
    }

    #[test]
    fn test_display_format() {
        let s = global_string("display".to_string());
        let display_str = format!("{}", s);
        assert_eq!(display_str, "display");
    }

    #[test]
    fn test_empty_string() {
        let s = arena_string("");
        assert_eq!(s.len(), 0);
        assert!(s.is_empty());
        assert_eq!(s.as_str(), Some(""));
    }

    #[test]
    fn test_unicode() {
        let s = arena_string("Hello, 世界! 🦀");
        assert_eq!(s.as_str(), Some("Hello, 世界! 🦀"));
        assert!(s.len() > 10); // UTF-8 bytes, not chars
    }

    #[test]
    fn test_global_string_preserves_capacity() {
        // PR #11 Critical fix: the capacity of the source buffer must be
        // carried over unchanged so deallocation is correctly sized.
        let mut s = String::with_capacity(100);
        s.push_str("hi");

        // `with_capacity` promises *at least* the requested capacity, so
        // compare against what was actually allocated rather than the
        // literal 100.
        let original_capacity = s.capacity();
        assert_eq!(s.len(), 2);
        assert!(original_capacity >= 100);

        let seq = global_string(s);

        // The SeqString must have captured the buffer's capacity, not its
        // length.
        assert_eq!(seq.len(), 2);
        assert_eq!(seq.capacity, original_capacity);
        assert_eq!(seq.as_str(), Some("hi"));
        assert!(seq.is_global());

        // A wrong capacity would make this drop corrupt the heap.
        drop(seq);

        // Reaching this point without crash/UB means the fix holds.
    }

    #[test]
    fn test_arena_string_capacity_zero() {
        // Arena strings don't use the capacity field.
        let s = arena_string("test");
        assert_eq!(s.capacity, 0); // Arena strings have capacity=0
        assert!(!s.is_global());
    }

    #[test]
    fn as_str_or_empty_falls_back_on_invalid_utf8() {
        // Valid text passes through; invalid bytes collapse to "".
        assert_eq!(arena_string("ok").as_str_or_empty(), "ok");
        assert_eq!(arena_bytes(SENTINEL).as_str_or_empty(), "");
    }

    #[test]
    fn interned_static_is_not_freed() {
        // global + capacity 0 marks static data: it is reported as
        // interned, keeps its pointer, and drop must not deallocate it.
        static NAME: &[u8] = b"symbol";
        // SAFETY: `NAME` is static data, matching the interned
        // (global, capacity == 0) case that `Drop` leaves untouched.
        let s = unsafe { SeqString::from_raw_parts(NAME.as_ptr(), NAME.len(), 0, true) };
        assert!(s.is_interned());
        assert_eq!(s.as_ptr(), NAME.as_ptr());
        assert_eq!(s.as_bytes(), NAME);
        drop(s);
        // The static is still readable after the drop.
        assert_eq!(NAME, b"symbol");
    }

    // ------------------------------------------------------------------
    // Byte-cleanliness sentinel tests.
    //
    // The type carries arbitrary bytes — no UTF-8 invariant. The
    // sentinel covers: a NUL byte, a lead byte with no continuation
    // (0xDC opens a two-byte sequence but is followed by ASCII `x`), a
    // high byte (0xFF, never valid in any UTF-8 position), and a second
    // truncated multi-byte prefix (0xC3 followed by ASCII `!`). If any
    // path through the runtime mangles or rejects these, the bug shows
    // up here first.
    // ------------------------------------------------------------------

    const SENTINEL: &[u8] = &[0x00, 0xDC, b'x', 0xFF, 0xC3, b'!'];

    #[test]
    fn global_bytes_carries_arbitrary_bytes() {
        let s = global_bytes(SENTINEL.to_vec());
        assert_eq!(s.as_bytes(), SENTINEL);
        assert_eq!(s.len(), SENTINEL.len());
        assert!(s.is_global());
        // The sentinel isn't valid UTF-8, so as_str is None.
        assert_eq!(s.as_str(), None);
    }

    #[test]
    fn arena_bytes_carries_arbitrary_bytes() {
        let s = arena_bytes(SENTINEL);
        assert_eq!(s.as_bytes(), SENTINEL);
        assert_eq!(s.len(), SENTINEL.len());
        assert!(!s.is_global());
        assert_eq!(s.as_str(), None);
    }

    #[test]
    fn equality_uses_bytes_not_utf8() {
        // Two SeqStrings with identical non-UTF-8 bytes are equal.
        let s1 = arena_bytes(SENTINEL);
        let s2 = global_bytes(SENTINEL.to_vec());
        assert_eq!(s1, s2);

        // Differ in one byte.
        let mut alt = SENTINEL.to_vec();
        alt[0] = 0x01;
        let s3 = global_bytes(alt);
        assert_ne!(s1, s3);
    }

    #[test]
    fn clone_round_trips_arbitrary_bytes() {
        // Clone must preserve invalid UTF-8 byte-for-byte; it goes
        // through the global allocator (cross-strand transfer path).
        let s = arena_bytes(SENTINEL);
        let cloned = s.clone();
        assert_eq!(s.as_bytes(), cloned.as_bytes());
        assert!(cloned.is_global());
    }

    #[test]
    fn drop_does_not_require_utf8() {
        // Allocate-and-drop a global non-UTF-8 buffer. Pre-fix this
        // would be UB inside the Drop impl (String::from_raw_parts on
        // invalid UTF-8). The fixed Drop reconstructs a Vec<u8>
        // instead, which has no UTF-8 requirement.
        for _ in 0..16 {
            drop(global_bytes(SENTINEL.to_vec()));
        }
        // If we reach here without the allocator complaining, the
        // capacity bookkeeping is also intact for byte buffers.
    }

    #[test]
    fn as_str_lossy_replaces_invalid() {
        // Display path: invalid UTF-8 becomes U+FFFD, but the call
        // doesn't fail or panic.
        let s = global_bytes(SENTINEL.to_vec());
        let lossy = s.as_str_lossy();
        assert!(lossy.contains('\u{FFFD}'));
        // The valid 'x' and '!' bytes are still there.
        assert!(lossy.contains('x'));
        assert!(lossy.contains('!'));
    }
}