Skip to main content

seq_core/
seqstring.rs

//! SeqString - Arena or Globally Allocated Byte String
//!
//! Strings in Seq are sequences of arbitrary bytes — there is **no
//! UTF-8 invariant** on this type. Byte-clean operations (concat,
//! length-in-bytes, channel send, network I/O, file I/O of binary
//! content, crypto inputs) work on any input. Text-level operations
//! (codepoint length, case folding, regex Unicode classes, JSON
//! escaping) call [`SeqString::as_str`], which validates UTF-8 at the
//! boundary and returns `Option<&str>`; on invalid bytes those ops
//! fail loudly with the standard `(value Bool)` failure tuple.
//!
//! The type still supports two allocation sources:
//! 1. Thread-local arena (fast, bulk-freed on strand exit)
//! 2. Global allocator (persists across arena resets, used for
//!    cross-strand transfer)
//!
//! See `docs/design/STRING_BYTE_CLEANLINESS.md` for the full design.
18
19use crate::arena;
20use std::fmt;
21
/// Byte string that tracks its allocation source.
///
/// # Safety Invariants
/// - If `global=true` and `capacity > 0`: `ptr` points to a
///   global-allocated byte buffer whose memory matches `len`/`capacity`;
///   the buffer is freed on `Drop`.
/// - If `global=true` and `capacity == 0`: the value is treated as
///   interned/static data (see [`SeqString::is_interned`]) and is never
///   freed.
/// - If `global=false`: `ptr` points into the thread-local arena; the
///   arena owns the memory and frees it in bulk on strand exit.
/// - Whenever `len > 0`, `ptr` must be non-null and valid for reads of
///   `len` bytes. When `len == 0`, [`SeqString::as_bytes`] never reads
///   through `ptr`, so a null or dangling pointer is tolerated there.
/// - The byte content is *not* required to be valid UTF-8.
/// - For global strings: `capacity` must match the original `Vec<u8>`'s
///   capacity so deallocation is correctly sized.
pub struct SeqString {
    ptr: *const u8,
    len: usize,
    capacity: usize, // Only meaningful for global strings
    global: bool,
}

// Equality is implemented by hand so that it compares content (bytes),
// not pointers: an arena string and a global string holding the same
// bytes are equal.
impl PartialEq for SeqString {
    fn eq(&self, other: &Self) -> bool {
        self.as_bytes() == other.as_bytes()
    }
}

impl Eq for SeqString {}

// Safety: SeqString is Send because:
// - Global strings are truly independent (owned heap allocation)
// - Arena strings are cloned to global on channel send (see Clone impl)
// - We never send arena pointers across threads unsafely
unsafe impl Send for SeqString {}

// Safety: SeqString is Sync because:
// - The string content is immutable after construction (this type
//   exposes no `&mut` access to the buffer)
// - ptr/len are only read, never modified after construction
// - Global strings are plain heap byte buffers that are only freed in
//   `Drop`, which requires exclusive ownership
// - Arena strings point to memory that won't be deallocated while in use
//   NOTE(review): that last point is a caller obligation — nothing in
//   this type prevents an arena-backed value from outliving an arena
//   reset.
unsafe impl Sync for SeqString {}

impl SeqString {
    /// Borrow the underlying bytes. Always succeeds; the type carries
    /// no UTF-8 invariant. Byte-clean operations (concat, byte
    /// length, equality, search, network I/O, crypto, etc.) should
    /// use this.
    ///
    /// An empty string yields an empty slice without touching `ptr`.
    pub fn as_bytes(&self) -> &[u8] {
        // `slice::from_raw_parts` requires a non-null, aligned pointer
        // even for zero-length slices. Values built through
        // `from_raw_parts` (e.g. across an FFI boundary) may carry a null
        // pointer for the empty string, so short-circuit that case.
        if self.len == 0 {
            return &[];
        }
        // SAFETY: `len > 0`, so per the `SeqString` invariants `ptr` is
        // non-null and valid for reads of `len` bytes; the lifetime of
        // the returned slice is tied to `&self`, so the buffer cannot be
        // freed by `Drop` while the slice is live.
        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
    }

    /// View as `&str` if the bytes happen to be valid UTF-8.
    ///
    /// Text-level operations (codepoint counting, case folding,
    /// `string.json-escape`, `regex.*` with Unicode classes,
    /// formatting for display) call this and treat `None` as a
    /// fallible-text failure — the conventional `(value Bool)`
    /// failure tuple, returning -1 / empty string / `false` per the
    /// surrounding op's contract.
    pub fn as_str(&self) -> Option<&str> {
        std::str::from_utf8(self.as_bytes()).ok()
    }

    /// View as `&str`, replacing any invalid UTF-8 with U+FFFD. Use
    /// only for human-facing display where lossiness is acceptable
    /// (Debug, panic messages, REPL output). Operations that round-
    /// trip user data must use [`Self::as_bytes`] or [`Self::as_str`]
    /// instead.
    ///
    /// Borrows when the bytes are already valid UTF-8 and allocates only
    /// when a replacement is needed.
    pub fn as_str_lossy(&self) -> std::borrow::Cow<'_, str> {
        String::from_utf8_lossy(self.as_bytes())
    }

    /// View as `&str`, returning `""` if the bytes aren't valid UTF-8.
    ///
    /// The convenience for text-required ops (`string.length`,
    /// `string.find`, file paths, integer parsing, …) that expect a
    /// `&str` and have an existing degenerate-result-or-failure-tuple
    /// path for empty input. A non-UTF-8 input lands in that same
    /// failure path, with no extra branching at every call site.
    pub fn as_str_or_empty(&self) -> &str {
        self.as_str().unwrap_or("")
    }

    /// Check if this string is globally allocated
    pub fn is_global(&self) -> bool {
        self.global
    }

    /// Get length in bytes
    pub fn len(&self) -> usize {
        self.len
    }

    /// Check if empty
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Check if this is an interned/static string (Issue #166)
    ///
    /// Interned strings have capacity=0 and point to static data.
    /// They are never freed and can be compared by pointer for O(1) equality.
    pub fn is_interned(&self) -> bool {
        self.global && self.capacity == 0
    }

    /// Get raw pointer to string data
    ///
    /// Used for O(1) pointer comparison of interned symbols. The pointer
    /// is returned as stored; for an empty string it may be null or
    /// dangling and must not be dereferenced.
    pub fn as_ptr(&self) -> *const u8 {
        self.ptr
    }

    /// Reconstruct SeqString from raw parts
    ///
    /// # Safety
    /// The parts must be a valid allocation matching the ptr/len/capacity/global
    /// invariants documented on `SeqString`. In particular:
    /// - when `len > 0`, `ptr` must be non-null and readable for `len`
    ///   bytes for as long as the returned value is alive;
    /// - when `global` is `true` and `capacity > 0`, the buffer must have
    ///   come from the global allocator with exactly that capacity,
    ///   because dropping the value frees it.
    pub unsafe fn from_raw_parts(
        ptr: *const u8,
        len: usize,
        capacity: usize,
        global: bool,
    ) -> Self {
        Self {
            ptr,
            len,
            capacity,
            global,
        }
    }
}
155
156impl Clone for SeqString {
157    /// Clone always allocates from the global allocator for Send safety.
158    ///
159    /// When a `SeqString` is sent through a channel, the receiving
160    /// strand gets an independent global-allocated copy that doesn't
161    /// depend on the sender's arena. Byte-clean: copies the underlying
162    /// `&[u8]`, no UTF-8 validation.
163    fn clone(&self) -> Self {
164        global_bytes(self.as_bytes().to_vec())
165    }
166}
167
168impl Drop for SeqString {
169    fn drop(&mut self) {
170        // Drop only if BOTH conditions are true:
171        // - global=true: Arena strings have global=false and are bulk-freed on strand exit.
172        // - capacity > 0: Interned symbols (Issue #166) have capacity=0 and point to
173        //   static data that must NOT be deallocated.
174        if self.global && self.capacity > 0 {
175            // Reconstruct the owning `Vec<u8>` and drop it. Using
176            // `Vec<u8>::from_raw_parts` (rather than `String::from_raw_parts`)
177            // imposes no UTF-8 requirement on the buffer contents; deallocation
178            // size is identical because `String` is just `Vec<u8>` plus a
179            // UTF-8 invariant.
180            //
181            // Safety: We created this buffer in `global_bytes()` (via
182            // `Vec::into_raw_parts`-equivalent) and stored the original
183            // `ptr`, `len`, and `capacity`, so reconstruction is exact.
184            unsafe {
185                let _v = Vec::<u8>::from_raw_parts(self.ptr as *mut u8, self.len, self.capacity);
186                // _v is dropped here, freeing the memory with correct size.
187            }
188        }
189        // Arena strings don't need explicit drop — the arena's reset frees them.
190        // Static/interned strings (capacity=0) point to static data — no drop needed.
191    }
192}
193
194impl fmt::Debug for SeqString {
195    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
196        // Lossy display is fine here — Debug is for human consumption.
197        write!(
198            f,
199            "SeqString({:?}, global={})",
200            self.as_str_lossy(),
201            self.global
202        )
203    }
204}
205
206impl fmt::Display for SeqString {
207    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
208        // Display is human-facing; lossy-replace invalid UTF-8 with U+FFFD.
209        // Round-trip data should use `as_bytes()` directly.
210        write!(f, "{}", self.as_str_lossy())
211    }
212}
213
214/// Create arena-allocated bytes (fast path for temporaries).
215///
216/// Accepts arbitrary bytes; no UTF-8 validation.
217///
218/// # Performance
219/// ~5ns vs ~100ns for global allocator (20× faster).
220///
221/// # Lifetime
222/// Valid until `arena_reset()` is called (typically when the strand exits).
223pub fn arena_bytes(bytes: &[u8]) -> SeqString {
224    arena::with_arena(|arena| {
225        let arena_buf = arena.alloc_slice_copy(bytes);
226        SeqString {
227            ptr: arena_buf.as_ptr(),
228            len: arena_buf.len(),
229            capacity: 0, // Not used for arena strings
230            global: false,
231        }
232    })
233}
234
235/// Create arena-allocated string from a UTF-8 `&str`. Convenience
236/// wrapper over [`arena_bytes`] for callers that already have a Rust
237/// `&str` in hand.
238pub fn arena_string(s: &str) -> SeqString {
239    arena_bytes(s.as_bytes())
240}
241
242/// Create globally-allocated bytes (persists across arena resets).
243///
244/// Accepts arbitrary bytes; no UTF-8 validation. Used when a
245/// `SeqString` needs to outlive the current strand or cross a channel
246/// boundary.
247pub fn global_bytes(bytes: Vec<u8>) -> SeqString {
248    let len = bytes.len();
249    let capacity = bytes.capacity();
250    let ptr = bytes.as_ptr();
251    std::mem::forget(bytes); // Transfer ownership; Drop reconstructs and frees.
252
253    SeqString {
254        ptr,
255        len,
256        capacity,
257        global: true,
258    }
259}
260
261/// Create globally-allocated string from a UTF-8 `String`. Convenience
262/// wrapper over [`global_bytes`] for callers that already have a Rust
263/// `String` in hand.
264pub fn global_string(s: String) -> SeqString {
265    global_bytes(s.into_bytes())
266}
267
268/// Convert &str to SeqString using arena allocation
269impl From<&str> for SeqString {
270    fn from(s: &str) -> Self {
271        arena_string(s)
272    }
273}
274
275/// Convert String to SeqString using global allocation
276impl From<String> for SeqString {
277    fn from(s: String) -> Self {
278        global_string(s)
279    }
280}
281
#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------
    // Byte-cleanliness sentinel.
    //
    // The type carries arbitrary bytes — no UTF-8 invariant. The
    // sentinel covers:
    // - a NUL byte (0x00);
    // - 0xDC, a two-byte UTF-8 *lead* byte that is followed here by
    //   ASCII 'x' instead of a continuation byte (0x80..=0xBF), which
    //   makes the sequence invalid;
    // - 0xFF, a byte that is never valid in any UTF-8 position;
    // - 0xC3, another lead byte left without its continuation.
    // If any path through the runtime mangles or rejects these, the bug
    // shows up in the sentinel tests first.
    // ------------------------------------------------------------------
    const SENTINEL: &[u8] = &[0x00, 0xDC, b'x', 0xFF, 0xC3, b'!'];

    #[test]
    fn test_arena_string() {
        let s = arena_string("Hello, arena!");
        assert_eq!(s.as_str(), Some("Hello, arena!"));
        assert_eq!(s.len(), 13);
        assert!(!s.is_global());
    }

    #[test]
    fn test_global_string() {
        let s = global_string(String::from("Hello, global!"));
        assert_eq!(s.as_str(), Some("Hello, global!"));
        assert_eq!(s.len(), 14);
        assert!(s.is_global());
    }

    #[test]
    fn test_clone_creates_global() {
        // Cloning an arena string must move the copy to the global heap.
        let original = arena_string("test");
        let copy = original.clone();

        assert_eq!(original.as_bytes(), copy.as_bytes());
        assert!(!original.is_global());
        assert!(copy.is_global()); // Clone is always global!
    }

    #[test]
    fn test_clone_global() {
        let original = global_string(String::from("test"));
        let copy = original.clone();

        assert_eq!(original.as_bytes(), copy.as_bytes());
        assert!(original.is_global());
        assert!(copy.is_global());
    }

    #[test]
    fn test_drop_global() {
        // Create a global string and let it fall out of scope.
        {
            let s = global_string(String::from("Will be dropped"));
            assert_eq!(s.as_str(), Some("Will be dropped"));
        }
        // Reaching this point without a crash means drop worked.
    }

    #[test]
    fn test_drop_arena() {
        // Create an arena string and let it fall out of scope.
        {
            let s = arena_string("Will be dropped (no-op)");
            assert_eq!(s.as_str(), Some("Will be dropped (no-op)"));
        }
        // Arena strings don't need explicit drop.
    }

    #[test]
    fn test_equality() {
        let arena_a = arena_string("test");
        let arena_b = arena_string("test");
        let global = global_string(String::from("test"));
        let other = arena_string("different");

        assert_eq!(arena_a, arena_b); // Same content, both arena
        assert_eq!(arena_a, global); // Same content, different allocation
        assert_ne!(arena_a, other); // Different content
    }

    #[test]
    fn test_from_str() {
        let s = SeqString::from("test");
        assert_eq!(s.as_str(), Some("test"));
        assert!(!s.is_global()); // from &str uses arena
    }

    #[test]
    fn test_from_string() {
        let s = SeqString::from(String::from("test"));
        assert_eq!(s.as_str(), Some("test"));
        assert!(s.is_global()); // from String uses global
    }

    #[test]
    fn test_debug_format() {
        let s = arena_string("debug");
        let rendered = format!("{:?}", s);
        assert!(rendered.contains("debug"));
        assert!(rendered.contains("global=false"));
    }

    #[test]
    fn test_display_format() {
        let s = global_string(String::from("display"));
        assert_eq!(format!("{}", s), "display");
    }

    #[test]
    fn test_empty_string() {
        let s = arena_string("");
        assert_eq!(s.len(), 0);
        assert!(s.is_empty());
        assert_eq!(s.as_str(), Some(""));
    }

    #[test]
    fn test_empty_global_bytes() {
        // A zero-capacity Vec owns no heap buffer; the resulting value
        // must still behave as an ordinary empty string and drop cleanly.
        let s = global_bytes(Vec::new());
        assert!(s.is_empty());
        assert!(s.is_global());
        assert!(s.as_bytes().is_empty());
        assert_eq!(s.as_str(), Some(""));
    }

    #[test]
    fn test_unicode() {
        let s = arena_string("Hello, 世界! 🦀");
        assert_eq!(s.as_str(), Some("Hello, 世界! 🦀"));
        assert!(s.len() > 10); // UTF-8 bytes, not chars
    }

    #[test]
    fn test_global_string_preserves_capacity() {
        // PR #11 Critical fix: verify capacity is preserved for correct
        // deallocation.
        let mut s = String::with_capacity(100);
        s.push_str("hi");

        // `with_capacity` only guarantees *at least* the requested
        // capacity, so compare against what the allocator actually gave
        // us rather than the literal 100.
        let original_capacity = s.capacity();
        assert_eq!(s.len(), 2);
        assert!(original_capacity >= 100);

        let cem = global_string(s);

        // The SeqString must record the buffer's capacity, not its length.
        assert_eq!(cem.len(), 2);
        assert_eq!(cem.capacity, original_capacity);
        assert_ne!(cem.capacity, cem.len());
        assert_eq!(cem.as_str(), Some("hi"));
        assert!(cem.is_global());

        // If the capacity were wrong, this drop would corrupt the heap.
        drop(cem);

        // Reaching this point without crash/UB means the fix worked.
    }

    #[test]
    fn test_arena_string_capacity_zero() {
        // Arena strings don't use the capacity field.
        let s = arena_string("test");
        assert_eq!(s.capacity, 0);
        assert!(!s.is_global());
    }

    #[test]
    fn global_bytes_carries_arbitrary_bytes() {
        let s = global_bytes(SENTINEL.to_vec());
        assert_eq!(s.as_bytes(), SENTINEL);
        assert_eq!(s.len(), SENTINEL.len());
        assert!(s.is_global());
        // The sentinel isn't valid UTF-8, so as_str is None.
        assert_eq!(s.as_str(), None);
    }

    #[test]
    fn arena_bytes_carries_arbitrary_bytes() {
        let s = arena_bytes(SENTINEL);
        assert_eq!(s.as_bytes(), SENTINEL);
        assert_eq!(s.len(), SENTINEL.len());
        assert!(!s.is_global());
        assert_eq!(s.as_str(), None);
    }

    #[test]
    fn as_str_or_empty_falls_back_on_invalid_utf8() {
        // Valid text passes through; invalid bytes collapse to "".
        assert_eq!(arena_string("ok").as_str_or_empty(), "ok");
        assert_eq!(arena_bytes(SENTINEL).as_str_or_empty(), "");
    }

    #[test]
    fn equality_uses_bytes_not_utf8() {
        // Two SeqStrings with identical non-UTF-8 bytes are equal.
        let in_arena = arena_bytes(SENTINEL);
        let in_global = global_bytes(SENTINEL.to_vec());
        assert_eq!(in_arena, in_global);

        // Changing a single byte breaks equality.
        let mut altered = SENTINEL.to_vec();
        altered[0] = 0x01;
        let different = global_bytes(altered);
        assert_ne!(in_arena, different);
    }

    #[test]
    fn clone_round_trips_arbitrary_bytes() {
        // Clone must preserve invalid UTF-8 byte-for-byte; it goes
        // through the global allocator (cross-strand transfer path).
        let source = arena_bytes(SENTINEL);
        let cloned = source.clone();
        assert_eq!(source.as_bytes(), cloned.as_bytes());
        assert!(cloned.is_global());
    }

    #[test]
    fn drop_does_not_require_utf8() {
        // Allocate-and-drop a global non-UTF-8 buffer. Pre-fix this
        // would be UB inside the Drop impl (String::from_raw_parts on
        // invalid UTF-8). The fixed Drop reconstructs a Vec<u8>
        // instead, which has no UTF-8 requirement.
        for _ in 0..16 {
            drop(global_bytes(SENTINEL.to_vec()));
        }
        // If we reach here without the allocator complaining, the
        // capacity bookkeeping is also intact for byte buffers.
    }

    #[test]
    fn as_str_lossy_replaces_invalid() {
        // Display path: invalid UTF-8 becomes U+FFFD, but the call
        // doesn't fail or panic.
        let s = global_bytes(SENTINEL.to_vec());
        let lossy = s.as_str_lossy();
        assert!(lossy.contains('\u{FFFD}'));
        // The valid 'x' and '!' bytes are still there.
        assert!(lossy.contains('x'));
        assert!(lossy.contains('!'));
    }
}