libdd_profiling/profiles/collections/
string_set.rs

1// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/
2// SPDX-License-Identifier: Apache-2.0
3
4use super::slice_set::SliceSet;
5use super::SetError;
6use super::ThinStr;
7use std::ffi::c_void;
8use std::hash::BuildHasher;
9use std::ops::Deref;
10use std::ptr::NonNull;
11
12use super::SetHasher as Hasher;
13
14/// Represents a handle to a string that can be retrieved by the string set.
15/// The exact representation is not a public detail; it is only available so
16/// that it is known for FFI size and alignment.
17///
18/// Some [`StringRef`]s refer to well-known strings, which always exist in
19/// every string table.
20///
21/// The caller needs to ensure the string set it was created from always exists
22/// when a StringId is dereferenced.
23#[repr(transparent)]
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
25pub struct StringRef(pub ThinStr<'static>);
26
27impl StringRef {
28    pub fn into_raw(self) -> NonNull<c_void> {
29        self.0.into_raw()
30    }
31
32    /// Re-creates a [`StringRef`] created by [`StringRef::into_raw`].
33    ///
34    /// # Safety
35    ///
36    /// `this` needs to be created from [``StringRef::into_raw`] and the set
37    /// it belongs to should still be alive.
38    pub unsafe fn from_raw(this: NonNull<c_void>) -> Self {
39        Self(ThinStr::from_raw(this))
40    }
41}
42
43impl From<&StringRef> for StringRef {
44    fn from(value: &StringRef) -> Self {
45        *value
46    }
47}
48
49impl Default for StringRef {
50    fn default() -> Self {
51        Self::EMPTY
52    }
53}
54
55impl StringRef {
56    pub const EMPTY: StringRef = StringRef(ThinStr::new());
57    pub const END_TIMESTAMP_NS: StringRef = StringRef(ThinStr::end_timestamp_ns());
58    pub const LOCAL_ROOT_SPAN_ID: StringRef = StringRef(ThinStr::local_root_span_id());
59    pub const TRACE_ENDPOINT: StringRef = StringRef(ThinStr::trace_endpoint());
60    pub const SPAN_ID: StringRef = StringRef(ThinStr::span_id());
61}
62
63pub const WELL_KNOWN_STRING_REFS: [StringRef; 5] = [
64    StringRef::EMPTY,
65    StringRef::END_TIMESTAMP_NS,
66    StringRef::LOCAL_ROOT_SPAN_ID,
67    StringRef::TRACE_ENDPOINT,
68    StringRef::SPAN_ID,
69];
70
71/// Holds unique strings and provides [`StringRef`]s to fetch them later.
72/// This is a newtype around SliceSet<u8> to enforce UTF-8 invariants.
73pub struct UnsyncStringSet(SliceSet<u8>);
74
75impl UnsyncStringSet {
76    pub fn try_with_capacity(capacity: usize) -> Result<Self, SetError> {
77        let mut set = Self(SliceSet::try_with_capacity(capacity)?);
78        let strings = &mut set.0.slices;
79        for id in WELL_KNOWN_STRING_REFS {
80            let hash = Hasher::default().hash_one(id.0.deref().as_bytes());
81            strings.insert_unique(hash, id.0.into(), |t| Hasher::default().hash_one(t.deref()));
82        }
83
84        Ok(set)
85    }
86
87    /// Creates a new string set, which initially holds the empty string and
88    /// other well-known strings. The well-known strings are always
89    /// available and can be fetched using the [`WELL_KNOWN_STRING_REFS`].
90    pub fn try_new() -> Result<Self, SetError> {
91        Self::try_with_capacity(28)
92    }
93
94    unsafe fn find_with_hash(&self, hash: u64, str: &str) -> Option<StringRef> {
95        let interned_str = self.0.slices.find(hash, |thin_slice| {
96            // SAFETY: We only store valid UTF-8 in string sets
97            let slice_str = unsafe { std::str::from_utf8_unchecked(thin_slice.as_slice()) };
98            slice_str == str
99        })?;
100        Some(StringRef((*interned_str).into()))
101    }
102
103    /// # Safety
104    ///  1. The hash must be the same as if the str was re-hashed with the hasher the string set
105    ///     would use.
106    ///  2. The string must be unique within the set.
107    pub unsafe fn insert_unique_uncontended(&mut self, str: &str) -> Result<StringRef, SetError> {
108        let hash = Hasher::default().hash_one(str.as_bytes());
109        self.insert_unique_uncontended_with_hash(hash, str)
110    }
111
112    /// Inserts a string into the string set without checking for duplicates, using a pre-calculated
113    /// hash.
114    ///
115    /// # Safety
116    ///  1. The caller must ensure that the hash was computed using the same hasher the string set
117    ///     would use.
118    ///  2. The string must be unique within the set.
119    pub unsafe fn insert_unique_uncontended_with_hash(
120        &mut self,
121        hash: u64,
122        str: &str,
123    ) -> Result<StringRef, SetError> {
124        let new_slice = self
125            .0
126            .insert_unique_uncontended_with_hash(hash, str.as_bytes())?;
127        Ok(StringRef(new_slice.into()))
128    }
129
130    /// Adds the string to the string set if it isn't present already, and
131    /// returns a handle to the string that can be used to retrieve it later.
132    pub fn try_insert(&mut self, str: &str) -> Result<StringRef, SetError> {
133        let hash = Hasher::default().hash_one(str.as_bytes());
134        unsafe { self.try_insert_with_hash(hash, str) }
135    }
136
137    /// Adds the string to the string set if it isn't present already, using a pre-calculated hash.
138    /// Returns a handle to the string that can be used to retrieve it later.
139    ///
140    /// # Safety
141    /// The caller must ensure that the hash was computed using the same hasher the string set would
142    /// use.
143    pub unsafe fn try_insert_with_hash(
144        &mut self,
145        hash: u64,
146        str: &str,
147    ) -> Result<StringRef, SetError> {
148        // SAFETY: the string's hash is correct, we use the same hasher as
149        // StringSet uses.
150        if let Some(id) = self.find_with_hash(hash, str) {
151            return Ok(id);
152        }
153
154        // SAFETY: we just checked above that the string isn't in the set.
155        self.insert_unique_uncontended_with_hash(hash, str)
156    }
157
158    /// Returns an iterator over all strings in the set as [`StringRef`]s.
159    pub fn string_ids(&self) -> impl Iterator<Item = StringRef> + '_ {
160        self.0.slices.iter().map(|slice| StringRef((*slice).into()))
161    }
162
163    pub fn len(&self) -> usize {
164        self.0.len()
165    }
166
167    pub fn is_empty(&self) -> bool {
168        self.0.is_empty()
169    }
170
171    pub fn capacity(&self) -> usize {
172        self.0.capacity()
173    }
174
175    /// # Safety
176    /// The caller must ensure that the `StringId` was obtained from this set
177    /// (or is a well-known id) and that the set outlives the returned `&str`.
178    pub unsafe fn get_string(&self, id: StringRef) -> &str {
179        // SAFETY: the lifetime extension is safe as long as caller respects
180        // this function's safety.
181        unsafe { core::mem::transmute::<&str, &str>(id.0.deref()) }
182    }
183}
184
185#[cfg(test)]
186mod tests {
187    use super::*;
188
189    #[test]
190    fn test_string_set_basic_operations() {
191        let mut set = UnsyncStringSet::try_new().unwrap();
192
193        // Test inserting new strings
194        let id1 = set.try_insert("hello").unwrap();
195        let id2 = set.try_insert("world").unwrap();
196        let id3 = set.try_insert("hello").unwrap(); // duplicate
197
198        // Verify duplicate returns same ID
199        assert_eq!(&*id1.0, &*id3.0);
200        assert_ne!(&*id1.0, &*id2.0);
201
202        // Verify retrieval
203        unsafe {
204            assert_eq!(set.get_string(id1), "hello");
205            assert_eq!(set.get_string(id2), "world");
206            assert_eq!(set.get_string(id3), "hello");
207        }
208    }
209
210    #[test]
211    fn test_string_lengths_and_alignment() {
212        let mut set = UnsyncStringSet::try_new().unwrap();
213
214        // Test various string lengths that might cause alignment issues
215        let test_strings = [
216            "",                                    // 0 bytes
217            "a",                                   // 1 byte
218            "ab",                                  // 2 bytes
219            "abc",                                 // 3 bytes
220            "abcd",                                // 4 bytes
221            "abcdefg",                             // 7 bytes
222            "abcdefgh",                            // 8 bytes (usize boundary on 64-bit)
223            "abcdefghijklmno",                     // 15 bytes
224            "abcdefghijklmnop",                    // 16 bytes
225            "abcdefghijklmnopqrstuvwxyz123456789", // 35 bytes
226        ];
227
228        let mut ids = Vec::new();
229        for s in &test_strings {
230            let id = set.try_insert(s).unwrap();
231            ids.push(id);
232        }
233
234        // Verify all strings can be retrieved correctly
235        for (id, expected) in ids.iter().zip(&test_strings) {
236            unsafe {
237                assert_eq!(set.get_string(*id), *expected);
238            }
239        }
240    }
241
242    #[test]
243    fn test_unicode_strings() {
244        let mut set = UnsyncStringSet::try_new().unwrap();
245
246        let unicode_strings = [
247            "café",         // Latin with accents
248            "🦀",           // Emoji (4 bytes)
249            "こんにちは",   // Japanese
250            "Здравствуй",   // Cyrillic
251            "🔥💯✨",       // Multiple emoji
252            "a\u{0000}b",   // Embedded null
253            "line1\nline2", // Newline
254            "tab\there",    // Tab
255        ];
256
257        let mut ids = Vec::new();
258        for s in &unicode_strings {
259            let id = set.try_insert(s).unwrap();
260            ids.push(id);
261        }
262
263        // Verify all Unicode strings are preserved correctly
264        for (id, expected) in ids.iter().zip(&unicode_strings) {
265            unsafe {
266                assert_eq!(set.get_string(*id), *expected);
267            }
268        }
269    }
270
271    #[test]
272    fn test_capacity_and_growth() {
273        // Test with minimal capacity
274        let mut set = UnsyncStringSet::try_with_capacity(1).unwrap();
275
276        // Insert more strings than initial capacity to force growth
277        let test_strings: Vec<String> = (0..50).map(|i| format!("growth_test_{}", i)).collect();
278
279        let mut ids = Vec::new();
280        for s in &test_strings {
281            let id = set.try_insert(s).unwrap();
282            ids.push(id);
283        }
284
285        // Verify all strings are still accessible after growth
286        for (id, expected) in ids.iter().zip(&test_strings) {
287            unsafe {
288                assert_eq!(set.get_string(*id), expected);
289            }
290        }
291    }
292
293    #[test]
294    #[cfg_attr(miri, ignore)]
295    fn test_large_strings() {
296        let mut set = UnsyncStringSet::try_new().unwrap();
297
298        // Test moderately large string
299        let large_string = "x".repeat(1024);
300        let id1 = set.try_insert(&large_string).unwrap();
301
302        unsafe {
303            assert_eq!(set.get_string(id1), large_string);
304        }
305
306        // Test very large string
307        let very_large_string = "y".repeat(65536);
308        let id2 = set.try_insert(&very_large_string).unwrap();
309
310        unsafe {
311            assert_eq!(set.get_string(id2), very_large_string);
312            // Verify first string is still intact
313            assert_eq!(set.get_string(id1), large_string);
314        }
315
316        // Test extremely large string (>2 MiB) to trigger different ChainAllocator path
317        let huge_string = "z".repeat(2 * 1024 * 1024 + 1000); // >2 MiB
318        let id3 = set.try_insert(&huge_string).unwrap();
319
320        unsafe {
321            assert_eq!(set.get_string(id3), huge_string);
322            // Verify previous strings are still intact
323            assert_eq!(set.get_string(id1), large_string);
324            assert_eq!(set.get_string(id2), very_large_string);
325        }
326    }
327
328    #[test]
329    fn test_many_small_strings() {
330        const NUM_STRINGS: usize = if cfg!(miri) { 100 } else { 1000 };
331        let mut set = UnsyncStringSet::try_new().unwrap();
332
333        // Insert many small strings to test fragmentation and growth
334        let mut ids = Vec::with_capacity(NUM_STRINGS);
335        let mut expected = Vec::with_capacity(NUM_STRINGS);
336
337        for i in 0..NUM_STRINGS {
338            let s = format!("{}", i);
339            let id = set.try_insert(&s).unwrap();
340            ids.push(id);
341            expected.push(s);
342        }
343
344        // Verify all strings are still correct
345        for (id, expected_str) in ids.iter().zip(&expected) {
346            unsafe {
347                assert_eq!(set.get_string(*id), expected_str);
348            }
349        }
350    }
351}