libdd_profiling/collections/string_table/
mod.rs

1// Copyright 2024-Present Datadog, Inc. https://www.datadoghq.com/
2// SPDX-License-Identifier: Apache-2.0
3
4mod error;
5#[allow(unused)]
6pub mod wordpress_test_data;
7
8pub use error::*;
9
10use crate::collections::identifiable::StringId;
11use crate::iter::{IntoLendingIterator, LendingIterator};
12use libdd_alloc::{AllocError, Allocator, ChainAllocator, VirtualAllocator};
13use std::alloc::Layout;
14
15/// A trait that indicates an allocator is arena allocator, meaning it doesn't
16/// deallocate individual items, but deallocates their memory as a  group when
17/// the arena is dropped.
18pub trait ArenaAllocator: Allocator {
19    /// Copies the str into the arena, and returns a slice to the new str.
20    fn allocate(&self, str: &str) -> Result<&str, AllocError> {
21        // TODO: We might want each allocator to return its own empty string
22        // so we can debug where the value came from.
23        if str.is_empty() {
24            return Ok("");
25        }
26        let layout = Layout::for_value(str);
27        let uninit_ptr = Allocator::allocate(self, layout)?;
28
29        // Copy the bytes of the string into the allocated memory.
30        // SAFETY: this is guaranteed to not be overlapping because an
31        // allocator must not return aliasing bytes in its allocations.
32        unsafe {
33            let src = str.as_ptr();
34            let dst = uninit_ptr.as_ptr() as *mut u8;
35            let count = str.len();
36            core::ptr::copy_nonoverlapping(src, dst, count);
37        }
38
39        // SAFETY: The bytes were properly initialized, and they cannot be
40        // misaligned because they have an alignment of 1, so it is safe to
41        // create a slice of the given data and length. The lifetime matches
42        // the arena allocator's lifetime.
43        let slice: &[u8] =
44            unsafe { core::slice::from_raw_parts(uninit_ptr.as_ptr() as *const u8, str.len()) };
45
46        // SAFETY: Since the bytes were copied from a valid str without
47        // slicing, the bytes must also be utf-8.
48        Ok(unsafe { core::str::from_utf8_unchecked(slice) })
49    }
50}
51
52impl<A: Allocator + Clone> ArenaAllocator for ChainAllocator<A> {}
53
54type Hasher = core::hash::BuildHasherDefault<rustc_hash::FxHasher>;
55type HashSet<K> = indexmap::IndexSet<K, Hasher>;
56
57/// Holds unique strings and provides [StringId]s that correspond to the order
58/// that the strings were inserted.
59pub struct StringTable {
60    /// The bytes of each string stored in `strings` are allocated here.
61    bytes: ChainAllocator<VirtualAllocator>,
62
63    /// The ordered hash set of unique strings. The order becomes the StringId.
64    /// The static lifetime is a lie, it is tied to the `bytes`, which is only
65    /// moved if the string table is moved e.g.
66    /// [StringTable::into_lending_iterator].
67    /// References to the underlying strings should generally not be handed,
68    /// but if they are, they should be bound to the string table's lifetime
69    /// or the lending iterator's lifetime.
70    strings: HashSet<&'static str>,
71}
72
73impl Default for StringTable {
74    fn default() -> Self {
75        Self::new()
76    }
77}
78
79impl StringTable {
80    /// Creates a new string table, which initially holds the empty string and
81    /// no others.
82    pub fn new() -> Self {
83        // Keep in mind 32-bit .NET. There is only 2 GiB of virtual memory
84        // total available to an application, and we're not the application,
85        // we're just a piece inside it. Additionally, there may be 2 or more
86        // string tables in memory at a given time. Talk to .NET profiling
87        // engineers before making this any bigger.
88        const SIZE_HINT: usize = 4 * 1024 * 1024;
89        let bytes = ChainAllocator::new_in(SIZE_HINT, VirtualAllocator {});
90
91        let mut strings = HashSet::with_hasher(Hasher::default());
92        // It varies by implementation, but frequently I've noticed that the
93        // capacity after the first insertion is quite small, as in 3. This is
94        // a bit too small and there are frequent reallocations. For one sample
95        // with endpoint + code hotspots, we'd have at least these strings:
96        // - ""
97        // - At least one sample type
98        // - At least one sample unit--already at 3 without any samples.
99        // - "local root span id"
100        // - "span id"
101        // - "trace endpoint"
102        // - A file and/or function name per frame.
103        // So with a capacity like 3, we end up reallocating a bunch on or
104        // before the very first sample. The number here is not fine-tuned,
105        // just skipping some obviously bad, tiny sizes.
106        strings.reserve(32);
107
108        // Always hold the empty string as item 0. Do not insert it via intern
109        // because that will try to allocate zero-bytes from the storage,
110        // which is sketchy.
111        strings.insert("");
112
113        Self { bytes, strings }
114    }
115
116    /// Returns the number of strings currently held in the string table.
117    #[inline]
118    #[allow(clippy::len_without_is_empty)]
119    pub fn len(&self) -> usize {
120        self.strings.len()
121    }
122
123    /// Adds the string to the string table if it isn't present already, and
124    /// returns a [StringId] that corresponds to the order that this string
125    /// was originally inserted.
126    ///
127    /// # Panics
128    ///
129    /// Panics if an allocator fails, or if the number of strings would exceed
130    /// [`u32::MAX`].
131    pub fn intern(&mut self, str: &str) -> StringId {
132        #[allow(clippy::expect_used)]
133        self.try_intern(str).expect("failed to intern string")
134    }
135
136    /// Adds the string to the string table if it isn't present already, and
137    /// returns a [`StringId`] that corresponds to the order that this string
138    /// was originally inserted.
139    ///
140    /// Fails if an allocator fails, or if the number of strings would exceed
141    /// [`u32::MAX`].
142    pub fn try_intern(&mut self, str: &str) -> Result<StringId, Error> {
143        let set = &mut self.strings;
144        match set.get_index_of(str) {
145            // SAFETY: if it already exists, it must fit in the range.
146            Some(offset) => Ok(unsafe { StringId::try_from(offset).unwrap_unchecked() }),
147            None => {
148                // No match. Get the current size of the table, which
149                // corresponds to the StringId it will have when inserted.
150                let string_id = StringId::try_from(set.len()).map_err(|_| Error::StorageFull)?;
151
152                // Make a new string in the arena, and fudge its lifetime
153                // to appease the borrow checker.
154                let new_str = {
155                    let s = ArenaAllocator::allocate(&self.bytes, str)?;
156
157                    // SAFETY: all references to this value get re-narrowed to
158                    // the lifetime of the string table or iterator when
159                    // exposed to the user. The string table and iterator will
160                    // keep the arena alive, making the access safe.
161                    unsafe { core::mem::transmute::<&str, &'static str>(s) }
162                };
163
164                // Add it to the set.
165                self.strings.try_reserve(1)?;
166                self.strings.insert(new_str);
167
168                Ok(string_id)
169            }
170        }
171    }
172}
173
174/// A [LendingIterator] for a [StringTable]. Make one by calling
175/// [StringTable::into_lending_iter].
176pub struct StringTableIter {
177    /// This is actually used, the compiler doesn't know that the static
178    /// references in `iter` actually point in here.
179    #[allow(unused)]
180    bytes: ChainAllocator<VirtualAllocator>,
181
182    /// The strings of the string table, in order of insertion.
183    /// The static lifetimes are a lie, they are tied to the `bytes`. When
184    /// handing out references, bind the lifetime to the iterator's lifetime,
185    /// which is a [LendingIterator] is needed.
186    iter: <HashSet<&'static str> as IntoIterator>::IntoIter,
187}
188
189impl StringTableIter {
190    fn new(string_table: StringTable) -> StringTableIter {
191        StringTableIter {
192            bytes: string_table.bytes,
193            iter: string_table.strings.into_iter(),
194        }
195    }
196}
197
198impl LendingIterator for StringTableIter {
199    type Item<'a>
200        = &'a str
201    where
202        Self: 'a;
203
204    fn next(&mut self) -> Option<Self::Item<'_>> {
205        self.iter.next()
206    }
207
208    fn count(self) -> usize {
209        self.iter.count()
210    }
211}
212
213impl IntoLendingIterator for StringTable {
214    type Iter = StringTableIter;
215
216    fn into_lending_iter(self) -> Self::Iter {
217        StringTableIter::new(self)
218    }
219}
220
221#[cfg(test)]
222mod tests {
223    use super::*;
224    use crate::collections::identifiable::Id;
225
226    #[test]
227    fn fuzz_arena_allocator() {
228        bolero::check!()
229            .with_type::<(usize, Vec<String>)>()
230            .for_each(|(size_hint, strings)| {
231                // If the size_hint is insanely large, get allowed allocation
232                // failures.  These are not interesting, so avoid them.
233                if *size_hint > 4 * 1024 * 1024 * 1024 {
234                    return;
235                }
236                let bytes = ChainAllocator::new_in(*size_hint, VirtualAllocator {});
237                let mut allocated_strings = vec![];
238                for string in strings {
239                    let s =
240                        ArenaAllocator::allocate(&bytes, string).expect("allocation to succeed");
241                    assert_eq!(s, string);
242                    allocated_strings.push(s);
243                }
244                assert_eq!(strings.len(), allocated_strings.len());
245                strings
246                    .iter()
247                    .zip(allocated_strings.iter())
248                    .for_each(|(s, t)| assert_eq!(s, t));
249            });
250    }
251
252    /// This is a fuzz test for the allocation optimized `StringTable`.
253    /// It checks both safety (lack of crashes / sanitizer failures),
254    /// as well as functional correctness (the table should behave like an
255    /// ordered set).
256    /// Limitations:
257    ///   - The crate used here to generate Strings internally has a default range for the length of
258    ///     a string, (0..=64) We should experiment with longer strings to see what happens. https://github.com/camshaft/bolero/blob/f401669697ffcbe7f34cbfd09fd57b93d5df734c/lib/bolero-generator/src/alloc/mod.rs#L17
259    ///   - Since iterating is destructive, can only check the string values once.
260    ///
261    /// `cargo +nightly bolero test collections::string_table::tests::fuzz_string_table -T 1min`
262    #[test]
263    fn fuzz_string_table() {
264        bolero::check!()
265            .with_type::<Vec<String>>()
266            .for_each(|strings| {
267                // Compare our optimized implementation against a "golden" version
268                // from the standard library.
269                let mut golden_list = vec![""];
270                let mut golden_set = std::collections::HashSet::from([""]);
271                let mut st = StringTable::new();
272
273                for string in strings {
274                    assert_eq!(st.len(), golden_set.len());
275                    if golden_set.insert(string) {
276                        golden_list.push(string);
277                    }
278
279                    let str_id = st.intern(string);
280                    // The str_id should refer to the id_th string interned
281                    // on the list.  We can't look inside the `StringTable`
282                    // in a non-desctrive way, but fortunately we have the
283                    // `golden_list` to compare against.
284                    assert_eq!(string, golden_list[usize::from(str_id)]);
285                }
286                assert_eq!(st.len(), golden_list.len());
287                assert_eq!(st.len(), golden_set.len());
288
289                // Check that the strings remain in order
290                let mut it = st.into_lending_iter();
291                let mut idx = 0;
292                while let Some(s) = it.next() {
293                    assert_eq!(s, golden_list[idx]);
294                    idx += 1;
295                }
296            })
297    }
298
299    #[test]
300    fn test_basics() {
301        let mut table = StringTable::new();
302        // The empty string should already be present.
303        assert_eq!(1, table.len());
304        assert_eq!(StringId::ZERO, table.intern(""));
305
306        // Intern a string literal to ensure ?Sized works.
307        let string = table.intern("datadog");
308        assert_eq!(StringId::from_offset(1), string);
309        assert_eq!(2, table.len());
310    }
311
312    #[track_caller]
313    fn test_from_src(src: &[&str]) {
314        // Insert all the strings.
315        let mut table = StringTable::new();
316        let n_strings = src.len();
317        for string in src {
318            table.intern(string);
319        }
320        assert_eq!(n_strings, table.len());
321
322        // Re-inserting doesn't change the size.
323        for string in src {
324            table.intern(string);
325        }
326        assert_eq!(n_strings, table.len());
327
328        // Check that they are ordered correctly when iterating.
329        let mut actual_iter = table.into_lending_iter();
330        let mut expected_iter = src.iter();
331        while let (Some(expected), Some(actual)) = (expected_iter.next(), actual_iter.next()) {
332            assert_eq!(*expected, actual);
333        }
334
335        // The iterators should be exhausted at this point.
336        assert_eq!(None, expected_iter.next());
337        assert_eq!(0, actual_iter.count());
338    }
339
340    #[test]
341    fn test_small_set_of_strings() {
342        let cases: &[_] = &[
343            "",
344            "local root span id",
345            "span id",
346            "trace endpoint",
347            "samples",
348            "count",
349            "wall-time",
350            "nanoseconds",
351            "cpu-time",
352            "<?php",
353            "/srv/demo/public/index.php",
354            "pid",
355            "/var/www/public/index.php",
356            "main",
357            "thread id",
358            "A\\Very\\Long\\Php\\Namespace\\Class::method",
359            "/",
360        ];
361        test_from_src(cases);
362    }
363
364    /// Test inserting strings from a WordPress profile.
365    /// Here we're checking that we don't panic or otherwise fail, and that
366    /// the total number of strings and the bytes of those strings match.
367    #[test]
368    fn test_wordpress() {
369        test_from_src(&wordpress_test_data::WORDPRESS_STRINGS);
370    }
371}