libdd_profiling/collections/string_table/mod.rs
1// Copyright 2024-Present Datadog, Inc. https://www.datadoghq.com/
2// SPDX-License-Identifier: Apache-2.0
3
4mod error;
5#[allow(unused)]
6pub mod wordpress_test_data;
7
8pub use error::*;
9
10use crate::collections::identifiable::StringId;
11use crate::iter::{IntoLendingIterator, LendingIterator};
12use libdd_alloc::{AllocError, Allocator, ChainAllocator, VirtualAllocator};
13use std::alloc::Layout;
14
15/// A trait that indicates an allocator is arena allocator, meaning it doesn't
16/// deallocate individual items, but deallocates their memory as a group when
17/// the arena is dropped.
18pub trait ArenaAllocator: Allocator {
19 /// Copies the str into the arena, and returns a slice to the new str.
20 fn allocate(&self, str: &str) -> Result<&str, AllocError> {
21 // TODO: We might want each allocator to return its own empty string
22 // so we can debug where the value came from.
23 if str.is_empty() {
24 return Ok("");
25 }
26 let layout = Layout::for_value(str);
27 let uninit_ptr = Allocator::allocate(self, layout)?;
28
29 // Copy the bytes of the string into the allocated memory.
30 // SAFETY: this is guaranteed to not be overlapping because an
31 // allocator must not return aliasing bytes in its allocations.
32 unsafe {
33 let src = str.as_ptr();
34 let dst = uninit_ptr.as_ptr() as *mut u8;
35 let count = str.len();
36 core::ptr::copy_nonoverlapping(src, dst, count);
37 }
38
39 // SAFETY: The bytes were properly initialized, and they cannot be
40 // misaligned because they have an alignment of 1, so it is safe to
41 // create a slice of the given data and length. The lifetime matches
42 // the arena allocator's lifetime.
43 let slice: &[u8] =
44 unsafe { core::slice::from_raw_parts(uninit_ptr.as_ptr() as *const u8, str.len()) };
45
46 // SAFETY: Since the bytes were copied from a valid str without
47 // slicing, the bytes must also be utf-8.
48 Ok(unsafe { core::str::from_utf8_unchecked(slice) })
49 }
50}
51
52impl<A: Allocator + Clone> ArenaAllocator for ChainAllocator<A> {}
53
54type Hasher = core::hash::BuildHasherDefault<rustc_hash::FxHasher>;
55type HashSet<K> = indexmap::IndexSet<K, Hasher>;
56
57/// Holds unique strings and provides [StringId]s that correspond to the order
58/// that the strings were inserted.
59pub struct StringTable {
60 /// The bytes of each string stored in `strings` are allocated here.
61 bytes: ChainAllocator<VirtualAllocator>,
62
63 /// The ordered hash set of unique strings. The order becomes the StringId.
64 /// The static lifetime is a lie, it is tied to the `bytes`, which is only
65 /// moved if the string table is moved e.g.
66 /// [StringTable::into_lending_iterator].
67 /// References to the underlying strings should generally not be handed,
68 /// but if they are, they should be bound to the string table's lifetime
69 /// or the lending iterator's lifetime.
70 strings: HashSet<&'static str>,
71}
72
73impl Default for StringTable {
74 fn default() -> Self {
75 Self::new()
76 }
77}
78
79impl StringTable {
80 /// Creates a new string table, which initially holds the empty string and
81 /// no others.
82 pub fn new() -> Self {
83 // Keep in mind 32-bit .NET. There is only 2 GiB of virtual memory
84 // total available to an application, and we're not the application,
85 // we're just a piece inside it. Additionally, there may be 2 or more
86 // string tables in memory at a given time. Talk to .NET profiling
87 // engineers before making this any bigger.
88 const SIZE_HINT: usize = 4 * 1024 * 1024;
89 let bytes = ChainAllocator::new_in(SIZE_HINT, VirtualAllocator {});
90
91 let mut strings = HashSet::with_hasher(Hasher::default());
92 // It varies by implementation, but frequently I've noticed that the
93 // capacity after the first insertion is quite small, as in 3. This is
94 // a bit too small and there are frequent reallocations. For one sample
95 // with endpoint + code hotspots, we'd have at least these strings:
96 // - ""
97 // - At least one sample type
98 // - At least one sample unit--already at 3 without any samples.
99 // - "local root span id"
100 // - "span id"
101 // - "trace endpoint"
102 // - A file and/or function name per frame.
103 // So with a capacity like 3, we end up reallocating a bunch on or
104 // before the very first sample. The number here is not fine-tuned,
105 // just skipping some obviously bad, tiny sizes.
106 strings.reserve(32);
107
108 // Always hold the empty string as item 0. Do not insert it via intern
109 // because that will try to allocate zero-bytes from the storage,
110 // which is sketchy.
111 strings.insert("");
112
113 Self { bytes, strings }
114 }
115
116 /// Returns the number of strings currently held in the string table.
117 #[inline]
118 #[allow(clippy::len_without_is_empty)]
119 pub fn len(&self) -> usize {
120 self.strings.len()
121 }
122
123 /// Adds the string to the string table if it isn't present already, and
124 /// returns a [StringId] that corresponds to the order that this string
125 /// was originally inserted.
126 ///
127 /// # Panics
128 ///
129 /// Panics if an allocator fails, or if the number of strings would exceed
130 /// [`u32::MAX`].
131 pub fn intern(&mut self, str: &str) -> StringId {
132 #[allow(clippy::expect_used)]
133 self.try_intern(str).expect("failed to intern string")
134 }
135
136 /// Adds the string to the string table if it isn't present already, and
137 /// returns a [`StringId`] that corresponds to the order that this string
138 /// was originally inserted.
139 ///
140 /// Fails if an allocator fails, or if the number of strings would exceed
141 /// [`u32::MAX`].
142 pub fn try_intern(&mut self, str: &str) -> Result<StringId, Error> {
143 let set = &mut self.strings;
144 match set.get_index_of(str) {
145 // SAFETY: if it already exists, it must fit in the range.
146 Some(offset) => Ok(unsafe { StringId::try_from(offset).unwrap_unchecked() }),
147 None => {
148 // No match. Get the current size of the table, which
149 // corresponds to the StringId it will have when inserted.
150 let string_id = StringId::try_from(set.len()).map_err(|_| Error::StorageFull)?;
151
152 // Make a new string in the arena, and fudge its lifetime
153 // to appease the borrow checker.
154 let new_str = {
155 let s = ArenaAllocator::allocate(&self.bytes, str)?;
156
157 // SAFETY: all references to this value get re-narrowed to
158 // the lifetime of the string table or iterator when
159 // exposed to the user. The string table and iterator will
160 // keep the arena alive, making the access safe.
161 unsafe { core::mem::transmute::<&str, &'static str>(s) }
162 };
163
164 // Add it to the set.
165 self.strings.try_reserve(1)?;
166 self.strings.insert(new_str);
167
168 Ok(string_id)
169 }
170 }
171 }
172}
173
174/// A [LendingIterator] for a [StringTable]. Make one by calling
175/// [StringTable::into_lending_iter].
176pub struct StringTableIter {
177 /// This is actually used, the compiler doesn't know that the static
178 /// references in `iter` actually point in here.
179 #[allow(unused)]
180 bytes: ChainAllocator<VirtualAllocator>,
181
182 /// The strings of the string table, in order of insertion.
183 /// The static lifetimes are a lie, they are tied to the `bytes`. When
184 /// handing out references, bind the lifetime to the iterator's lifetime,
185 /// which is a [LendingIterator] is needed.
186 iter: <HashSet<&'static str> as IntoIterator>::IntoIter,
187}
188
189impl StringTableIter {
190 fn new(string_table: StringTable) -> StringTableIter {
191 StringTableIter {
192 bytes: string_table.bytes,
193 iter: string_table.strings.into_iter(),
194 }
195 }
196}
197
198impl LendingIterator for StringTableIter {
199 type Item<'a>
200 = &'a str
201 where
202 Self: 'a;
203
204 fn next(&mut self) -> Option<Self::Item<'_>> {
205 self.iter.next()
206 }
207
208 fn count(self) -> usize {
209 self.iter.count()
210 }
211}
212
213impl IntoLendingIterator for StringTable {
214 type Iter = StringTableIter;
215
216 fn into_lending_iter(self) -> Self::Iter {
217 StringTableIter::new(self)
218 }
219}
220
221#[cfg(test)]
222mod tests {
223 use super::*;
224 use crate::collections::identifiable::Id;
225
226 #[test]
227 fn fuzz_arena_allocator() {
228 bolero::check!()
229 .with_type::<(usize, Vec<String>)>()
230 .for_each(|(size_hint, strings)| {
231 // If the size_hint is insanely large, get allowed allocation
232 // failures. These are not interesting, so avoid them.
233 if *size_hint > 4 * 1024 * 1024 * 1024 {
234 return;
235 }
236 let bytes = ChainAllocator::new_in(*size_hint, VirtualAllocator {});
237 let mut allocated_strings = vec![];
238 for string in strings {
239 let s =
240 ArenaAllocator::allocate(&bytes, string).expect("allocation to succeed");
241 assert_eq!(s, string);
242 allocated_strings.push(s);
243 }
244 assert_eq!(strings.len(), allocated_strings.len());
245 strings
246 .iter()
247 .zip(allocated_strings.iter())
248 .for_each(|(s, t)| assert_eq!(s, t));
249 });
250 }
251
252 /// This is a fuzz test for the allocation optimized `StringTable`.
253 /// It checks both safety (lack of crashes / sanitizer failures),
254 /// as well as functional correctness (the table should behave like an
255 /// ordered set).
256 /// Limitations:
257 /// - The crate used here to generate Strings internally has a default range for the length of
258 /// a string, (0..=64) We should experiment with longer strings to see what happens. https://github.com/camshaft/bolero/blob/f401669697ffcbe7f34cbfd09fd57b93d5df734c/lib/bolero-generator/src/alloc/mod.rs#L17
259 /// - Since iterating is destructive, can only check the string values once.
260 ///
261 /// `cargo +nightly bolero test collections::string_table::tests::fuzz_string_table -T 1min`
262 #[test]
263 fn fuzz_string_table() {
264 bolero::check!()
265 .with_type::<Vec<String>>()
266 .for_each(|strings| {
267 // Compare our optimized implementation against a "golden" version
268 // from the standard library.
269 let mut golden_list = vec![""];
270 let mut golden_set = std::collections::HashSet::from([""]);
271 let mut st = StringTable::new();
272
273 for string in strings {
274 assert_eq!(st.len(), golden_set.len());
275 if golden_set.insert(string) {
276 golden_list.push(string);
277 }
278
279 let str_id = st.intern(string);
280 // The str_id should refer to the id_th string interned
281 // on the list. We can't look inside the `StringTable`
282 // in a non-desctrive way, but fortunately we have the
283 // `golden_list` to compare against.
284 assert_eq!(string, golden_list[usize::from(str_id)]);
285 }
286 assert_eq!(st.len(), golden_list.len());
287 assert_eq!(st.len(), golden_set.len());
288
289 // Check that the strings remain in order
290 let mut it = st.into_lending_iter();
291 let mut idx = 0;
292 while let Some(s) = it.next() {
293 assert_eq!(s, golden_list[idx]);
294 idx += 1;
295 }
296 })
297 }
298
299 #[test]
300 fn test_basics() {
301 let mut table = StringTable::new();
302 // The empty string should already be present.
303 assert_eq!(1, table.len());
304 assert_eq!(StringId::ZERO, table.intern(""));
305
306 // Intern a string literal to ensure ?Sized works.
307 let string = table.intern("datadog");
308 assert_eq!(StringId::from_offset(1), string);
309 assert_eq!(2, table.len());
310 }
311
312 #[track_caller]
313 fn test_from_src(src: &[&str]) {
314 // Insert all the strings.
315 let mut table = StringTable::new();
316 let n_strings = src.len();
317 for string in src {
318 table.intern(string);
319 }
320 assert_eq!(n_strings, table.len());
321
322 // Re-inserting doesn't change the size.
323 for string in src {
324 table.intern(string);
325 }
326 assert_eq!(n_strings, table.len());
327
328 // Check that they are ordered correctly when iterating.
329 let mut actual_iter = table.into_lending_iter();
330 let mut expected_iter = src.iter();
331 while let (Some(expected), Some(actual)) = (expected_iter.next(), actual_iter.next()) {
332 assert_eq!(*expected, actual);
333 }
334
335 // The iterators should be exhausted at this point.
336 assert_eq!(None, expected_iter.next());
337 assert_eq!(0, actual_iter.count());
338 }
339
340 #[test]
341 fn test_small_set_of_strings() {
342 let cases: &[_] = &[
343 "",
344 "local root span id",
345 "span id",
346 "trace endpoint",
347 "samples",
348 "count",
349 "wall-time",
350 "nanoseconds",
351 "cpu-time",
352 "<?php",
353 "/srv/demo/public/index.php",
354 "pid",
355 "/var/www/public/index.php",
356 "main",
357 "thread id",
358 "A\\Very\\Long\\Php\\Namespace\\Class::method",
359 "/",
360 ];
361 test_from_src(cases);
362 }
363
364 /// Test inserting strings from a WordPress profile.
365 /// Here we're checking that we don't panic or otherwise fail, and that
366 /// the total number of strings and the bytes of those strings match.
367 #[test]
368 fn test_wordpress() {
369 test_from_src(&wordpress_test_data::WORDPRESS_STRINGS);
370 }
371}