// libdd_profiling/collections/string_storage.rs
// Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/
// SPDX-License-Identifier: Apache-2.0

use anyhow::Context;
use std::cell::Cell;
use std::collections::HashMap;
use std::hash::BuildHasherDefault;
use std::num::NonZeroU32;
use std::rc::Rc;

use super::identifiable::StringId;
use super::string_table::StringTable;

// Per-string bookkeeping for one interned entry.
#[derive(PartialEq, Debug)]
struct ManagedStringData {
    // The interned string itself; the same `Rc` is shared with the
    // `str_to_id` map key, so the two maps never hold divergent copies.
    str: Rc<str>,
    // One-element inline cache mapping this string to the `StringId` it was
    // assigned in the `StringTable` identified by the stored
    // `InternalCachedProfileId`. See the comment on `CachedProfileId` for the
    // reuse/invalidation rules.
    cached_seq_num: Cell<Option<(InternalCachedProfileId, StringId)>>,
    // Reference count: incremented by `intern`, decremented by `unintern`;
    // entries whose count is zero are garbage-collected by `advance_gen`.
    usage_count: Cell<u32>,
}

// Storage for interned, reference-counted strings shared across profiles.
// NOT thread-safe by itself — see the `unsafe impl Send` below for the
// required `Mutex` wrapping.
pub struct ManagedStringStorage {
    // Next id to hand out from `intern_new`; id 0 is reserved for "".
    next_id: u32,
    // id -> entry. FxHasher is used as the (fast, non-default) hasher.
    id_to_data: HashMap<u32, ManagedStringData, BuildHasherDefault<rustc_hash::FxHasher>>,
    // Reverse index, kept strictly in sync with `id_to_data`.
    str_to_id: HashMap<Rc<str>, u32, BuildHasherDefault<rustc_hash::FxHasher>>,
    // Bumped on every `advance_gen` (garbage-collection pass).
    current_gen: u32,
    // Source of unique ids handed out by `next_cached_profile_id`.
    next_cached_profile_id: InternalCachedProfileId,
}

#[derive(PartialEq, Debug)]
// The `ManagedStringStorage::get_seq_num` operation is used to map a `ManagedStorageId` into a
// `StringId` for a given `StringTable`. As an optimization, we store a one-element `cached_seq_num`
// inline cache with each `ManagedStringData` entry, so that repeatedly calling
// `get_seq_num` with the same id provides faster lookups.
//
// Because:
// 1. Multiple profiles can be using the same `ManagedStringTable`
// 2. The same profile resets its `StringTable` on serialization and starts anew
// ...we need a way to identify when the cache can and cannot be reused.
//
// This is where the `CachedProfileId` comes in. A given `CachedProfileId` should be considered
// as representing a unique `StringTable`. Different `StringTable`s should have different
// `CachedProfileId`s, and when a `StringTable` gets flushed and starts anew, it should also have a
// different `CachedProfileId`.
//
// **This struct is on purpose not Copy and not Clone to try to make it really hard to accidentally
// reuse** when a profile gets reset.
pub struct CachedProfileId {
    id: u32,
}

// Copy/Clone-able twin of `CachedProfileId`, used internally where copying is
// harmless (e.g. inside the `cached_seq_num` `Cell`). The public type stays
// non-Copy on purpose; see the comment on `CachedProfileId`.
#[derive(PartialEq, Debug, Copy, Clone)]
struct InternalCachedProfileId {
    id: u32,
}

// Enable Mutex<ManagedStringStorage> to be Send
//
// SAFETY: ManagedStringStorage **must always** be wrapped with a Mutex -- you can't pass one in to
// a Profile without it. This is because it is not, by itself, thread-safe (its internals use `Rc`
// and `Cell`, which are not thread-safe), and its real-world use cases are expected to include
// concurrency.
unsafe impl Send for ManagedStringStorage {}

63impl From<&CachedProfileId> for InternalCachedProfileId {
64    fn from(cached: &CachedProfileId) -> Self {
65        InternalCachedProfileId { id: cached.id }
66    }
67}
68
69impl ManagedStringStorage {
70    pub fn new() -> Self {
71        let mut storage = ManagedStringStorage {
72            next_id: 0,
73            id_to_data: Default::default(),
74            str_to_id: Default::default(),
75            current_gen: 0,
76            next_cached_profile_id: InternalCachedProfileId { id: 0 },
77        };
78        // Ensure empty string gets id 0 and always has usage > 0 so it's always retained
79        // Safety: On an empty managed string table intern should never fail.
80        #[allow(clippy::expect_used)]
81        storage.intern_new("").expect("Initialization to succeed");
82        storage
83    }
84
85    pub fn next_cached_profile_id(&mut self) -> anyhow::Result<CachedProfileId> {
86        let next_id = self.next_cached_profile_id.id;
87        self.next_cached_profile_id = InternalCachedProfileId {
88            id: next_id
89                .checked_add(1)
90                .context("Ran out of cached_profile_ids!")?,
91        };
92        Ok(CachedProfileId { id: next_id })
93    }
94
95    pub fn advance_gen(&mut self) {
96        self.id_to_data.retain(|_, data| {
97            let retain = data.usage_count.get() > 0;
98            if !retain {
99                self.str_to_id.remove_entry(&data.str);
100            }
101            retain
102        });
103        self.current_gen += 1;
104    }
105
106    pub fn intern(&mut self, item: &str) -> anyhow::Result<u32> {
107        if item.is_empty() {
108            // We don't increase ref-counts on the empty string
109            return Ok(0);
110        }
111
112        let entry = self.str_to_id.get_key_value(item);
113        match entry {
114            Some((_, id)) => {
115                let usage_count = &self
116                    .id_to_data
117                    .get(id)
118                    .context("BUG: id_to_data and str_to_id should be in sync")?
119                    .usage_count;
120                usage_count.set(
121                    usage_count
122                        .get()
123                        .checked_add(1)
124                        .context("Usage_count overflow")?,
125                );
126                Ok(*id)
127            }
128            None => self.intern_new(item),
129        }
130    }
131
132    fn intern_new(&mut self, item: &str) -> anyhow::Result<u32> {
133        let id = self.next_id;
134        let str: Rc<str> = item.into();
135        let data = ManagedStringData {
136            str: str.clone(),
137            cached_seq_num: Cell::new(None),
138            usage_count: Cell::new(1),
139        };
140        self.next_id = self
141            .next_id
142            .checked_add(1)
143            .context("Ran out of string ids!")?;
144        let old_value = self.str_to_id.insert(str.clone(), id);
145        debug_assert_eq!(old_value, None);
146        let old_value = self.id_to_data.insert(id, data);
147        debug_assert_eq!(old_value, None);
148        Ok(id)
149    }
150
151    // Here id is a NonZeroU32 because an id of 0 is the empty string and that can never be
152    // uninterned (and it should be skipped instead in the caller)
153    pub fn unintern(&mut self, id: NonZeroU32) -> anyhow::Result<()> {
154        let data = self.get_data(id.into())?;
155        let usage_count = &data.usage_count;
156        usage_count.set(
157            usage_count
158                .get()
159                .checked_sub(1)
160                .context("Usage_count underflow")?,
161        );
162        Ok(())
163    }
164
165    // Here id is a NonZeroU32 because an id of 0 which StringTable always maps to 0 as well so this
166    // entire call can be skipped
167    // See comment on `struct CachedProfileId` for details on how to use it.
168    pub fn get_seq_num(
169        &mut self,
170        id: NonZeroU32,
171        profile_strings: &mut StringTable,
172        cached_profile_id: &CachedProfileId,
173    ) -> anyhow::Result<StringId> {
174        let data = self.get_data(id.into())?;
175
176        match data.cached_seq_num.get() {
177            Some((profile_id, seq_num)) if profile_id.id == cached_profile_id.id => Ok(seq_num),
178            _ => {
179                let seq_num = profile_strings.try_intern(data.str.as_ref())?;
180                data.cached_seq_num
181                    .set(Some((cached_profile_id.into(), seq_num)));
182                Ok(seq_num)
183            }
184        }
185    }
186
187    pub fn get_string(&self, id: u32) -> anyhow::Result<Rc<str>> {
188        let data = self.get_data(id)?;
189
190        Ok(data.str.clone())
191    }
192
193    fn get_data(&self, id: u32) -> anyhow::Result<&ManagedStringData> {
194        match self.id_to_data.get(&id) {
195            Some(v) => {
196                if v.usage_count.get() > 0 {
197                    Ok(v)
198                } else {
199                    Err(anyhow::anyhow!(
200                        "Tried to read data for id {} ('{}') but usage count was zero",
201                        id,
202                        v.str
203                    ))
204                }
205            }
206            None => Err(anyhow::anyhow!("ManagedStringId {} is not valid", id)),
207        }
208    }
209}
210
211impl Default for ManagedStringStorage {
212    fn default() -> Self {
213        Self::new()
214    }
215}