1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
use std::borrow::Borrow;
use std::hash::{Hash, Hasher};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Mutex, MutexGuard};
use std::time::{SystemTime, UNIX_EPOCH};

use ahash::RandomState;
use once_cell::sync::Lazy;
use smartstring::{LazyCompact, SmartString};

use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE;
use crate::prelude::PlHashMap;

/// Process-wide flag recording whether the global string cache is enabled.
///
/// Starts out `false`. It is written by `toggle_string_cache` (with `Release`
/// ordering) and read by `using_string_cache` (with `Acquire` ordering).
pub(crate) static USE_STRING_CACHE: AtomicBool = AtomicBool::new(false);

/// Run `func` with the global string cache enabled and return its result.
///
/// The cache is toggled on before `func` is called and toggled off again
/// afterwards (toggling off also clears the cache, see `toggle_string_cache`).
///
/// # Panics
///
/// If `func` panics, the cache is still toggled off before the panic is
/// propagated to the caller, so a failing closure does not leave the global
/// flag enabled.
pub fn with_string_cache<F: FnOnce() -> T, T>(func: F) -> T {
    toggle_string_cache(true);
    // Catch a panic only to run the clean-up below; it is re-raised unchanged.
    // `AssertUnwindSafe` is acceptable because no state captured by `func` is
    // observed here after a panic: we only reset the cache and resume unwinding.
    //
    // A drop guard is deliberately not used: the clean-up itself may panic
    // (e.g. on a poisoned cache mutex), and a panic inside `Drop` during
    // unwinding would abort the process.
    let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(func));
    toggle_string_cache(false);
    match outcome {
        Ok(out) => out,
        Err(payload) => std::panic::resume_unwind(payload),
    }
}

/// Enable or disable the global string cache used by the Categorical types.
///
/// A shared cache of string categories is what allows join operations on
/// categorical types.
///
/// The new state is stored in `USE_STRING_CACHE`. Switching the cache off
/// additionally clears `STRING_CACHE`; switching it on leaves the current
/// contents untouched.
pub fn toggle_string_cache(toggle: bool) {
    USE_STRING_CACHE.store(toggle, Ordering::Release);
    if toggle {
        // Enabling: nothing to clean up.
        return;
    }
    // Disabling: drop everything that was cached so far.
    STRING_CACHE.clear();
}

/// Clear the global string cache used by the Categorical types.
///
/// Only the cache contents are reset; the enabled/disabled flag is not touched.
pub fn reset_string_cache() {
    let cache = &STRING_CACHE;
    cache.clear()
}

/// Report whether the global string cache is currently enabled.
///
/// Returns the current value of `USE_STRING_CACHE`.
pub fn using_string_cache() -> bool {
    let enabled = USE_STRING_CACHE.load(Ordering::Acquire);
    enabled
}

/// The state guarded by the `StringCache` mutex.
pub(crate) struct SCacheInner {
    /// Maps a string (stored together with its hash) to a `u32`.
    // NOTE(review): the `u32` is presumably the global category index — confirm against callers.
    pub(crate) map: PlHashMap<StrHashGlobal, u32>,
    /// Identifier of this cache instance. `Default` fills it with the number of
    /// nanoseconds since the Unix epoch at construction time, so a cleared
    /// (re-created) cache gets a new value.
    pub(crate) uuid: u128,
}

impl Default for SCacheInner {
    fn default() -> Self {
        Self {
            map: PlHashMap::with_capacity_and_hasher(
                HASHMAP_INIT_SIZE,
                StringCache::get_hash_builder(),
            ),
            uuid: SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap()
                .as_nanos(),
        }
    }
}

/// Used by categorical data that needs to share global categories.
///
/// In *eager* mode you need to explicitly toggle the global string cache for it to have a
/// global effect. In *lazy* mode it is toggled on at the start of a computation run and
/// turned off (deleted) when a result is produced.
///
/// The wrapped `Mutex` guards the actual cache state (`SCacheInner`).
pub(crate) struct StringCache(pub(crate) Mutex<SCacheInner>);

impl StringCache {
    /// The global `StringCache` will always use a predictable seed. This allows local builders to mimic
    /// the hashes in case of contention.
    pub(crate) fn get_hash_builder() -> RandomState {
        RandomState::with_seed(0)
    }

    /// Lock the string cache
    pub(crate) fn lock_map(&self) -> MutexGuard<SCacheInner> {
        self.0.lock().unwrap()
    }

    pub(crate) fn clear(&self) {
        let mut lock = self.lock_map();
        *lock = Default::default();
    }
}

impl Default for StringCache {
    fn default() -> Self {
        StringCache(Mutex::new(Default::default()))
    }
}

/// The process-wide string cache instance.
///
/// Constructed lazily via `StringCache::default()` the first time it is accessed.
pub(crate) static STRING_CACHE: Lazy<StringCache> = Lazy::new(Default::default);

/// A string stored together with a hash value supplied at construction time.
///
/// The `Hash` impl feeds only the stored `hash` to the hasher, so the string
/// itself is not re-hashed. Equality compares both the hash and the string.
#[derive(Eq, Clone)]
pub struct StrHashGlobal {
    /// The string value.
    pub(crate) str: SmartString<LazyCompact>,
    /// The hash passed to `new`.
    // NOTE(review): presumably the hash of `str` under `StringCache::get_hash_builder()` — confirm against callers.
    pub(crate) hash: u64,
}

impl Hash for StrHashGlobal {
    /// Feed only the stored hash value to `state`; the string is not hashed again.
    fn hash<H: Hasher>(&self, state: &mut H) {
        // `u64`'s `Hash` impl forwards to `Hasher::write_u64`.
        Hash::hash(&self.hash, state)
    }
}

impl StrHashGlobal {
    pub(crate) fn new(s: SmartString<LazyCompact>, hash: u64) -> Self {
        Self { str: s, hash }
    }
}

impl PartialEq for StrHashGlobal {
    /// Two values are equal when both their stored hashes and their strings match.
    fn eq(&self, other: &Self) -> bool {
        // Cheap rejection first: different stored hashes can never be equal.
        if self.hash != other.hash {
            return false;
        }
        // Equal hashes are not enough on their own: entries can collide in the
        // hash table (e.g. slot = hash % n_slots), so the strings are compared too.
        self.str == other.str
    }
}

// Lets a `StrHashGlobal` be viewed as its string contents.
//
// NOTE(review): `Hash for StrHashGlobal` writes the stored `u64`, which is not
// what `str`'s `Hash` impl writes, so a plain `HashMap::get(&str)` would look in
// the wrong bucket. Presumably this impl is only used by lookups that supply the
// hash explicitly — verify against callers.
impl Borrow<str> for StrHashGlobal {
    /// Return the string part of this value as a `&str`.
    fn borrow(&self) -> &str {
        let text: &str = &self.str;
        text
    }
}