Skip to main content

hcstatic_str/
lib.rs

1//! Global, permanent, packed, hashconsed, short string storage.
2//!
3//! * supports strings up to 256 bytes
4//! * derefs to a &str, but uses only 1 word on the stack and len + 1 bytes on the heap
5//! * the actual bytes are stored packed into 1 MiB allocations to
6//!   avoid the overhead of lots of small mallocs
7//! * Copy!
8//! * hashconsed, the same &str will always produce a pointer to the same memory
9//!
10//! CAN NEVER BE DEALLOCATED
11
12use anyhow::bail;
13use fxhash::FxHashSet;
14use once_cell::sync::Lazy;
15use parking_lot::Mutex;
16use std::{borrow::Borrow, collections::HashSet, hash::Hash, ops::Deref, slice, str};
17
18const CHUNK_SIZE: usize = 1 * 1024 * 1024;
19
20struct Chunk {
21    data: Vec<u8>,
22    pos: usize,
23}
24
25impl Chunk {
26    fn new() -> &'static mut Self {
27	Box::leak(Box::new(Chunk {
28	    data: vec![0; CHUNK_SIZE],
29	    pos: 0
30	}))
31    }
32
33    fn insert(&mut self, str: &str) -> (*mut Chunk, Str) {
34        let str = str.as_bytes();
35        let mut t = self;
36        loop {
37            if CHUNK_SIZE - t.pos > str.len() {
38                t.data[t.pos] = str.len() as u8;
39                t.data[t.pos + 1..t.pos + 1 + str.len()].copy_from_slice(str);
40                let res = Str(t.data.as_ptr().wrapping_add(t.pos));
41                t.pos += 1 + str.len();
42                break (t, res);
43            } else {
44                t = Self::new();
45            }
46        }
47    }
48}
49
50struct Root {
51    all: FxHashSet<Str>,
52    root: *mut Chunk,
53}
54
55unsafe impl Send for Root {}
56unsafe impl Sync for Root {}
57
58static ROOT: Lazy<Mutex<Root>> = Lazy::new(|| {
59    Mutex::new(Root {
60        all: HashSet::default(),
61        root: Chunk::new(),
62    })
63});
64
65/// This is a pointer into static memory that holds the actual str
66/// slice. This type is 1 word on the stack, the length is stored in
67/// the heap as a byte. Deref is quite cheap, there is no locking to
68/// deref. Only try_from can be expensive since it performs the
69/// hashconsing.
70#[derive(Clone, Copy)]
71pub struct Str(*const u8);
72
73unsafe impl Send for Str {}
74unsafe impl Sync for Str {}
75
76impl Str {
77    fn get(&self) -> &'static str {
78        unsafe {
79            let len = *self.0 as usize;
80            let ptr = self.0.wrapping_add(1);
81            let slice = slice::from_raw_parts(ptr, len);
82            str::from_utf8_unchecked(slice)
83        }
84    }
85}
86
87impl Deref for Str {
88    type Target = str;
89
90    fn deref(&self) -> &'static Self::Target {
91	self.get()
92    }
93}
94
95impl Borrow<str> for Str {
96    fn borrow(&self) -> &'static str {
97	self.get()
98    }
99}
100
101impl AsRef<str> for Str {
102    fn as_ref(&self) -> &'static str {
103	self.get()
104    }
105}
106
107impl Hash for Str {
108    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
109        (&**self).hash(state)
110    }
111}
112
113impl PartialEq for Str {
114    fn eq(&self, other: &Self) -> bool {
115        &**self == &**other
116    }
117}
118
119impl Eq for Str {}
120
121impl PartialOrd for Str {
122    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
123        (&**self).partial_cmp(&**other)
124    }
125}
126
127impl Ord for Str {
128    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
129        (&**self).cmp(&**other)
130    }
131}
132
133impl TryFrom<&str> for Str {
134    type Error = anyhow::Error;
135
136    fn try_from(s: &str) -> Result<Self, Self::Error> {
137        if s.as_bytes().len() > u8::MAX as usize {
138            bail!("string is too long")
139        } else {
140            let mut root = ROOT.lock();
141	    match root.all.get(s) {
142                Some(t) => Ok(*t),
143                None => unsafe {
144		    let (r, t) = (*root.root).insert(s);
145		    root.root = r;
146		    root.all.insert(t);
147		    Ok(t)
148                }
149	    }
150        }
151    }
152}
153
154#[cfg(test)]
155mod test {
156    use super::*;
157    use rand::{thread_rng, Rng};
158
159    fn rand(size: usize) -> String {
160        let mut s = String::new();
161        for _ in 0..size {
162            s.push(thread_rng().gen())
163        }
164        s
165    }
166
167    #[test]
168    fn test_single() {
169        let s = rand(32);
170        let t0 = Str::try_from(s.as_str()).unwrap();
171        assert_eq!(&*t0, &*s);
172        let t1 = Str::try_from(s.as_str()).unwrap();
173        assert_eq!(t0.0, t1.0);
174    }
175
176    #[test]
177    fn test_lots() {
178        for _ in 0..1000000 {
179            let s = rand(32);
180            let t0 = Str::try_from(s.as_str()).unwrap();
181            assert_eq!(&*t0, &*s);
182            let t1 = Str::try_from(s.as_str()).unwrap();
183            assert_eq!(t0.0, t1.0)
184        }
185    }
186}