use std::hash::{BuildHasher as _, Hash};
use std::ops::Index;

use hashbrown::hash_table::{Entry, HashTable};
use hashbrown::DefaultHashBuilder as RandomState;

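/// An interned token: a `u32` index into the token list of the [`Interner`]
/// that produced it. Comparing two `Token`s therefore costs a single integer
/// comparison, regardless of how large the underlying token data is.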
#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
#[repr(transparent)]
pub struct Token(pub u32);

impl From<u32> for Token {
    fn from(token: u32) -> Self {
        Token(token)
    }
}

impl From<Token> for u32 {
    fn from(token: Token) -> Self {
        token.0
    }
}

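/// A source that can be split into a sequence of tokens, for example a text
/// file split into lines.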
pub trait TokenSource {
    type Token: Hash + Eq;
    type Tokenizer: Iterator<Item = Self::Token>;
    /// Returns an iterator over the tokens of this source.
    fn tokenize(&self) -> Self::Tokenizer;
    /// Returns an estimate of how many tokens [`tokenize`](Self::tokenize)
    /// will yield, used to preallocate capacity.
    fn estimate_tokens(&self) -> u32;
}

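/// Both sides of a diff input, tokenized and interned into one shared
/// [`Interner`]. The diff algorithm can then work on the cheap [`Token`]
/// ids in `before` and `after` instead of the underlying token data.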
#[derive(Default)]
pub struct InternedInput<T> {
    pub before: Vec<Token>,
    pub after: Vec<Token>,
    pub interner: Interner<T>,
}

impl<T> InternedInput<T> {
    /// Clears both token lists and the interner.
    pub fn clear(&mut self) {
        self.before.clear();
        self.after.clear();
        self.interner.clear();
    }
}

impl<T: Eq + Hash> InternedInput<T> {
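    /// Tokenizes and interns `before` and `after`, preallocating capacity
    /// from the sources' token estimates.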
    pub fn new<I: TokenSource<Token = T>>(before: I, after: I) -> Self {
        let token_estimate_before = before.estimate_tokens() as usize;
        let token_estimate_after = after.estimate_tokens() as usize;
        let mut res = Self {
            before: Vec::with_capacity(token_estimate_before),
            after: Vec::with_capacity(token_estimate_after),
            interner: Interner::new(token_estimate_before + token_estimate_after),
        };
        res.update_before(before.tokenize());
        res.update_after(after.tokenize());
        res
    }

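    /// Reserves capacity for the token estimates of `before` and `after`,
    /// see [`reserve`](Self::reserve).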
    pub fn reserve_for_token_source<S: TokenSource<Token = T> + ?Sized>(
        &mut self,
        before: &S,
        after: &S,
    ) {
        self.reserve(before.estimate_tokens(), after.estimate_tokens())
    }

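    /// Reserves capacity for at least `capacity_before` additional tokens in
    /// `before`, `capacity_after` additional tokens in `after`, and their sum
    /// in the interner.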
    pub fn reserve(&mut self, capacity_before: u32, capacity_after: u32) {
        self.before.reserve(capacity_before as usize);
        self.after.reserve(capacity_after as usize);
        self.interner
            .reserve(capacity_before as usize + capacity_after as usize);
    }

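    /// Replaces `self.before` with the interned tokens yielded by `input`.
    ///
    /// Note that this never removes tokens from the interner; when called
    /// repeatedly on a long-lived input, consider reclaiming memory with
    /// [`Interner::clear`] or [`Interner::erase_tokens_after`].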
    pub fn update_before(&mut self, input: impl Iterator<Item = T>) {
        self.before.clear();
        self.before
            .extend(input.map(|token| self.interner.intern(token)));
    }

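    /// Replaces `self.after` with the interned tokens yielded by `input`.
    ///
    /// Like [`update_before`](Self::update_before), this never removes
    /// tokens from the interner.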
    pub fn update_after(&mut self, input: impl Iterator<Item = T>) {
        self.after.clear();
        self.after
            .extend(input.map(|token| self.interner.intern(token)));
    }
}

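/// Maps tokens to unique `u32` ids ([`Token`]s) and back.
///
/// Interning equal tokens twice yields the same [`Token`], so interned tokens
/// can be compared with a single integer comparison. The original data is
/// recovered by indexing the interner with the token.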
#[derive(Default)]
pub struct Interner<T> {
    tokens: Vec<T>,
    table: HashTable<Token>,
    hasher: RandomState,
}

impl<T> Interner<T> {
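    /// Creates an interner with capacity for the combined token estimates of
    /// `before` and `after`.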
    pub fn new_for_token_source<S: TokenSource<Token = T>>(before: &S, after: &S) -> Self {
        Self::new(before.estimate_tokens() as usize + after.estimate_tokens() as usize)
    }

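    /// Creates an interner with capacity for `capacity` tokens.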
    pub fn new(capacity: usize) -> Interner<T> {
        Interner {
            tokens: Vec::with_capacity(capacity),
            table: HashTable::with_capacity(capacity),
            hasher: RandomState::default(),
        }
    }

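    /// Removes all interned tokens, keeping the allocated capacity.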
    pub fn clear(&mut self) {
        self.table.clear();
        self.tokens.clear();
    }

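    /// Returns the number of distinct tokens currently interned.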
    pub fn num_tokens(&self) -> u32 {
        self.tokens.len() as u32
    }
}

impl<T: Hash + Eq> Interner<T> {
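    /// Reserves capacity for the combined token estimates of `before` and
    /// `after`, see [`reserve`](Self::reserve).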
    pub fn reserve_for_token_source<S: TokenSource<Token = T>>(&mut self, before: &S, after: &S) {
        self.reserve(before.estimate_tokens() as usize + after.estimate_tokens() as usize)
    }

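    /// Reserves capacity for at least `capacity` additional tokens.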
    pub fn reserve(&mut self, capacity: usize) {
        // Growing the table may move entries, so existing tokens must be
        // rehashed from the data they point at.
        self.table.reserve(capacity, |&token| {
            self.hasher.hash_one(&self.tokens[token.0 as usize])
        });
        self.tokens.reserve(capacity);
    }

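    /// Interns `token`: returns the existing [`Token`] if an equal token was
    /// interned before, otherwise assigns the next free `u32` id.
    ///
    /// A short illustration (marked `ignore` since the crate path of this
    /// module is not fixed here):
    ///
    /// ```ignore
    /// let mut interner = Interner::new(4);
    /// let a = interner.intern("foo");
    /// assert_eq!(interner.intern("foo"), a); // equal data, same token
    /// assert_eq!(interner[a], "foo");        // data recovered by indexing
    /// ```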
    pub fn intern(&mut self, token: T) -> Token {
        let hash = self.hasher.hash_one(&token);
        match self.table.entry(
            hash,
            // A candidate entry matches if it points at equal token data.
            |&it| self.tokens[it.0 as usize] == token,
            // Rehash existing entries if the table has to grow.
            |&token| self.hasher.hash_one(&self.tokens[token.0 as usize]),
        ) {
            Entry::Occupied(entry) => *entry.get(),
            Entry::Vacant(entry) => {
                let interned = Token(self.tokens.len() as u32);
                entry.insert(interned);
                self.tokens.push(token);
                interned
            }
        }
    }

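    /// Erases `first_erased_token` and every token interned after it,
    /// shrinking the interner back to `first_erased_token.0` entries.
    /// Tokens handed out before that point remain valid; later ones must
    /// not be used again afterwards.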
    pub fn erase_tokens_after(&mut self, first_erased_token: Token) {
        assert!(first_erased_token.0 <= self.tokens.len() as u32);
        let retained = first_erased_token.0 as usize;
        let erased = self.tokens.len() - retained;
        if retained <= erased {
            // At least half the entries are erased: rebuilding the table
            // from the retained tokens is cheaper than removing them one
            // by one.
            self.table.clear();
            for (i, token) in self.tokens[0..retained].iter().enumerate() {
                let hash = self.hasher.hash_one(token);
                self.table.insert_unique(hash, Token(i as u32), |&token| {
                    self.hasher.hash_one(&self.tokens[token.0 as usize])
                });
            }
        } else {
            // Only a minority is erased: remove those entries individually.
            for (i, token) in self.tokens[retained..].iter().enumerate() {
                let hash = self.hasher.hash_one(token);
                match self
                    .table
                    .find_entry(hash, |token| token.0 == (retained + i) as u32)
                {
                    Ok(occupied) => drop(occupied.remove()),
                    // Every erased token was interned, so it must be present.
                    Err(_absent) => unreachable!(),
                }
            }
        }
        self.tokens.truncate(first_erased_token.0 as usize);
    }
}

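/// Looks up the token data belonging to an interned [`Token`].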
impl<T> Index<Token> for Interner<T> {
    type Output = T;
    fn index(&self, index: Token) -> &Self::Output {
        &self.tokens[index.0 as usize]
    }
}
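
#[cfg(test)]
mod tests {
    use super::*;

    /// A minimal sketch of a [`TokenSource`] for the tests below: splits a
    /// string into lines. It exists only to keep these tests self-contained;
    /// the crate may ship its own, more careful implementations.
    struct Lines<'a>(&'a str);

    impl<'a> TokenSource for Lines<'a> {
        type Token = &'a str;
        type Tokenizer = std::str::Lines<'a>;

        fn tokenize(&self) -> Self::Tokenizer {
            self.0.lines()
        }

        fn estimate_tokens(&self) -> u32 {
            // Crude assumption for the sketch: lines average 20 bytes.
            (self.0.len() / 20) as u32 + 1
        }
    }

    #[test]
    fn intern_deduplicates_and_erases() {
        let mut interner = Interner::new(8);
        let foo = interner.intern("foo");
        let bar = interner.intern("bar");
        // Equal data interns to the same token, distinct data does not.
        assert_eq!(interner.intern("foo"), foo);
        assert_ne!(foo, bar);
        assert_eq!(interner.num_tokens(), 2);

        // Erasing from `bar` onwards removes it from the table and the
        // token list, so re-interning "bar" hands out the same index again.
        interner.erase_tokens_after(bar);
        assert_eq!(interner.num_tokens(), 1);
        assert_eq!(interner.intern("bar"), bar);
    }

    #[test]
    fn interned_input_shares_one_interner() {
        let input = InternedInput::new(Lines("a\nb\nc"), Lines("a\nc\nd"));
        assert_eq!(input.before.len(), 3);
        assert_eq!(input.after.len(), 3);
        // "a" occurs on both sides and maps to the same token.
        assert_eq!(input.before[0], input.after[0]);
        // Four distinct lines in total: a, b, c, d.
        assert_eq!(input.interner.num_tokens(), 4);
    }
}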