gix_imara_diff/intern.rs
1// Modified for gitoxide from the upstream imara-diff crate.
2// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/intern.rs
3
4use std::hash::{BuildHasher as _, Hash};
5use std::ops::Index;
6
7use hashbrown::hash_table::{Entry, HashTable};
8use hashbrown::DefaultHashBuilder as RandomState;
9
/// A token represented as an interned integer.
///
/// A token represents the smallest possible unit of change during a diff.
/// For text this is usually a line, a word or a single character.
/// All [algorithms](crate::Algorithm) operate on interned tokens instead
/// of using the token data directly.
/// This allows for much better performance by amortizing the cost of hashing/equality.
///
/// While you can intern tokens yourself it is strongly recommended to use the [`InternedInput`] type.
#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
#[repr(transparent)]
pub struct Token(pub u32);
22
23impl From<u32> for Token {
24 fn from(token: u32) -> Self {
25 Token(token)
26 }
27}
28
29impl From<Token> for u32 {
30 fn from(token: Token) -> Self {
31 token.0
32 }
33}
34
/// A trait for types that can be split into tokens for diffing.
///
/// Implementing this trait allows a type to be used with [`InternedInput`] to create
/// interned token sequences for computing diffs. For example, `&str` implements this trait
/// by default to split text into lines.
pub trait TokenSource {
    /// The type of token this source produces. Must be hashable and comparable for equality
    /// so that it can be interned.
    type Token: Hash + Eq;
    /// An iterator that yields tokens from this source.
    type Tokenizer: Iterator<Item = Self::Token>;
    /// Creates an iterator that yields all tokens from this source.
    fn tokenize(&self) -> Self::Tokenizer;
    /// Provides an estimate of the number of tokens this source will produce.
    ///
    /// This is used to pre-allocate memory for better performance. The estimate
    /// does not need to be exact, but allocation behaves best when it is close.
    fn estimate_tokens(&self) -> u32;
}
53
/// Two lists of interned [tokens](Token) that a [`Diff`](crate::Diff) can be computed from.
///
/// A token represents the smallest possible unit of change during a diff.
/// For text this is usually a line, a word or a single character.
/// All [algorithms](crate::Algorithm) operate on interned tokens instead
/// of using the token data directly.
/// This allows for much better performance by amortizing the cost of hashing/equality.
///
/// While you can intern tokens yourself, it is strongly recommended to use this type to do so.
#[derive(Default)]
pub struct InternedInput<T> {
    /// The list of interned tokens from the first sequence (before changes).
    pub before: Vec<Token>,
    /// The list of interned tokens from the second sequence (after changes).
    pub after: Vec<Token>,
    /// The interner that stores the actual token data and maps tokens to their interned IDs.
    /// Shared by `before` and `after`, so equal tokens on either side get the same id.
    pub interner: Interner<T>,
}
72
73impl<T> InternedInput<T> {
74 /// Clears all token sequences and the interner.
75 ///
76 /// This removes all tokens from both the before and after sequences, as well as
77 /// clearing the interner's storage.
78 ///
79 /// Note that this will not free the allocated memory.
80 pub fn clear(&mut self) {
81 self.before.clear();
82 self.after.clear();
83 self.interner.clear();
84 }
85}
86
impl<T: Eq + Hash> InternedInput<T> {
    /// Creates a new `InternedInput` by tokenizing and interning two token sources.
    ///
    /// # Parameters
    ///
    /// * `before` - The token source for the first sequence
    /// * `after` - The token source for the second sequence
    ///
    /// # Returns
    ///
    /// An `InternedInput` containing interned token sequences ready for diffing
    pub fn new<I: TokenSource<Token = T>>(before: I, after: I) -> Self {
        // Pre-size all buffers from the sources' estimates so the `extend`s
        // inside `update_before`/`update_after` ideally never reallocate.
        let token_estimate_before = before.estimate_tokens() as usize;
        let token_estimate_after = after.estimate_tokens() as usize;
        let mut res = Self {
            before: Vec::with_capacity(token_estimate_before),
            after: Vec::with_capacity(token_estimate_after),
            // The interner may hold tokens from both sides, so its capacity is the sum.
            interner: Interner::new(token_estimate_before + token_estimate_after),
        };
        res.update_before(before.tokenize());
        res.update_after(after.tokenize());
        res
    }

    /// Reserve capacity so that `before` and `after` would not need
    /// to allocate if their [`estimate_tokens`](TokenSource::estimate_tokens)
    /// would represent an exact match of their actual tokens.
    ///
    /// Useful for minimization of allocation before calls to
    /// [`update_before`](InternedInput::update_before) and
    /// [`update_after`](InternedInput::update_after).
    pub fn reserve_for_token_source<S: TokenSource<Token = T> + ?Sized>(&mut self, before: &S, after: &S) {
        self.reserve(before.estimate_tokens(), after.estimate_tokens())
    }

    /// Reserves capacity for the specified number of tokens in each sequence.
    ///
    /// # Parameters
    ///
    /// * `capacity_before` - The number of tokens to reserve for the "before" sequence
    /// * `capacity_after` - The number of tokens to reserve for the "after" sequence
    pub fn reserve(&mut self, capacity_before: u32, capacity_after: u32) {
        self.before.reserve(capacity_before as usize);
        self.after.reserve(capacity_after as usize);
        // The interner stores tokens from both sides in one table.
        self.interner
            .reserve(capacity_before as usize + capacity_after as usize);
    }

    /// Replaces `self.before` with the interned Tokens yielded by `input`.
    /// Note that this does not erase any tokens from the interner and might therefore be considered
    /// a memory leak. If this function is called often over a long-running process
    /// consider clearing the interner with [`clear`](Interner::clear).
    pub fn update_before(&mut self, input: impl Iterator<Item = T>) {
        self.before.clear();
        self.before.extend(input.map(|token| self.interner.intern(token)));
    }

    /// Replaces `self.after` with the interned Tokens yielded by `input`.
    /// Note that this does not erase any tokens from the interner and might therefore be considered
    /// a memory leak. If this function is called often over a long-running process
    /// consider clearing the interner with [`clear`](Interner::clear) or
    /// [`erase_tokens_after`](Interner::erase_tokens_after).
    pub fn update_after(&mut self, input: impl Iterator<Item = T>) {
        self.after.clear();
        self.after.extend(input.map(|token| self.interner.intern(token)));
    }
}
154
/// An interner that allows for fast access of tokens produced by a [`TokenSource`].
#[derive(Default)]
pub struct Interner<T> {
    // Token data indexed by the interned id: `tokens[token.0]` is the data for `token`.
    tokens: Vec<T>,
    // Hash table mapping token data (hashed) to its interned id; the data itself
    // lives only in `tokens`, the table stores just the `Token` indices.
    table: HashTable<Token>,
    // Hash state shared by all lookups and insertions into `table`.
    hasher: RandomState,
}
162
impl<T> Interner<T> {
    /// Create an Interner with an initial capacity calculated by summing the results of calling
    /// [`estimate_tokens`](TokenSource::estimate_tokens) methods of `before` and `after`.
    pub fn new_for_token_source<S: TokenSource<Token = T>>(before: &S, after: &S) -> Self {
        Self::new(before.estimate_tokens() as usize + after.estimate_tokens() as usize)
    }

    /// Create an Interner with initial capacity `capacity`.
    pub fn new(capacity: usize) -> Interner<T> {
        Interner {
            tokens: Vec::with_capacity(capacity),
            table: HashTable::with_capacity(capacity),
            hasher: RandomState::default(),
        }
    }

    /// Remove all interned tokens.
    ///
    /// The backing allocations are kept so they can be reused for later interning.
    pub fn clear(&mut self) {
        self.table.clear();
        self.tokens.clear();
    }

    /// Returns the total number of **distinct** tokens currently interned.
    pub fn num_tokens(&self) -> u32 {
        self.tokens.len() as u32
    }
}
190
impl<T: Hash + Eq> Interner<T> {
    /// Reserves additional capacity calculated by summing the results of calling the
    /// [`estimate_tokens`](TokenSource::estimate_tokens) methods of `before` and `after`.
    pub fn reserve_for_token_source<S: TokenSource<Token = T>>(&mut self, before: &S, after: &S) {
        self.reserve(before.estimate_tokens() as usize + after.estimate_tokens() as usize)
    }

    /// Reserves capacity for at least the specified number of additional tokens.
    ///
    /// # Parameters
    ///
    /// * `capacity` - The number of additional tokens to reserve space for
    pub fn reserve(&mut self, capacity: usize) {
        // Growing the table may relocate entries; the closure recomputes an
        // entry's hash from the token data it refers to.
        self.table
            .reserve(capacity, |&token| self.hasher.hash_one(&self.tokens[token.0 as usize]));
        self.tokens.reserve(capacity);
    }

    /// Intern `token` and return the interned integer.
    pub fn intern(&mut self, token: T) -> Token {
        let hash = self.hasher.hash_one(&token);
        match self.table.entry(
            hash,
            // Equality check: compare the candidate entry's stored data with `token`.
            |&it| self.tokens[it.0 as usize] == token,
            // Re-hash callback used if the table needs to grow during insertion.
            |&token| self.hasher.hash_one(&self.tokens[token.0 as usize]),
        ) {
            Entry::Occupied(entry) => *entry.get(),
            Entry::Vacant(entry) => {
                // New token: its id is the next free index in `tokens`.
                let interned = Token(self.tokens.len() as u32);
                entry.insert(interned);
                self.tokens.push(token);
                interned
            }
        }
    }

    /// Erases `first_erased_token` and any tokens interned afterward from the interner.
    pub fn erase_tokens_after(&mut self, first_erased_token: Token) {
        assert!(first_erased_token.0 <= self.tokens.len() as u32);
        let retained = first_erased_token.0 as usize;
        let erased = self.tokens.len() - retained;
        // Choose whichever strategy touches fewer table entries.
        if retained <= erased {
            // Few retained tokens: rebuilding the table from scratch is cheaper
            // than removing each erased entry individually.
            self.table.clear();
            for (i, token) in self.tokens[0..retained].iter().enumerate() {
                let hash = self.hasher.hash_one(token);
                self.table.insert_unique(hash, Token(i as u32), |&token| {
                    self.hasher.hash_one(&self.tokens[token.0 as usize])
                });
            }
        } else {
            // Few erased tokens: remove just their entries, keyed by their id.
            for (i, token) in self.tokens[retained..].iter().enumerate() {
                let hash = self.hasher.hash_one(token);
                match self.table.find_entry(hash, |token| token.0 == (retained + i) as u32) {
                    Ok(occupied) => drop(occupied.remove()),
                    // Every interned token has exactly one table entry, so the
                    // lookup cannot fail.
                    Err(_absent) => unreachable!(),
                }
            }
        }
        self.tokens.truncate(first_erased_token.0 as usize);
    }
}
252
253impl<T> Index<Token> for Interner<T> {
254 type Output = T;
255 fn index(&self, index: Token) -> &Self::Output {
256 &self.tokens[index.0 as usize]
257 }
258}