
gix_imara_diff/intern.rs

// Modified for gitoxide from the upstream imara-diff crate.
// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/intern.rs

use std::hash::{BuildHasher as _, Hash};
use std::ops::Index;

use hashbrown::hash_table::{Entry, HashTable};
use hashbrown::DefaultHashBuilder as RandomState;

/// A token represented as an interned integer.
///
/// A token represents the smallest possible unit of change during a diff.
/// For text this is usually a line, a word or a single character.
/// All [algorithms](crate::Algorithm) operate on interned tokens instead
/// of using the token data directly.
/// This allows for much better performance by amortizing the cost of hashing/equality.
///
/// While you can intern tokens yourself, it is strongly recommended to use [`InternedInput`].
#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
#[repr(transparent)]
pub struct Token(pub u32);

impl From<u32> for Token {
    fn from(token: u32) -> Self {
        Token(token)
    }
}

impl From<Token> for u32 {
    fn from(token: Token) -> Self {
        token.0
    }
}

/// A trait for types that can be split into tokens for diffing.
///
/// Implementing this trait allows a type to be used with [`InternedInput`] to create
/// interned token sequences for computing diffs. For example, `&str` implements this trait
/// by default to split text into lines.
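///
/// # Example
///
/// A minimal sketch of a custom implementation that tokenizes whitespace-separated
/// words instead of lines. The `Words` type is purely illustrative, and the module
/// path in the import is assumed from this file's location:
///
/// ```
/// use gix_imara_diff::intern::TokenSource;
///
/// struct Words<'a>(&'a str);
///
/// impl<'a> TokenSource for Words<'a> {
///     type Token = &'a str;
///     type Tokenizer = std::str::SplitWhitespace<'a>;
///
///     fn tokenize(&self) -> Self::Tokenizer {
///         self.0.split_whitespace()
///     }
///
///     fn estimate_tokens(&self) -> u32 {
///         // A rough guess is fine; this only sizes initial allocations.
///         (self.0.len() / 5) as u32
///     }
/// }
/// ```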
pub trait TokenSource {
    /// The type of token this source produces. Must be hashable and comparable for equality.
    type Token: Hash + Eq;
    /// An iterator that yields tokens from this source.
    type Tokenizer: Iterator<Item = Self::Token>;
    /// Creates an iterator that yields all tokens from this source.
    fn tokenize(&self) -> Self::Tokenizer;
    /// Provides an estimate of the number of tokens this source will produce.
    ///
    /// This is used to pre-allocate memory for better performance. The estimate
    /// does not need to be exact.
    fn estimate_tokens(&self) -> u32;
}

/// Two lists of interned [tokens](Token) that a [`Diff`](crate::Diff) can be computed from.
///
/// A token represents the smallest possible unit of change during a diff.
/// For text this is usually a line, a word or a single character.
/// All [algorithms](crate::Algorithm) operate on interned tokens instead
/// of using the token data directly.
/// This allows for much better performance by amortizing the cost of hashing/equality.
///
/// While you can intern tokens yourself, this type is the strongly recommended way to do it.
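///
/// # Example
///
/// A minimal sketch, assuming `&str` tokenizes into lines as described above
/// and that the module path in the import matches this file's location:
///
/// ```
/// use gix_imara_diff::intern::InternedInput;
///
/// let input = InternedInput::new("a\nb\nc\n", "a\nx\nc\n");
/// // Unchanged lines intern to the same token on both sides...
/// assert_eq!(input.before[0], input.after[0]);
/// assert_eq!(input.before[2], input.after[2]);
/// // ...while changed lines receive distinct tokens.
/// assert_ne!(input.before[1], input.after[1]);
/// ```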
#[derive(Default)]
pub struct InternedInput<T> {
    /// The list of interned tokens from the first sequence (before changes).
    pub before: Vec<Token>,
    /// The list of interned tokens from the second sequence (after changes).
    pub after: Vec<Token>,
    /// The interner that stores the actual token data and maps tokens to their interned IDs.
    pub interner: Interner<T>,
}

impl<T> InternedInput<T> {
    /// Clears all token sequences and the interner.
    ///
    /// This removes all tokens from both the before and after sequences, as well as
    /// clearing the interner's storage.
    ///
    /// Note that this will not free the allocated memory.
    pub fn clear(&mut self) {
        self.before.clear();
        self.after.clear();
        self.interner.clear();
    }
}

impl<T: Eq + Hash> InternedInput<T> {
    /// Creates a new `InternedInput` by tokenizing and interning two token sources.
    ///
    /// # Parameters
    ///
    /// * `before` - The token source for the first sequence
    /// * `after` - The token source for the second sequence
    ///
    /// # Returns
    ///
    /// An `InternedInput` containing interned token sequences ready for diffing
    pub fn new<I: TokenSource<Token = T>>(before: I, after: I) -> Self {
        let token_estimate_before = before.estimate_tokens() as usize;
        let token_estimate_after = after.estimate_tokens() as usize;
        let mut res = Self {
            before: Vec::with_capacity(token_estimate_before),
            after: Vec::with_capacity(token_estimate_after),
            interner: Interner::new(token_estimate_before + token_estimate_after),
        };
        res.update_before(before.tokenize());
        res.update_after(after.tokenize());
        res
    }

    /// Reserves capacity so that `before` and `after` will not need to allocate,
    /// assuming [`estimate_tokens`](TokenSource::estimate_tokens) exactly matches
    /// the actual number of tokens each source produces.
    ///
    /// Useful to minimize allocations before calls to
    /// [`update_before`](InternedInput::update_before) and
    /// [`update_after`](InternedInput::update_after).
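    ///
    /// A sketch of the intended pattern (import path assumed; `&str` is assumed
    /// to tokenize into lines):
    ///
    /// ```
    /// use gix_imara_diff::intern::{InternedInput, TokenSource};
    ///
    /// let mut input: InternedInput<&str> = InternedInput::default();
    /// let (before, after) = ("a\nb\n", "a\nc\n");
    /// input.reserve_for_token_source(&before, &after);
    /// input.update_before(before.tokenize());
    /// input.update_after(after.tokenize());
    /// assert_eq!(input.before.len(), 2);
    /// ```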
    pub fn reserve_for_token_source<S: TokenSource<Token = T> + ?Sized>(&mut self, before: &S, after: &S) {
        self.reserve(before.estimate_tokens(), after.estimate_tokens())
    }

    /// Reserves capacity for the specified number of tokens in each sequence.
    ///
    /// # Parameters
    ///
    /// * `capacity_before` - The number of tokens to reserve for the "before" sequence
    /// * `capacity_after` - The number of tokens to reserve for the "after" sequence
    pub fn reserve(&mut self, capacity_before: u32, capacity_after: u32) {
        self.before.reserve(capacity_before as usize);
        self.after.reserve(capacity_after as usize);
        self.interner
            .reserve(capacity_before as usize + capacity_after as usize);
    }

    /// Replaces `self.before` with the interned tokens yielded by `input`.
    ///
    /// Note that this does not erase any tokens from the interner and might therefore be considered
    /// a memory leak. If this function is called often over a long-running process,
    /// consider clearing the interner with [`clear`](Interner::clear).
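    ///
    /// A sketch of reusing one `InternedInput` across runs (import path assumed;
    /// `&str` is assumed to tokenize into lines):
    ///
    /// ```
    /// use gix_imara_diff::intern::InternedInput;
    ///
    /// let mut input = InternedInput::new("a\n", "b\n");
    /// // Feed the next "before" sequence directly from an iterator of tokens.
    /// input.update_before(["c", "d"].into_iter());
    /// assert_eq!(input.before.len(), 2);
    /// ```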
    pub fn update_before(&mut self, input: impl Iterator<Item = T>) {
        self.before.clear();
        self.before.extend(input.map(|token| self.interner.intern(token)));
    }

    /// Replaces `self.after` with the interned tokens yielded by `input`.
    ///
    /// Note that this does not erase any tokens from the interner and might therefore be considered
    /// a memory leak. If this function is called often over a long-running process,
    /// consider clearing the interner with [`clear`](Interner::clear) or
    /// [`erase_tokens_after`](Interner::erase_tokens_after).
    pub fn update_after(&mut self, input: impl Iterator<Item = T>) {
        self.after.clear();
        self.after.extend(input.map(|token| self.interner.intern(token)));
    }
}

/// An interner that allows for fast access of tokens produced by a [`TokenSource`].
#[derive(Default)]
pub struct Interner<T> {
    tokens: Vec<T>,
    table: HashTable<Token>,
    hasher: RandomState,
}

impl<T> Interner<T> {
    /// Creates an `Interner` with an initial capacity calculated by summing the results of calling
    /// [`estimate_tokens`](TokenSource::estimate_tokens) on `before` and `after`.
    pub fn new_for_token_source<S: TokenSource<Token = T>>(before: &S, after: &S) -> Self {
        Self::new(before.estimate_tokens() as usize + after.estimate_tokens() as usize)
    }

    /// Creates an `Interner` with initial capacity `capacity`.
    pub fn new(capacity: usize) -> Interner<T> {
        Interner {
            tokens: Vec::with_capacity(capacity),
            table: HashTable::with_capacity(capacity),
            hasher: RandomState::default(),
        }
    }

    /// Removes all interned tokens.
    pub fn clear(&mut self) {
        self.table.clear();
        self.tokens.clear();
    }

    /// Returns the total number of **distinct** tokens currently interned.
    pub fn num_tokens(&self) -> u32 {
        self.tokens.len() as u32
    }
}

impl<T: Hash + Eq> Interner<T> {
    /// Reserves additional capacity calculated by summing the results of calling
    /// [`estimate_tokens`](TokenSource::estimate_tokens) on `before` and `after`.
    pub fn reserve_for_token_source<S: TokenSource<Token = T>>(&mut self, before: &S, after: &S) {
        self.reserve(before.estimate_tokens() as usize + after.estimate_tokens() as usize)
    }

    /// Reserves capacity for at least the specified number of additional tokens.
    ///
    /// # Parameters
    ///
    /// * `capacity` - The number of additional tokens to reserve space for
    pub fn reserve(&mut self, capacity: usize) {
        self.table
            .reserve(capacity, |&token| self.hasher.hash_one(&self.tokens[token.0 as usize]));
        self.tokens.reserve(capacity);
    }

    /// Interns `token` and returns the interned integer.
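    ///
    /// Equal tokens map to the same interned integer. A sketch, with the
    /// import path assumed from this file's location:
    ///
    /// ```
    /// use gix_imara_diff::intern::Interner;
    ///
    /// let mut interner = Interner::new(8);
    /// let a = interner.intern("a");
    /// let b = interner.intern("b");
    /// // Interning the same value again returns the same token.
    /// assert_eq!(interner.intern("a"), a);
    /// assert_ne!(a, b);
    /// assert_eq!(interner.num_tokens(), 2);
    /// // The interner resolves a token back to its data.
    /// assert_eq!(interner[a], "a");
    /// ```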
    pub fn intern(&mut self, token: T) -> Token {
        let hash = self.hasher.hash_one(&token);
        match self.table.entry(
            hash,
            |&it| self.tokens[it.0 as usize] == token,
            |&token| self.hasher.hash_one(&self.tokens[token.0 as usize]),
        ) {
            Entry::Occupied(entry) => *entry.get(),
            Entry::Vacant(entry) => {
                let interned = Token(self.tokens.len() as u32);
                entry.insert(interned);
                self.tokens.push(token);
                interned
            }
        }
    }

    /// Erases `first_erased_token` and any tokens interned afterward from the interner.
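    ///
    /// Tokens interned before `first_erased_token` keep their values. A sketch,
    /// with the import path assumed from this file's location:
    ///
    /// ```
    /// use gix_imara_diff::intern::Interner;
    ///
    /// let mut interner = Interner::new(4);
    /// let a = interner.intern("a");
    /// let b = interner.intern("b");
    /// interner.erase_tokens_after(b);
    /// assert_eq!(interner.num_tokens(), 1);
    /// // Earlier tokens survive, and erased ids are handed out again.
    /// assert_eq!(interner.intern("a"), a);
    /// assert_eq!(interner.intern("b"), b);
    /// ```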
    pub fn erase_tokens_after(&mut self, first_erased_token: Token) {
        assert!(first_erased_token.0 <= self.tokens.len() as u32);
        let retained = first_erased_token.0 as usize;
        let erased = self.tokens.len() - retained;
        if retained <= erased {
            // Most tokens are erased: rebuilding the table from the retained
            // tokens is cheaper than removing the erased entries one by one.
            self.table.clear();
            for (i, token) in self.tokens[0..retained].iter().enumerate() {
                let hash = self.hasher.hash_one(token);
                self.table.insert_unique(hash, Token(i as u32), |&token| {
                    self.hasher.hash_one(&self.tokens[token.0 as usize])
                });
            }
        } else {
            // Most tokens are retained: remove only the erased entries.
            for (i, token) in self.tokens[retained..].iter().enumerate() {
                let hash = self.hasher.hash_one(token);
                match self.table.find_entry(hash, |token| token.0 == (retained + i) as u32) {
                    Ok(occupied) => drop(occupied.remove()),
                    Err(_absent) => unreachable!(),
                }
            }
        }
        self.tokens.truncate(first_erased_token.0 as usize);
    }
}

impl<T> Index<Token> for Interner<T> {
    type Output = T;
    fn index(&self, index: Token) -> &Self::Output {
        &self.tokens[index.0 as usize]
    }
}