imara_diff/
intern.rs

1use std::hash::{BuildHasher as _, Hash};
2use std::ops::Index;
3
4use hashbrown::hash_table::{Entry, HashTable};
5use hashbrown::DefaultHashBuilder as RandomState;
6
/// A token represented as an interned integer.
///
/// A token represents the smallest possible unit of change during a diff.
/// For text this is usually a line, a word or a single character.
/// All [algorithms](crate::Algorithm) operate on interned tokens instead
/// of using the token data directly.
/// This allows for much better performance by amortizing the cost of hashing/equality.
///
/// While you can intern tokens yourself it is strongly recommended to use the
/// [`InternedInput`] type instead.
#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
#[repr(transparent)]
pub struct Token(pub u32);
19
20impl From<u32> for Token {
21    fn from(token: u32) -> Self {
22        Token(token)
23    }
24}
25
26impl From<Token> for u32 {
27    fn from(token: Token) -> Self {
28        token.0
29    }
30}
31
/// A source of tokens that can be interned for diffing.
///
/// Implementors describe how to split their underlying data into tokens and
/// how many tokens to roughly expect, which is used to preallocate buffers.
pub trait TokenSource {
    /// The type of a single token; must be hashable and comparable so it can be interned.
    type Token: Hash + Eq;
    /// Iterator over the tokens of this source.
    type Tokenizer: Iterator<Item = Self::Token>;
    /// Returns an iterator that splits this source into tokens.
    fn tokenize(&self) -> Self::Tokenizer;
    /// Returns an estimate of the number of tokens `tokenize` will yield,
    /// used as a capacity hint (does not need to be exact).
    fn estimate_tokens(&self) -> u32;
}
38
/// Two lists of interned [tokens](crate::intern::Token) that a [`Diff`](crate::Diff) can be computed from.
///
/// A token represents the smallest possible unit of change during a diff.
/// For text this is usually a line, a word or a single character.
/// All [algorithms](crate::Algorithm) operate on interned tokens instead
/// of using the token data directly.
/// This allows for much better performance by amortizing the cost of hashing/equality.
///
/// While you can intern tokens yourself it is strongly recommended to use this type
/// to manage the interning.
#[derive(Default)]
pub struct InternedInput<T> {
    /// The interned tokens of the "before" side of the diff.
    pub before: Vec<Token>,
    /// The interned tokens of the "after" side of the diff.
    pub after: Vec<Token>,
    /// Maps between interned [`Token`]s and the underlying token data `T`.
    pub interner: Interner<T>,
}
54
55impl<T> InternedInput<T> {
56    pub fn clear(&mut self) {
57        self.before.clear();
58        self.after.clear();
59        self.interner.clear();
60    }
61}
62
impl<T: Eq + Hash> InternedInput<T> {
    /// Creates a new `InternedInput` by tokenizing and interning `before` and `after`,
    /// preallocating based on their
    /// [`estimate_tokens`](crate::intern::TokenSource::estimate_tokens).
    pub fn new<I: TokenSource<Token = T>>(before: I, after: I) -> Self {
        let token_estimate_before = before.estimate_tokens() as usize;
        let token_estimate_after = after.estimate_tokens() as usize;
        let mut res = Self {
            before: Vec::with_capacity(token_estimate_before),
            after: Vec::with_capacity(token_estimate_after),
            // the interner may hold tokens from both sides, so size it for the sum
            interner: Interner::new(token_estimate_before + token_estimate_after),
        };
        res.update_before(before.tokenize());
        res.update_after(after.tokenize());
        res
    }

    /// Reserves additional capacity calculated by calling the
    /// [`estimate_tokens`](crate::intern::TokenSource::estimate_tokens) methods of `before` and `after`.
    pub fn reserve_for_token_source<S: TokenSource<Token = T> + ?Sized>(
        &mut self,
        before: &S,
        after: &S,
    ) {
        self.reserve(before.estimate_tokens(), after.estimate_tokens())
    }

    /// Reserves additional capacity for `capacity_before` tokens in `self.before`,
    /// `capacity_after` tokens in `self.after`, and their sum in the interner.
    pub fn reserve(&mut self, capacity_before: u32, capacity_after: u32) {
        self.before.reserve(capacity_before as usize);
        self.after.reserve(capacity_after as usize);
        self.interner
            .reserve(capacity_before as usize + capacity_after as usize);
    }

    /// Replaces `self.before` with the interned tokens yielded by `input`.
    ///
    /// Note that this does not erase any tokens from the interner and might therefore be considered
    /// a memory leak. If this function is called often over a long running process
    /// consider clearing the interner with [`clear`](crate::intern::Interner::clear).
    pub fn update_before(&mut self, input: impl Iterator<Item = T>) {
        self.before.clear();
        self.before
            .extend(input.map(|token| self.interner.intern(token)));
    }

    /// Replaces `self.after` with the interned tokens yielded by `input`.
    ///
    /// Note that this does not erase any tokens from the interner and might therefore be considered
    /// a memory leak. If this function is called often over a long running process
    /// consider clearing the interner with [`clear`](crate::intern::Interner::clear) or
    /// [`erase_tokens_after`](crate::intern::Interner::erase_tokens_after).
    pub fn update_after(&mut self, input: impl Iterator<Item = T>) {
        self.after.clear();
        self.after
            .extend(input.map(|token| self.interner.intern(token)));
    }
}
115
/// An interner that allows for fast access of tokens produced by a [`TokenSource`].
#[derive(Default)]
pub struct Interner<T> {
    // Token data indexed by `Token.0`; a `Token` is an index into this vector.
    tokens: Vec<T>,
    // Maps a token's hash to its interned `Token`; the token data itself lives
    // in `tokens`, so entries must be rehashed through that vector.
    table: HashTable<Token>,
    // Hasher state shared by interning and table maintenance.
    hasher: RandomState,
}
123
impl<T> Interner<T> {
    /// Create an Interner with an initial capacity calculated by summing the results of calling
    /// [`estimate_tokens`](crate::intern::TokenSource::estimate_tokens) methods of `before` and `after`.
    pub fn new_for_token_source<S: TokenSource<Token = T>>(before: &S, after: &S) -> Self {
        Self::new(before.estimate_tokens() as usize + after.estimate_tokens() as usize)
    }

    /// Create an Interner with initial capacity `capacity`.
    pub fn new(capacity: usize) -> Interner<T> {
        Interner {
            tokens: Vec::with_capacity(capacity),
            table: HashTable::with_capacity(capacity),
            hasher: RandomState::default(),
        }
    }

    /// Remove all interned tokens.
    pub fn clear(&mut self) {
        self.table.clear();
        self.tokens.clear();
    }

    /// Returns the total number of **distinct** tokens currently interned.
    pub fn num_tokens(&self) -> u32 {
        self.tokens.len() as u32
    }
}
151
impl<T: Hash + Eq> Interner<T> {
    /// Reserves additional capacity calculated by summing the results of calling the
    /// [`estimate_tokens`](crate::intern::TokenSource::estimate_tokens) methods of `before` and `after`.
    pub fn reserve_for_token_source<S: TokenSource<Token = T>>(&mut self, before: &S, after: &S) {
        self.reserve(before.estimate_tokens() as usize + after.estimate_tokens() as usize)
    }

    /// Reserves capacity for at least `capacity` additional tokens.
    pub fn reserve(&mut self, capacity: usize) {
        // Growing the table may relocate entries, so each stored `Token` must be
        // rehashed from the token data it indexes in `self.tokens`.
        self.table.reserve(capacity, |&token| {
            self.hasher.hash_one(&self.tokens[token.0 as usize])
        });
        self.tokens.reserve(capacity);
    }

    /// Intern `token` and return the interned integer.
    pub fn intern(&mut self, token: T) -> Token {
        let hash = self.hasher.hash_one(&token);
        match self.table.entry(
            hash,
            // equality: compare against the token data a stored entry indexes
            |&it| self.tokens[it.0 as usize] == token,
            // rehash function used if the table needs to grow on insertion
            |&token| self.hasher.hash_one(&self.tokens[token.0 as usize]),
        ) {
            Entry::Occupied(entry) => *entry.get(),
            Entry::Vacant(entry) => {
                // A fresh token gets the next sequential index.
                let interned = Token(self.tokens.len() as u32);
                entry.insert(interned);
                self.tokens.push(token);
                interned
            }
        }
    }

    /// Erases `first_erased_token` and any tokens interned afterward from the interner.
    pub fn erase_tokens_after(&mut self, first_erased_token: Token) {
        assert!(first_erased_token.0 <= self.tokens.len() as u32);
        let retained = first_erased_token.0 as usize;
        let erased = self.tokens.len() - retained;
        if retained <= erased {
            // Erasing at least half the tokens: cheaper to rebuild the table
            // from the retained prefix than to remove entries one by one.
            self.table.clear();
            for (i, token) in self.tokens[0..retained].iter().enumerate() {
                let hash = self.hasher.hash_one(token);
                self.table.insert_unique(hash, Token(i as u32), |&token| {
                    self.hasher.hash_one(&self.tokens[token.0 as usize])
                });
            }
        } else {
            // Erasing a minority of tokens: remove just their table entries.
            for (i, token) in self.tokens[retained..].iter().enumerate() {
                let hash = self.hasher.hash_one(token);
                match self
                    .table
                    .find_entry(hash, |token| token.0 == (retained + i) as u32)
                {
                    Ok(occupied) => drop(occupied.remove()),
                    // Every interned token has exactly one table entry, so the
                    // lookup for an in-range index cannot fail.
                    Err(_absent) => unreachable!(),
                }
            }
        }
        self.tokens.truncate(first_erased_token.0 as usize);
    }
}
212
213impl<T> Index<Token> for Interner<T> {
214    type Output = T;
215    fn index(&self, index: Token) -> &Self::Output {
216        &self.tokens[index.0 as usize]
217    }
218}