Skip to main content

lean_ctx/core/
rabin_karp.rs

1const PRIME: u64 = 1_000_000_007;
2const BASE: u64 = 256;
3const MIN_CHUNK: usize = 64;
4const MAX_CHUNK: usize = 2048;
5const TARGET_CHUNK: usize = 512;
6const MASK: u64 = TARGET_CHUNK as u64 - 1;
7
8#[derive(Debug, Clone, PartialEq, Eq)]
9pub struct Chunk {
10    pub offset: usize,
11    pub length: usize,
12    pub hash: u64,
13}
14
15pub fn chunk(content: &str) -> Vec<Chunk> {
16    let bytes = content.as_bytes();
17    if bytes.is_empty() {
18        return vec![];
19    }
20
21    if bytes.len() <= MIN_CHUNK {
22        return vec![Chunk {
23            offset: 0,
24            length: bytes.len(),
25            hash: full_hash(bytes),
26        }];
27    }
28
29    let window = 48.min(bytes.len());
30    let mut chunks = Vec::new();
31    let mut chunk_start = 0;
32    let mut rolling = 0u64;
33    let mut pow = 1u64;
34
35    for i in 0..window.saturating_sub(1) {
36        let _ = i;
37        pow = pow.wrapping_mul(BASE) % PRIME;
38    }
39
40    for i in 0..bytes.len() {
41        rolling = rolling.wrapping_mul(BASE).wrapping_add(bytes[i] as u64) % PRIME;
42
43        if i >= window {
44            let old = bytes[i - window] as u64;
45            rolling = (rolling + PRIME - old.wrapping_mul(pow) % PRIME) % PRIME;
46        }
47
48        let chunk_len = i + 1 - chunk_start;
49
50        let is_boundary = chunk_len >= MIN_CHUNK && (rolling & MASK == 0);
51        let is_max = chunk_len >= MAX_CHUNK;
52
53        if is_boundary || is_max || i == bytes.len() - 1 {
54            let slice = &bytes[chunk_start..=i];
55            chunks.push(Chunk {
56                offset: chunk_start,
57                length: slice.len(),
58                hash: full_hash(slice),
59            });
60            chunk_start = i + 1;
61        }
62    }
63
64    chunks
65}
66
67pub fn stable_order(old_chunks: &[Chunk], new_chunks: &[Chunk]) -> Vec<usize> {
68    let old_hashes: std::collections::HashSet<u64> = old_chunks.iter().map(|c| c.hash).collect();
69
70    let mut unchanged: Vec<usize> = Vec::new();
71    let mut changed: Vec<usize> = Vec::new();
72
73    for (i, c) in new_chunks.iter().enumerate() {
74        if old_hashes.contains(&c.hash) {
75            unchanged.push(i);
76        } else {
77            changed.push(i);
78        }
79    }
80
81    unchanged.extend(changed);
82    unchanged
83}
84
85pub fn reorder_content(content: &str, old_content: &str) -> String {
86    let old_chunks = chunk(old_content);
87    let new_chunks = chunk(content);
88    let order = stable_order(&old_chunks, &new_chunks);
89
90    let mut result = String::with_capacity(content.len());
91    for &idx in &order {
92        if let Some(c) = new_chunks.get(idx) {
93            let end = (c.offset + c.length).min(content.len());
94            result.push_str(&content[c.offset..end]);
95        }
96    }
97    result
98}
99
100fn full_hash(data: &[u8]) -> u64 {
101    let mut h = 0u64;
102    for &b in data {
103        h = h.wrapping_mul(BASE).wrapping_add(b as u64) % PRIME;
104    }
105    h
106}
107
108#[cfg(test)]
109mod tests {
110    use super::*;
111
112    #[test]
113    fn deterministic_chunking() {
114        let content = "a".repeat(1000);
115        let c1 = chunk(&content);
116        let c2 = chunk(&content);
117        assert_eq!(c1, c2);
118    }
119
120    #[test]
121    fn empty_content() {
122        let c = chunk("");
123        assert!(c.is_empty());
124    }
125
126    #[test]
127    fn small_content_single_chunk() {
128        let c = chunk("hello world");
129        assert_eq!(c.len(), 1);
130        assert_eq!(c[0].offset, 0);
131    }
132
133    #[test]
134    fn respects_min_chunk_size() {
135        let content = "x".repeat(500);
136        let chunks = chunk(&content);
137        for c in &chunks[..chunks.len().saturating_sub(1)] {
138            assert!(
139                c.length >= MIN_CHUNK,
140                "Chunk length {} < MIN_CHUNK {}",
141                c.length,
142                MIN_CHUNK
143            );
144        }
145    }
146
147    #[test]
148    fn respects_max_chunk_size() {
149        let content = "x".repeat(10000);
150        let chunks = chunk(&content);
151        for c in &chunks {
152            assert!(
153                c.length <= MAX_CHUNK + 1,
154                "Chunk length {} > MAX_CHUNK {}",
155                c.length,
156                MAX_CHUNK
157            );
158        }
159    }
160
161    #[test]
162    fn local_change_affects_few_chunks() {
163        let original = "a".repeat(2000);
164        let mut modified = original.clone();
165        unsafe {
166            let bytes = modified.as_bytes_mut();
167            bytes[1000] = b'Z';
168        }
169
170        let c1 = chunk(&original);
171        let c2 = chunk(&modified);
172
173        let unchanged = c1
174            .iter()
175            .filter(|a| c2.iter().any(|b| b.hash == a.hash))
176            .count();
177
178        let stability = unchanged as f64 / c1.len().max(1) as f64;
179        assert!(
180            stability > 0.5,
181            "Expected > 50% chunks stable, got {:.0}% ({unchanged}/{})",
182            stability * 100.0,
183            c1.len()
184        );
185    }
186
187    #[test]
188    fn stable_order_unchanged_first() {
189        let content_v1 = "fn foo() { 1 }\nfn bar() { 2 }\n".repeat(50);
190        let content_v2 = "fn foo() { 1 }\nfn bar() { 3 }\n".repeat(50);
191
192        let old = chunk(&content_v1);
193        let new = chunk(&content_v2);
194        let order = stable_order(&old, &new);
195
196        if order.len() >= 2 {
197            let first_is_unchanged = old
198                .iter()
199                .any(|o| new.get(order[0]).is_some_and(|n| n.hash == o.hash));
200            assert!(
201                first_is_unchanged || order[0] == 0,
202                "First element should be unchanged"
203            );
204        }
205    }
206
207    #[test]
208    fn covers_all_bytes() {
209        let content = "hello world, this is a longer test string for chunking purposes!".repeat(20);
210        let chunks = chunk(&content);
211        let total: usize = chunks.iter().map(|c| c.length).sum();
212        assert_eq!(total, content.len());
213    }
214
215    #[test]
216    fn unicode_content() {
217        let content = "日本語テスト。これはUnicodeのテストです。".repeat(30);
218        let chunks = chunk(&content);
219        assert!(!chunks.is_empty());
220        let total: usize = chunks.iter().map(|c| c.length).sum();
221        assert_eq!(total, content.len());
222    }
223}