lean_ctx/core/
rabin_karp.rs1const PRIME: u64 = 1_000_000_007;
2const BASE: u64 = 256;
3const MIN_CHUNK: usize = 64;
4const MAX_CHUNK: usize = 2048;
5const TARGET_CHUNK: usize = 512;
6const MASK: u64 = TARGET_CHUNK as u64 - 1;
7
8#[derive(Debug, Clone, PartialEq, Eq)]
9pub struct Chunk {
10 pub offset: usize,
11 pub length: usize,
12 pub hash: u64,
13}
14
15pub fn chunk(content: &str) -> Vec<Chunk> {
16 let bytes = content.as_bytes();
17 if bytes.is_empty() {
18 return vec![];
19 }
20
21 if bytes.len() <= MIN_CHUNK {
22 return vec![Chunk {
23 offset: 0,
24 length: bytes.len(),
25 hash: full_hash(bytes),
26 }];
27 }
28
29 let window = 48.min(bytes.len());
30 let mut chunks = Vec::new();
31 let mut chunk_start = 0;
32 let mut rolling = 0u64;
33 let mut pow = 1u64;
34
35 for i in 0..window.saturating_sub(1) {
36 let _ = i;
37 pow = pow.wrapping_mul(BASE) % PRIME;
38 }
39
40 for i in 0..bytes.len() {
41 rolling = rolling.wrapping_mul(BASE).wrapping_add(bytes[i] as u64) % PRIME;
42
43 if i >= window {
44 let old = bytes[i - window] as u64;
45 rolling = (rolling + PRIME - old.wrapping_mul(pow) % PRIME) % PRIME;
46 }
47
48 let chunk_len = i + 1 - chunk_start;
49
50 let is_boundary = chunk_len >= MIN_CHUNK && (rolling & MASK == 0);
51 let is_max = chunk_len >= MAX_CHUNK;
52
53 if is_boundary || is_max || i == bytes.len() - 1 {
54 let slice = &bytes[chunk_start..=i];
55 chunks.push(Chunk {
56 offset: chunk_start,
57 length: slice.len(),
58 hash: full_hash(slice),
59 });
60 chunk_start = i + 1;
61 }
62 }
63
64 chunks
65}
66
67pub fn stable_order(old_chunks: &[Chunk], new_chunks: &[Chunk]) -> Vec<usize> {
68 let old_hashes: std::collections::HashSet<u64> = old_chunks.iter().map(|c| c.hash).collect();
69
70 let mut unchanged: Vec<usize> = Vec::new();
71 let mut changed: Vec<usize> = Vec::new();
72
73 for (i, c) in new_chunks.iter().enumerate() {
74 if old_hashes.contains(&c.hash) {
75 unchanged.push(i);
76 } else {
77 changed.push(i);
78 }
79 }
80
81 unchanged.extend(changed);
82 unchanged
83}
84
85pub fn reorder_content(content: &str, old_content: &str) -> String {
86 let old_chunks = chunk(old_content);
87 let new_chunks = chunk(content);
88 let order = stable_order(&old_chunks, &new_chunks);
89
90 let mut result = String::with_capacity(content.len());
91 for &idx in &order {
92 if let Some(c) = new_chunks.get(idx) {
93 let end = (c.offset + c.length).min(content.len());
94 result.push_str(&content[c.offset..end]);
95 }
96 }
97 result
98}
99
100fn full_hash(data: &[u8]) -> u64 {
101 let mut h = 0u64;
102 for &b in data {
103 h = h.wrapping_mul(BASE).wrapping_add(b as u64) % PRIME;
104 }
105 h
106}
107
108#[cfg(test)]
109mod tests {
110 use super::*;
111
112 #[test]
113 fn deterministic_chunking() {
114 let content = "a".repeat(1000);
115 let c1 = chunk(&content);
116 let c2 = chunk(&content);
117 assert_eq!(c1, c2);
118 }
119
120 #[test]
121 fn empty_content() {
122 let c = chunk("");
123 assert!(c.is_empty());
124 }
125
126 #[test]
127 fn small_content_single_chunk() {
128 let c = chunk("hello world");
129 assert_eq!(c.len(), 1);
130 assert_eq!(c[0].offset, 0);
131 }
132
133 #[test]
134 fn respects_min_chunk_size() {
135 let content = "x".repeat(500);
136 let chunks = chunk(&content);
137 for c in &chunks[..chunks.len().saturating_sub(1)] {
138 assert!(
139 c.length >= MIN_CHUNK,
140 "Chunk length {} < MIN_CHUNK {}",
141 c.length,
142 MIN_CHUNK
143 );
144 }
145 }
146
147 #[test]
148 fn respects_max_chunk_size() {
149 let content = "x".repeat(10000);
150 let chunks = chunk(&content);
151 for c in &chunks {
152 assert!(
153 c.length <= MAX_CHUNK + 1,
154 "Chunk length {} > MAX_CHUNK {}",
155 c.length,
156 MAX_CHUNK
157 );
158 }
159 }
160
161 #[test]
162 fn local_change_affects_few_chunks() {
163 let original = "a".repeat(2000);
164 let mut modified = original.clone();
165 unsafe {
166 let bytes = modified.as_bytes_mut();
167 bytes[1000] = b'Z';
168 }
169
170 let c1 = chunk(&original);
171 let c2 = chunk(&modified);
172
173 let unchanged = c1
174 .iter()
175 .filter(|a| c2.iter().any(|b| b.hash == a.hash))
176 .count();
177
178 let stability = unchanged as f64 / c1.len().max(1) as f64;
179 assert!(
180 stability > 0.5,
181 "Expected > 50% chunks stable, got {:.0}% ({unchanged}/{})",
182 stability * 100.0,
183 c1.len()
184 );
185 }
186
187 #[test]
188 fn stable_order_unchanged_first() {
189 let content_v1 = "fn foo() { 1 }\nfn bar() { 2 }\n".repeat(50);
190 let content_v2 = "fn foo() { 1 }\nfn bar() { 3 }\n".repeat(50);
191
192 let old = chunk(&content_v1);
193 let new = chunk(&content_v2);
194 let order = stable_order(&old, &new);
195
196 if order.len() >= 2 {
197 let first_is_unchanged = old
198 .iter()
199 .any(|o| new.get(order[0]).is_some_and(|n| n.hash == o.hash));
200 assert!(
201 first_is_unchanged || order[0] == 0,
202 "First element should be unchanged"
203 );
204 }
205 }
206
207 #[test]
208 fn covers_all_bytes() {
209 let content = "hello world, this is a longer test string for chunking purposes!".repeat(20);
210 let chunks = chunk(&content);
211 let total: usize = chunks.iter().map(|c| c.length).sum();
212 assert_eq!(total, content.len());
213 }
214
215 #[test]
216 fn unicode_content() {
217 let content = "日本語テスト。これはUnicodeのテストです。".repeat(30);
218 let chunks = chunk(&content);
219 assert!(!chunks.is_empty());
220 let total: usize = chunks.iter().map(|c| c.length).sum();
221 assert_eq!(total, content.len());
222 }
223}