1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
//! Batch-insert API for `PersistentARTrieChar<V, S>`.
//!
//! Split out of char `dict_impl_char.rs` (lines ~531-855, ~325 LOC)
//! as a Phase-6 char sub-module. Methods covered:
//!
//! - `insert_batch` / `insert_batch_chars` / `insert_batch_bytes`
//! - `insert_batch_sorted` / `insert_batch_chars_sorted` /
//! `insert_batch_bytes_sorted`
//! - `insert_batch_grouped` / `insert_batch_chars_grouped` /
//! `insert_batch_bytes_grouped`
//! - `insert_batch_arena_grouped` (alias for byte_grouped)
use crate::persistent_artrie::block_storage::BlockStorage;
use crate::value::DictionaryValue;
impl<V: DictionaryValue, S: BlockStorage> super::PersistentARTrieChar<V, S> {
pub fn insert_batch(&mut self, entries: &[(String, Option<V>)]) -> usize {
if entries.is_empty() {
return 0;
}
// L3.3: the overlay is the sole representation. Each entry commits via the
// proven per-op Order-A path (per-op WAL-then-CAS, NOT a single owned-tree
// BatchInsert append). Delegate to the already-routed single-op `insert` /
// `insert_with_value` so NO mutation logic is duplicated.
let mut inserted_count = 0;
for (term, value) in entries {
let result = match value {
Some(v) => self.insert_with_value(term, v.clone()),
None => self.insert(term),
};
match result {
Ok(true) => inserted_count += 1,
Ok(false) => {}
Err(e) => log::warn!("Failed overlay batch insert entry: {:?}", e),
}
}
inserted_count
}
/// Insert multiple terms (as char slices) with optional values in a single batch operation.
///
/// This method is useful when you have pre-parsed Unicode characters and want
/// to avoid UTF-8 encoding overhead for each term individually.
///
/// # Arguments
///
/// * `entries` - Slice of (char_slice, optional_value) pairs to insert
///
/// # Returns
///
/// The number of terms that were newly inserted (not updates).
///
/// # Example
///
/// ```text
/// let entries = vec![
/// (&['日', '本', '語'][..], Some(1)),
/// (&['中', '文'][..], Some(2)),
/// ];
/// let count = trie.insert_batch_chars(&entries)?;
/// ```
pub fn insert_batch_chars(&mut self, entries: &[(&[char], Option<V>)]) -> usize {
if entries.is_empty() {
return 0;
}
// Convert char slices to strings for WAL and insertion
let string_entries: Vec<(String, Option<V>)> = entries
.iter()
.map(|(chars, value)| {
let term: String = chars.iter().collect();
(term, value.clone())
})
.collect();
self.insert_batch(&string_entries)
}
/// Insert multiple byte-slice terms in a single batch operation.
///
/// This is the byte-slice version of `insert_batch()` for when you already
/// have byte data and want to avoid string conversion overhead.
///
/// # Arguments
///
/// * `entries` - Slice of (term_bytes, optional_value) pairs to insert
///
/// # Returns
///
/// The number of terms that were newly inserted.
pub fn insert_batch_bytes(&mut self, entries: &[(&[u8], Option<V>)]) -> usize {
if entries.is_empty() {
return 0;
}
// L3.3: the overlay is the sole representation. Each entry commits via the
// proven per-op Order-A path (already overlay-routed `insert`/`insert_with_value`,
// each emitting a CommitRank), NOT a single unranked `BatchInsert` append that
// recovery would DROP as a two-append-window orphan (the A2 fix). Mirrors
// `insert_batch`; the `_sorted`/`_grouped`/`_arena_grouped` delegators inherit
// this. The lossy byte→String conversion matches the owned per-entry apply.
let mut inserted_count = 0;
for (term, value) in entries {
let term_str = String::from_utf8_lossy(term).into_owned();
let result = match value {
Some(v) => self.insert_with_value(&term_str, v.clone()),
None => self.insert(&term_str),
};
match result {
Ok(true) => inserted_count += 1,
Ok(false) => {}
Err(e) => log::warn!("Failed overlay byte batch insert entry: {:?}", e),
}
}
inserted_count
}
/// Insert multiple terms with optional values in sorted order for cache locality.
///
/// This method sorts the entries lexicographically before inserting them,
/// which improves cache hit rates since consecutive terms share trie prefix
/// paths. For large batches, this can improve throughput by 5-20%.
///
/// All entries are logged as a single batch WAL record before insertion.
///
/// # Arguments
///
/// * `entries` - Vector of (term, optional_value) pairs to insert
///
/// # Returns
///
/// The number of terms that were newly inserted.
pub fn insert_batch_sorted(&mut self, mut entries: Vec<(String, Option<V>)>) -> usize {
if entries.is_empty() {
return 0;
}
// Sort by term lexicographically for cache locality
entries.sort_by(|a, b| a.0.cmp(&b.0));
// Delegate to insert_batch
self.insert_batch(&entries)
}
/// Insert multiple char-slice terms with optional values in sorted order for cache locality.
///
/// This method sorts the entries lexicographically before inserting them,
/// which improves cache hit rates since consecutive terms share trie prefix
/// paths. For large batches, this can improve throughput by 5-20%.
///
/// All entries are logged as a single batch WAL record before insertion.
///
/// # Arguments
///
/// * `entries` - Vector of (char_vec, optional_value) pairs to insert
///
/// # Returns
///
/// The number of terms that were newly inserted.
pub fn insert_batch_chars_sorted(&mut self, mut entries: Vec<(Vec<char>, Option<V>)>) -> usize {
if entries.is_empty() {
return 0;
}
// Sort by chars lexicographically for cache locality
entries.sort_by(|a, b| a.0.cmp(&b.0));
// Convert to references for insert_batch_chars
let refs: Vec<(&[char], Option<V>)> = entries
.iter()
.map(|(chars, value)| (chars.as_slice(), value.clone()))
.collect();
self.insert_batch_chars(&refs)
}
/// Insert multiple byte terms with optional values in sorted order for cache locality.
///
/// This method sorts the entries lexicographically before inserting them,
/// which improves cache hit rates since consecutive terms share trie prefix
/// paths. For large batches, this can improve throughput by 5-20%.
///
/// All entries are logged as a single batch WAL record before insertion.
///
/// # Arguments
///
/// * `entries` - Vector of (term_bytes, optional_value) pairs to insert
///
/// # Returns
///
/// The number of terms that were newly inserted.
pub fn insert_batch_bytes_sorted(&mut self, mut entries: Vec<(Vec<u8>, Option<V>)>) -> usize {
if entries.is_empty() {
return 0;
}
// Sort by term lexicographically for cache locality
entries.sort_by(|a, b| a.0.cmp(&b.0));
// Convert to references for insert_batch_bytes
let refs: Vec<(&[u8], Option<V>)> = entries
.iter()
.map(|(term, value)| (term.as_slice(), value.clone()))
.collect();
self.insert_batch_bytes(&refs)
}
/// Insert multiple string terms grouped by first character for arena locality.
///
/// This method groups inserts by their first character before inserting,
/// which improves I/O locality for disk-resident tries. Terms with the same
/// first character tend to land in nearby arenas because arenas fill
/// sequentially during loading.
///
/// # Performance
///
/// Expected improvement: 5-10% faster batch inserts for disk-resident tries
/// due to improved I/O locality. The first-character heuristic provides ~60-80%
/// of the benefit of full arena prediction with O(1) complexity.
///
/// # Arguments
///
/// * `entries` - Vector of (term, optional_value) pairs to insert
///
/// # Returns
///
/// The number of terms that were newly inserted.
pub fn insert_batch_grouped(&mut self, mut entries: Vec<(String, Option<V>)>) -> usize {
if entries.is_empty() {
return 0;
}
// Sort by first character (arena proxy) then by full term for within-group locality
entries.sort_by(|a, b| {
let a_prefix = a.0.chars().next().unwrap_or('\0');
let b_prefix = b.0.chars().next().unwrap_or('\0');
a_prefix.cmp(&b_prefix).then_with(|| a.0.cmp(&b.0))
});
// Delegate to insert_batch
self.insert_batch(&entries)
}
/// Insert multiple char-slice terms grouped by first character for arena locality.
///
/// This is the char-slice variant of `insert_batch_grouped`. See that method
/// for detailed documentation on the arena grouping strategy.
///
/// # Arguments
///
/// * `entries` - Vector of (char_vec, optional_value) pairs to insert
///
/// # Returns
///
/// The number of terms that were newly inserted.
pub fn insert_batch_chars_grouped(
&mut self,
mut entries: Vec<(Vec<char>, Option<V>)>,
) -> usize {
if entries.is_empty() {
return 0;
}
// Sort by first character (arena proxy) then by full term
entries.sort_by(|a, b| {
let a_prefix = a.0.first().copied().unwrap_or('\0');
let b_prefix = b.0.first().copied().unwrap_or('\0');
a_prefix.cmp(&b_prefix).then_with(|| a.0.cmp(&b.0))
});
// Convert to references for insert_batch_chars
let refs: Vec<(&[char], Option<V>)> = entries
.iter()
.map(|(chars, value)| (chars.as_slice(), value.clone()))
.collect();
self.insert_batch_chars(&refs)
}
/// Insert multiple byte terms grouped by first byte for arena locality.
///
/// This method groups inserts by their first byte prefix before inserting,
/// which improves I/O locality for disk-resident tries.
///
/// # Arguments
///
/// * `entries` - Vector of (term_bytes, optional_value) pairs to insert
///
/// # Returns
///
/// The number of terms that were newly inserted.
pub fn insert_batch_bytes_grouped(&mut self, mut entries: Vec<(Vec<u8>, Option<V>)>) -> usize {
if entries.is_empty() {
return 0;
}
// Sort by first byte (arena proxy) then by full term for within-group locality
entries.sort_by(|a, b| {
let a_prefix = a.0.first().copied().unwrap_or(0);
let b_prefix = b.0.first().copied().unwrap_or(0);
a_prefix.cmp(&b_prefix).then_with(|| a.0.cmp(&b.0))
});
// Convert to references for insert_batch_bytes
let refs: Vec<(&[u8], Option<V>)> = entries
.iter()
.map(|(term, value)| (term.as_slice(), value.clone()))
.collect();
self.insert_batch_bytes(&refs)
}
/// Alias for `insert_batch_bytes_grouped` for API consistency with PersistentARTrie.
///
/// See [`insert_batch_bytes_grouped`](Self::insert_batch_bytes_grouped) for documentation.
#[inline]
pub fn insert_batch_arena_grouped(&mut self, entries: Vec<(Vec<u8>, Option<V>)>) -> usize {
self.insert_batch_bytes_grouped(entries)
}
}