1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
//! Record boundary detection using 0x1D delimiters for parallel processing.
//!
//! This module provides optimized scanning of MARC record boundaries using the
//! SIMD-accelerated `memchr` crate to locate 0x1D (record terminator) bytes.
//! Boundaries are returned as (offset, length) tuples for use in parallel parsing pipelines.
//!
//! # Example
//!
//! ```no_run
//! use mrrc::boundary_scanner::RecordBoundaryScanner;
//!
//! let buffer = b"...binary MARC data...";
//! let mut scanner = RecordBoundaryScanner::new();
//! let boundaries = scanner.scan(buffer)?;
//!
//! for (offset, len) in boundaries {
//!     println!("Record at offset {} with length {}", offset, len);
//! }
//! # Ok::<(), Box<dyn std::error::Error>>(())
//! ```
use crate::error::{MarcError, Result};
/// Record terminator byte of the ISO 2709 (MARC) format.
///
/// Every record ends with 0x1D. This is distinct from 0x1E, which only
/// terminates a field inside a record.
const RECORD_TERMINATOR: u8 = b'\x1D';
/// Finds where MARC records start and end inside a byte buffer.
///
/// The scanner searches for 0x1D (record terminator) bytes using
/// SIMD-accelerated delimiter detection and reports one boundary per
/// terminator. It is meant for parallel processing pipelines, where the
/// record boundaries have to be known before any record is parsed.
#[derive(Default, Debug)]
pub struct RecordBoundaryScanner {
    /// `(offset, length)` pairs from the latest scan, kept so the allocation
    /// can be reused by the next scan
    boundaries: Vec<(usize, usize)>,
}
impl RecordBoundaryScanner {
    /// Create a new boundary scanner with room for 100 boundaries pre-allocated.
    ///
    /// # Examples
    ///
    /// ```
    /// use mrrc::boundary_scanner::RecordBoundaryScanner;
    ///
    /// let scanner = RecordBoundaryScanner::new();
    /// ```
    #[must_use]
    pub fn new() -> Self {
        Self {
            boundaries: Vec::with_capacity(100),
        }
    }

    /// Scan a buffer for record boundaries.
    ///
    /// Returns a vector of (offset, length) tuples for each record found.
    /// The offset is the byte position where the record starts, and length
    /// includes the terminating 0x1D byte (record terminator). Bytes that
    /// follow the last terminator do not form a complete record and are not
    /// reported.
    ///
    /// # Arguments
    ///
    /// * `buffer` - The bytes to scan for record boundaries
    ///
    /// # Returns
    ///
    /// A vector of (offset, length) tuples for each complete record.
    ///
    /// # Errors
    ///
    /// Returns an error if the buffer is empty or no complete records
    /// (no 0x1D terminators) are found.
    ///
    /// # Examples
    ///
    /// ```
    /// use mrrc::boundary_scanner::RecordBoundaryScanner;
    ///
    /// let data = vec![1, 2, 3, 0x1D, 4, 5, 0x1D]; // 0x1D = record terminator
    /// let mut scanner = RecordBoundaryScanner::new();
    /// let boundaries = scanner.scan(&data)?;
    ///
    /// assert_eq!(boundaries.len(), 2);
    /// assert_eq!(boundaries[0], (0, 4)); // offset 0, length 4 (includes 0x1D)
    /// assert_eq!(boundaries[1], (4, 3)); // offset 4, length 3 (includes 0x1D)
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn scan(&mut self, buffer: &[u8]) -> Result<Vec<(usize, usize)>> {
        self.collect_boundaries(buffer, usize::MAX)?;
        // The internal buffer is retained for reuse, so the caller gets a copy.
        Ok(self.boundaries.clone())
    }

    /// Scan a buffer and return boundaries up to a maximum limit.
    ///
    /// Useful for limiting the number of records returned in a single batch.
    /// The search stops as soon as `limit` records have been located, so the
    /// remainder of the buffer is not examined.
    ///
    /// # Arguments
    ///
    /// * `buffer` - The bytes to scan
    /// * `limit` - Maximum number of boundaries to return
    ///
    /// # Returns
    ///
    /// A vector of up to `limit` boundaries. A `limit` of 0 yields an empty
    /// vector, provided the buffer holds at least one complete record.
    ///
    /// # Errors
    ///
    /// Returns an error if the buffer is empty or no complete records are found.
    ///
    /// # Examples
    ///
    /// ```
    /// use mrrc::boundary_scanner::RecordBoundaryScanner;
    ///
    /// let data = vec![1, 0x1D, 2, 0x1D, 3, 0x1D]; // 0x1D = record terminator
    /// let mut scanner = RecordBoundaryScanner::new();
    /// let boundaries = scanner.scan_limited(&data, 2)?;
    ///
    /// assert_eq!(boundaries.len(), 2);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn scan_limited(&mut self, buffer: &[u8], limit: usize) -> Result<Vec<(usize, usize)>> {
        // Always look for at least one terminator: even with `limit == 0` a
        // buffer without any complete record must be reported as an error.
        self.collect_boundaries(buffer, limit.max(1))?;
        Ok(self.boundaries.iter().copied().take(limit).collect())
    }

    /// Get the number of complete records in a buffer without parsing.
    ///
    /// This is useful for diagnostics and deciding batch sizes.
    ///
    /// # Arguments
    ///
    /// * `buffer` - The bytes to scan
    ///
    /// # Returns
    ///
    /// The number of 0x1D terminators (complete records) found; 0 for an
    /// empty buffer.
    ///
    /// # Examples
    ///
    /// ```
    /// use mrrc::boundary_scanner::RecordBoundaryScanner;
    ///
    /// let data = vec![1, 0x1D, 2, 0x1D];
    /// let scanner = RecordBoundaryScanner::new();
    /// let count = scanner.count_records(&data);
    ///
    /// assert_eq!(count, 2);
    /// ```
    #[must_use]
    pub fn count_records(&self, buffer: &[u8]) -> usize {
        memchr::memchr_iter(RECORD_TERMINATOR, buffer).count()
    }

    /// Discard the boundaries retained from the most recent scan.
    ///
    /// The allocated capacity is kept; use [`capacity`](Self::capacity) to
    /// inspect it.
    pub fn clear(&mut self) {
        self.boundaries.clear();
    }

    /// Get the current capacity of the scanner's internal boundaries buffer.
    ///
    /// Useful for capacity planning in high-throughput scenarios.
    #[must_use]
    pub fn capacity(&self) -> usize {
        self.boundaries.capacity()
    }

    /// Fill the internal buffer with at most `max_records` boundaries found in `buffer`.
    ///
    /// Shared by [`scan`](Self::scan) and [`scan_limited`](Self::scan_limited)
    /// so both report exactly the same errors.
    fn collect_boundaries(&mut self, buffer: &[u8], max_records: usize) -> Result<()> {
        if buffer.is_empty() {
            return Err(MarcError::InvalidRecord("buffer is empty".to_string()));
        }

        self.boundaries.clear();
        // Use memchr for SIMD-accelerated scanning of 0x1D terminators; `take`
        // ends the search once enough records have been located.
        let terminators = memchr::memchr_iter(RECORD_TERMINATOR, buffer).take(max_records);
        let mut offset = 0;
        for terminator_pos in terminators {
            // +1 so the length covers the terminator byte itself.
            self.boundaries.push((offset, terminator_pos - offset + 1));
            offset = terminator_pos + 1;
        }

        if self.boundaries.is_empty() {
            return Err(MarcError::InvalidRecord(
                "no complete MARC records found (no 0x1D record terminators)".to_string(),
            ));
        }
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One terminated record yields a single boundary covering the whole buffer.
    #[test]
    fn test_scan_single_record() {
        let input: [u8; 4] = [1, 2, 3, 0x1D];
        let found = RecordBoundaryScanner::new().scan(&input).unwrap();
        assert_eq!(found, vec![(0, 4)]);
    }

    /// Consecutive records are reported in order with contiguous offsets.
    #[test]
    fn test_scan_multiple_records() {
        let input: [u8; 8] = [1, 2, 0x1D, 3, 4, 0x1D, 5, 0x1D];
        let found = RecordBoundaryScanner::new().scan(&input).unwrap();
        assert_eq!(found, vec![(0, 3), (3, 3), (6, 2)]);
    }

    /// An empty buffer is rejected.
    #[test]
    fn test_scan_empty_buffer() {
        let input: [u8; 0] = [];
        let outcome = RecordBoundaryScanner::new().scan(&input);
        assert!(outcome.is_err());
    }

    /// A buffer without any 0x1D byte contains no complete record.
    #[test]
    fn test_scan_no_terminators() {
        let input: [u8; 4] = [1, 2, 3, 4];
        let outcome = RecordBoundaryScanner::new().scan(&input);
        assert!(outcome.is_err());
    }

    /// Only the first `limit` boundaries are returned.
    #[test]
    fn test_scan_limited() {
        let input: [u8; 6] = [1, 0x1D, 2, 0x1D, 3, 0x1D];
        let found = RecordBoundaryScanner::new()
            .scan_limited(&input, 2)
            .unwrap();
        assert_eq!(found, vec![(0, 2), (2, 2)]);
    }

    /// Trailing bytes after the last terminator are not counted as a record.
    #[test]
    fn test_count_records() {
        let input: [u8; 6] = [1, 0x1D, 2, 0x1D, 3, 4];
        let total = RecordBoundaryScanner::new().count_records(&input);
        assert_eq!(total, 2);
    }

    /// Counting an empty buffer gives zero rather than an error.
    #[test]
    fn test_count_records_empty() {
        let input: [u8; 0] = [];
        let total = RecordBoundaryScanner::new().count_records(&input);
        assert_eq!(total, 0);
    }

    /// A scanner can be reused; results of an earlier scan never leak into a later one.
    #[test]
    fn test_reuse_scanner() {
        let mut scanner = RecordBoundaryScanner::new();

        let first_input: [u8; 4] = [1, 0x1D, 2, 0x1D];
        let first = scanner.scan(&first_input).unwrap();
        assert_eq!(first.len(), 2);

        // The second scan goes through the same internal buffer.
        let second_input: [u8; 2] = [1, 0x1D];
        let second = scanner.scan(&second_input).unwrap();
        assert_eq!(second, vec![(0, 2)]);
    }

    /// 1000 two-byte records are all located at even offsets.
    #[test]
    fn test_large_buffer_performance() {
        // Payload bytes alternate between 0x01 and 0x02, neither of which
        // collides with the 0x1D terminator.
        let mut input: Vec<u8> = Vec::new();
        for index in 0..1000 {
            let payload = if index % 2 == 0 { 0x01 } else { 0x02 };
            input.extend_from_slice(&[payload, 0x1D]);
        }

        let found = RecordBoundaryScanner::new().scan(&input).unwrap();
        assert_eq!(found.len(), 1000);
        // Spot check both ends of the result.
        assert_eq!(found.first(), Some(&(0, 2)));
        assert_eq!(found.last(), Some(&(1998, 2)));
    }
}