parallel_bzip2_decoder 0.2.1

High-performance parallel bzip2 decompression library
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
//! High-performance parallel scanner for bzip2 block boundaries.
//!
//! This module provides efficient scanning of bzip2 compressed data to locate block
//! and end-of-stream markers. The scanner uses the Aho-Corasick algorithm for fast
//! pattern matching and processes data in parallel chunks for maximum throughput.
//!
//! # Algorithm
//!
//! The scanner searches for two 48-bit magic numbers defined in the bzip2 specification:
//! - Block marker: 0x314159265359 (π in base 16)
//! - End-of-stream marker: 0x177245385090 (√π in base 16)
//!
//! Since these markers can appear at any bit offset (not just byte boundaries), the
//! scanner generates 8 shifted patterns for each magic number and uses Aho-Corasick
//! for efficient multi-pattern matching. Candidates are then verified by extracting
//! and comparing the full 48-bit value.
//!
//! # Performance
//!
//! - Parallel processing using Rayon for multi-core utilization
//! - 1MB chunks for optimal cache locality
//! - Aho-Corasick automaton for O(n) pattern matching
//! - Minimal memory allocation through buffer reuse

use aho_corasick::AhoCorasick;

/// Marker type found in bzip2 streams.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MarkerType {
    /// Block start marker (π = 0x314159265359)
    Block,
    /// End-of-stream marker (√π = 0x177245385090)
    Eos,
}

/// Block start magic number from bzip2 specification.
/// This is π represented in hexadecimal: 3.14159265359...
const MAGIC_BLOCK: u64 = 0x314159265359;

/// End-of-stream magic number from bzip2 specification.
/// This is √π represented in hexadecimal: 1.77245385090...
const MAGIC_EOS: u64 = 0x177245385090;

/// Parallel scanner for bzip2 block boundaries.
///
/// The scanner pre-computes 16 search patterns (8 for each magic number, one per
/// bit offset) and uses the Aho-Corasick algorithm for efficient multi-pattern
/// matching. This allows finding markers at any bit position in a single pass.
pub struct Scanner {
    /// Aho-Corasick automaton for fast multi-pattern matching
    ac: AhoCorasick,
    /// Pattern metadata: (magic_number, marker_type, bit_shift)
    /// Used to verify and classify matches from the Aho-Corasick automaton
    patterns_info: Vec<(u64, MarkerType, usize)>,
}

impl Scanner {
    /// Creates a new scanner with pre-computed search patterns.
    ///
    /// This generates 16 patterns total: 8 shifted variants of the block marker
    /// and 8 shifted variants of the end-of-stream marker. Each variant corresponds
    /// to a different bit alignment (0-7 bits offset).
    ///
    /// # Pattern Generation
    ///
    /// For each magic number:
    /// 1. Shift left by 16 bits to make room for verification
    /// 2. Generate 8 variants by shifting right 0-7 bits
    /// 3. Extract middle 4 bytes as the search pattern
    /// 4. Store metadata for later verification
    ///
    /// # Performance
    ///
    /// The Aho-Corasick automaton is built once at construction time,
    /// enabling O(n) scanning regardless of the number of patterns.
    pub fn new() -> Self {
        let mut patterns = Vec::new();
        let mut patterns_info = Vec::new();

        // Generate patterns for Block marker (π)
        // We shift left by 16 bits to create space for bit-level alignment
        let magic_top = MAGIC_BLOCK << 16;
        for shift in 0..8 {
            let pattern_u64 = magic_top >> shift;
            let pattern_bytes = pattern_u64.to_be_bytes();
            // Use middle 4 bytes as search key (most distinctive part)
            let search_key = pattern_bytes[1..5].to_vec();
            patterns.push(search_key);
            patterns_info.push((MAGIC_BLOCK, MarkerType::Block, shift));
        }

        // Generate patterns for EOS marker (√π)
        let magic_top = MAGIC_EOS << 16;
        for shift in 0..8 {
            let pattern_u64 = magic_top >> shift;
            let pattern_bytes = pattern_u64.to_be_bytes();
            let search_key = pattern_bytes[1..5].to_vec();
            patterns.push(search_key);
            patterns_info.push((MAGIC_EOS, MarkerType::Eos, shift));
        }

        let ac = AhoCorasick::new(patterns).unwrap();

        Self { ac, patterns_info }
    }

    /// Scans data in parallel and streams marker locations to a channel.
    ///
    /// This method divides the input into 1MB chunks and processes them in parallel
    /// using a dedicated thread pool. Results are sent as (chunk_index, markers)
    /// tuples, allowing the caller to reorder them if needed.
    ///
    /// # Arguments
    ///
    /// * `data` - The bzip2 compressed data to scan
    /// * `base_offset_bits` - Bit offset to add to all marker positions (for multi-file scanning)
    /// * `sender` - Channel to send results: (chunk_index, Vec<(bit_position, marker_type)>)
    ///
    /// # Performance
    ///
    /// - **Chunk size**: 1MB for optimal cache locality
    /// - **Overlap**: 8 bytes between chunks to catch markers at boundaries
    /// - **Thread pool**: Creates a dedicated pool to avoid deadlock with caller's pool
    /// - **Blocking**: This method blocks until all chunks are processed
    ///
    /// # Algorithm
    ///
    /// For each chunk:
    /// 1. Run Aho-Corasick pattern matching to find candidates
    /// 2. Filter out matches at chunk boundaries (handled by overlap)
    /// 3. Verify each candidate by extracting and comparing the full 48-bit magic number
    /// 4. Send verified markers with their bit positions to the channel
    pub fn scan_stream(
        &self,
        data: &[u8],
        base_offset_bits: u64,
        sender: crossbeam_channel::Sender<(usize, Vec<(u64, MarkerType)>)>,
    ) {
        // Performance: 1MB chunks provide good balance between:
        // - Cache locality (fits in L3 cache on most CPUs)
        // - Parallelism (enough chunks to keep all cores busy)
        // - Overhead (not too many small tasks)
        let chunk_size = 1024 * 1024;
        // Overlap ensures we don't miss markers that span chunk boundaries
        let overlap = 8;
        let len = data.len();
        let num_chunks = (len + chunk_size - 1) / chunk_size;

        // Create a dedicated thread pool to prevent deadlock:
        // If we used the global pool and the caller is also using it (e.g., via par_bridge),
        // we could deadlock when all threads are waiting for scanner results but the scanner
        // can't make progress because all threads are blocked.
        let pool = rayon::ThreadPoolBuilder::new()
            .num_threads(rayon::current_num_threads())
            .build()
            .unwrap();

        // Use pool.scope to allow borrowing `data` in the closure.
        // This blocks until all tasks are finished, which is desired behavior since
        // we're in a dedicated scanner thread sending results via channel.

        pool.scope(|s| {
            for i in 0..num_chunks {
                let sender = sender.clone();
                let start = i * chunk_size;
                let end = std::cmp::min(start + chunk_size, len);
                // Extend scan region to catch markers at chunk boundary
                let scan_end = std::cmp::min(end + overlap, len);
                let slice = &data[start..scan_end];

                s.spawn(move |_| {
                    let mut local_markers = Vec::new();

                    // Aho-Corasick finds all pattern matches in O(n) time
                    for mat in self.ac.find_iter(slice) {
                        let pattern_id = mat.pattern();
                        let match_start = mat.start();

                        // Skip matches at position 0 (we need the byte before for verification)
                        if match_start == 0 {
                            continue;
                        }
                        let start_byte_rel = match_start - 1;

                        // Skip matches in the overlap region (will be handled by next chunk)
                        if start_byte_rel >= (end - start) {
                            continue;
                        }

                        // Verify the match by extracting and comparing the full 48-bit magic
                        let (magic, mtype, shift) = self.patterns_info[pattern_id];
                        let rel_bit_offset = (start + start_byte_rel) as u64 * 8 + shift as u64;

                        if verify_magic(data, rel_bit_offset, magic) {
                            local_markers.push((base_offset_bits + rel_bit_offset, mtype));
                        }
                    }

                    // Send results for this chunk (ignore errors if receiver dropped)
                    let _ = sender.send((i, local_markers));
                });
            }
        });
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new()
    }
}

/// Extracts a range of bits from a byte slice and appends them to the output buffer.
///
/// This function handles bit-level extraction, which is necessary because bzip2 blocks
/// can start and end at any bit position, not just byte boundaries.
///
/// # Arguments
///
/// * `data` - Source byte slice
/// * `start_bit` - Starting bit position (0-indexed, bit 0 is MSB of byte 0)
/// * `end_bit` - Ending bit position (exclusive)
/// * `out` - Output buffer to append extracted bits to
///
/// # Output Format
///
/// The extracted bits are written byte-aligned to the output buffer:
/// - The first output byte contains bits [start_bit..start_bit+8)
/// - If the bit range is not a multiple of 8, the last byte is padded with zeros
///
/// # Performance
///
/// This function has three code paths optimized for different scenarios:
/// 1. **Aligned fast path**: When start_bit is byte-aligned, uses memcpy-like operation
/// 2. **u64 SIMD path**: Processes 8 bytes at a time for better throughput
/// 3. **Byte-by-byte path**: Handles remaining bytes and edge cases
///
/// # Examples
///
/// ```
/// # use parallel_bzip2_decoder::extract_bits;
/// let data = vec![0xAA, 0xBB]; // 10101010 10111011
/// let mut out = Vec::new();
/// extract_bits(&data, 4, 12, &mut out);
/// // Extracts bits 4-11: 1010 1011 = 0xAB
/// assert_eq!(out, vec![0xAB]);
/// ```
pub fn extract_bits(data: &[u8], start_bit: u64, end_bit: u64, out: &mut Vec<u8>) {
    if start_bit >= end_bit {
        return;
    }

    let bit_len = end_bit - start_bit;
    let byte_len = ((bit_len + 7) / 8) as usize;
    // Pre-allocate to avoid reallocations during extraction
    out.reserve(byte_len);

    let start_byte = (start_bit / 8) as usize;
    let shift = (start_bit % 8) as u8;

    if shift == 0 {
        // Fast path: byte-aligned extraction
        // This is essentially a memcpy, which is very fast
        out.extend_from_slice(&data[start_byte..start_byte + byte_len]);

        // Mask the last byte if we're extracting a partial byte
        let last_bits = (bit_len % 8) as u8;
        if last_bits > 0 {
            // Keep only the top `last_bits` bits
            let mask = 0xFFu8 << (8 - last_bits);
            if let Some(last) = out.last_mut() {
                *last &= mask;
            }
        }
    } else {
        // Unaligned extraction: bits don't start on a byte boundary
        // We need to shift and combine bytes to extract the bit range
        let mut idx = start_byte;
        let mut bits_left = bit_len;

        // Performance optimization: Process 8 bytes at a time using u64
        // This is SIMD-friendly and reduces loop overhead
        while bits_left >= 64 {
            if idx + 9 <= data.len() {
                // Read 8 bytes as u64, plus one extra byte for the shift
                let bytes: [u8; 8] = data[idx..idx + 8].try_into().unwrap();
                let val1 = u64::from_be_bytes(bytes);
                let val2 = data[idx + 8] as u64;

                // Shift and combine to extract the desired bits
                // val1 << shift: shift left to align the start
                // val2 >> (8 - shift): bring in bits from the next byte
                let result = (val1 << shift) | (val2 >> (8 - shift));
                out.extend_from_slice(&result.to_be_bytes());

                idx += 8;
                bits_left -= 64;
            } else {
                break; // Not enough data for u64 fast path
            }
        }

        // Handle remaining bytes one by one
        while bits_left >= 8 {
            let b1 = data[idx];
            let b2 = if idx + 1 < data.len() {
                data[idx + 1]
            } else {
                0 // Pad with zeros if at end of data
            };

            // Combine two bytes with appropriate shift
            let val = (b1 << shift) | (b2 >> (8 - shift));
            out.push(val);

            idx += 1;
            bits_left -= 8;
        }

        // Handle remaining bits (1-7 bits)
        if bits_left > 0 {
            let b1 = data[idx];
            let b2 = if idx + 1 < data.len() {
                data[idx + 1]
            } else {
                0
            };
            let mut val = (b1 << shift) | (b2 >> (8 - shift));

            // Mask to keep only the bits we need
            let mask = 0xFFu8 << (8 - bits_left);
            val &= mask;
            out.push(val);
        }
    }
}

/// Verifies that a 48-bit magic number exists at the specified bit offset.
///
/// This function is used to confirm candidates found by the Aho-Corasick pattern
/// matcher. Since the pattern matcher only looks at 4 bytes, we need to verify
/// the full 48-bit magic number.
///
/// # Arguments
///
/// * `data` - Source byte slice
/// * `bit_offset` - Bit position where the magic number should start
/// * `expected_magic` - The 48-bit magic number to verify (MAGIC_BLOCK or MAGIC_EOS)
///
/// # Algorithm
///
/// 1. Calculate byte position and bit shift from bit_offset
/// 2. Read 8 bytes (u64) starting at that position
/// 3. Shift the u64 to align the magic number
/// 4. Mask and compare with the expected value
///
/// # Returns
///
/// `true` if the magic number matches, `false` otherwise
fn verify_magic(data: &[u8], bit_offset: u64, expected_magic: u64) -> bool {
    let byte_idx = (bit_offset / 8) as usize;
    let shift = (bit_offset % 8) as u8;

    // We need to read 48 bits from `data` starting at `bit_offset`.
    // This spans 6 or 7 bytes depending on alignment.
    if byte_idx + 6 > data.len() {
        return false;
    }

    // Read 8 bytes (u64) to handle the shift easily
    let mut buf = [0u8; 8];
    let len_to_read = std::cmp::min(8, data.len() - byte_idx);
    buf[..len_to_read].copy_from_slice(&data[byte_idx..byte_idx + len_to_read]);

    let val = u64::from_be_bytes(buf);

    // Shift the expected magic to match the bit alignment in the data
    // The magic is 48 bits, so we shift it left by 16 to fill the top 48 bits of a u64
    let magic_top = expected_magic << 16;
    let expected = magic_top >> shift;
    // Create a mask for the top 48 bits (adjusted for shift)
    let mask = 0xFFFFFFFFFFFF0000 >> shift;

    (val & mask) == expected
}

#[cfg(test)]
mod tests {
    use super::*;

    fn scan_to_vec(data: &[u8]) -> Vec<(u64, MarkerType)> {
        let scanner = Scanner::new();
        let (tx, rx) = crossbeam_channel::bounded(100);
        let _pool = rayon::ThreadPoolBuilder::new()
            .num_threads(1)
            .build()
            .unwrap();

        // Run scan_stream in a scope
        std::thread::scope(|s| {
            s.spawn(|| {
                scanner.scan_stream(data, 0, tx);
            });
        });

        let mut results = Vec::new();
        for (_, markers) in rx {
            results.extend(markers);
        }
        results.sort_by_key(|k| k.0);
        results
    }

    #[test]
    fn test_scanner_empty() {
        let data = [];
        let markers = scan_to_vec(&data);
        assert!(markers.is_empty());
    }

    #[test]
    fn test_scanner_single_block() {
        // Block Magic: 0x314159265359
        let mut data = Vec::new();
        data.extend_from_slice(&[0x31, 0x41, 0x59, 0x26, 0x53, 0x59]); // PI
        data.extend_from_slice(b"some data");

        let markers = scan_to_vec(&data);
        assert_eq!(markers.len(), 1);
        assert_eq!(markers[0].0, 0);
        assert!(matches!(markers[0].1, MarkerType::Block));
    }

    #[test]
    fn test_scanner_eos() {
        // EOS Magic: 0x177245385090
        let mut data = Vec::new();
        data.extend_from_slice(&[0x17, 0x72, 0x45, 0x38, 0x50, 0x90]); // sqrt(PI)

        let markers = scan_to_vec(&data);
        assert_eq!(markers.len(), 1);
        assert_eq!(markers[0].0, 0);
        assert!(matches!(markers[0].1, MarkerType::Eos));
    }

    #[test]
    fn test_scanner_multiple_blocks() {
        let mut data = Vec::new();

        // Block 1 at 0
        data.extend_from_slice(&[0x31, 0x41, 0x59, 0x26, 0x53, 0x59]);
        data.extend_from_slice(b"data1");

        // Block 2 at 6+5 = 11 bytes
        let pos2 = data.len() as u64 * 8;
        data.extend_from_slice(&[0x31, 0x41, 0x59, 0x26, 0x53, 0x59]);
        data.extend_from_slice(b"data2");

        // EOS
        let pos_eos = data.len() as u64 * 8;
        data.extend_from_slice(&[0x17, 0x72, 0x45, 0x38, 0x50, 0x90]);

        let markers = scan_to_vec(&data);
        assert_eq!(markers.len(), 3);
        assert_eq!(markers[0].0, 0);
        assert!(matches!(markers[0].1, MarkerType::Block));

        assert_eq!(markers[1].0, pos2);
        assert!(matches!(markers[1].1, MarkerType::Block));

        assert_eq!(markers[2].0, pos_eos);
        assert!(matches!(markers[2].1, MarkerType::Eos));
    }

    #[test]
    fn test_scanner_shifted() {
        // Construct a shifted block magic.
        // Magic: 0x314159265359
        // Shift 1 bit right (in stream logic, this means it starts at bit 1)
        let magic: u64 = 0x314159265359;
        let shift = 1;
        let val = (magic << 16) >> shift;
        let bytes = val.to_be_bytes();

        // We need enough bytes. val is u64 (8 bytes).
        // The magic is 6 bytes.
        // If we write 8 bytes, we cover the magic.

        let markers = scan_to_vec(&bytes);
        assert_eq!(markers.len(), 1);
        assert_eq!(markers[0].0, shift as u64);
        assert!(matches!(markers[0].1, MarkerType::Block));
    }

    #[test]
    fn test_extract_bits_aligned() {
        let data = vec![0xAA, 0xBB, 0xCC];
        // Extract 0xBB
        let mut extracted = Vec::new();
        extract_bits(&data, 8, 16, &mut extracted);
        assert_eq!(extracted, vec![0xBB]);
    }

    #[test]
    fn test_extract_bits_shifted() {
        // Data: 0xAA, 0xBB
        // Binary: 10101010 10111011
        // Extract 8 bits starting at 4.
        // Bits 4..12.
        // Byte 0 (AA): 1010[1010]
        // Byte 1 (BB): [1011]1011
        // Result: 1010 1011 = 0xAB
        let data = vec![0xAA, 0xBB];
        let mut extracted = Vec::new();
        extract_bits(&data, 4, 12, &mut extracted);
        assert_eq!(extracted, vec![0xAB]);
    }

    #[test]
    fn test_extract_bits_partial() {
        // Data: 0xFF
        // Extract 4 bits at 0.
        // Result: 11110000 = 0xF0
        let data = vec![0xFF];
        let mut extracted = Vec::new();
        extract_bits(&data, 0, 4, &mut extracted);
        assert_eq!(extracted, vec![0xF0]);
    }

    #[test]
    fn test_extract_bits_u64_path() {
        // Test the u64 optimized path (more than 8 bytes)
        // 10 bytes of 0xFF
        let data = vec![0xFF; 10];
        // Extract 64 bits at offset 4
        // Should be all 1s
        let mut extracted = Vec::new();
        extract_bits(&data, 4, 68, &mut extracted);
        assert_eq!(extracted.len(), 8);
        assert_eq!(extracted, vec![0xFF; 8]);
    }
}